# R包加载
library(tidyverse)
library(Seurat)
# 数据Seurat处理
# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "data/pbmc5k/filtered_feature_bc_matrix/")
# Initialize the Seurat object with the raw (non-normalized data).
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc5k", min.cells = 3, min.features = 200)
pbmc
#QC
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mt < 25)
#Normalization
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
#高变基因选择
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
#标准化
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
#去除MT,重新进行标准化
pbmc <- ScaleData(pbmc, vars.to.regress = "percent.mt")
#PCA
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc), verbose = FALSE)
#聚类
pbmc <- FindNeighbors(pbmc, dims = 1:20)
pbmc <- FindClusters(pbmc, resolution = 0.5)
#可视化
pbmc <- RunUMAP(pbmc, dims = 1:20)
pbmc<- RunTSNE(pbmc, dims = 1:20)
## after we run UMAP and TSNE, there are more entries in the reduction slot
str(pbmc@reductions)
DimPlot(pbmc, reduction = "umap", label = TRUE)
#查找marker基因
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
saveRDS(pbmc, "data/pbmc5k/pbmc_5k_v3.rds")
# GSEA分析
library(Seurat)
pbmc<- readRDS("data/pbmc5k/pbmc_5k_v3.rds")
# 对于GSEA,需要所有基因的信息
# all 20,000 genes. instead let's try presto which performs a fast Wilcoxon rank sum test
#library(devtools)
#install_github('immunogenomics/presto')
library(presto)
Loading required package: Rcpp
pbmc.genes <- wilcoxauc(pbmc, 'seurat_clusters')
head(pbmc.genes)
# 我们拥有每个cluster的所有基因
dplyr::count(pbmc.genes, group)
# 基因集下载
library(msigdbr)
library(fgsea)
library(dplyr)
library(ggplot2)
msigdbr_show_species() # 查看物种
m_df<- msigdbr(species = "Homo sapiens", category = "C7")
fgsea_sets<- m_df %>% split(x = .$gene_symbol, f = .$gs_name)
fgsea_sets$GSE11057_NAIVE_VS_MEMORY_CD4_TCELL_UP # 查看特定基因集
pbmc.genes %>%
dplyr::filter(group == "0") %>%
arrange(desc(logFC), desc(auc)) %>%
head(n = 10) #进行降序排序
# 仅选择fgsea的feature和auc列
cluster0.genes<- pbmc.genes %>%
dplyr::filter(group == "0") %>%
arrange(desc(auc)) %>%
dplyr::select(feature, auc)
ranks<- deframe(cluster0.genes)
head(ranks)