singleseqgset 是用于单细胞 RNA-seq 数据的基因集富集分析的软件包。它使用简单的基础统计量(variance inflated Wilcoxon 秩和检验)来确定不同 cluster 中感兴趣的基因集的富集。
## 安装
library(devtools)
install_github("arc85/singleseqgset")
library(singleseqgset)
## 使用splatter R包模拟表达式数据;
suppressMessages({
library(splatter)
library(Seurat)
library(msigdbr)
library(singleseqgset)
library(heatmap3)
})
#Splatter是用于模拟单细胞RNA测序count数据的软件包。
#创建参数并模拟数据
sc.params <- newSplatParams(nGenes=1000,batchCells=5000)
sim.groups <- splatSimulate(params=sc.params,method="groups",group.prob=c(0.4,0.2,0.3,0.1),de.prob=c(0.20,0.20,0.1,0.3),verbose=F)
sim.groups #Check out the SingleCellExperiment object with simulated dataset
## 使用msigdbr下载感兴趣的基因集;
h.human <- msigdbr(species="Homo sapiens",category="H")
h.names <- unique(h.human$gs_name)
h.sets <- vector("list",length=length(h.names))
names(h.sets) <- h.names
for (i in names(h.sets)) {
h.sets[[i]] <- pull(h.human[h.human$gs_name==i,"gene_symbol"])
}
## 将特定基因集添加到我们的模拟数据中(就是让我们的模拟数据可以富含某基因集);
sets.to.use <- sample(seq(1,50,1),20,replace=F)
sets.and.groups <- data.frame(set=sets.to.use,group=paste("Group",rep(1:4,each=5),sep=""))
for (i in 1:nrow(sets.and.groups)) {
set.to.use <- sets.and.groups[i,"set"]
group.to.use <- sets.and.groups[i,"group"]
gene.set.length <- length(h.sets[[set.to.use]])
blank.matrix <- matrix(0,ncol=ncol(sim.counts),nrow=gene.set.length)
rownames(blank.matrix) <- h.sets[[sets.to.use[i]]]
matching <- rownames(blank.matrix) %in% rownames(sim.counts)
if (any(matching==TRUE)) {
matching.genes <- rownames(blank.matrix)[matching]
nonmatching.genes <- setdiff(rownames(blank.matrix),matching.genes)
blank.matrix <- blank.matrix[!matching,]
sim.counts <- rbind(sim.counts,blank.matrix)
} else {
sim.counts <- rbind(sim.counts,blank.matrix)
matching.genes <- rownames(blank.matrix)
}
group.cells <- colnames(sim.counts)[groups==group.to.use]
for (z in group.cells) {
if (any(matching==TRUE)) {
sim.counts[matching.genes,z] <- ifelse(rbinom(length(matching.genes),size=1,prob=0.5)>0,0,rpois(length(matching.genes),lambda=10))
sim.counts[nonmatching.genes,z] <- ifelse(rbinom(length(nonmatching.genes),size=1,prob=0.5)>0,0,rpois(length(nonmatching.genes),lambda=10))
} else {
sim.counts[matching.genes,z] <- ifelse(rbinom(length(matching.genes),size=1,prob=0.5)>0,0,rpois(length(matching.genes),lambda=10))
}
}
}
#Here, we will check out the sum of expression for the first gene set
len.set1 <- length(h.sets[[sets.to.use[[1]]]])
plot(apply(sim.counts[1001:(1000+len.set1),],2,sum),xlim=c(0,5000),xlab="Cells",ylab="Sum of Gene Set 1 Expression")
## 使用标准的Seurat工作流程(v.2.3.4)处理我们的数据;
#构建seurat object
ser <- CreateSeuratObject(raw.data=sim.counts)
#归一化
ser <- NormalizeData(ser)
ser@var.genes <- rownames(ser@raw.data)[1:1000]
ser <- ScaleData(ser,genes.use=ser@var.genes)
#我们会假设1000个模拟基因是“可变基因”,
#我们将跳过Seurat的FindVariableGenes
ser <- RunPCA(ser,pc.genes=ser@var.genes,do.print=FALSE)
PCElbowPlot(ser)
#其中top4PCs代表了多数的差异
#我们将会选择top5 PCs;
ser <- RunTSNE(ser,dims.use=1:5)
#将模拟的dataID号加入Seurat object
data.to.add <- colData(sim.groups)$Group
names(data.to.add) <- ser@cell.names
ser <- AddMetaData(ser,metadata=data.to.add,col.name="real_id")
ser <- SetAllIdent(ser,id="real_id")
#我们将会跳过使用Seurat聚类的步骤,因为我们已知道真正的聚类IDs
TSNEPlot(ser,group.by="real_id")
## 使用singleseqgset进行基因集富集分析;
### 将基于logFC计算基因集富集测试的矩阵(使用已针对library大小进行校正的log-normalized数据,存储在@data slot中
logfc.data <- logFC(cluster.ids=ser@meta.data$real_id,expr.mat=ser@data)
names(logfc.data)
# 计算enrichment scores和p值
gse.res <- wmw_gsea(expr.mat=ser@data,cluster.cells=logfc.data[[1]],log.fc.cluster=logfc.data[[2]],gene.sets=h.sets)
names(gse.res)
## 将结果绘制在热图中;
res.stats <- gse.res[["GSEA_statistics"]]
res.pvals <- gse.res[["GSEA_p_values"]]
res.pvals <- apply(res.pvals,2,p.adjust,method="fdr") #Correct for multiple comparisons
res.stats[order(res.stats[,1],decreasing=TRUE)[1:10],] #Top gene sets enriched by z scores
names(h.sets)[sets.to.use[1:5]] #Compare to the simulate sets we created
#Plot the z scores with heatmap3
heatmap3(res.stats,Colv=NA,cexRow=0.5,cexCol=1,scale="row")