Smart-seq2分析

单细胞Smart-seq2数据分析详解

Posted by CHY on July 28, 2020

由于 Smart-seq2 建库测序与 10X 存在较大差异,所以在数据分析(主要是前期表达矩阵的获取)存在一定差异,故借着生信星球推文进行分析流程整理。

数据说明

使用的是来自永生化小鼠骨髓祖细胞系的 2 个 96 孔板的 416B cells,并且在细胞裂解后文库制备前,在每个细胞中加入一定量的外源 RNA(ERCC),之后再进行高通量测序,得到每个基因的表达量(这个是通过计算比对到外显子区域的 reads 数得到的,就像 featureCounts 的操作);同样,spike-in 的含量也是计算有多少 reads 比对到了 spike-in 的参考序列。

分析实战流程

# 数据加载
library(scRNAseq)
sce.416b <- LunSpikeInData(which="416b")
sce.416b
table(sce.416b$block)  # 分为两个96孔板
rowData(sce.416b)

# 增加基因的信息(symbol ID、染色体位置)
# ID转换--uniquifyFeatureNames
library(AnnotationHub)
ens.mm.v97 <- AnnotationHub()[["AH73905"]]
columns(ens.mm.v97)
rowData(sce.416b)$ENSEMBL <- rownames(sce.416b)
rowData(sce.416b)$SYMBOL <- mapIds(ens.mm.v97, keys=rownames(sce.416b),
                                   keytype="GENEID", column="SYMBOL")
uniquifyFeatureNames(head(rowData(sce.416b)$ENSEMBL),
                     head(rowData(sce.416b)$SYMBOL))

# 增加基因的染色体编号(方便后面识别线粒体基因)
rowData(sce.416b)$SEQNAME <- mapIds(ens.mm.v97, keys=rownames(sce.416b),
                                    keytype="GENEID", column="SEQNAME")
table(rowData(sce.416b)$SEQNAME=='MT')

# 质控
mito <- which(rowData(sce.416b)$SEQNAME=="MT")
# 先计算各个细胞的QC指标
stats <- perCellQCMetrics(sce.416b, subsets=list(Mt=mito))
colnames(stats)
# 再根据QC指标进行判断,哪些该去除
qc <- quickPerCellQC(stats, percent_subsets=c("subsets_Mt_percent",
    "altexps_ERCC_percent"), batch=sce.416b$block)
sce.416b <- sce.416b[,!qc$discard]

colData(unfiltered) <- cbind(colData(unfiltered), stats)
unfiltered$discard <- qc$discard
gridExtra::grid.arrange(
    plotColData(unfiltered, x="block", y="sum",
        colour_by="discard") + scale_y_log10() + ggtitle("Total count"),
    plotColData(unfiltered, x="block", y="detected",
        colour_by="discard") + scale_y_log10() + ggtitle("Detected features"),
    plotColData(unfiltered, x="block", y="subsets_Mt_percent",
        colour_by="discard") + ggtitle("Mito percent"),
    plotColData(unfiltered, x="block", y="altexps_ERCC_percent",
        colour_by="discard") + ggtitle("ERCC percent"),
    nrow=2,
    ncol=2
)

gridExtra::grid.arrange(
    plotColData(unfiltered, x="sum", y="subsets_Mt_percent",
        colour_by="discard") + scale_x_log10(),
    plotColData(unfiltered, x="altexps_ERCC_percent", y="subsets_Mt_percent",
        colour_by="discard"),
    ncol=2
)
colSums(as.matrix(qc))

# 归一化
set.seed(100)
library(scran)
sce.416b <- computeSumFactors(sce.416b)
sce.416b <- logNormCounts(sce.416b)
summary(sizeFactors(sce.416b))

# 寻找高变基因
## 考虑批次信息
dec.416b <- modelGeneVarWithSpikes(sce.416b, "ERCC", block=sce.416b$block)
chosen.hvgs <- getTopHVGs(dec.416b, prop=0.1)
par(mfrow=c(1,2))
blocked.stats <- dec.416b$per.block
for (i in colnames(blocked.stats)) {
    current <- blocked.stats[[i]]
    plot(current$mean, current$total, main=i, pch=16, cex=0.5,
        xlab="Mean of log-expression", ylab="Variance of log-expression")
    curfit <- metadata(current)
    points(curfit$mean, curfit$var, col="red", pch=16)
    curve(curfit$trend(x), col='dodgerblue', add=TRUE, lwd=2)
}

# 批次矫正
library(limma)
assay(sce.416b, "corrected") <- removeBatchEffect(logcounts(sce.416b),
    design=model.matrix(~sce.416b$phenotype), batch=sce.416b$block)

# 降维
sce.416b <- runPCA(sce.416b, ncomponents=10, subset_row=chosen.hvgs,
    exprs_values="corrected", BSPARAM=BiocSingular::ExactParam())
set.seed(1010)
sce.416b <- runTSNE(sce.416b, dimred="PCA", perplexity=10)

# 聚类
my.dist <- dist(reducedDim(sce.416b, "PCA"))
my.tree <- hclust(my.dist, method="ward.D2")
library(dynamicTreeCut)
my.clusters <- unname(cutreeDynamic(my.tree, distM=as.matrix(my.dist),
    minClusterSize=10, verbose=0))
colLabels(sce.416b) <- factor(my.clusters)

plotTSNE(sce.416b, colour_by="label")


# ”轮廓图“(silhouette width)检查分群的质量
# 对每个细胞都计算一个silhouette width值,如果一个细胞的width值为正并且越大,表示相对于其他亚群的细胞,这个细胞和它所在亚群中的细胞更接近,分群效果越好;如果width为负,就表示这个亚群的这个细胞和其他亚群的细胞更接近,即分群效果不太理想。
library(cluster)
sil <- silhouette(my.clusters, dist = my.dist)
colnames(sil)
clust.col <- scater:::.get_palette("tableau10medium") # 设置颜色
sil.cols <- clust.col[ifelse(sil[,3] > 0, sil[,1], sil[,2])]
sil.cols <- sil.cols[order(-sil[,1], sil[,3])]
plot(sil, main = paste(length(unique(my.clusters)), "clusters"),
    border=sil.cols, col=sil.cols, do.col.sort=FALSE)


# marker基因鉴定
markers <- findMarkers(sce.416b, my.clusters, block=sce.416b$block)
top.markers <- rownames(marker.set)[marker.set$Top <= 10]
plotHeatmap(sce.416b, features=top.markers, order_columns_by="label",
    colour_columns_by=c("label", "block", "phenotype"),
    center=TRUE, symmetric=TRUE, zlim=c(-5, 5))