ILoReg

单细胞rare cell鉴定分析

Posted by CHY on June 23, 2020

ILoReg并不像常规分析流程一样通过选择feature来进行降维,而是采用probabilistic feature extraction,利用ICP算法运行多次,生成多个概率矩阵(N x K,N指细胞数,K指每个细胞属于特定cluster的概率)后最终联合成一个矩阵,再进行PCA分析将维度将至二维。输入文件为根据文库大小标准化后的数据,with genes/features in rows and cells/samples in columns。输入文件可以为多种格式 matrix, data.frame or dgCMatrix class。

ILoReg

# 加载R包
library(devtools)
devtools::install_github("elolab/iloreg")
suppressMessages(library(ILoReg))
suppressMessages(library(SingleCellExperiment))
suppressMessages(library(cowplot))

# The dataset was normalized using the LogNormalize method from the Seurat R package.
sce <- SingleCellExperiment(assays = list(logcounts = pbmc3k_500))
sce <- PrepareILoReg(sce)

# Run the ICP clustering algorithm $L$ times
# 参数说明:
# k:给定初始类群数
# d: 取值0~1,规定每个cluster中多少比例的细胞用作训练数据集,一般设定为0.3,过高过低一定程度上影响结果。
# C:默认值为0.3,C值越低,L1-regularized feature selection越严格。(正数)
# r: 默认值为5,表示在ICP算法停止之前执行的最大重复次数。
# L: 默认值为200,ICP算法运行的最大次数
# type: 指定回归的种类
# threads: 线程数
set.seed(1)
sce <- RunParallelICP(object = sce, k = 15,
                      d = 0.3, L = 30, 
                      r = 5, C = 0.3, 
                      reg.type = "L1", threads = 0)
saveRDS(sce,file = "~/pbmc3k_500_iloreg.rds")

# 质控措施
# Projection accuracy 越接近1越好,过滤要求0.5以上
# A higher accuracy can be achieved using lower values of k, C and higher values of d and r.
# ARI similarity
# A higher d leads to a lower similarity and a smaller k to a higher similarity.
VisualizeQC(sce,return.plot = F)

# PCA transformation of the joint probability matrix
sce <- RunPCA(sce,p=50,scale = FALSE,threshold = 0)
PCAElbowPlot(sce)  # 拐点图

# 可视化
sce <- RunUMAP(sce)
sce <- RunTSNE(sce,perplexity=30)
GeneScatterPlot(sce,c("CD3D","CD79A","CST3","FCER1A"),
                dim.reduction.type = "umap",
                point.size = 0.3)
GeneScatterPlot(sce,c("CD3D","CD79A","CST3","FCER1A"),
                dim.reduction.type = "tsne",
                point.size = 0.3)

# 层次聚类
sce <- HierarchicalClustering(sce)

# Extracting a consensus clustering with K clusters共识聚类
sce <- SelectKClusters(sce,K=13)
plot_grid(ClusteringScatterPlot(sce,
                                dim.reduction.type = "umap",
                                return.plot = T,
                                title = "UMAP",
                                show.legend=FALSE),
          ClusteringScatterPlot(sce,
                                dim.reduction.type = "tsne",
                                return.plot = T
                                ,title="t-SNE",
                                show.legend=FALSE),
          ncol = 1
)

# Marker gene
gene_markers <- FindAllGeneMarkers(sce,
                                   clustering.type = "manual",
                                   test = "wilcox",
                                   log2fc.threshold = 0.25,
                                   min.pct = 0.25,
                                   min.diff.pct = NULL,
                                   pseudocount.use = 1,
                                   min.cells.group = 3,
                                   return.thresh = 0.01,
                                   only.pos = TRUE,
                                   max.cells.per.cluster = NULL)

# Marker gene绘图
top10_log2FC <- SelectTopGenes(gene_markers,
                               top.N = 10,
                               criterion.type = "log2FC",
                               reverse = FALSE)
top1_log2FC <- SelectTopGenes(gene_markers,
                              top.N = 1,
                              criterion.type = "log2FC",
                              reverse = FALSE)
top10_adj.p.value <- SelectTopGenes(gene_markers,
                                    top.N = 10,
                                    criterion.type = "adj.p.value",
                                    reverse = TRUE)
top1_adj.p.value <- SelectTopGenes(gene_markers,
                                   top.N = 1,
                                   criterion.type = "adj.p.value",
                                   reverse = TRUE)
GeneScatterPlot(sce,
                genes = unique(top1_log2FC$gene),
                dim.reduction.type = "tsne",
                point.size = 0.5,ncol=2)
GeneHeatmap(sce,
            clustering.type = "manual",
            gene.markers = top10_log2FC)

# cluster重命名
sce <- RenameAllClusters(sce,
                         new.cluster.names = c("GZMK+/CD8+ T cells",
                                               "IGKC+ B cells",
                                               "Naive CD4+ T cells",
                                               "NK cells",
                                               "CD16+ monocytes",
                                               "CD8+ T cells",
                                               "CD14+ monocytes",
                                               "IGLC+ B cells",
                                               "Intermediate monocytes",
                                               "IGKC+/IGLC+ B cells",
                                               "Memory CD4+ T cells",
                                               "Naive CD8+ T cells",
                                               "Dendritic cells"))
GeneHeatmap(sce,gene.markers = top10_log2FC)
VlnPlot(sce,genes = c("CD3D"),return.plot = F,rotate.x.axis.labels = T)

# 对一个合适的cluster数进行预估
sce <- CalculateSilhouetteInformation(sce,K.start = 2,K.end = 50)
SilhouetteCurve(sce,return.plot = F)

# 对cluster进行重命名
sce <- SelectKClusters(sce,K=20)
# Rename cluster 1 as A
sce <- RenameCluster(sce,old.cluster.name = 1,new.cluster.name = "A")

# 自定义可视化
# Select a clustering with K=20 clusters
sce <- SelectKClusters(sce,K=5)
# Generate a custom annotation with K=5 clusters and change the names to the five first alphabets.
custom_annotation <- plyr::mapvalues(metadata(sce)$iloreg$clustering.manual,c(1,2,3,4,5),LETTERS[1:5])
# Visualize the annotation
AnnotationScatterPlot(sce,
                      annotation = custom_annotation,
                      return.plot = F,
                      dim.reduction.type = "tsne",
                      show.legend = FALSE)

# cluster合并
sce <- SelectKClusters(sce,K=20)
sce <- MergeClusters(sce,clusters.to.merge  = c(3,4))

# Marker gene鉴定
GM_dendritic <- FindGeneMarkers(sce,
                                clusters.1 = "Dendritic cells",
                                logfc.threshold = 0.25,
                                min.pct = 0.25,
                                return.thresh = 0.01,
                                only.pos = TRUE)

iterative clustering projection (ICP) clustering algorithm

在ILoReg中,主要是运行ICP多次,生成多个概率矩阵,概率矩阵主要记录每个细胞在每个分群中的可能性(each of the N cells belonging to the k clusters.)。具体原理还不太明白。