拟时分析
拟时(pseudotime)分析,又称细胞轨迹(cell trajectory)分析,通过拟时分析可以推断出发育过程细胞的分化轨迹或细胞亚型的演化过程,在发育相关研究中使用频率较高。主要基于关键基因的表达模式,在拟时间中对单个细胞进行排序,模拟出时间发育过程的动态变化。
排序需要关注的在于:对什么排序,如何判断先后顺序,如何寻找分支点
排序离不开降维,降维离不开特征提取。
monocle2
主要分析内容:
- Clustering, classifying, and counting cells
- Constructing single-cell trajectories.
- Differential expression analysis
# 导入数据两种方式
# 第一种:Seurat to monocle2
library(monocle)
library(Seurat)
#Load Seurat object
pbmc <- readRDS('D:\\Users\\Administrator\\Desktop\\RStudio\\single_cell\\filtered_gene_bc_matrices\\hg19pbmc_tutorial.rds')
#Extract data, phenotype data, and feature data from the SeuratObject
data <- as(as.matrix(pbmc@assays$RNA@counts), 'sparseMatrix')
pd <- new('AnnotatedDataFrame', data = pbmc@meta.data)
fData <- data.frame(gene_short_name = row.names(data), row.names = row.names(data))
fd <- new('AnnotatedDataFrame', data = fData)
#Construct monocle cds
monocle_cds <- newCellDataSet(data,
phenoData = pd,
featureData = fd,
lowerDetectionLimit = 0.5,
expressionFamily = negbinomial.size())
# 第二种:直接读入count文件
# 主要是需要准备三个文件,表达矩阵、细胞元数据、基因元数据
HSMM_expr_matrix <- read.table("fpkm_matrix.txt")
HSMM_sample_sheet <- read.delim("cell_sample_sheet.txt")
HSMM_gene_annotation <- read.delim("gene_annotations.txt")
pd <- new("AnnotatedDataFrame", data = HSMM_sample_sheet)
fd <- new("AnnotatedDataFrame", data = HSMM_gene_annotation)
HSMM <- newCellDataSet(as.matrix(HSMM_expr_matrix),
phenoData = pd, featureData = fd)
# 数据过滤QC
### 归一化
HSMM <- estimateSizeFactors(HSMM)
HSMM <- estimateDispersions(HSMM)
### 过滤基因
# 基因至少在三个细胞中表达
HSMM <- detectGenes(HSMM, min_expr = 3 )
print(head(fData(HSMM)))
expressed_genes <- row.names(subset(fData(HSMM),
+ num_cells_expressed >= 10))
print(head(pData(HSMM)))
# 细胞分类
# 根据选择的marker基因的表达来标记细胞
# 聚类
## 首先进行特征选取(可以绘图)
HSMM <- reduceDimension(HSMM, max_components = 2, num_dim = 10,
+ reduction_method = 'tSNE', verbose = T)
Remove noise by PCA ...
Reduce dimension by tSNE ...
HSMM <- clusterCells(HSMM, num_clusters = 2)
Distance cutoff calculated to 2.589424
plot_cell_clusters(HSMM, 1, 2, color = "CellType",
+ markers = c("CDC20", "GABPB2"))
# 构建轨迹
# 基因选择 dpFeature
diff_test_res <- differentialGeneTest(HSMM[expressed_genes,],
fullModelFormulaStr = "~percent.mt")
ordering_genes <- row.names (subset(diff_test_res, qval < 0.1)) ## 不要也写0.1 ,而是要写0.01。
HSMM <- setOrderingFilter(HSMM, ordering_genes)
plot_ordering_genes(HSMM)
# 基于选择的基因降维
HSMM <- reduceDimension(HSMM, max_components = 2,
method = 'DDRTree')
# 排序
HSMM <- orderCells(HSMM)
plot_cell_trajectory(HSMM, color_by = "seurat_clusters")
# 差异分析
STREAM
拟时分析基本要素
- 一个基因表达矩阵
- 特征选择(关键基因:用来确定拟时序和轨迹分支)
- 降维(所谓的排序就是在低维空间排布高维数据)
- 聚类(哪些细胞可以排布到同一个分支中呢?)
- 结构拟合(轨迹基本形状)
- 路径确定以及细胞排序
- 可视化 win版 linux版
Monocle3
monocle3官网 更新:UMAP的添加、3D功能添加、可以包含多个根 按照官方代码分析一遍流程:
#####################
## monocle3安装
#####################
# 1. 安装BiocManager(vision > 3.5)
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install()
# 2.安装辅助包
BiocManager::install(c('BiocGenerics', 'DelayedArray', 'DelayedMatrixStats',
'limma', 'S4Vectors', 'SingleCellExperiment',
'SummarizedExperiment', 'batchelor'))
# 3.安装monocle从github
install.packages("devtools")
devtools::install_github('cole-trapnell-lab/leidenbase')
devtools::install_github('cole-trapnell-lab/monocle3')
library(monocle3)
library(ggplot2)
library(dplyr)
#####################
## 加载数据
#####################
## 普遍加载数据
# Load the data
expression_matrix <- readRDS(url("http://staff.washington.edu/hpliner/data/cao_l2_expression.rds"))
cell_metadata <- readRDS(url("http://staff.washington.edu/hpliner/data/cao_l2_colData.rds"))
gene_annotation <- readRDS(url("http://staff.washington.edu/hpliner/data/cao_l2_rowData.rds"))
# Make the CDS object
cds <- new_cell_data_set(expression_matrix,
cell_metadata = cell_metadata,
gene_metadata = gene_annotation)
## 加载10x数据
cds <- load_cellranger_data("~/Downloads/10x_data")
## 加载大型数据,需要构建稀疏矩阵
cds <- new_cell_data_set(as(umi_matrix, "sparseMatrix"),
cell_metadata = cell_metadata,
gene_metadata = gene_metadata)
#####################
## 数据预处理(如何标准化数据)
#####################
# PCA
cds <- preprocess_cds(cds, num_dim = 100)
plot_pc_variance_explained(cds)
#####################
## 降维可视化
#####################
cds <- reduce_dimension(cds)
plot_cells(cds)
#####################
## 检查去除批次效应
#####################
cds = align_cds(cds, num_dim = 100, alignment_group = "plate")
cds = reduce_dimension(cds)
plot_cells(cds, color_cells_by="plate", label_cell_groups=FALSE)
#####################
## 聚类
#####################
cds = cluster_cells(cds, resolution=1e-5)
plot_cells(cds)
plot_cells(cds, color_cells_by="partition", group_cells_by="partition")
#####################
## 差异表达Marker gene鉴定
#####################
marker_test_res <- top_markers(cds, group_cells_by="partition",
reference_cells=1000, cores=8)
#####################
## 细胞类型注释
#####################
略
#####################
## 轨迹构建
#####################
cds <- learn_graph(cds)
plot_cells(cds,
color_cells_by = "cell.type",
label_groups_by_cluster=FALSE,
label_leaves=FALSE,
label_branch_points=FALSE)
#####################
## 3D轨迹
#####################
cds_3d <- reduce_dimension(cds, max_components = 3)
cds_3d <- cluster_cells(cds_3d)
cds_3d <- learn_graph(cds_3d)
cds_3d <- order_cells(cds_3d, root_pr_nodes=get_earliest_principal_node(cds))
cds_3d_plot_obj <- plot_cells_3d(cds_3d, color_cells_by="partition")
#####################
## 差异表达
#####################
# 两种方法:
# Regression analysis: using fit_models(), you can evaluate whether each gene depends on variables such as time, treatments, etc.
# Graph-autocorrelation analysis: using graph_test(), you can find genes that vary over a trajectory or between clusters.