scater

单细胞软件scater运行脚本

Posted by CHY on June 30, 2020

scater主要对单细胞数据进行预处理和质控,为下游分析提供高质量的表达数据。 scater_qc_workflow

rm(list=ls())
options(stringsAsFactors = F)

# 导入包
suppressMessages(library(scater))
suppressMessages(library(scRNAseq))

# 载入示例数据
data("sc_example_counts")
data("sc_example_cell_info")
# 示例数据查看
head(sc_example_counts)[1:3,1:3]
head(sc_example_cell_info)[1:3,]

# 构建 SingleCellExperiment 对象
example_sce <- SingleCellExperiment(
  assays = list(counts = sc_example_counts),  # 建议以counts作为输入矩阵名称
  colData = sc_example_cell_info
)
example_sce
str(counts(example_sce))

# isSpike 对spike-in操作
# sizeFactors 对数据标准化时计算细胞文库大小
# reduceDim对降维结果进行操作
# 过滤掉不表达的基因
keep_feature <- rowSums(counts(example_sce) > 0) > 0
example_sce <- example_sce[keep_feature,]

# 计算CPM值存储到example_sce对象中cpm中去
cpm(example_sce) <- calculateCPM(example_sce)
# 归一化,log2转换,计算结果存储在logcounts中
example_sce <- normalize(example_sce)
assayNames(example_sce)
# 归一化后计算每个基因平均表达量
head(calcAverage(example_sce))

# 创建新的矩阵
assay(example_sce,"is_expr") <- counts(example_sce) > 0 

# 数据可视化方面
plotExpression(example_sce,rownames(example_sce)[1:6])  # 默认使用标准化后的logcounts值
plotExpression(example_sce,rownames(example_sce)[1:6],x = "Mutation_Status",exprs_values = "logcounts") # 分组比较基因表达
plotExpression(example_sce,rownames(example_sce)[1:6],colour_by = "Cell_Cycle",shape_by = "Mutation_Status",size_by = "Gene_0002",show_median = TRUE) # 定义大小颜色等

# PCA降维
example_sce <- runPCA(example_sce)
reducedDimNames(example_sce)
plotReducedDim(example_sce, use_dimred = "PCA", 
               colour_by = "Treatment", shape_by = "Mutation_Status")
set.seed(1000)
example_sce <- runTSNE(example_sce, perplexity=10) # tSNE降维绘图
plotTSNE(example_sce, colour_by = "Gene_0001", size_by = "Gene_1000")
example_sce <- runDiffusionMap(example_sce)  # UMAP降维绘图
plotDiffusionMap(example_sce, colour_by = "Gene_0001", size_by = "Gene_1000")

### 质控
# 计算 SingleCellExperiment 对象中每个特征和细胞的统计指标,存在colData和rowData中
example_sce <- calculateQCMetrics(example_sce)
colnames(colData(example_sce)) # 样本层面质控指标
colnames(rowData(example_sce)) # feature层面质控指标

# 细胞层面关注的指标
# total_counts:每个细胞中总表达量(文库大小)
# total_features_by_counts:细胞中超过规定阈值(默认是0)的feature数量
# pct_counts_X:属于feature control组(也就是这里的X)的表达量占比
# feature方面关注的指标
# mean_counts:基因/feature的平均表达量
# pct_dropout_by_counts:基因在细胞中表达量为0的细胞数占比(该基因丢失率)
# pct_counts_Y:属于cell control组的表达量占比
# log10_mean_counts:归一化 log10 scale
# n_cells_by_counts:多少个细胞表达了该基因

# 绘制高表达基因
plotHighestExprs(example_sce, exprs_values = "counts")

# 表达频率比均值
plotExprsFreqVsMean(example_sce)

# 总feature表达量 vs feature control(ERCC)占比
plotColData(example_sce, x = "total_features_by_counts",
            y = "pct_counts_feature_control", colour = "Mutation_Status") +
  theme(legend.position = "top") +
  stat_smooth(method = "lm", se = FALSE, size = 2, fullrange = TRUE)

# 表达量累计贡献图
plotScater(example_sce, block1 = "Mutation_Status", block2 = "Treatment",
           colour_by = "Cell_Cycle", nfeatures = 300, exprs_values = "counts")

# 设置阈值进行筛选(根据MAD值进行筛选)
keep.total <- isOutlier(example_sce$total_counts, nmads=3, 
                        type="lower", log=TRUE)
filtered <- example_sce[,keep.total]