一个RNAseq完整数据分析脚本

无参转录组的分析案例

Posted by CHY on August 10, 2020

使用 hisat2 比对,featureCounts 进行 reads 计数,使用 DESeq2 包进行定量。从测序数据比对,到得到差异表达基因,再到对差异表达可视化以及对差异表达基因进行功能注释。

准备工作

准备工作主要分成三部分,安装生物软件,下载对应的参考序列以及 gtf 文件,以及安装 R 相关的包。

1 软件安装
conda install -y hisat2 subread
2 参考序列以及GTF文件下载
http://asia.ensembl.org/Homo_sapiens/Info/Index
3、安装Bioconductor对应的包
BiocManager::install("rnaseqGene")

hisat2 比对

#!/usr/bin/env bash
#定义参考序列以及输入文件路径
IDX=/ifs1/Database/ensembl/release-75/homo_sapiens//Homo_sapiens.GRCh37.75.dna.primary_assembly.fa
DIR=/ifs1/Sequencing/airway
RUNLOG=runlog.txt

 #定义输入文件和输出文件名
# Iterate over each sample
for SAMPLE in SRX384345 SRX384346 SRX384349 SRX384350 SRX384353 SRX384354 SRX384357 SRX384358;
do
# Iterate over each of the replicates.
R1=${DIR}/${SAMPLE}_1.fastq.gz
R2=${DIR}/${SAMPLE}_2.fastq.gz
BAM=${SAMPLE}.bam

# 开始循环比对
hisat2 -p 2 --dta $IDX -1 $R1 -2 $R2 2>> $RUNLOG | samtools sort > $BAM 2>> $RUNLOG
samtools index $BAM
done

featureCounts 计数

#!/usr/bin/env bash
GTF=/ifs1/Database/ensembl/release-75/homo_sapiens/Homo_sapiens.GRCh37.75.gtf
DIR=/ifs1/Sequencing/airway/bam
featureCounts -a $GTF  -g gene_name -o counts.txt $DIR/*.bam
#提取文件对应的列
cut -f 1,7-12 counts.txt |grep -v "#" >CountMatrix.csv

DESeq2 包差异表达分析

# 生成DESeqDataSeq对象
#设置工作目录,所有文件放到此目录下
setwd(“~/RNAseq”)
library("DESeq2")
#通过Read Count矩阵来生成DESeqDataSeq对象----------------------
countdata <- read.csv("CountMatrix.csv",row.names = 1)
head(countdata, 10)
coldata <- read.csv("sample_table.csv",row.names = 1)

#关键步骤
dds <- DESeqDataSetFromMatrix(countData = countdata,colData = coldata,design = ~ cell + dex)

#查看DESeqDataSet对象
dim(dds)
assay(dds)
assayNames(dds)
colSums(assay(dds))
rowRanges(dds)
colData(dds)
#过滤没有reads比对上的基因,所有reads数为零
nrow(dds)
dds <- dds[rowSums(counts(dds)) > 1,]
nrow(dds)

# 将数据通过rolg方法与vst方法转换,这样可以用于后面计算距离矩阵
## ----rlog方法-------
rld <- rlog(dds, blind = FALSE)
head(assay(rld), 3)

## ----vst方法------
vsd <- vst(dds, blind = FALSE)
head(assay(vsd), 3)

#利用转换后的结果计算样品之间距离关系
#方法1--欧氏距离--------------------------
sampleDists <- dist(t(assay(rld)))
sampleDists

library("pheatmap")
library("RColorBrewer")

## ----distheatmap, fig.width = 6.1, fig.height = 4.5-----
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( rld$dex, rld$cell, sep = " - " )
colnames(sampleDistMatrix) <- NULL
pheatmap(sampleDistMatrix,clustering_distance_rows = sampleDists,
         clustering_distance_cols = sampleDists)

## PCA-----------
plotPCA(rld, intgroup = c("dex", "cell"))
pcaData <- plotPCA(rld, intgroup = c( "dex", "cell"), returnData = TRUE)
pcaData
percentVar <- round(100 * attr(pcaData, "percentVar"))
library(ggplot2)
ggplot(pcaData, aes(x = PC1, y = PC2, color = dex, shape = cell)) +
  geom_point(size =3) +  xlab(paste0("PC1: ", percentVar[1], "% variance")) +   ylab(paste0("PC2: ", percentVar[2], "% variance")) +  coord_fixed()

# 差异基因筛选
# 差异表达计算
dep <- DESeq(dds)
res <- results(dep)
res
write.csv(x = res,file = "des.csv")

#筛选出p值小于0.05的基因
res.05 <- results(dep, alpha = 0.05)
table(res.05$padj < 0.05)

#统计p值小于0.05差异表达基因数目
sum(res$pvalue < 0.05, na.rm=TRUE)
sum(!is.na(res$pvalue))
sum(res$padj < 0.1, na.rm=TRUE)

#筛选出差异表达明显的基因Significant,设定标准为p值小于0.01,至于使用0.05还是0.01,具体问题具体分析
resSig <- subset(res, padj < 0.1)
#按log2差异倍数排序,先升序,设置decreasing = TRUE降序
head(resSig[ order(resSig$log2FoldChange), ])
head(resSig[ order(resSig$log2FoldChange, decreasing = TRUE), ])
write.csv(dta, file = "results.csv")

# 可视化
m <- read.csv("des.csv",header = T,row.names = 1)
head(m)
m <- na.omit(m)
m <- transform(m,padj=-1*log10(m$padj))
down <- m[m$log2FoldChange<=-1,]
up <- m[m$log2FoldChange>=1,]
no <- m[m$log2FoldChange>-1 & m$log2FoldChange <1,]

plot(no$log2FoldChange,no$padj,xlim = c(-10,10),ylim=c(0,100),col="blue",pch=16,cex=0.8,main = "Gene Expression",xlab = "log2FoldChange",ylab="-log10(Qvalue)")
points(up$log2FoldChange,up$padj,col="red",pch=16,cex=0.8)
points(down$log2FoldChange,down$padj,col="green",pch=16,cex=0.8)

# clusterProfiler包进行注释
#加载各种包,如果加载失败就自行安装,注意除了前两个,都使用BiocManager::install()进行安装。
library(dplyr)
library(tidyr)
library(DOSE)
library(GO.db)
library(org.Hs.eg.db)
library(GSEABase)
library(clusterProfiler)
#读入文件,这里选择上一步保存的q值小于等于0.05的基因来做
dta <- read.csv("res0.05.csv",header = T,row.names = 1,stringsAsFactors = F)
x <- genelist
keytypes(org.Hs.eg.db)
## ------------------------------------------------------------------------
x <- rownames(dta)
length(x)
keytypes(org.Hs.eg.db)
#进行各种ID的匹配,此步骤只是用来练手
eg <-  bitr(x, fromType="ENSEMBL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
head(eg)
ids <- bitr(x, fromType="ENSEMBL", toType=c( "SYMBOL"), OrgDb="org.Hs.eg.db")
head(ids)
go <-  bitr(x,fromType = "ENSEMBL",toType = c("SYMBOL","GO","ONTOLOGY"),OrgDb = "org.Hs.eg.db")
head(go)

## GO功能注释,输入Gene ID 为GI号,如果输入原始ENSEMBL的ID也可以,需要单独在加选项参数keyType指定
gene <- eg$ENTREZID
gene.df <- bitr(gene, fromType = "ENTREZID",toType = c("ENSEMBL", "SYMBOL"),OrgDb = org.Hs.eg.db)
head(gene.df)
ggo <- groupGO(gene= gene,OrgDb = org.Hs.eg.db, ont = "MF", level    = 3,readable = TRUE)
head(ggo)
#得到GO注释的结果还不够,因为不一定说明就执行了功能,还需要做富集分析,进行超几何检验,关于超几何检验,请自行查阅相关资料
ego <- enrichGO(gene = gene,OrgDb = org.Hs.eg.db, ont  = "CC",pAdjustMethod = "BH",                               readable = TRUE)
head(ego)

## GO功能富集可视化
barplot(ggo, drop=TRUE, showCategory=12)

## ----fig.height=5, fig.width=8-------------------------------------------
barplot(ego, showCategory=15)
dotplot(ego)

## categorySize can be scaled by 'pvalue' or 'geneNum'
cnetplot(ego, categorySize="pvalue", foldChange=geneList)
goplot(ego)

## KEGG富集分析
search_kegg_organism('ece', by='kegg_code')
ecoli <- search_kegg_organism('Escherichia coli', by='scientific_name')
dim(ecoli)
head(ecoli)


kk <- enrichKEGG(gene= gene, organism= 'hsa',  pvalueCutoff = 0.05)
head(kk)

## ----eval = FALSE--------------------------------------------------------
mkk <- enrichMKEGG(gene = gene,organism = 'hsa')
browseKEGG(kk, 'hsa04110')
barplot(kk)
dotplot(kk)