RNA analysis pipeline

Benchmarking of RNA analysis pipelines

Posted by CHY on October 23, 2020

转录组测序已经是科研领域的家常菜了,但是对于纷繁杂乱的分析软件,哪些pipeline才是最好的呢?《Impact of RNA‑seq data analysis algorithms on gene expression estimation and downstream prediction》给出了初步的benchmarking结果。

文章测评了278个转录组分析pipeline,包括13种mapping软件、3种定量软件和7种标准化方法。
评价指标:accuracy; precision; reliability

RNA-pipeline-software
比对软件分类
RNA-pipeline-result1

具体分析代码

##############################
# Mapping
##############################

# 1. Bowtie (with RSEM quantification)

#Index Preparation
rsem-prepare-reference --gtf [GTF File] [Genome Fasta] [Output Prefix]

#Mapping and Quantification
rsem-calculate-expression --paired-end -p [CPUs] --bowtie-path [Path to Bowtie] --phred33/64-quals --no-bam-output Read_1.fastq Read_2.fastq [RSEM Index] [Output Prefix]

# 2. Bowtie2

#Index Preparation
bowtie2-build [Reference Fasta File] [Reference Index Output]

#Single-Hit Mapping
bowtie2 -p [CPUs] -x [Reference Index (either AceView or HG)] -1 [read1.fastq] -2 [read2.fastq] > [output file.sam]

#Multi-Hit Mapping
bowtie2 -p [CPUs] -k 200 -x [Reference Index (either AceView or HG)] -1 [read1.fastq] -2 [read2.fastq] > [output file.sam]

# 3. BWA

#Index Preparation
bwa_index [Reference Fasta File]

#Mapping
bwa aln -1 -t [CPUs] [Reference Index (either AceView or HG)] read1.fastq > output1.sai
bwa aln -2 -t [CPUs] [Reference Index (either AceView or HG)] read2.fastq > output2.sai
bwa sampe [Reference Index (either AceView or HG)] output1.sai output2.sai read1.fastq read2.fastq > output.sam

# 4.GSNAP

#Index Preparation
gmap_build -D [Reference Index Output Directory] -d [Reference Index Output Name] [Reference Fasta File]

#Spliced Index Preparation (for Spliced Mapping)
iit_store -o transcriptome.iit [Reference Transcriptome GTF]

#Single-Hit Un-Spliced Mapping
gsnap --nthreads [CPUs] -B 5 -D [Reference Index Directory] --format=sam -n 1 -J 33 -d [Reference Index Name] read1.fastq read2.fastq > output.sam

#Multi-Hit Un-Spliced Mapping
gsnap --nthreads [CPUs] -B 5 -D [Reference Index Directory] --format=sam -n 200 -J 33 -d [Reference Index Name] read1.fastq read2.fastq > output.sam

#Single-Hit Spliced Mapping
gsnap --nthreads [CPUs] -B 5 -D [Reference Index Directory] --format=sam -n 1 -J 33 --d [HG Reference Index Name] -s [Transcriptome Spliced Index (*.iit)] --nofails read1.fastq read2.fastq > output.sam

#Multi-Hit Spliced Mapping
gsnap --nthreads [CPUs] -B 5 -D [Reference Index Directory] --format=sam -n 200 -J 33 --d [HG Reference Index Name] -s [Transcriptome Spliced Index (*.iit)] --nofails read1.fastq read2.fastq > output.sam

# 5.MapSplice

#Index Preparation 
Same as Bowtie

#Mapping (SEQC-benchmark Data)
python mapsplice_segments.py -u Read_1.fastq,Read_2.fastq -B [Bowtie Genome Index] -c [Per Contig Genome FASTA Files] -o [Output Directory] [Configuration File]

#Configuration File:
reads_format = FASTQ
segment_mismatches = 1
segment_length = 25
read_length = 100
paired_end = yes
junction_type = non-canonical
full_running = yes
anchor_length = 8
remove_temp_files = yes
remap_mismatches = 2
splice_mismatches = 0
min_intron_length = 70
max_intron_length = 500000
threads = [CPUs]
search_whole_chromosome = no
map_segment_directly = no
run_MapPER = no
do_fusion = no
do_cluster = no

#Mapping (SEQC-application, Neuroblastoma data)
python mapsplice -p [CPUs] -o [Output Directory] --min-intron 70 --max-intron 500000 --bam -c [Per Contig Genome FASTA Files] -x [Bowtie Genome Index] -1 Read_1.fastq -2 Read_2.fastq

# 6.Novoalign

#Index Preparation
novoindex -t [CPUs] [Reference Output Index (*.ndx)] [Reference Fasta File]

#Single-Hit Mapping
novoalign -c [CPUs] -F ILMFQ -o SAM -r Random -d [Reference Index] -f read1.fastq read2.fastq > output.sam

#Multi-Hit Mapping
novoalign -c [CPUs] -F ILMFQ -o SAM -r all -e 200 -d [Reference Index] -f read1.fastq read2.fastq > output.sam

# 7.OSA

#Index Preparation
osa.exe --buildref [Base_Dir] [genome_fasta_file_name] [ref_lib_prefix]
osa.exe --buildgm [Base_Dir] [gtf_file_name] [ref_lib_prefix] [gene_model_prefix]

#Mapping
mono osa.exe --alignrna [OSA Index Directory] [Reference Genome Prefix] [Genome Annotation Prefix] [Configuration File]

#Configuration File:
<Files>
Read_1.fastq.gz
Read_2.fastq.gz
<Options>
ThreadNumber=CPUs
PairedEnd=True
FileFormat=FASTQ
AutoPenalty=True
FixedPenalty=2
Gzip=True
ExpressionMeasurement=TPM
SearchNovelExonJunction=False
GenerateSamFiles=False
WriteReadsInSeparateFiles=False
InsertSizeStandardDeviation=70
ExpectedInsertSize=170
ExcludeUnmappedInBam=True
<Output>
OutputName=OSA_ILM
OutputPath=XYZ

# 8.RUM

#Index Preparation
perl create_indexes_from_ucsc.pl NAME_genome.txt NAME_refseq_ucsc

#Mapping
rum_runner align -i [RUM Genome Index] -o [Output Directory] --name [Sample Name] --chunks [CPUs] Read_1.fastq Read_2.fastq

# 9.STAR

#Index Preparation
STAR --runThreadN 24 --runMode genomeGenerate --genomeDir aceview_seqc --genomeFastaFiles [HG Reference Fasta] --sjdbGTFfile [Transcriptome Reference GTF] --sjdbOverhang 100

#Mapping
STAR --genomeDir [STAR Genome Index] --readFilesIn [Read_1.fastq.gz] [Read_2.fastq.gz] --runThreadN [CPUs] --readFilesCommand zcat --outFileNamePrefix [Output Prefix] --outFilterMultimapNmax 200 --outSAMstrandField intronMotif --outFilterIntronMotifs RemoveNoncanonical --genomeLoad LoadAndKeep

# 10.Subread

#Index Preparation
subread-buildindex -B [HG Reference Index Name] [HG Reference Fasta]

#Mapping
subread-align -r read1.fastq -R read2.fastq -T [CPUs] -P 6 -u -H -i [HG Reference Index] -o output.sam

#featureCounts Quantification
featureCounts -p -T [CPUs] -t exon -g gene_id -a [Transcriptome GTF] -o output.counts input.sam

# 11.TopHat

#Index Preparation
Same as Bowtie2

#Single-Hit Mapping
tophat -p [CPUs] -o [Output Directory] --max-multihits 1 --library-type fr-unstranded --solexa1.3-quals --no-novel-juncs -G [GTF File] --transcriptome-index [Transcriptome Reference Index] [HG Reference Index] [Read_1.fastq.gz] [Read_2.fastq.gz] 

#Multi-Hit Mapping
tophat -p [CPUs] -o [Output Directory] --max-multihits 200 --library-type fr-unstranded --solexa1.3-quals --no-novel-juncs -G [GTF File] --transcriptome-index [Transcriptome Reference Index] [HG Reference Index] [Read_1.fastq.gz] [Read_2.fastq.gz] 

# 12.WHAM

#Index Preparation
wham-build -l 90 [Reference Fasta File] [Reference Index Output]

#Single-Hit Mapping
wham -l 90 -g 1 -X 500 --best -S -t [CPUs] -k 1 -1 read1.fastq -2 read2.fastq [Reference Index] output.sam

#Multi-Hit Mapping
wham -l 90 -g 1 -X 500 --best -S -t [CPUs] -k 200 -1 read1.fastq -2 read2.fastq [Reference Index] output.sam

##############################
# Expression Quantification
##############################
# 1.    HTSeq

samtools view -h [BAM File] | htseq-count --mode=intersection-nonempty --stranded=no --type=exon --idattr=gene_id - [GTF File] > [Output File]

# 2.    Cufflinks

cuffdiff -o [Output Directory] -p [CPUs] -b [Reference Genome FASTA] -u -L [Sample Labels] [GTF Files] [BAM Files]

# 3.    RSEM

rsem-calculate-expression --bam --paired-end -p [CPUs] --no-bam-output [BAM File] [RSEM Index] [Output Prefix]