Smart-seq2
GATK4流程分析–从fastq到vcf
### RNA-seq序列比对
### STAR 2-PASS模式
### star 1-pass index
nohup STAR --runThreadN 8 --runMode genomeGenerate \
--genomeDir /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_STAR_index/One_pass \
--genomeFastaFiles /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_hisat_index/TAIR10_chr_all.fa \
--sjdbGTFfile /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_hisat_index/Arabidopsis_thaliana.TAIR10.48.gtf &
### star 1-pass align
ls *fastq | while read id; do
(nohup STAR --runThreadN 8 --genomeDir /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_STAR_index/One_pass \
--readFilesIn ./${id} \
--readFilesCommand zcat \
--outFileNamePrefix /workcenters/workcenter3/chenhy/Singlecell_SNP/smartseq2/3h_sam/One_align/${id%%.*} &);
done
### star 2-pass index
ls *SJ.out.tab | while read id; do
(nohup STAR --runThreadN 8 --runMode genomeGenerate \
--genomeDir /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_STAR_index/Two_pass \
--genomeFastaFiles /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_hisat_index/TAIR10_chr_all.fa \
--sjdbFileChrStartEnd ./${id} &);
done
### star 2-pass align
ls *fastq | while read id; do
(nohup STAR --runThreadN 8 --genomeDir /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_STAR_index/Two_pass \
--readFilesIn ./${id} \
--readFilesCommand zcat \
--outFileNamePrefix /workcenters/workcenter3/chenhy/Singlecell_SNP/smartseq2/3h_sam/Two_align/${id%%.*} &);
done
### hisat2比对模式
ls *fastq | while read id;do
(nohup hisat2 -p 10 -x /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_hisat_index/TAIR10_chr \
-U ${id} -S /workcenters/workcenter3/chenhy/Singlecell_SNP/smartseq2/3h_sam/hisat2_sam/${id%%.*}.hisat.sam &);
done
### SAM文件处理--picard
### 添加标签,转为BAM格式
ls *sam | while read id;do
(nohup java -jar /workcenters/workcenter1/chenhy/software/picard-tools-1.124/picard.jar AddOrReplaceReadGroups \
I=./${id} \
O=./${id%%.*}.bam \
SO=coordinate \
RGID=${id%%.*} \
RGLB=rna \
RGPL=illumina \
RGPU=hiseq \
RGSM=${id%%.*} & );
done
### 标记duplicate
ls ./sub_bam/*bam | while read id;do
(nohup java -jar /workcenters/workcenter1/chenhy/software/picard-tools-1.124/picard.jar MarkDuplicates \
I=./${id} \
O=./bam_dedup/${id%%.*}_dedup.bam \
CREATE_INDEX=true \
VALIDATION_STRINGENCY=SILENT \
M=./bam_dedup/${id%%.*}_dedup.metrics &);
done
# GATK索引构建
java -jar /workcenters/workcenter1/chenhy/software/picard.jar CreateSequenceDictionary -R ref.fasta
samtools faidx ref.fasta
### 去除比对到内含子区域的read片段
ls *dedup.bam | while read id;do
(nohup java -jar /workcenters/workcenter4/laost/software/GATK/3.7/GenomeAnalysisTK.jar -T SplitNCigarReads \
-R /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_GATK_index/TAIR10_chr_all.fa \
-I ./${id} \
-o ./${id%%.*}_split.bam \
-rf ReassignOneMappingQuality \
-RMQF 255 \
-RMQT 60 \
-U ALLOW_N_CIGAR_READS &);
done
### indel realignment重新比对--可选
java -jar GenomeAnalysisTK.jar -T RealignerTargetCreator \
-R ./genome/chrX.fa \
-I ./star_2pass/ERR188044_dedup_split.bam \
-o ./star_2pass/ERR188044_realign_interval.list \
-known Mills_and_1000G_gold_standard.indels.hg19.sites.vcf
java -jar GenomeAnalysisTK.jar -T IndelRealigner \
-R ./genome/chrX.fa \
-I ./star_2pass/ERR188044_dedup_split.bam \
-known Mills_and_1000G_gold_standard.indels.hg19.sites.vcf \
-o ./star_2pass/ERR188044_realign.bam \
-targetIntervals ./star_2pass/ERR188044_realign_interval.list
### BQSR 碱基质量校正 -- 可选
java -jar GenomeAnalysisTK.jar \
-T BaseRecalibrator \
-R ./genome/chrX.fa \
-I ./star_2pass/ERR188044_realign.bam \
-knownSites 1000G_phase1.snps.high_confidence.hg19.sites.vcf \
-knownSites Mills_and_1000G_gold_standard.indels.hg19.sites.vcf \
-o ./star_2pass/ERR188044_recal_data.table
java -jar GenomeAnalysisTK.jar \
-T PrintReads \
-R ./genome/chrX.fa \
-I ./star_2pass/ERR188044_realign.bam \
-BQSR ./star_2pass/ERR188044_recal_data.table \
-o ./star_2pass/ERR188044_BQSR.bam
### 变异检测
ls *split.bam | while read id;do
( nohup java -jar /workcenters/workcenter4/laost/software/GATK/3.7/GenomeAnalysisTK.jar -T HaplotypeCaller \
-R /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_GATK_index/TAIR10_chr_all.fa \
-I ./${id} \
-dontUseSoftClippedBases \
-stand_call_conf 20.0 \
-o ../../../../bam_vcf/${id%%.*}.vcf &);
done
### 变异过滤
ls *vcf | while read id;do
(nohup java -jar /workcenters/workcenter4/laost/software/GATK/3.7/GenomeAnalysisTK.jar \
-T VariantFiltration \
-R /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_GATK_index/TAIR10_chr_all.fa \
-V ./${id} \
-window 35 \
-cluster 3 \
-filterName FS -filter "FS > 30.0" \
-filterName QD -filter "QD < 2.0" \
-o ./${id%%.*}_filtered.vcf &);
done
TBSP
nohup python /home/chenhy/anaconda3/envs/singlecell_R4.0/lib/python3.8/tbsp-master/tbsp/tbsp.py -i /workcenters/workcenter3/chenhy/Singlecell_SNP/smartseq2/3h_sam/hisat2_sam/bam/filter_vcf/ -o /workcenters/workcenter3/chenhy/Singlecell_SNP/smartseq2/3h_sam/hisat2_sam/bam/tbsp_result &
# GroupCells.txt 根据SNP进行聚类的结果
# SNP_matrix.tsv SNP矩阵
# SNP_matrix.jpg SNP根据cluster绘图
# Trajectory.dat 拟时分析结果,第一列为cluster id;第二列为相对坐标
# Trajectory.jpg 拟时分析图
10X
bam_split
# 针对10x数据的bam文件,可按细胞进行拆分
# https://github.com/10XGenomics/subset-bam
# https://github.com/brentp/hts-nim-tools/issues/5
# https://bioinformatics.stackexchange.com/questions/12868/is-there-a-command-line-tool-to-split-a-sam-bam-file-by-cb-cell-barcode-tag/12869
for id in `cat cell_barcode_choose1.txt`
do
sed -i -e "s/.*/$id/" cell_barcode.txt
new_id=$(echo $id | tr -d '\r\n')
/data1/chy/software/subset-bam_linux -b /data2/chy/singlecell/ara_root1_5/PRJNA517021/PRJNA517021/outs/possorted_genome_bam.bam -c /data2/chy/singlecell/ara_root1_5/PRJNA517021/SNP_tbsp/cell_barcode.txt -o ./sub_bam/${new_id}.bam --cores 2 &
done
对bam文件进行sort排序处理
ls *bam | while read id; do
nohup java -jar /workcenters/workcenter1/chenhy/software/picard.jar SortSam \
-INPUT ${id} \
-OUTPUT ../../bam_sort/${id%%.*}.sort.bam \
-SORT_ORDER coordinate &
done
对bam文件进行加头(head)处理
ls *bam | while read id; do
nohup java -jar /workcenters/workcenter1/chenhy/software/picard.jar AddOrReplaceReadGroups -I ${id} -O ../../bam_head/${id%%.*}.addhead.bam -ID ${id%%.*}ID -LB ${id%%.*}ID -PL illumine -PU ${id%%.*}PU -SM ${id%%.*} &
done
# 示例测试
nohup java -jar /workcenters/workcenter1/chenhy/software/picard.jar AddOrReplaceReadGroups -I AAACCTGAGGGCACTA-1.sort.bam \
-O ../bam_head/AAACCTGAGGGCACTA-1.addhead.bam \
-ID AAACCTGAGGGCACTA-1ID \
-LB AAACCTGAGGGCACTA-1ID \
-PL illumine \
-PU AAACCTGAGGGCACTA-1PU \
-SM AAACCTGAGGGCACTA-1 &
Duplicates Marking
# 如果两条reads具有相同的长度而且比对到了基因组的同一位置,那么就认为这样的reads是由PCR扩增而来,就会被GATK标记。
ls *bam | while read id; do
nohup java -jar /workcenters/workcenter1/chenhy/software/picard.jar MarkDuplicates --REMOVE_DUPLICATES false --MAX_FILE_HANDLES_FOR_READ_ENDS_MAP 800 --INPUT ${id} --OUTPUT ../../bam_dedup/${id%%.*}.dedup.bam --METRICS_FILE ../../bam_dedup/bam_file/bam_dedup7/${id%%.*}.dedup.metrics &
done
对上一步得到的结果生成索引文件–samtools
ls *bam | while read id; do
samtools index ${id} &
done
去除比对到内含子区域的read片段
ls *dedup.bam | while read id;do
(nohup java -jar /workcenters/workcenter4/laost/software/GATK/3.7/GenomeAnalysisTK.jar -T SplitNCigarReads \
-R /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_GATK_index/Chr_ara/TAIR10_chr_all.fa \
-I ./${id} \
-o ./${id%%.*}_split.bam \
-rf ReassignOneMappingQuality \
-RMQF 255 \
-RMQT 60 \
-U ALLOW_N_CIGAR_READS &);
done
### 变异检测
ls *split.bam | while read id;do
( nohup java -jar /workcenters/workcenter4/laost/software/GATK/3.7/GenomeAnalysisTK.jar -T HaplotypeCaller \
-R /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_GATK_index/Chr_ara/TAIR10_chr_all.fa \
-I ./${id} \
-dontUseSoftClippedBases \
-stand_call_conf 20.0 \
-o ../../../bam_vcf/${id%%.*}.vcf &);
done
### 变异过滤
ls *vcf | while read id;do
(nohup java -jar /workcenters/workcenter4/laost/software/GATK/3.7/GenomeAnalysisTK.jar \
-T VariantFiltration \
-R /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_GATK_index/Chr_ara/TAIR10_chr_all.fa \
-V ./${id} \
-window 35 \
-cluster 3 \
-filterName FS -filter "FS > 30.0" \
-filterName QD -filter "QD < 2.0" \
-o ../../vcf_filter/${id%%.*}_filtered.vcf &);
done
TBSP
nohup python /home/chenhy/anaconda3/envs/singlecell_R4.0/lib/python3.8/tbsp-master/tbsp/tbsp.py -i /workcenters/workcenter3/chenhy/Singlecell_SNP/10XGENOMICS/vcf_filter/vcf -o /workcenters/workcenter3/chenhy/Singlecell_SNP/10XGENOMICS/tbsp_result &
参考链接
GATK calling variants in RNA-seq
RNA-seq 检测变异之 GATK 最佳实践流程
GATK使用方法详解(原始数据的处理)
GATK使用方法详解(变异检测)
Picard中文使用手册