单细胞SNP

单细胞SNP相关分析流程

Posted by CHY on October 26, 2020

Smart-seq2

GATK4流程分析–从fastq到vcf

### RNA-seq序列比对
### STAR 2-PASS模式

### star 1-pass index
nohup STAR --runThreadN 8 --runMode genomeGenerate \
        --genomeDir /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_STAR_index/One_pass \
        --genomeFastaFiles /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_hisat_index/TAIR10_chr_all.fa \
        --sjdbGTFfile /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_hisat_index/Arabidopsis_thaliana.TAIR10.48.gtf  &
 
### star 1-pass align
ls *fastq | while read id; do
(nohup STAR --runThreadN 8 --genomeDir /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_STAR_index/One_pass \
        --readFilesIn ./${id} \
        --readFilesCommand zcat \
        --outFileNamePrefix /workcenters/workcenter3/chenhy/Singlecell_SNP/smartseq2/3h_sam/One_align/${id%%.*} &);
done

### star 2-pass index
ls *SJ.out.tab | while read id; do
(nohup STAR --runThreadN 8 --runMode genomeGenerate \
        --genomeDir /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_STAR_index/Two_pass \
        --genomeFastaFiles /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_hisat_index/TAIR10_chr_all.fa \
        --sjdbFileChrStartEnd ./${id} &);
done

### star 2-pass align
ls *fastq | while read id; do
(nohup STAR --runThreadN 8 --genomeDir /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_STAR_index/Two_pass \
        --readFilesIn ./${id} \
        --readFilesCommand zcat \
        --outFileNamePrefix /workcenters/workcenter3/chenhy/Singlecell_SNP/smartseq2/3h_sam/Two_align/${id%%.*} &);
done
### hisat2比对模式
ls *fastq | while read id;do 
(nohup hisat2 -p 10 -x /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_hisat_index/TAIR10_chr \
-U ${id} -S /workcenters/workcenter3/chenhy/Singlecell_SNP/smartseq2/3h_sam/hisat2_sam/${id%%.*}.hisat.sam &);
done
### SAM文件处理--picard
### 添加标签,转为BAM格式
ls  *sam | while read id;do
(nohup java -jar /workcenters/workcenter1/chenhy/software/picard-tools-1.124/picard.jar AddOrReplaceReadGroups \
        I=./${id} \
        O=./${id%%.*}.bam \
        SO=coordinate \
        RGID=${id%%.*} \
        RGLB=rna \
        RGPL=illumina \
        RGPU=hiseq \
        RGSM=${id%%.*} & );
done

### 标记duplicate
ls ./sub_bam/*bam | while read id;do
(nohup java -jar /workcenters/workcenter1/chenhy/software/picard-tools-1.124/picard.jar MarkDuplicates \
        I=./${id} \
        O=./bam_dedup/${id%%.*}_dedup.bam  \
        CREATE_INDEX=true \
        VALIDATION_STRINGENCY=SILENT \
        M=./bam_dedup/${id%%.*}_dedup.metrics &);
done
# GATK索引构建
java -jar /workcenters/workcenter1/chenhy/software/picard.jar CreateSequenceDictionary -R ref.fasta
samtools faidx ref.fasta 
### 去除比对到内含子区域的read片段
ls *dedup.bam | while read id;do
(nohup java -jar /workcenters/workcenter4/laost/software/GATK/3.7/GenomeAnalysisTK.jar -T SplitNCigarReads \
        -R /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_GATK_index/TAIR10_chr_all.fa \
        -I ./${id} \
        -o ./${id%%.*}_split.bam \
        -rf ReassignOneMappingQuality \
        -RMQF 255 \
        -RMQT 60 \
        -U ALLOW_N_CIGAR_READS &);
done
### indel realignment重新比对--可选
java -jar GenomeAnalysisTK.jar -T RealignerTargetCreator \
        -R ./genome/chrX.fa \
        -I ./star_2pass/ERR188044_dedup_split.bam \
        -o ./star_2pass/ERR188044_realign_interval.list \
        -known Mills_and_1000G_gold_standard.indels.hg19.sites.vcf 
java -jar GenomeAnalysisTK.jar -T IndelRealigner \
        -R ./genome/chrX.fa \
        -I ./star_2pass/ERR188044_dedup_split.bam \
        -known Mills_and_1000G_gold_standard.indels.hg19.sites.vcf \
        -o ./star_2pass/ERR188044_realign.bam \
        -targetIntervals ./star_2pass/ERR188044_realign_interval.list
### BQSR 碱基质量校正 -- 可选
java -jar GenomeAnalysisTK.jar \
        -T BaseRecalibrator \
        -R ./genome/chrX.fa \
        -I ./star_2pass/ERR188044_realign.bam \
        -knownSites 1000G_phase1.snps.high_confidence.hg19.sites.vcf \
        -knownSites Mills_and_1000G_gold_standard.indels.hg19.sites.vcf \
        -o ./star_2pass/ERR188044_recal_data.table
java -jar GenomeAnalysisTK.jar  \
        -T PrintReads \
        -R ./genome/chrX.fa \
        -I ./star_2pass/ERR188044_realign.bam \
        -BQSR ./star_2pass/ERR188044_recal_data.table \
        -o ./star_2pass/ERR188044_BQSR.bam
### 变异检测
ls *split.bam | while read id;do
( nohup java -jar /workcenters/workcenter4/laost/software/GATK/3.7/GenomeAnalysisTK.jar -T HaplotypeCaller \
        -R /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_GATK_index/TAIR10_chr_all.fa \
        -I ./${id} \
        -dontUseSoftClippedBases \
        -stand_call_conf 20.0 \
        -o ../../../../bam_vcf/${id%%.*}.vcf &);
done
### 变异过滤
ls *vcf | while read id;do
(nohup java -jar /workcenters/workcenter4/laost/software/GATK/3.7/GenomeAnalysisTK.jar \
        -T VariantFiltration \
        -R /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_GATK_index/TAIR10_chr_all.fa \
        -V ./${id} \
        -window 35 \
        -cluster 3 \
        -filterName FS -filter "FS > 30.0" \
        -filterName QD -filter "QD < 2.0" \
        -o ./${id%%.*}_filtered.vcf &);
done

TBSP

nohup python /home/chenhy/anaconda3/envs/singlecell_R4.0/lib/python3.8/tbsp-master/tbsp/tbsp.py -i /workcenters/workcenter3/chenhy/Singlecell_SNP/smartseq2/3h_sam/hisat2_sam/bam/filter_vcf/ -o /workcenters/workcenter3/chenhy/Singlecell_SNP/smartseq2/3h_sam/hisat2_sam/bam/tbsp_result &

# GroupCells.txt  根据SNP进行聚类的结果
# SNP_matrix.tsv  SNP矩阵
# SNP_matrix.jpg  SNP根据cluster绘图
# Trajectory.dat  拟时分析结果,第一列为cluster id;第二列为相对坐标
# Trajectory.jpg  拟时分析图

10X

bam_split

# 针对10x数据的bam文件,可按细胞进行拆分
# https://github.com/10XGenomics/subset-bam
# https://github.com/brentp/hts-nim-tools/issues/5
# https://bioinformatics.stackexchange.com/questions/12868/is-there-a-command-line-tool-to-split-a-sam-bam-file-by-cb-cell-barcode-tag/12869

for id in `cat cell_barcode_choose1.txt`
do
sed -i -e "s/.*/$id/" cell_barcode.txt
new_id=$(echo $id | tr -d '\r\n')
/data1/chy/software/subset-bam_linux -b /data2/chy/singlecell/ara_root1_5/PRJNA517021/PRJNA517021/outs/possorted_genome_bam.bam -c /data2/chy/singlecell/ara_root1_5/PRJNA517021/SNP_tbsp/cell_barcode.txt -o ./sub_bam/${new_id}.bam --cores 2 &
done

对bam文件进行sort排序处理

ls *bam | while read id; do
nohup java -jar /workcenters/workcenter1/chenhy/software/picard.jar SortSam \
-INPUT ${id} \
-OUTPUT ../../bam_sort/${id%%.*}.sort.bam \
-SORT_ORDER coordinate &
done

对bam文件进行加头(head)处理

ls *bam | while read id; do
nohup java -jar /workcenters/workcenter1/chenhy/software/picard.jar AddOrReplaceReadGroups -I ${id} -O ../../bam_head/${id%%.*}.addhead.bam -ID ${id%%.*}ID -LB ${id%%.*}ID -PL illumine -PU ${id%%.*}PU -SM ${id%%.*} &
done
# 示例测试
nohup java -jar /workcenters/workcenter1/chenhy/software/picard.jar AddOrReplaceReadGroups -I AAACCTGAGGGCACTA-1.sort.bam \
-O ../bam_head/AAACCTGAGGGCACTA-1.addhead.bam \
-ID AAACCTGAGGGCACTA-1ID \
-LB AAACCTGAGGGCACTA-1ID \
-PL illumine \
-PU AAACCTGAGGGCACTA-1PU \
-SM AAACCTGAGGGCACTA-1 &

Duplicates Marking

# 如果两条reads具有相同的长度而且比对到了基因组的同一位置,那么就认为这样的reads是由PCR扩增而来,就会被GATK标记。
ls *bam | while read id; do
nohup java -jar /workcenters/workcenter1/chenhy/software/picard.jar MarkDuplicates --REMOVE_DUPLICATES false --MAX_FILE_HANDLES_FOR_READ_ENDS_MAP 800 --INPUT ${id} --OUTPUT ../../bam_dedup/${id%%.*}.dedup.bam --METRICS_FILE ../../bam_dedup/bam_file/bam_dedup7/${id%%.*}.dedup.metrics &
done

对上一步得到的结果生成索引文件–samtools

ls *bam | while read id; do
samtools index ${id} &
done

去除比对到内含子区域的read片段

ls *dedup.bam | while read id;do
(nohup java -jar /workcenters/workcenter4/laost/software/GATK/3.7/GenomeAnalysisTK.jar -T SplitNCigarReads \
        -R /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_GATK_index/Chr_ara/TAIR10_chr_all.fa \
        -I ./${id} \
        -o ./${id%%.*}_split.bam \
        -rf ReassignOneMappingQuality \
        -RMQF 255 \
        -RMQT 60 \
        -U ALLOW_N_CIGAR_READS &);
done
### 变异检测
ls *split.bam | while read id;do
( nohup java -jar /workcenters/workcenter4/laost/software/GATK/3.7/GenomeAnalysisTK.jar -T HaplotypeCaller \
        -R /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_GATK_index/Chr_ara/TAIR10_chr_all.fa \
        -I ./${id} \
        -dontUseSoftClippedBases \
        -stand_call_conf 20.0 \
        -o ../../../bam_vcf/${id%%.*}.vcf &);
done
### 变异过滤
ls *vcf | while read id;do
(nohup java -jar /workcenters/workcenter4/laost/software/GATK/3.7/GenomeAnalysisTK.jar \
        -T VariantFiltration \
        -R /workcenters/workcenter3/chenhy/Singlecell_SNP/ara_GATK_index/Chr_ara/TAIR10_chr_all.fa \
        -V ./${id} \
        -window 35 \
        -cluster 3 \
        -filterName FS -filter "FS > 30.0" \
        -filterName QD -filter "QD < 2.0" \
        -o ../../vcf_filter/${id%%.*}_filtered.vcf &);
done

TBSP

nohup python /home/chenhy/anaconda3/envs/singlecell_R4.0/lib/python3.8/tbsp-master/tbsp/tbsp.py -i /workcenters/workcenter3/chenhy/Singlecell_SNP/10XGENOMICS/vcf_filter/vcf -o  /workcenters/workcenter3/chenhy/Singlecell_SNP/10XGENOMICS/tbsp_result &

参考链接

GATK calling variants in RNA-seq
RNA-seq 检测变异之 GATK 最佳实践流程
GATK使用方法详解(原始数据的处理)
GATK使用方法详解(变异检测)
Picard中文使用手册