上一期见:WGS分析实战-01:从SRA数据下载到构建GenomicsDatabase
for id in {1..5}
do
echo "gatk --java-options \"-Xmx16G\" GenotypeGVCFs -R ../00.ref/arab_ref.fa -V gendb://chr${id}_arab_DB -O chr${id}.vcf.gz" >> genotyping.commandlines
done
# 仍旧使用ParaFly
ParaFly -c genotyping.commandlines -CPU 5 1>genotyping.time.log 2>genotyping.err.log &
# ls *.vcf.gz | grep "chr" > chr_vcf_id.txt
for id in {1..5}
do
echo "gatk --java-options \"-Xmx16G\" SelectVariants -R ../00.ref/arab_ref.fa -V chr${id}.vcf.gz --select-type-to-include SNP -O SNP.chr${id}.vcf.gz" >> selectSNP.commandlines
done
ParaFly -c selectSNP.commandlines -CPU 5 2>selectSNP.err.log &
结果文件中会出现“*”,代表deletion造成的假SNP:
想要过滤由于deletion造成的SNP,需要需要加上“--restrict-alleles-to BIALLELIC”
for id in {1..5}
do
echo "gatk --java-options \"-Xmx16G\" SelectVariants -R ../00.ref/arab_ref.fa -V SNP.chr${id}.vcf.gz --restrict-alleles-to BIALLELIC -O BIALLELIC.SNP.chr${id}.vcf.gz" >> selectBIALLELIC.commandlines
done
ParaFly -c selectBIALLELIC.commandlines -CPU 5 2>selectBIALLELIC.err.log &
后续分析,即VariantFiltration该步骤需要分别不同类型对原始数据进行过滤,那这边还是先拆开再进行分析
# 提取INDEL
for id in {1..5}
do
echo "gatk --java-options \"-Xmx16G\" SelectVariants -R ../00.ref/arab_ref.fa -V chr${id}.vcf.gz --select-type-to-include INDEL -O INDEL.chr${id}.vcf.gz" >> selectINDEL.commandlines
done
# ParaFly运行
ParaFly -c selectINDEL.commandlines -CPU 5 2>selectINDEL.err.log &
### 按照GATK官网给出的参数来试运行
### 对SNP位点进行过滤
for id in {1..5}
do
echo "gatk --java-options \"-Xmx16G\" VariantFiltration -R ../00.ref/arab_ref.fa -V BIALLELIC.SNP.chr${id}.vcf.gz --filter-expression \"QD < 2.0 || QUAL < 30.0 || SOR > 3.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0\" --filter-name my_filters -O filtered.BIALLELIC.SNP.chr${id}.vcf.gz" >> filter.BIALLELIC.SNP.commandlines
done
ParaFly -c filter.BIALLELIC.SNP.commandlines -CPU 5 2>filter.BIALLELIC.SNP.err.log &
### 对INDEL位点进行过滤
for id in {1..5}
do
echo "gatk --java-options \"-Xmx16G\" VariantFiltration -R ../00.ref/arab_ref.fa -V INDEL.chr${id}.vcf.gz --filter-expression \"QD < 2.0 || QUAL < 30.0 || FS > 200.0 || ReadPosRankSum < -20.0\" --filter-name my_filters -O filtered.INDEL.chr${id}.vcf.gz" >> filter.INDEL.commandlines
done
ParaFly -c filter.INDEL.commandlines -CPU 5 2>filter.INDEL.err.log &
### 选择PASS SNP
for id in {1..5}
do
echo "gatk --java-options \"-Xmx16G\" SelectVariants -R ../00.ref/arab_ref.fa -V filtered.BIALLELIC.SNP.chr${id}.vcf.gz --exclude-filtered -O PASS.filtered.BIALLELIC.SNP.chr${id}.vcf.gz" >> select.PASS.SNP.commandlines
done
ParaFly -c select.PASS.SNP.commandlines -CPU 5 2>select.PASS.SNP.err.log &
# 合并PASS SNP
java -jar ~/biosoft/picard.jar MergeVcfs \
I=PASS.filtered.BIALLELIC.SNP.chr1.vcf.gz \
I=PASS.filtered.BIALLELIC.SNP.chr2.vcf.gz \
I=PASS.filtered.BIALLELIC.SNP.chr3.vcf.gz \
I=PASS.filtered.BIALLELIC.SNP.chr4.vcf.gz \
I=PASS.filtered.BIALLELIC.SNP.chr5.vcf.gz \
O=ALL.PASS.filtered.BIALLELIC.SNP.vcf.gz
到这一步就获得可以用于后续分析的SNP数据集了。原文的研究主要关注于不同强度离子光束对全基因组范围内引起的突变类型以及不同类型的突变的频数之间是否存在差别,已有一个pipeline —— AMAP:
[1] https://github.com/ion-beam-breeding/AMAP
GenomicDBImport在不拆分interval进行运行的情况,运行时间如下:
real 29m59.818s
user 70m50.167s
sys 8m39.001s
需要提前准备的一些文件:
下面开始正式测试。
# v1.genomicsDBImport.parafly.py
genomicsDBImport_commandlines="genomicsDBImport.commandlines"
with open(genomicsDBImport_commandlines, 'w') as output_sh:
scaffold_file = open('./interval.list', 'r')
scaffold_list = list()
for line in scaffold_file.readlines():
scaffold_list.append(line.strip("\n"))
# print(scaffold_list)
for L in scaffold_list:
gcommand="gatk --java-options \"-Xmx4g -Xms4g\" GenomicsDBImport "
L_gcommand = gcommand + f"--genomicsdb-workspace-path chr{L}_arab_DB -R ../00.ref/arab_ref.fa --sample-name-map sample.map -L {L} --batch-size 16" + "\n"
output_sh.write(L_gcommand)
python3 v1.genomicsDBImport.parafly.py
time ParaFly -c genomicsDBImport.commandlines -CPU 5 2>genomeDBImport.err.log &
# 运行时间
real 8m55.893s
user 67m29.994s
sys 12m47.420s
# v1.combinegvcf.parafly.py
combinegvcf_commandlines="combinegvcf.commandlines"
with open(combinegvcf_commandlines, 'w') as output_sh:
scaffold_file = open('./interval.list', 'r')
scaffold_list = list()
for line in scaffold_file.readlines():
scaffold_list.append(line.strip("\n"))
# print(scaffold_list)
gvcf_file = open('./gvcf_id.txt', 'r')
gvcf_list = list()
for line in gvcf_file.readlines():
gvcf_list.append(line.strip("\n"))
for L in scaffold_list:
gcommand="gatk --java-options \"-Xmx4g -Xms4g\" CombineGVCFs "
L_gcommand = gcommand + f"-L {L} "
# print(L_gcommand)
for gvcf in gvcf_list:
L_gcommand = L_gcommand + f"-V {gvcf} "
output_sh.write(L_gcommand + f"-O chr{L}.vcf.gz " + "-R ../00.ref/arab_ref.fa" +"\n")
python3 v1.combinegvcf.parafly.py
# ParaFly运行
time ParaFly -c combinegvcf.commandlines -CPU 5 2>combinegvcf.err.log &
# 运行时间
real 18m16.117s
user 140m22.506s
sys 20m0.973s
从上述可以得出一个很草率的结论(所使用的数据量很小),也就是:G(拆分interval运行)运行时间最短<C<G(不拆分interval运行)。
[1] https://gatk.broadinstitute.org/hc/en-us/articles/360035531912?id=11029