工欲善其事,必先利其器。
众所周知,对VCF文件进行注释的常用工具有VEP、SnpEff、ANNOVAR等,它们各有优势,选择哪个工具通常取决于具体的分析需求、数据类型和用户的技术背景。例如,VEP因其提供的丰富注释信息和易用性而被广泛使用。今天就先来详细了解一下VEP的注释结果。
至于VEP的入门介绍,详见:VEP — 高效的变异注释工具
screen -R down
wget -c https://ftp.ensembl.org/pub/release-111/variation/indexed_vep_cache/mus_musculus_vep_111_GRCm39.tar.gz
##文件大小
1.7G 11月 28 02:08 mus_musculus_vep_111_GRCm39.tar.gz
##或者下载merged版本的缓存(下文的注释命令使用了--merged参数,需要的就是这个文件)
wget -c https://ftp.ensembl.org/pub/release-111/variation/indexed_vep_cache/mus_musculus_merged_vep_111_GRCm39.tar.gz
##文件大小
5.4G 5月 7 15:24 mus_musculus_merged_vep_111_GRCm39.tar.gz
singularity exec vep.sif \
vep --dir ~/vep_data/mouse \
--species mus_musculus --merged \
--cache --offline --format vcf --vcf --force_overwrite \
--input_file ~/vep/mouse_test.filter.vcf.gz \
--output_file ~/vep/vep_out/mouse_test_vepout.vcf \
--plugin NMD
4.5G 5月 7 17:56 mouse_test_vepout.vcf
18K 5月 7 17:56 mouse_test_vepout.vcf_summary.html
59K 5月 7 17:56 mouse_test_vepout.vcf_warnings.txt
##297M的vcf文件,运行耗时2:15:56
less -SN ../mouse_test.filter.vcf.gz |head -n 100 |tail -n 5 |cut -f 1-9
注释前
less -SN mouse_test_vepout.vcf |head -n 105 |tail -n 5 |cut -f 1-10
注释后
##查看新增的注释字段(即INFO列末尾的CSQ字段)
cat mouse_test_vepout.vcf |grep -v "^#" |cut -f 8 |awk -F ";" '{print $NF}' |head
##统计注释后果(Consequence)的分类(注意:这里只统计了每个位点的第一条CSQ记录)
cat mouse_test_vepout.vcf|grep -v "^#"|cut -f 8|awk -F ";" '{print $NF}'|cut -d "|" -f 2|sort |uniq -c|sort -k1n
##查看一个具体的例子
cat mouse_test_vepout.vcf |grep -v "^#" |grep "upstream_gene_variant"|head -n 1
INFO列
查看都有哪些变异后果(Consequence)类型,并按出现次数从少到多排序:
$ cat mouse_test_vepout.vcf|grep -v "^#"|cut -f 8|awk -F ";" '{print $NF}'|cut -d "|" -f 2|sort |uniq -c|sort -k1n
1 inframe_insertion&NMD_transcript_variant
1 mature_miRNA_variant
1 protein_altering_variant&incomplete_terminal_codon_variant
1 protein_altering_variant&splice_region_variant
1 splice_acceptor_variant&5_prime_UTR_variant
1 splice_acceptor_variant&NMD_transcript_variant
1 splice_acceptor_variant&splice_donor_variant&splice_donor_5th_base_variant&non_coding_transcript_exon_variant&intron_variant
1 splice_donor_variant&splice_donor_5th_base_variant&3_prime_UTR_variant&intron_variant
1 splice_donor_variant&splice_donor_5th_base_variant&intron_variant&non_coding_transcript_variant
1 splice_donor_variant&splice_donor_5th_base_variant&splice_polypyrimidine_tract_variant&intron_variant&non_coding_transcript_variant
1 splice_donor_variant&splice_donor_region_variant&coding_sequence_variant&intron_variant
1 splice_donor_variant&splice_donor_region_variant&non_coding_transcript_exon_variant&intron_variant
1 start_lost&5_prime_UTR_variant
1 start_lost&inframe_deletion&splice_region_variant
1 start_lost&splice_region_variant&5_prime_UTR_variant
1 stop_gained&frameshift_variant&start_lost
1 stop_gained&inframe_deletion
1 stop_gained&protein_altering_variant
2 inframe_deletion&NMD_transcript_variant
2 splice_acceptor_variant&non_coding_transcript_exon_variant
2 splice_acceptor_variant&splice_polypyrimidine_tract_variant&intron_variant&NMD_transcript_variant
2 splice_donor_5th_base_variant&intron_variant&NMD_transcript_variant
2 splice_donor_variant&splice_donor_5th_base_variant&5_prime_UTR_variant&intron_variant
2 stop_gained&NMD_transcript_variant
2 stop_lost&3_prime_UTR_variant
3 inframe_deletion&splice_region_variant
3 missense_variant&splice_region_variant&NMD_transcript_variant
3 non_coding_transcript_variant
3 splice_acceptor_variant&coding_sequence_variant
3 splice_acceptor_variant&non_coding_transcript_exon_variant&intron_variant
3 splice_acceptor_variant&splice_polypyrimidine_tract_variant&intron_variant
3 splice_acceptor_variant&splice_polypyrimidine_tract_variant&intron_variant&non_coding_transcript_variant
3 splice_donor_variant&coding_sequence_variant
3 splice_donor_variant&splice_donor_5th_base_variant&non_coding_transcript_exon_variant&intron_variant
3 splice_region_variant&5_prime_UTR_variant&NMD_transcript_variant
3 stop_gained&inframe_insertion&splice_region_variant
4 frameshift_variant&start_lost
4 splice_donor_variant&NMD_transcript_variant
4 splice_region_variant&synonymous_variant&NMD_transcript_variant
4 stop_gained&inframe_insertion
4 stop_retained_variant&3_prime_UTR_variant
5 frameshift_variant&start_lost&start_retained_variant
5 splice_acceptor_variant&splice_donor_5th_base_variant&coding_sequence_variant&intron_variant
5 splice_donor_variant&splice_donor_5th_base_variant&intron_variant
6 frameshift_variant&NMD_transcript_variant
6 splice_acceptor_variant&coding_sequence_variant&intron_variant
6 splice_donor_variant&splice_donor_region_variant&intron_variant
7 splice_region_variant&non_coding_transcript_variant
7 stop_gained&frameshift_variant&splice_region_variant
8 frameshift_variant&stop_lost
9 incomplete_terminal_codon_variant&coding_sequence_variant
9 splice_donor_region_variant&intron_variant&NMD_transcript_variant
9 stop_gained&splice_region_variant
12 splice_acceptor_variant&splice_donor_variant&splice_donor_5th_base_variant&splice_polypyrimidine_tract_variant&intron_variant
13 splice_acceptor_variant&splice_donor_variant&splice_donor_5th_base_variant&coding_sequence_variant&intron_variant
14 splice_region_variant&intron_variant&NMD_transcript_variant
15 splice_region_variant&3_prime_UTR_variant&NMD_transcript_variant
16 splice_region_variant&3_prime_UTR_variant
16 stop_retained_variant
19 protein_altering_variant
19 splice_donor_variant&splice_donor_5th_base_variant&coding_sequence_variant&intron_variant
26 stop_gained&frameshift_variant
29 start_lost
31 inframe_insertion&splice_region_variant
34 splice_region_variant&splice_polypyrimidine_tract_variant&intron_variant&NMD_transcript_variant
34 stop_lost
49 frameshift_variant&splice_region_variant
53 splice_donor_5th_base_variant&intron_variant&non_coding_transcript_variant
57 splice_polypyrimidine_tract_variant&intron_variant&NMD_transcript_variant
66 splice_acceptor_variant&non_coding_transcript_variant
77 5_prime_UTR_variant&NMD_transcript_variant
77 splice_region_variant&5_prime_UTR_variant
84 splice_acceptor_variant
88 splice_donor_variant
99 splice_donor_variant&non_coding_transcript_variant
104 missense_variant&NMD_transcript_variant
113 synonymous_variant&NMD_transcript_variant
148 splice_region_variant&intron_variant&non_coding_transcript_variant
181 splice_donor_region_variant&intron_variant&non_coding_transcript_variant
249 inframe_insertion
252 splice_donor_5th_base_variant&intron_variant
265 splice_region_variant&non_coding_transcript_exon_variant
284 stop_gained
344 splice_region_variant&splice_polypyrimidine_tract_variant&intron_variant&non_coding_transcript_variant
383 missense_variant&splice_region_variant
474 inframe_deletion
540 splice_region_variant&synonymous_variant
558 splice_polypyrimidine_tract_variant&intron_variant&non_coding_transcript_variant
624 3_prime_UTR_variant&NMD_transcript_variant
873 splice_donor_region_variant&intron_variant
1025 frameshift_variant
1108 splice_region_variant&intron_variant
2727 splice_region_variant&splice_polypyrimidine_tract_variant&intron_variant
4697 splice_polypyrimidine_tract_variant&intron_variant
8870 5_prime_UTR_variant
21309 intron_variant&NMD_transcript_variant
22383 missense_variant
29654 synonymous_variant
31913 3_prime_UTR_variant
49286 non_coding_transcript_exon_variant
293457 intron_variant&non_coding_transcript_variant
295676 downstream_gene_variant
296565 upstream_gene_variant
958522 intergenic_variant
1239374 intron_variant
具体查看几个例子来理解一下
例一
cat mouse_test_vepout.vcf |grep -v "^#" |grep "upstream_gene_variant"|cut -f 8|head -n 1 |awk -F ";" '{print $NF}'
CSQ=C|upstream_gene_variant|MODIFIER|4933401J01Rik|ENSMUSG00000102693|Transcript|ENSMUST00000193812|TEC|||||||||||4733|1||MGI|||Ensembl||
例二
$ cat mouse_test_vepout.vcf |grep -v "^#" |grep "protein_altering_variant"|cut -f 8|head -n 1 |awk -F ";" '{print $NF}'
CSQ=GTA|protein_altering_variant|MODERATE|Cd244a|ENSMUSG00000004709|Transcript|ENSMUST00000004829|protein_coding|2/9||||340-341|208-209|70|Y/CN|tat/tGTAat|||1||MGI|||Ensembl||,GTA|non_coding_transcript_exon_variant|MODIFIER|Cd244a|ENSMUSG00000004709|Transcript|ENSMUST00000194170|retained_intron|2/5||||339-340|||||||1||MGI|||Ensembl||,GTA|protein_altering_variant&NMD_transcript_variant|MODERATE|Cd244a|ENSMUSG00000004709|Transcript|ENSMUST00000194797|nonsense_mediated_decay|2/7||||340-341|208-209|70|Y/CN|tat/tGTAat|||1||MGI|||Ensembl||,GTA|protein_altering_variant|MODERATE|Cd244a|18106|Transcript|NM_018729.2|protein_coding|2/9||||340-341|208-209|70|Y/CN|tat/tGTAat|||1||EntrezGene|||RefSeq||,GTA|protein_altering_variant|MODERATE|Cd244a|18106|Transcript|XM_006496695.5|protein_coding|2/8||||814-815|208-209|70|Y/CN|tat/tGTAat|||1||EntrezGene|||RefSeq||
记录1
记录2
记录3
记录4和5
例三
$ cat mouse_test_vepout.vcf |grep -v "^#" |grep "intron_variant"|cut -f 8|head -n 1 |awk -F ";" '{print $NF}'
CSQ=G|downstream_gene_variant|MODIFIER|Xkr4|ENSMUSG00000051951|Transcript|ENSMUST00000070533|protein_coding|||||||||||4499|-1||MGI|||Ensembl||,G|intron_variant&non_coding_transcript_variant|MODIFIER|Xkr4|ENSMUSG00000051951|Transcript|ENSMUST00000159265|protein_coding_CDS_not_defined||1/1||||||||||-1||MGI|||Ensembl||,G|intron_variant&non_coding_transcript_variant|MODIFIER|Xkr4|ENSMUSG00000051951|Transcript|ENSMUST00000162897|protein_coding_CDS_not_defined||1/1||||||||||-1||MGI|||Ensembl||,G|downstream_gene_variant|MODIFIER|Xkr4|497097|Transcript|NM_001011874.1|protein_coding|||||||||||4499|-1||EntrezGene|||RefSeq||,G|intron_variant|MODIFIER|Xkr4|497097|Transcript|XM_006495550.5|protein_coding||3/3||||||||||-1||EntrezGene|||RefSeq||
记录 1
记录 2 和 3
记录 4
记录 5