一共三部分
awk
的简单使用efetch -db nucleotide -id NC_002549.1 -format gb > NC.gb
~/bin/readseq -format=GFF -o NC.gff NC.gb
找到每个feature的长度
cat NC.gff |awk '{print $1,$2,$3}'|head -5
##gff-version 2
# seqname source
NC_002549 - source
NC_002549 - 5'UTR
NC_002549 - gene
cat NC.gff|cut -f 1,2,3|head -5
##gff-version 2
# seqname source feature
NC_002549 - source
NC_002549 - 5'UTR
NC_002549 - gene
几乎等同于上个命令。
计算每个feature的长度
cat NC.gff | awk ' { print $3, $5-$4 + 1 } ' | head -5
1
source 1
source 18959
5'UTR 55
gene 2971
仅提取CDS features
cat NC.gff|awk '$3=="CDS" {print $3,$5-$4+1,$9}'
CDS 2220 gene
CDS 1023 gene
CDS 981 gene
CDS 885 group
CDS 1146 group
CDS 1095 gene
CDS 884 group
CDS 10 group
CDS 867 gene
CDS 756 gene
CDS 6639 gene
计算所有gene的累积长度
cat NC.gff | awk '$3 =="gene" { len=$5-$4 + 1; size += len; print "Size:", size } '
size+=len代表size=size+len,也就是在第一个len的基础上依次递加。为了清楚表示,不用这个运算符,比对结果看
cat NC.gff | awk '$3 =="gene" { len=$5-$4 + 1; size += len; print "Size:", size } '
Size: 2971
Size: 4347
Size: 5852
Size: 8258
Size: 9711
Size: 11345
Size: 18127
cat NC.gff | awk '$3 =="gene" { print $3, $5-$4 + 1, $9 } '
gene 2971 gene
gene 1376 gene
gene 1505 gene
gene 2406 gene
gene 1453 gene
gene 1634 gene
gene 6782 gene