wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_genomic.gff.gz
curl https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.9.6-1/sratoolkit.2.9.6-1-mac64.tar.gz
tar xzvf sratoolkit.2.9.6-1-mac64.tar.gz
echo export PATH=$PATH:~/src/sratoolkit.2.3.5-2-mac64/bin >> ~/.profile
source ~/.profile
prefetch SRR1553610
find ~/ncbi
自动下载到家目录下的ncbi文件夹
/Users/ucco/ncbi
/Users/ucco/ncbi/public
/Users/ucco/ncbi/public/sra
/Users/ucco/ncbi/public/sra/SRR1553610.sra
sra格式转变为fq格式
fastq-dump --split-files SRR1553610.sra
wc -l *.fastq
879348 SRR1553610_1.fastq
879348 SRR1553610_2.fastq
1758696 total
一次下载多个文件
$ echo SRR1553608 > sra.ids
$ echo SRR1553605 >> sra.ids
$ prefetch --option-file sra.ids
efetch -db nucleotide -id KM233090 -format fasta > KM233090.fa
efetch -db nucleotide -id 667853062 -format fasta > 667853062.fa
ucco:yeast ucco$ cat KM233090.fa |wc
270 277 19169
ucco:yeast ucco$ cat 667853062.fa |wc
270 277 19169
efetch -db nucleotide -id KM233090,KM233066,KM233113.1 -format fasta > multi.fa
看下header files
$ cat multi.fa |grep ">"
>KM233090.1 Zaire ebolavirus isolate Ebola virus/H.sapiens-wt/SLE/2014/Makona-G3816, complete genome
>KM233066.1 Zaire ebolavirus isolate Ebola virus/H.sapiens-wt/SLE/2014/Makona-G3769.2, complete genome
>KM233113.1 Zaire ebolavirus isolate Ebola virus/H.sapiens-wt/SLE/2014/Makona-G3856.1, complete genome
efetch -db nucleotide -id KM233090,KM233066,KM233113.1 -format fasta -seq_start 1 -seq_stop 10 > multi.fa
$ cat multi.fa
>KM233090.1:1-10 Zaire ebolavirus isolate Ebola virus/H.sapiens-wt/SLE/2014/Makona-G3816, complete genome
AAATTGTTAC
>KM233066.1:1-10 Zaire ebolavirus isolate Ebola virus/H.sapiens-wt/SLE/2014/Makona-G3769.2, complete genome
GAATAACTAT
>KM233113.1:1-10 Zaire ebolavirus isolate Ebola virus/H.sapiens-wt/SLE/2014/Makona-G3856.1, complete genome
CGGACACACA
esearch -db nucleotide -query PRJNA257197 | efetch -db nucleotide -format fasta -seq_start 1 -seq_stop 10 > starts.fa
$ cat starts.fa |head
>KR105345.1:1-10 Zaire ebolavirus isolate Ebola virus/H.sapiens-wt/SLE/2014/Makona-G6089.1, partial genome
ATAATTTTCC
>KR105328.1:1-10 Zaire ebolavirus isolate Ebola virus/H.sapiens-wt/SLE/2014/Makona-G5844.1, partial genome
GATTAATAAT
>KR105323.1:1-10 Zaire ebolavirus isolate Ebola virus/H.sapiens-wt/SLE/2014/Makona-G5743.1, partial genome
GATTAATAAT
>KR105302.1:1-10 Zaire ebolavirus isolate Ebola virus/H.sapiens-wt/SLE/2014/Makona-G5295.1, partial genome
GATTAATAAT
>KR105295.1:1-10 Zaire ebolavirus isolate Ebola virus/H.sapiens-wt/SLE/2014/Makona-G5039.1, partial genome
ATTAATAATT
···
假如名字为get_seq.sh
内容如下
esearch -db nucleotide -query PRJNA257197 | efetch -db nucleotide -format fasta -seq_start 1 -seq_stop 10
则可以直接运行
bash get_seq.sh > starts.fa
$ cat SRR1553605_1.fastq |head -10000|grep '###'|wc -l
82
cat SRR1553605_1.fastq |head -10000|grep 'ATG' --color=always|head
ATG
$ cat SRR1553605_1.fastq|head -10000|grep 'TATA' --color=always|head
TATA
cat SRR1553605_1.fastq|head -10000|grep 'GATCGGA' --color=always|head
chmod +x fastqc
mkdir -p ~/bin
echo 'export PATH=~/bin:$PATH' >> ~/.profile
source ~/.profile
ln -s ~/src/FastQC/fastqc ~/bin/fastqc
fastqc -h
fastqc *.fastq
质量差
质量好
借用碱基矿工的这部分内容
当我们理解了fq数据之后,做这些过滤就不会很难,你也完全可以自己编写工具来进行个性化的过滤。目前也已有很多工具用来切除接头序列和低质量碱基,比如SOAPnuke、cutadapt、untrimmed等不下十个,但这其中比较方便好用的是Trimmomatic(也是一个java程序)、sickle和seqtk。Trimmomatic的好处在于,它不但可以用来切除illumina测序平台的接头序列,还可以去除由我们自己指定的特定接头序列,而且同时也能够过滤read末尾的低质量序列,sickle和seqtk只能去除低质量碱基。具体的原理就是通过滑动一定长度的窗口,计算窗口内的碱基平均质量,如果过低,就直接往后全部切除,注!意!不是挖掉read中的这部分低质量序列,而是像切菜一样,直接从低质量区域开始把这条read后面的所有其它碱基全!部!剁!掉!否则就是在人为改变实际的基因组序列情况。
curl -O http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.39.zip
unzip Trimmomatic-0.39.zip
cd Trimmomatic-0.39/
java -jar trimmomatic-0.39.jar
看是否可用
Usage:
PE [-version] [-threads <threads>] [-phred33|-phred64] [-trimlog <trimLogFile>] [-summary <statsSummaryFile>] [-quiet] [-validatePairs] [-basein <inputBase> | <inputFile1> <inputFile2>] [-baseout <outputBase> | <outputFile1P> <outputFile1U> <outputFile2P> <outputFile2U>] <trimmer1>...
or:
SE [-version] [-threads <threads>] [-phred33|-phred64] [-trimlog <trimLogFile>] [-summary <statsSummaryFile>] [-quiet] <inputFile> <outputFile> <trimmer1>...
or:
-version
echo '#!/bin/bash' > ~/bin/trimmomatic
echo 'java -jar ~/src/Trimmomatic-0.39/trimmomatic-0.39.jar $@' >> ~/bin/trimmomatic
chmod +x ~/bin/trimmomatic
trimmomatic
trimmomatic PE SRR1553610_1.fastq SRR1553610_2.fastq trim_1P.fq trim_1U.fq trim_2P.fq trim_2U.fq ILLUMINACLIP:TruSeq3-PE-2.fa:2:10:10 SLIDINGWINDOW:4:30 TRAILING:20
关于trimmomatic相关内容请见https://zhuanlan.zhihu.com/p/28924793
https://www.jianshu.com/p/a8935adebaae
结果会产生以下四个文件
trim_1P.fq
trim_1U.fq
trim_2P.fq
trim_2U.fq
#ATG开头
768 cat SRR1553605_1.fastq |egrep "^ATG" --color=always|head
#ATG结尾
769 cat SRR1553605_1.fastq |egrep "ATG$" --color=always|head
#ATG结尾
770 cat SRR1553605_1.fastq |egrep "ATG\$" --color=always|head
# TAAATA或TACCTA
771 cat SRR1553605_1.fastq |egrep "TA(AA,CC)TA" --color=always|head
# TAAAA或TATAA
772 cat SRR1553605_1.fastq |egrep "TA[A,T]AA" --color=always|head
# TAAAA TATAA
773 cat SRR1553605_1.fastq |egrep "TA(A,T)AA" --color=always|head
#TA和AA中间有0或多个A
774 cat SRR1553605_1.fastq |egrep "TA(A*)AA" --color=always|head
#TA和TA之间有0个或多个A
775 cat SRR1553605_1.fastq |egrep "TA(A*)TA" --color=always|head
#TA和TA中间有1个或多个A
776 cat SRR1553605_1.fastq |egrep "TA(A+)TA" --color=always|head
#TAA和TA中间有2到5个A
777 cat SRR1553605_1.fastq |egrep "TAA{2,5}TA" --color=always|head
echo ">adapter" > adapter.fa
echo "AGATCGGAAGAG">>adapter.fa
ln -s ~/src/Trimmomatic-0.39/adapters/TruSeq3-PE.fa
ln -s ~/src/Trimmomatic-0.39/adapters/TruSeq3-PE-2.fa
ln -s ~/src/Trimmomatic-0.39/adapters/TruSeq3-SE.fa
cd ~/src
curl -O ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.9.0+-x64-macosx.tar.gz
tar zxvf ncbi-blast-2.9.0+-x64-macosx.tar.gz
echo 'export PATH=$PATH:~/src/ncbi-blast-2.9.0+/bin' >> ~/.profile
# for linux
#echo 'export PATH=$PATH:~/src/ncbi-blast-2.9.0+/bin' >> ~/.bashrc
source ~/.profile
blastn -h
已经可用
1 我们想发现什么?query sequence 2 到哪里寻找?数据库database也就是target sequence 3 如何寻找? search type
建一个Ebola病毒的基因组序列,因为index的时候会产生很多文件,所以建立一个新文件夹,命名为refs
因为reference可能包含很多fasta records,所以把它转变成blast database,以便程序可以处理
cd refs
makeblastdb -in KM233090.fa -dbtype nucl
会显示如下
Building a new DB, current time: 06/23/2019 19:10:50
New DB name: /Users/ucco/refs/KM233090.fa
New DB title: KM233090.fa
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 1 sequences in 0.000593901 seconds.
ucco:refs ucco$ ls
KM233090.fa KM233090.fa.nhr KM233090.fa.nin KM233090.fa.nsq KM233118.fa
然后会产生以下文件
-rw-r--r-- 1 ucco staff 162B 6 23 19:10 KM233090.fa.nhr
-rw-r--r-- 1 ucco staff 88B 6 23 19:10 KM233090.fa.nin
-rw-r--r-- 1 ucco staff 4.6K 6 23 19:10 KM233090.fa.nsq
head -2 KM233090.fa >query.fa
cat query.fa
>KM233090.1 Zaire ebolavirus isolate Ebola virus/H.sapiens-wt/SLE/2014/Makona-G3816, complete genome
AAATTGTTACTGTAATCATACCTGGTTTGTTTCAGAGCCATATCACCAAGATAGAGAACAACCTAGGTCT
blastn -db KM233090.fa -query query.fa|more
>KM233090.1 Zaire ebolavirus isolate Ebola virus/H.sapiens-wt/SLE/2014/Makona-G3816,
complete genome
Length=18799
Score = 130 bits (70), Expect = 6e-34
Identities = 70/70 (100%), Gaps = 0/70 (0%)
Strand=Plus/Plus
Query 1 AAATTGTTACTGTAATCATACCTGGTTTGTTTCAGAGCCATATCACCAAGATAGAGAACA 60
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Sbjct 1 AAATTGTTACTGTAATCATACCTGGTTTGTTTCAGAGCCATATCACCAAGATAGAGAACA 60
Query 61 ACCTAGGTCT 70
||||||||||
Sbjct 61 ACCTAGGTCT 70
blastn -db KM233090.fa -query query.fa -outfmt 6
KM233090.1 KM233090.1 100.000 70 0 0 1 70 1 70
blastn -db KM233090.fa -query query.fa -outfmt 7
# BLASTN 2.9.0+
# Query: KM233090.1 Zaire ebolavirus isolate Ebola virus/H.sapiens-wt/SLE/2014/Makona-G3816, complete genome
# Database: KM233090.fa
# Fields: query acc.ver, subject acc.ver, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
# 1 hits found
KM233090.1 KM233090.1 100.000 70 0 0 1 70 1 70 6.02e-34 130
# BLAST processed 1 queries
blastn -db KM233090.fa -query query.fa -outfmt '6 qseqid sseqid qlen length mismatch'
KM233090.1 KM233090.1 70 70 0
esearch -db nucleotide -query PRJNA257197 | efetch -db nucleotide -format fasta > all-genomes.fa
makeblastdb -in all-genomes.fa -dbtype nucl
Building a new DB, current time: 06/23/2019 20:39:06
New DB name: /Users/ucco/refs/all-genomes.fa
New DB title: all-genomes.fa
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 249 sequences in 0.0657458 seconds.
efetch -db nucleotide -id KM233118 -format fasta -seq_start 1 -seq_stop 1000 > 1000.fa
cat 1000.fa |head
>KM233118.1:1-1000 Zaire ebolavirus isolate Ebola virus/H.sapiens-wt/SLE/2014/Makona-NM042.3, complete genome
AATCATACCTGGTTTGTTTCAGAGCCATATCACCAAGATAGAGAACAACCTAGGTCTCCGGAGGGGGCAA
GGGCATCAGTGTGCTCAGTTGAAAATCCCTTGTCAACATCTAGGCCTTATCACATCACAAGTTCCGCCTT
AAACTCTGCAGGGTGATCCAACAACCTTAATAGCAACATTATTGTTAAAGGACAGCATTAGTTCACAGTC
AAACAAGCAAGATTGAGAATTAACTTTGATTTTGAACCTGAACACCCAGAGGACTGGAGACTCAACAACC
CTAAAGCCTGGGGTAAAACATTAGAAATAGTTTAAAGACAAATTGCTCGGAATCACAAAATTCCGAGTAT
GGATTCTCGTCCTCAGAAAGTCTGGATGACGCCGAGTCTCACTGAATCTGACATGGATTACCACAAGATC
TTGACAGCAGGTCTGTCCGTTCAACAGGGGATTGTTCGGCAAAGAGTCATCCCAGTGTATCAAGTAAACA
ATCTTGAGGAAATTTGCCAACTTATCATACAGGCCTTTGAAGCTGGTGTTGATTTTCAAGAGAGTGCGGA
CAGTTTCCTTCTCATGCTTTGTCTTCATCATGCGTACCAAGGAGATTACAAACTTTTCTTGGAAAGTGGC
1 将新病毒和1999年爆发的病毒进行比较 https://www.ncbi.nlm.nih.gov/genome/genomes/4887? 2 RefSeq accession number NC_002549.1, Nucleoprotein example: NP_066243.1
efetch -db nucleotide -id NC_002549 --format fasta > refs/NC_002549.fa
esearch -db protein -query PRJNA14703 | efetch -format ft