get_fasta_from_taxon.py @ 0:7ce8da107910 draft default tip

Uploaded
author crs4
date Wed, 11 Sep 2013 06:07:50 -0400
# -*- coding: utf-8 -*-
"""
Given a taxonomy ID, retrieve all the associated nucleotide (or protein)
sequences and write them to a multi-FASTA file.

Entrez Database   UID common name   E-utility Database Name
Nucleotide        GI number         nuccore
Protein           GI number         protein

Retrieval strategy:

esearch to get the total number of UIDs (count)
esearch to get the UIDs in batches
loop until the end of the UIDs list:
    epost to put a batch of UIDs on the History server
    efetch to retrieve the sequences posted in the previous step

retmax for efetch is set to 1/10 of the maximum value declared by NCBI

queries are delayed by 1 second to comply with NCBI usage guidelines
(more conservative than what they request)


python get_fasta_from_taxon.py -i 1638 -o test.out -d protein
python get_fasta_from_taxon.py -i 327045 -o test.out -d nuccore  # 556468 UIDs
"""

import logging
import optparse
import time
import urllib
import urllib2


class Eutils:

    def __init__(self, options, logger):
        self.logger = logger
        self.base = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.taxid = options.taxid
        self.dbname = options.dbname
        if options.outname:
            self.outname = options.outname
        else:
            self.outname = 'taxid' + self.taxid + '.' + self.dbname + '.fasta'
        self.ids = []
        self.retmax_esearch = 100000
        self.retmax_efetch = 1000
        self.count = 0
        self.webenv = ""
        self.query_key = ""

    def retrieve(self):
        """Run the esearch -> epost -> efetch pipeline."""
        self.get_count_value()
        self.get_uids_list()
        self.get_sequences()

    def get_count_value(self):
        """
        Retrieve only Count, the total number of matching UIDs.

        "Total number of UIDs from the retrieved set to be shown in the XML
        output (default=20). By default, ESearch only includes the first 20
        UIDs retrieved in the XML output. If usehistory is set to 'y',
        the remainder of the retrieved set will be stored on the History
        server."

        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        """
        self.logger.info("retrieving data from %s" % self.base)
        self.logger.info("for TaxID: %s and database: %s" %
                         (self.taxid, self.dbname))
        querylog = self.esearch(self.dbname, "txid%s[Organism:exp]" % self.taxid, '', '', "count")
        self.logger.debug("taxonomy response:")
        for line in querylog:
            self.logger.debug(line.rstrip())
            if '</Count>' in line:
                self.count = int(line[line.find('<Count>')+len('<Count>'):line.find('</Count>')])
        self.logger.info("Found %d UIDs" % self.count)

    def get_uids_list(self):
        """
        "Increasing retmax allows more of the retrieved UIDs to be included
        in the XML output, up to a maximum of 100,000 records."

        from http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        """
        retmax = self.retmax_esearch
        if self.count > retmax:
            num_batches = (self.count / retmax) + 1
        else:
            num_batches = 1
        self.logger.info("Batch size for esearch action: %d UIDs" % retmax)
        self.logger.info("Number of batches for esearch action: %d" % num_batches)
        for n in range(num_batches):
            querylog = self.esearch(self.dbname, "txid%s[Organism:exp]" % self.taxid, n*retmax, retmax, '')
            for line in querylog:
                if '<Id>' in line and '</Id>' in line:
                    uid = line[line.find('<Id>')+len('<Id>'):line.find('</Id>')]
                    self.ids.append(uid)
        self.logger.info("Retrieved %d UIDs" % len(self.ids))

    def esearch(self, db, term, retstart, retmax, rettype):
        url = self.base + "esearch.fcgi"
        self.logger.debug("url: %s" % url)
        values = {'db': db,
                  'term': term,
                  'rettype': rettype,
                  'retstart': retstart,
                  'retmax': retmax}
        data = urllib.urlencode(values)
        self.logger.debug("data: %s" % str(data))
        req = urllib2.Request(url, data)
        response = urllib2.urlopen(req)
        querylog = response.readlines()
        time.sleep(1)
        return querylog

    def epost(self, db, ids):
        url = self.base + "epost.fcgi"
        self.logger.debug("url_epost: %s" % url)
        values = {'db': db,
                  'id': ids}
        data = urllib.urlencode(values)
        # data can be very long here (up to retmax_efetch comma-separated UIDs)
        #self.logger.debug("data: %s" % str(data))
        req = urllib2.Request(url, data)
        response = urllib2.urlopen(req)
        querylog = response.readlines()
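        # The ePost reply carries the History server handles, e.g.
        # (illustrative values):
        #   <QueryKey>1</QueryKey>
        #   <WebEnv>NCID_01_...</WebEnv>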
        self.logger.debug("epost response:")
        for line in querylog:
            self.logger.debug(line.rstrip())
            if '</QueryKey>' in line:
                self.query_key = str(line[line.find('<QueryKey>')+len('<QueryKey>'):line.find('</QueryKey>')])
            if '</WebEnv>' in line:
                self.webenv = str(line[line.find('<WebEnv>')+len('<WebEnv>'):line.find('</WebEnv>')])
        self.logger.debug("*** epost action ***")
        self.logger.debug("query_key: %s" % self.query_key)
        self.logger.debug("webenv: %s" % self.webenv)
        time.sleep(1)

    def efetch(self, db, query_key, webenv):
        url = self.base + "efetch.fcgi"
        self.logger.debug("url_efetch: %s" % url)
        values = {'db': db,
                  'query_key': query_key,
                  'webenv': webenv,
                  'rettype': "fasta",
                  'retmode': "text"}
        data = urllib.urlencode(values)
        self.logger.debug("data: %s" % str(data))
        req = urllib2.Request(url, data)
        response = urllib2.urlopen(req)
        fasta = response.read()
        assert fasta.startswith(">"), fasta
        time.sleep(1)
        return fasta

    def get_sequences(self):
        """
        "Total number of records from the input set to be retrieved, up to a
        maximum of 10,000. Optionally, for a large set the value of retstart
        can be iterated while holding retmax constant, thereby downloading
        the entire set in batches of size retmax."

        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
        """
        batch_size = self.retmax_efetch
        count = self.count
        uids_list = self.ids
        self.logger.info("Batch size for efetch action: %d" % batch_size)
        self.logger.info("Number of batches for efetch action: %d" % ((count / batch_size) + 1))
        with open(self.outname, 'w') as out:
            for start in range(0, count, batch_size):
                end = min(count, start + batch_size)
                batch = uids_list[start:end]
                self.epost(self.dbname, ",".join(batch))
                self.logger.info("retrieving batch %d" % ((start / batch_size) + 1))
                mfasta = self.efetch(self.dbname, self.query_key, self.webenv)
                out.write(mfasta + '\n')


LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']


def __main__():
    """ main function """
    parser = optparse.OptionParser(description='Retrieve data from NCBI')
    parser.add_option('-i', dest='taxid', help='taxonomy ID (mandatory)')
    parser.add_option('-o', dest='outname', help='output file name')
    parser.add_option('-l', '--logfile', help='log file (default=stderr)')
    parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO', help='logging level (default: INFO)')
    parser.add_option('-d', dest='dbname', help='Entrez database name, e.g. nuccore or protein (mandatory)')
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error('Wrong number of arguments')
    if not options.taxid:
        parser.error('option -i (taxonomy ID) is mandatory')
    if not options.dbname:
        parser.error('option -d (database name) is mandatory')

    log_level = getattr(logging, options.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    if options.logfile:
        kwargs['filename'] = options.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger('data_from_NCBI')

    E = Eutils(options, logger)
    E.retrieve()


if __name__ == "__main__":
    __main__()