comparison fetch_fasta_from_NCBI.py @ 1:7e41bbb94159 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit 6008aafac37eec1916d6b72c05d9cfcb002b8095
author artbio
date Sun, 15 Oct 2017 13:56:26 -0400
parents
children 50f5ef3313bb
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
From an NCBI query (e.g. a taxonomy ID), retrieves all the matching
nucleotide or protein sequences and returns them as a multi-FASTA file.

Entrez Database   UID common name   E-utility Database Name
Nucleotide        GI number         nuccore
Protein           GI number         protein

Retrieval strategy:

esearch to get the total number of UIDs (count)
esearch to get the UIDs in batches
loop until the end of the UIDs list:
    epost to put a batch of UIDs on the history server
    efetch to retrieve the info from the previous post

retmax for efetch is set to 1/10 of the maximum value declared by NCBI

queries are delayed by 1 sec, to satisfy the NCBI guidelines
(more than what they request)
"""
import httplib
import logging
import optparse
import re
import sys
import time
import urllib
import urllib2


class QueryException(Exception):
    pass


class Eutils:

    def __init__(self, options, logger):
        """
        Initialize retrieval parameters
        """
        self.logger = logger
        self.base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        self.query_string = options.query_string
        self.dbname = options.dbname
        if options.outname:
            self.outname = options.outname
        else:
            self.outname = 'NCBI_download' + '.' + self.dbname + '.fasta'
        self.ids = []
        self.retmax_esearch = 100000
        self.retmax_efetch = 500
        self.count = 0
        self.webenv = ""
        self.query_key = ""
        self.datetype = options.datetype
        if options.reldate:
            self.reldate = options.reldate
        else:
            self.reldate = ''
        if options.mindate:
            self.mindate = options.mindate
        else:
            self.mindate = ''
        if options.maxdate:
            self.maxdate = options.maxdate
        else:
            self.maxdate = ''

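    # Note on the two retmax values above: esearch accepts up to 100,000
    # UIDs per response, while efetch batches are kept at 500 records, well
    # below the 10,000-record maximum cited in get_sequences' docstring, to
    # limit the cost of retrying a failed batch. A hypothetical tuning:
    #   self.retmax_efetch = 200  # smaller batches, more (cheaper) retries
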
    def retrieve(self):
        """
        Retrieve the fasta sequences corresponding to the query
        """
        self.get_count_value()

        # If no UIDs are found, exit the script
        if self.count > 0:
            self.get_uids_list()
            try:
                self.get_sequences()
            except QueryException as e:
                self.logger.error("Exiting script.")
                raise e
        else:
            self.logger.error("No UIDs were found. Exiting script.")
            raise Exception("No UIDs were found")

    def get_count_value(self):
        """
        Retrieve only Count, the total number of UIDs matching the query.
        From http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch:
        by default, ESearch only includes the first 20 UIDs retrieved in the
        XML output (retmax default=20). If usehistory is set to 'y', the
        remainder of the retrieved set will be stored on the History server.
        """
        self.logger.info("retrieving data from %s" % self.base)
        self.logger.info("for Query: %s and database: %s" %
                         (self.query_string, self.dbname))
        querylog = self.esearch(self.dbname, self.query_string, '', '',
                                "count", self.datetype, self.reldate,
                                self.mindate, self.maxdate)
        self.logger.debug("Query response:")
        for line in querylog:
            self.logger.debug(line.rstrip())
            if '</Count>' in line:
                self.count = int(line[line.find('<Count>')+len('<Count>'):
                                      line.find('</Count>')])
        self.logger.info("Found %d UIDs" % self.count)

    def get_uids_list(self):
        """
        Increasing retmax allows more of the retrieved UIDs to be included in
        the XML output, up to a maximum of 100,000 records.
        from http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        """
        retmax = self.retmax_esearch
        if (self.count > retmax):
            num_batches = (self.count / retmax) + 1
        else:
            num_batches = 1
        self.logger.info("Batch size for esearch action: %d UIDs" % retmax)
        self.logger.info("Number of batches for esearch action: %d " %
                         num_batches)
        for n in range(num_batches):
            querylog = self.esearch(self.dbname, self.query_string, n*retmax,
                                    retmax, '', self.datetype, self.reldate,
                                    self.mindate, self.maxdate)
            for line in querylog:
                if '<Id>' in line and '</Id>' in line:
                    uid = (line[line.find('<Id>')+len('<Id>'):
                                line.find('</Id>')])
                    self.ids.append(uid)
        self.logger.info("Retrieved %d UIDs" % len(self.ids))

    def esearch(self, db, term, retstart, retmax, rettype, datetype, reldate,
                mindate, maxdate):
        url = self.base + "esearch.fcgi"
        self.logger.debug("url: %s" % url)
        values = {'db': db,
                  'term': term,
                  'rettype': rettype,
                  'retstart': retstart,
                  'retmax': retmax,
                  'datetype': datetype,
                  'reldate': reldate,
                  'mindate': mindate,
                  'maxdate': maxdate}
        data = urllib.urlencode(values)
        self.logger.debug("data: %s" % str(data))
        req = urllib2.Request(url, data)
        response = urllib2.urlopen(req)
        querylog = response.readlines()
        response.close()
        time.sleep(1)
        return querylog

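    # For reference, the urlencoded esearch payload built above looks like
    # (values illustrative; brackets are percent-encoded by urlencode):
    #   db=nuccore&term=txid10239%5Borgn%5D&rettype=count&retstart=&retmax=
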
    def epost(self, db, ids):
        url = self.base + "epost.fcgi"
        self.logger.debug("url_epost: %s" % url)
        values = {'db': db,
                  'id': ids}
        data = urllib.urlencode(values)
        req = urllib2.Request(url, data)
        serverResponse = False
        nb_trials = 0
        while not serverResponse:
            nb_trials += 1
            try:
                self.logger.debug("Try number %s for opening and reading URL"
                                  " %s" % (nb_trials, url+data))
                response = urllib2.urlopen(req)
                querylog = response.readlines()
                response.close()
                serverResponse = True
            except urllib2.HTTPError as e:
                self.logger.info("urlopen error:%s, %s" % (e.code, e.read()))
                self.logger.info("Retrying in 1 sec")
                serverResponse = False
                time.sleep(1)
            except urllib2.URLError as e:
                self.logger.info("urlopen error: Failed to reach a server")
                self.logger.info("Reason :%s" % (e.reason))
                self.logger.info("Retrying in 1 sec")
                serverResponse = False
                time.sleep(1)
            except httplib.IncompleteRead as e:
                self.logger.info("IncompleteRead error: %s" % (e.partial))
                self.logger.info("Retrying in 1 sec")
                serverResponse = False
                time.sleep(1)
        self.logger.debug("query response:")
        for line in querylog:
            self.logger.debug(line.rstrip())
            if '</QueryKey>' in line:
                self.query_key = str(line[line.find('<QueryKey>') +
                                          len('<QueryKey>'):
                                          line.find('</QueryKey>')])
            if '</WebEnv>' in line:
                self.webenv = str(line[line.find('<WebEnv>')+len('<WebEnv>'):
                                       line.find('</WebEnv>')])
        self.logger.debug("*** epost action ***")
        self.logger.debug("query_key: %s" % self.query_key)
        self.logger.debug("webenv: %s" % self.webenv)
        time.sleep(1)

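    # For reference, a successful epost response parsed above contains
    # (values illustrative):
    #   <QueryKey>1</QueryKey>
    #   <WebEnv>NCID_01_...</WebEnv>
    # which efetch then uses to address the posted UID batch on the
    # History server.
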
    def efetch(self, db, query_key, webenv):
        url = self.base + "efetch.fcgi"
        self.logger.debug("url_efetch: %s" % url)
        values = {'db': db,
                  'query_key': query_key,
                  'webenv': webenv,
                  'rettype': "fasta",
                  'retmode': "text"}
        data = urllib.urlencode(values)
        req = urllib2.Request(url, data)
        self.logger.debug("data: %s" % str(data))
        serverTransaction = False
        counter = 0
        response_code = 0
        while not serverTransaction:
            counter += 1
            self.logger.info("Server Transaction Trial: %s" % (counter))
            try:
                response = urllib2.urlopen(req)
                response_code = response.getcode()
                fasta = response.read()
                response.close()
                if((response_code != 200) or
                   ("Resource temporarily unavailable" in fasta) or
                   ("Error" in fasta) or (not fasta.startswith(">"))):
                    serverTransaction = False
                    if (response_code != 200):
                        self.logger.info("urlopen error: Response code is"
                                         " not 200")
                    elif ("Resource temporarily unavailable" in fasta):
                        self.logger.info("Resource temporarily unavailable")
                    elif ("Error" in fasta):
                        self.logger.info("Error in fasta")
                    else:
                        self.logger.info("Fasta doesn't start with '>'")
                else:
                    serverTransaction = True
            except urllib2.HTTPError as e:
                serverTransaction = False
                self.logger.info("urlopen error:%s, %s" % (e.code, e.read()))
            except urllib2.URLError as e:
                serverTransaction = False
                self.logger.info("urlopen error: Failed to reach a server")
                self.logger.info("Reason :%s" % (e.reason))
            except httplib.IncompleteRead as e:
                serverTransaction = False
                self.logger.info("IncompleteRead error: %s" % (e.partial))
            if (counter > 500):
                serverTransaction = True
        if (counter > 500):
            raise QueryException({"message":
                                  "500 Server Transaction Trials attempted"
                                  " for this batch. Aborting."})
        fasta = self.sanitiser(self.dbname, fasta)
        time.sleep(0.1)
        return fasta

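    # For reference, a successful efetch payload starts with '>' and holds
    # FASTA records separated by blank lines, e.g. (records illustrative):
    #   >gi|123456|ref|NC_000000.0| some nucleotide record
    #   ATGCATGC...
    #
    #   >gi|123457|ref|NC_000001.0| another record
    #   ATGCATGC...
    # sanitiser() below relies on that blank-line separation.
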
    def sanitiser(self, db, fastaseq):
        if db not in ("nuccore", "protein"):
            return fastaseq
        regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}")
        sane_seqlist = []
        seqlist = fastaseq.split("\n\n")
        for seq in seqlist[:-1]:
            fastalines = seq.split("\n")
            if len(fastalines) < 2:
                self.logger.info("Empty sequence for %s" %
                                 ("|".join(fastalines[0].split("|")[:4])))
                self.logger.info("%s download is skipped" %
                                 ("|".join(fastalines[0].split("|")[:4])))
                continue
            if db == "nuccore":
                badnuc = 0
                for nucleotide in fastalines[1]:
                    if nucleotide not in "ATGC":
                        badnuc += 1
                if float(badnuc)/len(fastalines[1]) > 0.4:
                    self.logger.info(
                        "%s ambiguous nucleotides in %s or download "
                        "interrupted at this offset | %s" %
                        (float(badnuc)/len(fastalines[1]),
                         "|".join(fastalines[0].split("|")[:4]),
                         fastalines[1]))
                    self.logger.info("%s download is skipped" %
                                     ("|".join(fastalines[0].split("|")[:4])))
                    continue
                # remove spaces and trim the header to 100 chars
                fastalines[0] = fastalines[0].replace(" ", "_")[:100]
                cleanseq = "\n".join(fastalines)
                sane_seqlist.append(cleanseq)
            elif db == "protein":
                fastalines[0] = fastalines[0][0:100]
                fastalines[0] = fastalines[0].replace(" ", "_")
                fastalines[0] = fastalines[0].replace("[", "_")
                fastalines[0] = fastalines[0].replace("]", "_")
                # replace '=' because blast makedb doesn't like it
                fastalines[0] = fastalines[0].replace("=", "_")
                fastalines[0] = fastalines[0].rstrip("_")
                fastalines[0] = re.sub(regex, "_", fastalines[0])
                cleanseq = "\n".join(fastalines)
                sane_seqlist.append(cleanseq)
        self.logger.info("clean sequences appended: %d" % (len(sane_seqlist)))
        return "\n".join(sane_seqlist)

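    # For reference, the protein-header sanitisation above turns a header
    # such as (illustrative):
    #   >sp|P12345|EXAMPLE_PROT Example protein [Example organism]
    # into:
    #   >sp|P12345|EXAMPLE_PROT_Example_protein__Example_organism
    # (spaces, brackets and '=' replaced by '_', header capped at 100 chars,
    # trailing '_' stripped).
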
    def get_sequences(self):
        """
        Total number of records from the input set to be retrieved,
        up to a maximum of 10,000. Optionally, for a large set the value of
        retstart can be iterated while holding retmax constant, thereby
        downloading the entire set in batches of size retmax.
        http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
        """
        batch_size = self.retmax_efetch
        count = self.count
        uids_list = self.ids
        self.logger.info("Batch size for efetch action: %d" % batch_size)
        self.logger.info("Number of batches for efetch action: %d" %
                         ((count / batch_size) + 1))
        with open(self.outname, 'w') as out:
            for start in range(0, count, batch_size):
                end = min(count, start+batch_size)
                batch = uids_list[start:end]
                # epost retries until the batch is posted, so this check
                # never fails in practice
                if self.epost(self.dbname, ",".join(batch)) != -1:
                    mfasta = ''
                    while not mfasta:
                        self.logger.info("retrieving batch %d" %
                                         ((start / batch_size) + 1))
                        try:
                            mfasta = self.efetch(self.dbname, self.query_key,
                                                 self.webenv)
                            out.write(mfasta + '\n')
                        except QueryException as e:
                            self.logger.error("%s" % e.message)
                            raise e


LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']


def __main__():
    """ main function """
    parser = optparse.OptionParser(description='Retrieve data from NCBI')
    parser.add_option('-i', dest='query_string', help='NCBI Query String')
    parser.add_option('-o', dest='outname', help='output file name')
    parser.add_option('-d', dest='dbname', help='database type')
    parser.add_option('-l', '--logfile', help='log file (default=stderr)')
    parser.add_option('--datetype', dest='datetype',
                      choices=['mdat', 'pdat'],
                      help='Type of date used to limit a search.'
                           ' [mdat (modification date), pdat (publication'
                           ' date)] (default=pdat)', default='pdat')
    parser.add_option('--reldate', dest='reldate',
                      help='When reldate is set to an integer n, the search'
                           ' returns only those items that have a date'
                           ' specified by datetype within the last n days.')
    parser.add_option('--maxdate', dest='maxdate',
                      help='Date range used to limit a search result by the'
                           ' date specified by datetype. These two parameters'
                           ' (mindate, maxdate) must be used together to'
                           ' specify an arbitrary date range. The general'
                           ' date format is YYYY/MM/DD, and these variants'
                           ' are also allowed: YYYY, YYYY/MM.')
    parser.add_option('--mindate', dest='mindate',
                      help='Date range used to limit a search result by the'
                           ' date specified by datetype. These two parameters'
                           ' (mindate, maxdate) must be used together to'
                           ' specify an arbitrary date range. The general'
                           ' date format is YYYY/MM/DD, and these variants'
                           ' are also allowed: YYYY, YYYY/MM.')
    parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO',
                      help='logging level (default: INFO)')
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error('Wrong number of arguments')
    if((options.reldate and options.maxdate) or
       (options.reldate and options.mindate)):
        parser.error("You can't mix the 'reldate' parameter with 'maxdate'"
                     " or 'mindate'")
    if((options.mindate and not options.maxdate) or
       (options.maxdate and not options.mindate)):
        parser.error("mindate and maxdate must be used together")

    log_level = getattr(logging, options.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    if options.logfile:
        kwargs['filename'] = options.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger('data_from_NCBI')

    E = Eutils(options, logger)
    try:
        E.retrieve()
    except Exception as e:
        logger.error(e)
        sys.exit(1)


if __name__ == "__main__":
    __main__()
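
# Example invocation (query string hypothetical; flags as defined above;
# the -o value shown is the script's own default output name for nuccore):
#   python fetch_fasta_from_NCBI.py -i "txid10239[orgn]" -d nuccore \
#       -o NCBI_download.nuccore.fasta --loglevel INFO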