Mercurial > repos > drosofff > fetch_fasta_from_ncbi
comparison retrieve_fasta_from_NCBI.py @ 4:64f45c5e94a0 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit ca132a4b5d5d7175e6e8bd62cc518397d14dad17
author | drosofff |
---|---|
date | Mon, 15 May 2017 03:10:11 -0400 |
parents | a9d8f69d59fb |
children | c6de5c7b4ae3 |
comparison legend: equal | deleted | inserted | replaced
comparing revision 3:a9d8f69d59fb (old, left column) with revision 4:64f45c5e94a0 (new, right column)
---|---|
19 retmax of efetch is 1/10 of declared value from NCBI | 19 retmax of efetch is 1/10 of declared value from NCBI |
20 | 20 |
21 queries are 1 sec delayed, to satisfy NCBI guidelines (more than what they request) | 21 queries are 1 sec delayed, to satisfy NCBI guidelines (more than what they request) |
22 | 22 |
23 | 23 |
24 python get_fasta_from_taxon.py -i 1638 -o test.out -d protein | |
25 python get_fasta_from_taxon.py -i 327045 -o test.out -d nuccore # 556468 UIDs | |
26 """ | 24 """ |
27 import sys | 25 import sys |
28 import logging | 26 import logging |
29 import optparse | 27 import optparse |
30 import time | 28 import time |
35 | 33 |
36 | 34 |
37 class Eutils: | 35 class Eutils: |
38 | 36 |
39 def __init__(self, options, logger): | 37 def __init__(self, options, logger): |
38 """ | |
39 Initialize retrieval parameters | |
40 """ | |
40 self.logger = logger | 41 self.logger = logger |
41 self.base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" | 42 self.base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" |
42 self.query_string = options.query_string | 43 self.query_string = options.query_string |
43 self.dbname = options.dbname | 44 self.dbname = options.dbname |
44 if options.outname: | 45 if options.outname: |
45 self.outname = options.outname | 46 self.outname = options.outname |
46 else: | 47 else: |
47 self.outname = 'NCBI_download' + '.' + self.dbname + '.fasta' | 48 self.outname = 'NCBI_download' + '.' + self.dbname + '.fasta' |
48 self.ids = [] | 49 self.ids = [] |
49 self.retmax_esearch = 100000 | 50 self.retmax_esearch = 100000 |
50 self.retmax_efetch = 1000 | 51 self.retmax_efetch = 500 |
51 self.count = 0 | 52 self.count = 0 |
52 self.webenv = "" | 53 self.webenv = "" |
53 self.query_key = "" | 54 self.query_key = "" |
54 | 55 |
55 def retrieve(self): | 56 def retrieve(self): |
56 """ """ | 57 """ |
58 Retrieve the fasta sequences corresponding to the query | |
59 """ | |
57 self.get_count_value() | 60 self.get_count_value() |
58 self.get_uids_list() | 61 |
59 self.get_sequences() | 62 # If no UIDs are found exit script |
63 if self.count > 0: | |
64 self.get_uids_list() | |
65 self.get_sequences() | |
66 else: | |
67 self.logger.info("No UIDs were found. Exiting script.") | |
60 | 68 |
61 def get_count_value(self): | 69 def get_count_value(self): |
62 """ | 70 """ |
63 just to retrieve Count (number of UIDs) | 71 just to retrieve Count (number of UIDs) |
64 Total number of UIDs from the retrieved set to be shown in the XML | 72 Total number of UIDs from the retrieved set to be shown in the XML |
75 self.logger.debug("Query response:") | 83 self.logger.debug("Query response:") |
76 for line in querylog: | 84 for line in querylog: |
77 self.logger.debug(line.rstrip()) | 85 self.logger.debug(line.rstrip()) |
78 if '</Count>' in line: | 86 if '</Count>' in line: |
79 self.count = int(line[line.find('<Count>')+len('<Count>') : line.find('</Count>')]) | 87 self.count = int(line[line.find('<Count>')+len('<Count>') : line.find('</Count>')]) |
80 self.logger.info("Founded %d UIDs" % self.count) | 88 self.logger.info("Found %d UIDs" % self.count) |
81 | 89 |
82 def get_uids_list(self): | 90 def get_uids_list(self): |
83 """ | 91 """ |
84 Increasing retmax allows more of the retrieved UIDs to be included in the XML output, | 92 Increasing retmax allows more of the retrieved UIDs to be included in the XML output, |
85 up to a maximum of 100,000 records. | 93 up to a maximum of 100,000 records. |
111 data = urllib.urlencode(values) | 119 data = urllib.urlencode(values) |
112 self.logger.debug("data: %s" % str(data)) | 120 self.logger.debug("data: %s" % str(data)) |
113 req = urllib2.Request(url, data) | 121 req = urllib2.Request(url, data) |
114 response = urllib2.urlopen(req) | 122 response = urllib2.urlopen(req) |
115 querylog = response.readlines() | 123 querylog = response.readlines() |
124 response.close() | |
116 time.sleep(1) | 125 time.sleep(1) |
117 return querylog | 126 return querylog |
118 | 127 |
119 def epost(self, db, ids): | 128 def epost(self, db, ids): |
120 url = self.base + "epost.fcgi" | 129 url = self.base + "epost.fcgi" |
121 self.logger.debug("url_epost: %s" % url) | 130 self.logger.debug("url_epost: %s" % url) |
122 values = {'db': db, | 131 values = {'db': db, |
123 'id': ids} | 132 'id': ids} |
124 data = urllib.urlencode(values) | 133 data = urllib.urlencode(values) |
125 req = urllib2.Request(url, data) | 134 req = urllib2.Request(url, data) |
126 #self.logger.debug("data: %s" % str(data)) | |
127 req = urllib2.Request(url, data) | |
128 serverResponse = False | 135 serverResponse = False |
136 nb_trials = 0 | |
129 while not serverResponse: | 137 while not serverResponse: |
138 nb_trials += 1 | |
130 try: | 139 try: |
140 self.logger.debug("Try number %s for opening and readin URL %s" % ( nb_trials, url+data )) | |
131 response = urllib2.urlopen(req) | 141 response = urllib2.urlopen(req) |
142 querylog = response.readlines() | |
143 response.close() | |
132 serverResponse = True | 144 serverResponse = True |
133 except: # catch *all* exceptions | 145 except urllib2.HTTPError as e: |
134 e = sys.exc_info()[0] | 146 self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) ) |
135 self.logger.info( "Catched Error: %s" % e ) | 147 self.logger.info("Retrying in 1 sec") |
136 self.logger.info( "Retrying in 10 sec") | 148 serverResponse = False |
137 time.sleep(10) | 149 time.sleep(1) |
138 querylog = response.readlines() | 150 except urllib2.URLError as e: |
151 self.logger.info("urlopen error: Failed to reach a server") | |
152 self.logger.info("Reason :%s" % ( e.reason ) ) | |
153 self.logger.info("Retrying in 1 sec") | |
154 serverResponse = False | |
155 time.sleep(1) | |
156 except httplib.IncompleteRead as e: | |
157 self.logger.info("IncompleteRead error: %s" % ( e.partial ) ) | |
158 self.logger.info("Retrying in 1 sec") | |
159 serverResponse = False | |
160 time.sleep(1) | |
139 self.logger.debug("query response:") | 161 self.logger.debug("query response:") |
140 for line in querylog: | 162 for line in querylog: |
141 self.logger.debug(line.rstrip()) | 163 self.logger.debug(line.rstrip()) |
142 if '</QueryKey>' in line: | 164 if '</QueryKey>' in line: |
143 self.query_key = str(line[line.find('<QueryKey>')+len('<QueryKey>'):line.find('</QueryKey>')]) | 165 self.query_key = str(line[line.find('<QueryKey>')+len('<QueryKey>'):line.find('</QueryKey>')]) |
157 'rettype': "fasta", | 179 'rettype': "fasta", |
158 'retmode': "text"} | 180 'retmode': "text"} |
159 data = urllib.urlencode(values) | 181 data = urllib.urlencode(values) |
160 req = urllib2.Request(url, data) | 182 req = urllib2.Request(url, data) |
161 self.logger.debug("data: %s" % str(data)) | 183 self.logger.debug("data: %s" % str(data)) |
162 req = urllib2.Request(url, data) | |
163 serverTransaction = False | 184 serverTransaction = False |
164 counter = 0 | 185 counter = 0 |
186 response_code = 0 | |
165 while not serverTransaction: | 187 while not serverTransaction: |
166 counter += 1 | 188 counter += 1 |
167 self.logger.info("Server Transaction Trial: %s" % ( counter ) ) | 189 self.logger.info("Server Transaction Trial: %s" % ( counter ) ) |
168 try: | 190 try: |
169 response = urllib2.urlopen(req) | 191 response = urllib2.urlopen(req) |
192 response_code = response.getcode() | |
170 fasta = response.read() | 193 fasta = response.read() |
171 if ("Resource temporarily unavailable" in fasta) or (not fasta.startswith(">") ): | 194 response.close() |
195 if ( (response_code != 200) or ("Resource temporarily unavailable" in fasta) | |
196 or ("Error" in fasta) or (not fasta.startswith(">") ) ): | |
172 serverTransaction = False | 197 serverTransaction = False |
173 else: | 198 else: |
174 serverTransaction = True | 199 serverTransaction = True |
175 except urllib2.HTTPError as e: | 200 except urllib2.HTTPError as e: |
176 serverTransaction = False | 201 serverTransaction = False |
177 self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) ) | 202 self.logger.info("urlopen error:%s, %s" % (e.code, e.read() ) ) |
203 except urllib2.URLError as e: | |
204 serverTransaction = False | |
205 self.logger.info("urlopen error: Failed to reach a server") | |
206 self.logger.info("Reason :%s" % ( e.reason ) ) | |
178 except httplib.IncompleteRead as e: | 207 except httplib.IncompleteRead as e: |
179 serverTransaction = False | 208 serverTransaction = False |
180 self.logger.info("IncompleteRead error: %s" % ( e.partial ) ) | 209 self.logger.info("IncompleteRead error: %s" % ( e.partial ) ) |
181 fasta = self.sanitiser(self.dbname, fasta) # | 210 fasta = self.sanitiser(self.dbname, fasta) |
182 time.sleep(1) | 211 time.sleep(0.1) |
183 return fasta | 212 return fasta |
184 | 213 |
185 def sanitiser(self, db, fastaseq): | 214 def sanitiser(self, db, fastaseq): |
186 if db not in "nuccore protein" : return fastaseq | 215 if db not in "nuccore protein" : return fastaseq |
187 regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}") | 216 regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWYBZ]{49,}") |
235 self.logger.info("Number of batches for efetch action: %d" % ((count / batch_size) + 1)) | 264 self.logger.info("Number of batches for efetch action: %d" % ((count / batch_size) + 1)) |
236 with open(self.outname, 'w') as out: | 265 with open(self.outname, 'w') as out: |
237 for start in range(0, count, batch_size): | 266 for start in range(0, count, batch_size): |
238 end = min(count, start+batch_size) | 267 end = min(count, start+batch_size) |
239 batch = uids_list[start:end] | 268 batch = uids_list[start:end] |
240 self.epost(self.dbname, ",".join(batch)) | 269 if self.epost(self.dbname, ",".join(batch)) != -1: |
241 mfasta = '' | 270 mfasta = '' |
242 while not mfasta: | 271 while not mfasta: |
243 self.logger.info("retrieving batch %d" % ((start / batch_size) + 1)) | 272 self.logger.info("retrieving batch %d" % ((start / batch_size) + 1)) |
244 mfasta = self.efetch(self.dbname, self.query_key, self.webenv) | 273 mfasta = self.efetch(self.dbname, self.query_key, self.webenv) |
245 out.write(mfasta + '\n') | 274 out.write(mfasta + '\n') |
246 | 275 |
247 | 276 |
248 LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s' | 277 LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s' |
249 LOG_DATEFMT = '%Y-%m-%d %H:%M:%S' | 278 LOG_DATEFMT = '%Y-%m-%d %H:%M:%S' |
250 LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] | 279 LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] |