Mercurial > repos > iuc > ncbi_eutils_esearch
comparison eutils.py @ 3:e267701c187b draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_entrez_eutils commit dae34e5e182b4cceb808d7353080f14aa9a78ca9"
author | iuc |
---|---|
date | Wed, 23 Sep 2020 09:48:26 +0000 |
parents | c6096cd97120 |
children | ef77a3b01899 |
comparison
equal
deleted
inserted
replaced
2:c6096cd97120 | 3:e267701c187b |
---|---|
10 | 10 |
11 class Client(object): | 11 class Client(object): |
12 | 12 |
13 def __init__(self, history_file=None, user_email=None, admin_email=None): | 13 def __init__(self, history_file=None, user_email=None, admin_email=None): |
14 self.using_history = False | 14 self.using_history = False |
15 self.using_parsedids = False | |
15 | 16 |
16 if user_email is not None and admin_email is not None: | 17 if user_email is not None and admin_email is not None: |
17 Entrez.email = ';'.join((admin_email, user_email)) | 18 Entrez.email = ';'.join((admin_email, user_email)) |
18 elif user_email is not None: | 19 elif user_email is not None: |
19 Entrez.email = user_email | 20 Entrez.email = user_email |
27 "administrator email in NCBI_EUTILS_CONTACT") | 28 "administrator email in NCBI_EUTILS_CONTACT") |
28 | 29 |
29 if history_file is not None: | 30 if history_file is not None: |
30 with open(history_file, 'r') as handle: | 31 with open(history_file, 'r') as handle: |
31 data = json.loads(handle.read()) | 32 data = json.loads(handle.read()) |
32 self.query_key = data['QueryKey'] | 33 # esearch |
33 self.webenv = data['WebEnv'] | 34 if 'QueryKey' in data: |
34 self.using_history = True | 35 self.query_key = data['QueryKey'] |
36 self.webenv = data['WebEnv'] | |
37 self.query_keys = [] | |
38 self.query_keys += [data['QueryKey']] | |
39 self.using_history = True | |
40 elif 'query_key' in data: | |
41 self.query_key = data['query_key'] | |
42 self.webenv = data['WebEnv'] | |
43 self.query_keys = [] | |
44 self.query_keys += [data['query_key']] | |
45 self.using_history = True | |
46 elif 'esearchresult' in data: | |
47 self.query_key = data['esearchresult']['querykey'] | |
48 self.webenv = data['esearchresult']['webenv'] | |
49 self.query_keys = [] | |
50 self.query_keys += [data['esearchresult']['querykey']] | |
51 self.using_history = True | |
52 # elink | |
53 elif 'linksets' in data: | |
54 # elink for cmd=neighbor_history | |
55 if 'linksetdbhistories' in data['linksets'][0]: | |
56 self.webenv = data['linksets'][0]['webenv'] | |
57 self.query_key = data['linksets'][0]['linksetdbhistories'][0]['querykey'] | |
58 self.using_history = True | |
59 # elink for cmd=neighbor|neighbor_score | |
60 elif 'linksetdbs' in data['linksets'][0]: | |
61 self.using_parsedids = True | |
62 # elink for neighbor | |
63 if isinstance(data['linksets'][0]['linksetdbs'][0]['links'][0], str): | |
64 self.idstr = ','.join(data['linksets'][0]['linksetdbs'][0]['links']) | |
65 # elink for neighbor_score | |
66 else: | |
67 self.idstr = ','.join(map(lambda x: x['id'], data['linksets'][0]['linksetdbs'][0]['links'])) | |
68 if 'linksetdbhistories' in data['linksets'][0]: | |
69 self.webenv = data['linksets'][0]['webenv'] | |
70 self.query_keys = [] | |
71 for query in data['linksets'][0]['linksetdbhistories']: | |
72 if 'querykey' in query: | |
73 self.query_keys += [query['querykey']] | |
74 else: | |
75 print("No match") | |
76 print(data) | |
35 | 77 |
36 def get_history(self): | 78 def get_history(self): |
37 if not self.using_history: | 79 if self.using_history: |
38 return {} | |
39 else: | |
40 return { | 80 return { |
41 'query_key': self.query_key, | 81 'query_key': self.query_key, |
42 'WebEnv': self.webenv, | 82 'WebEnv': self.webenv, |
43 } | 83 } |
84 elif self.using_parsedids: | |
85 return { | |
86 'id': self.idstr, | |
87 } | |
88 else: | |
89 return {} | |
90 | |
91 def get_histories(self): | |
92 histories = [] | |
93 for key in self.query_keys: | |
94 histories += [{'WebEnv': self.webenv, 'query_key': key}] | |
95 return histories | |
44 | 96 |
45 def post(self, database, **payload): | 97 def post(self, database, **payload): |
46 return json.dumps(Entrez.read(Entrez.epost(database, **payload)), indent=4) | 98 return json.dumps(Entrez.read(Entrez.epost(database, **payload)), indent=4) |
47 | 99 |
48 def fetch(self, db, ftype=None, **payload): | 100 def fetch(self, db, ftype=None, **payload): |
49 os.makedirs("downloads") | 101 os.makedirs("downloads") |
50 | 102 |
51 if 'id' in payload: | 103 if 'id' in payload: |
52 summary = self.id_summary(db, payload['id']) | 104 summary = self.id_summary(db, payload['id']) |
105 elif 'WebEnv' not in payload or 'query_key' not in payload: | |
106 summary = self.history_summary(db) | |
53 else: | 107 else: |
54 summary = self.history_summary(db) | 108 summary = payload |
55 | 109 |
56 count = len(summary) | 110 count = len(summary) |
57 payload['retmax'] = BATCH_SIZE | 111 payload['retmax'] = BATCH_SIZE |
58 | 112 |
59 # This may be bad. I'm not sure yet. I think it will be ... but UGH. | 113 # This may be bad. I'm not sure yet. I think it will be ... but UGH. |
85 return Entrez.esummary(**payload).read() | 139 return Entrez.esummary(**payload).read() |
86 | 140 |
87 def link(self, **payload): | 141 def link(self, **payload): |
88 return Entrez.elink(**payload).read() | 142 return Entrez.elink(**payload).read() |
89 | 143 |
90 def extract_history(self, xml_data): | 144 def extract_history_from_xml_file(self, xml_file): |
91 parsed_data = Entrez.read(StringIO.StringIO(xml_data)) | |
92 history = {} | 145 history = {} |
93 for key in ('QueryKey', 'WebEnv'): | 146 with open(xml_file, 'r') as handle: |
94 if key in parsed_data: | 147 xml_str = handle.read() |
95 history[key] = parsed_data[key] | 148 history = self.extract_history_from_xml(xml_str) |
96 | |
97 return history | 149 return history |
150 | |
151 def extract_history_from_xml(self, xml_str): | |
152 try: | |
153 parsed_data = Entrez.read(StringIO(xml_str)) | |
154 history = {} | |
155 gotit = 0 | |
156 | |
157 # NOTE: the new code does not work when esearch output is used as elink input: the history parsed from esearch output (read from XML) does not work as an elink input payload, which needs 'QueryKey'. Notably, when elink output is parsed as input to elink, the XML 'QueryKey' must be converted to 'query_key' (for reasons not yet understood). Also note that efetch returned results using the 'QueryKey' key. | |
158 # For esearch xml history results | |
159 if 'QueryKey' in parsed_data: | |
160 history['query_key'] = parsed_data['QueryKey'] | |
161 gotit += 1 | |
162 if 'WebEnv' in parsed_data: | |
163 history['WebEnv'] = parsed_data['WebEnv'] | |
164 gotit += 1 | |
165 # For elink xml history results | |
166 if gotit < 2: | |
167 if 'LinkSetDbHistory' in parsed_data[0]: | |
168 if 'QueryKey' in parsed_data[0]['LinkSetDbHistory'][0]: | |
169 history['query_key'] = parsed_data[0]['LinkSetDbHistory'][0]['QueryKey'] | |
170 gotit += 1 | |
171 if 'WebEnv' in parsed_data[0]: | |
172 history['WebEnv'] = parsed_data[0]['WebEnv'] | |
173 gotit += 1 | |
174 if gotit < 2: | |
175 raise Exception("Could not find WebEnv in xml response") | |
176 except Exception as e: | |
177 print("Error parsing...") | |
178 print(xml_str) | |
179 raise(e) | |
180 | |
181 return history | |
182 | |
183 def extract_histories_from_xml_file(self, xml_file): | |
184 histories = [] | |
185 with open(xml_file, 'r') as handle: | |
186 xml_str = handle.read() | |
187 histories = self.extract_histories_from_xml(xml_str) | |
188 return histories | |
189 | |
190 def extract_histories_from_xml(self, xml_str): | |
191 try: | |
192 parsed_data = Entrez.read(StringIO(xml_str)) | |
193 histories = [] | |
194 gotit = 0 | |
195 | |
196 # NOTE: the new code does not work when esearch output is used as elink input: the history parsed from esearch output (read from XML) does not work as an elink input payload, which needs 'QueryKey'. Notably, when elink output is parsed as input to elink, the XML 'QueryKey' must be converted to 'query_key' (for reasons not yet understood). Also note that efetch returned results using the 'QueryKey' key. | |
197 # For esearch xml history results | |
198 if 'QueryKey' in parsed_data: | |
199 tmp_hist = {} | |
200 tmp_hist['query_key'] = parsed_data['QueryKey'] | |
201 gotit += 1 | |
202 if 'WebEnv' in parsed_data: | |
203 tmp_hist['WebEnv'] = parsed_data['WebEnv'] | |
204 gotit += 1 | |
205 if gotit == 2: | |
206 histories += [tmp_hist] | |
207 # For elink xml history results | |
208 else: | |
209 gotenv = 0 | |
210 if 'LinkSetDbHistory' in parsed_data[0]: | |
211 for query in parsed_data[0]['LinkSetDbHistory']: | |
212 tmp_hist = {} | |
213 if 'WebEnv' in parsed_data[0]: | |
214 tmp_hist['WebEnv'] = parsed_data[0]['WebEnv'] | |
215 if 'QueryKey' in query: | |
216 tmp_hist['query_key'] = query['QueryKey'] | |
217 histories += [tmp_hist] | |
218 gotit += 1 | |
219 if gotit == 0 and gotenv == 0: | |
220 raise Exception("Could not find WebEnv in xml response") | |
221 except Exception as e: | |
222 print("Error parsing...") | |
223 print(xml_str) | |
224 raise(e) | |
225 | |
226 return histories | |
98 | 227 |
99 def search(self, **payload): | 228 def search(self, **payload): |
100 return Entrez.esearch(**payload).read() | 229 return Entrez.esearch(**payload).read() |
101 | 230 |
102 def info(self, **kwargs): | 231 def info(self, **kwargs): |
107 | 236 |
108 def citmatch(self, **kwargs): | 237 def citmatch(self, **kwargs): |
109 return Entrez.ecitmatch(**kwargs).read() | 238 return Entrez.ecitmatch(**kwargs).read() |
110 | 239 |
111 @classmethod | 240 @classmethod |
112 def parse_ids(cls, id_list, id, history_file): | 241 def jsonstring2jsondata(cls, json_str): |
242 json_handle = StringIO(json_str) | |
243 json_data = json.loads(json_handle.read()) | |
244 return json_data | |
245 | |
246 @classmethod | |
247 def jsonfile2UIlist(cls, json_file): | |
248 merged_ids = [] | |
249 with open(json_file, 'r') as handle: | |
250 json_data = json.loads(handle.read()) | |
251 for id in cls.jsondata2UIlist(json_data): | |
252 merged_ids += [id] | |
253 return merged_ids | |
254 | |
255 @classmethod | |
256 def jsondata2UIlist(cls, json_data): | |
257 merged_ids = [] | |
258 | |
259 # Always prioritize the result links as opposed to the search links | |
260 # elink - retrieves linked IDs for cmd=neighbor|neighbor_score only | |
261 if 'linksets' in json_data: | |
262 for lnk in json_data['linksets'][0]['linksetdbs']: | |
263 if 'links' in lnk: | |
264 for id in lnk['links']: | |
265 # elink for neighbor | |
266 if isinstance(id, str): | |
267 merged_ids.append(id) | |
268 # elink for neighbor_score | |
269 else: | |
270 merged_ids.append(id['id']) | |
271 # esearch | |
272 elif 'esearchresult' in json_data: | |
273 for id in json_data['esearchresult']['idlist']: | |
274 merged_ids += [id] | |
275 | |
276 return merged_ids | |
277 | |
278 @classmethod | |
279 def xmlfile2UIlist(cls, xml_file): | |
280 merged_ids = [] | |
281 with open(xml_file, 'r') as handle: | |
282 xml_data = Entrez.read(handle) | |
283 for id in cls.xmldata2UIlist(xml_data): | |
284 merged_ids += [id] | |
285 return merged_ids | |
286 | |
287 @classmethod | |
288 def xmlstring2UIlist(cls, xml_str): | |
289 merged_ids = [] | |
290 xml_data = Entrez.read(StringIO(xml_str)) | |
291 for id in cls.xmldata2UIlist(xml_data): | |
292 merged_ids += [id] | |
293 return merged_ids | |
294 | |
295 @classmethod | |
296 def xmldata2UIlist(cls, xml_data): | |
297 merged_ids = [] | |
298 | |
299 try: | |
300 # Always prioritize the result links as opposed to the search links | |
301 # elink - retrieves linked IDs for cmd=neighbor|neighbor_score only | |
302 if 'LinkSetDb' in xml_data[0]: | |
303 for lnk in xml_data[0]['LinkSetDb'][0]['Link']: | |
304 # elink for neighbor | |
305 if isinstance(lnk, str): | |
306 merged_ids.append(lnk) | |
307 # elink for neighbor_score | |
308 else: | |
309 merged_ids.append(lnk['Id']) | |
310 # esearch | |
311 elif 'IdList' in xml_data: | |
312 for id in xml_data['IdList']: | |
313 merged_ids += [id] | |
314 # If it was not elink output, we will end up here | |
315 except Exception: | |
316 # esearch | |
317 if 'IdList' in xml_data: | |
318 for id in xml_data['IdList']: | |
319 merged_ids += [id] | |
320 | |
321 return merged_ids | |
322 | |
323 @classmethod | |
324 def parse_ids(cls, id_list, id, history_file, xml_file, json_file): | |
113 """Parse IDs passed on --cli or in a file passed to the cli | 325 """Parse IDs passed on --cli or in a file passed to the cli |
114 """ | 326 """ |
115 merged_ids = [] | 327 merged_ids = [] |
116 if id is not None: | 328 if id is not None: |
117 for pid in id.replace('__cn__', ',').replace('\n', ',').split(','): | 329 for pid in id.replace('__cn__', ',').replace('\n', ',').split(','): |
120 | 332 |
121 if id_list is not None: | 333 if id_list is not None: |
122 with open(id_list, 'r') as handle: | 334 with open(id_list, 'r') as handle: |
123 merged_ids += [x.strip() for x in handle.readlines()] | 335 merged_ids += [x.strip() for x in handle.readlines()] |
124 | 336 |
125 # Exception handled here for uniformity | 337 if xml_file is not None: |
126 if len(merged_ids) == 0 and history_file is None: | 338 tmp_ids = cls.xmlfile2UIlist(xml_file) |
127 raise Exception("Must provide history file or IDs") | 339 for id in tmp_ids: |
128 | 340 merged_ids += [id] |
129 return merged_ids | 341 |
342 if json_file is not None: | |
343 tmp_ids = cls.jsonfile2UIlist(json_file) | |
344 for id in tmp_ids: | |
345 merged_ids += [id] | |
346 | |
347 return merged_ids | |
348 | |
349 @classmethod | |
350 def getVersion(cls): | |
351 """Return the biopython version | |
352 """ | |
353 import Bio | |
354 return Bio.__version__ |