comparison uniprot.py @ 10:95fb5712344f draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/uniprot_rest_interface commit 1c020106d4d7f957c9f1ec0d9885bbb2d56e70e7
author | bgruening
---|---
date | Tue, 06 Aug 2024 14:49:45 +0000
parents | 468c71dac78a
children | (none)
9:468c71dac78a | 10:95fb5712344f |
---|---|
2 import json | 2 import json |
3 import re | 3 import re |
4 import sys | 4 import sys |
5 import time | 5 import time |
6 import zlib | 6 import zlib |
| 7 from time import sleep |
7 from urllib.parse import ( | 8 from urllib.parse import ( |
8 parse_qs, | 9 parse_qs, |
9 urlencode, | 10 urlencode, |
10 urlparse, | 11 urlparse, |
11 ) | 12 ) |
16 HTTPAdapter, | 17 HTTPAdapter, |
17 Retry, | 18 Retry, |
18 ) | 19 ) |
19 | 20 |
20 | 21 |
21 POLLING_INTERVAL = 3 | 22 BATCH_SIZE = 50000 # Limit at UniProt is 100k |
| 23 POLLING_INTERVAL = 5 |
22 API_URL = "https://rest.uniprot.org" | 24 API_URL = "https://rest.uniprot.org" |
23 | 25 |
24 | 26 |
25 retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504]) | 27 retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504]) |
26 session = requests.Session() | 28 session = requests.Session() |
29 | 31 |
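The retry policy itself is unchanged in this hunk; the `session.mount` call that applies it to the session falls outside the displayed context. A minimal sketch of the pattern, assuming the standard `requests` adapter API (the mount line is an assumption, not part of the shown diff):

```python
import requests
from requests.adapters import HTTPAdapter, Retry

# Retry transient server errors up to 5 times, backing off
# 0.25 s, 0.5 s, 1 s, ... between attempts.
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
# Assumed: applies the retry policy to every https:// request made
# through this session (this call sits in the elided lines of the script).
session.mount("https://", HTTPAdapter(max_retries=retries))
```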
30 def check_response(response): | 32 def check_response(response): |
31 try: | 33 try: |
32 response.raise_for_status() | 34 response.raise_for_status() |
33 except requests.HTTPError: | 35 except requests.HTTPError: |
34 print(response.json()) | |
35 raise | 36 raise |
36 | 37 |
37 | 38 |
38 def submit_id_mapping(from_db, to_db, ids): | 39 def submit_id_mapping(from_db, to_db, ids): |
39 print(f"{from_db} {to_db}") | 40 print(f"{from_db} {to_db}") |
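The body of `submit_id_mapping` is elided in this view. A hedged sketch of what such a submission typically looks like against UniProt's documented `/idmapping/run` endpoint; the exact payload handling in the script may differ:

```python
def submit_id_mapping(from_db, to_db, ids):
    # POST the mapping job; UniProt answers with a jobId to poll.
    # Endpoint and field names follow the public UniProt REST docs.
    response = session.post(
        f"{API_URL}/idmapping/run",
        data={"from": from_db, "to": to_db, "ids": ",".join(ids)},
    )
    check_response(response)
    return response.json()["jobId"]
```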
57 while True: | 58 while True: |
58 request = session.get(f"{API_URL}/idmapping/status/{job_id}") | 59 request = session.get(f"{API_URL}/idmapping/status/{job_id}") |
59 check_response(request) | 60 check_response(request) |
60 j = request.json() | 61 j = request.json() |
61 if "jobStatus" in j: | 62 if "jobStatus" in j: |
62 if j["jobStatus"] == "RUNNING": | 63 if j["jobStatus"] in ["NEW", "RUNNING"]: |
63 print(f"Retrying in {POLLING_INTERVAL}s") | 64 print(f"Retrying in {POLLING_INTERVAL}s") |
64 time.sleep(POLLING_INTERVAL) | 65 time.sleep(POLLING_INTERVAL) |
65 else: | 66 else: |
66 raise Exception(j["jobStatus"]) | 67 raise Exception(j["jobStatus"]) |
67 else: | 68 else: |
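This hunk widens the polling condition: a job reported as `NEW` is now treated like `RUNNING` and waited on, instead of raising. A minimal sketch of the surrounding loop as it reads after the change (the return in the finished case is assumed, since those lines are elided):

```python
def check_id_mapping_results_ready(job_id):
    while True:
        request = session.get(f"{API_URL}/idmapping/status/{job_id}")
        check_response(request)
        j = request.json()
        if "jobStatus" in j:
            # NEW and RUNNING both mean "not done yet"; anything else is fatal.
            if j["jobStatus"] in ["NEW", "RUNNING"]:
                print(f"Retrying in {POLLING_INTERVAL}s")
                time.sleep(POLLING_INTERVAL)
            else:
                raise Exception(j["jobStatus"])
        else:
            # No jobStatus key: the job has finished and results are available.
            return True
```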
100 if compressed: | 101 if compressed: |
101 decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS) | 102 decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS) |
102 if file_format == "json": | 103 if file_format == "json": |
103 j = json.loads(decompressed.decode("utf-8")) | 104 j = json.loads(decompressed.decode("utf-8")) |
104 return j | 105 return j |
105 elif file_format == "tsv": | 106 elif file_format in ["tsv", "gff"]: |
106 return [line for line in decompressed.decode("utf-8").split("\n") if line] | 107 return [line for line in decompressed.decode("utf-8").split("\n") if line] |
107 elif file_format == "xlsx": | 108 elif file_format == "xlsx": |
108 return [decompressed] | 109 return [decompressed] |
109 elif file_format == "xml": | 110 elif file_format == "xml": |
110 return [decompressed.decode("utf-8")] | 111 return [decompressed.decode("utf-8")] |
111 else: | 112 else: |
112 return decompressed.decode("utf-8") | 113 return decompressed.decode("utf-8") |
113 elif file_format == "json": | 114 elif file_format == "json": |
114 return response.json() | 115 return response.json() |
115 elif file_format == "tsv": | 116 elif file_format in ["tsv", "gff"]: |
116 return [line for line in response.text.split("\n") if line] | 117 return [line for line in response.text.split("\n") if line] |
117 elif file_format == "xlsx": | 118 elif file_format == "xlsx": |
118 return [response.content] | 119 return [response.content] |
119 elif file_format == "xml": | 120 elif file_format == "xml": |
120 return [response.text] | 121 return [response.text] |
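For compressed responses the script inflates the payload with `zlib`, where `16 + zlib.MAX_WBITS` selects gzip framing. A self-contained sketch of that call:

```python
import zlib

def gunzip(content: bytes) -> bytes:
    # wbits = 16 + MAX_WBITS tells zlib to expect a gzip header and trailer,
    # which is what the UniProt API sends when compression is requested.
    return zlib.decompress(content, 16 + zlib.MAX_WBITS)
```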
139 def print_progress_batches(batch_index, size, total): | 140 def print_progress_batches(batch_index, size, total): |
140 n_fetched = min((batch_index + 1) * size, total) | 141 n_fetched = min((batch_index + 1) * size, total) |
141 print(f"Fetched: {n_fetched} / {total}") | 142 print(f"Fetched: {n_fetched} / {total}") |
142 | 143 |
143 | 144 |
144 def get_id_mapping_results_search(url): | 145 def get_id_mapping_results_search(url, first): |
145 parsed = urlparse(url) | 146 parsed = urlparse(url) |
146 query = parse_qs(parsed.query) | 147 query = parse_qs(parsed.query) |
147 file_format = query["format"][0] if "format" in query else "json" | 148 file_format = query["format"][0] if "format" in query else "json" |
148 if "size" in query: | 149 if "size" in query: |
149 size = int(query["size"][0]) | 150 size = int(query["size"][0]) |
161 total = int(request.headers["x-total-results"]) | 162 total = int(request.headers["x-total-results"]) |
162 print_progress_batches(0, size, total) | 163 print_progress_batches(0, size, total) |
163 for i, batch in enumerate(get_batch(request, file_format, compressed), 1): | 164 for i, batch in enumerate(get_batch(request, file_format, compressed), 1): |
164 results = combine_batches(results, batch, file_format) | 165 results = combine_batches(results, batch, file_format) |
165 print_progress_batches(i, size, total) | 166 print_progress_batches(i, size, total) |
| 167 if len(results) > 1 and file_format == "tsv" and not first: |
| 168 results = results[1:] |
166 if file_format == "xml": | 169 if file_format == "xml": |
167 return merge_xml_results(results) | 170 return merge_xml_results(results) |
168 return results | 171 return results |
169 | 172 |
170 | 173 |
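The new `first` parameter exists because every TSV batch arrives with its own header row; when batches from several jobs are concatenated, only the first may keep it. The same logic as the inserted lines above, factored into a hypothetical standalone helper for clarity:

```python
def strip_repeated_header(results, file_format, first):
    # Hypothetical helper mirroring the inserted lines: drop the header row
    # of every batch after the first so the concatenated TSV has one header.
    if len(results) > 1 and file_format == "tsv" and not first:
        return results[1:]
    return results
```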
264 # get the IDs from the file as sorted list | 267 # get the IDs from the file as sorted list |
265 # (sorted is convenient for testing) | 268 # (sorted is convenient for testing) |
266 query = set() | 269 query = set() |
267 for line in args.inp: | 270 for line in args.inp: |
268 query.add(line.strip()) | 271 query.add(line.strip()) |
269 query = sorted(query) | 272 query = list(query) |
270 | 273 results = [] |
271 if args.tool == "map": | 274 first = True # if False the header is removed |
272 job_id = submit_id_mapping(from_db=args.f, to_db=args.t, ids=query) | 275 while len(query) > 0: |
273 elif args.tool == "retrieve": | 276 batch = query[:BATCH_SIZE] |
274 job_id = submit_id_mapping( | 277 query = query[BATCH_SIZE:] |
275 from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=query | 278 print(f"processing {len(batch)} left {len(query)}") |
276 ) | 279 if args.tool == "map": |
277 | 280 job_id = submit_id_mapping(from_db=args.f, to_db=args.t, ids=batch) |
278 if check_id_mapping_results_ready(job_id): | 281 elif args.tool == "retrieve": |
279 link = get_id_mapping_results_link(job_id) | 282 job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=batch) |
280 link = f"{link}?format={args.format}" | 283 |
281 print(link) | 284 if check_id_mapping_results_ready(job_id): |
282 results = get_id_mapping_results_search(link) | 285 link = get_id_mapping_results_link(job_id) |
| 286 link = f"{link}?format={args.format}" |
| 287 print(link) |
| 288 results.extend(get_id_mapping_results_search(link, first)) |
| 289 first = False |
| 290 print(f"got {len(results)} results so far") |
| 291 if len(query): |
| 292 sleep(5) |
283 | 293 |
284 if not isinstance(results, str): | 294 if not isinstance(results, str): |
285 results = "\n".join(results) | 295 results = "\n".join(results) |
286 args.out.write(f"{results}\n") | 296 args.out.write(f"{results}\n") |
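Taken together, the main-loop changes turn one monolithic job into a sequence of jobs of at most `BATCH_SIZE` ids each. A condensed sketch of that driver, assuming the helpers shown above (the input filename is a placeholder):

```python
ids = list({line.strip() for line in open("ids.txt")})  # deduplicated input
results = []
first = True  # only the first batch keeps its TSV header
while ids:
    batch, ids = ids[:BATCH_SIZE], ids[BATCH_SIZE:]
    job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=batch)
    if check_id_mapping_results_ready(job_id):
        link = f"{get_id_mapping_results_link(job_id)}?format=tsv"
        results.extend(get_id_mapping_results_search(link, first))
        first = False
    if ids:
        sleep(5)  # pause between jobs to be gentle on the API
```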