comparison uniprot.py @ 10:95fb5712344f draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/uniprot_rest_interface commit 1c020106d4d7f957c9f1ec0d9885bbb2d56e70e7
author bgruening
date Tue, 06 Aug 2024 14:49:45 +0000
parents 468c71dac78a
children
comparison
equal deleted inserted replaced
9:468c71dac78a 10:95fb5712344f
2 import json 2 import json
3 import re 3 import re
4 import sys 4 import sys
5 import time 5 import time
6 import zlib 6 import zlib
7 from time import sleep
7 from urllib.parse import ( 8 from urllib.parse import (
8 parse_qs, 9 parse_qs,
9 urlencode, 10 urlencode,
10 urlparse, 11 urlparse,
11 ) 12 )
16 HTTPAdapter, 17 HTTPAdapter,
17 Retry, 18 Retry,
18 ) 19 )
19 20
20 21
21 POLLING_INTERVAL = 3 22 BATCH_SIZE = 50000 # Limit at UniProt is 100k
23 POLLING_INTERVAL = 5
22 API_URL = "https://rest.uniprot.org" 24 API_URL = "https://rest.uniprot.org"
23 25
24 26
25 retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504]) 27 retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
26 session = requests.Session() 28 session = requests.Session()
29 31
30 def check_response(response): 32 def check_response(response):
31 try: 33 try:
32 response.raise_for_status() 34 response.raise_for_status()
33 except requests.HTTPError: 35 except requests.HTTPError:
34 print(response.json())
35 raise 36 raise
36 37
37 38
38 def submit_id_mapping(from_db, to_db, ids): 39 def submit_id_mapping(from_db, to_db, ids):
39 print(f"{from_db} {to_db}") 40 print(f"{from_db} {to_db}")
57 while True: 58 while True:
58 request = session.get(f"{API_URL}/idmapping/status/{job_id}") 59 request = session.get(f"{API_URL}/idmapping/status/{job_id}")
59 check_response(request) 60 check_response(request)
60 j = request.json() 61 j = request.json()
61 if "jobStatus" in j: 62 if "jobStatus" in j:
62 if j["jobStatus"] == "RUNNING": 63 if j["jobStatus"] in ["NEW", "RUNNING"]:
63 print(f"Retrying in {POLLING_INTERVAL}s") 64 print(f"Retrying in {POLLING_INTERVAL}s")
64 time.sleep(POLLING_INTERVAL) 65 time.sleep(POLLING_INTERVAL)
65 else: 66 else:
66 raise Exception(j["jobStatus"]) 67 raise Exception(j["jobStatus"])
67 else: 68 else:
100 if compressed: 101 if compressed:
101 decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS) 102 decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
102 if file_format == "json": 103 if file_format == "json":
103 j = json.loads(decompressed.decode("utf-8")) 104 j = json.loads(decompressed.decode("utf-8"))
104 return j 105 return j
105 elif file_format == "tsv": 106 elif file_format in ["tsv", "gff"]:
106 return [line for line in decompressed.decode("utf-8").split("\n") if line] 107 return [line for line in decompressed.decode("utf-8").split("\n") if line]
107 elif file_format == "xlsx": 108 elif file_format == "xlsx":
108 return [decompressed] 109 return [decompressed]
109 elif file_format == "xml": 110 elif file_format == "xml":
110 return [decompressed.decode("utf-8")] 111 return [decompressed.decode("utf-8")]
111 else: 112 else:
112 return decompressed.decode("utf-8") 113 return decompressed.decode("utf-8")
113 elif file_format == "json": 114 elif file_format == "json":
114 return response.json() 115 return response.json()
115 elif file_format == "tsv": 116 elif file_format in ["tsv", "gff"]:
116 return [line for line in response.text.split("\n") if line] 117 return [line for line in response.text.split("\n") if line]
117 elif file_format == "xlsx": 118 elif file_format == "xlsx":
118 return [response.content] 119 return [response.content]
119 elif file_format == "xml": 120 elif file_format == "xml":
120 return [response.text] 121 return [response.text]
139 def print_progress_batches(batch_index, size, total): 140 def print_progress_batches(batch_index, size, total):
140 n_fetched = min((batch_index + 1) * size, total) 141 n_fetched = min((batch_index + 1) * size, total)
141 print(f"Fetched: {n_fetched} / {total}") 142 print(f"Fetched: {n_fetched} / {total}")
142 143
143 144
144 def get_id_mapping_results_search(url): 145 def get_id_mapping_results_search(url, first):
145 parsed = urlparse(url) 146 parsed = urlparse(url)
146 query = parse_qs(parsed.query) 147 query = parse_qs(parsed.query)
147 file_format = query["format"][0] if "format" in query else "json" 148 file_format = query["format"][0] if "format" in query else "json"
148 if "size" in query: 149 if "size" in query:
149 size = int(query["size"][0]) 150 size = int(query["size"][0])
161 total = int(request.headers["x-total-results"]) 162 total = int(request.headers["x-total-results"])
162 print_progress_batches(0, size, total) 163 print_progress_batches(0, size, total)
163 for i, batch in enumerate(get_batch(request, file_format, compressed), 1): 164 for i, batch in enumerate(get_batch(request, file_format, compressed), 1):
164 results = combine_batches(results, batch, file_format) 165 results = combine_batches(results, batch, file_format)
165 print_progress_batches(i, size, total) 166 print_progress_batches(i, size, total)
167 if len(results) > 1 and file_format == "tsv" and not first:
168 results = results[1:]
166 if file_format == "xml": 169 if file_format == "xml":
167 return merge_xml_results(results) 170 return merge_xml_results(results)
168 return results 171 return results
169 172
170 173
264 # get the IDs from the file as sorted list 267 # get the IDs from the file as sorted list
265 # (sorted is convenient for testing) 268 # (sorted is convenient for testing)
266 query = set() 269 query = set()
267 for line in args.inp: 270 for line in args.inp:
268 query.add(line.strip()) 271 query.add(line.strip())
269 query = sorted(query) 272 query = list(query)
270 273 results = []
271 if args.tool == "map": 274 first = True # if False the header is removed
272 job_id = submit_id_mapping(from_db=args.f, to_db=args.t, ids=query) 275 while len(query) > 0:
273 elif args.tool == "retrieve": 276 batch = query[:BATCH_SIZE]
274 job_id = submit_id_mapping( 277 query = query[BATCH_SIZE:]
275 from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=query 278 print(f"processing {len(batch)} left {len(query)}")
276 ) 279 if args.tool == "map":
277 280 job_id = submit_id_mapping(from_db=args.f, to_db=args.t, ids=batch)
278 if check_id_mapping_results_ready(job_id): 281 elif args.tool == "retrieve":
279 link = get_id_mapping_results_link(job_id) 282 job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=batch)
280 link = f"{link}?format={args.format}" 283
281 print(link) 284 if check_id_mapping_results_ready(job_id):
282 results = get_id_mapping_results_search(link) 285 link = get_id_mapping_results_link(job_id)
286 link = f"{link}?format={args.format}"
287 print(link)
288 results.extend(get_id_mapping_results_search(link, first))
289 first = False
290 print(f"got {len(results)} results so far")
291 if len(query):
292 sleep(5)
283 293
284 if not isinstance(results, str): 294 if not isinstance(results, str):
285 results = "\n".join(results) 295 results = "\n".join(results)
286 args.out.write(f"{results}\n") 296 args.out.write(f"{results}\n")