diff uniprot.py @ 10:95fb5712344f (draft, default, tip)
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/uniprot_rest_interface commit 1c020106d4d7f957c9f1ec0d9885bbb2d56e70e7
author:   bgruening
date:     Tue, 06 Aug 2024 14:49:45 +0000
parents:  468c71dac78a
children:
--- a/uniprot.py	Wed May 22 21:18:15 2024 +0000
+++ b/uniprot.py	Tue Aug 06 14:49:45 2024 +0000
@@ -4,6 +4,7 @@
 import sys
 import time
 import zlib
+from time import sleep
 from urllib.parse import (
     parse_qs,
     urlencode,
@@ -18,7 +19,8 @@
 )
 
 
-POLLING_INTERVAL = 3
+BATCH_SIZE = 50000  # Limit at UniProt is 100k
+POLLING_INTERVAL = 5
 API_URL = "https://rest.uniprot.org"
 
 
@@ -31,7 +33,6 @@
     try:
         response.raise_for_status()
     except requests.HTTPError:
-        print(response.json())
         raise
 
 
@@ -59,7 +60,7 @@
         check_response(request)
         j = request.json()
         if "jobStatus" in j:
-            if j["jobStatus"] == "RUNNING":
+            if j["jobStatus"] in ["NEW", "RUNNING"]:
                 print(f"Retrying in {POLLING_INTERVAL}s")
                 time.sleep(POLLING_INTERVAL)
             else:
@@ -102,7 +103,7 @@
         if file_format == "json":
             j = json.loads(decompressed.decode("utf-8"))
             return j
-        elif file_format == "tsv":
+        elif file_format in ["tsv", "gff"]:
             return [line for line in decompressed.decode("utf-8").split("\n") if line]
         elif file_format == "xlsx":
             return [decompressed]
@@ -112,7 +113,7 @@
             return decompressed.decode("utf-8")
     elif file_format == "json":
         return response.json()
-    elif file_format == "tsv":
+    elif file_format in ["tsv", "gff"]:
         return [line for line in response.text.split("\n") if line]
     elif file_format == "xlsx":
         return [response.content]
@@ -141,7 +142,7 @@
     print(f"Fetched: {n_fetched} / {total}")
 
 
-def get_id_mapping_results_search(url):
+def get_id_mapping_results_search(url, first):
     parsed = urlparse(url)
     query = parse_qs(parsed.query)
     file_format = query["format"][0] if "format" in query else "json"
@@ -163,6 +164,8 @@
     for i, batch in enumerate(get_batch(request, file_format, compressed), 1):
         results = combine_batches(results, batch, file_format)
         print_progress_batches(i, size, total)
+    if len(results) > 1 and file_format == "tsv" and not first:
+        results = results[1:]
     if file_format == "xml":
         return merge_xml_results(results)
     return results
@@ -266,20 +269,27 @@
         query = set()
         for line in args.inp:
             query.add(line.strip())
-        query = sorted(query)
+        query = list(query)
+        results = []
+        first = True  # if False the header is removed
+        while len(query) > 0:
+            batch = query[:BATCH_SIZE]
+            query = query[BATCH_SIZE:]
+            print(f"processing {len(batch)} left {len(query)}")
+            if args.tool == "map":
+                job_id = submit_id_mapping(from_db=args.f, to_db=args.t, ids=batch)
+            elif args.tool == "retrieve":
+                job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=batch)
 
-        if args.tool == "map":
-            job_id = submit_id_mapping(from_db=args.f, to_db=args.t, ids=query)
-        elif args.tool == "retrieve":
-            job_id = submit_id_mapping(
-                from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=query
-            )
-
-        if check_id_mapping_results_ready(job_id):
-            link = get_id_mapping_results_link(job_id)
-            link = f"{link}?format={args.format}"
-            print(link)
-            results = get_id_mapping_results_search(link)
+            if check_id_mapping_results_ready(job_id):
+                link = get_id_mapping_results_link(job_id)
+                link = f"{link}?format={args.format}"
+                print(link)
+                results.extend(get_id_mapping_results_search(link, first))
+                first = False
+            print(f"got {len(results)} results so far")
+            if len(query):
+                sleep(5)
 
         if not isinstance(results, str):
             results = "\n".join(results)
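The core change in this revision is batching: instead of submitting all IDs as one ID-mapping job, the script now submits chunks of at most BATCH_SIZE IDs, polls each job until it leaves the NEW/RUNNING states, and concatenates the per-batch results, using the new `first` flag to drop the TSV header from every batch after the first. Below is a minimal, self-contained sketch of that submit/poll/collect pattern, assuming the `/idmapping/run` and `/idmapping/status` endpoints of the UniProt REST API that the script's helpers wrap; `ids.txt` is a hypothetical input file, and the helper names here are illustrative, not the script's own.

```python
import time

import requests

API_URL = "https://rest.uniprot.org"
BATCH_SIZE = 50000  # stay below UniProt's 100k-IDs-per-job limit
POLLING_INTERVAL = 5


def submit_id_mapping(from_db, to_db, ids):
    # Submit one batch of IDs as a single ID-mapping job; returns the job id.
    response = requests.post(
        f"{API_URL}/idmapping/run",
        data={"from": from_db, "to": to_db, "ids": ",".join(ids)},
    )
    response.raise_for_status()
    return response.json()["jobId"]


def wait_until_ready(job_id):
    # Poll the job status; NEW and RUNNING both mean "not finished yet",
    # which is exactly the condition the patch adds. Any other response
    # (finished job, results payload) ends the wait.
    while True:
        response = requests.get(f"{API_URL}/idmapping/status/{job_id}")
        response.raise_for_status()
        if response.json().get("jobStatus") in ("NEW", "RUNNING"):
            time.sleep(POLLING_INTERVAL)
        else:
            return


ids = [line.strip() for line in open("ids.txt")]  # hypothetical input
while ids:
    # Slice off one batch and shrink the remaining queue, as in the patch.
    batch, ids = ids[:BATCH_SIZE], ids[BATCH_SIZE:]
    job_id = submit_id_mapping("UniProtKB_AC-ID", "UniProtKB", batch)
    wait_until_ready(job_id)
    print(f"job {job_id} done, {len(ids)} ids left")
    if ids:
        time.sleep(5)  # brief pause between batches, mirroring sleep(5)
```

The pause between batches and the raised POLLING_INTERVAL keep the client polite toward the service, while the 50k batch size leaves headroom under the 100k cap noted in the new BATCH_SIZE comment.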