Mercurial > repos > bgruening > uniprot_rest_interface
diff uniprot.py @ 7:bfdc6a7ffd3a draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/uniprot_rest_interface commit ddbed5f0b0879b4a001d2da6a521b0c9a39c1e7b"
author | bgruening |
---|---|
date | Thu, 22 Apr 2021 17:32:16 +0000 |
parents | f7ebd1b4783b |
children | af5eccf83605 |
line wrap: on
line diff
--- a/uniprot.py Sun Sep 16 13:41:08 2018 -0400 +++ b/uniprot.py Thu Apr 22 17:32:16 2021 +0000 @@ -7,47 +7,61 @@ available services: map retrieve + +rewritten using inspiration from: https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/ """ import argparse import sys import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry -url = 'https://www.uniprot.org/' + +DEFAULT_TIMEOUT = 5 # seconds +URL = 'https://www.uniprot.org/' + +retry_strategy = Retry( + total=5, + backoff_factor=2, + status_forcelist=[429, 500, 502, 503, 504], + allowed_methods=["HEAD", "GET", "OPTIONS", "POST"] +) -def _retrieve(query, format='txt'): - """_retrieve is not meant for use with the python interface, use `retrieve` - instead""" - tool = 'uploadlists/' - - query = list(set(query.split('\n'))) - queries = [query[i:i+100] for i in range(0, len(query), 100)] +class TimeoutHTTPAdapter(HTTPAdapter): + def __init__(self, *args, **kwargs): + self.timeout = DEFAULT_TIMEOUT + if "timeout" in kwargs: + self.timeout = kwargs["timeout"] + del kwargs["timeout"] + super().__init__(*args, **kwargs) - data = { - 'format': format, - 'from': 'ACC+ID', - 'to': 'ACC' - } - - responses = [requests.post(url + tool, data=data, files={'file': ' '.join(_)}) for _ in queries] - page = ''.join(response.text for response in responses) - return page + def send(self, request, **kwargs): + timeout = kwargs.get("timeout") + if timeout is None: + kwargs["timeout"] = self.timeout + return super().send(request, **kwargs) -def _map(query, f, t, format='tab'): +def _map(query, f, t, format='tab', chunk_size=100): """ _map is not meant for use with the python interface, use `map` instead """ tool = 'uploadlists/' + data = {'format': format, 'from': f, 'to': t} - data = { - 'from': f, - 'to': t, - 'format': format, - 'query': query - } - response = requests.post(url + tool, data=data) + req = [] + for i in range(0, len(query), chunk_size): + q 
= query[i:i + chunk_size] + req.append(dict([("url", URL + tool), + ('data', data), + ("files", {'file': ' '.join(q)})])) + return req + response = requests.post(URL + tool, data=data) + response.raise_for_status() page = response.text + if "The service is temporarily unavailable" in page: + exit("The UNIPROT service is temporarily unavailable. Please try again later.") return page @@ -72,10 +86,23 @@ retrieve.add_argument('-f', '--format', help='specify output format', default='txt') args = parser.parse_args() - query = args.inp.read() + + # get the IDs from the file as sorted list + # (sorted is convenient for testing) + query = set() + for line in args.inp: + query.add(line.strip()) + query = sorted(query) if args.tool == 'map': - args.out.write(_map(query, args.f, args.t, args.format)) + pload = _map(query, args.f, args.t, chunk_size=100) + elif args.tool == 'retrieve': + pload = _map(query, 'ACC+ID', 'ACC', args.format, chunk_size=100) - elif args.tool == 'retrieve': - args.out.write(_retrieve(query, format=args.format)) + adapter = TimeoutHTTPAdapter(max_retries=retry_strategy) + http = requests.Session() + http.mount("https://", adapter) + for i, p in enumerate(pload): + response = http.post(**p) + args.out.write(response.text) + http.close()