diff uniprot.py @ 7:bfdc6a7ffd3a draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/uniprot_rest_interface commit ddbed5f0b0879b4a001d2da6a521b0c9a39c1e7b"
author bgruening
date Thu, 22 Apr 2021 17:32:16 +0000
parents f7ebd1b4783b
children af5eccf83605
line wrap: on
line diff
--- a/uniprot.py	Sun Sep 16 13:41:08 2018 -0400
+++ b/uniprot.py	Thu Apr 22 17:32:16 2021 +0000
@@ -7,47 +7,61 @@
 available services:
     map
     retrieve
+
+rewritten using inspiration from: https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/
 """
 import argparse
 import sys
 
 import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
 
-url = 'https://www.uniprot.org/'
+
+DEFAULT_TIMEOUT = 5  # seconds
+URL = 'https://www.uniprot.org/'
+
+retry_strategy = Retry(
+    total=5,
+    backoff_factor=2,
+    status_forcelist=[429, 500, 502, 503, 504],
+    allowed_methods=["HEAD", "GET", "OPTIONS", "POST"]
+)
 
 
-def _retrieve(query, format='txt'):
-    """_retrieve is not meant for use with the python interface, use `retrieve`
-    instead"""
-    tool = 'uploadlists/'
-
-    query = list(set(query.split('\n')))
-    queries = [query[i:i+100] for i in range(0, len(query), 100)]
+class TimeoutHTTPAdapter(HTTPAdapter):
+    def __init__(self, *args, **kwargs):
+        self.timeout = DEFAULT_TIMEOUT
+        if "timeout" in kwargs:
+            self.timeout = kwargs["timeout"]
+            del kwargs["timeout"]
+        super().__init__(*args, **kwargs)
 
-    data = {
-            'format': format,
-            'from': 'ACC+ID',
-            'to': 'ACC'
-            }
-
-    responses = [requests.post(url + tool, data=data, files={'file': ' '.join(_)}) for _ in queries]
-    page = ''.join(response.text for response in responses)
-    return page
+    def send(self, request, **kwargs):
+        timeout = kwargs.get("timeout")
+        if timeout is None:
+            kwargs["timeout"] = self.timeout
+        return super().send(request, **kwargs)
 
 
-def _map(query, f, t, format='tab'):
+def _map(query, f, t, format='tab', chunk_size=100):
     """ _map is not meant for use with the python interface, use `map` instead
     """
     tool = 'uploadlists/'
+    data = {'format': format, 'from': f, 'to': t}
 
-    data = {
-            'from': f,
-            'to': t,
-            'format': format,
-            'query': query
-            }
-    response = requests.post(url + tool, data=data)
+    req = []
+    for i in range(0, len(query), chunk_size):
+        q = query[i:i + chunk_size]
+        req.append(dict([("url", URL + tool),
+                         ('data', data),
+                         ("files", {'file': ' '.join(q)})]))
+    return req
+    response = requests.post(URL + tool, data=data)
+    response.raise_for_status()
     page = response.text
+    if "The service is temporarily unavailable" in page:
+        exit("The UNIPROT service is temporarily unavailable. Please try again later.")
     return page
 
 
@@ -72,10 +86,23 @@
     retrieve.add_argument('-f', '--format', help='specify output format', default='txt')
 
     args = parser.parse_args()
-    query = args.inp.read()
+
+    # get the IDs from the file as a sorted list
+    # (sorting is convenient for testing)
+    query = set()
+    for line in args.inp:
+        query.add(line.strip())
+    query = sorted(query)
 
     if args.tool == 'map':
-        args.out.write(_map(query, args.f, args.t, args.format))
+        pload = _map(query, args.f, args.t, chunk_size=100)
+    elif args.tool == 'retrieve':
+        pload = _map(query, 'ACC+ID', 'ACC', args.format, chunk_size=100)
 
-    elif args.tool == 'retrieve':
-        args.out.write(_retrieve(query, format=args.format))
+    adapter = TimeoutHTTPAdapter(max_retries=retry_strategy)
+    http = requests.Session()
+    http.mount("https://", adapter)
+    for i, p in enumerate(pload):
+        response = http.post(**p)
+        args.out.write(response.text)
+    http.close()