comparison uniprot.py @ 7:bfdc6a7ffd3a draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/uniprot_rest_interface commit ddbed5f0b0879b4a001d2da6a521b0c9a39c1e7b"
author bgruening
date Thu, 22 Apr 2021 17:32:16 +0000
parents f7ebd1b4783b
children af5eccf83605
comparison
equal deleted inserted replaced
6:054483e27a35 7:bfdc6a7ffd3a
5 5
6 Based on work from Jan Rudolph: https://github.com/jdrudolph/uniprot 6 Based on work from Jan Rudolph: https://github.com/jdrudolph/uniprot
7 available services: 7 available services:
8 map 8 map
9 retrieve 9 retrieve
10
11 rewritten using inspiration from: https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/
10 """ 12 """
11 import argparse 13 import argparse
12 import sys 14 import sys
13 15
14 import requests 16 import requests
15 17 from requests.adapters import HTTPAdapter
16 url = 'https://www.uniprot.org/' 18 from requests.packages.urllib3.util.retry import Retry
17 19
18 20
19 def _retrieve(query, format='txt'): 21 DEFAULT_TIMEOUT = 5 # seconds
20 """_retrieve is not meant for use with the python interface, use `retrieve` 22 URL = 'https://www.uniprot.org/'
21 instead"""
22 tool = 'uploadlists/'
23 23
24 query = list(set(query.split('\n'))) 24 retry_strategy = Retry(
25 queries = [query[i:i+100] for i in range(0, len(query), 100)] 25 total=5,
26 26 backoff_factor=2,
27 data = { 27 status_forcelist=[429, 500, 502, 503, 504],
28 'format': format, 28 allowed_methods=["HEAD", "GET", "OPTIONS", "POST"]
29 'from': 'ACC+ID', 29 )
30 'to': 'ACC'
31 }
32
33 responses = [requests.post(url + tool, data=data, files={'file': ' '.join(_)}) for _ in queries]
34 page = ''.join(response.text for response in responses)
35 return page
36 30
37 31
38 def _map(query, f, t, format='tab'): 32 class TimeoutHTTPAdapter(HTTPAdapter):
33 def __init__(self, *args, **kwargs):
34 self.timeout = DEFAULT_TIMEOUT
35 if "timeout" in kwargs:
36 self.timeout = kwargs["timeout"]
37 del kwargs["timeout"]
38 super().__init__(*args, **kwargs)
39
40 def send(self, request, **kwargs):
41 timeout = kwargs.get("timeout")
42 if timeout is None:
43 kwargs["timeout"] = self.timeout
44 return super().send(request, **kwargs)
45
46
47 def _map(query, f, t, format='tab', chunk_size=100):
39 """ _map is not meant for use with the python interface, use `map` instead 48 """ _map is not meant for use with the python interface, use `map` instead
40 """ 49 """
41 tool = 'uploadlists/' 50 tool = 'uploadlists/'
51 data = {'format': format, 'from': f, 'to': t}
42 52
43 data = { 53 req = []
44 'from': f, 54 for i in range(0, len(query), chunk_size):
45 'to': t, 55 q = query[i:i + chunk_size]
46 'format': format, 56 req.append(dict([("url", URL + tool),
47 'query': query 57 ('data', data),
48 } 58 ("files", {'file': ' '.join(q)})]))
49 response = requests.post(url + tool, data=data) 59 return req
60 response = requests.post(URL + tool, data=data)
61 response.raise_for_status()
50 page = response.text 62 page = response.text
63 if "The service is temporarily unavailable" in page:
64 exit("The UNIPROT service is temporarily unavailable. Please try again later.")
51 return page 65 return page
52 66
53 67
54 if __name__ == '__main__': 68 if __name__ == '__main__':
55 parser = argparse.ArgumentParser(description='retrieve uniprot mapping') 69 parser = argparse.ArgumentParser(description='retrieve uniprot mapping')
70 retrieve.add_argument('out', nargs='?', type=argparse.FileType('w'), 84 retrieve.add_argument('out', nargs='?', type=argparse.FileType('w'),
71 default=sys.stdout, help='output file (default: stdout)') 85 default=sys.stdout, help='output file (default: stdout)')
72 retrieve.add_argument('-f', '--format', help='specify output format', default='txt') 86 retrieve.add_argument('-f', '--format', help='specify output format', default='txt')
73 87
74 args = parser.parse_args() 88 args = parser.parse_args()
75 query = args.inp.read() 89
90 # get the IDs from the file as sorted list
91 # (sorted is convenient for testing)
92 query = set()
93 for line in args.inp:
94 query.add(line.strip())
95 query = sorted(query)
76 96
77 if args.tool == 'map': 97 if args.tool == 'map':
78 args.out.write(_map(query, args.f, args.t, args.format)) 98 pload = _map(query, args.f, args.t, chunk_size=100)
99 elif args.tool == 'retrieve':
100 pload = _map(query, 'ACC+ID', 'ACC', args.format, chunk_size=100)
79 101
80 elif args.tool == 'retrieve': 102 adapter = TimeoutHTTPAdapter(max_retries=retry_strategy)
81 args.out.write(_retrieve(query, format=args.format)) 103 http = requests.Session()
104 http.mount("https://", adapter)
105 for i, p in enumerate(pload):
106 response = http.post(**p)
107 args.out.write(response.text)
108 http.close()