comparison uniprotxml_downloader.py @ 7:4ddc8da62671 draft default tip

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 91705a9789b30878a55d1044c654e39a7726cf60
author galaxyp
date Wed, 11 Dec 2024 13:34:54 +0000
parents a371252a2cf6
children
comparison
equal deleted inserted replaced
6:a371252a2cf6 7:4ddc8da62671
15 import re 15 import re
16 import sys 16 import sys
17 from urllib import parse 17 from urllib import parse
18 18
19 import requests 19 import requests
20 from requests.adapters import HTTPAdapter 20 from requests.adapters import HTTPAdapter, Retry
21 from requests.packages.urllib3.util.retry import Retry
22
# Module-wide HTTP configuration for talking to the UniProt REST service.
DEFAULT_TIMEOUT = 5  # seconds

# Retry transient failures — rate limiting (429) and server-side 5xx errors —
# with exponential backoff, for every method this tool issues.
retry_strategy = Retry(
    allowed_methods=["HEAD", "GET", "OPTIONS", "POST"],
    status_forcelist=[429, 500, 502, 503, 504],
    backoff_factor=2,
    total=5,
)
30
31
class TimeoutHTTPAdapter(HTTPAdapter):
    """HTTPAdapter that applies a default timeout to every request.

    Requests sent without an explicit ``timeout`` fall back to the
    adapter-wide value: ``DEFAULT_TIMEOUT`` unless a ``timeout`` keyword
    was supplied at construction time.
    """

    def __init__(self, *args, **kwargs):
        # Pop our own keyword before delegating: the base HTTPAdapter
        # constructor does not accept ``timeout``.
        self.timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        # Fill in the default only when the caller gave no usable timeout
        # (key absent, or present with an explicit ``None``).
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)
45 21
46 22
47 def __main__(): 23 def __main__():
48 # Parse Command Line 24 # Parse Command Line
49 parser = optparse.OptionParser() 25 parser = optparse.OptionParser()
50 parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of search search_ids') 26 parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of search search_ids')
51 parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains search search_ids') 27 parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains search search_ids')
52 parser.add_option('-s', '--search-id', dest='search_id', action='append', default=[], help='ID to search in Uniprot') 28 parser.add_option('-s', '--search-id', dest='search_id', action='append', default=[], help='ID to search in Uniprot')
53 parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') 29 parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries')
54 parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format') 30 parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta', 'tsv'], default='xml', help='output format')
55 parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id', 'accession'], default='taxonomy_name', help='query field') 31 parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id', 'accession'], default='taxonomy_name', help='query field')
56 parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml') 32 parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml')
33 parser.add_option('--output_columns', dest='output_columns', help='Columns to include in output (tsv)')
57 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') 34 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr')
58 (options, args) = parser.parse_args() 35 (options, args) = parser.parse_args()
59 search_ids = set(options.search_id) 36 search_ids = set(options.search_id)
60 if options.input: 37 if options.input:
61 with open(options.input, 'r') as inputFile: 38 with open(options.input, 'r') as inputFile:
73 dest_path = options.output 50 dest_path = options.output
74 else: 51 else:
75 dest_path = "uniprot_%s.xml" % '_'.join(search_ids) 52 dest_path = "uniprot_%s.xml" % '_'.join(search_ids)
76 reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' 53 reviewed = " reviewed:%s" % options.reviewed if options.reviewed else ''
77 try: 54 try:
78 url = 'https://rest.uniprot.org/uniprotkb/stream' 55 re_next_link = re.compile(r'<(.+)>; rel="next"')
79 query = "%s%s" % (search_query, reviewed) 56 retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
80 params = {'query': query, 'format': options.format} 57 session = requests.Session()
81 if options.debug: 58 session.mount("https://", HTTPAdapter(max_retries=retries))
82 print("%s ? %s" % (url, params), file=sys.stderr)
83 data = parse.urlencode(params)
84 print(f"Retrieving: {url}?{data}")
85 adapter = TimeoutHTTPAdapter(max_retries=retry_strategy)
86 59
87 http = requests.Session() 60 def get_next_link(headers):
88 http.mount("https://", adapter) 61 if "Link" in headers:
89 response = http.get(url, params=params) 62 match = re_next_link.match(headers["Link"])
90 http.close() 63 if match:
64 return match.group(1)
91 65
92 if response.status_code != 200: 66 def get_batch(batch_url):
93 exit(f"Request failed with status code {response.status_code}:\n{response.text}") 67 while batch_url:
68 response = session.get(batch_url)
69 response.raise_for_status()
70 total = response.headers["x-total-results"]
71 release = response.headers["x-uniprot-release"]
72 yield response, total, release
73 batch_url = get_next_link(response.headers)
74
75 params = {'size': 500, 'format': options.format, 'query': search_query + reviewed}
76 if options.output_columns:
77 params['fields'] = options.output_columns
78 url = f'https://rest.uniprot.org/uniprotkb/search?{parse.urlencode(params)}'
79 print(f"Downloading from:{url}")
94 80
95 with open(dest_path, 'w') as fh: 81 with open(dest_path, 'w') as fh:
96 fh.write(response.text) 82 for batch, total, release in get_batch(url):
83 fh.write(batch.text)
97 84
98 if options.format == 'xml': 85 if options.format == 'xml':
99 with open(dest_path, 'r') as contents: 86 with open(dest_path, 'r') as contents:
100 while True: 87 while True:
101 line = contents.readline() 88 line = contents.readline()
110 if re.match(pattern, line): 97 if re.match(pattern, line):
111 break 98 break
112 else: 99 else:
113 print("failed: Not a uniprot xml file", file=sys.stderr) 100 print("failed: Not a uniprot xml file", file=sys.stderr)
114 exit(1) 101 exit(1)
115 print("Search IDs:%s" % search_ids, file=sys.stdout) 102 print(f"Search IDs:{search_ids}")
116 if 'X-UniProt-Release' in response.headers: 103 print(f"UniProt-Release:{release}")
117 print("UniProt-Release:%s" % response.headers['X-UniProt-Release'], file=sys.stdout) 104 print(f"Entries:{total}")
118 if 'X-Total-Results' in response.headers:
119 print("Entries:%s" % response.headers['X-Total-Results'], file=sys.stdout)
120 except Exception as e: 105 except Exception as e:
121 exit("%s" % e) 106 exit("%s" % e)
122 107
123 108
124 if __name__ == "__main__": 109 if __name__ == "__main__":