Mercurial > repos > galaxyp > uniprotxml_downloader
comparison uniprotxml_downloader.py @ 7:4ddc8da62671 draft default tip
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 91705a9789b30878a55d1044c654e39a7726cf60
author | galaxyp |
---|---|
date | Wed, 11 Dec 2024 13:34:54 +0000 |
parents | a371252a2cf6 |
children |
comparison
equal
deleted
inserted
replaced
6:a371252a2cf6 | 7:4ddc8da62671 |
---|---|
15 import re | 15 import re |
16 import sys | 16 import sys |
17 from urllib import parse | 17 from urllib import parse |
18 | 18 |
19 import requests | 19 import requests |
20 from requests.adapters import HTTPAdapter | 20 from requests.adapters import HTTPAdapter, Retry |
21 from requests.packages.urllib3.util.retry import Retry | |
22 | |
23 DEFAULT_TIMEOUT = 5 # seconds | |
24 retry_strategy = Retry( | |
25 total=5, | |
26 backoff_factor=2, | |
27 status_forcelist=[429, 500, 502, 503, 504], | |
28 allowed_methods=["HEAD", "GET", "OPTIONS", "POST"] | |
29 ) | |
30 | |
31 | |
32 class TimeoutHTTPAdapter(HTTPAdapter): | |
33 def __init__(self, *args, **kwargs): | |
34 self.timeout = DEFAULT_TIMEOUT | |
35 if "timeout" in kwargs: | |
36 self.timeout = kwargs["timeout"] | |
37 del kwargs["timeout"] | |
38 super().__init__(*args, **kwargs) | |
39 | |
40 def send(self, request, **kwargs): | |
41 timeout = kwargs.get("timeout") | |
42 if timeout is None: | |
43 kwargs["timeout"] = self.timeout | |
44 return super().send(request, **kwargs) | |
45 | 21 |
46 | 22 |
47 def __main__(): | 23 def __main__(): |
48 # Parse Command Line | 24 # Parse Command Line |
49 parser = optparse.OptionParser() | 25 parser = optparse.OptionParser() |
50 parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of search search_ids') | 26 parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of search search_ids') |
51 parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains search search_ids') | 27 parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains search search_ids') |
52 parser.add_option('-s', '--search-id', dest='search_id', action='append', default=[], help='ID to search in Uniprot') | 28 parser.add_option('-s', '--search-id', dest='search_id', action='append', default=[], help='ID to search in Uniprot') |
53 parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') | 29 parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') |
54 parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format') | 30 parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta', 'tsv'], default='xml', help='output format') |
55 parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id', 'accession'], default='taxonomy_name', help='query field') | 31 parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id', 'accession'], default='taxonomy_name', help='query field') |
56 parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml') | 32 parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml') |
33 parser.add_option('--output_columns', dest='output_columns', help='Columns to include in output (tsv)') | |
57 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') | 34 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') |
58 (options, args) = parser.parse_args() | 35 (options, args) = parser.parse_args() |
59 search_ids = set(options.search_id) | 36 search_ids = set(options.search_id) |
60 if options.input: | 37 if options.input: |
61 with open(options.input, 'r') as inputFile: | 38 with open(options.input, 'r') as inputFile: |
73 dest_path = options.output | 50 dest_path = options.output |
74 else: | 51 else: |
75 dest_path = "uniprot_%s.xml" % '_'.join(search_ids) | 52 dest_path = "uniprot_%s.xml" % '_'.join(search_ids) |
76 reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' | 53 reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' |
77 try: | 54 try: |
78 url = 'https://rest.uniprot.org/uniprotkb/stream' | 55 re_next_link = re.compile(r'<(.+)>; rel="next"') |
79 query = "%s%s" % (search_query, reviewed) | 56 retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504]) |
80 params = {'query': query, 'format': options.format} | 57 session = requests.Session() |
81 if options.debug: | 58 session.mount("https://", HTTPAdapter(max_retries=retries)) |
82 print("%s ? %s" % (url, params), file=sys.stderr) | |
83 data = parse.urlencode(params) | |
84 print(f"Retrieving: {url}?{data}") | |
85 adapter = TimeoutHTTPAdapter(max_retries=retry_strategy) | |
86 | 59 |
87 http = requests.Session() | 60 def get_next_link(headers): |
88 http.mount("https://", adapter) | 61 if "Link" in headers: |
89 response = http.get(url, params=params) | 62 match = re_next_link.match(headers["Link"]) |
90 http.close() | 63 if match: |
64 return match.group(1) | |
91 | 65 |
92 if response.status_code != 200: | 66 def get_batch(batch_url): |
93 exit(f"Request failed with status code {response.status_code}:\n{response.text}") | 67 while batch_url: |
68 response = session.get(batch_url) | |
69 response.raise_for_status() | |
70 total = response.headers["x-total-results"] | |
71 release = response.headers["x-uniprot-release"] | |
72 yield response, total, release | |
73 batch_url = get_next_link(response.headers) | |
74 | |
75 params = {'size': 500, 'format': options.format, 'query': search_query + reviewed} | |
76 if options.output_columns: | |
77 params['fields'] = options.output_columns | |
78 url = f'https://rest.uniprot.org/uniprotkb/search?{parse.urlencode(params)}' | |
79 print(f"Downloading from:{url}") | |
94 | 80 |
95 with open(dest_path, 'w') as fh: | 81 with open(dest_path, 'w') as fh: |
96 fh.write(response.text) | 82 for batch, total, release in get_batch(url): |
83 fh.write(batch.text) | |
97 | 84 |
98 if options.format == 'xml': | 85 if options.format == 'xml': |
99 with open(dest_path, 'r') as contents: | 86 with open(dest_path, 'r') as contents: |
100 while True: | 87 while True: |
101 line = contents.readline() | 88 line = contents.readline() |
110 if re.match(pattern, line): | 97 if re.match(pattern, line): |
111 break | 98 break |
112 else: | 99 else: |
113 print("failed: Not a uniprot xml file", file=sys.stderr) | 100 print("failed: Not a uniprot xml file", file=sys.stderr) |
114 exit(1) | 101 exit(1) |
115 print("Search IDs:%s" % search_ids, file=sys.stdout) | 102 print(f"Search IDs:{search_ids}") |
116 if 'X-UniProt-Release' in response.headers: | 103 print(f"UniProt-Release:{release}") |
117 print("UniProt-Release:%s" % response.headers['X-UniProt-Release'], file=sys.stdout) | 104 print(f"Entries:{total}") |
118 if 'X-Total-Results' in response.headers: | |
119 print("Entries:%s" % response.headers['X-Total-Results'], file=sys.stdout) | |
120 except Exception as e: | 105 except Exception as e: |
121 exit("%s" % e) | 106 exit("%s" % e) |
122 | 107 |
123 | 108 |
124 if __name__ == "__main__": | 109 if __name__ == "__main__": |