Mercurial > repos > galaxyp > uniprotxml_downloader
comparison uniprotxml_downloader.py @ 4:12692567c7f9 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 62afd9de6db50f4314e49d9f24881b6d3778a0a5"
author | galaxyp |
---|---|
date | Tue, 01 Jun 2021 11:54:47 +0000 |
parents | 1a5690a5eedc |
children | 265c35540faa |
Comparison legend:
- equal
- deleted
- inserted
- replaced
3:1a5690a5eedc | 4:12692567c7f9 |
---|---|
9 # | 9 # |
10 # James E Johnson | 10 # James E Johnson |
11 # | 11 # |
12 #------------------------------------------------------------------------------ | 12 #------------------------------------------------------------------------------ |
13 """ | 13 """ |
14 import optparse | |
15 import re | |
14 import sys | 16 import sys |
15 import re | 17 from urllib import parse |
16 import optparse | 18 |
17 import urllib | 19 import requests |
18 import urllib2 | 20 from requests.adapters import HTTPAdapter |
21 from requests.packages.urllib3.util.retry import Retry | |
22 | |
23 DEFAULT_TIMEOUT = 5 # seconds | |
24 retry_strategy = Retry( | |
25 total=5, | |
26 backoff_factor=2, | |
27 status_forcelist=[429, 500, 502, 503, 504], | |
28 allowed_methods=["HEAD", "GET", "OPTIONS", "POST"] | |
29 ) | |
30 | |
31 | |
32 class TimeoutHTTPAdapter(HTTPAdapter): | |
33 def __init__(self, *args, **kwargs): | |
34 self.timeout = DEFAULT_TIMEOUT | |
35 if "timeout" in kwargs: | |
36 self.timeout = kwargs["timeout"] | |
37 del kwargs["timeout"] | |
38 super().__init__(*args, **kwargs) | |
39 | |
40 def send(self, request, **kwargs): | |
41 timeout = kwargs.get("timeout") | |
42 if timeout is None: | |
43 kwargs["timeout"] = self.timeout | |
44 return super().send(request, **kwargs) | |
19 | 45 |
20 | 46 |
21 def __main__(): | 47 def __main__(): |
22 # Parse Command Line | 48 # Parse Command Line |
23 parser = optparse.OptionParser() | 49 parser = optparse.OptionParser() |
24 parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs') | 50 parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs') |
25 parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs' ) | 51 parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs') |
26 parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download') | 52 parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download') |
27 parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') | 53 parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') |
28 parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format') | 54 parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format') |
29 parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml') | 55 parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml') |
30 parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Print UniProt Info') | |
31 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') | 56 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') |
32 (options, args) = parser.parse_args() | 57 (options, args) = parser.parse_args() |
33 taxids = set(options.taxon) | 58 taxids = set(options.taxon) |
34 if options.input: | 59 if options.input: |
35 with open(options.input,'r') as inputFile: | 60 with open(options.input, 'r') as inputFile: |
36 for linenum,line in enumerate(inputFile): | 61 for linenum, line in enumerate(inputFile): |
37 if line.startswith('#'): | 62 if line.startswith('#'): |
38 continue | 63 continue |
39 fields = line.rstrip('\r\n').split('\t') | 64 fields = line.rstrip('\r\n').split('\t') |
40 if len(fields) > abs(options.column): | 65 if len(fields) > abs(options.column): |
41 taxid = fields[options.column].strip() | 66 taxid = fields[options.column].strip() |
42 if taxid: | 67 if taxid: |
43 taxids.add(taxid) | 68 taxids.add(taxid) |
44 taxon_queries = ['taxonomy:"%s"' % taxid for taxid in taxids] | 69 taxon_queries = ['taxonomy:"%s"' % taxid for taxid in taxids] |
45 taxon_query = ' OR '.join(taxon_queries) | 70 taxon_query = ' OR '.join(taxon_queries) |
46 if options.output: | 71 if options.output: |
47 dest_path = options.output | 72 dest_path = options.output |
48 else: | 73 else: |
49 dest_path = "uniprot_%s.xml" % '_'.join(taxids) | 74 dest_path = "uniprot_%s.xml" % '_'.join(taxids) |
50 reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' | 75 reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' |
51 try: | 76 try: |
52 def reporthook(n1,n2,n3): | |
53 pass | |
54 url = 'https://www.uniprot.org/uniprot/' | 77 url = 'https://www.uniprot.org/uniprot/' |
55 query = "%s%s" % (taxon_query, reviewed) | 78 query = "%s%s" % (taxon_query, reviewed) |
56 params = {'query' : query, 'force' : 'yes' , 'format' : options.format} | 79 params = {'query': query, 'force': 'yes', 'format': options.format} |
57 if options.debug: | 80 if options.debug: |
58 print >> sys.stderr, "%s ? %s" % (url,params) | 81 print("%s ? %s" % (url, params), file=sys.stderr) |
59 data = urllib.urlencode(params) | 82 data = parse.urlencode(params) |
60 (fname, msg) = urllib.urlretrieve(url, dest_path,reporthook,data) | 83 print(f"Retrieving: {url+data}") |
61 headers = {j[0]: j[1].strip() for j in [i.split(':', 1) for i in str(msg).strip().splitlines()]} | 84 adapter = TimeoutHTTPAdapter(max_retries=retry_strategy) |
62 if 'Content-Length' in headers and headers['Content-Length'] == 0: | 85 http = requests.Session() |
63 print >> sys.stderr, url | 86 http.mount("https://", adapter) |
64 print >> sys.stderr, msg | 87 response = http.post(url, data=params) |
65 exit(1) | 88 http.close() |
89 with open(dest_path, 'w') as fh: | |
90 fh.write(response.text) | |
66 if options.format == 'xml': | 91 if options.format == 'xml': |
67 with open(dest_path, 'r') as contents: | 92 with open(dest_path, 'r') as contents: |
68 while True: | 93 while True: |
69 line = contents.readline() | 94 line = contents.readline() |
70 if options.debug: | 95 if options.debug: |
71 print >> sys.stderr, line | 96 print(line, file=sys.stderr) |
72 if line is None: | 97 if line is None: |
73 break | 98 break |
74 if line.startswith('<?'): | 99 if line.startswith('<?'): |
75 continue | 100 continue |
76 # pattern match <root or <ns:root for any ns string | 101 # pattern match <root or <ns:root for any ns string |
77 pattern = '^<(\w*:)?uniprot' | 102 pattern = r'^<(\w*:)?uniprot' |
78 if re.match(pattern, line): | 103 if re.match(pattern, line): |
79 break | 104 break |
80 else: | 105 else: |
81 print >> sys.stderr, "failed: Not a uniprot xml file" | 106 print("failed: Not a uniprot xml file", file=sys.stderr) |
82 exit(1) | 107 exit(1) |
83 if options.verbose: | 108 print("NCBI Taxon ID:%s" % taxids, file=sys.stdout) |
84 print >> sys.stdout, "NCBI Taxon ID:%s" % taxids | 109 if 'X-UniProt-Release' in response.headers: |
85 if 'X-UniProt-Release' in headers: | 110 print("UniProt-Release:%s" % response.headers['X-UniProt-Release'], file=sys.stdout) |
86 print >> sys.stdout, "UniProt-Release:%s" % headers['X-UniProt-Release'] | 111 if 'X-Total-Results' in response.headers: |
87 if 'X-Total-Results' in headers: | 112 print("Entries:%s" % response.headers['X-Total-Results'], file=sys.stdout) |
88 print >> sys.stdout, "Entries:%s" % headers['X-Total-Results'] | 113 except Exception as e: |
89 print >> sys.stdout, "%s" % url | 114 exit("%s" % e) |
90 except Exception, e: | |
91 print >> sys.stderr, "failed: %s" % e | |
92 | 115 |
93 | 116 |
94 if __name__ == "__main__": | 117 if __name__ == "__main__": |
95 __main__() | 118 __main__() |