Mercurial > repos > galaxyp > uniprotxml_downloader
changeset 4:12692567c7f9 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 62afd9de6db50f4314e49d9f24881b6d3778a0a5"
author | galaxyp |
---|---|
date | Tue, 01 Jun 2021 11:54:47 +0000 |
parents | 1a5690a5eedc |
children | 265c35540faa |
files | test-data/Helicobacter_strains_ids.tsv uniprotxml_downloader.py uniprotxml_downloader.xml |
diffstat | 3 files changed, 87 insertions(+), 50 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/Helicobacter_strains_ids.tsv Tue Jun 01 11:54:47 2021 +0000 @@ -0,0 +1,2 @@ +blah 765963 +fasel 512562
--- a/uniprotxml_downloader.py Tue Jul 02 21:46:14 2019 -0400 +++ b/uniprotxml_downloader.py Tue Jun 01 11:54:47 2021 +0000 @@ -11,36 +11,61 @@ # #------------------------------------------------------------------------------ """ -import sys +import optparse import re -import optparse -import urllib -import urllib2 +import sys +from urllib import parse + +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry + +DEFAULT_TIMEOUT = 5 # seconds +retry_strategy = Retry( + total=5, + backoff_factor=2, + status_forcelist=[429, 500, 502, 503, 504], + allowed_methods=["HEAD", "GET", "OPTIONS", "POST"] +) + + +class TimeoutHTTPAdapter(HTTPAdapter): + def __init__(self, *args, **kwargs): + self.timeout = DEFAULT_TIMEOUT + if "timeout" in kwargs: + self.timeout = kwargs["timeout"] + del kwargs["timeout"] + super().__init__(*args, **kwargs) + + def send(self, request, **kwargs): + timeout = kwargs.get("timeout") + if timeout is None: + kwargs["timeout"] = self.timeout + return super().send(request, **kwargs) def __main__(): # Parse Command Line parser = optparse.OptionParser() parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs') - parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs' ) + parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs') parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download') parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format') parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml') - parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Print UniProt Info') parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') (options, args) = parser.parse_args() taxids = set(options.taxon) if options.input: - with open(options.input,'r') as inputFile: - for linenum,line in enumerate(inputFile): + with open(options.input, 'r') as inputFile: + for linenum, line in enumerate(inputFile): if line.startswith('#'): continue fields = line.rstrip('\r\n').split('\t') if len(fields) > abs(options.column): taxid = fields[options.column].strip() if taxid: - taxids.add(taxid) + taxids.add(taxid) taxon_queries = ['taxonomy:"%s"' % taxid for taxid in taxids] taxon_query = ' OR '.join(taxon_queries) if options.output: @@ -49,46 +74,44 @@ dest_path = "uniprot_%s.xml" % '_'.join(taxids) reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' try: - def reporthook(n1,n2,n3): - pass url = 'https://www.uniprot.org/uniprot/' query = "%s%s" % (taxon_query, reviewed) - params = {'query' : query, 'force' : 'yes' , 'format' : options.format} + params = {'query': query, 'force': 'yes', 'format': options.format} if options.debug: - print >> sys.stderr, "%s ? %s" % (url,params) - data = urllib.urlencode(params) - (fname, msg) = urllib.urlretrieve(url, dest_path,reporthook,data) - headers = {j[0]: j[1].strip() for j in [i.split(':', 1) for i in str(msg).strip().splitlines()]} - if 'Content-Length' in headers and headers['Content-Length'] == 0: - print >> sys.stderr, url - print >> sys.stderr, msg - exit(1) + print("%s ? %s" % (url, params), file=sys.stderr) + data = parse.urlencode(params) + print(f"Retrieving: {url+data}") + adapter = TimeoutHTTPAdapter(max_retries=retry_strategy) + http = requests.Session() + http.mount("https://", adapter) + response = http.post(url, data=params) + http.close() + with open(dest_path, 'w') as fh: + fh.write(response.text) if options.format == 'xml': with open(dest_path, 'r') as contents: while True: line = contents.readline() if options.debug: - print >> sys.stderr, line + print(line, file=sys.stderr) if line is None: break if line.startswith('<?'): continue # pattern match <root or <ns:root for any ns string - pattern = '^<(\w*:)?uniprot' + pattern = r'^<(\w*:)?uniprot' if re.match(pattern, line): break else: - print >> sys.stderr, "failed: Not a uniprot xml file" + print("failed: Not a uniprot xml file", file=sys.stderr) exit(1) - if options.verbose: - print >> sys.stdout, "NCBI Taxon ID:%s" % taxids - if 'X-UniProt-Release' in headers: - print >> sys.stdout, "UniProt-Release:%s" % headers['X-UniProt-Release'] - if 'X-Total-Results' in headers: - print >> sys.stdout, "Entries:%s" % headers['X-Total-Results'] - print >> sys.stdout, "%s" % url - except Exception, e: - print >> sys.stderr, "failed: %s" % e + print("NCBI Taxon ID:%s" % taxids, file=sys.stdout) + if 'X-UniProt-Release' in response.headers: + print("UniProt-Release:%s" % response.headers['X-UniProt-Release'], file=sys.stdout) + if 'X-Total-Results' in response.headers: + print("Entries:%s" % response.headers['X-Total-Results'], file=sys.stdout) + except Exception as e: + exit("%s" % e) if __name__ == "__main__":
--- a/uniprotxml_downloader.xml Tue Jul 02 21:46:14 2019 -0400 +++ b/uniprotxml_downloader.xml Tue Jun 01 11:54:47 2021 +0000 @@ -1,37 +1,37 @@ -<tool id="uniprotxml_downloader" name="UniProt" version="2.1.0"> +<tool id="uniprotxml_downloader" name="UniProt" version="2.2.0" profile="21.01"> <description>download proteome as XML or fasta</description> <requirements> - <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="2.25.1">requests</requirement> </requirements> <stdio> <exit_code range="1:" level="fatal" description="Error downloading proteome." /> </stdio> <command> <![CDATA[ -python $__tool_directory__/uniprotxml_downloader.py -v +python '$__tool_directory__/uniprotxml_downloader.py' #if $taxid.input_choice == 'common': ---taxon $taxid.organism -#if $taxid.reviewed: ---reviewed=$taxid.reviewed -#end if + --taxon $taxid.organism + #if $taxid.reviewed: + --reviewed=$taxid.reviewed + #end if #elif $taxid.input_choice == 'taxids': -#for $id in $taxid.taxons.split(','): --t "$id" -#end for + #for $id in $taxid.taxons.split(','): + -t '$id' + #end for #elif $taxid.input_choice == 'history': ---input="${taxid.taxon_file}" ---column=#echo int(str($taxid.column)) - 1# + --input='${taxid.taxon_file}' + --column=#echo int(str($taxid.column)) - 1# #end if ---format=$format ---output="${proteome}" +--format $format +--output '${proteome}' ]]> </command> <inputs> <conditional name="taxid"> <param name="input_choice" type="select" label="Select"> <option value="common">A Common Organism</option> - <option value="taxids">By Organism IDs</option> - <option value="history">A history dataset with an Organism Taxonomy Name column</option> + <option value="taxids">A manually entered list of Taxon IDs or names</option> + <option value="history">A history dataset with a column containing Taxon IDs or names</option> </param> <when value="common"> <param name="organism" type="select" label="Common Organisms" @@ -60,8 +60,8 @@ </param> </when> <when value="history"> - <param name="taxon_file" type="data" format="tabular" label="Dataset with Taxon Name column"/> - <param name="column" type="data_column" data_ref="taxon_file" label="Column with Taxon name"/> + <param name="taxon_file" type="data" format="tabular,txt" label="Dataset (tab separated) with Taxon ID/Name column"/> + <param name="column" type="data_column" data_ref="taxon_file" label="Column with Taxon ID/name"/> </when> </conditional> <param name="format" type="select" label="uniprot output format"> @@ -110,6 +110,18 @@ </assert_contents> </output> </test> + <test> + <param name="input_choice" value="history"/> + <param name="taxon_file" value="Helicobacter_strains_ids.tsv" ftype="tabular"/> + <param name="column" value="2"/> + <param name="format" value="fasta"/> + <output name="proteome"> + <assert_contents> + <has_text text="Shi470" /> + <has_text text="PeCan4" /> + </assert_contents> + </output> + </test> </tests> <help> <![CDATA[