changeset 4:12692567c7f9 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 62afd9de6db50f4314e49d9f24881b6d3778a0a5"
author galaxyp
date Tue, 01 Jun 2021 11:54:47 +0000
parents 1a5690a5eedc
children 265c35540faa
files test-data/Helicobacter_strains_ids.tsv uniprotxml_downloader.py uniprotxml_downloader.xml
diffstat 3 files changed, 87 insertions(+), 50 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Helicobacter_strains_ids.tsv	Tue Jun 01 11:54:47 2021 +0000
@@ -0,0 +1,2 @@
+blah	765963
+fasel	512562
--- a/uniprotxml_downloader.py	Tue Jul 02 21:46:14 2019 -0400
+++ b/uniprotxml_downloader.py	Tue Jun 01 11:54:47 2021 +0000
@@ -11,36 +11,61 @@
 #
 #------------------------------------------------------------------------------
 """
-import sys
+import optparse
 import re
-import optparse
-import urllib
-import urllib2
+import sys
+from urllib import parse
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+
+DEFAULT_TIMEOUT = 5  # seconds
+retry_strategy = Retry(
+    total=5,
+    backoff_factor=2,
+    status_forcelist=[429, 500, 502, 503, 504],
+    allowed_methods=["HEAD", "GET", "OPTIONS", "POST"]
+)
+
+
+class TimeoutHTTPAdapter(HTTPAdapter):
+    def __init__(self, *args, **kwargs):
+        self.timeout = DEFAULT_TIMEOUT
+        if "timeout" in kwargs:
+            self.timeout = kwargs["timeout"]
+            del kwargs["timeout"]
+        super().__init__(*args, **kwargs)
+
+    def send(self, request, **kwargs):
+        timeout = kwargs.get("timeout")
+        if timeout is None:
+            kwargs["timeout"] = self.timeout
+        return super().send(request, **kwargs)
 
 
 def __main__():
     # Parse Command Line
     parser = optparse.OptionParser()
     parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs')
-    parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs' )
+    parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs')
     parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download')
     parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries')
     parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format')
     parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml')
-    parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Print UniProt Info')
     parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr')
     (options, args) = parser.parse_args()
     taxids = set(options.taxon)
     if options.input:
-        with open(options.input,'r') as inputFile:
-            for linenum,line in enumerate(inputFile):
+        with open(options.input, 'r') as inputFile:
+            for linenum, line in enumerate(inputFile):
                 if line.startswith('#'):
                     continue
                 fields = line.rstrip('\r\n').split('\t')
                 if len(fields) > abs(options.column):
                     taxid = fields[options.column].strip()
                     if taxid:
-                      taxids.add(taxid)
+                        taxids.add(taxid)
     taxon_queries = ['taxonomy:"%s"' % taxid for taxid in taxids]
     taxon_query = ' OR '.join(taxon_queries)
     if options.output:
@@ -49,46 +74,44 @@
         dest_path = "uniprot_%s.xml" % '_'.join(taxids)
     reviewed = " reviewed:%s" % options.reviewed if options.reviewed else ''
     try:
-        def reporthook(n1,n2,n3):
-            pass   
         url = 'https://www.uniprot.org/uniprot/'
         query = "%s%s" % (taxon_query, reviewed)
-        params = {'query' : query, 'force' : 'yes' , 'format' : options.format}
+        params = {'query': query, 'force': 'yes', 'format': options.format}
         if options.debug:
-            print >> sys.stderr, "%s ? %s" % (url,params)
-        data = urllib.urlencode(params)
-        (fname, msg) = urllib.urlretrieve(url, dest_path,reporthook,data)
-        headers = {j[0]: j[1].strip() for j in [i.split(':', 1) for i in str(msg).strip().splitlines()]}
-        if 'Content-Length' in headers and headers['Content-Length'] == 0:
-            print >> sys.stderr, url
-            print >> sys.stderr, msg
-            exit(1)
+            print("%s ? %s" % (url, params), file=sys.stderr)
+        data = parse.urlencode(params)
+        print(f"Retrieving: {url+data}")
+        adapter = TimeoutHTTPAdapter(max_retries=retry_strategy)
+        http = requests.Session()
+        http.mount("https://", adapter)
+        response = http.post(url, data=params)
+        http.close()
+        with open(dest_path, 'w') as fh:
+            fh.write(response.text)
         if options.format == 'xml':
             with open(dest_path, 'r') as contents:
                 while True:
                     line = contents.readline()
                     if options.debug:
-                        print >> sys.stderr, line
+                        print(line, file=sys.stderr)
                     if line is None:
                         break
                     if line.startswith('<?'):
                         continue
                     # pattern match <root or <ns:root for any ns string
-                    pattern = '^<(\w*:)?uniprot'
+                    pattern = r'^<(\w*:)?uniprot'
                     if re.match(pattern, line):
                         break
                     else:
-                        print >> sys.stderr, "failed: Not a uniprot xml file"
+                        print("failed: Not a uniprot xml file", file=sys.stderr)
                         exit(1)
-        if options.verbose:
-            print >> sys.stdout, "NCBI Taxon ID:%s" % taxids
-            if 'X-UniProt-Release' in headers:
-                print >> sys.stdout, "UniProt-Release:%s" % headers['X-UniProt-Release']
-            if 'X-Total-Results' in headers:
-                print >> sys.stdout, "Entries:%s" % headers['X-Total-Results']
-            print >> sys.stdout, "%s" % url
-    except Exception, e:
-        print >> sys.stderr, "failed: %s" % e
+        print("NCBI Taxon ID:%s" % taxids, file=sys.stdout)
+        if 'X-UniProt-Release' in response.headers:
+            print("UniProt-Release:%s" % response.headers['X-UniProt-Release'], file=sys.stdout)
+        if 'X-Total-Results' in response.headers:
+            print("Entries:%s" % response.headers['X-Total-Results'], file=sys.stdout)
+    except Exception as e:
+        exit("%s" % e)
 
 
 if __name__ == "__main__":
--- a/uniprotxml_downloader.xml	Tue Jul 02 21:46:14 2019 -0400
+++ b/uniprotxml_downloader.xml	Tue Jun 01 11:54:47 2021 +0000
@@ -1,37 +1,37 @@
-<tool id="uniprotxml_downloader" name="UniProt" version="2.1.0">
+<tool id="uniprotxml_downloader" name="UniProt" version="2.2.0" profile="21.01">
     <description>download proteome as XML or fasta</description>
     <requirements>
-        <requirement type="package" version="2.7">python</requirement>
+        <requirement type="package" version="2.25.1">requests</requirement>
     </requirements>
     <stdio>
         <exit_code range="1:"  level="fatal" description="Error downloading proteome." />
     </stdio>
     <command>
 <![CDATA[
-python $__tool_directory__/uniprotxml_downloader.py -v 
+python '$__tool_directory__/uniprotxml_downloader.py'
 #if $taxid.input_choice == 'common':
---taxon $taxid.organism
-#if $taxid.reviewed:
---reviewed=$taxid.reviewed
-#end if
+    --taxon $taxid.organism
+    #if $taxid.reviewed:
+        --reviewed=$taxid.reviewed
+    #end if
 #elif $taxid.input_choice == 'taxids':
-#for $id in $taxid.taxons.split(','):
--t "$id"
-#end for
+    #for $id in $taxid.taxons.split(','):
+        -t '$id'
+    #end for
 #elif $taxid.input_choice == 'history':
---input="${taxid.taxon_file}"
---column=#echo int(str($taxid.column)) - 1#
+    --input='${taxid.taxon_file}'
+    --column=#echo int(str($taxid.column)) - 1#
 #end if
---format=$format
---output="${proteome}"
+--format $format
+--output '${proteome}'
 ]]>
     </command>
     <inputs>
         <conditional name="taxid">
             <param name="input_choice" type="select" label="Select">
                 <option value="common">A Common Organism</option>
-                <option value="taxids">By Organism IDs</option>
-                <option value="history">A history dataset with an Organism Taxonomy Name column</option>
+                <option value="taxids">A manually entered list of Taxon IDs or names</option>
+                <option value="history">A history dataset with a column containing Taxon IDs or names</option>
             </param>
             <when value="common">
                 <param name="organism" type="select" label="Common Organisms"
@@ -60,8 +60,8 @@
                 </param>
             </when>
             <when value="history">
-                <param name="taxon_file" type="data" format="tabular" label="Dataset with Taxon Name column"/>
-                <param name="column" type="data_column" data_ref="taxon_file" label="Column with Taxon name"/>
+                <param name="taxon_file" type="data" format="tabular,txt" label="Dataset (tab separated) with Taxon ID/Name column"/>
+                <param name="column" type="data_column" data_ref="taxon_file" label="Column with Taxon ID/name"/>
             </when>
         </conditional>
         <param name="format" type="select" label="uniprot output format">
@@ -110,6 +110,18 @@
                 </assert_contents>
             </output>
         </test>
+        <test>
+            <param name="input_choice" value="history"/>
+            <param name="taxon_file" value="Helicobacter_strains_ids.tsv" ftype="tabular"/>
+            <param name="column" value="2"/>
+            <param name="format" value="fasta"/>
+            <output name="proteome">
+                <assert_contents>
+                    <has_text text="Shi470" />
+                    <has_text text="PeCan4" />
+                </assert_contents>
+            </output>
+        </test>
     </tests>
     <help>
 <![CDATA[