Mercurial > repos > galaxyp > uniprotxml_downloader

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/uniprot_taxons.loc.sample	Tue Mar 08 12:03:49 2016 -0500
@@ -0,0 +1,14 @@
+Homo sapiens (Human)	9606
+Rattus norvegicus (Rat)	10116
+Mus musculus (Mouse)	10090
+Bos taurus (Bovine)	9913
+Sus scrofa (Pig)	9823
+Escherichia coli (strain K12)	83333
+Arabidopsis thaliana (Mouse-ear cress)	3702
+Caenorhabditis elegans (C.elegans)	6239
+Drosophila melanogaster (Fruit fly)	7227
+Saccharomyces cerevisiae (Baker's Yeast)	4932
+Canis familiaris (Dog)	9615
+Halobacterium salinarum (strain ATCC 700922 / JCM 11081 / NRC-1)	64091
+Apis mellifera (Honeybee)	7460
+Mycobacterium tuberculosis H37Rv (MTB) [Not complete proteome]	83332
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/uniprotxml_downloader.py	Tue Mar 08 12:03:49 2016 -0500
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+"""
+#
+#------------------------------------------------------------------------------
+#                         University of Minnesota
+#         Copyright 2016, Regents of the University of Minnesota
+#------------------------------------------------------------------------------
+# Author:
+#
+#  James E Johnson
+#
+#------------------------------------------------------------------------------
+"""
+import sys
+import re
+import optparse
+import urllib
+
+
+def __main__():
+    # Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download')
+    parser.add_option('-r', '--reviewed', dest='reviewed', help='file path for th downloaed uniprot xml')
+    parser.add_option('-o', '--output', dest='output', help='file path for th downloaed uniprot xml')
+    parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Print UniProt Info')
+    parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr')
+    (options, args) = parser.parse_args()
+
+    taxids = options.taxon if options.taxon else ['9606']
+    taxon_queries = ['taxonomy:"%s"' % taxid for taxid in taxids]
+    taxon_query = ' OR '.join(taxon_queries)
+    if options.output:
+        dest_path = options.output
+    else:
+        dest_path = "uniprot_%s.xml" % '_'.join(taxids)
+    reviewed = " reviewed:%s" % options.reviewed if options.reviewed else ''
+    url = 'http://www.uniprot.org/uniprot/?query=%s%s&force=yes&format=xml' % (taxon_query, reviewed)
+    if options.debug:
+        print >> sys.stderr, url
+    try:
+        (fname, msg) = urllib.urlretrieve(url, dest_path)
+        headers = {j[0]: j[1].strip() for j in [i.split(':', 1) for i in str(msg).strip().splitlines()]}
+        if 'Content-Length' in headers and headers['Content-Length'] == 0:
+            print >> sys.stderr, url
+            print >> sys.stderr, msg
+            exit(1)
+        elif True:
+            pass
+        else:
+            with open(dest_path, 'r') as contents:
+                while True:
+                    line = contents.readline()
+                    if options.debug:
+                        print >> sys.stderr, line
+                    if line is None or not line.startswith('<?'):
+                        break
+                    # pattern match <root or <ns:root for any ns string
+                    pattern = '^<(\w*:)?uniprot'
+                    if re.match(pattern, line):
+                        break
+                    else:
+                        print >> sys.stderr, "failed: Not a uniprot xml file"
+                        exit(1)
+
+        if options.verbose:
+            print >> sys.stdout, "NCBI Taxon ID:%s" % taxids
+            if 'X-UniProt-Release' in headers:
+                print >> sys.stdout, "UniProt-Release:%s" % headers['X-UniProt-Release']
+            if 'X-Total-Results' in headers:
+                print >> sys.stdout, "Entries:%s" % headers['X-Total-Results']
+            print >> sys.stdout, "%s" % url
+    except Exception, e:
+        print >> sys.stderr, "failed: %s" % e
+
+
+if __name__ == "__main__":
+    __main__()  # script entry point: run the downloader only when executed directly
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/uniprotxml_downloader.xml	Tue Mar 08 12:03:49 2016 -0500
@@ -0,0 +1,90 @@
+<tool id="uniprotxml_downloader" name="UniProtXML Download" version="1.0.0">
+    <!-- Galaxy wrapper that builds the uniprotxml_downloader.py command line from a listed organism or typed NCBI taxon IDs -->
+    <description>proteome</description>
+    <requirements>
+        <requirement type="binary">wget</requirement> <!-- NOTE(review): the script downloads with Python urllib; wget looks unused, confirm -->
+    </requirements>
+    <stdio>
+        <exit_code range="1:"  level="fatal" description="Error downloading proteome." />
+    </stdio>
+    <command interpreter="python">
+<![CDATA[
+uniprotxml_downloader.py -v
+#if $taxid.choice == 'common':
+--taxon $taxid.organism
+#if $taxid.reviewed:
+--reviewed=$taxid.reviewed
+#end if
+#else:
+#for id in $taxid.taxons.split(','):
+-t $id
+#end for
+#end if
+--output="${proteome}"
+]]>
+    </command>
+    <inputs>
+        <conditional name="taxid">
+            <param name="choice" type="select" label="Select">
+                <option value="common">A Common Organism</option>
+                <option value="taxids">By Organism IDs</option>
+            </param>
+            <when value="common">
+                <param name="organism" type="select" label="Common Organisms" help="select species for protein database">
+                    <options from_file="uniprot_taxons.loc">
+                        <column name="name" index="0" />
+                        <column name="value" index="1" />
+                    </options>
+                </param>
+                <param name="reviewed" type="select" label="filter by reviewed status" optional="true">
+                    <help><![CDATA[
+                    UniProtKB/TrEMBL (unreviewed) is a large, automatically annotated database that may contain
+                    redundant sequences, but there is a higher chance peptides will be identified.
+                    UniProtKB/Swiss-Prot (reviewed) is a smaller, manually annotated database with
+                    less of a chance peptides will be identified but less sequence redundancy
+                    ]]>
+                    </help>
+                    <option value="yes">UniProtKB/Swiss-Prot (reviewed only)</option>
+                    <option value="no">UniProtKB/TrEMBL (unreviewed only)</option>
+                </param>
+            </when>
+            <when value="taxids">
+                <param name="taxons" type="text" label="NCBI taxon IDs" help="Enter one or more Organism IDs (separated by commas) from http://www.uniprot.org/proteomes/">
+                    <validator type="regex" message="OrganismID[,OrganismID]">^\d+(,\d+)*$</validator>
+                </param>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data format="uniprotxml" name="proteome" label="UniProtXML" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="choice" value="taxids"/> <!-- select the conditional branch that defines "taxons" -->
+            <param name="taxons" value="1566990"/>
+            <output name="proteome"> <!-- must match the name of the data element in outputs -->
+                <assert_contents>
+                    <has_text text="&lt;/uniprot&gt;" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help>
+<![CDATA[
+**UniProtXML Downloader**
+
+Downloads a UniProtXML file from UniProtKB
+
+The Morpheus proteomics search algorithm can use this format as a search database.
+
+Available proteomes: http://www.uniprot.org/proteomes/
+
+UniProtKB help: http://www.uniprot.org/help/uniprotkb
+
+]]>
+    </help>
+    <citations>
+      <citation type="doi">10.1093/nar/gku989</citation>
+    </citations>
+</tool>
+