Mercurial > repos > galaxyp > uniprotxml_downloader
changeset 0:0bd2688166a5 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit fa07533e9216dc40133a98e3129be9b87a963e80-dirty
author | galaxyp |
---|---|
date | Tue, 08 Mar 2016 12:03:49 -0500 |
parents | |
children | fc8c4bd28681 |
files | tool-data/uniprot_taxons.loc.sample uniprotxml_downloader.py uniprotxml_downloader.xml |
diffstat | 3 files changed, 182 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/uniprot_taxons.loc.sample Tue Mar 08 12:03:49 2016 -0500 @@ -0,0 +1,14 @@ +Homo sapiens (Human) 9606 +Rattus norvegicus (Rat) 10116 +Mus musculus (Mouse) 10090 +Bos taurus (Bovine) 9913 +Sus scrofa (Pig) 9823 +Escherichia coli (strain K12) 83333 +Arabidopsis thaliana (Mouse-ear cress) 3702 +Caenorhabditis elegans (C.elegans) 6239 +Drosophila melanogaster (Fruit fly) 7227 +Saccharomyces cerevisiae (Baker's Yeast) 4932 +Canis familiaris (Dog) 9615 +Halobacterium salinarum (strain ATCC 700922 / JCM 11081 / NRC-1) 64091 +Apis mellifera (Honeybee) 7460 +Mycobacterium tuberculosis H37Rv (MTB) [Not complete proteome] 83332
#!/usr/bin/env python
"""
#
#------------------------------------------------------------------------------
#                         University of Minnesota
#         Copyright 2016, Regents of the University of Minnesota
#------------------------------------------------------------------------------
# Author:
#
#  James E Johnson
#
#------------------------------------------------------------------------------

Download the UniProtKB entries of one or more NCBI taxon IDs as UniProt XML.
"""
import sys
import re
import optparse
import urllib

try:  # Python 2
    from urllib import quote, urlretrieve
except ImportError:  # Python 3
    from urllib.parse import quote
    from urllib.request import urlretrieve

# Query endpoint; the two placeholders are the taxonomy query and the
# optional reviewed filter.
# NOTE(review): UniProt may have retired this legacy endpoint in favour of
# rest.uniprot.org - confirm it still answers before relying on it.
UNIPROT_URL = 'http://www.uniprot.org/uniprot/?query=%s%s&force=yes&format=xml'

# Characters left untouched when percent-encoding the request URL.  This is
# the set Python 2's urllib applies implicitly when opening a URL; Python 3
# does no implicit escaping, so the escaping is done explicitly here.
# '%' is in the set, which keeps the escaping idempotent.
URL_SAFE_CHARS = "%/:=&?~#+!$,;'@()*[]|"

# Matches <uniprot or <ns:uniprot for any namespace prefix.
ROOT_PATTERN = re.compile(r'^<(\w*:)?uniprot')


def _build_url(taxids, reviewed=None):
    """Return the (unescaped) UniProt query URL for the given taxon IDs.

    taxids   -- list of NCBI taxon ID strings, OR-ed together
    reviewed -- optional value for the 'reviewed:' filter (e.g. 'yes', 'no')
    """
    taxon_query = ' OR '.join('taxonomy:"%s"' % taxid for taxid in taxids)
    reviewed_filter = ''
    if reviewed:
        reviewed_filter = ' reviewed:%s' % reviewed
        if len(taxids) > 1:
            # Group the OR-ed taxa so the reviewed filter applies to all of
            # them rather than only to the last one.
            taxon_query = '(%s)' % taxon_query
    return UNIPROT_URL % (taxon_query, reviewed_filter)


def _is_uniprot_xml(path, debug=False):
    """Return True if the first element in the file at path is a uniprot root.

    Blank lines and processing instructions (lines starting with '<?', such
    as the XML declaration) are skipped; the first remaining line decides.
    An empty file is not a UniProt XML file.  With debug set, every line
    that is read is echoed to stderr.
    """
    with open(path, 'r') as contents:
        for line in contents:
            if debug:
                sys.stderr.write("%s\n" % line)
            stripped = line.strip()
            if not stripped or stripped.startswith('<?'):
                continue
            return ROOT_PATTERN.match(stripped) is not None
    return False


def __main__():
    """Parse the command line, download the proteome and validate the result.

    Exits with status 1 when the download fails, the response is empty, or
    the downloaded file is not UniProt XML, so that callers checking the exit
    code can detect the failure.
    """
    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download')
    parser.add_option('-r', '--reviewed', dest='reviewed', help='restrict to reviewed (yes) or unreviewed (no) entries')
    parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml')
    parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Print UniProt Info')
    parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr')
    (options, _args) = parser.parse_args()

    # Default to Homo sapiens when no taxon is given.
    taxids = options.taxon if options.taxon else ['9606']
    if options.output:
        dest_path = options.output
    else:
        dest_path = "uniprot_%s.xml" % '_'.join(taxids)
    url = _build_url(taxids, options.reviewed)
    if options.debug:
        sys.stderr.write("%s\n" % url)
    try:
        (_fname, msg) = urlretrieve(quote(url, safe=URL_SAFE_CHARS), dest_path)
        # msg is the response header message; .get() is case-insensitive and
        # returns None for an absent header.
        content_length = msg.get('Content-Length')
        if content_length is not None and content_length.strip() == '0':
            sys.stderr.write("%s\n" % url)
            sys.stderr.write("%s\n" % msg)
            sys.exit(1)
        # An HTTP error page can be saved as if it were the payload, so make
        # sure what arrived really is UniProt XML.
        if not _is_uniprot_xml(dest_path, debug=options.debug):
            sys.stderr.write("failed: Not a uniprot xml file\n")
            sys.exit(1)

        if options.verbose:
            sys.stdout.write("NCBI Taxon ID:%s\n" % taxids)
            release = msg.get('X-UniProt-Release')
            if release is not None:
                sys.stdout.write("UniProt-Release:%s\n" % release.strip())
            total = msg.get('X-Total-Results')
            if total is not None:
                sys.stdout.write("Entries:%s\n" % total.strip())
            sys.stdout.write("%s\n" % url)
    except Exception as e:
        # Top-level boundary: report the problem and signal failure through
        # the exit status (sys.exit above raises SystemExit, which is not an
        # Exception subclass and therefore passes through untouched).
        sys.stderr.write("failed: %s\n" % e)
        sys.exit(1)


if __name__ == "__main__":
    __main__()
<tool id="uniprotxml_downloader" name="UniProtXML Download" version="1.0.0">
    <description>proteome</description>
    <requirements>
        <requirement type="binary">wget</requirement>
    </requirements>
    <stdio>
        <!-- Any non-zero exit status of the wrapper script marks the job as failed. -->
        <exit_code range="1:" level="fatal" description="Error downloading proteome." />
    </stdio>
    <command interpreter="python">
<![CDATA[
uniprotxml_downloader.py -v
#if $taxid.choice == 'common':
--taxon $taxid.organism
#if $taxid.reviewed:
--reviewed=$taxid.reviewed
#end if
#else:
#for id in $taxid.taxons.split(','):
-t $id
#end for
#end if
--output="${proteome}"
]]>
    </command>
    <inputs>
        <!-- Either pick one organism from the uniprot_taxons.loc table, or enter taxon IDs by hand. -->
        <conditional name="taxid">
            <param name="choice" type="select" label="Select">
                <option value="common">A Common Organism</option>
                <option value="taxids">By Organism IDs</option>
            </param>
            <when value="common">
                <param name="organism" type="select" label="Common Organisms"
                       help="select species for protein database">
                    <options from_file="uniprot_taxons.loc">
                        <column name="name" index="0" />
                        <column name="value" index="1" />
                    </options>
                </param>
                <param name="reviewed" type="select" label="filter by reviewed status" optional="true">
                    <help><![CDATA[
                    UniProtKB/TrEMBL (unreviewed) is a large, automatically annotated database that may contain
                    redundant sequences, but there is a higher chance peptides will be identified.

                    UniProtKB/Swiss-Prot (reviewed) is a smaller, manually annotated database with
                    less of a chance peptides will be identified but less sequence redundancy
                    ]]>
                    </help>
                    <option value="yes">UniProtKB/Swiss-Prot (reviewed only)</option>
                    <option value="no">UniProtKB/TrEMBL (unreviewed only)</option>
                </param>
            </when>
            <when value="taxids">
                <param name="taxons" type="text" label="NCBI taxon IDs"
                       help="Enter one or more Organism IDs (separated by commas) from http://www.uniprot.org/proteomes/">
                    <validator type="regex" message="OrganismID[,OrganismID]">^\d+(,\d+)*$</validator>
                </param>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data format="uniprotxml" name="proteome" label="UniProtXML" />
    </outputs>
    <tests>
        <test>
            <!-- Select the "taxids" branch explicitly; otherwise the conditional defaults to "common". -->
            <param name="choice" value="taxids"/>
            <param name="taxons" value="1566990"/>
            <!-- The output is referenced by the name declared in <outputs>. -->
            <output name="proteome">
                <assert_contents>
                    <has_text text="&lt;/uniprot&gt;" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>
<![CDATA[
**UniProtXML Downloader**

Downloads a UniProtXML file from UniProtKB

The Morpheus proteomics search algorithm can use this format as a search database.

Available proteomes: http://www.uniprot.org/proteomes/

UniProtKB help: http://www.uniprot.org/help/uniprotkb

]]>
    </help>
    <citations>
        <citation type="doi">10.1093/nar/gku989</citation>
    </citations>
</tool>