Mercurial > repos > galaxyp > uniprotxml_downloader
changeset 2:e1abc9a35c64 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 15c2d28359584bcee25cdb456cff50892fff7347
author | galaxyp |
---|---|
date | Fri, 16 Dec 2016 17:33:05 -0500 |
parents | fc8c4bd28681 |
children | 1a5690a5eedc |
files | test-data/Helicobacter_strains.tsv uniprotxml_downloader.py uniprotxml_downloader.xml |
diffstat | 3 files changed, 112 insertions(+), 37 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/Helicobacter_strains.tsv Fri Dec 16 17:33:05 2016 -0500 @@ -0,0 +1,2 @@ +PeCan4 +Shi470
--- a/uniprotxml_downloader.py Wed Dec 07 16:44:14 2016 -0500 +++ b/uniprotxml_downloader.py Fri Dec 16 17:33:05 2016 -0500 @@ -15,19 +15,32 @@ import re import optparse import urllib +import urllib2 def __main__(): # Parse Command Line parser = optparse.OptionParser() + parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs') + parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs' ) parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download') - parser.add_option('-r', '--reviewed', dest='reviewed', help='file path for th downloaed uniprot xml') + parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') + parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml',help='output format') parser.add_option('-o', '--output', dest='output', help='file path for th downloaed uniprot xml') parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Print UniProt Info') parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') (options, args) = parser.parse_args() - - taxids = options.taxon if options.taxon else ['9606'] + taxids = set(options.taxon) + if options.input: + with open(options.input,'r') as inputFile: + for linenum,line in enumerate(inputFile): + if line.startswith('#'): + continue + fields = line.rstrip('\r\n').split('\t') + if len(fields) > abs(options.column): + taxid = fields[options.column].strip() + if taxid: + taxids.add(taxid) taxon_queries = ['taxonomy:"%s"' % taxid for taxid in taxids] taxon_query = ' OR '.join(taxon_queries) if options.output: @@ -35,26 +48,31 @@ else: dest_path = "uniprot_%s.xml" % '_'.join(taxids) reviewed = " reviewed:%s" % options.reviewed if 
options.reviewed else '' - url = 'http://www.uniprot.org/uniprot/?query=%s%s&force=yes&format=xml' % (taxon_query, reviewed) - if options.debug: - print >> sys.stderr, url try: - (fname, msg) = urllib.urlretrieve(url, dest_path) + def reporthook(n1,n2,n3): + pass + url = 'http://www.uniprot.org/uniprot/' + query = "%s%s" % (taxon_query, reviewed) + params = {'query' : query, 'force' : 'yes' , 'format' : options.format} + if options.debug: + print >> sys.stderr, "%s ? %s" % (url,params) + data = urllib.urlencode(params) + (fname, msg) = urllib.urlretrieve(url, dest_path,reporthook,data) headers = {j[0]: j[1].strip() for j in [i.split(':', 1) for i in str(msg).strip().splitlines()]} if 'Content-Length' in headers and headers['Content-Length'] == 0: print >> sys.stderr, url print >> sys.stderr, msg exit(1) - elif True: - pass - else: + if options.format == 'xml': with open(dest_path, 'r') as contents: while True: line = contents.readline() if options.debug: print >> sys.stderr, line - if line is None or not line.startswith('<?'): + if line is None: break + if line.startswith('<?'): + continue # pattern match <root or <ns:root for any ns string pattern = '^<(\w*:)?uniprot' if re.match(pattern, line): @@ -62,7 +80,6 @@ else: print >> sys.stderr, "failed: Not a uniprot xml file" exit(1) - if options.verbose: print >> sys.stdout, "NCBI Taxon ID:%s" % taxids if 'X-UniProt-Release' in headers:
--- a/uniprotxml_downloader.xml Wed Dec 07 16:44:14 2016 -0500 +++ b/uniprotxml_downloader.xml Fri Dec 16 17:33:05 2016 -0500 @@ -1,33 +1,37 @@ -<tool id="uniprotxml_downloader" name="UniProtXML Download" version="1.0.1"> - <description>proteome</description> +<tool id="uniprotxml_downloader" name="UniProt" version="2.0.0"> + <description>download proteome as XML or fasta</description> <requirements> - <requirement type="package" version="2.7.12">python</requirement> + <requirement type="package" version="2.7">python</requirement> </requirements> <stdio> <exit_code range="1:" level="fatal" description="Error downloading proteome." /> </stdio> <command> <![CDATA[ - python '$__tool_directory__/uniprotxml_downloader.py' - -v - #if $taxid.choice == 'common': - --taxon '$taxid.organism' - #if $taxid.reviewed: - --reviewed=$taxid.reviewed - #end if - #else: - #for id in $taxid.taxons.split(','): - -t $id - #end for - #end if - --output="${proteome}" +python $__tool_directory__/uniprotxml_downloader.py -v +#if $taxid.input_choice == 'common': +--taxon $taxid.organism +#if $taxid.reviewed: +--reviewed=$taxid.reviewed +#end if +#elif $taxid.input_choice == 'taxids': +#for $id in $taxid.taxons.split(','): +-t "$id" +#end for +#elif $taxid.input_choice == 'history': +--input="${taxid.taxon_file}" +--column=#echo int(str($taxid.column)) - 1# +#end if +--format=$format +--output="${proteome}" ]]> </command> <inputs> <conditional name="taxid"> - <param name="choice" type="select" label="Select"> + <param name="input_choice" type="select" label="Select"> <option value="common">A Common Organism</option> <option value="taxids">By Organism IDs</option> + <option value="history">A history dataset with an Organism Taxonomy Name column</option> </param> <when value="common"> <param name="organism" type="select" label="Common Organisms" @@ -50,37 +54,89 @@ </param> </when> <when value="taxids"> - <param name="taxons" type="text" label="NCBI taxon IDs" + <param name="taxons" type="text" 
label="NCBI Taxon IDs or names" help="Enter one or more Organsim IDs (separated by commas) from http://www.uniprot.org/proteomes/"> - <validator type="regex" message="OrganismID[,OrganismID]">^\d+(,\d+)*$</validator> + <validator type="regex" message="OrganismID[,OrganismID]">^\w+( \w+)*(,\w+( \w+)*)*$</validator> </param> </when> + <when value="history"> + <param name="taxon_file" type="data" format="tabular" label="Dataset with Taxon Name column"/> + <param name="column" type="data_column" data_ref="taxon_file" label="Column with Taxon name"/> + </when> </conditional> + <param name="format" type="select" label="uniprot output format"> + <option value="xml">xml</option> + <option value="fasta">fasta</option> + </param> </inputs> <outputs> - <data format="uniprotxml" name="proteome" label="UniProtXML" /> + <data format="uniprotxml" name="proteome" label="UniProt.${format}"> + <change_format> + <when input="format" value="fasta" format="fasta" /> + </change_format> + </data> </outputs> <tests> <test> - <param name="choice" value="taxids"/> + <param name="input_choice" value="taxids"/> <param name="taxons" value="1566990"/> - <output name="uniprotxml"> + <param name="format" value="xml"/> + <output name="proteome"> <assert_contents> <has_text text="</uniprot>" /> </assert_contents> </output> </test> + <test> + <param name="input_choice" value="taxids"/> + <param name="taxons" value="Shi470,PeCan4"/> + <param name="format" value="fasta"/> + <output name="proteome"> + <assert_contents> + <has_text text="Shi470" /> + <has_text text="PeCan4" /> + </assert_contents> + </output> + </test> + <test> + <param name="input_choice" value="history"/> + <param name="taxon_file" value="Helicobacter_strains.tsv" ftype="tabular"/> + <param name="column" value="1"/> + <param name="format" value="fasta"/> + <output name="proteome"> + <assert_contents> + <has_text text="Shi470" /> + <has_text text="PeCan4" /> + </assert_contents> + </output> + </test> </tests> <help> <![CDATA[ 
-**UniProtXML Downloader** +**UniProt Downloader** -Downloads a UniProtXML file from UniProtKB +Downloads either a UniProtXML file or a fasta file from UniProtKB -The Morpheus proteomics search algorithm can use this format as a search database. +The Morpheus proteomics search algorithm can use the UniProtXML format as a search database. Available proteomes: http://www.uniprot.org/proteomes/ +Available taxon names: http://www.uniprot.org/taxonomy/ + +Example taxon: http://www.uniprot.org/taxonomy/512562 + +Taxon IDs or names can be entered as text or read from a column in a tabular dataset from your history. + +Example IDs and names related to the bacterium Helicobacter pylori (strain Shi470) :: + + + - 512562 + - Shi470 + - Helicobacter pylori + - Helicobacter + - Helicobacteraceae + + UniProtKB help: http://www.uniprot.org/help/uniprotkb ]]>