Mercurial > repos > galaxyp > uniprotxml_downloader

<tool id="uniprotxml_downloader" name="UniProt" version="2.5.0" profile="23.1">
    <description>download proteome as XML, fasta or TSV</description>
    <macros>
        <!-- Shared "Field" selector, expanded into both the "enter_ids" and the
             "history" input modes. The command section passes the chosen value
             to the script as the argument of its field option. -->
        <xml name="query_field">
            <param name="field" type="select" label="Field">
                <option value="taxonomy_name">Taxonomy Name</option>
                <option value="taxonomy_id">Taxonomy ID</option>
                <option value="accession">Accession</option>
            </param>
        </xml>
    </macros>
    <!-- Runtime dependency: the "requests" package, pinned to version 2.25.1. -->
    <requirements>
        <requirement type="package" version="2.25.1">requests</requirement>
    </requirements>
    <!-- Any exit code of 1 or greater marks the job as failed (fatal). -->
    <stdio>
        <exit_code range="1:"  level="fatal" description="Error downloading proteome." />
    </stdio>
    <command>
<![CDATA[
## Build the call to the bundled downloader script. Every interpolated value is
## single-quoted so the shell always receives it as exactly one argument.
python '$__tool_directory__/uniprotxml_downloader.py'
#if $input_method.input_choice == 'common':
    ## The selected organism value is searched as a taxonomy ID.
    --search-id '$input_method.organism'
    --field taxonomy_id
    #if $input_method.reviewed:
        --reviewed='$input_method.reviewed'
    #end if
#elif $input_method.input_choice == 'enter_ids':
    --field '$input_method.field'
    ## One search-id option is emitted per comma-separated entry.
    #for $id in $input_method.ids.split(','):
        --search-id '$id'
    #end for
#elif $input_method.input_choice == 'history':
    --field '$input_method.field'
    --input='${input_method.id_file}'
    ## The script receives the selected column number minus one.
    --column=#echo int(str($input_method.column)) - 1#
#end if
--format '$format_cond.format'
#if $format_cond.format == "tsv":
    ## Join the selected column names into one comma-separated argument.
    #set $output_columns = ','.join($format_cond.columns)
    --output_columns '$output_columns'
#end if
--output '${proteome}'
]]>
    </command>
    <inputs>
        <!-- How the search IDs are supplied: a preset organism, a typed
             comma-separated list, or a column of a history dataset. -->
        <conditional name="input_method">
            <param name="input_choice" type="select" label="Select">
                <option value="common">A Common Organism</option>
                <option value="enter_ids">A manually entered list of accessions or taxonomy IDs/names</option>
                <option value="history">A history dataset with a column containing accessions or taxonomy IDs/names</option>
            </param>
            <when value="common">
                <param name="organism" type="select" label="Common Organisms"
                       help="select species for protein database">
                    <options from_file="uniprot_taxons.loc">
                        <column name="name" index="0" />
                        <column name="value" index="1" />
                    </options>
                </param>
                <param name="reviewed" type="select" label="filter by reviewed status" optional="true">
                    <help><![CDATA[
                    UniProtKB/TrEMBL (unreviewed) is a large, automatically annotated database that may contain
                    redundant sequences, but there is a higher chance peptides will be identified.
                    UniProtKB/Swiss-Prot (reviewed) is a smaller, manually annotated database with
                    less of a chance peptides will be identified but less sequence redundancy
                    ]]>
                    </help>
                    <option value="yes">UniProtKB/Swiss-Prot (reviewed only)</option>
                    <option value="no">UniProtKB/TrEMBL (unreviewed only)</option>
                </param>
            </when>
            <when value="enter_ids">
                <param name="ids" type="text" label="Search ID values"
                       help="Enter one or more IDs (separated by commas) from http://www.uniprot.org/proteomes/">
                    <!-- Comma-separated entries; each entry is one or more
                         words separated by single spaces. -->
                    <validator type="regex" message="OrganismID[,OrganismID]">^\w+( \w+)*(,\w+( \w+)*)*$</validator>
                </param>
                <expand macro="query_field"/>
            </when>
            <when value="history">
                <param name="id_file" type="data" format="tabular,txt" label="Dataset (tab separated) with ID column"/>
                <param name="column" type="data_column" data_ref="id_file" label="Column with ID"/>
                <expand macro="query_field"/>
            </when>
        </conditional>
        <!-- Output format; only the TSV format exposes a column selection. -->
        <conditional name="format_cond">
            <param name="format" type="select" label="uniprot output format">
                <option value="fasta">fasta</option>
                <option value="tsv">TSV</option>
                <option value="xml">xml</option>
            </param>
            <when value="fasta"/>
            <when value="xml"/>
            <when value="tsv">
                <param name="columns" type="select" multiple="true" label="Columns to include in the TSV output"
                       help="Result fields offered by UniProtKB, listed as 'group - field'">
                    <options from_url="https://rest.uniprot.org/configure/uniprotkb/result-fields">
                        <!-- Flattens the grouped field list into
                             [label, value, selected] triples and pre-selects
                             the seven default columns. Uses indexOf rather than
                             Array.prototype.includes so the code stays within
                             the declared ecma5.1 level. -->
                        <postprocess_expression type="ecma5.1"><![CDATA[${
                            var defaults = ["accession", "id", "reviewed", "protein_name", "gene_names", "organism_name", "length"];
                            var options = [];
                            inputs.forEach(function(group) {
                                group.fields.forEach(function(field) {
                                    var selected = defaults.indexOf(field.name) !== -1;
                                    options.push([group.groupName + " - " + field.label, field.name, selected]);
                                });
                            });
                            return options;
                        }]]></postprocess_expression>
                    </options>
                </param>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Single result dataset: declared as uniprotxml and switched to fasta
             or tsv when the matching output format is selected. -->
        <data format="uniprotxml" name="proteome">
            <change_format>
                <when input="format_cond.format" value="fasta" format="fasta" />
                <when input="format_cond.format" value="tsv" format="tsv" />
            </change_format>
        </data>
    </outputs>
    <tests>
        <!-- Manually entered ID 1566990 with the `field` select left unset and XML
             output. Per the inline note below, this searches by name using an
             ID, so stdout is expected to report zero entries while the output
             still contains the closing uniprot tag. -->
        <test>
            <conditional name="input_method">
                <param name="input_choice" value="enter_ids"/>
                <param name="ids" value="1566990"/>
            </conditional>
            <conditional name="format_cond">
                <param name="format" value="xml"/>
            </conditional>
            <output name="proteome" ftype="uniprotxml">
                <assert_contents>
                    <has_text text="&lt;/uniprot&gt;" />
                </assert_contents>
            </output>
            <assert_stdout>
                <has_text_matching expression="UniProt-Release:\d{4}_\d{2}"/>
                <has_line line="Entries:0"/> <!-- searching by name using an ID -->
            </assert_stdout>
        </test>
        <!-- Two manually entered taxonomy IDs, fasta output: both strain names must
             appear in the result and stdout must report a non-zero entry count. -->
        <test>
            <conditional name="input_method">
                <param name="input_choice" value="enter_ids"/>
                <param name="ids" value="765963,512562"/>
                <param name="field" value="taxonomy_id"/>
            </conditional>
            <conditional name="format_cond">
                <param name="format" value="fasta"/>
            </conditional>
            <output name="proteome" ftype="fasta">
                <assert_contents>
                    <has_text text="Shi470" />
                    <has_text text="PeCan4" />
                </assert_contents>
            </output>
            <assert_stdout>
                <has_text_matching expression="UniProt-Release:\d{4}_\d{2}"/>
                <has_text_matching expression="Entries:\d+"/>
                <has_line line="Entries:0" negate="true"/>
            </assert_stdout>
        </test>
        <!-- Two manually entered taxonomy names, fasta output: both strain names
             must appear in the result. The expected datatype is declared on the
             output element, as in the other tests, so it is actually checked. -->
        <test>
            <conditional name="input_method">
                <param name="input_choice" value="enter_ids"/>
                <param name="ids" value="Shi470,PeCan4"/>
                <param name="field" value="taxonomy_name"/>
            </conditional>
            <conditional name="format_cond">
                <param name="format" value="fasta"/>
            </conditional>
            <output name="proteome" ftype="fasta">
                <assert_contents>
                    <has_text text="Shi470" />
                    <has_text text="PeCan4" />
                </assert_contents>
            </output>
        </test>
        <!-- Two manually entered protein accessions, fasta output: both accessions
             must appear in the result and stdout must report a non-zero entry
             count. -->
        <test>
            <conditional name="input_method">
                <param name="input_choice" value="enter_ids"/>
                <param name="ids" value="E1Q2I0,E1Q3C4"/>
                <param name="field" value="accession"/>
            </conditional>
            <conditional name="format_cond">
                <param name="format" value="fasta"/>
            </conditional>
            <output name="proteome" ftype="fasta">
                <assert_contents>
                    <has_text text="E1Q2I0" />
                    <has_text text="E1Q3C4" />
                </assert_contents>
            </output>
            <assert_stdout>
                <has_text_matching expression="UniProt-Release:\d{4}_\d{2}"/>
                <has_text_matching expression="Entries:\d+"/>
                <has_line line="Entries:0" negate="true"/>
            </assert_stdout>
        </test>
        <!-- Taxonomy names read from column 1 of a tabular history dataset, fasta
             output: both strain names must appear and stdout must report a
             non-zero entry count. -->
        <test>
            <conditional name="input_method">
                <param name="input_choice" value="history"/>
                <param name="id_file" value="Helicobacter_strains.tsv" ftype="tabular"/>
                <param name="column" value="1"/>
                <param name="field" value="taxonomy_name"/>
            </conditional>
            <conditional name="format_cond">
                <param name="format" value="fasta"/>
            </conditional>
            <output name="proteome" ftype="fasta">
                <assert_contents>
                    <has_text text="Shi470" />
                    <has_text text="PeCan4" />
                </assert_contents>
            </output>
            <assert_stdout>
                <has_text_matching expression="UniProt-Release:\d{4}_\d{2}"/>
                <has_text_matching expression="Entries:\d+"/>
                <has_line line="Entries:0" negate="true"/>
            </assert_stdout>
        </test>
        <!-- Taxonomy IDs read from column 2 of a tabular history dataset, fasta
             output: both strain names must appear and stdout must report a
             non-zero entry count. -->
        <test>
            <conditional name="input_method">
                <param name="input_choice" value="history"/>
                <param name="id_file" value="Helicobacter_strains_ids.tsv" ftype="tabular"/>
                <param name="column" value="2"/>
                <param name="field" value="taxonomy_id"/>
            </conditional>
            <conditional name="format_cond">
                <param name="format" value="fasta"/>
            </conditional>
            <output name="proteome" ftype="fasta">
                <assert_contents>
                    <has_text text="Shi470" />
                    <has_text text="PeCan4" />
                </assert_contents>
            </output>
            <assert_stdout>
                <has_text_matching expression="UniProt-Release:\d{4}_\d{2}"/>
                <has_text_matching expression="Entries:\d+"/>
                <has_line line="Entries:0" negate="true"/>
            </assert_stdout>
        </test>
        <!-- Protein accessions read from column 1 of a tabular history dataset,
             fasta output: both accessions must appear and stdout must report a
             non-zero entry count. -->
        <test>
            <conditional name="input_method">
                <param name="input_choice" value="history"/>
                <param name="id_file" value="Helicobacter_protein_accessions.tsv" ftype="tabular"/>
                <param name="column" value="1"/>
                <param name="field" value="accession"/>
            </conditional>
            <conditional name="format_cond">
                <param name="format" value="fasta"/>
            </conditional>
            <output name="proteome" ftype="fasta">
                <assert_contents>
                    <has_text text="E1Q2I0" />
                    <has_text text="E1Q3C4" />
                </assert_contents>
            </output>
            <assert_stdout>
                <has_text_matching expression="UniProt-Release:\d{4}_\d{2}"/>
                <has_text_matching expression="Entries:\d+"/>
                <has_line line="Entries:0" negate="true"/>
            </assert_stdout>
        </test>
        <!-- TSV output with the `columns` parameter left unset: the result is
             expected to have seven columns (presumably the seven pre-selected
             default fields) and to mention both strain names. -->
        <test>
            <conditional name="input_method">
                <param name="input_choice" value="enter_ids"/>
                <param name="ids" value="765963,512562"/>
                <param name="field" value="taxonomy_id"/>
            </conditional>
            <conditional name="format_cond">
                <param name="format" value="tsv"/>
            </conditional>
            <output name="proteome" ftype="tsv">
                <assert_contents>
                    <has_n_columns n="7" />
                    <has_text text="Shi470" />
                    <has_text text="PeCan4" />
                </assert_contents>
            </output>
            <assert_stdout>
                <has_text_matching expression="UniProt-Release:\d{4}_\d{2}"/>
                <has_text_matching expression="Entries:\d+"/>
                <has_line line="Entries:0" negate="true"/>
            </assert_stdout>
        </test>
        <!-- TSV output with non-default columns (accession and sequence only):
             the result must have exactly two columns, must not contain the
             strain name "Shi470", and must contain accession B2US14. -->
        <test>
            <conditional name="input_method">
                <param name="input_choice" value="enter_ids"/>
                <param name="ids" value="765963,512562"/>
                <param name="field" value="taxonomy_id"/>
            </conditional>
            <conditional name="format_cond">
                <param name="format" value="tsv"/>
                <param name="columns" value="accession,sequence"/>
            </conditional>
            <output name="proteome" ftype="tsv">
                <assert_contents>
                    <has_n_columns n="2" />
                    <has_text text="Shi470" negate="true"/>
                    <has_text text="B2US14" />
                </assert_contents>
            </output>
            <assert_stdout>
                <has_text_matching expression="UniProt-Release:\d{4}_\d{2}"/>
                <has_text_matching expression="Entries:\d+"/>
                <has_line line="Entries:0" negate="true"/>
            </assert_stdout>
        </test>
    </tests>
    <help>
<![CDATA[
**UniProt Downloader**

Downloads a UniProtXML, fasta or TSV file from UniProtKB

The Morpheus proteomics search algorithm can use the UniProtXML format as a search database.

Available proteomes: http://www.uniprot.org/proteomes/

Available taxon names: http://www.uniprot.org/taxonomy/

Example taxon: http://www.uniprot.org/taxonomy/512562

Example protein: https://www.uniprot.org/uniprotkb/E1Q2I0/entry

Description of query fields: https://www.uniprot.org/help/query-fields

IDs can be entered as text or read from a column in a tabular dataset from your history.

Example IDs and names related to the bacterium Helicobacter pylori (strain Shi470) ::


 - 512562
 - Shi470
 - Helicobacter pylori
 - Helicobacter
 - Helicobacteraceae

 Example protein accession numbers from Helicobacter pylori:

 - E1Q2I0
 - E1Q3C4


UniProtKB help: http://www.uniprot.org/help/uniprotkb

]]>
    </help>
    <!-- Publication to cite when using this tool, referenced by DOI. -->
    <citations>
      <citation type="doi">10.1093/nar/gku989</citation>
    </citations>
</tool>
author	galaxyp
date	Wed, 11 Dec 2024 13:34:54 +0000
parents	a371252a2cf6
children