view validate_fasta_database.xml @ 1:3b18022a7613 draft

"planemo upload commit 6b8665071f8d1bc9a26491e8f9a85b708169b500"
author galaxyp
date Thu, 26 Nov 2020 20:25:21 +0000
parents 48c2271171f2
children 9c246c2e24ad
line wrap: on
line source

<tool id="validate_fasta_database" name="Validate FASTA Database" version="0.1.4">
    <!-- Package dependency resolved by Galaxy; the command section calls an executable of the same name. -->
    <requirements>
        <requirement type="package"
                     version="1.0">validate-fasta-database</requirement>
    </requirements>
    <!-- Exit status 1 is reported as a fatal job error, with the description given here. -->
    <stdio>
        <exit_code range="1"
                   level="fatal"
                   description="Invalid FASTA headers detected, was asked to fail"/>
    </stdio>
    <!--
        Runs validate-fasta-database with eight single-quoted positional arguments, in this order:
        input FASTA, "passed" output, "failed" output, crashIfInvalid, checkIsProtein,
        customLetters, checkHasAccession, minimumLength.
        NOTE(review): the order is presumably fixed by the executable's argument parsing;
        confirm against the validator before reordering or inserting arguments.
        Any non-zero exit status marks the job as failed (detect_errors="exit_code").
    -->
    <command detect_errors="exit_code"><![CDATA[
        validate-fasta-database
        '$inFasta'
        '$goodFastaOut'
        '$badFastaOut'
        '$crashIfInvalid'
        '$checkIsProtein'
        '$customLetters'
        '$checkHasAccession'
        '$minimumLength'
    ]]></command>
    <!--
        User-facing parameters. Each one is substituted by name into the command
        section, so parameter names here must stay in sync with that template.
        NOTE(review): the boolean defaults below are declared with "value"; Galaxy's
        tool schema documents "checked" as the default-state attribute for boolean
        parameters. Confirm that "value" is honored by the targeted Galaxy release
        before relying on checkIsProtein defaulting to true.
    -->
    <inputs>
        <!-- FASTA database to validate. -->
        <param type="data" name="inFasta" format="fasta" label="Select input FASTA dataset"/>
        <!-- When true, the job is expected to fail (exit status 1) if invalid headers are found. -->
        <param type="boolean" name="crashIfInvalid"
               label="Fail job if invalid FASTA headers detected?"
               value="false"/>
        <!-- Help text kept on one line: line breaks inside an attribute value become stray spaces. -->
        <param type="boolean" name="checkIsProtein"
               label="Ensure that sequence is a valid amino acid sequence?"
               help="Checks that sequence only contains the 20 standard amino acids (and optional non-standard AAs), and checks that it is not DNA or RNA"
               value="true"/>
        <!-- Extra residue letters accepted by the protein check. -->
        <param type="text" name="customLetters" value=""
               label="Optional: add one letter codes for any non-standard amino acids that you are using. "
               help="Anything that is not an upper case letter [A-Z] will be ignored."/>
        <param type="boolean" name="checkHasAccession"
               label="Only pass sequences with accession numbers?"
               value="false"/>
        <!-- A negative minimum length is meaningless, so the form rejects it. -->
        <param type="integer" name="minimumLength"
               label="Minimum length for sequences to pass"
               min="0"
               value="0"/>
    </inputs>
    <!-- Two FASTA datasets: one labelled "passed", one labelled "failed". -->
    <outputs>
        <data name="goodFastaOut"
              format="fasta"
              label="Validate FASTA ${on_string}: passed"/>
        <data name="badFastaOut"
              format="fasta"
              label="Validate FASTA ${on_string}: failed"/>
    </outputs>
    <tests>
        <!-- General filtering: only the input is set, every other parameter keeps its default. -->
        <test>
            <param name="inFasta" value="fastaFilteringTest_IN.fasta"/>
            <output name="goodFastaOut" file="fastaFilteringTest_OUT1.fasta"/>
            <output name="badFastaOut" file="fastaFilteringTest_OUT2.fasta"/>
        </test>

        <!-- Protein check enabled: genetic sequences and bad protein sequences are filtered out. -->
        <test>
            <param name="inFasta" value="geneticFiltering.in"/>
            <param name="checkIsProtein" value="true"/>
            <output name="goodFastaOut" file="geneticFilteringGood.out"/>
            <output name="badFastaOut" file="geneticFilteringBad.out"/>
        </test>

        <!-- Length filtering with minimumLength set to 5. -->
        <test>
            <param name="inFasta" value="length5Filtering.in"/>
            <param name="minimumLength" value="5"/>
            <output name="goodFastaOut" file="length5FilteringGood.out"/>
            <output name="badFastaOut" file="length5FilteringBad.out"/>
        </test>
    </tests>
    <help>

<![CDATA[
**Notes**

Takes a FASTA database and validates the headers using the Compomics (developers of SearchGUI and PeptideShaker) schema. 
Custom FASTA databases may be in an invalid format, which causes SearchGUI to crash.
        
**Output**

The main output of this tool, labelled "Validate FASTA ...: passed", is a FASTA database that can be run through SearchGUI without error.
The sequences that did not pass are written to the second output, labelled "Validate FASTA ...: failed", where they may be examined for typos and other errors.

In addition, the tool will print the database types assigned by the Compomics utility (e.g., UniProt), for a quick check of the validity of the custom FASTA database.

Sequences that may cause the tool to report an exception are those that are not valid examples of the following formats:

* UniProt
* SwissProt (starts with ">sw|" or ">SW|")
* NCBI (starts with ">gi|" or ">GI|")
* Halobacterium from Max Planck (starts with "OE")
* H Influenza, from Novartis (starts with ">hflu_")
* C Trachomatis (starts with ">C.tr\_" or "C\_trachomatis\_")
* M Tuberculosis (starts with ">M. tub")
* Saccharomyces Genome Database (contains "SGDID")
* Genome translation (ex. ">dm345\_3L-sense [2343534-234353938]")
* Genome Annotation Framework for Flexible Analysis (GAFFA) (starts with ">GAFFA")
* UPS (contains "\_HUMAN\_UPS")
           
Many sequences are reported as Generic, which may or may not allow for extraction of the accession number. 
]]>
    </help>
    <!--
        Single hand-written BibTeX @misc entry crediting the GalaxyP team.
        NOTE(review): the entry carries a "date" field but no "year"; confirm that
        Galaxy's citation rendering shows the date as intended.
    -->
    <citations>
        <citation type="bibtex">
            @misc{fastaValidationTool,
            author = {The GalaxyP Team},
            date = {22 June 2017},
            title = {FASTA Database Validation Tool}
            }
        </citation>
    </citations>
</tool>