view CDS_search.xml @ 0:eb95bf7f90ae draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author abims-sbr
date Fri, 01 Feb 2019 10:26:37 -0500
parents
children c79bdda8abfb
line wrap: on
line source

<tool name="CDS_search" id="cds_search" version="2.1.2">

	<!-- One-line summary of what the tool does. -->
	<description>ORF and CDS search</description>

	<!-- Shared definitions; macros.xml is expected to provide the
	     python_required macro expanded in the requirements below. -->
	<macros>
		<import>macros.xml</import>
	</macros>

	<!-- Runtime dependencies are declared once, through the shared macro. -->
	<requirements>
		<expand macro="python_required" />
	</requirements>

  	<command><![CDATA[
		## Stage the inputs: each dataset is symlinked into the job directory
		## under its element identifier, and that name is appended to
		## list_files, which is handed to the first script below.
		## Every step is chained with "&&" so a failed symlink stops the
		## pipeline instead of letting the scripts run on a partial list.
		#for $input in $inputs
			ln -s '$input' '$input.element_identifier' &&
			echo '$input.element_identifier' >> list_files &&
		#end for

		## dico.py is linked next to the working files; paths below are quoted
		## so an installation directory containing spaces does not split them.
		ln -s '$__tool_directory__/scripts/dico.py' . &&

		## Step 1: arguments are the codon table, min_length_seq,
		## nb_species_keep and the list of staged files. Starts the log.
		python '$__tool_directory__/scripts/S01_find_orf_on_multiple_alignment.py'
		'$__tool_directory__/scripts/code_universel_modified.txt'
		$length.min_length_seq
		$nb_species_keep
		list_files
		> '$log' &&

		## Step 2: receives the methionine switch (oui/non) and both
		## protein-level length thresholds. Appends to the log.
		python '$__tool_directory__/scripts/S02_remove_too_short_bit_or_whole_sequence.py'
		$nb_species_keep
		$methionine
		$length.min_length_seq
		$length.min_length_subseq
		>> '$log' &&

		## Step 3: receives nb_species_keep and the nucleic length threshold.
		## Appends to the log.
		python '$__tool_directory__/scripts/S03_remove_site_with_not_enough_species_represented.py'
		$nb_species_keep
		$length.min_length_nuc
		>> '$log'
	]]></command>

 	<inputs>
		<!-- One or more FASTA datasets; the command stages all of them for a single run. -->
		<param
			name="inputs"
			type="data"
			format="fasta"
			multiple="true"
			label="Input files"
			help="Only a fasta file with nucleic align sequences" />

		<!-- Disabled parameter: the command passes the bundled
		     scripts/code_universel_modified.txt instead of a user-supplied file. -->
		<!-- <param name="code_file" type="data" format="txt" label="Choose your file containing the universal code (codons and their amino acids)" /> -->

		<!-- Passed to all three scripts. -->
		<param
			name="nb_species_keep"
			type="integer"
			value="10"
			min="2"
			label="Minimal number of species in each locus"
			help="If you want to remove all the indels the maximum number of species is required" />

		<!-- Rendered on the command line as "oui" or "non"; also drives which CDS collections are produced. -->
		<param
			name="methionine"
			type="boolean"
			checked="true"
			truevalue="oui"
			falsevalue="non"
			label="Do you want to consider the Methionine in the search of CDS? " />

		<!-- Length thresholds, referenced in the command as $length.<name>. -->
		<section name="length" title="Do you want to choose the minimum length of the CDS?">
			<param
				name="min_length_seq"
				type="integer"
				value="50"
				min="0"
				label="Minimal length of the CDS, in proteic"
				help="By default it's 50" />
			<param
				name="min_length_subseq"
				type="integer"
				value="15"
				min="0"
				label="Minimal length of the subsequence, in proteic between two series of indels"
				help="By default it's 15" />
			<param
				name="min_length_nuc"
				type="integer"
				value="50"
				min="0"
				label="Minimal length of the CDS, in nucleic without the indel"
				help="By default it's 50" />
		</section>

		<!-- The three selectors below share the same choices (no / aa / nuc / both)
		     and are only read by the output filters. -->
		<param
			name="out_BESTORF"
			type="select"
			label="Do you want the outputs (dataset collection list) containing files with the BEST ORF? ">
			<option value="no">No</option>
			<option value="aa">Yes, with the proteic format</option>
			<option value="nuc">Yes, with the nucleic format</option>
			<option value="both">Yes, with the proteic and nucleic format</option>
		</param>

		<param
			name="out_CDS"
			type="select"
			label="Do you want the outputs (dataset collection list) containing files with CDS? ">
			<option value="no">No</option>
			<option value="aa">Yes, with the proteic format</option>
			<option value="nuc">Yes, with the nucleic format</option>
			<option value="both">Yes, with the proteic and nucleic format</option>
		</param>

		<param
			name="out_CDS_filter"
			type="select"
			label="Do you want the outputs (dataset collection list) containing files with CDS without indel? ">
			<option value="no">No</option>
			<option value="aa">Yes, with the proteic format</option>
			<option value="nuc">Yes, with the nucleic format</option>
			<option value="both">Yes, with the proteic and nucleic format</option>
		</param>
	</inputs>

	<outputs>
		<!-- Log dataset: receives the redirected standard output of the three scripts. -->
		<data name="log" format="txt" label="ORF_Search" />

		<!-- Each collection below is discovered from one working-directory folder
		     and is produced only when its filter expression is true. -->

		<!-- Best ORF, gated by out_BESTORF. -->
		<collection name="output_BESTORF_aa" type="list" label="ORF_Search_Best_ORF_aa">
			<filter>out_BESTORF in ["aa","both"]</filter>
			<discover_datasets pattern="__name_and_ext__" directory="04_BEST_ORF_aa" />
		</collection>

		<collection name="output_BESTORF_nuc" type="list" label="ORF_Search_Best_ORF_nuc">
			<filter>out_BESTORF in ["nuc","both"]</filter>
			<discover_datasets pattern="__name_and_ext__" directory="04_BEST_ORF_nuc" />
		</collection>

		<!-- CDS, gated by out_CDS; shown only when the methionine option is off. -->
		<collection name="output_CDS_aa" type="list" label="ORF_Search_CDS_aa">
			<filter>out_CDS in ["aa","both"] and not methionine</filter>
			<discover_datasets pattern="__name_and_ext__" directory="05_CDS_aa" />
		</collection>

		<collection name="output_CDS_nuc" type="list" label="ORF_Search_CDS_nuc">
			<filter>out_CDS in ["nuc","both"] and not methionine</filter>
			<discover_datasets pattern="__name_and_ext__" directory="05_CDS_nuc" />
		</collection>

		<!-- CDS with methionine, gated by out_CDS; shown only when the methionine option is on. -->
		<collection name="output_CDS_M_aa" type="list" label="ORF_Search_CDS_with_M_aa">
			<filter>out_CDS in ["aa","both"] and methionine</filter>
			<discover_datasets pattern="__name_and_ext__" directory="06_CDS_with_M_aa" />
		</collection>

		<collection name="output_CDS_M_nuc" type="list" label="ORF_Search_CDS_with_M_nuc">
			<filter>out_CDS in ["nuc","both"] and methionine</filter>
			<discover_datasets pattern="__name_and_ext__" directory="06_CDS_with_M_nuc" />
		</collection>

		<!-- CDS without indel, gated by out_CDS_filter. -->
		<collection name="output_filter_aa" type="list" label="ORF_Search_CDS_without_indel_aa">
			<filter>out_CDS_filter in ["aa","both"]</filter>
			<discover_datasets pattern="__name_and_ext__" directory="08_CDS_aa_MINIMUM_MISSING_SEQUENCES" />
		</collection>

		<collection name="output_filter_nuc" type="list" label="ORF_Search_CDS_without_indel_nuc">
			<filter>out_CDS_filter in ["nuc","both"]</filter>
			<discover_datasets pattern="__name_and_ext__" directory="08_CDS_nuc_MINIMUM_MISSING_SEQUENCES" />
		</collection>
	</outputs>

	<tests>

		<!-- Test 1: six orthogroups, at least 3 species per locus, methionine switched off ("non"),
		     every optional output requested. -->
		<test>
			<param name="inputs" ftype="fasta" value="inputs/orthogroup_1_with_4_sequences.fasta,inputs/orthogroup_6_with_4_sequences.fasta,inputs/orthogroup_7_with_3_sequences.fasta,inputs/orthogroup_8_with_4_sequences.fasta,inputs/orthogroup_12_with_5_sequences.fasta,inputs/orthogroup_14_with_4_sequences.fasta" />
			<param name="nb_species_keep" value="3" />
			<param name="methionine" value="non" />
			<section name="length">
				<param name="min_length_seq" value="50" />
				<param name="min_length_subseq" value="15" />
				<param name="min_length_nuc" value="50" />
			</section>
			<param name="out_BESTORF" value="both" />
			<param name="out_CDS" value="both" />
			<param name="out_CDS_filter" value="both" />

			<output_collection name="output_BESTORF_aa" type="list" count="2">
				<element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_1_with_3_species.fasta" />
				<element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_7_with_3_species.fasta" />
			</output_collection>
			<output_collection name="output_BESTORF_nuc" type="list" count="2">
				<element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_1_with_3_species.fasta" />
				<element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_7_with_3_species.fasta" />
			</output_collection>
			<output_collection name="output_CDS_aa" type="list" count="2">
				<element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_05_CDS_aa/test1/orthogroup_1_with_3_species.fasta" />
				<element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_05_CDS_aa/test1/orthogroup_7_with_3_species.fasta" />
			</output_collection>
			<output_collection name="output_CDS_nuc" type="list" count="2">
				<element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_1_with_3_species.fasta" />
				<element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_7_with_3_species.fasta" />
			</output_collection>
			<output_collection name="output_filter_aa" type="list" count="1">
				<element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_7_with_3_species.fasta" />
			</output_collection>
			<output_collection name="output_filter_nuc" type="list" count="1">
				<element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_7_with_3_species.fasta" />
			</output_collection>
		</test>

		<!-- Test 2: same inputs, at least 2 species per locus, methionine switched on ("oui").
		     NOTE(review): with out_CDS=both and methionine on, the output_CDS_M_aa and
		     output_CDS_M_nuc filters look like they should pass, yet no expectation is
		     declared for those collections here. Confirm whether that is intentional. -->
		<test>
			<param name="inputs" ftype="fasta" value="inputs/orthogroup_1_with_4_sequences.fasta,inputs/orthogroup_6_with_4_sequences.fasta,inputs/orthogroup_7_with_3_sequences.fasta,inputs/orthogroup_8_with_4_sequences.fasta,inputs/orthogroup_12_with_5_sequences.fasta,inputs/orthogroup_14_with_4_sequences.fasta" />
			<param name="nb_species_keep" value="2" />
			<param name="methionine" value="oui" />
			<section name="length">
				<param name="min_length_seq" value="50" />
				<param name="min_length_subseq" value="15" />
				<param name="min_length_nuc" value="50" />
			</section>
			<param name="out_BESTORF" value="both" />
			<param name="out_CDS" value="both" />
			<param name="out_CDS_filter" value="both" />

			<output_collection name="output_BESTORF_aa" type="list" count="4">
				<element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_1_with_3_species.fasta" />
				<element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_6_with_2_species.fasta" />
				<element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_7_with_3_species.fasta" />
				<element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_14_with_2_species.fasta" />
			</output_collection>
			<output_collection name="output_BESTORF_nuc" type="list" count="4">
				<element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_1_with_3_species.fasta" />
				<element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_6_with_2_species.fasta" />
				<element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_7_with_3_species.fasta" />
				<element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_14_with_2_species.fasta" />
			</output_collection>
			<output_collection name="output_filter_aa" type="list" count="1">
				<element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_aa/test2/orthogroup_14_with_2_species.fasta" />
			</output_collection>
			<output_collection name="output_filter_nuc" type="list" count="1">
				<element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_nuc/test2/orthogroup_14_with_2_species.fasta" />
			</output_collection>
		</test>

	</tests>
	<help>

@HELP_AUTHORS@

<![CDATA[

**Description**

This tool takes files containing aligned nucleic sequences and searches them for ORFs and CDS.

--------

**Inputs**

Input files : (multiple) fasta files with nucleic aligned sequences.

--------

**Parameters**

    - methionine : choose to consider the methionine in the search of CDS.
        yes/no.

    - 'Minimal number of species in each locus'        
        Default : 10 (integer).

    - 'min_length_seq' :
        minimal length of the sequence (in amino acids).        
        when the removal of the indel is done, the minimal length equals : previous length - 20.
        for example if you choose 50 for the minimal length, the actual length equals 30.
        Default : 50 (integer).

    - 'min_length_subseq' :
        minimal length of the subsequence (in amino acids).
        subsequence means the part of the original sequence between 2 sets of indels.
        an indel set is composed by more than 2 indels, if not the set is considered as unknown amino acid.
        Default : 15 (integer).

    - 'min_length_nuc' :
        Minimal length of the sequence in the nucleic format, without indels.
        Default : 50 (integer).

    - other parameters let you choose which outputs you want :
        - outputs with best ORFs.
        - outputs with CDS, with or without indels.
        - in proteic or nucleic format.

--------

**Outputs**

    - ORF_Search
        the log file (mainly statistics about the tool).

    - ORF_Search_Best_ORF_aa
        the output with the best ORF in the proteic format.

    - ORF_Search_Best_ORF_nuc
        the output with the best ORF in the nucleic format.

    - ORF_Search_CDS_aa
        the output with the CDS (regardless of the Methionine) in the proteic format.

    - ORF_Search_CDS_nuc
        the output with the CDS (regardless of the Methionine) in the nucleic format.

    - ORF_Search_CDS_with_M_aa
        the output with the CDS (considering the Methionine) in proteic format.
        the rule : they must have a methionine before the minimal length of the sequence.
        for example before the last 30 amino acids.

    - ORF_Search_CDS_with_M_nuc
        the output with the CDS (considering the Methionine) in nucleic format.
        the rule : they must have a methionine before the minimal length of the sequence.
        for example before the last 30 amino acids.

    - ORF_Search_CDS_without_indel_aa
        is the output with the CDS without indel in proteic format.
        considering the Methionine or not : according to the option chosen.

    - ORF_Search_CDS_without_indel_nuc
        is the output with the CDS without indel in nucleic format.
        considering the Methionine or not : according to the option chosen.

---------

**The AdaptSearch Pipeline**

.. image:: adaptsearch_picture_helps.png

---------

Changelog
---------

**Version 2.0 - 05/07/2017**

 - NEW: Replace the zip between tools by Dataset Collection

**Version 1.0 - 13/04/2017**

 - Added functional test with planemo
 - planemo test with conda dependency for python
 - Scripts renamed + symlinks to the directory 'scripts'

    ]]>

	</help>

    <!-- No citation is declared for this tool. -->
    <citations />

</tool>