diff filter_assembly.xml @ 0:7a813e633d1c draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author abims-sbr
date Fri, 01 Feb 2019 10:22:32 -0500
parents
children a83562c0719f
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_assembly.xml	Fri Feb 01 10:22:32 2019 -0500
@@ -0,0 +1,191 @@
+<tool name="Filter assemblies" id="filter_assemblies" version="2.0.3">
+
+    <description>
+        Filter the outputs of Velvet or Trinity assemblies
+    </description>
+
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+
+    <requirements>
+        <expand macro="python_required" />
+        <requirement type="package" version="0.0.14">fastx_toolkit</requirement>
+        <requirement type="package" version="10.2011">cap3</requirement>
+    </requirements>
+
+    <command>
+    <![CDATA[
+        #set $infiles = ""
+        #for $input in $inputs
+            ln -s '$input' '$input.element_identifier';
+            #set $infiles = $infiles + $input.element_identifier + ","
+        #end for
+        #set $infiles = $infiles[:-1]
+
+        ln -s '$__tool_directory__/scripts/S02a_remove_redondancy_from_velvet_oases.py' . &&
+        ln -s '$__tool_directory__/scripts/S02b_format_fasta_name_trinity.py' . &&
+        ln -s '$__tool_directory__/scripts/S03_choose_one_variants_per_locus_trinity.py' . &&
+        ln -s '$__tool_directory__/scripts/S04_find_orf.py' . &&
+        ln -s '$__tool_directory__/scripts/S05_filter.py' . &&
+
+        python '$__tool_directory__/scripts/S01_script_to_choose.py'
+
+        '$infiles'
+        $length_seq_max
+        $percent_identity
+        $overlap_length
+        > ${log}
+    ]]>
+    </command>
+
+    <inputs>
+        <param name="inputs" type="data" format="fasta" multiple="true" label="Input files" />
+        <param name="percent_identity" type="integer" value="100" label="Overlap percent identity cutoff" help="Cap3 parameter (-p N); minimum percent identity of an overlap. The specified value should be more than 65%." />
+        <param name="overlap_length" type="integer" value="60" label="Overlap length cutoff" help="Cap3 parameter (-o N); minimum length of an overlap (in base pairs). The specified value should be more than 15 base pairs." />
+        <param name="length_seq_max" type="integer" value="100" label="Minimum sequence length" help="Keep sequences which length is higher than the minimum sequence length  " />
+    </inputs>
+
+    <outputs>
+        <collection name="output_fasta" type="list" label="Filter Assemblies outputs">
+            <discover_datasets pattern="__name_and_ext__" directory="outputs" />
+        </collection>
+        <data format="txt" name="log" label="Filter Assemblies Summary"/>
+    </outputs>
+
+	<tests>
+        <test>
+            <param name="inputs" ftype="fasta" value="trinity/Pfiji_trinity.fasta,trinity/Apomp_trinity.fasta,trinity/Amphi_trinity.fasta,trinity/Acaud_trinity.fasta,velvet/Pg_transcriptome_90109.fasta,velvet/Ap_transcriptome_35099.fasta,velvet/Ac_transcriptome_25591.fasta" />
+            <param name="percent_identity" value="100" />
+            <param name="overlap_length" value="60" />
+            <param name="length_seq_max" value="100" />
+            <output name="log" value="trinity_and_velvet_up.output" />
+            <output_collection name="output_fasta" type="list">
+                <element name="AcAc_transcriptome_25591" value="velvet_out/AcAc_transcriptome_25591.fasta" />
+                <element name="ApAp_transcriptome_35099" value="velvet_out/ApAp_transcriptome_35099.fasta" />
+                <element name="PgPg_transcriptome_90109" value="velvet_out/PgPg_transcriptome_90109.fasta" />
+                <element name="AcAcaud_trinity" value="trinity_out/AcAcaud_trinity.fasta" />
+                <element name="AmAmphi_trinity" value="trinity_out/AmAmphi_trinity.fasta" />
+                <element name="ApApomp_trinity" value="trinity_out/ApApomp_trinity.fasta" />
+                <element name="PfPfiji_trinity" value="trinity_out/PfPfiji_trinity.fasta" />
+            </output_collection>
+        </test>        
+        <test>
+            <param name="inputs" ftype="fasta" value="trinity/Pfiji_trinity.fasta,trinity/Apomp_trinity.fasta,trinity/Amphi_trinity.fasta,trinity/Acaud_trinity.fasta" />
+            <param name="percent_identity" value="100" />
+            <param name="overlap_length" value="60" />
+            <param name="length_seq_max" value="100" />
+            <output name="log" value="trinity_up.output" />
+            <output_collection name="output_fasta" type="list">
+                <element name="AcAcaud_trinity" value="trinity_out/AcAcaud_trinity.fasta" />
+                <element name="AmAmphi_trinity" value="trinity_out/AmAmphi_trinity.fasta" />
+                <element name="ApApomp_trinity" value="trinity_out/ApApomp_trinity.fasta" />
+                <element name="PfPfiji_trinity" value="trinity_out/PfPfiji_trinity.fasta" />
+            </output_collection>
+        </test>
+        <test>
+            <param name="inputs" ftype="fasta" value="velvet/Pg_transcriptome_90109.fasta,velvet/Ap_transcriptome_35099.fasta,velvet/Ac_transcriptome_25591.fasta" />
+            <param name="percent_identity" value="100" />
+            <param name="overlap_length" value="60" />
+            <param name="length_seq_max" value="100" />
+            <output name="log" value="velvet_up.output" />
+            <output_collection name="output_fasta" type="list">
+                <element name="AcAc_transcriptome_25591" value="velvet_out/AcAc_transcriptome_25591.fasta" />
+                <element name="ApAp_transcriptome_35099" value="velvet_out/ApAp_transcriptome_35099.fasta" />
+                <element name="PgPg_transcriptome_90109" value="velvet_out/PgPg_transcriptome_90109.fasta" />
+            </output_collection>
+        </test>
+        
+    </tests>
+
+	<help>
+
+@HELP_AUTHORS@
+
+<![CDATA[
+
+**Description**
+
+This tool reformats Velvet Oases or Trinity assemblies for the AdaptSearch galaxy suite and selects only one variant per gene according to its length and quality check.
+
+---------
+
+**Input format**
+
+(1) Sequences are in the sequential format:
+
+| >seqname1
+| AAAGAGAGACCACATGTCAGTAGC -on one or several lines -
+| >seqname2
+| AAGGCCTGACCACATGAGTTAAGC -on one or several lines -
+| etc ...
+|
+
+2) The file name should begin with a two letter abbreviation of the species name (for isntance, 'Ap' if the species is Alvinella pompejana).
+
+**For Velvet Oases assemblies input**
+            
+    The headers must be as follow : *>Locus_i_Transcript_i/j_Confidence_x.xxx_Length_N* where i is the locus number, j the transcript variant among all versions of the transcript, x.xxx the confidence value and N the length.
+
+**For Trinity assemblies inputs**   
+            
+    The headers must be as follow : *>cj_gj_ij Len=j path=[j:0-j]* where all the j are integers (locus number, transcript variant, length, position...)
+
+**The tool handles the case if input files come from both assemblers (there is no need for input files to be exclusively from one or another assembler).**
+
+---------
+
+**Parameters**
+
+    - 'Input files' : a collection of fasta files (one file per species).
+    - 'Overlap percent identity cutoff' : cap3 -p parameter : minimum percent identity of an overlap.
+        must be > 65 ; default : 100.
+    - 'Overlap length cutoff' (integer) : cap3 -o parameter : minimum length of an overlap (in base pairs).
+        must be > 15 ; default : 60.
+    - 'Minimum sequence length' (integer) : only keep sequences which are longer than the specified value.
+        default : 100.
+
+---------
+
+**Steps**:
+    
+The tool:
+    1) Modifies the sequence name to add the species abbreviation using the 2 first letters of the name of the transcriptome file : note that each species abbreviation must be unique
+    2) Selects one allelic sequence from each transcript (c or locus) using the length of the sequence and its level of confidence
+    3) Selects the best ORF from the sequence between two stop codons
+    4) Performs a CAP3 from the full set of ORFs to minimize redundancy
+    5) Retrieves the initial transcript sequences from the remaining set of proceeded ORF sequences
+
+**Outputs**
+
+    - 'Filter Assemblies Summary' : the log file.
+    - 'Filter Assemblies outputs' : the main results.
+
+---------
+
+**The AdaptSearch Pipeline**
+
+.. image:: adaptsearch_picture_helps.png
+
+---------
+
+Changelog
+---------
+
+**Version 2.1 - 15/01/2018**
+
+    - Input files can be a mix from files coming either from Trinity or Velvet Oases assemblers
+
+**Version 2.0 - 14/04/2017**
+
+    - NEW: Replace the zip between tools by Dataset Collection
+
+**Version 1.0 - 13/04/2017**
+
+    - TEST: Add funtional test with planemo
+    - IMPROVEMENT: Use conda dependencies for cap3, fastaformatter and python
+
+    ]]>
+	</help>
+
+</tool>