Mercurial > repos > rnateam > sortmerna

--- a/sortmerna.xml	Wed Aug 05 02:50:43 2015 -0400
+++ b/sortmerna.xml	Tue Mar 29 07:01:13 2016 -0400
@@ -1,7 +1,7 @@
-<tool id="bg_sortmerna" name="Filter with SortMeRNA" version="2.0.0">
+<tool id="bg_sortmerna" name="Filter with SortMeRNA" version="2.1b.0">
     <description>Fast and accurate filtering of ribosomal RNAs in metatranscriptomic data</description>
     <requirements>
-        <requirement type='package' version="2.0">sortmerna</requirement>
+        <requirement type="package" version="2.1b">sortmerna</requirement>
     </requirements>
     <stdio>
         <regex match="This program builds a Burst trie on an input rRNA database"
@@ -12,6 +12,10 @@
             source="both"
             level="fatal"
             description="The database ${databases} has not been preprocessed using buildtrie before using SortMeRNA." />
+        <regex match="ERROR"
+            source="both"
+            level="fatal"
+            description="indexdb_rna or sortmerna reported an error; see the standard output and standard error of the job for details." /> <!-- Fail the job whenever ERROR appears on stdout or stderr -->
     </stdio>
     <version_command>
 <![CDATA[
@@ -22,13 +26,18 @@
 <![CDATA[
     #set $ref = ''
     #set $sep=''
-    #if str( $databases_type.databases_selector ) == 'history':
+    #if str( $databases_type.databases_selector ) == 'history'
         #for $db in $databases_type.database_name
             #set $ref += $sep + str($db) + ',' + $os.path.splitext($os.path.basename(str($db)))[0]
             #set $sep = ':'
         #end for
-        indexdb_rna --ref $ref
-        &&
+    #else if str( $databases_type.databases_selector ) == 'cached_to_index'
+        ## Paths come from the LOC file; the new index gets a '-reindexed' basename relative to the job directory so the shared data directory is never written to
+        #set $data_table = dict([(_[0], _[2]) for _ in $databases_type.input_databases.input.options.tool_data_table.data])
+        #for $db in $databases_type.input_databases.value
+            #set $ref += $sep + $data_table[$db] + ',' + $os.path.splitext($os.path.basename($data_table[$db]))[0] + '-reindexed'
+            #set $sep = ':'
+        #end for
     #else:
         ## databases path is not directly accessible, must match by hand with LOC file contents
         #set $data_table = dict([(_[0], _[2]) for _ in $databases_type.input_databases.input.options.tool_data_table.data])
@@ -37,24 +46,62 @@
             #set $sep = ':'
         #end for
     #end if
-    sortmerna --ref $ref --reads $input_reads --aligned aligned
-    #if str( $sequencing_type.sequencing_type_selector ) == 'paired'
-        $sequencing_type.paired_type
+    ## (Re)build the index unless the pre-indexed public databases were selected
+    #if str( $databases_type.databases_selector ) != 'cached':
+        indexdb_rna
+            --ref $ref
+            -L $databases_type.seed_length
+            --max_pos $databases_type.max_pos
+        &&
     #end if
-    $strand_search
-    $aligned_fastx.aligned_fastx_selector
-    #if $aligned_fastx.aligned_fastx_selector == '--fastx'
-        #if $aligned_fastx.other
-            --other other_file
+
+    sortmerna
+        --ref $ref
+        --reads $input_reads
+        --aligned aligned
+        ## Paired-end handling: only emitted when the reads are declared paired
+        #if str( $sequencing_type.sequencing_type_selector ) == 'paired'
+            $sequencing_type.paired_type
+        #end if
+        ## Strand selection and optional outputs (FASTA/FASTQ, SAM, and the aligned_blast selection)
+        $strand_search
+        $aligned_fastx.aligned_fastx_selector
+        #if $aligned_fastx.aligned_fastx_selector == '--fastx'
+            #if $aligned_fastx.other
+                --other other_file
+            #end if
+        #end if
+        $aligned_sam.aligned_sam_selector
+        #if $aligned_sam.aligned_sam_selector == '--sam'
+            $aligned_sam.sq
+        #end if
+        $aligned_blast
+        ## Optional statistics file (--log)
+        $log
+        ## Reporting mode: --best with --min_lis, or --num_alignments
+        #if $report.report_type == 'best'
+            #if $report.report_best.report_best_type == '1'
+                #set $best_count = 1
+            #else
+                #set $best_count = $report.report_best.report_best_value
+            #end if
+            --best $best_count
+            --min_lis $report.report_best.report_best_min_lis
+        #else
+            #if $report.report_num_alignments.report_num_alignments_type != 'other_value'
+                --num_alignments $report.report_num_alignments.report_num_alignments_type
+            #else
+                --num_alignments $report.report_num_alignments.report_num_alignments_value
+            #end if
         #end if
-    #end if
-    $aligned_sam.aligned_sam_selector
-    #if $aligned_sam.aligned_sam_selector == '--sam'
-        $aligned_sam.sq
-    #end if
-    $aligned_blast
-    $log
-    -a \${GALAXY_SLOTS:-1}
+        ## E-value threshold, Smith-Waterman scoring values, and the Galaxy-allocated slot count (-a)
+        -e $e_value
+        --match $match
+        --mismatch $mismatch
+        --gap_open $gap_open
+        --gap_ext $gap_ext
+        -N $ambiguous_letter
+        -a \${GALAXY_SLOTS:-1}
 ]]>
     </command>
     <inputs>
@@ -64,6 +111,7 @@
                 <option value="not_paired">Reads are not paired</option>
                 <option value="paired">Reads are paired</option>
             </param>
+            <when value="not_paired" />
             <when value="paired">
                 <param name="paired_type" type="select" display="radio" label="If one of the paired-end reads aligns and the other one does not">
                     <option value="">leave the reads split between aligned and rejected files</option>
@@ -73,7 +121,7 @@
             </when>
         </conditional>

-        <param name="strand_search" type="select" label="Which strands to search" display="radio">
+        <param name="strand_search" type="select" label="Which strands to search">
             <option value="">Search both strands</option>
             <option value="-F">Search only the forward strand (-F)</option>
             <option value="-R">Search only the reverse-complementary strand (-R)</option>
@@ -84,7 +132,8 @@
                 help="Public rRNA databases provided with SortMeRNA have been indexed.
                     On the contrary, personal databases must be indexed each time SortMeRNA is launched.
                     Please be patient, this may take some time depending on the size of the given database.">
-                <option value="cached" selected="true">Public ribosomal databases</option>
+                <option value="cached" selected="true">Public pre-indexed ribosomal databases</option>
+                <option value="cached_to_index">Public ribosomal databases to index with non default parameters</option>
                 <option value="history">Databases from your history</option>
             </param>
             <when value="cached">
@@ -93,25 +142,35 @@
                     <validator type="no_options" message="Select at least one database"/>
                 </param>
             </when>
+            <when value="cached_to_index"> <!-- seed_length and max_pos are handed to indexdb_rna when re-indexing -->
+                <param name="input_databases" label="rRNA databases" type="select" display="checkboxes" multiple="true">
+                    <options from_data_table="rRNA_databases" />
+                    <validator type="no_options" message="Select at least one database"/>
+                </param>
+                <param name="seed_length" type="integer" min="0" max="100" value="18" label="Seed length for database indexing" help="(-L)"/>
+                <param name="max_pos" type="integer" min="0" max="100000" value="10000" label="Maximum number of positions to store for each k-mer for database indexing" help="With 0, all positions are stored (--max_pos)"/>
+            </when>
             <when value="history">
                 <param name="database_name" type="data" format="fasta" multiple="true" label="rRNA databases"
                     help="Your databases will be indexed first, which may take up to several minutes."/>
+                <param name="seed_length" type="integer" min="0" max="100" value="18" label="Seed length for database indexing" help="(-L)"/>
+                <param name="max_pos" type="integer" min="0" max="100000" value="10000" label="Maximum number of positions to store for each k-mer for database indexing" help="With 0, all positions are stored (--max_pos)"/>
             </when>
         </conditional>

         <!-- Outputs -->
         <conditional name="aligned_fastx">
-            <param name="aligned_fastx_selector" type="select" label="Include aligned reads in FASTA/FASTQ format">
+            <param name="aligned_fastx_selector" type="select" label="Include aligned reads in FASTA/FASTQ format?">
                 <option value="--fastx">Yes (--fastx)</option>
                 <option value="">No</option>
             </param>
             <when value="--fastx">
-                <param name="other" type="boolean" label="Include rejected reads file" help="(--other)" />
+                <param name="other" type="boolean" label="Include rejected reads file?" help="(--other)" />
             </when>
             <when value="" />
         </conditional>
         <conditional name="aligned_sam">
-            <param name="aligned_sam_selector" type="select" label="Include alignments in SAM format">
+            <param name="aligned_sam_selector" type="select" label="Include alignments in SAM format?">
                 <option value="--sam">Yes (--sam)</option>
                 <option value="">No</option>
             </param>
@@ -130,6 +189,48 @@
         <param name="log" type="boolean" checked="False" truevalue="--log" falsevalue="" label="Generate statistics file"
                help="Generates statistics for the rRNA content of reads, as well as rRNA subunit distribution. (--log)">
         </param>
+        <conditional name="report"> <!-- Reporting mode: best alignments vs. first N alignments -->
+            <param name="report_type" type="select" label="Parameters for filtering and read mapping" help="">
+                <option value="best" selected="true">Report best alignments per read reaching E-value</option>
+                <option value="num_alignments">Report first alignments per read reaching E-value</option>
+            </param>
+            <when value="best">
+                <conditional name="report_best">
+                    <param name="report_best_type" type="select" label="Number of searched alignments" help="Only the best alignment is reported (--best)">
+                        <option value="1" selected="true">Only one high-candidate reference sequence is searched for alignments (fast; the high-candidate sequences are determined heuristically using a LIS of seed matches)</option>
+                        <option value="other_value">A custom number of reference sequences are searched for alignments (speed decrease for high value)</option>
+                    </param>
+                    <when value="1">
+                        <param name="report_best_min_lis" type="integer" min="0" max="100" value="2" label="Number of longest LIS an alignment needs to be searched" help="Search all alignments having the first INT longest LIS. LIS stands for Longest Increasing Subsequence, it is computed using seeds' positions to expand hits into longer matches prior to Smith-Waterman alignment. (--min_lis)"/>
+                    </when>
+                    <when value="other_value">
+                        <param name="report_best_value" type="integer" min="2" max="100" value="2" label="Number of alignments to be made" help="Only the best one is reported. Computation slows down as this value increases."/>
+                        <param name="report_best_min_lis" type="integer" min="0" max="100" value="2" label="Number of longest LIS an alignment needs to be searched" help="Search all alignments having the first INT longest LIS. LIS stands for Longest Increasing Subsequence, it is computed using seeds' positions to expand hits into longer matches prior to Smith-Waterman alignment. (--min_lis)"/>
+                    </when>
+                </conditional>
+            </when>
+            <when value="num_alignments">
+                <conditional name="report_num_alignments">
+                    <param name="report_num_alignments_type" type="select" label="Number of output alignments" help="(--num_alignments)">
+                        <option value="0">All alignments reaching the E-value threshold are reported (very slow, this option is not suggested for high similarity rRNA databases)</option>
+                        <option value="1" selected="true">Only the first alignment passing the E-value threshold is reported (very fast, best choice if only filtering is needed)</option>
+                        <option value="other_value">A custom number of alignments are made and reported (speed decrease for high value)</option>
+                    </param>
+                    <when value="0" />
+                    <when value="1" />
+                    <when value="other_value">
+                        <param name="report_num_alignments_value" type="integer" min="0" max="100" value="1" label="Number of alignments to be made and reported" help=""/>
+                    </when>
+                </conditional>
+            </when>
+        </conditional>
+
+        <param name="e_value" type="float" min="0" max="10" value="1" label="E-value threshold" help="(-e)"/>
+        <param name="match" type="integer" min="0" max="10" value="2" label="SW score for a match" help="(--match)"/>
+        <param name="mismatch" type="integer" min="-10" max="0" value="-3" label="SW penalty for a mismatch" help="(--mismatch)"/>
+        <param name="gap_open" type="integer" min="0" max="10" value="5" label="SW penalty for introducing a gap" help="(--gap_open)"/>
+        <param name="gap_ext" type="integer" min="0" max="10" value="2" label="SW penalty for extending a gap" help="(--gap_ext)"/>
+        <param name="ambiguous_letter" type="integer" min="-10" max="0" value="-3" label="SW penalty for ambiguous letters (N's)" help="(-N)"/>
     </inputs>
     <outputs>
         <data format_source="input_reads" name="output_fastx" from_work_dir="aligned.dat"
@@ -245,8 +346,10 @@

 .. class:: warningmark

-Note that your personal databases are indexed each time, and that
-this may take some time depending on the size of the given database.
+Note that your personal databases are indexed each time. The public ribosomal
+databases are indexed when added, but they can be re-indexed with non-default indexing
+parameters. The indexing may take some time depending on the size of the given database.
+
 ]]>
     </help>
--- a/tool_dependencies.xml	Wed Aug 05 02:50:43 2015 -0400
+++ b/tool_dependencies.xml	Tue Mar 29 07:01:13 2016 -0400
@@ -1,18 +1,49 @@
 <?xml version="1.0"?>
 <tool_dependency>
-    <package name="sortmerna" version="2.0">
+    <package name="sortmerna" version="2.1b">
         <install version="1.0">
-            <actions>
-                <action type="download_by_url" target_filename="sortmerna-2.0.tar.gz">https://github.com/biocore/sortmerna/archive/2.0.tar.gz</action>
-                <action type="autoconf"/>
-                <action type="set_environment">
-                    <environment_variable name="SORTMERNADIR" action="set_to">$INSTALL_DIR</environment_variable>
-                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
-                </action>
-            </actions>
+            <actions_group>
+                <actions architecture="x86_64" os="linux"> <!-- Linux x86_64: build.sh, then make install as a separate step -->
+                    <action type="download_by_url" target_filename="sortmerna-2.1b.tar.gz">https://github.com/biocore/sortmerna/archive/2.1b.tar.gz</action>
+                    <action type="shell_command"><![CDATA[
+                        ./build.sh --prefix=$INSTALL_DIR
+                    ]]>
+                    </action>
+                    <action type="shell_command">make install</action>
+                    <action type="make_directory">$INSTALL_DIR/rRNA_databases/</action>
+                    <action type="move_directory_files">
+                        <source_directory>rRNA_databases/</source_directory>
+                        <destination_directory>$INSTALL_DIR/rRNA_databases/</destination_directory>
+                    </action>
+                    <action type="set_environment">
+                        <environment_variable name="SORTMERNADIR" action="set_to">$INSTALL_DIR/</environment_variable>
+                        <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
+                    </action>
+                </actions>
+                <actions architecture="x86_64" os="darwin"> <!-- darwin x86_64: same steps in one shell command, with CC/CXX set to gcc-mp-4.8 / g++-mp-4.8 -->
+                    <action type="download_by_url" target_filename="sortmerna-2.1b.tar.gz">https://github.com/biocore/sortmerna/archive/2.1b.tar.gz</action>
+                    <action type="shell_command"><![CDATA[
+                        export CC=gcc-mp-4.8 && export CXX=g++-mp-4.8 && ./build.sh --prefix=$INSTALL_DIR && make install
+                    ]]>
+                    </action>
+                    <action type="make_directory">$INSTALL_DIR/rRNA_databases/</action>
+                    <action type="move_directory_files">
+                        <source_directory>rRNA_databases/</source_directory>
+                        <destination_directory>$INSTALL_DIR/rRNA_databases/</destination_directory>
+                    </action>
+                    <action type="set_environment">
+                        <environment_variable name="SORTMERNADIR" action="set_to">$INSTALL_DIR/</environment_variable>
+                        <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
+                    </action>
+                </actions>
+            </actions_group>
         </install>
         <readme>
-        SortMeRNA requires g++ 4.3 or later. Installation may take a moment since ribosomal databases have to be indexed.
+        SortMeRNA requires g++ 4.8 or later.
+
+        Note: the Clang compiler on Mac (distributed through Xcode) does not
+        support multithreading. Mac users are advised to install the original
+        GCC compiler via MacPorts.
         </readme>
     </package>
 </tool_dependency>