diff fastaptamer_cluster_1.xml @ 0:307254415eb1 draft

Uploaded
author fastaptamer
date Tue, 10 Feb 2015 14:30:29 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastaptamer_cluster_1.xml	Tue Feb 10 14:30:29 2015 -0500
@@ -0,0 +1,65 @@
+<tool id="fastaptamer_cluster_1_0_2" name="FASTAptamer-Cluster" version="1.0.2">
+
+	<description>Cluster closely-related sequences using Levenshtein edit distance.</description>
+	
+	<version_command>fastaptamer_cluster -v</version_command>
+	
+	<command interpreter="perl">fastaptamer_cluster -i $input -o $output -d $distance -f $filter > $report
+	</command>
+
+	<inputs>
+		<param name="input" type="data" format="fasta" label="Input file"  help="Must use FASTA output from FASTAptamer-Count"></param>
+		<param name="distance" type="integer" label="Levenshtein Edit Distance" value="1" help="Minimum number of insertions, deletions, or substitutions required to transform one sequence into another"></param>
+		<param name="filter" type="integer" label="Read Filter" optional="true" value="1" help="Only sequences with total reads greater than the value supplied will be clustered."></param> 
+ 	</inputs>
+ 	
+    <outputs>
+    	<data name="output" format="fasta" label="FASTAptamer-Cluster output file"></data>
+	<data name="report" format="txt" label="FASTAptamer-Cluster Report"></data>
+ 	</outputs>
+ 	
+    <help>
+
+.. class:: warningmark
+
+FASTAptamer-Cluster requires a FASTA formatted input file generated by FASTAptamer-Count.
+
+.. class:: warningmark
+
+FASTAptamer-Cluster uses an exhaustive approach to clustering and can take *several* hours to process. For faster processing, use the "Read Filter" option to exclude low-read sequences.
+
+------
+
+**FASTAptamer-Cluster** uses the Levenshtein algorithm to cluster together closely-related sequences based on a user-defined edit distance (*the minimum number of insertions, deletions, or substitutions required to transform one string into another*).
+
+FASTAptamer-Cluster begins with the most abundant sequence in a population, referred to as the "seed sequence," and clusters with it every sequence in the file within an edit distance less than or equal to the specified edit distance (Cluster #1). The next most abundant unclustered sequence then serves as the next seed sequence for assembling the second cluster from the remaining sequences (Cluster #2), followed by the next most abundant unclustered sequence (Cluster #3), and so on. This process is iterated until every sequence is clustered.
+
+Output is FASTA formatted with the following information on the FASTA identifier line:
+
+	>Rank-Reads-RPM-Cluster#-RankWithinCluster-EditDistanceFromSeedSequence
+
+.. class:: infomark
+
+The "Read Filter" excludes from the clustering process sequences with a total number of reads less than or equal to the integer supplied. Because of the computational complexity of clustering large datasets, the default filter setting of 1 is designed to eliminate singleton sequences from clustering.
+
+------
+
+.. image:: 
+	http://burkelab.missouri.edu/images/fastaptamer-logo-xs.png
+	:height: 98
+	:width: 300
+
+For more information on FASTAptamer, visit our website_.
+
+FASTAptamer is distributed under a GNU GPL v3.0 license. For the complete license, click here_.
+
+.. _here: http://burkelab.missouri.edu/fastaptamer/LICENSE.txt
+.. _website: http://burkelab.missouri.edu/fastaptamer.html
+
+    </help>
+
+ 	<citations>
+    	<citation type="doi">doi:10.1038/mtna.2015.4</citation>
+    </citations>
+	
+</tool>