diff ExtractSeqsFromFasta.xml @ 0:163892325845 draft default tip

Initial commit.
author galaxyp
date Fri, 10 May 2013 17:15:08 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ExtractSeqsFromFasta.xml	Fri May 10 17:15:08 2013 -0400
@@ -0,0 +1,78 @@
+<!-- 
+# =====================================================
+# $Id: ExtractSeqsFromFasta.xml 90 2011-01-19 13:20:31Z pieter.neerincx@gmail.com $
+# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractSeqsFromFasta.xml $
+# $LastChangedDate: 2011-01-19 07:20:31 -0600 (Wed, 19 Jan 2011) $ 
+# $LastChangedRevision: 90 $
+# $LastChangedBy: pieter.neerincx@gmail.com $
+# =====================================================
+-->
+<tool id="ExtractSeqsFromFasta1" version="1.1" name="ExtractSeqsFromFasta">
+  <description>Extract sequences from a FASTA file based on a list of IDs</description>
+  <command interpreter="perl">ExtractSeqsFromFasta.pl $ignore_accession_number_versions -f '$identifiers' -i '$input' -o '$output' -l WARN</command> <!-- Dataset paths are single-quoted so each reaches the script as one argument; the boolean flag stays unquoted because it may expand to an empty string. -->
+  <inputs>
+    <param name="input" type="data" format="fasta" label="FASTA sequences" />
+    <param name="identifiers" type="data" format="txt" label="List of IDs to extract sequences for" />
+    <param name="ignore_accession_number_versions" type="boolean" label="Ignore accession number versions" optional="true" truevalue="-u" falsevalue="" /> <!-- truevalue / falsevalue are the text this parameter expands to on the command line -->
+  </inputs>
+  <outputs>
+    <data name="output" format="fasta" label="FASTA sequences for ${identifiers.name}" />
+  </outputs>
+<!--
+  <tests>
+    <test>
+      <param name="input"       value="*.fasta"/>
+      <param name="identifiers" value="*.txt"/>
+      <output name="output"     file="*.fasta"/>
+    </test>
+  </tests>
+-->
+  <help>
+
+.. class:: infomark
+
+**What it does**
+
+This tool filters a set of FASTA sequences for certain identifiers (IDs) or accession numbers. \
+Only sequences whose ID or accession number is present in the supplied list will remain in the filtered FASTA output. \
+The list of IDs or accession numbers to filter for must be a flat text file with one ID or accession per line.
+
+This tool can match IDs with and without colon prefixed database namespaces in FASTA sequence header lines. \
+Hence your FASTA header can contain either &gt;UniProtKB:Q86Y46 ... or just plain &gt;Q86Y46 ... \
+Database namespace prefixes should not be present in the list of IDs that you want to extract sequences for.
+
+FASTA headers may contain multiple IDs separated with pipe symbols (|) or semicolons (;). \
+If multiple IDs are supplied, these should not contain any white space as everything after the \
+first white space is considered to be the (optional) description, which will not be matched against the list \
+of IDs to extract.
+
+If your FASTA file contains versioned IDs / accessions, your list of IDs / accessions to extract must also contain \
+versioned IDs / accessions and the version numbers must match, unless *Ignore accession number versions* is enabled.
+
+-----
+
+**Example**
+
+If the FASTA header is this::
+
+   &gt;IPI:CON_IPI00174775.2|TREMBL:Q32MB2;Q86Y46 Tax_Id=9606 Gene_Symbol=KRT73 Keratin-73
+
+The following IDs / accession numbers will match this sequence header::
+
+   CON_IPI00174775.2
+   Q32MB2
+   Q86Y46
+
+These will not match::
+
+   IPI:CON_IPI00174775.2 (prefix should be removed)
+   KRT73                 (part of the description, not of the header's list of IDs,
+                          which is everything up until the first white space.)
+
+And finally these will not match unless *ignore accession number versions* is enabled::
+
+   CON_IPI00174775       (no version number, while FASTA file does contain versioned accession numbers)
+   CON_IPI00174775.1     (wrong version number)
+
+  </help>
+</tool>