Mercurial > repos > galaxyp > nbic_fasta
diff ExtractSeqsFromFasta.xml @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ExtractSeqsFromFasta.xml Fri May 10 17:15:08 2013 -0400
@@ -0,0 +1,91 @@
+<!--
+# =====================================================
+# $Id: ExtractSeqsFromFasta.xml 90 2011-01-19 13:20:31Z pieter.neerincx@gmail.com $
+# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractSeqsFromFasta.xml $
+# $LastChangedDate: 2011-01-19 07:20:31 -0600 (Wed, 19 Jan 2011) $
+# $LastChangedRevision: 90 $
+# $LastChangedBy: pieter.neerincx@gmail.com $
+# =====================================================
+-->
+<!--
+  Galaxy tool wrapper that runs ExtractSeqsFromFasta.pl on a FASTA dataset and
+  a text dataset of IDs, and collects the script's output as a FASTA dataset.
+-->
+<tool id="ExtractSeqsFromFasta1" version="1.1" name="ExtractSeqsFromFasta">
+  <description>Extract sequences from a FASTA file based on a list of IDs</description>
+  <!--
+    Dataset paths are single-quoted so that a path containing spaces or shell
+    metacharacters still reaches the script as a single argument.
+    $ignore_accession_number_versions stays unquoted on purpose: it expands to
+    "-u" or to an empty string, and quoting it would pass an empty argument.
+    "-l WARN" presumably sets the script's log level; confirm in the script.
+  -->
+  <command interpreter="perl">ExtractSeqsFromFasta.pl $ignore_accession_number_versions -f '$identifiers' -i '$input' -o '$output' -l WARN</command>
+  <inputs>
+    <param format="fasta" name="input" type="data" label="FASTA sequences"/>
+    <param format="txt" name="identifiers" type="data" label="List of IDs to extract sequences for"/>
+    <!-- Checked: "-u" is inserted into the command line; unchecked: nothing is inserted. -->
+    <param name="ignore_accession_number_versions" type="boolean" truevalue="-u" falsevalue="" optional="true" label="Ignore accession number versions"/>
+  </inputs>
+  <outputs>
+    <data format="fasta" name="output" label="FASTA sequences for ${identifiers.name}"/>
+  </outputs>
+  <!-- The functional test below is disabled; its value/file attributes look like placeholders, not real test data (confirm before enabling). -->
+<!--
+  <tests>
+    <test>
+      <param name="input" value="*.fasta"/>
+      <param name="identifiers" value="*.txt"/>
+      <output name="output" file="*.fasta"/>
+    </test>
+  </tests>
+-->
+  <help>
+
+.. class:: infomark
+
+**What it does**
+
+This tool filters a set of FASTA sequences for certain identifiers (IDs) or accession numbers. \
+Only sequences whose ID or accession number is present in the supplied list will remain in the filtered FASTA output. \
+The list of IDs or accession numbers to filter for must be a flat text file with one ID or accession per line.
+
+This tool can match IDs with and without colon prefixed database namespaces in the FASTA sequence header line. \
+Hence your FASTA header can contain either >UniProtKB:Q86Y46 ... or just plain >Q86Y46 ... . \
+Database namespace prefixes should not be present in the list of IDs that you want to extract sequences for.
+
+FASTA headers may contain multiple IDs separated with pipe symbols (|) or semicolons (;). \
+If multiple IDs are supplied these should not contain any white space as everything after the \
+first white space is considered to be the (optional) description, which will not be matched against the list \
+of IDs to extract.
+
+If your FASTA file contains versioned IDs / accessions, your list of IDs / accessions to extract must also contain \
+versioned IDs / accessions and the version numbers must match.
+
+-----
+
+**Example**
+
+If the FASTA header is this::
+
+    >IPI:CON_IPI00174775.2|TREMBL:Q32MB2;Q86Y46 Tax_Id=9606 Gene_Symbol=KRT73 Keratin-73
+
+The following IDs / accession numbers will match this sequence header::
+
+    CON_IPI00174775.2
+    Q32MB2
+    Q86Y46
+
+These will not match::
+
+    IPI:CON_IPI00174775.2 (prefix should be removed)
+    KRT73 (ID is part of the description, not of the list of IDs,
+           which is everything up until the first white space.)
+
+And finally these will not match unless *ignore accession number versions* is enabled::
+
+    CON_IPI00174775 (no version number, while FASTA file does contain versioned accession numbers)
+    CON_IPI00174775.1 (wrong version number)
+
+  </help>
+</tool>