Mercurial > repos > galaxyp > nbic_fasta
diff ExtractPeptideSequenceContext.xml @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ExtractPeptideSequenceContext.xml Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,172 @@ +<!-- +# ===================================================== +# $Id: ExtractPeptideSequenceContext.xml 90 2011-01-19 13:20:31Z pieter.neerincx@gmail.com $ +# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractPeptideSequenceContext.xml $ +# $LastChangedDate: 2011-01-19 07:20:31 -0600 (Wed, 19 Jan 2011) $ +# $LastChangedRevision: 90 $ +# $LastChangedBy: pieter.neerincx@gmail.com $ +# ===================================================== +--> +<tool id="ExtractPeptideSequenceContext1" version="0.1" name="Extract Peptide Context"> + <description>by mapping peptides back to proteins and extending them on both termini to include their sequence context.</description> + <command interpreter="perl">ExtractPeptideSequenceContext.pl --db $db --dbf FASTA --f $fragments --icol $icol --pcol $pcol $strip --pepo $pepo --n $n --c $c --pc '$pc' --ll WARN</command> + <inputs> + <param name="fragments" type="data" format="tabular" label="Peptide sequences and their protein's identifiers" + help="(in tab delimited format)"/> + <param name="icol" type="data_column" value="1" data_ref="fragments" label="Protein identifier column"/> + <param name="pcol" type="data_column" value="2" data_ref="fragments" label="Peptide sequence column"/> + <!-- + <param name="icol" type="integer" value="1" label="Protein identifier column"/> + <param name="pcol" type="integer" value="2" label="Peptide sequence column"/> + --> + <param name="strip" type="select"> + <label>Lowercase characters in the peptide sequences represent</label> + <option value="--s">Modifications</option> + <option value="">Amino acids</option> + </param> + <param name="db" type="data" format="fasta" label="Protein sequences" + help="(in FASTA format)"/> + <param name="n" type="integer" value="5" label="N-terminal sequence context length"/> + <param name="c" 
type="integer" value="5" label="C-terminal sequence context length"/> + <param name="pc" type="select" help="to fill positions in the sequence context when the protein was too short for a full length context."> + <label>Padding character</label> + <option value="-">dash</option> + <option value=" ">space</option> + <option value="">none</option> + </param> + </inputs> + <outputs> + <data name="pepo" format="tabular" label="Peptide sequence contexts for ${fragments.name}"/> + </outputs> +<!-- + <tests> + <test> + <param name="input" value="*.fasta"/> + <param name="identifiers" value="*.txt"/> + <output name="output" file="*.fasta"/> + </test> + </tests> +--> + <help> + +.. role:: raw-html(raw) + :format: html + +.. class:: infomark + +**What it does** + +Map peptide sequences back to proteins and extend the peptides on both termini to include their sequence context. + +:raw-html:`<object data="static/images/nbic_gmr/ExtractPeptideSequenceContext.svg" type="image/svg+xml" width="100%"/>` + +=================================================== +*Peptide sequences and their protein's identifiers* +=================================================== + +This file must contain at least peptides and accession numbers or IDs of the proteins the peptides were derived from. \ +The data must be in TAB delimited format and may contain other columns, which will be preserved in the output. \ +If a sequence context was found, it will be appended in a new column to the right of the existing columns. \ +When another sequence context was found for the same peptide, it will be appended as an extra row in the output. +Protein accession numbers / IDs must be in the same format as was used in the FASTA file with protein sequences (database). \ +The only exception to this rule is that accession numbers / IDs may be optionally suffixed with the peptide's position in its protein between brackets. 
\ +For example: CLH1_HUMAN[1612-1620] will be matched to CLH1_HUMAN in a FASTA file with protein sequences. \ +Amino acids in the peptide sequences must be in uppercase. + +=============================================== +*Protein sequences* +=============================================== + +Input file containing all protein sequences in FASTA format. \ +This tool will look for any type of protein ID in the first part of FASTA sequence headers up until the first white space. \ +Optionally multiple IDs may be present separated with pipe symbols (|) or semicolons (;). \ +Optionally IDs may be prefixed with a database namespace and a colon (:). \ +For example the accession number P32234 as well as the ID 128UP_DROME would be recognized in both this sequence header: + + >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +and in this one: + + >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) + +=================================================== +*N-terminal and C-terminal sequence context length* +=================================================== + +Integers specifying the length of the N-terminal and C-terminal sequence context to retrieve starting from the peptide termini. \ +So the total sequence context length for a peptide will be: +(N-terminal sequence context) + (length of the peptide) + (C-terminal sequence context). + +=============================================== +*Padding character* +=============================================== + +Optional padding character to fill N-terminal or C-terminal positions in the sequence context, \ +when the protein was too short to get a complete sequence context. \ +Defaults to - a.k.a. dash or alignment gap character. \ + +----- + +**Getting input data** + +.. 
_my folder utility: http://mascotinternal.chem.uu.nl/mascot/cgi/uu_myfolder.pl + +This tool requires \ +peptide sequences in TAB delimited format and \ +protein sequences from which the peptides were derived in FASTA format. \ +If your peptide sequences are not in TAB delimited format, you can convert from: + + - FASTA format using *FASTA manipulation* -> *FASTA-to-Tabular* + - A format using a different delimiter using *Text Manipulation* -> *Convert* + +When your peptides were derived from a mass spectrometry experiment and identified with a search engine like Mascot, Sequest, etc., \ +please make sure you provide the same FASTA database for this tool as the one used for your search. +If you used Mascot hosted by the Biomolecular Mass Spectrometry and Proteomics Group @ Utrecht University, \ +you can use the `my folder utility`_ to download the FASTA databases from the Mascot server. + +----- + +**Examples** + +Example input for peptides identified with a Mascot search, \ +some with phosphorylated residues indicated by pS, pT or pY \ +and in TAB delimited format:: + + sequence score peptide mr mass delta (abs) mass delta (ppm) all protein matches + AGNAARDN 54.24 787.357254 -4.223E-5 -0.05334300253990 H2A1B_HUMAN[67-74] + KLpSAAVVLI 11.48 912.600784 0.001608 1.7619971713721432 OSGI2_HUMAN[405-413] + RAGIKVpTVA 23.01 913.570892 6.283E-5 0.06786555979719196 PARK7_HUMAN[28-36] + KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 P04075[101-110] + KIKELQAF 11.87 975.575287 0.003907 4.00481649347068 O60882[71-78] + KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689] + KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245] + KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620] + +=============================================== +*Appending peptide sequence contexts* +=============================================== + +With these options: + + - c6 as *Protein identifier column* + - c1 as 
*Peptide sequence column* + - 5 as *N-terminal sequence context length* + - 5 as *C-terminal sequence context length* + - a suitable FASTA database with *Protein sequences* + - and everything else set to defaults + +the example above will generate a result like this:: + + AGNAARDN 54.24 787.357254 -4.223E-5 -0.05334300253990 H2A1B_HUMAN[67-74] EILELAGNAARDNKKTRI + KLpSAAVVLI 11.48 912.600784 0.001608 1.7619971713721432 OSGI2_HUMAN[405-413] LKKIFKLSAAVVLIGSHPN + RAGIKVpTVA 23.01 913.570892 6.283E-5 0.06786555979719196 PARK7_HUMAN[28-36] VDVMRRAGIKVTVAGLAGK + KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 P04075[101-110] QVIKSKGGVVGIKVDKGVVP + KIKELQAF 11.87 975.575287 0.003907 4.00481649347068 O60882[71-78] NSMIRKIKELQAFFGLQV + KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689] VGEKEKISGTVNIRTRDNK + KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245] AILEYKLYEALKFIMLYQ + KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620] LTKVDKLDASESLRKEEEQ + +Note the header line was ignored. + + </help> +</tool>