Mercurial > repos > galaxyp > nbic_fasta
diff ExtractCleavageSiteSequenceContext.xml @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
line wrap: on
line diff
<!--
# =====================================================
# $Id: ExtractCleavageSiteSequenceContext.xml 113 2011-03-04 16:59:11Z pieter.neerincx@gmail.com $
# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractCleavageSiteSequenceContext.xml $
# $LastChangedDate: 2011-03-04 10:59:11 -0600 (Fri, 04 Mar 2011) $
# $LastChangedRevision: 113 $
# $LastChangedBy: pieter.neerincx@gmail.com $
# =====================================================
-->
<!--
  Galaxy tool wrapper around ExtractPeptideSequenceContext.pl.
  Takes a tabular file with peptides plus protein identifiers and a FASTA file
  with protein sequences, and writes a tabular file with cleavage site
  sequence contexts.
-->
<tool id="ExtractPeptideSequenceContext2" version="2.1" name="Extract Cleavage Site Context">
  <description>by mapping peptides back to proteins and fetching the regions surrounding the peptide termini.</description>
  <!--
    Dataset paths ($db, $fragments, $cleo) are single-quoted so the shell passes
    each one as a single argument. $ca and $pc are quoted because their option
    values include an asterisk, a space and the empty string. $strip is left
    unquoted on purpose: its second option has an empty value, which must add
    no argument at all to the command line.
  -->
  <command interpreter="perl">ExtractPeptideSequenceContext.pl --db '$db' --dbf FASTA --f '$fragments' --icol $icol --pcol $pcol $strip --cleo '$cleo' --ca '$ca' --ct $ct --n $n --c $c --pc '$pc' --ll WARN</command>
  <inputs>
    <!-- Tabular input holding the peptides and the identifiers of their proteins. -->
    <param name="fragments" type="data" format="tabular" label="Peptide sequences and their protein's identifiers"
           help="(in tab delimited format)"/>
    <!-- Column selectors; both refer to columns of the "fragments" dataset. -->
    <param name="icol" type="data_column" default_value="1" data_ref="fragments" label="Protein identifier column"/>
    <param name="pcol" type="data_column" default_value="2" data_ref="fragments" label="Peptide sequence column"/>
    <!-- The selected option value is inserted verbatim into the command line. -->
    <param name="strip" type="select">
      <label>Lowercase characters in the peptide sequences represent</label>
      <option value="--s">Modifications</option>
      <option value="">Amino acids</option>
    </param>
    <!-- FASTA database the peptides are mapped back to. -->
    <param name="db" type="data" format="fasta" label="Protein sequences"
           help="(in FASTA format)"/>
    <!-- Context lengths on either side of the cleavage site; both default to 5. -->
    <param name="n" type="integer" value="5" label="N-terminal sequence context length"/>
    <param name="c" type="integer" value="5" label="C-terminal sequence context length"/>
    <param name="pc" type="select" help="to fill positions in the sequence context when the protein was too short for a full length context.">
      <label>Padding character</label>
      <option value="-">dash</option>
      <option value=" ">space</option>
      <option value="">none</option>
    </param>
    <!--
      Amino acid recognized by the protease. The letters B, J, O, U and Z are
      deliberately commented out.
      NOTE(review): the wildcard option sits between W and Y, i.e. where X would
      be alphabetically; presumably intentional, confirm before reordering.
    -->
    <param name="ca" type="select">
      <label>Protease recognizes amino acid</label>
      <option value="A">A</option>
      <!--<option value="B">B</option>-->
      <option value="C">C</option>
      <option value="D">D</option>
      <option value="E">E</option>
      <option value="F">F</option>
      <option value="G">G</option>
      <option value="H">H</option>
      <option value="I">I</option>
      <!--<option value="J">J</option>-->
      <option value="K">K</option>
      <option value="L">L</option>
      <option value="M">M</option>
      <option value="N">N</option>
      <!--<option value="O">O</option>-->
      <option value="P">P</option>
      <option value="Q">Q</option>
      <option value="R">R</option>
      <option value="S">S</option>
      <option value="T">T</option>
      <!--<option value="U">U</option>-->
      <option value="V">V</option>
      <option value="W">W</option>
      <option value="*">* (any amino acid)</option>
      <option value="Y">Y</option>
      <!--<option value="Z">Z</option>-->
    </param>
    <!-- Side of the recognized amino acid on which the protease cuts. -->
    <param name="ct" type="select">
      <label>Protease cleaves</label>
      <option value="C">C-terminal of the recognized amino acid</option>
      <option value="N">N-terminal of the recognized amino acid</option>
    </param>
  </inputs>
  <outputs>
    <!-- Tabular result; its path is handed to the script via the cleo argument. -->
    <data name="cleo" format="tabular" label="Cleavage site sequence contexts for ${fragments.name}"/>
  </outputs>
<!--
  <tests>
    <test>
      <param name="input" value="*.fasta"/>
      <param name="identifiers" value="*.txt"/>
      <output name="output" file="*.fasta"/>
    </test>
  </tests>
-->
  <!--
    The help is wrapped in CDATA so that the literal object tag used by the
    raw-html role below is treated as help text instead of being parsed as a
    child element of the help element.
  -->
  <help><![CDATA[

.. role:: raw-html(raw)
   :format: html

.. class:: infomark

**What it does**

Map peptide sequences back to proteins and extract sequence contexts for cleavage sites.

:raw-html:`<object data="static/images/nbic_gmr/ExtractCleavageSiteSequenceContext.svg" type="image/svg+xml" width="100%"/>`

===================================================
*Peptide sequences and their protein's identifiers*
===================================================

This file must contain at least peptides and accession numbers or IDs of the proteins the peptides were derived from. \
The data must be in TAB delimited format and may contain other columns, which will be preserved in the output. \
If a sequence context was found, it will be appended in a new column to the right of the existing columns. \
When another sequence context was found for the same peptide, it will be appended as an extra row in the output.
Protein accession numbers / IDs must be in the same format as was used in the FASTA file with protein sequences (database). \
The only exception to this rule is that accession numbers / IDs may be optionally suffixed with the peptide\'s position in its protein between brackets. \
For example: CLH1_HUMAN[1612-1620] will be matched to CLH1_HUMAN in a FASTA file with protein sequences. \
Amino acids in the peptide sequences must be in uppercase.

===============================================
*Protein sequences*
===============================================

Input file containing all protein sequences in FASTA format. \
This tool will look for any type of protein ID in the first part of FASTA sequence headers up until the first white space. \
Optionally multiple IDs may be present separated with pipe symbols (|) or semicolons (;). \
Optionally IDs may be prefixed with a database namespace and a colon (:). \
For example the accession number P32234 as well as the ID 128UP_DROME would be recognized in both this sequence header:

  >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)

and in this one:

  >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)

===================================================
*N-terminal and C-terminal sequence context length*
===================================================

Integers specifying the length of the N-terminal and C-terminal sequence context to retrieve starting from the cleavage site. \
Note that the width of a cleavage site is 0 amino acids. \
When defaults are used for both the N-terminal and C-terminal sequence context lengths, \
the total sequence context length for a cleavage site will be:
(N-terminal sequence context) + (C-terminal sequence context) = 5 + 5 = 10.

===============================================
*Cleavage amino acid and terminus*
===============================================

This tool assumes the peptides were derived from cutting with a proteolytic enzyme, \
that cuts on the *cleavage terminal* side of all *cleavage amino acids*. \
When the specificity of the used protease is unknown, \
you may provide an asterisk (*) to retrieve sequence context for any cleavage site, \
but in that case this tool will not filter non-specifically cleaved fragments, \
that may be the result of processes other than protease activity.

===============================================
*Padding character*
===============================================

Optional padding character to fill N-terminal or C-terminal positions in the sequence context, \
when the protein was too short to get a complete sequence context. \
Defaults to - a.k.a. dash or alignment gap character.

-----

**Getting input data**

.. _my folder utility: http://mascotinternal.chem.uu.nl/mascot/cgi/uu_myfolder.pl

This tool requires \
peptide sequences in TAB delimited format and \
protein sequences from which the peptides were derived in FASTA format. \
If your peptide sequences are not in TAB delimited format, you can convert from:

  - FASTA format using *FASTA manipulation* -> *FASTA-to-Tabular*
  - A format using a different delimiter using *Text Manipulation* -> *Convert*

When your peptides were derived from a mass spectrometry experiment and identified with a search engine like Mascot, Sequest, etc., \
please make sure you provide the same FASTA database for this tool as the one used for your search.
If you used Mascot hosted by the Biomolecular Mass Spectrometry and Proteomics Group @ Utrecht University, \
you can use the `my folder utility`_ to download the FASTA databases from the Mascot server.

-----

**Examples**

Example input for peptides identified with a Mascot search, \
some with phosphorylated residues indicated by pS, pT or pY \
and in TAB delimited format::

  sequence     score  peptide mr   mass delta (abs)  mass delta (ppm)      all protein matches
  AGNAARDN     54.24  787.357254   -4.223E-5         -0.05334300253998803  H2A1B_HUMAN[67-74]; H2A1C_HUMAN[67-74]; H2A1D_HUMAN[67-74]
  KLpSAAVVLI   11.48  912.600784   0.001608          1.7619971713721432    OSGI2_HUMAN[405-413]
  RAGIKVpTVA   23.01  913.570892   6.283E-5          0.06786555979719196   PARK7_HUMAN[28-36]
  KGGVVGIKVD   44.61  970.581146   -0.001214         -1.2507970147608864   ALDOA_HUMAN[101-110]
  KIKELQAF     11.87  975.575287   0.003907          4.004816493470687     MMP20_HUMAN[71-78]
  KIpSGpTVNIR  57.17  986.587265   -0.002761         -2.798536022051734    SYTC_HUMAN[681-689]
  KLpYEALKF    17.54  1010.580032  0.004782          4.731935966057164     F105A_HUMAN[238-245]
  KLDApSEpSLR  31.31  1017.545441  -0.002377         -2.3360136110127785   CLH1_HUMAN[1612-1620]

===============================================
*Appending cleavage site sequence contexts*
===============================================

With these options:

  - K as *cleavage amino acid*
  - N-terminal as *cleavage terminus*
  - c6 as *Protein identifier column*
  - c1 as *Peptide sequence column*
  - a suitable FASTA database with *Protein sequences*
  - and everything else set to defaults

the example above will generate a result like this::

  AGNAARDN     54.24  787.357254   -4.223E-5  -0.05334300253998803  H2A1B_HUMAN[67-74]; H2A1C_HUMAN[67-74]; H2A1D_HUMAN[67-74]  AARDNKKTRI
  KLpSAAVVLI   11.48  912.600784   0.001608   1.7619971713721432    OSGI2_HUMAN[405-413]   LKKIFKLSAA
  KGGVVGIKVD   44.61  970.581146   -0.001214  -1.2507970147608864   ALDOA_HUMAN[101-110]   QVIKSKGGVV
  KGGVVGIKVD   44.61  970.581146   -0.001214  -1.2507970147608864   ALDOA_HUMAN[101-110]   GIKVDKGVVP
  KIKELQAF     11.87  975.575287   0.003907   4.004816493470687     MMP20_HUMAN[71-78]     NSMIRKIKEL
  KIpSGpTVNIR  57.17  986.587265   -0.002761  -2.798536022051734    SYTC_HUMAN[681-689]    VGEKEKISGT
  KLpYEALKF    17.54  1010.580032  0.004782   4.731935966057164     F105A_HUMAN[238-245]   AILEYKLYEA
  KLDApSEpSLR  31.31  1017.545441  -0.002377  -2.3360136110127785   CLH1_HUMAN[1612-1620]  LTKVDKLDAS
  KLDApSEpSLR  31.31  1017.545441  -0.002377  -2.3360136110127785   CLH1_HUMAN[1612-1620]  SESLRKEEEQ


Note the header line was ignored and if peptides were derived from specific LysN cleavage, they will occur twice in the output: \
once with the sequence context for the peptide\'s N-terminus and once for its C-terminus.

  ]]></help>
</tool>