diff ExtractCleavageSiteSequenceContext.xml @ 0:163892325845 draft default tip

Initial commit.
author galaxyp
date Fri, 10 May 2013 17:15:08 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ExtractCleavageSiteSequenceContext.xml	Fri May 10 17:15:08 2013 -0400
@@ -0,0 +1,218 @@
+<!-- 
+# =====================================================
+# $Id: ExtractCleavageSiteSequenceContext.xml 113 2011-03-04 16:59:11Z pieter.neerincx@gmail.com $
+# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractCleavageSiteSequenceContext.xml $
+# $LastChangedDate: 2011-03-04 10:59:11 -0600 (Fri, 04 Mar 2011) $ 
+# $LastChangedRevision: 113 $
+# $LastChangedBy: pieter.neerincx@gmail.com $
+# =====================================================
+-->
+<tool id="ExtractPeptideSequenceContext2" version="2.1" name="Extract Cleavage Site Context">
+  <description>by mapping peptides back to proteins and fetching the regions surrounding the peptide termini.</description>
+  <command interpreter="perl">ExtractPeptideSequenceContext.pl --db '$db' --dbf FASTA --f '$fragments' --icol $icol --pcol $pcol $strip --cleo '$cleo' --ca '$ca' --ct $ct --n $n --c $c --pc '$pc' --ll WARN</command><!-- Dataset paths are single-quoted for the shell; $strip stays unquoted so that its empty value adds no argument. -->
+  <inputs>
+    <param name="fragments" type="data" format="tabular" label="Peptide sequences and their protein's identifiers"
+           help="(in tab delimited format)"/>
+    <param name="icol" type="data_column" default_value="1" data_ref="fragments" label="Protein identifier column"/>
+    <param name="pcol" type="data_column" default_value="2" data_ref="fragments" label="Peptide sequence column"/>
+    <!-- The selected value is inserted into the command line as-is through $strip; the empty value adds no flag. -->
+    <param name="strip" type="select" label="Lowercase characters in the peptide sequences represent">
+      <option value="--s">Modifications</option>
+      <option value="">Amino acids</option>
+    </param>
+    <param name="db" type="data" format="fasta" label="Protein sequences"
+           help="(in FASTA format)"/>
+    <param name="n" type="integer" value="5" label="N-terminal sequence context length"/>
+    <param name="c" type="integer" value="5" label="C-terminal sequence context length"/>
+    <!-- The value reaches the script single-quoted, so the space and empty choices survive the shell. -->
+    <param name="pc" type="select" label="Padding character" help="to fill positions in the sequence context when the protein was too short for a full length context.">
+      <option value="-">dash</option>
+      <option value=" ">space</option>
+      <option value="">none</option>
+    </param>
+    <!-- Amino acids in alphabetical order, wildcard last; B, J, O, U and Z are commented out. -->
+    <param name="ca" type="select" label="Protease recognizes amino acid">
+      <option value="A">A</option>
+      <!--<option value="B">B</option>-->
+      <option value="C">C</option>
+      <option value="D">D</option>
+      <option value="E">E</option>
+      <option value="F">F</option>
+      <option value="G">G</option>
+      <option value="H">H</option>
+      <option value="I">I</option>
+      <!--<option value="J">J</option>-->
+      <option value="K">K</option>
+      <option value="L">L</option>
+      <option value="M">M</option>
+      <option value="N">N</option>
+      <!--<option value="O">O</option>-->
+      <option value="P">P</option>
+      <option value="Q">Q</option>
+      <option value="R">R</option>
+      <option value="S">S</option>
+      <option value="T">T</option>
+      <!--<option value="U">U</option>-->
+      <option value="V">V</option>
+      <option value="W">W</option>
+      <option value="Y">Y</option>
+      <!--<option value="Z">Z</option>-->
+      <option value="*">* (any amino acid)</option>
+    </param>
+    <!-- Side of the recognized amino acid at which the protease cuts. -->
+    <param name="ct" type="select" label="Protease cleaves">
+      <option value="C">C-terminal of the recognized amino acid</option>
+      <option value="N">N-terminal of the recognized amino acid</option>
+    </param>
+  </inputs>
+  <outputs><!-- Single tabular dataset; its path is handed to the script through $cleo. -->
+    <data label="Cleavage site sequence contexts for ${fragments.name}" format="tabular" name="cleo"/>
+  </outputs>
+<!--
+  <tests>
+    <test>
+      <param name="input"       value="*.fasta"/>
+      <param name="identifiers" value="*.txt"/>
+      <output name="output"     file="*.fasta"/>
+    </test>
+  </tests>
+-->
+  <help>
+
+.. role:: raw-html(raw)
+   :format: html
+
+.. class:: infomark
+
+**What it does**
+
+Map peptide sequences back to proteins and extract sequence contexts for cleavage sites.
+
+:raw-html:`&lt;object data="static/images/nbic_gmr/ExtractCleavageSiteSequenceContext.svg" type="image/svg+xml" width="100%"/&gt;`
+
+===================================================
+*Peptide sequences and their protein's identifiers*
+===================================================
+
+This file must contain at least peptides and accession numbers or IDs of the proteins the peptides were derived from. \
+The data must be in TAB delimited format and may contain other columns, which will be preserved in the output. \
+If a sequence context was found, it will be appended in a new column to the right of the existing columns. \
+When another sequence context was found for the same peptide, it will be appended as an extra row in the output.
+Protein accession numbers / IDs must be in the same format as was used in the FASTA file with protein sequences (database). \
+The only exception to this rule is that accession numbers / IDs may be optionally suffixed with the peptide\'s position in its protein between brackets. \
+For example: CLH1_HUMAN[1612-1620] will be matched to CLH1_HUMAN in a FASTA file with protein sequences. \
+Amino acids in the peptide sequences must be in uppercase.
+
+===============================================
+*Protein sequences*
+===============================================
+
+Input file containing all protein sequences in FASTA format. \
+This tool will look for any type of protein ID in the first part of FASTA sequence headers up until the first white space. \
+Optionally multiple IDs may be present separated with pipe symbols (|) or semicolons (;). \
+Optionally IDs may be prefixed with a database namespace and a colon (:). \
+For example the accession number P32234 as well as the ID 128UP_DROME would be recognized in both this sequence header: 
+
+   >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)
+
+and in this one:
+
+   >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)
+
+===================================================
+*N-terminal and C-terminal sequence context length*
+===================================================
+
+Integers specifying the length of the N-terminal and C-terminal sequence context to retrieve starting from the cleavage site. \
+Note that the width of a cleavage site is 0 amino acids. \
+When defaults are used for both the N-terminal and C-terminal sequence context lengths, \
+the total sequence context length for a cleavage site will be: 
+(N-terminal sequence context) + (C-terminal sequence context) = 5 + 5 = 10.
+    
+===============================================
+*Cleavage amino acid and terminus*
+===============================================
+
+This tool assumes the peptides were derived from cutting with a proteolytic enzyme, \
+that cuts on the *cleavage terminal* side of all *cleavage amino acids*. \
+When the specificity of the used protease is unknown, \
+you may provide an asterisk (*) to retrieve sequence context for any cleavage site, \
+but in that case this tool will not filter non-specifically cleaved fragments, \
+that may be the result of processes other than protease activity.
+
+===============================================
+*Padding character*
+===============================================
+
+Optional padding character to fill N-terminal or C-terminal positions in the sequence context, \
+when the protein was too short to get a complete sequence context. \
+Defaults to - a.k.a. dash or alignment gap character. \
+
+-----
+ 
+**Getting input data** 
+
+.. _my folder utility: http://mascotinternal.chem.uu.nl/mascot/cgi/uu_myfolder.pl
+
+This tool requires \
+peptide sequences in TAB delimited format and \
+protein sequences from which the peptides were derived in FASTA format. \
+If your peptide sequences are not in TAB delimited format, you can convert from:
+ 
+ - FASTA format using *FASTA manipulation* -&gt; *FASTA-to-Tabular* 
+ - A format using a different delimiter using *Text Manipulation* -&gt; *Convert*
+ 
+When your peptides were derived from a mass spectrometry experiment and identified with a search engine like Mascot, Sequest, etc., \
+please make sure you provide the same FASTA database for this tool as the one used for your search.
+If you used Mascot hosted by the Biomolecular Mass Spectrometry and Proteomics Group @ Utrecht University, \
+you can use the `my folder utility`_ to download the FASTA databases from the Mascot server.  
+
+-----
+
+**Examples**
+
+Example input for peptides identified with a Mascot search, \
+some with phosphorylated residues indicated by pS, pT or pY \
+and in TAB delimited format::
+
+   sequence     score   peptide mr   mass delta (abs)   mass delta (ppm)       all protein matches
+   AGNAARDN     54.24   787.357254   -4.223E-5          -0.05334300253998803   H2A1B_HUMAN[67-74]; H2A1C_HUMAN[67-74]; H2A1D_HUMAN[67-74]
+   KLpSAAVVLI   11.48   912.600784   0.001608           1.7619971713721432     OSGI2_HUMAN[405-413]
+   RAGIKVpTVA   23.01   913.570892   6.283E-5           0.06786555979719196    PARK7_HUMAN[28-36]
+   KGGVVGIKVD   44.61   970.581146   -0.001214          -1.2507970147608864    ALDOA_HUMAN[101-110]
+   KIKELQAF     11.87   975.575287   0.003907           4.004816493470687      MMP20_HUMAN[71-78]
+   KIpSGpTVNIR  57.17   986.587265   -0.002761          -2.798536022051734     SYTC_HUMAN[681-689]
+   KLpYEALKF    17.54   1010.580032  0.004782           4.731935966057164      F105A_HUMAN[238-245]
+   KLDApSEpSLR  31.31   1017.545441  -0.002377          -2.3360136110127785    CLH1_HUMAN[1612-1620]
+
+===============================================
+*Appending cleavage site sequence contexts*
+===============================================
+
+With these options:
+
+ - K as *cleavage amino acid* 
+ - N-terminal as *cleavage terminus*
+ - c6 as *Protein identifier column*
+ - c1 as *Peptide sequence column*
+ - a suitable FASTA database with *Protein sequences*
+ - and everything else set to defaults
+
+the example above will generate a result like this::
+
+   AGNAARDN     54.24   787.357254   -4.223E-5          -0.05334300253998803   H2A1B_HUMAN[67-74]; H2A1C_HUMAN[67-74]; H2A1D_HUMAN[67-74]   AARDNKKTRI
+   KLpSAAVVLI   11.48   912.600784   0.001608           1.7619971713721432     OSGI2_HUMAN[405-413]     LKKIFKLSAA
+   KGGVVGIKVD   44.61   970.581146   -0.001214          -1.2507970147608864    ALDOA_HUMAN[101-110]     QVIKSKGGVV
+   KGGVVGIKVD   44.61   970.581146   -0.001214          -1.2507970147608864    ALDOA_HUMAN[101-110]     GIKVDKGVVP
+   KIKELQAF     11.87   975.575287   0.003907           4.004816493470687      MMP20_HUMAN[71-78]       NSMIRKIKEL
+   KIpSGpTVNIR  57.17   986.587265   -0.002761          -2.798536022051734     SYTC_HUMAN[681-689]      VGEKEKISGT
+   KLpYEALKF    17.54   1010.580032  0.004782           4.731935966057164      F105A_HUMAN[238-245]     AILEYKLYEA
+   KLDApSEpSLR  31.31   1017.545441  -0.002377          -2.3360136110127785    CLH1_HUMAN[1612-1620]    LTKVDKLDAS
+   KLDApSEpSLR  31.31   1017.545441  -0.002377          -2.3360136110127785    CLH1_HUMAN[1612-1620]    SESLRKEEEQ
+
+
+Note the header line was ignored and if peptides were derived from specific LysN cleavage, they will occur twice in the output: \
+once with the sequence context for the peptide\'s N-terminus and once for its C-terminus.
+
+  </help>
+</tool>