comparison ExtractMiscleavageSiteSequenceContext.xml @ 0:163892325845 draft default tip

Initial commit.
author galaxyp
date Fri, 10 May 2013 17:15:08 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:163892325845
1 <!--
2 # =====================================================
3 # $Id: ExtractMiscleavageSiteSequenceContext.xml 90 2011-01-19 13:20:31Z pieter.neerincx@gmail.com $
4 # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractMiscleavageSiteSequenceContext.xml $
5 # $LastChangedDate: 2011-01-19 07:20:31 -0600 (Wed, 19 Jan 2011) $
6 # $LastChangedRevision: 90 $
7 # $LastChangedBy: pieter.neerincx@gmail.com $
8 # =====================================================
9 -->
10 <tool id="ExtractPeptideSequenceContext3" version="2.1" name="Extract Miscleavage Site Context">
11 <description>by mapping peptides back to proteins and fetching the regions surrounding missed cleavage sites.</description>
12 <command interpreter="perl">ExtractPeptideSequenceContext.pl --db $db --dbf FASTA --f $fragments --icol $icol --pcol $pcol $strip --miso $miso --ca $ca --ct $ct --n $n --c $c --pc '$pc' --ll WARN</command>
13 <inputs>
14 <param name="fragments" type="data" format="tabular" label="Peptide sequences and their protein's identifiers"
15 help="(in tab delimited format)"/>
16 <param name="icol" type="data_column" value="1" data_ref="fragments" label="Protein identifier column"/>
17 <param name="pcol" type="data_column" value="2" data_ref="fragments" label="Peptide sequence column"/>
18 <!--
19 <param name="icol" type="integer" value="1" label="Protein identifier column"/>
20 <param name="pcol" type="integer" value="2" label="Peptide sequence column"/>
21 -->
22 <param name="strip" type="select">
23 <label>Lowercase characters in the peptide sequences represent</label>
24 <option value="--s">Modifications</option>
25 <option value="">Amino acids</option>
26 </param>
27 <param name="db" type="data" format="fasta" label="Protein sequences"
28 help="(in FASTA format)"/>
29 <param name="n" type="integer" value="5" label="N-terminal sequence context length"/>
30 <param name="c" type="integer" value="5" label="C-terminal sequence context length"/>
31 <param name="pc" type="select" help="to fill positions in the sequence context when the protein was too short for a full length context.">
32 <label>Padding character</label>
33 <option value="-">dash</option>
34 <option value=" ">space</option>
35 <option value="">none</option>
36 </param>
37 <param name="ca" type="select">
38 <label>Protease should recognize amino acid</label>
39 <option value="A">A</option>
40 <!--<option value="B">B</option>-->
41 <option value="C">C</option>
42 <option value="D">D</option>
43 <option value="E">E</option>
44 <option value="F">F</option>
45 <option value="G">G</option>
46 <option value="H">H</option>
47 <option value="I">I</option>
48 <!--<option value="J">J</option>-->
49 <option value="K">K</option>
50 <option value="L">L</option>
51 <option value="M">M</option>
52 <option value="N">N</option>
53 <!--<option value="O">O</option>-->
54 <option value="P">P</option>
55 <option value="Q">Q</option>
56 <option value="R">R</option>
57 <option value="S">S</option>
58 <option value="T">T</option>
59 <!--<option value="U">U</option>-->
60 <option value="V">V</option>
61 <option value="W">W</option>
62 <!--<option value="*">X</option>-->
63 <option value="Y">Y</option>
64 <!--<option value="Z">Z</option>-->
65 </param>
66 <param name="ct" type="select">
67 <label>Protease should have cleaved</label>
68 <option value="C">C-terminal of the recognized amino acid</option>
69 <option value="N">N-terminal of the recognized amino acid</option>
70 </param>
71 </inputs>
72 <outputs>
73 <data name="miso" format="tabular" label="Miscleavage site sequence contexts for ${fragments.name}"/>
74 </outputs>
75 <!--
76 <tests>
77 <test>
78 <param name="input" value="*.fasta"/>
79 <param name="identifiers" value="*.txt"/>
80 <output name="output" file="*.fasta"/>
81 </test>
82 </tests>
83 -->
84 <help>
85
86 .. role:: raw-html(raw)
87 :format: html
88
89 .. class:: infomark
90
91 **What it does**
92
93 Map peptide sequences back to proteins and extract sequence contexts for miscleavage sites.
94
95 :raw-html:`&lt;object data="static/images/nbic_gmr/ExtractMiscleavageSiteSequenceContext.svg" type="image/svg+xml" width="100%"/&gt;`
96
97 ===================================================
98 *Peptide sequences and their protein's identifiers*
99 ===================================================
100
101 This file must contain at least peptides and accession numbers or IDs of the proteins the peptides were derived from. \
102 The data must be in TAB delimited format and may contain other columns, which will be preserved in the output. \
103 If a sequence context was found, it will be appended in a new column to the right of the existing columns. \
104 When another sequence context was found for the same peptide, it will be appended as an extra row in the output.
105 Protein accession numbers / IDs must be in the same format as was used in the FASTA file with protein sequences (database). \
106 The only exception to this rule is that accession numbers / IDs may be optionally suffixed with the peptide's position in its protein between brackets. \
107 For example: CLH1_HUMAN[1612-1620] will be matched to CLH1_HUMAN in a FASTA file with protein sequences. \
108 Amino acids in the peptide sequences must be in uppercase.
109
110 ===============================================
111 *Protein sequences*
112 ===============================================
113
114 Input file containing all protein sequences in FASTA format. \
115 This tool will look for any type of protein ID in the first part of FASTA sequence headers up until the first white space. \
116 Optionally multiple IDs may be present separated with pipe symbols (|) or semicolons (;). \
117 Optionally IDs may be prefixed with a database namespace and a colon (:). \
118 For example the accession number P32234 as well as the ID 128UP_DROME would be recognized in both this sequence header:
119
120 >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)
121
122 and in this one:
123
124 >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)
125
126 ===================================================
127 *N-terminal and C-terminal sequence context length*
128 ===================================================
129
130 Integers specifying the length of the N-terminal and C-terminal sequence context to retrieve starting from the miscleavage site. \
131 Note that the width of a miscleavage site is 0 amino acids. \
132 When defaults are used for both the N-terminal and C-terminal sequence context lengths, \
133 the total sequence context length for a miscleavage site will be:
134 (N-terminal sequence context) + (C-terminal sequence context) = 5 + 5 = 10.
135
136 ===============================================
137 *Cleavage amino acid and terminus*
138 ===============================================
139
140 This tool assumes the peptides were derived from cutting with a proteolytic enzyme, \
141 that should have cut on the *cleavage terminal* side of all *cleavage amino acids*. \
142
143 ===============================================
144 *Padding character*
145 ===============================================
146
147 Optional padding character to fill N-terminal or C-terminal positions in the sequence context, \
148 when the protein was too short to get a complete sequence context. \
149 Defaults to - a.k.a. dash or alignment gap character. \
150
151 -----
152
153 **Getting input data**
154
155 .. _my folder utility: http://mascotinternal.chem.uu.nl/mascot/cgi/uu_myfolder.pl
156
157 This tool requires \
158 peptide sequences in TAB delimited format and \
159 protein sequences from which the peptides were derived in FASTA format. \
160 If your peptide sequences are not in TAB delimited format, you can convert from:
161
162 - FASTA format using *FASTA manipulation* -&gt; *FASTA-to-Tabular*
163 - A format using a different delimiter using *Text Manipulation* -&gt; *Convert*
164
165 When your peptides were derived from a mass spectrometry experiment and identified with a search engine like Mascot, Sequest, etc., \
166 please make sure you provide the same FASTA database for this tool as the one used for your search.
167 If you used Mascot hosted by the Biomolecular Mass Spectrometry and Proteomics Group @ Utrecht University, \
168 you can use the `my folder utility`_ to download the FASTA databases from the Mascot server.
169
170 -----
171
172 **Examples**
173
174 Example input for peptides identified with a Mascot search, \
175 some with phosphorylated residues indicated by pS, pT or pY \
176 and in TAB delimited format::
177
178 sequence score peptide mr mass delta (abs) mass delta (ppm) all protein matches
179 AGNAARDN 54.24 787.357254 -4.223E-5 -0.05334300253998803 H2A1B_HUMAN[67-74]; H2A1C_HUMAN[67-74]; H2A1D_HUMAN[67-74]
180 KLpSAAVVLI 11.48 912.600784 0.001608 1.7619971713721432 OSGI2_HUMAN[405-413]
181 RAGIKVpTVA 23.01 913.570892 6.283E-5 0.06786555979719196 PARK7_HUMAN[28-36]
182 KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 ALDOA_HUMAN[101-110]
183 KIKELQAF 11.87 975.575287 0.003907 4.004816493470687 MMP20_HUMAN[71-78]
184 KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689]
185 KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245]
186 KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620]
187
188 ===============================================
189 *Appending miscleavage site sequence contexts*
190 ===============================================
191
192 With these options:
193
194 - K as the *amino acid* the protease should have recognized
195 - N-terminal as the side of the recognized amino acid where the protease should have cleaved.
196 - c6 as *Protein identifier column*
197 - c1 as *Peptide sequence column*
198 - a suitable FASTA database with *Protein sequences*
199 - and everything else set to defaults
200
201 the example above will generate a result like this::
202
203 RAGIKVpTVA 23.01 913.570892 6.283E-5 0.06786555979719196 PARK7_HUMAN[28-36] RRAGIKVTVA
204 KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 ALDOA_HUMAN[101-110] GVVGIKVDKG
205 KIKELQAF 11.87 975.575287 0.003907 4.004816493470687 MMP20_HUMAN[71-78] MIRKIKELQA
206 KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245] LYEALKFIML
207
208 Note that the header line was ignored, and that peptides with more than one miscleavage site occur more than once in the output.
209
210 </help>
211 </tool>