0
|
1 <!--
|
|
2 # =====================================================
|
|
3 # $Id: ExtractCleavageSiteSequenceContext.xml 113 2011-03-04 16:59:11Z pieter.neerincx@gmail.com $
|
|
4 # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractCleavageSiteSequenceContext.xml $
|
|
5 # $LastChangedDate: 2011-03-04 10:59:11 -0600 (Fri, 04 Mar 2011) $
|
|
6 # $LastChangedRevision: 113 $
|
|
7 # $LastChangedBy: pieter.neerincx@gmail.com $
|
|
8 # =====================================================
|
|
9 -->
|
|
10 <tool id="ExtractPeptideSequenceContext2" version="2.1" name="Extract Cleavage Site Context">
|
|
11 <description>by mapping peptides back to proteins and fetching the regions surrounding the peptide termini.</description>
|
|
<!--
  Runs ExtractPeptideSequenceContext.pl through the perl interpreter.
  The dataset paths ($db, $fragments, $cleo) and the select values that may
  hold shell-special characters ($ca can be an asterisk, $pc a space or an
  empty string) are single-quoted so the shell passes each one on as exactly
  one literal argument. $strip is left unquoted on purpose: it expands either
  to the strip flag or to an empty string, and an empty quoted argument would
  be handed to the script as an extra parameter.
-->
<command interpreter="perl">ExtractPeptideSequenceContext.pl --db '$db' --dbf FASTA --f '$fragments' --icol $icol --pcol $pcol $strip --cleo '$cleo' --ca '$ca' --ct $ct --n $n --c $c --pc '$pc' --ll WARN</command>
|
|
13 <inputs>
|
|
<!-- Tab-delimited peptide table; the two column selectors below refer to it through data_ref. -->
<param name="fragments"
       type="data"
       format="tabular"
       label="Peptide sequences and their protein's identifiers"
       help="(in tab delimited format)"/>
<!-- Column of the peptide table that holds the protein identifier (default_value is 1). -->
<param name="icol"
       type="data_column"
       data_ref="fragments"
       default_value="1"
       label="Protein identifier column"/>
<!-- Column of the peptide table that holds the peptide sequence (default_value is 2). -->
<param name="pcol"
       type="data_column"
       data_ref="fragments"
       default_value="2"
       label="Peptide sequence column"/>
|
|
<!--
  How lowercase letters in the peptide sequences are to be interpreted.
  The value of the chosen option is inserted verbatim into the command line
  as $strip, so the second option contributes nothing to the command.
-->
<param name="strip" type="select" label="Lowercase characters in the peptide sequences represent">
  <option value="--s">Modifications</option>
  <option value="">Amino acids</option>
</param>
|
|
<!-- FASTA dataset with the protein sequences; the command line passes it as the db argument with format FASTA. -->
<param name="db"
       type="data"
       format="fasta"
       label="Protein sequences"
       help="(in FASTA format)"/>
|
|
<!-- Length of the sequence context on the N-terminal side; initial value 5. -->
<param name="n"
       type="integer"
       value="5"
       label="N-terminal sequence context length"/>
<!-- Length of the sequence context on the C-terminal side; initial value 5. -->
<param name="c"
       type="integer"
       value="5"
       label="C-terminal sequence context length"/>
|
|
<!--
  Padding character for contexts that cannot be filled completely.
  The option values are a dash, a single space and the empty string; the
  command line wraps $pc in single quotes so the latter two survive the shell.
-->
<param name="pc"
       type="select"
       label="Padding character"
       help="to fill positions in the sequence context when the protein was too short for a full length context.">
  <option value="-">dash</option>
  <option value=" ">space</option>
  <option value="">none</option>
</param>
|
|
<!--
  Amino acid recognized by the protease, passed to the script as $ca.
  Options are listed alphabetically. The entries for B, J, O, U and Z are
  commented out and therefore not offered to the user.
  NOTE(review): the asterisk (any amino acid) entry sits between W and Y,
  presumably taking the place of X; confirm before reordering.
-->
<param name="ca" type="select" label="Protease recognizes amino acid">
  <option value="A">A</option>
  <!--<option value="B">B</option>-->
  <option value="C">C</option>
  <option value="D">D</option>
  <option value="E">E</option>
  <option value="F">F</option>
  <option value="G">G</option>
  <option value="H">H</option>
  <option value="I">I</option>
  <!--<option value="J">J</option>-->
  <option value="K">K</option>
  <option value="L">L</option>
  <option value="M">M</option>
  <option value="N">N</option>
  <!--<option value="O">O</option>-->
  <option value="P">P</option>
  <option value="Q">Q</option>
  <option value="R">R</option>
  <option value="S">S</option>
  <option value="T">T</option>
  <!--<option value="U">U</option>-->
  <option value="V">V</option>
  <option value="W">W</option>
  <option value="*">* (any amino acid)</option>
  <option value="Y">Y</option>
  <!--<option value="Z">Z</option>-->
</param>
|
|
<!-- Side of the recognized amino acid on which the protease cuts; the value (C or N) is passed to the script as $ct. -->
<param name="ct" type="select" label="Protease cleaves">
  <option value="C">C-terminal of the recognized amino acid</option>
  <option value="N">N-terminal of the recognized amino acid</option>
</param>
|
|
67 </inputs>
|
|
<!-- Single tabular result dataset; its path is handed to the script through $cleo. -->
<outputs>
  <data name="cleo"
        format="tabular"
        label="Cleavage site sequence contexts for ${fragments.name}"/>
</outputs>
|
|
71 <!--
|
|
72 <tests>
|
|
73 <test>
|
|
74 <param name="input" value="*.fasta"/>
|
|
75 <param name="identifiers" value="*.txt"/>
|
|
76 <output name="output" file="*.fasta"/>
|
|
77 </test>
|
|
78 </tests>
|
|
79 -->
|
|
80 <help>
|
|
81
|
|
82 .. role:: raw-html(raw)
|
|
83 :format: html
|
|
84
|
|
85 .. class:: infomark
|
|
86
|
|
87 **What it does**
|
|
88
|
|
89 Map peptide sequences back to proteins and extract sequence contexts for cleavage sites.
|
|
90
|
|
91 :raw-html:`<object data="static/images/nbic_gmr/ExtractCleavageSiteSequenceContext.svg" type="image/svg+xml" width="100%"/>`
|
|
92
|
|
93 ===================================================
|
|
94 *Peptide sequences and their protein's identifiers*
|
|
95 ===================================================
|
|
96
|
|
97 This file must contain at least peptides and accession numbers or IDs of the proteins the peptides were derived from. \
|
|
98 The data must be in TAB delimited format and may contain other columns, which will be preserved in the output. \
|
|
99 If a sequence context was found, it will be appended in a new column to the right of the existing columns. \
|
|
100 When another sequence context was found for the same peptide, it will be appended as an extra row in the output.
|
|
101 Protein accession numbers / IDs must be in the same format as was used in the FASTA file with protein sequences (database). \
|
|
102 The only exception to this rule is that accession numbers / IDs may be optionally suffixed with the peptide\'s position in its protein between brackets. \
|
|
103 For example: CLH1_HUMAN[1612-1620] will be matched to CLH1_HUMAN in a FASTA file with protein sequences. \
|
|
104 Amino acids in the peptide sequences must be in uppercase.
|
|
105
|
|
106 ===============================================
|
|
107 *Protein sequences*
|
|
108 ===============================================
|
|
109
|
|
110 Input file containing all protein sequences in FASTA format. \
|
|
111 This tool will look for any type of protein ID in the first part of FASTA sequence headers up until the first white space. \
|
|
112 Optionally multiple IDs may be present separated with pipe symbols (|) or semicolons (;). \
|
|
113 Optionally IDs may be prefixed with a database namespace and a colon (:). \
|
|
114 For example the accession number P32234 as well as the ID 128UP_DROME would be recognized in both this sequence header:
|
|
115
|
|
116 >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)
|
|
117
|
|
118 and in this one:
|
|
119
|
|
120 >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)
|
|
121
|
|
122 ===================================================
|
|
123 *N-terminal and C-terminal sequence context length*
|
|
124 ===================================================
|
|
125
|
|
126 Integers specifying the length of the N-terminal and C-terminal sequence context to retrieve starting from the cleavage site. \
|
|
127 Note that the width of a cleavage site is 0 amino acids. \
|
|
128 When defaults are used for both the N-terminal and C-terminal sequence context lengths, \
|
|
129 the total sequence context length for a cleavage site will be:
|
|
130 (N-terminal sequence context) + (C-terminal sequence context) = 5 + 5 = 10.
|
|
131
|
|
132 ===============================================
|
|
133 *Cleavage amino acid and terminus*
|
|
134 ===============================================
|
|
135
|
|
136 This tool assumes the peptides were derived from cutting with a proteolytic enzyme, \
|
|
137 that cuts on the *cleavage terminal* side of all *cleavage amino acids*. \
|
|
138 When the specificity of the used protease is unknown, \
|
|
139 you may provide an asterisk (*) to retrieve sequence context for any cleavage site, \
|
|
140 but in that case this tool will not filter non-specifically cleaved fragments, \
|
|
141 that may be the result of processes other than protease activity.
|
|
142
|
|
143 ===============================================
|
|
144 *Padding character*
|
|
145 ===============================================
|
|
146
|
|
147 Optional padding character to fill N-terminal or C-terminal positions in the sequence context, \
|
|
148 when the protein was too short to get a complete sequence context. \
|
|
149 Defaults to - a.k.a. dash or alignment gap character.
|
|
150
|
|
151 -----
|
|
152
|
|
153 **Getting input data**
|
|
154
|
|
155 .. _my folder utility: http://mascotinternal.chem.uu.nl/mascot/cgi/uu_myfolder.pl
|
|
156
|
|
157 This tool requires \
|
|
158 peptide sequences in TAB delimited format and \
|
|
159 protein sequences from which the peptides were derived in FASTA format. \
|
|
160 If your peptide sequences are not in TAB delimited format, you can convert from:
|
|
161
|
|
162 - FASTA format using *FASTA manipulation* -> *FASTA-to-Tabular*
|
|
163 - A format using a different delimiter using *Text Manipulation* -> *Convert*
|
|
164
|
|
165 When your peptides were derived from a mass spectrometry experiment and identified with a search engine like Mascot, Sequest, etc., \
|
|
166 please make sure you provide the same FASTA database for this tool as the one used for your search.
|
|
167 If you used Mascot hosted by the Biomolecular Mass Spectrometry and Proteomics Group @ Utrecht University, \
|
|
168 you can use the `my folder utility`_ to download the FASTA databases from the Mascot server.
|
|
169
|
|
170 -----
|
|
171
|
|
172 **Examples**
|
|
173
|
|
174 Example input for peptides identified with a Mascot search, \
|
|
175 some with phosphorylated residues indicated by pS, pT or pY \
|
|
176 and in TAB delimited format::
|
|
177
|
|
178 sequence score peptide mr mass delta (abs) mass delta (ppm) all protein matches
|
|
179 AGNAARDN 54.24 787.357254 -4.223E-5 -0.05334300253998803 H2A1B_HUMAN[67-74]; H2A1C_HUMAN[67-74]; H2A1D_HUMAN[67-74]
|
|
180 KLpSAAVVLI 11.48 912.600784 0.001608 1.7619971713721432 OSGI2_HUMAN[405-413]
|
|
181 RAGIKVpTVA 23.01 913.570892 6.283E-5 0.06786555979719196 PARK7_HUMAN[28-36]
|
|
182 KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 ALDOA_HUMAN[101-110]
|
|
183 KIKELQAF 11.87 975.575287 0.003907 4.004816493470687 MMP20_HUMAN[71-78]
|
|
184 KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689]
|
|
185 KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245]
|
|
186 KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620]
|
|
187
|
|
188 ===============================================
|
|
189 *Appending cleavage site sequence contexts*
|
|
190 ===============================================
|
|
191
|
|
192 With these options:
|
|
193
|
|
194 - K as *cleavage amino acid*
|
|
195 - N-terminal as *cleavage terminus*
|
|
196 - c6 as *Protein identifier column*
|
|
197 - c1 as *Peptide sequence column*
|
|
198 - a suitable FASTA database with *Protein sequences*
|
|
199 - and everything else set to defaults
|
|
200
|
|
201 the example above will generate a result like this::
|
|
202
|
|
203 AGNAARDN 54.24 787.357254 -4.223E-5 -0.05334300253998803 H2A1B_HUMAN[67-74]; H2A1C_HUMAN[67-74]; H2A1D_HUMAN[67-74] AARDNKKTRI
|
|
204 KLpSAAVVLI 11.48 912.600784 0.001608 1.7619971713721432 OSGI2_HUMAN[405-413] LKKIFKLSAA
|
|
205 KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 ALDOA_HUMAN[101-110] QVIKSKGGVV
|
|
206 KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 ALDOA_HUMAN[101-110] GIKVDKGVVP
|
|
207 KIKELQAF 11.87 975.575287 0.003907 4.004816493470687 MMP20_HUMAN[71-78] NSMIRKIKEL
|
|
208 KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689] VGEKEKISGT
|
|
209 KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245] AILEYKLYEA
|
|
210 KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620] LTKVDKLDAS
|
|
211 KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620] SESLRKEEEQ
|
|
212
|
|
213
|
|
214 Note that the header line was ignored and that, if peptides were derived from specific LysN cleavage, they will occur twice in the output: \
|
|
215 once with the sequence context for the peptide\'s N-terminus and once for its C-terminus.
|
|
216
|
|
217 </help>
|
|
218 </tool>
|