Mercurial > repos > galaxyp > nbic_fasta
comparison ExtractMiscleavageSiteSequenceContext.xml @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:163892325845 |
---|---|
1 <!-- | |
2 # ===================================================== | |
3 # $Id: ExtractMiscleavageSiteSequenceContext.xml 90 2011-01-19 13:20:31Z pieter.neerincx@gmail.com $ | |
4 # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractMiscleavageSiteSequenceContext.xml $ | |
5 # $LastChangedDate: 2011-01-19 07:20:31 -0600 (Wed, 19 Jan 2011) $ | |
6 # $LastChangedRevision: 90 $ | |
7 # $LastChangedBy: pieter.neerincx@gmail.com $ | |
8 # ===================================================== | |
9 --> | |
10 <tool id="ExtractPeptideSequenceContext3" version="2.1" name="Extract Miscleavage Site Context"> | |
11 <description>by mapping peptides back to proteins and fetching the regions surrounding missed cleavage sites.</description> | |
12 <command interpreter="perl">ExtractPeptideSequenceContext.pl --db '$db' --dbf FASTA --f '$fragments' --icol $icol --pcol $pcol $strip --miso '$miso' --ca $ca --ct $ct --n $n --c $c --pc '$pc' --ll WARN</command> <!-- Dataset paths are single-quoted so unusual characters in a path cannot split the argument; $strip stays unquoted because its empty option must expand to no argument at all. --> | |
13 <inputs> <!-- User-supplied parameters; each param name below is referenced as $name in the command line. --> | |
14 <param name="fragments" type="data" format="tabular" label="Peptide sequences and their protein's identifiers" | |
15 help="(in tab delimited format)"/> | |
16 <param name="icol" type="data_column" value="1" data_ref="fragments" label="Protein identifier column"/> <!-- icol and pcol are column pickers bound to the "fragments" dataset via data_ref. --> | |
17 <param name="pcol" type="data_column" value="2" data_ref="fragments" label="Peptide sequence column"/> | |
18 <!-- | |
19 <param name="icol" type="integer" value="1" label="Protein identifier column"/> | |
20 <param name="pcol" type="integer" value="2" label="Peptide sequence column"/> | |
21 --> <!-- The block above holds plain-integer variants of icol and pcol; it is disabled. --> | |
22 <param name="strip" type="select"> <!-- The selected option value is substituted verbatim for $strip in the command: either the strip flag or an empty string. --> | |
23 <label>Lowercase characters in the peptide sequences represent</label> | |
24 <option value="--s">Modifications</option> | |
25 <option value="">Amino acids</option> | |
26 </param> | |
27 <param name="db" type="data" format="fasta" label="Protein sequences" | |
28 help="(in FASTA format)"/> | |
29 <param name="n" type="integer" value="5" label="N-terminal sequence context length"/> | |
30 <param name="c" type="integer" value="5" label="C-terminal sequence context length"/> | |
31 <param name="pc" type="select" help="to fill positions in the sequence context when the protein was too short for a full length context."> <!-- The command passes this value inside single quotes, so the single-space and empty options still arrive as one argument. --> | |
32 <label>Padding character</label> | |
33 <option value="-">dash</option> | |
34 <option value=" ">space</option> | |
35 <option value="">none</option> | |
36 </param> | |
37 <param name="ca" type="select"> <!-- One-letter amino acid codes; the B, J, O, U, X and Z options are commented out and therefore not selectable. NOTE(review): reason not visible here, confirm before enabling. --> | |
38 <label>Protease should recognize amino acid</label> | |
39 <option value="A">A</option> | |
40 <!--<option value="B">B</option>--> | |
41 <option value="C">C</option> | |
42 <option value="D">D</option> | |
43 <option value="E">E</option> | |
44 <option value="F">F</option> | |
45 <option value="G">G</option> | |
46 <option value="H">H</option> | |
47 <option value="I">I</option> | |
48 <!--<option value="J">J</option>--> | |
49 <option value="K">K</option> | |
50 <option value="L">L</option> | |
51 <option value="M">M</option> | |
52 <option value="N">N</option> | |
53 <!--<option value="O">O</option>--> | |
54 <option value="P">P</option> | |
55 <option value="Q">Q</option> | |
56 <option value="R">R</option> | |
57 <option value="S">S</option> | |
58 <option value="T">T</option> | |
59 <!--<option value="U">U</option>--> | |
60 <option value="V">V</option> | |
61 <option value="W">W</option> | |
62 <!--<option value="*">X</option>--> | |
63 <option value="Y">Y</option> | |
64 <!--<option value="Z">Z</option>--> | |
65 </param> | |
66 <param name="ct" type="select"> <!-- Cleavage terminus: the value C or N is substituted for $ct in the command. --> | |
67 <label>Protease should have cleaved</label> | |
68 <option value="C">C-terminal of the recognized amino acid</option> | |
69 <option value="N">N-terminal of the recognized amino acid</option> | |
70 </param> | |
71 </inputs> | |
72 <outputs> | |
73 <data name="miso" format="tabular" label="Miscleavage site sequence contexts for ${fragments.name}"/> <!-- Tabular result dataset; referenced as $miso in the command line and labelled with the name of the "fragments" input. --> | |
74 </outputs> | |
75 <!-- | |
76 <tests> | |
77 <test> | |
78 <param name="input" value="*.fasta"/> | |
79 <param name="identifiers" value="*.txt"/> | |
80 <output name="output" file="*.fasta"/> | |
81 </test> | |
82 </tests> | |
83 --> | |
84 <help> | |
85 | |
86 .. role:: raw-html(raw) | |
87 :format: html | |
88 | |
89 .. class:: infomark | |
90 | |
91 **What it does** | |
92 | |
93 Map peptide sequences back to proteins and extract sequence contexts for miscleavage sites. | |
94 | |
95 :raw-html:`<object data="static/images/nbic_gmr/ExtractMiscleavageSiteSequenceContext.svg" type="image/svg+xml" width="100%"/>` | |
96 | |
97 =================================================== | |
98 *Peptide sequences and their protein's identifiers* | |
99 =================================================== | |
100 | |
101 This file must contain at least peptides and accession numbers or IDs of the proteins the peptides were derived from. \ | |
102 The data must be in TAB delimited format and may contain other columns, which will be preserved in the output. \ | |
103 If a sequence context was found, it will be appended in a new column to the right of the existing columns. \ | |
104 When another sequence context was found for the same peptide, it will be appended as an extra row in the output. | |
105 Protein accession numbers / IDs must be in the same format as was used in the FASTA file with protein sequences (database). \ | |
106 The only exception to this rule is that accession numbers / IDs may be optionally suffixed with the peptide\'s position in its protein between brackets. \ | |
107 For example: CLH1_HUMAN[1612-1620] will be matched to CLH1_HUMAN in a FASTA file with protein sequences. \ | |
108 Amino acids in the peptide sequences must be in uppercase. | |
109 | |
110 =============================================== | |
111 *Protein sequences* | |
112 =============================================== | |
113 | |
114 Input file containing all protein sequences in FASTA format. \ | |
115 This tool will look for any type of protein ID in the first part of FASTA sequence headers up until the first white space. \ | |
116 Optionally multiple IDs may be present separated with pipe symbols (|) or semicolons (;). \ | |
117 Optionally IDs may be prefixed with a database namespace and a colon (:). \ | |
118 For example the accession number P32234 as well as the ID 128UP_DROME would be recognized in both this sequence header: | |
119 | |
120 >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) | |
121 | |
122 and in this one: | |
123 | |
124 >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly) | |
125 | |
126 =================================================== | |
127 *N-terminal and C-terminal sequence context length* | |
128 =================================================== | |
129 | |
130 Integers specifying the length of the N-terminal and C-terminal sequence context to retrieve starting from the miscleavage site. \ | |
131 Note that the width of a miscleavage site is 0 amino acids. \ | |
132 When defaults are used for both the N-terminal and C-terminal sequence context lengths, \ | |
133 the total sequence context length for a miscleavage site will be: | |
134 (N-terminal sequence context) + (C-terminal sequence context) = 5 + 5 = 10. | |
135 | |
136 =============================================== | |
137 *Cleavage amino acid and terminus* | |
138 =============================================== | |
139 | |
140 This tool assumes the peptides were derived from cutting with a proteolytic enzyme, \ | |
141 that should have cut on the *cleavage terminal* side of all *cleavage amino acids*. \ | |
142 | |
143 =============================================== | |
144 *Padding character* | |
145 =============================================== | |
146 | |
147 Optional padding character to fill N-terminal or C-terminal positions in the sequence context, \ | |
148 when the protein was too short to get a complete sequence context. \ | |
149 Defaults to - a.k.a. dash or alignment gap character. \ | |
150 | |
151 ----- | |
152 | |
153 **Getting input data** | |
154 | |
155 .. _my folder utility: http://mascotinternal.chem.uu.nl/mascot/cgi/uu_myfolder.pl | |
156 | |
157 This tool requires \ | |
158 peptide sequences in TAB delimited format and \ | |
159 protein sequences from which the peptides were derived in FASTA format. \ | |
160 If your peptide sequences are not in TAB delimited format, you can convert from: | |
161 | |
162 - FASTA format using *FASTA manipulation* -> *FASTA-to-Tabular* | |
163 - A format using a different delimiter using *Text Manipulation* -> *Convert* | |
164 | |
165 When your peptides were derived from a mass spectrometry experiment and identified with a search engine like Mascot, Sequest, etc., \ | |
166 please make sure you provide the same FASTA database for this tool as the one used for your search. | |
167 If you used Mascot hosted by the Biomolecular Mass Spectrometry and Proteomics Group @ Utrecht University, \ | |
168 you can use the `my folder utility`_ to download the FASTA databases from the Mascot server. | |
169 | |
170 ----- | |
171 | |
172 **Examples** | |
173 | |
174 Example input for peptides identified with a Mascot search, \ | |
175 some with phosphorylated residues indicated by pS, pT or pY \ | |
176 and in TAB delimited format:: | |
177 | |
178 sequence score peptide mr mass delta (abs) mass delta (ppm) all protein matches | |
179 AGNAARDN 54.24 787.357254 -4.223E-5 -0.05334300253998803 H2A1B_HUMAN[67-74]; H2A1C_HUMAN[67-74]; H2A1D_HUMAN[67-74] | |
180 KLpSAAVVLI 11.48 912.600784 0.001608 1.7619971713721432 OSGI2_HUMAN[405-413] | |
181 RAGIKVpTVA 23.01 913.570892 6.283E-5 0.06786555979719196 PARK7_HUMAN[28-36] | |
182 KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 ALDOA_HUMAN[101-110] | |
183 KIKELQAF 11.87 975.575287 0.003907 4.004816493470687 MMP20_HUMAN[71-78] | |
184 KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689] | |
185 KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245] | |
186 KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620] | |
187 | |
188 =============================================== | |
189 *Appending miscleavage site sequence contexts* | |
190 =============================================== | |
191 | |
192 With these options: | |
193 | |
194 - K as the *amino acid* the protease should have recognized | |
195 - N-terminal as the side of the recognized amino acid where the protease should have cleaved | |
196 - c6 as *Protein identifier column* | |
197 - c1 as *Peptide sequence column* | |
198 - a suitable FASTA database with *Protein sequences* | |
199 - and everything else set to defaults | |
200 | |
201 the example above will generate a result like this:: | |
202 | |
203 RAGIKVpTVA 23.01 913.570892 6.283E-5 0.06786555979719196 PARK7_HUMAN[28-36] RRAGIKVTVA | |
204 KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 ALDOA_HUMAN[101-110] GVVGIKVDKG | |
205 KIKELQAF 11.87 975.575287 0.003907 4.004816493470687 MMP20_HUMAN[71-78] MIRKIKELQA | |
206 KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245] LYEALKFIML | |
207 | |
208 Note the header line was ignored and if peptides have more than one miscleavage site they will occur more than once in the output. | |
209 | |
210 </help> | |
211 </tool> |