comparison ExtractModificationSiteSequenceContext.xml @ 0:163892325845 draft default tip

Initial commit.
author galaxyp
date Fri, 10 May 2013 17:15:08 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:163892325845
1 <!--
2 # =====================================================
3 # $Id: ExtractModificationSiteSequenceContext.xml 90 2011-01-19 13:20:31Z pieter.neerincx@gmail.com $
4 # $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ExtractModificationSiteSequenceContext.xml $
5 # $LastChangedDate: 2011-01-19 07:20:31 -0600 (Wed, 19 Jan 2011) $
6 # $LastChangedRevision: 90 $
7 # $LastChangedBy: pieter.neerincx@gmail.com $
8 # =====================================================
9 -->
10 <tool id="ExtractPeptideSequenceContext4" version="2.1" name="Extract Modification Site Context">
11 <description>by mapping modified amino acids in peptides back to proteins and fetching the sequence surrounding the modified sites.</description>
12 <command interpreter="perl">ExtractPeptideSequenceContext.pl --db '$db' --dbf FASTA --f '$fragments' --icol $icol --pcol $pcol --s --modo '$modo' --ma '$ma' --n $n --c $c --pc '$pc' --ll ERROR</command> <!-- Dataset paths are single-quoted, like the ma and pc values, so a path containing whitespace or shell metacharacters stays one argument. -->
13 <inputs> <!-- Every param name below is referenced as a $variable on the command line of this tool. -->
14 <param name="fragments" type="data" format="tabular" label="Peptide sequences and their protein's identifiers"
15 help="(in tab delimited format)"/> <!-- Tabular dataset; icol and pcol below point at it through data_ref. -->
16 <param name="icol" type="data_column" value="1" data_ref="fragments" label="Protein identifier column"/> <!-- Column of the fragments dataset holding protein identifiers; preset to 1. -->
17 <param name="pcol" type="data_column" value="2" data_ref="fragments" label="Peptide sequence column"/> <!-- Column of the fragments dataset holding peptide sequences; preset to 2. -->
18 <!-- Disabled plain-integer variants of the icol and pcol selectors above:
19 <param name="icol" type="integer" value="1" label="Protein identifier column"/>
20 <param name="pcol" type="integer" value="2" label="Peptide sequence column"/>
21 -->
22 <param name="db" type="data" format="fasta" label="Protein sequences"
23 help="(in FASTA format)"/> <!-- FASTA dataset with the protein sequences the peptides are mapped back to. -->
24 <param name="n" type="integer" value="5" label="N-terminal sequence context length"/> <!-- Preset to 5 residues. -->
25 <param name="c" type="integer" value="5" label="C-terminal sequence context length"/> <!-- Preset to 5 residues. -->
26 <param name="pc" type="select" help="to fill positions in the sequence context when the protein was too short for a full length context."> <!-- The chosen option value is substituted into the quoted '$pc' argument. -->
27 <label>Padding character</label>
28 <option value="-">dash</option>
29 <option value=" ">space</option>
30 <option value="">none</option> <!-- Empty string: no padding character. -->
31 </param>
32 <param name="ma" type="text" label="Modified amino acid"/> <!-- Free text such as pS, Sp or p*, as described in the help section of this tool. -->
33 </inputs>
34 <outputs> <!-- Single result dataset; the command line refers to it as $modo. -->
35 <data name="modo" format="tabular" label="Modification site sequence contexts for ${fragments.name}"/> <!-- Tabular output whose label embeds the name of the fragments input. -->
36 </outputs>
37 <!--
38 <tests>
39 <test>
40 <param name="input" value="*.fasta"/>
41 <param name="identifiers" value="*.txt"/>
42 <output name="output" file="*.fasta"/>
43 </test>
44 </tests>
45 -->
46 <help>
47
48 .. role:: raw-html(raw)
49 :format: html
50
51 .. class:: infomark
52
53 **What it does**
54
55 Map peptide sequences back to proteins and extract sequence contexts for modification sites.
56
57 :raw-html:`&lt;object data="static/images/nbic_gmr/ExtractModificationSiteSequenceContext.svg" type="image/svg+xml" width="100%"/&gt;`
58
59
60 ===================================================
61 *Peptide sequences and their protein's identifiers*
62 ===================================================
63
64 This file must contain at least peptides and accession numbers or IDs of the proteins the peptides were derived from. \
65 The data must be in TAB delimited format and may contain other columns, which will be preserved in the output. \
66 If a sequence context was found, it will be appended in a new column to the right of the existing columns. \
67 When another sequence context was found for the same peptide, it will be appended as an extra row in the output.
68 Protein accession numbers / IDs must be in the same format as was used in the FASTA file with protein sequences (database). \
69 The only exception to this rule is that accession numbers / IDs may be optionally suffixed with the peptide\'s position in its protein between brackets. \
70 For example: CLH1_HUMAN[1612-1620] will be matched to CLH1_HUMAN in a FASTA file with protein sequences. \
71 Amino acids in the peptide sequences must be in uppercase.
72
73 ===============================================
74 *Protein sequences*
75 ===============================================
76
77 Input file containing all protein sequences in FASTA format. \
78 This tool will look for any type of protein ID in the first part of FASTA sequence headers up until the first white space. \
79 Optionally multiple IDs may be present separated with pipe symbols (|) or semicolons (;). \
80 Optionally IDs may be prefixed with a database namespace and a colon (:). \
81 For example the accession number P32234 as well as the ID 128UP_DROME would be recognized in both this sequence header:
82
83 >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)
84
85 and in this one:
86
87 >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)
88
89 ===================================================
90 *N-terminal and C-terminal sequence context length*
91 ===================================================
92
93 Integers specifying the length of the N-terminal and C-terminal sequence context to retrieve starting from the modification site. \
94 Note that the width of a modification site is 1 amino acid. \
95 When defaults are used for both the N-terminal and C-terminal sequence context lengths, \
96 the total sequence context length for a modification site will be:
97 (N-terminal sequence context) + (modified amino acid) + (C-terminal sequence context) = 5 + 1 + 5 = 11.
98
99 ===============================================
100 *Modified amino acid*
101 ===============================================
102
103 The amino acid must be specified in uppercase and the modification in lower case. \
104 The order is not important. \
105 Hence a phosphorylated serine in a peptide sequence can be indicated with either pS or Sp, \
106 but you cannot mix both pS and Sp in a single peptide sequence file. \
107 You may provide an asterisk (*) instead of an upper case amino acid to retrieve sequence contexts \
108 for the specified modification no matter what amino acid it was located on. \
109 A modification may be specified with more than one lower case character, \
110 so for example phosphoS or Sphospho can also be used for a phosphorylated serine.
111
112 ===============================================
113 *Padding character*
114 ===============================================
115
116 Optional padding character to fill N-terminal or C-terminal positions in the sequence context, \
117 when the protein was too short to get a complete sequence context. \
118 Defaults to - a.k.a. dash or alignment gap character. \
119
120 -----
121
122 **Getting input data**
123
124 .. _my folder utility: http://mascotinternal.chem.uu.nl/mascot/cgi/uu_myfolder.pl
125
126 This tool requires \
127 peptide sequences in TAB delimited format and \
128 protein sequences from which the peptides were derived in FASTA format. \
129 If your peptide sequences are not in TAB delimited format, you can convert from:
130
131 - FASTA format using *FASTA manipulation* -&gt; *FASTA-to-Tabular*
132 - A format using a different delimiter using *Text Manipulation* -&gt; *Convert*
133
134 When your peptides were derived from a mass spectrometry experiment and identified with a search engine like Mascot, Sequest, etc., \
135 please make sure you provide the same FASTA database for this tool as the one used for your search.
136 If you used Mascot hosted by the Biomolecular Mass Spectrometry and Proteomics Group @ Utrecht University, \
137 you can use the `my folder utility`_ to download the FASTA databases from the Mascot server.
138
139 -----
140
141 **Examples**
142
143 Example input for peptides identified with a Mascot search, \
144 some with phosphorylated residues indicated by pS, pT or pY \
145 and in TAB delimited format::
146
147 sequence score peptide mr mass delta (abs) mass delta (ppm) all protein matches
148 AGNAARDN 54.24 787.357254 -4.223E-5 -0.05334300253998803 H2A1B_HUMAN[67-74]; H2A1C_HUMAN[67-74]; H2A1D_HUMAN[67-74]
149 KLpSAAVVLI 11.48 912.600784 0.001608 1.7619971713721432 OSGI2_HUMAN[405-413]
150 RAGIKVpTVA 23.01 913.570892 6.283E-5 0.06786555979719196 PARK7_HUMAN[28-36]
151 KGGVVGIKVD 44.61 970.581146 -0.001214 -1.2507970147608864 ALDOA_HUMAN[101-110]
152 KIKELQAF 11.87 975.575287 0.003907 4.004816493470687 MMP20_HUMAN[71-78]
153 KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689]
154 KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245]
155 KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620]
156
157 ===============================================
158 *Appending modification site sequence contexts*
159 ===============================================
160
161 With these options:
162
163 - p\* as *modified amino acid*
164 - c6 as *Protein identifier column*
165 - c1 as *Peptide sequence column*
166 - a suitable FASTA database with *Protein sequences*
167 - and everything else set to defaults
168
169 the example above will generate a result like this::
170
171 KLpSAAVVLI 11.48 912.600784 0.001608 1.7619971713721432 OSGI2_HUMAN[405-413] KIFKLSAAVVL
172 RAGIKVpTVA 23.01 913.570892 6.283E-5 0.06786555979719196 PARK7_HUMAN[28-36] AGIKVTVAGLA
173 KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689] EKEKISGTVNI
174 KIpSGpTVNIR 57.17 986.587265 -0.002761 -2.798536022051734 SYTC_HUMAN[681-689] EKISGTVNIRT
175 KLpYEALKF 17.54 1010.580032 0.004782 4.731935966057164 F105A_HUMAN[238-245] LEYKLYEALKF
176 KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620] DKLDASESLRK
177 KLDApSEpSLR 31.31 1017.545441 -0.002377 -2.3360136110127785 CLH1_HUMAN[1612-1620] LDASESLRKEE
178
179 Note the header line was ignored, peptides like AGNAARDN without any modified amino acids are absent from the output \
180 and peptides like KLDApSEpSLR with more than one modified amino acid occur more than once in the output.
181
182 </help>
183 </tool>