0
|
1 <tool name="Quantifere" id="quantifere1" version="1.0.2">
|
|
2 <description>Protein Inference by Peptide Quantification patterns</description>
|
|
3 <!--
|
|
4 For remote debugging start your listener on port 8000 and use the following as command interpreter:
|
|
5 java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000
|
|
6 //////////////////////////
|
|
7 -->
|
|
<!--
  Command line handed to the JVM: every option forwards the Galaxy parameter,
  config file or output dataset of the same name to Quantifere.jar.
  The fraction naming codes are only passed when the "Data is from 2D LC-MS"
  checkbox is ticked. That checkbox renders as its truevalue "Yes", so it is
  tested by its string value instead of being compared with the Python
  constant True. The free-text codes are quoted so the list always reaches
  the tool as a single argument.
-->
<command interpreter="java -jar ">
    Quantifere.jar
    -annotatedQuantificationFilesList $annotatedQuantificationFilesList
    -identificationFilesList $identificationFilesList
    -statisticalMeasuresConfigFile $statisticalMeasuresConfigFile
    -quantificationDataToUse $quantificationDataToUse
    -minCorrel $minCorrel
    -minProtCoverage $minProtCoverage
    -minAboveAverageHits $minAboveAverageHits
    -minNrIdsForInferencePeptide $minNrIdsForInferencePeptide
    -refineModel $refineModel
    -functionalAnnotationCSV $functionalAnnotationCSV
    -outputCSV $outputCSV
    -outputInferenceLogCSV $outputInferenceLogCSV
    -outputSummaryAnnotationCSV $outputSummaryAnnotationCSV
    -outReport $htmlReportFile
    -outReportPicturesPath $htmlReportFile.files_path
    #if str($is2D_LC_MS.fractions) == "Yes"
        -namingConventionCodesForFractions '$is2D_LC_MS.namingConventionCodesForFractions'
    #end if
</command>
|
|
29
|
|
<inputs>

    <!-- Quantification input: one or more APML files (one per fraction for 2D LC-MS). -->
    <repeat name="annotatedQuantificationFiles" title="Peptide (filtered) quantification files (APML)"
            help="The APML contents as aligned, annotated and scored feature lists,
            as produced by MsFilt tool. Select one or more files. For 2D-LC-MS we expect one file per fraction.">
        <param name="annotatedQuantificationFile" size="50" type="data" format="apml" label="File (APML format)" />
    </repeat>

    <!-- Identification input: APML or mzIdentML files; the datatype is written to the config file next to each path. -->
    <repeat name="identificationFiles" title="Peptide (filtered) identification files (MS/MS identifications)"
            help="Full set of MS/MS peptide identification files, including peptides that could not be quantified.
            This set of identifications is ideally filtered on some quality and
            statistical measures (e.g. as is done by MsFilt). Tip: to base the inference only on the
            selected peptide quantification files, you
            can select the same quantification files here as well. Select one or more files.">
        <param name="identificationFile" size="50" type="data" format="apml,mzid" label="File (APML or MZIDENTML format)" />
    </repeat>

    <!--
      The checkbox renders as "Yes" / "No"; both values need a matching <when>
      case, otherwise no case matches while the box is unchecked.
    -->
    <conditional name="is2D_LC_MS">
        <param name="fractions" type="boolean" truevalue="Yes" falsevalue="No" checked="false"
               label="Data is from 2D LC-MS"
               help="Data acquisition was done in multiple fractions."/>
        <when value="Yes">
            <param name="namingConventionCodesForFractions" type="text" size="100" value=""
                   label="Part of run/file name that identifies the 2D LC-MS fraction"
                   help="Add the CSV list of codes that occur in the file names
                   and that stand for a fraction code. E.g. '_F1,_F2,_F3,etc.' In this
                   way different peptide identifications from the same sample but measured
                   in different fractions can be merged together. Otherwise each (fraction) file
                   is seen as a separate sample."/> <!-- could do regular expressions as well but this would be hard for biologists, e.g. _F\d\b -->
        </when>
        <!-- Unchecked: no extra parameters. -->
        <when value="No"/>
    </conditional>

    <!--
      Multi-line default: literal line breaks inside an attribute are normalized
      to spaces by the XML parser, so each entry is separated with an explicit
      &#10; character reference. Keep the continuation lines at column 0 to
      avoid extra white space in the value.
    -->
    <param name="statisticalMeasuresConfig" type="text" area="true" size="6x70" label="Statistical measures configuration"
           help="Here you may specify the statistical measures that are found in the ms/ms results (e.g. p or e-values).
           The format is: SM alias => SM name,type,mode[min/max]. Leaving this configuration out while these are present in the
           dataset will have the effect that they will be wrongly used as a regular scoring scheme, having effect on for example
           the filter criteria below like 'Minimum number of peptide matches with a score above average' ."
           value="smXTD => MS:1001330,XSLASH!Tandem:expect,min
&#10;pvCSVEX => p_value,CSV_EXPORT,min
&#10;smAUTO_LIKELIHOOD => AUTOMOD_LOGLIKELIHOOD,PLGS/Auto-mod,max
&#10;smLIKELIHOOD => LOGLIKELIHOOD,PLGS/Databank-search,max
"/>
    <!-- keep value attribute above aligned like this to avoid white spaces in the value -->

    <param name="quantificationDataToUse" type="select"
           label="Quantification data to use"
           help="Quantification data to use for the pattern clustering and inference steps. NB: check if the chosen data is also
           present in your file, or choose 'auto' to let Quantifere check which quantification type is present in most peptides.">
        <option value="auto" selected="true">auto</option>
        <option value="getIntensity">(TODO)raw intensities</option>
        <option value="getApexIntensity">(TODO)apex intensities</option>
        <option value="getNormalizedIntensity">(TODO)normalized intensities</option>
    </param>

    <!-- TODO let minCorrel default value vary according to quantification type chosen above -->
    <param name="minCorrel" type="float" size="10" value="0.85" label="Minimum correlation in a cluster" help="Features will be grouped by their protein annotation and
           sample intensity values correlation. Set here the minimum correlation expected between grouped members. This is used to guide the clustering algorithm."/>

    <!-- simple extra heuristics to remove some "noise" protein hits -->
    <param name="minProtCoverage" type="float" size="10" value="5.0" label="Minimum protein coverage (%)" help="This will remove proteins that have a too small
           portion of their sequence covered by peptide matches."/>

    <param name="minAboveAverageHits" type="integer" size="10" value="1" label="Minimum number of different peptide matches with a score above average"
           help="This will remove proteins that do not have enough reasonable peptides hits."/>

    <param name="minNrIdsForInferencePeptide" type="integer" size="10" value="1" label="Minimum number of peptide identifications for inference peptides"
           help="Minimum number of peptide identifications a peptide needs to be used as inference peptide for secondary proteins."/>

    <!-- Optional accession-to-annotation mapping; its presence also enables the annotation summary output. -->
    <param name="functionalAnnotationCSV" type="data" format="csv,txt,tsv" optional="true"
           label="(Functional)annotation mapping file (csv or tsv format)"
           help="Optional file that maps protein accessions to a network, pathway or other higher level annotations. In this file a header line is expected with these 2 columns (names and lower case is important): accession,annotation"/>

    <param name="refineModel" type="boolean" checked="true" label="Refine matches model"
           help="This will let the algorithm search for a reduced set of secondary protein matches that still explains the variation in the peptide quantification patterns"/>

    <!-- Controls whether the HTML report output dataset is created (see the output filter). -->
    <param name="summaryReport" type="boolean" checked="true" label="Generate summary report"/>

</inputs>
|
|
<!--
  Generated helper files whose paths are passed on the command line.
  Template lines are kept at column 0 so no leading white space ends up in
  the generated files; lines starting with ## are Cheetah comments and
  produce no output.
-->
<configfiles>
<configfile name="annotatedQuantificationFilesList">## list of quantification files
## one line per selected quantification file, holding its path
#for $entry in $annotatedQuantificationFiles
${entry.annotatedQuantificationFile}
#end for
## end of list</configfile>

<configfile name="identificationFilesList">## list of identification files
## two lines per selected identification file: its path, then its datatype (apml or mzid)
#set $apml_class = $__app__.datatypes_registry.get_datatype_by_extension('apml').__class__
#for $entry in $identificationFiles
${entry.identificationFile}
#if isinstance( $entry.identificationFile.datatype, $apml_class )
apml
#else
mzid
#end if
#end for
## end of list</configfile>
<configfile name="statisticalMeasuresConfigFile">## statistical measures exactly as entered in the form
${statisticalMeasuresConfig}
</configfile>
</configfiles>
|
|
<!--
  Output datasets. The two CSV tables are always created; the HTML report and
  the annotation summary are only created when their filter expression holds.
-->
<outputs>
    <data name="outputCSV"
          format="csv"
          label="${tool.name} on ${on_string}: Proteins list (CSV)"/>
    <data name="outputInferenceLogCSV"
          format="csv"
          label="${tool.name} on ${on_string}: Inference log (CSV)"/>
    <data name="htmlReportFile"
          format="html"
          label="${tool.name} on ${on_string} - HTML report">
        <!-- Created only when "Generate summary report" is ticked -->
        <filter>summaryReport == True</filter>
    </data>
    <data name="outputSummaryAnnotationCSV"
          format="csv"
          label="${tool.name} on ${on_string} - Functional annotation summary (CSV)">
        <!-- Created only when an annotation mapping file was supplied -->
        <filter>functionalAnnotationCSV != None</filter>
    </data>
</outputs>
|
|
144 <tests>
|
|
145 </tests>
|
|
146 <help>
|
|
147
|
|
148 .. class:: infomark
|
|
149
|
|
150 This tool takes Peptide Quantification patterns and uses this to do Protein Inference of both Primary Protein
|
|
151 identifications as well as Secondary Protein identifications. This last class of protein identifications
|
|
152 can not be done by traditional protein inference methods that look only at peptide identifications and
|
|
153 their quality parameters.
|
|
154
|
|
155
|
|
156 -----
|
|
157
|
|
158 **List of definitions**
|
|
159
|
|
160 Primary Protein identification: protein identification belonging to the minimum set of proteins needed
|
|
161 to account for the observed peptides.
|
|
162
|
|
163 Secondary Protein identification: extra protein identifications that do not belong to the minimum set
|
|
164 of proteins mentioned above.
|
|
165
|
|
166 raw intensities : is the intensity value resulting from the integration of the feature peak area
|
|
167
|
|
168 apex intensities: is the intensity value as on the highest point of the feature peak
|
|
169
|
|
170 normalized intensities : is the intensity normalized by some means
|
|
171
|
|
172 -----
|
|
173
|
|
174 **Minimum correlation in a cluster**
|
|
175
|
|
176 TODO - add doc.
|
|
177
|
|
178 -----
|
|
179
|
|
180 **Output details**
|
|
181
|
|
182 *Proteins list (CSV)*
|
|
183
|
|
184 This is the list of primary and secondary proteins and their calculated inference score. Proteins
|
|
185 with exactly the same peptide hits are also grouped together and labeled as primary_group and secondary_group
|
|
186 instead of simply primary and secondary.
|
|
187
|
|
188
|
|
189 *Inference log (CSV)*
|
|
190
|
|
191 This CSV table shows all data, both inferred and ruled out proteins. This can be used by the user to
|
|
192 troubleshoot the inference process and understand why certain proteins might have been ruled out.
|
|
193 The CSV is provided in such a format that the data can easily be explored in a Cytoscape network.
|
|
194
|
|
195 The figure below shows an example of the data being explored in Cytoscape using also the
|
|
196 `Cytoscape chartplugin`_ to visualize the quantification data when selecting the peptide nodes.
|
|
197
|
|
198 .. image:: $PATH_TO_IMAGES/quantifere_cyto_out.png
|
|
199
|
4
|
200 .
|
0
|
201
|
|
202 .. _Cytoscape chartplugin: http://apps.cytoscape.org/apps/chartplugin
|
|
203
|
|
204
|
|
205
|
|
206 </help>
|
|
207 </tool>
|