comparison msfilt.xml @ 0:d50f079096ee

Push to main toolshed
author pieter.lukasse@wur.nl
date Wed, 08 Jan 2014 11:39:16 +0100
parents
children 72d4a37869ee
comparison
equal deleted inserted replaced
-1:000000000000 0:d50f079096ee
1 <tool name="MsFilt" id="msfilt" version="1.0.2">
2 <description>Filters annotations based on MS/MS peptide identification and annotation quality measures</description>
3 <!--
4 For remote debugging start your listener on port 8000 and use the following as command interpreter:
5 java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000
6 //////////////////////////
7 -->
8 <!-- Runs MsFilt.jar through "java -jar". Each $name below is substituted from the input parameter, configfile or output with that name declared further down in this tool. --> <command interpreter="java -jar ">
9 MsFilt.jar
10 -apmlFile $apmlFile
11 -datasetCode $apmlFile.metadata.base_name
12 -rankingMetadataFile $rankingMetadataFile
13 -statisticalMeasuresConfigFile $statisticalMeasuresConfigFile
14 -annotationSourceConfigFile $annotationSourceConfigFile
15 -outApml $outputApml
16 -outNewIdsApml $outNewIdsApml
17 -outFullCSV $outputCSV
18 -outRankingTable $outRankingTable
19 -outProteinCoverageCSV $outProteinCoverageCSV
20 -fpCriteriaExpression "$fpCriteriaExpression"
21 -filterOutFPAnnotations $filterOutFPAnnotations
22 -fpCriteriaExpressionForIds "$fpCriteriaExpressionForIds"
23 -filterOutFPIds $filterOutFPIds
24 -filterOutUnannotatedAlignments $filterOutUnannotatedAlignments
25 -addRawRankingInfo $addRawRankingInfo
26 -addScaledIntensityInfo $addScaledIntensityInfo
27 -addRawIntensityInfo $addRawIntensityInfo
28 -outReport $htmlReportFile
29 -outReportPicturesPath $htmlReportFile.files_path
30 </command>
31
32 <inputs>
33 <!-- Optional quantification input; the outputs derived from it carry a filter on apmlFile being set. -->
34 <param name="apmlFile" type="data" format="apml" optional="true"
35 label="(Optional) Peptide quantification file (APML)"
36 help="The APML contents as aligned and annotated feature lists. E.g. produced by
37 SEDMAT or Quantiline tools." />
38 <!-- Zero or more identification sources; each entry becomes one record in the annotationSourceConfigFile configfile. spectraFile accepts mzml, matching its label, next to mzidentml and fileSet archives. -->
39 <repeat name="annotationSourceFiles" title="(Optional) Peptide identification files" help="Full set of MS/MS peptide identification files, including peptides that could not be quantified.">
40 <param name="identificationsFile" type="data" format="apml,mzidentml,prims.fileset.zip" label="Identifications file (APML or MZIDENTML or MZIDENTML fileSet)" />
41 <param name="spectraFile" type="data" format="mzml,mzidentml,prims.fileset.zip" optional="true" label="(Optional) Spectra fileSet (mzml file or fileSet)"
42 help="Select this in case your Identifications file is MZIDENTML or MZIDENTML fileSet" />
43 </repeat>
44
45 <!--
46 <param name="maxNrRankings" type="integer" size="10" value="0" label="Maximum nr. of items to leave in the final ranking (set=0 for no limit) " />
47 -->
48 <!-- TODO add info somewhere that deltaRt is 'corrected deltaRt' -->
49 <param name="rankingWeightConfig" type="text" area="true" size="13x70" label="Quality Measures (qm's) and ranking weights configuration"
50 help="Here you may specify a weight for each of the Quality Measures (QMs). These are used for the final QM score and possibly for ranking (e.g. in case of label-free data
51 processed by SEDMAT). The format is: QM alias => QM name,weight. "
52 value="qmDRT =&gt; delta rt (standard score),1
53 &#xd;&#xa;qmDMA =&gt; delta mass annotation (standard score),1
54 &#xd;&#xa;qmDMP =&gt; delta mass psm (standard score),1
55 &#xd;&#xa;qmBSCR =&gt; best peptide score (standard score),1
56 &#xd;&#xa;qmALCV =&gt; alignment coverage (fraction),1
57 &#xd;&#xa;qmSTCV =&gt; score type coverage (fraction),1
58 &#xd;&#xa;qmPACV =&gt; peptide's best proteinAnnotCoverage (standard score),1
59 &#xd;&#xa;qmPICV =&gt; peptide's best proteinIdentifCoverage (standard score),1
60 &#xd;&#xa;qmANS =&gt; annotation sources (count),1
61 &#xd;&#xa;qmCSEV =&gt; charge states evidence (count),0.2
62 &#xd;&#xa;qmBCSP=&gt; best correlation with source or product peptide (correl),1
63 &#xd;&#xa;qmBCCS =&gt; best correlation with other charge state (correl),1
64 &#xd;&#xa;qmBCOS =&gt; best correlation with other sibling peptide (correl),1
65 "/>
66 <!-- The text area above and the one below are written to the rankingMetadataFile and statisticalMeasuresConfigFile configfiles, respectively. -->
67 <param name="statisticalMeasuresConfig" type="text" area="true" size="6x70" label="Statistical measures configuration"
68 help="Here you may specify the statistical measures that are found in the ms/ms results (e.g. p or e-values).
69 The format is: SM alias => SM name,type,mode[min/max]. "
70 value="smXTD =&gt; MS:1001330,XSLASH!Tandem:expect,min
71 &#xd;&#xa;pvCSVEX =&gt; p_value,CSV_EXPORT,min
72 &#xd;&#xa;smAUTO_LIKELIHOOD =&gt; AUTOMOD_LOGLIKELIHOOD,PLGS/Auto-mod,max
73 &#xd;&#xa;smLIKELIHOOD =&gt; LOGLIKELIHOOD,PLGS/Databank-search,max
74 "/>
75 <!-- Filter switches and FP expressions; each is passed to MsFilt as the command-line option of the same name. -->
76 <param name="filterOutUnannotatedAlignments" type="boolean" checked="true"
77 label="Filter out unannotated alignments"
78 help="This helps decrease the output file size (features with no annotation are then not reported anymore)"/>
79
80 <param name="filterOutFPAnnotations" type="boolean" checked="true"
81 label="Filter out False Positive (FP) annotations" />
82
83 <param name="fpCriteriaExpression" type="text" size="120" label="False Positive (FP) criteria for annotations"
84 help="Criteria (in standard score measures) for classifying an annotation as False Positive (FP).
85 You can build logical rules using the QM aliases above, the keywords 'and', 'or' and parentheses.
86 Comparisons can be made with '==, &lt;, &gt;, &lt;=, &gt;='"
87 value="qmDRT &lt;0 or qmDMA &lt;-0.5 or (qmDMP &lt;-0.5 and qmBSCR&lt;-0.5) or (!isNaN(smXTD) and smXTD &gt;0.01)"/>
88
89
90 <param name="filterOutFPIds" type="boolean" checked="true"
91 label="Filter out False Positive (FP) peptide identifications" />
92
93 <param name="fpCriteriaExpressionForIds" type="text" size="120"
94 label="False Positive (FP) criteria for identifications"
95 help="Criteria (in standard score measures) for classifying a peptide identification as False Positive (FP).
96 Here you can use a subset of the quality measures (qmDMP, qmBSCR, qmSTCV, qmPICV, qmCSEV) and all statistical measures."
97 value="(qmDMP &lt;-0.5 and qmBSCR&lt;-0.5) or (!isNaN(smXTD) and smXTD &gt;0.01)"/>
98
99 <!-- Optional extra columns for the CSV output. -->
100 <param name="addRawRankingInfo" type="boolean" checked="false"
101 label="Include the raw scores/values of the ranking attributes in the CSV output"
102 help="This will result in one extra column per ranking attribute, each column holding the original data for this attribute (before normalization)."/>
103
104 <param name="addScaledIntensityInfo" type="boolean" checked="false"
105 label="Include computed scaled intensity values in the CSV output"
106 help="The autoscaled and 'z-score'scaled (aka 'standard-score'scaled) intensity values are then added to the full CSV output file"/>
107
108 <param name="addRawIntensityInfo" type="boolean" checked="false"
109 label="Include the raw intensity values in the CSV output"
110 help="The original intensity values (as found in the input file) are then added to the full CSV output file"/>
111
112
113 </inputs>
114 <configfiles> <!-- Generated from the inputs: the two text-area parameters are copied into rankingMetadataFile and statisticalMeasuresConfigFile; annotationSourceConfigFile gets, for every entry of the annotationSourceFiles repeat, an "identificationsFile|spectraFile" line followed by a line holding "apml" or "mzid" depending on the datatype of the identifications file. -->
115 <configfile name="rankingMetadataFile">${rankingWeightConfig}</configfile>
116 <configfile name="statisticalMeasuresConfigFile">${statisticalMeasuresConfig}</configfile>
117 <configfile name="annotationSourceConfigFile">## start comment
118 ## iterate over the selected files and store their names in the config file
119 #for $i, $s in enumerate( $annotationSourceFiles )
120 ${s.identificationsFile}|${s.spectraFile}
121 ## also print out the datatype in the next line, based on previously configured datatype
122 #if isinstance( $s.identificationsFile.datatype, $__app__.datatypes_registry.get_datatype_by_extension('apml').__class__):
123 apml
124 #else:
125 mzid
126 #end if
127 #end for
128 ## end comment</configfile>
129 </configfiles>
130 <outputs> <!-- Outputs that carry a filter depend on the inputs named in that filter expression; see the notes on the individual filters. -->
131 <data name="outputApml" format="apml" label="${apmlFile.metadata.base_name} - ${tool.name} on ${on_string}: quantifications (filtered APML)" metadata_source="apmlFile">
132 <!-- If the expression is false, the file is not created -->
133 <filter>( apmlFile != None )</filter>
134 </data>
135 <data name="outNewIdsApml" format="apml" label="${tool.name} on ${on_string}: identifications (filtered APML)" >
136 <filter>( filterOutFPIds == True )</filter> <!-- tied to the "Filter out False Positive (FP) peptide identifications" switch -->
137 </data>
138 <data name="outputCSV" format="csv" label="${apmlFile.metadata.base_name} - ${tool.name} on ${on_string}: Full CSV" metadata_source="apmlFile">
139 <filter>( apmlFile != None )</filter> <!-- same condition as outputApml: requires the optional APML input -->
140 </data>
141 <data name="outRankingTable" format="csv" label="${apmlFile.metadata.base_name} - ${tool.name} on ${on_string}: Ranking table (CSV)" metadata_source="apmlFile">
142 <filter>( apmlFile != None )</filter> <!-- same condition as outputApml: requires the optional APML input -->
143 </data>
144 <data name="outProteinCoverageCSV" format="csv" label="${tool.name} on ${on_string}: Protein coverage details (CSV)">
145 <!-- If the expression is false, the file is not created -->
146 <filter>( len(list(enumerate(annotationSourceFiles))) > 0 )</filter> <!-- true when the annotationSourceFiles repeat has at least one entry -->
147 </data>
148 <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - HTML report"/> <!-- has no filter; its files_path is handed to MsFilt as the outReportPicturesPath option -->
149 </outputs>
150 <tests> <!-- No tests are defined for this tool. -->
151 </tests>
152 <help>
153
154 .. class:: infomark
155
156 This tool takes in peptide quantification results (e.g. either by SEDMAT for label-free data or by Quantiline for labeled data)
157 and calculates a number of quality measures that can help in assessing the correctness of the quantification assignment and of the MS/MS peptide
158 identification itself. The user can use any combination of quality measures (qm's) and statistical measures (sm's) to filter out
159 low scoring entries.
160
161 .. class:: infomark
162
163 In the label-free data processed by SEDMAT it is possible that a feature quantification gets assigned to different peptides. This means
164 we have an ambiguous assignment. In such a case
165 this tool also does a ranking of the different assignments according to their quality measures so that the best scoring assignment
166 gets ranked as first.
167
168 -----
169
170 **List of abbreviations**
171
172 QM: Quality Measure
173
174 SM: Statistical Measure (e.g. p-value, e-value from MS/MS identification)
175
176 PSM: "Peptide to Spectrum Match" (aka peptide identification)
177
178 FP: False Positive
179
180 -----
181
182 **Filtering options details**
183
184 The FP criteria will be applied to an annotation even if not all of the quality measures involved
185 in the expression can be determined. QMs that cannot be determined get the value 0 (zero), which is
186 equivalent to giving them the average value.
187
188 The output report shows some plots that visualize the filtering done. This can help in fine-tuning the right filtering
189 criteria.
190
191 -----
192
193 **Output details**
194
195 *APML output*
196
197 This tool returns the given APML alignment file further annotated at the alignment level with the best ranking
198 peptides of each respective alignment. This APML can be used in subsequent Galaxy tools like the proteomics tools
199 from NBIC.
200
201 The APML output can also be used for the Protein Inference step (see Quantifere tool).
202
203 *CSV output*
204
205 It also returns a CSV format output with the full quality measures and scoring and ranking details. The user could use
206 this to manually determine new weights for some of the quality measures by techniques such as
207 linear regression. In other words, this CSV can then be used to fine-tune the weights in a next run.
208
209 Many of the quality measures (QMs) are normalized to their Standard Score (aka z-score).
210 `See Standard Score for more details...`__
211
212 Next to giving insight into how the ranking was established, a more complete version of this CSV file is also
213 generated for tools that cannot or won't process the APML output format.
214
215 Below is a brief overview of the CSV and an illustration of the ranking done in case of ambiguous peptide-to-feature assignments
216 (explained above, can happen in case of label-free data processing by SEDMAT).
217
218
219 .. image:: $PATH_TO_IMAGES/msfilt_csv_out.png
220
221
222
223 .. __: javascript:window.open('http://en.wikipedia.org/wiki/Standard_score','popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes')
224
225
226
227
228 </help>
229 </tool>