Mercurial > repos > pieterlukasse > prims_proteomics
comparison msfilt.xml @ 0:d50f079096ee
Push to main toolshed
author | pieter.lukasse@wur.nl |
---|---|
date | Wed, 08 Jan 2014 11:39:16 +0100 |
parents | |
children | 72d4a37869ee |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d50f079096ee |
---|---|
1 <tool name="MsFilt" id="msfilt" version="1.0.2"> | |
2 <description>Filters annotations based on MS/MS peptide identification and annotation quality measures</description> | |
3 <!-- | |
4 For remote debugging start your listener on port 8000 and use the following as command interpreter: | |
5 java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000 | |
6 ////////////////////////// | |
7 --> | |
8 <command interpreter="java -jar "> | |
9 MsFilt.jar | |
10 -apmlFile $apmlFile | |
11 -datasetCode $apmlFile.metadata.base_name | |
12 -rankingMetadataFile $rankingMetadataFile | |
13 -statisticalMeasuresConfigFile $statisticalMeasuresConfigFile | |
14 -annotationSourceConfigFile $annotationSourceConfigFile | |
15 -outApml $outputApml | |
16 -outNewIdsApml $outNewIdsApml | |
17 -outFullCSV $outputCSV | |
18 -outRankingTable $outRankingTable | |
19 -outProteinCoverageCSV $outProteinCoverageCSV | |
20 -fpCriteriaExpression "$fpCriteriaExpression" | |
21 -filterOutFPAnnotations $filterOutFPAnnotations | |
22 -fpCriteriaExpressionForIds "$fpCriteriaExpressionForIds" | |
23 -filterOutFPIds $filterOutFPIds | |
24 -filterOutUnannotatedAlignments $filterOutUnannotatedAlignments | |
25 -addRawRankingInfo $addRawRankingInfo | |
26 -addScaledIntensityInfo $addScaledIntensityInfo | |
27 -addRawIntensityInfo $addRawIntensityInfo | |
28 -outReport $htmlReportFile | |
29 -outReportPicturesPath $htmlReportFile.files_path | |
30 </command> | |
31 | |
32 <inputs> | |
33 | |
34 <param name="apmlFile" type="data" format="apml" optional="true" | |
35 label="(Optional) Peptide quantification file (APML)" | |
36 help="The APML contents as aligned and annotated feature lists. E.g. produced by | |
37 SEDMAT or Quantiline tools." /> | |
38 | |
39 <repeat name="annotationSourceFiles" title="(Optional) Peptide identification files" help="Full set of MS/MS peptide identification files, including peptides that could not be quantified."> | |
40 <param name="identificationsFile" type="data" format="apml,mzidentml,prims.fileset.zip" label="Identifications file (APML or MZIDENTML or MZIDENTML fileSet)" /> | |
41 <param name="spectraFile" type="data" format="mzidentml,prims.fileset.zip" optional="true" label="(Optional) Spectra fileSet (mzml file or fileSet)" | |
42 help="Select this in case your Identifications file is MZIDENTML or MZIDENTML fileSet" /> | |
43 </repeat> | |
44 | |
45 <!-- | |
46 <param name="maxNrRankings" type="integer" size="10" value="0" label="Maximum nr. of items to leave in the final ranking (set=0 for no limit) " /> | |
47 --> | |
48 <!-- TODO add info somewhere that deltaRt is 'corrected deltaRt' --> | |
49 <param name="rankingWeightConfig" type="text" area="true" size="13x70" label="Quality Measures (qm's) and ranking weights configuration" | |
50 help="Here you may specify a weight for each of the Quality Measures (QMs). These are used for the final QM score and possibly for ranking (e.g. in case of label-free data | |
51 processed by SEDMAT). The format is: QM alias => QM name,weight. " | |
52 value="qmDRT => delta rt (standard score),1 | |
53 
qmDMA => delta mass annotation (standard score),1 | |
54 
qmDMP => delta mass psm (standard score),1 | |
55 
qmBSCR => best peptide score (standard score),1 | |
56 
qmALCV => alignment coverage (fraction),1 | |
57 
qmSTCV => score type coverage (fraction),1 | |
58 
qmPACV => peptide's best proteinAnnotCoverage (standard score),1 | |
59 
qmPICV => peptide's best proteinIdentifCoverage (standard score),1 | |
60 
qmANS => annotation sources (count),1 | |
61 
qmCSEV => charge states evidence (count),0.2 | |
62 
qmBCSP=> best correlation with source or product peptide (correl),1 | |
63 
qmBCCS => best correlation with other charge state (correl),1 | |
64 
qmBCOS => best correlation with other sibling peptide (correl),1 | |
65 "/> | |
66 | |
67 <param name="statisticalMeasuresConfig" type="text" area="true" size="6x70" label="Statistical measures configuration" | |
68 help="Here you may specify the statistical measures that are found in the ms/ms results (e.g. p or e-values). | |
69 The format is: SM alias => SM name,type,mode[min/max]. " | |
70 value="smXTD => MS:1001330,XSLASH!Tandem:expect,min | |
71 
pvCSVEX => p_value,CSV_EXPORT,min | |
72 
smAUTO_LIKELIHOOD => AUTOMOD_LOGLIKELIHOOD,PLGS/Auto-mod,max | |
73 
smLIKELIHOOD => LOGLIKELIHOOD,PLGS/Databank-search,max | |
74 "/> | |
75 | |
76 <param name="filterOutUnannotatedAlignments" type="boolean" checked="true" | |
77 label="Filter out unannotated alignments" | |
78 help="This helps decrease the output file size (features with no annotation are then not reported anymore)"/> | |
79 | |
80 <param name="filterOutFPAnnotations" type="boolean" checked="true" | |
81 label="Filter out False Positive (FP) annotations" /> | |
82 | |
83 <param name="fpCriteriaExpression" type="text" size="120" label="False Positive (FP) criteria for annotations" | |
84 help="Criteria (in standard score measures) for classifying an annotation as False Positive (FP). | |
85 You can build logical rules using the QM aliases above, the keywords 'and', 'or' and parentheses. | |
86 Comparisons can be made with '==,<,>,<=,>='" | |
87 value="qmDRT <0 or qmDMA <-0.5 or (qmDMP <-0.5 and qmBSCR<-0.5) or (!isNaN(smXTD) and smXTD >0.01)"/> | |
88 | |
89 | |
90 <param name="filterOutFPIds" type="boolean" checked="true" | |
91 label="Filter out False Positive (FP) peptide identifications" /> | |
92 | |
93 <param name="fpCriteriaExpressionForIds" type="text" size="120" | |
94 label="False Positive (FP) criteria for identifications" | |
95 help="Criteria (in standard score measures) for classifying a peptide identification as False Positive (FP). | |
96 Here you can use a subset of the quality measures (qmDMP, qmBSCR, qmSTCV, qmPICV, qmCSEV) and all statistical measures." | |
97 value="(qmDMP <-0.5 and qmBSCR<-0.5) or (!isNaN(smXTD) and smXTD >0.01)"/> | |
98 | |
99 | |
100 <param name="addRawRankingInfo" type="boolean" checked="false" | |
101 label="Include the raw scores/values of the ranking attributes in the CSV output" | |
102 help="This will result in one extra column per ranking attribute, each column holding the original data for this attribute (before normalization)."/> | |
103 | |
104 <param name="addScaledIntensityInfo" type="boolean" checked="false" | |
105 label="Include computed scaled intensity values in the CSV output" | |
106 help="The autoscaled and 'z-score'scaled (aka 'standard-score'scaled) intensity values are then added to the full CSV output file"/> | |
107 | |
108 <param name="addRawIntensityInfo" type="boolean" checked="false" | |
109 label="Include the raw intensity values in the CSV output" | |
110 help="The original intensity values (as found in the input file) are then added to the full CSV output file"/> | |
111 | |
112 | |
113 </inputs> | |
114 <configfiles> | |
115 <configfile name="rankingMetadataFile">${rankingWeightConfig}</configfile> | |
116 <configfile name="statisticalMeasuresConfigFile">${statisticalMeasuresConfig}</configfile> | |
117 <configfile name="annotationSourceConfigFile">## start comment | |
118 ## iterate over the selected files and store their names in the config file | |
119 #for $i, $s in enumerate( $annotationSourceFiles ) | |
120 ${s.identificationsFile}|${s.spectraFile} | |
121 ## also print out the datatype in the next line, based on previously configured datatype | |
122 #if isinstance( $s.identificationsFile.datatype, $__app__.datatypes_registry.get_datatype_by_extension('apml').__class__): | |
123 apml | |
124 #else: | |
125 mzid | |
126 #end if | |
127 #end for | |
128 ## end comment</configfile> | |
129 </configfiles> | |
130 <outputs> | |
131 <data name="outputApml" format="apml" label="${apmlFile.metadata.base_name} - ${tool.name} on ${on_string}: quantifications (filtered APML)" metadata_source="apmlFile"> | |
132 <!-- If the expression is false, the file is not created --> | |
133 <filter>( apmlFile != None )</filter> | |
134 </data> | |
135 <data name="outNewIdsApml" format="apml" label="${tool.name} on ${on_string}: identifications (filtered APML)" > | |
136 <filter>( filterOutFPIds == True )</filter> | |
137 </data> | |
138 <data name="outputCSV" format="csv" label="${apmlFile.metadata.base_name} - ${tool.name} on ${on_string}: Full CSV" metadata_source="apmlFile"> | |
139 <filter>( apmlFile != None )</filter> | |
140 </data> | |
141 <data name="outRankingTable" format="csv" label="${apmlFile.metadata.base_name} - ${tool.name} on ${on_string}: Ranking table (CSV)" metadata_source="apmlFile"> | |
142 <filter>( apmlFile != None )</filter> | |
143 </data> | |
144 <data name="outProteinCoverageCSV" format="csv" label="${tool.name} on ${on_string}: Protein coverage details (CSV)"> | |
145 <!-- If the expression is false, the file is not created --> | |
146 <filter>( len(list(enumerate(annotationSourceFiles))) > 0 )</filter> | |
147 </data> | |
148 <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - HTML report"/> | |
149 </outputs> | |
150 <tests> | |
151 </tests> | |
152 <help> | |
153 | |
154 .. class:: infomark | |
155 | |
156 This tool takes in peptide quantification results (e.g. either by SEDMAT for label-free data or by Quantiline for labeled data) | |
157 and calculates a number of quality measures that can help in assessing the correctness of the quantification assignment and of the MS/MS peptide | |
158 identification itself. The user can use any combination of quality measures (qm's) and statistical measures (sm's) to filter out | |
159 low scoring entries. | |
160 | |
161 .. class:: infomark | |
162 | |
163 In the label-free data processed by SEDMAT it is possible that a feature quantification gets assigned to different peptides. This means | |
164 we have an ambiguous assignment. In such a case | |
165 this tool also does a ranking of the different assignments according to their quality measures so that the best scoring assignment | |
166 gets ranked as first. | |
167 | |
168 ----- | |
169 | |
170 **List of abbreviations** | |
171 | |
172 QM: Quality Measure | |
173 | |
174 SM: Statistical Measure (e.g. p-value, e-value from MS/MS identification) | |
175 | |
176 PSM: "Peptide to Spectrum Match" (aka peptide identification) | |
177 | |
178 FP: False Positive | |
179 | |
180 ----- | |
181 | |
182 **Filtering options details** | |
183 | |
184 The FP criteria will be applied to an annotation even if the corresponding quality measures involved | |
185 in the expression can NOT ALL be determined. QMs that cannot be determined get the value 0 (zero), which is | |
186 equivalent to giving them the average value. | |
187 | |
188 The output report shows some plots that visualize the filtering done. This can help in fine-tuning the right filtering | |
189 criteria. | |
190 | |
191 ----- | |
192 | |
193 **Output details** | |
194 | |
195 *APML output* | |
196 | |
197 This tool returns the given APML alignment file further annotated at the alignment level with the best ranking | |
198 peptides of each respective alignment. This APML can be used in subsequent Galaxy tools like the proteomics tools | |
199 from NBIC. | |
200 | |
201 The APML output can also be used for the Protein Inference step (see Quantifere tool). | |
202 | |
203 *CSV output* | |
204 | |
205 It also returns a CSV format output with the full quality measures and scoring and ranking details. The user could use | |
206 this to manually determine new weights for some of the quality measures by techniques such as | |
207 linear regression. In other words, this CSV can then be used to fine-tune the weights in a next run. | |
208 | |
209 Many of the quality measures (QMs) are normalized to their Standard Score (aka z-score). | |
210 `See Standard Score for more details...`__ | |
211 | |
212 Next to giving insight into how the ranking was established, a more complete version of this CSV file is also | |
213 generated for tools that cannot or won't process the APML output format. | |
214 | |
215 Below is a brief overview of the CSV and an illustration of the ranking done in case of ambiguous peptide-to-feature assignments | |
216 (explained above, can happen in case of label-free data processing by SEDMAT). | |
217 | |
218 | |
219 .. image:: $PATH_TO_IMAGES/msfilt_csv_out.png | |
220 | |
221 | |
222 | |
223 .. __: javascript:window.open('http://en.wikipedia.org/wiki/Standard_score','popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes') | |
224 | |
225 | |
226 | |
227 | |
228 </help> | |
229 </tool> |