Mercurial > repos > pieterlukasse > prims_metabolomics
annotate msclust.xml @ 31:31e6e2242d33
small fix in doc
author | pieter.lukasse@wur.nl |
---|---|
date | Sat, 30 Aug 2014 16:21:32 +0200 |
parents | 60b53f2aa48a |
children | 746cb34f10ed |
rev | line source |
---|---|
28
b1015b9241cd
Added option for using retentionMean (micro minutes) for the time dimension
pieter.lukasse@wur.nl
parents:
21
diff
changeset
|
1 <tool name="MsClust" id="msclust2" version="2.0.4"> |
3 | 2 <description>Extracts fragmentation spectra from aligned data</description> |
3 <!-- | |
4 For remote debugging start your listener on port 8000 and use the following as command interpreter: | |
5 java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000 | |
6 ////////////////////////// | |
7 | |
8 TODO in command below: add conditionals according to options of using or NOT the tolerances/thresholds from previous steps | |
9 --> | |
10 <command interpreter="java -jar "> | |
## Builds the MsClust.jar argument list from the tool parameters and output datasets declared in this file.
11 MsClust.jar | |
12 -peaksFileName $inputPeaks | |
13 -dataType $dataType | |
14 -imputationMethod $imputationMethod.type | |
15 #if $imputationMethod.type == "valueRange" | |
16 -rangeUpperLimit $imputationMethod.rangeUpperLimit | |
17 #end if | |
18 -plInputFormat "metalign" | |
19 -potDensFuncType $potDensFuncType.type | |
20 -centerSelectionType $centerSelectionType.type | |
21 -clusteringType $clusteringType.type | |
22 -neighborhoodWindowSize $potDensFuncType.pdf_neighborhoodWindowSize | |
23 -clusterSearchStopCriterium $centerSelectionType.cs_stop_criterion | |
24 -pearsonDistTreshold $potDensFuncType.pdf_pears_treshold | |
25 -pearsonTresholdConfidence $potDensFuncType.pdf_pears_conf | |
26 -pearsonPDReductionThreshold $centerSelectionType.cs_pears_pd_reductionTreshold | |
27 -pearsonPDReductionSlope $centerSelectionType.cs_pears_pd_reductionSlope | |
28
b1015b9241cd
Added option for using retentionMean (micro minutes) for the time dimension
pieter.lukasse@wur.nl
parents:
21
diff
changeset
|
28 -rtDistTolUnit $potDensFuncType.rt_dist_tol_unit.type |
b1015b9241cd
Added option for using retentionMean (micro minutes) for the time dimension
pieter.lukasse@wur.nl
parents:
21
diff
changeset
|
29 -rtDistTol $potDensFuncType.rt_dist_tol_unit.pdf_rt_toler |
b1015b9241cd
Added option for using retentionMean (micro minutes) for the time dimension
pieter.lukasse@wur.nl
parents:
21
diff
changeset
|
30 -rtDistanceConfidence $potDensFuncType.pdf_scan_conf |
3 | 31 -centrotypesOut $centrotypesOut |
32 -simOut $simOut | |
33 -micOut $micOut | |
34 -mspOut $mspOut | |
35 -classOut $classOut | |
36 -outReport $htmlReportFile | |
37 -outReportPicturesPath $htmlReportFile.files_path | |
38 #if $clusteringType.type == "fuzzyCMeans" | |
39 -fcmMembershipWeightingExponent $clusteringType.fcmMembershipWeightingExponent | |
40 -fcmStopCriterion $clusteringType.fcmStopCriterion | |
41 -fcmCorrelationWeight $clusteringType.fcmCorrelationWeight | |
42 -fcmFinalAssemblyType $clusteringType.finalClusterAssembly.type | |
43 #if $clusteringType.finalClusterAssembly.type == "membershipBased" | |
44 -fcmMembershipCutoff $clusteringType.finalClusterAssembly.fcmMembershipCutoff | |
45 #end if | |
46 #end if | |
47 -verbose "false" | |
## The advanced-settings checkbox renders as its declared truevalue ("Yes") or falsevalue ("No").
## Compare the string form against "Yes" (same value as the matching when-branch) instead of a bare "== True" test on the wrapped parameter.
48 #if str($advancedSettings.settings) == "Yes" | |
49 -advancedSettings YES | |
50 -saturationLimit $advancedSettings.saturationLimit | |
51 -sampleSelectionSortType $advancedSettings.sampleSelectionSortType | |
52 -simSelectionAlgorithm $advancedSettings.simSelectionAlgorithm | |
53 -simMassFilter "$advancedSettings.simMassFilter" | |
54 -simMembershipThreshold $advancedSettings.simMembershipThreshold | |
55 -simSaturationThreshold $advancedSettings.simSaturationThreshold | |
56 -simAbsenseThreshold $advancedSettings.simAbsenseThreshold | |
57 -micMembershipThreshold $advancedSettings.micMembershipThreshold | |
58 -peakIntensityCorrectionAlgorithm $advancedSettings.peakIntensityCorrectionAlgorithm | |
59 #else | |
## Advanced settings not selected: pass the two values that are preselected in the advanced form (SIM_INTENSITY, CORRELATION_BASED).
60 -advancedSettings YES | |
61 -sampleSelectionSortType SIM_INTENSITY | |
62 -peakIntensityCorrectionAlgorithm CORRELATION_BASED | |
63 #end if | |
64 | |
65 </command> | |
66 <inputs> | |
30
60b53f2aa48a
Small fixes, added microminutes support to MsClust, removed TIC or MsClust output
pieter.lukasse@wur.nl
parents:
29
diff
changeset
|
67 |
<!-- Primary input dataset and its data type; the command template passes them as -peaksFileName and -dataType. -->
3 | 68 <param name="inputPeaks" type="data" format="txt" label="Ion-wise aligned data (e.g. MetAlign output data)" /> |
69 <param name="dataType" type="select" size="30" label="Data type"> | |
70 <option value="gcms" selected="true">GC-MS</option> | |
71 <option value="lcms">LC-MS</option> | |
72 </param> | |
<!-- Imputation method selector. The chosen type is always passed as -imputationMethod; rangeUpperLimit is only
     collected for the "valueRange" option, and the command template only adds -rangeUpperLimit in that case.
     The "metot" and "none" branches are intentionally empty. -->
73 <conditional name="imputationMethod"> | |
74 <param name="type" type="select" size="30" label="Select the approach used for imputing missing values (optional)" help="select how you generated the values to fill in the data gaps"> | |
75 <option value="none" >none</option> | |
76 <option value="metot" selected="true">MeTot</option> | |
77 <option value="valueRange">Values range</option> | |
78 </param> | |
79 <when value="valueRange"> | |
80 <param name="rangeUpperLimit" type="integer" size="10" value="0" label="Range upper limit" help="values up to this limit will be considered 'generated' values" /> | |
81 </when> | |
19 | 82 <when value="metot"> |
83 </when> | |
84 <when value="none"> | |
85 </when> | |
3 | 86 </conditional> |
<!-- Potential-density (PD) function settings. Only the "original" type is offered.
     The nested rt_dist_tol_unit conditional selects the peak time unit (1 = scan nr, 2 = micro minutes) and
     defines pdf_rt_toler in both branches with a unit-specific default (10 scans or 100000 micro minutes). -->
87 <conditional name="potDensFuncType"> | |
88 <param name="type" type="select" size="30" label="Select PD function type ====================================================="> | |
89 <option value="original" selected="true">Original</option> | |
90 </param> | |
91 <when value="original"> | |
92 <param name="pdf_neighborhoodWindowSize" type="integer" size="10" value="200" label="Effective Peaks" /> | |
28
b1015b9241cd
Added option for using retentionMean (micro minutes) for the time dimension
pieter.lukasse@wur.nl
parents:
21
diff
changeset
|
93 <conditional name="rt_dist_tol_unit"> |
b1015b9241cd
Added option for using retentionMean (micro minutes) for the time dimension
pieter.lukasse@wur.nl
parents:
21
diff
changeset
|
94 <param name="type" type="select" size="30" label="Peak time unit"> |
b1015b9241cd
Added option for using retentionMean (micro minutes) for the time dimension
pieter.lukasse@wur.nl
parents:
21
diff
changeset
|
95 <option value="1" selected="true">scan nr</option> |
29 | 96 <option value="2" >(average) micro minutes</option> |
28
b1015b9241cd
Added option for using retentionMean (micro minutes) for the time dimension
pieter.lukasse@wur.nl
parents:
21
diff
changeset
|
97 </param> |
29 | 98 <when value="1"> |
28
b1015b9241cd
Added option for using retentionMean (micro minutes) for the time dimension
pieter.lukasse@wur.nl
parents:
21
diff
changeset
|
99 <param name="pdf_rt_toler" type="float" size="10" value="10" label="Peak Width, in scans" /> |
b1015b9241cd
Added option for using retentionMean (micro minutes) for the time dimension
pieter.lukasse@wur.nl
parents:
21
diff
changeset
|
100 </when> |
29 | 101 <when value="2"> |
28
b1015b9241cd
Added option for using retentionMean (micro minutes) for the time dimension
pieter.lukasse@wur.nl
parents:
21
diff
changeset
|
102 <param name="pdf_rt_toler" type="float" size="10" value="100000" label="Peak Width, in micro minutes" help="e.g. 100,000=6 seconds" /> |
b1015b9241cd
Added option for using retentionMean (micro minutes) for the time dimension
pieter.lukasse@wur.nl
parents:
21
diff
changeset
|
103 </when> |
b1015b9241cd
Added option for using retentionMean (micro minutes) for the time dimension
pieter.lukasse@wur.nl
parents:
21
diff
changeset
|
104 </conditional> |
3 | 105 <param name="pdf_scan_conf" type="float" size="10" value="80" label="Peak Width confidence (0.0 to 99.99)" help="example: 0[no confidence]...50[good guess]...99.9[quite certain])" /> |
106 <param name="pdf_pears_treshold" type="float" size="10" value="0.8" label="Correlation threshold (0.0 - 1.0)" /> | |
107 <param name="pdf_pears_conf" type="float" size="10" value="98.0" label="Correlation threshold confidence (0.0 to 99.99)" help="example: 0[no confidence]...50[good guess]...99.9[quite certain])" /> | |
108 </when> | |
109 </conditional> | |
<!-- Initial cluster-center selection settings. Only the "original" type is offered; its three values are passed by
     the command template as -pearsonPDReductionThreshold, -pearsonPDReductionSlope and -clusterSearchStopCriterium. -->
110 <conditional name="centerSelectionType"> | |
111 <param name="type" type="select" label="Initial Centers selection type ==================================================" > | |
112 <option value="original" selected="true">Original - Subtractive potential reductions with stop criterion and REUSE tolerances (from PD function)</option> | |
113 </param> | |
114 <when value="original"> | |
115 <param name="cs_pears_pd_reductionTreshold" type="float" size="10" value="0.8" label="Potential Density reduction (0.0 - 1.0)" /> | |
116 <param name="cs_pears_pd_reductionSlope" type="float" size="10" value="0.01" label="Potential Density reduction softness " /> | |
117 <param name="cs_stop_criterion" type="float" size="10" value="2" label="Stop Criterion " /> | |
118 </when> | |
119 </conditional> | |
<!-- Classification method. "original" takes no extra parameters. "fuzzyCMeans" exposes the fcm* parameters, which the
     command template only passes when that type is selected; fcmMembershipCutoff is additionally limited to the
     "membershipBased" final cluster assembly. The "originalNewTol" branch below is commented out (an unimplemented idea). -->
120 <conditional name="clusteringType"> | |
121 <param name="type" type="select" label="Classify using ==========================================================="> | |
122 <option value="original" selected="true">Original - Fuzzy clustering, keep original centers and REUSE (scan distance) tolerances</option> | |
123 <option value="fuzzyCMeans">(experimental) Fuzzy C-Means - Fuzzy clustering, optimize centers</option> | |
124 </param> | |
125 <when value="original"> | |
126 <!-- nothing --> | |
127 </when> | |
19 | 128 <!-- one idea would be to have clustering specific tolerance values, not reusing the centrotype selection ones |
3 | 129 <when value="originalNewTol"> |
130 <param name="clust_scan_toler" type="float" size="10" value="10" label="Peak Width, in scans" /> | |
131 <param name="clust_scan_slope" type="float" size="10" value="2" label="Peak Width margin softness" /> | |
132 </when> | |
19 | 133 --> |
3 | 134 <when value="fuzzyCMeans"> |
135 <param name="fcmMembershipWeightingExponent" type="float" size="10" value="2.0" label="Membership Weighting Exponent" help="Influences cluster center repositioning in the iterations 1.1 (exploratory) to around 3.0 (conservative)" /> | |
136 <param name="fcmStopCriterion" type="float" size="10" value="0.05" label="Stop Criterion" help="When convergence is 'reached' (e.g. 0.05 means memberships only changed with 5% in last iteration)" /> | |
137 <param name="fcmCorrelationWeight" type="float" size="10" value="2" label="Correlation weight factor" help="Increase this if you think the correlation is reliable (e.g. you have a high number of samples)" /> | |
138 <conditional name="finalClusterAssembly"> | |
139 <param name="type" type="select" label="Final cluster assembly" > | |
140 <option value="original" selected="true">Original - distance based</option> | |
141 <option value="membershipBased">Membership based</option> | |
142 </param> | |
143 <when value="membershipBased"> | |
144 <param name="fcmMembershipCutoff" type="select" label="Maximum allowed peak overlap" > | |
145 <option value="0.05" >~7 clusters</option> | |
146 <option value="0.10" >~5 clusters</option> | |
147 <option value="0.20" >~3 clusters</option> | |
148 </param> | |
149 </when> | |
150 <when value="original"> | |
151 <!-- nothing --> | |
152 </when> | |
153 </conditional> | |
154 </when> | |
155 </conditional> | |
156 | |
<!-- Checkbox referenced by the filter on the htmlReportFile output: the HTML report dataset is only created when this is ticked. -->
157 <param name="summaryReport" type="boolean" checked="true" label="Generate summary report" help="NB: this will increase the processing time (in some cases up to a few extra minutes)"/> | |
158 | |
<!-- Advanced settings toggle. The boolean test parameter renders as "Yes" or "No" (its truevalue/falsevalue), which
     selects the matching when-branch. All parameters in the "Yes" branch are forwarded by the command template;
     the "No" branch is intentionally empty. -->
159 <conditional name="advancedSettings"> | |
160 <param name="settings" type="boolean" truevalue="Yes" falsevalue="No" checked="false" label="Advanced settings ========================================================"/> | |
161 <when value="Yes"> | |
162 <param name="saturationLimit" optional="true" type="integer" size="10" label="Saturation limit (optional)" help="fill in if you have saturation problems in your data" /> | |
163 <param name="sampleSelectionSortType" type="select" label="Sample selection scheme for spectrum peak intensity correction algorithm (optional/experimental)" help="The intensity values to use to select the samples for each cluster/metabolite in which it is most intense/abundant. These samples are used in the peak intensity correction (see parameter below). Use this option to try to avoid samples that have insufficient signal or saturation." > | |
164 <option value="None">None</option> | |
165 <!-- in order of best FORWARD scoring when tested on /test/data/report_test_sets/(P2) Relative peak heights in spectra/Input (Test set 1) --> | |
166 <option value="SIM_INTENSITY" selected="true">SIM intensities</option> | |
167 <option value="MAX_INTENSITY">Maximum intensities</option> | |
168 <option value="CENTROTYPE_INTENSITY">Centrotype peak intensities</option> | |
169 <option value="MIC_INTENSITY">MIC intensities</option> | |
170 </param> | |
171 <param name="peakIntensityCorrectionAlgorithm" type="select" label="Spectrum peak intensity correction algorithm (optional/experimental)" help="Whether spectrum peak heights should be adjusted according to their membership to the cluster or to their correlation to the cluster's centrotype ion" > | |
172 <option value="MEMBERSHIP_BASED">Membership based (msclust 1.0 mode)</option> | |
173 <option value="CORRELATION_BASED" selected="true">Correlation based</option> | |
174 </param> | |
175 <param name="simSelectionAlgorithm" type="select" label="SIM selection algorithm (experimental)" help="Set this if you want to deviate from the standard which is: allow shared SIM peaks for GC-MS data, and force unique SIM peaks for LC-MS data"> | |
176 <option value="" selected="true"></option> | |
177 <option value="uniqueSIM">Unique SIM peak</option> | |
178 <option value="sharedSIM">Shared SIM peak</option> | |
179 </param> | |
180 <param name="simMassFilter" type="text" optional="true" size="30" label="SIM mass exclusion list" help="Comma-separated list of masses NOT to use as SIM peaks. E.g. '73,147,...' " /> | |
181 <param name="simMembershipThreshold" optional="true" type="float" size="10" label="SIM membership threshold" help="Minimum membership a peak should have to qualify as a SIM candidate. E.g. 0.8 " /> | |
182 <param name="simSaturationThreshold" optional="true" type="float" size="10" label="SIM saturation threshold (%)" help="Maximum % of samples in which a SIM candidate peak may be saturated. If the candidate peak exceeds this threshold, then another peak is chosen. If no peak can be found meeting this criteria, mass 0 is reported" /> | |
183 <param name="simAbsenseThreshold" optional="true" type="float" size="10" label="SIM absence threshold (%)" help="Maximum % of samples in which a SIM candidate peak may be absent. If the candidate peak exceeds this threshold, then another peak is chosen. If no peak can be found meeting this criteria, mass 0 is reported" /> | |
184 | |
185 <param name="micMembershipThreshold" optional="true" type="float" size="10" label="MIC membership threshold" help="Minimum membership a peak should have to be counted in the MIC sum. E.g. 0.8 " /> | |
186 | |
187 </when> | |
19 | 188 <when value="No"> |
189 </when> | |
3 | 190 </conditional> |
191 | |
192 | |
193 </inputs> | |
194 <outputs> | |
195 <data name="centrotypesOut" format="msclust.csv" label="${tool.name} on ${on_string} - centrotypes file"/> | |
196 <data name="simOut" format="msclust.csv" label="${tool.name} on ${on_string} - SIM file"/> | |
197 <data name="micOut" format="msclust.csv" label="${tool.name} on ${on_string} - MIC file"/> | |
198 <data name="mspOut" format="msp" label="${tool.name} on ${on_string} - SPECTRA file"/> | |
199 <data name="classOut" format="msclust.csv" label="${tool.name} on ${on_string} - Classification file"/> | |
200 <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - HTML report"> | |
201 <!-- The HTML report is tied to the summaryReport checkbox: if the filter expression below is false, the file is not created --> | |
202 <filter>( summaryReport == True )</filter> | |
203 </data> | |
204 </outputs> | |
205 <tests> | |
206 <!-- TODO: no functional tests are defined yet; find out how to use this section --> | |
207 </tests> | |
208 <help> | |
209 | |
210 <!-- see also http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#hyperlink-targets --> | |
211 | |
212 .. class:: infomark | |
213 | |
214 This tool extracts spectra from ion-wise aligned MS(/MS) results. It uses expression profiles and | |
215 retention times of the putative ions to cluster them. Each cluster is then used to generate | |
216 one spectrum containing the clustered ions (peaks). | |
217 | |
4 | 218 .. image:: msclust_summary.png |
3 | 219 |
220 | |
221 ----- | |
222 | |
30
60b53f2aa48a
Small fixes, added microminutes support to MsClust, removed TIC or MsClust output
pieter.lukasse@wur.nl
parents:
29
diff
changeset
|
223 **Input** |
60b53f2aa48a
Small fixes, added microminutes support to MsClust, removed TIC or MsClust output
pieter.lukasse@wur.nl
parents:
29
diff
changeset
|
224 |
60b53f2aa48a
Small fixes, added microminutes support to MsClust, removed TIC or MsClust output
pieter.lukasse@wur.nl
parents:
29
diff
changeset
|
225 The input file should contain the following columns (in this order), followed by the sample intensity columns (one column with the |
60b53f2aa48a
Small fixes, added microminutes support to MsClust, removed TIC or MsClust output
pieter.lukasse@wur.nl
parents:
29
diff
changeset
|
226 intensity value for each sample): |
60b53f2aa48a
Small fixes, added microminutes support to MsClust, removed TIC or MsClust output
pieter.lukasse@wur.nl
parents:
29
diff
changeset
|
227 |
31 | 228 *ScanNR* |
229 | |
230 *Ret(umin)* | |
231 | |
232 *Mass(uD)* | |
233 | |
234 *(Optional)retentionMean* | |
235 | |
236 *(only required if retentionMean is present)retentionSD* | |
237 | |
238 *N sample intensity columns...* | |
239 | |
30
60b53f2aa48a
Small fixes, added microminutes support to MsClust, removed TIC or MsClust output
pieter.lukasse@wur.nl
parents:
29
diff
changeset
|
240 |
60b53f2aa48a
Small fixes, added microminutes support to MsClust, removed TIC or MsClust output
pieter.lukasse@wur.nl
parents:
29
diff
changeset
|
241 ----- |
60b53f2aa48a
Small fixes, added microminutes support to MsClust, removed TIC or MsClust output
pieter.lukasse@wur.nl
parents:
29
diff
changeset
|
242 |
3 | 243 **Output** |
244 | |
245 This tool returns a number of output files and a small report. | |
246 | |
247 **Parameters index** | |
248 | |
249 | |
250 *Select the approach used for imputing missing values:* only select this if you have used a specific method to | |
251 fill in the data gaps in the input file. One example is replacing zero values by some randomly generated low value. | |
252 If MeTot is chosen, then a value is considered generated if: the value contains a dot '.' and some number | |
253 other than 0 (zero) after the dot. | |
254 | |
255 *Effective Peaks:* Neighborhood window size to consider when calculating density. Smaller values increase | |
256 performance but are less reliable. | |
257 | |
258 *Peak Width, in scans:* Width of the scan window within which peaks are considered 'close'. One can see this as the | |
259 'tolerated variation in scans' for the apex positions of the fragment peaks composing a cluster. | |
260 Note: if MetAlign was used, this is the variation *after* pre-processing by MetAlign. | |
261 | |
262 *Peak Width confidence:* The higher the confidence, the stricter the threshold. | |
263 | |
264 *Correlation threshold (0.0 - 1.0):* Tolerance center for pearson distance calculation. The higher this value, | |
265 the higher the correlation between 2 items has to be for them to be considered 'close'. | |
266 | |
267 *Correlation threshold confidence:* The higher the confidence, the stricter the threshold. `More...`__ | |
268 | |
269 *Potential Density reduction (0.0 - 1.0):* Reduction tolerance center for pearson distance calculation. | |
270 The higher this value, the less the low correlated items get reduced, getting a chance to form a cluster of their own. | |
271 | |
272 *Potential Density reduction softness:* Reduction curve slope for pearson distance tolerance. Lower | |
273 values = stricter separation at the value determined in 'Potential Density reduction' above | |
274 (TODO review this comment). | |
275 | |
276 *Stop Criterion:* When to stop reducing and looking for new clusters. Lower values = more iterations | |
277 | |
13 | 278 .. __: javascript:window.open('.. image:: confidence_and_slope_params_explain.png'.replace('.. image:: ', ''),'popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes') |
3 | 279 |
280 | |
281 ----- | |
282 | |
283 **Output files described below** | |
284 | |
285 ----- | |
286 | |
287 *SPECTRA:* this file can be submitted to NIST for identification of the spectra. | |
288 | |
289 `Click here for more details on the Sample selection and Spectrum peak intensity correction algorithm parameters related to SPECTRA generation`_ | |
290 | |
14 | 291 .. _Click here for more details on the Sample selection and Spectrum peak intensity correction algorithm parameters related to SPECTRA generation: javascript:window.open('.. image:: sample_sel_and_peak_height_correction.png'.replace('.. image:: ', ''),'popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes') |
3 | 292 |
293 ----- | |
294 | |
295 *MIC:* stands for Measured Ions Count -> it contains, for each cluster, the sum of the ion count | |
296 values (corrected by their membership) for all MEASURED cluster ions in the given sample. | |
297 | |
298 The MIC for a **cluster i** in **sample s**, where **cluster i** has **n** members, is thus the sum over its members **j** = 1..n: | |
299 | |
300 sum ( [intensity of member j in **sample s**] x [membership value of member j in **cluster i**] ) | |
301 | |
302 ----- | |
303 | |
304 *SIM:* stands for Selective Ion Mode -> it contains, for each cluster, the intensity values of the | |
305 most representative member ion peak of this cluster. The most representative member peak is the one with the | |
306 highest membership*average_intensity. This definition leads to conflicts as a peak can have a | |
307 membership in two or more clusters. The assignment of a SIM peak to a cluster depends on | |
308 the configured data type (LC or GC-MS). NB: this can be overruled in the "advanced settings": | |
309 | |
310 (1) LC-MS SIM: select SIM peak only once and for the centrotype in which this specific mass has its | |
311 highest membership; for neighboring centrotypes use its "second best SIM", etcetera. In other words, | |
312 if the SIM peak has been identified as the SIM in more than 1 cluster, assign as SIM to the cluster | |
313 with highest membership. Continue searching for other SIM peaks to assign to the other clusters until | |
314 all ambiguities are solved. | |
315 | |
316 (2) GC-MS SIM: the SIM peak can be "shared" by multiple clusters. However, the intensity values are corrected | |
317 by the membership value of the peak in the cluster in case the SIM peak is "shared". If the SIM peak is not | |
318 "shared" then the "raw" intensity values of the SIM peak are recorded in the SIM file. | |
319 | |
320 `Click here for more details on the SIM output file`_ | |
321 | |
15 | 322 .. _Click here for more details on the SIM output file: javascript:window.open('.. image:: sample_SIM.png'.replace('.. image:: ', ''),'popUpWindow','height=700,width=800,left=10,top=10,resizable=yes,scrollbars=yes,toolbar=yes,menubar=no,location=no,directories=no,status=yes') |
3 | 323 |
324 | |
325 | |
326 </help> | |
327 </tool> |