comparison process_scans.xml @ 0:62c4813fbe7a draft

"planemo upload for repository https://github.com/computational-metabolomics/dimspy-galaxy commit 6321871098b2c4bc9e321d20b7e66fff3d641839"
author computational-metabolomics
date Sat, 11 Apr 2020 16:48:19 -0400
parents
children deafa30d6570
comparison
equal deleted inserted replaced
-1:000000000000 0:62c4813fbe7a
1 <tool id="dimspy_process_scans" name="Process Scans (and SIM-Stitch)" version="@TOOL_VERSION@+galaxy@GALAXY_TOOL_VERSION@">
2 <description> - Read, filter and average MS scans</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="requirements" />
7 <!-- Runs dimspy process-scans on either a single zip archive or on the individual files symlinked into the working directory, then exports the resulting HDF5 peaklists as text files. The zip path is single-quoted like every other dataset path so that paths containing spaces or shell metacharacters are passed safely. --> <command detect_errors="exit_code">
8 <![CDATA[
9 #if $data.input[0].is_of_type("zip")
10 dimspy process-scans
11 --input '$data.input[0]'
12 #else
13 #for $fn in $data.input
14 ln -s '$fn' '$fn.name'
15 &&
16 #end for
17 dimspy process-scans
18 --input .
19 #end if
20 --output '$hdf5_file_out'
21 #if $filelist
22 --filelist '$filelist'
23 #end if
24 --function-noise $function_noise
25 --snr-threshold $snr_threshold
26 --ppm $mults.ppm
27 --min_scans $mults.min_scans
28 #if float($mults.min_fraction) > 0.0
29 --min-fraction $mults.min_fraction
30 #else
31 --min-fraction 0.0
32 #end if
33 #if float($mults.rsd_threshold) > 0.0
34 --rsd-threshold $mults.rsd_threshold
35 #end if
36 #if $adv.skip_stitching
37 --skip-stitching
38 #end if
39 #if float($adv.ringing_threshold) > 0.0
40 --ringing-threshold $adv.ringing_threshold
41 #end if
42 #for $mzr in $adv.remove_mz_range
43 --remove-mz-range $mzr.start $mzr.end
44 #end for
45 #if $scan_events.filter == 'true'
46 #for $se in $scan_events.descriptions
47 #if $scan_events.incl_excl == 'include'
48 --include-scan-events $se.start $se.end $se.scan_type
49 #elif $scan_events.incl_excl == 'exclude'
50 --exclude-scan-events $se.start $se.end $se.scan_type
51 #end if
52 #end for
53 #end if
54 --report '$report'
55 &&
56 dimspy hdf5-pls-to-txt
57 --input '$hdf5_file_out'
58 --output .
59 --delimiter $delimiter
60 ]]>
61 </command>
62 <inputs> <!-- Tool parameters. The data file params advertise the input flag, which is the flag the command template actually passes to dimspy. -->
63 <conditional name="data">
64 <param name="type" type="select" label="Select the MS data file type?">
65 <option value="mzml" selected="true">*.mzML files</option>
66 <option value="raw">*.raw files</option>
67 </param>
68 <when value="raw">
69 <param name="license_agreement" type="boolean" label="Do you agree to the RawFileReader license terms?" help="*.raw files are read using the RawFileReader reading tool (Copyright © 2016 by Thermo Fisher Scientific, Inc. All rights reserved). To run this tool and process .raw files you must agree to the RawFileReader license terms. Read it at https://github.com/computational-metabolomics/dimspy-galaxy/blob/master/tools/dimspy/RawFileReaderLicense.md. See generic help section of this tool for more details.">
70 <validator type="expression" message="You must agree to the RawFileReader license terms to run this tool and process *.raw files.">True == value</validator>
71 </param>
72 <param name="input" argument="--input" type="data" format="thermo.raw" multiple="true" label="*.raw files" />
73 </when>
74 <when value="mzml">
75 <param name="input" argument="--input" type="data" format="zip,mzml" multiple="true" label="*.mzML files" />
76 </when>
77 </conditional>
78 <param name="filelist" argument="--filelist" type="data" format="tsv,tabular" optional="true" label="Filelist / Samplelist" />
79 <param name="function_noise" argument="--function-noise" type="select" label="Function to calculate the noise from each scan" help="">
80 <option value="median" selected="true">median intensity</option>
81 <option value="mean">mean intensity</option>
82 <option value="mad">mad (mean absolute deviation) intensity</option>
83 <option value="noise_packets">As shown in Xcalibur Qual Browser (Available for *.RAW files only)</option>
84 </param>
85 <param name="snr_threshold" argument="--snr-threshold" type="float" value="3.0" label="Signal-to-noise ratio threshold" help="" />
86 <conditional name="scan_events">
87 <param name="filter" type="boolean" label="Filter specific windows or scan events?" help="(--include-scan-events / --exclude-scan-events)"/>
88 <when value="true">
89 <param name="incl_excl" type="select" label="Include / Exclude scan event(s)" >
90 <option value="exclude" selected="true">Exclude</option>
91 <option value="include">Include</option>
92 </param>
93 <repeat name="descriptions" title="Description">
94 <param name="start" type="float" value="0" label="Start m/z for scan event"/>
95 <param name="end" type="float" value="0" label="End m/z for scan event">
96 <validator type="expression" message="M/z value must be larger than 0.0">float(value) > 0.0</validator>
97 </param>
98 <param name="scan_type" type="select" label="Scan type">
99 <option value="full" selected="true">Full scan</option>
100 <option value="sim">SIM scan</option>
101 </param>
102 </repeat>
103 </when>
104 <when value="false">
105 </when>
106 </conditional>
107 <section name="mults" title="Show options for multiple scans" expanded="True">
108 <param name="min_scans" argument="--min_scans" type="integer" min="1" value="1" label="Minimum number of scans required for each m/z window or event" help="" />
109 <param name="ppm" argument="--ppm" type="float" value="2.0" label="Ppm error tolerance" help="Maximum tolerated m/z deviation in consecutive scans in parts per million." />
110 <param name="min_fraction" argument="--min-fraction" type="float" min="0.0" max="1.0" value="0.0" label="Minimum fraction (i.e. percentage) of scans a peak has to be present in." help="Select '0' to skip this step." />
111 <param name="rsd_threshold" argument="--rsd-threshold" type="float" min="0.0" value="0.0" label="Relative standard deviation threshold" help="Select '0' to skip this step. Maximum tolerated relative standard deviation (RSD) of the peak intensities across scans." />
112 </section>
113 <section name="adv" title="Show advanced options" expanded="True">
114 <param name="skip_stitching" argument="--skip-stitching" type="boolean" value="false" label="Skip SIM-Stitching?" help="When set to 'yes' it will skip the processing step where (SIM) windows are 'stitched' or 'joined' together. Set this option to 'yes' if you like to process individual scan/SIM windows (events/ranges) without 'stitching' them."/>
115 <repeat name="remove_mz_range" title="Remove m/z range(s)?">
116 <param name="start" type="float" value="0.0" label="Start m/z of removal range"/>
117 <param name="end" type="float" value="0.0" label="End m/z of removal range">
118 <validator type="expression" message="M/z value must be larger than 0.0">float(value) > 0.0</validator>
119 </param>
120 </repeat>
121 <param name="ringing_threshold" argument="--ringing-threshold" type="float" value="0.0" min="0.0" max="1.0" label="Relative intensity threshold used to remove ringing artifacts" help="Select '0' to skip this filter." />
122 </section>
123 <param name="delimiter" argument="--delimiter" type="hidden" value="tab" />
124 </inputs>
125 <outputs> <!-- HDF5 peaklists, a text report, and a list collection built from every .txt file found in the job working directory. -->
126 <data format="h5" name="hdf5_file_out" label="${tool.name} on ${on_string}: Peaklists (HDF5 file)"/>
127 <data format="txt" name="report" label="${tool.name} on ${on_string}: Report"/>
128 <collection type="list" name="peaklists_txt" label="${tool.name} on ${on_string}: Peaklists">
129 <discover_datasets directory="." pattern="(?P&lt;designation&gt;.+)\.txt" format="tsv" visible="false"/>
130 </collection>
131 </outputs>
132 <tests>
133 <test> <!-- Three mzML files supplied as individual datasets, together with a filelist. -->
134 <conditional name="data">
135 <param name="type" value="mzml"/>
136 <param name="input" value="batch04_QC17_rep02_263.mzML,batch04_QC17_rep01_262.mzML,batch04_QC17_rep03_264.mzML" ftype="mzml" />
137 </conditional>
138 <param name="filelist" value="filelist_mzml_QC17_triplicate.txt" ftype="tsv" />
139 <param name="function_noise" value="median" />
140 <param name="snr_threshold" value="100.0" />
141 <section name="mults">
142 <param name="ppm" value="2.0" />
143 <param name="min_scans" value="1" />
144 <param name="min_fraction" value="0.5" />
145 <param name="rsd_threshold" value="0" />
146 </section>
147 <param name="delimiter" value="tab" />
148 <output name="hdf5_file_out" value="pls_scan5.h5" ftype="h5" compare="sim_size"/>
149 <output name="report" value="report_pls_scan5.txt" ftype="txt"/>
150 <output_collection name="peaklists_txt" type="list">
151 <element name="batch04_QC17_rep01_262" file="batch04_QC17_rep01_262_scan5.txt" ftype="tsv"/>
152 <element name="batch04_QC17_rep02_263" file="batch04_QC17_rep02_263_scan5.txt" ftype="tsv"/>
153 <element name="batch04_QC17_rep03_264" file="batch04_QC17_rep03_264_scan5.txt" ftype="tsv"/>
154 </output_collection>
155 </test>
156 <test> <!-- mzML files supplied as a single zip archive, together with a filelist. -->
157 <conditional name="data">
158 <param name="type" value="mzml"/>
159 <param name="input" value="MTBLS79_mzml_triplicates.zip" ftype="zip"/>
160 </conditional>
161 <param name="filelist" value="filelist_mzml_triplicates.txt" ftype="tsv" />
162 <param name="function_noise" value="median" />
163 <param name="snr_threshold" value="10.0" />
164 <section name="mults">
165 <param name="ppm" value="2.0" />
166 <param name="min_scans" value="1" />
167 <param name="min_fraction" value="0.5" />
168 <param name="rsd_threshold" value="0" />
169 </section>
170 <param name="delimiter" value="tab" />
171 <output name="hdf5_file_out" value="pls.h5" ftype="h5" compare="sim_size"/>
172 <output name="report" value="report_pls_01.xt" ftype="txt"/>
173 <output_collection name="peaklists_txt" type="list">
174 <element name="batch04_QC17_rep01_262" file="batch04_QC17_rep01_262.txt" ftype="tsv"/>
175 <element name="batch04_QC17_rep02_263" file="batch04_QC17_rep02_263.txt" ftype="tsv"/>
176 <element name="batch04_QC17_rep03_264" file="batch04_QC17_rep03_264.txt" ftype="tsv"/>
177 </output_collection>
178 </test>
179 <test> <!-- Zip archive without a filelist; the RSD filter is enabled (rsd_threshold 20.0). -->
180 <conditional name="data">
181 <param name="type" value="mzml"/>
182 <param name="input" value="batch_04_QC18_mzml_triplicate.zip" ftype="zip"/>
183 </conditional>
184 <param name="function_noise" value="median" />
185 <param name="snr_threshold" value="10.0" />
186 <section name="mults">
187 <param name="ppm" value="2.0" />
188 <param name="min_scans" value="1" />
189 <param name="min_fraction" value="0.8" />
190 <param name="rsd_threshold" value="20.0" />
191 </section>
192 <param name="delimiter" value="tab" />
193 <output name="hdf5_file_out" value="pls_QC18.h5" ftype="h5" compare="sim_size"/>
194 <output name="report" value="report_pls_02.xt" ftype="txt"/>
195 <output_collection name="peaklists_txt" type="list">
196 <element name="batch04_QC18_rep01_280" file="batch04_QC18_rep01_280.txt" ftype="tsv"/>
197 <element name="batch04_QC18_rep02_281" file="batch04_QC18_rep02_281.txt" ftype="tsv"/>
198 <element name="batch04_QC18_rep03_282" file="batch04_QC18_rep03_282.txt" ftype="tsv"/>
199 </output_collection>
200 </test>
201 </tests>
202 <help>
203
204
205 Process Scans (and SIM stitch)
206 ==============================
207
208 ..
209
210 ----------------
211
212 Description
213 -----------
214
215 Standard DIMS processing workflow: **Process Scans** -> Replicate Filter -> Align Samples -> [Missing values sample filter] -> Blank Filter -> Sample Filter -> Matrix processing -> Statistics
216
217 This tool is used to generate a single mass spectral peaklist for each of the data files defined in the ‘Filelist/Samplelist’. The tool extracts mass spectral peaks from a data file (in either .mzML or .RAW format) and then filters these in accordance with user-defined parameter settings. All peaks remaining after filtering are hierarchically clustered in one-dimension, during which pairs of peaks with similar m/z values are grouped together if the difference between their m/z values, when divided by the average of their m/z values and multiplied by 1 x 10\ :sup:`6` \, equates to less-than the user-defined ppm error tolerance.
218
219 **IMPORTANT:** when using .mzML files generated using the Proteowizard tool, SIM-type scans will only be treated as spectra if the ‘simAsSpectra’ filter was set to true during the conversion process, e.g.:
220
221 *msconvert.exe example.raw* **--simAsSpectra** *--64 --zlib --filter "peakPicking true 1-"*
222
223 -----------------
224
225
226 Parameters
227 ----------
228
229 ***.mzml or *.raw files** (REQUIRED) - use one of the following inputs:
230
231 * **Single or multiple .mzML or .raw file**
232
233 * **Data collection** - use this option if .mzml or .raw files are contained within a Galaxy dataset collection. Dataset collections may be generated within the Galaxy environment.
234
235 * **Zip file** from history - use this option if you have uploaded a \*.zip directory containing \*.mzML files (.raw files are not supported).
236
237
238 **Filelist / Samplelist** (HIGHLY RECOMMENDED) - a table containing **filename** and **classLabel** information for each experimental sample. These column headers MUST be included in the first row of the table.
239
240 For a standard DIMS experiment, users are advised to also include the following additional columns in order to ensure their data remains compatible with future versions of the dimspy processing pipeline:
241
242 * **injectionOrder** - integer values ranging from 1 to i, where i is the total number of independent infusions performed as part of a DIMS experiment. e.g. if a study included 20 samples, each of which was injected as four independent replicates, there would be at least 20 * 4 injections, so i = 80 and the range for injection order would be from 1 to 80 in steps of 1.
243
244 * **replicate** - integer value from 1 to r, indicating the order in which technical replicates of each study sample were injected in to the mass spectrometer, e.g. if study samples were analysed in quadruplicate, r = 4 and integer values are accordingly 1, 2, 3, 4.
245
246 * **batch** - integer value from 1 to b, where b corresponds to the total number of batches analysed under defined analysis conditions, for any given experiment. e.g. : if 4 independent plates of polar extracts were analysed in the positive ionisation mode, then valid values for batch are 1, 2, 3 and 4.
247
248 * **NOTE**: for DIMS experiments, “batch” is synonymous with plate, i.e. each independent plate analysed under a given analytical configuration may be considered an individual “batch”.
249
250 This file:
251
252 * must be uploaded to (or be accessible to) the active Galaxy history in order to allow for its selection in the Filelist / Samplelist drop-down menu. The file list / sample list may be created in .txt format, however, when imported in to the active Galaxy history, users must ensure to select ‘.tabular’ format.
253
254 * may include additional columns, e.g. additional metadata relating to study samples. Ensure that columns names do not conflict with existing column names.
255
256 |
257
258 @example_filelist@
259
260 |
261
262 **Function to calculate the noise from each scan** (REQUIRED; default = **median**) - toggle requiring selection of one option from the drop-down menu to indicate the preferred algorithm to apply for spectral noise calculation. The following options are available:
263
264 * **Median** - the median of all peak intensities within a given file is used as the noise value. This simplistic approach to estimating noise may be suitable for spectra with many low abundant features, but it is generally not recommended for use when spectra contain relatively few low-abundant peaks e.g. MS2 spectra.
265
266 * **Mean** - the unweighted mean average of all peak intensities within a given file is used as the noise value. This simplistic approach to estimating noise may be suitable for spectra with many low abundant features, but it is generally not recommended for use when spectra contain relatively few low-abundant peaks e.g. MS2 spectra.
267
268 * **Mean absolute deviation (MAD)** - the noise value is set as the mean of the absolute differences between peak intensities and the mean peak intensity (calculated across all peak intensities within a given file).
269
270 * **Xcalibur** - the noise value is calculated using the proprietary algorithms contained in Thermo Fisher Scientific’s reader library. This option should only be applied when you are processing .RAW files.
271
272 |
273
274 **Signal-to-noise ratio (SNR) threshold** (REQUIRED; default = 3.0) - a numerical value from 0 upwards.
275
276 Peaks with a signal-to-noise ratio (SNR) less-than or equal-to this value will be removed from the output peaklist. In the comprehensive peaklist output (.tsv-formatted), peaks with a SNR below the user-defined threshold will have a ‘0’ in the ‘snr-flag’ column, which indicates that they should be ignored in downstream processing procedures. Peaks with a SNR exceeding the user-defined cutoff will have a ‘1’ in the ‘snr-flag’ column.
277
278 |
279
280 **Filter specific scan windows or scan events?** (OPTIONAL; default = **No**) - a boolean toggle where:
281
282 * **No** - do not perform scan event filtering;
283
284 * **Yes** - filter specific scan events
285
286 * when selected, users must specify whether to 'Exclude' or 'Include' specific scan events. This can be useful if, for example, a user wishes to run the Process Scans tool on only a subset of scan types collected in each file. e.g. some SIM stitch acquisitions may be initiated with an initial 30 second stabilisation period, during which full-scan data are acquired. This full-scan data can be excluded from further consideration by using the ‘exclude’ toggle.
287
288 * Included or excluded scan events must be fully defined by the user, else ALL scan events will be included. To do so:
289 * Click the '+ Description' button and insert the start and stop m/z values for the scan event to be included/excluded.
290 * Select the 'scan type' to be filtered. Options are: 'Full scan' or 'SIM scan'
291 * Click '+ Description' to 'Exclude/Include' an additional scan event.
292
293 |
294
295 **Show options for multiple scans** (OPTIONAL)
296
297 * **Minimum number of scans required for each m/z window or event within a raw/mzML data file** (default = 1) - A positive integer equal-to or greater-than 1 that specifies the number of times a given scan event must occur in a given file in order for this scan event to be included in downstream processing steps and in the output .tsv-formatted peaklist.
298
299 * **ppm error tolerance** (default = 2.0) - A positive numerical value equal-to or greater-than zero. This option impacts the clustering of peaks extracted from an input file. If the mass-to-charge ratios of two peaks, when divided by the average of their mass-to-charge ratios and then multiplied by 1 × 10\ :sup:`6`, is equal-to or less-than this user-defined value, then these peaks are clustered together as a single peak. Clustering is applied across all replicates of a given scan event type i.e. with a given input file, all peaks detected in the three replicates of a 50-400 m/z scan event would undergo assessment for the need for clustering.
300
301 * **Minimum fraction (i.e. percentage; default = 0, i.e. skip) of scans a peak has to be present in** - A numerical value from 0 to 1 that specifies the minimum proportion of scans a given mass spectral peak must be detected in, in order for it to be kept in the output peaklist. Here, scans refers to replicates of the same scan event type, i.e. if set to 0.33, then a peak would need to be detected in at least 1 of the 3 replicates of a given scan event type. The ppm error specified by the user will significantly impact which peaks fulfil these criteria.
302
303 * **Relative standard deviation threshold** (default = 0, i.e. skip) - A numerical value equal-to or greater-than 0. If greater than 0, then peaks whose intensity values have a percent relative standard deviation (otherwise termed the percent coefficient of variation) greater-than this value are excluded from the output peaklist.
304
305 |
306
307 **Show advanced options** (OPTIONAL)
308
309 * **Skip SIM-stitching** (REQUIRED; default = **No**) - a boolean toggle where:
310
311 * **No** - perform SIM stitching
312
313 * **Yes** - skip the processing step where (SIM) windows are 'stitched' or 'joined' together. Use this option if you would like to process individual scan/SIM windows (events/ranges) without 'stitching' them.
314
315 * **Remove m/z range(s)** (OPTIONAL) - this option allows for specific regions of the output peak matrices to be deleted by the user - this option may be useful for removing sections of a spectrum known to correspond to system noise peaks.
316
317 * **Start m/z of removal range** - a positive numerical value corresponding to the lowest m/z value in the spectral region to be removed.
318
319 * **End m/z of removal range** - a positive numerical value corresponding to the highest m/z value in the spectral region to be removed (must be greater than the ‘start m/z of removal range’).
320
321 * **Relative intensity threshold used to remove ringing artefacts** (OPTIONAL) - Fourier transform-based mass spectra often contain peaks (ringing artefacts) around spectral features arising from detection of charged, gas-phase bio-molecules.
322
323 * A positive numerical value indicating the required relative intensity a peak must exceed (with reference to the largest peak in a cluster of peaks) in order to be retained.
324
325 ----------------------------------
326
327
328 Output file(s)
329 --------------
330
331 |
332
333 The Process scans (and SIM stitch) tool will output three file types:
334
335 1) **A HDF5 file** containing the processed peaklists
336
337 2) **A processed peaklist**, presented in tabular format, for each study sample specified in the filelist/samplelist. Each row corresponds to a single peak. Where multiple peaks were grouped together during the hierarchical clustering process, each peaklist metric constitutes an average of the groups’ values. Metrics included in the peaklist are:
338
339 @help_columns_peaklist@
340
341 @example_peaklist@
342
343 |
344
345 3) **A tabular “report” file** that details, for each scan event processed in each file:
346
347 * Scan range of scan event
348
349 * Scan number of scan event
350
351 * Number of peaks detected in scan event
352
353 * Median RSD of peaks detected in each scan event type (only applied if number of scans for a given scan event is <![CDATA[ > ]]> 1)
354
355 -----------------------------------
356
357 @github_developers_contributors@
358 @license@
359
360 RawFileReader reading tool. Copyright © 2016 by Thermo Fisher Scientific, Inc. All rights reserved. **Using this galaxy tool implies the acceptance of the RawFileReader** `license terms`_.
361
362 .. _`license terms`: https://github.com/computational-metabolomics/dimspy-galaxy/blob/master/tools/dimspy/RawFileReaderLicense.md
363
364 |
365 </help>
366
367 <expand macro="citations" />
368
369 </tool>
370