comparison replicate_filter.xml @ 0:cb2acfaec200 draft

"planemo upload for repository https://github.com/computational-metabolomics/dimspy-galaxy commit 6321871098b2c4bc9e321d20b7e66fff3d641839"
author computational-metabolomics
date Sat, 11 Apr 2020 16:50:23 -0400
parents
children 0cdf340364ed
comparison
equal deleted inserted replaced
-1:000000000000 0:cb2acfaec200
1 <tool id="dimspy_replicate_filter" name="Replicate Filter" version="@TOOL_VERSION@+galaxy@GALAXY_TOOL_VERSION@">
2 <description> - Remove peaks that fail to appear in at least x-out-of-n (technical) replicates</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="requirements" />
7 <command detect_errors="exit_code">
8 <![CDATA[
9 dimspy replicate-filter
10 --input '$hdf5_file_in'
11 --output '$hdf5_file_out'
12 #if $filelist
13 --filelist '$filelist'
14 #end if
15 --ppm $ppm
16 --replicates $replicates
17 --min-peak-present $min_peaks
18 #if str($rsd_threshold).strip() != ''
19 --rsd-threshold '$rsd_threshold'
20 #end if
21 --report '$report'
22 &&
23 dimspy create-sample-list
24 --input '$hdf5_file_out'
25 --output '$samplelist'
26 --delimiter tab
27 &&
28 dimspy hdf5-pls-to-txt
29 --input '$hdf5_file_out'
30 --output .
31 --delimiter '$delimiter'
32 ]]>
33 </command>
34 <inputs> <!-- Each param feeds the dimspy option named in its "argument" attribute (see the command section). -->
35 <param name="hdf5_file_in" argument="--input" type="data" format="h5" label="Peaklists (HDF5 file)" help="Peaklists generated by Process Scans (SIM-Stitch)." />
36 <param name="filelist" argument="--filelist" type="data" format="tsv,tabular" optional="true" label="Filelist / Samplelist" help="Only provide a filelist if you like to exclude Peaklists, update the metadata (e.g. classLabel), or if you have not provided a filelist for Process Scans." />
37 <param name="replicates" argument="--replicates" type="integer" min="1" value="3" label="Number of technical replicates for each sample" help="" />
38 <param name="min_peaks" argument="--min-peak-present" type="integer" min="1" value="2" label="Minimum number of technical replicates a peak has to be present in" help="" />
39 <param name="ppm" argument="--ppm" type="float" min="0.0" value="2.0" label="Ppm error tolerance" help="Maximum tolerated m/z deviation across technical replicates in parts per million (ppm)." />
40 <param name="rsd_threshold" argument="--rsd-threshold" type="text" value="" label="Relative standard deviation threshold" help="Maximum tolerated relative standard deviation (RSD) of the peak intensities across technical replicates. Leave empty to skip this filter step." /> <!-- Free text so that it can be left empty; the command only passes it when non-empty. -->
41 <param name="delimiter" type="hidden" value="tab" argument="--delimiter" label="" help=""/> <!-- Fixed delimiter for the hdf5-pls-to-txt export step. -->
42 </inputs>
43 <outputs> <!-- Three single datasets written by the command, plus a collection of the per-peaklist text exports. -->
44 <data format="h5" name="hdf5_file_out" label="${tool.name} on ${on_string}: Peaklists (HDF5 file)"/> <!-- target of replicate-filter "output" option -->
45 <data format="txt" name="report" label="${tool.name} on ${on_string}: Report"/> <!-- target of replicate-filter "report" option -->
46 <data format="tsv" name="samplelist" label="${tool.name} on ${on_string}: Sample Metadata (updated)" /> <!-- written by create-sample-list -->
47 <collection type="list" name="peaklists_txt" label="${tool.name} on ${on_string}: Peaklists">
48 <discover_datasets directory="." pattern="(?P&lt;designation&gt;.+)\.txt" format="tsv" visible="false" /> <!-- picks up every *.txt that hdf5-pls-to-txt writes to the working directory; the file stem becomes the element name -->
49 </collection>
50 </outputs>
51 <tests> <!-- Three scenarios; all leave rsd_threshold empty, so the RSD filter option is never passed. -->
52 <test> <!-- Scenario 1: HDF5 input only, no filelist; checks all three datasets and one collection element. -->
53 <param name="hdf5_file_in" value="pls.h5" ftype="h5"/>
54 <param name="replicates" value="3"/>
55 <param name="min_peaks" value="2"/>
56 <param name="ppm" value="2.0"/>
57 <param name="rsd_threshold" value=""/>
58 <output name="hdf5_file_out" value="pls_rf.h5" ftype="h5" compare="sim_size"/>
59 <output name="report" value="report_pls_rf_01.txt" ftype="txt"/>
60 <output name="samplelist" value="samplelist_1.txt" ftype="tsv"/>
61 <output_collection name="peaklists_txt" type="list">
62 <element name="batch04_QC17_rep01_262_2_263_3_264" file="batch04_QC17_rep01_262_2_263_3_264.txt" ftype="tsv"/>
63 </output_collection>
64 </test>
65 <test> <!-- Scenario 2: same input as scenario 1 but with an explicit filelist, exercising the optional filelist branch of the command. -->
66 <param name="hdf5_file_in" value="pls.h5" ftype="h5"/>
67 <param name="filelist" value="filelist_mzml_triplicates.txt" ftype="tsv"/>
68 <param name="replicates" value="3"/>
69 <param name="min_peaks" value="2"/>
70 <param name="ppm" value="2.0"/>
71 <param name="rsd_threshold" value=""/>
72 <output name="hdf5_file_out" value="pls_rf.h5" ftype="h5" compare="sim_size"/>
73 <output name="report" value="report_pls_rf_02.txt" ftype="txt"/>
74 <output name="samplelist" value="samplelist_2.txt" ftype="tsv"/>
75 <output_collection name="peaklists_txt" type="list">
76 <element name="batch04_QC17_rep01_262_2_263_3_264" file="batch04_QC17_rep01_262_2_263_3_264.txt" ftype="tsv"/>
77 </output_collection>
78 </test>
79 <test> <!-- Scenario 3: different input file (pls_scan5.h5); only the report output is compared. -->
80 <param name="hdf5_file_in" value="pls_scan5.h5" ftype="h5"/>
81 <param name="replicates" value="3"/>
82 <param name="min_peaks" value="2"/>
83 <param name="ppm" value="2.0"/>
84 <param name="rsd_threshold" value=""/>
85 <output name="report" value="report_pls_rf_03.txt" ftype="txt"/>
86 </test>
87 </tests>
88 <help>
89 ----------------
90 Replicate filter
91 ----------------
92
93 ..
94
95 --------------------
96
97 Description
98 -----------
99
100 Standard DIMS processing workflow: Process Scans -> **Replicate Filter** -> Align Samples -> [Missing value sample filter] -> Blank Filter -> Sample Filter -> Matrix processing -> Statistics
101
102 |
103
104 To draw robust conclusions from DIMS-based metabolomics datasets, the data itself must be collected and processed in a robust and reproducible way. To support this aim, study samples are often divided into a set of equivalent aliquots, each of which is analysed under defined and consistent analytical conditions. As aliquots of the same sample are assumed to comprise identical biological material, differences in their resulting spectra are assumed to arise due to technical variability. Removing artifacts associated with this technical variability, and removing mass spectral peaks that are detected irreproducibly, is possible by filtering across these technical replicate spectra. The Replicate Filter tool facilitates this process, combining the peaklists extracted (using the Process Scans tool) from each technical replicate of a given study sample into a single merged peaklist, before applying a series of user-defined filters to yield a replicate-filtered peaklist.
105
106 |
107
108 **Replicate filter process**: peaks from each technical replicate (for a given study sample) are aligned using a one-dimensional hierarchical clustering procedure (applied on the mass-to-charge scale). Peaks are aligned only if the difference in their mass-to-charge ratios, when divided by the average of their mass-to-charge ratios and multiplied by 1 × 10\ :sup:`6` \ (i.e. when measured in units of parts-per-million, ppm), is less-than or equal-to the user-defined ‘ppm error tolerance’. After alignment, a set of user-defined filters are applied to retain only those peaks that:
109
110 * occur in equal-to or more-than the user-defined 'Minimum number of technical replicates a peak has to be present in', i.e. if set to 2, then a peak must be detected in at least two of the replicate analyses, **and/or**
111
112 * have relative standard deviation (measured in %; may otherwise be referred to as the percent coefficient of variation) of intensity values, across technical replicates, that is equal-to or less-than the user-defined ‘relative standard deviation threshold’ (if defined, otherwise ignored).
113
114 **IMPORTANT**:
115
116 * When a user sets the parameter “number of technical replicates for each sample” to a value less-than the total number of technical replicates actually acquired for each study sample, this tool will automatically determine which combination of technical replicates to combine. See the parameter description (below) for further details.
117
118 * If a specific scan event has been replicated within a given file, then users should not use this tool for filtering and aligning peaks. In this case, users should instead FIRST use the Process Scans (and SIM stitch) tool - see the documentation for the Process Scans tool for a description of how to perform this.
119
120 |
121
122 **Output** - a replicate-filtered peaklist per input study sample. In each, rows correspond to replicate-filtered mass spectral peaks, while columns provide a range of metrics for each of the detected peaks (see “Output file(s)” section, below).
123
124 ------------------
125
126 Parameters
127 ----------
128
129 * **Number of technical replicates for each sample** (REQUIRED; default = 3) - the total number of technical replicates acquired for each study sample. This value must be set to the lowest number of technical replicates acquired for ANY of the study samples, or alternatively, may be set to the minimum number of replicates the user would like to select from the total number of technical replicates for a biological sample.
130
131 * **Minimum number of technical replicates a peak has to be present in** (REQUIRED; default = 2) - For a given biological sample, the number of replicates that will be used to generate the replicate-filtered peaklist. By default, if this parameter is set to a value less-than the total number of technical replicates acquired for each biological sample, then the tool automatically determines which combination of technical replicates yields the best overall rank. Otherwise, all technical replicates are used. Ranking of the combinations of technical replicates is based on the average of the following three scores:
132
133 # score 1: peak count / peak count present in n-out-of-n (e.g. 3-out-of-3)
134
135 # score 2: peak count present in x-out-of-n (e.g. 3-out-of-3) / MAX peak count present in x-out-of-n across sets of replicates
136
137 # score 3: RSD categories (0-5 (score=1.0), 5-10 (score=0.9), 10-15 (score=0.8), etc)
138
139 * **ppm error tolerance** (REQUIRED; default = 2) - this parameter will influence the alignment of peaks across technical replicates. Peaks from distinct technical replicates of a given study sample are aligned if the difference between their mass-to-charge ratios, when divided by the average of their mass-to-charge ratios and multiplied by 1 × 10\ :sup:`6`, is equal-to or less-than this parameter value (i.e. the difference between the mass-to-charge ratios, measured on the ppm scale, is less than or equal to the user-defined threshold).
140
141 * **Relative standard deviation threshold** (OPTIONAL) - a numerical value from 0 upwards that defines the acceptable percentage relative standard deviation (otherwise termed the percent coefficient of variation) of a peak’s intensity across technical replicates. Peaks are removed from the output ‘replicate-filtered’ peaklist if this condition is not met.
142
143 -------------------
144
145 Output file(s)
146 --------------
147
148 |
149
150 The Replicate Filter tool will output three file types:
151
152 1) **A HDF5 file containing the replicate-filtered Peaklists**
153
154 2) **A replicate-filtered peaklist**, in .tsv format, for each biological sample defined in the filelist/samplelist. Tab-delimited text file containing a numeric data matrix, with . as decimal, and NA for missing values. Each row corresponds to a mass spectral peak. Columns provide metrics associated with each mass spectral peak. Metrics include:
155
156 @help_columns_peaklist@
157
158 @example_peaklist@
159
160 |
161
162 3) **A tabular “report” file** that details, for each biological sample included in the filelist/samplelist:
163
164 - **Set** - a numeric index value that is unique to a specific biological sample
165
166 - **Rank** - a positive integer indicating which combination of technical replicates offered the best-ranked replicate-filtered peaklist.
167
168 - **Name** - a string indicating which of the technical replicates for a given biological sample were combined together.
169
170 - **Peaks_[XooY]** - a numeric value indicating the total number of peaks that were present in X-out-of-Y technical replicates.
171
172 - **Median_rsd_[XooY]** - the median of the percent relative standard deviations of all peaks in the replicate-filtered peaklist that were present in X-out-of-Y technical replicates.
173
174 - **Score** - a numeric value between 0 and 1 that serves to indicate the best combination of technical replicates for a given biological sample (as defined by the ‘name’ column)
175
176 --------------------------
177
178 @github_developers_contributors@
179 @license@
180 </help>
181 <expand macro="citations" />
182 </tool>
183