comparison w4mclassfilter.xml @ 8:d5cf23369d12 draft

planemo upload for repository https://github.com/HegemanLab/w4mclassfilter_galaxy_wrapper/tree/master commit 7b824bc01884125dc8bb2e4c9ef70fb0a6d88db1
author eschen42
date Sat, 03 Mar 2018 22:58:14 -0500
parents 582a8a42a93b
children 1ced8b5dfa3e
comparison
equal deleted inserted replaced
7:582a8a42a93b 8:d5cf23369d12
1 <tool id="w4mclassfilter" name="Sample_Subset" version="0.98.7"> 1 <tool id="w4mclassfilter" name="W4m Data Subset" version="0.98.8">
2 <!-- this file is utf-8, not ASCII, because it contains the character é --> 2 <description>Filter W4m data by values or metadata</description>
3 <description>Filter W4M data by sample class</description> 3
4 <!-- Here is the hyphenation standard that I *try* to apply consistently in my documentation: http://www.sandranoonan.com/dont-let-hyphenation-drive-crazy/ -->
4 5
5 <requirements> 6 <requirements>
6 <!-- <requirement type="package" version="6.2">readline</requirement> --> 7 <!-- <requirement type="package" version="6.2">readline</requirement> -->
7 <requirement type="package" version="3.4.1">r-base</requirement> 8 <requirement type="package" version="3.4.1">r-base</requirement>
8 <requirement type="package" version="1.1_4">r-batch</requirement> 9 <requirement type="package" version="1.1_4">r-batch</requirement>
21 variableMetadata_in '$variableMetadata_in' 22 variableMetadata_in '$variableMetadata_in'
22 sampleclassNames '$sampleclassNames' 23 sampleclassNames '$sampleclassNames'
23 inclusive '$inclusive' 24 inclusive '$inclusive'
24 wildcards '$wildcards' 25 wildcards '$wildcards'
25 classnameColumn '$classnameColumn' 26 classnameColumn '$classnameColumn'
26 samplenameColumn '$samplenameColumn' 27 samplenameColumn 'sampleMetadata'
27 variable_range_filter '$variableRangeFilter' 28 variable_range_filter '$variableRangeFilter'
29 transformation '$transformation'
28 dataMatrix_out '$dataMatrix_out' 30 dataMatrix_out '$dataMatrix_out'
29 sampleMetadata_out '$sampleMetadata_out' 31 sampleMetadata_out '$sampleMetadata_out'
30 variableMetadata_out '$variableMetadata_out' 32 variableMetadata_out '$variableMetadata_out'
31 ]]></command> 33 ]]></command>
32 34
33 <inputs> 35 <inputs>
34 <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" /> 36 <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" />
35 <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata columns, separator: tab" /> 37 <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata columns, separator: tab" />
36 <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular" help="variable x metadata columns, separator: tab" /> 38 <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular" help="variable x metadata columns, separator: tab" />
37 <param name="samplenameColumn" label="Column that names the sample" type="text" value = "sampleMetadata" help="name of the column in the sample metadata file that has the name of the sample - defaults to 'sampleMetadata'" /> 39 <param name="classnameColumn" label="Column that names the sample-class" type="text" value = "class" help="name of the column in sample metadata that has the values to be tested against the 'Names of sample-classes' input parameter - defaults to 'class'">
38 <param name="classnameColumn" label="Column that names the sample-class" type="text" value = "class" help="name of the column in sample metadata that has the values to be tested against the 'classes' input parameter - defaults to 'class'" />
39 <param name="sampleclassNames" label="Names of sample classes" type="text" value = "" help="comma-separated names (or comma-less regular expressions to match names) of sample-classes to filter in or out; defaults to no names">
40 <sanitizer> 40 <sanitizer>
41 <valid initial="string.letters"> 41 <valid initial="string.letters">
42 <add preset="string.digits"/> 42 <add preset="string.digits"/>
43 <add value="&#45;" /> <!-- dash, hyphen -->
44 <add value="&#46;" /> <!-- dot, period -->
45 <add value="&#95;" /> <!-- underscore -->
46 </valid>
47 </sanitizer>
48 </param>
49 <param name="sampleclassNames" label="Names of sample-classes" type="text" value = "" help="comma-separated names (or regular expressions to match names) of sample-classes to filter in or out; defaults to no names">
50 <sanitizer>
51 <valid initial="string.letters">
52 <add preset="string.digits"/>
53 <add value="&#123;" /> <!-- l-cube, left-curly-bracket -->
54 <add value="&#124;" /> <!-- pipe -->
55 <add value="&#125;" /> <!-- r-cube, right-curly-bracket -->
43 <add value="&#36;" /> <!-- dollar, dollar-sign --> 56 <add value="&#36;" /> <!-- dollar, dollar-sign -->
44 <add value="&#40;" /> <!-- left-paren --> 57 <add value="&#40;" /> <!-- left-paren -->
45 <add value="&#41;" /> <!-- right-paren --> 58 <add value="&#41;" /> <!-- right-paren -->
46 <add value="&#42;" /> <!-- splat, asterisk --> 59 <add value="&#42;" /> <!-- splat, asterisk -->
47 <add value="&#43;" /> <!-- plus --> 60 <add value="&#43;" /> <!-- plus -->
52 <add value="&#63;" /> <!-- what, question mark --> 65 <add value="&#63;" /> <!-- what, question mark -->
53 <add value="&#91;" /> <!-- l-squib, left-square-bracket --> 66 <add value="&#91;" /> <!-- l-squib, left-square-bracket -->
54 <add value="&#92;" /> <!-- whack, backslash --> 67 <add value="&#92;" /> <!-- whack, backslash -->
55 <add value="&#93;" /> <!-- r-squib, right-square-bracket --> 68 <add value="&#93;" /> <!-- r-squib, right-square-bracket -->
56 <add value="&#94;" /> <!-- hat, caret --> 69 <add value="&#94;" /> <!-- hat, caret -->
57 <add value="&#123;" /> <!-- l-cube, left-curly-bracket --> 70 <add value="&#95;" /> <!-- underscore -->
58 <add value="&#124;" /> <!-- pipe -->
59 <add value="&#125;" /> <!-- r-cube, right-curly-bracket -->
60 </valid> 71 </valid>
61 </sanitizer> 72 </sanitizer>
62 </param> 73 </param>
63 74
64 <param name="wildcards" label="Use wild-cards or regular-expressions" type="select" help="wild-cards (the default) - use '*' and '?' to match class names; regular-expressions - use comma-less regular expressions to match class names"> 75 <param name="wildcards" label="Use 'wild cards' or 'regular expressions'" type="select" help="'wild-cards' (the default) - use '*' and '?' to match class names; 'regular-expressions' - use regular expressions to match class names">
65 <option value="TRUE" selected="true">wild-cards</option> 76 <option value="TRUE" selected="true">wild-cards</option>
66 <option value="FALSE">regular-expressions</option> 77 <option value="FALSE">regular-expressions</option>
67 </param> 78 </param>
68 <param name="inclusive" label="Include named classes" type="select" help="filter-in - include only the named sample classes; filter-out (the default) - exclude only the named sample classes"> 79 <param name="inclusive" label="Exclude/include named classes" type="select" help="'filter-out' (the default) - exclude only the named sample-classes; 'filter-in' - include only the named sample-classes">
69 <option value="TRUE">filter-in</option> 80 <option value="TRUE">filter-in</option>
70 <option value="FALSE" selected="true">filter-out</option> 81 <option value="FALSE" selected="true">filter-out</option>
71 </param> 82 </param>
72 83
73 <param name="variableRangeFilter" label="Variable range-filters" type="text" value = "" help="comma-separated filters, each specified as 'variableMetadataColumnName:min:max'; default is no filters. (See help below.)"> 84 <param name="variableRangeFilter" label="Variable-range filters" type="text" value = "" help="comma-separated filters, each specified as 'variableMetadataColumnName:min:max'; default is no filters. (See help below.)">
74 <sanitizer> 85 <sanitizer>
75 <valid initial="string.letters"> 86 <valid initial="string.letters">
76 <add preset="string.digits"/> 87 <add preset="string.digits"/>
77 <add value="&#44;" /> <!-- comma --> 88 <add value="&#44;" /> <!-- comma -->
89 <add value="&#45;" /> <!-- dash, hyphen -->
90 <add value="&#46;" /> <!-- dot, period -->
78 <add value="&#58;" /> <!-- colon --> 91 <add value="&#58;" /> <!-- colon -->
79 <add value="&#46;" /> <!-- dot, period --> 92 <add value="&#95;" /> <!-- underscore -->
80 </valid> 93 </valid>
81 </sanitizer> 94 </sanitizer>
95 </param>
96 <param name="transformation" label="Data-transformation" type="select" help="'none' (the default) - do not transform data; 'log2' - log base 2 of data; 'log10' - log base 10 of data; in all cases, negative and missing values are imputed to zero">
97 <option value="none" selected="true">none</option>
98 <option value="log2">log2</option>
99 <option value="log10">log10</option>
82 </param> 100 </param>
83 101
84 </inputs> 102 </inputs>
85 <outputs> 103 <outputs>
86 <data name="dataMatrix_out" label="${tool.name}_${dataMatrix_in.name}" format="tabular" ></data> 104 <data name="dataMatrix_out" label="${dataMatrix_in.name}.subset" format="tabular" ></data>
87 <data name="sampleMetadata_out" label="${tool.name}_${sampleMetadata_in.name}" format="tabular" ></data> 105 <data name="sampleMetadata_out" label="${sampleMetadata_in.name}.subset" format="tabular" ></data>
88 <data name="variableMetadata_out" label="${tool.name}_${variableMetadata_in.name}" format="tabular" ></data> 106 <data name="variableMetadata_out" label="${variableMetadata_in.name}.subset" format="tabular" ></data>
89 </outputs> 107 </outputs>
90 108
91 <tests> 109 <tests>
110 <test>
111 <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
112 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
113 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
114 <param name="classnameColumn" value="gender"/>
115 <param name="sampleclassNames" value="M"/>
116 <param name="wildcards" value="FALSE"/>
117 <param name="inclusive" value="filter-in"/>
118 <param name="variableRangeFilter" value="FEATMAX:6.30103:,mz:200:,rt::800"/>
119 <param name="transformation" value="log10"/>
120 <output name="dataMatrix_out">
121 <assert_contents>
122 <has_text text="5.87336711011293" />
123 </assert_contents>
124 </output>
125 <output name="sampleMetadata_out">
126 <assert_contents>
127 <not_has_text text="HU_028" />
128 <not_has_text text="HU_051" />
129 <not_has_text text="HU_060" />
130 <not_has_text text="HU_110" />
131 <not_has_text text="HU_149" />
132 <not_has_text text="HU_152" />
133 <not_has_text text="HU_175" />
134 <not_has_text text="HU_178" />
135 <not_has_text text="HU_185" />
136 <not_has_text text="HU_204" />
137 <not_has_text text="HU_208" />
138 <has_text text="HU_017" />
139 <has_text text="HU_034" />
140 <has_text text="HU_078" />
141 <has_text text="HU_091" />
142 <has_text text="HU_093" />
143 <has_text text="HU_099" />
144 <has_text text="HU_130" />
145 <has_text text="HU_134" />
146 <has_text text="HU_138" />
147 </assert_contents>
148 </output>
149 <output name="variableMetadata_out">
150 <assert_contents>
151 <not_has_text text="HMDB00191" />
152 <has_text text="HMDB00208" />
153 <not_has_text text="HMDB00251" />
154 <not_has_text text="HMDB00299" />
155 <not_has_text text="HMDB00512" />
156 <not_has_text text="HMDB00518" />
157 <not_has_text text="HMDB00715" />
158 <not_has_text text="HMDB00822" />
159 <has_text text="HMDB01032" />
160 <has_text text="HMDB01101.1" />
161 <not_has_text text="HMDB03193" />
162 <not_has_text text="HMDB04824" />
163 <not_has_text text="HMDB10348" />
164 <has_text text="HMDB13189" />
165 <not_has_text text="HMDB59717" />
166 </assert_contents>
167 </output>
168 </test>
92 <test> 169 <test>
93 <param name="dataMatrix_in" value="input_dataMatrix.tsv"/> 170 <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
94 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/> 171 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
95 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> 172 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
96 <param name="classnameColumn" value="class"/> 173 <param name="classnameColumn" value="class"/>
97 <param name="sampleclassNames" value=""/> 174 <param name="sampleclassNames" value=""/>
98 <param name="wildcards" value="FALSE"/> 175 <param name="wildcards" value="FALSE"/>
99 <param name="samplenameColumn" value="sampleMetadata"/>
100 <param name="inclusive" value="filter-out"/> 176 <param name="inclusive" value="filter-out"/>
101 <param name="variableRangeFilter" value="FEATMAX:2e6:,mz:200:,rt::800"/> 177 <param name="variableRangeFilter" value="FEATMAX:20.93157:,mz:200:,rt::800"/>
178 <param name="transformation" value="log2"/>
179 <output name="dataMatrix_out">
180 <assert_contents>
181 <has_text text="19.5109032146715" />
182 </assert_contents>
183 </output>
102 <output name="sampleMetadata_out"> 184 <output name="sampleMetadata_out">
103 <assert_contents> 185 <assert_contents>
104 <has_text text="HU_028" /> 186 <has_text text="HU_028" />
105 <has_text text="HU_051" /> 187 <has_text text="HU_051" />
106 <has_text text="HU_060" /> 188 <has_text text="HU_060" />
147 <param name="dataMatrix_in" value="input_dataMatrix.tsv"/> 229 <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
148 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/> 230 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
149 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> 231 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
150 <param name="classnameColumn" value="gender"/> 232 <param name="classnameColumn" value="gender"/>
151 <param name="sampleclassNames" value="M"/> 233 <param name="sampleclassNames" value="M"/>
152 <param name="samplenameColumn" value="sampleMetadata"/>
153 <param name="inclusive" value="filter-in"/> 234 <param name="inclusive" value="filter-in"/>
235 <param name="transformation" value="none"/>
154 <output name="dataMatrix_out"> 236 <output name="dataMatrix_out">
155 <assert_contents> 237 <assert_contents>
156 <not_has_text text="HU_028" /> 238 <not_has_text text="HU_028" />
157 <not_has_text text="HU_051" /> 239 <not_has_text text="HU_051" />
158 <not_has_text text="HU_060" /> 240 <not_has_text text="HU_060" />
197 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/> 279 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
198 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> 280 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
199 <param name="classnameColumn" value="gender"/> 281 <param name="classnameColumn" value="gender"/>
200 <param name="sampleclassNames" value="*"/> 282 <param name="sampleclassNames" value="*"/>
201 <param name="wildcards" value="TRUE"/> 283 <param name="wildcards" value="TRUE"/>
202 <param name="samplenameColumn" value="sampleMetadata"/>
203 <param name="inclusive" value="filter-in"/> 284 <param name="inclusive" value="filter-in"/>
204 <output name="sampleMetadata_out"> 285 <output name="sampleMetadata_out">
205 <assert_contents> 286 <assert_contents>
206 <not_has_text text="HU_204" /> 287 <not_has_text text="HU_204" />
207 <has_text text="HU_028" /> 288 <has_text text="HU_028" />
231 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/> 312 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
232 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> 313 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
233 <param name="classnameColumn" value="gender"/> 314 <param name="classnameColumn" value="gender"/>
234 <param name="sampleclassNames" value="M"/> 315 <param name="sampleclassNames" value="M"/>
235 <param name="wildcards" value="FALSE"/> 316 <param name="wildcards" value="FALSE"/>
236 <param name="samplenameColumn" value="sampleMetadata"/>
237 <param name="inclusive" value="filter-in"/> 317 <param name="inclusive" value="filter-in"/>
238 <output name="sampleMetadata_out"> 318 <output name="sampleMetadata_out">
239 <assert_contents> 319 <assert_contents>
240 <not_has_text text="HU_028" /> 320 <not_has_text text="HU_028" />
241 <not_has_text text="HU_051" /> 321 <not_has_text text="HU_051" />
265 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/> 345 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
266 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> 346 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
267 <param name="classnameColumn" value="gender"/> 347 <param name="classnameColumn" value="gender"/>
268 <param name="sampleclassNames" value="M"/> 348 <param name="sampleclassNames" value="M"/>
269 <param name="wildcards" value="FALSE"/> 349 <param name="wildcards" value="FALSE"/>
270 <param name="samplenameColumn" value="sampleMetadata"/>
271 <param name="inclusive" value="filter-in"/> 350 <param name="inclusive" value="filter-in"/>
272 <output name="variableMetadata_out"> 351 <output name="variableMetadata_out">
273 <assert_contents> 352 <assert_contents>
274 <has_text text="HMDB03193" /> 353 <has_text text="HMDB03193" />
275 <not_has_text text="HMDB00822" /> 354 <not_has_text text="HMDB00822" />
295 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/> 374 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
296 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> 375 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
297 <param name="classnameColumn" value="gender"/> 376 <param name="classnameColumn" value="gender"/>
298 <param name="sampleclassNames" value="M"/> 377 <param name="sampleclassNames" value="M"/>
299 <param name="wildcards" value="FALSE"/> 378 <param name="wildcards" value="FALSE"/>
300 <param name="samplenameColumn" value="sampleMetadata"/>
301 <param name="inclusive" value="filter-in"/> 379 <param name="inclusive" value="filter-in"/>
302 <output name="variableMetadata_out"> 380 <output name="variableMetadata_out">
303 <assert_contents> 381 <assert_contents>
304 <has_text text="HMDB03193" /> 382 <has_text text="HMDB03193" />
305 <not_has_text text="HMDB00822" /> 383 <not_has_text text="HMDB00822" />
325 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/> 403 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
326 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> 404 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
327 <param name="classnameColumn" value="gender"/> 405 <param name="classnameColumn" value="gender"/>
328 <param name="sampleclassNames" value="[Mm],[fF]"/> 406 <param name="sampleclassNames" value="[Mm],[fF]"/>
329 <param name="wildcards" value="FALSE"/> 407 <param name="wildcards" value="FALSE"/>
330 <param name="samplenameColumn" value="sampleMetadata"/>
331 <param name="inclusive" value="filter-in"/> 408 <param name="inclusive" value="filter-in"/>
332 <output name="sampleMetadata_out"> 409 <output name="sampleMetadata_out">
333 <assert_contents> 410 <assert_contents>
334 <has_text text="HU_028" /> 411 <has_text text="HU_028" />
335 <has_text text="HU_051" /> 412 <has_text text="HU_051" />
359 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/> 436 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
360 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> 437 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
361 <param name="classnameColumn" value=""/> 438 <param name="classnameColumn" value=""/>
362 <param name="sampleclassNames" value="M"/> 439 <param name="sampleclassNames" value="M"/>
363 <param name="wildcards" value="FALSE"/> 440 <param name="wildcards" value="FALSE"/>
364 <param name="samplenameColumn" value="sampleMetadata"/>
365 <param name="inclusive" value="filter-in"/> 441 <param name="inclusive" value="filter-in"/>
366 <output name="sampleMetadata_out"> 442 <output name="sampleMetadata_out">
367 <assert_contents> 443 <assert_contents>
368 <has_text text="HU_028" /> 444 <has_text text="HU_028" />
369 <has_text text="HU_051" /> 445 <has_text text="HU_051" />
400 -------------------------------------------------------------------------- 476 --------------------------------------------------------------------------
401 477
402 478
403 **R package** 479 **R package**
404 480
405 The *w4mclassfilter* package is available from the Hegeman lab github repository (https://github.com/HegemanLab/w4mclassfilter/releases). 481 The *w4mclassfilter* package (which is used by the W4m Data Subset tool) is available from the Hegeman lab github repository (https://github.com/HegemanLab/w4mclassfilter/releases).
406 482
407 ----------------------------------------------------------------------------------------------------------------------------------------- 483 -----------------------------------------------------------------------------------------------------------------------------------------
408 484
409 485
410 **Tool updates** 486 **Tool updates**
411 487
412 See the **NEWS** section at the bottom of this page 488 See the **NEWS** section at the bottom of this page
413 489
414 --------------------------------------------------- 490 ---------------------------------------------------
415 491
416 ============================================== 492 ===========================================================
417 Filter Workflow4Metabolomics data matrix files 493 "W4m Data Subset" - Filter Workflow4Metabolomics data files
418 ============================================== 494 ===========================================================
495
496 ----------
497 Motivation
498 ----------
499
500 GC-MS and LC-MS experiments seek to resolve as features chemicals that have distinct chromatographic retention-time ("rt") and (after ionization) mass-to-charge ratio ("m/z" or "mz").
501 (If the MS protocol includes fragmentation, several features may result for each chemical.)
502 Data for a sample are collected as MS intensities, each of which is associated with a position on a 2D plane with dimensions of rt and m/z.
503 Ideally, features would be sufficiently reproducible among sample-runs to distinguish features that are common among samples from those that differ.
504
505 The chromatographic retention-time for a chemical can vary from one chromatography run to the next.
506 Workflow4Metabolomics (W4m, [Giacomoni *et al.*, 2014, Guitton *et al.* 2017]) is a "flavor" of Galaxy that uses the XCMS preprocessing tools for "retention-time correction" to align features among samples.
507 Features may be better aligned if pooled samples and blanks are included.
508
509 Multivariate statistical techniques may be used to discover clusters of similar samples (Th]]>&#233;<![CDATA[venot *et al.*, 2015).
510 However, once retention-time alignment of features has been achieved among samples in GC-MS and LC-MS datasets:
511
512 - The presence of pools and blanks may confound identification and separation of clusters.
513 - Multivariate statistical algorithms may be impacted by missing values or dimensions that have zero variance.
419 514
420 ----------- 515 -----------
421 Description 516 Description
422 ----------- 517 -----------
423 518
424 Filter a set of retention-corrected W4M files (dataMatrix, sampleMetadata, variableMetadata) by sample-class 519 The **W4m Data Subset** tool **selects subsets of samples, features, or data values** for further analysis.
520
521 - The tool takes as input the data matrix, sample metadata, and variable metadata datasets produced by W4m's XCMS [Smith *et al.*, 2006] and CAMERA [Kuhl *et al.*, 2012] tools.
522 - The tool produces the same trio of output datasets, modified as follows.
523
524 This tool can perform several operations to reduce the number of samples or features to be analyzed (although **this should be done only in a statistically sound manner** consistent with the nature of the experiment):
525
526 - Samples may be eliminated by filtering on a designated “sample class” column in sampleMetadata.
527 - Features may be eliminated by specifying minimum or maximum value (or both) allowable in columns of variableMetadata.
528 - Features may be eliminated by “range of row-maximum for each feature”, i.e., by specifying minimum or maximum intensity (or both) allowable in each row of the dataMatrix (i.e., for the feature across all samples).
529
530 This tool also performs several operations to address several data issues that may impede downstream statistical analysis:
531
532 - Missing values in dataMatrix are imputed to zero.
533 - The values in the dataMatrix may be log-transformed if desired.
534 - Samples that are missing from either sampleMetadata or dataMatrix are eliminated.
535 - Features that are missing from either variableMetadata or dataMatrix are eliminated.
536 - Features and samples that have zero variance are eliminated.
537 - Samples and features are sorted alphabetically in rows and columns of dataMatrix and in rows of variableMetadata and sampleMetadata.
538 - The names of the first columns of variableMetadata and sampleMetadata are set respectively to "variableMetadata" and "sampleMetadata".
539
540 This tool may be applied several times sequentially, which may be useful for:
541
542 - analyzing subsets of samples for progressively smaller sets of treatment-levels, or
543 - choosing subsets of samples based on criteria in several columns of the sampleMetadata table.
425 544
426 ----------------- 545 -----------------
427 Workflow Position 546 Workflow Position
428 ----------------- 547 -----------------
429 548
430 - Upstream tool category: Preprocessing 549 This tool can be used at any point downstream of Preprocessing.
431 - Downstream tool categories: Normalisation, Statistical Analysis, Quality Control, Filter and Sort 550
432 551 - Possible upstream tool categories: Preprocessing, Quality Control, Statistical Analysis, Filter and Sort
433 ---------- 552 - Possible downstream tool categories: Normalisation, Statistical Analysis, Quality Control, Filter and Sort
434 Motivation
435 ----------
436
437 GC-MS1 and LC-MS1 experiments seek to resolve chemicals as features that have distinct chromatographic behavior and (after ionization) mass-to-charge ratio.
438 Data for a sample are collected as MS intensities, each of which is associated with a position on a 2D plane with dimensions of m/z ratio and chromatographic retention time.
439 Ideally, features would be sufficiently reproducible from sample-run to sample-run to identify features that are commmon among samples and those that differ.
440 However, the chromatographic retention time for a chemical can vary from one run to another.
441 In the Workflow4Metabolomics (W4M, [Giacomoni *et al.*, 2014, Guitton *et al.* 2017]) "flavor" of Galaxy, the XCMS [Smith *et al.*, 2006] preprocessing tools provide for "retention time correction" to align features among samples, but features may be better aligned if pooled samples and blanks are included.
442
443 Multivariate statistical techniques may be used to discover clusters of similar samples, and sometimes it is desirable to apply clustering iteratively to smaller and smaller subsets of samples until observable separation of clusters is no longer significant.
444 Once feature-alignment has been achieved among samples in GC-MS and LC-MS datasets, however, the presence of pools and blanks may confound identification and separation of clusters.
445 Multivariate statistical algorithms also may be impacted by missing values or dimensions that have zero variance (Thévenot *et al.*, 2015).
446
447 The w4mclassfilter tool provides a way to choose subsets of samples for further analysis.
448 The tool takes as input the data matrix, sample metadata, and variable metadata Galaxy datasets produced by W4M and produces the same trio of datasets with data only for the selected samples.
449 The tool uses a "sample-class" column in the sample metadata as the basis for including or eliminating samples for further analysis.
450 Class-values to be considered are provided by the user as a comma-separated list.
451 The user also provides an indication whether the list specifies classes to be included in further analysis ("filter-in") or rather to be excluded from it ("filter-out").
452 Next, missing and negative intensites for features of the remaining samples are imputed to zero.
453 Finally, samples or features with zero variance are eliminated.
454 553
455 ----------- 554 -----------
456 Input files 555 Input files
457 ----------- 556 -----------
458 557
470 ---------- 569 ----------
471 Parameters 570 Parameters
472 ---------- 571 ----------
473 572
474 Data matrix file 573 Data matrix file
475 | variable x sample **dataMatrix** (tabular separated values) file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row and column names must be identical to the rownames of the sample and variable metadata, respectively (see below) 574 | variable x sample **dataMatrix** (tabular separated values) file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row and column names must be identical, respectively, to the rownames of the variable metadata file and sample metadata file
476 | 575 |
477 576
478 Sample metadata file 577 Sample metadata file
479 | sample x metadata **sampleMetadata** (tabular separated values) file of the numeric and/or character sample metadata, with . as decimal and NA for missing values 578 | sample x metadata **sampleMetadata** (tabular separated values) file of the numeric and/or character sample metadata, with . as decimal and NA for missing values
480 | 579 |
481 580
482 Variable metadata file 581 Variable metadata file
483 | variable x metadata **variableMetadata** (tabular separated values) file of the numeric and/or character variable metadata, with . as decimal and NA for missing values 582 | variable x metadata **variableMetadata** (tabular separated values) file of the numeric and/or character variable metadata, with . as decimal and NA for missing values
484 | 583 |
485 584
486 Column that names the sample (default = '``sampleMetadata``')
487 | name of the column in sample metadata that has the name of the sample
488 |
489
490 Column that names the sample-class (default = '``class``') 585 Column that names the sample-class (default = '``class``')
491 | name of the column in sample metadata that has the values to be tested against the '``classes``' input parameter 586 | name of the column in **sampleMetadata** that has the values to be tested against the '``Names of sample-classes``' input parameter
492 | 587 |
493 588
494 Names of sample classes (default = no names) 589 Names of sample-classes (default = no names)
495 | comma-separated names of sample classes to include or exclude 590 | comma-separated names (or regular expressions to match names) of sample-classes to include or exclude
496 | 591 |
497 592
498 Wild-cards (default = '``wild-cards``') 593 'Wild cards' or 'regular expressions' (default = '``wild-cards``')
499 | '``wild-cards``' - use wild-cards to match names of sample classes (see 'Wild card patterns to match class-names' below) 594 | '``wild-cards``' - use wild cards to match names of sample-classes (see the 'Wild card patterns to match class-names' section below)
500 | '``regular-expressions``' - exclude only the named sample classes (see 'Regular expression patterns to match class-names' below) 595 | '``regular-expressions``' - use regular expressions to match the named sample-classes (see the 'Regular expression patterns to match class-names' section below)
501 | 596 |
502 597
503 Include named classes (default = '``filter-out``') 598 Exclude/include named classes (default = '``filter-out``')
504 | '``filter-in``' - include only the named sample classes 599 | '``filter-in``' - include only the named sample-classes
505 | '``filter-out``' - exclude only the named sample classes 600 | '``filter-out``' - exclude only the named sample-classes
506 | 601 |
507 602
508 Variable-range filters (default = no filters) 603 Variable-range filters (default = no filters)
509 | comma-separated names of variable-range filters (see 'Variable-range filters' below) 604 | comma-separated names of variable-range filters (see the 'Variable-range filters' section below)
510 | 605 |
511 606
607 Data-transformation (default = '``none``')
608 | '``none``' - do not transform data matrix values
609 | '``log2``' - take the log base 2 of the values in the data matrix
610 | '``log10``' - take the log base 10 of the values in the data matrix
611 | For both '``log2``' and '``log10``', negative and missing values are imputed to zero.
612 |
512 613
513 614
514 ------------ 615 ------------
515 Output files 616 Output files
516 ------------ 617 ------------
517 618
518
519 sampleMetadata 619 sampleMetadata
520 | (tabular separated values) file identical to the **sampleMetadata** file given as an input argument, excepting lacking rows for samples (xC-MS features) that have been filtered out (by the sample-class filter or because of zero variance) 620 | (tabular separated values) file identical to the **sampleMetadata** file given as an input argument, excepting lacking rows for samples that have been filtered out (by the sample-class filter, or because of zero variance, or because they were missing in the input data matrix)
521 | 621 |
522 622
523 variableMetadata 623 variableMetadata
524 | (tabular separated values) file identical to the **variableMetadata** file given as an input argument, excepting lacking rows for variables (xC-MS features) that have been filtered out (because of zero variance) 624 | (tabular separated values) file identical to the **variableMetadata** file given as an input argument, excepting lacking rows for variables (xC-MS features) that have been filtered out (by the variable-range filter, or because of zero variance, or because they were missing in the input data matrix)
525 | 625 |
526 626
527 dataMatrix 627 dataMatrix
528 | (tabular separated values) file identical to the **dataMatrix** file given as an input argument, excepting lacking rows for variables (xC-MS features) that have been filtered out (because of zero variance) and columns that have been filtered out (by the sample-class filter or because of zero variance) 628 | (tabular separated values) file identical to the **dataMatrix** file given as an input argument, excepting lacking rows and columns for variables and samples that have been filtered out, respectively
529 | 629 |
530 630
531 631
532 --------------------------------------- 632 -----------------------------------------
533 Wild card patterns to match class-names 633 'Wild card' patterns to match class-names
534 --------------------------------------- 634 -----------------------------------------
535 635
536 Beginning with v0.98.2, w4mclassfilter supports use of R "wild card" patterns to select class-names. 636 W4m Data Subset supports use of R "wild card" patterns to select class-names.
537 637
538 - use '``?``' to match a single character 638 - use '``?``' to match a single character
539 - use '``*``' to match zero or more characters 639 - use '``*``' to match zero or more characters
540 - the entire pattern must match the sample name 640 - the entire pattern must match the sample name
541 641
543 643
544 - '``??.samp*``' matches '``my.sample``' but not '``my.own.sample``' 644 - '``??.samp*``' matches '``my.sample``' but not '``my.own.sample``'
545 - '``*.sample``' matches '``my.sample``' and '``my.own.sample``' 645 - '``*.sample``' matches '``my.sample``' and '``my.own.sample``'
546 - '``*.sampl``' matches neither '``my.sample``' nor '``my.own.sample``' 646 - '``*.sampl``' matches neither '``my.sample``' nor '``my.own.sample``'
547 647
548 ------------------------------------------------ 648 --------------------------------------------------
549 Regular expression patterns to match class-names 649 'Regular expression' patterns to match class-names
550 ------------------------------------------------ 650 --------------------------------------------------
551 651
552 Beginning with v0.98.2, w4mclassfilter supports use of R "regular expression" patterns to select class-names. 652 W4m Data Subset supports use of R "regular expression" patterns to select class-names.
553 653
554 R uses POSIX 1003.2 standard regular expressions, which allow precise pattern-matching and are exhaustively defined at: 654 R uses POSIX 1003.2 standard regular expressions, which allow precise pattern-matching and are exhaustively defined at:
555 http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html 655 http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
556 656
557 However, only a few basic building blocks of regular expressions need to be mastered for most cases: 657 However, only a few basic building blocks of regular expressions need to be mastered for most cases:
636 736
637 This example retains only samples whose 'gender' attribute is 'M'. 737 This example retains only samples whose 'gender' attribute is 'M'.
638 738
639 **Input parameters** 739 **Input parameters**
640 740
641 +------------------------------------+-----------------+ 741 +------------------------------------+-------------------------------+
642 | Input Parameter | Value | 742 | Input Parameter | Value |
643 +====================================+=================+ 743 +====================================+===============================+
644 | Names of sample classes | M | 744 | Names of sample-classes | M |
645 +------------------------------------+-----------------+ 745 +------------------------------------+-------------------------------+
646 | Include named classes | filter-in | 746 | Exclude/include named classes | filter-in |
647 +------------------------------------+-----------------+ 747 +------------------------------------+-------------------------------+
648 | Column that names the sample-class | gender | 748 | Column that names the sample-class | gender |
649 +------------------------------------+-----------------+ 749 +------------------------------------+-------------------------------+
650 | Column that names the sample | sampleMetadata | 750 | Variable-range filters | (Leave this field empty.) |
651 +------------------------------------+-----------------+ 751 +------------------------------------+-------------------------------+
752 | Data transformation | none |
753 +------------------------------------+-------------------------------+
652 754
653 **Expected outputs** 755 **Expected outputs**
654 756
655 +-------------------+---------------------------------------------------------------------------------------------------------------------------+ 757 +-------------------+---------------------------------------------------------------------------------------------------------------------------+
656 | Expected Output | Download from URL | 758 | Expected Output | Download from URL |
671 This example retains only features whose mz is greater than 200, whose rt is less than 800, and whose maximum intensity across all samples is at least 2,000,000. 773 This example retains only features whose mz is greater than 200, whose rt is less than 800, and whose maximum intensity across all samples is at least 2,000,000.
672 This example retains all samples (except those having zero variance for all features), although it would be possible to filter on samples as well. 774 This example retains all samples (except those having zero variance for all features), although it would be possible to filter on samples as well.
673 775
674 **Input parameters** 776 **Input parameters**
675 777
676 +------------------------------------+-------------------------------+ 778 +------------------------------------+------------------------------------+
677 | Input Parameter | Value | 779 | Input Parameter | Value |
678 +====================================+===============================+ 780 +====================================+====================================+
679 | Names of sample classes | (Leave this field empty.) | 781 | Names of sample-classes | (Leave this field empty.) |
680 +------------------------------------+-------------------------------+ 782 +------------------------------------+------------------------------------+
681 | Include named classes | filter-out | 783 | Exclude/include named classes | filter-out |
682 +------------------------------------+-------------------------------+ 784 +------------------------------------+------------------------------------+
683 | Column that names the sample-class | class | 785 | Column that names the sample-class | gender |
684 +------------------------------------+-------------------------------+ 786 +------------------------------------+------------------------------------+
685 | Column that names the sample | sampleMetadata | 787 | Variable-range filters | FEATMAX:20.93157:,mz:200:,rt::800 |
686 +------------------------------------+-------------------------------+ 788 +------------------------------------+------------------------------------+
687 | Variable range-filters | FEATMAX:2e6:,mz:200:,rt::800 | 789 | Data transformation | log2 |
688 +------------------------------------+-------------------------------+ 790 +------------------------------------+------------------------------------+
689 791
690 **Expected outputs** 792 **Expected outputs**
691 793
692 +-------------------+------------------------------------------------------------------------------------------------------------------------------+ 794 +-------------------+------------------------------------------------------------------------------------------------------------------------------+
693 | Expected Output | Download from URL | 795 | Expected Output | Download from URL |
703 805
704 ---- 806 ----
705 NEWS 807 NEWS
706 ---- 808 ----
707 809
708 CHANGES IN VERSION 0.98.7 810 Changes in version 0.98.8
709 ========================= 811 =========================
710 812
711 New features 813 New features
712 814
713 * First column of output variableMetadata (that has feature names) now is always named "variableMetadata". 815 - The tool now appears in Galaxy with a new, more representative name: "W4m Data Subset". (Earlier versions of this tool appeared in Galaxy with the name "Sample Subset".)
714 * First column of output sampleMetadata now (that has sample names) is always named "sampleMetadata". 816 - Option was added to log-transform data matrix values.
817 - Output datasets are named in conformance with the W4m convention of appending the name of each preprocessing tool to the input dataset name.
818 - Superfluous "Column that names the sample" input parameter was eliminated.
819 - Some documentation was updated or clarified.
715 820
716 Internal modifications 821 Internal modifications
717 822
718 * Now uses w4mclassfilter R package v0.98.7. 823 - None
719 824
720 CHANGES IN VERSION 0.98.6 825 Changes in version 0.98.7
721 ========================= 826 =========================
722 827
723 New features 828 New features
724 829
725 * Added support for filtering out features whose attributes fall outside specified ranges. 830 - First column of output variableMetadata (that has feature names) now is always named "variableMetadata".
831 - First column of output sampleMetadata (that has sample names) now is always named "sampleMetadata".
832
833 Internal modifications
834
835 - Now uses w4mclassfilter R package v0.98.7.
836
837 Changes in version 0.98.6
838 =========================
839
840 New features
841
842 - Added support for filtering out features whose attributes fall outside specified ranges.
726 For more detail, see "Variable-range filters" above. 843 For more detail, see "Variable-range filters" above.
727 844
728 Internal modifications 845 Internal modifications
729 846
730 * Now uses w4mclassfilter R package v0.98.6. 847 - Now uses w4mclassfilter R package v0.98.6.
731 * Now sorts sample names and feature names in output files because some statistical tools expect the same order in `dataMatrix` row and column names as in the corresponding metadata files. 848 - Now sorts sample names and feature names in output files because some statistical tools expect the same order in `dataMatrix` row and column names as in the corresponding metadata files.
732 849
733 Changes in version 0.98.3 850 Changes in version 0.98.3
734 ========================= 851 =========================
735 852
736 Internal modifications 853 Internal modifications
737 854
738 * Improved input handling. 855 - Improved input handling.
739 * Now uses w4mclassfilter R package v0.98.3, although that version has no functional implications for this tool. 856 - Now uses w4mclassfilter R package v0.98.3, although that version has no functional implications for this tool.
740 * Improved reference-list. 857 - Improved reference-list.
741 858
742 Changes in version 0.98.2 859 Changes in version 0.98.2
743 ========================= 860 =========================
744 861
745 New features 862 New features
746 863
747 * Added support for R-flavored regular expression pattern-matching when selecting names of sample-classes. 864 - Added support for R-flavored regular expression pattern-matching when selecting names of sample-classes.
748 * Empty classes argument or zero-length class_column result in no samples filtered out. 865 - Empty classes argument or zero-length class_column result in no samples filtered out.
749 866
750 Internal modifications 867 Internal modifications
751 868
752 * Support and tests for new features. 869 - Support and tests for new features.
753 870
754 Changes in version 0.98.1 871 Changes in version 0.98.1
755 ========================= 872 =========================
756 873
757 First release - Wrap the w4mclassfilter R package that implements filtering of W4M data matrix, variable metadata, and sample metadata by class of sample. 874 First release - Wrap the w4mclassfilter R package that implements filtering of W4m data matrix, variable metadata, and sample metadata by class of sample.
758 875
759 New features 876 New features
760 877
761 * *dataMatrix* *is* modified by the tool, so it *does* appear as an output file 878 - Output *dataMatrix* is input dataMatrix as modified by the tool
762 * *sampleMetadata* *is* modified by the tool, so it *does* appear as an output file 879 - Output *sampleMetadata* is input sampleMetadata as modified by the tool
763 * *variableMetadata* *is* modified by the tool, so it *does* appear as an output file 880 - Output *variableMetadata* is input variableMetadata as modified by the tool
764
765 Internal modifications
766
767 * N/A
768 881
769 ]]></help> 882 ]]></help>
770 <citations> 883 <citations>
771 <!-- Giacomoni_2014 W4M 2.5 --> 884 <!-- Giacomoni_2014 W4m 2.5 -->
772 <citation type="doi">10.1093/bioinformatics/btu813</citation> 885 <citation type="doi">10.1093/bioinformatics/btu813</citation>
773 <!-- Guitton_2017 W4M 3.0 --> 886 <!-- Guitton_2017 W4m 3.0 -->
774 <citation type="doi">10.1016/j.biocel.2017.07.002</citation> 887 <citation type="doi">10.1016/j.biocel.2017.07.002</citation>
888 <!-- Kuhl_2012 CAMERA -->
889 <citation type="doi">10.1021/ac202450g</citation>
775 <!-- Smith_2006 XCMS --> 890 <!-- Smith_2006 XCMS -->
776 <citation type="doi">10.1021/ac051437y</citation> 891 <citation type="doi">10.1021/ac051437y</citation>
777 <!-- Th_venot_2015 Urinary metabolome statistics --> 892 <!-- Thevenot_2015 Urinary metabolome statistics -->
778 <citation type="doi">10.1021/acs.jproteome.5b00354</citation> 893 <citation type="doi">10.1021/acs.jproteome.5b00354</citation>
779 </citations> 894 </citations>
780 <!-- 895 <!--
781 vim:noet:sw=2:ts=2 896 vim:noet:sw=2:ts=2
782 --> </tool> 897 --> </tool>