comparison macros.xml @ 0:42c2a25ff197 draft

planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/ramclustr commit c321421a07bfdcc9ed423e9ed2ee794157984ba1
author recetox
date Wed, 29 Jun 2022 10:00:43 +0000
parents
children 25625114618e
comparison
equal deleted inserted replaced
-1:000000000000 0:42c2a25ff197
1 <macros>
<!-- Single source of truth for the version string; expanded wherever @TOOL_VERSION@ appears.
     NOTE(review): presumably the wrapped RAMClustR package version; confirm against the tool wrappers. -->
<token name="@TOOL_VERSION@">1.2.4</token>
3
<!-- Creator metadata shared by all tools importing these macros:
     four individual maintainers (GitHub profile URL plus ORCID iD) and the hosting organization. -->
<xml name="creator">
    <creator>
        <person givenName="Helge" familyName="Hecht"
                url="https://github.com/hechth" identifier="0000-0001-6744-996X"/>
        <person givenName="Maksym" familyName="Skoryk"
                url="https://github.com/maximskorik" identifier="0000-0003-2056-8018"/>
        <person givenName="Matej" familyName="Troják"
                url="https://github.com/xtrojak" identifier="0000-0003-0841-2707"/>
        <person givenName="Martin" familyName="Čech"
                url="https://github.com/martenson" identifier="0000-0002-9318-1781"/>
        <organization name="RECETOX MUNI"
                      url="https://www.recetox.muni.cz/"
                      email="GalaxyToolsDevelopmentandDeployment@space.muni.cz"/>
    </creator>
</xml>
32
<!-- Inputs for the CSV entry point: a mandatory MS feature table and an optional
     idMSMS / MSe table, grouped in the "ms_csv" section. -->
<xml name="parameters_csv">
    <section name="ms_csv" title="Input MS Data as CSV" expanded="true">
        <param name="ms" type="data" format="csv" label="Input CSV"
               help="Features as columns, rows as samples. Column header in format mz_rt."/>
        <param name="idmsms" type="data" format="csv" optional="true" label="idMSMS"
               help="Optional idMSMS / MSe csv data. Same dimension and names as in input CSV are required."/>
    </section>
</xml>
41
<!-- Inputs for the XCMS entry point: the fillPeaks RData object and a switch that
     controls whether phenotype data is carried over, grouped in the "xcms" section. -->
<xml name="parameters_xcms">
    <section name="xcms" title="Input MS Data as XCMS" expanded="true">
        <param name="input_xcms" type="data" format="rdata.xcms.fillpeaks" label="Input XCMS"
               help="Grouped feature data for clustering."/>
        <param name="usePheno" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE"
               label="Preserve phenotype"
               help="Transfer phenotype data from XCMS object to Spec abundance file."/>
    </section>
</xml>
50
<!-- Core similarity parameters: correlational sigma (sr), the correlation method,
     and the maximum retention-time difference (maxt).
     NOTE(review): "everything" looks like a value of the "use" argument of R's cor() rather than
     of "method" (pearson / kendall / spearman). It is kept here for backward compatibility;
     confirm that the wrapped script accepts it. -->
<xml name="parameters_required">
    <param name="sr" type="float" value="0.5" label="Sigma r"
           help="Correlational similarity between features."/>
    <param name="cor_method" type="select" display="radio" label="Correlation method"
           help="Choose correlational method to be used - see [1] for details.">
        <option value="pearson" selected="true">pearson</option>
        <option value="everything">everything</option>
        <option value="spearman">spearman</option>
        <option value="kendall">kendall</option>
    </param>
    <param name="maxt" type="float" value="60" label="Maximum RT difference"
           help="Maximum difference to calculate RT similarity - values beyond this are assigned zero similarity."/>
</xml>
63
<!-- Optional tuning parameters, grouped into five sections:
     clustering, normalisation, performance, MSP output and extras.
     The section and parameter names are part of the tool interface (referenced by the
     command template and by output filters such as msp_output_details['merge_msp']). -->
<xml name="main_parameters">
    <section name="clustering" title="Clustering" expanded="true">
        <param label="Clustering linkage method" name="linkage" type="select" display="radio"
               help="Choose hierarchical clustering linkage method - see [2] for details.">
            <option value="average" selected="true">average</option>
            <option value="ward.D">ward.D</option>
            <option value="ward.D2">ward.D2</option>
            <option value="single">single</option>
            <option value="complete">complete</option>
            <option value="mcquitty">mcquitty</option>
            <option value="median">median</option>
            <option value="centroid">centroid</option>
        </param>
        <param label="Minimal cluster size" name="minModuleSize" type="integer" value="2"
               help="Minimal size (number of features) of a cluster."/>
        <param label="Maximal tree height" name="hmax" type="float" value="0.3"
               help="Cut the Hierarchical Cluster Analysis tree at this height, see [3] for details."/>
        <param label="Use deepSplit" name="deepSplit" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false"
               help="Check to produce more smaller clusters, uncheck for fewer bigger clusters, see [3] for details."/>
    </section>

    <section name="normalisation" title="Normalisation" expanded="true">
        <conditional name="normalisation_method">
            <param label="Normalisation method" name="normalize" type="select" display="radio"
                   help="Choose method for normalization of feature intensities.">
                <option value="none" selected="true">none</option>
                <option value="TIC">TIC</option>
                <option value="quantile">quantile</option>
                <option value="batch.qc">batch.qc</option>
            </param>
            <!-- Every select option gets an explicit branch; only batch.qc needs extra inputs. -->
            <when value="none"/>
            <when value="TIC"/>
            <when value="quantile"/>
            <when value="batch.qc">
                <param label="Metadata details" name="batch_order_qc" type="data" format="csv" optional="true"
                       help="CSV with sample names (or indices, currently not handled) on rows and columns with:
                             batch number ('batch'), position in sequence ('order'), and whether it is a QC sample or not
                             ('qc' with true/false OR 'sampleType' with 'sample/qc/blank')."/>
                <param label="QC injection range" name="qc_inj_range" type="integer" value="20"
                       help="How many injections around each injection are to be scanned for presence of QC samples?
                             A good rule of thumb is between 1 and 3 times the typical
                             injection span between QC injections. i.e. if you inject QC every 7 samples, set this to
                             between 7 and 21. Smaller values provide more local precision but make normalization sensitive
                             to individual poor outliers (though these are first removed using the boxplot function outlier
                             detection), while wider values provide less local precision in normalization but better
                             stability to individual peak areas."/>
            </when>
        </conditional>
    </section>

    <section name="performance" title="Performance">
        <param label="Blocksize" name="blocksize" type="integer" value="2000"
               help="Number of features processed in one block."/>
        <param label="Blocksize factor" name="mult" type="integer" value="5"
               help="Factor to scale blocksize to influence processing speed."/>
    </section>

    <section name="msp_output_details" title="MSP output">
        <param label="Merge MSP Files" name="merge_msp" type="boolean" truevalue="TRUE" falsevalue="FALSE"
               checked="true" help="Merge all MSP in one file or export one MSP per spectra."/>
        <param label="m/z decimal places" name="mzdec" type="integer" value="6"
               help="Number of decimal places used in printing m/z values."/>
        <!--
        Currently not forwarded because the MSP is exported always manually afterwards
        <param label="mspout" name="mspout" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" help="write msp formatted spectra to file?" />
        -->
    </section>

    <section name="extras" title="Extras">
        <param label="RT only low n" name="rt_only_low_n" type="boolean" truevalue="TRUE" falsevalue="FALSE"
               checked="true"
               help="At low injection numbers, correlational relationships of peak intensities may be unreliable.
                     By default, RAMClustR will simply ignore the correlational Sigma r value and cluster on retention time alone.
                     If you wish to use correlation with n less than 5, set this value to FALSE."/>
        <param label="Replace zeros" name="replace_zeros" type="boolean" truevalue="TRUE" falsevalue="FALSE"
               checked="true"
               help="NA, NaN, and Inf values are replaced with zero, and zero values are sometimes returned from
                     peak picking. When TRUE, zero values will be replaced with a small amount of noise, with noise level
                     set based on the detected signal intensities for that feature."/>
        <param label="Experimental design metadata" name="ExpDes" type="data" format="csv" optional="true"
               help="Definition of experimental design in CSV format."/>
    </section>
</xml>
144
<!-- MSP outputs. The two filters are complementary on msp_output_details['merge_msp']:
     when it is false, a list collection is discovered from the "spectra" directory;
     when it is true, a single merged msp dataset is produced. -->
<xml name="output_msp">
    <collection name="mass_spectra_collection" type="list"
                label="Mass spectra from ${tool.name} on ${on_string}">
        <discover_datasets pattern="__name_and_ext__" directory="spectra" recurse="true" ext="msp"/>
        <filter>not msp_output_details['merge_msp']</filter>
    </collection>
    <data name="mass_spectra_merged" format="msp"
          label="Mass spectra from ${tool.name} on ${on_string}">
        <filter>msp_output_details['merge_msp']</filter>
    </data>
</xml>
154
<!-- Citation block: the RAMClust publication (Broeckling et al., Analytical Chemistry 2014)
     given as a BibTeX entry. -->
<xml name="citations">
    <citations>
        <citation type="bibtex">
            @article{Broeckling2014e,
                abstract = {Metabolomic data are frequently acquired using chromatographically coupled mass spectrometry
                (MS) platforms. For such datasets, the first step in data analysis relies on feature detection, where a
                feature is defined by a mass and retention time. While a feature typically is derived from a single
                compound, a spectrum of mass signals is more a more-accurate representation of the mass spectrometric
                signal for a given metabolite. Here, we report a novel feature grouping method that operates in an
                unsupervised manner to group signals from MS data into spectra without relying on predictability of the
                in-source phenomenon. We additionally address a fundamental bottleneck in metabolomics, annotation of MS
                level signals, by incorporating indiscriminant MS/MS (idMS/MS) data implicitly: feature detection is
                performed on both MS and idMS/MS data, and feature-feature relationships are determined simultaneously
                from the MS and idMS/MS data. This approach facilitates identification of metabolites using in-source MS
                and/or idMS/MS spectra from a single experiment, reduces quantitative analytical variation compared to
                single-feature measures, and decreases false positive annotations of unpredictable phenomenon as novel
                compounds. This tool is released as a freely available R package, called RAMClustR, and is sufficiently
                versatile to group features from any chromatographic-spectrometric platform or feature-finding software.
                {\textcopyright} 2014 American Chemical Society.},
                author = {Broeckling, C. D. and Afsar, F. A. and Neumann, S. and Ben-Hur, A. and Prenni, J. E.},
                doi = {10.1021/ac501530d},
                issn = {15206882},
                journal = {Analytical Chemistry},
                number = {14},
                pages = {6812--6817},
                pmid = {24927477},
                title = {{RAMClust: A novel feature clustering method enables spectral-matching-based annotation for
                metabolomics data}},
                volume = {86},
                year = {2014}
            }
        </citation>
    </citations>
</xml>
190
<!-- Main help text (reStructuredText) for the clustering tool.
     Grid-table rows must stay exactly as wide as their border lines, and the text under each
     heading must stay indented, otherwise docutils will not render the tables.
     The trailing @GENERAL_HELP@ placeholder is left for token substitution. -->
<token name="@HELP@">
    <![CDATA[
Documentation
    For documentation on the tool see https://github.com/cbroeckl/RAMClustR/blob/master/vignettes/RAMClustR.Rmd

Upstream Tools
    +------------------------------+-------------------------------+----------------------+---------------------+
    | Name                         | Output File                   | Format               | Parameter           |
    +==============================+===============================+======================+=====================+
    | xcms                         | xset.fillPeaks.RData          | rdata.xcms.fillpeaks | xcmsObj             |
    +------------------------------+-------------------------------+----------------------+---------------------+
    | RAMClustR define experiment  | Table with experiment details | csv                  | Experimental design |
    +------------------------------+-------------------------------+----------------------+---------------------+

    The tool takes an **xcmsSet** object as input and extracts all relevant information.

    +-------+------------------------+--------+------------+
    | Name  | Output File            | Format | Parameter  |
    +=======+========================+========+============+
    | ???   | Feature Table with MS1 | csv    | ms         |
    +-------+------------------------+--------+------------+
    | ???   | Feature Table with MS2 | csv    | idmsms     |
    +-------+------------------------+--------+------------+

    Alternatively, the tool takes a **csv** table as input which has to fulfill the following requirements

    (1) no more than one sample (or file) name column and one feature name row;
    (2) feature names that contain the mass and retention times, separated by a constant delimiter; and
    (3) features in columns and samples in rows.

    +----------------------+-------------------+-------------------+--------------------+--------------------+
    | sample               | 100.88_262.464    | 100.01_423.699    | 100.003_128.313    | 100.0057_154.686   |
    +======================+===================+===================+====================+====================+
    | 10_qc_16x_dil_milliq | 0                 | 195953.6376       | 0                  | 0                  |
    +----------------------+-------------------+-------------------+--------------------+--------------------+
    | 11_qc_8x_dil_milliq  | 0                 | 117742.1828       | 4247300.664        | 0                  |
    +----------------------+-------------------+-------------------+--------------------+--------------------+
    | 12_qc_32x_dil_milliq | 4470859.38        | 0                 | 2206092.112        | 0                  |
    +----------------------+-------------------+-------------------+--------------------+--------------------+
    | 15_qc_16x_dil_milliq | 0                 | 0                 | 2767477.481        | 0                  |
    +----------------------+-------------------+-------------------+--------------------+--------------------+


Downstream Tools
    The output is an msp file or a collection of msp files, with an additional Spec Abundance file.

    +---------+--------------+----------------------+
    | Name    | Output File  | Format               |
    +=========+==============+======================+
    | matchMS | Mass Spectra | collection (tgz/msp) |
    +---------+--------------+----------------------+

@GENERAL_HELP@
    ]]>
</token>
246
<!-- Background section (reStructuredText) shared between help texts via the
     @GENERAL_HELP@ placeholder. Wrapped in CDATA like the other help tokens so that
     characters such as brackets and plus signs never need XML escaping. -->
<token name="@GENERAL_HELP@">
    <![CDATA[
Background
    Metabolomics
        Metabolomics is frequently performed using chromatographically coupled mass spectrometry, with gas
        chromatography, liquid chromatography, and capillary electrophoresis being the most frequently utilized
        methods of separation. The coupling of chromatography to mass spectrometry is enabled with an
        appropriate ionization source - electron impact (EI) for gas phase separations and electrospray
        ionization (ESI) for liquid phase separations. XCMS is a commonly used tool to detect all the signals
        from a metabolomics dataset, generating aligned features, where a feature is represented by a mass and
        retention time. Each feature is presumed to derive from a single compound. However, each compound is
        represented by several features. With any ionization method, isotopic peaks will be observed reflective
        of the elemental composition of the analyte. In EI, fragmentation is a byproduct of ionization, and has
        driven the generation of large mass spectral libraries. In ESI, in-source fragmentation frequently
        occurs, the magnitude of which is compound dependent, with more labile compounds being more prone to
        in-source fragmentation. ESI can also produce multiple adduct forms (protonated, potassiated, sodiated,
        ammoniated...), and can produce multimers (i.e. [2M+H]+, [3M+K]+, etc) and multiple charged species
        ([M+2H]++). This can become further complicated by considering combinations of these phenomena. For
        example [2M+3H]+++ (triply charged dimer) or an in-source fragment of a dimer.

    RAMClustR approach
        RAMClustR was designed to group features derived from the same compound using an approach which is
        **1.** unsupervised, **2.** platform agnostic, and **3.** devoid of curated rules, as the depth of
        understanding of these processes is insufficient to enable accurate curation/prediction of all phenomena
        that may occur. We achieve this by making two assumptions. The first is that two features derived
        from the same compound will have (approximately) the same retention time. The second is that two
        features derived from the same compound will have (approximately) the same quantitative trend across
        all samples in the xcms sample set. From these assumptions, we can calculate a retention time
        similarity score and a correlational similarity score for each feature pair. A high similarity score
        for both retention time and correlation indicates a strong probability that two features derive from
        the same compound. Since both conditions must be met, the product of the two similarity scores provides
        the best approximation of the total similarity score - i.e. a feature pair with retention time similarity
        of 1 and correlational similarity of 0 is unlikely to derive from one compound - 1 x 0 = 0, the final
        similarity score is zero, indicating the two features represent two different compounds. Similarly, a
        feature pair with retention time similarity of 0 and correlational similarity of 1 is unlikely to derive
        from one compound - 0 x 1 = 0. Alternatively - a feature pair with retention time similarity of 1 and
        correlational similarity of 1 is likely to derive from one compound - 1 x 1 = 1.

        The RAMClustR algorithm is built on creating similarity scores for all pairs of features, submitting
        this score matrix for hierarchical clustering, and then cutting the resulting dendrogram into neat
        chunks using the dynamicTreeCut package - where each 'chunk' of the dendrogram results in a group of
        features likely to be derived from a single compound. Importantly, this is achieved without looking for
        specific phenomena (i.e. sodiation), meaning that grouping can be performed on any dataset, whether it
        is positive or negative ionization mode, EI or ESI, LC-MS, GC-MS or CE-MS, in-source fragment or complex
        adduction event, and predictable or unpredictable signals.
    ]]>
</token>
292
<!-- Help text (reStructuredText) for the "define experiment" companion tool.
     The grid-table row widths must match the border lines for the table to render. -->
<token name="@HELP_experiment@">
    <![CDATA[
Create an Experimental Design specification for RAMClustR experiment.

Downstream Tools
    +-----------+-----------------------+--------+
    | Name      | Output File           | Format |
    +===========+=======================+========+
    | RAMClustR | Experiment definition | csv    |
    +-----------+-----------------------+--------+

    ]]>
</token>
306 </macros>