Mercurial > repos > recetox > ramclustr
view macros.xml @ 11:da7722f665f4 draft default tip
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/ramclustr commit bc3445f7c41271b0062c7674108f57708d08dd28
author | recetox |
---|---|
date | Thu, 30 May 2024 14:52:11 +0000 |
parents | 2d94da58904b |
children |
line wrap: on
line source
<macros>
    <!-- Tool version token. -->
    <token name="@TOOL_VERSION@">1.3.0</token>

    <!-- Creator metadata: contributing people and the RECETOX organization. -->
    <xml name="creator">
        <creator>
            <person givenName="Helge" familyName="Hecht" url="https://github.com/hechth" identifier="0000-0001-6744-996X"/>
            <person givenName="Maksym" familyName="Skoryk" url="https://github.com/maximskorik" identifier="0000-0003-2056-8018"/>
            <person givenName="Matej" familyName="Troják" url="https://github.com/xtrojak" identifier="0000-0003-0841-2707"/>
            <person givenName="Martin" familyName="Čech" url="https://github.com/martenson" identifier="0000-0002-9318-1781"/>
            <person givenName="Zargham" familyName="Ahmad" url="https://github.com/zargham-ahmad" identifier="0000-0002-6096-224X"/>
            <organization url="https://www.recetox.muni.cz/" email="GalaxyToolsDevelopmentandDeployment@space.muni.cz" name="RECETOX MUNI"/>
        </creator>
    </xml>

    <!-- EDAM topic annotation. -->
    <xml name="annotation">
        <edam_topics>
            <edam_topic>topic_3520</edam_topic>
        </edam_topics>
    </xml>

    <!-- Cross-reference to the bio.tools entry. -->
    <xml name="refs">
        <xrefs>
            <xref type="bio.tools">ramclustr</xref>
        </xrefs>
    </xml>

    <!-- Inputs when the MS data is supplied as CSV tables. -->
    <xml name="parameters_csv">
        <section name="ms_csv" title="Input MS Data as CSV" expanded="true">
            <param name="ms" label="Input CSV" type="data" format="csv"
                   help="Features as columns, rows as samples. Column header in format mz_rt."/>
            <param name="idmsms" label="idMSMS" type="data" format="csv" optional="true"
                   help="Optional idMSMS / MSe csv data. Same dimension and names as in input CSV are required."/>
            <param name="csv_phenoData" label="phenoData" type="data" format="csv" optional="true"
                   help="Optional csv containing phenoData."/>
        </section>
    </xml>

    <!-- Input when the MS data is supplied as an XCMS object. -->
    <xml name="parameters_xcms">
        <section name="xcms" title="Input MS Data as XCMS" expanded="true">
            <param name="input_xcms" label="Input XCMS" type="data" format="rdata.xcms.fillpeaks"
                   help="Grouped feature data for clustering."/>
        </section>
    </xml>

    <!-- Inputs when the MS data is supplied as parquet/tabular tables (output from recetox-aplcms). -->
    <xml name="parameters_recetox_aplcms">
        <section name="ms_dataframe" title="Input MS Data as parquet/tabular (output from recetox-aplcms)" expanded="true">
            <param name="ms1_featureDefinitions" label="Input MS1 featureDefinitions" type="data" format="parquet,tabular"
                   help="Metadata with columns: mz, rt, feature names containing MS data."/>
            <param name="ms1_featureValues" label="Input MS1 featureValues" type="data" format="parquet,tabular"
                   help="data with rownames = sample names, colnames = feature names containing MS data."/>
            <param name="df_phenoData" label="phenoData" type="data" format="tsv,csv" optional="true"
                   help="CSV/TSV file containing phenoData (optional)."/>
        </section>
    </xml>

    <!-- Required similarity parameters: sigma r, correlation method and maximum RT difference. -->
    <xml name="parameters_required">
        <param name="sr" label="Sigma r" type="float" value="0.5"
               help="Correlational similarity between features."/>
        <param name="cor_method" label="Correlation method" type="select" display="radio"
               help="Choose correlational method to be used - see [1] for details.">
            <option value="pearson" selected="true">pearson</option>
            <option value="spearman">spearman</option>
            <option value="kendall">kendall</option>
        </param>
        <param name="maxt" label="Maximum RT difference" type="float" value="60"
               help="Maximum difference to calculate RT similarity - values beyond this are assigned zero similarity."/>
    </xml>

    <!-- Main parameter sections: clustering, normalisation, performance, MSP output, extras and filtering. -->
    <xml name="main_parameters">
        <section name="clustering" title="Clustering" expanded="true">
            <param name="linkage" label="Clustering linkage method" type="select" display="radio"
                   help="Choose hierarchical clustering linkage method - see [2] for details.">
                <option value="average" selected="true">average</option>
                <option value="ward.D">ward.D</option>
                <option value="ward.D2">ward.D2</option>
                <option value="single">single</option>
                <option value="complete">complete</option>
                <option value="mcquitty">mcquitty</option>
                <option value="median">median</option>
                <option value="centroid">centroid</option>
            </param>
            <param name="minModuleSize" label="Minimal cluster size" type="integer" value="2"
                   help="Minimal size (number of features) of a cluster."/>
            <param name="hmax" label="Maximal tree height" type="float" value="0.3"
                   help="Cut the Hierarchical Cluster Analysis tree at this height, see [3] for details."/>
            <param name="deepSplit" label="Use deepSplit" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false"
                   help="Check to produce more smaller clusters, uncheck for fewer bigger clusters, see [3] for details."/>
        </section>

        <section name="normalisation" title="Normalisation" expanded="true">
            <conditional name="normalisation_method">
                <param name="normalize" label="Normalisation method" type="select" display="radio"
                       help="Choose method for normalization of feature intensities.">
                    <option value="none" selected="true">none</option>
                    <option value="TIC">TIC</option>
                    <option value="quantile">quantile</option>
                    <option value="batch.qc">batch.qc</option>
                    <option value="qc">qc</option>
                </param>
                <when value="batch.qc">
                    <param name="batch_order_qc" label="Metadata details" type="data" format="csv"
                           help="CSV with sample names (or indices, currently not handled) on rows and columns with: batch number ('batch'), position in sequence ('order'), and whether it is a QC sample or not ('qc' with true/false OR 'sampleType' with 'sample/qc/blank')."/>
                    <param name="qc_inj_range" label="QC injection range" type="integer" value="20"
                           help="How many injections around each injection are to be scanned for presence of QC samples? A good rule of thumb is between 1 and 3 times the typical injection span between QC injections. i.e. if you inject QC every 7 samples, set this to between 7 and 21. Smaller values provide more local precision but make normalization sensitive to individual poor outliers (though these are first removed using the boxplot function outlier detection), while wider values provide less local precision in normalization but better stability to individual peak areas."/>
                </when>
                <when value="qc">
                    <param name="batch_order_qc" label="Metadata details" type="data" format="csv" optional="true"
                           help="CSV with sample names (or indices, currently not handled) on rows and columns with: batch number ('batch'), position in sequence ('order'), and whether it is a QC sample or not ('qc' with true/false OR 'sampleType' with 'sample/qc/blank')."/>
                    <!-- Comparison operators in the help texts below must stay escaped (&lt; / &gt;): a literal "<" is not allowed inside an attribute value. -->
                    <param name="p_cut" label="p.cut" type="float" value="0.05"
                           help="Numeric. When run order correction is applied, only features showing a run order vs signal with a linear p-value (after FDR correction) &lt; p.cut will be adjusted. Also requires r-squared &gt; rsq.cut."/>
                    <param name="rsq_cut" label="rsq.cut" type="float" value="0.1"
                           help="Numeric. When run order correction is applied, only features showing a run order vs signal with a linear r-squared &gt; rsq.cut will be adjusted. Also requires p values &lt; p.cut."/>
                    <param name="p_adjust" label="p.adjust" type="text" value="none"
                           help="Which p-value adjustment should be used? one of ['holm', 'hochberg', 'hommel', 'bonferroni', 'BH', 'BY', 'fdr', 'none']"/>
                </when>
                <when value="none"/>
                <when value="TIC"/>
                <when value="quantile"/>
            </conditional>
        </section>

        <section name="performance" title="Performance">
            <param name="blocksize" label="Blocksize" type="integer" value="2000"
                   help="Number of features processed in one block."/>
            <param name="mult" label="Blocksize factor" type="integer" value="5"
                   help="Factor to scale blocksize to influence processing speed."/>
        </section>

        <section name="msp_output_details" title="MSP output">
            <param name="merge_msp" label="Merge MSP Files" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true"
                   help="Merge all MSP in one file or export one MSP per spectrum."/>
            <param name="mzdec" label="m/z decimal places" type="integer" value="6"
                   help="Number of decimal places used in printing m/z values."/>
            <!-- Currently not forwarded because the MSP is exported always manually afterwards
            <param label="mspout" name="mspout" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" help="write msp formatted spectra to file?" />
            -->
        </section>

        <section name="extras" title="Extras">
            <param name="rt_only_low_n" label="RT only low n" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true"
                   help="At low injection numbers, correlational relationships of peak intensities may be unreliable. By default, RAMClustR will simply ignore the correlational Sigma r value and cluster on retention time alone. If you wish to use correlation with n less than 5, set this value to FALSE."/>
            <param name="replace_zeros" label="Replace zeros" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true"
                   help="NA, NaN, and Inf values are replaced with zero, and zero values are sometimes returned from peak picking. When TRUE, zero values will be replaced with a small amount of noise, with noise level set based on the detected signal intensities for that feature."/>
            <param name="quality_control" label="Quality control" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false"
                   help="Add graph to Rplots.pdf output which summarizes quality control for clustering and for quality control sample variation based on compound ($SpecAbund) and feature ($MSdata and $MSMSdata, if present)."/>
            <param name="ExpDes" label="Experimental design metadata" type="data" format="csv" optional="true"
                   help="Definition of experimental design in CSV format."/>
        </section>

        <section name="filtering" title="Filtering">
            <conditional name="feature_filter_blanks">
                <param name="filter_blanks" label="Filter blanks" type="select"
                       help="Is used to remove features which are found at similar intensity in blank samples">
                    <option value="FALSE" selected="true">FALSE</option>
                    <option value="TRUE">TRUE</option>
                </param>
                <when value="TRUE">
                    <param name="qc_tag" label="qc tag" type="text" value="QC"
                           help="Character vector of length one or two. If length is two, enter search string and factor name in $phenoData slot (i.e. c('QC', 'sample.type')). If length one (i.e. 'QC'), will search for this string in the 'sample.names' slot by default. Default is 'QC'"/>
                    <param name="blank_tag" label="blank tag" type="text" value="blank"
                           help="See 'qc tag', but for blanks to use as background. Default is 'blank'"/>
                    <param name="sn" label="signal to noise (sn)" type="integer" value="3"
                           help="Numeric defines the ratio for 'signal'. i.e. sn = 3 indicates that signal intensity must be 3 fold higher in sample than in blanks, on average, to be retained. Default is '3'"/>
                    <param name="remove_blanks" label="Remove blanks" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true"
                           help="TRUE by default. This removes any recognized blanks samples from the MSdata and MSMSdata sets after they are used to filter contaminant features."/>
                </when>
                <when value="FALSE"/>
            </conditional>
            <conditional name="feature_filter_cv">
                <param name="filter_cv" label="Filter cv" type="select"
                       help="Extractor for xcms objects in preparation for clustering. This function offers normalization by total extracted ion signal. It is recommended to first run 'Filter blanks' to remove non-sample derived signal">
                    <option value="FALSE" selected="true">FALSE</option>
                    <option value="TRUE">TRUE</option>
                </param>
                <when value="TRUE">
                    <param name="qc_tag" label="qc tag" type="text" value="QC"
                           help="Character vector of length one or two. If length is two, enter search string and factor name in $phenoData slot (i.e. c('QC', 'sample.type')). If length one (i.e. 'QC'), will search for this string in the 'sample.names' slot by default. Default is 'QC'"/>
                    <param name="max_cv" label="Max cv" type="float" value="0.5"
                           help="Numeric maximum allowable cv for any feature. Default = 0.5."/>
                </when>
                <when value="FALSE"/>
            </conditional>
        </section>
    </xml>

    <!-- Outputs: MSP spectra as a collection or as one merged file (selected by merge_msp), plus an optional PDF plot. -->
    <xml name="output_msp">
        <collection label="Mass spectra from ${tool.name} on ${on_string} list" name="mass_spectra_collection" type="list">
            <discover_datasets pattern="__name_and_ext__" directory="spectra" recurse="true" ext="msp"/>
            <filter>not msp_output_details['merge_msp']</filter>
        </collection>
        <data label="Mass spectra from ${tool.name} on ${on_string}" name="mass_spectra_merged" format="msp">
            <filter>msp_output_details['merge_msp']</filter>
        </data>
        <data format="pdf" name="pdf_plot" from_work_dir="Rplots.pdf" label="PDF plot.">
            <filter>extras['quality_control'] or filtering['feature_filter_cv']['filter_cv'] == "TRUE"</filter>
        </data>
    </xml>

    <!-- Citation for the RAMClustR publication. -->
    <xml name="citations">
        <citations>
            <!-- Example of annotating a citation using a BibTex entry. -->
            <citation type="bibtex">
                @article{Broeckling2014e,
                    abstract = {Metabolomic data are frequently acquired using chromatographically coupled mass spectrometry (MS) platforms. For such datasets, the first step in data analysis relies on feature detection, where a feature is defined by a mass and retention time. While a feature typically is derived from a single compound, a spectrum of mass signals is more a more-accurate representation of the mass spectrometric signal for a given metabolite. Here, we report a novel feature grouping method that operates in an unsupervised manner to group signals from MS data into spectra without relying on predictability of the in-source phenomenon. We additionally address a fundamental bottleneck in metabolomics, annotation of MS level signals, by incorporating indiscriminant MS/MS (idMS/MS) data implicitly: feature detection is performed on both MS and idMS/MS data, and feature-feature relationships are determined simultaneously from the MS and idMS/MS data. This approach facilitates identification of metabolites using in-source MS and/or idMS/MS spectra from a single experiment, reduces quantitative analytical variation compared to single-feature measures, and decreases false positive annotations of unpredictable phenomenon as novel compounds. This tool is released as a freely available R package, called RAMClustR, and is sufficiently versatile to group features from any chromatographic-spectrometric platform or feature-finding software. {\textcopyright} 2014 American Chemical Society.},
                    author = {Broeckling, C. D. and Afsar, F. A. and Neumann, S. and Ben-Hur, A. and Prenni, J. E.},
                    doi = {10.1021/ac501530d},
                    issn = {15206882},
                    journal = {Analytical Chemistry},
                    number = {14},
                    pages = {6812--6817},
                    pmid = {24927477},
                    title = {{RAMClust: A novel feature clustering method enables spectral-matching-based annotation for metabolomics data}},
                    volume = {86},
                    year = {2014}
                }
            </citation>
        </citations>
    </xml>

    <!-- Help text (reStructuredText) for the clustering tool; whitespace inside the CDATA section is significant. -->
    <token name="@HELP@">
        <![CDATA[
        Documentation
            For documentation on the tool see https://github.com/cbroeckl/RAMClustR/blob/master/vignettes/RAMClustR.Rmd

        Upstream Tools
            +------------------------------+-------------------------------+----------------------+---------------------+
            | Name                         | Output File                   | Format               | Parameter           |
            +==============================+===============================+======================+=====================+
            | xcms                         | xset.fillPeaks.RData          | rdata.xcms.fillpeaks | xcmsObj             |
            +------------------------------+-------------------------------+----------------------+---------------------+
            | RAMClustR define experiment  | Table with experiment details | csv                  | Experimental design |
            +------------------------------+-------------------------------+----------------------+---------------------+

            The tool takes an **xcmsSet** object as input and extracts all relevant information.

            +-------+------------------------+--------+------------+
            | Name  | Output File            | Format | Parameter  |
            +=======+========================+========+============+
            | ???   | Feature Table with MS1 | csv    | ms         |
            +-------+------------------------+--------+------------+
            | ???   | Feature Table with MS2 | csv    | idmsms     |
            +-------+------------------------+--------+------------+

            Alternatively, the tool takes a **csv** table as input which has to fulfill the following requirements (1) no more than one sample (or file) name column and one feature name row; (2) feature names that contain the mass and retention times, separated by a constant delimiter; and (3) features in columns and samples in rows.

            +----------------------+-------------------+-------------------+--------------------+--------------------+
            | sample               | 100.88_262.464    | 100.01_423.699    | 100.003_128.313    | 100.0057_154.686   |
            +======================+===================+===================+====================+====================+
            | 10_qc_16x_dil_milliq | 0                 | 195953.6376       | 0                  | 0                  |
            +----------------------+-------------------+-------------------+--------------------+--------------------+
            | 11_qc_8x_dil_milliq  | 0                 | 117742.1828       | 4247300.664        | 0                  |
            +----------------------+-------------------+-------------------+--------------------+--------------------+
            | 12_qc_32x_dil_milliq | 4470859.38        | 0                 | 2206092.112        | 0                  |
            +----------------------+-------------------+-------------------+--------------------+--------------------+
            | 15_qc_16x_dil_milliq | 0                 | 0                 | 2767477.481        | 0                  |
            +----------------------+-------------------+-------------------+--------------------+--------------------+

        Downstream Tools
            The output is a msp file or a collection of msp files, with additional Spec Abundance file.

            +---------+--------------+----------------------+
            | Name    | Output File  | Format               |
            +=========+==============+======================+
            | matchms | Mass Spectra | collection (tgz/msp) |
            +---------+--------------+----------------------+

        @GENERAL_HELP@
        ]]>
    </token>

    <!-- Background help text (reStructuredText) referenced from @HELP@. -->
    <token name="@GENERAL_HELP@">
        Background
            Metabolomics
                Metabolomics is frequently performed using chromatographically coupled mass spectrometry, with gas chromatography, liquid chromatography, and capillary electrophoresis being the most frequently utilized methods of separation. The coupling of chromatography to mass spectrometry is enabled with an appropriate ionization source - electron impact (EI) for gas phase separations and electrospray ionization (ESI) for liquid phase separations. XCMS is a commonly used tool to detect all the signals from a metabolomics dataset, generating aligned features, where a feature is represented by a mass and retention time. Each feature is presumed to derive from a single compound. However, each compound is represented by several features. With any ionization method, isotopic peaks will be observed reflective of the elemental composition of the analyte. In EI, fragmentation is a byproduct of ionization, and has driven the generation of large mass spectral libraries. In ESI, in-source fragmentation frequently occurs, the magnitude of which is compound dependent, with more labile compounds being more prone to in-source fragmentation. ESI can also produce multiple adduct forms (protonated, potassiated, sodiated, ammoniated...), and can produce multimers (i.e. [2M+H]+, [3M+K]+, etc) and multiple charged species ([M+2H]++). This can become further complicated by considering combinations of these phenomena. For example [2M+3H]+++ (triply charged dimer) or an in-source fragment of a dimer.

            RAMClustR approach
                RAMClustR was designed to group features derived from the same compound using an approach which is **1.** unsupervised, **2.** platform agnostic, and **3.** devoid of curated rules, as the depth of understanding of these processes is insufficient to enable accurate curation/prediction of all phenomena that may occur. We achieve this by making two assumptions. The first is that two features derived from the same compound will have (approximately) the same retention time. The second is that two features derived from the same compound will have (approximately) the same quantitative trend across all samples in the xcms sample set. From these assumptions, we can calculate a retention time similarity score and a correlational similarity score for each feature pair. A high similarity score for both retention time and correlation indicates a strong probability that two features derive from the same compound. Since both conditions must be met, the product of the two similarity scores provides the best approximation of the total similarity score - i.e. a feature pair with retention time similarity of 1 and correlational similarity of 0 is unlikely to derive from one compound - 1 x 0 = 0, the final similarity score is zero, indicating the two features represent two different compounds. Similarly, a feature pair with retention time similarity of 0 and correlational similarity of 1 is unlikely to derive from one compound - 0 x 1 = 0. Alternatively - a feature pair with retention time similarity of 1 and correlational similarity of 1 is likely to derive from one compound - 1 x 1 = 1.

                The RAMClustR algorithm is built on creating similarity scores for all pairs of features, submitting this score matrix for hierarchical clustering, and then cutting the resulting dendrogram into neat chunks using the dynamicTreeCut package - where each 'chunk' of the dendrogram results in a group of features likely to be derived from a single compound. Importantly, this is achieved without looking for specific phenomena (i.e. sodiation), meaning that grouping can be performed on any dataset, whether it is positive or negative ionization mode, EI or ESI, LC-MS GC-MS or CE-MS, in-source fragment or complex adduction event, and predictable or unpredictable signals.
    </token>

    <!-- Help text (reStructuredText) for the experiment-definition tool. -->
    <token name="@HELP_experiment@">
        <![CDATA[
        Create an Experimental Design specification for RAMClustR experiment.

        Downstream Tools
            +-----------+-----------------------+--------+
            | Name      | Output File           | Format |
            +===========+=======================+========+
            | RAMClustR | Experiment definition | csv    |
            +-----------+-----------------------+--------+
        ]]>
    </token>
</macros>