Mercurial > repos > malex > secimtools
changeset 1:2e7d47c0b027 draft
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
line wrap: on
line diff
<tool id="secimtools_anova_fixed" name="Analysis of Variance (ANOVA) Fixed Effects Model" version="@WRAPPER_VERSION@">
    <description>- Perform a multi-way ANOVA with covariates and fixed effects.</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
anova_fixed.py
--input $input
--design $design
--ID $uniqID
--factors "$factor"
--factorTypes $factorTypes
--out $results_table
--flags $flags_table
--fig $qq_plots
--fig2 $volcano_plots
#if $interactions
    --interactions
#end if
    ]]></command>
    <inputs>
        <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/>
        <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note: you need a 'sampleID' column. If not-tab separated, see TIP below."/>
        <param name="uniqID" type="text" size="30" value="" label="Unique Feature Identifier" help="Name of the column in your wide dataset that contains unique feature identifiers."/>
        <param name="factor" type="text" size="30" value="" label="Group(s)/Treatment(s)" help="Name of the column(s) (comma separated) in your design file that you want to use for ANOVA (ie. treatment1,treatment2,weight)."/>
        <param name="factorTypes" type="text" size="30" value="" label="Type of Group(s)/Treatment(s)" help="Type of data in your treatment columns: 'C' for Categorical and 'N' for numerical. Match the order the groups are used. (ie. C,C,N)."/>
        <param name="interactions" type="boolean" label="Calculate ANOVA with interactions" help="If 'Yes', the ANOVA program will output the interactions."/>
    </inputs>
    <outputs>
        <data format="tabular" name="results_table" label="${tool.name} on ${on_string}: Results Table"/>
        <data format="tabular" name="flags_table" label="${tool.name} on ${on_string}: Flags Table"/>
        <data format="pdf" name="qq_plots" label="${tool.name} on ${on_string}: QQ Plots"/>
        <data format="pdf" name="volcano_plots" label="${tool.name} on ${on_string}: Volcano Plots"/>
    </outputs>
    <tests>
        <test>
            <param name="input" value="ST000006_data.tsv"/>
            <param name="design" value="ST000006_design.tsv"/>
            <param name="uniqID" value="Retention_Index" />
            <param name="factor" value="White_wine_type_and_source" />
            <param name="factorTypes" value="C" />
            <output name="results_table" file="ST000006_anova_fixed_with_group_summary.tsv" />
            <output name="flags_table" file="ST000006_anova_fixed_with_group_flags.tsv" />
            <output name="qq_plots" file="ST000006_anova_fixed_with_group_qq_plots.pdf" compare="sim_size" delta="10000" />
            <output name="volcano_plots" file="ST000006_anova_fixed_with_group_volcano.pdf" compare="sim_size" delta="10000" />
        </test>
    </tests>
    <help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

The tool fits analysis of variance (ANOVA) fixed effects model with multiple grouping variables, their interactions and numerical characteristics.
The analysis is performed row wise, independently for each feature.
The user can choose whether to include interactions between grouping variables in the model or to use a pure additive model.
Numerical characteristics of the samples can be included for both scenarios.

--------------------------------------------------------------------------------

**Input**

 - Two input datasets are required.


@WIDE@

**NOTE:** The sample IDs must match the sample IDs in the Design File
(below). Extra columns will automatically be ignored.

@METADATA@

@UNIQID@

**Group(s)/Treatment(s)**

 - Name of the column(s) (comma separated) in your Design File that you want to use for ANOVA (ie. treatment1,treatment2,weight).

**Type of Group(s)/Treatment(s)**

 - Type of data in your treatment column(s): 'C' for Categorical and 'N' for numerical. Match the order the groups are used. (ie. C,C,N).

**Calculate ANOVA with interactions**

 - If 'Yes', the ANOVA program will output interactions.

--------------------------------------------------------------------------------

**Output**

The user will get four different outputs from the fixed effects ANOVA tool:

(1) a TSV file with the results table containing the fixed effects ANOVA results for each variable, the corresponding contrast and analysis of the means.
(2) a TSV file with the flags for significant p-values. The flags in the TSV file are equal to 1 if the difference between the groups is statistically significant using the specified α level.
(3) a PDF file with QQ (quantile-quantile) plots displaying the expected quantiles of a normal distribution on x-axis versus the observed quantiles on y-axis.
(4) and a Volcano plot with the fold change displayed on the x-axis and the -log 10 of the p-value from the test of the null hypothesis is displayed on the y-axis. The red dashed line corresponds to p=0.01.

    ]]></help>
    <expand macro="citations"/>
</tool>
<tool id="secimtools_bland_altman_plot" name="Bland-Altman (BA) Plot" version="@WRAPPER_VERSION@">
    <description>- Create pairwise BA plots for outlier detection.</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
bland_altman_plot.py
--input $input
--design $design
--ID $uniqID
--figure $ba_plots
--flag_dist $outlier_dist_plots
--flag_sample $flag_sample
--flag_feature $flag_feature
--resid_cutoff $resid_cutoff
--sample_flag_cutoff $sample_cutoff
--feature_flag_cutoff $feature_cutoff
--prop_feature $proportion_of_features
--prop_sample $proportion_of_samples

#if $group
    --group $group

    #if $processOnly:
        --process_only "$processOnly"
    #end if
#end if
    ]]></command>
    <inputs>
        <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab-separated see TIP below."/>
        <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
        <param name="uniqID" size="30" type="text" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that contains unique feature identifiers."/>
        <param name="resid_cutoff" type="integer" size="30" value="3" label="Outlier Cutoff" help="Residual cutoff value, this value will flag samples with residuals ≥ this cutoff value."/>
        <param name="sample_cutoff" type="float" size="30" value="0.2" min="0" max="1" label="Sample Flag Cutoff" help="Flag a sample as 1 if the proportion of features within a sample that are outliers exceeds this cutoff. [Number between 0-1]."/>
        <param name="feature_cutoff" type="float" size="30" value="0.05" min="0" max="1" label="Feature Flag Cutoff" help="Flag a feature as 1 if the proportion of times this feature was identified as an outlier exceeds this cutoff. [Number between 0-1]."/>
        <param name="group" type="text" size="30" value="" optional="true" label="Group/Treatment [Optional]" help="Name of the column in your Design File that contains group classifications."/>
        <param name="processOnly" size="30" type="text" value="" optional="true" label="Group Name [Optional]" help="Name of the group(s) that you want to process. Separate multiple group names with spaces (e.g. RC control treatment). Leave blank to process all groups. Requires the group parameter."/>
    </inputs>
    <outputs>
        <data format="pdf" name="ba_plots" label="${tool.name} on ${on_string}: BA plot" />
        <data format="pdf" name="outlier_dist_plots" label="${tool.name} on ${on_string}: Distribution"/>
        <data format="tabular" name="flag_sample" label="${tool.name} on ${on_string}: Flag Sample"/>
        <data format="tabular" name="flag_feature" label="${tool.name} on ${on_string}: Flag Feature"/>
        <data format="tabular" name="proportion_of_features" label="${tool.name} on ${on_string}: Proportion of Feature"/>
        <data format="tabular" name="proportion_of_samples" label="${tool.name} on ${on_string}: Proportion of Samples"/>
    </outputs>
    <tests>
        <test>
            <param name="input" value="ST000006_data.tsv"/>
            <param name="design" value="ST000006_design.tsv"/>
            <param name="uniqID" value="Retention_Index" />
            <param name="group" value="White_wine_type_and_source" />
            <output name="ba_plots" file="ST000006_bland_altman_plot_with_group_figure.pdf" compare="sim_size" delta="10000" />
            <output name="outlier_dist_plots" file="ST000006_bland_altman_plot_with_group_flag_distribution.pdf" compare="sim_size" delta="10000" />
            <output name="flag_sample" file="ST000006_bland_altman_plot_with_group_flag_sample.tsv" />
            <output name="flag_feature" file="ST000006_bland_altman_plot_with_group_flag_feature.tsv" />
            <!-- Output names below must match the <outputs> declarations above
                 (they previously referenced non-existent "prop_feature"/"prop_sample"). -->
            <output name="proportion_of_features" file="ST000006_bland_altman_plot_with_group_proportion_feature.tsv" />
            <output name="proportion_of_samples" file="ST000006_bland_altman_plot_with_group_proportion_sample.tsv" />
        </test>
    </tests>
    <help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

The Bland-Altman plot (BA-Plot) is used to look at the concordance of data between pairs of samples, particularly between replicates.
The script generates BA-plots for all pairwise combinations of samples.
If the Group/Treatment column and group name(s) in that column are provided then BA-Plots are generated only for pairwise combinations within the specified Group -- group name combination.
In addition to generating the BA-plots, a linear regression fit is calculated between the values that correspond to the pair of samples to identify (flag) any unusual outlying values.
The flags produced by the regression fit are used to generate distribution plots and text files for (i) each sample (column) and for (ii) each feature (row).


--------------------------------------------------------------------------------

**Input**

 - Two input datasets are required.

@WIDE@

**NOTE:** The sample IDs must match the sample IDs in the Design File
(below). Extra columns will automatically be ignored.

@METADATA@

@UNIQID@


**Outlier Cutoff – flagging values**

- Residual cutoff value, this value will flag samples with residuals ≥ than this cutoff value.

 (1) If the magnitude of the residuals from the linear regression on the BA-plot exceeds the user-defined threshold, then a value is flagged as an outlier. This cutoff can be adjusted by the user, the default is 3.

 (2) If a value is identified as a leverage point using Cook's D with a p-value cutoff of 0.5, then the value is flagged. This cannot be adjusted.

 (3) If a value is identified as a leverage point using the DFFITS technique it is also flagged. This cannot be adjusted.

**Sample Flag Cutoff – flagging samples**

 - Flag a sample as 1 if the proportion of features within a sample that are outliers exceeds this cutoff. [Number between 0-1].

**Feature Flag Cutoff – flagging features**

 - Flag a feature as 1 if the proportion of times this feature was identified as an outlier exceeds this cutoff. [Number between 0-1].

@GROUP@

**Group ID**

 - Name of the group(s) that you want to process. Separate multiple groupIDs with spaces. Leave blank to process all groups. Requires the group parameter.

--------------------------------------------------------------------------------

**Output**

This tool outputs four (or five) different files depending on the input settings:
(1) a PDF file containing BA-plots and scatterplots for each pair of samples
(2) a PDF file containing histograms of the most flagged features and samples
(3) two TSV files containing flags: one for samples and one for features
(4) if a grouping variable name is specified in the input, a TSV file containing flags for each group is also generated.
(5) two TSV files containing (i) the proportion of features flagged per sample and (ii) the proportion of samples flagged per feature.
If a sample (or feature) is flagged, the user should consider removing it from further analysis.

    ]]></help>
    <expand macro="citations"/>
</tool>
<tool id="secimtools_blank_feature_filtering_flags" name="Blank Feature Filtering (BFF)" version="@WRAPPER_VERSION@">
    <description> - Calculate LOD and flag features in non-blank samples below threshold.</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <!-- detect_errors added for consistency with the other secimtools wrappers. -->
    <command detect_errors="exit_code"><![CDATA[
blank_feature_filtering_flags.py
--input $input
--design $design
--uniqID $uniqID
--group $group
--blank "$blank"
--bff $bff
--criteria $cv
--outflags $outflags
--outbff $outbff
    ]]></command>
    <inputs>
        <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input tab-separated wide format dataset. If not tab separated see TIP below."/>
        <param name="design" type="data" format="tabular" label="Design File" help="Design file, tab separated. Note: you need a 'sampleID' column. If not tab-separated, see TIP below."/>
        <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that contains unique feature identifiers."/>
        <param name="group" type="text" size="30" value="" optional="true" label="Group/Treatment" help="Name of the column in your design file that contains classification information."/>
        <param name="blank" type="text" size="30" optional="true" label="Blank Name" help="Name given to the blank samples in your Group/Treatment column. Used to calculate baseline in filtering."/>
        <param name="bff" type="integer" size="30" value="5000" optional="true" label="BFF Threshold" help="Default value to use as the limit of detection."/>
        <param name="cv" type="integer" size="30" value="100" optional="true" label="Criterion Value" help="Number of times the signal in the samples should be stronger than the corresponding signal in the blanks."/>
    </inputs>
    <outputs>
        <data format="tabular" name="outflags" label="${tool.name} on ${on_string}: Flags"/>
        <data format="tabular" name="outbff" label="${tool.name} on ${on_string}: Value"/>
    </outputs>
    <tests>
        <test>
            <param name="input" value="TEST0000_data.tsv"/>
            <param name="design" value="TEST0000_design.tsv"/>
            <param name="uniqID" value="rowID" />
            <param name="group" value="group_combined" />
            <param name="blank" value="blank" />
            <param name="bff" value="5000" />
            <param name="cv" value="100" />
            <output name="outflags" file="TEST0000_blank_feature_filtering_flags_outflags.tsv" />
            <output name="outbff" file="TEST0000_blank_feature_filtering_flags_outbff.tsv" />
        </test>
    </tests>
    <help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

**NOTE:** The tool is relevant for Liquid Chromatography Mass Spectroscopy (LC-MS) data analysis and removes “noise” from the data using values of features in the blank samples as a reference.
The tool becomes relevant when the values of features in the experimental samples are not much larger than the values of the same features in the blank samples.
The features with such a signal are treated as noise inherited by the sample due to the instrument and are flagged by the tool.

The computed BFF Threshold for each feature is equal to ((3*Standard Deviation of the blank group) + (the average of the blank group)) and is calculated across blank samples only.

If, for a given feature, the computed BFF Threshold is less than or equal to 0, the user specified BFF Threshold overrides the computed BFF Threshold (default value for user specified BFF Threshold is 5000).

The user specified BFF Threshold becomes relevant when the blank group contains a lot of zero values or is on log-transformed scale.

A feature is flagged as below the detection limit for a given group if the ratio ((group mean – BFF Threshold) / BFF Threshold) is less than the Criterion Value (default 100) for the average within the group.

------------------------------------------------------------------------------------------

**Input**

 - Two input datasets are required.

@WIDE@

**NOTE:** The sample IDs in the wide dataset must match the sample IDs in the Design File
(below). Extra columns will automatically be ignored.

@METADATA@

@UNIQID@

@GROUP@

**Blank Name**

 - Name given to the blank samples in your Group/Treatment column. Used to calculate the computed BFF Threshold for filtering.

**BFF Threshold**

 - User-specified BFF threshold to use as the limit of detection. The default value is 5000.

**Criterion Value**

 - Number of times the signal in the experimental samples should be greater than the corresponding signal in the blanks.

--------------------------------------------------------------------------------

**Output**

This tool outputs two files:

(1) a TSV file with values that were compared to the Criterion Value and
(2) a TSV file containing flags for each feature. Flag values of one (1) correspond to features which failed to satisfy the BFF Threshold Criterion Value and are considered below the detection limit for the given group.


    ]]></help>
    <expand macro="citations"/>
</tool>
<tool id="secimtools_coefficient_variation_flags" name="Coefficient of Variation (CV) Flags" version="@WRAPPER_VERSION@">
    <description>- Calculate the coefficient of variation and flag potential outliers.</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
coefficient_variation_flags.py
--input $input
--design $design
--ID $uniqID
--figure $CVplot
--flag $CVflag
#if $group:
    --group $group
#end if
#if $CVcutoff:
    --CVcutoff $CVcutoff
#end if
    ]]></command>
    <inputs>
        <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/>
        <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note: you need a 'sampleID' column. If not tab separated see TIP below."/>
        <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that contains unique feature identifiers."/>
        <!-- optional="true" added: the command wraps this in "#if $group" and the
             label already marks it as [Optional], matching the sibling tools. -->
        <param name="group" type="text" size="30" optional="true" label="Group/Treatment [Optional]" help="Name of the column in your design file that contains group classifications." />
        <param name="CVcutoff" type="float" optional="true" size="4" value="0.1" label="CV Cutoff [Optional]" help="The cutoff (in decimals) that specifies the proportion of features to flag. The default CV cutoff is 0.1 which implies that 10% of the features with the largest CVs will be flagged."/>
    </inputs>
    <outputs>
        <data format="pdf" name="CVplot" label="${tool.name} on ${on_string}: Plot" />
        <data format="tabular" name="CVflag" label="${tool.name} on ${on_string}: Flag" />
    </outputs>
    <tests>
        <test>
            <param name="input" value="ST000006_data.tsv"/>
            <param name="design" value="ST000006_design.tsv"/>
            <param name="uniqID" value="Retention_Index" />
            <param name="group" value="White_wine_type_and_source" />
            <output name="CVplot" file="ST000006_coefficient_variation_flags_with_group_figure.pdf" compare="sim_size" delta="100000"/>
            <output name="CVflag" file="ST000006_coefficient_variation_flags_with_group_flag.tsv" />
        </test>
    </tests>
    <help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

This tool calculates the coefficient of variation (standard deviation as a percentage of the mean) and is often used to look at the consistency of features across samples.
The user can define what percent of features with the highest CV to flag.
If no percentage is selected, then the top 10% of features with the highest CV are flagged (default value of 0.1).
The CV value corresponding to the percentage is given in the resulting histogram plot.

--------------------------------------------------------------------------------

**Input**

 - Two input datasets are required.

@WIDE@

**NOTE:** The sample IDs must match the sample IDs in the Design File (below).
Extra columns will automatically be ignored.

@METADATA@

@UNIQID@

**CV cutoff [Optional]**

 - The cutoff (in decimals) that specifies the proportion of the features to flag. The default CV cutoff is 0.1 which implies that 10% of the features with the largest CV will be flagged. If the Group/Treatment variable is provided, then the analysis is performed independently for each group. If no Group/Treatment variable is provided, the analysis is performed on the entire dataset.

@GROUP_OPTIONAL@


--------------------------------------------------------------------------------

**Output**

This tool outputs two different files:

(1) a TSV file containing the CV Flags for each feature for each group (if group variable is specified). A flag value of one (1) corresponds to features with large CV values as specified by the CV cutoff.
(2) a PDF file containing histograms with overlayed density plots of the coefficients of variation for each group (optional) and a summary density plot containing the densities for each group without the histograms.
    ]]></help>
    <expand macro="citations"/>
</tool>
<tool id="secimtools_compare_flags" name="Compare Flags" version="@WRAPPER_VERSION@">
    <description>within a flag file.</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <!-- detect_errors added for consistency with the other secimtools wrappers. -->
    <command detect_errors="exit_code"><![CDATA[
compare_flags.py
--input $input
--output $output
--flag1 $flag1
--flag2 $flag2
    ]]></command>
    <inputs>
        <param name="input" type="data" format="tabular" label="Flag File"
               help="Input tab-separated Flag File. If file is not tab separated see TIP below."/>
        <param name="flag1" type="text" size="30" optional="false" label="Column Name for Flag 1" help="Name of the column containing the first flag to compare"/>
        <param name="flag2" type="text" size="30" optional="false" label="Column Name for Flag 2" help="Name of the column containing the second flag to compare"/>
    </inputs>
    <outputs>
        <data format="tabular" name="output" label="${tool.name} on ${on_string}: Flag Feature"/>
    </outputs>
    <tests>
        <test>
            <param name="input" value="ST000006_run_order_regression_flags.tsv"/>
            <param name="flag1" value="flag_feature_runOrder_pval_05" />
            <param name="flag2" value="flag_feature_runOrder_pval_01" />
            <output name="output" file="ST000006_compare_flags_output.tsv" />
        </test>
    </tests>
    <help><![CDATA[


**Tool Description**

This tool compares 2 columns containing binary indicators (flags) in a flag file and generates a 'cross tabulation' results file.
Flags from multiple flag files can be combined by first running the 'Merge_Flags' tool.

--------------------------------------------------------------------------------

**Input**

The tool is intended to work with Flag Files but will work with Wide Format Datasets as well. Column names should be used instead of flag names for Wide Format Datasets.


**Flag File:**

A wide format dataset that contains flags for each sample or feature:

    +----------+---------+---------+---------+-----+
    | Feature  | flag_A  | flag_B  | flag_C  | ... |
    +==========+=========+=========+=========+=====+
    | one      | 0       | 0       | 0       | ... |
    +----------+---------+---------+---------+-----+
    | two      | 0       | 1       | 1       | ... |
    +----------+---------+---------+---------+-----+
    | three    | 0       | 1       | 0       | ... |
    +----------+---------+---------+---------+-----+
    | four     | 1       | 0       | 0       | ... |
    +----------+---------+---------+---------+-----+
    | ...      | ...     | ...     | ...     | ... |
    +----------+---------+---------+---------+-----+

**NOTE:** The 'Feature' column defines the rows within a flag file.

 @WIDE@



--------------------------------------------------------------------------------

**Output**

The tool outputs a single TSV file containing the frequencies of the compared flags in the appropriate cells.
An example is shown below:

    +-----------------+----------------+-----------------+
    |                 |flag_A_[value1] | flag_A_[value2] |
    +=================+================+=================+
    | flag_B_[value1] | 12             | 22              |
    +-----------------+----------------+-----------------+
    | flag_B_[value2] | 0              | 100             |
    +-----------------+----------------+-----------------+


    ]]></help>
    <expand macro="citations"/>
</tool>
<tool id="secimtools_compound_identification" name="Compound Identification" version="@WRAPPER_VERSION@">
    <description>based on m/z ratio and retention time.</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
compound_identification.py
--anno $anno
--uniqID $uniqID
--mzID $mzID
--rtID $rtID
--library $library
--libuniqID $libuniqID
--libmzID $libmzID
--librtID $librtID
--output $output
    ]]></command>
    <inputs>
        <param name="anno" type="data" format="tabular" label="Target Annotation File" help="Input tab-separated dataset in wide format. If not tab separated see TIP below."/>
        <param name="uniqID" type="text" size="30" value="" optional="false" label="Target Annotation Unique Feature ID column" help="Name of the column in your target annotation file containing unique identifiers."/>
        <param name="mzID" type="text" size="30" value="" optional="false" label="Target Annotation Mass/Charge column" help="Name of the column in your target annotation file containing m/z ratios."/>
        <param name="rtID" type="text" size="30" value="" optional="false" label="Target Annotation Retention Time" help="Name of the column in your target annotation file containing retention times."/>
        <param name="library" type="data" format="tabular" label="Library File" help="Library dataset. If not tab separated see TIP below."/>
        <param name="libuniqID" type="text" size="30" value="" optional="false" label="Library compound name column" help="Name of the column in your library file containing the compound/adduct names to use for identification."/>
        <param name="libmzID" type="text" size="30" value="" optional="false" label="Library Mass/Charge column" help="Name of the column in your library file containing m/z ratios."/>
        <param name="librtID" type="text" size="30" value="" optional="false" label="Library Retention Time column" help="Name of the column in your library file containing the retention times."/>
    </inputs>
    <outputs>
        <data format="tabular" name="output" label="identified_compounds_adducts_on_${anno.name}"/>
    </outputs>
    <tests>
        <test>
            <param name="anno" value="TEST0000_mzrt_first.tsv"/>
            <param name="uniqID" value="rowID_first"/>
            <param name="mzID" value="MZ_first" />
            <param name="rtID" value="RT_first" />
            <param name="library" value="TEST0000_database.tsv"/>
            <param name="libuniqID" value="name_database"/>
            <param name="libmzID" value="MZ_database" />
            <param name="librtID" value="RT_database" />
            <output name="output" file="TEST0000_compound_identification_output.tsv" />
        </test>
    </tests>
    <help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

This tool is primarily intended for identification of compounds in a target file given a mass spectroscopy library file.

Each metabolite (feature) is characterized by a mass to charge (m/z) ratio and retention time (RT).

This tool matches two files: (1) a mass spectroscopy library file and (2) a target annotation file.

The library file (in tsv format) contains a list of compounds and their associated m/z ratios and RTs.

The target annotation file (in tsv format) contains the m/z ratios and RTs for the experimental samples.

--------------------------------------------------------------------------------

**Input**

 - Two input datasets are required, a library file and an annotation target file.

@MZRTFILE@

**Target Annotation Unique Feature ID column**

 - Name of the column in your target annotation file containing unique ID.


**Annotation Mass/Charge column**

 - Name of the column in your target annotation file containing the m/z ratio.

**Annotation Retention Time**

 - Name of the column in your target annotation file containing Retention Times.

**Library File**

**Library compound name column**

 - Name of the column in your library file containing the compound names to use.

**Library Mass/Charge column**

 - Name of the column in your library file containing the m/z ratios.

**Library Retention Time column**

 - Name of the column in your library file containing Retention Times.


--------------------------------------------------------------------------------

**Output**

A TSV file containing the original target annotation input file plus an additional column containing the name of any compounds that were matched using the m/z ratio and RT.

]]></help>
    <expand macro="citations"/>
</tool>
<tool id="secimtools_data_normalization_and_rescaling" name="Normalization and Re-Scaling" version="@WRAPPER_VERSION@">
    <description>of data.</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <!-- detect_errors added for consistency with the other secimtools wrappers. -->
    <command detect_errors="exit_code"><![CDATA[
data_normalization_and_rescaling.py
--input $input
--design $design
--uniqID $uniqID
--method $method
--out $out
    ]]></command>
    <inputs>
        <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input tab-separated wide format dataset. If not tab separated see TIP below."/>
        <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
        <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your Wide Dataset that has unique identifiers."/>
        <!-- Only one option may be the default; previously every option carried
             selected="true", which is ambiguous for a radio-display select. -->
        <param name="method" size="30" type="select" display="radio" label="Normalization Method" help="Method to be used for normalization and re-scaling of the data.">
            <option value="mean" selected="true">Mean (samples)</option>
            <option value="sum">Sum (samples)</option>
            <option value="median">Median (samples)</option>
            <option value="centering">Centering (features)</option>
            <option value="auto">Autoscaling (features)</option>
            <option value="pareto">Pareto (features)</option>
            <option value="range">Range (features)</option>
            <option value="level">Level (features)</option>
            <option value="vast">VAST (features)</option>
        </param>
    </inputs>
    <outputs>
        <data format="tabular" name="out" label="${tool.name} on ${on_string}: Normalized data"/>
    </outputs>
    <tests>
        <test>
            <param name="input" value="ST000006_data.tsv"/>
            <param name="design" value="ST000006_design.tsv"/>
            <param name="uniqID" value="Retention_Index" />
            <param name="method" value="mean" />
            <output name="out" file="ST000006_data_normalization_and_rescaling_mean_output.tsv" />
        </test>
        <test>
            <param name="input" value="ST000006_data.tsv"/>
            <param name="design" value="ST000006_design.tsv"/>
            <param name="uniqID" value="Retention_Index" />
            <param name="method" value="vast" />
            <output name="out" file="ST000006_data_normalization_and_rescaling_vast_output.tsv" />
        </test>
    </tests>
    <help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

The first three normalization methods (Mean, Sum and Median) perform re-scaling of the data by sample.
Each individual sample (column) in the wide dataset is re-scaled by dividing all feature values within that column by the mean, median or sum of those feature values.
Each sample (column) is re-scaled independently from other samples (columns).

The last six normalization methods (Centering, Pareto, Autoscaling, Range, Level, and Variable Stability (VAST)) perform scaling of the data by features.
Each feature (row) is re-scaled independently from other features.
Each individual feature (row) in the wide dataset is centered by subtraction of the mean of that feature and is re-scaled by dividing all the feature values within that row by the scaling factor.
The scaling factor is computed from the feature values in the current row and depends on the selected method.
Centering does not have a scaling factor and does not perform division, Autoscaling uses standard deviation, Pareto scaling uses the square root of the standard deviation, Range uses the difference between the max and min values, and Level uses the mean.
VAST scaling is performed in two steps. The first step is Autoscaling, followed by division of the resulting feature values in each row by the coefficient of variation for that feature.

More information on the scaling methods is available from the literature:

Keun, Hector C., Timothy MD Ebbels, Henrik Antti, Mary E. Bollard, Olaf Beckonert, Elaine Holmes, John C. Lindon, and Jeremy K. Nicholson. "Improved analysis of multivariate data by variable stability scaling: application to NMR-based metabolic profiling." Analytica chimica acta 490, no. 1 (2003): 265-276.
van den Berg, Robert A., Huub CJ Hoefsloot, Johan A. Westerhuis, Age K. Smilde, and Mariët J. van der Werf. "Centering, scaling, and transformations: improving the biological information content of metabolomics data." BMC genomics 7, no. 1 (2006): 142


------------------------------------------------------------------------------------------

**Input Files**

 - Two input datasets are required.

@WIDE@

**NOTE:** The sample IDs must match the sample IDs in the Design File
(below). Extra columns will automatically be ignored.

@METADATA@

@UNIQID@

**Normalization Method**

 - Method to be used for normalization and re-scaling of the data. The parenthesis indicates whether the method will be applied to samples or features.

--------------------------------------------------------------------------------

**Output**

TSV file containing the same column names as in the original Wide Dataset where the values in each cell correspond to the values after normalization/re-scaling.

    ]]></help>
    <expand macro="citations"/>
</tool>
<tool id="secimtools_distribution_features" name="Generate Distribution of Features across Samples." version="@WRAPPER_VERSION@">
    <description></description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
distribution_features.py
--input $input
--design $design
--ID $uniqID
--figure $figure
#if $group
    --group $group
#end if
    ]]></command>
    <inputs>
        <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If not tab separated see TIP below."/>
        <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
        <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your Wide Dataset that has unique identifiers."/>
        <param name="group" type="text" size="30" value="" optional="true" label="Group/Treatment [Optional]" help="Name of the column in your Design File that contains group classifications."/>
    </inputs>
    <outputs>
        <data format="pdf" name="figure" label="${tool.name} on ${on_string}: Feature Distribution"/>
    </outputs>
    <tests>
        <test>
            <param name="input" value="ST000006_data.tsv"/>
            <param name="design" value="ST000006_design.tsv"/>
            <param name="uniqID" value="Retention_Index" />
            <param name="group" value="White_wine_type_and_source" />
            <output name="figure" file="ST000006_distribution_features_with_group_figure.pdf" compare="sim_size" delta="10000"/>
        </test>
    </tests>
    <help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

The tool summarizes the distribution of 50 randomly selected features (rows) across all samples.

Boxplots with outliers and mean value are provided for each selected feature across all samples.
If group or treatment information is provided, boxplots are generated for samples within each group and for all samples.
If a group or treatment variable is not provided, boxplots are provided for all samples.

**NOTE:** This script works best with log transformed data.


--------------------------------------------------------------------------------

**Input**

 - Two input datasets are required.

@WIDE@

**NOTE:** The sample IDs must match the sample IDs in the Design File
(below). Extra columns will automatically be ignored.


@METADATA@

@UNIQID@

@GROUP_OPTIONAL@

--------------------------------------------------------------------------------

**Output**

A PDF file with boxplot(s) and density plot(s):

If Group/Treatment [Optional] is provided, plots will be generated for every group as well as for all samples. Otherwise, a single plot will be generated for all samples.

    ]]></help>
    <expand macro="citations"/>
</tool>
<tool id="secimtools_distribution_samples" name="Generate Distribution of Features within Samples." version="@WRAPPER_VERSION@">
    <description></description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
distribution_samples.py
--input $input
--design $design
--ID $uniqID
--figure $figure
#if $group
    --group $group
#end if
#if $order
    --order $order
#end if
    ]]></command>
    <inputs>
        <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If not tab separated see TIP below."/>
        <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
        <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your Wide Dataset that has unique identifiers."/>
        <param name="group" type="text" size="30" value="" optional="true" label="Group/Treatment [Optional]" help="Name of the column in your design file that contains group classifications."/>
        <param name="order" type="text" size="30" value="" optional="true" label="Run Order [Optional]" help="The column name in your design file that contains the order samples were run."/>
    </inputs>
    <outputs>
        <data format="pdf" name="figure" label="${tool.name} on ${on_string}: Plot"/>
    </outputs>
    <tests>
        <test>
            <param name="input" value="ST000006_data.tsv"/>
            <param name="design" value="ST000006_design.tsv"/>
            <param name="uniqID" value="Retention_Index" />
            <param name="group" value="White_wine_type_and_source" />
            <output name="figure" file="ST000006_distribution_samples_with_group_figure.pdf" compare="sim_size" delta="10000"/>
        </test>
    </tests>
    <help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

The tool plots the distribution of features within each sample.
All samples are colored by group and are plotted on the same graph for comparison purposes.
The distributions of the features within each sample are presented as estimated densities and box-and-whiskers plots with potential outliers.
If the run order of the samples is specified in the input, box plots will be displayed according to the run order.

**NOTE:** This script works best with log transformed data.


--------------------------------------------------------------------------------

**Input**

- Two input datasets are required.

@WIDE@

**NOTE:** The sample IDs must match the sample IDs in the Design File
(below). Extra columns will automatically be ignored.

@METADATA@

@UNIQID@

@GROUP_OPTIONAL@

@RUNORDER_OPTIONAL@

--------------------------------------------------------------------------------

**Output**

PDF file containing two plots:

(1) Density plots illustrating the distribution of features within a given sample
(2) Boxplots of the distribution of features within each sample

]]></help>
    <expand macro="citations"/>
</tool>
<tool id="secimtools_generate_rank_wide" name="Ranked wide file" version="@WRAPPER_VERSION@">
    <description>Generate a wide format file with ranked columns from an input wide file.</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
addGroupRank.py
--wide $wide
--design $design
--out $out
#if $ngroup:
    --ngroup $ngroup
#end if
--rank $rank
--uniqID $uniqID
    ]]></command>
    <inputs>
        <param name="wide" type="data" format="tabular" label="Wide Dataset" help="Input dataset in wide format and tab separated. If file is not tab separated see TIP below."/>
        <param name="design" type="data" format="tabular" label="Design File" help="Design file tab separated. Note you need a 'sampleID' column. If not tab separated see TIP below."/>
        <param name="uniqID" type="text" size="30" value="rowID" label="Unique Feature ID" help="Name of the column in your Wide Dataset that has unique Feature IDs."/>
        <param name="ngroup" type="integer" size="30" value="" optional="true" label="Number of groups for each feature being ranked" help="Number of bins/groups for each feature being ranked. If there is no input, the default behavior is ranking each column."/>
        <param name="rank" type="text" size="30" value="Rank" label="Name of flag column" help="The column name in your design file that contains the flags to specify whether a sampleID will be ranked in the input wide file."/>
    </inputs>
    <outputs>
        <data format="tabular" name="out" label="${tool.name} on ${on_string}: Results Table"/>
    </outputs>

    <help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

The tool is used to generate a wide format file with each column being ranked in the input wide format dataset.

--------------------------------------------------------------------------------

**Input**

 - Two input datasets are required.


--------------------------------------------------------------------------------

**Output**

The user will get an output file from the tool:

(1) a wide format file with each column being ranked. The columns will be selected using the flag column in the design file.


    ]]></help>
    <expand macro="citations"/>
</tool>
<tool id="secimtools_hierarchical_clustering_heatmap" name="Hierarchical Clustering Heatmap" version="@WRAPPER_VERSION@">
    <description>- Calculate means per group and plot a heatmap.</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
hierarchical_clustering_heatmap.py
--input $input
--design $design
--uniqID $uniqID
#if $dendogram
    --dendogram
#end if
--labels $labels
--fig $fig
    ]]></command>
    <inputs>
        <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If not tab separated see TIP below."/>
        <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
        <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/>
        <!-- Galaxy boolean params use 'checked', not 'value'. The param name (and CLI flag)
             keep the upstream script's spelling "dendogram"; only user-facing text is corrected. -->
        <param name="dendogram" type="boolean" checked="false" label="Add dendrogram on heatmap" help="Select 'Yes' to print a dendrogram over the heatmap."/>
        <param name="labels" type="select" label="Select to remove labels from plots" multiple="true" display="checkboxes">
            <option value="x">X-axis labels</option>
            <option value="y">Y-axis labels</option>
        </param>
    </inputs>
    <outputs>
        <data format="pdf" name="fig" label="${tool.name} on ${on_string}"/>
    </outputs>
    <tests>
        <test>
            <param name="input" value="ST000006_data.tsv"/>
            <param name="design" value="ST000006_design.tsv"/>
            <param name="uniqID" value="Retention_Index" />
            <param name="dendogram" value="True" />
            <param name="labels" value="x,y" />
            <output name="fig" file="ST000006_hierarchical_clustering_heatmap_figure.pdf" compare="sim_size" delta="10000" />
        </test>
    </tests>
<help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

This tool generates a hierarchical cluster heatmap from a wide format dataset.
An option to add a hierarchical clustering dendrogram on the top of the heatmap figure is included along with an option to remove plot labels.

**NOTE:** This script works best with log transformed data that contains no missing data.

--------------------------------------------------------------------------------

**Input**

- Two input datasets are required.

@WIDE@

**NOTE:** The sample IDs must match the sample IDs in the Design File
(below). Extra columns will automatically be ignored.


@METADATA@

**In addition to your datasets, you need to provide:**

**Unique Feature ID**

 - The column name in your wide dataset that contains the unique IDs for
   your features. In our example dataset you would input *Compound*.

--------------------------------------------------------------------------------

**Output**

A PDF file with a hierarchical cluster heatmap of the data

]]></help>
    <expand macro="citations"/>
</tool>
<tool id="secimtools_imputation" name="Imputation (Mean, Median, K-Nearest Neighbours, Stochastic)" version="@WRAPPER_VERSION@">
    <description>of missing values using selected algorithm.</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <stdio>
        <exit_code range="1:" level="warning" description="UserWarning"/>
        <exit_code range="1:" level="warning" description="VisibleDeprecationWarning"/>
    </stdio>
    <command detect_errors="exit_code"><![CDATA[
imputation.py
--input $input
--design $design
--ID $uniqID
--group $group

--output $imputed

--knn $k
--strategy $imputation
--row_cutoff $rowCutoff
--col_cutoff $colCutoff
--distribution $distribution

#if $noZero
    --no_zero
#end if

#if $noNeg
    --no_negative
#end if

#if $exclude
    --exclude $exclude
#end if
    ]]></command>
    <inputs>
        <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/>
        <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
        <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/>
        <param name="group" type="text" size="30" label="Group/Treatment" help="Name of the column in your design file that contains group classifications."/>
        <!-- Only one option may be the default; the original marked every option selected="true". -->
        <param name="imputation" size="30" type="select" label="Imputation Strategy" help="Choose an imputation strategy.">
            <option value="knn" selected="true">K-Nearest Neighbors</option>
            <option value="bayesian">Stochastic</option>
            <option value="mean">Mean</option>
            <option value="median">Median</option>
        </param>
        <param name="noZero" type="boolean" label="Count Zeroes as missing" help="Zeroes can be counted as missing or left as data."/>
        <param name="noNeg" type="boolean" label="Count Negative as missing" help="Negatives can be counted as missing or left as data."/>
        <param name="exclude" type="text" size="30" value="" label="Additional values to treat as missing [Optional]" help="Separate additional values to treat as missing data with commas."/>
        <param name="rowCutoff" type="text" size="30" value=".5" label="Row Percent Cutoff Value" help="Proportion of missing values allowed per group per row. If the proportion of missing values for each feature is greater than the specifed value, then the sample mean is imputed instead of values from the K-Nearest Neighbors algorithm. Default: 0.5 (50%)."/>
        <param name="k" type="text" size="30" value="5" label="K value" help="Only for K-Nearest Neighbors Imputation, ignore for other imputation methods. K value is the number of neighbors to search. Default: 5. If fewer than 5 neighbours are available, all are used."/>
        <param name="colCutoff" type="text" size="30" value=".8" label="Column Percent Cutoff Value" help="Only for K-Nearest Neighbors Imputation, ignore for other imputation methods. If the proportion of missing values is greater than the specified value, the imputation stops and the tool returns an error. Default: 0.8 (80%)."/>
        <param name="distribution" size="30" type="select" label="Bayesian Distribution" help="Only for Stochastic Imputation, ignore for other imputation methods. Choose between normal and Poisson distributions.">
            <option value="Poisson" selected="true">Poisson</option>
            <option value="Normal">Normal</option>
        </param>
    </inputs>
    <outputs>
        <data format="tabular" name="imputed" label="${tool.name} on ${on_string}"/>
    </outputs>
    <tests>
        <test>
            <param name="input" value="ST000006_data.tsv"/>
            <param name="design" value="ST000006_design.tsv"/>
            <param name="uniqID" value="Retention_Index" />
            <param name="group" value="White_wine_type_and_source" />
            <param name="imputation" value="knn" />
            <output name="imputed" file="ST000006_imputation_output_KNN.tsv" />
        </test>
    </tests>
    <help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

The tool performs an imputation procedure for missing data based on three conceptually different methods:

(1) naive imputation (mean, median),
(2) K-nearest neighbor imputation (KNN) and
(3) stochastic imputation (based on normal and Poisson distributions)

Imputations are performed separately for each sample group since treatment groups are expected to be different.
If only a single sample (column) is available for a given group, nothing is imputed and the sample is kept intact.
An option to select which values should be treated as missing is included.
The default value for missing data is an empty cell in the dataset with the option to treat zeroes, negative values and user-defined value(s) as missing and subsequently impute missing values.

(1) Naive imputation:

Computes the mean (or median) of the features within the samples for a given group and uses that value to impute values for that feature among the missing samples.

Feature values for all missing samples in the group get the same value equal to the mean (median) of the available samples, provided the allowed missing threshold is met.

(2) K-Nearest Neighbors (KNN) imputation:

Based on the procedure where nearest neighbor samples (K value default = 5) for the given sample within each group are considered.
The neighboring samples are used to generate the missing value for the current samples.
If less than the specified K value number of neighbors are available for the current sample in the current group, then the maximum available number of neighbors is used.
If the proportion of missing values for each row (feature) is greater than the specified Row Percent Cutoff value (default 0.5), then the column (sample) mean is imputed instead of values from the KNN algorithm.
The proportion of missing values for each column (sample) can be specified (Column Percent Cutoff default = 0.8) and determines whether a sample should be imputed or not.
If the proportion of missing values for each sample is greater than the specified value, then the missing values are not imputed and the imputation process is interrupted.
The algorithm is deterministic and always imputes the same missing values for the same settings.
More details on the algorithm are available via the reference and link below:

Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing value estimation methods for DNA microarrays BIOINFORMATICS Vol. 17 no. 6. 2001 Pages 520-525.

https://bioconductor.org/packages/release/bioc/html/impute.html

(3) Stochastic imputation:

Based on the assumption that each feature within a given group follows some underlying distribution.
As a result, all missing values are generated from the underlying distribution.
The parameter(s) of the underlying distribution is (are) estimated from the observed features.

Two distribution options are available:

Normal (recommended for logged and negative data) and Poisson (recommended for nonnegative counts).
The normal distribution parameters are estimated by the mean and standard deviation of the observed samples for a given feature.
If all observed values for a feature are the same, then the standard deviation is assumed to be 1/3 the absolute value of the mean.
The Poisson distribution parameter is estimated by the mean of the observed values for a given feature and is expected to be positive for the imputation procedure to work correctly.

--------------------------------------------------------------------------------

**Note**

- This tool currently treats all variables as continuous numeric
  variables. Running the tool on categorical variables might result in
  incorrect results.
- Rows containing non-numeric (or missing) data in any
  of the chosen columns will be skipped from the analysis.

--------------------------------------------------------------------------------

**Input**

 - Two input datasets are required.

@WIDE@

**NOTE:** The sample IDs must match the sample IDs in the Design File (below).
Extra columns will automatically be ignored.

@METADATA@

@UNIQID@

@GROUP@

**Imputation Strategy.**

 - Select an imputation strategy.

**Count Zeroes as missing.**

 - Zeroes can be treated as missing or left as data.

**Count Negative as missing.**

 - Negatives can be treated as missing or left as data.

**Additional values to treat missing [Optional].**

 - Additional values to treat as missing data, separate with commas.

**Row Percent Cutoff Value.**

 - Proportion of missing values allowed per group per row. If the proportion of missing samples in the row is greater than the cutoff value specified, nothing will be imputed for that row. Default: 0.5 (50%).

**K value.**

 - If you are not using the KNN Imputation, then ignore. K value is the number of neighbors to search. Default: 5. If fewer than 5 neighbours are available, all are used.

**Column Percent Cutoff Value.**

 - If you are not using the KNN Imputation, then ignore. The maximum proportion of missing data allowed in any data column (sample). Default: 0.8 (80%). The imputation will fail if the proportion in the data exceeds this cutoff!

**Bayesian Distribution.**

 - Choose between Normal and Poisson distributions for stochastic imputation.

--------------------------------------------------------------------------------

**Output**

TSV file containing the same column names as the original Wide Dataset where the values in each cell correspond to either the original values or to values obtained during the imputation procedure.


    ]]></help>
    <expand macro="citations"/>
</tool>
<tool id="secimtools_kruskal_wallis" name="Kruskal-Wallis Non-Parametric Test" version="@WRAPPER_VERSION@">
    <description>on features (rows).</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
kruskal_wallis.py
--input $input
--design $design
--uniqueID $uniqueID
--group $group
--summaries $summaries
--flags $flags
--volcano $volcano
    ]]></command>
    <inputs>
        <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/>
        <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
        <param name="uniqueID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/>
        <param name="group" type="text" size="30" label="Group/Treatment" help="Name of the column in your design file that contains group classifications."/>
    </inputs>
    <outputs>
        <data format="tabular" name="summaries" label="${tool.name} on ${on_string}: Summaries that include p-values and mean differences."/>
        <data format="tabular" name="flags" label="${tool.name} on ${on_string}: Flags that include 0.01, 0.05 and 0.10 significance levels for the pairwise differences."/>
        <data format="pdf" name="volcano" label="${tool.name} on ${on_string}: Volcano plots for the pairwise differences."/>
    </outputs>
    <tests>
        <test>
            <param name="input" value="ST000006_data.tsv"/>
            <param name="design" value="ST000006_design.tsv"/>
            <param name="uniqueID" value="Retention_Index" />
            <param name="group" value="White_wine_type_and_source" />
            <output name="summaries" file="ST000006_kruskal_wallis_with_group_summary.tsv" />
            <output name="flags" file="ST000006_kruskal_wallis_with_group_flags.tsv" />
            <output name="volcano" file="ST000006_kruskal_wallis_with_group_volcano.pdf" compare="sim_size" delta="10000" />
        </test>
    </tests>
    <help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

The tool performs Kruskal-Wallis non-parametric test, an analog of the one-way ANOVA F-test that does not rely on the normality assumption of the distribution.
Unlike t-tests or an ANOVA F-test, a Kruskal-Wallis test is based on ranks where ranks are compared between groups.
The test is performed (1) for samples from all groups together and (2) for the samples belonging to each group.
The user is referred to the literature for more details on the Kruskal-Wallis test and the computation/approximation of corresponding p-values.

Kruskal, William H., and W. Allen Wallis. "Use of ranks in one-criterion variance analysis." Journal of the American statistical Association 47, no. 260 (1952): 583-621.

Meyer, J. Patrick, and Michael A. Seaman. "A comparison of the exact Kruskal-Wallis distribution to asymptotic approximations for all sample sizes up to 105." The Journal of Experimental Education 81, no. 2 (2013): 139-156.

--------------------------------------------------------------------------------

**Input**

 - Two input datasets are required.


@WIDE@

**NOTE:** The sample IDs must match the sample IDs in the Design File
(below). Extra columns will automatically be ignored.

@METADATA@

@UNIQID@

**Group/Treatment**

 - Name of the column in the Design File that contains group classifications.

--------------------------------------------------------------------------------

**Output**

Three different outputs are generated:

(1) a TSV file with the results, including p-values for each test and the corresponding differences between the means for comparisons between the groups.
(2) a TSV file containing indicator flags. A flag = 1 if the difference between the groups is statistically significant.
(3) a PDF file with volcano plots for visual inspection of the differences between the treatment groups. The red dashed line in the volcano plot(s) corresponds to a 0.01 cutoff for p-values (2 on the negative log base 10 scale).

    ]]></help>
    <expand macro="citations"/>
</tool>
<tool id="secimtools_lasso_enet_var_select" name="LASSO/Elastic Net Variable Selection" version="@WRAPPER_VERSION@">
    <description>for feature selection.</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <stdio>
        <exit_code range="1:" level="warning" description="RuntimeWarning"/>
    </stdio>
    <command><![CDATA[
lasso_enet_var_select.py
--input $input
--design $design
--ID $uniqID
--group $group
--alpha $alpha
--coefficients $coefficients
--flags $flags
--plots $plots
    ]]></command>
    <inputs>
        <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/>
        <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
        <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/>
        <param name="group" type="text" size="30" label="Group/Treatment" help="Name of the column in your design file that contains group classifications."/>
        <param name="alpha" type="text" value=".5" size="30" label="shrinkage parameter α" help="Shrinkage parameter α specifies the penalty for the LASSO/Elastic Net procedure. Default 0.5"/>
    </inputs>
    <outputs>
        <data format="tabular" name="coefficients" label="${tool.name} on ${on_string}: Coefficients"/>
        <data format="tabular" name="flags" label="${tool.name} on ${on_string}: Flags"/>
        <data format="pdf" name="plots" label="${tool.name} on ${on_string}: Plots"/>
    </outputs>
    <tests>
        <test>
            <param name="input" value="ST000006_data.tsv"/>
            <param name="design" value="ST000006_design.tsv"/>
            <param name="uniqID" value="Retention_Index" />
            <param name="group" value="White_wine_type_and_source" />
            <param name="alpha" value="0.5" />
            <output name="coefficients" file="ST000006_lasso_enet_var_select_coefficients.tsv" />
            <output name="flags" file="ST000006_lasso_enet_var_select_flags.tsv" />
            <output name="plots" file="ST000006_lasso_enet_var_select_plots.pdf" compare="sim_size" delta="10000" />
        </test>
    </tests>
    <help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

The tool selects (identifies) features that are different between pairs of treatment groups.
The selection is performed based on the logistic regression with Elastic Net shrinkage (with LASSO being a special case).
The selection method is defined by shrinkage parameter α.
Variable selection can be performed for any value of α in the range [0:1] where α = 1 corresponds to the fewest number of variables and the most strict selection criterion (LASSO) and α = 0 corresponds to shrinkage without variable selection (Ridge regression). The default value is α = 0.5.
The best subset of variables for a given α is selected by a cross validation procedure.
Lambda is a penalty parameter determined during the cross validation procedure.

More details about the Elastic Net and LASSO methods can be found in the reference below:

Zou, H., and Hastie, T. (2005). Regularization and variable selection via the elastic net. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 67(2), 301-320.

Tibshirani, Robert. "Regression shrinkage and selection via the lasso." Journal of the Royal Statistical Society. Series B (Methodological) (1996): 267-288.

Friedman, Jerome, Trevor Hastie, and Rob Tibshirani. "Regularization paths for generalized linear models via coordinate descent." Journal of statistical software 33, no. 1 (2010): 1.

--------------------------------------------------------------------------------

**Note**

- This tool currently treats all variables as continuous numeric
  variables. Running the tool on categorical variables might result in
  incorrect results.
- Rows containing non-numeric (or missing) data will be excluded.

--------------------------------------------------------------------------------

**Input**

 - Two input datasets are required.

@WIDE@

**NOTE:** The sample IDs must match the sample IDs in the Design File (below).
Extra columns will automatically be ignored.

@METADATA@

@UNIQID@

@GROUP@

**shrinkage parameter α**

 - Specifies the penalty for the LASSO/Elastic Net procedure. Default = 0.5

--------------------------------------------------------------------------------

**Output**

This tool outputs three files:

(1) A TSV file containing the values of the coefficients (including zeroes) for each feature generated by the tool for each pair of comparisons (in columns). These coefficients are produced from the transformed data (as part of the LASSO/EN method) and should be interpreted with caution.

(2) A TSV file containing the corresponding flags for each feature where the value “1” corresponds to features selected by the method.

(3) A PDF file containing graphs for each pairwise comparison between the groups. The first graph displays the behavior of the coefficients based on the value of penalty parameter lambda and the shrinkage parameter α. The second graph provides details of cross-validation procedure used for detection of the optimal penalty and for feature selection.

    ]]></help>
    <expand macro="citations"/>
</tool>
<!-- file: linear_discriminant_analysis.xml -->
<tool id="secimtools_linear_discriminant_analysis" name="Linear Discriminant Analysis (LDA)" version="@WRAPPER_VERSION@">
    <description>- Classify samples by group.</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
linear_discriminant_analysis.py
--input $input
--design $design
--ID $uniqID
--group $group
--cross_validation $cross_validation
--outClassification $outClassification
--outClassificationAccuracy $outClassificationAccuracy
--nComponents $nComponents
--out $out
--figure $figure
    ]]></command>
    <inputs>
        <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/>
        <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
        <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/>
        <param name="group" type="text" size="30" value="" label="Group/Treatment" help="Name of the column in your design file that contains group classifications."/>
        <param name="cross_validation" type="select" size="30" display="radio" value="double" label="Cross-Validation Choice - NOTE: a minimum of 100 samples is required for single or nested cross validation">
            <option value="none">None</option>
            <option value="single">Single</option>
            <option value="double">Double</option>
        </param>
        <!-- Only consumed when cross_validation == "none"; must be smaller than the number of groups. -->
        <param name="nComponents" type="integer" size="30" value="2" label="Number of Components" help="Enter the number of components to use in the analysis. This value should be less than the number of groups and is used only when the cross-validation options field is set to 'none'."/>
    </inputs>
    <outputs>
        <data format="tabular" name="out" label="${tool.name} on ${on_string}: Components"/>
        <data format="tabular" name="outClassification" label="${tool.name} on ${on_string}: Classification of Samples"/>
        <data format="tabular" name="outClassificationAccuracy" label="${tool.name} on ${on_string}: Classification Accuracy of Samples"/>
        <data format="pdf" name="figure" label="${tool.name} on ${on_string}: Scatter Plots"/>
    </outputs>
    <tests>
        <test>
            <param name="input" value="ST000006_data.tsv"/>
            <param name="design" value="ST000006_design.tsv"/>
            <param name="uniqID" value="Retention_Index" />
            <param name="group" value="White_wine_type_and_source" />
            <param name="cross_validation" value="none"/>
            <param name="nComponents" value="2"/>
            <output name="out" file="ST000006_linear_discriminant_analysis_none_scores.tsv" />
            <output name="outClassification" file="ST000006_linear_discriminant_analysis_none_classification.tsv" />
            <output name="outClassificationAccuracy" file="ST000006_linear_discriminant_analysis_none_classification_accuracy.tsv" />
            <output name="figure" file="ST000006_linear_discriminant_analysis_none_figure.pdf" compare="sim_size" delta="10000"/>
        </test>
    </tests>
    <help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**


The tool performs linear discriminant analysis (LDA) on the data.

***NOTE: A minimum of 100 samples is required by the tool for single or double cross validation***

LDA is a supervised method based on the projection of data in the linear subspace to achieve maximum separation between samples in different groups and minimum separation between samples within groups. The subspace dimension defines the number of components used to describe the variability within the data.

Due to the LDA method specification, the subspace dimension must be less than the number of treatment groups. The user has an option to specify the dimension of the subspace directly (default = 2) or to perform single or double cross-validation to determine the dimension of the subspace. For single and double cross-validation, the dataset is split when model fit is performed. For double cross-validation, the data set is split into pieces and the model fit is performed on one piece using cross-validation and evaluated on the other pieces. For single cross-validation, the data are used to both fit and evaluate the model using a three-fold cross validation.

Visual summaries are provided in the form of a 2D plot where samples are colored by group and plotted along the determined subspace components pairwise.

More details about the method are available via:

Hastie, T. J., Tibshirani, R. J., and Friedman, J. H. (2011). The elements of statistical learning: data mining, inference, and prediction. Springer. p106-119


--------------------------------------------------------------------------------

**Note**

- This tool currently treats all variables as continuous numeric variables. Running the tool on categorical variables may result in incorrect results.
- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis.

--------------------------------------------------------------------------------

**Input**

    - Two input datasets are required.


@WIDE@

**NOTE:** The sample IDs must match the sample IDs in the Design File
(below). Extra columns will automatically be ignored.

@METADATA@

@UNIQID@

@GROUP@

**Cross-Validation Choice**

    - The choice of cross-validation options available for the user. None corresponds to no cross-validation where the user specifies the number of components manually. ***The tool requires a minimum of 100 samples***.


**Number of Components**

    - This parameter is used only when the "None" cross-validation option is selected. If the field is left blank, the number of components is set to the default value (2).


--------------------------------------------------------------------------------

**Output**

This tool outputs:

(1) TSV file containing the components produced by the model for each sample.
Component_{i}: contains the score values for each sample. The number of levels {i} is specified in the Number of components text box or determined via cross validation.

(2) TSV file containing the sample classifications produced by the model.
Group_Observed: Initial group labels.
Group_Predicted: Predicted group labels.

(3) TSV file containing the classification accuracy (in percent) of the algorithm with respect to the number of correctly classified samples.

(4) A PDF file containing 2D plots for all pairwise comparisons of components. Colored by treatment group.


    ]]></help>
    <expand macro="citations"/>
</tool>
<!-- file: log_and_glog_transformation.xml -->
<tool id="secimtools_log_and_glog_transformation" name="Log and Generalized Log (G-Log) Transformation" version="@WRAPPER_VERSION@">
    <description></description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command><![CDATA[
log_and_glog_transformation.py
--input $input
--design $design
--uniqID $uniqID
--transformation $transformation
--log_base $log_base
--lambda_value $lambda_value
--oname $oname
    ]]></command>
    <inputs>
        <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/>
        <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
        <param name="uniqID" size="30" type="text" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/>
        <!-- Exactly one option may be the default in a single-choice select. -->
        <param name="transformation" size="30" display="radio" type="select" label="Select Transformation" help="Select log or G-log.">
            <option value="log" selected="true">Logarithm</option>
            <option value="glog">Generalized Logarithm (G-Log)</option>
        </param>
        <param name="log_base" size="30" type="select" display="radio" label="Logarithm Base" help="Select logarithm base.">
            <option value="log" selected="true">Logarithm base e (natural)</option>
            <option value="log2">Logarithm base 2</option>
            <option value="log10">Logarithm base 10</option>
        </param>
        <!-- Lambda is only meaningful for the G-log transformation; the script ignores it otherwise. -->
        <param name="lambda_value" size="30" type="float" min="0" value="100" label="Regularization Parameter Lambda" help="Regularization parameter lambda is used only for G-log transformation and is ignored for log transformation. Lambda must be non-negative."/>
    </inputs>
    <outputs>
        <data format="tabular" name="oname" label="${tool.name} on ${on_string}" />
    </outputs>
    <tests>
        <test>
            <param name="input" value="ST000006_data.tsv"/>
            <param name="design" value="ST000006_design.tsv"/>
            <param name="uniqID" value="Retention_Index" />
            <param name="transformation" value="glog" />
            <param name="log_base" value="log" />
            <param name="lambda_value" value="1000000" />
            <output name="oname" file="ST000006_log_and_glog_transformation_glog_lambda_1000000.tsv" />
        </test>
        <test>
            <param name="input" value="ST000006_data.tsv"/>
            <param name="design" value="ST000006_design.tsv"/>
            <param name="uniqID" value="Retention_Index" />
            <param name="transformation" value="log" />
            <param name="log_base" value="log" />
            <param name="lambda_value" value="0" />
            <output name="oname" file="ST000006_log_and_glog_transformation_log.tsv" />
        </test>
    </tests>
    <help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

***NOTE: Zero or negative values in the original dataset will be replaced with missing values after log transformation since logarithms are not defined for non-positive values.***

This tool carries out either log or generalized log (G-log) transformation of values in a Wide Format dataset using the base specified by the user.

The logarithmic transformation has the formula: log(data).

The generalized logarithmic transformation has the formula: log(data + sqrt(data^2 + lambda)).

The generalized version becomes the standard logarithmic transformation re-scaled by sqrt(2) if the lambda value is 0.

Three bases are available for both logarithmic transformations:

base e (natural), base 2, and base 10.



--------------------------------------------------------------------------------

**Input**

    - Two input datasets are required.

@WIDE@

**NOTE:** The sample IDs must match the sample IDs in the Design File (below).
Extra columns will automatically be ignored.

@METADATA@

@UNIQID@

**Transformation Choice**

    - User has to choose between logarithmic and generalized logarithmic (G-log) transformation.

**Logarithm Base**

    - Select base of the logarithm.

**Regularization Parameter Lambda**

    - Enter a value for regularization parameter lambda. The value must be non-negative and is used only for G-log transformation. Default = 100.

--------------------------------------------------------------------------------

**Output**

A TSV file containing the same column names as the original Wide Dataset where the values in each cell correspond to the values obtained by the selected log transformation procedure.

***NOTE:*** If the original dataset contains 0 or negative values, they will be replaced with missing values after log transformation since logarithms are not defined for non-positive values.
Any values missing in the original dataset will remain missing.

    ]]></help>
    <expand macro="citations"/>
</tool>
<!-- file: macros.xml -->
<?xml version="1.0" ?>
<macros>
    <!-- Single version token shared by the package requirement and every tool's version attribute. -->
    <token name="@WRAPPER_VERSION@">21.3.4.2</token>
    <xml name="requirements">
        <requirements>
            <requirement type="package" version="@WRAPPER_VERSION@">secimtools</requirement>
            <yield/>
        </requirements>
    </xml>
    <xml name="citations">
        <citations>
            <!-- BibTeX requires authors to be separated with " and ", not commas. -->
            <citation type="bibtex">@ARTICLE{Kirpich17secimtools,
            author = {Kirpich, Alexander S. and Ibarra, Miguel and Moskalenko, Oleksandr and Fear, Justin M. and Gerken, Joseph and Mi, Xinlei and Ashrafi, Ali and Morse, Alison M. and McIntyre, Lauren M.},
            title = {SECIMTools: A suite of Metabolomics Data Analysis Tools},
            journal = {BMC Bioinformatics},
            year = {in press}
            }</citation>
        </citations>
    </xml>

    <token name="@TIP_AND_WARNING@">
    **TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*

    **WARNINGS:**

    (1) SampleIDs in the wide data that have no matching name in the design file, as well as sampleIDs in the design file that have no matching name in the data, will be excluded from the analysis.
    (2) This script automatically removes spaces and special characters from strings.
    (3) If a compound/feature name starts with a number it will prepend an '_'.
    (4) Input names are case sensitive and must match exactly (e.g use 'Feature' not 'feature').
    </token>
    <token name="@WIDE@">
    **Wide Formatted Dataset**

    A wide formatted dataset that contains measurements for each sample:

    +---------+---------+---------+---------+-----+
    | Feature | sample1 | sample2 | sample3 | ... |
    +=========+=========+=========+=========+=====+
    | one     | 10      | 20      | 10      | ... |
    +---------+---------+---------+---------+-----+
    | two     | 5       | 22      | 30      | ... |
    +---------+---------+---------+---------+-----+
    | three   | 30      | 27      | 2       | ... |
    +---------+---------+---------+---------+-----+
    | four    | 32      | 17      | 8       | ... |
    +---------+---------+---------+---------+-----+
    | ...     | ...     | ...     | ...     | ... |
    +---------+---------+---------+---------+-----+

    **NOTE:** The 'Feature' column defines the rows within a wide formatted dataset.
    </token>
    <token name="@METADATA@">
    **Design File**

    A Design file relating samples to various groups/treatment:

    +----------+--------+
    | sampleID | group  |
    +==========+========+
    | sample1  | g1     |
    +----------+--------+
    | sample2  | g1     |
    +----------+--------+
    | sample3  | g1     |
    +----------+--------+
    | sample4  | g2     |
    +----------+--------+
    | sample5  | g2     |
    +----------+--------+
    | sample6  | g2     |
    +----------+--------+
    | ...      | ...    |
    +----------+--------+

    **NOTE:** You must have a column named **sampleID** and the values in this column must match the column names in the wide dataset.
    </token>
    <token name="@DF@">
    **Design File**

    A Design file relating samples to various groups/treatment:

    +----------+--------+
    | sampleID | group  |
    +==========+========+
    | sample1  | g1     |
    +----------+--------+
    | sample2  | g1     |
    +----------+--------+
    | sample3  | g1     |
    +----------+--------+
    | sample4  | g2     |
    +----------+--------+
    | sample5  | g2     |
    +----------+--------+
    | sample6  | g2     |
    +----------+--------+
    | ...      | ...    |
    +----------+--------+

    **NOTE:** You must have a column named **sampleID** and the values in this column must match
    the column names in the long dataset.
    </token>
    <token name="@LONG@">
    **Long Dataset:**

    A dataset in long/stacked format that contains measurements for each sample:

    +----------+----------+------------+
    | Feature  | sampleID | Peak Height|
    +==========+==========+============+
    | One      | 1        | 10         |
    +----------+----------+------------+
    | One      | 2        | 5          |
    +----------+----------+------------+
    | One      | 3        | 30         |
    +----------+----------+------------+
    | Two      | 1        | 20         |
    +----------+----------+------------+
    | Two      | 2        | 22         |
    +----------+----------+------------+
    | Two      | 3        | 27         |
    +----------+----------+------------+
    | ...      | ...      | ...        |
    +----------+----------+------------+

    </token>
    <token name="@FLAGS@">
    **Flag File:**

    A wide formatted dataset that contains flags for each sample or feature:

    +----------+---------+---------+---------+-----+
    | Feature  | flag_A  | flag_B  | flag_C  | ... |
    +==========+=========+=========+=========+=====+
    | one      | 0       | 0       | 0       | ... |
    +----------+---------+---------+---------+-----+
    | two      | 0       | 1       | 1       | ... |
    +----------+---------+---------+---------+-----+
    | three    | 0       | 1       | 0       | ... |
    +----------+---------+---------+---------+-----+
    | four     | 1       | 0       | 0       | ... |
    +----------+---------+---------+---------+-----+
    | ...      | ...     | ...     | ...     | ... |
    +----------+---------+---------+---------+-----+

    </token>
    <token name="@MZRTFILE@">
    **M/Z RT File:**

    A wide formatted dataset that contains M/Z and RT measurements for each sample:

    +----------+--------+----------------+
    | sampleID | M/Z    | Retention Time |
    +==========+========+================+
    | sample1  | 0.1556 | 0.253618769    |
    +----------+--------+----------------+
    | sample2  | 0.1675 | 0.327658519    |
    +----------+--------+----------------+
    | sample3  | 0.1341 | 0.156587769    |
    +----------+--------+----------------+
    | sample4  | 0.2341 | 0.153658165    |
    +----------+--------+----------------+
    | sample5  | 0.4557 | 0.315765787    |
    +----------+--------+----------------+
    | sample6  | 0.1879 | 0.253655765    |
    +----------+--------+----------------+
    | ...      | ...    | ...            |
    +----------+--------+----------------+

    </token>
    <token name="@GROUP_OPTIONAL@">
    **Group/Treatment [Optional]**

    - Name of the column in your Design File that contains group classifications.
    </token>
    <token name="@GROUP@">
    **Group/Treatment**

    - Name of the column in your Design File that contains group classifications.
    </token>
    <token name="@UNIQID@">
    **Unique Feature ID**

    - Name of the column in your Wide Dataset that has unique Feature IDs.
    </token>
    <token name="@RUNORDER@">
    **Run Order ID**

    - The column name in your Design file that contains the order samples were run.
    </token>
    <token name="@RUNORDER_OPTIONAL@">
    **Run Order ID [Optional]**

    - The column name in your Design file that contains the order samples were run.
    </token>
</macros>
<!-- file: magnitude_difference_flags.xml -->
<tool id="secimtools_magnitude_difference_flags" name="Magnitude Difference Flags" version="@WRAPPER_VERSION@">
    <description>- Count the number of digits before the decimal place.</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
magnitude_difference_flags.py
--input $input
--design $design
--ID $uniqID
--flags $flags
--figure $figure
--counts "counts"
--html $html
--htmlPath "$html.files_path"

## A bare "#if $nozero" is truthy for the string "no" as well, which would
## always pass --noZero; compare the selected value explicitly instead.
#if str($nozero) == "yes":
    --noZero
#end if
#if $group:
    --group $group
#end if
    ]]></command>
    <inputs>
        <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/>
        <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
        <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your Wide Dataset that has unique identifiers."/>
        <param name="nozero" type="select" size="30" display="radio" value="yes" label="Remove zeros before processing" help="If not removed, zeros may skew the results.">
            <option value="yes">Remove zeros</option>
            <option value="no">Do not remove zeros</option>
        </param>
        <param name="group" size="30" type="text" label="Group/Treatment [Optional]" help="Name of the column in your Design File that contains group classifications."/>
    </inputs>
    <outputs>
        <data format="pdf" name="figure" label="${tool.name} on ${on_string}: Figure" />
        <data format="tabular" name="flags" label="${tool.name} on ${on_string}: Flags" />
        <data format="html" name="html" label="${tool.name} on ${on_string}: Counts" />
    </outputs>
    <tests>
        <test>
            <param name="input" value="ST000006_data.tsv"/>
            <param name="design" value="ST000006_design.tsv"/>
            <param name="uniqID" value="Retention_Index" />
            <param name="group" value="White_wine_type_and_source" />
            <param name="nozero" value="yes" />
            <output name="figure" file="ST000006_magnitude_difference_flags_figure.pdf" compare="sim_size" delta="10000"/>
            <output name="flags" file="ST000006_magnitude_difference_flags_flags.tsv" />
        </test>
    </tests>
    <help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

This tool counts the number of digits before the decimal place for each feature in each sample.
The tool identifies features with different orders of magnitude across different samples in a given group and produces corresponding indicator flags.
Unusual samples are identified by finding systematically low or high feature values for that particular sample.

--------------------------------------------------------------------------------

**Input**

    - Two input datasets are required.

@WIDE@

**NOTE:** The sample IDs must match the sample IDs in the Design File
(below). Extra columns will automatically be ignored.

@METADATA@

@UNIQID@

**Remove zeros before processing**

    - If zeros are not removed before processing, they may skew the results.

@GROUP_OPTIONAL@

**NOTE:** Groups with one element will be excluded from the test.

--------------------------------------------------------------------------------

**Output**

The tool outputs a variable number of files (from 2 to n+1) depending on the number of groups (n).

(1) TSV file containing a 0/1 indicator flag where “1” is used to flag features where the difference in the digit counts is greater than 2.

(2) TSV file containing the digit counts for all samples or for the samples within groups, depending on whether the Group/Treatment [Optional] parameter was provided.

(3) A PDF file of the distribution of digit counts within each group of samples.

    ]]></help>
    <expand macro="citations"/>
</tool>
<!-- file: mahalanobis_distance.xml -->
<tool id="secimtools_mahalanobis_distance" name="Penalized Mahalanobis Distance (PMD)" version="@WRAPPER_VERSION@">
    <description>to compare groups</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <stdio>
        <exit_code range="1:" level="warning" description="RuntimeWarning"/>
    </stdio>
    <command detect_errors="exit_code"><![CDATA[
mahalanobis_distance.py
--input $input
--design $design
--ID $uniqID
--figure $plot
--distanceToMean $out1
--distancePairwise $out2

#if $group
    --group $group
#end if

#if $levels
    --levels $levels
#end if

#if $p
    --per $p
#end if

#if $order
    --order $order
#end if

#if $penalty
    --penalty $penalty
#end if
    ]]></command>
    <inputs>
        <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file not tab separated see TIP below."/>
        <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
        <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/>
        <param name="group" type="text" size="30" label="Group/Treatment [Optional]" help="Name of the column in your design file that contains group classifications."/>
        <param name="order" type="text" size="30" label="Input Run Order Name [Optional]" help="Enter the name of the column containing the order samples were run. Spelling and capitalization must be exact."/>
        <param name="levels" type="text" size="30" label="Additional groups to separate by [Optional]" help="Enter additional group(s) name(s) to include. Spelling and capitalization must be exact. If more than one group separate with ','."/>
        <param name="p" type="float" value="0.95" size="6" label="Threshold" help="Threshold for standard distribution, specified as a percentile. Default = 0.95."/>
        <param name="penalty" type="float" value="0.5" size="6" label="λ Penalty" help="λ Penalty to use in the distance. The default is λ=0.5."/>
    </inputs>
    <outputs>
        <data format="pdf" name="plot" label="${tool.name} on ${on_string}: plot" />
        <data format="tabular" name="out1" label="${tool.name} on ${on_string}: toMean" />
        <data format="tabular" name="out2" label="${tool.name} on ${on_string}: pairwise" />
    </outputs>
    <tests>
        <test>
            <param name="input" value="ST000006_data.tsv"/>
            <param name="design" value="ST000006_design.tsv"/>
            <param name="uniqID" value="Retention_Index" />
            <param name="group" value="White_wine_type_and_source" />
            <param name="penalty" value="0.5" />
            <output name="plot" file="ST000006_mahalanobis_distance_figure.pdf" compare="sim_size" delta="10000" />
            <output name="out1" file="ST000006_mahalanobis_distance_to_mean.tsv" />
            <output name="out2" file="ST000006_mahalanobis_distance_pairwise.tsv" />
        </test>
    </tests>
    <help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

The Penalized Mahalanobis distance (PMD) tool can be used to compare samples within a group and accounts for the correlation structure between metabolites.
In contrast, Standardized Euclidian distance (SED) relies solely on geometric distance and ignores any dependency structures between features.
PMD incorporates the correlation structure inside the distance measurement.

When correlation structure and dependency between metabolites is ignored, the features inverse variance-covariance matrix simplifies to a diagonal matrix with diagonal values - in this case, MD simplifies to SED.
When the number of features is greater than the number of samples, the inverse of the features variance-covariance matrix does not exist.
This is the case for most -omic data. Here, the inverse is estimated using a regularization method (Archambeau et al. 2004).
The details of the regularization algorithm can be found in Supplementary file 3 in Kirpich et al. 2017.

Archambeau C, Vrins F, Verleysen M. Flexible and Robust Bayesian Classification by Finite Mixture Models. InESANN 2004 (pp. 75-80).

**NOTE:** Because of the nature of the tool, groups with less than 3 samples will be discarded from the analysis.


**Input**

    - Two input datasets are required.

@WIDE@

**NOTE:** The sample IDs must match the sample IDs in the Design File
(below). Extra columns will automatically be ignored.

@METADATA@

@UNIQID@

@GROUP_OPTIONAL@

    - **Warning:** All groups must contain 3 or more samples.


@RUNORDER_OPTIONAL@

**Additional groups to separate by [Optional]**

    - Enter additional group(s) name(s) to include. Spelling and capitalization must be exact. If more than one group, separate them with a comma
    - **Warning:** All groups must contain 3 or more samples.


**Threshold**

- The percentile cutoff for standard distributions. The default is 0.95.

**λ Penalty**

- λ Penalty to use in the distance. The default is λ=0.5.

--------------------------------------------------------------------------------

**Output**

The tool outputs three different files:

(1) a PDF file containing 2D scatter plots and boxplots for the distances

(2) a TSV file containing distances from the sample to the estimated mean

(3) a TSV file containing distances from the sample to other samples.

If the grouping variable is specified by the user, the distances are computed both within the groups and for the entire dataset.

    ]]></help>
    <expand macro="citations"/>
</tool>
<!-- file: merge_flags.xml -->
<tool id="secimtools_merge_flags" name="Merge Flag Files" version="@WRAPPER_VERSION@">
    <description>with the same unique identifiers into a single file.</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command><![CDATA[
merge_flags.py
--input "$input"
--output "$output"
## Pass the human-readable dataset names so the script can disambiguate
## identically-named flag columns coming from different files.
#set names = '" "'.join( [ str( $i.display_name ) for $i in $input ] )
--filename "${names}"
#if $flagUniqID
    --flagUniqID $flagUniqID
#end if
    ]]></command>
    <inputs>
        <param name="input" format="tabular" type="data" label="Input Flag Files" multiple="true" help="Input your tab-separated flag files. CTRL+CLICK to select multiple files. If not tab separated see TIP below." />
        <param name="flagUniqID" type="text" size="30" value="" label="Unique identifier in the flag files (feature or sample)" help="Name of the column in your flag file that contains unique identifiers."/>
    </inputs>
    <outputs>
        <data format="tabular" name="output" label="${tool.name} on ${on_string}: Flags"/>
    </outputs>
    <tests>
        <test>
            <param name="input" value="ST000006_run_order_regression_flags.tsv,ST000006_lasso_enet_var_select_flags.tsv"/>
            <param name="flagUniqID" value="Retention_Index" />
            <output name="output" file="ST000006_merge_flags_output.tsv" />
        </test>
    </tests>
    <help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

The tool merges two or more flag files together. The flag files can be either in wide format or in design format.

The merging requirements are:

(1) the number of rows should be the same in all files being merged and
(2) all files should contain the same unique ID column name to merge by.

**Note:** More broadly, the tool can merge non-metabolomics data as long as the above requirements are met.


--------------------------------------------------------------------------------

**Input**

    - Two or more datasets are required.

@WIDE@

**NOTE:** The sample IDs must match the sample IDs in the Design File (below).
Extra columns will automatically be ignored.

@METADATA@

**Unique ID for Flag file (feature ID or sample ID).**

    - Name of the column in your Flag file that contains unique IDs.


--------------------------------------------------------------------------------

**Output**

The TSV output contains all columns from the flag files. The column with the unique row ID will be included once in the output dataset.

**Note:** If the input flag files have the same flag column name in multiple files, the merged file will have columns from all imputed files.
To distinguish columns obtained from different files, column names will be altered by appending the corresponding file name to the end of the column name.
All non-supported file name characters will be changed to ‘_’.


    ]]></help>
    <expand macro="citations"/>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/modify_design_file.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,77 @@ +<tool id="secimtools_modify_design_file" name="Modify design file" version="@WRAPPER_VERSION@"> + <description>to remove specified group types or sampleIDs</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command><![CDATA[ +modify_design_file.py +--input $input +--design $design +--uniqID $uniqID +#if $group + --group $group +#end if +--drops $toDrop +--out $out + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If not tab separated see TIP below."/> + <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/> + <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers.."/> + <param name="group" type="text" size="30" value="" optional="false" label="Group/Treatment [Optional]" help="Name of the column in your design file that contains group classifications. 
If not provided, the drop will be performed by 'sampleID'."/> + <param name="toDrop" type="text" size="30" optional="false" label="Group(s)/Sample(s) to drop" help="Name of the Group(s) or Sample(s), comma separated, to remove from your design file."/> + </inputs> + <outputs> + <data format="tabular" name="out" label="${tool.name} on ${on_string}: Value"/> + </outputs> + <tests> + <test> + <param name="input" value="ST000006_data.tsv"/> + <param name="design" value="ST000006_design_group_name_underscore.tsv"/> + <param name="uniqID" value="Retention_Index" /> + <param name="group" value="White_wine_type_and_source_no_space" /> + <param name="toDrop" value="Chardonnay_Napa_CA2003,Riesling_CA2004,SauvignonBlanc_LakeCounty_CA2001" /> + <output name="out" file="ST000006_modify_design_output.tsv" /> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + +**Tool Description** + +The tool creates a new design file based on the existing wide dataset and design file where the specified group types or samples are removed from the existing design file. + +-------------------------------------------------------------------------------- + +**Input** + + - Two input datasets are required. + +@WIDE@ + +**NOTE:** The sample IDs must match the sample IDs in the Design File +(below). Extra columns will automatically be ignored. + +@METADATA@ + +@UNIQID@ + +**Group/Treatment [Optional]** + + - Name of the column in your Design File that contains group classifications. If not provided, the drop will be performed by 'sampleID's. + +**Group(s)/Sample(s) to drop** + + - Name of the Group(s) or Sample(s), comma separated, to remove from the design file. + +-------------------------------------------------------------------------------- + +**Output** + +This tool will output a single design file that contains only the samples not selected for removal. + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/modulated_modularity_clustering.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,106 @@ +<tool id="secimtools_modulated_modularity_clustering" name="Modulated Modularity Clustering (MMC)" version="@WRAPPER_VERSION@"> + <description>with visual summaries.</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command detect_errors="exit_code"><![CDATA[ +modulated_modularity_clustering.py +--input $input +--design $design +--ID $uniqID +--out $output +--figure $figure +--sigmaLow $sigmaLow +--sigmaHigh $sigmaHigh +--sigmaNum $sigmaNum +--correlation $corr + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If not tab separated see TIP below." /> + <param name="design" type="data" format="tabular" label="Design Dataset" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/> + <param name="uniqID" type="text" size="30" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers.." /> + <param name="sigmaLow" type="float" size="6" value="0.05" label="Lower sigma bound" help="Default: 0.05." /> + <param name="sigmaHigh" type="float" size="6" value="0.50" label="Upper sigma bound" help="Default: 0.50." /> + <param name="sigmaNum" type="float" size="6" value="451" label="Number of Sigma values" help="Number of values of sigma to search. Default: 451." /> + <param name="corr" type="select" value="pearson" label="Correlation method" help="Select correlation method for preliminary correlation before clustering. Default: Pearson." 
> + <option value="pearson" selected="true">Pearson</option> + <option value="kendall">Kendall</option> + <option value="spearman">Spearman</option> + </param> + </inputs> + <outputs> + <data format="tabular" name="output" label="${tool.name} on ${on_string}: Values"/> + <data format="pdf" name="figure" label="${tool.name} on ${on_string}: Heatmaps"/> + </outputs> + <tests> + <test> + <param name="input" value="ST000006_data.tsv"/> + <param name="design" value="ST000006_design.tsv"/> + <param name="uniqID" value="Retention_Index" /> + <param name="corr" value="pearson" /> + <output name="output" file="ST000006_modulated_modularity_clustering_out.tsv" compare="sim_size" delta="10000"/> + <output name="figure" file="ST000006_modulated_modularity_clustering_figure.pdf" compare="sim_size" delta="10000" /> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + +**Tool Description** + +Modulated Modularity Clustering method (MMC) was designed to detect latent structure in data using weighted graphs. +The method searches for optimal community structure and detects the magnitude of pairwise relationships. +The optimal number of clusters and the optimal cluster size are selected by the method during the analysis. + +The initial boundaries (lower and upper) for sigma as well as the number of points in the search grid (number of sigma values) are specified initially by the user. +The boundaries are extended automatically by the algorithm if the values are close to the boundary. The correlation type (Pearson, Kendall or Spearman) can be specified. + +More details about the method can be found in: + +Stone, E. A., and Ayroles, J. F. (2009). Modulated modularity clustering as an exploratory tool for functional genomic inference. PLoS Genet, 5(5), e1000479. + + +-------------------------------------------------------------------------------- + +**Input** + + - Two input datasets are required. 
+ +@WIDE@ + +**NOTE:** The sample IDs must match the sample IDs in the Design File (below). Extra columns will automatically be ignored. + +@METADATA@ + +@UNIQID@ + +**Lower sigma value** + + - Default: 0.05. + +**Upper sigma value** + + - Default: 0.50. + +**Sigma values** + + - Number of values of sigma to search. Default: 451. Higher numbers increase the precision but decrease the performance time. + +**Correlation method** + + - Correlation method for preliminary correlation before clustering. Default = Pearson. + +-------------------------------------------------------------------------------- + +**Output** + +The tool produces two files: a TSV file and a PDF file: + +(1) a TSV file containing the algorithm summaries and +(2) a PDF file containing (i) unsorted, (ii) sorted, and (iii) sorted and smoothed dependency heatmaps produced by the MMC algorithm respectively. + + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multiple_testing_adjustment.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,86 @@ +<tool id="secimtools_multiple_testing_adjustment" name="Multiple Testing Adjustment (MTA)" version="@WRAPPER_VERSION@"> + <description>of p-values.</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command><![CDATA[ +multiple_testing_adjustment.py +--input $input +--uniqID $uniqID +--pval "$pval" +--alpha $alpha +--outadjusted $outadjusted +--flags $flags + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If not tab separated see TIP below."/> + <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your Wide Dataset that has unique identifiers.."/> + <param name="pval" type="text" size="30" value="" label="p-value column" help="Name of the column in your wide dataset that contains the p-values."/> + <param name="alpha" type="float" size="6" value="0.05" label="α" help="Value of α to be used for multiple correction. 
+ Default α = 0.05."/> + </inputs> + <outputs> + <data format="tabular" name="outadjusted" label="${tool.name} on ${on_string}: Adjusted pval."/> + <data format="tabular" name="flags" label="${tool.name} on ${on_string}: Flags."/> + </outputs> + <tests> + <test> + <param name="input" value="ST000006_anova_fixed_with_group_summary.tsv"/> + <param name="uniqID" value="Retention_Index" /> + <param name="pval" value="prob_greater_than_t_for_diff_Chardonnay, Carneros, CA 2003 (CH01)-Chardonnay, Carneros, CA 2003 (CH02)" /> + <param name="alpha" value="0.05" /> + <output name="outadjusted" file="ST000006_multiple_testing_adjustment_outadjusted.tsv" /> + <output name="flags" file="ST000006_multiple_testing_adjustment_flags.tsv" /> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + +**Tool Description** + +The tool is designed to adjust p-values for multiple comparisons using three different methods: + +(1) The Bonferroni method and two false discovery rate (FDR) methods, (2) the Benjamini-Hochberg method (BH) and (3) the Benjamini-Yekutieli method (BY). +The p-value correction can be carried out on p-values generated from the following tools: Analysis of Variance (ANOVA) Fixed Effects Model, Kruskal-Wallis Non-Parametric Test, T-test (Single Group) and T-test (Paired and/or Unpaired) in addition to p-values generated outside of these tools. +The user can specify the total type I error α value. + +More details about the BH and BY methods are available in the papers: + +Benjamini, Y., and Hochberg, Y. (1995). Controlling the false discovery rate: a practical and powerful approach to multiple testing. Journal of the royal statistical society. Series B (Methodological), 289-300. + +Benjamini, Y., and Yekutieli, D. (2001). The control of the false discovery rate in multiple testing under dependency. Annals of statistics, 1165-1188. 
+ +------------------------------------------------------------------------------------------- + +**Input** + + - Two input datasets are required. + +@WIDE@ + +@UNIQID@ + +**Name for p-value column** + + - Name of the column in your Wide Dataset that contains the p-values. + +**α** + + - Value of α to be used for multiple correction. Default α = 0.05. + +------------------------------------------------------------------------------------------- + +**Output** + +The tool produces two TSV files: + +(1) One TSV that contains the following five columns: + a column with unique feature IDs, + a column of the original p-values and + the last three columns contain the p-values adjusted using the 3 methods described above which are reflected in the column name. +(2) The second TSV file contains flags where all significant values are flagged as 1 and non-significant values are flagged as 0. + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mzrt_match.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,160 @@ +<tool id="secimtools_mzrt_match" name="Mass to Charge Ratio - Retention Time (m/z - RT) Matching" version="@WRAPPER_VERSION@"> + <description>across 2 files.</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command detect_errors="exit_code"><![CDATA[ +mzrt_match.py +--anno1 $anno1 +--anno2 $anno2 +--uniqID1 $uniqID1 +--uniqID2 $uniqID2 +--mzID1 $mzID1 +--mzID2 $mzID2 +--rtID1 $rtID1 +--rtID2 $rtID2 +--all $all +--matched $matched +--unmatched1 $unmatched1 +--unmatched2 $unmatched2 +--summary $summary +--figure $figure +--mzcut $mz +--rtcut $rt +--name1 $name1 +--name2 $name2 + ]]></command> + <inputs> + <param name="anno1" type="data" format="tabular" label="File 1" help="Input dataset 1 in tab-separated wide format. If not tab separated see TIP below."/> + <param name="anno2" type="data" format="tabular" label="File 2" help="Input dataset 2 in tab-separated wide format. 
If not tab separated see TIP below."/> + <param name="uniqID1" type="text" size="30" value="" optional="false" label="Unique IDs for File 1" help="Name of the column in dataset 1 containing unique IDs."/> + <param name="uniqID2" type="text" size="30" value="" optional="false" label="Unique IDs for File 2" help="Name of the column in dataset 2 containing unique IDs."/> + <param name="mzID1" type="text" size="30" value="" optional="false" label="Mass/Charge column for File 1" help="Name of the column in dataset 1 containing m/z ratios."/> + <param name="mzID2" type="text" size="30" value="" optional="false" label="Mass/Charge column for File 2" help="Name of the column in dataset 2 containing m/z ratios."/> + <param name="rtID1" type="text" size="30" value="" optional="false" label="Retention Time column for File 1" help="Name of the column in dataset 1 containing RTs."/> + <param name="rtID2" type="text" size="30" value="" optional="false" label="Retention Time column for File 2" help="Name of the column in dataset 2 containing RTs."/> + <param name="mz" type="text" size="30" value="0.005" optional="true" label="Mass/Charge window" help="Window width for the m/z ratio (Default = 0.005)."/> + <param name="rt" type="text" size="30" value="0.15" optional="true" label="Retention Time window" help="Window width for RT (Default = 0.15)."/> + <param name="name1" type="text" size="30" value="F1" optional="true" label="Dataset 1 name" help="Short name for dataset 1 (By default F1)."/> + <param name="name2" type="text" size="30" value="F2" optional="true" label="Dataset 2 name" help="Short name for dataset 2 (By default F2)."/> + </inputs> + <outputs> + <data format="tabular" name="all" label="${tool.name} on ${on_string}: All"/> + <data format="tabular" name="matched" label="${tool.name} on ${on_string}: Matches"/> + <data format="tabular" name="unmatched1" label="${tool.name} on ${on_string}: Unmatched 1"/> + <data format="tabular" name="unmatched2" label="${tool.name} on 
${on_string}: Unmatched 2"/> + <data format="tabular" name="summary" label="${tool.name} on ${on_string}: Summary"/> + <data format="pdf" name="figure" label="${tool.name} on ${on_string}: Venn"/> + </outputs> + <tests> + <test> + <param name="anno1" value="TEST0000_mzrt_first.tsv"/> + <param name="anno2" value="TEST0000_mzrt_second.tsv"/> + <param name="uniqID1" value="rowID_first"/> + <param name="uniqID2" value="rowID_second"/> + <param name="mzID1" value="MZ_first" /> + <param name="mzID2" value="MZ_second" /> + <param name="rtID1" value="RT_first" /> + <param name="rtID2" value="RT_second" /> + <output name="all" file="TEST0000_mzrt_match_all.tsv" /> + <output name="matched" file="TEST0000_mzrt_match_matched.tsv" /> + <output name="unmatched1" file="TEST0000_mzrt_match_unmatched_first.tsv" /> + <output name="unmatched2" file="TEST0000_mzrt_match_unmatched_second.tsv" /> + <output name="summary" file="TEST0000_mzrt_match_summary.tsv" /> + <output name="figure" file="TEST0000_mzrt_match_figure.pdf" compare="sim_size" delta="10000" /> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + +**Tool Description** + +**NOTE:** This tool is primarily intended for matching mass spectrometry data processed using different parameter settings. + +Each metabolite (feature) is characterized by a mass to charge (m/z) ratio and retention time (RT). +After raw metabolomics data are processed (such as in mzMine), features are given internal identifers that are often different for every run or set of parameters, making it very difficult to impossible to directly compare results across different parameter setting using the internal identifiers. +However, it is possible to link internal identifiers using the m/z ratio and RT for each feature since changing parameter settings are predicted to result in only minor variations in m/z ratio and RT. +This tool matches two mass spectroscopy (MS) datasets generated using different parameter settings in mzMine. 
+ +Each file should contain at least three columns: + +(1) the m/z ratio, +(2) the RT and +(3) the internal identifier (feature ID). + +A feature matches across datasets if the m/z ratio and RT values in both MS datasets fall within a user defined window surrounding the m/z ratio (m/z window) and RT (RT window). +The size of the windows can be specified by the user - the final window width is 2 times the specified value. + +**NOTE:** Since this is a 'many to many' merge where matching occurs within windows around the m/z ratio and the RT, a single internal identifier in one dataset may match many identifiers in the other dataset. + +**NOTE:** While initially designed for MS data, this tool could also be used for other types of data where there is a need to match unique identifiers across datasets using values in 2 columns. +A detection window set to zero (0) would provide an exact match. + + +-------------------------------------------------------------------------------- + +**Input** + + - Two input datasets are required. + +@MZRTFILE@ + +**Unique Feature ID for File 1** + + - Name of the column in annotation file 1 containing unique IDs. + +**Unique Feature ID for File 2** + + - Name of the column in annotation file 2 containing unique IDs. + +**Mass/Charge for File 1** + + - Name of the column in annotation file 1 containing m/z ratios. + +**Mass/Charge for File 2** + + - Name of the column in annotation file 2 containing m/z ratios. + +**Retention Time for File 1** + + - Name of the column in your annotation file 1 containing RTs. + +**Retention Time for File 2** + + - Name of the column in your annotation file 2 containing RTs. + +**Mass/Charge window value** + + - Window value for the m/z ratio (Default = 0.005). + +**Retention Time window value** + + - Window value for the RT (Default = 0.15). + +**File Name 1** + + - A short name to identify your dataset 1. + +**File Name 2** + + - A short name to identify your dataset 2. 
+ + +-------------------------------------------------------------------------------- + +**Output** + +This tool outputs six files: + +(1) a TSV All peak combinations file that contains all combinations of possible features between File 1 and File 2. +(2) a TSV Matched peak combinations file that contains only the features that match between File 1 and File 2. +(3) a TSV Unmatched peak combinations in file1 that contains the features in File 1 that do not have a match in File 2. +(4) a TSV Unmatched peak combinations in file2 that contains the features in File 2 that do not have a match in File 1. +(5) a TSV Summary file that contains a summary of the matching between File 1 and File 2. +(6) a PDF file containing a set of 3 Venn diagrams to visualize matching between File 1 and File 2. + + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/partial_least_squares.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,145 @@ +<tool id="secimtools_partial_least_squares" name="Partial Least Squares Discriminant Analysis (PLS-DA)" version="@WRAPPER_VERSION@"> + <description></description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command detect_errors="exit_code"><![CDATA[ +partial_least_squares.py +--input $input +--design $design +--ID $uniqID +--group $group +--toCompare "$toCompare" +--cross_validation $cross_validation +--nComp $nComp +--outScores $outScores +--outWeights $outWeights +--outClassification $outClassification +--outClassificationAccuracy $outClassificationAccuracy +--figure $figures + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/> + <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/> + <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/> + <param name="group" type="text" size="30" label="Group/Treatment" help="Name of the column in your design file that contains group classifications."/> + <param name="toCompare" type="text" size="30" label="Names of the Groups to Compare" help="Names of the two groups to compare. The user should insure that group names do not contain commas. 
The separator for the two groups should only include commas (no spaces)."/> + <param name="cross_validation" type="select" size="30" display="radio" value="double" label="Cross-Validation Options"> + <option value="none">None</option> + <option value="single">Single</option> + <option value="double">Double</option> + </param> + <param name="nComp" type="text" size="30" value="2" label="Number of Components" help="Number of components for the analysis to use (default = 2). This field is used only when the cross validation field is set to none."/> + </inputs> + <outputs> + <data format="tabular" name="outScores" label="${tool.name} on ${on_string}: Scores"/> + <data format="tabular" name="outWeights" label="${tool.name} on ${on_string}: Weights"/> + <data format="tabular" name="outClassification" label="${tool.name} on ${on_string}: Classification of Samples"/> + <data format='tabular' name="outClassificationAccuracy" label="${tool.name} on ${on_string}: Classification Accuracy of Samples"/> + <data format="pdf" name="figures" label="${tool.name} on ${on_string}: Scatter Plots"/> + </outputs> + <tests> + <test> + <param name="input" value="ST000006_data.tsv"/> + <param name="design" value="ST000006_design_group_name_underscore.tsv"/> + <param name="uniqID" value="Retention_Index" /> + <param name="group" value="White_wine_type_and_source" /> + <param name="toCompare" value="Chardonnay_ Napa_ CA 2003,Riesling_ CA 2004" /> + <param name="cross_validation" value="none"/> + <param name="nComp" value="2"/> + <output name="outScores" file="ST000006_partial_least_squares_none_scores.tsv" /> + <output name="outWeights" file="ST000006_partial_least_squares_none_weights.tsv" /> + <output name="outClassification" file="ST000006_partial_least_squares_none_classification.tsv" /> + <output name="outClassificationAccuracy" file="ST000006_partial_least_squares_none_classification_accuracy.tsv" /> + <output name="figures" file="ST000006_partial_least_squares_none_figure.pdf" 
compare="sim_size" delta="10000"/> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + +**Tool Description** + +The tool performs partial least square discriminant analysis (PLS-DA) for two treatment groups selected by the user. + +**NOTE: A minimum of 100 samples is required by the tool for single or double cross validation** + +The subspace dimension defines the number of components that will be used to describe the variability within the data. +The user can specify subspace dimension in the range of two to the sample number. +The user has the option to specify the dimension of the subspace directly (Default =2) or to perform single or double cross-validation to determine the dimension of the subspace. + +For single and double cross-validation: the data set is split differently when the model fit is performed. + +For double cross-validation: the data set is split into pieces and the model fit is performed on one piece using cross-validation and evaluated on the other pieces. + +For single cross-validation: the same data are used to fit the model and to evaluate the model using three-fold cross validation. + +More details can be found in: + +Geladi, Paul, and Bruce R. Kowalski. "Partial least-squares regression: a tutorial." Analytica chimica acta 185 (1986): 1-17. + + +-------------------------------------------------------------------------------- + +**Note** + +- This tool currently treats all variables as continuous numeric + variables. Running the tool on categorical variables may result in + incorrect results. +- Rows containing non-numeric (or missing) data in any + of the chosen columns will be skipped from the analysis. + +-------------------------------------------------------------------------------- + +**Input** + + - Two input datasets are required. + +@WIDE@ + +**NOTE:** The sample IDs must match the sample IDs in the Design File (below). +Extra columns will automatically be ignored. 
+ + +@METADATA@ + +@UNIQID@ + +@GROUP@ + + +**Names of the Groups to Compare** + + - Comma separated names of the two groups in your Group/Treatment column that you want to compare. The user should ensure that group names do not contain commas. The separator for the two groups should only include commas (no spaces). + +**Cross-Validation Options** + + - The choice of cross-validation options available for the user. None corresponds to no cross-validation when the user specifies the number of components manually. + +**Number of Components** + + - The parameter is used only when the "None" cross-validation option is selected. If the field is left blank, the number of components is set to the default value (2). +-------------------------------------------------------------------------------- + +**Output** + + +Five different files are generated: + +(1) a TSV file containing the scores produced by the model for each sample + +(2) a TSV file containing the weights produced by the model for each feature. + +(3) a TSV file containing the classification produced by the model for each sample. + +(4) a TSV file containing the algorithm classification accuracy (in percent). + +(5) a PDF file containing the 2D plots for all pairwise comparisons of components between the two treatment groups. + +**NOTE:** Regardless of how many components are selected for the algorithm, pairwise 2D plots are produced for the pairs of components. +Increasing the number of components will increase the number of plots produced. + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/principal_component_analysis.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,95 @@ +<tool id="secimtools_principal_component_analysis" name="Principal Component Analysis (PCA)" version="@WRAPPER_VERSION@"> + <description>for visual summaries of the components.</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command detect_errors="exit_code"><![CDATA[ +principal_component_analysis.py +--input $input +--design $design +--ID $uniqID +--load_out $loadings +--score_out $scores +--summary_out $summary +--figure $figures + +#if $group + --group $group +#end if + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/> + <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. 
If not tab separated see TIP below."/> + <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/> + <param name="group" type="text" size="30" label="Group/Treatment [Optional]" help="Name of the column in your design file that contains group classifications."/> + </inputs> + <outputs> + <data format="tabular" name="loadings" label="${tool.name} on ${on_string}: loadings"/> + <data format="tabular" name="scores" label="${tool.name} on ${on_string}: scores"/> + <data format="tabular" name="summary" label="${tool.name} on ${on_string}: summary"/> + <data format="pdf" name="figures" label="${tool.name} on ${on_string}: scatter plots"/> + </outputs> + <tests> + <test> + <param name="input" value="ST000006_data.tsv"/> + <param name="design" value="ST000006_design.tsv"/> + <param name="uniqID" value="Retention_Index" /> + <param name="group" value="White_wine_type_and_source" /> + <output name="loadings" file="ST000006_principal_component_analysis_load_out.tsv" /> + <output name="scores" file="ST000006_principal_component_analysis_score_out.tsv" /> + <output name="summary" file="ST000006_principal_component_analysis_summary_out.tsv" /> + <output name="figures" file="ST000006_principal_component_analysis_figure.pdf" compare="sim_size" delta="10000" /> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + +**Tool Description** + +The tool performs principal component analysis (PCA) of the data. +Visual summaries are provided in the from of 2D and 3D scatter plots for the first three principal components. +Samples in the scatter plots are colored based on the group classification. + +-------------------------------------------------------------------------------- + +**Note** + +- This tool currently treats all variables as continuous numeric variables. Running the tool on categorical variables might result in incorrect results. 
+- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis. + +-------------------------------------------------------------------------------- + +**Input** + + - Two input datasets are required. + +@WIDE@ + +**NOTE:** The sample IDs must match the sample IDs in the Design File (below). +Extra columns will automatically be ignored. + + +@METADATA@ + +@UNIQID@ + +@GROUP_OPTIONAL@ + +-------------------------------------------------------------------------------- + +**Output** + +Four different outputs are produced by the Principal Component Analysis tool: + +(1) a TSV file containing eigenvectors/variable loadings +(2) a TSV file containing scores of input data on principal components +(3) a TSV file with the summary for each component +(4) and a PDF file of scatter plots of the first three principal components + +There are a total of four scatterplots: three pairwise plots for the first three components and a single 3D plot of the first three components. + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/random_forest.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,103 @@ +<tool id="secimtools_random_forest" name="Random Forest (RF)" version="@WRAPPER_VERSION@"> + <description>algorithm to select features.</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command detect_errors="exit_code"><![CDATA[ +random_forest.py +--input $input +--design $design +--ID $uniqID +--group $group +--snum $number_of_estimators +--num $number_of_factors +--out $outfile1 +--out2 $outfile2 +--figure $figure + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/> + <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/> + <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/> + <param name="group" type="text" size="30" value="" label="Group/Treatment" help="Name of the column in your design file that contains group classifications."/> + <param name="number_of_estimators" type="integer" size="30" value="1000" label="Number of trees in the forest" help="Recommend a minimum of 1000 trees."/> + <param name="number_of_factors" type="integer" size="30" value="20" label="Number of factors to plot" help="Plots the (Default = 20) most important factors."/> + </inputs> + <outputs> + <data format="csv" name="outfile1" label="${tool.name} on ${on_string}: Transformed Data"/> + <data format="csv" name="outfile2" label="${tool.name} on ${on_string}: Importance Factors"/> + <data format="pdf" name="figure" label="${tool.name} on ${on_string}: Variable Importance Plot"/> + </outputs> + <tests> + <test> + <param name="input" 
value="ST000006_data.tsv"/> + <param name="design" value="ST000006_design.tsv"/> + <param name="uniqID" value="Retention_Index" /> + <param name="group" value="White_wine_type_and_source" /> + <output name="outfile1" file="ST000006_random_forest_out.tsv" compare="sim_size" delta="10000" /> + <output name="outfile2" file="ST000006_random_forest_out2.tsv" compare="sim_size" delta="10000" /> + <output name="figure" file="ST000006_random_forest_figure.pdf" compare="sim_size" delta="10000" /> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + +**Tool Description** + +The tool identifies features that are different between treatment groups based on the random forest algorithm. +Based on Classification and Regression Trees (CART), random forests are an ensemble learning method for classification, regression and variable importance evaluation. +More details about the algorithm can be found in the book: + +Breiman, L. (2001). Random forests. Machine learning, 45(1), 5-32. + +**NOTE: The use of machine learning algorithms (including random forest) on datasets with a small number of samples is ambiguous and should be executed with caution.** + +-------------------------------------------------------------------------------------------------------------- + +**Input** + + - Two input datasets are required. + +@WIDE@ + +**NOTE:** The sample IDs must match the sample IDs in the Design File (below). +Extra columns will automatically be ignored. + + +@METADATA@ + +@UNIQID@ + +@GROUP@ + +**Number of Trees in the Forest** + + - Run a minimum of 1000 trees. + +**Number of factors to plot** + + - Plots the 20 most important factors. 
+ +-------------------------------------------------------------------------------- + +**Output** + +This tool will always output three different files: + +(1) a TSV file with features ranked according to their relative importance + +(2) a TSV file where ranked features from the wide format dataset are saved in columns in the order that corresponds to their relative importance + +(3) and a PDF file with a variable importance plot. The variable importance plot displays the X (Default = 20) most important features based on the random forest algorithm. The color of each feature changes from the most important (dark blue) to the least important (light blue). + + **NOTE:** The user can take the resulting TSV file and plot any two (or three) features using the Scatter Plot 2D or Scatter Plot 3D tools. + +A plot of two (or three) most important features is recommended since they are probably the most meaningful, but other features can also be considered for plotting. + + **To plot the 2 most important features**: use the SECIM Tools 'Scatter Plot 2D' tool on the transformed dataset to plot the features against each other and evaluate separation levels. + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/remove_selected_features_samples.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,140 @@ +<tool id="secimtools_remove_selected_features" name="Remove Selected Features or Samples" version="@WRAPPER_VERSION@"> + <description>from the data using a flag file.</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command><![CDATA[ +remove_selected_features_samples.py +--input $input +--design $design +--ID $uniqID +--flags $flags +--outWide $outWide +--outFlags $outFlags +--flagDrop $flagToDrop +--value $reference +--flagfiletype $typeofdrop + +#if $flagUniqID + --flagUniqID $flagUniqID +#end if +#if str( $conditional ) == '0' + --condition '0' +#end if +#if str( $conditional ) == '1' + --condition '1' +#end if +#if str( $conditional ) == '2' + --condition '2' +#end if + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Wide Dataset" + help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/> + <param name="design" type="data" format="tabular" label="Design File" + help="Input your design file (tab-separated). Note you need a 'sampleID' column. 
If not tab separated see TIP below."/> + <param name="flags" type="data" format="tabular" label="Flag File" + help="Input dataset containing binary indicator flag value for each feature or sample."/> + <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" + help="Name of the column in your wide dataset that has unique identifiers."/> + <param name="flagUniqID" type="text" size="30" value="" label="Unique ID for Flag file (feature ID or sample ID)" + help="Name of the column in your flag file that has unique identifiers.."/> + <param name="flagToDrop" type="text" size="30" value="" label="Flag to Drop" + help="Name of the column/row in your flag file to use for dropping."/> + <param name="conditional" size="30" type="select" value="" label="Condition of drop." help="Select type of conditional to use."> + <option value="0" selected="true">Equals to</option> + <option value="1" selected="false">Greater than</option> + <option value="2" selected="false">Less than</option> + </param> + <param name="reference" type="text" size="30" value="1" label="Cutoff Value" + help="Any rows or columns with a flag value equal to, greater than or less than (as set above) this Cutoff Value will be dropped."/> + <param name="typeofdrop" type="select" size="30" display="radio" value="row" label="Type of drop to be carried out." 
help="Select whether you want to drop by rows or by columns, default = rows."> + <option value="row">Drop Rows</option> + <option value="column">Drop Columns</option> + </param> + </inputs> + <outputs> + <data format="tabular" name="outWide" label="${tool.name} on ${on_string}: Dropped wide"/> + <data format="tabular" name="outFlags" label="${tool.name} on ${on_string}: Dropped flags"/> + </outputs> + <tests> + <test> + <param name="input" value="ST000006_data.tsv"/> + <param name="design" value="ST000006_design.tsv"/> + <param name="uniqID" value="Retention_Index" /> + <param name="flags" value="ST000006_kruskal_wallis_with_group_flags.tsv"/> + <param name="typeofdrop" value="row"/> + <param name="flagUniqID" value="Retention_Index" /> + <param name="flagToDrop" value="flag_significant_0p10_on_all_groups" /> + <param name="reference" value="0" /> + <param name="conditional" value="0" /> + <output name="outWide" file="ST000006_remove_selected_features_samples_wide.tsv" /> + <output name="outFlags" file="ST000006_remove_selected_features_samples_flags.tsv" /> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + +**Tool Description** + +The tool removes features (rows) or samples (columns) from a wide format dataset based on the flags in a separate flag file. +The user specifies a flag file and column to indicate removal. +Features or samples with a flag value equal to, greater than, or less than a user specified Cutoff Value (Default = 1) will be dropped from the wide dataset. + +The flag file should be either a wide format flag file (used for dropping features) or design format flag file (used for dropping samples). +The difference between the flag file formats is described in the beginning of the user manual (Kirpich et al. 2017). + +**NOTE:** Flag files generated outside of SECIM Tools can be used. + +-------------------------------------------------------------------------------- + +**Input** + + - Two input datasets are required. 
+ +@WIDE@ + +**NOTE:** The sample IDs must match the sample IDs in the Design File +(below). Extra columns will automatically be ignored. + +@METADATA@ + +@FLAGS@ + +@UNIQID@ + +**Unique ID for Flag file (feature ID or sample ID).** + + - Name of the column in your Flag file that contains the unique feature IDs (or sample IDs). + +**Flag to use for Drop.** + + - Name of the column/row in your Flag File to use for the drop. + +**Condition of drop.** + + - Select the type of conditional to use. + +**Cutoff Value** + + - Any row (or column) with a flag value equal to, greater than or less than this Cutoff Value (Default = 1) will be dropped. + +**Type of drop.** + + - Select whether you want to drop rows or columns, default = rows. + +-------------------------------------------------------------------------------- + +**Output** + +This tool outputs two TSV files: + +(1) The first TSV file is a wide format dataset generated from the input wide format dataset where features (or samples) flagged in the flag file have been removed. + +(2) The second TSV file is the flag file generated from the input flag file where the flagged features (or samples) have been dropped. The resulting flag file contains only features (or samples) that have not been dropped from the wide format dataset. + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/remove_user_specified_row_col.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,87 @@ +<tool id="secimtools_remove_user_specified_features" name="Remove User-Specified Features or Samples" version="@WRAPPER_VERSION@"> + <description>from the data.</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command><![CDATA[ +remove_user_specified_row_col.py +--input $input +--design $design +--ID $uniqID +--outWide $outWide +--row $rowID +--col $colID + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Wide Dataset" + help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/> + <param name="design" type="data" format="tabular" label="Design File" + help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/> + <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" + help="Name of the column in your wide dataset that has unique identifiers."/> + <param name="rowID" type="text" size="30" value="" label="Features to drop [Optional]." + help="Space-separated list of unique identifiers (rows) in your wide dataset to delete."/> + <param name="colID" type="text" size="30" value="" label="Samples to drop [Optional]." 
+ help="Space-separated list of unique sampleIDs (columns) to be removed from the wide dataset."/> + </inputs> + <outputs> + <data format="tabular" name="outWide" label="${tool.name} on ${on_string}: Dropped wide"/> + </outputs> + <tests> + <test> + <param name="input" value="fly_test_sbys.tsv"/> + <param name="design" value="fly_test_design.tsv"/> + <param name="uniqID" value="rowID" /> + <param name="rowID" value="_15" /> + <param name="colID" value="r101_V_3" /> + <output name="outWide" file="fly_remove_specified_features_samples_wide.tsv" /> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + +**Tool Description** + +The tool removes user-specified features (rows) or samples (columns) from a wide format dataset. +The user manually enters space delimited rows (features) and / or columns (samples). + +-------------------------------------------------------------------------------- + +**Input** + + - Two input datasets are required. + +@WIDE@ + +**NOTE:** The sample IDs must match the sample IDs in the Design File +(below). Extra columns will automatically be ignored. + +@METADATA@ + +@UNIQID@ + +**Unique ID for Flag file (feature ID or sample ID).** + + - Name of the column in your Wide dataset that contains unique Feature IDs. + +**rowID.** + + - Name of the row(s) in your Wide dataset to drop. + +**colID.** + + - Name of the column(s) in your Wide dataset to drop. + +-------------------------------------------------------------------------------- + +**Output** + +This tool outputs one TSV file: + +(1) A wide format dataset generated from the input wide format dataset where features and/or samples specified by the user have been removed. + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/retention_time_flags.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,110 @@ +<tool id="secimtools_retention_time_flags" name="Retention Time (RT) Flags" version="@WRAPPER_VERSION@"> + <description>- Flag features with discrepancies in retention time.</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <stdio> + <exit_code range="1:" level="warning" description="RuntimeWarning"/> + </stdio> + <command detect_errors="exit_code"><![CDATA[ +retention_time_flags.py +--input $input +--design $design +--ID $uniqID +--figure $RTplot +--flag $RTflag +--minutes $minutes +#if $CVcutoff: + --CVcutoff $CVcutoff +#end if +#if $pctl + --pctl +#end if + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/> + <param name="design" type="data" format="tabular" label="Design File" help="Input your design file(tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/> + <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your Wide Dataset that has unique identifiers."/> + <param name="CVcutoff" optional="true" type="float" value="0.1" size="4" label="Coefficient of Variation (CV) Cutoff" help="Coefficient of variation (CV) cutoff (in decimals) that specifies the proportion of features to flag. Default CV cutoff = 0.1, which implies that 10% of the features with the largest CVs will be flagged." /> + <param name="minutes" type="float" size="4" value="0.2" label="Retention Time Cutoff Value" help="If the difference in the retention time between the 95th and 5th percentiles (or 90th and 10th) is greater than this specified RT Cutoff value, features are flagged. 
(A default value of 0.2 assumes data units are in minutes)" /> + <param name="pctl" type="boolean" size="6" label="90th and 10th percentiles [Optional]" help="See RT Cutoff Value above. Check this box to use the 90th and 10th percentiles instead of the default 95th and 5th percentiles."/> + </inputs> + <outputs> + <data format="pdf" name="RTplot" label="${tool.name} on ${on_string}: plot" /> + <data format="tabular" name="RTflag" label="${tool.name} on ${on_string}: flag" /> + </outputs> + <tests> + <test> + <param name="input" value="TEST0000_rt.tsv"/> + <param name="design" value="TEST0000_design.tsv"/> + <param name="uniqID" value="rowID" /> + <output name="RTplot" file="TEST0000_retention_time_flags_figure.pdf" compare="sim_size" delta="10000"/> + <output name="RTflag" file="TEST0000_retention_time_flags_flag.tsv" /> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + +**Tool Description** + +***NOTE:*** This tool is primarily intended for flagging features with variation in retention times in mass spectrometry data analysis. +The goal of the tool is to identify potential problems with the instrument or with data processing and pre-processing. + +The retention time for a given feature is predicted to be relatively consistent across samples. This tool identifies potential abnormalities or shifts in the retention time for a feature. + +-------------------------------------------------------------------------------- + +**Input** + + - Two input datasets are required. + +@WIDE@ + +**NOTE:** The sample IDs must match the sample IDs in the Design File (below). +Extra columns will automatically be ignored. + +@METADATA@ + +@UNIQID@ + +**Coefficient of Variation (CV) Cutoff** + + - The coefficient of variation (CV) cutoff (in decimals) specifies the proportion of features to flag. Default CV cutoff = 0.1, implying that 10% of the features with the largest CV will be flagged. 
+ + +**Retention Time Cutoff Value** + + - A user specified value (Default value = 0.2 assumes the data units are in minutes) used with the percentile button below. Features where the difference in the RT between the 95th and 5th percentiles is greater than the given Retention Time Cutoff Value are flagged. + + +**90th percentile [Optional]** + + - See Retention Time Cutoff Value above. Check this box to use a 90th percentile. The default is a 95th percentile. + +-------------------------------------------------------------------------------- + +**Output** + +The tool outputs two files: + +(1) a TSV file with flags for each feature, where the results from each flagging method are saved in a separate column + + - flag_RT_Q95Q05_outlier: 0/1 flag where the value “1” is for features where the difference in the retention time between the 95th and 5th percentile (or 90th and 10th percentiles) is greater than the user specified Retention Time Cutoff Value (default is 0.2 minutes). + + - flag_RT_max_gt_threshold: 0/1 flag where the value “1” is for features where the difference between the retention time maximum and median is greater than the Retention Time Cutoff Value divided by 2. + + - flag_RT_min_lt_threshold: 0/1 flag where the value “1” is for features where the difference between the retention time minimum and median is greater than the Retention Time Cutoff Value divided by 2. + + - flag_RT_min_max_outlier: 0/1 flag where the value “1” is for features where the difference between the retention time minimum and maximum is greater than 3 times the standard deviation from the mean. + + - flag_RT_big_CV: 0/1 flag where the value “1” is for features where the coefficient of variation (CV) in retention time is greater than the CV Cutoff. The default value is 0.1 which corresponds to flagging the 10% of the features with the largest CV. + +(2) and a PDF file containing a density plot of the coefficients of variation (CV) for the retention time. 
The vertical red dotted line shows the CV cutoff for the top XX% of the data as specified by the CV cutoff values. + + + +]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/run_order_regression.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,93 @@ +<tool id="secimtools_run_order_regression" name="Run Order Regression (ROR)" version="@WRAPPER_VERSION@"> + <description>using the order samples were run.</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command detect_errors="exit_code"><![CDATA[ +run_order_regression.py +--input $input +--design $design +--ID $uniqID +--group $group +--order $order +--fig $order_plots +--table $order_summary +--flags $flags + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/> + <param name="design" type="data" format="tabular" label="Design File" help="Input your design file(tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/> + <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/> + <param name="group" type="text" size="30" value="" label="Group/Treatment" help="Name of the column in your design file that contains group classifications."/> + <param name="order" type="text" size="30" value="" label="Run Order ID" help="The name of the column in your design file that contains the order the samples were run."/> + </inputs> + <outputs> + <data name="order_plots" format="pdf" label="${tool.name} on ${on_string}: Plots" /> + <data name="order_summary" format="tabular" label="${tool.name} on ${on_string}: Summary"/> + <data name="flags" format="tabular" label="${tool.name} on ${on_string}: Flags"/> + </outputs> + <tests> + <test> + <param name="input" value="ST000006_data.tsv"/> + <param name="design" value="ST000006_design.tsv"/> + <param name="uniqID" value="Retention_Index" /> + <param name="group" 
value="White_wine_type_and_source" /> + <param name="order" value="run_Order_fake_variable" /> + <output name="order_plots" file="ST000006_run_order_regression_figure.pdf" compare="sim_size" delta="10000" /> + <output name="order_summary" file="ST000006_run_order_regression_table.tsv" /> + <output name="flags" file="ST000006_run_order_regression_flags.tsv" /> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + +**Tool Description** + +**NOTE:** The tool is intended to evaluate the impact of sample run order on feature (row) values. Not applicable in the absence of known run order. + +It uses linear regression to identify features where the regression slope is not zero for nominal levels of significance. + +The tool fits a simple linear regression by feature (row) using values for each feature as a response and sample run order as a linear predictor. +The goal is to identify a linear trend that changes over time and determine whether the trends are statistically significant. +The tool generates flags if the slope is statistically significant for two different levels of statistical significance ( alpha = 0.05 and alpha = 0.01). + +NOTE: Groups with one element are excluded from the analysis. + +-------------------------------------------------------------------------------- + +**Input** + + - Two input datasets are required. + +@WIDE@ + +**NOTE:** The sample IDs must match the sample IDs in the Design File (below). Extra columns will automatically be ignored. + +@METADATA@ + +@UNIQID@ + +@GROUP@ + + - **NOTE:** Groups with one element will be excluded. + +@RUNORDER@ + + +----------------------------------------------------------------------------------- + +**Output** + +This tool outputs three different files: + +(1) a TSV file of regression summaries including the values of the regression slope, corresponding p-value and r-squared value. 
+ +(2) a TSV file with the corresponding flags for two levels of statistical significance (alpha = 0.05 and alpha = 0.01). + +(3) and a PDF file with fitted regression plots for each feature. The values of the feature are displayed on the plot together with the regression line, bands, slopes, and corresponding p and r-squared values. The values are colored according to group classification. + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scatter_plot_2D.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,154 @@ +<tool id="secimtools_scatter_plot_2D" name="Scatter Plot 2D" version="@WRAPPER_VERSION@"> + <description>- A standalone tool.</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command detect_errors="exit_code"><![CDATA[ +scatter_plot_2D.py +--input $input +--ID $uniqID + +--X $x +--Y $y +--figure $figure + +#if $design + --design $design +#end if + +#if $group + --group $group +#end if + +#if $color + --color $color +#end if + +#if $palette + --palette $palette +#end if + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Long Dataset" help="Input dataset in tab-separated long format. Please see the description of the file format below. If the file is not tab separated see TIP below."/> + <param name="uniqID" type="text" size="30" value="" label="SampleID" help="Name of the column in your long dataset that has the unique sample idenifiers (sampleID)."/> + <param name="x" type="text" size="30" value="" label="X Group Title" help="Name of the column in your long format dataset for X values."/> + <param name="y" type="text" size="30" value="" label="Y Group Title" help="Name of the column in your long format dataset for Y values."/> + <param name="design" type="data" format="tabular" optional="true" label="Design File [Optional]" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/> + <param name="group" type="text" size="30" value="" label="Group/Treatment [Optional]" help="Name of the column in your design file that contains group classification to be used for coloring."/> + <param name="palette" type="text" size="30" label="Palette [Optional]" help="Select color palette. Default = tableau. Other options are diverging, qualitative, sequential, cubehelix, tableau, and wesanderson. 
Please see the descriptions for the palettes below."/> + <param name="color" type="text" size="30" label="Color Scheme [Optional]" help="Select color scheme within the palette. Default color scheme for palette tableau = Tableau_20. User must specify a color scheme if the palette field has been filled. Please see the descriptions for the color schemes below."/> + </inputs> + <outputs> + <data format="pdf" name="figure" label="${tool.name} on ${on_string}: scatter plots"/> + </outputs> + <tests> + <test> + <param name="input" value="ST000006_principal_component_analysis_score_out.tsv"/> + <param name="design" value="ST000006_design_group_name_underscore.tsv"/> + <param name="uniqID" value="sampleID" /> + <param name="group" value="White_wine_type_and_source" /> + <param name="x" value="PC1" /> + <param name="y" value="PC2" /> + <output name="figure" file="ST000006_scatter_plot_2D_default_figure.pdf" compare="sim_size" delta="10000"/> + </test> + <test> + <param name="input" value="ST000006_principal_component_analysis_score_out.tsv"/> + <param name="design" value="ST000006_design_group_name_underscore.tsv"/> + <param name="uniqID" value="sampleID" /> + <param name="group" value="White_wine_type_and_source" /> + <param name="x" value="PC1" /> + <param name="y" value="PC2" /> + <param name="palette" value="sequential" /> + <param name="color" value="Blues_3" /> + <output name="figure" file="ST000006_scatter_plot_2D_palette_color_figure.pdf" compare="sim_size" delta="10000"/> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + + + +**Tool Description** + +The tool provides a 2D scatter plot of values in a Long Format file. Please see the description of the Long Format below. +If coloring by group is desired, the column with the sample names in the Long Format dataset has to have the name "sampleID" to match the name in the Design File. 
+Scatter plot 2D allows the user to plot any pair of values from the Principal Component Analysis (PCA) output or plot other data. + +NOTE: The user should ensure that the input datasets have no missing values. + +The user has an option to specify the palette and the color scheme within the palette. +If the palette is specified by the user, the color scheme must to be specified. +The list of available palettes are: + +diverging, +qualitative, +sequential, +cubehelix, +tableau (default), and +wesanderson. + +The lists of corresponding color schemes for each palattes are available via the links below: + + +https://jiffyclub.github.io/palettable/tableau/ + +https://jiffyclub.github.io/palettable/colorbrewer/diverging/ + +https://jiffyclub.github.io/palettable/colorbrewer/qualitative/ + +https://jiffyclub.github.io/palettable/colorbrewer/sequential/ + +https://jiffyclub.github.io/palettable/cubehelix/ + +https://jiffyclub.github.io/palettable/wesanderson/ + + +-------------------------------------------------------------------------------- + +**Input** + + - Two input datasets are required. + +@LONG@ + + +**Sample ID** + + - Name of the column in your Long Dataset that has unique sample IDs. If coloring by group, the sampleIDs must match the sampleIDs in the Design File (below). + + +**X Group Title** + + - Name of the column in the Long Format dataset for X values. + +**Y Group Title** + + - Name of the column in the Long Format dataset for Y values. + +@DF@ + +**Group/Treatment [Optional]** + + - Name of the column in your Design File that contains group classification to be used for coloring. + + +**Palette [Optional]** + + - Choice of the palette. Default = tableau. Other options include: diverging, qualitative, sequential, cubehelix, and wesanderson. + +**Color Scheme [Optional]** + + - Choice of the color scheme within the palette. The default color scheme for palette tableau is Tableau_20. 
The user must specify the color scheme if the Palette field has been filled. + +-------------------------------------------------------------------------------- + +**Output** + +The tool produces a PDF file with the 2D scatter plot. +Coloring of the features by group and the corresponding legend will be included in the plot if the user provides a Design file and Palette. + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scatter_plot_3D.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,182 @@ +<tool id="secimtools_scatter_plot_3D" name="Scatter Plot 3D" version="@WRAPPER_VERSION@"> + <description>- A standalone tool.</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command detect_errors="exit_code"><![CDATA[ +scatter_plot_3D.py +--input $input +--ID $uniqID + +--X $x +--Y $y +--Z $z +--figure $figure + +#if $design + --design $design +#end if + +#if $group + --group $group +#end if + +#if $color + --color $color +#end if + +#if $palette + --palette $palette +#end if + +#if $rotation + --rotation $rotation +#end if + +#if $elevation + --elevation $elevation +#end if + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Long Dataset" help="Input dataset in tab-separated long format. Please see the description of the file format below. If the file is not tab separated see TIP below."/> + <param name="uniqID" type="text" size="30" value="" label="SampleID" help="Name of the column in your long dataset with unique sample identifiers (sampleID). If coloring by group, sampleIDs must match sampleIDs in the Design File."/> + <param name="x" type="text" size="30" value="" label="X Group Title" help="Name of the column in long format dataset for X values."/> + <param name="y" type="text" size="30" value="" label="Y Group Title" help="Name of the column in long format dataset for Y values."/> + <param name="z" type="text" size="30" value="" label="Z Group Title" help="Name of the column in long format dataset for Z values."/> + <param name="rotation" type="text" size="30" value="" label="Azimuth (Rotation) Angle for Viewing [Optional]" help="The azimuth (rotation) angle for viewing in degrees. Default = 45 degrees. 
The ideal azimuth (rotation) angle may be a process of trial and error."/> + <param name="elevation" type="text" size="30" value="" label="Elevation Angle for Viewing [Optional]" help="The elevation angle for viewing in degrees. Default = 45 degrees. The ideal elevation angle may be a process of trial and error."/> + <param name="design" type="data" format="tabular" optional="true" label="Design File [Optional]" help="Design file tab separated. Note you need a 'sampleID' column. If not tab separated see TIP below."/> + <param name="group" type="text" size="30" value="" label="Group/Treatment [Optional]" help="Name of the column in your Design File that contains group classification that will be used for coloring."/> + <param name="palette" type="text" size="30" label="Palette [Optional]" help="Select color palette. Default = tableau. Other options include diverging, qualitative, sequential, cubehelix, and wesanderson. Please see the descriptions for the palettes below."/> + <param name="color" type="text" size="30" label="Color Scheme [Optional]" help="Select color scheme within the palette. The default color scheme for palette tableau = Tableau_20. User must specify Color Scheme if the Palette field has been filled. 
+ Please see the descriptions for the color schemes below."/> + </inputs> + <outputs> + <data format="pdf" name="figure" label="${tool.name} on ${on_string}: scatter plots"/> + </outputs> + <tests> + <test> + <param name="input" value="ST000006_principal_component_analysis_score_out.tsv"/> + <param name="design" value="ST000006_design_group_name_underscore.tsv"/> + <param name="uniqID" value="sampleID" /> + <param name="group" value="White_wine_type_and_source" /> + <param name="x" value="PC1" /> + <param name="y" value="PC2" /> + <param name="z" value="PC3" /> + <output name="figure" file="ST000006_scatter_plot_3D_default_figure.pdf" compare="sim_size" delta="10000" /> + </test> + <test> + <param name="input" value="ST000006_principal_component_analysis_score_out.tsv"/> + <param name="design" value="ST000006_design_group_name_underscore.tsv"/> + <param name="uniqID" value="sampleID" /> + <param name="group" value="White_wine_type_and_source" /> + <param name="x" value="PC1" /> + <param name="y" value="PC2" /> + <param name="z" value="PC3" /> + <param name="rotation" value="30" /> + <param name="elevation" value="23" /> + <param name="palette" value="sequential" /> + <param name="color" value="Blues_3" /> + <output name="figure" file="ST000006_scatter_plot_3D_palette_color_figure.pdf" compare="sim_size" delta="10000" /> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + + +**Tool Description** + +The tool provides a 3D scatter plot of values in a Long Format file. Please see the description of the Long Format below. +If coloring by group is desired, the column with the sample names in the Long Format dataset has to have the name "sampleID" to match the name in the Design File. +Scatter plot 3D allows the user to plot any three values from the Principal Component Analysis (PCA) output or plot other data. + +NOTE: The user should ensure that the input datasets have no missing values.
+ +The user has an option to specify the palette and the color scheme within the palette. +If the palette is specified by the user, the color scheme must be specified. +The list of available palettes is: + +diverging, +qualitative, +sequential, +cubehelix, +tableau (default), and +wesanderson. + +The lists of corresponding color schemes for each palette are available via the links below: + +https://jiffyclub.github.io/palettable/tableau/ + +https://jiffyclub.github.io/palettable/colorbrewer/diverging/ + +https://jiffyclub.github.io/palettable/colorbrewer/qualitative/ + +https://jiffyclub.github.io/palettable/colorbrewer/sequential/ + +https://jiffyclub.github.io/palettable/cubehelix/ + +https://jiffyclub.github.io/palettable/wesanderson/ + + +**NOTE:** The user should ensure that the Long Format and Design File datasets have no missing values. + + +-------------------------------------------------------------------------------- + +**Input** + + - Two input datasets are required. + +@LONG@ + + +**Sample ID** + + - Name of the column in your Long Dataset that has the unique sample IDs. If coloring by group is desired based on the Design File, sample IDs have to be saved in a column with the name sampleID to match the name in the Design File. + + +**X Group Title** + + - Name of the column in the Long Format dataset for X values. + +**Y Group Title** + + - Name of the column in the Long Format dataset for Y values. + +**Z Group Title** + + - Name of the column in the Long Format dataset for Z values. + +**Azimuth (Rotation) Angle for Viewing [Optional]:** + + - The azimuth (rotation) angle for viewing in degrees. The default value = 45 degrees. The ideal azimuth (rotation) angle for viewing may involve trial and error. + +**Elevation Angle for Viewing [Optional]:** + + - The elevation angle for viewing in degrees. The default value = 45 degrees. The ideal elevation angle for viewing may involve trial and error. 
+ +@DF@ + +**Group/Treatment [Optional]** + + - Name of the column in your Design File that contains group classification for coloring. + +**Palette [Optional]** + + - Choice of the palette. Default = tableau. The other options include: diverging, qualitative, sequential, cubehelix, tableau, and wesanderson. + +**Color Scheme [Optional]** + + - Choice of the color scheme within the palette. The default color scheme for palette tableau is Tableau_20. User has to specify Color Scheme if the Palette field has been filled. + +-------------------------------------------------------------------------------- + +**Output** + +The tool produces a PDF file containing a 3D scatter plot. +The coloring of the features by group and the corresponding legend will be included in the plot if the user provides the Design File and Palette. + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/standardized_euclidean_distance.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,116 @@ +<tool id="secimtools_standardized_euclidean_distance" name="Standardized Euclidean Distance (SED)" version="@WRAPPER_VERSION@"> + <description></description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command detect_errors="exit_code"><![CDATA[ +standardized_euclidean_distance.py +--input $input +--design $design +--ID $uniqID +--fig $plot +--SEDtoMean $out1 +--SEDpairwise $out2 + +#if $group + --group $group +#end if +#if $levels + --levels $levels +#end if +#if $p + --per $p +#end if + +#if $order + --order $order +#end if + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file not tab separated see TIP below."/> + <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/> + <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/> + <param name="group" type="text" size="30" label="Group/Treatment [Optional]" help="Name of the column in your design file that contains group classifications." /> + <param name="order" type="text" size="30" label="Input Run Order Name [Optional]" help="Enter the name of the column containing the order samples were run. Spelling and capitalization must be exact." /> + <param name="levels" type="text" size="30" label="Additional groups to separate by [Optional]" help="Enter additional group(s) name(s). Spelling and capitalization must be exact. If more than one group separate with a ','." /> + <param name="p" type="float" value= "0.95" size="6" label="Threshold" help="Threshold for standard distribution, specified as percentile. 
Default = 0.95." /> + </inputs> + <outputs> + <data format="pdf" name="plot" label="${tool.name} on ${on_string}: Plot" /> + <data format="tabular" name="out1" label="${tool.name} on ${on_string}: SEDtoMean" /> + <data format="tabular" name="out2" label="${tool.name} on ${on_string}: SEDpairwise" /> + </outputs> + <tests> + <test> + <param name="input" value="ST000006_data.tsv"/> + <param name="design" value="ST000006_design.tsv"/> + <param name="uniqID" value="Retention_Index" /> + <param name="group" value="White_wine_type_and_source" /> + <output name="plot" value="ST000006_standardized_euclidean_distance_figure.pdf" compare="sim_size" delta="50000" /> + <output name="out1" file="ST000006_standardized_euclidean_distance_to_mean.tsv" /> + <output name="out2" file="ST000006_standardized_euclidean_distance_pairwise.tsv" /> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + +**Tool Description** + +The tool is designed to identify samples that are different using the standardized Euclidian distance (SED) between samples. +The tool estimates the variance of features and calculates the SED between each pair of samples in addition to the SED between each sample and the estimated mean. +If a group or treatment variable is provided, then the same distance plots are generated for each group and for all samples together. + +**NOTE:** Groups with less than three samples will be excluded from the analysis. + + + +**Input** + + - Two input datasets are required. + +@WIDE@ + +**NOTE:** The sample IDs must match the sample IDs in the Design File +(below). Extra columns will automatically be ignored. + +@METADATA@ + +@UNIQID@ + +@GROUP_OPTIONAL@ + + - **Warning:** All groups must contain 3 or more samples. + + +@RUNORDER_OPTIONAL@ + +**Additional groups to separate by [Optional]** + + - Enter group(s) name(s). Spelling and capitalization must be exact. If more than one group, separate with commas. + - **Warning:** All groups must contain 3 or more samples. 
+ - **NOTE:** Groups with one element will be excluded from the analysis. + + +**Percentile cutoff** + +- The percentile cutoff for standard distributions. The default is 0.95. + +-------------------------------------------------------------------------------- + +**Output** + +The tool outputs three different files: + +(1) a TSV file that contains an n x n matrix (where n is the number of computed samples) of the pairwise distances between the samples. +If the Group/Treatment [Optional] variable is specified, the distances will be computed within groups. + +(2) A PDF file containing: +(i) Boxplots of the distribution of distances. The distances are computed between samples in the group and summarized as boxplots. +The outliers (blue dots), means (red squares) and median (blue bars) of the distances are presented for each sample within the group. +(ii) 2D scatter plots that show distances computed pairwise within the group. + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/subset_data.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,82 @@ +<tool id="secimtools_subset_data" name="Subset Wide Data and Design File" version="@WRAPPER_VERSION@"> + <description>- Based on groups.</description> + <requirements> + <requirement type="python-module">os</requirement> + <requirement type="python-module">pandas</requirement> + <requirement type="python-module">interface</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + subset_data.py + --input $input + --design $design + --uniqID $uniqID + #if $group + --group $group + #end if + --drops $toDrop + --out $out + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If not tab separated see TIP below."/> + <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/> + <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/> + <param name="group" type="text" size="30" value="" optional="false" label="Group/Treatment [Optional]" help="Name of the column in your design file that contains group classifications. 
If not provided, the drop will be performed by 'sampleID'."/> + <param name="toDrop" type="text" size="30" optional="false" label="Group(s)/Sample(s) to drop" help="Name of the Group(s)/Sample(s), comma separated, that will be removed from your wide dataset."/> + </inputs> + <outputs> + <data format="tabular" name="out" label="${tool.name} on ${on_string}: Value"/> + </outputs> + <macros> + <import>macros.xml</import> + </macros> + <tests> + <test> + <param name="input" value="ST000006_data.tsv"/> + <param name="design" value="ST000006_design_names_underscore.tsv"/> + <param name="uniqID" value="Retention_Index" /> + <param name="group" value="White_wine_type_and_source" /> + <param name="toDrop" value="Chardonnay_ Napa_ CA 2003,Riesling_ CA 2004" /> + <output name="out" file="ST000006_subset_data_output.tsv" /> + </test> + </tests> +<help> + +@TIP_AND_WARNING@ + +**Tool Description** + +The tool creates new wide format dataset and design dataset based on the existing wide and design datasets where only groups specified by the user are present. +The user chooses which group(s) to include in the new datasets. + +-------------------------------------------------------------------------------- + +**Input** + + - Two input datasets are required. + +@WIDE@ + +**NOTE:** The sample IDs must match the sample IDs in the Design File +(below). Extra columns will automatically be ignored. + +@METADATA@ + +@UNIQID@ + +**Group/Treatment [Optional]** + + - Name of the column in your Design File that contains group classifications. If none provided the drop will be performed by 'sampleID'. + +**Group(s)/Sample(s) to drop** + + - Name of the Group(s)/Sample(s), comma separated, that will be removed from your wide dataset. 
+ +-------------------------------------------------------------------------------- + +**Output** + +This tool will output two TSV files: a TSV file containing the subset of the original wide format dataset and a TSV file containing the subset of the original design dataset. Both datasets will contain only the samples belonging to groups selected by the user. + + </help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/summarize_flags.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,70 @@ +<tool id="secimtools_summarize_flags" name="Summarize Flags in a Flag File" version="@WRAPPER_VERSION@"> + <description></description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command><![CDATA[ +summarize_flags.py +--flags $flags +--ID $uniqID +--outSummary $summaryFlags + ]]></command> + <inputs> + <param name="flags" type="data" format="tabular" label="Flag File" + help="Input dataset containing the binary indicator flag values for each feature."/> + <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" + help="Name of the column in your flag file containing unique identifiers."/> + </inputs> + <outputs> + <data name="summaryFlags" format="tabular" label="${tool.name} on ${on_string}"/> + </outputs> + <tests> + <test> + <param name="flags" value="ST000006_lasso_enet_var_select_flags.tsv"/> + <param name="uniqID" value="Retention_Index" /> + <output name="summaryFlags" file="ST000006_summarize_flags_outSummary.tsv" /> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + +**Tool Description** + +Summary of the Flags + +This tool takes a wide format flag file and summarizes information about the flags. +The flag summary includes, for each feature: +(i) the sum of the flags, +(ii) the mean of the flags, +(iii) an indicator if at least one of the original flags had a value of 1 and +(iv) a second indicator if all of the original flags had a value of 1. + +-------------------------------------------------------------------------------- + +**Input** + + - One input dataset is required. + +@FLAGS@ + +@UNIQID@ + +-------------------------------------------------------------------------------- + +**Output** + +This tool will output a TSV file containing all original input flag data with four additional columns containing the flag summaries. 
+ +The four additional columns are: + +(i) a column with the sum of the flags, +(ii) a column with the flag means, +(iii) a flag indicator column if at least one of the original flags had a value of 1, and +(iv) a second flag indicator column if all of the original flags had a value of 1. + + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/svm_classifier.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,198 @@ +<tool id="secimtools_svm_classifier" name="Support Vector Machine (SVM) Classifier" version="@WRAPPER_VERSION@"> + <description>- Predict sample groups.</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command><![CDATA[ +svm_classifier.py +--train_wide $train_wide +--train_design $train_design +--test_wide $test_wide +--test_design $test_design +--group $group +--ID $uniqID +--kernel $kernel +--degree $degree +--C $C +--cross_validation $cross_validation +--C_lower_bound $C_lower_bound +--C_upper_bound $C_upper_bound +--a $a +--b $b +--outClassification $outClassification +--outClassificationAccuracy $outClassificationAccuracy +--outPrediction $outPrediction +--outPredictionAccuracy $outPredictionAccuracy + ]]></command> + <inputs> + <param name="train_wide" type="data" format="tabular" label="Training wide dataset" help="Dataset missing? See TIP below."/> + <param name="train_design" type="data" format="tabular" label="Training design file" help="Dataset missing? See TIP below."/> + <param name="test_wide" type="data" format="tabular" label="Target wide dataset" help="Dataset missing? See TIP below."/> + <param name="test_design" type="data" format="tabular" label="Target design file" help="Dataset missing? 
See TIP below."/> + <param name="group" size="30" type="text" value="" label="Group/Treatment" help="Name of the column in your Training and Target design files that contain group classifications."/> + <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your Training and Target wide datasets that contain unique identifiers."/> + <param name="kernel" type="select" size="30" display="radio" value="rbf" label="Select a SVM Kernel Function"> + <option value="rbf">Radial Basis function (Gaussian)</option> + <option value="linear">Linear</option> + <option value="poly">Polynomial</option> + <option value="sigmoid">Sigmoid</option> + </param> + <param name="degree" size="30" type="text" value="3" label="Polynomial Degree" help='Only used for the polynomial kernel.'/> + <param name="cross_validation" type="select" size="30" display="radio" value="double" label="Select Cross-Validation"> + <option value="none">None</option> + <option value="single">Single</option> + <option value="double">Double</option> + </param> + <param name="C" size="30" type="text" value="1" label="Regularization Parameter C" help='See references in tool description for setting this parameter. Value must be positive (C > 0). Used only if cross-validation is not selected. Default = 1.'/> + <param name="C_lower_bound" size="30" type="text" value="0.1" label="Regularization Parameter C (Lower Bound)" help='Defines the lower bound for regularization parameter C when cross-validation is used. Must have a positive value (C > 0) Default = 0.1. '/> + <param name="C_upper_bound" size="30" type="text" value="10" label="Regularization Parameter C (Upper Bound)" help='Defines the upper bound for regularization parameter C when cross-validation is used. Must have a positive value that is larger than the lower bound. Default = 10. '/> + <param name="a" size="30" type="text" value="0.0" label="Coefficient A" help='Used in the kernel functions above. 
Must be greater than zero. However, the default = 0 and translates to a = 1/n_features, where n_features is the number of features. Default = 0.'/> + <param name="b" size="30" type="text" value="0.0" label="Coefficient B" help='Independent term in kernel function. It is only significant in polynomial and sigmoid kernels. Default = 0.'/> + </inputs> + <outputs> + <data name="outClassification" format="tabular" label="${tool.name} on ${on_string}: Classification of the Training Data Set"/> + <data name="outClassificationAccuracy" format='tabular' label="${tool.name} on ${on_string}: Classification Accuracy of the Training Data Set"/> + <data name="outPrediction" format="tabular" label="${tool.name} on ${on_string}: Prediction of the Target Data Set"/> + <data name="outPredictionAccuracy" format='tabular' label="${tool.name} on ${on_string}: Prediction Accuracy of the Target Data Set"/> + </outputs> + <tests> + <test> + <param name="train_wide" value="ST000006_data.tsv"/> + <param name="train_design" value="ST000006_design.tsv"/> + <param name="test_wide" value="ST000006_data.tsv"/> + <param name="test_design" value="ST000006_design.tsv"/> + <param name="group" value="White_wine_type_and_source" /> + <param name="uniqID" value="Retention_Index" /> + <param name="kernel" value="linear"/> + <param name="degree" value="3"/> + <param name="cross_validation" value="none"/> + <param name="C" value="1"/> + <param name="C_lower_bound" value="0.1"/> + <param name="C_upper_bound" value="2"/> + <param name="a" value="1"/> + <param name="b" value="1"/> + <output name="outClassification" file="ST000006_svm_classifier_train_classification.tsv" /> + <output name="outClassificationAccuracy" file="ST000006_svm_classifier_train_classification_accuracy.tsv" /> + <output name="outPrediction" file="ST000006_svm_classifier_target_classification.tsv" /> + <output name="outPredictionAccuracy" file="ST000006_svm_classifier_target_classification_accuracy.tsv" /> + </test> + 
</tests> + <help><![CDATA[ + +**TIP:** +If your data is not TAB delimited, use *Text Manipulation->Convert*. + +**WARNINGS:** + - (1) This script automatically removes spaces and special characters from strings. + - (2) If a feature name starts with a number it will prepend an '_'. + +**Tool Description** + +**NOTE: A minimum of 100 samples is required by the tool for single or double cross validation** + +Given a set of supervised samples in a Training Dataset, the SVM training algorithm builds a model based on these samples that can be used for predicting the categories of new, unclassified samples in a Target Dataset. +The Target Dataset is not used for model training or evaluation, only for prediction based on the finalized model. +SVM classification is performed on the target data and accuracy is estimated for both Target and Training Datasets. + +SVM uses different kernel functions to carry out different types of classification such as radial bassis (gaussian), linear, polynomial, and sigmoid. +The classification model can be trained with and without cross-validation (single or double). + +For single and double cross-validation: the training dataset is split differently when the model fit is performed. + +In single cross-validation: the same data are used to both fit and evaluate the model. + +In double cross-validation: the training dataset is split into pieces and the model fit is performed on one of the pieces and evaluated on the other pieces. + +Under cross-validation, the user specifies Regularization Parameter C and the Upper and Lower bounds of Regularization Parameter C. +For more information about Regularization Parameter C, see references below: + +Cortes, C. and Vapnik, V. 1995. Support-vector networks. Machine Learning. 20(3) 273-297. + +Steinwart, I and Christmann, A. 2008. Support vector machines. Springer Science and Business Media. 
+ + +To use the SVM tool, users need the following information: + +(i) a Training Dataset with known categories in the training design file and +(ii) a Target Dataset with predicted categories in the target design file. +(iii) the name of the Group/Treatment classification column should be the same for both design files. +(iv) the Unique Feature IDs should be the same in both the wide datasets. +(v) the number of Unique Feature IDs should be the same in both the wide datasets. + +------------------------------------------------------------------------------ + +**Input** + + - Four input datasets are required. + +@WIDE@ + +**NOTE:** The sample IDs must match the sample IDs in the Design File +(below). Extra columns will automatically be ignored. + +@METADATA@ + +@GROUP@ + +@UNIQID@ + +**SVM Kernel Function** + + - Kernel functions available for the SVM algorithm. + +**Polynomial Degree** + + - Only used for the polynomial kernel. + +**Cross-Validation Choice** + + - Cross-validation options available for the user. 'None' corresponds to no cross-validation- the user specifies regularization parameter C manually. + + +**Regularization Parameter C** + + - Penalizes potential overfitting, must be positive. + + +**Regularization Parameter C (Lower Bound)** + + - Lower bound for regularization parameter C. Value must be greater than 0. Only if cross-validation is selected. + + +**Regularization Parameter C (Upper Bound)** + + - Upper bound for regularization parameter C. Value must be greater than the Lower Bound. + + +**Coefficient A** + + - Used in the kernel functions above. Must be greater than zero. Default = 0, however, + this translates to a = 1/n_features, where n_features is the number of features. + +**Coefficent B** + + - Independent term in the kernel function. It is only significant in + polynomial and sigmoid kernels. 
+ +------------------------------------------------------------------------------ + +**Output** + +This tool will output two files for the Training dataset and two for the Target dataset: + +Training: + +(1) a TSV file containing the observed and predicted grouping classifications for each sample and +(2) a TSV file containing the accuracy (percentage) of the classification. + +Target: + +(3) a TSV file containing suspected and predicted grouping classifications for each sample and +(4) a TSV file containing the accuracy (percentage) of the prediction in comparison to the suspected grouping specified in the design file. + +**NOTE:** Some knowledge about the SVM classifier algorithm and different kernel types is recommended for users who plan to use the tool frequently with different settings. + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/threshold_based_flags.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,78 @@ +<tool id="secimtools_threshold_based_flags" name="Threshold Based Flags" version="@WRAPPER_VERSION@"> + <description>- Flag features based on a user-specified threshold.</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command><![CDATA[ +threshold_based_flags.py +--input $input +--design $design +--ID $uniqID +--output $output +--group $group + +#if $cutoff: +--cutoff $cutoff +#end if + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/> + <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/> + <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your Wide Dataset that has unique identifiers."/> + <param name="group" type="text" size="30" label="Group/Treatment" help="Name of the column in your design file that contains group classifications. The resulting indicator flags will generated for these group categories." /> + <param name="cutoff" type="integer" optional="true" size="6" value="30000" label="Cutoff" help="Cutoff to use for which values to flag. 
Default = 30,000."/> + </inputs> + <outputs> + <data format="tabular" name="output" label="${tool.name} on ${on_string}" /> + </outputs> + <tests> + <test> + <param name="input" value="ST000006_data.tsv"/> + <param name="design" value="ST000006_design.tsv"/> + <param name="uniqID" value="Retention_Index" /> + <param name="group" value="White_wine_type_and_source" /> + <param name="cutoff" value="3000" /> + <output name="output" file="ST000006_threshold_based_flags_output.tsv" /> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + +**Tool Description** + +This tool flags a feature in a given group with a binary indicator if, for half (or more) of the samples within the group, the feature value is below a user specified threshold or is missing. +The default threshold value of 30,000 is primarily useful for peak intensities from mass spectroscopy data and should be evaluated carefully for other types of values (e.g. for peak height). + +-------------------------------------------------------------------------------- + + +**Input** + + - Two input datasets are required. + +@WIDE@ + +**NOTE:** The sample IDs must match the sample IDs in the Design File (below). +Extra columns will automatically be ignored. + +@METADATA@ + +@UNIQID@ + +@GROUP@ + +**Cutoff Value** + + - Cutoff to use for which values to flag. Default = 30,000. + +-------------------------------------------------------------------------------- + +**Output** + +This tool outputs a TSV file containing indicator flags for each group, where the number of indicator flags is determined by the number of groups. + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_conf.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,40 @@ + <section id="secim" name="SECIM Tools" version=""> + <tool file="SECIMTools/galaxy/anova_fixed.xml"/> + <tool file="SECIMTools/galaxy/blank_feature_filtering_flags.xml"/> + <tool file="SECIMTools/galaxy/bland_altman_plot.xml"/> + <tool file="SECIMTools/galaxy/coefficient_variation_flags.xml"/> + <tool file="SECIMTools/galaxy/compare_flags.xml"/> + <tool file="SECIMTools/galaxy/compound_identification.xml"/> + <tool file="SECIMTools/galaxy/data_normalization_and_rescaling.xml"/> + <tool file="SECIMTools/galaxy/distribution_features.xml"/> + <tool file="SECIMTools/galaxy/distribution_samples.xml"/> + <tool file="SECIMTools/galaxy/hierarchical_clustering_heatmap.xml"/> + <tool file="SECIMTools/galaxy/imputation.xml"/> + <tool file="SECIMTools/galaxy/kruskal_wallis.xml"/> + <tool file="SECIMTools/galaxy/lasso_enet_var_select.xml"/> + <tool file="SECIMTools/galaxy/linear_discriminant_analysis.xml"/> + <tool file="SECIMTools/galaxy/log_and_glog_transformation.xml"/> + <tool file="SECIMTools/galaxy/macros.xml"/> + <tool file="SECIMTools/galaxy/magnitude_difference_flags.xml"/> + <tool file="SECIMTools/galaxy/mahalanobis_distance.xml"/> + <tool file="SECIMTools/galaxy/merge_flags.xml"/> + <tool file="SECIMTools/galaxy/modify_design_file.xml"/> + <tool file="SECIMTools/galaxy/modulated_modularity_clustering.xml"/> + <tool file="SECIMTools/galaxy/multiple_testing_adjustment.xml"/> + <tool file="SECIMTools/galaxy/mzrt_match.xml"/> + <tool file="SECIMTools/galaxy/partial_least_squares.xml"/> + <tool file="SECIMTools/galaxy/principal_component_analysis.xml"/> + <tool file="SECIMTools/galaxy/random_forest.xml"/> + <tool file="SECIMTools/galaxy/remove_selected_features_samples.xml"/> + <tool file="SECIMTools/galaxy/retention_time_flags.xml"/> + <tool file="SECIMTools/galaxy/run_order_regression.xml"/> + <tool file="SECIMTools/galaxy/scatter_plot_2D.xml"/> + <tool 
file="SECIMTools/galaxy/scatter_plot_3D.xml"/> + <tool file="SECIMTools/galaxy/standardized_euclidean_distance.xml"/> + <tool file="SECIMTools/galaxy/summarize_flags.xml"/> + <tool file="SECIMTools/galaxy/svm_classifier.xml"/> + <tool file="SECIMTools/galaxy/threshold_based_flags.xml"/> + <tool file="SECIMTools/galaxy/ttest_single_group.xml"/> + <tool file="SECIMTools/galaxy/ttest.xml"/> + </section>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ttest.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,125 @@ +<tool id="secimtools_ttest" name="T-Test (Paired or Unpaired)" version="@WRAPPER_VERSION@"> + <description>on features.</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command detect_errors="exit_code"><![CDATA[ +ttest.py +--input $input +--design $design +--uniqueID $uniqueID +--group $group +--pairing $pairing +--summaries $summaries +--flags $flags +--volcano $volcano +#if $order + --order $order +#end if + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/> + <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/> + <param name="uniqueID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/> + <param name="group" type="text" size="30" label="Group/Treatment" help="Name of the column in your design file that contains group classifications."/> + <param name="pairing" size="30" display="radio" type="select" value="unpaired" label="Select Test" help="Select either paired (dependent samples) or unpaired (independent samples) tests."> + <option value="unpaired" selected="true">Unpaired (Independent Samples)</option> + <option value="paired">Paired (Dependent Samples)</option> + </param> + <param name="order" type="text" value="" size="30" label="Pairing ID" help="Name of the column in your design file that contains Pairing IDs. 
Ignored for Unpaired (Independent Samples) test."/> + </inputs> + <outputs> + <data format="tabular" name="summaries" label="${tool.name} on ${on_string}: Summaries that include p-values and mean differences."/> + <data format="tabular" name="flags" label="${tool.name} on ${on_string}: Flags that include 0.01, 0.05 and 0.10 significance levels for the differences. "/> + <data format="pdf" name="volcano" label="${tool.name} on ${on_string}: Volcano plots for the differences."/> + </outputs> + <tests> + <test> + <param name="input" value="ST000006_data.tsv"/> + <param name="design" value="ST000006_design.tsv"/> + <param name="uniqueID" value="Retention_Index" /> + <param name="group" value="White_wine_type_and_source" /> + <param name="pairing" value="unpaired" /> + <output name="summaries" file="ST000006_ttest_select_unpaired_summary.tsv" /> + <output name="flags" file="ST000006_ttest_select_unpaired_flags.tsv" /> + <output name="volcano" file="ST000006_ttest_select_unpaired_volcano.pdf" compare="sim_size" delta="10000"/> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + +**Tool Description** + +The tool performs a two-sided t-test for two groups of dependent samples (paired or dependent t-test) or multiple (two or more) groups of independent samples (unpaired or independent t-test). +The user selects which test (paired or unpaired) to perform. + +In an unpaired t-test, the samples within and between groups are independent. +The test is performed for all pairs of conditions specified using the Group/Treatment field. +For example, if there are three treatment conditions (Control, Time1 and Time2) then t-tests will be performed for: (i) Control vs Time1, (ii) Control vs Time2, and (iii) Time1 vs Time2. +Note that this will give slightly different results than the contrast in an ANOVA because the ANOVA uses all groups to estimate the error. 
+ +A paired t-test can be performed for pairs of treatment conditions where sample pairs are known and specified by the user in the Pairing ID field. +Here, the difference between the measurements for the pairs is calculated. +To ensure that the differences are taken in the same order across all pairs, the Group/Treatment variable is required. +The differences will be calculated between the groups in the order that the groups appear in the Design File. +The Pairing ID specifies which samples are paired. A two-sided t-test will be performed for the test that the difference is zero. + +-------------------------------------------------------------------------------- + +**Input** + + - Two input datasets are required. + + +@WIDE@ + +**NOTE:** The sample IDs must match the sample IDs in the Design File +(below). Extra columns will automatically be ignored. + +@METADATA@ + +@UNIQID@ + +**Group/Treatment** + + - List with the name of the column in the Design File that contains group classifications. + +**Pairing ID** + + - Name of the column in your Design File that contains Pairing IDs. An example is given below: + + +----------+--------+--------+ + | sampleID | group | pairID | + +==========+========+========+ + | sample1 | g1 | p1 | + +----------+--------+--------+ + | sample2 | g1 | p2 | + +----------+--------+--------+ + | sample3 | g1 | p3 | + +----------+--------+--------+ + | sample4 | g2 | p1 | + +----------+--------+--------+ + | sample5 | g2 | p2 | + +----------+--------+--------+ + | sample6 | g2 | p3 | + +----------+--------+--------+ + | ... | ... | ... | + +----------+--------+--------+ + + + +-------------------------------------------------------------------------------- + +**Output** + +The tool outputs 3 files: + +(1) a TSV file with the results table containing p-values for each test and the corresponding differences between the means for comparisons between the groups. 
+(2) a TSV file with an indicator flag = 1 if the difference between the groups is statistically significant using provided α levels. +(3) a PDF file with volcano plots for visual inspection of the differences between group means and p-values. The red dashed line in volcano plot(s) corresponds to a p-value = 0.01 cutoff (2 on the negative log base 10 scale). + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ttest_perm.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,92 @@ +<tool id="secimtools_ttest_perm" name="Group Comparison by Permutation" version="@WRAPPER_VERSION@"> + <description>on features.</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command detect_errors="exit_code"><![CDATA[ +ttest_perm.py +--input $input +--design $design +--uniqueID $uniqueID +--group $group +--reps $reps +--summaries $summaries +--flags $flags +--volcano $volcano + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input dataset in wide format and tab separated. If file is not tab separated see TIP below."/> + <param name="design" type="data" format="tabular" label="Design File" help="Design file tab separated. Note you need a 'sampleID' column. If not tab separated see TIP below."/> + <param name="uniqueID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your Wide Dataset that has unique Feature IDs."/> + <param name="group" type="text" size="30" label="Group/Treatment" help="Name of the column in your Design File that contains group classifications."/> + <param name="reps" type="text" size="30" value="" label="Iteration Number" help="Enter the number of iterations you want to carry out."/> + </inputs> + <outputs> + <data format="tabular" name="summaries" label="${tool.name} on ${on_string}: Summaries that include p-values and mean differences."/> + <data format="tabular" name="flags" label="${tool.name} on ${on_string}: Flags that include 0.01, 0.05 and 0.10 significance levels for the differences. 
"/> + <data format="pdf" name="volcano" label="${tool.name} on ${on_string}: Volcano plots for the differences."/> + </outputs> + <tests> + <test> + <param name="input" value="fly_test_sbys.tsv"/> + <param name="design" value="fly_test_design.tsv"/> + <param name="uniqueID" value="rowID" /> + <param name="group" value="mating_source" /> + <param name="reps" value="1000" /> + <output name="summaries" file="fly_ttest_permuted_summary.tsv" /> + <output name="flags" file="fly_ttest_permuted_flags.tsv" /> + <output name="volcano" file="fly_ttest_permuted_volcano.pdf" compare="sim_size" delta="10000"/> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + +**Tool Description** + +The tool performs a permuted two-sided t-test for multiple (two or more) groups of independent samples (unpaired or independent t-test). +The user selects the number of iterations to perform. + +In an unpaired t-test the samples within and between groups are independent. The p-value is calculated by permutation of the data. +The permutation test is performed for all pairs of conditions specified using the Group/Treatment field and for the number of specified iterations. +If there are three treatment conditions (Control, Time1 and Time2) then t-tests will be performed for: (i) Control vs Time1, (ii) Control vs Time2, and (iii) Time1 vs Time2. +Note that this will give slightly different results than the contrast in an ANOVA because the ANOVA uses all groups to estimate the error. + +-------------------------------------------------------------------------------- + +**Input** + + - Two input datasets are required. + + +@WIDE@ + +**NOTE:** The sample IDs must match the sample IDs in the Design File +(below). Extra columns will automatically be ignored. + +@METADATA@ + +@UNIQID@ + +**Group/Treatment** + + - List with the name of the column in the Design File that contains group classifications. 
+ +**Reps** + + - Number of iterations (permutations) to carry out. + +-------------------------------------------------------------------------------- + +**Output** + +The tool outputs 3 files: + +(1) a TSV file with the results table containing p-values for each test and the corresponding differences between the means for comparisons between the groups. +(2) a TSV file with an indicator flag = 1 if the difference between the groups is statistically significant using provided α levels. +(3) a PDF file with volcano plots for visual inspection of the differences between group means and p-values. The red dashed line in volcano plot(s) corresponds to a p-value = 0.01 cutoff (2 on the negative log base 10 scale). + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ttest_single_group.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,100 @@ +<tool id="secimtools_ttest_single_group" name="T-Test (Single Group)" version="@WRAPPER_VERSION@"> + <description>for the specified mean.</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command detect_errors="exit_code"><![CDATA[ +ttest_single_group.py +--input $input +--design $design +--uniqueID $uniqueID +--mu $mu +--summaries $summaries +--flags $flags +--volcano $volcano +#if $group + --group $group +#end if + ]]></command> + <inputs> + <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/> + <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/> + <param name="uniqueID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/> + <param name="mu" type="text" size="30" value = "0" label="Mu" help="The value of the mean under the null hypothesis. Default = 0. "/> + <param name="group" type="text" size="30" label="Group/Treatment [Optional]" help="Name of the column in your design file that contains group classifications."/> + </inputs> + <outputs> + <data format="tabular" name="summaries" label="${tool.name} on ${on_string}: Summaries that include p-values and mean differences."/> + <data format="tabular" name="flags" label="${tool.name} on ${on_string}: Flags that include 0.01, 0.05 and 0.10 significance levels for the differences. 
"/> + <data format="pdf" name="volcano" label="${tool.name} on ${on_string}: Volcano plots for the differences."/> + </outputs> + <tests> + <test> + <param name="input" value="ST000006_data.tsv"/> + <param name="design" value="ST000006_design.tsv"/> + <param name="uniqueID" value="Retention_Index" /> + <param name="mu" value="0" /> + <output name="summaries" file="ST000006_ttest_single_group_no_group_summary.tsv" /> + <output name="flags" file="ST000006_ttest_single_group_no_group_flags.tsv" /> + <output name="volcano" file="ST000006_ttest_single_group_no_group_volcano.pdf" compare="sim_size" delta="10000"/> + </test> + <test> + <param name="input" value="ST000006_data.tsv"/> + <param name="design" value="ST000006_design.tsv"/> + <param name="uniqueID" value="Retention_Index" /> + <param name="mu" value="0" /> + <param name="group" value="White_wine_type_and_source" /> + <output name="summaries" file="ST000006_ttest_single_group_with_group_summary.tsv" /> + <output name="flags" file="ST000006_ttest_single_group_with_group_flags.tsv" /> + <output name="volcano" file="ST000006_ttest_single_group_with_group_volcano.pdf" compare="sim_size" delta="10000"/> + </test> + </tests> + <help><![CDATA[ + +@TIP_AND_WARNING@ + +**Tool Description** + +The tool performs a one sample t-test for each feature. +Two options are available for the t-test: if the user provides the Group/Treatment variable, then the mean for each treatment condition is compared with Mu, the user-specified value of the true mean under the null hypothesis. +If Group/Treatment is not provided, then the mean across all samples is compared to Mu. + +-------------------------------------------------------------------------------- + +**Input** + + - Two input datasets are required. + + +@WIDE@ + +**NOTE:** The sample IDs must match the sample IDs in the Design File +(below). Extra columns will automatically be ignored. 
+ +@METADATA@ + +@UNIQID@ + +**Group/Treatment [Optional]** + + - Name of the column in the Design File that contains group classifications. + +**Mu** + + - The value of the mean under the null hypothesis. Default = 0. + + +-------------------------------------------------------------------------------- + +**Output** + +The tool produces three outputs: + +(1) a TSV file with the results table containing p-values for each test and the corresponding differences between the group means and the mu value under the null. +(2) a TSV file containing indicator flags equal to 1 if the difference between the groups and the mean under the null is statistically significant using the indicated α levels. +(3) a PDF file with volcano plots for visual inspection of the differences between the group means and p-values. The red dashed line in the volcano plot(s) corresponds to a p-value = 0.01 cutoff (2 on the negative log base 10 scale). + + ]]></help> + <expand macro="citations"/> +</tool>