view maaslin2.xml @ 3:43ccbef89f1d draft

planemo upload for repository https://github.com/biobakery/Maaslin2 commit 62a738f626aee9c8e1f1c5cbd63a59b3390d4ed5
author iuc
date Wed, 26 Jun 2024 09:40:13 +0000
parents faacef62bb54
children
line wrap: on
line source

<tool id="maaslin2" name="MaAsLin 2" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Microbiome Multivariable Association with Linear Models</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="edam_ontology"/>
    <expand macro="xrefs"/>
    <expand macro="requirements"/>
    <command detect_errors="exit_code"><![CDATA[

## Galaxy "data_column" parameters only yield 1-based column indices, so the
## matching header names are read from the metadata file with awk when the
## job runs (the file is never parsed while the command line is being built).
## NOTE(review): f'${i}' appears to be evaluated as a plain Python f-string
## inside the Cheetah directive, giving the awk field reference "$<index>";
## confirm before restructuring these loops.

#if $fixed_effects
#set idx = []
#for $i in $fixed_effects:
    #silent idx.append(f'${i}')
#end for
#set idx_for_awk = ','.join(idx)

## only the header line is needed, so awk exits right after printing it
fixed_effects=`awk -v OFS=',' -F"\t" 'NR == 1 { print $idx_for_awk; exit }' '$input_metadata'` &&
echo 'Assigned fixed effects as:' "\$fixed_effects" &&
#end if


#if $random_effects
#set idx = []
#for $i in $random_effects:
    #silent idx.append(f'${i}')
#end for
#set idx_for_awk = ','.join(idx)

## only the header line is needed, so awk exits right after printing it
random_effects=`awk -v OFS=',' -F"\t" 'NR == 1 { print $idx_for_awk; exit }' '$input_metadata'` &&
echo 'Assigned random effects as:' "\$random_effects" &&
#end if

ln -s '$input_data' 'input_data.tsv'
&&
ln -s '$input_metadata' 'input_metadata.tsv'
&&
Maaslin2.R
## Optional numeric options are tested through their string form: a plain
## "#if $param" is false for 0 / 0.0, which would silently drop an explicit
## zero and leave the option at the script's own default.
#if str($additional_options.min_abundance) != ''
    --min_abundance '$additional_options.min_abundance'
#end if
#if str($additional_options.min_prevalence) != ''
    --min_prevalence '$additional_options.min_prevalence'
#end if
#if str($additional_options.max_significance) != ''
    --max_significance '$additional_options.max_significance'
#end if
#if $additional_options.normalization
    --normalization '$additional_options.normalization'
#end if
#if $additional_options.transform
    --transform '$additional_options.transform'
#end if
#if $additional_options.analysis_method
    --analysis_method '$additional_options.analysis_method'
#end if
## The shell variables are double-quoted so that header names containing
## spaces or glob characters reach Maaslin2.R as a single argument.
#if $random_effects
    --random_effects "\$random_effects"
#end if
#if $fixed_effects
    --fixed_effects "\$fixed_effects"
#end if
#if $reference
    --reference '$reference'
#end if

#if $additional_options.correction
    --correction '$additional_options.correction'
#end if
    $additional_options.standardize
    $output.plot_heatmap
#if str($output.heatmap_first_n) != ''
    --heatmap_first_n '$output.heatmap_first_n'
#end if
    $output.plot_scatter
    --cores 1
    'input_data.tsv'
    'input_metadata.tsv'
    'outputFolder'
&&
zip -r out.zip outputFolder &&
cd outputFolder &&
mkdir -p figures/ &&
## Copy the top-level PDFs into figures/ for the "Plots" collection.
## find is used instead of "cp *.pdf figures" because the unmatched glob makes
## cp exit non-zero when outputFolder holds no PDF, which would fail the job.
find . -maxdepth 1 -type f -name '*.pdf' -exec cp {} figures/ \;


    ]]></command>
    <inputs>
        <!-- The two tabular inputs; the effect selectors below pick their columns from the metadata header. -->
        <param name="input_data" type="data" format="tabular" label="Data (or features) file"/>
        <param name="input_metadata" type="data" format="tabular" label="Metadata file"/>
        <!-- data_column parameters return column indices; the command section translates them to header names. -->
        <param argument="--fixed_effects" type="data_column" data_ref="input_metadata" use_header_names="true"  multiple="true" optional="true" label="Interactions: Fixed effects" help="The fixed effects for the model, comma-delimited for multiple effects, Default value: All " />
        <param argument="--random_effects" type="data_column" data_ref="input_metadata" use_header_names="true" multiple="true" optional="true" label="Random effects" help="The random effects for the model,  comma-delimited for multiple effects, Default: None" />
        <param argument="--reference" type="text"  label="Reference" help="Reference for a variable with more than two levels provided as a string of 'variable,reference' comma delimited for multiple variables. " />
        <section name="additional_options" title="Additional Options" expanded="true">
            <param argument="--min_abundance" type="float" value="0.0" optional="true" label="Minimum abundance" help="The minimum abundance for each feature"/>
            <param argument="--min_prevalence" type="float" value="0.1" optional="true" label="Minimum prevalence" help="The minimum percent of samples for which a feature is detected at minimum abundance"/>
            <param argument="--max_significance" type="float" value="0.25" optional="true" label="Maximum significance" help="The q-value threshold for significance"/>
            <param argument="--normalization" type="select" optional="true" label="The normalization method to apply">
                <option value="TSS" selected="true">TSS</option>
                <option value="CLR">CLR</option>
                <option value="CSS">CSS</option>
                <option value="NONE">NONE</option>
                <option value="TMM">TMM</option>
            </param>
            <param argument="--transform" type="select" optional="true" label="The transform to apply">
                <option value="LOG" selected="true">LOG</option>
                <option value="LOGIT">LOGIT</option>
                <option value="AST">AST</option>
                <option value="NONE">NONE</option>
            </param>
            <param argument="--analysis_method" type="select" optional="true" label="The analysis method to apply">
                <option value="LM" selected="true">LM</option>
                <option value="CPLM">CPLM</option>
                <option value="NEGBIN">NEGBIN</option>
                <option value="ZINB">ZINB</option>
            </param>
            <!-- The default is marked with selected="true" on the option, matching the other selects in this section. -->
            <param argument="--correction" type="select" optional="true" label="Correction" help="The correction method for computing  the q-value, Default: BH ">
                <option value="BH" selected="true">Benjamini-Hochberg(BH)</option>
                <option value="BY">Benjamini-Yekutieli(BY)</option>
                <option value="Bonferroni">Bonferroni</option>
                <option value="Holm">Holm</option>
                <option value="Hochberg">Hochberg</option>
                <option value="Hommel">Hommel</option>
            </param>
            <!-- Boolean flags carry the complete command-line fragment in truevalue/falsevalue. -->
            <param argument="--standardize" type="boolean" truevalue="--standardize TRUE" falsevalue="--standardize FALSE" checked="true" label="Apply z-score so continuous metadata are on  the same scale"/>
        </section>
        <section name="output" title="Set Plotting Output" expanded="true">
            <param argument="--plot_heatmap" type="boolean" truevalue="--plot_heatmap TRUE" falsevalue="--plot_heatmap FALSE" checked="true" label="Generate a heatmap for the significant  associations"/>
            <param argument="--heatmap_first_n" type="integer" value="50" optional="true" label="Heatmap plot first N" help="In heatmap, plot top N features with significant associations"/>
            <param argument="--plot_scatter" type="boolean" truevalue="--plot_scatter TRUE" falsevalue="--plot_scatter FALSE" checked="true" label="Generate scatter plots for the significant associations"/>
            <!-- Not passed to Maaslin2.R in the command section; it only controls whether the "residuals" output is shown. -->
            <param name="residuals_output" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" label="Output data frame with residuals for each feature"/>
        </section>
   </inputs>
   <!-- All outputs are collected from the working directory: out.zip and the outputFolder tree written by the command section. -->
   <outputs>
        <data name="archive_output" format="zip" from_work_dir="out.zip" label="${tool.name} on ${on_string}: zip of the complete output" />
        <data name="all_results" format="tabular" from_work_dir="outputFolder/all_results.tsv" label="All results ordered by increasing q-value"/>
        <data name="significant_results" format="tabular" from_work_dir="outputFolder/significant_results.tsv" label="Q-values smaller than or equal to the threshold"/>
        <!-- Only shown when the "residuals_output" boolean in the "output" section is checked. -->
        <data name="residuals" format="rdata" from_work_dir="outputFolder/fits/residuals.rds" label="Data frame with residuals for each feature">       
            <filter>output['residuals_output'] is True</filter>
        </data>         
        <!-- NOTE(review): "headmap" looks like a typo for "heatmap"; the tests reference this name, so renaming it would change the tool's interface. -->
        <data format="pdf" name="headmap" from_work_dir="outputFolder/figures/heatmap.pdf" label="Heatmap of the significant associations" >
            <filter>output['plot_heatmap'] is True</filter>
        </data>        
        <!-- Collects every file found in outputFolder/figures/ (the command section copies the top-level PDFs there, heatmap.pdf included); filtered on plot_scatter only. -->
        <collection name="figures_pdfs" type="list" label="Plots" >
            <discover_datasets pattern="(?P&lt;designation&gt;.+)" directory="outputFolder/figures/" format="pdf"/>
            <filter>output['plot_scatter'] is True</filter>
        </collection>
    </outputs>
    <tests>
        <!-- Test 1: six fixed effects (columns 4,9,10,11,6,3) plus two random effects (columns 2,5) with reference "diagnosis,nonIBD"; asserts on every output, including the per-variable plots in the collection. -->
        <test expect_num_outputs="6">
            <param name="input_data" value="HMP2_taxonomy.tsv"/>
            <param name="input_metadata" value="HMP2_metadata.tsv"/>
            <param name="random_effects" value= "2,5"/> 
            <param name="fixed_effects" value="4,9,10,11,6,3"/> 
            <param name="reference" value="diagnosis,nonIBD"/>
            <section name="additional_options">
                <param name="min_abundance" value="0.0"/>
                <param name="min_prevalence" value="0.1"/>
                <param name="max_significance" value="0.25"/>
                <param name="normalization" value="NONE"/>
                <param name="transform" value="AST"/>
                <param name="analysis_method" value="LM"/>
                <param name="correction" value="BH"/>
                <param name="standardize" value="false"/>
            </section>
            <section name="output">
                <param name="plot_heatmap" value="true"/>
                <param name="heatmap_first_n" value="50"/>
                <param name="plot_scatter" value="true"/>
                <param name="residuals_output" value="true"/>
            </section>
            <output name="archive_output">
                <assert_contents>
                    <has_size value="15005328" delta="1000000" />
                </assert_contents>
            </output>
            <output name="all_results">
                <assert_contents>
                    <has_text text="feature"/>
                    <has_n_lines n="610"/>
                    <has_n_columns n="9"/>
                </assert_contents>
            </output>
            <output name="significant_results">
                <assert_contents>
                    <has_text text="dysbiosisCD"/>
                    <has_n_lines n="159" delta="10"/>
                    <has_n_columns n="9"/>
                </assert_contents>
            </output>
            <output name="residuals">
                <assert_contents>
                    <has_size value="462386" />
                </assert_contents>
            </output>
            <output name="headmap">
                <assert_contents>
                    <has_size value="7373" delta="1000" />
                </assert_contents>
            </output>
            <output_collection name="figures_pdfs" type="list">
                <element name="heatmap.pdf" ftype="pdf">
                    <assert_contents>
                        <has_size value="7373" delta="1000" />
                    </assert_contents>
                </element>
                <element name="age.pdf" ftype="pdf">
                    <assert_contents>
                        <has_size value="407859" delta="100000" />
                    </assert_contents>
                </element>
                <element name="antibiotics.pdf" ftype="pdf">
                    <assert_contents>
                        <has_size value="1590476" delta="1000000" />
                    </assert_contents>
                </element>
                <element name="diagnosis.pdf" ftype="pdf">
                    <assert_contents>
                        <has_size value="1407572" delta="1000000" />
                    </assert_contents>
                </element>
                <element name="dysbiosisCD.pdf" ftype="pdf">
                    <assert_contents>
                        <has_size value="2328523" delta="1000000" />
                    </assert_contents>
                </element>
                <element name="dysbiosisnonIBD.pdf" ftype="pdf">
                    <assert_contents>
                        <has_size value="1032602" delta="1000000" />
                    </assert_contents>
                </element>
                <element name="dysbiosisUC.pdf" ftype="pdf">
                    <assert_contents>
                        <has_size value="1037005" delta="1000000" />
                    </assert_contents>
                </element>                                                                        
            </output_collection>
        </test>
        <!-- Test 2: fixed effects only (columns 4 and 9), no random effects, reference "diagnosis,nonIBD"; the "headmap" output is counted in expect_num_outputs but not asserted on. -->
        <test expect_num_outputs="6">
            <param name="input_data" value="HMP2_taxonomy.tsv"/>
            <param name="input_metadata" value="HMP2_metadata.tsv"/>
            <param name="fixed_effects" value="4,9"/>
            <param name="reference" value="diagnosis,nonIBD"/>
            <section name="additional_options">
                <param name="min_abundance" value="0.0"/>
                <param name="min_prevalence" value="0.1"/>
                <param name="max_significance" value="0.25"/>
                <param name="normalization" value="NONE"/>
                <param name="transform" value="AST"/>
                <param name="analysis_method" value="LM"/>
                <param name="correction" value="BH"/>
                <param name="standardize" value="false"/>
            </section>
            <section name="output">
                <param name="plot_heatmap" value="true"/>
                <param name="heatmap_first_n" value="50"/>
                <param name="plot_scatter" value="true"/>
                <param name="residuals_output" value="true"/>
            </section>
            <output name="archive_output">
                <assert_contents>
                    <has_size value="12630049" delta="1000000" />
                </assert_contents>
            </output>
            <output name="all_results">
                <assert_contents>
                    <has_text text="feature"/>
                    <has_n_lines n="262"/>
                    <has_n_columns n="9"/>
                </assert_contents>
            </output>
            <output name="significant_results">
                <assert_contents>
                    <has_text text="diagnosis"/>
                    <has_n_lines n="175" delta="5"/>
                    <has_n_columns n="9"/>
                </assert_contents>
            </output>
            <output name="residuals">
                <assert_contents>
                    <has_size value="366875"/>
                </assert_contents>
            </output>
            <output_collection name="figures_pdfs" type="list">
                <element name="diagnosis.pdf" ftype="pdf">
                    <assert_contents>
                        <has_size value="6234127" delta="1000000" />
                    </assert_contents>
                </element>                                                              
            </output_collection>
        </test>
        <!-- Test 3: fixed effects columns 2 and 4 with a two-variable reference string ("site,Cedars-Sinai,diagnosis,UC") and a non-zero min_abundance (0.0001). -->
        <test expect_num_outputs="6">
            <param name="input_data" value="HMP2_taxonomy.tsv"/>
            <param name="input_metadata" value="HMP2_metadata.tsv"/>
            <param name="fixed_effects" value="2,4"/>
            <param name="reference" value="site,Cedars-Sinai,diagnosis,UC"/>
            <section name="additional_options">
                <param name="min_abundance" value="0.0001"/>
                <param name="min_prevalence" value="0.1"/>
                <param name="max_significance" value="0.25"/>
                <param name="normalization" value="NONE"/>
                <param name="transform" value="AST"/>
                <param name="analysis_method" value="LM"/>
                <param name="correction" value="BH"/>
                <param name="standardize" value="false"/>
            </section>
            <section name="output">
                <param name="plot_heatmap" value="true"/>
                <param name="heatmap_first_n" value="50"/>
                <param name="plot_scatter" value="true"/>
                <param name="residuals_output" value="true"/>
            </section>
            <output name="archive_output">
                <assert_contents>
                    <has_size value="18278259" delta="1000000" />
                </assert_contents>
            </output>
            <output name="all_results">
                <assert_contents>
                    <has_text text="feature"/>
                    <has_n_lines n="415" delta="10"/>
                    <has_n_columns n="9"/>
                </assert_contents>
            </output>
            <output name="significant_results">
                <assert_contents>
                    <has_text text="diagnosis"/>
                    <has_n_lines n="300" delta="15"/>
                    <has_n_columns n="9"/>
                </assert_contents>
            </output>
            <output name="residuals">
                <assert_contents>
                    <has_size value="363118"/>
                </assert_contents>
            </output>
            <output name="headmap">
                <assert_contents>
                    <has_size value="7000" delta="1000" />
                </assert_contents>
            </output>
            <output_collection name="figures_pdfs" type="list">
                <element name="heatmap.pdf" ftype="pdf">
                    <assert_contents>
                        <has_size value="7693" delta="100" />
                    </assert_contents>
                </element>
                <element name="diagnosis.pdf" ftype="pdf">
                    <assert_contents>
                        <has_size value="6061545" delta="1000000" />
                    </assert_contents>
                </element>
                                                                        
            </output_collection>
        </test> 
        <!-- Test 4: fixed effects columns 7 and 9 with random effect column 3 and no reference; uses TSS normalization, LOG transform, BY correction and standardization enabled. -->
        <test expect_num_outputs="6">  
      	<param name="input_data" value="HMP2_taxonomy.tsv"/>
            <param name="input_metadata" value="HMP2_metadata.tsv"/>
            <param name="fixed_effects" value="7,9"/>
            <param name="random_effects" value="3" />
          
            <section name="additional_options">
                <param name="min_abundance" value="0.0"/>
                <param name="min_prevalence" value="0.1"/>
                <param name="max_significance" value="0.25"/>
                <param name="normalization" value="TSS"/>
                <param name="transform" value="LOG"/>
                <param name="analysis_method" value="LM"/>
                <param name="correction" value="BY"/>
                <param name="standardize" value="True"/>
            </section>
            <section name="output">
                <param name="plot_heatmap" value="true"/>
                <param name="heatmap_first_n" value="50"/>
                <param name="plot_scatter" value="true"/>
                <param name="residuals_output" value="true"/>
            </section>
            <output name="archive_output">
                <assert_contents>
                    <has_size value="8567935" delta="1000000" />
                </assert_contents>
            </output>
            <output name="all_results">
                <assert_contents>
                    <has_text text="feature"/>
                    <has_n_lines n="175" delta="10"/>
                    <has_n_columns n="9"/>
                </assert_contents>
            </output>
            <output name="significant_results">
                <assert_contents>
                    <has_text text="dysbiosisnonIBD"/>
                    <has_n_lines n="95" delta="5"/>
                    <has_n_columns n="9"/>
                </assert_contents>
            </output>
            <output name="residuals">
                <assert_contents>
                    <has_size value="434087"/>
                </assert_contents>
            </output>
            <output_collection name="figures_pdfs" type="list">
                <element name="heatmap.pdf" ftype="pdf">
                    <assert_contents>
                        <has_size value="7000" delta="1000" />
                    </assert_contents>
                </element>                                                              
            </output_collection>
        </test> 
       
            
    </tests>
    <help><![CDATA[
@HELP_HEADER@
MaAsLin 2
=========
MaAsLin2 is the next generation of MaAsLin (Microbiome Multivariable Association with Linear Models).

Input
=====
MaAsLin2 requires two input files:

    - Data (or features) tabular file
        - This file is tab-delimited.
        - Formatted with features as columns and samples as rows.
        - The transpose of this format is also okay.
        - Possible features in this file include taxonomy or genes.
    - Metadata tabular file
        - This file is tab-delimited.
        - Formatted with metadata variables (e.g. gender or age) as columns and samples as rows.
        - The transpose of this format is also okay.
        - Possible metadata in this file include gender or age.

The data file can contain samples not included in the metadata file (along with the reverse case). For both cases, those samples not included in both files will be removed from the analysis. Also the samples do not need to be in the same order in the two files.

Output
======
1- Data output files
    - All results ordered by increasing q-value (tabular file)
        - Full list of all tested associations (not only those that pass MaAsLin2's significance threshold), ordered by increasing q-values
        - This includes the same data as the data.frame returned.
        - This file contains all results ordered by increasing q-value.
        - The first columns are the metadata and feature names.
        - The next two columns are the value and coefficient from the model.
        - The next column is the standard deviation from the model.
        - The N column is the total number of data points.
        - The N.not.zero column is the total of non-zero data points.
        - The pvalue from the calculation is the second to last column.
        - The qvalue is computed with p.adjust with the correction method.
    - Q-values smaller than or equal to the threshold (tabular file)
        - This file is a subset of the results in the first file.
        - It only includes associations with q-values less than or equal to the threshold.
    - Data frame with residuals for each feature (R data file)
        - This file contains a data frame with residuals for each feature.
        
Correction methods to compute the q-value : https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/p.adjust

2- Visualization output files
    - Heatmap of the significant associations (PDF file)
        - This file contains a heatmap of the significant associations.
    - A plot for every significant association found (PDF file(s))
        - A plot is generated for each significant association.
        - Scatter plots are used for continuous metadata.
        - Box plots are for categorical data.
        - Data points plotted are after normalization, filtering, and transform.

    ]]></help> 
    <citations>
        <citation type="doi">10.1101/2021.01.20.427420</citation>
    </citations>
 </tool>