view masigpro.xml @ 4:402e0b9cfd87 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/masigpro commit 7dc16d5d4169a8fbfb3fad3b2212fd8b6d481f5f"
author iuc
date Sat, 27 Nov 2021 09:58:07 +0000
parents 83f8b5ceff43
children
line wrap: on
line source

<!-- TODO remove .1 in tool version with next update -->
<tool id="masigpro" name="maSigPro" version="@TOOL_VERSION@.1+galaxy@VERSION_SUFFIX@">
    <description>Significant Gene Expression Profile Differences in Time Course Gene Expression Data</description>
    <!-- Version tokens: substituted into the tool version attribute and the maSigPro package requirement. -->
    <macros>
        <token name="@TOOL_VERSION@">1.49.3</token>
        <token name="@VERSION_SUFFIX@">0</token>
    </macros>
    <!-- Cross-reference to the bio.tools registry identifier. -->
    <xrefs>
        <xref type="bio.tools">masigpro</xref>
    </xrefs>
    <!-- Runtime packages: coreutils and sed serve the shell part of the command, the other two serve the R script. -->
    <requirements>
        <requirement version="8.25" type="package">coreutils</requirement>
        <requirement version="@TOOL_VERSION@" type="package">bioconductor-masigpro</requirement>
        <requirement version="1.3.2" type="package">r-optparse</requirement>
        <requirement version="4.4" type="package">sed</requirement>
    </requirements>
    <!-- Any of these markers on stdout or stderr makes the job fail. -->
    <stdio>
        <regex source="both" level="fatal" match="Execution halted"
            description="Execution halted." />
        <regex source="both" level="fatal" match="Error in"
            description="An undefined error occurred, please check your input carefully and contact your administrator." />
        <regex source="both" level="fatal" match="Fatal error"
            description="An undefined error occurred, please check your input carefully and contact your administrator." />
    </stdio>
    <!-- Prints the R version line followed by the version of the loaded maSigPro package. -->
    <version_command><![CDATA[
        echo $(R --version | grep version | grep -v GNU)", maSigPro version" $(R --vanilla --slave -e "library(maSigPro); cat(sessionInfo()\$otherPkgs\$maSigPro\$Version)" 2> /dev/null | grep -v -i "WARNING: ")
    ]]></version_command>
    <command>
    <![CDATA[
    ## "advanced" source: assemble the expression matrix from the individual
    ## count files and use the generated design_matrix config file as edesign.
    ## "defaults" source: use the two user-supplied files directly.
    #if str($source.source_selector) == "advanced":
        ## Column 1 is taken from the first file only; column 2 of every file
        ## becomes one data column, in the order of the time-value repeats.
        paste
        #set $start = True
        #set $header = ''
        #for $time in $source.rep_time:
            #for $file in $time.files:
                #if $start:
                    <(cut -f1 '$file')
                    #set $start = False
                #end if
                #set $header += ' "' + $file.name + '"'
                <(cut -f2 '$file')
            #end for
        #end for
        ## NOTE(review): the dataset names collected in the header variable are
        ## written into the sed expression verbatim; a name containing a single
        ## quote or a backslash would break this line. Confirm whether names
        ## need escaping here.
        > data && sed -i '1i$header' data &&
        #if $source.enable_output:
            cp '$design_matrix' '$edesign_out' &&
        #end if
        #set $data = 'data'
        #set $edesign = $design_matrix
    #else:
        #set $data = $source.data
        #set $edesign = $source.edesign
    #end if
    Rscript '${__tool_directory__}/masigpro.R'
    -e '$edesign'
    -d '$data'
    -o '$masigpro_out'
    ## Column positions are only passed for user-supplied edesign files.
    #if str($source.source_selector) == "defaults":
        --time_col $source.time_col
        --repl_col $source.repl_col
    #end if
    --degree $makeDesignMatrix.degree
    --qvalue $p_vector.qvalue
    --min_obs $p_vector.min_obs
    --step_method '$Tfit.step_method'
    --nvar_correction $Tfit.nvar_correction
    --alfa $Tfit.alfa
    --rsq $getSiggenes.rsq
    --vars '$getSiggenes.vars'
    --significant_intercept '$getSiggenes.significant_intercept'
    ## Visualization options are only passed when the PDF is requested.
    #if $pdf.pdf_selector:
        --cluster_data $pdf.seeGenes.clusterData
        -k $pdf.seeGenes.k
        --print_cluster $pdf.seeGenes.print_cluster
        --cluster_method $pdf.seeGenes.clustering.clusterMethod
        #if str($pdf.seeGenes.clustering.clusterMethod) == "hclust":
            --distance $pdf.seeGenes.clustering.distance
            --agglo_method $pdf.seeGenes.clustering.aggloMethod
        #end if
        #if str($pdf.seeGenes.clustering.clusterMethod) == "kmeans":
            --iter_max $pdf.seeGenes.clustering.iterMax
        #end if
        --color_mode $pdf.seeGenes.colorMode
        --show_fit $pdf.seeGenes.showFit
        --show_lines $pdf.seeGenes.showLines
        --cexlab $pdf.seeGenes.cexlab
        --legend $pdf.seeGenes.legend
    #end if
    ## Publish the assembled matrix as a history item when requested.
    #if str($source.source_selector) == "advanced" and $source.enable_output:
        && mv data '$data_out'
    #end if
    ]]>
    </command>
    <configfiles>
        <!--
            design_matrix: rendered only for the "advanced" data source.
            Produces a space-separated experimental design table:
              * header: "Name Time Replicate" followed by one column per
                experimental group name;
              * one row per count file: its quoted name, the time value of the
                repeat entry it belongs to, a replicate number, and a 1/0 flag
                per experimental group.
            A file listed in a Replicates entry gets that entry's 1-based
            index as replicate number; every other file gets a fresh number
            counted upwards from (number of Replicates entries + 1).
            NOTE(review): the template is presumably kept at column 0 so that
            no indentation ends up in the generated file; confirm before
            re-indenting.
        -->
<configfile name="design_matrix">#if str($source.source_selector) == "advanced":
#set $header = "Name Time Replicate"
#for $group in $source.rep_groups:
    #set $header = $header + ' ' + str($group.name)
#end for
$header
#set $c = len($source.rep_repl) + 1
#for $time in $source.rep_time:
    #for $file in $time.files:
        #set $is_repl = False
        #for $i, $repl in enumerate($source.rep_repl):
            #if str($file) in str($repl.files):
                #set $r = $i + 1
                #set $is_repl = True
            #end if
        #end for
        #if $is_repl == False:
            #set $r = $c
            #set $c += 1
        #end if
        #set $line = '"' + str($file.name) + '" ' + str($time.time) + ' ' + str($r)
        #for $group in $source.rep_groups:
            #if str($file) in str($group.files):
                #set $line += " 1"
            #else
                #set $line += " 0"
            #end if
        #end for
$line
    #end for
#end for
#end if
</configfile>
    </configfiles>
    <inputs>
        <!-- Data source switch: ready-made maSigPro input files ("defaults") or
             per-sample count files plus an interactively defined design ("advanced"). -->
        <conditional name="source">
            <param label="Choose data source" name="source_selector"
                help="Choose if you want to provide separate count files (e.g. from HTSeq-count or featureCounts)
                    and define your experiment design matrix here, or if you have maSigPro edesign and data input files already."
                type="select">
                <option value="defaults">Use maSigPro edesign and data files</option>
                <option value="advanced">Separate count data (e.g. from HTSeq-count or featureCounts)</option>
            </param>
            <when value="defaults">
                <param name="edesign" format="tabular,txt" type="data" label="Experiment matrix"
                    help="Matrix describing experimental design. Rows must be arrays and columns experiment descriptors" />
                <param name="data" format="tabular,txt" type="data" label="Gene expression matrix"
                    help="Matrix containing normalized gene expression data. Genes must be in rows and arrays in columns" />
                <param name="time_col" label="Column number containing time values" type="integer" value="1"
                    help="Column number in edesign containing time values. Default is first column" />
                <param name="repl_col" label="Column number containing replicate coding" type="integer" value="2"
                    help="Column number in edesign containing coding for replicate arrays. Default is second column" />
            </when>
            <when value="advanced">
                <param name="enable_output" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Output generated maSigPro input files?"
                    help="Choose if you want to output the generated edesign and data files for direct use in maSigPro as history elements." />
                <!-- One entry per time point; each entry lists the count files measured at that time. -->
                <repeat name="rep_time" title="Time values" min="1" default="1">
                    <param name="time" type="integer" value="0" label="Specify a numerical time value" help="Only numbers will be allowed">
                    <sanitizer>
                        <valid initial="string.digits"></valid>
                    </sanitizer>
                    </param>
                    <param name="files" type="data" format="tabular" multiple="true" label="Counts file(s) at this time value" />
                </repeat>
                <!-- One entry per experimental group; the group name becomes a column header of the design matrix. -->
                <repeat name="rep_groups" title="Experimental groups" min="1" default="1">
                    <param name="name" type="text" value="Group title" label="Specify the name of this experimental group"
                        help="Use a single name without spaces or special characters">
                    </param>
                    <param name="files" type="data" format="tabular" multiple="true"
                        label="Counts file(s) belonging to this experimental group" />
                </repeat>
                <!-- Optional: each entry groups the files that are replicates of one another. -->
                <repeat name="rep_repl" title="Replicates" min="0" default="0">
                    <param name="files" type="data" format="tabular" multiple="true" label="Counts files that are replicates" />
                </repeat>
            </when>
        </conditional>
        <!-- Step 1: polynomial degree of the regression model, passed to the R script as the degree option. -->
        <section name="makeDesignMatrix"
            title="Step 1: make.Design.Matrix - Defining the regression model"
            help="‘make.design.matrix’ creates the design matrix of dummies for
                fitting time series microarray gene expression experiments.">
            <param name="degree" type="integer" value="1"
                label="Degree of regression fit polynome"
                help="The degree of the regression fit polynome. ‘degree’ = 1 returns 
                    linear regression, ‘degree’ = 2 returns quadratic regression, etc" />
        </section>
        <!-- Step 2: significance level and minimum observation count, passed to the R script as the qvalue and min_obs options. -->
        <section name="p_vector" title="Step 2: p.vector - Finding significant genes"
            help="‘p.vector’ performs a regression fit for each gene taking all
                variables present in the model given by a regression matrix and
                returns a list of FDR corrected significant genes">
            <param name="qvalue"
                type="float"
                value="0.05"
                label="Q"
                help="Significance level" />
            <param name="min_obs"
                type="integer"
                value="6"
                label="Minimum values"
                help="Genes with less than this number of true numerical values
                    will be excluded from the analysis. Minimum value to estimate
                    the model is (degree+1)xGroups+1. Default is 6." />
        </section>
        <!-- Step 3: stepwise regression settings, passed to the R script as the step_method, nvar_correction and alfa options. -->
        <section name="Tfit" title="Step 3: T.fit - Finding significant differences"
            help="‘T.fit’ selects the best regression model for each gene using
                stepwise regression. In the maSigPro approach ‘p.vector’ and ‘T.fit’ are subsequent
                steps, meaning that significant genes are first selected on the
                basis of a general model and then the significant variables for
                each gene are found by step-wise regression.">
            <param name="step_method" type="select" label="Step regression"
                help="The step regression can be ‘backward’ or ‘forward’ indicating
                    whether the step procedure starts from the model with all or none
                    variables. With the ‘two.ways.backward’ or ‘two.ways.forward’
                    options the variables are both allowed to get in and out. At each
                    step the p-value of each variable is computed and variables get
                    in/out the model when this p-value is lower or higher than given
                    threshold alfa.">
                <option selected="True" value="backward">backward</option>
                <option value="forward">forward</option>
                <option value="two.ways.backward">two.ways.backward</option>
                <option value="two.ways.forward">two.ways.forward</option>
            </param>
            <!-- Boolean parameter: its command-line value comes from truevalue/falsevalue,
                 so it carries no option children. Unchecked (FALSE) by default. -->
            <param type="boolean" name="nvar_correction" label="nvar correction" truevalue="TRUE" falsevalue="FALSE" checked="false"
                help="When nvar.correction is TRUE the given significance
                    level is corrected by the number of variables in the model." />
            <param name="alfa" type="float" value="0.05" label="alfa" help="Significance level used for variable selection in the stepwise regression" />
        </section>
        <!-- Step 4: gene list extraction settings, passed to the R script as the rsq, vars and significant_intercept options. -->
        <section name="getSiggenes"
            title="Step 4: get.siggenes - Obtaining lists of significant genes"
                help="This function creates lists of significant genes for a set of
                    variables whose significance value has been computed with the
                    ‘T.fit’ function.">
            <param name="rsq" type="float" value="0.7" label="rsq"
                help="cut-off level at the R-squared value for the stepwise
                    regression fit. Only genes with R-squared more than rsq are
                    selected" />
            <param name="vars" type="select" label="Variables"
                help="Variables for which to extract significant genes.
                    ‘all’: generates one single matrix or gene list with all
                    significant genes.

                    ‘each’: generates as many significant genes extractions as
                    variables in the general regression model. Each extraction
                    contains the significant genes for that variable.

                    ‘groups’: generates a significant genes extraction for each
                    experimental group.

                    The difference between ‘each’ and ‘groups’ is that in the
                    first case the variables of the same group (e.g.  ‘TreatmentA’
                    and ‘time*TreatmentA’) will be extracted separately and in the
                    second case jointly.">
                <option selected="True" value="groups">Groups</option>
                <option value="each">Each</option>
                <option value="all">All</option>
            </param>
            <param name="significant_intercept" type="select" label="Significant intercept"
                help="The argument ‘significant.intercept’ modulates the treatment for
                    intercept coefficients to apply for selecting significant genes
                    when ‘vars’ equals ‘groups’. There are three possible values:
                    ‘none’, no significant intercept (differences) are considered
                    for significant gene selection, ‘dummy’, includes genes with
                    significant intercept differences between control and experimental
                    groups, and ‘all’ when both significant intercept coefficient
                    for the control group and significant intercept differences are
                    considered for selecting significant genes.">
                <option selected="True" value="dummy">Dummy</option>
                <option value="none">None</option>
                <option value="all">All</option>
            </param>
        </section>
        <!-- Step 5 (optional): visualization. The see.genes options below are only
             passed to the R script when the PDF is requested. -->
        <conditional name="pdf">
            <param label="Generate visualization PDF" name="pdf_selector" type="boolean"
                truevalue="1" falsevalue="0" checked="true"
                help="Choose if you want to generate a PDF file containing the visualizations" />
            <when value="1">
                <section name="seeGenes" title="Step 5: see.genes - Visualization"
                    help="This function provides visualisation tools for gene expression
                        values in a time course experiment. The function first calls the
                        heatmap function for a general overview of experiment results.
                        Next a partitioning of the data is generated using a clustering
                        method.  The results of the clustering are visualized both as gene
                        expression profiles extended along all arrays in the experiment,
                        as provided by the plot.profiles function, and as summary
                        expression profiles for comparison among experimental groups.">
                    <param name="clusterData" label="Cluster Data" type="integer" value="1"
                        help="Data clustering can be done on the basis of either the original
                            expression values, the regression coefficients, or the t.scores.
                            In case ‘data’ is a ‘get.siggenes’ object, this is given by
                            providing the element names of the list
                            ‘c(sig.profiles,coefficients,t.score)’ or their list
                            position (1,2 or 3)." />
                    <param name="k" type="integer" label="Number of clusters for data partitioning" value="9" />
                    <param name="print_cluster" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true"
                        label="Add cluster information to summary file?"
                        help="Adds columns with the cluster assignment for each gene." />
                    <conditional name="clustering">
                        <param name="clusterMethod" label="Cluster Method" type="select"
                            help="clustering method for data partitioning. Currently
                                ‘hclust’, ‘kmeans’ and ‘Mclust’ are supported">
                            <option selected="True" value="hclust">hclust</option>
                            <option value="kmeans">kmeans</option>
                            <option value="Mclust">Mclust</option>
                        </param>
                        <when value="hclust">
                            <!-- Apart from "cor", the values are spelled in lower case, the way
                                 R's dist() names its methods (matching there is case-sensitive).
                                 NOTE(review): assumes masigpro.R hands this value through to
                                 see.genes/dist unchanged; confirm against the script. -->
                            <param name="distance" type="select" label="Distance measure"
                                help="Distance measurement function when ‘cluster.method’ is
                                    ‘hclust’. Default uses correlation.">
                                <option selected="True" value="cor">Correlation</option>
                                <option value="euclidean">Euclidean</option>
                                <option value="maximum">Maximum</option>
                                <option value="manhattan">Manhattan</option>
                                <option value="canberra">Canberra</option>
                                <option value="binary">Binary</option>
                                <option value="minkowski">Minkowski</option>
                            </param>
                            <param name="aggloMethod" type="select" label="Agglomeration method"
                                help="The agglomeration method to be used when ‘cluster.method’ is ‘hclust’.">
                                <option selected="True" value="ward.D">ward.D</option>
                                <option value="ward.D2">ward.D2</option>
                                <option value="single">single</option>
                                <option value="complete">complete</option>
                                <option value="average">average (= UPGMA)</option>
                                <option value="mcquitty">mcquitty (= WPGMA)</option>
                                <option value="median">median (= WPGMC)</option>
                                <option value="centroid">centroid (= UPGMC)</option>
                            </param>
                        </when>
                        <when value="kmeans">
                            <param name="iterMax" type="integer" label="Maximum number of iterations" value="500"
                                help="Maximum number of iterations when ‘cluster.method’ is ‘kmeans’" />
                        </when>
                        <when value="Mclust">
                        </when>
                    </conditional>
                    <param name="colorMode" label="Color Mode" type="select" help="Color scale for plotting profiles. Can be either ‘rainbow’ or ‘gray’">
                        <option selected="True" value="rainbow">Rainbow</option>
                        <option value="gray">Gray</option>
                    </param>
                    <param name="showFit" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" label="Show regression fit curves?"
                        help="Indicating whether regression fit curves must be plotted" />
                    <param name="showLines" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" label="Draw lines?"
                        help="Indicating whether a line must be drawn joining plotted data points for each group" />
                    <param name="cexlab" type="float" value="0.8" label="Magnification for x labels"
                        help="Graphical parameter magnification to be used for x labels in plotting functions" />
                    <param name="legend" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" label="Add legend to plotting profiles?"
                        help="Indicating whether legend must be added when plotting profiles" />
                </section>
            </when>
            <when value="0">
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Generated data and edesign files: only for the "advanced" source with file output enabled. -->
        <data name="data_out" format="txt" label="maSigPro data file on ${on_string}">
            <filter>source['source_selector'] == 'advanced' and source['enable_output'] == True</filter>
        </data>
        <data name="edesign_out" format="txt" label="maSigPro edesign file on ${on_string}">
            <filter>source['source_selector'] == 'advanced' and source['enable_output'] == True</filter>
        </data>
        <!-- Plot file: collected from Results.pdf in the working directory when the PDF is requested. -->
        <data name="pdf_out" format="pdf" from_work_dir="Results.pdf" label="maSigPro Plot file on ${on_string}">
            <filter>pdf['pdf_selector'] == True</filter>
        </data>
        <!-- Result table: always produced. -->
        <data name="masigpro_out" format="tabular" label="maSigPro result file on ${on_string}" />
    </outputs>
    <tests>
        <!-- Advanced source: seven count files over three time points, one
             replicate pair and two experimental groups; generated input files
             are requested, so all four outputs are compared. -->
        <test>
            <param name="source_selector" value="advanced" />
            <param name="enable_output" value="1" />
            <repeat name="rep_time">
                <param name="time" value="1" />
                <param name="files" value="control_1H.counts,treat_1H.counts" />
            </repeat>
            <repeat name="rep_time">
                <param name="time" value="2" />
                <param name="files" value="control_2H.counts,treat_2H.counts" />
            </repeat>
            <repeat name="rep_time">
                <param name="time" value="3" />
                <param name="files" value="control_3H.counts,treat_3H_1.counts,treat_3H_2.counts" />
            </repeat>
            <repeat name="rep_repl">
                <param name="files" value="treat_3H_1.counts,treat_3H_2.counts" />
            </repeat>
            <repeat name="rep_groups">
                <param name="name" value="Control" />
                <param name="files" value="control_1H.counts,control_2H.counts,control_3H.counts" />
            </repeat>
            <repeat name="rep_groups">
                <param name="name" value="Treatment" />
                <param name="files" value="treat_1H.counts,treat_2H.counts,treat_3H_1.counts,treat_3H_2.counts" />
            </repeat>
            <output name="masigpro_out" file="masigpro_out.tab" />
            <output name="data_out" file="data_out.txt" />
            <output name="edesign_out" file="edesign_out.txt" />
            <output name="pdf_out" file="Results.pdf" />
        </test>
        <!-- Defaults source: uses the edesign/data files that the first test
             compares against as inputs and expects the same result table and PDF. -->
        <test>
            <param name="source_selector" value="defaults" />
            <param name="edesign" value="edesign_out.txt" />
            <param name="data" value="data_out.txt" />
            <output name="masigpro_out" file="masigpro_out.tab" />
            <output name="pdf_out" file="Results.pdf" />
        </test>
        <!-- Defaults source with print_cluster set to FALSE: the result table is
             compared against a separate expected file. -->
        <test>
            <param name="source_selector" value="defaults" />
            <param name="edesign" value="edesign_out.txt" />
            <param name="data" value="data_out.txt" />
            <param name="print_cluster" value="FALSE" />
            <output name="masigpro_out" file="masigpro_out_no_cluster.tab" />
            <output name="pdf_out" file="Results.pdf" />
        </test>
    </tests>
    <help>
<![CDATA[
.. class:: infomark

**What it does**

maSigPro_ is a regression based approach to find genes for which there are significant gene expression profile differences between experimental groups in time course microarray and RNA-Seq experiments.

**Inputs**

The maSigPro wrapper has two options for input data:

    - directly through two separate text files containing the experiment design (edesign) and the data or
    - count tables generated from HTSeq-count. Count tables must be generated for each sample individually.
      
To set up an experimental design from separate count files you first have to select which files belong to a certain time point.
Likewise you can specify which files are replicates. In a third step you have to create the experimental groups and select the related files.
For a more comfortable setup in future analysis you have the option to output the generated edesign and data files.

**Output**

maSigPro_ generates a summary file containing the list of significant genes. Additionally you can obtain a PDF file containing plots of profiles and groups that visualize the clustering analysis.

.. _maSigPro: https://bioconductor.org/packages/release/bioc/html/maSigPro.html
]]>
    </help>
    <citations>
        <!-- NOTE(review): presumably the DOI of the original maSigPro publication; verify. -->
        <citation type="doi">10.1093/bioinformatics/btl056</citation>
    </citations>
</tool>