view cloudmap.xml @ 6:85214e4428fd

upgrade to mimodd version 0.1.7.0
author Wolfgang Maier
date Fri, 11 Sep 2015 16:25:49 +0200
parents bdd1995c9e66
children 3619e85a5477
line wrap: on
line source

<tool id="nacreousmap" name="NacreousMap" version="0.1.7.0">
  <!-- One-line summary of what the tool does. -->
  <description>Map causative mutations by multi-variant linkage analysis.</description>
  <!-- Import externally defined macros. The "requirements" macro expanded below is not
       defined anywhere in this file, so it is presumably provided by toolshed_macros.xml.
       NOTE(review): a second macros element appears after the command section; confirm
       that the Galaxy release in use honors more than one macros child of the tool element. -->
  <macros>
    <import>toolshed_macros.xml</import>
  </macros>
  <expand macro="requirements"/>
  <!-- Command line used to query the version of the wrapped mimodd executable. -->
  <version_command>mimodd version -q</version_command>
  <command>
    ## Positional arguments: the input file and the mapping mode (SVD or VAF).
    mimodd map "${opt.source.ifile}" ${opt.mode}
    ## The sample name parameters are hidden and empty in the cases where they
    ## do not apply and may be left blank otherwise, so each option is emitted
    ## only for a non-empty value.
    #if $str($opt.source.sample):
      -m "${opt.source.sample}"
    #end if
    #if $str($opt.source.related_parent_sample):
      -r "${opt.source.related_parent_sample}"
    #end if
    #if $str($opt.source.unrelated_parent_sample):
      -u "${opt.source.unrelated_parent_sample}"
    #end if
    ## Boolean (or hidden, empty) parameter that expands to its flag or to nothing.
    $opt.source.infer_missing
    -o "$ofile"
    #if $str($opt.source.seqdict_required.required) == "yes":
      -s "${opt.source.seqdict_required.seqdict}"
    #end if
    ## Boolean parameter that expands to its flag or to nothing.
    $opt.source.norm
    #if $len($opt.source.bin_sizes):
      --bin-sizes
      #for $size in $opt.source.bin_sizes:
        "${size.bin_size}"
      #end for
    #end if
    ## The select value itself carries the option flag(s) preceding the report
    ## file; the path is quoted like the other output paths.
    #if $str($opt.source.tabfile):
      $str($opt.source.tabfile) "$tfile"
    #end if
    ## Plot options are only defined when a plotting choice was made. As above,
    ## the select value carries the flag(s) preceding the plot file.
    #if $str($opt.source.plotopts.plots):
      $str($opt.source.plotopts.plots) "$pfile"
      $str($opt.source.plotopts.xlim)
      #if $str($opt.source.plotopts.hylim):
        --ylim-hist $str($opt.source.plotopts.hylim)
      #end if
      ## hcols is either a repeat or a hidden empty parameter; the string test
      ## rejects the hidden variant before the length test is evaluated.
      #if $str($opt.source.plotopts.hcols) and $len($opt.source.plotopts.hcols):
        --hist-colors
        #for $color in $opt.source.plotopts.hcols:
          "${color.hcolor}"
        #end for
      #end if
      #if $str($opt.source.plotopts.sylim):
        --ylim-scatter $str($opt.source.plotopts.sylim)
      #end if
      #if $str($opt.source.plotopts.pcol):
        --points-color "$str($opt.source.plotopts.pcol)"
      #end if
      #if $str($opt.source.plotopts.lcol):
        --loess-color "$str($opt.source.plotopts.lcol)"
      #end if
      #if $str($opt.source.plotopts.span):
        --loess-span "$str($opt.source.plotopts.span)"
      #end if
    #end if
  </command>

  <macros>
    <!-- Inputs shared by both data-source cases of SVD mode: hidden placeholders for the
         sample options, the sequence dictionary choice, the bin sizes, the normalization
         switch and the plotting options. -->
    <macro name="svd_unconditional">
      <expand macro="hidden_vaf_algo_params" />
      <expand macro="seqdict_param" />
      <expand macro="bins" />
      <param name="norm" type="boolean" label="normalize variant counts to bin-width" truevalue="" falsevalue="--no-normalize" checked="true" help="without normalization the tool will just report the number of nucleotides per bin; with normalization the results for different bin-widths will be comparable." />
      <conditional name="plotopts">
        <param name="plots" type="select" label="graphical output settings">
          <option value="">Do not generate graphs.</option>
          <option value="-p">Give me graphics.</option>
        </param>
        <!-- No graphs requested: this case deliberately defines no further inputs. -->
        <when value="" />
        <!-- Graphs requested: scatter plot settings are hidden and empty, only the
             histogram settings are offered. -->
        <when value="-p">
          <expand macro="scatter_default" />
          <param name="hylim" type="text" label="upper limit for the histogram y-axis (leave blank for automatic scaling)" />
          <param name="xlim" type="select" label="x-axis scaling">
            <option value="">preserve relative contig sizes</option>
            <option value="--fit-width">scale each contig to fit the plot width</option>
          </param>
          <expand macro="hist_colors" />
        </when>
      </conditional>
    </macro>
    <!-- Inputs shared by both data-source cases of VAF mode: the bin sizes, the
         normalization switch and the plotting options. -->
    <macro name="vaf_unconditional">
      <expand macro="bins" />
      <param name="norm" type="boolean" label="normalize variant counts to bin-width" truevalue="" falsevalue="--no-normalize" checked="true" />
      <conditional name="plotopts">
        <param name="plots" type="select" label="graphical output settings">
          <option value="">Do not generate graphs.</option>
          <option value="--no-scatter -p">Generate only histograms</option>
          <option value="--no-hist -p">Generate only scatter plots</option>
          <option value="-p">Give me everything (scatter plots and histograms)</option>
        </param>
        <!-- No graphs requested: this case deliberately defines no further inputs. -->
        <when value="" />
        <!-- Histograms only: scatter plot settings are hidden and empty. -->
        <when value="--no-scatter -p">
          <expand macro="scatter_default" />
          <param name="hylim" type="text" label="upper limit for the histogram y-axis (leave blank for automatic scaling)" />
          <param name="xlim" type="select" label="x-axis scaling">
            <option value="">preserve relative contig sizes</option>
            <option value="--fit-width">scale each contig to fit the plot width</option>
          </param>
          <expand macro="hist_colors" />
        </when>
        <!-- Scatter plots only: histogram settings are hidden and empty. -->
        <when value="--no-hist -p">
          <expand macro="hist_default" />
          <param name="sylim" type="text" label="upper limit for the scatter plot y-axis (default: 1)" />
          <param name="xlim" type="select" label="x-axis scaling">
            <option value="">preserve relative contig sizes</option>
            <option value="--fit-width">scale each contig to fit the plot width</option>
          </param>
          <param name="span" type="text" label="span value to be used in calculating the Loess regression line through the scatter data (default=0.1)" help="smaller values give a more responsive curve that often picks up local evidence for tight linkage better, but too small values lead to plotting failures (in that case just rerun the tool with a larger value)." />
          <expand macro="scatter_colors" />
        </when>
        <!-- Both plot types: all histogram and scatter plot settings are offered. -->
        <when value="-p">
          <expand macro="plot_all" />
        </when>
      </conditional>
    </macro>
    <!-- Hidden, empty stand-ins for the sample options, so that the command template can
         reference these names in cases that do not offer them to the user. -->
    <macro name="hidden_vaf_algo_params">
      <param type="hidden" name="sample" value=""/>
      <param type="hidden" name="related_parent_sample" value=""/>
      <param type="hidden" name="unrelated_parent_sample" value=""/>
      <param type="hidden" name="infer_missing" value=""/>
    </macro>
    <!-- Repeatable list of bin sizes; no entries are present by default and none are
         required. The command template emits the bin size option only when at least one
         entry exists. -->
    <macro name="bins">
      <repeat name="bin_sizes" title="bin sizes to analyze variants in (defaults to: 1Mb and 500Kb)" default="0" min="0" help="Values can be entered in bases (e.g., 1000000), kilobases (e.g., 500Kb) or megabases (e.g., 1Mb), but must be integral, i.e. no decimal numbers are allowed.">
        <param name="bin_size" type="text" />
      </repeat>
    </macro>
    <!-- Hidden, empty stand-ins for the scatter plot settings, used in cases that
         do not offer scatter plot options to the user. -->
    <macro name="scatter_default">
      <param type="hidden" name="sylim" value=""/>
      <param type="hidden" name="span" value=""/>
      <param type="hidden" name="pcol" value=""/>
      <param type="hidden" name="lcol" value=""/>
    </macro>
    <!-- Hidden, empty stand-ins for the histogram settings, used in the case that
         offers scatter plot options only. -->
    <macro name="hist_default">
      <param type="hidden" name="hylim" value=""/>
      <param type="hidden" name="hcols" value=""/>
    </macro>
    <!-- Repeatable list of histogram colors; no entries are present by default. The
         per-entry default is given as a hex value (the equivalent of darkgrey), matching
         the hex default used for the scatter point color. The sanitizer lets the
         leading hash character of hex colors through. -->
    <macro name="hist_colors">
      <repeat name="hcols" title="histogram colors" default="0" min="0" help="For each bin size chosen above a histogram will be generated with its color selected from the list provided here (defaults to alternating darkgrey, red).">
        <param name="hcolor" type="color" value="#a9a9a9">
          <sanitizer><valid><add value="#" /></valid></sanitizer>
        </param>
      </repeat>
    </macro>
    <!-- Colors for the scatter plot points and the regression line. Both defaults are
         given as hex values (the line default is the equivalent of red) so that the two
         parameters are consistent. The sanitizer lets the leading hash character of hex
         colors through. -->
    <macro name="scatter_colors">
      <param name="pcol" type="color" value="#454545" label="color to be used for the scatter plot data points (default: gray27)">
        <sanitizer><valid><add value="#" /></valid></sanitizer>
      </param>
      <param name="lcol" type="color" value="#ff0000" label="color to be used for the regression line (default: red)">
        <sanitizer><valid><add value="#" /></valid></sanitizer>
      </param>
    </macro>
    <!-- Full set of plot settings, offered when both histograms and scatter plots
         are requested. -->
    <macro name="plot_all">
      <param type="text" name="hylim"
             label="upper limit for the histogram y-axis (leave blank for automatic scaling)"/>
      <param type="text" name="sylim"
             label="upper limit for the scatter plot y-axis (default: 1)"/>
      <param type="select" name="xlim" label="x-axis scaling">
        <option value="">preserve relative contig sizes</option>
        <option value="--fit-width">scale each contig to fit the plot width</option>
      </param>
      <param type="text" name="span"
             label="span value to be used in calculating the Loess regression line through the scatter data (default=0.1)"
             help="smaller values give a more responsive curve that often picks up local evidence for tight linkage better, but too small values lead to plotting failures (in that case just rerun the tool with a larger value)."/>
      <expand macro="hist_colors"/>
      <expand macro="scatter_colors"/>
    </macro>
    <!-- Optional sequence dictionary input. The dictionary dataset is requested only
         when the user answers "yes"; the command template checks the same value before
         emitting the corresponding option. -->
    <macro name="seqdict_param">
      <conditional name="seqdict_required">
        <param name="required" type="select" label="does this input file require a CloudMap-style sequence dictionary?" help="A sequence dictionary file is required ONLY if the input file does not provide information about the sizes of the chromosomes defined in it. It is NEVER needed for MiModD-generated input files.">
          <option value="no">No</option>
          <option value="yes">Yes</option>
        </param>
        <!-- No dictionary needed: this case deliberately defines no further inputs. -->
        <when value="no" />
        <when value="yes">
          <param name="seqdict" type="data" format="tabular" label="CloudMap-style sequence dictionary file" />
        </when>
      </conditional>
    </macro>
  </macros>
  
  <inputs>
    <conditional name="opt">
      <!-- Test parameter of the outer conditional: selects the mapping mode. Its value is
           passed to the command line as the mode argument. The help text points to the
           data source select for replotting, since no separate replotting mode exists
           among the options. -->
      <param name="mode" type="select" label="type of mapping analysis to perform" help="Select Simple Variant Density (SVD) Mapping to map mutations based on linked inheritance in near isogenic populations, Variant Allele Frequency (VAF) Mapping for bulk segregant analysis. To rapidly replot the result of a previous analysis, select the same mode again and choose a per-variant report file as the data source.">
        <option value="SVD">Simple Variant Density Mapping</option>
        <option value="VAF">Variant Allele Frequency Mapping</option>
      </param>
      <!-- SVD mode: inputs depend on whether a VCF file or a per-variant report is analyzed. -->
      <when value="SVD">
        <conditional name="source">
          <param type="select" name="inputtype" label="data source to use">
            <option value="vcf">VCF file of variants (for de-novo mapping)</option>
            <option value="rep">per-variant report file (for remapping a previous analysis)</option>
          </param>
          <!-- VCF input: an additional per-variant output file can be requested. -->
          <when value="vcf">
            <param type="data" name="ifile" format="vcf"
                   label="input file with variants to analyze"/>
            <expand macro="svd_unconditional"/>
            <param type="select" name="tabfile"
                   label="additional per-variant output file"
                   help="You can either choose to produce a tabular per-variant report, which is useful for fast replotting with different plot settings or a vcf-like CloudMap-compatibility file that can be used as input for the CloudMap EMS Variant Density Mapping tool as an alternative plotting tool.">
              <option value="">Do not generate per-variant output</option>
              <option value="-t">Tabular per-variant report</option>
              <option value="--cloudmap -t">CloudMap compatibility file</option>
            </param>
          </when>
          <!-- Report input: the per-variant output choice is a hidden, empty value. -->
          <when value="rep">
            <param type="data" name="ifile" format="tabular"
                   label="input file with variants to analyze"/>
            <param type="hidden" name="tabfile" value=""/>
            <expand macro="svd_unconditional"/>
          </when>
        </conditional>
      </when>
      <!-- VAF mode: inputs depend on whether a VCF file or a per-variant report is analyzed. -->
      <when value="VAF">
        <conditional name="source">
          <param type="select" name="inputtype" label="data source to use">
            <option value="vcf">VCF file of variants (for de-novo mapping)</option>
            <option value="rep">per-variant report file (for remapping a previous analysis)</option>
          </param>
          <!-- VCF input: sample names and the per-variant output choice are offered. -->
          <when value="vcf">
            <param type="data" name="ifile" format="vcf"
                   label="input file with variants to analyze"/>
            <expand macro="seqdict_param"/>
            <param type="text" name="sample"
                   label="mapping sample name"
                   help="the sample to perform mutation mapping for"/>
            <param type="text" name="related_parent_sample"
                   label="name of the related parent sample"
                   help="the sample that provides variants present in your original mutant strain or in an ancestor (like the pre-mutagenesis strain); leave blank if not available"/>
            <param type="text" name="unrelated_parent_sample"
                   label="name of the unrelated parent sample"
                   help="the sample that provides variants present in the unrelated mapping strain (or in an ancestor of it) used in the mapping cross; leave blank if not available"/>
            <param type="boolean" name="infer_missing"
                   checked="false" truevalue="--infer-missing" falsevalue=""
                   label="Infer alleles for missing parent"
                   help="if variant data for either the related or the unrelated parent strain is not available, the tool can try to infer the alleles present in that parent from the allele spectrum found in the mapping sample. This is an EXPERIMENTAL option that will give a benefit only in certain situations. Enable at your own risk."/>
            <expand macro="vaf_unconditional"/>
            <param type="select" name="tabfile"
                   label="additional per-variant output file"
                   help="You can either choose to produce a tabular per-variant report, which is useful for fast replotting with different plot settings or a vcf-like CloudMap-compatibility file that can be used as input for the CloudMap Hawaiian Variant Mapping tool as an alternative plotting tool.">
              <option value="">Do not generate per-variant output</option>
              <option value="-t">Tabular per-variant report</option>
              <option value="--cloudmap -t">CloudMap compatibility file</option>
            </param>
          </when>
          <!-- Report input: sample names and the per-variant output choice are hidden,
               empty values. -->
          <when value="rep">
            <param type="data" name="ifile" format="tabular"
                   label="input file with variants to analyze"/>
            <expand macro="seqdict_param"/>
            <param type="hidden" name="tabfile" value=""/>
            <expand macro="hidden_vaf_algo_params"/>
            <expand macro="vaf_unconditional"/>
          </when>
        </conditional>
      </when>
    </conditional>
  </inputs>

  <outputs>
    <!-- Binned variant counts; declared without a filter. -->
    <data format="tabular" name="ofile"
          label="MiModD ${opt.mode} Mapping - binned variant counts for ${on_string}"/>
    <!-- Per-variant report; filtered on the value of the tabfile parameter. -->
    <data format="tabular" name="tfile"
          label="MiModD ${opt.mode} Mapping - per-variant report for ${on_string}">
      <filter>(opt['source']['tabfile'])</filter>
    </data>
    <!-- Linkage plots; filtered on the value of the plots parameter. -->
    <data format="pdf" name="pfile"
          label="MiModD ${opt.mode} Mapping - linkage plots for ${on_string}">
      <filter>(opt['source']['plotopts']['plots'])</filter>
    </data>
  </outputs>

  <help>
.. class:: infomark

   **What it does**

This tool is a complete rewrite of, and an improvement on, the EMS Variant Density and Hawaiian Variant Mapping tools of `CloudMap`_. It is the most downstream tool in `mapping-by-sequencing analysis workflows in MiModD`_.

It can be used to analyze and visualize the inheritance pattern of variants detected and selected by other MiModD tools or as an alternative (and more versatile) plotting engine for data generated with `CloudMap`_.

-------------

**Usage Modes:**

This tool can be run in one of two different modes depending on the type of mapping analysis that should be performed:

1) *Simple Variant Density (SVD) Mapping* mode analyzes the density of variants along the reference genome by dividing each chromosome into regions of user-defined size (bins) and counting the variants found in each bin. 
    
   All variants listed in the input file are analyzed in this mode, which means that as input you will typically want to use filtered lists of variants (as produced by the VCF Filter tool). 
    
   The aim of SVD analysis is to identify clusters of variants in an outcrossed strain carrying a selectable unknown mutation, which is interpreted as linkage between the corresponding genomic region and the unknown mutation.
    
   This mode corresponds roughly to EMS Variant Density Mapping in CloudMap.

2) *Variant Allele Frequency (VAF) Mapping* mode analyzes the inheritance pattern in cross-progeny at sites at which the parents are homozygous for different alleles.
    
   The aim of VAF analysis is to identify clusters of variants with (near) homozygous inheritance in an F2 (or later generation) population obtained from a cross between a strain carrying a selectable unknown mutation and an unrelated mapping strain. Such a cluster is interpreted as linkage between the corresponding genomic region and the unknown mutation selected for in the F2 generation.
    
   This mode corresponds roughly to Hawaiian Variant Mapping in CloudMap, but can simultaneously take into account non-reference alleles found in either parent strain (CloudMap users may think of this as a combined Hawaiian Variant and Variant Discovery Mapping analysis).

-------------

**Input:**

Valid inputs for this tool are VCF files (any VCF file in SVD mode, a MiModD-generated multi-sample VCF file in VAF mode) or a CloudMap tabular report file as generated by the Hawaiian Variant Mapping tool. Alternatively, the tool can generate (in both modes) its own tabular report file, which can be used as input instead of the original VCF file when rerunning the tool with different plotting parameters to reduce analysis time.

.. class:: infomark

   CloudMap-generated tabular input files require, as additional input, a CloudMap-style sequence dictionary (even if the original CloudMap analysis was possible without one) as described in the original CloudMap paper. This file has a simple two-column tab-delimited format, in which each line lists the chromosome name (as it appears in the input VCF file) and the up-rounded length of the chromosome in megabases.

-------------

**Output:**

The tool produces up to three output files:

1) a default tabular file of binned variant counts that can be used to plot the data with external software such as Excel,
  
  
2) an optional pdf containing linkage plots, which should look just like the plots produced by CloudMap, but are optimized for file size and display speed and offer more user-configurable parameters and
  
  
3) an optional tabular per-variant report file, which can be configured to be either a valid input file for the corresponding original CloudMap tool (for users who really, really want to continue using CloudMap for plotting) or to be reusable in fast reruns of the tool (which can be useful to experiment with different plotting parameters).

-------------

**Settings:**

1) Analysis settings

   *bin size to analyze variants in* - determines the width of the regions along each chromosome, in which variants are counted and analyzed together. 
     
   Several bin sizes can be specified and for each size you will get a corresponding report section in the binned variant counts file and a histogram plot in the linkage plots file.
   
   *normalize variant counts to bin-width* - if selected (as per default) the variant counts for different bin sizes are not absolute, but normalized to the bin width.
   
   *sample names (in VAF mode only)* - to analyze inheritance patterns, VAF mode needs information about the relationship between the samples defined in the input VCF file:
      
   The *mapping sample name* should be set to the name of the sample for which the inheritance pattern is to be analyzed (the pooled progeny population). 
     
   The *name of the related parent sample* should be that of the parent sample that carried and brought in the unknown mutation to be mapped (or, alternatively, that of a closely related ancestor).

   Finally, the *name of the unrelated parent sample* should be that of the other parent strain used in the cross.
     
   At least one of the parent samples MUST be specified, but if the input file contains variant information for both parents, they can be analyzed together for higher mapping accuracy. If you are reanalyzing a tabular report file from a previous tool run or from CloudMap, the association between variants and samples is already incorporated into the input file and cannot be specified again.

2) Graphical output settings

   .. class:: warningmark
  
      To be able to generate plots the system running MiModD needs to have the statistical programming environment R and its Python interface rpy2 installed.


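   *y-axes scaling* - set the upper limits of the histogram and scatter plot y-axes only if you want to override the defaults (automatic scaling for histograms, an upper limit of 1 for scatter plots)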

   *x-axis scaling* - choose *preserve relative contig sizes* if you want the largest chromosome to fit the page width and smaller chromosomes to appear according to their relative size or choose *scale each contig to fit the plot width* if all chromosomes should exploit the available space

   *span value to be used in calculating the Loess regression line* - this value determines the degree of smoothing of the regression line through the scatterplot data. Information on loess regression and the loess span parameter can be found at http://en.wikipedia.org/wiki/Local_regression. The default is 0.1 as in CloudMap.

   *colors used for plotting* - can be selected freely from the offered palette. For histogram colors, the list of selected colors will be used to provide the colors for the different histograms plotted. If fewer colors than histograms (determined by the number of bin sizes selected) are specified, colors from the list will be recycled.


.. _CloudMap: https://usegalaxy.org/u/gm2123/p/cloudmap
.. _mapping-by-sequencing analysis workflows in MiModD: http://mimodd.readthedocs.org/en/latest/cloudmap.html
  </help>
</tool>