view phylogenetic_tree.xml @ 17:a3af29edcce2

Uploaded Miller Lab Devshed version a51c894f5bed
author miller-lab
date Fri, 28 Sep 2012 11:57:18 -0400
parents 8ae67e9fb6ff
children f04f40a36cc8
line wrap: on
line source

<tool id="gd_phylogenetic_tree" name="Phylogenetic Tree" version="1.0.0">
  <description>: Show genetic relationships among individuals</description>

  <command interpreter="python">
    ## Arguments are positional; they are emitted in the order the script is called with.
    phylogenetic_tree.py "$input"
    ## Individuals: either the literal keyword or the chosen population dataset.
    #if $individuals.choice == '0'
      "all_individuals"
    #elif $individuals.choice == '1'
      "$p1_input"
    #end if
    "$output" "$output.files_path" "$minimum_coverage" "$minimum_quality"
    ## Reference: the dbkey is passed only when the reference columns differ from the
    ## scaffold/pos columns and the user did not switch the reference off; otherwise "none".
    #if not ((str($input.metadata.scaffold) == str($input.metadata.ref)) and (str($input.metadata.pos) == str($input.metadata.rPos))) and (str($include_reference) != '0')
      "$input.metadata.dbkey"
    #else
      "none"
    #end if
    "$data_source"
    ## Drawing flags: each style select contributes either nothing or one letter.
    #set $draw_tree_options = str($branch_style) + str($scale_style) + str($length_style) + str($layout_style)
    #if $draw_tree_options != ''
      "-${draw_tree_options}"
    #else
      ""
    #end if
    ## One trailing "column:name" argument per individual recorded in the dataset metadata.
    #for $indiv_name, $indiv_column in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns)
      #set $column_spec = '%s:%s' % ($indiv_column, $indiv_name)
      "$column_spec"
    #end for
  </command>

  <inputs>
    <param name="input" type="data" format="gd_snp" label="SNP dataset" />

    <!-- Analyse every individual in the dataset, or only those listed in a gd_indivs dataset. -->
    <conditional name="individuals">
      <param name="choice" type="select" label="Individuals">
        <option value="0" selected="true">All individuals</option>
        <option value="1">Individuals in a population</option>
      </param>
      <when value="0" />
      <when value="1">
        <param name="p1_input" type="data" format="gd_indivs" label="Population individuals" />
      </when>
    </conditional>

    <!-- NOTE(review): both thresholds default to 0 although the help text below says they
         cannot both be 0; confirm how the script reacts before changing either default. -->
    <param name="minimum_coverage" type="integer" min="0" value="0" label="Minimum coverage" />

    <param name="minimum_quality" type="integer" min="0" value="0" label="Minimum quality" help="Note: minimum coverage and minimum quality cannot both be 0" />

    <!-- The option values of the two selects below are passed to the command as the strings "0"/"1". -->
    <param name="include_reference" type="select" label="Include reference sequence">
      <option value="1" selected="true">Yes</option>
      <option value="0">No</option>
    </param>

    <param name="data_source" type="select" label="Data source">
      <option value="0" selected="true">sequence coverage</option>
      <option value="1">estimated genotype</option>
    </param>

    <!-- Tree drawing options: the command concatenates the four values into a single
         flag string, so the default choice of each is the empty string. -->
    <param name="branch_style" type="select" display="radio" label="Branch type">
      <option value="" selected="true">square</option>
      <option value="d">diagonal</option>
    </param>

    <param name="scale_style" type="select" display="radio" label="Draw branches to scale">
      <option value="" selected="true">yes</option>
      <option value="s">no</option>
    </param>

    <param name="length_style" type="select" display="radio" label="Show branch lengths">
      <option value="" selected="true">yes</option>
      <option value="b">no</option>
    </param>

    <param name="layout_style" type="select" display="radio" label="Tree layout">
      <option value="" selected="true">horizontal</option>
      <option value="v">vertical</option>
    </param>
  </inputs>

  <outputs>
    <!-- Single HTML dataset; its files_path directory is handed to the script on the
         command line, and the tests expect the tree and matrix files as its extra files. -->
    <data name="output" format="html" />
  </outputs>

  <tests>
    <!-- All individuals, coverage >= 3, quality >= 30, coverage-based distances,
         default drawing options. Every tool parameter is pinned explicitly so the
         expected files do not depend on defaults changing later. -->
    <test>
      <param name="input" value="test_in/sample.gd_snp" ftype="gd_snp" />
      <param name="choice" value="0" />
      <param name="minimum_coverage" value="3" />
      <param name="minimum_quality" value="30" />
      <param name="include_reference" value="1" />
      <param name="data_source" value="0" />
      <param name="branch_style" value="" />
      <param name="scale_style" value="" />
      <param name="length_style" value="" />
      <param name="layout_style" value="" />
      <output name="output" file="test_out/phylogenetic_tree/phylogenetic_tree.html" ftype="html" compare="diff" lines_diff="2">
        <extra_files type="file" name="distance_matrix.phylip" value="test_out/phylogenetic_tree/distance_matrix.phylip" />
        <extra_files type="file" name="informative_snps.txt" value="test_out/phylogenetic_tree/informative_snps.txt" />
        <extra_files type="file" name="mega_distance_matrix.txt" value="test_out/phylogenetic_tree/mega_distance_matrix.txt" />
        <extra_files type="file" name="phylogenetic_tree.newick" value="test_out/phylogenetic_tree/phylogenetic_tree.newick" />
        <!-- The PDF is compared by size only, with a tolerance of 1000. -->
        <extra_files type="file" name="tree.pdf" value="test_out/phylogenetic_tree/tree.pdf" compare="sim_size" delta="1000" />
      </output>
    </test>
  </tests>

  <help>

**Dataset formats**

The input dataset is in gd_snp_ format.
The output is a composite dataset, containing the tree in both text (Newick_)
and PDF formats, as well as supplemental text information.
(`Dataset missing?`_)

.. _gd_snp: ./static/formatHelp.html#gd_snp
.. _Newick: http://evolution.genetics.washington.edu/phylip/newicktree.html
.. _Dataset missing?: ./static/formatHelp.html

-----

**What it does**

This tool uses a gd_snp dataset to determine a kind of "genetic distance"
between each pair of individuals.  That information is used to
produce a tree-shaped figure that depicts how the individuals are related,
both as text files and as a diagram.
The text files include a common tree format, Newick, as well as distance
matrices and counts of informative SNPs for each pairwise comparison.
The informative SNPs can be used as a guide to how reliable the tree is.

The input parameters are:

SNP dataset
  A table of SNPs for various individuals, in gd_snp format.

Individuals
  By default all individuals are included in the analysis, but this can
  optionally be restricted to a subset that has been defined using the
  Specify Individuals tool.

Minimum coverage
  For each pair of individuals, the tool looks for informative SNPs, i.e.,
  where the sequence data for both individuals is adequate according to
  some criterion.  Specifying, say, 7 for this option instructs the tool
  to consider only SNPs with coverage at least 7 in both individuals
  when estimating their "genetic distance".

Minimum quality
  Specifying, say, 37 for this option instructs the tool to consider
  only SNPs with SAMtools quality value at least 37 in both individuals
  when estimating their "genetic distance".

Include reference sequence
  For gd_snp datasets containing columns for a reference sequence, the
  user can ask that the reference be indicated in the tree, to help with
  rooting it.  If the dataset has no reference columns, this option has
  no effect.

Data source
  The genetic distance between two individuals at a given SNP can
  be estimated two ways.  One method is to use the absolute value of the
  difference in the frequency of the first allele (or equivalently, the
  second allele).  For instance, if the first individual has 5 reads of
  each allele and the second individual has respectively 3 and 6 reads,
  then the frequencies are 1/2 and 1/3, giving a distance 1/6 at that
  SNP.  The other approach is to use the SAMtools genotypes to estimate
  the difference in the number of occurrences of the first allele.
  For instance, if the two genotypes are 2 and 1, i.e., the individuals
  are estimated to have respectively 2 and 1 occurrences of the first
  allele at this location, then the distance is 1 (the absolute value
  of the difference of the two numbers).

Output options
  The final four options apply mostly to the graphical drawing of the
  tree, except that the branch lengths are also added to the Newick text
  file.

-----

**Acknowledgments**

To convert the distance matrix to a Newick-formatted tree, we use the
QuickTree program from
http://www.sanger.ac.uk/resources/software/quicktree/ .

To make the diagram we use draw_tree, available at
http://compgen.bscb.cornell.edu/phast/ .

  </help>
</tool>