Mercurial > repos > iuc > gubbins

<tool id="gubbins" name="Gubbins" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01">
    <description>Recombination detection in Bacteria</description>
    <!-- Tokens substituted wherever @TOOL_VERSION@ / @VERSION_SUFFIX@ appear in this file. -->
    <macros>
        <!-- Used in the tool version attribute and as the version of the package requirement. -->
        <token name="@TOOL_VERSION@">3.2.1</token>
        <!-- Appended after "+galaxy" in the tool version attribute. -->
        <token name="@VERSION_SUFFIX@">0</token>
    </macros>
    <!-- Cross-reference to the bio.tools entry named gubbins. -->
    <xrefs>
        <xref type="bio.tools">gubbins</xref>
    </xrefs>
    <!-- Software dependency: the gubbins package at the version given by @TOOL_VERSION@. -->
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">gubbins</requirement>
    </requirements>

    <!-- Asks the run_gubbins.py executable to report its version. -->
    <version_command>run_gubbins.py --version</version_command>

    <!--
        Symlinks the input alignment to a fixed name, runs run_gubbins.py on it
        with the values from the "adv" and "really_adv" sections, then renames
        each result file to a fixed name matching the from_work_dir attributes
        of the outputs section. Any non-zero exit code is treated as a failure
        (detect_errors="exit_code").
    -->
    <command detect_errors="exit_code"><![CDATA[

        ln -s '$alignment_file' foo.aln &&

        run_gubbins.py

        --threads \${GALAXY_SLOTS:-1}

        -i '$adv.iterations'
        -z '$adv.converge_method'
        $adv.extensive_search

        #if $adv.outgroup
            -o '$adv.outgroup'
        #end if

        -t '$really_adv.tree_builder'
        -m '$really_adv.min_snps'
        -f '$really_adv.filter_percentage'
        -a '$really_adv.min_window_size'
        -b '$really_adv.max_window_size'
        -p '$really_adv.p_value'
        --trimming-ratio '$really_adv.trimming_ratio'
        $really_adv.remove_identical_sequences

        foo.aln

        ## Required because the p-value is included as prefix
        ## The node_labelled tree is moved first and loses its .tre suffix, so the
        ## following *final_tree.tre glob no longer matches it.

        && mv *branch_base_reconstruction.embl branch_base_reconstruction.embl
        && mv *filtered_polymorphic_sites.fasta filtered_polymorphic_sites.fasta
        && mv *filtered_polymorphic_sites.phylip filtered_polymorphic_sites.phylip
        && mv *node_labelled.final_tree.tre node_labelled.final_tree
        && mv *final_tree.tre final_tree.tre
        && mv *per_branch_statistics.csv per_branch_statistics.csv
        && mv *recombination_predictions.embl recombination_predictions.embl
        && mv *recombination_predictions.gff recombination_predictions.gff
        && mv *summary_of_snp_distribution.vcf summary_of_snp_distribution.vcf

    ]]></command>
    <inputs>
        <!-- The whole genome alignment to analyse; the only mandatory input. -->
        <param type="data" name="alignment_file" format="fasta" label="Whole genome alignment file" help="Whole genome alignment file in fasta format"/>

        <!-- Output file picker: each value is tested by the filter of one dataset in the outputs section. -->
        <param name="outfiles" type="select" multiple="true" display="checkboxes" label="Select the required output files" help="Default selections are the Final Tree in Newick format, the Recombination Predictions in gff3 format and the Summary of SNP Distribution">
            <option value="ftree" selected="true">Final Tree in newick format</option>
            <option value="gff" selected="true">Recombination Predictions in gff3 format</option>
            <option value="vcf" selected="true">Summary of SNP Distribution in vcf format</option>
            <option value="recomb_embl">Recombination Predictions in embl format</option>
            <option value="fpoly">Filtered Polymorphic Sites in fasta format</option>
            <option value="ppoly">Filtered Polymorphic Sites in phylip format</option>
            <option value="stats">Per Branch Statistics in csv format</option>
            <option value="baseb">Base Branch Reconstruction in embl format</option>
        </param>

        <!-- A semi advanced section, really shouldn't have to change anything. -->
        <section name="adv" title="Advanced options" expanded="True">
            <param argument="--iterations" type="integer" label="Iterations" min="0" value="5" help="Maximum No. of iterations, default is 5" />
            <param argument="--converge-method" type="select" label="Convergence method" help="Criteria to use to know when to halt iterations">
                <option value="weighted_robinson_foulds" selected="True">Weighted Robinson Foulds</option>
                <option value="robinson_foulds">Robinson Foulds</option>
                <option value="recombination">Recombination</option>
            </param>
            <!--
                Characters outside the whitelist are dropped (invalid_char="").
                The comma is whitelisted so that the comma separated list of
                names described in the help text reaches the command intact.
            -->
            <param argument="--outgroup" type="text" label="Outgroup" value="" help="Outgroup name for rerooting. A list of comma separated names can be used if they form a clade.">
                <sanitizer invalid_char="">
                    <valid initial="string.letters,string.digits">
                        <add value="+" />
                        <add value="-" />
                        <add value=" " />
                        <add value="_" />
                        <add value="," />
                    </valid>
                </sanitizer>
            </param>
            <param argument="--extensive-search" type="boolean" truevalue="--extensive-search" falsevalue="" checked="false" label="Extensive search " help="Default: False" />
        </section>

        <!-- a really advanced section, play with this at your peril! -->
        <section name="really_adv" title="Really advanced options - change these if you really know what you are doing." expanded="false">
            <param argument="--tree-builder" type="select" label="Tree builder" help="Application to use for tree building, default RAxML">
                <option value="raxml" selected="true">RAxML</option>
                <option value="raxmlng">RAxML-NG</option>
                <option value="iqtree">IQ-TREE</option>
                <option value="fasttree">FastTree</option>
                <option value="hybrid">Hybrid</option>
                <option value="rapidnj">RapidNJ</option>
            </param>
            <param argument="--min-snps" type="integer" label="Minimum SNPs" min="0" value="3" help="Min SNPs to identify a recombination block. Default: 3" />
            <param argument="--filter-percentage" type="float" label="Filter percentage" min="0" value="25" help="Filter out taxa with more than this percentage of gaps. Default: 25" />
            <param argument="--min-window-size" type="integer" label="Minimum window size" min="0" value="100" help="Minimum window size. Default: 100" />
            <param argument="--max-window-size" type="integer" label="Maximum window size" min="0" value="10000" help="Maximum window size. Default: 10000" />
            <param argument="--remove-identical-sequences" type="boolean" label="Remove identical sequences" falsevalue="" truevalue="-d" />
            <param argument="--p-value" type="float" min="0" max="1" value="0.05" label="p-Value" help="p-Value for detecting recombinations. Default: 0.05" />
            <!--
                The argument is spelled with a hyphen to match the flag used in
                the command template; the parameter name derived from it is
                still trimming_ratio, so the template reference is unaffected.
            -->
            <param argument="--trimming-ratio" type="float" min="0" value="1" label="Trimming ratio" help="Ratio of log probabilities used to trim recombinations. Default: 1" />
        </section>
    </inputs>

    <!--
        One dataset per renamed Gubbins result file. Each dataset is collected
        from the working directory via from_work_dir and is only created when
        its key is selected in the "outfiles" parameter.
    -->
    <outputs>
        <data name="final_tree"
              format="txt"
              label="${tool.name} on ${on_string} Final Tree"
              from_work_dir="final_tree.tre">
            <filter>outfiles and 'ftree' in outfiles</filter>
        </data>
        <data name="recomb_pred_gff"
              format="gff3"
              label="${tool.name} on ${on_string} Recombinations Prediction gff"
              from_work_dir="recombination_predictions.gff">
            <filter>outfiles and 'gff' in outfiles</filter>
        </data>
        <data name="recomb_pred_embl"
              format="embl"
              label="${tool.name} on ${on_string} Recombinations Prediction embl"
              from_work_dir="recombination_predictions.embl">
            <filter>outfiles and 'recomb_embl' in outfiles</filter>
        </data>
        <data name="filt_polymorph_fna"
              format="fasta"
              label="${tool.name} on ${on_string} Filtered Polymorphic Sites fasta"
              from_work_dir="filtered_polymorphic_sites.fasta">
            <filter>outfiles and 'fpoly' in outfiles</filter>
        </data>
        <data name="filt_polymorph_phy"
              format="phylip"
              label="${tool.name} on ${on_string} Filtered Polymorphic Sites phylip"
              from_work_dir="filtered_polymorphic_sites.phylip">
            <filter>outfiles and 'ppoly' in outfiles</filter>
        </data>
        <data name="per_b_stat_csv"
              format="csv"
              label="${tool.name} on ${on_string} Per Branch Statistics csv"
              from_work_dir="per_branch_statistics.csv">
            <filter>outfiles and 'stats' in outfiles</filter>
        </data>
        <data name="sum_snp_vcf"
              format="vcf"
              label="${tool.name} on ${on_string} Summary of SNP Distribution vcf"
              from_work_dir="summary_of_snp_distribution.vcf">
            <filter>outfiles and 'vcf' in outfiles</filter>
        </data>
        <data name="base_branch_embl"
              format="embl"
              label="${tool.name} on ${on_string} Branch Base Reconstruction embl"
              from_work_dir="branch_base_reconstruction.embl">
            <filter>outfiles and 'baseb' in outfiles</filter>
        </data>
    </outputs>


    <tests>
        <!-- Defaults only: the three pre-selected outputs are produced. -->
        <test expect_num_outputs="3">
            <param name="alignment_file" value="multiple_recombinations.aln" ftype="fasta" />
            <output name="recomb_pred_gff">
                <assert_contents>
                    <has_text text="##gff-version 3" />
                </assert_contents>
            </output>
            <output name="sum_snp_vcf">
                <assert_contents>
                    <has_text text="##fileformat=VCFv4.2" />
                </assert_contents>
            </output>
        </test>

        <!-- Non-default iteration limit. -->
        <test expect_num_outputs="3">
            <param name="alignment_file" value="multiple_recombinations.aln" ftype="fasta" />
            <section name="adv">
                <param name="iterations" value="3"/>
            </section>
            <output name="recomb_pred_gff">
                <assert_contents>
                    <has_text text="##gff-version 3" />
                </assert_contents>
            </output>
            <output name="sum_snp_vcf">
                <assert_contents>
                    <has_text text="##fileformat=VCFv4.2" />
                </assert_contents>
            </output>
        </test>

        <!-- Non-default convergence criterion. -->
        <test expect_num_outputs="3">
            <param name="alignment_file" value="multiple_recombinations.aln" ftype="fasta" />
            <section name="adv">
                <param name="converge_method" value="recombination" />
            </section>
            <output name="recomb_pred_gff">
                <assert_contents>
                    <has_text text="##gff-version 3" />
                </assert_contents>
            </output>
            <output name="sum_snp_vcf">
                <assert_contents>
                    <has_text text="##fileformat=VCFv4.2" />
                </assert_contents>
            </output>
        </test>

        <!-- Every output selected. -->
        <test expect_num_outputs="8">
            <param name="alignment_file" value="multiple_recombinations.aln" ftype="fasta" />
            <param name="outfiles" value="gff,vcf,ftree,recomb_embl,fpoly,ppoly,stats,baseb"/>
            <section name="adv">
                <param name="converge_method" value="recombination" />
            </section>
            <output name="recomb_pred_gff">
                <assert_contents>
                    <has_text text="##gff-version 3" />
                </assert_contents>
            </output>
            <output name="sum_snp_vcf">
                <assert_contents>
                    <has_text text="##fileformat=VCFv4.2" />
                </assert_contents>
            </output>
            <output name="filt_polymorph_fna">
                <assert_contents>
                    <has_text text=">sequence_1" />
                </assert_contents>
            </output>
            <output name="recomb_pred_embl">
                <assert_contents>
                    <has_text text="FT" />
                </assert_contents>
            </output>
            <output name="filt_polymorph_phy">
                <assert_contents>
                    <has_text text="sequence_1" />
                </assert_contents>
            </output>
            <output name="per_b_stat_csv">
                <assert_contents>
                    <has_text text="Node" />
                </assert_contents>
            </output>
            <output name="base_branch_embl">
                <assert_contents>
                    <has_text text="FT" />
                </assert_contents>
            </output>
            <output name="final_tree">
                <assert_contents>
                    <has_text text="((sequence_" />
                </assert_contents>
            </output>
        </test>

        <!--
            Test new options.
            NOTE(review): the parameter below was previously misspelled as
            expensive_research, which matches no input, so the extensive search
            flag was not being exercised. With the name corrected the flag is
            switched on; re-run this test and confirm the has_n_lines counts.
        -->
        <test expect_num_outputs="8">
            <param name="alignment_file" value="multiple_recombinations.aln" ftype="fasta" />
            <param name="outfiles" value="ftree,gff,vcf,recomb_embl,fpoly,ppoly,stats,baseb"/>
            <section name="adv">
                <param name="extensive_search" value="true"/>
            </section>
            <section name="really_adv">
                <param name="tree_builder" value="fasttree"/>
                <param name="remove_identical_sequences" value="true"/>
            </section>
            <output name="recomb_pred_gff">
                <assert_contents>
                    <has_text text="##gff-version 3" />
                    <has_text text="sequence_4" />
                </assert_contents>
            </output>
            <!-- Both VCF assertions live in one output element; it used to be declared twice. -->
            <output name="sum_snp_vcf">
                <assert_contents>
                    <has_text text="##fileformat=VCFv4.2" />
                    <has_text text="ID=1,length=242" />
                </assert_contents>
            </output>
            <output name="base_branch_embl">
                <assert_contents>
                    <has_n_lines n="1638"/>
                    <has_text text="sequence_8" />
                </assert_contents>
            </output>
            <output name="final_tree">
                <assert_contents>
                    <has_n_lines n="1"/>
                    <has_text text="sequence_10" />
                </assert_contents>
            </output>
            <output name="recomb_pred_embl">
                <assert_contents>
                    <has_n_lines n="24"/>
                    <has_text text="sequence_2" />
                </assert_contents>
            </output>
            <output name="filt_polymorph_fna">
                <assert_contents>
                    <has_n_lines n="18"/>
                    <has_text text="GAAAA" />
                </assert_contents>
            </output>
            <output name="filt_polymorph_phy">
                <assert_contents>
                    <has_n_lines n="10"/>
                    <has_text text="GAAAA" />
                </assert_contents>
            </output>
            <output name="per_b_stat_csv">
                <assert_contents>
                    <has_n_lines n="18"/>
                    <has_text text="Number of SNPs Inside Recombinations" />
                </assert_contents>
            </output>
        </test>
    </tests>


    <help><![CDATA[
**Gubbins**

Since the introduction of high-throughput, second-generation DNA sequencing technologies, there has been an enormous increase in the size of datasets being used for estimating bacterial population phylodynamics. Although many phylogenetic techniques are scalable to hundreds of bacterial genomes, methods which have been used for mitigating the effect of mechanisms of horizontal sequence transfer on phylogenetic reconstructions cannot cope with these new datasets. Gubbins (Genealogies Unbiased By recomBinations In Nucleotide Sequences) is an algorithm that iteratively identifies loci containing elevated densities of base substitutions while concurrently constructing a phylogeny based on the putative point mutations outside of these regions. Simulations demonstrate the algorithm generates highly accurate reconstructions under realistic models of short-term bacterial evolution, and can be run in only a few hours on alignments of hundreds of bacterial genome sequences.

**Running Gubbins**

To run Gubbins with default settings:

Supply a "fasta" genome alignment file and press execute.

**Other options**

**Advanced**

*Iterations*, (-i)

The maximum number of iterations to perform; the algorithm will stop earlier than this if it converges on the same tree in two successive iterations. Default is 5.

*Converge_method* (-z)

Criteria to use to know when to halt iterations [weighted_robinson_foulds|robinson_foulds|recombination]. Default is weighted_robinson_foulds.

*Outgroup*, (-o)

The name of a sequence in the alignment on which to root the tree. A comma-separated list of names can be given if they form a clade.

**Really Advanced**

These options are here for completeness and you shouldn't need to change them from their defaults. You really need to know what you are doing before you use these.

*Tree builder*, (-t)

The algorithm to use in the construction of phylogenies in the analysis; can be ‘raxml’, to use RAxML, ‘raxmlng’, to use RAxML-NG, ‘iqtree’, to use IQ-TREE, ‘fasttree’, to use FastTree, ‘rapidnj’, to use RapidNJ, or ‘hybrid’, to use FastTree for the first iteration and RAxML in all subsequent iterations. Default is raxml.

*Filter Percentage*, (-f)

Filter out taxa with more than this percentage of missing data. Default is 25%

*Minimum snps*, (-m)

The minimum number of base substitutions required to identify a recombination. Default is 3.


And others...


**Output files**

* Recombination predictions in EMBL tab file format.


* Recombination predictions in GFF3 format


* Base substitution reconstruction in EMBL tab format.


* VCF file summarising the distribution of SNPs


* Per branch reporting of the base substitutions inside and outside recombinations events.


* FASTA format alignment of filtered polymorphic sites used to generate the phylogeny in the final iteration.


* Phylip format alignment of filtered polymorphic sites used to generate the phylogeny in the final iteration.


* Final phylogenetic tree in newick format.

    ]]></help>


    <!-- Publication to cite for Gubbins, given as a bare DOI (no "doi:" scheme prefix). -->
    <citations>
        <citation type="doi">10.1093/nar/gku1196</citation>
    </citations>

</tool>
author	iuc
date	Wed, 24 Aug 2022 07:37:39 +0000
parents	637ec5d5368c
children