view unicycler.xml @ 4:2db911a4efc5 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/unicycler commit c9e02d7a4de4bda4809f6c7006990901602cc3ad
author iuc
date Thu, 06 Sep 2018 10:59:26 -0400
parents c4eac0c7e542
children 23300b42ca18
line wrap: on
line source

<tool id="unicycler" name="Create assemblies with Unicycler" version="@VERSION@.0">
    <!-- Single source of truth for the wrapped Unicycler version: the
         @VERSION@ token is expanded both in the tool's own version
         attribute and in the package requirement below. -->
    <macros>
        <token name="@VERSION@">0.4.6</token>
    </macros>
    <!-- Dependency resolved by Galaxy before the command is run. -->
    <requirements>
        <requirement type="package" version="@VERSION@">unicycler</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
## ==========================================================
## Cheetah command template. Lines starting with '##' are template
## comments and are not part of the generated shell command.
##
## Stage 1: symlink every selected input into the working directory
##          under a fixed name whose extension is chosen from the
##          Galaxy datatype of the dataset (plain FASTQ, gzipped FASTQ
##          or, for long reads, FASTA).
## Stage 2: ask the pilon launcher for its jar directory.
## Stage 3: run Unicycler with the working directory as output directory.
## ==========================================================

## Preparing files
## The selector value is needed several times, so compute it once.
#set selector = str( $paired_unpaired.fastq_input_selector )
#if $selector == "paired"
    ## Two separate datasets: forward (fastq_input1) and reverse (fastq_input2)
    #if $paired_unpaired.fastq_input1.is_of_type('fastqsanger')
        #set fq1 = "fq1.fastq"
    #elif $paired_unpaired.fastq_input1.is_of_type('fastqsanger.gz')
        #set fq1 = "fq1.fastq.gz"
    #end if
    #if $paired_unpaired.fastq_input2.is_of_type('fastqsanger')
        #set fq2 = "fq2.fastq"
    #elif $paired_unpaired.fastq_input2.is_of_type('fastqsanger.gz')
        #set fq2 = "fq2.fastq.gz"
    #end if
    ln -s '${paired_unpaired.fastq_input1}' '$fq1' &&
    ln -s '${paired_unpaired.fastq_input2}' '$fq2' &&
#elif $selector == "paired_collection"
    ## One paired collection: its forward/reverse elements play the role of
    ## the two datasets above
    #if $paired_unpaired.fastq_input1.forward.is_of_type('fastqsanger')
        #set fq1 = "fq1.fastq"
    #elif $paired_unpaired.fastq_input1.forward.is_of_type('fastqsanger.gz')
        #set fq1 = "fq1.fastq.gz"
    #end if
    #if $paired_unpaired.fastq_input1.reverse.is_of_type('fastqsanger')
        #set fq2 = "fq2.fastq"
    #elif $paired_unpaired.fastq_input1.reverse.is_of_type('fastqsanger.gz')
        #set fq2 = "fq2.fastq.gz"
    #end if
    ln -s '${paired_unpaired.fastq_input1.forward}' '$fq1' &&
    ln -s '${paired_unpaired.fastq_input1.reverse}' '$fq2' &&
#elif $selector == "single"
    ## A single dataset of unpaired short reads
    #if $paired_unpaired.fastq_input1.is_of_type('fastqsanger')
        #set fq = "fq.fastq"
    #elif $paired_unpaired.fastq_input1.is_of_type('fastqsanger.gz')
        #set fq = "fq.fastq.gz"
    #end if
    ln -s '${paired_unpaired.fastq_input1}' '$fq' &&
#end if
## selector == "none": no short reads are linked at all
#if $long
    ## Optional long reads; FASTA is accepted in addition to FASTQ
    #if $long.is_of_type('fastqsanger')
        #set lr = "lr.fastq"
    #elif $long.is_of_type('fastqsanger.gz')
        #set lr = "lr.fastq.gz"
    #elif $long.is_of_type('fasta')
        #set lr = "lr.fasta"
    #end if
    ln -s '${long}' '$lr' &&
#end if
## Get location for pilon installation
## (the backslash-escaped dollar signs below are left for the shell, not Cheetah)
pilon=`pilon --jar_dir` &&
## Running Unicycler
unicycler -t "\${GALAXY_SLOTS:-4}"
-o ./
--verbosity 3
--pilon_path "\$pilon"
## Both paired layouts were linked to the same fq1/fq2 names above,
## so they share the same short-read arguments.
#if $selector in ("paired", "paired_collection")
    -1 '$fq1'
    -2 '$fq2'
#elif $selector == "single"
    -s '$fq'
#end if
#if $long
    -l '$lr'
#end if
## General Unicycler Options section
## ----------------------------------------------------------
--mode '$mode'
--min_fasta_length '$min_fasta_length'
--linear_seqs '$linear_seqs'
## Optional integer: only passed when the user supplied a value
#if str($min_anchor_seg_len) != ''
--min_anchor_seg_len '$min_anchor_seg_len'
#end if
## Spades Options section
## ----------------------------------------------------------
## Boolean parameters expand to their flag or to an empty string
$spades.no_correct
--min_kmer_frac '$spades.min_kmer_frac'
--max_kmer_frac '$spades.max_kmer_frac'
#if str($spades.kmers) != ''
--kmers '$spades.kmers'
#end if
--kmer_count '$spades.kmer_count'
--depth_filter '$spades.depth_filter'
## Rotation Options section
## ----------------------------------------------------------
$rotation.no_rotate
#if $rotation.start_genes
    --start_genes '$rotation.start_genes'
#end if
--start_gene_id '$rotation.start_gene_id'
--start_gene_cov '$rotation.start_gene_cov'
## Pilon Options section
## ----------------------------------------------------------
$pilon.no_pilon
#if str($pilon.min_polish_size) != ''
    --min_polish_size '$pilon.min_polish_size'
#end if
## Graph cleaning Options section
## ----------------------------------------------------------
--min_component_size '$graph_clean.min_component_size'
--min_dead_end_size '$graph_clean.min_dead_end_size'
## Long Read Alignment Options
## ----------------------------------------------------------
#if $lr_align.contamination
    --contamination '$lr_align.contamination'
#end if
--scores '${lr_align.scores}'
#if str($lr_align.low_score) != ''
    --low_score '$lr_align.low_score'
#end if
    ]]></command>
    <inputs>
        <!-- Short-read input layout. The selected value decides which
             fastq_input* parameters exist and which branch of the command
             template links and passes them. The "none" case declares no
             short-read parameter at all. -->
        <conditional name="paired_unpaired">
            <param name="fastq_input_selector" type="select" label="Paired or Single end data?" help="Select between paired and single end data">
                <option selected="True" value="paired">Paired</option>
                <option value="paired_collection">Paired Collection</option>
                <option value="single">Single</option>
                <option value="none">None</option>
            </param>
            <!-- Two independent datasets: forward and reverse reads -->
            <when value="paired">
                <param name="fastq_input1" argument="-1" type="data" format="fastqsanger,fastqsanger.gz" label="Select first set of reads" help="Specify dataset with forward reads"/>
                <param name="fastq_input2" argument="-2" type="data" format="fastqsanger,fastqsanger.gz" label="Select second set of reads" help="Specify dataset with reverse reads"/>
            </when>
            <!-- One paired collection; the command template reads its
                 forward and reverse elements -->
            <when value="paired_collection">
                <param name="fastq_input1" format="fastqsanger,fastqsanger.gz" type="data_collection" collection_type="paired" label="Select a paired collection" />
            </when>
            <!-- One dataset of unpaired reads -->
            <when value="single">
                <param name="fastq_input1" argument="-s" type="data" format="fastqsanger,fastqsanger.gz" label="Select unpaired reads" help="Specify dataset with unpaired reads"/>
            </when>
            <!-- No short reads -->
            <when value="none">
            </when>
        </conditional>
        <!-- Optional long-read dataset (FASTQ, gzipped FASTQ or FASTA); the
             command template only links and passes it when it is set. -->
        <param argument="--long" optional="true" type="data" format="fastqsanger,fastqsanger.gz,fasta" label="Select long reads. If there are no long reads, leave this empty"/>
        <!-- General Unicycler options; mode, min_fasta_length and linear_seqs
             are always passed on the command line. -->
        <param argument="--mode" type="select" label="Select Bridging mode">
            <option value="conservative">Conservative (smaller contigs, lower misassembly)</option>
            <option value="normal" selected="True">Normal (moderate contig size and misassembly rate)</option>
            <option value="bold">Bold (longest contigs, higher misassembly rate)</option>
        </param>
        <param argument="--min_fasta_length" type="integer" value="100" label="Exclude contigs from the FASTA file which are shorter than this length (bp)"/>
        <param argument="--linear_seqs" type="integer" value="0" label="The expected number of linear (i.e. non-circular) sequences in the assembly"/>
        <!-- Optional: only passed to Unicycler when a value is given -->
        <param argument="--min_anchor_seg_len" type="integer" min="0" optional="true" label="Unicycler will not use segments shorter than this as scaffolding anchors"/>
        <!-- SPAdes settings forwarded by the command template. The boolean
             expands to its flag or to an empty string; the k-mer list is
             only passed when it is not empty. -->
        <section name="spades" expanded="False" title="SPAdes options" help="Unicycler uses SPAdes to construct assembly graphs. You can modify some of the SPAdes settings here. Use this ONLY if you know what you are doing!">
            <param argument="--no_correct" type="boolean" checked="false" truevalue="--no_correct" falsevalue="" label="Skip SPAdes error correction step" help="This option turns off SPAdes error correction. Generally it is highly recommended to use correction."/>
            <param argument="--min_kmer_frac" type="float" min="0" max="1" value="0.2" label="Lowest k-mer size for SPAdes assembly, expressed as a fraction of the read length"/>
            <param argument="--max_kmer_frac" type="float" min="0" max="1" value="0.95" label="Highest k-mer size for SPAdes assembly, expressed as a fraction of the read length"/>
            <param argument="--kmers" type="text" value="" optional="true" label="Exact k-mers to use for SPAdes assembly, comma-separated">
                <!-- Each list item must be an odd integer from 11 to 127:
                     [1-9][13579] covers 11..99, 1[01][13579] covers 101..119
                     and 12[1357] covers 121..127. Repetition is not checked
                     by the expression. -->
                <validator type="regex" message="Kmers must be comma-separated odd integers (no repetition) without spaces in the range of 11 to 127 (inclusive)">^(?:[1-9][13579]|1[01][13579]|12[1357])(?:,(?:[1-9][13579]|1[01][13579]|12[1357]))*$</validator>
            </param>
            <param argument="--kmer_count" type="integer" min="0" value="10" label="Number of k-mer steps to use in SPAdes assembly"/>
            <param argument="--depth_filter" type="float" min="0" max="1" value="0.25" label="Filter out contigs lower than this fraction of the chromosomal depth" help="It is done if does not result in graph dead ends"/>
        </section>
        <!-- Replicon rotation settings. The start_genes FASTA is optional
             and only passed when a dataset is selected; the identity and
             coverage thresholds are always passed. -->
        <section name="rotation" expanded="false" title="Rotation options" help="These options control the rotation of completed circular sequence near the end of the Unicycler pipeline. Use this ONLY if you know what you are doing!">
            <param argument="--no_rotate" type="boolean" checked="false" truevalue="--no_rotate" falsevalue="" label="Do not rotate completed replicons to start at a standard gene." help="Unicycler uses TBLASTN to search for dnaA or repA alleles in each completed replicon. If one is found, the sequence is rotated and/or flipped so that it begins with that gene encoded on the forward strand. This provides consistently oriented assemblies and reduces the risk that a gene will be split across the start and end of the sequence."/>
            <param argument="--start_genes" optional="true" type="data" format="fasta" label="FASTA file of genes for start point of rotated replicons" />
            <param argument="--start_gene_id" type="float" min="0" max="100" value="90" label="The minimum required BLAST percent identity for a start gene search"/>
            <param argument="--start_gene_cov" type="float" min="0" max="100" value="95" label="The minimum required BLAST percent coverage for a start gene search"/>
        </section>
        <!-- Pilon polishing settings: a switch that disables polishing and
             the minimum contig length (bp) that is polished. -->
        <section name="pilon" title="Pilon options" expanded="false">
            <param argument="--no_pilon" type="boolean" checked="false" truevalue="--no_pilon" falsevalue="" label="Do not use Pilon to polish the final assembly." help="Unicycler uses Pilon tool for polishing final assembly."/>
            <param argument="--min_polish_size" type="integer" min="0" value="1000" label="Contigs shorter than this value (bp) will not be polished using Pilon"/>
        </section>
        <!-- Size thresholds for removing leftover graph components and dead
             ends; both values are always passed on the command line. -->
        <section name="graph_clean" expanded="false" title="Graph cleaning options" help="These options control the removal of small leftover sequences after bridging is complete.">
            <param argument="--min_component_size" type="integer" min="0" value="1000" label="Unbridged graph components smaller than this size will be removed from the final graph" />
            <param argument="--min_dead_end_size" type="integer" min="0" value="1000" label="Graph dead ends smaller than this size will be removed from the final graph"/>
        </section>
        <!-- Long-read alignment settings. The contamination FASTA and the
             low-score threshold are optional and only passed when set; the
             scoring scheme is always passed. -->
        <section name="lr_align" expanded="false" title="Long read alignment parameters" help="These options control the alignment of long reads to the assembly graph.">
            <param argument="--contamination" optional="true" type="data" format="fasta" label="FASTA file of known contamination in long reads, e.g. lambda, phiX or puc18 spike-ins." />
            <param argument="--scores" type="text" value="3,-6,-5,-2" label="Comma-delimited string of alignment scores: match, mismatch, gap open, gap extend"/>
            <param argument="--low_score" optional="true" type="integer" value="" label="Score threshold - alignments below this are considered poor" help="default = set automatically"/>
        </section>
    </inputs>
    <!-- Both outputs are collected from fixed file names in the job working
         directory, which the command template passes to Unicycler as its
         output directory.
         NOTE(review): assembly.gfa is declared with the generic txt format;
         a GFA-specific datatype may be preferable if the target Galaxy
         release provides one. Confirm before changing, since the tests
         assert ftype="txt". -->
    <outputs>
        <data name="assembly_graph" format="txt" from_work_dir="assembly.gfa" label="${tool.name} on ${on_string}: Final Assembly Graph" />
        <data name="assembly" format="fasta" from_work_dir="assembly.fasta" label="${tool.name} on ${on_string}: Final Assembly"/>
    </outputs>
    <tests>
        <!-- Test 1: paired short reads given as two separate datasets, no
             long reads. SPAdes error correction is skipped (no_correct is
             true); all other parameters repeat the tool defaults. Passes
             when the graph output contains the expected sequence fragment
             and the FASTA output contains "length=5386". -->
        <test>
            <conditional name="paired_unpaired">
                <param name="fastq_input_selector" value="paired" />
                <param name="fastq_input1" value="phix_f.fq.gz" ftype="fastqsanger" />
                <param name="fastq_input2" value="phix_r.fq.gz" ftype="fastqsanger" />
            </conditional>
            <param name="mode" value="normal" />
            <param name="min_fasta_length" value="100"/>
            <param name="linear_seqs" value="0"/>
            <section name="spades">
                <param name="no_correct" value="true"/>
                <param name="min_kmer_frac" value="0.2"/>
                <param name="max_kmer_frac" value="0.95"/>
                <param name="kmer_count" value="10"/>
                <param name="depth_filter" value="0.25"/>
            </section>
            <section name="rotation">
                <param name="no_rotate" value=""/>
                <param name="start_gene_id" value="90"/>
                <param name="start_gene_cov" value="95"/>
            </section>
            <section name="pilon">
                <param name="no_pilon" value=""/>
                <param name="min_polish_size" value="1000"/>
            </section>
            <section name="graph_clean">
                <param name="min_component_size" value="1000"/>
                <param name="min_dead_end_size" value="1000"/>
            </section>
            <section name="lr_align">
                <param name="scores" value="3,-6,-5,-2"/>
            </section>
            <output name="assembly_graph" ftype="txt">
                <assert_contents>
                    <has_text text="TACGGGGAAGGACGTC"/>
                </assert_contents>
            </output>
            <output name="assembly" ftype="fasta">
                <assert_contents>
                    <has_text text="length=5386" />
                </assert_contents>
            </output>
        </test>
        <!--
            Following test corresponds to the command:

              unicycler -t "${GALAXY_SLOTS:-4}"  -o ./ - -verbosity 3 - -pilon_path `pilon - -jar_dir` \
                        -1 test-data/phix_f.fq.gz  -2 test-data/phix_r.fq.gz  -l test-data/onp.fa \
                         - -mode 'normal' - -no_correct

            This command causes a segfault with the current version of unicycler on bioconda for Linux
            during the minimap step (which seems to be compiled C code). A gist of the log can be found
            at: https://gist.github.com/jmchilton/b411b695170c1daea6589f5d76e326cb.
        -->
        <!-- Test 2: hybrid run. Paired gzipped short reads plus a FASTA
             dataset of long reads (onp.fa). SPAdes error correction is
             skipped; the remaining parameters repeat the tool defaults.
             Uses the same output assertions as the short-read-only test. -->
        <test>
            <conditional name="paired_unpaired">
                <param name="fastq_input_selector" value="paired" />
                <param name="fastq_input1" value="phix_f.fq.gz" ftype="fastqsanger.gz" />
                <param name="fastq_input2" value="phix_r.fq.gz" ftype="fastqsanger.gz" />
            </conditional>
            <param name="long" value="onp.fa" ftype="fasta" />
            <param name="mode" value="normal" />
            <param name="min_fasta_length" value="100"/>
            <param name="linear_seqs" value="0"/>
            <section name="spades">
                <param name="no_correct" value="true"/>
                <param name="min_kmer_frac" value="0.2"/>
                <param name="max_kmer_frac" value="0.95"/>
                <param name="kmer_count" value="10"/>
                <param name="depth_filter" value="0.25"/>
            </section>
            <section name="rotation">
                <param name="no_rotate" value=""/>
                <param name="start_gene_id" value="90"/>
                <param name="start_gene_cov" value="95"/>
            </section>
            <section name="pilon">
                <param name="no_pilon" value=""/>
                <param name="min_polish_size" value="1000"/>
            </section>
            <section name="graph_clean">
                <param name="min_component_size" value="1000"/>
                <param name="min_dead_end_size" value="1000"/>
            </section>
            <section name="lr_align">
                <param name="scores" value="3,-6,-5,-2"/>
            </section>
            <output name="assembly_graph" ftype="txt">
                <assert_contents>
                    <has_text text="TACGGGGAAGGACGTC" />
                </assert_contents>
            </output>
            <output name="assembly" ftype="fasta">
                <assert_contents>
                    <has_text text="length=5386" />
                </assert_contents>
            </output>
        </test>
        <!-- Test 3: short reads supplied as one paired collection (forward
             and reverse elements). SPAdes error correction is skipped and
             Pilon polishing is switched off (no_pilon is true). Uses the
             same output assertions as the other phiX tests. -->
        <test>
            <conditional name="paired_unpaired">
                <param name="fastq_input_selector" value="paired_collection"/>
                <param name="fastq_input1">
                    <collection type="paired">
                        <element name="forward" value="phix_f.fq.gz" ftype="fastqsanger" />
                        <element name="reverse" value="phix_r.fq.gz" ftype="fastqsanger" />
                    </collection>
                </param>
            </conditional>
            <param name="mode" value="normal" />
            <param name="min_fasta_length" value="100"/>
            <param name="linear_seqs" value="0"/>
            <section name="spades">
                <param name="no_correct" value="true"/>
                <param name="min_kmer_frac" value="0.2"/>
                <param name="max_kmer_frac" value="0.95"/>
                <param name="kmer_count" value="10"/>
                <param name="depth_filter" value="0.25"/>
            </section>
            <section name="rotation">
                <param name="no_rotate" value=""/>
                <param name="start_gene_id" value="90"/>
                <param name="start_gene_cov" value="95"/>
            </section>
            <section name="pilon">
                <param name="no_pilon" value="true"/>
                <param name="min_polish_size" value="1000"/>
            </section>
            <section name="graph_clean">
                <param name="min_component_size" value="1000"/>
                <param name="min_dead_end_size" value="1000"/>
            </section>
            <section name="lr_align">
                <param name="scores" value="3,-6,-5,-2"/>
            </section>
            <output name="assembly_graph" ftype="txt">
                <assert_contents>
                    <has_text text="TACGGGGAAGGACGTC" />
                </assert_contents>
            </output>
            <output name="assembly" ftype="fasta">
                <assert_contents>
                    <has_text text="length=5386" />
                </assert_contents>
            </output>
        </test>
        <!-- Test 4: long reads only (short-read selector set to "none").
             Also exercises the optional min_anchor_seg_len and kmers
             parameters. The assertions are deliberately loose: the graph
             output must contain "S" and the FASTA output must contain a
             record header starting with ">1". -->
        <test>
            <conditional name="paired_unpaired">
                <param name="fastq_input_selector" value="none"/>
            </conditional>
            <param name="min_anchor_seg_len" value="10"/>
            <section name="spades">
                <param name="kmers" value="21,23"/>
            </section>
            <param name="long" value="only_long.fasta" ftype="fasta" />
            <output name="assembly_graph" ftype="txt">
                <assert_contents>
                    <has_text text="S" />
                </assert_contents>
            </output>
            <output name="assembly" ftype="fasta">
                <assert_contents>
                    <has_text text=">1" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[

**Unicycler**

Unicycler is a hybrid assembly pipeline for bacterial genomes. It uses both Illumina reads and long reads (PacBio or Nanopore) to produce complete and accurate assemblies. It is written by `Ryan Wick`_ at the University of Melbourne's Centre for Systems Genomics. Much of the description below is lifted from Unicycler's `github page`_.

.. _`Ryan Wick`: https://github.com/rrwick
.. _`github page`: https://github.com/rrwick/Unicycler

-----

**Input data**

Unicycler accepts short (Illumina) reads in FASTQ format. Galaxy places an additional requirement of having these in FASTQ format with `Sanger encoding`_ of quality scores. Long reads (from Oxford Nanopore or PacBio) can be in either FASTQ or FASTA format.

.. _`Sanger encoding`: https://en.wikipedia.org/wiki/FASTQ_format#Quality

The input options are::

    -1 SHORT1, --short1 SHORT1
        FASTQ file of short reads (first reads in each pair)
    -2 SHORT2, --short2 SHORT2
        FASTQ file of short reads (second reads in each pair)
    -s SHORT_UNPAIRED, --short_unpaired SHORT_UNPAIRED
        FASTQ file of unpaired short reads
    -l LONG, --long LONG
        FASTQ or FASTA file of long reads, if all reads are available at start.

-----

**Bridging mode**

Unicycler can be run in three modes: conservative, normal (the default) and bold, set with the --mode option. Conservative mode is least likely to produce a complete assembly but has a very low risk of misassembly. Bold mode is most likely to produce a complete assembly but carries greater risk of misassembly. Normal mode is intermediate regarding both completeness and misassembly risk. See `description of modes`_ for more information.

.. _`description of modes`: https://github.com/rrwick/Unicycler#conservative-normal-and-bold

The available modes are::

    --mode {conservative,normal,bold}
        Bridging mode (default: normal)
        conservative = smaller contigs, lowest misassembly rate
        normal = moderate contig size and misassembly rate
        bold = longest contigs, higher misassembly rate

----

**Skip SPAdes error correction step**

Sequencing data contains a substantial number of sequencing errors that manifest themselves as deviations (bulges and non-connected components) within the assembly graph. One of the ways to improve the graph even before constructing it is to minimize the number of sequencing errors by performing error correction. SPAdes, which is used by Unicycler for error correction and assembly, uses `BayesHammer`_ to correct the reads. Here is a brief summary of what it does:

 1. SPAdes (or rather BayesHammer) counts *k*-mers in reads and computes *k*-mer statistics that take into account base quality values.
 2. A `Hamming graph`_ is constructed in which *k*-mers are nodes. In this graph edges connect nodes (*k*-mers) if they differ from each other by a number of nucleotides up to a certain threshold (the `Hamming distance`_). The graph is central to the error correction algorithm.
 3. At this step Bayesian subclustering of the graph produced in the previous step is performed. For each *k*-mer we now know the center of its subcluster.
 4. Solid *k*-mers are derived from cluster centers and are assumed to be *error free*.
 5. Solid *k*-mers are mapped back to the reads and used to correct them.

This step takes considerable time, so if one needs to quickly evaluate assemblies this step can be skipped. However, this is not recommended if one is trying to produce a final high quality assembly.

.. _`BayesHammer`: https://goo.gl/1iGkMe
.. _`Hamming graph`: https://en.wikipedia.org/wiki/Hamming_graph
.. _`Hamming distance`: https://en.wikipedia.org/wiki/Hamming_distance

The following option turns error correction on and off::

    --no_correct
        Skip SPAdes error correction step
        (default: conduct SPAdes error correction)

-----

**Do not rotate completed replicons to start at a standard gene**

Unicycler uses TBLASTN to search for dnaA or repA alleles in each completed replicon. If one is found, the sequence is rotated and/or flipped so that it begins with that gene encoded on the forward strand. This provides consistently oriented assemblies and reduces the risk that a gene will be split across the start and end of the sequence.

The following option turns rotation on and off::

    --no_rotate
        Do not rotate completed replicons
        to start at a standard gene
        (default: completed replicons are rotated)

**Do not use Pilon to polish the final assembly**

`Pilon`_ is a tool for improving overall quality of draft assemblies and finding variation among strains. Unicycler uses it for assembly *polishing*.

The following option turns pilon part of Unicycler pipeline on and off::

    --no_pilon
        Do not use Pilon to polish the
        final assembly (default: Pilon is used)

.. _`Pilon`: https://github.com/broadinstitute/pilon/wiki

------

**Expected number of linear sequences**

If you expect your sample to contain linear (non circular) sequences, set this option::

    --linear_seqs EXPECTED_LINEAR_SEQS
        The expected number of linear (i.e. non-circular)
        sequences in the underlying sequence

----

**SPAdes options**

This section provides control of SPAdes options::

    --min_kmer_frac MIN_KMER_FRAC
        Lowest k-mer size for SPAdes assembly,
        expressed as a fraction of the read length
        (default: 0.2)
    --max_kmer_frac MAX_KMER_FRAC
        Highest k-mer size for SPAdes assembly,
        expressed as a fraction of the read length
        (default: 0.95)
    --kmer_count KMER_COUNT
        Number of k-mer steps to use in
        SPAdes assembly (default: 10)
    --depth_filter DEPTH_FILTER
        Filter out contigs lower than this fraction
        of the chromosomal depth, if doing so does
        not result in graph dead ends (default: 0.25)

----

**Rotation options**

Unicycler attempts to rotate circular assemblies to make sure that they begin at a consistent starting gene. The following parameters control assembly rotation::

    --start_genes START_GENES
        FASTA file of genes for start point
        of rotated replicons
        (default: start_genes.fasta)
    --start_gene_id START_GENE_ID
        The minimum required BLAST percent identity
        for a start gene search
        (default: 90.0)
    --start_gene_cov START_GENE_COV
        The minimum required BLAST percent coverage
        for a start gene search
        (default: 95.0)

-----

**Graph cleaning options**

These options control the removal of small leftover sequences after bridging is complete::

    --min_component_size MIN_COMPONENT_SIZE
        Unbridged graph components smaller
        than this size (bp) will be removed
        from the final graph (default: 1000)
    --min_dead_end_size MIN_DEAD_END_SIZE
        Graph dead ends smaller than this size (bp)
        will be removed from the final graph
        (default: 1000)

-----

**Long read alignment options**

These options control the alignment of long reads to the assembly graph::

    --contamination CONTAMINATION
        FASTA file of known contamination in long reads
    --scores SCORES
        Comma-delimited string of alignment scores:
        match, mismatch, gap open, gap extend
        (default: 3,-6,-5,-2)
    --low_score LOW_SCORE
        Score threshold - alignments below this
        are considered poor
        (default: set threshold automatically)

-----

**Outputs**

Galaxy's wrapper for Unicycler produces two outputs:

 * final assembly in FASTA format
 * final assembly graph in GFA format

 While most will likely be interested in the FASTA dataset, the graph dataset is also quite useful and can be visualized using tools such as `Bandage`_.


.. _`Bandage`: https://github.com/rrwick/Bandage


    ]]></help>
    <!-- Publication users should cite when they use this tool.
         NOTE(review): this DOI looks like a preprint identifier; check
         whether a peer-reviewed version of the Unicycler paper should be
         cited instead. -->
    <citations>
        <citation type="doi">10.1101/096412</citation>
    </citations>
</tool>