Mercurial > repos > mbernt > maxbin2

<tool id="maxbin2" name="MaxBin2" version="@MAXBIN_VERSION@+galaxy4">
    <description>clusters metagenomic contigs into bins</description>
    <macros>
        <!-- Version of the wrapped MaxBin2 package; used in the tool version and in the requirement -->
        <token name="@MAXBIN_VERSION@">2.2.7</token>
        <!-- Contig data set (plain or gzipped fasta) passed to -contig -->
        <xml name="contig">
            <param argument="-contig" type="data" format="fasta,fasta.gz" label="Contig file"/>
        </xml>
        <!-- Selector shared by both assembly types: the abundance information is
             provided either as sequencing reads or as abundance file(s) -->
        <xml name="input_type">
            <param name="type" type="select" label="Input type">
                <option value="reads" selected="true">Sequencing Reads</option>
                <option value="abund">Abundances</option>
            </param>
        </xml>
        <!-- Additional parameters that are only offered if reads are used as input -->
        <xml name="reads_extra_params">
            <!-- controls only the filter of the "abundout" output, nothing is added to the command line -->
            <param name="output_abundances" type="boolean" checked="false" label="Output abundances" help="Additionally return the contig abundance table (out.abund1)" />
            <!-- the argument uses a single dash, matching the flag that is put on the command line (truevalue) -->
            <param argument="-reassembly" type="boolean" truevalue="-reassembly" falsevalue="" checked="false" label="Reassembly" help="The reassembly option is still highly experimental. To use this function, you need to feed MaxBin an interleaved paired-end fastq or fasta file." />
        </xml>
    </macros>
    <!-- Cross reference to the corresponding bio.tools entry -->
    <xrefs>
        <xref type="bio.tools">maxbin</xref>
    </xrefs>
    <!-- Package dependency; its version is taken from the @MAXBIN_VERSION@ token -->
    <requirements>
        <requirement type="package" version="@MAXBIN_VERSION@">maxbin2</requirement>
    </requirements>
    <!-- Only the first line printed by the version call is kept -->
    <version_command>run_MaxBin.pl -version | head -n 1</version_command>
    <command detect_errors="exit_code"><![CDATA[
## Write the list of reads files (reads_list) or the list of abundance
## files (abund_list) that is handed over to run_MaxBin.pl.
#import re
#if $assembly.inputs.type == 'reads'
    #if $assembly.type == 'individual'
        ## Single reads data set. The file name is derived from the element
        ## identifier: every character that is neither whitespace, a word
        ## character, '-' nor '.' is replaced by '_'.
        #set $e = $assembly.inputs.reads
        #set $identifier = re.sub('[^\s\w\-\\.]', '_', str($e.element_identifier))
        ## uncompress .gz reads files if necessary, otherwise just link them
        #if $e.ext.endswith(".gz")
gunzip -c '$e' > '$identifier' &&
        #else
ln -s '$e' '$identifier' &&
        #end if
echo '$identifier' >> reads_list &&
    #else
        ## Multiple reads data sets. The position in the list is appended to
        ## the sanitized identifier, which keeps the file names distinct even
        ## if several data sets share the same identifier.
        #for $i, $r in enumerate($assembly.inputs.reads)
            #if $r
                #set $identifier = re.sub('[^\s\w\-\\.]', '_', str($r.element_identifier))
                #set $newid = $identifier +  '_'  + str($i)
                #if $r.ext.endswith(".gz")
gunzip -c '$r' > '$newid' &&
                #else
ln -s '$r' '$newid' &&
                #end if
echo '$newid' >> reads_list &&
            #end if
        #end for
    #end if
#else if $assembly.inputs.type == 'abund'
    ## abundance files are used in place, only their paths are listed
    #if $assembly.type == 'individual'
echo '$assembly.inputs.abund' >> abund_list &&
    #else
        #for $a in $assembly.inputs.abund
            #if $a
echo '$a' >> abund_list &&
            #end if
        #end for
    #end if
#end if

## In case of reassembly the IDBA stdout and stderr are appended to the
## stdout and stderr of the tool. To tell the parts apart a header is
## printed before the MaxBin2 output as well.
#if $assembly.inputs.type == 'reads' and $assembly.inputs.reassembly != ""
echo "==== MaxBin2 stdout ====" &&
echo "==== MaxBin2 stderr ====" 1>&2 &&
#end if

run_MaxBin.pl
    -contig '$contig'
    -out out
#if $assembly.inputs.type == 'reads'
    -reads_list reads_list
    $assembly.inputs.reassembly
#else if $assembly.inputs.type == 'abund'
    -abund_list abund_list
#end if
    -min_contig_length $adv.min_contig_length
    -max_iteration $adv.max_iteration
    -prob_threshold $adv.prob_threshold
    $output.plotmarker
    -markerset $output.markerset
    -thread \${GALAXY_SLOTS:-1}

## unpack the per bin marker files so that they can be discovered as outputs
&& gzip -cd out.marker_of_each_bin.tar.gz | tar -xf -

## Forward the content of the IDBA out and err files to stdout and stderr.
## This is also wanted if one of the previous commands failed, hence ';' is
## used to separate the commands. The exit status of the preceding command
## chain is stored first and restored at the end: otherwise the status of
## the commands printing the IDBA output would become the exit status of
## the job and a failure of MaxBin2 would not be detected.
#if $assembly.inputs.type == 'reads' and $assembly.inputs.reassembly != ""
; maxbin_status=\$?
; echo "==== IDBA stdout ===="
; if [[ -f out.idba.out ]]; then cat out.idba.out; fi
; echo "==== IDBA stderr ====" 1>&2
; if [[ -f out.idba.err ]]; then cat out.idba.err 1>&2; fi
; (exit \$maxbin_status)
#end if
    ]]></command>
    <inputs>
        <expand macro="contig"/>
        <!-- The assembly type decides whether a single data set (individual)
             or multiple data sets (co-assembly) can be selected below -->
        <conditional name="assembly">
            <param name="type" type="select" label="Assembly type used to generate contig(s)">
                <option value="individual">Assembly of sample(s) one by one (individual assembly)</option>
                <option value="coassembly">Assembly of different samples together (co-assembly)</option>
            </param>
            <when value="individual">
                <conditional name="inputs">
                    <expand macro="input_type"/>
                    <when value="reads">
                        <param argument="-reads" type="data" format="fasta,fastq,fastq.gz,fasta.gz" label="Reads file"/>
                        <expand macro="reads_extra_params"/>
                    </when>
                    <when value="abund">
                        <param argument="-abund" type="data" format="tabular" label="Abundance file"/>
                    </when>
                </conditional>
            </when>
            <when value="coassembly">
                <conditional name="inputs">
                    <expand macro="input_type"/>
                    <when value="reads">
                        <!-- same formats as for the individual assembly: the command
                             uncompresses .gz reads for both assembly types -->
                        <param argument="-reads" type="data" multiple="true" format="fasta,fastq,fastq.gz,fasta.gz" label="Reads file(s)"/>
                        <expand macro="reads_extra_params"/>
                    </when>
                    <when value="abund">
                        <param argument="-abund" type="data" format="tabular" multiple="true" label="Abundance file(s)"/>
                    </when>
                </conditional>
            </when>
        </conditional>
        <!-- Parameters that are passed unchanged to run_MaxBin.pl -->
        <section name="adv" title="Advanced options">
            <param argument="-min_contig_length" type="integer" min="0" value="1000" label="Minimum contig length" />
            <param argument="-max_iteration" type="integer" min="0" value="50" label="Maximum Expectation-Maximization algorithm iteration number" />
            <param argument="-prob_threshold" type="float" min="0" max="1.0" value="0.5" label="Probability threshold for EM final classification" />
        </section>
        <!-- Selection of the optional outputs and of the marker gene set.
             marker, markers and log only control output filters; plotmarker
             and markerset are also put on the command line -->
        <section name="output" title="Outputs">
            <param argument="-plotmarker" type="boolean" truevalue="-plotmarker" falsevalue="" checked="false" label="Generate visualization of the marker gene presence numbers" />
            <param name="marker" type="boolean" checked="false" label="Output marker gene presence for bins table" />
            <param name="markers" type="boolean" checked="false" label="Output marker genes for each bin as fasta" />
            <param name="log" type="boolean" checked="false" label="Output log" />
            <param argument="-markerset" type="select" label="Marker gene set">
                <option value="107" selected="true">107 marker genes present in >95% of bacteria</option>
                <option value="40">40 marker gene sets that are universal among bacteria and archaea</option>
            </param>
        </section>
    </inputs>
    <outputs>
        <!-- default outputs -->
        <!-- the dots in the discovery patterns are escaped so that they only match a literal '.' -->
        <collection name="bins" type="list" label="${tool.name} on ${on_string}: Bins">
            <discover_datasets pattern="out\.(?P&lt;designation&gt;[0-9]+)\.fasta" format="fasta" visible="false" />
        </collection>
        <data name="noclass" format="fasta" label="${tool.name} on ${on_string}: Unclassified sequences" from_work_dir="out.noclass"/>
        <data name="toshort" format="fasta" label="${tool.name} on ${on_string}: Too short sequences" from_work_dir="out.tooshort"/>
        <data name="summary" format="tabular" label="${tool.name} on ${on_string}: Summary" from_work_dir="out.summary"/>
        <!-- optional outputs, each one is enabled by a parameter of the "output" section
             (the abundances by a parameter of the reads input) -->
        <data name="log" format="txt" label="${tool.name} on ${on_string}: Log" from_work_dir="out.log">
            <filter>output['log']</filter>
        </data>
        <data name="marker" format="tabular" label="${tool.name} on ${on_string}: Marker gene presence" from_work_dir="out.marker">
            <filter>output['marker']</filter>
        </data>
        <data name="abundout" format="tabular" label="${tool.name} on ${on_string}: Abundances" from_work_dir="out.abund1">
            <filter>assembly['inputs']['type']=='reads' and assembly['inputs']['output_abundances']</filter>
        </data>
        <data name="plot" format="pdf" label="${tool.name} on ${on_string}: Marker gene presence plot" from_work_dir="out.marker.pdf">
            <filter>output['plotmarker']</filter>
        </data>
        <collection name="markers" type="list" label="${tool.name} on ${on_string}: Markers predicted for bins">
            <discover_datasets pattern="out\.(?P&lt;designation&gt;[0-9]+)\.marker\.fasta" format="fasta" visible="false" />
            <filter>output['markers']</filter>
        </collection>
        <!-- additional outputs in case of reassembly, all of them are taken from the out.reassem directory -->
        <collection name="reassembly_bins" type="list" label="${tool.name} on ${on_string}: Reassembly bins">
            <discover_datasets directory="out.reassem" pattern="out\.(?P&lt;designation&gt;[0-9]+)\.fasta" format="fasta" visible="false" />
            <filter>assembly['inputs']['type']=='reads' and assembly['inputs']['reassembly']</filter>
        </collection>
        <collection name="reassembly_reads" type="list" label="${tool.name} on ${on_string}: Reassembly reads">
            <discover_datasets directory="out.reassem" pattern="out\.reads\.(?P&lt;designation&gt;[0-9]+)" format="fasta" visible="false" />
            <filter>assembly['inputs']['type']=='reads' and assembly['inputs']['reassembly']</filter>
        </collection>
        <data name="reassembly_noclass" format="fasta" label="${tool.name} on ${on_string}: Reassembly unclassified sequences" from_work_dir="out.reassem/out.reads.noclass">
            <filter>assembly['inputs']['type']=='reads' and assembly['inputs']['reassembly']</filter>
        </data>
        <data name="reassembly_n50" format="txt" label="${tool.name} on ${on_string}: Reassembly N50" from_work_dir="out.reassem/N50.txt">
            <filter>assembly['inputs']['type']=='reads' and assembly['inputs']['reassembly']</filter>
        </data>
    </outputs>
    <tests>
        <!-- test with contigs and reads (fasta) as input; only the four default outputs are expected.
             Boolean parameters are given as true/false -->
        <test expect_num_outputs="4">
            <param name="contig" value="Bin_Sample3_contigs.fasta" ftype="fasta" />
            <conditional name="assembly">
                <param name="type" value="individual"/>
                <conditional name="inputs">
                    <param name="type" value="reads"/>
                    <param name="reads" value="interleavedPE_unmapped_Sample3_small.fasta" ftype="fasta"/>
                    <param name="output_abundances" value="false"/>
                    <param name="reassembly" value="false"/>
                </conditional>
            </conditional>
            <section name="adv">
                <param name="min_contig_length" value="1000"/>
                <param name="max_iteration" value="50"/>
                <param name="prob_threshold" value="0.5"/>
            </section>
            <section name="output">
                <param name="plotmarker" value="false"/>
                <param name="marker" value="false"/>
                <param name="markers" value="false" />
                <param name="log" value="false"/>
                <param name="markerset" value="107"/>
            </section>
            <output_collection name="bins" type="list" count="2">
                <element name="001" file="1/out.001.fasta" ftype="fasta"/>
                <element name="002" file="1/out.002.fasta" ftype="fasta"/>
            </output_collection>
            <output name="summary" file="1/out.summary" ftype="tabular" />
            <output name="noclass" file="1/out.noclass" ftype="fasta" />
            <output name="toshort" file="1/out.tooshort" ftype="fasta" />
        </test>
        <!-- test with co-assembled contigs, multiple reads files and all optional outputs
             (4 default outputs + log, marker, abundances, plot and markers = 9) -->
        <test expect_num_outputs="9">
            <param name="contig" value="Bin_Sample3_contigs.fasta" ftype="fasta" />
            <conditional name="assembly">
                <param name="type" value="coassembly"/>
                <conditional name="inputs">
                    <param name="type" value="reads"/>
                    <param name="reads" value="interleavedPE_unmapped_Sample3_small.fasta,interleavedPE_unmapped_Sample3_small.fasta" ftype="fasta"/>
                    <param name="output_abundances" value="true"/>
                    <param name="reassembly" value="false"/>
                </conditional>
            </conditional>
            <section name="adv">
                <param name="min_contig_length" value="1000"/>
                <param name="max_iteration" value="50"/>
                <param name="prob_threshold" value="0.5"/>
            </section>
            <section name="output">
                <param name="plotmarker" value="true"/>
                <param name="marker" value="true"/>
                <param name="markers" value="true" />
                <param name="log" value="true"/>
                <param name="markerset" value="107"/>
            </section>
            <output_collection name="bins" type="list" count="2">
                <element name="001" file="1/out.001.fasta" ftype="fasta"/>
                <element name="002" file="1/out.002.fasta" ftype="fasta"/>
            </output_collection>
            <output name="summary" ftype="tabular">
                <assert_contents>
                    <has_text text="Completeness"/>
                    <has_text text="out.001.fasta"/>
                </assert_contents>
            </output>
            <output name="noclass" file="1/out.noclass" ftype="fasta" />
            <output name="toshort" file="1/out.tooshort" ftype="fasta" />
            <output name="log" ftype="txt" >
                <assert_contents>
                    <has_text text="Input contig"/>
                    <has_text text="Elapsed time"/>
                    <has_text text="Yielded 2 bins for contig (scaffold) file"/>
                </assert_contents>
            </output>
            <output name="abundout" file="1/out.abund1" ftype="tabular" />
            <output name="marker" file="1/out.marker" ftype="tabular" />
            <output name="plot" file="1/out.marker.pdf" ftype="pdf" compare="sim_size" />
            <output_collection name="markers" type="list" count="1">
                <element name="001" file="1/out.001.marker.fasta" ftype="fasta"/>
            </output_collection>
        </test>
        <!-- test with contigs and abundances as input, non default advanced options and the
             marker plot (4 default outputs + plot = 5). Boolean parameters are given as true/false -->
        <test expect_num_outputs="5">
            <param name="contig" value="Bin_Sample3_contigs.fasta" ftype="fasta" />
            <conditional name="assembly">
                <param name="type" value="individual"/>
                <conditional name="inputs">
                    <param name="type" value="abund"/>
                    <param name="abund" value="abundances.tsv" ftype="tabular"/>
                </conditional>
            </conditional>
            <section name="adv">
                <param name="min_contig_length" value="500"/>
                <param name="max_iteration" value="10"/>
                <param name="prob_threshold" value="0.95"/>
            </section>
            <section name="output">
                <param name="plotmarker" value="true"/>
                <param name="marker" value="false"/>
                <param name="markers" value="false" />
                <param name="log" value="false"/>
                <param name="markerset" value="107"/>
            </section>
            <output_collection name="bins" type="list" count="2">
                <element name="001" file="2/out.001.fasta" ftype="fasta"/>
                <element name="002" file="2/out.002.fasta" ftype="fasta"/>
            </output_collection>
            <output name="summary" file="2/out.summary" ftype="tabular" />
            <output name="noclass" file="2/out.noclass" ftype="fasta" />
            <output name="toshort" file="2/out.tooshort" ftype="fasta" />
            <output name="plot" file="2/out.marker.pdf" ftype="pdf" compare="sim_size" />
        </test>
        <!-- test with contigs and reads as input and reassembly enabled
             (4 default outputs + the 4 reassembly outputs = 8).
             Boolean parameters are given as true/false -->
        <test expect_num_outputs="8">
            <param name="contig" value="Bin_Sample3_contigs.fasta" ftype="fasta" />
            <conditional name="assembly">
                <param name="type" value="individual"/>
                <conditional name="inputs">
                    <param name="type" value="reads"/>
                    <param name="reads" value="interleavedPE_unmapped_Sample3_small.fasta" ftype="fasta"/>
                    <param name="output_abundances" value="false"/>
                    <param name="reassembly" value="true"/>
                </conditional>
            </conditional>
            <section name="adv">
                <param name="min_contig_length" value="1000"/>
                <param name="max_iteration" value="50"/>
                <param name="prob_threshold" value="0.5"/>
            </section>
            <section name="output">
                <param name="plotmarker" value="false"/>
                <param name="marker" value="false"/>
                <param name="markers" value="false" />
                <param name="log" value="false"/>
                <param name="markerset" value="107"/>
            </section>
            <output_collection name="bins" type="list" count="2">
                <element name="001" file="3/out.001.fasta" ftype="fasta"/>
                <element name="002" file="3/out.002.fasta" ftype="fasta"/>
            </output_collection>
            <output name="summary" file="3/out.summary" ftype="tabular" />
            <output name="noclass" file="3/out.noclass" ftype="fasta" />
            <output name="toshort" file="3/out.tooshort" ftype="fasta" />
            <!-- the reassembled bins are only checked for the presence of two scaffold headers -->
            <output_collection name="reassembly_bins" type="list" count="2">
                <element name="001" ftype="fasta">
                    <assert_contents>
                        <has_text text=">scaffold_0"/>
                        <has_text text=">scaffold_523"/>
                    </assert_contents>
                </element>
                <element name="002" ftype="fasta">
                    <assert_contents>
                        <has_text text=">scaffold_0"/>
                        <has_text text=">scaffold_523"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="reassembly_reads" type="list" count="2">
                <element name="001" file="3/out.reassem/out.reads.001" ftype="fasta"/>
                <element name="002" file="3/out.reassem/out.reads.002" ftype="fasta"/>
            </output_collection>
            <output name="reassembly_noclass" file="3/out.reassem/out.reads.noclass" ftype="fasta" />
            <output name="reassembly_n50" ftype="txt">
                <assert_contents>
                    <has_text text="N50 before reassem"/>
                    <has_text text="out.002.fasta"/>
                    <has_text text="2878"/>
                </assert_contents>
            </output>
        </test>
        <!-- test with contigs and reads in fastqsanger format as input; only the four
             default outputs are expected. Boolean parameters are given as true/false -->
        <test expect_num_outputs="4">
            <param name="contig" value="test4_contigs.fasta" ftype="fasta" />
            <conditional name="assembly">
                <param name="type" value="individual"/>
                <conditional name="inputs">
                    <param name="type" value="reads"/>
                    <param name="reads" value="test4_reads.fastqsanger" ftype="fastqsanger"/>
                    <param name="output_abundances" value="false"/>
                    <param name="reassembly" value="false"/>
                </conditional>
            </conditional>
            <section name="adv">
                <param name="min_contig_length" value="1000"/>
                <param name="max_iteration" value="50"/>
                <param name="prob_threshold" value="0.5"/>
            </section>
            <section name="output">
                <param name="plotmarker" value="false"/>
                <param name="marker" value="false"/>
                <param name="markers" value="false" />
                <param name="log" value="false"/>
                <param name="markerset" value="107"/>
            </section>
            <output_collection name="bins" type="list" count="2">
                <element name="001" file="4/out.001.fasta" ftype="fasta"/>
                <element name="002" file="4/out.002.fasta" ftype="fasta"/>
            </output_collection>
            <output name="summary" file="4/out.summary" ftype="tabular" />
            <output name="noclass" file="4/out.noclass" ftype="fasta" />
            <output name="toshort" file="4/out.tooshort" ftype="fasta" />
        </test>
        <!-- test with contigs and reads in fastqsanger.gz format as input; the expected
             files are the same as for the uncompressed fastqsanger test.
             Boolean parameters are given as true/false -->
        <test expect_num_outputs="4">
            <param name="contig" value="test4_contigs.fasta" ftype="fasta" />
            <conditional name="assembly">
                <param name="type" value="individual"/>
                <conditional name="inputs">
                    <param name="type" value="reads"/>
                    <param name="reads" value="test4_reads.fastqsanger.gz" ftype="fastqsanger.gz"/>
                    <param name="output_abundances" value="false"/>
                    <param name="reassembly" value="false"/>
                </conditional>
            </conditional>
            <section name="adv">
                <param name="min_contig_length" value="1000"/>
                <param name="max_iteration" value="50"/>
                <param name="prob_threshold" value="0.5"/>
            </section>
            <section name="output">
                <param name="plotmarker" value="false"/>
                <param name="marker" value="false"/>
                <param name="markers" value="false" />
                <param name="log" value="false"/>
                <param name="markerset" value="107"/>
            </section>
            <output_collection name="bins" type="list" count="2">
                <element name="001" file="4/out.001.fasta" ftype="fasta"/>
                <element name="002" file="4/out.002.fasta" ftype="fasta"/>
            </output_collection>
            <output name="summary" file="4/out.summary" ftype="tabular" />
            <output name="noclass" file="4/out.noclass" ftype="fasta" />
            <output name="toshort" file="4/out.tooshort" ftype="fasta" />
        </test>
        <!-- test with co-assembled contigs, multiple .gz reads files and all optional outputs
             (4 default outputs + log, marker, abundances, plot and markers = 9).
             NOTE(review): the reads files are named *.fastqsanger.gz but are declared with
             ftype="fasta"; presumably the .gz branch of the command is therefore not
             exercised by this test. TODO confirm and declare the real datatype once the
             co-assembly reads parameter accepts gzipped formats. -->
        <test expect_num_outputs="9">
            <param name="contig" value="test4_contigs.fasta" ftype="fasta" />
            <conditional name="assembly">
                <param name="type" value="coassembly"/>
                <conditional name="inputs">
                    <param name="type" value="reads"/>
                    <param name="reads" value="test4_reads.fastqsanger.gz,test4_reads.fastqsanger.gz" ftype="fasta"/>
                    <param name="output_abundances" value="true"/>
                    <param name="reassembly" value="false"/>
                </conditional>
            </conditional>
            <section name="adv">
                <param name="min_contig_length" value="1000"/>
                <param name="max_iteration" value="50"/>
                <param name="prob_threshold" value="0.5"/>
            </section>
            <section name="output">
                <param name="plotmarker" value="true"/>
                <param name="marker" value="true"/>
                <param name="markers" value="true" />
                <param name="log" value="true"/>
                <param name="markerset" value="107"/>
            </section>
            <output_collection name="bins" type="list" count="2">
                <element name="001" file="4/out.001.fasta" ftype="fasta"/>
                <element name="002" file="4/out.002.fasta" ftype="fasta"/>
            </output_collection>
            <output name="summary" ftype="tabular">
                <assert_contents>
                    <has_text text="Completeness"/>
                    <has_text text="out.001.fasta"/>
                </assert_contents>
            </output>
            <output name="noclass" file="4/out.noclass" ftype="fasta" />
            <output name="toshort" file="4/out.tooshort" ftype="fasta" />
            <output name="log" ftype="txt" >
                <assert_contents>
                    <has_text text="Input contig"/>
                    <has_text text="Elapsed time"/>
                    <has_text text="Yielded 2 bins for contig (scaffold) file"/>
                </assert_contents>
            </output>
            <output name="abundout" file="4/out.abund1" ftype="tabular" />
            <output name="marker" file="4/out.marker" ftype="tabular" />
            <output name="plot" file="4/out.marker.pdf" ftype="pdf" compare="sim_size" />
            <output_collection name="markers" type="list" count="2">
                <element name="001" ftype="fasta">
                    <assert_contents>
                        <has_text text=">out.001.Methyltransf_5"/>
                    </assert_contents>
                </element>
                <element name="002" ftype="fasta">
                    <assert_contents>
                        <has_text text=">out.002.Methyltransf_5"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[
MaxBin is a software that clusters metagenomic contigs into different bins,
each of which (hopefully) consists of contigs from one species. MaxBin uses the
nucleotide composition information and contig abundance information to
achieve binning through an Expectation-Maximization algorithm.

**Input**:

MaxBin needs the contigs and contig abundance information. The contig abundance
information can be provided in two ways: the user can choose to provide

- the abundance file or
- the sequencing reads in fasta or fastq format (and MaxBin will use Bowtie2 to map the
  sequencing reads against the contigs and generate the abundance information)

The abundance information can be provided as tabular file:

For example, assume there are three contigs named A0001, A0002, and A0003; then the abundance file will look like::

    A0001	30.89
    A0002	20.02
    A0003	78.93

Reads/abundance files can be given as multiple files.

By default MaxBin will look for 107 marker genes present in >95% of bacteria.
Alternatively you can also choose 40 marker gene sets that are universal among
bacteria and archaea (Wu et al., PLoS ONE 2013). This option may be better
suited for environments dominated by archaea; however, it tends to split genomes
into more bins. You can choose between different marker gene sets and see which
one works better.

**Outputs**

- bins: binned sequences
- summary: a summary file describing which contigs are being classified into which bin.
- log: a log file recording the core steps of MaxBin algorithm
- abundances (only if reads are used as input and the output is selected): the contig abundance information that MaxBin generated from the sequencing reads
- marker: marker gene presence numbers for each bin. This table is ready to be plotted by R or other 3rd-party software.
- marker plot (only present if selected in the Outputs section): visualization of the marker gene presence numbers using R. Will only appear if -plotmarker is specified.
- unclassified sequences: this file stores all sequences that pass the minimum length threshold but are not classified successfully.
- too short sequences: this file stores all sequences that do not meet the minimum length threshold.
- markers predicted for bins: these data sets store all markers predicted from the individual bins.

**Reassembly**

This is an experimental feature of MaxBin. For each read bin it calls IDBA_UD with the pre_correction parameter. Of course, this IDBA_UD call can also be done with the corresponding Galaxy tool.


**More information**

https://web.archive.org/web/20190417100740/https://downloads.jbei.org/data/microbial_communities/MaxBin/MaxBin.html

]]></help>
    <!-- Publication to cite for this tool, referenced by its DOI -->
    <citations>
        <citation type="doi">10.1093/bioinformatics/btv638</citation>
    </citations>
</tool>
author	iuc
date	Mon, 16 Oct 2023 07:46:13 +0000
parents	8a0473eb465e
children	28a0b1446d2b