<!-- Mercurial > repos > iuc > semibin_train -->

<tool id="semibin_train" name="SemiBin: Train" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <!-- Tagline displayed next to the tool name ("SemiBin: Train"). -->
    <description>the semi-supervised deep learning model</description>
    <!--
        Shared definitions are imported from macros.xml (not part of this file).
        The @TOOL_VERSION@, @VERSION_SUFFIX@ and @PROFILE@ tokens used on the
        <tool> element, and every macro expanded in this wrapper, are expected
        to be defined there.
    -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Tool metadata supplied by the shared macros. -->
    <expand macro="biotools"/>
    <expand macro="requirements"/>
    <expand macro="version"/>
    <!--
        Cheetah template for the SemiBin2 train_semi command line.

        1. Symlink the contig FASTA file(s) into the working directory under a
           name that ends with the Galaxy datatype extension.
        2. Call "SemiBin2 train_semi" with the per-mode inputs followed by the
           options shared by both modes.

        Every parameter that lives inside the "mode" conditional is referenced
        through its fully qualified path ($mode.<name>).
    -->
    <command detect_errors="exit_code"><![CDATA[
#import re
## Step 1: expose the contig file(s) under a name carrying the datatype extension
#if $mode.select == 'single'
ln -s '$mode.input_fasta' 'contigs.${mode.input_fasta.ext}' &&
#else
    #for $e in $mode.input_fasta
        ## Replace every character that is not whitespace, a word character or a hyphen by an underscore
        ## NOTE(review): datasets whose sanitized identifier and extension coincide would get the same link name
        #set $identifier = re.sub('[^\s\w\-]', '_', str($e.element_identifier))
ln -s '$e' '${identifier}.$e.ext' &&
    #end for
#end if
## Step 2: train the model
SemiBin2 train_semi
#if $mode.select == 'single'
    --input-fasta 'contigs.${mode.input_fasta.ext}'
    --data '$mode.data'
    --data-split '$mode.data_split'
    --cannot-link '$mode.cannot_link'
#else
    --train-from-many
    --input-fasta
    #for $e in $mode.input_fasta
        ## Must yield the same link name as the ln -s loop above
        #set $identifier = re.sub('[^\s\w\-]', '_', str($e.element_identifier))
        '${identifier}.$e.ext'
    #end for
    --data
    #for $e in $mode.data
        '$e'
    #end for
    --data-split
    #for $e in $mode.data_split
        '$e'
    #end for
    --cannot-link
    #for $e in $mode.cannot_link
        '$e'
    #end for
#end if
    --output 'output'
    --threads \${GALAXY_SLOTS:-1}
    --epoches $epoches
    --batch-size $batch_size
    --random-seed $random_seed
## Contig length filter: either an absolute minimum length or a ratio; any other method adds no option
#if $min_len.method == 'min-len'
    --min-len $min_len.min_len
#else if $min_len.method == 'ratio'
    --ratio $min_len.ratio
#end if
    --orf-finder '$orf_finder'
    ]]></command>
    <inputs>
        <!--
            Training mode. "single" takes one dataset per input; "several"
            takes multiple datasets per input and makes the command template
            add the train-from-many flag.
        -->
        <conditional name="mode">
            <!-- Wrapper-only switch, not a SemiBin2 option: declared with "name" rather than "argument". -->
            <param name="select" type="select" label="Mode to train the models">
                <option value="single" selected="true">From one sample</option>
                <option value="several">From multiple samples (train model across several samples can get better pre-trained model for single-sample binning)</option>
            </param>
            <when value="single">
                <param argument="--input-fasta" type="data" format="fasta,fasta.gz,fasta.bz2" label="Contig sequences"/>
                <param argument="--data" type="data" format="csv" label="Train data"/>
                <param argument="--data-split" type="data" format="csv" label="Split train data"/>
                <param argument="--cannot-link" type="data" format="txt" label="Cannot-link constraints"/>
            </when>
            <when value="several">
                <!-- Same four inputs as in "single" mode, each accepting several datasets. -->
                <param argument="--input-fasta" type="data" multiple="true" format="fasta,fasta.gz,fasta.bz2" label="Contig sequences"/>
                <param argument="--data" type="data" format="csv" multiple="true" label="Train data"/>
                <param argument="--data-split" type="data" format="csv" multiple="true" label="Split train data"/>
                <param argument="--cannot-link" type="data" format="txt" multiple="true" label="Cannot-link constraints"/>
            </when>
        </conditional>
        <!-- Options shared by both modes; their definitions come from the imported macros. -->
        <expand macro="min_len"/>
        <expand macro="orf-finder"/>
        <expand macro="random-seed"/>
        <expand macro="epoches"/>
        <expand macro="batch-size"/>
    </inputs>
    <outputs>
        <!--
            Output datasets are declared by the "train_output" macro (defined
            outside this file). The tests below expect exactly one output,
            named "model", with datatype h5.
        -->
        <expand macro="train_output"/>
    </outputs>
    <tests>
        <!-- Test 1: single sample, uncompressed FASTA, minimum contig length filter -->
        <test expect_num_outputs="1">
            <conditional name="mode">
                <param name="select" value="single"/>
                <param name="input_fasta" value="input_single.fasta" ftype="fasta"/>
                <param name="data" value="data.csv" ftype="csv"/>
                <param name="data_split" value="data_split.csv" ftype="csv"/>
                <param name="cannot_link" value="cannot.txt" ftype="txt"/>
            </conditional>
            <conditional name="min_len">
                <param name="method" value="min-len"/>
                <param name="min_len" value="2500"/>
            </conditional>
            <param name="orf_finder" value="prodigal"/>
            <param name="random_seed" value="0"/>
            <param name="epoches" value="1"/>
            <param name="batch_size" value="2048"/>
            <output name="model" ftype="h5">
                <assert_contents>
                    <has_size value="3119000" delta="2000"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 2: single sample, bzip2-compressed FASTA -->
        <test expect_num_outputs="1">
            <conditional name="mode">
                <param name="select" value="single"/>
                <param name="input_fasta" value="input_single.fasta.bz2" ftype="fasta.bz2"/>
                <param name="data" value="data.csv" ftype="csv"/>
                <param name="data_split" value="data_split.csv" ftype="csv"/>
                <param name="cannot_link" value="cannot.txt" ftype="txt"/>
            </conditional>
            <conditional name="min_len">
                <param name="method" value="min-len"/>
                <param name="min_len" value="2500"/>
            </conditional>
            <param name="orf_finder" value="prodigal"/>
            <param name="random_seed" value="0"/>
            <param name="epoches" value="1"/>
            <param name="batch_size" value="2048"/>
            <output name="model" ftype="h5">
                <assert_contents>
                    <has_size value="3119000" delta="2000"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 3: single sample, gzip-compressed FASTA -->
        <test expect_num_outputs="1">
            <conditional name="mode">
                <param name="select" value="single"/>
                <param name="input_fasta" value="input_single.fasta.gz" ftype="fasta.gz"/>
                <param name="data" value="data.csv" ftype="csv"/>
                <param name="data_split" value="data_split.csv" ftype="csv"/>
                <param name="cannot_link" value="cannot.txt" ftype="txt"/>
            </conditional>
            <conditional name="min_len">
                <param name="method" value="min-len"/>
                <param name="min_len" value="2500"/>
            </conditional>
            <param name="orf_finder" value="prodigal"/>
            <param name="random_seed" value="0"/>
            <param name="epoches" value="1"/>
            <param name="batch_size" value="2048"/>
            <output name="model" ftype="h5">
                <assert_contents>
                    <has_size value="3119000" delta="2000"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 4: several samples (three FASTA files), ratio-based filter, 20 epochs -->
        <test expect_num_outputs="1">
            <conditional name="mode">
                <param name="select" value="several"/>
                <param name="input_fasta" value="input_single.fasta,input_single_2.fasta,input_single_3.fasta" ftype="fasta"/>
                <param name="data" value="data.csv,data.csv,data.csv" ftype="csv"/>
                <param name="data_split" value="data_split.csv,data_split.csv,data_split.csv" ftype="csv"/>
                <param name="cannot_link" value="cannot.txt,cannot.txt,cannot.txt" ftype="txt"/>
            </conditional>
            <conditional name="min_len">
                <param name="method" value="ratio"/>
                <param name="ratio" value="0.05"/>
            </conditional>
            <param name="orf_finder" value="prodigal"/>
            <param name="random_seed" value="0"/>
            <param name="epoches" value="20"/>
            <param name="batch_size" value="2048"/>
            <output name="model" ftype="h5">
                <assert_contents>
                    <has_size value="3119000" delta="2000"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
@HELP_HEADER@

This tool trains SemiBin's semi-supervised deep learning model (``SemiBin2 train_semi``).

Inputs
======

@HELP_INPUT_FASTA@
@HELP_CANNOT@
@HELP_DATA@

Outputs
=======

@HELP_MODEL@
    ]]></help>
    <!-- Citation entries are provided by the "citations" macro (defined outside this file). -->
    <expand macro="citations"/>
</tool>
<!--
author	iuc
date	Tue, 28 Oct 2025 08:20:57 +0000
parents	2344bc30a326
children
-->