Mercurial > repos > iuc > miniprot

<?xml version="1.0"?>
<tool id="miniprot" name="Miniprot align" version="@TOOL_VERSION@+galaxy0" profile="21.05">
    <description>align a protein sequence against a genome with affine gap penalty, splicing and frameshift</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">miniprot</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
        ## Run miniprot with the protein FASTA as query against either a genome
        ## FASTA or a pre-built index. The alignment goes to stdout and is
        ## redirected into the output dataset.
        miniprot
        -t \${GALAXY_SLOTS:-1}
        ## Advanced options are only emitted when the user opened the advanced
        ## section; otherwise miniprot's built-in defaults apply.
        #if str($adv.options) == "yes"
            ## NOTE(review): the form help says -S applies -G1000 -J1000 -e1000;
            ## confirm that the explicit -G/-J/-e values below do not override it.
            $adv.mapping.no_splicing
            -c $adv.mapping.max_kmer
            -G $adv.mapping.max_intron
            -n $adv.mapping.min_syncmers
            -m $adv.mapping.min_chain_score
            -l $adv.mapping.second_round_kmer_size
            -e $adv.mapping.max_extension
            -p $adv.mapping.score_ratio
            -N $adv.mapping.max_secondary_alignments
            -O $adv.alignment.gap_open
            -E $adv.alignment.gap_extension
            -J $adv.alignment.intron_open
            -C $adv.alignment.non_canonical_splice
            -F $adv.alignment.frameshift
            -B $adv.alignment.end_bonus
            ## The query batch size is defined in the advanced inputs and must
            ## be passed on, otherwise the user's value is silently ignored.
            -K $adv.query_batch_size
        #end if
        ## Indexing options only make sense when the index is built from FASTA.
        #if str($db.dbtype) == 'fasta'
            '$db.genomic_fasta'
            -k $db.kmer_size
            -s $db.submer_size
            -b $db.bits_per_block
        #else
            '$db.genomic_db'
        #end if
        ## Without --gff the output is PAF (see the change_format rule on the output).
        #if str($output_format) == "gff"
            --gff
        #end if
        '$protein_fasta'
        >'$output_alignment'
    ]]></command>
    <inputs>
        <!-- Target selection: either a genome FASTA (indexed on the fly, with
             tunable indexing parameters) or an index built beforehand. -->
        <conditional name="db">
            <param name="dbtype" type="select" label="Database type" help="Build an index from FASTA or use a pre-indexed database">
                <option value="fasta" selected="true">FASTA</option>
                <option value="preindexed">Pre-indexed</option>
            </param>
            <when value="fasta">
                <param name="genomic_fasta" type="data" format="fasta,fasta.gz" label="Genomic sequence (FASTA)" help="Genomic contigs / scaffolds to be aligned against in FASTA format" />
                <param argument="-k" name="kmer_size" type="integer" min="1" value="6" label="K-mer size" />
                <param argument="-s" name="submer_size" type="integer" min="1" value="4" label="Submer size" help="Submer size (density: 1/(2*(kmer_size-submer_size)+1))" />
                <param argument="-b" name="bits_per_block" type="integer" min="1" value="8" label="Bits per block" />
            </when>
            <when value="preindexed">
                <!-- refine the datatype here once Miniprot index data type is in Galaxy -->
                <param name="genomic_db" type="data" format="binary" label="Pre-indexed genomic database" help="A pre-indexed database built by miniprot" />
            </when>
        </conditional>
        <!-- Query proteins and the format of the alignment that is written. -->
        <param name="protein_fasta" type="data" format="fasta,fasta.gz" label="Protein sequence (FASTA)" help="Protein sequences to be aligned in FASTA format" />
        <param name="output_format" type="select" label="Output format" >
            <option value="gff" selected="true">GFF3</option>
            <option value="paf">PAF</option>
        </param>
        <!-- Advanced options are hidden by default; the command only emits
             them when "Show" is selected. -->
        <conditional name="adv">
            <param name="options" type="select" label="Advanced options">
                <option value="yes">Show</option>
                <option value="no" selected="true">Hide</option>
            </param>
            <when value="yes">
                <section name="mapping" title="Mapping">
                    <param argument="-S" name="no_splicing" type="boolean" truevalue="-S" falsevalue="" checked="false" label="No splicing" help="No splicing (apply -G1000 -J1000 -e1000)" />
                    <param argument="-c" name="max_kmer" type="integer" min="1" value="50000" label="Max k-mer occurrences" />
                    <param argument="-G" name="max_intron" type="integer" min="0" value="200000" label="Max intron size" />
                    <param argument="-n" name="min_syncmers" type="integer" min="1" value="5" label="Minimum number of syncmers in a chain" />
                    <param argument="-m" name="min_chain_score" type="integer" min="0" value="0" label="Minimum chaining score" />
                    <param argument="-l" name="second_round_kmer_size" type="integer" min="1" value="5" label="K-mer size for second round of chaining" />
                    <param argument="-e" name="max_extension" type="integer" min="0" value="10000" label="Max extension for second round of chaining" />
                    <param argument="-p" name="score_ratio" type="float" min="0" max="1" value="0.5" label="Minimum secondary-to-primary score ratio" />
                    <param argument="-N" name="max_secondary_alignments" type="integer" min="0" value="100" label="Max secondary alignments to consider" />
                </section>
                <section name="alignment" title="Alignment">
                    <param argument="-O" name="gap_open" type="integer" min="0" value="11" label="Gap open penalty" />
                    <param argument="-E" name="gap_extension" type="integer" min="0" value="1" label="Gap extension penalty" help="A k-long gap costs open_penalty+k*extension_penalty" />
                    <param argument="-J" name="intron_open" type="integer" min="0" value="31" label="Intron open penalty" />
                    <param argument="-C" name="non_canonical_splice" type="integer" min="0" value="11" label="Penalty for non-canonical splicing" />
                    <param argument="-F" name="frameshift" type="integer" min="0" value="15" label="Frameshift penalty" />
                    <param argument="-B" name="end_bonus" type="integer" min="0" value="5" label="End bonus" />
                </section>
                <param argument="-K" name="query_batch_size" type="integer" min="1" value="2000000" label="Query batch size" />
            </when>
            <when value="no">
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- One alignment dataset: GFF3 by default, switched to PAF when the
             "output_format" select is set to "paf". -->
        <data name="output_alignment"
              format="gff3"
              label="Miniprot on ${on_string}">
            <change_format>
                <when input="output_format" value="paf" format="paf"></when>
            </change_format>
        </data>
    </outputs>
    <tests>
        <!-- Test 1: default settings, genome given as FASTA, GFF3 output. -->
        <test expect_num_outputs="1">
            <conditional name="db">
                <param name="dbtype" value="fasta" />
                <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" />
            </conditional>
            <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta" />
            <output name="output_alignment" ftype="gff3">
                <assert_contents>
                    <has_text text="ID=MP000001;Identity=1.0000;Positive=1.0000;Target=tr|O06302|O06302_MYCTU 1 126" />
                    <has_text text="Parent=MP000372;Target=tr|V5QPR5|V5QPR5_MYCTU 1 53" />
                </assert_contents>
            </output>
        </test>
        <!-- Test 2: same inputs, PAF output selected; checks the output
             datatype switches to paf via change_format. -->
        <test expect_num_outputs="1">
            <conditional name="db">
                <param name="dbtype" value="fasta" />
                <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" />
            </conditional>
            <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta" />
            <param name="output_format" value="paf" />
            <output name="output_alignment" ftype="paf">
                <assert_contents>
                    <has_text text="tr|O06302|O06302_MYCTU" />
                    <has_text text="cs:Z::29*agcG:3*gtgA:5*ccgA:9*accS:1*gccV:4*cagL:1*gtcS:3*gtcA*gtcI*accA*gccG:8*gccS:2*ggtA:5*gccI*agcG:1*ctgA:4*gccV:5*gggL:1*gtgS:2" />
                </assert_contents>
            </output>
        </test>
        <!-- Test 3: advanced options enabled with a non-default second-round
             k-mer size. The parameter is nested in the "mapping" section so
             the test mirrors the structure of the inputs. -->
        <test expect_num_outputs="1">
            <conditional name="db">
                <param name="dbtype" value="fasta" />
                <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" />
            </conditional>
            <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta" />
            <param name="output_format" value="gff" />
            <conditional name="adv">
                <param name="options" value="yes" />
                <section name="mapping">
                    <param name="second_round_kmer_size" value="32" />
                </section>
            </conditional>
            <output name="output_alignment" ftype="gff3">
                <assert_contents>
                    <has_text text="##gff-version 3" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
        miniprot_ rapidly aligns a protein sequence against a genome with affine gap penalty, splicing and frameshift.
        It is primarily intended for annotating protein-coding genes in a new species using known genes from other species.

        **NOTE:** miniprot is in the early stages of development and should be considered experimental.

        .. _miniprot: https://github.com/lh3/miniprot
    ]]></help>
</tool>
author	iuc
date	Mon, 19 Sep 2022 12:30:10 +0000
parents
children	ce04c239454b