Mercurial > repos > iuc > miniprot

<?xml version="1.0"?>
<tool id="miniprot" name="Miniprot align" version="@TOOL_VERSION@+galaxy0" profile="21.05">
    <description>align a protein sequence against a genome with affine gap penalty, splicing and frameshift</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">miniprot</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
        miniprot
        -t \${GALAXY_SLOTS:-1}
        #if str($adv.options) == "yes"
            $adv.mapping.no_splicing
            -c $adv.mapping.max_kmer
            -G $adv.mapping.max_intron
            -n $adv.mapping.min_syncmers
            -m $adv.mapping.min_chain_score
            -l $adv.mapping.second_round_kmer_size
            -e $adv.mapping.max_extension
            -p $adv.mapping.score_ratio
            -N $adv.mapping.max_secondary_alignments
            -O $adv.alignment.gap_open
            -E $adv.alignment.gap_extension
            -J $adv.alignment.intron_open
            -C $adv.alignment.non_canonical_splice
            -F $adv.alignment.frameshift
            -B $adv.alignment.end_bonus
        #if str($adv.output.prefix) != 'MP'
            -P '$adv.output.prefix'
        #end if
            $adv.output.print_unmapped_proteins
            --outn=$adv.output.outputs_per_query
        #end if
        #if str($db.dbtype) == 'fasta'
            '$db.genomic_fasta'
            -k $db.kmer_size
            -s $db.submer_size
            -b $db.bits_per_block
        #else
            '$db.genomic_db'
        #end if
        #if str($output_format) == "gff"
            --gff
        #end if
        '$protein_fasta'
        >'$output_alignment'
    ]]></command>
    <inputs>
        <conditional name="db">
            <param name="dbtype" type="select" label="Database type" help="Build an index from FASTA or use a pre-indexed database">
                <option value="fasta" selected="true">FASTA</option>
                <option value="preindexed">Pre-indexed</option>
            </param>
            <when value="fasta">
                <param name="genomic_fasta" type="data" format="fasta,fasta.gz" label="Genomic sequence (FASTA)" help="Genomic contigs / scaffolds to be aligned against in FASTA format" />
                <param argument="-k" name="kmer_size" type="integer" min="1" value="6" label="K-mer size" />
                <param argument="-s" name="submer_size" type="integer" min="1" value="4" label="Submer size" help="Submer size (density: 1/(2*(kmer_size-submer_size)+1))" />
                <param argument="-b" name="bits_per_block" type="integer" min="1" value="8" label="Bits per block" />
            </when>
            <when value="preindexed">
                <!-- refine the datatype here once Miniprot index data type is in Galaxy -->
                <param name="genomic_db" type="data" format="binary" label="Pre-indexed genomic database" help="A pre-indexed database built by miniprot" />
            </when>
        </conditional>
        <param name="protein_fasta" type="data" format="fasta,fasta.gz" label="Protein sequence (FASTA)" help="Protein sequences to be aligned in FASTA format" />
        <param name="output_format" type="select" label="Output format" >
            <option value="gff" selected="true">GFF3</option>
            <option value="paf">PAF</option>
        </param>
        <conditional name="adv">
            <param name="options" type="select" label="Advanced options">
                <option value="yes">Show</option>
                <option value="no" selected="true">Hide</option>
            </param>
            <when value="yes">
                <section name="mapping" title="Mapping">
                    <param argument="-S" name="no_splicing" type="boolean" truevalue="-S" falsevalue="" checked="false" label="No splicing" help="No splicing (apply -G1000 -J1000 -e1000)" />
                    <param argument="-c" name="max_kmer" type="integer" min="1" value="50000" label="Max k-mer occurences" />
                    <param argument="-G" name="max_intron" type="integer" min="0" value="200000" label="Max intron size" />
                    <!-- the -w option is mentioned in the help text but apparently not implmented: https://github.com/lh3/miniprot/issues/12 -->
                    <!-- <param argument="-w" name="log_gap_penalty_weight" type="float" value="0.75" label="Log gap penalty weight" /> -->
                    <param argument="-n" name="min_syncmers" type="integer" min="1" value="5" label="Minimum number of syncmers in a chain" />
                    <param argument="-m" name="min_chain_score" type="integer" min="0" value="0" label="Minimum chaining score" />
                    <param argument="-l" name="second_round_kmer_size" type="integer" min="1" value="5" label="K-mer size for second round of chaining" />
                    <param argument="-e" name="max_extension" type="integer" min="0" value="10000" label="Max extension for second round of chaining" />
                    <param argument="-p" name="score_ratio" type="float" min="0" max="1" value="0.7" label="Minimum secondary-to-primary score ratio" />
                    <param argument="-N" name="max_secondary_alignments" type="integer" min="0" value="50" label="Max secondary alignments to consider" />
                </section>
                <section name="alignment" title="Alignment">
                    <param argument="-O" name="gap_open" type="integer" min="0" value="11" label="Gap open penalty" />
                    <param argument="-E" name="gap_extension" type="integer" min="0" value="1" label="Gap extension penalty" help="A k-long gap costs open_penalty+k*extension_penalty" />
                    <param argument="-J" name="intron_open" type="integer" min="0" value="31" label="Intron open penalty" />
                    <param argument="-C" name="non_canonical_splice" type="integer" min="0" value="11" label="Penalty for non-canonical splicing" />
                    <param argument="-F" name="frameshift" type="integer" min="0" value="17" label="Frameshift penalty" />
                    <param argument="-B" name="end_bonus" type="integer" min="0" value="5" label="End bonus" />
                </section>
                <section name="output" title="Output">
                    <param argument="-P" name="prefix" type="text" label="Prefix for IDs in GFF3 output" value="MP">
                        <sanitizer invalid_char="">
                            <valid initial="string.ascii_letters,string.digits">
                                <add value="_" />
                                <add value="-" />
                            </valid>
                        </sanitizer>
                    </param>
                    <param argument="-u" name="print_unmapped_proteins" type="boolean" truevalue="-u" falsevalue="" label="Print unmapped proteins" checked="false" />
                    <param argument="--outn" name="outputs_per_query" type="integer" min="0" value="100" label="Outputs per query" help="The number of outputs will be the minimum of this and the max secondary alignments option" />
                </section>
                <param argument="-K" name="query_batch_size" type="integer" min="1" value="2000000" label="Query batch size" />
            </when>
            <when value="no">
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data name="output_alignment" format="gff3" label="Miniprot on ${on_string}">
            <change_format>
                <when input="output_format" value="paf" format="paf" />
            </change_format>
        </data>
    </outputs>
    <tests>
        <test expect_num_outputs="1">
            <conditional name="db">
                <param name="dbtype" value="fasta" />
                <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" />
            </conditional>
            <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta" />
            <output name="output_alignment" ftype="gff3">
                <assert_contents>
                    <has_text text="ID=MP000001;Identity=1.0000;Positive=1.0000;Target=tr|O06302|O06302_MYCTU 1 126" />
                    <has_text text="ID=MP000359;Identity=0.9811;Positive=1.0000;Target=tr|V5QPR5|V5QPR5_MYCTU 1 53" />
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="1">
            <conditional name="db">
                <param name="dbtype" value="fasta" />
                <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" />
            </conditional>
            <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta" />
            <param name="output_format" value="paf" />
            <output name="output_alignment" ftype="paf">
                <assert_contents>
                    <has_text text="tr|O06302|O06302_MYCTU" />
                    <has_text text="cs:Z::29*agcG:3*gtgA:5*ccgA:9*accS:1*gccV:4*cagL:1*gtcS:3*gtcA*gtcI*accA*gccG:8*gccS:2*ggtA:5*gccI*agcG:1*ctgA:4*gccV:5*gggL:1*gtgS:2" />
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="1">
            <conditional name="db">
                <param name="dbtype" value="fasta" />
                <param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" />
            </conditional>
            <param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta" />
            <param name="output_format" value="gff" />
            <conditional name="adv">
                <param name="options" value="yes" />
                <param name="second_round_kmer_size" value="32" />
            </conditional>
            <output name="output_alignment" ftype="gff3">
                <assert_contents>
                    <has_text text="##gff-version 3" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
        miniprot_  rapidly aligns a protein sequence against a genome with affine gap penalty, splicing and frameshift.
        It is primarily intended for annotating protein-coding genes in a new species using known genes from other species.

        While an index of the genome to be mapped to can be built "on the fly", the Miniprot index tool can pre-index a genome
        and will result in faster performance if the genome index is reused multiple times.

        For details of the algorithm and some insight into how parameters can be tuned see this overview_.

        .. _miniprot: https://github.com/lh3/miniprot
        .. _overview: https://github.com/lh3/miniprot#algorithm-overview
    ]]></help>
    <citations>
        <citation type="bibtex"><![CDATA[
            @misc{Li2022,
                author = {Li, Heng},
                title = {miniprot},
                year = {2022},
                publisher = {GitHub},
                journal = {GitHub repository},
                howpublished = {\url{https://github.com/lh3/miniprot}},
                commit = {b442b7a6b60dbd15f460ea9af75fa0b7293d4a8c}
              }
        ]]></citation>
    </citations>
</tool>
author	iuc
date	Fri, 23 Sep 2022 22:35:23 +0000
parents	ef712a5e9834
children	d518cf04b55c