view blat.xml @ 12:e79965d0351c draft default tip

planemo upload commit 6358ce7c332f42526423d365f06516983ca677a0
author iuc
date Sat, 03 Dec 2022 10:38:40 +0000
parents 2a89f630fa85
children
line wrap: on
line source

<tool id="ucsc_blat" name="UCSC BLAT Alignment Tool" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
    <description>BLAST-like sequence alignment tool</description>
    <macros>
        <!-- Version of the wrapped ucsc-blat package; also used in the tool version string. -->
        <token name="@TOOL_VERSION@">377</token>
        <!-- Wrapper revision appended to the tool version string. -->
        <token name="@VERSION_SUFFIX@">1</token>

        <!--
            mask_cond: reusable conditional for one masking option.
            Tokens:
              maskarg  option name without the leading dash; it names the
                       select parameter, the conditional (maskarg_type) and
                       the RepeatMasker file parameter (maskarg_file)
              label    label shown for the select parameter
              help     help text shown for the select parameter
            Only the "file.out" choice asks for an additional dataset.
        -->
        <xml name="mask_cond" tokens="maskarg,label,help">
            <conditional name="@MASKARG@_type">
                <param argument="-@MASKARG@"
                       type="select"
                       label="@LABEL@"
                       help="@HELP@">
                    <option selected="true" value="">No masking</option>
                    <option value="lower">lower - mask out lower-cased sequence</option>
                    <option value="upper">upper - mask out upper-cased sequence</option>
                    <option value="file.out">out - mask database according to RepeatMasker out</option>
                </param>
                <when value=""/>
                <when value="lower"/>
                <when value="upper"/>
                <when value="file.out">
                    <param name="@MASKARG@_file"
                           type="data"
                           format="txt"
                           label="RepeatMasker file.out"/>
                </when>
            </conditional>
        </xml>
    </macros>
    <!-- Cross-reference to the bio.tools registry entry for this tool. -->
    <xrefs>
        <xref type="bio.tools">blat</xref>
    </xrefs>
    <!-- Software dependency: the ucsc-blat package, pinned to @TOOL_VERSION@. -->
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">ucsc-blat</requirement>
    </requirements>
    <!--
        Command template (Cheetah).
        1. Stage the database and the query under file names whose extension
           tells blat the file type (.fa, .fa.gz, .2bit); a cached database is
           used through the path stored in the data table.
        2. Run blat. Optional numeric parameters are only put on the command
           line when the user supplied a value, so blat's own defaults apply
           otherwise. Boolean parameters expand to their flag or to nothing.
        A non-zero exit code marks the job as failed.
    -->
    <command detect_errors="exit_code"><![CDATA[
    #if str($reference_source.reference_source_selector) == "history":
        ## blat depends on file extension
        #if $reference_source.database.is_of_type("fasta.gz"):
            #set $reference_fasta_filename = "localref.fa.gz"
        #elif $reference_source.database.is_of_type("fasta"):
            #set $reference_fasta_filename = "localref.fa"
        #elif $reference_source.database.is_of_type("twobit"):
            #set $reference_fasta_filename = "localref.2bit"
        #else
            #set $reference_fasta_filename = "localref"
        #end if
        ln -s '$reference_source.database' '$reference_fasta_filename' &&
    #else:
        ## cached database: use the path recorded in the data table entry
        #set $reference_fasta_filename = str($reference_source.database.fields.path)
    #end if

    ## blat depends on file extension
    #if $query.is_of_type("fasta.gz"):
        #set $query_filename = "query.fa.gz"
    #elif $query.is_of_type("fasta"):
        #set $query_filename = "query.fa"
    #elif $query.is_of_type("twobit"):
        #set $query_filename = "query.2bit"
    #else
        #set $query_filename = "query"
    #end if
    ln -s '$query' '$query_filename' &&

    blat
        -q=$query_type
        -t=$database_type
        ## Basic alignment parameters
        #if str($basic_align.minScore)
            -minScore=$basic_align.minScore
        #end if
        #if str($basic_align.minIdentity)
            -minIdentity=$basic_align.minIdentity
        #end if
        $basic_align.trimT
        $basic_align.noTrimA
        $basic_align.trimHardA
        $basic_align.fastMap
        $basic_align.fine
        #if str($basic_align.maxIntron)
            -maxIntron=$basic_align.maxIntron
        #end if
        $basic_align.extendThroughN
        ## Advanced alignment parameters
        #if str($adv_align.tileSize)
            -tileSize=$adv_align.tileSize
        #end if
        #if str($adv_align.stepSize)
            -stepSize=$adv_align.stepSize
        #end if
        $adv_align.oneOff
        #if str($adv_align.minMatch)
            -minMatch=$adv_align.minMatch
        #end if
        -maxGap=$adv_align.maxGap
        #if str($adv_align.repMatch)
            -repMatch=$adv_align.repMatch
        #end if
        ## Repeat masking parameters
        ## For each option: "file.out" passes the RepeatMasker dataset,
        ## lower/upper pass the keyword, the empty value passes nothing.
        #if $repeat.mask_type.mask == "file.out":
            -mask='$repeat.mask_type.mask_file'
        #elif $repeat.mask_type.mask:
            -mask=$repeat.mask_type.mask
        #end if
        ## the blat option is spelled qMask (capital M) in both branches
        #if $repeat.qMask_type.qMask == "file.out":
            -qMask='$repeat.qMask_type.qMask_file'
        #elif $repeat.qMask_type.qMask:
            -qMask=$repeat.qMask_type.qMask
        #end if
        #if $repeat.repeats_type.repeats == "file.out":
            -repeats='$repeat.repeats_type.repeats_file'
        #elif $repeat.repeats_type.repeats:
            -repeats=$repeat.repeats_type.repeats
        #end if
        #if str($repeat.minRepDivergence)
            -minRepDivergence=$repeat.minRepDivergence
        #end if

        #if str($dots)
            -dots=$dots
        #end if
        '$reference_fasta_filename'
        '$query_filename'
        ## left unquoted on purpose: the value "psl -noHead" must expand to two arguments
        -out=$out
        '$output'
    ]]></command>
    <inputs>
        <!-- Database (target) selection: either an entry of the all_fasta data
             table or a dataset from the history. -->
        <conditional name="reference_source">
            <param name="reference_source_selector"
                   type="select"
                   label="Choose the source for the database">
                <option value="cached">Locally cached</option>
                <option value="history">History</option>
            </param>
            <when value="cached">
                <param name="database"
                       type="select"
                       label="Select database">
                    <options from_data_table="all_fasta">
                        <!-- <column name="name" index="0"/>
                        <column name="value" index="2"/> -->
                        <filter column="2" type="sort_by"/>
                    </options>
                    <validator type="no_options"
                               message="A built-in database is not available"/>
                </param>
            </when>
            <when value="history">
                <param name="database"
                       type="data"
                       format="fasta,fasta.gz,twobit"
                       label="Using database file, either a fasta, fasta.gz or twobit dataset"/>
            </when>
        </conditional>
        <!-- Query sequences; the command stages them under an extension matching the datatype. -->
        <param name="query" type="data" format="fasta,fasta.gz,twobit" label="Query data, either a fasta, fasta.gz or twobit dataset"/>
        <!-- Sequence type of the database (-t). "dna" is preselected, so the help states dna as the default. -->
        <param argument="-t" name="database_type" type="select" format="txt" multiple="false" label="database type" help="Choose your database type, the default is dna">
            <option value="dna" selected="true">dna - DNA sequence</option>
            <option value="prot">prot - protein sequence</option>
            <option value="dnax">dnax - DNA sequence translated in six frames to protein</option>
        </param>
        <!-- Sequence type of the query (-q). "dna" is preselected, so the help states dna as the default. -->
        <param argument="-q" name="query_type" type="select" format="txt" multiple="false" label="query type" help="Choose your query type, the default is dna">
            <option value="dna" selected="true">dna - DNA sequence </option>
            <option value="rna">rna - RNA sequence</option>
            <option value="prot">prot - protein sequence</option>
            <option value="dnax">dnax - DNA sequence translated in six frames to protein</option>
            <option value="rnax">rnax - DNA sequence translated in three frames to protein</option>
        </param>
        <!-- Basic alignment options. Boolean parameters expand to their flag
             when checked and to an empty string otherwise; optional integers
             left empty are omitted from the command line. -->
        <section name="basic_align" title="Alignment parameters" expanded="true">
            <param argument="-minScore"
                   type="integer"
                   value="30"
                   label="Minimum score"
                   help="It is the matches minus the mismatches minus some sort of gap penalty"/>
            <param argument="-minIdentity"
                   type="integer"
                   value=""
                   optional="true"
                   min="0"
                   max="100"
                   label="Minimum sequence identity (in percent)"
                   help="Default is 90 for nucleotide searches, 25 for protein or translated protein searches"/>
            <param argument="-trimT"
                   type="boolean"
                   truevalue="-trimT"
                   falsevalue=""
                   label="Trim leading poly-T"/>
            <param argument="-noTrimA"
                   type="boolean"
                   truevalue="-noTrimA"
                   falsevalue=""
                   label="Don't trim trailing poly-A"/>
            <param argument="-trimHardA"
                   type="boolean"
                   truevalue="-trimHardA"
                   falsevalue=""
                   label="Remove poly-A tail from qSize and alignments in .psl output"/>
            <param argument="-fastMap"
                   type="boolean"
                   truevalue="-fastMap"
                   falsevalue=""
                   label="Run for fast DNA/DNA remapping"
                   help="It does not allow introns and require high %ID. Query sizes must not exceed 5000"/>
            <param argument="-fine"
                   type="boolean"
                   truevalue="-fine"
                   falsevalue=""
                   label="Refine search for small initial and terminal exons"
                   help="For high-quality mRNAs. Not recommended for ESTs"/>
            <param argument="-maxIntron"
                   type="integer"
                   value="750000"
                   optional="true"
                   label="Maximum intron size"/>
            <param argument="-extendThroughN"
                   type="boolean"
                   truevalue="-extendThroughN"
                   falsevalue=""
                   label="Allow extension of alignment through large blocks of N's"/>
        </section>
        <!-- Advanced (tile/seed) options. Optional integers left empty are
             omitted from the command line; maxGap is always passed. -->
        <section name="adv_align" title="Advanced alignment parameters" expanded="false">
            <param argument="-tileSize" type="integer" value="" optional="true" min="1" label="Tile size" help="Sets the size of match that triggers an alignment. Usually between 8 and 12. Default is 11 for DNA and 5 for protein" />
            <param argument="-stepSize" type="integer" value="" optional="true" min="1" label="Spacing between tiles" help="Default is tileSize" />
            <param argument="-oneOff" type="boolean" truevalue="-oneOff=1" falsevalue="" label="If set, this allows one mismatch in a tile and still triggers an alignment" />
            <param argument="-minMatch" type="integer" value="" optional="true" min="1" label="Minimum number of tile matches" help="Usually set from 2 to 4. Default is 2 for nucleotide, 1 for protein." />
            <param argument="-maxGap" type="integer" value="2" min="0" max="3" label="Maximum gap between tiles in a clump" help="Usually set from 0 to 3. Only relevant for minMatch > 1" />
            <param argument="-repMatch" type="integer" value="" optional="true" label="Number of repetitions of a tile allowed before it is marked as overused" help="Typically this is 256 for tileSize 12, 1024 for tileSize 11, 4096 for tileSize 10. Also affected by stepSize. When stepSize is halved repMatch is doubled to compensate" />
        </section>
        <!-- Repeat masking options. Each expand creates a conditional named
             after the option with a _type suffix (mask_type, qMask_type,
             repeats_type) holding the select and, for "file.out", a
             RepeatMasker dataset. -->
        <section name="repeat" title="Repeat masking parameters" expanded="true">
            <expand macro="mask_cond" maskarg="mask" label="Mask out repeats" help="Alignments won't be started in masked region but may extend through it in nucleotide searches. Masked areas are ignored entirely in protein or translated searches. Default is no masking"/>
            <expand macro="mask_cond" maskarg="qMask" label="Mask out repeats in query sequence" help="Analogous to -mask, but for the query sequence"/>
            <expand macro="mask_cond" maskarg="repeats" label="Report matches in repeats separately" help="Repeat bases will not be masked in any way, but matches in repeat areas will be reported separately from matches in other areas in the output"/>
            <param argument="-minRepDivergence" type="integer" value=""  min="0" max="100" optional="true" label="Minimum divergence of repeats (percent)" help="to allow them to be unmasked.  Default is 15.  Only relevant for masking using RepeatMasker .out files" />
        </section>
        <!-- Progress dots in the log; omitted from the command line when left empty. -->
        <param argument="-dots"
               type="integer"
               value=""
               optional="true"
               label="Output a dot every N sequences in log"
               help="Dots show program's progress"/>
        <!-- Output format. The chosen value is inserted after "-out=" on the
             command line and also drives the datatype of the output dataset. -->
        <param name="out"
               type="select"
               label="Select output file format (-out)">
            <option value="psl">Tab-separated format, no sequence (psl)</option>
            <option value="psl -noHead">Tab-separated format, no sequence, no header (psl -noHead)</option>
            <option value="axt">Blastz-associated axt format (axt)</option>
            <option value="maf">Multiz-associated maf format (maf)</option>
            <option value="sim4">Similar to sim4 format (sim4)</option>
            <option value="wublast">Similar to WU-BLAST format (wublast)</option>
            <option value="blast">Similar to NCBI BLAST format (blast)</option>
            <option value="blast8">NCBI BLAST tabular format (blast8)</option>
            <option value="blast9">NCBI BLAST tabular format with comments (blast9)</option>
        </param>
    </inputs>
    <outputs>
        <!-- Alignment result. The datatype stays tabular for the psl and
             blast8/blast9 choices and is switched for the formats that are
             not tab-separated: axt, maf, and the plain-text reports (sim4,
             wublast, blast). -->
        <data name="output" format="tabular" label="${tool.name} on ${on_string}">
            <change_format><!-- TODO: add tests covering the format switches -->
                <when input="out" value="axt" format="axt" />
                <when input="out" value="maf" format="maf" />
                <when input="out" value="sim4" format="txt" />
                <when input="out" value="wublast" format="txt" />
                <when input="out" value="blast" format="txt" />
            </change_format>
        </data>
    </outputs>
    <tests>
        <!-- Test parameters are nested in the same section/conditional
             structure as the tool inputs. -->

        <!-- test on query of GenBank RefSeq records for Gallus gallus and database of Amazona vittata -->
        <test>
            <conditional name="reference_source">
                <param name="reference_source_selector" value="history" />
                <param name="database" value="amaVit1_Gallus/amaVit1.fa" ftype="fasta" />
            </conditional>
            <param name="query" value="amaVit1_Gallus/Gallus_gallus_RefSeq.fa" ftype="fasta" />
            <param name="database_type" value="dnax" />
            <param name="query_type" value="rnax" />
            <section name="repeat">
                <conditional name="mask_type">
                    <param name="mask" value="lower" />
                </conditional>
            </section>
            <param name="out" value="maf" />
            <output name="output" value="amaVit1_Gallus/amaVit1_Gallus_gallus_sorted.maf" ftype="maf"/>
            <assert_command>
                <has_text text="-tileSize=" negate="true"/>
                <has_text text="-stepSize=" negate="true"/>
                <has_text text="-mask=lower"/>
            </assert_command>
        </test>
        <!-- test on query of partial mRNA of Drosophila melanogaster and the
            database of Drosophila biarmipes dot chromosome
            - also test cached reference -->
        <test>
            <conditional name="reference_source">
                <param name="reference_source_selector" value="cached"/>
                <param name="database" value="dbdia display name"/>
            </conditional>
            <param name="query" value="dbia3/dmel-transcript.fa" ftype="fasta" />
            <param name="database_type" value="dnax" />
            <param name="query_type" value="rnax" />
            <section name="basic_align">
                <param name="maxIntron" value="" />
            </section>
            <section name="adv_align">
                <param name="tileSize" value="5"/><!--explicitly set default .. to check if it is on the CL-->
                <param name="stepSize" value="5"/><!--explicitly set default .. to check if it is on the CL-->
            </section>
            <param name="out" value="psl -noHead" />
            <output name="output" value="dbia3/dbia3.sorted.psl" ftype="tabular" sort="true"/>
            <assert_command>
                <has_text text="-tileSize=5"/>
                <has_text text="-mask" negate="true"/>
            </assert_command>
        </test>
        <!-- test on the database masked by repeat masker -->
        <test>
            <conditional name="reference_source">
                <param name="reference_source_selector" value="history" />
                <param name="database" value="dbia3/dbia3_masked.2bit" ftype="twobit" />
            </conditional>
            <param name="query" value="dbia3/dmel-transcript.fa" ftype="fasta"/>
            <param name="database_type" value="dnax" />
            <param name="query_type" value="rnax" />
            <section name="basic_align">
                <param name="minScore" value="30" />
                <param name="trimT" value="false" />
                <param name="noTrimA" value="false" />
                <param name="fine" value="false" />
                <param name="maxIntron" value="750000" />
                <param name="extendThroughN" value="false" />
            </section>
            <section name="adv_align">
                <param name="oneOff" value="false" />
                <param name="maxGap" value="2" />
            </section>
            <section name="repeat">
                <conditional name="mask_type">
                    <param name="mask" value="file.out" />
                    <param name="mask_file" value="dbia3/dbia3_RM.out" />
                </conditional>
            </section>
            <param name="out" value="psl" />
            <output name="output" value="dbia3/dbia3_masked.sorted.psl" ftype="tabular"/>
            <assert_command>
                <has_text text="-tileSize=" negate="true"/>
                <has_text text="-stepSize=" negate="true"/>
                <has_text text="-mask='/"/>
            </assert_command>
        </test>
        <!-- tiny test data from https://davetang.org/muse/2012/05/15/using-blat/ -->
        <test>
            <conditional name="reference_source">
                <param name="reference_source_selector" value="history" />
                <param name="database" value="mini-db.fa.gz" ftype="fasta.gz" />
            </conditional>
            <param name="query" value="mini-query.fa.gz" ftype="fasta.gz"/>
            <section name="basic_align">
                <param name="minScore" value="0" />
            </section>
            <section name="adv_align">
                <param name="stepSize" value="1"/>
            </section>
            <param name="out" value="psl" />
            <output name="output" ftype="tabular">
                <assert_contents>
                    <has_n_lines n="7"/>
                </assert_contents>
            </output>
            <assert_command>
                <has_text text="-minScore=0"/>
                <has_text text="-stepSize=1"/>
            </assert_command>
        </test>
    </tests>
    <help>
        <![CDATA[
BLAT
====
BLAT is a bioinformatics software tool which performs rapid sequence alignments (mRNA/DNA and cross-species protein).
It is designed to find sequences of high similarity that have a certain minimum length. With the default settings this is:

- >95% similarity and a minimum length of 25 bases for nucleotide sequences
- >80% similarity and a minimum length of 20 amino acids for proteins

More divergent or shorter sequence alignments may be missed.
The algorithm works in two phases: 

1. Search phase: find regions of probable homology using an index of the reference sequence
2. Alignment phase: Detailed Alignment of the sequences in these regions

Search phase
++++++++++++

Builds an index of the reference containing the nonoverlapping K-mers and their
positions (by default, can be changed using `-tileSize` and `-stepSize`).  Hits,
i.e. exactly matching k-mers in query and reference, are then found by looking
up each overlapping K-mer of the query sequence.  By enabling `-oneOff` the
algorithm allows for a single substitution. Note that this increases the run
time of this phase significantly.

The hits are then split into buckets of 64k (based on the database position)
and sorted on the diagonal (database minus query positions). Hits within the
gap limit form so called proto-clumps. Those are then sorted by database position
and put into clumps if they are within the window limit (wrt database coordinate).

Clumps with less than the minimum number of hits are discarded (-minMatch) and
those within 300 bases or 100 amino acids in the database are merged together.
The resulting clumps define regions of the database which are homologous to the
query sequence which are then aligned.

Alignment phase
+++++++++++++++

The alignment is performed differently for nucleotide and
aminoacid sequences.

**Alignment for nucleotide sequences**: A hit list (exactly matching k-mers) for
the query and the homologous region of the database is generated. If necessary
hits are made unique by extending them until they are unique or have a maximum
size. The hits are then extended maximally allowing no mismatches, and overlapping
hits are merged.
Subsequent (wrt query and reference) extended hits are then linked in an
alignment. If there are gaps in query and reference, the algorithm recurses
using a smaller value for k until no additional hits are found or gaps are
smaller than 6 bases.

**Protein Alignments**: The hits from the search stage are extended into maximally
scoring ungapped alignments (HSPs) (match cost 2 and mismatch cost 1). The HSPs
are organized in a directed graph where an edge connects HSPs A and B if A starts
before B wrt query and database coordinates. The weight of the edge is then
defined as the score of B minus a gap penalty based on the distance between A
and B (overlapping HSPs are treated differently, see Kent 2002).  The maximal
scoring alignment is then determined as the maximum weight path through the
graph and the HSPs of this path are removed. This is repeated until no HSPs are
left.

**Stitching and Filling In**:
In order to find also alignments of genes scattered across multiple homologous
regions that have been determined in the search phase a variation of the
alignment algorithm for proteins is employed. For details see Kent 2002.

Documentation:
++++++++++++++

See Blat documentation (http://genome.ucsc.edu/goldenPath/help/blatSpec.html)

Source code:
++++++++++++

http://hgdownload.cse.ucsc.edu/admin/exe/

    ]]></help>
    <!-- Publication to cite when using this tool, referenced by DOI
         (presumably the "Kent 2002" paper mentioned in the help; confirm). -->
    <citations>
        <citation type="doi">10.1101/gr.229202</citation>
    </citations>
</tool>