Mercurial > repos > yating-l > ucsc_blat

diff blat.xml @ 11:2a89f630fa85 draft
planemo upload commit 3bb07d25ab817c936018d57b6d81f728915cfadf
author: iuc
date: Fri, 02 Dec 2022 09:35:54 +0000
parents: c449963debd5
children: e79965d0351c
--- a/blat.xml	Mon Nov 21 11:12:14 2022 +0000
+++ b/blat.xml	Fri Dec 02 09:35:54 2022 +0000
@@ -3,6 +3,23 @@
     <macros>
         <token name="@TOOL_VERSION@">377</token>
         <token name="@VERSION_SUFFIX@">0</token>
+
+        <xml name="mask_cond" tokens="maskarg,label,help"><!-- Reusable masking selector: expands to a conditional named @MASKARG@_type whose select offers no masking, lower, upper or a RepeatMasker file; only the file.out branch adds the @MASKARG@_file dataset input. -->
+            <conditional name="@MASKARG@_type">
+                <param  argument="-@MASKARG@" type="select" label="@LABEL@" help="@HELP@">
+                    <option value="" selected="true">No masking</option>
+                    <option value="lower">lower - mask out lower-cased sequence</option>
+                    <option value="upper">upper - mask out upper-cased sequence</option>
+                    <option value="file.out">out - mask database according to RepeatMasker out</option>
+                </param>
+                <when value="" />
+                <when value="lower" />
+                <when value="upper" />
+                <when value="file.out">
+                    <param name="@MASKARG@_file" type="data" format="txt" label="RepeatMasker file.out" />
+                </when>
+            </conditional>
+        </xml>
     </macros>
     <xrefs>
         <xref type="bio.tools">blat</xref>
@@ -28,31 +45,60 @@
     blat
         -q=$query_type
         -t=$database_type
-        $oneOff
-        #if str($minScore)
-            -minScore=$minScore
+        ## Basic alignment parameters
+        #if str($basic_align.minScore)
+            -minScore=$basic_align.minScore
+        #end if
+        #if str($basic_align.minIdentity)
+            -minIdentity=$basic_align.minIdentity
+        #end if
+        $basic_align.trimT
+        $basic_align.noTrimA
+        $basic_align.trimHardA
+        $basic_align.fastMap
+        $basic_align.fine
+        #if str($basic_align.maxIntron)
+            -maxIntron=$basic_align.maxIntron
+        #end if
+        $basic_align.extendThroughN
+        ## Advanced alignment parameters
+        #if str($adv_align.tileSize)
+            -tileSize=$adv_align.tileSize
+        #end if
+        #if str($adv_align.stepSize)
+            -stepSize=$adv_align.stepSize
         #end if
-        -maxGap=$maxGap
-        #if str($repMatch)
-            -repMatch=$repMatch
+        $adv_align.oneOff
+        #if str($adv_align.minMatch)
+            -minMatch=$adv_align.minMatch
+        #end if
+        -maxGap=$adv_align.maxGap
+        #if str($adv_align.repMatch)
+            -repMatch=$adv_align.repMatch
+        #end if
+        ## Repeat masking parameters (flags mirror the param argument names: -mask, -qMask, -repeats)
+        #if $repeat.mask_type.mask == "file.out":
+            -mask='$repeat.mask_type.mask_file'
+        #elif $repeat.mask_type.mask:
+            -mask=$repeat.mask_type.mask
         #end if
-        #if $mask_type.mask == "file.out":
-            -mask='$mask_type.mask_file'
-        #else:
-            -mask=$mask_type.mask
+        #if $repeat.qMask_type.qMask == "file.out":
+            -qMask='$repeat.qMask_type.qMask_file'
+        #elif $repeat.qMask_type.qMask:
+            -qMask=$repeat.qMask_type.qMask
         #end if
+        #if $repeat.repeats_type.repeats == "file.out":
+            -repeats='$repeat.repeats_type.repeats_file'
+        #elif $repeat.repeats_type.repeats:
+            -repeats=$repeat.repeats_type.repeats
+        #end if
+        #if str($repeat.minRepDivergence)
+            -minRepDivergence=$repeat.minRepDivergence
+        #end if
+        
         #if str($dots)
             -dots=$dots
         #end if
-        $trimT
-        $noTrimA
-        $trimHardA
-        $fastMap
-        $fine
-        #if str($maxIntron)
-            -maxIntron=$maxIntron
-        #end if
-        $extendThroughN
         '$reference_fasta_filename'
         '$query'
         -out=$out
@@ -67,54 +113,56 @@
             <when value="cached">
                 <param name="database" type="select" label="Select database">
                     <options from_data_table="all_fasta">
+                        <!-- <column name="name" index="0"/>
+                        <column name="value" index="2"/> -->
                         <filter type="sort_by" column="2" />
                     </options>
                     <validator type="no_options" message="A built-in database is not available" />
                 </param>
             </when>
             <when value="history">
-                <param name="database" type="data" format="fasta, twobit" label="Using database file, either a .fa, .nib or .2bit file" />
+                <param name="database" type="data" format="fasta,twobit" label="Using database file, either a .fa, .nib or .2bit file" />
             </when>
         </conditional>
         <param name="query" type="data" format="fasta, twobit" label="Query data, either a .fa, .nib or .2bit file"/>
         <param argument="-t" name="database_type" type="select" format="txt" multiple="false" label="database type" help="Choose your database type, the default is dnax">
-            <option value="dna">dna - DNA sequence</option>
+            <option value="dna" selected="true">dna - DNA sequence</option>
             <option value="prot">prot - protein sequence</option>
-            <option value="dnax" selected="true">dnax - DNA sequence translated in six frames to protein</option>
+            <option value="dnax">dnax - DNA sequence translated in six frames to protein</option>
         </param>
         <param argument="-q" name="query_type" type="select" format="txt" multiple="false" label="query type" help="Choose your query type, the default is rnax">
-            <option value="dna">dna - DNA sequence </option>
+            <option value="dna" selected="true">dna - DNA sequence </option>
             <option value="rna">rna - RNA sequence</option>
             <option value="prot">prot - protein sequence</option>
             <option value="dnax">dnax - DNA sequence translated in six frames to protein</option>
-            <option value="rnax" selected="true">rnax - DNA sequence translated in three frames to protein</option>
+            <option value="rnax">rnax - DNA sequence translated in three frames to protein</option>
         </param>
-        <param argument="-oneOff" type="boolean" truevalue="-oneOff=1" falsevalue="" label="If set, this allows one mismatch in tile and still triggers an alignments" />
-        <param argument="-minScore" type="integer" value="30" label="Minimum score" help="It is the matches minus the mismatches minus some sort of gap penalty" />
-        <param argument="-maxGap" type="integer" value="2" min="0" max="3" label="Maximum gap between tiles in a clump" help="Usually set from 0 to 3. Only relevant for minMatch > 1" />
-        <param argument="-repMatch" type="integer" value="" optional="true" label="Number of repetitions of a tile allowed before it is marked as overused" help="Typically this is 256 for tileSize 12, 1024 for tileSize 11, 4096 for tileSize 10. Also affected by stepSize. When stepSize is halved repMatch is doubled to compensate" />
-        <conditional name="mask_type">
-            <param  argument="-mask" type="select" label="Mask out repeats" help="Alignments won't be started in masked region but may extend through it in nucleotide searches. Masked areas are ignored entirely in protein or translated searches. Default is lower">
-                <option value="lower" selected="true">lower - mask out lower-cased sequence</option>
-                <option value="upper">upper - mask out upper-cased sequence</option>
-                <option value="out">out - mask according to database.out RepeatMasker .out file</option>
-                <option value="file.out">file.out - mask database according to RepeatMasker file.out</option>
-            </param>
-            <when value="lower" />
-            <when value="upper" />
-            <when value="out" />
-            <when value="file.out">
-                <param name="mask_file" type="data" format="txt" label="RepeatMasker file.out" />
-            </when>
-        </conditional>
+        <section name="basic_align" title="Alignment parameters" expanded="true">
+            <param argument="-minScore" type="integer" value="30" label="Minimum score" help="It is the matches minus the mismatches minus some sort of gap penalty" />
+            <param argument="-minIdentity" type="integer" value="" optional="true" min="0" max="100" label="Minimum sequence identity (in percent)" help="Default is 90 for nucleotide searches, 25 for protein or translated protein searches" />
+            <param argument="-trimT" type="boolean" truevalue="-trimT" falsevalue="" label="Trim leading poly-T" />
+            <param argument="-noTrimA" type="boolean" truevalue="-noTrimA" falsevalue="" label="Don't trim trailing poly-A" />
+            <param argument="-trimHardA" type="boolean" truevalue="-trimHardA" falsevalue="" label="Remove poly-A tail from qSize and alignments in .psl output" />
+            <param argument="-fastMap" type="boolean" truevalue="-fastMap" falsevalue="" label="Run for fast DNA/DNA remapping" help="It does not allow introns and require high %ID. Query sizes must not exceed 5000" />
+            <param argument="-fine" type="boolean" truevalue="-fine" falsevalue="" label="Refine search for small initial and terminal exons" help="For high-quality mRNAs. Not recommended for ESTs" />
+            <param argument="-maxIntron" type="integer" value="750000" optional="true" label="Maximum intron size" />
+            <param argument="-extendThroughN" type="boolean" truevalue="-extendThroughN" falsevalue="" label="Allow extension of alignment through large blocks of N's" />
+        </section>
+        <section name="adv_align" title="Advanced alignment parameters" expanded="false">
+            <param argument="-tileSize" type="integer" value="" optional="true" min="1" label="Tile size" help="Sets the size of match that triggers an alignment. Usually between 8 and 12. Default is 11 for DNA and 5 for protein" />
+            <param argument="-stepSize" type="integer" value="" optional="true" min="1" label="Spacing between tiles" help="Default is tileSize" />
+            <param argument="-oneOff" type="boolean" truevalue="-oneOff=1" falsevalue="" label="If set, this allows one mismatch in tile and still triggers an alignments" />
+            <param argument="-minMatch" type="integer" value="" optional="true" min="1" label="Minimum number of tile matches" help="Usually set from 2 to 4. Default is 2 for nucleotide, 1 for protein." />
+            <param argument="-maxGap" type="integer" value="2" min="0" max="3" label="Maximum gap between tiles in a clump" help="Usually set from 0 to 3. Only relevant for minMatch > 1" />
+            <param argument="-repMatch" type="integer" value="" optional="true" label="Number of repetitions of a tile allowed before it is marked as overused" help="Typically this is 256 for tileSize 12, 1024 for tileSize 11, 4096 for tileSize 10. Also affected by stepSize. When stepSize is halved repMatch is doubled to compensate" />
+        </section>
+        <section name="repeat" title="Repeat masking parameters" expanded="true"><!-- Each mask_cond expansion creates a conditional named after its maskarg plus _type (mask_type, qMask_type, repeats_type), which the command template reads via $repeat. -->
+            <expand macro="mask_cond" maskarg="mask" label="Mask out repeats" help="Alignments won't be started in masked region but may extend through it in nucleotide searches. Masked areas are ignored entirely in protein or translated searches. Default is no masking"/>
+            <expand macro="mask_cond" maskarg="qMask" label="Mask out repeats in query sequence" help="Analogous to -mask, but for the query sequence"/>
+            <expand macro="mask_cond" maskarg="repeats" label="Report matches in repeats separately" help="Repeat bases will not be masked in any way, but matches in repeat areas will be reported separately from matches in other areas in the output"/>
+            <param argument="-minRepDivergence" type="integer" value=""  min="0" max="100" optional="true" label="Minimum divergence of repeats (percent)" help="to allow them to be unmasked.  Default is 15.  Only relevant for masking using RepeatMasker .out files" />
+        </section>
         <param argument="-dots" type="integer" value="" optional="true" label="Output a dot every N sequences in log" help="Dots show program's progress" />
-        <param argument="-trimT" type="boolean" truevalue="-trimT" falsevalue="" label="Trim leading poly-T" />
-        <param argument="-noTrimA" type="boolean" truevalue="-noTrimA" falsevalue="" label="Don't trim trailing poly-A" />
-        <param argument="-trimHardA" type="boolean" truevalue="-trimHardA" falsevalue="" label="Remove poly-A tail from qSize and alignments in .psl output" />
-        <param argument="-fastMap" type="boolean" truevalue="-fastMap" falsevalue="" label="Run for fast DNA/DNA remapping" help="It does not allow introns and require high %ID. Query sizes must not exceed 5000" />
-        <param argument="-fine" type="boolean" truevalue="-fine" falsevalue="" label="Refine search for small initial and terminal exons" help="For high-quality mRNAs. Not recommended for ESTs" />
-        <param argument="-maxIntron" type="integer" value="750000" optional="true" label="Maximum intron size" />
-        <param argument="-extendThroughN" type="boolean" truevalue="-extendThroughN" falsevalue="" label="Allow extension of alignment through large blocks of N's" />
         <param name="out" type="select" label="Select output file format (-out)">
             <option value="psl">Tab-separated format, no sequence (psl)</option>
             <option value="psl -noHead">Tab-separated format, no sequence, no header (psl -noHead)</option>
@@ -129,44 +177,66 @@
     </inputs>
     <outputs>
         <data name="output" format="tabular" label="${tool.name} on ${on_string}">
-            <change_format>
+            <change_format><!-- add test -->
                 <when input="out" value="axt" format="axt" />
                 <when input="out" value="maf" format="maf" />
                 <when input="out" value="sim4" format="txt" />
-                <when input="out" value="wublast" format="tabular" />
-                <when input="out" value="blast" format="tabular" />
             </change_format>
         </data>
     </outputs>
     <tests>
         <!-- test on query of GenBank RefSeq records for Gallus gallus and database of Amazona vittata -->
         <test>
-            <param name="reference_source_selector" value="history" />
-            <param name="database" value="amaVit1_Gallus/amaVit1.fa" />
-            <param name="query" value="amaVit1_Gallus/Gallus_gallus_RefSeq.fa" />
+            <conditional name="reference_source">
+                <param name="reference_source_selector" value="history" />
+                <param name="database" value="amaVit1_Gallus/amaVit1.fa" ftype="fasta" />
+            </conditional>
+            <param name="query" value="amaVit1_Gallus/Gallus_gallus_RefSeq.fa" ftype="fasta" />
             <param name="database_type" value="dnax" />
             <param name="query_type" value="rnax" />
-            <param name="mask" value="lower" />
-            <param name="out" value="psl -noHead" />
-            <output name="output" value="amaVit1_Gallus/amaVit1_Gallus_gallus_sorted.psl"  sort="true"/>
+            <conditional name="mask_type">
+                <param name="mask" value="lower" />
+            </conditional>
+            <param name="out" value="maf" />
+            <output name="output" value="amaVit1_Gallus/amaVit1_Gallus_gallus_sorted.maf" ftype="maf"/>
+            <assert_command>
+                <has_text text="-tileSize=" negate="true"/>
+                <has_text text="-stepSize=" negate="true"/>
+                <has_text text="-mask=lower"/>
+            </assert_command>
         </test>
-        <!-- test on query of partial mRNA of Drosophila melanogaster and the database of Drosophila biamipes dot chromosome -->
+        <!-- test on query of partial mRNA of Drosophila melanogaster and the 
+            database of Drosophila biarmipes dot chromosome
+            - also test cached reference -->
         <test>
-            <param name="reference_source_selector" value="history" />
-            <param name="database" value="dbia3/dbia3.fa" />
-            <param name="query" value="dbia3/dmel-transcript.fa" />
+            <conditional name="reference_source">
+                <param name="reference_source_selector" value="cached"/>
+                <param name="database" value="dbdia display name"/>
+            </conditional>
+            <param name="query" value="dbia3/dmel-transcript.fa" ftype="fasta" />
             <param name="database_type" value="dnax" />
             <param name="query_type" value="rnax" />
-            <param name="mask" value="lower" />
+            <section name="basic_align">
+                <param name="maxIntron" value="" />
+            </section>
+            <section name="adv_align">
+                <param name="tileSize" value="5"/><!--explicitly set default .. to check if it is on the CL-->
+                <param name="stepSize" value="5"/><!--explicitly set default .. to check if it is on the CL-->
+            </section>
             <param name="out" value="psl -noHead" />
-            <param name="maxIntron" value="" />
-            <output name="output" value="dbia3/dbia3.sorted.psl" sort="true"/>
+            <output name="output" value="dbia3/dbia3.sorted.psl" ftype="tabular" sort="true"/>
+            <assert_command>
+                <has_text text="-tileSize=5"/>
+                <has_text text="-mask" negate="true"/>
+            </assert_command>
         </test>
         <!-- test on the database masked by repeat masker -->
         <test>
-            <param name="reference_source_selector" value="history" />
-            <param name="database" value="dbia3/dbia3_masked.2bit" />
-            <param name="query" value="dbia3/dmel-transcript.fa" />
+            <conditional name="reference_source">
+                <param name="reference_source_selector" value="history" />
+                <param name="database" value="dbia3/dbia3_masked.2bit" ftype="twobit" />
+            </conditional>
+            <param name="query" value="dbia3/dmel-transcript.fa" ftype="fasta"/>
             <param name="database_type" value="dnax" />
             <param name="query_type" value="rnax" />
             <param name="oneOff" value="false" />
@@ -177,35 +247,87 @@
             <param name="fine" value="false" />
             <param name="maxIntron" value="750000" />
             <param name="extendThroughN" value="false" />
-            <param name="mask" value="file.out" />
-            <param name="mask_file" value="dbia3/dbia3_RM.out" />
-            <param name="out" value="psl -noHead" />
+            <conditional name="mask_type">
+                <param name="mask" value="file.out" />
+                <param name="mask_file" value="dbia3/dbia3_RM.out" />
+            </conditional>
+            <param name="out" value="psl" ftype="tabular" />
             <output name="output" value="dbia3/dbia3_masked.sorted.psl"/>
+            <assert_command>
+                <has_text text="-tileSize=" negate="true"/>
+                <has_text text="-stepSize=" negate="true"/>
+                <has_text text="-mask='/"/>
+            </assert_command>
         </test>
     </tests>
     <help>
         <![CDATA[
 BLAT
 ====
-BLAT is a bioinformatics software a tool which performs rapid mRNA/DNA and cross-species protein alignments.
+BLAT is a bioinformatics software tool which performs rapid sequence alignments (mRNA/DNA and cross-species protein).
+It is designed to find sequences that are highly similar and have a certain minimum length. With the default settings this means
+
+- >95% similarity and a minimum length of 25 bases for nucleotide sequences
+- >80% similarity and a minimum length of 20 amino acids for proteins
 
-blat (version: v36)- Standalone blat sequence search command line tool.
--------------------------------------------------------------------------
+More divergent or shorter sequence alignments may be missed.
+The algorithm works in two phases: 
+
+1. Search phase: find regions of probable homology using an index of the reference sequence
+2. Alignment phase: Detailed Alignment of the sequences in these regions
+
+Search phase
+++++++++++++
 
-usage:
-++++++
+Builds an index of the reference containing the nonoverlapping K-mers and their
+positions (by default, can be changed using `-tileSize` and `-stepSize`).  Hits,
+i.e. exactly matching k-mers in query and reference, are then found by looking
+up each overlapping K-mer of the query sequence.  By enabling `-oneOff` the
+algorithm allows for a single substitution. Note that this increases the run
+time of this phase significantly.
 
-  $ blat database query [-ooc=11.ooc] output.psl
+The hits are then split into buckets of 64k (based on the database position)
+and sorted on the diagonal (database minus query positions). Hits within the
+gap limit form so called proto-clumps. Those are then sorted by database position
+and put into clumps if they are within the window limit (wrt database coordinate).
+
+Clumps with fewer than the minimum number of hits are discarded (-minMatch) and
+those within 300 bases or 100 amino acids in the database are merged together.
+The resulting clumps define regions of the database which are homologous to the
+query sequence which are then aligned.
 
-where:
-   database and query are each either a .fa, .nib or .2bit file,
-   or a list of these files with one file name per line.
-   -ooc=11.ooc tells the program to load over-occurring 11-mers from
-   an external file.  This will increase the speed
-   by a factor of 40 in many cases, but is not required.
-   output.psl is the name of the output file.
+Alignment phase
++++++++++++++++
+
+The alignment is performed differently for nucleotide and
+amino acid sequences.
+
+**Alignment for nucleotide sequences**: A hit list (exactly matching k-mers) for
+the query and the homologous region of the database is generated. If necessary
+hits are made unique by extending them until they are unique or have a maximum
+size. The hits are then extended maximally allowing no mismatches, and overlapping
+hits are merged.
+Subsequent (wrt query and reference) extended hits are then linked in an
+alignment. If there are gaps in query and reference, the algorithm recurses
+using a smaller value for k until no additional hits are found or gaps are
+smaller than 6 bases.
 
-documentation:
+**Protein Alignments**: The hits from the search stage are extended into maximally
+scoring ungapped alignments (HSPs) (match cost 2 and mismatch cost 1). The HSPs
+are organized in a directed graph where an edge connects HSPs A and B if A starts
+before B wrt query and database coordinates. The weight of the edge is then
+defined as the score of B minus a gap penalty based on the distance between A
+and B (overlapping HSPs are treated differently, see Kent 2002).  The maximal
+scoring alignment is then determined as the maximum weight path through the
+graph and the HSPs of this path are removed. This is repeated until no HSPs are
+left.
+
+**Stitching and Filling In**:
+In order to also find alignments of genes scattered across multiple homologous
+regions that have been determined in the search phase a variation of the
+alignment algorithm for proteins is employed. For details see Kent 2002.
+
+Documentation:
 ++++++++++++++
 
 See Blat documentation (http://genome.ucsc.edu/goldenPath/help/blatSpec.html)
author	iuc
date	Fri, 02 Dec 2022 09:35:54 +0000
parents	c449963debd5
children	e79965d0351c