comparison blat.xml @ 11:2a89f630fa85 draft

planemo upload commit 3bb07d25ab817c936018d57b6d81f728915cfadf
author iuc
date Fri, 02 Dec 2022 09:35:54 +0000
parents c449963debd5
children e79965d0351c
comparison
equal deleted inserted replaced
10:c449963debd5 11:2a89f630fa85
1 <tool id="ucsc_blat" name="UCSC BLAT Alignment Tool" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@"> 1 <tool id="ucsc_blat" name="UCSC BLAT Alignment Tool" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
2 <description>BLAST-like sequence alignment tool</description> 2 <description>BLAST-like sequence alignment tool</description>
3 <macros> 3 <macros>
4 <token name="@TOOL_VERSION@">377</token> 4 <token name="@TOOL_VERSION@">377</token>
5 <token name="@VERSION_SUFFIX@">0</token> 5 <token name="@VERSION_SUFFIX@">0</token>
6
7 <xml name="mask_cond" tokens="maskarg,label,help">
8 <conditional name="@MASKARG@_type">
9 <param argument="-@MASKARG@" type="select" label="@LABEL@" help="@HELP@">
10 <option value="" selected="true">No masking</option>
11 <option value="lower">lower - mask out lower-cased sequence</option>
12 <option value="upper">upper - mask out upper-cased sequence</option>
13 <option value="file.out">out - mask database according to RepeatMasker out</option>
14 </param>
15 <when value="" />
16 <when value="lower" />
17 <when value="upper" />
18 <when value="file.out">
19 <param name="@MASKARG@_file" type="data" format="txt" label="RepeatMasker file.out" />
20 </when>
21 </conditional>
22 </xml>
6 </macros> 23 </macros>
7 <xrefs> 24 <xrefs>
8 <xref type="bio.tools">blat</xref> 25 <xref type="bio.tools">blat</xref>
9 </xrefs> 26 </xrefs>
10 <requirements> 27 <requirements>
26 #end if 43 #end if
27 44
28 blat 45 blat
29 -q=$query_type 46 -q=$query_type
30 -t=$database_type 47 -t=$database_type
31 $oneOff 48 ## Basic alignment parameters
32 #if str($minScore) 49 #if str($basic_align.minScore)
33 -minScore=$minScore 50 -minScore=$basic_align.minScore
34 #end if 51 #end if
35 -maxGap=$maxGap 52 #if str($basic_align.minIdentity)
36 #if str($repMatch) 53 -minIdentity=$basic_align.minIdentity
37 -repMatch=$repMatch 54 #end if
38 #end if 55 $basic_align.trimT
39 #if $mask_type.mask == "file.out": 56 $basic_align.noTrimA
40 -mask='$mask_type.mask_file' 57 $basic_align.trimHardA
41 #else: 58 $basic_align.fastMap
42 -mask=$mask_type.mask 59 $basic_align.fine
43 #end if 60 #if str($basic_align.maxIntron)
61 -maxIntron=$basic_align.maxIntron
62 #end if
63 $basic_align.extendThroughN
64 ## Advanced alignment parameters
65 #if str($adv_align.tileSize)
66 -tileSize=$adv_align.tileSize
67 #end if
68 #if str($adv_align.stepSize)
69 -stepSize=$adv_align.stepSize
70 #end if
71 $adv_align.oneOff
72 #if str($adv_align.minMatch)
73 -minMatch=$adv_align.minMatch
74 #end if
75 -maxGap=$adv_align.maxGap
76 #if str($adv_align.repMatch)
77 -repMatch=$adv_align.repMatch
78 #end if
79 ## Repeat masking parameters: for -mask / -qMask / -repeats pass the RepeatMasker dataset when "file.out" is selected, the keyword otherwise, and nothing for "No masking"
80 #if $repeat.mask_type.mask == "file.out":
81 -mask='$repeat.mask_type.mask_file'
82 #elif $repeat.mask_type.mask:
83 -mask=$repeat.mask_type.mask
84 #end if
85 #if $repeat.qMask_type.qMask == "file.out":
86 -qMask='$repeat.qMask_type.qMask_file'
87 #elif $repeat.qMask_type.qMask:
88 -qMask=$repeat.qMask_type.qMask
89 #end if
90 #if $repeat.repeats_type.repeats == "file.out":
91 -repeats='$repeat.repeats_type.repeats_file'
92 #elif $repeat.repeats_type.repeats:
93 -repeats=$repeat.repeats_type.repeats
94 #end if
95 #if str($repeat.minRepDivergence)
96 -minRepDivergence=$repeat.minRepDivergence
97 #end if
98
44 #if str($dots) 99 #if str($dots)
45 -dots=$dots 100 -dots=$dots
46 #end if 101 #end if
47 $trimT
48 $noTrimA
49 $trimHardA
50 $fastMap
51 $fine
52 #if str($maxIntron)
53 -maxIntron=$maxIntron
54 #end if
55 $extendThroughN
56 '$reference_fasta_filename' 102 '$reference_fasta_filename'
57 '$query' 103 '$query'
58 -out=$out 104 -out=$out
59 '$output' 105 '$output'
60 ]]></command> 106 ]]></command>
65 <option value="history">History</option> 111 <option value="history">History</option>
66 </param> 112 </param>
67 <when value="cached"> 113 <when value="cached">
68 <param name="database" type="select" label="Select database"> 114 <param name="database" type="select" label="Select database">
69 <options from_data_table="all_fasta"> 115 <options from_data_table="all_fasta">
116 <!-- <column name="name" index="0"/>
117 <column name="value" index="2"/> -->
70 <filter type="sort_by" column="2" /> 118 <filter type="sort_by" column="2" />
71 </options> 119 </options>
72 <validator type="no_options" message="A built-in database is not available" /> 120 <validator type="no_options" message="A built-in database is not available" />
73 </param> 121 </param>
74 </when> 122 </when>
75 <when value="history"> 123 <when value="history">
76 <param name="database" type="data" format="fasta, twobit" label="Using database file, either a .fa, .nib or .2bit file" /> 124 <param name="database" type="data" format="fasta,twobit" label="Using database file, either a .fa, .nib or .2bit file" />
77 </when> 125 </when>
78 </conditional> 126 </conditional>
79 <param name="query" type="data" format="fasta, twobit" label="Query data, either a .fa, .nib or .2bit file"/> 127 <param name="query" type="data" format="fasta,twobit" label="Query data, either a .fa, .nib or .2bit file"/>
80 <param argument="-t" name="database_type" type="select" format="txt" multiple="false" label="database type" help="Choose your database type, the default is dnax"> 128 <param argument="-t" name="database_type" type="select" format="txt" multiple="false" label="database type" help="Choose your database type, the default is dna">
81 <option value="dna">dna - DNA sequence</option> 129 <option value="dna" selected="true">dna - DNA sequence</option>
82 <option value="prot">prot - protein sequence</option> 130 <option value="prot">prot - protein sequence</option>
83 <option value="dnax" selected="true">dnax - DNA sequence translated in six frames to protein</option> 131 <option value="dnax">dnax - DNA sequence translated in six frames to protein</option>
84 </param> 132 </param>
85 <param argument="-q" name="query_type" type="select" format="txt" multiple="false" label="query type" help="Choose your query type, the default is rnax"> 133 <param argument="-q" name="query_type" type="select" format="txt" multiple="false" label="query type" help="Choose your query type, the default is dna">
86 <option value="dna">dna - DNA sequence </option> 134 <option value="dna" selected="true">dna - DNA sequence </option>
87 <option value="rna">rna - RNA sequence</option> 135 <option value="rna">rna - RNA sequence</option>
88 <option value="prot">prot - protein sequence</option> 136 <option value="prot">prot - protein sequence</option>
89 <option value="dnax">dnax - DNA sequence translated in six frames to protein</option> 137 <option value="dnax">dnax - DNA sequence translated in six frames to protein</option>
90 <option value="rnax" selected="true">rnax - DNA sequence translated in three frames to protein</option> 138 <option value="rnax">rnax - DNA sequence translated in three frames to protein</option>
91 </param> 139 </param>
92 <param argument="-oneOff" type="boolean" truevalue="-oneOff=1" falsevalue="" label="If set, this allows one mismatch in tile and still triggers an alignments" /> 140 <section name="basic_align" title="Alignment parameters" expanded="true">
93 <param argument="-minScore" type="integer" value="30" label="Minimum score" help="It is the matches minus the mismatches minus some sort of gap penalty" /> 141 <param argument="-minScore" type="integer" value="30" label="Minimum score" help="It is the matches minus the mismatches minus some sort of gap penalty" />
94 <param argument="-maxGap" type="integer" value="2" min="0" max="3" label="Maximum gap between tiles in a clump" help="Usually set from 0 to 3. Only relevant for minMatch > 1" /> 142 <param argument="-minIdentity" type="integer" value="" optional="true" min="0" max="100" label="Minimum sequence identity (in percent)" help="Default is 90 for nucleotide searches, 25 for protein or translated protein searches" />
95 <param argument="-repMatch" type="integer" value="" optional="true" label="Number of repetitions of a tile allowed before it is marked as overused" help="Typically this is 256 for tileSize 12, 1024 for tileSize 11, 4096 for tileSize 10. Also affected by stepSize. When stepSize is halved repMatch is doubled to compensate" /> 143 <param argument="-trimT" type="boolean" truevalue="-trimT" falsevalue="" label="Trim leading poly-T" />
96 <conditional name="mask_type"> 144 <param argument="-noTrimA" type="boolean" truevalue="-noTrimA" falsevalue="" label="Don't trim trailing poly-A" />
97 <param argument="-mask" type="select" label="Mask out repeats" help="Alignments won't be started in masked region but may extend through it in nucleotide searches. Masked areas are ignored entirely in protein or translated searches. Default is lower"> 145 <param argument="-trimHardA" type="boolean" truevalue="-trimHardA" falsevalue="" label="Remove poly-A tail from qSize and alignments in .psl output" />
98 <option value="lower" selected="true">lower - mask out lower-cased sequence</option> 146 <param argument="-fastMap" type="boolean" truevalue="-fastMap" falsevalue="" label="Run for fast DNA/DNA remapping" help="It does not allow introns and require high %ID. Query sizes must not exceed 5000" />
99 <option value="upper">upper - mask out upper-cased sequence</option> 147 <param argument="-fine" type="boolean" truevalue="-fine" falsevalue="" label="Refine search for small initial and terminal exons" help="For high-quality mRNAs. Not recommended for ESTs" />
100 <option value="out">out - mask according to database.out RepeatMasker .out file</option> 148 <param argument="-maxIntron" type="integer" value="750000" optional="true" label="Maximum intron size" />
101 <option value="file.out">file.out - mask database according to RepeatMasker file.out</option> 149 <param argument="-extendThroughN" type="boolean" truevalue="-extendThroughN" falsevalue="" label="Allow extension of alignment through large blocks of N's" />
102 </param> 150 </section>
103 <when value="lower" /> 151 <section name="adv_align" title="Advanced alignment parameters" expanded="false">
104 <when value="upper" /> 152 <param argument="-tileSize" type="integer" value="" optional="true" min="1" label="Tile size" help="Sets the size of match that triggers an alignment. Usually between 8 and 12. Default is 11 for DNA and 5 for protein" />
105 <when value="out" /> 153 <param argument="-stepSize" type="integer" value="" optional="true" min="1" label="Spacing between tiles" help="Default is tileSize" />
106 <when value="file.out"> 154 <param argument="-oneOff" type="boolean" truevalue="-oneOff=1" falsevalue="" label="If set, this allows one mismatch in tile and still triggers an alignments" />
107 <param name="mask_file" type="data" format="txt" label="RepeatMasker file.out" /> 155 <param argument="-minMatch" type="integer" value="" optional="true" min="1" label="Minimum number of tile matches" help="Usually set from 2 to 4. Default is 2 for nucleotide, 1 for protein." />
108 </when> 156 <param argument="-maxGap" type="integer" value="2" min="0" max="3" label="Maximum gap between tiles in a clump" help="Usually set from 0 to 3. Only relevant for minMatch > 1" />
109 </conditional> 157 <param argument="-repMatch" type="integer" value="" optional="true" label="Number of repetitions of a tile allowed before it is marked as overused" help="Typically this is 256 for tileSize 12, 1024 for tileSize 11, 4096 for tileSize 10. Also affected by stepSize. When stepSize is halved repMatch is doubled to compensate" />
158 </section>
159 <section name="repeat" title="Repeat masking parameters" expanded="true">
160 <expand macro="mask_cond" maskarg="mask" label="Mask out repeats" help="Alignments won't be started in masked region but may extend through it in nucleotide searches. Masked areas are ignored entirely in protein or translated searches. Default is no masking"/>
161 <expand macro="mask_cond" maskarg="qMask" label="Mask out repeats in query sequence" help="Analogous to -mask, but for the query sequence"/>
162 <expand macro="mask_cond" maskarg="repeats" label="Report matches in repeats separately" help="Repeat bases will not be masked in any way, but matches in repeat areas will be reported separately from matches in other areas in the output"/>
163 <param argument="-minRepDivergence" type="integer" value="" min="0" max="100" optional="true" label="Minimum divergence of repeats (percent)" help="to allow them to be unmasked. Default is 15. Only relevant for masking using RepeatMasker .out files" />
164 </section>
110 <param argument="-dots" type="integer" value="" optional="true" label="Output a dot every N sequences in log" help="Dots show program's progress" /> 165 <param argument="-dots" type="integer" value="" optional="true" label="Output a dot every N sequences in log" help="Dots show program's progress" />
111 <param argument="-trimT" type="boolean" truevalue="-trimT" falsevalue="" label="Trim leading poly-T" />
112 <param argument="-noTrimA" type="boolean" truevalue="-noTrimA" falsevalue="" label="Don't trim trailing poly-A" />
113 <param argument="-trimHardA" type="boolean" truevalue="-trimHardA" falsevalue="" label="Remove poly-A tail from qSize and alignments in .psl output" />
114 <param argument="-fastMap" type="boolean" truevalue="-fastMap" falsevalue="" label="Run for fast DNA/DNA remapping" help="It does not allow introns and require high %ID. Query sizes must not exceed 5000" />
115 <param argument="-fine" type="boolean" truevalue="-fine" falsevalue="" label="Refine search for small initial and terminal exons" help="For high-quality mRNAs. Not recommended for ESTs" />
116 <param argument="-maxIntron" type="integer" value="750000" optional="true" label="Maximum intron size" />
117 <param argument="-extendThroughN" type="boolean" truevalue="-extendThroughN" falsevalue="" label="Allow extension of alignment through large blocks of N's" />
118 <param name="out" type="select" label="Select output file format (-out)"> 166 <param name="out" type="select" label="Select output file format (-out)">
119 <option value="psl">Tab-separated format, no sequence (psl)</option> 167 <option value="psl">Tab-separated format, no sequence (psl)</option>
120 <option value="psl -noHead">Tab-separated format, no sequence, no header (psl -noHead)</option> 168 <option value="psl -noHead">Tab-separated format, no sequence, no header (psl -noHead)</option>
121 <option value="axt">Blastz-associated axt format (axt)</option> 169 <option value="axt">Blastz-associated axt format (axt)</option>
122 <option value="maf">Multiz-associated maf format (maf)</option> 170 <option value="maf">Multiz-associated maf format (maf)</option>
127 <option value="blast9">NCBI BLAST tabular format with comments (blast9)</option> 175 <option value="blast9">NCBI BLAST tabular format with comments (blast9)</option>
128 </param> 176 </param>
129 </inputs> 177 </inputs>
130 <outputs> 178 <outputs>
131 <data name="output" format="tabular" label="${tool.name} on ${on_string}"> 179 <data name="output" format="tabular" label="${tool.name} on ${on_string}">
132 <change_format> 180 <change_format><!-- add test -->
133 <when input="out" value="axt" format="axt" /> 181 <when input="out" value="axt" format="axt" />
134 <when input="out" value="maf" format="maf" /> 182 <when input="out" value="maf" format="maf" />
135 <when input="out" value="sim4" format="txt" /> 183 <when input="out" value="sim4" format="txt" />
136 <when input="out" value="wublast" format="tabular" />
137 <when input="out" value="blast" format="tabular" />
138 </change_format> 184 </change_format>
139 </data> 185 </data>
140 </outputs> 186 </outputs>
141 <tests> 187 <tests>
142 <!-- test on query of GenBank RefSeq records for Gallus gallus and database of Amazona vittata --> 188 <!-- test on query of GenBank RefSeq records for Gallus gallus and database of Amazona vittata -->
143 <test> 189 <test>
144 <param name="reference_source_selector" value="history" /> 190 <conditional name="reference_source">
145 <param name="database" value="amaVit1_Gallus/amaVit1.fa" /> 191 <param name="reference_source_selector" value="history" />
146 <param name="query" value="amaVit1_Gallus/Gallus_gallus_RefSeq.fa" /> 192 <param name="database" value="amaVit1_Gallus/amaVit1.fa" ftype="fasta" />
193 </conditional>
194 <param name="query" value="amaVit1_Gallus/Gallus_gallus_RefSeq.fa" ftype="fasta" />
147 <param name="database_type" value="dnax" /> 195 <param name="database_type" value="dnax" />
148 <param name="query_type" value="rnax" /> 196 <param name="query_type" value="rnax" />
149 <param name="mask" value="lower" /> 197 <conditional name="mask_type">
150 <param name="out" value="psl -noHead" /> 198 <param name="mask" value="lower" />
151 <output name="output" value="amaVit1_Gallus/amaVit1_Gallus_gallus_sorted.psl" sort="true"/> 199 </conditional>
200 <param name="out" value="maf" />
201 <output name="output" value="amaVit1_Gallus/amaVit1_Gallus_gallus_sorted.maf" ftype="maf"/>
202 <assert_command>
203 <has_text text="-tileSize=" negate="true"/>
204 <has_text text="-stepSize=" negate="true"/>
205 <has_text text="-mask=lower"/>
206 </assert_command>
152 </test> 207 </test>
153 <!-- test on query of partial mRNA of Drosophila melanogaster and the database of Drosophila biamipes dot chromosome --> 208 <!-- test on query of partial mRNA of Drosophila melanogaster and the
209 database of Drosophila biarmipes dot chromosome
210 - also test cached reference -->
154 <test> 211 <test>
155 <param name="reference_source_selector" value="history" /> 212 <conditional name="reference_source">
156 <param name="database" value="dbia3/dbia3.fa" /> 213 <param name="reference_source_selector" value="cached"/>
157 <param name="query" value="dbia3/dmel-transcript.fa" /> 214 <param name="database" value="dbdia display name"/>
215 </conditional>
216 <param name="query" value="dbia3/dmel-transcript.fa" ftype="fasta" />
158 <param name="database_type" value="dnax" /> 217 <param name="database_type" value="dnax" />
159 <param name="query_type" value="rnax" /> 218 <param name="query_type" value="rnax" />
160 <param name="mask" value="lower" /> 219 <section name="basic_align">
220 <param name="maxIntron" value="" />
221 </section>
222 <section name="adv_align">
223 <param name="tileSize" value="5"/><!--explicitly set default .. to check if it is on the CL-->
224 <param name="stepSize" value="5"/><!--explicitly set default .. to check if it is on the CL-->
225 </section>
161 <param name="out" value="psl -noHead" /> 226 <param name="out" value="psl -noHead" />
162 <param name="maxIntron" value="" /> 227 <output name="output" value="dbia3/dbia3.sorted.psl" ftype="tabular" sort="true"/>
163 <output name="output" value="dbia3/dbia3.sorted.psl" sort="true"/> 228 <assert_command>
229 <has_text text="-tileSize=5"/>
230 <has_text text="-mask" negate="true"/>
231 </assert_command>
164 </test> 232 </test>
165 <!-- test on the database masked by repeat masker --> 233 <!-- test on the database masked by repeat masker -->
166 <test> 234 <test>
167 <param name="reference_source_selector" value="history" /> 235 <conditional name="reference_source">
168 <param name="database" value="dbia3/dbia3_masked.2bit" /> 236 <param name="reference_source_selector" value="history" />
169 <param name="query" value="dbia3/dmel-transcript.fa" /> 237 <param name="database" value="dbia3/dbia3_masked.2bit" ftype="twobit" />
238 </conditional>
239 <param name="query" value="dbia3/dmel-transcript.fa" ftype="fasta"/>
170 <param name="database_type" value="dnax" /> 240 <param name="database_type" value="dnax" />
171 <param name="query_type" value="rnax" /> 241 <param name="query_type" value="rnax" />
172 <param name="oneOff" value="false" /> 242 <param name="oneOff" value="false" />
173 <param name="minScore" value="30" /> 243 <param name="minScore" value="30" />
174 <param name="maxGap" value="2" /> 244 <param name="maxGap" value="2" />
175 <param name="trimT" value="false" /> 245 <param name="trimT" value="false" />
176 <param name="noTrimA" value="false" /> 246 <param name="noTrimA" value="false" />
177 <param name="fine" value="false" /> 247 <param name="fine" value="false" />
178 <param name="maxIntron" value="750000" /> 248 <param name="maxIntron" value="750000" />
179 <param name="extendThroughN" value="false" /> 249 <param name="extendThroughN" value="false" />
180 <param name="mask" value="file.out" /> 250 <conditional name="mask_type">
181 <param name="mask_file" value="dbia3/dbia3_RM.out" /> 251 <param name="mask" value="file.out" />
182 <param name="out" value="psl -noHead" /> 252 <param name="mask_file" value="dbia3/dbia3_RM.out" />
253 </conditional>
254 <param name="out" value="psl" ftype="tabular" />
183 <output name="output" value="dbia3/dbia3_masked.sorted.psl"/> 255 <output name="output" value="dbia3/dbia3_masked.sorted.psl"/>
256 <assert_command>
257 <has_text text="-tileSize=" negate="true"/>
258 <has_text text="-stepSize=" negate="true"/>
259 <has_text text="-mask='/"/>
260 </assert_command>
184 </test> 261 </test>
185 </tests> 262 </tests>
186 <help> 263 <help>
187 <![CDATA[ 264 <![CDATA[
188 BLAT 265 BLAT
189 ==== 266 ====
190 BLAT is a bioinformatics software a tool which performs rapid mRNA/DNA and cross-species protein alignments. 267 BLAT is a bioinformatics software tool which performs rapid sequence alignments (mRNA/DNA and cross-species protein).
191 268 It is designed to find sequences that are highly similar and have a certain minimum length. With the default settings this means
192 blat (version: v36)- Standalone blat sequence search command line tool. 269
193 ------------------------------------------------------------------------- 270 - >95% similarity and a minimum length of 25 bases for nucleotide sequences
194 271 - >80% similarity and a minimum length of 20 amino acids for proteins
195 usage: 272
196 ++++++ 273 More divergent or shorter sequence alignments may be missed.
197 274 The algorithm works in two phases:
198 $ blat database query [-ooc=11.ooc] output.psl 275
199 276 1. Search phase: find regions of probable homology using an index of the reference sequence
200 where: 277 2. Alignment phase: Detailed Alignment of the sequences in these regions
201 database and query are each either a .fa, .nib or .2bit file, 278
202 or a list of these files with one file name per line. 279 Search phase
203 -ooc=11.ooc tells the program to load over-occurring 11-mers from 280 ++++++++++++
204 an external file. This will increase the speed 281
205 by a factor of 40 in many cases, but is not required. 282 Builds an index of the reference containing the nonoverlapping K-mers and their
206 output.psl is the name of the output file. 283 positions (by default, can be changed using `-tileSize` and `-stepSize`). Hits,
207 284 i.e. exactly matching k-mers in query and reference, are then found by looking
208 documentation: 285 up each overlapping K-mer of the query sequence. By enabling `-oneOff` the
286 algorithm allows for a single substitution. Note that this increases the run
287 time of this phase significantly.
288
289 The hits are then split into buckets of 64k (based on the database position)
290 and sorted on the diagonal (database minus query positions). Hits within the
291 gap limit form so called proto-clumps. Those are then sorted by database position
292 and put into clumps if they are within the window limit (wrt database coordinate).
293
294 Clumps with less than the minimum number of hits are discarded (-minMatch) and
295 those within 300 bases or 100 amino acids in the database are merged together.
296 The resulting clumps define regions of the database which are homologous to the
297 query sequence which are then aligned.
298
299 Alignment phase
300 +++++++++++++++
301
302 The alignment is performed differently for nucleotide and
303 aminoacid sequences.
304
305 **Alignment for nucleotide sequences**: A hit list (exactly matching k-mers) for
306 the query and the homologous region of the database is generated. If necessary
307 hits are made unique by extending them until they are unique or have a maximum
308 size. The hits are then extended maximally allowing no mismatches, and overlapping
309 hits are merged.
310 Subsequent (wrt query and reference) extended hits are then linked in an
311 alignment. If there are gaps in query and reference, the algorithm recurses
312 using a smaller value for k until no additional hits are found or gaps are
313 smaller than 6 bases.
314
315 **Protein Alignments**: The hits from the search stage are extended into maximally
316 scoring ungapped alignments (HSPs) (match cost 2 and mismatch cost 1). The HSPs
317 are organized in a directed graph where an edge connects HSPs A and B if A starts
318 before B wrt query and database coordinates. The weight of the edge is then
319 defined as the score of B minus a gap penalty based on the distance between A
320 and B (overlapping HSPs are treated differently, see Kent 2002). The maximal
321 scoring alignment is then determined as the maximum weight path through the
322 graph and the HSPs of this path are removed. This is repeated until no HSPs are
323 left.
324
325 **Stitching and Filling In**:
326 In order to find also alignments of genes scattered across multiple homologous
327 regions that have been determined in the search phase a variation of the
328 alignment algorithm for proteins is employed. For details see Kent 2002.
329
330 Documentation:
209 ++++++++++++++ 331 ++++++++++++++
210 332
211 See Blat documentation (http://genome.ucsc.edu/goldenPath/help/blatSpec.html) 333 See Blat documentation (http://genome.ucsc.edu/goldenPath/help/blatSpec.html)
212 334
213 Source code: 335 Source code: