comparison macros.xml @ 0:1dde27bbdcba draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/hmmer3 commit 4261b86af790a3535c0b9a8122f92225f8f67b47
author iuc
date Sat, 25 Jun 2016 15:06:44 -0400
parents
children e779d71f871a
comparison
equal deleted inserted replaced
-1:000000000000 0:1dde27bbdcba
1 <?xml version="1.0"?>
2 <macros>
3 <xml name="requirements"><!-- Software dependency shared by the wrappers: the hmmer package, pinned to 3.1b2 -->
4 <requirements>
5 <requirement version="3.1b2" type="package">hmmer</requirement>
6 <yield />
7 </requirements>
8 </xml>
9 <!-- Wrapper version string. NOTE(review): presumably interpolated into each tool's version attribute; confirm in the tool files --><token name="@WRAPPER_VERSION@">0.1</token>
10 <xml name="stdio"><!-- Failure detection: any non-zero exit code, plus "Error:" / "Exception:" text matches -->
11 <stdio>
12 <!-- Anything other than zero is an error -->
13 <exit_code range="1:"/>
14 <exit_code range=":-1"/>
15 <!-- In case the return code has not been set properly, check stderr too -->
16 <regex match="Error:"/>
17 <regex match="Exception:"/>
18 </stdio>
19 </xml>
20 <!-- Reporting and inclusion threshold flags (sequence and domain level). E and domE are always passed. The optional values are tested as strings so that an explicit 0 is still forwarded; only unset fields are skipped. --><token name="@THRESHOLDS@">
21 -E $E
22 --domE $domE
23
24 #if str($T) not in ('', 'None'):
25 -T $T
26 #end if
27
28 #if str($domT) not in ('', 'None'):
29 --domT $domT
30 #end if
31
32 #if str($incE) not in ('', 'None'):
33 --incE $incE
34 #end if
35
36 #if str($incT) not in ('', 'None'):
37 --incT $incT
38 #end if
39
40 #if str($incdomE) not in ('', 'None'):
41 --incdomE $incdomE
42 #end if
43
44 #if str($incdomT) not in ('', 'None'):
45 --incdomT $incdomT
46 #end if
47 </token>
48 <xml name="thresholds_xml">
49 <!-- Reporting thresholds: E-values default to 10.0, score thresholds are optional -->
50 <param name="E" type="float" value="10.0" min="0" label="report sequences &lt;= this E-Value threshold in output" help="(-E)"/>
51 <param name="domE" type="float" value="10.0" min="0" label="report domains &lt;= this E-Value threshold in output" help="(--domE)"/>
52 <param name="T" type="float" optional="True" label="report sequences &gt;= this score threshold in output" help="(-T)"/>
53 <param name="domT" type="float" optional="True" label="report domains &gt;= this score threshold in output" help="(--domT)"/>
54 <!-- Inclusion (significance) thresholds: all optional -->
55 <param name="incE" type="float" optional="True" label="consider sequences &lt;= this E-Value threshold as significant" help="(--incE)"/>
56 <param name="incdomE" type="float" optional="True" label="consider domains &lt;= this E-Value threshold as significant" help="(--incdomE)"/>
57 <param name="incT" type="float" optional="True" label="consider sequences &gt;= this score threshold as significant" help="(--incT)"/>
58 <param name="incdomT" type="float" optional="True" label="consider domains &gt;= this score threshold as significant" help="(--incdomT)"/>
59 </xml>
60 <!-- Sequence-level threshold flags only (no per-domain options). E is always passed. The optional values are tested as strings so that an explicit 0 is still forwarded; only unset fields are skipped. --><token name="@THRESHOLDS_NODOM@">
61 -E $E
62
63 #if str($T) not in ('', 'None'):
64 -T $T
65 #end if
66
67 #if str($incE) not in ('', 'None'):
68 --incE $incE
69 #end if
70
71 #if str($incT) not in ('', 'None'):
72 --incT $incT
73 #end if
74 </token>
75 <xml name="thresholds_nodom">
76 <!-- Reporting thresholds (sequence level only) -->
77 <param name="E" type="float" value="10.0" min="0" label="report sequences &lt;= this E-Value threshold in output" help="(-E)"/>
78 <param name="T" type="float" optional="True" label="report sequences &gt;= this score threshold in output" help="(-T)"/>
79 <!-- Inclusion (significance) thresholds (sequence level only) -->
80 <param name="incE" type="float" optional="True" label="consider sequences &lt;= this E-Value threshold as significant" help="(--incE)"/>
81 <param name="incT" type="float" optional="True" label="consider sequences &gt;= this score threshold as significant" help="(--incT)"/>
82 </xml>
83 <!-- Acceleration-heuristic flags: $max and $nobias expand to their flag or to an empty string (see accel_heur_xml); the F1, F2 and F3 thresholds are always passed --><token name="@ACCEL_HEUR@">
84 $max
85 --F1 $F1
86 --F2 $F2
87 --F3 $F3
88 $nobias
89
90 </token>
91 <xml name="accel_heur_xml">
92 <!-- Filter pipeline controls: two on/off switches and the three per-stage P-value thresholds -->
93 <param name="max" type="boolean" label="Turn all heuristic filters off (less speed, more power)" help="(--max)" truevalue="--max" falsevalue=""/>
94 <param name="F1" type="float" value="0.02" label="Stage 1 (MSV) threshold: promote hits w/ P &lt;= F1" help="(--F1)"/>
95 <param name="F2" type="float" value="1e-3" label="Stage 2 (Vit) threshold: promote hits w/ P &lt;= F2" help="(--F2)"/>
96 <param name="F3" type="float" value="1e-5" label="Stage 3 (Fwd) threshold: promote hits w/ P &lt;= F3" help="(--F3)"/>
97 <param name="nobias" type="boolean" label="Turn off composition bias filter" help="(--nobias)" truevalue="--nobias" falsevalue=""/>
98 </xml>
99 <!-- E-value calibration flags: all seven values are always passed (each param in eval_calib_xml has a default) --><token name="@EVAL_CALIB@">
100 --EmL $EmL
101 --EmN $EmN
102 --EvL $EvL
103 --EvN $EvN
104 --EfL $EfL
105 --EfN $EfN
106 --Eft $Eft
107 </token>
108 <xml name="eval_calib_xml">
109 <!-- E-value calibration: simulation sizes for the MSV, Viterbi and Forward fits -->
110 <param name="EmL" type="integer" value="200" min="1" label="Length of sequences for MSV Gumbel mu fit" help="(--EmL)"/>
111 <param name="EmN" type="integer" value="200" min="1" label="Number of sequences for MSV Gumbel mu fit" help="(--EmN)"/>
112 <param name="EvL" type="integer" value="200" min="1" label="Length of sequences for Viterbi Gumbel mu fit" help="(--EvL)"/>
113 <param name="EvN" type="integer" value="200" min="1" label="Number of sequences for Viterbi Gumbel mu fit" help="(--EvN)"/>
114 <param name="EfL" type="integer" value="100" min="1" label="Length of sequences for Forward exp tail tau fit" help="(--EfL)"/>
115 <param name="EfN" type="integer" value="200" min="1" label="Number of sequences for Forward exp tail tau fit" help="(--EfN)"/>
116 <param name="Eft" type="float" value="0.04" min="0" max="1" label="tail mass for Forward exponential tail tau fit" help="(--Eft)"/>
117 </xml>
118 <!-- Output-table flags (no Pfam table). The selection is split on commas so each format name is matched exactly; a plain substring test made 'tblout' fire when only 'domtblout' was ticked. --><token name="@OFORMAT_WITH_OPTS_NOPFAM@">
119 #if 'tblout' in str($oformat).split(','):
120 --tblout $tblout
121 #end if
122
123 #if 'domtblout' in str($oformat).split(','):
124 --domtblout $domtblout
125 #end if
126
127 $acc $noali $notextw
128 </token>
129 <xml name="oformat_with_opts_nopfam">
130 <!-- Output selection (both tables ticked by default) and text formatting switches -->
131 <param name="oformat" type="select" multiple="True" display="checkboxes" label="Output Formats">
132 <option selected="true" value="tblout">Table of per-sequence hits (--tblout)</option>
133 <option selected="true" value="domtblout">Table of per-domain hits (--domtblout)</option>
134 </param>
135 <param name="acc" type="boolean" label="Prefer accessions over names in output" help="(--acc)" truevalue="--acc" falsevalue=""/>
136 <param name="noali" type="boolean" label="Don't output alignments, so output is smaller" help="(--noali)" truevalue="--noali" falsevalue=""/>
137 <param name="notextw" type="boolean" label="Unlimited ASCII text output line width" help="(--notextw)" truevalue="--notextw" falsevalue=""/>
138 </xml>
139 <!-- Output-table flags including the Pfam table. The selection is split on commas so each format name is matched exactly; a plain substring test made 'tblout' fire when only 'domtblout' or 'pfamtblout' was ticked. --><token name="@OFORMAT_WITH_OPTS@">
140 #if 'tblout' in str($oformat).split(','):
141 --tblout $tblout
142 #end if
143
144 #if 'domtblout' in str($oformat).split(','):
145 --domtblout $domtblout
146 #end if
147
148 #if 'pfamtblout' in str($oformat).split(','):
149 --pfamtblout $pfamtblout
150 #end if
151
152 $acc $noali $notextw
153 </token>
154 <xml name="oformat_with_opts">
155 <!-- Output selection (all three tables ticked by default) and text formatting switches -->
156 <param name="oformat" type="select" multiple="True" display="checkboxes" label="Output Formats">
157 <option selected="true" value="tblout">Table of per-sequence hits (--tblout)</option>
158 <option selected="true" value="domtblout">Table of per-domain hits (--domtblout)</option>
159 <option selected="true" value="pfamtblout">Table of hits and domains in Pfam format (--pfamtblout)</option>
160 </param>
161 <param name="acc" type="boolean" label="Prefer accessions over names in output" help="(--acc)" truevalue="--acc" falsevalue=""/>
162 <param name="noali" type="boolean" label="Don't output alignments, so output is smaller" help="(--noali)" truevalue="--noali" falsevalue=""/>
163 <param name="notextw" type="boolean" label="Unlimited ASCII text output line width" help="(--notextw)" truevalue="--notextw" falsevalue=""/>
164 </xml>
165 <xml name="oformat_test"><!-- Test fixture: switches notextw on -->
166 <param value="True" name="notextw"/>
167 </xml>
168 <!-- Output-table flags for the nucleotide variant. Selected formats are matched exactly against the comma-split list, so 'tblout' is no longer triggered by 'dfamtblout' alone. -->
169 <token name="@OFORMAT_WITH_OPTS_N@">
170 #if 'tblout' in str($oformat).split(','):
171 --tblout $tblout
172 #end if
173
174 #if 'dfamtblout' in str($oformat).split(','):
175 --dfamtblout $dfamtblout
176 #end if
177
178 #if 'aliscoresout' in str($oformat).split(','):
179 --aliscoresout $aliscoresout
180 #end if
181
182 $acc $noali $notextw
183 </token>
184 <xml name="oformat_with_opts_n">
185 <!-- Output selection (hit table and Dfam table ticked by default; alignment scores opt-in) and text formatting switches -->
186 <param name="oformat" type="select" multiple="True" display="checkboxes" label="Output Formats">
187 <option selected="true" value="tblout">Table of hits (--tblout)</option>
188 <option selected="true" value="dfamtblout">Table of hits in Dfam format (--dfamtblout)</option>
189 <option value="aliscoresout">Scores for each position in each alignment to file (--aliscoresout)</option>
190 </param>
191 <param name="acc" type="boolean" label="Prefer accessions over names in output" help="(--acc)" truevalue="--acc" falsevalue=""/>
192 <param name="noali" type="boolean" label="Don't output alignments, so output is smaller" help="(--noali)" truevalue="--noali" falsevalue=""/>
193 <param name="notextw" type="boolean" label="Unlimited ASCII text output line width" help="(--notextw)" truevalue="--notextw" falsevalue=""/>
194 </xml>
195 <!-- Gap open/extend probabilities, passed only when the single-sequence mode is "singlemx". NOTE(review): no singlemx flag is emitted by this token itself; confirm the calling tool adds it if required. --><token name="@HSSI@">
196 #if $hssi.hssi_select == "singlemx":
197 --popen $hssi.popen
198 --pextend $hssi.pextend
199 #end if
200 </token>
201 <xml name="hssi">
202 <!-- Single-sequence input handling: gap probabilities are only offered in "singlemx" mode -->
203 <conditional name="hssi">
204 <param name="hssi_select" label="Options for handling single sequence inputs" type="select">
205 <option selected="true" value="false">Disable</option>
206 <option value="singlemx">Use substitution score matrix for single-sequence inputs</option>
207 </param>
208 <when value="singlemx">
209 <param name="popen" type="float" value="0.02" min="0.0" max="0.5" label="Gap open probability" help="(--popen)"/>
210 <param name="pextend" type="float" value="0.4" min="0.0" max="1.0" label="Gap extend probability" help="(--pextend)"/>
211 </when>
212 <when value="false">
213 </when>
214 <!-- Not exposed: -mx <s> : substitution score matrix (built-in matrices, with -singlemx)-->
215 <!-- Not exposed: -mxfile <f> : read substitution score matrix from file <f> (with -singlemx)-->
216 </conditional>
217 </xml>
218 <!-- Thread count: taken from the GALAXY_SLOTS environment variable, falling back to 2 when it is unset. The backslash leaves the shell variable for the shell to expand. --><token name="@CPU@">
219 --cpu \${GALAXY_SLOTS:-2}
220 </token>
221 <!-- Random number generator seed flag; always passed (the seed param has a default) --><token name="@SEED@">
222 --seed $seed
223 </token>
224 <xml name="seed"><!-- Seed input: non-negative integer, defaults to 42 -->
225 <param name="seed" type="integer" value="42" min="0" label="RNG seed, 0 generates a random seed" help="(--seed)"/>
226 </xml>
227 <xml name="seed_test"><!-- Test fixture: fixes the seed at 4 -->
228 <param value="4" name="seed"/>
229 </xml>
230 <!-- Advanced flags: $nonull2 expands to its flag or to an empty string; Z and domZ are added only when their #if test passes --><token name="@ADV_OPTS@">
231 $nonull2
232
233 #if $Z:
234 -Z $Z
235 #end if
236
237 #if $domZ:
238 --domZ $domZ
239 #end if
240 </token>
241 <xml name="adv_opts">
242 <!-- Advanced inputs: null2 correction switch and optional E-value search-space overrides -->
243 <param name="nonull2" type="boolean" label="Turn off biased composition score corrections" help="(--nonull2)" truevalue="--nonull2" falsevalue=""/>
244 <param name="Z" type="integer" optional="True" label="# of comparisons done for E-value calculation" help="(-Z)"/>
245 <param name="domZ" type="integer" optional="True" label="# of significant sequences, for domain E-value calculation" help="(--domZ)"/>
246 </xml>
247 <!-- Alphabet flag: the select's option value is itself the command-line flag, so it is emitted verbatim --><token name="@FORMAT_SELECTOR@">
248 $input_format_select
249 </token>
250 <xml name="format_selector"><!-- Alphabet choice: protein, DNA or RNA -->
251 <param type="select" name="input_format_select" label="Format of sequence and model">
252 <option value="--amino">Protein</option>
253 <option value="--dna">DNA</option>
254 <option value="--rna">RNA</option>
255 </param>
256 </xml>
257 <xml name="format_selector_noprot"><!-- Alphabet choice restricted to nucleotides: DNA or RNA -->
258 <param type="select" name="input_format_select" label="Format of sequence and model">
259 <option value="--dna">DNA</option>
260 <option value="--rna">RNA</option>
261 </param>
262 </xml>
263 <!-- Relative weighting: emits the selected strategy flag verbatim; the wid cutoff is added only for the wblosum strategy --><token name="@ARSWS@">
264 $arsws.arsws_select
265
266 #if $arsws.arsws_select == "--wblosum":
267 --wid $arsws.wid
268 #end if
269 </token>
270 <xml name="arsws">
271 <!-- Alternative relative sequence weighting strategies; only the wblosum branch has an extra input (identity cutoff) -->
272 <conditional name="arsws">
273 <param name="arsws_select" type="select" label="Alternative relative sequence weighting strategies">
274 <option value="--wpb" selected="true">Henikoff position-based weights (--wpb)</option>
275 <option value="--wgsc">Gerstein/Sonnhammer/Chothia tree weights (--wgsc)</option>
276 <option value="--wblosum">Henikoff simple filter weights (--wblosum)</option>
277 <option value="--wnone">don't do any relative weighting; set all to 1 (--wnone)</option>
278 <option value="--wgiven">use weights as given in MSA file (--wgiven)</option>
279 </param>
280 <when value="--wpb">
281 </when>
282 <when value="--wgsc">
283 </when>
284 <when value="--wblosum">
285 <param name="wid" label="Set identity cutoff" value="0.62" type="float" help="(--wid)"/>
286 </when>
287 <when value="--wnone">
288 </when>
289 <when value="--wgiven">
290 </when>
291 </conditional>
292 </xml>
293 <!-- Effective sequence number: emits the selected strategy as a long flag unless the selection is empty, followed by that strategy's tuning values. NOTE(review): eset is emitted together with eent/eclust; confirm hmmbuild accepts that combination. --><token name="@AEEWS@">
294 #if $aeews.aeews_select != "":
295 --$aeews.aeews_select
296 #if $aeews.aeews_select == "eent":
297 --eset $aeews.eset
298 --ere $aeews.ere
299 --esigma $aeews.esigma
300 #elif $aeews.aeews_select == "eclust":
301 --eset $aeews.eset
302 --eid $aeews.eid
303 #end if
304 #end if
305 </token>
306 <xml name="aeews">
307 <!-- Effective sequence weighting: strategy picker, with tuning inputs for the eent and eclust strategies -->
308 <conditional name="aeews">
309 <param name="aeews_select" label="Alternative effective sequence weighting strategies" type="select">
310 <option value="">Disabled</option>
311 <option value="eent">Adjust eff seq # to achieve relative entropy target (--eent)</option>
312 <option value="eclust">Eff seq # is the # of single linkage clusters (--eclust)</option>
313 <option value="enone">No effective seq # weighting: just use nseq (--enone)</option>
314 </param>
315 <when value="">
316 </when>
317 <when value="eent">
318 <param name="eset" label="set eff seq # for all models" help="(--eset)" type="float" value="0"/>
319 <param name="ere" label="set minimum rel entropy/position" help="(--ere)" type="float" value="0"/>
320 <param name="esigma" label="set sigma param" help="(--esigma)" type="float" value="45"/>
321 </when>
322 <when value="eclust">
323 <param name="eset" label="set eff seq # for all models" help="(--eset)" type="float" value="0"/>
324 <param name="eid" label="set fractional identity cutoff" help="(--eid)" type="float" value="0.62" min="0" max="1"/>
325 </when>
326 <when value="enone">
327 </when>
328 </conditional>
329 </xml>
330 <!-- Model-specific cutoff switches: each variable expands to its flag or to an empty string (see the cut macro) --><token name="@CUT@">
331 $cut_ga
332 $cut_nc
333 $cut_tc
334 </token>
335 <xml name="cut"><!-- Curated per-profile cutoffs: GA = gathering, NC = noise, TC = trusted -->
336 <param name="cut_ga" type="boolean" truevalue="--cut_ga" label="use profile's GA gathering cutoffs to set all thresholding" help="(--cut_ga)" falsevalue=""/>
337 <param name="cut_nc" type="boolean" truevalue="--cut_nc" label="use profile's NC noise cutoffs to set all thresholding" help="(--cut_nc)" falsevalue=""/>
338 <param name="cut_tc" type="boolean" truevalue="--cut_tc" label="use profile's TC trusted cutoffs to set all thresholding" help="(--cut_tc)" falsevalue=""/>
339 </xml>
340 <!-- Model construction: emits the chosen strategy as a long flag; symfrac is added only for the "fast" strategy. NOTE(review): the fragthresh param declared in the mcss macro is not emitted here; confirm the calling tool passes it. --><token name="@MCSS@">
341 --$mcs.model_construction_strategy_select
342
343 #if $mcs.model_construction_strategy_select == "fast":
344 --symfrac $mcs.symfrac
345 #end if
346
347 </token>
348 <xml name="mcss">
349 <!-- Alternative model construction strategies; symfrac and fragthresh are fractions, so both are bounded to 0..1 -->
350 <conditional name="mcs">
351 <param name="model_construction_strategy_select" type="select" label="Model Construction Strategy">
352 <option value="fast" selected="true">Assign columns with &gt;= symfrac residues as consensus (--fast)</option>
353 <option value="hand">Manual construction (requires reference annotation) (--hand)</option>
354 </param>
355 <when value="fast">
356 <param name="symfrac" value="0.5" type="float" min="0" max="1" label="Sets sym fraction controlling --fast construction" help="(--symfrac)"/>
357 </when>
358 <when value="hand"></when>
359 </conditional>
360 <param name="fragthresh" label="Fraction of alignment length, under which sequences are excluded" help="HMMER infers fragments if the sequence length L is less than or equal to a fraction x times the alignment length in columns (--fragthresh)" value="0.5" optional="True" type="float" min="0" max="1" />
361
362 </xml>
363 <!-- Prior strategy: the select's option value is the flag itself, or an empty string when unspecified --><token name="@PRIOR@">
364 $aps_select
365 </token>
366 <xml name="prior"><!-- Prior choice: unspecified (default), no prior, or Laplace +1 -->
367 <param type="select" name="aps_select" label="Alternative Prior Strategies">
368 <option selected="true" value="">Unspecified</option>
369 <option value="--pnone">Don't use any prior; parameters are frequencies (--pnone)</option>
370 <option value="--plaplace">Use a Laplace +1 prior (--plaplace)</option>
371 </param>
372 </xml>
373 <xml name="citation"><!-- Shared citation block: a single DOI reference -->
374 <citations>
375 <citation type="doi">10.1093/nar/gkr367</citation>
376 </citations>
377 </xml>
378 <!-- Window length flags: each is added only when its #if test on the optional field passes --><token name="@LENGTHS@">
379 #if $w_beta:
380 --w_beta $w_beta
381 #end if
382
383 #if $w_length:
384 --w_length $w_length
385 #end if
386
387 </token>
388 <xml name="lengths"><!-- Optional window length inputs -->
389 <param name="w_beta" type="float" optional="True"
390 label="Tail mass at which window length is determined" help="(--w_beta)"/>
391 <param name="w_length" type="integer" optional="True"
392 label="Window Length" help="(--w_length)"/>
393 </xml>
394 <xml name="input_hmm"><!-- HMM profile input; accepts hmm2 and hmm3 datasets -->
395 <param name="hmmfile" format="hmm2,hmm3" type="data" label="HMM model"/>
396 </xml>
397 <xml name="input_msa"><!-- Alignment input; accepts stockholm, clustal and fasta datasets -->
398 <param name="msafile" format="stockholm,clustal,fasta" type="data" label="Multiple Sequence Alignment"
399 help="in Stockholm, Clustal, or Fasta format. While this tool accepts fasta, please ensure that the sequences are not unaligned"/>
400 </xml>
401
402
403 <token name="@ACCEL_HEUR_HELP@"><![CDATA[
404 Acceleration Heuristics (--F1, --F2, --F3)
405 -------------------------------------------
406
407 **MSV filter**
408
409 The sequence is aligned to the profile using a specialized model that
410 allows multiple high-scoring local ungapped segments to match. The
411 optimal alignment score (Viterbi score) is calculated under this multi-
412 segment model, hence the term MSV, for “multi-segment Viterbi”. This is
413 HMMER’s main speed heuristic. The MSV score is comparable to BLAST’s sum
414 score (optimal sum of ungapped alignment segments). Roughly speaking,
415 MSV is comparable to skipping the heuristic word hit and hit extension
416 steps of the BLAST acceleration algorithm.
417
418 The MSV filter is very, very fast. In addition to avoiding indel
419 calculations in the dynamic programming table, it uses reduced precision
420 scores scaled to 8-bit integers, enabling acceleration via 16-way
421 parallel SIMD vector instructions.
422
423 The MSV score is a true log-odds likelihood ratio, so it obeys
424 conjectures about the expected score distribution (Eddy, 2008) that
425 allow immediate and accurate calculation of the statistical significance
426 (P-value) of the MSV bit score.
427
428 By default, comparisons with a P-value of ≤ 0.02 pass this filter,
429 meaning that about 2% of nonhomologous sequences are expected to pass.
430 You can use the --F1 option to change this threshold. For example, --F1
431 <0.05> would pass 5% of the comparisons, making a search more sensitive
432 but slower. Setting the threshold to ≥ 1.0 (--F1 99 for example) assures
433 that all comparisons will pass. Shutting off the MSV filter may be
434 worthwhile if you want to make sure you don’t miss comparisons that have
435 a lot of scattered insertions and deletions. Alternatively, the --max
436 option causes the MSV filter step (and all other filter steps) to be
437 bypassed.
438
439 The MSV bit score is calculated as a log-odds score using the null model
440 for comparison. No correction for a biased composition or repetitive
441 sequence is done at this stage. For comparisons involving biased
442 sequences and/or profiles, more than 2% of comparisons will pass the MSV
443 filter. At the end of search output, there is a line like:
444
445 Passed MSV filter: 107917 (0.020272); expected 106468.8 (0.02)
446
447 which tells you how many and what fraction of comparisons passed the MSV
448 filter, versus how many (and what fraction) were expected.
449
450 **Viterbi filter**
451
452 The sequence is now aligned to the profile using a fast Viterbi algorithm for
453 optimal gapped alignment.
454
455 This Viterbi implementation is specialized for speed. It is implemented in
456 8-way parallel SIMD vector instructions, using reduced precision scores that
457 have been scaled to 16-bit integers. Only one row of the dynamic programming
458 matrix is stored, so the routine only recovers the score, not the optimal
459 alignment itself. The reduced representation has limited range; local alignment
460 scores will not underflow, but high scoring comparisons can overflow and return
461 infinity, in which case they automatically pass the filter.
462
463 The final Viterbi filter bit score is then computed using the appropriate null
464 model log likelihood (by default the biased composition filter model score, or
465 if the biased filter is off, just the null model score). If the P-value of this
466 score passes the Viterbi filter threshold, the sequence passes on to the next
467 step of the pipeline.
468
469 The --F2 <x> option controls the P-value threshold for passing the Viterbi
470 filter score. The default is 0.001. The --max option bypasses all filters in
471 the pipeline. At the end of a search output, you will see a line like:
472
473 Passed Vit filter: 2207 (0.00443803); expected 497.3 (0.001)
474
475 which tells you how many and what fraction of comparisons passed the Viterbi
476 filter, versus how many were expected.
477
478 **Forward filter/parser**
479
480 The sequence is now aligned to the profile using the full Forward algorithm,
481 which calculates the likelihood of the target sequence given the profile,
482 summed over the ensemble of all possible alignments.
483
484 This is a specialized time- and memory-efficient Forward implementation called
485 the “Forward parser”. It is implemented in 4-way parallel SIMD vector
486 instructions, in full precision (32-bit floating point). It stores just enough
487 information that, in combination with the results of the Backward parser
488 (below), posterior probabilities of start and stop points of alignments
489 (domains) can be calculated in the domain definition step (below), although the
490 detailed alignments themselves cannot be.
491
492 The Forward filter bit score is calculated by correcting this score using the
493 appropriate null model log likelihood (by default the biased composition filter
494 model score, or if the biased filter is off, just the null model score). If the
495 P-value of this bit score passes the Forward filter threshold, the sequence
496 passes on to the next step of the pipeline.
497
498 The bias filter score has no further effect in the pipeline. It is only used in
499 filter stages. It has no effect on final reported bit scores or P-values.
500 Biased composition compensation for final bit scores is done by a more complex
501 domain-specific algorithm, described below.
502
503 The --F3 <x> option controls the P-value threshold for passing the Forward
504 filter score. The default is 1e-5. The --max option bypasses all filters in the
505 pipeline. At the end of a search output, you will see a line like:
506
507 Passed Fwd filter: 1076 (0.00216371); expected 5.0 (1e-05)
508
509 which tells you how many and what fraction of comparisons passed the Forward
510 filter, versus how many were expected.
511
512 **Bias Filter Options**
513
514 The --max option bypasses all filters in the pipeline, including the bias
515 filter.
516
517 The --nobias option turns off (bypasses) the biased composition filter. The
518 simple null model is used as a null hypothesis for MSV and in subsequent filter
519 steps. The biased composition filter step compromises a small amount of
520 sensitivity. Though it is good to have it on by default, you may want to shut
521 it off if you know you will have no problem with biased composition hits.
522
523
524 **Advanced Documentation**
525
526 A more detailed look at the internals of the various filter pipelines was
527 posted on the `developer's blog <http://selab.janelia.org/people/eddys/blog/?p=508>`__.
528 The information posted there may be useful to those who are struggling with
529 poor-scoring sequences.
530
531 ]]></token>
532 <token name="@ADV_OPTS_HELP@"><![CDATA[
533 Advanced Options
534 ----------------
535
536 **nonull2**
537
538 The bias correction can be too aggressive sometimes, causing you to miss homologs. You can turn the
539 biased-composition score correction off with the --nonull2 option (and if
540 you’re doing that, you may also want to set --nobias, to turn off another
541 biased composition step called the bias filter, which affects which sequences
542 get scored at all).
543
544 **domZ**
545
546 Assert that the total number of targets in your searches is <x>, for the
547 purposes of per-domain conditional E-value calculations, rather than the number
548 of targets that passed the reporting thresholds.
549
550 **Z**
551
552 Assert that the total number of targets in your searches is <x>, for the
553 purposes of per-sequence E-value calculations, rather than the actual number of
554 targets seen.
555 ]]></token>
556 <token name="@AEEWS_HELP@"><![CDATA[
557 Effective Sequence Number
558 -------------------------
559
560 After relative weights are determined, they are normalized to sum to a total
561 effective sequence number, eff nseq. This number may be the actual number of
562 sequences in the alignment, but it is almost always smaller than that. The
563 default entropy weighting method (--eent) reduces the effective sequence
564 number to reduce the information content (relative entropy, or average expected
565 score on true homologs) per consensus position. The target relative entropy is
566 controlled by a two-parameter function, where the two parameters are settable
567 with --ere and --esigma.
568
569 **--eent**
570
571 Adjust effective sequence number to achieve a specific relative entropy per
572 position (see --ere). This is the default.
573
574 **--eclust**
575
576 Set effective sequence number to the number of single-linkage clusters at a
577 specific identity threshold (see --eid). This option is not recommended; it’s
578 for experiments evaluating how much better --eent is.
579
580 **--enone**
581
582 Turn off effective sequence number determination and just use the actual number
583 of sequences. One reason you might want to do this is to try to maximize the
584 relative entropy/position of your model, which may be useful for short models.
585
586 **--eset**
587
588 Explicitly set the effective sequence number for all models to <x>.
589
590 **--ere**
591
592 Set the minimum relative entropy/position target to <x>. Requires --eent. Default
593 depends on the sequence alphabet. For protein sequences, it is 0.59 bits/position;
594 for nucleotide sequences, it is 0.45 bits/position.
595
596 **--esigma**
597
598 Sets the minimum relative entropy contributed by an entire model alignment, over
599 its whole length. This has the effect of making short models have higher relative
600 entropy per position than --ere alone would give. The default is 45.0 bits.
601
602 **--eid**
603
604 Sets the fractional pairwise identity cutoff used by single linkage clustering
605 with the --eclust option. The default is 0.62.
606 ]]></token>
607 <token name="@ARSWS_HELP@"><![CDATA[
608 Options Controlling Relative Weights
609 ------------------------------------
610
611 HMMER uses an ad hoc sequence weighting algorithm to downweight closely related
612 sequences and up-weight distantly related ones. This has the effect of making
613 models less biased by uneven phylogenetic representation. For example, two
614 identical sequences would typically each receive half the weight that one
615 sequence would. These options control which algorithm gets used.
616
617
618 **--wpb**
619
620 Use the Henikoff position-based sequence weighting scheme [Henikoff and
621 Henikoff, J. Mol. Biol. 243:574, 1994]. This is the default.
622
623 **--wgsc**
624
625 Use the Gerstein/Sonnhammer/Chothia weighting algorithm [Gerstein et al, J.
626 Mol. Biol. 235:1067, 1994].
627
628 **--wblosum**
629
630 Use the same clustering scheme that was used to weight data in calculating
631 BLOSUM subsitution matrices [Henikoff and Henikoff, Proc. Natl. Acad. Sci
632 89:10915, 1992]. Sequences are single-linkage clustered at an identity
633 threshold (default 0.62; see --wid) and within each cluster of c sequences,
634 each sequence gets relative weight 1/c.
635
636 **--wnone**
637
638 No relative weights. All sequences are assigned uniform weight.
639
640 **--wid**
641
642 Sets the identity threshold used by single-linkage clustering when using
643 --wblosum. Invalid with any other weighting scheme. Default is 0.62.
644 ]]></token>
645 <token name="@BIAS_COMP_HELP@"><![CDATA[
646 Bias Composition
647 ----------------
648
649 The next number, the bias, is a correction term for biased sequence composition
650 that has been applied to the sequence bit score. For instance, for the top hit
651 MYG_PHYCA that scored 222.7 bits, the bias of 3.2 bits means that this sequence
652 originally scored 225.9 bits, which was adjusted by the slight 3.2 bit biased-
653 composition correction. The only time you really need to pay attention to the
654 bias value is when it’s large, on the same order of magnitude as the sequence
655 bit score. Sometimes (rarely) the bias correction isn’t aggressive enough, and
656 allows a non-homolog to retain too much score. Conversely, the bias correction
657 can be too aggressive sometimes, causing you to miss homologs. You can turn the
658 biased-composition score correction off with the --nonull2 option (and if
659 you’re doing that, you may also want to set --nobias, to turn off another
660 biased composition step called the bias filter, which affects which sequences
661 get scored at all).
662
663 ]]></token>
664 <token name="@CUT_HELP@"><![CDATA[
665 Options for Model-specific Score Thresholding
666 ---------------------------------------------
667
668 Curated profile databases may define specific bit score thresholds for each
669 profile, superseding any thresholding based on statistical significance alone.
670 To use these options, the profile must contain the appropriate (GA, TC, and/or
671 NC) optional score threshold annotation; this is picked up by hmmbuild from
672 Stockholm format alignment files. Each thresholding option has two scores: the
673 per-sequence threshold <x1> and the per-domain threshold <x2> These act as if
674 -T<x1> --incT<x1> --domT<x2> --incdomT<x2> has been applied specifically using
675 each model’s curated thresholds.
676
677 **--cut_ga**
678
679 Use the GA (gathering) bit scores in the model to set per-sequence (GA1) and
680 per-domain (GA2) reporting and inclusion thresholds. GA thresholds are
681 generally considered to be the reliable curated thresholds defining family
682 membership; for example, in Pfam, these thresholds define what gets included in
683 Pfam Full alignments based on searches with Pfam Seed models.
684
685 **--cut_nc**
686
687 Use the NC (noise cutoff) bit score thresholds in the model to set
688 per-sequence (NC1) and per-domain (NC2) reporting and inclusion thresholds. NC
689 thresholds are generally considered to be the score of the highest-scoring
690 known false positive.
691
692 **--cut_tc**
693
694 Use the TC (trusted cutoff) bit score thresholds in the model to set
695 per-sequence (TC1) and per-domain (TC2) reporting and inclusion thresholds. TC
696 thresholds are generally considered to be the score of the lowest-scoring known
697 true positive that is above all known false positives.
698 ]]></token>
699 <token name="@EVAL_CALIB_HELP@"><![CDATA[
700 Options Controlling H3 Parameter Estimation Methods
701 ---------------------------------------------------
702
703 H3 uses three short random sequence simulations to estimate the location
704 parameters for the expected score distributions for MSV scores, Viterbi scores,
705 and Forward scores. These options allow these simulations to be modified.
706
707 **--EmL**
708
709 Sets the sequence length in simulation that estimates the location parameter mu
710 for MSV E-values. Default is 200.
711
712 **--EmN**
713
714 Sets the number of sequences in simulation that estimates the location parameter
715 mu for MSV E-values. Default is 200.
716
717 **--EvL**
718
719 Sets the sequence length in simulation that estimates the location parameter mu
720 for Viterbi E-values. Default is 200.
721
722 **--EvN**
723
724 Sets the number of sequences in simulation that estimates the location parameter
725 mu for Viterbi E-values. Default is 200.
726
727
728 **--EfL**
729
730 Sets the sequence length in simulation that estimates the location parameter tau
731 for Forward E-values. Default is 100.
732
733 **--EfN**
734
735 Sets the number of sequences in simulation that estimates the location parameter
736 tau for Forward E-values. Default is 200.
737
738 **--Eft**
739
740 Sets the tail mass fraction to fit in the simulation that estimates the location
741 parameter tau for Forward E-values. Default is 0.04.
742 ]]></token>
743 <token name="@FORMAT_SELECTOR_HELP@"><![CDATA[
744 Options for Specifying the Alphabet
745 -----------------------------------
746
747 The alphabet type (amino, DNA, or RNA) is autodetected by default, by looking
748 at the composition of the msafile. Autodetection is normally quite reliable,
749 but occasionally alphabet type may be ambiguous and autodetection can fail (for
750 instance, on tiny toy alignments of just a few residues). To avoid this, or to
751 increase robustness in automated analysis pipelines, you may specify the
752 alphabet type of msafile with these options.
753 ]]></token>
754 <token name="@HSSI_HELP@"><![CDATA[
755 Options Controlling Single Sequence Scoring (first Iteration)
756 -------------------------------------------------------------
757
758 By default, the first iteration uses a search model constructed from a single
759 query sequence. This model is constructed using a standard 20x20 substitution
760 matrix for residue probabilities, and two additional parameters for
761 position-independent gap open and gap extend probabilities. These options allow
762 the default single-sequence scoring parameters to be changed.
763
764 **Gap Open (--popen)**
765
766 Set the gap open probability for a single sequence query model to <x>.
767
768 **Gap Extend (--pextend)**
769
770 Set the gap extend probability for a single sequence query model to <x>.
771
772
773 **--mx/--mxfile**
774
775 These options are not currently supported.
776 ]]></token>
777 <token name="@LENGTHS_HELP@"><![CDATA[
778 Tail Mass Options
779 -----------------
780
781 **Window length tail mass (--w_beta)**
782
783 The upper bound, W, on the length at which nhmmer expects to find an instance
784 of the model is set such that the fraction of all sequences generated by the
785 model with length >= W is less than <x>. The default is 1e-7.
786
787
788 **Model instance length upper bound (--w_length)**
789
790 Override the model instance length upper bound, W, which is otherwise
791 controlled by --w_beta. It should be larger than the model length. The value of
792 W is used deep in the acceleration pipeline, and modest changes are not
793 expected to impact results (though larger values of W do lead to longer run
794 time).
795
796 ]]></token>
797 <token name="@MCSS_HELP@"><![CDATA[
798 **Options Controlling Profile Construction**
799
800 These options control how consensus columns are defined in an alignment.
801
802 **--fast**
803
804 Define consensus columns as those that have a fraction >= symfrac of residues
805 as opposed to gaps. (See below for the --symfrac option.) This is the default.
806
807 **--hand**
808
809 Define consensus columns in next profile using reference annotation to the multiple
810 alignment. This allows you to define any consensus columns you like.
811
812
813 **--symfrac**
814
815 Define the residue fraction threshold necessary to define a consensus column
816 when using the --fast option. The default is 0.5. The symbol fraction in each
817 column is calculated after taking relative sequence weighting into account, and
818 ignoring gap characters corresponding to ends of sequence fragments (as opposed
819 to internal insertions/deletions). Setting this to 0.0 means that every
820 alignment column will be assigned as consensus, which may be useful in some
821 cases. Setting it to 1.0 means that only columns that include 0 gaps (internal
822 insertions/deletions) will be assigned as consensus.
823
824 **--fragthresh**
825
826 We only want to count terminal gaps as deletions if the aligned sequence is
827 known to be full-length, not if it is a fragment (for instance, because only
828 part of it was sequenced). HMMER uses a simple rule to infer fragments: if the
829 sequence length L is less than or equal to a fraction <x> times the alignment
830 length in columns, then the sequence is handled as a fragment. The default is
831 0.5. Setting --fragthresh 0 will define no (nonempty) sequence as a fragment;
832 you might want to do this if you know you’ve got a carefully curated alignment
833 of full-length sequences. Setting --fragthresh 1 will define all sequences as
834 fragments; you might want to do this if you know your alignment is entirely
835 composed of fragments, such as translated short reads in metagenomic shotgun
836 data.
837
838 ]]></token>
839 <token name="@OFORMAT_WITH_OPTS_HELP@"><![CDATA[
840 Options for Controlling Output
841 ------------------------------
842
843 **Table of hits**
844
845 Save a simple tabular (space-delimited) file summarizing the per-target output, with
846 one data line per homologous target model found.
847
848 **Table of per-domain hits**
849
850 Save a simple tabular (space-delimited) file summarizing the per-domain output,
851 with one data line per homologous domain detected in a query sequence for each
852 homologous model.
853
854 **Table of hits and domains in Pfam Format**
855
856 Save an especially succinct tabular (space-delimited) file summarizing the
857 per-target output, with one data line per homologous target model found.
858 ]]></token>
859 <token name="@OFORMAT_WITH_OPTS_NOPFAM_HELP@"><![CDATA[
860 Options for Controlling Output
861 ------------------------------
862
863 **Table of hits**
864
865 Save a simple tabular (space-delimited) file summarizing the per-target output, with
866 one data line per homologous target model found.
867
868 **Table of per-domain hits**
869
870 Save a simple tabular (space-delimited) file summarizing the per-domain output,
871 with one data line per homologous domain detected in a query sequence for each
872 homologous model.
873 ]]></token>
874 <token name="@OFORMAT_WITH_OPTS_N_HELP@"><![CDATA[
875 Options for Controlling Output
876 ------------------------------
877
878 **Table of hits**
879
880 Save a simple tabular (space-delimited) file summarizing the per-target output, with
881 one data line per homologous target model found.
882
883 **Table of hits (dfam)**
884
885 Save a tabular (space-delimited) file summarizing the per-hit output, similar
886 to --tblout but more succinct.
887
888
889 **List of per-position scores for each hit (--aliscoreout)**
890
891 Save to file a list of per-position scores for each hit. This is useful, for
892 example, in identifying regions of high score density for use in resolving
893 overlapping hits from different models.
894
895 ]]></token>
896 <token name="@PRIOR_HELP@"><![CDATA[
897 Options Controlling Priors
898 --------------------------
899
900 By default, weighted counts are converted to mean posterior probability
901 parameter estimates using mixture Dirichlet priors. Default mixture Dirichlet
902 prior parameters for protein models and for nucleic acid (RNA and DNA) models
903 are built in. The following options allow you to override the default priors.
904
905 **No priors (--pnone)**
906
907 Don’t use any priors. Probability parameters will simply be the observed
908 frequencies, after relative sequence weighting.
909
910 **Laplace +1 prior**
911
912 Use a Laplace +1 prior in place of the default mixture Dirichlet prior.
913 ]]></token>
914 <token name="@SEED_HELP@"><![CDATA[
915 Random Seeding
916 --------------
917
918 Seed the random number generator with <n>, an integer >= 0. If <n> is nonzero,
919 any stochastic simulations will be reproducible; the same command will give the
920 same results. If <n> is 0, the random number generator is seeded arbitrarily,
921 and stochastic simulations will vary from run to run of the same command.
922
923 ]]></token>
924 <token name="@THRESHOLDS_HELP@"><![CDATA[
925 Options for Reporting Thresholds
926 --------------------------------
927
928 Reporting thresholds control which hits are reported in output files (the main
929 output, --tblout, and --domtblout).
930
931 **E-value (-E)**
932
933 In the per-target output, report target profiles with an E-value of <= <x>. The
934 default is 10.0, meaning that on average, about 10 false positives will be
935 reported per query, so you can see the top of the noise and decide for yourself
936 if it’s really noise.
937
938 **Bit score (-T)**
939
940 Instead of thresholding per-profile output on E-value, instead report target profiles
941 with a bit score of >= <x>.
942
943 **domain E-value (--domE)**
944
945 In the per-domain output, for target profiles that have already satisfied the
946 per-profile reporting threshold, report individual domains with a conditional
947 E-value of <= <x>. The default is 10.0. A conditional E-value means the
948 expected number of additional false positive domains in the smaller search
949 space of those comparisons that already satisfied the per-profile reporting
950 threshold (and thus must have at least one homologous domain already).
951
952 **domain Bit scores (--domT)**
953
954 Instead of thresholding per-domain output on E-value, instead report domains
955 with a bit score of >= <x>.
956
957 Options for Inclusion Thresholds
958 --------------------------------
959
960 Inclusion thresholds are stricter than reporting thresholds. Inclusion
961 thresholds control which hits are considered to be reliable enough to be
962 included in an output alignment or a subsequent search round. In hmmscan, which
963 does not have any alignment output (like hmmsearch or phmmer) nor any iterative
964 search steps (like jackhmmer), inclusion thresholds have little effect. They
965 only affect what domains get marked as significant (!) or questionable (?) in
966 domain output.
967
968 **E-value of per target inclusion threshold**
969
970 Use an E-value of <= <x> as the per-target inclusion threshold. The default is
971 0.01, meaning that on average, about 1 false positive would be expected in
972 every 100 searches with different query sequences.
973
974 **Bit score of per target inclusion threshold**
975
976 Instead of using E-values for setting the inclusion threshold, instead use a
977 bit score of >= <x> as the per-target inclusion threshold. It would be unusual
978 to use bit score thresholds with hmmscan, because you don’t expect a single
979 score threshold to work for different profiles; different profiles have
980 slightly different expected score distributions.
981
982 **domain E-value per target inclusion threshold**
983
984 Use a conditional E-value of <= <x> as the per-domain inclusion threshold, in
985 targets that have already satisfied the overall per-target inclusion threshold.
986
987 **domain Bit score per target inclusion threshold**
988
989 Instead of using E-values, instead use a bit score of >= <x> as the per-domain
990 inclusion threshold. As with --incT above, it would be unusual to use a single
991 bit score threshold in hmmscan.
992
993 ]]></token>
994 <token name="@THRESHOLDS_NODOM_HELP@"><![CDATA[
995 Options for Reporting Thresholds
996 --------------------------------
997
998 Reporting thresholds control which hits are reported in output files (the main
999 output, --tblout, and --domtblout).
1000
1001 **E-value (-E)**
1002
1003 In the per-target output, report target profiles with an E-value of <= <x>. The
1004 default is 10.0, meaning that on average, about 10 false positives will be
1005 reported per query, so you can see the top of the noise and decide for yourself
1006 if it’s really noise.
1007
1008 **Bit score (-T)**
1009
1010 Instead of thresholding per-profile output on E-value, instead report target profiles
1011 with a bit score of >= <x>.
1012
1013 Options for Inclusion Thresholds
1014 --------------------------------
1015
1016 Inclusion thresholds are stricter than reporting thresholds. Inclusion
1017 thresholds control which hits are considered to be reliable enough to be
1018 included in an output alignment or a subsequent search round. In hmmscan, which
1019 does not have any alignment output (like hmmsearch or phmmer) nor any iterative
1020 search steps (like jackhmmer), inclusion thresholds have little effect. They
1021 only affect what domains get marked as significant (!) or questionable (?) in
1022 domain output.
1023
1024 **E-value of per target inclusion threshold**
1025
1026 Use an E-value of <= <x> as the per-target inclusion threshold. The default is
1027 0.01, meaning that on average, about 1 false positive would be expected in
1028 every 100 searches with different query sequences.
1029
1030 **Bit score of per target inclusion threshold**
1031
1032 Instead of using E-values for setting the inclusion threshold, instead use a
1033 bit score of >= <x> as the per-target inclusion threshold. It would be unusual
1034 to use bit score thresholds with hmmscan, because you don’t expect a single
1035 score threshold to work for different profiles; different profiles have
1036 slightly different expected score distributions.
1037
1038 ]]></token>
1039 <token name="@ATTRIBUTION@"><![CDATA[
1040
1041 Attribution
1042 -----------
1043
1044 This Galaxy tool relies on HMMER3_ from http://hmmer.janelia.org/
1045 Internally the software is cited as:
1046
1047 ::
1048
1049 # hmmscan :: search sequence(s) against a profile database
1050 # HMMER 3.1 (February 2013); http://hmmer.org/
1051 # Copyright (C) 2011 Howard Hughes Medical Institute.
1052 # Freely distributed under the GNU General Public License (GPLv3).
1053 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1054
1055 The wrappers were written by Eric Rasche and are licensed under Apache2_. The
1056 documentation is copied from the HMMER3 documentation.
1057
1058 .. _Apache2: http://www.apache.org/licenses/LICENSE-2.0
1059 .. _HMMER3: http://hmmer.janelia.org/
1060
1061
1062 ]]></token>
1063 <token name="@HELP_PRE@"><![CDATA[
1064
1065 What it does
1066 ============
1067 ]]></token>
1068 <token name="@HELP_PRE_OTH@"><![CDATA[
1069 Options
1070 =======
1071 ]]></token>
1072 </macros>