<?xml version="1.0"?>
<!-- Mercurial > repos > iuc > hmmer_hmmscan -->
<macros>
  <!-- Package dependencies shared by the wrappers; the yield placeholder leaves room for
       whatever the expanding tool nests inside its expand element. -->
  <xml name="requirements">
    <requirements>
      <requirement version="3.1b2" type="package">hmmer</requirement>
      <yield />
    </requirements>
  </xml>
  <!-- Version string of the wrapper macros -->
  <token name="@WRAPPER_VERSION@">0.1</token>
  <!-- Failure detection rules -->
  <xml name="stdio">
    <stdio>
      <!-- Any exit status other than zero, negative or positive, is an error -->
      <exit_code range=":-1" />
      <exit_code range="1:" />
      <!-- In case the return code has not been set properly, check stderr too -->
      <regex match="Error:" />
      <regex match="Exception:" />
    </stdio>
  </xml>
  <!-- Command-line fragment for the reporting and inclusion thresholds (sequence and domain level).
       E and domE are always emitted. Each optional threshold is emitted only when it was filled in.
       The test compares the string form against the empty string, so an explicit value of 0
       (a legitimate bit score threshold) is passed on; a bare truth test would treat 0 as
       "not set" and silently drop the option. -->
  <token name="@THRESHOLDS@">
-E $E
--domE $domE

#if str($T) != "":
-T $T
#end if

#if str($domT) != "":
--domT $domT
#end if

#if str($incE) != "":
--incE $incE
#end if

#if str($incT) != "":
--incT $incT
#end if

#if str($incdomE) != "":
--incdomE $incdomE
#end if

#if str($incdomT) != "":
--incdomT $incdomT
#end if
  </token>
  <xml name="thresholds_xml">
    <!-- Reporting thresholds: decide what appears in the output -->
    <param name="E" type="float" value="10.0" min="0"
           label="report sequences &lt;= this E-Value threshold in output" help="(-E)"/>
    <param name="domE" type="float" value="10.0" min="0"
           label="report domains &lt;= this E-Value threshold in output" help="(--domE)"/>
    <param name="T" type="float" optional="True"
           label="report sequences &gt;= this score threshold in output" help="(-T)"/>
    <param name="domT" type="float" optional="True"
           label="report domains &gt;= this score threshold in output" help="(--domT)"/>
    <!-- Inclusion thresholds: decide what is treated as significant -->
    <param name="incE" type="float" optional="True"
           label="consider sequences &lt;= this E-Value threshold as significant" help="(--incE)"/>
    <param name="incdomE" type="float" optional="True"
           label="consider domains &lt;= this E-Value threshold as significant" help="(--incdomE)"/>
    <param name="incT" type="float" optional="True"
           label="consider sequences &gt;= this score threshold as significant" help="(--incT)"/>
    <param name="incdomT" type="float" optional="True"
           label="consider domains &gt;= this score threshold as significant" help="(--incdomT)"/>
  </xml>
  <!-- Command-line fragment for the sequence-level thresholds only (no per-domain options).
       E is always emitted. Each optional threshold is emitted only when it was filled in.
       The test compares the string form against the empty string, so an explicit value of 0
       is passed on instead of being mistaken for "not set". -->
  <token name="@THRESHOLDS_NODOM@">
-E $E

#if str($T) != "":
-T $T
#end if

#if str($incE) != "":
--incE $incE
#end if

#if str($incT) != "":
--incT $incT
#end if
  </token>
  <xml name="thresholds_nodom">
    <!-- Reporting thresholds: decide what appears in the output -->
    <param name="E" type="float" value="10.0" min="0"
           label="report sequences &lt;= this E-Value threshold in output" help="(-E)"/>
    <param name="T" type="float" optional="True"
           label="report sequences &gt;= this score threshold in output" help="(-T)"/>
    <!-- Inclusion thresholds: decide what is treated as significant -->
    <param name="incE" type="float" optional="True"
           label="consider sequences &lt;= this E-Value threshold as significant" help="(--incE)"/>
    <param name="incT" type="float" optional="True"
           label="consider sequences &gt;= this score threshold as significant" help="(--incT)"/>
  </xml>
  <!-- Command-line fragment for the acceleration heuristics: two boolean switches that expand
       to their flag or to nothing, plus the three filter stage thresholds. -->
  <token name="@ACCEL_HEUR@">
${max}
--F1 ${F1}
--F2 ${F2}
--F3 ${F3}
${nobias}

  </token>
  <xml name="accel_heur_xml">
    <!-- Inputs for the acceleration heuristics -->
    <param name="max" type="boolean" truevalue="--max" falsevalue=""
           label="Turn all heuristic filters off (less speed, more power)" help="(--max)"/>
    <param name="F1" type="float" value="0.02"
           label="Stage 1 (MSV) threshold: promote hits w/ P &lt;= F1" help="(--F1)"/>
    <param name="F2" type="float" value="1e-3"
           label="Stage 2 (Vit) threshold: promote hits w/ P &lt;= F2" help="(--F2)"/>
    <param name="F3" type="float" value="1e-5"
           label="Stage 3 (Fwd) threshold: promote hits w/ P &lt;= F3" help="(--F3)"/>
    <param name="nobias" type="boolean" truevalue="--nobias" falsevalue=""
           label="Turn off composition bias filter" help="(--nobias)"/>
  </xml>
  <!-- Command-line fragment for the E-value calibration simulations; every option is always emitted. -->
  <token name="@EVAL_CALIB@">
--EmL ${EmL}
--EmN ${EmN}
--EvL ${EvL}
--EvN ${EvN}
--EfL ${EfL}
--EfN ${EfN}
--Eft ${Eft}
  </token>
  <xml name="eval_calib_xml">
    <!-- Inputs controlling E-value calibration -->
    <param name="EmL" type="integer" value="200" min="1"
           label="Length of sequences for MSV Gumbel mu fit" help="(--EmL)"/>
    <param name="EmN" type="integer" value="200" min="1"
           label="Number of sequences for MSV Gumbel mu fit" help="(--EmN)"/>
    <param name="EvL" type="integer" value="200" min="1"
           label="Length of sequences for Viterbi Gumbel mu fit" help="(--EvL)"/>
    <param name="EvN" type="integer" value="200" min="1"
           label="Number of sequences for Viterbi Gumbel mu fit" help="(--EvN)"/>
    <param name="EfL" type="integer" value="100" min="1"
           label="Length of sequences for Forward exp tail tau fit" help="(--EfL)"/>
    <param name="EfN" type="integer" value="200" min="1"
           label="Number of sequences for Forward exp tail tau fit" help="(--EfN)"/>
    <param name="Eft" type="float" value="0.04" min="0" max="1"
           label="tail mass for Forward exponential tail tau fit" help="(--Eft)"/>
  </xml>
  <!-- Command-line fragment for the optional table outputs (no Pfam table) plus the display switches.
       The multi-select renders as a comma-separated string; it is split and tested for exact
       membership, because a plain substring test for 'tblout' also succeeds when only
       'domtblout' is selected. -->
  <token name="@OFORMAT_WITH_OPTS_NOPFAM@">
#if 'tblout' in str($oformat).split(','):
    --tblout $tblout
#end if

#if 'domtblout' in str($oformat).split(','):
    --domtblout $domtblout
#end if

$acc $noali $notextw
  </token>
  <xml name="oformat_with_opts_nopfam">
    <!-- Output selection (per-sequence and per-domain tables) and display switches -->
    <param name="oformat" type="select" multiple="True" display="checkboxes" label="Output Formats">
      <option value="tblout" selected="true">Table of per-sequence hits (--tblout)</option>
      <option value="domtblout" selected="true">Table of per-domain hits (--domtblout)</option>
    </param>
    <param name="acc" type="boolean" truevalue="--acc" falsevalue=""
           label="Prefer accessions over names in output" help="(--acc)"/>
    <param name="noali" type="boolean" truevalue="--noali" falsevalue=""
           label="Don't output alignments, so output is smaller" help="(--noali)"/>
    <param name="notextw" type="boolean" truevalue="--notextw" falsevalue=""
           label="Unlimited ASCII text output line width" help="(--notextw)"/>
  </xml>
  <!-- Command-line fragment for the optional table outputs (including the Pfam table) plus the
       display switches. The multi-select renders as a comma-separated string; it is split and
       tested for exact membership, because a plain substring test for 'tblout' also succeeds
       when only 'domtblout' or 'pfamtblout' is selected. -->
  <token name="@OFORMAT_WITH_OPTS@">
#if 'tblout' in str($oformat).split(','):
    --tblout $tblout
#end if

#if 'domtblout' in str($oformat).split(','):
    --domtblout $domtblout
#end if

#if 'pfamtblout' in str($oformat).split(','):
    --pfamtblout $pfamtblout
#end if

$acc $noali $notextw
  </token>
  <xml name="oformat_with_opts">
    <!-- Output selection (per-sequence, per-domain and Pfam tables) and display switches -->
    <param name="oformat" type="select" multiple="True" display="checkboxes" label="Output Formats">
      <option value="tblout" selected="true">Table of per-sequence hits (--tblout)</option>
      <option value="domtblout" selected="true">Table of per-domain hits (--domtblout)</option>
      <option value="pfamtblout" selected="true">Table of hits and domains in Pfam format (--pfamtblout)</option>
    </param>
    <param name="acc" type="boolean" truevalue="--acc" falsevalue=""
           label="Prefer accessions over names in output" help="(--acc)"/>
    <param name="noali" type="boolean" truevalue="--noali" falsevalue=""
           label="Don't output alignments, so output is smaller" help="(--noali)"/>
    <param name="notextw" type="boolean" truevalue="--notextw" falsevalue=""
           label="Unlimited ASCII text output line width" help="(--notextw)"/>
  </xml>
  <!-- Test fixture: switches on the notextw option -->
  <xml name="oformat_test">
    <param name="notextw" value="True"/>
  </xml>
  <!-- Command-line fragment for the optional nucleotide search outputs plus the display switches.
       The multi-select renders as a comma-separated string; it is split and tested for exact
       membership, because a plain substring test for 'tblout' also succeeds when only
       'dfamtblout' (or, in sibling selectors, 'pfamtblout') is selected. -->
  <token name="@OFORMAT_WITH_OPTS_N@">
#if 'tblout' in str($oformat).split(','):
    --tblout $tblout
#end if

#if 'dfamtblout' in str($oformat).split(','):
    --dfamtblout $dfamtblout
#end if

#if 'aliscoresout' in str($oformat).split(','):
    --aliscoresout $aliscoresout
#end if

$acc $noali $notextw
  </token>
  <xml name="oformat_with_opts_n">
    <!-- Output selection (hit table, Dfam table, alignment scores) and display switches -->
    <param name="oformat" type="select" multiple="True" display="checkboxes" label="Output Formats">
      <option value="tblout" selected="true">Table of hits (--tblout)</option>
      <option value="dfamtblout" selected="true">Table of hits in Dfam format (--dfamtblout)</option>
      <option value="aliscoresout">Scores for each position in each alignment to file (--aliscoresout)</option>
    </param>
    <param name="acc" type="boolean" truevalue="--acc" falsevalue=""
           label="Prefer accessions over names in output" help="(--acc)"/>
    <param name="noali" type="boolean" truevalue="--noali" falsevalue=""
           label="Don't output alignments, so output is smaller" help="(--noali)"/>
    <param name="notextw" type="boolean" truevalue="--notextw" falsevalue=""
           label="Unlimited ASCII text output line width" help="(--notextw)"/>
  </xml>
  <!-- Command-line fragment for single-sequence inputs: the gap probabilities are emitted only
       when the "singlemx" branch of the hssi conditional is chosen. -->
  <token name="@HSSI@">
#if $hssi.hssi_select == 'singlemx':
    --popen ${hssi.popen}
    --pextend ${hssi.pextend}
#end if
  </token>
  <xml name="hssi">
    <!-- Inputs for handling single sequence queries -->
    <conditional name="hssi">
      <param name="hssi_select" type="select" label="Options for handling single sequence inputs">
        <option value="false" selected="true">Disable</option>
        <option value="singlemx">Use substitution score matrix for single-sequence inputs</option>
      </param>
      <when value="singlemx">
        <param name="popen" type="float" value="0.02" min="0.0" max="0.5"
               label="Gap open probability" help="(--popen)"/>
        <param name="pextend" type="float" value="0.4" min="0.0" max="1.0"
               label="Gap extend probability" help="(--pextend)"/>
      </when>
      <when value="false"/>
      <!-- Not exposed here:
           -mx <s>      substitution score matrix (built-in matrices, with -singlemx)
           -mxfile <f>  read substitution score matrix from file <f> (with -singlemx) -->
    </conditional>
  </xml>
  <!-- Thread count: the shell expands GALAXY_SLOTS at run time and falls back to 2 when it is unset.
       The backslash keeps the dollar sign away from the template engine. -->
  <token name="@CPU@">
      --cpu \${GALAXY_SLOTS:-2}
  </token>
  <!-- Command-line fragment passing the random number generator seed -->
  <token name="@SEED@">
      --seed ${seed}
  </token>
  <!-- Input for the random number generator seed -->
  <xml name="seed">
    <param name="seed" type="integer" value="42" min="0"
           label="RNG seed, 0 generates a random seed" help="(--seed)"/>
  </xml>
  <!-- Test fixture: pins the seed to 4 -->
  <xml name="seed_test">
    <param name="seed" value="4"/>
  </xml>
  <!-- Command-line fragment for the advanced options: the nonull2 switch, then the two optional
       comparison counts, each emitted only when its value tests as true. -->
  <token name="@ADV_OPTS@">
${nonull2}

#if $Z:
-Z ${Z}
#end if

#if $domZ:
--domZ ${domZ}
#end if
  </token>
  <xml name="adv_opts">
    <!-- Inputs for the remaining advanced options -->
    <param name="nonull2" type="boolean" truevalue="--nonull2" falsevalue=""
           label="Turn off biased composition score corrections" help="(--nonull2)"/>
    <param name="Z" type="integer" optional="True"
           label="# of comparisons done for E-value calculation" help="(-Z)"/>
    <param name="domZ" type="integer" optional="True"
           label="# of significant sequences, for domain E-value calculation" help="(--domZ)"/>
  </xml>
  <!-- Command-line fragment for the alphabet: the select's option value is emitted as is -->
  <token name="@FORMAT_SELECTOR@">
      ${input_format_select}
  </token>
  <!-- Alphabet choice including protein; each option value is the literal flag -->
  <xml name="format_selector">
    <param name="input_format_select" label="Format of sequence and model" type="select">
      <option value="--amino">Protein</option>
      <option value="--dna">DNA</option>
      <option value="--rna">RNA</option>
    </param>
  </xml>
  <!-- Alphabet choice restricted to nucleotide alphabets; each option value is the literal flag -->
  <xml name="format_selector_noprot">
    <param name="input_format_select" label="Format of sequence and model" type="select">
      <option value="--dna">DNA</option>
      <option value="--rna">RNA</option>
    </param>
  </xml>
  <!-- Command-line fragment for relative sequence weighting: the chosen flag is emitted as is,
       followed by the identity cutoff when the wblosum scheme is selected. -->
  <token name="@ARSWS@">
${arsws.arsws_select}

#if $arsws.arsws_select == '--wblosum':
--wid ${arsws.wid}
#end if
  </token>
  <xml name="arsws">
    <!-- Alternative relative sequence weighting strategies. Each option value is the literal
         command-line flag. Only the wblosum scheme has an extra input, the identity cutoff,
         which is a fraction and is therefore bounded to the range 0..1. -->
    <conditional name="arsws">
      <param name="arsws_select" type="select" label="Alternative relative sequence weighting strategies">
        <option value="--wpb" selected="true">Henikoff position-based weights (--wpb)</option>
        <option value="--wgsc">Gerstein/Sonnhammer/Chothia tree weights (--wgsc)</option>
        <option value="--wblosum">Henikoff simple filter weights (--wblosum)</option>
        <option value="--wnone">don't do any relative weighting; set all to 1 (--wnone)</option>
        <option value="--wgiven">use weights as given in MSA file (--wgiven)</option>
      </param>
      <when value="--wpb">
      </when>
      <when value="--wgsc">
      </when>
      <when value="--wblosum">
        <param name="wid" label="Set identity cutoff" value="0.62" type="float" min="0" max="1" help="(--wid)"/>
      </when>
      <when value="--wnone">
      </when>
      <when value="--wgiven">
      </when>
    </conditional>
  </xml>
  <!-- Command-line fragment for effective sequence weighting. Nothing is emitted for the empty
       ("Disabled") choice. Otherwise the chosen strategy becomes a long flag, followed by the
       inputs of its branch. -->
  <token name="@AEEWS@">
#if $aeews.aeews_select != '':
--${aeews.aeews_select}
    #if $aeews.aeews_select == 'eent':
        --eset ${aeews.eset}
        --ere ${aeews.ere}
        --esigma ${aeews.esigma}
    #elif $aeews.aeews_select == 'eclust':
        --eset ${aeews.eset}
        --eid ${aeews.eid}
    #end if
#end if
  </token>
  <xml name="aeews">
    <!-- Alternative effective sequence weighting strategies.
         NOTE(review): eset and ere default to 0 here; confirm that HMMER accepts these values
         and that eset may be combined with the eent/eclust strategies. -->
    <conditional name="aeews">
      <param name="aeews_select" label="Alternative effective sequence weighting strategies" type="select">
        <option value="">Disabled</option>
        <option value="eent">Adjust eff seq # to achieve relative entropy target (--eent)</option>
        <option value="eclust">Eff seq # is the # of single linkage clusters (--eclust)</option>
        <option value="enone">No effective seq # weighting: just use nseq (--enone)</option>
      </param>
      <when value=""/>
      <when value="eent">
        <param name="eset" type="float" value="0"
               label="set eff seq # for all models" help="(--eset)"/>
        <param name="ere" type="float" value="0"
               label="set minimum rel entropy/position" help="(--ere)"/>
        <param name="esigma" type="float" value="45"
               label="set sigma param" help="(--esigma)"/>
      </when>
      <when value="eclust">
        <param name="eset" type="float" value="0"
               label="set eff seq # for all models" help="(--eset)"/>
        <param name="eid" type="float" value="0.62" min="0" max="1"
               label="set fractional identity cutoff" help="(--eid)"/>
      </when>
      <when value="enone"/>
    </conditional>
  </xml>
  <!-- Command-line fragment for the model-specific cutoffs: three boolean switches, each
       expanding to its flag or to nothing. -->
  <token name="@CUT@">
${cut_ga}
${cut_nc}
${cut_tc}
  </token>
  <!-- Switches selecting the profile's curated bit score cutoffs: GA = gathering, NC = noise,
       TC = trusted. The NC and TC labels previously repeated the word "gathering". -->
  <xml name="cut">
    <param name="cut_ga" type="boolean" truevalue="--cut_ga" label="use profile's GA gathering cutoffs to set all thresholding" help="(--cut_ga)" falsevalue=""/>
    <param name="cut_nc" type="boolean" truevalue="--cut_nc" label="use profile's NC noise cutoffs to set all thresholding" help="(--cut_nc)" falsevalue=""/>
    <param name="cut_tc" type="boolean" truevalue="--cut_tc" label="use profile's TC trusted cutoffs to set all thresholding" help="(--cut_tc)" falsevalue=""/>
  </xml>
  <!-- Command-line fragment for model construction: the chosen strategy becomes a long flag,
       and the symbol fraction is added for the "fast" strategy only. -->
  <token name="@MCSS@">
--${mcs.model_construction_strategy_select}

#if $mcs.model_construction_strategy_select == 'fast':
--symfrac ${mcs.symfrac}
#end if

  </token>
  <xml name="mcss">
    <!-- Alternative model construction strategies. The symbol fraction input exists only in the
         "fast" branch and now carries the same flag hint in its help text as every other input.
         NOTE(review): fragthresh is defined here but is not emitted by the @MCSS@ token;
         confirm that the expanding tool passes it on the command line itself. -->
    <conditional name="mcs">
      <param name="model_construction_strategy_select" type="select" label="Model Construction Strategy">
        <option value="fast" selected="true">Assign columns with &gt;= symfrac residues as consensus (--fast)</option>
        <option value="hand">Manual construction (requires reference annotation) (--hand)</option>
      </param>
      <when value="fast">
        <param name="symfrac" value="0.5" type="float" label="Sets sym fraction controlling --fast construction" help="(--symfrac)"/>
      </when>
      <when value="hand"></when>
    </conditional>
    <param name="fragthresh" label="Fraction of alignment length, under which sequences are excluded" help="HMMER infers fragments if the sequence length L is less than or equal to a fraction x times the alignment length in columns (--fragthresh)" value="0.5" optional="True" type="float" />

  </xml>
  <!-- Command-line fragment for the prior strategy: the select's option value is emitted as is
       (empty for "Unspecified"). -->
  <token name="@PRIOR@">
${aps_select}
  </token>
  <!-- Prior strategy choice; each non-empty option value is the literal flag -->
  <xml name="prior">
    <param name="aps_select" label="Alternative Prior Strategies" type="select">
      <option value="" selected="true">Unspecified</option>
      <option value="--pnone">Don't use any prior; parameters are frequencies (--pnone)</option>
      <option value="--plaplace">Use a Laplace +1 prior (--plaplace)</option>
    </param>
  </xml>
  <!-- Citation for the wrapped software, given as a DOI -->
  <xml name="citation">
    <citations>
      <citation type="doi">10.1093/nar/gkr367</citation>
    </citations>
  </xml>
  <!-- Command-line fragment for the window length options; each is emitted only when its
       value tests as true. -->
  <token name="@LENGTHS@">
#if $w_beta:
--w_beta ${w_beta}
#end if

#if $w_length:
--w_length ${w_length}
#end if

  </token>
  <!-- Optional inputs for the window length options -->
  <xml name="lengths">
    <param name="w_beta" type="float" optional="True"
           label="Tail mass at which window length is determined" help="(--w_beta)"/>
    <param name="w_length" type="integer" optional="True"
           label="Window Length" help="(--w_length)"/>
  </xml>
  <!-- Dataset input for a profile HMM in hmm2 or hmm3 format -->
  <xml name="input_hmm">
    <param name="hmmfile" format="hmm2,hmm3" type="data" label="HMM model"/>
  </xml>
  <!-- Dataset input for a multiple sequence alignment -->
  <xml name="input_msa">
    <param name="msafile" type="data" format="stockholm,clustal,fasta"
           label="Multiple Sequence Alignment"
           help="in Stockholm, Clustal, or Fasta format. While this tool accepts fasta, please ensure that the sequences are not unaligned"/>
  </xml>


  <token name="@ACCEL_HEUR_HELP@"><![CDATA[
Acceleration Heuristics (--F1, --F2, --F3)
-------------------------------------------

**MSV filter**

The sequence is aligned to the profile using a specialized model that
allows multiple high-scoring local ungapped segments to match. The
optimal alignment score (Viterbi score) is calculated under this multi-
segment model, hence the term MSV, for “multi-segment Viterbi”. This is
HMMER’s main speed heuristic. The MSV score is comparable to BLAST’s sum
score (optimal sum of ungapped alignment segments). Roughly speaking,
MSV is comparable to skipping the heuristic word hit and hit extension
steps of the BLAST acceleration algorithm.

The MSV filter is very, very fast. In addition to avoiding indel
calculations in the dynamic programming table, it uses reduced precision
scores scaled to 8-bit integers, enabling acceleration via 16-way
parallel SIMD vector instructions.

The MSV score is a true log-odds likelihood ratio, so it obeys
conjectures about the expected score distribution (Eddy, 2008) that
allow immediate and accurate calculation of the statistical significance
(P-value) of the MSV bit score.

By default, comparisons with a P-value of ≤ 0.02 pass this filter,
meaning that about 2% of nonhomologous sequences are expected to pass.
You can use the --F1 option to change this threshold. For example, --F1
<0.05> would pass 5% of the comparisons, making a search more sensitive
but slower. Setting the threshold to ≥ 1.0 (--F1 99 for example) assures
that all comparisons will pass. Shutting off the MSV filter may be
worthwhile if you want to make sure you don’t miss comparisons that have
a lot of scattered insertions and deletions. Alternatively, the --max
option causes the MSV filter step (and all other filter steps) to be
bypassed.

The MSV bit score is calculated as a log-odds score using the null model
for comparison. No correction for a biased composition or repetitive
sequence is done at this stage. For comparisons involving biased
sequences and/or profiles, more than 2% of comparisons will pass the MSV
filter. At the end of search output, there is a line like:

    Passed MSV filter: 107917 (0.020272); expected 106468.8 (0.02)

which tells you how many and what fraction of comparisons passed the MSV
filter, versus how many (and what fraction) were expected.

**Viterbi filter**

The sequence is now aligned to the profile using a fast Viterbi algorithm for
optimal gapped alignment.

This Viterbi implementation is specialized for speed. It is implemented in
8-way parallel SIMD vector instructions, using reduced precision scores that
have been scaled to 16-bit integers. Only one row of the dynamic programming
matrix is stored, so the routine only recovers the score, not the optimal
alignment itself. The reduced representation has limited range; local alignment
scores will not underflow, but high scoring comparisons can overflow and return
infinity, in which case they automatically pass the filter.

The final Viterbi filter bit score is then computed using the appropriate null
model log likelihood (by default the biased composition filter model score, or
if the biased filter is off, just the null model score). If the P-value of this
score passes the Viterbi filter threshold, the sequence passes on to the next
step of the pipeline.

The --F2 <x> option controls the P-value threshold for passing the Viterbi
filter score. The default is 0.001. The --max option bypasses all filters in
the pipeline.  At the end of a search output, you will see a line like:

    Passed Vit filter: 2207  (0.00443803); expected 497.3 (0.001)

which tells you how many and what fraction of comparisons passed the Viterbi
filter, versus how many were expected.

**Forward filter/parser**

The sequence is now aligned to the profile using the full Forward algorithm,
which calculates the likelihood of the target sequence given the profile,
summed over the ensemble of all possible alignments.

This is a specialized time- and memory-efficient Forward implementation called
the “Forward parser”. It is implemented in 4-way parallel SIMD vector
instructions, in full precision (32-bit floating point). It stores just enough
information that, in combination with the results of the Backward parser
(below), posterior probabilities of start and stop points of alignments
(domains) can be calculated in the domain definition step (below), although the
detailed alignments themselves cannot be.

The Forward filter bit score is calculated by correcting this score using the
appropriate null model log likelihood (by default the biased composition filter
model score, or if the biased filter is off, just the null model score). If the
P-value of this bit score passes the Forward filter threshold, the sequence
passes on to the next step of the pipeline.

The bias filter score has no further effect in the pipeline. It is only used in
filter stages. It has no effect on final reported bit scores or P-values.
Biased composition compensation for final bit scores is done by a more complex
domain-specific algorithm, described below.

The --F3 <x> option controls the P-value threshold for passing the Forward
filter score. The default is 1e-5. The --max option bypasses all filters in the
pipeline.  At the end of a search output, you will see a line like:

    Passed Fwd filter: 1076 (0.00216371); expected 5.0 (1e-05)

which tells you how many and what fraction of comparisons passed the Forward
filter, versus how many were expected.

**Bias Filter Options**

The --max option bypasses all filters in the pipeline, including the bias
filter.

The --nobias option turns off (bypasses) the biased composition filter. The
simple null model is used as a null hypothesis for MSV and in subsequent filter
steps. The biased composition filter step compromises a small amount of
sensitivity. Though it is good to have it on by default, you may want to shut
it off if you know you will have no problem with biased composition hits.


**Advanced Documentation**

A more detailed look at the internals of the various filter pipelines was
posted on the `developer's blog <http://selab.janelia.org/people/eddys/blog/?p=508>`__.
The information posted there may be useful to those who are struggling with
poor-scoring sequences.

]]></token>
  <token name="@ADV_OPTS_HELP@"><![CDATA[
Advanced Options
----------------

**nonull2**

The bias correction applied to sequence bit scores
can be too aggressive sometimes, causing you to miss homologs. You can turn the
biased-composition score correction off with the --nonull2 option (and if
you’re doing that, you may also want to set --nobias, to turn off another
biased composition step called the bias filter, which affects which sequences
get scored at all).

**domZ**

Assert that the total number of targets in your searches is <x>, for the
purposes of per-domain conditional E-value calculations, rather than the number
of targets that passed the reporting thresholds.

**Z**

Assert that the total number of targets in your searches is <x>, for the
purposes of per-sequence E-value calculations, rather than the actual number of
targets seen.
]]></token>
  <token name="@AEEWS_HELP@"><![CDATA[
Effective Sequence Number
-------------------------

After relative weights are determined, they are normalized to sum to a total
effective sequence number, eff nseq. This number may be the actual number of
sequences in the alignment, but it is almost always smaller than that. The
default entropy weighting method (--eent) reduces the effective sequence number
to reduce the information content (relative entropy, or average expected
score on true homologs) per consensus position. The target relative entropy is
controlled by a two-parameter function, where the two parameters are settable
with --ere and --esigma.

**--eent**

Adjust effective sequence number to achieve a specific relative entropy per
position (see --ere). This is the default.

**--eclust**

Set effective sequence number to the number of single-linkage clusters at a
specific identity threshold (see --eid). This option is not recommended; it’s
for experiments evaluating how much better --eent is.

**--enone**

Turn off effective sequence number determination and just use the actual number
of sequences. One reason you might want to do this is to try to maximize the
relative entropy/position of your model, which may be useful for short models.

**--eset**

Explicitly set the effective sequence number for all models to <x>.

**--ere**

Set the minimum relative entropy/position target to <x>. Requires --eent. Default
depends on the sequence alphabet. For protein sequences, it is 0.59 bits/position;
for nucleotide sequences, it is 0.45 bits/position.

**--esigma**

Sets the minimum relative entropy contributed by an entire model alignment, over
its whole length. This has the effect of making short models have higher relative
entropy per position than --ere alone would give. The default is 45.0 bits.

**--eid**

Sets the fractional pairwise identity cutoff used by single linkage clustering
with the --eclust option. The default is 0.62.
]]></token>
  <token name="@ARSWS_HELP@"><![CDATA[
Options Controlling Relative Weights
------------------------------------

HMMER uses an ad hoc sequence weighting algorithm to downweight closely related
sequences and up-weight distantly related ones. This has the effect of making
models less biased by uneven phylogenetic representation. For example, two
identical sequences would typically each receive half the weight that one
sequence would. These options control which algorithm gets used.


**--wpb**

Use the Henikoff position-based sequence weighting scheme [Henikoff and
Henikoff, J. Mol. Biol. 243:574, 1994]. This is the default.

**--wgsc**

Use the Gerstein/Sonnhammer/Chothia weighting algorithm [Gerstein et al, J.
Mol. Biol. 235:1067, 1994].

**--wblosum**

Use the same clustering scheme that was used to weight data in calculating
BLOSUM substitution matrices [Henikoff and Henikoff, Proc. Natl. Acad. Sci
89:10915, 1992]. Sequences are single-linkage clustered at an identity
threshold (default 0.62; see --wid) and within each cluster of c sequences,
each sequence gets relative weight 1/c.

**--wnone**

No relative weights. All sequences are assigned uniform weight.

**--wid**

Sets the identity threshold used by single-linkage clustering when using
--wblosum.  Invalid with any other weighting scheme. Default is 0.62.
]]></token>
  <token name="@BIAS_COMP_HELP@"><![CDATA[
Bias Composition
----------------

The next number, the bias, is a correction term for biased sequence composition
that has been applied to the sequence bit score. For instance, for the top hit
MYG PHYCA that scored 222.7 bits, the bias of 3.2 bits means that this sequence
originally scored 225.9 bits, which was adjusted by the slight 3.2 bit biased-
composition correction. The only time you really need to pay attention to the
bias value is when it’s large, on the same order of magnitude as the sequence
bit score. Sometimes (rarely) the bias correction isn’t aggressive enough, and
allows a non-homolog to retain too much score.  Conversely, the bias correction
can be too aggressive sometimes, causing you to miss homologs. You can turn the
biased-composition score correction off with the --nonull2 option (and if
you’re doing that, you may also want to set --nobias, to turn off another
biased composition step called the bias filter, which affects which sequences
get scored at all).

]]></token>
  <token name="@CUT_HELP@"><![CDATA[
Options for Model-specific Score Thresholding
---------------------------------------------

Curated profile databases may define specific bit score thresholds for each
profile, superseding any thresholding based on statistical significance alone.
To use these options, the profile must contain the appropriate (GA, TC, and/or
NC) optional score threshold annotation; this is picked up by hmmbuild from
Stockholm format alignment files. Each thresholding option has two scores: the
per-sequence threshold <x1> and the per-domain threshold <x2>. These act as if
-T<x1> --incT<x1> --domT<x2> --incdomT<x2> has been applied specifically using
each model’s curated thresholds.

**--cut_ga**

Use the GA (gathering) bit scores in the model to set per-sequence (GA1) and
per-domain (GA2) reporting and inclusion thresholds. GA thresholds are
generally considered to be the reliable curated thresholds defining family
membership; for example, in Pfam, these thresholds define what gets included in
Pfam Full alignments based on searches with Pfam Seed models.

**--cut_nc**

Use the NC (noise cutoff) bit score thresholds in the model to set
per-sequence (NC1) and per-domain (NC2) reporting and inclusion thresholds. NC
thresholds are generally considered to be the score of the highest-scoring
known false positive.

**--cut_tc**

Use the TC (trusted cutoff) bit score thresholds in the model to set
per-sequence (TC1) and per-domain (TC2) reporting and inclusion thresholds. TC
thresholds are generally considered to be the score of the lowest-scoring known
true positive that is above all known false positives.
]]></token>
  <token name="@EVAL_CALIB_HELP@"><![CDATA[
Options Controlling H3 Parameter Estimation Methods
---------------------------------------------------

H3 uses three short random sequence simulations to estimate the location
parameters for the expected score distributions for MSV scores, Viterbi scores,
and Forward scores. These options allow these simulations to be modified.

**--EmL**

Sets the sequence length in simulation that estimates the location parameter mu
for MSV E-values. Default is 200.

**--EmN**

Sets the number of sequences in simulation that estimates the location parameter
mu for MSV E-values. Default is 200.

**--EvL**

Sets the sequence length in simulation that estimates the location parameter mu
for Viterbi E-values. Default is 200.

**--EvN**

Sets the number of sequences in simulation that estimates the location parameter
mu for Viterbi E-values. Default is 200.


**--EfL**

Sets the sequence length in simulation that estimates the location parameter tau
for Forward E-values. Default is 100.

**--EfN**

Sets the number of sequences in simulation that estimates the location parameter
tau for Forward E-values. Default is 200.

**--Eft**

Sets the tail mass fraction to fit in the simulation that estimates the location
parameter tau for Forward E-values. Default is 0.04.
]]></token>
  <token name="@FORMAT_SELECTOR_HELP@"><![CDATA[
Options for Specifying the Alphabet
-----------------------------------

The alphabet type (amino, DNA, or RNA) is autodetected by default, by looking
at the composition of the msafile. Autodetection is normally quite reliable,
but occasionally alphabet type may be ambiguous and autodetection can fail (for
instance, on tiny toy alignments of just a few residues). To avoid this, or to
increase robustness in automated analysis pipelines, you may specify the
alphabet type of msafile with these options.
]]></token>
  <token name="@HSSI_HELP@"><![CDATA[
Options Controlling Single Sequence Scoring (first Iteration)
-------------------------------------------------------------

By default, the first iteration uses a search model constructed from a single
query sequence. This model is constructed using a standard 20x20 substitution
matrix for residue probabilities, and two additional parameters for
position-independent gap open and gap extend probabilities. These options allow
the default single-sequence scoring parameters to be changed.

**Gap Open (--popen)**

Set the gap open probability for a single sequence query model to <x>.

**Gap Extend (--pextend)**

Set the gap extend probability for a single sequence query model to <x>.


**--mx/--mxfile**

These options are not currently supported
]]></token>
  <token name="@LENGTHS_HELP@"><![CDATA[
Tail Mass Options
-----------------

**Window length tail mass (--w_beta)**

The upper bound, W, on the length at which nhmmer expects to find an instance
of the model is set such that the fraction of all sequences generated by the
model with length >= W is less than <x>. The default is 1e-7.


**Model instance length upper bound (--w_length)**

Override the model instance length upper bound, W, which is otherwise
controlled by --w_beta. It should be larger than the model length. The value of
W is used deep in the acceleration pipeline, and modest changes are not
expected to impact results (though larger values of W do lead to longer run
time).

]]></token>
  <token name="@MCSS_HELP@"><![CDATA[
**Options Controlling Profile Construction**

These options control how consensus columns are defined in an alignment.

**--fast**

Define consensus columns as those that have a fraction >= symfrac of residues
as opposed to gaps. (See below for the --symfrac option.) This is the default.

**--hand**

Define consensus columns in next profile using reference annotation to the multiple
alignment. This allows you to define any consensus columns you like.


**--symfrac**

Define the residue fraction threshold necessary to define a consensus column
when using the --fast option. The default is 0.5. The symbol fraction in each
column is calculated after taking relative sequence weighting into account, and
ignoring gap characters corresponding to ends of sequence fragments (as opposed
to internal insertions/deletions). Setting this to 0.0 means that every
alignment column will be assigned as consensus, which may be useful in some
cases. Setting it to 1.0 means that only columns that include 0 gaps (internal
insertions/deletions) will be assigned as consensus.

**--fragthresh**

We only want to count terminal gaps as deletions if the aligned sequence is
known to be full-length, not if it is a fragment (for instance, because only
part of it was sequenced). HMMER uses a simple rule to infer fragments: if the
sequence length L is less than or equal to a fraction <x> times the alignment
length in columns, then the sequence is handled as a fragment. The default is
0.5. Setting --fragthresh 0 will define no (nonempty) sequence as a fragment;
you might want to do this if you know you’ve got a carefully curated alignment
of full-length sequences. Setting --fragthresh 1 will define all sequences as
fragments; you might want to do this if you know your alignment is entirely
composed of fragments, such as translated short reads in metagenomic shotgun
data.

]]></token>
  <token name="@OFORMAT_WITH_OPTS_HELP@"><![CDATA[
Options for Controlling Output
------------------------------

**Table of hits**

Save a simple tabular (space-delimited) file summarizing the per-target output, with
one data line per homologous target model found.

**Table of per-domain hits**

Save a simple tabular (space-delimited) file summarizing the per-domain output,
with one data line per homologous domain detected in a query sequence for each
homologous model.

**Table of hits and domains in Pfam Format**

Save an especially succinct tabular (space-delimited) file summarizing the
per-target output, with one data line per homologous target model found.
]]></token>
  <token name="@OFORMAT_WITH_OPTS_NOPFAM_HELP@"><![CDATA[
Options for Controlling Output
------------------------------

**Table of hits**

Save a simple tabular (space-delimited) file summarizing the per-target output, with
one data line per homologous target model found.

**Table of per-domain hits**

Save a simple tabular (space-delimited) file summarizing the per-domain output,
with one data line per homologous domain detected in a query sequence for each
homologous model.
]]></token>
  <token name="@OFORMAT_WITH_OPTS_N_HELP@"><![CDATA[
Options for Controlling Output
------------------------------

**Table of hits**

Save a simple tabular (space-delimited) file summarizing the per-target output, with
one data line per homologous target model found.

**Table of hits (dfam)**

Save a tabular (space-delimited) file summarizing the per-hit output, similar
to --tblout but more succinct.


**List of per-position scores for each hit (--aliscoreout)**

Save to file a list of per-position scores for each hit. This is useful, for
example, in identifying regions of high score density for use in resolving
overlapping hits from different models.

]]></token>
  <token name="@PRIOR_HELP@"><![CDATA[
Options Controlling Priors
--------------------------

By default, weighted counts are converted to mean posterior probability
parameter estimates using mixture Dirichlet priors. Default mixture Dirichlet
prior parameters for protein models and for nucleic acid (RNA and DNA) models
are built in. The following options allow you to override the default priors.

**No priors (--pnone)**

Don’t use any priors. Probability parameters will simply be the observed
frequencies, after relative sequence weighting.

**Laplace +1 prior**

Use a Laplace +1 prior in place of the default mixture Dirichlet prior.
]]></token>
  <token name="@SEED_HELP@"><![CDATA[
Random Seeding
--------------

Seed the random number generator with <n>, an integer >= 0. If <n> is nonzero,
any stochastic simulations will be reproducible; the same command will give the
same results. If <n> is 0, the random number generator is seeded arbitrarily,
and stochastic simulations will vary from run to run of the same command.

]]></token>
  <token name="@THRESHOLDS_HELP@"><![CDATA[
Options for Reporting Thresholds
--------------------------------

Reporting thresholds control which hits are reported in output files (the main
output, --tblout, and --domtblout).

**E-value (-E)**

In the per-target output, report target profiles with an E-value of <= <x>. The
default is 10.0, meaning that on average, about 10 false positives will be
reported per query, so you can see the top of the noise and decide for yourself
if it’s really noise.

**Bit score (-T)**

Instead of thresholding per-profile output on E-value, instead report target profiles
with a bit score of >= <x>.

**domain E-value (--domE)**

In the per-domain output, for target profiles that have already satisfied the
per-profile reporting threshold, report individual domains with a conditional
E-value of <= <x>. The default is 10.0. A conditional E-value means the
expected number of additional false positive domains in the smaller search
space of those comparisons that already satisfied the per-profile reporting
threshold (and thus must have at least one homologous domain already).

**domain Bit scores (--domT)**

Instead of thresholding per-domain output on E-value, instead report domains
with a bit score of >= <x>.

Options for Inclusion Thresholds
--------------------------------

Inclusion thresholds are stricter than reporting thresholds. Inclusion
thresholds control which hits are considered to be reliable enough to be
included in an output alignment or a subsequent search round. In hmmscan, which
does not have any alignment output (like hmmsearch or phmmer) nor any iterative
search steps (like jackhmmer), inclusion thresholds have little effect. They
only affect what domains get marked as significant (!) or questionable (?) in
domain output.

**E-value of per target inclusion threshold**

Use an E-value of <= <x> as the per-target inclusion threshold. The default is
0.01, meaning that on average, about 1 false positive would be expected in
every 100 searches with different query sequences.

**Bit score of per target inclusion threshold**

Instead of using E-values for setting the inclusion threshold, instead use a
bit score of >= <x> as the per-target inclusion threshold. It would be unusual
to use bit score thresholds with hmmscan, because you don’t expect a single
score threshold to work for different profiles; different profiles have
slightly different expected score distributions.

**domain E-value per target inclusion threshold**

Use a conditional E-value of <= <x> as the per-domain inclusion threshold, in
targets that have already satisfied the overall per-target inclusion threshold.

**domain Bit score per target inclusion threshold**

Instead of using E-values, instead use a bit score of >= <x> as the per-domain
inclusion threshold. As with --incT above, it would be unusual to use a single
bit score threshold in hmmscan.

]]></token>
  <token name="@THRESHOLDS_NODOM_HELP@"><![CDATA[
Options for Reporting Thresholds
--------------------------------

Reporting thresholds control which hits are reported in output files (the main
output, --tblout, and --domtblout).

**E-value (-E)**

In the per-target output, report target profiles with an E-value of <= <x>. The
default is 10.0, meaning that on average, about 10 false positives will be
reported per query, so you can see the top of the noise and decide for yourself
if it’s really noise.

**Bit score (-T)**

Instead of thresholding per-profile output on E-value, instead report target profiles
with a bit score of >= <x>.

Options for Inclusion Thresholds
--------------------------------

Inclusion thresholds are stricter than reporting thresholds. Inclusion
thresholds control which hits are considered to be reliable enough to be
included in an output alignment or a subsequent search round. In hmmscan, which
does not have any alignment output (like hmmsearch or phmmer) nor any iterative
search steps (like jackhmmer), inclusion thresholds have little effect. They
only affect what domains get marked as significant (!) or questionable (?) in
domain output.

**E-value of per target inclusion threshold**

Use an E-value of <= <x> as the per-target inclusion threshold. The default is
0.01, meaning that on average, about 1 false positive would be expected in
every 100 searches with different query sequences.

**Bit score of per target inclusion threshold**

Instead of using E-values for setting the inclusion threshold, instead use a
bit score of >= <x> as the per-target inclusion threshold. It would be unusual
to use bit score thresholds with hmmscan, because you don’t expect a single
score threshold to work for different profiles; different profiles have
slightly different expected score distributions.

]]></token>
  <token name="@ATTRIBUTION@"><![CDATA[

Attribution
-----------

This Galaxy tool relies on HMMER3_ from http://hmmer.janelia.org/
Internally the software is cited as:

::

    # hmmscan :: search sequence(s) against a profile database
    # HMMER 3.1 (February 2013); http://hmmer.org/
    # Copyright (C) 2011 Howard Hughes Medical Institute.
    # Freely distributed under the GNU General Public License (GPLv3).
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

The wrappers were written by Eric Rasche and are licensed under Apache2_. The
documentation is copied from the HMMER3 documentation.

.. _Apache2: http://www.apache.org/licenses/LICENSE-2.0
.. _HMMER3: http://hmmer.janelia.org/


  ]]></token>
  <token name="@HELP_PRE@"><![CDATA[

What it does
============
  ]]></token>
  <token name="@HELP_PRE_OTH@"><![CDATA[
Options
=======
  ]]></token>
</macros>
author	iuc
date	Sat, 25 Jun 2016 15:06:16 -0400
parents
children	b49a4465041d