comparison cmalign._x_m_l_todo @ 3:2c2c5e5e495b draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/infernal commit 9eeedfaf35c069d75014c5fb2e42046106bf813c-dirty
author bgruening
date Fri, 04 Mar 2016 07:24:53 -0500
parents
children
comparison
equal deleted inserted replaced
2:fac157e22e1b 3:2c2c5e5e495b
1 <tool id="infernal_cmalign" name="Align sequences to a covariance model" version="1.1.0.2">
2 <description>against a sequence database (cmsearch)</description>
3 <parallelism method="multi" split_inputs="seqdb" split_mode="to_size" split_size="100" shared_inputs="" merge_outputs="outfile,multiple_alignment_output"/> <!-- seqdb is the split input; outfile and multiple_alignment_output are merged afterwards -->
4 <requirements> <!-- one entry per package; infernal is pinned to a single version -->
6 <requirement type="package" version="1.1">infernal</requirement>
7 <requirement type="package" version="8.22">gnu_coreutils</requirement>
8 </requirements>
9 <command>
10 <![CDATA[
11 ## a temp file is needed, because the standard tabular output from infernal is not useful in Galaxy
12 ## it will be converted to a tab delimited file and piped to Galaxy
13 temp_tabular_output=\$(mktemp);
14
15 cmsearch
16 ## Infernal Options
17 --cpu "\${GALAXY_SLOTS:-12}"
18 -o /dev/null
19 --tformat $seqdb.ext ##target format: fasta, embl, genbank, ddbj, stockholm, pfam, a2m, afa, clustal, and phylip
20 $bottomonly
21 $toponly
22 $cyk
23 $notrunc
24 $max
25 $nohmm
26 $mid
27 ##$bitscore_thresholds
28 --tblout \$temp_tabular_output
29 $g
30 #if $A:
31 $A $multiple_alignment_output
32 #end if
33
34 #if $inclusion_thresholds_opts.inclusion_thresholds_selector == "--incE":
35 --incE $inclusion_thresholds_opts.incE
36 #elif $inclusion_thresholds_opts.inclusion_thresholds_selector == "--incT":
37 --incT $inclusion_thresholds_opts.incT
38 #end if
39
40 #if $reporting_thresholds_opts.reporting_thresholds_selector == "-E":
41 -E $reporting_thresholds_opts.E
42 #elif $reporting_thresholds_opts.reporting_thresholds_selector == "-T":
43 -T $reporting_thresholds_opts.T
44 #end if
45
46 ## CM file from the history or stored as database on disc
47
48 #if $cm_opts.cm_opts_selector == "db":
49 $cm_opts.database.fields.path
50 #else:
51 $cm_opts.cmfile
52 #end if
53
54 ## sequence file
55 $seqdb
56 2>&1
57 ;
58
59 ## 1. remove comment text (everything from a # onwards) and drop the lines left empty by that
60 ## 2. turn spaces into tabs, then turn the 18th and every later tab back into a space: the 18th field is free text (can contain spaces)
61 sed -e 's/#.*$//' -e '/^$/d' -e 's/ /\t/g' -e 's/\t/ /18g' \$temp_tabular_output > $outfile && rm -f \$temp_tabular_output
62 ## the temporary table is deleted once sed has succeeded, so it no longer piles up in the temp directory
63 ]]>
64 </command>
65 <inputs>
66
67 <param type="data" name="seqdb" format="fasta" label="Sequence database"/> <!-- sequence file passed to cmsearch as its last argument -->
68 <conditional name="cm_opts"> <!-- covariance model source: an infernal.loc entry or a history dataset -->
69 <param type="select" name="cm_opts_selector" label="Subject covariance models">
70 <option selected="True" value="db">Locally installed covariance models</option>
71 <option value="histdb">Covariance model from your history</option>
72 </param>
73 <when value="db">
74 <param type="select" name="database" label="Covariance models">
75 <options from_file="infernal.loc">
76 <column index="0" name="value"/>
77 <column index="1" name="name"/>
78 <column index="2" name="path"/>
79 </options>
80 </param>
81 </when>
82 <when value="histdb">
83 <param type="data" name="cmfile" format="txt" label="Covariance models file from the history."/>
84 </when>
85 </conditional>
86
87 <param name="g" type="boolean" checked="False" truevalue="-g" falsevalue=""
88 label="Turn on the glocal alignment algorithm" help="... global with respect to the query model and local with respect to the target database."/>
89 <!-- strand restriction flags; each one is emitted verbatim into the command when checked -->
90 <param name="bottomonly" type="boolean" checked="False" truevalue="--bottomonly" falsevalue=""
91 label="Only search the bottom (Crick) strand of target sequences" help="in the sequence database"/>
92 <param name="toponly" type="boolean" checked="False" truevalue="--toponly" falsevalue=""
93 label="Only search the top (Watson) strand of target sequences" help="in the sequence database"/>
94
95 <param name="cyk" truevalue="--cyk" falsevalue="" checked="False" type="boolean"
96 label="Use the CYK algorithm, not Inside, to determine the final score of all hits" help=""/>
97 <param name="acyk" truevalue="--acyk" falsevalue="" checked="False" type="boolean"
98 label="Use the CYK algorithm to align hits" help="By default, the Durbin/Holmes optimal accuracy algorithm is used, which finds the alignment that maximizes the expected accuracy of all aligned residues."/> <!-- NOTE(review): acyk is not referenced by the command template yet; confirm before wiring it in -->
99
100 <param name="notrunc" type="boolean" checked="False" truevalue="--notrunc" falsevalue=""
101 label="Turn off truncated hit detection" help=""/>
102
103 <!-- acceleration pipeline: options that switch filter stages off -->
104
105 <param name="max" type="boolean" checked="False" truevalue="--max" falsevalue=""
106 label="Turn off all filters, and run non-banded Inside on every full-length target sequence" help="This
107 increases sensitivity somewhat, at an extremely large cost in speed."/>
108
109 <param name="nohmm" type="boolean" checked="False" truevalue="--nohmm" falsevalue=""
110 label="Turn off all HMM filter stages " help=""/>
111
112 <param name="mid" type="boolean" checked="False" truevalue="--mid" falsevalue=""
113 label="Turn off the HMM SSV and Viterbi filter stages" help=""/>
114
115
116 <!-- Options for model-specific score thresholding -->
117 <!--
118 <param name="bitscore_thresholds" type="select" label="Bit score thresholds" help="Curated CM databases may define specific bit score thresholds for each CM, superseding any thresholding based on statistical significance alone.">
119 <option value="" selected="true">None</option>
120 <option value=" - -cut_ga">GA (gathering) bit scores</option>
121 <option value=" - -cut_nc">NC (noise cutoff) bit score</option>
122 <option value=" - -cut_tc">TC (trusted cutoff) bit score</option>
123 </param>
124 -->
125 <!-- Options for inclusion thresholds -->
126
127
128 <conditional name="inclusion_thresholds_opts"> <!-- the selector value is the flag compared against in the command template; the empty value adds no flag -->
129 <param name="inclusion_thresholds_selector" type="select" label="Inclusion thresholds"
130 help="Inclusion thresholds are stricter than reporting thresholds. Inclusion thresholds control which hits are considered to be reliable enough to be included in an output alignment or in a possible subsequent search round, or marked as significant (”!”) as opposed to questionable (”?”) in hit output.">
131 <option value="" selected="true">default</option>
132 <option value="--incE">Use E-value</option>
133 <option value="--incT">Use bit score</option>
134 </param>
135 <when value=""/>
136 <when value="--incE">
137 <param name="incE" type="float" value="0.01" label="Use E-value" help="of &lt;= X as the hit inclusion threshold.">
138 <sanitizer>
139 <valid initial="string.printable">
140 <remove value="&apos;"/>
141 </valid>
142 </sanitizer>
143 </param>
144 </when>
145 <when value="--incT">
146 <param name="incT" type="integer" value="0" label="Use bit score" help="of &gt;= X as the hit inclusion threshold.">
147 <sanitizer>
148 <valid initial="string.printable">
149 <remove value="&apos;"/>
150 </valid>
151 </sanitizer>
152 </param>
153 </when>
154 </conditional>
155
156 <!-- Options controlling reporting thresholds -->
157
158 <conditional name="reporting_thresholds_opts"> <!-- the selector value is the flag compared against in the command template; the empty value adds no flag -->
159 <param name="reporting_thresholds_selector" type="select" label="reporting thresholds"
160 help="Reporting thresholds control which hits are reported in output files">
161 <option value="" selected="true">default</option>
162 <option value="-E">Use E-value</option>
163 <option value="-T">Use bit score</option>
164 </param>
165 <when value=""/>
166 <when value="-E">
167 <param name="E" type="float" value="10.0" label="Use E-value" help="of &lt;= X as the hit reporting threshold. The default is 10.0, meaning that on average, about 10 false positives will be reported per query, so you can see the top of the noise and decide for yourself if it’s really noise.">
168 <sanitizer>
169 <valid initial="string.printable">
170 <remove value="&apos;"/>
171 </valid>
172 </sanitizer>
173 </param>
174 </when>
175 <when value="-T">
176 <param name="T" type="integer" value="0" label="Use bit score" help="of &gt;= X as the hit reporting threshold.">
177 <sanitizer>
178 <valid initial="string.printable">
179 <remove value="&apos;"/>
180 </valid>
181 </sanitizer>
182 </param>
183 </when>
184 </conditional>
185
186 <param name="A" type="boolean" checked="False" truevalue="-A" falsevalue=""
187 label="Save a multiple alignment of all significant hits" help="... those satisfying inclusion thresholds"/> <!-- the multiple_alignment_output dataset is filtered on this option -->
188
189 </inputs>
190 <outputs>
191 <!-- outfile: the hit table after the sed step of the command has made it tab-delimited -->
192 <data format="tabular" name="outfile" label="cmsearch on ${on_string}"/>
193 <data format="stockholm" name="multiple_alignment_output" label="cmsearch on ${on_string} (multi alignment)">
194 <filter>A is True</filter>
195 </data>
196 <!-- multiple_alignment_output exists only when option A is checked; the -A file is an alignment, not a table, hence the stockholm format -->
197 </outputs>
198 <help>
199 <![CDATA[
200
201
202 **What it does**
203
204 cmalign aligns the RNA sequences to the covariance model (CM).
205
206
207 The sequence file must be in FASTA or Genbank format. cmalign
208 uses an HMM banding technique to accelerate alignment by default. By default,
209 cmalign computes the alignment with maximum expected accuracy that is consistent with constraints
210 (bands) derived from an HMM, using a banded version of the Durbin/Holmes optimal accuracy algorithm. cmalign takes special care to correctly align truncated sequences, where some nucleotides from the beginning (5’) and/or end (3’) of the actual full length biological sequence are not present in the input sequence. This behavior is on by default.
211
212
213
214 **Output format**
215
216
217 (1) target name: The name of the target sequence or profile.
218 (2) accession: The accession of the target sequence or profile, or ’-’ if none.
219 (3) query name: The name of the query sequence or profile.
220 (4) accession: The accession of the query sequence or profile, or ’-’ if none.
221 (5) mdl (model): Which type of model was used to compute the final score. Either ’cm’ or ’hmm’. A CM is used to compute the final hit scores unless the model has zero basepairs or the --hmmonly option is used, in which case a HMM will be used.
222 (6) mdl from (model coord): The start of the alignment of this hit with respect to the profile (CM or HMM), numbered 1..N for a profile of N consensus positions.
223 (7) mdl to (model coord): The end of the alignment of this hit with respect to the profile (CM or HMM), numbered 1..N for a profile of N consensus positions.
224 (8) seq from (ali coord): The start of the alignment of this hit with respect to the sequence, numbered 1..L for a sequence of L residues.
225 (9) seq to (ali coord): The end of the alignment of this hit with respect to the sequence, numbered 1..L for a sequence of L residues.
226 (10) strand: The strand on which the hit occurs on the sequence. ’+’ if the hit is on the top (Watson) strand, ’-’ if the hit is on the bottom (Crick) strand. If on the top strand, the “seq from” value will be less than or equal to the “seq to” value, else it will be greater than or equal to it.
227 (11) trunc: Indicates if this is predicted to be a truncated CM hit or not. This will be “no” if it is a CM hit that is not predicted to be truncated by the end of the sequence, “5’ ” or “3’ ” if the hit is predicted to have one or more 5’ or 3’ residues missing due to an artificial truncation of the sequence, or “5’&3’” if the hit is predicted to have one or more 5’ residues missing and one or more 3’ residues missing. If the hit is an HMM hit, this will always be ’-’.
228 (12) pass: Indicates what “pass” of the pipeline the hit was detected on. This is probably only useful for testing and debugging. Non-truncated hits are found on the first pass, truncated hits are found on successive passes.
229 (13) gc: Fraction of G and C nucleotides in the hit.
230 (14) bias: The biased-composition correction: the bit score difference contributed by the null3 model for CM hits, or the null2 model for HMM hits. High bias scores may be a red flag for a false positive. It is difficult to correct for all possible ways in which nonrandom but nonhomologous biological sequences can appear to be similar, such as short-period tandem repeats, so there are cases where the bias correction is not strong enough (creating false positives).
231 (15) score: The score (in bits) for this target/query comparison. It includes the biased-composition correction (the “null3” model for CM hits, or the “null2” model for HMM hits).
232 (16) E-value: The expectation value (statistical significance) of the target. This is a per query E-value; i.e. calculated as the expected number of false positives achieving this comparison’s score for a single query against the search space Z. For cmsearch Z is defined as the total number of nucleotides in the target dataset multiplied by 2 because both strands are searched. For cmscan Z is the total number of nucleotides in the query sequence multiplied by 2 because both strands are searched and multiplied by the number of models in the target database. If you search with multiple queries and if you want to control the overall false positive rate of that search rather than the false positive rate per query, you will want to multiply this per-query E-value by how many queries you’re doing.
233 (17) inc: Indicates whether or not this hit achieves the inclusion threshold: ’!’ if it does, ’?’ if it does not (and rather only achieves the reporting threshold). By default, the inclusion threshold is an E-value of 0.01 and the reporting threshold is an E-value of 10.0, but these can be changed with command line options as described in the manual pages.
234 (18) description of target: The remainder of the line is the target’s description line, as free text.
235
236
237 For further questions please refer to the Infernal `Userguide <http://selab.janelia.org/software/infernal/Userguide.pdf>`_.
238
239
240 ]]>
241 </help>
242
243 <citations> <!-- one DOI reference plus a BibTeX entry for the galaxytools repository -->
244 <citation type="doi">10.1093/bioinformatics/btt509</citation>
245 <citation type="bibtex">
246 @ARTICLE{bgruening_galaxytools,
247 Author = {Björn Grüning and Cameron Smith and Torsten Houwaart and Nicola Soranzo and Eric Rasche},
248 keywords = {bioinformatics, ngs, galaxy, cheminformatics, rna},
249 title = {{Galaxy Tools - A collection of bioinformatics and cheminformatics tools for the Galaxy environment}},
250 url = {https://github.com/bgruening/galaxytools}
251 }
252 </citation>
253 </citations>
254
255
256 </tool>