0
|
1 <tool id="glimmer_knowlegde-based" name="Glimmer3" version="0.2">
|
|
2 <description>Predict ORFs in prokaryotic genomes (knowledge-based)</description>
|
|
3 <requirements>
|
|
4 <requirement type="package" version="3.02b">glimmer</requirement>
|
|
5 <requirement type="package" version="1.61">biopython</requirement>
|
|
6 <requirement type="set_environment">GLIMMER_SCRIPT_PATH</requirement>
|
|
7 </requirements>
|
|
8 <command>
|
|
9 #import tempfile, os
|
|
10 #set $temp = tempfile.NamedTemporaryFile( delete=False )
|
|
11 #silent $temp.close()
|
|
12 #set $temp = $temp.name
|
|
13 ## temp now holds only the path of a closed, empty temporary file; it is handed to glimmer3 as its last argument and the .predict / .detail files used below share that path as prefix
|
|
14 glimmer3
|
|
15 --max_olap $max_olap
|
|
16 --gene_len $gene_len
|
|
17 --threshold $threshold
|
|
18 #if float( str($gc_percent) ) > 0.0:
|
|
19 --gc_percent $gc_percent
|
|
20 #end if
|
|
21 ## stop codons are given either as a Genbank translation table number or as a free-form comma-separated list
|
|
22 #if $stop_codon_opts.stop_codon_opts_selector == "gb":
|
|
23 --trans_table "${stop_codon_opts.genbank_gencode}"
|
|
24 #else:
|
|
25 --stop_codons "${stop_codon_opts.stop_codons}"
|
|
26 #end if
|
|
27
|
|
28 --start_codons $start_codons
|
|
29
|
|
30 $linear
|
|
31 $no_indep
|
|
32 $extend
|
|
33 $seq_input
|
|
34 $icm_input
|
|
35 $temp 2>&amp;1;
|
|
36
|
|
37 ## convert prediction to FASTA sequences
|
|
38 \$GLIMMER_SCRIPT_PATH/glimmer2seq.py $temp".predict" $seq_input $genes_output;
|
|
39 ## keep the prediction table only when the user asked for it, otherwise delete it
|
|
40 #if $report:
|
|
41 mv $temp".predict" $report_output;
|
|
42 #else:
|
|
43 rm $temp".predict";
|
|
44 #end if
|
|
45 ## same handling for the detailed report
|
|
46 #if $detailed_report:
|
|
47 mv $temp".detail" $detailed_output;
|
|
48 #else:
|
|
49 rm $temp".detail";
|
|
50 #end if
|
|
51 ## finally remove the placeholder file created by NamedTemporaryFile above
|
|
52 rm $temp
|
|
53 </command>
|
|
54 <inputs>
|
|
55 <param name="seq_input" type="data" format="fasta" label="Genome Sequence" />
|
|
56 <param name="icm_input" type="data" format="data" label="Interpolated context model (ICM)" />
|
|
57
|
|
58 <param name="max_olap" type="integer" value="50" label="Set maximum overlap length" help="Overlaps this short or shorter are ignored." />
|
|
59 <param name="gene_len" type="integer" value="90" label="Set the minimum gene length to n nucleotides" help="This does not include the bases in the stop codon."/>
|
|
60 <param name="threshold" type="integer" value="30" label="Set threshold score for calling as gene" help="If the in-frame score >= N, then the region is given a number and considered a potential gene." />
|
|
61 <param name="gc_percent" type="float" value="0.0" label="Set the GC percentage of the independent model, i.e., the model of intergenic sequence" help="If 0.0 specified, the GC percentage will be counted from the input file." />
|
|
62
|
|
63 <param name="linear" type="boolean" truevalue="--linear" falsevalue="" checked="true" label="Assume linear rather than circular genome, i.e., no wraparound" />
|
|
64 <param name="no_indep" type="boolean" truevalue="--no_indep" falsevalue="" checked="false" label="Don’t use the independent probability score column at all" help="Using this option will produce more short gene predictions." />
|
|
65 <param name="extend" type="boolean" truevalue="--extend" falsevalue="" checked="false" label="Also score orfs that extend off the end of the sequence(s)" />
|
|
66 <param name="start_codons" type="text" value="atg,gtg,ttg" label="Specify start codons as a comma-separated list" />
|
|
67
|
|
68 <conditional name="stop_codon_opts">
|
|
69 <param name="stop_codon_opts_selector" type="select" label="Specify stop codons as">
|
|
70 <option value="gb" selected="True">Genbank translation table entry</option>
|
|
71 <option value="free_form">Comma-separated list</option>
|
|
72 </param>
|
|
73 <when value="gb">
|
|
74 <param name="genbank_gencode" type="select" label="Use Genbank translation table to specify stop codons">
|
|
75 <option value="1" selected="True">1. Standard</option>
|
|
76 <option value="2">2. Vertebrate Mitochondrial</option>
|
|
77 <option value="3">3. Yeast Mitochondrial</option>
|
|
78 <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
|
|
79 <option value="5">5. Invertebrate Mitochondrial</option>
|
|
80 <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
|
|
81 <option value="9">9. Echinoderm Mitochondrial</option>
|
|
82 <option value="10">10. Euplotid Nuclear</option>
|
|
83 <option value="11">11. Bacteria and Archaea</option>
|
|
84 <option value="12">12. Alternative Yeast Nuclear</option>
|
|
85 <option value="13">13. Ascidian Mitochondrial</option>
|
|
86 <option value="14">14. Flatworm Mitochondrial</option>
|
|
87 <option value="15">15. Blepharisma Macronuclear</option>
|
|
88 <option value="16">16. Chlorophycean Mitochondrial</option>
|
|
89 <option value="21">21. Trematode Mitochondrial</option>
|
|
90 <option value="22">22. Scenedesmus obliquus mitochondrial</option>
|
|
91 <option value="23">23. Thraustochytrium Mitochondrial</option>
|
|
92 <option value="24">24. Pterobranchia mitochondrial</option>
|
|
93 </param>
|
|
94 </when>
|
|
95 <when value="free_form">
|
|
96 <param name="stop_codons" type="text" value="tag,tga,taa" label="Specify stop codons as a comma-separated list" />
|
|
97 </when>
|
|
98 </conditional>
|
|
99
|
|
100 <param name="report" type="boolean" truevalue="" falsevalue="" checked="false" label="Report the classic glimmer table output"/>
|
|
101 <param name="detailed_report" type="boolean" truevalue="" falsevalue="" checked="false" label="Output a detailed gene prediction report as separate file"/>
|
|
102 </inputs>
|
|
103 <outputs>
|
|
104 <data name="genes_output" format="fasta" label="Glimmer3 on ${on_string} (Gene Prediction FASTA)" />
|
|
105 <data name="report_output" format="txt" label="Glimmer3 on ${on_string} (Gene Prediction table)">
|
|
106 <filter>report == True</filter>
|
|
107 </data>
|
|
108 <data name="detailed_output" format="txt" label="Glimmer3 on ${on_string} (detailed report)">
|
|
109 <filter>detailed_report == True</filter>
|
|
110 </data>
|
|
111 </outputs>
|
|
112 <tests>
|
|
113 <test>
|
|
114 <param name="seq_input" value='streptomyces_Tu6071_genomic.fasta' />
|
|
115 <param name="icm_input" value='streptomyces_Tu6071_plasmid_genes.icm' />
|
|
116 <param name="max_olap" value="50" />
|
|
117 <param name="gene_len" value="90" />
|
|
118 <param name="threshold" value="30" />
|
|
119 <param name="gc_percent" value="0.0" />
|
|
120 <param name="linear" value="--linear" />
|
|
121 <param name="no_indep" value="" />
|
|
122 <param name="extend" value="" />
|
|
123 <param name="start_codons" value="atg,gtg,ttg" />
|
|
124 <param name="genbank_gencode" value="11" />
|
|
125 <param name="detailed_report" value="" />
|
|
126 <param name="report" value="" />
|
|
127 <output name="genes_output" file='glimmer_w_icm_trans-table-11_genomic.fasta' ftype="fasta" />
|
|
128 </test>
|
|
129 </tests>
|
|
130 <help>
|
|
131
|
|
132
|
|
133 **What it does**
|
|
134
|
|
135 This is the main program that makes gene predictions based on an interpolated context model (ICM).
|
|
136
|
|
137 The ICM can be generated with extracted CDS from related organisms (ICM builder). If you can't generate an ICM model you can use the non-knowledge-based Glimmer with a de novo prediction.
|
|
138
|
|
139 -----
|
|
140
|
|
141 **Example**
|
|
142
|
|
143 *Input*::
|
|
144
|
|
145 - interpolated context model (ICM): Use the 'Glimmer ICM builder' tool to create one
|
|
146 - Genome Sequence in FASTA format
|
|
147
|
|
148 >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7
|
|
149 GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT
|
|
150 GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT
|
|
151 TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT
|
|
152 TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC
|
|
153 GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA
|
|
154 ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG
|
|
155 AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA
|
|
156 CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA
|
|
157 TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC
|
|
158 AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA
|
|
159 GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC
|
|
160 AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC
|
|
161 CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA
|
|
162 AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC
|
|
163 GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT
|
|
164 .....
|
|
165
|
|
166 *Output*::
|
|
167
|
|
168 - FASTA file with predicted proteins
|
|
169 - Glimmer prediction file (optional)
|
|
170
|
|
171 >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
|
|
172 orf00001 40137 52 +2 8.68
|
|
173 orf00004 603 34 -1 2.91
|
|
174 orf00006 1289 1095 -3 3.16
|
|
175 orf00007 1555 1391 -2 2.33
|
|
176 orf00008 1809 1576 -1 1.02
|
|
177 orf00010 1953 2066 +3 3.09
|
|
178 orf00011 2182 2304 +1 0.89
|
|
179 orf00013 2390 2521 +2 0.60
|
|
180 orf00018 2570 3073 +2 2.54
|
|
181 orf00020 3196 3747 +1 2.91
|
|
182 orf00022 3758 4000 +2 0.83
|
|
183 orf00023 4399 4157 -2 1.31
|
|
184 orf00025 4463 4759 +2 2.92
|
|
185 orf00026 4878 5111 +3 0.78
|
|
186 orf00027 5468 5166 -3 1.64
|
|
187 orf00029 5590 5832 +1 0.29
|
|
188 orf00032 6023 6226 +2 6.02
|
|
189 orf00033 6217 6336 +1 3.09
|
|
190 ........
|
|
191
|
|
192 - Glimmer detailed report (optional)
|
|
193
|
|
194 >CELF22B7 C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
|
|
195 Sequence length = 40222
|
|
196
|
|
197 ----- Start ----- --- Length ---- ------------- Scores -------------
|
|
198 ID Frame of Orf of Gene Stop of Orf of Gene Raw InFrm F1 F2 F3 R1 R2 R3 NC
|
|
199 0001 +2 40137 40137 52 135 135 9.26 96 - 96 - - 3 - 0
|
|
200 0002 +1 58 64 180 120 114 5.01 69 69 - - 30 - - 0
|
|
201 +3 300 309 422 120 111 -0.68 20 - - 20 38 - - 41
|
|
202 +3 423 432 545 120 111 1.29 21 - 51 21 13 - 8 5
|
|
203 0003 +2 401 416 595 192 177 2.51 93 - 93 - 5 - - 1
|
|
204 0004 -1 645 552 34 609 516 2.33 99 - - - 99 - - 0
|
|
205 +1 562 592 762 198 168 -2.54 1 1 - - - - - 98
|
|
206 +1 763 772 915 150 141 -1.34 1 1 - - - - 86 11
|
|
207 +3 837 846 1007 168 159 1.35 28 - 50 28 - - 17 3
|
|
208 0005 -3 1073 977 654 417 321 0.52 84 - - - - - 84 15
|
|
209 0006 -3 1373 1319 1095 276 222 3.80 99 - - - - - 99 0
|
|
210 0007 -2 1585 1555 1391 192 162 2.70 98 - - - - 98 - 1
|
|
211 0008 -1 1812 1809 1576 234 231 1.26 94 - - - 94 - - 5
|
|
212 0009 +2 1721 1730 1945 222 213 0.68 80 - 80 - - - - 19
|
|
213 .....
|
|
214
|
|
215 -------
|
|
216
|
|
217 **References**
|
|
218
|
|
219 A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
|
|
220
|
|
221
|
|
222 </help>
|
|
223
|
|
224 </tool>
|