diff glimmer_w_icm.xml @ 0:841357e0acbf draft

Uploaded
author bgruening
date Sat, 06 Jul 2013 10:09:30 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmer_w_icm.xml	Sat Jul 06 10:09:30 2013 -0400
@@ -0,0 +1,224 @@
+<tool id="glimmer_knowlegde-based" name="Glimmer3" version="0.2">
+    <description>Predict ORFs in prokaryotic genomes (knowlegde-based)</description>
+    <requirements>
+        <requirement type="package" version="3.02b">glimmer</requirement>
+        <requirement type="package" version="1.61">biopython</requirement>
+        <requirement type="set_environment">GLIMMER_SCRIPT_PATH</requirement>
+    </requirements>
+    <command>
+    #import tempfile, os
+    #set $temp = tempfile.NamedTemporaryFile( delete=False )
+    #silent $temp.close()
+    #set $temp = $temp.name
+
+    glimmer3
+        --max_olap $max_olap
+        --gene_len $gene_len
+        --threshold $threshold
+        #if float( str($gc_percent) ) > 0.0:
+            --gc_percent $gc_percent
+        #end if
+
+        #if $stop_codon_opts.stop_codon_opts_selector == "gb":
+            --trans_table "${stop_codon_opts.genbank_gencode}"
+        #else:
+            --stop_codons "${stop_codon_opts.stop_codons}"
+        #end if
+
+        --start_codons $start_codons
+
+        $linear
+        $no_indep
+        $extend
+        $seq_input
+        $icm_input
+        $temp 2>&#38;1;
+
+    ## convert prediction to FASTA sequences
+    \$GLIMMER_SCRIPT_PATH/glimmer2seq.py $temp".predict" $seq_input $genes_output;
+
+    #if $report:
+        mv $temp".predict" $report_output;
+    #else:
+        rm $temp".predict";
+    #end if
+
+    #if $detailed_report:
+        mv $temp".detail" $detailed_output;
+    #else:
+        rm $temp".detail";
+    #end if
+
+    rm $temp
+    </command>
+    <inputs>
+        <param name="seq_input" type="data" format="fasta" label="Genome Sequence" />
+        <param name="icm_input" type="data" format="data" label="Interpolated context model (ICM)" />
+
+        <param name="max_olap" type="integer" value="50" label="Set maximum overlap length" help="Overlaps this short or shorter are ignored." />
+        <param name="gene_len" type="integer" value="90" label="Set the minimum gene length to n nucleotides" hrlp="This does not include the bases in the stop codon."/>
+        <param name="threshold" type="integer" value="30" label="Set threshold score for calling as gene" help="If the in-frame score >= N, then the region is given a number and considered a potential gene." />
+        <param name="gc_percent" type="float" value="0.0" label="Set the GC percentage of the independent model, i.e., the model of intergenic sequence" help="If 0.0 specified, the GC percentage will be counted from the input file." />
+
+        <param name="linear" type="boolean" truevalue="--linear" falsevalue="" checked="true" label="Assume linear rather than circular genome, i.e., no wraparound" />
+        <param name="no_indep" type="boolean" truevalue="--no_indep" falsevalue="" checked="false" label="Don’t use the independent probability score column at all" help="Using this option will produce more short gene predictions." />
+        <param name="extend" type="boolean" truevalue="--extend" falsevalue="" checked="false" label="Also score orfs that extend off the end of the sequence(s)" />
+        <param name="start_codons" type="text" value="atg,gtg,ttg" label="Specify start codons as a comma-separated list" />
+
+        <conditional name="stop_codon_opts">
+            <param name="stop_codon_opts_selector" type="select" label="Specify start codons as">
+              <option value="gb" selected="True">Genbank translation table entry</option>
+              <option value="free_form">Comma-separated list</option>
+            </param>
+            <when value="gb">
+                <param name="genbank_gencode" type="select" label="Use Genbank translation table to specify stop codons">
+                    <option value="1" select="True">1. Standard</option>
+                    <option value="2">2. Vertebrate Mitochondrial</option>
+                    <option value="3">3. Yeast Mitochondrial</option>
+                    <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
+                    <option value="5">5. Invertebrate Mitochondrial</option>
+                    <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
+                    <option value="9">9. Echinoderm Mitochondrial</option>
+                    <option value="10">10. Euplotid Nuclear</option>
+                    <option value="11">11. Bacteria and Archaea</option>
+                    <option value="12">12. Alternative Yeast Nuclear</option>
+                    <option value="13">13. Ascidian Mitochondrial</option>
+                    <option value="14">14. Flatworm Mitochondrial</option>
+                    <option value="15">15. Blepharisma Macronuclear</option>
+                    <option value="16">16. Chlorophycean Mitochondrial</option>
+                    <option value="21">21. Trematode Mitochondrial</option>
+                    <option value="22">22. Scenedesmus obliquus mitochondrial</option>
+                    <option value="23">23. Thraustochytrium Mitochondrial</option>
+                    <option value="24">24. Pterobranchia mitochondrial</option>
+                </param>
+            </when>
+            <when value="free_form">
+                <param name="stop_codons" type="text" value="tag,tga,taa" label="Specify stop codons as a comma-separated list" />
+            </when>
+        </conditional>
+
+        <param name="report" type="boolean" truevalue="" falsevalue="" checked="false" label="Report the classic glimmer table output"/>
+        <param name="detailed_report" type="boolean" truevalue="" falsevalue="" checked="false" label="Output a detailed gene prediction report as separate file"/>
+    </inputs>
+    <outputs>
+        <data name="genes_output" format="fasta" label="Glimmer3 on ${on_string} (Gene Prediction FASTA)" />
+        <data name="report_output" format="txt" label="Glimmer3 on ${on_string} (Gene Prediction table)">
+            <filter>report == True</filter>
+        </data>
+        <data name="detailed_output" format="txt" label="Glimmer3 on ${on_string} (detailed report)">
+            <filter>detailed_report == True</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="seq_input" value='streptomyces_Tu6071_genomic.fasta' />
+            <param name="icm_input" value='streptomyces_Tu6071_plasmid_genes.icm' />
+            <param name="max_olap" value="50" />
+            <param name="gene_len" value="90" />
+            <param name="threshold" value="30" />
+            <param name="gc_percent" value="0.0" />
+            <param name="linear" value="--linear" />
+            <param name="no_indep" value="" />
+            <param name="extend" value="" />
+            <param name="start_codons" value="atg,gtg,ttg" />
+            <param name="genbank_gencode" value="11" />
+            <param name="detailed_report" value="" />
+            <param name="report" value="" />
+            <output name="genes_output" file='glimmer_w_icm_trans-table-11_genomic.fasta' ftype="fasta" />
+        </test>
+    </tests>
+    <help>
+
+
+**What it does**
+
+This is the main program that makes gene preditions based on an interpolated context model (ICM).
+
+The ICM can be generated with extracted CDS from related organisms (ICM builder). If you can't generate an ICM model you can use the non knowlegde-based Glimmer with a de novo prediction.
+
+-----
+
+**Example**
+
+*Input*::
+	
+	- interpolated context model (ICM): Use the 'Glimmer ICM builder' tool to create one
+	- Genome Sequence in FASTA format
+	
+		>CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7
+		GATCCTTGTAGATTTTGAATTTGAAGTTTTTTCTCATTCCAAAACTCTGT
+		GATCTGAAATAAAATGTCTCAAAAAAATAGAAGAAAACATTGCTTTATAT
+		TTATCAGTTATGGTTTTCAAAATTTTCTGACATACCGTTTTGCTTCTTTT
+		TTTCTCATCTTCTTCAAATATCAATTGTGATAATCTGACTCCTAACAATC
+		GAATTTCTTTTCCTTTTTCTTTTTCCAACAACTCCAGTGAGAACTTTTGA
+		ATATCTTCAAGTGACTTCACCACATCAGAAGGTGTCAACGATCTTGTGAG
+		AACATCGAATGAAGATAATTTTAATTTTAGAGTTACAGTTTTTCCTCCGA
+		CAATTCCTGATTTACGAACATCTTCTTCAAGCATTCTACAGATTTCTTGA
+		TGCTCTTCTAGGAGGATGTTGAAATCCGAAGTTGGAGAAAAAGTTCTCTC
+		AACTGAAATGCTTTTTCTTCGTGGATCCGATTCAGATGGACGACCTGGCA
+		GTCCGAGAGCCGTTCGAAGGAAAGATTCTTGTGAGAGAGGCGTGAAACAC
+		AAAGGGTATAGGTTCTTCTTCAGATTCATATCACCAACAGTTTGAATATC
+		CATTGCTTTCAGTTGAGCTTCGCATACACGACCAATTCCTCCAACCTAAA
+		AAATTATCTAGGTAAAACTAGAAGGTTATGCTTTAATAGTCTCACCTTAC
+		GAATCGGTAAATCCTTCAAAAACTCCATAATCGCGTTTTTATCATTTTCT
+		.....
+
+*Output*:: 
+
+	- FASTA file with predicted proteins
+	- Glimmer prediction file (optional)
+
+		>CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
+		orf00001    40137       52  +2     8.68
+		orf00004      603       34  -1     2.91
+		orf00006     1289     1095  -3     3.16
+		orf00007     1555     1391  -2     2.33
+		orf00008     1809     1576  -1     1.02
+		orf00010     1953     2066  +3     3.09
+		orf00011     2182     2304  +1     0.89
+		orf00013     2390     2521  +2     0.60
+		orf00018     2570     3073  +2     2.54
+		orf00020     3196     3747  +1     2.91
+		orf00022     3758     4000  +2     0.83
+		orf00023     4399     4157  -2     1.31
+		orf00025     4463     4759  +2     2.92
+		orf00026     4878     5111  +3     0.78
+		orf00027     5468     5166  -3     1.64
+		orf00029     5590     5832  +1     0.29
+		orf00032     6023     6226  +2     6.02
+		orf00033     6217     6336  +1     3.09
+		........
+
+	- Glimmer detailed report (optional)
+
+		>CELF22B7  C.aenorhabditis elegans (Bristol N2) cosmid F22B7.
+		Sequence length = 40222
+			
+			   ----- Start -----           --- Length ----  ------------- Scores -------------
+		 ID  Frame   of Orf  of Gene     Stop   of Orf of Gene      Raw InFrm F1 F2 F3 R1 R2 R3 NC
+		0001    +2    40137    40137       52      135     135     9.26    96  - 96  -  -  3  -  0
+		0002    +1       58       64      180      120     114     5.01    69 69  -  - 30  -  -  0
+			+3      300      309      422      120     111    -0.68    20  -  - 20 38  -  - 41
+			+3      423      432      545      120     111     1.29    21  - 51 21 13  -  8  5
+		0003    +2      401      416      595      192     177     2.51    93  - 93  -  5  -  -  1
+		0004    -1      645      552       34      609     516     2.33    99  -  -  - 99  -  -  0
+			+1      562      592      762      198     168    -2.54     1  1  -  -  -  -  - 98
+			+1      763      772      915      150     141    -1.34     1  1  -  -  -  - 86 11
+			+3      837      846     1007      168     159     1.35    28  - 50 28  -  - 17  3
+		0005    -3     1073      977      654      417     321     0.52    84  -  -  -  -  - 84 15
+		0006    -3     1373     1319     1095      276     222     3.80    99  -  -  -  -  - 99  0
+		0007    -2     1585     1555     1391      192     162     2.70    98  -  -  -  - 98  -  1
+		0008    -1     1812     1809     1576      234     231     1.26    94  -  -  - 94  -  -  5
+		0009    +2     1721     1730     1945      222     213     0.68    80  - 80  -  -  -  - 19
+		.....
+
+-------
+
+**References**
+
+A.L. Delcher, K.A. Bratke, E.C. Powers, and S.L. Salzberg. Identifying bacterial genes and endosymbiont DNA with Glimmer. Bioinformatics (Advance online version) (2007).
+
+
+    </help>
+
+</tool>