Mercurial > repos > bgruening > glimmer_hmm
comparison glimmerHMM/glimmerhmm_tabular_to_sequence.py @ 0:c9699375fcf6 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/glimmer_hmm commit 0dc67759bcbdf5a8a285ded9ba751340d741fe63
author | bgruening |
---|---|
date | Wed, 13 Jul 2016 10:55:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c9699375fcf6 |
---|---|
1 #!/usr/bin/env python | |
2 """Convert GlimmerHMM gene predictions into protein sequences. | |
3 | |
4 This works with the GlimmerHMM specific output format: | |
5 | |
6 12 1 + Initial 10748 10762 15 | |
7 12 2 + Internal 10940 10971 32 | |
8 12 3 + Internal 11035 11039 5 | |
9 12 4 + Internal 11072 11110 39 | |
10 12 5 + Internal 11146 11221 76 | |
11 12 6 + Terminal 11265 11388 124 | |
12 | |
13 http://www.cbcb.umd.edu/software/GlimmerHMM/ | |
14 | |
15 Modified version of the converter from Brad Chapman: https://github.com/chapmanb/bcbb/blob/master/biopython/glimmer_to_proteins.py | |
16 | |
17 Usage: | |
18 glimmer_to_proteins.py <glimmer output> <ref fasta> <output file> <convert to protein ... False|True> | |
19 """ | |
20 | |
21 import sys | |
22 import os | |
23 import operator | |
24 | |
25 from Bio import SeqIO | |
26 from Bio.SeqRecord import SeqRecord | |
27 | |
28 def main(glimmer_file, ref_file, out_file, to_protein): | |
29 with open(ref_file) as in_handle: | |
30 ref_rec = SeqIO.read(in_handle, "fasta") | |
31 | |
32 base, ext = os.path.splitext(glimmer_file) | |
33 | |
34 with open(out_file, "w") as out_handle: | |
35 SeqIO.write(protein_recs(glimmer_file, ref_rec, to_protein), out_handle, "fasta") | |
36 | |
37 def protein_recs(glimmer_file, ref_rec, to_protein): | |
38 """Generate protein records | |
39 """ | |
40 with open(glimmer_file) as in_handle: | |
41 for gene_num, exons, strand in glimmer_predictions(in_handle): | |
42 seq_exons = [] | |
43 for start, end in exons: | |
44 seq_exons.append(ref_rec.seq[start:end]) | |
45 gene_seq = reduce(operator.add, seq_exons) | |
46 if strand == '-': | |
47 gene_seq = gene_seq.reverse_complement() | |
48 if to_protein: | |
49 yield SeqRecord(gene_seq.translate(), gene_num, "", "") | |
50 else: | |
51 yield SeqRecord(gene_seq, gene_num, "", "") | |
52 | |
53 def glimmer_predictions(in_handle): | |
54 """Parse Glimmer output, generating a exons and strand for each prediction. | |
55 """ | |
56 # read the header | |
57 while 1: | |
58 line = in_handle.readline() | |
59 if line.startswith(" # #"): | |
60 break | |
61 in_handle.readline() | |
62 # read gene predictions one at a time | |
63 cur_exons, cur_gene_num, cur_strand = ([], None, None) | |
64 while 1: | |
65 line = in_handle.readline() | |
66 if not line: | |
67 break | |
68 parts = line.strip().split() | |
69 # new exon | |
70 if len(parts) == 0: | |
71 yield cur_gene_num, cur_exons, cur_strand | |
72 cur_exons, cur_gene_num, cur_strand = ([], None, None) | |
73 else: | |
74 this_gene_num = parts[0] | |
75 this_strand = parts[2] | |
76 this_start = int(parts[4]) - 1 # 1 based | |
77 this_end = int(parts[5]) | |
78 if cur_gene_num is None: | |
79 cur_gene_num = this_gene_num | |
80 cur_strand = this_strand | |
81 else: | |
82 assert cur_gene_num == this_gene_num | |
83 assert cur_strand == this_strand | |
84 cur_exons.append((this_start, this_end)) | |
85 if len(cur_exons) > 0: | |
86 yield cur_gene_num, cur_exons, cur_strand | |
87 | |
88 if __name__ == "__main__": | |
89 if len(sys.argv) != 5: | |
90 print __doc__ | |
91 sys.exit() | |
92 main(*sys.argv[1:]) |