0
|
1 #!/usr/bin/env python
|
|
2 """Convert GlimmerHMM gene predictions into protein sequences.
|
|
3
|
|
4 This works with the GlimmerHMM specific output format:
|
|
5
|
|
6 12 1 + Initial 10748 10762 15
|
|
7 12 2 + Internal 10940 10971 32
|
|
8 12 3 + Internal 11035 11039 5
|
|
9 12 4 + Internal 11072 11110 39
|
|
10 12 5 + Internal 11146 11221 76
|
|
11 12 6 + Terminal 11265 11388 124
|
|
12
|
|
13 http://www.cbcb.umd.edu/software/GlimmerHMM/
|
|
14
|
|
15 Modified version of the converter from Brad Chapman: https://github.com/chapmanb/bcbb/blob/master/biopython/glimmer_to_proteins.py
|
|
16
|
|
17 Usage:
|
|
18 glimmer_to_proteins.py <glimmer output> <ref fasta> <output file> <convert to protein ... False|True>
|
|
19 """
|
|
20
|
|
21 import sys
|
|
22 import os
|
|
23 import operator
|
|
24
|
|
25 from Bio import SeqIO
|
|
26 from Bio.SeqRecord import SeqRecord
|
|
27
|
|
28 def main(glimmer_file, ref_file, out_file, to_protein):
|
|
29 with open(ref_file) as in_handle:
|
|
30 ref_rec = SeqIO.read(in_handle, "fasta")
|
|
31
|
|
32 base, ext = os.path.splitext(glimmer_file)
|
|
33
|
|
34 with open(out_file, "w") as out_handle:
|
|
35 SeqIO.write(protein_recs(glimmer_file, ref_rec, to_protein), out_handle, "fasta")
|
|
36
|
|
37 def protein_recs(glimmer_file, ref_rec, to_protein):
|
|
38 """Generate protein records
|
|
39 """
|
|
40 with open(glimmer_file) as in_handle:
|
|
41 for gene_num, exons, strand in glimmer_predictions(in_handle):
|
|
42 seq_exons = []
|
|
43 for start, end in exons:
|
|
44 seq_exons.append(ref_rec.seq[start:end])
|
|
45 gene_seq = reduce(operator.add, seq_exons)
|
|
46 if strand == '-':
|
|
47 gene_seq = gene_seq.reverse_complement()
|
|
48 if to_protein:
|
|
49 yield SeqRecord(gene_seq.translate(), gene_num, "", "")
|
|
50 else:
|
|
51 yield SeqRecord(gene_seq, gene_num, "", "")
|
|
52
|
|
53 def glimmer_predictions(in_handle):
|
|
54 """Parse Glimmer output, generating a exons and strand for each prediction.
|
|
55 """
|
|
56 # read the header
|
|
57 while 1:
|
|
58 line = in_handle.readline()
|
|
59 if line.startswith(" # #"):
|
|
60 break
|
|
61 in_handle.readline()
|
|
62 # read gene predictions one at a time
|
|
63 cur_exons, cur_gene_num, cur_strand = ([], None, None)
|
|
64 while 1:
|
|
65 line = in_handle.readline()
|
|
66 if not line:
|
|
67 break
|
|
68 parts = line.strip().split()
|
|
69 # new exon
|
|
70 if len(parts) == 0:
|
|
71 yield cur_gene_num, cur_exons, cur_strand
|
|
72 cur_exons, cur_gene_num, cur_strand = ([], None, None)
|
|
73 else:
|
|
74 this_gene_num = parts[0]
|
|
75 this_strand = parts[2]
|
|
76 this_start = int(parts[4]) - 1 # 1 based
|
|
77 this_end = int(parts[5])
|
|
78 if cur_gene_num is None:
|
|
79 cur_gene_num = this_gene_num
|
|
80 cur_strand = this_strand
|
|
81 else:
|
|
82 assert cur_gene_num == this_gene_num
|
|
83 assert cur_strand == this_strand
|
|
84 cur_exons.append((this_start, this_end))
|
|
85 if len(cur_exons) > 0:
|
|
86 yield cur_gene_num, cur_exons, cur_strand
|
|
87
|
|
88 if __name__ == "__main__":
|
|
89 if len(sys.argv) != 5:
|
|
90 print __doc__
|
|
91 sys.exit()
|
|
92 main(*sys.argv[1:])
|