5
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 Convert data from Genbank format to GFF.
|
|
4
|
|
5 Usage:
|
|
6 python gbk_to_gff.py in.gbk > out.gff
|
|
7
|
|
8 Requirements:
|
|
9 BioPython:- http://biopython.org/
|
|
10 helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py
|
|
11
|
|
12 Copyright (C)
|
|
13 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
|
|
14 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
|
|
15 """
|
|
16
|
|
17 import os
|
|
18 import re
|
|
19 import sys
|
|
20 import collections
|
|
21 from Bio import SeqIO
|
|
22 import helper
|
|
23
|
|
24 def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk):
|
|
25 """
|
|
26 Write the feature information
|
|
27 """
|
|
28
|
|
29 for gname, ginfo in genes.items():
|
|
30 line = [str(chr_id),
|
|
31 'gbk_to_gff',
|
|
32 ginfo[3],
|
|
33 str(ginfo[0]),
|
|
34 str(ginfo[1]),
|
|
35 '.',
|
|
36 ginfo[2],
|
|
37 '.',
|
|
38 'ID=%s;Name=%s' % (str(gname), str(gname))]
|
|
39 print '\t'.join(line)
|
|
40 ## construct the transcript line is not defined in the original file
|
|
41 t_line = [str(chr_id), 'gbk_to_gff', source, 0, 1, '.', ginfo[2], '.']
|
|
42
|
|
43 if not transcripts:
|
|
44 t_line.append('ID=Transcript:%s;Parent=%s' % (str(gname), str(gname)))
|
|
45
|
|
46 if exons: ## get the entire transcript region from the defined feature
|
|
47 t_line[3] = str(exons[gname][0][0])
|
|
48 t_line[4] = str(exons[gname][0][-1])
|
|
49 elif cds:
|
|
50 t_line[3] = str(cds[gname][0][0])
|
|
51 t_line[4] = str(cds[gname][0][-1])
|
|
52 print '\t'.join(t_line)
|
|
53
|
|
54 if exons:
|
|
55 exon_line_print(t_line, exons[gname], 'Transcript:'+str(gname), 'exon')
|
|
56
|
|
57 if cds:
|
|
58 exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'CDS')
|
|
59 if not exons:
|
|
60 exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'exon')
|
|
61
|
|
62 else: ## transcript is defined
|
|
63 for idx in transcripts[gname]:
|
|
64 t_line[2] = idx[3]
|
|
65 t_line[3] = str(idx[0])
|
|
66 t_line[4] = str(idx[1])
|
|
67 t_line.append('ID='+str(idx[2])+';Parent='+str(gname))
|
|
68 print '\t'.join(t_line)
|
|
69
|
|
70 ## feature line print call
|
|
71 if exons:
|
|
72 exon_line_print(t_line, exons[gname], str(idx[2]), 'exon')
|
|
73 if cds:
|
|
74 exon_line_print(t_line, cds[gname], str(idx[2]), 'CDS')
|
|
75 if not exons:
|
|
76 exon_line_print(t_line, cds[gname], str(idx[2]), 'exon')
|
|
77
|
|
78 if len(genes) == 0: ## feature entry with fragment information
|
|
79
|
|
80 line = [str(chr_id), 'gbk_to_gff', source, 0, 1, '.', orient, '.']
|
|
81 fStart = fStop = None
|
|
82
|
|
83 for eid, ex in cds.items():
|
|
84 fStart = ex[0][0]
|
|
85 fStop = ex[0][-1]
|
|
86
|
|
87 for eid, ex in exons.items():
|
|
88 fStart = ex[0][0]
|
|
89 fStop = ex[0][-1]
|
|
90
|
|
91 if fStart or fStart:
|
|
92
|
|
93 line[2] = 'gene'
|
|
94 line[3] = str(fStart)
|
|
95 line[4] = str(fStop)
|
|
96 line.append('ID=Unknown_Gene_' + str(unk) + ';Name=Unknown_Gene_' + str(unk))
|
|
97 print "\t".join(line)
|
|
98
|
|
99 if not cds:
|
|
100 line[2] = 'transcript'
|
|
101 else:
|
|
102 line[2] = 'mRNA'
|
|
103
|
|
104 line[8] = 'ID=Unknown_Transcript_' + str(unk) + ';Parent=Unknown_Gene_' + str(unk)
|
|
105 print "\t".join(line)
|
|
106
|
|
107 if exons:
|
|
108 exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon')
|
|
109
|
|
110 if cds:
|
|
111 exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'CDS')
|
|
112 if not exons:
|
|
113 exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon')
|
|
114
|
|
115 unk +=1
|
|
116
|
|
117 return unk
|
|
118
|
|
119 def exon_line_print(temp_line, trx_exons, parent, ftype):
|
|
120 """
|
|
121 Print the EXON feature line
|
|
122 """
|
|
123
|
|
124 for ex in trx_exons:
|
|
125 temp_line[2] = ftype
|
|
126 temp_line[3] = str(ex[0])
|
|
127 temp_line[4] = str(ex[1])
|
|
128 temp_line[8] = 'Parent=%s' % parent
|
|
129 print '\t'.join(temp_line)
|
|
130
|
|
131 def gbk_parse(fname):
|
|
132 """
|
|
133 Extract genome annotation recods from genbank format
|
|
134
|
|
135 @args fname: gbk file name
|
|
136 @type fname: str
|
|
137 """
|
|
138
|
|
139 fhand = helper.open_file(gbkfname)
|
|
140 unk = 1
|
|
141
|
|
142 for record in SeqIO.parse(fhand, "genbank"):
|
|
143
|
|
144 gene_tags = dict()
|
|
145 tx_tags = collections.defaultdict(list)
|
|
146 exon = collections.defaultdict(list)
|
|
147 cds = collections.defaultdict(list)
|
|
148 mol_type, chr_id = None, None
|
|
149
|
|
150 for rec in record.features:
|
|
151
|
|
152 if rec.type == 'source':
|
|
153 try:
|
|
154 mol_type = rec.qualifiers['mol_type'][0]
|
|
155 except:
|
|
156 mol_type = '.'
|
|
157 pass
|
|
158 try:
|
|
159 chr_id = rec.qualifiers['chromosome'][0]
|
|
160 except:
|
|
161 chr_id = record.name
|
|
162 continue
|
|
163
|
|
164 strand='-'
|
|
165 strand='+' if rec.strand>0 else strand
|
|
166
|
|
167 fid = None
|
|
168 try:
|
|
169 fid = rec.qualifiers['gene'][0]
|
|
170 except:
|
|
171 pass
|
|
172
|
|
173 transcript_id = None
|
|
174 try:
|
|
175 transcript_id = rec.qualifiers['transcript_id'][0]
|
|
176 except:
|
|
177 pass
|
|
178
|
|
179 if re.search(r'gene', rec.type):
|
|
180 gene_tags[fid] = (rec.location._start.position+1,
|
|
181 rec.location._end.position,
|
|
182 strand,
|
|
183 rec.type
|
|
184 )
|
|
185 elif rec.type == 'exon':
|
|
186 exon[fid].append((rec.location._start.position+1,
|
|
187 rec.location._end.position))
|
|
188 elif rec.type=='CDS':
|
|
189 cds[fid].append((rec.location._start.position+1,
|
|
190 rec.location._end.position))
|
|
191 else:
|
|
192 # get all transcripts
|
|
193 if transcript_id:
|
|
194 tx_tags[fid].append((rec.location._start.position+1,
|
|
195 rec.location._end.position,
|
|
196 transcript_id,
|
|
197 rec.type))
|
|
198 # record extracted, generate feature table
|
|
199 unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk)
|
|
200
|
|
201 fhand.close()
|
|
202
|
|
203
|
|
204 if __name__=='__main__':
|
|
205
|
|
206 try:
|
|
207 gbkfname = sys.argv[1]
|
|
208 except:
|
|
209 print __doc__
|
|
210 sys.exit(-1)
|
|
211
|
|
212 ## extract gbk records
|
|
213 gbk_parse(gbkfname)
|