annotate gbk_to_gff.py @ 7:ee541c1852da

Uploaded version 2.1.0
author vipints
date Thu, 23 Apr 2015 17:43:11 -0400
parents 6e589f267c14
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
6e589f267c14 Uploaded
devteam
parents:
diff changeset
1 #!/usr/bin/env python
6e589f267c14 Uploaded
devteam
parents:
diff changeset
2 """
6e589f267c14 Uploaded
devteam
parents:
diff changeset
3 Convert data from Genbank format to GFF.
6e589f267c14 Uploaded
devteam
parents:
diff changeset
4
6e589f267c14 Uploaded
devteam
parents:
diff changeset
5 Usage:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
6 python gbk_to_gff.py in.gbk > out.gff
6e589f267c14 Uploaded
devteam
parents:
diff changeset
7
6e589f267c14 Uploaded
devteam
parents:
diff changeset
8 Requirements:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
9 BioPython:- http://biopython.org/
6e589f267c14 Uploaded
devteam
parents:
diff changeset
10 helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py
6e589f267c14 Uploaded
devteam
parents:
diff changeset
11
6e589f267c14 Uploaded
devteam
parents:
diff changeset
12 Copyright (C)
6e589f267c14 Uploaded
devteam
parents:
diff changeset
13 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
6e589f267c14 Uploaded
devteam
parents:
diff changeset
14 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
6e589f267c14 Uploaded
devteam
parents:
diff changeset
15 """
6e589f267c14 Uploaded
devteam
parents:
diff changeset
16
6e589f267c14 Uploaded
devteam
parents:
diff changeset
17 import os
6e589f267c14 Uploaded
devteam
parents:
diff changeset
18 import re
6e589f267c14 Uploaded
devteam
parents:
diff changeset
19 import sys
6e589f267c14 Uploaded
devteam
parents:
diff changeset
20 import collections
6e589f267c14 Uploaded
devteam
parents:
diff changeset
21 from Bio import SeqIO
6e589f267c14 Uploaded
devteam
parents:
diff changeset
22 import helper
6e589f267c14 Uploaded
devteam
parents:
diff changeset
23
6e589f267c14 Uploaded
devteam
parents:
diff changeset
24 def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk):
6e589f267c14 Uploaded
devteam
parents:
diff changeset
25 """
6e589f267c14 Uploaded
devteam
parents:
diff changeset
26 Write the feature information
6e589f267c14 Uploaded
devteam
parents:
diff changeset
27 """
6e589f267c14 Uploaded
devteam
parents:
diff changeset
28
6e589f267c14 Uploaded
devteam
parents:
diff changeset
29 for gname, ginfo in genes.items():
6e589f267c14 Uploaded
devteam
parents:
diff changeset
30 line = [str(chr_id),
6e589f267c14 Uploaded
devteam
parents:
diff changeset
31 'gbk_to_gff',
6e589f267c14 Uploaded
devteam
parents:
diff changeset
32 ginfo[3],
6e589f267c14 Uploaded
devteam
parents:
diff changeset
33 str(ginfo[0]),
6e589f267c14 Uploaded
devteam
parents:
diff changeset
34 str(ginfo[1]),
6e589f267c14 Uploaded
devteam
parents:
diff changeset
35 '.',
6e589f267c14 Uploaded
devteam
parents:
diff changeset
36 ginfo[2],
6e589f267c14 Uploaded
devteam
parents:
diff changeset
37 '.',
6e589f267c14 Uploaded
devteam
parents:
diff changeset
38 'ID=%s;Name=%s' % (str(gname), str(gname))]
6e589f267c14 Uploaded
devteam
parents:
diff changeset
39 print '\t'.join(line)
6e589f267c14 Uploaded
devteam
parents:
diff changeset
40 ## construct the transcript line is not defined in the original file
6e589f267c14 Uploaded
devteam
parents:
diff changeset
41 t_line = [str(chr_id), 'gbk_to_gff', source, 0, 1, '.', ginfo[2], '.']
6e589f267c14 Uploaded
devteam
parents:
diff changeset
42
6e589f267c14 Uploaded
devteam
parents:
diff changeset
43 if not transcripts:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
44 t_line.append('ID=Transcript:%s;Parent=%s' % (str(gname), str(gname)))
6e589f267c14 Uploaded
devteam
parents:
diff changeset
45
6e589f267c14 Uploaded
devteam
parents:
diff changeset
46 if exons: ## get the entire transcript region from the defined feature
6e589f267c14 Uploaded
devteam
parents:
diff changeset
47 t_line[3] = str(exons[gname][0][0])
6e589f267c14 Uploaded
devteam
parents:
diff changeset
48 t_line[4] = str(exons[gname][0][-1])
6e589f267c14 Uploaded
devteam
parents:
diff changeset
49 elif cds:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
50 t_line[3] = str(cds[gname][0][0])
6e589f267c14 Uploaded
devteam
parents:
diff changeset
51 t_line[4] = str(cds[gname][0][-1])
6e589f267c14 Uploaded
devteam
parents:
diff changeset
52 print '\t'.join(t_line)
6e589f267c14 Uploaded
devteam
parents:
diff changeset
53
6e589f267c14 Uploaded
devteam
parents:
diff changeset
54 if exons:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
55 exon_line_print(t_line, exons[gname], 'Transcript:'+str(gname), 'exon')
6e589f267c14 Uploaded
devteam
parents:
diff changeset
56
6e589f267c14 Uploaded
devteam
parents:
diff changeset
57 if cds:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
58 exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'CDS')
6e589f267c14 Uploaded
devteam
parents:
diff changeset
59 if not exons:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
60 exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'exon')
6e589f267c14 Uploaded
devteam
parents:
diff changeset
61
6e589f267c14 Uploaded
devteam
parents:
diff changeset
62 else: ## transcript is defined
6e589f267c14 Uploaded
devteam
parents:
diff changeset
63 for idx in transcripts[gname]:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
64 t_line[2] = idx[3]
6e589f267c14 Uploaded
devteam
parents:
diff changeset
65 t_line[3] = str(idx[0])
6e589f267c14 Uploaded
devteam
parents:
diff changeset
66 t_line[4] = str(idx[1])
6e589f267c14 Uploaded
devteam
parents:
diff changeset
67 t_line.append('ID='+str(idx[2])+';Parent='+str(gname))
6e589f267c14 Uploaded
devteam
parents:
diff changeset
68 print '\t'.join(t_line)
6e589f267c14 Uploaded
devteam
parents:
diff changeset
69
6e589f267c14 Uploaded
devteam
parents:
diff changeset
70 ## feature line print call
6e589f267c14 Uploaded
devteam
parents:
diff changeset
71 if exons:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
72 exon_line_print(t_line, exons[gname], str(idx[2]), 'exon')
6e589f267c14 Uploaded
devteam
parents:
diff changeset
73 if cds:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
74 exon_line_print(t_line, cds[gname], str(idx[2]), 'CDS')
6e589f267c14 Uploaded
devteam
parents:
diff changeset
75 if not exons:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
76 exon_line_print(t_line, cds[gname], str(idx[2]), 'exon')
6e589f267c14 Uploaded
devteam
parents:
diff changeset
77
6e589f267c14 Uploaded
devteam
parents:
diff changeset
78 if len(genes) == 0: ## feature entry with fragment information
6e589f267c14 Uploaded
devteam
parents:
diff changeset
79
6e589f267c14 Uploaded
devteam
parents:
diff changeset
80 line = [str(chr_id), 'gbk_to_gff', source, 0, 1, '.', orient, '.']
6e589f267c14 Uploaded
devteam
parents:
diff changeset
81 fStart = fStop = None
6e589f267c14 Uploaded
devteam
parents:
diff changeset
82
6e589f267c14 Uploaded
devteam
parents:
diff changeset
83 for eid, ex in cds.items():
6e589f267c14 Uploaded
devteam
parents:
diff changeset
84 fStart = ex[0][0]
6e589f267c14 Uploaded
devteam
parents:
diff changeset
85 fStop = ex[0][-1]
6e589f267c14 Uploaded
devteam
parents:
diff changeset
86
6e589f267c14 Uploaded
devteam
parents:
diff changeset
87 for eid, ex in exons.items():
6e589f267c14 Uploaded
devteam
parents:
diff changeset
88 fStart = ex[0][0]
6e589f267c14 Uploaded
devteam
parents:
diff changeset
89 fStop = ex[0][-1]
6e589f267c14 Uploaded
devteam
parents:
diff changeset
90
6e589f267c14 Uploaded
devteam
parents:
diff changeset
91 if fStart or fStart:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
92
6e589f267c14 Uploaded
devteam
parents:
diff changeset
93 line[2] = 'gene'
6e589f267c14 Uploaded
devteam
parents:
diff changeset
94 line[3] = str(fStart)
6e589f267c14 Uploaded
devteam
parents:
diff changeset
95 line[4] = str(fStop)
6e589f267c14 Uploaded
devteam
parents:
diff changeset
96 line.append('ID=Unknown_Gene_' + str(unk) + ';Name=Unknown_Gene_' + str(unk))
6e589f267c14 Uploaded
devteam
parents:
diff changeset
97 print "\t".join(line)
6e589f267c14 Uploaded
devteam
parents:
diff changeset
98
6e589f267c14 Uploaded
devteam
parents:
diff changeset
99 if not cds:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
100 line[2] = 'transcript'
6e589f267c14 Uploaded
devteam
parents:
diff changeset
101 else:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
102 line[2] = 'mRNA'
6e589f267c14 Uploaded
devteam
parents:
diff changeset
103
6e589f267c14 Uploaded
devteam
parents:
diff changeset
104 line[8] = 'ID=Unknown_Transcript_' + str(unk) + ';Parent=Unknown_Gene_' + str(unk)
6e589f267c14 Uploaded
devteam
parents:
diff changeset
105 print "\t".join(line)
6e589f267c14 Uploaded
devteam
parents:
diff changeset
106
6e589f267c14 Uploaded
devteam
parents:
diff changeset
107 if exons:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
108 exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon')
6e589f267c14 Uploaded
devteam
parents:
diff changeset
109
6e589f267c14 Uploaded
devteam
parents:
diff changeset
110 if cds:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
111 exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'CDS')
6e589f267c14 Uploaded
devteam
parents:
diff changeset
112 if not exons:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
113 exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon')
6e589f267c14 Uploaded
devteam
parents:
diff changeset
114
6e589f267c14 Uploaded
devteam
parents:
diff changeset
115 unk +=1
6e589f267c14 Uploaded
devteam
parents:
diff changeset
116
6e589f267c14 Uploaded
devteam
parents:
diff changeset
117 return unk
6e589f267c14 Uploaded
devteam
parents:
diff changeset
118
6e589f267c14 Uploaded
devteam
parents:
diff changeset
119 def exon_line_print(temp_line, trx_exons, parent, ftype):
6e589f267c14 Uploaded
devteam
parents:
diff changeset
120 """
6e589f267c14 Uploaded
devteam
parents:
diff changeset
121 Print the EXON feature line
6e589f267c14 Uploaded
devteam
parents:
diff changeset
122 """
6e589f267c14 Uploaded
devteam
parents:
diff changeset
123
6e589f267c14 Uploaded
devteam
parents:
diff changeset
124 for ex in trx_exons:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
125 temp_line[2] = ftype
6e589f267c14 Uploaded
devteam
parents:
diff changeset
126 temp_line[3] = str(ex[0])
6e589f267c14 Uploaded
devteam
parents:
diff changeset
127 temp_line[4] = str(ex[1])
6e589f267c14 Uploaded
devteam
parents:
diff changeset
128 temp_line[8] = 'Parent=%s' % parent
6e589f267c14 Uploaded
devteam
parents:
diff changeset
129 print '\t'.join(temp_line)
6e589f267c14 Uploaded
devteam
parents:
diff changeset
130
6e589f267c14 Uploaded
devteam
parents:
diff changeset
131 def gbk_parse(fname):
6e589f267c14 Uploaded
devteam
parents:
diff changeset
132 """
6e589f267c14 Uploaded
devteam
parents:
diff changeset
133 Extract genome annotation recods from genbank format
6e589f267c14 Uploaded
devteam
parents:
diff changeset
134
6e589f267c14 Uploaded
devteam
parents:
diff changeset
135 @args fname: gbk file name
6e589f267c14 Uploaded
devteam
parents:
diff changeset
136 @type fname: str
6e589f267c14 Uploaded
devteam
parents:
diff changeset
137 """
6e589f267c14 Uploaded
devteam
parents:
diff changeset
138
6e589f267c14 Uploaded
devteam
parents:
diff changeset
139 fhand = helper.open_file(gbkfname)
6e589f267c14 Uploaded
devteam
parents:
diff changeset
140 unk = 1
6e589f267c14 Uploaded
devteam
parents:
diff changeset
141
6e589f267c14 Uploaded
devteam
parents:
diff changeset
142 for record in SeqIO.parse(fhand, "genbank"):
6e589f267c14 Uploaded
devteam
parents:
diff changeset
143
6e589f267c14 Uploaded
devteam
parents:
diff changeset
144 gene_tags = dict()
6e589f267c14 Uploaded
devteam
parents:
diff changeset
145 tx_tags = collections.defaultdict(list)
6e589f267c14 Uploaded
devteam
parents:
diff changeset
146 exon = collections.defaultdict(list)
6e589f267c14 Uploaded
devteam
parents:
diff changeset
147 cds = collections.defaultdict(list)
6e589f267c14 Uploaded
devteam
parents:
diff changeset
148 mol_type, chr_id = None, None
6e589f267c14 Uploaded
devteam
parents:
diff changeset
149
6e589f267c14 Uploaded
devteam
parents:
diff changeset
150 for rec in record.features:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
151
6e589f267c14 Uploaded
devteam
parents:
diff changeset
152 if rec.type == 'source':
6e589f267c14 Uploaded
devteam
parents:
diff changeset
153 try:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
154 mol_type = rec.qualifiers['mol_type'][0]
6e589f267c14 Uploaded
devteam
parents:
diff changeset
155 except:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
156 mol_type = '.'
6e589f267c14 Uploaded
devteam
parents:
diff changeset
157 pass
6e589f267c14 Uploaded
devteam
parents:
diff changeset
158 try:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
159 chr_id = rec.qualifiers['chromosome'][0]
6e589f267c14 Uploaded
devteam
parents:
diff changeset
160 except:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
161 chr_id = record.name
6e589f267c14 Uploaded
devteam
parents:
diff changeset
162 continue
6e589f267c14 Uploaded
devteam
parents:
diff changeset
163
6e589f267c14 Uploaded
devteam
parents:
diff changeset
164 strand='-'
6e589f267c14 Uploaded
devteam
parents:
diff changeset
165 strand='+' if rec.strand>0 else strand
6e589f267c14 Uploaded
devteam
parents:
diff changeset
166
6e589f267c14 Uploaded
devteam
parents:
diff changeset
167 fid = None
6e589f267c14 Uploaded
devteam
parents:
diff changeset
168 try:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
169 fid = rec.qualifiers['gene'][0]
6e589f267c14 Uploaded
devteam
parents:
diff changeset
170 except:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
171 pass
6e589f267c14 Uploaded
devteam
parents:
diff changeset
172
6e589f267c14 Uploaded
devteam
parents:
diff changeset
173 transcript_id = None
6e589f267c14 Uploaded
devteam
parents:
diff changeset
174 try:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
175 transcript_id = rec.qualifiers['transcript_id'][0]
6e589f267c14 Uploaded
devteam
parents:
diff changeset
176 except:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
177 pass
6e589f267c14 Uploaded
devteam
parents:
diff changeset
178
6e589f267c14 Uploaded
devteam
parents:
diff changeset
179 if re.search(r'gene', rec.type):
6e589f267c14 Uploaded
devteam
parents:
diff changeset
180 gene_tags[fid] = (rec.location._start.position+1,
6e589f267c14 Uploaded
devteam
parents:
diff changeset
181 rec.location._end.position,
6e589f267c14 Uploaded
devteam
parents:
diff changeset
182 strand,
6e589f267c14 Uploaded
devteam
parents:
diff changeset
183 rec.type
6e589f267c14 Uploaded
devteam
parents:
diff changeset
184 )
6e589f267c14 Uploaded
devteam
parents:
diff changeset
185 elif rec.type == 'exon':
6e589f267c14 Uploaded
devteam
parents:
diff changeset
186 exon[fid].append((rec.location._start.position+1,
6e589f267c14 Uploaded
devteam
parents:
diff changeset
187 rec.location._end.position))
6e589f267c14 Uploaded
devteam
parents:
diff changeset
188 elif rec.type=='CDS':
6e589f267c14 Uploaded
devteam
parents:
diff changeset
189 cds[fid].append((rec.location._start.position+1,
6e589f267c14 Uploaded
devteam
parents:
diff changeset
190 rec.location._end.position))
6e589f267c14 Uploaded
devteam
parents:
diff changeset
191 else:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
192 # get all transcripts
6e589f267c14 Uploaded
devteam
parents:
diff changeset
193 if transcript_id:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
194 tx_tags[fid].append((rec.location._start.position+1,
6e589f267c14 Uploaded
devteam
parents:
diff changeset
195 rec.location._end.position,
6e589f267c14 Uploaded
devteam
parents:
diff changeset
196 transcript_id,
6e589f267c14 Uploaded
devteam
parents:
diff changeset
197 rec.type))
6e589f267c14 Uploaded
devteam
parents:
diff changeset
198 # record extracted, generate feature table
6e589f267c14 Uploaded
devteam
parents:
diff changeset
199 unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk)
6e589f267c14 Uploaded
devteam
parents:
diff changeset
200
6e589f267c14 Uploaded
devteam
parents:
diff changeset
201 fhand.close()
6e589f267c14 Uploaded
devteam
parents:
diff changeset
202
6e589f267c14 Uploaded
devteam
parents:
diff changeset
203
6e589f267c14 Uploaded
devteam
parents:
diff changeset
204 if __name__=='__main__':
6e589f267c14 Uploaded
devteam
parents:
diff changeset
205
6e589f267c14 Uploaded
devteam
parents:
diff changeset
206 try:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
207 gbkfname = sys.argv[1]
6e589f267c14 Uploaded
devteam
parents:
diff changeset
208 except:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
209 print __doc__
6e589f267c14 Uploaded
devteam
parents:
diff changeset
210 sys.exit(-1)
6e589f267c14 Uploaded
devteam
parents:
diff changeset
211
6e589f267c14 Uploaded
devteam
parents:
diff changeset
212 ## extract gbk records
6e589f267c14 Uploaded
devteam
parents:
diff changeset
213 gbk_parse(gbkfname)