Mercurial > repos > vipints > fml_gff3togtf
comparison gbk_to_gff.py @ 10:c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
author | vipints <vipin@cbio.mskcc.org> |
---|---|
date | Thu, 23 Apr 2015 18:01:45 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
9:7d67331368f3 | 10:c42c69aa81f8 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 Convert data from Genbank format to GFF. | |
4 | |
5 Usage: | |
6 python gbk_to_gff.py in.gbk > out.gff | |
7 | |
8 Requirements: | |
9 BioPython:- http://biopython.org/ | |
10 helper.py:- https://github.com/vipints/GFFtools-GX/blob/master/helper.py | |
11 | |
12 Copyright (C) | |
13 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. | |
14 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA. | |
15 """ | |
16 | |
17 import os | |
18 import re | |
19 import sys | |
20 import helper | |
21 import collections | |
22 from Bio import SeqIO | |
23 | |
24 def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk): | |
25 """ | |
26 Write the feature information | |
27 """ | |
28 for gname, ginfo in genes.items(): | |
29 line = [str(chr_id), | |
30 'gbk2gff', | |
31 ginfo[3], | |
32 str(ginfo[0]), | |
33 str(ginfo[1]), | |
34 '.', | |
35 ginfo[2], | |
36 '.', | |
37 'ID=%s;Name=%s' % (str(gname), str(gname))] | |
38 sys.stdout.write('\t'.join(line)+"\n") | |
39 ## construct the transcript line is not defined in the original file | |
40 t_line = [str(chr_id), 'gbk2gff', source, 0, 1, '.', ginfo[2], '.'] | |
41 | |
42 if not transcripts: | |
43 t_line.append('ID=Transcript:%s;Parent=%s' % (str(gname), str(gname))) | |
44 | |
45 if exons: ## get the entire transcript region from the defined feature | |
46 t_line[3] = str(exons[gname][0][0]) | |
47 t_line[4] = str(exons[gname][0][-1]) | |
48 elif cds: | |
49 t_line[3] = str(cds[gname][0][0]) | |
50 t_line[4] = str(cds[gname][0][-1]) | |
51 | |
52 if not cds: | |
53 t_line[2] = 'transcript' | |
54 else: | |
55 t_line[2] = 'mRNA' | |
56 sys.stdout.write('\t'.join(t_line)+"\n") | |
57 | |
58 if exons: | |
59 exon_line_print(t_line, exons[gname], 'Transcript:'+str(gname), 'exon') | |
60 | |
61 if cds: | |
62 exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'CDS') | |
63 if not exons: | |
64 exon_line_print(t_line, cds[gname], 'Transcript:'+str(gname), 'exon') | |
65 | |
66 else: ## transcript is defined | |
67 for idx in transcripts[gname]: | |
68 t_line[2] = idx[3] | |
69 t_line[3] = str(idx[0]) | |
70 t_line[4] = str(idx[1]) | |
71 t_line.append('ID='+str(idx[2])+';Parent='+str(gname)) | |
72 sys.stdout.write('\t'.join(t_line)+"\n") | |
73 | |
74 ## feature line print call | |
75 if exons: | |
76 exon_line_print(t_line, exons[gname], str(idx[2]), 'exon') | |
77 if cds: | |
78 exon_line_print(t_line, cds[gname], str(idx[2]), 'CDS') | |
79 if not exons: | |
80 exon_line_print(t_line, cds[gname], str(idx[2]), 'exon') | |
81 | |
82 if len(genes) == 0: ## feature entry with fragment information | |
83 | |
84 line = [str(chr_id), 'gbk2gff', source, 0, 1, '.', orient, '.'] | |
85 fStart = fStop = None | |
86 | |
87 for eid, ex in cds.items(): | |
88 fStart = ex[0][0] | |
89 fStop = ex[0][-1] | |
90 | |
91 for eid, ex in exons.items(): | |
92 fStart = ex[0][0] | |
93 fStop = ex[0][-1] | |
94 | |
95 if fStart or fStart: | |
96 | |
97 line[2] = 'gene' | |
98 line[3] = str(fStart) | |
99 line[4] = str(fStop) | |
100 line.append('ID=Unknown_Gene_' + str(unk) + ';Name=Unknown_Gene_' + str(unk)) | |
101 sys.stdout.write('\t'.join(line)+"\n") | |
102 | |
103 if not cds: | |
104 line[2] = 'transcript' | |
105 else: | |
106 line[2] = 'mRNA' | |
107 | |
108 line[8] = 'ID=Unknown_Transcript_' + str(unk) + ';Parent=Unknown_Gene_' + str(unk) | |
109 sys.stdout.write('\t'.join(line)+"\n") | |
110 | |
111 if exons: | |
112 exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon') | |
113 | |
114 if cds: | |
115 exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'CDS') | |
116 if not exons: | |
117 exon_line_print(line, cds[None], 'Unknown_Transcript_' + str(unk), 'exon') | |
118 | |
119 unk +=1 | |
120 | |
121 return unk | |
122 | |
123 | |
124 def exon_line_print(temp_line, trx_exons, parent, ftype): | |
125 """ | |
126 Print the EXON feature line | |
127 """ | |
128 for ex in trx_exons: | |
129 temp_line[2] = ftype | |
130 temp_line[3] = str(ex[0]) | |
131 temp_line[4] = str(ex[1]) | |
132 temp_line[8] = 'Parent=%s' % parent | |
133 sys.stdout.write('\t'.join(temp_line)+"\n") | |
134 | |
135 | |
136 def gbk_parse(fname): | |
137 """ | |
138 Extract genome annotation recods from genbank format | |
139 | |
140 @args fname: gbk file name | |
141 @type fname: str | |
142 """ | |
143 fhand = helper.open_file(gbkfname) | |
144 unk = 1 | |
145 | |
146 for record in SeqIO.parse(fhand, "genbank"): | |
147 gene_tags = dict() | |
148 tx_tags = collections.defaultdict(list) | |
149 exon = collections.defaultdict(list) | |
150 cds = collections.defaultdict(list) | |
151 mol_type, chr_id = None, None | |
152 | |
153 for rec in record.features: | |
154 | |
155 if rec.type == 'source': | |
156 try: | |
157 mol_type = rec.qualifiers['mol_type'][0] | |
158 except: | |
159 mol_type = '.' | |
160 pass | |
161 try: | |
162 chr_id = rec.qualifiers['chromosome'][0] | |
163 except: | |
164 chr_id = record.name | |
165 continue | |
166 | |
167 strand='-' | |
168 strand='+' if rec.strand>0 else strand | |
169 | |
170 fid = None | |
171 try: | |
172 fid = rec.qualifiers['gene'][0] | |
173 except: | |
174 pass | |
175 | |
176 transcript_id = None | |
177 try: | |
178 transcript_id = rec.qualifiers['transcript_id'][0] | |
179 except: | |
180 pass | |
181 | |
182 if re.search(r'gene', rec.type): | |
183 gene_tags[fid] = (rec.location._start.position+1, | |
184 rec.location._end.position, | |
185 strand, | |
186 rec.type | |
187 ) | |
188 elif rec.type == 'exon': | |
189 exon[fid].append((rec.location._start.position+1, | |
190 rec.location._end.position)) | |
191 elif rec.type=='CDS': | |
192 cds[fid].append((rec.location._start.position+1, | |
193 rec.location._end.position)) | |
194 else: | |
195 # get all transcripts | |
196 if transcript_id: | |
197 tx_tags[fid].append((rec.location._start.position+1, | |
198 rec.location._end.position, | |
199 transcript_id, | |
200 rec.type)) | |
201 # record extracted, generate feature table | |
202 unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk) | |
203 | |
204 fhand.close() | |
205 | |
206 | |
207 if __name__=='__main__': | |
208 | |
209 try: | |
210 gbkfname = sys.argv[1] | |
211 except: | |
212 print __doc__ | |
213 sys.exit(-1) | |
214 | |
215 ## extract gbk records | |
216 gbk_parse(gbkfname) |