0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 Extract genome annotation from a GFF3 (a tab delimited
|
|
4 format for storing sequence features and annotations:
|
|
5 http://www.sequenceontology.org/gff3.shtml) file.
|
|
6
|
|
7 Usage: ParseGFF.py in.gff3 out.mat
|
|
8
|
|
9 Requirements:
|
|
10 Scipy :- http://scipy.org/
|
|
11 Numpy :- http://numpy.org/
|
|
12
|
|
13 Copyright (C) 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany
|
|
14 2012- Memorial Sloan-Kettering Cancer Center, New York City, USA
|
|
15 """
|
|
16
|
|
17 import re, sys
|
|
18 import scipy.io as sio
|
|
19 import numpy as np
|
|
20
|
|
21 def addCDSphase(strand, cds):
|
|
22 """Add CDS phase to the CDS exons
|
|
23 """
|
|
24 cds_region, cds_flag = [], 0
|
|
25 if strand == '+':
|
|
26 for cdspos in cds:
|
|
27 if cds_flag == 0:
|
|
28 cdspos = (cdspos[0], cdspos[1], 0)
|
|
29 diff = (cdspos[1]-(cdspos[0]-1))%3
|
|
30 else:
|
|
31 xy = 0
|
|
32 if diff == 0:
|
|
33 cdspos = (cdspos[0], cdspos[1], 0)
|
|
34 elif diff == 1:
|
|
35 cdspos = (cdspos[0], cdspos[1], 2)
|
|
36 xy = 2
|
|
37 elif diff == 2:
|
|
38 cdspos = (cdspos[0], cdspos[1], 1)
|
|
39 xy = 1
|
|
40 diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3
|
|
41 cds_region.append(cdspos)
|
|
42 cds_flag = 1
|
|
43 elif strand == '-':
|
|
44 cds.reverse()
|
|
45 for cdspos in cds:
|
|
46 if cds_flag == 0:
|
|
47 cdspos = (cdspos[0], cdspos[1], 0)
|
|
48 diff = (cdspos[1]-(cdspos[0]-1))%3
|
|
49 else:
|
|
50 xy = 0
|
|
51 if diff == 0:
|
|
52 cdspos = (cdspos[0], cdspos[1], 0)
|
|
53 elif diff == 1:
|
|
54 cdspos = (cdspos[0], cdspos[1], 2)
|
|
55 xy = 2
|
|
56 elif diff == 2:
|
|
57 cdspos = (cdspos[0], cdspos[1], 1)
|
|
58 xy = 1
|
|
59 diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3
|
|
60 cds_region.append(cdspos)
|
|
61 cds_flag = 1
|
|
62 cds_region.reverse()
|
|
63 return cds_region
|
|
64
|
|
65 def createExon(strand_p, five_p_utr, cds_cod, three_p_utr):
|
|
66 """Create exon cordinates from UTR's and CDS region
|
|
67 """
|
|
68 exon_pos = []
|
|
69 if strand_p == '+':
|
|
70 utr5_start, utr5_end = 0, 0
|
|
71 if five_p_utr != []:
|
|
72 utr5_start, utr5_end = five_p_utr[-1][0], five_p_utr[-1][1]
|
|
73 cds_5start, cds_5end = cds_cod[0][0], cds_cod[0][1]
|
|
74 jun_exon = []
|
|
75 if cds_5start-utr5_end == 0 or cds_5start-utr5_end == 1:
|
|
76 jun_exon = [utr5_start, cds_5end]
|
|
77 if len(cds_cod) == 1:
|
|
78 five_prime_flag = 0
|
|
79 if jun_exon != []:
|
|
80 five_p_utr = five_p_utr[:-1]
|
|
81 five_prime_flag = 1
|
|
82 for utr5 in five_p_utr:
|
|
83 exon_pos.append(utr5)
|
|
84 jun_exon = []
|
|
85 utr3_start, utr3_end = 0, 0
|
|
86 if three_p_utr != []:
|
|
87 utr3_start = three_p_utr[0][0]
|
|
88 utr3_end = three_p_utr[0][1]
|
|
89 if utr3_start-cds_5end == 0 or utr3_start-cds_5end == 1:
|
|
90 jun_exon = [cds_5start, utr3_end]
|
|
91 three_prime_flag = 0
|
|
92 if jun_exon != []:
|
|
93 cds_cod = cds_cod[:-1]
|
|
94 three_p_utr = three_p_utr[1:]
|
|
95 three_prime_flag = 1
|
|
96 if five_prime_flag == 1 and three_prime_flag == 1:
|
|
97 exon_pos.append([utr5_start, utr3_end])
|
|
98 if five_prime_flag == 1 and three_prime_flag == 0:
|
|
99 exon_pos.append([utr5_start, cds_5end])
|
|
100 cds_cod = cds_cod[:-1]
|
|
101 if five_prime_flag == 0 and three_prime_flag == 1:
|
|
102 exon_pos.append([cds_5start, utr3_end])
|
|
103 for cds in cds_cod:
|
|
104 exon_pos.append(cds)
|
|
105 for utr3 in three_p_utr:
|
|
106 exon_pos.append(utr3)
|
|
107 else:
|
|
108 if jun_exon != []:
|
|
109 five_p_utr = five_p_utr[:-1]
|
|
110 cds_cod = cds_cod[1:]
|
|
111 for utr5 in five_p_utr:
|
|
112 exon_pos.append(utr5)
|
|
113 exon_pos.append(jun_exon) if jun_exon != [] else ''
|
|
114 jun_exon = []
|
|
115 utr3_start, utr3_end = 0, 0
|
|
116 if three_p_utr != []:
|
|
117 utr3_start = three_p_utr[0][0]
|
|
118 utr3_end = three_p_utr[0][1]
|
|
119 cds_3start = cds_cod[-1][0]
|
|
120 cds_3end = cds_cod[-1][1]
|
|
121 if utr3_start-cds_3end == 0 or utr3_start-cds_3end == 1:
|
|
122 jun_exon = [cds_3start, utr3_end]
|
|
123 if jun_exon != []:
|
|
124 cds_cod = cds_cod[:-1]
|
|
125 three_p_utr = three_p_utr[1:]
|
|
126 for cds in cds_cod:
|
|
127 exon_pos.append(cds)
|
|
128 exon_pos.append(jun_exon) if jun_exon != [] else ''
|
|
129 for utr3 in three_p_utr:
|
|
130 exon_pos.append(utr3)
|
|
131 elif strand_p == '-':
|
|
132 utr3_start, utr3_end = 0, 0
|
|
133 if three_p_utr != []:
|
|
134 utr3_start = three_p_utr[-1][0]
|
|
135 utr3_end = three_p_utr[-1][1]
|
|
136 cds_3start = cds_cod[0][0]
|
|
137 cds_3end = cds_cod[0][1]
|
|
138 jun_exon = []
|
|
139 if cds_3start-utr3_end == 0 or cds_3start-utr3_end == 1:
|
|
140 jun_exon = [utr3_start, cds_3end]
|
|
141 if len(cds_cod) == 1:
|
|
142 three_prime_flag = 0
|
|
143 if jun_exon != []:
|
|
144 three_p_utr = three_p_utr[:-1]
|
|
145 three_prime_flag = 1
|
|
146 for utr3 in three_p_utr:
|
|
147 exon_pos.append(utr3)
|
|
148 jun_exon = []
|
|
149 (utr5_start, utr5_end) = (0, 0)
|
|
150 if five_p_utr != []:
|
|
151 utr5_start = five_p_utr[0][0]
|
|
152 utr5_end = five_p_utr[0][1]
|
|
153 if utr5_start-cds_3end == 0 or utr5_start-cds_3end == 1:
|
|
154 jun_exon = [cds_3start, utr5_end]
|
|
155 five_prime_flag = 0
|
|
156 if jun_exon != []:
|
|
157 cds_cod = cds_cod[:-1]
|
|
158 five_p_utr = five_p_utr[1:]
|
|
159 five_prime_flag = 1
|
|
160 if three_prime_flag == 1 and five_prime_flag == 1:
|
|
161 exon_pos.append([utr3_start, utr5_end])
|
|
162 if three_prime_flag == 1 and five_prime_flag == 0:
|
|
163 exon_pos.append([utr3_start, cds_3end])
|
|
164 cds_cod = cds_cod[:-1]
|
|
165 if three_prime_flag == 0 and five_prime_flag == 1:
|
|
166 exon_pos.append([cds_3start, utr5_end])
|
|
167 for cds in cds_cod:
|
|
168 exon_pos.append(cds)
|
|
169 for utr5 in five_p_utr:
|
|
170 exon_pos.append(utr5)
|
|
171 else:
|
|
172 if jun_exon != []:
|
|
173 three_p_utr = three_p_utr[:-1]
|
|
174 cds_cod = cds_cod[1:]
|
|
175 for utr3 in three_p_utr:
|
|
176 exon_pos.append(utr3)
|
|
177 if jun_exon != []:
|
|
178 exon_pos.append(jun_exon)
|
|
179 jun_exon = []
|
|
180 (utr5_start, utr5_end) = (0, 0)
|
|
181 if five_p_utr != []:
|
|
182 utr5_start = five_p_utr[0][0]
|
|
183 utr5_end = five_p_utr[0][1]
|
|
184 cds_5start = cds_cod[-1][0]
|
|
185 cds_5end = cds_cod[-1][1]
|
|
186 if utr5_start-cds_5end == 0 or utr5_start-cds_5end == 1:
|
|
187 jun_exon = [cds_5start, utr5_end]
|
|
188 if jun_exon != []:
|
|
189 cds_cod = cds_cod[:-1]
|
|
190 five_p_utr = five_p_utr[1:]
|
|
191 for cds in cds_cod:
|
|
192 exon_pos.append(cds)
|
|
193 if jun_exon != []:
|
|
194 exon_pos.append(jun_exon)
|
|
195 for utr5 in five_p_utr:
|
|
196 exon_pos.append(utr5)
|
|
197 return exon_pos
|
|
198
|
|
199 def init_gene():
|
|
200 """Initializing the gene structure
|
|
201 """
|
|
202 gene_details=dict(
|
|
203 id = '',
|
|
204 anno_id = [],
|
|
205 confgenes_id = [],
|
|
206 name = '',
|
|
207 source = '',
|
|
208 gene_info = {},
|
|
209 alias = '',
|
|
210 name2 = [],
|
|
211 strand = '',
|
|
212 chr = '',
|
|
213 chr_num = [],
|
|
214 paralogs = [],
|
|
215 start = '',
|
|
216 stop = '',
|
|
217 transcripts = [],
|
|
218 transcript_info = [],
|
|
219 transcript_status = [],
|
|
220 transcript_valid = [],
|
|
221 exons = [],
|
|
222 exons_confirmed = [],
|
|
223 cds_exons = [],
|
|
224 utr5_exons = [],
|
|
225 utr3_exons = [],
|
|
226 tis = [],
|
|
227 tis_conf = [],
|
|
228 tis_info = [],
|
|
229 cdsStop = [],
|
|
230 cdsStop_conf = [],
|
|
231 cdsStop_info = [],
|
|
232 tss = [],
|
|
233 tss_info = [],
|
|
234 tss_conf = [],
|
|
235 cleave = [],
|
|
236 cleave_info = [],
|
|
237 cleave_conf = [],
|
|
238 polya = [],
|
|
239 polya_info = [],
|
|
240 polya_conf = [],
|
|
241 is_alt = [],
|
|
242 is_alt_spliced = 0,
|
|
243 is_valid = [],
|
|
244 transcript_complete = [],
|
|
245 is_complete = [],
|
|
246 is_correctly_gff3_referenced = '',
|
|
247 splicegraph = []
|
|
248 )
|
|
249 return gene_details
|
|
250
|
|
251 def FeatureValueFormat(singlegene):
|
|
252 """Make feature value compactable to write in a .mat format
|
|
253 """
|
|
254 comp_exon = np.zeros((len(singlegene['exons']),), dtype=np.object)
|
|
255 for i in range(len(singlegene['exons'])):
|
|
256 comp_exon[i]= np.array(singlegene['exons'][i])
|
|
257 singlegene['exons'] = comp_exon
|
|
258 comp_cds = np.zeros((len(singlegene['cds_exons']),), dtype=np.object)
|
|
259 for i in range(len(singlegene['cds_exons'])):
|
|
260 comp_cds[i]= np.array(singlegene['cds_exons'][i])
|
|
261 singlegene['cds_exons'] = comp_cds
|
|
262 comp_utr3 = np.zeros((len(singlegene['utr3_exons']),), dtype=np.object)
|
|
263 for i in range(len(singlegene['utr3_exons'])):
|
|
264 comp_utr3[i]= np.array(singlegene['utr3_exons'][i])
|
|
265 singlegene['utr3_exons'] = comp_utr3
|
|
266 comp_utr5 = np.zeros((len(singlegene['utr5_exons']),), dtype=np.object)
|
|
267 for i in range(len(singlegene['utr5_exons'])):
|
|
268 comp_utr5[i]= np.array(singlegene['utr5_exons'][i])
|
|
269 singlegene['utr5_exons'] = comp_utr5
|
|
270 comp_transcripts = np.zeros((len(singlegene['transcripts']),), dtype=np.object)
|
|
271 for i in range(len(singlegene['transcripts'])):
|
|
272 comp_transcripts[i]= np.array(singlegene['transcripts'][i])
|
|
273 singlegene['transcripts'] = comp_transcripts
|
|
274 comp_tss = np.zeros((len(singlegene['tss']),), dtype=np.object)
|
|
275 for i in range(len(singlegene['tss'])):
|
|
276 comp_tss[i]= np.array(singlegene['tss'][i])
|
|
277 singlegene['tss'] = comp_tss
|
|
278 comp_tis = np.zeros((len(singlegene['tis']),), dtype=np.object)
|
|
279 for i in range(len(singlegene['tis'])):
|
|
280 comp_tis[i]= np.array(singlegene['tis'][i])
|
|
281 singlegene['tis'] = comp_tis
|
|
282 comp_cleave = np.zeros((len(singlegene['cleave']),), dtype=np.object)
|
|
283 for i in range(len(singlegene['cleave'])):
|
|
284 comp_cleave[i]= np.array(singlegene['cleave'][i])
|
|
285 singlegene['cleave'] = comp_cleave
|
|
286 comp_cdsStop = np.zeros((len(singlegene['cdsStop']),), dtype=np.object)
|
|
287 for i in range(len(singlegene['cdsStop'])):
|
|
288 comp_cdsStop[i]= np.array(singlegene['cdsStop'][i])
|
|
289 singlegene['cdsStop'] = comp_cdsStop
|
|
290
|
|
291 return singlegene
|
|
292
|
|
293 def CreateGeneModels(genes_cmpt, transcripts_cmpt, exons_cmpt, utr3_cmpt, utr5_cmpt, cds_cmpt):
|
|
294 """Creating Coding/Non-coding gene models from parsed GFF objects.
|
|
295 """
|
|
296 gene_counter, gene_models = 1, []
|
|
297 for gene_entry in genes_cmpt: ## Figure out the genes and transcripts associated feature
|
|
298 if gene_entry in transcripts_cmpt:
|
|
299 gene=init_gene()
|
|
300 gene['id']=gene_counter
|
|
301 gene['name']=gene_entry[1]
|
|
302 gene['chr']=genes_cmpt[gene_entry]['chr']
|
|
303 gene['source']=genes_cmpt[gene_entry]['source']
|
|
304 gene['start']=genes_cmpt[gene_entry]['start']
|
|
305 gene['stop']=genes_cmpt[gene_entry]['stop']
|
|
306 gene['strand']=genes_cmpt[gene_entry]['strand']
|
|
307 if not gene['strand'] in ['+', '-']:
|
|
308 gene['strand']='.' # Strand info not known replaced with a dot symbol instead of None, ?, . etc.
|
|
309 if len(transcripts_cmpt[gene_entry])>1:
|
|
310 gene['is_alt_spliced'] = 1
|
|
311 gene['is_alt'] = 1
|
|
312 gtype=[]
|
|
313 for tids in transcripts_cmpt[gene_entry]: ## transcript section related tags
|
|
314 gene['transcripts'].append(tids['ID'])
|
|
315 gtype.append(tids['type'])
|
|
316 exon_cod, utr5_cod, utr3_cod, cds_cod = [], [], [], []
|
|
317 if (gene['chr'], tids['ID']) in exons_cmpt:
|
|
318 exon_cod = [[feat_exon['start'], feat_exon['stop']] for feat_exon in exons_cmpt[(gene['chr'], tids['ID'])]]
|
|
319 if (gene['chr'], tids['ID']) in utr5_cmpt:
|
|
320 utr5_cod = [[feat_utr5['start'], feat_utr5['stop']] for feat_utr5 in utr5_cmpt[(gene['chr'], tids['ID'])]]
|
|
321 if (gene['chr'], tids['ID']) in utr3_cmpt:
|
|
322 utr3_cod = [[feat_utr3['start'], feat_utr3['stop']] for feat_utr3 in utr3_cmpt[(gene['chr'], tids['ID'])]]
|
|
323 if (gene['chr'], tids['ID']) in cds_cmpt:
|
|
324 cds_cod = [[feat_cds['start'], feat_cds['stop']] for feat_cds in cds_cmpt[(gene['chr'], tids['ID'])]]
|
|
325 if len(exon_cod) == 0: ## build exon coordinates from UTR3, UTR5 and CDS
|
|
326 if cds_cod != []:
|
|
327 exon_cod=createExon(gene['strand'], utr5_cod, cds_cod, utr3_cod)
|
|
328
|
|
329 if gene['strand']=='-': ## general order to coordinates
|
|
330 if len(exon_cod) >1:
|
|
331 if exon_cod[0][0] > exon_cod[-1][0]:
|
|
332 exon_cod.reverse()
|
|
333 if len(cds_cod) >1:
|
|
334 if cds_cod[0][0] > cds_cod[-1][0]:
|
|
335 cds_cod.reverse()
|
|
336 if len(utr3_cod) >1:
|
|
337 if utr3_cod[0][0] > utr3_cod[-1][0]:
|
|
338 utr3_cod.reverse()
|
|
339 if len(utr5_cod) >1:
|
|
340 if utr5_cod[0][0] > utr5_cod[-1][0]:
|
|
341 utr5_cod.reverse()
|
|
342
|
|
343 tis, cdsStop, tss, cleave = [], [], [], [] ## speacial sited in the gene region
|
|
344 if cds_cod != []:
|
|
345 if gene['strand'] == '+':
|
|
346 tis = [cds_cod[0][0]]
|
|
347 cdsStop = [cds_cod[-1][1]-3]
|
|
348 elif gene['strand'] == '-':
|
|
349 tis = [cds_cod[-1][1]]
|
|
350 cdsStop = [cds_cod[0][0]+3]
|
|
351 if utr5_cod != []:
|
|
352 if gene['strand'] == '+':
|
|
353 tss = [utr5_cod[0][0]]
|
|
354 elif gene['strand'] == '-':
|
|
355 tss = [utr5_cod[-1][1]]
|
|
356 if utr3_cod != []:
|
|
357 if gene['strand'] == '+':
|
|
358 cleave = [utr3_cod[-1][1]]
|
|
359 elif gene['strand'] == '-':
|
|
360 cleave = [utr3_cod[0][0]]
|
|
361
|
|
362 cds_status, exon_status, utr_status = 0, 0, 0 ## status of the complete elements of the gene
|
|
363 if cds_cod != []: ## adding phase to the CDS region
|
|
364 cds_cod_phase = addCDSphase(gene['strand'], cds_cod)
|
|
365 cds_status = 1
|
|
366 gene['cds_exons'].append(cds_cod_phase)
|
|
367
|
|
368 if exon_cod != []:
|
|
369 exon_status = 1
|
|
370 if utr5_cod != [] or utr3_cod != []:
|
|
371 utr_status = 1
|
|
372 if cds_status != 0 and exon_status != 0 and utr_status != 0:
|
|
373 gene['transcript_status'].append(1)
|
|
374 else:
|
|
375 gene['transcript_status'].append(0)
|
|
376
|
|
377 if exon_cod: ## final check point for a valid gene model
|
|
378 gene['exons'].append(exon_cod)
|
|
379 gene['utr3_exons'].append(utr3_cod)
|
|
380 gene['utr5_exons'].append(utr5_cod)
|
|
381 gene['tis'].append(tis)
|
|
382 gene['cdsStop'].append(cdsStop)
|
|
383 gene['tss'].append(tss)
|
|
384 gene['cleave'].append(cleave)
|
|
385
|
|
386 gtype=list(set(gtype)) ## different types
|
|
387 gene['gene_info']=dict(ID=gene_entry[1],
|
|
388 Source=genes_cmpt[gene_entry]['source'],
|
|
389 Type=gtype)
|
|
390 gene=FeatureValueFormat(gene) ## get prepare for MAT writing
|
|
391 gene_counter+=1
|
|
392 gene_models.append(gene)
|
|
393 return gene_models
|
|
394
|
|
395 def GFFParse(gff_file):
|
|
396 """Parsing GFF file based on feature relationship.
|
|
397 """
|
|
398 genes, utr5, exons=dict(), dict(), dict()
|
|
399 transcripts, utr3, cds=dict(), dict(), dict()
|
|
400 # TODO Include growing key words of different non-coding/coding transcripts
|
|
401 features=['mrna', 'transcript', 'ncRNA', 'mirna', 'pseudogenic_transcript', 'rrna', 'snorna', 'snrna', 'trna', 'scrna']
|
|
402 gff_handle=open(gff_file, "rU")
|
|
403 for gff_line in gff_handle:
|
|
404 gff_line=gff_line.strip('\n\r').split('\t')
|
|
405 if re.match(r'#|>', gff_line[0]): # skip commented line or fasta identifier line
|
|
406 continue
|
|
407 if len(gff_line)==1: # skip fasta sequence/empty line if present
|
|
408 continue
|
|
409 assert len(gff_line)==9, '\t'.join(gff_line) # not found 9 tab-delimited fields in this line
|
|
410 if '' in gff_line: # skip this line if there any field with an empty value
|
|
411 print 'Skipping..', '\t'.join(gff_line)
|
|
412 continue
|
|
413 if gff_line[-1][-1]==';': # trim the last ';' character
|
|
414 gff_line[-1]=gff_line[-1].strip(';')
|
|
415 if gff_line[2].lower() in ['gene', 'pseudogene']:
|
|
416 gid, gene_info=None, dict()
|
|
417 gene_info['start']=int(gff_line[3])
|
|
418 gene_info['stop']=int(gff_line[4])
|
|
419 gene_info['chr']=gff_line[0]
|
|
420 gene_info['source']=gff_line[1]
|
|
421 gene_info['strand']=gff_line[6]
|
|
422 for attb in gff_line[-1].split(';'):
|
|
423 attb=attb.split('=') # gff attributes are separated by key=value pair
|
|
424 if attb[0]=='ID':
|
|
425 gid=attb[1]
|
|
426 break
|
|
427 genes[(gff_line[0], gid)]=gene_info # store gene information based on the chromosome and gene symbol.
|
|
428 elif gff_line[2].lower() in features:
|
|
429 gid, mrna_info=None, dict()
|
|
430 mrna_info['start']=int(gff_line[3])
|
|
431 mrna_info['stop']=int(gff_line[4])
|
|
432 mrna_info['chr']=gff_line[0]
|
|
433 mrna_info['strand']=gff_line[6]
|
|
434 mrna_info['type'] = gff_line[2]
|
|
435 for attb in gff_line[-1].split(';'):
|
|
436 attb=attb.split('=')
|
|
437 if attb[0]=='Parent':
|
|
438 gid=attb[1]
|
|
439 elif attb[0]=='ID':
|
|
440 mrna_info[attb[0]]=attb[1]
|
|
441 for fid in gid.split(','): # child may be mapped to multiple parents ex: Parent=AT01,AT01-1-Protein
|
|
442 if (gff_line[0], fid) in transcripts:
|
|
443 transcripts[(gff_line[0], fid)].append(mrna_info)
|
|
444 else:
|
|
445 transcripts[(gff_line[0], fid)]=[mrna_info]
|
|
446 elif gff_line[2].lower() in ['exon']:
|
|
447 tids, exon_info=None, dict()
|
|
448 exon_info['start']=int(gff_line[3])
|
|
449 exon_info['stop']=int(gff_line[4])
|
|
450 exon_info['chr']=gff_line[0]
|
|
451 exon_info['strand']=gff_line[6]
|
|
452 for attb in gff_line[-1].split(';'):
|
|
453 attb=attb.split('=')
|
|
454 if attb[0]=='Parent':
|
|
455 tids=attb[1]
|
|
456 break
|
|
457 for tid in tids.split(','):
|
|
458 if (gff_line[0], tid) in exons:
|
|
459 exons[(gff_line[0], tid)].append(exon_info)
|
|
460 else:
|
|
461 exons[(gff_line[0], tid)]=[exon_info]
|
|
462 elif gff_line[2].lower() in ['five_prime_utr']:
|
|
463 utr5_info, tids=dict(), None
|
|
464 utr5_info['start']=int(gff_line[3])
|
|
465 utr5_info['stop']=int(gff_line[4])
|
|
466 utr5_info['chr']=gff_line[0]
|
|
467 utr5_info['strand']=gff_line[6]
|
|
468 for attb in gff_line[-1].split(';'):
|
|
469 attb=attb.split('=')
|
|
470 if attb[0]=='Parent':
|
|
471 tids=attb[1]
|
|
472 break
|
|
473 for tid in tids.split(','):
|
|
474 if (gff_line[0], tid) in utr5:
|
|
475 utr5[(gff_line[0], tid)].append(utr5_info)
|
|
476 else:
|
|
477 utr5[(gff_line[0], tid)]=[utr5_info]
|
|
478 elif gff_line[2].lower() in ['cds']:
|
|
479 cds_info, tids=dict(), None
|
|
480 cds_info['start']=int(gff_line[3])
|
|
481 cds_info['stop']=int(gff_line[4])
|
|
482 cds_info['chr']=gff_line[0]
|
|
483 cds_info['strand']=gff_line[6]
|
|
484 for attb in gff_line[-1].split(';'):
|
|
485 attb=attb.split('=')
|
|
486 if attb[0]=='Parent':
|
|
487 tids=attb[1]
|
|
488 break
|
|
489 for tid in tids.split(','):
|
|
490 if (gff_line[0], tid) in cds:
|
|
491 cds[(gff_line[0], tid)].append(cds_info)
|
|
492 else:
|
|
493 cds[(gff_line[0], tid)]=[cds_info]
|
|
494 elif gff_line[2].lower() in ['three_prime_utr']:
|
|
495 utr3_info, tids=dict(), None
|
|
496 utr3_info['start']=int(gff_line[3])
|
|
497 utr3_info['stop']=int(gff_line[4])
|
|
498 utr3_info['chr']=gff_line[0]
|
|
499 utr3_info['strand']=gff_line[6]
|
|
500 for attb in gff_line[-1].split(';'):
|
|
501 attb=attb.split('=')
|
|
502 if attb[0]=='Parent':
|
|
503 tids=attb[1]
|
|
504 break
|
|
505 for tid in tids.split(','):
|
|
506 if (gff_line[0], tid) in utr3:
|
|
507 utr3[(gff_line[0], tid)].append(utr3_info)
|
|
508 else:
|
|
509 utr3[(gff_line[0], tid)]=[utr3_info]
|
|
510 gff_handle.close()
|
|
511 return genes, transcripts, exons, utr3, utr5, cds
|
|
512
|
|
513 def __main__():
|
|
514 """extract genome feature information
|
|
515 """
|
|
516 try:
|
|
517 gff_file = sys.argv[1]
|
|
518 mat_file = sys.argv[2]
|
|
519 except:
|
|
520 print __doc__
|
|
521 sys.exit(-1)
|
|
522
|
|
523 genes, transcripts, exons, utr3, utr5, cds = GFFParse(gff_file)
|
|
524 gene_models = CreateGeneModels(genes, transcripts, exons, utr3, utr5, cds)
|
|
525 # TODO Write to matlab/octave struct instead of cell arrays.
|
|
526 sio.savemat(mat_file,
|
|
527 mdict=dict(genes=gene_models),
|
|
528 format='5',
|
|
529 oned_as='row')
|
|
530
|
|
531 if __name__=='__main__':
|
|
532 __main__()
|