Mercurial repository: vipints / fml_gff3togtf
GFFParser.py @ 10:c42c69aa81f8

fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo

author: vipints <vipin@cbio.mskcc.org>
date:   Thu, 23 Apr 2015 18:01:45 -0400
parents:
children:
comparison
9:7d67331368f3 vs 10:c42c69aa81f8
#!/usr/bin/env python
"""
Extract genome annotation from a GFF (a tab-delimited format for storing sequence features and annotations) file.

Requirements:
    Numpy :- http://numpy.org/

Copyright (C)

2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
2012-2015 Memorial Sloan Kettering Cancer Center, New York City, USA.
"""

import re
import os
import sys
import urllib
import numpy as np
import helper as utils
from collections import defaultdict

def attribute_tags(col9):
    """
    Split the key-value tags from the attribute column; takes column number 9 of a GTF/GFF file.

    @args col9: attribute column from GFF file
    @type col9: str
    """
    info = defaultdict(list)
    is_gff = False

    if not col9:
        return is_gff, info

    # trim the trailing semi-colon; UCSC files may carry some white-space
    col9 = col9.rstrip(';| ')
    # attributes from 9th column
    atbs = col9.split(" ; ")
    if len(atbs) == 1:
        atbs = col9.split("; ")
    if len(atbs) == 1:
        atbs = col9.split(";")
    # check for the GFF3 pattern, which has key=value pairs
    gff3_pat = re.compile("\w+=")
    # sometimes GTF has: gene_id uc002zkg.1;
    gtf_pat = re.compile("\s?\w+\s")

    key_vals = []

    if gff3_pat.match(atbs[0]): # gff3 pattern
        is_gff = True
        key_vals = [at.split('=') for at in atbs]
    elif gtf_pat.match(atbs[0]): # gtf pattern
        for at in atbs:
            key_vals.append(at.strip().split(" ",1))
    else:
        # handle an attribute column that carries only a single value
        key_vals.append(['ID', atbs[0]])
    # get key, val items
    for item in key_vals:
        key, val = item
        # remove the double quotes from the feature identifier
        val = re.sub('"', '', val)
        # convert URL-encoded place holders to plain text
        info[key].extend([urllib.unquote(v) for v in val.split(',') if v])

    return is_gff, info

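# Example (not part of the original module; the attribute strings below are
# hypothetical) of how attribute_tags() handles the two supported styles:
#
#   is_gff, info = attribute_tags('ID=gene1;Name=ABC1')
#   # GFF3 style -> is_gff is True, info['ID'] == ['gene1'], info['Name'] == ['ABC1']
#
#   is_gff, info = attribute_tags('gene_id "uc002zkg.1"; transcript_id "uc002zkg.1";')
#   # GTF style  -> is_gff is False, info['gene_id'] == ['uc002zkg.1'],
#   #               info['transcript_id'] == ['uc002zkg.1']
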
def spec_features_keywd(gff_parts):
    """
    Specify the feature keyword according to the GFF specifications

    @args gff_parts: parsed record fields
    @type gff_parts: dict
    """
    for t_id in ["transcript_id", "transcriptId", "proteinId"]:
        try:
            gff_parts["info"]["Parent"] = gff_parts["info"][t_id]
            break
        except KeyError:
            pass
    for g_id in ["gene_id", "geneid", "geneId", "name", "gene_name", "genename"]:
        try:
            gff_parts["info"]["GParent"] = gff_parts["info"][g_id]
            break
        except KeyError:
            pass
    ## TODO key words
    for flat_name in ["Transcript", "CDS"]:
        if gff_parts["info"].has_key(flat_name):
            # parents
            if gff_parts['type'] in [flat_name] or re.search(r'transcript', gff_parts['type'], re.IGNORECASE):
                if not gff_parts['id']:
                    gff_parts['id'] = gff_parts['info'][flat_name][0]
                    #gff_parts["info"]["ID"] = [gff_parts["id"]]
            # children
            elif gff_parts["type"] in ["intron", "exon", "three_prime_UTR",
                        "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
                        "start_codon"]:
                gff_parts["info"]["Parent"] = gff_parts["info"][flat_name]
            break
    return gff_parts

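# Example (not part of the original module; hypothetical record) of how
# spec_features_keywd() promotes GTF-style identifiers to the Parent/GParent
# keywords used downstream:
#
#   rec_parts = {'type': 'exon', 'id': '',
#                'info': {'transcript_id': ['uc002zkg.1'], 'gene_id': ['uc002zkg.1']}}
#   rec_parts = spec_features_keywd(rec_parts)
#   # rec_parts['info']['Parent']  == ['uc002zkg.1']
#   # rec_parts['info']['GParent'] == ['uc002zkg.1']
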
def Parse(ga_file):
    """
    Parse a GFF/GTF file based on feature relationships; takes the input file name.

    @args ga_file: input file name
    @type ga_file: str
    """
    child_map = defaultdict(list)
    parent_map = dict()

    ga_handle = utils.open_file(ga_file)

    for rec in ga_handle:
        rec = rec.strip('\n\r')

        # skip empty lines, FASTA identifiers and comment lines
        if not rec or rec[0] in ['#', '>']:
            continue
        # skip the genome sequence
        if not re.search('\t', rec):
            continue

        parts = rec.split('\t')
        assert len(parts) >= 8, rec

        # process the attribute column (9th column)
        ftype, tags = attribute_tags(parts[-1])
        if not tags: # skip the line if no attribute column.
            continue

        # extract fields
        if parts[1]:
            tags["source"] = parts[1]
        if parts[7]:
            tags["phase"] = parts[7]

        gff_info = dict()
        gff_info['info'] = dict(tags)
        gff_info["is_gff3"] = ftype
        gff_info['chr'] = parts[0]
        gff_info['score'] = parts[5]

        if parts[3] and parts[4]:
            gff_info['location'] = [int(parts[3]),
                        int(parts[4])]
            gff_info['type'] = parts[2]
            gff_info['id'] = tags.get('ID', [''])[0]
            if parts[6] in ['?', '.']:
                parts[6] = None
            gff_info['strand'] = parts[6]

            # keywords according to the GFF spec;
            # if the is_gff3 flag is false, map the attribute fields to the GFF keywords
            if not ftype:
                gff_info = spec_features_keywd(gff_info)

            # link the feature relationships
            if gff_info['info'].has_key('Parent'):
                for p in gff_info['info']['Parent']:
                    if p == gff_info['id']:
                        gff_info['id'] = ''
                        break
                rec_category = 'child'
            elif gff_info['id']:
                rec_category = 'parent'
            else:
                rec_category = 'record'

            # organize the features depending on the record category
            if rec_category == 'child':
                for p in gff_info['info']['Parent']:
                    # create the data structure based on source and feature id
                    child_map[(gff_info['chr'], gff_info['info']['source'], p)].append(
                                dict( type = gff_info['type'],
                                location = gff_info['location'],
                                strand = gff_info['strand'],
                                score = gff_info['score'],
                                ID = gff_info['id'],
                                gene_id = gff_info['info'].get('GParent', '')
                                ))
            elif rec_category == 'parent':
                parent_map[(gff_info['chr'], gff_info['info']['source'], gff_info['id'])] = dict(
                                type = gff_info['type'],
                                location = gff_info['location'],
                                strand = gff_info['strand'],
                                score = gff_info['score'],
                                name = tags.get('Name', [''])[0])
            elif rec_category == 'record':
                #TODO how to handle plain records?
                c = 1
    ga_handle.close()

    # depending on the file type, create the missing parent features
    if not ftype:
        parent_map, child_map = create_missing_feature_type(parent_map, child_map)

    # connect parent-child relations
    # at this point the parent and child features are available from any type of GTF/GFF2/GFF3 file
    gene_mat = format_gene_models(parent_map, child_map)

    return gene_mat

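# Example (not part of the original module; 'annotation.gff3' is a hypothetical
# file name) of calling Parse() from another script; the fields available on each
# record are defined by helper.init_gene():
#
#   gene_models = Parse('annotation.gff3')
#   for gene in gene_models:
#       print gene['name'], gene['chr'], gene['start'], gene['stop'], gene['strand']
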
def format_gene_models(parent_nf_map, child_nf_map):
    """
    Generate gene models based on the parsed file contents

    @args parent_nf_map: parent features with source and chromosome information
    @type parent_nf_map: collections defaultdict
    @args child_nf_map: transcript and exon information are encoded
    @type child_nf_map: collections defaultdict
    """

    g_cnt = 0
    gene = np.zeros((len(parent_nf_map),), dtype = utils.init_gene())

    for pkey, pdet in parent_nf_map.items():
        # considering only gene features
        #if not re.search(r'gene', pdet.get('type', '')):
        #    continue

        # infer the gene start and stop if not present in the parent feature
        if not pdet.get('location', []):
            GNS, GNE = [], []
            # over the multiple transcripts
            for L1 in child_nf_map[pkey]:
                GNS.append(L1.get('location', [])[0])
                GNE.append(L1.get('location', [])[1])
            GNS.sort()
            GNE.sort()
            pdet['location'] = [GNS[0], GNE[-1]]

        orient = pdet.get('strand', '')
        gene[g_cnt]['id'] = g_cnt + 1
        gene[g_cnt]['chr'] = pkey[0]
        gene[g_cnt]['source'] = pkey[1]
        gene[g_cnt]['name'] = pkey[-1]
        gene[g_cnt]['start'] = pdet.get('location', [])[0]
        gene[g_cnt]['stop'] = pdet.get('location', [])[1]
        gene[g_cnt]['strand'] = orient
        gene[g_cnt]['score'] = pdet.get('score','')

        # default value
        gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 0
        if len(child_nf_map[pkey]) > 1:
            gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 1

        # complete sub-features for all transcripts
        dim = len(child_nf_map[pkey])
        TRS = np.zeros((dim,), dtype=np.object)
        TR_TYP = np.zeros((dim,), dtype=np.object)
        EXON = np.zeros((dim,), dtype=np.object)
        UTR5 = np.zeros((dim,), dtype=np.object)
        UTR3 = np.zeros((dim,), dtype=np.object)
        CDS = np.zeros((dim,), dtype=np.object)
        TISc = np.zeros((dim,), dtype=np.object)
        TSSc = np.zeros((dim,), dtype=np.object)
        CLV = np.zeros((dim,), dtype=np.object)
        CSTOP = np.zeros((dim,), dtype=np.object)
        TSTAT = np.zeros((dim,), dtype=np.object)
        TSCORE = np.zeros((dim,), dtype=np.object)

        # fetch the corresponding transcripts
        for xq, Lv1 in enumerate(child_nf_map[pkey]):

            TID = Lv1.get('ID', '')
            TRS[xq] = np.array([TID])

            TYPE = Lv1.get('type', '')
            TR_TYP[xq] = np.array('')
            TR_TYP[xq] = np.array(TYPE) if TYPE else TR_TYP[xq]

            orient = Lv1.get('strand', '')
            tr_score = Lv1.get('score', '')

            # fetch the different sub-features
            child_feat = defaultdict(list)
            for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
                E_TYP = Lv2.get('type', '')
                child_feat[E_TYP].append(Lv2.get('location'))

            # put the coordinates in general ascending order
            if orient == '-':
                for etype, excod in child_feat.items():
                    if len(excod) > 1:
                        if excod[0][0] > excod[-1][0]:
                            excod.reverse()
                            child_feat[etype] = excod

            # derive exon coordinates from CDS and UTR regions
            if not child_feat.get('exon'):
                if child_feat.get('CDS'):
                    exon_cod = utils.make_Exon_cod( orient,
                                NonetoemptyList(child_feat.get('five_prime_UTR')),
                                NonetoemptyList(child_feat.get('CDS')),
                                NonetoemptyList(child_feat.get('three_prime_UTR')))
                    child_feat['exon'] = exon_cod
                else:
                    # TODO only UTRs
                    # search through the keys to find a pattern describing an exon feature
                    ex_key_pattern = [k for k in child_feat if k.endswith("exon")]
                    if ex_key_pattern:
                        child_feat['exon'] = child_feat[ex_key_pattern[0]]

            # stop_codon features are separated from the CDS; add their coordinates based on strand
            if child_feat.get('stop_codon'):
                if orient == '+':
                    if child_feat.get('stop_codon')[0][0] - child_feat.get('CDS')[-1][1] == 1:
                        child_feat['CDS'][-1] = [child_feat.get('CDS')[-1][0], child_feat.get('stop_codon')[0][1]]
                    else:
                        child_feat['CDS'].append(child_feat.get('stop_codon')[0])
                elif orient == '-':
                    if child_feat.get('CDS')[0][0] - child_feat.get('stop_codon')[0][1] == 1:
                        child_feat['CDS'][0] = [child_feat.get('stop_codon')[0][0], child_feat.get('CDS')[0][1]]
                    else:
                        child_feat['CDS'].insert(0, child_feat.get('stop_codon')[0])

            # transcript signal sites
            TIS, cdsStop, TSS, cleave = [], [], [], []
            cds_status, exon_status, utr_status = 0, 0, 0

            if child_feat.get('exon'):
                TSS = [child_feat.get('exon')[-1][1]]
                TSS = [child_feat.get('exon')[0][0]] if orient == '+' else TSS
                cleave = [child_feat.get('exon')[0][0]]
                cleave = [child_feat.get('exon')[-1][1]] if orient == '+' else cleave
                exon_status = 1

            if child_feat.get('CDS'):
                if orient == '+':
                    TIS = [child_feat.get('CDS')[0][0]]
                    cdsStop = [child_feat.get('CDS')[-1][1]-3]
                else:
                    TIS = [child_feat.get('CDS')[-1][1]]
                    cdsStop = [child_feat.get('CDS')[0][0]+3]
                cds_status = 1
                # cds phase calculation
                child_feat['CDS'] = utils.add_CDS_phase(orient, child_feat.get('CDS'))

            # sub-feature status
            if child_feat.get('three_prime_UTR') or child_feat.get('five_prime_UTR'):
                utr_status = 1

            if utr_status == cds_status == exon_status == 1:
                t_status = 1
            else:
                t_status = 0

            # add sub-features; build arrays for export to different output formats
            TSTAT[xq] = t_status
            EXON[xq] = np.array(child_feat.get('exon'), np.float64)
            UTR5[xq] = np.array(NonetoemptyList(child_feat.get('five_prime_UTR')))
            UTR3[xq] = np.array(NonetoemptyList(child_feat.get('three_prime_UTR')))
            CDS[xq] = np.array(NonetoemptyList(child_feat.get('CDS')))
            TISc[xq] = np.array(TIS)
            CSTOP[xq] = np.array(cdsStop)
            TSSc[xq] = np.array(TSS)
            CLV[xq] = np.array(cleave)
            TSCORE[xq] = tr_score

        # add sub-features to the parent gene feature
        gene[g_cnt]['transcript_status'] = TSTAT
        gene[g_cnt]['transcripts'] = TRS
        gene[g_cnt]['exons'] = EXON
        gene[g_cnt]['utr5_exons'] = UTR5
        gene[g_cnt]['cds_exons'] = CDS
        gene[g_cnt]['utr3_exons'] = UTR3
        gene[g_cnt]['transcript_type'] = TR_TYP
        gene[g_cnt]['tis'] = TISc
        gene[g_cnt]['cdsStop'] = CSTOP
        gene[g_cnt]['tss'] = TSSc
        gene[g_cnt]['cleave'] = CLV
        gene[g_cnt]['transcript_score'] = TSCORE

        gene[g_cnt]['gene_info'] = dict( ID = pkey[-1],
                                    Name = pdet.get('name'),
                                    Source = pkey[1])
        # a few empty fields # TODO fill these:
        gene[g_cnt]['anno_id'] = []
        gene[g_cnt]['confgenes_id'] = []
        gene[g_cnt]['alias'] = ''
        gene[g_cnt]['name2'] = []
        gene[g_cnt]['chr_num'] = []
        gene[g_cnt]['paralogs'] = []
        gene[g_cnt]['transcript_valid'] = []
        gene[g_cnt]['exons_confirmed'] = []
        gene[g_cnt]['tis_conf'] = []
        gene[g_cnt]['tis_info'] = []
        gene[g_cnt]['cdsStop_conf'] = []
        gene[g_cnt]['cdsStop_info'] = []
        gene[g_cnt]['tss_info'] = []
        gene[g_cnt]['tss_conf'] = []
        gene[g_cnt]['cleave_info'] = []
        gene[g_cnt]['cleave_conf'] = []
        gene[g_cnt]['polya_info'] = []
        gene[g_cnt]['polya_conf'] = []
        gene[g_cnt]['is_valid'] = []
        gene[g_cnt]['transcript_complete'] = []
        gene[g_cnt]['is_complete'] = []
        gene[g_cnt]['is_correctly_gff3_referenced'] = ''
        gene[g_cnt]['splicegraph'] = []
        g_cnt += 1

    ## delete empty gene records from the main array
    XPFLG=0
    for XP, ens in enumerate(gene):
        if ens[0]==0:
            XPFLG=1
            break

    if XPFLG==1:
        XQC = range(XP, len(gene))
        gene = np.delete(gene, XQC)

    return gene

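# Example (not part of the original module; indexing shown is hypothetical) of
# reading the array returned above, whose record dtype comes from helper.init_gene():
#
#   genes = format_gene_models(parent_map, child_map)
#   first_tr_exons = genes[0]['exons'][0]   # exon coordinates of the first transcript
#   # per-gene sub-features are object arrays indexed by transcript
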
def NonetoemptyList(XS):
    """
    Convert a None type to an empty list

    @args XS: a list of coordinates or None
    @type XS: list
    """
    return [] if XS is None else XS

def create_missing_feature_type(p_feat, c_feat):
    """
    A GFF/GTF file may define only child features. This function tries to create
    the parent features from the information provided in the attribute column.

    example:
    chr21 hg19_knownGene exon 9690071 9690100 0.000000 + . gene_id "uc002zkg.1"; transcript_id "uc002zkg.1";
    chr21 hg19_knownGene exon 9692178 9692207 0.000000 + . gene_id "uc021wgt.1"; transcript_id "uc021wgt.1";
    chr21 hg19_knownGene exon 9711935 9712038 0.000000 + . gene_id "uc011abu.2"; transcript_id "uc011abu.2";

    This function gets the parsed feature annotations.

    @args p_feat: parent feature map
    @type p_feat: collections defaultdict
    @args c_feat: child feature map
    @type c_feat: collections defaultdict
    """

    child_n_map = defaultdict(list)
    for fid, det in c_feat.items():
        # get the details from the grand children
        GID = STRD = SCR = None
        SPOS, EPOS = [], []
        TYP = dict()
        for gchild in det:
            GID = gchild.get('gene_id', [''])[0]
            SPOS.append(gchild.get('location', [])[0])
            EPOS.append(gchild.get('location', [])[1])
            STRD = gchild.get('strand', '')
            SCR = gchild.get('score', '')
            if gchild.get('type', '') == "gene": ## gencode GTF file has this problem
                continue
            TYP[gchild.get('type', '')] = 1
        SPOS.sort()
        EPOS.sort()

        # infer the transcript type
        transcript_type = 'transcript'
        transcript_type = 'mRNA' if TYP.get('CDS', '') or TYP.get('cds', '') else transcript_type

        # gene id and transcript id are the same
        transcript_id = fid[-1]
        if GID == transcript_id:
            transcript_id = 'Transcript:' + str(GID)

        # level-1 feature type
        p_feat[(fid[0], fid[1], GID)] = dict( type = 'gene',
                                    location = [], ## infer location based on multiple transcripts
                                    strand = STRD,
                                    name = GID )
        # level-2 feature type
        child_n_map[(fid[0], fid[1], GID)].append(
                                    dict( type = transcript_type,
                                    location = [SPOS[0], EPOS[-1]],
                                    strand = STRD,
                                    score = SCR,
                                    ID = transcript_id,
                                    gene_id = '' ))
        # reorganize the grand children
        for gchild in det:
            child_n_map[(fid[0], fid[1], transcript_id)].append(
                                    dict( type = gchild.get('type', ''),
                                    location = gchild.get('location'),
                                    strand = gchild.get('strand'),
                                    ID = gchild.get('ID'),
                                    score = gchild.get('score'),
                                    gene_id = '' ))
    return p_feat, child_n_map

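# Example (not part of the original module; values are hypothetical, matching the
# flat GTF rows in the docstring above) of the parent synthesis performed here:
#
#   c_feat = {('chr21', 'hg19_knownGene', 'uc002zkg.1'): [
#       dict(type='exon', location=[9690071, 9690100], strand='+',
#            score='0.000000', ID='', gene_id=['uc002zkg.1'])]}
#   parents, children = create_missing_feature_type(dict(), c_feat)
#   # parents gains a ('chr21', 'hg19_knownGene', 'uc002zkg.1') 'gene' entry;
#   # children holds the synthesized transcript (ID 'Transcript:uc002zkg.1') under
#   # the gene key, and the exon re-keyed under the new transcript id.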