rDiff/tools/GFFParser.py @ 2:233c30f91d66

updated Python-based GFF parsing module which will handle GTF/GFF/GFF3 file types
author vipints <vipin@cbio.mskcc.org>
date Tue, 08 Oct 2013 07:15:44 -0400
#!/usr/bin/env python
"""
Extract genome annotation from a GFF (a tab-delimited format for storing sequence features and annotations) file.

Usage: GFFParser.py in.gff3 out.mat

Requirements:
Numpy :- http://numpy.org/
Scipy :- http://scipy.org/

Copyright (C)

2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
2012-2013 Memorial Sloan-Kettering Cancer Center, New York City, USA.
"""

import re
import os
import sys
import urllib
import numpy as np
import scipy.io as sio
from collections import defaultdict
import helper as utils
def _attribute_tags(col9):
    """
    Split the key-value tags from the attribute column; it takes column number 9 from a GTF/GFF file
    """
    info = defaultdict(list)
    is_gff = False

    if not col9:
        return is_gff, info

    # trim the trailing semi-colon; UCSC files may have some white-space
    col9 = col9.rstrip(';| ')
    # attributes from 9th column
    atbs = col9.split(" ; ")
    if len(atbs) == 1:
        atbs = col9.split("; ")
        if len(atbs) == 1:
            atbs = col9.split(";")
    # check the GFF3 pattern which has key-value pairs like: key=value
    gff3_pat = re.compile("\w+=")
    # sometimes a GTF has: gene_id uc002zkg.1;
    gtf_pat = re.compile("\s?\w+\s")

    key_vals = []

    if gff3_pat.match(atbs[0]): # gff3 pattern
        is_gff = True
        key_vals = [at.split('=') for at in atbs]
    elif gtf_pat.match(atbs[0]): # gtf pattern
        for at in atbs:
            key_vals.append(at.strip().split(" ", 1))
    else:
        # handle an attribute column that carries only a single value
        key_vals.append(['ID', atbs[0]])
    # get key, val items
    for item in key_vals:
        key, val = item
        # remove the double quotes from the feature identifier
        val = re.sub('"', '', val)
        # convert URL-escaped (percent-encoded) characters back to plain text
        info[key].extend([urllib.unquote(v) for v in val.split(',') if v])

    return is_gff, info

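# An illustrative sketch (not part of the original module) of the two attribute
# styles _attribute_tags() distinguishes; the GFF3 string is made up for
# illustration, the GTF string is taken from the docstring example further down
# in this file, and the returned defaultdict is shown as a plain dict:
#
#   _attribute_tags('ID=Gene:WBGene00000001;Name=aap-1')
#       -> (True,  {'ID': ['Gene:WBGene00000001'], 'Name': ['aap-1']})
#   _attribute_tags('gene_id "uc002zkg.1"; transcript_id "uc002zkg.1";')
#       -> (False, {'gene_id': ['uc002zkg.1'], 'transcript_id': ['uc002zkg.1']})
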
def _spec_features_keywd(gff_parts):
    """
    Specify the feature key word according to the GFF specifications
    """
    for t_id in ["transcript_id", "transcriptId", "proteinId"]:
        try:
            gff_parts["info"]["Parent"] = gff_parts["info"][t_id]
            break
        except KeyError:
            pass
    for g_id in ["gene_id", "geneid", "geneId", "name", "gene_name", "genename"]:
        try:
            gff_parts["info"]["GParent"] = gff_parts["info"][g_id]
            break
        except KeyError:
            pass
    ## TODO key words
    for flat_name in ["Transcript", "CDS"]:
        if gff_parts["info"].has_key(flat_name):
            # parents
            if gff_parts['type'] in [flat_name] or re.search(r'transcript', gff_parts['type'], re.IGNORECASE):
                if not gff_parts['id']:
                    gff_parts['id'] = gff_parts['info'][flat_name][0]
                    #gff_parts["info"]["ID"] = [gff_parts["id"]]
            # children
            elif gff_parts["type"] in ["intron", "exon", "pseudogenic_exon", "three_prime_UTR",
                                       "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
                                       "start_codon"]:
                gff_parts["info"]["Parent"] = gff_parts["info"][flat_name]
            break
    return gff_parts

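# A small sketch (illustrative, reusing the GTF tags from the example further
# down in this file) of what the keyword mapping in _spec_features_keywd()
# does for a GTF exon record:
#
#   parts = {'type': 'exon', 'id': '',
#            'info': {'transcript_id': ['uc002zkg.1'], 'gene_id': ['uc002zkg.1']}}
#   parts = _spec_features_keywd(parts)
#   # parts['info']['Parent']  -> ['uc002zkg.1']   (taken from transcript_id)
#   # parts['info']['GParent'] -> ['uc002zkg.1']   (taken from gene_id)
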
def Parse(ga_file):
    """
    Parse the GFF/GTF file based on feature relationships; it takes the input file name.
    """
    child_map = defaultdict(list)
    parent_map = dict()

    ga_handle = utils._open_file(ga_file)

    for rec in ga_handle:
        rec = rec.strip('\n\r')

        # skip empty lines, FASTA identifiers and comment lines
        if not rec or rec[0] in ['#', '>']:
            continue
        # skip the genome sequence
        if not re.search('\t', rec):
            continue

        parts = rec.split('\t')
        assert len(parts) >= 8, rec

        # process the attribute column (9th column)
        ftype, tags = _attribute_tags(parts[-1])
        if not tags: # skip the line if no attribute column.
            continue

        # extract fields
        if parts[1]:
            tags["source"] = parts[1]
        if parts[7]:
            tags["phase"] = parts[7]

        gff_info = dict()
        gff_info['info'] = dict(tags)
        #gff_info["is_gff3"] = ftype
        gff_info['chr'] = parts[0]

        if parts[3] and parts[4]:
            gff_info['location'] = [int(parts[3]),
                                    int(parts[4])]
            gff_info['type'] = parts[2]
            gff_info['id'] = tags.get('ID', [''])[0]
            if parts[6] in ['?', '.']:
                parts[6] = None
            gff_info['strand'] = parts[6]

            # key word according to the GFF spec.
            if not ftype:
                gff_info = _spec_features_keywd(gff_info)

            # link the feature relationships
            if gff_info['info'].has_key('Parent'):
                for p in gff_info['info']['Parent']:
                    if p == gff_info['id']:
                        gff_info['id'] = ''
                        break
                rec_category = 'child'
            elif gff_info['id']:
                rec_category = 'parent'
            else:
                rec_category = 'record'

            # organize the features depending on the record category
            if rec_category == 'child':
                for p in gff_info['info']['Parent']:
                    # create the data structure based on source and feature id
                    child_map[(gff_info['chr'], gff_info['info']['source'], p)].append(
                                dict( type = gff_info['type'],
                                      location = gff_info['location'],
                                      strand = gff_info['strand'],
                                      ID = gff_info['id'],
                                      gene_id = gff_info['info'].get('GParent', '')
                                      ))
            elif rec_category == 'parent':
                parent_map[(gff_info['chr'], gff_info['info']['source'], gff_info['id'])] = dict(
                                type = gff_info['type'],
                                location = gff_info['location'],
                                strand = gff_info['strand'],
                                name = tags.get('Name', [''])[0])
            elif rec_category == 'record':
                #TODO how to handle plain records?
                c = 1
    ga_handle.close()

    # depending on the file type, create the missing parent features
    if not ftype:
        parent_map, child_map = _create_missing_feature_type(parent_map, child_map)

    # connecting parent-child relations
    # // at this point the parent-child features are available from any type of GTF/GFF2/GFF3 file
    gene_mat = _format_gene_models(parent_map, child_map)

    return gene_mat

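# A minimal usage sketch (assuming "in.gff3" is a readable annotation file and
# helper.py provides the _open_file/init_gene_DE helpers imported above). The
# returned record array is built by _format_gene_models() below, which fills at
# least these fields per gene: id, chr, source, name, start, stop, strand,
# is_alt_spliced, transcripts, exons and gene_info; the complete dtype comes
# from utils.init_gene_DE().
#
#   genes = Parse('in.gff3')
#   for g in genes:
#       print g['name'], g['chr'], g['start'], g['stop'], g['strand']
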
def _format_gene_models(parent_nf_map, child_nf_map):
    """
    Generate the GeneObject based on the parsed file contents

    parent_map: parent features with source and chromosome information
    child_map: transcript and exon information are encoded
    """
    g_cnt = 0
    gene = np.zeros((len(parent_nf_map),), dtype = utils.init_gene_DE())

    for pkey, pdet in parent_nf_map.items():
        # considering only gene features
        if not re.search(r'gene', pdet.get('type', '')):
            continue
        # infer the gene start and stop if not present in the parent feature
        if not pdet.get('location', []):
            GNS, GNE = [], []
            # multiple number of transcripts
            for L1 in child_nf_map[pkey]:
                GNS.append(L1.get('location', [])[0])
                GNE.append(L1.get('location', [])[1])
            GNS.sort()
            GNE.sort()
            pdet['location'] = [GNS[0], GNE[-1]]
        orient = pdet.get('strand', '')

        gene[g_cnt]['id'] = g_cnt + 1
        gene[g_cnt]['chr'] = pkey[0]
        gene[g_cnt]['source'] = pkey[1]
        gene[g_cnt]['name'] = pkey[-1]
        gene[g_cnt]['start'] = pdet.get('location', [])[0]
        gene[g_cnt]['stop'] = pdet.get('location', [])[1]
        gene[g_cnt]['strand'] = orient

        # default value
        gene[g_cnt]['is_alt_spliced'] = 0
        if len(child_nf_map[pkey]) > 1:
            gene[g_cnt]['is_alt_spliced'] = 1

        # complete sub-feature for all transcripts
        dim = len(child_nf_map[pkey])
        TRS = np.zeros((dim,), dtype=np.object)
        EXON = np.zeros((dim,), dtype=np.object)

        # fetching corresponding transcripts
        for xq, Lv1 in enumerate(child_nf_map[pkey]):

            TID = Lv1.get('ID', '')
            TRS[xq] = np.array([TID])

            orient = Lv1.get('strand', '')

            # fetching different sub-features
            child_feat = defaultdict(list)
            for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
                E_TYP = Lv2.get('type', '')
                child_feat[E_TYP].append(Lv2.get('location'))

            # make exon coordinates from cds and utr regions
            if not child_feat.get('exon'):
                if child_feat.get('CDS'):
                    exon_cod = utils.make_Exon_cod( orient,
                                NonetoemptyList(child_feat.get('five_prime_UTR')),
                                NonetoemptyList(child_feat.get('CDS')),
                                NonetoemptyList(child_feat.get('three_prime_UTR')))
                    child_feat['exon'] = exon_cod
                else:
                    # searching through the keys to find a pattern describing the exon feature
                    ex_key_pattern = [k for k in child_feat if k.endswith("exon")]
                    child_feat['exon'] = child_feat[ex_key_pattern[0]]
                    # TODO only UTR's

            # make general ascending order of coordinates
            if orient == '-':
                for etype, excod in child_feat.items():
                    if len(excod) > 1:
                        if excod[0][0] > excod[-1][0]:
                            excod.reverse()
                            child_feat[etype] = excod

            # add the sub-feature; make an array for export to different output formats
            EXON[xq] = np.array(child_feat.get('exon'), np.float64)

        # add sub-features to the parent gene feature
        gene[g_cnt]['transcripts'] = TRS
        gene[g_cnt]['exons'] = EXON

        gene[g_cnt]['gene_info'] = dict( ID = pkey[-1],
                                         Name = pdet.get('name'),
                                         Source = pkey[1])
        g_cnt += 1

    ## deleting empty gene records from the main array
    XPFLG = 0
    for XP, ens in enumerate(gene):
        if ens[0] == 0:
            XPFLG = 1
            break

    if XPFLG == 1:
        XQC = range(XP, len(gene))
        gene = np.delete(gene, XQC)

    return gene

def NonetoemptyList(XS):
    """
    Convert a None type to an empty list
    """
    return [] if XS is None else XS

def _create_missing_feature_type(p_feat, c_feat):
    """
    A GFF/GTF file may define only child features. This function tries to create
    the parent feature from the information provided in the attribute column.

    example:
    chr21 hg19_knownGene exon 9690071 9690100 0.000000 + . gene_id "uc002zkg.1"; transcript_id "uc002zkg.1";
    chr21 hg19_knownGene exon 9692178 9692207 0.000000 + . gene_id "uc021wgt.1"; transcript_id "uc021wgt.1";
    chr21 hg19_knownGene exon 9711935 9712038 0.000000 + . gene_id "uc011abu.2"; transcript_id "uc011abu.2";

    This function gets the parsed feature annotations.
    """
    child_n_map = defaultdict(list)
    for fid, det in c_feat.items():
        # get the details from the grand child
        GID = STRD = None
        SPOS, EPOS = [], []
        TYP = dict()
        for gchild in det:
            GID = gchild.get('gene_id', [''])[0]
            SPOS.append(gchild.get('location', [])[0])
            EPOS.append(gchild.get('location', [])[1])
            STRD = gchild.get('strand', '')
            TYP[gchild.get('type', '')] = 1
        SPOS.sort()
        EPOS.sort()

        # infer the transcript type
        transcript_type = 'transcript'
        transcript_type = 'mRNA' if TYP.get('CDS', '') or TYP.get('cds', '') else transcript_type

        # gene id and transcript id are the same
        transcript_id = fid[-1]
        if GID == transcript_id:
            transcript_id = 'Transcript:' + str(GID)

        # level -1 feature type
        p_feat[(fid[0], fid[1], GID)] = dict( type = 'gene',
                                              location = [], ## infer location based on multiple transcripts
                                              strand = STRD,
                                              name = GID )
        # level -2 feature type
        child_n_map[(fid[0], fid[1], GID)].append(
                    dict( type = transcript_type,
                          location = [SPOS[0], EPOS[-1]],
                          strand = STRD,
                          ID = transcript_id,
                          gene_id = '' ))
        # reorganizing the grand child
        for gchild in det:
            child_n_map[(fid[0], fid[1], transcript_id)].append(
                    dict( type = gchild.get('type', ''),
                          location = gchild.get('location'),
                          strand = gchild.get('strand'),
                          ID = gchild.get('ID'),
                          gene_id = '' ))
    return p_feat, child_n_map

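# A rough sketch of what _create_missing_feature_type() builds for the first
# hg19_knownGene line shown in its docstring (structures abbreviated; because
# gene_id equals transcript_id there, the transcript gets the 'Transcript:' prefix):
#
#   p_feat[('chr21', 'hg19_knownGene', 'uc002zkg.1')]
#       -> {'type': 'gene', 'location': [], 'strand': '+', 'name': 'uc002zkg.1'}
#   child_n_map[('chr21', 'hg19_knownGene', 'uc002zkg.1')]
#       -> [{'type': 'transcript', 'location': [9690071, 9690100], 'strand': '+',
#            'ID': 'Transcript:uc002zkg.1', 'gene_id': ''}]
#   child_n_map[('chr21', 'hg19_knownGene', 'Transcript:uc002zkg.1')]
#       -> [{'type': 'exon', 'location': [9690071, 9690100], 'strand': '+', ...}]
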

## General instruction to use the above functions:
## Usage: GFFParser.py in.gff3 out.mat

try:
    gff_file = sys.argv[1]
    out_mat = sys.argv[2]
except:
    print __doc__
    sys.exit(-1)

## Parse the file according to its type and return the gene information --
gene_struct = Parse(gff_file)

## Write the gene annotations to a matlab struct array format --
sio.savemat(out_mat,
            mdict = dict(genes = gene_struct),
            format = '5',
            oned_as = 'row')
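
## A small follow-up sketch (assuming the script above was run as
## "GFFParser.py in.gff3 out.mat") of how the exported struct array
## could be read back with scipy:
##
##   import scipy.io as sio
##   mat = sio.loadmat('out.mat', struct_as_record=False, squeeze_me=True)
##   genes = mat['genes']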