Mercurial > repos > vipints > deseq_hts
comparison deseq-hts_1.0/tools/ParseGFF.py @ 0:94a108763d9e draft
deseq-hts version 1.0 wraps the DESeq 1.6.0
author | vipints |
---|---|
date | Wed, 09 May 2012 20:43:47 -0400 |
parents | |
children | 8ab01cc29c4b |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:94a108763d9e |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 Extract genome annotation from a GFF3 (a tab delimited format | |
4 for storing sequence features and annotations: | |
5 http://www.sequenceontology.org/gff3.shtml) file. | |
6 | |
7 Usage: ParseGFF.py in.gff3 out.mat | |
8 | |
9 Requirements: | |
10 Scipy :- http://scipy.org/ | |
11 Numpy :- http://numpy.org/ | |
12 | |
13 Copyright (C) 2010-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany | |
14 """ | |
15 import re, sys, os | |
16 import scipy.io as sio | |
17 import numpy as np | |
18 | |
19 def createExon(strand_p, five_p_utr, cds_cod, three_p_utr): | |
20 """Create exon cordinates from UTR's and CDS region | |
21 """ | |
22 exon_pos = [] | |
23 if strand_p == '+': | |
24 utr5_start, utr5_end = 0, 0 | |
25 if five_p_utr != []: | |
26 utr5_start, utr5_end = five_p_utr[-1][0], five_p_utr[-1][1] | |
27 cds_5start, cds_5end = cds_cod[0][0], cds_cod[0][1] | |
28 jun_exon = [] | |
29 if cds_5start-utr5_end == 0 or cds_5start-utr5_end == 1: | |
30 jun_exon = [utr5_start, cds_5end] | |
31 if len(cds_cod) == 1: | |
32 five_prime_flag = 0 | |
33 if jun_exon != []: | |
34 five_p_utr = five_p_utr[:-1] | |
35 five_prime_flag = 1 | |
36 for utr5 in five_p_utr: | |
37 exon_pos.append(utr5) | |
38 jun_exon = [] | |
39 utr3_start, utr3_end = 0, 0 | |
40 if three_p_utr != []: | |
41 utr3_start = three_p_utr[0][0] | |
42 utr3_end = three_p_utr[0][1] | |
43 if utr3_start-cds_5end == 0 or utr3_start-cds_5end == 1: | |
44 jun_exon = [cds_5start, utr3_end] | |
45 three_prime_flag = 0 | |
46 if jun_exon != []: | |
47 cds_cod = cds_cod[:-1] | |
48 three_p_utr = three_p_utr[1:] | |
49 three_prime_flag = 1 | |
50 if five_prime_flag == 1 and three_prime_flag == 1: | |
51 exon_pos.append([utr5_start, utr3_end]) | |
52 if five_prime_flag == 1 and three_prime_flag == 0: | |
53 exon_pos.append([utr5_start, cds_5end]) | |
54 cds_cod = cds_cod[:-1] | |
55 if five_prime_flag == 0 and three_prime_flag == 1: | |
56 exon_pos.append([cds_5start, utr3_end]) | |
57 for cds in cds_cod: | |
58 exon_pos.append(cds) | |
59 for utr3 in three_p_utr: | |
60 exon_pos.append(utr3) | |
61 else: | |
62 if jun_exon != []: | |
63 five_p_utr = five_p_utr[:-1] | |
64 cds_cod = cds_cod[1:] | |
65 for utr5 in five_p_utr: | |
66 exon_pos.append(utr5) | |
67 exon_pos.append(jun_exon) if jun_exon != [] else '' | |
68 jun_exon = [] | |
69 utr3_start, utr3_end = 0, 0 | |
70 if three_p_utr != []: | |
71 utr3_start = three_p_utr[0][0] | |
72 utr3_end = three_p_utr[0][1] | |
73 cds_3start = cds_cod[-1][0] | |
74 cds_3end = cds_cod[-1][1] | |
75 if utr3_start-cds_3end == 0 or utr3_start-cds_3end == 1: | |
76 jun_exon = [cds_3start, utr3_end] | |
77 if jun_exon != []: | |
78 cds_cod = cds_cod[:-1] | |
79 three_p_utr = three_p_utr[1:] | |
80 for cds in cds_cod: | |
81 exon_pos.append(cds) | |
82 exon_pos.append(jun_exon) if jun_exon != [] else '' | |
83 for utr3 in three_p_utr: | |
84 exon_pos.append(utr3) | |
85 elif strand_p == '-': | |
86 utr3_start, utr3_end = 0, 0 | |
87 if three_p_utr != []: | |
88 utr3_start = three_p_utr[-1][0] | |
89 utr3_end = three_p_utr[-1][1] | |
90 cds_3start = cds_cod[0][0] | |
91 cds_3end = cds_cod[0][1] | |
92 jun_exon = [] | |
93 if cds_3start-utr3_end == 0 or cds_3start-utr3_end == 1: | |
94 jun_exon = [utr3_start, cds_3end] | |
95 if len(cds_cod) == 1: | |
96 three_prime_flag = 0 | |
97 if jun_exon != []: | |
98 three_p_utr = three_p_utr[:-1] | |
99 three_prime_flag = 1 | |
100 for utr3 in three_p_utr: | |
101 exon_pos.append(utr3) | |
102 jun_exon = [] | |
103 (utr5_start, utr5_end) = (0, 0) | |
104 if five_p_utr != []: | |
105 utr5_start = five_p_utr[0][0] | |
106 utr5_end = five_p_utr[0][1] | |
107 if utr5_start-cds_3end == 0 or utr5_start-cds_3end == 1: | |
108 jun_exon = [cds_3start, utr5_end] | |
109 five_prime_flag = 0 | |
110 if jun_exon != []: | |
111 cds_cod = cds_cod[:-1] | |
112 five_p_utr = five_p_utr[1:] | |
113 five_prime_flag = 1 | |
114 if three_prime_flag == 1 and five_prime_flag == 1: | |
115 exon_pos.append([utr3_start, utr5_end]) | |
116 if three_prime_flag == 1 and five_prime_flag == 0: | |
117 exon_pos.append([utr3_start, cds_3end]) | |
118 cds_cod = cds_cod[:-1] | |
119 if three_prime_flag == 0 and five_prime_flag == 1: | |
120 exon_pos.append([cds_3start, utr5_end]) | |
121 for cds in cds_cod: | |
122 exon_pos.append(cds) | |
123 for utr5 in five_p_utr: | |
124 exon_pos.append(utr5) | |
125 else: | |
126 if jun_exon != []: | |
127 three_p_utr = three_p_utr[:-1] | |
128 cds_cod = cds_cod[1:] | |
129 for utr3 in three_p_utr: | |
130 exon_pos.append(utr3) | |
131 if jun_exon != []: | |
132 exon_pos.append(jun_exon) | |
133 jun_exon = [] | |
134 (utr5_start, utr5_end) = (0, 0) | |
135 if five_p_utr != []: | |
136 utr5_start = five_p_utr[0][0] | |
137 utr5_end = five_p_utr[0][1] | |
138 cds_5start = cds_cod[-1][0] | |
139 cds_5end = cds_cod[-1][1] | |
140 if utr5_start-cds_5end == 0 or utr5_start-cds_5end == 1: | |
141 jun_exon = [cds_5start, utr5_end] | |
142 if jun_exon != []: | |
143 cds_cod = cds_cod[:-1] | |
144 five_p_utr = five_p_utr[1:] | |
145 for cds in cds_cod: | |
146 exon_pos.append(cds) | |
147 if jun_exon != []: | |
148 exon_pos.append(jun_exon) | |
149 for utr5 in five_p_utr: | |
150 exon_pos.append(utr5) | |
151 return exon_pos | |
152 | |
153 def init_gene(): | |
154 """Initializing the gene structure | |
155 """ | |
156 gene_details=dict(chr='', | |
157 exons=[], | |
158 gene_info={}, | |
159 id='', | |
160 is_alt_spliced=0, | |
161 name='', | |
162 source='', | |
163 start='', | |
164 stop='', | |
165 strand='', | |
166 transcripts=[]) | |
167 return gene_details | |
168 | |
169 def FeatureValueFormat(singlegene): | |
170 """Make feature value compactable to write in a .mat format | |
171 """ | |
172 comp_exon = np.zeros((len(singlegene['exons']),), dtype=np.object) | |
173 for i in range(len(singlegene['exons'])): | |
174 comp_exon[i]= np.array(singlegene['exons'][i]) | |
175 singlegene['exons'] = comp_exon | |
176 comp_transcripts = np.zeros((len(singlegene['transcripts']),), dtype=np.object) | |
177 for i in range(len(singlegene['transcripts'])): | |
178 comp_transcripts[i] = np.array(singlegene['transcripts'][i]) | |
179 singlegene['transcripts'] = comp_transcripts | |
180 return singlegene | |
181 | |
182 def CreateGeneModels(genes_cmpt, transcripts_cmpt, exons_cmpt, utr3_cmpt, utr5_cmpt, cds_cmpt): | |
183 """Creating Coding/Non-coding gene models from parsed GFF objects. | |
184 """ | |
185 gene_counter, gene_models=1, [] | |
186 for gene_entry in genes_cmpt: ## Figure out the genes and transcripts associated feature | |
187 if gene_entry in transcripts_cmpt: | |
188 gene=init_gene() | |
189 gene['id']=gene_counter | |
190 gene['name']=gene_entry[1] | |
191 gene['chr']=genes_cmpt[gene_entry]['chr'] | |
192 gene['source']=genes_cmpt[gene_entry]['source'] | |
193 gene['start']=genes_cmpt[gene_entry]['start'] | |
194 gene['stop']=genes_cmpt[gene_entry]['stop'] | |
195 gene['strand']=genes_cmpt[gene_entry]['strand'] | |
196 if not gene['strand'] in ['+', '-']: | |
197 gene['strand']='.' # Strand info not known replaced with a dot symbol instead of None, ?, . etc. | |
198 gene['gene_info']=dict(ID=gene_entry[1]) | |
199 if len(transcripts_cmpt[gene_entry])>1: | |
200 gene['is_alt_spliced'] = 1 | |
201 for tids in transcripts_cmpt[gene_entry]: ## transcript section related tags | |
202 gene['transcripts'].append(tids['ID']) | |
203 if len(exons_cmpt) != 0: | |
204 if (gene['chr'], tids['ID']) in exons_cmpt: | |
205 exon_cod=[[feat_exon['start'], feat_exon['stop']] for feat_exon in exons_cmpt[(gene['chr'], tids['ID'])]] | |
206 else: ## build exon coordinates from UTR3, UTR5 and CDS | |
207 utr5_pos, cds_pos, utr3_pos = [], [], [] | |
208 if (gene['chr'], tids['ID']) in utr5_cmpt: | |
209 utr5_pos=[[feat_utr5['start'], feat_utr5['stop']] for feat_utr5 in utr5_cmpt[(gene['chr'], tids['ID'])]] | |
210 if (gene['chr'], tids['ID']) in cds_cmpt: | |
211 cds_pos=[[feat_cds['start'], feat_cds['stop']] for feat_cds in cds_cmpt[(gene['chr'], tids['ID'])]] | |
212 if (gene['chr'], tids['ID']) in utr3_cmpt: | |
213 utr3_pos=[[feat_utr3['start'], feat_utr3['stop']] for feat_utr3 in utr3_cmpt[(gene['chr'], tids['ID'])]] | |
214 exon_cod=createExon(gene['strand'], utr5_pos, cds_pos, utr3_pos) | |
215 if gene['strand']=='-': | |
216 if len(exon_cod) >1: | |
217 if exon_cod[0][0] > exon_cod[-1][0]: | |
218 exon_cod.reverse() | |
219 if exon_cod: | |
220 gene['exons'].append(exon_cod) | |
221 gene=FeatureValueFormat(gene) # get prepare for MAT writing | |
222 gene_counter+=1 | |
223 gene_models.append(gene) | |
224 return gene_models | |
225 | |
226 def GFFParse(gff_file): | |
227 """Parsing GFF file based on feature relationship. | |
228 """ | |
229 genes, utr5, exons=dict(), dict(), dict() | |
230 transcripts, utr3, cds=dict(), dict(), dict() | |
231 # TODO Include growing key words of different non-coding/coding transcripts | |
232 features=['mRNA', 'transcript', 'ncRNA', 'miRNA', 'pseudogenic_transcript', 'rRNA', 'snoRNA', 'snRNA', 'tRNA', 'scRNA'] | |
233 gff_handle=open(gff_file, "rU") | |
234 for gff_line in gff_handle: | |
235 gff_line=gff_line.strip('\n\r').split('\t') | |
236 if re.match(r'#|>', gff_line[0]): # skip commented line and fasta identifier line | |
237 continue | |
238 if len(gff_line)==1: # skip fasta sequence/empty line if present | |
239 continue | |
240 assert len(gff_line)==9, '\t'.join(gff_line) # not found 9 tab-delimited fields in this line | |
241 if '' in gff_line: # skip this line if there any field with an empty value | |
242 print 'Skipping..', '\t'.join(gff_line) | |
243 continue | |
244 if gff_line[-1][-1]==';': # trim the last ';' character | |
245 gff_line[-1]=gff_line[-1].strip(';') | |
246 if gff_line[2] in ['gene', 'pseudogene']: | |
247 gid, gene_info=None, dict() | |
248 gene_info['start']=int(gff_line[3]) | |
249 gene_info['stop']=int(gff_line[4]) | |
250 gene_info['chr']=gff_line[0] | |
251 gene_info['source']=gff_line[1] | |
252 gene_info['strand']=gff_line[6] | |
253 for attb in gff_line[-1].split(';'): | |
254 attb=attb.split('=') # gff attributes are separated by key=value pair | |
255 if attb[0]=='ID': | |
256 gid=attb[1] | |
257 break | |
258 genes[(gff_line[0], gid)]=gene_info # store gene information based on the chromosome and gene symbol. | |
259 elif gff_line[2] in features: | |
260 gid, mrna_info=None, dict() | |
261 mrna_info['start']=int(gff_line[3]) | |
262 mrna_info['stop']=int(gff_line[4]) | |
263 mrna_info['chr']=gff_line[0] | |
264 mrna_info['strand']=gff_line[6] | |
265 for attb in gff_line[-1].split(';'): | |
266 attb=attb.split('=') | |
267 if attb[0]=='Parent': | |
268 gid=attb[1] | |
269 elif attb[0]=='ID': | |
270 mrna_info[attb[0]]=attb[1] | |
271 for fid in gid.split(','): # child may be mapped to multiple parents ex: Parent=AT01,AT01-1-Protein | |
272 if (gff_line[0], fid) in transcripts: | |
273 transcripts[(gff_line[0], fid)].append(mrna_info) | |
274 else: | |
275 transcripts[(gff_line[0], fid)]=[mrna_info] | |
276 elif gff_line[2] in ['exon']: | |
277 tids, exon_info=None, dict() | |
278 exon_info['start']=int(gff_line[3]) | |
279 exon_info['stop']=int(gff_line[4]) | |
280 exon_info['chr']=gff_line[0] | |
281 exon_info['strand']=gff_line[6] | |
282 for attb in gff_line[-1].split(';'): | |
283 attb=attb.split('=') | |
284 if attb[0]=='Parent': | |
285 tids=attb[1] | |
286 break | |
287 for tid in tids.split(','): | |
288 if (gff_line[0], tid) in exons: | |
289 exons[(gff_line[0], tid)].append(exon_info) | |
290 else: | |
291 exons[(gff_line[0], tid)]=[exon_info] | |
292 elif gff_line[2] in ['five_prime_UTR']: | |
293 utr5_info, tids=dict(), None | |
294 utr5_info['start']=int(gff_line[3]) | |
295 utr5_info['stop']=int(gff_line[4]) | |
296 utr5_info['chr']=gff_line[0] | |
297 utr5_info['strand']=gff_line[6] | |
298 for attb in gff_line[-1].split(';'): | |
299 attb=attb.split('=') | |
300 if attb[0]=='Parent': | |
301 tids=attb[1] | |
302 break | |
303 for tid in tids.split(','): | |
304 if (gff_line[0], tid) in utr5: | |
305 utr5[(gff_line[0], tid)].append(utr5_info) | |
306 else: | |
307 utr5[(gff_line[0], tid)]=[utr5_info] | |
308 elif gff_line[2] in ['CDS']: | |
309 cds_info, tids=dict(), None | |
310 cds_info['start']=int(gff_line[3]) | |
311 cds_info['stop']=int(gff_line[4]) | |
312 cds_info['chr']=gff_line[0] | |
313 cds_info['strand']=gff_line[6] | |
314 for attb in gff_line[-1].split(';'): | |
315 attb=attb.split('=') | |
316 if attb[0]=='Parent': | |
317 tids=attb[1] | |
318 break | |
319 for tid in tids.split(','): | |
320 if (gff_line[0], tid) in cds: | |
321 cds[(gff_line[0], tid)].append(cds_info) | |
322 else: | |
323 cds[(gff_line[0], tid)]=[cds_info] | |
324 elif gff_line[2] in ['three_prime_UTR']: | |
325 utr3_info, tids=dict(), None | |
326 utr3_info['start']=int(gff_line[3]) | |
327 utr3_info['stop']=int(gff_line[4]) | |
328 utr3_info['chr']=gff_line[0] | |
329 utr3_info['strand']=gff_line[6] | |
330 for attb in gff_line[-1].split(';'): | |
331 attb=attb.split('=') | |
332 if attb[0]=='Parent': | |
333 tids=attb[1] | |
334 break | |
335 for tid in tids.split(','): | |
336 if (gff_line[0], tid) in utr3: | |
337 utr3[(gff_line[0], tid)].append(utr3_info) | |
338 else: | |
339 utr3[(gff_line[0], tid)]=[utr3_info] | |
340 gff_handle.close() | |
341 return genes, transcripts, exons, utr3, utr5, cds | |
342 | |
343 def __main__(): | |
344 """This function provides a best way to extract genome feature | |
345 information from a GFF3 file for the rQuant downstream processing. | |
346 """ | |
347 try: | |
348 gff_file = sys.argv[1] | |
349 mat_file = sys.argv[2] | |
350 except: | |
351 print __doc__ | |
352 sys.exit(-1) | |
353 genes, transcripts, exons, utr3, utr5, cds=GFFParse(gff_file) | |
354 gene_models=CreateGeneModels(genes, transcripts, exons, utr3, utr5, cds) | |
355 # TODO Write to matlab/octave struct instead of cell arrays. | |
356 sio.savemat(mat_file, | |
357 mdict=dict(genes=gene_models), | |
358 format='5', | |
359 oned_as='row') | |
360 | |
361 if __name__=='__main__': | |
362 __main__() |