annotate rDiff/tools/ParseGFF.py @ 0:0f80a5141704

version 0.3 uploaded
author vipints
date Thu, 14 Feb 2013 23:38:36 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
1 #!/usr/bin/env python
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
2 """
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
3 Extract genome annotation from a GFF3 (a tab delimited
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
4 format for storing sequence features and annotations:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
5 http://www.sequenceontology.org/gff3.shtml) file.
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
6
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
7 Usage: ParseGFF.py in.gff3 out.mat
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
8
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
9 Requirements:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
10 Scipy :- http://scipy.org/
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
11 Numpy :- http://numpy.org/
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
12
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
13 Copyright (C) 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
14 2012- Memorial Sloan-Kettering Cancer Center, New York City, USA
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
15 """
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
16
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
17 import re, sys
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
18 import scipy.io as sio
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
19 import numpy as np
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
20
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
21 def addCDSphase(strand, cds):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
22 """Add CDS phase to the CDS exons
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
23 """
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
24 cds_region, cds_flag = [], 0
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
25 if strand == '+':
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
26 for cdspos in cds:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
27 if cds_flag == 0:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
28 cdspos = (cdspos[0], cdspos[1], 0)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
29 diff = (cdspos[1]-(cdspos[0]-1))%3
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
30 else:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
31 xy = 0
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
32 if diff == 0:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
33 cdspos = (cdspos[0], cdspos[1], 0)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
34 elif diff == 1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
35 cdspos = (cdspos[0], cdspos[1], 2)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
36 xy = 2
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
37 elif diff == 2:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
38 cdspos = (cdspos[0], cdspos[1], 1)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
39 xy = 1
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
40 diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
41 cds_region.append(cdspos)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
42 cds_flag = 1
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
43 elif strand == '-':
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
44 cds.reverse()
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
45 for cdspos in cds:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
46 if cds_flag == 0:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
47 cdspos = (cdspos[0], cdspos[1], 0)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
48 diff = (cdspos[1]-(cdspos[0]-1))%3
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
49 else:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
50 xy = 0
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
51 if diff == 0:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
52 cdspos = (cdspos[0], cdspos[1], 0)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
53 elif diff == 1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
54 cdspos = (cdspos[0], cdspos[1], 2)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
55 xy = 2
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
56 elif diff == 2:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
57 cdspos = (cdspos[0], cdspos[1], 1)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
58 xy = 1
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
59 diff = ((cdspos[1]-(cdspos[0]-1))-xy)%3
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
60 cds_region.append(cdspos)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
61 cds_flag = 1
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
62 cds_region.reverse()
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
63 return cds_region
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
64
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
65 def createExon(strand_p, five_p_utr, cds_cod, three_p_utr):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
66 """Create exon cordinates from UTR's and CDS region
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
67 """
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
68 exon_pos = []
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
69 if strand_p == '+':
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
70 utr5_start, utr5_end = 0, 0
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
71 if five_p_utr != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
72 utr5_start, utr5_end = five_p_utr[-1][0], five_p_utr[-1][1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
73 cds_5start, cds_5end = cds_cod[0][0], cds_cod[0][1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
74 jun_exon = []
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
75 if cds_5start-utr5_end == 0 or cds_5start-utr5_end == 1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
76 jun_exon = [utr5_start, cds_5end]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
77 if len(cds_cod) == 1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
78 five_prime_flag = 0
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
79 if jun_exon != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
80 five_p_utr = five_p_utr[:-1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
81 five_prime_flag = 1
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
82 for utr5 in five_p_utr:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
83 exon_pos.append(utr5)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
84 jun_exon = []
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
85 utr3_start, utr3_end = 0, 0
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
86 if three_p_utr != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
87 utr3_start = three_p_utr[0][0]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
88 utr3_end = three_p_utr[0][1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
89 if utr3_start-cds_5end == 0 or utr3_start-cds_5end == 1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
90 jun_exon = [cds_5start, utr3_end]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
91 three_prime_flag = 0
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
92 if jun_exon != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
93 cds_cod = cds_cod[:-1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
94 three_p_utr = three_p_utr[1:]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
95 three_prime_flag = 1
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
96 if five_prime_flag == 1 and three_prime_flag == 1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
97 exon_pos.append([utr5_start, utr3_end])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
98 if five_prime_flag == 1 and three_prime_flag == 0:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
99 exon_pos.append([utr5_start, cds_5end])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
100 cds_cod = cds_cod[:-1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
101 if five_prime_flag == 0 and three_prime_flag == 1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
102 exon_pos.append([cds_5start, utr3_end])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
103 for cds in cds_cod:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
104 exon_pos.append(cds)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
105 for utr3 in three_p_utr:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
106 exon_pos.append(utr3)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
107 else:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
108 if jun_exon != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
109 five_p_utr = five_p_utr[:-1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
110 cds_cod = cds_cod[1:]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
111 for utr5 in five_p_utr:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
112 exon_pos.append(utr5)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
113 exon_pos.append(jun_exon) if jun_exon != [] else ''
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
114 jun_exon = []
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
115 utr3_start, utr3_end = 0, 0
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
116 if three_p_utr != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
117 utr3_start = three_p_utr[0][0]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
118 utr3_end = three_p_utr[0][1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
119 cds_3start = cds_cod[-1][0]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
120 cds_3end = cds_cod[-1][1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
121 if utr3_start-cds_3end == 0 or utr3_start-cds_3end == 1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
122 jun_exon = [cds_3start, utr3_end]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
123 if jun_exon != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
124 cds_cod = cds_cod[:-1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
125 three_p_utr = three_p_utr[1:]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
126 for cds in cds_cod:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
127 exon_pos.append(cds)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
128 exon_pos.append(jun_exon) if jun_exon != [] else ''
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
129 for utr3 in three_p_utr:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
130 exon_pos.append(utr3)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
131 elif strand_p == '-':
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
132 utr3_start, utr3_end = 0, 0
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
133 if three_p_utr != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
134 utr3_start = three_p_utr[-1][0]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
135 utr3_end = three_p_utr[-1][1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
136 cds_3start = cds_cod[0][0]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
137 cds_3end = cds_cod[0][1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
138 jun_exon = []
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
139 if cds_3start-utr3_end == 0 or cds_3start-utr3_end == 1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
140 jun_exon = [utr3_start, cds_3end]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
141 if len(cds_cod) == 1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
142 three_prime_flag = 0
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
143 if jun_exon != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
144 three_p_utr = three_p_utr[:-1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
145 three_prime_flag = 1
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
146 for utr3 in three_p_utr:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
147 exon_pos.append(utr3)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
148 jun_exon = []
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
149 (utr5_start, utr5_end) = (0, 0)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
150 if five_p_utr != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
151 utr5_start = five_p_utr[0][0]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
152 utr5_end = five_p_utr[0][1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
153 if utr5_start-cds_3end == 0 or utr5_start-cds_3end == 1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
154 jun_exon = [cds_3start, utr5_end]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
155 five_prime_flag = 0
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
156 if jun_exon != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
157 cds_cod = cds_cod[:-1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
158 five_p_utr = five_p_utr[1:]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
159 five_prime_flag = 1
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
160 if three_prime_flag == 1 and five_prime_flag == 1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
161 exon_pos.append([utr3_start, utr5_end])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
162 if three_prime_flag == 1 and five_prime_flag == 0:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
163 exon_pos.append([utr3_start, cds_3end])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
164 cds_cod = cds_cod[:-1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
165 if three_prime_flag == 0 and five_prime_flag == 1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
166 exon_pos.append([cds_3start, utr5_end])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
167 for cds in cds_cod:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
168 exon_pos.append(cds)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
169 for utr5 in five_p_utr:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
170 exon_pos.append(utr5)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
171 else:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
172 if jun_exon != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
173 three_p_utr = three_p_utr[:-1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
174 cds_cod = cds_cod[1:]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
175 for utr3 in three_p_utr:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
176 exon_pos.append(utr3)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
177 if jun_exon != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
178 exon_pos.append(jun_exon)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
179 jun_exon = []
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
180 (utr5_start, utr5_end) = (0, 0)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
181 if five_p_utr != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
182 utr5_start = five_p_utr[0][0]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
183 utr5_end = five_p_utr[0][1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
184 cds_5start = cds_cod[-1][0]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
185 cds_5end = cds_cod[-1][1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
186 if utr5_start-cds_5end == 0 or utr5_start-cds_5end == 1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
187 jun_exon = [cds_5start, utr5_end]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
188 if jun_exon != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
189 cds_cod = cds_cod[:-1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
190 five_p_utr = five_p_utr[1:]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
191 for cds in cds_cod:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
192 exon_pos.append(cds)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
193 if jun_exon != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
194 exon_pos.append(jun_exon)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
195 for utr5 in five_p_utr:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
196 exon_pos.append(utr5)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
197 return exon_pos
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
198
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
199 def init_gene():
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
200 """Initializing the gene structure
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
201 """
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
202 gene_details=dict(
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
203 id = '',
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
204 anno_id = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
205 confgenes_id = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
206 name = '',
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
207 source = '',
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
208 gene_info = {},
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
209 alias = '',
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
210 name2 = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
211 strand = '',
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
212 chr = '',
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
213 chr_num = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
214 paralogs = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
215 start = '',
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
216 stop = '',
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
217 transcripts = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
218 transcript_info = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
219 transcript_status = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
220 transcript_valid = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
221 exons = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
222 exons_confirmed = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
223 cds_exons = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
224 utr5_exons = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
225 utr3_exons = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
226 tis = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
227 tis_conf = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
228 tis_info = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
229 cdsStop = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
230 cdsStop_conf = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
231 cdsStop_info = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
232 tss = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
233 tss_info = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
234 tss_conf = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
235 cleave = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
236 cleave_info = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
237 cleave_conf = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
238 polya = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
239 polya_info = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
240 polya_conf = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
241 is_alt = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
242 is_alt_spliced = 0,
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
243 is_valid = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
244 transcript_complete = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
245 is_complete = [],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
246 is_correctly_gff3_referenced = '',
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
247 splicegraph = []
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
248 )
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
249 return gene_details
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
250
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
251 def FeatureValueFormat(singlegene):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
252 """Make feature value compactable to write in a .mat format
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
253 """
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
254 comp_exon = np.zeros((len(singlegene['exons']),), dtype=np.object)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
255 for i in range(len(singlegene['exons'])):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
256 comp_exon[i]= np.array(singlegene['exons'][i])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
257 singlegene['exons'] = comp_exon
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
258 comp_cds = np.zeros((len(singlegene['cds_exons']),), dtype=np.object)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
259 for i in range(len(singlegene['cds_exons'])):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
260 comp_cds[i]= np.array(singlegene['cds_exons'][i])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
261 singlegene['cds_exons'] = comp_cds
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
262 comp_utr3 = np.zeros((len(singlegene['utr3_exons']),), dtype=np.object)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
263 for i in range(len(singlegene['utr3_exons'])):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
264 comp_utr3[i]= np.array(singlegene['utr3_exons'][i])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
265 singlegene['utr3_exons'] = comp_utr3
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
266 comp_utr5 = np.zeros((len(singlegene['utr5_exons']),), dtype=np.object)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
267 for i in range(len(singlegene['utr5_exons'])):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
268 comp_utr5[i]= np.array(singlegene['utr5_exons'][i])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
269 singlegene['utr5_exons'] = comp_utr5
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
270 comp_transcripts = np.zeros((len(singlegene['transcripts']),), dtype=np.object)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
271 for i in range(len(singlegene['transcripts'])):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
272 comp_transcripts[i]= np.array(singlegene['transcripts'][i])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
273 singlegene['transcripts'] = comp_transcripts
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
274 comp_tss = np.zeros((len(singlegene['tss']),), dtype=np.object)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
275 for i in range(len(singlegene['tss'])):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
276 comp_tss[i]= np.array(singlegene['tss'][i])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
277 singlegene['tss'] = comp_tss
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
278 comp_tis = np.zeros((len(singlegene['tis']),), dtype=np.object)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
279 for i in range(len(singlegene['tis'])):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
280 comp_tis[i]= np.array(singlegene['tis'][i])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
281 singlegene['tis'] = comp_tis
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
282 comp_cleave = np.zeros((len(singlegene['cleave']),), dtype=np.object)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
283 for i in range(len(singlegene['cleave'])):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
284 comp_cleave[i]= np.array(singlegene['cleave'][i])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
285 singlegene['cleave'] = comp_cleave
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
286 comp_cdsStop = np.zeros((len(singlegene['cdsStop']),), dtype=np.object)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
287 for i in range(len(singlegene['cdsStop'])):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
288 comp_cdsStop[i]= np.array(singlegene['cdsStop'][i])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
289 singlegene['cdsStop'] = comp_cdsStop
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
290
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
291 return singlegene
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
292
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
293 def CreateGeneModels(genes_cmpt, transcripts_cmpt, exons_cmpt, utr3_cmpt, utr5_cmpt, cds_cmpt):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
294 """Creating Coding/Non-coding gene models from parsed GFF objects.
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
295 """
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
296 gene_counter, gene_models = 1, []
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
297 for gene_entry in genes_cmpt: ## Figure out the genes and transcripts associated feature
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
298 if gene_entry in transcripts_cmpt:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
299 gene=init_gene()
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
300 gene['id']=gene_counter
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
301 gene['name']=gene_entry[1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
302 gene['chr']=genes_cmpt[gene_entry]['chr']
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
303 gene['source']=genes_cmpt[gene_entry]['source']
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
304 gene['start']=genes_cmpt[gene_entry]['start']
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
305 gene['stop']=genes_cmpt[gene_entry]['stop']
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
306 gene['strand']=genes_cmpt[gene_entry]['strand']
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
307 if not gene['strand'] in ['+', '-']:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
308 gene['strand']='.' # Strand info not known replaced with a dot symbol instead of None, ?, . etc.
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
309 if len(transcripts_cmpt[gene_entry])>1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
310 gene['is_alt_spliced'] = 1
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
311 gene['is_alt'] = 1
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
312 gtype=[]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
313 for tids in transcripts_cmpt[gene_entry]: ## transcript section related tags
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
314 gene['transcripts'].append(tids['ID'])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
315 gtype.append(tids['type'])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
316 exon_cod, utr5_cod, utr3_cod, cds_cod = [], [], [], []
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
317 if (gene['chr'], tids['ID']) in exons_cmpt:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
318 exon_cod = [[feat_exon['start'], feat_exon['stop']] for feat_exon in exons_cmpt[(gene['chr'], tids['ID'])]]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
319 if (gene['chr'], tids['ID']) in utr5_cmpt:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
320 utr5_cod = [[feat_utr5['start'], feat_utr5['stop']] for feat_utr5 in utr5_cmpt[(gene['chr'], tids['ID'])]]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
321 if (gene['chr'], tids['ID']) in utr3_cmpt:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
322 utr3_cod = [[feat_utr3['start'], feat_utr3['stop']] for feat_utr3 in utr3_cmpt[(gene['chr'], tids['ID'])]]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
323 if (gene['chr'], tids['ID']) in cds_cmpt:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
324 cds_cod = [[feat_cds['start'], feat_cds['stop']] for feat_cds in cds_cmpt[(gene['chr'], tids['ID'])]]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
325 if len(exon_cod) == 0: ## build exon coordinates from UTR3, UTR5 and CDS
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
326 if cds_cod != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
327 exon_cod=createExon(gene['strand'], utr5_cod, cds_cod, utr3_cod)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
328
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
329 if gene['strand']=='-': ## general order to coordinates
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
330 if len(exon_cod) >1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
331 if exon_cod[0][0] > exon_cod[-1][0]:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
332 exon_cod.reverse()
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
333 if len(cds_cod) >1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
334 if cds_cod[0][0] > cds_cod[-1][0]:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
335 cds_cod.reverse()
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
336 if len(utr3_cod) >1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
337 if utr3_cod[0][0] > utr3_cod[-1][0]:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
338 utr3_cod.reverse()
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
339 if len(utr5_cod) >1:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
340 if utr5_cod[0][0] > utr5_cod[-1][0]:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
341 utr5_cod.reverse()
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
342
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
343 tis, cdsStop, tss, cleave = [], [], [], [] ## speacial sited in the gene region
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
344 if cds_cod != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
345 if gene['strand'] == '+':
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
346 tis = [cds_cod[0][0]]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
347 cdsStop = [cds_cod[-1][1]-3]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
348 elif gene['strand'] == '-':
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
349 tis = [cds_cod[-1][1]]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
350 cdsStop = [cds_cod[0][0]+3]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
351 if utr5_cod != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
352 if gene['strand'] == '+':
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
353 tss = [utr5_cod[0][0]]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
354 elif gene['strand'] == '-':
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
355 tss = [utr5_cod[-1][1]]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
356 if utr3_cod != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
357 if gene['strand'] == '+':
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
358 cleave = [utr3_cod[-1][1]]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
359 elif gene['strand'] == '-':
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
360 cleave = [utr3_cod[0][0]]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
361
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
362 cds_status, exon_status, utr_status = 0, 0, 0 ## status of the complete elements of the gene
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
363 if cds_cod != []: ## adding phase to the CDS region
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
364 cds_cod_phase = addCDSphase(gene['strand'], cds_cod)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
365 cds_status = 1
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
366 gene['cds_exons'].append(cds_cod_phase)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
367
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
368 if exon_cod != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
369 exon_status = 1
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
370 if utr5_cod != [] or utr3_cod != []:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
371 utr_status = 1
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
372 if cds_status != 0 and exon_status != 0 and utr_status != 0:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
373 gene['transcript_status'].append(1)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
374 else:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
375 gene['transcript_status'].append(0)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
376
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
377 if exon_cod: ## final check point for a valid gene model
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
378 gene['exons'].append(exon_cod)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
379 gene['utr3_exons'].append(utr3_cod)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
380 gene['utr5_exons'].append(utr5_cod)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
381 gene['tis'].append(tis)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
382 gene['cdsStop'].append(cdsStop)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
383 gene['tss'].append(tss)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
384 gene['cleave'].append(cleave)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
385
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
386 gtype=list(set(gtype)) ## different types
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
387 gene['gene_info']=dict(ID=gene_entry[1],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
388 Source=genes_cmpt[gene_entry]['source'],
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
389 Type=gtype)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
390 gene=FeatureValueFormat(gene) ## get prepare for MAT writing
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
391 gene_counter+=1
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
392 gene_models.append(gene)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
393 return gene_models
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
394
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
395 def GFFParse(gff_file):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
396 """Parsing GFF file based on feature relationship.
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
397 """
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
398 genes, utr5, exons=dict(), dict(), dict()
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
399 transcripts, utr3, cds=dict(), dict(), dict()
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
400 # TODO Include growing key words of different non-coding/coding transcripts
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
401 features=['mrna', 'transcript', 'ncRNA', 'mirna', 'pseudogenic_transcript', 'rrna', 'snorna', 'snrna', 'trna', 'scrna']
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
402 gff_handle=open(gff_file, "rU")
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
403 for gff_line in gff_handle:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
404 gff_line=gff_line.strip('\n\r').split('\t')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
405 if re.match(r'#|>', gff_line[0]): # skip commented line or fasta identifier line
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
406 continue
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
407 if len(gff_line)==1: # skip fasta sequence/empty line if present
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
408 continue
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
409 assert len(gff_line)==9, '\t'.join(gff_line) # not found 9 tab-delimited fields in this line
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
410 if '' in gff_line: # skip this line if there any field with an empty value
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
411 print 'Skipping..', '\t'.join(gff_line)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
412 continue
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
413 if gff_line[-1][-1]==';': # trim the last ';' character
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
414 gff_line[-1]=gff_line[-1].strip(';')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
415 if gff_line[2].lower() in ['gene', 'pseudogene']:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
416 gid, gene_info=None, dict()
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
417 gene_info['start']=int(gff_line[3])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
418 gene_info['stop']=int(gff_line[4])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
419 gene_info['chr']=gff_line[0]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
420 gene_info['source']=gff_line[1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
421 gene_info['strand']=gff_line[6]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
422 for attb in gff_line[-1].split(';'):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
423 attb=attb.split('=') # gff attributes are separated by key=value pair
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
424 if attb[0]=='ID':
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
425 gid=attb[1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
426 break
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
427 genes[(gff_line[0], gid)]=gene_info # store gene information based on the chromosome and gene symbol.
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
428 elif gff_line[2].lower() in features:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
429 gid, mrna_info=None, dict()
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
430 mrna_info['start']=int(gff_line[3])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
431 mrna_info['stop']=int(gff_line[4])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
432 mrna_info['chr']=gff_line[0]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
433 mrna_info['strand']=gff_line[6]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
434 mrna_info['type'] = gff_line[2]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
435 for attb in gff_line[-1].split(';'):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
436 attb=attb.split('=')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
437 if attb[0]=='Parent':
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
438 gid=attb[1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
439 elif attb[0]=='ID':
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
440 mrna_info[attb[0]]=attb[1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
441 for fid in gid.split(','): # child may be mapped to multiple parents ex: Parent=AT01,AT01-1-Protein
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
442 if (gff_line[0], fid) in transcripts:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
443 transcripts[(gff_line[0], fid)].append(mrna_info)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
444 else:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
445 transcripts[(gff_line[0], fid)]=[mrna_info]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
446 elif gff_line[2].lower() in ['exon']:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
447 tids, exon_info=None, dict()
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
448 exon_info['start']=int(gff_line[3])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
449 exon_info['stop']=int(gff_line[4])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
450 exon_info['chr']=gff_line[0]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
451 exon_info['strand']=gff_line[6]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
452 for attb in gff_line[-1].split(';'):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
453 attb=attb.split('=')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
454 if attb[0]=='Parent':
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
455 tids=attb[1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
456 break
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
457 for tid in tids.split(','):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
458 if (gff_line[0], tid) in exons:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
459 exons[(gff_line[0], tid)].append(exon_info)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
460 else:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
461 exons[(gff_line[0], tid)]=[exon_info]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
462 elif gff_line[2].lower() in ['five_prime_utr']:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
463 utr5_info, tids=dict(), None
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
464 utr5_info['start']=int(gff_line[3])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
465 utr5_info['stop']=int(gff_line[4])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
466 utr5_info['chr']=gff_line[0]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
467 utr5_info['strand']=gff_line[6]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
468 for attb in gff_line[-1].split(';'):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
469 attb=attb.split('=')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
470 if attb[0]=='Parent':
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
471 tids=attb[1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
472 break
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
473 for tid in tids.split(','):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
474 if (gff_line[0], tid) in utr5:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
475 utr5[(gff_line[0], tid)].append(utr5_info)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
476 else:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
477 utr5[(gff_line[0], tid)]=[utr5_info]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
478 elif gff_line[2].lower() in ['cds']:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
479 cds_info, tids=dict(), None
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
480 cds_info['start']=int(gff_line[3])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
481 cds_info['stop']=int(gff_line[4])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
482 cds_info['chr']=gff_line[0]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
483 cds_info['strand']=gff_line[6]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
484 for attb in gff_line[-1].split(';'):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
485 attb=attb.split('=')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
486 if attb[0]=='Parent':
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
487 tids=attb[1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
488 break
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
489 for tid in tids.split(','):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
490 if (gff_line[0], tid) in cds:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
491 cds[(gff_line[0], tid)].append(cds_info)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
492 else:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
493 cds[(gff_line[0], tid)]=[cds_info]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
494 elif gff_line[2].lower() in ['three_prime_utr']:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
495 utr3_info, tids=dict(), None
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
496 utr3_info['start']=int(gff_line[3])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
497 utr3_info['stop']=int(gff_line[4])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
498 utr3_info['chr']=gff_line[0]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
499 utr3_info['strand']=gff_line[6]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
500 for attb in gff_line[-1].split(';'):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
501 attb=attb.split('=')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
502 if attb[0]=='Parent':
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
503 tids=attb[1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
504 break
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
505 for tid in tids.split(','):
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
506 if (gff_line[0], tid) in utr3:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
507 utr3[(gff_line[0], tid)].append(utr3_info)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
508 else:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
509 utr3[(gff_line[0], tid)]=[utr3_info]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
510 gff_handle.close()
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
511 return genes, transcripts, exons, utr3, utr5, cds
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
512
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
513 def __main__():
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
514 """extract genome feature information
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
515 """
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
516 try:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
517 gff_file = sys.argv[1]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
518 mat_file = sys.argv[2]
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
519 except:
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
520 print __doc__
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
521 sys.exit(-1)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
522
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
523 genes, transcripts, exons, utr3, utr5, cds = GFFParse(gff_file)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
524 gene_models = CreateGeneModels(genes, transcripts, exons, utr3, utr5, cds)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
525 # TODO Write to matlab/octave struct instead of cell arrays.
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
526 sio.savemat(mat_file,
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
527 mdict=dict(genes=gene_models),
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
528 format='5',
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
529 oned_as='row')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
530
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
531 if __name__=='__main__':
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
532 __main__()