Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/SubexonGraph.hpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #ifndef _MOURISL_CLASSES_SUBEXONGRAPH_HEADER | |
2 #define _MOURISL_CLASSES_SUBEXONGRAPH_HEADER | |
3 | |
4 #include "alignments.hpp" | |
5 #include "blocks.hpp" | |
6 | |
7 struct _subexon | |
8 { | |
9 int chrId ; | |
10 int geneId ; | |
11 int start, end ; | |
12 int leftType, rightType ; | |
13 double avgDepth ; | |
14 //double ratio, classifier ; | |
15 double leftRatio, rightRatio ; | |
16 double leftClassifier, rightClassifier ; | |
17 int lcCnt, rcCnt ; | |
18 int leftStrand, rightStrand ; | |
19 | |
20 int nextCnt, prevCnt ; | |
21 int *next, *prev ; | |
22 | |
23 bool canBeStart, canBeEnd ; | |
24 } ; | |
25 | |
26 struct _geneInterval | |
27 { | |
28 int startIdx, endIdx ; | |
29 int start, end ; // The start and end of a gene interval might be adjusted, so it does not | |
30 // need to be match with the corresponding subexons | |
31 } ; | |
32 | |
33 class SubexonGraph | |
34 { | |
35 private: | |
36 int *visit ; | |
37 double classifierThreshold ; | |
38 | |
39 int usedGeneId ; | |
40 int baseGeneId ; | |
41 | |
42 // The function to assign gene ids to subexons. | |
43 void SetGeneId( int tag, int strand, struct _subexon *subexons, int seCnt, int id ) ; | |
44 void GetGeneBoundary( int tag, int &boundary, int timeStamp ) ; | |
45 void UpdateGeneId( struct _subexon *subexons, int seCnt ) ; | |
46 public: | |
47 std::vector<struct _subexon> subexons ; | |
48 std::vector<struct _geneInterval> geneIntervals ; | |
49 | |
50 ~SubexonGraph() | |
51 { | |
52 int i ; | |
53 int size = subexons.size() ; | |
54 for ( i = 0 ; i < size ; ++i ) | |
55 { | |
56 if ( subexons[i].next ) | |
57 delete[] subexons[i].next ; | |
58 if ( subexons[i].prev ) | |
59 delete[] subexons[i].prev ; | |
60 } | |
61 } | |
62 | |
63 SubexonGraph( double classifierThreshold, Alignments &bam, FILE *fpSubexon ) | |
64 { | |
65 // Read in the subexons | |
66 rewind( fpSubexon ) ; | |
67 char buffer[2048] ; | |
68 int subexonCnt ; | |
69 int i, j, k ; | |
70 while ( fgets( buffer, sizeof( buffer ), fpSubexon ) != NULL ) | |
71 { | |
72 if ( buffer[0] == '#' ) | |
73 continue ; | |
74 | |
75 struct _subexon se ; | |
76 InputSubexon( buffer, bam, se, true ) ; | |
77 | |
78 // filter. | |
79 if ( ( se.leftType == 0 && se.rightType == 0 ) | |
80 || ( se.leftType == 0 && se.rightType == 1 ) // overhang | |
81 || ( se.leftType == 2 && se.rightType == 0 ) // overhang | |
82 || ( se.leftType == 2 && se.rightType == 1 ) ) // ir | |
83 { | |
84 if ( ( se.leftType == 0 && se.rightType == 1 ) | |
85 || ( se.leftType == 2 && se.rightType == 0 ) ) // if the overhang is too small | |
86 { | |
87 if ( se.end - se.start + 1 <= 7 ) | |
88 { | |
89 if ( se.next ) | |
90 delete[] se.next ; | |
91 if ( se.prev ) | |
92 delete[] se.prev ; | |
93 continue ; | |
94 } | |
95 } | |
96 | |
97 if ( se.leftClassifier >= classifierThreshold || se.leftClassifier < 0 ) | |
98 { | |
99 if ( se.next ) | |
100 delete[] se.next ; | |
101 if ( se.prev ) | |
102 delete[] se.prev ; | |
103 continue ; | |
104 } | |
105 } | |
106 | |
107 // Adjust the coordinate. | |
108 subexons.push_back( se ) ; | |
109 } | |
110 | |
111 // Convert the coordinate to index | |
112 // Note that each coordinate can only associate with one subexon. | |
113 subexonCnt = subexons.size() ; | |
114 for ( i = 0 ; i < subexonCnt ; ++i ) | |
115 { | |
116 struct _subexon &se = subexons[i] ; | |
117 //printf( "hi1 %d: %d %d\n", i, se.prevCnt, se.prev[0] ) ; | |
118 int cnt = 0 ; | |
119 | |
120 // due to filter, we may not fully match the coordinate and the subexon | |
121 int bound = 0 ; | |
122 if ( se.prevCnt > 0 ) | |
123 bound = se.prev[0] ; | |
124 for ( j = i - 1, k = 0 ; k < se.prevCnt && j >= 0 && subexons[j].end >= bound ; --j ) | |
125 { | |
126 //printf( " %d %d: %d %d\n", j, k, se.prev[ se.prevCnt - 1 - k], subexons[j].end ) ; | |
127 if ( subexons[j].end == se.prev[se.prevCnt - 1 - k] ) // notice the order is reversed | |
128 { | |
129 se.prev[se.prevCnt - 1 - cnt] = j ; | |
130 ++k ; | |
131 ++cnt ; | |
132 } | |
133 else if ( subexons[j].end < se.prev[ se.prevCnt - 1 - k ] ) // the corresponding subexon gets filtered. | |
134 { | |
135 ++k ; | |
136 ++j ; // counter the --j in the loop | |
137 } | |
138 } | |
139 //printf( "hi2 %d : %d\n", i, se.prevCnt ) ; | |
140 // shft the list | |
141 for ( j = 0, k = se.prevCnt - cnt ; j < cnt ; ++j, ++k ) | |
142 { | |
143 se.prev[j] = se.prev[k] ; | |
144 } | |
145 se.prevCnt = cnt ; | |
146 cnt = 0 ; | |
147 if ( se.nextCnt > 0 ) | |
148 bound = se.next[ se.nextCnt - 1] ; | |
149 for ( j = i + 1, k = 0 ; k < se.nextCnt && j < subexonCnt && subexons[j].start <= bound ; ++j ) | |
150 { | |
151 if ( subexons[j].start == se.next[k] ) | |
152 { | |
153 se.next[cnt] = j ; // cnt is always less than k, so we don't need to worry about overwrite. | |
154 ++k ; | |
155 ++cnt ; | |
156 } | |
157 else if ( subexons[j].start > se.next[k] ) | |
158 { | |
159 ++k ; | |
160 --j ; | |
161 } | |
162 } | |
163 se.nextCnt = cnt ; | |
164 } | |
165 | |
166 // Adjust the coordinate | |
167 int seCnt = subexons.size() ; | |
168 for ( i = 0 ; i < seCnt ; ++i ) | |
169 { | |
170 --subexons[i].start ; | |
171 --subexons[i].end ; | |
172 } | |
173 rewind( fpSubexon ) ; | |
174 | |
175 // Adjust the classifier for hard boundary, if there is a overhang attached to that region. | |
176 for ( i = 0 ; i < seCnt ; ++i ) | |
177 { | |
178 if ( subexons[i].leftType == 1 && subexons[i].leftClassifier < 1 ) | |
179 { | |
180 for ( j = i - 1 ; j >= 0 ; --j ) | |
181 if ( subexons[j].end < subexons[j + 1].start - 1 ) | |
182 break ; | |
183 if ( subexons[j + 1].leftType == 0 ) | |
184 subexons[i].leftClassifier = 1 ; | |
185 } | |
186 if ( subexons[i].rightType == 2 && subexons[i].rightClassifier < 1 ) | |
187 { | |
188 for ( j = i + 1 ; j < seCnt ; ++j ) | |
189 if ( subexons[j].start > subexons[j - 1].end + 1 ) | |
190 break ; | |
191 if ( subexons[j - 1].rightType == 0 ) | |
192 subexons[i].rightClassifier = 1 ; | |
193 } | |
194 } | |
195 | |
196 // For the region of mixture of plus and minus strand subexons, if there is | |
197 // no overhang attached to it, we need to let the hard boundary be a candidate terminal sites. | |
198 for ( i = 0 ; i < seCnt ; ) | |
199 { | |
200 // [i,j) is a region | |
201 int support[2] = {0, 0} ; // the index, 0 is for minus strand, 1 is for plus strand | |
202 for ( j = i + 1 ; j < seCnt ; ++j ) | |
203 { | |
204 if ( subexons[j].start > subexons[j - 1].end + 1 ) | |
205 break ; | |
206 } | |
207 | |
208 for ( k = i ; k < j ; ++k ) | |
209 { | |
210 if ( subexons[k].leftStrand != 0 ) | |
211 ++support[ ( subexons[k].leftStrand + 1 ) / 2 ] ; | |
212 if ( subexons[k].rightStrand != 0 ) | |
213 ++support[ ( subexons[k].rightStrand + 1 ) / 2 ] ; | |
214 } | |
215 if ( support[0] == 0 || support[1] == 0 ) | |
216 { | |
217 i = j ; | |
218 continue ; | |
219 } | |
220 // a mixture region. | |
221 // We force a terminal site if we have only coming-in and no going-out introns. | |
222 int leftSupport[2] = {0, 0}, rightSupport[2] = {0, 0}; | |
223 int l ; | |
224 for ( k = i ; k < j ; ++k ) | |
225 { | |
226 int cnt = subexons[k].prevCnt ; | |
227 if ( subexons[k].leftStrand != 0 ) | |
228 for ( l = 0 ; l < cnt ; ++l ) | |
229 if ( subexons[k].prev[l] < i ) | |
230 { | |
231 ++leftSupport[ ( subexons[k].leftStrand + 1 ) / 2 ] ; | |
232 break ; | |
233 } | |
234 cnt = subexons[k].nextCnt ; | |
235 if ( subexons[k].rightStrand != 0 ) | |
236 for ( l = 0 ; l < cnt ; ++l ) | |
237 if ( subexons[k].next[l] >= j ) | |
238 { | |
239 ++rightSupport[ ( subexons[k].rightStrand + 1 ) / 2 ] ; | |
240 break ; | |
241 } | |
242 } | |
243 | |
244 if ( ( ( leftSupport[0] > 0 && rightSupport[0] == 0 ) || | |
245 ( leftSupport[1] > 0 && rightSupport[1] == 0 ) ) && | |
246 subexons[j - 1].rightType != 0 ) | |
247 { | |
248 subexons[j - 1].rightClassifier = 0 ; | |
249 } | |
250 | |
251 if ( ( ( leftSupport[0] == 0 && rightSupport[0] > 0 ) || | |
252 ( leftSupport[1] == 0 && rightSupport[1] > 0 ) ) && | |
253 subexons[j - 1].leftType != 0 ) | |
254 { | |
255 subexons[j - 1].leftClassifier = 0 ; | |
256 } | |
257 | |
258 i = j ; | |
259 } | |
260 | |
261 this->classifierThreshold = classifierThreshold ; | |
262 | |
263 usedGeneId = baseGeneId = 0 ; | |
264 } | |
265 | |
266 static bool IsSameStrand( int a, int b ) | |
267 { | |
268 if ( a == 0 || b == 0 ) | |
269 return true ; | |
270 if ( a != b ) | |
271 return false ; | |
272 return true ; | |
273 } | |
274 // Parse the input line | |
275 static int InputSubexon( char *in, Alignments &alignments, struct _subexon &se, bool needPrevNext = false ) | |
276 { | |
277 int i ; | |
278 char chrName[50] ; | |
279 char ls[3], rs[3] ; | |
280 sscanf( in, "%s %d %d %d %d %s %s %lf %lf %lf %lf %lf", chrName, &se.start, &se.end, &se.leftType, &se.rightType, ls, rs, | |
281 &se.avgDepth, &se.leftRatio, &se.rightRatio, | |
282 &se.leftClassifier, &se.rightClassifier ) ; | |
283 se.chrId = alignments.GetChromIdFromName( chrName ) ; | |
284 se.nextCnt = se.prevCnt = 0 ; | |
285 se.next = se.prev = NULL ; | |
286 se.lcCnt = se.rcCnt = 0 ; | |
287 | |
288 if ( ls[0] == '+' ) | |
289 se.leftStrand = 1 ; | |
290 else if ( ls[0] == '-' ) | |
291 se.leftStrand = -1 ; | |
292 else | |
293 se.leftStrand = 0 ; | |
294 | |
295 if ( rs[0] == '+' ) | |
296 se.rightStrand = 1 ; | |
297 else if ( rs[0] == '-' ) | |
298 se.rightStrand = -1 ; | |
299 else | |
300 se.rightStrand = 0 ; | |
301 | |
302 if ( needPrevNext ) | |
303 { | |
304 char *p = in ; | |
305 // Locate the offset for prevCnt | |
306 for ( i = 0 ; i <= 11 ; ++i ) | |
307 { | |
308 p = strchr( p, ' ' ) ; | |
309 ++p ; | |
310 } | |
311 | |
312 sscanf( p, "%d", &se.prevCnt ) ; | |
313 p = strchr( p, ' ' ) ; | |
314 ++p ; | |
315 se.prev = new int[ se.prevCnt ] ; | |
316 for ( i = 0 ; i < se.prevCnt ; ++i ) | |
317 { | |
318 sscanf( p, "%d", &se.prev[i] ) ; | |
319 p = strchr( p, ' ' ) ; | |
320 ++p ; | |
321 } | |
322 | |
323 sscanf( p, "%d", &se.nextCnt ) ; | |
324 p = strchr( p, ' ' ) ; | |
325 ++p ; | |
326 se.next = new int[ se.nextCnt ] ; | |
327 for ( i = 0 ; i < se.nextCnt ; ++i ) | |
328 { | |
329 sscanf( p, "%d", &se.next[i] ) ; | |
330 p = strchr( p, ' ' ) ; | |
331 ++p ; | |
332 } | |
333 | |
334 } | |
335 return 1 ; | |
336 } | |
337 | |
338 int GetGeneIntervalIdx( int startIdx, int &endIdx, int timeStamp ) ; | |
339 | |
340 //@return: the number of intervals found | |
341 int ComputeGeneIntervals() ; | |
342 | |
343 // Return a list of subexons in that interval and in retList the id of subexon | |
344 // should be adjusted to start from 0. | |
345 int ExtractSubexons( int startIdx, int endIdx, struct _subexon *retList ) ; | |
346 } ; | |
347 | |
348 #endif |