0
|
1 #include "SubexonGraph.hpp"
|
|
2
|
|
3 void SubexonGraph::GetGeneBoundary( int tag, int &boundary, int timeStamp )
|
|
4 {
|
|
5 if ( visit[tag] == timeStamp )
|
|
6 return ;
|
|
7 //printf( "%d %d\n", tag, timeStamp ) ;
|
|
8 visit[tag] = timeStamp ;
|
|
9 if ( subexons[tag].end > boundary )
|
|
10 boundary = subexons[tag].end ;
|
|
11 //if ( subexons[tag].start == 2858011 )
|
|
12 // printf( "%d: %d %d\n", tag, subexons[tag].nextCnt, subexons[tag].prevCnt) ;
|
|
13 int i ;
|
|
14 int cnt = subexons[tag].nextCnt ;
|
|
15 for ( i = 0 ; i < cnt ; ++i )
|
|
16 {
|
|
17 //printf( "next of %d: %d %d\n", tag, i, subexons[tag].next[i] ) ;
|
|
18 GetGeneBoundary( subexons[tag].next[i], boundary, timeStamp ) ;
|
|
19 }
|
|
20 }
|
|
21
|
|
22 int SubexonGraph::GetGeneIntervalIdx( int startIdx, int &endIdx, int timeStamp )
|
|
23 {
|
|
24 int i ;
|
|
25 int seCnt = subexons.size() ;
|
|
26 if ( startIdx >= seCnt )
|
|
27 return -1 ;
|
|
28 int farthest = -1 ;
|
|
29 GetGeneBoundary( startIdx, farthest, timeStamp ) ;
|
|
30
|
|
31 for ( i = startIdx + 1 ; i < seCnt ; ++i )
|
|
32 {
|
|
33 if ( subexons[i].start > farthest || subexons[i].chrId != subexons[ startIdx ].chrId )
|
|
34 break ;
|
|
35
|
|
36 GetGeneBoundary( i, farthest, timeStamp ) ;
|
|
37 }
|
|
38 endIdx = i - 1 ;
|
|
39
|
|
40 return endIdx ;
|
|
41 }
|
|
42
|
|
43 int SubexonGraph::ComputeGeneIntervals()
|
|
44 {
|
|
45 int i, cnt ;
|
|
46 int seCnt = subexons.size() ;
|
|
47 visit = new int[seCnt] ;
|
|
48 memset( visit, -1, sizeof( int ) * seCnt ) ;
|
|
49 int tag = 0 ;
|
|
50 cnt = 0 ;
|
|
51 while ( 1 )
|
|
52 {
|
|
53 struct _geneInterval ngi ;
|
|
54 //printf( "%d %d %d\n", tag, subexons[tag].start + 1, subexons[tag].end + 1 ) ;
|
|
55 if ( GetGeneIntervalIdx( tag, ngi.endIdx, cnt ) == -1 )
|
|
56 break ;
|
|
57 ++cnt ;
|
|
58 ngi.startIdx = tag ;
|
|
59 ngi.start = subexons[ ngi.startIdx ].start ;
|
|
60 ngi.end = subexons[ ngi.endIdx ].end ;
|
|
61
|
|
62 tag = ngi.endIdx + 1 ;
|
|
63 // Adjust the extent
|
|
64 // Adjust the start
|
|
65 if ( subexons[ ngi.startIdx ].leftStrand != 0
|
|
66 && subexons[ngi.startIdx].leftStrand != subexons[ngi.startIdx ].rightStrand )
|
|
67 // We should make sure that rightstrand is non-zero whenever left-strand is non-zero for the startIdx.
|
|
68 {
|
|
69 for ( i = ngi.startIdx ; i >= 0 ; --i )
|
|
70 {
|
|
71 if ( ( subexons[i].leftType == 1 && subexons[i].leftClassifier < classifierThreshold ) // an end within the subexon
|
|
72 || ( subexons[i].leftType == 0 ) // probably a overhang subexon. It should be a subset of the criterion following.
|
|
73 || ( i > 0 && subexons[i - 1].end + 1 < subexons[i].start ) ) // a gap.
|
|
74 break ;
|
|
75 }
|
|
76 ngi.start = subexons[i].start ;
|
|
77 }
|
|
78
|
|
79 // Adjust the end.
|
|
80 // And here, we also need to decide wether we need to adjust "tag" or not,
|
|
81 // because the next interval might be overlap with current interval by the last subexon.
|
|
82 // We solve the overlap genes now, so we DON'T need to adjust tag.
|
|
83 if ( subexons[ ngi.endIdx ].rightStrand != 0
|
|
84 && subexons[ngi.endIdx].leftStrand != subexons[ngi.endIdx ].rightStrand )
|
|
85 {
|
|
86 for ( i = ngi.endIdx ; i < seCnt ; ++i )
|
|
87 {
|
|
88 if ( ( subexons[i].rightType == 2 && subexons[i].rightClassifier < classifierThreshold ) // an end within the subexon
|
|
89 || ( subexons[i].rightType == 0 ) // probably a overhang subexon.
|
|
90 || ( i < seCnt - 1 && subexons[i].end + 1 < subexons[i + 1].start ) ) // a gap
|
|
91 break ;
|
|
92 }
|
|
93 ngi.end = subexons[i].end ;
|
|
94
|
|
95 /*if ( subexons[ ngi.endIdx ].rightType == 2 )
|
|
96 {
|
|
97 for ( i = ngi.endIdx ; i >= ngi.startIdx ; --i )
|
|
98 {
|
|
99 if ( subexons[i].leftType == 1 )
|
|
100 break ;
|
|
101 }
|
|
102 // The last region overlapps.
|
|
103 if ( i >= ngi.startIdx && subexons[i].leftStrand != subexons[ ngi.endIdx ].rightStrand )
|
|
104 --tag ;
|
|
105 }*/
|
|
106 }
|
|
107 geneIntervals.push_back( ngi ) ;
|
|
108 }
|
|
109 delete[] visit ;
|
|
110
|
|
111 return cnt ;
|
|
112 }
|
|
113
|
|
114 int SubexonGraph::ExtractSubexons( int startIdx, int endIdx, struct _subexon *retList )
|
|
115 {
|
|
116 int i, j, k ;
|
|
117 int cnt = endIdx - startIdx + 1 ;
|
|
118 //printf( "%s: %d %d %d\n", __func__, startIdx, endIdx, cnt ) ;
|
|
119 for ( i = 0 ; i < cnt ; ++i )
|
|
120 {
|
|
121 retList[i] = subexons[i + startIdx] ;
|
|
122 retList[i].geneId = -1 ;
|
|
123 retList[i].prev = new int[ retList[i].prevCnt ] ;
|
|
124 retList[i].next = new int[ retList[i].nextCnt ] ;
|
|
125
|
|
126 for ( j = 0 ; j < retList[i].prevCnt ; ++j )
|
|
127 retList[i].prev[j] = subexons[i + startIdx].prev[j] - startIdx ;
|
|
128 for ( j = 0 ; j < retList[i].nextCnt ; ++j )
|
|
129 retList[i].next[j] = subexons[i + startIdx].next[j] - startIdx ;
|
|
130
|
|
131 for ( j = 0, k = 0 ; j < retList[i].prevCnt ; ++j )
|
|
132 if ( retList[i].prev[j] >= 0 && retList[i].prev[j] < cnt )
|
|
133 {
|
|
134 retList[i].prev[k] = retList[i].prev[j] ;
|
|
135 ++k ;
|
|
136 }
|
|
137 retList[i].prevCnt = k ;
|
|
138
|
|
139 for ( j = 0, k = 0 ; j < retList[i].nextCnt ; ++j )
|
|
140 if ( retList[i].next[j] >= 0 && retList[i].next[j] < cnt )
|
|
141 {
|
|
142 retList[i].next[k] = retList[i].next[j] ;
|
|
143 ++k ;
|
|
144 }
|
|
145 retList[i].nextCnt = k ;
|
|
146 }
|
|
147 UpdateGeneId( retList, cnt ) ;
|
|
148 return cnt ;
|
|
149 }
|
|
150
|
|
151 void SubexonGraph::SetGeneId( int tag, int strand, struct _subexon *subexons, int seCnt, int id )
|
|
152 {
|
|
153 if ( subexons[tag].geneId != -1 && subexons[tag].geneId != -2 )
|
|
154 {
|
|
155 if ( subexons[tag].geneId != id ) // a subexon may belong to more than one gene.
|
|
156 {
|
|
157 //printf( "Set -2, %d: %d %d %d %d\n", id, tag, subexons[tag].geneId, subexons[tag].start + 1, strand ) ;
|
|
158 subexons[tag].geneId = -2 ;
|
|
159 }
|
|
160 else
|
|
161 return ;
|
|
162 // There is no need to terminate at the ambiguous exon, the strand will prevent
|
|
163 // us from overwriting previous gene ids.
|
|
164 //return ;
|
|
165 }
|
|
166 else if ( subexons[tag].geneId == -2 )
|
|
167 return ;
|
|
168 //printf( "%d: %d %d %d %d\n", id, tag, subexons[tag].geneId, subexons[tag].start + 1, strand ) ;
|
|
169 int i ;
|
|
170 if ( subexons[tag].geneId != -2 )
|
|
171 subexons[ tag ].geneId = id ;
|
|
172 int cnt = subexons[tag].nextCnt ;
|
|
173 // Set through the introns.
|
|
174 if ( IsSameStrand( strand, subexons[tag].rightStrand ) )
|
|
175 {
|
|
176 for ( i = 0 ; i < cnt ; ++i )
|
|
177 if ( subexons[ subexons[tag].next[i] ].start > subexons[tag].end + 1 )
|
|
178 SetGeneId( subexons[tag].next[i], strand, subexons, seCnt, id ) ;
|
|
179 }
|
|
180
|
|
181 cnt = subexons[tag].prevCnt ;
|
|
182 if ( IsSameStrand( strand, subexons[tag].leftStrand ) )
|
|
183 {
|
|
184 for ( i = 0 ; i < cnt ; ++i )
|
|
185 if ( subexons[ subexons[tag].prev[i] ].end < subexons[tag].start - 1 )
|
|
186 SetGeneId( subexons[tag].prev[i], strand, subexons, seCnt, id ) ;
|
|
187 }
|
|
188
|
|
189 // Set through the adjacent subexons.
|
|
190 if ( tag < seCnt - 1 && subexons[tag + 1].start == subexons[tag].end + 1 )
|
|
191 {
|
|
192 SetGeneId( tag + 1, strand, subexons, seCnt, id ) ;
|
|
193 }
|
|
194
|
|
195 if ( tag > 0 && subexons[tag].start - 1 == subexons[tag - 1].end )
|
|
196 {
|
|
197 SetGeneId( tag - 1, strand, subexons, seCnt, id ) ;
|
|
198 }
|
|
199
|
|
200 }
|
|
201
|
|
202 void SubexonGraph::UpdateGeneId( struct _subexon *subexons, int seCnt )
|
|
203 {
|
|
204 int i ;
|
|
205 baseGeneId = usedGeneId ;
|
|
206 int lastMinusStrandGeneId = -1 ;
|
|
207 for ( int strand = -1 ; strand <= 1 ; strand +=2 )
|
|
208 {
|
|
209 for ( i = 0 ; i < seCnt ; ++i )
|
|
210 {
|
|
211 //printf( "%d (%d %d) %d.\n", i, subexons[i].start + 1, subexons[i].end + 1, subexons[i].geneId ) ;
|
|
212 if ( ( subexons[i].geneId == -1 && ( ( strand == 1 && subexons[i].rightStrand == 0 ) || subexons[i].rightStrand == strand ) )
|
|
213 || ( strand == 1 && baseGeneId <= subexons[i].geneId && subexons[i].geneId <= lastMinusStrandGeneId && subexons[i].rightStrand == strand ) )
|
|
214 {
|
|
215 SetGeneId( i, strand, subexons, seCnt, usedGeneId ) ;
|
|
216 if ( strand == -1 )
|
|
217 lastMinusStrandGeneId = usedGeneId ;
|
|
218 ++usedGeneId ;
|
|
219 }
|
|
220 }
|
|
221 }
|
|
222
|
|
223 for ( i = 0 ; i < seCnt ; ++i )
|
|
224 if ( subexons[i].leftType == 0 && subexons[i].rightType == 0 )
|
|
225 {
|
|
226 subexons[i].geneId = usedGeneId ;
|
|
227 ++usedGeneId ;
|
|
228 }
|
|
229 // Put base and usedGeneId in lcCnt, rcCnt field.
|
|
230 for ( i = 0 ; i < seCnt ; ++i )
|
|
231 {
|
|
232 subexons[i].lcCnt = baseGeneId ;
|
|
233 subexons[i].rcCnt = usedGeneId ;
|
|
234 }
|
|
235
|
|
236 /*for ( i = 0 ; i < seCnt ; ++i )
|
|
237 {
|
|
238 printf( "geneId %d: %d-%d %d\n", i, subexons[i].start + 1, subexons[i].end + 1, subexons[i].geneId ) ;
|
|
239 }
|
|
240 printf("%d %d\n", baseGeneId, usedGeneId ) ;*/
|
|
241 }
|