Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/SubexonGraph.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #include "SubexonGraph.hpp" | |
2 | |
3 void SubexonGraph::GetGeneBoundary( int tag, int &boundary, int timeStamp ) | |
4 { | |
5 if ( visit[tag] == timeStamp ) | |
6 return ; | |
7 //printf( "%d %d\n", tag, timeStamp ) ; | |
8 visit[tag] = timeStamp ; | |
9 if ( subexons[tag].end > boundary ) | |
10 boundary = subexons[tag].end ; | |
11 //if ( subexons[tag].start == 2858011 ) | |
12 // printf( "%d: %d %d\n", tag, subexons[tag].nextCnt, subexons[tag].prevCnt) ; | |
13 int i ; | |
14 int cnt = subexons[tag].nextCnt ; | |
15 for ( i = 0 ; i < cnt ; ++i ) | |
16 { | |
17 //printf( "next of %d: %d %d\n", tag, i, subexons[tag].next[i] ) ; | |
18 GetGeneBoundary( subexons[tag].next[i], boundary, timeStamp ) ; | |
19 } | |
20 } | |
21 | |
22 int SubexonGraph::GetGeneIntervalIdx( int startIdx, int &endIdx, int timeStamp ) | |
23 { | |
24 int i ; | |
25 int seCnt = subexons.size() ; | |
26 if ( startIdx >= seCnt ) | |
27 return -1 ; | |
28 int farthest = -1 ; | |
29 GetGeneBoundary( startIdx, farthest, timeStamp ) ; | |
30 | |
31 for ( i = startIdx + 1 ; i < seCnt ; ++i ) | |
32 { | |
33 if ( subexons[i].start > farthest || subexons[i].chrId != subexons[ startIdx ].chrId ) | |
34 break ; | |
35 | |
36 GetGeneBoundary( i, farthest, timeStamp ) ; | |
37 } | |
38 endIdx = i - 1 ; | |
39 | |
40 return endIdx ; | |
41 } | |
42 | |
43 int SubexonGraph::ComputeGeneIntervals() | |
44 { | |
45 int i, cnt ; | |
46 int seCnt = subexons.size() ; | |
47 visit = new int[seCnt] ; | |
48 memset( visit, -1, sizeof( int ) * seCnt ) ; | |
49 int tag = 0 ; | |
50 cnt = 0 ; | |
51 while ( 1 ) | |
52 { | |
53 struct _geneInterval ngi ; | |
54 //printf( "%d %d %d\n", tag, subexons[tag].start + 1, subexons[tag].end + 1 ) ; | |
55 if ( GetGeneIntervalIdx( tag, ngi.endIdx, cnt ) == -1 ) | |
56 break ; | |
57 ++cnt ; | |
58 ngi.startIdx = tag ; | |
59 ngi.start = subexons[ ngi.startIdx ].start ; | |
60 ngi.end = subexons[ ngi.endIdx ].end ; | |
61 | |
62 tag = ngi.endIdx + 1 ; | |
63 // Adjust the extent | |
64 // Adjust the start | |
65 if ( subexons[ ngi.startIdx ].leftStrand != 0 | |
66 && subexons[ngi.startIdx].leftStrand != subexons[ngi.startIdx ].rightStrand ) | |
67 // We should make sure that rightstrand is non-zero whenever left-strand is non-zero for the startIdx. | |
68 { | |
69 for ( i = ngi.startIdx ; i >= 0 ; --i ) | |
70 { | |
71 if ( ( subexons[i].leftType == 1 && subexons[i].leftClassifier < classifierThreshold ) // an end within the subexon | |
72 || ( subexons[i].leftType == 0 ) // probably a overhang subexon. It should be a subset of the criterion following. | |
73 || ( i > 0 && subexons[i - 1].end + 1 < subexons[i].start ) ) // a gap. | |
74 break ; | |
75 } | |
76 ngi.start = subexons[i].start ; | |
77 } | |
78 | |
79 // Adjust the end. | |
80 // And here, we also need to decide wether we need to adjust "tag" or not, | |
81 // because the next interval might be overlap with current interval by the last subexon. | |
82 // We solve the overlap genes now, so we DON'T need to adjust tag. | |
83 if ( subexons[ ngi.endIdx ].rightStrand != 0 | |
84 && subexons[ngi.endIdx].leftStrand != subexons[ngi.endIdx ].rightStrand ) | |
85 { | |
86 for ( i = ngi.endIdx ; i < seCnt ; ++i ) | |
87 { | |
88 if ( ( subexons[i].rightType == 2 && subexons[i].rightClassifier < classifierThreshold ) // an end within the subexon | |
89 || ( subexons[i].rightType == 0 ) // probably a overhang subexon. | |
90 || ( i < seCnt - 1 && subexons[i].end + 1 < subexons[i + 1].start ) ) // a gap | |
91 break ; | |
92 } | |
93 ngi.end = subexons[i].end ; | |
94 | |
95 /*if ( subexons[ ngi.endIdx ].rightType == 2 ) | |
96 { | |
97 for ( i = ngi.endIdx ; i >= ngi.startIdx ; --i ) | |
98 { | |
99 if ( subexons[i].leftType == 1 ) | |
100 break ; | |
101 } | |
102 // The last region overlapps. | |
103 if ( i >= ngi.startIdx && subexons[i].leftStrand != subexons[ ngi.endIdx ].rightStrand ) | |
104 --tag ; | |
105 }*/ | |
106 } | |
107 geneIntervals.push_back( ngi ) ; | |
108 } | |
109 delete[] visit ; | |
110 | |
111 return cnt ; | |
112 } | |
113 | |
114 int SubexonGraph::ExtractSubexons( int startIdx, int endIdx, struct _subexon *retList ) | |
115 { | |
116 int i, j, k ; | |
117 int cnt = endIdx - startIdx + 1 ; | |
118 //printf( "%s: %d %d %d\n", __func__, startIdx, endIdx, cnt ) ; | |
119 for ( i = 0 ; i < cnt ; ++i ) | |
120 { | |
121 retList[i] = subexons[i + startIdx] ; | |
122 retList[i].geneId = -1 ; | |
123 retList[i].prev = new int[ retList[i].prevCnt ] ; | |
124 retList[i].next = new int[ retList[i].nextCnt ] ; | |
125 | |
126 for ( j = 0 ; j < retList[i].prevCnt ; ++j ) | |
127 retList[i].prev[j] = subexons[i + startIdx].prev[j] - startIdx ; | |
128 for ( j = 0 ; j < retList[i].nextCnt ; ++j ) | |
129 retList[i].next[j] = subexons[i + startIdx].next[j] - startIdx ; | |
130 | |
131 for ( j = 0, k = 0 ; j < retList[i].prevCnt ; ++j ) | |
132 if ( retList[i].prev[j] >= 0 && retList[i].prev[j] < cnt ) | |
133 { | |
134 retList[i].prev[k] = retList[i].prev[j] ; | |
135 ++k ; | |
136 } | |
137 retList[i].prevCnt = k ; | |
138 | |
139 for ( j = 0, k = 0 ; j < retList[i].nextCnt ; ++j ) | |
140 if ( retList[i].next[j] >= 0 && retList[i].next[j] < cnt ) | |
141 { | |
142 retList[i].next[k] = retList[i].next[j] ; | |
143 ++k ; | |
144 } | |
145 retList[i].nextCnt = k ; | |
146 } | |
147 UpdateGeneId( retList, cnt ) ; | |
148 return cnt ; | |
149 } | |
150 | |
151 void SubexonGraph::SetGeneId( int tag, int strand, struct _subexon *subexons, int seCnt, int id ) | |
152 { | |
153 if ( subexons[tag].geneId != -1 && subexons[tag].geneId != -2 ) | |
154 { | |
155 if ( subexons[tag].geneId != id ) // a subexon may belong to more than one gene. | |
156 { | |
157 //printf( "Set -2, %d: %d %d %d %d\n", id, tag, subexons[tag].geneId, subexons[tag].start + 1, strand ) ; | |
158 subexons[tag].geneId = -2 ; | |
159 } | |
160 else | |
161 return ; | |
162 // There is no need to terminate at the ambiguous exon, the strand will prevent | |
163 // us from overwriting previous gene ids. | |
164 //return ; | |
165 } | |
166 else if ( subexons[tag].geneId == -2 ) | |
167 return ; | |
168 //printf( "%d: %d %d %d %d\n", id, tag, subexons[tag].geneId, subexons[tag].start + 1, strand ) ; | |
169 int i ; | |
170 if ( subexons[tag].geneId != -2 ) | |
171 subexons[ tag ].geneId = id ; | |
172 int cnt = subexons[tag].nextCnt ; | |
173 // Set through the introns. | |
174 if ( IsSameStrand( strand, subexons[tag].rightStrand ) ) | |
175 { | |
176 for ( i = 0 ; i < cnt ; ++i ) | |
177 if ( subexons[ subexons[tag].next[i] ].start > subexons[tag].end + 1 ) | |
178 SetGeneId( subexons[tag].next[i], strand, subexons, seCnt, id ) ; | |
179 } | |
180 | |
181 cnt = subexons[tag].prevCnt ; | |
182 if ( IsSameStrand( strand, subexons[tag].leftStrand ) ) | |
183 { | |
184 for ( i = 0 ; i < cnt ; ++i ) | |
185 if ( subexons[ subexons[tag].prev[i] ].end < subexons[tag].start - 1 ) | |
186 SetGeneId( subexons[tag].prev[i], strand, subexons, seCnt, id ) ; | |
187 } | |
188 | |
189 // Set through the adjacent subexons. | |
190 if ( tag < seCnt - 1 && subexons[tag + 1].start == subexons[tag].end + 1 ) | |
191 { | |
192 SetGeneId( tag + 1, strand, subexons, seCnt, id ) ; | |
193 } | |
194 | |
195 if ( tag > 0 && subexons[tag].start - 1 == subexons[tag - 1].end ) | |
196 { | |
197 SetGeneId( tag - 1, strand, subexons, seCnt, id ) ; | |
198 } | |
199 | |
200 } | |
201 | |
202 void SubexonGraph::UpdateGeneId( struct _subexon *subexons, int seCnt ) | |
203 { | |
204 int i ; | |
205 baseGeneId = usedGeneId ; | |
206 int lastMinusStrandGeneId = -1 ; | |
207 for ( int strand = -1 ; strand <= 1 ; strand +=2 ) | |
208 { | |
209 for ( i = 0 ; i < seCnt ; ++i ) | |
210 { | |
211 //printf( "%d (%d %d) %d.\n", i, subexons[i].start + 1, subexons[i].end + 1, subexons[i].geneId ) ; | |
212 if ( ( subexons[i].geneId == -1 && ( ( strand == 1 && subexons[i].rightStrand == 0 ) || subexons[i].rightStrand == strand ) ) | |
213 || ( strand == 1 && baseGeneId <= subexons[i].geneId && subexons[i].geneId <= lastMinusStrandGeneId && subexons[i].rightStrand == strand ) ) | |
214 { | |
215 SetGeneId( i, strand, subexons, seCnt, usedGeneId ) ; | |
216 if ( strand == -1 ) | |
217 lastMinusStrandGeneId = usedGeneId ; | |
218 ++usedGeneId ; | |
219 } | |
220 } | |
221 } | |
222 | |
223 for ( i = 0 ; i < seCnt ; ++i ) | |
224 if ( subexons[i].leftType == 0 && subexons[i].rightType == 0 ) | |
225 { | |
226 subexons[i].geneId = usedGeneId ; | |
227 ++usedGeneId ; | |
228 } | |
229 // Put base and usedGeneId in lcCnt, rcCnt field. | |
230 for ( i = 0 ; i < seCnt ; ++i ) | |
231 { | |
232 subexons[i].lcCnt = baseGeneId ; | |
233 subexons[i].rcCnt = usedGeneId ; | |
234 } | |
235 | |
236 /*for ( i = 0 ; i < seCnt ; ++i ) | |
237 { | |
238 printf( "geneId %d: %d-%d %d\n", i, subexons[i].start + 1, subexons[i].end + 1, subexons[i].geneId ) ; | |
239 } | |
240 printf("%d %d\n", baseGeneId, usedGeneId ) ;*/ | |
241 } |