Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/Constraints.hpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #ifndef _MOURISL_CLASSES_CONSTRAINTS_HEADER | |
2 #define _MOURISL_CLASSES_CONSTRAINTS_HEADER | |
3 | |
4 #include <vector> | |
5 #include <map> | |
6 #include <algorithm> | |
7 #include <string> | |
8 #include <string.h> | |
9 | |
10 #include "BitTable.hpp" | |
11 #include "alignments.hpp" | |
12 #include "SubexonGraph.hpp" | |
13 | |
14 struct _constraint | |
15 { | |
16 BitTable vector ; // subexon vector | |
17 double weight ; | |
18 double normAbund ; | |
19 double abundance ; | |
20 int support ; | |
21 int uniqSupport ; | |
22 int maxReadLen ; // the longest read length support this constraint. | |
23 | |
24 int info ; // other usages. | |
25 int first, last ; // indicate the first and last index of the subexons. | |
26 } ; | |
27 | |
28 struct _matePairConstraint | |
29 { | |
30 int i, j ; | |
31 | |
32 int support ; | |
33 int uniqSupport ; | |
34 | |
35 double abundance ; | |
36 double normAbund ; | |
37 int effectiveCount ; | |
38 int type ; | |
39 } ; | |
40 | |
41 struct _readIdHeap | |
42 { | |
43 char *readId ; | |
44 int pos ; | |
45 int matePos ; | |
46 int idx ; | |
47 } ; | |
48 | |
49 //---------------------------------------------------------------------------------- | |
50 // We assume the access to the data structure is sorted by matePos. | |
51 // So we can "pre-cache" the ids with the same matePos. | |
52 class MateReadIds | |
53 { | |
54 private: | |
55 std::vector< struct _readIdHeap > heap ; | |
56 | |
57 void HeapifyUp( int tag ) | |
58 { | |
59 while ( tag > 1 ) | |
60 { | |
61 if ( heap[tag / 2].matePos < heap[tag].matePos ) | |
62 return ; | |
63 struct _readIdHeap tmp ; | |
64 tmp = heap[tag / 2] ; | |
65 heap[tag / 2] = heap[tag] ; | |
66 heap[tag] = tmp ; | |
67 | |
68 tag /= 2 ; | |
69 } | |
70 } | |
71 | |
72 void HeapifyDown( int tag ) | |
73 { | |
74 int size = heap.size() ; | |
75 while ( 2 * tag < size ) | |
76 { | |
77 int choose = 2 * tag ; | |
78 if ( 2 * tag + 1 < size && | |
79 heap[ 2 * tag + 1].matePos < heap[2 * tag ].matePos ) | |
80 { | |
81 choose = 2 * tag + 1 ; | |
82 } | |
83 | |
84 if ( heap[tag].matePos < heap[choose].matePos ) | |
85 return ; | |
86 | |
87 struct _readIdHeap tmp ; | |
88 tmp = heap[choose] ; | |
89 heap[choose] = heap[tag] ; | |
90 heap[tag] = tmp ; | |
91 | |
92 tag = choose ; | |
93 } | |
94 } | |
95 | |
96 struct _readIdHeap Pop() | |
97 { | |
98 struct _readIdHeap ret ; | |
99 int size = heap.size() ; | |
100 if ( size < 2 ) | |
101 { | |
102 ret.readId = NULL ; | |
103 return ret ; | |
104 } | |
105 | |
106 ret = heap[1] ; | |
107 | |
108 heap[1] = heap[ heap.size() - 1] ; | |
109 heap.pop_back() ; | |
110 HeapifyDown( 1 ) ; | |
111 | |
112 return ret ; | |
113 } | |
114 | |
115 int cachedMatePos ; | |
116 std::map<std::string, int> cachedIdx ; | |
117 bool hasMateReadIdSuffix ; // ignore the last ".{1,2}" or "/{1,2}" . | |
118 public: | |
119 MateReadIds() | |
120 { | |
121 // Push a dummy element so the vector becomes 1-based. | |
122 struct _readIdHeap nh ; | |
123 nh.readId = NULL ; | |
124 nh.pos = nh.idx = nh.matePos = -1 ; | |
125 heap.push_back( nh ) ; | |
126 cachedMatePos = -1 ; | |
127 hasMateReadIdSuffix = false ; | |
128 } | |
129 ~MateReadIds() | |
130 { | |
131 int size = heap.size() ; | |
132 std::map<std::string, int>().swap( cachedIdx ) ; | |
133 for ( int i = 0 ; i < size ; ++i ) | |
134 if ( heap[i].readId != NULL ) | |
135 free( heap[i].readId ) ; | |
136 } | |
137 | |
138 void Clear() | |
139 { | |
140 std::map<std::string, int>().swap( cachedIdx ) ; | |
141 | |
142 int size = heap.size() ; | |
143 for ( int i = 0 ; i < size ; ++i ) | |
144 if ( heap[i].readId != NULL ) | |
145 free( heap[i].readId ) ; | |
146 std::vector<struct _readIdHeap>().swap( heap ) ; | |
147 | |
148 struct _readIdHeap nh ; | |
149 nh.readId = NULL ; | |
150 nh.pos = nh.idx = nh.matePos = -1 ; | |
151 heap.push_back( nh ) ; | |
152 cachedMatePos = -1 ; | |
153 } | |
154 | |
155 void Insert( char *id, int pos, int idx, int matePos ) | |
156 { | |
157 struct _readIdHeap nh ; | |
158 nh.readId = strdup( id ) ; | |
159 nh.pos = pos ; | |
160 nh.idx = idx ; | |
161 nh.matePos = matePos ; | |
162 | |
163 heap.push_back( nh ) ; | |
164 HeapifyUp( heap.size() - 1 ) ; | |
165 } | |
166 | |
167 // If the id does not exist, return -1. | |
168 int Query( char *id, int matePos ) | |
169 { | |
170 int size ; | |
171 size = heap.size() ; | |
172 if ( matePos > cachedMatePos ) | |
173 { | |
174 std::map<std::string, int>().swap( cachedIdx ) ; | |
175 | |
176 while ( size >= 2 && heap[1].matePos < matePos ) | |
177 { | |
178 struct _readIdHeap r = Pop() ; | |
179 if ( r.readId ) | |
180 { | |
181 free( r.readId ) ; | |
182 } | |
183 --size ; | |
184 } | |
185 | |
186 while ( size >= 2 && heap[1].matePos == matePos ) | |
187 { | |
188 struct _readIdHeap r = Pop() ; | |
189 cachedIdx[ std::string( r.readId ) ] = r.idx ; | |
190 if ( r.readId ) | |
191 free( r.readId ) ; | |
192 --size ; | |
193 } | |
194 cachedMatePos = matePos ; | |
195 } | |
196 std::string s( id ) ; | |
197 if ( hasMateReadIdSuffix ) | |
198 { | |
199 int len = s.length() ; | |
200 if ( len >= 2 && ( s[len - 1] == '1' || s[len - 1] == '2' ) | |
201 && ( s[len - 2] == '.' || s[len - 2] == '/' ) ) | |
202 { | |
203 s[len - 1] = '2' - s[len - 1] + '1' ; | |
204 } | |
205 } | |
206 | |
207 if ( cachedIdx.find( s ) != cachedIdx.end() ) | |
208 { | |
209 return cachedIdx[s] ; | |
210 } | |
211 return -1 ; | |
212 } | |
213 | |
214 void UpdateIdx( std::vector<int> &newIdx ) | |
215 { | |
216 int size = heap.size() ; | |
217 int i ; | |
218 for ( i = 1 ; i < size ; ++i ) | |
219 { | |
220 heap[i].idx = newIdx[ heap[i].idx ] ; | |
221 } | |
222 | |
223 for ( std::map<std::string, int>::iterator it = cachedIdx.begin() ; it != cachedIdx.end() ; ++it ) | |
224 it->second = newIdx[ it->second ] ; | |
225 } | |
226 | |
227 void SetHasMateReadIdSuffix( bool in ) | |
228 { | |
229 hasMateReadIdSuffix = true ; | |
230 } | |
231 } ; | |
232 | |
233 | |
234 //-------------------------------------------------------------------------- | |
235 class Constraints | |
236 { | |
237 private: | |
238 int prevStart, prevEnd ; | |
239 bool usePrimaryAsUnique ; | |
240 MateReadIds mateReadIds ; | |
241 | |
242 Alignments *pAlignments ; | |
243 | |
244 //@return: whether this alignment is compatible with the subexons or not. | |
245 bool ConvertAlignmentToBitTable( struct _pair *segments, int segCnt, struct _subexon *subexons, int seCnt, int seStart, struct _constraint &ct ) ; | |
246 | |
247 // Sort to increasing order. Since the first subexon occupies the least important digit. | |
248 static bool CompSortConstraints( const struct _constraint &a, const struct _constraint &b ) | |
249 { | |
250 //int k | |
251 if ( a.first < b.first ) | |
252 return true ; | |
253 else if ( a.first > b.first ) | |
254 return false ; | |
255 | |
256 int diffPos = a.vector.GetFirstDifference( b.vector ) ; | |
257 if ( diffPos == -1 ) // case of equal. | |
258 return false ; | |
259 | |
260 if ( a.vector.Test( diffPos )) | |
261 return false ; | |
262 else | |
263 return true ; | |
264 } | |
265 | |
266 static bool CompSortMatePairs( const struct _matePairConstraint &a, const struct _matePairConstraint &b ) | |
267 { | |
268 if ( a.i < b.i ) | |
269 return true ; | |
270 else if ( a.i > b.i ) | |
271 return false ; | |
272 else | |
273 { | |
274 if ( a.j < b.j ) | |
275 return true ; | |
276 else | |
277 return false ; | |
278 } | |
279 } | |
280 | |
281 void CoalesceSameConstraints() ; | |
282 void ComputeNormAbund( struct _subexon *subexons ) ; | |
283 public: | |
284 std::vector<struct _constraint> constraints ; | |
285 std::vector<struct _matePairConstraint> matePairs ; | |
286 | |
287 Constraints() | |
288 { | |
289 usePrimaryAsUnique = false ; | |
290 } | |
291 | |
292 Constraints( Alignments *a ): pAlignments( a ) | |
293 { | |
294 } | |
295 | |
296 ~Constraints() | |
297 { | |
298 int i ; | |
299 int size = constraints.size() ; | |
300 for ( i = 0 ; i < size ; ++i ) | |
301 constraints[i].vector.Release() ; | |
302 constraints.clear() ; | |
303 std::vector<struct _constraint>().swap( constraints ) ; | |
304 matePairs.clear() ; | |
305 std::vector<struct _matePairConstraint>().swap( matePairs ) ; | |
306 } | |
307 | |
308 void Clear() | |
309 { | |
310 //TODO: do I need to release the memory from BitTable? | |
311 constraints.clear() ; | |
312 } | |
313 | |
314 void SetAlignments( Alignments *a ) | |
315 { | |
316 pAlignments = a ; | |
317 } | |
318 | |
319 void SetUsePrimaryAsUnique( bool in ) | |
320 { | |
321 usePrimaryAsUnique = in ; | |
322 } | |
323 | |
324 void Assign( Constraints &c ) | |
325 { | |
326 int i ; | |
327 int size = constraints.size() ; | |
328 if ( size > 0 ) | |
329 { | |
330 for ( i = 0 ; i < size ; ++i ) | |
331 constraints[i].vector.Release() ; | |
332 constraints.clear() ; | |
333 std::vector<struct _constraint>().swap( constraints ) ; | |
334 } | |
335 matePairs.clear() ; | |
336 std::vector<struct _matePairConstraint>().swap( matePairs ) ; | |
337 | |
338 //constraints.resize( c.constraints.size() ) ; | |
339 constraints = c.constraints ; | |
340 size = c.constraints.size() ; | |
341 for ( i = 0 ; i < size ; ++i ) | |
342 { | |
343 /*struct _constraint nc ; | |
344 nc.weight = c.constraints[i].weight ; | |
345 nc.normAbund = c.constraints[i].normAbund ; | |
346 nc.abundance = c.constraints[i].abundance ; | |
347 nc.support = c.constraints[i].support ; | |
348 nc.uniqSupport = c.constraints[i].uniqSupport ; | |
349 nc.maxReadLen = c.constraints[i].maxReadLen ; | |
350 nc.info = c.constraints[i].info ; | |
351 nc.first = c.constraints[i].first ; | |
352 nc.last = c.constraints[i].last ; | |
353 nc.vector.Duplicate( c.constraints[i].vector ) ; | |
354 constraints[i] = ( nc ) ; */ | |
355 constraints[i].vector.Nullify() ; // so that it won't affect the BitTable in "c" | |
356 constraints[i].vector.Duplicate( c.constraints[i].vector ) ; | |
357 } | |
358 matePairs = c.matePairs ; | |
359 pAlignments = c.pAlignments ; | |
360 } | |
361 | |
362 void DownsampleConstraintsFrom( Constraints &c, int stride = 10 ) | |
363 { | |
364 int i ; | |
365 int size = constraints.size(), k ; | |
366 | |
367 if ( size > 0 ) | |
368 { | |
369 for ( i = 0 ; i < size ; ++i ) | |
370 constraints[i].vector.Release() ; | |
371 constraints.clear() ; | |
372 std::vector<struct _constraint>().swap( constraints ) ; | |
373 } | |
374 matePairs.clear() ; | |
375 std::vector<struct _matePairConstraint>().swap( matePairs ) ; | |
376 | |
377 //constraints.resize( c.constraints.size() ) ; | |
378 //constraints = c.constraints ; | |
379 k = 0 ; | |
380 size = c.constraints.size() ; | |
381 for ( i = 0 ; i < size ; i += stride, ++k ) | |
382 { | |
383 constraints.push_back( c.constraints[i] ) ; | |
384 constraints[k].vector.Nullify() ; // so that it won't affect the BitTable in "c" | |
385 constraints[k].vector.Duplicate( c.constraints[i].vector ) ; | |
386 | |
387 /*std::vector<int> seIdx ; | |
388 constraints[k].vector.GetOnesIndices( seIdx ) ; | |
389 int j, l = seIdx.size() ; | |
390 for ( j = 2 ; j < l ; ++j ) | |
391 { | |
392 constraints[k].vector.Unset( seIdx[j] ) ; | |
393 } | |
394 constraints[k].last = seIdx[1] ;*/ | |
395 } | |
396 // mate pairs is not used. if we down-sampling | |
397 pAlignments = c.pAlignments ; | |
398 } | |
399 | |
400 void TruncateConstraintsCoverFrom( Constraints &c, int seCnt, int maxConstraintSize ) | |
401 { | |
402 int i ; | |
403 int size = constraints.size() ; | |
404 | |
405 if ( size > 0 ) | |
406 { | |
407 for ( i = 0 ; i < size ; ++i ) | |
408 constraints[i].vector.Release() ; | |
409 constraints.clear() ; | |
410 std::vector<struct _constraint>().swap( constraints ) ; | |
411 } | |
412 matePairs.clear() ; | |
413 std::vector<struct _matePairConstraint>().swap( matePairs ) ; | |
414 | |
415 //constraints.resize( c.constraints.size() ) ; | |
416 //constraints = c.constraints ; | |
417 size = c.constraints.size() ; | |
418 for ( i = 0 ; i < size ; ++i ) | |
419 { | |
420 constraints.push_back( c.constraints[i] ) ; | |
421 constraints[i].vector.Nullify() ; // so that it won't affect the BitTable in "c" | |
422 constraints[i].vector.Init( seCnt ) ; | |
423 std::vector<int> seIdx ; | |
424 c.constraints[i].vector.GetOnesIndices( seIdx ) ; | |
425 int j, l = seIdx.size() ; | |
426 for ( j = 0 ; j < maxConstraintSize && j < l ; ++j ) | |
427 { | |
428 constraints[i].vector.Set( seIdx[j] ) ; | |
429 } | |
430 constraints[i].last = seIdx[j - 1] ; | |
431 } | |
432 // mate pairs is not used. if we down-sampling | |
433 pAlignments = c.pAlignments ; | |
434 } | |
435 | |
436 void SetHasMateReadIdSuffix( bool in ) | |
437 { | |
438 mateReadIds.SetHasMateReadIdSuffix( in ) ; | |
439 } | |
440 | |
441 int BuildConstraints( struct _subexon *subexons, int seCnt, int start, int end ) ; | |
442 | |
443 } ; | |
444 | |
445 #endif |