0
|
1 #ifndef _MOURISL_CLASSES_CONSTRAINTS_HEADER
|
|
2 #define _MOURISL_CLASSES_CONSTRAINTS_HEADER
|
|
3
|
|
4 #include <vector>
|
|
5 #include <map>
|
|
6 #include <algorithm>
|
|
7 #include <string>
|
|
8 #include <string.h>
|
|
9
|
|
10 #include "BitTable.hpp"
|
|
11 #include "alignments.hpp"
|
|
12 #include "SubexonGraph.hpp"
|
|
13
|
|
14 struct _constraint
|
|
15 {
|
|
16 BitTable vector ; // subexon vector
|
|
17 double weight ;
|
|
18 double normAbund ;
|
|
19 double abundance ;
|
|
20 int support ;
|
|
21 int uniqSupport ;
|
|
22 int maxReadLen ; // the longest read length support this constraint.
|
|
23
|
|
24 int info ; // other usages.
|
|
25 int first, last ; // indicate the first and last index of the subexons.
|
|
26 } ;
|
|
27
|
|
28 struct _matePairConstraint
|
|
29 {
|
|
30 int i, j ;
|
|
31
|
|
32 int support ;
|
|
33 int uniqSupport ;
|
|
34
|
|
35 double abundance ;
|
|
36 double normAbund ;
|
|
37 int effectiveCount ;
|
|
38 int type ;
|
|
39 } ;
|
|
40
|
|
41 struct _readIdHeap
|
|
42 {
|
|
43 char *readId ;
|
|
44 int pos ;
|
|
45 int matePos ;
|
|
46 int idx ;
|
|
47 } ;
|
|
48
|
|
49 //----------------------------------------------------------------------------------
|
|
50 // We assume the access to the data structure is sorted by matePos.
|
|
51 // So we can "pre-cache" the ids with the same matePos.
|
|
52 class MateReadIds
|
|
53 {
|
|
54 private:
|
|
55 std::vector< struct _readIdHeap > heap ;
|
|
56
|
|
57 void HeapifyUp( int tag )
|
|
58 {
|
|
59 while ( tag > 1 )
|
|
60 {
|
|
61 if ( heap[tag / 2].matePos < heap[tag].matePos )
|
|
62 return ;
|
|
63 struct _readIdHeap tmp ;
|
|
64 tmp = heap[tag / 2] ;
|
|
65 heap[tag / 2] = heap[tag] ;
|
|
66 heap[tag] = tmp ;
|
|
67
|
|
68 tag /= 2 ;
|
|
69 }
|
|
70 }
|
|
71
|
|
72 void HeapifyDown( int tag )
|
|
73 {
|
|
74 int size = heap.size() ;
|
|
75 while ( 2 * tag < size )
|
|
76 {
|
|
77 int choose = 2 * tag ;
|
|
78 if ( 2 * tag + 1 < size &&
|
|
79 heap[ 2 * tag + 1].matePos < heap[2 * tag ].matePos )
|
|
80 {
|
|
81 choose = 2 * tag + 1 ;
|
|
82 }
|
|
83
|
|
84 if ( heap[tag].matePos < heap[choose].matePos )
|
|
85 return ;
|
|
86
|
|
87 struct _readIdHeap tmp ;
|
|
88 tmp = heap[choose] ;
|
|
89 heap[choose] = heap[tag] ;
|
|
90 heap[tag] = tmp ;
|
|
91
|
|
92 tag = choose ;
|
|
93 }
|
|
94 }
|
|
95
|
|
96 struct _readIdHeap Pop()
|
|
97 {
|
|
98 struct _readIdHeap ret ;
|
|
99 int size = heap.size() ;
|
|
100 if ( size < 2 )
|
|
101 {
|
|
102 ret.readId = NULL ;
|
|
103 return ret ;
|
|
104 }
|
|
105
|
|
106 ret = heap[1] ;
|
|
107
|
|
108 heap[1] = heap[ heap.size() - 1] ;
|
|
109 heap.pop_back() ;
|
|
110 HeapifyDown( 1 ) ;
|
|
111
|
|
112 return ret ;
|
|
113 }
|
|
114
|
|
115 int cachedMatePos ;
|
|
116 std::map<std::string, int> cachedIdx ;
|
|
117 bool hasMateReadIdSuffix ; // ignore the last ".{1,2}" or "/{1,2}" .
|
|
118 public:
|
|
119 MateReadIds()
|
|
120 {
|
|
121 // Push a dummy element so the vector becomes 1-based.
|
|
122 struct _readIdHeap nh ;
|
|
123 nh.readId = NULL ;
|
|
124 nh.pos = nh.idx = nh.matePos = -1 ;
|
|
125 heap.push_back( nh ) ;
|
|
126 cachedMatePos = -1 ;
|
|
127 hasMateReadIdSuffix = false ;
|
|
128 }
|
|
129 ~MateReadIds()
|
|
130 {
|
|
131 int size = heap.size() ;
|
|
132 std::map<std::string, int>().swap( cachedIdx ) ;
|
|
133 for ( int i = 0 ; i < size ; ++i )
|
|
134 if ( heap[i].readId != NULL )
|
|
135 free( heap[i].readId ) ;
|
|
136 }
|
|
137
|
|
138 void Clear()
|
|
139 {
|
|
140 std::map<std::string, int>().swap( cachedIdx ) ;
|
|
141
|
|
142 int size = heap.size() ;
|
|
143 for ( int i = 0 ; i < size ; ++i )
|
|
144 if ( heap[i].readId != NULL )
|
|
145 free( heap[i].readId ) ;
|
|
146 std::vector<struct _readIdHeap>().swap( heap ) ;
|
|
147
|
|
148 struct _readIdHeap nh ;
|
|
149 nh.readId = NULL ;
|
|
150 nh.pos = nh.idx = nh.matePos = -1 ;
|
|
151 heap.push_back( nh ) ;
|
|
152 cachedMatePos = -1 ;
|
|
153 }
|
|
154
|
|
155 void Insert( char *id, int pos, int idx, int matePos )
|
|
156 {
|
|
157 struct _readIdHeap nh ;
|
|
158 nh.readId = strdup( id ) ;
|
|
159 nh.pos = pos ;
|
|
160 nh.idx = idx ;
|
|
161 nh.matePos = matePos ;
|
|
162
|
|
163 heap.push_back( nh ) ;
|
|
164 HeapifyUp( heap.size() - 1 ) ;
|
|
165 }
|
|
166
|
|
167 // If the id does not exist, return -1.
|
|
168 int Query( char *id, int matePos )
|
|
169 {
|
|
170 int size ;
|
|
171 size = heap.size() ;
|
|
172 if ( matePos > cachedMatePos )
|
|
173 {
|
|
174 std::map<std::string, int>().swap( cachedIdx ) ;
|
|
175
|
|
176 while ( size >= 2 && heap[1].matePos < matePos )
|
|
177 {
|
|
178 struct _readIdHeap r = Pop() ;
|
|
179 if ( r.readId )
|
|
180 {
|
|
181 free( r.readId ) ;
|
|
182 }
|
|
183 --size ;
|
|
184 }
|
|
185
|
|
186 while ( size >= 2 && heap[1].matePos == matePos )
|
|
187 {
|
|
188 struct _readIdHeap r = Pop() ;
|
|
189 cachedIdx[ std::string( r.readId ) ] = r.idx ;
|
|
190 if ( r.readId )
|
|
191 free( r.readId ) ;
|
|
192 --size ;
|
|
193 }
|
|
194 cachedMatePos = matePos ;
|
|
195 }
|
|
196 std::string s( id ) ;
|
|
197 if ( hasMateReadIdSuffix )
|
|
198 {
|
|
199 int len = s.length() ;
|
|
200 if ( len >= 2 && ( s[len - 1] == '1' || s[len - 1] == '2' )
|
|
201 && ( s[len - 2] == '.' || s[len - 2] == '/' ) )
|
|
202 {
|
|
203 s[len - 1] = '2' - s[len - 1] + '1' ;
|
|
204 }
|
|
205 }
|
|
206
|
|
207 if ( cachedIdx.find( s ) != cachedIdx.end() )
|
|
208 {
|
|
209 return cachedIdx[s] ;
|
|
210 }
|
|
211 return -1 ;
|
|
212 }
|
|
213
|
|
214 void UpdateIdx( std::vector<int> &newIdx )
|
|
215 {
|
|
216 int size = heap.size() ;
|
|
217 int i ;
|
|
218 for ( i = 1 ; i < size ; ++i )
|
|
219 {
|
|
220 heap[i].idx = newIdx[ heap[i].idx ] ;
|
|
221 }
|
|
222
|
|
223 for ( std::map<std::string, int>::iterator it = cachedIdx.begin() ; it != cachedIdx.end() ; ++it )
|
|
224 it->second = newIdx[ it->second ] ;
|
|
225 }
|
|
226
|
|
227 void SetHasMateReadIdSuffix( bool in )
|
|
228 {
|
|
229 hasMateReadIdSuffix = true ;
|
|
230 }
|
|
231 } ;
|
|
232
|
|
233
|
|
234 //--------------------------------------------------------------------------
|
|
235 class Constraints
|
|
236 {
|
|
237 private:
|
|
238 int prevStart, prevEnd ;
|
|
239 bool usePrimaryAsUnique ;
|
|
240 MateReadIds mateReadIds ;
|
|
241
|
|
242 Alignments *pAlignments ;
|
|
243
|
|
244 //@return: whether this alignment is compatible with the subexons or not.
|
|
245 bool ConvertAlignmentToBitTable( struct _pair *segments, int segCnt, struct _subexon *subexons, int seCnt, int seStart, struct _constraint &ct ) ;
|
|
246
|
|
247 // Sort to increasing order. Since the first subexon occupies the least important digit.
|
|
248 static bool CompSortConstraints( const struct _constraint &a, const struct _constraint &b )
|
|
249 {
|
|
250 //int k
|
|
251 if ( a.first < b.first )
|
|
252 return true ;
|
|
253 else if ( a.first > b.first )
|
|
254 return false ;
|
|
255
|
|
256 int diffPos = a.vector.GetFirstDifference( b.vector ) ;
|
|
257 if ( diffPos == -1 ) // case of equal.
|
|
258 return false ;
|
|
259
|
|
260 if ( a.vector.Test( diffPos ))
|
|
261 return false ;
|
|
262 else
|
|
263 return true ;
|
|
264 }
|
|
265
|
|
266 static bool CompSortMatePairs( const struct _matePairConstraint &a, const struct _matePairConstraint &b )
|
|
267 {
|
|
268 if ( a.i < b.i )
|
|
269 return true ;
|
|
270 else if ( a.i > b.i )
|
|
271 return false ;
|
|
272 else
|
|
273 {
|
|
274 if ( a.j < b.j )
|
|
275 return true ;
|
|
276 else
|
|
277 return false ;
|
|
278 }
|
|
279 }
|
|
280
|
|
281 void CoalesceSameConstraints() ;
|
|
282 void ComputeNormAbund( struct _subexon *subexons ) ;
|
|
283 public:
|
|
284 std::vector<struct _constraint> constraints ;
|
|
285 std::vector<struct _matePairConstraint> matePairs ;
|
|
286
|
|
287 Constraints()
|
|
288 {
|
|
289 usePrimaryAsUnique = false ;
|
|
290 }
|
|
291
|
|
292 Constraints( Alignments *a ): pAlignments( a )
|
|
293 {
|
|
294 }
|
|
295
|
|
296 ~Constraints()
|
|
297 {
|
|
298 int i ;
|
|
299 int size = constraints.size() ;
|
|
300 for ( i = 0 ; i < size ; ++i )
|
|
301 constraints[i].vector.Release() ;
|
|
302 constraints.clear() ;
|
|
303 std::vector<struct _constraint>().swap( constraints ) ;
|
|
304 matePairs.clear() ;
|
|
305 std::vector<struct _matePairConstraint>().swap( matePairs ) ;
|
|
306 }
|
|
307
|
|
308 void Clear()
|
|
309 {
|
|
310 //TODO: do I need to release the memory from BitTable?
|
|
311 constraints.clear() ;
|
|
312 }
|
|
313
|
|
314 void SetAlignments( Alignments *a )
|
|
315 {
|
|
316 pAlignments = a ;
|
|
317 }
|
|
318
|
|
319 void SetUsePrimaryAsUnique( bool in )
|
|
320 {
|
|
321 usePrimaryAsUnique = in ;
|
|
322 }
|
|
323
|
|
324 void Assign( Constraints &c )
|
|
325 {
|
|
326 int i ;
|
|
327 int size = constraints.size() ;
|
|
328 if ( size > 0 )
|
|
329 {
|
|
330 for ( i = 0 ; i < size ; ++i )
|
|
331 constraints[i].vector.Release() ;
|
|
332 constraints.clear() ;
|
|
333 std::vector<struct _constraint>().swap( constraints ) ;
|
|
334 }
|
|
335 matePairs.clear() ;
|
|
336 std::vector<struct _matePairConstraint>().swap( matePairs ) ;
|
|
337
|
|
338 //constraints.resize( c.constraints.size() ) ;
|
|
339 constraints = c.constraints ;
|
|
340 size = c.constraints.size() ;
|
|
341 for ( i = 0 ; i < size ; ++i )
|
|
342 {
|
|
343 /*struct _constraint nc ;
|
|
344 nc.weight = c.constraints[i].weight ;
|
|
345 nc.normAbund = c.constraints[i].normAbund ;
|
|
346 nc.abundance = c.constraints[i].abundance ;
|
|
347 nc.support = c.constraints[i].support ;
|
|
348 nc.uniqSupport = c.constraints[i].uniqSupport ;
|
|
349 nc.maxReadLen = c.constraints[i].maxReadLen ;
|
|
350 nc.info = c.constraints[i].info ;
|
|
351 nc.first = c.constraints[i].first ;
|
|
352 nc.last = c.constraints[i].last ;
|
|
353 nc.vector.Duplicate( c.constraints[i].vector ) ;
|
|
354 constraints[i] = ( nc ) ; */
|
|
355 constraints[i].vector.Nullify() ; // so that it won't affect the BitTable in "c"
|
|
356 constraints[i].vector.Duplicate( c.constraints[i].vector ) ;
|
|
357 }
|
|
358 matePairs = c.matePairs ;
|
|
359 pAlignments = c.pAlignments ;
|
|
360 }
|
|
361
|
|
362 void DownsampleConstraintsFrom( Constraints &c, int stride = 10 )
|
|
363 {
|
|
364 int i ;
|
|
365 int size = constraints.size(), k ;
|
|
366
|
|
367 if ( size > 0 )
|
|
368 {
|
|
369 for ( i = 0 ; i < size ; ++i )
|
|
370 constraints[i].vector.Release() ;
|
|
371 constraints.clear() ;
|
|
372 std::vector<struct _constraint>().swap( constraints ) ;
|
|
373 }
|
|
374 matePairs.clear() ;
|
|
375 std::vector<struct _matePairConstraint>().swap( matePairs ) ;
|
|
376
|
|
377 //constraints.resize( c.constraints.size() ) ;
|
|
378 //constraints = c.constraints ;
|
|
379 k = 0 ;
|
|
380 size = c.constraints.size() ;
|
|
381 for ( i = 0 ; i < size ; i += stride, ++k )
|
|
382 {
|
|
383 constraints.push_back( c.constraints[i] ) ;
|
|
384 constraints[k].vector.Nullify() ; // so that it won't affect the BitTable in "c"
|
|
385 constraints[k].vector.Duplicate( c.constraints[i].vector ) ;
|
|
386
|
|
387 /*std::vector<int> seIdx ;
|
|
388 constraints[k].vector.GetOnesIndices( seIdx ) ;
|
|
389 int j, l = seIdx.size() ;
|
|
390 for ( j = 2 ; j < l ; ++j )
|
|
391 {
|
|
392 constraints[k].vector.Unset( seIdx[j] ) ;
|
|
393 }
|
|
394 constraints[k].last = seIdx[1] ;*/
|
|
395 }
|
|
396 // mate pairs is not used. if we down-sampling
|
|
397 pAlignments = c.pAlignments ;
|
|
398 }
|
|
399
|
|
400 void TruncateConstraintsCoverFrom( Constraints &c, int seCnt, int maxConstraintSize )
|
|
401 {
|
|
402 int i ;
|
|
403 int size = constraints.size() ;
|
|
404
|
|
405 if ( size > 0 )
|
|
406 {
|
|
407 for ( i = 0 ; i < size ; ++i )
|
|
408 constraints[i].vector.Release() ;
|
|
409 constraints.clear() ;
|
|
410 std::vector<struct _constraint>().swap( constraints ) ;
|
|
411 }
|
|
412 matePairs.clear() ;
|
|
413 std::vector<struct _matePairConstraint>().swap( matePairs ) ;
|
|
414
|
|
415 //constraints.resize( c.constraints.size() ) ;
|
|
416 //constraints = c.constraints ;
|
|
417 size = c.constraints.size() ;
|
|
418 for ( i = 0 ; i < size ; ++i )
|
|
419 {
|
|
420 constraints.push_back( c.constraints[i] ) ;
|
|
421 constraints[i].vector.Nullify() ; // so that it won't affect the BitTable in "c"
|
|
422 constraints[i].vector.Init( seCnt ) ;
|
|
423 std::vector<int> seIdx ;
|
|
424 c.constraints[i].vector.GetOnesIndices( seIdx ) ;
|
|
425 int j, l = seIdx.size() ;
|
|
426 for ( j = 0 ; j < maxConstraintSize && j < l ; ++j )
|
|
427 {
|
|
428 constraints[i].vector.Set( seIdx[j] ) ;
|
|
429 }
|
|
430 constraints[i].last = seIdx[j - 1] ;
|
|
431 }
|
|
432 // mate pairs is not used. if we down-sampling
|
|
433 pAlignments = c.pAlignments ;
|
|
434 }
|
|
435
|
|
436 void SetHasMateReadIdSuffix( bool in )
|
|
437 {
|
|
438 mateReadIds.SetHasMateReadIdSuffix( in ) ;
|
|
439 }
|
|
440
|
|
441 int BuildConstraints( struct _subexon *subexons, int seCnt, int start, int end ) ;
|
|
442
|
|
443 } ;
|
|
444
|
|
445 #endif
|