Mercurial > repos > lsong10 > psiclass
diff PsiCLASS-1.0.2/Constraints.hpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/Constraints.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,445 @@ +#ifndef _MOURISL_CLASSES_CONSTRAINTS_HEADER +#define _MOURISL_CLASSES_CONSTRAINTS_HEADER + +#include <vector> +#include <map> +#include <algorithm> +#include <string> +#include <string.h> + +#include "BitTable.hpp" +#include "alignments.hpp" +#include "SubexonGraph.hpp" + +struct _constraint +{ + BitTable vector ; // subexon vector + double weight ; + double normAbund ; + double abundance ; + int support ; + int uniqSupport ; + int maxReadLen ; // the longest read length support this constraint. + + int info ; // other usages. + int first, last ; // indicate the first and last index of the subexons. +} ; + +struct _matePairConstraint +{ + int i, j ; + + int support ; + int uniqSupport ; + + double abundance ; + double normAbund ; + int effectiveCount ; + int type ; +} ; + +struct _readIdHeap +{ + char *readId ; + int pos ; + int matePos ; + int idx ; +} ; + +//---------------------------------------------------------------------------------- +// We assume the access to the data structure is sorted by matePos. +// So we can "pre-cache" the ids with the same matePos. +class MateReadIds +{ +private: + std::vector< struct _readIdHeap > heap ; + + void HeapifyUp( int tag ) + { + while ( tag > 1 ) + { + if ( heap[tag / 2].matePos < heap[tag].matePos ) + return ; + struct _readIdHeap tmp ; + tmp = heap[tag / 2] ; + heap[tag / 2] = heap[tag] ; + heap[tag] = tmp ; + + tag /= 2 ; + } + } + + void HeapifyDown( int tag ) + { + int size = heap.size() ; + while ( 2 * tag < size ) + { + int choose = 2 * tag ; + if ( 2 * tag + 1 < size && + heap[ 2 * tag + 1].matePos < heap[2 * tag ].matePos ) + { + choose = 2 * tag + 1 ; + } + + if ( heap[tag].matePos < heap[choose].matePos ) + return ; + + struct _readIdHeap tmp ; + tmp = heap[choose] ; + heap[choose] = heap[tag] ; + heap[tag] = tmp ; + + tag = choose ; + } + } + + struct _readIdHeap Pop() + { + struct _readIdHeap ret ; + int size = heap.size() ; + if ( size < 2 ) + { + ret.readId = NULL ; + return ret ; + } + + ret = heap[1] ; + + heap[1] = heap[ heap.size() - 1] ; + heap.pop_back() ; + HeapifyDown( 1 ) ; + + return ret ; + } + + int cachedMatePos ; + std::map<std::string, int> cachedIdx ; + bool hasMateReadIdSuffix ; // ignore the last ".{1,2}" or "/{1,2}" . +public: + MateReadIds() + { + // Push a dummy element so the vector becomes 1-based. + struct _readIdHeap nh ; + nh.readId = NULL ; + nh.pos = nh.idx = nh.matePos = -1 ; + heap.push_back( nh ) ; + cachedMatePos = -1 ; + hasMateReadIdSuffix = false ; + } + ~MateReadIds() + { + int size = heap.size() ; + std::map<std::string, int>().swap( cachedIdx ) ; + for ( int i = 0 ; i < size ; ++i ) + if ( heap[i].readId != NULL ) + free( heap[i].readId ) ; + } + + void Clear() + { + std::map<std::string, int>().swap( cachedIdx ) ; + + int size = heap.size() ; + for ( int i = 0 ; i < size ; ++i ) + if ( heap[i].readId != NULL ) + free( heap[i].readId ) ; + std::vector<struct _readIdHeap>().swap( heap ) ; + + struct _readIdHeap nh ; + nh.readId = NULL ; + nh.pos = nh.idx = nh.matePos = -1 ; + heap.push_back( nh ) ; + cachedMatePos = -1 ; + } + + void Insert( char *id, int pos, int idx, int matePos ) + { + struct _readIdHeap nh ; + nh.readId = strdup( id ) ; + nh.pos = pos ; + nh.idx = idx ; + nh.matePos = matePos ; + + heap.push_back( nh ) ; + HeapifyUp( heap.size() - 1 ) ; + } + + // If the id does not exist, return -1. + int Query( char *id, int matePos ) + { + int size ; + size = heap.size() ; + if ( matePos > cachedMatePos ) + { + std::map<std::string, int>().swap( cachedIdx ) ; + + while ( size >= 2 && heap[1].matePos < matePos ) + { + struct _readIdHeap r = Pop() ; + if ( r.readId ) + { + free( r.readId ) ; + } + --size ; + } + + while ( size >= 2 && heap[1].matePos == matePos ) + { + struct _readIdHeap r = Pop() ; + cachedIdx[ std::string( r.readId ) ] = r.idx ; + if ( r.readId ) + free( r.readId ) ; + --size ; + } + cachedMatePos = matePos ; + } + std::string s( id ) ; + if ( hasMateReadIdSuffix ) + { + int len = s.length() ; + if ( len >= 2 && ( s[len - 1] == '1' || s[len - 1] == '2' ) + && ( s[len - 2] == '.' || s[len - 2] == '/' ) ) + { + s[len - 1] = '2' - s[len - 1] + '1' ; + } + } + + if ( cachedIdx.find( s ) != cachedIdx.end() ) + { + return cachedIdx[s] ; + } + return -1 ; + } + + void UpdateIdx( std::vector<int> &newIdx ) + { + int size = heap.size() ; + int i ; + for ( i = 1 ; i < size ; ++i ) + { + heap[i].idx = newIdx[ heap[i].idx ] ; + } + + for ( std::map<std::string, int>::iterator it = cachedIdx.begin() ; it != cachedIdx.end() ; ++it ) + it->second = newIdx[ it->second ] ; + } + + void SetHasMateReadIdSuffix( bool in ) + { + hasMateReadIdSuffix = true ; + } +} ; + + +//-------------------------------------------------------------------------- +class Constraints +{ +private: + int prevStart, prevEnd ; + bool usePrimaryAsUnique ; + MateReadIds mateReadIds ; + + Alignments *pAlignments ; + + //@return: whether this alignment is compatible with the subexons or not. + bool ConvertAlignmentToBitTable( struct _pair *segments, int segCnt, struct _subexon *subexons, int seCnt, int seStart, struct _constraint &ct ) ; + + // Sort to increasing order. Since the first subexon occupies the least important digit. + static bool CompSortConstraints( const struct _constraint &a, const struct _constraint &b ) + { + //int k + if ( a.first < b.first ) + return true ; + else if ( a.first > b.first ) + return false ; + + int diffPos = a.vector.GetFirstDifference( b.vector ) ; + if ( diffPos == -1 ) // case of equal. + return false ; + + if ( a.vector.Test( diffPos )) + return false ; + else + return true ; + } + + static bool CompSortMatePairs( const struct _matePairConstraint &a, const struct _matePairConstraint &b ) + { + if ( a.i < b.i ) + return true ; + else if ( a.i > b.i ) + return false ; + else + { + if ( a.j < b.j ) + return true ; + else + return false ; + } + } + + void CoalesceSameConstraints() ; + void ComputeNormAbund( struct _subexon *subexons ) ; +public: + std::vector<struct _constraint> constraints ; + std::vector<struct _matePairConstraint> matePairs ; + + Constraints() + { + usePrimaryAsUnique = false ; + } + + Constraints( Alignments *a ): pAlignments( a ) + { + } + + ~Constraints() + { + int i ; + int size = constraints.size() ; + for ( i = 0 ; i < size ; ++i ) + constraints[i].vector.Release() ; + constraints.clear() ; + std::vector<struct _constraint>().swap( constraints ) ; + matePairs.clear() ; + std::vector<struct _matePairConstraint>().swap( matePairs ) ; + } + + void Clear() + { + //TODO: do I need to release the memory from BitTable? + constraints.clear() ; + } + + void SetAlignments( Alignments *a ) + { + pAlignments = a ; + } + + void SetUsePrimaryAsUnique( bool in ) + { + usePrimaryAsUnique = in ; + } + + void Assign( Constraints &c ) + { + int i ; + int size = constraints.size() ; + if ( size > 0 ) + { + for ( i = 0 ; i < size ; ++i ) + constraints[i].vector.Release() ; + constraints.clear() ; + std::vector<struct _constraint>().swap( constraints ) ; + } + matePairs.clear() ; + std::vector<struct _matePairConstraint>().swap( matePairs ) ; + + //constraints.resize( c.constraints.size() ) ; + constraints = c.constraints ; + size = c.constraints.size() ; + for ( i = 0 ; i < size ; ++i ) + { + /*struct _constraint nc ; + nc.weight = c.constraints[i].weight ; + nc.normAbund = c.constraints[i].normAbund ; + nc.abundance = c.constraints[i].abundance ; + nc.support = c.constraints[i].support ; + nc.uniqSupport = c.constraints[i].uniqSupport ; + nc.maxReadLen = c.constraints[i].maxReadLen ; + nc.info = c.constraints[i].info ; + nc.first = c.constraints[i].first ; + nc.last = c.constraints[i].last ; + nc.vector.Duplicate( c.constraints[i].vector ) ; + constraints[i] = ( nc ) ; */ + constraints[i].vector.Nullify() ; // so that it won't affect the BitTable in "c" + constraints[i].vector.Duplicate( c.constraints[i].vector ) ; + } + matePairs = c.matePairs ; + pAlignments = c.pAlignments ; + } + + void DownsampleConstraintsFrom( Constraints &c, int stride = 10 ) + { + int i ; + int size = constraints.size(), k ; + + if ( size > 0 ) + { + for ( i = 0 ; i < size ; ++i ) + constraints[i].vector.Release() ; + constraints.clear() ; + std::vector<struct _constraint>().swap( constraints ) ; + } + matePairs.clear() ; + std::vector<struct _matePairConstraint>().swap( matePairs ) ; + + //constraints.resize( c.constraints.size() ) ; + //constraints = c.constraints ; + k = 0 ; + size = c.constraints.size() ; + for ( i = 0 ; i < size ; i += stride, ++k ) + { + constraints.push_back( c.constraints[i] ) ; + constraints[k].vector.Nullify() ; // so that it won't affect the BitTable in "c" + constraints[k].vector.Duplicate( c.constraints[i].vector ) ; + + /*std::vector<int> seIdx ; + constraints[k].vector.GetOnesIndices( seIdx ) ; + int j, l = seIdx.size() ; + for ( j = 2 ; j < l ; ++j ) + { + constraints[k].vector.Unset( seIdx[j] ) ; + } + constraints[k].last = seIdx[1] ;*/ + } + // mate pairs is not used. if we down-sampling + pAlignments = c.pAlignments ; + } + + void TruncateConstraintsCoverFrom( Constraints &c, int seCnt, int maxConstraintSize ) + { + int i ; + int size = constraints.size() ; + + if ( size > 0 ) + { + for ( i = 0 ; i < size ; ++i ) + constraints[i].vector.Release() ; + constraints.clear() ; + std::vector<struct _constraint>().swap( constraints ) ; + } + matePairs.clear() ; + std::vector<struct _matePairConstraint>().swap( matePairs ) ; + + //constraints.resize( c.constraints.size() ) ; + //constraints = c.constraints ; + size = c.constraints.size() ; + for ( i = 0 ; i < size ; ++i ) + { + constraints.push_back( c.constraints[i] ) ; + constraints[i].vector.Nullify() ; // so that it won't affect the BitTable in "c" + constraints[i].vector.Init( seCnt ) ; + std::vector<int> seIdx ; + c.constraints[i].vector.GetOnesIndices( seIdx ) ; + int j, l = seIdx.size() ; + for ( j = 0 ; j < maxConstraintSize && j < l ; ++j ) + { + constraints[i].vector.Set( seIdx[j] ) ; + } + constraints[i].last = seIdx[j - 1] ; + } + // mate pairs is not used. if we down-sampling + pAlignments = c.pAlignments ; + } + + void SetHasMateReadIdSuffix( bool in ) + { + mateReadIds.SetHasMateReadIdSuffix( in ) ; + } + + int BuildConstraints( struct _subexon *subexons, int seCnt, int start, int end ) ; + +} ; + +#endif