diff PsiCLASS-1.0.2/Constraints.hpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PsiCLASS-1.0.2/Constraints.hpp	Fri Mar 26 16:52:45 2021 +0000
@@ -0,0 +1,445 @@
+#ifndef _MOURISL_CLASSES_CONSTRAINTS_HEADER
+#define _MOURISL_CLASSES_CONSTRAINTS_HEADER
+
+#include <vector> 
+#include <map>
+#include <algorithm>
+#include <string>
+#include <string.h>
+
+#include "BitTable.hpp"
+#include "alignments.hpp"
+#include "SubexonGraph.hpp"
+
+struct _constraint
+{
+	BitTable vector ; // subexon vector
+	double weight ;
+	double normAbund ;
+	double abundance ;
+	int support ;
+	int uniqSupport ;
+	int maxReadLen ; // the longest read length support this constraint.
+
+	int info ; // other usages.
+	int first, last ; // indicate the first and last index of the subexons. 
+} ;
+
+struct _matePairConstraint
+{
+	int i, j ;
+	
+	int support ;
+	int uniqSupport ;
+
+	double abundance ;
+	double normAbund ;
+	int effectiveCount ;
+	int type ;
+} ;
+
+struct _readIdHeap
+{
+	char *readId ;
+	int pos ;
+	int matePos ;
+	int idx ;
+} ;
+
+//----------------------------------------------------------------------------------
+// We assume the access to the data structure is sorted by matePos.
+// So we can "pre-cache" the ids with the same matePos.
+class MateReadIds
+{
+private:
+	std::vector< struct _readIdHeap > heap ;
+
+	void HeapifyUp( int tag )
+	{
+		while ( tag > 1 )
+		{
+			if ( heap[tag / 2].matePos < heap[tag].matePos )
+				return ;
+			struct _readIdHeap tmp ;
+			tmp = heap[tag / 2] ;
+			heap[tag / 2] = heap[tag] ;
+			heap[tag] = tmp ;
+
+			tag /= 2 ;
+		}
+	}
+
+	void HeapifyDown( int tag )
+	{
+		int size = heap.size() ;
+		while ( 2 * tag < size )
+		{
+			int choose = 2 * tag ;
+			if ( 2 * tag + 1 < size && 
+				heap[ 2 * tag + 1].matePos < heap[2 * tag ].matePos )
+			{
+				choose = 2 * tag + 1 ;
+			}
+			
+			if ( heap[tag].matePos < heap[choose].matePos )
+				return ;
+
+			struct _readIdHeap tmp ; 
+			tmp = heap[choose] ;
+			heap[choose] = heap[tag] ;
+			heap[tag] = tmp ;
+
+			tag = choose ;
+		}
+	}
+
+	struct _readIdHeap Pop()
+	{
+		struct _readIdHeap ret ;
+		int size = heap.size() ;
+		if ( size < 2 )
+		{
+			ret.readId = NULL ;
+			return ret ;
+		}
+
+		ret = heap[1] ;
+		
+		heap[1] = heap[ heap.size() - 1] ;
+		heap.pop_back() ;
+		HeapifyDown( 1 ) ;
+
+		return ret ;
+	}
+
+	int cachedMatePos ;
+	std::map<std::string, int> cachedIdx ;
+	bool hasMateReadIdSuffix ; // ignore the last ".{1,2}" or "/{1,2}" .
+public:
+	MateReadIds() 
+	{ 
+		// Push a dummy element so the vector becomes 1-based.
+		struct _readIdHeap nh ;
+		nh.readId = NULL ; 
+		nh.pos = nh.idx = nh.matePos = -1 ;
+		heap.push_back( nh ) ;
+		cachedMatePos = -1 ;
+		hasMateReadIdSuffix = false ;
+	}
+	~MateReadIds() 
+	{
+		int size = heap.size() ;
+		std::map<std::string, int>().swap( cachedIdx ) ;
+		for ( int i = 0 ; i < size ; ++i )
+			if ( heap[i].readId != NULL )
+				free( heap[i].readId ) ;
+	}
+
+	void Clear()
+	{
+		std::map<std::string, int>().swap( cachedIdx ) ;
+		
+		int size = heap.size() ;
+		for ( int i = 0 ; i < size ; ++i )
+			if ( heap[i].readId != NULL )
+				free( heap[i].readId ) ;
+		std::vector<struct _readIdHeap>().swap( heap ) ;
+
+		struct _readIdHeap nh ;
+		nh.readId = NULL ; 
+		nh.pos = nh.idx = nh.matePos = -1 ;
+		heap.push_back( nh ) ;
+		cachedMatePos = -1 ;
+	}
+
+	void Insert( char *id, int pos, int idx, int matePos )
+	{
+		struct _readIdHeap nh ;
+		nh.readId = strdup( id ) ;
+		nh.pos = pos ;
+		nh.idx = idx ;
+		nh.matePos = matePos ;
+		
+		heap.push_back( nh ) ;
+		HeapifyUp( heap.size() - 1 ) ;
+	}
+	
+	// If the id does not exist, return -1.
+	int Query( char *id, int matePos )
+	{
+		int size ;
+		size = heap.size() ;
+		if ( matePos > cachedMatePos )
+		{
+			std::map<std::string, int>().swap( cachedIdx ) ;
+			
+			while ( size >= 2 && heap[1].matePos < matePos )
+			{
+				struct _readIdHeap r = Pop() ;
+				if ( r.readId )
+				{
+					free( r.readId ) ;
+				}
+				--size ;
+			}
+
+			while ( size >= 2 && heap[1].matePos == matePos )
+			{
+				struct _readIdHeap r = Pop() ;
+				cachedIdx[ std::string( r.readId ) ] = r.idx ;
+				if ( r.readId )
+					free( r.readId ) ;
+				--size ;
+			}
+			cachedMatePos = matePos ;
+		}
+		std::string s( id ) ;
+		if ( hasMateReadIdSuffix )
+		{
+			int len = s.length() ;
+			if ( len >= 2 && ( s[len - 1] == '1' || s[len - 1] == '2' ) 
+				&& ( s[len - 2] == '.' || s[len - 2] == '/' ) )
+			{
+				s[len - 1] = '2' - s[len - 1] + '1' ;
+			}
+		}
+
+		if ( cachedIdx.find( s ) != cachedIdx.end() )
+		{
+			return cachedIdx[s] ;
+		}
+		return -1 ;	
+	}
+	
+	void UpdateIdx( std::vector<int> &newIdx )
+	{
+		int size = heap.size() ;
+		int i ;
+		for ( i = 1 ; i < size ; ++i )
+		{
+			heap[i].idx = newIdx[ heap[i].idx ] ;
+		}
+		
+		for ( std::map<std::string, int>::iterator it = cachedIdx.begin() ; it != cachedIdx.end() ; ++it )
+			it->second = newIdx[ it->second ] ;
+	}
+
+	void SetHasMateReadIdSuffix( bool in )
+	{
+		hasMateReadIdSuffix = true ;
+	}
+} ;
+
+
+//--------------------------------------------------------------------------
+class Constraints
+{
+private:
+	int prevStart, prevEnd ;	
+	bool usePrimaryAsUnique ;
+	MateReadIds mateReadIds ;
+
+	Alignments *pAlignments ;
+	
+	//@return: whether this alignment is compatible with the subexons or not.
+	bool ConvertAlignmentToBitTable( struct _pair *segments, int segCnt, struct _subexon *subexons, int seCnt, int seStart, struct _constraint &ct ) ;
+
+	// Sort to increasing order. Since the first subexon occupies the least important digit.
+	static bool CompSortConstraints( const struct _constraint &a, const struct _constraint &b )
+	{
+		//int k 
+		if ( a.first < b.first )
+			return true ;
+		else if ( a.first > b.first )
+			return false ;
+
+		int diffPos = a.vector.GetFirstDifference( b.vector ) ;
+		if ( diffPos == -1 ) // case of equal.
+			return false ;
+
+		if ( a.vector.Test( diffPos ))
+			return false ;
+		else
+			return true ;
+	}
+
+	static bool CompSortMatePairs( const struct _matePairConstraint &a, const struct _matePairConstraint &b )
+	{
+		if ( a.i < b.i )
+			return true ;
+		else if ( a.i > b.i )
+			return false ;
+		else
+		{
+			if ( a.j < b.j )	
+				return true ;
+			else
+				return false ;
+		}
+	}
+
+	void CoalesceSameConstraints() ;
+	void ComputeNormAbund( struct _subexon *subexons ) ;
+public:
+	std::vector<struct _constraint> constraints ;
+	std::vector<struct _matePairConstraint> matePairs ; 
+	
+	Constraints() 
+	{
+		usePrimaryAsUnique = false ;
+	} 
+
+	Constraints( Alignments *a ): pAlignments( a ) 
+	{
+	}
+	
+	~Constraints() 
+	{
+		int i ;
+		int size = constraints.size() ;
+		for ( i = 0 ; i < size ; ++i )
+			constraints[i].vector.Release() ;
+		constraints.clear() ;
+		std::vector<struct _constraint>().swap( constraints ) ;
+		matePairs.clear() ;
+		std::vector<struct _matePairConstraint>().swap( matePairs ) ;
+	}
+
+	void Clear() 
+	{
+		//TODO: do I need to release the memory from BitTable?
+		constraints.clear() ;
+	}
+
+	void SetAlignments( Alignments *a )
+	{
+		pAlignments = a ;
+	}
+
+	void SetUsePrimaryAsUnique( bool in )
+	{
+		usePrimaryAsUnique = in ;
+	}
+
+	void Assign( Constraints &c )
+	{
+		int i ;
+		int size = constraints.size() ;
+		if ( size > 0 )
+		{
+			for ( i = 0 ; i < size ; ++i )
+				constraints[i].vector.Release() ;
+			constraints.clear() ;
+			std::vector<struct _constraint>().swap( constraints ) ;
+		}
+		matePairs.clear() ;
+		std::vector<struct _matePairConstraint>().swap( matePairs ) ;
+		
+		//constraints.resize( c.constraints.size() ) ;
+		constraints = c.constraints ;
+		size = c.constraints.size() ;
+		for ( i = 0 ; i < size ; ++i )
+		{
+			/*struct _constraint nc ;
+			nc.weight = c.constraints[i].weight ;
+			nc.normAbund = c.constraints[i].normAbund ;
+			nc.abundance = c.constraints[i].abundance ;
+			nc.support = c.constraints[i].support ;
+			nc.uniqSupport = c.constraints[i].uniqSupport ;
+			nc.maxReadLen = c.constraints[i].maxReadLen ;
+			nc.info = c.constraints[i].info ;
+			nc.first = c.constraints[i].first ;
+			nc.last = c.constraints[i].last ;
+			nc.vector.Duplicate( c.constraints[i].vector ) ;
+			constraints[i] = ( nc ) ; */
+			constraints[i].vector.Nullify() ; // so that it won't affect the BitTable in "c"
+			constraints[i].vector.Duplicate( c.constraints[i].vector ) ;
+		}
+		matePairs = c.matePairs ;
+		pAlignments = c.pAlignments ;
+	}
+
+	void DownsampleConstraintsFrom( Constraints &c, int stride = 10 )
+	{
+		int i ;
+		int size = constraints.size(), k ;
+
+		if ( size > 0 )
+		{
+			for ( i = 0 ; i < size ; ++i )
+				constraints[i].vector.Release() ;
+			constraints.clear() ;
+			std::vector<struct _constraint>().swap( constraints ) ;
+		}
+		matePairs.clear() ;
+		std::vector<struct _matePairConstraint>().swap( matePairs ) ;
+		
+		//constraints.resize( c.constraints.size() ) ;
+		//constraints = c.constraints ;
+		k = 0 ;
+		size = c.constraints.size() ;
+		for ( i = 0 ; i < size ; i += stride, ++k  )
+		{
+			constraints.push_back( c.constraints[i] ) ;
+			constraints[k].vector.Nullify() ; // so that it won't affect the BitTable in "c"
+			constraints[k].vector.Duplicate( c.constraints[i].vector ) ;
+
+			/*std::vector<int> seIdx ;
+			constraints[k].vector.GetOnesIndices( seIdx ) ; 
+			int j, l = seIdx.size() ;
+			for ( j = 2 ; j < l ; ++j )
+			{
+				constraints[k].vector.Unset( seIdx[j] ) ;
+			}
+			constraints[k].last = seIdx[1] ;*/
+		}
+		// mate pairs is not used. if we down-sampling
+		pAlignments = c.pAlignments ;
+	}
+	
+	void TruncateConstraintsCoverFrom( Constraints &c, int seCnt, int maxConstraintSize )
+	{
+		int i ;
+		int size = constraints.size() ;
+
+		if ( size > 0 )
+		{
+			for ( i = 0 ; i < size ; ++i )
+				constraints[i].vector.Release() ;
+			constraints.clear() ;
+			std::vector<struct _constraint>().swap( constraints ) ;
+		}
+		matePairs.clear() ;
+		std::vector<struct _matePairConstraint>().swap( matePairs ) ;
+
+		//constraints.resize( c.constraints.size() ) ;
+		//constraints = c.constraints ;
+		size = c.constraints.size() ;
+		for ( i = 0 ; i < size ; ++i  )
+		{
+			constraints.push_back( c.constraints[i] ) ;
+			constraints[i].vector.Nullify() ; // so that it won't affect the BitTable in "c"
+			constraints[i].vector.Init( seCnt ) ;
+			std::vector<int> seIdx ;
+			c.constraints[i].vector.GetOnesIndices( seIdx ) ; 
+			int j, l = seIdx.size() ;
+			for ( j = 0 ; j < maxConstraintSize && j < l ; ++j )
+			{
+				constraints[i].vector.Set( seIdx[j] ) ;
+			}
+			constraints[i].last = seIdx[j - 1] ;
+		}
+		// mate pairs is not used. if we down-sampling
+		pAlignments = c.pAlignments ;
+	}
+		
+	void SetHasMateReadIdSuffix( bool in )
+	{
+		mateReadIds.SetHasMateReadIdSuffix( in ) ;
+	}
+
+	int BuildConstraints( struct _subexon *subexons, int seCnt, int start, int end ) ;
+
+} ;
+
+#endif