diff PsiCLASS-1.0.2/SubexonGraph.hpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PsiCLASS-1.0.2/SubexonGraph.hpp	Fri Mar 26 16:52:45 2021 +0000
@@ -0,0 +1,348 @@
+#ifndef _MOURISL_CLASSES_SUBEXONGRAPH_HEADER
+#define _MOURISL_CLASSES_SUBEXONGRAPH_HEADER
+
+#include "alignments.hpp"
+#include "blocks.hpp"
+
+struct _subexon
+{
+	int chrId ;
+	int geneId ;
+	int start, end ;
+	int leftType, rightType ;
+	double avgDepth ;
+	//double ratio, classifier ;
+	double leftRatio, rightRatio ;
+	double leftClassifier, rightClassifier ;
+	int lcCnt, rcCnt ;
+	int leftStrand, rightStrand ;
+	
+	int nextCnt, prevCnt ;
+	int *next, *prev ;
+	
+	bool canBeStart, canBeEnd ;
+} ;
+
+struct _geneInterval
+{
+	int startIdx, endIdx ;
+	int start, end ; // The start and end of a gene interval might be adjusted, so it does not 
+			// need to be match with the corresponding subexons
+} ;
+
+class SubexonGraph
+{
+private:
+	int *visit ;
+	double classifierThreshold ;
+
+	int usedGeneId ;
+	int baseGeneId ;
+
+	// The function to assign gene ids to subexons.
+	void SetGeneId( int tag, int strand, struct _subexon *subexons, int seCnt, int id ) ;
+	void GetGeneBoundary( int tag, int &boundary, int timeStamp ) ;
+	void UpdateGeneId( struct _subexon *subexons, int seCnt ) ;
+public:
+	std::vector<struct _subexon> subexons ;
+	std::vector<struct _geneInterval> geneIntervals ;
+
+	~SubexonGraph() 
+	{
+		int i ;
+		int size = subexons.size() ;
+		for ( i = 0 ; i < size ; ++i )
+		{
+			if ( subexons[i].next )
+				delete[] subexons[i].next ;
+			if ( subexons[i].prev )
+				delete[] subexons[i].prev ;
+		}
+	} 
+
+	SubexonGraph( double classifierThreshold, Alignments &bam, FILE *fpSubexon ) 
+	{ 
+		// Read in the subexons
+		rewind( fpSubexon ) ; 
+		char buffer[2048] ;
+		int subexonCnt ;
+		int i, j, k ;
+		while ( fgets( buffer, sizeof( buffer ), fpSubexon ) != NULL )
+		{
+			if ( buffer[0] == '#' )
+				continue ;
+
+			struct _subexon se ;
+			InputSubexon( buffer, bam, se, true ) ;
+
+			// filter.
+			if ( ( se.leftType == 0 && se.rightType == 0 ) 
+				|| ( se.leftType == 0 && se.rightType == 1 ) 	// overhang
+				|| ( se.leftType == 2 && se.rightType == 0 ) // overhang
+				|| ( se.leftType == 2 && se.rightType == 1 ) ) // ir
+			{
+				if ( ( se.leftType == 0 && se.rightType == 1 ) 
+					|| ( se.leftType == 2 && se.rightType == 0 ) ) // if the overhang is too small
+				{
+					if ( se.end - se.start + 1 <= 7 )
+					{
+						if ( se.next )
+							delete[] se.next ;
+						if ( se.prev )
+							delete[] se.prev ;
+						continue ;
+					}
+				}
+
+				if ( se.leftClassifier >= classifierThreshold || se.leftClassifier < 0 )
+				{
+					if ( se.next )
+						delete[] se.next ;
+					if ( se.prev )
+						delete[] se.prev ;
+					continue ;
+				}
+			}
+			
+			// Adjust the coordinate.
+			subexons.push_back( se ) ;	
+		}
+
+		// Convert the coordinate to index
+		// Note that each coordinate can only associate with one subexon.
+		subexonCnt = subexons.size() ;
+		for ( i = 0 ; i < subexonCnt ; ++i )
+		{	
+			struct _subexon &se = subexons[i] ;
+			//printf( "hi1 %d: %d %d\n", i, se.prevCnt, se.prev[0] ) ;
+			int cnt = 0 ;
+
+			// due to filter, we may not fully match the coordinate and the subexon
+			int bound = 0 ;
+			if ( se.prevCnt > 0 )
+				bound = se.prev[0] ;
+			for ( j = i - 1, k = 0 ; k < se.prevCnt && j >= 0 && subexons[j].end >= bound ; --j )
+			{
+				//printf( " %d %d: %d %d\n", j, k, se.prev[ se.prevCnt - 1 - k], subexons[j].end ) ;
+				if ( subexons[j].end == se.prev[se.prevCnt - 1 - k] ) // notice the order is reversed
+				{
+					se.prev[se.prevCnt - 1 - cnt] = j ;
+					++k ;
+					++cnt ;
+				}
+				else if ( subexons[j].end < se.prev[ se.prevCnt - 1 - k ] ) // the corresponding subexon gets filtered.
+				{
+					++k ;
+					++j ; // counter the --j in the loop
+				}
+			}
+			//printf( "hi2 %d : %d\n", i, se.prevCnt ) ;
+			// shft the list
+			for ( j = 0, k = se.prevCnt - cnt ; j < cnt ; ++j, ++k )
+			{
+				se.prev[j] = se.prev[k] ;
+			}
+			se.prevCnt = cnt ;
+			cnt = 0 ;
+			if ( se.nextCnt > 0 )
+				bound = se.next[ se.nextCnt - 1] ;
+			for ( j = i + 1, k = 0 ; k < se.nextCnt && j < subexonCnt && subexons[j].start <= bound ; ++j )
+			{
+				if ( subexons[j].start == se.next[k] )
+				{
+					se.next[cnt] = j ; // cnt is always less than k, so we don't need to worry about overwrite.
+					++k ;
+					++cnt ;
+				}
+				else if ( subexons[j].start > se.next[k] ) 
+				{
+					++k ;
+					--j ;
+				}
+			}
+			se.nextCnt = cnt ;
+		}
+
+		// Adjust the coordinate
+		int seCnt = subexons.size() ;
+		for ( i = 0 ; i < seCnt ; ++i )
+		{
+			--subexons[i].start ;
+			--subexons[i].end ;
+		}
+		rewind( fpSubexon ) ;
+		
+		// Adjust the classifier for hard boundary, if there is a overhang attached to that region.
+		for ( i = 0 ; i < seCnt ; ++i )
+		{
+			if ( subexons[i].leftType == 1 && subexons[i].leftClassifier < 1 )
+			{
+				for ( j = i - 1 ; j >= 0 ; --j )
+					if ( subexons[j].end < subexons[j + 1].start - 1 )
+						break ;
+				if ( subexons[j + 1].leftType == 0 )
+					subexons[i].leftClassifier = 1 ;
+			}
+			if ( subexons[i].rightType == 2 && subexons[i].rightClassifier < 1 )
+			{
+				for ( j = i + 1 ; j < seCnt ; ++j )
+					if ( subexons[j].start > subexons[j - 1].end + 1 )
+						break ;
+				if ( subexons[j - 1].rightType == 0 )
+					subexons[i].rightClassifier = 1 ;
+			}
+		}
+
+		// For the region of mixture of plus and minus strand subexons, if there is
+		// no overhang attached to it, we need to let the hard boundary be a candidate terminal sites.
+		for ( i = 0 ; i < seCnt ; )
+		{
+			// [i,j) is a region
+			int support[2] = {0, 0} ; // the index, 0 is for minus strand, 1 is for plus strand
+			for ( j = i + 1 ; j < seCnt ; ++j )
+			{
+				if ( subexons[j].start > subexons[j - 1].end + 1 )	
+					break ;
+			}
+
+			for ( k = i ; k < j ; ++k )
+			{
+				if ( subexons[k].leftStrand != 0 )
+					++support[ ( subexons[k].leftStrand + 1 ) / 2 ] ;
+				if ( subexons[k].rightStrand != 0 )
+					++support[ ( subexons[k].rightStrand + 1 ) / 2 ] ;
+			}
+			if ( support[0] == 0 || support[1] == 0 )
+			{
+				i = j ;
+				continue ;
+			}
+			// a mixture region. 
+			// We force a terminal site if we have only coming-in and no going-out introns.
+			int leftSupport[2] = {0, 0}, rightSupport[2] = {0, 0};
+			int l ;
+			for ( k = i ; k < j ; ++k )
+			{
+				int cnt = subexons[k].prevCnt ; 
+				if ( subexons[k].leftStrand != 0 )
+					for ( l = 0 ; l < cnt ; ++l )
+						if ( subexons[k].prev[l] < i )
+						{
+							++leftSupport[ ( subexons[k].leftStrand + 1 ) / 2 ] ;
+							break ;
+						}
+				cnt = subexons[k].nextCnt ; 
+				if ( subexons[k].rightStrand != 0 )
+					for ( l = 0 ; l < cnt ; ++l )
+						if ( subexons[k].next[l] >= j )
+						{
+							++rightSupport[ ( subexons[k].rightStrand + 1 ) / 2 ] ;
+							break ;
+						}
+			}
+
+			if ( ( ( leftSupport[0] > 0 && rightSupport[0] == 0 ) || 
+				( leftSupport[1] > 0 && rightSupport[1] == 0 ) ) &&
+				subexons[j - 1].rightType != 0 )
+			{
+				subexons[j - 1].rightClassifier = 0 ;
+			}
+
+			if ( ( ( leftSupport[0] == 0 && rightSupport[0] > 0 ) || 
+				( leftSupport[1] == 0 && rightSupport[1] > 0 ) ) &&
+				subexons[j - 1].leftType != 0 )
+			{
+				subexons[j - 1].leftClassifier = 0 ;
+			}
+
+			i = j ;
+		}
+
+		this->classifierThreshold = classifierThreshold ;
+
+		usedGeneId = baseGeneId = 0 ;
+	} 
+	
+	static bool IsSameStrand( int a, int b )
+	{
+		if ( a == 0 || b == 0 )
+			return true ;
+		if ( a != b )
+			return false ;
+		return true ;
+	}
+	// Parse the input line
+	static int InputSubexon( char *in, Alignments &alignments, struct _subexon &se, bool needPrevNext = false )
+	{
+		int i ;
+		char chrName[50] ;
+		char ls[3], rs[3] ;	
+		sscanf( in, "%s %d %d %d %d %s %s %lf %lf %lf %lf %lf", chrName, &se.start, &se.end, &se.leftType, &se.rightType, ls, rs,
+				&se.avgDepth, &se.leftRatio, &se.rightRatio, 
+				&se.leftClassifier, &se.rightClassifier ) ;	
+		se.chrId = alignments.GetChromIdFromName( chrName ) ;
+		se.nextCnt = se.prevCnt = 0 ;
+		se.next = se.prev = NULL ;
+		se.lcCnt = se.rcCnt = 0 ;
+
+		if ( ls[0] == '+' )
+			se.leftStrand = 1 ;
+		else if ( ls[0] == '-' )
+			se.leftStrand = -1 ;
+		else
+			se.leftStrand = 0 ;
+
+		if ( rs[0] == '+' )
+			se.rightStrand = 1 ;
+		else if ( rs[0] == '-' )
+			se.rightStrand = -1 ;
+		else
+			se.rightStrand = 0 ;
+
+		if ( needPrevNext )
+		{
+			char *p = in ;
+			// Locate the offset for prevCnt
+			for ( i = 0 ; i <= 11 ; ++i )
+			{
+				p = strchr( p, ' ' ) ;
+				++p ;
+			}
+
+			sscanf( p, "%d", &se.prevCnt ) ;
+			p = strchr( p, ' ' ) ;
+			++p ;
+			se.prev = new int[ se.prevCnt ] ;
+			for ( i = 0 ; i < se.prevCnt ; ++i )
+			{
+				sscanf( p, "%d", &se.prev[i] ) ;
+				p = strchr( p, ' ' ) ;
+				++p ;
+			}
+
+			sscanf( p, "%d", &se.nextCnt ) ;
+			p = strchr( p, ' ' ) ;
+			++p ;
+			se.next = new int[ se.nextCnt ] ;
+			for ( i = 0 ; i < se.nextCnt ; ++i )
+			{
+				sscanf( p, "%d", &se.next[i] ) ;
+				p = strchr( p, ' ' ) ;
+				++p ;
+			}
+			
+		}
+		return 1 ;
+	}
+	
+	int GetGeneIntervalIdx( int startIdx, int &endIdx, int timeStamp ) ;
+
+	//@return: the number of intervals found
+	int ComputeGeneIntervals() ;
+	
+	// Return a list of subexons in that interval and in retList the id of subexon 
+	// should be adjusted to start from 0.
+	int ExtractSubexons( int startIdx, int endIdx, struct _subexon *retList ) ;
+} ;
+
+#endif