Mercurial > repos > lsong10 > psiclass
diff PsiCLASS-1.0.2/SubexonGraph.hpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/SubexonGraph.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,348 @@ +#ifndef _MOURISL_CLASSES_SUBEXONGRAPH_HEADER +#define _MOURISL_CLASSES_SUBEXONGRAPH_HEADER + +#include "alignments.hpp" +#include "blocks.hpp" + +struct _subexon +{ + int chrId ; + int geneId ; + int start, end ; + int leftType, rightType ; + double avgDepth ; + //double ratio, classifier ; + double leftRatio, rightRatio ; + double leftClassifier, rightClassifier ; + int lcCnt, rcCnt ; + int leftStrand, rightStrand ; + + int nextCnt, prevCnt ; + int *next, *prev ; + + bool canBeStart, canBeEnd ; +} ; + +struct _geneInterval +{ + int startIdx, endIdx ; + int start, end ; // The start and end of a gene interval might be adjusted, so it does not + // need to be match with the corresponding subexons +} ; + +class SubexonGraph +{ +private: + int *visit ; + double classifierThreshold ; + + int usedGeneId ; + int baseGeneId ; + + // The function to assign gene ids to subexons. + void SetGeneId( int tag, int strand, struct _subexon *subexons, int seCnt, int id ) ; + void GetGeneBoundary( int tag, int &boundary, int timeStamp ) ; + void UpdateGeneId( struct _subexon *subexons, int seCnt ) ; +public: + std::vector<struct _subexon> subexons ; + std::vector<struct _geneInterval> geneIntervals ; + + ~SubexonGraph() + { + int i ; + int size = subexons.size() ; + for ( i = 0 ; i < size ; ++i ) + { + if ( subexons[i].next ) + delete[] subexons[i].next ; + if ( subexons[i].prev ) + delete[] subexons[i].prev ; + } + } + + SubexonGraph( double classifierThreshold, Alignments &bam, FILE *fpSubexon ) + { + // Read in the subexons + rewind( fpSubexon ) ; + char buffer[2048] ; + int subexonCnt ; + int i, j, k ; + while ( fgets( buffer, sizeof( buffer ), fpSubexon ) != NULL ) + { + if ( buffer[0] == '#' ) + continue ; + + struct _subexon se ; + InputSubexon( buffer, bam, se, true ) ; + + // filter. + if ( ( se.leftType == 0 && se.rightType == 0 ) + || ( se.leftType == 0 && se.rightType == 1 ) // overhang + || ( se.leftType == 2 && se.rightType == 0 ) // overhang + || ( se.leftType == 2 && se.rightType == 1 ) ) // ir + { + if ( ( se.leftType == 0 && se.rightType == 1 ) + || ( se.leftType == 2 && se.rightType == 0 ) ) // if the overhang is too small + { + if ( se.end - se.start + 1 <= 7 ) + { + if ( se.next ) + delete[] se.next ; + if ( se.prev ) + delete[] se.prev ; + continue ; + } + } + + if ( se.leftClassifier >= classifierThreshold || se.leftClassifier < 0 ) + { + if ( se.next ) + delete[] se.next ; + if ( se.prev ) + delete[] se.prev ; + continue ; + } + } + + // Adjust the coordinate. + subexons.push_back( se ) ; + } + + // Convert the coordinate to index + // Note that each coordinate can only associate with one subexon. + subexonCnt = subexons.size() ; + for ( i = 0 ; i < subexonCnt ; ++i ) + { + struct _subexon &se = subexons[i] ; + //printf( "hi1 %d: %d %d\n", i, se.prevCnt, se.prev[0] ) ; + int cnt = 0 ; + + // due to filter, we may not fully match the coordinate and the subexon + int bound = 0 ; + if ( se.prevCnt > 0 ) + bound = se.prev[0] ; + for ( j = i - 1, k = 0 ; k < se.prevCnt && j >= 0 && subexons[j].end >= bound ; --j ) + { + //printf( " %d %d: %d %d\n", j, k, se.prev[ se.prevCnt - 1 - k], subexons[j].end ) ; + if ( subexons[j].end == se.prev[se.prevCnt - 1 - k] ) // notice the order is reversed + { + se.prev[se.prevCnt - 1 - cnt] = j ; + ++k ; + ++cnt ; + } + else if ( subexons[j].end < se.prev[ se.prevCnt - 1 - k ] ) // the corresponding subexon gets filtered. + { + ++k ; + ++j ; // counter the --j in the loop + } + } + //printf( "hi2 %d : %d\n", i, se.prevCnt ) ; + // shft the list + for ( j = 0, k = se.prevCnt - cnt ; j < cnt ; ++j, ++k ) + { + se.prev[j] = se.prev[k] ; + } + se.prevCnt = cnt ; + cnt = 0 ; + if ( se.nextCnt > 0 ) + bound = se.next[ se.nextCnt - 1] ; + for ( j = i + 1, k = 0 ; k < se.nextCnt && j < subexonCnt && subexons[j].start <= bound ; ++j ) + { + if ( subexons[j].start == se.next[k] ) + { + se.next[cnt] = j ; // cnt is always less than k, so we don't need to worry about overwrite. + ++k ; + ++cnt ; + } + else if ( subexons[j].start > se.next[k] ) + { + ++k ; + --j ; + } + } + se.nextCnt = cnt ; + } + + // Adjust the coordinate + int seCnt = subexons.size() ; + for ( i = 0 ; i < seCnt ; ++i ) + { + --subexons[i].start ; + --subexons[i].end ; + } + rewind( fpSubexon ) ; + + // Adjust the classifier for hard boundary, if there is a overhang attached to that region. + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( subexons[i].leftType == 1 && subexons[i].leftClassifier < 1 ) + { + for ( j = i - 1 ; j >= 0 ; --j ) + if ( subexons[j].end < subexons[j + 1].start - 1 ) + break ; + if ( subexons[j + 1].leftType == 0 ) + subexons[i].leftClassifier = 1 ; + } + if ( subexons[i].rightType == 2 && subexons[i].rightClassifier < 1 ) + { + for ( j = i + 1 ; j < seCnt ; ++j ) + if ( subexons[j].start > subexons[j - 1].end + 1 ) + break ; + if ( subexons[j - 1].rightType == 0 ) + subexons[i].rightClassifier = 1 ; + } + } + + // For the region of mixture of plus and minus strand subexons, if there is + // no overhang attached to it, we need to let the hard boundary be a candidate terminal sites. + for ( i = 0 ; i < seCnt ; ) + { + // [i,j) is a region + int support[2] = {0, 0} ; // the index, 0 is for minus strand, 1 is for plus strand + for ( j = i + 1 ; j < seCnt ; ++j ) + { + if ( subexons[j].start > subexons[j - 1].end + 1 ) + break ; + } + + for ( k = i ; k < j ; ++k ) + { + if ( subexons[k].leftStrand != 0 ) + ++support[ ( subexons[k].leftStrand + 1 ) / 2 ] ; + if ( subexons[k].rightStrand != 0 ) + ++support[ ( subexons[k].rightStrand + 1 ) / 2 ] ; + } + if ( support[0] == 0 || support[1] == 0 ) + { + i = j ; + continue ; + } + // a mixture region. + // We force a terminal site if we have only coming-in and no going-out introns. + int leftSupport[2] = {0, 0}, rightSupport[2] = {0, 0}; + int l ; + for ( k = i ; k < j ; ++k ) + { + int cnt = subexons[k].prevCnt ; + if ( subexons[k].leftStrand != 0 ) + for ( l = 0 ; l < cnt ; ++l ) + if ( subexons[k].prev[l] < i ) + { + ++leftSupport[ ( subexons[k].leftStrand + 1 ) / 2 ] ; + break ; + } + cnt = subexons[k].nextCnt ; + if ( subexons[k].rightStrand != 0 ) + for ( l = 0 ; l < cnt ; ++l ) + if ( subexons[k].next[l] >= j ) + { + ++rightSupport[ ( subexons[k].rightStrand + 1 ) / 2 ] ; + break ; + } + } + + if ( ( ( leftSupport[0] > 0 && rightSupport[0] == 0 ) || + ( leftSupport[1] > 0 && rightSupport[1] == 0 ) ) && + subexons[j - 1].rightType != 0 ) + { + subexons[j - 1].rightClassifier = 0 ; + } + + if ( ( ( leftSupport[0] == 0 && rightSupport[0] > 0 ) || + ( leftSupport[1] == 0 && rightSupport[1] > 0 ) ) && + subexons[j - 1].leftType != 0 ) + { + subexons[j - 1].leftClassifier = 0 ; + } + + i = j ; + } + + this->classifierThreshold = classifierThreshold ; + + usedGeneId = baseGeneId = 0 ; + } + + static bool IsSameStrand( int a, int b ) + { + if ( a == 0 || b == 0 ) + return true ; + if ( a != b ) + return false ; + return true ; + } + // Parse the input line + static int InputSubexon( char *in, Alignments &alignments, struct _subexon &se, bool needPrevNext = false ) + { + int i ; + char chrName[50] ; + char ls[3], rs[3] ; + sscanf( in, "%s %d %d %d %d %s %s %lf %lf %lf %lf %lf", chrName, &se.start, &se.end, &se.leftType, &se.rightType, ls, rs, + &se.avgDepth, &se.leftRatio, &se.rightRatio, + &se.leftClassifier, &se.rightClassifier ) ; + se.chrId = alignments.GetChromIdFromName( chrName ) ; + se.nextCnt = se.prevCnt = 0 ; + se.next = se.prev = NULL ; + se.lcCnt = se.rcCnt = 0 ; + + if ( ls[0] == '+' ) + se.leftStrand = 1 ; + else if ( ls[0] == '-' ) + se.leftStrand = -1 ; + else + se.leftStrand = 0 ; + + if ( rs[0] == '+' ) + se.rightStrand = 1 ; + else if ( rs[0] == '-' ) + se.rightStrand = -1 ; + else + se.rightStrand = 0 ; + + if ( needPrevNext ) + { + char *p = in ; + // Locate the offset for prevCnt + for ( i = 0 ; i <= 11 ; ++i ) + { + p = strchr( p, ' ' ) ; + ++p ; + } + + sscanf( p, "%d", &se.prevCnt ) ; + p = strchr( p, ' ' ) ; + ++p ; + se.prev = new int[ se.prevCnt ] ; + for ( i = 0 ; i < se.prevCnt ; ++i ) + { + sscanf( p, "%d", &se.prev[i] ) ; + p = strchr( p, ' ' ) ; + ++p ; + } + + sscanf( p, "%d", &se.nextCnt ) ; + p = strchr( p, ' ' ) ; + ++p ; + se.next = new int[ se.nextCnt ] ; + for ( i = 0 ; i < se.nextCnt ; ++i ) + { + sscanf( p, "%d", &se.next[i] ) ; + p = strchr( p, ' ' ) ; + ++p ; + } + + } + return 1 ; + } + + int GetGeneIntervalIdx( int startIdx, int &endIdx, int timeStamp ) ; + + //@return: the number of intervals found + int ComputeGeneIntervals() ; + + // Return a list of subexons in that interval and in retList the id of subexon + // should be adjusted to start from 0. + int ExtractSubexons( int startIdx, int endIdx, struct _subexon *retList ) ; +} ; + +#endif