Mercurial > repos > lsong10 > psiclass
diff PsiCLASS-1.0.2/SubexonGraph.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/SubexonGraph.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,241 @@ +#include "SubexonGraph.hpp" + +void SubexonGraph::GetGeneBoundary( int tag, int &boundary, int timeStamp ) +{ + if ( visit[tag] == timeStamp ) + return ; + //printf( "%d %d\n", tag, timeStamp ) ; + visit[tag] = timeStamp ; + if ( subexons[tag].end > boundary ) + boundary = subexons[tag].end ; + //if ( subexons[tag].start == 2858011 ) + // printf( "%d: %d %d\n", tag, subexons[tag].nextCnt, subexons[tag].prevCnt) ; + int i ; + int cnt = subexons[tag].nextCnt ; + for ( i = 0 ; i < cnt ; ++i ) + { + //printf( "next of %d: %d %d\n", tag, i, subexons[tag].next[i] ) ; + GetGeneBoundary( subexons[tag].next[i], boundary, timeStamp ) ; + } +} + +int SubexonGraph::GetGeneIntervalIdx( int startIdx, int &endIdx, int timeStamp ) +{ + int i ; + int seCnt = subexons.size() ; + if ( startIdx >= seCnt ) + return -1 ; + int farthest = -1 ; + GetGeneBoundary( startIdx, farthest, timeStamp ) ; + + for ( i = startIdx + 1 ; i < seCnt ; ++i ) + { + if ( subexons[i].start > farthest || subexons[i].chrId != subexons[ startIdx ].chrId ) + break ; + + GetGeneBoundary( i, farthest, timeStamp ) ; + } + endIdx = i - 1 ; + + return endIdx ; +} + +int SubexonGraph::ComputeGeneIntervals() +{ + int i, cnt ; + int seCnt = subexons.size() ; + visit = new int[seCnt] ; + memset( visit, -1, sizeof( int ) * seCnt ) ; + int tag = 0 ; + cnt = 0 ; + while ( 1 ) + { + struct _geneInterval ngi ; + //printf( "%d %d %d\n", tag, subexons[tag].start + 1, subexons[tag].end + 1 ) ; + if ( GetGeneIntervalIdx( tag, ngi.endIdx, cnt ) == -1 ) + break ; + ++cnt ; + ngi.startIdx = tag ; + ngi.start = subexons[ ngi.startIdx ].start ; + ngi.end = subexons[ ngi.endIdx ].end ; + + tag = ngi.endIdx + 1 ; + // Adjust the extent + // Adjust the start + if ( subexons[ ngi.startIdx ].leftStrand != 0 + && subexons[ngi.startIdx].leftStrand != subexons[ngi.startIdx ].rightStrand ) + // We should make sure that rightstrand is non-zero whenever left-strand is non-zero for the startIdx. + { + for ( i = ngi.startIdx ; i >= 0 ; --i ) + { + if ( ( subexons[i].leftType == 1 && subexons[i].leftClassifier < classifierThreshold ) // an end within the subexon + || ( subexons[i].leftType == 0 ) // probably a overhang subexon. It should be a subset of the criterion following. + || ( i > 0 && subexons[i - 1].end + 1 < subexons[i].start ) ) // a gap. + break ; + } + ngi.start = subexons[i].start ; + } + + // Adjust the end. + // And here, we also need to decide wether we need to adjust "tag" or not, + // because the next interval might be overlap with current interval by the last subexon. + // We solve the overlap genes now, so we DON'T need to adjust tag. + if ( subexons[ ngi.endIdx ].rightStrand != 0 + && subexons[ngi.endIdx].leftStrand != subexons[ngi.endIdx ].rightStrand ) + { + for ( i = ngi.endIdx ; i < seCnt ; ++i ) + { + if ( ( subexons[i].rightType == 2 && subexons[i].rightClassifier < classifierThreshold ) // an end within the subexon + || ( subexons[i].rightType == 0 ) // probably a overhang subexon. + || ( i < seCnt - 1 && subexons[i].end + 1 < subexons[i + 1].start ) ) // a gap + break ; + } + ngi.end = subexons[i].end ; + + /*if ( subexons[ ngi.endIdx ].rightType == 2 ) + { + for ( i = ngi.endIdx ; i >= ngi.startIdx ; --i ) + { + if ( subexons[i].leftType == 1 ) + break ; + } + // The last region overlapps. + if ( i >= ngi.startIdx && subexons[i].leftStrand != subexons[ ngi.endIdx ].rightStrand ) + --tag ; + }*/ + } + geneIntervals.push_back( ngi ) ; + } + delete[] visit ; + + return cnt ; +} + +int SubexonGraph::ExtractSubexons( int startIdx, int endIdx, struct _subexon *retList ) +{ + int i, j, k ; + int cnt = endIdx - startIdx + 1 ; + //printf( "%s: %d %d %d\n", __func__, startIdx, endIdx, cnt ) ; + for ( i = 0 ; i < cnt ; ++i ) + { + retList[i] = subexons[i + startIdx] ; + retList[i].geneId = -1 ; + retList[i].prev = new int[ retList[i].prevCnt ] ; + retList[i].next = new int[ retList[i].nextCnt ] ; + + for ( j = 0 ; j < retList[i].prevCnt ; ++j ) + retList[i].prev[j] = subexons[i + startIdx].prev[j] - startIdx ; + for ( j = 0 ; j < retList[i].nextCnt ; ++j ) + retList[i].next[j] = subexons[i + startIdx].next[j] - startIdx ; + + for ( j = 0, k = 0 ; j < retList[i].prevCnt ; ++j ) + if ( retList[i].prev[j] >= 0 && retList[i].prev[j] < cnt ) + { + retList[i].prev[k] = retList[i].prev[j] ; + ++k ; + } + retList[i].prevCnt = k ; + + for ( j = 0, k = 0 ; j < retList[i].nextCnt ; ++j ) + if ( retList[i].next[j] >= 0 && retList[i].next[j] < cnt ) + { + retList[i].next[k] = retList[i].next[j] ; + ++k ; + } + retList[i].nextCnt = k ; + } + UpdateGeneId( retList, cnt ) ; + return cnt ; +} + +void SubexonGraph::SetGeneId( int tag, int strand, struct _subexon *subexons, int seCnt, int id ) +{ + if ( subexons[tag].geneId != -1 && subexons[tag].geneId != -2 ) + { + if ( subexons[tag].geneId != id ) // a subexon may belong to more than one gene. + { + //printf( "Set -2, %d: %d %d %d %d\n", id, tag, subexons[tag].geneId, subexons[tag].start + 1, strand ) ; + subexons[tag].geneId = -2 ; + } + else + return ; + // There is no need to terminate at the ambiguous exon, the strand will prevent + // us from overwriting previous gene ids. + //return ; + } + else if ( subexons[tag].geneId == -2 ) + return ; + //printf( "%d: %d %d %d %d\n", id, tag, subexons[tag].geneId, subexons[tag].start + 1, strand ) ; + int i ; + if ( subexons[tag].geneId != -2 ) + subexons[ tag ].geneId = id ; + int cnt = subexons[tag].nextCnt ; + // Set through the introns. + if ( IsSameStrand( strand, subexons[tag].rightStrand ) ) + { + for ( i = 0 ; i < cnt ; ++i ) + if ( subexons[ subexons[tag].next[i] ].start > subexons[tag].end + 1 ) + SetGeneId( subexons[tag].next[i], strand, subexons, seCnt, id ) ; + } + + cnt = subexons[tag].prevCnt ; + if ( IsSameStrand( strand, subexons[tag].leftStrand ) ) + { + for ( i = 0 ; i < cnt ; ++i ) + if ( subexons[ subexons[tag].prev[i] ].end < subexons[tag].start - 1 ) + SetGeneId( subexons[tag].prev[i], strand, subexons, seCnt, id ) ; + } + + // Set through the adjacent subexons. + if ( tag < seCnt - 1 && subexons[tag + 1].start == subexons[tag].end + 1 ) + { + SetGeneId( tag + 1, strand, subexons, seCnt, id ) ; + } + + if ( tag > 0 && subexons[tag].start - 1 == subexons[tag - 1].end ) + { + SetGeneId( tag - 1, strand, subexons, seCnt, id ) ; + } + +} + +void SubexonGraph::UpdateGeneId( struct _subexon *subexons, int seCnt ) +{ + int i ; + baseGeneId = usedGeneId ; + int lastMinusStrandGeneId = -1 ; + for ( int strand = -1 ; strand <= 1 ; strand +=2 ) + { + for ( i = 0 ; i < seCnt ; ++i ) + { + //printf( "%d (%d %d) %d.\n", i, subexons[i].start + 1, subexons[i].end + 1, subexons[i].geneId ) ; + if ( ( subexons[i].geneId == -1 && ( ( strand == 1 && subexons[i].rightStrand == 0 ) || subexons[i].rightStrand == strand ) ) + || ( strand == 1 && baseGeneId <= subexons[i].geneId && subexons[i].geneId <= lastMinusStrandGeneId && subexons[i].rightStrand == strand ) ) + { + SetGeneId( i, strand, subexons, seCnt, usedGeneId ) ; + if ( strand == -1 ) + lastMinusStrandGeneId = usedGeneId ; + ++usedGeneId ; + } + } + } + + for ( i = 0 ; i < seCnt ; ++i ) + if ( subexons[i].leftType == 0 && subexons[i].rightType == 0 ) + { + subexons[i].geneId = usedGeneId ; + ++usedGeneId ; + } + // Put base and usedGeneId in lcCnt, rcCnt field. + for ( i = 0 ; i < seCnt ; ++i ) + { + subexons[i].lcCnt = baseGeneId ; + subexons[i].rcCnt = usedGeneId ; + } + + /*for ( i = 0 ; i < seCnt ; ++i ) + { + printf( "geneId %d: %d-%d %d\n", i, subexons[i].start + 1, subexons[i].end + 1, subexons[i].geneId ) ; + } + printf("%d %d\n", baseGeneId, usedGeneId ) ;*/ +}