Mercurial > repos > lsong10 > psiclass

diff PsiCLASS-1.0.2/GetTrustedSplice.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author: lsong10
date: Fri, 26 Mar 2021 16:52:45 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PsiCLASS-1.0.2/GetTrustedSplice.cpp	Fri Mar 26 16:52:45 2021 +0000
@@ -0,0 +1,429 @@
+#include <stdio.h>
+#include <math.h>
+#include <vector>
+#include <algorithm>
+
+#include "alignments.hpp"
+
+#define MAX(x, y) (((x)<(y))?(y):(x))
+#define MIN(x, y) (((x)<(y))?(x):(y))
+
+char usage[] = "Usage: ./trust-splice splice_file_list one_bam_file [OPTIONS]\n"
+		"Options:\n"
+		"\t-a FLOAT: average number of supported reads from the samples (default: 0.5)\n" ;
+
+struct _intron
+{
+	int chrId ;
+	int start, end ;
+	double support ;
+	int uniqSupport ;
+	int secSupport ;
+	char strand ;
+	int editDist ;
+
+	int sampleSupport ;
+} ;
+
+struct _site
+{
+	int chrId ;
+	int pos ;
+	double support ;
+
+	char strand ;
+	int associatedIntronCnt ; 
+} ;
+
+bool CompIntrons( struct _intron a, struct _intron b )
+{
+	if ( a.chrId != b.chrId )
+		return a.chrId < b.chrId ;
+	else if ( a.start != b.start )
+		return a.start < b.start ;
+	else if ( a.end != b.end )
+		return a.end < b.end ;
+	return false ;
+}
+
+bool CompSites( struct _site a, struct _site b )
+{
+	if ( a.chrId != b.chrId )
+		return a.chrId < b.chrId ;
+	else if ( a.pos != b.pos )
+		return a.pos < b.pos ;
+	return false ;
+}
+
+void CoalesceIntrons( std::vector<struct _intron> &introns )
+{
+	std::sort( introns.begin(), introns.end(), CompIntrons ) ;
+	int size = introns.size() ;
+	int i, k ;
+	k = 0 ;
+	for ( i = 1 ; i < size ; ++i )
+	{
+		if ( introns[i].chrId == introns[k].chrId && introns[i].start == introns[k].start 
+			&& introns[i].end == introns[k].end )
+		{
+			introns[k].support += introns[i].support ;
+			introns[k].uniqSupport += introns[i].uniqSupport ;
+			introns[k].secSupport += introns[i].secSupport ;
+			introns[k].sampleSupport += introns[i].sampleSupport ;
+
+			if ( introns[k].strand == '?' )
+				introns[k].strand = introns[i].strand ;	
+		}
+		else
+		{
+			++k ;
+			introns[k] = introns[i] ;
+		}
+	}
+	introns.resize( k + 1 ) ;
+}
+
+void CoalesceSites( std::vector<struct _site> &sites )
+{
+	std::sort( sites.begin(), sites.end(), CompSites ) ;
+	int size = sites.size() ;
+	int i, k ;
+	k = 0 ;
+	for ( i = 1 ; i < size ; ++i )
+	{
+		if ( sites[i].chrId == sites[k].chrId && sites[i].pos == sites[k].pos ) 
+		{
+			sites[k].support += sites[i].support ;
+			sites[k].associatedIntronCnt += sites[i].associatedIntronCnt ;
+		}
+		else
+		{
+			++k ;
+			sites[k] = sites[i] ;
+		}
+	}
+	sites.resize( k + 1 ) ;
+}
+
+int main( int argc, char *argv[] )
+{
+	int i, j, k ;
+	FILE *fpList ;	
+	FILE *fp ;
+	char spliceFile[2048] ;
+	char chrName[1024], strand[3] ;
+	int start, end, uniqSupport, secSupport, uniqEditDistance, secEditDistance ;
+	double support ;
+	Alignments alignments ;
+	std::vector<struct _intron> introns ;
+	std::vector<struct _site> sites ;
+	double averageSupportThreshold = 0.5 ;
+
+	if ( argc <= 1 )
+	{
+		printf( "%s", usage ) ;
+		exit( 1 ) ;
+	}
+
+	for ( i = 3 ; i < argc ; ++i )
+	{
+		if ( !strcmp( argv[ i ], "-a" ) )
+		{
+			averageSupportThreshold = atof( argv[i + 1] ) ;
+			++i ;
+		}
+		else
+		{
+			printf( "Unknown option: %s", argv[i] ) ;
+			exit( 1 ) ;
+		}
+	}
+
+	alignments.Open( argv[2] ) ;
+
+	// Get the number of samples.
+	fpList = fopen( argv[1], "r" ) ;
+	int sampleCnt = 0 ;
+	while ( fscanf( fpList, "%s", spliceFile ) != EOF )
+	{
+		++sampleCnt ;
+	}
+	fclose( fpList ) ;
+
+	// Get all the introns.
+	fpList = fopen( argv[1], "r" ) ;
+	while ( fscanf( fpList, "%s", spliceFile ) != EOF )
+	{
+		fp = fopen( spliceFile, "r" ) ;	
+		while (  fscanf( fp, "%s %d %d %lf %s %d %d %d %d", chrName, &start, &end, &support,
+				strand, &uniqSupport, &secSupport, &uniqEditDistance, &secEditDistance ) != EOF )
+		{
+			
+			if ( support <= 0 )
+				support = 0.1 ;
+			else if ( support == 1 && sampleCnt > 5 )
+				support = 0.75 ;
+
+			struct _intron ni ;
+			ni.chrId = alignments.GetChromIdFromName( chrName ) ;
+			ni.start = start ;
+			ni.end = end ;
+			ni.support = support ;
+			ni.strand = strand[0] ;
+			ni.uniqSupport = uniqSupport ;
+			ni.secSupport = secSupport ;
+			ni.sampleSupport = 1 ;
+			ni.editDist = uniqEditDistance + secEditDistance ;
+			introns.push_back( ni ) ;
+		}
+		fclose( fp ) ;
+
+		CoalesceIntrons( introns ) ;
+	}
+	fclose( fpList ) ;
+
+	// Obtain the split sites.
+	int intronCnt = introns.size() ;
+	for ( i = 0 ; i < intronCnt ; ++i )
+	{
+		struct _site ns ;
+		ns.chrId = introns[i].chrId ;
+		ns.associatedIntronCnt = 1 ;
+		ns.support = introns[i].support ;
+		ns.strand = introns[i].strand ;
+
+		ns.pos = introns[i].start ;
+		sites.push_back( ns ) ;
+		ns.pos = introns[i].end ;
+		sites.push_back( ns ) ;
+	}
+	CoalesceSites( sites ) ;
+
+	// Get the chromsomes with too many split sites.
+	int siteCnt = sites.size() ;
+	std::vector<bool> badChrom ;
+
+	badChrom.resize( alignments.GetChromCount() ) ;
+	int size = alignments.GetChromCount() ;
+	for ( i = 0 ; i < size ; ++i )
+		badChrom[i] = false ;
+
+	for ( i = 0 ; i < siteCnt ; ) 	
+	{
+		for ( j = i + 1 ; sites[j].chrId == sites[i].chrId ; ++j )	
+			;
+		//printf( "%s %d %d:\n", alignments.GetChromName( sites[i].chrId ), alignments.GetChromLength( sites[i].chrId ), j - i ) ;
+		if ( ( j - i ) * 20 > alignments.GetChromLength( sites[i].chrId ) )
+			badChrom[ sites[i].chrId ] = true ;
+		i = j ;
+	}
+	
+	// Output the valid introns.
+	k = 0 ;
+	double unit = sampleCnt / 50 ;
+	if ( unit < 1 )
+		unit = 1 ;
+	
+	int longIntronSize ; 
+	std::vector<int> intronSizes ;
+	for (i = 0 ; i < intronCnt ; ++i)
+		intronSizes.push_back( introns[i].end - introns[i].start + 1 ) ;
+	std::sort( intronSizes.begin(), intronSizes.end() ) ;
+	longIntronSize = intronSizes[ int(intronCnt * 0.99) ] ;
+	if ( longIntronSize > 100000 )
+		longIntronSize = 100000 ;
+	for ( i = 0 ; i < intronCnt ; ++i )
+	{
+		if ( introns[i].support / sampleCnt < averageSupportThreshold )
+			continue ;
+
+		if ( badChrom[ introns[i].chrId ] )
+		{
+			if ( introns[i].sampleSupport <= sampleCnt / 2 )	
+				continue ;
+		}
+		
+		if ( introns[i].uniqSupport <= 2 && ( introns[i].support / sampleCnt < 1 || introns[i].sampleSupport < MIN( 2, sampleCnt ) ) )
+			continue ;
+		
+		//Locate the two split sites.
+		while ( sites[k].chrId < introns[i].chrId || ( sites[k].chrId == introns[i].chrId && sites[k].pos < introns[i].start ) )
+		{
+			++k ;
+		}
+		int a, b ;
+		a = k ;
+		for ( b = a + 1 ; b < siteCnt ; ++b )
+		{
+			if ( sites[b].chrId == introns[i].chrId && sites[b].pos == introns[i].end )
+				break ;
+		}
+		double siteSupport = MAX( sites[a].support, sites[b].support ) ;
+		//if ( introns[i].start == 100233371 && introns[i].end == 100236850 )
+		//	printf( "test: %lf %lf\n", introns[i].support, siteSupport) ;
+		if ( introns[i].support < siteSupport / 10.0 )
+		{
+			double needSample = MIN( ( -log( introns[i].support  / siteSupport ) / log( 10.0 ) + 1 ) * unit, sampleCnt ) ;
+			if ( introns[i].sampleSupport < needSample )
+				continue ;
+		}
+		
+		if ( sampleCnt >= 100 ) //&& introns[i].end - introns[i].start + 1 >= 50000 )
+		{
+			if ( introns[i].sampleSupport <= sampleCnt * 0.01 )
+				continue ;
+		}
+		/*if ( sampleCnt >= 50 )
+		{
+			// just some randomly intron.
+			if ( introns[i].sampleSupport == 1 
+				&& ( sites[a].associatedIntronCnt == 1 || sites[b].associatedIntronCnt == 1 ) )
+				continue ;
+		}*/
+
+		/*if (  introns[i].end - introns[i].start + 1 < 50 )
+		{
+			int needSample = MIN( ( 5 - ( introns[i].end - introns[i].start + 1 ) / 10 ) * unit, sampleCnt ) ;
+			int flag = 0 ;
+
+			if ( introns[i].sampleSupport < needSample )
+				flag = 1 ;
+			if ( flag == 1 )
+				continue ;
+		}*/
+		if ( introns[i].strand == '?' && sampleCnt == 1 && 
+			( introns[i].support < 5 || introns[i].uniqSupport == 0 || introns[i].support < 2 * introns[i].editDist ) )
+			continue ;
+		
+		// Since the strand is uncertain, the alinger may make different decision sample from sample. 
+		//	To keep this intron, one of its splice sites must be more supported than adjacent splice sites.
+		if ( introns[i].strand == '?' && introns[i].sampleSupport <= 0.5 * sampleCnt ) 
+		{
+			int s, e ;
+			int l ;
+			int cnt = 0 ;
+			for ( l = 0 ; l < 2 ; ++l )
+			{
+				int ind = ( l == 0 ) ? a : b ;
+				double max = sites[ind].support ;
+				int maxTag = ind ;
+				for ( s = ind - 1 ; s >= 0 && sites[s].chrId == sites[ind].chrId ; --s )
+				{
+					if ( sites[s].pos + 7 < sites[s + 1].pos )
+						break ;
+					if ( sites[s].support >= max )
+					{
+						max = sites[s].support ;
+						maxTag = s ;
+					}
+				}
+
+				for ( e = ind + 1 ; e < siteCnt && sites[e].chrId == sites[ind].chrId ; ++e )
+				{
+					if ( sites[e].pos - 7 > sites[e - 1].pos )
+						break ;
+					if ( sites[e].support >= max )
+					{
+						max = sites[e].support ;
+						maxTag = e ;
+					}
+				}
+
+				if ( maxTag == ind )
+					++cnt ;
+			}
+			if ( cnt == 0 )
+				continue ;
+		}
+
+		// Test whether this a intron coming from a wrong strand
+		/*if ( b - a + 1 >= 10 && introns[i].strand != '?' && introns[i].sampleSupport <= 0.5 * sampleCnt )
+		{
+			int plusStrand = 0 ;
+			int minusStrand = 0 ;
+			int l ;
+			int s, e ;
+
+			if ( introns[i].strand == '+' )
+				plusStrand = 2 ;
+			else
+				minusStrand = 2 ;
+
+			for ( l = 0 ; l < 2 ; ++l )
+			{
+				int ind = ( l == 0 ) ? a : b ;
+				for ( s = ind - 1 ; s >= 0 && sites[s].chrId == sites[ind].chrId ; --s )
+				{
+					if ( sites[s].pos + 10000 < sites[s + 1].pos )
+						break ;
+					
+					if ( sites[s].strand == '+' )
+						++plusStrand ; 
+					else if ( sites[s].strand == '-' ) 
+						++minusStrand ; 
+				}
+
+				for ( e = ind + 1 ; e < siteCnt && sites[e].chrId == sites[ind].chrId ; ++e )
+				{
+					if ( sites[e].pos - 10000 > sites[e - 1].pos )
+						break ;
+					
+					if ( sites[e].strand == '+' )
+						++plusStrand ; 
+					else if ( sites[e].strand == '-' ) 
+						++minusStrand ;
+				}
+			}
+
+			if ( introns[i].start == 161517978 )
+				printf( "capture: %d %d %d %d\n", a, b, minusStrand, plusStrand) ;
+			
+			if ( introns[i].strand == '+' && minusStrand >= 20  && plusStrand == 2 )
+				continue ;
+			else if ( introns[i].strand == '-' && plusStrand >= 20 && minusStrand == 2 )
+				continue ;
+
+		}*/
+	
+		// Filter a intron if one of its splice site associated with too many introns.
+		/*if ( sites[a].associatedIntronCnt >= 10 )
+		{
+			int needSample = MIN( sites[a].associatedIntronCnt / 100 * sampleCnt, sampleCnt ) ;
+			if ( introns[i].support < sites[a].support / 100 && 
+				introns[i].sampleSupport < needSample )
+			{
+				continue ;		
+			}
+		}
+		if ( sites[b].associatedIntronCnt >= 10 )
+		{
+			int needSample = MIN( sites[b].associatedIntronCnt / 100 * sampleCnt, sampleCnt ) ;
+			if ( introns[i].support < sites[b].support / 100 && 
+				introns[i].sampleSupport < needSample )
+			{
+				continue ;
+			}
+		}*/
+
+
+		// Test for long intron
+		if ( introns[i].end - introns[i].start + 1 >= longIntronSize )
+		{
+			int needSample = MIN( ( ( introns[i].end - introns[i].start + 1 ) / longIntronSize + 1 ) * unit, sampleCnt ) ;
+			int flag = 0 ;
+			if ( (double)introns[i].uniqSupport / ( introns[i].uniqSupport + introns[i].secSupport ) < 0.1 
+				|| introns[i].uniqSupport / sampleCnt < 1 
+				|| introns[i].sampleSupport < needSample )
+				flag = 1 ;
+			if ( flag == 1 && introns[i].end - introns[i].start + 1 >= 3 * longIntronSize )
+				continue ;
+			if ( flag == 1 && ( sites[a].associatedIntronCnt > 1 || sites[b].associatedIntronCnt > 1 || introns[i].sampleSupport <= 1 ) ) // an intron may connect two genes
+				continue ;
+		}
+		
+				
+		printf( "%s %d %d 10 %c 10 0 0 0\n", alignments.GetChromName( introns[i].chrId ), introns[i].start, introns[i].end, introns[i].strand ) ;
+	}
+
+	return 0 ;
+}
author	lsong10
date	Fri, 26 Mar 2021 16:52:45 +0000
parents
children