view PsiCLASS-1.0.2/GetTrustedSplice.cpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
line wrap: on
line source

#include <stdio.h>
#include <math.h>
#include <vector>
#include <algorithm>

#include "alignments.hpp"

#define MAX(x, y) (((x)<(y))?(y):(x))
#define MIN(x, y) (((x)<(y))?(x):(y))

char usage[] = "Usage: ./trust-splice splice_file_list one_bam_file [OPTIONS]\n"
		"Options:\n"
		"\t-a FLOAT: average number of supported reads from the samples (default: 0.5)\n" ;

struct _intron
{
	int chrId ;
	int start, end ;
	double support ;
	int uniqSupport ;
	int secSupport ;
	char strand ;
	int editDist ;

	int sampleSupport ;
} ;

struct _site
{
	int chrId ;
	int pos ;
	double support ;

	char strand ;
	int associatedIntronCnt ; 
} ;

bool CompIntrons( struct _intron a, struct _intron b )
{
	if ( a.chrId != b.chrId )
		return a.chrId < b.chrId ;
	else if ( a.start != b.start )
		return a.start < b.start ;
	else if ( a.end != b.end )
		return a.end < b.end ;
	return false ;
}

bool CompSites( struct _site a, struct _site b )
{
	if ( a.chrId != b.chrId )
		return a.chrId < b.chrId ;
	else if ( a.pos != b.pos )
		return a.pos < b.pos ;
	return false ;
}

void CoalesceIntrons( std::vector<struct _intron> &introns )
{
	std::sort( introns.begin(), introns.end(), CompIntrons ) ;
	int size = introns.size() ;
	int i, k ;
	k = 0 ;
	for ( i = 1 ; i < size ; ++i )
	{
		if ( introns[i].chrId == introns[k].chrId && introns[i].start == introns[k].start 
			&& introns[i].end == introns[k].end )
		{
			introns[k].support += introns[i].support ;
			introns[k].uniqSupport += introns[i].uniqSupport ;
			introns[k].secSupport += introns[i].secSupport ;
			introns[k].sampleSupport += introns[i].sampleSupport ;

			if ( introns[k].strand == '?' )
				introns[k].strand = introns[i].strand ;	
		}
		else
		{
			++k ;
			introns[k] = introns[i] ;
		}
	}
	introns.resize( k + 1 ) ;
}

void CoalesceSites( std::vector<struct _site> &sites )
{
	std::sort( sites.begin(), sites.end(), CompSites ) ;
	int size = sites.size() ;
	int i, k ;
	k = 0 ;
	for ( i = 1 ; i < size ; ++i )
	{
		if ( sites[i].chrId == sites[k].chrId && sites[i].pos == sites[k].pos ) 
		{
			sites[k].support += sites[i].support ;
			sites[k].associatedIntronCnt += sites[i].associatedIntronCnt ;
		}
		else
		{
			++k ;
			sites[k] = sites[i] ;
		}
	}
	sites.resize( k + 1 ) ;
}

int main( int argc, char *argv[] )
{
	int i, j, k ;
	FILE *fpList ;	
	FILE *fp ;
	char spliceFile[2048] ;
	char chrName[1024], strand[3] ;
	int start, end, uniqSupport, secSupport, uniqEditDistance, secEditDistance ;
	double support ;
	Alignments alignments ;
	std::vector<struct _intron> introns ;
	std::vector<struct _site> sites ;
	double averageSupportThreshold = 0.5 ;

	if ( argc <= 1 )
	{
		printf( "%s", usage ) ;
		exit( 1 ) ;
	}

	for ( i = 3 ; i < argc ; ++i )
	{
		if ( !strcmp( argv[ i ], "-a" ) )
		{
			averageSupportThreshold = atof( argv[i + 1] ) ;
			++i ;
		}
		else
		{
			printf( "Unknown option: %s", argv[i] ) ;
			exit( 1 ) ;
		}
	}

	alignments.Open( argv[2] ) ;

	// Get the number of samples.
	fpList = fopen( argv[1], "r" ) ;
	int sampleCnt = 0 ;
	while ( fscanf( fpList, "%s", spliceFile ) != EOF )
	{
		++sampleCnt ;
	}
	fclose( fpList ) ;

	// Get all the introns.
	fpList = fopen( argv[1], "r" ) ;
	while ( fscanf( fpList, "%s", spliceFile ) != EOF )
	{
		fp = fopen( spliceFile, "r" ) ;	
		while (  fscanf( fp, "%s %d %d %lf %s %d %d %d %d", chrName, &start, &end, &support,
				strand, &uniqSupport, &secSupport, &uniqEditDistance, &secEditDistance ) != EOF )
		{
			
			if ( support <= 0 )
				support = 0.1 ;
			else if ( support == 1 && sampleCnt > 5 )
				support = 0.75 ;

			struct _intron ni ;
			ni.chrId = alignments.GetChromIdFromName( chrName ) ;
			ni.start = start ;
			ni.end = end ;
			ni.support = support ;
			ni.strand = strand[0] ;
			ni.uniqSupport = uniqSupport ;
			ni.secSupport = secSupport ;
			ni.sampleSupport = 1 ;
			ni.editDist = uniqEditDistance + secEditDistance ;
			introns.push_back( ni ) ;
		}
		fclose( fp ) ;

		CoalesceIntrons( introns ) ;
	}
	fclose( fpList ) ;

	// Obtain the split sites.
	int intronCnt = introns.size() ;
	for ( i = 0 ; i < intronCnt ; ++i )
	{
		struct _site ns ;
		ns.chrId = introns[i].chrId ;
		ns.associatedIntronCnt = 1 ;
		ns.support = introns[i].support ;
		ns.strand = introns[i].strand ;

		ns.pos = introns[i].start ;
		sites.push_back( ns ) ;
		ns.pos = introns[i].end ;
		sites.push_back( ns ) ;
	}
	CoalesceSites( sites ) ;

	// Get the chromsomes with too many split sites.
	int siteCnt = sites.size() ;
	std::vector<bool> badChrom ;

	badChrom.resize( alignments.GetChromCount() ) ;
	int size = alignments.GetChromCount() ;
	for ( i = 0 ; i < size ; ++i )
		badChrom[i] = false ;

	for ( i = 0 ; i < siteCnt ; ) 	
	{
		for ( j = i + 1 ; sites[j].chrId == sites[i].chrId ; ++j )	
			;
		//printf( "%s %d %d:\n", alignments.GetChromName( sites[i].chrId ), alignments.GetChromLength( sites[i].chrId ), j - i ) ;
		if ( ( j - i ) * 20 > alignments.GetChromLength( sites[i].chrId ) )
			badChrom[ sites[i].chrId ] = true ;
		i = j ;
	}
	
	// Output the valid introns.
	k = 0 ;
	double unit = sampleCnt / 50 ;
	if ( unit < 1 )
		unit = 1 ;
	
	int longIntronSize ; 
	std::vector<int> intronSizes ;
	for (i = 0 ; i < intronCnt ; ++i)
		intronSizes.push_back( introns[i].end - introns[i].start + 1 ) ;
	std::sort( intronSizes.begin(), intronSizes.end() ) ;
	longIntronSize = intronSizes[ int(intronCnt * 0.99) ] ;
	if ( longIntronSize > 100000 )
		longIntronSize = 100000 ;
	for ( i = 0 ; i < intronCnt ; ++i )
	{
		if ( introns[i].support / sampleCnt < averageSupportThreshold )
			continue ;

		if ( badChrom[ introns[i].chrId ] )
		{
			if ( introns[i].sampleSupport <= sampleCnt / 2 )	
				continue ;
		}
		
		if ( introns[i].uniqSupport <= 2 && ( introns[i].support / sampleCnt < 1 || introns[i].sampleSupport < MIN( 2, sampleCnt ) ) )
			continue ;
		
		//Locate the two split sites.
		while ( sites[k].chrId < introns[i].chrId || ( sites[k].chrId == introns[i].chrId && sites[k].pos < introns[i].start ) )
		{
			++k ;
		}
		int a, b ;
		a = k ;
		for ( b = a + 1 ; b < siteCnt ; ++b )
		{
			if ( sites[b].chrId == introns[i].chrId && sites[b].pos == introns[i].end )
				break ;
		}
		double siteSupport = MAX( sites[a].support, sites[b].support ) ;
		//if ( introns[i].start == 100233371 && introns[i].end == 100236850 )
		//	printf( "test: %lf %lf\n", introns[i].support, siteSupport) ;
		if ( introns[i].support < siteSupport / 10.0 )
		{
			double needSample = MIN( ( -log( introns[i].support  / siteSupport ) / log( 10.0 ) + 1 ) * unit, sampleCnt ) ;
			if ( introns[i].sampleSupport < needSample )
				continue ;
		}
		
		if ( sampleCnt >= 100 ) //&& introns[i].end - introns[i].start + 1 >= 50000 )
		{
			if ( introns[i].sampleSupport <= sampleCnt * 0.01 )
				continue ;
		}
		/*if ( sampleCnt >= 50 )
		{
			// just some randomly intron.
			if ( introns[i].sampleSupport == 1 
				&& ( sites[a].associatedIntronCnt == 1 || sites[b].associatedIntronCnt == 1 ) )
				continue ;
		}*/

		/*if (  introns[i].end - introns[i].start + 1 < 50 )
		{
			int needSample = MIN( ( 5 - ( introns[i].end - introns[i].start + 1 ) / 10 ) * unit, sampleCnt ) ;
			int flag = 0 ;

			if ( introns[i].sampleSupport < needSample )
				flag = 1 ;
			if ( flag == 1 )
				continue ;
		}*/
		if ( introns[i].strand == '?' && sampleCnt == 1 && 
			( introns[i].support < 5 || introns[i].uniqSupport == 0 || introns[i].support < 2 * introns[i].editDist ) )
			continue ;
		
		// Since the strand is uncertain, the alinger may make different decision sample from sample. 
		//	To keep this intron, one of its splice sites must be more supported than adjacent splice sites.
		if ( introns[i].strand == '?' && introns[i].sampleSupport <= 0.5 * sampleCnt ) 
		{
			int s, e ;
			int l ;
			int cnt = 0 ;
			for ( l = 0 ; l < 2 ; ++l )
			{
				int ind = ( l == 0 ) ? a : b ;
				double max = sites[ind].support ;
				int maxTag = ind ;
				for ( s = ind - 1 ; s >= 0 && sites[s].chrId == sites[ind].chrId ; --s )
				{
					if ( sites[s].pos + 7 < sites[s + 1].pos )
						break ;
					if ( sites[s].support >= max )
					{
						max = sites[s].support ;
						maxTag = s ;
					}
				}

				for ( e = ind + 1 ; e < siteCnt && sites[e].chrId == sites[ind].chrId ; ++e )
				{
					if ( sites[e].pos - 7 > sites[e - 1].pos )
						break ;
					if ( sites[e].support >= max )
					{
						max = sites[e].support ;
						maxTag = e ;
					}
				}

				if ( maxTag == ind )
					++cnt ;
			}
			if ( cnt == 0 )
				continue ;
		}

		// Test whether this a intron coming from a wrong strand
		/*if ( b - a + 1 >= 10 && introns[i].strand != '?' && introns[i].sampleSupport <= 0.5 * sampleCnt )
		{
			int plusStrand = 0 ;
			int minusStrand = 0 ;
			int l ;
			int s, e ;

			if ( introns[i].strand == '+' )
				plusStrand = 2 ;
			else
				minusStrand = 2 ;

			for ( l = 0 ; l < 2 ; ++l )
			{
				int ind = ( l == 0 ) ? a : b ;
				for ( s = ind - 1 ; s >= 0 && sites[s].chrId == sites[ind].chrId ; --s )
				{
					if ( sites[s].pos + 10000 < sites[s + 1].pos )
						break ;
					
					if ( sites[s].strand == '+' )
						++plusStrand ; 
					else if ( sites[s].strand == '-' ) 
						++minusStrand ; 
				}

				for ( e = ind + 1 ; e < siteCnt && sites[e].chrId == sites[ind].chrId ; ++e )
				{
					if ( sites[e].pos - 10000 > sites[e - 1].pos )
						break ;
					
					if ( sites[e].strand == '+' )
						++plusStrand ; 
					else if ( sites[e].strand == '-' ) 
						++minusStrand ;
				}
			}

			if ( introns[i].start == 161517978 )
				printf( "capture: %d %d %d %d\n", a, b, minusStrand, plusStrand) ;
			
			if ( introns[i].strand == '+' && minusStrand >= 20  && plusStrand == 2 )
				continue ;
			else if ( introns[i].strand == '-' && plusStrand >= 20 && minusStrand == 2 )
				continue ;

		}*/
	
		// Filter a intron if one of its splice site associated with too many introns.
		/*if ( sites[a].associatedIntronCnt >= 10 )
		{
			int needSample = MIN( sites[a].associatedIntronCnt / 100 * sampleCnt, sampleCnt ) ;
			if ( introns[i].support < sites[a].support / 100 && 
				introns[i].sampleSupport < needSample )
			{
				continue ;		
			}
		}
		if ( sites[b].associatedIntronCnt >= 10 )
		{
			int needSample = MIN( sites[b].associatedIntronCnt / 100 * sampleCnt, sampleCnt ) ;
			if ( introns[i].support < sites[b].support / 100 && 
				introns[i].sampleSupport < needSample )
			{
				continue ;
			}
		}*/


		// Test for long intron
		if ( introns[i].end - introns[i].start + 1 >= longIntronSize )
		{
			int needSample = MIN( ( ( introns[i].end - introns[i].start + 1 ) / longIntronSize + 1 ) * unit, sampleCnt ) ;
			int flag = 0 ;
			if ( (double)introns[i].uniqSupport / ( introns[i].uniqSupport + introns[i].secSupport ) < 0.1 
				|| introns[i].uniqSupport / sampleCnt < 1 
				|| introns[i].sampleSupport < needSample )
				flag = 1 ;
			if ( flag == 1 && introns[i].end - introns[i].start + 1 >= 3 * longIntronSize )
				continue ;
			if ( flag == 1 && ( sites[a].associatedIntronCnt > 1 || sites[b].associatedIntronCnt > 1 || introns[i].sampleSupport <= 1 ) ) // an intron may connect two genes
				continue ;
		}
		
				
		printf( "%s %d %d 10 %c 10 0 0 0\n", alignments.GetChromName( introns[i].chrId ), introns[i].start, introns[i].end, introns[i].strand ) ;
	}

	return 0 ;
}