diff PsiCLASS-1.0.2/AddXS.cpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PsiCLASS-1.0.2/AddXS.cpp	Fri Mar 26 16:52:45 2021 +0000
@@ -0,0 +1,583 @@
+#include <stdio.h>
+#include <stdint.h>
+
+#include <string.h>
+#include <stdlib.h>
+#include <vector>
+#include <map>
+#include <fstream>
+
+char usage[] = 
+	"./addXS ref.fa [OPTIONS] < in.sam > out.sam\n"
+		"From bam to bam: samtools view -h in.bam | ./addXS ref.fa | samtools view -bS - > out.bam\n" ;
+char samLine[65537] ;
+char seq[65537] ;
+
+char nucToNum[26] = { 0, -1, 1, -1, -1, -1, 2,
+			-1, -1, -1, -1, -1, -1, 0, // Regard 'N' as 'A' 
+			-1, -1, -1, -1, -1, 3,
+			-1, -1, -1, -1, -1, -1 } ;
+char numToNuc[26] = {'A', 'C', 'G', 'T'} ;
+
+struct _pair
+{
+	int a, b ;
+} ;
+
+class BitSequence 
+{
+private:
+	int len ;
+	//std::vector<uint32_t> sequence ;
+	uint32_t *sequence ;
+
+public:
+	BitSequence() { len = 0 ; sequence = NULL ;} 
+	BitSequence( int l )
+	{
+		len = 0 ;
+		sequence = new uint32_t[ l / 16 + 1 ] ;
+	}
+	
+	~BitSequence() 
+	{
+		//if ( sequence != NULL )
+		//	delete[] sequence ;
+	}
+	
+	void SetLength( int l )
+	{
+		len = 0 ;
+		if ( sequence != NULL )
+			delete[] sequence ;
+		sequence = new uint32_t[ l / 16 + 1 ] ;
+	}
+
+	int GetLength()
+	{
+		return len ;
+	}
+
+	void Append( char c )
+	{
+		if ( ( len & 15 ) == 0 )
+		{
+			sequence[ len / 16 ] = 0 ;
+		}
+		++len ;
+		//printf( "%d %c\n", len, c ) ;
+		Set( c, len - 1 ) ;
+	}
+	
+	// pos is 0-based coordinate
+	// notice that the order within one 32 bit butcket is reversed
+	void Set( char c, int pos ) 
+	{
+		if ( pos >= len )		
+			return ;
+
+		if ( c >= 'a' && c <= 'z' )
+		{
+			c = c - 'a' + 'A' ;
+		}
+		if ( c == 'N' )
+			c = 'A' ;
+
+		int ind = pos >> 4 ;
+		int offset = pos & 15 ;
+		int mask = ( (int)( nucToNum[c - 'A'] & 3 ) ) << ( 2 * offset ) ;
+		sequence[ind] = sequence[ind] | mask ;
+		//if ( c != 'A' )
+		//	printf( "%d: %c %c %d %d : %d\n", pos, c, Get(pos), ind, offset, mask ) ;
+		//Print() ;
+	}
+
+	char Get( int pos )
+	{
+		if ( pos >= len )
+			return 'N' ;
+
+		int ind = pos >> 4 ;
+		int offset = pos & 15 ;
+		//printf( "%d: %d\n", pos, sequence[ind] ) ;
+		return numToNuc[ ( ( sequence[ind] >> ( 2 * offset ) ) & 3 ) ] ;
+	}
+
+	void Print()
+	{
+		int i ;
+		for ( i = 0 ; i < len ; ++i )
+			printf( "%c", Get( i ) ) ;
+		printf( "\n" ) ;
+	}
+
+	void Print( FILE *fp, int start, int end, bool rc )
+	{	
+		if ( !rc )
+		{
+			for ( int i = start ; i <= end ; ++i )
+				fprintf( fp, "%c", Get( i ) ) ;
+		}
+		else
+		{
+			for ( int i = end ; i >= start ; --i )
+			{
+				char c = Get( i ) ;
+				if ( c == 'A' )
+					c = 'T' ;
+				else if ( c == 'C' )
+					c = 'G' ; 
+				else if ( c == 'G' )
+					c = 'C' ; 
+				else //if ( c == 'T' )
+					c = 'A' ; 
+				fprintf( fp, "%c", c ) ;
+			} 
+		}
+	}
+} ;
+
+void ReverseComplement( char *s, int l ) 
+{
+	int u, v ;
+	for ( u = 0, v = l - 1 ; u <= v ; ++u, --v )
+	{
+		char tmp ;
+		tmp = s[v] ;
+		s[v] = s[u] ;
+		s[u] = tmp ;
+
+		s[u] = numToNuc[ 3 - nucToNum[ s[u] - 'A' ] ] ;
+		s[v] = numToNuc[ 3 - nucToNum[ s[v] - 'A' ] ] ;
+	}
+}
+
+
+int main( int argc, char *argv[] )
+{
+	int i, j, k ;
+	
+	std::vector<BitSequence> genome ;
+	std::map<std::string, int> chrToChrId ;
+
+	char readid[200], chrom[50], mapq[10], cigar[1000], mateChrom[50] ;
+	char buffer[1024] ;
+	char pattern[1024] ;
+	int start, mstart, flag, tlen ; // read start and mate read start
+	int seqLen ;
+	int chrCnt = 0 ;
+	int chrId ;
+	int width = 7 ;
+	int score ; // 0-bit: multiple aligned, 1-bit can shift the intron, 2-bit can shift the alignment, 3-bit contain sequencing error
+
+	if ( argc < 2 )
+	{
+		printf( "%s", usage ) ;
+		return 0 ;
+	}
+	
+
+	std::ifstream fpRef ;	
+	fpRef.open( argv[1] ) ;
+	std::string line ;
+	
+	int motifStrand[1024] ;
+	
+	/*motifStrand[ 0b10110010 ] = 1 ; // GT/AG
+	motifStrand[ 0b01110001 ] = 0 ; // CT/AC
+	motifStrand[ 0b10010010 ] = 1 ;// GC/AG
+	motifStrand[ 0b01111001 ] = 0 ; // CT/GC
+	motifStrand[ 0b00110001 ] = 1 ;	// AT/AC
+	motifStrand[ 0b10110011 ] = 0 ; // GT/AT*/
+	memset( motifStrand, -1, sizeof( motifStrand )) ;
+	motifStrand[ 0xb2 ] = 1 ; // GT/AG
+	motifStrand[ 0x71 ] = 0 ; // CT/AC
+	motifStrand[ 0x92 ] = 1 ;// GC/AG
+	motifStrand[ 0x79 ] = 0 ; // CT/GC
+	motifStrand[ 0x31 ] = 1 ;	// AT/AC
+	motifStrand[ 0xb3 ] = 0 ; // GT/AT
+	 
+	k = 0 ;
+	while ( getline( fpRef, line ) )
+	{
+		//printf( "%s\n", line.c_str() ) ;
+		int len = line.length() ;
+		if ( line[0] == '>' )
+		{
+			//char *s = strdup( line.c_str() + 1 ) ;
+			if ( chrCnt > 0 )
+			{
+				genome[ chrCnt - 1 ].SetLength( k ) ;
+			}
+			for ( i = 1 ; i < len ; ++i )
+				if ( line[i] == ' ' || line[i] == '\t' )
+					break ;
+			chrToChrId[ line.substr( 1, i - 1 ) ] = chrCnt ;
+			++chrCnt ;
+			
+			BitSequence bs ;
+			genome.push_back( bs ) ;
+			
+			k = 0 ;
+		}	
+		else
+		{
+			k += len ;
+		}			
+	}
+	genome[ chrCnt - 1 ].SetLength( k ) ;
+	fpRef.close() ;
+	
+	fpRef.open( argv[1] ) ;
+	while ( getline( fpRef, line ) )
+	{
+		//printf( "%s\n", line.c_str() ) ;
+		int len = line.length() ;
+		if ( line[0] == '>' )
+		{
+			//char *s = strdup( line.c_str() + 1 ) ;
+			for ( i = 1 ; i < len ; ++i )
+				if ( line[i] == ' ' || line[i] == '\t' )
+					break ;
+			chrId = chrToChrId[ line.substr( 1, i - 1 ) ] ;
+		}	
+		else
+		{
+			BitSequence &bs = genome[chrId] ;
+			for ( i = 0 ; i < len ; ++i )
+			{
+				if ( ( line[i] >= 'A' && line[i] <= 'Z' ) ||
+					( line[i] >= 'a' && line[i] <= 'z' ) )			
+					bs.Append( line[i] ) ;
+			}
+		}			
+	}	
+	fpRef.close() ;
+	
+	
+	
+	while ( fgets( samLine, sizeof( samLine ), stdin ) != NULL )
+	{
+		if ( samLine[0] == '@' )
+		{
+			printf( "%s", samLine ) ;
+			continue ;
+		}
+		
+		sscanf( samLine, "%s %d %s %d %s %s %s %d %d %s", readid, &flag, chrom, &start, mapq, cigar, mateChrom, &mstart, &tlen, seq ) ;
+		struct _pair segments[1000] ;
+		struct _pair seqSegments[1000] ; // the segments with respect to the sequence.
+		int segCnt = 0 ;
+		int seqSegCnt = 0 ;
+		int num = 0 ;
+		int len = 0 ;
+		
+		score = 0 ;
+	
+		for ( i = 0 ; cigar[i] ; ++i )
+		{
+			if ( cigar[i] >= '0' && cigar[i] <= '9' )
+				num = num * 10 + cigar[i] - '0' ;
+			else if ( cigar[i] == 'I' || cigar[i] == 'S' || cigar[i] == 'H'
+				|| cigar[i] == 'P' )
+			{
+				num = 0 ;
+			}
+			else if ( cigar[i] == 'N' )
+			{
+				segments[segCnt].a = start ;
+				segments[segCnt].b = start + len - 1 ;
+				++segCnt ;
+				
+				start = start + len + num ;
+				len = 0 ;
+				num = 0 ;
+			}
+			else
+			{
+				len += num ;
+				num = 0 ;
+			}
+		}
+	
+
+		if ( len > 0 )
+		{
+			segments[segCnt].a = start ;
+			segments[segCnt].b = start + len - 1 ;
+			++segCnt ;
+		}
+			
+		if ( segCnt <= 1 || strstr( samLine, "XS:A:" ) != NULL )
+		{
+			printf( "%s", samLine ) ;
+			continue ;
+		}
+			
+		std::string schr( chrom ) ;
+		int chrId = chrToChrId[schr] ;
+		int strand = -1 ;
+		
+		len = strlen( samLine ) ;
+		samLine[len - 1] = '\0' ;
+		int motif = 0 ;
+		
+		BitSequence &chrSeq = genome[ chrId ] ;
+
+		for ( i = 0 ; i < segCnt - 1 ; ++i )
+		{
+			char m[4] ;
+			m[0] = chrSeq.Get( segments[i].b + 1 - 1  ) ;
+			m[1] = chrSeq.Get( segments[i].b + 2 - 1 ) ;
+			m[2] = chrSeq.Get( segments[i + 1].a - 2 - 1 ) ;
+			m[3] = chrSeq.Get( segments[i + 1].a - 1 - 1 ) ;
+			motif = 0 ;
+			for ( j = 0 ; j < 4 ; ++j )
+				motif = ( motif << 2 ) + ( nucToNum[ m[j] - 'A' ] & 3 );
+			strand = motifStrand[ motif ] ;
+			if ( strand != -1 )
+				break ;
+		}
+		if ( strand == 1 )
+		{
+			printf( "%s\tXS:A:+\n", samLine ) ;
+			continue ;
+		}
+		else if ( strand == 0 )
+		{
+			printf( "%s\tXS:A:-\n", samLine ) ;
+			continue ;
+		}
+
+		char *p ;
+		if ( ( p = strstr( samLine, "NH:i:" ) ) != NULL )
+		{
+			p += 5 ;
+			num = 0 ;
+			while ( *p >= '0' && *p <= '9' )
+			{
+				num = num * 10 + *p - '0' ;
+				++p ;
+			}
+			if ( num > 1 )
+				score |= 1 ;
+		}
+		
+		seqLen = strlen( seq ) ;
+		for ( i = 0 ; i < seqLen ; ++i )
+			if ( seq[i] == 'N' )
+			{
+				score |= 1 ;
+				break ;
+			}
+
+		num = 0 ;
+		len = 0 ;
+		seqSegCnt = 0 ;
+		start = 0 ;
+		for ( i = 0 ; cigar[i] ; ++i )
+		{
+			if ( cigar[i] >= '0' && cigar[i] <= '9' )
+				num = num * 10 + cigar[i] - '0' ;
+			else if ( cigar[i] == 'D' || cigar[i] == 'S' || cigar[i] == 'H'
+				|| cigar[i] == 'P' )
+			{
+				num = 0 ;
+			}
+			else if ( cigar[i] == 'N' )
+			{
+				seqSegments[seqSegCnt].a = start ;
+				seqSegments[seqSegCnt].b = start + len - 1 ;
+				++seqSegCnt ;
+				
+				start = start + len ;
+				len = 0 ;
+				num = 0 ;
+			}
+			else
+			{
+				len += num ;
+				num = 0 ;
+			}
+		}
+		if ( len > 0 )
+		{
+			seqSegments[seqSegCnt].a = start ;
+			seqSegments[seqSegCnt].b = start + len - 1 ;
+			++seqSegCnt ;
+		}
+
+		// Check whether we can shift the intron to a canonical splice site
+		for ( i = 0 ; i < segCnt - 1 ; ++i )
+		{
+			for ( j = -width ; j < width ; ++j )
+			{
+				if ( segments[i].b + j < segments[i].a || segments[i+1].a + j > segments[i+1].b )
+					continue ;
+				// Look for the splice signal.
+				char m[4] ;
+				m[0] = chrSeq.Get( segments[i].b + j + 1 - 1  ) ;
+				m[1] = chrSeq.Get( segments[i].b + j + 2 - 1 ) ;
+				m[2] = chrSeq.Get( segments[i + 1].a + j - 2 - 1 ) ;
+				m[3] = chrSeq.Get( segments[i + 1].a + j - 1 - 1 ) ;
+				motif = 0 ;
+				for ( k = 0 ; k < 4 ; ++k )
+					motif = ( motif << 2 ) + ( nucToNum[ m[k] - 'A' ] & 3 );
+				strand = motifStrand[ motif ] ;
+				if ( strand == -1 )
+					continue ;
+				//printf( "%d %d: %d\n", i, j, motif ) ;
+
+				// We found a signal, then test whether we can shift the intron.
+				// Extract the sequence from the reference genome.
+				int l = 0 ;
+				if ( j < 0 )
+					for ( k = segments[i + 1].a + j, l = 0 ; k <= segments[i + 1].a - 1 ; ++k, ++l )					
+						buffer[l] = chrSeq.Get( k - 1 ) ;
+				else
+					for ( k = segments[i].b + 1, l= 0  ; k <= segments[i].b + j ; ++k, ++l )
+						buffer[l] = chrSeq.Get(k - 1) ;
+				buffer[l] = '\0' ;
+
+				//printf( "%s\n", buffer ) ;
+
+				// Get the coordinate on the sequence.
+				int tag ;
+				int useBegin = 0 ; // is the portion from the beginning part of a seqSegment?
+				int from, to ;
+				if ( j < 0 )
+				{
+					tag = i ;
+					useBegin = 0 ;
+				}
+				else // j>0
+				{
+					tag = i + 1 ;
+					useBegin = 1 ;
+				}
+				//printf( "%d %d\n", tag, useBegin ) ;	
+				
+				if ( ( flag & 0x10 ) != 0 )
+				{
+					//TODO: it seems star already fliped the sequence.
+					//  is it true for other aligners?
+					
+					//tag = segCnt - 1 - tag ;
+					//useBegin = 1 - useBegin ;
+					
+				
+					//ReverseComplement( buffer, l ) ;
+				}
+
+				if ( useBegin )
+				{
+					from = seqSegments[tag].a ;
+					to = seqSegments[tag].a + l - 1 ;
+				}
+				else
+				{
+					from = seqSegments[tag].b - l + 1 ;
+					to = seqSegments[tag].b ;
+				}
+				//printf( "%d %d %d\n", tag, from, to ) ;
+
+				for ( k = 0 ; k < l ; ++k )
+				{
+					if ( seq[from + k] != buffer[k] )
+						break ;
+				}
+				
+
+				if ( k >= l )
+				{
+					// We can shift the intron
+					score |= 2 ;
+					break ;
+				}
+			}
+			if ( j < width )
+				break ;
+		}
+
+		// Check whether the sequence is low-complexity. To do this, we test whether we can shift the seqSegments and directly inspect the sequence.
+		for ( i = 0 ; i < seqSegCnt ; ++i )
+		{
+			int l ;
+			for ( k = 0, j = seqSegments[i].a - width ; j <= seqSegments[i].b + width ; ++j, ++k )
+				buffer[k] = chrSeq.Get( j ) ;
+			buffer[k] = '0' ;
+			
+			for ( l = 0, j = seqSegments[i].a ; j <= seqSegments[i].b ; ++j, ++l )
+			{
+				pattern[l] = seq[j] ;		
+			}
+			pattern[l] = '\0' ;
+
+			// It seems the aligner already flipped the read in its output?
+			//if ( flag & 0x10 != 0 )
+			//	ReverseComplement( pattern, l ) ;
+			
+			
+			char *p = buffer ;
+			int cnt = 0 ;
+			while ( ( p = strstr( p, pattern ) ) != NULL )
+			{
+				++cnt ;
+				++p ;
+			}
+			if ( cnt > 1 )
+			{
+				score |= 4 ;
+				break ;
+			}
+
+			int count[5] = { 0, 0, 0, 0, 0 } ;
+			int max = 0 ;
+			for ( j = 0 ; j < l ; ++j )
+			{
+				++count[ nucToNum[ pattern[j] - 'A' ] ] ; 	
+			}
+
+			for ( j = 0 ; j < 5 ; ++j )
+			{
+				if ( count[j] > max )
+					max = count[j] ;
+			}
+			if ( max > 0.8 * l )
+			{
+				score |= 4 ;
+				break ;
+			}
+		}
+
+		if ( ( p = strstr( samLine, "NM:i:" ) ) != NULL || ( p = strstr( samLine, "nM:i:" ) ) != NULL )
+		{
+			int nm = atoi( p + 5 ) ;
+			if ( nm > 0 )
+			{
+				score |= 8 ;
+			}
+		}
+		else
+		{
+			// TODO: check the nm by ourself
+		}
+
+		// Check whether the alignment is concordant.
+		if ( ( flag & 0x1 ) != 0 && ( flag & 0x4 ) == 0 )
+		{
+			start = segments[0].a ;
+			if ( strcmp( mateChrom, "=" ) 
+				|| ( flag & 0x10 ) == ( flag & 0x20 ) 
+				|| ( ( flag & 0x10 ) == 0 && ( mstart < start || mstart > start + 2000000 ) ) 
+				|| ( ( flag & 0x10 ) != 0 && ( mstart > start || mstart < start - 2000000 ) ) )
+			{
+				score |= 16 ;
+			}
+		}
+
+		printf( "%s\tXS:A:?\tYS:i:%d\n", samLine, score ) ;
+	}
+
+	return 0 ;
+}