view PsiCLASS-1.0.2/AddGeneName.cpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
line wrap: on
line source

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#include <string>
#include <map>
#include <vector>
#include <algorithm>

char usage[] = "Usage: ./add-genename annotation.gtf gtflist [OPTIONS]\n"
		"\t-o: the directory for output gtf files (default: ./)\n" ;

struct _interval
{
	char chrom[97] ;
	char geneName[97] ;

	int start, end ;
} ;

char line[10000] ;
char buffer[10000], buffer2[10000], buffer3[10000] ;

int CompInterval( const struct _interval &a, const struct _interval &b )
{
	int chromCmp = strcmp( a.chrom, b.chrom ) ;
	if ( chromCmp )
		return chromCmp ;
	else if ( a.start != b.start )
		return a.start - b.start ;
	else
		return a.end - b.end ;
}

bool Comp( const struct _interval &a, const struct _interval &b )
{
	int cmp = CompInterval( a, b ) ;
	if ( cmp < 0 )
		return true ;
	else
		return false ;

	return false ;
}

void SortAndCleanIntervals( std::vector<struct _interval> &it )
{
	std::sort( it.begin(), it.end(), Comp ) ; 

	int i, k ;
	int size = it.size() ;
	if ( size == 0 )
		return ;

	k = 1 ;
	for ( i = 1 ; i < size ; ++i )
	{
		if ( CompInterval( it[k - 1], it[i] ) )
		{
			it[k] = it[i] ;
			++k ;
		}
	}
	it.resize( k ) ;
} 

int GetGTFField( char *ret, char *line, const char *fid )
{
	char *p = strstr( line, fid ) ;
	if ( p == NULL )
		return 0 ;
	//printf( "%s %s\n", line, fid ) ;
	for ( ; *p != ' ' ; ++p )
		;
	p += 2 ;
	
	sscanf( p, "%s", ret ) ;
	p = ret + strlen( ret ) ;
	while ( p != ret && *p != '\"' )
		--p ;
	*p = '\0' ;
	return 1 ;
}

void UpdateIdToAnnoId( std::vector<struct _interval> &transcript, char *tid, int exonTag, int intronTag, std::vector<struct _interval> &exons, std::vector<struct _interval> &introns,
	std::map<std::string, std::string> &txptIdToAnnoId, std::map<std::string, std::string> &geneIdToAnnoId )
{
	int i, k ;
	bool flag = false ;
	// First, try whether intron works.
	int ecnt = transcript.size() ;
	int intronSize = introns.size() ;
	int exonSize = exons.size() ;

	struct _interval itron ;
	strcpy( itron.chrom, transcript[0].chrom ) ;
	k = intronTag ;
	for ( i = 0 ; i < ecnt - 1 && !flag ; ++i )
	{
		itron.start = transcript[i].end ;
		itron.end = transcript[i + 1].start ;
		for ( ; k < intronSize ; ++k )
		{
			//printf( "hi %d: %s %d %d; %d %s %d %d\n", 0, itron.chrom, itron.start, itron.end, k, introns[k].chrom, introns[k].start, introns[k].end ) ;
			if ( strcmp( introns[k].chrom, itron.chrom ) )
				break ;

			int cmp = CompInterval( introns[k], itron ) ;
			if ( cmp == 0 )
			{
				txptIdToAnnoId[ std::string( tid ) ] = introns[k].geneName ;
				geneIdToAnnoId[ std::string( transcript[0].geneName ) ] = introns[k].geneName ;
				flag = true ;
				break ;
			}
			else if ( cmp > 0 )
				break ;
		}
	}

	// Next, try whether exon works
	k = exonTag ;
	for ( i = 0 ; i < ecnt && !flag ; ++i )
	{
		for ( ; k < exonSize ; ++k )
		{
			if ( strcmp( exons[k].chrom, transcript[i].chrom ) )
				break ;

			if ( exons[k].end < transcript[i].start )
				continue ;
			else if ( exons[k].start > transcript[i].end )
				break ;
			else
			{
				txptIdToAnnoId[ std::string( tid ) ] = exons[k].geneName ;
				geneIdToAnnoId[ std::string( transcript[0].geneName ) ] = exons[k].geneName ;
				flag = true ;
				break ;
			}
		}
	}
}

int main( int argc, char *argv[] )
{
	int i, j, k ;
	FILE *fp ;
	FILE *fpList ;
	char outputPath[1024] = "./" ;
	char prevTid[97] = "" ;

	std::vector<struct _interval> introns, exons ;
	std::map<std::string, std::string> txptIdToAnnoId, geneIdToAnnoId ;
	std::map<std::string, int> chromIdMapIntrons, chromIdMapExons ; // the index for the starting of a chrom.
	
	if ( argc < 3 )
	{	
		fprintf( stderr, "%s", usage ) ;
		exit( 1 ) ;
	}

	for ( i = 3 ; i < argc ; ++i )
	{
		if ( !strcmp( argv[i], "-o" ) )
		{
			strcpy( outputPath, argv[i + 1 ] ) ;	
			++i ;
		}
		else
		{
			fprintf( stderr, "Unknown argument: %s\n", argv[i] ) ;
			exit( 1 ) ;
		}
	}

	// Get the exons, introns from the annotation.
	fp = fopen( argv[1], "r" ) ;
	while ( fgets( line, sizeof( line ), fp ) != NULL )
	{
		if ( line[0] == '#' )
			continue ;
		char tid[97] ;
		char gname[97] ;
		char chrom[97] ;
		char type[50] ;
		int start, end ;

		sscanf( line, "%s %s %s %d %d", chrom, buffer, type, &start, &end ) ;
		if ( strcmp( type, "exon" ) )
			continue ;
		if ( GetGTFField( gname, line, "gene_name" ) )
		{
			struct _interval ne ;
			strcpy( ne.chrom, chrom ) ;
			strcpy( ne.geneName, gname ) ;
			ne.start = start ; ne.end = end ;

			exons.push_back( ne ) ;
		}
		else
		{
			fprintf( stderr, "%s has no field of gene_name.\n", line ) ;
			exit( 1 ) ;
		}

		if ( GetGTFField( tid, line, "transcript_id" ) )
		{
			if ( !strcmp( tid, prevTid ) )
			{	
				struct _interval ni ;
				
				strcpy( ni.chrom, chrom ) ;
				strcpy( ni.geneName, gname ) ;	

				int size = exons.size() ;
				if ( size < 2 )
					continue ;

				struct _interval &e1 = exons[ size - 2 ] ;
				struct _interval &e2 = exons[ size - 1 ] ;
				if ( e1.start < e2.start )
				{
					ni.start = e1.end ;
					ni.end = e2.start ;
				}
				else
				{
					ni.start = e2.end ;
					ni.end = e1.start ;
				}

				introns.push_back( ni ) ;			
			}
			else
				strcpy( prevTid, tid ) ;
		}
		else
		{
			fprintf( stderr, "%s has no field of transcript_id.\n", line ) ;
			exit( 1 ) ;
		}
	}
	fclose( fp ) ;
	SortAndCleanIntervals( exons ) ;
	SortAndCleanIntervals( introns ) ;
	int exonSize = exons.size() ;
	int intronSize = introns.size() ;
	// Determine the offset for each chrom on the list of features.	
	if ( exonSize )
	{
		chromIdMapExons[ std::string( exons[0].chrom ) ] = 0 ;
		for ( i = 1 ; i < exonSize ; ++i )
		{
			if ( strcmp( exons[i].chrom, exons[i - 1].chrom ) )
				chromIdMapExons[ std::string( exons[i].chrom ) ] = i ;
		}
	}

	if ( intronSize )
	{
		chromIdMapIntrons[ std::string( introns[0].chrom ) ] = 0 ;
		for ( i = 1 ; i < intronSize ; ++i )
		{
			if ( strcmp( introns[i].chrom, introns[i - 1].chrom ) )
			{
				//printf( "%s %d\n", introns[i].chrom, i ) ;
				chromIdMapIntrons[ std::string( introns[i].chrom ) ] = i ;
			}
		}

		//for ( i = 0 ; i < intronSize ; ++i )
		//	printf( "%s\t%d\t%d\n", introns[i].chrom, introns[i].start, introns[i].end ) ;
	}
	//printf( "%d %d\n", exonSize, intronSize ) ;

	// Go through all the GTF files to find the map between PsiCLASS's gene_id to annotation's gene name
	fpList = fopen( argv[2], "r ") ;
	std::vector<struct _interval> transcript ;
	int exonTag = 0 ;
	int intronTag = 0 ;

	while ( fscanf( fpList, "%s", line ) != EOF )
	{
		// Set the output file
		//printf( "hi: %s\n", line ) ;
		char *p ;
		p = line + strlen( line ) ;
		while ( p != line && *p != '/' )
		{
			if ( *p == '\n' )
				*p = '\0' ;
			--p ;
		}
		sprintf( buffer, "%s/%s", outputPath, p ) ;

		// Test whether this will overwrite the input gtf file.
		if ( realpath( buffer, buffer2 ) != NULL )
		{
			if ( realpath( line, buffer3 ) && !strcmp( buffer2, buffer3 ) )
			{
				fprintf( stderr, "Output will overwrite the input files. Please use -o to specify a different output directory.\n" ) ;
				exit( 1 ) ;
			}
		}

		fp = fopen( line, "r" ) ;
		transcript.resize( 0 ) ; // hold the exons in the transcript.
		prevTid[0] = '\0' ;
		exonTag = 0 ;
		intronTag = 0 ;
		int farthest = 0 ;

		while ( fgets( line, sizeof( line ), fp ) )
		{
			char tid[97] ;
			//char gname[97] ;
			char chrom[97] ;
			char type[50] ;
			int start, end ;

			if ( line[0] == '#' )
				continue ;

			sscanf( line, "%s %s %s %d %d", chrom, buffer, type, &start, &end ) ;
			if ( strcmp( type, "exon" ) )
				continue ;

			if ( GetGTFField( tid, line, "transcript_id" ) )
			{
				struct _interval ne ;
				strcpy( ne.chrom, chrom ) ;
				ne.start = start ;
				ne.end = end ;
				GetGTFField( ne.geneName, line, "gene_id" ) ;
				
				if ( !strcmp( tid, prevTid ) )
				{
					transcript.push_back( ne ) ;	
					if ( end > farthest )
						farthest = end ;
				}
				else if ( transcript.size() > 0 )
				{
					// the non-existed chrom will be put to offset 0, and that's fine
					if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) )
						intronTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ; 
					if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) )
						exonTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ;

					UpdateIdToAnnoId( transcript, prevTid, exonTag, intronTag, exons, introns, txptIdToAnnoId, geneIdToAnnoId ) ;

					// Adjust the offset if we are done with a gene cluster
					// We don't need to worry about the case of changing chrom here.
					if ( !strcmp( ne.geneName, transcript[0].geneName ) &&
						start > farthest ) // Use farthest to avoid inteleaved gene.
					{
						while ( intronTag < intronSize && !strcmp( introns[intronTag ].chrom, ne.chrom )
							&& introns[intronTag].end < ne.start )		
							++intronTag ;

						while ( exonTag < exonSize && !strcmp( exons[ exonTag ].chrom, ne.chrom ) 
							&& exons[ exonTag ].end < ne.start )
							++exonTag ;

						farthest = end ;
					}

					transcript.resize( 0 ) ;
					transcript.push_back( ne ) ;
					
					// Find the overlaps.
					strcpy( prevTid, tid ) ;
				}
				else
					strcpy( prevTid, tid ) ;
			}
			else
			{
				fprintf( stderr, "Could not find transcript_id field in GTF file: %s", line ) ;
				exit( 1 ) ;
			}
		}

		if ( transcript.size() > 0 )
		{
			if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) )
				intronTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ; 
			if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) )
				exonTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ;

			UpdateIdToAnnoId( transcript, prevTid, exonTag, intronTag, exons, introns, txptIdToAnnoId, geneIdToAnnoId ) ;
		}
		fclose( fp ) ;
	}
	fclose( fpList ) ;

	// Add the gene_name field.
	fpList = fopen( argv[2], "r" ) ;
	int novelCnt = 0 ;
	while ( fscanf( fpList, "%s", line ) != EOF )
	{
		FILE *fpOut ;
		// Set the output file
		char *p ;
		p = line + strlen( line ) ;
		while ( p != line && *p != '/' )
		{
			if ( *p == '\n' )
				*p = '\0' ;
			--p ;
		}
		sprintf( buffer, "%s/%s", outputPath, p ) ;
		fpOut = fopen( buffer, "w" ) ;

		// Process the input file
		fp = fopen( line, "r" ) ;

		transcript.resize( 0 ) ;
		prevTid[0] = '\0' ;
		while ( fgets( line, sizeof( line ), fp ) )
		{
			char gname[97] ;
			if ( line[0] == '#' )
			{
				fprintf( fpOut, "%s", line ) ;
				continue ;
			}
			
			if ( GetGTFField( buffer, line, "transcript_id" ) )
			{
				if ( txptIdToAnnoId.find( std::string( buffer ) ) != txptIdToAnnoId.end() )
				{
					int len = strlen( line ) ;
					if ( line[len - 1] == '\n' )
					{
						line[len - 1] = '\0' ;
						--len ;
					}
					fprintf( fpOut, "%s gene_name \"%s\";\n", line, txptIdToAnnoId[std::string( buffer )].c_str() ) ;
					continue ;
				}
			}

			if ( GetGTFField( gname, line, "gene_id" ) )
			{
				int len = strlen( line ) ;
				if ( line[len - 1] == '\n' )
				{
					line[len - 1] = '\0' ;
					--len ;
				}
				if ( geneIdToAnnoId.find( std::string( gname ) ) != geneIdToAnnoId.end() )
				{
					fprintf( fpOut, "%s gene_name \"%s\";\n", line, geneIdToAnnoId[std::string(gname)].c_str() ) ;
				}
				else
				{
					sprintf( buffer, "novel_%d", novelCnt ) ;
					geneIdToAnnoId[ std::string( gname ) ] = std::string( buffer ) ;
					fprintf( fpOut, "%s gene_name \"%s\";\n", line, buffer ) ;
					++novelCnt ;
				}

			}
			else
				fprintf( fpOut, "%s", line ) ;

		}
			
		fclose( fp ) ;
		fclose( fpOut ) ;
	}

	return 0 ;
}