view fastx_toolkit-0.0.6/src/libfastx/sequence_alignment.cpp @ 3:997f5136985f draft default tip

Uploaded
author xilinxu
date Thu, 14 Aug 2014 04:52:17 -0400
parents
children
line wrap: on
line source

#include <string>
#include <vector>
#include <ostream>
#include <iostream>
#include <algorithm>
#include <iomanip>
#include <err.h>

#include "sequence_alignment.h"

using namespace std;

void  SequenceAlignmentResults::print(std::ostream& strm) const
{
	size_t delta;
	size_t index;

	strm << "Query-Alingment = " << query_alignment << endl ;
	strm << "target-Alingment= " << target_alignment << endl ;


 	strm << (alignment_found ? "Alignment Found" : "Alignment NOT found") << endl;
	strm << "Score = " << score << " ("
	     << matches << " matches, "
	     << neutral_matches << " neutral-matches, "
	     << mismatches << " mismatches, "
	     << gaps << " gaps) "
	     << std::endl ;
	
	strm << "Query = " << query_sequence
	     << "(qsize " << query_size
	     << " qstart " << query_start
	     << " qend " << query_end 
	     << std::endl ;

	strm << "Target= " << target_sequence
	     << "(tsize " << target_size
	     << " tstart " << target_start
	     << " tend " << target_end 
	     << std::endl ;

        strm << endl;

	delta = max(target_start, query_start);


	//Spaces before the query string
	if ( delta - query_start > 0 )
		strm << std::string( delta - query_start-1, ' ') ;
	//Un-Aligned query part (prefix)
	if ( query_start > 0 )
		strm << query_sequence.substr(0, query_start-1) ;
	//Aligned query part
	strm << "(" << query_alignment << ")";
	//Un-Aligned query part (suffix)
	if ( query_end < query_sequence.length() )
		strm << query_sequence.substr( query_end+1 ) ;
	strm << std::endl ;

	//Alignment bars
	if ( delta > 0 )
		strm << std::string( delta-1, ' ') ;
	strm << "(" ;
	for (index=0; index<query_alignment.length(); index++) {
		strm << ((query_alignment[index]==target_alignment[index]) ? '*' : '|' );
	}
	strm << ")" ;
	strm << std::endl;

	//Spaces before the target string
	if ( delta - target_start > 0 )
		strm <<  std::string( delta - target_start, ' ') ;
	//Un-Aligned target part (prefix)
	if ( target_start > 0 )
		strm << target_sequence.substr(0, target_start-1);
	//Aligned target part
	strm << "(" << target_alignment << ")";

	//Un-Aligned target part (suffix)
	if ( target_end < target_sequence.length() )
		strm << target_sequence.substr( target_end+1 );
	strm << std::endl;

}

SequenceAlignment::SequenceAlignment ( ) :
	_gap_panelty(-5),
	_match_panelty(1),
	_mismatch_panelty(-1),
	_neutral_panelty(0.1)

{
}


void SequenceAlignment::set_sequences(const std::string& _query, const std::string& _target)
{
	_query_sequence = _query ;
	_target_sequence = _target ;
}

void SequenceAlignment::reset_alignment_results()
{
	_alignment_results = SequenceAlignmentResults() ;
	//
	//Reset the results
	_alignment_results.query_sequence = query_sequence() ;
	_alignment_results.target_sequence = target_sequence() ;
}

const SequenceAlignmentResults& SequenceAlignment::align ( const std::string& query, const std::string& target )
{
	set_sequences ( query, target ) ;

	reset_alignment_results();

	resize_matrix ( query_sequence().length(), target_sequence().length() ) ;
	populate_match_matrix();

	reset_matrix( matrix_width(), matrix_height() );
	populate_matrix();
	find_optimal_alignment();

	post_process();

	return _alignment_results;
}

void SequenceAlignment::resize_matrix(size_t width, size_t height)
{
	size_t i;

	if ( matrix_width() >= width && matrix_height() >= height )
		return ;

	query_border.resize ( width ) ;
	target_border.resize ( height ) ;

	score_matrix.resize ( width );
	for (i=0;i<width;i++)
		score_matrix[i].resize(height) ;

	origin_matrix.resize ( width );
	for (i=0;i<width;i++)
		origin_matrix[i].resize(height) ;

	match_matrix.resize ( width );
	for (i=0;i<width;i++) {
		match_matrix[i].resize(height) ;
	}
}

void SequenceAlignment::populate_match_matrix()
{
	for (size_t x=0; x<matrix_width(); x++)
		for(size_t y=0;y<matrix_height();y++)
			match_matrix[x][y] = 
				match_value ( query_nucleotide(x), target_nucleotide(y) ) ;
}


void SequenceAlignment::post_process()
{
	
}

void SequenceAlignment::print_matrix(std::ostream &strm) const
{
	size_t query_index ;
	size_t target_index ;

	#if 0
	printf("Match-Matrix:\n");
	printf(" - ");
	for ( target_index=1; target_index<matrix_height(); target_index++ ) 
		printf(" %c ", target_sequence()[target_index-1] );
	printf("\n");

	for ( query_index=1; query_index<matrix_width(); query_index++ ) {

		printf(" %c ", query_sequence()[query_index-1]) ;

		for ( target_index=1 ; target_index<matrix_height(); target_index++ ) {
			printf(" %c ", match_matrix[query_index][target_index]);		
		}
		printf("\n");
	}
	#endif

	strm << "Score-Matrix:" << endl ;

	//Print Target nucleotides
	strm << setw(2) << left << "-" << setw(7) << "-" ;
	for ( query_index=0; query_index<matrix_width(); query_index++ ) 
		strm << setw(9) << left << query_nucleotide ( query_index ) ;
	strm << endl;
	strm << setw(2) << left << "-" << setw(7) << "-" ;
	for ( query_index=0; query_index<matrix_width(); query_index++ ) 
		strm << setw(9) << left << query_border[query_index] ;
	strm << endl;

	for ( target_index=0; target_index<matrix_height(); target_index++ ) {

		strm << setw(2) << left << target_nucleotide ( target_index ) ;
		strm << setw(6) << right << target_border[target_index] << setw(1) << " ";

		for ( query_index=0 ; query_index<matrix_width(); query_index++ ) {
			char ch ;
			switch (origin ( query_index, target_index ) ) 
			{
			case FROM_UPPER:      ch = '|' ;  break ;
			case FROM_LEFT:       ch = '-' ;  break ;
			case FROM_UPPER_LEFT: ch = '\\' ;  break ;
			case FROM_NOWHERE:    ch = '=' ;  break ;
			default:              ch = '*' ; break ;
			}

			strm << left ;
			strm << setw(1) << match(query_index,target_index);
			strm << setw(1) << ch ;
			strm << setw(7) << fixed << setprecision(1) 
			     << score(query_index,target_index) ;
		}
		strm << endl;
	}
}

#if 0
void LocalSequenceAlignment::reset_matrix( size_t width, size_t height ) 
{
	size_t x,y ;

	highest_scored_query_index = 0 ;
	highest_scored_target_index = 0 ;

	for (x=0; x<width; x++) 
		score_matrix[x][0] = 0 ;
	for (y=0; y<height; y++) 
		score_matrix[0][y] = 0 ;

}

void LocalSequenceAlignment::populate_matrix ( )
{
	size_t query_index ;
	size_t target_index ;

	ssize_t highest_score = 0 ;

	for ( query_index=1; query_index<matrix_width(); query_index++ ) {
		for ( target_index=1 ; target_index<matrix_height(); target_index++ ) {
			ssize_t score = alignment_score(query_index, target_index);

			//printf("score(q=%zu,t=%zu)=%zu\n", query_index, target_index, score ) ;
			score_matrix[query_index][target_index] = (score>0) ? score : 0 ;

			//NOTE
			// not sure ">=" is strictly correct SW (might be just ">")
			if ( score > highest_score ) {
				highest_scored_query_index = query_index ;
				highest_scored_target_index = target_index ;
				highest_score = score ;
			}
		}

	}
}

void LocalSequenceAlignment::find_optimal_alignment ( ) 
{
	size_t query_index = highest_scored_query_index ;
	size_t target_index = highest_scored_target_index;

	_alignment_results.query_end = query_index-1 ;
	_alignment_results.target_end= target_index-1 ;

	_alignment_results.score = score_matrix[query_index][target_index];

	_alignment_results.matches = 0 ;
	_alignment_results.mismatches = 0 ;

	while ( query_index > 0 || target_index > 0 ) {
		if ( score_matrix[query_index][target_index]==0)
			break ;

		//go "left" in the matrix
		if ( query_index>0 &&
		     score_matrix[query_index][target_index] == score_matrix[query_index-1][target_index] + gap_panelty() ) {

			_alignment_results.target_alignment += "-" ;
			_alignment_results.query_alignment += query_sequence()[query_index-1] ;
			query_index--;
		}
		else
		//go "up-left" in the matrix
		if ( query_index>0 && target_index>0 &&
		     score_matrix[query_index][target_index] == 
		     	score_matrix[query_index-1][target_index-1] + match_score(query_index, target_index) ) {

			_alignment_results.target_alignment += target_sequence()[target_index-1];
			_alignment_results.query_alignment += query_sequence()[query_index-1] ;

			(query_sequence()[query_index-1] == target_sequence()[target_index-1]) ?
				(++_alignment_results.matches) : (++_alignment_results.mismatches) ;

			query_index--;
			target_index--;
		}
		else 
		//go "up" in the matrix
		{
			_alignment_results.target_alignment += target_sequence()[target_index-1];
			_alignment_results.query_alignment += "-" ;
			target_index--;
		}
	}

	_alignment_results.query_start = query_index ;
	_alignment_results.target_start= target_index ;

	_alignment_results.query_size = query_sequence().length();
	_alignment_results.target_size= target_sequence().length();

	std::reverse(_alignment_results.target_alignment.begin(), _alignment_results.target_alignment.end());
	std::reverse(_alignment_results.query_alignment.begin(), _alignment_results.query_alignment.end());
}
#endif

void HalfLocalSequenceAlignment::set_sequences(const std::string& _query, const std::string& _target)
{
	//_query_sequence  = _query + std::string( _target.length(), 'N' ); 
	//_target_sequence = std::string( _query.length(), 'N' ) + _target;
	_query_sequence = _query ;
	_target_sequence = _target ;
}


void HalfLocalSequenceAlignment::reset_matrix( size_t width, size_t height ) 
{
	size_t x,y ;

	highest_scored_query_index = 0 ;
	highest_scored_target_index = 0 ;

	for (x=0; x<width; x++) {
		query_border[x] = 
			//gap_panelty() * (ssize_t)x ;
			//((query_sequence()[x-1]=='N') ? neutral_panelty() : gap_panelty()) * (ssize_t)x ;
			//((query_sequence()[x-1]=='N') ? 0 : gap_panelty()) * (ssize_t)x ;
			0 ;
	}

	for (y=0; y<height; y++) {
		target_border[y] = 
			( y <= 3 ) ? 0 : (gap_panelty() * (ssize_t)(y-3));
			//0;
			//((target_sequence()[y-1]=='N') ? 0 : gap_panelty()) * (ssize_t)y ;
			//((target_sequence()[y-1]=='N') ? neutral_panelty() : gap_panelty()) * (ssize_t)y ;
	}
			
}

void HalfLocalSequenceAlignment::populate_matrix ( )
{
	size_t query_index ;
	size_t target_index ;
	DIRECTION origin = FROM_LEFT;

	score_type highest_score = -1000000 ;
	highest_scored_query_index = -1 ;
	highest_scored_target_index = -1 ;

	for ( query_index=0; query_index<matrix_width(); query_index++ ) {
		for ( target_index=0 ; target_index<matrix_height(); target_index++ ) {

			//Note:
			// 'safe_score()' can accept negative value of -1 (and will return the border value)
			score_type up_score     = safe_score(query_index,  ((ssize_t)target_index)-1) + gap_panelty() ;
			score_type left_score   = safe_score(((ssize_t)query_index)-1,target_index )  + gap_panelty() ; 
			score_type upleft_score = safe_score(((ssize_t)query_index)-1,((ssize_t)target_index)-1) + 
						nucleotide_match_score(query_index, target_index);

			//On the diagonal line, best score can not come from upper cell
			//only from left or upper-left cells
			if ( target_index>3 && target_index-3 > query_index ) {
				left_score = -100000 ;
			}

			//printf("query_index=%d, target_index=%d,  upscore=%f, left_score=%f, upleft_score=%f\n",
			//		query_index, target_index, up_score,left_score,upleft_score );

			score_type score = -100000000 ;

			if ( upleft_score > score ) {
				score = upleft_score ;
				origin = FROM_UPPER_LEFT;
			}
			if ( up_score > score ) {
				score = up_score ;
				origin = FROM_UPPER ;
			}
			if ( left_score > score ) {
				score = left_score ;
				origin = FROM_LEFT ;
			}
			//printf("query_index=%d, target_index=%d,  score=%f origin=%d\n",
			//		query_index, target_index, score, origin );
			
			/*if (score<0) {
				score = 0 ;
				origin = FROM_NOWHERE ;
			}*/
			
			score_matrix[query_index][target_index] = score ;
			origin_matrix[query_index][target_index] = origin ;

			//NOTE
			// not sure ">=" is strictly correct SW (might be just ">")
			if ( score > highest_score ) {
				highest_scored_query_index = query_index ;
				highest_scored_target_index = target_index ;
				highest_score = score ;
			}
		}
	}
}

bool HalfLocalSequenceAlignment::starting_point_close_to_end_of_sequences(const size_t query_index, const size_t target_index) const
{
	if ( (size_t)query_index  >= query_sequence().length() - 2  ||
	     (size_t)target_index >= target_sequence().length() - 2 ) {
		/* We've reach either the end of the Adapter
		 * (and the adapter is shorter than the query)
		 * Or the end of the query 
		 * (and the adapter covers up to the end of the query, and then continues on)
		 *
		 * So we can safely start the alignment from this point
		 */
		return true;
	}
	else {
		/* The adapter is not covering the query until the end.
		 */
		return false;
	}
}

#undef DEBUG_STARTING_POINT
void HalfLocalSequenceAlignment::find_alignment_starting_point(ssize_t &new_query_index, ssize_t &new_target_index) const
{
	 /*
	 * Force the alignment to start from the end of the query,
	 * find the best score at the end of the query
	 *
	 * Try (desperately) to find a match that starts at the end of the query or the end of the target/adapter)
	 */
	score_type max_score = score( matrix_width()-1, matrix_height()-1 ) ;
	for ( size_t q_index = 0 ; q_index < matrix_width(); q_index++ ) {
		for ( size_t t_index = matrix_height()-2 ; t_index < matrix_height(); t_index++ ) {
			if ( origin ( q_index, t_index ) > 0 && 
				safe_score ( q_index, t_index ) > max_score ) {
				max_score = safe_score ( q_index, t_index ) ;
				#ifdef DEBUG_STARTING_POINT
				printf("Found new max score = %f at %d,%d\n", max_score, q_index, t_index ) ;
				#endif
				new_target_index = t_index ;
				new_query_index = q_index ;
			}
		}
	}
	for ( size_t q_index = matrix_width()-2 ; q_index < matrix_width(); q_index++ ) {
		for ( size_t t_index = 0 ; t_index < matrix_height(); t_index++ ) {
			if ( origin ( q_index, t_index ) > 0 && 
				safe_score ( q_index, t_index ) > max_score ) {
				max_score = safe_score ( q_index, t_index ) ;
				#ifdef DEBUG_STARTING_POINT
				printf("Found new max score = %f at %d,%d\n", max_score, q_index, t_index ) ;
				#endif
				new_target_index = t_index ;
				new_query_index = q_index ;
			}
		}
	}
	#ifdef DEBUG_STARTING_POINT
	printf("Forcing alignment from query_index=%d, target_index=%d, score=%f, origin=%d\n",
		new_query_index, new_target_index,
		score ( new_query_index, new_target_index ),
		origin ( new_query_index, new_target_index ) );
	#endif
}


#undef DEBUG_FIND_OPTIMAL_ALIGNMENT
SequenceAlignmentResults HalfLocalSequenceAlignment::find_optimal_alignment_from_point ( const size_t query_start, const size_t target_start ) const
{
	SequenceAlignmentResults results;

	results.query_sequence = query_sequence();
	results.target_sequence= target_sequence();

	ssize_t query_index = query_start;
	ssize_t target_index = target_start ;

	results.query_end = query_index ;
	results.target_end= target_index ;

	#ifdef DEBUG_FIND_OPTIMAL_ALIGNMENT
	printf ( "backtrace starting from (qindex=%d, tindex=%d, score=%f)\n",
			query_index, target_index, score_matrix[query_index][target_index]) ;
	#endif
	
	while ( query_index >= 0 && target_index >= 0 ) {
		
		const char q_nuc = query_nucleotide(query_index);
		const char t_nuc = target_nucleotide(target_index);

		const DIRECTION current_origin = origin(query_index, target_index);
		const char current_match = match ( query_index, target_index ) ;


		#ifdef DEBUG_FIND_OPTIMAL_ALIGNMENT
		const score_type current_score = score(query_index, target_index);
		printf("query_index=%d   target_index=%d  query=%c target=%c score_matrix=%3.1f origin=%d  accumulated_score = %3.2f\n",
			query_index, target_index, 
			q_nuc, t_nuc,
			current_score, 
			current_origin,
			results.score) ;
		#endif
	
		results.query_start = query_index ;
		results.target_start= target_index ;

		switch ( current_origin )
		{
		case FROM_LEFT:
			results.target_alignment += "-" ;
			results.query_alignment += q_nuc ;
			results.gaps++;
			results.score += gap_panelty();

			query_index--;
			break ;

		case FROM_UPPER_LEFT:
			results.target_alignment += t_nuc;
			results.query_alignment += q_nuc ;

			switch ( current_match ) 
			{
			case 'N':
				results.neutral_matches++ ;
				results.score += neutral_panelty();
				break ;

			case 'M':
				results.matches++;
				results.score += match_panelty();
				break;

			case 'x':
				results.mismatches++;
				results.score += mismatch_panelty();
				break ;

			default:
				errx(1,"Internal error: unknown match type (%c) at query_index=%zu, target_index=%zu\n",
					current_match, query_index, target_index ) ;
			}

			query_index--;
			target_index--;
			break ;

		case FROM_UPPER:
			results.target_alignment += t_nuc ;
			results.query_alignment += "-" ;
			results.gaps++;
			results.score += gap_panelty();

			target_index--;
			break;

		case FROM_NOWHERE:
		default:
			print_matrix();
			printf("Invalid origin (%d) at query_index=%zu, target_index=%zu\n", 
					current_origin, query_index, target_index ) ;
			printf("Query = %s\n", query_sequence().c_str());
			printf("Target= %s\n", target_sequence().c_str());
			exit(1);
		}
	}

	results.query_size = query_sequence().length();
	results.target_size= target_sequence().length();

	std::reverse(results.target_alignment.begin(),results.target_alignment.end());
	std::reverse(results.query_alignment.begin(), results.query_alignment.end());

	return results;
}

void HalfLocalSequenceAlignment::find_optimal_alignment ( ) 
{
	SequenceAlignmentResults results ;
	

	//Try to find a good alignment, 
	//starting from the highest score cell.
	results = find_optimal_alignment_from_point ( highest_scored_query_index,
						      highest_scored_target_index ) ;

	//Some heuristics:
	//If the adapter matched 7 nucleotides anywhere in the query
	//without mismatches/gaps, accept it.
	if ( results.matches >= 7 
	     && 
	     results.mismatches == 0
	     &&
	     results.gaps == 0 ) {
	     	
		_alignment_results = results ;
		return ;
	}

	if ( starting_point_close_to_end_of_sequences ( highest_scored_query_index,
						        highest_scored_target_index ) ) {
		//We're already very close to the end of the target or query,
		//can't improve much else, so return what we've got.
		_alignment_results = results ;
		return ;
	}


	//More heuristics:
	/* The adapter is not covering the query until the end.
	 * Force the alignment to start from the end of the query,
	 * find the best score at the end of the query
	 *
	 * Try (desperately) to find a match that starts at the end of the query or the end of the target/adapter)
	 */
	ssize_t query_index = highest_scored_query_index ;
	ssize_t target_index = highest_scored_target_index;
	find_alignment_starting_point ( query_index, target_index ) ;

	_alignment_results = results ;
}

void HalfLocalSequenceAlignment::post_process()
{
#if 0
	//Removes the Ns which were added in 'set_sequences'
	//And adjust the results values accordingly

	//return ;
	//_query_sequence.erase ( _query_sequence.find_last_not_of('N') + 1) ;
	//_target_sequence.erase ( 0, _target_sequence.find_first_not_of('N') ) ;

	_alignment_results.query_sequence.erase ( _query_sequence.find_last_not_of('N') + 1) ;
	_alignment_results.target_sequence.erase ( 0, _target_sequence.find_first_not_of('N') ) ;
	_alignment_results.query_size = _alignment_results.query_sequence.length();
	_alignment_results.target_size = _alignment_results.target_sequence.length();


	size_t query_n_position = _alignment_results.query_alignment.find_last_not_of('N') ;
	int query_n_count;

	if ( query_n_position != string::npos )
		query_n_count = _alignment_results.query_alignment.length() - query_n_position ;
	else
		query_n_count = 0 ;

	int target_n_count = _alignment_results.target_alignment.find_first_not_of('N') ;

	if (query_n_position != string::npos )
		_alignment_results.query_alignment.erase( query_n_position ) ;
	_alignment_results.target_alignment.erase( 0,target_n_count ) ;

	//Update Results strucure
	_alignment_results.query_start+= target_n_count ;
	_alignment_results.query_end  -= query_n_count ;

	_alignment_results.target_start = 0;
	_alignment_results.target_end  =  _alignment_results.query_end - _alignment_results.query_start ;

	_alignment_results.query_alignment.erase ( 0, _alignment_results.query_start ) ;
	_alignment_results.target_alignment.erase ( _alignment_results.target_end+1 ) ;

	//Update match/mismatch/gap counts
	_alignment_results.matches = 0 ;
	_alignment_results.mismatches = 0 ;
	_alignment_results.gaps = 0 ;

	for (size_t index=0; index<_alignment_results.query_alignment.length(); index++) {
		char q = _alignment_results.query_alignment[index];
		char t = _alignment_results.target_alignment[index];

		if ( q == '-' || t=='-' ) {
			_alignment_results.gaps ++ ;
		} else {
			if ( q== t )
				_alignment_results.matches++;
			else
				_alignment_results.mismatches++;
		}
	}
#endif 
}