diff fastx_toolkit-0.0.6/src/libfastx/sequence_alignment.cpp @ 3:997f5136985f draft default tip

Uploaded
author xilinxu
date Thu, 14 Aug 2014 04:52:17 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastx_toolkit-0.0.6/src/libfastx/sequence_alignment.cpp	Thu Aug 14 04:52:17 2014 -0400
@@ -0,0 +1,710 @@
+#include <string>
+#include <vector>
+#include <ostream>
+#include <iostream>
+#include <algorithm>
+#include <iomanip>
+#include <err.h>
+
+#include "sequence_alignment.h"
+
+using namespace std;
+
+void  SequenceAlignmentResults::print(std::ostream& strm) const
+{
+	size_t delta;
+	size_t index;
+
+	strm << "Query-Alingment = " << query_alignment << endl ;
+	strm << "target-Alingment= " << target_alignment << endl ;
+
+
+ 	strm << (alignment_found ? "Alignment Found" : "Alignment NOT found") << endl;
+	strm << "Score = " << score << " ("
+	     << matches << " matches, "
+	     << neutral_matches << " neutral-matches, "
+	     << mismatches << " mismatches, "
+	     << gaps << " gaps) "
+	     << std::endl ;
+	
+	strm << "Query = " << query_sequence
+	     << "(qsize " << query_size
+	     << " qstart " << query_start
+	     << " qend " << query_end 
+	     << std::endl ;
+
+	strm << "Target= " << target_sequence
+	     << "(tsize " << target_size
+	     << " tstart " << target_start
+	     << " tend " << target_end 
+	     << std::endl ;
+
+        strm << endl;
+
+	delta = max(target_start, query_start);
+
+
+	//Spaces before the query string
+	if ( delta - query_start > 0 )
+		strm << std::string( delta - query_start-1, ' ') ;
+	//Un-Aligned query part (prefix)
+	if ( query_start > 0 )
+		strm << query_sequence.substr(0, query_start-1) ;
+	//Aligned query part
+	strm << "(" << query_alignment << ")";
+	//Un-Aligned query part (suffix)
+	if ( query_end < query_sequence.length() )
+		strm << query_sequence.substr( query_end+1 ) ;
+	strm << std::endl ;
+
+	//Alignment bars
+	if ( delta > 0 )
+		strm << std::string( delta-1, ' ') ;
+	strm << "(" ;
+	for (index=0; index<query_alignment.length(); index++) {
+		strm << ((query_alignment[index]==target_alignment[index]) ? '*' : '|' );
+	}
+	strm << ")" ;
+	strm << std::endl;
+
+	//Spaces before the target string
+	if ( delta - target_start > 0 )
+		strm <<  std::string( delta - target_start, ' ') ;
+	//Un-Aligned target part (prefix)
+	if ( target_start > 0 )
+		strm << target_sequence.substr(0, target_start-1);
+	//Aligned target part
+	strm << "(" << target_alignment << ")";
+
+	//Un-Aligned target part (suffix)
+	if ( target_end < target_sequence.length() )
+		strm << target_sequence.substr( target_end+1 );
+	strm << std::endl;
+
+}
+
+SequenceAlignment::SequenceAlignment ( ) :
+	_gap_panelty(-5),
+	_match_panelty(1),
+	_mismatch_panelty(-1),
+	_neutral_panelty(0.1)
+
+{
+}
+
+
+void SequenceAlignment::set_sequences(const std::string& _query, const std::string& _target)
+{
+	_query_sequence = _query ;
+	_target_sequence = _target ;
+}
+
+void SequenceAlignment::reset_alignment_results()
+{
+	_alignment_results = SequenceAlignmentResults() ;
+	//
+	//Reset the results
+	_alignment_results.query_sequence = query_sequence() ;
+	_alignment_results.target_sequence = target_sequence() ;
+}
+
+const SequenceAlignmentResults& SequenceAlignment::align ( const std::string& query, const std::string& target )
+{
+	set_sequences ( query, target ) ;
+
+	reset_alignment_results();
+
+	resize_matrix ( query_sequence().length(), target_sequence().length() ) ;
+	populate_match_matrix();
+
+	reset_matrix( matrix_width(), matrix_height() );
+	populate_matrix();
+	find_optimal_alignment();
+
+	post_process();
+
+	return _alignment_results;
+}
+
+void SequenceAlignment::resize_matrix(size_t width, size_t height)
+{
+	size_t i;
+
+	if ( matrix_width() >= width && matrix_height() >= height )
+		return ;
+
+	query_border.resize ( width ) ;
+	target_border.resize ( height ) ;
+
+	score_matrix.resize ( width );
+	for (i=0;i<width;i++)
+		score_matrix[i].resize(height) ;
+
+	origin_matrix.resize ( width );
+	for (i=0;i<width;i++)
+		origin_matrix[i].resize(height) ;
+
+	match_matrix.resize ( width );
+	for (i=0;i<width;i++) {
+		match_matrix[i].resize(height) ;
+	}
+}
+
+void SequenceAlignment::populate_match_matrix()
+{
+	for (size_t x=0; x<matrix_width(); x++)
+		for(size_t y=0;y<matrix_height();y++)
+			match_matrix[x][y] = 
+				match_value ( query_nucleotide(x), target_nucleotide(y) ) ;
+}
+
+
+void SequenceAlignment::post_process()
+{
+	
+}
+
+void SequenceAlignment::print_matrix(std::ostream &strm) const
+{
+	size_t query_index ;
+	size_t target_index ;
+
+	#if 0
+	printf("Match-Matrix:\n");
+	printf(" - ");
+	for ( target_index=1; target_index<matrix_height(); target_index++ ) 
+		printf(" %c ", target_sequence()[target_index-1] );
+	printf("\n");
+
+	for ( query_index=1; query_index<matrix_width(); query_index++ ) {
+
+		printf(" %c ", query_sequence()[query_index-1]) ;
+
+		for ( target_index=1 ; target_index<matrix_height(); target_index++ ) {
+			printf(" %c ", match_matrix[query_index][target_index]);		
+		}
+		printf("\n");
+	}
+	#endif
+
+	strm << "Score-Matrix:" << endl ;
+
+	//Print Target nucleotides
+	strm << setw(2) << left << "-" << setw(7) << "-" ;
+	for ( query_index=0; query_index<matrix_width(); query_index++ ) 
+		strm << setw(9) << left << query_nucleotide ( query_index ) ;
+	strm << endl;
+	strm << setw(2) << left << "-" << setw(7) << "-" ;
+	for ( query_index=0; query_index<matrix_width(); query_index++ ) 
+		strm << setw(9) << left << query_border[query_index] ;
+	strm << endl;
+
+	for ( target_index=0; target_index<matrix_height(); target_index++ ) {
+
+		strm << setw(2) << left << target_nucleotide ( target_index ) ;
+		strm << setw(6) << right << target_border[target_index] << setw(1) << " ";
+
+		for ( query_index=0 ; query_index<matrix_width(); query_index++ ) {
+			char ch ;
+			switch (origin ( query_index, target_index ) ) 
+			{
+			case FROM_UPPER:      ch = '|' ;  break ;
+			case FROM_LEFT:       ch = '-' ;  break ;
+			case FROM_UPPER_LEFT: ch = '\\' ;  break ;
+			case FROM_NOWHERE:    ch = '=' ;  break ;
+			default:              ch = '*' ; break ;
+			}
+
+			strm << left ;
+			strm << setw(1) << match(query_index,target_index);
+			strm << setw(1) << ch ;
+			strm << setw(7) << fixed << setprecision(1) 
+			     << score(query_index,target_index) ;
+		}
+		strm << endl;
+	}
+}
+
+#if 0
+void LocalSequenceAlignment::reset_matrix( size_t width, size_t height ) 
+{
+	size_t x,y ;
+
+	highest_scored_query_index = 0 ;
+	highest_scored_target_index = 0 ;
+
+	for (x=0; x<width; x++) 
+		score_matrix[x][0] = 0 ;
+	for (y=0; y<height; y++) 
+		score_matrix[0][y] = 0 ;
+
+}
+
+void LocalSequenceAlignment::populate_matrix ( )
+{
+	size_t query_index ;
+	size_t target_index ;
+
+	ssize_t highest_score = 0 ;
+
+	for ( query_index=1; query_index<matrix_width(); query_index++ ) {
+		for ( target_index=1 ; target_index<matrix_height(); target_index++ ) {
+			ssize_t score = alignment_score(query_index, target_index);
+
+			//printf("score(q=%zu,t=%zu)=%zu\n", query_index, target_index, score ) ;
+			score_matrix[query_index][target_index] = (score>0) ? score : 0 ;
+
+			//NOTE
+			// not sure ">=" is strictly correct SW (might be just ">")
+			if ( score > highest_score ) {
+				highest_scored_query_index = query_index ;
+				highest_scored_target_index = target_index ;
+				highest_score = score ;
+			}
+		}
+
+	}
+}
+
+void LocalSequenceAlignment::find_optimal_alignment ( ) 
+{
+	size_t query_index = highest_scored_query_index ;
+	size_t target_index = highest_scored_target_index;
+
+	_alignment_results.query_end = query_index-1 ;
+	_alignment_results.target_end= target_index-1 ;
+
+	_alignment_results.score = score_matrix[query_index][target_index];
+
+	_alignment_results.matches = 0 ;
+	_alignment_results.mismatches = 0 ;
+
+	while ( query_index > 0 || target_index > 0 ) {
+		if ( score_matrix[query_index][target_index]==0)
+			break ;
+
+		//go "left" in the matrix
+		if ( query_index>0 &&
+		     score_matrix[query_index][target_index] == score_matrix[query_index-1][target_index] + gap_panelty() ) {
+
+			_alignment_results.target_alignment += "-" ;
+			_alignment_results.query_alignment += query_sequence()[query_index-1] ;
+			query_index--;
+		}
+		else
+		//go "up-left" in the matrix
+		if ( query_index>0 && target_index>0 &&
+		     score_matrix[query_index][target_index] == 
+		     	score_matrix[query_index-1][target_index-1] + match_score(query_index, target_index) ) {
+
+			_alignment_results.target_alignment += target_sequence()[target_index-1];
+			_alignment_results.query_alignment += query_sequence()[query_index-1] ;
+
+			(query_sequence()[query_index-1] == target_sequence()[target_index-1]) ?
+				(++_alignment_results.matches) : (++_alignment_results.mismatches) ;
+
+			query_index--;
+			target_index--;
+		}
+		else 
+		//go "up" in the matrix
+		{
+			_alignment_results.target_alignment += target_sequence()[target_index-1];
+			_alignment_results.query_alignment += "-" ;
+			target_index--;
+		}
+	}
+
+	_alignment_results.query_start = query_index ;
+	_alignment_results.target_start= target_index ;
+
+	_alignment_results.query_size = query_sequence().length();
+	_alignment_results.target_size= target_sequence().length();
+
+	std::reverse(_alignment_results.target_alignment.begin(), _alignment_results.target_alignment.end());
+	std::reverse(_alignment_results.query_alignment.begin(), _alignment_results.query_alignment.end());
+}
+#endif
+
+void HalfLocalSequenceAlignment::set_sequences(const std::string& _query, const std::string& _target)
+{
+	//_query_sequence  = _query + std::string( _target.length(), 'N' ); 
+	//_target_sequence = std::string( _query.length(), 'N' ) + _target;
+	_query_sequence = _query ;
+	_target_sequence = _target ;
+}
+
+
+void HalfLocalSequenceAlignment::reset_matrix( size_t width, size_t height ) 
+{
+	size_t x,y ;
+
+	highest_scored_query_index = 0 ;
+	highest_scored_target_index = 0 ;
+
+	for (x=0; x<width; x++) {
+		query_border[x] = 
+			//gap_panelty() * (ssize_t)x ;
+			//((query_sequence()[x-1]=='N') ? neutral_panelty() : gap_panelty()) * (ssize_t)x ;
+			//((query_sequence()[x-1]=='N') ? 0 : gap_panelty()) * (ssize_t)x ;
+			0 ;
+	}
+
+	for (y=0; y<height; y++) {
+		target_border[y] = 
+			( y <= 3 ) ? 0 : (gap_panelty() * (ssize_t)(y-3));
+			//0;
+			//((target_sequence()[y-1]=='N') ? 0 : gap_panelty()) * (ssize_t)y ;
+			//((target_sequence()[y-1]=='N') ? neutral_panelty() : gap_panelty()) * (ssize_t)y ;
+	}
+			
+}
+
+void HalfLocalSequenceAlignment::populate_matrix ( )
+{
+	size_t query_index ;
+	size_t target_index ;
+	DIRECTION origin = FROM_LEFT;
+
+	score_type highest_score = -1000000 ;
+	highest_scored_query_index = -1 ;
+	highest_scored_target_index = -1 ;
+
+	for ( query_index=0; query_index<matrix_width(); query_index++ ) {
+		for ( target_index=0 ; target_index<matrix_height(); target_index++ ) {
+
+			//Note:
+			// 'safe_score()' can accept negative value of -1 (and will return the border value)
+			score_type up_score     = safe_score(query_index,  ((ssize_t)target_index)-1) + gap_panelty() ;
+			score_type left_score   = safe_score(((ssize_t)query_index)-1,target_index )  + gap_panelty() ; 
+			score_type upleft_score = safe_score(((ssize_t)query_index)-1,((ssize_t)target_index)-1) + 
+						nucleotide_match_score(query_index, target_index);
+
+			//On the diagonal line, best score can not come from upper cell
+			//only from left or upper-left cells
+			if ( target_index>3 && target_index-3 > query_index ) {
+				left_score = -100000 ;
+			}
+
+			//printf("query_index=%d, target_index=%d,  upscore=%f, left_score=%f, upleft_score=%f\n",
+			//		query_index, target_index, up_score,left_score,upleft_score );
+
+			score_type score = -100000000 ;
+
+			if ( upleft_score > score ) {
+				score = upleft_score ;
+				origin = FROM_UPPER_LEFT;
+			}
+			if ( up_score > score ) {
+				score = up_score ;
+				origin = FROM_UPPER ;
+			}
+			if ( left_score > score ) {
+				score = left_score ;
+				origin = FROM_LEFT ;
+			}
+			//printf("query_index=%d, target_index=%d,  score=%f origin=%d\n",
+			//		query_index, target_index, score, origin );
+			
+			/*if (score<0) {
+				score = 0 ;
+				origin = FROM_NOWHERE ;
+			}*/
+			
+			score_matrix[query_index][target_index] = score ;
+			origin_matrix[query_index][target_index] = origin ;
+
+			//NOTE
+			// not sure ">=" is strictly correct SW (might be just ">")
+			if ( score > highest_score ) {
+				highest_scored_query_index = query_index ;
+				highest_scored_target_index = target_index ;
+				highest_score = score ;
+			}
+		}
+	}
+}
+
+bool HalfLocalSequenceAlignment::starting_point_close_to_end_of_sequences(const size_t query_index, const size_t target_index) const
+{
+	if ( (size_t)query_index  >= query_sequence().length() - 2  ||
+	     (size_t)target_index >= target_sequence().length() - 2 ) {
+		/* We've reach either the end of the Adapter
+		 * (and the adapter is shorter than the query)
+		 * Or the end of the query 
+		 * (and the adapter covers up to the end of the query, and then continues on)
+		 *
+		 * So we can safely start the alignment from this point
+		 */
+		return true;
+	}
+	else {
+		/* The adapter is not covering the query until the end.
+		 */
+		return false;
+	}
+}
+
+#undef DEBUG_STARTING_POINT
+void HalfLocalSequenceAlignment::find_alignment_starting_point(ssize_t &new_query_index, ssize_t &new_target_index) const
+{
+	 /*
+	 * Force the alignment to start from the end of the query,
+	 * find the best score at the end of the query
+	 *
+	 * Try (desperately) to find a match that starts at the end of the query or the end of the target/adapter)
+	 */
+	score_type max_score = score( matrix_width()-1, matrix_height()-1 ) ;
+	for ( size_t q_index = 0 ; q_index < matrix_width(); q_index++ ) {
+		for ( size_t t_index = matrix_height()-2 ; t_index < matrix_height(); t_index++ ) {
+			if ( origin ( q_index, t_index ) > 0 && 
+				safe_score ( q_index, t_index ) > max_score ) {
+				max_score = safe_score ( q_index, t_index ) ;
+				#ifdef DEBUG_STARTING_POINT
+				printf("Found new max score = %f at %d,%d\n", max_score, q_index, t_index ) ;
+				#endif
+				new_target_index = t_index ;
+				new_query_index = q_index ;
+			}
+		}
+	}
+	for ( size_t q_index = matrix_width()-2 ; q_index < matrix_width(); q_index++ ) {
+		for ( size_t t_index = 0 ; t_index < matrix_height(); t_index++ ) {
+			if ( origin ( q_index, t_index ) > 0 && 
+				safe_score ( q_index, t_index ) > max_score ) {
+				max_score = safe_score ( q_index, t_index ) ;
+				#ifdef DEBUG_STARTING_POINT
+				printf("Found new max score = %f at %d,%d\n", max_score, q_index, t_index ) ;
+				#endif
+				new_target_index = t_index ;
+				new_query_index = q_index ;
+			}
+		}
+	}
+	#ifdef DEBUG_STARTING_POINT
+	printf("Forcing alignment from query_index=%d, target_index=%d, score=%f, origin=%d\n",
+		new_query_index, new_target_index,
+		score ( new_query_index, new_target_index ),
+		origin ( new_query_index, new_target_index ) );
+	#endif
+}
+
+
+#undef DEBUG_FIND_OPTIMAL_ALIGNMENT
+SequenceAlignmentResults HalfLocalSequenceAlignment::find_optimal_alignment_from_point ( const size_t query_start, const size_t target_start ) const
+{
+	SequenceAlignmentResults results;
+
+	results.query_sequence = query_sequence();
+	results.target_sequence= target_sequence();
+
+	ssize_t query_index = query_start;
+	ssize_t target_index = target_start ;
+
+	results.query_end = query_index ;
+	results.target_end= target_index ;
+
+	#ifdef DEBUG_FIND_OPTIMAL_ALIGNMENT
+	printf ( "backtrace starting from (qindex=%d, tindex=%d, score=%f)\n",
+			query_index, target_index, score_matrix[query_index][target_index]) ;
+	#endif
+	
+	while ( query_index >= 0 && target_index >= 0 ) {
+		
+		const char q_nuc = query_nucleotide(query_index);
+		const char t_nuc = target_nucleotide(target_index);
+
+		const DIRECTION current_origin = origin(query_index, target_index);
+		const char current_match = match ( query_index, target_index ) ;
+
+
+		#ifdef DEBUG_FIND_OPTIMAL_ALIGNMENT
+		const score_type current_score = score(query_index, target_index);
+		printf("query_index=%d   target_index=%d  query=%c target=%c score_matrix=%3.1f origin=%d  accumulated_score = %3.2f\n",
+			query_index, target_index, 
+			q_nuc, t_nuc,
+			current_score, 
+			current_origin,
+			results.score) ;
+		#endif
+	
+		results.query_start = query_index ;
+		results.target_start= target_index ;
+
+		switch ( current_origin )
+		{
+		case FROM_LEFT:
+			results.target_alignment += "-" ;
+			results.query_alignment += q_nuc ;
+			results.gaps++;
+			results.score += gap_panelty();
+
+			query_index--;
+			break ;
+
+		case FROM_UPPER_LEFT:
+			results.target_alignment += t_nuc;
+			results.query_alignment += q_nuc ;
+
+			switch ( current_match ) 
+			{
+			case 'N':
+				results.neutral_matches++ ;
+				results.score += neutral_panelty();
+				break ;
+
+			case 'M':
+				results.matches++;
+				results.score += match_panelty();
+				break;
+
+			case 'x':
+				results.mismatches++;
+				results.score += mismatch_panelty();
+				break ;
+
+			default:
+				errx(1,"Internal error: unknown match type (%c) at query_index=%zu, target_index=%zu\n",
+					current_match, query_index, target_index ) ;
+			}
+
+			query_index--;
+			target_index--;
+			break ;
+
+		case FROM_UPPER:
+			results.target_alignment += t_nuc ;
+			results.query_alignment += "-" ;
+			results.gaps++;
+			results.score += gap_panelty();
+
+			target_index--;
+			break;
+
+		case FROM_NOWHERE:
+		default:
+			print_matrix();
+			printf("Invalid origin (%d) at query_index=%zu, target_index=%zu\n", 
+					current_origin, query_index, target_index ) ;
+			printf("Query = %s\n", query_sequence().c_str());
+			printf("Target= %s\n", target_sequence().c_str());
+			exit(1);
+		}
+	}
+
+	results.query_size = query_sequence().length();
+	results.target_size= target_sequence().length();
+
+	std::reverse(results.target_alignment.begin(),results.target_alignment.end());
+	std::reverse(results.query_alignment.begin(), results.query_alignment.end());
+
+	return results;
+}
+
+void HalfLocalSequenceAlignment::find_optimal_alignment ( ) 
+{
+	SequenceAlignmentResults results ;
+	
+
+	//Try to find a good alignment, 
+	//starting from the highest score cell.
+	results = find_optimal_alignment_from_point ( highest_scored_query_index,
+						      highest_scored_target_index ) ;
+
+	//Some heuristics:
+	//If the adapter matched 7 nucleotides anywhere in the query
+	//without mismatches/gaps, accept it.
+	if ( results.matches >= 7 
+	     && 
+	     results.mismatches == 0
+	     &&
+	     results.gaps == 0 ) {
+	     	
+		_alignment_results = results ;
+		return ;
+	}
+
+	if ( starting_point_close_to_end_of_sequences ( highest_scored_query_index,
+						        highest_scored_target_index ) ) {
+		//We're already very close to the end of the target or query,
+		//can't improve much else, so return what we've got.
+		_alignment_results = results ;
+		return ;
+	}
+
+
+	//More heuristics:
+	/* The adapter is not covering the query until the end.
+	 * Force the alignment to start from the end of the query,
+	 * find the best score at the end of the query
+	 *
+	 * Try (desperately) to find a match that starts at the end of the query or the end of the target/adapter)
+	 */
+	ssize_t query_index = highest_scored_query_index ;
+	ssize_t target_index = highest_scored_target_index;
+	find_alignment_starting_point ( query_index, target_index ) ;
+
+	_alignment_results = results ;
+}
+
+void HalfLocalSequenceAlignment::post_process()
+{
+#if 0
+	//Removes the Ns which were added in 'set_sequences'
+	//And adjust the results values accordingly
+
+	//return ;
+	//_query_sequence.erase ( _query_sequence.find_last_not_of('N') + 1) ;
+	//_target_sequence.erase ( 0, _target_sequence.find_first_not_of('N') ) ;
+
+	_alignment_results.query_sequence.erase ( _query_sequence.find_last_not_of('N') + 1) ;
+	_alignment_results.target_sequence.erase ( 0, _target_sequence.find_first_not_of('N') ) ;
+	_alignment_results.query_size = _alignment_results.query_sequence.length();
+	_alignment_results.target_size = _alignment_results.target_sequence.length();
+
+
+	size_t query_n_position = _alignment_results.query_alignment.find_last_not_of('N') ;
+	int query_n_count;
+
+	if ( query_n_position != string::npos )
+		query_n_count = _alignment_results.query_alignment.length() - query_n_position ;
+	else
+		query_n_count = 0 ;
+
+	int target_n_count = _alignment_results.target_alignment.find_first_not_of('N') ;
+
+	if (query_n_position != string::npos )
+		_alignment_results.query_alignment.erase( query_n_position ) ;
+	_alignment_results.target_alignment.erase( 0,target_n_count ) ;
+
+	//Update Results strucure
+	_alignment_results.query_start+= target_n_count ;
+	_alignment_results.query_end  -= query_n_count ;
+
+	_alignment_results.target_start = 0;
+	_alignment_results.target_end  =  _alignment_results.query_end - _alignment_results.query_start ;
+
+	_alignment_results.query_alignment.erase ( 0, _alignment_results.query_start ) ;
+	_alignment_results.target_alignment.erase ( _alignment_results.target_end+1 ) ;
+
+	//Update match/mismatch/gap counts
+	_alignment_results.matches = 0 ;
+	_alignment_results.mismatches = 0 ;
+	_alignment_results.gaps = 0 ;
+
+	for (size_t index=0; index<_alignment_results.query_alignment.length(); index++) {
+		char q = _alignment_results.query_alignment[index];
+		char t = _alignment_results.target_alignment[index];
+
+		if ( q == '-' || t=='-' ) {
+			_alignment_results.gaps ++ ;
+		} else {
+			if ( q== t )
+				_alignment_results.matches++;
+			else
+				_alignment_results.mismatches++;
+		}
+	}
+#endif 
+}
+