Mercurial > repos > xilinxu > xilinxu
diff fastx_toolkit-0.0.6/src/libfastx/sequence_alignment.cpp @ 3:997f5136985f draft default tip
Uploaded
author | xilinxu |
---|---|
date | Thu, 14 Aug 2014 04:52:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastx_toolkit-0.0.6/src/libfastx/sequence_alignment.cpp Thu Aug 14 04:52:17 2014 -0400 @@ -0,0 +1,710 @@ +#include <string> +#include <vector> +#include <ostream> +#include <iostream> +#include <algorithm> +#include <iomanip> +#include <err.h> + +#include "sequence_alignment.h" + +using namespace std; + +void SequenceAlignmentResults::print(std::ostream& strm) const +{ + size_t delta; + size_t index; + + strm << "Query-Alingment = " << query_alignment << endl ; + strm << "target-Alingment= " << target_alignment << endl ; + + + strm << (alignment_found ? "Alignment Found" : "Alignment NOT found") << endl; + strm << "Score = " << score << " (" + << matches << " matches, " + << neutral_matches << " neutral-matches, " + << mismatches << " mismatches, " + << gaps << " gaps) " + << std::endl ; + + strm << "Query = " << query_sequence + << "(qsize " << query_size + << " qstart " << query_start + << " qend " << query_end + << std::endl ; + + strm << "Target= " << target_sequence + << "(tsize " << target_size + << " tstart " << target_start + << " tend " << target_end + << std::endl ; + + strm << endl; + + delta = max(target_start, query_start); + + + //Spaces before the query string + if ( delta - query_start > 0 ) + strm << std::string( delta - query_start-1, ' ') ; + //Un-Aligned query part (prefix) + if ( query_start > 0 ) + strm << query_sequence.substr(0, query_start-1) ; + //Aligned query part + strm << "(" << query_alignment << ")"; + //Un-Aligned query part (suffix) + if ( query_end < query_sequence.length() ) + strm << query_sequence.substr( query_end+1 ) ; + strm << std::endl ; + + //Alignment bars + if ( delta > 0 ) + strm << std::string( delta-1, ' ') ; + strm << "(" ; + for (index=0; index<query_alignment.length(); index++) { + strm << ((query_alignment[index]==target_alignment[index]) ? '*' : '|' ); + } + strm << ")" ; + strm << std::endl; + + //Spaces before the target string + if ( delta - target_start > 0 ) + strm << std::string( delta - target_start, ' ') ; + //Un-Aligned target part (prefix) + if ( target_start > 0 ) + strm << target_sequence.substr(0, target_start-1); + //Aligned target part + strm << "(" << target_alignment << ")"; + + //Un-Aligned target part (suffix) + if ( target_end < target_sequence.length() ) + strm << target_sequence.substr( target_end+1 ); + strm << std::endl; + +} + +SequenceAlignment::SequenceAlignment ( ) : + _gap_panelty(-5), + _match_panelty(1), + _mismatch_panelty(-1), + _neutral_panelty(0.1) + +{ +} + + +void SequenceAlignment::set_sequences(const std::string& _query, const std::string& _target) +{ + _query_sequence = _query ; + _target_sequence = _target ; +} + +void SequenceAlignment::reset_alignment_results() +{ + _alignment_results = SequenceAlignmentResults() ; + // + //Reset the results + _alignment_results.query_sequence = query_sequence() ; + _alignment_results.target_sequence = target_sequence() ; +} + +const SequenceAlignmentResults& SequenceAlignment::align ( const std::string& query, const std::string& target ) +{ + set_sequences ( query, target ) ; + + reset_alignment_results(); + + resize_matrix ( query_sequence().length(), target_sequence().length() ) ; + populate_match_matrix(); + + reset_matrix( matrix_width(), matrix_height() ); + populate_matrix(); + find_optimal_alignment(); + + post_process(); + + return _alignment_results; +} + +void SequenceAlignment::resize_matrix(size_t width, size_t height) +{ + size_t i; + + if ( matrix_width() >= width && matrix_height() >= height ) + return ; + + query_border.resize ( width ) ; + target_border.resize ( height ) ; + + score_matrix.resize ( width ); + for (i=0;i<width;i++) + score_matrix[i].resize(height) ; + + origin_matrix.resize ( width ); + for (i=0;i<width;i++) + origin_matrix[i].resize(height) ; + + match_matrix.resize ( width ); + for (i=0;i<width;i++) { + match_matrix[i].resize(height) ; + } +} + +void SequenceAlignment::populate_match_matrix() +{ + for (size_t x=0; x<matrix_width(); x++) + for(size_t y=0;y<matrix_height();y++) + match_matrix[x][y] = + match_value ( query_nucleotide(x), target_nucleotide(y) ) ; +} + + +void SequenceAlignment::post_process() +{ + +} + +void SequenceAlignment::print_matrix(std::ostream &strm) const +{ + size_t query_index ; + size_t target_index ; + + #if 0 + printf("Match-Matrix:\n"); + printf(" - "); + for ( target_index=1; target_index<matrix_height(); target_index++ ) + printf(" %c ", target_sequence()[target_index-1] ); + printf("\n"); + + for ( query_index=1; query_index<matrix_width(); query_index++ ) { + + printf(" %c ", query_sequence()[query_index-1]) ; + + for ( target_index=1 ; target_index<matrix_height(); target_index++ ) { + printf(" %c ", match_matrix[query_index][target_index]); + } + printf("\n"); + } + #endif + + strm << "Score-Matrix:" << endl ; + + //Print Target nucleotides + strm << setw(2) << left << "-" << setw(7) << "-" ; + for ( query_index=0; query_index<matrix_width(); query_index++ ) + strm << setw(9) << left << query_nucleotide ( query_index ) ; + strm << endl; + strm << setw(2) << left << "-" << setw(7) << "-" ; + for ( query_index=0; query_index<matrix_width(); query_index++ ) + strm << setw(9) << left << query_border[query_index] ; + strm << endl; + + for ( target_index=0; target_index<matrix_height(); target_index++ ) { + + strm << setw(2) << left << target_nucleotide ( target_index ) ; + strm << setw(6) << right << target_border[target_index] << setw(1) << " "; + + for ( query_index=0 ; query_index<matrix_width(); query_index++ ) { + char ch ; + switch (origin ( query_index, target_index ) ) + { + case FROM_UPPER: ch = '|' ; break ; + case FROM_LEFT: ch = '-' ; break ; + case FROM_UPPER_LEFT: ch = '\\' ; break ; + case FROM_NOWHERE: ch = '=' ; break ; + default: ch = '*' ; break ; + } + + strm << left ; + strm << setw(1) << match(query_index,target_index); + strm << setw(1) << ch ; + strm << setw(7) << fixed << setprecision(1) + << score(query_index,target_index) ; + } + strm << endl; + } +} + +#if 0 +void LocalSequenceAlignment::reset_matrix( size_t width, size_t height ) +{ + size_t x,y ; + + highest_scored_query_index = 0 ; + highest_scored_target_index = 0 ; + + for (x=0; x<width; x++) + score_matrix[x][0] = 0 ; + for (y=0; y<height; y++) + score_matrix[0][y] = 0 ; + +} + +void LocalSequenceAlignment::populate_matrix ( ) +{ + size_t query_index ; + size_t target_index ; + + ssize_t highest_score = 0 ; + + for ( query_index=1; query_index<matrix_width(); query_index++ ) { + for ( target_index=1 ; target_index<matrix_height(); target_index++ ) { + ssize_t score = alignment_score(query_index, target_index); + + //printf("score(q=%zu,t=%zu)=%zu\n", query_index, target_index, score ) ; + score_matrix[query_index][target_index] = (score>0) ? score : 0 ; + + //NOTE + // not sure ">=" is strictly correct SW (might be just ">") + if ( score > highest_score ) { + highest_scored_query_index = query_index ; + highest_scored_target_index = target_index ; + highest_score = score ; + } + } + + } +} + +void LocalSequenceAlignment::find_optimal_alignment ( ) +{ + size_t query_index = highest_scored_query_index ; + size_t target_index = highest_scored_target_index; + + _alignment_results.query_end = query_index-1 ; + _alignment_results.target_end= target_index-1 ; + + _alignment_results.score = score_matrix[query_index][target_index]; + + _alignment_results.matches = 0 ; + _alignment_results.mismatches = 0 ; + + while ( query_index > 0 || target_index > 0 ) { + if ( score_matrix[query_index][target_index]==0) + break ; + + //go "left" in the matrix + if ( query_index>0 && + score_matrix[query_index][target_index] == score_matrix[query_index-1][target_index] + gap_panelty() ) { + + _alignment_results.target_alignment += "-" ; + _alignment_results.query_alignment += query_sequence()[query_index-1] ; + query_index--; + } + else + //go "up-left" in the matrix + if ( query_index>0 && target_index>0 && + score_matrix[query_index][target_index] == + score_matrix[query_index-1][target_index-1] + match_score(query_index, target_index) ) { + + _alignment_results.target_alignment += target_sequence()[target_index-1]; + _alignment_results.query_alignment += query_sequence()[query_index-1] ; + + (query_sequence()[query_index-1] == target_sequence()[target_index-1]) ? + (++_alignment_results.matches) : (++_alignment_results.mismatches) ; + + query_index--; + target_index--; + } + else + //go "up" in the matrix + { + _alignment_results.target_alignment += target_sequence()[target_index-1]; + _alignment_results.query_alignment += "-" ; + target_index--; + } + } + + _alignment_results.query_start = query_index ; + _alignment_results.target_start= target_index ; + + _alignment_results.query_size = query_sequence().length(); + _alignment_results.target_size= target_sequence().length(); + + std::reverse(_alignment_results.target_alignment.begin(), _alignment_results.target_alignment.end()); + std::reverse(_alignment_results.query_alignment.begin(), _alignment_results.query_alignment.end()); +} +#endif + +void HalfLocalSequenceAlignment::set_sequences(const std::string& _query, const std::string& _target) +{ + //_query_sequence = _query + std::string( _target.length(), 'N' ); + //_target_sequence = std::string( _query.length(), 'N' ) + _target; + _query_sequence = _query ; + _target_sequence = _target ; +} + + +void HalfLocalSequenceAlignment::reset_matrix( size_t width, size_t height ) +{ + size_t x,y ; + + highest_scored_query_index = 0 ; + highest_scored_target_index = 0 ; + + for (x=0; x<width; x++) { + query_border[x] = + //gap_panelty() * (ssize_t)x ; + //((query_sequence()[x-1]=='N') ? neutral_panelty() : gap_panelty()) * (ssize_t)x ; + //((query_sequence()[x-1]=='N') ? 0 : gap_panelty()) * (ssize_t)x ; + 0 ; + } + + for (y=0; y<height; y++) { + target_border[y] = + ( y <= 3 ) ? 0 : (gap_panelty() * (ssize_t)(y-3)); + //0; + //((target_sequence()[y-1]=='N') ? 0 : gap_panelty()) * (ssize_t)y ; + //((target_sequence()[y-1]=='N') ? neutral_panelty() : gap_panelty()) * (ssize_t)y ; + } + +} + +void HalfLocalSequenceAlignment::populate_matrix ( ) +{ + size_t query_index ; + size_t target_index ; + DIRECTION origin = FROM_LEFT; + + score_type highest_score = -1000000 ; + highest_scored_query_index = -1 ; + highest_scored_target_index = -1 ; + + for ( query_index=0; query_index<matrix_width(); query_index++ ) { + for ( target_index=0 ; target_index<matrix_height(); target_index++ ) { + + //Note: + // 'safe_score()' can accept negative value of -1 (and will return the border value) + score_type up_score = safe_score(query_index, ((ssize_t)target_index)-1) + gap_panelty() ; + score_type left_score = safe_score(((ssize_t)query_index)-1,target_index ) + gap_panelty() ; + score_type upleft_score = safe_score(((ssize_t)query_index)-1,((ssize_t)target_index)-1) + + nucleotide_match_score(query_index, target_index); + + //On the diagonal line, best score can not come from upper cell + //only from left or upper-left cells + if ( target_index>3 && target_index-3 > query_index ) { + left_score = -100000 ; + } + + //printf("query_index=%d, target_index=%d, upscore=%f, left_score=%f, upleft_score=%f\n", + // query_index, target_index, up_score,left_score,upleft_score ); + + score_type score = -100000000 ; + + if ( upleft_score > score ) { + score = upleft_score ; + origin = FROM_UPPER_LEFT; + } + if ( up_score > score ) { + score = up_score ; + origin = FROM_UPPER ; + } + if ( left_score > score ) { + score = left_score ; + origin = FROM_LEFT ; + } + //printf("query_index=%d, target_index=%d, score=%f origin=%d\n", + // query_index, target_index, score, origin ); + + /*if (score<0) { + score = 0 ; + origin = FROM_NOWHERE ; + }*/ + + score_matrix[query_index][target_index] = score ; + origin_matrix[query_index][target_index] = origin ; + + //NOTE + // not sure ">=" is strictly correct SW (might be just ">") + if ( score > highest_score ) { + highest_scored_query_index = query_index ; + highest_scored_target_index = target_index ; + highest_score = score ; + } + } + } +} + +bool HalfLocalSequenceAlignment::starting_point_close_to_end_of_sequences(const size_t query_index, const size_t target_index) const +{ + if ( (size_t)query_index >= query_sequence().length() - 2 || + (size_t)target_index >= target_sequence().length() - 2 ) { + /* We've reach either the end of the Adapter + * (and the adapter is shorter than the query) + * Or the end of the query + * (and the adapter covers up to the end of the query, and then continues on) + * + * So we can safely start the alignment from this point + */ + return true; + } + else { + /* The adapter is not covering the query until the end. + */ + return false; + } +} + +#undef DEBUG_STARTING_POINT +void HalfLocalSequenceAlignment::find_alignment_starting_point(ssize_t &new_query_index, ssize_t &new_target_index) const +{ + /* + * Force the alignment to start from the end of the query, + * find the best score at the end of the query + * + * Try (desperately) to find a match that starts at the end of the query or the end of the target/adapter) + */ + score_type max_score = score( matrix_width()-1, matrix_height()-1 ) ; + for ( size_t q_index = 0 ; q_index < matrix_width(); q_index++ ) { + for ( size_t t_index = matrix_height()-2 ; t_index < matrix_height(); t_index++ ) { + if ( origin ( q_index, t_index ) > 0 && + safe_score ( q_index, t_index ) > max_score ) { + max_score = safe_score ( q_index, t_index ) ; + #ifdef DEBUG_STARTING_POINT + printf("Found new max score = %f at %d,%d\n", max_score, q_index, t_index ) ; + #endif + new_target_index = t_index ; + new_query_index = q_index ; + } + } + } + for ( size_t q_index = matrix_width()-2 ; q_index < matrix_width(); q_index++ ) { + for ( size_t t_index = 0 ; t_index < matrix_height(); t_index++ ) { + if ( origin ( q_index, t_index ) > 0 && + safe_score ( q_index, t_index ) > max_score ) { + max_score = safe_score ( q_index, t_index ) ; + #ifdef DEBUG_STARTING_POINT + printf("Found new max score = %f at %d,%d\n", max_score, q_index, t_index ) ; + #endif + new_target_index = t_index ; + new_query_index = q_index ; + } + } + } + #ifdef DEBUG_STARTING_POINT + printf("Forcing alignment from query_index=%d, target_index=%d, score=%f, origin=%d\n", + new_query_index, new_target_index, + score ( new_query_index, new_target_index ), + origin ( new_query_index, new_target_index ) ); + #endif +} + + +#undef DEBUG_FIND_OPTIMAL_ALIGNMENT +SequenceAlignmentResults HalfLocalSequenceAlignment::find_optimal_alignment_from_point ( const size_t query_start, const size_t target_start ) const +{ + SequenceAlignmentResults results; + + results.query_sequence = query_sequence(); + results.target_sequence= target_sequence(); + + ssize_t query_index = query_start; + ssize_t target_index = target_start ; + + results.query_end = query_index ; + results.target_end= target_index ; + + #ifdef DEBUG_FIND_OPTIMAL_ALIGNMENT + printf ( "backtrace starting from (qindex=%d, tindex=%d, score=%f)\n", + query_index, target_index, score_matrix[query_index][target_index]) ; + #endif + + while ( query_index >= 0 && target_index >= 0 ) { + + const char q_nuc = query_nucleotide(query_index); + const char t_nuc = target_nucleotide(target_index); + + const DIRECTION current_origin = origin(query_index, target_index); + const char current_match = match ( query_index, target_index ) ; + + + #ifdef DEBUG_FIND_OPTIMAL_ALIGNMENT + const score_type current_score = score(query_index, target_index); + printf("query_index=%d target_index=%d query=%c target=%c score_matrix=%3.1f origin=%d accumulated_score = %3.2f\n", + query_index, target_index, + q_nuc, t_nuc, + current_score, + current_origin, + results.score) ; + #endif + + results.query_start = query_index ; + results.target_start= target_index ; + + switch ( current_origin ) + { + case FROM_LEFT: + results.target_alignment += "-" ; + results.query_alignment += q_nuc ; + results.gaps++; + results.score += gap_panelty(); + + query_index--; + break ; + + case FROM_UPPER_LEFT: + results.target_alignment += t_nuc; + results.query_alignment += q_nuc ; + + switch ( current_match ) + { + case 'N': + results.neutral_matches++ ; + results.score += neutral_panelty(); + break ; + + case 'M': + results.matches++; + results.score += match_panelty(); + break; + + case 'x': + results.mismatches++; + results.score += mismatch_panelty(); + break ; + + default: + errx(1,"Internal error: unknown match type (%c) at query_index=%zu, target_index=%zu\n", + current_match, query_index, target_index ) ; + } + + query_index--; + target_index--; + break ; + + case FROM_UPPER: + results.target_alignment += t_nuc ; + results.query_alignment += "-" ; + results.gaps++; + results.score += gap_panelty(); + + target_index--; + break; + + case FROM_NOWHERE: + default: + print_matrix(); + printf("Invalid origin (%d) at query_index=%zu, target_index=%zu\n", + current_origin, query_index, target_index ) ; + printf("Query = %s\n", query_sequence().c_str()); + printf("Target= %s\n", target_sequence().c_str()); + exit(1); + } + } + + results.query_size = query_sequence().length(); + results.target_size= target_sequence().length(); + + std::reverse(results.target_alignment.begin(),results.target_alignment.end()); + std::reverse(results.query_alignment.begin(), results.query_alignment.end()); + + return results; +} + +void HalfLocalSequenceAlignment::find_optimal_alignment ( ) +{ + SequenceAlignmentResults results ; + + + //Try to find a good alignment, + //starting from the highest score cell. + results = find_optimal_alignment_from_point ( highest_scored_query_index, + highest_scored_target_index ) ; + + //Some heuristics: + //If the adapter matched 7 nucleotides anywhere in the query + //without mismatches/gaps, accept it. + if ( results.matches >= 7 + && + results.mismatches == 0 + && + results.gaps == 0 ) { + + _alignment_results = results ; + return ; + } + + if ( starting_point_close_to_end_of_sequences ( highest_scored_query_index, + highest_scored_target_index ) ) { + //We're already very close to the end of the target or query, + //can't improve much else, so return what we've got. + _alignment_results = results ; + return ; + } + + + //More heuristics: + /* The adapter is not covering the query until the end. + * Force the alignment to start from the end of the query, + * find the best score at the end of the query + * + * Try (desperately) to find a match that starts at the end of the query or the end of the target/adapter) + */ + ssize_t query_index = highest_scored_query_index ; + ssize_t target_index = highest_scored_target_index; + find_alignment_starting_point ( query_index, target_index ) ; + + _alignment_results = results ; +} + +void HalfLocalSequenceAlignment::post_process() +{ +#if 0 + //Removes the Ns which were added in 'set_sequences' + //And adjust the results values accordingly + + //return ; + //_query_sequence.erase ( _query_sequence.find_last_not_of('N') + 1) ; + //_target_sequence.erase ( 0, _target_sequence.find_first_not_of('N') ) ; + + _alignment_results.query_sequence.erase ( _query_sequence.find_last_not_of('N') + 1) ; + _alignment_results.target_sequence.erase ( 0, _target_sequence.find_first_not_of('N') ) ; + _alignment_results.query_size = _alignment_results.query_sequence.length(); + _alignment_results.target_size = _alignment_results.target_sequence.length(); + + + size_t query_n_position = _alignment_results.query_alignment.find_last_not_of('N') ; + int query_n_count; + + if ( query_n_position != string::npos ) + query_n_count = _alignment_results.query_alignment.length() - query_n_position ; + else + query_n_count = 0 ; + + int target_n_count = _alignment_results.target_alignment.find_first_not_of('N') ; + + if (query_n_position != string::npos ) + _alignment_results.query_alignment.erase( query_n_position ) ; + _alignment_results.target_alignment.erase( 0,target_n_count ) ; + + //Update Results strucure + _alignment_results.query_start+= target_n_count ; + _alignment_results.query_end -= query_n_count ; + + _alignment_results.target_start = 0; + _alignment_results.target_end = _alignment_results.query_end - _alignment_results.query_start ; + + _alignment_results.query_alignment.erase ( 0, _alignment_results.query_start ) ; + _alignment_results.target_alignment.erase ( _alignment_results.target_end+1 ) ; + + //Update match/mismatch/gap counts + _alignment_results.matches = 0 ; + _alignment_results.mismatches = 0 ; + _alignment_results.gaps = 0 ; + + for (size_t index=0; index<_alignment_results.query_alignment.length(); index++) { + char q = _alignment_results.query_alignment[index]; + char t = _alignment_results.target_alignment[index]; + + if ( q == '-' || t=='-' ) { + _alignment_results.gaps ++ ; + } else { + if ( q== t ) + _alignment_results.matches++; + else + _alignment_results.mismatches++; + } + } +#endif +} +