comparison fastx_toolkit-0.0.6/src/libfastx/sequence_alignment.cpp @ 3:997f5136985f draft default tip

Uploaded
author xilinxu
date Thu, 14 Aug 2014 04:52:17 -0400
parents
children
comparison
equal deleted inserted replaced
2:dfe9332138cf 3:997f5136985f
1 #include <string>
2 #include <vector>
3 #include <ostream>
4 #include <iostream>
5 #include <algorithm>
6 #include <iomanip>
7 #include <err.h>
8
9 #include "sequence_alignment.h"
10
11 using namespace std;
12
13 void SequenceAlignmentResults::print(std::ostream& strm) const
14 {
15 size_t delta;
16 size_t index;
17
18 strm << "Query-Alingment = " << query_alignment << endl ;
19 strm << "target-Alingment= " << target_alignment << endl ;
20
21
22 strm << (alignment_found ? "Alignment Found" : "Alignment NOT found") << endl;
23 strm << "Score = " << score << " ("
24 << matches << " matches, "
25 << neutral_matches << " neutral-matches, "
26 << mismatches << " mismatches, "
27 << gaps << " gaps) "
28 << std::endl ;
29
30 strm << "Query = " << query_sequence
31 << "(qsize " << query_size
32 << " qstart " << query_start
33 << " qend " << query_end
34 << std::endl ;
35
36 strm << "Target= " << target_sequence
37 << "(tsize " << target_size
38 << " tstart " << target_start
39 << " tend " << target_end
40 << std::endl ;
41
42 strm << endl;
43
44 delta = max(target_start, query_start);
45
46
47 //Spaces before the query string
48 if ( delta - query_start > 0 )
49 strm << std::string( delta - query_start-1, ' ') ;
50 //Un-Aligned query part (prefix)
51 if ( query_start > 0 )
52 strm << query_sequence.substr(0, query_start-1) ;
53 //Aligned query part
54 strm << "(" << query_alignment << ")";
55 //Un-Aligned query part (suffix)
56 if ( query_end < query_sequence.length() )
57 strm << query_sequence.substr( query_end+1 ) ;
58 strm << std::endl ;
59
60 //Alignment bars
61 if ( delta > 0 )
62 strm << std::string( delta-1, ' ') ;
63 strm << "(" ;
64 for (index=0; index<query_alignment.length(); index++) {
65 strm << ((query_alignment[index]==target_alignment[index]) ? '*' : '|' );
66 }
67 strm << ")" ;
68 strm << std::endl;
69
70 //Spaces before the target string
71 if ( delta - target_start > 0 )
72 strm << std::string( delta - target_start, ' ') ;
73 //Un-Aligned target part (prefix)
74 if ( target_start > 0 )
75 strm << target_sequence.substr(0, target_start-1);
76 //Aligned target part
77 strm << "(" << target_alignment << ")";
78
79 //Un-Aligned target part (suffix)
80 if ( target_end < target_sequence.length() )
81 strm << target_sequence.substr( target_end+1 );
82 strm << std::endl;
83
84 }
85
86 SequenceAlignment::SequenceAlignment ( ) :
87 _gap_panelty(-5),
88 _match_panelty(1),
89 _mismatch_panelty(-1),
90 _neutral_panelty(0.1)
91
92 {
93 }
94
95
96 void SequenceAlignment::set_sequences(const std::string& _query, const std::string& _target)
97 {
98 _query_sequence = _query ;
99 _target_sequence = _target ;
100 }
101
102 void SequenceAlignment::reset_alignment_results()
103 {
104 _alignment_results = SequenceAlignmentResults() ;
105 //
106 //Reset the results
107 _alignment_results.query_sequence = query_sequence() ;
108 _alignment_results.target_sequence = target_sequence() ;
109 }
110
111 const SequenceAlignmentResults& SequenceAlignment::align ( const std::string& query, const std::string& target )
112 {
113 set_sequences ( query, target ) ;
114
115 reset_alignment_results();
116
117 resize_matrix ( query_sequence().length(), target_sequence().length() ) ;
118 populate_match_matrix();
119
120 reset_matrix( matrix_width(), matrix_height() );
121 populate_matrix();
122 find_optimal_alignment();
123
124 post_process();
125
126 return _alignment_results;
127 }
128
129 void SequenceAlignment::resize_matrix(size_t width, size_t height)
130 {
131 size_t i;
132
133 if ( matrix_width() >= width && matrix_height() >= height )
134 return ;
135
136 query_border.resize ( width ) ;
137 target_border.resize ( height ) ;
138
139 score_matrix.resize ( width );
140 for (i=0;i<width;i++)
141 score_matrix[i].resize(height) ;
142
143 origin_matrix.resize ( width );
144 for (i=0;i<width;i++)
145 origin_matrix[i].resize(height) ;
146
147 match_matrix.resize ( width );
148 for (i=0;i<width;i++) {
149 match_matrix[i].resize(height) ;
150 }
151 }
152
153 void SequenceAlignment::populate_match_matrix()
154 {
155 for (size_t x=0; x<matrix_width(); x++)
156 for(size_t y=0;y<matrix_height();y++)
157 match_matrix[x][y] =
158 match_value ( query_nucleotide(x), target_nucleotide(y) ) ;
159 }
160
161
162 void SequenceAlignment::post_process()
163 {
164
165 }
166
167 void SequenceAlignment::print_matrix(std::ostream &strm) const
168 {
169 size_t query_index ;
170 size_t target_index ;
171
172 #if 0
173 printf("Match-Matrix:\n");
174 printf(" - ");
175 for ( target_index=1; target_index<matrix_height(); target_index++ )
176 printf(" %c ", target_sequence()[target_index-1] );
177 printf("\n");
178
179 for ( query_index=1; query_index<matrix_width(); query_index++ ) {
180
181 printf(" %c ", query_sequence()[query_index-1]) ;
182
183 for ( target_index=1 ; target_index<matrix_height(); target_index++ ) {
184 printf(" %c ", match_matrix[query_index][target_index]);
185 }
186 printf("\n");
187 }
188 #endif
189
190 strm << "Score-Matrix:" << endl ;
191
192 //Print Target nucleotides
193 strm << setw(2) << left << "-" << setw(7) << "-" ;
194 for ( query_index=0; query_index<matrix_width(); query_index++ )
195 strm << setw(9) << left << query_nucleotide ( query_index ) ;
196 strm << endl;
197 strm << setw(2) << left << "-" << setw(7) << "-" ;
198 for ( query_index=0; query_index<matrix_width(); query_index++ )
199 strm << setw(9) << left << query_border[query_index] ;
200 strm << endl;
201
202 for ( target_index=0; target_index<matrix_height(); target_index++ ) {
203
204 strm << setw(2) << left << target_nucleotide ( target_index ) ;
205 strm << setw(6) << right << target_border[target_index] << setw(1) << " ";
206
207 for ( query_index=0 ; query_index<matrix_width(); query_index++ ) {
208 char ch ;
209 switch (origin ( query_index, target_index ) )
210 {
211 case FROM_UPPER: ch = '|' ; break ;
212 case FROM_LEFT: ch = '-' ; break ;
213 case FROM_UPPER_LEFT: ch = '\\' ; break ;
214 case FROM_NOWHERE: ch = '=' ; break ;
215 default: ch = '*' ; break ;
216 }
217
218 strm << left ;
219 strm << setw(1) << match(query_index,target_index);
220 strm << setw(1) << ch ;
221 strm << setw(7) << fixed << setprecision(1)
222 << score(query_index,target_index) ;
223 }
224 strm << endl;
225 }
226 }
227
228 #if 0
229 void LocalSequenceAlignment::reset_matrix( size_t width, size_t height )
230 {
231 size_t x,y ;
232
233 highest_scored_query_index = 0 ;
234 highest_scored_target_index = 0 ;
235
236 for (x=0; x<width; x++)
237 score_matrix[x][0] = 0 ;
238 for (y=0; y<height; y++)
239 score_matrix[0][y] = 0 ;
240
241 }
242
243 void LocalSequenceAlignment::populate_matrix ( )
244 {
245 size_t query_index ;
246 size_t target_index ;
247
248 ssize_t highest_score = 0 ;
249
250 for ( query_index=1; query_index<matrix_width(); query_index++ ) {
251 for ( target_index=1 ; target_index<matrix_height(); target_index++ ) {
252 ssize_t score = alignment_score(query_index, target_index);
253
254 //printf("score(q=%zu,t=%zu)=%zu\n", query_index, target_index, score ) ;
255 score_matrix[query_index][target_index] = (score>0) ? score : 0 ;
256
257 //NOTE
258 // not sure ">=" is strictly correct SW (might be just ">")
259 if ( score > highest_score ) {
260 highest_scored_query_index = query_index ;
261 highest_scored_target_index = target_index ;
262 highest_score = score ;
263 }
264 }
265
266 }
267 }
268
269 void LocalSequenceAlignment::find_optimal_alignment ( )
270 {
271 size_t query_index = highest_scored_query_index ;
272 size_t target_index = highest_scored_target_index;
273
274 _alignment_results.query_end = query_index-1 ;
275 _alignment_results.target_end= target_index-1 ;
276
277 _alignment_results.score = score_matrix[query_index][target_index];
278
279 _alignment_results.matches = 0 ;
280 _alignment_results.mismatches = 0 ;
281
282 while ( query_index > 0 || target_index > 0 ) {
283 if ( score_matrix[query_index][target_index]==0)
284 break ;
285
286 //go "left" in the matrix
287 if ( query_index>0 &&
288 score_matrix[query_index][target_index] == score_matrix[query_index-1][target_index] + gap_panelty() ) {
289
290 _alignment_results.target_alignment += "-" ;
291 _alignment_results.query_alignment += query_sequence()[query_index-1] ;
292 query_index--;
293 }
294 else
295 //go "up-left" in the matrix
296 if ( query_index>0 && target_index>0 &&
297 score_matrix[query_index][target_index] ==
298 score_matrix[query_index-1][target_index-1] + match_score(query_index, target_index) ) {
299
300 _alignment_results.target_alignment += target_sequence()[target_index-1];
301 _alignment_results.query_alignment += query_sequence()[query_index-1] ;
302
303 (query_sequence()[query_index-1] == target_sequence()[target_index-1]) ?
304 (++_alignment_results.matches) : (++_alignment_results.mismatches) ;
305
306 query_index--;
307 target_index--;
308 }
309 else
310 //go "up" in the matrix
311 {
312 _alignment_results.target_alignment += target_sequence()[target_index-1];
313 _alignment_results.query_alignment += "-" ;
314 target_index--;
315 }
316 }
317
318 _alignment_results.query_start = query_index ;
319 _alignment_results.target_start= target_index ;
320
321 _alignment_results.query_size = query_sequence().length();
322 _alignment_results.target_size= target_sequence().length();
323
324 std::reverse(_alignment_results.target_alignment.begin(), _alignment_results.target_alignment.end());
325 std::reverse(_alignment_results.query_alignment.begin(), _alignment_results.query_alignment.end());
326 }
327 #endif
328
329 void HalfLocalSequenceAlignment::set_sequences(const std::string& _query, const std::string& _target)
330 {
331 //_query_sequence = _query + std::string( _target.length(), 'N' );
332 //_target_sequence = std::string( _query.length(), 'N' ) + _target;
333 _query_sequence = _query ;
334 _target_sequence = _target ;
335 }
336
337
338 void HalfLocalSequenceAlignment::reset_matrix( size_t width, size_t height )
339 {
340 size_t x,y ;
341
342 highest_scored_query_index = 0 ;
343 highest_scored_target_index = 0 ;
344
345 for (x=0; x<width; x++) {
346 query_border[x] =
347 //gap_panelty() * (ssize_t)x ;
348 //((query_sequence()[x-1]=='N') ? neutral_panelty() : gap_panelty()) * (ssize_t)x ;
349 //((query_sequence()[x-1]=='N') ? 0 : gap_panelty()) * (ssize_t)x ;
350 0 ;
351 }
352
353 for (y=0; y<height; y++) {
354 target_border[y] =
355 ( y <= 3 ) ? 0 : (gap_panelty() * (ssize_t)(y-3));
356 //0;
357 //((target_sequence()[y-1]=='N') ? 0 : gap_panelty()) * (ssize_t)y ;
358 //((target_sequence()[y-1]=='N') ? neutral_panelty() : gap_panelty()) * (ssize_t)y ;
359 }
360
361 }
362
363 void HalfLocalSequenceAlignment::populate_matrix ( )
364 {
365 size_t query_index ;
366 size_t target_index ;
367 DIRECTION origin = FROM_LEFT;
368
369 score_type highest_score = -1000000 ;
370 highest_scored_query_index = -1 ;
371 highest_scored_target_index = -1 ;
372
373 for ( query_index=0; query_index<matrix_width(); query_index++ ) {
374 for ( target_index=0 ; target_index<matrix_height(); target_index++ ) {
375
376 //Note:
377 // 'safe_score()' can accept negative value of -1 (and will return the border value)
378 score_type up_score = safe_score(query_index, ((ssize_t)target_index)-1) + gap_panelty() ;
379 score_type left_score = safe_score(((ssize_t)query_index)-1,target_index ) + gap_panelty() ;
380 score_type upleft_score = safe_score(((ssize_t)query_index)-1,((ssize_t)target_index)-1) +
381 nucleotide_match_score(query_index, target_index);
382
383 //On the diagonal line, best score can not come from upper cell
384 //only from left or upper-left cells
385 if ( target_index>3 && target_index-3 > query_index ) {
386 left_score = -100000 ;
387 }
388
389 //printf("query_index=%d, target_index=%d, upscore=%f, left_score=%f, upleft_score=%f\n",
390 // query_index, target_index, up_score,left_score,upleft_score );
391
392 score_type score = -100000000 ;
393
394 if ( upleft_score > score ) {
395 score = upleft_score ;
396 origin = FROM_UPPER_LEFT;
397 }
398 if ( up_score > score ) {
399 score = up_score ;
400 origin = FROM_UPPER ;
401 }
402 if ( left_score > score ) {
403 score = left_score ;
404 origin = FROM_LEFT ;
405 }
406 //printf("query_index=%d, target_index=%d, score=%f origin=%d\n",
407 // query_index, target_index, score, origin );
408
409 /*if (score<0) {
410 score = 0 ;
411 origin = FROM_NOWHERE ;
412 }*/
413
414 score_matrix[query_index][target_index] = score ;
415 origin_matrix[query_index][target_index] = origin ;
416
417 //NOTE
418 // not sure ">=" is strictly correct SW (might be just ">")
419 if ( score > highest_score ) {
420 highest_scored_query_index = query_index ;
421 highest_scored_target_index = target_index ;
422 highest_score = score ;
423 }
424 }
425 }
426 }
427
428 bool HalfLocalSequenceAlignment::starting_point_close_to_end_of_sequences(const size_t query_index, const size_t target_index) const
429 {
430 if ( (size_t)query_index >= query_sequence().length() - 2 ||
431 (size_t)target_index >= target_sequence().length() - 2 ) {
432 /* We've reach either the end of the Adapter
433 * (and the adapter is shorter than the query)
434 * Or the end of the query
435 * (and the adapter covers up to the end of the query, and then continues on)
436 *
437 * So we can safely start the alignment from this point
438 */
439 return true;
440 }
441 else {
442 /* The adapter is not covering the query until the end.
443 */
444 return false;
445 }
446 }
447
448 #undef DEBUG_STARTING_POINT
449 void HalfLocalSequenceAlignment::find_alignment_starting_point(ssize_t &new_query_index, ssize_t &new_target_index) const
450 {
451 /*
452 * Force the alignment to start from the end of the query,
453 * find the best score at the end of the query
454 *
455 * Try (desperately) to find a match that starts at the end of the query or the end of the target/adapter)
456 */
457 score_type max_score = score( matrix_width()-1, matrix_height()-1 ) ;
458 for ( size_t q_index = 0 ; q_index < matrix_width(); q_index++ ) {
459 for ( size_t t_index = matrix_height()-2 ; t_index < matrix_height(); t_index++ ) {
460 if ( origin ( q_index, t_index ) > 0 &&
461 safe_score ( q_index, t_index ) > max_score ) {
462 max_score = safe_score ( q_index, t_index ) ;
463 #ifdef DEBUG_STARTING_POINT
464 printf("Found new max score = %f at %d,%d\n", max_score, q_index, t_index ) ;
465 #endif
466 new_target_index = t_index ;
467 new_query_index = q_index ;
468 }
469 }
470 }
471 for ( size_t q_index = matrix_width()-2 ; q_index < matrix_width(); q_index++ ) {
472 for ( size_t t_index = 0 ; t_index < matrix_height(); t_index++ ) {
473 if ( origin ( q_index, t_index ) > 0 &&
474 safe_score ( q_index, t_index ) > max_score ) {
475 max_score = safe_score ( q_index, t_index ) ;
476 #ifdef DEBUG_STARTING_POINT
477 printf("Found new max score = %f at %d,%d\n", max_score, q_index, t_index ) ;
478 #endif
479 new_target_index = t_index ;
480 new_query_index = q_index ;
481 }
482 }
483 }
484 #ifdef DEBUG_STARTING_POINT
485 printf("Forcing alignment from query_index=%d, target_index=%d, score=%f, origin=%d\n",
486 new_query_index, new_target_index,
487 score ( new_query_index, new_target_index ),
488 origin ( new_query_index, new_target_index ) );
489 #endif
490 }
491
492
493 #undef DEBUG_FIND_OPTIMAL_ALIGNMENT
494 SequenceAlignmentResults HalfLocalSequenceAlignment::find_optimal_alignment_from_point ( const size_t query_start, const size_t target_start ) const
495 {
496 SequenceAlignmentResults results;
497
498 results.query_sequence = query_sequence();
499 results.target_sequence= target_sequence();
500
501 ssize_t query_index = query_start;
502 ssize_t target_index = target_start ;
503
504 results.query_end = query_index ;
505 results.target_end= target_index ;
506
507 #ifdef DEBUG_FIND_OPTIMAL_ALIGNMENT
508 printf ( "backtrace starting from (qindex=%d, tindex=%d, score=%f)\n",
509 query_index, target_index, score_matrix[query_index][target_index]) ;
510 #endif
511
512 while ( query_index >= 0 && target_index >= 0 ) {
513
514 const char q_nuc = query_nucleotide(query_index);
515 const char t_nuc = target_nucleotide(target_index);
516
517 const DIRECTION current_origin = origin(query_index, target_index);
518 const char current_match = match ( query_index, target_index ) ;
519
520
521 #ifdef DEBUG_FIND_OPTIMAL_ALIGNMENT
522 const score_type current_score = score(query_index, target_index);
523 printf("query_index=%d target_index=%d query=%c target=%c score_matrix=%3.1f origin=%d accumulated_score = %3.2f\n",
524 query_index, target_index,
525 q_nuc, t_nuc,
526 current_score,
527 current_origin,
528 results.score) ;
529 #endif
530
531 results.query_start = query_index ;
532 results.target_start= target_index ;
533
534 switch ( current_origin )
535 {
536 case FROM_LEFT:
537 results.target_alignment += "-" ;
538 results.query_alignment += q_nuc ;
539 results.gaps++;
540 results.score += gap_panelty();
541
542 query_index--;
543 break ;
544
545 case FROM_UPPER_LEFT:
546 results.target_alignment += t_nuc;
547 results.query_alignment += q_nuc ;
548
549 switch ( current_match )
550 {
551 case 'N':
552 results.neutral_matches++ ;
553 results.score += neutral_panelty();
554 break ;
555
556 case 'M':
557 results.matches++;
558 results.score += match_panelty();
559 break;
560
561 case 'x':
562 results.mismatches++;
563 results.score += mismatch_panelty();
564 break ;
565
566 default:
567 errx(1,"Internal error: unknown match type (%c) at query_index=%zu, target_index=%zu\n",
568 current_match, query_index, target_index ) ;
569 }
570
571 query_index--;
572 target_index--;
573 break ;
574
575 case FROM_UPPER:
576 results.target_alignment += t_nuc ;
577 results.query_alignment += "-" ;
578 results.gaps++;
579 results.score += gap_panelty();
580
581 target_index--;
582 break;
583
584 case FROM_NOWHERE:
585 default:
586 print_matrix();
587 printf("Invalid origin (%d) at query_index=%zu, target_index=%zu\n",
588 current_origin, query_index, target_index ) ;
589 printf("Query = %s\n", query_sequence().c_str());
590 printf("Target= %s\n", target_sequence().c_str());
591 exit(1);
592 }
593 }
594
595 results.query_size = query_sequence().length();
596 results.target_size= target_sequence().length();
597
598 std::reverse(results.target_alignment.begin(),results.target_alignment.end());
599 std::reverse(results.query_alignment.begin(), results.query_alignment.end());
600
601 return results;
602 }
603
604 void HalfLocalSequenceAlignment::find_optimal_alignment ( )
605 {
606 SequenceAlignmentResults results ;
607
608
609 //Try to find a good alignment,
610 //starting from the highest score cell.
611 results = find_optimal_alignment_from_point ( highest_scored_query_index,
612 highest_scored_target_index ) ;
613
614 //Some heuristics:
615 //If the adapter matched 7 nucleotides anywhere in the query
616 //without mismatches/gaps, accept it.
617 if ( results.matches >= 7
618 &&
619 results.mismatches == 0
620 &&
621 results.gaps == 0 ) {
622
623 _alignment_results = results ;
624 return ;
625 }
626
627 if ( starting_point_close_to_end_of_sequences ( highest_scored_query_index,
628 highest_scored_target_index ) ) {
629 //We're already very close to the end of the target or query,
630 //can't improve much else, so return what we've got.
631 _alignment_results = results ;
632 return ;
633 }
634
635
636 //More heuristics:
637 /* The adapter is not covering the query until the end.
638 * Force the alignment to start from the end of the query,
639 * find the best score at the end of the query
640 *
641 * Try (desperately) to find a match that starts at the end of the query or the end of the target/adapter)
642 */
643 ssize_t query_index = highest_scored_query_index ;
644 ssize_t target_index = highest_scored_target_index;
645 find_alignment_starting_point ( query_index, target_index ) ;
646
647 _alignment_results = results ;
648 }
649
650 void HalfLocalSequenceAlignment::post_process()
651 {
652 #if 0
653 //Removes the Ns which were added in 'set_sequences'
654 //And adjust the results values accordingly
655
656 //return ;
657 //_query_sequence.erase ( _query_sequence.find_last_not_of('N') + 1) ;
658 //_target_sequence.erase ( 0, _target_sequence.find_first_not_of('N') ) ;
659
660 _alignment_results.query_sequence.erase ( _query_sequence.find_last_not_of('N') + 1) ;
661 _alignment_results.target_sequence.erase ( 0, _target_sequence.find_first_not_of('N') ) ;
662 _alignment_results.query_size = _alignment_results.query_sequence.length();
663 _alignment_results.target_size = _alignment_results.target_sequence.length();
664
665
666 size_t query_n_position = _alignment_results.query_alignment.find_last_not_of('N') ;
667 int query_n_count;
668
669 if ( query_n_position != string::npos )
670 query_n_count = _alignment_results.query_alignment.length() - query_n_position ;
671 else
672 query_n_count = 0 ;
673
674 int target_n_count = _alignment_results.target_alignment.find_first_not_of('N') ;
675
676 if (query_n_position != string::npos )
677 _alignment_results.query_alignment.erase( query_n_position ) ;
678 _alignment_results.target_alignment.erase( 0,target_n_count ) ;
679
680 //Update Results strucure
681 _alignment_results.query_start+= target_n_count ;
682 _alignment_results.query_end -= query_n_count ;
683
684 _alignment_results.target_start = 0;
685 _alignment_results.target_end = _alignment_results.query_end - _alignment_results.query_start ;
686
687 _alignment_results.query_alignment.erase ( 0, _alignment_results.query_start ) ;
688 _alignment_results.target_alignment.erase ( _alignment_results.target_end+1 ) ;
689
690 //Update match/mismatch/gap counts
691 _alignment_results.matches = 0 ;
692 _alignment_results.mismatches = 0 ;
693 _alignment_results.gaps = 0 ;
694
695 for (size_t index=0; index<_alignment_results.query_alignment.length(); index++) {
696 char q = _alignment_results.query_alignment[index];
697 char t = _alignment_results.target_alignment[index];
698
699 if ( q == '-' || t=='-' ) {
700 _alignment_results.gaps ++ ;
701 } else {
702 if ( q== t )
703 _alignment_results.matches++;
704 else
705 _alignment_results.mismatches++;
706 }
707 }
708 #endif
709 }
710