3
|
1 #include <string>
|
|
2 #include <vector>
|
|
3 #include <ostream>
|
|
4 #include <iostream>
|
|
5 #include <algorithm>
|
|
6 #include <iomanip>
|
|
7 #include <err.h>
|
|
8
|
|
9 #include "sequence_alignment.h"
|
|
10
|
|
11 using namespace std;
|
|
12
|
|
13 void SequenceAlignmentResults::print(std::ostream& strm) const
|
|
14 {
|
|
15 size_t delta;
|
|
16 size_t index;
|
|
17
|
|
18 strm << "Query-Alingment = " << query_alignment << endl ;
|
|
19 strm << "target-Alingment= " << target_alignment << endl ;
|
|
20
|
|
21
|
|
22 strm << (alignment_found ? "Alignment Found" : "Alignment NOT found") << endl;
|
|
23 strm << "Score = " << score << " ("
|
|
24 << matches << " matches, "
|
|
25 << neutral_matches << " neutral-matches, "
|
|
26 << mismatches << " mismatches, "
|
|
27 << gaps << " gaps) "
|
|
28 << std::endl ;
|
|
29
|
|
30 strm << "Query = " << query_sequence
|
|
31 << "(qsize " << query_size
|
|
32 << " qstart " << query_start
|
|
33 << " qend " << query_end
|
|
34 << std::endl ;
|
|
35
|
|
36 strm << "Target= " << target_sequence
|
|
37 << "(tsize " << target_size
|
|
38 << " tstart " << target_start
|
|
39 << " tend " << target_end
|
|
40 << std::endl ;
|
|
41
|
|
42 strm << endl;
|
|
43
|
|
44 delta = max(target_start, query_start);
|
|
45
|
|
46
|
|
47 //Spaces before the query string
|
|
48 if ( delta - query_start > 0 )
|
|
49 strm << std::string( delta - query_start-1, ' ') ;
|
|
50 //Un-Aligned query part (prefix)
|
|
51 if ( query_start > 0 )
|
|
52 strm << query_sequence.substr(0, query_start-1) ;
|
|
53 //Aligned query part
|
|
54 strm << "(" << query_alignment << ")";
|
|
55 //Un-Aligned query part (suffix)
|
|
56 if ( query_end < query_sequence.length() )
|
|
57 strm << query_sequence.substr( query_end+1 ) ;
|
|
58 strm << std::endl ;
|
|
59
|
|
60 //Alignment bars
|
|
61 if ( delta > 0 )
|
|
62 strm << std::string( delta-1, ' ') ;
|
|
63 strm << "(" ;
|
|
64 for (index=0; index<query_alignment.length(); index++) {
|
|
65 strm << ((query_alignment[index]==target_alignment[index]) ? '*' : '|' );
|
|
66 }
|
|
67 strm << ")" ;
|
|
68 strm << std::endl;
|
|
69
|
|
70 //Spaces before the target string
|
|
71 if ( delta - target_start > 0 )
|
|
72 strm << std::string( delta - target_start, ' ') ;
|
|
73 //Un-Aligned target part (prefix)
|
|
74 if ( target_start > 0 )
|
|
75 strm << target_sequence.substr(0, target_start-1);
|
|
76 //Aligned target part
|
|
77 strm << "(" << target_alignment << ")";
|
|
78
|
|
79 //Un-Aligned target part (suffix)
|
|
80 if ( target_end < target_sequence.length() )
|
|
81 strm << target_sequence.substr( target_end+1 );
|
|
82 strm << std::endl;
|
|
83
|
|
84 }
|
|
85
|
|
86 SequenceAlignment::SequenceAlignment ( ) :
|
|
87 _gap_panelty(-5),
|
|
88 _match_panelty(1),
|
|
89 _mismatch_panelty(-1),
|
|
90 _neutral_panelty(0.1)
|
|
91
|
|
92 {
|
|
93 }
|
|
94
|
|
95
|
|
96 void SequenceAlignment::set_sequences(const std::string& _query, const std::string& _target)
|
|
97 {
|
|
98 _query_sequence = _query ;
|
|
99 _target_sequence = _target ;
|
|
100 }
|
|
101
|
|
102 void SequenceAlignment::reset_alignment_results()
|
|
103 {
|
|
104 _alignment_results = SequenceAlignmentResults() ;
|
|
105 //
|
|
106 //Reset the results
|
|
107 _alignment_results.query_sequence = query_sequence() ;
|
|
108 _alignment_results.target_sequence = target_sequence() ;
|
|
109 }
|
|
110
|
|
111 const SequenceAlignmentResults& SequenceAlignment::align ( const std::string& query, const std::string& target )
|
|
112 {
|
|
113 set_sequences ( query, target ) ;
|
|
114
|
|
115 reset_alignment_results();
|
|
116
|
|
117 resize_matrix ( query_sequence().length(), target_sequence().length() ) ;
|
|
118 populate_match_matrix();
|
|
119
|
|
120 reset_matrix( matrix_width(), matrix_height() );
|
|
121 populate_matrix();
|
|
122 find_optimal_alignment();
|
|
123
|
|
124 post_process();
|
|
125
|
|
126 return _alignment_results;
|
|
127 }
|
|
128
|
|
129 void SequenceAlignment::resize_matrix(size_t width, size_t height)
|
|
130 {
|
|
131 size_t i;
|
|
132
|
|
133 if ( matrix_width() >= width && matrix_height() >= height )
|
|
134 return ;
|
|
135
|
|
136 query_border.resize ( width ) ;
|
|
137 target_border.resize ( height ) ;
|
|
138
|
|
139 score_matrix.resize ( width );
|
|
140 for (i=0;i<width;i++)
|
|
141 score_matrix[i].resize(height) ;
|
|
142
|
|
143 origin_matrix.resize ( width );
|
|
144 for (i=0;i<width;i++)
|
|
145 origin_matrix[i].resize(height) ;
|
|
146
|
|
147 match_matrix.resize ( width );
|
|
148 for (i=0;i<width;i++) {
|
|
149 match_matrix[i].resize(height) ;
|
|
150 }
|
|
151 }
|
|
152
|
|
153 void SequenceAlignment::populate_match_matrix()
|
|
154 {
|
|
155 for (size_t x=0; x<matrix_width(); x++)
|
|
156 for(size_t y=0;y<matrix_height();y++)
|
|
157 match_matrix[x][y] =
|
|
158 match_value ( query_nucleotide(x), target_nucleotide(y) ) ;
|
|
159 }
|
|
160
|
|
161
|
|
162 void SequenceAlignment::post_process()
|
|
163 {
|
|
164
|
|
165 }
|
|
166
|
|
167 void SequenceAlignment::print_matrix(std::ostream &strm) const
|
|
168 {
|
|
169 size_t query_index ;
|
|
170 size_t target_index ;
|
|
171
|
|
172 #if 0
|
|
173 printf("Match-Matrix:\n");
|
|
174 printf(" - ");
|
|
175 for ( target_index=1; target_index<matrix_height(); target_index++ )
|
|
176 printf(" %c ", target_sequence()[target_index-1] );
|
|
177 printf("\n");
|
|
178
|
|
179 for ( query_index=1; query_index<matrix_width(); query_index++ ) {
|
|
180
|
|
181 printf(" %c ", query_sequence()[query_index-1]) ;
|
|
182
|
|
183 for ( target_index=1 ; target_index<matrix_height(); target_index++ ) {
|
|
184 printf(" %c ", match_matrix[query_index][target_index]);
|
|
185 }
|
|
186 printf("\n");
|
|
187 }
|
|
188 #endif
|
|
189
|
|
190 strm << "Score-Matrix:" << endl ;
|
|
191
|
|
192 //Print Target nucleotides
|
|
193 strm << setw(2) << left << "-" << setw(7) << "-" ;
|
|
194 for ( query_index=0; query_index<matrix_width(); query_index++ )
|
|
195 strm << setw(9) << left << query_nucleotide ( query_index ) ;
|
|
196 strm << endl;
|
|
197 strm << setw(2) << left << "-" << setw(7) << "-" ;
|
|
198 for ( query_index=0; query_index<matrix_width(); query_index++ )
|
|
199 strm << setw(9) << left << query_border[query_index] ;
|
|
200 strm << endl;
|
|
201
|
|
202 for ( target_index=0; target_index<matrix_height(); target_index++ ) {
|
|
203
|
|
204 strm << setw(2) << left << target_nucleotide ( target_index ) ;
|
|
205 strm << setw(6) << right << target_border[target_index] << setw(1) << " ";
|
|
206
|
|
207 for ( query_index=0 ; query_index<matrix_width(); query_index++ ) {
|
|
208 char ch ;
|
|
209 switch (origin ( query_index, target_index ) )
|
|
210 {
|
|
211 case FROM_UPPER: ch = '|' ; break ;
|
|
212 case FROM_LEFT: ch = '-' ; break ;
|
|
213 case FROM_UPPER_LEFT: ch = '\\' ; break ;
|
|
214 case FROM_NOWHERE: ch = '=' ; break ;
|
|
215 default: ch = '*' ; break ;
|
|
216 }
|
|
217
|
|
218 strm << left ;
|
|
219 strm << setw(1) << match(query_index,target_index);
|
|
220 strm << setw(1) << ch ;
|
|
221 strm << setw(7) << fixed << setprecision(1)
|
|
222 << score(query_index,target_index) ;
|
|
223 }
|
|
224 strm << endl;
|
|
225 }
|
|
226 }
|
|
227
|
|
228 #if 0
|
|
229 void LocalSequenceAlignment::reset_matrix( size_t width, size_t height )
|
|
230 {
|
|
231 size_t x,y ;
|
|
232
|
|
233 highest_scored_query_index = 0 ;
|
|
234 highest_scored_target_index = 0 ;
|
|
235
|
|
236 for (x=0; x<width; x++)
|
|
237 score_matrix[x][0] = 0 ;
|
|
238 for (y=0; y<height; y++)
|
|
239 score_matrix[0][y] = 0 ;
|
|
240
|
|
241 }
|
|
242
|
|
243 void LocalSequenceAlignment::populate_matrix ( )
|
|
244 {
|
|
245 size_t query_index ;
|
|
246 size_t target_index ;
|
|
247
|
|
248 ssize_t highest_score = 0 ;
|
|
249
|
|
250 for ( query_index=1; query_index<matrix_width(); query_index++ ) {
|
|
251 for ( target_index=1 ; target_index<matrix_height(); target_index++ ) {
|
|
252 ssize_t score = alignment_score(query_index, target_index);
|
|
253
|
|
254 //printf("score(q=%zu,t=%zu)=%zu\n", query_index, target_index, score ) ;
|
|
255 score_matrix[query_index][target_index] = (score>0) ? score : 0 ;
|
|
256
|
|
257 //NOTE
|
|
258 // not sure ">=" is strictly correct SW (might be just ">")
|
|
259 if ( score > highest_score ) {
|
|
260 highest_scored_query_index = query_index ;
|
|
261 highest_scored_target_index = target_index ;
|
|
262 highest_score = score ;
|
|
263 }
|
|
264 }
|
|
265
|
|
266 }
|
|
267 }
|
|
268
|
|
269 void LocalSequenceAlignment::find_optimal_alignment ( )
|
|
270 {
|
|
271 size_t query_index = highest_scored_query_index ;
|
|
272 size_t target_index = highest_scored_target_index;
|
|
273
|
|
274 _alignment_results.query_end = query_index-1 ;
|
|
275 _alignment_results.target_end= target_index-1 ;
|
|
276
|
|
277 _alignment_results.score = score_matrix[query_index][target_index];
|
|
278
|
|
279 _alignment_results.matches = 0 ;
|
|
280 _alignment_results.mismatches = 0 ;
|
|
281
|
|
282 while ( query_index > 0 || target_index > 0 ) {
|
|
283 if ( score_matrix[query_index][target_index]==0)
|
|
284 break ;
|
|
285
|
|
286 //go "left" in the matrix
|
|
287 if ( query_index>0 &&
|
|
288 score_matrix[query_index][target_index] == score_matrix[query_index-1][target_index] + gap_panelty() ) {
|
|
289
|
|
290 _alignment_results.target_alignment += "-" ;
|
|
291 _alignment_results.query_alignment += query_sequence()[query_index-1] ;
|
|
292 query_index--;
|
|
293 }
|
|
294 else
|
|
295 //go "up-left" in the matrix
|
|
296 if ( query_index>0 && target_index>0 &&
|
|
297 score_matrix[query_index][target_index] ==
|
|
298 score_matrix[query_index-1][target_index-1] + match_score(query_index, target_index) ) {
|
|
299
|
|
300 _alignment_results.target_alignment += target_sequence()[target_index-1];
|
|
301 _alignment_results.query_alignment += query_sequence()[query_index-1] ;
|
|
302
|
|
303 (query_sequence()[query_index-1] == target_sequence()[target_index-1]) ?
|
|
304 (++_alignment_results.matches) : (++_alignment_results.mismatches) ;
|
|
305
|
|
306 query_index--;
|
|
307 target_index--;
|
|
308 }
|
|
309 else
|
|
310 //go "up" in the matrix
|
|
311 {
|
|
312 _alignment_results.target_alignment += target_sequence()[target_index-1];
|
|
313 _alignment_results.query_alignment += "-" ;
|
|
314 target_index--;
|
|
315 }
|
|
316 }
|
|
317
|
|
318 _alignment_results.query_start = query_index ;
|
|
319 _alignment_results.target_start= target_index ;
|
|
320
|
|
321 _alignment_results.query_size = query_sequence().length();
|
|
322 _alignment_results.target_size= target_sequence().length();
|
|
323
|
|
324 std::reverse(_alignment_results.target_alignment.begin(), _alignment_results.target_alignment.end());
|
|
325 std::reverse(_alignment_results.query_alignment.begin(), _alignment_results.query_alignment.end());
|
|
326 }
|
|
327 #endif
|
|
328
|
|
329 void HalfLocalSequenceAlignment::set_sequences(const std::string& _query, const std::string& _target)
|
|
330 {
|
|
331 //_query_sequence = _query + std::string( _target.length(), 'N' );
|
|
332 //_target_sequence = std::string( _query.length(), 'N' ) + _target;
|
|
333 _query_sequence = _query ;
|
|
334 _target_sequence = _target ;
|
|
335 }
|
|
336
|
|
337
|
|
338 void HalfLocalSequenceAlignment::reset_matrix( size_t width, size_t height )
|
|
339 {
|
|
340 size_t x,y ;
|
|
341
|
|
342 highest_scored_query_index = 0 ;
|
|
343 highest_scored_target_index = 0 ;
|
|
344
|
|
345 for (x=0; x<width; x++) {
|
|
346 query_border[x] =
|
|
347 //gap_panelty() * (ssize_t)x ;
|
|
348 //((query_sequence()[x-1]=='N') ? neutral_panelty() : gap_panelty()) * (ssize_t)x ;
|
|
349 //((query_sequence()[x-1]=='N') ? 0 : gap_panelty()) * (ssize_t)x ;
|
|
350 0 ;
|
|
351 }
|
|
352
|
|
353 for (y=0; y<height; y++) {
|
|
354 target_border[y] =
|
|
355 ( y <= 3 ) ? 0 : (gap_panelty() * (ssize_t)(y-3));
|
|
356 //0;
|
|
357 //((target_sequence()[y-1]=='N') ? 0 : gap_panelty()) * (ssize_t)y ;
|
|
358 //((target_sequence()[y-1]=='N') ? neutral_panelty() : gap_panelty()) * (ssize_t)y ;
|
|
359 }
|
|
360
|
|
361 }
|
|
362
|
|
363 void HalfLocalSequenceAlignment::populate_matrix ( )
|
|
364 {
|
|
365 size_t query_index ;
|
|
366 size_t target_index ;
|
|
367 DIRECTION origin = FROM_LEFT;
|
|
368
|
|
369 score_type highest_score = -1000000 ;
|
|
370 highest_scored_query_index = -1 ;
|
|
371 highest_scored_target_index = -1 ;
|
|
372
|
|
373 for ( query_index=0; query_index<matrix_width(); query_index++ ) {
|
|
374 for ( target_index=0 ; target_index<matrix_height(); target_index++ ) {
|
|
375
|
|
376 //Note:
|
|
377 // 'safe_score()' can accept negative value of -1 (and will return the border value)
|
|
378 score_type up_score = safe_score(query_index, ((ssize_t)target_index)-1) + gap_panelty() ;
|
|
379 score_type left_score = safe_score(((ssize_t)query_index)-1,target_index ) + gap_panelty() ;
|
|
380 score_type upleft_score = safe_score(((ssize_t)query_index)-1,((ssize_t)target_index)-1) +
|
|
381 nucleotide_match_score(query_index, target_index);
|
|
382
|
|
383 //On the diagonal line, best score can not come from upper cell
|
|
384 //only from left or upper-left cells
|
|
385 if ( target_index>3 && target_index-3 > query_index ) {
|
|
386 left_score = -100000 ;
|
|
387 }
|
|
388
|
|
389 //printf("query_index=%d, target_index=%d, upscore=%f, left_score=%f, upleft_score=%f\n",
|
|
390 // query_index, target_index, up_score,left_score,upleft_score );
|
|
391
|
|
392 score_type score = -100000000 ;
|
|
393
|
|
394 if ( upleft_score > score ) {
|
|
395 score = upleft_score ;
|
|
396 origin = FROM_UPPER_LEFT;
|
|
397 }
|
|
398 if ( up_score > score ) {
|
|
399 score = up_score ;
|
|
400 origin = FROM_UPPER ;
|
|
401 }
|
|
402 if ( left_score > score ) {
|
|
403 score = left_score ;
|
|
404 origin = FROM_LEFT ;
|
|
405 }
|
|
406 //printf("query_index=%d, target_index=%d, score=%f origin=%d\n",
|
|
407 // query_index, target_index, score, origin );
|
|
408
|
|
409 /*if (score<0) {
|
|
410 score = 0 ;
|
|
411 origin = FROM_NOWHERE ;
|
|
412 }*/
|
|
413
|
|
414 score_matrix[query_index][target_index] = score ;
|
|
415 origin_matrix[query_index][target_index] = origin ;
|
|
416
|
|
417 //NOTE
|
|
418 // not sure ">=" is strictly correct SW (might be just ">")
|
|
419 if ( score > highest_score ) {
|
|
420 highest_scored_query_index = query_index ;
|
|
421 highest_scored_target_index = target_index ;
|
|
422 highest_score = score ;
|
|
423 }
|
|
424 }
|
|
425 }
|
|
426 }
|
|
427
|
|
428 bool HalfLocalSequenceAlignment::starting_point_close_to_end_of_sequences(const size_t query_index, const size_t target_index) const
|
|
429 {
|
|
430 if ( (size_t)query_index >= query_sequence().length() - 2 ||
|
|
431 (size_t)target_index >= target_sequence().length() - 2 ) {
|
|
432 /* We've reach either the end of the Adapter
|
|
433 * (and the adapter is shorter than the query)
|
|
434 * Or the end of the query
|
|
435 * (and the adapter covers up to the end of the query, and then continues on)
|
|
436 *
|
|
437 * So we can safely start the alignment from this point
|
|
438 */
|
|
439 return true;
|
|
440 }
|
|
441 else {
|
|
442 /* The adapter is not covering the query until the end.
|
|
443 */
|
|
444 return false;
|
|
445 }
|
|
446 }
|
|
447
|
|
448 #undef DEBUG_STARTING_POINT
|
|
449 void HalfLocalSequenceAlignment::find_alignment_starting_point(ssize_t &new_query_index, ssize_t &new_target_index) const
|
|
450 {
|
|
451 /*
|
|
452 * Force the alignment to start from the end of the query,
|
|
453 * find the best score at the end of the query
|
|
454 *
|
|
455 * Try (desperately) to find a match that starts at the end of the query or the end of the target/adapter)
|
|
456 */
|
|
457 score_type max_score = score( matrix_width()-1, matrix_height()-1 ) ;
|
|
458 for ( size_t q_index = 0 ; q_index < matrix_width(); q_index++ ) {
|
|
459 for ( size_t t_index = matrix_height()-2 ; t_index < matrix_height(); t_index++ ) {
|
|
460 if ( origin ( q_index, t_index ) > 0 &&
|
|
461 safe_score ( q_index, t_index ) > max_score ) {
|
|
462 max_score = safe_score ( q_index, t_index ) ;
|
|
463 #ifdef DEBUG_STARTING_POINT
|
|
464 printf("Found new max score = %f at %d,%d\n", max_score, q_index, t_index ) ;
|
|
465 #endif
|
|
466 new_target_index = t_index ;
|
|
467 new_query_index = q_index ;
|
|
468 }
|
|
469 }
|
|
470 }
|
|
471 for ( size_t q_index = matrix_width()-2 ; q_index < matrix_width(); q_index++ ) {
|
|
472 for ( size_t t_index = 0 ; t_index < matrix_height(); t_index++ ) {
|
|
473 if ( origin ( q_index, t_index ) > 0 &&
|
|
474 safe_score ( q_index, t_index ) > max_score ) {
|
|
475 max_score = safe_score ( q_index, t_index ) ;
|
|
476 #ifdef DEBUG_STARTING_POINT
|
|
477 printf("Found new max score = %f at %d,%d\n", max_score, q_index, t_index ) ;
|
|
478 #endif
|
|
479 new_target_index = t_index ;
|
|
480 new_query_index = q_index ;
|
|
481 }
|
|
482 }
|
|
483 }
|
|
484 #ifdef DEBUG_STARTING_POINT
|
|
485 printf("Forcing alignment from query_index=%d, target_index=%d, score=%f, origin=%d\n",
|
|
486 new_query_index, new_target_index,
|
|
487 score ( new_query_index, new_target_index ),
|
|
488 origin ( new_query_index, new_target_index ) );
|
|
489 #endif
|
|
490 }
|
|
491
|
|
492
|
|
493 #undef DEBUG_FIND_OPTIMAL_ALIGNMENT
|
|
494 SequenceAlignmentResults HalfLocalSequenceAlignment::find_optimal_alignment_from_point ( const size_t query_start, const size_t target_start ) const
|
|
495 {
|
|
496 SequenceAlignmentResults results;
|
|
497
|
|
498 results.query_sequence = query_sequence();
|
|
499 results.target_sequence= target_sequence();
|
|
500
|
|
501 ssize_t query_index = query_start;
|
|
502 ssize_t target_index = target_start ;
|
|
503
|
|
504 results.query_end = query_index ;
|
|
505 results.target_end= target_index ;
|
|
506
|
|
507 #ifdef DEBUG_FIND_OPTIMAL_ALIGNMENT
|
|
508 printf ( "backtrace starting from (qindex=%d, tindex=%d, score=%f)\n",
|
|
509 query_index, target_index, score_matrix[query_index][target_index]) ;
|
|
510 #endif
|
|
511
|
|
512 while ( query_index >= 0 && target_index >= 0 ) {
|
|
513
|
|
514 const char q_nuc = query_nucleotide(query_index);
|
|
515 const char t_nuc = target_nucleotide(target_index);
|
|
516
|
|
517 const DIRECTION current_origin = origin(query_index, target_index);
|
|
518 const char current_match = match ( query_index, target_index ) ;
|
|
519
|
|
520
|
|
521 #ifdef DEBUG_FIND_OPTIMAL_ALIGNMENT
|
|
522 const score_type current_score = score(query_index, target_index);
|
|
523 printf("query_index=%d target_index=%d query=%c target=%c score_matrix=%3.1f origin=%d accumulated_score = %3.2f\n",
|
|
524 query_index, target_index,
|
|
525 q_nuc, t_nuc,
|
|
526 current_score,
|
|
527 current_origin,
|
|
528 results.score) ;
|
|
529 #endif
|
|
530
|
|
531 results.query_start = query_index ;
|
|
532 results.target_start= target_index ;
|
|
533
|
|
534 switch ( current_origin )
|
|
535 {
|
|
536 case FROM_LEFT:
|
|
537 results.target_alignment += "-" ;
|
|
538 results.query_alignment += q_nuc ;
|
|
539 results.gaps++;
|
|
540 results.score += gap_panelty();
|
|
541
|
|
542 query_index--;
|
|
543 break ;
|
|
544
|
|
545 case FROM_UPPER_LEFT:
|
|
546 results.target_alignment += t_nuc;
|
|
547 results.query_alignment += q_nuc ;
|
|
548
|
|
549 switch ( current_match )
|
|
550 {
|
|
551 case 'N':
|
|
552 results.neutral_matches++ ;
|
|
553 results.score += neutral_panelty();
|
|
554 break ;
|
|
555
|
|
556 case 'M':
|
|
557 results.matches++;
|
|
558 results.score += match_panelty();
|
|
559 break;
|
|
560
|
|
561 case 'x':
|
|
562 results.mismatches++;
|
|
563 results.score += mismatch_panelty();
|
|
564 break ;
|
|
565
|
|
566 default:
|
|
567 errx(1,"Internal error: unknown match type (%c) at query_index=%zu, target_index=%zu\n",
|
|
568 current_match, query_index, target_index ) ;
|
|
569 }
|
|
570
|
|
571 query_index--;
|
|
572 target_index--;
|
|
573 break ;
|
|
574
|
|
575 case FROM_UPPER:
|
|
576 results.target_alignment += t_nuc ;
|
|
577 results.query_alignment += "-" ;
|
|
578 results.gaps++;
|
|
579 results.score += gap_panelty();
|
|
580
|
|
581 target_index--;
|
|
582 break;
|
|
583
|
|
584 case FROM_NOWHERE:
|
|
585 default:
|
|
586 print_matrix();
|
|
587 printf("Invalid origin (%d) at query_index=%zu, target_index=%zu\n",
|
|
588 current_origin, query_index, target_index ) ;
|
|
589 printf("Query = %s\n", query_sequence().c_str());
|
|
590 printf("Target= %s\n", target_sequence().c_str());
|
|
591 exit(1);
|
|
592 }
|
|
593 }
|
|
594
|
|
595 results.query_size = query_sequence().length();
|
|
596 results.target_size= target_sequence().length();
|
|
597
|
|
598 std::reverse(results.target_alignment.begin(),results.target_alignment.end());
|
|
599 std::reverse(results.query_alignment.begin(), results.query_alignment.end());
|
|
600
|
|
601 return results;
|
|
602 }
|
|
603
|
|
604 void HalfLocalSequenceAlignment::find_optimal_alignment ( )
|
|
605 {
|
|
606 SequenceAlignmentResults results ;
|
|
607
|
|
608
|
|
609 //Try to find a good alignment,
|
|
610 //starting from the highest score cell.
|
|
611 results = find_optimal_alignment_from_point ( highest_scored_query_index,
|
|
612 highest_scored_target_index ) ;
|
|
613
|
|
614 //Some heuristics:
|
|
615 //If the adapter matched 7 nucleotides anywhere in the query
|
|
616 //without mismatches/gaps, accept it.
|
|
617 if ( results.matches >= 7
|
|
618 &&
|
|
619 results.mismatches == 0
|
|
620 &&
|
|
621 results.gaps == 0 ) {
|
|
622
|
|
623 _alignment_results = results ;
|
|
624 return ;
|
|
625 }
|
|
626
|
|
627 if ( starting_point_close_to_end_of_sequences ( highest_scored_query_index,
|
|
628 highest_scored_target_index ) ) {
|
|
629 //We're already very close to the end of the target or query,
|
|
630 //can't improve much else, so return what we've got.
|
|
631 _alignment_results = results ;
|
|
632 return ;
|
|
633 }
|
|
634
|
|
635
|
|
636 //More heuristics:
|
|
637 /* The adapter is not covering the query until the end.
|
|
638 * Force the alignment to start from the end of the query,
|
|
639 * find the best score at the end of the query
|
|
640 *
|
|
641 * Try (desperately) to find a match that starts at the end of the query or the end of the target/adapter)
|
|
642 */
|
|
643 ssize_t query_index = highest_scored_query_index ;
|
|
644 ssize_t target_index = highest_scored_target_index;
|
|
645 find_alignment_starting_point ( query_index, target_index ) ;
|
|
646
|
|
647 _alignment_results = results ;
|
|
648 }
|
|
649
|
|
650 void HalfLocalSequenceAlignment::post_process()
|
|
651 {
|
|
652 #if 0
|
|
653 //Removes the Ns which were added in 'set_sequences'
|
|
654 //And adjust the results values accordingly
|
|
655
|
|
656 //return ;
|
|
657 //_query_sequence.erase ( _query_sequence.find_last_not_of('N') + 1) ;
|
|
658 //_target_sequence.erase ( 0, _target_sequence.find_first_not_of('N') ) ;
|
|
659
|
|
660 _alignment_results.query_sequence.erase ( _query_sequence.find_last_not_of('N') + 1) ;
|
|
661 _alignment_results.target_sequence.erase ( 0, _target_sequence.find_first_not_of('N') ) ;
|
|
662 _alignment_results.query_size = _alignment_results.query_sequence.length();
|
|
663 _alignment_results.target_size = _alignment_results.target_sequence.length();
|
|
664
|
|
665
|
|
666 size_t query_n_position = _alignment_results.query_alignment.find_last_not_of('N') ;
|
|
667 int query_n_count;
|
|
668
|
|
669 if ( query_n_position != string::npos )
|
|
670 query_n_count = _alignment_results.query_alignment.length() - query_n_position ;
|
|
671 else
|
|
672 query_n_count = 0 ;
|
|
673
|
|
674 int target_n_count = _alignment_results.target_alignment.find_first_not_of('N') ;
|
|
675
|
|
676 if (query_n_position != string::npos )
|
|
677 _alignment_results.query_alignment.erase( query_n_position ) ;
|
|
678 _alignment_results.target_alignment.erase( 0,target_n_count ) ;
|
|
679
|
|
680 //Update Results strucure
|
|
681 _alignment_results.query_start+= target_n_count ;
|
|
682 _alignment_results.query_end -= query_n_count ;
|
|
683
|
|
684 _alignment_results.target_start = 0;
|
|
685 _alignment_results.target_end = _alignment_results.query_end - _alignment_results.query_start ;
|
|
686
|
|
687 _alignment_results.query_alignment.erase ( 0, _alignment_results.query_start ) ;
|
|
688 _alignment_results.target_alignment.erase ( _alignment_results.target_end+1 ) ;
|
|
689
|
|
690 //Update match/mismatch/gap counts
|
|
691 _alignment_results.matches = 0 ;
|
|
692 _alignment_results.mismatches = 0 ;
|
|
693 _alignment_results.gaps = 0 ;
|
|
694
|
|
695 for (size_t index=0; index<_alignment_results.query_alignment.length(); index++) {
|
|
696 char q = _alignment_results.query_alignment[index];
|
|
697 char t = _alignment_results.target_alignment[index];
|
|
698
|
|
699 if ( q == '-' || t=='-' ) {
|
|
700 _alignment_results.gaps ++ ;
|
|
701 } else {
|
|
702 if ( q== t )
|
|
703 _alignment_results.matches++;
|
|
704 else
|
|
705 _alignment_results.mismatches++;
|
|
706 }
|
|
707 }
|
|
708 #endif
|
|
709 }
|
|
710
|