0
|
1 #include <stdio.h>
|
|
2 #include <string>
|
|
3 #include <vector>
|
|
4
|
|
5 #define max(a,b,c) a > b ? ( a > c ? a : c ) : ( b > c ? b : c ) ;
|
|
6 #define min(a,b,c) a < b ? ( a < c ? a : c ) : ( b < c ? b : c ) ;
|
|
7 using namespace std;
|
|
8
|
5
|
9 void printMatrix (float* m, int rows, int cols)
|
0
|
10 {
|
|
11 for(int i = 0; i < rows; i++)
|
|
12 {
|
|
13 for(int j = 0; j < cols; j++)
|
|
14 {
|
|
15 printf("%.2f ", *(m + i*cols +j));
|
|
16 }
|
|
17 printf("\n");
|
|
18 }
|
|
19
|
|
20 }
|
|
21
|
|
22 int levenshteinDistance(string s, string t)
|
|
23 {
|
|
24 //Calculate Edit distance of 2 strings
|
|
25 if(s == t)
|
|
26 return 0;
|
|
27 int size1 = s.length()+1;
|
|
28 int size2 = t.length()+1;
|
|
29 int d[size1][size2];
|
|
30
|
|
31 for (int i=0; i<size1; i++)d[i][0] = i;
|
|
32 for (int i=0; i<size2; i++)d[0][i] = i;
|
|
33
|
|
34 for (int i=1; i<size1; i++)
|
|
35 for (int j=1; j<size2; j++)
|
|
36 {
|
|
37 if(s[i-1] == t[j-1] )
|
|
38 d[i][j] = d[i-1][j-1]; //if equeal no operation
|
|
39 else
|
|
40 d[i][j] = min(d[i-1][j] + 1,d[i][j-1] + 1,d[i-1][j-1] + 1 ); //Addition, deletion or substitution
|
|
41
|
|
42 }
|
|
43
|
|
44 return d[size1-1][size2-1]; // return edit distance
|
|
45 }
|
|
46
|
5
|
47 float nw_align2 (string &seq1, string &seq2, float matchScore, float missmatch_penalty, float gapPenOpen, float gapPenExt, bool noFrontGapPenalty, bool noEndGapPenalty, bool debug)
|
0
|
48 {
|
|
49 //Do NW alignment of 2 strings...
|
|
50 int size1 = seq1.length();
|
|
51 int size2 = seq2.length();
|
|
52
|
5
|
53 float scores[size1+1][size2+1];
|
0
|
54 char dir[size1+1][size2+1];
|
|
55 char gaps[size1+1][size2+1];
|
|
56
|
|
57 for(int i = 0; i <= size1; i++)
|
|
58 {
|
5
|
59 scores[i][0] = (float)(i-1)*gapPenExt + gapPenOpen;
|
0
|
60 dir[i][0] = 'U';
|
5
|
61 gaps[i][0] = 'D';
|
0
|
62 }
|
|
63 for(int j = 0; j <= size2; j++)
|
|
64 {
|
5
|
65 scores[0][j] = (float)(j-1)*gapPenExt + gapPenOpen;
|
0
|
66 dir[0][j] = 'L';
|
5
|
67 gaps[0][j] = 'I';
|
0
|
68 }
|
|
69
|
|
70 scores[0][0] = 0;
|
|
71 gaps[0][0] = 'N';
|
|
72
|
|
73 if (noFrontGapPenalty)
|
|
74 {
|
|
75 for(int i = 0; i <= size1; i++) scores[i][0] = 0;
|
|
76 for(int j = 0; j <= size2; j++) scores[0][j] = 0;
|
|
77 }
|
|
78
|
5
|
79 float match;
|
|
80 float del;
|
|
81 float insert;
|
0
|
82
|
|
83 for(int i = 1; i <= size1; i++)
|
|
84 for(int j = 1; j <= size2; j++)
|
|
85 {
|
|
86 if (seq1[i-1] == 'N' || seq2[j-1] == 'N')
|
|
87 {
|
|
88 match = scores[i-1][j-1];
|
|
89 }
|
|
90 else
|
|
91 {
|
5
|
92 match = scores[i-1][j-1] + (float)(seq1[i-1] == seq2[j-1]? matchScore : missmatch_penalty); //1.9f clustalw
|
0
|
93 }
|
5
|
94 del = scores[i-1][j] + (gaps[i-1][j] != 'D'? gapPenOpen:gapPenExt);
|
|
95 insert = scores[i][j-1] + (gaps[i][j-1] != 'I'? gapPenOpen:gapPenExt);
|
|
96
|
0
|
97 //debug
|
5
|
98 //if (match == del || match == insert)
|
|
99 //{
|
|
100 //printf("I:%d J:%d match = %f del %f insert %f %c %c\n",i,j,match,del,insert,seq1[i-1],seq2[j-1]);
|
|
101 //}
|
0
|
102
|
5
|
103 if (match > del && match > insert)
|
|
104 {
|
|
105 scores[i][j] = match;
|
|
106 dir[i][j] = 'D';
|
|
107 gaps[i][j] = 'N';
|
|
108 }
|
|
109 else if(match >= del && match >= insert && seq1[i-1] != seq2[j-1])
|
0
|
110 {
|
|
111 scores[i][j] = match;
|
|
112 dir[i][j] = 'D';
|
|
113 gaps[i][j] = 'N';
|
|
114 }
|
|
115 else if(del > insert)
|
|
116 {
|
|
117 scores[i][j] = del;
|
|
118 dir[i][j] = 'U';
|
5
|
119 gaps[i][j] = 'D';
|
0
|
120 }
|
|
121 else
|
|
122 {
|
|
123 scores[i][j] = insert;
|
|
124 dir[i][j] = 'L';
|
5
|
125 gaps[i][j] = 'I';
|
0
|
126 }
|
|
127 }
|
|
128
|
|
129 //debug
|
|
130 //if (debug)
|
|
131 // printMatrix(*scores, size1+1, size2+1);
|
|
132 //printMatrix(*dir, size1+1, size2+1);
|
|
133
|
|
134 string alignment1="";
|
|
135 string alignment2="";
|
|
136
|
|
137
|
|
138 int cont1 = size1;
|
|
139 int cont2 = size2;
|
|
140 int max1 = size1;
|
|
141 int max2 = size2;
|
5
|
142 float num = scores[size1][size2];;
|
0
|
143
|
|
144 if (noEndGapPenalty)
|
|
145 {
|
|
146 //Search Maxes
|
|
147
|
|
148 for(int i = 1; i <= size1; i++)
|
|
149 {
|
|
150 if(num<scores[i][size2])
|
|
151 {
|
|
152 num = scores[i][size2];
|
|
153 max1 = i;
|
|
154 max2 = size2;
|
|
155 }
|
|
156 }
|
|
157 for(int j = 1; j <= size2; j++)
|
|
158 {
|
|
159 if(num<scores[size1][j])
|
|
160 {
|
|
161 num = scores[size1][j];
|
|
162 max1 = size1;
|
|
163 max2 = j;
|
|
164 }
|
|
165 }
|
|
166 cont1 = max1;
|
|
167 cont2 = max2;
|
|
168 }
|
|
169
|
|
170 while (cont1 > 0 && cont2 > 0)
|
|
171 {
|
|
172 if(dir[cont1][cont2] == 'D')
|
|
173 {
|
|
174 alignment1+= seq1[cont1-1];
|
|
175 alignment2+= seq2[cont2-1];
|
|
176 cont1--;
|
|
177 cont2--;
|
|
178
|
|
179 }
|
|
180 else if(dir[cont1][cont2] == 'L')
|
|
181 {
|
|
182 alignment1+= '-';
|
|
183 alignment2+= seq2[cont2-1];
|
|
184 cont2--;
|
|
185 }
|
|
186 else
|
|
187 {
|
|
188 alignment1+= seq1[cont1-1];
|
|
189 alignment2+= '-';
|
|
190 cont1--;
|
|
191
|
|
192 }
|
|
193 }
|
|
194
|
|
195 while (cont1 > 0)
|
|
196 {
|
|
197 alignment1+= seq1[cont1-1];
|
|
198 alignment2+= '-';
|
|
199 cont1--;
|
|
200 }
|
|
201
|
|
202 while (cont2 > 0)
|
|
203 {
|
|
204 alignment1+= '-';
|
|
205 alignment2+= seq2[cont2-1];
|
|
206 cont2--;
|
|
207 }
|
|
208
|
|
209 alignment1 = string (alignment1.rbegin(),alignment1.rend());
|
|
210 alignment2 = string (alignment2.rbegin(),alignment2.rend());
|
|
211
|
|
212
|
|
213 if (noEndGapPenalty)
|
|
214 {
|
|
215 //Need to fill out rest of alignment...
|
|
216 if (max1 != size1)
|
|
217 {
|
|
218 for (int i=max1; i<size1; ++i )
|
|
219 {
|
|
220 alignment1+= seq1 [i];
|
|
221 alignment2+= '-';
|
|
222
|
|
223 }
|
|
224 }
|
|
225
|
|
226 if (max2 != size2)
|
|
227 {
|
|
228
|
|
229 for (int i=max2; i<size2; ++i )
|
|
230 {
|
|
231 alignment2+= seq2 [i];
|
|
232 alignment1+= '-';
|
|
233
|
|
234 }
|
|
235 }
|
|
236
|
|
237 }
|
|
238 if (debug)
|
|
239 {
|
|
240 printf("Seq1: %s\n", alignment1.c_str());
|
|
241 printf("Seq2: %s\n\n", alignment2.c_str());
|
|
242
|
|
243 }
|
|
244
|
|
245 //Returns
|
|
246 seq1 = alignment1;
|
|
247 seq2 = alignment2;
|
|
248
|
|
249 return num;
|
|
250
|
|
251 }
|
|
252
|
5
|
253 vector <string> nw_alignAlignments (vector<string> a1, vector<string> a2, float matchScore, float missmatch_penalty, float gapPenOpen, float gapPenExt, bool noFrontGapPenalty, bool noEndGapPenalty, bool debug)
|
0
|
254 {
|
|
255 //Do NW alignment of 2 strings...
|
|
256 int size1 = a1[0].length();
|
|
257 int size2 = a2[0].length();
|
|
258 int v1 = a1.size();
|
|
259 int v2 = a2.size();
|
|
260 int combs = v1 * v2;
|
|
261
|
|
262 vector<string> msa;
|
|
263 msa.clear();
|
|
264 vector<string> alignment1, alignment2;
|
|
265
|
|
266 for (int i=0;i<v1;++i)
|
|
267 alignment1.push_back(string());
|
|
268 for (int i=0;i<v2;++i)
|
|
269 alignment2.push_back(string());
|
|
270
|
|
271
|
|
272
|
5
|
273 float scores[size1+1][size2+1];
|
0
|
274 char dir[size1+1][size2+1];
|
|
275 char gaps[size1+1][size2+1];
|
|
276
|
|
277 for(int i = 0; i <= size1; i++)
|
|
278 {
|
5
|
279 scores[i][0] = (float)(i-1)*gapPenExt + gapPenOpen;
|
0
|
280 dir[i][0] = 'U';
|
5
|
281 gaps[i][0] = 'D';
|
0
|
282 }
|
|
283 for(int j = 0; j <= size2; j++)
|
|
284 {
|
5
|
285 scores[0][j] = (float)(j-1)*gapPenExt + gapPenOpen;
|
0
|
286 dir[0][j] = 'L';
|
5
|
287 gaps[0][j] = 'I';
|
0
|
288 }
|
|
289
|
|
290 scores[0][0] = 0;
|
|
291 gaps[0][0] = 'N';
|
|
292
|
|
293 if (noFrontGapPenalty)
|
|
294 {
|
|
295 for(int i = 0; i <= size1; i++) scores[i][0] = 0;
|
|
296 for(int j = 0; j <= size2; j++) scores[0][j] = 0;
|
|
297 }
|
|
298
|
5
|
299 float match;
|
|
300 float del;
|
|
301 float insert;
|
|
302 bool missmatch;
|
0
|
303
|
|
304 for(int i = 1; i <= size1; i++)
|
|
305 for(int j = 1; j <= size2; j++)
|
|
306 {
|
|
307
|
|
308 match = del = insert = 0.0f;
|
5
|
309 missmatch = true;
|
0
|
310 for(int z=0;z<v1;++z)
|
|
311 {
|
|
312 for(int w=0;w<v2;++w)
|
|
313 {
|
|
314 if (a1[z][i-1] == 'N' || a2[w][j-1] == 'N')
|
|
315 {
|
|
316 match += scores[i-1][j-1];
|
|
317 }
|
|
318 else
|
|
319 {
|
5
|
320 match += scores[i-1][j-1] + (float)(a1[z][i-1] == a2[w][j-1]? matchScore:missmatch_penalty);
|
|
321 if(a1[z][i-1] == a2[w][j-1])
|
|
322 {
|
|
323 missmatch = false;
|
|
324 }
|
0
|
325 }
|
5
|
326 del += scores[i-1][j] + (gaps[i-1][j] != 'D'? gapPenOpen:gapPenExt);
|
|
327 insert += scores[i][j-1] + (gaps[i][j-1] != 'I'? gapPenOpen:gapPenExt);
|
0
|
328 }
|
|
329 }
|
|
330
|
|
331 match /= combs;
|
|
332 del /= combs;
|
|
333 insert /= combs;
|
|
334
|
|
335 if (match > del && match > insert)
|
|
336 {
|
|
337 scores[i][j] = match;
|
|
338 dir[i][j] = 'D';
|
|
339 gaps[i][j] = 'N';
|
|
340 }
|
5
|
341 else if(match >= del && match >= insert && missmatch)
|
|
342 {
|
|
343 scores[i][j] = match;
|
|
344 dir[i][j] = 'D';
|
|
345 gaps[i][j] = 'N';
|
|
346 }
|
0
|
347 else if(del > insert)
|
|
348 {
|
|
349 scores[i][j] = del;
|
|
350 dir[i][j] = 'U';
|
5
|
351 gaps[i][j] = 'D';
|
0
|
352 }
|
|
353 else
|
|
354 {
|
|
355 scores[i][j] = insert;
|
|
356 dir[i][j] = 'L';
|
5
|
357 gaps[i][j] = 'I';
|
0
|
358 }
|
|
359 }
|
|
360
|
|
361 //debug
|
|
362 // if (debug)
|
|
363 // printMatrix(*scores, size1+1, size2+1);
|
|
364 //printMatrix(*dir, size1+1, size2+1);
|
|
365
|
|
366
|
|
367 int cont1 = size1;
|
|
368 int cont2 = size2;
|
|
369 int max1 = size1;
|
|
370 int max2 = size2;
|
5
|
371 float num = scores[size1][size2];
|
0
|
372
|
|
373 if (noEndGapPenalty)
|
|
374 {
|
|
375 //Search Maxes
|
|
376
|
|
377 for(int i = 1; i <= size1; i++)
|
|
378 {
|
|
379 if(num<scores[i][size2])
|
|
380 {
|
|
381 num = scores[i][size2];
|
|
382 max1 = i;
|
|
383 max2 = size2;
|
|
384 }
|
|
385 }
|
|
386 for(int j = 1; j <= size2; j++)
|
|
387 {
|
|
388 if(num<scores[size1][j])
|
|
389 {
|
|
390 num = scores[size1][j];
|
|
391 max1 = size1;
|
|
392 max2 = j;
|
|
393 }
|
|
394 }
|
|
395 cont1 = max1;
|
|
396 cont2 = max2;
|
|
397 }
|
|
398
|
|
399 while (cont1 > 0 && cont2 > 0)
|
|
400 {
|
|
401 if(dir[cont1][cont2] == 'D')
|
|
402 {
|
|
403
|
|
404 for(int z=0;z<v1;++z)
|
|
405 alignment1[z]+= a1[z][cont1-1];
|
|
406
|
|
407 for(int w=0;w<v2;++w)
|
|
408 alignment2[w]+= a2[w][cont2-1];
|
|
409
|
|
410 cont1--;
|
|
411 cont2--;
|
|
412
|
|
413 }
|
|
414 else if(dir[cont1][cont2] == 'L')
|
|
415 {
|
|
416 for(int z=0;z<v1;++z)
|
|
417 alignment1[z]+= '-';
|
|
418 for(int w=0;w<v2;++w)
|
|
419 alignment2[w]+= a2[w][cont2-1];
|
|
420
|
|
421 cont2--;
|
|
422 }
|
|
423 else
|
|
424 {
|
|
425 for(int z=0;z<v1;++z)
|
|
426 alignment1[z]+= a1[z][cont1-1];
|
|
427
|
|
428 for(int w=0;w<v2;++w)
|
|
429 alignment2[w]+= '-';
|
|
430
|
|
431 cont1--;
|
|
432
|
|
433 }
|
|
434 }
|
|
435
|
|
436 while (cont1 > 0)
|
|
437 {
|
|
438 for(int z=0;z<v1;++z)
|
|
439 alignment1[z]+= a1[z][cont1-1];
|
|
440 for(int w=0;w<v2;++w)
|
|
441 alignment2[w]+= '-';
|
|
442
|
|
443 cont1--;
|
|
444 }
|
|
445
|
|
446 while (cont2 > 0)
|
|
447 {
|
|
448 for(int z=0;z<v1;++z)
|
|
449 alignment1[z]+= '-';
|
|
450 for(int w=0;w<v2;++w)
|
|
451 alignment2[w]+= a2[w][cont2-1];
|
|
452
|
|
453 cont2--;
|
|
454 }
|
|
455
|
|
456
|
|
457
|
|
458
|
|
459 for(int z=0;z<v1;++z)
|
|
460 {
|
|
461 alignment1[z] = string (alignment1[z].rbegin(),alignment1[z].rend());
|
|
462 }
|
|
463 for(int w=0;w<v2;++w)
|
|
464 {
|
|
465 alignment2[w] = string (alignment2[w].rbegin(),alignment2[w].rend());
|
|
466 }
|
|
467
|
|
468
|
|
469
|
|
470 if (noEndGapPenalty)
|
|
471 {
|
|
472 //Need to fill out rest of alignment...
|
|
473 if (max1 != size1)
|
|
474 {
|
|
475 for (int i=max1; i<size1; ++i )
|
|
476 {
|
|
477 for(int z=0;z<v1;++z)
|
|
478 alignment1[z]+= a1[z][i];
|
|
479 for(int w=0;w<v2;++w)
|
|
480 alignment2[w]+= '-';
|
|
481
|
|
482 }
|
|
483 }
|
|
484
|
|
485 if (max2 != size2)
|
|
486 {
|
|
487
|
|
488 for (int i=max2; i<size2; ++i )
|
|
489 {
|
|
490 for(int z=0;z<v1;++z)
|
|
491 alignment1[z]+= '-';
|
|
492 for(int w=0;w<v2;++w)
|
|
493 alignment2[w]+= a2[w][i];
|
|
494
|
|
495 }
|
|
496 }
|
|
497
|
|
498 }
|
|
499
|
|
500 //returns
|
|
501 for(int z=0;z<v1;++z)
|
|
502 msa.push_back(alignment1[z]);
|
|
503 for(int w=0;w<v2;++w)
|
|
504 msa.push_back(alignment2[w]);
|
|
505
|
|
506 if (debug)
|
|
507 {
|
|
508 for(int z=0;z<v1;++z)
|
|
509 printf("%s\n",alignment1[z].c_str());
|
|
510 for(int w=0;w<v2;++w)
|
|
511 printf("%s\n",alignment2[w].c_str());
|
|
512
|
|
513 }
|
|
514
|
|
515 return msa;
|
|
516
|
|
517 }
|
|
518
|
|
519
|
|
520
|