0
|
1 #include <stdio.h>
|
|
2 #include <string>
|
|
3 #include <vector>
|
|
4
|
|
5 #define max(a,b,c) a > b ? ( a > c ? a : c ) : ( b > c ? b : c ) ;
|
|
6 #define min(a,b,c) a < b ? ( a < c ? a : c ) : ( b < c ? b : c ) ;
|
|
7 using namespace std;
|
|
8
|
|
9 void printMatrix (double* m, int rows, int cols)
|
|
10 {
|
|
11 for(int i = 0; i < rows; i++)
|
|
12 {
|
|
13 for(int j = 0; j < cols; j++)
|
|
14 {
|
|
15 printf("%.2f ", *(m + i*cols +j));
|
|
16 }
|
|
17 printf("\n");
|
|
18 }
|
|
19
|
|
20 }
|
|
21
|
|
22 int levenshteinDistance(string s, string t)
|
|
23 {
|
|
24 //Calculate Edit distance of 2 strings
|
|
25 if(s == t)
|
|
26 return 0;
|
|
27 int size1 = s.length()+1;
|
|
28 int size2 = t.length()+1;
|
|
29 int d[size1][size2];
|
|
30
|
|
31 for (int i=0; i<size1; i++)d[i][0] = i;
|
|
32 for (int i=0; i<size2; i++)d[0][i] = i;
|
|
33
|
|
34 for (int i=1; i<size1; i++)
|
|
35 for (int j=1; j<size2; j++)
|
|
36 {
|
|
37 if(s[i-1] == t[j-1] )
|
|
38 d[i][j] = d[i-1][j-1]; //if equeal no operation
|
|
39 else
|
|
40 d[i][j] = min(d[i-1][j] + 1,d[i][j-1] + 1,d[i-1][j-1] + 1 ); //Addition, deletion or substitution
|
|
41
|
|
42 }
|
|
43
|
|
44 return d[size1-1][size2-1]; // return edit distance
|
|
45 }
|
|
46
|
|
47 double nw_align2 (string &seq1, string &seq2, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool noFrontGapPenalty, bool noEndGapPenalty, bool debug)
|
|
48 {
|
|
49 //Do NW alignment of 2 strings...
|
|
50 int size1 = seq1.length();
|
|
51 int size2 = seq2.length();
|
|
52
|
|
53 double scores[size1+1][size2+1];
|
|
54 char dir[size1+1][size2+1];
|
|
55 char gaps[size1+1][size2+1];
|
|
56
|
|
57 for(int i = 0; i <= size1; i++)
|
|
58 {
|
|
59 scores[i][0] = (double)(i-1)*gapPenExt + gapPenOpen;
|
|
60 dir[i][0] = 'U';
|
|
61 gaps[i][0] = 'O';
|
|
62 }
|
|
63 for(int j = 0; j <= size2; j++)
|
|
64 {
|
|
65 scores[0][j] = (double)(j-1)*gapPenExt + gapPenOpen;
|
|
66 dir[0][j] = 'L';
|
|
67 gaps[0][j] = 'O';
|
|
68 }
|
|
69
|
|
70 scores[0][0] = 0;
|
|
71 gaps[0][0] = 'N';
|
|
72
|
|
73 if (noFrontGapPenalty)
|
|
74 {
|
|
75 for(int i = 0; i <= size1; i++) scores[i][0] = 0;
|
|
76 for(int j = 0; j <= size2; j++) scores[0][j] = 0;
|
|
77 }
|
|
78
|
|
79 double match;
|
|
80 double del;
|
|
81 double insert;
|
|
82
|
|
83 for(int i = 1; i <= size1; i++)
|
|
84 for(int j = 1; j <= size2; j++)
|
|
85 {
|
|
86 if (seq1[i-1] == 'N' || seq2[j-1] == 'N')
|
|
87 {
|
|
88 match = scores[i-1][j-1];
|
|
89 }
|
|
90 else
|
|
91 {
|
|
92 match = scores[i-1][j-1] + (double)(seq1[i-1] == seq2[j-1]? matchScore : missmatch_penalty); //1.9f clustalw
|
|
93 }
|
|
94 del = scores[i-1][j] + (gaps[i-1][j] == 'N'? gapPenOpen:gapPenExt);
|
|
95 insert = scores[i][j-1] + (gaps[i][j-1] == 'N'? gapPenOpen:gapPenExt);
|
|
96 //debug
|
|
97 //printf("I:%d J:%d match = %f del %f insert:%f\n",i,j,match,del,insert);
|
|
98
|
|
99 if (match>del && match > insert)
|
|
100 {
|
|
101 scores[i][j] = match;
|
|
102 dir[i][j] = 'D';
|
|
103 gaps[i][j] = 'N';
|
|
104 }
|
|
105 else if(del > insert)
|
|
106 {
|
|
107 scores[i][j] = del;
|
|
108 dir[i][j] = 'U';
|
|
109 gaps[i][j] = 'O';
|
|
110 }
|
|
111 else
|
|
112 {
|
|
113 scores[i][j] = insert;
|
|
114 dir[i][j] = 'L';
|
|
115 gaps[i][j] = 'O';
|
|
116 }
|
|
117 }
|
|
118
|
|
119 //debug
|
|
120 //if (debug)
|
|
121 // printMatrix(*scores, size1+1, size2+1);
|
|
122 //printMatrix(*dir, size1+1, size2+1);
|
|
123
|
|
124 string alignment1="";
|
|
125 string alignment2="";
|
|
126
|
|
127
|
|
128 int cont1 = size1;
|
|
129 int cont2 = size2;
|
|
130 int max1 = size1;
|
|
131 int max2 = size2;
|
|
132 double num = scores[size1][size2];;
|
|
133
|
|
134 if (noEndGapPenalty)
|
|
135 {
|
|
136 //Search Maxes
|
|
137
|
|
138 for(int i = 1; i <= size1; i++)
|
|
139 {
|
|
140 if(num<scores[i][size2])
|
|
141 {
|
|
142 num = scores[i][size2];
|
|
143 max1 = i;
|
|
144 max2 = size2;
|
|
145 }
|
|
146 }
|
|
147 for(int j = 1; j <= size2; j++)
|
|
148 {
|
|
149 if(num<scores[size1][j])
|
|
150 {
|
|
151 num = scores[size1][j];
|
|
152 max1 = size1;
|
|
153 max2 = j;
|
|
154 }
|
|
155 }
|
|
156 cont1 = max1;
|
|
157 cont2 = max2;
|
|
158 }
|
|
159
|
|
160 while (cont1 > 0 && cont2 > 0)
|
|
161 {
|
|
162 if(dir[cont1][cont2] == 'D')
|
|
163 {
|
|
164 alignment1+= seq1[cont1-1];
|
|
165 alignment2+= seq2[cont2-1];
|
|
166 cont1--;
|
|
167 cont2--;
|
|
168
|
|
169 }
|
|
170 else if(dir[cont1][cont2] == 'L')
|
|
171 {
|
|
172 alignment1+= '-';
|
|
173 alignment2+= seq2[cont2-1];
|
|
174 cont2--;
|
|
175 }
|
|
176 else
|
|
177 {
|
|
178 alignment1+= seq1[cont1-1];
|
|
179 alignment2+= '-';
|
|
180 cont1--;
|
|
181
|
|
182 }
|
|
183 }
|
|
184
|
|
185 while (cont1 > 0)
|
|
186 {
|
|
187 alignment1+= seq1[cont1-1];
|
|
188 alignment2+= '-';
|
|
189 cont1--;
|
|
190 }
|
|
191
|
|
192 while (cont2 > 0)
|
|
193 {
|
|
194 alignment1+= '-';
|
|
195 alignment2+= seq2[cont2-1];
|
|
196 cont2--;
|
|
197 }
|
|
198
|
|
199 alignment1 = string (alignment1.rbegin(),alignment1.rend());
|
|
200 alignment2 = string (alignment2.rbegin(),alignment2.rend());
|
|
201
|
|
202
|
|
203 if (noEndGapPenalty)
|
|
204 {
|
|
205 //Need to fill out rest of alignment...
|
|
206 if (max1 != size1)
|
|
207 {
|
|
208 for (int i=max1; i<size1; ++i )
|
|
209 {
|
|
210 alignment1+= seq1 [i];
|
|
211 alignment2+= '-';
|
|
212
|
|
213 }
|
|
214 }
|
|
215
|
|
216 if (max2 != size2)
|
|
217 {
|
|
218
|
|
219 for (int i=max2; i<size2; ++i )
|
|
220 {
|
|
221 alignment2+= seq2 [i];
|
|
222 alignment1+= '-';
|
|
223
|
|
224 }
|
|
225 }
|
|
226
|
|
227 }
|
|
228 if (debug)
|
|
229 {
|
|
230 printf("Seq1: %s\n", alignment1.c_str());
|
|
231 printf("Seq2: %s\n\n", alignment2.c_str());
|
|
232
|
|
233 }
|
|
234
|
|
235 //Returns
|
|
236 seq1 = alignment1;
|
|
237 seq2 = alignment2;
|
|
238
|
|
239 return num;
|
|
240
|
|
241 }
|
|
242
|
|
243 vector <string> nw_alignAlignments (vector<string> a1, vector<string> a2, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool noFrontGapPenalty, bool noEndGapPenalty, bool debug)
|
|
244 {
|
|
245 //Do NW alignment of 2 strings...
|
|
246 int size1 = a1[0].length();
|
|
247 int size2 = a2[0].length();
|
|
248 int v1 = a1.size();
|
|
249 int v2 = a2.size();
|
|
250 int combs = v1 * v2;
|
|
251
|
|
252 vector<string> msa;
|
|
253 msa.clear();
|
|
254 vector<string> alignment1, alignment2;
|
|
255
|
|
256 for (int i=0;i<v1;++i)
|
|
257 alignment1.push_back(string());
|
|
258 for (int i=0;i<v2;++i)
|
|
259 alignment2.push_back(string());
|
|
260
|
|
261
|
|
262
|
|
263 double scores[size1+1][size2+1];
|
|
264 char dir[size1+1][size2+1];
|
|
265 char gaps[size1+1][size2+1];
|
|
266
|
|
267 for(int i = 0; i <= size1; i++)
|
|
268 {
|
|
269 scores[i][0] = (double)(i-1)*gapPenExt + gapPenOpen;
|
|
270 dir[i][0] = 'U';
|
|
271 gaps[i][0] = 'O';
|
|
272 }
|
|
273 for(int j = 0; j <= size2; j++)
|
|
274 {
|
|
275 scores[0][j] = (double)(j-1)*gapPenExt + gapPenOpen;
|
|
276 dir[0][j] = 'L';
|
|
277 gaps[0][j] = 'O';
|
|
278 }
|
|
279
|
|
280 scores[0][0] = 0;
|
|
281 gaps[0][0] = 'N';
|
|
282
|
|
283 if (noFrontGapPenalty)
|
|
284 {
|
|
285 for(int i = 0; i <= size1; i++) scores[i][0] = 0;
|
|
286 for(int j = 0; j <= size2; j++) scores[0][j] = 0;
|
|
287 }
|
|
288
|
|
289 double match;
|
|
290 double del;
|
|
291 double insert;
|
|
292
|
|
293 for(int i = 1; i <= size1; i++)
|
|
294 for(int j = 1; j <= size2; j++)
|
|
295 {
|
|
296
|
|
297 match = del = insert = 0.0f;
|
|
298 for(int z=0;z<v1;++z)
|
|
299 {
|
|
300 for(int w=0;w<v2;++w)
|
|
301 {
|
|
302 if (a1[z][i-1] == 'N' || a2[w][j-1] == 'N')
|
|
303 {
|
|
304 match += scores[i-1][j-1];
|
|
305 }
|
|
306 else
|
|
307 {
|
|
308 match += scores[i-1][j-1] + (double)(a1[z][i-1] == a2[w][j-1]? matchScore:missmatch_penalty);
|
|
309 }
|
|
310 del += scores[i-1][j] + (gaps[i-1][j] == 'N'? gapPenOpen:gapPenExt);
|
|
311 insert += scores[i][j-1] + (gaps[i][j-1] == 'N'? gapPenOpen:gapPenExt);
|
|
312 }
|
|
313 }
|
|
314
|
|
315 match /= combs;
|
|
316 del /= combs;
|
|
317 insert /= combs;
|
|
318
|
|
319 if (match > del && match > insert)
|
|
320 {
|
|
321 scores[i][j] = match;
|
|
322 dir[i][j] = 'D';
|
|
323 gaps[i][j] = 'N';
|
|
324 }
|
|
325 else if(del > insert)
|
|
326 {
|
|
327 scores[i][j] = del;
|
|
328 dir[i][j] = 'U';
|
|
329 gaps[i][j] = 'O';
|
|
330 }
|
|
331 else
|
|
332 {
|
|
333 scores[i][j] = insert;
|
|
334 dir[i][j] = 'L';
|
|
335 gaps[i][j] = 'O';
|
|
336 }
|
|
337 }
|
|
338
|
|
339 //debug
|
|
340 // if (debug)
|
|
341 // printMatrix(*scores, size1+1, size2+1);
|
|
342 //printMatrix(*dir, size1+1, size2+1);
|
|
343
|
|
344
|
|
345 int cont1 = size1;
|
|
346 int cont2 = size2;
|
|
347 int max1 = size1;
|
|
348 int max2 = size2;
|
|
349 double num = scores[size1][size2];
|
|
350
|
|
351 if (noEndGapPenalty)
|
|
352 {
|
|
353 //Search Maxes
|
|
354
|
|
355 for(int i = 1; i <= size1; i++)
|
|
356 {
|
|
357 if(num<scores[i][size2])
|
|
358 {
|
|
359 num = scores[i][size2];
|
|
360 max1 = i;
|
|
361 max2 = size2;
|
|
362 }
|
|
363 }
|
|
364 for(int j = 1; j <= size2; j++)
|
|
365 {
|
|
366 if(num<scores[size1][j])
|
|
367 {
|
|
368 num = scores[size1][j];
|
|
369 max1 = size1;
|
|
370 max2 = j;
|
|
371 }
|
|
372 }
|
|
373 cont1 = max1;
|
|
374 cont2 = max2;
|
|
375 }
|
|
376
|
|
377 while (cont1 > 0 && cont2 > 0)
|
|
378 {
|
|
379 if(dir[cont1][cont2] == 'D')
|
|
380 {
|
|
381
|
|
382 for(int z=0;z<v1;++z)
|
|
383 alignment1[z]+= a1[z][cont1-1];
|
|
384
|
|
385 for(int w=0;w<v2;++w)
|
|
386 alignment2[w]+= a2[w][cont2-1];
|
|
387
|
|
388 cont1--;
|
|
389 cont2--;
|
|
390
|
|
391 }
|
|
392 else if(dir[cont1][cont2] == 'L')
|
|
393 {
|
|
394 for(int z=0;z<v1;++z)
|
|
395 alignment1[z]+= '-';
|
|
396 for(int w=0;w<v2;++w)
|
|
397 alignment2[w]+= a2[w][cont2-1];
|
|
398
|
|
399 cont2--;
|
|
400 }
|
|
401 else
|
|
402 {
|
|
403 for(int z=0;z<v1;++z)
|
|
404 alignment1[z]+= a1[z][cont1-1];
|
|
405
|
|
406 for(int w=0;w<v2;++w)
|
|
407 alignment2[w]+= '-';
|
|
408
|
|
409 cont1--;
|
|
410
|
|
411 }
|
|
412 }
|
|
413
|
|
414 while (cont1 > 0)
|
|
415 {
|
|
416 for(int z=0;z<v1;++z)
|
|
417 alignment1[z]+= a1[z][cont1-1];
|
|
418 for(int w=0;w<v2;++w)
|
|
419 alignment2[w]+= '-';
|
|
420
|
|
421 cont1--;
|
|
422 }
|
|
423
|
|
424 while (cont2 > 0)
|
|
425 {
|
|
426 for(int z=0;z<v1;++z)
|
|
427 alignment1[z]+= '-';
|
|
428 for(int w=0;w<v2;++w)
|
|
429 alignment2[w]+= a2[w][cont2-1];
|
|
430
|
|
431 cont2--;
|
|
432 }
|
|
433
|
|
434
|
|
435
|
|
436
|
|
437 for(int z=0;z<v1;++z)
|
|
438 {
|
|
439 alignment1[z] = string (alignment1[z].rbegin(),alignment1[z].rend());
|
|
440 }
|
|
441 for(int w=0;w<v2;++w)
|
|
442 {
|
|
443 alignment2[w] = string (alignment2[w].rbegin(),alignment2[w].rend());
|
|
444 }
|
|
445
|
|
446
|
|
447
|
|
448 if (noEndGapPenalty)
|
|
449 {
|
|
450 //Need to fill out rest of alignment...
|
|
451 if (max1 != size1)
|
|
452 {
|
|
453 for (int i=max1; i<size1; ++i )
|
|
454 {
|
|
455 for(int z=0;z<v1;++z)
|
|
456 alignment1[z]+= a1[z][i];
|
|
457 for(int w=0;w<v2;++w)
|
|
458 alignment2[w]+= '-';
|
|
459
|
|
460 }
|
|
461 }
|
|
462
|
|
463 if (max2 != size2)
|
|
464 {
|
|
465
|
|
466 for (int i=max2; i<size2; ++i )
|
|
467 {
|
|
468 for(int z=0;z<v1;++z)
|
|
469 alignment1[z]+= '-';
|
|
470 for(int w=0;w<v2;++w)
|
|
471 alignment2[w]+= a2[w][i];
|
|
472
|
|
473 }
|
|
474 }
|
|
475
|
|
476 }
|
|
477
|
|
478 //returns
|
|
479 for(int z=0;z<v1;++z)
|
|
480 msa.push_back(alignment1[z]);
|
|
481 for(int w=0;w<v2;++w)
|
|
482 msa.push_back(alignment2[w]);
|
|
483
|
|
484 if (debug)
|
|
485 {
|
|
486 for(int z=0;z<v1;++z)
|
|
487 printf("%s\n",alignment1[z].c_str());
|
|
488 for(int w=0;w<v2;++w)
|
|
489 printf("%s\n",alignment2[w].c_str());
|
|
490
|
|
491 }
|
|
492
|
|
493 return msa;
|
|
494
|
|
495 }
|
|
496
|
|
497
|
|
498
|