Mercurial > repos > fcaramia > custom_amplicon_alignment
diff alignCustomAmplicon/utils.cpp @ 0:d32bddcff685 draft
Uploaded
author | fcaramia |
---|---|
date | Wed, 09 Jan 2013 00:24:09 -0500 |
parents | |
children | 0aaf65fbb48a |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignCustomAmplicon/utils.cpp Wed Jan 09 00:24:09 2013 -0500 @@ -0,0 +1,498 @@ +#include <stdio.h> +#include <string> +#include <vector> + +#define max(a,b,c) a > b ? ( a > c ? a : c ) : ( b > c ? b : c ) ; +#define min(a,b,c) a < b ? ( a < c ? a : c ) : ( b < c ? b : c ) ; +using namespace std; + +void printMatrix (double* m, int rows, int cols) +{ + for(int i = 0; i < rows; i++) + { + for(int j = 0; j < cols; j++) + { + printf("%.2f ", *(m + i*cols +j)); + } + printf("\n"); + } + +} + +int levenshteinDistance(string s, string t) +{ + //Calculate Edit distance of 2 strings + if(s == t) + return 0; + int size1 = s.length()+1; + int size2 = t.length()+1; + int d[size1][size2]; + + for (int i=0; i<size1; i++)d[i][0] = i; + for (int i=0; i<size2; i++)d[0][i] = i; + + for (int i=1; i<size1; i++) + for (int j=1; j<size2; j++) + { + if(s[i-1] == t[j-1] ) + d[i][j] = d[i-1][j-1]; //if equeal no operation + else + d[i][j] = min(d[i-1][j] + 1,d[i][j-1] + 1,d[i-1][j-1] + 1 ); //Addition, deletion or substitution + + } + + return d[size1-1][size2-1]; // return edit distance +} + +double nw_align2 (string &seq1, string &seq2, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool noFrontGapPenalty, bool noEndGapPenalty, bool debug) +{ + //Do NW alignment of 2 strings... + int size1 = seq1.length(); + int size2 = seq2.length(); + + double scores[size1+1][size2+1]; + char dir[size1+1][size2+1]; + char gaps[size1+1][size2+1]; + + for(int i = 0; i <= size1; i++) + { + scores[i][0] = (double)(i-1)*gapPenExt + gapPenOpen; + dir[i][0] = 'U'; + gaps[i][0] = 'O'; + } + for(int j = 0; j <= size2; j++) + { + scores[0][j] = (double)(j-1)*gapPenExt + gapPenOpen; + dir[0][j] = 'L'; + gaps[0][j] = 'O'; + } + + scores[0][0] = 0; + gaps[0][0] = 'N'; + + if (noFrontGapPenalty) + { + for(int i = 0; i <= size1; i++) scores[i][0] = 0; + for(int j = 0; j <= size2; j++) scores[0][j] = 0; + } + + double match; + double del; + double insert; + + for(int i = 1; i <= size1; i++) + for(int j = 1; j <= size2; j++) + { + if (seq1[i-1] == 'N' || seq2[j-1] == 'N') + { + match = scores[i-1][j-1]; + } + else + { + match = scores[i-1][j-1] + (double)(seq1[i-1] == seq2[j-1]? matchScore : missmatch_penalty); //1.9f clustalw + } + del = scores[i-1][j] + (gaps[i-1][j] == 'N'? gapPenOpen:gapPenExt); + insert = scores[i][j-1] + (gaps[i][j-1] == 'N'? gapPenOpen:gapPenExt); + //debug + //printf("I:%d J:%d match = %f del %f insert:%f\n",i,j,match,del,insert); + + if (match>del && match > insert) + { + scores[i][j] = match; + dir[i][j] = 'D'; + gaps[i][j] = 'N'; + } + else if(del > insert) + { + scores[i][j] = del; + dir[i][j] = 'U'; + gaps[i][j] = 'O'; + } + else + { + scores[i][j] = insert; + dir[i][j] = 'L'; + gaps[i][j] = 'O'; + } + } + + //debug + //if (debug) + // printMatrix(*scores, size1+1, size2+1); + //printMatrix(*dir, size1+1, size2+1); + + string alignment1=""; + string alignment2=""; + + + int cont1 = size1; + int cont2 = size2; + int max1 = size1; + int max2 = size2; + double num = scores[size1][size2];; + + if (noEndGapPenalty) + { + //Search Maxes + + for(int i = 1; i <= size1; i++) + { + if(num<scores[i][size2]) + { + num = scores[i][size2]; + max1 = i; + max2 = size2; + } + } + for(int j = 1; j <= size2; j++) + { + if(num<scores[size1][j]) + { + num = scores[size1][j]; + max1 = size1; + max2 = j; + } + } + cont1 = max1; + cont2 = max2; + } + + while (cont1 > 0 && cont2 > 0) + { + if(dir[cont1][cont2] == 'D') + { + alignment1+= seq1[cont1-1]; + alignment2+= seq2[cont2-1]; + cont1--; + cont2--; + + } + else if(dir[cont1][cont2] == 'L') + { + alignment1+= '-'; + alignment2+= seq2[cont2-1]; + cont2--; + } + else + { + alignment1+= seq1[cont1-1]; + alignment2+= '-'; + cont1--; + + } + } + + while (cont1 > 0) + { + alignment1+= seq1[cont1-1]; + alignment2+= '-'; + cont1--; + } + + while (cont2 > 0) + { + alignment1+= '-'; + alignment2+= seq2[cont2-1]; + cont2--; + } + + alignment1 = string (alignment1.rbegin(),alignment1.rend()); + alignment2 = string (alignment2.rbegin(),alignment2.rend()); + + + if (noEndGapPenalty) + { + //Need to fill out rest of alignment... + if (max1 != size1) + { + for (int i=max1; i<size1; ++i ) + { + alignment1+= seq1 [i]; + alignment2+= '-'; + + } + } + + if (max2 != size2) + { + + for (int i=max2; i<size2; ++i ) + { + alignment2+= seq2 [i]; + alignment1+= '-'; + + } + } + + } + if (debug) + { + printf("Seq1: %s\n", alignment1.c_str()); + printf("Seq2: %s\n\n", alignment2.c_str()); + + } + + //Returns + seq1 = alignment1; + seq2 = alignment2; + + return num; + +} + +vector <string> nw_alignAlignments (vector<string> a1, vector<string> a2, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool noFrontGapPenalty, bool noEndGapPenalty, bool debug) +{ + //Do NW alignment of 2 strings... + int size1 = a1[0].length(); + int size2 = a2[0].length(); + int v1 = a1.size(); + int v2 = a2.size(); + int combs = v1 * v2; + + vector<string> msa; + msa.clear(); + vector<string> alignment1, alignment2; + + for (int i=0;i<v1;++i) + alignment1.push_back(string()); + for (int i=0;i<v2;++i) + alignment2.push_back(string()); + + + + double scores[size1+1][size2+1]; + char dir[size1+1][size2+1]; + char gaps[size1+1][size2+1]; + + for(int i = 0; i <= size1; i++) + { + scores[i][0] = (double)(i-1)*gapPenExt + gapPenOpen; + dir[i][0] = 'U'; + gaps[i][0] = 'O'; + } + for(int j = 0; j <= size2; j++) + { + scores[0][j] = (double)(j-1)*gapPenExt + gapPenOpen; + dir[0][j] = 'L'; + gaps[0][j] = 'O'; + } + + scores[0][0] = 0; + gaps[0][0] = 'N'; + + if (noFrontGapPenalty) + { + for(int i = 0; i <= size1; i++) scores[i][0] = 0; + for(int j = 0; j <= size2; j++) scores[0][j] = 0; + } + + double match; + double del; + double insert; + + for(int i = 1; i <= size1; i++) + for(int j = 1; j <= size2; j++) + { + + match = del = insert = 0.0f; + for(int z=0;z<v1;++z) + { + for(int w=0;w<v2;++w) + { + if (a1[z][i-1] == 'N' || a2[w][j-1] == 'N') + { + match += scores[i-1][j-1]; + } + else + { + match += scores[i-1][j-1] + (double)(a1[z][i-1] == a2[w][j-1]? matchScore:missmatch_penalty); + } + del += scores[i-1][j] + (gaps[i-1][j] == 'N'? gapPenOpen:gapPenExt); + insert += scores[i][j-1] + (gaps[i][j-1] == 'N'? gapPenOpen:gapPenExt); + } + } + + match /= combs; + del /= combs; + insert /= combs; + + if (match > del && match > insert) + { + scores[i][j] = match; + dir[i][j] = 'D'; + gaps[i][j] = 'N'; + } + else if(del > insert) + { + scores[i][j] = del; + dir[i][j] = 'U'; + gaps[i][j] = 'O'; + } + else + { + scores[i][j] = insert; + dir[i][j] = 'L'; + gaps[i][j] = 'O'; + } + } + + //debug +// if (debug) +// printMatrix(*scores, size1+1, size2+1); + //printMatrix(*dir, size1+1, size2+1); + + + int cont1 = size1; + int cont2 = size2; + int max1 = size1; + int max2 = size2; + double num = scores[size1][size2]; + + if (noEndGapPenalty) + { + //Search Maxes + + for(int i = 1; i <= size1; i++) + { + if(num<scores[i][size2]) + { + num = scores[i][size2]; + max1 = i; + max2 = size2; + } + } + for(int j = 1; j <= size2; j++) + { + if(num<scores[size1][j]) + { + num = scores[size1][j]; + max1 = size1; + max2 = j; + } + } + cont1 = max1; + cont2 = max2; + } + + while (cont1 > 0 && cont2 > 0) + { + if(dir[cont1][cont2] == 'D') + { + + for(int z=0;z<v1;++z) + alignment1[z]+= a1[z][cont1-1]; + + for(int w=0;w<v2;++w) + alignment2[w]+= a2[w][cont2-1]; + + cont1--; + cont2--; + + } + else if(dir[cont1][cont2] == 'L') + { + for(int z=0;z<v1;++z) + alignment1[z]+= '-'; + for(int w=0;w<v2;++w) + alignment2[w]+= a2[w][cont2-1]; + + cont2--; + } + else + { + for(int z=0;z<v1;++z) + alignment1[z]+= a1[z][cont1-1]; + + for(int w=0;w<v2;++w) + alignment2[w]+= '-'; + + cont1--; + + } + } + + while (cont1 > 0) + { + for(int z=0;z<v1;++z) + alignment1[z]+= a1[z][cont1-1]; + for(int w=0;w<v2;++w) + alignment2[w]+= '-'; + + cont1--; + } + + while (cont2 > 0) + { + for(int z=0;z<v1;++z) + alignment1[z]+= '-'; + for(int w=0;w<v2;++w) + alignment2[w]+= a2[w][cont2-1]; + + cont2--; + } + + + + + for(int z=0;z<v1;++z) + { + alignment1[z] = string (alignment1[z].rbegin(),alignment1[z].rend()); + } + for(int w=0;w<v2;++w) + { + alignment2[w] = string (alignment2[w].rbegin(),alignment2[w].rend()); + } + + + + if (noEndGapPenalty) + { + //Need to fill out rest of alignment... + if (max1 != size1) + { + for (int i=max1; i<size1; ++i ) + { + for(int z=0;z<v1;++z) + alignment1[z]+= a1[z][i]; + for(int w=0;w<v2;++w) + alignment2[w]+= '-'; + + } + } + + if (max2 != size2) + { + + for (int i=max2; i<size2; ++i ) + { + for(int z=0;z<v1;++z) + alignment1[z]+= '-'; + for(int w=0;w<v2;++w) + alignment2[w]+= a2[w][i]; + + } + } + + } + + //returns + for(int z=0;z<v1;++z) + msa.push_back(alignment1[z]); + for(int w=0;w<v2;++w) + msa.push_back(alignment2[w]); + + if (debug) + { + for(int z=0;z<v1;++z) + printf("%s\n",alignment1[z].c_str()); + for(int w=0;w<v2;++w) + printf("%s\n",alignment2[w].c_str()); + + } + + return msa; + +} + + +