comparison PsiCLASS-1.0.2/samtools-0.1.19/kaln.c @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 /* The MIT License
2
3 Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3@gmail.com>
4
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
12
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE.
24 */
25
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29 #include <stdint.h>
30 #include <math.h>
31 #include "kaln.h"
32
33 #define FROM_M 0
34 #define FROM_I 1
35 #define FROM_D 2
36
37 typedef struct {
38 int i, j;
39 unsigned char ctype;
40 } path_t;
41
42 int aln_sm_blosum62[] = {
43 /* A R N D C Q E G H I L K M F P S T W Y V * X */
44 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0,
45 -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1,
46 -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1,
47 -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1,
48 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2,
49 -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1,
50 -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1,
51 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1,
52 -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1,
53 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1,
54 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1,
55 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1,
56 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1,
57 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1,
58 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2,
59 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0,
60 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0,
61 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2,
62 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1,
63 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1,
64 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4,
65 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1
66 };
67
68 int aln_sm_blast[] = {
69 1, -3, -3, -3, -2,
70 -3, 1, -3, -3, -2,
71 -3, -3, 1, -3, -2,
72 -3, -3, -3, 1, -2,
73 -2, -2, -2, -2, -2
74 };
75
76 int aln_sm_qual[] = {
77 0, -23, -23, -23, 0,
78 -23, 0, -23, -23, 0,
79 -23, -23, 0, -23, 0,
80 -23, -23, -23, 0, 0,
81 0, 0, 0, 0, 0
82 };
83
84 ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 };
85 ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 };
86
87 ka_param2_t ka_param2_qual = { 37, 11, 37, 11, 37, 11, 0, 0, aln_sm_qual, 5, 50 };
88
89 static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
90 {
91 int i, n;
92 uint32_t *cigar;
93 unsigned char last_type;
94
95 if (path_len == 0 || path == 0) {
96 *n_cigar = 0;
97 return 0;
98 }
99
100 last_type = path->ctype;
101 for (i = n = 1; i < path_len; ++i) {
102 if (last_type != path[i].ctype) ++n;
103 last_type = path[i].ctype;
104 }
105 *n_cigar = n;
106 cigar = (uint32_t*)calloc(*n_cigar, 4);
107
108 cigar[0] = 1u << 4 | path[path_len-1].ctype;
109 last_type = path[path_len-1].ctype;
110 for (i = path_len - 2, n = 0; i >= 0; --i) {
111 if (path[i].ctype == last_type) cigar[n] += 1u << 4;
112 else {
113 cigar[++n] = 1u << 4 | path[i].ctype;
114 last_type = path[i].ctype;
115 }
116 }
117
118 return cigar;
119 }
120
121 /***************************/
122 /* START OF common_align.c */
123 /***************************/
124
125 #define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF;
126
127 #define set_M(MM, cur, p, sc) \
128 { \
129 if ((p)->M >= (p)->I) { \
130 if ((p)->M >= (p)->D) { \
131 (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \
132 } else { \
133 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
134 } \
135 } else { \
136 if ((p)->I > (p)->D) { \
137 (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \
138 } else { \
139 (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \
140 } \
141 } \
142 }
143 #define set_I(II, cur, p) \
144 { \
145 if ((p)->M - gap_open > (p)->I) { \
146 (cur)->It = FROM_M; \
147 (II) = (p)->M - gap_open - gap_ext; \
148 } else { \
149 (cur)->It = FROM_I; \
150 (II) = (p)->I - gap_ext; \
151 } \
152 }
153 #define set_end_I(II, cur, p) \
154 { \
155 if (gap_end_ext >= 0) { \
156 if ((p)->M - gap_end_open > (p)->I) { \
157 (cur)->It = FROM_M; \
158 (II) = (p)->M - gap_end_open - gap_end_ext; \
159 } else { \
160 (cur)->It = FROM_I; \
161 (II) = (p)->I - gap_end_ext; \
162 } \
163 } else set_I(II, cur, p); \
164 }
165 #define set_D(DD, cur, p) \
166 { \
167 if ((p)->M - gap_open > (p)->D) { \
168 (cur)->Dt = FROM_M; \
169 (DD) = (p)->M - gap_open - gap_ext; \
170 } else { \
171 (cur)->Dt = FROM_D; \
172 (DD) = (p)->D - gap_ext; \
173 } \
174 }
175 #define set_end_D(DD, cur, p) \
176 { \
177 if (gap_end_ext >= 0) { \
178 if ((p)->M - gap_end_open > (p)->D) { \
179 (cur)->Dt = FROM_M; \
180 (DD) = (p)->M - gap_end_open - gap_end_ext; \
181 } else { \
182 (cur)->Dt = FROM_D; \
183 (DD) = (p)->D - gap_end_ext; \
184 } \
185 } else set_D(DD, cur, p); \
186 }
187
188 typedef struct {
189 uint8_t Mt:3, It:2, Dt:3;
190 } dpcell_t;
191
192 typedef struct {
193 int M, I, D;
194 } dpscore_t;
195
196 /***************************
197 * banded global alignment *
198 ***************************/
199 uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar)
200 {
201 int i, j;
202 dpcell_t **dpcell, *q;
203 dpscore_t *curr, *last, *s;
204 int b1, b2, tmp_end;
205 int *mat, end, max = 0;
206 uint8_t type, ctype;
207 uint32_t *cigar = 0;
208
209 int gap_open, gap_ext, gap_end_open, gap_end_ext, b;
210 int *score_matrix, N_MATRIX_ROW;
211
212 /* initialize some align-related parameters. just for compatibility */
213 gap_open = ap->gap_open;
214 gap_ext = ap->gap_ext;
215 gap_end_open = ap->gap_end_open;
216 gap_end_ext = ap->gap_end_ext;
217 b = ap->band_width;
218 score_matrix = ap->matrix;
219 N_MATRIX_ROW = ap->row;
220
221 if (n_cigar) *n_cigar = 0;
222 if (len1 == 0 || len2 == 0) return 0;
223
224 /* calculate b1 and b2 */
225 if (len1 > len2) {
226 b1 = len1 - len2 + b;
227 b2 = b;
228 } else {
229 b1 = b;
230 b2 = len2 - len1 + b;
231 }
232 if (b1 > len1) b1 = len1;
233 if (b2 > len2) b2 = len2;
234 --seq1; --seq2;
235
236 /* allocate memory */
237 end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1);
238 dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1));
239 for (j = 0; j <= len2; ++j)
240 dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end);
241 for (j = b2 + 1; j <= len2; ++j)
242 dpcell[j] -= j - b2;
243 curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
244 last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1));
245
246 /* set first row */
247 SET_INF(*curr); curr->M = 0;
248 for (i = 1, s = curr + 1; i < b1; ++i, ++s) {
249 SET_INF(*s);
250 set_end_D(s->D, dpcell[0] + i, s - 1);
251 }
252 s = curr; curr = last; last = s;
253
254 /* core dynamic programming, part 1 */
255 tmp_end = (b2 < len2)? b2 : len2 - 1;
256 for (j = 1; j <= tmp_end; ++j) {
257 q = dpcell[j]; s = curr; SET_INF(*s);
258 set_end_I(s->I, q, last);
259 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
260 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
261 ++s; ++q;
262 for (i = 1; i != end; ++i, ++s, ++q) {
263 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
264 set_I(s->I, q, last + i);
265 set_D(s->D, q, s - 1);
266 }
267 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
268 set_D(s->D, q, s - 1);
269 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
270 set_end_I(s->I, q, last + i);
271 } else s->I = MINOR_INF;
272 s = curr; curr = last; last = s;
273 }
274 /* last row for part 1, use set_end_D() instead of set_D() */
275 if (j == len2 && b2 != len2 - 1) {
276 q = dpcell[j]; s = curr; SET_INF(*s);
277 set_end_I(s->I, q, last);
278 end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1;
279 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
280 ++s; ++q;
281 for (i = 1; i != end; ++i, ++s, ++q) {
282 set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */
283 set_I(s->I, q, last + i);
284 set_end_D(s->D, q, s - 1);
285 }
286 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
287 set_end_D(s->D, q, s - 1);
288 if (j + b1 - 1 > len1) { /* bug fixed, 040227 */
289 set_end_I(s->I, q, last + i);
290 } else s->I = MINOR_INF;
291 s = curr; curr = last; last = s;
292 ++j;
293 }
294
295 /* core dynamic programming, part 2 */
296 for (; j <= len2 - b2 + 1; ++j) {
297 SET_INF(curr[j - b2]);
298 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
299 end = j + b1 - 1;
300 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) {
301 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
302 set_I(s->I, q, last + i);
303 set_D(s->D, q, s - 1);
304 }
305 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
306 set_D(s->D, q, s - 1);
307 s->I = MINOR_INF;
308 s = curr; curr = last; last = s;
309 }
310
311 /* core dynamic programming, part 3 */
312 for (; j < len2; ++j) {
313 SET_INF(curr[j - b2]);
314 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
315 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
316 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
317 set_I(s->I, q, last + i);
318 set_D(s->D, q, s - 1);
319 }
320 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
321 set_end_I(s->I, q, last + i);
322 set_D(s->D, q, s - 1);
323 s = curr; curr = last; last = s;
324 }
325 /* last row */
326 if (j == len2) {
327 SET_INF(curr[j - b2]);
328 mat = score_matrix + seq2[j] * N_MATRIX_ROW;
329 for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) {
330 set_M(s->M, q, last + i - 1, mat[seq1[i]]);
331 set_I(s->I, q, last + i);
332 set_end_D(s->D, q, s - 1);
333 }
334 set_M(s->M, q, last + len1 - 1, mat[seq1[i]]);
335 set_end_I(s->I, q, last + i);
336 set_end_D(s->D, q, s - 1);
337 s = curr; curr = last; last = s;
338 }
339
340 *_score = last[len1].M;
341 if (n_cigar) { /* backtrace */
342 path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2));
343 i = len1; j = len2;
344 q = dpcell[j] + i;
345 s = last + len1;
346 max = s->M; type = q->Mt; ctype = FROM_M;
347 if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; }
348 if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; }
349
350 p = path;
351 p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */
352 ++p;
353 do {
354 switch (ctype) {
355 case FROM_M: --i; --j; break;
356 case FROM_I: --j; break;
357 case FROM_D: --i; break;
358 }
359 q = dpcell[j] + i;
360 ctype = type;
361 switch (type) {
362 case FROM_M: type = q->Mt; break;
363 case FROM_I: type = q->It; break;
364 case FROM_D: type = q->Dt; break;
365 }
366 p->ctype = ctype; p->i = i; p->j = j;
367 ++p;
368 } while (i || j);
369 cigar = ka_path2cigar32(path, p - path - 1, n_cigar);
370 free(path);
371 }
372
373 /* free memory */
374 for (j = b2 + 1; j <= len2; ++j)
375 dpcell[j] += j - b2;
376 for (j = 0; j <= len2; ++j)
377 free(dpcell[j]);
378 free(dpcell);
379 free(curr); free(last);
380
381 return cigar;
382 }
383
384 typedef struct {
385 int M, I, D;
386 } score_aux_t;
387
388 #define MINUS_INF -0x40000000
389
390 // matrix: len2 rows and len1 columns
391 int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap)
392 {
393
394 #define __score_aux(_p, _q0, _sc, _io, _ie, _do, _de) { \
395 int t1, t2; \
396 score_aux_t *_q; \
397 _q = _q0; \
398 _p->M = _q->M >= _q->I? _q->M : _q->I; \
399 _p->M = _p->M >= _q->D? _p->M : _q->D; \
400 _p->M += (_sc); \
401 ++_q; t1 = _q->M - _io - _ie; t2 = _q->I - _ie; _p->I = t1 >= t2? t1 : t2; \
402 _q = _p-1; t1 = _q->M - _do - _de; t2 = _q->D - _de; _p->D = t1 >= t2? t1 : t2; \
403 }
404
405 int i, j, bw, scmat_size = ap->row, *scmat = ap->matrix, ret;
406 const uint8_t *seq1, *seq2;
407 score_aux_t *curr, *last, *swap;
408 bw = abs(len1 - len2) + ap->band_width;
409 i = len1 > len2? len1 : len2;
410 if (bw > i + 1) bw = i + 1;
411 seq1 = _seq1 - 1; seq2 = _seq2 - 1;
412 curr = calloc(len1 + 2, sizeof(score_aux_t));
413 last = calloc(len1 + 2, sizeof(score_aux_t));
414 { // the zero-th row
415 int x, end = len1;
416 score_aux_t *p;
417 j = 0;
418 x = j + bw; end = len1 < x? len1 : x; // band end
419 p = curr;
420 p->M = 0; p->I = p->D = MINUS_INF;
421 for (i = 1, p = &curr[1]; i <= end; ++i, ++p)
422 p->M = p->I = MINUS_INF, p->D = -(ap->edo + ap->ede * i);
423 p->M = p->I = p->D = MINUS_INF;
424 swap = curr; curr = last; last = swap;
425 }
426 for (j = 1; j < len2; ++j) {
427 int x, beg = 0, end = len1, *scrow, col_end;
428 score_aux_t *p;
429 x = j - bw; beg = 0 > x? 0 : x; // band start
430 x = j + bw; end = len1 < x? len1 : x; // band end
431 if (beg == 0) { // from zero-th column
432 p = curr;
433 p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
434 ++beg; // then beg = 1
435 }
436 scrow = scmat + seq2[j] * scmat_size;
437 if (end == len1) col_end = 1, --end;
438 else col_end = 0;
439 for (i = beg, p = &curr[beg]; i <= end; ++i, ++p)
440 __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->ido, ap->ide);
441 if (col_end) {
442 __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->ido, ap->ide);
443 ++p;
444 }
445 p->M = p->I = p->D = MINUS_INF;
446 // for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
447 swap = curr; curr = last; last = swap;
448 }
449 { // the last row
450 int x, beg = 0, *scrow;
451 score_aux_t *p;
452 j = len2;
453 x = j - bw; beg = 0 > x? 0 : x; // band start
454 if (beg == 0) { // from zero-th column
455 p = curr;
456 p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
457 ++beg; // then beg = 1
458 }
459 scrow = scmat + seq2[j] * scmat_size;
460 for (i = beg, p = &curr[beg]; i < len1; ++i, ++p)
461 __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->edo, ap->ede);
462 __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->edo, ap->ede);
463 // for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
464 }
465 ret = curr[len1].M >= curr[len1].I? curr[len1].M : curr[len1].I;
466 ret = ret >= curr[len1].D? ret : curr[len1].D;
467 free(curr); free(last);
468 return ret;
469 }
470
471 #ifdef _MAIN
472 int main(int argc, char *argv[])
473 {
474 // int len1 = 35, len2 = 35;
475 // uint8_t *seq1 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\0\1";
476 // uint8_t *seq2 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\1\0";
477 int len1 = 4, len2 = 4;
478 uint8_t *seq1 = (uint8_t*)"\1\0\0\1";
479 uint8_t *seq2 = (uint8_t*)"\1\0\1\0";
480 int sc;
481 // ka_global_core(seq1, 2, seq2, 1, &ka_param_qual, &sc, 0);
482 sc = ka_global_score(seq1, len1, seq2, len2, &ka_param2_qual);
483 printf("%d\n", sc);
484 return 0;
485 }
486 #endif