comparison pyPRADA_1.2/tools/bwa-0.5.7-mh/bwt.c @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc2ca1a3ba4
1 /* The MIT License
2
3 Copyright (c) 2008 Genome Research Ltd (GRL).
4
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
12
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE.
24 */
25
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <assert.h>
32 #include <stdint.h>
33 #include "utils.h"
34 #include "bwt.h"
35
36 void bwt_gen_cnt_table(bwt_t *bwt)
37 {
38 int i, j;
39 for (i = 0; i != 256; ++i) {
40 uint32_t x = 0;
41 for (j = 0; j != 4; ++j)
42 x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3);
43 bwt->cnt_table[i] = x;
44 }
45 }
46
47 // bwt->bwt and bwt->occ must be precalculated
48 void bwt_cal_sa(bwt_t *bwt, int intv)
49 {
50 bwtint_t isa, sa, i; // S(isa) = sa
51
52 xassert(bwt->bwt, "bwt_t::bwt is not initialized.");
53
54 if (bwt->sa) free(bwt->sa);
55 bwt->sa_intv = intv;
56 bwt->n_sa = (bwt->seq_len + intv) / intv;
57 bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
58 // calculate SA value
59 isa = 0; sa = bwt->seq_len;
60 for (i = 0; i < bwt->seq_len; ++i) {
61 if (isa % intv == 0) bwt->sa[isa/intv] = sa;
62 --sa;
63 isa = bwt_invPsi(bwt, isa);
64 }
65 if (isa % intv == 0) bwt->sa[isa/intv] = sa;
66 bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
67 }
68
69 bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
70 {
71 bwtint_t sa = 0;
72 while (k % bwt->sa_intv != 0) {
73 ++sa;
74 k = bwt_invPsi(bwt, k);
75 }
76 /* without setting bwt->sa[0] = -1, the following line should be
77 changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */
78 return sa + bwt->sa[k/bwt->sa_intv];
79 }
80
81 static inline int __occ_aux(uint64_t y, int c)
82 {
83 // reduce nucleotide counting to bits counting
84 y = ((c&2)? y : ~y) >> 1 & ((c&1)? y : ~y) & 0x5555555555555555ull;
85 // count the number of 1s in y
86 y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull);
87 return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56;
88 }
89
90 inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
91 {
92 bwtint_t n, l, j;
93 uint32_t *p;
94
95 if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
96 if (k == (bwtint_t)(-1)) return 0;
97 if (k >= bwt->primary) --k; // because $ is not in bwt
98
99 // retrieve Occ at k/OCC_INTERVAL
100 n = (p = bwt_occ_intv(bwt, k))[c];
101 p += 4; // jump to the start of the first BWT cell
102
103 // calculate Occ up to the last k/32
104 j = k >> 5 << 5;
105 for (l = k/OCC_INTERVAL*OCC_INTERVAL; l < j; l += 32, p += 2)
106 n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
107
108 // calculate Occ
109 n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
110 if (c == 0) n -= ~k&31; // corrected for the masked bits
111
112 return n;
113 }
114
115 // an analogy to bwt_occ() but more efficient, requiring k <= l
116 inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol)
117 {
118 bwtint_t _k, _l;
119 if (k == l) {
120 *ok = *ol = bwt_occ(bwt, k, c);
121 return;
122 }
123 _k = (k >= bwt->primary)? k-1 : k;
124 _l = (l >= bwt->primary)? l-1 : l;
125 if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
126 *ok = bwt_occ(bwt, k, c);
127 *ol = bwt_occ(bwt, l, c);
128 } else {
129 bwtint_t m, n, i, j;
130 uint32_t *p;
131 if (k >= bwt->primary) --k;
132 if (l >= bwt->primary) --l;
133 n = (p = bwt_occ_intv(bwt, k))[c];
134 p += 4;
135 // calculate *ok
136 j = k >> 5 << 5;
137 for (i = k/OCC_INTERVAL*OCC_INTERVAL; i < j; i += 32, p += 2)
138 n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
139 m = n;
140 n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
141 if (c == 0) n -= ~k&31; // corrected for the masked bits
142 *ok = n;
143 // calculate *ol
144 j = l >> 5 << 5;
145 for (; i < j; i += 32, p += 2)
146 m += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
147 m += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~l&31)<<1)) - 1), c);
148 if (c == 0) m -= ~l&31; // corrected for the masked bits
149 *ol = m;
150 }
151 }
152
153 #define __occ_aux4(bwt, b) \
154 ((bwt)->cnt_table[(b)&0xff] + (bwt)->cnt_table[(b)>>8&0xff] \
155 + (bwt)->cnt_table[(b)>>16&0xff] + (bwt)->cnt_table[(b)>>24])
156
157 inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
158 {
159 bwtint_t l, j, x;
160 uint32_t *p;
161 if (k == (bwtint_t)(-1)) {
162 memset(cnt, 0, 4 * sizeof(bwtint_t));
163 return;
164 }
165 if (k >= bwt->primary) --k; // because $ is not in bwt
166 p = bwt_occ_intv(bwt, k);
167 memcpy(cnt, p, 16);
168 p += 4;
169 j = k >> 4 << 4;
170 for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p)
171 x += __occ_aux4(bwt, *p);
172 x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
173 cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
174 }
175
176 // an analogy to bwt_occ4() but more efficient, requiring k <= l
177 inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
178 {
179 bwtint_t _k, _l;
180 if (k == l) {
181 bwt_occ4(bwt, k, cntk);
182 memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
183 return;
184 }
185 _k = (k >= bwt->primary)? k-1 : k;
186 _l = (l >= bwt->primary)? l-1 : l;
187 if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
188 bwt_occ4(bwt, k, cntk);
189 bwt_occ4(bwt, l, cntl);
190 } else {
191 bwtint_t i, j, x, y;
192 uint32_t *p;
193 int cl[4];
194 if (k >= bwt->primary) --k; // because $ is not in bwt
195 if (l >= bwt->primary) --l;
196 cl[0] = cl[1] = cl[2] = cl[3] = 0;
197 p = bwt_occ_intv(bwt, k);
198 memcpy(cntk, p, 4 * sizeof(bwtint_t));
199 p += 4;
200 // prepare cntk[]
201 j = k >> 4 << 4;
202 for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i += 16, ++p)
203 x += __occ_aux4(bwt, *p);
204 y = x;
205 x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
206 // calculate cntl[] and finalize cntk[]
207 j = l >> 4 << 4;
208 for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p);
209 y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15);
210 memcpy(cntl, cntk, 16);
211 cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24;
212 cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24;
213 }
214 }
215
216 int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end)
217 {
218 bwtint_t k, l, ok, ol;
219 int i;
220 k = 0; l = bwt->seq_len;
221 for (i = len - 1; i >= 0; --i) {
222 ubyte_t c = str[i];
223 if (c > 3) return 0; // no match
224 bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
225 k = bwt->L2[c] + ok + 1;
226 l = bwt->L2[c] + ol;
227 if (k > l) break; // no match
228 }
229 if (k > l) return 0; // no match
230 if (sa_begin) *sa_begin = k;
231 if (sa_end) *sa_end = l;
232 return l - k + 1;
233 }
234
235 int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0)
236 {
237 int i;
238 bwtint_t k, l, ok, ol;
239 k = *k0; l = *l0;
240 for (i = len - 1; i >= 0; --i) {
241 ubyte_t c = str[i];
242 if (c > 3) return 0; // there is an N here. no match
243 bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
244 k = bwt->L2[c] + ok + 1;
245 l = bwt->L2[c] + ol;
246 if (k > l) return 0; // no match
247 }
248 *k0 = k; *l0 = l;
249 return l - k + 1;
250 }