0
|
1 /* The MIT License
|
|
2
|
|
3 Copyright (c) 2008 Genome Research Ltd (GRL).
|
|
4
|
|
5 Permission is hereby granted, free of charge, to any person obtaining
|
|
6 a copy of this software and associated documentation files (the
|
|
7 "Software"), to deal in the Software without restriction, including
|
|
8 without limitation the rights to use, copy, modify, merge, publish,
|
|
9 distribute, sublicense, and/or sell copies of the Software, and to
|
|
10 permit persons to whom the Software is furnished to do so, subject to
|
|
11 the following conditions:
|
|
12
|
|
13 The above copyright notice and this permission notice shall be
|
|
14 included in all copies or substantial portions of the Software.
|
|
15
|
|
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
|
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
|
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
23 SOFTWARE.
|
|
24 */
|
|
25
|
|
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
|
|
27
|
|
28 #include <stdlib.h>
|
|
29 #include <stdio.h>
|
|
30 #include <string.h>
|
|
31 #include <unistd.h>
|
|
32 #include "bntseq.h"
|
|
33 #include "utils.h"
|
|
34 #include "main.h"
|
|
35 #include "bwt.h"
|
|
36
|
|
37 #ifdef _DIVBWT
|
|
38 #include "divsufsort.h"
|
|
39 #endif
|
|
40
|
|
41 int is_bwt(ubyte_t *T, int n);
|
|
42
|
|
43 int64_t bwa_seq_len(const char *fn_pac)
|
|
44 {
|
|
45 FILE *fp;
|
|
46 int64_t pac_len;
|
|
47 ubyte_t c;
|
|
48 fp = xopen(fn_pac, "rb");
|
|
49 fseek(fp, -1, SEEK_END);
|
|
50 pac_len = ftell(fp);
|
|
51 fread(&c, 1, 1, fp);
|
|
52 fclose(fp);
|
|
53 return (pac_len - 1) * 4 + (int)c;
|
|
54 }
|
|
55
|
|
56 bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
|
|
57 {
|
|
58 bwt_t *bwt;
|
|
59 ubyte_t *buf, *buf2;
|
|
60 int i, pac_size;
|
|
61 FILE *fp;
|
|
62
|
|
63 // initialization
|
|
64 bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
|
|
65 bwt->seq_len = bwa_seq_len(fn_pac);
|
|
66 bwt->bwt_size = (bwt->seq_len + 15) >> 4;
|
|
67 fp = xopen(fn_pac, "rb");
|
|
68
|
|
69 // prepare sequence
|
|
70 pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
|
|
71 buf2 = (ubyte_t*)calloc(pac_size, 1);
|
|
72 fread(buf2, 1, pac_size, fp);
|
|
73 fclose(fp);
|
|
74 memset(bwt->L2, 0, 5 * 4);
|
|
75 buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1);
|
|
76 for (i = 0; i < bwt->seq_len; ++i) {
|
|
77 buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
|
|
78 ++bwt->L2[1+buf[i]];
|
|
79 }
|
|
80 for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
|
|
81 free(buf2);
|
|
82
|
|
83 // Burrows-Wheeler Transform
|
|
84 if (use_is) {
|
|
85 bwt->primary = is_bwt(buf, bwt->seq_len);
|
|
86 } else {
|
|
87 #ifdef _DIVBWT
|
|
88 bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
|
|
89 #else
|
|
90 err_fatal_simple("libdivsufsort is not compiled in.");
|
|
91 #endif
|
|
92 }
|
|
93 bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4);
|
|
94 for (i = 0; i < bwt->seq_len; ++i)
|
|
95 bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
|
|
96 free(buf);
|
|
97 return bwt;
|
|
98 }
|
|
99
|
|
100 int bwa_pac2bwt(int argc, char *argv[])
|
|
101 {
|
|
102 bwt_t *bwt;
|
|
103 int c, use_is = 1;
|
|
104 while ((c = getopt(argc, argv, "d")) >= 0) {
|
|
105 switch (c) {
|
|
106 case 'd': use_is = 0; break;
|
|
107 default: return 1;
|
|
108 }
|
|
109 }
|
|
110 if (optind + 2 > argc) {
|
|
111 fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
|
|
112 return 1;
|
|
113 }
|
|
114 bwt = bwt_pac2bwt(argv[optind], use_is);
|
|
115 bwt_dump_bwt(argv[optind+1], bwt);
|
|
116 bwt_destroy(bwt);
|
|
117 return 0;
|
|
118 }
|
|
119
|
|
120 #define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
|
|
121
|
|
122 void bwt_bwtupdate_core(bwt_t *bwt)
|
|
123 {
|
|
124 bwtint_t i, k, c[4], n_occ;
|
|
125 uint32_t *buf;
|
|
126
|
|
127 n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
|
|
128 bwt->bwt_size += n_occ * 4; // the new size
|
|
129 buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt
|
|
130 c[0] = c[1] = c[2] = c[3] = 0;
|
|
131 for (i = k = 0; i < bwt->seq_len; ++i) {
|
|
132 if (i % OCC_INTERVAL == 0) {
|
|
133 memcpy(buf + k, c, sizeof(bwtint_t) * 4);
|
|
134 k += 4;
|
|
135 }
|
|
136 if (i % 16 == 0) buf[k++] = bwt->bwt[i/16];
|
|
137 ++c[bwt_B00(bwt, i)];
|
|
138 }
|
|
139 // the last element
|
|
140 memcpy(buf + k, c, sizeof(bwtint_t) * 4);
|
|
141 xassert(k + 4 == bwt->bwt_size, "inconsistent bwt_size");
|
|
142 // update bwt
|
|
143 free(bwt->bwt); bwt->bwt = buf;
|
|
144 }
|
|
145
|
|
146 int bwa_bwtupdate(int argc, char *argv[])
|
|
147 {
|
|
148 bwt_t *bwt;
|
|
149 if (argc < 2) {
|
|
150 fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
|
|
151 return 1;
|
|
152 }
|
|
153 bwt = bwt_restore_bwt(argv[1]);
|
|
154 bwt_bwtupdate_core(bwt);
|
|
155 bwt_dump_bwt(argv[1], bwt);
|
|
156 bwt_destroy(bwt);
|
|
157 return 0;
|
|
158 }
|
|
159
|
|
160 void bwa_pac_rev_core(const char *fn, const char *fn_rev)
|
|
161 {
|
|
162 int64_t seq_len, i;
|
|
163 bwtint_t pac_len, j;
|
|
164 ubyte_t *bufin, *bufout, ct;
|
|
165 FILE *fp;
|
|
166 seq_len = bwa_seq_len(fn);
|
|
167 pac_len = (seq_len >> 2) + 1;
|
|
168 bufin = (ubyte_t*)calloc(pac_len, 1);
|
|
169 bufout = (ubyte_t*)calloc(pac_len, 1);
|
|
170 fp = xopen(fn, "rb");
|
|
171 fread(bufin, 1, pac_len, fp);
|
|
172 fclose(fp);
|
|
173 for (i = seq_len - 1, j = 0; i >= 0; --i) {
|
|
174 int c = bufin[i>>2] >> ((~i&3)<<1) & 3;
|
|
175 bwtint_t j = seq_len - 1 - i;
|
|
176 bufout[j>>2] |= c << ((~j&3)<<1);
|
|
177 }
|
|
178 free(bufin);
|
|
179 fp = xopen(fn_rev, "wb");
|
|
180 fwrite(bufout, 1, pac_len, fp);
|
|
181 ct = seq_len % 4;
|
|
182 fwrite(&ct, 1, 1, fp);
|
|
183 fclose(fp);
|
|
184 free(bufout);
|
|
185 }
|
|
186
|
|
187 int bwa_pac_rev(int argc, char *argv[])
|
|
188 {
|
|
189 if (argc < 3) {
|
|
190 fprintf(stderr, "Usage: bwa pac_rev <in.pac> <out.pac>\n");
|
|
191 return 1;
|
|
192 }
|
|
193 bwa_pac_rev_core(argv[1], argv[2]);
|
|
194 return 0;
|
|
195 }
|
|
196
|
|
197 const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4};
|
|
198
|
|
199 /* this function is not memory efficient, but this will make life easier
|
|
200 Ideally we should also change .amb files as one 'N' in the nucleotide
|
|
201 sequence leads to two ambiguous colors. I may do this later... */
|
|
202 uint8_t *bwa_pac2cspac_core(const bntseq_t *bns)
|
|
203 {
|
|
204 uint8_t *pac, *cspac;
|
|
205 bwtint_t i;
|
|
206 int c1, c2;
|
|
207 pac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1);
|
|
208 cspac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1);
|
|
209 fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
|
|
210 rewind(bns->fp_pac);
|
|
211 c1 = pac[0]>>6; cspac[0] = c1<<6;
|
|
212 for (i = 1; i < bns->l_pac; ++i) {
|
|
213 c2 = pac[i>>2] >> (~i&3)*2 & 3;
|
|
214 cspac[i>>2] |= nst_color_space_table[(1<<c1)|(1<<c2)] << (~i&3)*2;
|
|
215 c1 = c2;
|
|
216 }
|
|
217 free(pac);
|
|
218 return cspac;
|
|
219 }
|
|
220
|
|
221 int bwa_pac2cspac(int argc, char *argv[])
|
|
222 {
|
|
223 bntseq_t *bns;
|
|
224 uint8_t *cspac, ct;
|
|
225 char *str;
|
|
226 FILE *fp;
|
|
227
|
|
228 if (argc < 3) {
|
|
229 fprintf(stderr, "Usage: bwa pac2cspac <in.nt.prefix> <out.cs.prefix>\n");
|
|
230 return 1;
|
|
231 }
|
|
232 bns = bns_restore(argv[1]);
|
|
233 cspac = bwa_pac2cspac_core(bns);
|
|
234 bns_dump(bns, argv[2]);
|
|
235 // now write cspac
|
|
236 str = (char*)calloc(strlen(argv[2]) + 5, 1);
|
|
237 strcat(strcpy(str, argv[2]), ".pac");
|
|
238 fp = xopen(str, "wb");
|
|
239 fwrite(cspac, 1, bns->l_pac/4 + 1, fp);
|
|
240 ct = bns->l_pac % 4;
|
|
241 fwrite(&ct, 1, 1, fp);
|
|
242 fclose(fp);
|
|
243 bns_destroy(bns);
|
|
244 free(cspac);
|
|
245 return 0;
|
|
246 }
|
|
247
|
|
248 int bwa_bwt2sa(int argc, char *argv[])
|
|
249 {
|
|
250 bwt_t *bwt;
|
|
251 int c, sa_intv = 32;
|
|
252 while ((c = getopt(argc, argv, "i:")) >= 0) {
|
|
253 switch (c) {
|
|
254 case 'i': sa_intv = atoi(optarg); break;
|
|
255 default: return 1;
|
|
256 }
|
|
257 }
|
|
258 if (optind + 2 > argc) {
|
|
259 fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
|
|
260 return 1;
|
|
261 }
|
|
262 bwt = bwt_restore_bwt(argv[optind]);
|
|
263 bwt_cal_sa(bwt, sa_intv);
|
|
264 bwt_dump_sa(argv[optind+1], bwt);
|
|
265 bwt_destroy(bwt);
|
|
266 return 0;
|
|
267 }
|