comparison pyPRADA_1.2/tools/bwa-0.5.7-mh/bwtmisc.c @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc2ca1a3ba4
1 /* The MIT License
2
3 Copyright (c) 2008 Genome Research Ltd (GRL).
4
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
12
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE.
24 */
25
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <unistd.h>
32 #include "bntseq.h"
33 #include "utils.h"
34 #include "main.h"
35 #include "bwt.h"
36
37 #ifdef _DIVBWT
38 #include "divsufsort.h"
39 #endif
40
41 int is_bwt(ubyte_t *T, int n);
42
43 int64_t bwa_seq_len(const char *fn_pac)
44 {
45 FILE *fp;
46 int64_t pac_len;
47 ubyte_t c;
48 fp = xopen(fn_pac, "rb");
49 fseek(fp, -1, SEEK_END);
50 pac_len = ftell(fp);
51 fread(&c, 1, 1, fp);
52 fclose(fp);
53 return (pac_len - 1) * 4 + (int)c;
54 }
55
56 bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
57 {
58 bwt_t *bwt;
59 ubyte_t *buf, *buf2;
60 int i, pac_size;
61 FILE *fp;
62
63 // initialization
64 bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
65 bwt->seq_len = bwa_seq_len(fn_pac);
66 bwt->bwt_size = (bwt->seq_len + 15) >> 4;
67 fp = xopen(fn_pac, "rb");
68
69 // prepare sequence
70 pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
71 buf2 = (ubyte_t*)calloc(pac_size, 1);
72 fread(buf2, 1, pac_size, fp);
73 fclose(fp);
74 memset(bwt->L2, 0, 5 * 4);
75 buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1);
76 for (i = 0; i < bwt->seq_len; ++i) {
77 buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
78 ++bwt->L2[1+buf[i]];
79 }
80 for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
81 free(buf2);
82
83 // Burrows-Wheeler Transform
84 if (use_is) {
85 bwt->primary = is_bwt(buf, bwt->seq_len);
86 } else {
87 #ifdef _DIVBWT
88 bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
89 #else
90 err_fatal_simple("libdivsufsort is not compiled in.");
91 #endif
92 }
93 bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4);
94 for (i = 0; i < bwt->seq_len; ++i)
95 bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
96 free(buf);
97 return bwt;
98 }
99
100 int bwa_pac2bwt(int argc, char *argv[])
101 {
102 bwt_t *bwt;
103 int c, use_is = 1;
104 while ((c = getopt(argc, argv, "d")) >= 0) {
105 switch (c) {
106 case 'd': use_is = 0; break;
107 default: return 1;
108 }
109 }
110 if (optind + 2 > argc) {
111 fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
112 return 1;
113 }
114 bwt = bwt_pac2bwt(argv[optind], use_is);
115 bwt_dump_bwt(argv[optind+1], bwt);
116 bwt_destroy(bwt);
117 return 0;
118 }
119
120 #define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
121
122 void bwt_bwtupdate_core(bwt_t *bwt)
123 {
124 bwtint_t i, k, c[4], n_occ;
125 uint32_t *buf;
126
127 n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
128 bwt->bwt_size += n_occ * 4; // the new size
129 buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt
130 c[0] = c[1] = c[2] = c[3] = 0;
131 for (i = k = 0; i < bwt->seq_len; ++i) {
132 if (i % OCC_INTERVAL == 0) {
133 memcpy(buf + k, c, sizeof(bwtint_t) * 4);
134 k += 4;
135 }
136 if (i % 16 == 0) buf[k++] = bwt->bwt[i/16];
137 ++c[bwt_B00(bwt, i)];
138 }
139 // the last element
140 memcpy(buf + k, c, sizeof(bwtint_t) * 4);
141 xassert(k + 4 == bwt->bwt_size, "inconsistent bwt_size");
142 // update bwt
143 free(bwt->bwt); bwt->bwt = buf;
144 }
145
146 int bwa_bwtupdate(int argc, char *argv[])
147 {
148 bwt_t *bwt;
149 if (argc < 2) {
150 fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
151 return 1;
152 }
153 bwt = bwt_restore_bwt(argv[1]);
154 bwt_bwtupdate_core(bwt);
155 bwt_dump_bwt(argv[1], bwt);
156 bwt_destroy(bwt);
157 return 0;
158 }
159
160 void bwa_pac_rev_core(const char *fn, const char *fn_rev)
161 {
162 int64_t seq_len, i;
163 bwtint_t pac_len, j;
164 ubyte_t *bufin, *bufout, ct;
165 FILE *fp;
166 seq_len = bwa_seq_len(fn);
167 pac_len = (seq_len >> 2) + 1;
168 bufin = (ubyte_t*)calloc(pac_len, 1);
169 bufout = (ubyte_t*)calloc(pac_len, 1);
170 fp = xopen(fn, "rb");
171 fread(bufin, 1, pac_len, fp);
172 fclose(fp);
173 for (i = seq_len - 1, j = 0; i >= 0; --i) {
174 int c = bufin[i>>2] >> ((~i&3)<<1) & 3;
175 bwtint_t j = seq_len - 1 - i;
176 bufout[j>>2] |= c << ((~j&3)<<1);
177 }
178 free(bufin);
179 fp = xopen(fn_rev, "wb");
180 fwrite(bufout, 1, pac_len, fp);
181 ct = seq_len % 4;
182 fwrite(&ct, 1, 1, fp);
183 fclose(fp);
184 free(bufout);
185 }
186
187 int bwa_pac_rev(int argc, char *argv[])
188 {
189 if (argc < 3) {
190 fprintf(stderr, "Usage: bwa pac_rev <in.pac> <out.pac>\n");
191 return 1;
192 }
193 bwa_pac_rev_core(argv[1], argv[2]);
194 return 0;
195 }
196
197 const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4};
198
199 /* this function is not memory efficient, but this will make life easier
200 Ideally we should also change .amb files as one 'N' in the nucleotide
201 sequence leads to two ambiguous colors. I may do this later... */
202 uint8_t *bwa_pac2cspac_core(const bntseq_t *bns)
203 {
204 uint8_t *pac, *cspac;
205 bwtint_t i;
206 int c1, c2;
207 pac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1);
208 cspac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1);
209 fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
210 rewind(bns->fp_pac);
211 c1 = pac[0]>>6; cspac[0] = c1<<6;
212 for (i = 1; i < bns->l_pac; ++i) {
213 c2 = pac[i>>2] >> (~i&3)*2 & 3;
214 cspac[i>>2] |= nst_color_space_table[(1<<c1)|(1<<c2)] << (~i&3)*2;
215 c1 = c2;
216 }
217 free(pac);
218 return cspac;
219 }
220
221 int bwa_pac2cspac(int argc, char *argv[])
222 {
223 bntseq_t *bns;
224 uint8_t *cspac, ct;
225 char *str;
226 FILE *fp;
227
228 if (argc < 3) {
229 fprintf(stderr, "Usage: bwa pac2cspac <in.nt.prefix> <out.cs.prefix>\n");
230 return 1;
231 }
232 bns = bns_restore(argv[1]);
233 cspac = bwa_pac2cspac_core(bns);
234 bns_dump(bns, argv[2]);
235 // now write cspac
236 str = (char*)calloc(strlen(argv[2]) + 5, 1);
237 strcat(strcpy(str, argv[2]), ".pac");
238 fp = xopen(str, "wb");
239 fwrite(cspac, 1, bns->l_pac/4 + 1, fp);
240 ct = bns->l_pac % 4;
241 fwrite(&ct, 1, 1, fp);
242 fclose(fp);
243 bns_destroy(bns);
244 free(cspac);
245 return 0;
246 }
247
248 int bwa_bwt2sa(int argc, char *argv[])
249 {
250 bwt_t *bwt;
251 int c, sa_intv = 32;
252 while ((c = getopt(argc, argv, "i:")) >= 0) {
253 switch (c) {
254 case 'i': sa_intv = atoi(optarg); break;
255 default: return 1;
256 }
257 }
258 if (optind + 2 > argc) {
259 fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
260 return 1;
261 }
262 bwt = bwt_restore_bwt(argv[optind]);
263 bwt_cal_sa(bwt, sa_intv);
264 bwt_dump_sa(argv[optind+1], bwt);
265 bwt_destroy(bwt);
266 return 0;
267 }