Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/cram/cram_stats.c @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfa3745e5fd8 |
---|---|
1 /* | |
2 Copyright (c) 2012-2013 Genome Research Ltd. | |
3 Author: James Bonfield <jkb@sanger.ac.uk> | |
4 | |
5 Redistribution and use in source and binary forms, with or without | |
6 modification, are permitted provided that the following conditions are met: | |
7 | |
8 1. Redistributions of source code must retain the above copyright notice, | |
9 this list of conditions and the following disclaimer. | |
10 | |
11 2. Redistributions in binary form must reproduce the above copyright notice, | |
12 this list of conditions and the following disclaimer in the documentation | |
13 and/or other materials provided with the distribution. | |
14 | |
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger | |
16 Institute nor the names of its contributors may be used to endorse or promote | |
17 products derived from this software without specific prior written permission. | |
18 | |
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND | |
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE | |
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 */ | |
30 | |
31 #ifdef HAVE_CONFIG_H | |
32 #include "io_lib_config.h" | |
33 #endif | |
34 | |
35 #include <stdio.h> | |
36 #include <errno.h> | |
37 #include <assert.h> | |
38 #include <stdlib.h> | |
39 #include <string.h> | |
40 #include <zlib.h> | |
41 #include <sys/types.h> | |
42 #include <sys/stat.h> | |
43 #include <math.h> | |
44 #include <ctype.h> | |
45 | |
46 #include "cram/cram.h" | |
47 #include "cram/os.h" | |
48 | |
49 cram_stats *cram_stats_create(void) { | |
50 return calloc(1, sizeof(cram_stats)); | |
51 } | |
52 | |
53 void cram_stats_add(cram_stats *st, int32_t val) { | |
54 st->nsamp++; | |
55 | |
56 //assert(val >= 0); | |
57 | |
58 if (val < MAX_STAT_VAL && val >= 0) { | |
59 st->freqs[val]++; | |
60 } else { | |
61 khint_t k; | |
62 int r; | |
63 | |
64 if (!st->h) { | |
65 st->h = kh_init(m_i2i); | |
66 } | |
67 | |
68 k = kh_put(m_i2i, st->h, val, &r); | |
69 if (r == 0) | |
70 kh_val(st->h, k)++; | |
71 else if (r != -1) | |
72 kh_val(st->h, k) = 1; | |
73 else | |
74 ; // FIXME: handle error | |
75 } | |
76 } | |
77 | |
78 void cram_stats_del(cram_stats *st, int32_t val) { | |
79 st->nsamp--; | |
80 | |
81 //assert(val >= 0); | |
82 | |
83 if (val < MAX_STAT_VAL && val >= 0) { | |
84 st->freqs[val]--; | |
85 assert(st->freqs[val] >= 0); | |
86 } else if (st->h) { | |
87 khint_t k = kh_get(m_i2i, st->h, val); | |
88 | |
89 if (k != kh_end(st->h)) { | |
90 if (--kh_val(st->h, k) == 0) | |
91 kh_del(m_i2i, st->h, k); | |
92 } else { | |
93 fprintf(stderr, "Failed to remove val %d from cram_stats\n", val); | |
94 st->nsamp++; | |
95 } | |
96 } else { | |
97 fprintf(stderr, "Failed to remove val %d from cram_stats\n", val); | |
98 st->nsamp++; | |
99 } | |
100 } | |
101 | |
102 void cram_stats_dump(cram_stats *st) { | |
103 int i; | |
104 fprintf(stderr, "cram_stats:\n"); | |
105 for (i = 0; i < MAX_STAT_VAL; i++) { | |
106 if (!st->freqs[i]) | |
107 continue; | |
108 fprintf(stderr, "\t%d\t%d\n", i, st->freqs[i]); | |
109 } | |
110 if (st->h) { | |
111 khint_t k; | |
112 for (k = kh_begin(st->h); k != kh_end(st->h); k++) { | |
113 if (!kh_exist(st->h, k)) | |
114 continue; | |
115 | |
116 fprintf(stderr, "\t%d\t%d\n", kh_key(st->h, k), kh_val(st->h, k)); | |
117 } | |
118 } | |
119 } | |
120 | |
121 #if 1 | |
122 /* Returns the number of bits set in val; it the highest bit used */ | |
123 static int nbits(int v) { | |
124 static const int MultiplyDeBruijnBitPosition[32] = { | |
125 1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31, | |
126 9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32 | |
127 }; | |
128 | |
129 v |= v >> 1; // first up to set all bits 1 after the first 1 */ | |
130 v |= v >> 2; | |
131 v |= v >> 4; | |
132 v |= v >> 8; | |
133 v |= v >> 16; | |
134 | |
135 // DeBruijn magic to find top bit | |
136 return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27]; | |
137 } | |
138 #endif | |
139 | |
140 /* | |
141 * Computes entropy from integer frequencies for various encoding methods and | |
142 * picks the best encoding. | |
143 * | |
144 * FIXME: we could reuse some of the code here for the actual encoding | |
145 * parameters too. Eg the best 'k' for SUBEXP or the code lengths for huffman. | |
146 * | |
147 * Returns the best codec to use. | |
148 */ | |
149 enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) { | |
150 enum cram_encoding best_encoding = E_NULL; | |
151 int best_size = INT_MAX, bits; | |
152 int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX, k; | |
153 int *vals = NULL, *freqs = NULL, vals_alloc = 0, *codes; | |
154 | |
155 //cram_stats_dump(st); | |
156 | |
157 /* Count number of unique symbols */ | |
158 for (nvals = i = 0; i < MAX_STAT_VAL; i++) { | |
159 if (!st->freqs[i]) | |
160 continue; | |
161 if (nvals >= vals_alloc) { | |
162 vals_alloc = vals_alloc ? vals_alloc*2 : 1024; | |
163 vals = realloc(vals, vals_alloc * sizeof(int)); | |
164 freqs = realloc(freqs, vals_alloc * sizeof(int)); | |
165 if (!vals || !freqs) { | |
166 if (vals) free(vals); | |
167 if (freqs) free(freqs); | |
168 return E_HUFFMAN; // Cannot do much else atm | |
169 } | |
170 } | |
171 vals[nvals] = i; | |
172 freqs[nvals] = st->freqs[i]; | |
173 ntot += freqs[nvals]; | |
174 if (max_val < i) max_val = i; | |
175 if (min_val > i) min_val = i; | |
176 nvals++; | |
177 } | |
178 if (st->h) { | |
179 khint_t k; | |
180 int i; | |
181 | |
182 for (k = kh_begin(st->h); k != kh_end(st->h); k++) { | |
183 if (!kh_exist(st->h, k)) | |
184 continue; | |
185 | |
186 if (nvals >= vals_alloc) { | |
187 vals_alloc = vals_alloc ? vals_alloc*2 : 1024; | |
188 vals = realloc(vals, vals_alloc * sizeof(int)); | |
189 freqs = realloc(freqs, vals_alloc * sizeof(int)); | |
190 if (!vals || !freqs) | |
191 return E_HUFFMAN; // Cannot do much else atm | |
192 } | |
193 i = kh_key(st->h, k); | |
194 vals[nvals]=i; | |
195 freqs[nvals] = kh_val(st->h, k); | |
196 ntot += freqs[nvals]; | |
197 if (max_val < i) max_val = i; | |
198 if (min_val > i) min_val = i; | |
199 nvals++; | |
200 } | |
201 } | |
202 | |
203 st->nvals = nvals; | |
204 assert(ntot == st->nsamp); | |
205 | |
206 if (nvals <= 1) { | |
207 free(vals); | |
208 free(freqs); | |
209 return E_HUFFMAN; | |
210 } | |
211 | |
212 if (fd->verbose > 1) | |
213 fprintf(stderr, "Range = %d..%d, nvals=%d, ntot=%d\n", | |
214 min_val, max_val, nvals, ntot); | |
215 | |
216 /* Theoretical entropy */ | |
217 // if (fd->verbose > 1) { | |
218 // double dbits = 0; | |
219 // for (i = 0; i < nvals; i++) { | |
220 // dbits += freqs[i] * log((double)freqs[i]/ntot); | |
221 // } | |
222 // dbits /= -log(2); | |
223 // if (fd->verbose > 1) | |
224 // fprintf(stderr, "Entropy = %f\n", dbits); | |
225 // } | |
226 | |
227 if (nvals > 1 && ntot > 256) { | |
228 #if 0 | |
229 /* | |
230 * CRUDE huffman estimator. Round to closest and round up from 0 | |
231 * to 1 bit. | |
232 * | |
233 * With and without ITF8 incase we have a few discrete values but with | |
234 * large magnitude. | |
235 * | |
236 * Note rans0/arith0 and Z_HUFFMAN_ONLY vs internal huffman can be | |
237 * compared in this way, but order-1 (eg rans1) or maybe LZ77 modes | |
238 * may detect the correlation of high bytes to low bytes in multi- | |
239 * byte values. So this predictor breaks down. | |
240 */ | |
241 double dbits = 0; // entropy + ~huffman | |
242 double dbitsH = 0; | |
243 double dbitsE = 0; // external entropy + ~huffman | |
244 double dbitsEH = 0; | |
245 int F[256] = {0}, n = 0; | |
246 double e = 0; // accumulated error bits | |
247 for (i = 0; i < nvals; i++) { | |
248 double x; int X; | |
249 unsigned int v = vals[i]; | |
250 | |
251 //Better encoding would cope with sign. | |
252 //v = ABS(vals[i])*2+(vals[i]<0); | |
253 | |
254 if (!(v & ~0x7f)) { | |
255 F[v] += freqs[i], n+=freqs[i]; | |
256 } else if (!(v & ~0x3fff)) { | |
257 F[(v>>8) |0x80] += freqs[i]; | |
258 F[ v &0xff] += freqs[i], n+=2*freqs[i]; | |
259 } else if (!(v & ~0x1fffff)) { | |
260 F[(v>>16)|0xc0] += freqs[i]; | |
261 F[(v>>8 )&0xff] += freqs[i]; | |
262 F[ v &0xff] += freqs[i], n+=3*freqs[i]; | |
263 } else if (!(v & ~0x0fffffff)) { | |
264 F[(v>>24)|0xe0] += freqs[i]; | |
265 F[(v>>16)&0xff] += freqs[i]; | |
266 F[(v>>8 )&0xff] += freqs[i]; | |
267 F[ v &0xff] += freqs[i], n+=4*freqs[i]; | |
268 } else { | |
269 F[(v>>28)|0xf0] += freqs[i]; | |
270 F[(v>>20)&0xff] += freqs[i]; | |
271 F[(v>>12)&0xff] += freqs[i]; | |
272 F[(v>>4 )&0xff] += freqs[i]; | |
273 F[ v &0x0f] += freqs[i], n+=5*freqs[i]; | |
274 } | |
275 | |
276 x = -log((double)freqs[i]/ntot)/.69314718055994530941; | |
277 X = x+0.5; | |
278 if ((int)(x+((double)e/freqs[i])+.5)>X) { | |
279 X++; | |
280 } else if ((int)(x+((double)e/freqs[i])+.5)<X) { | |
281 X--; | |
282 } | |
283 e-=freqs[i]*(X-x); | |
284 X += (X==0); | |
285 | |
286 //fprintf(stderr, "Val %d = %d x %d (ent %f, %d) e %f\n", i, v, freqs[i], x, X, e); | |
287 | |
288 dbits += freqs[i] * x; | |
289 dbitsH += freqs[i] * X; | |
290 } | |
291 | |
292 for (i = 0; i < 256; i++) { | |
293 if (F[i]) { | |
294 double x = -log((double)F[i]/n)/.69314718055994530941; | |
295 int X = x+0.5; | |
296 X += (X==0); | |
297 dbitsE += F[i] * x; | |
298 dbitsEH += F[i] * X; | |
299 | |
300 //fprintf(stderr, "Val %d = %d x %d (e %f, %d)\n", i, i, F[i], x, X); | |
301 } | |
302 } | |
303 | |
304 //fprintf(stderr, "CORE Entropy = %f, %f\n", dbits/8, dbitsH/8); | |
305 //fprintf(stderr, "Ext. Entropy = %f, %f\n", dbitsE/8, dbitsEH/8); | |
306 | |
307 if (dbitsE < 1000 || dbitsE / dbits > 1.1) { | |
308 //fprintf(stderr, "=> %d < 200 ? E_HUFFMAN : E_BETA\n", nvals); | |
309 free(vals); free(freqs); | |
310 return nvals < 200 ? E_HUFFMAN : E_BETA; | |
311 } | |
312 #endif | |
313 free(vals); free(freqs); | |
314 return E_EXTERNAL; | |
315 } | |
316 | |
317 /* | |
318 * Avoid complex stats for now, just do heuristic of HUFFMAN for small | |
319 * alphabets and BETA for anything large. | |
320 */ | |
321 free(vals); free(freqs); | |
322 return nvals < 200 ? E_HUFFMAN : E_BETA; | |
323 //return E_HUFFMAN; | |
324 //return E_EXTERNAL; | |
325 | |
326 | |
327 /* We only support huffman now anyway... */ | |
328 //free(vals); free(freqs); return E_HUFFMAN; | |
329 | |
330 /* Beta */ | |
331 bits = nbits(max_val - min_val) * ntot; | |
332 if (fd->verbose > 1) | |
333 fprintf(stderr, "BETA = %d\n", bits); | |
334 if (best_size > bits) | |
335 best_size = bits, best_encoding = E_BETA; | |
336 | |
337 #if 0 | |
338 /* Unary */ | |
339 if (min_val >= 0) { | |
340 for (bits = i = 0; i < nvals; i++) | |
341 bits += freqs[i]*(vals[i]+1); | |
342 if (fd->verbose > 1) | |
343 fprintf(stderr, "UNARY = %d\n", bits); | |
344 if (best_size > bits) | |
345 best_size = bits, best_encoding = E_NULL; //E_UNARY; | |
346 } | |
347 | |
348 /* Gamma */ | |
349 for (bits = i = 0; i < nvals; i++) | |
350 bits += ((nbits(vals[i]-min_val+1)-1) + nbits(vals[i]-min_val+1)) * freqs[i]; | |
351 if (fd->verbose > 1) | |
352 fprintf(stderr, "GAMMA = %d\n", bits); | |
353 if (best_size > bits) | |
354 best_size = bits, best_encoding = E_GAMMA; | |
355 | |
356 /* Subexponential */ | |
357 for (k = 0; k < 10; k++) { | |
358 for (bits = i = 0; i < nvals; i++) { | |
359 if (vals[i]-min_val < (1<<k)) | |
360 bits += (1 + k)*freqs[i]; | |
361 else | |
362 bits += (nbits(vals[i]-min_val)*2-k)*freqs[i]; | |
363 } | |
364 | |
365 if (fd->verbose > 1) | |
366 fprintf(stderr, "SUBEXP%d = %d\n", k, bits); | |
367 if (best_size > bits) | |
368 best_size = bits, best_encoding = E_SUBEXP; | |
369 } | |
370 #endif | |
371 | |
372 /* byte array len */ | |
373 | |
374 /* byte array stop */ | |
375 | |
376 /* External? Guesswork! */ | |
377 | |
378 /* Huffman */ | |
379 // qsort(freqs, nvals, sizeof(freqs[0]), sort_freqs); | |
380 // for (i = 0; i < nvals; i++) { | |
381 // fprintf(stderr, "%d = %d\n", i, freqs[i]); | |
382 // vals[i] = 0; | |
383 // } | |
384 | |
385 /* Grow freqs to 2*freqs, to store sums */ | |
386 /* Vals holds link data */ | |
387 freqs = realloc(freqs, 2*nvals*sizeof(*freqs)); | |
388 codes = calloc(2*nvals, sizeof(*codes)); | |
389 if (!freqs || !codes) | |
390 return E_HUFFMAN; // Cannot do much else atm | |
391 | |
392 /* Inefficient, use pointers to form chain so we can insert and maintain | |
393 * a sorted list? This is currently O(nvals^2) complexity. | |
394 */ | |
395 for (;;) { | |
396 int low1 = INT_MAX, low2 = INT_MAX; | |
397 int ind1 = 0, ind2 = 0; | |
398 for (i = 0; i < nvals; i++) { | |
399 if (freqs[i] < 0) | |
400 continue; | |
401 if (low1 > freqs[i]) | |
402 low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i; | |
403 else if (low2 > freqs[i]) | |
404 low2 = freqs[i], ind2 = i; | |
405 } | |
406 if (low2 == INT_MAX) | |
407 break; | |
408 | |
409 //fprintf(stderr, "Merge ind %d (%d), %d (%d) = %d+%d, => %d=%d\n", | |
410 // ind1, vals[ind1], ind2, vals[ind2], low1, low2, | |
411 // nvals, low1+low2); | |
412 | |
413 freqs[nvals] = low1 + low2; | |
414 codes[ind1] = nvals; | |
415 codes[ind2] = nvals; | |
416 freqs[ind1] *= -1; | |
417 freqs[ind2] *= -1; | |
418 nvals++; | |
419 } | |
420 nvals = nvals/2+1; | |
421 | |
422 for (i = 0; i < nvals; i++) { | |
423 int code_len = 0; | |
424 for (k = codes[i]; k; k = codes[k]) | |
425 code_len++; | |
426 codes[i] = code_len; | |
427 freqs[i] *= -1; | |
428 //fprintf(stderr, "%d / %d => %d\n", vals[i], freqs[i], codes[i]); | |
429 } | |
430 | |
431 for (bits = i = 0; i < nvals; i++) { | |
432 bits += freqs[i] * codes[i]; | |
433 } | |
434 if (fd->verbose > 1) | |
435 fprintf(stderr, "HUFFMAN = %d\n", bits); | |
436 if (best_size >= bits) | |
437 best_size = bits, best_encoding = E_HUFFMAN; | |
438 free(codes); | |
439 | |
440 free(vals); | |
441 free(freqs); | |
442 | |
443 return best_encoding; | |
444 } | |
445 | |
446 void cram_stats_free(cram_stats *st) { | |
447 if (st->h) | |
448 kh_destroy(m_i2i, st->h); | |
449 free(st); | |
450 } |