comparison ezBAMQC/src/htslib/cram/cram_stats.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /*
2 Copyright (c) 2012-2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #ifdef HAVE_CONFIG_H
32 #include "io_lib_config.h"
33 #endif
34
35 #include <stdio.h>
36 #include <errno.h>
37 #include <assert.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <zlib.h>
41 #include <sys/types.h>
42 #include <sys/stat.h>
43 #include <math.h>
44 #include <ctype.h>
45
46 #include "cram/cram.h"
47 #include "cram/os.h"
48
49 cram_stats *cram_stats_create(void) {
50 return calloc(1, sizeof(cram_stats));
51 }
52
53 void cram_stats_add(cram_stats *st, int32_t val) {
54 st->nsamp++;
55
56 //assert(val >= 0);
57
58 if (val < MAX_STAT_VAL && val >= 0) {
59 st->freqs[val]++;
60 } else {
61 khint_t k;
62 int r;
63
64 if (!st->h) {
65 st->h = kh_init(m_i2i);
66 }
67
68 k = kh_put(m_i2i, st->h, val, &r);
69 if (r == 0)
70 kh_val(st->h, k)++;
71 else if (r != -1)
72 kh_val(st->h, k) = 1;
73 else
74 ; // FIXME: handle error
75 }
76 }
77
78 void cram_stats_del(cram_stats *st, int32_t val) {
79 st->nsamp--;
80
81 //assert(val >= 0);
82
83 if (val < MAX_STAT_VAL && val >= 0) {
84 st->freqs[val]--;
85 assert(st->freqs[val] >= 0);
86 } else if (st->h) {
87 khint_t k = kh_get(m_i2i, st->h, val);
88
89 if (k != kh_end(st->h)) {
90 if (--kh_val(st->h, k) == 0)
91 kh_del(m_i2i, st->h, k);
92 } else {
93 fprintf(stderr, "Failed to remove val %d from cram_stats\n", val);
94 st->nsamp++;
95 }
96 } else {
97 fprintf(stderr, "Failed to remove val %d from cram_stats\n", val);
98 st->nsamp++;
99 }
100 }
101
102 void cram_stats_dump(cram_stats *st) {
103 int i;
104 fprintf(stderr, "cram_stats:\n");
105 for (i = 0; i < MAX_STAT_VAL; i++) {
106 if (!st->freqs[i])
107 continue;
108 fprintf(stderr, "\t%d\t%d\n", i, st->freqs[i]);
109 }
110 if (st->h) {
111 khint_t k;
112 for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
113 if (!kh_exist(st->h, k))
114 continue;
115
116 fprintf(stderr, "\t%d\t%d\n", kh_key(st->h, k), kh_val(st->h, k));
117 }
118 }
119 }
120
121 #if 1
122 /* Returns the number of bits set in val; it the highest bit used */
123 static int nbits(int v) {
124 static const int MultiplyDeBruijnBitPosition[32] = {
125 1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
126 9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
127 };
128
129 v |= v >> 1; // first up to set all bits 1 after the first 1 */
130 v |= v >> 2;
131 v |= v >> 4;
132 v |= v >> 8;
133 v |= v >> 16;
134
135 // DeBruijn magic to find top bit
136 return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
137 }
138 #endif
139
140 /*
141 * Computes entropy from integer frequencies for various encoding methods and
142 * picks the best encoding.
143 *
144 * FIXME: we could reuse some of the code here for the actual encoding
145 * parameters too. Eg the best 'k' for SUBEXP or the code lengths for huffman.
146 *
147 * Returns the best codec to use.
148 */
149 enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) {
150 enum cram_encoding best_encoding = E_NULL;
151 int best_size = INT_MAX, bits;
152 int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX, k;
153 int *vals = NULL, *freqs = NULL, vals_alloc = 0, *codes;
154
155 //cram_stats_dump(st);
156
157 /* Count number of unique symbols */
158 for (nvals = i = 0; i < MAX_STAT_VAL; i++) {
159 if (!st->freqs[i])
160 continue;
161 if (nvals >= vals_alloc) {
162 vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
163 vals = realloc(vals, vals_alloc * sizeof(int));
164 freqs = realloc(freqs, vals_alloc * sizeof(int));
165 if (!vals || !freqs) {
166 if (vals) free(vals);
167 if (freqs) free(freqs);
168 return E_HUFFMAN; // Cannot do much else atm
169 }
170 }
171 vals[nvals] = i;
172 freqs[nvals] = st->freqs[i];
173 ntot += freqs[nvals];
174 if (max_val < i) max_val = i;
175 if (min_val > i) min_val = i;
176 nvals++;
177 }
178 if (st->h) {
179 khint_t k;
180 int i;
181
182 for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
183 if (!kh_exist(st->h, k))
184 continue;
185
186 if (nvals >= vals_alloc) {
187 vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
188 vals = realloc(vals, vals_alloc * sizeof(int));
189 freqs = realloc(freqs, vals_alloc * sizeof(int));
190 if (!vals || !freqs)
191 return E_HUFFMAN; // Cannot do much else atm
192 }
193 i = kh_key(st->h, k);
194 vals[nvals]=i;
195 freqs[nvals] = kh_val(st->h, k);
196 ntot += freqs[nvals];
197 if (max_val < i) max_val = i;
198 if (min_val > i) min_val = i;
199 nvals++;
200 }
201 }
202
203 st->nvals = nvals;
204 assert(ntot == st->nsamp);
205
206 if (nvals <= 1) {
207 free(vals);
208 free(freqs);
209 return E_HUFFMAN;
210 }
211
212 if (fd->verbose > 1)
213 fprintf(stderr, "Range = %d..%d, nvals=%d, ntot=%d\n",
214 min_val, max_val, nvals, ntot);
215
216 /* Theoretical entropy */
217 // if (fd->verbose > 1) {
218 // double dbits = 0;
219 // for (i = 0; i < nvals; i++) {
220 // dbits += freqs[i] * log((double)freqs[i]/ntot);
221 // }
222 // dbits /= -log(2);
223 // if (fd->verbose > 1)
224 // fprintf(stderr, "Entropy = %f\n", dbits);
225 // }
226
227 if (nvals > 1 && ntot > 256) {
228 #if 0
229 /*
230 * CRUDE huffman estimator. Round to closest and round up from 0
231 * to 1 bit.
232 *
233 * With and without ITF8 incase we have a few discrete values but with
234 * large magnitude.
235 *
236 * Note rans0/arith0 and Z_HUFFMAN_ONLY vs internal huffman can be
237 * compared in this way, but order-1 (eg rans1) or maybe LZ77 modes
238 * may detect the correlation of high bytes to low bytes in multi-
239 * byte values. So this predictor breaks down.
240 */
241 double dbits = 0; // entropy + ~huffman
242 double dbitsH = 0;
243 double dbitsE = 0; // external entropy + ~huffman
244 double dbitsEH = 0;
245 int F[256] = {0}, n = 0;
246 double e = 0; // accumulated error bits
247 for (i = 0; i < nvals; i++) {
248 double x; int X;
249 unsigned int v = vals[i];
250
251 //Better encoding would cope with sign.
252 //v = ABS(vals[i])*2+(vals[i]<0);
253
254 if (!(v & ~0x7f)) {
255 F[v] += freqs[i], n+=freqs[i];
256 } else if (!(v & ~0x3fff)) {
257 F[(v>>8) |0x80] += freqs[i];
258 F[ v &0xff] += freqs[i], n+=2*freqs[i];
259 } else if (!(v & ~0x1fffff)) {
260 F[(v>>16)|0xc0] += freqs[i];
261 F[(v>>8 )&0xff] += freqs[i];
262 F[ v &0xff] += freqs[i], n+=3*freqs[i];
263 } else if (!(v & ~0x0fffffff)) {
264 F[(v>>24)|0xe0] += freqs[i];
265 F[(v>>16)&0xff] += freqs[i];
266 F[(v>>8 )&0xff] += freqs[i];
267 F[ v &0xff] += freqs[i], n+=4*freqs[i];
268 } else {
269 F[(v>>28)|0xf0] += freqs[i];
270 F[(v>>20)&0xff] += freqs[i];
271 F[(v>>12)&0xff] += freqs[i];
272 F[(v>>4 )&0xff] += freqs[i];
273 F[ v &0x0f] += freqs[i], n+=5*freqs[i];
274 }
275
276 x = -log((double)freqs[i]/ntot)/.69314718055994530941;
277 X = x+0.5;
278 if ((int)(x+((double)e/freqs[i])+.5)>X) {
279 X++;
280 } else if ((int)(x+((double)e/freqs[i])+.5)<X) {
281 X--;
282 }
283 e-=freqs[i]*(X-x);
284 X += (X==0);
285
286 //fprintf(stderr, "Val %d = %d x %d (ent %f, %d) e %f\n", i, v, freqs[i], x, X, e);
287
288 dbits += freqs[i] * x;
289 dbitsH += freqs[i] * X;
290 }
291
292 for (i = 0; i < 256; i++) {
293 if (F[i]) {
294 double x = -log((double)F[i]/n)/.69314718055994530941;
295 int X = x+0.5;
296 X += (X==0);
297 dbitsE += F[i] * x;
298 dbitsEH += F[i] * X;
299
300 //fprintf(stderr, "Val %d = %d x %d (e %f, %d)\n", i, i, F[i], x, X);
301 }
302 }
303
304 //fprintf(stderr, "CORE Entropy = %f, %f\n", dbits/8, dbitsH/8);
305 //fprintf(stderr, "Ext. Entropy = %f, %f\n", dbitsE/8, dbitsEH/8);
306
307 if (dbitsE < 1000 || dbitsE / dbits > 1.1) {
308 //fprintf(stderr, "=> %d < 200 ? E_HUFFMAN : E_BETA\n", nvals);
309 free(vals); free(freqs);
310 return nvals < 200 ? E_HUFFMAN : E_BETA;
311 }
312 #endif
313 free(vals); free(freqs);
314 return E_EXTERNAL;
315 }
316
317 /*
318 * Avoid complex stats for now, just do heuristic of HUFFMAN for small
319 * alphabets and BETA for anything large.
320 */
321 free(vals); free(freqs);
322 return nvals < 200 ? E_HUFFMAN : E_BETA;
323 //return E_HUFFMAN;
324 //return E_EXTERNAL;
325
326
327 /* We only support huffman now anyway... */
328 //free(vals); free(freqs); return E_HUFFMAN;
329
330 /* Beta */
331 bits = nbits(max_val - min_val) * ntot;
332 if (fd->verbose > 1)
333 fprintf(stderr, "BETA = %d\n", bits);
334 if (best_size > bits)
335 best_size = bits, best_encoding = E_BETA;
336
337 #if 0
338 /* Unary */
339 if (min_val >= 0) {
340 for (bits = i = 0; i < nvals; i++)
341 bits += freqs[i]*(vals[i]+1);
342 if (fd->verbose > 1)
343 fprintf(stderr, "UNARY = %d\n", bits);
344 if (best_size > bits)
345 best_size = bits, best_encoding = E_NULL; //E_UNARY;
346 }
347
348 /* Gamma */
349 for (bits = i = 0; i < nvals; i++)
350 bits += ((nbits(vals[i]-min_val+1)-1) + nbits(vals[i]-min_val+1)) * freqs[i];
351 if (fd->verbose > 1)
352 fprintf(stderr, "GAMMA = %d\n", bits);
353 if (best_size > bits)
354 best_size = bits, best_encoding = E_GAMMA;
355
356 /* Subexponential */
357 for (k = 0; k < 10; k++) {
358 for (bits = i = 0; i < nvals; i++) {
359 if (vals[i]-min_val < (1<<k))
360 bits += (1 + k)*freqs[i];
361 else
362 bits += (nbits(vals[i]-min_val)*2-k)*freqs[i];
363 }
364
365 if (fd->verbose > 1)
366 fprintf(stderr, "SUBEXP%d = %d\n", k, bits);
367 if (best_size > bits)
368 best_size = bits, best_encoding = E_SUBEXP;
369 }
370 #endif
371
372 /* byte array len */
373
374 /* byte array stop */
375
376 /* External? Guesswork! */
377
378 /* Huffman */
379 // qsort(freqs, nvals, sizeof(freqs[0]), sort_freqs);
380 // for (i = 0; i < nvals; i++) {
381 // fprintf(stderr, "%d = %d\n", i, freqs[i]);
382 // vals[i] = 0;
383 // }
384
385 /* Grow freqs to 2*freqs, to store sums */
386 /* Vals holds link data */
387 freqs = realloc(freqs, 2*nvals*sizeof(*freqs));
388 codes = calloc(2*nvals, sizeof(*codes));
389 if (!freqs || !codes)
390 return E_HUFFMAN; // Cannot do much else atm
391
392 /* Inefficient, use pointers to form chain so we can insert and maintain
393 * a sorted list? This is currently O(nvals^2) complexity.
394 */
395 for (;;) {
396 int low1 = INT_MAX, low2 = INT_MAX;
397 int ind1 = 0, ind2 = 0;
398 for (i = 0; i < nvals; i++) {
399 if (freqs[i] < 0)
400 continue;
401 if (low1 > freqs[i])
402 low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i;
403 else if (low2 > freqs[i])
404 low2 = freqs[i], ind2 = i;
405 }
406 if (low2 == INT_MAX)
407 break;
408
409 //fprintf(stderr, "Merge ind %d (%d), %d (%d) = %d+%d, => %d=%d\n",
410 // ind1, vals[ind1], ind2, vals[ind2], low1, low2,
411 // nvals, low1+low2);
412
413 freqs[nvals] = low1 + low2;
414 codes[ind1] = nvals;
415 codes[ind2] = nvals;
416 freqs[ind1] *= -1;
417 freqs[ind2] *= -1;
418 nvals++;
419 }
420 nvals = nvals/2+1;
421
422 for (i = 0; i < nvals; i++) {
423 int code_len = 0;
424 for (k = codes[i]; k; k = codes[k])
425 code_len++;
426 codes[i] = code_len;
427 freqs[i] *= -1;
428 //fprintf(stderr, "%d / %d => %d\n", vals[i], freqs[i], codes[i]);
429 }
430
431 for (bits = i = 0; i < nvals; i++) {
432 bits += freqs[i] * codes[i];
433 }
434 if (fd->verbose > 1)
435 fprintf(stderr, "HUFFMAN = %d\n", bits);
436 if (best_size >= bits)
437 best_size = bits, best_encoding = E_HUFFMAN;
438 free(codes);
439
440 free(vals);
441 free(freqs);
442
443 return best_encoding;
444 }
445
446 void cram_stats_free(cram_stats *st) {
447 if (st->h)
448 kh_destroy(m_i2i, st->h);
449 free(st);
450 }