Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/cram/cram_codecs.c @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfa3745e5fd8 |
---|---|
1 /* | |
2 Copyright (c) 2012-2013 Genome Research Ltd. | |
3 Author: James Bonfield <jkb@sanger.ac.uk> | |
4 | |
5 Redistribution and use in source and binary forms, with or without | |
6 modification, are permitted provided that the following conditions are met: | |
7 | |
8 1. Redistributions of source code must retain the above copyright notice, | |
9 this list of conditions and the following disclaimer. | |
10 | |
11 2. Redistributions in binary form must reproduce the above copyright notice, | |
12 this list of conditions and the following disclaimer in the documentation | |
13 and/or other materials provided with the distribution. | |
14 | |
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger | |
16 Institute nor the names of its contributors may be used to endorse or promote | |
17 products derived from this software without specific prior written permission. | |
18 | |
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND | |
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE | |
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 */ | |
30 | |
31 /* | |
32 * FIXME: add checking of cram_external_type to return NULL on unsupported | |
33 * {codec,type} tuples. | |
34 */ | |
35 | |
36 #ifdef HAVE_CONFIG_H | |
37 #include "io_lib_config.h" | |
38 #endif | |
39 | |
40 #include <stdlib.h> | |
41 #include <string.h> | |
42 #include <assert.h> | |
43 #include <limits.h> | |
44 | |
45 #include "cram/cram.h" | |
46 | |
47 static char *codec2str(enum cram_encoding codec) { | |
48 switch (codec) { | |
49 case E_NULL: return "NULL"; | |
50 case E_EXTERNAL: return "EXTERNAL"; | |
51 case E_GOLOMB: return "GOLOMB"; | |
52 case E_HUFFMAN: return "HUFFMAN"; | |
53 case E_BYTE_ARRAY_LEN: return "BYTE_ARRAY_LEN"; | |
54 case E_BYTE_ARRAY_STOP: return "BYTE_ARRAY_STOP"; | |
55 case E_BETA: return "BETA"; | |
56 case E_SUBEXP: return "SUBEXP"; | |
57 case E_GOLOMB_RICE: return "GOLOMB_RICE"; | |
58 case E_GAMMA: return "GAMMA"; | |
59 } | |
60 | |
61 return "(unknown)"; | |
62 } | |
63 | |
64 /* | |
65 * --------------------------------------------------------------------------- | |
66 * Block bit-level I/O functions. | |
67 * All defined static here to promote easy inlining by the compiler. | |
68 */ | |
69 | |
70 #if 0 | |
71 /* Get a single bit, MSB first */ | |
72 static signed int get_bit_MSB(cram_block *block) { | |
73 unsigned int val; | |
74 | |
75 if (block->byte > block->alloc) | |
76 return -1; | |
77 | |
78 val = block->data[block->byte] >> block->bit; | |
79 if (--block->bit == -1) { | |
80 block->bit = 7; | |
81 block->byte++; | |
82 //printf("(%02X)", block->data[block->byte]); | |
83 } | |
84 | |
85 //printf("-B%d-", val&1); | |
86 | |
87 return val & 1; | |
88 } | |
89 #endif | |
90 | |
91 /* | |
92 * Count number of successive 0 and 1 bits | |
93 */ | |
94 static int get_one_bits_MSB(cram_block *block) { | |
95 int n = 0, b; | |
96 do { | |
97 b = block->data[block->byte] >> block->bit; | |
98 if (--block->bit == -1) { | |
99 block->bit = 7; | |
100 block->byte++; | |
101 } | |
102 n++; | |
103 } while (b&1); | |
104 | |
105 return n-1; | |
106 } | |
107 | |
108 static int get_zero_bits_MSB(cram_block *block) { | |
109 int n = 0, b; | |
110 do { | |
111 b = block->data[block->byte] >> block->bit; | |
112 if (--block->bit == -1) { | |
113 block->bit = 7; | |
114 block->byte++; | |
115 } | |
116 n++; | |
117 } while (!(b&1)); | |
118 | |
119 return n-1; | |
120 } | |
121 | |
122 #if 0 | |
123 /* Stores a single bit */ | |
124 static void store_bit_MSB(cram_block *block, unsigned int bit) { | |
125 if (block->byte >= block->alloc) { | |
126 block->alloc = block->alloc ? block->alloc*2 : 1024; | |
127 block->data = realloc(block->data, block->alloc); | |
128 } | |
129 | |
130 if (bit) | |
131 block->data[block->byte] |= (1 << block->bit); | |
132 | |
133 if (--block->bit == -1) { | |
134 block->bit = 7; | |
135 block->byte++; | |
136 block->data[block->byte] = 0; | |
137 } | |
138 } | |
139 #endif | |
140 | |
141 #if 0 | |
142 /* Rounds to the next whole byte boundary first */ | |
143 static void store_bytes_MSB(cram_block *block, char *bytes, int len) { | |
144 if (block->bit != 7) { | |
145 block->bit = 7; | |
146 block->byte++; | |
147 } | |
148 | |
149 while (block->byte + len >= block->alloc) { | |
150 block->alloc = block->alloc ? block->alloc*2 : 1024; | |
151 block->data = realloc(block->data, block->alloc); | |
152 } | |
153 | |
154 memcpy(&block->data[block->byte], bytes, len); | |
155 block->byte += len; | |
156 } | |
157 #endif | |
158 | |
159 /* Local optimised copy for inlining */ | |
160 static inline unsigned int get_bits_MSB(cram_block *block, int nbits) { | |
161 unsigned int val = 0; | |
162 int i; | |
163 | |
164 #if 0 | |
165 // Fits within the current byte */ | |
166 if (nbits <= block->bit+1) { | |
167 val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1); | |
168 if ((block->bit -= nbits) == -1) { | |
169 block->bit = 7; | |
170 block->byte++; | |
171 } | |
172 return val; | |
173 } | |
174 | |
175 // partial first byte | |
176 val = block->data[block->byte] & ((1<<(block->bit+1))-1); | |
177 nbits -= block->bit+1; | |
178 block->bit = 7; | |
179 block->byte++; | |
180 | |
181 // whole middle bytes | |
182 while (nbits >= 8) { | |
183 val = (val << 8) | block->data[block->byte++]; | |
184 nbits -= 8; | |
185 } | |
186 | |
187 val <<= nbits; | |
188 val |= (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1); | |
189 block->bit -= nbits; | |
190 return val; | |
191 #endif | |
192 | |
193 #if 0 | |
194 /* Inefficient implementation! */ | |
195 //printf("{"); | |
196 for (i = 0; i < nbits; i++) | |
197 //val = (val << 1) | get_bit_MSB(block); | |
198 GET_BIT_MSB(block, val); | |
199 #endif | |
200 | |
201 #if 1 | |
202 /* Combination of 1st two methods */ | |
203 if (nbits <= block->bit+1) { | |
204 val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1); | |
205 if ((block->bit -= nbits) == -1) { | |
206 block->bit = 7; | |
207 block->byte++; | |
208 } | |
209 return val; | |
210 } | |
211 | |
212 switch(nbits) { | |
213 // case 15: GET_BIT_MSB(block, val); | |
214 // case 14: GET_BIT_MSB(block, val); | |
215 // case 13: GET_BIT_MSB(block, val); | |
216 // case 12: GET_BIT_MSB(block, val); | |
217 // case 11: GET_BIT_MSB(block, val); | |
218 // case 10: GET_BIT_MSB(block, val); | |
219 // case 9: GET_BIT_MSB(block, val); | |
220 case 8: GET_BIT_MSB(block, val); | |
221 case 7: GET_BIT_MSB(block, val); | |
222 case 6: GET_BIT_MSB(block, val); | |
223 case 5: GET_BIT_MSB(block, val); | |
224 case 4: GET_BIT_MSB(block, val); | |
225 case 3: GET_BIT_MSB(block, val); | |
226 case 2: GET_BIT_MSB(block, val); | |
227 case 1: GET_BIT_MSB(block, val); | |
228 break; | |
229 | |
230 default: | |
231 for (i = 0; i < nbits; i++) | |
232 //val = (val << 1) | get_bit_MSB(block); | |
233 GET_BIT_MSB(block, val); | |
234 } | |
235 #endif | |
236 | |
237 //printf("=0x%x}", val); | |
238 | |
239 return val; | |
240 } | |
241 | |
242 /* | |
243 * Can store up to 24-bits worth of data encoded in an integer value | |
244 * Possibly we'd want to have a less optimal store_bits function when dealing | |
245 * with nbits > 24, but for now we assume the codes generated are never | |
246 * that big. (Given this is only possible with 121392 or more | |
247 * characters with exactly the correct frequency distribution we check | |
248 * for it elsewhere.) | |
249 */ | |
250 static int store_bits_MSB(cram_block *block, unsigned int val, int nbits) { | |
251 /* fprintf(stderr, " store_bits: %02x %d\n", val, nbits); */ | |
252 | |
253 /* | |
254 * Use slow mode until we tweak the huffman generator to never generate | |
255 * codes longer than 24-bits. | |
256 */ | |
257 unsigned int mask; | |
258 | |
259 if (block->byte+4 >= block->alloc) { | |
260 if (block->byte) { | |
261 block->alloc *= 2; | |
262 block->data = realloc(block->data, block->alloc + 4); | |
263 if (!block->data) | |
264 return -1; | |
265 } else { | |
266 block->alloc = 1024; | |
267 block->data = realloc(block->data, block->alloc + 4); | |
268 if (!block->data) | |
269 return -1; | |
270 block->data[0] = 0; // initialise first byte of buffer | |
271 } | |
272 } | |
273 | |
274 /* fits in current bit-field */ | |
275 if (nbits <= block->bit+1) { | |
276 block->data[block->byte] |= (val << (block->bit+1-nbits)); | |
277 if ((block->bit-=nbits) == -1) { | |
278 block->bit = 7; | |
279 block->byte++; | |
280 block->data[block->byte] = 0; | |
281 } | |
282 return 0; | |
283 } | |
284 | |
285 block->data[block->byte] |= (val >> (nbits -= block->bit+1)); | |
286 block->bit = 7; | |
287 block->byte++; | |
288 block->data[block->byte] = 0; | |
289 | |
290 mask = 1<<(nbits-1); | |
291 do { | |
292 if (val & mask) | |
293 block->data[block->byte] |= (1 << block->bit); | |
294 if (--block->bit == -1) { | |
295 block->bit = 7; | |
296 block->byte++; | |
297 block->data[block->byte] = 0; | |
298 } | |
299 mask >>= 1; | |
300 } while(--nbits); | |
301 | |
302 return 0; | |
303 } | |
304 | |
305 /* | |
306 * Returns the next 'size' bytes from a block, or NULL if insufficient | |
307 * data left.This is just a pointer into the block data and not an | |
308 * allocated object, so do not free the result. | |
309 */ | |
310 static char *cram_extract_block(cram_block *b, int size) { | |
311 char *cp = (char *)b->data + b->idx; | |
312 b->idx += size; | |
313 if (b->idx > b->uncomp_size) | |
314 return NULL; | |
315 | |
316 return cp; | |
317 } | |
318 | |
319 /* | |
320 * --------------------------------------------------------------------------- | |
321 * EXTERNAL | |
322 */ | |
323 int cram_external_decode_int(cram_slice *slice, cram_codec *c, | |
324 cram_block *in, char *out, int *out_size) { | |
325 int i; | |
326 char *cp; | |
327 cram_block *b = NULL; | |
328 | |
329 /* Find the external block */ | |
330 if (slice->block_by_id) { | |
331 if (!(b = slice->block_by_id[c->external.content_id])) | |
332 return *out_size?-1:0; | |
333 } else { | |
334 for (i = 0; i < slice->hdr->num_blocks; i++) { | |
335 b = slice->block[i]; | |
336 if (b && b->content_type == EXTERNAL && | |
337 b->content_id == c->external.content_id) { | |
338 break; | |
339 } | |
340 } | |
341 if (i == slice->hdr->num_blocks || !b) | |
342 return -1; | |
343 } | |
344 | |
345 cp = (char *)b->data + b->idx; | |
346 // E_INT and E_LONG are guaranteed single item queries | |
347 b->idx += itf8_get(cp, (int32_t *)out); | |
348 *out_size = 1; | |
349 | |
350 return 0; | |
351 } | |
352 | |
353 int cram_external_decode_char(cram_slice *slice, cram_codec *c, | |
354 cram_block *in, char *out, | |
355 int *out_size) { | |
356 int i; | |
357 char *cp; | |
358 cram_block *b = NULL; | |
359 | |
360 /* Find the external block */ | |
361 if (slice->block_by_id) { | |
362 if (!(b = slice->block_by_id[c->external.content_id])) | |
363 return *out_size?-1:0; | |
364 } else { | |
365 for (i = 0; i < slice->hdr->num_blocks; i++) { | |
366 b = slice->block[i]; | |
367 if (b && b->content_type == EXTERNAL && | |
368 b->content_id == c->external.content_id) { | |
369 break; | |
370 } | |
371 } | |
372 if (i == slice->hdr->num_blocks || !b) | |
373 return -1; | |
374 } | |
375 | |
376 cp = cram_extract_block(b, *out_size); | |
377 if (!cp) | |
378 return -1; | |
379 | |
380 memcpy(out, cp, *out_size); | |
381 return 0; | |
382 } | |
383 | |
384 static int cram_external_decode_block(cram_slice *slice, cram_codec *c, | |
385 cram_block *in, char *out_, | |
386 int *out_size) { | |
387 int i; | |
388 char *cp; | |
389 cram_block *b = NULL; | |
390 cram_block *out = (cram_block *)out_; | |
391 | |
392 /* Find the external block */ | |
393 if (slice->block_by_id) { | |
394 if (!(b = slice->block_by_id[c->external.content_id])) | |
395 return *out_size?-1:0; | |
396 } else { | |
397 for (i = 0; i < slice->hdr->num_blocks; i++) { | |
398 b = slice->block[i]; | |
399 if (b && b->content_type == EXTERNAL && | |
400 b->content_id == c->external.content_id) { | |
401 break; | |
402 } | |
403 } | |
404 if (i == slice->hdr->num_blocks || !b) | |
405 return -1; | |
406 } | |
407 | |
408 cp = cram_extract_block(b, *out_size); | |
409 if (!cp) | |
410 return -1; | |
411 | |
412 BLOCK_APPEND(out, cp, *out_size); | |
413 return 0; | |
414 } | |
415 | |
416 void cram_external_decode_free(cram_codec *c) { | |
417 if (c) | |
418 free(c); | |
419 } | |
420 | |
421 cram_codec *cram_external_decode_init(char *data, int size, | |
422 enum cram_external_type option, | |
423 int version) { | |
424 cram_codec *c; | |
425 char *cp = data; | |
426 | |
427 if (!(c = malloc(sizeof(*c)))) | |
428 return NULL; | |
429 | |
430 c->codec = E_EXTERNAL; | |
431 if (option == E_INT || option == E_LONG) | |
432 c->decode = cram_external_decode_int; | |
433 else if (option == E_BYTE_ARRAY || option == E_BYTE) | |
434 c->decode = cram_external_decode_char; | |
435 else | |
436 c->decode = cram_external_decode_block; | |
437 c->free = cram_external_decode_free; | |
438 | |
439 cp += itf8_get(cp, &c->external.content_id); | |
440 | |
441 if (cp - data != size) { | |
442 fprintf(stderr, "Malformed external header stream\n"); | |
443 free(c); | |
444 return NULL; | |
445 } | |
446 | |
447 c->external.type = option; | |
448 | |
449 return c; | |
450 } | |
451 | |
452 int cram_external_encode_int(cram_slice *slice, cram_codec *c, | |
453 char *in, int in_size) { | |
454 uint32_t *i32 = (uint32_t *)in; | |
455 | |
456 itf8_put_blk(c->out, *i32); | |
457 return 0; | |
458 } | |
459 | |
460 int cram_external_encode_char(cram_slice *slice, cram_codec *c, | |
461 char *in, int in_size) { | |
462 BLOCK_APPEND(c->out, in, in_size); | |
463 return 0; | |
464 } | |
465 | |
466 void cram_external_encode_free(cram_codec *c) { | |
467 if (!c) | |
468 return; | |
469 free(c); | |
470 } | |
471 | |
472 int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix, | |
473 int version) { | |
474 char tmp[99], *tp = tmp; | |
475 int len = 0; | |
476 | |
477 if (prefix) { | |
478 size_t l = strlen(prefix); | |
479 BLOCK_APPEND(b, prefix, l); | |
480 len += l; | |
481 } | |
482 | |
483 tp += itf8_put(tp, c->e_external.content_id); | |
484 len += itf8_put_blk(b, c->codec); | |
485 len += itf8_put_blk(b, tp-tmp); | |
486 BLOCK_APPEND(b, tmp, tp-tmp); | |
487 len += tp-tmp; | |
488 | |
489 return len; | |
490 } | |
491 | |
492 cram_codec *cram_external_encode_init(cram_stats *st, | |
493 enum cram_external_type option, | |
494 void *dat, | |
495 int version) { | |
496 cram_codec *c; | |
497 | |
498 c = malloc(sizeof(*c)); | |
499 if (!c) | |
500 return NULL; | |
501 c->codec = E_EXTERNAL; | |
502 c->free = cram_external_encode_free; | |
503 if (option == E_INT || option == E_LONG) | |
504 c->encode = cram_external_encode_int; | |
505 else if (option == E_BYTE_ARRAY || option == E_BYTE) | |
506 c->encode = cram_external_encode_char; | |
507 else | |
508 abort(); | |
509 c->store = cram_external_encode_store; | |
510 | |
511 c->e_external.content_id = (size_t)dat; | |
512 | |
513 return c; | |
514 } | |
515 | |
516 /* | |
517 * --------------------------------------------------------------------------- | |
518 * BETA | |
519 */ | |
520 int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { | |
521 int32_t *out_i = (int32_t *)out; | |
522 int i, n; | |
523 | |
524 if (c->beta.nbits) { | |
525 for (i = 0, n = *out_size; i < n; i++) | |
526 out_i[i] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset; | |
527 } else { | |
528 for (i = 0, n = *out_size; i < n; i++) | |
529 out_i[i] = -c->beta.offset; | |
530 } | |
531 | |
532 return 0; | |
533 } | |
534 | |
535 int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { | |
536 int i, n; | |
537 | |
538 if (c->beta.nbits) { | |
539 for (i = 0, n = *out_size; i < n; i++) | |
540 out[i] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset; | |
541 } else { | |
542 for (i = 0, n = *out_size; i < n; i++) | |
543 out[i] = -c->beta.offset; | |
544 } | |
545 | |
546 return 0; | |
547 } | |
548 | |
549 void cram_beta_decode_free(cram_codec *c) { | |
550 if (c) | |
551 free(c); | |
552 } | |
553 | |
554 cram_codec *cram_beta_decode_init(char *data, int size, | |
555 enum cram_external_type option, | |
556 int version) { | |
557 cram_codec *c; | |
558 char *cp = data; | |
559 | |
560 if (!(c = malloc(sizeof(*c)))) | |
561 return NULL; | |
562 | |
563 c->codec = E_BETA; | |
564 if (option == E_INT || option == E_LONG) | |
565 c->decode = cram_beta_decode_int; | |
566 else if (option == E_BYTE_ARRAY || option == E_BYTE) | |
567 c->decode = cram_beta_decode_char; | |
568 else | |
569 abort(); | |
570 c->free = cram_beta_decode_free; | |
571 | |
572 cp += itf8_get(cp, &c->beta.offset); | |
573 cp += itf8_get(cp, &c->beta.nbits); | |
574 | |
575 if (cp - data != size) { | |
576 fprintf(stderr, "Malformed beta header stream\n"); | |
577 free(c); | |
578 return NULL; | |
579 } | |
580 | |
581 return c; | |
582 } | |
583 | |
584 int cram_beta_encode_store(cram_codec *c, cram_block *b, | |
585 char *prefix, int version) { | |
586 int len = 0; | |
587 | |
588 if (prefix) { | |
589 size_t l = strlen(prefix); | |
590 BLOCK_APPEND(b, prefix, l); | |
591 len += l; | |
592 } | |
593 | |
594 len += itf8_put_blk(b, c->codec); | |
595 len += itf8_put_blk(b, itf8_size(c->e_beta.offset) | |
596 + itf8_size(c->e_beta.nbits)); // codec length | |
597 len += itf8_put_blk(b, c->e_beta.offset); | |
598 len += itf8_put_blk(b, c->e_beta.nbits); | |
599 | |
600 return len; | |
601 } | |
602 | |
603 int cram_beta_encode_int(cram_slice *slice, cram_codec *c, | |
604 char *in, int in_size) { | |
605 int *syms = (int *)in; | |
606 int i, r = 0; | |
607 | |
608 for (i = 0; i < in_size; i++) | |
609 r |= store_bits_MSB(c->out, syms[i] + c->e_beta.offset, | |
610 c->e_beta.nbits); | |
611 | |
612 return r; | |
613 } | |
614 | |
615 int cram_beta_encode_char(cram_slice *slice, cram_codec *c, | |
616 char *in, int in_size) { | |
617 unsigned char *syms = (unsigned char *)in; | |
618 int i, r = 0; | |
619 | |
620 for (i = 0; i < in_size; i++) | |
621 r |= store_bits_MSB(c->out, syms[i] + c->e_beta.offset, | |
622 c->e_beta.nbits); | |
623 | |
624 return r; | |
625 } | |
626 | |
627 void cram_beta_encode_free(cram_codec *c) { | |
628 if (c) free(c); | |
629 } | |
630 | |
631 cram_codec *cram_beta_encode_init(cram_stats *st, | |
632 enum cram_external_type option, | |
633 void *dat, | |
634 int version) { | |
635 cram_codec *c; | |
636 int min_val, max_val, len = 0; | |
637 | |
638 c = malloc(sizeof(*c)); | |
639 if (!c) | |
640 return NULL; | |
641 c->codec = E_BETA; | |
642 c->free = cram_beta_encode_free; | |
643 if (option == E_INT) | |
644 c->encode = cram_beta_encode_int; | |
645 else | |
646 c->encode = cram_beta_encode_char; | |
647 c->store = cram_beta_encode_store; | |
648 | |
649 if (dat) { | |
650 min_val = ((int *)dat)[0]; | |
651 max_val = ((int *)dat)[1]; | |
652 } else { | |
653 min_val = INT_MAX; | |
654 max_val = INT_MIN; | |
655 int i; | |
656 for (i = 0; i < MAX_STAT_VAL; i++) { | |
657 if (!st->freqs[i]) | |
658 continue; | |
659 if (min_val > i) | |
660 min_val = i; | |
661 max_val = i; | |
662 } | |
663 if (st->h) { | |
664 khint_t k; | |
665 | |
666 for (k = kh_begin(st->h); k != kh_end(st->h); k++) { | |
667 if (!kh_exist(st->h, k)) | |
668 continue; | |
669 | |
670 i = kh_key(st->h, k); | |
671 if (min_val > i) | |
672 min_val = i; | |
673 if (max_val < i) | |
674 max_val = i; | |
675 } | |
676 } | |
677 } | |
678 | |
679 assert(max_val >= min_val); | |
680 c->e_beta.offset = -min_val; | |
681 max_val -= min_val; | |
682 while (max_val) { | |
683 len++; | |
684 max_val >>= 1; | |
685 } | |
686 c->e_beta.nbits = len; | |
687 | |
688 return c; | |
689 } | |
690 | |
691 /* | |
692 * --------------------------------------------------------------------------- | |
693 * SUBEXP | |
694 */ | |
695 int cram_subexp_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { | |
696 int32_t *out_i = (int32_t *)out; | |
697 int n, count; | |
698 int k = c->subexp.k; | |
699 | |
700 for (count = 0, n = *out_size; count < n; count++) { | |
701 int i = 0, tail; | |
702 int val; | |
703 | |
704 /* Get number of 1s */ | |
705 //while (get_bit_MSB(in) == 1) i++; | |
706 i = get_one_bits_MSB(in); | |
707 | |
708 /* | |
709 * Val is | |
710 * i > 0: 2^(k+i-1) + k+i-1 bits | |
711 * i = 0: k bits | |
712 */ | |
713 if (i) { | |
714 tail = i + k-1; | |
715 val = 0; | |
716 while (tail) { | |
717 //val = val<<1; val |= get_bit_MSB(in); | |
718 GET_BIT_MSB(in, val); | |
719 tail--; | |
720 } | |
721 val += 1 << (i + k-1); | |
722 } else { | |
723 tail = k; | |
724 val = 0; | |
725 while (tail) { | |
726 //val = val<<1; val |= get_bit_MSB(in); | |
727 GET_BIT_MSB(in, val); | |
728 tail--; | |
729 } | |
730 } | |
731 | |
732 out_i[count] = val - c->subexp.offset; | |
733 } | |
734 | |
735 return 0; | |
736 } | |
737 | |
738 void cram_subexp_decode_free(cram_codec *c) { | |
739 if (c) | |
740 free(c); | |
741 } | |
742 | |
743 cram_codec *cram_subexp_decode_init(char *data, int size, | |
744 enum cram_external_type option, | |
745 int version) { | |
746 cram_codec *c; | |
747 char *cp = data; | |
748 | |
749 if (!(c = malloc(sizeof(*c)))) | |
750 return NULL; | |
751 | |
752 c->codec = E_SUBEXP; | |
753 c->decode = cram_subexp_decode; | |
754 c->free = cram_subexp_decode_free; | |
755 | |
756 cp += itf8_get(cp, &c->subexp.offset); | |
757 cp += itf8_get(cp, &c->subexp.k); | |
758 | |
759 if (cp - data != size) { | |
760 fprintf(stderr, "Malformed subexp header stream\n"); | |
761 free(c); | |
762 return NULL; | |
763 } | |
764 | |
765 return c; | |
766 } | |
767 | |
768 /* | |
769 * --------------------------------------------------------------------------- | |
770 * GAMMA | |
771 */ | |
772 int cram_gamma_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { | |
773 int32_t *out_i = (int32_t *)out; | |
774 int i, n; | |
775 | |
776 for (i = 0, n = *out_size; i < n; i++) { | |
777 int nz = 0; | |
778 int val; | |
779 //while (get_bit_MSB(in) == 0) nz++; | |
780 nz = get_zero_bits_MSB(in); | |
781 val = 1; | |
782 while (nz > 0) { | |
783 //val <<= 1; val |= get_bit_MSB(in); | |
784 GET_BIT_MSB(in, val); | |
785 nz--; | |
786 } | |
787 | |
788 out_i[i] = val - c->gamma.offset; | |
789 } | |
790 | |
791 return 0; | |
792 } | |
793 | |
794 void cram_gamma_decode_free(cram_codec *c) { | |
795 if (c) | |
796 free(c); | |
797 } | |
798 | |
799 cram_codec *cram_gamma_decode_init(char *data, int size, | |
800 enum cram_external_type option, | |
801 int version) { | |
802 cram_codec *c; | |
803 char *cp = data; | |
804 | |
805 if (!(c = malloc(sizeof(*c)))) | |
806 return NULL; | |
807 | |
808 c->codec = E_GAMMA; | |
809 c->decode = cram_gamma_decode; | |
810 c->free = cram_gamma_decode_free; | |
811 | |
812 cp += itf8_get(cp, &c->gamma.offset); | |
813 | |
814 if (cp - data != size) { | |
815 fprintf(stderr, "Malformed gamma header stream\n"); | |
816 free(c); | |
817 return NULL; | |
818 } | |
819 | |
820 return c; | |
821 } | |
822 | |
823 /* | |
824 * --------------------------------------------------------------------------- | |
825 * HUFFMAN | |
826 */ | |
827 | |
828 static int code_sort(const void *vp1, const void *vp2) { | |
829 const cram_huffman_code *c1 = (const cram_huffman_code *)vp1; | |
830 const cram_huffman_code *c2 = (const cram_huffman_code *)vp2; | |
831 | |
832 if (c1->len != c2->len) | |
833 return c1->len - c2->len; | |
834 else | |
835 return c1->symbol - c2->symbol; | |
836 } | |
837 | |
838 void cram_huffman_decode_free(cram_codec *c) { | |
839 if (!c) | |
840 return; | |
841 | |
842 if (c->huffman.codes) | |
843 free(c->huffman.codes); | |
844 free(c); | |
845 } | |
846 | |
847 int cram_huffman_decode_char0(cram_slice *slice, cram_codec *c, | |
848 cram_block *in, char *out, int *out_size) { | |
849 int i, n; | |
850 | |
851 /* Special case of 0 length codes */ | |
852 for (i = 0, n = *out_size; i < n; i++) { | |
853 out[i] = c->huffman.codes[0].symbol; | |
854 } | |
855 return 0; | |
856 } | |
857 | |
858 int cram_huffman_decode_char(cram_slice *slice, cram_codec *c, | |
859 cram_block *in, char *out, int *out_size) { | |
860 int i, n, ncodes = c->huffman.ncodes; | |
861 const cram_huffman_code * const codes = c->huffman.codes; | |
862 | |
863 for (i = 0, n = *out_size; i < n; i++) { | |
864 int idx = 0; | |
865 int val = 0, len = 0, last_len = 0; | |
866 | |
867 for (;;) { | |
868 int dlen = codes[idx].len - last_len; | |
869 if (dlen <= 0 || (in->alloc - in->byte)*8 + in->bit + 7 < dlen) | |
870 return -1; | |
871 | |
872 //val <<= dlen; | |
873 //val |= get_bits_MSB(in, dlen); | |
874 //last_len = (len += dlen); | |
875 | |
876 last_len = (len += dlen); | |
877 for (; dlen; dlen--) GET_BIT_MSB(in, val); | |
878 | |
879 idx = val - codes[idx].p; | |
880 if (idx >= ncodes || idx < 0) | |
881 return -1; | |
882 | |
883 if (codes[idx].code == val && codes[idx].len == len) { | |
884 out[i] = codes[idx].symbol; | |
885 break; | |
886 } | |
887 } | |
888 } | |
889 | |
890 return 0; | |
891 } | |
892 | |
893 int cram_huffman_decode_int0(cram_slice *slice, cram_codec *c, | |
894 cram_block *in, char *out, int *out_size) { | |
895 int32_t *out_i = (int32_t *)out; | |
896 int i, n; | |
897 const cram_huffman_code * const codes = c->huffman.codes; | |
898 | |
899 /* Special case of 0 length codes */ | |
900 for (i = 0, n = *out_size; i < n; i++) { | |
901 out_i[i] = codes[0].symbol; | |
902 } | |
903 return 0; | |
904 } | |
905 | |
906 int cram_huffman_decode_int(cram_slice *slice, cram_codec *c, | |
907 cram_block *in, char *out, int *out_size) { | |
908 int32_t *out_i = (int32_t *)out; | |
909 int i, n, ncodes = c->huffman.ncodes; | |
910 const cram_huffman_code * const codes = c->huffman.codes; | |
911 | |
912 for (i = 0, n = *out_size; i < n; i++) { | |
913 int idx = 0; | |
914 int val = 0, len = 0, last_len = 0; | |
915 | |
916 // Now one bit at a time for remaining checks | |
917 for (;;) { | |
918 int dlen = codes[idx].len - last_len; | |
919 if (dlen <= 0 || (in->alloc - in->byte)*8 + in->bit + 7 < dlen) | |
920 return -1; | |
921 | |
922 //val <<= dlen; | |
923 //val |= get_bits_MSB(in, dlen); | |
924 //last_len = (len += dlen); | |
925 | |
926 last_len = (len += dlen); | |
927 for (; dlen; dlen--) GET_BIT_MSB(in, val); | |
928 | |
929 idx = val - codes[idx].p; | |
930 if (idx >= ncodes || idx < 0) | |
931 return -1; | |
932 | |
933 if (codes[idx].code == val && codes[idx].len == len) { | |
934 out_i[i] = codes[idx].symbol; | |
935 break; | |
936 } | |
937 } | |
938 } | |
939 | |
940 return 0; | |
941 } | |
942 | |
943 /* | |
944 * Initialises a huffman decoder from an encoding data stream. | |
945 */ | |
946 cram_codec *cram_huffman_decode_init(char *data, int size, | |
947 enum cram_external_type option, | |
948 int version) { | |
949 int32_t ncodes, i, j; | |
950 char *cp = data, *data_end = &data[size]; | |
951 cram_codec *h; | |
952 cram_huffman_code *codes; | |
953 int32_t val, last_len, max_len = 0; | |
954 | |
955 cp += itf8_get(cp, &ncodes); | |
956 h = calloc(1, sizeof(*h)); | |
957 if (!h) | |
958 return NULL; | |
959 | |
960 h->free = cram_huffman_decode_free; | |
961 | |
962 h->huffman.ncodes = ncodes; | |
963 codes = h->huffman.codes = malloc(ncodes * sizeof(*codes)); | |
964 if (!codes) { | |
965 free(h); | |
966 return NULL; | |
967 } | |
968 | |
969 /* Read symbols and bit-lengths */ | |
970 for (i = 0; i < ncodes && cp < data_end; i++) { | |
971 cp += itf8_get(cp, &codes[i].symbol); | |
972 } | |
973 | |
974 if (cp >= data_end) { | |
975 fprintf(stderr, "Malformed huffman header stream\n"); | |
976 free(h); | |
977 return NULL; | |
978 } | |
979 cp += itf8_get(cp, &i); | |
980 if (i != ncodes) { | |
981 fprintf(stderr, "Malformed huffman header stream\n"); | |
982 free(h); | |
983 return NULL; | |
984 } | |
985 | |
986 if (ncodes == 0) { | |
987 /* NULL huffman stream */ | |
988 return h; | |
989 } | |
990 | |
991 for (i = 0; i < ncodes && cp < data_end; i++) { | |
992 cp += itf8_get(cp, &codes[i].len); | |
993 if (max_len < codes[i].len) | |
994 max_len = codes[i].len; | |
995 } | |
996 if (cp - data != size || max_len >= ncodes) { | |
997 fprintf(stderr, "Malformed huffman header stream\n"); | |
998 free(h); | |
999 return NULL; | |
1000 } | |
1001 | |
1002 /* Sort by bit length and then by symbol value */ | |
1003 qsort(codes, ncodes, sizeof(*codes), code_sort); | |
1004 | |
1005 /* Assign canonical codes */ | |
1006 val = -1, last_len = 0; | |
1007 for (i = 0; i < ncodes; i++) { | |
1008 val++; | |
1009 if (codes[i].len > last_len) { | |
1010 while (codes[i].len > last_len) { | |
1011 val <<= 1; | |
1012 last_len++; | |
1013 } | |
1014 } | |
1015 codes[i].code = val; | |
1016 } | |
1017 | |
1018 /* | |
1019 * Compute the next starting point, offset by the i'th value. | |
1020 * For example if codes 10, 11, 12, 13 are 30, 31, 32, 33 then | |
1021 * codes[10..13].p = 30 - 10. | |
1022 */ | |
1023 last_len = 0; | |
1024 for (i = j = 0; i < ncodes; i++) { | |
1025 if (codes[i].len > last_len) { | |
1026 j = codes[i].code - i; | |
1027 last_len = codes[i].len; | |
1028 } | |
1029 codes[i].p = j; | |
1030 } | |
1031 | |
1032 // puts("==HUFF LEN=="); | |
1033 // for (i = 0; i <= last_len+1; i++) { | |
1034 // printf("len %d=%d prefix %d\n", i, h->huffman.lengths[i], h->huffman.prefix[i]); | |
1035 // } | |
1036 // puts("===HUFFMAN CODES==="); | |
1037 // for (i = 0; i < ncodes; i++) { | |
1038 // int j; | |
1039 // printf("%d: %d %d %d ", i, codes[i].symbol, codes[i].len, codes[i].code); | |
1040 // j = codes[i].len; | |
1041 // while (j) { | |
1042 // putchar(codes[i].code & (1 << --j) ? '1' : '0'); | |
1043 // } | |
1044 // printf(" %d\n", codes[i].code); | |
1045 // } | |
1046 | |
1047 h->codec = E_HUFFMAN; | |
1048 if (option == E_BYTE || option == E_BYTE_ARRAY) { | |
1049 if (h->huffman.codes[0].len == 0) | |
1050 h->decode = cram_huffman_decode_char0; | |
1051 else | |
1052 h->decode = cram_huffman_decode_char; | |
1053 } else if (option == E_BYTE_ARRAY_BLOCK) { | |
1054 abort(); | |
1055 } else { | |
1056 if (h->huffman.codes[0].len == 0) | |
1057 h->decode = cram_huffman_decode_int0; | |
1058 else | |
1059 h->decode = cram_huffman_decode_int; | |
1060 } | |
1061 | |
1062 return (cram_codec *)h; | |
1063 } | |
1064 | |
1065 int cram_huffman_encode_char0(cram_slice *slice, cram_codec *c, | |
1066 char *in, int in_size) { | |
1067 return 0; | |
1068 } | |
1069 | |
1070 int cram_huffman_encode_char(cram_slice *slice, cram_codec *c, | |
1071 char *in, int in_size) { | |
1072 int i, code, len, r = 0; | |
1073 unsigned char *syms = (unsigned char *)in; | |
1074 | |
1075 do { | |
1076 int sym = *syms++; | |
1077 if (sym >= -1 && sym < MAX_HUFF) { | |
1078 i = c->e_huffman.val2code[sym+1]; | |
1079 assert(c->e_huffman.codes[i].symbol == sym); | |
1080 code = c->e_huffman.codes[i].code; | |
1081 len = c->e_huffman.codes[i].len; | |
1082 } else { | |
1083 /* Slow - use a lookup table for when sym < MAX_HUFF? */ | |
1084 for (i = 0; i < c->e_huffman.nvals; i++) { | |
1085 if (c->e_huffman.codes[i].symbol == sym) | |
1086 break; | |
1087 } | |
1088 if (i == c->e_huffman.nvals) | |
1089 return -1; | |
1090 | |
1091 code = c->e_huffman.codes[i].code; | |
1092 len = c->e_huffman.codes[i].len; | |
1093 } | |
1094 | |
1095 r |= store_bits_MSB(c->out, code, len); | |
1096 } while (--in_size); | |
1097 | |
1098 return r; | |
1099 } | |
1100 | |
1101 int cram_huffman_encode_int0(cram_slice *slice, cram_codec *c, | |
1102 char *in, int in_size) { | |
1103 return 0; | |
1104 } | |
1105 | |
1106 int cram_huffman_encode_int(cram_slice *slice, cram_codec *c, | |
1107 char *in, int in_size) { | |
1108 int i, code, len, r = 0; | |
1109 int *syms = (int *)in; | |
1110 | |
1111 do { | |
1112 int sym = *syms++; | |
1113 | |
1114 if (sym >= -1 && sym < MAX_HUFF) { | |
1115 i = c->e_huffman.val2code[sym+1]; | |
1116 assert(c->e_huffman.codes[i].symbol == sym); | |
1117 code = c->e_huffman.codes[i].code; | |
1118 len = c->e_huffman.codes[i].len; | |
1119 } else { | |
1120 /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */ | |
1121 for (i = 0; i < c->e_huffman.nvals; i++) { | |
1122 if (c->e_huffman.codes[i].symbol == sym) | |
1123 break; | |
1124 } | |
1125 if (i == c->e_huffman.nvals) | |
1126 return -1; | |
1127 | |
1128 code = c->e_huffman.codes[i].code; | |
1129 len = c->e_huffman.codes[i].len; | |
1130 } | |
1131 | |
1132 r |= store_bits_MSB(c->out, code, len); | |
1133 } while (--in_size); | |
1134 | |
1135 return r; | |
1136 } | |
1137 | |
1138 void cram_huffman_encode_free(cram_codec *c) { | |
1139 if (!c) | |
1140 return; | |
1141 | |
1142 if (c->e_huffman.codes) | |
1143 free(c->e_huffman.codes); | |
1144 free(c); | |
1145 } | |
1146 | |
1147 /* | |
1148 * Encodes a huffman tree. | |
1149 * Returns number of bytes written. | |
1150 */ | |
1151 int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix, | |
1152 int version) { | |
1153 int i, len = 0; | |
1154 cram_huffman_code *codes = c->e_huffman.codes; | |
1155 /* | |
1156 * Up to code length 127 means 2.5e+26 bytes of data required (worst | |
1157 * case huffman tree needs symbols with freqs matching the Fibonacci | |
1158 * series). So guaranteed 1 byte per code. | |
1159 * | |
1160 * Symbols themselves could be 5 bytes (eg -1 is 5 bytes in itf8). | |
1161 * | |
1162 * Therefore 6*ncodes + 5 + 5 + 1 + 5 is max memory | |
1163 */ | |
1164 char *tmp = malloc(6*c->e_huffman.nvals+16); | |
1165 char *tp = tmp; | |
1166 | |
1167 if (!tmp) | |
1168 return -1; | |
1169 | |
1170 if (prefix) { | |
1171 size_t l = strlen(prefix); | |
1172 BLOCK_APPEND(b, prefix, l); | |
1173 len += l; | |
1174 } | |
1175 | |
1176 tp += itf8_put(tp, c->e_huffman.nvals); | |
1177 for (i = 0; i < c->e_huffman.nvals; i++) { | |
1178 tp += itf8_put(tp, codes[i].symbol); | |
1179 } | |
1180 | |
1181 tp += itf8_put(tp, c->e_huffman.nvals); | |
1182 for (i = 0; i < c->e_huffman.nvals; i++) { | |
1183 tp += itf8_put(tp, codes[i].len); | |
1184 } | |
1185 | |
1186 len += itf8_put_blk(b, c->codec); | |
1187 len += itf8_put_blk(b, tp-tmp); | |
1188 BLOCK_APPEND(b, tmp, tp-tmp); | |
1189 len += tp-tmp; | |
1190 | |
1191 free(tmp); | |
1192 | |
1193 return len; | |
1194 } | |
1195 | |
1196 cram_codec *cram_huffman_encode_init(cram_stats *st, | |
1197 enum cram_external_type option, | |
1198 void *dat, | |
1199 int version) { | |
1200 int *vals = NULL, *freqs = NULL, vals_alloc = 0, *lens, code, len; | |
1201 int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX, k; | |
1202 cram_codec *c; | |
1203 cram_huffman_code *codes; | |
1204 | |
1205 c = malloc(sizeof(*c)); | |
1206 if (!c) | |
1207 return NULL; | |
1208 c->codec = E_HUFFMAN; | |
1209 | |
1210 /* Count number of unique symbols */ | |
1211 for (nvals = i = 0; i < MAX_STAT_VAL; i++) { | |
1212 if (!st->freqs[i]) | |
1213 continue; | |
1214 if (nvals >= vals_alloc) { | |
1215 vals_alloc = vals_alloc ? vals_alloc*2 : 1024; | |
1216 vals = realloc(vals, vals_alloc * sizeof(int)); | |
1217 freqs = realloc(freqs, vals_alloc * sizeof(int)); | |
1218 if (!vals || !freqs) { | |
1219 if (vals) free(vals); | |
1220 if (freqs) free(freqs); | |
1221 free(c); | |
1222 return NULL; | |
1223 } | |
1224 } | |
1225 vals[nvals] = i; | |
1226 freqs[nvals] = st->freqs[i]; | |
1227 assert(st->freqs[i] > 0); | |
1228 ntot += freqs[nvals]; | |
1229 if (max_val < i) max_val = i; | |
1230 if (min_val > i) min_val = i; | |
1231 nvals++; | |
1232 } | |
1233 if (st->h) { | |
1234 khint_t k; | |
1235 | |
1236 for (k = kh_begin(st->h); k != kh_end(st->h); k++) { | |
1237 if (!kh_exist(st->h, k)) | |
1238 continue; | |
1239 if (nvals >= vals_alloc) { | |
1240 vals_alloc = vals_alloc ? vals_alloc*2 : 1024; | |
1241 vals = realloc(vals, vals_alloc * sizeof(int)); | |
1242 freqs = realloc(freqs, vals_alloc * sizeof(int)); | |
1243 if (!vals || !freqs) | |
1244 return NULL; | |
1245 } | |
1246 vals[nvals]= kh_key(st->h, k); | |
1247 freqs[nvals] = kh_val(st->h, k); | |
1248 assert(freqs[nvals] > 0); | |
1249 ntot += freqs[nvals]; | |
1250 if (max_val < i) max_val = i; | |
1251 if (min_val > i) min_val = i; | |
1252 nvals++; | |
1253 } | |
1254 } | |
1255 | |
1256 assert(nvals > 0); | |
1257 | |
1258 freqs = realloc(freqs, 2*nvals*sizeof(*freqs)); | |
1259 lens = calloc(2*nvals, sizeof(*lens)); | |
1260 if (!lens || !freqs) | |
1261 return NULL; | |
1262 | |
1263 /* Inefficient, use pointers to form chain so we can insert and maintain | |
1264 * a sorted list? This is currently O(nvals^2) complexity. | |
1265 */ | |
1266 for (;;) { | |
1267 int low1 = INT_MAX, low2 = INT_MAX; | |
1268 int ind1 = 0, ind2 = 0; | |
1269 for (i = 0; i < nvals; i++) { | |
1270 if (freqs[i] < 0) | |
1271 continue; | |
1272 if (low1 > freqs[i]) | |
1273 low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i; | |
1274 else if (low2 > freqs[i]) | |
1275 low2 = freqs[i], ind2 = i; | |
1276 } | |
1277 if (low2 == INT_MAX) | |
1278 break; | |
1279 | |
1280 freqs[nvals] = low1 + low2; | |
1281 lens[ind1] = nvals; | |
1282 lens[ind2] = nvals; | |
1283 freqs[ind1] *= -1; | |
1284 freqs[ind2] *= -1; | |
1285 nvals++; | |
1286 } | |
1287 nvals = nvals/2+1; | |
1288 | |
1289 /* Assign lengths */ | |
1290 for (i = 0; i < nvals; i++) { | |
1291 int code_len = 0; | |
1292 for (k = lens[i]; k; k = lens[k]) | |
1293 code_len++; | |
1294 lens[i] = code_len; | |
1295 freqs[i] *= -1; | |
1296 //fprintf(stderr, "%d / %d => %d\n", vals[i], freqs[i], lens[i]); | |
1297 } | |
1298 | |
1299 | |
1300 /* Sort, need in a struct */ | |
1301 if (!(codes = malloc(nvals * sizeof(*codes)))) | |
1302 return NULL; | |
1303 for (i = 0; i < nvals; i++) { | |
1304 codes[i].symbol = vals[i]; | |
1305 codes[i].len = lens[i]; | |
1306 } | |
1307 qsort(codes, nvals, sizeof(*codes), code_sort); | |
1308 | |
1309 /* | |
1310 * Generate canonical codes from lengths. | |
1311 * Sort by length. | |
1312 * Start with 0. | |
1313 * Every new code of same length is +1. | |
1314 * Every new code of new length is +1 then <<1 per extra length. | |
1315 * | |
1316 * /\ | |
1317 * a/\ | |
1318 * /\/\ | |
1319 * bcd/\ | |
1320 * ef | |
1321 * | |
1322 * a 1 0 | |
1323 * b 3 4 (0+1)<<2 | |
1324 * c 3 5 | |
1325 * d 3 6 | |
1326 * e 4 14 (6+1)<<1 | |
1327 * f 5 15 | |
1328 */ | |
1329 code = 0; len = codes[0].len; | |
1330 for (i = 0; i < nvals; i++) { | |
1331 while (len != codes[i].len) { | |
1332 code<<=1; | |
1333 len++; | |
1334 } | |
1335 codes[i].code = code++; | |
1336 | |
1337 if (codes[i].symbol >= -1 && codes[i].symbol < MAX_HUFF) | |
1338 c->e_huffman.val2code[codes[i].symbol+1] = i; | |
1339 | |
1340 //fprintf(stderr, "sym %d, code %d, len %d\n", | |
1341 // codes[i].symbol, codes[i].code, codes[i].len); | |
1342 } | |
1343 | |
1344 free(lens); | |
1345 free(vals); | |
1346 free(freqs); | |
1347 | |
1348 c->e_huffman.codes = codes; | |
1349 c->e_huffman.nvals = nvals; | |
1350 | |
1351 c->free = cram_huffman_encode_free; | |
1352 if (option == E_BYTE || option == E_BYTE_ARRAY) { | |
1353 if (c->e_huffman.codes[0].len == 0) | |
1354 c->encode = cram_huffman_encode_char0; | |
1355 else | |
1356 c->encode = cram_huffman_encode_char; | |
1357 } else { | |
1358 if (c->e_huffman.codes[0].len == 0) | |
1359 c->encode = cram_huffman_encode_int0; | |
1360 else | |
1361 c->encode = cram_huffman_encode_int; | |
1362 } | |
1363 c->store = cram_huffman_encode_store; | |
1364 | |
1365 return c; | |
1366 } | |
1367 | |
1368 /* | |
1369 * --------------------------------------------------------------------------- | |
1370 * BYTE_ARRAY_LEN | |
1371 */ | |
1372 int cram_byte_array_len_decode(cram_slice *slice, cram_codec *c, | |
1373 cram_block *in, char *out, | |
1374 int *out_size) { | |
1375 /* Fetch length */ | |
1376 int32_t len, one = 1; | |
1377 | |
1378 c->byte_array_len.len_codec->decode(slice, c->byte_array_len.len_codec, in, (char *)&len, &one); | |
1379 //printf("ByteArray Len=%d\n", len); | |
1380 | |
1381 if (c->byte_array_len.value_codec) { | |
1382 c->byte_array_len.value_codec->decode(slice, | |
1383 c->byte_array_len.value_codec, | |
1384 in, out, &len); | |
1385 } else { | |
1386 return -1; | |
1387 } | |
1388 | |
1389 *out_size = len; | |
1390 | |
1391 return 0; | |
1392 } | |
1393 | |
1394 void cram_byte_array_len_decode_free(cram_codec *c) { | |
1395 if (!c) return; | |
1396 | |
1397 if (c->byte_array_len.len_codec) | |
1398 c->byte_array_len.len_codec->free(c->byte_array_len.len_codec); | |
1399 | |
1400 if (c->byte_array_len.value_codec) | |
1401 c->byte_array_len.value_codec->free(c->byte_array_len.value_codec); | |
1402 | |
1403 free(c); | |
1404 } | |
1405 | |
1406 cram_codec *cram_byte_array_len_decode_init(char *data, int size, | |
1407 enum cram_external_type option, | |
1408 int version) { | |
1409 cram_codec *c; | |
1410 char *cp = data; | |
1411 int32_t encoding; | |
1412 int32_t sub_size; | |
1413 | |
1414 if (!(c = malloc(sizeof(*c)))) | |
1415 return NULL; | |
1416 | |
1417 c->codec = E_BYTE_ARRAY_LEN; | |
1418 c->decode = cram_byte_array_len_decode; | |
1419 c->free = cram_byte_array_len_decode_free; | |
1420 | |
1421 cp += itf8_get(cp, &encoding); | |
1422 cp += itf8_get(cp, &sub_size); | |
1423 c->byte_array_len.len_codec = cram_decoder_init(encoding, cp, sub_size, | |
1424 E_INT, version); | |
1425 cp += sub_size; | |
1426 | |
1427 cp += itf8_get(cp, &encoding); | |
1428 cp += itf8_get(cp, &sub_size); | |
1429 c->byte_array_len.value_codec = cram_decoder_init(encoding, cp, sub_size, | |
1430 option, version); | |
1431 cp += sub_size; | |
1432 | |
1433 if (cp - data != size) { | |
1434 fprintf(stderr, "Malformed byte_array_len header stream\n"); | |
1435 free(c); | |
1436 return NULL; | |
1437 } | |
1438 | |
1439 return c; | |
1440 } | |
1441 | |
1442 int cram_byte_array_len_encode(cram_slice *slice, cram_codec *c, | |
1443 char *in, int in_size) { | |
1444 int32_t i32 = in_size; | |
1445 int r = 0; | |
1446 | |
1447 r |= c->e_byte_array_len.len_codec->encode(slice, | |
1448 c->e_byte_array_len.len_codec, | |
1449 (char *)&i32, 1); | |
1450 r |= c->e_byte_array_len.val_codec->encode(slice, | |
1451 c->e_byte_array_len.val_codec, | |
1452 in, in_size); | |
1453 return r; | |
1454 } | |
1455 | |
1456 void cram_byte_array_len_encode_free(cram_codec *c) { | |
1457 if (!c) | |
1458 return; | |
1459 | |
1460 if (c->e_byte_array_len.len_codec) | |
1461 c->e_byte_array_len.len_codec->free(c->e_byte_array_len.len_codec); | |
1462 | |
1463 if (c->e_byte_array_len.val_codec) | |
1464 c->e_byte_array_len.val_codec->free(c->e_byte_array_len.val_codec); | |
1465 | |
1466 free(c); | |
1467 } | |
1468 | |
1469 int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b, | |
1470 char *prefix, int version) { | |
1471 int len = 0, len2, len3; | |
1472 cram_codec *tc; | |
1473 cram_block *b_len, *b_val; | |
1474 | |
1475 if (prefix) { | |
1476 size_t l = strlen(prefix); | |
1477 BLOCK_APPEND(b, prefix, l); | |
1478 len += l; | |
1479 } | |
1480 | |
1481 tc = c->e_byte_array_len.len_codec; | |
1482 b_len = cram_new_block(0, 0); | |
1483 len2 = tc->store(tc, b_len, NULL, version); | |
1484 | |
1485 tc = c->e_byte_array_len.val_codec; | |
1486 b_val = cram_new_block(0, 0); | |
1487 len3 = tc->store(tc, b_val, NULL, version); | |
1488 | |
1489 len += itf8_put_blk(b, c->codec); | |
1490 len += itf8_put_blk(b, len2+len3); | |
1491 BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len)); | |
1492 BLOCK_APPEND(b, BLOCK_DATA(b_val), BLOCK_SIZE(b_val)); | |
1493 | |
1494 cram_free_block(b_len); | |
1495 cram_free_block(b_val); | |
1496 | |
1497 return len + len2 + len3; | |
1498 } | |
1499 | |
1500 cram_codec *cram_byte_array_len_encode_init(cram_stats *st, | |
1501 enum cram_external_type option, | |
1502 void *dat, | |
1503 int version) { | |
1504 cram_codec *c; | |
1505 cram_byte_array_len_encoder *e = (cram_byte_array_len_encoder *)dat; | |
1506 | |
1507 c = malloc(sizeof(*c)); | |
1508 if (!c) | |
1509 return NULL; | |
1510 c->codec = E_BYTE_ARRAY_LEN; | |
1511 c->free = cram_byte_array_len_encode_free; | |
1512 c->encode = cram_byte_array_len_encode; | |
1513 c->store = cram_byte_array_len_encode_store; | |
1514 | |
1515 c->e_byte_array_len.len_codec = cram_encoder_init(e->len_encoding, | |
1516 NULL, E_INT, | |
1517 e->len_dat, | |
1518 version); | |
1519 c->e_byte_array_len.val_codec = cram_encoder_init(e->val_encoding, | |
1520 NULL, E_BYTE_ARRAY, | |
1521 e->val_dat, | |
1522 version); | |
1523 | |
1524 return c; | |
1525 } | |
1526 | |
1527 /* | |
1528 * --------------------------------------------------------------------------- | |
1529 * BYTE_ARRAY_STOP | |
1530 */ | |
1531 static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c, | |
1532 cram_block *in, char *out, | |
1533 int *out_size) { | |
1534 int i; | |
1535 cram_block *b = NULL; | |
1536 char *cp, ch; | |
1537 | |
1538 if (slice->block_by_id) { | |
1539 if (!(b = slice->block_by_id[c->byte_array_stop.content_id])) | |
1540 return *out_size?-1:0; | |
1541 } else { | |
1542 for (i = 0; i < slice->hdr->num_blocks; i++) { | |
1543 b = slice->block[i]; | |
1544 if (b && b->content_type == EXTERNAL && | |
1545 b->content_id == c->byte_array_stop.content_id) { | |
1546 break; | |
1547 } | |
1548 } | |
1549 if (i == slice->hdr->num_blocks || !b) | |
1550 return -1; | |
1551 } | |
1552 | |
1553 if (b->idx >= b->uncomp_size) | |
1554 return -1; | |
1555 | |
1556 cp = (char *)b->data + b->idx; | |
1557 while ((ch = *cp) != (char)c->byte_array_stop.stop) { | |
1558 if (cp - (char *)b->data >= b->uncomp_size) | |
1559 return -1; | |
1560 *out++ = ch; | |
1561 cp++; | |
1562 } | |
1563 | |
1564 *out_size = cp - (char *)(b->data + b->idx); | |
1565 b->idx = cp - (char *)b->data + 1; | |
1566 | |
1567 return 0; | |
1568 } | |
1569 | |
1570 int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c, | |
1571 cram_block *in, char *out_, | |
1572 int *out_size) { | |
1573 cram_block *b = NULL; | |
1574 cram_block *out = (cram_block *)out_; | |
1575 char *cp, *out_cp, *cp_end; | |
1576 char stop; | |
1577 | |
1578 if (slice->block_by_id) { | |
1579 if (!(b = slice->block_by_id[c->byte_array_stop.content_id])) | |
1580 return *out_size?-1:0; | |
1581 } else { | |
1582 int i; | |
1583 for (i = 0; i < slice->hdr->num_blocks; i++) { | |
1584 b = slice->block[i]; | |
1585 if (b && b->content_type == EXTERNAL && | |
1586 b->content_id == c->byte_array_stop.content_id) { | |
1587 break; | |
1588 } | |
1589 } | |
1590 if (i == slice->hdr->num_blocks || !b) | |
1591 return -1; | |
1592 } | |
1593 | |
1594 if (b->idx >= b->uncomp_size) | |
1595 return -1; | |
1596 cp = (char *)b->data + b->idx; | |
1597 cp_end = (char *)b->data + b->uncomp_size; | |
1598 out_cp = (char *)BLOCK_END(out); | |
1599 | |
1600 stop = c->byte_array_stop.stop; | |
1601 if (cp_end - cp < out->alloc - out->byte) { | |
1602 while (*cp != stop && cp != cp_end) | |
1603 *out_cp++ = *cp++; | |
1604 BLOCK_SIZE(out) = out_cp - (char *)BLOCK_DATA(out); | |
1605 } else { | |
1606 char *cp_start; | |
1607 for (cp_start = cp; *cp != stop && cp != cp_end; cp++) | |
1608 ; | |
1609 BLOCK_APPEND(out, cp_start, cp - cp_start); | |
1610 BLOCK_GROW(out, cp - cp_start); | |
1611 } | |
1612 | |
1613 *out_size = cp - (char *)(b->data + b->idx); | |
1614 b->idx = cp - (char *)b->data + 1; | |
1615 | |
1616 return 0; | |
1617 } | |
1618 | |
1619 void cram_byte_array_stop_decode_free(cram_codec *c) { | |
1620 if (!c) return; | |
1621 | |
1622 free(c); | |
1623 } | |
1624 | |
1625 cram_codec *cram_byte_array_stop_decode_init(char *data, int size, | |
1626 enum cram_external_type option, | |
1627 int version) { | |
1628 cram_codec *c; | |
1629 unsigned char *cp = (unsigned char *)data; | |
1630 | |
1631 if (!(c = malloc(sizeof(*c)))) | |
1632 return NULL; | |
1633 | |
1634 c->codec = E_BYTE_ARRAY_STOP; | |
1635 c->decode = (option == E_BYTE_ARRAY_BLOCK) | |
1636 ? cram_byte_array_stop_decode_block | |
1637 : cram_byte_array_stop_decode_char; | |
1638 c->free = cram_byte_array_stop_decode_free; | |
1639 | |
1640 c->byte_array_stop.stop = *cp++; | |
1641 if (CRAM_MAJOR_VERS(version) == 1) { | |
1642 c->byte_array_stop.content_id = cp[0] + (cp[1]<<8) + (cp[2]<<16) | |
1643 + (cp[3]<<24); | |
1644 cp += 4; | |
1645 } else { | |
1646 cp += itf8_get(cp, &c->byte_array_stop.content_id); | |
1647 } | |
1648 | |
1649 if ((char *)cp - data != size) { | |
1650 fprintf(stderr, "Malformed byte_array_stop header stream\n"); | |
1651 free(c); | |
1652 return NULL; | |
1653 } | |
1654 | |
1655 return c; | |
1656 } | |
1657 | |
1658 int cram_byte_array_stop_encode(cram_slice *slice, cram_codec *c, | |
1659 char *in, int in_size) { | |
1660 BLOCK_APPEND(c->out, in, in_size); | |
1661 BLOCK_APPEND_CHAR(c->out, c->e_byte_array_stop.stop); | |
1662 return 0; | |
1663 } | |
1664 | |
1665 void cram_byte_array_stop_encode_free(cram_codec *c) { | |
1666 if (!c) | |
1667 return; | |
1668 free(c); | |
1669 } | |
1670 | |
1671 int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b, | |
1672 char *prefix, int version) { | |
1673 int len = 0; | |
1674 char buf[20], *cp = buf; | |
1675 | |
1676 if (prefix) { | |
1677 size_t l = strlen(prefix); | |
1678 BLOCK_APPEND(b, prefix, l); | |
1679 len += l; | |
1680 } | |
1681 | |
1682 cp += itf8_put(cp, c->codec); | |
1683 | |
1684 if (CRAM_MAJOR_VERS(version) == 1) { | |
1685 cp += itf8_put(cp, 5); | |
1686 *cp++ = c->e_byte_array_stop.stop; | |
1687 *cp++ = (c->e_byte_array_stop.content_id >> 0) & 0xff; | |
1688 *cp++ = (c->e_byte_array_stop.content_id >> 8) & 0xff; | |
1689 *cp++ = (c->e_byte_array_stop.content_id >> 16) & 0xff; | |
1690 *cp++ = (c->e_byte_array_stop.content_id >> 24) & 0xff; | |
1691 } else { | |
1692 cp += itf8_put(cp, 1 + itf8_size(c->e_byte_array_stop.content_id)); | |
1693 *cp++ = c->e_byte_array_stop.stop; | |
1694 cp += itf8_put(cp, c->e_byte_array_stop.content_id); | |
1695 } | |
1696 | |
1697 BLOCK_APPEND(b, buf, cp-buf); | |
1698 len += cp-buf; | |
1699 | |
1700 return len; | |
1701 } | |
1702 | |
1703 cram_codec *cram_byte_array_stop_encode_init(cram_stats *st, | |
1704 enum cram_external_type option, | |
1705 void *dat, | |
1706 int version) { | |
1707 cram_codec *c; | |
1708 | |
1709 c = malloc(sizeof(*c)); | |
1710 if (!c) | |
1711 return NULL; | |
1712 c->codec = E_BYTE_ARRAY_STOP; | |
1713 c->free = cram_byte_array_stop_encode_free; | |
1714 c->encode = cram_byte_array_stop_encode; | |
1715 c->store = cram_byte_array_stop_encode_store; | |
1716 | |
1717 c->e_byte_array_stop.stop = ((int *)dat)[0]; | |
1718 c->e_byte_array_stop.content_id = ((int *)dat)[1]; | |
1719 | |
1720 return c; | |
1721 } | |
1722 | |
1723 /* | |
1724 * --------------------------------------------------------------------------- | |
1725 */ | |
1726 | |
1727 char *cram_encoding2str(enum cram_encoding t) { | |
1728 switch (t) { | |
1729 case E_NULL: return "NULL"; | |
1730 case E_EXTERNAL: return "EXTERNAL"; | |
1731 case E_GOLOMB: return "GOLOMB"; | |
1732 case E_HUFFMAN: return "HUFFMAN"; | |
1733 case E_BYTE_ARRAY_LEN: return "BYTE_ARRAY_LEN"; | |
1734 case E_BYTE_ARRAY_STOP: return "BYTE_ARRAY_STOP"; | |
1735 case E_BETA: return "BETA"; | |
1736 case E_SUBEXP: return "SUBEXP"; | |
1737 case E_GOLOMB_RICE: return "GOLOMB_RICE"; | |
1738 case E_GAMMA: return "GAMMA"; | |
1739 } | |
1740 return "?"; | |
1741 } | |
1742 | |
1743 static cram_codec *(*decode_init[])(char *data, | |
1744 int size, | |
1745 enum cram_external_type option, | |
1746 int version) = { | |
1747 NULL, | |
1748 cram_external_decode_init, | |
1749 NULL, | |
1750 cram_huffman_decode_init, | |
1751 cram_byte_array_len_decode_init, | |
1752 cram_byte_array_stop_decode_init, | |
1753 cram_beta_decode_init, | |
1754 cram_subexp_decode_init, | |
1755 NULL, | |
1756 cram_gamma_decode_init, | |
1757 }; | |
1758 | |
1759 cram_codec *cram_decoder_init(enum cram_encoding codec, | |
1760 char *data, int size, | |
1761 enum cram_external_type option, | |
1762 int version) { | |
1763 if (decode_init[codec]) { | |
1764 return decode_init[codec](data, size, option, version); | |
1765 } else { | |
1766 fprintf(stderr, "Unimplemented codec of type %s\n", codec2str(codec)); | |
1767 return NULL; | |
1768 } | |
1769 } | |
1770 | |
1771 static cram_codec *(*encode_init[])(cram_stats *stx, | |
1772 enum cram_external_type option, | |
1773 void *opt, | |
1774 int version) = { | |
1775 NULL, | |
1776 cram_external_encode_init, | |
1777 NULL, | |
1778 cram_huffman_encode_init, | |
1779 cram_byte_array_len_encode_init, | |
1780 cram_byte_array_stop_encode_init, | |
1781 cram_beta_encode_init, | |
1782 NULL, //cram_subexp_encode_init, | |
1783 NULL, | |
1784 NULL, //cram_gamma_encode_init, | |
1785 }; | |
1786 | |
1787 cram_codec *cram_encoder_init(enum cram_encoding codec, | |
1788 cram_stats *st, | |
1789 enum cram_external_type option, | |
1790 void *dat, | |
1791 int version) { | |
1792 if (st && !st->nvals) | |
1793 return NULL; | |
1794 | |
1795 if (encode_init[codec]) { | |
1796 cram_codec *r; | |
1797 if ((r = encode_init[codec](st, option, dat, version))) | |
1798 r->out = NULL; | |
1799 return r; | |
1800 } else { | |
1801 fprintf(stderr, "Unimplemented codec of type %s\n", codec2str(codec)); | |
1802 abort(); | |
1803 } | |
1804 } | |
1805 | |
1806 /* | |
1807 * Returns the content_id used by this codec, also in id2 if byte_array_len. | |
1808 * Returns -1 for the CORE block and -2 for unneeded. | |
1809 * id2 is only filled out for BYTE_ARRAY_LEN which uses 2 codecs. | |
1810 */ | |
1811 int cram_codec_to_id(cram_codec *c, int *id2) { | |
1812 int bnum1, bnum2 = -2; | |
1813 | |
1814 switch (c->codec) { | |
1815 case E_HUFFMAN: | |
1816 bnum1 = c->huffman.ncodes == 1 ? -2 : -1; | |
1817 break; | |
1818 case E_GOLOMB: | |
1819 case E_BETA: | |
1820 case E_SUBEXP: | |
1821 case E_GOLOMB_RICE: | |
1822 case E_GAMMA: | |
1823 bnum1 = -1; | |
1824 break; | |
1825 case E_EXTERNAL: | |
1826 bnum1 = c->external.content_id; | |
1827 break; | |
1828 case E_BYTE_ARRAY_LEN: | |
1829 bnum1 = cram_codec_to_id(c->byte_array_len.len_codec, NULL); | |
1830 bnum2 = cram_codec_to_id(c->byte_array_len.value_codec, NULL); | |
1831 break; | |
1832 case E_BYTE_ARRAY_STOP: | |
1833 bnum1 = c->byte_array_stop.content_id; | |
1834 break; | |
1835 case E_NULL: | |
1836 bnum1 = -2; | |
1837 break; | |
1838 default: | |
1839 fprintf(stderr, "Unknown codec type %d\n", c->codec); | |
1840 bnum1 = -1; | |
1841 } | |
1842 | |
1843 if (id2) | |
1844 *id2 = bnum2; | |
1845 return bnum1; | |
1846 } |