comparison ezBAMQC/src/htslib/cram/cram_codecs.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /*
2 Copyright (c) 2012-2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * FIXME: add checking of cram_external_type to return NULL on unsupported
33 * {codec,type} tuples.
34 */
35
36 #ifdef HAVE_CONFIG_H
37 #include "io_lib_config.h"
38 #endif
39
40 #include <stdlib.h>
41 #include <string.h>
42 #include <assert.h>
43 #include <limits.h>
44
45 #include "cram/cram.h"
46
47 static char *codec2str(enum cram_encoding codec) {
48 switch (codec) {
49 case E_NULL: return "NULL";
50 case E_EXTERNAL: return "EXTERNAL";
51 case E_GOLOMB: return "GOLOMB";
52 case E_HUFFMAN: return "HUFFMAN";
53 case E_BYTE_ARRAY_LEN: return "BYTE_ARRAY_LEN";
54 case E_BYTE_ARRAY_STOP: return "BYTE_ARRAY_STOP";
55 case E_BETA: return "BETA";
56 case E_SUBEXP: return "SUBEXP";
57 case E_GOLOMB_RICE: return "GOLOMB_RICE";
58 case E_GAMMA: return "GAMMA";
59 }
60
61 return "(unknown)";
62 }
63
64 /*
65 * ---------------------------------------------------------------------------
66 * Block bit-level I/O functions.
67 * All defined static here to promote easy inlining by the compiler.
68 */
69
70 #if 0
71 /* Get a single bit, MSB first */
72 static signed int get_bit_MSB(cram_block *block) {
73 unsigned int val;
74
75 if (block->byte > block->alloc)
76 return -1;
77
78 val = block->data[block->byte] >> block->bit;
79 if (--block->bit == -1) {
80 block->bit = 7;
81 block->byte++;
82 //printf("(%02X)", block->data[block->byte]);
83 }
84
85 //printf("-B%d-", val&1);
86
87 return val & 1;
88 }
89 #endif
90
91 /*
92 * Count number of successive 0 and 1 bits
93 */
94 static int get_one_bits_MSB(cram_block *block) {
95 int n = 0, b;
96 do {
97 b = block->data[block->byte] >> block->bit;
98 if (--block->bit == -1) {
99 block->bit = 7;
100 block->byte++;
101 }
102 n++;
103 } while (b&1);
104
105 return n-1;
106 }
107
108 static int get_zero_bits_MSB(cram_block *block) {
109 int n = 0, b;
110 do {
111 b = block->data[block->byte] >> block->bit;
112 if (--block->bit == -1) {
113 block->bit = 7;
114 block->byte++;
115 }
116 n++;
117 } while (!(b&1));
118
119 return n-1;
120 }
121
122 #if 0
123 /* Stores a single bit */
124 static void store_bit_MSB(cram_block *block, unsigned int bit) {
125 if (block->byte >= block->alloc) {
126 block->alloc = block->alloc ? block->alloc*2 : 1024;
127 block->data = realloc(block->data, block->alloc);
128 }
129
130 if (bit)
131 block->data[block->byte] |= (1 << block->bit);
132
133 if (--block->bit == -1) {
134 block->bit = 7;
135 block->byte++;
136 block->data[block->byte] = 0;
137 }
138 }
139 #endif
140
141 #if 0
142 /* Rounds to the next whole byte boundary first */
143 static void store_bytes_MSB(cram_block *block, char *bytes, int len) {
144 if (block->bit != 7) {
145 block->bit = 7;
146 block->byte++;
147 }
148
149 while (block->byte + len >= block->alloc) {
150 block->alloc = block->alloc ? block->alloc*2 : 1024;
151 block->data = realloc(block->data, block->alloc);
152 }
153
154 memcpy(&block->data[block->byte], bytes, len);
155 block->byte += len;
156 }
157 #endif
158
159 /* Local optimised copy for inlining */
160 static inline unsigned int get_bits_MSB(cram_block *block, int nbits) {
161 unsigned int val = 0;
162 int i;
163
164 #if 0
165 // Fits within the current byte */
166 if (nbits <= block->bit+1) {
167 val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
168 if ((block->bit -= nbits) == -1) {
169 block->bit = 7;
170 block->byte++;
171 }
172 return val;
173 }
174
175 // partial first byte
176 val = block->data[block->byte] & ((1<<(block->bit+1))-1);
177 nbits -= block->bit+1;
178 block->bit = 7;
179 block->byte++;
180
181 // whole middle bytes
182 while (nbits >= 8) {
183 val = (val << 8) | block->data[block->byte++];
184 nbits -= 8;
185 }
186
187 val <<= nbits;
188 val |= (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
189 block->bit -= nbits;
190 return val;
191 #endif
192
193 #if 0
194 /* Inefficient implementation! */
195 //printf("{");
196 for (i = 0; i < nbits; i++)
197 //val = (val << 1) | get_bit_MSB(block);
198 GET_BIT_MSB(block, val);
199 #endif
200
201 #if 1
202 /* Combination of 1st two methods */
203 if (nbits <= block->bit+1) {
204 val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
205 if ((block->bit -= nbits) == -1) {
206 block->bit = 7;
207 block->byte++;
208 }
209 return val;
210 }
211
212 switch(nbits) {
213 // case 15: GET_BIT_MSB(block, val);
214 // case 14: GET_BIT_MSB(block, val);
215 // case 13: GET_BIT_MSB(block, val);
216 // case 12: GET_BIT_MSB(block, val);
217 // case 11: GET_BIT_MSB(block, val);
218 // case 10: GET_BIT_MSB(block, val);
219 // case 9: GET_BIT_MSB(block, val);
220 case 8: GET_BIT_MSB(block, val);
221 case 7: GET_BIT_MSB(block, val);
222 case 6: GET_BIT_MSB(block, val);
223 case 5: GET_BIT_MSB(block, val);
224 case 4: GET_BIT_MSB(block, val);
225 case 3: GET_BIT_MSB(block, val);
226 case 2: GET_BIT_MSB(block, val);
227 case 1: GET_BIT_MSB(block, val);
228 break;
229
230 default:
231 for (i = 0; i < nbits; i++)
232 //val = (val << 1) | get_bit_MSB(block);
233 GET_BIT_MSB(block, val);
234 }
235 #endif
236
237 //printf("=0x%x}", val);
238
239 return val;
240 }
241
242 /*
243 * Can store up to 24-bits worth of data encoded in an integer value
244 * Possibly we'd want to have a less optimal store_bits function when dealing
245 * with nbits > 24, but for now we assume the codes generated are never
246 * that big. (Given this is only possible with 121392 or more
247 * characters with exactly the correct frequency distribution we check
248 * for it elsewhere.)
249 */
250 static int store_bits_MSB(cram_block *block, unsigned int val, int nbits) {
251 /* fprintf(stderr, " store_bits: %02x %d\n", val, nbits); */
252
253 /*
254 * Use slow mode until we tweak the huffman generator to never generate
255 * codes longer than 24-bits.
256 */
257 unsigned int mask;
258
259 if (block->byte+4 >= block->alloc) {
260 if (block->byte) {
261 block->alloc *= 2;
262 block->data = realloc(block->data, block->alloc + 4);
263 if (!block->data)
264 return -1;
265 } else {
266 block->alloc = 1024;
267 block->data = realloc(block->data, block->alloc + 4);
268 if (!block->data)
269 return -1;
270 block->data[0] = 0; // initialise first byte of buffer
271 }
272 }
273
274 /* fits in current bit-field */
275 if (nbits <= block->bit+1) {
276 block->data[block->byte] |= (val << (block->bit+1-nbits));
277 if ((block->bit-=nbits) == -1) {
278 block->bit = 7;
279 block->byte++;
280 block->data[block->byte] = 0;
281 }
282 return 0;
283 }
284
285 block->data[block->byte] |= (val >> (nbits -= block->bit+1));
286 block->bit = 7;
287 block->byte++;
288 block->data[block->byte] = 0;
289
290 mask = 1<<(nbits-1);
291 do {
292 if (val & mask)
293 block->data[block->byte] |= (1 << block->bit);
294 if (--block->bit == -1) {
295 block->bit = 7;
296 block->byte++;
297 block->data[block->byte] = 0;
298 }
299 mask >>= 1;
300 } while(--nbits);
301
302 return 0;
303 }
304
305 /*
306 * Returns the next 'size' bytes from a block, or NULL if insufficient
307 * data left.This is just a pointer into the block data and not an
308 * allocated object, so do not free the result.
309 */
310 static char *cram_extract_block(cram_block *b, int size) {
311 char *cp = (char *)b->data + b->idx;
312 b->idx += size;
313 if (b->idx > b->uncomp_size)
314 return NULL;
315
316 return cp;
317 }
318
319 /*
320 * ---------------------------------------------------------------------------
321 * EXTERNAL
322 */
323 int cram_external_decode_int(cram_slice *slice, cram_codec *c,
324 cram_block *in, char *out, int *out_size) {
325 int i;
326 char *cp;
327 cram_block *b = NULL;
328
329 /* Find the external block */
330 if (slice->block_by_id) {
331 if (!(b = slice->block_by_id[c->external.content_id]))
332 return *out_size?-1:0;
333 } else {
334 for (i = 0; i < slice->hdr->num_blocks; i++) {
335 b = slice->block[i];
336 if (b && b->content_type == EXTERNAL &&
337 b->content_id == c->external.content_id) {
338 break;
339 }
340 }
341 if (i == slice->hdr->num_blocks || !b)
342 return -1;
343 }
344
345 cp = (char *)b->data + b->idx;
346 // E_INT and E_LONG are guaranteed single item queries
347 b->idx += itf8_get(cp, (int32_t *)out);
348 *out_size = 1;
349
350 return 0;
351 }
352
353 int cram_external_decode_char(cram_slice *slice, cram_codec *c,
354 cram_block *in, char *out,
355 int *out_size) {
356 int i;
357 char *cp;
358 cram_block *b = NULL;
359
360 /* Find the external block */
361 if (slice->block_by_id) {
362 if (!(b = slice->block_by_id[c->external.content_id]))
363 return *out_size?-1:0;
364 } else {
365 for (i = 0; i < slice->hdr->num_blocks; i++) {
366 b = slice->block[i];
367 if (b && b->content_type == EXTERNAL &&
368 b->content_id == c->external.content_id) {
369 break;
370 }
371 }
372 if (i == slice->hdr->num_blocks || !b)
373 return -1;
374 }
375
376 cp = cram_extract_block(b, *out_size);
377 if (!cp)
378 return -1;
379
380 memcpy(out, cp, *out_size);
381 return 0;
382 }
383
384 static int cram_external_decode_block(cram_slice *slice, cram_codec *c,
385 cram_block *in, char *out_,
386 int *out_size) {
387 int i;
388 char *cp;
389 cram_block *b = NULL;
390 cram_block *out = (cram_block *)out_;
391
392 /* Find the external block */
393 if (slice->block_by_id) {
394 if (!(b = slice->block_by_id[c->external.content_id]))
395 return *out_size?-1:0;
396 } else {
397 for (i = 0; i < slice->hdr->num_blocks; i++) {
398 b = slice->block[i];
399 if (b && b->content_type == EXTERNAL &&
400 b->content_id == c->external.content_id) {
401 break;
402 }
403 }
404 if (i == slice->hdr->num_blocks || !b)
405 return -1;
406 }
407
408 cp = cram_extract_block(b, *out_size);
409 if (!cp)
410 return -1;
411
412 BLOCK_APPEND(out, cp, *out_size);
413 return 0;
414 }
415
416 void cram_external_decode_free(cram_codec *c) {
417 if (c)
418 free(c);
419 }
420
421 cram_codec *cram_external_decode_init(char *data, int size,
422 enum cram_external_type option,
423 int version) {
424 cram_codec *c;
425 char *cp = data;
426
427 if (!(c = malloc(sizeof(*c))))
428 return NULL;
429
430 c->codec = E_EXTERNAL;
431 if (option == E_INT || option == E_LONG)
432 c->decode = cram_external_decode_int;
433 else if (option == E_BYTE_ARRAY || option == E_BYTE)
434 c->decode = cram_external_decode_char;
435 else
436 c->decode = cram_external_decode_block;
437 c->free = cram_external_decode_free;
438
439 cp += itf8_get(cp, &c->external.content_id);
440
441 if (cp - data != size) {
442 fprintf(stderr, "Malformed external header stream\n");
443 free(c);
444 return NULL;
445 }
446
447 c->external.type = option;
448
449 return c;
450 }
451
452 int cram_external_encode_int(cram_slice *slice, cram_codec *c,
453 char *in, int in_size) {
454 uint32_t *i32 = (uint32_t *)in;
455
456 itf8_put_blk(c->out, *i32);
457 return 0;
458 }
459
460 int cram_external_encode_char(cram_slice *slice, cram_codec *c,
461 char *in, int in_size) {
462 BLOCK_APPEND(c->out, in, in_size);
463 return 0;
464 }
465
466 void cram_external_encode_free(cram_codec *c) {
467 if (!c)
468 return;
469 free(c);
470 }
471
472 int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix,
473 int version) {
474 char tmp[99], *tp = tmp;
475 int len = 0;
476
477 if (prefix) {
478 size_t l = strlen(prefix);
479 BLOCK_APPEND(b, prefix, l);
480 len += l;
481 }
482
483 tp += itf8_put(tp, c->e_external.content_id);
484 len += itf8_put_blk(b, c->codec);
485 len += itf8_put_blk(b, tp-tmp);
486 BLOCK_APPEND(b, tmp, tp-tmp);
487 len += tp-tmp;
488
489 return len;
490 }
491
492 cram_codec *cram_external_encode_init(cram_stats *st,
493 enum cram_external_type option,
494 void *dat,
495 int version) {
496 cram_codec *c;
497
498 c = malloc(sizeof(*c));
499 if (!c)
500 return NULL;
501 c->codec = E_EXTERNAL;
502 c->free = cram_external_encode_free;
503 if (option == E_INT || option == E_LONG)
504 c->encode = cram_external_encode_int;
505 else if (option == E_BYTE_ARRAY || option == E_BYTE)
506 c->encode = cram_external_encode_char;
507 else
508 abort();
509 c->store = cram_external_encode_store;
510
511 c->e_external.content_id = (size_t)dat;
512
513 return c;
514 }
515
516 /*
517 * ---------------------------------------------------------------------------
518 * BETA
519 */
520 int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
521 int32_t *out_i = (int32_t *)out;
522 int i, n;
523
524 if (c->beta.nbits) {
525 for (i = 0, n = *out_size; i < n; i++)
526 out_i[i] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset;
527 } else {
528 for (i = 0, n = *out_size; i < n; i++)
529 out_i[i] = -c->beta.offset;
530 }
531
532 return 0;
533 }
534
535 int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
536 int i, n;
537
538 if (c->beta.nbits) {
539 for (i = 0, n = *out_size; i < n; i++)
540 out[i] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset;
541 } else {
542 for (i = 0, n = *out_size; i < n; i++)
543 out[i] = -c->beta.offset;
544 }
545
546 return 0;
547 }
548
549 void cram_beta_decode_free(cram_codec *c) {
550 if (c)
551 free(c);
552 }
553
554 cram_codec *cram_beta_decode_init(char *data, int size,
555 enum cram_external_type option,
556 int version) {
557 cram_codec *c;
558 char *cp = data;
559
560 if (!(c = malloc(sizeof(*c))))
561 return NULL;
562
563 c->codec = E_BETA;
564 if (option == E_INT || option == E_LONG)
565 c->decode = cram_beta_decode_int;
566 else if (option == E_BYTE_ARRAY || option == E_BYTE)
567 c->decode = cram_beta_decode_char;
568 else
569 abort();
570 c->free = cram_beta_decode_free;
571
572 cp += itf8_get(cp, &c->beta.offset);
573 cp += itf8_get(cp, &c->beta.nbits);
574
575 if (cp - data != size) {
576 fprintf(stderr, "Malformed beta header stream\n");
577 free(c);
578 return NULL;
579 }
580
581 return c;
582 }
583
584 int cram_beta_encode_store(cram_codec *c, cram_block *b,
585 char *prefix, int version) {
586 int len = 0;
587
588 if (prefix) {
589 size_t l = strlen(prefix);
590 BLOCK_APPEND(b, prefix, l);
591 len += l;
592 }
593
594 len += itf8_put_blk(b, c->codec);
595 len += itf8_put_blk(b, itf8_size(c->e_beta.offset)
596 + itf8_size(c->e_beta.nbits)); // codec length
597 len += itf8_put_blk(b, c->e_beta.offset);
598 len += itf8_put_blk(b, c->e_beta.nbits);
599
600 return len;
601 }
602
603 int cram_beta_encode_int(cram_slice *slice, cram_codec *c,
604 char *in, int in_size) {
605 int *syms = (int *)in;
606 int i, r = 0;
607
608 for (i = 0; i < in_size; i++)
609 r |= store_bits_MSB(c->out, syms[i] + c->e_beta.offset,
610 c->e_beta.nbits);
611
612 return r;
613 }
614
615 int cram_beta_encode_char(cram_slice *slice, cram_codec *c,
616 char *in, int in_size) {
617 unsigned char *syms = (unsigned char *)in;
618 int i, r = 0;
619
620 for (i = 0; i < in_size; i++)
621 r |= store_bits_MSB(c->out, syms[i] + c->e_beta.offset,
622 c->e_beta.nbits);
623
624 return r;
625 }
626
627 void cram_beta_encode_free(cram_codec *c) {
628 if (c) free(c);
629 }
630
631 cram_codec *cram_beta_encode_init(cram_stats *st,
632 enum cram_external_type option,
633 void *dat,
634 int version) {
635 cram_codec *c;
636 int min_val, max_val, len = 0;
637
638 c = malloc(sizeof(*c));
639 if (!c)
640 return NULL;
641 c->codec = E_BETA;
642 c->free = cram_beta_encode_free;
643 if (option == E_INT)
644 c->encode = cram_beta_encode_int;
645 else
646 c->encode = cram_beta_encode_char;
647 c->store = cram_beta_encode_store;
648
649 if (dat) {
650 min_val = ((int *)dat)[0];
651 max_val = ((int *)dat)[1];
652 } else {
653 min_val = INT_MAX;
654 max_val = INT_MIN;
655 int i;
656 for (i = 0; i < MAX_STAT_VAL; i++) {
657 if (!st->freqs[i])
658 continue;
659 if (min_val > i)
660 min_val = i;
661 max_val = i;
662 }
663 if (st->h) {
664 khint_t k;
665
666 for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
667 if (!kh_exist(st->h, k))
668 continue;
669
670 i = kh_key(st->h, k);
671 if (min_val > i)
672 min_val = i;
673 if (max_val < i)
674 max_val = i;
675 }
676 }
677 }
678
679 assert(max_val >= min_val);
680 c->e_beta.offset = -min_val;
681 max_val -= min_val;
682 while (max_val) {
683 len++;
684 max_val >>= 1;
685 }
686 c->e_beta.nbits = len;
687
688 return c;
689 }
690
691 /*
692 * ---------------------------------------------------------------------------
693 * SUBEXP
694 */
695 int cram_subexp_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
696 int32_t *out_i = (int32_t *)out;
697 int n, count;
698 int k = c->subexp.k;
699
700 for (count = 0, n = *out_size; count < n; count++) {
701 int i = 0, tail;
702 int val;
703
704 /* Get number of 1s */
705 //while (get_bit_MSB(in) == 1) i++;
706 i = get_one_bits_MSB(in);
707
708 /*
709 * Val is
710 * i > 0: 2^(k+i-1) + k+i-1 bits
711 * i = 0: k bits
712 */
713 if (i) {
714 tail = i + k-1;
715 val = 0;
716 while (tail) {
717 //val = val<<1; val |= get_bit_MSB(in);
718 GET_BIT_MSB(in, val);
719 tail--;
720 }
721 val += 1 << (i + k-1);
722 } else {
723 tail = k;
724 val = 0;
725 while (tail) {
726 //val = val<<1; val |= get_bit_MSB(in);
727 GET_BIT_MSB(in, val);
728 tail--;
729 }
730 }
731
732 out_i[count] = val - c->subexp.offset;
733 }
734
735 return 0;
736 }
737
738 void cram_subexp_decode_free(cram_codec *c) {
739 if (c)
740 free(c);
741 }
742
743 cram_codec *cram_subexp_decode_init(char *data, int size,
744 enum cram_external_type option,
745 int version) {
746 cram_codec *c;
747 char *cp = data;
748
749 if (!(c = malloc(sizeof(*c))))
750 return NULL;
751
752 c->codec = E_SUBEXP;
753 c->decode = cram_subexp_decode;
754 c->free = cram_subexp_decode_free;
755
756 cp += itf8_get(cp, &c->subexp.offset);
757 cp += itf8_get(cp, &c->subexp.k);
758
759 if (cp - data != size) {
760 fprintf(stderr, "Malformed subexp header stream\n");
761 free(c);
762 return NULL;
763 }
764
765 return c;
766 }
767
768 /*
769 * ---------------------------------------------------------------------------
770 * GAMMA
771 */
772 int cram_gamma_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
773 int32_t *out_i = (int32_t *)out;
774 int i, n;
775
776 for (i = 0, n = *out_size; i < n; i++) {
777 int nz = 0;
778 int val;
779 //while (get_bit_MSB(in) == 0) nz++;
780 nz = get_zero_bits_MSB(in);
781 val = 1;
782 while (nz > 0) {
783 //val <<= 1; val |= get_bit_MSB(in);
784 GET_BIT_MSB(in, val);
785 nz--;
786 }
787
788 out_i[i] = val - c->gamma.offset;
789 }
790
791 return 0;
792 }
793
794 void cram_gamma_decode_free(cram_codec *c) {
795 if (c)
796 free(c);
797 }
798
799 cram_codec *cram_gamma_decode_init(char *data, int size,
800 enum cram_external_type option,
801 int version) {
802 cram_codec *c;
803 char *cp = data;
804
805 if (!(c = malloc(sizeof(*c))))
806 return NULL;
807
808 c->codec = E_GAMMA;
809 c->decode = cram_gamma_decode;
810 c->free = cram_gamma_decode_free;
811
812 cp += itf8_get(cp, &c->gamma.offset);
813
814 if (cp - data != size) {
815 fprintf(stderr, "Malformed gamma header stream\n");
816 free(c);
817 return NULL;
818 }
819
820 return c;
821 }
822
823 /*
824 * ---------------------------------------------------------------------------
825 * HUFFMAN
826 */
827
828 static int code_sort(const void *vp1, const void *vp2) {
829 const cram_huffman_code *c1 = (const cram_huffman_code *)vp1;
830 const cram_huffman_code *c2 = (const cram_huffman_code *)vp2;
831
832 if (c1->len != c2->len)
833 return c1->len - c2->len;
834 else
835 return c1->symbol - c2->symbol;
836 }
837
838 void cram_huffman_decode_free(cram_codec *c) {
839 if (!c)
840 return;
841
842 if (c->huffman.codes)
843 free(c->huffman.codes);
844 free(c);
845 }
846
847 int cram_huffman_decode_char0(cram_slice *slice, cram_codec *c,
848 cram_block *in, char *out, int *out_size) {
849 int i, n;
850
851 /* Special case of 0 length codes */
852 for (i = 0, n = *out_size; i < n; i++) {
853 out[i] = c->huffman.codes[0].symbol;
854 }
855 return 0;
856 }
857
858 int cram_huffman_decode_char(cram_slice *slice, cram_codec *c,
859 cram_block *in, char *out, int *out_size) {
860 int i, n, ncodes = c->huffman.ncodes;
861 const cram_huffman_code * const codes = c->huffman.codes;
862
863 for (i = 0, n = *out_size; i < n; i++) {
864 int idx = 0;
865 int val = 0, len = 0, last_len = 0;
866
867 for (;;) {
868 int dlen = codes[idx].len - last_len;
869 if (dlen <= 0 || (in->alloc - in->byte)*8 + in->bit + 7 < dlen)
870 return -1;
871
872 //val <<= dlen;
873 //val |= get_bits_MSB(in, dlen);
874 //last_len = (len += dlen);
875
876 last_len = (len += dlen);
877 for (; dlen; dlen--) GET_BIT_MSB(in, val);
878
879 idx = val - codes[idx].p;
880 if (idx >= ncodes || idx < 0)
881 return -1;
882
883 if (codes[idx].code == val && codes[idx].len == len) {
884 out[i] = codes[idx].symbol;
885 break;
886 }
887 }
888 }
889
890 return 0;
891 }
892
893 int cram_huffman_decode_int0(cram_slice *slice, cram_codec *c,
894 cram_block *in, char *out, int *out_size) {
895 int32_t *out_i = (int32_t *)out;
896 int i, n;
897 const cram_huffman_code * const codes = c->huffman.codes;
898
899 /* Special case of 0 length codes */
900 for (i = 0, n = *out_size; i < n; i++) {
901 out_i[i] = codes[0].symbol;
902 }
903 return 0;
904 }
905
906 int cram_huffman_decode_int(cram_slice *slice, cram_codec *c,
907 cram_block *in, char *out, int *out_size) {
908 int32_t *out_i = (int32_t *)out;
909 int i, n, ncodes = c->huffman.ncodes;
910 const cram_huffman_code * const codes = c->huffman.codes;
911
912 for (i = 0, n = *out_size; i < n; i++) {
913 int idx = 0;
914 int val = 0, len = 0, last_len = 0;
915
916 // Now one bit at a time for remaining checks
917 for (;;) {
918 int dlen = codes[idx].len - last_len;
919 if (dlen <= 0 || (in->alloc - in->byte)*8 + in->bit + 7 < dlen)
920 return -1;
921
922 //val <<= dlen;
923 //val |= get_bits_MSB(in, dlen);
924 //last_len = (len += dlen);
925
926 last_len = (len += dlen);
927 for (; dlen; dlen--) GET_BIT_MSB(in, val);
928
929 idx = val - codes[idx].p;
930 if (idx >= ncodes || idx < 0)
931 return -1;
932
933 if (codes[idx].code == val && codes[idx].len == len) {
934 out_i[i] = codes[idx].symbol;
935 break;
936 }
937 }
938 }
939
940 return 0;
941 }
942
943 /*
944 * Initialises a huffman decoder from an encoding data stream.
945 */
946 cram_codec *cram_huffman_decode_init(char *data, int size,
947 enum cram_external_type option,
948 int version) {
949 int32_t ncodes, i, j;
950 char *cp = data, *data_end = &data[size];
951 cram_codec *h;
952 cram_huffman_code *codes;
953 int32_t val, last_len, max_len = 0;
954
955 cp += itf8_get(cp, &ncodes);
956 h = calloc(1, sizeof(*h));
957 if (!h)
958 return NULL;
959
960 h->free = cram_huffman_decode_free;
961
962 h->huffman.ncodes = ncodes;
963 codes = h->huffman.codes = malloc(ncodes * sizeof(*codes));
964 if (!codes) {
965 free(h);
966 return NULL;
967 }
968
969 /* Read symbols and bit-lengths */
970 for (i = 0; i < ncodes && cp < data_end; i++) {
971 cp += itf8_get(cp, &codes[i].symbol);
972 }
973
974 if (cp >= data_end) {
975 fprintf(stderr, "Malformed huffman header stream\n");
976 free(h);
977 return NULL;
978 }
979 cp += itf8_get(cp, &i);
980 if (i != ncodes) {
981 fprintf(stderr, "Malformed huffman header stream\n");
982 free(h);
983 return NULL;
984 }
985
986 if (ncodes == 0) {
987 /* NULL huffman stream */
988 return h;
989 }
990
991 for (i = 0; i < ncodes && cp < data_end; i++) {
992 cp += itf8_get(cp, &codes[i].len);
993 if (max_len < codes[i].len)
994 max_len = codes[i].len;
995 }
996 if (cp - data != size || max_len >= ncodes) {
997 fprintf(stderr, "Malformed huffman header stream\n");
998 free(h);
999 return NULL;
1000 }
1001
1002 /* Sort by bit length and then by symbol value */
1003 qsort(codes, ncodes, sizeof(*codes), code_sort);
1004
1005 /* Assign canonical codes */
1006 val = -1, last_len = 0;
1007 for (i = 0; i < ncodes; i++) {
1008 val++;
1009 if (codes[i].len > last_len) {
1010 while (codes[i].len > last_len) {
1011 val <<= 1;
1012 last_len++;
1013 }
1014 }
1015 codes[i].code = val;
1016 }
1017
1018 /*
1019 * Compute the next starting point, offset by the i'th value.
1020 * For example if codes 10, 11, 12, 13 are 30, 31, 32, 33 then
1021 * codes[10..13].p = 30 - 10.
1022 */
1023 last_len = 0;
1024 for (i = j = 0; i < ncodes; i++) {
1025 if (codes[i].len > last_len) {
1026 j = codes[i].code - i;
1027 last_len = codes[i].len;
1028 }
1029 codes[i].p = j;
1030 }
1031
1032 // puts("==HUFF LEN==");
1033 // for (i = 0; i <= last_len+1; i++) {
1034 // printf("len %d=%d prefix %d\n", i, h->huffman.lengths[i], h->huffman.prefix[i]);
1035 // }
1036 // puts("===HUFFMAN CODES===");
1037 // for (i = 0; i < ncodes; i++) {
1038 // int j;
1039 // printf("%d: %d %d %d ", i, codes[i].symbol, codes[i].len, codes[i].code);
1040 // j = codes[i].len;
1041 // while (j) {
1042 // putchar(codes[i].code & (1 << --j) ? '1' : '0');
1043 // }
1044 // printf(" %d\n", codes[i].code);
1045 // }
1046
1047 h->codec = E_HUFFMAN;
1048 if (option == E_BYTE || option == E_BYTE_ARRAY) {
1049 if (h->huffman.codes[0].len == 0)
1050 h->decode = cram_huffman_decode_char0;
1051 else
1052 h->decode = cram_huffman_decode_char;
1053 } else if (option == E_BYTE_ARRAY_BLOCK) {
1054 abort();
1055 } else {
1056 if (h->huffman.codes[0].len == 0)
1057 h->decode = cram_huffman_decode_int0;
1058 else
1059 h->decode = cram_huffman_decode_int;
1060 }
1061
1062 return (cram_codec *)h;
1063 }
1064
1065 int cram_huffman_encode_char0(cram_slice *slice, cram_codec *c,
1066 char *in, int in_size) {
1067 return 0;
1068 }
1069
1070 int cram_huffman_encode_char(cram_slice *slice, cram_codec *c,
1071 char *in, int in_size) {
1072 int i, code, len, r = 0;
1073 unsigned char *syms = (unsigned char *)in;
1074
1075 do {
1076 int sym = *syms++;
1077 if (sym >= -1 && sym < MAX_HUFF) {
1078 i = c->e_huffman.val2code[sym+1];
1079 assert(c->e_huffman.codes[i].symbol == sym);
1080 code = c->e_huffman.codes[i].code;
1081 len = c->e_huffman.codes[i].len;
1082 } else {
1083 /* Slow - use a lookup table for when sym < MAX_HUFF? */
1084 for (i = 0; i < c->e_huffman.nvals; i++) {
1085 if (c->e_huffman.codes[i].symbol == sym)
1086 break;
1087 }
1088 if (i == c->e_huffman.nvals)
1089 return -1;
1090
1091 code = c->e_huffman.codes[i].code;
1092 len = c->e_huffman.codes[i].len;
1093 }
1094
1095 r |= store_bits_MSB(c->out, code, len);
1096 } while (--in_size);
1097
1098 return r;
1099 }
1100
1101 int cram_huffman_encode_int0(cram_slice *slice, cram_codec *c,
1102 char *in, int in_size) {
1103 return 0;
1104 }
1105
1106 int cram_huffman_encode_int(cram_slice *slice, cram_codec *c,
1107 char *in, int in_size) {
1108 int i, code, len, r = 0;
1109 int *syms = (int *)in;
1110
1111 do {
1112 int sym = *syms++;
1113
1114 if (sym >= -1 && sym < MAX_HUFF) {
1115 i = c->e_huffman.val2code[sym+1];
1116 assert(c->e_huffman.codes[i].symbol == sym);
1117 code = c->e_huffman.codes[i].code;
1118 len = c->e_huffman.codes[i].len;
1119 } else {
1120 /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
1121 for (i = 0; i < c->e_huffman.nvals; i++) {
1122 if (c->e_huffman.codes[i].symbol == sym)
1123 break;
1124 }
1125 if (i == c->e_huffman.nvals)
1126 return -1;
1127
1128 code = c->e_huffman.codes[i].code;
1129 len = c->e_huffman.codes[i].len;
1130 }
1131
1132 r |= store_bits_MSB(c->out, code, len);
1133 } while (--in_size);
1134
1135 return r;
1136 }
1137
1138 void cram_huffman_encode_free(cram_codec *c) {
1139 if (!c)
1140 return;
1141
1142 if (c->e_huffman.codes)
1143 free(c->e_huffman.codes);
1144 free(c);
1145 }
1146
1147 /*
1148 * Encodes a huffman tree.
1149 * Returns number of bytes written.
1150 */
1151 int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix,
1152 int version) {
1153 int i, len = 0;
1154 cram_huffman_code *codes = c->e_huffman.codes;
1155 /*
1156 * Up to code length 127 means 2.5e+26 bytes of data required (worst
1157 * case huffman tree needs symbols with freqs matching the Fibonacci
1158 * series). So guaranteed 1 byte per code.
1159 *
1160 * Symbols themselves could be 5 bytes (eg -1 is 5 bytes in itf8).
1161 *
1162 * Therefore 6*ncodes + 5 + 5 + 1 + 5 is max memory
1163 */
1164 char *tmp = malloc(6*c->e_huffman.nvals+16);
1165 char *tp = tmp;
1166
1167 if (!tmp)
1168 return -1;
1169
1170 if (prefix) {
1171 size_t l = strlen(prefix);
1172 BLOCK_APPEND(b, prefix, l);
1173 len += l;
1174 }
1175
1176 tp += itf8_put(tp, c->e_huffman.nvals);
1177 for (i = 0; i < c->e_huffman.nvals; i++) {
1178 tp += itf8_put(tp, codes[i].symbol);
1179 }
1180
1181 tp += itf8_put(tp, c->e_huffman.nvals);
1182 for (i = 0; i < c->e_huffman.nvals; i++) {
1183 tp += itf8_put(tp, codes[i].len);
1184 }
1185
1186 len += itf8_put_blk(b, c->codec);
1187 len += itf8_put_blk(b, tp-tmp);
1188 BLOCK_APPEND(b, tmp, tp-tmp);
1189 len += tp-tmp;
1190
1191 free(tmp);
1192
1193 return len;
1194 }
1195
1196 cram_codec *cram_huffman_encode_init(cram_stats *st,
1197 enum cram_external_type option,
1198 void *dat,
1199 int version) {
1200 int *vals = NULL, *freqs = NULL, vals_alloc = 0, *lens, code, len;
1201 int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX, k;
1202 cram_codec *c;
1203 cram_huffman_code *codes;
1204
1205 c = malloc(sizeof(*c));
1206 if (!c)
1207 return NULL;
1208 c->codec = E_HUFFMAN;
1209
1210 /* Count number of unique symbols */
1211 for (nvals = i = 0; i < MAX_STAT_VAL; i++) {
1212 if (!st->freqs[i])
1213 continue;
1214 if (nvals >= vals_alloc) {
1215 vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
1216 vals = realloc(vals, vals_alloc * sizeof(int));
1217 freqs = realloc(freqs, vals_alloc * sizeof(int));
1218 if (!vals || !freqs) {
1219 if (vals) free(vals);
1220 if (freqs) free(freqs);
1221 free(c);
1222 return NULL;
1223 }
1224 }
1225 vals[nvals] = i;
1226 freqs[nvals] = st->freqs[i];
1227 assert(st->freqs[i] > 0);
1228 ntot += freqs[nvals];
1229 if (max_val < i) max_val = i;
1230 if (min_val > i) min_val = i;
1231 nvals++;
1232 }
1233 if (st->h) {
1234 khint_t k;
1235
1236 for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
1237 if (!kh_exist(st->h, k))
1238 continue;
1239 if (nvals >= vals_alloc) {
1240 vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
1241 vals = realloc(vals, vals_alloc * sizeof(int));
1242 freqs = realloc(freqs, vals_alloc * sizeof(int));
1243 if (!vals || !freqs)
1244 return NULL;
1245 }
1246 vals[nvals]= kh_key(st->h, k);
1247 freqs[nvals] = kh_val(st->h, k);
1248 assert(freqs[nvals] > 0);
1249 ntot += freqs[nvals];
1250 if (max_val < i) max_val = i;
1251 if (min_val > i) min_val = i;
1252 nvals++;
1253 }
1254 }
1255
1256 assert(nvals > 0);
1257
1258 freqs = realloc(freqs, 2*nvals*sizeof(*freqs));
1259 lens = calloc(2*nvals, sizeof(*lens));
1260 if (!lens || !freqs)
1261 return NULL;
1262
1263 /* Inefficient, use pointers to form chain so we can insert and maintain
1264 * a sorted list? This is currently O(nvals^2) complexity.
1265 */
1266 for (;;) {
1267 int low1 = INT_MAX, low2 = INT_MAX;
1268 int ind1 = 0, ind2 = 0;
1269 for (i = 0; i < nvals; i++) {
1270 if (freqs[i] < 0)
1271 continue;
1272 if (low1 > freqs[i])
1273 low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i;
1274 else if (low2 > freqs[i])
1275 low2 = freqs[i], ind2 = i;
1276 }
1277 if (low2 == INT_MAX)
1278 break;
1279
1280 freqs[nvals] = low1 + low2;
1281 lens[ind1] = nvals;
1282 lens[ind2] = nvals;
1283 freqs[ind1] *= -1;
1284 freqs[ind2] *= -1;
1285 nvals++;
1286 }
1287 nvals = nvals/2+1;
1288
1289 /* Assign lengths */
1290 for (i = 0; i < nvals; i++) {
1291 int code_len = 0;
1292 for (k = lens[i]; k; k = lens[k])
1293 code_len++;
1294 lens[i] = code_len;
1295 freqs[i] *= -1;
1296 //fprintf(stderr, "%d / %d => %d\n", vals[i], freqs[i], lens[i]);
1297 }
1298
1299
1300 /* Sort, need in a struct */
1301 if (!(codes = malloc(nvals * sizeof(*codes))))
1302 return NULL;
1303 for (i = 0; i < nvals; i++) {
1304 codes[i].symbol = vals[i];
1305 codes[i].len = lens[i];
1306 }
1307 qsort(codes, nvals, sizeof(*codes), code_sort);
1308
1309 /*
1310 * Generate canonical codes from lengths.
1311 * Sort by length.
1312 * Start with 0.
1313 * Every new code of same length is +1.
1314 * Every new code of new length is +1 then <<1 per extra length.
1315 *
1316 * /\
1317 * a/\
1318 * /\/\
1319 * bcd/\
1320 * ef
1321 *
1322 * a 1 0
1323 * b 3 4 (0+1)<<2
1324 * c 3 5
1325 * d 3 6
1326 * e 4 14 (6+1)<<1
1327 * f 5 15
1328 */
1329 code = 0; len = codes[0].len;
1330 for (i = 0; i < nvals; i++) {
1331 while (len != codes[i].len) {
1332 code<<=1;
1333 len++;
1334 }
1335 codes[i].code = code++;
1336
1337 if (codes[i].symbol >= -1 && codes[i].symbol < MAX_HUFF)
1338 c->e_huffman.val2code[codes[i].symbol+1] = i;
1339
1340 //fprintf(stderr, "sym %d, code %d, len %d\n",
1341 // codes[i].symbol, codes[i].code, codes[i].len);
1342 }
1343
1344 free(lens);
1345 free(vals);
1346 free(freqs);
1347
1348 c->e_huffman.codes = codes;
1349 c->e_huffman.nvals = nvals;
1350
1351 c->free = cram_huffman_encode_free;
1352 if (option == E_BYTE || option == E_BYTE_ARRAY) {
1353 if (c->e_huffman.codes[0].len == 0)
1354 c->encode = cram_huffman_encode_char0;
1355 else
1356 c->encode = cram_huffman_encode_char;
1357 } else {
1358 if (c->e_huffman.codes[0].len == 0)
1359 c->encode = cram_huffman_encode_int0;
1360 else
1361 c->encode = cram_huffman_encode_int;
1362 }
1363 c->store = cram_huffman_encode_store;
1364
1365 return c;
1366 }
1367
1368 /*
1369 * ---------------------------------------------------------------------------
1370 * BYTE_ARRAY_LEN
1371 */
1372 int cram_byte_array_len_decode(cram_slice *slice, cram_codec *c,
1373 cram_block *in, char *out,
1374 int *out_size) {
1375 /* Fetch length */
1376 int32_t len, one = 1;
1377
1378 c->byte_array_len.len_codec->decode(slice, c->byte_array_len.len_codec, in, (char *)&len, &one);
1379 //printf("ByteArray Len=%d\n", len);
1380
1381 if (c->byte_array_len.value_codec) {
1382 c->byte_array_len.value_codec->decode(slice,
1383 c->byte_array_len.value_codec,
1384 in, out, &len);
1385 } else {
1386 return -1;
1387 }
1388
1389 *out_size = len;
1390
1391 return 0;
1392 }
1393
1394 void cram_byte_array_len_decode_free(cram_codec *c) {
1395 if (!c) return;
1396
1397 if (c->byte_array_len.len_codec)
1398 c->byte_array_len.len_codec->free(c->byte_array_len.len_codec);
1399
1400 if (c->byte_array_len.value_codec)
1401 c->byte_array_len.value_codec->free(c->byte_array_len.value_codec);
1402
1403 free(c);
1404 }
1405
1406 cram_codec *cram_byte_array_len_decode_init(char *data, int size,
1407 enum cram_external_type option,
1408 int version) {
1409 cram_codec *c;
1410 char *cp = data;
1411 int32_t encoding;
1412 int32_t sub_size;
1413
1414 if (!(c = malloc(sizeof(*c))))
1415 return NULL;
1416
1417 c->codec = E_BYTE_ARRAY_LEN;
1418 c->decode = cram_byte_array_len_decode;
1419 c->free = cram_byte_array_len_decode_free;
1420
1421 cp += itf8_get(cp, &encoding);
1422 cp += itf8_get(cp, &sub_size);
1423 c->byte_array_len.len_codec = cram_decoder_init(encoding, cp, sub_size,
1424 E_INT, version);
1425 cp += sub_size;
1426
1427 cp += itf8_get(cp, &encoding);
1428 cp += itf8_get(cp, &sub_size);
1429 c->byte_array_len.value_codec = cram_decoder_init(encoding, cp, sub_size,
1430 option, version);
1431 cp += sub_size;
1432
1433 if (cp - data != size) {
1434 fprintf(stderr, "Malformed byte_array_len header stream\n");
1435 free(c);
1436 return NULL;
1437 }
1438
1439 return c;
1440 }
1441
1442 int cram_byte_array_len_encode(cram_slice *slice, cram_codec *c,
1443 char *in, int in_size) {
1444 int32_t i32 = in_size;
1445 int r = 0;
1446
1447 r |= c->e_byte_array_len.len_codec->encode(slice,
1448 c->e_byte_array_len.len_codec,
1449 (char *)&i32, 1);
1450 r |= c->e_byte_array_len.val_codec->encode(slice,
1451 c->e_byte_array_len.val_codec,
1452 in, in_size);
1453 return r;
1454 }
1455
1456 void cram_byte_array_len_encode_free(cram_codec *c) {
1457 if (!c)
1458 return;
1459
1460 if (c->e_byte_array_len.len_codec)
1461 c->e_byte_array_len.len_codec->free(c->e_byte_array_len.len_codec);
1462
1463 if (c->e_byte_array_len.val_codec)
1464 c->e_byte_array_len.val_codec->free(c->e_byte_array_len.val_codec);
1465
1466 free(c);
1467 }
1468
1469 int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b,
1470 char *prefix, int version) {
1471 int len = 0, len2, len3;
1472 cram_codec *tc;
1473 cram_block *b_len, *b_val;
1474
1475 if (prefix) {
1476 size_t l = strlen(prefix);
1477 BLOCK_APPEND(b, prefix, l);
1478 len += l;
1479 }
1480
1481 tc = c->e_byte_array_len.len_codec;
1482 b_len = cram_new_block(0, 0);
1483 len2 = tc->store(tc, b_len, NULL, version);
1484
1485 tc = c->e_byte_array_len.val_codec;
1486 b_val = cram_new_block(0, 0);
1487 len3 = tc->store(tc, b_val, NULL, version);
1488
1489 len += itf8_put_blk(b, c->codec);
1490 len += itf8_put_blk(b, len2+len3);
1491 BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
1492 BLOCK_APPEND(b, BLOCK_DATA(b_val), BLOCK_SIZE(b_val));
1493
1494 cram_free_block(b_len);
1495 cram_free_block(b_val);
1496
1497 return len + len2 + len3;
1498 }
1499
1500 cram_codec *cram_byte_array_len_encode_init(cram_stats *st,
1501 enum cram_external_type option,
1502 void *dat,
1503 int version) {
1504 cram_codec *c;
1505 cram_byte_array_len_encoder *e = (cram_byte_array_len_encoder *)dat;
1506
1507 c = malloc(sizeof(*c));
1508 if (!c)
1509 return NULL;
1510 c->codec = E_BYTE_ARRAY_LEN;
1511 c->free = cram_byte_array_len_encode_free;
1512 c->encode = cram_byte_array_len_encode;
1513 c->store = cram_byte_array_len_encode_store;
1514
1515 c->e_byte_array_len.len_codec = cram_encoder_init(e->len_encoding,
1516 NULL, E_INT,
1517 e->len_dat,
1518 version);
1519 c->e_byte_array_len.val_codec = cram_encoder_init(e->val_encoding,
1520 NULL, E_BYTE_ARRAY,
1521 e->val_dat,
1522 version);
1523
1524 return c;
1525 }
1526
1527 /*
1528 * ---------------------------------------------------------------------------
1529 * BYTE_ARRAY_STOP
1530 */
1531 static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c,
1532 cram_block *in, char *out,
1533 int *out_size) {
1534 int i;
1535 cram_block *b = NULL;
1536 char *cp, ch;
1537
1538 if (slice->block_by_id) {
1539 if (!(b = slice->block_by_id[c->byte_array_stop.content_id]))
1540 return *out_size?-1:0;
1541 } else {
1542 for (i = 0; i < slice->hdr->num_blocks; i++) {
1543 b = slice->block[i];
1544 if (b && b->content_type == EXTERNAL &&
1545 b->content_id == c->byte_array_stop.content_id) {
1546 break;
1547 }
1548 }
1549 if (i == slice->hdr->num_blocks || !b)
1550 return -1;
1551 }
1552
1553 if (b->idx >= b->uncomp_size)
1554 return -1;
1555
1556 cp = (char *)b->data + b->idx;
1557 while ((ch = *cp) != (char)c->byte_array_stop.stop) {
1558 if (cp - (char *)b->data >= b->uncomp_size)
1559 return -1;
1560 *out++ = ch;
1561 cp++;
1562 }
1563
1564 *out_size = cp - (char *)(b->data + b->idx);
1565 b->idx = cp - (char *)b->data + 1;
1566
1567 return 0;
1568 }
1569
1570 int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c,
1571 cram_block *in, char *out_,
1572 int *out_size) {
1573 cram_block *b = NULL;
1574 cram_block *out = (cram_block *)out_;
1575 char *cp, *out_cp, *cp_end;
1576 char stop;
1577
1578 if (slice->block_by_id) {
1579 if (!(b = slice->block_by_id[c->byte_array_stop.content_id]))
1580 return *out_size?-1:0;
1581 } else {
1582 int i;
1583 for (i = 0; i < slice->hdr->num_blocks; i++) {
1584 b = slice->block[i];
1585 if (b && b->content_type == EXTERNAL &&
1586 b->content_id == c->byte_array_stop.content_id) {
1587 break;
1588 }
1589 }
1590 if (i == slice->hdr->num_blocks || !b)
1591 return -1;
1592 }
1593
1594 if (b->idx >= b->uncomp_size)
1595 return -1;
1596 cp = (char *)b->data + b->idx;
1597 cp_end = (char *)b->data + b->uncomp_size;
1598 out_cp = (char *)BLOCK_END(out);
1599
1600 stop = c->byte_array_stop.stop;
1601 if (cp_end - cp < out->alloc - out->byte) {
1602 while (*cp != stop && cp != cp_end)
1603 *out_cp++ = *cp++;
1604 BLOCK_SIZE(out) = out_cp - (char *)BLOCK_DATA(out);
1605 } else {
1606 char *cp_start;
1607 for (cp_start = cp; *cp != stop && cp != cp_end; cp++)
1608 ;
1609 BLOCK_APPEND(out, cp_start, cp - cp_start);
1610 BLOCK_GROW(out, cp - cp_start);
1611 }
1612
1613 *out_size = cp - (char *)(b->data + b->idx);
1614 b->idx = cp - (char *)b->data + 1;
1615
1616 return 0;
1617 }
1618
1619 void cram_byte_array_stop_decode_free(cram_codec *c) {
1620 if (!c) return;
1621
1622 free(c);
1623 }
1624
1625 cram_codec *cram_byte_array_stop_decode_init(char *data, int size,
1626 enum cram_external_type option,
1627 int version) {
1628 cram_codec *c;
1629 unsigned char *cp = (unsigned char *)data;
1630
1631 if (!(c = malloc(sizeof(*c))))
1632 return NULL;
1633
1634 c->codec = E_BYTE_ARRAY_STOP;
1635 c->decode = (option == E_BYTE_ARRAY_BLOCK)
1636 ? cram_byte_array_stop_decode_block
1637 : cram_byte_array_stop_decode_char;
1638 c->free = cram_byte_array_stop_decode_free;
1639
1640 c->byte_array_stop.stop = *cp++;
1641 if (CRAM_MAJOR_VERS(version) == 1) {
1642 c->byte_array_stop.content_id = cp[0] + (cp[1]<<8) + (cp[2]<<16)
1643 + (cp[3]<<24);
1644 cp += 4;
1645 } else {
1646 cp += itf8_get(cp, &c->byte_array_stop.content_id);
1647 }
1648
1649 if ((char *)cp - data != size) {
1650 fprintf(stderr, "Malformed byte_array_stop header stream\n");
1651 free(c);
1652 return NULL;
1653 }
1654
1655 return c;
1656 }
1657
1658 int cram_byte_array_stop_encode(cram_slice *slice, cram_codec *c,
1659 char *in, int in_size) {
1660 BLOCK_APPEND(c->out, in, in_size);
1661 BLOCK_APPEND_CHAR(c->out, c->e_byte_array_stop.stop);
1662 return 0;
1663 }
1664
1665 void cram_byte_array_stop_encode_free(cram_codec *c) {
1666 if (!c)
1667 return;
1668 free(c);
1669 }
1670
1671 int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b,
1672 char *prefix, int version) {
1673 int len = 0;
1674 char buf[20], *cp = buf;
1675
1676 if (prefix) {
1677 size_t l = strlen(prefix);
1678 BLOCK_APPEND(b, prefix, l);
1679 len += l;
1680 }
1681
1682 cp += itf8_put(cp, c->codec);
1683
1684 if (CRAM_MAJOR_VERS(version) == 1) {
1685 cp += itf8_put(cp, 5);
1686 *cp++ = c->e_byte_array_stop.stop;
1687 *cp++ = (c->e_byte_array_stop.content_id >> 0) & 0xff;
1688 *cp++ = (c->e_byte_array_stop.content_id >> 8) & 0xff;
1689 *cp++ = (c->e_byte_array_stop.content_id >> 16) & 0xff;
1690 *cp++ = (c->e_byte_array_stop.content_id >> 24) & 0xff;
1691 } else {
1692 cp += itf8_put(cp, 1 + itf8_size(c->e_byte_array_stop.content_id));
1693 *cp++ = c->e_byte_array_stop.stop;
1694 cp += itf8_put(cp, c->e_byte_array_stop.content_id);
1695 }
1696
1697 BLOCK_APPEND(b, buf, cp-buf);
1698 len += cp-buf;
1699
1700 return len;
1701 }
1702
1703 cram_codec *cram_byte_array_stop_encode_init(cram_stats *st,
1704 enum cram_external_type option,
1705 void *dat,
1706 int version) {
1707 cram_codec *c;
1708
1709 c = malloc(sizeof(*c));
1710 if (!c)
1711 return NULL;
1712 c->codec = E_BYTE_ARRAY_STOP;
1713 c->free = cram_byte_array_stop_encode_free;
1714 c->encode = cram_byte_array_stop_encode;
1715 c->store = cram_byte_array_stop_encode_store;
1716
1717 c->e_byte_array_stop.stop = ((int *)dat)[0];
1718 c->e_byte_array_stop.content_id = ((int *)dat)[1];
1719
1720 return c;
1721 }
1722
1723 /*
1724 * ---------------------------------------------------------------------------
1725 */
1726
1727 char *cram_encoding2str(enum cram_encoding t) {
1728 switch (t) {
1729 case E_NULL: return "NULL";
1730 case E_EXTERNAL: return "EXTERNAL";
1731 case E_GOLOMB: return "GOLOMB";
1732 case E_HUFFMAN: return "HUFFMAN";
1733 case E_BYTE_ARRAY_LEN: return "BYTE_ARRAY_LEN";
1734 case E_BYTE_ARRAY_STOP: return "BYTE_ARRAY_STOP";
1735 case E_BETA: return "BETA";
1736 case E_SUBEXP: return "SUBEXP";
1737 case E_GOLOMB_RICE: return "GOLOMB_RICE";
1738 case E_GAMMA: return "GAMMA";
1739 }
1740 return "?";
1741 }
1742
1743 static cram_codec *(*decode_init[])(char *data,
1744 int size,
1745 enum cram_external_type option,
1746 int version) = {
1747 NULL,
1748 cram_external_decode_init,
1749 NULL,
1750 cram_huffman_decode_init,
1751 cram_byte_array_len_decode_init,
1752 cram_byte_array_stop_decode_init,
1753 cram_beta_decode_init,
1754 cram_subexp_decode_init,
1755 NULL,
1756 cram_gamma_decode_init,
1757 };
1758
1759 cram_codec *cram_decoder_init(enum cram_encoding codec,
1760 char *data, int size,
1761 enum cram_external_type option,
1762 int version) {
1763 if (decode_init[codec]) {
1764 return decode_init[codec](data, size, option, version);
1765 } else {
1766 fprintf(stderr, "Unimplemented codec of type %s\n", codec2str(codec));
1767 return NULL;
1768 }
1769 }
1770
1771 static cram_codec *(*encode_init[])(cram_stats *stx,
1772 enum cram_external_type option,
1773 void *opt,
1774 int version) = {
1775 NULL,
1776 cram_external_encode_init,
1777 NULL,
1778 cram_huffman_encode_init,
1779 cram_byte_array_len_encode_init,
1780 cram_byte_array_stop_encode_init,
1781 cram_beta_encode_init,
1782 NULL, //cram_subexp_encode_init,
1783 NULL,
1784 NULL, //cram_gamma_encode_init,
1785 };
1786
1787 cram_codec *cram_encoder_init(enum cram_encoding codec,
1788 cram_stats *st,
1789 enum cram_external_type option,
1790 void *dat,
1791 int version) {
1792 if (st && !st->nvals)
1793 return NULL;
1794
1795 if (encode_init[codec]) {
1796 cram_codec *r;
1797 if ((r = encode_init[codec](st, option, dat, version)))
1798 r->out = NULL;
1799 return r;
1800 } else {
1801 fprintf(stderr, "Unimplemented codec of type %s\n", codec2str(codec));
1802 abort();
1803 }
1804 }
1805
1806 /*
1807 * Returns the content_id used by this codec, also in id2 if byte_array_len.
1808 * Returns -1 for the CORE block and -2 for unneeded.
1809 * id2 is only filled out for BYTE_ARRAY_LEN which uses 2 codecs.
1810 */
1811 int cram_codec_to_id(cram_codec *c, int *id2) {
1812 int bnum1, bnum2 = -2;
1813
1814 switch (c->codec) {
1815 case E_HUFFMAN:
1816 bnum1 = c->huffman.ncodes == 1 ? -2 : -1;
1817 break;
1818 case E_GOLOMB:
1819 case E_BETA:
1820 case E_SUBEXP:
1821 case E_GOLOMB_RICE:
1822 case E_GAMMA:
1823 bnum1 = -1;
1824 break;
1825 case E_EXTERNAL:
1826 bnum1 = c->external.content_id;
1827 break;
1828 case E_BYTE_ARRAY_LEN:
1829 bnum1 = cram_codec_to_id(c->byte_array_len.len_codec, NULL);
1830 bnum2 = cram_codec_to_id(c->byte_array_len.value_codec, NULL);
1831 break;
1832 case E_BYTE_ARRAY_STOP:
1833 bnum1 = c->byte_array_stop.content_id;
1834 break;
1835 case E_NULL:
1836 bnum1 = -2;
1837 break;
1838 default:
1839 fprintf(stderr, "Unknown codec type %d\n", c->codec);
1840 bnum1 = -1;
1841 }
1842
1843 if (id2)
1844 *id2 = bnum2;
1845 return bnum1;
1846 }