comparison ezBAMQC/src/htslib/cram/cram_io.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /*
2 Copyright (c) 2012-2014 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * CRAM I/O primitives.
33 *
34 * - ITF8 encoding and decoding.
35 * - Block based I/O
36 * - Zlib inflating and deflating (memory)
37 * - CRAM basic data structure reading and writing
38 * - File opening / closing
39 * - Reference sequence handling
40 */
41
42 /*
43 * TODO: BLOCK_GROW, BLOCK_RESIZE, BLOCK_APPEND and itf8_put_blk all need
44 * a way to return errors for when malloc fails.
45 */
46
47 #ifdef HAVE_CONFIG_H
48 #include "io_lib_config.h"
49 #endif
50
51 #include <stdio.h>
52 #include <errno.h>
53 #include <assert.h>
54 #include <stdlib.h>
55 #include <string.h>
56 #include <zlib.h>
57 #ifdef HAVE_LIBBZ2
58 #include <bzlib.h>
59 #endif
60 #ifdef HAVE_LIBLZMA
61 #include <lzma.h>
62 #endif
63 #include <sys/types.h>
64 #include <sys/stat.h>
65 #include <math.h>
66 #include <ctype.h>
67
68 #include "cram/cram.h"
69 #include "cram/os.h"
70 #include "cram/md5.h"
71 #include "cram/open_trace_file.h"
72 #include "cram/rANS_static.h"
73
74 //#define REF_DEBUG
75
76 #ifdef REF_DEBUG
77 #include <sys/syscall.h>
78 #define gettid() (int)syscall(SYS_gettid)
79
80 #define RP(...) fprintf (stderr, __VA_ARGS__)
81 #else
82 #define RP(...)
83 #endif
84
85 #include "htslib/hfile.h"
86 #include "htslib/bgzf.h"
87 #include "htslib/faidx.h"
88
89 #define TRIAL_SPAN 50
90 #define NTRIALS 3
91
92
93 /* ----------------------------------------------------------------------
94 * ITF8 encoding and decoding.
95 *
96 * Also see the itf8_get and itf8_put macros in cram_io.h
97 */
98
99 /*
100 * Reads an integer in ITF-8 encoding from 'cp' and stores it in
101 * *val.
102 *
103 * Returns the number of bytes read on success
104 * -1 on failure
105 */
106 int itf8_decode(cram_fd *fd, int32_t *val_p) {
107 static int nbytes[16] = {
108 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx
109 1,1,1,1, // 1000xxxx - 1011xxxx
110 2,2, // 1100xxxx - 1101xxxx
111 3, // 1110xxxx
112 4, // 1111xxxx
113 };
114
115 static int nbits[16] = {
116 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
117 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx
118 0x1f, 0x1f, // 1100xxxx - 1101xxxx
119 0x0f, // 1110xxxx
120 0x0f, // 1111xxxx
121 };
122
123 int32_t val = hgetc(fd->fp);
124 if (val == -1)
125 return -1;
126
127 int i = nbytes[val>>4];
128 val &= nbits[val>>4];
129
130 switch(i) {
131 case 0:
132 *val_p = val;
133 return 1;
134
135 case 1:
136 val = (val<<8) | (unsigned char)hgetc(fd->fp);
137 *val_p = val;
138 return 2;
139
140 case 2:
141 val = (val<<8) | (unsigned char)hgetc(fd->fp);
142 val = (val<<8) | (unsigned char)hgetc(fd->fp);
143 *val_p = val;
144 return 3;
145
146 case 3:
147 val = (val<<8) | (unsigned char)hgetc(fd->fp);
148 val = (val<<8) | (unsigned char)hgetc(fd->fp);
149 val = (val<<8) | (unsigned char)hgetc(fd->fp);
150 *val_p = val;
151 return 4;
152
153 case 4: // really 3.5 more, why make it different?
154 val = (val<<8) | (unsigned char)hgetc(fd->fp);
155 val = (val<<8) | (unsigned char)hgetc(fd->fp);
156 val = (val<<8) | (unsigned char)hgetc(fd->fp);
157 val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
158 *val_p = val;
159 }
160
161 return 5;
162 }
163
164 /*
165 * Encodes and writes a single integer in ITF-8 format.
166 * Returns 0 on success
167 * -1 on failure
168 */
169 int itf8_encode(cram_fd *fd, int32_t val) {
170 char buf[5];
171 int len = itf8_put(buf, val);
172 return hwrite(fd->fp, buf, len) == len ? 0 : -1;
173 }
174
175 #ifndef ITF8_MACROS
176 /*
177 * As above, but decoding from memory
178 */
179 int itf8_get(char *cp, int32_t *val_p) {
180 unsigned char *up = (unsigned char *)cp;
181
182 if (up[0] < 0x80) {
183 *val_p = up[0];
184 return 1;
185 } else if (up[0] < 0xc0) {
186 *val_p = ((up[0] <<8) | up[1]) & 0x3fff;
187 return 2;
188 } else if (up[0] < 0xe0) {
189 *val_p = ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff;
190 return 3;
191 } else if (up[0] < 0xf0) {
192 *val_p = ((up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
193 return 4;
194 } else {
195 *val_p = ((up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
196 return 5;
197 }
198 }
199
200 /*
201 * Stores a value to memory in ITF-8 format.
202 *
203 * Returns the number of bytes required to store the number.
204 * This is a maximum of 5 bytes.
205 */
206 int itf8_put(char *cp, int32_t val) {
207 if (!(val & ~0x00000007f)) { // 1 byte
208 *cp = val;
209 return 1;
210 } else if (!(val & ~0x00003fff)) { // 2 byte
211 *cp++ = (val >> 8 ) | 0x80;
212 *cp = val & 0xff;
213 return 2;
214 } else if (!(val & ~0x01fffff)) { // 3 byte
215 *cp++ = (val >> 16) | 0xc0;
216 *cp++ = (val >> 8 ) & 0xff;
217 *cp = val & 0xff;
218 return 3;
219 } else if (!(val & ~0x0fffffff)) { // 4 byte
220 *cp++ = (val >> 24) | 0xe0;
221 *cp++ = (val >> 16) & 0xff;
222 *cp++ = (val >> 8 ) & 0xff;
223 *cp = val & 0xff;
224 return 4;
225 } else { // 5 byte
226 *cp++ = 0xf0 | ((val>>28) & 0xff);
227 *cp++ = (val >> 20) & 0xff;
228 *cp++ = (val >> 12) & 0xff;
229 *cp++ = (val >> 4 ) & 0xff;
230 *cp = val & 0x0f;
231 return 5;
232 }
233 }
234 #endif
235
236 /* 64-bit itf8 variant */
237 int ltf8_put(char *cp, int64_t val) {
238 if (!(val & ~((1LL<<7)-1))) {
239 *cp = val;
240 return 1;
241 } else if (!(val & ~((1LL<<(6+8))-1))) {
242 *cp++ = (val >> 8 ) | 0x80;
243 *cp = val & 0xff;
244 return 2;
245 } else if (!(val & ~((1LL<<(5+2*8))-1))) {
246 *cp++ = (val >> 16) | 0xc0;
247 *cp++ = (val >> 8 ) & 0xff;
248 *cp = val & 0xff;
249 return 3;
250 } else if (!(val & ~((1LL<<(4+3*8))-1))) {
251 *cp++ = (val >> 24) | 0xe0;
252 *cp++ = (val >> 16) & 0xff;
253 *cp++ = (val >> 8 ) & 0xff;
254 *cp = val & 0xff;
255 return 4;
256 } else if (!(val & ~((1LL<<(3+4*8))-1))) {
257 *cp++ = (val >> 32) | 0xf0;
258 *cp++ = (val >> 24) & 0xff;
259 *cp++ = (val >> 16) & 0xff;
260 *cp++ = (val >> 8 ) & 0xff;
261 *cp = val & 0xff;
262 return 5;
263 } else if (!(val & ~((1LL<<(2+5*8))-1))) {
264 *cp++ = (val >> 40) | 0xf8;
265 *cp++ = (val >> 32) & 0xff;
266 *cp++ = (val >> 24) & 0xff;
267 *cp++ = (val >> 16) & 0xff;
268 *cp++ = (val >> 8 ) & 0xff;
269 *cp = val & 0xff;
270 return 6;
271 } else if (!(val & ~((1LL<<(1+6*8))-1))) {
272 *cp++ = (val >> 48) | 0xfc;
273 *cp++ = (val >> 40) & 0xff;
274 *cp++ = (val >> 32) & 0xff;
275 *cp++ = (val >> 24) & 0xff;
276 *cp++ = (val >> 16) & 0xff;
277 *cp++ = (val >> 8 ) & 0xff;
278 *cp = val & 0xff;
279 return 7;
280 } else if (!(val & ~((1LL<<(7*8))-1))) {
281 *cp++ = (val >> 56) | 0xfe;
282 *cp++ = (val >> 48) & 0xff;
283 *cp++ = (val >> 40) & 0xff;
284 *cp++ = (val >> 32) & 0xff;
285 *cp++ = (val >> 24) & 0xff;
286 *cp++ = (val >> 16) & 0xff;
287 *cp++ = (val >> 8 ) & 0xff;
288 *cp = val & 0xff;
289 return 8;
290 } else {
291 *cp++ = 0xff;
292 *cp++ = (val >> 56) & 0xff;
293 *cp++ = (val >> 48) & 0xff;
294 *cp++ = (val >> 40) & 0xff;
295 *cp++ = (val >> 32) & 0xff;
296 *cp++ = (val >> 24) & 0xff;
297 *cp++ = (val >> 16) & 0xff;
298 *cp++ = (val >> 8 ) & 0xff;
299 *cp = val & 0xff;
300 return 9;
301 }
302 }
303
304 int ltf8_get(char *cp, int64_t *val_p) {
305 unsigned char *up = (unsigned char *)cp;
306
307 if (up[0] < 0x80) {
308 *val_p = up[0];
309 return 1;
310 } else if (up[0] < 0xc0) {
311 *val_p = (((uint64_t)up[0]<< 8) |
312 (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
313 return 2;
314 } else if (up[0] < 0xe0) {
315 *val_p = (((uint64_t)up[0]<<16) |
316 ((uint64_t)up[1]<< 8) |
317 (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
318 return 3;
319 } else if (up[0] < 0xf0) {
320 *val_p = (((uint64_t)up[0]<<24) |
321 ((uint64_t)up[1]<<16) |
322 ((uint64_t)up[2]<< 8) |
323 (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
324 return 4;
325 } else if (up[0] < 0xf8) {
326 *val_p = (((uint64_t)up[0]<<32) |
327 ((uint64_t)up[1]<<24) |
328 ((uint64_t)up[2]<<16) |
329 ((uint64_t)up[3]<< 8) |
330 (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
331 return 5;
332 } else if (up[0] < 0xfc) {
333 *val_p = (((uint64_t)up[0]<<40) |
334 ((uint64_t)up[1]<<32) |
335 ((uint64_t)up[2]<<24) |
336 ((uint64_t)up[3]<<16) |
337 ((uint64_t)up[4]<< 8) |
338 (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
339 return 6;
340 } else if (up[0] < 0xfe) {
341 *val_p = (((uint64_t)up[0]<<48) |
342 ((uint64_t)up[1]<<40) |
343 ((uint64_t)up[2]<<32) |
344 ((uint64_t)up[3]<<24) |
345 ((uint64_t)up[4]<<16) |
346 ((uint64_t)up[5]<< 8) |
347 (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
348 return 7;
349 } else if (up[0] < 0xff) {
350 *val_p = (((uint64_t)up[1]<<48) |
351 ((uint64_t)up[2]<<40) |
352 ((uint64_t)up[3]<<32) |
353 ((uint64_t)up[4]<<24) |
354 ((uint64_t)up[5]<<16) |
355 ((uint64_t)up[6]<< 8) |
356 (uint64_t)up[7]) & ((1LL<<(7*8))-1);
357 return 8;
358 } else {
359 *val_p = (((uint64_t)up[1]<<56) |
360 ((uint64_t)up[2]<<48) |
361 ((uint64_t)up[3]<<40) |
362 ((uint64_t)up[4]<<32) |
363 ((uint64_t)up[5]<<24) |
364 ((uint64_t)up[6]<<16) |
365 ((uint64_t)up[7]<< 8) |
366 (uint64_t)up[8]);
367 return 9;
368 }
369 }
370
371 int ltf8_decode(cram_fd *fd, int64_t *val_p) {
372 int c = hgetc(fd->fp);
373 int64_t val = (unsigned char)c;
374 if (c == -1)
375 return -1;
376
377 if (val < 0x80) {
378 *val_p = val;
379 return 1;
380
381 } else if (val < 0xc0) {
382 val = (val<<8) | (unsigned char)hgetc(fd->fp);
383 *val_p = val & (((1LL<<(6+8)))-1);
384 return 2;
385
386 } else if (val < 0xe0) {
387 val = (val<<8) | (unsigned char)hgetc(fd->fp);
388 val = (val<<8) | (unsigned char)hgetc(fd->fp);
389 *val_p = val & ((1LL<<(5+2*8))-1);
390 return 3;
391
392 } else if (val < 0xf0) {
393 val = (val<<8) | (unsigned char)hgetc(fd->fp);
394 val = (val<<8) | (unsigned char)hgetc(fd->fp);
395 val = (val<<8) | (unsigned char)hgetc(fd->fp);
396 *val_p = val & ((1LL<<(4+3*8))-1);
397 return 4;
398
399 } else if (val < 0xf8) {
400 val = (val<<8) | (unsigned char)hgetc(fd->fp);
401 val = (val<<8) | (unsigned char)hgetc(fd->fp);
402 val = (val<<8) | (unsigned char)hgetc(fd->fp);
403 val = (val<<8) | (unsigned char)hgetc(fd->fp);
404 *val_p = val & ((1LL<<(3+4*8))-1);
405 return 5;
406
407 } else if (val < 0xfc) {
408 val = (val<<8) | (unsigned char)hgetc(fd->fp);
409 val = (val<<8) | (unsigned char)hgetc(fd->fp);
410 val = (val<<8) | (unsigned char)hgetc(fd->fp);
411 val = (val<<8) | (unsigned char)hgetc(fd->fp);
412 val = (val<<8) | (unsigned char)hgetc(fd->fp);
413 *val_p = val & ((1LL<<(2+5*8))-1);
414 return 6;
415
416 } else if (val < 0xfe) {
417 val = (val<<8) | (unsigned char)hgetc(fd->fp);
418 val = (val<<8) | (unsigned char)hgetc(fd->fp);
419 val = (val<<8) | (unsigned char)hgetc(fd->fp);
420 val = (val<<8) | (unsigned char)hgetc(fd->fp);
421 val = (val<<8) | (unsigned char)hgetc(fd->fp);
422 val = (val<<8) | (unsigned char)hgetc(fd->fp);
423 *val_p = val & ((1LL<<(1+6*8))-1);
424 return 7;
425
426 } else if (val < 0xff) {
427 val = (val<<8) | (unsigned char)hgetc(fd->fp);
428 val = (val<<8) | (unsigned char)hgetc(fd->fp);
429 val = (val<<8) | (unsigned char)hgetc(fd->fp);
430 val = (val<<8) | (unsigned char)hgetc(fd->fp);
431 val = (val<<8) | (unsigned char)hgetc(fd->fp);
432 val = (val<<8) | (unsigned char)hgetc(fd->fp);
433 val = (val<<8) | (unsigned char)hgetc(fd->fp);
434 *val_p = val & ((1LL<<(7*8))-1);
435 return 8;
436
437 } else {
438 val = (val<<8) | (unsigned char)hgetc(fd->fp);
439 val = (val<<8) | (unsigned char)hgetc(fd->fp);
440 val = (val<<8) | (unsigned char)hgetc(fd->fp);
441 val = (val<<8) | (unsigned char)hgetc(fd->fp);
442 val = (val<<8) | (unsigned char)hgetc(fd->fp);
443 val = (val<<8) | (unsigned char)hgetc(fd->fp);
444 val = (val<<8) | (unsigned char)hgetc(fd->fp);
445 val = (val<<8) | (unsigned char)hgetc(fd->fp);
446 *val_p = val;
447 }
448
449 return 9;
450 }
451
452 /*
453 * Pushes a value in ITF8 format onto the end of a block.
454 * This shouldn't be used for high-volume data as it is not the fastest
455 * method.
456 *
457 * Returns the number of bytes written
458 */
459 int itf8_put_blk(cram_block *blk, int val) {
460 char buf[5];
461 int sz;
462
463 sz = itf8_put(buf, val);
464 BLOCK_APPEND(blk, buf, sz);
465 return sz;
466 }
467
468 /*
469 * Decodes a 32-bit little endian value from fd and stores in val.
470 *
471 * Returns the number of bytes read on success
472 * -1 on failure
473 */
474 int int32_decode(cram_fd *fd, int32_t *val) {
475 int32_t i;
476 if (4 != hread(fd->fp, &i, 4))
477 return -1;
478
479 *val = le_int4(i);
480 return 4;
481 }
482
483 /*
484 * Encodes a 32-bit little endian value 'val' and writes to fd.
485 *
486 * Returns the number of bytes written on success
487 * -1 on failure
488 */
489 int int32_encode(cram_fd *fd, int32_t val) {
490 val = le_int4(val);
491 if (4 != hwrite(fd->fp, &val, 4))
492 return -1;
493
494 return 4;
495 }
496
497 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
498 int int32_get(cram_block *b, int32_t *val) {
499 if (b->uncomp_size - BLOCK_SIZE(b) < 4)
500 return -1;
501
502 *val =
503 b->data[b->byte ] |
504 (b->data[b->byte+1] << 8) |
505 (b->data[b->byte+2] << 16) |
506 (b->data[b->byte+3] << 24);
507 BLOCK_SIZE(b) += 4;
508 return 4;
509 }
510
511 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
512 int int32_put(cram_block *b, int32_t val) {
513 unsigned char cp[4];
514 cp[0] = ( val & 0xff);
515 cp[1] = ((val>>8) & 0xff);
516 cp[2] = ((val>>16) & 0xff);
517 cp[3] = ((val>>24) & 0xff);
518
519 BLOCK_APPEND(b, cp, 4);
520 return b->data ? 0 : -1;
521 }
522
523 /* ----------------------------------------------------------------------
524 * zlib compression code - from Gap5's tg_iface_g.c
525 * They're static here as they're only used within the cram_compress_block
526 * and cram_uncompress_block functions, which are the external interface.
527 */
528 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
529 z_stream s;
530 unsigned char *data = NULL; /* Uncompressed output */
531 int data_alloc = 0;
532 int err;
533
534 /* Starting point at uncompressed size, and scale after that */
535 data = malloc(data_alloc = csize*1.2+100);
536 if (!data)
537 return NULL;
538
539 /* Initialise zlib stream */
540 s.zalloc = Z_NULL; /* use default allocation functions */
541 s.zfree = Z_NULL;
542 s.opaque = Z_NULL;
543 s.next_in = (unsigned char *)cdata;
544 s.avail_in = csize;
545 s.total_in = 0;
546 s.next_out = data;
547 s.avail_out = data_alloc;
548 s.total_out = 0;
549
550 //err = inflateInit(&s);
551 err = inflateInit2(&s, 15 + 32);
552 if (err != Z_OK) {
553 fprintf(stderr, "zlib inflateInit error: %s\n", s.msg);
554 free(data);
555 return NULL;
556 }
557
558 /* Decode to 'data' array */
559 for (;s.avail_in;) {
560 unsigned char *data_tmp;
561 int alloc_inc;
562
563 s.next_out = &data[s.total_out];
564 err = inflate(&s, Z_NO_FLUSH);
565 if (err == Z_STREAM_END)
566 break;
567
568 if (err != Z_OK) {
569 fprintf(stderr, "zlib inflate error: %s\n", s.msg);
570 break;
571 }
572
573 /* More to come, so realloc based on growth so far */
574 alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
575 data = realloc((data_tmp = data), data_alloc += alloc_inc);
576 if (!data) {
577 free(data_tmp);
578 return NULL;
579 }
580 s.avail_out += alloc_inc;
581 }
582 inflateEnd(&s);
583
584 *size = s.total_out;
585 return (char *)data;
586 }
587
588 static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
589 int level, int strat) {
590 z_stream s;
591 unsigned char *cdata = NULL; /* Compressed output */
592 int cdata_alloc = 0;
593 int cdata_pos = 0;
594 int err;
595
596 cdata = malloc(cdata_alloc = size*1.05+100);
597 if (!cdata)
598 return NULL;
599 cdata_pos = 0;
600
601 /* Initialise zlib stream */
602 s.zalloc = Z_NULL; /* use default allocation functions */
603 s.zfree = Z_NULL;
604 s.opaque = Z_NULL;
605 s.next_in = (unsigned char *)data;
606 s.avail_in = size;
607 s.total_in = 0;
608 s.next_out = cdata;
609 s.avail_out = cdata_alloc;
610 s.total_out = 0;
611 s.data_type = Z_BINARY;
612
613 err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
614 if (err != Z_OK) {
615 fprintf(stderr, "zlib deflateInit2 error: %s\n", s.msg);
616 return NULL;
617 }
618
619 /* Encode to 'cdata' array */
620 for (;s.avail_in;) {
621 s.next_out = &cdata[cdata_pos];
622 s.avail_out = cdata_alloc - cdata_pos;
623 if (cdata_alloc - cdata_pos <= 0) {
624 fprintf(stderr, "Deflate produced larger output than expected. Abort\n");
625 return NULL;
626 }
627 err = deflate(&s, Z_NO_FLUSH);
628 cdata_pos = cdata_alloc - s.avail_out;
629 if (err != Z_OK) {
630 fprintf(stderr, "zlib deflate error: %s\n", s.msg);
631 break;
632 }
633 }
634 if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
635 fprintf(stderr, "zlib deflate error: %s\n", s.msg);
636 }
637 *cdata_size = s.total_out;
638
639 if (deflateEnd(&s) != Z_OK) {
640 fprintf(stderr, "zlib deflate error: %s\n", s.msg);
641 }
642 return (char *)cdata;
643 }
644
645 #ifdef HAVE_LIBLZMA
646 /* ------------------------------------------------------------------------ */
647 /*
648 * Data compression routines using liblzma (xz)
649 *
650 * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
651 * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
652 * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
653 * as compression times.
654 *
655 * For now we disable this functionality. If it's to be reenabled make sure you
656 * improve the mem_inflate implementation as it's just a test hack at the
657 * moment.
658 */
659
660 static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
661 int level) {
662 char *out;
663 size_t out_size = lzma_stream_buffer_bound(size);
664 *cdata_size = 0;
665
666 out = malloc(out_size);
667
668 /* Single call compression */
669 if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
670 (uint8_t *)data, size,
671 (uint8_t *)out, cdata_size,
672 out_size))
673 return NULL;
674
675 return out;
676 }
677
678 static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
679 lzma_stream strm = LZMA_STREAM_INIT;
680 size_t out_size = 0, out_pos = 0;
681 char *out = NULL;
682 int r;
683
684 /* Initiate the decoder */
685 if (LZMA_OK != lzma_stream_decoder(&strm, 50000000, 0))
686 return NULL;
687
688 /* Decode loop */
689 strm.avail_in = csize;
690 strm.next_in = (uint8_t *)cdata;
691
692 for (;strm.avail_in;) {
693 if (strm.avail_in > out_size - out_pos) {
694 out_size += strm.avail_in * 4 + 32768;
695 out = realloc(out, out_size);
696 }
697 strm.avail_out = out_size - out_pos;
698 strm.next_out = (uint8_t *)&out[out_pos];
699
700 r = lzma_code(&strm, LZMA_RUN);
701 if (LZMA_OK != r && LZMA_STREAM_END != r) {
702 fprintf(stderr, "r=%d\n", r);
703 fprintf(stderr, "mem=%"PRId64"d\n", (int64_t)lzma_memusage(&strm));
704 return NULL;
705 }
706
707 out_pos = strm.total_out;
708
709 if (r == LZMA_STREAM_END)
710 break;
711 }
712
713 /* finish up any unflushed data; necessary? */
714 r = lzma_code(&strm, LZMA_FINISH);
715 if (r != LZMA_OK && r != LZMA_STREAM_END) {
716 fprintf(stderr, "r=%d\n", r);
717 return NULL;
718 }
719
720 out = realloc(out, strm.total_out);
721 *size = strm.total_out;
722
723 lzma_end(&strm);
724
725 return out;
726 }
727 #endif
728
729 /* ----------------------------------------------------------------------
730 * CRAM blocks - the dynamically growable data block. We have code to
731 * create, update, (un)compress and read/write.
732 *
733 * These are derived from the deflate_interlaced.c blocks, but with the
734 * CRAM extension of content types and IDs.
735 */
736
737 /*
738 * Allocates a new cram_block structure with a specified content_type and
739 * id.
740 *
741 * Returns block pointer on success
742 * NULL on failure
743 */
744 cram_block *cram_new_block(enum cram_content_type content_type,
745 int content_id) {
746 cram_block *b = malloc(sizeof(*b));
747 if (!b)
748 return NULL;
749 b->method = b->orig_method = RAW;
750 b->content_type = content_type;
751 b->content_id = content_id;
752 b->comp_size = 0;
753 b->uncomp_size = 0;
754 b->data = NULL;
755 b->alloc = 0;
756 b->byte = 0;
757 b->bit = 7; // MSB
758
759 return b;
760 }
761
762 /*
763 * Reads a block from a cram file.
764 * Returns cram_block pointer on success.
765 * NULL on failure
766 */
767 cram_block *cram_read_block(cram_fd *fd) {
768 cram_block *b = malloc(sizeof(*b));
769 if (!b)
770 return NULL;
771
772 //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
773
774 if (-1 == (b->method = hgetc(fd->fp))) { free(b); return NULL; }
775 if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
776 if (-1 == itf8_decode(fd, &b->content_id)) { free(b); return NULL; }
777 if (-1 == itf8_decode(fd, &b->comp_size)) { free(b); return NULL; }
778 if (-1 == itf8_decode(fd, &b->uncomp_size)) { free(b); return NULL; }
779
780 // fprintf(stderr, " method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
781 // b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
782
783 if (b->method == RAW) {
784 b->alloc = b->uncomp_size;
785 if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
786 if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
787 free(b->data);
788 free(b);
789 return NULL;
790 }
791 } else {
792 b->alloc = b->comp_size;
793 if (!(b->data = malloc(b->comp_size))) { free(b); return NULL; }
794 if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
795 free(b->data);
796 free(b);
797 return NULL;
798 }
799 }
800
801 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
802 unsigned char dat[100], *cp = dat;;
803 uint32_t crc;
804
805
806 if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
807 free(b);
808 return NULL;
809 }
810
811 *cp++ = b->method;
812 *cp++ = b->content_type;
813 cp += itf8_put(cp, b->content_id);
814 cp += itf8_put(cp, b->comp_size);
815 cp += itf8_put(cp, b->uncomp_size);
816 crc = crc32(0L, dat, cp-dat);
817 crc = crc32(crc, b->data ? b->data : (uc *)"", b->alloc);
818
819 if (crc != b->crc32) {
820 fprintf(stderr, "Block CRC32 failure\n");
821 free(b->data);
822 free(b);
823 return NULL;
824 }
825 }
826
827 b->orig_method = b->method;
828 b->idx = 0;
829 b->byte = 0;
830 b->bit = 7; // MSB
831
832 return b;
833 }
834
835 /*
836 * Writes a CRAM block.
837 * Returns 0 on success
838 * -1 on failure
839 */
840 int cram_write_block(cram_fd *fd, cram_block *b) {
841 assert(b->method != RAW || (b->comp_size == b->uncomp_size));
842
843 if (hputc(b->method, fd->fp) == EOF) return -1;
844 if (hputc(b->content_type, fd->fp) == EOF) return -1;
845 if (itf8_encode(fd, b->content_id) == -1) return -1;
846 if (itf8_encode(fd, b->comp_size) == -1) return -1;
847 if (itf8_encode(fd, b->uncomp_size) == -1) return -1;
848
849 if (b->method == RAW) {
850 if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
851 return -1;
852 } else {
853 if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
854 return -1;
855 }
856
857 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
858 unsigned char dat[100], *cp = dat;;
859 uint32_t crc;
860
861 *cp++ = b->method;
862 *cp++ = b->content_type;
863 cp += itf8_put(cp, b->content_id);
864 cp += itf8_put(cp, b->comp_size);
865 cp += itf8_put(cp, b->uncomp_size);
866 crc = crc32(0L, dat, cp-dat);
867
868 if (b->method == RAW) {
869 b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
870 } else {
871 b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
872 }
873
874 if (-1 == int32_encode(fd, b->crc32))
875 return -1;
876 }
877
878 return 0;
879 }
880
881 /*
882 * Frees a CRAM block, deallocating internal data too.
883 */
884 void cram_free_block(cram_block *b) {
885 if (!b)
886 return;
887 if (b->data)
888 free(b->data);
889 free(b);
890 }
891
892 /*
893 * Uncompresses a CRAM block, if compressed.
894 */
895 int cram_uncompress_block(cram_block *b) {
896 char *uncomp;
897 size_t uncomp_size = 0;
898
899 if (b->uncomp_size == 0) {
900 // blank block
901 b->method = RAW;
902 return 0;
903 }
904
905 switch (b->method) {
906 case RAW:
907 return 0;
908
909 case GZIP:
910 uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
911 if (!uncomp)
912 return -1;
913 if ((int)uncomp_size != b->uncomp_size) {
914 free(uncomp);
915 return -1;
916 }
917 free(b->data);
918 b->data = (unsigned char *)uncomp;
919 b->alloc = uncomp_size;
920 b->method = RAW;
921 break;
922
923 #ifdef HAVE_LIBBZ2
924 case BZIP2: {
925 unsigned int usize = b->uncomp_size;
926 if (!(uncomp = malloc(usize)))
927 return -1;
928 if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
929 (char *)b->data, b->comp_size,
930 0, 0)) {
931 free(uncomp);
932 return -1;
933 }
934 free(b->data);
935 b->data = (unsigned char *)uncomp;
936 b->alloc = usize;
937 b->method = RAW;
938 b->uncomp_size = usize; // Just incase it differs
939 break;
940 }
941 #else
942 case BZIP2:
943 fprintf(stderr, "Bzip2 compression is not compiled into this "
944 "version.\nPlease rebuild and try again.\n");
945 return -1;
946 #endif
947
948 #ifdef HAVE_LIBLZMA
949 case LZMA:
950 uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
951 if (!uncomp)
952 return -1;
953 if ((int)uncomp_size != b->uncomp_size)
954 return -1;
955 free(b->data);
956 b->data = (unsigned char *)uncomp;
957 b->alloc = uncomp_size;
958 b->method = RAW;
959 break;
960 #else
961 case LZMA:
962 fprintf(stderr, "Lzma compression is not compiled into this "
963 "version.\nPlease rebuild and try again.\n");
964 return -1;
965 break;
966 #endif
967
968 case RANS: {
969 unsigned int usize = b->uncomp_size, usize2;
970 uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
971 assert(usize == usize2);
972 free(b->data);
973 b->data = (unsigned char *)uncomp;
974 b->alloc = usize2;
975 b->method = RAW;
976 b->uncomp_size = usize2; // Just incase it differs
977 //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
978 break;
979 }
980
981 default:
982 return -1;
983 }
984
985 return 0;
986 }
987
988 static char *cram_compress_by_method(char *in, size_t in_size,
989 size_t *out_size,
990 enum cram_block_method method,
991 int level, int strat) {
992 switch (method) {
993 case GZIP:
994 return zlib_mem_deflate(in, in_size, out_size, level, strat);
995
996 case BZIP2: {
997 #ifdef HAVE_LIBBZ2
998 unsigned int comp_size = in_size*1.01 + 600;
999 char *comp = malloc(comp_size);
1000 if (!comp)
1001 return NULL;
1002
1003 if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1004 in, in_size,
1005 level, 0, 30)) {
1006 free(comp);
1007 return NULL;
1008 }
1009 *out_size = comp_size;
1010 return comp;
1011 #else
1012 return NULL;
1013 #endif
1014 }
1015
1016 case LZMA:
1017 #ifdef HAVE_LIBLZMA
1018 return lzma_mem_deflate(in, in_size, out_size, level);
1019 #else
1020 return NULL;
1021 #endif
1022
1023 case RANS0: {
1024 unsigned int out_size_i;
1025 unsigned char *cp;
1026 cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 0);
1027 *out_size = out_size_i;
1028 return (char *)cp;
1029 }
1030
1031 case RANS1: {
1032 unsigned int out_size_i;
1033 unsigned char *cp;
1034
1035 cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 1);
1036 *out_size = out_size_i;
1037 return (char *)cp;
1038 }
1039
1040 case RAW:
1041 break;
1042
1043 default:
1044 return NULL;
1045 }
1046
1047 return NULL;
1048 }
1049
1050
1051 /*
1052 * Compresses a block using one of two different zlib strategies. If we only
1053 * want one choice set strat2 to be -1.
1054 *
1055 * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
1056 * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
1057 * significantly faster.
1058 */
1059 int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
1060 int method, int level) {
1061
1062 char *comp = NULL;
1063 size_t comp_size = 0;
1064 int strat;
1065
1066 //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
1067
1068 if (method == RAW || level == 0 || b->uncomp_size == 0) {
1069 b->method = RAW;
1070 b->comp_size = b->uncomp_size;
1071 //fprintf(stderr, "Skip block id %d\n", b->content_id);
1072 return 0;
1073 }
1074
1075 if (metrics) {
1076 pthread_mutex_lock(&fd->metrics_lock);
1077 if (metrics->trial > 0 || --metrics->next_trial <= 0) {
1078 size_t sz_best = INT_MAX;
1079 size_t sz_gz_rle = 0;
1080 size_t sz_gz_def = 0;
1081 size_t sz_rans0 = 0;
1082 size_t sz_rans1 = 0;
1083 size_t sz_bzip2 = 0;
1084 size_t sz_lzma = 0;
1085 int method_best = 0;
1086 char *c_best = NULL, *c = NULL;
1087
1088 if (metrics->revised_method)
1089 method = metrics->revised_method;
1090 else
1091 metrics->revised_method = method;
1092
1093 if (metrics->next_trial == 0) {
1094 metrics->next_trial = TRIAL_SPAN;
1095 metrics->trial = NTRIALS;
1096 metrics->sz_gz_rle /= 2;
1097 metrics->sz_gz_def /= 2;
1098 metrics->sz_rans0 /= 2;
1099 metrics->sz_rans1 /= 2;
1100 metrics->sz_bzip2 /= 2;
1101 metrics->sz_lzma /= 2;
1102 }
1103
1104 pthread_mutex_unlock(&fd->metrics_lock);
1105
1106 if (method & (1<<GZIP_RLE)) {
1107 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1108 &sz_gz_rle, GZIP, 1, Z_RLE);
1109 if (c && sz_best > sz_gz_rle) {
1110 sz_best = sz_gz_rle;
1111 method_best = GZIP_RLE;
1112 if (c_best)
1113 free(c_best);
1114 c_best = c;
1115 } else if (c) {
1116 free(c);
1117 } else {
1118 sz_gz_rle = b->uncomp_size*2+1000;
1119 }
1120
1121 //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_rle);
1122 }
1123
1124 if (method & (1<<GZIP)) {
1125 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1126 &sz_gz_def, GZIP, level,
1127 Z_FILTERED);
1128 if (c && sz_best > sz_gz_def) {
1129 sz_best = sz_gz_def;
1130 method_best = GZIP;
1131 if (c_best)
1132 free(c_best);
1133 c_best = c;
1134 } else if (c) {
1135 free(c);
1136 } else {
1137 sz_gz_def = b->uncomp_size*2+1000;
1138 }
1139
1140 //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_def);
1141 }
1142
1143 if (method & (1<<RANS0)) {
1144 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1145 &sz_rans0, RANS0, 0, 0);
1146 if (c && sz_best > sz_rans0) {
1147 sz_best = sz_rans0;
1148 method_best = RANS0;
1149 if (c_best)
1150 free(c_best);
1151 c_best = c;
1152 } else if (c) {
1153 free(c);
1154 } else {
1155 sz_rans0 = b->uncomp_size*2+1000;
1156 }
1157 }
1158
1159 if (method & (1<<RANS1)) {
1160 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1161 &sz_rans1, RANS1, 0, 0);
1162 if (c && sz_best > sz_rans1) {
1163 sz_best = sz_rans1;
1164 method_best = RANS1;
1165 if (c_best)
1166 free(c_best);
1167 c_best = c;
1168 } else if (c) {
1169 free(c);
1170 } else {
1171 sz_rans1 = b->uncomp_size*2+1000;
1172 }
1173 }
1174
1175 if (method & (1<<BZIP2)) {
1176 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1177 &sz_bzip2, BZIP2, level, 0);
1178 if (c && sz_best > sz_bzip2) {
1179 sz_best = sz_bzip2;
1180 method_best = BZIP2;
1181 if (c_best)
1182 free(c_best);
1183 c_best = c;
1184 } else if (c) {
1185 free(c);
1186 } else {
1187 sz_bzip2 = b->uncomp_size*2+1000;
1188 }
1189 }
1190
1191 if (method & (1<<LZMA)) {
1192 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1193 &sz_lzma, LZMA, level, 0);
1194 if (c && sz_best > sz_lzma) {
1195 sz_best = sz_lzma;
1196 method_best = LZMA;
1197 if (c_best)
1198 free(c_best);
1199 c_best = c;
1200 } else if (c) {
1201 free(c);
1202 } else {
1203 sz_lzma = b->uncomp_size*2+1000;
1204 }
1205 }
1206
1207 //fprintf(stderr, "sz_best = %d\n", sz_best);
1208
1209 free(b->data);
1210 b->data = (unsigned char *)c_best;
1211 //printf("method_best = %s\n", cram_block_method2str(method_best));
1212 b->method = method_best == GZIP_RLE ? GZIP : method_best;
1213 b->comp_size = sz_best;
1214
1215 pthread_mutex_lock(&fd->metrics_lock);
1216 metrics->sz_gz_rle += sz_gz_rle;
1217 metrics->sz_gz_def += sz_gz_def;
1218 metrics->sz_rans0 += sz_rans0;
1219 metrics->sz_rans1 += sz_rans1;
1220 metrics->sz_bzip2 += sz_bzip2;
1221 metrics->sz_lzma += sz_lzma;
1222 if (--metrics->trial == 0) {
1223 int best_method = RAW;
1224 int best_sz = INT_MAX;
1225
1226 // Scale methods by cost
1227 if (fd->level <= 3) {
1228 metrics->sz_rans1 *= 1.02;
1229 metrics->sz_gz_def *= 1.04;
1230 metrics->sz_bzip2 *= 1.08;
1231 metrics->sz_lzma *= 1.10;
1232 } else if (fd->level <= 6) {
1233 metrics->sz_rans1 *= 1.01;
1234 metrics->sz_gz_def *= 1.02;
1235 metrics->sz_bzip2 *= 1.03;
1236 metrics->sz_lzma *= 1.05;
1237 }
1238
1239 if (method & (1<<GZIP_RLE) && best_sz > metrics->sz_gz_rle)
1240 best_sz = metrics->sz_gz_rle, best_method = GZIP_RLE;
1241
1242 if (method & (1<<GZIP) && best_sz > metrics->sz_gz_def)
1243 best_sz = metrics->sz_gz_def, best_method = GZIP;
1244
1245 if (method & (1<<RANS0) && best_sz > metrics->sz_rans0)
1246 best_sz = metrics->sz_rans0, best_method = RANS0;
1247
1248 if (method & (1<<RANS1) && best_sz > metrics->sz_rans1)
1249 best_sz = metrics->sz_rans1, best_method = RANS1;
1250
1251 if (method & (1<<BZIP2) && best_sz > metrics->sz_bzip2)
1252 best_sz = metrics->sz_bzip2, best_method = BZIP2;
1253
1254 if (method & (1<<LZMA) && best_sz > metrics->sz_lzma)
1255 best_sz = metrics->sz_lzma, best_method = LZMA;
1256
1257 if (best_method == GZIP_RLE) {
1258 metrics->method = GZIP;
1259 metrics->strat = Z_RLE;
1260 } else {
1261 metrics->method = best_method;
1262 metrics->strat = Z_FILTERED;
1263 }
1264
1265 // If we see at least MAXFAIL trials in a row for a specific
1266 // compression method with more than MAXDELTA aggregate
1267 // size then we drop this from the list of methods used
1268 // for this block type.
1269 #define MAXDELTA 0.20
1270 #define MAXFAILS 4
1271 if (best_method == GZIP_RLE) {
1272 metrics->gz_rle_cnt = 0;
1273 metrics->gz_rle_extra = 0;
1274 } else if (best_sz < metrics->sz_gz_rle) {
1275 double r = (double)metrics->sz_gz_rle / best_sz - 1;
1276 if (++metrics->gz_rle_cnt >= MAXFAILS &&
1277 (metrics->gz_rle_extra += r) >= MAXDELTA)
1278 method &= ~(1<<GZIP_RLE);
1279 }
1280
1281 if (best_method == GZIP) {
1282 metrics->gz_def_cnt = 0;
1283 metrics->gz_def_extra = 0;
1284 } else if (best_sz < metrics->sz_gz_def) {
1285 double r = (double)metrics->sz_gz_def / best_sz - 1;
1286 if (++metrics->gz_def_cnt >= MAXFAILS &&
1287 (metrics->gz_def_extra += r) >= MAXDELTA)
1288 method &= ~(1<<GZIP);
1289 }
1290
1291 if (best_method == RANS0) {
1292 metrics->rans0_cnt = 0;
1293 metrics->rans0_extra = 0;
1294 } else if (best_sz < metrics->sz_rans0) {
1295 double r = (double)metrics->sz_rans0 / best_sz - 1;
1296 if (++metrics->rans0_cnt >= MAXFAILS &&
1297 (metrics->rans0_extra += r) >= MAXDELTA)
1298 method &= ~(1<<RANS0);
1299 }
1300
1301 if (best_method == RANS1) {
1302 metrics->rans1_cnt = 0;
1303 metrics->rans1_extra = 0;
1304 } else if (best_sz < metrics->sz_rans1) {
1305 double r = (double)metrics->sz_rans1 / best_sz - 1;
1306 if (++metrics->rans1_cnt >= MAXFAILS &&
1307 (metrics->rans1_extra += r) >= MAXDELTA)
1308 method &= ~(1<<RANS1);
1309 }
1310
1311 if (best_method == BZIP2) {
1312 metrics->bzip2_cnt = 0;
1313 metrics->bzip2_extra = 0;
1314 } else if (best_sz < metrics->sz_bzip2) {
1315 double r = (double)metrics->sz_bzip2 / best_sz - 1;
1316 if (++metrics->bzip2_cnt >= MAXFAILS &&
1317 (metrics->bzip2_extra += r) >= MAXDELTA)
1318 method &= ~(1<<BZIP2);
1319 }
1320
1321 if (best_method == LZMA) {
1322 metrics->lzma_cnt = 0;
1323 metrics->lzma_extra = 0;
1324 } else if (best_sz < metrics->sz_lzma) {
1325 double r = (double)metrics->sz_lzma / best_sz - 1;
1326 if (++metrics->lzma_cnt >= MAXFAILS &&
1327 (metrics->lzma_extra += r) >= MAXDELTA)
1328 method &= ~(1<<LZMA);
1329 }
1330
1331 //if (method != metrics->revised_method)
1332 // fprintf(stderr, "%d: method from %x to %x\n",
1333 // b->content_id, metrics->revised_method, method);
1334 metrics->revised_method = method;
1335 }
1336 pthread_mutex_unlock(&fd->metrics_lock);
1337 } else {
1338 strat = metrics->strat;
1339 method = metrics->method;
1340
1341 pthread_mutex_unlock(&fd->metrics_lock);
1342 comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
1343 &comp_size, method,
1344 level, strat);
1345 if (!comp)
1346 return -1;
1347 free(b->data);
1348 b->data = (unsigned char *)comp;
1349 b->comp_size = comp_size;
1350 b->method = method;
1351 }
1352
1353 } else {
1354 // no cached metrics, so just do zlib?
1355 comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
1356 &comp_size, GZIP, level, Z_FILTERED);
1357 if (!comp) {
1358 fprintf(stderr, "Compression failed!\n");
1359 return -1;
1360 }
1361 free(b->data);
1362 b->data = (unsigned char *)comp;
1363 b->comp_size = comp_size;
1364 b->method = GZIP;
1365 }
1366
1367 if (fd->verbose)
1368 fprintf(stderr, "Compressed block ID %d from %d to %d by method %s\n",
1369 b->content_id, b->uncomp_size, b->comp_size,
1370 cram_block_method2str(b->method));
1371
1372 if (b->method == RANS1)
1373 b->method = RANS0; // Spec just has RANS (not 0/1) with auto-sensing
1374
1375 return 0;
1376 }
1377
1378 cram_metrics *cram_new_metrics(void) {
1379 cram_metrics *m = calloc(1, sizeof(*m));
1380 if (!m)
1381 return NULL;
1382 m->trial = NTRIALS-1;
1383 m->next_trial = TRIAL_SPAN;
1384 m->method = RAW;
1385 m->strat = 0;
1386 m->revised_method = 0;
1387
1388 return m;
1389 }
1390
1391 char *cram_block_method2str(enum cram_block_method m) {
1392 switch(m) {
1393 case RAW: return "RAW";
1394 case GZIP: return "GZIP";
1395 case BZIP2: return "BZIP2";
1396 case LZMA: return "LZMA";
1397 case RANS0: return "RANS0";
1398 case RANS1: return "RANS1";
1399 case GZIP_RLE: return "GZIP_RLE";
1400 case ERROR: break;
1401 }
1402 return "?";
1403 }
1404
1405 char *cram_content_type2str(enum cram_content_type t) {
1406 switch (t) {
1407 case FILE_HEADER: return "FILE_HEADER";
1408 case COMPRESSION_HEADER: return "COMPRESSION_HEADER";
1409 case MAPPED_SLICE: return "MAPPED_SLICE";
1410 case UNMAPPED_SLICE: return "UNMAPPED_SLICE";
1411 case EXTERNAL: return "EXTERNAL";
1412 case CORE: return "CORE";
1413 case CT_ERROR: break;
1414 }
1415 return "?";
1416 }
1417
1418 /*
1419 * Extra error checking on fclose to really ensure data is written.
1420 * Care needs to be taken to handle pipes vs real files.
1421 *
1422 * Returns 0 on success
1423 * -1 on failure.
1424 */
1425 int paranoid_fclose(FILE *fp) {
1426 if (-1 == fflush(fp) && errno != EBADF) {
1427 fclose(fp);
1428 return -1;
1429 }
1430
1431 errno = 0;
1432 if (-1 == fsync(fileno(fp))) {
1433 if (errno != EINVAL) { // eg pipe
1434 fclose(fp);
1435 return -1;
1436 }
1437 }
1438 return fclose(fp);
1439 }
1440
1441 /* ----------------------------------------------------------------------
1442 * Reference sequence handling
1443 *
1444 * These revolve around the refs_t structure, which may potentially be
1445 * shared between multiple cram_fd.
1446 *
1447 * We start with refs_create() to allocate an empty refs_t and then
1448 * populate it with @SQ line data using refs_from_header(). This is done on
1449 * cram_open(). Also at start up we can call cram_load_reference() which
1450 * is used with "scramble -r foo.fa". This replaces the fd->refs with the
1451 * new one specified. In either case refs2id() is then called which
1452 * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
1453 *
1454 * Later, possibly within a thread, we will want to know the actual ref
1455 * seq itself, obtained by calling cram_get_ref(). This may use the
1456 * UR: or M5: fields or the filename specified in the original
1457 * cram_load_reference() call.
1458 *
1459 * Given the potential for multi-threaded reference usage, we have
1460 * reference counting (sorry for the confusing double use of "ref") to
1461 * track the number of callers interested in any specific reference.
1462 */
1463
1464 void refs_free(refs_t *r) {
1465 RP("refs_free()\n");
1466
1467 if (--r->count > 0)
1468 return;
1469
1470 if (!r)
1471 return;
1472
1473 if (r->pool)
1474 string_pool_destroy(r->pool);
1475
1476 if (r->h_meta) {
1477 khint_t k;
1478
1479 for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
1480 ref_entry *e;
1481
1482 if (!kh_exist(r->h_meta, k))
1483 continue;
1484 if (!(e = kh_val(r->h_meta, k)))
1485 continue;
1486 if (e->seq)
1487 free(e->seq);
1488 free(e);
1489 }
1490
1491 kh_destroy(refs, r->h_meta);
1492 }
1493
1494 if (r->ref_id)
1495 free(r->ref_id);
1496
1497 if (r->fp)
1498 bgzf_close(r->fp);
1499
1500 pthread_mutex_destroy(&r->lock);
1501
1502 free(r);
1503 }
1504
1505 static refs_t *refs_create(void) {
1506 refs_t *r = calloc(1, sizeof(*r));
1507
1508 RP("refs_create()\n");
1509
1510 if (!r)
1511 return NULL;
1512
1513 if (!(r->pool = string_pool_create(8192)))
1514 goto err;
1515
1516 r->ref_id = NULL; // see refs2id() to populate.
1517 r->count = 1;
1518 r->last = NULL;
1519 r->last_id = -1;
1520
1521 if (!(r->h_meta = kh_init(refs)))
1522 goto err;
1523
1524 pthread_mutex_init(&r->lock, NULL);
1525
1526 return r;
1527
1528 err:
1529 refs_free(r);
1530 return NULL;
1531 }
1532
1533 /*
1534 * Opens a reference fasta file as a BGZF stream, allowing for
1535 * compressed files. It automatically builds a .fai file if
1536 * required and if compressed a .gzi bgzf index too.
1537 *
1538 * Returns a BGZF handle on success;
1539 * NULL on failure.
1540 */
1541 static BGZF *bgzf_open_ref(char *fn, char *mode) {
1542 BGZF *fp;
1543 char fai_file[PATH_MAX];
1544
1545 snprintf(fai_file, PATH_MAX, "%s.fai", fn);
1546 if (access(fai_file, R_OK) != 0)
1547 if (fai_build(fn) != 0)
1548 return NULL;
1549
1550 if (!(fp = bgzf_open(fn, mode))) {
1551 perror(fn);
1552 return NULL;
1553 }
1554
1555 if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
1556 fprintf(stderr, "Unable to load .gzi index '%s.gzi'\n", fn);
1557 bgzf_close(fp);
1558 return NULL;
1559 }
1560
1561 return fp;
1562 }
1563
1564 /*
1565 * Loads a FAI file for a reference.fasta.
1566 * "is_err" indicates whether failure to load is worthy of emitting an
1567 * error message. In some cases (eg with embedded references) we
1568 * speculatively load, just incase, and silently ignore errors.
1569 *
1570 * Returns the refs_t struct on success (maybe newly allocated);
1571 * NULL on failure
1572 */
1573 static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) {
1574 struct stat sb;
1575 FILE *fp = NULL;
1576 char fai_fn[PATH_MAX];
1577 char line[8192];
1578 refs_t *r = r_orig;
1579 size_t fn_l = strlen(fn);
1580 int id = 0, id_alloc = 0;
1581
1582 RP("refs_load_fai %s\n", fn);
1583
1584 if (!r)
1585 if (!(r = refs_create()))
1586 goto err;
1587
1588 /* Open reference, for later use */
1589 if (stat(fn, &sb) != 0) {
1590 if (is_err)
1591 perror(fn);
1592 goto err;
1593 }
1594
1595 if (r->fp)
1596 if (bgzf_close(r->fp) != 0)
1597 goto err;
1598 r->fp = NULL;
1599
1600 if (!(r->fn = string_dup(r->pool, fn)))
1601 goto err;
1602
1603 if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0)
1604 r->fn[fn_l-4] = 0;
1605
1606 if (!(r->fp = bgzf_open_ref(r->fn, "r")))
1607 goto err;
1608
1609 /* Parse .fai file and load meta-data */
1610 sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, r->fn);
1611
1612 if (stat(fai_fn, &sb) != 0) {
1613 if (is_err)
1614 perror(fai_fn);
1615 goto err;
1616 }
1617 if (!(fp = fopen(fai_fn, "r"))) {
1618 if (is_err)
1619 perror(fai_fn);
1620 goto err;
1621 }
1622 while (fgets(line, 8192, fp) != NULL) {
1623 ref_entry *e = malloc(sizeof(*e));
1624 char *cp;
1625 int n;
1626 khint_t k;
1627
1628 if (!e)
1629 return NULL;
1630
1631 // id
1632 for (cp = line; *cp && !isspace(*cp); cp++)
1633 ;
1634 *cp++ = 0;
1635 e->name = string_dup(r->pool, line);
1636
1637 // length
1638 while (*cp && isspace(*cp))
1639 cp++;
1640 e->length = strtoll(cp, &cp, 10);
1641
1642 // offset
1643 while (*cp && isspace(*cp))
1644 cp++;
1645 e->offset = strtoll(cp, &cp, 10);
1646
1647 // bases per line
1648 while (*cp && isspace(*cp))
1649 cp++;
1650 e->bases_per_line = strtol(cp, &cp, 10);
1651
1652 // line length
1653 while (*cp && isspace(*cp))
1654 cp++;
1655 e->line_length = strtol(cp, &cp, 10);
1656
1657 // filename
1658 e->fn = r->fn;
1659
1660 e->count = 0;
1661 e->seq = NULL;
1662
1663 k = kh_put(refs, r->h_meta, e->name, &n);
1664 if (-1 == n) {
1665 free(e);
1666 return NULL;
1667 }
1668
1669 if (n) {
1670 kh_val(r->h_meta, k) = e;
1671 } else {
1672 ref_entry *re = kh_val(r->h_meta, k);
1673 if (re && (re->count != 0 || re->length != 0)) {
1674 /* Keep old */
1675 free(e);
1676 } else {
1677 /* Replace old */
1678 if (re)
1679 free(re);
1680 kh_val(r->h_meta, k) = e;
1681 }
1682 }
1683
1684 if (id >= id_alloc) {
1685 int x;
1686
1687 id_alloc = id_alloc ?id_alloc*2 : 16;
1688 r->ref_id = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
1689
1690 for (x = id; x < id_alloc; x++)
1691 r->ref_id[x] = NULL;
1692 }
1693 r->ref_id[id] = e;
1694 r->nref = ++id;
1695 }
1696
1697 return r;
1698
1699 err:
1700 if (fp)
1701 fclose(fp);
1702
1703 if (!r_orig)
1704 refs_free(r);
1705
1706 return NULL;
1707 }
1708
1709 /*
1710 * Indexes references by the order they appear in a BAM file. This may not
1711 * necessarily be the same order they appear in the fasta reference file.
1712 *
1713 * Returns 0 on success
1714 * -1 on failure
1715 */
1716 int refs2id(refs_t *r, SAM_hdr *h) {
1717 int i;
1718
1719 if (r->ref_id)
1720 free(r->ref_id);
1721 if (r->last)
1722 r->last = NULL;
1723
1724 r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
1725 if (!r->ref_id)
1726 return -1;
1727
1728 r->nref = h->nref;
1729 for (i = 0; i < h->nref; i++) {
1730 khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
1731 if (k != kh_end(r->h_meta)) {
1732 r->ref_id[i] = kh_val(r->h_meta, k);
1733 } else {
1734 fprintf(stderr, "Unable to find ref name '%s'\n",
1735 h->ref[i].name);
1736 }
1737 }
1738
1739 return 0;
1740 }
1741
1742 /*
1743 * Generates refs_t entries based on @SQ lines in the header.
1744 * Returns 0 on success
1745 * -1 on failure
1746 */
1747 static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) {
1748 int i, j;
1749
1750 if (!h || h->nref == 0)
1751 return 0;
1752
1753 //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
1754
1755 /* Existing refs are fine, as long as they're compatible with the hdr. */
1756 if (!(r->ref_id = realloc(r->ref_id, (r->nref + h->nref) * sizeof(*r->ref_id))))
1757 return -1;
1758
1759 /* Copy info from h->ref[i] over to r */
1760 for (i = 0, j = r->nref; i < h->nref; i++) {
1761 SAM_hdr_type *ty;
1762 SAM_hdr_tag *tag;
1763 khint_t k;
1764 int n;
1765
1766 k = kh_get(refs, r->h_meta, h->ref[i].name);
1767 if (k != kh_end(r->h_meta))
1768 // Ref already known about
1769 continue;
1770
1771 if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
1772 return -1;
1773
1774 if (!h->ref[j].name)
1775 return -1;
1776
1777 r->ref_id[j]->name = string_dup(r->pool, h->ref[i].name);
1778 r->ref_id[j]->length = 0; // marker for not yet loaded
1779
1780 /* Initialise likely filename if known */
1781 if ((ty = sam_hdr_find(h, "SQ", "SN", h->ref[i].name))) {
1782 if ((tag = sam_hdr_find_key(h, ty, "M5", NULL))) {
1783 r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
1784 //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn);
1785 }
1786 }
1787
1788 k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
1789 if (n <= 0) // already exists or error
1790 return -1;
1791 kh_val(r->h_meta, k) = r->ref_id[j];
1792
1793 j++;
1794 }
1795 r->nref = j;
1796
1797 return 0;
1798 }
1799
1800 /*
1801 * Attaches a header to a cram_fd.
1802 *
1803 * This should be used when creating a new cram_fd for writing where
1804 * we have an SAM_hdr already constructed (eg from a file we've read
1805 * in).
1806 */
1807 int cram_set_header(cram_fd *fd, SAM_hdr *hdr) {
1808 if (fd->header)
1809 sam_hdr_free(fd->header);
1810 fd->header = hdr;
1811 return refs_from_header(fd->refs, fd, hdr);
1812 }
1813
1814 /*
1815 * Converts a directory and a filename into an expanded path, replacing %s
1816 * in directory with the filename and %[0-9]+s with portions of the filename
1817 * Any remaining parts of filename are added to the end with /%s.
1818 */
1819 void expand_cache_path(char *path, char *dir, char *fn) {
1820 char *cp;
1821
1822 while ((cp = strchr(dir, '%'))) {
1823 strncpy(path, dir, cp-dir);
1824 path += cp-dir;
1825
1826 if (*++cp == 's') {
1827 strcpy(path, fn);
1828 path += strlen(fn);
1829 fn += strlen(fn);
1830 cp++;
1831 } else if (*cp >= '0' && *cp <= '9') {
1832 char *endp;
1833 long l;
1834
1835 l = strtol(cp, &endp, 10);
1836 l = MIN(l, strlen(fn));
1837 if (*endp == 's') {
1838 strncpy(path, fn, l);
1839 path += l;
1840 fn += l;
1841 *path = 0;
1842 cp = endp+1;
1843 } else {
1844 *path++ = '%';
1845 *path++ = *cp++;
1846 }
1847 } else {
1848 *path++ = '%';
1849 *path++ = *cp++;
1850 }
1851 dir = cp;
1852 }
1853 strcpy(path, dir);
1854 path += strlen(dir);
1855 if (*fn && path[-1] != '/')
1856 *path++ = '/';
1857 strcpy(path, fn);
1858 }
1859
1860 /*
1861 * Make the directory containing path and any prefix directories.
1862 */
1863 void mkdir_prefix(char *path, int mode) {
1864 char *cp = strrchr(path, '/');
1865 if (!cp)
1866 return;
1867
1868 *cp = 0;
1869 if (is_directory(path)) {
1870 *cp = '/';
1871 return;
1872 }
1873
1874 if (mkdir(path, mode) == 0) {
1875 chmod(path, mode);
1876 *cp = '/';
1877 return;
1878 }
1879
1880 mkdir_prefix(path, mode);
1881 mkdir(path, mode);
1882 chmod(path, mode);
1883 *cp = '/';
1884 }
1885
1886 /*
1887 * Return the cache directory to use, based on the first of these
1888 * environment variables to be set to a non-empty value.
1889 */
1890 static const char *get_cache_basedir(const char **extra) {
1891 char *base;
1892
1893 *extra = "";
1894
1895 base = getenv("XDG_CACHE_HOME");
1896 if (base && *base) return base;
1897
1898 base = getenv("HOME");
1899 if (base && *base) { *extra = "/.cache"; return base; }
1900
1901 base = getenv("TMPDIR");
1902 if (base && *base) return base;
1903
1904 base = getenv("TEMP");
1905 if (base && *base) return base;
1906
1907 return "/tmp";
1908 }
1909
1910 /*
1911 * Queries the M5 string from the header and attempts to populate the
1912 * reference from this using the REF_PATH environment.
1913 *
1914 * Returns 0 on sucess
1915 * -1 on failure
1916 */
1917 static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
1918 char *ref_path = getenv("REF_PATH");
1919 SAM_hdr_type *ty;
1920 SAM_hdr_tag *tag;
1921 char path[PATH_MAX], path_tmp[PATH_MAX], cache[PATH_MAX];
1922 char *local_cache = getenv("REF_CACHE");
1923 mFILE *mf;
1924
1925 if (fd->verbose)
1926 fprintf(stderr, "cram_populate_ref on fd %p, id %d\n", fd, id);
1927
1928 if (!ref_path || *ref_path == '\0') {
1929 /*
1930 * If we have no ref path, we use the EBI server.
1931 * However to avoid spamming it we require a local ref cache too.
1932 */
1933 ref_path = "http://www.ebi.ac.uk:80/ena/cram/md5/%s";
1934 if (!local_cache || *local_cache == '\0') {
1935 const char *extra;
1936 const char *base = get_cache_basedir(&extra);
1937 snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra);
1938 local_cache = cache;
1939 if (fd->verbose)
1940 fprintf(stderr, "Populating local cache: %s\n", local_cache);
1941 }
1942 }
1943
1944 if (!r->name)
1945 return -1;
1946
1947 if (!(ty = sam_hdr_find(fd->header, "SQ", "SN", r->name)))
1948 return -1;
1949
1950 if (!(tag = sam_hdr_find_key(fd->header, ty, "M5", NULL)))
1951 goto no_M5;
1952
1953 if (fd->verbose)
1954 fprintf(stderr, "Querying ref %s\n", tag->str+3);
1955
1956 /* Use cache if available */
1957 if (local_cache && *local_cache) {
1958 struct stat sb;
1959 BGZF *fp;
1960
1961 expand_cache_path(path, local_cache, tag->str+3);
1962
1963 if (0 == stat(path, &sb) && (fp = bgzf_open(path, "r"))) {
1964 r->length = sb.st_size;
1965 r->offset = r->line_length = r->bases_per_line = 0;
1966
1967 r->fn = string_dup(fd->refs->pool, path);
1968
1969 if (fd->refs->fp)
1970 if (bgzf_close(fd->refs->fp) != 0)
1971 return -1;
1972 fd->refs->fp = fp;
1973 fd->refs->fn = r->fn;
1974
1975 // Fall back to cram_get_ref() where it'll do the actual
1976 // reading of the file.
1977 return 0;
1978 }
1979 }
1980
1981 /* Otherwise search */
1982 if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
1983 size_t sz;
1984 r->seq = mfsteal(mf, &sz);
1985 r->length = sz;
1986 } else {
1987 refs_t *refs;
1988 char *fn;
1989
1990 no_M5:
1991 /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
1992 if (!(tag = sam_hdr_find_key(fd->header, ty, "UR", NULL)))
1993 return -1;
1994
1995 fn = (strncmp(tag->str+3, "file:", 5) == 0)
1996 ? tag->str+8
1997 : tag->str+3;
1998
1999 if (fd->refs->fp) {
2000 if (bgzf_close(fd->refs->fp) != 0)
2001 return -1;
2002 fd->refs->fp = NULL;
2003 }
2004 if (!(refs = refs_load_fai(fd->refs, fn, 0)))
2005 return -1;
2006 fd->refs = refs;
2007 if (fd->refs->fp) {
2008 if (bgzf_close(fd->refs->fp) != 0)
2009 return -1;
2010 fd->refs->fp = NULL;
2011 }
2012
2013 if (!fd->refs->fn)
2014 return -1;
2015
2016 if (-1 == refs2id(fd->refs, fd->header))
2017 return -1;
2018 if (!fd->refs->ref_id || !fd->refs->ref_id[id])
2019 return -1;
2020
2021 // Local copy already, so fall back to cram_get_ref().
2022 return 0;
2023 }
2024
2025 /* Populate the local disk cache if required */
2026 if (local_cache && *local_cache) {
2027 FILE *fp;
2028 int i;
2029
2030 expand_cache_path(path, local_cache, tag->str+3);
2031 if (fd->verbose)
2032 fprintf(stderr, "Path='%s'\n", path);
2033 mkdir_prefix(path, 01777);
2034
2035 i = 0;
2036 do {
2037 sprintf(path_tmp, "%s.tmp_%d", path, /*getpid(),*/ i);
2038 i++;
2039 fp = fopen(path_tmp, "wx");
2040 } while (fp == NULL && errno == EEXIST);
2041 if (!fp) {
2042 perror(path_tmp);
2043
2044 // Not fatal - we have the data already so keep going.
2045 return 0;
2046 }
2047
2048 if (r->length != fwrite(r->seq, 1, r->length, fp)) {
2049 perror(path);
2050 }
2051 if (-1 == paranoid_fclose(fp)) {
2052 unlink(path_tmp);
2053 } else {
2054 if (0 == chmod(path_tmp, 0444))
2055 rename(path_tmp, path);
2056 else
2057 unlink(path_tmp);
2058 }
2059 }
2060
2061 return 0;
2062 }
2063
2064 static void cram_ref_incr_locked(refs_t *r, int id) {
2065 RP("%d INC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count+1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
2066
2067 if (id < 0 || !r->ref_id[id]->seq)
2068 return;
2069
2070 if (r->last_id == id)
2071 r->last_id = -1;
2072
2073 ++r->ref_id[id]->count;
2074 }
2075
2076 void cram_ref_incr(refs_t *r, int id) {
2077 pthread_mutex_lock(&r->lock);
2078 cram_ref_incr_locked(r, id);
2079 pthread_mutex_unlock(&r->lock);
2080 }
2081
2082 static void cram_ref_decr_locked(refs_t *r, int id) {
2083 RP("%d DEC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count-1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
2084
2085 if (id < 0 || !r->ref_id[id]->seq) {
2086 assert(r->ref_id[id]->count >= 0);
2087 return;
2088 }
2089
2090 if (--r->ref_id[id]->count <= 0) {
2091 assert(r->ref_id[id]->count == 0);
2092 if (r->last_id >= 0) {
2093 if (r->ref_id[r->last_id]->count <= 0 &&
2094 r->ref_id[r->last_id]->seq) {
2095 RP("%d FREE REF %d (%p)\n", gettid(),
2096 r->last_id, r->ref_id[r->last_id]->seq);
2097 free(r->ref_id[r->last_id]->seq);
2098 r->ref_id[r->last_id]->seq = NULL;
2099 r->ref_id[r->last_id]->length = 0;
2100 }
2101 }
2102 r->last_id = id;
2103 }
2104 }
2105
2106 void cram_ref_decr(refs_t *r, int id) {
2107 pthread_mutex_lock(&r->lock);
2108 cram_ref_decr_locked(r, id);
2109 pthread_mutex_unlock(&r->lock);
2110 }
2111
2112 /*
2113 * Used by cram_ref_load and cram_ref_get. The file handle will have
2114 * already been opened, so we can catch it. The ref_entry *e informs us
2115 * of whether this is a multi-line fasta file or a raw MD5 style file.
2116 * Either way we create a single contiguous sequence.
2117 *
2118 * Returns all or part of a reference sequence on success (malloced);
2119 * NULL on failure.
2120 */
2121 static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) {
2122 off_t offset, len;
2123 char *seq;
2124
2125 if (end < start)
2126 end = start;
2127
2128 /*
2129 * Compute locations in file. This is trivial for the MD5 files, but
2130 * is still necessary for the fasta variants.
2131 */
2132 offset = e->line_length
2133 ? e->offset + (start-1)/e->bases_per_line * e->line_length +
2134 (start-1) % e->bases_per_line
2135 : start-1;
2136
2137 len = (e->line_length
2138 ? e->offset + (end-1)/e->bases_per_line * e->line_length +
2139 (end-1) % e->bases_per_line
2140 : end-1) - offset + 1;
2141
2142 if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
2143 perror("bgzf_useek() on reference file");
2144 return NULL;
2145 }
2146
2147 if (len == 0 || !(seq = malloc(len))) {
2148 return NULL;
2149 }
2150
2151 if (len != bgzf_read(fp, seq, len)) {
2152 perror("bgzf_read() on reference file");
2153 free(seq);
2154 return NULL;
2155 }
2156
2157 /* Strip white-space if required. */
2158 if (len != end-start+1) {
2159 int i, j;
2160 char *cp = seq;
2161 char *cp_to;
2162
2163 for (i = j = 0; i < len; i++) {
2164 if (cp[i] >= '!' && cp[i] <= '~')
2165 cp[j++] = cp[i] & ~0x20;
2166 }
2167 cp_to = cp+j;
2168
2169 if (cp_to - seq != end-start+1) {
2170 fprintf(stderr, "Malformed reference file?\n");
2171 free(seq);
2172 return NULL;
2173 }
2174 } else {
2175 int i;
2176 for (i = 0; i < len; i++) {
2177 seq[i] = seq[i] & ~0x20; // uppercase in ASCII
2178 }
2179 }
2180
2181 return seq;
2182 }
2183
2184 /*
2185 * Load the entire reference 'id'.
2186 * This also increments the reference count by 1.
2187 *
2188 * Returns ref_entry on success;
2189 * NULL on failure
2190 */
2191 ref_entry *cram_ref_load(refs_t *r, int id) {
2192 ref_entry *e = r->ref_id[id];
2193 int start = 1, end = e->length;
2194 char *seq;
2195
2196 if (e->seq) {
2197 return e;
2198 }
2199
2200 assert(e->count == 0);
2201
2202 if (r->last) {
2203 #ifdef REF_DEBUG
2204 int idx = 0;
2205 for (idx = 0; idx < r->nref; idx++)
2206 if (r->last == r->ref_id[idx])
2207 break;
2208 RP("%d cram_ref_load DECR %d\n", gettid(), idx);
2209 #endif
2210 assert(r->last->count > 0);
2211 if (--r->last->count <= 0) {
2212 RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
2213 if (r->last->seq) {
2214 free(r->last->seq);
2215 r->last->seq = NULL;
2216 }
2217 }
2218 }
2219
2220 /* Open file if it's not already the current open reference */
2221 if (strcmp(r->fn, e->fn) || r->fp == NULL) {
2222 if (r->fp)
2223 if (bgzf_close(r->fp) != 0)
2224 return NULL;
2225 r->fn = e->fn;
2226 if (!(r->fp = bgzf_open_ref(r->fn, "r")))
2227 return NULL;
2228 }
2229
2230 RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
2231
2232 if (!(seq = load_ref_portion(r->fp, e, start, end))) {
2233 return NULL;
2234 }
2235
2236 RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
2237
2238 RP("%d INC REF %d, %d\n", gettid(), id, (int)(e->count+1));
2239 e->seq = seq;
2240 e->count++;
2241
2242 /*
2243 * Also keep track of last used ref so incr/decr loops on the same
2244 * sequence don't cause load/free loops.
2245 */
2246 RP("%d cram_ref_load INCR %d => %d\n", gettid(), id, e->count+1);
2247 r->last = e;
2248 e->count++;
2249
2250 return e;
2251 }
2252
2253 /*
2254 * Returns a portion of a reference sequence from start to end inclusive.
2255 * The returned pointer is owned by either the cram_file fd or by the
2256 * internal refs_t structure and should not be freed by the caller.
2257 *
2258 * The difference is whether or not this refs_t is in use by just the one
2259 * cram_fd or by multiples, or whether we have multiple threads accessing
2260 * references. In either case fd->shared will be true and we start using
2261 * reference counting to track the number of users of a specific reference
2262 * sequence.
2263 *
2264 * Otherwise the ref seq returned is allocated as part of cram_fd itself
2265 * and will be freed up on the next call to cram_get_ref or cram_close.
2266 *
2267 * To return the entire reference sequence, specify start as 1 and end
2268 * as 0.
2269 *
2270 * To cease using a reference, call cram_ref_decr().
2271 *
2272 * Returns reference on success,
2273 * NULL on failure
2274 */
2275 char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
2276 ref_entry *r;
2277 char *seq;
2278 int ostart = start;
2279
2280 if (id == -1)
2281 return NULL;
2282
2283 /* FIXME: axiomatic query of r->seq being true?
2284 * Or shortcut for unsorted data where we load once and never free?
2285 */
2286
2287 //fd->shared_ref = 1; // hard code for now to simplify things
2288
2289 pthread_mutex_lock(&fd->ref_lock);
2290
2291 RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
2292
2293 /*
2294 * Unsorted data implies we want to fetch an entire reference at a time.
2295 * We just deal with this at the moment by claiming we're sharing
2296 * references instead, which has the same requirement.
2297 */
2298 if (fd->unsorted)
2299 fd->shared_ref = 1;
2300
2301
2302 /* Sanity checking: does this ID exist? */
2303 if (id >= fd->refs->nref) {
2304 fprintf(stderr, "No reference found for id %d\n", id);
2305 pthread_mutex_unlock(&fd->ref_lock);
2306 return NULL;
2307 }
2308
2309 if (!fd->refs || !fd->refs->ref_id[id]) {
2310 fprintf(stderr, "No reference found for id %d\n", id);
2311 pthread_mutex_unlock(&fd->ref_lock);
2312 return NULL;
2313 }
2314
2315 if (!(r = fd->refs->ref_id[id])) {
2316 fprintf(stderr, "No reference found for id %d\n", id);
2317 pthread_mutex_unlock(&fd->ref_lock);
2318 return NULL;
2319 }
2320
2321
2322 /*
2323 * It has an entry, but may not have been populated yet.
2324 * Any manually loaded .fai files have their lengths known.
2325 * A ref entry computed from @SQ lines (M5 or UR field) will have
2326 * r->length == 0 unless it's been loaded once and verified that we have
2327 * an on-disk filename for it.
2328 *
2329 * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
2330 * open_path_mfile and libcurl, which isn't multi-thread safe unless I
2331 * rewrite my code to have one curl handle per thread.
2332 */
2333 pthread_mutex_lock(&fd->refs->lock);
2334 if (r->length == 0) {
2335 if (cram_populate_ref(fd, id, r) == -1) {
2336 fprintf(stderr, "Failed to populate reference for id %d\n", id);
2337 pthread_mutex_unlock(&fd->refs->lock);
2338 pthread_mutex_unlock(&fd->ref_lock);
2339 return NULL;
2340 }
2341 r = fd->refs->ref_id[id];
2342 if (fd->unsorted)
2343 cram_ref_incr_locked(fd->refs, id);
2344 }
2345
2346
2347 /*
2348 * We now know that we the filename containing the reference, so check
2349 * for limits. If it's over half the reference we'll load all of it in
2350 * memory as this will speed up subsequent calls.
2351 */
2352 if (end < 1)
2353 end = r->length;
2354 if (end >= r->length)
2355 end = r->length;
2356 assert(start >= 1);
2357
2358 if (end - start >= 0.5*r->length || fd->shared_ref) {
2359 start = 1;
2360 end = r->length;
2361 }
2362
2363 /*
2364 * Maybe we have it cached already? If so use it.
2365 *
2366 * Alternatively if we don't have the sequence but we're sharing
2367 * references and/or are asking for the entire length of it, then
2368 * load the full reference into the refs structure and return
2369 * a pointer to that one instead.
2370 */
2371 if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
2372 char *cp;
2373
2374 if (id >= 0) {
2375 if (r->seq) {
2376 cram_ref_incr_locked(fd->refs, id);
2377 } else {
2378 ref_entry *e;
2379 if (!(e = cram_ref_load(fd->refs, id))) {
2380 pthread_mutex_unlock(&fd->refs->lock);
2381 pthread_mutex_unlock(&fd->ref_lock);
2382 return NULL;
2383 }
2384
2385 /* unsorted data implies cache ref indefinitely, to avoid
2386 * continually loading and unloading.
2387 */
2388 if (fd->unsorted)
2389 cram_ref_incr_locked(fd->refs, id);
2390 }
2391
2392 fd->ref = NULL; /* We never access it directly */
2393 fd->ref_start = 1;
2394 fd->ref_end = r->length;
2395 fd->ref_id = id;
2396
2397 cp = fd->refs->ref_id[id]->seq + ostart-1;
2398 } else {
2399 fd->ref = NULL;
2400 cp = NULL;
2401 }
2402
2403 RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
2404
2405 pthread_mutex_unlock(&fd->refs->lock);
2406 pthread_mutex_unlock(&fd->ref_lock);
2407 return cp;
2408 }
2409
2410 /*
2411 * Otherwise we're not sharing, we don't have a copy of it already and
2412 * we're only asking for a small portion of it.
2413 *
2414 * In this case load up just that segment ourselves, freeing any old
2415 * small segments in the process.
2416 */
2417
2418 /* Unmapped ref ID */
2419 if (id < 0) {
2420 if (fd->ref_free) {
2421 free(fd->ref_free);
2422 fd->ref_free = NULL;
2423 }
2424 fd->ref = NULL;
2425 fd->ref_id = id;
2426 pthread_mutex_unlock(&fd->refs->lock);
2427 pthread_mutex_unlock(&fd->ref_lock);
2428 return NULL;
2429 }
2430
2431 /* Open file if it's not already the current open reference */
2432 if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
2433 if (fd->refs->fp)
2434 if (bgzf_close(fd->refs->fp) != 0)
2435 return NULL;
2436 fd->refs->fn = r->fn;
2437 if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r"))) {
2438 pthread_mutex_unlock(&fd->refs->lock);
2439 pthread_mutex_unlock(&fd->ref_lock);
2440 return NULL;
2441 }
2442 }
2443
2444 if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
2445 pthread_mutex_unlock(&fd->refs->lock);
2446 pthread_mutex_unlock(&fd->ref_lock);
2447 return NULL;
2448 }
2449
2450 if (fd->ref_free)
2451 free(fd->ref_free);
2452
2453 fd->ref_id = id;
2454 fd->ref_start = start;
2455 fd->ref_end = end;
2456 fd->ref_free = fd->ref;
2457 seq = fd->ref;
2458
2459 pthread_mutex_unlock(&fd->refs->lock);
2460 pthread_mutex_unlock(&fd->ref_lock);
2461
2462 return seq + ostart - start;
2463 }
2464
2465 /*
2466 * If fd has been opened for reading, it may be permitted to specify 'fn'
2467 * as NULL and let the code auto-detect the reference by parsing the
2468 * SAM header @SQ lines.
2469 */
2470 int cram_load_reference(cram_fd *fd, char *fn) {
2471 if (fn) {
2472 fd->refs = refs_load_fai(fd->refs, fn,
2473 !(fd->embed_ref && fd->mode == 'r'));
2474 fn = fd->refs ? fd->refs->fn : NULL;
2475 }
2476 fd->ref_fn = fn;
2477
2478 if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
2479 if (fd->refs)
2480 refs_free(fd->refs);
2481 if (!(fd->refs = refs_create()))
2482 return -1;
2483 if (-1 == refs_from_header(fd->refs, fd, fd->header))
2484 return -1;
2485 }
2486
2487 if (fd->header)
2488 if (-1 == refs2id(fd->refs, fd->header))
2489 return -1;
2490
2491 return fn ? 0 : -1;
2492 }
2493
2494 /* ----------------------------------------------------------------------
2495 * Containers
2496 */
2497
2498 /*
2499 * Creates a new container, specifying the maximum number of slices
2500 * and records permitted.
2501 *
2502 * Returns cram_container ptr on success
2503 * NULL on failure
2504 */
2505 cram_container *cram_new_container(int nrec, int nslice) {
2506 cram_container *c = calloc(1, sizeof(*c));
2507 enum cram_DS_ID id;
2508
2509 if (!c)
2510 return NULL;
2511
2512 c->curr_ref = -2;
2513
2514 c->max_c_rec = nrec * nslice;
2515 c->curr_c_rec = 0;
2516
2517 c->max_rec = nrec;
2518 c->record_counter = 0;
2519 c->num_bases = 0;
2520
2521 c->max_slice = nslice;
2522 c->curr_slice = 0;
2523
2524 c->pos_sorted = 1;
2525 c->max_apos = 0;
2526 c->multi_seq = 0;
2527
2528 c->bams = NULL;
2529
2530 if (!(c->slices = (cram_slice **)calloc(nslice, sizeof(cram_slice *))))
2531 goto err;
2532 c->slice = NULL;
2533
2534 if (!(c->comp_hdr = cram_new_compression_header()))
2535 goto err;
2536 c->comp_hdr_block = NULL;
2537
2538 for (id = DS_RN; id < DS_TN; id++)
2539 if (!(c->stats[id] = cram_stats_create())) goto err;
2540
2541 //c->aux_B_stats = cram_stats_create();
2542
2543 if (!(c->tags_used = kh_init(s_i2i)))
2544 goto err;
2545 c->refs_used = 0;
2546
2547 return c;
2548
2549 err:
2550 if (c) {
2551 if (c->slices)
2552 free(c->slices);
2553 free(c);
2554 }
2555 return NULL;
2556 }
2557
2558 void cram_free_container(cram_container *c) {
2559 enum cram_DS_ID id;
2560 int i;
2561
2562 if (!c)
2563 return;
2564
2565 if (c->refs_used)
2566 free(c->refs_used);
2567
2568 if (c->landmark)
2569 free(c->landmark);
2570
2571 if (c->comp_hdr)
2572 cram_free_compression_header(c->comp_hdr);
2573
2574 if (c->comp_hdr_block)
2575 cram_free_block(c->comp_hdr_block);
2576
2577 if (c->slices) {
2578 for (i = 0; i < c->max_slice; i++)
2579 if (c->slices[i])
2580 cram_free_slice(c->slices[i]);
2581 free(c->slices);
2582 }
2583
2584 for (id = DS_RN; id < DS_TN; id++)
2585 if (c->stats[id]) cram_stats_free(c->stats[id]);
2586
2587 //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
2588
2589 if (c->tags_used) kh_destroy(s_i2i, c->tags_used);
2590
2591 free(c);
2592 }
2593
2594 /*
2595 * Reads a container header.
2596 *
2597 * Returns cram_container on success
2598 * NULL on failure or no container left (fd->err == 0).
2599 */
2600 cram_container *cram_read_container(cram_fd *fd) {
2601 cram_container c2, *c;
2602 int i, s;
2603 size_t rd = 0;
2604
2605 fd->err = 0;
2606 fd->eof = 0;
2607
2608 memset(&c2, 0, sizeof(c2));
2609 if (CRAM_MAJOR_VERS(fd->version) == 1) {
2610 if ((s = itf8_decode(fd, &c2.length)) == -1) {
2611 fd->eof = fd->empty_container ? 1 : 2;
2612 return NULL;
2613 } else {
2614 rd+=s;
2615 }
2616 } else {
2617 if ((s = int32_decode(fd, &c2.length)) == -1) {
2618 if (CRAM_MAJOR_VERS(fd->version) == 2 &&
2619 CRAM_MINOR_VERS(fd->version) == 0)
2620 fd->eof = 1; // EOF blocks arrived in v2.1
2621 else
2622 fd->eof = fd->empty_container ? 1 : 2;
2623 return NULL;
2624 } else {
2625 rd+=s;
2626 }
2627 }
2628 if ((s = itf8_decode(fd, &c2.ref_seq_id)) == -1) return NULL; else rd+=s;
2629 if ((s = itf8_decode(fd, &c2.ref_seq_start))== -1) return NULL; else rd+=s;
2630 if ((s = itf8_decode(fd, &c2.ref_seq_span)) == -1) return NULL; else rd+=s;
2631 if ((s = itf8_decode(fd, &c2.num_records)) == -1) return NULL; else rd+=s;
2632
2633 if (CRAM_MAJOR_VERS(fd->version) == 1) {
2634 c2.record_counter = 0;
2635 c2.num_bases = 0;
2636 } else {
2637 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2638 if ((s = ltf8_decode(fd, &c2.record_counter)) == -1)
2639 return NULL;
2640 else
2641 rd += s;
2642 } else {
2643 int32_t i32;
2644 if ((s = itf8_decode(fd, &i32)) == -1)
2645 return NULL;
2646 else
2647 rd += s;
2648 c2.record_counter = i32;
2649 }
2650
2651 if ((s = ltf8_decode(fd, &c2.num_bases))== -1)
2652 return NULL;
2653 else
2654 rd += s;
2655 }
2656 if ((s = itf8_decode(fd, &c2.num_blocks)) == -1) return NULL; else rd+=s;
2657 if ((s = itf8_decode(fd, &c2.num_landmarks))== -1) return NULL; else rd+=s;
2658
2659 if (!(c = calloc(1, sizeof(*c))))
2660 return NULL;
2661
2662 *c = c2;
2663
2664 if (!(c->landmark = malloc(c->num_landmarks * sizeof(int32_t))) &&
2665 c->num_landmarks) {
2666 fd->err = errno;
2667 cram_free_container(c);
2668 return NULL;
2669 }
2670 for (i = 0; i < c->num_landmarks; i++) {
2671 if ((s = itf8_decode(fd, &c->landmark[i])) == -1) {
2672 cram_free_container(c);
2673 return NULL;
2674 } else {
2675 rd += s;
2676 }
2677 }
2678
2679 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2680 uint32_t crc, i;
2681 unsigned char *dat = malloc(50 + 5*(c->num_landmarks)), *cp = dat;
2682 if (!dat) {
2683 cram_free_container(c);
2684 return NULL;
2685 }
2686 if (-1 == int32_decode(fd, (int32_t *)&c->crc32))
2687 return NULL;
2688 else
2689 rd+=4;
2690
2691 /* Reencode first as we can't easily access the original byte stream.
2692 *
2693 * FIXME: Technically this means this may not be fool proof. We could
2694 * create a CRAM file using a 2 byte ITF8 value that can fit in a
2695 * 1 byte field, meaning the encoding is different to the original
2696 * form and so has a different CRC.
2697 *
2698 * The correct implementation would be to have an alternative form
2699 * of itf8_decode which also squirrels away the raw byte stream
2700 * during decoding so we can then CRC that.
2701 */
2702 *(unsigned int *)cp = le_int4(c->length); cp += 4;
2703 cp += itf8_put(cp, c->ref_seq_id);
2704 cp += itf8_put(cp, c->ref_seq_start);
2705 cp += itf8_put(cp, c->ref_seq_span);
2706 cp += itf8_put(cp, c->num_records);
2707 cp += ltf8_put((char *)cp, c->record_counter);
2708 cp += itf8_put(cp, c->num_bases);
2709 cp += itf8_put(cp, c->num_blocks);
2710 cp += itf8_put(cp, c->num_landmarks);
2711 for (i = 0; i < c->num_landmarks; i++) {
2712 cp += itf8_put(cp, c->landmark[i]);
2713 }
2714
2715 crc = crc32(0L, dat, cp-dat);
2716 if (crc != c->crc32) {
2717 fprintf(stderr, "Container header CRC32 failure\n");
2718 cram_free_container(c);
2719 return NULL;
2720 }
2721 }
2722
2723 c->offset = rd;
2724 c->slices = NULL;
2725 c->curr_slice = 0;
2726 c->max_slice = c->num_landmarks;
2727 c->slice_rec = 0;
2728 c->curr_rec = 0;
2729 c->max_rec = 0;
2730
2731 if (c->ref_seq_id == -2) {
2732 c->multi_seq = 1;
2733 fd->multi_seq = 1;
2734 }
2735
2736 fd->empty_container =
2737 (c->num_records == 0 &&
2738 c->ref_seq_id == -1 &&
2739 c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
2740
2741 return c;
2742 }
2743
2744 /*
2745 * Writes a container structure.
2746 *
2747 * Returns 0 on success
2748 * -1 on failure
2749 */
2750 int cram_write_container(cram_fd *fd, cram_container *c) {
2751 char buf_a[1024], *buf = buf_a, *cp;
2752 int i;
2753
2754 if (55 + c->num_landmarks * 5 >= 1024)
2755 buf = malloc(55 + c->num_landmarks * 5);
2756 cp = buf;
2757
2758 if (CRAM_MAJOR_VERS(fd->version) == 1) {
2759 cp += itf8_put(cp, c->length);
2760 } else {
2761 *(int32_t *)cp = le_int4(c->length);
2762 cp += 4;
2763 }
2764 if (c->multi_seq) {
2765 cp += itf8_put(cp, -2);
2766 cp += itf8_put(cp, 0);
2767 cp += itf8_put(cp, 0);
2768 } else {
2769 cp += itf8_put(cp, c->ref_seq_id);
2770 cp += itf8_put(cp, c->ref_seq_start);
2771 cp += itf8_put(cp, c->ref_seq_span);
2772 }
2773 cp += itf8_put(cp, c->num_records);
2774 if (CRAM_MAJOR_VERS(fd->version) == 2) {
2775 cp += itf8_put(cp, c->record_counter);
2776 cp += ltf8_put(cp, c->num_bases);
2777 } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2778 cp += ltf8_put(cp, c->record_counter);
2779 cp += ltf8_put(cp, c->num_bases);
2780 }
2781
2782 cp += itf8_put(cp, c->num_blocks);
2783 cp += itf8_put(cp, c->num_landmarks);
2784 for (i = 0; i < c->num_landmarks; i++)
2785 cp += itf8_put(cp, c->landmark[i]);
2786
2787 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2788 c->crc32 = crc32(0L, (uc *)buf, cp-buf);
2789 cp[0] = c->crc32 & 0xff;
2790 cp[1] = (c->crc32 >> 8) & 0xff;
2791 cp[2] = (c->crc32 >> 16) & 0xff;
2792 cp[3] = (c->crc32 >> 24) & 0xff;
2793 cp += 4;
2794 }
2795
2796 if (cp-buf != hwrite(fd->fp, buf, cp-buf)) {
2797 if (buf != buf_a)
2798 free(buf);
2799 return -1;
2800 }
2801
2802 if (buf != buf_a)
2803 free(buf);
2804
2805 return 0;
2806 }
2807
2808 // common component shared by cram_flush_container{,_mt}
2809 static int cram_flush_container2(cram_fd *fd, cram_container *c) {
2810 int i, j;
2811
2812 //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
2813
2814 /* Write the container struct itself */
2815 if (0 != cram_write_container(fd, c))
2816 return -1;
2817
2818 /* And the compression header */
2819 if (0 != cram_write_block(fd, c->comp_hdr_block))
2820 return -1;
2821
2822 /* Followed by the slice blocks */
2823 for (i = 0; i < c->curr_slice; i++) {
2824 cram_slice *s = c->slices[i];
2825
2826 if (0 != cram_write_block(fd, s->hdr_block))
2827 return -1;
2828
2829 for (j = 0; j < s->hdr->num_blocks; j++) {
2830 if (0 != cram_write_block(fd, s->block[j]))
2831 return -1;
2832 }
2833 }
2834
2835 return hflush(fd->fp) == 0 ? 0 : -1;
2836 }
2837
2838 /*
2839 * Flushes a completely or partially full container to disk, writing
2840 * container structure, header and blocks. This also calls the encoder
2841 * functions.
2842 *
2843 * Returns 0 on success
2844 * -1 on failure
2845 */
2846 int cram_flush_container(cram_fd *fd, cram_container *c) {
2847 /* Encode the container blocks and generate compression header */
2848 if (0 != cram_encode_container(fd, c))
2849 return -1;
2850
2851 return cram_flush_container2(fd, c);
2852 }
2853
2854 typedef struct {
2855 cram_fd *fd;
2856 cram_container *c;
2857 } cram_job;
2858
2859 void *cram_flush_thread(void *arg) {
2860 cram_job *j = (cram_job *)arg;
2861
2862 /* Encode the container blocks and generate compression header */
2863 if (0 != cram_encode_container(j->fd, j->c)) {
2864 fprintf(stderr, "cram_encode_container failed\n");
2865 return NULL;
2866 }
2867
2868 return arg;
2869 }
2870
2871 static int cram_flush_result(cram_fd *fd) {
2872 int i, ret = 0;
2873 t_pool_result *r;
2874
2875 while ((r = t_pool_next_result(fd->rqueue))) {
2876 cram_job *j = (cram_job *)r->data;
2877 cram_container *c;
2878
2879 if (!j) {
2880 t_pool_delete_result(r, 0);
2881 return -1;
2882 }
2883
2884 fd = j->fd;
2885 c = j->c;
2886
2887 if (0 != cram_flush_container2(fd, c))
2888 return -1;
2889
2890 /* Free the container */
2891 for (i = 0; i < c->max_slice; i++) {
2892 cram_free_slice(c->slices[i]);
2893 c->slices[i] = NULL;
2894 }
2895
2896 c->slice = NULL;
2897 c->curr_slice = 0;
2898
2899 cram_free_container(c);
2900
2901 ret |= hflush(fd->fp) == 0 ? 0 : -1;
2902
2903 t_pool_delete_result(r, 1);
2904 }
2905
2906 return ret;
2907 }
2908
2909 int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
2910 cram_job *j;
2911
2912 if (!fd->pool)
2913 return cram_flush_container(fd, c);
2914
2915 if (!(j = malloc(sizeof(*j))))
2916 return -1;
2917 j->fd = fd;
2918 j->c = c;
2919
2920 t_pool_dispatch(fd->pool, fd->rqueue, cram_flush_thread, j);
2921
2922 return cram_flush_result(fd);
2923 }
2924
2925 /* ----------------------------------------------------------------------
2926 * Compression headers; the first part of the container
2927 */
2928
2929 /*
2930 * Creates a new blank container compression header
2931 *
2932 * Returns header ptr on success
2933 * NULL on failure
2934 */
2935 cram_block_compression_hdr *cram_new_compression_header(void) {
2936 cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
2937 if (!hdr)
2938 return NULL;
2939
2940 if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
2941 free(hdr);
2942 return NULL;
2943 }
2944
2945 if (!(hdr->TD_hash = kh_init(m_s2i))) {
2946 cram_free_block(hdr->TD_blk);
2947 free(hdr);
2948 return NULL;
2949 }
2950
2951 if (!(hdr->TD_keys = string_pool_create(8192))) {
2952 kh_destroy(m_s2i, hdr->TD_hash);
2953 cram_free_block(hdr->TD_blk);
2954 free(hdr);
2955 return NULL;
2956 }
2957
2958 return hdr;
2959 }
2960
2961 void cram_free_compression_header(cram_block_compression_hdr *hdr) {
2962 int i;
2963
2964 if (hdr->landmark)
2965 free(hdr->landmark);
2966
2967 if (hdr->preservation_map)
2968 kh_destroy(map, hdr->preservation_map);
2969
2970 for (i = 0; i < CRAM_MAP_HASH; i++) {
2971 cram_map *m, *m2;
2972 for (m = hdr->rec_encoding_map[i]; m; m = m2) {
2973 m2 = m->next;
2974 if (m->codec)
2975 m->codec->free(m->codec);
2976 free(m);
2977 }
2978 }
2979
2980 for (i = 0; i < CRAM_MAP_HASH; i++) {
2981 cram_map *m, *m2;
2982 for (m = hdr->tag_encoding_map[i]; m; m = m2) {
2983 m2 = m->next;
2984 if (m->codec)
2985 m->codec->free(m->codec);
2986 free(m);
2987 }
2988 }
2989
2990 for (i = 0; i < DS_END; i++) {
2991 if (hdr->codecs[i])
2992 hdr->codecs[i]->free(hdr->codecs[i]);
2993 }
2994
2995 if (hdr->TL)
2996 free(hdr->TL);
2997 if (hdr->TD_blk)
2998 cram_free_block(hdr->TD_blk);
2999 if (hdr->TD_hash)
3000 kh_destroy(m_s2i, hdr->TD_hash);
3001 if (hdr->TD_keys)
3002 string_pool_destroy(hdr->TD_keys);
3003
3004 free(hdr);
3005 }
3006
3007
3008 /* ----------------------------------------------------------------------
3009 * Slices and slice headers
3010 */
3011
3012 void cram_free_slice_header(cram_block_slice_hdr *hdr) {
3013 if (!hdr)
3014 return;
3015
3016 if (hdr->block_content_ids)
3017 free(hdr->block_content_ids);
3018
3019 free(hdr);
3020
3021 return;
3022 }
3023
3024 void cram_free_slice(cram_slice *s) {
3025 if (!s)
3026 return;
3027
3028 if (s->hdr_block)
3029 cram_free_block(s->hdr_block);
3030
3031 if (s->block) {
3032 int i;
3033
3034 if (s->hdr) {
3035 for (i = 0; i < s->hdr->num_blocks; i++) {
3036 cram_free_block(s->block[i]);
3037 }
3038 }
3039 free(s->block);
3040 }
3041
3042 if (s->block_by_id)
3043 free(s->block_by_id);
3044
3045 if (s->hdr)
3046 cram_free_slice_header(s->hdr);
3047
3048 if (s->seqs_blk)
3049 cram_free_block(s->seqs_blk);
3050
3051 if (s->qual_blk)
3052 cram_free_block(s->qual_blk);
3053
3054 if (s->name_blk)
3055 cram_free_block(s->name_blk);
3056
3057 if (s->aux_blk)
3058 cram_free_block(s->aux_blk);
3059
3060 if (s->aux_OQ_blk)
3061 cram_free_block(s->aux_OQ_blk);
3062
3063 if (s->aux_BQ_blk)
3064 cram_free_block(s->aux_BQ_blk);
3065
3066 if (s->aux_FZ_blk)
3067 cram_free_block(s->aux_FZ_blk);
3068
3069 if (s->aux_oq_blk)
3070 cram_free_block(s->aux_oq_blk);
3071
3072 if (s->aux_os_blk)
3073 cram_free_block(s->aux_os_blk);
3074
3075 if (s->aux_oz_blk)
3076 cram_free_block(s->aux_oz_blk);
3077
3078 if (s->base_blk)
3079 cram_free_block(s->base_blk);
3080
3081 if (s->soft_blk)
3082 cram_free_block(s->soft_blk);
3083
3084 if (s->cigar)
3085 free(s->cigar);
3086
3087 if (s->crecs)
3088 free(s->crecs);
3089
3090 if (s->features)
3091 free(s->features);
3092
3093 if (s->TN)
3094 free(s->TN);
3095
3096 if (s->pair_keys)
3097 string_pool_destroy(s->pair_keys);
3098
3099 if (s->pair[0])
3100 kh_destroy(m_s2i, s->pair[0]);
3101 if (s->pair[1])
3102 kh_destroy(m_s2i, s->pair[1]);
3103
3104 free(s);
3105 }
3106
3107 /*
3108 * Creates a new empty slice in memory, for subsequent writing to
3109 * disk.
3110 *
3111 * Returns cram_slice ptr on success
3112 * NULL on failure
3113 */
3114 cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
3115 cram_slice *s = calloc(1, sizeof(*s));
3116 if (!s)
3117 return NULL;
3118
3119 if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
3120 goto err;
3121 s->hdr->content_type = type;
3122
3123 s->hdr_block = NULL;
3124 s->block = NULL;
3125 s->block_by_id = NULL;
3126 s->last_apos = 0;
3127 if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err;
3128 s->cigar = NULL;
3129 s->cigar_alloc = 0;
3130 s->ncigar = 0;
3131
3132 if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
3133 if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err;
3134 if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err;
3135 if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err;
3136 if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err;
3137 if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err;
3138
3139 s->features = NULL;
3140 s->nfeatures = s->afeatures = 0;
3141
3142 #ifndef TN_external
3143 s->TN = NULL;
3144 s->nTN = s->aTN = 0;
3145 #endif
3146
3147 // Volatile keys as we do realloc in dstring
3148 if (!(s->pair_keys = string_pool_create(8192))) goto err;
3149 if (!(s->pair[0] = kh_init(m_s2i))) goto err;
3150 if (!(s->pair[1] = kh_init(m_s2i))) goto err;
3151
3152 #ifdef BA_external
3153 s->BA_len = 0;
3154 #endif
3155
3156 return s;
3157
3158 err:
3159 if (s)
3160 cram_free_slice(s);
3161
3162 return NULL;
3163 }
3164
3165 /*
3166 * Loads an entire slice.
3167 * FIXME: In 1.0 the native unit of slices within CRAM is broken
3168 * as slices contain references to objects in other slices.
3169 * To work around this while keeping the slice oriented outer loop
3170 * we read all slices and stitch them together into a fake large
3171 * slice instead.
3172 *
3173 * Returns cram_slice ptr on success
3174 * NULL on failure
3175 */
3176 cram_slice *cram_read_slice(cram_fd *fd) {
3177 cram_block *b = cram_read_block(fd);
3178 cram_slice *s = calloc(1, sizeof(*s));
3179 int i, n, max_id, min_id;
3180
3181 if (!b || !s)
3182 goto err;
3183
3184 s->hdr_block = b;
3185 switch (b->content_type) {
3186 case MAPPED_SLICE:
3187 case UNMAPPED_SLICE:
3188 if (!(s->hdr = cram_decode_slice_header(fd, b)))
3189 goto err;
3190 break;
3191
3192 default:
3193 fprintf(stderr, "Unexpected block of type %s\n",
3194 cram_content_type2str(b->content_type));
3195 goto err;
3196 }
3197
3198 s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
3199 if (!s->block)
3200 goto err;
3201
3202 for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
3203 if (!(s->block[i] = cram_read_block(fd)))
3204 goto err;
3205
3206 if (s->block[i]->content_type == EXTERNAL) {
3207 if (max_id < s->block[i]->content_id)
3208 max_id = s->block[i]->content_id;
3209 if (min_id > s->block[i]->content_id)
3210 min_id = s->block[i]->content_id;
3211 }
3212 }
3213 if (min_id >= 0 && max_id < 1024) {
3214 if (!(s->block_by_id = calloc(1024, sizeof(s->block[0]))))
3215 goto err;
3216
3217 for (i = 0; i < n; i++) {
3218 if (s->block[i]->content_type != EXTERNAL)
3219 continue;
3220 s->block_by_id[s->block[i]->content_id] = s->block[i];
3221 }
3222 }
3223
3224 /* Initialise encoding/decoding tables */
3225 s->cigar = NULL;
3226 s->cigar_alloc = 0;
3227 s->ncigar = 0;
3228
3229 if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
3230 if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err;
3231 if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err;
3232 if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err;
3233 if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err;
3234 if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err;
3235
3236 s->crecs = NULL;
3237
3238 s->last_apos = s->hdr->ref_seq_start;
3239
3240 return s;
3241
3242 err:
3243 if (b)
3244 cram_free_block(b);
3245 if (s) {
3246 s->hdr_block = NULL;
3247 cram_free_slice(s);
3248 }
3249 return NULL;
3250 }
3251
3252
3253 /* ----------------------------------------------------------------------
3254 * CRAM file definition (header)
3255 */
3256
3257 /*
3258 * Reads a CRAM file definition structure.
3259 * Returns file_def ptr on success
3260 * NULL on failure
3261 */
3262 cram_file_def *cram_read_file_def(cram_fd *fd) {
3263 cram_file_def *def = malloc(sizeof(*def));
3264 if (!def)
3265 return NULL;
3266
3267 if (26 != hread(fd->fp, &def->magic[0], 26)) {
3268 free(def);
3269 return NULL;
3270 }
3271
3272 if (memcmp(def->magic, "CRAM", 4) != 0) {
3273 free(def);
3274 return NULL;
3275 }
3276
3277 if (def->major_version > 3) {
3278 fprintf(stderr, "CRAM version number mismatch\n"
3279 "Expected 1.x, 2.x or 3.x, got %d.%d\n",
3280 def->major_version, def->minor_version);
3281 free(def);
3282 return NULL;
3283 }
3284
3285 fd->first_container += 26;
3286 fd->last_slice = 0;
3287
3288 return def;
3289 }
3290
3291 /*
3292 * Writes a cram_file_def structure to cram_fd.
3293 * Returns 0 on success
3294 * -1 on failure
3295 */
3296 int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
3297 return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
3298 }
3299
3300 void cram_free_file_def(cram_file_def *def) {
3301 if (def) free(def);
3302 }
3303
3304 /* ----------------------------------------------------------------------
3305 * SAM header I/O
3306 */
3307
3308
3309 /*
3310 * Reads the SAM header from the first CRAM data block.
3311 * Also performs minimal parsing to extract read-group
3312 * and sample information.
3313
3314 * Returns SAM hdr ptr on success
3315 * NULL on failure
3316 */
3317 SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) {
3318 int32_t header_len;
3319 char *header;
3320 SAM_hdr *hdr;
3321
3322 /* 1.1 onwards stores the header in the first block of a container */
3323 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3324 /* Length */
3325 if (-1 == int32_decode(fd, &header_len))
3326 return NULL;
3327
3328 /* Alloc and read */
3329 if (NULL == (header = malloc(header_len+1)))
3330 return NULL;
3331
3332 *header = 0;
3333 if (header_len != hread(fd->fp, header, header_len))
3334 return NULL;
3335
3336 fd->first_container += 4 + header_len;
3337 } else {
3338 cram_container *c = cram_read_container(fd);
3339 cram_block *b;
3340 int i, len;
3341
3342 if (!c)
3343 return NULL;
3344
3345 if (c->num_blocks < 1) {
3346 cram_free_container(c);
3347 return NULL;
3348 }
3349
3350 if (!(b = cram_read_block(fd))) {
3351 cram_free_container(c);
3352 return NULL;
3353 }
3354 cram_uncompress_block(b);
3355
3356 len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
3357 itf8_size(b->content_id) +
3358 itf8_size(b->uncomp_size) +
3359 itf8_size(b->comp_size);
3360
3361 /* Extract header from 1st block */
3362 if (-1 == int32_get(b, &header_len) ||
3363 b->uncomp_size - 4 < header_len) {
3364 cram_free_container(c);
3365 cram_free_block(b);
3366 return NULL;
3367 }
3368 if (NULL == (header = malloc(header_len+1))) {
3369 cram_free_container(c);
3370 cram_free_block(b);
3371 return NULL;
3372 }
3373 memcpy(header, BLOCK_END(b), header_len);
3374 header[header_len]='\0';
3375 cram_free_block(b);
3376
3377 /* Consume any remaining blocks */
3378 for (i = 1; i < c->num_blocks; i++) {
3379 if (!(b = cram_read_block(fd))) {
3380 cram_free_container(c);
3381 return NULL;
3382 }
3383 len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
3384 itf8_size(b->content_id) +
3385 itf8_size(b->uncomp_size) +
3386 itf8_size(b->comp_size);
3387 cram_free_block(b);
3388 }
3389
3390 if (c->length && c->length > len) {
3391 // Consume padding
3392 char *pads = malloc(c->length - len);
3393 if (!pads) {
3394 cram_free_container(c);
3395 return NULL;
3396 }
3397
3398 if (c->length - len != hread(fd->fp, pads, c->length - len)) {
3399 cram_free_container(c);
3400 return NULL;
3401 }
3402 free(pads);
3403 }
3404
3405 cram_free_container(c);
3406 }
3407
3408 /* Parse */
3409 hdr = sam_hdr_parse_(header, header_len);
3410 free(header);
3411
3412 return hdr;
3413 }
3414
3415 /*
3416 * Converts 'in' to a full pathname to store in out.
3417 * Out must be at least PATH_MAX bytes long.
3418 */
3419 static void full_path(char *out, char *in) {
3420 if (*in == '/') {
3421 strncpy(out, in, PATH_MAX);
3422 out[PATH_MAX-1] = 0;
3423 } else {
3424 int len;
3425
3426 // unable to get dir or out+in is too long
3427 if (!getcwd(out, PATH_MAX) ||
3428 (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
3429 strncpy(out, in, PATH_MAX);
3430 out[PATH_MAX-1] = 0;
3431 return;
3432 }
3433
3434 sprintf(out+len, "/%.*s", PATH_MAX - len, in);
3435
3436 // FIXME: cope with `pwd`/../../../foo.fa ?
3437 }
3438 }
3439
3440 /*
3441 * Writes a CRAM SAM header.
3442 * Returns 0 on success
3443 * -1 on failure
3444 */
3445 int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) {
3446 int header_len;
3447 int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
3448
3449 /* Write CRAM MAGIC if not yet written. */
3450 if (fd->file_def->major_version == 0) {
3451 fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
3452 fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
3453 if (0 != cram_write_file_def(fd, fd->file_def))
3454 return -1;
3455 }
3456
3457 /* 1.0 requires and UNKNOWN read-group */
3458 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3459 if (!sam_hdr_find_rg(hdr, "UNKNOWN"))
3460 if (sam_hdr_add(hdr, "RG",
3461 "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
3462 return -1;
3463 }
3464
3465 /* Fix M5 strings */
3466 if (fd->refs && !fd->no_ref) {
3467 int i;
3468 for (i = 0; i < hdr->nref; i++) {
3469 SAM_hdr_type *ty;
3470 char *ref;
3471
3472 if (!(ty = sam_hdr_find(hdr, "SQ", "SN", hdr->ref[i].name)))
3473 return -1;
3474
3475 if (!sam_hdr_find_key(hdr, ty, "M5", NULL)) {
3476 char unsigned buf[16], buf2[33];
3477 int j, rlen;
3478 MD5_CTX md5;
3479
3480 if (!fd->refs ||
3481 !fd->refs->ref_id ||
3482 !fd->refs->ref_id[i]) {
3483 return -1;
3484 }
3485 rlen = fd->refs->ref_id[i]->length;
3486 MD5_Init(&md5);
3487 ref = cram_get_ref(fd, i, 1, rlen);
3488 if (NULL == ref) return -1;
3489 rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
3490 MD5_Update(&md5, ref, rlen);
3491 MD5_Final(buf, &md5);
3492 cram_ref_decr(fd->refs, i);
3493
3494 for (j = 0; j < 16; j++) {
3495 buf2[j*2+0] = "0123456789abcdef"[buf[j]>>4];
3496 buf2[j*2+1] = "0123456789abcdef"[buf[j]&15];
3497 }
3498 buf2[32] = 0;
3499 if (sam_hdr_update(hdr, ty, "M5", buf2, NULL))
3500 return -1;
3501 }
3502
3503 if (fd->ref_fn) {
3504 char ref_fn[PATH_MAX];
3505 full_path(ref_fn, fd->ref_fn);
3506 if (sam_hdr_update(hdr, ty, "UR", ref_fn, NULL))
3507 return -1;
3508 }
3509 }
3510 }
3511
3512 if (sam_hdr_rebuild(hdr))
3513 return -1;
3514
3515 /* Length */
3516 header_len = sam_hdr_length(hdr);
3517 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3518 if (-1 == int32_encode(fd, header_len))
3519 return -1;
3520
3521 /* Text data */
3522 if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
3523 return -1;
3524 } else {
3525 /* Create block(s) inside a container */
3526 cram_block *b = cram_new_block(FILE_HEADER, 0);
3527 cram_container *c = cram_new_container(0, 0);
3528 int padded_length;
3529 char *pads;
3530 int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
3531
3532 if (!b || !c) {
3533 if (b) cram_free_block(b);
3534 if (c) cram_free_container(c);
3535 return -1;
3536 }
3537
3538 int32_put(b, header_len);
3539 BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
3540 BLOCK_UPLEN(b);
3541
3542 // Compress header block if V3.0 and above
3543 if (CRAM_MAJOR_VERS(fd->version) >= 3 && fd->level > 0) {
3544 int method = 1<<GZIP;
3545 if (fd->use_bz2)
3546 method |= 1<<BZIP2;
3547 if (fd->use_lzma)
3548 method |= 1<<LZMA;
3549 cram_compress_block(fd, b, NULL, method, fd->level);
3550 }
3551
3552 if (blank_block) {
3553 c->length = b->comp_size + 2 + 4*is_cram_3 +
3554 itf8_size(b->content_id) +
3555 itf8_size(b->uncomp_size) +
3556 itf8_size(b->comp_size);
3557
3558 c->num_blocks = 2;
3559 c->num_landmarks = 2;
3560 if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
3561 cram_free_block(b);
3562 cram_free_container(c);
3563 return -1;
3564 }
3565 c->landmark[0] = 0;
3566 c->landmark[1] = c->length;
3567
3568 // Plus extra storage for uncompressed secondary blank block
3569 padded_length = MIN(c->length*.5, 10000);
3570 c->length += padded_length + 2 + 4*is_cram_3 +
3571 itf8_size(b->content_id) +
3572 itf8_size(padded_length)*2;
3573 } else {
3574 // Pad the block instead.
3575 c->num_blocks = 1;
3576 c->num_landmarks = 1;
3577 if (!(c->landmark = malloc(sizeof(*c->landmark))))
3578 return -1;
3579 c->landmark[0] = 0;
3580
3581 padded_length = MAX(c->length*1.5, 10000) - c->length;
3582
3583 c->length = b->comp_size + padded_length +
3584 2 + 4*is_cram_3 +
3585 itf8_size(b->content_id) +
3586 itf8_size(b->uncomp_size) +
3587 itf8_size(b->comp_size);
3588
3589 if (NULL == (pads = calloc(1, padded_length))) {
3590 cram_free_block(b);
3591 cram_free_container(c);
3592 return -1;
3593 }
3594 BLOCK_APPEND(b, pads, padded_length);
3595 BLOCK_UPLEN(b);
3596 free(pads);
3597 }
3598
3599 if (-1 == cram_write_container(fd, c)) {
3600 cram_free_block(b);
3601 cram_free_container(c);
3602 return -1;
3603 }
3604
3605 if (-1 == cram_write_block(fd, b)) {
3606 cram_free_block(b);
3607 cram_free_container(c);
3608 return -1;
3609 }
3610
3611 if (blank_block) {
3612 BLOCK_RESIZE(b, padded_length);
3613 memset(BLOCK_DATA(b), 0, padded_length);
3614 BLOCK_SIZE(b) = padded_length;
3615 BLOCK_UPLEN(b);
3616 b->method = RAW;
3617 if (-1 == cram_write_block(fd, b)) {
3618 cram_free_block(b);
3619 cram_free_container(c);
3620 return -1;
3621 }
3622 }
3623
3624 cram_free_block(b);
3625 cram_free_container(c);
3626 }
3627
3628 if (-1 == refs_from_header(fd->refs, fd, fd->header))
3629 return -1;
3630 if (-1 == refs2id(fd->refs, fd->header))
3631 return -1;
3632
3633 if (0 != hflush(fd->fp))
3634 return -1;
3635
3636 RP("=== Finishing saving header ===\n");
3637
3638 return 0;
3639 }
3640
3641 /* ----------------------------------------------------------------------
3642 * The top-level cram opening, closing and option handling
3643 */
3644
3645 /*
3646 * Initialises the lookup tables. These could be global statics, but they're
3647 * clumsy to setup in a multi-threaded environment unless we generate
3648 * verbatim code and include that.
3649 */
3650 static void cram_init_tables(cram_fd *fd) {
3651 int i;
3652
3653 memset(fd->L1, 4, 256);
3654 fd->L1['A'] = 0; fd->L1['a'] = 0;
3655 fd->L1['C'] = 1; fd->L1['c'] = 1;
3656 fd->L1['G'] = 2; fd->L1['g'] = 2;
3657 fd->L1['T'] = 3; fd->L1['t'] = 3;
3658
3659 memset(fd->L2, 5, 256);
3660 fd->L2['A'] = 0; fd->L2['a'] = 0;
3661 fd->L2['C'] = 1; fd->L2['c'] = 1;
3662 fd->L2['G'] = 2; fd->L2['g'] = 2;
3663 fd->L2['T'] = 3; fd->L2['t'] = 3;
3664 fd->L2['N'] = 4; fd->L2['n'] = 4;
3665
3666 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3667 for (i = 0; i < 0x200; i++) {
3668 int f = 0;
3669
3670 if (i & CRAM_FPAIRED) f |= BAM_FPAIRED;
3671 if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
3672 if (i & CRAM_FUNMAP) f |= BAM_FUNMAP;
3673 if (i & CRAM_FREVERSE) f |= BAM_FREVERSE;
3674 if (i & CRAM_FREAD1) f |= BAM_FREAD1;
3675 if (i & CRAM_FREAD2) f |= BAM_FREAD2;
3676 if (i & CRAM_FSECONDARY) f |= BAM_FSECONDARY;
3677 if (i & CRAM_FQCFAIL) f |= BAM_FQCFAIL;
3678 if (i & CRAM_FDUP) f |= BAM_FDUP;
3679
3680 fd->bam_flag_swap[i] = f;
3681 }
3682
3683 for (i = 0; i < 0x1000; i++) {
3684 int g = 0;
3685
3686 if (i & BAM_FPAIRED) g |= CRAM_FPAIRED;
3687 if (i & BAM_FPROPER_PAIR) g |= CRAM_FPROPER_PAIR;
3688 if (i & BAM_FUNMAP) g |= CRAM_FUNMAP;
3689 if (i & BAM_FREVERSE) g |= CRAM_FREVERSE;
3690 if (i & BAM_FREAD1) g |= CRAM_FREAD1;
3691 if (i & BAM_FREAD2) g |= CRAM_FREAD2;
3692 if (i & BAM_FSECONDARY) g |= CRAM_FSECONDARY;
3693 if (i & BAM_FQCFAIL) g |= CRAM_FQCFAIL;
3694 if (i & BAM_FDUP) g |= CRAM_FDUP;
3695
3696 fd->cram_flag_swap[i] = g;
3697 }
3698 } else {
3699 /* NOP */
3700 for (i = 0; i < 0x1000; i++)
3701 fd->bam_flag_swap[i] = i;
3702 for (i = 0; i < 0x1000; i++)
3703 fd->cram_flag_swap[i] = i;
3704 }
3705
3706 memset(fd->cram_sub_matrix, 4, 32*32);
3707 for (i = 0; i < 32; i++) {
3708 fd->cram_sub_matrix[i]['A'&0x1f]=0;
3709 fd->cram_sub_matrix[i]['C'&0x1f]=1;
3710 fd->cram_sub_matrix[i]['G'&0x1f]=2;
3711 fd->cram_sub_matrix[i]['T'&0x1f]=3;
3712 fd->cram_sub_matrix[i]['N'&0x1f]=4;
3713 }
3714 for (i = 0; i < 20; i+=4) {
3715 int j;
3716 for (j = 0; j < 20; j++) {
3717 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3718 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3719 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3720 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3721 }
3722 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
3723 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
3724 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
3725 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
3726 }
3727 }
3728
3729 // Default version numbers for CRAM
3730 static int major_version = 2;
3731 static int minor_version = 1;
3732
3733 /*
3734 * Opens a CRAM file for read (mode "rb") or write ("wb").
3735 * The filename may be "-" to indicate stdin or stdout.
3736 *
3737 * Returns file handle on success
3738 * NULL on failure.
3739 */
3740 cram_fd *cram_open(const char *filename, const char *mode) {
3741 hFILE *fp;
3742 cram_fd *fd;
3743 char fmode[3]= { mode[0], '\0', '\0' };
3744
3745 if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
3746 fmode[1] = 'b';
3747 }
3748
3749 fp = hopen(filename, fmode);
3750 if (!fp)
3751 return NULL;
3752
3753 fd = cram_dopen(fp, filename, mode);
3754 if (!fd)
3755 hclose_abruptly(fp);
3756
3757 return fd;
3758 }
3759
3760 /* Opens an existing stream for reading or writing.
3761 *
3762 * Returns file handle on success;
3763 * NULL on failure.
3764 */
3765 cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
3766 int i;
3767 char *cp;
3768 cram_fd *fd = calloc(1, sizeof(*fd));
3769 if (!fd)
3770 return NULL;
3771
3772 fd->level = 5;
3773 for (i = 0; mode[i]; i++) {
3774 if (mode[i] >= '0' && mode[i] <= '9') {
3775 fd->level = mode[i] - '0';
3776 break;
3777 }
3778 }
3779
3780 fd->fp = fp;
3781 fd->mode = *mode;
3782 fd->first_container = 0;
3783
3784 if (fd->mode == 'r') {
3785 /* Reader */
3786
3787 if (!(fd->file_def = cram_read_file_def(fd)))
3788 goto err;
3789
3790 fd->version = fd->file_def->major_version * 256 +
3791 fd->file_def->minor_version;
3792
3793 if (!(fd->header = cram_read_SAM_hdr(fd)))
3794 goto err;
3795
3796 } else {
3797 /* Writer */
3798 cram_file_def *def = calloc(1, sizeof(*def));
3799 if (!def)
3800 return NULL;
3801
3802 fd->file_def = def;
3803
3804 def->magic[0] = 'C';
3805 def->magic[1] = 'R';
3806 def->magic[2] = 'A';
3807 def->magic[3] = 'M';
3808 def->major_version = 0; // Indicator to write file def later.
3809 def->minor_version = 0;
3810 memset(def->file_id, 0, 20);
3811 strncpy(def->file_id, filename, 20);
3812
3813 fd->version = major_version * 256 + minor_version;
3814
3815 /* SAM header written later along with this file_def */
3816 }
3817
3818 cram_init_tables(fd);
3819
3820 fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
3821 if (!fd->prefix)
3822 goto err;
3823 fd->first_base = fd->last_base = -1;
3824 fd->record_counter = 0;
3825
3826 fd->ctr = NULL;
3827 fd->refs = refs_create();
3828 if (!fd->refs)
3829 goto err;
3830 fd->ref_id = -2;
3831 fd->ref = NULL;
3832
3833 fd->decode_md = 0;
3834 fd->verbose = 0;
3835 fd->seqs_per_slice = SEQS_PER_SLICE;
3836 fd->slices_per_container = SLICE_PER_CNT;
3837 fd->embed_ref = 0;
3838 fd->no_ref = 0;
3839 fd->ignore_md5 = 0;
3840 fd->use_bz2 = 0;
3841 fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
3842 fd->use_lzma = 0;
3843 fd->multi_seq = -1;
3844 fd->unsorted = 0;
3845 fd->shared_ref = 0;
3846
3847 fd->index = NULL;
3848 fd->own_pool = 0;
3849 fd->pool = NULL;
3850 fd->rqueue = NULL;
3851 fd->job_pending = NULL;
3852 fd->ooc = 0;
3853 fd->required_fields = INT_MAX;
3854
3855 for (i = 0; i < DS_END; i++)
3856 fd->m[i] = cram_new_metrics();
3857
3858 fd->range.refid = -2; // no ref.
3859 fd->eof = 1; // See samtools issue #150
3860 fd->ref_fn = NULL;
3861
3862 fd->bl = NULL;
3863
3864 /* Initialise dummy refs from the @SQ headers */
3865 if (-1 == refs_from_header(fd->refs, fd, fd->header))
3866 goto err;
3867
3868 return fd;
3869
3870 err:
3871 if (fd)
3872 free(fd);
3873
3874 return NULL;
3875 }
3876
3877 /*
3878 * Seek within a CRAM file.
3879 *
3880 * Returns 0 on success
3881 * -1 on failure
3882 */
3883 int cram_seek(cram_fd *fd, off_t offset, int whence) {
3884 char buf[65536];
3885
3886 fd->ooc = 0;
3887
3888 if (hseek(fd->fp, offset, whence) >= 0)
3889 return 0;
3890
3891 if (!(whence == SEEK_CUR && offset >= 0))
3892 return -1;
3893
3894 /* Couldn't fseek, but we're in SEEK_CUR mode so read instead */
3895 while (offset > 0) {
3896 int len = MIN(65536, offset);
3897 if (len != hread(fd->fp, buf, len))
3898 return -1;
3899 offset -= len;
3900 }
3901
3902 return 0;
3903 }
3904
3905 /*
3906 * Flushes a CRAM file.
3907 * Useful for when writing to stdout without wishing to close the stream.
3908 *
3909 * Returns 0 on success
3910 * -1 on failure
3911 */
3912 int cram_flush(cram_fd *fd) {
3913 if (!fd)
3914 return -1;
3915
3916 if (fd->mode == 'w' && fd->ctr) {
3917 if(fd->ctr->slice)
3918 fd->ctr->curr_slice++;
3919 if (-1 == cram_flush_container_mt(fd, fd->ctr))
3920 return -1;
3921 }
3922
3923 return 0;
3924 }
3925
3926 /*
3927 * Closes a CRAM file.
3928 * Returns 0 on success
3929 * -1 on failure
3930 */
3931 int cram_close(cram_fd *fd) {
3932 spare_bams *bl, *next;
3933 int i;
3934
3935 if (!fd)
3936 return -1;
3937
3938 if (fd->mode == 'w' && fd->ctr) {
3939 if(fd->ctr->slice)
3940 fd->ctr->curr_slice++;
3941 if (-1 == cram_flush_container_mt(fd, fd->ctr))
3942 return -1;
3943 }
3944
3945 if (fd->pool) {
3946 t_pool_flush(fd->pool);
3947
3948 if (0 != cram_flush_result(fd))
3949 return -1;
3950
3951 pthread_mutex_destroy(&fd->metrics_lock);
3952 pthread_mutex_destroy(&fd->ref_lock);
3953 pthread_mutex_destroy(&fd->bam_list_lock);
3954
3955 fd->ctr = NULL; // prevent double freeing
3956
3957 //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
3958
3959 t_results_queue_destroy(fd->rqueue);
3960 }
3961
3962 if (fd->mode == 'w') {
3963 /* Write EOF block */
3964 if (CRAM_MAJOR_VERS(fd->version) == 3) {
3965 if (38 != hwrite(fd->fp,
3966 "\x0f\x00\x00\x00\xff\xff\xff\xff" // Cont HDR
3967 "\x0f\xe0\x45\x4f\x46\x00\x00\x00" // Cont HDR
3968 "\x00\x01\x00" // Cont HDR
3969 "\x05\xbd\xd9\x4f" // CRC32
3970 "\x00\x01\x00\x06\x06" // Comp.HDR blk
3971 "\x01\x00\x01\x00\x01\x00" // Comp.HDR blk
3972 "\xee\x63\x01\x4b", // CRC32
3973 38))
3974 return -1;
3975 } else {
3976 if (30 != hwrite(fd->fp,
3977 "\x0b\x00\x00\x00\xff\xff\xff\xff"
3978 "\x0f\xe0\x45\x4f\x46\x00\x00\x00"
3979 "\x00\x01\x00\x00\x01\x00\x06\x06"
3980 "\x01\x00\x01\x00\x01\x00", 30))
3981 return -1;
3982 }
3983 }
3984
3985 for (bl = fd->bl; bl; bl = next) {
3986 int i, max_rec = fd->seqs_per_slice * fd->slices_per_container;
3987
3988 next = bl->next;
3989 for (i = 0; i < max_rec; i++) {
3990 if (bl->bams[i])
3991 bam_free(bl->bams[i]);
3992 }
3993 free(bl->bams);
3994 free(bl);
3995 }
3996
3997 if (hclose(fd->fp) != 0)
3998 return -1;
3999
4000 if (fd->file_def)
4001 cram_free_file_def(fd->file_def);
4002
4003 if (fd->header)
4004 sam_hdr_free(fd->header);
4005
4006 free(fd->prefix);
4007
4008 if (fd->ctr)
4009 cram_free_container(fd->ctr);
4010
4011 if (fd->refs)
4012 refs_free(fd->refs);
4013 if (fd->ref_free)
4014 free(fd->ref_free);
4015
4016 for (i = 0; i < DS_END; i++)
4017 if (fd->m[i])
4018 free(fd->m[i]);
4019
4020 if (fd->index)
4021 cram_index_free(fd);
4022
4023 if (fd->own_pool && fd->pool)
4024 t_pool_destroy(fd->pool, 0);
4025
4026 free(fd);
4027 return 0;
4028 }
4029
4030 /*
4031 * Returns 1 if we hit an EOF while reading.
4032 */
4033 int cram_eof(cram_fd *fd) {
4034 return fd->eof;
4035 }
4036
4037
4038 /*
4039 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
4040 * Use this immediately after opening.
4041 *
4042 * Returns 0 on success
4043 * -1 on failure
4044 */
4045 int cram_set_option(cram_fd *fd, enum cram_option opt, ...) {
4046 int r;
4047 va_list args;
4048
4049 va_start(args, opt);
4050 r = cram_set_voption(fd, opt, args);
4051 va_end(args);
4052
4053 return r;
4054 }
4055
4056 /*
4057 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
4058 * Use this immediately after opening.
4059 *
4060 * Returns 0 on success
4061 * -1 on failure
4062 */
4063 int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args) {
4064 refs_t *refs;
4065
4066 if (!fd)
4067 return -1;
4068
4069 switch (opt) {
4070 case CRAM_OPT_DECODE_MD:
4071 fd->decode_md = va_arg(args, int);
4072 break;
4073
4074 case CRAM_OPT_PREFIX:
4075 if (fd->prefix)
4076 free(fd->prefix);
4077 if (!(fd->prefix = strdup(va_arg(args, char *))))
4078 return -1;
4079 break;
4080
4081 case CRAM_OPT_VERBOSITY:
4082 fd->verbose = va_arg(args, int);
4083 break;
4084
4085 case CRAM_OPT_SEQS_PER_SLICE:
4086 fd->seqs_per_slice = va_arg(args, int);
4087 break;
4088
4089 case CRAM_OPT_SLICES_PER_CONTAINER:
4090 fd->slices_per_container = va_arg(args, int);
4091 break;
4092
4093 case CRAM_OPT_EMBED_REF:
4094 fd->embed_ref = va_arg(args, int);
4095 break;
4096
4097 case CRAM_OPT_NO_REF:
4098 fd->no_ref = va_arg(args, int);
4099 break;
4100
4101 case CRAM_OPT_IGNORE_MD5:
4102 fd->ignore_md5 = va_arg(args, int);
4103 break;
4104
4105 case CRAM_OPT_USE_BZIP2:
4106 fd->use_bz2 = va_arg(args, int);
4107 break;
4108
4109 case CRAM_OPT_USE_RANS:
4110 fd->use_rans = va_arg(args, int);
4111 break;
4112
4113 case CRAM_OPT_USE_LZMA:
4114 fd->use_lzma = va_arg(args, int);
4115 break;
4116
4117 case CRAM_OPT_SHARED_REF:
4118 fd->shared_ref = 1;
4119 refs = va_arg(args, refs_t *);
4120 if (refs != fd->refs) {
4121 if (fd->refs)
4122 refs_free(fd->refs);
4123 fd->refs = refs;
4124 fd->refs->count++;
4125 }
4126 break;
4127
4128 case CRAM_OPT_RANGE:
4129 fd->range = *va_arg(args, cram_range *);
4130 return cram_seek_to_refpos(fd, &fd->range);
4131
4132 case CRAM_OPT_REFERENCE:
4133 return cram_load_reference(fd, va_arg(args, char *));
4134
4135 case CRAM_OPT_VERSION: {
4136 int major, minor;
4137 char *s = va_arg(args, char *);
4138 if (2 != sscanf(s, "%d.%d", &major, &minor)) {
4139 fprintf(stderr, "Malformed version string %s\n", s);
4140 return -1;
4141 }
4142 if (!((major == 1 && minor == 0) ||
4143 (major == 2 && (minor == 0 || minor == 1)) ||
4144 (major == 3 && minor == 0))) {
4145 fprintf(stderr, "Unknown version string; "
4146 "use 1.0, 2.0, 2.1 or 3.0\n");
4147 return -1;
4148 }
4149 fd->version = major*256 + minor;
4150
4151 if (CRAM_MAJOR_VERS(fd->version) >= 3)
4152 fd->use_rans = 1;
4153 break;
4154 }
4155
4156 case CRAM_OPT_MULTI_SEQ_PER_SLICE:
4157 fd->multi_seq = va_arg(args, int);
4158 break;
4159
4160 case CRAM_OPT_NTHREADS: {
4161 int nthreads = va_arg(args, int);
4162 if (nthreads > 1) {
4163 if (!(fd->pool = t_pool_init(nthreads*2, nthreads)))
4164 return -1;
4165
4166 fd->rqueue = t_results_queue_init();
4167 pthread_mutex_init(&fd->metrics_lock, NULL);
4168 pthread_mutex_init(&fd->ref_lock, NULL);
4169 pthread_mutex_init(&fd->bam_list_lock, NULL);
4170 fd->shared_ref = 1;
4171 fd->own_pool = 1;
4172 }
4173 break;
4174 }
4175
4176 case CRAM_OPT_THREAD_POOL:
4177 fd->pool = va_arg(args, t_pool *);
4178 if (fd->pool) {
4179 fd->rqueue = t_results_queue_init();
4180 pthread_mutex_init(&fd->metrics_lock, NULL);
4181 pthread_mutex_init(&fd->ref_lock, NULL);
4182 pthread_mutex_init(&fd->bam_list_lock, NULL);
4183 }
4184 fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
4185 fd->own_pool = 0;
4186
4187 //fd->qsize = 1;
4188 //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
4189 //t_pool_dispatch(fd->pool, cram_decoder_thread, fd);
4190 break;
4191
4192 case CRAM_OPT_REQUIRED_FIELDS:
4193 fd->required_fields = va_arg(args, int);
4194 break;
4195
4196 default:
4197 fprintf(stderr, "Unknown CRAM option code %d\n", opt);
4198 return -1;
4199 }
4200
4201 return 0;
4202 }