Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/cram/cram_io.c @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfa3745e5fd8 |
---|---|
1 /* | |
2 Copyright (c) 2012-2014 Genome Research Ltd. | |
3 Author: James Bonfield <jkb@sanger.ac.uk> | |
4 | |
5 Redistribution and use in source and binary forms, with or without | |
6 modification, are permitted provided that the following conditions are met: | |
7 | |
8 1. Redistributions of source code must retain the above copyright notice, | |
9 this list of conditions and the following disclaimer. | |
10 | |
11 2. Redistributions in binary form must reproduce the above copyright notice, | |
12 this list of conditions and the following disclaimer in the documentation | |
13 and/or other materials provided with the distribution. | |
14 | |
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger | |
16 Institute nor the names of its contributors may be used to endorse or promote | |
17 products derived from this software without specific prior written permission. | |
18 | |
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND | |
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE | |
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 */ | |
30 | |
31 /* | |
32 * CRAM I/O primitives. | |
33 * | |
34 * - ITF8 encoding and decoding. | |
35 * - Block based I/O | |
36 * - Zlib inflating and deflating (memory) | |
37 * - CRAM basic data structure reading and writing | |
38 * - File opening / closing | |
39 * - Reference sequence handling | |
40 */ | |
41 | |
42 /* | |
43 * TODO: BLOCK_GROW, BLOCK_RESIZE, BLOCK_APPEND and itf8_put_blk all need | |
44 * a way to return errors for when malloc fails. | |
45 */ | |
46 | |
47 #ifdef HAVE_CONFIG_H | |
48 #include "io_lib_config.h" | |
49 #endif | |
50 | |
51 #include <stdio.h> | |
52 #include <errno.h> | |
53 #include <assert.h> | |
54 #include <stdlib.h> | |
55 #include <string.h> | |
56 #include <zlib.h> | |
57 #ifdef HAVE_LIBBZ2 | |
58 #include <bzlib.h> | |
59 #endif | |
60 #ifdef HAVE_LIBLZMA | |
61 #include <lzma.h> | |
62 #endif | |
63 #include <sys/types.h> | |
64 #include <sys/stat.h> | |
65 #include <math.h> | |
66 #include <ctype.h> | |
67 | |
68 #include "cram/cram.h" | |
69 #include "cram/os.h" | |
70 #include "cram/md5.h" | |
71 #include "cram/open_trace_file.h" | |
72 #include "cram/rANS_static.h" | |
73 | |
74 //#define REF_DEBUG | |
75 | |
76 #ifdef REF_DEBUG | |
77 #include <sys/syscall.h> | |
78 #define gettid() (int)syscall(SYS_gettid) | |
79 | |
80 #define RP(...) fprintf (stderr, __VA_ARGS__) | |
81 #else | |
82 #define RP(...) | |
83 #endif | |
84 | |
85 #include "htslib/hfile.h" | |
86 #include "htslib/bgzf.h" | |
87 #include "htslib/faidx.h" | |
88 | |
89 #define TRIAL_SPAN 50 | |
90 #define NTRIALS 3 | |
91 | |
92 | |
93 /* ---------------------------------------------------------------------- | |
94 * ITF8 encoding and decoding. | |
95 * | |
96 * Also see the itf8_get and itf8_put macros in cram_io.h | |
97 */ | |
98 | |
99 /* | |
100 * Reads an integer in ITF-8 encoding from 'cp' and stores it in | |
101 * *val. | |
102 * | |
103 * Returns the number of bytes read on success | |
104 * -1 on failure | |
105 */ | |
106 int itf8_decode(cram_fd *fd, int32_t *val_p) { | |
107 static int nbytes[16] = { | |
108 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx | |
109 1,1,1,1, // 1000xxxx - 1011xxxx | |
110 2,2, // 1100xxxx - 1101xxxx | |
111 3, // 1110xxxx | |
112 4, // 1111xxxx | |
113 }; | |
114 | |
115 static int nbits[16] = { | |
116 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx | |
117 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx | |
118 0x1f, 0x1f, // 1100xxxx - 1101xxxx | |
119 0x0f, // 1110xxxx | |
120 0x0f, // 1111xxxx | |
121 }; | |
122 | |
123 int32_t val = hgetc(fd->fp); | |
124 if (val == -1) | |
125 return -1; | |
126 | |
127 int i = nbytes[val>>4]; | |
128 val &= nbits[val>>4]; | |
129 | |
130 switch(i) { | |
131 case 0: | |
132 *val_p = val; | |
133 return 1; | |
134 | |
135 case 1: | |
136 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
137 *val_p = val; | |
138 return 2; | |
139 | |
140 case 2: | |
141 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
142 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
143 *val_p = val; | |
144 return 3; | |
145 | |
146 case 3: | |
147 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
148 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
149 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
150 *val_p = val; | |
151 return 4; | |
152 | |
153 case 4: // really 3.5 more, why make it different? | |
154 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
155 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
156 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
157 val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f); | |
158 *val_p = val; | |
159 } | |
160 | |
161 return 5; | |
162 } | |
163 | |
164 /* | |
165 * Encodes and writes a single integer in ITF-8 format. | |
166 * Returns 0 on success | |
167 * -1 on failure | |
168 */ | |
169 int itf8_encode(cram_fd *fd, int32_t val) { | |
170 char buf[5]; | |
171 int len = itf8_put(buf, val); | |
172 return hwrite(fd->fp, buf, len) == len ? 0 : -1; | |
173 } | |
174 | |
175 #ifndef ITF8_MACROS | |
176 /* | |
177 * As above, but decoding from memory | |
178 */ | |
179 int itf8_get(char *cp, int32_t *val_p) { | |
180 unsigned char *up = (unsigned char *)cp; | |
181 | |
182 if (up[0] < 0x80) { | |
183 *val_p = up[0]; | |
184 return 1; | |
185 } else if (up[0] < 0xc0) { | |
186 *val_p = ((up[0] <<8) | up[1]) & 0x3fff; | |
187 return 2; | |
188 } else if (up[0] < 0xe0) { | |
189 *val_p = ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff; | |
190 return 3; | |
191 } else if (up[0] < 0xf0) { | |
192 *val_p = ((up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff; | |
193 return 4; | |
194 } else { | |
195 *val_p = ((up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f); | |
196 return 5; | |
197 } | |
198 } | |
199 | |
200 /* | |
201 * Stores a value to memory in ITF-8 format. | |
202 * | |
203 * Returns the number of bytes required to store the number. | |
204 * This is a maximum of 5 bytes. | |
205 */ | |
206 int itf8_put(char *cp, int32_t val) { | |
207 if (!(val & ~0x00000007f)) { // 1 byte | |
208 *cp = val; | |
209 return 1; | |
210 } else if (!(val & ~0x00003fff)) { // 2 byte | |
211 *cp++ = (val >> 8 ) | 0x80; | |
212 *cp = val & 0xff; | |
213 return 2; | |
214 } else if (!(val & ~0x01fffff)) { // 3 byte | |
215 *cp++ = (val >> 16) | 0xc0; | |
216 *cp++ = (val >> 8 ) & 0xff; | |
217 *cp = val & 0xff; | |
218 return 3; | |
219 } else if (!(val & ~0x0fffffff)) { // 4 byte | |
220 *cp++ = (val >> 24) | 0xe0; | |
221 *cp++ = (val >> 16) & 0xff; | |
222 *cp++ = (val >> 8 ) & 0xff; | |
223 *cp = val & 0xff; | |
224 return 4; | |
225 } else { // 5 byte | |
226 *cp++ = 0xf0 | ((val>>28) & 0xff); | |
227 *cp++ = (val >> 20) & 0xff; | |
228 *cp++ = (val >> 12) & 0xff; | |
229 *cp++ = (val >> 4 ) & 0xff; | |
230 *cp = val & 0x0f; | |
231 return 5; | |
232 } | |
233 } | |
234 #endif | |
235 | |
236 /* 64-bit itf8 variant */ | |
237 int ltf8_put(char *cp, int64_t val) { | |
238 if (!(val & ~((1LL<<7)-1))) { | |
239 *cp = val; | |
240 return 1; | |
241 } else if (!(val & ~((1LL<<(6+8))-1))) { | |
242 *cp++ = (val >> 8 ) | 0x80; | |
243 *cp = val & 0xff; | |
244 return 2; | |
245 } else if (!(val & ~((1LL<<(5+2*8))-1))) { | |
246 *cp++ = (val >> 16) | 0xc0; | |
247 *cp++ = (val >> 8 ) & 0xff; | |
248 *cp = val & 0xff; | |
249 return 3; | |
250 } else if (!(val & ~((1LL<<(4+3*8))-1))) { | |
251 *cp++ = (val >> 24) | 0xe0; | |
252 *cp++ = (val >> 16) & 0xff; | |
253 *cp++ = (val >> 8 ) & 0xff; | |
254 *cp = val & 0xff; | |
255 return 4; | |
256 } else if (!(val & ~((1LL<<(3+4*8))-1))) { | |
257 *cp++ = (val >> 32) | 0xf0; | |
258 *cp++ = (val >> 24) & 0xff; | |
259 *cp++ = (val >> 16) & 0xff; | |
260 *cp++ = (val >> 8 ) & 0xff; | |
261 *cp = val & 0xff; | |
262 return 5; | |
263 } else if (!(val & ~((1LL<<(2+5*8))-1))) { | |
264 *cp++ = (val >> 40) | 0xf8; | |
265 *cp++ = (val >> 32) & 0xff; | |
266 *cp++ = (val >> 24) & 0xff; | |
267 *cp++ = (val >> 16) & 0xff; | |
268 *cp++ = (val >> 8 ) & 0xff; | |
269 *cp = val & 0xff; | |
270 return 6; | |
271 } else if (!(val & ~((1LL<<(1+6*8))-1))) { | |
272 *cp++ = (val >> 48) | 0xfc; | |
273 *cp++ = (val >> 40) & 0xff; | |
274 *cp++ = (val >> 32) & 0xff; | |
275 *cp++ = (val >> 24) & 0xff; | |
276 *cp++ = (val >> 16) & 0xff; | |
277 *cp++ = (val >> 8 ) & 0xff; | |
278 *cp = val & 0xff; | |
279 return 7; | |
280 } else if (!(val & ~((1LL<<(7*8))-1))) { | |
281 *cp++ = (val >> 56) | 0xfe; | |
282 *cp++ = (val >> 48) & 0xff; | |
283 *cp++ = (val >> 40) & 0xff; | |
284 *cp++ = (val >> 32) & 0xff; | |
285 *cp++ = (val >> 24) & 0xff; | |
286 *cp++ = (val >> 16) & 0xff; | |
287 *cp++ = (val >> 8 ) & 0xff; | |
288 *cp = val & 0xff; | |
289 return 8; | |
290 } else { | |
291 *cp++ = 0xff; | |
292 *cp++ = (val >> 56) & 0xff; | |
293 *cp++ = (val >> 48) & 0xff; | |
294 *cp++ = (val >> 40) & 0xff; | |
295 *cp++ = (val >> 32) & 0xff; | |
296 *cp++ = (val >> 24) & 0xff; | |
297 *cp++ = (val >> 16) & 0xff; | |
298 *cp++ = (val >> 8 ) & 0xff; | |
299 *cp = val & 0xff; | |
300 return 9; | |
301 } | |
302 } | |
303 | |
304 int ltf8_get(char *cp, int64_t *val_p) { | |
305 unsigned char *up = (unsigned char *)cp; | |
306 | |
307 if (up[0] < 0x80) { | |
308 *val_p = up[0]; | |
309 return 1; | |
310 } else if (up[0] < 0xc0) { | |
311 *val_p = (((uint64_t)up[0]<< 8) | | |
312 (uint64_t)up[1]) & (((1LL<<(6+8)))-1); | |
313 return 2; | |
314 } else if (up[0] < 0xe0) { | |
315 *val_p = (((uint64_t)up[0]<<16) | | |
316 ((uint64_t)up[1]<< 8) | | |
317 (uint64_t)up[2]) & ((1LL<<(5+2*8))-1); | |
318 return 3; | |
319 } else if (up[0] < 0xf0) { | |
320 *val_p = (((uint64_t)up[0]<<24) | | |
321 ((uint64_t)up[1]<<16) | | |
322 ((uint64_t)up[2]<< 8) | | |
323 (uint64_t)up[3]) & ((1LL<<(4+3*8))-1); | |
324 return 4; | |
325 } else if (up[0] < 0xf8) { | |
326 *val_p = (((uint64_t)up[0]<<32) | | |
327 ((uint64_t)up[1]<<24) | | |
328 ((uint64_t)up[2]<<16) | | |
329 ((uint64_t)up[3]<< 8) | | |
330 (uint64_t)up[4]) & ((1LL<<(3+4*8))-1); | |
331 return 5; | |
332 } else if (up[0] < 0xfc) { | |
333 *val_p = (((uint64_t)up[0]<<40) | | |
334 ((uint64_t)up[1]<<32) | | |
335 ((uint64_t)up[2]<<24) | | |
336 ((uint64_t)up[3]<<16) | | |
337 ((uint64_t)up[4]<< 8) | | |
338 (uint64_t)up[5]) & ((1LL<<(2+5*8))-1); | |
339 return 6; | |
340 } else if (up[0] < 0xfe) { | |
341 *val_p = (((uint64_t)up[0]<<48) | | |
342 ((uint64_t)up[1]<<40) | | |
343 ((uint64_t)up[2]<<32) | | |
344 ((uint64_t)up[3]<<24) | | |
345 ((uint64_t)up[4]<<16) | | |
346 ((uint64_t)up[5]<< 8) | | |
347 (uint64_t)up[6]) & ((1LL<<(1+6*8))-1); | |
348 return 7; | |
349 } else if (up[0] < 0xff) { | |
350 *val_p = (((uint64_t)up[1]<<48) | | |
351 ((uint64_t)up[2]<<40) | | |
352 ((uint64_t)up[3]<<32) | | |
353 ((uint64_t)up[4]<<24) | | |
354 ((uint64_t)up[5]<<16) | | |
355 ((uint64_t)up[6]<< 8) | | |
356 (uint64_t)up[7]) & ((1LL<<(7*8))-1); | |
357 return 8; | |
358 } else { | |
359 *val_p = (((uint64_t)up[1]<<56) | | |
360 ((uint64_t)up[2]<<48) | | |
361 ((uint64_t)up[3]<<40) | | |
362 ((uint64_t)up[4]<<32) | | |
363 ((uint64_t)up[5]<<24) | | |
364 ((uint64_t)up[6]<<16) | | |
365 ((uint64_t)up[7]<< 8) | | |
366 (uint64_t)up[8]); | |
367 return 9; | |
368 } | |
369 } | |
370 | |
371 int ltf8_decode(cram_fd *fd, int64_t *val_p) { | |
372 int c = hgetc(fd->fp); | |
373 int64_t val = (unsigned char)c; | |
374 if (c == -1) | |
375 return -1; | |
376 | |
377 if (val < 0x80) { | |
378 *val_p = val; | |
379 return 1; | |
380 | |
381 } else if (val < 0xc0) { | |
382 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
383 *val_p = val & (((1LL<<(6+8)))-1); | |
384 return 2; | |
385 | |
386 } else if (val < 0xe0) { | |
387 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
388 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
389 *val_p = val & ((1LL<<(5+2*8))-1); | |
390 return 3; | |
391 | |
392 } else if (val < 0xf0) { | |
393 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
394 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
395 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
396 *val_p = val & ((1LL<<(4+3*8))-1); | |
397 return 4; | |
398 | |
399 } else if (val < 0xf8) { | |
400 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
401 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
402 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
403 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
404 *val_p = val & ((1LL<<(3+4*8))-1); | |
405 return 5; | |
406 | |
407 } else if (val < 0xfc) { | |
408 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
409 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
410 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
411 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
412 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
413 *val_p = val & ((1LL<<(2+5*8))-1); | |
414 return 6; | |
415 | |
416 } else if (val < 0xfe) { | |
417 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
418 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
419 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
420 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
421 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
422 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
423 *val_p = val & ((1LL<<(1+6*8))-1); | |
424 return 7; | |
425 | |
426 } else if (val < 0xff) { | |
427 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
428 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
429 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
430 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
431 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
432 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
433 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
434 *val_p = val & ((1LL<<(7*8))-1); | |
435 return 8; | |
436 | |
437 } else { | |
438 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
439 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
440 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
441 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
442 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
443 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
444 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
445 val = (val<<8) | (unsigned char)hgetc(fd->fp); | |
446 *val_p = val; | |
447 } | |
448 | |
449 return 9; | |
450 } | |
451 | |
452 /* | |
453 * Pushes a value in ITF8 format onto the end of a block. | |
454 * This shouldn't be used for high-volume data as it is not the fastest | |
455 * method. | |
456 * | |
457 * Returns the number of bytes written | |
458 */ | |
459 int itf8_put_blk(cram_block *blk, int val) { | |
460 char buf[5]; | |
461 int sz; | |
462 | |
463 sz = itf8_put(buf, val); | |
464 BLOCK_APPEND(blk, buf, sz); | |
465 return sz; | |
466 } | |
467 | |
468 /* | |
469 * Decodes a 32-bit little endian value from fd and stores in val. | |
470 * | |
471 * Returns the number of bytes read on success | |
472 * -1 on failure | |
473 */ | |
474 int int32_decode(cram_fd *fd, int32_t *val) { | |
475 int32_t i; | |
476 if (4 != hread(fd->fp, &i, 4)) | |
477 return -1; | |
478 | |
479 *val = le_int4(i); | |
480 return 4; | |
481 } | |
482 | |
483 /* | |
484 * Encodes a 32-bit little endian value 'val' and writes to fd. | |
485 * | |
486 * Returns the number of bytes written on success | |
487 * -1 on failure | |
488 */ | |
489 int int32_encode(cram_fd *fd, int32_t val) { | |
490 val = le_int4(val); | |
491 if (4 != hwrite(fd->fp, &val, 4)) | |
492 return -1; | |
493 | |
494 return 4; | |
495 } | |
496 | |
497 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */ | |
498 int int32_get(cram_block *b, int32_t *val) { | |
499 if (b->uncomp_size - BLOCK_SIZE(b) < 4) | |
500 return -1; | |
501 | |
502 *val = | |
503 b->data[b->byte ] | | |
504 (b->data[b->byte+1] << 8) | | |
505 (b->data[b->byte+2] << 16) | | |
506 (b->data[b->byte+3] << 24); | |
507 BLOCK_SIZE(b) += 4; | |
508 return 4; | |
509 } | |
510 | |
511 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */ | |
512 int int32_put(cram_block *b, int32_t val) { | |
513 unsigned char cp[4]; | |
514 cp[0] = ( val & 0xff); | |
515 cp[1] = ((val>>8) & 0xff); | |
516 cp[2] = ((val>>16) & 0xff); | |
517 cp[3] = ((val>>24) & 0xff); | |
518 | |
519 BLOCK_APPEND(b, cp, 4); | |
520 return b->data ? 0 : -1; | |
521 } | |
522 | |
523 /* ---------------------------------------------------------------------- | |
524 * zlib compression code - from Gap5's tg_iface_g.c | |
525 * They're static here as they're only used within the cram_compress_block | |
526 * and cram_uncompress_block functions, which are the external interface. | |
527 */ | |
528 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) { | |
529 z_stream s; | |
530 unsigned char *data = NULL; /* Uncompressed output */ | |
531 int data_alloc = 0; | |
532 int err; | |
533 | |
534 /* Starting point at uncompressed size, and scale after that */ | |
535 data = malloc(data_alloc = csize*1.2+100); | |
536 if (!data) | |
537 return NULL; | |
538 | |
539 /* Initialise zlib stream */ | |
540 s.zalloc = Z_NULL; /* use default allocation functions */ | |
541 s.zfree = Z_NULL; | |
542 s.opaque = Z_NULL; | |
543 s.next_in = (unsigned char *)cdata; | |
544 s.avail_in = csize; | |
545 s.total_in = 0; | |
546 s.next_out = data; | |
547 s.avail_out = data_alloc; | |
548 s.total_out = 0; | |
549 | |
550 //err = inflateInit(&s); | |
551 err = inflateInit2(&s, 15 + 32); | |
552 if (err != Z_OK) { | |
553 fprintf(stderr, "zlib inflateInit error: %s\n", s.msg); | |
554 free(data); | |
555 return NULL; | |
556 } | |
557 | |
558 /* Decode to 'data' array */ | |
559 for (;s.avail_in;) { | |
560 unsigned char *data_tmp; | |
561 int alloc_inc; | |
562 | |
563 s.next_out = &data[s.total_out]; | |
564 err = inflate(&s, Z_NO_FLUSH); | |
565 if (err == Z_STREAM_END) | |
566 break; | |
567 | |
568 if (err != Z_OK) { | |
569 fprintf(stderr, "zlib inflate error: %s\n", s.msg); | |
570 break; | |
571 } | |
572 | |
573 /* More to come, so realloc based on growth so far */ | |
574 alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100; | |
575 data = realloc((data_tmp = data), data_alloc += alloc_inc); | |
576 if (!data) { | |
577 free(data_tmp); | |
578 return NULL; | |
579 } | |
580 s.avail_out += alloc_inc; | |
581 } | |
582 inflateEnd(&s); | |
583 | |
584 *size = s.total_out; | |
585 return (char *)data; | |
586 } | |
587 | |
588 static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size, | |
589 int level, int strat) { | |
590 z_stream s; | |
591 unsigned char *cdata = NULL; /* Compressed output */ | |
592 int cdata_alloc = 0; | |
593 int cdata_pos = 0; | |
594 int err; | |
595 | |
596 cdata = malloc(cdata_alloc = size*1.05+100); | |
597 if (!cdata) | |
598 return NULL; | |
599 cdata_pos = 0; | |
600 | |
601 /* Initialise zlib stream */ | |
602 s.zalloc = Z_NULL; /* use default allocation functions */ | |
603 s.zfree = Z_NULL; | |
604 s.opaque = Z_NULL; | |
605 s.next_in = (unsigned char *)data; | |
606 s.avail_in = size; | |
607 s.total_in = 0; | |
608 s.next_out = cdata; | |
609 s.avail_out = cdata_alloc; | |
610 s.total_out = 0; | |
611 s.data_type = Z_BINARY; | |
612 | |
613 err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat); | |
614 if (err != Z_OK) { | |
615 fprintf(stderr, "zlib deflateInit2 error: %s\n", s.msg); | |
616 return NULL; | |
617 } | |
618 | |
619 /* Encode to 'cdata' array */ | |
620 for (;s.avail_in;) { | |
621 s.next_out = &cdata[cdata_pos]; | |
622 s.avail_out = cdata_alloc - cdata_pos; | |
623 if (cdata_alloc - cdata_pos <= 0) { | |
624 fprintf(stderr, "Deflate produced larger output than expected. Abort\n"); | |
625 return NULL; | |
626 } | |
627 err = deflate(&s, Z_NO_FLUSH); | |
628 cdata_pos = cdata_alloc - s.avail_out; | |
629 if (err != Z_OK) { | |
630 fprintf(stderr, "zlib deflate error: %s\n", s.msg); | |
631 break; | |
632 } | |
633 } | |
634 if (deflate(&s, Z_FINISH) != Z_STREAM_END) { | |
635 fprintf(stderr, "zlib deflate error: %s\n", s.msg); | |
636 } | |
637 *cdata_size = s.total_out; | |
638 | |
639 if (deflateEnd(&s) != Z_OK) { | |
640 fprintf(stderr, "zlib deflate error: %s\n", s.msg); | |
641 } | |
642 return (char *)cdata; | |
643 } | |
644 | |
645 #ifdef HAVE_LIBLZMA | |
646 /* ------------------------------------------------------------------------ */ | |
647 /* | |
648 * Data compression routines using liblzma (xz) | |
649 * | |
650 * On a test set this shrunk the main db from 136157104 bytes to 114796168, but | |
651 * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq | |
652 * went from 18.3s to 36.3s. So decompression suffers too, but not as bad | |
653 * as compression times. | |
654 * | |
655 * For now we disable this functionality. If it's to be reenabled make sure you | |
656 * improve the mem_inflate implementation as it's just a test hack at the | |
657 * moment. | |
658 */ | |
659 | |
660 static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size, | |
661 int level) { | |
662 char *out; | |
663 size_t out_size = lzma_stream_buffer_bound(size); | |
664 *cdata_size = 0; | |
665 | |
666 out = malloc(out_size); | |
667 | |
668 /* Single call compression */ | |
669 if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL, | |
670 (uint8_t *)data, size, | |
671 (uint8_t *)out, cdata_size, | |
672 out_size)) | |
673 return NULL; | |
674 | |
675 return out; | |
676 } | |
677 | |
678 static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) { | |
679 lzma_stream strm = LZMA_STREAM_INIT; | |
680 size_t out_size = 0, out_pos = 0; | |
681 char *out = NULL; | |
682 int r; | |
683 | |
684 /* Initiate the decoder */ | |
685 if (LZMA_OK != lzma_stream_decoder(&strm, 50000000, 0)) | |
686 return NULL; | |
687 | |
688 /* Decode loop */ | |
689 strm.avail_in = csize; | |
690 strm.next_in = (uint8_t *)cdata; | |
691 | |
692 for (;strm.avail_in;) { | |
693 if (strm.avail_in > out_size - out_pos) { | |
694 out_size += strm.avail_in * 4 + 32768; | |
695 out = realloc(out, out_size); | |
696 } | |
697 strm.avail_out = out_size - out_pos; | |
698 strm.next_out = (uint8_t *)&out[out_pos]; | |
699 | |
700 r = lzma_code(&strm, LZMA_RUN); | |
701 if (LZMA_OK != r && LZMA_STREAM_END != r) { | |
702 fprintf(stderr, "r=%d\n", r); | |
703 fprintf(stderr, "mem=%"PRId64"d\n", (int64_t)lzma_memusage(&strm)); | |
704 return NULL; | |
705 } | |
706 | |
707 out_pos = strm.total_out; | |
708 | |
709 if (r == LZMA_STREAM_END) | |
710 break; | |
711 } | |
712 | |
713 /* finish up any unflushed data; necessary? */ | |
714 r = lzma_code(&strm, LZMA_FINISH); | |
715 if (r != LZMA_OK && r != LZMA_STREAM_END) { | |
716 fprintf(stderr, "r=%d\n", r); | |
717 return NULL; | |
718 } | |
719 | |
720 out = realloc(out, strm.total_out); | |
721 *size = strm.total_out; | |
722 | |
723 lzma_end(&strm); | |
724 | |
725 return out; | |
726 } | |
727 #endif | |
728 | |
729 /* ---------------------------------------------------------------------- | |
730 * CRAM blocks - the dynamically growable data block. We have code to | |
731 * create, update, (un)compress and read/write. | |
732 * | |
733 * These are derived from the deflate_interlaced.c blocks, but with the | |
734 * CRAM extension of content types and IDs. | |
735 */ | |
736 | |
737 /* | |
738 * Allocates a new cram_block structure with a specified content_type and | |
739 * id. | |
740 * | |
741 * Returns block pointer on success | |
742 * NULL on failure | |
743 */ | |
744 cram_block *cram_new_block(enum cram_content_type content_type, | |
745 int content_id) { | |
746 cram_block *b = malloc(sizeof(*b)); | |
747 if (!b) | |
748 return NULL; | |
749 b->method = b->orig_method = RAW; | |
750 b->content_type = content_type; | |
751 b->content_id = content_id; | |
752 b->comp_size = 0; | |
753 b->uncomp_size = 0; | |
754 b->data = NULL; | |
755 b->alloc = 0; | |
756 b->byte = 0; | |
757 b->bit = 7; // MSB | |
758 | |
759 return b; | |
760 } | |
761 | |
762 /* | |
763 * Reads a block from a cram file. | |
764 * Returns cram_block pointer on success. | |
765 * NULL on failure | |
766 */ | |
767 cram_block *cram_read_block(cram_fd *fd) { | |
768 cram_block *b = malloc(sizeof(*b)); | |
769 if (!b) | |
770 return NULL; | |
771 | |
772 //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp)); | |
773 | |
774 if (-1 == (b->method = hgetc(fd->fp))) { free(b); return NULL; } | |
775 if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; } | |
776 if (-1 == itf8_decode(fd, &b->content_id)) { free(b); return NULL; } | |
777 if (-1 == itf8_decode(fd, &b->comp_size)) { free(b); return NULL; } | |
778 if (-1 == itf8_decode(fd, &b->uncomp_size)) { free(b); return NULL; } | |
779 | |
780 // fprintf(stderr, " method %d, ctype %d, cid %d, csize %d, ucsize %d\n", | |
781 // b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size); | |
782 | |
783 if (b->method == RAW) { | |
784 b->alloc = b->uncomp_size; | |
785 if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; } | |
786 if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) { | |
787 free(b->data); | |
788 free(b); | |
789 return NULL; | |
790 } | |
791 } else { | |
792 b->alloc = b->comp_size; | |
793 if (!(b->data = malloc(b->comp_size))) { free(b); return NULL; } | |
794 if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) { | |
795 free(b->data); | |
796 free(b); | |
797 return NULL; | |
798 } | |
799 } | |
800 | |
801 if (CRAM_MAJOR_VERS(fd->version) >= 3) { | |
802 unsigned char dat[100], *cp = dat;; | |
803 uint32_t crc; | |
804 | |
805 | |
806 if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) { | |
807 free(b); | |
808 return NULL; | |
809 } | |
810 | |
811 *cp++ = b->method; | |
812 *cp++ = b->content_type; | |
813 cp += itf8_put(cp, b->content_id); | |
814 cp += itf8_put(cp, b->comp_size); | |
815 cp += itf8_put(cp, b->uncomp_size); | |
816 crc = crc32(0L, dat, cp-dat); | |
817 crc = crc32(crc, b->data ? b->data : (uc *)"", b->alloc); | |
818 | |
819 if (crc != b->crc32) { | |
820 fprintf(stderr, "Block CRC32 failure\n"); | |
821 free(b->data); | |
822 free(b); | |
823 return NULL; | |
824 } | |
825 } | |
826 | |
827 b->orig_method = b->method; | |
828 b->idx = 0; | |
829 b->byte = 0; | |
830 b->bit = 7; // MSB | |
831 | |
832 return b; | |
833 } | |
834 | |
835 /* | |
836 * Writes a CRAM block. | |
837 * Returns 0 on success | |
838 * -1 on failure | |
839 */ | |
840 int cram_write_block(cram_fd *fd, cram_block *b) { | |
841 assert(b->method != RAW || (b->comp_size == b->uncomp_size)); | |
842 | |
843 if (hputc(b->method, fd->fp) == EOF) return -1; | |
844 if (hputc(b->content_type, fd->fp) == EOF) return -1; | |
845 if (itf8_encode(fd, b->content_id) == -1) return -1; | |
846 if (itf8_encode(fd, b->comp_size) == -1) return -1; | |
847 if (itf8_encode(fd, b->uncomp_size) == -1) return -1; | |
848 | |
849 if (b->method == RAW) { | |
850 if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size)) | |
851 return -1; | |
852 } else { | |
853 if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size)) | |
854 return -1; | |
855 } | |
856 | |
857 if (CRAM_MAJOR_VERS(fd->version) >= 3) { | |
858 unsigned char dat[100], *cp = dat;; | |
859 uint32_t crc; | |
860 | |
861 *cp++ = b->method; | |
862 *cp++ = b->content_type; | |
863 cp += itf8_put(cp, b->content_id); | |
864 cp += itf8_put(cp, b->comp_size); | |
865 cp += itf8_put(cp, b->uncomp_size); | |
866 crc = crc32(0L, dat, cp-dat); | |
867 | |
868 if (b->method == RAW) { | |
869 b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size); | |
870 } else { | |
871 b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size); | |
872 } | |
873 | |
874 if (-1 == int32_encode(fd, b->crc32)) | |
875 return -1; | |
876 } | |
877 | |
878 return 0; | |
879 } | |
880 | |
881 /* | |
882 * Frees a CRAM block, deallocating internal data too. | |
883 */ | |
884 void cram_free_block(cram_block *b) { | |
885 if (!b) | |
886 return; | |
887 if (b->data) | |
888 free(b->data); | |
889 free(b); | |
890 } | |
891 | |
892 /* | |
893 * Uncompresses a CRAM block, if compressed. | |
894 */ | |
895 int cram_uncompress_block(cram_block *b) { | |
896 char *uncomp; | |
897 size_t uncomp_size = 0; | |
898 | |
899 if (b->uncomp_size == 0) { | |
900 // blank block | |
901 b->method = RAW; | |
902 return 0; | |
903 } | |
904 | |
905 switch (b->method) { | |
906 case RAW: | |
907 return 0; | |
908 | |
909 case GZIP: | |
910 uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size); | |
911 if (!uncomp) | |
912 return -1; | |
913 if ((int)uncomp_size != b->uncomp_size) { | |
914 free(uncomp); | |
915 return -1; | |
916 } | |
917 free(b->data); | |
918 b->data = (unsigned char *)uncomp; | |
919 b->alloc = uncomp_size; | |
920 b->method = RAW; | |
921 break; | |
922 | |
923 #ifdef HAVE_LIBBZ2 | |
924 case BZIP2: { | |
925 unsigned int usize = b->uncomp_size; | |
926 if (!(uncomp = malloc(usize))) | |
927 return -1; | |
928 if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize, | |
929 (char *)b->data, b->comp_size, | |
930 0, 0)) { | |
931 free(uncomp); | |
932 return -1; | |
933 } | |
934 free(b->data); | |
935 b->data = (unsigned char *)uncomp; | |
936 b->alloc = usize; | |
937 b->method = RAW; | |
938 b->uncomp_size = usize; // Just incase it differs | |
939 break; | |
940 } | |
941 #else | |
942 case BZIP2: | |
943 fprintf(stderr, "Bzip2 compression is not compiled into this " | |
944 "version.\nPlease rebuild and try again.\n"); | |
945 return -1; | |
946 #endif | |
947 | |
948 #ifdef HAVE_LIBLZMA | |
949 case LZMA: | |
950 uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size); | |
951 if (!uncomp) | |
952 return -1; | |
953 if ((int)uncomp_size != b->uncomp_size) | |
954 return -1; | |
955 free(b->data); | |
956 b->data = (unsigned char *)uncomp; | |
957 b->alloc = uncomp_size; | |
958 b->method = RAW; | |
959 break; | |
960 #else | |
961 case LZMA: | |
962 fprintf(stderr, "Lzma compression is not compiled into this " | |
963 "version.\nPlease rebuild and try again.\n"); | |
964 return -1; | |
965 break; | |
966 #endif | |
967 | |
968 case RANS: { | |
969 unsigned int usize = b->uncomp_size, usize2; | |
970 uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2); | |
971 assert(usize == usize2); | |
972 free(b->data); | |
973 b->data = (unsigned char *)uncomp; | |
974 b->alloc = usize2; | |
975 b->method = RAW; | |
976 b->uncomp_size = usize2; // Just incase it differs | |
977 //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size); | |
978 break; | |
979 } | |
980 | |
981 default: | |
982 return -1; | |
983 } | |
984 | |
985 return 0; | |
986 } | |
987 | |
988 static char *cram_compress_by_method(char *in, size_t in_size, | |
989 size_t *out_size, | |
990 enum cram_block_method method, | |
991 int level, int strat) { | |
992 switch (method) { | |
993 case GZIP: | |
994 return zlib_mem_deflate(in, in_size, out_size, level, strat); | |
995 | |
996 case BZIP2: { | |
997 #ifdef HAVE_LIBBZ2 | |
998 unsigned int comp_size = in_size*1.01 + 600; | |
999 char *comp = malloc(comp_size); | |
1000 if (!comp) | |
1001 return NULL; | |
1002 | |
1003 if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size, | |
1004 in, in_size, | |
1005 level, 0, 30)) { | |
1006 free(comp); | |
1007 return NULL; | |
1008 } | |
1009 *out_size = comp_size; | |
1010 return comp; | |
1011 #else | |
1012 return NULL; | |
1013 #endif | |
1014 } | |
1015 | |
1016 case LZMA: | |
1017 #ifdef HAVE_LIBLZMA | |
1018 return lzma_mem_deflate(in, in_size, out_size, level); | |
1019 #else | |
1020 return NULL; | |
1021 #endif | |
1022 | |
1023 case RANS0: { | |
1024 unsigned int out_size_i; | |
1025 unsigned char *cp; | |
1026 cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 0); | |
1027 *out_size = out_size_i; | |
1028 return (char *)cp; | |
1029 } | |
1030 | |
1031 case RANS1: { | |
1032 unsigned int out_size_i; | |
1033 unsigned char *cp; | |
1034 | |
1035 cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 1); | |
1036 *out_size = out_size_i; | |
1037 return (char *)cp; | |
1038 } | |
1039 | |
1040 case RAW: | |
1041 break; | |
1042 | |
1043 default: | |
1044 return NULL; | |
1045 } | |
1046 | |
1047 return NULL; | |
1048 } | |
1049 | |
1050 | |
1051 /* | |
1052 * Compresses a block using one of two different zlib strategies. If we only | |
1053 * want one choice set strat2 to be -1. | |
1054 * | |
1055 * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED | |
1056 * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is | |
1057 * significantly faster. | |
1058 */ | |
1059 int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, | |
1060 int method, int level) { | |
1061 | |
1062 char *comp = NULL; | |
1063 size_t comp_size = 0; | |
1064 int strat; | |
1065 | |
1066 //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size); | |
1067 | |
1068 if (method == RAW || level == 0 || b->uncomp_size == 0) { | |
1069 b->method = RAW; | |
1070 b->comp_size = b->uncomp_size; | |
1071 //fprintf(stderr, "Skip block id %d\n", b->content_id); | |
1072 return 0; | |
1073 } | |
1074 | |
1075 if (metrics) { | |
1076 pthread_mutex_lock(&fd->metrics_lock); | |
1077 if (metrics->trial > 0 || --metrics->next_trial <= 0) { | |
1078 size_t sz_best = INT_MAX; | |
1079 size_t sz_gz_rle = 0; | |
1080 size_t sz_gz_def = 0; | |
1081 size_t sz_rans0 = 0; | |
1082 size_t sz_rans1 = 0; | |
1083 size_t sz_bzip2 = 0; | |
1084 size_t sz_lzma = 0; | |
1085 int method_best = 0; | |
1086 char *c_best = NULL, *c = NULL; | |
1087 | |
1088 if (metrics->revised_method) | |
1089 method = metrics->revised_method; | |
1090 else | |
1091 metrics->revised_method = method; | |
1092 | |
1093 if (metrics->next_trial == 0) { | |
1094 metrics->next_trial = TRIAL_SPAN; | |
1095 metrics->trial = NTRIALS; | |
1096 metrics->sz_gz_rle /= 2; | |
1097 metrics->sz_gz_def /= 2; | |
1098 metrics->sz_rans0 /= 2; | |
1099 metrics->sz_rans1 /= 2; | |
1100 metrics->sz_bzip2 /= 2; | |
1101 metrics->sz_lzma /= 2; | |
1102 } | |
1103 | |
1104 pthread_mutex_unlock(&fd->metrics_lock); | |
1105 | |
1106 if (method & (1<<GZIP_RLE)) { | |
1107 c = cram_compress_by_method((char *)b->data, b->uncomp_size, | |
1108 &sz_gz_rle, GZIP, 1, Z_RLE); | |
1109 if (c && sz_best > sz_gz_rle) { | |
1110 sz_best = sz_gz_rle; | |
1111 method_best = GZIP_RLE; | |
1112 if (c_best) | |
1113 free(c_best); | |
1114 c_best = c; | |
1115 } else if (c) { | |
1116 free(c); | |
1117 } else { | |
1118 sz_gz_rle = b->uncomp_size*2+1000; | |
1119 } | |
1120 | |
1121 //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_rle); | |
1122 } | |
1123 | |
1124 if (method & (1<<GZIP)) { | |
1125 c = cram_compress_by_method((char *)b->data, b->uncomp_size, | |
1126 &sz_gz_def, GZIP, level, | |
1127 Z_FILTERED); | |
1128 if (c && sz_best > sz_gz_def) { | |
1129 sz_best = sz_gz_def; | |
1130 method_best = GZIP; | |
1131 if (c_best) | |
1132 free(c_best); | |
1133 c_best = c; | |
1134 } else if (c) { | |
1135 free(c); | |
1136 } else { | |
1137 sz_gz_def = b->uncomp_size*2+1000; | |
1138 } | |
1139 | |
1140 //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_def); | |
1141 } | |
1142 | |
1143 if (method & (1<<RANS0)) { | |
1144 c = cram_compress_by_method((char *)b->data, b->uncomp_size, | |
1145 &sz_rans0, RANS0, 0, 0); | |
1146 if (c && sz_best > sz_rans0) { | |
1147 sz_best = sz_rans0; | |
1148 method_best = RANS0; | |
1149 if (c_best) | |
1150 free(c_best); | |
1151 c_best = c; | |
1152 } else if (c) { | |
1153 free(c); | |
1154 } else { | |
1155 sz_rans0 = b->uncomp_size*2+1000; | |
1156 } | |
1157 } | |
1158 | |
1159 if (method & (1<<RANS1)) { | |
1160 c = cram_compress_by_method((char *)b->data, b->uncomp_size, | |
1161 &sz_rans1, RANS1, 0, 0); | |
1162 if (c && sz_best > sz_rans1) { | |
1163 sz_best = sz_rans1; | |
1164 method_best = RANS1; | |
1165 if (c_best) | |
1166 free(c_best); | |
1167 c_best = c; | |
1168 } else if (c) { | |
1169 free(c); | |
1170 } else { | |
1171 sz_rans1 = b->uncomp_size*2+1000; | |
1172 } | |
1173 } | |
1174 | |
1175 if (method & (1<<BZIP2)) { | |
1176 c = cram_compress_by_method((char *)b->data, b->uncomp_size, | |
1177 &sz_bzip2, BZIP2, level, 0); | |
1178 if (c && sz_best > sz_bzip2) { | |
1179 sz_best = sz_bzip2; | |
1180 method_best = BZIP2; | |
1181 if (c_best) | |
1182 free(c_best); | |
1183 c_best = c; | |
1184 } else if (c) { | |
1185 free(c); | |
1186 } else { | |
1187 sz_bzip2 = b->uncomp_size*2+1000; | |
1188 } | |
1189 } | |
1190 | |
1191 if (method & (1<<LZMA)) { | |
1192 c = cram_compress_by_method((char *)b->data, b->uncomp_size, | |
1193 &sz_lzma, LZMA, level, 0); | |
1194 if (c && sz_best > sz_lzma) { | |
1195 sz_best = sz_lzma; | |
1196 method_best = LZMA; | |
1197 if (c_best) | |
1198 free(c_best); | |
1199 c_best = c; | |
1200 } else if (c) { | |
1201 free(c); | |
1202 } else { | |
1203 sz_lzma = b->uncomp_size*2+1000; | |
1204 } | |
1205 } | |
1206 | |
1207 //fprintf(stderr, "sz_best = %d\n", sz_best); | |
1208 | |
1209 free(b->data); | |
1210 b->data = (unsigned char *)c_best; | |
1211 //printf("method_best = %s\n", cram_block_method2str(method_best)); | |
1212 b->method = method_best == GZIP_RLE ? GZIP : method_best; | |
1213 b->comp_size = sz_best; | |
1214 | |
1215 pthread_mutex_lock(&fd->metrics_lock); | |
1216 metrics->sz_gz_rle += sz_gz_rle; | |
1217 metrics->sz_gz_def += sz_gz_def; | |
1218 metrics->sz_rans0 += sz_rans0; | |
1219 metrics->sz_rans1 += sz_rans1; | |
1220 metrics->sz_bzip2 += sz_bzip2; | |
1221 metrics->sz_lzma += sz_lzma; | |
1222 if (--metrics->trial == 0) { | |
1223 int best_method = RAW; | |
1224 int best_sz = INT_MAX; | |
1225 | |
1226 // Scale methods by cost | |
1227 if (fd->level <= 3) { | |
1228 metrics->sz_rans1 *= 1.02; | |
1229 metrics->sz_gz_def *= 1.04; | |
1230 metrics->sz_bzip2 *= 1.08; | |
1231 metrics->sz_lzma *= 1.10; | |
1232 } else if (fd->level <= 6) { | |
1233 metrics->sz_rans1 *= 1.01; | |
1234 metrics->sz_gz_def *= 1.02; | |
1235 metrics->sz_bzip2 *= 1.03; | |
1236 metrics->sz_lzma *= 1.05; | |
1237 } | |
1238 | |
1239 if (method & (1<<GZIP_RLE) && best_sz > metrics->sz_gz_rle) | |
1240 best_sz = metrics->sz_gz_rle, best_method = GZIP_RLE; | |
1241 | |
1242 if (method & (1<<GZIP) && best_sz > metrics->sz_gz_def) | |
1243 best_sz = metrics->sz_gz_def, best_method = GZIP; | |
1244 | |
1245 if (method & (1<<RANS0) && best_sz > metrics->sz_rans0) | |
1246 best_sz = metrics->sz_rans0, best_method = RANS0; | |
1247 | |
1248 if (method & (1<<RANS1) && best_sz > metrics->sz_rans1) | |
1249 best_sz = metrics->sz_rans1, best_method = RANS1; | |
1250 | |
1251 if (method & (1<<BZIP2) && best_sz > metrics->sz_bzip2) | |
1252 best_sz = metrics->sz_bzip2, best_method = BZIP2; | |
1253 | |
1254 if (method & (1<<LZMA) && best_sz > metrics->sz_lzma) | |
1255 best_sz = metrics->sz_lzma, best_method = LZMA; | |
1256 | |
1257 if (best_method == GZIP_RLE) { | |
1258 metrics->method = GZIP; | |
1259 metrics->strat = Z_RLE; | |
1260 } else { | |
1261 metrics->method = best_method; | |
1262 metrics->strat = Z_FILTERED; | |
1263 } | |
1264 | |
1265 // If we see at least MAXFAIL trials in a row for a specific | |
1266 // compression method with more than MAXDELTA aggregate | |
1267 // size then we drop this from the list of methods used | |
1268 // for this block type. | |
1269 #define MAXDELTA 0.20 | |
1270 #define MAXFAILS 4 | |
1271 if (best_method == GZIP_RLE) { | |
1272 metrics->gz_rle_cnt = 0; | |
1273 metrics->gz_rle_extra = 0; | |
1274 } else if (best_sz < metrics->sz_gz_rle) { | |
1275 double r = (double)metrics->sz_gz_rle / best_sz - 1; | |
1276 if (++metrics->gz_rle_cnt >= MAXFAILS && | |
1277 (metrics->gz_rle_extra += r) >= MAXDELTA) | |
1278 method &= ~(1<<GZIP_RLE); | |
1279 } | |
1280 | |
1281 if (best_method == GZIP) { | |
1282 metrics->gz_def_cnt = 0; | |
1283 metrics->gz_def_extra = 0; | |
1284 } else if (best_sz < metrics->sz_gz_def) { | |
1285 double r = (double)metrics->sz_gz_def / best_sz - 1; | |
1286 if (++metrics->gz_def_cnt >= MAXFAILS && | |
1287 (metrics->gz_def_extra += r) >= MAXDELTA) | |
1288 method &= ~(1<<GZIP); | |
1289 } | |
1290 | |
1291 if (best_method == RANS0) { | |
1292 metrics->rans0_cnt = 0; | |
1293 metrics->rans0_extra = 0; | |
1294 } else if (best_sz < metrics->sz_rans0) { | |
1295 double r = (double)metrics->sz_rans0 / best_sz - 1; | |
1296 if (++metrics->rans0_cnt >= MAXFAILS && | |
1297 (metrics->rans0_extra += r) >= MAXDELTA) | |
1298 method &= ~(1<<RANS0); | |
1299 } | |
1300 | |
1301 if (best_method == RANS1) { | |
1302 metrics->rans1_cnt = 0; | |
1303 metrics->rans1_extra = 0; | |
1304 } else if (best_sz < metrics->sz_rans1) { | |
1305 double r = (double)metrics->sz_rans1 / best_sz - 1; | |
1306 if (++metrics->rans1_cnt >= MAXFAILS && | |
1307 (metrics->rans1_extra += r) >= MAXDELTA) | |
1308 method &= ~(1<<RANS1); | |
1309 } | |
1310 | |
1311 if (best_method == BZIP2) { | |
1312 metrics->bzip2_cnt = 0; | |
1313 metrics->bzip2_extra = 0; | |
1314 } else if (best_sz < metrics->sz_bzip2) { | |
1315 double r = (double)metrics->sz_bzip2 / best_sz - 1; | |
1316 if (++metrics->bzip2_cnt >= MAXFAILS && | |
1317 (metrics->bzip2_extra += r) >= MAXDELTA) | |
1318 method &= ~(1<<BZIP2); | |
1319 } | |
1320 | |
1321 if (best_method == LZMA) { | |
1322 metrics->lzma_cnt = 0; | |
1323 metrics->lzma_extra = 0; | |
1324 } else if (best_sz < metrics->sz_lzma) { | |
1325 double r = (double)metrics->sz_lzma / best_sz - 1; | |
1326 if (++metrics->lzma_cnt >= MAXFAILS && | |
1327 (metrics->lzma_extra += r) >= MAXDELTA) | |
1328 method &= ~(1<<LZMA); | |
1329 } | |
1330 | |
1331 //if (method != metrics->revised_method) | |
1332 // fprintf(stderr, "%d: method from %x to %x\n", | |
1333 // b->content_id, metrics->revised_method, method); | |
1334 metrics->revised_method = method; | |
1335 } | |
1336 pthread_mutex_unlock(&fd->metrics_lock); | |
1337 } else { | |
1338 strat = metrics->strat; | |
1339 method = metrics->method; | |
1340 | |
1341 pthread_mutex_unlock(&fd->metrics_lock); | |
1342 comp = cram_compress_by_method((char *)b->data, b->uncomp_size, | |
1343 &comp_size, method, | |
1344 level, strat); | |
1345 if (!comp) | |
1346 return -1; | |
1347 free(b->data); | |
1348 b->data = (unsigned char *)comp; | |
1349 b->comp_size = comp_size; | |
1350 b->method = method; | |
1351 } | |
1352 | |
1353 } else { | |
1354 // no cached metrics, so just do zlib? | |
1355 comp = cram_compress_by_method((char *)b->data, b->uncomp_size, | |
1356 &comp_size, GZIP, level, Z_FILTERED); | |
1357 if (!comp) { | |
1358 fprintf(stderr, "Compression failed!\n"); | |
1359 return -1; | |
1360 } | |
1361 free(b->data); | |
1362 b->data = (unsigned char *)comp; | |
1363 b->comp_size = comp_size; | |
1364 b->method = GZIP; | |
1365 } | |
1366 | |
1367 if (fd->verbose) | |
1368 fprintf(stderr, "Compressed block ID %d from %d to %d by method %s\n", | |
1369 b->content_id, b->uncomp_size, b->comp_size, | |
1370 cram_block_method2str(b->method)); | |
1371 | |
1372 if (b->method == RANS1) | |
1373 b->method = RANS0; // Spec just has RANS (not 0/1) with auto-sensing | |
1374 | |
1375 return 0; | |
1376 } | |
1377 | |
1378 cram_metrics *cram_new_metrics(void) { | |
1379 cram_metrics *m = calloc(1, sizeof(*m)); | |
1380 if (!m) | |
1381 return NULL; | |
1382 m->trial = NTRIALS-1; | |
1383 m->next_trial = TRIAL_SPAN; | |
1384 m->method = RAW; | |
1385 m->strat = 0; | |
1386 m->revised_method = 0; | |
1387 | |
1388 return m; | |
1389 } | |
1390 | |
1391 char *cram_block_method2str(enum cram_block_method m) { | |
1392 switch(m) { | |
1393 case RAW: return "RAW"; | |
1394 case GZIP: return "GZIP"; | |
1395 case BZIP2: return "BZIP2"; | |
1396 case LZMA: return "LZMA"; | |
1397 case RANS0: return "RANS0"; | |
1398 case RANS1: return "RANS1"; | |
1399 case GZIP_RLE: return "GZIP_RLE"; | |
1400 case ERROR: break; | |
1401 } | |
1402 return "?"; | |
1403 } | |
1404 | |
1405 char *cram_content_type2str(enum cram_content_type t) { | |
1406 switch (t) { | |
1407 case FILE_HEADER: return "FILE_HEADER"; | |
1408 case COMPRESSION_HEADER: return "COMPRESSION_HEADER"; | |
1409 case MAPPED_SLICE: return "MAPPED_SLICE"; | |
1410 case UNMAPPED_SLICE: return "UNMAPPED_SLICE"; | |
1411 case EXTERNAL: return "EXTERNAL"; | |
1412 case CORE: return "CORE"; | |
1413 case CT_ERROR: break; | |
1414 } | |
1415 return "?"; | |
1416 } | |
1417 | |
1418 /* | |
1419 * Extra error checking on fclose to really ensure data is written. | |
1420 * Care needs to be taken to handle pipes vs real files. | |
1421 * | |
1422 * Returns 0 on success | |
1423 * -1 on failure. | |
1424 */ | |
1425 int paranoid_fclose(FILE *fp) { | |
1426 if (-1 == fflush(fp) && errno != EBADF) { | |
1427 fclose(fp); | |
1428 return -1; | |
1429 } | |
1430 | |
1431 errno = 0; | |
1432 if (-1 == fsync(fileno(fp))) { | |
1433 if (errno != EINVAL) { // eg pipe | |
1434 fclose(fp); | |
1435 return -1; | |
1436 } | |
1437 } | |
1438 return fclose(fp); | |
1439 } | |
1440 | |
1441 /* ---------------------------------------------------------------------- | |
1442 * Reference sequence handling | |
1443 * | |
1444 * These revolve around the refs_t structure, which may potentially be | |
1445 * shared between multiple cram_fd. | |
1446 * | |
1447 * We start with refs_create() to allocate an empty refs_t and then | |
1448 * populate it with @SQ line data using refs_from_header(). This is done on | |
1449 * cram_open(). Also at start up we can call cram_load_reference() which | |
1450 * is used with "scramble -r foo.fa". This replaces the fd->refs with the | |
1451 * new one specified. In either case refs2id() is then called which | |
1452 * maps ref_entry names to @SQ ids (refs_t->ref_id[]). | |
1453 * | |
1454 * Later, possibly within a thread, we will want to know the actual ref | |
1455 * seq itself, obtained by calling cram_get_ref(). This may use the | |
1456 * UR: or M5: fields or the filename specified in the original | |
1457 * cram_load_reference() call. | |
1458 * | |
1459 * Given the potential for multi-threaded reference usage, we have | |
1460 * reference counting (sorry for the confusing double use of "ref") to | |
1461 * track the number of callers interested in any specific reference. | |
1462 */ | |
1463 | |
1464 void refs_free(refs_t *r) { | |
1465 RP("refs_free()\n"); | |
1466 | |
1467 if (--r->count > 0) | |
1468 return; | |
1469 | |
1470 if (!r) | |
1471 return; | |
1472 | |
1473 if (r->pool) | |
1474 string_pool_destroy(r->pool); | |
1475 | |
1476 if (r->h_meta) { | |
1477 khint_t k; | |
1478 | |
1479 for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) { | |
1480 ref_entry *e; | |
1481 | |
1482 if (!kh_exist(r->h_meta, k)) | |
1483 continue; | |
1484 if (!(e = kh_val(r->h_meta, k))) | |
1485 continue; | |
1486 if (e->seq) | |
1487 free(e->seq); | |
1488 free(e); | |
1489 } | |
1490 | |
1491 kh_destroy(refs, r->h_meta); | |
1492 } | |
1493 | |
1494 if (r->ref_id) | |
1495 free(r->ref_id); | |
1496 | |
1497 if (r->fp) | |
1498 bgzf_close(r->fp); | |
1499 | |
1500 pthread_mutex_destroy(&r->lock); | |
1501 | |
1502 free(r); | |
1503 } | |
1504 | |
1505 static refs_t *refs_create(void) { | |
1506 refs_t *r = calloc(1, sizeof(*r)); | |
1507 | |
1508 RP("refs_create()\n"); | |
1509 | |
1510 if (!r) | |
1511 return NULL; | |
1512 | |
1513 if (!(r->pool = string_pool_create(8192))) | |
1514 goto err; | |
1515 | |
1516 r->ref_id = NULL; // see refs2id() to populate. | |
1517 r->count = 1; | |
1518 r->last = NULL; | |
1519 r->last_id = -1; | |
1520 | |
1521 if (!(r->h_meta = kh_init(refs))) | |
1522 goto err; | |
1523 | |
1524 pthread_mutex_init(&r->lock, NULL); | |
1525 | |
1526 return r; | |
1527 | |
1528 err: | |
1529 refs_free(r); | |
1530 return NULL; | |
1531 } | |
1532 | |
1533 /* | |
1534 * Opens a reference fasta file as a BGZF stream, allowing for | |
1535 * compressed files. It automatically builds a .fai file if | |
1536 * required and if compressed a .gzi bgzf index too. | |
1537 * | |
1538 * Returns a BGZF handle on success; | |
1539 * NULL on failure. | |
1540 */ | |
1541 static BGZF *bgzf_open_ref(char *fn, char *mode) { | |
1542 BGZF *fp; | |
1543 char fai_file[PATH_MAX]; | |
1544 | |
1545 snprintf(fai_file, PATH_MAX, "%s.fai", fn); | |
1546 if (access(fai_file, R_OK) != 0) | |
1547 if (fai_build(fn) != 0) | |
1548 return NULL; | |
1549 | |
1550 if (!(fp = bgzf_open(fn, mode))) { | |
1551 perror(fn); | |
1552 return NULL; | |
1553 } | |
1554 | |
1555 if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) { | |
1556 fprintf(stderr, "Unable to load .gzi index '%s.gzi'\n", fn); | |
1557 bgzf_close(fp); | |
1558 return NULL; | |
1559 } | |
1560 | |
1561 return fp; | |
1562 } | |
1563 | |
1564 /* | |
1565 * Loads a FAI file for a reference.fasta. | |
1566 * "is_err" indicates whether failure to load is worthy of emitting an | |
1567 * error message. In some cases (eg with embedded references) we | |
1568 * speculatively load, just incase, and silently ignore errors. | |
1569 * | |
1570 * Returns the refs_t struct on success (maybe newly allocated); | |
1571 * NULL on failure | |
1572 */ | |
1573 static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) { | |
1574 struct stat sb; | |
1575 FILE *fp = NULL; | |
1576 char fai_fn[PATH_MAX]; | |
1577 char line[8192]; | |
1578 refs_t *r = r_orig; | |
1579 size_t fn_l = strlen(fn); | |
1580 int id = 0, id_alloc = 0; | |
1581 | |
1582 RP("refs_load_fai %s\n", fn); | |
1583 | |
1584 if (!r) | |
1585 if (!(r = refs_create())) | |
1586 goto err; | |
1587 | |
1588 /* Open reference, for later use */ | |
1589 if (stat(fn, &sb) != 0) { | |
1590 if (is_err) | |
1591 perror(fn); | |
1592 goto err; | |
1593 } | |
1594 | |
1595 if (r->fp) | |
1596 if (bgzf_close(r->fp) != 0) | |
1597 goto err; | |
1598 r->fp = NULL; | |
1599 | |
1600 if (!(r->fn = string_dup(r->pool, fn))) | |
1601 goto err; | |
1602 | |
1603 if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0) | |
1604 r->fn[fn_l-4] = 0; | |
1605 | |
1606 if (!(r->fp = bgzf_open_ref(r->fn, "r"))) | |
1607 goto err; | |
1608 | |
1609 /* Parse .fai file and load meta-data */ | |
1610 sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, r->fn); | |
1611 | |
1612 if (stat(fai_fn, &sb) != 0) { | |
1613 if (is_err) | |
1614 perror(fai_fn); | |
1615 goto err; | |
1616 } | |
1617 if (!(fp = fopen(fai_fn, "r"))) { | |
1618 if (is_err) | |
1619 perror(fai_fn); | |
1620 goto err; | |
1621 } | |
1622 while (fgets(line, 8192, fp) != NULL) { | |
1623 ref_entry *e = malloc(sizeof(*e)); | |
1624 char *cp; | |
1625 int n; | |
1626 khint_t k; | |
1627 | |
1628 if (!e) | |
1629 return NULL; | |
1630 | |
1631 // id | |
1632 for (cp = line; *cp && !isspace(*cp); cp++) | |
1633 ; | |
1634 *cp++ = 0; | |
1635 e->name = string_dup(r->pool, line); | |
1636 | |
1637 // length | |
1638 while (*cp && isspace(*cp)) | |
1639 cp++; | |
1640 e->length = strtoll(cp, &cp, 10); | |
1641 | |
1642 // offset | |
1643 while (*cp && isspace(*cp)) | |
1644 cp++; | |
1645 e->offset = strtoll(cp, &cp, 10); | |
1646 | |
1647 // bases per line | |
1648 while (*cp && isspace(*cp)) | |
1649 cp++; | |
1650 e->bases_per_line = strtol(cp, &cp, 10); | |
1651 | |
1652 // line length | |
1653 while (*cp && isspace(*cp)) | |
1654 cp++; | |
1655 e->line_length = strtol(cp, &cp, 10); | |
1656 | |
1657 // filename | |
1658 e->fn = r->fn; | |
1659 | |
1660 e->count = 0; | |
1661 e->seq = NULL; | |
1662 | |
1663 k = kh_put(refs, r->h_meta, e->name, &n); | |
1664 if (-1 == n) { | |
1665 free(e); | |
1666 return NULL; | |
1667 } | |
1668 | |
1669 if (n) { | |
1670 kh_val(r->h_meta, k) = e; | |
1671 } else { | |
1672 ref_entry *re = kh_val(r->h_meta, k); | |
1673 if (re && (re->count != 0 || re->length != 0)) { | |
1674 /* Keep old */ | |
1675 free(e); | |
1676 } else { | |
1677 /* Replace old */ | |
1678 if (re) | |
1679 free(re); | |
1680 kh_val(r->h_meta, k) = e; | |
1681 } | |
1682 } | |
1683 | |
1684 if (id >= id_alloc) { | |
1685 int x; | |
1686 | |
1687 id_alloc = id_alloc ?id_alloc*2 : 16; | |
1688 r->ref_id = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id)); | |
1689 | |
1690 for (x = id; x < id_alloc; x++) | |
1691 r->ref_id[x] = NULL; | |
1692 } | |
1693 r->ref_id[id] = e; | |
1694 r->nref = ++id; | |
1695 } | |
1696 | |
1697 return r; | |
1698 | |
1699 err: | |
1700 if (fp) | |
1701 fclose(fp); | |
1702 | |
1703 if (!r_orig) | |
1704 refs_free(r); | |
1705 | |
1706 return NULL; | |
1707 } | |
1708 | |
1709 /* | |
1710 * Indexes references by the order they appear in a BAM file. This may not | |
1711 * necessarily be the same order they appear in the fasta reference file. | |
1712 * | |
1713 * Returns 0 on success | |
1714 * -1 on failure | |
1715 */ | |
1716 int refs2id(refs_t *r, SAM_hdr *h) { | |
1717 int i; | |
1718 | |
1719 if (r->ref_id) | |
1720 free(r->ref_id); | |
1721 if (r->last) | |
1722 r->last = NULL; | |
1723 | |
1724 r->ref_id = calloc(h->nref, sizeof(*r->ref_id)); | |
1725 if (!r->ref_id) | |
1726 return -1; | |
1727 | |
1728 r->nref = h->nref; | |
1729 for (i = 0; i < h->nref; i++) { | |
1730 khint_t k = kh_get(refs, r->h_meta, h->ref[i].name); | |
1731 if (k != kh_end(r->h_meta)) { | |
1732 r->ref_id[i] = kh_val(r->h_meta, k); | |
1733 } else { | |
1734 fprintf(stderr, "Unable to find ref name '%s'\n", | |
1735 h->ref[i].name); | |
1736 } | |
1737 } | |
1738 | |
1739 return 0; | |
1740 } | |
1741 | |
1742 /* | |
1743 * Generates refs_t entries based on @SQ lines in the header. | |
1744 * Returns 0 on success | |
1745 * -1 on failure | |
1746 */ | |
1747 static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) { | |
1748 int i, j; | |
1749 | |
1750 if (!h || h->nref == 0) | |
1751 return 0; | |
1752 | |
1753 //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode); | |
1754 | |
1755 /* Existing refs are fine, as long as they're compatible with the hdr. */ | |
1756 if (!(r->ref_id = realloc(r->ref_id, (r->nref + h->nref) * sizeof(*r->ref_id)))) | |
1757 return -1; | |
1758 | |
1759 /* Copy info from h->ref[i] over to r */ | |
1760 for (i = 0, j = r->nref; i < h->nref; i++) { | |
1761 SAM_hdr_type *ty; | |
1762 SAM_hdr_tag *tag; | |
1763 khint_t k; | |
1764 int n; | |
1765 | |
1766 k = kh_get(refs, r->h_meta, h->ref[i].name); | |
1767 if (k != kh_end(r->h_meta)) | |
1768 // Ref already known about | |
1769 continue; | |
1770 | |
1771 if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry)))) | |
1772 return -1; | |
1773 | |
1774 if (!h->ref[j].name) | |
1775 return -1; | |
1776 | |
1777 r->ref_id[j]->name = string_dup(r->pool, h->ref[i].name); | |
1778 r->ref_id[j]->length = 0; // marker for not yet loaded | |
1779 | |
1780 /* Initialise likely filename if known */ | |
1781 if ((ty = sam_hdr_find(h, "SQ", "SN", h->ref[i].name))) { | |
1782 if ((tag = sam_hdr_find_key(h, ty, "M5", NULL))) { | |
1783 r->ref_id[j]->fn = string_dup(r->pool, tag->str+3); | |
1784 //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn); | |
1785 } | |
1786 } | |
1787 | |
1788 k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n); | |
1789 if (n <= 0) // already exists or error | |
1790 return -1; | |
1791 kh_val(r->h_meta, k) = r->ref_id[j]; | |
1792 | |
1793 j++; | |
1794 } | |
1795 r->nref = j; | |
1796 | |
1797 return 0; | |
1798 } | |
1799 | |
1800 /* | |
1801 * Attaches a header to a cram_fd. | |
1802 * | |
1803 * This should be used when creating a new cram_fd for writing where | |
1804 * we have an SAM_hdr already constructed (eg from a file we've read | |
1805 * in). | |
1806 */ | |
1807 int cram_set_header(cram_fd *fd, SAM_hdr *hdr) { | |
1808 if (fd->header) | |
1809 sam_hdr_free(fd->header); | |
1810 fd->header = hdr; | |
1811 return refs_from_header(fd->refs, fd, hdr); | |
1812 } | |
1813 | |
1814 /* | |
1815 * Converts a directory and a filename into an expanded path, replacing %s | |
1816 * in directory with the filename and %[0-9]+s with portions of the filename | |
1817 * Any remaining parts of filename are added to the end with /%s. | |
1818 */ | |
1819 void expand_cache_path(char *path, char *dir, char *fn) { | |
1820 char *cp; | |
1821 | |
1822 while ((cp = strchr(dir, '%'))) { | |
1823 strncpy(path, dir, cp-dir); | |
1824 path += cp-dir; | |
1825 | |
1826 if (*++cp == 's') { | |
1827 strcpy(path, fn); | |
1828 path += strlen(fn); | |
1829 fn += strlen(fn); | |
1830 cp++; | |
1831 } else if (*cp >= '0' && *cp <= '9') { | |
1832 char *endp; | |
1833 long l; | |
1834 | |
1835 l = strtol(cp, &endp, 10); | |
1836 l = MIN(l, strlen(fn)); | |
1837 if (*endp == 's') { | |
1838 strncpy(path, fn, l); | |
1839 path += l; | |
1840 fn += l; | |
1841 *path = 0; | |
1842 cp = endp+1; | |
1843 } else { | |
1844 *path++ = '%'; | |
1845 *path++ = *cp++; | |
1846 } | |
1847 } else { | |
1848 *path++ = '%'; | |
1849 *path++ = *cp++; | |
1850 } | |
1851 dir = cp; | |
1852 } | |
1853 strcpy(path, dir); | |
1854 path += strlen(dir); | |
1855 if (*fn && path[-1] != '/') | |
1856 *path++ = '/'; | |
1857 strcpy(path, fn); | |
1858 } | |
1859 | |
1860 /* | |
1861 * Make the directory containing path and any prefix directories. | |
1862 */ | |
1863 void mkdir_prefix(char *path, int mode) { | |
1864 char *cp = strrchr(path, '/'); | |
1865 if (!cp) | |
1866 return; | |
1867 | |
1868 *cp = 0; | |
1869 if (is_directory(path)) { | |
1870 *cp = '/'; | |
1871 return; | |
1872 } | |
1873 | |
1874 if (mkdir(path, mode) == 0) { | |
1875 chmod(path, mode); | |
1876 *cp = '/'; | |
1877 return; | |
1878 } | |
1879 | |
1880 mkdir_prefix(path, mode); | |
1881 mkdir(path, mode); | |
1882 chmod(path, mode); | |
1883 *cp = '/'; | |
1884 } | |
1885 | |
1886 /* | |
1887 * Return the cache directory to use, based on the first of these | |
1888 * environment variables to be set to a non-empty value. | |
1889 */ | |
1890 static const char *get_cache_basedir(const char **extra) { | |
1891 char *base; | |
1892 | |
1893 *extra = ""; | |
1894 | |
1895 base = getenv("XDG_CACHE_HOME"); | |
1896 if (base && *base) return base; | |
1897 | |
1898 base = getenv("HOME"); | |
1899 if (base && *base) { *extra = "/.cache"; return base; } | |
1900 | |
1901 base = getenv("TMPDIR"); | |
1902 if (base && *base) return base; | |
1903 | |
1904 base = getenv("TEMP"); | |
1905 if (base && *base) return base; | |
1906 | |
1907 return "/tmp"; | |
1908 } | |
1909 | |
1910 /* | |
1911 * Queries the M5 string from the header and attempts to populate the | |
1912 * reference from this using the REF_PATH environment. | |
1913 * | |
1914 * Returns 0 on sucess | |
1915 * -1 on failure | |
1916 */ | |
1917 static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { | |
1918 char *ref_path = getenv("REF_PATH"); | |
1919 SAM_hdr_type *ty; | |
1920 SAM_hdr_tag *tag; | |
1921 char path[PATH_MAX], path_tmp[PATH_MAX], cache[PATH_MAX]; | |
1922 char *local_cache = getenv("REF_CACHE"); | |
1923 mFILE *mf; | |
1924 | |
1925 if (fd->verbose) | |
1926 fprintf(stderr, "cram_populate_ref on fd %p, id %d\n", fd, id); | |
1927 | |
1928 if (!ref_path || *ref_path == '\0') { | |
1929 /* | |
1930 * If we have no ref path, we use the EBI server. | |
1931 * However to avoid spamming it we require a local ref cache too. | |
1932 */ | |
1933 ref_path = "http://www.ebi.ac.uk:80/ena/cram/md5/%s"; | |
1934 if (!local_cache || *local_cache == '\0') { | |
1935 const char *extra; | |
1936 const char *base = get_cache_basedir(&extra); | |
1937 snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra); | |
1938 local_cache = cache; | |
1939 if (fd->verbose) | |
1940 fprintf(stderr, "Populating local cache: %s\n", local_cache); | |
1941 } | |
1942 } | |
1943 | |
1944 if (!r->name) | |
1945 return -1; | |
1946 | |
1947 if (!(ty = sam_hdr_find(fd->header, "SQ", "SN", r->name))) | |
1948 return -1; | |
1949 | |
1950 if (!(tag = sam_hdr_find_key(fd->header, ty, "M5", NULL))) | |
1951 goto no_M5; | |
1952 | |
1953 if (fd->verbose) | |
1954 fprintf(stderr, "Querying ref %s\n", tag->str+3); | |
1955 | |
1956 /* Use cache if available */ | |
1957 if (local_cache && *local_cache) { | |
1958 struct stat sb; | |
1959 BGZF *fp; | |
1960 | |
1961 expand_cache_path(path, local_cache, tag->str+3); | |
1962 | |
1963 if (0 == stat(path, &sb) && (fp = bgzf_open(path, "r"))) { | |
1964 r->length = sb.st_size; | |
1965 r->offset = r->line_length = r->bases_per_line = 0; | |
1966 | |
1967 r->fn = string_dup(fd->refs->pool, path); | |
1968 | |
1969 if (fd->refs->fp) | |
1970 if (bgzf_close(fd->refs->fp) != 0) | |
1971 return -1; | |
1972 fd->refs->fp = fp; | |
1973 fd->refs->fn = r->fn; | |
1974 | |
1975 // Fall back to cram_get_ref() where it'll do the actual | |
1976 // reading of the file. | |
1977 return 0; | |
1978 } | |
1979 } | |
1980 | |
1981 /* Otherwise search */ | |
1982 if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) { | |
1983 size_t sz; | |
1984 r->seq = mfsteal(mf, &sz); | |
1985 r->length = sz; | |
1986 } else { | |
1987 refs_t *refs; | |
1988 char *fn; | |
1989 | |
1990 no_M5: | |
1991 /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */ | |
1992 if (!(tag = sam_hdr_find_key(fd->header, ty, "UR", NULL))) | |
1993 return -1; | |
1994 | |
1995 fn = (strncmp(tag->str+3, "file:", 5) == 0) | |
1996 ? tag->str+8 | |
1997 : tag->str+3; | |
1998 | |
1999 if (fd->refs->fp) { | |
2000 if (bgzf_close(fd->refs->fp) != 0) | |
2001 return -1; | |
2002 fd->refs->fp = NULL; | |
2003 } | |
2004 if (!(refs = refs_load_fai(fd->refs, fn, 0))) | |
2005 return -1; | |
2006 fd->refs = refs; | |
2007 if (fd->refs->fp) { | |
2008 if (bgzf_close(fd->refs->fp) != 0) | |
2009 return -1; | |
2010 fd->refs->fp = NULL; | |
2011 } | |
2012 | |
2013 if (!fd->refs->fn) | |
2014 return -1; | |
2015 | |
2016 if (-1 == refs2id(fd->refs, fd->header)) | |
2017 return -1; | |
2018 if (!fd->refs->ref_id || !fd->refs->ref_id[id]) | |
2019 return -1; | |
2020 | |
2021 // Local copy already, so fall back to cram_get_ref(). | |
2022 return 0; | |
2023 } | |
2024 | |
2025 /* Populate the local disk cache if required */ | |
2026 if (local_cache && *local_cache) { | |
2027 FILE *fp; | |
2028 int i; | |
2029 | |
2030 expand_cache_path(path, local_cache, tag->str+3); | |
2031 if (fd->verbose) | |
2032 fprintf(stderr, "Path='%s'\n", path); | |
2033 mkdir_prefix(path, 01777); | |
2034 | |
2035 i = 0; | |
2036 do { | |
2037 sprintf(path_tmp, "%s.tmp_%d", path, /*getpid(),*/ i); | |
2038 i++; | |
2039 fp = fopen(path_tmp, "wx"); | |
2040 } while (fp == NULL && errno == EEXIST); | |
2041 if (!fp) { | |
2042 perror(path_tmp); | |
2043 | |
2044 // Not fatal - we have the data already so keep going. | |
2045 return 0; | |
2046 } | |
2047 | |
2048 if (r->length != fwrite(r->seq, 1, r->length, fp)) { | |
2049 perror(path); | |
2050 } | |
2051 if (-1 == paranoid_fclose(fp)) { | |
2052 unlink(path_tmp); | |
2053 } else { | |
2054 if (0 == chmod(path_tmp, 0444)) | |
2055 rename(path_tmp, path); | |
2056 else | |
2057 unlink(path_tmp); | |
2058 } | |
2059 } | |
2060 | |
2061 return 0; | |
2062 } | |
2063 | |
2064 static void cram_ref_incr_locked(refs_t *r, int id) { | |
2065 RP("%d INC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count+1:-999), id>=0?r->ref_id[id]->seq:(char *)1); | |
2066 | |
2067 if (id < 0 || !r->ref_id[id]->seq) | |
2068 return; | |
2069 | |
2070 if (r->last_id == id) | |
2071 r->last_id = -1; | |
2072 | |
2073 ++r->ref_id[id]->count; | |
2074 } | |
2075 | |
2076 void cram_ref_incr(refs_t *r, int id) { | |
2077 pthread_mutex_lock(&r->lock); | |
2078 cram_ref_incr_locked(r, id); | |
2079 pthread_mutex_unlock(&r->lock); | |
2080 } | |
2081 | |
2082 static void cram_ref_decr_locked(refs_t *r, int id) { | |
2083 RP("%d DEC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count-1:-999), id>=0?r->ref_id[id]->seq:(char *)1); | |
2084 | |
2085 if (id < 0 || !r->ref_id[id]->seq) { | |
2086 assert(r->ref_id[id]->count >= 0); | |
2087 return; | |
2088 } | |
2089 | |
2090 if (--r->ref_id[id]->count <= 0) { | |
2091 assert(r->ref_id[id]->count == 0); | |
2092 if (r->last_id >= 0) { | |
2093 if (r->ref_id[r->last_id]->count <= 0 && | |
2094 r->ref_id[r->last_id]->seq) { | |
2095 RP("%d FREE REF %d (%p)\n", gettid(), | |
2096 r->last_id, r->ref_id[r->last_id]->seq); | |
2097 free(r->ref_id[r->last_id]->seq); | |
2098 r->ref_id[r->last_id]->seq = NULL; | |
2099 r->ref_id[r->last_id]->length = 0; | |
2100 } | |
2101 } | |
2102 r->last_id = id; | |
2103 } | |
2104 } | |
2105 | |
2106 void cram_ref_decr(refs_t *r, int id) { | |
2107 pthread_mutex_lock(&r->lock); | |
2108 cram_ref_decr_locked(r, id); | |
2109 pthread_mutex_unlock(&r->lock); | |
2110 } | |
2111 | |
2112 /* | |
2113 * Used by cram_ref_load and cram_ref_get. The file handle will have | |
2114 * already been opened, so we can catch it. The ref_entry *e informs us | |
2115 * of whether this is a multi-line fasta file or a raw MD5 style file. | |
2116 * Either way we create a single contiguous sequence. | |
2117 * | |
2118 * Returns all or part of a reference sequence on success (malloced); | |
2119 * NULL on failure. | |
2120 */ | |
2121 static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) { | |
2122 off_t offset, len; | |
2123 char *seq; | |
2124 | |
2125 if (end < start) | |
2126 end = start; | |
2127 | |
2128 /* | |
2129 * Compute locations in file. This is trivial for the MD5 files, but | |
2130 * is still necessary for the fasta variants. | |
2131 */ | |
2132 offset = e->line_length | |
2133 ? e->offset + (start-1)/e->bases_per_line * e->line_length + | |
2134 (start-1) % e->bases_per_line | |
2135 : start-1; | |
2136 | |
2137 len = (e->line_length | |
2138 ? e->offset + (end-1)/e->bases_per_line * e->line_length + | |
2139 (end-1) % e->bases_per_line | |
2140 : end-1) - offset + 1; | |
2141 | |
2142 if (bgzf_useek(fp, offset, SEEK_SET) < 0) { | |
2143 perror("bgzf_useek() on reference file"); | |
2144 return NULL; | |
2145 } | |
2146 | |
2147 if (len == 0 || !(seq = malloc(len))) { | |
2148 return NULL; | |
2149 } | |
2150 | |
2151 if (len != bgzf_read(fp, seq, len)) { | |
2152 perror("bgzf_read() on reference file"); | |
2153 free(seq); | |
2154 return NULL; | |
2155 } | |
2156 | |
2157 /* Strip white-space if required. */ | |
2158 if (len != end-start+1) { | |
2159 int i, j; | |
2160 char *cp = seq; | |
2161 char *cp_to; | |
2162 | |
2163 for (i = j = 0; i < len; i++) { | |
2164 if (cp[i] >= '!' && cp[i] <= '~') | |
2165 cp[j++] = cp[i] & ~0x20; | |
2166 } | |
2167 cp_to = cp+j; | |
2168 | |
2169 if (cp_to - seq != end-start+1) { | |
2170 fprintf(stderr, "Malformed reference file?\n"); | |
2171 free(seq); | |
2172 return NULL; | |
2173 } | |
2174 } else { | |
2175 int i; | |
2176 for (i = 0; i < len; i++) { | |
2177 seq[i] = seq[i] & ~0x20; // uppercase in ASCII | |
2178 } | |
2179 } | |
2180 | |
2181 return seq; | |
2182 } | |
2183 | |
2184 /* | |
2185 * Load the entire reference 'id'. | |
2186 * This also increments the reference count by 1. | |
2187 * | |
2188 * Returns ref_entry on success; | |
2189 * NULL on failure | |
2190 */ | |
2191 ref_entry *cram_ref_load(refs_t *r, int id) { | |
2192 ref_entry *e = r->ref_id[id]; | |
2193 int start = 1, end = e->length; | |
2194 char *seq; | |
2195 | |
2196 if (e->seq) { | |
2197 return e; | |
2198 } | |
2199 | |
2200 assert(e->count == 0); | |
2201 | |
2202 if (r->last) { | |
2203 #ifdef REF_DEBUG | |
2204 int idx = 0; | |
2205 for (idx = 0; idx < r->nref; idx++) | |
2206 if (r->last == r->ref_id[idx]) | |
2207 break; | |
2208 RP("%d cram_ref_load DECR %d\n", gettid(), idx); | |
2209 #endif | |
2210 assert(r->last->count > 0); | |
2211 if (--r->last->count <= 0) { | |
2212 RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq); | |
2213 if (r->last->seq) { | |
2214 free(r->last->seq); | |
2215 r->last->seq = NULL; | |
2216 } | |
2217 } | |
2218 } | |
2219 | |
2220 /* Open file if it's not already the current open reference */ | |
2221 if (strcmp(r->fn, e->fn) || r->fp == NULL) { | |
2222 if (r->fp) | |
2223 if (bgzf_close(r->fp) != 0) | |
2224 return NULL; | |
2225 r->fn = e->fn; | |
2226 if (!(r->fp = bgzf_open_ref(r->fn, "r"))) | |
2227 return NULL; | |
2228 } | |
2229 | |
2230 RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end); | |
2231 | |
2232 if (!(seq = load_ref_portion(r->fp, e, start, end))) { | |
2233 return NULL; | |
2234 } | |
2235 | |
2236 RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq); | |
2237 | |
2238 RP("%d INC REF %d, %d\n", gettid(), id, (int)(e->count+1)); | |
2239 e->seq = seq; | |
2240 e->count++; | |
2241 | |
2242 /* | |
2243 * Also keep track of last used ref so incr/decr loops on the same | |
2244 * sequence don't cause load/free loops. | |
2245 */ | |
2246 RP("%d cram_ref_load INCR %d => %d\n", gettid(), id, e->count+1); | |
2247 r->last = e; | |
2248 e->count++; | |
2249 | |
2250 return e; | |
2251 } | |
2252 | |
2253 /* | |
2254 * Returns a portion of a reference sequence from start to end inclusive. | |
2255 * The returned pointer is owned by either the cram_file fd or by the | |
2256 * internal refs_t structure and should not be freed by the caller. | |
2257 * | |
2258 * The difference is whether or not this refs_t is in use by just the one | |
2259 * cram_fd or by multiples, or whether we have multiple threads accessing | |
2260 * references. In either case fd->shared will be true and we start using | |
2261 * reference counting to track the number of users of a specific reference | |
2262 * sequence. | |
2263 * | |
2264 * Otherwise the ref seq returned is allocated as part of cram_fd itself | |
2265 * and will be freed up on the next call to cram_get_ref or cram_close. | |
2266 * | |
2267 * To return the entire reference sequence, specify start as 1 and end | |
2268 * as 0. | |
2269 * | |
2270 * To cease using a reference, call cram_ref_decr(). | |
2271 * | |
2272 * Returns reference on success, | |
2273 * NULL on failure | |
2274 */ | |
2275 char *cram_get_ref(cram_fd *fd, int id, int start, int end) { | |
2276 ref_entry *r; | |
2277 char *seq; | |
2278 int ostart = start; | |
2279 | |
2280 if (id == -1) | |
2281 return NULL; | |
2282 | |
2283 /* FIXME: axiomatic query of r->seq being true? | |
2284 * Or shortcut for unsorted data where we load once and never free? | |
2285 */ | |
2286 | |
2287 //fd->shared_ref = 1; // hard code for now to simplify things | |
2288 | |
2289 pthread_mutex_lock(&fd->ref_lock); | |
2290 | |
2291 RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end); | |
2292 | |
2293 /* | |
2294 * Unsorted data implies we want to fetch an entire reference at a time. | |
2295 * We just deal with this at the moment by claiming we're sharing | |
2296 * references instead, which has the same requirement. | |
2297 */ | |
2298 if (fd->unsorted) | |
2299 fd->shared_ref = 1; | |
2300 | |
2301 | |
2302 /* Sanity checking: does this ID exist? */ | |
2303 if (id >= fd->refs->nref) { | |
2304 fprintf(stderr, "No reference found for id %d\n", id); | |
2305 pthread_mutex_unlock(&fd->ref_lock); | |
2306 return NULL; | |
2307 } | |
2308 | |
2309 if (!fd->refs || !fd->refs->ref_id[id]) { | |
2310 fprintf(stderr, "No reference found for id %d\n", id); | |
2311 pthread_mutex_unlock(&fd->ref_lock); | |
2312 return NULL; | |
2313 } | |
2314 | |
2315 if (!(r = fd->refs->ref_id[id])) { | |
2316 fprintf(stderr, "No reference found for id %d\n", id); | |
2317 pthread_mutex_unlock(&fd->ref_lock); | |
2318 return NULL; | |
2319 } | |
2320 | |
2321 | |
2322 /* | |
2323 * It has an entry, but may not have been populated yet. | |
2324 * Any manually loaded .fai files have their lengths known. | |
2325 * A ref entry computed from @SQ lines (M5 or UR field) will have | |
2326 * r->length == 0 unless it's been loaded once and verified that we have | |
2327 * an on-disk filename for it. | |
2328 * | |
2329 * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls | |
2330 * open_path_mfile and libcurl, which isn't multi-thread safe unless I | |
2331 * rewrite my code to have one curl handle per thread. | |
2332 */ | |
2333 pthread_mutex_lock(&fd->refs->lock); | |
2334 if (r->length == 0) { | |
2335 if (cram_populate_ref(fd, id, r) == -1) { | |
2336 fprintf(stderr, "Failed to populate reference for id %d\n", id); | |
2337 pthread_mutex_unlock(&fd->refs->lock); | |
2338 pthread_mutex_unlock(&fd->ref_lock); | |
2339 return NULL; | |
2340 } | |
2341 r = fd->refs->ref_id[id]; | |
2342 if (fd->unsorted) | |
2343 cram_ref_incr_locked(fd->refs, id); | |
2344 } | |
2345 | |
2346 | |
2347 /* | |
2348 * We now know that we the filename containing the reference, so check | |
2349 * for limits. If it's over half the reference we'll load all of it in | |
2350 * memory as this will speed up subsequent calls. | |
2351 */ | |
2352 if (end < 1) | |
2353 end = r->length; | |
2354 if (end >= r->length) | |
2355 end = r->length; | |
2356 assert(start >= 1); | |
2357 | |
2358 if (end - start >= 0.5*r->length || fd->shared_ref) { | |
2359 start = 1; | |
2360 end = r->length; | |
2361 } | |
2362 | |
2363 /* | |
2364 * Maybe we have it cached already? If so use it. | |
2365 * | |
2366 * Alternatively if we don't have the sequence but we're sharing | |
2367 * references and/or are asking for the entire length of it, then | |
2368 * load the full reference into the refs structure and return | |
2369 * a pointer to that one instead. | |
2370 */ | |
2371 if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) { | |
2372 char *cp; | |
2373 | |
2374 if (id >= 0) { | |
2375 if (r->seq) { | |
2376 cram_ref_incr_locked(fd->refs, id); | |
2377 } else { | |
2378 ref_entry *e; | |
2379 if (!(e = cram_ref_load(fd->refs, id))) { | |
2380 pthread_mutex_unlock(&fd->refs->lock); | |
2381 pthread_mutex_unlock(&fd->ref_lock); | |
2382 return NULL; | |
2383 } | |
2384 | |
2385 /* unsorted data implies cache ref indefinitely, to avoid | |
2386 * continually loading and unloading. | |
2387 */ | |
2388 if (fd->unsorted) | |
2389 cram_ref_incr_locked(fd->refs, id); | |
2390 } | |
2391 | |
2392 fd->ref = NULL; /* We never access it directly */ | |
2393 fd->ref_start = 1; | |
2394 fd->ref_end = r->length; | |
2395 fd->ref_id = id; | |
2396 | |
2397 cp = fd->refs->ref_id[id]->seq + ostart-1; | |
2398 } else { | |
2399 fd->ref = NULL; | |
2400 cp = NULL; | |
2401 } | |
2402 | |
2403 RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count); | |
2404 | |
2405 pthread_mutex_unlock(&fd->refs->lock); | |
2406 pthread_mutex_unlock(&fd->ref_lock); | |
2407 return cp; | |
2408 } | |
2409 | |
2410 /* | |
2411 * Otherwise we're not sharing, we don't have a copy of it already and | |
2412 * we're only asking for a small portion of it. | |
2413 * | |
2414 * In this case load up just that segment ourselves, freeing any old | |
2415 * small segments in the process. | |
2416 */ | |
2417 | |
2418 /* Unmapped ref ID */ | |
2419 if (id < 0) { | |
2420 if (fd->ref_free) { | |
2421 free(fd->ref_free); | |
2422 fd->ref_free = NULL; | |
2423 } | |
2424 fd->ref = NULL; | |
2425 fd->ref_id = id; | |
2426 pthread_mutex_unlock(&fd->refs->lock); | |
2427 pthread_mutex_unlock(&fd->ref_lock); | |
2428 return NULL; | |
2429 } | |
2430 | |
2431 /* Open file if it's not already the current open reference */ | |
2432 if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) { | |
2433 if (fd->refs->fp) | |
2434 if (bgzf_close(fd->refs->fp) != 0) | |
2435 return NULL; | |
2436 fd->refs->fn = r->fn; | |
2437 if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r"))) { | |
2438 pthread_mutex_unlock(&fd->refs->lock); | |
2439 pthread_mutex_unlock(&fd->ref_lock); | |
2440 return NULL; | |
2441 } | |
2442 } | |
2443 | |
2444 if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) { | |
2445 pthread_mutex_unlock(&fd->refs->lock); | |
2446 pthread_mutex_unlock(&fd->ref_lock); | |
2447 return NULL; | |
2448 } | |
2449 | |
2450 if (fd->ref_free) | |
2451 free(fd->ref_free); | |
2452 | |
2453 fd->ref_id = id; | |
2454 fd->ref_start = start; | |
2455 fd->ref_end = end; | |
2456 fd->ref_free = fd->ref; | |
2457 seq = fd->ref; | |
2458 | |
2459 pthread_mutex_unlock(&fd->refs->lock); | |
2460 pthread_mutex_unlock(&fd->ref_lock); | |
2461 | |
2462 return seq + ostart - start; | |
2463 } | |
2464 | |
2465 /* | |
2466 * If fd has been opened for reading, it may be permitted to specify 'fn' | |
2467 * as NULL and let the code auto-detect the reference by parsing the | |
2468 * SAM header @SQ lines. | |
2469 */ | |
2470 int cram_load_reference(cram_fd *fd, char *fn) { | |
2471 if (fn) { | |
2472 fd->refs = refs_load_fai(fd->refs, fn, | |
2473 !(fd->embed_ref && fd->mode == 'r')); | |
2474 fn = fd->refs ? fd->refs->fn : NULL; | |
2475 } | |
2476 fd->ref_fn = fn; | |
2477 | |
2478 if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) { | |
2479 if (fd->refs) | |
2480 refs_free(fd->refs); | |
2481 if (!(fd->refs = refs_create())) | |
2482 return -1; | |
2483 if (-1 == refs_from_header(fd->refs, fd, fd->header)) | |
2484 return -1; | |
2485 } | |
2486 | |
2487 if (fd->header) | |
2488 if (-1 == refs2id(fd->refs, fd->header)) | |
2489 return -1; | |
2490 | |
2491 return fn ? 0 : -1; | |
2492 } | |
2493 | |
2494 /* ---------------------------------------------------------------------- | |
2495 * Containers | |
2496 */ | |
2497 | |
2498 /* | |
2499 * Creates a new container, specifying the maximum number of slices | |
2500 * and records permitted. | |
2501 * | |
2502 * Returns cram_container ptr on success | |
2503 * NULL on failure | |
2504 */ | |
2505 cram_container *cram_new_container(int nrec, int nslice) { | |
2506 cram_container *c = calloc(1, sizeof(*c)); | |
2507 enum cram_DS_ID id; | |
2508 | |
2509 if (!c) | |
2510 return NULL; | |
2511 | |
2512 c->curr_ref = -2; | |
2513 | |
2514 c->max_c_rec = nrec * nslice; | |
2515 c->curr_c_rec = 0; | |
2516 | |
2517 c->max_rec = nrec; | |
2518 c->record_counter = 0; | |
2519 c->num_bases = 0; | |
2520 | |
2521 c->max_slice = nslice; | |
2522 c->curr_slice = 0; | |
2523 | |
2524 c->pos_sorted = 1; | |
2525 c->max_apos = 0; | |
2526 c->multi_seq = 0; | |
2527 | |
2528 c->bams = NULL; | |
2529 | |
2530 if (!(c->slices = (cram_slice **)calloc(nslice, sizeof(cram_slice *)))) | |
2531 goto err; | |
2532 c->slice = NULL; | |
2533 | |
2534 if (!(c->comp_hdr = cram_new_compression_header())) | |
2535 goto err; | |
2536 c->comp_hdr_block = NULL; | |
2537 | |
2538 for (id = DS_RN; id < DS_TN; id++) | |
2539 if (!(c->stats[id] = cram_stats_create())) goto err; | |
2540 | |
2541 //c->aux_B_stats = cram_stats_create(); | |
2542 | |
2543 if (!(c->tags_used = kh_init(s_i2i))) | |
2544 goto err; | |
2545 c->refs_used = 0; | |
2546 | |
2547 return c; | |
2548 | |
2549 err: | |
2550 if (c) { | |
2551 if (c->slices) | |
2552 free(c->slices); | |
2553 free(c); | |
2554 } | |
2555 return NULL; | |
2556 } | |
2557 | |
2558 void cram_free_container(cram_container *c) { | |
2559 enum cram_DS_ID id; | |
2560 int i; | |
2561 | |
2562 if (!c) | |
2563 return; | |
2564 | |
2565 if (c->refs_used) | |
2566 free(c->refs_used); | |
2567 | |
2568 if (c->landmark) | |
2569 free(c->landmark); | |
2570 | |
2571 if (c->comp_hdr) | |
2572 cram_free_compression_header(c->comp_hdr); | |
2573 | |
2574 if (c->comp_hdr_block) | |
2575 cram_free_block(c->comp_hdr_block); | |
2576 | |
2577 if (c->slices) { | |
2578 for (i = 0; i < c->max_slice; i++) | |
2579 if (c->slices[i]) | |
2580 cram_free_slice(c->slices[i]); | |
2581 free(c->slices); | |
2582 } | |
2583 | |
2584 for (id = DS_RN; id < DS_TN; id++) | |
2585 if (c->stats[id]) cram_stats_free(c->stats[id]); | |
2586 | |
2587 //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats); | |
2588 | |
2589 if (c->tags_used) kh_destroy(s_i2i, c->tags_used); | |
2590 | |
2591 free(c); | |
2592 } | |
2593 | |
2594 /* | |
2595 * Reads a container header. | |
2596 * | |
2597 * Returns cram_container on success | |
2598 * NULL on failure or no container left (fd->err == 0). | |
2599 */ | |
2600 cram_container *cram_read_container(cram_fd *fd) { | |
2601 cram_container c2, *c; | |
2602 int i, s; | |
2603 size_t rd = 0; | |
2604 | |
2605 fd->err = 0; | |
2606 fd->eof = 0; | |
2607 | |
2608 memset(&c2, 0, sizeof(c2)); | |
2609 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
2610 if ((s = itf8_decode(fd, &c2.length)) == -1) { | |
2611 fd->eof = fd->empty_container ? 1 : 2; | |
2612 return NULL; | |
2613 } else { | |
2614 rd+=s; | |
2615 } | |
2616 } else { | |
2617 if ((s = int32_decode(fd, &c2.length)) == -1) { | |
2618 if (CRAM_MAJOR_VERS(fd->version) == 2 && | |
2619 CRAM_MINOR_VERS(fd->version) == 0) | |
2620 fd->eof = 1; // EOF blocks arrived in v2.1 | |
2621 else | |
2622 fd->eof = fd->empty_container ? 1 : 2; | |
2623 return NULL; | |
2624 } else { | |
2625 rd+=s; | |
2626 } | |
2627 } | |
2628 if ((s = itf8_decode(fd, &c2.ref_seq_id)) == -1) return NULL; else rd+=s; | |
2629 if ((s = itf8_decode(fd, &c2.ref_seq_start))== -1) return NULL; else rd+=s; | |
2630 if ((s = itf8_decode(fd, &c2.ref_seq_span)) == -1) return NULL; else rd+=s; | |
2631 if ((s = itf8_decode(fd, &c2.num_records)) == -1) return NULL; else rd+=s; | |
2632 | |
2633 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
2634 c2.record_counter = 0; | |
2635 c2.num_bases = 0; | |
2636 } else { | |
2637 if (CRAM_MAJOR_VERS(fd->version) >= 3) { | |
2638 if ((s = ltf8_decode(fd, &c2.record_counter)) == -1) | |
2639 return NULL; | |
2640 else | |
2641 rd += s; | |
2642 } else { | |
2643 int32_t i32; | |
2644 if ((s = itf8_decode(fd, &i32)) == -1) | |
2645 return NULL; | |
2646 else | |
2647 rd += s; | |
2648 c2.record_counter = i32; | |
2649 } | |
2650 | |
2651 if ((s = ltf8_decode(fd, &c2.num_bases))== -1) | |
2652 return NULL; | |
2653 else | |
2654 rd += s; | |
2655 } | |
2656 if ((s = itf8_decode(fd, &c2.num_blocks)) == -1) return NULL; else rd+=s; | |
2657 if ((s = itf8_decode(fd, &c2.num_landmarks))== -1) return NULL; else rd+=s; | |
2658 | |
2659 if (!(c = calloc(1, sizeof(*c)))) | |
2660 return NULL; | |
2661 | |
2662 *c = c2; | |
2663 | |
2664 if (!(c->landmark = malloc(c->num_landmarks * sizeof(int32_t))) && | |
2665 c->num_landmarks) { | |
2666 fd->err = errno; | |
2667 cram_free_container(c); | |
2668 return NULL; | |
2669 } | |
2670 for (i = 0; i < c->num_landmarks; i++) { | |
2671 if ((s = itf8_decode(fd, &c->landmark[i])) == -1) { | |
2672 cram_free_container(c); | |
2673 return NULL; | |
2674 } else { | |
2675 rd += s; | |
2676 } | |
2677 } | |
2678 | |
2679 if (CRAM_MAJOR_VERS(fd->version) >= 3) { | |
2680 uint32_t crc, i; | |
2681 unsigned char *dat = malloc(50 + 5*(c->num_landmarks)), *cp = dat; | |
2682 if (!dat) { | |
2683 cram_free_container(c); | |
2684 return NULL; | |
2685 } | |
2686 if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) | |
2687 return NULL; | |
2688 else | |
2689 rd+=4; | |
2690 | |
2691 /* Reencode first as we can't easily access the original byte stream. | |
2692 * | |
2693 * FIXME: Technically this means this may not be fool proof. We could | |
2694 * create a CRAM file using a 2 byte ITF8 value that can fit in a | |
2695 * 1 byte field, meaning the encoding is different to the original | |
2696 * form and so has a different CRC. | |
2697 * | |
2698 * The correct implementation would be to have an alternative form | |
2699 * of itf8_decode which also squirrels away the raw byte stream | |
2700 * during decoding so we can then CRC that. | |
2701 */ | |
2702 *(unsigned int *)cp = le_int4(c->length); cp += 4; | |
2703 cp += itf8_put(cp, c->ref_seq_id); | |
2704 cp += itf8_put(cp, c->ref_seq_start); | |
2705 cp += itf8_put(cp, c->ref_seq_span); | |
2706 cp += itf8_put(cp, c->num_records); | |
2707 cp += ltf8_put((char *)cp, c->record_counter); | |
2708 cp += itf8_put(cp, c->num_bases); | |
2709 cp += itf8_put(cp, c->num_blocks); | |
2710 cp += itf8_put(cp, c->num_landmarks); | |
2711 for (i = 0; i < c->num_landmarks; i++) { | |
2712 cp += itf8_put(cp, c->landmark[i]); | |
2713 } | |
2714 | |
2715 crc = crc32(0L, dat, cp-dat); | |
2716 if (crc != c->crc32) { | |
2717 fprintf(stderr, "Container header CRC32 failure\n"); | |
2718 cram_free_container(c); | |
2719 return NULL; | |
2720 } | |
2721 } | |
2722 | |
2723 c->offset = rd; | |
2724 c->slices = NULL; | |
2725 c->curr_slice = 0; | |
2726 c->max_slice = c->num_landmarks; | |
2727 c->slice_rec = 0; | |
2728 c->curr_rec = 0; | |
2729 c->max_rec = 0; | |
2730 | |
2731 if (c->ref_seq_id == -2) { | |
2732 c->multi_seq = 1; | |
2733 fd->multi_seq = 1; | |
2734 } | |
2735 | |
2736 fd->empty_container = | |
2737 (c->num_records == 0 && | |
2738 c->ref_seq_id == -1 && | |
2739 c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0; | |
2740 | |
2741 return c; | |
2742 } | |
2743 | |
2744 /* | |
2745 * Writes a container structure. | |
2746 * | |
2747 * Returns 0 on success | |
2748 * -1 on failure | |
2749 */ | |
2750 int cram_write_container(cram_fd *fd, cram_container *c) { | |
2751 char buf_a[1024], *buf = buf_a, *cp; | |
2752 int i; | |
2753 | |
2754 if (55 + c->num_landmarks * 5 >= 1024) | |
2755 buf = malloc(55 + c->num_landmarks * 5); | |
2756 cp = buf; | |
2757 | |
2758 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
2759 cp += itf8_put(cp, c->length); | |
2760 } else { | |
2761 *(int32_t *)cp = le_int4(c->length); | |
2762 cp += 4; | |
2763 } | |
2764 if (c->multi_seq) { | |
2765 cp += itf8_put(cp, -2); | |
2766 cp += itf8_put(cp, 0); | |
2767 cp += itf8_put(cp, 0); | |
2768 } else { | |
2769 cp += itf8_put(cp, c->ref_seq_id); | |
2770 cp += itf8_put(cp, c->ref_seq_start); | |
2771 cp += itf8_put(cp, c->ref_seq_span); | |
2772 } | |
2773 cp += itf8_put(cp, c->num_records); | |
2774 if (CRAM_MAJOR_VERS(fd->version) == 2) { | |
2775 cp += itf8_put(cp, c->record_counter); | |
2776 cp += ltf8_put(cp, c->num_bases); | |
2777 } else if (CRAM_MAJOR_VERS(fd->version) >= 3) { | |
2778 cp += ltf8_put(cp, c->record_counter); | |
2779 cp += ltf8_put(cp, c->num_bases); | |
2780 } | |
2781 | |
2782 cp += itf8_put(cp, c->num_blocks); | |
2783 cp += itf8_put(cp, c->num_landmarks); | |
2784 for (i = 0; i < c->num_landmarks; i++) | |
2785 cp += itf8_put(cp, c->landmark[i]); | |
2786 | |
2787 if (CRAM_MAJOR_VERS(fd->version) >= 3) { | |
2788 c->crc32 = crc32(0L, (uc *)buf, cp-buf); | |
2789 cp[0] = c->crc32 & 0xff; | |
2790 cp[1] = (c->crc32 >> 8) & 0xff; | |
2791 cp[2] = (c->crc32 >> 16) & 0xff; | |
2792 cp[3] = (c->crc32 >> 24) & 0xff; | |
2793 cp += 4; | |
2794 } | |
2795 | |
2796 if (cp-buf != hwrite(fd->fp, buf, cp-buf)) { | |
2797 if (buf != buf_a) | |
2798 free(buf); | |
2799 return -1; | |
2800 } | |
2801 | |
2802 if (buf != buf_a) | |
2803 free(buf); | |
2804 | |
2805 return 0; | |
2806 } | |
2807 | |
2808 // common component shared by cram_flush_container{,_mt} | |
2809 static int cram_flush_container2(cram_fd *fd, cram_container *c) { | |
2810 int i, j; | |
2811 | |
2812 //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum); | |
2813 | |
2814 /* Write the container struct itself */ | |
2815 if (0 != cram_write_container(fd, c)) | |
2816 return -1; | |
2817 | |
2818 /* And the compression header */ | |
2819 if (0 != cram_write_block(fd, c->comp_hdr_block)) | |
2820 return -1; | |
2821 | |
2822 /* Followed by the slice blocks */ | |
2823 for (i = 0; i < c->curr_slice; i++) { | |
2824 cram_slice *s = c->slices[i]; | |
2825 | |
2826 if (0 != cram_write_block(fd, s->hdr_block)) | |
2827 return -1; | |
2828 | |
2829 for (j = 0; j < s->hdr->num_blocks; j++) { | |
2830 if (0 != cram_write_block(fd, s->block[j])) | |
2831 return -1; | |
2832 } | |
2833 } | |
2834 | |
2835 return hflush(fd->fp) == 0 ? 0 : -1; | |
2836 } | |
2837 | |
2838 /* | |
2839 * Flushes a completely or partially full container to disk, writing | |
2840 * container structure, header and blocks. This also calls the encoder | |
2841 * functions. | |
2842 * | |
2843 * Returns 0 on success | |
2844 * -1 on failure | |
2845 */ | |
2846 int cram_flush_container(cram_fd *fd, cram_container *c) { | |
2847 /* Encode the container blocks and generate compression header */ | |
2848 if (0 != cram_encode_container(fd, c)) | |
2849 return -1; | |
2850 | |
2851 return cram_flush_container2(fd, c); | |
2852 } | |
2853 | |
2854 typedef struct { | |
2855 cram_fd *fd; | |
2856 cram_container *c; | |
2857 } cram_job; | |
2858 | |
2859 void *cram_flush_thread(void *arg) { | |
2860 cram_job *j = (cram_job *)arg; | |
2861 | |
2862 /* Encode the container blocks and generate compression header */ | |
2863 if (0 != cram_encode_container(j->fd, j->c)) { | |
2864 fprintf(stderr, "cram_encode_container failed\n"); | |
2865 return NULL; | |
2866 } | |
2867 | |
2868 return arg; | |
2869 } | |
2870 | |
2871 static int cram_flush_result(cram_fd *fd) { | |
2872 int i, ret = 0; | |
2873 t_pool_result *r; | |
2874 | |
2875 while ((r = t_pool_next_result(fd->rqueue))) { | |
2876 cram_job *j = (cram_job *)r->data; | |
2877 cram_container *c; | |
2878 | |
2879 if (!j) { | |
2880 t_pool_delete_result(r, 0); | |
2881 return -1; | |
2882 } | |
2883 | |
2884 fd = j->fd; | |
2885 c = j->c; | |
2886 | |
2887 if (0 != cram_flush_container2(fd, c)) | |
2888 return -1; | |
2889 | |
2890 /* Free the container */ | |
2891 for (i = 0; i < c->max_slice; i++) { | |
2892 cram_free_slice(c->slices[i]); | |
2893 c->slices[i] = NULL; | |
2894 } | |
2895 | |
2896 c->slice = NULL; | |
2897 c->curr_slice = 0; | |
2898 | |
2899 cram_free_container(c); | |
2900 | |
2901 ret |= hflush(fd->fp) == 0 ? 0 : -1; | |
2902 | |
2903 t_pool_delete_result(r, 1); | |
2904 } | |
2905 | |
2906 return ret; | |
2907 } | |
2908 | |
2909 int cram_flush_container_mt(cram_fd *fd, cram_container *c) { | |
2910 cram_job *j; | |
2911 | |
2912 if (!fd->pool) | |
2913 return cram_flush_container(fd, c); | |
2914 | |
2915 if (!(j = malloc(sizeof(*j)))) | |
2916 return -1; | |
2917 j->fd = fd; | |
2918 j->c = c; | |
2919 | |
2920 t_pool_dispatch(fd->pool, fd->rqueue, cram_flush_thread, j); | |
2921 | |
2922 return cram_flush_result(fd); | |
2923 } | |
2924 | |
2925 /* ---------------------------------------------------------------------- | |
2926 * Compression headers; the first part of the container | |
2927 */ | |
2928 | |
2929 /* | |
2930 * Creates a new blank container compression header | |
2931 * | |
2932 * Returns header ptr on success | |
2933 * NULL on failure | |
2934 */ | |
2935 cram_block_compression_hdr *cram_new_compression_header(void) { | |
2936 cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr)); | |
2937 if (!hdr) | |
2938 return NULL; | |
2939 | |
2940 if (!(hdr->TD_blk = cram_new_block(CORE, 0))) { | |
2941 free(hdr); | |
2942 return NULL; | |
2943 } | |
2944 | |
2945 if (!(hdr->TD_hash = kh_init(m_s2i))) { | |
2946 cram_free_block(hdr->TD_blk); | |
2947 free(hdr); | |
2948 return NULL; | |
2949 } | |
2950 | |
2951 if (!(hdr->TD_keys = string_pool_create(8192))) { | |
2952 kh_destroy(m_s2i, hdr->TD_hash); | |
2953 cram_free_block(hdr->TD_blk); | |
2954 free(hdr); | |
2955 return NULL; | |
2956 } | |
2957 | |
2958 return hdr; | |
2959 } | |
2960 | |
2961 void cram_free_compression_header(cram_block_compression_hdr *hdr) { | |
2962 int i; | |
2963 | |
2964 if (hdr->landmark) | |
2965 free(hdr->landmark); | |
2966 | |
2967 if (hdr->preservation_map) | |
2968 kh_destroy(map, hdr->preservation_map); | |
2969 | |
2970 for (i = 0; i < CRAM_MAP_HASH; i++) { | |
2971 cram_map *m, *m2; | |
2972 for (m = hdr->rec_encoding_map[i]; m; m = m2) { | |
2973 m2 = m->next; | |
2974 if (m->codec) | |
2975 m->codec->free(m->codec); | |
2976 free(m); | |
2977 } | |
2978 } | |
2979 | |
2980 for (i = 0; i < CRAM_MAP_HASH; i++) { | |
2981 cram_map *m, *m2; | |
2982 for (m = hdr->tag_encoding_map[i]; m; m = m2) { | |
2983 m2 = m->next; | |
2984 if (m->codec) | |
2985 m->codec->free(m->codec); | |
2986 free(m); | |
2987 } | |
2988 } | |
2989 | |
2990 for (i = 0; i < DS_END; i++) { | |
2991 if (hdr->codecs[i]) | |
2992 hdr->codecs[i]->free(hdr->codecs[i]); | |
2993 } | |
2994 | |
2995 if (hdr->TL) | |
2996 free(hdr->TL); | |
2997 if (hdr->TD_blk) | |
2998 cram_free_block(hdr->TD_blk); | |
2999 if (hdr->TD_hash) | |
3000 kh_destroy(m_s2i, hdr->TD_hash); | |
3001 if (hdr->TD_keys) | |
3002 string_pool_destroy(hdr->TD_keys); | |
3003 | |
3004 free(hdr); | |
3005 } | |
3006 | |
3007 | |
3008 /* ---------------------------------------------------------------------- | |
3009 * Slices and slice headers | |
3010 */ | |
3011 | |
3012 void cram_free_slice_header(cram_block_slice_hdr *hdr) { | |
3013 if (!hdr) | |
3014 return; | |
3015 | |
3016 if (hdr->block_content_ids) | |
3017 free(hdr->block_content_ids); | |
3018 | |
3019 free(hdr); | |
3020 | |
3021 return; | |
3022 } | |
3023 | |
3024 void cram_free_slice(cram_slice *s) { | |
3025 if (!s) | |
3026 return; | |
3027 | |
3028 if (s->hdr_block) | |
3029 cram_free_block(s->hdr_block); | |
3030 | |
3031 if (s->block) { | |
3032 int i; | |
3033 | |
3034 if (s->hdr) { | |
3035 for (i = 0; i < s->hdr->num_blocks; i++) { | |
3036 cram_free_block(s->block[i]); | |
3037 } | |
3038 } | |
3039 free(s->block); | |
3040 } | |
3041 | |
3042 if (s->block_by_id) | |
3043 free(s->block_by_id); | |
3044 | |
3045 if (s->hdr) | |
3046 cram_free_slice_header(s->hdr); | |
3047 | |
3048 if (s->seqs_blk) | |
3049 cram_free_block(s->seqs_blk); | |
3050 | |
3051 if (s->qual_blk) | |
3052 cram_free_block(s->qual_blk); | |
3053 | |
3054 if (s->name_blk) | |
3055 cram_free_block(s->name_blk); | |
3056 | |
3057 if (s->aux_blk) | |
3058 cram_free_block(s->aux_blk); | |
3059 | |
3060 if (s->aux_OQ_blk) | |
3061 cram_free_block(s->aux_OQ_blk); | |
3062 | |
3063 if (s->aux_BQ_blk) | |
3064 cram_free_block(s->aux_BQ_blk); | |
3065 | |
3066 if (s->aux_FZ_blk) | |
3067 cram_free_block(s->aux_FZ_blk); | |
3068 | |
3069 if (s->aux_oq_blk) | |
3070 cram_free_block(s->aux_oq_blk); | |
3071 | |
3072 if (s->aux_os_blk) | |
3073 cram_free_block(s->aux_os_blk); | |
3074 | |
3075 if (s->aux_oz_blk) | |
3076 cram_free_block(s->aux_oz_blk); | |
3077 | |
3078 if (s->base_blk) | |
3079 cram_free_block(s->base_blk); | |
3080 | |
3081 if (s->soft_blk) | |
3082 cram_free_block(s->soft_blk); | |
3083 | |
3084 if (s->cigar) | |
3085 free(s->cigar); | |
3086 | |
3087 if (s->crecs) | |
3088 free(s->crecs); | |
3089 | |
3090 if (s->features) | |
3091 free(s->features); | |
3092 | |
3093 if (s->TN) | |
3094 free(s->TN); | |
3095 | |
3096 if (s->pair_keys) | |
3097 string_pool_destroy(s->pair_keys); | |
3098 | |
3099 if (s->pair[0]) | |
3100 kh_destroy(m_s2i, s->pair[0]); | |
3101 if (s->pair[1]) | |
3102 kh_destroy(m_s2i, s->pair[1]); | |
3103 | |
3104 free(s); | |
3105 } | |
3106 | |
3107 /* | |
3108 * Creates a new empty slice in memory, for subsequent writing to | |
3109 * disk. | |
3110 * | |
3111 * Returns cram_slice ptr on success | |
3112 * NULL on failure | |
3113 */ | |
3114 cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) { | |
3115 cram_slice *s = calloc(1, sizeof(*s)); | |
3116 if (!s) | |
3117 return NULL; | |
3118 | |
3119 if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr)))) | |
3120 goto err; | |
3121 s->hdr->content_type = type; | |
3122 | |
3123 s->hdr_block = NULL; | |
3124 s->block = NULL; | |
3125 s->block_by_id = NULL; | |
3126 s->last_apos = 0; | |
3127 if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err; | |
3128 s->cigar = NULL; | |
3129 s->cigar_alloc = 0; | |
3130 s->ncigar = 0; | |
3131 | |
3132 if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; | |
3133 if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err; | |
3134 if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err; | |
3135 if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err; | |
3136 if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err; | |
3137 if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err; | |
3138 | |
3139 s->features = NULL; | |
3140 s->nfeatures = s->afeatures = 0; | |
3141 | |
3142 #ifndef TN_external | |
3143 s->TN = NULL; | |
3144 s->nTN = s->aTN = 0; | |
3145 #endif | |
3146 | |
3147 // Volatile keys as we do realloc in dstring | |
3148 if (!(s->pair_keys = string_pool_create(8192))) goto err; | |
3149 if (!(s->pair[0] = kh_init(m_s2i))) goto err; | |
3150 if (!(s->pair[1] = kh_init(m_s2i))) goto err; | |
3151 | |
3152 #ifdef BA_external | |
3153 s->BA_len = 0; | |
3154 #endif | |
3155 | |
3156 return s; | |
3157 | |
3158 err: | |
3159 if (s) | |
3160 cram_free_slice(s); | |
3161 | |
3162 return NULL; | |
3163 } | |
3164 | |
3165 /* | |
3166 * Loads an entire slice. | |
3167 * FIXME: In 1.0 the native unit of slices within CRAM is broken | |
3168 * as slices contain references to objects in other slices. | |
3169 * To work around this while keeping the slice oriented outer loop | |
3170 * we read all slices and stitch them together into a fake large | |
3171 * slice instead. | |
3172 * | |
3173 * Returns cram_slice ptr on success | |
3174 * NULL on failure | |
3175 */ | |
3176 cram_slice *cram_read_slice(cram_fd *fd) { | |
3177 cram_block *b = cram_read_block(fd); | |
3178 cram_slice *s = calloc(1, sizeof(*s)); | |
3179 int i, n, max_id, min_id; | |
3180 | |
3181 if (!b || !s) | |
3182 goto err; | |
3183 | |
3184 s->hdr_block = b; | |
3185 switch (b->content_type) { | |
3186 case MAPPED_SLICE: | |
3187 case UNMAPPED_SLICE: | |
3188 if (!(s->hdr = cram_decode_slice_header(fd, b))) | |
3189 goto err; | |
3190 break; | |
3191 | |
3192 default: | |
3193 fprintf(stderr, "Unexpected block of type %s\n", | |
3194 cram_content_type2str(b->content_type)); | |
3195 goto err; | |
3196 } | |
3197 | |
3198 s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block)); | |
3199 if (!s->block) | |
3200 goto err; | |
3201 | |
3202 for (max_id = i = 0, min_id = INT_MAX; i < n; i++) { | |
3203 if (!(s->block[i] = cram_read_block(fd))) | |
3204 goto err; | |
3205 | |
3206 if (s->block[i]->content_type == EXTERNAL) { | |
3207 if (max_id < s->block[i]->content_id) | |
3208 max_id = s->block[i]->content_id; | |
3209 if (min_id > s->block[i]->content_id) | |
3210 min_id = s->block[i]->content_id; | |
3211 } | |
3212 } | |
3213 if (min_id >= 0 && max_id < 1024) { | |
3214 if (!(s->block_by_id = calloc(1024, sizeof(s->block[0])))) | |
3215 goto err; | |
3216 | |
3217 for (i = 0; i < n; i++) { | |
3218 if (s->block[i]->content_type != EXTERNAL) | |
3219 continue; | |
3220 s->block_by_id[s->block[i]->content_id] = s->block[i]; | |
3221 } | |
3222 } | |
3223 | |
3224 /* Initialise encoding/decoding tables */ | |
3225 s->cigar = NULL; | |
3226 s->cigar_alloc = 0; | |
3227 s->ncigar = 0; | |
3228 | |
3229 if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; | |
3230 if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err; | |
3231 if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err; | |
3232 if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err; | |
3233 if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err; | |
3234 if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err; | |
3235 | |
3236 s->crecs = NULL; | |
3237 | |
3238 s->last_apos = s->hdr->ref_seq_start; | |
3239 | |
3240 return s; | |
3241 | |
3242 err: | |
3243 if (b) | |
3244 cram_free_block(b); | |
3245 if (s) { | |
3246 s->hdr_block = NULL; | |
3247 cram_free_slice(s); | |
3248 } | |
3249 return NULL; | |
3250 } | |
3251 | |
3252 | |
3253 /* ---------------------------------------------------------------------- | |
3254 * CRAM file definition (header) | |
3255 */ | |
3256 | |
3257 /* | |
3258 * Reads a CRAM file definition structure. | |
3259 * Returns file_def ptr on success | |
3260 * NULL on failure | |
3261 */ | |
3262 cram_file_def *cram_read_file_def(cram_fd *fd) { | |
3263 cram_file_def *def = malloc(sizeof(*def)); | |
3264 if (!def) | |
3265 return NULL; | |
3266 | |
3267 if (26 != hread(fd->fp, &def->magic[0], 26)) { | |
3268 free(def); | |
3269 return NULL; | |
3270 } | |
3271 | |
3272 if (memcmp(def->magic, "CRAM", 4) != 0) { | |
3273 free(def); | |
3274 return NULL; | |
3275 } | |
3276 | |
3277 if (def->major_version > 3) { | |
3278 fprintf(stderr, "CRAM version number mismatch\n" | |
3279 "Expected 1.x, 2.x or 3.x, got %d.%d\n", | |
3280 def->major_version, def->minor_version); | |
3281 free(def); | |
3282 return NULL; | |
3283 } | |
3284 | |
3285 fd->first_container += 26; | |
3286 fd->last_slice = 0; | |
3287 | |
3288 return def; | |
3289 } | |
3290 | |
3291 /* | |
3292 * Writes a cram_file_def structure to cram_fd. | |
3293 * Returns 0 on success | |
3294 * -1 on failure | |
3295 */ | |
3296 int cram_write_file_def(cram_fd *fd, cram_file_def *def) { | |
3297 return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1; | |
3298 } | |
3299 | |
3300 void cram_free_file_def(cram_file_def *def) { | |
3301 if (def) free(def); | |
3302 } | |
3303 | |
3304 /* ---------------------------------------------------------------------- | |
3305 * SAM header I/O | |
3306 */ | |
3307 | |
3308 | |
3309 /* | |
3310 * Reads the SAM header from the first CRAM data block. | |
3311 * Also performs minimal parsing to extract read-group | |
3312 * and sample information. | |
3313 | |
3314 * Returns SAM hdr ptr on success | |
3315 * NULL on failure | |
3316 */ | |
3317 SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) { | |
3318 int32_t header_len; | |
3319 char *header; | |
3320 SAM_hdr *hdr; | |
3321 | |
3322 /* 1.1 onwards stores the header in the first block of a container */ | |
3323 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
3324 /* Length */ | |
3325 if (-1 == int32_decode(fd, &header_len)) | |
3326 return NULL; | |
3327 | |
3328 /* Alloc and read */ | |
3329 if (NULL == (header = malloc(header_len+1))) | |
3330 return NULL; | |
3331 | |
3332 *header = 0; | |
3333 if (header_len != hread(fd->fp, header, header_len)) | |
3334 return NULL; | |
3335 | |
3336 fd->first_container += 4 + header_len; | |
3337 } else { | |
3338 cram_container *c = cram_read_container(fd); | |
3339 cram_block *b; | |
3340 int i, len; | |
3341 | |
3342 if (!c) | |
3343 return NULL; | |
3344 | |
3345 if (c->num_blocks < 1) { | |
3346 cram_free_container(c); | |
3347 return NULL; | |
3348 } | |
3349 | |
3350 if (!(b = cram_read_block(fd))) { | |
3351 cram_free_container(c); | |
3352 return NULL; | |
3353 } | |
3354 cram_uncompress_block(b); | |
3355 | |
3356 len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + | |
3357 itf8_size(b->content_id) + | |
3358 itf8_size(b->uncomp_size) + | |
3359 itf8_size(b->comp_size); | |
3360 | |
3361 /* Extract header from 1st block */ | |
3362 if (-1 == int32_get(b, &header_len) || | |
3363 b->uncomp_size - 4 < header_len) { | |
3364 cram_free_container(c); | |
3365 cram_free_block(b); | |
3366 return NULL; | |
3367 } | |
3368 if (NULL == (header = malloc(header_len+1))) { | |
3369 cram_free_container(c); | |
3370 cram_free_block(b); | |
3371 return NULL; | |
3372 } | |
3373 memcpy(header, BLOCK_END(b), header_len); | |
3374 header[header_len]='\0'; | |
3375 cram_free_block(b); | |
3376 | |
3377 /* Consume any remaining blocks */ | |
3378 for (i = 1; i < c->num_blocks; i++) { | |
3379 if (!(b = cram_read_block(fd))) { | |
3380 cram_free_container(c); | |
3381 return NULL; | |
3382 } | |
3383 len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + | |
3384 itf8_size(b->content_id) + | |
3385 itf8_size(b->uncomp_size) + | |
3386 itf8_size(b->comp_size); | |
3387 cram_free_block(b); | |
3388 } | |
3389 | |
3390 if (c->length && c->length > len) { | |
3391 // Consume padding | |
3392 char *pads = malloc(c->length - len); | |
3393 if (!pads) { | |
3394 cram_free_container(c); | |
3395 return NULL; | |
3396 } | |
3397 | |
3398 if (c->length - len != hread(fd->fp, pads, c->length - len)) { | |
3399 cram_free_container(c); | |
3400 return NULL; | |
3401 } | |
3402 free(pads); | |
3403 } | |
3404 | |
3405 cram_free_container(c); | |
3406 } | |
3407 | |
3408 /* Parse */ | |
3409 hdr = sam_hdr_parse_(header, header_len); | |
3410 free(header); | |
3411 | |
3412 return hdr; | |
3413 } | |
3414 | |
3415 /* | |
3416 * Converts 'in' to a full pathname to store in out. | |
3417 * Out must be at least PATH_MAX bytes long. | |
3418 */ | |
3419 static void full_path(char *out, char *in) { | |
3420 if (*in == '/') { | |
3421 strncpy(out, in, PATH_MAX); | |
3422 out[PATH_MAX-1] = 0; | |
3423 } else { | |
3424 int len; | |
3425 | |
3426 // unable to get dir or out+in is too long | |
3427 if (!getcwd(out, PATH_MAX) || | |
3428 (len = strlen(out))+1+strlen(in) >= PATH_MAX) { | |
3429 strncpy(out, in, PATH_MAX); | |
3430 out[PATH_MAX-1] = 0; | |
3431 return; | |
3432 } | |
3433 | |
3434 sprintf(out+len, "/%.*s", PATH_MAX - len, in); | |
3435 | |
3436 // FIXME: cope with `pwd`/../../../foo.fa ? | |
3437 } | |
3438 } | |
3439 | |
3440 /* | |
3441 * Writes a CRAM SAM header. | |
3442 * Returns 0 on success | |
3443 * -1 on failure | |
3444 */ | |
3445 int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) { | |
3446 int header_len; | |
3447 int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3); | |
3448 | |
3449 /* Write CRAM MAGIC if not yet written. */ | |
3450 if (fd->file_def->major_version == 0) { | |
3451 fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version); | |
3452 fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version); | |
3453 if (0 != cram_write_file_def(fd, fd->file_def)) | |
3454 return -1; | |
3455 } | |
3456 | |
3457 /* 1.0 requires and UNKNOWN read-group */ | |
3458 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
3459 if (!sam_hdr_find_rg(hdr, "UNKNOWN")) | |
3460 if (sam_hdr_add(hdr, "RG", | |
3461 "ID", "UNKNOWN", "SM", "UNKNOWN", NULL)) | |
3462 return -1; | |
3463 } | |
3464 | |
3465 /* Fix M5 strings */ | |
3466 if (fd->refs && !fd->no_ref) { | |
3467 int i; | |
3468 for (i = 0; i < hdr->nref; i++) { | |
3469 SAM_hdr_type *ty; | |
3470 char *ref; | |
3471 | |
3472 if (!(ty = sam_hdr_find(hdr, "SQ", "SN", hdr->ref[i].name))) | |
3473 return -1; | |
3474 | |
3475 if (!sam_hdr_find_key(hdr, ty, "M5", NULL)) { | |
3476 char unsigned buf[16], buf2[33]; | |
3477 int j, rlen; | |
3478 MD5_CTX md5; | |
3479 | |
3480 if (!fd->refs || | |
3481 !fd->refs->ref_id || | |
3482 !fd->refs->ref_id[i]) { | |
3483 return -1; | |
3484 } | |
3485 rlen = fd->refs->ref_id[i]->length; | |
3486 MD5_Init(&md5); | |
3487 ref = cram_get_ref(fd, i, 1, rlen); | |
3488 if (NULL == ref) return -1; | |
3489 rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */ | |
3490 MD5_Update(&md5, ref, rlen); | |
3491 MD5_Final(buf, &md5); | |
3492 cram_ref_decr(fd->refs, i); | |
3493 | |
3494 for (j = 0; j < 16; j++) { | |
3495 buf2[j*2+0] = "0123456789abcdef"[buf[j]>>4]; | |
3496 buf2[j*2+1] = "0123456789abcdef"[buf[j]&15]; | |
3497 } | |
3498 buf2[32] = 0; | |
3499 if (sam_hdr_update(hdr, ty, "M5", buf2, NULL)) | |
3500 return -1; | |
3501 } | |
3502 | |
3503 if (fd->ref_fn) { | |
3504 char ref_fn[PATH_MAX]; | |
3505 full_path(ref_fn, fd->ref_fn); | |
3506 if (sam_hdr_update(hdr, ty, "UR", ref_fn, NULL)) | |
3507 return -1; | |
3508 } | |
3509 } | |
3510 } | |
3511 | |
3512 if (sam_hdr_rebuild(hdr)) | |
3513 return -1; | |
3514 | |
3515 /* Length */ | |
3516 header_len = sam_hdr_length(hdr); | |
3517 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
3518 if (-1 == int32_encode(fd, header_len)) | |
3519 return -1; | |
3520 | |
3521 /* Text data */ | |
3522 if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len)) | |
3523 return -1; | |
3524 } else { | |
3525 /* Create block(s) inside a container */ | |
3526 cram_block *b = cram_new_block(FILE_HEADER, 0); | |
3527 cram_container *c = cram_new_container(0, 0); | |
3528 int padded_length; | |
3529 char *pads; | |
3530 int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3); | |
3531 | |
3532 if (!b || !c) { | |
3533 if (b) cram_free_block(b); | |
3534 if (c) cram_free_container(c); | |
3535 return -1; | |
3536 } | |
3537 | |
3538 int32_put(b, header_len); | |
3539 BLOCK_APPEND(b, sam_hdr_str(hdr), header_len); | |
3540 BLOCK_UPLEN(b); | |
3541 | |
3542 // Compress header block if V3.0 and above | |
3543 if (CRAM_MAJOR_VERS(fd->version) >= 3 && fd->level > 0) { | |
3544 int method = 1<<GZIP; | |
3545 if (fd->use_bz2) | |
3546 method |= 1<<BZIP2; | |
3547 if (fd->use_lzma) | |
3548 method |= 1<<LZMA; | |
3549 cram_compress_block(fd, b, NULL, method, fd->level); | |
3550 } | |
3551 | |
3552 if (blank_block) { | |
3553 c->length = b->comp_size + 2 + 4*is_cram_3 + | |
3554 itf8_size(b->content_id) + | |
3555 itf8_size(b->uncomp_size) + | |
3556 itf8_size(b->comp_size); | |
3557 | |
3558 c->num_blocks = 2; | |
3559 c->num_landmarks = 2; | |
3560 if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) { | |
3561 cram_free_block(b); | |
3562 cram_free_container(c); | |
3563 return -1; | |
3564 } | |
3565 c->landmark[0] = 0; | |
3566 c->landmark[1] = c->length; | |
3567 | |
3568 // Plus extra storage for uncompressed secondary blank block | |
3569 padded_length = MIN(c->length*.5, 10000); | |
3570 c->length += padded_length + 2 + 4*is_cram_3 + | |
3571 itf8_size(b->content_id) + | |
3572 itf8_size(padded_length)*2; | |
3573 } else { | |
3574 // Pad the block instead. | |
3575 c->num_blocks = 1; | |
3576 c->num_landmarks = 1; | |
3577 if (!(c->landmark = malloc(sizeof(*c->landmark)))) | |
3578 return -1; | |
3579 c->landmark[0] = 0; | |
3580 | |
3581 padded_length = MAX(c->length*1.5, 10000) - c->length; | |
3582 | |
3583 c->length = b->comp_size + padded_length + | |
3584 2 + 4*is_cram_3 + | |
3585 itf8_size(b->content_id) + | |
3586 itf8_size(b->uncomp_size) + | |
3587 itf8_size(b->comp_size); | |
3588 | |
3589 if (NULL == (pads = calloc(1, padded_length))) { | |
3590 cram_free_block(b); | |
3591 cram_free_container(c); | |
3592 return -1; | |
3593 } | |
3594 BLOCK_APPEND(b, pads, padded_length); | |
3595 BLOCK_UPLEN(b); | |
3596 free(pads); | |
3597 } | |
3598 | |
3599 if (-1 == cram_write_container(fd, c)) { | |
3600 cram_free_block(b); | |
3601 cram_free_container(c); | |
3602 return -1; | |
3603 } | |
3604 | |
3605 if (-1 == cram_write_block(fd, b)) { | |
3606 cram_free_block(b); | |
3607 cram_free_container(c); | |
3608 return -1; | |
3609 } | |
3610 | |
3611 if (blank_block) { | |
3612 BLOCK_RESIZE(b, padded_length); | |
3613 memset(BLOCK_DATA(b), 0, padded_length); | |
3614 BLOCK_SIZE(b) = padded_length; | |
3615 BLOCK_UPLEN(b); | |
3616 b->method = RAW; | |
3617 if (-1 == cram_write_block(fd, b)) { | |
3618 cram_free_block(b); | |
3619 cram_free_container(c); | |
3620 return -1; | |
3621 } | |
3622 } | |
3623 | |
3624 cram_free_block(b); | |
3625 cram_free_container(c); | |
3626 } | |
3627 | |
3628 if (-1 == refs_from_header(fd->refs, fd, fd->header)) | |
3629 return -1; | |
3630 if (-1 == refs2id(fd->refs, fd->header)) | |
3631 return -1; | |
3632 | |
3633 if (0 != hflush(fd->fp)) | |
3634 return -1; | |
3635 | |
3636 RP("=== Finishing saving header ===\n"); | |
3637 | |
3638 return 0; | |
3639 } | |
3640 | |
3641 /* ---------------------------------------------------------------------- | |
3642 * The top-level cram opening, closing and option handling | |
3643 */ | |
3644 | |
3645 /* | |
3646 * Initialises the lookup tables. These could be global statics, but they're | |
3647 * clumsy to setup in a multi-threaded environment unless we generate | |
3648 * verbatim code and include that. | |
3649 */ | |
3650 static void cram_init_tables(cram_fd *fd) { | |
3651 int i; | |
3652 | |
3653 memset(fd->L1, 4, 256); | |
3654 fd->L1['A'] = 0; fd->L1['a'] = 0; | |
3655 fd->L1['C'] = 1; fd->L1['c'] = 1; | |
3656 fd->L1['G'] = 2; fd->L1['g'] = 2; | |
3657 fd->L1['T'] = 3; fd->L1['t'] = 3; | |
3658 | |
3659 memset(fd->L2, 5, 256); | |
3660 fd->L2['A'] = 0; fd->L2['a'] = 0; | |
3661 fd->L2['C'] = 1; fd->L2['c'] = 1; | |
3662 fd->L2['G'] = 2; fd->L2['g'] = 2; | |
3663 fd->L2['T'] = 3; fd->L2['t'] = 3; | |
3664 fd->L2['N'] = 4; fd->L2['n'] = 4; | |
3665 | |
3666 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
3667 for (i = 0; i < 0x200; i++) { | |
3668 int f = 0; | |
3669 | |
3670 if (i & CRAM_FPAIRED) f |= BAM_FPAIRED; | |
3671 if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR; | |
3672 if (i & CRAM_FUNMAP) f |= BAM_FUNMAP; | |
3673 if (i & CRAM_FREVERSE) f |= BAM_FREVERSE; | |
3674 if (i & CRAM_FREAD1) f |= BAM_FREAD1; | |
3675 if (i & CRAM_FREAD2) f |= BAM_FREAD2; | |
3676 if (i & CRAM_FSECONDARY) f |= BAM_FSECONDARY; | |
3677 if (i & CRAM_FQCFAIL) f |= BAM_FQCFAIL; | |
3678 if (i & CRAM_FDUP) f |= BAM_FDUP; | |
3679 | |
3680 fd->bam_flag_swap[i] = f; | |
3681 } | |
3682 | |
3683 for (i = 0; i < 0x1000; i++) { | |
3684 int g = 0; | |
3685 | |
3686 if (i & BAM_FPAIRED) g |= CRAM_FPAIRED; | |
3687 if (i & BAM_FPROPER_PAIR) g |= CRAM_FPROPER_PAIR; | |
3688 if (i & BAM_FUNMAP) g |= CRAM_FUNMAP; | |
3689 if (i & BAM_FREVERSE) g |= CRAM_FREVERSE; | |
3690 if (i & BAM_FREAD1) g |= CRAM_FREAD1; | |
3691 if (i & BAM_FREAD2) g |= CRAM_FREAD2; | |
3692 if (i & BAM_FSECONDARY) g |= CRAM_FSECONDARY; | |
3693 if (i & BAM_FQCFAIL) g |= CRAM_FQCFAIL; | |
3694 if (i & BAM_FDUP) g |= CRAM_FDUP; | |
3695 | |
3696 fd->cram_flag_swap[i] = g; | |
3697 } | |
3698 } else { | |
3699 /* NOP */ | |
3700 for (i = 0; i < 0x1000; i++) | |
3701 fd->bam_flag_swap[i] = i; | |
3702 for (i = 0; i < 0x1000; i++) | |
3703 fd->cram_flag_swap[i] = i; | |
3704 } | |
3705 | |
3706 memset(fd->cram_sub_matrix, 4, 32*32); | |
3707 for (i = 0; i < 32; i++) { | |
3708 fd->cram_sub_matrix[i]['A'&0x1f]=0; | |
3709 fd->cram_sub_matrix[i]['C'&0x1f]=1; | |
3710 fd->cram_sub_matrix[i]['G'&0x1f]=2; | |
3711 fd->cram_sub_matrix[i]['T'&0x1f]=3; | |
3712 fd->cram_sub_matrix[i]['N'&0x1f]=4; | |
3713 } | |
3714 for (i = 0; i < 20; i+=4) { | |
3715 int j; | |
3716 for (j = 0; j < 20; j++) { | |
3717 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; | |
3718 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; | |
3719 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; | |
3720 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; | |
3721 } | |
3722 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0; | |
3723 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1; | |
3724 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2; | |
3725 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3; | |
3726 } | |
3727 } | |
3728 | |
3729 // Default version numbers for CRAM | |
3730 static int major_version = 2; | |
3731 static int minor_version = 1; | |
3732 | |
3733 /* | |
3734 * Opens a CRAM file for read (mode "rb") or write ("wb"). | |
3735 * The filename may be "-" to indicate stdin or stdout. | |
3736 * | |
3737 * Returns file handle on success | |
3738 * NULL on failure. | |
3739 */ | |
3740 cram_fd *cram_open(const char *filename, const char *mode) { | |
3741 hFILE *fp; | |
3742 cram_fd *fd; | |
3743 char fmode[3]= { mode[0], '\0', '\0' }; | |
3744 | |
3745 if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) { | |
3746 fmode[1] = 'b'; | |
3747 } | |
3748 | |
3749 fp = hopen(filename, fmode); | |
3750 if (!fp) | |
3751 return NULL; | |
3752 | |
3753 fd = cram_dopen(fp, filename, mode); | |
3754 if (!fd) | |
3755 hclose_abruptly(fp); | |
3756 | |
3757 return fd; | |
3758 } | |
3759 | |
3760 /* Opens an existing stream for reading or writing. | |
3761 * | |
3762 * Returns file handle on success; | |
3763 * NULL on failure. | |
3764 */ | |
3765 cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { | |
3766 int i; | |
3767 char *cp; | |
3768 cram_fd *fd = calloc(1, sizeof(*fd)); | |
3769 if (!fd) | |
3770 return NULL; | |
3771 | |
3772 fd->level = 5; | |
3773 for (i = 0; mode[i]; i++) { | |
3774 if (mode[i] >= '0' && mode[i] <= '9') { | |
3775 fd->level = mode[i] - '0'; | |
3776 break; | |
3777 } | |
3778 } | |
3779 | |
3780 fd->fp = fp; | |
3781 fd->mode = *mode; | |
3782 fd->first_container = 0; | |
3783 | |
3784 if (fd->mode == 'r') { | |
3785 /* Reader */ | |
3786 | |
3787 if (!(fd->file_def = cram_read_file_def(fd))) | |
3788 goto err; | |
3789 | |
3790 fd->version = fd->file_def->major_version * 256 + | |
3791 fd->file_def->minor_version; | |
3792 | |
3793 if (!(fd->header = cram_read_SAM_hdr(fd))) | |
3794 goto err; | |
3795 | |
3796 } else { | |
3797 /* Writer */ | |
3798 cram_file_def *def = calloc(1, sizeof(*def)); | |
3799 if (!def) | |
3800 return NULL; | |
3801 | |
3802 fd->file_def = def; | |
3803 | |
3804 def->magic[0] = 'C'; | |
3805 def->magic[1] = 'R'; | |
3806 def->magic[2] = 'A'; | |
3807 def->magic[3] = 'M'; | |
3808 def->major_version = 0; // Indicator to write file def later. | |
3809 def->minor_version = 0; | |
3810 memset(def->file_id, 0, 20); | |
3811 strncpy(def->file_id, filename, 20); | |
3812 | |
3813 fd->version = major_version * 256 + minor_version; | |
3814 | |
3815 /* SAM header written later along with this file_def */ | |
3816 } | |
3817 | |
3818 cram_init_tables(fd); | |
3819 | |
3820 fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename); | |
3821 if (!fd->prefix) | |
3822 goto err; | |
3823 fd->first_base = fd->last_base = -1; | |
3824 fd->record_counter = 0; | |
3825 | |
3826 fd->ctr = NULL; | |
3827 fd->refs = refs_create(); | |
3828 if (!fd->refs) | |
3829 goto err; | |
3830 fd->ref_id = -2; | |
3831 fd->ref = NULL; | |
3832 | |
3833 fd->decode_md = 0; | |
3834 fd->verbose = 0; | |
3835 fd->seqs_per_slice = SEQS_PER_SLICE; | |
3836 fd->slices_per_container = SLICE_PER_CNT; | |
3837 fd->embed_ref = 0; | |
3838 fd->no_ref = 0; | |
3839 fd->ignore_md5 = 0; | |
3840 fd->use_bz2 = 0; | |
3841 fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3); | |
3842 fd->use_lzma = 0; | |
3843 fd->multi_seq = -1; | |
3844 fd->unsorted = 0; | |
3845 fd->shared_ref = 0; | |
3846 | |
3847 fd->index = NULL; | |
3848 fd->own_pool = 0; | |
3849 fd->pool = NULL; | |
3850 fd->rqueue = NULL; | |
3851 fd->job_pending = NULL; | |
3852 fd->ooc = 0; | |
3853 fd->required_fields = INT_MAX; | |
3854 | |
3855 for (i = 0; i < DS_END; i++) | |
3856 fd->m[i] = cram_new_metrics(); | |
3857 | |
3858 fd->range.refid = -2; // no ref. | |
3859 fd->eof = 1; // See samtools issue #150 | |
3860 fd->ref_fn = NULL; | |
3861 | |
3862 fd->bl = NULL; | |
3863 | |
3864 /* Initialise dummy refs from the @SQ headers */ | |
3865 if (-1 == refs_from_header(fd->refs, fd, fd->header)) | |
3866 goto err; | |
3867 | |
3868 return fd; | |
3869 | |
3870 err: | |
3871 if (fd) | |
3872 free(fd); | |
3873 | |
3874 return NULL; | |
3875 } | |
3876 | |
3877 /* | |
3878 * Seek within a CRAM file. | |
3879 * | |
3880 * Returns 0 on success | |
3881 * -1 on failure | |
3882 */ | |
3883 int cram_seek(cram_fd *fd, off_t offset, int whence) { | |
3884 char buf[65536]; | |
3885 | |
3886 fd->ooc = 0; | |
3887 | |
3888 if (hseek(fd->fp, offset, whence) >= 0) | |
3889 return 0; | |
3890 | |
3891 if (!(whence == SEEK_CUR && offset >= 0)) | |
3892 return -1; | |
3893 | |
3894 /* Couldn't fseek, but we're in SEEK_CUR mode so read instead */ | |
3895 while (offset > 0) { | |
3896 int len = MIN(65536, offset); | |
3897 if (len != hread(fd->fp, buf, len)) | |
3898 return -1; | |
3899 offset -= len; | |
3900 } | |
3901 | |
3902 return 0; | |
3903 } | |
3904 | |
3905 /* | |
3906 * Flushes a CRAM file. | |
3907 * Useful for when writing to stdout without wishing to close the stream. | |
3908 * | |
3909 * Returns 0 on success | |
3910 * -1 on failure | |
3911 */ | |
3912 int cram_flush(cram_fd *fd) { | |
3913 if (!fd) | |
3914 return -1; | |
3915 | |
3916 if (fd->mode == 'w' && fd->ctr) { | |
3917 if(fd->ctr->slice) | |
3918 fd->ctr->curr_slice++; | |
3919 if (-1 == cram_flush_container_mt(fd, fd->ctr)) | |
3920 return -1; | |
3921 } | |
3922 | |
3923 return 0; | |
3924 } | |
3925 | |
3926 /* | |
3927 * Closes a CRAM file. | |
3928 * Returns 0 on success | |
3929 * -1 on failure | |
3930 */ | |
3931 int cram_close(cram_fd *fd) { | |
3932 spare_bams *bl, *next; | |
3933 int i; | |
3934 | |
3935 if (!fd) | |
3936 return -1; | |
3937 | |
3938 if (fd->mode == 'w' && fd->ctr) { | |
3939 if(fd->ctr->slice) | |
3940 fd->ctr->curr_slice++; | |
3941 if (-1 == cram_flush_container_mt(fd, fd->ctr)) | |
3942 return -1; | |
3943 } | |
3944 | |
3945 if (fd->pool) { | |
3946 t_pool_flush(fd->pool); | |
3947 | |
3948 if (0 != cram_flush_result(fd)) | |
3949 return -1; | |
3950 | |
3951 pthread_mutex_destroy(&fd->metrics_lock); | |
3952 pthread_mutex_destroy(&fd->ref_lock); | |
3953 pthread_mutex_destroy(&fd->bam_list_lock); | |
3954 | |
3955 fd->ctr = NULL; // prevent double freeing | |
3956 | |
3957 //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue); | |
3958 | |
3959 t_results_queue_destroy(fd->rqueue); | |
3960 } | |
3961 | |
3962 if (fd->mode == 'w') { | |
3963 /* Write EOF block */ | |
3964 if (CRAM_MAJOR_VERS(fd->version) == 3) { | |
3965 if (38 != hwrite(fd->fp, | |
3966 "\x0f\x00\x00\x00\xff\xff\xff\xff" // Cont HDR | |
3967 "\x0f\xe0\x45\x4f\x46\x00\x00\x00" // Cont HDR | |
3968 "\x00\x01\x00" // Cont HDR | |
3969 "\x05\xbd\xd9\x4f" // CRC32 | |
3970 "\x00\x01\x00\x06\x06" // Comp.HDR blk | |
3971 "\x01\x00\x01\x00\x01\x00" // Comp.HDR blk | |
3972 "\xee\x63\x01\x4b", // CRC32 | |
3973 38)) | |
3974 return -1; | |
3975 } else { | |
3976 if (30 != hwrite(fd->fp, | |
3977 "\x0b\x00\x00\x00\xff\xff\xff\xff" | |
3978 "\x0f\xe0\x45\x4f\x46\x00\x00\x00" | |
3979 "\x00\x01\x00\x00\x01\x00\x06\x06" | |
3980 "\x01\x00\x01\x00\x01\x00", 30)) | |
3981 return -1; | |
3982 } | |
3983 } | |
3984 | |
3985 for (bl = fd->bl; bl; bl = next) { | |
3986 int i, max_rec = fd->seqs_per_slice * fd->slices_per_container; | |
3987 | |
3988 next = bl->next; | |
3989 for (i = 0; i < max_rec; i++) { | |
3990 if (bl->bams[i]) | |
3991 bam_free(bl->bams[i]); | |
3992 } | |
3993 free(bl->bams); | |
3994 free(bl); | |
3995 } | |
3996 | |
3997 if (hclose(fd->fp) != 0) | |
3998 return -1; | |
3999 | |
4000 if (fd->file_def) | |
4001 cram_free_file_def(fd->file_def); | |
4002 | |
4003 if (fd->header) | |
4004 sam_hdr_free(fd->header); | |
4005 | |
4006 free(fd->prefix); | |
4007 | |
4008 if (fd->ctr) | |
4009 cram_free_container(fd->ctr); | |
4010 | |
4011 if (fd->refs) | |
4012 refs_free(fd->refs); | |
4013 if (fd->ref_free) | |
4014 free(fd->ref_free); | |
4015 | |
4016 for (i = 0; i < DS_END; i++) | |
4017 if (fd->m[i]) | |
4018 free(fd->m[i]); | |
4019 | |
4020 if (fd->index) | |
4021 cram_index_free(fd); | |
4022 | |
4023 if (fd->own_pool && fd->pool) | |
4024 t_pool_destroy(fd->pool, 0); | |
4025 | |
4026 free(fd); | |
4027 return 0; | |
4028 } | |
4029 | |
4030 /* | |
4031 * Returns 1 if we hit an EOF while reading. | |
4032 */ | |
4033 int cram_eof(cram_fd *fd) { | |
4034 return fd->eof; | |
4035 } | |
4036 | |
4037 | |
4038 /* | |
4039 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h. | |
4040 * Use this immediately after opening. | |
4041 * | |
4042 * Returns 0 on success | |
4043 * -1 on failure | |
4044 */ | |
4045 int cram_set_option(cram_fd *fd, enum cram_option opt, ...) { | |
4046 int r; | |
4047 va_list args; | |
4048 | |
4049 va_start(args, opt); | |
4050 r = cram_set_voption(fd, opt, args); | |
4051 va_end(args); | |
4052 | |
4053 return r; | |
4054 } | |
4055 | |
4056 /* | |
4057 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h. | |
4058 * Use this immediately after opening. | |
4059 * | |
4060 * Returns 0 on success | |
4061 * -1 on failure | |
4062 */ | |
4063 int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args) { | |
4064 refs_t *refs; | |
4065 | |
4066 if (!fd) | |
4067 return -1; | |
4068 | |
4069 switch (opt) { | |
4070 case CRAM_OPT_DECODE_MD: | |
4071 fd->decode_md = va_arg(args, int); | |
4072 break; | |
4073 | |
4074 case CRAM_OPT_PREFIX: | |
4075 if (fd->prefix) | |
4076 free(fd->prefix); | |
4077 if (!(fd->prefix = strdup(va_arg(args, char *)))) | |
4078 return -1; | |
4079 break; | |
4080 | |
4081 case CRAM_OPT_VERBOSITY: | |
4082 fd->verbose = va_arg(args, int); | |
4083 break; | |
4084 | |
4085 case CRAM_OPT_SEQS_PER_SLICE: | |
4086 fd->seqs_per_slice = va_arg(args, int); | |
4087 break; | |
4088 | |
4089 case CRAM_OPT_SLICES_PER_CONTAINER: | |
4090 fd->slices_per_container = va_arg(args, int); | |
4091 break; | |
4092 | |
4093 case CRAM_OPT_EMBED_REF: | |
4094 fd->embed_ref = va_arg(args, int); | |
4095 break; | |
4096 | |
4097 case CRAM_OPT_NO_REF: | |
4098 fd->no_ref = va_arg(args, int); | |
4099 break; | |
4100 | |
4101 case CRAM_OPT_IGNORE_MD5: | |
4102 fd->ignore_md5 = va_arg(args, int); | |
4103 break; | |
4104 | |
4105 case CRAM_OPT_USE_BZIP2: | |
4106 fd->use_bz2 = va_arg(args, int); | |
4107 break; | |
4108 | |
4109 case CRAM_OPT_USE_RANS: | |
4110 fd->use_rans = va_arg(args, int); | |
4111 break; | |
4112 | |
4113 case CRAM_OPT_USE_LZMA: | |
4114 fd->use_lzma = va_arg(args, int); | |
4115 break; | |
4116 | |
4117 case CRAM_OPT_SHARED_REF: | |
4118 fd->shared_ref = 1; | |
4119 refs = va_arg(args, refs_t *); | |
4120 if (refs != fd->refs) { | |
4121 if (fd->refs) | |
4122 refs_free(fd->refs); | |
4123 fd->refs = refs; | |
4124 fd->refs->count++; | |
4125 } | |
4126 break; | |
4127 | |
4128 case CRAM_OPT_RANGE: | |
4129 fd->range = *va_arg(args, cram_range *); | |
4130 return cram_seek_to_refpos(fd, &fd->range); | |
4131 | |
4132 case CRAM_OPT_REFERENCE: | |
4133 return cram_load_reference(fd, va_arg(args, char *)); | |
4134 | |
4135 case CRAM_OPT_VERSION: { | |
4136 int major, minor; | |
4137 char *s = va_arg(args, char *); | |
4138 if (2 != sscanf(s, "%d.%d", &major, &minor)) { | |
4139 fprintf(stderr, "Malformed version string %s\n", s); | |
4140 return -1; | |
4141 } | |
4142 if (!((major == 1 && minor == 0) || | |
4143 (major == 2 && (minor == 0 || minor == 1)) || | |
4144 (major == 3 && minor == 0))) { | |
4145 fprintf(stderr, "Unknown version string; " | |
4146 "use 1.0, 2.0, 2.1 or 3.0\n"); | |
4147 return -1; | |
4148 } | |
4149 fd->version = major*256 + minor; | |
4150 | |
4151 if (CRAM_MAJOR_VERS(fd->version) >= 3) | |
4152 fd->use_rans = 1; | |
4153 break; | |
4154 } | |
4155 | |
4156 case CRAM_OPT_MULTI_SEQ_PER_SLICE: | |
4157 fd->multi_seq = va_arg(args, int); | |
4158 break; | |
4159 | |
4160 case CRAM_OPT_NTHREADS: { | |
4161 int nthreads = va_arg(args, int); | |
4162 if (nthreads > 1) { | |
4163 if (!(fd->pool = t_pool_init(nthreads*2, nthreads))) | |
4164 return -1; | |
4165 | |
4166 fd->rqueue = t_results_queue_init(); | |
4167 pthread_mutex_init(&fd->metrics_lock, NULL); | |
4168 pthread_mutex_init(&fd->ref_lock, NULL); | |
4169 pthread_mutex_init(&fd->bam_list_lock, NULL); | |
4170 fd->shared_ref = 1; | |
4171 fd->own_pool = 1; | |
4172 } | |
4173 break; | |
4174 } | |
4175 | |
4176 case CRAM_OPT_THREAD_POOL: | |
4177 fd->pool = va_arg(args, t_pool *); | |
4178 if (fd->pool) { | |
4179 fd->rqueue = t_results_queue_init(); | |
4180 pthread_mutex_init(&fd->metrics_lock, NULL); | |
4181 pthread_mutex_init(&fd->ref_lock, NULL); | |
4182 pthread_mutex_init(&fd->bam_list_lock, NULL); | |
4183 } | |
4184 fd->shared_ref = 1; // Needed to avoid clobbering ref between threads | |
4185 fd->own_pool = 0; | |
4186 | |
4187 //fd->qsize = 1; | |
4188 //fd->decoded = calloc(fd->qsize, sizeof(cram_container *)); | |
4189 //t_pool_dispatch(fd->pool, cram_decoder_thread, fd); | |
4190 break; | |
4191 | |
4192 case CRAM_OPT_REQUIRED_FIELDS: | |
4193 fd->required_fields = va_arg(args, int); | |
4194 break; | |
4195 | |
4196 default: | |
4197 fprintf(stderr, "Unknown CRAM option code %d\n", opt); | |
4198 return -1; | |
4199 } | |
4200 | |
4201 return 0; | |
4202 } |