0
|
1 /*
|
|
2 Copyright (c) 2012-2014 Genome Research Ltd.
|
|
3 Author: James Bonfield <jkb@sanger.ac.uk>
|
|
4
|
|
5 Redistribution and use in source and binary forms, with or without
|
|
6 modification, are permitted provided that the following conditions are met:
|
|
7
|
|
8 1. Redistributions of source code must retain the above copyright notice,
|
|
9 this list of conditions and the following disclaimer.
|
|
10
|
|
11 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
12 this list of conditions and the following disclaimer in the documentation
|
|
13 and/or other materials provided with the distribution.
|
|
14
|
|
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
|
|
16 Institute nor the names of its contributors may be used to endorse or promote
|
|
17 products derived from this software without specific prior written permission.
|
|
18
|
|
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
|
|
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
|
|
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
29 */
|
|
30
|
|
31 /*
|
|
32 * CRAM I/O primitives.
|
|
33 *
|
|
34 * - ITF8 encoding and decoding.
|
|
35 * - Block based I/O
|
|
36 * - Zlib inflating and deflating (memory)
|
|
37 * - CRAM basic data structure reading and writing
|
|
38 * - File opening / closing
|
|
39 * - Reference sequence handling
|
|
40 */
|
|
41
|
|
42 /*
|
|
43 * TODO: BLOCK_GROW, BLOCK_RESIZE, BLOCK_APPEND and itf8_put_blk all need
|
|
44 * a way to return errors for when malloc fails.
|
|
45 */
|
|
46
|
|
47 #ifdef HAVE_CONFIG_H
|
|
48 #include "io_lib_config.h"
|
|
49 #endif
|
|
50
|
|
51 #include <stdio.h>
|
|
52 #include <errno.h>
|
|
53 #include <assert.h>
|
|
54 #include <stdlib.h>
|
|
55 #include <string.h>
|
|
56 #include <zlib.h>
|
|
57 #ifdef HAVE_LIBBZ2
|
|
58 #include <bzlib.h>
|
|
59 #endif
|
|
60 #ifdef HAVE_LIBLZMA
|
|
61 #include <lzma.h>
|
|
62 #endif
|
|
63 #include <sys/types.h>
|
|
64 #include <sys/stat.h>
|
|
65 #include <math.h>
|
|
66 #include <ctype.h>
|
|
67
|
|
68 #include "cram/cram.h"
|
|
69 #include "cram/os.h"
|
|
70 #include "cram/md5.h"
|
|
71 #include "cram/open_trace_file.h"
|
|
72 #include "cram/rANS_static.h"
|
|
73
|
|
74 //#define REF_DEBUG
|
|
75
|
|
76 #ifdef REF_DEBUG
|
|
77 #include <sys/syscall.h>
|
|
78 #define gettid() (int)syscall(SYS_gettid)
|
|
79
|
|
80 #define RP(...) fprintf (stderr, __VA_ARGS__)
|
|
81 #else
|
|
82 #define RP(...)
|
|
83 #endif
|
|
84
|
|
85 #include "htslib/hfile.h"
|
|
86 #include "htslib/bgzf.h"
|
|
87 #include "htslib/faidx.h"
|
|
88
|
|
89 #define TRIAL_SPAN 50
|
|
90 #define NTRIALS 3
|
|
91
|
|
92
|
|
93 /* ----------------------------------------------------------------------
|
|
94 * ITF8 encoding and decoding.
|
|
95 *
|
|
96 * Also see the itf8_get and itf8_put macros in cram_io.h
|
|
97 */
|
|
98
|
|
99 /*
|
|
100 * Reads an integer in ITF-8 encoding from 'cp' and stores it in
|
|
101 * *val.
|
|
102 *
|
|
103 * Returns the number of bytes read on success
|
|
104 * -1 on failure
|
|
105 */
|
|
106 int itf8_decode(cram_fd *fd, int32_t *val_p) {
|
|
107 static int nbytes[16] = {
|
|
108 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx
|
|
109 1,1,1,1, // 1000xxxx - 1011xxxx
|
|
110 2,2, // 1100xxxx - 1101xxxx
|
|
111 3, // 1110xxxx
|
|
112 4, // 1111xxxx
|
|
113 };
|
|
114
|
|
115 static int nbits[16] = {
|
|
116 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
|
|
117 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx
|
|
118 0x1f, 0x1f, // 1100xxxx - 1101xxxx
|
|
119 0x0f, // 1110xxxx
|
|
120 0x0f, // 1111xxxx
|
|
121 };
|
|
122
|
|
123 int32_t val = hgetc(fd->fp);
|
|
124 if (val == -1)
|
|
125 return -1;
|
|
126
|
|
127 int i = nbytes[val>>4];
|
|
128 val &= nbits[val>>4];
|
|
129
|
|
130 switch(i) {
|
|
131 case 0:
|
|
132 *val_p = val;
|
|
133 return 1;
|
|
134
|
|
135 case 1:
|
|
136 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
137 *val_p = val;
|
|
138 return 2;
|
|
139
|
|
140 case 2:
|
|
141 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
142 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
143 *val_p = val;
|
|
144 return 3;
|
|
145
|
|
146 case 3:
|
|
147 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
148 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
149 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
150 *val_p = val;
|
|
151 return 4;
|
|
152
|
|
153 case 4: // really 3.5 more, why make it different?
|
|
154 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
155 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
156 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
157 val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
|
|
158 *val_p = val;
|
|
159 }
|
|
160
|
|
161 return 5;
|
|
162 }
|
|
163
|
|
164 /*
|
|
165 * Encodes and writes a single integer in ITF-8 format.
|
|
166 * Returns 0 on success
|
|
167 * -1 on failure
|
|
168 */
|
|
169 int itf8_encode(cram_fd *fd, int32_t val) {
|
|
170 char buf[5];
|
|
171 int len = itf8_put(buf, val);
|
|
172 return hwrite(fd->fp, buf, len) == len ? 0 : -1;
|
|
173 }
|
|
174
|
|
175 #ifndef ITF8_MACROS
|
|
176 /*
|
|
177 * As above, but decoding from memory
|
|
178 */
|
|
179 int itf8_get(char *cp, int32_t *val_p) {
|
|
180 unsigned char *up = (unsigned char *)cp;
|
|
181
|
|
182 if (up[0] < 0x80) {
|
|
183 *val_p = up[0];
|
|
184 return 1;
|
|
185 } else if (up[0] < 0xc0) {
|
|
186 *val_p = ((up[0] <<8) | up[1]) & 0x3fff;
|
|
187 return 2;
|
|
188 } else if (up[0] < 0xe0) {
|
|
189 *val_p = ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff;
|
|
190 return 3;
|
|
191 } else if (up[0] < 0xf0) {
|
|
192 *val_p = ((up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
|
|
193 return 4;
|
|
194 } else {
|
|
195 *val_p = ((up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
|
|
196 return 5;
|
|
197 }
|
|
198 }
|
|
199
|
|
200 /*
|
|
201 * Stores a value to memory in ITF-8 format.
|
|
202 *
|
|
203 * Returns the number of bytes required to store the number.
|
|
204 * This is a maximum of 5 bytes.
|
|
205 */
|
|
206 int itf8_put(char *cp, int32_t val) {
|
|
207 if (!(val & ~0x00000007f)) { // 1 byte
|
|
208 *cp = val;
|
|
209 return 1;
|
|
210 } else if (!(val & ~0x00003fff)) { // 2 byte
|
|
211 *cp++ = (val >> 8 ) | 0x80;
|
|
212 *cp = val & 0xff;
|
|
213 return 2;
|
|
214 } else if (!(val & ~0x01fffff)) { // 3 byte
|
|
215 *cp++ = (val >> 16) | 0xc0;
|
|
216 *cp++ = (val >> 8 ) & 0xff;
|
|
217 *cp = val & 0xff;
|
|
218 return 3;
|
|
219 } else if (!(val & ~0x0fffffff)) { // 4 byte
|
|
220 *cp++ = (val >> 24) | 0xe0;
|
|
221 *cp++ = (val >> 16) & 0xff;
|
|
222 *cp++ = (val >> 8 ) & 0xff;
|
|
223 *cp = val & 0xff;
|
|
224 return 4;
|
|
225 } else { // 5 byte
|
|
226 *cp++ = 0xf0 | ((val>>28) & 0xff);
|
|
227 *cp++ = (val >> 20) & 0xff;
|
|
228 *cp++ = (val >> 12) & 0xff;
|
|
229 *cp++ = (val >> 4 ) & 0xff;
|
|
230 *cp = val & 0x0f;
|
|
231 return 5;
|
|
232 }
|
|
233 }
|
|
234 #endif
|
|
235
|
|
236 /* 64-bit itf8 variant */
|
|
237 int ltf8_put(char *cp, int64_t val) {
|
|
238 if (!(val & ~((1LL<<7)-1))) {
|
|
239 *cp = val;
|
|
240 return 1;
|
|
241 } else if (!(val & ~((1LL<<(6+8))-1))) {
|
|
242 *cp++ = (val >> 8 ) | 0x80;
|
|
243 *cp = val & 0xff;
|
|
244 return 2;
|
|
245 } else if (!(val & ~((1LL<<(5+2*8))-1))) {
|
|
246 *cp++ = (val >> 16) | 0xc0;
|
|
247 *cp++ = (val >> 8 ) & 0xff;
|
|
248 *cp = val & 0xff;
|
|
249 return 3;
|
|
250 } else if (!(val & ~((1LL<<(4+3*8))-1))) {
|
|
251 *cp++ = (val >> 24) | 0xe0;
|
|
252 *cp++ = (val >> 16) & 0xff;
|
|
253 *cp++ = (val >> 8 ) & 0xff;
|
|
254 *cp = val & 0xff;
|
|
255 return 4;
|
|
256 } else if (!(val & ~((1LL<<(3+4*8))-1))) {
|
|
257 *cp++ = (val >> 32) | 0xf0;
|
|
258 *cp++ = (val >> 24) & 0xff;
|
|
259 *cp++ = (val >> 16) & 0xff;
|
|
260 *cp++ = (val >> 8 ) & 0xff;
|
|
261 *cp = val & 0xff;
|
|
262 return 5;
|
|
263 } else if (!(val & ~((1LL<<(2+5*8))-1))) {
|
|
264 *cp++ = (val >> 40) | 0xf8;
|
|
265 *cp++ = (val >> 32) & 0xff;
|
|
266 *cp++ = (val >> 24) & 0xff;
|
|
267 *cp++ = (val >> 16) & 0xff;
|
|
268 *cp++ = (val >> 8 ) & 0xff;
|
|
269 *cp = val & 0xff;
|
|
270 return 6;
|
|
271 } else if (!(val & ~((1LL<<(1+6*8))-1))) {
|
|
272 *cp++ = (val >> 48) | 0xfc;
|
|
273 *cp++ = (val >> 40) & 0xff;
|
|
274 *cp++ = (val >> 32) & 0xff;
|
|
275 *cp++ = (val >> 24) & 0xff;
|
|
276 *cp++ = (val >> 16) & 0xff;
|
|
277 *cp++ = (val >> 8 ) & 0xff;
|
|
278 *cp = val & 0xff;
|
|
279 return 7;
|
|
280 } else if (!(val & ~((1LL<<(7*8))-1))) {
|
|
281 *cp++ = (val >> 56) | 0xfe;
|
|
282 *cp++ = (val >> 48) & 0xff;
|
|
283 *cp++ = (val >> 40) & 0xff;
|
|
284 *cp++ = (val >> 32) & 0xff;
|
|
285 *cp++ = (val >> 24) & 0xff;
|
|
286 *cp++ = (val >> 16) & 0xff;
|
|
287 *cp++ = (val >> 8 ) & 0xff;
|
|
288 *cp = val & 0xff;
|
|
289 return 8;
|
|
290 } else {
|
|
291 *cp++ = 0xff;
|
|
292 *cp++ = (val >> 56) & 0xff;
|
|
293 *cp++ = (val >> 48) & 0xff;
|
|
294 *cp++ = (val >> 40) & 0xff;
|
|
295 *cp++ = (val >> 32) & 0xff;
|
|
296 *cp++ = (val >> 24) & 0xff;
|
|
297 *cp++ = (val >> 16) & 0xff;
|
|
298 *cp++ = (val >> 8 ) & 0xff;
|
|
299 *cp = val & 0xff;
|
|
300 return 9;
|
|
301 }
|
|
302 }
|
|
303
|
|
304 int ltf8_get(char *cp, int64_t *val_p) {
|
|
305 unsigned char *up = (unsigned char *)cp;
|
|
306
|
|
307 if (up[0] < 0x80) {
|
|
308 *val_p = up[0];
|
|
309 return 1;
|
|
310 } else if (up[0] < 0xc0) {
|
|
311 *val_p = (((uint64_t)up[0]<< 8) |
|
|
312 (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
|
|
313 return 2;
|
|
314 } else if (up[0] < 0xe0) {
|
|
315 *val_p = (((uint64_t)up[0]<<16) |
|
|
316 ((uint64_t)up[1]<< 8) |
|
|
317 (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
|
|
318 return 3;
|
|
319 } else if (up[0] < 0xf0) {
|
|
320 *val_p = (((uint64_t)up[0]<<24) |
|
|
321 ((uint64_t)up[1]<<16) |
|
|
322 ((uint64_t)up[2]<< 8) |
|
|
323 (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
|
|
324 return 4;
|
|
325 } else if (up[0] < 0xf8) {
|
|
326 *val_p = (((uint64_t)up[0]<<32) |
|
|
327 ((uint64_t)up[1]<<24) |
|
|
328 ((uint64_t)up[2]<<16) |
|
|
329 ((uint64_t)up[3]<< 8) |
|
|
330 (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
|
|
331 return 5;
|
|
332 } else if (up[0] < 0xfc) {
|
|
333 *val_p = (((uint64_t)up[0]<<40) |
|
|
334 ((uint64_t)up[1]<<32) |
|
|
335 ((uint64_t)up[2]<<24) |
|
|
336 ((uint64_t)up[3]<<16) |
|
|
337 ((uint64_t)up[4]<< 8) |
|
|
338 (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
|
|
339 return 6;
|
|
340 } else if (up[0] < 0xfe) {
|
|
341 *val_p = (((uint64_t)up[0]<<48) |
|
|
342 ((uint64_t)up[1]<<40) |
|
|
343 ((uint64_t)up[2]<<32) |
|
|
344 ((uint64_t)up[3]<<24) |
|
|
345 ((uint64_t)up[4]<<16) |
|
|
346 ((uint64_t)up[5]<< 8) |
|
|
347 (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
|
|
348 return 7;
|
|
349 } else if (up[0] < 0xff) {
|
|
350 *val_p = (((uint64_t)up[1]<<48) |
|
|
351 ((uint64_t)up[2]<<40) |
|
|
352 ((uint64_t)up[3]<<32) |
|
|
353 ((uint64_t)up[4]<<24) |
|
|
354 ((uint64_t)up[5]<<16) |
|
|
355 ((uint64_t)up[6]<< 8) |
|
|
356 (uint64_t)up[7]) & ((1LL<<(7*8))-1);
|
|
357 return 8;
|
|
358 } else {
|
|
359 *val_p = (((uint64_t)up[1]<<56) |
|
|
360 ((uint64_t)up[2]<<48) |
|
|
361 ((uint64_t)up[3]<<40) |
|
|
362 ((uint64_t)up[4]<<32) |
|
|
363 ((uint64_t)up[5]<<24) |
|
|
364 ((uint64_t)up[6]<<16) |
|
|
365 ((uint64_t)up[7]<< 8) |
|
|
366 (uint64_t)up[8]);
|
|
367 return 9;
|
|
368 }
|
|
369 }
|
|
370
|
|
371 int ltf8_decode(cram_fd *fd, int64_t *val_p) {
|
|
372 int c = hgetc(fd->fp);
|
|
373 int64_t val = (unsigned char)c;
|
|
374 if (c == -1)
|
|
375 return -1;
|
|
376
|
|
377 if (val < 0x80) {
|
|
378 *val_p = val;
|
|
379 return 1;
|
|
380
|
|
381 } else if (val < 0xc0) {
|
|
382 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
383 *val_p = val & (((1LL<<(6+8)))-1);
|
|
384 return 2;
|
|
385
|
|
386 } else if (val < 0xe0) {
|
|
387 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
388 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
389 *val_p = val & ((1LL<<(5+2*8))-1);
|
|
390 return 3;
|
|
391
|
|
392 } else if (val < 0xf0) {
|
|
393 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
394 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
395 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
396 *val_p = val & ((1LL<<(4+3*8))-1);
|
|
397 return 4;
|
|
398
|
|
399 } else if (val < 0xf8) {
|
|
400 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
401 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
402 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
403 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
404 *val_p = val & ((1LL<<(3+4*8))-1);
|
|
405 return 5;
|
|
406
|
|
407 } else if (val < 0xfc) {
|
|
408 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
409 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
410 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
411 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
412 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
413 *val_p = val & ((1LL<<(2+5*8))-1);
|
|
414 return 6;
|
|
415
|
|
416 } else if (val < 0xfe) {
|
|
417 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
418 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
419 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
420 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
421 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
422 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
423 *val_p = val & ((1LL<<(1+6*8))-1);
|
|
424 return 7;
|
|
425
|
|
426 } else if (val < 0xff) {
|
|
427 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
428 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
429 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
430 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
431 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
432 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
433 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
434 *val_p = val & ((1LL<<(7*8))-1);
|
|
435 return 8;
|
|
436
|
|
437 } else {
|
|
438 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
439 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
440 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
441 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
442 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
443 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
444 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
445 val = (val<<8) | (unsigned char)hgetc(fd->fp);
|
|
446 *val_p = val;
|
|
447 }
|
|
448
|
|
449 return 9;
|
|
450 }
|
|
451
|
|
452 /*
|
|
453 * Pushes a value in ITF8 format onto the end of a block.
|
|
454 * This shouldn't be used for high-volume data as it is not the fastest
|
|
455 * method.
|
|
456 *
|
|
457 * Returns the number of bytes written
|
|
458 */
|
|
459 int itf8_put_blk(cram_block *blk, int val) {
|
|
460 char buf[5];
|
|
461 int sz;
|
|
462
|
|
463 sz = itf8_put(buf, val);
|
|
464 BLOCK_APPEND(blk, buf, sz);
|
|
465 return sz;
|
|
466 }
|
|
467
|
|
468 /*
|
|
469 * Decodes a 32-bit little endian value from fd and stores in val.
|
|
470 *
|
|
471 * Returns the number of bytes read on success
|
|
472 * -1 on failure
|
|
473 */
|
|
474 int int32_decode(cram_fd *fd, int32_t *val) {
|
|
475 int32_t i;
|
|
476 if (4 != hread(fd->fp, &i, 4))
|
|
477 return -1;
|
|
478
|
|
479 *val = le_int4(i);
|
|
480 return 4;
|
|
481 }
|
|
482
|
|
483 /*
|
|
484 * Encodes a 32-bit little endian value 'val' and writes to fd.
|
|
485 *
|
|
486 * Returns the number of bytes written on success
|
|
487 * -1 on failure
|
|
488 */
|
|
489 int int32_encode(cram_fd *fd, int32_t val) {
|
|
490 val = le_int4(val);
|
|
491 if (4 != hwrite(fd->fp, &val, 4))
|
|
492 return -1;
|
|
493
|
|
494 return 4;
|
|
495 }
|
|
496
|
|
497 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
|
|
498 int int32_get(cram_block *b, int32_t *val) {
|
|
499 if (b->uncomp_size - BLOCK_SIZE(b) < 4)
|
|
500 return -1;
|
|
501
|
|
502 *val =
|
|
503 b->data[b->byte ] |
|
|
504 (b->data[b->byte+1] << 8) |
|
|
505 (b->data[b->byte+2] << 16) |
|
|
506 (b->data[b->byte+3] << 24);
|
|
507 BLOCK_SIZE(b) += 4;
|
|
508 return 4;
|
|
509 }
|
|
510
|
|
511 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
|
|
512 int int32_put(cram_block *b, int32_t val) {
|
|
513 unsigned char cp[4];
|
|
514 cp[0] = ( val & 0xff);
|
|
515 cp[1] = ((val>>8) & 0xff);
|
|
516 cp[2] = ((val>>16) & 0xff);
|
|
517 cp[3] = ((val>>24) & 0xff);
|
|
518
|
|
519 BLOCK_APPEND(b, cp, 4);
|
|
520 return b->data ? 0 : -1;
|
|
521 }
|
|
522
|
|
523 /* ----------------------------------------------------------------------
|
|
524 * zlib compression code - from Gap5's tg_iface_g.c
|
|
525 * They're static here as they're only used within the cram_compress_block
|
|
526 * and cram_uncompress_block functions, which are the external interface.
|
|
527 */
|
|
528 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
|
|
529 z_stream s;
|
|
530 unsigned char *data = NULL; /* Uncompressed output */
|
|
531 int data_alloc = 0;
|
|
532 int err;
|
|
533
|
|
534 /* Starting point at uncompressed size, and scale after that */
|
|
535 data = malloc(data_alloc = csize*1.2+100);
|
|
536 if (!data)
|
|
537 return NULL;
|
|
538
|
|
539 /* Initialise zlib stream */
|
|
540 s.zalloc = Z_NULL; /* use default allocation functions */
|
|
541 s.zfree = Z_NULL;
|
|
542 s.opaque = Z_NULL;
|
|
543 s.next_in = (unsigned char *)cdata;
|
|
544 s.avail_in = csize;
|
|
545 s.total_in = 0;
|
|
546 s.next_out = data;
|
|
547 s.avail_out = data_alloc;
|
|
548 s.total_out = 0;
|
|
549
|
|
550 //err = inflateInit(&s);
|
|
551 err = inflateInit2(&s, 15 + 32);
|
|
552 if (err != Z_OK) {
|
|
553 fprintf(stderr, "zlib inflateInit error: %s\n", s.msg);
|
|
554 free(data);
|
|
555 return NULL;
|
|
556 }
|
|
557
|
|
558 /* Decode to 'data' array */
|
|
559 for (;s.avail_in;) {
|
|
560 unsigned char *data_tmp;
|
|
561 int alloc_inc;
|
|
562
|
|
563 s.next_out = &data[s.total_out];
|
|
564 err = inflate(&s, Z_NO_FLUSH);
|
|
565 if (err == Z_STREAM_END)
|
|
566 break;
|
|
567
|
|
568 if (err != Z_OK) {
|
|
569 fprintf(stderr, "zlib inflate error: %s\n", s.msg);
|
|
570 break;
|
|
571 }
|
|
572
|
|
573 /* More to come, so realloc based on growth so far */
|
|
574 alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
|
|
575 data = realloc((data_tmp = data), data_alloc += alloc_inc);
|
|
576 if (!data) {
|
|
577 free(data_tmp);
|
|
578 return NULL;
|
|
579 }
|
|
580 s.avail_out += alloc_inc;
|
|
581 }
|
|
582 inflateEnd(&s);
|
|
583
|
|
584 *size = s.total_out;
|
|
585 return (char *)data;
|
|
586 }
|
|
587
|
|
588 static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
|
|
589 int level, int strat) {
|
|
590 z_stream s;
|
|
591 unsigned char *cdata = NULL; /* Compressed output */
|
|
592 int cdata_alloc = 0;
|
|
593 int cdata_pos = 0;
|
|
594 int err;
|
|
595
|
|
596 cdata = malloc(cdata_alloc = size*1.05+100);
|
|
597 if (!cdata)
|
|
598 return NULL;
|
|
599 cdata_pos = 0;
|
|
600
|
|
601 /* Initialise zlib stream */
|
|
602 s.zalloc = Z_NULL; /* use default allocation functions */
|
|
603 s.zfree = Z_NULL;
|
|
604 s.opaque = Z_NULL;
|
|
605 s.next_in = (unsigned char *)data;
|
|
606 s.avail_in = size;
|
|
607 s.total_in = 0;
|
|
608 s.next_out = cdata;
|
|
609 s.avail_out = cdata_alloc;
|
|
610 s.total_out = 0;
|
|
611 s.data_type = Z_BINARY;
|
|
612
|
|
613 err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
|
|
614 if (err != Z_OK) {
|
|
615 fprintf(stderr, "zlib deflateInit2 error: %s\n", s.msg);
|
|
616 return NULL;
|
|
617 }
|
|
618
|
|
619 /* Encode to 'cdata' array */
|
|
620 for (;s.avail_in;) {
|
|
621 s.next_out = &cdata[cdata_pos];
|
|
622 s.avail_out = cdata_alloc - cdata_pos;
|
|
623 if (cdata_alloc - cdata_pos <= 0) {
|
|
624 fprintf(stderr, "Deflate produced larger output than expected. Abort\n");
|
|
625 return NULL;
|
|
626 }
|
|
627 err = deflate(&s, Z_NO_FLUSH);
|
|
628 cdata_pos = cdata_alloc - s.avail_out;
|
|
629 if (err != Z_OK) {
|
|
630 fprintf(stderr, "zlib deflate error: %s\n", s.msg);
|
|
631 break;
|
|
632 }
|
|
633 }
|
|
634 if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
|
|
635 fprintf(stderr, "zlib deflate error: %s\n", s.msg);
|
|
636 }
|
|
637 *cdata_size = s.total_out;
|
|
638
|
|
639 if (deflateEnd(&s) != Z_OK) {
|
|
640 fprintf(stderr, "zlib deflate error: %s\n", s.msg);
|
|
641 }
|
|
642 return (char *)cdata;
|
|
643 }
|
|
644
|
|
645 #ifdef HAVE_LIBLZMA
|
|
646 /* ------------------------------------------------------------------------ */
|
|
647 /*
|
|
648 * Data compression routines using liblzma (xz)
|
|
649 *
|
|
650 * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
|
|
651 * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
|
|
652 * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
|
|
653 * as compression times.
|
|
654 *
|
|
655 * For now we disable this functionality. If it's to be reenabled make sure you
|
|
656 * improve the mem_inflate implementation as it's just a test hack at the
|
|
657 * moment.
|
|
658 */
|
|
659
|
|
660 static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
|
|
661 int level) {
|
|
662 char *out;
|
|
663 size_t out_size = lzma_stream_buffer_bound(size);
|
|
664 *cdata_size = 0;
|
|
665
|
|
666 out = malloc(out_size);
|
|
667
|
|
668 /* Single call compression */
|
|
669 if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
|
|
670 (uint8_t *)data, size,
|
|
671 (uint8_t *)out, cdata_size,
|
|
672 out_size))
|
|
673 return NULL;
|
|
674
|
|
675 return out;
|
|
676 }
|
|
677
|
|
678 static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
|
|
679 lzma_stream strm = LZMA_STREAM_INIT;
|
|
680 size_t out_size = 0, out_pos = 0;
|
|
681 char *out = NULL;
|
|
682 int r;
|
|
683
|
|
684 /* Initiate the decoder */
|
|
685 if (LZMA_OK != lzma_stream_decoder(&strm, 50000000, 0))
|
|
686 return NULL;
|
|
687
|
|
688 /* Decode loop */
|
|
689 strm.avail_in = csize;
|
|
690 strm.next_in = (uint8_t *)cdata;
|
|
691
|
|
692 for (;strm.avail_in;) {
|
|
693 if (strm.avail_in > out_size - out_pos) {
|
|
694 out_size += strm.avail_in * 4 + 32768;
|
|
695 out = realloc(out, out_size);
|
|
696 }
|
|
697 strm.avail_out = out_size - out_pos;
|
|
698 strm.next_out = (uint8_t *)&out[out_pos];
|
|
699
|
|
700 r = lzma_code(&strm, LZMA_RUN);
|
|
701 if (LZMA_OK != r && LZMA_STREAM_END != r) {
|
|
702 fprintf(stderr, "r=%d\n", r);
|
|
703 fprintf(stderr, "mem=%"PRId64"d\n", (int64_t)lzma_memusage(&strm));
|
|
704 return NULL;
|
|
705 }
|
|
706
|
|
707 out_pos = strm.total_out;
|
|
708
|
|
709 if (r == LZMA_STREAM_END)
|
|
710 break;
|
|
711 }
|
|
712
|
|
713 /* finish up any unflushed data; necessary? */
|
|
714 r = lzma_code(&strm, LZMA_FINISH);
|
|
715 if (r != LZMA_OK && r != LZMA_STREAM_END) {
|
|
716 fprintf(stderr, "r=%d\n", r);
|
|
717 return NULL;
|
|
718 }
|
|
719
|
|
720 out = realloc(out, strm.total_out);
|
|
721 *size = strm.total_out;
|
|
722
|
|
723 lzma_end(&strm);
|
|
724
|
|
725 return out;
|
|
726 }
|
|
727 #endif
|
|
728
|
|
729 /* ----------------------------------------------------------------------
|
|
730 * CRAM blocks - the dynamically growable data block. We have code to
|
|
731 * create, update, (un)compress and read/write.
|
|
732 *
|
|
733 * These are derived from the deflate_interlaced.c blocks, but with the
|
|
734 * CRAM extension of content types and IDs.
|
|
735 */
|
|
736
|
|
737 /*
|
|
738 * Allocates a new cram_block structure with a specified content_type and
|
|
739 * id.
|
|
740 *
|
|
741 * Returns block pointer on success
|
|
742 * NULL on failure
|
|
743 */
|
|
744 cram_block *cram_new_block(enum cram_content_type content_type,
|
|
745 int content_id) {
|
|
746 cram_block *b = malloc(sizeof(*b));
|
|
747 if (!b)
|
|
748 return NULL;
|
|
749 b->method = b->orig_method = RAW;
|
|
750 b->content_type = content_type;
|
|
751 b->content_id = content_id;
|
|
752 b->comp_size = 0;
|
|
753 b->uncomp_size = 0;
|
|
754 b->data = NULL;
|
|
755 b->alloc = 0;
|
|
756 b->byte = 0;
|
|
757 b->bit = 7; // MSB
|
|
758
|
|
759 return b;
|
|
760 }
|
|
761
|
|
762 /*
|
|
763 * Reads a block from a cram file.
|
|
764 * Returns cram_block pointer on success.
|
|
765 * NULL on failure
|
|
766 */
|
|
767 cram_block *cram_read_block(cram_fd *fd) {
|
|
768 cram_block *b = malloc(sizeof(*b));
|
|
769 if (!b)
|
|
770 return NULL;
|
|
771
|
|
772 //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
|
|
773
|
|
774 if (-1 == (b->method = hgetc(fd->fp))) { free(b); return NULL; }
|
|
775 if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
|
|
776 if (-1 == itf8_decode(fd, &b->content_id)) { free(b); return NULL; }
|
|
777 if (-1 == itf8_decode(fd, &b->comp_size)) { free(b); return NULL; }
|
|
778 if (-1 == itf8_decode(fd, &b->uncomp_size)) { free(b); return NULL; }
|
|
779
|
|
780 // fprintf(stderr, " method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
|
|
781 // b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
|
|
782
|
|
783 if (b->method == RAW) {
|
|
784 b->alloc = b->uncomp_size;
|
|
785 if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
|
|
786 if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
|
|
787 free(b->data);
|
|
788 free(b);
|
|
789 return NULL;
|
|
790 }
|
|
791 } else {
|
|
792 b->alloc = b->comp_size;
|
|
793 if (!(b->data = malloc(b->comp_size))) { free(b); return NULL; }
|
|
794 if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
|
|
795 free(b->data);
|
|
796 free(b);
|
|
797 return NULL;
|
|
798 }
|
|
799 }
|
|
800
|
|
801 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
|
|
802 unsigned char dat[100], *cp = dat;;
|
|
803 uint32_t crc;
|
|
804
|
|
805
|
|
806 if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
|
|
807 free(b);
|
|
808 return NULL;
|
|
809 }
|
|
810
|
|
811 *cp++ = b->method;
|
|
812 *cp++ = b->content_type;
|
|
813 cp += itf8_put(cp, b->content_id);
|
|
814 cp += itf8_put(cp, b->comp_size);
|
|
815 cp += itf8_put(cp, b->uncomp_size);
|
|
816 crc = crc32(0L, dat, cp-dat);
|
|
817 crc = crc32(crc, b->data ? b->data : (uc *)"", b->alloc);
|
|
818
|
|
819 if (crc != b->crc32) {
|
|
820 fprintf(stderr, "Block CRC32 failure\n");
|
|
821 free(b->data);
|
|
822 free(b);
|
|
823 return NULL;
|
|
824 }
|
|
825 }
|
|
826
|
|
827 b->orig_method = b->method;
|
|
828 b->idx = 0;
|
|
829 b->byte = 0;
|
|
830 b->bit = 7; // MSB
|
|
831
|
|
832 return b;
|
|
833 }
|
|
834
|
|
835 /*
|
|
836 * Writes a CRAM block.
|
|
837 * Returns 0 on success
|
|
838 * -1 on failure
|
|
839 */
|
|
840 int cram_write_block(cram_fd *fd, cram_block *b) {
|
|
841 assert(b->method != RAW || (b->comp_size == b->uncomp_size));
|
|
842
|
|
843 if (hputc(b->method, fd->fp) == EOF) return -1;
|
|
844 if (hputc(b->content_type, fd->fp) == EOF) return -1;
|
|
845 if (itf8_encode(fd, b->content_id) == -1) return -1;
|
|
846 if (itf8_encode(fd, b->comp_size) == -1) return -1;
|
|
847 if (itf8_encode(fd, b->uncomp_size) == -1) return -1;
|
|
848
|
|
849 if (b->method == RAW) {
|
|
850 if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
|
|
851 return -1;
|
|
852 } else {
|
|
853 if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
|
|
854 return -1;
|
|
855 }
|
|
856
|
|
857 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
|
|
858 unsigned char dat[100], *cp = dat;;
|
|
859 uint32_t crc;
|
|
860
|
|
861 *cp++ = b->method;
|
|
862 *cp++ = b->content_type;
|
|
863 cp += itf8_put(cp, b->content_id);
|
|
864 cp += itf8_put(cp, b->comp_size);
|
|
865 cp += itf8_put(cp, b->uncomp_size);
|
|
866 crc = crc32(0L, dat, cp-dat);
|
|
867
|
|
868 if (b->method == RAW) {
|
|
869 b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
|
|
870 } else {
|
|
871 b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
|
|
872 }
|
|
873
|
|
874 if (-1 == int32_encode(fd, b->crc32))
|
|
875 return -1;
|
|
876 }
|
|
877
|
|
878 return 0;
|
|
879 }
|
|
880
|
|
881 /*
|
|
882 * Frees a CRAM block, deallocating internal data too.
|
|
883 */
|
|
884 void cram_free_block(cram_block *b) {
|
|
885 if (!b)
|
|
886 return;
|
|
887 if (b->data)
|
|
888 free(b->data);
|
|
889 free(b);
|
|
890 }
|
|
891
|
|
892 /*
|
|
893 * Uncompresses a CRAM block, if compressed.
|
|
894 */
|
|
895 int cram_uncompress_block(cram_block *b) {
|
|
896 char *uncomp;
|
|
897 size_t uncomp_size = 0;
|
|
898
|
|
899 if (b->uncomp_size == 0) {
|
|
900 // blank block
|
|
901 b->method = RAW;
|
|
902 return 0;
|
|
903 }
|
|
904
|
|
905 switch (b->method) {
|
|
906 case RAW:
|
|
907 return 0;
|
|
908
|
|
909 case GZIP:
|
|
910 uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
|
|
911 if (!uncomp)
|
|
912 return -1;
|
|
913 if ((int)uncomp_size != b->uncomp_size) {
|
|
914 free(uncomp);
|
|
915 return -1;
|
|
916 }
|
|
917 free(b->data);
|
|
918 b->data = (unsigned char *)uncomp;
|
|
919 b->alloc = uncomp_size;
|
|
920 b->method = RAW;
|
|
921 break;
|
|
922
|
|
923 #ifdef HAVE_LIBBZ2
|
|
924 case BZIP2: {
|
|
925 unsigned int usize = b->uncomp_size;
|
|
926 if (!(uncomp = malloc(usize)))
|
|
927 return -1;
|
|
928 if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
|
|
929 (char *)b->data, b->comp_size,
|
|
930 0, 0)) {
|
|
931 free(uncomp);
|
|
932 return -1;
|
|
933 }
|
|
934 free(b->data);
|
|
935 b->data = (unsigned char *)uncomp;
|
|
936 b->alloc = usize;
|
|
937 b->method = RAW;
|
|
938 b->uncomp_size = usize; // Just incase it differs
|
|
939 break;
|
|
940 }
|
|
941 #else
|
|
942 case BZIP2:
|
|
943 fprintf(stderr, "Bzip2 compression is not compiled into this "
|
|
944 "version.\nPlease rebuild and try again.\n");
|
|
945 return -1;
|
|
946 #endif
|
|
947
|
|
948 #ifdef HAVE_LIBLZMA
|
|
949 case LZMA:
|
|
950 uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
|
|
951 if (!uncomp)
|
|
952 return -1;
|
|
953 if ((int)uncomp_size != b->uncomp_size)
|
|
954 return -1;
|
|
955 free(b->data);
|
|
956 b->data = (unsigned char *)uncomp;
|
|
957 b->alloc = uncomp_size;
|
|
958 b->method = RAW;
|
|
959 break;
|
|
960 #else
|
|
961 case LZMA:
|
|
962 fprintf(stderr, "Lzma compression is not compiled into this "
|
|
963 "version.\nPlease rebuild and try again.\n");
|
|
964 return -1;
|
|
965 break;
|
|
966 #endif
|
|
967
|
|
968 case RANS: {
|
|
969 unsigned int usize = b->uncomp_size, usize2;
|
|
970 uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
|
|
971 assert(usize == usize2);
|
|
972 free(b->data);
|
|
973 b->data = (unsigned char *)uncomp;
|
|
974 b->alloc = usize2;
|
|
975 b->method = RAW;
|
|
976 b->uncomp_size = usize2; // Just incase it differs
|
|
977 //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
|
|
978 break;
|
|
979 }
|
|
980
|
|
981 default:
|
|
982 return -1;
|
|
983 }
|
|
984
|
|
985 return 0;
|
|
986 }
|
|
987
|
|
988 static char *cram_compress_by_method(char *in, size_t in_size,
|
|
989 size_t *out_size,
|
|
990 enum cram_block_method method,
|
|
991 int level, int strat) {
|
|
992 switch (method) {
|
|
993 case GZIP:
|
|
994 return zlib_mem_deflate(in, in_size, out_size, level, strat);
|
|
995
|
|
996 case BZIP2: {
|
|
997 #ifdef HAVE_LIBBZ2
|
|
998 unsigned int comp_size = in_size*1.01 + 600;
|
|
999 char *comp = malloc(comp_size);
|
|
1000 if (!comp)
|
|
1001 return NULL;
|
|
1002
|
|
1003 if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
|
|
1004 in, in_size,
|
|
1005 level, 0, 30)) {
|
|
1006 free(comp);
|
|
1007 return NULL;
|
|
1008 }
|
|
1009 *out_size = comp_size;
|
|
1010 return comp;
|
|
1011 #else
|
|
1012 return NULL;
|
|
1013 #endif
|
|
1014 }
|
|
1015
|
|
1016 case LZMA:
|
|
1017 #ifdef HAVE_LIBLZMA
|
|
1018 return lzma_mem_deflate(in, in_size, out_size, level);
|
|
1019 #else
|
|
1020 return NULL;
|
|
1021 #endif
|
|
1022
|
|
1023 case RANS0: {
|
|
1024 unsigned int out_size_i;
|
|
1025 unsigned char *cp;
|
|
1026 cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 0);
|
|
1027 *out_size = out_size_i;
|
|
1028 return (char *)cp;
|
|
1029 }
|
|
1030
|
|
1031 case RANS1: {
|
|
1032 unsigned int out_size_i;
|
|
1033 unsigned char *cp;
|
|
1034
|
|
1035 cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 1);
|
|
1036 *out_size = out_size_i;
|
|
1037 return (char *)cp;
|
|
1038 }
|
|
1039
|
|
1040 case RAW:
|
|
1041 break;
|
|
1042
|
|
1043 default:
|
|
1044 return NULL;
|
|
1045 }
|
|
1046
|
|
1047 return NULL;
|
|
1048 }
|
|
1049
|
|
1050
|
|
1051 /*
|
|
1052 * Compresses a block using one of two different zlib strategies. If we only
|
|
1053 * want one choice set strat2 to be -1.
|
|
1054 *
|
|
1055 * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
|
|
1056 * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
|
|
1057 * significantly faster.
|
|
1058 */
|
|
1059 int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
|
|
1060 int method, int level) {
|
|
1061
|
|
1062 char *comp = NULL;
|
|
1063 size_t comp_size = 0;
|
|
1064 int strat;
|
|
1065
|
|
1066 //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
|
|
1067
|
|
1068 if (method == RAW || level == 0 || b->uncomp_size == 0) {
|
|
1069 b->method = RAW;
|
|
1070 b->comp_size = b->uncomp_size;
|
|
1071 //fprintf(stderr, "Skip block id %d\n", b->content_id);
|
|
1072 return 0;
|
|
1073 }
|
|
1074
|
|
1075 if (metrics) {
|
|
1076 pthread_mutex_lock(&fd->metrics_lock);
|
|
1077 if (metrics->trial > 0 || --metrics->next_trial <= 0) {
|
|
1078 size_t sz_best = INT_MAX;
|
|
1079 size_t sz_gz_rle = 0;
|
|
1080 size_t sz_gz_def = 0;
|
|
1081 size_t sz_rans0 = 0;
|
|
1082 size_t sz_rans1 = 0;
|
|
1083 size_t sz_bzip2 = 0;
|
|
1084 size_t sz_lzma = 0;
|
|
1085 int method_best = 0;
|
|
1086 char *c_best = NULL, *c = NULL;
|
|
1087
|
|
1088 if (metrics->revised_method)
|
|
1089 method = metrics->revised_method;
|
|
1090 else
|
|
1091 metrics->revised_method = method;
|
|
1092
|
|
1093 if (metrics->next_trial == 0) {
|
|
1094 metrics->next_trial = TRIAL_SPAN;
|
|
1095 metrics->trial = NTRIALS;
|
|
1096 metrics->sz_gz_rle /= 2;
|
|
1097 metrics->sz_gz_def /= 2;
|
|
1098 metrics->sz_rans0 /= 2;
|
|
1099 metrics->sz_rans1 /= 2;
|
|
1100 metrics->sz_bzip2 /= 2;
|
|
1101 metrics->sz_lzma /= 2;
|
|
1102 }
|
|
1103
|
|
1104 pthread_mutex_unlock(&fd->metrics_lock);
|
|
1105
|
|
1106 if (method & (1<<GZIP_RLE)) {
|
|
1107 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
|
|
1108 &sz_gz_rle, GZIP, 1, Z_RLE);
|
|
1109 if (c && sz_best > sz_gz_rle) {
|
|
1110 sz_best = sz_gz_rle;
|
|
1111 method_best = GZIP_RLE;
|
|
1112 if (c_best)
|
|
1113 free(c_best);
|
|
1114 c_best = c;
|
|
1115 } else if (c) {
|
|
1116 free(c);
|
|
1117 } else {
|
|
1118 sz_gz_rle = b->uncomp_size*2+1000;
|
|
1119 }
|
|
1120
|
|
1121 //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_rle);
|
|
1122 }
|
|
1123
|
|
1124 if (method & (1<<GZIP)) {
|
|
1125 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
|
|
1126 &sz_gz_def, GZIP, level,
|
|
1127 Z_FILTERED);
|
|
1128 if (c && sz_best > sz_gz_def) {
|
|
1129 sz_best = sz_gz_def;
|
|
1130 method_best = GZIP;
|
|
1131 if (c_best)
|
|
1132 free(c_best);
|
|
1133 c_best = c;
|
|
1134 } else if (c) {
|
|
1135 free(c);
|
|
1136 } else {
|
|
1137 sz_gz_def = b->uncomp_size*2+1000;
|
|
1138 }
|
|
1139
|
|
1140 //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_def);
|
|
1141 }
|
|
1142
|
|
1143 if (method & (1<<RANS0)) {
|
|
1144 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
|
|
1145 &sz_rans0, RANS0, 0, 0);
|
|
1146 if (c && sz_best > sz_rans0) {
|
|
1147 sz_best = sz_rans0;
|
|
1148 method_best = RANS0;
|
|
1149 if (c_best)
|
|
1150 free(c_best);
|
|
1151 c_best = c;
|
|
1152 } else if (c) {
|
|
1153 free(c);
|
|
1154 } else {
|
|
1155 sz_rans0 = b->uncomp_size*2+1000;
|
|
1156 }
|
|
1157 }
|
|
1158
|
|
1159 if (method & (1<<RANS1)) {
|
|
1160 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
|
|
1161 &sz_rans1, RANS1, 0, 0);
|
|
1162 if (c && sz_best > sz_rans1) {
|
|
1163 sz_best = sz_rans1;
|
|
1164 method_best = RANS1;
|
|
1165 if (c_best)
|
|
1166 free(c_best);
|
|
1167 c_best = c;
|
|
1168 } else if (c) {
|
|
1169 free(c);
|
|
1170 } else {
|
|
1171 sz_rans1 = b->uncomp_size*2+1000;
|
|
1172 }
|
|
1173 }
|
|
1174
|
|
1175 if (method & (1<<BZIP2)) {
|
|
1176 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
|
|
1177 &sz_bzip2, BZIP2, level, 0);
|
|
1178 if (c && sz_best > sz_bzip2) {
|
|
1179 sz_best = sz_bzip2;
|
|
1180 method_best = BZIP2;
|
|
1181 if (c_best)
|
|
1182 free(c_best);
|
|
1183 c_best = c;
|
|
1184 } else if (c) {
|
|
1185 free(c);
|
|
1186 } else {
|
|
1187 sz_bzip2 = b->uncomp_size*2+1000;
|
|
1188 }
|
|
1189 }
|
|
1190
|
|
1191 if (method & (1<<LZMA)) {
|
|
1192 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
|
|
1193 &sz_lzma, LZMA, level, 0);
|
|
1194 if (c && sz_best > sz_lzma) {
|
|
1195 sz_best = sz_lzma;
|
|
1196 method_best = LZMA;
|
|
1197 if (c_best)
|
|
1198 free(c_best);
|
|
1199 c_best = c;
|
|
1200 } else if (c) {
|
|
1201 free(c);
|
|
1202 } else {
|
|
1203 sz_lzma = b->uncomp_size*2+1000;
|
|
1204 }
|
|
1205 }
|
|
1206
|
|
1207 //fprintf(stderr, "sz_best = %d\n", sz_best);
|
|
1208
|
|
1209 free(b->data);
|
|
1210 b->data = (unsigned char *)c_best;
|
|
1211 //printf("method_best = %s\n", cram_block_method2str(method_best));
|
|
1212 b->method = method_best == GZIP_RLE ? GZIP : method_best;
|
|
1213 b->comp_size = sz_best;
|
|
1214
|
|
1215 pthread_mutex_lock(&fd->metrics_lock);
|
|
1216 metrics->sz_gz_rle += sz_gz_rle;
|
|
1217 metrics->sz_gz_def += sz_gz_def;
|
|
1218 metrics->sz_rans0 += sz_rans0;
|
|
1219 metrics->sz_rans1 += sz_rans1;
|
|
1220 metrics->sz_bzip2 += sz_bzip2;
|
|
1221 metrics->sz_lzma += sz_lzma;
|
|
1222 if (--metrics->trial == 0) {
|
|
1223 int best_method = RAW;
|
|
1224 int best_sz = INT_MAX;
|
|
1225
|
|
1226 // Scale methods by cost
|
|
1227 if (fd->level <= 3) {
|
|
1228 metrics->sz_rans1 *= 1.02;
|
|
1229 metrics->sz_gz_def *= 1.04;
|
|
1230 metrics->sz_bzip2 *= 1.08;
|
|
1231 metrics->sz_lzma *= 1.10;
|
|
1232 } else if (fd->level <= 6) {
|
|
1233 metrics->sz_rans1 *= 1.01;
|
|
1234 metrics->sz_gz_def *= 1.02;
|
|
1235 metrics->sz_bzip2 *= 1.03;
|
|
1236 metrics->sz_lzma *= 1.05;
|
|
1237 }
|
|
1238
|
|
1239 if (method & (1<<GZIP_RLE) && best_sz > metrics->sz_gz_rle)
|
|
1240 best_sz = metrics->sz_gz_rle, best_method = GZIP_RLE;
|
|
1241
|
|
1242 if (method & (1<<GZIP) && best_sz > metrics->sz_gz_def)
|
|
1243 best_sz = metrics->sz_gz_def, best_method = GZIP;
|
|
1244
|
|
1245 if (method & (1<<RANS0) && best_sz > metrics->sz_rans0)
|
|
1246 best_sz = metrics->sz_rans0, best_method = RANS0;
|
|
1247
|
|
1248 if (method & (1<<RANS1) && best_sz > metrics->sz_rans1)
|
|
1249 best_sz = metrics->sz_rans1, best_method = RANS1;
|
|
1250
|
|
1251 if (method & (1<<BZIP2) && best_sz > metrics->sz_bzip2)
|
|
1252 best_sz = metrics->sz_bzip2, best_method = BZIP2;
|
|
1253
|
|
1254 if (method & (1<<LZMA) && best_sz > metrics->sz_lzma)
|
|
1255 best_sz = metrics->sz_lzma, best_method = LZMA;
|
|
1256
|
|
1257 if (best_method == GZIP_RLE) {
|
|
1258 metrics->method = GZIP;
|
|
1259 metrics->strat = Z_RLE;
|
|
1260 } else {
|
|
1261 metrics->method = best_method;
|
|
1262 metrics->strat = Z_FILTERED;
|
|
1263 }
|
|
1264
|
|
1265 // If we see at least MAXFAIL trials in a row for a specific
|
|
1266 // compression method with more than MAXDELTA aggregate
|
|
1267 // size then we drop this from the list of methods used
|
|
1268 // for this block type.
|
|
1269 #define MAXDELTA 0.20
|
|
1270 #define MAXFAILS 4
|
|
1271 if (best_method == GZIP_RLE) {
|
|
1272 metrics->gz_rle_cnt = 0;
|
|
1273 metrics->gz_rle_extra = 0;
|
|
1274 } else if (best_sz < metrics->sz_gz_rle) {
|
|
1275 double r = (double)metrics->sz_gz_rle / best_sz - 1;
|
|
1276 if (++metrics->gz_rle_cnt >= MAXFAILS &&
|
|
1277 (metrics->gz_rle_extra += r) >= MAXDELTA)
|
|
1278 method &= ~(1<<GZIP_RLE);
|
|
1279 }
|
|
1280
|
|
1281 if (best_method == GZIP) {
|
|
1282 metrics->gz_def_cnt = 0;
|
|
1283 metrics->gz_def_extra = 0;
|
|
1284 } else if (best_sz < metrics->sz_gz_def) {
|
|
1285 double r = (double)metrics->sz_gz_def / best_sz - 1;
|
|
1286 if (++metrics->gz_def_cnt >= MAXFAILS &&
|
|
1287 (metrics->gz_def_extra += r) >= MAXDELTA)
|
|
1288 method &= ~(1<<GZIP);
|
|
1289 }
|
|
1290
|
|
1291 if (best_method == RANS0) {
|
|
1292 metrics->rans0_cnt = 0;
|
|
1293 metrics->rans0_extra = 0;
|
|
1294 } else if (best_sz < metrics->sz_rans0) {
|
|
1295 double r = (double)metrics->sz_rans0 / best_sz - 1;
|
|
1296 if (++metrics->rans0_cnt >= MAXFAILS &&
|
|
1297 (metrics->rans0_extra += r) >= MAXDELTA)
|
|
1298 method &= ~(1<<RANS0);
|
|
1299 }
|
|
1300
|
|
1301 if (best_method == RANS1) {
|
|
1302 metrics->rans1_cnt = 0;
|
|
1303 metrics->rans1_extra = 0;
|
|
1304 } else if (best_sz < metrics->sz_rans1) {
|
|
1305 double r = (double)metrics->sz_rans1 / best_sz - 1;
|
|
1306 if (++metrics->rans1_cnt >= MAXFAILS &&
|
|
1307 (metrics->rans1_extra += r) >= MAXDELTA)
|
|
1308 method &= ~(1<<RANS1);
|
|
1309 }
|
|
1310
|
|
1311 if (best_method == BZIP2) {
|
|
1312 metrics->bzip2_cnt = 0;
|
|
1313 metrics->bzip2_extra = 0;
|
|
1314 } else if (best_sz < metrics->sz_bzip2) {
|
|
1315 double r = (double)metrics->sz_bzip2 / best_sz - 1;
|
|
1316 if (++metrics->bzip2_cnt >= MAXFAILS &&
|
|
1317 (metrics->bzip2_extra += r) >= MAXDELTA)
|
|
1318 method &= ~(1<<BZIP2);
|
|
1319 }
|
|
1320
|
|
1321 if (best_method == LZMA) {
|
|
1322 metrics->lzma_cnt = 0;
|
|
1323 metrics->lzma_extra = 0;
|
|
1324 } else if (best_sz < metrics->sz_lzma) {
|
|
1325 double r = (double)metrics->sz_lzma / best_sz - 1;
|
|
1326 if (++metrics->lzma_cnt >= MAXFAILS &&
|
|
1327 (metrics->lzma_extra += r) >= MAXDELTA)
|
|
1328 method &= ~(1<<LZMA);
|
|
1329 }
|
|
1330
|
|
1331 //if (method != metrics->revised_method)
|
|
1332 // fprintf(stderr, "%d: method from %x to %x\n",
|
|
1333 // b->content_id, metrics->revised_method, method);
|
|
1334 metrics->revised_method = method;
|
|
1335 }
|
|
1336 pthread_mutex_unlock(&fd->metrics_lock);
|
|
1337 } else {
|
|
1338 strat = metrics->strat;
|
|
1339 method = metrics->method;
|
|
1340
|
|
1341 pthread_mutex_unlock(&fd->metrics_lock);
|
|
1342 comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
|
|
1343 &comp_size, method,
|
|
1344 level, strat);
|
|
1345 if (!comp)
|
|
1346 return -1;
|
|
1347 free(b->data);
|
|
1348 b->data = (unsigned char *)comp;
|
|
1349 b->comp_size = comp_size;
|
|
1350 b->method = method;
|
|
1351 }
|
|
1352
|
|
1353 } else {
|
|
1354 // no cached metrics, so just do zlib?
|
|
1355 comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
|
|
1356 &comp_size, GZIP, level, Z_FILTERED);
|
|
1357 if (!comp) {
|
|
1358 fprintf(stderr, "Compression failed!\n");
|
|
1359 return -1;
|
|
1360 }
|
|
1361 free(b->data);
|
|
1362 b->data = (unsigned char *)comp;
|
|
1363 b->comp_size = comp_size;
|
|
1364 b->method = GZIP;
|
|
1365 }
|
|
1366
|
|
1367 if (fd->verbose)
|
|
1368 fprintf(stderr, "Compressed block ID %d from %d to %d by method %s\n",
|
|
1369 b->content_id, b->uncomp_size, b->comp_size,
|
|
1370 cram_block_method2str(b->method));
|
|
1371
|
|
1372 if (b->method == RANS1)
|
|
1373 b->method = RANS0; // Spec just has RANS (not 0/1) with auto-sensing
|
|
1374
|
|
1375 return 0;
|
|
1376 }
|
|
1377
|
|
1378 cram_metrics *cram_new_metrics(void) {
|
|
1379 cram_metrics *m = calloc(1, sizeof(*m));
|
|
1380 if (!m)
|
|
1381 return NULL;
|
|
1382 m->trial = NTRIALS-1;
|
|
1383 m->next_trial = TRIAL_SPAN;
|
|
1384 m->method = RAW;
|
|
1385 m->strat = 0;
|
|
1386 m->revised_method = 0;
|
|
1387
|
|
1388 return m;
|
|
1389 }
|
|
1390
|
|
1391 char *cram_block_method2str(enum cram_block_method m) {
|
|
1392 switch(m) {
|
|
1393 case RAW: return "RAW";
|
|
1394 case GZIP: return "GZIP";
|
|
1395 case BZIP2: return "BZIP2";
|
|
1396 case LZMA: return "LZMA";
|
|
1397 case RANS0: return "RANS0";
|
|
1398 case RANS1: return "RANS1";
|
|
1399 case GZIP_RLE: return "GZIP_RLE";
|
|
1400 case ERROR: break;
|
|
1401 }
|
|
1402 return "?";
|
|
1403 }
|
|
1404
|
|
1405 char *cram_content_type2str(enum cram_content_type t) {
|
|
1406 switch (t) {
|
|
1407 case FILE_HEADER: return "FILE_HEADER";
|
|
1408 case COMPRESSION_HEADER: return "COMPRESSION_HEADER";
|
|
1409 case MAPPED_SLICE: return "MAPPED_SLICE";
|
|
1410 case UNMAPPED_SLICE: return "UNMAPPED_SLICE";
|
|
1411 case EXTERNAL: return "EXTERNAL";
|
|
1412 case CORE: return "CORE";
|
|
1413 case CT_ERROR: break;
|
|
1414 }
|
|
1415 return "?";
|
|
1416 }
|
|
1417
|
|
1418 /*
|
|
1419 * Extra error checking on fclose to really ensure data is written.
|
|
1420 * Care needs to be taken to handle pipes vs real files.
|
|
1421 *
|
|
1422 * Returns 0 on success
|
|
1423 * -1 on failure.
|
|
1424 */
|
|
1425 int paranoid_fclose(FILE *fp) {
|
|
1426 if (-1 == fflush(fp) && errno != EBADF) {
|
|
1427 fclose(fp);
|
|
1428 return -1;
|
|
1429 }
|
|
1430
|
|
1431 errno = 0;
|
|
1432 if (-1 == fsync(fileno(fp))) {
|
|
1433 if (errno != EINVAL) { // eg pipe
|
|
1434 fclose(fp);
|
|
1435 return -1;
|
|
1436 }
|
|
1437 }
|
|
1438 return fclose(fp);
|
|
1439 }
|
|
1440
|
|
1441 /* ----------------------------------------------------------------------
|
|
1442 * Reference sequence handling
|
|
1443 *
|
|
1444 * These revolve around the refs_t structure, which may potentially be
|
|
1445 * shared between multiple cram_fd.
|
|
1446 *
|
|
1447 * We start with refs_create() to allocate an empty refs_t and then
|
|
1448 * populate it with @SQ line data using refs_from_header(). This is done on
|
|
1449 * cram_open(). Also at start up we can call cram_load_reference() which
|
|
1450 * is used with "scramble -r foo.fa". This replaces the fd->refs with the
|
|
1451 * new one specified. In either case refs2id() is then called which
|
|
1452 * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
|
|
1453 *
|
|
1454 * Later, possibly within a thread, we will want to know the actual ref
|
|
1455 * seq itself, obtained by calling cram_get_ref(). This may use the
|
|
1456 * UR: or M5: fields or the filename specified in the original
|
|
1457 * cram_load_reference() call.
|
|
1458 *
|
|
1459 * Given the potential for multi-threaded reference usage, we have
|
|
1460 * reference counting (sorry for the confusing double use of "ref") to
|
|
1461 * track the number of callers interested in any specific reference.
|
|
1462 */
|
|
1463
|
|
1464 void refs_free(refs_t *r) {
|
|
1465 RP("refs_free()\n");
|
|
1466
|
|
1467 if (--r->count > 0)
|
|
1468 return;
|
|
1469
|
|
1470 if (!r)
|
|
1471 return;
|
|
1472
|
|
1473 if (r->pool)
|
|
1474 string_pool_destroy(r->pool);
|
|
1475
|
|
1476 if (r->h_meta) {
|
|
1477 khint_t k;
|
|
1478
|
|
1479 for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
|
|
1480 ref_entry *e;
|
|
1481
|
|
1482 if (!kh_exist(r->h_meta, k))
|
|
1483 continue;
|
|
1484 if (!(e = kh_val(r->h_meta, k)))
|
|
1485 continue;
|
|
1486 if (e->seq)
|
|
1487 free(e->seq);
|
|
1488 free(e);
|
|
1489 }
|
|
1490
|
|
1491 kh_destroy(refs, r->h_meta);
|
|
1492 }
|
|
1493
|
|
1494 if (r->ref_id)
|
|
1495 free(r->ref_id);
|
|
1496
|
|
1497 if (r->fp)
|
|
1498 bgzf_close(r->fp);
|
|
1499
|
|
1500 pthread_mutex_destroy(&r->lock);
|
|
1501
|
|
1502 free(r);
|
|
1503 }
|
|
1504
|
|
1505 static refs_t *refs_create(void) {
|
|
1506 refs_t *r = calloc(1, sizeof(*r));
|
|
1507
|
|
1508 RP("refs_create()\n");
|
|
1509
|
|
1510 if (!r)
|
|
1511 return NULL;
|
|
1512
|
|
1513 if (!(r->pool = string_pool_create(8192)))
|
|
1514 goto err;
|
|
1515
|
|
1516 r->ref_id = NULL; // see refs2id() to populate.
|
|
1517 r->count = 1;
|
|
1518 r->last = NULL;
|
|
1519 r->last_id = -1;
|
|
1520
|
|
1521 if (!(r->h_meta = kh_init(refs)))
|
|
1522 goto err;
|
|
1523
|
|
1524 pthread_mutex_init(&r->lock, NULL);
|
|
1525
|
|
1526 return r;
|
|
1527
|
|
1528 err:
|
|
1529 refs_free(r);
|
|
1530 return NULL;
|
|
1531 }
|
|
1532
|
|
1533 /*
|
|
1534 * Opens a reference fasta file as a BGZF stream, allowing for
|
|
1535 * compressed files. It automatically builds a .fai file if
|
|
1536 * required and if compressed a .gzi bgzf index too.
|
|
1537 *
|
|
1538 * Returns a BGZF handle on success;
|
|
1539 * NULL on failure.
|
|
1540 */
|
|
1541 static BGZF *bgzf_open_ref(char *fn, char *mode) {
|
|
1542 BGZF *fp;
|
|
1543 char fai_file[PATH_MAX];
|
|
1544
|
|
1545 snprintf(fai_file, PATH_MAX, "%s.fai", fn);
|
|
1546 if (access(fai_file, R_OK) != 0)
|
|
1547 if (fai_build(fn) != 0)
|
|
1548 return NULL;
|
|
1549
|
|
1550 if (!(fp = bgzf_open(fn, mode))) {
|
|
1551 perror(fn);
|
|
1552 return NULL;
|
|
1553 }
|
|
1554
|
|
1555 if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
|
|
1556 fprintf(stderr, "Unable to load .gzi index '%s.gzi'\n", fn);
|
|
1557 bgzf_close(fp);
|
|
1558 return NULL;
|
|
1559 }
|
|
1560
|
|
1561 return fp;
|
|
1562 }
|
|
1563
|
|
1564 /*
|
|
1565 * Loads a FAI file for a reference.fasta.
|
|
1566 * "is_err" indicates whether failure to load is worthy of emitting an
|
|
1567 * error message. In some cases (eg with embedded references) we
|
|
1568 * speculatively load, just incase, and silently ignore errors.
|
|
1569 *
|
|
1570 * Returns the refs_t struct on success (maybe newly allocated);
|
|
1571 * NULL on failure
|
|
1572 */
|
|
1573 static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) {
|
|
1574 struct stat sb;
|
|
1575 FILE *fp = NULL;
|
|
1576 char fai_fn[PATH_MAX];
|
|
1577 char line[8192];
|
|
1578 refs_t *r = r_orig;
|
|
1579 size_t fn_l = strlen(fn);
|
|
1580 int id = 0, id_alloc = 0;
|
|
1581
|
|
1582 RP("refs_load_fai %s\n", fn);
|
|
1583
|
|
1584 if (!r)
|
|
1585 if (!(r = refs_create()))
|
|
1586 goto err;
|
|
1587
|
|
1588 /* Open reference, for later use */
|
|
1589 if (stat(fn, &sb) != 0) {
|
|
1590 if (is_err)
|
|
1591 perror(fn);
|
|
1592 goto err;
|
|
1593 }
|
|
1594
|
|
1595 if (r->fp)
|
|
1596 if (bgzf_close(r->fp) != 0)
|
|
1597 goto err;
|
|
1598 r->fp = NULL;
|
|
1599
|
|
1600 if (!(r->fn = string_dup(r->pool, fn)))
|
|
1601 goto err;
|
|
1602
|
|
1603 if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0)
|
|
1604 r->fn[fn_l-4] = 0;
|
|
1605
|
|
1606 if (!(r->fp = bgzf_open_ref(r->fn, "r")))
|
|
1607 goto err;
|
|
1608
|
|
1609 /* Parse .fai file and load meta-data */
|
|
1610 sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, r->fn);
|
|
1611
|
|
1612 if (stat(fai_fn, &sb) != 0) {
|
|
1613 if (is_err)
|
|
1614 perror(fai_fn);
|
|
1615 goto err;
|
|
1616 }
|
|
1617 if (!(fp = fopen(fai_fn, "r"))) {
|
|
1618 if (is_err)
|
|
1619 perror(fai_fn);
|
|
1620 goto err;
|
|
1621 }
|
|
1622 while (fgets(line, 8192, fp) != NULL) {
|
|
1623 ref_entry *e = malloc(sizeof(*e));
|
|
1624 char *cp;
|
|
1625 int n;
|
|
1626 khint_t k;
|
|
1627
|
|
1628 if (!e)
|
|
1629 return NULL;
|
|
1630
|
|
1631 // id
|
|
1632 for (cp = line; *cp && !isspace(*cp); cp++)
|
|
1633 ;
|
|
1634 *cp++ = 0;
|
|
1635 e->name = string_dup(r->pool, line);
|
|
1636
|
|
1637 // length
|
|
1638 while (*cp && isspace(*cp))
|
|
1639 cp++;
|
|
1640 e->length = strtoll(cp, &cp, 10);
|
|
1641
|
|
1642 // offset
|
|
1643 while (*cp && isspace(*cp))
|
|
1644 cp++;
|
|
1645 e->offset = strtoll(cp, &cp, 10);
|
|
1646
|
|
1647 // bases per line
|
|
1648 while (*cp && isspace(*cp))
|
|
1649 cp++;
|
|
1650 e->bases_per_line = strtol(cp, &cp, 10);
|
|
1651
|
|
1652 // line length
|
|
1653 while (*cp && isspace(*cp))
|
|
1654 cp++;
|
|
1655 e->line_length = strtol(cp, &cp, 10);
|
|
1656
|
|
1657 // filename
|
|
1658 e->fn = r->fn;
|
|
1659
|
|
1660 e->count = 0;
|
|
1661 e->seq = NULL;
|
|
1662
|
|
1663 k = kh_put(refs, r->h_meta, e->name, &n);
|
|
1664 if (-1 == n) {
|
|
1665 free(e);
|
|
1666 return NULL;
|
|
1667 }
|
|
1668
|
|
1669 if (n) {
|
|
1670 kh_val(r->h_meta, k) = e;
|
|
1671 } else {
|
|
1672 ref_entry *re = kh_val(r->h_meta, k);
|
|
1673 if (re && (re->count != 0 || re->length != 0)) {
|
|
1674 /* Keep old */
|
|
1675 free(e);
|
|
1676 } else {
|
|
1677 /* Replace old */
|
|
1678 if (re)
|
|
1679 free(re);
|
|
1680 kh_val(r->h_meta, k) = e;
|
|
1681 }
|
|
1682 }
|
|
1683
|
|
1684 if (id >= id_alloc) {
|
|
1685 int x;
|
|
1686
|
|
1687 id_alloc = id_alloc ?id_alloc*2 : 16;
|
|
1688 r->ref_id = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
|
|
1689
|
|
1690 for (x = id; x < id_alloc; x++)
|
|
1691 r->ref_id[x] = NULL;
|
|
1692 }
|
|
1693 r->ref_id[id] = e;
|
|
1694 r->nref = ++id;
|
|
1695 }
|
|
1696
|
|
1697 return r;
|
|
1698
|
|
1699 err:
|
|
1700 if (fp)
|
|
1701 fclose(fp);
|
|
1702
|
|
1703 if (!r_orig)
|
|
1704 refs_free(r);
|
|
1705
|
|
1706 return NULL;
|
|
1707 }
|
|
1708
|
|
1709 /*
|
|
1710 * Indexes references by the order they appear in a BAM file. This may not
|
|
1711 * necessarily be the same order they appear in the fasta reference file.
|
|
1712 *
|
|
1713 * Returns 0 on success
|
|
1714 * -1 on failure
|
|
1715 */
|
|
1716 int refs2id(refs_t *r, SAM_hdr *h) {
|
|
1717 int i;
|
|
1718
|
|
1719 if (r->ref_id)
|
|
1720 free(r->ref_id);
|
|
1721 if (r->last)
|
|
1722 r->last = NULL;
|
|
1723
|
|
1724 r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
|
|
1725 if (!r->ref_id)
|
|
1726 return -1;
|
|
1727
|
|
1728 r->nref = h->nref;
|
|
1729 for (i = 0; i < h->nref; i++) {
|
|
1730 khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
|
|
1731 if (k != kh_end(r->h_meta)) {
|
|
1732 r->ref_id[i] = kh_val(r->h_meta, k);
|
|
1733 } else {
|
|
1734 fprintf(stderr, "Unable to find ref name '%s'\n",
|
|
1735 h->ref[i].name);
|
|
1736 }
|
|
1737 }
|
|
1738
|
|
1739 return 0;
|
|
1740 }
|
|
1741
|
|
1742 /*
|
|
1743 * Generates refs_t entries based on @SQ lines in the header.
|
|
1744 * Returns 0 on success
|
|
1745 * -1 on failure
|
|
1746 */
|
|
1747 static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) {
|
|
1748 int i, j;
|
|
1749
|
|
1750 if (!h || h->nref == 0)
|
|
1751 return 0;
|
|
1752
|
|
1753 //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
|
|
1754
|
|
1755 /* Existing refs are fine, as long as they're compatible with the hdr. */
|
|
1756 if (!(r->ref_id = realloc(r->ref_id, (r->nref + h->nref) * sizeof(*r->ref_id))))
|
|
1757 return -1;
|
|
1758
|
|
1759 /* Copy info from h->ref[i] over to r */
|
|
1760 for (i = 0, j = r->nref; i < h->nref; i++) {
|
|
1761 SAM_hdr_type *ty;
|
|
1762 SAM_hdr_tag *tag;
|
|
1763 khint_t k;
|
|
1764 int n;
|
|
1765
|
|
1766 k = kh_get(refs, r->h_meta, h->ref[i].name);
|
|
1767 if (k != kh_end(r->h_meta))
|
|
1768 // Ref already known about
|
|
1769 continue;
|
|
1770
|
|
1771 if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
|
|
1772 return -1;
|
|
1773
|
|
1774 if (!h->ref[j].name)
|
|
1775 return -1;
|
|
1776
|
|
1777 r->ref_id[j]->name = string_dup(r->pool, h->ref[i].name);
|
|
1778 r->ref_id[j]->length = 0; // marker for not yet loaded
|
|
1779
|
|
1780 /* Initialise likely filename if known */
|
|
1781 if ((ty = sam_hdr_find(h, "SQ", "SN", h->ref[i].name))) {
|
|
1782 if ((tag = sam_hdr_find_key(h, ty, "M5", NULL))) {
|
|
1783 r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
|
|
1784 //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn);
|
|
1785 }
|
|
1786 }
|
|
1787
|
|
1788 k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
|
|
1789 if (n <= 0) // already exists or error
|
|
1790 return -1;
|
|
1791 kh_val(r->h_meta, k) = r->ref_id[j];
|
|
1792
|
|
1793 j++;
|
|
1794 }
|
|
1795 r->nref = j;
|
|
1796
|
|
1797 return 0;
|
|
1798 }
|
|
1799
|
|
1800 /*
|
|
1801 * Attaches a header to a cram_fd.
|
|
1802 *
|
|
1803 * This should be used when creating a new cram_fd for writing where
|
|
1804 * we have an SAM_hdr already constructed (eg from a file we've read
|
|
1805 * in).
|
|
1806 */
|
|
1807 int cram_set_header(cram_fd *fd, SAM_hdr *hdr) {
|
|
1808 if (fd->header)
|
|
1809 sam_hdr_free(fd->header);
|
|
1810 fd->header = hdr;
|
|
1811 return refs_from_header(fd->refs, fd, hdr);
|
|
1812 }
|
|
1813
|
|
1814 /*
|
|
1815 * Converts a directory and a filename into an expanded path, replacing %s
|
|
1816 * in directory with the filename and %[0-9]+s with portions of the filename
|
|
1817 * Any remaining parts of filename are added to the end with /%s.
|
|
1818 */
|
|
1819 void expand_cache_path(char *path, char *dir, char *fn) {
|
|
1820 char *cp;
|
|
1821
|
|
1822 while ((cp = strchr(dir, '%'))) {
|
|
1823 strncpy(path, dir, cp-dir);
|
|
1824 path += cp-dir;
|
|
1825
|
|
1826 if (*++cp == 's') {
|
|
1827 strcpy(path, fn);
|
|
1828 path += strlen(fn);
|
|
1829 fn += strlen(fn);
|
|
1830 cp++;
|
|
1831 } else if (*cp >= '0' && *cp <= '9') {
|
|
1832 char *endp;
|
|
1833 long l;
|
|
1834
|
|
1835 l = strtol(cp, &endp, 10);
|
|
1836 l = MIN(l, strlen(fn));
|
|
1837 if (*endp == 's') {
|
|
1838 strncpy(path, fn, l);
|
|
1839 path += l;
|
|
1840 fn += l;
|
|
1841 *path = 0;
|
|
1842 cp = endp+1;
|
|
1843 } else {
|
|
1844 *path++ = '%';
|
|
1845 *path++ = *cp++;
|
|
1846 }
|
|
1847 } else {
|
|
1848 *path++ = '%';
|
|
1849 *path++ = *cp++;
|
|
1850 }
|
|
1851 dir = cp;
|
|
1852 }
|
|
1853 strcpy(path, dir);
|
|
1854 path += strlen(dir);
|
|
1855 if (*fn && path[-1] != '/')
|
|
1856 *path++ = '/';
|
|
1857 strcpy(path, fn);
|
|
1858 }
|
|
1859
|
|
1860 /*
|
|
1861 * Make the directory containing path and any prefix directories.
|
|
1862 */
|
|
1863 void mkdir_prefix(char *path, int mode) {
|
|
1864 char *cp = strrchr(path, '/');
|
|
1865 if (!cp)
|
|
1866 return;
|
|
1867
|
|
1868 *cp = 0;
|
|
1869 if (is_directory(path)) {
|
|
1870 *cp = '/';
|
|
1871 return;
|
|
1872 }
|
|
1873
|
|
1874 if (mkdir(path, mode) == 0) {
|
|
1875 chmod(path, mode);
|
|
1876 *cp = '/';
|
|
1877 return;
|
|
1878 }
|
|
1879
|
|
1880 mkdir_prefix(path, mode);
|
|
1881 mkdir(path, mode);
|
|
1882 chmod(path, mode);
|
|
1883 *cp = '/';
|
|
1884 }
|
|
1885
|
|
1886 /*
|
|
1887 * Return the cache directory to use, based on the first of these
|
|
1888 * environment variables to be set to a non-empty value.
|
|
1889 */
|
|
1890 static const char *get_cache_basedir(const char **extra) {
|
|
1891 char *base;
|
|
1892
|
|
1893 *extra = "";
|
|
1894
|
|
1895 base = getenv("XDG_CACHE_HOME");
|
|
1896 if (base && *base) return base;
|
|
1897
|
|
1898 base = getenv("HOME");
|
|
1899 if (base && *base) { *extra = "/.cache"; return base; }
|
|
1900
|
|
1901 base = getenv("TMPDIR");
|
|
1902 if (base && *base) return base;
|
|
1903
|
|
1904 base = getenv("TEMP");
|
|
1905 if (base && *base) return base;
|
|
1906
|
|
1907 return "/tmp";
|
|
1908 }
|
|
1909
|
|
1910 /*
|
|
1911 * Queries the M5 string from the header and attempts to populate the
|
|
1912 * reference from this using the REF_PATH environment.
|
|
1913 *
|
|
1914 * Returns 0 on sucess
|
|
1915 * -1 on failure
|
|
1916 */
|
|
1917 static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
|
|
1918 char *ref_path = getenv("REF_PATH");
|
|
1919 SAM_hdr_type *ty;
|
|
1920 SAM_hdr_tag *tag;
|
|
1921 char path[PATH_MAX], path_tmp[PATH_MAX], cache[PATH_MAX];
|
|
1922 char *local_cache = getenv("REF_CACHE");
|
|
1923 mFILE *mf;
|
|
1924
|
|
1925 if (fd->verbose)
|
|
1926 fprintf(stderr, "cram_populate_ref on fd %p, id %d\n", fd, id);
|
|
1927
|
|
1928 if (!ref_path || *ref_path == '\0') {
|
|
1929 /*
|
|
1930 * If we have no ref path, we use the EBI server.
|
|
1931 * However to avoid spamming it we require a local ref cache too.
|
|
1932 */
|
|
1933 ref_path = "http://www.ebi.ac.uk:80/ena/cram/md5/%s";
|
|
1934 if (!local_cache || *local_cache == '\0') {
|
|
1935 const char *extra;
|
|
1936 const char *base = get_cache_basedir(&extra);
|
|
1937 snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra);
|
|
1938 local_cache = cache;
|
|
1939 if (fd->verbose)
|
|
1940 fprintf(stderr, "Populating local cache: %s\n", local_cache);
|
|
1941 }
|
|
1942 }
|
|
1943
|
|
1944 if (!r->name)
|
|
1945 return -1;
|
|
1946
|
|
1947 if (!(ty = sam_hdr_find(fd->header, "SQ", "SN", r->name)))
|
|
1948 return -1;
|
|
1949
|
|
1950 if (!(tag = sam_hdr_find_key(fd->header, ty, "M5", NULL)))
|
|
1951 goto no_M5;
|
|
1952
|
|
1953 if (fd->verbose)
|
|
1954 fprintf(stderr, "Querying ref %s\n", tag->str+3);
|
|
1955
|
|
1956 /* Use cache if available */
|
|
1957 if (local_cache && *local_cache) {
|
|
1958 struct stat sb;
|
|
1959 BGZF *fp;
|
|
1960
|
|
1961 expand_cache_path(path, local_cache, tag->str+3);
|
|
1962
|
|
1963 if (0 == stat(path, &sb) && (fp = bgzf_open(path, "r"))) {
|
|
1964 r->length = sb.st_size;
|
|
1965 r->offset = r->line_length = r->bases_per_line = 0;
|
|
1966
|
|
1967 r->fn = string_dup(fd->refs->pool, path);
|
|
1968
|
|
1969 if (fd->refs->fp)
|
|
1970 if (bgzf_close(fd->refs->fp) != 0)
|
|
1971 return -1;
|
|
1972 fd->refs->fp = fp;
|
|
1973 fd->refs->fn = r->fn;
|
|
1974
|
|
1975 // Fall back to cram_get_ref() where it'll do the actual
|
|
1976 // reading of the file.
|
|
1977 return 0;
|
|
1978 }
|
|
1979 }
|
|
1980
|
|
1981 /* Otherwise search */
|
|
1982 if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
|
|
1983 size_t sz;
|
|
1984 r->seq = mfsteal(mf, &sz);
|
|
1985 r->length = sz;
|
|
1986 } else {
|
|
1987 refs_t *refs;
|
|
1988 char *fn;
|
|
1989
|
|
1990 no_M5:
|
|
1991 /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
|
|
1992 if (!(tag = sam_hdr_find_key(fd->header, ty, "UR", NULL)))
|
|
1993 return -1;
|
|
1994
|
|
1995 fn = (strncmp(tag->str+3, "file:", 5) == 0)
|
|
1996 ? tag->str+8
|
|
1997 : tag->str+3;
|
|
1998
|
|
1999 if (fd->refs->fp) {
|
|
2000 if (bgzf_close(fd->refs->fp) != 0)
|
|
2001 return -1;
|
|
2002 fd->refs->fp = NULL;
|
|
2003 }
|
|
2004 if (!(refs = refs_load_fai(fd->refs, fn, 0)))
|
|
2005 return -1;
|
|
2006 fd->refs = refs;
|
|
2007 if (fd->refs->fp) {
|
|
2008 if (bgzf_close(fd->refs->fp) != 0)
|
|
2009 return -1;
|
|
2010 fd->refs->fp = NULL;
|
|
2011 }
|
|
2012
|
|
2013 if (!fd->refs->fn)
|
|
2014 return -1;
|
|
2015
|
|
2016 if (-1 == refs2id(fd->refs, fd->header))
|
|
2017 return -1;
|
|
2018 if (!fd->refs->ref_id || !fd->refs->ref_id[id])
|
|
2019 return -1;
|
|
2020
|
|
2021 // Local copy already, so fall back to cram_get_ref().
|
|
2022 return 0;
|
|
2023 }
|
|
2024
|
|
2025 /* Populate the local disk cache if required */
|
|
2026 if (local_cache && *local_cache) {
|
|
2027 FILE *fp;
|
|
2028 int i;
|
|
2029
|
|
2030 expand_cache_path(path, local_cache, tag->str+3);
|
|
2031 if (fd->verbose)
|
|
2032 fprintf(stderr, "Path='%s'\n", path);
|
|
2033 mkdir_prefix(path, 01777);
|
|
2034
|
|
2035 i = 0;
|
|
2036 do {
|
|
2037 sprintf(path_tmp, "%s.tmp_%d", path, /*getpid(),*/ i);
|
|
2038 i++;
|
|
2039 fp = fopen(path_tmp, "wx");
|
|
2040 } while (fp == NULL && errno == EEXIST);
|
|
2041 if (!fp) {
|
|
2042 perror(path_tmp);
|
|
2043
|
|
2044 // Not fatal - we have the data already so keep going.
|
|
2045 return 0;
|
|
2046 }
|
|
2047
|
|
2048 if (r->length != fwrite(r->seq, 1, r->length, fp)) {
|
|
2049 perror(path);
|
|
2050 }
|
|
2051 if (-1 == paranoid_fclose(fp)) {
|
|
2052 unlink(path_tmp);
|
|
2053 } else {
|
|
2054 if (0 == chmod(path_tmp, 0444))
|
|
2055 rename(path_tmp, path);
|
|
2056 else
|
|
2057 unlink(path_tmp);
|
|
2058 }
|
|
2059 }
|
|
2060
|
|
2061 return 0;
|
|
2062 }
|
|
2063
|
|
2064 static void cram_ref_incr_locked(refs_t *r, int id) {
|
|
2065 RP("%d INC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count+1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
|
|
2066
|
|
2067 if (id < 0 || !r->ref_id[id]->seq)
|
|
2068 return;
|
|
2069
|
|
2070 if (r->last_id == id)
|
|
2071 r->last_id = -1;
|
|
2072
|
|
2073 ++r->ref_id[id]->count;
|
|
2074 }
|
|
2075
|
|
2076 void cram_ref_incr(refs_t *r, int id) {
|
|
2077 pthread_mutex_lock(&r->lock);
|
|
2078 cram_ref_incr_locked(r, id);
|
|
2079 pthread_mutex_unlock(&r->lock);
|
|
2080 }
|
|
2081
|
|
2082 static void cram_ref_decr_locked(refs_t *r, int id) {
|
|
2083 RP("%d DEC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count-1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
|
|
2084
|
|
2085 if (id < 0 || !r->ref_id[id]->seq) {
|
|
2086 assert(r->ref_id[id]->count >= 0);
|
|
2087 return;
|
|
2088 }
|
|
2089
|
|
2090 if (--r->ref_id[id]->count <= 0) {
|
|
2091 assert(r->ref_id[id]->count == 0);
|
|
2092 if (r->last_id >= 0) {
|
|
2093 if (r->ref_id[r->last_id]->count <= 0 &&
|
|
2094 r->ref_id[r->last_id]->seq) {
|
|
2095 RP("%d FREE REF %d (%p)\n", gettid(),
|
|
2096 r->last_id, r->ref_id[r->last_id]->seq);
|
|
2097 free(r->ref_id[r->last_id]->seq);
|
|
2098 r->ref_id[r->last_id]->seq = NULL;
|
|
2099 r->ref_id[r->last_id]->length = 0;
|
|
2100 }
|
|
2101 }
|
|
2102 r->last_id = id;
|
|
2103 }
|
|
2104 }
|
|
2105
|
|
2106 void cram_ref_decr(refs_t *r, int id) {
|
|
2107 pthread_mutex_lock(&r->lock);
|
|
2108 cram_ref_decr_locked(r, id);
|
|
2109 pthread_mutex_unlock(&r->lock);
|
|
2110 }
|
|
2111
|
|
2112 /*
|
|
2113 * Used by cram_ref_load and cram_ref_get. The file handle will have
|
|
2114 * already been opened, so we can catch it. The ref_entry *e informs us
|
|
2115 * of whether this is a multi-line fasta file or a raw MD5 style file.
|
|
2116 * Either way we create a single contiguous sequence.
|
|
2117 *
|
|
2118 * Returns all or part of a reference sequence on success (malloced);
|
|
2119 * NULL on failure.
|
|
2120 */
|
|
2121 static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) {
|
|
2122 off_t offset, len;
|
|
2123 char *seq;
|
|
2124
|
|
2125 if (end < start)
|
|
2126 end = start;
|
|
2127
|
|
2128 /*
|
|
2129 * Compute locations in file. This is trivial for the MD5 files, but
|
|
2130 * is still necessary for the fasta variants.
|
|
2131 */
|
|
2132 offset = e->line_length
|
|
2133 ? e->offset + (start-1)/e->bases_per_line * e->line_length +
|
|
2134 (start-1) % e->bases_per_line
|
|
2135 : start-1;
|
|
2136
|
|
2137 len = (e->line_length
|
|
2138 ? e->offset + (end-1)/e->bases_per_line * e->line_length +
|
|
2139 (end-1) % e->bases_per_line
|
|
2140 : end-1) - offset + 1;
|
|
2141
|
|
2142 if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
|
|
2143 perror("bgzf_useek() on reference file");
|
|
2144 return NULL;
|
|
2145 }
|
|
2146
|
|
2147 if (len == 0 || !(seq = malloc(len))) {
|
|
2148 return NULL;
|
|
2149 }
|
|
2150
|
|
2151 if (len != bgzf_read(fp, seq, len)) {
|
|
2152 perror("bgzf_read() on reference file");
|
|
2153 free(seq);
|
|
2154 return NULL;
|
|
2155 }
|
|
2156
|
|
2157 /* Strip white-space if required. */
|
|
2158 if (len != end-start+1) {
|
|
2159 int i, j;
|
|
2160 char *cp = seq;
|
|
2161 char *cp_to;
|
|
2162
|
|
2163 for (i = j = 0; i < len; i++) {
|
|
2164 if (cp[i] >= '!' && cp[i] <= '~')
|
|
2165 cp[j++] = cp[i] & ~0x20;
|
|
2166 }
|
|
2167 cp_to = cp+j;
|
|
2168
|
|
2169 if (cp_to - seq != end-start+1) {
|
|
2170 fprintf(stderr, "Malformed reference file?\n");
|
|
2171 free(seq);
|
|
2172 return NULL;
|
|
2173 }
|
|
2174 } else {
|
|
2175 int i;
|
|
2176 for (i = 0; i < len; i++) {
|
|
2177 seq[i] = seq[i] & ~0x20; // uppercase in ASCII
|
|
2178 }
|
|
2179 }
|
|
2180
|
|
2181 return seq;
|
|
2182 }
|
|
2183
|
|
2184 /*
|
|
2185 * Load the entire reference 'id'.
|
|
2186 * This also increments the reference count by 1.
|
|
2187 *
|
|
2188 * Returns ref_entry on success;
|
|
2189 * NULL on failure
|
|
2190 */
|
|
2191 ref_entry *cram_ref_load(refs_t *r, int id) {
|
|
2192 ref_entry *e = r->ref_id[id];
|
|
2193 int start = 1, end = e->length;
|
|
2194 char *seq;
|
|
2195
|
|
2196 if (e->seq) {
|
|
2197 return e;
|
|
2198 }
|
|
2199
|
|
2200 assert(e->count == 0);
|
|
2201
|
|
2202 if (r->last) {
|
|
2203 #ifdef REF_DEBUG
|
|
2204 int idx = 0;
|
|
2205 for (idx = 0; idx < r->nref; idx++)
|
|
2206 if (r->last == r->ref_id[idx])
|
|
2207 break;
|
|
2208 RP("%d cram_ref_load DECR %d\n", gettid(), idx);
|
|
2209 #endif
|
|
2210 assert(r->last->count > 0);
|
|
2211 if (--r->last->count <= 0) {
|
|
2212 RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
|
|
2213 if (r->last->seq) {
|
|
2214 free(r->last->seq);
|
|
2215 r->last->seq = NULL;
|
|
2216 }
|
|
2217 }
|
|
2218 }
|
|
2219
|
|
2220 /* Open file if it's not already the current open reference */
|
|
2221 if (strcmp(r->fn, e->fn) || r->fp == NULL) {
|
|
2222 if (r->fp)
|
|
2223 if (bgzf_close(r->fp) != 0)
|
|
2224 return NULL;
|
|
2225 r->fn = e->fn;
|
|
2226 if (!(r->fp = bgzf_open_ref(r->fn, "r")))
|
|
2227 return NULL;
|
|
2228 }
|
|
2229
|
|
2230 RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
|
|
2231
|
|
2232 if (!(seq = load_ref_portion(r->fp, e, start, end))) {
|
|
2233 return NULL;
|
|
2234 }
|
|
2235
|
|
2236 RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
|
|
2237
|
|
2238 RP("%d INC REF %d, %d\n", gettid(), id, (int)(e->count+1));
|
|
2239 e->seq = seq;
|
|
2240 e->count++;
|
|
2241
|
|
2242 /*
|
|
2243 * Also keep track of last used ref so incr/decr loops on the same
|
|
2244 * sequence don't cause load/free loops.
|
|
2245 */
|
|
2246 RP("%d cram_ref_load INCR %d => %d\n", gettid(), id, e->count+1);
|
|
2247 r->last = e;
|
|
2248 e->count++;
|
|
2249
|
|
2250 return e;
|
|
2251 }
|
|
2252
|
|
2253 /*
|
|
2254 * Returns a portion of a reference sequence from start to end inclusive.
|
|
2255 * The returned pointer is owned by either the cram_file fd or by the
|
|
2256 * internal refs_t structure and should not be freed by the caller.
|
|
2257 *
|
|
2258 * The difference is whether or not this refs_t is in use by just the one
|
|
2259 * cram_fd or by multiples, or whether we have multiple threads accessing
|
|
2260 * references. In either case fd->shared will be true and we start using
|
|
2261 * reference counting to track the number of users of a specific reference
|
|
2262 * sequence.
|
|
2263 *
|
|
2264 * Otherwise the ref seq returned is allocated as part of cram_fd itself
|
|
2265 * and will be freed up on the next call to cram_get_ref or cram_close.
|
|
2266 *
|
|
2267 * To return the entire reference sequence, specify start as 1 and end
|
|
2268 * as 0.
|
|
2269 *
|
|
2270 * To cease using a reference, call cram_ref_decr().
|
|
2271 *
|
|
2272 * Returns reference on success,
|
|
2273 * NULL on failure
|
|
2274 */
|
|
2275 char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
|
|
2276 ref_entry *r;
|
|
2277 char *seq;
|
|
2278 int ostart = start;
|
|
2279
|
|
2280 if (id == -1)
|
|
2281 return NULL;
|
|
2282
|
|
2283 /* FIXME: axiomatic query of r->seq being true?
|
|
2284 * Or shortcut for unsorted data where we load once and never free?
|
|
2285 */
|
|
2286
|
|
2287 //fd->shared_ref = 1; // hard code for now to simplify things
|
|
2288
|
|
2289 pthread_mutex_lock(&fd->ref_lock);
|
|
2290
|
|
2291 RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
|
|
2292
|
|
2293 /*
|
|
2294 * Unsorted data implies we want to fetch an entire reference at a time.
|
|
2295 * We just deal with this at the moment by claiming we're sharing
|
|
2296 * references instead, which has the same requirement.
|
|
2297 */
|
|
2298 if (fd->unsorted)
|
|
2299 fd->shared_ref = 1;
|
|
2300
|
|
2301
|
|
2302 /* Sanity checking: does this ID exist? */
|
|
2303 if (id >= fd->refs->nref) {
|
|
2304 fprintf(stderr, "No reference found for id %d\n", id);
|
|
2305 pthread_mutex_unlock(&fd->ref_lock);
|
|
2306 return NULL;
|
|
2307 }
|
|
2308
|
|
2309 if (!fd->refs || !fd->refs->ref_id[id]) {
|
|
2310 fprintf(stderr, "No reference found for id %d\n", id);
|
|
2311 pthread_mutex_unlock(&fd->ref_lock);
|
|
2312 return NULL;
|
|
2313 }
|
|
2314
|
|
2315 if (!(r = fd->refs->ref_id[id])) {
|
|
2316 fprintf(stderr, "No reference found for id %d\n", id);
|
|
2317 pthread_mutex_unlock(&fd->ref_lock);
|
|
2318 return NULL;
|
|
2319 }
|
|
2320
|
|
2321
|
|
2322 /*
|
|
2323 * It has an entry, but may not have been populated yet.
|
|
2324 * Any manually loaded .fai files have their lengths known.
|
|
2325 * A ref entry computed from @SQ lines (M5 or UR field) will have
|
|
2326 * r->length == 0 unless it's been loaded once and verified that we have
|
|
2327 * an on-disk filename for it.
|
|
2328 *
|
|
2329 * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
|
|
2330 * open_path_mfile and libcurl, which isn't multi-thread safe unless I
|
|
2331 * rewrite my code to have one curl handle per thread.
|
|
2332 */
|
|
2333 pthread_mutex_lock(&fd->refs->lock);
|
|
2334 if (r->length == 0) {
|
|
2335 if (cram_populate_ref(fd, id, r) == -1) {
|
|
2336 fprintf(stderr, "Failed to populate reference for id %d\n", id);
|
|
2337 pthread_mutex_unlock(&fd->refs->lock);
|
|
2338 pthread_mutex_unlock(&fd->ref_lock);
|
|
2339 return NULL;
|
|
2340 }
|
|
2341 r = fd->refs->ref_id[id];
|
|
2342 if (fd->unsorted)
|
|
2343 cram_ref_incr_locked(fd->refs, id);
|
|
2344 }
|
|
2345
|
|
2346
|
|
2347 /*
|
|
2348 * We now know that we the filename containing the reference, so check
|
|
2349 * for limits. If it's over half the reference we'll load all of it in
|
|
2350 * memory as this will speed up subsequent calls.
|
|
2351 */
|
|
2352 if (end < 1)
|
|
2353 end = r->length;
|
|
2354 if (end >= r->length)
|
|
2355 end = r->length;
|
|
2356 assert(start >= 1);
|
|
2357
|
|
2358 if (end - start >= 0.5*r->length || fd->shared_ref) {
|
|
2359 start = 1;
|
|
2360 end = r->length;
|
|
2361 }
|
|
2362
|
|
2363 /*
|
|
2364 * Maybe we have it cached already? If so use it.
|
|
2365 *
|
|
2366 * Alternatively if we don't have the sequence but we're sharing
|
|
2367 * references and/or are asking for the entire length of it, then
|
|
2368 * load the full reference into the refs structure and return
|
|
2369 * a pointer to that one instead.
|
|
2370 */
|
|
2371 if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
|
|
2372 char *cp;
|
|
2373
|
|
2374 if (id >= 0) {
|
|
2375 if (r->seq) {
|
|
2376 cram_ref_incr_locked(fd->refs, id);
|
|
2377 } else {
|
|
2378 ref_entry *e;
|
|
2379 if (!(e = cram_ref_load(fd->refs, id))) {
|
|
2380 pthread_mutex_unlock(&fd->refs->lock);
|
|
2381 pthread_mutex_unlock(&fd->ref_lock);
|
|
2382 return NULL;
|
|
2383 }
|
|
2384
|
|
2385 /* unsorted data implies cache ref indefinitely, to avoid
|
|
2386 * continually loading and unloading.
|
|
2387 */
|
|
2388 if (fd->unsorted)
|
|
2389 cram_ref_incr_locked(fd->refs, id);
|
|
2390 }
|
|
2391
|
|
2392 fd->ref = NULL; /* We never access it directly */
|
|
2393 fd->ref_start = 1;
|
|
2394 fd->ref_end = r->length;
|
|
2395 fd->ref_id = id;
|
|
2396
|
|
2397 cp = fd->refs->ref_id[id]->seq + ostart-1;
|
|
2398 } else {
|
|
2399 fd->ref = NULL;
|
|
2400 cp = NULL;
|
|
2401 }
|
|
2402
|
|
2403 RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
|
|
2404
|
|
2405 pthread_mutex_unlock(&fd->refs->lock);
|
|
2406 pthread_mutex_unlock(&fd->ref_lock);
|
|
2407 return cp;
|
|
2408 }
|
|
2409
|
|
2410 /*
|
|
2411 * Otherwise we're not sharing, we don't have a copy of it already and
|
|
2412 * we're only asking for a small portion of it.
|
|
2413 *
|
|
2414 * In this case load up just that segment ourselves, freeing any old
|
|
2415 * small segments in the process.
|
|
2416 */
|
|
2417
|
|
2418 /* Unmapped ref ID */
|
|
2419 if (id < 0) {
|
|
2420 if (fd->ref_free) {
|
|
2421 free(fd->ref_free);
|
|
2422 fd->ref_free = NULL;
|
|
2423 }
|
|
2424 fd->ref = NULL;
|
|
2425 fd->ref_id = id;
|
|
2426 pthread_mutex_unlock(&fd->refs->lock);
|
|
2427 pthread_mutex_unlock(&fd->ref_lock);
|
|
2428 return NULL;
|
|
2429 }
|
|
2430
|
|
2431 /* Open file if it's not already the current open reference */
|
|
2432 if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
|
|
2433 if (fd->refs->fp)
|
|
2434 if (bgzf_close(fd->refs->fp) != 0)
|
|
2435 return NULL;
|
|
2436 fd->refs->fn = r->fn;
|
|
2437 if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r"))) {
|
|
2438 pthread_mutex_unlock(&fd->refs->lock);
|
|
2439 pthread_mutex_unlock(&fd->ref_lock);
|
|
2440 return NULL;
|
|
2441 }
|
|
2442 }
|
|
2443
|
|
2444 if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
|
|
2445 pthread_mutex_unlock(&fd->refs->lock);
|
|
2446 pthread_mutex_unlock(&fd->ref_lock);
|
|
2447 return NULL;
|
|
2448 }
|
|
2449
|
|
2450 if (fd->ref_free)
|
|
2451 free(fd->ref_free);
|
|
2452
|
|
2453 fd->ref_id = id;
|
|
2454 fd->ref_start = start;
|
|
2455 fd->ref_end = end;
|
|
2456 fd->ref_free = fd->ref;
|
|
2457 seq = fd->ref;
|
|
2458
|
|
2459 pthread_mutex_unlock(&fd->refs->lock);
|
|
2460 pthread_mutex_unlock(&fd->ref_lock);
|
|
2461
|
|
2462 return seq + ostart - start;
|
|
2463 }
|
|
2464
|
|
2465 /*
|
|
2466 * If fd has been opened for reading, it may be permitted to specify 'fn'
|
|
2467 * as NULL and let the code auto-detect the reference by parsing the
|
|
2468 * SAM header @SQ lines.
|
|
2469 */
|
|
2470 int cram_load_reference(cram_fd *fd, char *fn) {
|
|
2471 if (fn) {
|
|
2472 fd->refs = refs_load_fai(fd->refs, fn,
|
|
2473 !(fd->embed_ref && fd->mode == 'r'));
|
|
2474 fn = fd->refs ? fd->refs->fn : NULL;
|
|
2475 }
|
|
2476 fd->ref_fn = fn;
|
|
2477
|
|
2478 if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
|
|
2479 if (fd->refs)
|
|
2480 refs_free(fd->refs);
|
|
2481 if (!(fd->refs = refs_create()))
|
|
2482 return -1;
|
|
2483 if (-1 == refs_from_header(fd->refs, fd, fd->header))
|
|
2484 return -1;
|
|
2485 }
|
|
2486
|
|
2487 if (fd->header)
|
|
2488 if (-1 == refs2id(fd->refs, fd->header))
|
|
2489 return -1;
|
|
2490
|
|
2491 return fn ? 0 : -1;
|
|
2492 }
|
|
2493
|
|
2494 /* ----------------------------------------------------------------------
|
|
2495 * Containers
|
|
2496 */
|
|
2497
|
|
2498 /*
|
|
2499 * Creates a new container, specifying the maximum number of slices
|
|
2500 * and records permitted.
|
|
2501 *
|
|
2502 * Returns cram_container ptr on success
|
|
2503 * NULL on failure
|
|
2504 */
|
|
2505 cram_container *cram_new_container(int nrec, int nslice) {
|
|
2506 cram_container *c = calloc(1, sizeof(*c));
|
|
2507 enum cram_DS_ID id;
|
|
2508
|
|
2509 if (!c)
|
|
2510 return NULL;
|
|
2511
|
|
2512 c->curr_ref = -2;
|
|
2513
|
|
2514 c->max_c_rec = nrec * nslice;
|
|
2515 c->curr_c_rec = 0;
|
|
2516
|
|
2517 c->max_rec = nrec;
|
|
2518 c->record_counter = 0;
|
|
2519 c->num_bases = 0;
|
|
2520
|
|
2521 c->max_slice = nslice;
|
|
2522 c->curr_slice = 0;
|
|
2523
|
|
2524 c->pos_sorted = 1;
|
|
2525 c->max_apos = 0;
|
|
2526 c->multi_seq = 0;
|
|
2527
|
|
2528 c->bams = NULL;
|
|
2529
|
|
2530 if (!(c->slices = (cram_slice **)calloc(nslice, sizeof(cram_slice *))))
|
|
2531 goto err;
|
|
2532 c->slice = NULL;
|
|
2533
|
|
2534 if (!(c->comp_hdr = cram_new_compression_header()))
|
|
2535 goto err;
|
|
2536 c->comp_hdr_block = NULL;
|
|
2537
|
|
2538 for (id = DS_RN; id < DS_TN; id++)
|
|
2539 if (!(c->stats[id] = cram_stats_create())) goto err;
|
|
2540
|
|
2541 //c->aux_B_stats = cram_stats_create();
|
|
2542
|
|
2543 if (!(c->tags_used = kh_init(s_i2i)))
|
|
2544 goto err;
|
|
2545 c->refs_used = 0;
|
|
2546
|
|
2547 return c;
|
|
2548
|
|
2549 err:
|
|
2550 if (c) {
|
|
2551 if (c->slices)
|
|
2552 free(c->slices);
|
|
2553 free(c);
|
|
2554 }
|
|
2555 return NULL;
|
|
2556 }
|
|
2557
|
|
2558 void cram_free_container(cram_container *c) {
|
|
2559 enum cram_DS_ID id;
|
|
2560 int i;
|
|
2561
|
|
2562 if (!c)
|
|
2563 return;
|
|
2564
|
|
2565 if (c->refs_used)
|
|
2566 free(c->refs_used);
|
|
2567
|
|
2568 if (c->landmark)
|
|
2569 free(c->landmark);
|
|
2570
|
|
2571 if (c->comp_hdr)
|
|
2572 cram_free_compression_header(c->comp_hdr);
|
|
2573
|
|
2574 if (c->comp_hdr_block)
|
|
2575 cram_free_block(c->comp_hdr_block);
|
|
2576
|
|
2577 if (c->slices) {
|
|
2578 for (i = 0; i < c->max_slice; i++)
|
|
2579 if (c->slices[i])
|
|
2580 cram_free_slice(c->slices[i]);
|
|
2581 free(c->slices);
|
|
2582 }
|
|
2583
|
|
2584 for (id = DS_RN; id < DS_TN; id++)
|
|
2585 if (c->stats[id]) cram_stats_free(c->stats[id]);
|
|
2586
|
|
2587 //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
|
|
2588
|
|
2589 if (c->tags_used) kh_destroy(s_i2i, c->tags_used);
|
|
2590
|
|
2591 free(c);
|
|
2592 }
|
|
2593
|
|
2594 /*
|
|
2595 * Reads a container header.
|
|
2596 *
|
|
2597 * Returns cram_container on success
|
|
2598 * NULL on failure or no container left (fd->err == 0).
|
|
2599 */
|
|
2600 cram_container *cram_read_container(cram_fd *fd) {
|
|
2601 cram_container c2, *c;
|
|
2602 int i, s;
|
|
2603 size_t rd = 0;
|
|
2604
|
|
2605 fd->err = 0;
|
|
2606 fd->eof = 0;
|
|
2607
|
|
2608 memset(&c2, 0, sizeof(c2));
|
|
2609 if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
|
2610 if ((s = itf8_decode(fd, &c2.length)) == -1) {
|
|
2611 fd->eof = fd->empty_container ? 1 : 2;
|
|
2612 return NULL;
|
|
2613 } else {
|
|
2614 rd+=s;
|
|
2615 }
|
|
2616 } else {
|
|
2617 if ((s = int32_decode(fd, &c2.length)) == -1) {
|
|
2618 if (CRAM_MAJOR_VERS(fd->version) == 2 &&
|
|
2619 CRAM_MINOR_VERS(fd->version) == 0)
|
|
2620 fd->eof = 1; // EOF blocks arrived in v2.1
|
|
2621 else
|
|
2622 fd->eof = fd->empty_container ? 1 : 2;
|
|
2623 return NULL;
|
|
2624 } else {
|
|
2625 rd+=s;
|
|
2626 }
|
|
2627 }
|
|
2628 if ((s = itf8_decode(fd, &c2.ref_seq_id)) == -1) return NULL; else rd+=s;
|
|
2629 if ((s = itf8_decode(fd, &c2.ref_seq_start))== -1) return NULL; else rd+=s;
|
|
2630 if ((s = itf8_decode(fd, &c2.ref_seq_span)) == -1) return NULL; else rd+=s;
|
|
2631 if ((s = itf8_decode(fd, &c2.num_records)) == -1) return NULL; else rd+=s;
|
|
2632
|
|
2633 if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
|
2634 c2.record_counter = 0;
|
|
2635 c2.num_bases = 0;
|
|
2636 } else {
|
|
2637 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
|
|
2638 if ((s = ltf8_decode(fd, &c2.record_counter)) == -1)
|
|
2639 return NULL;
|
|
2640 else
|
|
2641 rd += s;
|
|
2642 } else {
|
|
2643 int32_t i32;
|
|
2644 if ((s = itf8_decode(fd, &i32)) == -1)
|
|
2645 return NULL;
|
|
2646 else
|
|
2647 rd += s;
|
|
2648 c2.record_counter = i32;
|
|
2649 }
|
|
2650
|
|
2651 if ((s = ltf8_decode(fd, &c2.num_bases))== -1)
|
|
2652 return NULL;
|
|
2653 else
|
|
2654 rd += s;
|
|
2655 }
|
|
2656 if ((s = itf8_decode(fd, &c2.num_blocks)) == -1) return NULL; else rd+=s;
|
|
2657 if ((s = itf8_decode(fd, &c2.num_landmarks))== -1) return NULL; else rd+=s;
|
|
2658
|
|
2659 if (!(c = calloc(1, sizeof(*c))))
|
|
2660 return NULL;
|
|
2661
|
|
2662 *c = c2;
|
|
2663
|
|
2664 if (!(c->landmark = malloc(c->num_landmarks * sizeof(int32_t))) &&
|
|
2665 c->num_landmarks) {
|
|
2666 fd->err = errno;
|
|
2667 cram_free_container(c);
|
|
2668 return NULL;
|
|
2669 }
|
|
2670 for (i = 0; i < c->num_landmarks; i++) {
|
|
2671 if ((s = itf8_decode(fd, &c->landmark[i])) == -1) {
|
|
2672 cram_free_container(c);
|
|
2673 return NULL;
|
|
2674 } else {
|
|
2675 rd += s;
|
|
2676 }
|
|
2677 }
|
|
2678
|
|
2679 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
|
|
2680 uint32_t crc, i;
|
|
2681 unsigned char *dat = malloc(50 + 5*(c->num_landmarks)), *cp = dat;
|
|
2682 if (!dat) {
|
|
2683 cram_free_container(c);
|
|
2684 return NULL;
|
|
2685 }
|
|
2686 if (-1 == int32_decode(fd, (int32_t *)&c->crc32))
|
|
2687 return NULL;
|
|
2688 else
|
|
2689 rd+=4;
|
|
2690
|
|
2691 /* Reencode first as we can't easily access the original byte stream.
|
|
2692 *
|
|
2693 * FIXME: Technically this means this may not be fool proof. We could
|
|
2694 * create a CRAM file using a 2 byte ITF8 value that can fit in a
|
|
2695 * 1 byte field, meaning the encoding is different to the original
|
|
2696 * form and so has a different CRC.
|
|
2697 *
|
|
2698 * The correct implementation would be to have an alternative form
|
|
2699 * of itf8_decode which also squirrels away the raw byte stream
|
|
2700 * during decoding so we can then CRC that.
|
|
2701 */
|
|
2702 *(unsigned int *)cp = le_int4(c->length); cp += 4;
|
|
2703 cp += itf8_put(cp, c->ref_seq_id);
|
|
2704 cp += itf8_put(cp, c->ref_seq_start);
|
|
2705 cp += itf8_put(cp, c->ref_seq_span);
|
|
2706 cp += itf8_put(cp, c->num_records);
|
|
2707 cp += ltf8_put((char *)cp, c->record_counter);
|
|
2708 cp += itf8_put(cp, c->num_bases);
|
|
2709 cp += itf8_put(cp, c->num_blocks);
|
|
2710 cp += itf8_put(cp, c->num_landmarks);
|
|
2711 for (i = 0; i < c->num_landmarks; i++) {
|
|
2712 cp += itf8_put(cp, c->landmark[i]);
|
|
2713 }
|
|
2714
|
|
2715 crc = crc32(0L, dat, cp-dat);
|
|
2716 if (crc != c->crc32) {
|
|
2717 fprintf(stderr, "Container header CRC32 failure\n");
|
|
2718 cram_free_container(c);
|
|
2719 return NULL;
|
|
2720 }
|
|
2721 }
|
|
2722
|
|
2723 c->offset = rd;
|
|
2724 c->slices = NULL;
|
|
2725 c->curr_slice = 0;
|
|
2726 c->max_slice = c->num_landmarks;
|
|
2727 c->slice_rec = 0;
|
|
2728 c->curr_rec = 0;
|
|
2729 c->max_rec = 0;
|
|
2730
|
|
2731 if (c->ref_seq_id == -2) {
|
|
2732 c->multi_seq = 1;
|
|
2733 fd->multi_seq = 1;
|
|
2734 }
|
|
2735
|
|
2736 fd->empty_container =
|
|
2737 (c->num_records == 0 &&
|
|
2738 c->ref_seq_id == -1 &&
|
|
2739 c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
|
|
2740
|
|
2741 return c;
|
|
2742 }
|
|
2743
|
|
2744 /*
|
|
2745 * Writes a container structure.
|
|
2746 *
|
|
2747 * Returns 0 on success
|
|
2748 * -1 on failure
|
|
2749 */
|
|
2750 int cram_write_container(cram_fd *fd, cram_container *c) {
|
|
2751 char buf_a[1024], *buf = buf_a, *cp;
|
|
2752 int i;
|
|
2753
|
|
2754 if (55 + c->num_landmarks * 5 >= 1024)
|
|
2755 buf = malloc(55 + c->num_landmarks * 5);
|
|
2756 cp = buf;
|
|
2757
|
|
2758 if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
|
2759 cp += itf8_put(cp, c->length);
|
|
2760 } else {
|
|
2761 *(int32_t *)cp = le_int4(c->length);
|
|
2762 cp += 4;
|
|
2763 }
|
|
2764 if (c->multi_seq) {
|
|
2765 cp += itf8_put(cp, -2);
|
|
2766 cp += itf8_put(cp, 0);
|
|
2767 cp += itf8_put(cp, 0);
|
|
2768 } else {
|
|
2769 cp += itf8_put(cp, c->ref_seq_id);
|
|
2770 cp += itf8_put(cp, c->ref_seq_start);
|
|
2771 cp += itf8_put(cp, c->ref_seq_span);
|
|
2772 }
|
|
2773 cp += itf8_put(cp, c->num_records);
|
|
2774 if (CRAM_MAJOR_VERS(fd->version) == 2) {
|
|
2775 cp += itf8_put(cp, c->record_counter);
|
|
2776 cp += ltf8_put(cp, c->num_bases);
|
|
2777 } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
|
|
2778 cp += ltf8_put(cp, c->record_counter);
|
|
2779 cp += ltf8_put(cp, c->num_bases);
|
|
2780 }
|
|
2781
|
|
2782 cp += itf8_put(cp, c->num_blocks);
|
|
2783 cp += itf8_put(cp, c->num_landmarks);
|
|
2784 for (i = 0; i < c->num_landmarks; i++)
|
|
2785 cp += itf8_put(cp, c->landmark[i]);
|
|
2786
|
|
2787 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
|
|
2788 c->crc32 = crc32(0L, (uc *)buf, cp-buf);
|
|
2789 cp[0] = c->crc32 & 0xff;
|
|
2790 cp[1] = (c->crc32 >> 8) & 0xff;
|
|
2791 cp[2] = (c->crc32 >> 16) & 0xff;
|
|
2792 cp[3] = (c->crc32 >> 24) & 0xff;
|
|
2793 cp += 4;
|
|
2794 }
|
|
2795
|
|
2796 if (cp-buf != hwrite(fd->fp, buf, cp-buf)) {
|
|
2797 if (buf != buf_a)
|
|
2798 free(buf);
|
|
2799 return -1;
|
|
2800 }
|
|
2801
|
|
2802 if (buf != buf_a)
|
|
2803 free(buf);
|
|
2804
|
|
2805 return 0;
|
|
2806 }
|
|
2807
|
|
2808 // common component shared by cram_flush_container{,_mt}
|
|
2809 static int cram_flush_container2(cram_fd *fd, cram_container *c) {
|
|
2810 int i, j;
|
|
2811
|
|
2812 //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
|
|
2813
|
|
2814 /* Write the container struct itself */
|
|
2815 if (0 != cram_write_container(fd, c))
|
|
2816 return -1;
|
|
2817
|
|
2818 /* And the compression header */
|
|
2819 if (0 != cram_write_block(fd, c->comp_hdr_block))
|
|
2820 return -1;
|
|
2821
|
|
2822 /* Followed by the slice blocks */
|
|
2823 for (i = 0; i < c->curr_slice; i++) {
|
|
2824 cram_slice *s = c->slices[i];
|
|
2825
|
|
2826 if (0 != cram_write_block(fd, s->hdr_block))
|
|
2827 return -1;
|
|
2828
|
|
2829 for (j = 0; j < s->hdr->num_blocks; j++) {
|
|
2830 if (0 != cram_write_block(fd, s->block[j]))
|
|
2831 return -1;
|
|
2832 }
|
|
2833 }
|
|
2834
|
|
2835 return hflush(fd->fp) == 0 ? 0 : -1;
|
|
2836 }
|
|
2837
|
|
2838 /*
|
|
2839 * Flushes a completely or partially full container to disk, writing
|
|
2840 * container structure, header and blocks. This also calls the encoder
|
|
2841 * functions.
|
|
2842 *
|
|
2843 * Returns 0 on success
|
|
2844 * -1 on failure
|
|
2845 */
|
|
2846 int cram_flush_container(cram_fd *fd, cram_container *c) {
|
|
2847 /* Encode the container blocks and generate compression header */
|
|
2848 if (0 != cram_encode_container(fd, c))
|
|
2849 return -1;
|
|
2850
|
|
2851 return cram_flush_container2(fd, c);
|
|
2852 }
|
|
2853
|
|
2854 typedef struct {
|
|
2855 cram_fd *fd;
|
|
2856 cram_container *c;
|
|
2857 } cram_job;
|
|
2858
|
|
2859 void *cram_flush_thread(void *arg) {
|
|
2860 cram_job *j = (cram_job *)arg;
|
|
2861
|
|
2862 /* Encode the container blocks and generate compression header */
|
|
2863 if (0 != cram_encode_container(j->fd, j->c)) {
|
|
2864 fprintf(stderr, "cram_encode_container failed\n");
|
|
2865 return NULL;
|
|
2866 }
|
|
2867
|
|
2868 return arg;
|
|
2869 }
|
|
2870
|
|
2871 static int cram_flush_result(cram_fd *fd) {
|
|
2872 int i, ret = 0;
|
|
2873 t_pool_result *r;
|
|
2874
|
|
2875 while ((r = t_pool_next_result(fd->rqueue))) {
|
|
2876 cram_job *j = (cram_job *)r->data;
|
|
2877 cram_container *c;
|
|
2878
|
|
2879 if (!j) {
|
|
2880 t_pool_delete_result(r, 0);
|
|
2881 return -1;
|
|
2882 }
|
|
2883
|
|
2884 fd = j->fd;
|
|
2885 c = j->c;
|
|
2886
|
|
2887 if (0 != cram_flush_container2(fd, c))
|
|
2888 return -1;
|
|
2889
|
|
2890 /* Free the container */
|
|
2891 for (i = 0; i < c->max_slice; i++) {
|
|
2892 cram_free_slice(c->slices[i]);
|
|
2893 c->slices[i] = NULL;
|
|
2894 }
|
|
2895
|
|
2896 c->slice = NULL;
|
|
2897 c->curr_slice = 0;
|
|
2898
|
|
2899 cram_free_container(c);
|
|
2900
|
|
2901 ret |= hflush(fd->fp) == 0 ? 0 : -1;
|
|
2902
|
|
2903 t_pool_delete_result(r, 1);
|
|
2904 }
|
|
2905
|
|
2906 return ret;
|
|
2907 }
|
|
2908
|
|
2909 int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
|
|
2910 cram_job *j;
|
|
2911
|
|
2912 if (!fd->pool)
|
|
2913 return cram_flush_container(fd, c);
|
|
2914
|
|
2915 if (!(j = malloc(sizeof(*j))))
|
|
2916 return -1;
|
|
2917 j->fd = fd;
|
|
2918 j->c = c;
|
|
2919
|
|
2920 t_pool_dispatch(fd->pool, fd->rqueue, cram_flush_thread, j);
|
|
2921
|
|
2922 return cram_flush_result(fd);
|
|
2923 }
|
|
2924
|
|
2925 /* ----------------------------------------------------------------------
|
|
2926 * Compression headers; the first part of the container
|
|
2927 */
|
|
2928
|
|
2929 /*
|
|
2930 * Creates a new blank container compression header
|
|
2931 *
|
|
2932 * Returns header ptr on success
|
|
2933 * NULL on failure
|
|
2934 */
|
|
2935 cram_block_compression_hdr *cram_new_compression_header(void) {
|
|
2936 cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
|
|
2937 if (!hdr)
|
|
2938 return NULL;
|
|
2939
|
|
2940 if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
|
|
2941 free(hdr);
|
|
2942 return NULL;
|
|
2943 }
|
|
2944
|
|
2945 if (!(hdr->TD_hash = kh_init(m_s2i))) {
|
|
2946 cram_free_block(hdr->TD_blk);
|
|
2947 free(hdr);
|
|
2948 return NULL;
|
|
2949 }
|
|
2950
|
|
2951 if (!(hdr->TD_keys = string_pool_create(8192))) {
|
|
2952 kh_destroy(m_s2i, hdr->TD_hash);
|
|
2953 cram_free_block(hdr->TD_blk);
|
|
2954 free(hdr);
|
|
2955 return NULL;
|
|
2956 }
|
|
2957
|
|
2958 return hdr;
|
|
2959 }
|
|
2960
|
|
2961 void cram_free_compression_header(cram_block_compression_hdr *hdr) {
|
|
2962 int i;
|
|
2963
|
|
2964 if (hdr->landmark)
|
|
2965 free(hdr->landmark);
|
|
2966
|
|
2967 if (hdr->preservation_map)
|
|
2968 kh_destroy(map, hdr->preservation_map);
|
|
2969
|
|
2970 for (i = 0; i < CRAM_MAP_HASH; i++) {
|
|
2971 cram_map *m, *m2;
|
|
2972 for (m = hdr->rec_encoding_map[i]; m; m = m2) {
|
|
2973 m2 = m->next;
|
|
2974 if (m->codec)
|
|
2975 m->codec->free(m->codec);
|
|
2976 free(m);
|
|
2977 }
|
|
2978 }
|
|
2979
|
|
2980 for (i = 0; i < CRAM_MAP_HASH; i++) {
|
|
2981 cram_map *m, *m2;
|
|
2982 for (m = hdr->tag_encoding_map[i]; m; m = m2) {
|
|
2983 m2 = m->next;
|
|
2984 if (m->codec)
|
|
2985 m->codec->free(m->codec);
|
|
2986 free(m);
|
|
2987 }
|
|
2988 }
|
|
2989
|
|
2990 for (i = 0; i < DS_END; i++) {
|
|
2991 if (hdr->codecs[i])
|
|
2992 hdr->codecs[i]->free(hdr->codecs[i]);
|
|
2993 }
|
|
2994
|
|
2995 if (hdr->TL)
|
|
2996 free(hdr->TL);
|
|
2997 if (hdr->TD_blk)
|
|
2998 cram_free_block(hdr->TD_blk);
|
|
2999 if (hdr->TD_hash)
|
|
3000 kh_destroy(m_s2i, hdr->TD_hash);
|
|
3001 if (hdr->TD_keys)
|
|
3002 string_pool_destroy(hdr->TD_keys);
|
|
3003
|
|
3004 free(hdr);
|
|
3005 }
|
|
3006
|
|
3007
|
|
3008 /* ----------------------------------------------------------------------
|
|
3009 * Slices and slice headers
|
|
3010 */
|
|
3011
|
|
3012 void cram_free_slice_header(cram_block_slice_hdr *hdr) {
|
|
3013 if (!hdr)
|
|
3014 return;
|
|
3015
|
|
3016 if (hdr->block_content_ids)
|
|
3017 free(hdr->block_content_ids);
|
|
3018
|
|
3019 free(hdr);
|
|
3020
|
|
3021 return;
|
|
3022 }
|
|
3023
|
|
3024 void cram_free_slice(cram_slice *s) {
|
|
3025 if (!s)
|
|
3026 return;
|
|
3027
|
|
3028 if (s->hdr_block)
|
|
3029 cram_free_block(s->hdr_block);
|
|
3030
|
|
3031 if (s->block) {
|
|
3032 int i;
|
|
3033
|
|
3034 if (s->hdr) {
|
|
3035 for (i = 0; i < s->hdr->num_blocks; i++) {
|
|
3036 cram_free_block(s->block[i]);
|
|
3037 }
|
|
3038 }
|
|
3039 free(s->block);
|
|
3040 }
|
|
3041
|
|
3042 if (s->block_by_id)
|
|
3043 free(s->block_by_id);
|
|
3044
|
|
3045 if (s->hdr)
|
|
3046 cram_free_slice_header(s->hdr);
|
|
3047
|
|
3048 if (s->seqs_blk)
|
|
3049 cram_free_block(s->seqs_blk);
|
|
3050
|
|
3051 if (s->qual_blk)
|
|
3052 cram_free_block(s->qual_blk);
|
|
3053
|
|
3054 if (s->name_blk)
|
|
3055 cram_free_block(s->name_blk);
|
|
3056
|
|
3057 if (s->aux_blk)
|
|
3058 cram_free_block(s->aux_blk);
|
|
3059
|
|
3060 if (s->aux_OQ_blk)
|
|
3061 cram_free_block(s->aux_OQ_blk);
|
|
3062
|
|
3063 if (s->aux_BQ_blk)
|
|
3064 cram_free_block(s->aux_BQ_blk);
|
|
3065
|
|
3066 if (s->aux_FZ_blk)
|
|
3067 cram_free_block(s->aux_FZ_blk);
|
|
3068
|
|
3069 if (s->aux_oq_blk)
|
|
3070 cram_free_block(s->aux_oq_blk);
|
|
3071
|
|
3072 if (s->aux_os_blk)
|
|
3073 cram_free_block(s->aux_os_blk);
|
|
3074
|
|
3075 if (s->aux_oz_blk)
|
|
3076 cram_free_block(s->aux_oz_blk);
|
|
3077
|
|
3078 if (s->base_blk)
|
|
3079 cram_free_block(s->base_blk);
|
|
3080
|
|
3081 if (s->soft_blk)
|
|
3082 cram_free_block(s->soft_blk);
|
|
3083
|
|
3084 if (s->cigar)
|
|
3085 free(s->cigar);
|
|
3086
|
|
3087 if (s->crecs)
|
|
3088 free(s->crecs);
|
|
3089
|
|
3090 if (s->features)
|
|
3091 free(s->features);
|
|
3092
|
|
3093 if (s->TN)
|
|
3094 free(s->TN);
|
|
3095
|
|
3096 if (s->pair_keys)
|
|
3097 string_pool_destroy(s->pair_keys);
|
|
3098
|
|
3099 if (s->pair[0])
|
|
3100 kh_destroy(m_s2i, s->pair[0]);
|
|
3101 if (s->pair[1])
|
|
3102 kh_destroy(m_s2i, s->pair[1]);
|
|
3103
|
|
3104 free(s);
|
|
3105 }
|
|
3106
|
|
3107 /*
|
|
3108 * Creates a new empty slice in memory, for subsequent writing to
|
|
3109 * disk.
|
|
3110 *
|
|
3111 * Returns cram_slice ptr on success
|
|
3112 * NULL on failure
|
|
3113 */
|
|
3114 cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
|
|
3115 cram_slice *s = calloc(1, sizeof(*s));
|
|
3116 if (!s)
|
|
3117 return NULL;
|
|
3118
|
|
3119 if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
|
|
3120 goto err;
|
|
3121 s->hdr->content_type = type;
|
|
3122
|
|
3123 s->hdr_block = NULL;
|
|
3124 s->block = NULL;
|
|
3125 s->block_by_id = NULL;
|
|
3126 s->last_apos = 0;
|
|
3127 if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err;
|
|
3128 s->cigar = NULL;
|
|
3129 s->cigar_alloc = 0;
|
|
3130 s->ncigar = 0;
|
|
3131
|
|
3132 if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
|
|
3133 if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err;
|
|
3134 if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err;
|
|
3135 if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err;
|
|
3136 if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err;
|
|
3137 if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err;
|
|
3138
|
|
3139 s->features = NULL;
|
|
3140 s->nfeatures = s->afeatures = 0;
|
|
3141
|
|
3142 #ifndef TN_external
|
|
3143 s->TN = NULL;
|
|
3144 s->nTN = s->aTN = 0;
|
|
3145 #endif
|
|
3146
|
|
3147 // Volatile keys as we do realloc in dstring
|
|
3148 if (!(s->pair_keys = string_pool_create(8192))) goto err;
|
|
3149 if (!(s->pair[0] = kh_init(m_s2i))) goto err;
|
|
3150 if (!(s->pair[1] = kh_init(m_s2i))) goto err;
|
|
3151
|
|
3152 #ifdef BA_external
|
|
3153 s->BA_len = 0;
|
|
3154 #endif
|
|
3155
|
|
3156 return s;
|
|
3157
|
|
3158 err:
|
|
3159 if (s)
|
|
3160 cram_free_slice(s);
|
|
3161
|
|
3162 return NULL;
|
|
3163 }
|
|
3164
|
|
3165 /*
|
|
3166 * Loads an entire slice.
|
|
3167 * FIXME: In 1.0 the native unit of slices within CRAM is broken
|
|
3168 * as slices contain references to objects in other slices.
|
|
3169 * To work around this while keeping the slice oriented outer loop
|
|
3170 * we read all slices and stitch them together into a fake large
|
|
3171 * slice instead.
|
|
3172 *
|
|
3173 * Returns cram_slice ptr on success
|
|
3174 * NULL on failure
|
|
3175 */
|
|
3176 cram_slice *cram_read_slice(cram_fd *fd) {
|
|
3177 cram_block *b = cram_read_block(fd);
|
|
3178 cram_slice *s = calloc(1, sizeof(*s));
|
|
3179 int i, n, max_id, min_id;
|
|
3180
|
|
3181 if (!b || !s)
|
|
3182 goto err;
|
|
3183
|
|
3184 s->hdr_block = b;
|
|
3185 switch (b->content_type) {
|
|
3186 case MAPPED_SLICE:
|
|
3187 case UNMAPPED_SLICE:
|
|
3188 if (!(s->hdr = cram_decode_slice_header(fd, b)))
|
|
3189 goto err;
|
|
3190 break;
|
|
3191
|
|
3192 default:
|
|
3193 fprintf(stderr, "Unexpected block of type %s\n",
|
|
3194 cram_content_type2str(b->content_type));
|
|
3195 goto err;
|
|
3196 }
|
|
3197
|
|
3198 s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
|
|
3199 if (!s->block)
|
|
3200 goto err;
|
|
3201
|
|
3202 for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
|
|
3203 if (!(s->block[i] = cram_read_block(fd)))
|
|
3204 goto err;
|
|
3205
|
|
3206 if (s->block[i]->content_type == EXTERNAL) {
|
|
3207 if (max_id < s->block[i]->content_id)
|
|
3208 max_id = s->block[i]->content_id;
|
|
3209 if (min_id > s->block[i]->content_id)
|
|
3210 min_id = s->block[i]->content_id;
|
|
3211 }
|
|
3212 }
|
|
3213 if (min_id >= 0 && max_id < 1024) {
|
|
3214 if (!(s->block_by_id = calloc(1024, sizeof(s->block[0]))))
|
|
3215 goto err;
|
|
3216
|
|
3217 for (i = 0; i < n; i++) {
|
|
3218 if (s->block[i]->content_type != EXTERNAL)
|
|
3219 continue;
|
|
3220 s->block_by_id[s->block[i]->content_id] = s->block[i];
|
|
3221 }
|
|
3222 }
|
|
3223
|
|
3224 /* Initialise encoding/decoding tables */
|
|
3225 s->cigar = NULL;
|
|
3226 s->cigar_alloc = 0;
|
|
3227 s->ncigar = 0;
|
|
3228
|
|
3229 if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
|
|
3230 if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err;
|
|
3231 if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err;
|
|
3232 if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err;
|
|
3233 if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err;
|
|
3234 if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err;
|
|
3235
|
|
3236 s->crecs = NULL;
|
|
3237
|
|
3238 s->last_apos = s->hdr->ref_seq_start;
|
|
3239
|
|
3240 return s;
|
|
3241
|
|
3242 err:
|
|
3243 if (b)
|
|
3244 cram_free_block(b);
|
|
3245 if (s) {
|
|
3246 s->hdr_block = NULL;
|
|
3247 cram_free_slice(s);
|
|
3248 }
|
|
3249 return NULL;
|
|
3250 }
|
|
3251
|
|
3252
|
|
3253 /* ----------------------------------------------------------------------
|
|
3254 * CRAM file definition (header)
|
|
3255 */
|
|
3256
|
|
3257 /*
|
|
3258 * Reads a CRAM file definition structure.
|
|
3259 * Returns file_def ptr on success
|
|
3260 * NULL on failure
|
|
3261 */
|
|
3262 cram_file_def *cram_read_file_def(cram_fd *fd) {
|
|
3263 cram_file_def *def = malloc(sizeof(*def));
|
|
3264 if (!def)
|
|
3265 return NULL;
|
|
3266
|
|
3267 if (26 != hread(fd->fp, &def->magic[0], 26)) {
|
|
3268 free(def);
|
|
3269 return NULL;
|
|
3270 }
|
|
3271
|
|
3272 if (memcmp(def->magic, "CRAM", 4) != 0) {
|
|
3273 free(def);
|
|
3274 return NULL;
|
|
3275 }
|
|
3276
|
|
3277 if (def->major_version > 3) {
|
|
3278 fprintf(stderr, "CRAM version number mismatch\n"
|
|
3279 "Expected 1.x, 2.x or 3.x, got %d.%d\n",
|
|
3280 def->major_version, def->minor_version);
|
|
3281 free(def);
|
|
3282 return NULL;
|
|
3283 }
|
|
3284
|
|
3285 fd->first_container += 26;
|
|
3286 fd->last_slice = 0;
|
|
3287
|
|
3288 return def;
|
|
3289 }
|
|
3290
|
|
3291 /*
|
|
3292 * Writes a cram_file_def structure to cram_fd.
|
|
3293 * Returns 0 on success
|
|
3294 * -1 on failure
|
|
3295 */
|
|
3296 int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
|
|
3297 return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
|
|
3298 }
|
|
3299
|
|
3300 void cram_free_file_def(cram_file_def *def) {
|
|
3301 if (def) free(def);
|
|
3302 }
|
|
3303
|
|
3304 /* ----------------------------------------------------------------------
|
|
3305 * SAM header I/O
|
|
3306 */
|
|
3307
|
|
3308
|
|
3309 /*
|
|
3310 * Reads the SAM header from the first CRAM data block.
|
|
3311 * Also performs minimal parsing to extract read-group
|
|
3312 * and sample information.
|
|
3313
|
|
3314 * Returns SAM hdr ptr on success
|
|
3315 * NULL on failure
|
|
3316 */
|
|
3317 SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) {
|
|
3318 int32_t header_len;
|
|
3319 char *header;
|
|
3320 SAM_hdr *hdr;
|
|
3321
|
|
3322 /* 1.1 onwards stores the header in the first block of a container */
|
|
3323 if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
|
3324 /* Length */
|
|
3325 if (-1 == int32_decode(fd, &header_len))
|
|
3326 return NULL;
|
|
3327
|
|
3328 /* Alloc and read */
|
|
3329 if (NULL == (header = malloc(header_len+1)))
|
|
3330 return NULL;
|
|
3331
|
|
3332 *header = 0;
|
|
3333 if (header_len != hread(fd->fp, header, header_len))
|
|
3334 return NULL;
|
|
3335
|
|
3336 fd->first_container += 4 + header_len;
|
|
3337 } else {
|
|
3338 cram_container *c = cram_read_container(fd);
|
|
3339 cram_block *b;
|
|
3340 int i, len;
|
|
3341
|
|
3342 if (!c)
|
|
3343 return NULL;
|
|
3344
|
|
3345 if (c->num_blocks < 1) {
|
|
3346 cram_free_container(c);
|
|
3347 return NULL;
|
|
3348 }
|
|
3349
|
|
3350 if (!(b = cram_read_block(fd))) {
|
|
3351 cram_free_container(c);
|
|
3352 return NULL;
|
|
3353 }
|
|
3354 cram_uncompress_block(b);
|
|
3355
|
|
3356 len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
|
|
3357 itf8_size(b->content_id) +
|
|
3358 itf8_size(b->uncomp_size) +
|
|
3359 itf8_size(b->comp_size);
|
|
3360
|
|
3361 /* Extract header from 1st block */
|
|
3362 if (-1 == int32_get(b, &header_len) ||
|
|
3363 b->uncomp_size - 4 < header_len) {
|
|
3364 cram_free_container(c);
|
|
3365 cram_free_block(b);
|
|
3366 return NULL;
|
|
3367 }
|
|
3368 if (NULL == (header = malloc(header_len+1))) {
|
|
3369 cram_free_container(c);
|
|
3370 cram_free_block(b);
|
|
3371 return NULL;
|
|
3372 }
|
|
3373 memcpy(header, BLOCK_END(b), header_len);
|
|
3374 header[header_len]='\0';
|
|
3375 cram_free_block(b);
|
|
3376
|
|
3377 /* Consume any remaining blocks */
|
|
3378 for (i = 1; i < c->num_blocks; i++) {
|
|
3379 if (!(b = cram_read_block(fd))) {
|
|
3380 cram_free_container(c);
|
|
3381 return NULL;
|
|
3382 }
|
|
3383 len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
|
|
3384 itf8_size(b->content_id) +
|
|
3385 itf8_size(b->uncomp_size) +
|
|
3386 itf8_size(b->comp_size);
|
|
3387 cram_free_block(b);
|
|
3388 }
|
|
3389
|
|
3390 if (c->length && c->length > len) {
|
|
3391 // Consume padding
|
|
3392 char *pads = malloc(c->length - len);
|
|
3393 if (!pads) {
|
|
3394 cram_free_container(c);
|
|
3395 return NULL;
|
|
3396 }
|
|
3397
|
|
3398 if (c->length - len != hread(fd->fp, pads, c->length - len)) {
|
|
3399 cram_free_container(c);
|
|
3400 return NULL;
|
|
3401 }
|
|
3402 free(pads);
|
|
3403 }
|
|
3404
|
|
3405 cram_free_container(c);
|
|
3406 }
|
|
3407
|
|
3408 /* Parse */
|
|
3409 hdr = sam_hdr_parse_(header, header_len);
|
|
3410 free(header);
|
|
3411
|
|
3412 return hdr;
|
|
3413 }
|
|
3414
|
|
3415 /*
|
|
3416 * Converts 'in' to a full pathname to store in out.
|
|
3417 * Out must be at least PATH_MAX bytes long.
|
|
3418 */
|
|
3419 static void full_path(char *out, char *in) {
|
|
3420 if (*in == '/') {
|
|
3421 strncpy(out, in, PATH_MAX);
|
|
3422 out[PATH_MAX-1] = 0;
|
|
3423 } else {
|
|
3424 int len;
|
|
3425
|
|
3426 // unable to get dir or out+in is too long
|
|
3427 if (!getcwd(out, PATH_MAX) ||
|
|
3428 (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
|
|
3429 strncpy(out, in, PATH_MAX);
|
|
3430 out[PATH_MAX-1] = 0;
|
|
3431 return;
|
|
3432 }
|
|
3433
|
|
3434 sprintf(out+len, "/%.*s", PATH_MAX - len, in);
|
|
3435
|
|
3436 // FIXME: cope with `pwd`/../../../foo.fa ?
|
|
3437 }
|
|
3438 }
|
|
3439
|
|
3440 /*
|
|
3441 * Writes a CRAM SAM header.
|
|
3442 * Returns 0 on success
|
|
3443 * -1 on failure
|
|
3444 */
|
|
3445 int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) {
|
|
3446 int header_len;
|
|
3447 int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
|
|
3448
|
|
3449 /* Write CRAM MAGIC if not yet written. */
|
|
3450 if (fd->file_def->major_version == 0) {
|
|
3451 fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
|
|
3452 fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
|
|
3453 if (0 != cram_write_file_def(fd, fd->file_def))
|
|
3454 return -1;
|
|
3455 }
|
|
3456
|
|
3457 /* 1.0 requires and UNKNOWN read-group */
|
|
3458 if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
|
3459 if (!sam_hdr_find_rg(hdr, "UNKNOWN"))
|
|
3460 if (sam_hdr_add(hdr, "RG",
|
|
3461 "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
|
|
3462 return -1;
|
|
3463 }
|
|
3464
|
|
3465 /* Fix M5 strings */
|
|
3466 if (fd->refs && !fd->no_ref) {
|
|
3467 int i;
|
|
3468 for (i = 0; i < hdr->nref; i++) {
|
|
3469 SAM_hdr_type *ty;
|
|
3470 char *ref;
|
|
3471
|
|
3472 if (!(ty = sam_hdr_find(hdr, "SQ", "SN", hdr->ref[i].name)))
|
|
3473 return -1;
|
|
3474
|
|
3475 if (!sam_hdr_find_key(hdr, ty, "M5", NULL)) {
|
|
3476 char unsigned buf[16], buf2[33];
|
|
3477 int j, rlen;
|
|
3478 MD5_CTX md5;
|
|
3479
|
|
3480 if (!fd->refs ||
|
|
3481 !fd->refs->ref_id ||
|
|
3482 !fd->refs->ref_id[i]) {
|
|
3483 return -1;
|
|
3484 }
|
|
3485 rlen = fd->refs->ref_id[i]->length;
|
|
3486 MD5_Init(&md5);
|
|
3487 ref = cram_get_ref(fd, i, 1, rlen);
|
|
3488 if (NULL == ref) return -1;
|
|
3489 rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
|
|
3490 MD5_Update(&md5, ref, rlen);
|
|
3491 MD5_Final(buf, &md5);
|
|
3492 cram_ref_decr(fd->refs, i);
|
|
3493
|
|
3494 for (j = 0; j < 16; j++) {
|
|
3495 buf2[j*2+0] = "0123456789abcdef"[buf[j]>>4];
|
|
3496 buf2[j*2+1] = "0123456789abcdef"[buf[j]&15];
|
|
3497 }
|
|
3498 buf2[32] = 0;
|
|
3499 if (sam_hdr_update(hdr, ty, "M5", buf2, NULL))
|
|
3500 return -1;
|
|
3501 }
|
|
3502
|
|
3503 if (fd->ref_fn) {
|
|
3504 char ref_fn[PATH_MAX];
|
|
3505 full_path(ref_fn, fd->ref_fn);
|
|
3506 if (sam_hdr_update(hdr, ty, "UR", ref_fn, NULL))
|
|
3507 return -1;
|
|
3508 }
|
|
3509 }
|
|
3510 }
|
|
3511
|
|
3512 if (sam_hdr_rebuild(hdr))
|
|
3513 return -1;
|
|
3514
|
|
3515 /* Length */
|
|
3516 header_len = sam_hdr_length(hdr);
|
|
3517 if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
|
3518 if (-1 == int32_encode(fd, header_len))
|
|
3519 return -1;
|
|
3520
|
|
3521 /* Text data */
|
|
3522 if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
|
|
3523 return -1;
|
|
3524 } else {
|
|
3525 /* Create block(s) inside a container */
|
|
3526 cram_block *b = cram_new_block(FILE_HEADER, 0);
|
|
3527 cram_container *c = cram_new_container(0, 0);
|
|
3528 int padded_length;
|
|
3529 char *pads;
|
|
3530 int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
|
|
3531
|
|
3532 if (!b || !c) {
|
|
3533 if (b) cram_free_block(b);
|
|
3534 if (c) cram_free_container(c);
|
|
3535 return -1;
|
|
3536 }
|
|
3537
|
|
3538 int32_put(b, header_len);
|
|
3539 BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
|
|
3540 BLOCK_UPLEN(b);
|
|
3541
|
|
3542 // Compress header block if V3.0 and above
|
|
3543 if (CRAM_MAJOR_VERS(fd->version) >= 3 && fd->level > 0) {
|
|
3544 int method = 1<<GZIP;
|
|
3545 if (fd->use_bz2)
|
|
3546 method |= 1<<BZIP2;
|
|
3547 if (fd->use_lzma)
|
|
3548 method |= 1<<LZMA;
|
|
3549 cram_compress_block(fd, b, NULL, method, fd->level);
|
|
3550 }
|
|
3551
|
|
3552 if (blank_block) {
|
|
3553 c->length = b->comp_size + 2 + 4*is_cram_3 +
|
|
3554 itf8_size(b->content_id) +
|
|
3555 itf8_size(b->uncomp_size) +
|
|
3556 itf8_size(b->comp_size);
|
|
3557
|
|
3558 c->num_blocks = 2;
|
|
3559 c->num_landmarks = 2;
|
|
3560 if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
|
|
3561 cram_free_block(b);
|
|
3562 cram_free_container(c);
|
|
3563 return -1;
|
|
3564 }
|
|
3565 c->landmark[0] = 0;
|
|
3566 c->landmark[1] = c->length;
|
|
3567
|
|
3568 // Plus extra storage for uncompressed secondary blank block
|
|
3569 padded_length = MIN(c->length*.5, 10000);
|
|
3570 c->length += padded_length + 2 + 4*is_cram_3 +
|
|
3571 itf8_size(b->content_id) +
|
|
3572 itf8_size(padded_length)*2;
|
|
3573 } else {
|
|
3574 // Pad the block instead.
|
|
3575 c->num_blocks = 1;
|
|
3576 c->num_landmarks = 1;
|
|
3577 if (!(c->landmark = malloc(sizeof(*c->landmark))))
|
|
3578 return -1;
|
|
3579 c->landmark[0] = 0;
|
|
3580
|
|
3581 padded_length = MAX(c->length*1.5, 10000) - c->length;
|
|
3582
|
|
3583 c->length = b->comp_size + padded_length +
|
|
3584 2 + 4*is_cram_3 +
|
|
3585 itf8_size(b->content_id) +
|
|
3586 itf8_size(b->uncomp_size) +
|
|
3587 itf8_size(b->comp_size);
|
|
3588
|
|
3589 if (NULL == (pads = calloc(1, padded_length))) {
|
|
3590 cram_free_block(b);
|
|
3591 cram_free_container(c);
|
|
3592 return -1;
|
|
3593 }
|
|
3594 BLOCK_APPEND(b, pads, padded_length);
|
|
3595 BLOCK_UPLEN(b);
|
|
3596 free(pads);
|
|
3597 }
|
|
3598
|
|
3599 if (-1 == cram_write_container(fd, c)) {
|
|
3600 cram_free_block(b);
|
|
3601 cram_free_container(c);
|
|
3602 return -1;
|
|
3603 }
|
|
3604
|
|
3605 if (-1 == cram_write_block(fd, b)) {
|
|
3606 cram_free_block(b);
|
|
3607 cram_free_container(c);
|
|
3608 return -1;
|
|
3609 }
|
|
3610
|
|
3611 if (blank_block) {
|
|
3612 BLOCK_RESIZE(b, padded_length);
|
|
3613 memset(BLOCK_DATA(b), 0, padded_length);
|
|
3614 BLOCK_SIZE(b) = padded_length;
|
|
3615 BLOCK_UPLEN(b);
|
|
3616 b->method = RAW;
|
|
3617 if (-1 == cram_write_block(fd, b)) {
|
|
3618 cram_free_block(b);
|
|
3619 cram_free_container(c);
|
|
3620 return -1;
|
|
3621 }
|
|
3622 }
|
|
3623
|
|
3624 cram_free_block(b);
|
|
3625 cram_free_container(c);
|
|
3626 }
|
|
3627
|
|
3628 if (-1 == refs_from_header(fd->refs, fd, fd->header))
|
|
3629 return -1;
|
|
3630 if (-1 == refs2id(fd->refs, fd->header))
|
|
3631 return -1;
|
|
3632
|
|
3633 if (0 != hflush(fd->fp))
|
|
3634 return -1;
|
|
3635
|
|
3636 RP("=== Finishing saving header ===\n");
|
|
3637
|
|
3638 return 0;
|
|
3639 }
|
|
3640
|
|
3641 /* ----------------------------------------------------------------------
|
|
3642 * The top-level cram opening, closing and option handling
|
|
3643 */
|
|
3644
|
|
3645 /*
|
|
3646 * Initialises the lookup tables. These could be global statics, but they're
|
|
3647 * clumsy to setup in a multi-threaded environment unless we generate
|
|
3648 * verbatim code and include that.
|
|
3649 */
|
|
3650 static void cram_init_tables(cram_fd *fd) {
|
|
3651 int i;
|
|
3652
|
|
3653 memset(fd->L1, 4, 256);
|
|
3654 fd->L1['A'] = 0; fd->L1['a'] = 0;
|
|
3655 fd->L1['C'] = 1; fd->L1['c'] = 1;
|
|
3656 fd->L1['G'] = 2; fd->L1['g'] = 2;
|
|
3657 fd->L1['T'] = 3; fd->L1['t'] = 3;
|
|
3658
|
|
3659 memset(fd->L2, 5, 256);
|
|
3660 fd->L2['A'] = 0; fd->L2['a'] = 0;
|
|
3661 fd->L2['C'] = 1; fd->L2['c'] = 1;
|
|
3662 fd->L2['G'] = 2; fd->L2['g'] = 2;
|
|
3663 fd->L2['T'] = 3; fd->L2['t'] = 3;
|
|
3664 fd->L2['N'] = 4; fd->L2['n'] = 4;
|
|
3665
|
|
3666 if (CRAM_MAJOR_VERS(fd->version) == 1) {
|
|
3667 for (i = 0; i < 0x200; i++) {
|
|
3668 int f = 0;
|
|
3669
|
|
3670 if (i & CRAM_FPAIRED) f |= BAM_FPAIRED;
|
|
3671 if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
|
|
3672 if (i & CRAM_FUNMAP) f |= BAM_FUNMAP;
|
|
3673 if (i & CRAM_FREVERSE) f |= BAM_FREVERSE;
|
|
3674 if (i & CRAM_FREAD1) f |= BAM_FREAD1;
|
|
3675 if (i & CRAM_FREAD2) f |= BAM_FREAD2;
|
|
3676 if (i & CRAM_FSECONDARY) f |= BAM_FSECONDARY;
|
|
3677 if (i & CRAM_FQCFAIL) f |= BAM_FQCFAIL;
|
|
3678 if (i & CRAM_FDUP) f |= BAM_FDUP;
|
|
3679
|
|
3680 fd->bam_flag_swap[i] = f;
|
|
3681 }
|
|
3682
|
|
3683 for (i = 0; i < 0x1000; i++) {
|
|
3684 int g = 0;
|
|
3685
|
|
3686 if (i & BAM_FPAIRED) g |= CRAM_FPAIRED;
|
|
3687 if (i & BAM_FPROPER_PAIR) g |= CRAM_FPROPER_PAIR;
|
|
3688 if (i & BAM_FUNMAP) g |= CRAM_FUNMAP;
|
|
3689 if (i & BAM_FREVERSE) g |= CRAM_FREVERSE;
|
|
3690 if (i & BAM_FREAD1) g |= CRAM_FREAD1;
|
|
3691 if (i & BAM_FREAD2) g |= CRAM_FREAD2;
|
|
3692 if (i & BAM_FSECONDARY) g |= CRAM_FSECONDARY;
|
|
3693 if (i & BAM_FQCFAIL) g |= CRAM_FQCFAIL;
|
|
3694 if (i & BAM_FDUP) g |= CRAM_FDUP;
|
|
3695
|
|
3696 fd->cram_flag_swap[i] = g;
|
|
3697 }
|
|
3698 } else {
|
|
3699 /* NOP */
|
|
3700 for (i = 0; i < 0x1000; i++)
|
|
3701 fd->bam_flag_swap[i] = i;
|
|
3702 for (i = 0; i < 0x1000; i++)
|
|
3703 fd->cram_flag_swap[i] = i;
|
|
3704 }
|
|
3705
|
|
3706 memset(fd->cram_sub_matrix, 4, 32*32);
|
|
3707 for (i = 0; i < 32; i++) {
|
|
3708 fd->cram_sub_matrix[i]['A'&0x1f]=0;
|
|
3709 fd->cram_sub_matrix[i]['C'&0x1f]=1;
|
|
3710 fd->cram_sub_matrix[i]['G'&0x1f]=2;
|
|
3711 fd->cram_sub_matrix[i]['T'&0x1f]=3;
|
|
3712 fd->cram_sub_matrix[i]['N'&0x1f]=4;
|
|
3713 }
|
|
3714 for (i = 0; i < 20; i+=4) {
|
|
3715 int j;
|
|
3716 for (j = 0; j < 20; j++) {
|
|
3717 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
|
|
3718 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
|
|
3719 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
|
|
3720 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
|
|
3721 }
|
|
3722 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
|
|
3723 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
|
|
3724 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
|
|
3725 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
|
|
3726 }
|
|
3727 }
|
|
3728
|
|
3729 // Default version numbers for CRAM
|
|
3730 static int major_version = 2;
|
|
3731 static int minor_version = 1;
|
|
3732
|
|
3733 /*
|
|
3734 * Opens a CRAM file for read (mode "rb") or write ("wb").
|
|
3735 * The filename may be "-" to indicate stdin or stdout.
|
|
3736 *
|
|
3737 * Returns file handle on success
|
|
3738 * NULL on failure.
|
|
3739 */
|
|
3740 cram_fd *cram_open(const char *filename, const char *mode) {
|
|
3741 hFILE *fp;
|
|
3742 cram_fd *fd;
|
|
3743 char fmode[3]= { mode[0], '\0', '\0' };
|
|
3744
|
|
3745 if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
|
|
3746 fmode[1] = 'b';
|
|
3747 }
|
|
3748
|
|
3749 fp = hopen(filename, fmode);
|
|
3750 if (!fp)
|
|
3751 return NULL;
|
|
3752
|
|
3753 fd = cram_dopen(fp, filename, mode);
|
|
3754 if (!fd)
|
|
3755 hclose_abruptly(fp);
|
|
3756
|
|
3757 return fd;
|
|
3758 }
|
|
3759
|
|
3760 /* Opens an existing stream for reading or writing.
|
|
3761 *
|
|
3762 * Returns file handle on success;
|
|
3763 * NULL on failure.
|
|
3764 */
|
|
3765 cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
|
|
3766 int i;
|
|
3767 char *cp;
|
|
3768 cram_fd *fd = calloc(1, sizeof(*fd));
|
|
3769 if (!fd)
|
|
3770 return NULL;
|
|
3771
|
|
3772 fd->level = 5;
|
|
3773 for (i = 0; mode[i]; i++) {
|
|
3774 if (mode[i] >= '0' && mode[i] <= '9') {
|
|
3775 fd->level = mode[i] - '0';
|
|
3776 break;
|
|
3777 }
|
|
3778 }
|
|
3779
|
|
3780 fd->fp = fp;
|
|
3781 fd->mode = *mode;
|
|
3782 fd->first_container = 0;
|
|
3783
|
|
3784 if (fd->mode == 'r') {
|
|
3785 /* Reader */
|
|
3786
|
|
3787 if (!(fd->file_def = cram_read_file_def(fd)))
|
|
3788 goto err;
|
|
3789
|
|
3790 fd->version = fd->file_def->major_version * 256 +
|
|
3791 fd->file_def->minor_version;
|
|
3792
|
|
3793 if (!(fd->header = cram_read_SAM_hdr(fd)))
|
|
3794 goto err;
|
|
3795
|
|
3796 } else {
|
|
3797 /* Writer */
|
|
3798 cram_file_def *def = calloc(1, sizeof(*def));
|
|
3799 if (!def)
|
|
3800 return NULL;
|
|
3801
|
|
3802 fd->file_def = def;
|
|
3803
|
|
3804 def->magic[0] = 'C';
|
|
3805 def->magic[1] = 'R';
|
|
3806 def->magic[2] = 'A';
|
|
3807 def->magic[3] = 'M';
|
|
3808 def->major_version = 0; // Indicator to write file def later.
|
|
3809 def->minor_version = 0;
|
|
3810 memset(def->file_id, 0, 20);
|
|
3811 strncpy(def->file_id, filename, 20);
|
|
3812
|
|
3813 fd->version = major_version * 256 + minor_version;
|
|
3814
|
|
3815 /* SAM header written later along with this file_def */
|
|
3816 }
|
|
3817
|
|
3818 cram_init_tables(fd);
|
|
3819
|
|
3820 fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
|
|
3821 if (!fd->prefix)
|
|
3822 goto err;
|
|
3823 fd->first_base = fd->last_base = -1;
|
|
3824 fd->record_counter = 0;
|
|
3825
|
|
3826 fd->ctr = NULL;
|
|
3827 fd->refs = refs_create();
|
|
3828 if (!fd->refs)
|
|
3829 goto err;
|
|
3830 fd->ref_id = -2;
|
|
3831 fd->ref = NULL;
|
|
3832
|
|
3833 fd->decode_md = 0;
|
|
3834 fd->verbose = 0;
|
|
3835 fd->seqs_per_slice = SEQS_PER_SLICE;
|
|
3836 fd->slices_per_container = SLICE_PER_CNT;
|
|
3837 fd->embed_ref = 0;
|
|
3838 fd->no_ref = 0;
|
|
3839 fd->ignore_md5 = 0;
|
|
3840 fd->use_bz2 = 0;
|
|
3841 fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
|
|
3842 fd->use_lzma = 0;
|
|
3843 fd->multi_seq = -1;
|
|
3844 fd->unsorted = 0;
|
|
3845 fd->shared_ref = 0;
|
|
3846
|
|
3847 fd->index = NULL;
|
|
3848 fd->own_pool = 0;
|
|
3849 fd->pool = NULL;
|
|
3850 fd->rqueue = NULL;
|
|
3851 fd->job_pending = NULL;
|
|
3852 fd->ooc = 0;
|
|
3853 fd->required_fields = INT_MAX;
|
|
3854
|
|
3855 for (i = 0; i < DS_END; i++)
|
|
3856 fd->m[i] = cram_new_metrics();
|
|
3857
|
|
3858 fd->range.refid = -2; // no ref.
|
|
3859 fd->eof = 1; // See samtools issue #150
|
|
3860 fd->ref_fn = NULL;
|
|
3861
|
|
3862 fd->bl = NULL;
|
|
3863
|
|
3864 /* Initialise dummy refs from the @SQ headers */
|
|
3865 if (-1 == refs_from_header(fd->refs, fd, fd->header))
|
|
3866 goto err;
|
|
3867
|
|
3868 return fd;
|
|
3869
|
|
3870 err:
|
|
3871 if (fd)
|
|
3872 free(fd);
|
|
3873
|
|
3874 return NULL;
|
|
3875 }
|
|
3876
|
|
3877 /*
|
|
3878 * Seek within a CRAM file.
|
|
3879 *
|
|
3880 * Returns 0 on success
|
|
3881 * -1 on failure
|
|
3882 */
|
|
3883 int cram_seek(cram_fd *fd, off_t offset, int whence) {
|
|
3884 char buf[65536];
|
|
3885
|
|
3886 fd->ooc = 0;
|
|
3887
|
|
3888 if (hseek(fd->fp, offset, whence) >= 0)
|
|
3889 return 0;
|
|
3890
|
|
3891 if (!(whence == SEEK_CUR && offset >= 0))
|
|
3892 return -1;
|
|
3893
|
|
3894 /* Couldn't fseek, but we're in SEEK_CUR mode so read instead */
|
|
3895 while (offset > 0) {
|
|
3896 int len = MIN(65536, offset);
|
|
3897 if (len != hread(fd->fp, buf, len))
|
|
3898 return -1;
|
|
3899 offset -= len;
|
|
3900 }
|
|
3901
|
|
3902 return 0;
|
|
3903 }
|
|
3904
|
|
3905 /*
|
|
3906 * Flushes a CRAM file.
|
|
3907 * Useful for when writing to stdout without wishing to close the stream.
|
|
3908 *
|
|
3909 * Returns 0 on success
|
|
3910 * -1 on failure
|
|
3911 */
|
|
3912 int cram_flush(cram_fd *fd) {
|
|
3913 if (!fd)
|
|
3914 return -1;
|
|
3915
|
|
3916 if (fd->mode == 'w' && fd->ctr) {
|
|
3917 if(fd->ctr->slice)
|
|
3918 fd->ctr->curr_slice++;
|
|
3919 if (-1 == cram_flush_container_mt(fd, fd->ctr))
|
|
3920 return -1;
|
|
3921 }
|
|
3922
|
|
3923 return 0;
|
|
3924 }
|
|
3925
|
|
3926 /*
|
|
3927 * Closes a CRAM file.
|
|
3928 * Returns 0 on success
|
|
3929 * -1 on failure
|
|
3930 */
|
|
3931 int cram_close(cram_fd *fd) {
|
|
3932 spare_bams *bl, *next;
|
|
3933 int i;
|
|
3934
|
|
3935 if (!fd)
|
|
3936 return -1;
|
|
3937
|
|
3938 if (fd->mode == 'w' && fd->ctr) {
|
|
3939 if(fd->ctr->slice)
|
|
3940 fd->ctr->curr_slice++;
|
|
3941 if (-1 == cram_flush_container_mt(fd, fd->ctr))
|
|
3942 return -1;
|
|
3943 }
|
|
3944
|
|
3945 if (fd->pool) {
|
|
3946 t_pool_flush(fd->pool);
|
|
3947
|
|
3948 if (0 != cram_flush_result(fd))
|
|
3949 return -1;
|
|
3950
|
|
3951 pthread_mutex_destroy(&fd->metrics_lock);
|
|
3952 pthread_mutex_destroy(&fd->ref_lock);
|
|
3953 pthread_mutex_destroy(&fd->bam_list_lock);
|
|
3954
|
|
3955 fd->ctr = NULL; // prevent double freeing
|
|
3956
|
|
3957 //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
|
|
3958
|
|
3959 t_results_queue_destroy(fd->rqueue);
|
|
3960 }
|
|
3961
|
|
3962 if (fd->mode == 'w') {
|
|
3963 /* Write EOF block */
|
|
3964 if (CRAM_MAJOR_VERS(fd->version) == 3) {
|
|
3965 if (38 != hwrite(fd->fp,
|
|
3966 "\x0f\x00\x00\x00\xff\xff\xff\xff" // Cont HDR
|
|
3967 "\x0f\xe0\x45\x4f\x46\x00\x00\x00" // Cont HDR
|
|
3968 "\x00\x01\x00" // Cont HDR
|
|
3969 "\x05\xbd\xd9\x4f" // CRC32
|
|
3970 "\x00\x01\x00\x06\x06" // Comp.HDR blk
|
|
3971 "\x01\x00\x01\x00\x01\x00" // Comp.HDR blk
|
|
3972 "\xee\x63\x01\x4b", // CRC32
|
|
3973 38))
|
|
3974 return -1;
|
|
3975 } else {
|
|
3976 if (30 != hwrite(fd->fp,
|
|
3977 "\x0b\x00\x00\x00\xff\xff\xff\xff"
|
|
3978 "\x0f\xe0\x45\x4f\x46\x00\x00\x00"
|
|
3979 "\x00\x01\x00\x00\x01\x00\x06\x06"
|
|
3980 "\x01\x00\x01\x00\x01\x00", 30))
|
|
3981 return -1;
|
|
3982 }
|
|
3983 }
|
|
3984
|
|
3985 for (bl = fd->bl; bl; bl = next) {
|
|
3986 int i, max_rec = fd->seqs_per_slice * fd->slices_per_container;
|
|
3987
|
|
3988 next = bl->next;
|
|
3989 for (i = 0; i < max_rec; i++) {
|
|
3990 if (bl->bams[i])
|
|
3991 bam_free(bl->bams[i]);
|
|
3992 }
|
|
3993 free(bl->bams);
|
|
3994 free(bl);
|
|
3995 }
|
|
3996
|
|
3997 if (hclose(fd->fp) != 0)
|
|
3998 return -1;
|
|
3999
|
|
4000 if (fd->file_def)
|
|
4001 cram_free_file_def(fd->file_def);
|
|
4002
|
|
4003 if (fd->header)
|
|
4004 sam_hdr_free(fd->header);
|
|
4005
|
|
4006 free(fd->prefix);
|
|
4007
|
|
4008 if (fd->ctr)
|
|
4009 cram_free_container(fd->ctr);
|
|
4010
|
|
4011 if (fd->refs)
|
|
4012 refs_free(fd->refs);
|
|
4013 if (fd->ref_free)
|
|
4014 free(fd->ref_free);
|
|
4015
|
|
4016 for (i = 0; i < DS_END; i++)
|
|
4017 if (fd->m[i])
|
|
4018 free(fd->m[i]);
|
|
4019
|
|
4020 if (fd->index)
|
|
4021 cram_index_free(fd);
|
|
4022
|
|
4023 if (fd->own_pool && fd->pool)
|
|
4024 t_pool_destroy(fd->pool, 0);
|
|
4025
|
|
4026 free(fd);
|
|
4027 return 0;
|
|
4028 }
|
|
4029
|
|
4030 /*
|
|
4031 * Returns 1 if we hit an EOF while reading.
|
|
4032 */
|
|
4033 int cram_eof(cram_fd *fd) {
|
|
4034 return fd->eof;
|
|
4035 }
|
|
4036
|
|
4037
|
|
4038 /*
|
|
4039 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
|
|
4040 * Use this immediately after opening.
|
|
4041 *
|
|
4042 * Returns 0 on success
|
|
4043 * -1 on failure
|
|
4044 */
|
|
4045 int cram_set_option(cram_fd *fd, enum cram_option opt, ...) {
|
|
4046 int r;
|
|
4047 va_list args;
|
|
4048
|
|
4049 va_start(args, opt);
|
|
4050 r = cram_set_voption(fd, opt, args);
|
|
4051 va_end(args);
|
|
4052
|
|
4053 return r;
|
|
4054 }
|
|
4055
|
|
4056 /*
|
|
4057 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
|
|
4058 * Use this immediately after opening.
|
|
4059 *
|
|
4060 * Returns 0 on success
|
|
4061 * -1 on failure
|
|
4062 */
|
|
4063 int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args) {
|
|
4064 refs_t *refs;
|
|
4065
|
|
4066 if (!fd)
|
|
4067 return -1;
|
|
4068
|
|
4069 switch (opt) {
|
|
4070 case CRAM_OPT_DECODE_MD:
|
|
4071 fd->decode_md = va_arg(args, int);
|
|
4072 break;
|
|
4073
|
|
4074 case CRAM_OPT_PREFIX:
|
|
4075 if (fd->prefix)
|
|
4076 free(fd->prefix);
|
|
4077 if (!(fd->prefix = strdup(va_arg(args, char *))))
|
|
4078 return -1;
|
|
4079 break;
|
|
4080
|
|
4081 case CRAM_OPT_VERBOSITY:
|
|
4082 fd->verbose = va_arg(args, int);
|
|
4083 break;
|
|
4084
|
|
4085 case CRAM_OPT_SEQS_PER_SLICE:
|
|
4086 fd->seqs_per_slice = va_arg(args, int);
|
|
4087 break;
|
|
4088
|
|
4089 case CRAM_OPT_SLICES_PER_CONTAINER:
|
|
4090 fd->slices_per_container = va_arg(args, int);
|
|
4091 break;
|
|
4092
|
|
4093 case CRAM_OPT_EMBED_REF:
|
|
4094 fd->embed_ref = va_arg(args, int);
|
|
4095 break;
|
|
4096
|
|
4097 case CRAM_OPT_NO_REF:
|
|
4098 fd->no_ref = va_arg(args, int);
|
|
4099 break;
|
|
4100
|
|
4101 case CRAM_OPT_IGNORE_MD5:
|
|
4102 fd->ignore_md5 = va_arg(args, int);
|
|
4103 break;
|
|
4104
|
|
4105 case CRAM_OPT_USE_BZIP2:
|
|
4106 fd->use_bz2 = va_arg(args, int);
|
|
4107 break;
|
|
4108
|
|
4109 case CRAM_OPT_USE_RANS:
|
|
4110 fd->use_rans = va_arg(args, int);
|
|
4111 break;
|
|
4112
|
|
4113 case CRAM_OPT_USE_LZMA:
|
|
4114 fd->use_lzma = va_arg(args, int);
|
|
4115 break;
|
|
4116
|
|
4117 case CRAM_OPT_SHARED_REF:
|
|
4118 fd->shared_ref = 1;
|
|
4119 refs = va_arg(args, refs_t *);
|
|
4120 if (refs != fd->refs) {
|
|
4121 if (fd->refs)
|
|
4122 refs_free(fd->refs);
|
|
4123 fd->refs = refs;
|
|
4124 fd->refs->count++;
|
|
4125 }
|
|
4126 break;
|
|
4127
|
|
4128 case CRAM_OPT_RANGE:
|
|
4129 fd->range = *va_arg(args, cram_range *);
|
|
4130 return cram_seek_to_refpos(fd, &fd->range);
|
|
4131
|
|
4132 case CRAM_OPT_REFERENCE:
|
|
4133 return cram_load_reference(fd, va_arg(args, char *));
|
|
4134
|
|
4135 case CRAM_OPT_VERSION: {
|
|
4136 int major, minor;
|
|
4137 char *s = va_arg(args, char *);
|
|
4138 if (2 != sscanf(s, "%d.%d", &major, &minor)) {
|
|
4139 fprintf(stderr, "Malformed version string %s\n", s);
|
|
4140 return -1;
|
|
4141 }
|
|
4142 if (!((major == 1 && minor == 0) ||
|
|
4143 (major == 2 && (minor == 0 || minor == 1)) ||
|
|
4144 (major == 3 && minor == 0))) {
|
|
4145 fprintf(stderr, "Unknown version string; "
|
|
4146 "use 1.0, 2.0, 2.1 or 3.0\n");
|
|
4147 return -1;
|
|
4148 }
|
|
4149 fd->version = major*256 + minor;
|
|
4150
|
|
4151 if (CRAM_MAJOR_VERS(fd->version) >= 3)
|
|
4152 fd->use_rans = 1;
|
|
4153 break;
|
|
4154 }
|
|
4155
|
|
4156 case CRAM_OPT_MULTI_SEQ_PER_SLICE:
|
|
4157 fd->multi_seq = va_arg(args, int);
|
|
4158 break;
|
|
4159
|
|
4160 case CRAM_OPT_NTHREADS: {
|
|
4161 int nthreads = va_arg(args, int);
|
|
4162 if (nthreads > 1) {
|
|
4163 if (!(fd->pool = t_pool_init(nthreads*2, nthreads)))
|
|
4164 return -1;
|
|
4165
|
|
4166 fd->rqueue = t_results_queue_init();
|
|
4167 pthread_mutex_init(&fd->metrics_lock, NULL);
|
|
4168 pthread_mutex_init(&fd->ref_lock, NULL);
|
|
4169 pthread_mutex_init(&fd->bam_list_lock, NULL);
|
|
4170 fd->shared_ref = 1;
|
|
4171 fd->own_pool = 1;
|
|
4172 }
|
|
4173 break;
|
|
4174 }
|
|
4175
|
|
4176 case CRAM_OPT_THREAD_POOL:
|
|
4177 fd->pool = va_arg(args, t_pool *);
|
|
4178 if (fd->pool) {
|
|
4179 fd->rqueue = t_results_queue_init();
|
|
4180 pthread_mutex_init(&fd->metrics_lock, NULL);
|
|
4181 pthread_mutex_init(&fd->ref_lock, NULL);
|
|
4182 pthread_mutex_init(&fd->bam_list_lock, NULL);
|
|
4183 }
|
|
4184 fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
|
|
4185 fd->own_pool = 0;
|
|
4186
|
|
4187 //fd->qsize = 1;
|
|
4188 //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
|
|
4189 //t_pool_dispatch(fd->pool, cram_decoder_thread, fd);
|
|
4190 break;
|
|
4191
|
|
4192 case CRAM_OPT_REQUIRED_FIELDS:
|
|
4193 fd->required_fields = va_arg(args, int);
|
|
4194 break;
|
|
4195
|
|
4196 default:
|
|
4197 fprintf(stderr, "Unknown CRAM option code %d\n", opt);
|
|
4198 return -1;
|
|
4199 }
|
|
4200
|
|
4201 return 0;
|
|
4202 }
|