comparison ezBAMQC/src/htslib/cram/cram_io.h @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /*
2 Copyright (c) 2012-2014 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*! \file
32 * Include cram.h instead.
33 *
34 * This is an internal part of the CRAM system and is automatically included
35 * when you #include cram.h.
36 *
37 * Implements the low level CRAM I/O primitives.
38 * This includes basic data types such as byte, int, ITF-8,
39 * maps, bitwise I/O, etc.
40 */
41
42 #ifndef _CRAM_IO_H_
43 #define _CRAM_IO_H_
44
45 #ifdef __cplusplus
46 extern "C" {
47 #endif
48
49 #define ITF8_MACROS
50
51 #include <stdint.h>
52 #include <cram/misc.h>
53
54 /**@{ ----------------------------------------------------------------------
55 * ITF8 encoding and decoding.
56 *
57 * Also see the itf8_get and itf8_put macros.
58 */
59
60 /*! INTERNAL: Converts two characters into an integer for use in switch{} */
61 #define CRAM_KEY(a,b) (((a)<<8)|((b)))
62
63 /*! Reads an integer in ITF-8 encoding from 'fd' and stores it in
64 * *val.
65 *
66 * @return
67 * Returns the number of bytes read on success;
68 * -1 on failure
69 */
70 int itf8_decode(cram_fd *fd, int32_t *val);
71
72 #ifndef ITF8_MACROS
73 /*! Reads an integer in ITF-8 encoding from 'cp' and stores it in
74 * *val.
75 *
76 * @return
77 * Returns the number of bytes read on success;
78 * -1 on failure
79 */
80 int itf8_get(char *cp, int32_t *val_p);
81
82 /*! Stores a value to memory in ITF-8 format.
83 *
84 * @return
85 * Returns the number of bytes required to store the number.
86 * This is a maximum of 5 bytes.
87 */
88 int itf8_put(char *cp, int32_t val);
89
90 #else
91
92 /*
93 * Macro implementations of the above
94 */
95 #define itf8_get(c,v) (((uc)(c)[0]<0x80)?(*(v)=(uc)(c)[0],1):(((uc)(c)[0]<0xc0)?(*(v)=(((uc)(c)[0]<<8)|(uc)(c)[1])&0x3fff,2):(((uc)(c)[0]<0xe0)?(*(v)=(((uc)(c)[0]<<16)|((uc)(c)[1]<<8)|(uc)(c)[2])&0x1fffff,3):(((uc)(c)[0]<0xf0)?(*(v)=(((uc)(c)[0]<<24)|((uc)(c)[1]<<16)|((uc)(c)[2]<<8)|(uc)(c)[3])&0x0fffffff,4):(*(v)=(((uc)(c)[0]&0x0f)<<28)|((uc)(c)[1]<<20)|((uc)(c)[2]<<12)|((uc)(c)[3]<<4)|((uc)(c)[4]&0x0f),5)))))
96
97 #define itf8_put(c,v) ((!((v)&~0x7f))?((c)[0]=(v),1):(!((v)&~0x3fff))?((c)[0]=((v)>>8)|0x80,(c)[1]=(v)&0xff,2):(!((v)&~0x1fffff))?((c)[0]=((v)>>16)|0xc0,(c)[1]=((v)>>8)&0xff,(c)[2]=(v)&0xff,3):(!((v)&~0xfffffff))?((c)[0]=((v)>>24)|0xe0,(c)[1]=((v)>>16)&0xff,(c)[2]=((v)>>8)&0xff,(c)[3]=(v)&0xff,4):((c)[0]=0xf0|(((v)>>28)&0xff),(c)[1]=((v)>>20)&0xff,(c)[2]=((v)>>12)&0xff,(c)[3]=((v)>>4)&0xff,(c)[4]=(v)&0xf,5))
98
99 #define itf8_size(v) ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5)
100
101 #endif
102
103 int ltf8_get(char *cp, int64_t *val_p);
104 int ltf8_put(char *cp, int64_t val);
105
106 /*! Pushes a value in ITF8 format onto the end of a block.
107 *
108 * This shouldn't be used for high-volume data as it is not the fastest
109 * method.
110 *
111 * @return
112 * Returns the number of bytes written
113 */
114 int itf8_put_blk(cram_block *blk, int val);
115
116 /**@}*/
117 /**@{ ----------------------------------------------------------------------
118 * CRAM blocks - the dynamically growable data block. We have code to
119 * create, update, (un)compress and read/write.
120 *
121 * These are derived from the deflate_interlaced.c blocks, but with the
122 * CRAM extension of content types and IDs.
123 */
124
125 /*! Allocates a new cram_block structure with a specified content_type and
126 * id.
127 *
128 * @return
129 * Returns block pointer on success;
130 * NULL on failure
131 */
132 cram_block *cram_new_block(enum cram_content_type content_type,
133 int content_id);
134
135 /*! Reads a block from a cram file.
136 *
137 * @return
138 * Returns cram_block pointer on success;
139 * NULL on failure
140 */
141 cram_block *cram_read_block(cram_fd *fd);
142
143 /*! Writes a CRAM block.
144 *
145 * @return
146 * Returns 0 on success;
147 * -1 on failure
148 */
149 int cram_write_block(cram_fd *fd, cram_block *b);
150
151 /*! Frees a CRAM block, deallocating internal data too.
152 */
153 void cram_free_block(cram_block *b);
154
155 /*! Uncompress a memory block using Zlib.
156 *
157 * @return
158 * Returns 0 on success;
159 * -1 on failure
160 */
161 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size);
162
163 /*! Uncompresses a CRAM block, if compressed.
164 *
165 * @return
166 * Returns 0 on success;
167 * -1 on failure
168 */
169 int cram_uncompress_block(cram_block *b);
170
171 /*! Compresses a block.
172 *
173 * Compresses a block using one of two different zlib strategies. If we only
174 * want one choice set strat2 to be -1.
175 *
176 * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
177 * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
178 * significantly faster.
179 *
180 * @return
181 * Returns 0 on success;
182 * -1 on failure
183 */
184 int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
185 int method, int level);
186
187 cram_metrics *cram_new_metrics(void);
188 char *cram_block_method2str(enum cram_block_method m);
189 char *cram_content_type2str(enum cram_content_type t);
190
191 /* --- Accessor macros for manipulating blocks on a byte by byte basis --- */
192
193 /* Block size and data pointer. */
194 #define BLOCK_SIZE(b) ((b)->byte)
195 #define BLOCK_DATA(b) ((b)->data)
196
197 /* Returns the address one past the end of the block */
198 #define BLOCK_END(b) (&(b)->data[(b)->byte])
199
200 /* Request block to be at least 'l' bytes long */
201 #define BLOCK_RESIZE(b,l) \
202 do { \
203 while((b)->alloc <= (l)) { \
204 (b)->alloc = (b)->alloc ? (b)->alloc*1.5 : 1024; \
205 (b)->data = realloc((b)->data, (b)->alloc); \
206 } \
207 } while(0)
208
209 /* Ensure the block can hold at least another 'l' bytes */
210 #define BLOCK_GROW(b,l) BLOCK_RESIZE((b), BLOCK_SIZE((b)) + (l))
211
212 /* Append string 's' of length 'l' */
213 #define BLOCK_APPEND(b,s,l) \
214 do { \
215 BLOCK_GROW((b),(l)); \
216 memcpy(BLOCK_END((b)), (s), (l)); \
217 BLOCK_SIZE((b)) += (l); \
218 } while (0)
219
220 /* Append as single character 'c' */
221 #define BLOCK_APPEND_CHAR(b,c) \
222 do { \
223 BLOCK_GROW((b),1); \
224 (b)->data[(b)->byte++] = (c); \
225 } while (0)
226
227 /* Append a single unsigned integer */
228 #define BLOCK_APPEND_UINT(b,i) \
229 do { \
230 unsigned char *cp; \
231 BLOCK_GROW((b),11); \
232 cp = &(b)->data[(b)->byte]; \
233 (b)->byte += append_uint32(cp, (i)) - cp; \
234 } while (0)
235
236 static inline unsigned char *append_uint32(unsigned char *cp, uint32_t i) {
237 uint32_t j;
238
239 if (i == 0) {
240 *cp++ = '0';
241 return cp;
242 }
243
244 if (i < 100) goto b1;
245 if (i < 10000) goto b3;
246 if (i < 1000000) goto b5;
247 if (i < 100000000) goto b7;
248
249 if ((j = i / 1000000000)) {*cp++ = j + '0'; i -= j*1000000000; goto x8;}
250 if ((j = i / 100000000)) {*cp++ = j + '0'; i -= j*100000000; goto x7;}
251 b7:if ((j = i / 10000000)) {*cp++ = j + '0'; i -= j*10000000; goto x6;}
252 if ((j = i / 1000000)) {*cp++ = j + '0', i -= j*1000000; goto x5;}
253 b5:if ((j = i / 100000)) {*cp++ = j + '0', i -= j*100000; goto x4;}
254 if ((j = i / 10000)) {*cp++ = j + '0', i -= j*10000; goto x3;}
255 b3:if ((j = i / 1000)) {*cp++ = j + '0', i -= j*1000; goto x2;}
256 if ((j = i / 100)) {*cp++ = j + '0', i -= j*100; goto x1;}
257 b1:if ((j = i / 10)) {*cp++ = j + '0', i -= j*10; goto x0;}
258 if (i) *cp++ = i + '0';
259 return cp;
260
261 x8: *cp++ = i / 100000000 + '0', i %= 100000000;
262 x7: *cp++ = i / 10000000 + '0', i %= 10000000;
263 x6: *cp++ = i / 1000000 + '0', i %= 1000000;
264 x5: *cp++ = i / 100000 + '0', i %= 100000;
265 x4: *cp++ = i / 10000 + '0', i %= 10000;
266 x3: *cp++ = i / 1000 + '0', i %= 1000;
267 x2: *cp++ = i / 100 + '0', i %= 100;
268 x1: *cp++ = i / 10 + '0', i %= 10;
269 x0: *cp++ = i + '0';
270
271 return cp;
272 }
273
274 static inline unsigned char *append_sub32(unsigned char *cp, uint32_t i) {
275 *cp++ = i / 100000000 + '0', i %= 100000000;
276 *cp++ = i / 10000000 + '0', i %= 10000000;
277 *cp++ = i / 1000000 + '0', i %= 1000000;
278 *cp++ = i / 100000 + '0', i %= 100000;
279 *cp++ = i / 10000 + '0', i %= 10000;
280 *cp++ = i / 1000 + '0', i %= 1000;
281 *cp++ = i / 100 + '0', i %= 100;
282 *cp++ = i / 10 + '0', i %= 10;
283 *cp++ = i + '0';
284
285 return cp;
286 }
287
288 static inline unsigned char *append_uint64(unsigned char *cp, uint64_t i) {
289 uint64_t j;
290
291 if (i <= 0xffffffff)
292 return append_uint32(cp, i);
293
294 if ((j = i/1000000000) > 1000000000) {
295 cp = append_uint32(cp, j/1000000000);
296 j %= 1000000000;
297 cp = append_sub32(cp, j);
298 } else {
299 cp = append_uint32(cp, i / 1000000000);
300 }
301 cp = append_sub32(cp, i % 1000000000);
302
303 return cp;
304 }
305
306 #define BLOCK_UPLEN(b) \
307 (b)->comp_size = (b)->uncomp_size = BLOCK_SIZE((b))
308
309 /**@}*/
310 /**@{ ----------------------------------------------------------------------
311 * Reference sequence handling
312 */
313
314 /*! Loads a reference set from fn and stores in the cram_fd.
315 *
316 * @return
317 * Returns 0 on success;
318 * -1 on failure
319 */
320 int cram_load_reference(cram_fd *fd, char *fn);
321
322 /*! Generates a lookup table in refs based on the SQ headers in SAM_hdr.
323 *
324 * Indexes references by the order they appear in a BAM file. This may not
325 * necessarily be the same order they appear in the fasta reference file.
326 *
327 * @return
328 * Returns 0 on success;
329 * -1 on failure
330 */
331 int refs2id(refs_t *r, SAM_hdr *bfd);
332
333 void refs_free(refs_t *r);
334
335 /*! Returns a portion of a reference sequence from start to end inclusive.
336 *
337 * The returned pointer is owned by the cram_file fd and should not be freed
338 * by the caller. It is valid only until the next cram_get_ref is called
339 * with the same fd parameter (so is thread-safe if given multiple files).
340 *
341 * To return the entire reference sequence, specify start as 1 and end
342 * as 0.
343 *
344 * @return
345 * Returns reference on success;
346 * NULL on failure
347 */
348 char *cram_get_ref(cram_fd *fd, int id, int start, int end);
349 void cram_ref_incr(refs_t *r, int id);
350 void cram_ref_decr(refs_t *r, int id);
351 /**@}*/
352 /**@{ ----------------------------------------------------------------------
353 * Containers
354 */
355
356 /*! Creates a new container, specifying the maximum number of slices
357 * and records permitted.
358 *
359 * @return
360 * Returns cram_container ptr on success;
361 * NULL on failure
362 */
363 cram_container *cram_new_container(int nrec, int nslice);
364 void cram_free_container(cram_container *c);
365
366 /*! Reads a container header.
367 *
368 * @return
369 * Returns cram_container on success;
370 * NULL on failure or no container left (fd->err == 0).
371 */
372 cram_container *cram_read_container(cram_fd *fd);
373
374 /*! Writes a container structure.
375 *
376 * @return
377 * Returns 0 on success;
378 * -1 on failure
379 */
380 int cram_write_container(cram_fd *fd, cram_container *h);
381
382 /*! Flushes a container to disk.
383 *
384 * Flushes a completely or partially full container to disk, writing
385 * container structure, header and blocks. This also calls the encoder
386 * functions.
387 *
388 * @return
389 * Returns 0 on success;
390 * -1 on failure
391 */
392 int cram_flush_container(cram_fd *fd, cram_container *c);
393 int cram_flush_container_mt(cram_fd *fd, cram_container *c);
394
395
396 /**@}*/
397 /**@{ ----------------------------------------------------------------------
398 * Compression headers; the first part of the container
399 */
400
401 /*! Creates a new blank container compression header
402 *
403 * @return
404 * Returns header ptr on success;
405 * NULL on failure
406 */
407 cram_block_compression_hdr *cram_new_compression_header(void);
408
409 /*! Frees a cram_block_compression_hdr */
410 void cram_free_compression_header(cram_block_compression_hdr *hdr);
411
412
413 /**@}*/
414 /**@{ ----------------------------------------------------------------------
415 * Slices and slice headers
416 */
417
418 /*! Frees a slice header */
419 void cram_free_slice_header(cram_block_slice_hdr *hdr);
420
421 /*! Frees a slice */
422 void cram_free_slice(cram_slice *s);
423
424 /*! Creates a new empty slice in memory, for subsequent writing to
425 * disk.
426 *
427 * @return
428 * Returns cram_slice ptr on success;
429 * NULL on failure
430 */
431 cram_slice *cram_new_slice(enum cram_content_type type, int nrecs);
432
433 /*! Loads an entire slice.
434 *
435 * FIXME: In 1.0 the native unit of slices within CRAM is broken
436 * as slices contain references to objects in other slices.
437 * To work around this while keeping the slice oriented outer loop
438 * we read all slices and stitch them together into a fake large
439 * slice instead.
440 *
441 * @return
442 * Returns cram_slice ptr on success;
443 * NULL on failure
444 */
445 cram_slice *cram_read_slice(cram_fd *fd);
446
447
448
449 /**@}*/
450 /**@{ ----------------------------------------------------------------------
451 * CRAM file definition (header)
452 */
453
454 /*! Reads a CRAM file definition structure.
455 *
456 * @return
457 * Returns file_def ptr on success;
458 * NULL on failure
459 */
460 cram_file_def *cram_read_file_def(cram_fd *fd);
461
462 /*! Writes a cram_file_def structure to cram_fd.
463 *
464 * @return
465 * Returns 0 on success;
466 * -1 on failure
467 */
468 int cram_write_file_def(cram_fd *fd, cram_file_def *def);
469
470 /*! Frees a cram_file_def structure. */
471 void cram_free_file_def(cram_file_def *def);
472
473
474 /**@}*/
475 /**@{ ----------------------------------------------------------------------
476 * SAM header I/O
477 */
478
479 /*! Reads the SAM header from the first CRAM data block.
480 *
481 * Also performs minimal parsing to extract read-group
482 * and sample information.
483 *
484 * @return
485 * Returns SAM hdr ptr on success;
486 * NULL on failure
487 */
488 SAM_hdr *cram_read_SAM_hdr(cram_fd *fd);
489
490 /*! Writes a CRAM SAM header.
491 *
492 * @return
493 * Returns 0 on success;
494 * -1 on failure
495 */
496 int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr);
497
498
499 /**@}*/
500 /**@{ ----------------------------------------------------------------------
501 * The top-level cram opening, closing and option handling
502 */
503
504 /*! Opens a CRAM file for read (mode "rb") or write ("wb").
505 *
506 * The filename may be "-" to indicate stdin or stdout.
507 *
508 * @return
509 * Returns file handle on success;
510 * NULL on failure.
511 */
512 cram_fd *cram_open(const char *filename, const char *mode);
513
514 /*! Opens an existing stream for reading or writing.
515 *
516 * @return
517 * Returns file handle on success;
518 * NULL on failure.
519 */
520 cram_fd *cram_dopen(struct hFILE *fp, const char *filename, const char *mode);
521
522 /*! Closes a CRAM file.
523 *
524 * @return
525 * Returns 0 on success;
526 * -1 on failure
527 */
528 int cram_close(cram_fd *fd);
529
530 /*
531 * Seek within a CRAM file.
532 *
533 * Returns 0 on success
534 * -1 on failure
535 */
536 int cram_seek(cram_fd *fd, off_t offset, int whence);
537
538 /*
539 * Flushes a CRAM file.
540 * Useful for when writing to stdout without wishing to close the stream.
541 *
542 * Returns 0 on success
543 * -1 on failure
544 */
545 int cram_flush(cram_fd *fd);
546
547 /*! Checks for end of file on a cram_fd stream.
548 *
549 * @return
550 * Returns 0 if not at end of file
551 * 1 if we hit an expected EOF (end of range or EOF block)
552 * 2 for other EOF (end of stream without EOF block)
553 */
554 int cram_eof(cram_fd *fd);
555
556 /*! Sets options on the cram_fd.
557 *
558 * See CRAM_OPT_* definitions in cram_structs.h.
559 * Use this immediately after opening.
560 *
561 * @return
562 * Returns 0 on success;
563 * -1 on failure
564 */
565 int cram_set_option(cram_fd *fd, enum cram_option opt, ...);
566
567 /*! Sets options on the cram_fd.
568 *
569 * See CRAM_OPT_* definitions in cram_structs.h.
570 * Use this immediately after opening.
571 *
572 * @return
573 * Returns 0 on success;
574 * -1 on failure
575 */
576 int cram_set_voption(cram_fd *fd, enum cram_option opt, va_list args);
577
578 /*!
579 * Attaches a header to a cram_fd.
580 *
581 * This should be used when creating a new cram_fd for writing where
582 * we have an SAM_hdr already constructed (eg from a file we've read
583 * in).
584 *
585 * @return
586 * Returns 0 on success;
587 * -1 on failure
588 */
589 int cram_set_header(cram_fd *fd, SAM_hdr *hdr);
590
591
592 #ifdef __cplusplus
593 }
594 #endif
595
596 #endif /* _CRAM_IO_H_ */