0
|
1 /*
|
|
2 Copyright (c) 2012-2013 Genome Research Ltd.
|
|
3 Author: James Bonfield <jkb@sanger.ac.uk>
|
|
4
|
|
5 Redistribution and use in source and binary forms, with or without
|
|
6 modification, are permitted provided that the following conditions are met:
|
|
7
|
|
8 1. Redistributions of source code must retain the above copyright notice,
|
|
9 this list of conditions and the following disclaimer.
|
|
10
|
|
11 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
12 this list of conditions and the following disclaimer in the documentation
|
|
13 and/or other materials provided with the distribution.
|
|
14
|
|
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
|
|
16 Institute nor the names of its contributors may be used to endorse or promote
|
|
17 products derived from this software without specific prior written permission.
|
|
18
|
|
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
|
|
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
|
|
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
29 */
|
|
30
|
|
31 #ifndef _CRAM_STRUCTS_H_
|
|
32 #define _CRAM_STRUCTS_H_
|
|
33
|
|
34 #ifdef __cplusplus
|
|
35 extern "C" {
|
|
36 #endif
|
|
37
|
|
38 /*
|
|
39 * Defines in-memory structs for the basic file-format objects in the
|
|
40 * CRAM format.
|
|
41 *
|
|
42 * The basic file format is:
|
|
43 * File-def SAM-hdr Container Container ...
|
|
44 *
|
|
45 * Container:
|
|
46 * Service-block data-block data-block ...
|
|
47 *
|
|
48 * Multiple blocks in a container are grouped together as slices,
|
|
49 * also sometimes referred to as landmarks in the spec.
|
|
50 */
|
|
51
|
|
52
|
|
53 #include <stdint.h>
|
|
54
|
|
55 #include "cram/thread_pool.h"
|
|
56 #include "cram/string_alloc.h"
|
|
57 #include "htslib/khash.h"
|
|
58
|
|
59 // Generic hash-map integer -> integer
|
|
60 KHASH_MAP_INIT_INT(m_i2i, int)
|
|
61
|
|
62 // Generic hash-set integer -> (existance)
|
|
63 KHASH_SET_INIT_INT(s_i2i)
|
|
64
|
|
65 // For brevity
|
|
66 typedef unsigned char uc;
|
|
67
|
|
68 /*
|
|
69 * A union for the preservation map. Required for khash.
|
|
70 */
|
|
71 typedef union {
|
|
72 int i;
|
|
73 char *p;
|
|
74 } pmap_t;
|
|
75
|
|
76 // Generates static functions here which isn't ideal, but we have no way
|
|
77 // currently to declare the kh_map_t structure here without also declaring a
|
|
78 // duplicate in the .c files due to the nature of the KHASH macros.
|
|
79 KHASH_MAP_INIT_STR(map, pmap_t)
|
|
80
|
|
81 struct hFILE;
|
|
82
|
|
83 #define SEQS_PER_SLICE 10000
|
|
84 #define SLICE_PER_CNT 1
|
|
85
|
|
86 #define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT"
|
|
87
|
|
88 #define MAX_STAT_VAL 1024
|
|
89 //#define MAX_STAT_VAL 16
|
|
90 typedef struct {
|
|
91 int freqs[MAX_STAT_VAL];
|
|
92 khash_t(m_i2i) *h;
|
|
93 int nsamp; // total number of values added
|
|
94 int nvals; // total number of unique values added
|
|
95 } cram_stats;
|
|
96
|
|
97 /* NB: matches java impl, not the spec */
|
|
98 enum cram_encoding {
|
|
99 E_NULL = 0,
|
|
100 E_EXTERNAL = 1,
|
|
101 E_GOLOMB = 2,
|
|
102 E_HUFFMAN = 3,
|
|
103 E_BYTE_ARRAY_LEN = 4,
|
|
104 E_BYTE_ARRAY_STOP = 5,
|
|
105 E_BETA = 6,
|
|
106 E_SUBEXP = 7,
|
|
107 E_GOLOMB_RICE = 8,
|
|
108 E_GAMMA = 9
|
|
109 };
|
|
110
|
|
111 enum cram_external_type {
|
|
112 E_INT = 1,
|
|
113 E_LONG = 2,
|
|
114 E_BYTE = 3,
|
|
115 E_BYTE_ARRAY = 4,
|
|
116 E_BYTE_ARRAY_BLOCK = 5,
|
|
117 };
|
|
118
|
|
119 /* External IDs used by this implementation (only assumed during writing) */
|
|
120 enum cram_DS_ID {
|
|
121 DS_CORE = 0,
|
|
122 DS_aux = 1, // aux_blk
|
|
123 DS_aux_OQ = 2,
|
|
124 DS_aux_BQ = 3,
|
|
125 DS_aux_BD = 4,
|
|
126 DS_aux_BI = 5,
|
|
127 DS_aux_FZ = 6, // also ZM:B
|
|
128 DS_aux_oq = 7, // other qualities
|
|
129 DS_aux_os = 8, // other sequences
|
|
130 DS_aux_oz = 9, // other strings
|
|
131 DS_ref,
|
|
132 DS_RN, // name_blk
|
|
133 DS_QS, // qual_blk
|
|
134 DS_IN, // base_blk
|
|
135 DS_SC, // soft_blk
|
|
136
|
|
137 DS_BF, // start loop
|
|
138 DS_CF,
|
|
139 DS_AP,
|
|
140 DS_RG,
|
|
141 DS_MQ,
|
|
142 DS_NS,
|
|
143 DS_MF,
|
|
144 DS_TS,
|
|
145 DS_NP,
|
|
146 DS_NF,
|
|
147 DS_RL,
|
|
148 DS_FN,
|
|
149 DS_FC,
|
|
150 DS_FP,
|
|
151 DS_DL,
|
|
152 DS_BA,
|
|
153 DS_BS,
|
|
154 DS_TL,
|
|
155 DS_RI,
|
|
156 DS_RS,
|
|
157 DS_PD,
|
|
158 DS_HC,
|
|
159 DS_BB,
|
|
160 DS_QQ,
|
|
161
|
|
162 DS_TN, // end loop
|
|
163
|
|
164 DS_RN_len,
|
|
165 DS_SC_len,
|
|
166 DS_BB_len,
|
|
167 DS_QQ_len,
|
|
168
|
|
169 DS_TC, // CRAM v1.0 tags
|
|
170 DS_TM, // test
|
|
171 DS_TV, // test
|
|
172
|
|
173 DS_END,
|
|
174 };
|
|
175
|
|
176 /* "File Definition Structure" */
|
|
177 typedef struct {
|
|
178 char magic[4];
|
|
179 uint8_t major_version;
|
|
180 uint8_t minor_version;
|
|
181 char file_id[20]; // Filename or SHA1 checksum
|
|
182 } cram_file_def;
|
|
183
|
|
184 #define CRAM_MAJOR_VERS(v) ((v) >> 8)
|
|
185 #define CRAM_MINOR_VERS(v) ((v) & 0xff)
|
|
186
|
|
187 struct cram_slice;
|
|
188
|
|
189 enum cram_block_method {
|
|
190 ERROR = -1,
|
|
191 RAW = 0,
|
|
192 GZIP = 1,
|
|
193 BZIP2 = 2,
|
|
194 LZMA = 3,
|
|
195 RANS = 4, // Generic; either order
|
|
196 RANS0 = 4,
|
|
197 RANS1 = 10, // Not externalised; stored as RANS (generic)
|
|
198 GZIP_RLE = 11, // NB: not externalised in CRAM
|
|
199 };
|
|
200
|
|
201 enum cram_content_type {
|
|
202 CT_ERROR = -1,
|
|
203 FILE_HEADER = 0,
|
|
204 COMPRESSION_HEADER = 1,
|
|
205 MAPPED_SLICE = 2,
|
|
206 UNMAPPED_SLICE = 3, // CRAM V1.0 only
|
|
207 EXTERNAL = 4,
|
|
208 CORE = 5,
|
|
209 };
|
|
210
|
|
211 /* Compression metrics */
|
|
212 typedef struct {
|
|
213 // number of trials and time to next trial
|
|
214 int trial;
|
|
215 int next_trial;
|
|
216
|
|
217 // aggregate sizes during trials
|
|
218 int sz_gz_rle;
|
|
219 int sz_gz_def;
|
|
220 int sz_rans0;
|
|
221 int sz_rans1;
|
|
222 int sz_bzip2;
|
|
223 int sz_lzma;
|
|
224
|
|
225 // resultant method from trials
|
|
226 int method;
|
|
227 int strat;
|
|
228
|
|
229 // Revisions of method, to allow culling of continually failing ones.
|
|
230 int gz_rle_cnt;
|
|
231 int gz_def_cnt;
|
|
232 int rans0_cnt;
|
|
233 int rans1_cnt;
|
|
234 int bzip2_cnt;
|
|
235 int lzma_cnt;
|
|
236 int revised_method;
|
|
237
|
|
238 double gz_rle_extra;
|
|
239 double gz_def_extra;
|
|
240 double rans0_extra;
|
|
241 double rans1_extra;
|
|
242 double bzip2_extra;
|
|
243 double lzma_extra;
|
|
244 } cram_metrics;
|
|
245
|
|
246 /* Block */
|
|
247 typedef struct {
|
|
248 enum cram_block_method method, orig_method;
|
|
249 enum cram_content_type content_type;
|
|
250 int32_t content_id;
|
|
251 int32_t comp_size;
|
|
252 int32_t uncomp_size;
|
|
253 uint32_t crc32;
|
|
254 int32_t idx; /* offset into data */
|
|
255 unsigned char *data;
|
|
256
|
|
257 // For bit I/O
|
|
258 size_t alloc;
|
|
259 size_t byte;
|
|
260 int bit;
|
|
261 } cram_block;
|
|
262
|
|
263 struct cram_codec; /* defined in cram_codecs.h */
|
|
264 struct cram_map;
|
|
265
|
|
266 #define CRAM_MAP_HASH 32
|
|
267 #define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1))
|
|
268
|
|
269 /* Compression header block */
|
|
270 typedef struct {
|
|
271 int32_t ref_seq_id;
|
|
272 int32_t ref_seq_start;
|
|
273 int32_t ref_seq_span;
|
|
274 int32_t num_records;
|
|
275 int32_t num_landmarks;
|
|
276 int32_t *landmark;
|
|
277
|
|
278 /* Flags from preservation map */
|
|
279 int mapped_qs_included;
|
|
280 int unmapped_qs_included;
|
|
281 int unmapped_placed;
|
|
282 int qs_included;
|
|
283 int read_names_included;
|
|
284 int AP_delta;
|
|
285 // indexed by ref-base and subst. code
|
|
286 char substitution_matrix[5][4];
|
|
287
|
|
288 // TD Dictionary as a concatenated block
|
|
289 cram_block *TD_blk; // Tag Dictionary
|
|
290 int nTL; // number of TL entries in TD
|
|
291 unsigned char **TL; // array of size nTL, pointer into TD_blk.
|
|
292 khash_t(m_s2i) *TD_hash; // Keyed on TD strings, map to TL[] indices
|
|
293 string_alloc_t *TD_keys; // Pooled keys for TD hash.
|
|
294
|
|
295 khash_t(map) *preservation_map;
|
|
296 struct cram_map *rec_encoding_map[CRAM_MAP_HASH];
|
|
297 struct cram_map *tag_encoding_map[CRAM_MAP_HASH];
|
|
298
|
|
299 struct cram_codec *codecs[DS_END];
|
|
300
|
|
301 char *uncomp; // A single block of uncompressed data
|
|
302 size_t uncomp_size, uncomp_alloc;
|
|
303
|
|
304 unsigned int data_series; // See cram_fields enum below
|
|
305 } cram_block_compression_hdr;
|
|
306
|
|
307 typedef struct cram_map {
|
|
308 int key; /* 0xe0 + 3 bytes */
|
|
309 enum cram_encoding encoding;
|
|
310 int offset; /* Offset into a single block of memory */
|
|
311 int size; /* Size */
|
|
312 struct cram_codec *codec;
|
|
313 struct cram_map *next; // for noddy internal hash
|
|
314 } cram_map;
|
|
315
|
|
316 /* Mapped or unmapped slice header block */
|
|
317 typedef struct {
|
|
318 enum cram_content_type content_type;
|
|
319 int32_t ref_seq_id; /* if content_type == MAPPED_SLICE */
|
|
320 int32_t ref_seq_start; /* if content_type == MAPPED_SLICE */
|
|
321 int32_t ref_seq_span; /* if content_type == MAPPED_SLICE */
|
|
322 int32_t num_records;
|
|
323 int64_t record_counter;
|
|
324 int32_t num_blocks;
|
|
325 int32_t num_content_ids;
|
|
326 int32_t *block_content_ids;
|
|
327 int32_t ref_base_id; /* if content_type == MAPPED_SLICE */
|
|
328 unsigned char md5[16];
|
|
329 } cram_block_slice_hdr;
|
|
330
|
|
331 struct ref_entry;
|
|
332
|
|
333 /*
|
|
334 * Container.
|
|
335 *
|
|
336 * Conceptually a container is split into slices, and slices into blocks.
|
|
337 * However on disk it's just a list of blocks and we need to query the
|
|
338 * block types to identify the start/end points of the slices.
|
|
339 *
|
|
340 * OR... are landmarks the start/end points of slices?
|
|
341 */
|
|
342 typedef struct {
|
|
343 int32_t length;
|
|
344 int32_t ref_seq_id;
|
|
345 int32_t ref_seq_start;
|
|
346 int32_t ref_seq_span;
|
|
347 int64_t record_counter;
|
|
348 int64_t num_bases;
|
|
349 int32_t num_records;
|
|
350 int32_t num_blocks;
|
|
351 int32_t num_landmarks;
|
|
352 int32_t *landmark;
|
|
353
|
|
354 /* Size of container header above */
|
|
355 size_t offset;
|
|
356
|
|
357 /* Compression header is always the first block? */
|
|
358 cram_block_compression_hdr *comp_hdr;
|
|
359 cram_block *comp_hdr_block;
|
|
360
|
|
361 /* For construction purposes */
|
|
362 int max_slice, curr_slice; // maximum number of slices
|
|
363 int max_rec, curr_rec; // current and max recs per slice
|
|
364 int max_c_rec, curr_c_rec; // current and max recs per container
|
|
365 int slice_rec; // rec no. for start of this slice
|
|
366 int curr_ref; // current ref ID. -2 for no previous
|
|
367 int last_pos; // last record position
|
|
368 struct cram_slice **slices, *slice;
|
|
369 int pos_sorted; // boolean, 1=>position sorted data
|
|
370 int max_apos; // maximum position, used if pos_sorted==0
|
|
371 int last_slice; // number of reads in last slice (0 for 1st)
|
|
372 int multi_seq; // true if packing multi seqs per cont/slice
|
|
373 int unsorted; // true is AP_delta is 0.
|
|
374
|
|
375 /* Copied from fd before encoding, to allow multi-threading */
|
|
376 int ref_start, first_base, last_base, ref_id, ref_end;
|
|
377 char *ref;
|
|
378 //struct ref_entry *ref;
|
|
379
|
|
380 /* For multi-threading */
|
|
381 bam_seq_t **bams;
|
|
382
|
|
383 /* Statistics for encoding */
|
|
384 cram_stats *stats[DS_END];
|
|
385
|
|
386 khash_t(s_i2i) *tags_used; // set of tag types in use, for tag encoding map
|
|
387 int *refs_used; // array of frequency of ref seq IDs
|
|
388
|
|
389 uint32_t crc32; // CRC32
|
|
390 } cram_container;
|
|
391
|
|
392 /*
|
|
393 * A single cram record
|
|
394 */
|
|
395 typedef struct {
|
|
396 struct cram_slice *s; // Filled out by cram_decode only
|
|
397
|
|
398 int32_t ref_id; // fixed for all recs in slice?
|
|
399 int32_t flags; // BF
|
|
400 int32_t cram_flags; // CF
|
|
401 int32_t len; // RL
|
|
402 int32_t apos; // AP
|
|
403 int32_t rg; // RG
|
|
404 int32_t name; // RN; idx to s->names_blk
|
|
405 int32_t name_len;
|
|
406 int32_t mate_line; // index to another cram_record
|
|
407 int32_t mate_ref_id;
|
|
408 int32_t mate_pos; // NP
|
|
409 int32_t tlen; // TS
|
|
410
|
|
411 // Auxiliary data
|
|
412 int32_t ntags; // TC
|
|
413 int32_t aux; // idx to s->aux_blk
|
|
414 int32_t aux_size; // total size of packed ntags in aux_blk
|
|
415 #ifndef TN_external
|
|
416 int32_t TN_idx; // TN; idx to s->TN;
|
|
417 #else
|
|
418 int32_t tn; // idx to s->tn_blk
|
|
419 #endif
|
|
420 int TL;
|
|
421
|
|
422 int32_t seq; // idx to s->seqs_blk
|
|
423 int32_t qual; // idx to s->qual_blk
|
|
424 int32_t cigar; // idx to s->cigar
|
|
425 int32_t ncigar;
|
|
426 int32_t aend; // alignment end
|
|
427 int32_t mqual; // MQ
|
|
428
|
|
429 int32_t feature; // idx to s->feature
|
|
430 int32_t nfeature; // number of features
|
|
431 int32_t mate_flags; // MF
|
|
432 } cram_record;
|
|
433
|
|
434 // Accessor macros as an analogue of the bam ones
|
|
435 #define cram_qname(c) (&(c)->s->name_blk->data[(c)->name])
|
|
436 #define cram_seq(c) (&(c)->s->seqs_blk->data[(c)->seq])
|
|
437 #define cram_qual(c) (&(c)->s->qual_blk->data[(c)->qual])
|
|
438 #define cram_aux(c) (&(c)->s->aux_blk->data[(c)->aux])
|
|
439 #define cram_seqi(c,i) (cram_seq((c))[(i)])
|
|
440 #define cram_name_len(c) ((c)->name_len)
|
|
441 #define cram_strand(c) (((c)->flags & BAM_FREVERSE) != 0)
|
|
442 #define cram_mstrand(c) (((c)->flags & BAM_FMREVERSE) != 0)
|
|
443 #define cram_cigar(c) (&((cr)->s->cigar)[(c)->cigar])
|
|
444
|
|
445 /*
|
|
446 * A feature is a base difference, used for the sequence reference encoding.
|
|
447 * (We generate these internally when writing CRAM.)
|
|
448 */
|
|
449 typedef struct {
|
|
450 union {
|
|
451 struct {
|
|
452 int pos;
|
|
453 int code;
|
|
454 int base; // substitution code
|
|
455 } X;
|
|
456 struct {
|
|
457 int pos;
|
|
458 int code;
|
|
459 int base; // actual base & qual
|
|
460 int qual;
|
|
461 } B;
|
|
462 struct {
|
|
463 int pos;
|
|
464 int code;
|
|
465 int seq_idx; // index to s->seqs_blk
|
|
466 int len;
|
|
467 } b;
|
|
468 struct {
|
|
469 int pos;
|
|
470 int code;
|
|
471 int qual;
|
|
472 } Q;
|
|
473 struct {
|
|
474 int pos;
|
|
475 int code;
|
|
476 int len;
|
|
477 int seq_idx; // soft-clip multiple bases
|
|
478 } S;
|
|
479 struct {
|
|
480 int pos;
|
|
481 int code;
|
|
482 int len;
|
|
483 int seq_idx; // insertion multiple bases
|
|
484 } I;
|
|
485 struct {
|
|
486 int pos;
|
|
487 int code;
|
|
488 int base; // insertion single base
|
|
489 } i;
|
|
490 struct {
|
|
491 int pos;
|
|
492 int code;
|
|
493 int len;
|
|
494 } D;
|
|
495 struct {
|
|
496 int pos;
|
|
497 int code;
|
|
498 int len;
|
|
499 } N;
|
|
500 struct {
|
|
501 int pos;
|
|
502 int code;
|
|
503 int len;
|
|
504 } P;
|
|
505 struct {
|
|
506 int pos;
|
|
507 int code;
|
|
508 int len;
|
|
509 } H;
|
|
510 };
|
|
511 } cram_feature;
|
|
512
|
|
513 /*
|
|
514 * A slice is really just a set of blocks, but it
|
|
515 * is the logical unit for decoding a number of
|
|
516 * sequences.
|
|
517 */
|
|
518 typedef struct cram_slice {
|
|
519 cram_block_slice_hdr *hdr;
|
|
520 cram_block *hdr_block;
|
|
521 cram_block **block;
|
|
522 cram_block **block_by_id;
|
|
523
|
|
524 /* State used during encoding/decoding */
|
|
525 int last_apos, max_apos;
|
|
526
|
|
527 /* Array of decoded cram records */
|
|
528 cram_record *crecs;
|
|
529
|
|
530 /* An dynamically growing buffers for data pointed
|
|
531 * to by crecs[] array.
|
|
532 */
|
|
533 uint32_t *cigar;
|
|
534 uint32_t cigar_alloc;
|
|
535 uint32_t ncigar;
|
|
536
|
|
537 cram_feature *features;
|
|
538 int nfeatures;
|
|
539 int afeatures; // allocated size of features
|
|
540
|
|
541 #ifndef TN_external
|
|
542 // TN field (Tag Name)
|
|
543 uint32_t *TN;
|
|
544 int nTN, aTN; // used and allocated size for TN[]
|
|
545 #else
|
|
546 cram_block *tn_blk;
|
|
547 int tn_id;
|
|
548 #endif
|
|
549
|
|
550 // For variable sized elements which are always external blocks.
|
|
551 cram_block *name_blk;
|
|
552 cram_block *seqs_blk;
|
|
553 cram_block *qual_blk;
|
|
554 cram_block *base_blk;
|
|
555 cram_block *soft_blk;
|
|
556 cram_block *aux_blk;
|
|
557 cram_block *aux_OQ_blk;
|
|
558 cram_block *aux_BQ_blk;
|
|
559 cram_block *aux_BD_blk;
|
|
560 cram_block *aux_BI_blk;
|
|
561 cram_block *aux_FZ_blk;
|
|
562 cram_block *aux_oq_blk;
|
|
563 cram_block *aux_os_blk;
|
|
564 cram_block *aux_oz_blk;
|
|
565
|
|
566 string_alloc_t *pair_keys; // Pooled keys for pair hash.
|
|
567 khash_t(m_s2i) *pair[2]; // for identifying read-pairs in this slice.
|
|
568
|
|
569 char *ref; // slice of current reference
|
|
570 int ref_start; // start position of current reference;
|
|
571 int ref_end; // end position of current reference;
|
|
572 int ref_id;
|
|
573 } cram_slice;
|
|
574
|
|
575 /*-----------------------------------------------------------------------------
|
|
576 * Consider moving reference handling to cram_refs.[ch]
|
|
577 */
|
|
578 // from fa.fai / samtools faidx files
|
|
579 typedef struct ref_entry {
|
|
580 char *name;
|
|
581 char *fn;
|
|
582 int64_t length;
|
|
583 int64_t offset;
|
|
584 int bases_per_line;
|
|
585 int line_length;
|
|
586 int64_t count; // for shared references so we know to dealloc seq
|
|
587 char *seq;
|
|
588 } ref_entry;
|
|
589
|
|
590 KHASH_MAP_INIT_STR(refs, ref_entry*)
|
|
591
|
|
592 // References structure.
|
|
593 typedef struct {
|
|
594 string_alloc_t *pool; // String pool for holding filenames and SN vals
|
|
595
|
|
596 khash_t(refs) *h_meta; // ref_entry*, index by name
|
|
597 ref_entry **ref_id; // ref_entry*, index by ID
|
|
598 int nref; // number of ref_entry
|
|
599
|
|
600 char *fn; // current file opened
|
|
601 BGZF *fp; // and the hFILE* to go with it.
|
|
602
|
|
603 int count; // how many cram_fd sharing this refs struct
|
|
604
|
|
605 pthread_mutex_t lock; // Mutex for multi-threaded updating
|
|
606 ref_entry *last; // Last queried sequence
|
|
607 int last_id; // Used in cram_ref_decr_locked to delay free
|
|
608 } refs_t;
|
|
609
|
|
610 /*-----------------------------------------------------------------------------
|
|
611 * CRAM index
|
|
612 *
|
|
613 * Detect format by number of entries per line.
|
|
614 * 5 => 1.0 (refid, start, nseq, C offset, slice)
|
|
615 * 6 => 1.1 (refid, start, span, C offset, S offset, S size)
|
|
616 *
|
|
617 * Indices are stored in a nested containment list, which is trivial to set
|
|
618 * up as the indices are on sorted data so we're appending to the nclist
|
|
619 * in sorted order. Basically if a slice entirely fits within a previous
|
|
620 * slice then we append to that slices list. This is done recursively.
|
|
621 *
|
|
622 * Lists are sorted on two dimensions: ref id + slice coords.
|
|
623 */
|
|
624 typedef struct cram_index {
|
|
625 int nslice, nalloc; // total number of slices
|
|
626 struct cram_index *e; // array of size nslice
|
|
627
|
|
628 int refid; // 1.0 1.1
|
|
629 int start; // 1.0 1.1
|
|
630 int end; // 1.1
|
|
631 int nseq; // 1.0 - undocumented
|
|
632 int slice; // 1.0 landmark index, 1.1 landmark value
|
|
633 int len; // 1.1 - size of slice in bytes
|
|
634 int64_t offset; // 1.0 1.1
|
|
635 } cram_index;
|
|
636
|
|
637 typedef struct {
|
|
638 int refid;
|
|
639 int start;
|
|
640 int end;
|
|
641 } cram_range;
|
|
642
|
|
643 /*-----------------------------------------------------------------------------
|
|
644 */
|
|
645 /* CRAM File handle */
|
|
646
|
|
647 typedef struct spare_bams {
|
|
648 bam_seq_t **bams;
|
|
649 struct spare_bams *next;
|
|
650 } spare_bams;
|
|
651
|
|
652 typedef struct cram_fd {
|
|
653 struct hFILE *fp;
|
|
654 int mode; // 'r' or 'w'
|
|
655 int version;
|
|
656 cram_file_def *file_def;
|
|
657 SAM_hdr *header;
|
|
658
|
|
659 char *prefix;
|
|
660 int64_t record_counter;
|
|
661 int err;
|
|
662
|
|
663 // Most recent compression header decoded
|
|
664 //cram_block_compression_hdr *comp_hdr;
|
|
665 //cram_block_slice_hdr *slice_hdr;
|
|
666
|
|
667 // Current container being processed.
|
|
668 cram_container *ctr;
|
|
669
|
|
670 // positions for encoding or decoding
|
|
671 int first_base, last_base;
|
|
672
|
|
673 // cached reference portion
|
|
674 refs_t *refs; // ref meta-data structure
|
|
675 char *ref, *ref_free; // current portion held in memory
|
|
676 int ref_id;
|
|
677 int ref_start;
|
|
678 int ref_end;
|
|
679 char *ref_fn; // reference fasta filename
|
|
680
|
|
681 // compression level and metrics
|
|
682 int level;
|
|
683 cram_metrics *m[DS_END];
|
|
684
|
|
685 // options
|
|
686 int decode_md; // Whether to export MD and NM tags
|
|
687 int verbose;
|
|
688 int seqs_per_slice;
|
|
689 int slices_per_container;
|
|
690 int embed_ref;
|
|
691 int no_ref;
|
|
692 int ignore_md5;
|
|
693 int use_bz2;
|
|
694 int use_rans;
|
|
695 int use_lzma;
|
|
696 int shared_ref;
|
|
697 unsigned int required_fields;
|
|
698 cram_range range;
|
|
699
|
|
700 // lookup tables, stored here so we can be trivially multi-threaded
|
|
701 unsigned int bam_flag_swap[0x1000]; // cram -> bam flags
|
|
702 unsigned int cram_flag_swap[0x1000];// bam -> cram flags
|
|
703 unsigned char L1[256]; // ACGT{*} ->0123{4}
|
|
704 unsigned char L2[256]; // ACGTN{*}->01234{5}
|
|
705 char cram_sub_matrix[32][32]; // base substituion codes
|
|
706
|
|
707 int index_sz;
|
|
708 cram_index *index; // array, sizeof index_sz
|
|
709 off_t first_container;
|
|
710 int eof;
|
|
711 int last_slice; // number of recs encoded in last slice
|
|
712 int multi_seq;
|
|
713 int unsorted;
|
|
714 int empty_container; // Marker for EOF block
|
|
715
|
|
716 // thread pool
|
|
717 int own_pool;
|
|
718 t_pool *pool;
|
|
719 t_results_queue *rqueue;
|
|
720 pthread_mutex_t metrics_lock;
|
|
721 pthread_mutex_t ref_lock;
|
|
722 spare_bams *bl;
|
|
723 pthread_mutex_t bam_list_lock;
|
|
724 void *job_pending;
|
|
725 int ooc; // out of containers.
|
|
726 } cram_fd;
|
|
727
|
|
728 // Translation of required fields to cram data series
|
|
729 enum cram_fields {
|
|
730 CRAM_BF = 0x00000001,
|
|
731 CRAM_AP = 0x00000002,
|
|
732 CRAM_FP = 0x00000004,
|
|
733 CRAM_RL = 0x00000008,
|
|
734 CRAM_DL = 0x00000010,
|
|
735 CRAM_NF = 0x00000020,
|
|
736 CRAM_BA = 0x00000040,
|
|
737 CRAM_QS = 0x00000080,
|
|
738 CRAM_FC = 0x00000100,
|
|
739 CRAM_FN = 0x00000200,
|
|
740 CRAM_BS = 0x00000400,
|
|
741 CRAM_IN = 0x00000800,
|
|
742 CRAM_RG = 0x00001000,
|
|
743 CRAM_MQ = 0x00002000,
|
|
744 CRAM_TL = 0x00004000,
|
|
745 CRAM_RN = 0x00008000,
|
|
746 CRAM_NS = 0x00010000,
|
|
747 CRAM_NP = 0x00020000,
|
|
748 CRAM_TS = 0x00040000,
|
|
749 CRAM_MF = 0x00080000,
|
|
750 CRAM_CF = 0x00100000,
|
|
751 CRAM_RI = 0x00200000,
|
|
752 CRAM_RS = 0x00400000,
|
|
753 CRAM_PD = 0x00800000,
|
|
754 CRAM_HC = 0x01000000,
|
|
755 CRAM_SC = 0x02000000,
|
|
756 CRAM_BB = 0x04000000,
|
|
757 CRAM_BB_len = 0x08000000,
|
|
758 CRAM_QQ = 0x10000000,
|
|
759 CRAM_QQ_len = 0x20000000,
|
|
760 CRAM_aux= 0x40000000,
|
|
761 CRAM_ALL= 0x7fffffff,
|
|
762 };
|
|
763
|
|
764 // A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may
|
|
765 // encode a base difference, but we don't need to know what it is for CIGAR.
|
|
766 // If we have a soft-clip or insertion, we do need SC/IN though to know how
|
|
767 // long that array is.
|
|
768 #define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \
|
|
769 CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF)
|
|
770
|
|
771 #define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_QS | CRAM_BS | \
|
|
772 CRAM_RL | CRAM_AP | CRAM_BB | CRAM_QQ)
|
|
773
|
|
774 /* BF bitfields */
|
|
775 /* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */
|
|
776 #define CRAM_FPAIRED 256
|
|
777 #define CRAM_FPROPER_PAIR 128
|
|
778 #define CRAM_FUNMAP 64
|
|
779 #define CRAM_FREVERSE 32
|
|
780 #define CRAM_FREAD1 16
|
|
781 #define CRAM_FREAD2 8
|
|
782 #define CRAM_FSECONDARY 4
|
|
783 #define CRAM_FQCFAIL 2
|
|
784 #define CRAM_FDUP 1
|
|
785
|
|
786 #define DS_aux_S "\001"
|
|
787 #define DS_aux_OQ_S "\002"
|
|
788 #define DS_aux_BQ_S "\003"
|
|
789 #define DS_aux_BD_S "\004"
|
|
790 #define DS_aux_BI_S "\005"
|
|
791 #define DS_aux_FZ_S "\006"
|
|
792 #define DS_aux_oq_S "\007"
|
|
793 #define DS_aux_os_S "\010"
|
|
794 #define DS_aux_oz_S "\011"
|
|
795
|
|
796 #define CRAM_M_REVERSE 1
|
|
797 #define CRAM_M_UNMAP 2
|
|
798
|
|
799
|
|
800 /* CF bitfields */
|
|
801 #define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0)
|
|
802 #define CRAM_FLAG_DETACHED (1<<1)
|
|
803 #define CRAM_FLAG_MATE_DOWNSTREAM (1<<2)
|
|
804
|
|
805 #ifdef __cplusplus
|
|
806 }
|
|
807 #endif
|
|
808
|
|
809 #endif /* _CRAM_STRUCTS_H_ */
|