annotate ezBAMQC/src/htslib/cram/cram_structs.h @ 20:9de3bbec2479 draft default tip

Uploaded
author youngkim
date Thu, 31 Mar 2016 10:10:37 -0400
parents dfa3745e5fd8
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2 Copyright (c) 2012-2013 Genome Research Ltd.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3 Author: James Bonfield <jkb@sanger.ac.uk>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
4
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
5 Redistribution and use in source and binary forms, with or without
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
6 modification, are permitted provided that the following conditions are met:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
7
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
8 1. Redistributions of source code must retain the above copyright notice,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
9 this list of conditions and the following disclaimer.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
10
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
11 2. Redistributions in binary form must reproduce the above copyright notice,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
12 this list of conditions and the following disclaimer in the documentation
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
13 and/or other materials provided with the distribution.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
14
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
16 Institute nor the names of its contributors may be used to endorse or promote
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
17 products derived from this software without specific prior written permission.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
18
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
29 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
30
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
31 #ifndef _CRAM_STRUCTS_H_
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
32 #define _CRAM_STRUCTS_H_
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
33
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
34 #ifdef __cplusplus
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
35 extern "C" {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
36 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
37
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
38 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
39 * Defines in-memory structs for the basic file-format objects in the
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
40 * CRAM format.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
41 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
42 * The basic file format is:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
43 * File-def SAM-hdr Container Container ...
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
44 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
45 * Container:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
46 * Service-block data-block data-block ...
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
47 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
48 * Multiple blocks in a container are grouped together as slices,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
49 * also sometimes referred to as landmarks in the spec.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
50 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
51
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
52
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
53 #include <stdint.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
54
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
55 #include "cram/thread_pool.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
56 #include "cram/string_alloc.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
57 #include "htslib/khash.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
58
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
59 // Generic hash-map integer -> integer
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
60 KHASH_MAP_INIT_INT(m_i2i, int)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
61
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
62 // Generic hash-set integer -> (existence)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
63 KHASH_SET_INIT_INT(s_i2i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
64
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
65 // For brevity
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
66 typedef unsigned char uc;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
67
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
68 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
69 * A union for the preservation map. Required for khash.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
70 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
71 typedef union {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
72 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
73 char *p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
74 } pmap_t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
75
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
76 // Generates static functions here which isn't ideal, but we have no way
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
77 // currently to declare the kh_map_t structure here without also declaring a
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
78 // duplicate in the .c files due to the nature of the KHASH macros.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
79 KHASH_MAP_INIT_STR(map, pmap_t)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
80
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
81 struct hFILE;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
82
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
83 #define SEQS_PER_SLICE 10000
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
84 #define SLICE_PER_CNT 1
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
85
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
86 #define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
87
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
88 #define MAX_STAT_VAL 1024
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
89 //#define MAX_STAT_VAL 16
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
90 typedef struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
91 int freqs[MAX_STAT_VAL];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
92 khash_t(m_i2i) *h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
93 int nsamp; // total number of values added
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
94 int nvals; // total number of unique values added
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
95 } cram_stats;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
96
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
97 /* NB: matches java impl, not the spec */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
98 enum cram_encoding {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
99 E_NULL = 0,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
100 E_EXTERNAL = 1,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
101 E_GOLOMB = 2,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
102 E_HUFFMAN = 3,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
103 E_BYTE_ARRAY_LEN = 4,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
104 E_BYTE_ARRAY_STOP = 5,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
105 E_BETA = 6,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
106 E_SUBEXP = 7,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
107 E_GOLOMB_RICE = 8,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
108 E_GAMMA = 9
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
109 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
110
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
111 enum cram_external_type {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
112 E_INT = 1,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
113 E_LONG = 2,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
114 E_BYTE = 3,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
115 E_BYTE_ARRAY = 4,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
116 E_BYTE_ARRAY_BLOCK = 5,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
117 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
118
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
119 /* External IDs used by this implementation (only assumed during writing) */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
120 enum cram_DS_ID {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
121 DS_CORE = 0,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
122 DS_aux = 1, // aux_blk
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
123 DS_aux_OQ = 2,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
124 DS_aux_BQ = 3,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
125 DS_aux_BD = 4,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
126 DS_aux_BI = 5,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
127 DS_aux_FZ = 6, // also ZM:B
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
128 DS_aux_oq = 7, // other qualities
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
129 DS_aux_os = 8, // other sequences
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
130 DS_aux_oz = 9, // other strings
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
131 DS_ref,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
132 DS_RN, // name_blk
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
133 DS_QS, // qual_blk
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
134 DS_IN, // base_blk
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
135 DS_SC, // soft_blk
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
136
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
137 DS_BF, // start loop
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
138 DS_CF,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
139 DS_AP,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
140 DS_RG,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
141 DS_MQ,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
142 DS_NS,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
143 DS_MF,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
144 DS_TS,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
145 DS_NP,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
146 DS_NF,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
147 DS_RL,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
148 DS_FN,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
149 DS_FC,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
150 DS_FP,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
151 DS_DL,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
152 DS_BA,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
153 DS_BS,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
154 DS_TL,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
155 DS_RI,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
156 DS_RS,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
157 DS_PD,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
158 DS_HC,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
159 DS_BB,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
160 DS_QQ,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
161
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
162 DS_TN, // end loop
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
163
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
164 DS_RN_len,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
165 DS_SC_len,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
166 DS_BB_len,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
167 DS_QQ_len,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
168
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
169 DS_TC, // CRAM v1.0 tags
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
170 DS_TM, // test
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
171 DS_TV, // test
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
172
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
173 DS_END,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
174 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
175
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
176 /* "File Definition Structure" */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
177 typedef struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
178 char magic[4];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
179 uint8_t major_version;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
180 uint8_t minor_version;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
181 char file_id[20]; // Filename or SHA1 checksum
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
182 } cram_file_def;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
183
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
184 #define CRAM_MAJOR_VERS(v) ((v) >> 8)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
185 #define CRAM_MINOR_VERS(v) ((v) & 0xff)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
186
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
187 struct cram_slice;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
188
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
189 enum cram_block_method {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
190 ERROR = -1,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
191 RAW = 0,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
192 GZIP = 1,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
193 BZIP2 = 2,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
194 LZMA = 3,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
195 RANS = 4, // Generic; either order
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
196 RANS0 = 4,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
197 RANS1 = 10, // Not externalised; stored as RANS (generic)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
198 GZIP_RLE = 11, // NB: not externalised in CRAM
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
199 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
200
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
201 enum cram_content_type {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
202 CT_ERROR = -1,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
203 FILE_HEADER = 0,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
204 COMPRESSION_HEADER = 1,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
205 MAPPED_SLICE = 2,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
206 UNMAPPED_SLICE = 3, // CRAM V1.0 only
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
207 EXTERNAL = 4,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
208 CORE = 5,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
209 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
210
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
211 /* Compression metrics */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
212 typedef struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
213 // number of trials and time to next trial
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
214 int trial;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
215 int next_trial;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
216
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
217 // aggregate sizes during trials
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
218 int sz_gz_rle;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
219 int sz_gz_def;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
220 int sz_rans0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
221 int sz_rans1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
222 int sz_bzip2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
223 int sz_lzma;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
224
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
225 // resultant method from trials
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
226 int method;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
227 int strat;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
228
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
229 // Revisions of method, to allow culling of continually failing ones.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
230 int gz_rle_cnt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
231 int gz_def_cnt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
232 int rans0_cnt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
233 int rans1_cnt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
234 int bzip2_cnt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
235 int lzma_cnt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
236 int revised_method;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
237
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
238 double gz_rle_extra;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
239 double gz_def_extra;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
240 double rans0_extra;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
241 double rans1_extra;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
242 double bzip2_extra;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
243 double lzma_extra;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
244 } cram_metrics;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
245
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
246 /* Block */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
247 typedef struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
248 enum cram_block_method method, orig_method;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
249 enum cram_content_type content_type;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
250 int32_t content_id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
251 int32_t comp_size;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
252 int32_t uncomp_size;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
253 uint32_t crc32;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
254 int32_t idx; /* offset into data */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
255 unsigned char *data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
256
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
257 // For bit I/O
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
258 size_t alloc;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
259 size_t byte;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
260 int bit;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
261 } cram_block;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
262
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
263 struct cram_codec; /* defined in cram_codecs.h */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
264 struct cram_map;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
265
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
266 #define CRAM_MAP_HASH 32
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
267 #define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
268
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
269 /* Compression header block */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
270 typedef struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
271 int32_t ref_seq_id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
272 int32_t ref_seq_start;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
273 int32_t ref_seq_span;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
274 int32_t num_records;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
275 int32_t num_landmarks;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
276 int32_t *landmark;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
277
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
278 /* Flags from preservation map */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
279 int mapped_qs_included;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
280 int unmapped_qs_included;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
281 int unmapped_placed;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
282 int qs_included;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
283 int read_names_included;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
284 int AP_delta;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
285 // indexed by ref-base and subst. code
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
286 char substitution_matrix[5][4];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
287
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
288 // TD Dictionary as a concatenated block
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
289 cram_block *TD_blk; // Tag Dictionary
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
290 int nTL; // number of TL entries in TD
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
291 unsigned char **TL; // array of size nTL, pointer into TD_blk.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
292 khash_t(m_s2i) *TD_hash; // Keyed on TD strings, map to TL[] indices
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
293 string_alloc_t *TD_keys; // Pooled keys for TD hash.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
294
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
295 khash_t(map) *preservation_map;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
296 struct cram_map *rec_encoding_map[CRAM_MAP_HASH];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
297 struct cram_map *tag_encoding_map[CRAM_MAP_HASH];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
298
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
299 struct cram_codec *codecs[DS_END];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
300
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
301 char *uncomp; // A single block of uncompressed data
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
302 size_t uncomp_size, uncomp_alloc;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
303
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
304 unsigned int data_series; // See cram_fields enum below
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
305 } cram_block_compression_hdr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
306
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
307 typedef struct cram_map {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
308 int key; /* 0xe0 + 3 bytes */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
309 enum cram_encoding encoding;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
310 int offset; /* Offset into a single block of memory */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
311 int size; /* Size */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
312 struct cram_codec *codec;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
313 struct cram_map *next; // for noddy internal hash
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
314 } cram_map;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
315
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
316 /* Mapped or unmapped slice header block */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
317 typedef struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
318 enum cram_content_type content_type;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
319 int32_t ref_seq_id; /* if content_type == MAPPED_SLICE */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
320 int32_t ref_seq_start; /* if content_type == MAPPED_SLICE */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
321 int32_t ref_seq_span; /* if content_type == MAPPED_SLICE */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
322 int32_t num_records;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
323 int64_t record_counter;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
324 int32_t num_blocks;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
325 int32_t num_content_ids;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
326 int32_t *block_content_ids;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
327 int32_t ref_base_id; /* if content_type == MAPPED_SLICE */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
328 unsigned char md5[16];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
329 } cram_block_slice_hdr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
330
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
331 struct ref_entry;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
332
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
333 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
334 * Container.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
335 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
336 * Conceptually a container is split into slices, and slices into blocks.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
337 * However on disk it's just a list of blocks and we need to query the
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
338 * block types to identify the start/end points of the slices.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
339 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
340 * OR... are landmarks the start/end points of slices?
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
341 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
342 typedef struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
343 int32_t length;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
344 int32_t ref_seq_id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
345 int32_t ref_seq_start;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
346 int32_t ref_seq_span;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
347 int64_t record_counter;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
348 int64_t num_bases;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
349 int32_t num_records;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
350 int32_t num_blocks;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
351 int32_t num_landmarks;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
352 int32_t *landmark;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
353
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
354 /* Size of container header above */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
355 size_t offset;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
356
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
357 /* Compression header is always the first block? */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
358 cram_block_compression_hdr *comp_hdr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
359 cram_block *comp_hdr_block;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
360
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
361 /* For construction purposes */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
362 int max_slice, curr_slice; // maximum number of slices
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
363 int max_rec, curr_rec; // current and max recs per slice
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
364 int max_c_rec, curr_c_rec; // current and max recs per container
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
365 int slice_rec; // rec no. for start of this slice
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
366 int curr_ref; // current ref ID. -2 for no previous
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
367 int last_pos; // last record position
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
368 struct cram_slice **slices, *slice;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
369 int pos_sorted; // boolean, 1=>position sorted data
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
370 int max_apos; // maximum position, used if pos_sorted==0
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
371 int last_slice; // number of reads in last slice (0 for 1st)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
372 int multi_seq; // true if packing multi seqs per cont/slice
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
373 int unsorted; // true if AP_delta is 0.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
374
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
375 /* Copied from fd before encoding, to allow multi-threading */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
376 int ref_start, first_base, last_base, ref_id, ref_end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
377 char *ref;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
378 //struct ref_entry *ref;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
379
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
380 /* For multi-threading */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
381 bam_seq_t **bams;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
382
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
383 /* Statistics for encoding */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
384 cram_stats *stats[DS_END];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
385
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
386 khash_t(s_i2i) *tags_used; // set of tag types in use, for tag encoding map
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
387 int *refs_used; // array of frequency of ref seq IDs
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
388
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
389 uint32_t crc32; // CRC32
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
390 } cram_container;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
391
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
392 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
393 * A single cram record
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
394 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
395 typedef struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
396 struct cram_slice *s; // Filled out by cram_decode only
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
397
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
398 int32_t ref_id; // fixed for all recs in slice?
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
399 int32_t flags; // BF
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
400 int32_t cram_flags; // CF
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
401 int32_t len; // RL
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
402 int32_t apos; // AP
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
403 int32_t rg; // RG
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
404 int32_t name; // RN; idx to s->names_blk
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
405 int32_t name_len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
406 int32_t mate_line; // index to another cram_record
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
407 int32_t mate_ref_id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
408 int32_t mate_pos; // NP
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
409 int32_t tlen; // TS
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
410
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
411 // Auxiliary data
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
412 int32_t ntags; // TC
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
413 int32_t aux; // idx to s->aux_blk
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
414 int32_t aux_size; // total size of packed ntags in aux_blk
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
415 #ifndef TN_external
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
416 int32_t TN_idx; // TN; idx to s->TN;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
417 #else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
418 int32_t tn; // idx to s->tn_blk
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
419 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
420 int TL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
421
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
422 int32_t seq; // idx to s->seqs_blk
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
423 int32_t qual; // idx to s->qual_blk
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
424 int32_t cigar; // idx to s->cigar
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
425 int32_t ncigar;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
426 int32_t aend; // alignment end
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
427 int32_t mqual; // MQ
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
428
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
429 int32_t feature; // idx to s->feature
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
430 int32_t nfeature; // number of features
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
431 int32_t mate_flags; // MF
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
432 } cram_record;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
433
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
434 // Accessor macros as an analogue of the bam ones
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
435 #define cram_qname(c) (&(c)->s->name_blk->data[(c)->name])
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
436 #define cram_seq(c) (&(c)->s->seqs_blk->data[(c)->seq])
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
437 #define cram_qual(c) (&(c)->s->qual_blk->data[(c)->qual])
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
438 #define cram_aux(c) (&(c)->s->aux_blk->data[(c)->aux])
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
439 #define cram_seqi(c,i) (cram_seq((c))[(i)])
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
440 #define cram_name_len(c) ((c)->name_len)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
441 #define cram_strand(c) (((c)->flags & BAM_FREVERSE) != 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
442 #define cram_mstrand(c) (((c)->flags & BAM_FMREVERSE) != 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
443 #define cram_cigar(c) (&((c)->s->cigar)[(c)->cigar]) // entry in s->cigar[] at index (c)->cigar
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
444
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
445 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
446 * A feature is a base difference, used for the sequence reference encoding.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
447 * (We generate these internally when writing CRAM.)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
448 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
449 typedef struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
450 union {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
451 struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
452 int pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
453 int code;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
454 int base; // substitution code
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
455 } X;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
456 struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
457 int pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
458 int code;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
459 int base; // actual base & qual
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
460 int qual;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
461 } B;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
462 struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
463 int pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
464 int code;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
465 int seq_idx; // index to s->seqs_blk
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
466 int len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
467 } b;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
468 struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
469 int pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
470 int code;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
471 int qual;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
472 } Q;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
473 struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
474 int pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
475 int code;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
476 int len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
477 int seq_idx; // soft-clip multiple bases
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
478 } S;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
479 struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
480 int pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
481 int code;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
482 int len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
483 int seq_idx; // insertion multiple bases
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
484 } I;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
485 struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
486 int pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
487 int code;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
488 int base; // insertion single base
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
489 } i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
490 struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
491 int pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
492 int code;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
493 int len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
494 } D;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
495 struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
496 int pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
497 int code;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
498 int len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
499 } N;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
500 struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
501 int pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
502 int code;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
503 int len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
504 } P;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
505 struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
506 int pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
507 int code;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
508 int len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
509 } H;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
510 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
511 } cram_feature;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
512
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
513 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
514 * A slice is really just a set of blocks, but it
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
515 * is the logical unit for decoding a number of
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
516 * sequences.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
517 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
518 typedef struct cram_slice {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
519 cram_block_slice_hdr *hdr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
520 cram_block *hdr_block;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
521 cram_block **block;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
522 cram_block **block_by_id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
523
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
524 /* State used during encoding/decoding */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
525 int last_apos, max_apos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
526
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
527 /* Array of decoded cram records */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
528 cram_record *crecs;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
529
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
530 /* Dynamically growing buffers for data pointed
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
531 * to by crecs[] array.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
532 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
533 uint32_t *cigar;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
534 uint32_t cigar_alloc;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
535 uint32_t ncigar;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
536
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
537 cram_feature *features;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
538 int nfeatures;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
539 int afeatures; // allocated size of features
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
540
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
541 #ifndef TN_external
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
542 // TN field (Tag Name)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
543 uint32_t *TN;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
544 int nTN, aTN; // used and allocated size for TN[]
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
545 #else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
546 cram_block *tn_blk;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
547 int tn_id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
548 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
549
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
550 // For variable sized elements which are always external blocks.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
551 cram_block *name_blk;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
552 cram_block *seqs_blk;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
553 cram_block *qual_blk;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
554 cram_block *base_blk;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
555 cram_block *soft_blk;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
556 cram_block *aux_blk;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
557 cram_block *aux_OQ_blk;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
558 cram_block *aux_BQ_blk;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
559 cram_block *aux_BD_blk;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
560 cram_block *aux_BI_blk;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
561 cram_block *aux_FZ_blk;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
562 cram_block *aux_oq_blk;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
563 cram_block *aux_os_blk;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
564 cram_block *aux_oz_blk;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
565
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
566 string_alloc_t *pair_keys; // Pooled keys for pair hash.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
567 khash_t(m_s2i) *pair[2]; // for identifying read-pairs in this slice.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
568
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
569 char *ref; // slice of current reference
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
570 int ref_start; // start position of current reference;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
571 int ref_end; // end position of current reference;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
572 int ref_id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
573 } cram_slice;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
574
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
575 /*-----------------------------------------------------------------------------
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
576 * Consider moving reference handling to cram_refs.[ch]
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
577 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
578 // from fa.fai / samtools faidx files
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
579 typedef struct ref_entry {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
580 char *name;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
581 char *fn;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
582 int64_t length;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
583 int64_t offset;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
584 int bases_per_line;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
585 int line_length;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
586 int64_t count; // for shared references so we know to dealloc seq
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
587 char *seq;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
588 } ref_entry;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
589
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
590 KHASH_MAP_INIT_STR(refs, ref_entry*)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
591
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
592 // References structure.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
593 typedef struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
594 string_alloc_t *pool; // String pool for holding filenames and SN vals
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
595
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
596 khash_t(refs) *h_meta; // ref_entry*, index by name
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
597 ref_entry **ref_id; // ref_entry*, index by ID
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
598 int nref; // number of ref_entry
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
599
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
600 char *fn; // current file opened
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
601 BGZF *fp; // and the BGZF* to go with it.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
602
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
603 int count; // how many cram_fd sharing this refs struct
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
604
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
605 pthread_mutex_t lock; // Mutex for multi-threaded updating
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
606 ref_entry *last; // Last queried sequence
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
607 int last_id; // Used in cram_ref_decr_locked to delay free
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
608 } refs_t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
609
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
610 /*-----------------------------------------------------------------------------
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
611 * CRAM index
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
612 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
613 * Detect format by number of entries per line.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
614 * 5 => 1.0 (refid, start, nseq, C offset, slice)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
615 * 6 => 1.1 (refid, start, span, C offset, S offset, S size)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
616 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
617 * Indices are stored in a nested containment list, which is trivial to set
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
618 * up as the indices are on sorted data so we're appending to the nclist
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
619 * in sorted order. Basically if a slice entirely fits within a previous
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
620 * slice then we append to that slice's list. This is done recursively.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
621 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
622 * Lists are sorted on two dimensions: ref id + slice coords.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
623 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
624 typedef struct cram_index {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
625 int nslice, nalloc; // total number of slices
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
626 struct cram_index *e; // array of size nslice
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
627
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
628 int refid; // 1.0 1.1
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
629 int start; // 1.0 1.1
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
630 int end; // 1.1
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
631 int nseq; // 1.0 - undocumented
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
632 int slice; // 1.0 landmark index, 1.1 landmark value
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
633 int len; // 1.1 - size of slice in bytes
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
634 int64_t offset; // 1.0 1.1
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
635 } cram_index;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
636
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
637 typedef struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
638 int refid;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
639 int start;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
640 int end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
641 } cram_range;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
642
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
643 /*-----------------------------------------------------------------------------
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
644 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
645 /* CRAM File handle */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
646
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
647 typedef struct spare_bams {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
648 bam_seq_t **bams;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
649 struct spare_bams *next;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
650 } spare_bams;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
651
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
652 typedef struct cram_fd {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
653 struct hFILE *fp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
654 int mode; // 'r' or 'w'
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
655 int version;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
656 cram_file_def *file_def;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
657 SAM_hdr *header;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
658
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
659 char *prefix;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
660 int64_t record_counter;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
661 int err;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
662
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
663 // Most recent compression header decoded
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
664 //cram_block_compression_hdr *comp_hdr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
665 //cram_block_slice_hdr *slice_hdr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
666
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
667 // Current container being processed.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
668 cram_container *ctr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
669
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
670 // positions for encoding or decoding
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
671 int first_base, last_base;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
672
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
673 // cached reference portion
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
674 refs_t *refs; // ref meta-data structure
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
675 char *ref, *ref_free; // current portion held in memory
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
676 int ref_id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
677 int ref_start;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
678 int ref_end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
679 char *ref_fn; // reference fasta filename
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
680
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
681 // compression level and metrics
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
682 int level;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
683 cram_metrics *m[DS_END];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
684
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
685 // options
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
686 int decode_md; // Whether to export MD and NM tags
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
687 int verbose;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
688 int seqs_per_slice;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
689 int slices_per_container;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
690 int embed_ref;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
691 int no_ref;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
692 int ignore_md5;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
693 int use_bz2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
694 int use_rans;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
695 int use_lzma;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
696 int shared_ref;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
697 unsigned int required_fields;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
698 cram_range range;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
699
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
700 // lookup tables, stored here so we can be trivially multi-threaded
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
701 unsigned int bam_flag_swap[0x1000]; // cram -> bam flags
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
702 unsigned int cram_flag_swap[0x1000];// bam -> cram flags
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
703 unsigned char L1[256]; // ACGT{*} ->0123{4}
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
704 unsigned char L2[256]; // ACGTN{*}->01234{5}
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
705 char cram_sub_matrix[32][32]; // base substitution codes
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
706
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
707 int index_sz;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
708 cram_index *index; // array of size index_sz
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
709 off_t first_container;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
710 int eof;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
711 int last_slice; // number of recs encoded in last slice
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
712 int multi_seq;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
713 int unsorted;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
714 int empty_container; // Marker for EOF block
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
715
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
716 // thread pool
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
717 int own_pool;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
718 t_pool *pool;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
719 t_results_queue *rqueue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
720 pthread_mutex_t metrics_lock;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
721 pthread_mutex_t ref_lock;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
722 spare_bams *bl;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
723 pthread_mutex_t bam_list_lock;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
724 void *job_pending;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
725 int ooc; // out of containers.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
726 } cram_fd;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
727
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
728 // Translation of required fields to cram data series
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
729 enum cram_fields {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
730 CRAM_BF = 0x00000001,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
731 CRAM_AP = 0x00000002,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
732 CRAM_FP = 0x00000004,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
733 CRAM_RL = 0x00000008,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
734 CRAM_DL = 0x00000010,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
735 CRAM_NF = 0x00000020,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
736 CRAM_BA = 0x00000040,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
737 CRAM_QS = 0x00000080,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
738 CRAM_FC = 0x00000100,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
739 CRAM_FN = 0x00000200,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
740 CRAM_BS = 0x00000400,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
741 CRAM_IN = 0x00000800,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
742 CRAM_RG = 0x00001000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
743 CRAM_MQ = 0x00002000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
744 CRAM_TL = 0x00004000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
745 CRAM_RN = 0x00008000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
746 CRAM_NS = 0x00010000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
747 CRAM_NP = 0x00020000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
748 CRAM_TS = 0x00040000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
749 CRAM_MF = 0x00080000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
750 CRAM_CF = 0x00100000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
751 CRAM_RI = 0x00200000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
752 CRAM_RS = 0x00400000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
753 CRAM_PD = 0x00800000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
754 CRAM_HC = 0x01000000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
755 CRAM_SC = 0x02000000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
756 CRAM_BB = 0x04000000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
757 CRAM_BB_len = 0x08000000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
758 CRAM_QQ = 0x10000000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
759 CRAM_QQ_len = 0x20000000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
760 CRAM_aux= 0x40000000,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
761 CRAM_ALL= 0x7fffffff,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
762 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
763
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
764 // A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
765 // encode a base difference, but we don't need to know what it is for CIGAR.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
766 // If we have a soft-clip or insertion, we do need SC/IN though to know how
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
767 // long that array is.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
768 #define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
769 CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
770
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
771 #define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_QS | CRAM_BS | \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
772 CRAM_RL | CRAM_AP | CRAM_BB | CRAM_QQ)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
773
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
774 /* BF bitfields: one distinct bit per macro, from 0x100 down to 0x001 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
775 /* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
776 #define CRAM_FPAIRED 256 /* 0x100 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
777 #define CRAM_FPROPER_PAIR 128 /* 0x080 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
778 #define CRAM_FUNMAP 64 /* 0x040 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
779 #define CRAM_FREVERSE 32 /* 0x020 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
780 #define CRAM_FREAD1 16 /* 0x010 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
781 #define CRAM_FREAD2 8 /* 0x008 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
782 #define CRAM_FSECONDARY 4 /* 0x004 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
783 #define CRAM_FQCFAIL 2 /* 0x002 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
784 #define CRAM_FDUP 1 /* 0x001 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
785
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
786 #define DS_aux_S "\001" /* string literal holding one byte; the escape is octal, value 1 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
787 #define DS_aux_OQ_S "\002" /* byte value 2 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
788 #define DS_aux_BQ_S "\003" /* byte value 3 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
789 #define DS_aux_BD_S "\004" /* byte value 4 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
790 #define DS_aux_BI_S "\005" /* byte value 5 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
791 #define DS_aux_FZ_S "\006" /* byte value 6 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
792 #define DS_aux_oq_S "\007" /* byte value 7 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
793 #define DS_aux_os_S "\010" /* octal 010 == byte value 8 (not 10) */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
794 #define DS_aux_oz_S "\011" /* octal 011 == byte value 9 (not 11) */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
795
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
796 #define CRAM_M_REVERSE 1 /* bit 0 (0x1). NOTE(review): presumably a mate-flag bit (mate on reverse strand) - confirm against users */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
797 #define CRAM_M_UNMAP 2 /* bit 1 (0x2). NOTE(review): presumably a mate-flag bit (mate unmapped) - confirm against users */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
798
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
799
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
800 /* CF bitfields: each macro below is a distinct single-bit mask */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
801 #define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0) /* 0x1 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
802 #define CRAM_FLAG_DETACHED (1<<1) /* 0x2 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
803 #define CRAM_FLAG_MATE_DOWNSTREAM (1<<2) /* 0x4 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
804
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
805 #ifdef __cplusplus
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
806 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
807 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
808
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
809 #endif /* _CRAM_STRUCTS_H_ */