Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/cram/cram_encode.c @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfa3745e5fd8 |
---|---|
1 /* | |
2 Copyright (c) 2012-2013 Genome Research Ltd. | |
3 Author: James Bonfield <jkb@sanger.ac.uk> | |
4 | |
5 Redistribution and use in source and binary forms, with or without | |
6 modification, are permitted provided that the following conditions are met: | |
7 | |
8 1. Redistributions of source code must retain the above copyright notice, | |
9 this list of conditions and the following disclaimer. | |
10 | |
11 2. Redistributions in binary form must reproduce the above copyright notice, | |
12 this list of conditions and the following disclaimer in the documentation | |
13 and/or other materials provided with the distribution. | |
14 | |
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger | |
16 Institute nor the names of its contributors may be used to endorse or promote | |
17 products derived from this software without specific prior written permission. | |
18 | |
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND | |
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE | |
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 */ | |
30 | |
31 #ifdef HAVE_CONFIG_H | |
32 #include "io_lib_config.h" | |
33 #endif | |
34 | |
35 #include <stdio.h> | |
36 #include <errno.h> | |
37 #include <assert.h> | |
38 #include <stdlib.h> | |
39 #include <string.h> | |
40 #include <zlib.h> | |
41 #include <sys/types.h> | |
42 #include <sys/stat.h> | |
43 #include <math.h> | |
44 #include <ctype.h> | |
45 | |
46 #include "cram/cram.h" | |
47 #include "cram/os.h" | |
48 #include "cram/md5.h" | |
49 | |
50 #define Z_CRAM_STRAT Z_FILTERED | |
51 //#define Z_CRAM_STRAT Z_RLE | |
52 //#define Z_CRAM_STRAT Z_HUFFMAN_ONLY | |
53 //#define Z_CRAM_STRAT Z_DEFAULT_STRATEGY | |
54 | |
55 static int process_one_read(cram_fd *fd, cram_container *c, | |
56 cram_slice *s, cram_record *cr, | |
57 bam_seq_t *b, int rnum); | |
58 | |
59 /* | |
60 * Returns index of val into key. | |
61 * Basically strchr(key, val)-key; | |
62 */ | |
63 static int sub_idx(char *key, char val) { | |
64 int i; | |
65 | |
66 for (i = 0; *key && *key++ != val; i++); | |
67 return i; | |
68 } | |
69 | |
70 /* | |
71 * Encodes a compression header block into a generic cram_block structure. | |
72 * | |
73 * Returns cram_block ptr on success | |
74 * NULL on failure | |
75 */ | |
76 cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, | |
77 cram_block_compression_hdr *h) { | |
78 cram_block *cb = cram_new_block(COMPRESSION_HEADER, 0); | |
79 cram_block *map = cram_new_block(COMPRESSION_HEADER, 0); | |
80 int i, mc; | |
81 | |
82 if (!cb || !map) | |
83 return NULL; | |
84 | |
85 /* | |
86 * This is a concatenation of several blocks of data: | |
87 * header + landmarks, preservation map, read encoding map, and the tag | |
88 * encoding map. | |
89 * All 4 are variable sized and we need to know how large these are | |
90 * before creating the compression header itself as this starts with | |
91 * the total size (stored as a variable length string). | |
92 */ | |
93 | |
94 // Duplicated from container itself, and removed in 1.1 | |
95 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
96 itf8_put_blk(cb, h->ref_seq_id); | |
97 itf8_put_blk(cb, h->ref_seq_start); | |
98 itf8_put_blk(cb, h->ref_seq_span); | |
99 itf8_put_blk(cb, h->num_records); | |
100 itf8_put_blk(cb, h->num_landmarks); | |
101 for (i = 0; i < h->num_landmarks; i++) { | |
102 itf8_put_blk(cb, h->landmark[i]); | |
103 } | |
104 } | |
105 | |
106 /* Create in-memory preservation map */ | |
107 /* FIXME: should create this when we create the container */ | |
108 { | |
109 khint_t k; | |
110 int r; | |
111 | |
112 if (!(h->preservation_map = kh_init(map))) | |
113 return NULL; | |
114 | |
115 k = kh_put(map, h->preservation_map, "RN", &r); | |
116 if (-1 == r) return NULL; | |
117 kh_val(h->preservation_map, k).i = 1; | |
118 | |
119 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
120 k = kh_put(map, h->preservation_map, "PI", &r); | |
121 if (-1 == r) return NULL; | |
122 kh_val(h->preservation_map, k).i = 0; | |
123 | |
124 k = kh_put(map, h->preservation_map, "UI", &r); | |
125 if (-1 == r) return NULL; | |
126 kh_val(h->preservation_map, k).i = 1; | |
127 | |
128 k = kh_put(map, h->preservation_map, "MI", &r); | |
129 if (-1 == r) return NULL; | |
130 kh_val(h->preservation_map, k).i = 1; | |
131 | |
132 } else { | |
133 // Technically SM was in 1.0, but wasn't in Java impl. | |
134 k = kh_put(map, h->preservation_map, "SM", &r); | |
135 if (-1 == r) return NULL; | |
136 kh_val(h->preservation_map, k).i = 0; | |
137 | |
138 k = kh_put(map, h->preservation_map, "TD", &r); | |
139 if (-1 == r) return NULL; | |
140 kh_val(h->preservation_map, k).i = 0; | |
141 | |
142 k = kh_put(map, h->preservation_map, "AP", &r); | |
143 if (-1 == r) return NULL; | |
144 kh_val(h->preservation_map, k).i = c->pos_sorted; | |
145 | |
146 if (fd->no_ref || fd->embed_ref) { | |
147 // Reference Required == No | |
148 k = kh_put(map, h->preservation_map, "RR", &r); | |
149 if (-1 == r) return NULL; | |
150 kh_val(h->preservation_map, k).i = 0; | |
151 } | |
152 } | |
153 } | |
154 | |
155 /* Encode preservation map; could collapse this and above into one */ | |
156 mc = 0; | |
157 BLOCK_SIZE(map) = 0; | |
158 if (h->preservation_map) { | |
159 khint_t k; | |
160 | |
161 for (k = kh_begin(h->preservation_map); | |
162 k != kh_end(h->preservation_map); | |
163 k++) { | |
164 const char *key; | |
165 khash_t(map) *pmap = h->preservation_map; | |
166 | |
167 | |
168 if (!kh_exist(pmap, k)) | |
169 continue; | |
170 | |
171 key = kh_key(pmap, k); | |
172 BLOCK_APPEND(map, key, 2); | |
173 | |
174 switch(CRAM_KEY(key[0], key[1])) { | |
175 case CRAM_KEY('M','I'): | |
176 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); | |
177 break; | |
178 | |
179 case CRAM_KEY('U','I'): | |
180 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); | |
181 break; | |
182 | |
183 case CRAM_KEY('P','I'): | |
184 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); | |
185 break; | |
186 | |
187 case CRAM_KEY('A','P'): | |
188 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); | |
189 break; | |
190 | |
191 case CRAM_KEY('R','N'): | |
192 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); | |
193 break; | |
194 | |
195 case CRAM_KEY('R','R'): | |
196 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); | |
197 break; | |
198 | |
199 case CRAM_KEY('S','M'): { | |
200 char smat[5], *mp = smat; | |
201 *mp++ = | |
202 (sub_idx("CGTN", h->substitution_matrix[0][0]) << 6) | | |
203 (sub_idx("CGTN", h->substitution_matrix[0][1]) << 4) | | |
204 (sub_idx("CGTN", h->substitution_matrix[0][2]) << 2) | | |
205 (sub_idx("CGTN", h->substitution_matrix[0][3]) << 0); | |
206 *mp++ = | |
207 (sub_idx("AGTN", h->substitution_matrix[1][0]) << 6) | | |
208 (sub_idx("AGTN", h->substitution_matrix[1][1]) << 4) | | |
209 (sub_idx("AGTN", h->substitution_matrix[1][2]) << 2) | | |
210 (sub_idx("AGTN", h->substitution_matrix[1][3]) << 0); | |
211 *mp++ = | |
212 (sub_idx("ACTN", h->substitution_matrix[2][0]) << 6) | | |
213 (sub_idx("ACTN", h->substitution_matrix[2][1]) << 4) | | |
214 (sub_idx("ACTN", h->substitution_matrix[2][2]) << 2) | | |
215 (sub_idx("ACTN", h->substitution_matrix[2][3]) << 0); | |
216 *mp++ = | |
217 (sub_idx("ACGN", h->substitution_matrix[3][0]) << 6) | | |
218 (sub_idx("ACGN", h->substitution_matrix[3][1]) << 4) | | |
219 (sub_idx("ACGN", h->substitution_matrix[3][2]) << 2) | | |
220 (sub_idx("ACGN", h->substitution_matrix[3][3]) << 0); | |
221 *mp++ = | |
222 (sub_idx("ACGT", h->substitution_matrix[4][0]) << 6) | | |
223 (sub_idx("ACGT", h->substitution_matrix[4][1]) << 4) | | |
224 (sub_idx("ACGT", h->substitution_matrix[4][2]) << 2) | | |
225 (sub_idx("ACGT", h->substitution_matrix[4][3]) << 0); | |
226 BLOCK_APPEND(map, smat, 5); | |
227 break; | |
228 } | |
229 | |
230 case CRAM_KEY('T','D'): { | |
231 itf8_put_blk(map, BLOCK_SIZE(h->TD_blk)); | |
232 BLOCK_APPEND(map, | |
233 BLOCK_DATA(h->TD_blk), | |
234 BLOCK_SIZE(h->TD_blk)); | |
235 break; | |
236 } | |
237 | |
238 default: | |
239 fprintf(stderr, "Unknown preservation key '%.2s'\n", key); | |
240 break; | |
241 } | |
242 | |
243 mc++; | |
244 } | |
245 } | |
246 itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); | |
247 itf8_put_blk(cb, mc); | |
248 BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); | |
249 | |
250 /* rec encoding map */ | |
251 mc = 0; | |
252 BLOCK_SIZE(map) = 0; | |
253 if (h->codecs[DS_BF]) { | |
254 if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF", | |
255 fd->version)) | |
256 return NULL; | |
257 mc++; | |
258 } | |
259 if (h->codecs[DS_CF]) { | |
260 if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF", | |
261 fd->version)) | |
262 return NULL; | |
263 mc++; | |
264 } | |
265 if (h->codecs[DS_RL]) { | |
266 if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL", | |
267 fd->version)) | |
268 return NULL; | |
269 mc++; | |
270 } | |
271 if (h->codecs[DS_AP]) { | |
272 if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP", | |
273 fd->version)) | |
274 return NULL; | |
275 mc++; | |
276 } | |
277 if (h->codecs[DS_RG]) { | |
278 if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG", | |
279 fd->version)) | |
280 return NULL; | |
281 mc++; | |
282 } | |
283 if (h->codecs[DS_MF]) { | |
284 if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF", | |
285 fd->version)) | |
286 return NULL; | |
287 mc++; | |
288 } | |
289 if (h->codecs[DS_NS]) { | |
290 if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS", | |
291 fd->version)) | |
292 return NULL; | |
293 mc++; | |
294 } | |
295 if (h->codecs[DS_NP]) { | |
296 if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP", | |
297 fd->version)) | |
298 return NULL; | |
299 mc++; | |
300 } | |
301 if (h->codecs[DS_TS]) { | |
302 if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS", | |
303 fd->version)) | |
304 return NULL; | |
305 mc++; | |
306 } | |
307 if (h->codecs[DS_NF]) { | |
308 if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF", | |
309 fd->version)) | |
310 return NULL; | |
311 mc++; | |
312 } | |
313 if (h->codecs[DS_TC]) { | |
314 if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC", | |
315 fd->version)) | |
316 return NULL; | |
317 mc++; | |
318 } | |
319 if (h->codecs[DS_TN]) { | |
320 if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN", | |
321 fd->version)) | |
322 return NULL; | |
323 mc++; | |
324 } | |
325 if (h->codecs[DS_TL]) { | |
326 if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL", | |
327 fd->version)) | |
328 return NULL; | |
329 mc++; | |
330 } | |
331 if (h->codecs[DS_FN]) { | |
332 if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN", | |
333 fd->version)) | |
334 return NULL; | |
335 mc++; | |
336 } | |
337 if (h->codecs[DS_FC]) { | |
338 if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC", | |
339 fd->version)) | |
340 return NULL; | |
341 mc++; | |
342 } | |
343 if (h->codecs[DS_FP]) { | |
344 if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP", | |
345 fd->version)) | |
346 return NULL; | |
347 mc++; | |
348 } | |
349 if (h->codecs[DS_BS]) { | |
350 if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS", | |
351 fd->version)) | |
352 return NULL; | |
353 mc++; | |
354 } | |
355 if (h->codecs[DS_IN]) { | |
356 if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN", | |
357 fd->version)) | |
358 return NULL; | |
359 mc++; | |
360 } | |
361 if (h->codecs[DS_DL]) { | |
362 if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL", | |
363 fd->version)) | |
364 return NULL; | |
365 mc++; | |
366 } | |
367 if (h->codecs[DS_BA]) { | |
368 if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA", | |
369 fd->version)) | |
370 return NULL; | |
371 mc++; | |
372 } | |
373 if (h->codecs[DS_BB]) { | |
374 if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB", | |
375 fd->version)) | |
376 return NULL; | |
377 mc++; | |
378 } | |
379 if (h->codecs[DS_MQ]) { | |
380 if (-1 == h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ", | |
381 fd->version)) | |
382 return NULL; | |
383 mc++; | |
384 } | |
385 if (h->codecs[DS_RN]) { | |
386 if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN", | |
387 fd->version)) | |
388 return NULL; | |
389 mc++; | |
390 } | |
391 if (h->codecs[DS_QS]) { | |
392 if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS", | |
393 fd->version)) | |
394 return NULL; | |
395 mc++; | |
396 } | |
397 if (h->codecs[DS_QQ]) { | |
398 if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ", | |
399 fd->version)) | |
400 return NULL; | |
401 mc++; | |
402 } | |
403 if (h->codecs[DS_RI]) { | |
404 if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI", | |
405 fd->version)) | |
406 return NULL; | |
407 mc++; | |
408 } | |
409 if (CRAM_MAJOR_VERS(fd->version) != 1) { | |
410 if (h->codecs[DS_SC]) { | |
411 if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC", | |
412 fd->version)) | |
413 return NULL; | |
414 mc++; | |
415 } | |
416 if (h->codecs[DS_RS]) { | |
417 if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS", | |
418 fd->version)) | |
419 return NULL; | |
420 mc++; | |
421 } | |
422 if (h->codecs[DS_PD]) { | |
423 if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD", | |
424 fd->version)) | |
425 return NULL; | |
426 mc++; | |
427 } | |
428 if (h->codecs[DS_HC]) { | |
429 if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC", | |
430 fd->version)) | |
431 return NULL; | |
432 mc++; | |
433 } | |
434 } | |
435 if (h->codecs[DS_TM]) { | |
436 if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM", | |
437 fd->version)) | |
438 return NULL; | |
439 mc++; | |
440 } | |
441 if (h->codecs[DS_TV]) { | |
442 if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV", | |
443 fd->version)) | |
444 return NULL; | |
445 mc++; | |
446 } | |
447 itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); | |
448 itf8_put_blk(cb, mc); | |
449 BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); | |
450 | |
451 /* tag encoding map */ | |
452 #if 0 | |
453 mp = map; mc = 0; | |
454 if (h->tag_encoding_map) { | |
455 HashItem *hi; | |
456 HashIter *iter = HashTableIterCreate(); | |
457 if (!iter) | |
458 return NULL; | |
459 | |
460 while ((hi = HashTableIterNext(h->tag_encoding_map, iter))) { | |
461 cram_map *m = hi->data.p; | |
462 int sz; | |
463 | |
464 mp += itf8_put(mp, (hi->key[0]<<16)|(hi->key[1]<<8)|hi->key[2]); | |
465 if (-1 == (sz = m->codec->store(m->codec, mp, NULL, fd->version))) | |
466 return NULL; | |
467 mp += sz; | |
468 mc++; | |
469 } | |
470 | |
471 HashTableIterDestroy(iter); | |
472 } | |
473 #else | |
474 mc = 0; | |
475 BLOCK_SIZE(map) = 0; | |
476 if (c->tags_used) { | |
477 khint_t k; | |
478 | |
479 #define TAG_ID(a) ((#a[0]<<8)+#a[1]) | |
480 | |
481 for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) { | |
482 int key; | |
483 if (!kh_exist(c->tags_used, k)) | |
484 continue; | |
485 | |
486 mc++; | |
487 itf8_put_blk(map, kh_key(c->tags_used, k)); | |
488 | |
489 // use block content id 4 | |
490 switch((key = kh_key(c->tags_used, k)) & 0xff) { | |
491 case 'Z': case 'H': | |
492 // string as byte_array_stop | |
493 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
494 BLOCK_APPEND(map, | |
495 "\005" // BYTE_ARRAY_STOP | |
496 "\005" // len | |
497 "\t" // stop-byte is also SAM separator | |
498 DS_aux_S "\000\000\000", | |
499 7); | |
500 } else { | |
501 if (key>>8 == TAG_ID(OQ)) | |
502 BLOCK_APPEND(map, | |
503 "\005" // BYTE_ARRAY_STOP | |
504 "\002" // len | |
505 "\t" // stop-byte is also SAM separator | |
506 DS_aux_OQ_S, | |
507 4); | |
508 else if (key>>8 == TAG_ID(BQ)) | |
509 BLOCK_APPEND(map, | |
510 "\005" // BYTE_ARRAY_STOP | |
511 "\002" // len | |
512 "\t" // stop-byte is also SAM separator | |
513 DS_aux_BQ_S, | |
514 4); | |
515 else if (key>>8 == TAG_ID(BD)) | |
516 BLOCK_APPEND(map, | |
517 "\005" // BYTE_ARRAY_STOP | |
518 "\002" // len | |
519 "\t" // stop-byte is also SAM separator | |
520 DS_aux_BD_S, | |
521 4); | |
522 else if (key>>8 == TAG_ID(BI)) | |
523 BLOCK_APPEND(map, | |
524 "\005" // BYTE_ARRAY_STOP | |
525 "\002" // len | |
526 "\t" // stop-byte is also SAM separator | |
527 DS_aux_BI_S, | |
528 4); | |
529 else if ((key>>8 == TAG_ID(Q2)) || | |
530 (key>>8 == TAG_ID(U2)) || | |
531 (key>>8 == TAG_ID(QT)) || | |
532 (key>>8 == TAG_ID(CQ))) | |
533 BLOCK_APPEND(map, | |
534 "\005" // BYTE_ARRAY_STOP | |
535 "\002" // len | |
536 "\t" // stop-byte is also SAM separator | |
537 DS_aux_oq_S, | |
538 4); | |
539 else if ((key>>8 == TAG_ID(R2)) || | |
540 (key>>8 == TAG_ID(E2)) || | |
541 (key>>8 == TAG_ID(CS)) || | |
542 (key>>8 == TAG_ID(BC)) || | |
543 (key>>8 == TAG_ID(RT))) | |
544 BLOCK_APPEND(map, | |
545 "\005" // BYTE_ARRAY_STOP | |
546 "\002" // len | |
547 "\t" // stop-byte is also SAM separator | |
548 DS_aux_os_S, | |
549 4); | |
550 else | |
551 BLOCK_APPEND(map, | |
552 "\005" // BYTE_ARRAY_STOP | |
553 "\002" // len | |
554 "\t" // stop-byte is also SAM separator | |
555 DS_aux_oz_S, | |
556 4); | |
557 } | |
558 break; | |
559 | |
560 case 'A': case 'c': case 'C': | |
561 // byte array len, 1 byte | |
562 BLOCK_APPEND(map, | |
563 "\004" // BYTE_ARRAY_LEN | |
564 "\011" // length | |
565 "\003" // HUFFMAN (len) | |
566 "\004" // huffman-len | |
567 "\001" // 1 symbol | |
568 "\001" // symbol=1 byte value | |
569 "\001" // 1 length | |
570 "\000" // length=0 | |
571 "\001" // EXTERNAL (val) | |
572 "\001" // external-len | |
573 DS_aux_S,// content-id | |
574 11); | |
575 break; | |
576 | |
577 case 's': case 'S': | |
578 // byte array len, 2 byte | |
579 BLOCK_APPEND(map, | |
580 "\004" // BYTE_ARRAY_LEN | |
581 "\011" // length | |
582 "\003" // HUFFMAN (len) | |
583 "\004" // huffman-len | |
584 "\001" // 1 symbol | |
585 "\002" // symbol=2 byte value | |
586 "\001" // 1 length | |
587 "\000" // length=0 | |
588 "\001" // EXTERNAL (val) | |
589 "\001" // external-len | |
590 DS_aux_S,// content-id | |
591 11); | |
592 break; | |
593 | |
594 case 'i': case 'I': case 'f': | |
595 // byte array len, 4 byte | |
596 BLOCK_APPEND(map, | |
597 "\004" // BYTE_ARRAY_LEN | |
598 "\011" // length | |
599 "\003" // HUFFMAN (len) | |
600 "\004" // huffman-len | |
601 "\001" // 1 symbol | |
602 "\004" // symbol=4 byte value | |
603 "\001" // 1 length | |
604 "\000" // length=0 | |
605 "\001" // EXTERNAL (val) | |
606 "\001" // external-len | |
607 DS_aux_S,// content-id | |
608 11); | |
609 break; | |
610 | |
611 case 'B': | |
612 // Byte array of variable size, but we generate our tag | |
613 // byte stream at the wrong stage (during reading and not | |
614 // after slice header construction). So we use | |
615 // BYTE_ARRAY_LEN with the length codec being external | |
616 // too. | |
617 if ((key>>8 == TAG_ID(FZ)) || (key>>8 == TAG_ID(ZM))) | |
618 BLOCK_APPEND(map, | |
619 "\004" // BYTE_ARRAY_LEN | |
620 "\006" // length | |
621 "\001" // EXTERNAL (len) | |
622 "\001" // external-len | |
623 DS_aux_FZ_S // content-id | |
624 "\001" // EXTERNAL (val) | |
625 "\001" // external-len | |
626 DS_aux_FZ_S,// content-id | |
627 8); | |
628 else | |
629 BLOCK_APPEND(map, | |
630 "\004" // BYTE_ARRAY_LEN | |
631 "\006" // length | |
632 "\001" // EXTERNAL (len) | |
633 "\001" // external-len | |
634 DS_aux_S // content-id | |
635 "\001" // EXTERNAL (val) | |
636 "\001" // external-len | |
637 DS_aux_S,// content-id | |
638 8); | |
639 break; | |
640 | |
641 default: | |
642 fprintf(stderr, "Unsupported SAM aux type '%c'\n", | |
643 kh_key(c->tags_used, k) & 0xff); | |
644 } | |
645 //mp += m->codec->store(m->codec, mp, NULL, fd->version); | |
646 } | |
647 } | |
648 #endif | |
649 itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); | |
650 itf8_put_blk(cb, mc); | |
651 BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); | |
652 | |
653 if (fd->verbose) | |
654 fprintf(stderr, "Wrote compression block header in %d bytes\n", | |
655 (int)BLOCK_SIZE(cb)); | |
656 | |
657 BLOCK_UPLEN(cb); | |
658 | |
659 cram_free_block(map); | |
660 | |
661 return cb; | |
662 } | |
663 | |
664 | |
665 /* | |
666 * Encodes a slice compression header. | |
667 * | |
668 * Returns cram_block on success | |
669 * NULL on failure | |
670 */ | |
671 cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) { | |
672 char *buf; | |
673 char *cp; | |
674 cram_block *b = cram_new_block(MAPPED_SLICE, 0); | |
675 int j; | |
676 | |
677 if (!b) | |
678 return NULL; | |
679 | |
680 if (NULL == (cp = buf = malloc(16+5*(8+s->hdr->num_blocks)))) { | |
681 cram_free_block(b); | |
682 return NULL; | |
683 } | |
684 | |
685 cp += itf8_put(cp, s->hdr->ref_seq_id); | |
686 cp += itf8_put(cp, s->hdr->ref_seq_start); | |
687 cp += itf8_put(cp, s->hdr->ref_seq_span); | |
688 cp += itf8_put(cp, s->hdr->num_records); | |
689 if (CRAM_MAJOR_VERS(fd->version) == 2) | |
690 cp += itf8_put(cp, s->hdr->record_counter); | |
691 else if (CRAM_MAJOR_VERS(fd->version) >= 3) | |
692 cp += ltf8_put(cp, s->hdr->record_counter); | |
693 cp += itf8_put(cp, s->hdr->num_blocks); | |
694 cp += itf8_put(cp, s->hdr->num_content_ids); | |
695 for (j = 0; j < s->hdr->num_content_ids; j++) { | |
696 cp += itf8_put(cp, s->hdr->block_content_ids[j]); | |
697 } | |
698 if (s->hdr->content_type == MAPPED_SLICE) | |
699 cp += itf8_put(cp, s->hdr->ref_base_id); | |
700 | |
701 if (CRAM_MAJOR_VERS(fd->version) != 1) { | |
702 memcpy(cp, s->hdr->md5, 16); cp += 16; | |
703 } | |
704 | |
705 assert(cp-buf <= 16+5*(8+s->hdr->num_blocks)); | |
706 | |
707 b->data = (unsigned char *)buf; | |
708 b->comp_size = b->uncomp_size = cp-buf; | |
709 | |
710 return b; | |
711 } | |
712 | |
713 | |
714 /* | |
715 * Encodes a single read. | |
716 * | |
717 * Returns 0 on success | |
718 * -1 on failure | |
719 */ | |
720 static int cram_encode_slice_read(cram_fd *fd, | |
721 cram_container *c, | |
722 cram_block_compression_hdr *h, | |
723 cram_slice *s, | |
724 cram_record *cr, | |
725 int *last_pos) { | |
726 int r = 0; | |
727 int32_t i32; | |
728 unsigned char uc; | |
729 | |
730 //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name); | |
731 | |
732 //printf("BF=0x%x\n", cr->flags); | |
733 // bf = cram_flag_swap[cr->flags]; | |
734 i32 = fd->cram_flag_swap[cr->flags & 0xfff]; | |
735 r |= h->codecs[DS_BF]->encode(s, h->codecs[DS_BF], (char *)&i32, 1); | |
736 | |
737 i32 = cr->cram_flags; | |
738 r |= h->codecs[DS_CF]->encode(s, h->codecs[DS_CF], (char *)&i32, 1); | |
739 | |
740 if (CRAM_MAJOR_VERS(fd->version) != 1 && s->hdr->ref_seq_id == -2) | |
741 r |= h->codecs[DS_RI]->encode(s, h->codecs[DS_RI], (char *)&cr->ref_id, 1); | |
742 | |
743 r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1); | |
744 | |
745 if (c->pos_sorted) { | |
746 i32 = cr->apos - *last_pos; | |
747 r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); | |
748 *last_pos = cr->apos; | |
749 } else { | |
750 i32 = cr->apos; | |
751 r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); | |
752 } | |
753 | |
754 r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1); | |
755 | |
756 if (c->comp_hdr->read_names_included) { | |
757 // RN codec: Already stored in block[3]. | |
758 } | |
759 | |
760 if (cr->cram_flags & CRAM_FLAG_DETACHED) { | |
761 i32 = cr->mate_flags; | |
762 r |= h->codecs[DS_MF]->encode(s, h->codecs[DS_MF], (char *)&i32, 1); | |
763 | |
764 if (!c->comp_hdr->read_names_included) { | |
765 // RN codec: Already stored in block[3]. | |
766 } | |
767 | |
768 r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS], | |
769 (char *)&cr->mate_ref_id, 1); | |
770 | |
771 r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], | |
772 (char *)&cr->mate_pos, 1); | |
773 | |
774 r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], | |
775 (char *)&cr->tlen, 1); | |
776 } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) { | |
777 r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF], | |
778 (char *)&cr->mate_line, 1); | |
779 } | |
780 | |
781 /* Aux tags */ | |
782 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
783 int j; | |
784 uc = cr->ntags; | |
785 r |= h->codecs[DS_TC]->encode(s, h->codecs[DS_TC], (char *)&uc, 1); | |
786 | |
787 for (j = 0; j < cr->ntags; j++) { | |
788 uint32_t i32 = s->TN[cr->TN_idx + j]; // id | |
789 r |= h->codecs[DS_TN]->encode(s, h->codecs[DS_TN], (char *)&i32, 1); | |
790 } | |
791 } else { | |
792 r |= h->codecs[DS_TL]->encode(s, h->codecs[DS_TL], (char *)&cr->TL, 1); | |
793 } | |
794 | |
795 // qual | |
796 // QS codec : Already stored in block[2]. | |
797 | |
798 // features (diffs) | |
799 if (!(cr->flags & BAM_FUNMAP)) { | |
800 int prev_pos = 0, j; | |
801 | |
802 r |= h->codecs[DS_FN]->encode(s, h->codecs[DS_FN], | |
803 (char *)&cr->nfeature, 1); | |
804 for (j = 0; j < cr->nfeature; j++) { | |
805 cram_feature *f = &s->features[cr->feature + j]; | |
806 | |
807 uc = f->X.code; | |
808 r |= h->codecs[DS_FC]->encode(s, h->codecs[DS_FC], (char *)&uc, 1); | |
809 i32 = f->X.pos - prev_pos; | |
810 r |= h->codecs[DS_FP]->encode(s, h->codecs[DS_FP], (char *)&i32, 1); | |
811 prev_pos = f->X.pos; | |
812 | |
813 switch(f->X.code) { | |
814 //char *seq; | |
815 | |
816 case 'X': | |
817 //fprintf(stderr, " FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base); | |
818 | |
819 uc = f->X.base; | |
820 r |= h->codecs[DS_BS]->encode(s, h->codecs[DS_BS], | |
821 (char *)&uc, 1); | |
822 break; | |
823 case 'S': | |
824 // Already done | |
825 // r |= h->codecs[DS_SC]->encode(s, h->codecs[DS_SC], | |
826 // BLOCK_DATA(s->soft_blk) + f->S.seq_idx, | |
827 // f->S.len); | |
828 | |
829 // if (IS_CRAM_3_VERS(fd)) { | |
830 // r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB], | |
831 // BLOCK_DATA(s->seqs_blk) + f->S.seq_idx, | |
832 // f->S.len); | |
833 // } | |
834 break; | |
835 case 'I': | |
836 //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx; | |
837 //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN], | |
838 // seq, f->S.len); | |
839 // if (IS_CRAM_3_VERS(fd)) { | |
840 // r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB], | |
841 // BLOCK_DATA(s->seqs_blk) + f->I.seq_idx, | |
842 // f->I.len); | |
843 // } | |
844 break; | |
845 case 'i': | |
846 uc = f->i.base; | |
847 r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], | |
848 (char *)&uc, 1); | |
849 //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx; | |
850 //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN], | |
851 // seq, 1); | |
852 break; | |
853 case 'D': | |
854 i32 = f->D.len; | |
855 r |= h->codecs[DS_DL]->encode(s, h->codecs[DS_DL], | |
856 (char *)&i32, 1); | |
857 break; | |
858 | |
859 case 'B': | |
860 // // Used when we try to store a non ACGTN base or an N | |
861 // // that aligns against a non ACGTN reference | |
862 | |
863 uc = f->B.base; | |
864 r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], | |
865 (char *)&uc, 1); | |
866 | |
867 // Already added | |
868 // uc = f->B.qual; | |
869 // r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS], | |
870 // (char *)&uc, 1); | |
871 break; | |
872 | |
873 case 'b': | |
874 // string of bases | |
875 r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB], | |
876 (char *)BLOCK_DATA(s->seqs_blk) | |
877 + f->b.seq_idx, | |
878 f->b.len); | |
879 break; | |
880 | |
881 case 'Q': | |
882 // Already added | |
883 // uc = f->B.qual; | |
884 // r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS], | |
885 // (char *)&uc, 1); | |
886 break; | |
887 | |
888 case 'N': | |
889 i32 = f->N.len; | |
890 r |= h->codecs[DS_RS]->encode(s, h->codecs[DS_RS], | |
891 (char *)&i32, 1); | |
892 break; | |
893 | |
894 case 'P': | |
895 i32 = f->P.len; | |
896 r |= h->codecs[DS_PD]->encode(s, h->codecs[DS_PD], | |
897 (char *)&i32, 1); | |
898 break; | |
899 | |
900 case 'H': | |
901 i32 = f->H.len; | |
902 r |= h->codecs[DS_HC]->encode(s, h->codecs[DS_HC], | |
903 (char *)&i32, 1); | |
904 break; | |
905 | |
906 | |
907 default: | |
908 fprintf(stderr, "unhandled feature code %c\n", | |
909 f->X.code); | |
910 return -1; | |
911 } | |
912 } | |
913 | |
914 r |= h->codecs[DS_MQ]->encode(s, h->codecs[DS_MQ], | |
915 (char *)&cr->mqual, 1); | |
916 } else { | |
917 char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq; | |
918 r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], seq, cr->len); | |
919 } | |
920 | |
921 return r ? -1 : 0; | |
922 } | |
923 | |
924 | |
925 /* | |
926 * Applies various compression methods to specific blocks, depending on | |
927 * known observations of how data series compress. | |
928 * | |
929 * Returns 0 on success | |
930 * -1 on failure | |
931 */ | |
932 static int cram_compress_slice(cram_fd *fd, cram_slice *s) { | |
933 int level = fd->level, i; | |
934 int method = 1<<GZIP | 1<<GZIP_RLE, methodF = method; | |
935 | |
936 /* Compress the CORE Block too, with minimal zlib level */ | |
937 if (level > 5 && s->block[0]->uncomp_size > 500) | |
938 cram_compress_block(fd, s->block[0], NULL, GZIP, 1); | |
939 | |
940 if (fd->use_bz2) | |
941 method |= 1<<BZIP2; | |
942 | |
943 if (fd->use_rans) | |
944 method |= (1<<RANS0) | (1<<RANS1); | |
945 | |
946 if (fd->use_lzma) | |
947 method |= (1<<LZMA); | |
948 | |
949 /* Faster method for data series we only need entropy encoding on */ | |
950 methodF = method & ~(1<<GZIP | 1<<BZIP2 | 1<<LZMA); | |
951 if (level >= 6) | |
952 methodF = method; | |
953 | |
954 | |
955 /* Specific compression methods for certain block types */ | |
956 if (cram_compress_block(fd, s->block[DS_IN], fd->m[DS_IN], //IN (seq) | |
957 method, level)) | |
958 return -1; | |
959 | |
960 if (fd->level == 0) { | |
961 /* Do nothing */ | |
962 } else if (fd->level == 1) { | |
963 if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], | |
964 methodF, 1)) | |
965 return -1; | |
966 for (i = DS_aux; i <= DS_aux_oz; i++) { | |
967 if (s->block[i]) | |
968 if (cram_compress_block(fd, s->block[i], fd->m[i], | |
969 method, 1)) | |
970 return -1; | |
971 } | |
972 } else if (fd->level < 3) { | |
973 if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], | |
974 method, 1)) | |
975 return -1; | |
976 if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA], | |
977 method, 1)) | |
978 return -1; | |
979 if (s->block[DS_BB]) | |
980 if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB], | |
981 method, 1)) | |
982 return -1; | |
983 for (i = DS_aux; i <= DS_aux_oz; i++) { | |
984 if (s->block[i]) | |
985 if (cram_compress_block(fd, s->block[i], fd->m[i], | |
986 method, level)) | |
987 return -1; | |
988 } | |
989 } else { | |
990 if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], | |
991 method, level)) | |
992 return -1; | |
993 if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA], | |
994 method, level)) | |
995 return -1; | |
996 if (s->block[DS_BB]) | |
997 if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB], | |
998 method, level)) | |
999 return -1; | |
1000 for (i = DS_aux; i <= DS_aux_oz; i++) { | |
1001 if (s->block[i]) | |
1002 if (cram_compress_block(fd, s->block[i], fd->m[i], | |
1003 method, level)) | |
1004 return -1; | |
1005 } | |
1006 } | |
1007 | |
1008 // NAME: best is generally xz, bzip2, zlib then rans1 | |
1009 // It benefits well from a little bit extra compression level. | |
1010 if (cram_compress_block(fd, s->block[DS_RN], fd->m[DS_RN], | |
1011 method & ~(1<<RANS0 | 1<<GZIP_RLE), | |
1012 MIN(9,level))) | |
1013 return -1; | |
1014 | |
1015 // NS shows strong local correlation as rearrangements are localised | |
1016 if (s->block[DS_NS] != s->block[0]) | |
1017 if (cram_compress_block(fd, s->block[DS_NS], fd->m[DS_NS], | |
1018 method, level)) | |
1019 return -1; | |
1020 | |
1021 | |
1022 /* | |
1023 * Minimal compression of any block still uncompressed, bar CORE | |
1024 */ | |
1025 { | |
1026 int i; | |
1027 for (i = 1; i < DS_END; i++) { | |
1028 if (!s->block[i] || s->block[i] == s->block[0]) | |
1029 continue; | |
1030 | |
1031 // fast methods only | |
1032 if (s->block[i]->method == RAW) { | |
1033 cram_compress_block(fd, s->block[i], fd->m[i], | |
1034 methodF, level); | |
1035 } | |
1036 } | |
1037 } | |
1038 | |
1039 return 0; | |
1040 } | |
1041 | |
1042 /* | |
1043 * Encodes a single slice from a container | |
1044 * | |
1045 * Returns 0 on success | |
1046 * -1 on failure | |
1047 */ | |
1048 static int cram_encode_slice(cram_fd *fd, cram_container *c, | |
1049 cram_block_compression_hdr *h, cram_slice *s) { | |
1050 int rec, r = 0, last_pos; | |
1051 int embed_ref; | |
1052 enum cram_DS_ID id; | |
1053 | |
1054 embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 1 : 0; | |
1055 | |
1056 /* | |
1057 * Slice external blocks: | |
1058 * ID 0 => base calls (insertions, soft-clip) | |
1059 * ID 1 => qualities | |
1060 * ID 2 => names | |
1061 * ID 3 => TS (insert size), NP (next frag) | |
1062 * ID 4 => tag values | |
1063 * ID 6 => tag IDs (TN), if CRAM_V1.0 | |
1064 * ID 7 => TD tag dictionary, if !CRAM_V1.0 | |
1065 */ | |
1066 | |
1067 /* Create cram slice header */ | |
1068 s->hdr->ref_base_id = embed_ref ? DS_ref : -1; | |
1069 s->hdr->record_counter = c->num_records + c->record_counter; | |
1070 c->num_records += s->hdr->num_records; | |
1071 | |
1072 s->block = calloc(DS_END, sizeof(s->block[0])); | |
1073 s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t)); | |
1074 if (!s->block || !s->hdr->block_content_ids) | |
1075 return -1; | |
1076 | |
1077 // Create first fixed blocks, always external. | |
1078 // CORE | |
1079 if (!(s->block[0] = cram_new_block(CORE, 0))) | |
1080 return -1; | |
1081 | |
1082 // TN block for CRAM v1 | |
1083 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
1084 if (h->codecs[DS_TN]->codec == E_EXTERNAL) { | |
1085 if (!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN))) return -1; | |
1086 h->codecs[DS_TN]->external.content_id = DS_TN; | |
1087 } else { | |
1088 s->block[DS_TN] = s->block[0]; | |
1089 } | |
1090 s->block[DS_TN] = s->block[DS_TN]; | |
1091 } | |
1092 | |
1093 // Embedded reference | |
1094 if (embed_ref) { | |
1095 if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref))) | |
1096 return -1; | |
1097 s->ref_id = DS_ref; // needed? | |
1098 BLOCK_APPEND(s->block[DS_ref], | |
1099 c->ref + c->first_base - c->ref_start, | |
1100 c->last_base - c->first_base + 1); | |
1101 } | |
1102 | |
1103 /* | |
1104 * All the data-series blocks if appropriate. | |
1105 */ | |
1106 for (id = DS_BF; id < DS_TN; id++) { | |
1107 if (h->codecs[id] && (h->codecs[id]->codec == E_EXTERNAL || | |
1108 h->codecs[id]->codec == E_BYTE_ARRAY_STOP || | |
1109 h->codecs[id]->codec == E_BYTE_ARRAY_LEN)) { | |
1110 switch (h->codecs[id]->codec) { | |
1111 case E_EXTERNAL: | |
1112 if (!(s->block[id] = cram_new_block(EXTERNAL, id))) | |
1113 return -1; | |
1114 h->codecs[id]->external.content_id = id; | |
1115 break; | |
1116 | |
1117 case E_BYTE_ARRAY_STOP: | |
1118 if (!(s->block[id] = cram_new_block(EXTERNAL, id))) | |
1119 return -1; | |
1120 h->codecs[id]->byte_array_stop.content_id = id; | |
1121 break; | |
1122 | |
1123 case E_BYTE_ARRAY_LEN: { | |
1124 cram_codec *cc; | |
1125 | |
1126 cc = h->codecs[id]->e_byte_array_len.len_codec; | |
1127 if (cc->codec == E_EXTERNAL) { | |
1128 int eid = cc->external.content_id; | |
1129 if (!(s->block[eid] = cram_new_block(EXTERNAL, eid))) | |
1130 return -1; | |
1131 cc->external.content_id = eid; | |
1132 cc->out = s->block[eid]; | |
1133 } | |
1134 | |
1135 cc = h->codecs[id]->e_byte_array_len.val_codec; | |
1136 if (cc->codec == E_EXTERNAL) { | |
1137 int eid = cc->external.content_id; | |
1138 if (!s->block[eid]) | |
1139 if (!(s->block[eid] = cram_new_block(EXTERNAL, eid))) | |
1140 return -1; | |
1141 cc->external.content_id = eid; | |
1142 cc->out = s->block[eid]; | |
1143 } | |
1144 break; | |
1145 } | |
1146 default: | |
1147 break; | |
1148 } | |
1149 } else { | |
1150 if (!(id == DS_BB && !h->codecs[DS_BB])) | |
1151 s->block[id] = s->block[0]; | |
1152 } | |
1153 if (h->codecs[id]) | |
1154 h->codecs[id]->out = s->block[id]; | |
1155 } | |
1156 | |
1157 /* Encode reads */ | |
1158 last_pos = s->hdr->ref_seq_start; | |
1159 for (rec = 0; rec < s->hdr->num_records; rec++) { | |
1160 cram_record *cr = &s->crecs[rec]; | |
1161 if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1) | |
1162 return -1; | |
1163 } | |
1164 | |
1165 s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7); | |
1166 s->block[0]->comp_size = s->block[0]->uncomp_size; | |
1167 | |
1168 // Make sure the fixed blocks point to the correct sources | |
1169 s->block[DS_IN] = s->base_blk; s->base_blk = NULL; | |
1170 s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL; | |
1171 s->block[DS_RN] = s->name_blk; s->name_blk = NULL; | |
1172 s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL; | |
1173 s->block[DS_aux]= s->aux_blk; s->aux_blk = NULL; | |
1174 s->block[DS_aux_OQ]= s->aux_OQ_blk; s->aux_OQ_blk = NULL; | |
1175 s->block[DS_aux_BQ]= s->aux_BQ_blk; s->aux_BQ_blk = NULL; | |
1176 s->block[DS_aux_BD]= s->aux_BD_blk; s->aux_BD_blk = NULL; | |
1177 s->block[DS_aux_BI]= s->aux_BI_blk; s->aux_BI_blk = NULL; | |
1178 s->block[DS_aux_FZ]= s->aux_FZ_blk; s->aux_FZ_blk = NULL; | |
1179 s->block[DS_aux_oq]= s->aux_oq_blk; s->aux_oq_blk = NULL; | |
1180 s->block[DS_aux_os]= s->aux_os_blk; s->aux_os_blk = NULL; | |
1181 s->block[DS_aux_oz]= s->aux_oz_blk; s->aux_oz_blk = NULL; | |
1182 | |
1183 // Ensure block sizes are up to date. | |
1184 for (id = 1; id < DS_END; id++) { | |
1185 if (!s->block[id] || s->block[id] == s->block[0]) | |
1186 continue; | |
1187 | |
1188 if (s->block[id]->uncomp_size == 0) | |
1189 BLOCK_UPLEN(s->block[id]); | |
1190 } | |
1191 | |
1192 // Compress it all | |
1193 if (cram_compress_slice(fd, s) == -1) | |
1194 return -1; | |
1195 | |
1196 // Collapse empty blocks and create hdr_block | |
1197 { | |
1198 int i, j; | |
1199 for (i = j = 1; i < DS_END; i++) { | |
1200 if (!s->block[i] || s->block[i] == s->block[0]) | |
1201 continue; | |
1202 if (s->block[i]->uncomp_size == 0) { | |
1203 cram_free_block(s->block[i]); | |
1204 s->block[i] = NULL; | |
1205 continue; | |
1206 } | |
1207 s->block[j] = s->block[i]; | |
1208 s->hdr->block_content_ids[j-1] = s->block[i]->content_id; | |
1209 j++; | |
1210 } | |
1211 s->hdr->num_content_ids = j-1; | |
1212 s->hdr->num_blocks = j; | |
1213 | |
1214 if (!(s->hdr_block = cram_encode_slice_header(fd, s))) | |
1215 return -1; | |
1216 } | |
1217 | |
1218 return r ? -1 : 0; | |
1219 } | |
1220 | |
1221 /* | |
1222 * Encodes all slices in a container into blocks. | |
1223 * Returns 0 on success | |
1224 * -1 on failure | |
1225 */ | |
1226 int cram_encode_container(cram_fd *fd, cram_container *c) { | |
1227 int i, j, slice_offset; | |
1228 cram_block_compression_hdr *h = c->comp_hdr; | |
1229 cram_block *c_hdr; | |
1230 int multi_ref = 0; | |
1231 int r1, r2, sn, nref; | |
1232 spare_bams *spares; | |
1233 | |
1234 /* Cache references up-front if we have unsorted access patterns */ | |
1235 pthread_mutex_lock(&fd->ref_lock); | |
1236 nref = fd->refs->nref; | |
1237 pthread_mutex_unlock(&fd->ref_lock); | |
1238 | |
1239 if (!fd->no_ref && c->refs_used) { | |
1240 for (i = 0; i < nref; i++) { | |
1241 if (c->refs_used[i]) | |
1242 cram_get_ref(fd, i, 1, 0); | |
1243 } | |
1244 } | |
1245 | |
1246 /* To create M5 strings */ | |
1247 /* Fetch reference sequence */ | |
1248 if (!fd->no_ref) { | |
1249 bam_seq_t *b = c->bams[0]; | |
1250 char *ref; | |
1251 | |
1252 ref = cram_get_ref(fd, bam_ref(b), 1, 0); | |
1253 if (!ref && bam_ref(b) >= 0) { | |
1254 fprintf(stderr, "Failed to load reference #%d\n", bam_ref(b)); | |
1255 return -1; | |
1256 } | |
1257 if ((c->ref_id = bam_ref(b)) >= 0) { | |
1258 c->ref_seq_id = c->ref_id; | |
1259 c->ref = fd->refs->ref_id[c->ref_seq_id]->seq; | |
1260 c->ref_start = 1; | |
1261 c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length; | |
1262 } else { | |
1263 c->ref_seq_id = c->ref_id; // FIXME remove one var! | |
1264 } | |
1265 } else { | |
1266 c->ref_id = bam_ref(c->bams[0]); | |
1267 cram_ref_incr(fd->refs, c->ref_id); | |
1268 c->ref_seq_id = c->ref_id; | |
1269 } | |
1270 | |
1271 /* Turn bams into cram_records and gather basic stats */ | |
1272 for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) { | |
1273 cram_slice *s = c->slices[sn]; | |
1274 int first_base = INT_MAX, last_base = INT_MIN; | |
1275 | |
1276 assert(sn < c->curr_slice); | |
1277 | |
1278 /* FIXME: we could create our slice objects here too instead of | |
1279 * in cram_put_bam_seq. It's more natural here and also this is | |
1280 * bit is threaded so it's less work in the main thread. | |
1281 */ | |
1282 | |
1283 for (r2 = 0; r1 < c->curr_c_rec && r2 < c->max_rec; r1++, r2++) { | |
1284 cram_record *cr = &s->crecs[r2]; | |
1285 bam_seq_t *b = c->bams[r1]; | |
1286 | |
1287 /* If multi-ref we need to cope with changing reference per seq */ | |
1288 if (c->multi_seq && !fd->no_ref) { | |
1289 if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) { | |
1290 if (c->ref_seq_id >= 0) | |
1291 cram_ref_decr(fd->refs, c->ref_seq_id); | |
1292 | |
1293 if (!cram_get_ref(fd, bam_ref(b), 1, 0)) { | |
1294 fprintf(stderr, "Failed to load reference #%d\n", | |
1295 bam_ref(b)); | |
1296 return -1; | |
1297 } | |
1298 | |
1299 c->ref_seq_id = bam_ref(b); // overwritten later by -2 | |
1300 assert(fd->refs->ref_id[c->ref_seq_id]->seq); | |
1301 c->ref = fd->refs->ref_id[c->ref_seq_id]->seq; | |
1302 c->ref_start = 1; | |
1303 c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length; | |
1304 } | |
1305 } | |
1306 | |
1307 process_one_read(fd, c, s, cr, b, r2); | |
1308 | |
1309 if (first_base > cr->apos) | |
1310 first_base = cr->apos; | |
1311 | |
1312 if (last_base < cr->aend) | |
1313 last_base = cr->aend; | |
1314 } | |
1315 | |
1316 if (c->multi_seq) { | |
1317 s->hdr->ref_seq_id = -2; | |
1318 s->hdr->ref_seq_start = 0; | |
1319 s->hdr->ref_seq_span = 0; | |
1320 } else { | |
1321 s->hdr->ref_seq_id = c->ref_id; | |
1322 s->hdr->ref_seq_start = first_base; | |
1323 s->hdr->ref_seq_span = last_base - first_base + 1; | |
1324 } | |
1325 s->hdr->num_records = r2; | |
1326 } | |
1327 | |
1328 if (c->multi_seq && !fd->no_ref) { | |
1329 if (c->ref_seq_id >= 0) | |
1330 cram_ref_decr(fd->refs, c->ref_seq_id); | |
1331 } | |
1332 | |
1333 /* Link our bams[] array onto the spare bam list for reuse */ | |
1334 spares = malloc(sizeof(*spares)); | |
1335 pthread_mutex_lock(&fd->bam_list_lock); | |
1336 spares->bams = c->bams; | |
1337 spares->next = fd->bl; | |
1338 fd->bl = spares; | |
1339 pthread_mutex_unlock(&fd->bam_list_lock); | |
1340 c->bams = NULL; | |
1341 | |
1342 /* Detect if a multi-seq container */ | |
1343 cram_stats_encoding(fd, c->stats[DS_RI]); | |
1344 multi_ref = c->stats[DS_RI]->nvals > 1; | |
1345 | |
1346 if (multi_ref) { | |
1347 if (fd->verbose) | |
1348 fprintf(stderr, "Multi-ref container\n"); | |
1349 c->ref_seq_id = -2; | |
1350 c->ref_seq_start = 0; | |
1351 c->ref_seq_span = 0; | |
1352 } | |
1353 | |
1354 | |
1355 /* Compute MD5s */ | |
1356 for (i = 0; i < c->curr_slice; i++) { | |
1357 cram_slice *s = c->slices[i]; | |
1358 | |
1359 if (CRAM_MAJOR_VERS(fd->version) != 1) { | |
1360 if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !fd->no_ref) { | |
1361 MD5_CTX md5; | |
1362 MD5_Init(&md5); | |
1363 MD5_Update(&md5, | |
1364 c->ref + s->hdr->ref_seq_start - c->ref_start, | |
1365 s->hdr->ref_seq_span); | |
1366 MD5_Final(s->hdr->md5, &md5); | |
1367 } else { | |
1368 memset(s->hdr->md5, 0, 16); | |
1369 } | |
1370 } | |
1371 } | |
1372 | |
1373 c->num_records = 0; | |
1374 c->num_blocks = 0; | |
1375 c->length = 0; | |
1376 | |
1377 //fprintf(stderr, "=== BF ===\n"); | |
1378 h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]), | |
1379 c->stats[DS_BF], E_INT, NULL, | |
1380 fd->version); | |
1381 | |
1382 //fprintf(stderr, "=== CF ===\n"); | |
1383 h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]), | |
1384 c->stats[DS_CF], E_INT, NULL, | |
1385 fd->version); | |
1386 // fprintf(stderr, "=== RN ===\n"); | |
1387 // h->codecs[DS_RN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RN]), | |
1388 // c->stats[DS_RN], E_BYTE_ARRAY, NULL, | |
1389 // fd->version); | |
1390 | |
1391 //fprintf(stderr, "=== AP ===\n"); | |
1392 if (c->pos_sorted) { | |
1393 h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]), | |
1394 c->stats[DS_AP], E_INT, NULL, | |
1395 fd->version); | |
1396 } else { | |
1397 int p[2] = {0, c->max_apos}; | |
1398 h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, E_INT, p, | |
1399 fd->version); | |
1400 } | |
1401 | |
1402 //fprintf(stderr, "=== RG ===\n"); | |
1403 h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]), | |
1404 c->stats[DS_RG], E_INT, NULL, | |
1405 fd->version); | |
1406 | |
1407 //fprintf(stderr, "=== MQ ===\n"); | |
1408 h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]), | |
1409 c->stats[DS_MQ], E_INT, NULL, | |
1410 fd->version); | |
1411 | |
1412 //fprintf(stderr, "=== NS ===\n"); | |
1413 h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]), | |
1414 c->stats[DS_NS], E_INT, NULL, | |
1415 fd->version); | |
1416 | |
1417 //fprintf(stderr, "=== MF ===\n"); | |
1418 h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]), | |
1419 c->stats[DS_MF], E_INT, NULL, | |
1420 fd->version); | |
1421 | |
1422 //fprintf(stderr, "=== TS ===\n"); | |
1423 h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]), | |
1424 c->stats[DS_TS], E_INT, NULL, | |
1425 fd->version); | |
1426 //fprintf(stderr, "=== NP ===\n"); | |
1427 h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]), | |
1428 c->stats[DS_NP], E_INT, NULL, | |
1429 fd->version); | |
1430 //fprintf(stderr, "=== NF ===\n"); | |
1431 h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]), | |
1432 c->stats[DS_NF], E_INT, NULL, | |
1433 fd->version); | |
1434 | |
1435 //fprintf(stderr, "=== RL ===\n"); | |
1436 h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]), | |
1437 c->stats[DS_RL], E_INT, NULL, | |
1438 fd->version); | |
1439 | |
1440 //fprintf(stderr, "=== FN ===\n"); | |
1441 h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]), | |
1442 c->stats[DS_FN], E_INT, NULL, | |
1443 fd->version); | |
1444 | |
1445 //fprintf(stderr, "=== FC ===\n"); | |
1446 h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]), | |
1447 c->stats[DS_FC], E_BYTE, NULL, | |
1448 fd->version); | |
1449 | |
1450 //fprintf(stderr, "=== FP ===\n"); | |
1451 h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]), | |
1452 c->stats[DS_FP], E_INT, NULL, | |
1453 fd->version); | |
1454 | |
1455 //fprintf(stderr, "=== DL ===\n"); | |
1456 h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]), | |
1457 c->stats[DS_DL], E_INT, NULL, | |
1458 fd->version); | |
1459 | |
1460 //fprintf(stderr, "=== BA ===\n"); | |
1461 h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]), | |
1462 c->stats[DS_BA], E_BYTE, NULL, | |
1463 fd->version); | |
1464 | |
1465 if (CRAM_MAJOR_VERS(fd->version) >= 3) { | |
1466 cram_byte_array_len_encoder e; | |
1467 | |
1468 e.len_encoding = E_EXTERNAL; | |
1469 e.len_dat = (void *)DS_BB_len; | |
1470 //e.len_dat = (void *)DS_BB; | |
1471 | |
1472 e.val_encoding = E_EXTERNAL; | |
1473 e.val_dat = (void *)DS_BB; | |
1474 | |
1475 h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL, | |
1476 E_BYTE_ARRAY, (void *)&e, | |
1477 fd->version); | |
1478 } else { | |
1479 h->codecs[DS_BB] = NULL; | |
1480 } | |
1481 | |
1482 //fprintf(stderr, "=== BS ===\n"); | |
1483 h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]), | |
1484 c->stats[DS_BS], E_BYTE, NULL, | |
1485 fd->version); | |
1486 | |
1487 if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
1488 h->codecs[DS_TL] = NULL; | |
1489 h->codecs[DS_RI] = NULL; | |
1490 h->codecs[DS_RS] = NULL; | |
1491 h->codecs[DS_PD] = NULL; | |
1492 h->codecs[DS_HC] = NULL; | |
1493 h->codecs[DS_SC] = NULL; | |
1494 | |
1495 //fprintf(stderr, "=== TC ===\n"); | |
1496 h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]), | |
1497 c->stats[DS_TC], E_BYTE, NULL, | |
1498 fd->version); | |
1499 | |
1500 //fprintf(stderr, "=== TN ===\n"); | |
1501 h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]), | |
1502 c->stats[DS_TN], E_INT, NULL, | |
1503 fd->version); | |
1504 } else { | |
1505 h->codecs[DS_TC] = NULL; | |
1506 h->codecs[DS_TN] = NULL; | |
1507 | |
1508 //fprintf(stderr, "=== TL ===\n"); | |
1509 h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]), | |
1510 c->stats[DS_TL], E_INT, NULL, | |
1511 fd->version); | |
1512 | |
1513 | |
1514 //fprintf(stderr, "=== RI ===\n"); | |
1515 h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]), | |
1516 c->stats[DS_RI], E_INT, NULL, | |
1517 fd->version); | |
1518 | |
1519 //fprintf(stderr, "=== RS ===\n"); | |
1520 h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]), | |
1521 c->stats[DS_RS], E_INT, NULL, | |
1522 fd->version); | |
1523 | |
1524 //fprintf(stderr, "=== PD ===\n"); | |
1525 h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]), | |
1526 c->stats[DS_PD], E_INT, NULL, | |
1527 fd->version); | |
1528 | |
1529 //fprintf(stderr, "=== HC ===\n"); | |
1530 h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]), | |
1531 c->stats[DS_HC], E_INT, NULL, | |
1532 fd->version); | |
1533 | |
1534 //fprintf(stderr, "=== SC ===\n"); | |
1535 if (1) { | |
1536 int i2[2] = {0, DS_SC}; | |
1537 | |
1538 h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, | |
1539 E_BYTE_ARRAY, (void *)i2, | |
1540 fd->version); | |
1541 } else { | |
1542 // Appears to be no practical benefit to using this method, | |
1543 // but it may work better if we start mixing SC, IN and BB | |
1544 // elements into the same external block. | |
1545 cram_byte_array_len_encoder e; | |
1546 | |
1547 e.len_encoding = E_EXTERNAL; | |
1548 e.len_dat = (void *)DS_SC_len; | |
1549 | |
1550 e.val_encoding = E_EXTERNAL; | |
1551 e.val_dat = (void *)DS_SC; | |
1552 | |
1553 h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL, | |
1554 E_BYTE_ARRAY, (void *)&e, | |
1555 fd->version); | |
1556 } | |
1557 } | |
1558 | |
1559 //fprintf(stderr, "=== IN ===\n"); | |
1560 { | |
1561 int i2[2] = {0, DS_IN}; | |
1562 h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, | |
1563 E_BYTE_ARRAY, (void *)i2, | |
1564 fd->version); | |
1565 } | |
1566 | |
1567 h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE, | |
1568 (void *)DS_QS, | |
1569 fd->version); | |
1570 { | |
1571 int i2[2] = {0, DS_RN}; | |
1572 h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, | |
1573 E_BYTE_ARRAY, (void *)i2, | |
1574 fd->version); | |
1575 } | |
1576 | |
1577 | |
1578 /* Encode slices */ | |
1579 for (i = 0; i < c->curr_slice; i++) { | |
1580 if (fd->verbose) | |
1581 fprintf(stderr, "Encode slice %d\n", i); | |
1582 if (cram_encode_slice(fd, c, h, c->slices[i]) != 0) | |
1583 return -1; | |
1584 } | |
1585 | |
1586 /* Create compression header */ | |
1587 { | |
1588 h->ref_seq_id = c->ref_seq_id; | |
1589 h->ref_seq_start = c->ref_seq_start; | |
1590 h->ref_seq_span = c->ref_seq_span; | |
1591 h->num_records = c->num_records; | |
1592 | |
1593 h->mapped_qs_included = 0; // fixme | |
1594 h->unmapped_qs_included = 0; // fixme | |
1595 // h->... fixme | |
1596 memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20); | |
1597 | |
1598 if (!(c_hdr = cram_encode_compression_header(fd, c, h))) | |
1599 return -1; | |
1600 } | |
1601 | |
1602 /* Compute landmarks */ | |
1603 /* Fill out slice landmarks */ | |
1604 c->num_landmarks = c->curr_slice; | |
1605 c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark)); | |
1606 if (!c->landmark) | |
1607 return -1; | |
1608 | |
1609 /* | |
1610 * Slice offset starts after the first block, so we need to simulate | |
1611 * writing it to work out the correct offset | |
1612 */ | |
1613 { | |
1614 slice_offset = c_hdr->method == RAW | |
1615 ? c_hdr->uncomp_size | |
1616 : c_hdr->comp_size; | |
1617 slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + | |
1618 itf8_size(c_hdr->content_id) + | |
1619 itf8_size(c_hdr->comp_size) + | |
1620 itf8_size(c_hdr->uncomp_size); | |
1621 } | |
1622 | |
1623 c->ref_seq_id = c->slices[0]->hdr->ref_seq_id; | |
1624 c->ref_seq_start = c->slices[0]->hdr->ref_seq_start; | |
1625 c->ref_seq_span = c->slices[0]->hdr->ref_seq_span; | |
1626 for (i = 0; i < c->curr_slice; i++) { | |
1627 cram_slice *s = c->slices[i]; | |
1628 | |
1629 c->num_blocks += s->hdr->num_blocks + 2; | |
1630 c->landmark[i] = slice_offset; | |
1631 | |
1632 if (s->hdr->ref_seq_start + s->hdr->ref_seq_span > | |
1633 c->ref_seq_start + c->ref_seq_span) { | |
1634 c->ref_seq_span = s->hdr->ref_seq_start + s->hdr->ref_seq_span | |
1635 - c->ref_seq_start; | |
1636 } | |
1637 | |
1638 slice_offset += s->hdr_block->method == RAW | |
1639 ? s->hdr_block->uncomp_size | |
1640 : s->hdr_block->comp_size; | |
1641 | |
1642 slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + | |
1643 itf8_size(s->hdr_block->content_id) + | |
1644 itf8_size(s->hdr_block->comp_size) + | |
1645 itf8_size(s->hdr_block->uncomp_size); | |
1646 | |
1647 for (j = 0; j < s->hdr->num_blocks; j++) { | |
1648 slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + | |
1649 itf8_size(s->block[j]->content_id) + | |
1650 itf8_size(s->block[j]->comp_size) + | |
1651 itf8_size(s->block[j]->uncomp_size); | |
1652 | |
1653 slice_offset += s->block[j]->method == RAW | |
1654 ? s->block[j]->uncomp_size | |
1655 : s->block[j]->comp_size; | |
1656 } | |
1657 } | |
1658 c->length += slice_offset; // just past the final slice | |
1659 | |
1660 c->comp_hdr_block = c_hdr; | |
1661 | |
1662 if (c->ref_seq_id >= 0) { | |
1663 cram_ref_decr(fd->refs, c->ref_seq_id); | |
1664 } | |
1665 | |
1666 /* Cache references up-front if we have unsorted access patterns */ | |
1667 if (!fd->no_ref && c->refs_used) { | |
1668 for (i = 0; i < fd->refs->nref; i++) { | |
1669 if (c->refs_used[i]) | |
1670 cram_ref_decr(fd->refs, i); | |
1671 } | |
1672 } | |
1673 | |
1674 return 0; | |
1675 } | |
1676 | |
1677 | |
1678 /* | |
1679 * Adds a feature code to a read within a slice. For purposes of minimising | |
1680 * memory allocations and fragmentation we have one array of features for all | |
1681 * reads within the slice. We return the index into this array for this new | |
1682 * feature. | |
1683 * | |
1684 * Returns feature index on success | |
1685 * -1 on failure. | |
1686 */ | |
1687 static int cram_add_feature(cram_container *c, cram_slice *s, | |
1688 cram_record *r, cram_feature *f) { | |
1689 if (s->nfeatures >= s->afeatures) { | |
1690 s->afeatures = s->afeatures ? s->afeatures*2 : 1024; | |
1691 s->features = realloc(s->features, s->afeatures*sizeof(*s->features)); | |
1692 if (!s->features) | |
1693 return -1; | |
1694 } | |
1695 | |
1696 if (!r->nfeature++) { | |
1697 r->feature = s->nfeatures; | |
1698 cram_stats_add(c->stats[DS_FP], f->X.pos); | |
1699 } else { | |
1700 cram_stats_add(c->stats[DS_FP], | |
1701 f->X.pos - s->features[r->feature + r->nfeature-2].X.pos); | |
1702 } | |
1703 cram_stats_add(c->stats[DS_FC], f->X.code); | |
1704 | |
1705 s->features[s->nfeatures++] = *f; | |
1706 | |
1707 return 0; | |
1708 } | |
1709 | |
1710 static int cram_add_substitution(cram_fd *fd, cram_container *c, | |
1711 cram_slice *s, cram_record *r, | |
1712 int pos, char base, char qual, char ref) { | |
1713 cram_feature f; | |
1714 | |
1715 // seq=ACGTN vs ref=ACGT or seq=ACGT vs ref=ACGTN | |
1716 if (fd->L2[(uc)base]<4 || (fd->L2[(uc)base]<5 && fd->L2[(uc)ref]<4)) { | |
1717 f.X.pos = pos+1; | |
1718 f.X.code = 'X'; | |
1719 f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f]; | |
1720 cram_stats_add(c->stats[DS_BS], f.X.base); | |
1721 } else { | |
1722 f.B.pos = pos+1; | |
1723 f.B.code = 'B'; | |
1724 f.B.base = base; | |
1725 f.B.qual = qual; | |
1726 cram_stats_add(c->stats[DS_BA], f.B.base); | |
1727 cram_stats_add(c->stats[DS_QS], f.B.qual); | |
1728 BLOCK_APPEND_CHAR(s->qual_blk, qual); | |
1729 } | |
1730 return cram_add_feature(c, s, r, &f); | |
1731 } | |
1732 | |
1733 static int cram_add_bases(cram_fd *fd, cram_container *c, | |
1734 cram_slice *s, cram_record *r, | |
1735 int pos, int len, char *base) { | |
1736 cram_feature f; | |
1737 | |
1738 f.b.pos = pos+1; | |
1739 f.b.code = 'b'; | |
1740 f.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk); | |
1741 f.b.len = len; | |
1742 | |
1743 return cram_add_feature(c, s, r, &f); | |
1744 } | |
1745 | |
1746 static int cram_add_base(cram_fd *fd, cram_container *c, | |
1747 cram_slice *s, cram_record *r, | |
1748 int pos, char base, char qual) { | |
1749 cram_feature f; | |
1750 f.B.pos = pos+1; | |
1751 f.B.code = 'B'; | |
1752 f.B.base = base; | |
1753 f.B.qual = qual; | |
1754 cram_stats_add(c->stats[DS_BA], base); | |
1755 cram_stats_add(c->stats[DS_QS], qual); | |
1756 BLOCK_APPEND_CHAR(s->qual_blk, qual); | |
1757 return cram_add_feature(c, s, r, &f); | |
1758 } | |
1759 | |
1760 static int cram_add_quality(cram_fd *fd, cram_container *c, | |
1761 cram_slice *s, cram_record *r, | |
1762 int pos, char qual) { | |
1763 cram_feature f; | |
1764 f.Q.pos = pos+1; | |
1765 f.Q.code = 'Q'; | |
1766 f.Q.qual = qual; | |
1767 cram_stats_add(c->stats[DS_QS], qual); | |
1768 BLOCK_APPEND_CHAR(s->qual_blk, qual); | |
1769 return cram_add_feature(c, s, r, &f); | |
1770 } | |
1771 | |
1772 static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r, | |
1773 int pos, int len, char *base) { | |
1774 cram_feature f; | |
1775 f.D.pos = pos+1; | |
1776 f.D.code = 'D'; | |
1777 f.D.len = len; | |
1778 cram_stats_add(c->stats[DS_DL], len); | |
1779 return cram_add_feature(c, s, r, &f); | |
1780 } | |
1781 | |
1782 static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r, | |
1783 int pos, int len, char *base, int version) { | |
1784 cram_feature f; | |
1785 f.S.pos = pos+1; | |
1786 f.S.code = 'S'; | |
1787 f.S.len = len; | |
1788 switch (CRAM_MAJOR_VERS(version)) { | |
1789 case 1: | |
1790 f.S.seq_idx = BLOCK_SIZE(s->base_blk); | |
1791 BLOCK_APPEND(s->base_blk, base, len); | |
1792 BLOCK_APPEND_CHAR(s->base_blk, '\0'); | |
1793 break; | |
1794 | |
1795 case 2: | |
1796 default: | |
1797 f.S.seq_idx = BLOCK_SIZE(s->soft_blk); | |
1798 if (base) { | |
1799 BLOCK_APPEND(s->soft_blk, base, len); | |
1800 } else { | |
1801 int i; | |
1802 for (i = 0; i < len; i++) | |
1803 BLOCK_APPEND_CHAR(s->soft_blk, 'N'); | |
1804 } | |
1805 BLOCK_APPEND_CHAR(s->soft_blk, '\0'); | |
1806 break; | |
1807 | |
1808 // default: | |
1809 // // v3.0 onwards uses BB data-series | |
1810 // f.S.seq_idx = BLOCK_SIZE(s->soft_blk); | |
1811 } | |
1812 return cram_add_feature(c, s, r, &f); | |
1813 } | |
1814 | |
1815 static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r, | |
1816 int pos, int len, char *base) { | |
1817 cram_feature f; | |
1818 f.S.pos = pos+1; | |
1819 f.S.code = 'H'; | |
1820 f.S.len = len; | |
1821 cram_stats_add(c->stats[DS_HC], len); | |
1822 return cram_add_feature(c, s, r, &f); | |
1823 } | |
1824 | |
1825 static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r, | |
1826 int pos, int len, char *base) { | |
1827 cram_feature f; | |
1828 f.S.pos = pos+1; | |
1829 f.S.code = 'N'; | |
1830 f.S.len = len; | |
1831 cram_stats_add(c->stats[DS_RS], len); | |
1832 return cram_add_feature(c, s, r, &f); | |
1833 } | |
1834 | |
1835 static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r, | |
1836 int pos, int len, char *base) { | |
1837 cram_feature f; | |
1838 f.S.pos = pos+1; | |
1839 f.S.code = 'P'; | |
1840 f.S.len = len; | |
1841 cram_stats_add(c->stats[DS_PD], len); | |
1842 return cram_add_feature(c, s, r, &f); | |
1843 } | |
1844 | |
1845 static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r, | |
1846 int pos, int len, char *base) { | |
1847 cram_feature f; | |
1848 f.I.pos = pos+1; | |
1849 if (len == 1) { | |
1850 char b = base ? *base : 'N'; | |
1851 f.i.code = 'i'; | |
1852 f.i.base = b; | |
1853 cram_stats_add(c->stats[DS_BA], b); | |
1854 } else { | |
1855 f.I.code = 'I'; | |
1856 f.I.len = len; | |
1857 f.S.seq_idx = BLOCK_SIZE(s->base_blk); | |
1858 if (base) { | |
1859 BLOCK_APPEND(s->base_blk, base, len); | |
1860 } else { | |
1861 int i; | |
1862 for (i = 0; i < len; i++) | |
1863 BLOCK_APPEND_CHAR(s->base_blk, 'N'); | |
1864 } | |
1865 BLOCK_APPEND_CHAR(s->base_blk, '\0'); | |
1866 } | |
1867 return cram_add_feature(c, s, r, &f); | |
1868 } | |
1869 | |
1870 /* | |
1871 * Encodes auxiliary data. | |
1872 * Returns the read-group parsed out of the BAM aux fields on success | |
1873 * NULL on failure or no rg present (FIXME) | |
1874 */ | |
1875 static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c, | |
1876 cram_slice *s, cram_record *cr) { | |
1877 char *aux, *tmp, *rg = NULL; | |
1878 int aux_size = bam_blk_size(b) - | |
1879 ((char *)bam_aux(b) - (char *)&bam_ref(b)); | |
1880 | |
1881 /* Worst case is 1 nul char on every ??:Z: string, so +33% */ | |
1882 BLOCK_GROW(s->aux_blk, aux_size*1.34+1); | |
1883 tmp = (char *)BLOCK_END(s->aux_blk); | |
1884 | |
1885 aux = (char *)bam_aux(b); | |
1886 cr->TN_idx = s->nTN; | |
1887 | |
1888 while (aux[0] != 0) { | |
1889 int32_t i32; | |
1890 int r; | |
1891 | |
1892 if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') { | |
1893 rg = &aux[3]; | |
1894 while (*aux++); | |
1895 continue; | |
1896 } | |
1897 if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') { | |
1898 while (*aux++); | |
1899 continue; | |
1900 } | |
1901 if (aux[0] == 'N' && aux[1] == 'M') { | |
1902 switch(aux[2]) { | |
1903 case 'A': case 'C': case 'c': aux+=4; break; | |
1904 case 'I': case 'i': case 'f': aux+=7; break; | |
1905 default: | |
1906 fprintf(stderr, "Unhandled type code for NM tag\n"); | |
1907 return NULL; | |
1908 } | |
1909 continue; | |
1910 } | |
1911 | |
1912 cr->ntags++; | |
1913 | |
1914 i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2]; | |
1915 kh_put(s_i2i, c->tags_used, i32, &r); | |
1916 if (-1 == r) | |
1917 return NULL; | |
1918 | |
1919 if (s->nTN >= s->aTN) { | |
1920 s->aTN = s->aTN ? s->aTN*2 : 1024; | |
1921 if (!(s->TN = realloc(s->TN, s->aTN * sizeof(*s->TN)))) | |
1922 return NULL; | |
1923 } | |
1924 s->TN[s->nTN++] = i32; | |
1925 cram_stats_add(c->stats[DS_TN], i32); | |
1926 | |
1927 switch(aux[2]) { | |
1928 case 'A': case 'C': case 'c': | |
1929 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
1930 *tmp++=*aux++; | |
1931 break; | |
1932 | |
1933 case 'S': case 's': | |
1934 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
1935 *tmp++=*aux++; *tmp++=*aux++; | |
1936 break; | |
1937 | |
1938 case 'I': case 'i': case 'f': | |
1939 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
1940 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
1941 break; | |
1942 | |
1943 case 'd': | |
1944 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
1945 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
1946 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
1947 break; | |
1948 | |
1949 case 'Z': case 'H': | |
1950 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
1951 while ((*tmp++=*aux++)); | |
1952 *tmp++ = '\t'; // stop byte | |
1953 break; | |
1954 | |
1955 case 'B': { | |
1956 int type = aux[3], blen; | |
1957 uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) + | |
1958 (((unsigned char *)aux)[5]<< 8) + | |
1959 (((unsigned char *)aux)[6]<<16) + | |
1960 (((unsigned char *)aux)[7]<<24)); | |
1961 // skip TN field | |
1962 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
1963 | |
1964 // We use BYTE_ARRAY_LEN with external length, so store that first | |
1965 switch (type) { | |
1966 case 'c': case 'C': | |
1967 blen = count; | |
1968 break; | |
1969 case 's': case 'S': | |
1970 blen = 2*count; | |
1971 break; | |
1972 case 'i': case 'I': case 'f': | |
1973 blen = 4*count; | |
1974 break; | |
1975 default: | |
1976 fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n", | |
1977 type); | |
1978 return NULL; | |
1979 | |
1980 } | |
1981 | |
1982 tmp += itf8_put(tmp, blen+5); | |
1983 | |
1984 *tmp++=*aux++; // sub-type & length | |
1985 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
1986 | |
1987 // The tag data itself | |
1988 memcpy(tmp, aux, blen); tmp += blen; aux += blen; | |
1989 | |
1990 //cram_stats_add(c->aux_B_stats, blen); | |
1991 break; | |
1992 } | |
1993 default: | |
1994 fprintf(stderr, "Unknown aux type '%c'\n", aux[2]); | |
1995 return NULL; | |
1996 } | |
1997 } | |
1998 cram_stats_add(c->stats[DS_TC], cr->ntags); | |
1999 | |
2000 cr->aux = BLOCK_SIZE(s->aux_blk); | |
2001 cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux); | |
2002 BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk); | |
2003 assert(s->aux_blk->byte <= s->aux_blk->alloc); | |
2004 | |
2005 return rg; | |
2006 } | |
2007 | |
2008 /* | |
2009 * Encodes auxiliary data. Largely duplicated from above, but done so to | |
2010 * keep it simple and avoid a myriad of version ifs. | |
2011 * | |
2012 * Returns the read-group parsed out of the BAM aux fields on success | |
2013 * NULL on failure or no rg present (FIXME) | |
2014 */ | |
2015 static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, | |
2016 cram_slice *s, cram_record *cr) { | |
2017 char *aux, *orig, *tmp, *rg = NULL; | |
2018 int aux_size = bam_get_l_aux(b); | |
2019 cram_block *td_b = c->comp_hdr->TD_blk; | |
2020 int TD_blk_size = BLOCK_SIZE(td_b), new; | |
2021 char *key; | |
2022 khint_t k; | |
2023 | |
2024 | |
2025 /* Worst case is 1 nul char on every ??:Z: string, so +33% */ | |
2026 BLOCK_GROW(s->aux_blk, aux_size*1.34+1); | |
2027 tmp = (char *)BLOCK_END(s->aux_blk); | |
2028 | |
2029 | |
2030 orig = aux = (char *)bam_aux(b); | |
2031 | |
2032 // Copy aux keys to td_b and aux values to s->aux_blk | |
2033 while (aux - orig < aux_size && aux[0] != 0) { | |
2034 uint32_t i32; | |
2035 int r; | |
2036 | |
2037 if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') { | |
2038 rg = &aux[3]; | |
2039 while (*aux++); | |
2040 continue; | |
2041 } | |
2042 if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') { | |
2043 while (*aux++); | |
2044 continue; | |
2045 } | |
2046 if (aux[0] == 'N' && aux[1] == 'M') { | |
2047 switch(aux[2]) { | |
2048 case 'A': case 'C': case 'c': aux+=4; break; | |
2049 case 'S': case 's': aux+=5; break; | |
2050 case 'I': case 'i': case 'f': aux+=7; break; | |
2051 default: | |
2052 fprintf(stderr, "Unhandled type code for NM tag\n"); | |
2053 return NULL; | |
2054 } | |
2055 continue; | |
2056 } | |
2057 | |
2058 BLOCK_APPEND(td_b, aux, 3); | |
2059 | |
2060 i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2]; | |
2061 kh_put(s_i2i, c->tags_used, i32, &r); | |
2062 if (-1 == r) | |
2063 return NULL; | |
2064 | |
2065 // BQ:Z | |
2066 if (aux[0] == 'B' && aux[1] == 'Q' && aux[2] == 'Z') { | |
2067 char *tmp; | |
2068 if (!s->aux_BQ_blk) | |
2069 if (!(s->aux_BQ_blk = cram_new_block(EXTERNAL, DS_aux_BQ))) | |
2070 return NULL; | |
2071 BLOCK_GROW(s->aux_BQ_blk, aux_size*1.34+1); | |
2072 tmp = (char *)BLOCK_END(s->aux_BQ_blk); | |
2073 aux += 3; | |
2074 while ((*tmp++=*aux++)); | |
2075 *tmp++ = '\t'; | |
2076 BLOCK_SIZE(s->aux_BQ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BQ_blk); | |
2077 continue; | |
2078 } | |
2079 | |
2080 // BD:Z | |
2081 if (aux[0] == 'B' && aux[1]=='D' && aux[2] == 'Z') { | |
2082 char *tmp; | |
2083 if (!s->aux_BD_blk) | |
2084 if (!(s->aux_BD_blk = cram_new_block(EXTERNAL, DS_aux_BD))) | |
2085 return NULL; | |
2086 BLOCK_GROW(s->aux_BD_blk, aux_size*1.34+1); | |
2087 tmp = (char *)BLOCK_END(s->aux_BD_blk); | |
2088 aux += 3; | |
2089 while ((*tmp++=*aux++)); | |
2090 *tmp++ = '\t'; | |
2091 BLOCK_SIZE(s->aux_BD_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BD_blk); | |
2092 continue; | |
2093 } | |
2094 | |
2095 // BI:Z | |
2096 if (aux[0] == 'B' && aux[1]=='I' && aux[2] == 'Z') { | |
2097 char *tmp; | |
2098 if (!s->aux_BI_blk) | |
2099 if (!(s->aux_BI_blk = cram_new_block(EXTERNAL, DS_aux_BI))) | |
2100 return NULL; | |
2101 BLOCK_GROW(s->aux_BI_blk, aux_size*1.34+1); | |
2102 tmp = (char *)BLOCK_END(s->aux_BI_blk); | |
2103 aux += 3; | |
2104 while ((*tmp++=*aux++)); | |
2105 *tmp++ = '\t'; | |
2106 BLOCK_SIZE(s->aux_BI_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BI_blk); | |
2107 continue; | |
2108 } | |
2109 | |
2110 // OQ:Z: | |
2111 if (aux[0] == 'O' && aux[1] == 'Q' && aux[2] == 'Z') { | |
2112 char *tmp; | |
2113 if (!s->aux_OQ_blk) | |
2114 if (!(s->aux_OQ_blk = cram_new_block(EXTERNAL, DS_aux_OQ))) | |
2115 return NULL; | |
2116 BLOCK_GROW(s->aux_OQ_blk, aux_size*1.34+1); | |
2117 tmp = (char *)BLOCK_END(s->aux_OQ_blk); | |
2118 aux += 3; | |
2119 while ((*tmp++=*aux++)); | |
2120 *tmp++ = '\t'; | |
2121 BLOCK_SIZE(s->aux_OQ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_OQ_blk); | |
2122 continue; | |
2123 } | |
2124 | |
2125 // FZ:B or ZM:B | |
2126 if ((aux[0] == 'F' && aux[1] == 'Z' && aux[2] == 'B') || | |
2127 (aux[0] == 'Z' && aux[1] == 'M' && aux[2] == 'B')) { | |
2128 int type = aux[3], blen; | |
2129 uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) + | |
2130 (((unsigned char *)aux)[5]<< 8) + | |
2131 (((unsigned char *)aux)[6]<<16) + | |
2132 (((unsigned char *)aux)[7]<<24)); | |
2133 char *tmp; | |
2134 if (!s->aux_FZ_blk) | |
2135 if (!(s->aux_FZ_blk = cram_new_block(EXTERNAL, DS_aux_FZ))) | |
2136 return NULL; | |
2137 BLOCK_GROW(s->aux_FZ_blk, aux_size*1.34+1); | |
2138 tmp = (char *)BLOCK_END(s->aux_FZ_blk); | |
2139 | |
2140 // skip TN field | |
2141 aux+=3; | |
2142 | |
2143 // We use BYTE_ARRAY_LEN with external length, so store that first | |
2144 switch (type) { | |
2145 case 'c': case 'C': | |
2146 blen = count; | |
2147 break; | |
2148 case 's': case 'S': | |
2149 blen = 2*count; | |
2150 break; | |
2151 case 'i': case 'I': case 'f': | |
2152 blen = 4*count; | |
2153 break; | |
2154 default: | |
2155 fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n", | |
2156 type); | |
2157 return NULL; | |
2158 | |
2159 } | |
2160 | |
2161 blen += 5; // sub-type & length | |
2162 tmp += itf8_put(tmp, blen); | |
2163 | |
2164 // The tag data itself | |
2165 memcpy(tmp, aux, blen); tmp += blen; aux += blen; | |
2166 | |
2167 BLOCK_SIZE(s->aux_FZ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_FZ_blk); | |
2168 continue; | |
2169 } | |
2170 | |
2171 // Other quality data - {Q2,E2,U2,CQ}:Z and similar | |
2172 if (((aux[0] == 'Q' && aux[1] == '2') || | |
2173 (aux[0] == 'U' && aux[1] == '2') || | |
2174 (aux[0] == 'Q' && aux[1] == 'T') || | |
2175 (aux[0] == 'C' && aux[1] == 'Q')) && aux[2] == 'Z') { | |
2176 char *tmp; | |
2177 if (!s->aux_oq_blk) | |
2178 if (!(s->aux_oq_blk = cram_new_block(EXTERNAL, DS_aux_oq))) | |
2179 return NULL; | |
2180 BLOCK_GROW(s->aux_oq_blk, aux_size*1.34+1); | |
2181 tmp = (char *)BLOCK_END(s->aux_oq_blk); | |
2182 aux += 3; | |
2183 while ((*tmp++=*aux++)); | |
2184 *tmp++ = '\t'; | |
2185 BLOCK_SIZE(s->aux_oq_blk) = (uc *)tmp - BLOCK_DATA(s->aux_oq_blk); | |
2186 continue; | |
2187 } | |
2188 | |
2189 // Other sequence data - {R2,E2,CS,BC,RT}:Z and similar | |
2190 if (((aux[0] == 'R' && aux[1] == '2') || | |
2191 (aux[0] == 'E' && aux[1] == '2') || | |
2192 (aux[0] == 'C' && aux[1] == 'S') || | |
2193 (aux[0] == 'B' && aux[1] == 'C') || | |
2194 (aux[0] == 'R' && aux[1] == 'T')) && aux[2] == 'Z') { | |
2195 char *tmp; | |
2196 if (!s->aux_os_blk) | |
2197 if (!(s->aux_os_blk = cram_new_block(EXTERNAL, DS_aux_os))) | |
2198 return NULL; | |
2199 BLOCK_GROW(s->aux_os_blk, aux_size*1.34+1); | |
2200 tmp = (char *)BLOCK_END(s->aux_os_blk); | |
2201 aux += 3; | |
2202 while ((*tmp++=*aux++)); | |
2203 *tmp++ = '\t'; | |
2204 BLOCK_SIZE(s->aux_os_blk) = (uc *)tmp - BLOCK_DATA(s->aux_os_blk); | |
2205 continue; | |
2206 } | |
2207 | |
2208 | |
2209 switch(aux[2]) { | |
2210 case 'A': case 'C': case 'c': | |
2211 aux+=3; | |
2212 *tmp++=*aux++; | |
2213 break; | |
2214 | |
2215 case 'S': case 's': | |
2216 aux+=3; | |
2217 *tmp++=*aux++; *tmp++=*aux++; | |
2218 break; | |
2219 | |
2220 case 'I': case 'i': case 'f': | |
2221 aux+=3; | |
2222 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
2223 break; | |
2224 | |
2225 case 'd': | |
2226 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
2227 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
2228 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; | |
2229 break; | |
2230 | |
2231 case 'Z': case 'H': | |
2232 { | |
2233 char *tmp; | |
2234 if (!s->aux_oz_blk) | |
2235 if (!(s->aux_oz_blk = cram_new_block(EXTERNAL, DS_aux_oz))) | |
2236 return NULL; | |
2237 BLOCK_GROW(s->aux_oz_blk, aux_size*1.34+1); | |
2238 tmp = (char *)BLOCK_END(s->aux_oz_blk); | |
2239 aux += 3; | |
2240 while ((*tmp++=*aux++)); | |
2241 *tmp++ = '\t'; | |
2242 BLOCK_SIZE(s->aux_oz_blk) = (uc *)tmp - | |
2243 BLOCK_DATA(s->aux_oz_blk); | |
2244 } | |
2245 break; | |
2246 | |
2247 case 'B': { | |
2248 int type = aux[3], blen; | |
2249 uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) + | |
2250 (((unsigned char *)aux)[5]<< 8) + | |
2251 (((unsigned char *)aux)[6]<<16) + | |
2252 (((unsigned char *)aux)[7]<<24)); | |
2253 // skip TN field | |
2254 aux+=3; | |
2255 | |
2256 // We use BYTE_ARRAY_LEN with external length, so store that first | |
2257 switch (type) { | |
2258 case 'c': case 'C': | |
2259 blen = count; | |
2260 break; | |
2261 case 's': case 'S': | |
2262 blen = 2*count; | |
2263 break; | |
2264 case 'i': case 'I': case 'f': | |
2265 blen = 4*count; | |
2266 break; | |
2267 default: | |
2268 fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n", | |
2269 type); | |
2270 return NULL; | |
2271 | |
2272 } | |
2273 | |
2274 blen += 5; // sub-type & length | |
2275 tmp += itf8_put(tmp, blen); | |
2276 | |
2277 // The tag data itself | |
2278 memcpy(tmp, aux, blen); tmp += blen; aux += blen; | |
2279 | |
2280 //cram_stats_add(c->aux_B_stats, blen); | |
2281 break; | |
2282 } | |
2283 default: | |
2284 fprintf(stderr, "Unknown aux type '%c'\n", aux[2]); | |
2285 return NULL; | |
2286 } | |
2287 } | |
2288 | |
2289 // FIXME: sort BLOCK_DATA(td_b) by char[3] triples | |
2290 | |
2291 // And and increment TD hash entry | |
2292 BLOCK_APPEND_CHAR(td_b, 0); | |
2293 | |
2294 // Duplicate key as BLOCK_DATA() can be realloced to a new pointer. | |
2295 key = string_ndup(c->comp_hdr->TD_keys, | |
2296 (char *)BLOCK_DATA(td_b) + TD_blk_size, | |
2297 BLOCK_SIZE(td_b) - TD_blk_size); | |
2298 k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new); | |
2299 if (new < 0) { | |
2300 return NULL; | |
2301 } else if (new == 0) { | |
2302 BLOCK_SIZE(td_b) = TD_blk_size; | |
2303 } else { | |
2304 kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL; | |
2305 c->comp_hdr->nTL++; | |
2306 } | |
2307 | |
2308 cr->TL = kh_val(c->comp_hdr->TD_hash, k); | |
2309 cram_stats_add(c->stats[DS_TL], cr->TL); | |
2310 | |
2311 cr->aux = BLOCK_SIZE(s->aux_blk); | |
2312 cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux); | |
2313 BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk); | |
2314 assert(s->aux_blk->byte <= s->aux_blk->alloc); | |
2315 | |
2316 return rg; | |
2317 } | |
2318 | |
2319 | |
2320 /* | |
2321 * Handles creation of a new container or new slice, flushing any | |
2322 * existing containers when appropriate. | |
2323 * | |
2324 * Really this is next slice, which may or may not lead to a new container. | |
2325 * | |
2326 * Returns cram_container pointer on success | |
2327 * NULL on failure. | |
2328 */ | |
2329 static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { | |
2330 cram_container *c = fd->ctr; | |
2331 cram_slice *s; | |
2332 int i; | |
2333 | |
2334 /* First occurence */ | |
2335 if (c->curr_ref == -2) | |
2336 c->curr_ref = bam_ref(b); | |
2337 | |
2338 if (c->slice) { | |
2339 s = c->slice; | |
2340 if (c->multi_seq) { | |
2341 s->hdr->ref_seq_id = -2; | |
2342 s->hdr->ref_seq_start = 0; | |
2343 s->hdr->ref_seq_span = 0; | |
2344 } else { | |
2345 s->hdr->ref_seq_id = c->curr_ref; | |
2346 s->hdr->ref_seq_start = c->first_base; | |
2347 s->hdr->ref_seq_span = c->last_base - c->first_base + 1; | |
2348 } | |
2349 s->hdr->num_records = c->curr_rec; | |
2350 | |
2351 if (c->curr_slice == 0) { | |
2352 if (c->ref_seq_id != s->hdr->ref_seq_id) | |
2353 c->ref_seq_id = s->hdr->ref_seq_id; | |
2354 c->ref_seq_start = c->first_base; | |
2355 } | |
2356 | |
2357 c->curr_slice++; | |
2358 } | |
2359 | |
2360 /* Flush container */ | |
2361 if (c->curr_slice == c->max_slice || | |
2362 (bam_ref(b) != c->curr_ref && !c->multi_seq)) { | |
2363 c->ref_seq_span = fd->last_base - c->ref_seq_start + 1; | |
2364 if (fd->verbose) | |
2365 fprintf(stderr, "Flush container %d/%d..%d\n", | |
2366 c->ref_seq_id, c->ref_seq_start, | |
2367 c->ref_seq_start + c->ref_seq_span -1); | |
2368 | |
2369 /* Encode slices */ | |
2370 if (fd->pool) { | |
2371 if (-1 == cram_flush_container_mt(fd, c)) | |
2372 return NULL; | |
2373 } else { | |
2374 if (-1 == cram_flush_container(fd, c)) | |
2375 return NULL; | |
2376 | |
2377 // Move to sep func, as we need cram_flush_container for | |
2378 // the closing phase to flush the partial container. | |
2379 for (i = 0; i < c->max_slice; i++) { | |
2380 cram_free_slice(c->slices[i]); | |
2381 c->slices[i] = NULL; | |
2382 } | |
2383 | |
2384 c->slice = NULL; | |
2385 c->curr_slice = 0; | |
2386 | |
2387 /* Easy approach for purposes of freeing stats */ | |
2388 cram_free_container(c); | |
2389 } | |
2390 | |
2391 c = fd->ctr = cram_new_container(fd->seqs_per_slice, | |
2392 fd->slices_per_container); | |
2393 if (!c) | |
2394 return NULL; | |
2395 c->record_counter = fd->record_counter; | |
2396 c->curr_ref = bam_ref(b); | |
2397 } | |
2398 | |
2399 c->last_pos = c->first_base = c->last_base = bam_pos(b)+1; | |
2400 | |
2401 /* New slice */ | |
2402 c->slice = c->slices[c->curr_slice] = | |
2403 cram_new_slice(MAPPED_SLICE, c->max_rec); | |
2404 if (!c->slice) | |
2405 return NULL; | |
2406 | |
2407 if (c->multi_seq) { | |
2408 c->slice->hdr->ref_seq_id = -2; | |
2409 c->slice->hdr->ref_seq_start = 0; | |
2410 c->slice->last_apos = 1; | |
2411 } else { | |
2412 c->slice->hdr->ref_seq_id = bam_ref(b); | |
2413 // wrong for unsorted data, will fix during encoding. | |
2414 c->slice->hdr->ref_seq_start = bam_pos(b)+1; | |
2415 c->slice->last_apos = bam_pos(b)+1; | |
2416 } | |
2417 | |
2418 c->curr_rec = 0; | |
2419 | |
2420 return c; | |
2421 } | |
2422 | |
2423 /* | |
2424 * Converts a single bam record into a cram record. | |
2425 * Possibly used within a thread. | |
2426 * | |
2427 * Returns 0 on success; | |
2428 * -1 on failure | |
2429 */ | |
2430 static int process_one_read(cram_fd *fd, cram_container *c, | |
2431 cram_slice *s, cram_record *cr, | |
2432 bam_seq_t *b, int rnum) { | |
2433 int i, fake_qual = -1; | |
2434 char *cp, *rg; | |
2435 char *ref, *seq, *qual; | |
2436 | |
2437 // FIXME: multi-ref containers | |
2438 | |
2439 ref = c->ref; | |
2440 cr->len = bam_seq_len(b); cram_stats_add(c->stats[DS_RL], cr->len); | |
2441 | |
2442 //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg); | |
2443 | |
2444 // Fields to resolve later | |
2445 //cr->mate_line; // index to another cram_record | |
2446 //cr->mate_flags; // MF | |
2447 //cr->ntags; // TC | |
2448 cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags); | |
2449 if (CRAM_MAJOR_VERS(fd->version) == 1) | |
2450 rg = cram_encode_aux_1_0(fd, b, c, s, cr); | |
2451 else | |
2452 rg = cram_encode_aux(fd, b, c, s, cr); | |
2453 | |
2454 //cr->aux_size = b->blk_size - ((char *)bam_aux(b) - (char *)&bam_ref(b)); | |
2455 //cr->aux = DSTRING_LEN(s->aux_ds); | |
2456 //dstring_nappend(s->aux_ds, bam_aux(b), cr->aux_size); | |
2457 | |
2458 /* Read group, identified earlier */ | |
2459 if (rg) { | |
2460 SAM_RG *brg = sam_hdr_find_rg(fd->header, rg); | |
2461 cr->rg = brg ? brg->id : -1; | |
2462 } else if (CRAM_MAJOR_VERS(fd->version) == 1) { | |
2463 SAM_RG *brg = sam_hdr_find_rg(fd->header, "UNKNOWN"); | |
2464 assert(brg); | |
2465 } else { | |
2466 cr->rg = -1; | |
2467 } | |
2468 cram_stats_add(c->stats[DS_RG], cr->rg); | |
2469 | |
2470 | |
2471 cr->ref_id = bam_ref(b); cram_stats_add(c->stats[DS_RI], cr->ref_id); | |
2472 cr->flags = bam_flag(b); | |
2473 if (bam_cigar_len(b) == 0) | |
2474 cr->flags |= BAM_FUNMAP; | |
2475 cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]); | |
2476 | |
2477 // Non reference based encoding means storing the bases verbatim as features, which in | |
2478 // turn means every base also has a quality already stored. | |
2479 if (!fd->no_ref || CRAM_MAJOR_VERS(fd->version) >= 3) | |
2480 cr->cram_flags = CRAM_FLAG_PRESERVE_QUAL_SCORES; | |
2481 else | |
2482 cr->cram_flags = 0; | |
2483 //cram_stats_add(c->stats[DS_CF], cr->cram_flags); | |
2484 | |
2485 c->num_bases += cr->len; | |
2486 cr->apos = bam_pos(b)+1; | |
2487 if (c->pos_sorted) { | |
2488 if (cr->apos < s->last_apos) { | |
2489 c->pos_sorted = 0; | |
2490 } else { | |
2491 cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos); | |
2492 s->last_apos = cr->apos; | |
2493 } | |
2494 } else { | |
2495 //cram_stats_add(c->stats[DS_AP], cr->apos); | |
2496 } | |
2497 c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos); | |
2498 | |
2499 cr->name = BLOCK_SIZE(s->name_blk); | |
2500 cr->name_len = bam_name_len(b); | |
2501 cram_stats_add(c->stats[DS_RN], cr->name_len); | |
2502 | |
2503 BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b)); | |
2504 | |
2505 | |
2506 /* | |
2507 * This seqs_ds is largely pointless and it could reuse the same memory | |
2508 * over and over. | |
2509 * s->base_blk is what we need for encoding. | |
2510 */ | |
2511 cr->seq = BLOCK_SIZE(s->seqs_blk); | |
2512 cr->qual = BLOCK_SIZE(s->qual_blk); | |
2513 BLOCK_GROW(s->seqs_blk, cr->len+1); | |
2514 BLOCK_GROW(s->qual_blk, cr->len); | |
2515 seq = cp = (char *)BLOCK_END(s->seqs_blk); | |
2516 | |
2517 *seq = 0; | |
2518 #ifdef ALLOW_UAC | |
2519 { | |
2520 // Convert seq 2 bases at a time for speed. | |
2521 static const uint16_t code2base[256] = { | |
2522 15677, 16701, 17213, 19773, 18237, 21053, 21309, 22077, | |
2523 21565, 22333, 22845, 18493, 19261, 17469, 16957, 20029, | |
2524 15681, 16705, 17217, 19777, 18241, 21057, 21313, 22081, | |
2525 21569, 22337, 22849, 18497, 19265, 17473, 16961, 20033, | |
2526 15683, 16707, 17219, 19779, 18243, 21059, 21315, 22083, | |
2527 21571, 22339, 22851, 18499, 19267, 17475, 16963, 20035, | |
2528 15693, 16717, 17229, 19789, 18253, 21069, 21325, 22093, | |
2529 21581, 22349, 22861, 18509, 19277, 17485, 16973, 20045, | |
2530 15687, 16711, 17223, 19783, 18247, 21063, 21319, 22087, | |
2531 21575, 22343, 22855, 18503, 19271, 17479, 16967, 20039, | |
2532 15698, 16722, 17234, 19794, 18258, 21074, 21330, 22098, | |
2533 21586, 22354, 22866, 18514, 19282, 17490, 16978, 20050, | |
2534 15699, 16723, 17235, 19795, 18259, 21075, 21331, 22099, | |
2535 21587, 22355, 22867, 18515, 19283, 17491, 16979, 20051, | |
2536 15702, 16726, 17238, 19798, 18262, 21078, 21334, 22102, | |
2537 21590, 22358, 22870, 18518, 19286, 17494, 16982, 20054, | |
2538 15700, 16724, 17236, 19796, 18260, 21076, 21332, 22100, | |
2539 21588, 22356, 22868, 18516, 19284, 17492, 16980, 20052, | |
2540 15703, 16727, 17239, 19799, 18263, 21079, 21335, 22103, | |
2541 21591, 22359, 22871, 18519, 19287, 17495, 16983, 20055, | |
2542 15705, 16729, 17241, 19801, 18265, 21081, 21337, 22105, | |
2543 21593, 22361, 22873, 18521, 19289, 17497, 16985, 20057, | |
2544 15688, 16712, 17224, 19784, 18248, 21064, 21320, 22088, | |
2545 21576, 22344, 22856, 18504, 19272, 17480, 16968, 20040, | |
2546 15691, 16715, 17227, 19787, 18251, 21067, 21323, 22091, | |
2547 21579, 22347, 22859, 18507, 19275, 17483, 16971, 20043, | |
2548 15684, 16708, 17220, 19780, 18244, 21060, 21316, 22084, | |
2549 21572, 22340, 22852, 18500, 19268, 17476, 16964, 20036, | |
2550 15682, 16706, 17218, 19778, 18242, 21058, 21314, 22082, | |
2551 21570, 22338, 22850, 18498, 19266, 17474, 16962, 20034, | |
2552 15694, 16718, 17230, 19790, 18254, 21070, 21326, 22094, | |
2553 21582, 22350, 22862, 18510, 19278, 17486, 16974, 20046 | |
2554 }; | |
2555 | |
2556 int l2 = cr->len / 2; | |
2557 unsigned char *from = (unsigned char *)bam_seq(b); | |
2558 uint16_t *cpi = (uint16_t *)cp; | |
2559 cp[0] = 0; | |
2560 for (i = 0; i < l2; i++) | |
2561 cpi[i] = le_int2(code2base[from[i]]); | |
2562 if ((i *= 2) < cr->len) | |
2563 cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)]; | |
2564 } | |
2565 #else | |
2566 for (i = 0; i < cr->len; i++) | |
2567 cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)]; | |
2568 #endif | |
2569 BLOCK_SIZE(s->seqs_blk) += cr->len; | |
2570 | |
2571 qual = cp = (char *)bam_qual(b); | |
2572 | |
2573 /* Copy and parse */ | |
2574 if (!(cr->flags & BAM_FUNMAP)) { | |
2575 int32_t *cig_to, *cig_from; | |
2576 int apos = cr->apos-1, spos = 0; | |
2577 | |
2578 cr->cigar = s->ncigar; | |
2579 cr->ncigar = bam_cigar_len(b); | |
2580 while (cr->cigar + cr->ncigar >= s->cigar_alloc) { | |
2581 s->cigar_alloc = s->cigar_alloc ? s->cigar_alloc*2 : 1024; | |
2582 s->cigar = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar)); | |
2583 if (!s->cigar) | |
2584 return -1; | |
2585 } | |
2586 | |
2587 cig_to = (int32_t *)s->cigar; | |
2588 cig_from = (int32_t *)bam_cigar(b); | |
2589 | |
2590 cr->feature = 0; | |
2591 cr->nfeature = 0; | |
2592 for (i = 0; i < cr->ncigar; i++) { | |
2593 enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK; | |
2594 int cig_len = cig_from[i] >> BAM_CIGAR_SHIFT; | |
2595 cig_to[i] = cig_from[i]; | |
2596 | |
2597 /* Can also generate events from here for CRAM diffs */ | |
2598 | |
2599 switch (cig_op) { | |
2600 int l; | |
2601 | |
2602 // Don't trust = and X ops to be correct. | |
2603 case BAM_CMATCH: | |
2604 case BAM_CBASE_MATCH: | |
2605 case BAM_CBASE_MISMATCH: | |
2606 //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n", | |
2607 // cig_len, &ref[apos], cig_len, &seq[spos]); | |
2608 l = 0; | |
2609 if (!fd->no_ref && cr->len) { | |
2610 int end = cig_len+apos < c->ref_end | |
2611 ? cig_len : c->ref_end - apos; | |
2612 char *sp = &seq[spos]; | |
2613 char *rp = &ref[apos]; | |
2614 char *qp = &qual[spos]; | |
2615 for (l = 0; l < end; l++) { | |
2616 if (rp[l] != sp[l]) { | |
2617 if (!sp[l]) | |
2618 break; | |
2619 if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) { | |
2620 // Disabled for the time being as it doesn't | |
2621 // seem to gain us much. | |
2622 int ol=l; | |
2623 while (l<end && rp[l] != sp[l]) | |
2624 l++; | |
2625 if (l-ol > 1) { | |
2626 if (cram_add_bases(fd, c, s, cr, spos+ol, | |
2627 l-ol, &seq[spos+ol])) | |
2628 return -1; | |
2629 l--; | |
2630 } else { | |
2631 l = ol; | |
2632 if (cram_add_substitution(fd, c, s, cr, | |
2633 spos+l, sp[l], | |
2634 qp[l], rp[l])) | |
2635 return -1; | |
2636 } | |
2637 } else { | |
2638 if (cram_add_substitution(fd, c, s, cr, spos+l, | |
2639 sp[l], qp[l], rp[l])) | |
2640 return -1; | |
2641 } | |
2642 } | |
2643 } | |
2644 spos += l; | |
2645 apos += l; | |
2646 } | |
2647 | |
2648 if (l < cig_len && cr->len) { | |
2649 if (fd->no_ref) { | |
2650 if (CRAM_MAJOR_VERS(fd->version) == 3) { | |
2651 if (cram_add_bases(fd, c, s, cr, spos, | |
2652 cig_len-l, &seq[spos])) | |
2653 return -1; | |
2654 spos += cig_len-l; | |
2655 } else { | |
2656 for (; l < cig_len && seq[spos]; l++, spos++) { | |
2657 if (cram_add_base(fd, c, s, cr, spos, | |
2658 seq[spos], qual[spos])) | |
2659 return -1; | |
2660 } | |
2661 } | |
2662 } else { | |
2663 /* off end of sequence or non-ref based output */ | |
2664 for (; l < cig_len && seq[spos]; l++, spos++) { | |
2665 if (cram_add_base(fd, c, s, cr, spos, | |
2666 seq[spos], qual[spos])) | |
2667 return -1; | |
2668 } | |
2669 } | |
2670 apos += cig_len; | |
2671 } else if (!cr->len) { | |
2672 /* Seq "*" */ | |
2673 apos += cig_len; | |
2674 spos += cig_len; | |
2675 } | |
2676 break; | |
2677 | |
2678 case BAM_CDEL: | |
2679 if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos])) | |
2680 return -1; | |
2681 apos += cig_len; | |
2682 break; | |
2683 | |
2684 case BAM_CREF_SKIP: | |
2685 if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos])) | |
2686 return -1; | |
2687 apos += cig_len; | |
2688 break; | |
2689 | |
2690 case BAM_CINS: | |
2691 if (cram_add_insertion(c, s, cr, spos, cig_len, | |
2692 cr->len ? &seq[spos] : NULL)) | |
2693 return -1; | |
2694 if (fd->no_ref && cr->len) { | |
2695 for (l = 0; l < cig_len; l++, spos++) { | |
2696 cram_add_quality(fd, c, s, cr, spos, qual[spos]); | |
2697 } | |
2698 } else { | |
2699 spos += cig_len; | |
2700 } | |
2701 break; | |
2702 | |
2703 case BAM_CSOFT_CLIP: | |
2704 if (cram_add_softclip(c, s, cr, spos, cig_len, | |
2705 cr->len ? &seq[spos] : NULL, | |
2706 fd->version)) | |
2707 return -1; | |
2708 if (fd->no_ref && | |
2709 !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { | |
2710 if (cr->len) { | |
2711 for (l = 0; l < cig_len; l++, spos++) { | |
2712 cram_add_quality(fd, c, s, cr, spos, qual[spos]); | |
2713 } | |
2714 } else { | |
2715 for (l = 0; l < cig_len; l++, spos++) { | |
2716 cram_add_quality(fd, c, s, cr, spos, -1); | |
2717 } | |
2718 } | |
2719 } else { | |
2720 spos += cig_len; | |
2721 } | |
2722 break; | |
2723 | |
2724 case BAM_CHARD_CLIP: | |
2725 if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos])) | |
2726 return -1; | |
2727 break; | |
2728 | |
2729 case BAM_CPAD: | |
2730 if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos])) | |
2731 return -1; | |
2732 break; | |
2733 } | |
2734 } | |
2735 fake_qual = spos; | |
2736 cr->aend = MIN(apos, c->ref_end); | |
2737 cram_stats_add(c->stats[DS_FN], cr->nfeature); | |
2738 } else { | |
2739 // Unmapped | |
2740 cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES; | |
2741 cr->cigar = 0; | |
2742 cr->ncigar = 0; | |
2743 cr->nfeature = 0; | |
2744 cr->aend = cr->apos; | |
2745 for (i = 0; i < cr->len; i++) | |
2746 cram_stats_add(c->stats[DS_BA], seq[i]); | |
2747 } | |
2748 | |
2749 /* | |
2750 * Append to the qual block now. We do this here as | |
2751 * cram_add_substitution() can generate BA/QS events which need to | |
2752 * be in the qual block before we append the rest of the data. | |
2753 */ | |
2754 if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) { | |
2755 /* Special case of seq "*" */ | |
2756 if (cr->len == 0) { | |
2757 cram_stats_add(c->stats[DS_RL], cr->len = fake_qual); | |
2758 BLOCK_GROW(s->qual_blk, cr->len); | |
2759 cp = (char *)BLOCK_END(s->qual_blk); | |
2760 memset(cp, 255, cr->len); | |
2761 } else { | |
2762 BLOCK_GROW(s->qual_blk, cr->len); | |
2763 cp = (char *)BLOCK_END(s->qual_blk); | |
2764 char *from = (char *)&bam_qual(b)[0]; | |
2765 char *to = &cp[0]; | |
2766 memcpy(to, from, cr->len); | |
2767 //for (i = 0; i < cr->len; i++) cp[i] = from[i]; | |
2768 } | |
2769 BLOCK_SIZE(s->qual_blk) += cr->len; | |
2770 } else { | |
2771 if (cr->len == 0) { | |
2772 cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1; | |
2773 cram_stats_add(c->stats[DS_RL], cr->len); | |
2774 } | |
2775 } | |
2776 | |
2777 /* Now we know apos and aend both, update mate-pair information */ | |
2778 { | |
2779 int new; | |
2780 khint_t k; | |
2781 int sec = (cr->flags & BAM_FSECONDARY) ? 1 : 0; | |
2782 | |
2783 //fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum, | |
2784 // cr->name_len, DSTRING_STR(s->name_ds)+cr->name); | |
2785 if (cr->flags & BAM_FPAIRED) { | |
2786 char *key = string_ndup(s->pair_keys, | |
2787 (char *)BLOCK_DATA(s->name_blk)+cr->name, | |
2788 cr->name_len); | |
2789 if (!key) | |
2790 return -1; | |
2791 | |
2792 k = kh_put(m_s2i, s->pair[sec], key, &new); | |
2793 if (-1 == new) | |
2794 return -1; | |
2795 else if (new > 0) | |
2796 kh_val(s->pair[sec], k) = rnum; | |
2797 } else { | |
2798 new = 1; | |
2799 } | |
2800 | |
2801 if (new == 0) { | |
2802 cram_record *p = &s->crecs[kh_val(s->pair[sec], k)]; | |
2803 int aleft, aright, sign; | |
2804 | |
2805 aleft = MIN(cr->apos, p->apos); | |
2806 aright = MAX(cr->aend, p->aend); | |
2807 if (cr->apos < p->apos) { | |
2808 sign = 1; | |
2809 } else if (cr->apos > p->apos) { | |
2810 sign = -1; | |
2811 } else if (cr->flags & BAM_FREAD1) { | |
2812 sign = 1; | |
2813 } else { | |
2814 sign = -1; | |
2815 } | |
2816 | |
2817 //fprintf(stderr, "paired %"PRId64"\n", kh_val(s->pair[sec], k)); | |
2818 | |
2819 // This vs p: tlen, matepos, flags | |
2820 if (bam_ins_size(b) != sign*(aright-aleft+1)) | |
2821 goto detached; | |
2822 | |
2823 if (MAX(bam_mate_pos(b)+1, 0) != p->apos) | |
2824 goto detached; | |
2825 | |
2826 if (((bam_flag(b) & BAM_FMUNMAP) != 0) != | |
2827 ((p->flags & BAM_FUNMAP) != 0)) | |
2828 goto detached; | |
2829 | |
2830 if (((bam_flag(b) & BAM_FMREVERSE) != 0) != | |
2831 ((p->flags & BAM_FREVERSE) != 0)) | |
2832 goto detached; | |
2833 | |
2834 | |
2835 // p vs this: tlen, matepos, flags | |
2836 if (p->tlen != -sign*(aright-aleft+1)) | |
2837 goto detached; | |
2838 | |
2839 if (p->mate_pos != cr->apos) | |
2840 goto detached; | |
2841 | |
2842 if (((p->flags & BAM_FMUNMAP) != 0) != | |
2843 ((p->mate_flags & CRAM_M_UNMAP) != 0)) | |
2844 goto detached; | |
2845 | |
2846 if (((p->flags & BAM_FMREVERSE) != 0) != | |
2847 ((p->mate_flags & CRAM_M_REVERSE) != 0)) | |
2848 goto detached; | |
2849 | |
2850 // Supplementary reads are just too ill defined | |
2851 if ((cr->flags & BAM_FSUPPLEMENTARY) || | |
2852 (p->flags & BAM_FSUPPLEMENTARY)) | |
2853 goto detached; | |
2854 | |
2855 /* | |
2856 * The fields below are unused when encoding this read as it is | |
2857 * no longer detached. In theory they may get referred to when | |
2858 * processing a 3rd or 4th read in this template?, so we set them | |
2859 * here just to be sure. | |
2860 * | |
2861 * They do not need cram_stats_add() calls those as they are | |
2862 * not emitted. | |
2863 */ | |
2864 cr->mate_pos = p->apos; | |
2865 cr->tlen = sign*(aright-aleft+1); | |
2866 cr->mate_flags = | |
2867 ((p->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP + | |
2868 ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE; | |
2869 | |
2870 // Decrement statistics aggregated earlier | |
2871 cram_stats_del(c->stats[DS_NP], p->mate_pos); | |
2872 cram_stats_del(c->stats[DS_MF], p->mate_flags); | |
2873 cram_stats_del(c->stats[DS_TS], p->tlen); | |
2874 cram_stats_del(c->stats[DS_NS], p->mate_ref_id); | |
2875 | |
2876 /* Similarly we could correct the p-> values too, but these will no | |
2877 * longer have any code that refers back to them as the new 'p' | |
2878 * for this template is our current 'cr'. | |
2879 */ | |
2880 //p->mate_pos = cr->apos; | |
2881 //p->mate_flags = | |
2882 // ((cr->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP + | |
2883 // ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE; | |
2884 //p->tlen = p->apos - cr->aend; | |
2885 | |
2886 // Clear detached from cr flags | |
2887 cr->cram_flags &= ~CRAM_FLAG_DETACHED; | |
2888 cram_stats_add(c->stats[DS_CF], cr->cram_flags); | |
2889 | |
2890 // Clear detached from p flags and set downstream | |
2891 cram_stats_del(c->stats[DS_CF], p->cram_flags); | |
2892 p->cram_flags &= ~CRAM_FLAG_DETACHED; | |
2893 p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM; | |
2894 cram_stats_add(c->stats[DS_CF], p->cram_flags); | |
2895 | |
2896 p->mate_line = rnum - (kh_val(s->pair[sec], k) + 1); | |
2897 cram_stats_add(c->stats[DS_NF], p->mate_line); | |
2898 | |
2899 kh_val(s->pair[sec], k) = rnum; | |
2900 } else { | |
2901 detached: | |
2902 //fprintf(stderr, "unpaired\n"); | |
2903 | |
2904 /* Derive mate flags from this flag */ | |
2905 cr->mate_flags = 0; | |
2906 if (bam_flag(b) & BAM_FMUNMAP) | |
2907 cr->mate_flags |= CRAM_M_UNMAP; | |
2908 if (bam_flag(b) & BAM_FMREVERSE) | |
2909 cr->mate_flags |= CRAM_M_REVERSE; | |
2910 | |
2911 cram_stats_add(c->stats[DS_MF], cr->mate_flags); | |
2912 | |
2913 cr->mate_pos = MAX(bam_mate_pos(b)+1, 0); | |
2914 cram_stats_add(c->stats[DS_NP], cr->mate_pos); | |
2915 | |
2916 cr->tlen = bam_ins_size(b); | |
2917 cram_stats_add(c->stats[DS_TS], cr->tlen); | |
2918 | |
2919 cr->cram_flags |= CRAM_FLAG_DETACHED; | |
2920 cram_stats_add(c->stats[DS_CF], cr->cram_flags); | |
2921 cram_stats_add(c->stats[DS_NS], bam_mate_ref(b)); | |
2922 } | |
2923 } | |
2924 | |
2925 cr->mqual = bam_map_qual(b); | |
2926 cram_stats_add(c->stats[DS_MQ], cr->mqual); | |
2927 | |
2928 cr->mate_ref_id = bam_mate_ref(b); | |
2929 | |
2930 if (!(bam_flag(b) & BAM_FUNMAP)) { | |
2931 if (c->first_base > cr->apos) | |
2932 c->first_base = cr->apos; | |
2933 | |
2934 if (c->last_base < cr->aend) | |
2935 c->last_base = cr->aend; | |
2936 } | |
2937 | |
2938 return 0; | |
2939 } | |
2940 | |
2941 /* | |
2942 * Write iterator: put BAM format sequences into a CRAM file. | |
2943 * We buffer up a containers worth of data at a time. | |
2944 * | |
2945 * Returns 0 on success | |
2946 * -1 on failure | |
2947 */ | |
2948 int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { | |
2949 cram_container *c; | |
2950 | |
2951 if (!fd->ctr) { | |
2952 fd->ctr = cram_new_container(fd->seqs_per_slice, | |
2953 fd->slices_per_container); | |
2954 if (!fd->ctr) | |
2955 return -1; | |
2956 fd->ctr->record_counter = fd->record_counter; | |
2957 } | |
2958 c = fd->ctr; | |
2959 | |
2960 if (!c->slice || c->curr_rec == c->max_rec || | |
2961 (bam_ref(b) != c->curr_ref && c->curr_ref >= -1)) { | |
2962 int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1; | |
2963 int curr_ref = c->slice ? c->curr_ref : bam_ref(b); | |
2964 | |
2965 | |
2966 /* | |
2967 * Start packing slices when we routinely have under 1/4tr full. | |
2968 * | |
2969 * This option isn't available if we choose to embed references | |
2970 * since we can only have one per slice. | |
2971 */ | |
2972 if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 && | |
2973 fd->last_slice && fd->last_slice < c->max_rec/4+10 && | |
2974 !fd->embed_ref) { | |
2975 if (fd->verbose && !c->multi_seq) | |
2976 fprintf(stderr, "Multi-ref enabled for this container\n"); | |
2977 multi_seq = 1; | |
2978 } | |
2979 | |
2980 slice_rec = c->slice_rec; | |
2981 curr_rec = c->curr_rec; | |
2982 | |
2983 if (CRAM_MAJOR_VERS(fd->version) == 1 || | |
2984 c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice) { | |
2985 if (NULL == (c = cram_next_container(fd, b))) { | |
2986 if (fd->ctr) { | |
2987 // prevent cram_close attempting to flush | |
2988 cram_free_container(fd->ctr); | |
2989 fd->ctr = NULL; | |
2990 } | |
2991 return -1; | |
2992 } | |
2993 } | |
2994 | |
2995 /* | |
2996 * Due to our processing order, some things we've already done we | |
2997 * cannot easily undo. So when we first notice we should be packing | |
2998 * multiple sequences per container we emit the small partial | |
2999 * container as-is and then start a fresh one in a different mode. | |
3000 */ | |
3001 if (multi_seq) { | |
3002 fd->multi_seq = 1; | |
3003 c->multi_seq = 1; | |
3004 c->pos_sorted = 0; // required atm for multi_seq slices | |
3005 | |
3006 if (!c->refs_used) { | |
3007 pthread_mutex_lock(&fd->ref_lock); | |
3008 c->refs_used = calloc(fd->refs->nref, sizeof(int)); | |
3009 pthread_mutex_unlock(&fd->ref_lock); | |
3010 if (!c->refs_used) | |
3011 return -1; | |
3012 } | |
3013 } | |
3014 | |
3015 fd->last_slice = curr_rec - slice_rec; | |
3016 c->slice_rec = c->curr_rec; | |
3017 | |
3018 // Have we seen this reference before? | |
3019 if (bam_ref(b) >= 0 && bam_ref(b) != curr_ref && !fd->embed_ref && | |
3020 !fd->unsorted && multi_seq) { | |
3021 | |
3022 if (!c->refs_used) { | |
3023 pthread_mutex_lock(&fd->ref_lock); | |
3024 c->refs_used = calloc(fd->refs->nref, sizeof(int)); | |
3025 pthread_mutex_unlock(&fd->ref_lock); | |
3026 if (!c->refs_used) | |
3027 return -1; | |
3028 } else if (c->refs_used && c->refs_used[bam_ref(b)]) { | |
3029 fprintf(stderr, "Unsorted mode enabled\n"); | |
3030 pthread_mutex_lock(&fd->ref_lock); | |
3031 fd->unsorted = 1; | |
3032 pthread_mutex_unlock(&fd->ref_lock); | |
3033 fd->multi_seq = 1; | |
3034 } | |
3035 } | |
3036 | |
3037 c->curr_ref = bam_ref(b); | |
3038 if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++; | |
3039 } | |
3040 | |
3041 if (!c->bams) { | |
3042 /* First time through, allocate a set of bam pointers */ | |
3043 pthread_mutex_lock(&fd->bam_list_lock); | |
3044 if (fd->bl) { | |
3045 spare_bams *spare = fd->bl; | |
3046 c->bams = spare->bams; | |
3047 fd->bl = spare->next; | |
3048 free(spare); | |
3049 } else { | |
3050 c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *)); | |
3051 if (!c->bams) | |
3052 return -1; | |
3053 } | |
3054 pthread_mutex_unlock(&fd->bam_list_lock); | |
3055 } | |
3056 | |
3057 /* Copy or alloc+copy the bam record, for later encoding */ | |
3058 if (c->bams[c->curr_c_rec]) | |
3059 bam_copy1(c->bams[c->curr_c_rec], b); | |
3060 else | |
3061 c->bams[c->curr_c_rec] = bam_dup(b); | |
3062 | |
3063 c->curr_rec++; | |
3064 c->curr_c_rec++; | |
3065 fd->record_counter++; | |
3066 | |
3067 return 0; | |
3068 } |