comparison ezBAMQC/src/htslib/cram/cram_encode.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /*
2 Copyright (c) 2012-2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #ifdef HAVE_CONFIG_H
32 #include "io_lib_config.h"
33 #endif
34
35 #include <stdio.h>
36 #include <errno.h>
37 #include <assert.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <zlib.h>
41 #include <sys/types.h>
42 #include <sys/stat.h>
43 #include <math.h>
44 #include <ctype.h>
45
46 #include "cram/cram.h"
47 #include "cram/os.h"
48 #include "cram/md5.h"
49
50 #define Z_CRAM_STRAT Z_FILTERED
51 //#define Z_CRAM_STRAT Z_RLE
52 //#define Z_CRAM_STRAT Z_HUFFMAN_ONLY
53 //#define Z_CRAM_STRAT Z_DEFAULT_STRATEGY
54
55 static int process_one_read(cram_fd *fd, cram_container *c,
56 cram_slice *s, cram_record *cr,
57 bam_seq_t *b, int rnum);
58
59 /*
60 * Returns index of val into key.
61 * Basically strchr(key, val)-key;
62 */
63 static int sub_idx(char *key, char val) {
64 int i;
65
66 for (i = 0; *key && *key++ != val; i++);
67 return i;
68 }
69
70 /*
71 * Encodes a compression header block into a generic cram_block structure.
72 *
73 * Returns cram_block ptr on success
74 * NULL on failure
75 */
76 cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
77 cram_block_compression_hdr *h) {
78 cram_block *cb = cram_new_block(COMPRESSION_HEADER, 0);
79 cram_block *map = cram_new_block(COMPRESSION_HEADER, 0);
80 int i, mc;
81
82 if (!cb || !map)
83 return NULL;
84
85 /*
86 * This is a concatenation of several blocks of data:
87 * header + landmarks, preservation map, read encoding map, and the tag
88 * encoding map.
89 * All 4 are variable sized and we need to know how large these are
90 * before creating the compression header itself as this starts with
91 * the total size (stored as a variable length string).
92 */
93
94 // Duplicated from container itself, and removed in 1.1
95 if (CRAM_MAJOR_VERS(fd->version) == 1) {
96 itf8_put_blk(cb, h->ref_seq_id);
97 itf8_put_blk(cb, h->ref_seq_start);
98 itf8_put_blk(cb, h->ref_seq_span);
99 itf8_put_blk(cb, h->num_records);
100 itf8_put_blk(cb, h->num_landmarks);
101 for (i = 0; i < h->num_landmarks; i++) {
102 itf8_put_blk(cb, h->landmark[i]);
103 }
104 }
105
106 /* Create in-memory preservation map */
107 /* FIXME: should create this when we create the container */
108 {
109 khint_t k;
110 int r;
111
112 if (!(h->preservation_map = kh_init(map)))
113 return NULL;
114
115 k = kh_put(map, h->preservation_map, "RN", &r);
116 if (-1 == r) return NULL;
117 kh_val(h->preservation_map, k).i = 1;
118
119 if (CRAM_MAJOR_VERS(fd->version) == 1) {
120 k = kh_put(map, h->preservation_map, "PI", &r);
121 if (-1 == r) return NULL;
122 kh_val(h->preservation_map, k).i = 0;
123
124 k = kh_put(map, h->preservation_map, "UI", &r);
125 if (-1 == r) return NULL;
126 kh_val(h->preservation_map, k).i = 1;
127
128 k = kh_put(map, h->preservation_map, "MI", &r);
129 if (-1 == r) return NULL;
130 kh_val(h->preservation_map, k).i = 1;
131
132 } else {
133 // Technically SM was in 1.0, but wasn't in Java impl.
134 k = kh_put(map, h->preservation_map, "SM", &r);
135 if (-1 == r) return NULL;
136 kh_val(h->preservation_map, k).i = 0;
137
138 k = kh_put(map, h->preservation_map, "TD", &r);
139 if (-1 == r) return NULL;
140 kh_val(h->preservation_map, k).i = 0;
141
142 k = kh_put(map, h->preservation_map, "AP", &r);
143 if (-1 == r) return NULL;
144 kh_val(h->preservation_map, k).i = c->pos_sorted;
145
146 if (fd->no_ref || fd->embed_ref) {
147 // Reference Required == No
148 k = kh_put(map, h->preservation_map, "RR", &r);
149 if (-1 == r) return NULL;
150 kh_val(h->preservation_map, k).i = 0;
151 }
152 }
153 }
154
155 /* Encode preservation map; could collapse this and above into one */
156 mc = 0;
157 BLOCK_SIZE(map) = 0;
158 if (h->preservation_map) {
159 khint_t k;
160
161 for (k = kh_begin(h->preservation_map);
162 k != kh_end(h->preservation_map);
163 k++) {
164 const char *key;
165 khash_t(map) *pmap = h->preservation_map;
166
167
168 if (!kh_exist(pmap, k))
169 continue;
170
171 key = kh_key(pmap, k);
172 BLOCK_APPEND(map, key, 2);
173
174 switch(CRAM_KEY(key[0], key[1])) {
175 case CRAM_KEY('M','I'):
176 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
177 break;
178
179 case CRAM_KEY('U','I'):
180 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
181 break;
182
183 case CRAM_KEY('P','I'):
184 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
185 break;
186
187 case CRAM_KEY('A','P'):
188 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
189 break;
190
191 case CRAM_KEY('R','N'):
192 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
193 break;
194
195 case CRAM_KEY('R','R'):
196 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
197 break;
198
199 case CRAM_KEY('S','M'): {
200 char smat[5], *mp = smat;
201 *mp++ =
202 (sub_idx("CGTN", h->substitution_matrix[0][0]) << 6) |
203 (sub_idx("CGTN", h->substitution_matrix[0][1]) << 4) |
204 (sub_idx("CGTN", h->substitution_matrix[0][2]) << 2) |
205 (sub_idx("CGTN", h->substitution_matrix[0][3]) << 0);
206 *mp++ =
207 (sub_idx("AGTN", h->substitution_matrix[1][0]) << 6) |
208 (sub_idx("AGTN", h->substitution_matrix[1][1]) << 4) |
209 (sub_idx("AGTN", h->substitution_matrix[1][2]) << 2) |
210 (sub_idx("AGTN", h->substitution_matrix[1][3]) << 0);
211 *mp++ =
212 (sub_idx("ACTN", h->substitution_matrix[2][0]) << 6) |
213 (sub_idx("ACTN", h->substitution_matrix[2][1]) << 4) |
214 (sub_idx("ACTN", h->substitution_matrix[2][2]) << 2) |
215 (sub_idx("ACTN", h->substitution_matrix[2][3]) << 0);
216 *mp++ =
217 (sub_idx("ACGN", h->substitution_matrix[3][0]) << 6) |
218 (sub_idx("ACGN", h->substitution_matrix[3][1]) << 4) |
219 (sub_idx("ACGN", h->substitution_matrix[3][2]) << 2) |
220 (sub_idx("ACGN", h->substitution_matrix[3][3]) << 0);
221 *mp++ =
222 (sub_idx("ACGT", h->substitution_matrix[4][0]) << 6) |
223 (sub_idx("ACGT", h->substitution_matrix[4][1]) << 4) |
224 (sub_idx("ACGT", h->substitution_matrix[4][2]) << 2) |
225 (sub_idx("ACGT", h->substitution_matrix[4][3]) << 0);
226 BLOCK_APPEND(map, smat, 5);
227 break;
228 }
229
230 case CRAM_KEY('T','D'): {
231 itf8_put_blk(map, BLOCK_SIZE(h->TD_blk));
232 BLOCK_APPEND(map,
233 BLOCK_DATA(h->TD_blk),
234 BLOCK_SIZE(h->TD_blk));
235 break;
236 }
237
238 default:
239 fprintf(stderr, "Unknown preservation key '%.2s'\n", key);
240 break;
241 }
242
243 mc++;
244 }
245 }
246 itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc));
247 itf8_put_blk(cb, mc);
248 BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
249
250 /* rec encoding map */
251 mc = 0;
252 BLOCK_SIZE(map) = 0;
253 if (h->codecs[DS_BF]) {
254 if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF",
255 fd->version))
256 return NULL;
257 mc++;
258 }
259 if (h->codecs[DS_CF]) {
260 if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF",
261 fd->version))
262 return NULL;
263 mc++;
264 }
265 if (h->codecs[DS_RL]) {
266 if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL",
267 fd->version))
268 return NULL;
269 mc++;
270 }
271 if (h->codecs[DS_AP]) {
272 if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP",
273 fd->version))
274 return NULL;
275 mc++;
276 }
277 if (h->codecs[DS_RG]) {
278 if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG",
279 fd->version))
280 return NULL;
281 mc++;
282 }
283 if (h->codecs[DS_MF]) {
284 if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF",
285 fd->version))
286 return NULL;
287 mc++;
288 }
289 if (h->codecs[DS_NS]) {
290 if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS",
291 fd->version))
292 return NULL;
293 mc++;
294 }
295 if (h->codecs[DS_NP]) {
296 if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP",
297 fd->version))
298 return NULL;
299 mc++;
300 }
301 if (h->codecs[DS_TS]) {
302 if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS",
303 fd->version))
304 return NULL;
305 mc++;
306 }
307 if (h->codecs[DS_NF]) {
308 if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF",
309 fd->version))
310 return NULL;
311 mc++;
312 }
313 if (h->codecs[DS_TC]) {
314 if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC",
315 fd->version))
316 return NULL;
317 mc++;
318 }
319 if (h->codecs[DS_TN]) {
320 if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN",
321 fd->version))
322 return NULL;
323 mc++;
324 }
325 if (h->codecs[DS_TL]) {
326 if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL",
327 fd->version))
328 return NULL;
329 mc++;
330 }
331 if (h->codecs[DS_FN]) {
332 if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN",
333 fd->version))
334 return NULL;
335 mc++;
336 }
337 if (h->codecs[DS_FC]) {
338 if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC",
339 fd->version))
340 return NULL;
341 mc++;
342 }
343 if (h->codecs[DS_FP]) {
344 if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP",
345 fd->version))
346 return NULL;
347 mc++;
348 }
349 if (h->codecs[DS_BS]) {
350 if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS",
351 fd->version))
352 return NULL;
353 mc++;
354 }
355 if (h->codecs[DS_IN]) {
356 if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN",
357 fd->version))
358 return NULL;
359 mc++;
360 }
361 if (h->codecs[DS_DL]) {
362 if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL",
363 fd->version))
364 return NULL;
365 mc++;
366 }
367 if (h->codecs[DS_BA]) {
368 if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA",
369 fd->version))
370 return NULL;
371 mc++;
372 }
373 if (h->codecs[DS_BB]) {
374 if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB",
375 fd->version))
376 return NULL;
377 mc++;
378 }
379 if (h->codecs[DS_MQ]) {
380 if (-1 == h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ",
381 fd->version))
382 return NULL;
383 mc++;
384 }
385 if (h->codecs[DS_RN]) {
386 if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN",
387 fd->version))
388 return NULL;
389 mc++;
390 }
391 if (h->codecs[DS_QS]) {
392 if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS",
393 fd->version))
394 return NULL;
395 mc++;
396 }
397 if (h->codecs[DS_QQ]) {
398 if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ",
399 fd->version))
400 return NULL;
401 mc++;
402 }
403 if (h->codecs[DS_RI]) {
404 if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI",
405 fd->version))
406 return NULL;
407 mc++;
408 }
409 if (CRAM_MAJOR_VERS(fd->version) != 1) {
410 if (h->codecs[DS_SC]) {
411 if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC",
412 fd->version))
413 return NULL;
414 mc++;
415 }
416 if (h->codecs[DS_RS]) {
417 if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS",
418 fd->version))
419 return NULL;
420 mc++;
421 }
422 if (h->codecs[DS_PD]) {
423 if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD",
424 fd->version))
425 return NULL;
426 mc++;
427 }
428 if (h->codecs[DS_HC]) {
429 if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC",
430 fd->version))
431 return NULL;
432 mc++;
433 }
434 }
435 if (h->codecs[DS_TM]) {
436 if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM",
437 fd->version))
438 return NULL;
439 mc++;
440 }
441 if (h->codecs[DS_TV]) {
442 if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV",
443 fd->version))
444 return NULL;
445 mc++;
446 }
447 itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc));
448 itf8_put_blk(cb, mc);
449 BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
450
451 /* tag encoding map */
452 #if 0
453 mp = map; mc = 0;
454 if (h->tag_encoding_map) {
455 HashItem *hi;
456 HashIter *iter = HashTableIterCreate();
457 if (!iter)
458 return NULL;
459
460 while ((hi = HashTableIterNext(h->tag_encoding_map, iter))) {
461 cram_map *m = hi->data.p;
462 int sz;
463
464 mp += itf8_put(mp, (hi->key[0]<<16)|(hi->key[1]<<8)|hi->key[2]);
465 if (-1 == (sz = m->codec->store(m->codec, mp, NULL, fd->version)))
466 return NULL;
467 mp += sz;
468 mc++;
469 }
470
471 HashTableIterDestroy(iter);
472 }
473 #else
474 mc = 0;
475 BLOCK_SIZE(map) = 0;
476 if (c->tags_used) {
477 khint_t k;
478
479 #define TAG_ID(a) ((#a[0]<<8)+#a[1])
480
481 for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
482 int key;
483 if (!kh_exist(c->tags_used, k))
484 continue;
485
486 mc++;
487 itf8_put_blk(map, kh_key(c->tags_used, k));
488
489 // use block content id 4
490 switch((key = kh_key(c->tags_used, k)) & 0xff) {
491 case 'Z': case 'H':
492 // string as byte_array_stop
493 if (CRAM_MAJOR_VERS(fd->version) == 1) {
494 BLOCK_APPEND(map,
495 "\005" // BYTE_ARRAY_STOP
496 "\005" // len
497 "\t" // stop-byte is also SAM separator
498 DS_aux_S "\000\000\000",
499 7);
500 } else {
501 if (key>>8 == TAG_ID(OQ))
502 BLOCK_APPEND(map,
503 "\005" // BYTE_ARRAY_STOP
504 "\002" // len
505 "\t" // stop-byte is also SAM separator
506 DS_aux_OQ_S,
507 4);
508 else if (key>>8 == TAG_ID(BQ))
509 BLOCK_APPEND(map,
510 "\005" // BYTE_ARRAY_STOP
511 "\002" // len
512 "\t" // stop-byte is also SAM separator
513 DS_aux_BQ_S,
514 4);
515 else if (key>>8 == TAG_ID(BD))
516 BLOCK_APPEND(map,
517 "\005" // BYTE_ARRAY_STOP
518 "\002" // len
519 "\t" // stop-byte is also SAM separator
520 DS_aux_BD_S,
521 4);
522 else if (key>>8 == TAG_ID(BI))
523 BLOCK_APPEND(map,
524 "\005" // BYTE_ARRAY_STOP
525 "\002" // len
526 "\t" // stop-byte is also SAM separator
527 DS_aux_BI_S,
528 4);
529 else if ((key>>8 == TAG_ID(Q2)) ||
530 (key>>8 == TAG_ID(U2)) ||
531 (key>>8 == TAG_ID(QT)) ||
532 (key>>8 == TAG_ID(CQ)))
533 BLOCK_APPEND(map,
534 "\005" // BYTE_ARRAY_STOP
535 "\002" // len
536 "\t" // stop-byte is also SAM separator
537 DS_aux_oq_S,
538 4);
539 else if ((key>>8 == TAG_ID(R2)) ||
540 (key>>8 == TAG_ID(E2)) ||
541 (key>>8 == TAG_ID(CS)) ||
542 (key>>8 == TAG_ID(BC)) ||
543 (key>>8 == TAG_ID(RT)))
544 BLOCK_APPEND(map,
545 "\005" // BYTE_ARRAY_STOP
546 "\002" // len
547 "\t" // stop-byte is also SAM separator
548 DS_aux_os_S,
549 4);
550 else
551 BLOCK_APPEND(map,
552 "\005" // BYTE_ARRAY_STOP
553 "\002" // len
554 "\t" // stop-byte is also SAM separator
555 DS_aux_oz_S,
556 4);
557 }
558 break;
559
560 case 'A': case 'c': case 'C':
561 // byte array len, 1 byte
562 BLOCK_APPEND(map,
563 "\004" // BYTE_ARRAY_LEN
564 "\011" // length
565 "\003" // HUFFMAN (len)
566 "\004" // huffman-len
567 "\001" // 1 symbol
568 "\001" // symbol=1 byte value
569 "\001" // 1 length
570 "\000" // length=0
571 "\001" // EXTERNAL (val)
572 "\001" // external-len
573 DS_aux_S,// content-id
574 11);
575 break;
576
577 case 's': case 'S':
578 // byte array len, 2 byte
579 BLOCK_APPEND(map,
580 "\004" // BYTE_ARRAY_LEN
581 "\011" // length
582 "\003" // HUFFMAN (len)
583 "\004" // huffman-len
584 "\001" // 1 symbol
585 "\002" // symbol=2 byte value
586 "\001" // 1 length
587 "\000" // length=0
588 "\001" // EXTERNAL (val)
589 "\001" // external-len
590 DS_aux_S,// content-id
591 11);
592 break;
593
594 case 'i': case 'I': case 'f':
595 // byte array len, 4 byte
596 BLOCK_APPEND(map,
597 "\004" // BYTE_ARRAY_LEN
598 "\011" // length
599 "\003" // HUFFMAN (len)
600 "\004" // huffman-len
601 "\001" // 1 symbol
602 "\004" // symbol=4 byte value
603 "\001" // 1 length
604 "\000" // length=0
605 "\001" // EXTERNAL (val)
606 "\001" // external-len
607 DS_aux_S,// content-id
608 11);
609 break;
610
611 case 'B':
612 // Byte array of variable size, but we generate our tag
613 // byte stream at the wrong stage (during reading and not
614 // after slice header construction). So we use
615 // BYTE_ARRAY_LEN with the length codec being external
616 // too.
617 if ((key>>8 == TAG_ID(FZ)) || (key>>8 == TAG_ID(ZM)))
618 BLOCK_APPEND(map,
619 "\004" // BYTE_ARRAY_LEN
620 "\006" // length
621 "\001" // EXTERNAL (len)
622 "\001" // external-len
623 DS_aux_FZ_S // content-id
624 "\001" // EXTERNAL (val)
625 "\001" // external-len
626 DS_aux_FZ_S,// content-id
627 8);
628 else
629 BLOCK_APPEND(map,
630 "\004" // BYTE_ARRAY_LEN
631 "\006" // length
632 "\001" // EXTERNAL (len)
633 "\001" // external-len
634 DS_aux_S // content-id
635 "\001" // EXTERNAL (val)
636 "\001" // external-len
637 DS_aux_S,// content-id
638 8);
639 break;
640
641 default:
642 fprintf(stderr, "Unsupported SAM aux type '%c'\n",
643 kh_key(c->tags_used, k) & 0xff);
644 }
645 //mp += m->codec->store(m->codec, mp, NULL, fd->version);
646 }
647 }
648 #endif
649 itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc));
650 itf8_put_blk(cb, mc);
651 BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
652
653 if (fd->verbose)
654 fprintf(stderr, "Wrote compression block header in %d bytes\n",
655 (int)BLOCK_SIZE(cb));
656
657 BLOCK_UPLEN(cb);
658
659 cram_free_block(map);
660
661 return cb;
662 }
663
664
665 /*
666 * Encodes a slice compression header.
667 *
668 * Returns cram_block on success
669 * NULL on failure
670 */
671 cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) {
672 char *buf;
673 char *cp;
674 cram_block *b = cram_new_block(MAPPED_SLICE, 0);
675 int j;
676
677 if (!b)
678 return NULL;
679
680 if (NULL == (cp = buf = malloc(16+5*(8+s->hdr->num_blocks)))) {
681 cram_free_block(b);
682 return NULL;
683 }
684
685 cp += itf8_put(cp, s->hdr->ref_seq_id);
686 cp += itf8_put(cp, s->hdr->ref_seq_start);
687 cp += itf8_put(cp, s->hdr->ref_seq_span);
688 cp += itf8_put(cp, s->hdr->num_records);
689 if (CRAM_MAJOR_VERS(fd->version) == 2)
690 cp += itf8_put(cp, s->hdr->record_counter);
691 else if (CRAM_MAJOR_VERS(fd->version) >= 3)
692 cp += ltf8_put(cp, s->hdr->record_counter);
693 cp += itf8_put(cp, s->hdr->num_blocks);
694 cp += itf8_put(cp, s->hdr->num_content_ids);
695 for (j = 0; j < s->hdr->num_content_ids; j++) {
696 cp += itf8_put(cp, s->hdr->block_content_ids[j]);
697 }
698 if (s->hdr->content_type == MAPPED_SLICE)
699 cp += itf8_put(cp, s->hdr->ref_base_id);
700
701 if (CRAM_MAJOR_VERS(fd->version) != 1) {
702 memcpy(cp, s->hdr->md5, 16); cp += 16;
703 }
704
705 assert(cp-buf <= 16+5*(8+s->hdr->num_blocks));
706
707 b->data = (unsigned char *)buf;
708 b->comp_size = b->uncomp_size = cp-buf;
709
710 return b;
711 }
712
713
714 /*
715 * Encodes a single read.
716 *
717 * Returns 0 on success
718 * -1 on failure
719 */
720 static int cram_encode_slice_read(cram_fd *fd,
721 cram_container *c,
722 cram_block_compression_hdr *h,
723 cram_slice *s,
724 cram_record *cr,
725 int *last_pos) {
726 int r = 0;
727 int32_t i32;
728 unsigned char uc;
729
730 //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name);
731
732 //printf("BF=0x%x\n", cr->flags);
733 // bf = cram_flag_swap[cr->flags];
734 i32 = fd->cram_flag_swap[cr->flags & 0xfff];
735 r |= h->codecs[DS_BF]->encode(s, h->codecs[DS_BF], (char *)&i32, 1);
736
737 i32 = cr->cram_flags;
738 r |= h->codecs[DS_CF]->encode(s, h->codecs[DS_CF], (char *)&i32, 1);
739
740 if (CRAM_MAJOR_VERS(fd->version) != 1 && s->hdr->ref_seq_id == -2)
741 r |= h->codecs[DS_RI]->encode(s, h->codecs[DS_RI], (char *)&cr->ref_id, 1);
742
743 r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1);
744
745 if (c->pos_sorted) {
746 i32 = cr->apos - *last_pos;
747 r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
748 *last_pos = cr->apos;
749 } else {
750 i32 = cr->apos;
751 r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
752 }
753
754 r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1);
755
756 if (c->comp_hdr->read_names_included) {
757 // RN codec: Already stored in block[3].
758 }
759
760 if (cr->cram_flags & CRAM_FLAG_DETACHED) {
761 i32 = cr->mate_flags;
762 r |= h->codecs[DS_MF]->encode(s, h->codecs[DS_MF], (char *)&i32, 1);
763
764 if (!c->comp_hdr->read_names_included) {
765 // RN codec: Already stored in block[3].
766 }
767
768 r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS],
769 (char *)&cr->mate_ref_id, 1);
770
771 r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
772 (char *)&cr->mate_pos, 1);
773
774 r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
775 (char *)&cr->tlen, 1);
776 } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) {
777 r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF],
778 (char *)&cr->mate_line, 1);
779 }
780
781 /* Aux tags */
782 if (CRAM_MAJOR_VERS(fd->version) == 1) {
783 int j;
784 uc = cr->ntags;
785 r |= h->codecs[DS_TC]->encode(s, h->codecs[DS_TC], (char *)&uc, 1);
786
787 for (j = 0; j < cr->ntags; j++) {
788 uint32_t i32 = s->TN[cr->TN_idx + j]; // id
789 r |= h->codecs[DS_TN]->encode(s, h->codecs[DS_TN], (char *)&i32, 1);
790 }
791 } else {
792 r |= h->codecs[DS_TL]->encode(s, h->codecs[DS_TL], (char *)&cr->TL, 1);
793 }
794
795 // qual
796 // QS codec : Already stored in block[2].
797
798 // features (diffs)
799 if (!(cr->flags & BAM_FUNMAP)) {
800 int prev_pos = 0, j;
801
802 r |= h->codecs[DS_FN]->encode(s, h->codecs[DS_FN],
803 (char *)&cr->nfeature, 1);
804 for (j = 0; j < cr->nfeature; j++) {
805 cram_feature *f = &s->features[cr->feature + j];
806
807 uc = f->X.code;
808 r |= h->codecs[DS_FC]->encode(s, h->codecs[DS_FC], (char *)&uc, 1);
809 i32 = f->X.pos - prev_pos;
810 r |= h->codecs[DS_FP]->encode(s, h->codecs[DS_FP], (char *)&i32, 1);
811 prev_pos = f->X.pos;
812
813 switch(f->X.code) {
814 //char *seq;
815
816 case 'X':
817 //fprintf(stderr, " FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base);
818
819 uc = f->X.base;
820 r |= h->codecs[DS_BS]->encode(s, h->codecs[DS_BS],
821 (char *)&uc, 1);
822 break;
823 case 'S':
824 // Already done
825 // r |= h->codecs[DS_SC]->encode(s, h->codecs[DS_SC],
826 // BLOCK_DATA(s->soft_blk) + f->S.seq_idx,
827 // f->S.len);
828
829 // if (IS_CRAM_3_VERS(fd)) {
830 // r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
831 // BLOCK_DATA(s->seqs_blk) + f->S.seq_idx,
832 // f->S.len);
833 // }
834 break;
835 case 'I':
836 //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
837 //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
838 // seq, f->S.len);
839 // if (IS_CRAM_3_VERS(fd)) {
840 // r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
841 // BLOCK_DATA(s->seqs_blk) + f->I.seq_idx,
842 // f->I.len);
843 // }
844 break;
845 case 'i':
846 uc = f->i.base;
847 r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
848 (char *)&uc, 1);
849 //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
850 //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
851 // seq, 1);
852 break;
853 case 'D':
854 i32 = f->D.len;
855 r |= h->codecs[DS_DL]->encode(s, h->codecs[DS_DL],
856 (char *)&i32, 1);
857 break;
858
859 case 'B':
860 // // Used when we try to store a non ACGTN base or an N
861 // // that aligns against a non ACGTN reference
862
863 uc = f->B.base;
864 r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
865 (char *)&uc, 1);
866
867 // Already added
868 // uc = f->B.qual;
869 // r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
870 // (char *)&uc, 1);
871 break;
872
873 case 'b':
874 // string of bases
875 r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
876 (char *)BLOCK_DATA(s->seqs_blk)
877 + f->b.seq_idx,
878 f->b.len);
879 break;
880
881 case 'Q':
882 // Already added
883 // uc = f->B.qual;
884 // r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
885 // (char *)&uc, 1);
886 break;
887
888 case 'N':
889 i32 = f->N.len;
890 r |= h->codecs[DS_RS]->encode(s, h->codecs[DS_RS],
891 (char *)&i32, 1);
892 break;
893
894 case 'P':
895 i32 = f->P.len;
896 r |= h->codecs[DS_PD]->encode(s, h->codecs[DS_PD],
897 (char *)&i32, 1);
898 break;
899
900 case 'H':
901 i32 = f->H.len;
902 r |= h->codecs[DS_HC]->encode(s, h->codecs[DS_HC],
903 (char *)&i32, 1);
904 break;
905
906
907 default:
908 fprintf(stderr, "unhandled feature code %c\n",
909 f->X.code);
910 return -1;
911 }
912 }
913
914 r |= h->codecs[DS_MQ]->encode(s, h->codecs[DS_MQ],
915 (char *)&cr->mqual, 1);
916 } else {
917 char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
918 r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], seq, cr->len);
919 }
920
921 return r ? -1 : 0;
922 }
923
924
925 /*
926 * Applies various compression methods to specific blocks, depending on
927 * known observations of how data series compress.
928 *
929 * Returns 0 on success
930 * -1 on failure
931 */
932 static int cram_compress_slice(cram_fd *fd, cram_slice *s) {
933 int level = fd->level, i;
934 int method = 1<<GZIP | 1<<GZIP_RLE, methodF = method;
935
936 /* Compress the CORE Block too, with minimal zlib level */
937 if (level > 5 && s->block[0]->uncomp_size > 500)
938 cram_compress_block(fd, s->block[0], NULL, GZIP, 1);
939
940 if (fd->use_bz2)
941 method |= 1<<BZIP2;
942
943 if (fd->use_rans)
944 method |= (1<<RANS0) | (1<<RANS1);
945
946 if (fd->use_lzma)
947 method |= (1<<LZMA);
948
949 /* Faster method for data series we only need entropy encoding on */
950 methodF = method & ~(1<<GZIP | 1<<BZIP2 | 1<<LZMA);
951 if (level >= 6)
952 methodF = method;
953
954
955 /* Specific compression methods for certain block types */
956 if (cram_compress_block(fd, s->block[DS_IN], fd->m[DS_IN], //IN (seq)
957 method, level))
958 return -1;
959
960 if (fd->level == 0) {
961 /* Do nothing */
962 } else if (fd->level == 1) {
963 if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS],
964 methodF, 1))
965 return -1;
966 for (i = DS_aux; i <= DS_aux_oz; i++) {
967 if (s->block[i])
968 if (cram_compress_block(fd, s->block[i], fd->m[i],
969 method, 1))
970 return -1;
971 }
972 } else if (fd->level < 3) {
973 if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS],
974 method, 1))
975 return -1;
976 if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA],
977 method, 1))
978 return -1;
979 if (s->block[DS_BB])
980 if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB],
981 method, 1))
982 return -1;
983 for (i = DS_aux; i <= DS_aux_oz; i++) {
984 if (s->block[i])
985 if (cram_compress_block(fd, s->block[i], fd->m[i],
986 method, level))
987 return -1;
988 }
989 } else {
990 if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS],
991 method, level))
992 return -1;
993 if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA],
994 method, level))
995 return -1;
996 if (s->block[DS_BB])
997 if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB],
998 method, level))
999 return -1;
1000 for (i = DS_aux; i <= DS_aux_oz; i++) {
1001 if (s->block[i])
1002 if (cram_compress_block(fd, s->block[i], fd->m[i],
1003 method, level))
1004 return -1;
1005 }
1006 }
1007
1008 // NAME: best is generally xz, bzip2, zlib then rans1
1009 // It benefits well from a little bit extra compression level.
1010 if (cram_compress_block(fd, s->block[DS_RN], fd->m[DS_RN],
1011 method & ~(1<<RANS0 | 1<<GZIP_RLE),
1012 MIN(9,level)))
1013 return -1;
1014
1015 // NS shows strong local correlation as rearrangements are localised
1016 if (s->block[DS_NS] != s->block[0])
1017 if (cram_compress_block(fd, s->block[DS_NS], fd->m[DS_NS],
1018 method, level))
1019 return -1;
1020
1021
1022 /*
1023 * Minimal compression of any block still uncompressed, bar CORE
1024 */
1025 {
1026 int i;
1027 for (i = 1; i < DS_END; i++) {
1028 if (!s->block[i] || s->block[i] == s->block[0])
1029 continue;
1030
1031 // fast methods only
1032 if (s->block[i]->method == RAW) {
1033 cram_compress_block(fd, s->block[i], fd->m[i],
1034 methodF, level);
1035 }
1036 }
1037 }
1038
1039 return 0;
1040 }
1041
1042 /*
1043 * Encodes a single slice from a container
1044 *
1045 * Returns 0 on success
1046 * -1 on failure
1047 */
1048 static int cram_encode_slice(cram_fd *fd, cram_container *c,
1049 cram_block_compression_hdr *h, cram_slice *s) {
1050 int rec, r = 0, last_pos;
1051 int embed_ref;
1052 enum cram_DS_ID id;
1053
1054 embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 1 : 0;
1055
1056 /*
1057 * Slice external blocks:
1058 * ID 0 => base calls (insertions, soft-clip)
1059 * ID 1 => qualities
1060 * ID 2 => names
1061 * ID 3 => TS (insert size), NP (next frag)
1062 * ID 4 => tag values
1063 * ID 6 => tag IDs (TN), if CRAM_V1.0
1064 * ID 7 => TD tag dictionary, if !CRAM_V1.0
1065 */
1066
1067 /* Create cram slice header */
1068 s->hdr->ref_base_id = embed_ref ? DS_ref : -1;
1069 s->hdr->record_counter = c->num_records + c->record_counter;
1070 c->num_records += s->hdr->num_records;
1071
1072 s->block = calloc(DS_END, sizeof(s->block[0]));
1073 s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t));
1074 if (!s->block || !s->hdr->block_content_ids)
1075 return -1;
1076
1077 // Create first fixed blocks, always external.
1078 // CORE
1079 if (!(s->block[0] = cram_new_block(CORE, 0)))
1080 return -1;
1081
1082 // TN block for CRAM v1
1083 if (CRAM_MAJOR_VERS(fd->version) == 1) {
1084 if (h->codecs[DS_TN]->codec == E_EXTERNAL) {
1085 if (!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN))) return -1;
1086 h->codecs[DS_TN]->external.content_id = DS_TN;
1087 } else {
1088 s->block[DS_TN] = s->block[0];
1089 }
1090 s->block[DS_TN] = s->block[DS_TN];
1091 }
1092
1093 // Embedded reference
1094 if (embed_ref) {
1095 if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref)))
1096 return -1;
1097 s->ref_id = DS_ref; // needed?
1098 BLOCK_APPEND(s->block[DS_ref],
1099 c->ref + c->first_base - c->ref_start,
1100 c->last_base - c->first_base + 1);
1101 }
1102
1103 /*
1104 * All the data-series blocks if appropriate.
1105 */
1106 for (id = DS_BF; id < DS_TN; id++) {
1107 if (h->codecs[id] && (h->codecs[id]->codec == E_EXTERNAL ||
1108 h->codecs[id]->codec == E_BYTE_ARRAY_STOP ||
1109 h->codecs[id]->codec == E_BYTE_ARRAY_LEN)) {
1110 switch (h->codecs[id]->codec) {
1111 case E_EXTERNAL:
1112 if (!(s->block[id] = cram_new_block(EXTERNAL, id)))
1113 return -1;
1114 h->codecs[id]->external.content_id = id;
1115 break;
1116
1117 case E_BYTE_ARRAY_STOP:
1118 if (!(s->block[id] = cram_new_block(EXTERNAL, id)))
1119 return -1;
1120 h->codecs[id]->byte_array_stop.content_id = id;
1121 break;
1122
1123 case E_BYTE_ARRAY_LEN: {
1124 cram_codec *cc;
1125
1126 cc = h->codecs[id]->e_byte_array_len.len_codec;
1127 if (cc->codec == E_EXTERNAL) {
1128 int eid = cc->external.content_id;
1129 if (!(s->block[eid] = cram_new_block(EXTERNAL, eid)))
1130 return -1;
1131 cc->external.content_id = eid;
1132 cc->out = s->block[eid];
1133 }
1134
1135 cc = h->codecs[id]->e_byte_array_len.val_codec;
1136 if (cc->codec == E_EXTERNAL) {
1137 int eid = cc->external.content_id;
1138 if (!s->block[eid])
1139 if (!(s->block[eid] = cram_new_block(EXTERNAL, eid)))
1140 return -1;
1141 cc->external.content_id = eid;
1142 cc->out = s->block[eid];
1143 }
1144 break;
1145 }
1146 default:
1147 break;
1148 }
1149 } else {
1150 if (!(id == DS_BB && !h->codecs[DS_BB]))
1151 s->block[id] = s->block[0];
1152 }
1153 if (h->codecs[id])
1154 h->codecs[id]->out = s->block[id];
1155 }
1156
1157 /* Encode reads */
1158 last_pos = s->hdr->ref_seq_start;
1159 for (rec = 0; rec < s->hdr->num_records; rec++) {
1160 cram_record *cr = &s->crecs[rec];
1161 if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1)
1162 return -1;
1163 }
1164
1165 s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7);
1166 s->block[0]->comp_size = s->block[0]->uncomp_size;
1167
1168 // Make sure the fixed blocks point to the correct sources
1169 s->block[DS_IN] = s->base_blk; s->base_blk = NULL;
1170 s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL;
1171 s->block[DS_RN] = s->name_blk; s->name_blk = NULL;
1172 s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL;
1173 s->block[DS_aux]= s->aux_blk; s->aux_blk = NULL;
1174 s->block[DS_aux_OQ]= s->aux_OQ_blk; s->aux_OQ_blk = NULL;
1175 s->block[DS_aux_BQ]= s->aux_BQ_blk; s->aux_BQ_blk = NULL;
1176 s->block[DS_aux_BD]= s->aux_BD_blk; s->aux_BD_blk = NULL;
1177 s->block[DS_aux_BI]= s->aux_BI_blk; s->aux_BI_blk = NULL;
1178 s->block[DS_aux_FZ]= s->aux_FZ_blk; s->aux_FZ_blk = NULL;
1179 s->block[DS_aux_oq]= s->aux_oq_blk; s->aux_oq_blk = NULL;
1180 s->block[DS_aux_os]= s->aux_os_blk; s->aux_os_blk = NULL;
1181 s->block[DS_aux_oz]= s->aux_oz_blk; s->aux_oz_blk = NULL;
1182
1183 // Ensure block sizes are up to date.
1184 for (id = 1; id < DS_END; id++) {
1185 if (!s->block[id] || s->block[id] == s->block[0])
1186 continue;
1187
1188 if (s->block[id]->uncomp_size == 0)
1189 BLOCK_UPLEN(s->block[id]);
1190 }
1191
1192 // Compress it all
1193 if (cram_compress_slice(fd, s) == -1)
1194 return -1;
1195
1196 // Collapse empty blocks and create hdr_block
1197 {
1198 int i, j;
1199 for (i = j = 1; i < DS_END; i++) {
1200 if (!s->block[i] || s->block[i] == s->block[0])
1201 continue;
1202 if (s->block[i]->uncomp_size == 0) {
1203 cram_free_block(s->block[i]);
1204 s->block[i] = NULL;
1205 continue;
1206 }
1207 s->block[j] = s->block[i];
1208 s->hdr->block_content_ids[j-1] = s->block[i]->content_id;
1209 j++;
1210 }
1211 s->hdr->num_content_ids = j-1;
1212 s->hdr->num_blocks = j;
1213
1214 if (!(s->hdr_block = cram_encode_slice_header(fd, s)))
1215 return -1;
1216 }
1217
1218 return r ? -1 : 0;
1219 }
1220
1221 /*
1222 * Encodes all slices in a container into blocks.
1223 * Returns 0 on success
1224 * -1 on failure
1225 */
1226 int cram_encode_container(cram_fd *fd, cram_container *c) {
1227 int i, j, slice_offset;
1228 cram_block_compression_hdr *h = c->comp_hdr;
1229 cram_block *c_hdr;
1230 int multi_ref = 0;
1231 int r1, r2, sn, nref;
1232 spare_bams *spares;
1233
1234 /* Cache references up-front if we have unsorted access patterns */
1235 pthread_mutex_lock(&fd->ref_lock);
1236 nref = fd->refs->nref;
1237 pthread_mutex_unlock(&fd->ref_lock);
1238
1239 if (!fd->no_ref && c->refs_used) {
1240 for (i = 0; i < nref; i++) {
1241 if (c->refs_used[i])
1242 cram_get_ref(fd, i, 1, 0);
1243 }
1244 }
1245
1246 /* To create M5 strings */
1247 /* Fetch reference sequence */
1248 if (!fd->no_ref) {
1249 bam_seq_t *b = c->bams[0];
1250 char *ref;
1251
1252 ref = cram_get_ref(fd, bam_ref(b), 1, 0);
1253 if (!ref && bam_ref(b) >= 0) {
1254 fprintf(stderr, "Failed to load reference #%d\n", bam_ref(b));
1255 return -1;
1256 }
1257 if ((c->ref_id = bam_ref(b)) >= 0) {
1258 c->ref_seq_id = c->ref_id;
1259 c->ref = fd->refs->ref_id[c->ref_seq_id]->seq;
1260 c->ref_start = 1;
1261 c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length;
1262 } else {
1263 c->ref_seq_id = c->ref_id; // FIXME remove one var!
1264 }
1265 } else {
1266 c->ref_id = bam_ref(c->bams[0]);
1267 cram_ref_incr(fd->refs, c->ref_id);
1268 c->ref_seq_id = c->ref_id;
1269 }
1270
1271 /* Turn bams into cram_records and gather basic stats */
1272 for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) {
1273 cram_slice *s = c->slices[sn];
1274 int first_base = INT_MAX, last_base = INT_MIN;
1275
1276 assert(sn < c->curr_slice);
1277
1278 /* FIXME: we could create our slice objects here too instead of
1279 * in cram_put_bam_seq. It's more natural here and also this is
1280 * bit is threaded so it's less work in the main thread.
1281 */
1282
1283 for (r2 = 0; r1 < c->curr_c_rec && r2 < c->max_rec; r1++, r2++) {
1284 cram_record *cr = &s->crecs[r2];
1285 bam_seq_t *b = c->bams[r1];
1286
1287 /* If multi-ref we need to cope with changing reference per seq */
1288 if (c->multi_seq && !fd->no_ref) {
1289 if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) {
1290 if (c->ref_seq_id >= 0)
1291 cram_ref_decr(fd->refs, c->ref_seq_id);
1292
1293 if (!cram_get_ref(fd, bam_ref(b), 1, 0)) {
1294 fprintf(stderr, "Failed to load reference #%d\n",
1295 bam_ref(b));
1296 return -1;
1297 }
1298
1299 c->ref_seq_id = bam_ref(b); // overwritten later by -2
1300 assert(fd->refs->ref_id[c->ref_seq_id]->seq);
1301 c->ref = fd->refs->ref_id[c->ref_seq_id]->seq;
1302 c->ref_start = 1;
1303 c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length;
1304 }
1305 }
1306
1307 process_one_read(fd, c, s, cr, b, r2);
1308
1309 if (first_base > cr->apos)
1310 first_base = cr->apos;
1311
1312 if (last_base < cr->aend)
1313 last_base = cr->aend;
1314 }
1315
1316 if (c->multi_seq) {
1317 s->hdr->ref_seq_id = -2;
1318 s->hdr->ref_seq_start = 0;
1319 s->hdr->ref_seq_span = 0;
1320 } else {
1321 s->hdr->ref_seq_id = c->ref_id;
1322 s->hdr->ref_seq_start = first_base;
1323 s->hdr->ref_seq_span = last_base - first_base + 1;
1324 }
1325 s->hdr->num_records = r2;
1326 }
1327
1328 if (c->multi_seq && !fd->no_ref) {
1329 if (c->ref_seq_id >= 0)
1330 cram_ref_decr(fd->refs, c->ref_seq_id);
1331 }
1332
1333 /* Link our bams[] array onto the spare bam list for reuse */
1334 spares = malloc(sizeof(*spares));
1335 pthread_mutex_lock(&fd->bam_list_lock);
1336 spares->bams = c->bams;
1337 spares->next = fd->bl;
1338 fd->bl = spares;
1339 pthread_mutex_unlock(&fd->bam_list_lock);
1340 c->bams = NULL;
1341
1342 /* Detect if a multi-seq container */
1343 cram_stats_encoding(fd, c->stats[DS_RI]);
1344 multi_ref = c->stats[DS_RI]->nvals > 1;
1345
1346 if (multi_ref) {
1347 if (fd->verbose)
1348 fprintf(stderr, "Multi-ref container\n");
1349 c->ref_seq_id = -2;
1350 c->ref_seq_start = 0;
1351 c->ref_seq_span = 0;
1352 }
1353
1354
1355 /* Compute MD5s */
1356 for (i = 0; i < c->curr_slice; i++) {
1357 cram_slice *s = c->slices[i];
1358
1359 if (CRAM_MAJOR_VERS(fd->version) != 1) {
1360 if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !fd->no_ref) {
1361 MD5_CTX md5;
1362 MD5_Init(&md5);
1363 MD5_Update(&md5,
1364 c->ref + s->hdr->ref_seq_start - c->ref_start,
1365 s->hdr->ref_seq_span);
1366 MD5_Final(s->hdr->md5, &md5);
1367 } else {
1368 memset(s->hdr->md5, 0, 16);
1369 }
1370 }
1371 }
1372
1373 c->num_records = 0;
1374 c->num_blocks = 0;
1375 c->length = 0;
1376
1377 //fprintf(stderr, "=== BF ===\n");
1378 h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]),
1379 c->stats[DS_BF], E_INT, NULL,
1380 fd->version);
1381
1382 //fprintf(stderr, "=== CF ===\n");
1383 h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]),
1384 c->stats[DS_CF], E_INT, NULL,
1385 fd->version);
1386 // fprintf(stderr, "=== RN ===\n");
1387 // h->codecs[DS_RN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RN]),
1388 // c->stats[DS_RN], E_BYTE_ARRAY, NULL,
1389 // fd->version);
1390
1391 //fprintf(stderr, "=== AP ===\n");
1392 if (c->pos_sorted) {
1393 h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]),
1394 c->stats[DS_AP], E_INT, NULL,
1395 fd->version);
1396 } else {
1397 int p[2] = {0, c->max_apos};
1398 h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, E_INT, p,
1399 fd->version);
1400 }
1401
1402 //fprintf(stderr, "=== RG ===\n");
1403 h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]),
1404 c->stats[DS_RG], E_INT, NULL,
1405 fd->version);
1406
1407 //fprintf(stderr, "=== MQ ===\n");
1408 h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]),
1409 c->stats[DS_MQ], E_INT, NULL,
1410 fd->version);
1411
1412 //fprintf(stderr, "=== NS ===\n");
1413 h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]),
1414 c->stats[DS_NS], E_INT, NULL,
1415 fd->version);
1416
1417 //fprintf(stderr, "=== MF ===\n");
1418 h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]),
1419 c->stats[DS_MF], E_INT, NULL,
1420 fd->version);
1421
1422 //fprintf(stderr, "=== TS ===\n");
1423 h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]),
1424 c->stats[DS_TS], E_INT, NULL,
1425 fd->version);
1426 //fprintf(stderr, "=== NP ===\n");
1427 h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]),
1428 c->stats[DS_NP], E_INT, NULL,
1429 fd->version);
1430 //fprintf(stderr, "=== NF ===\n");
1431 h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]),
1432 c->stats[DS_NF], E_INT, NULL,
1433 fd->version);
1434
1435 //fprintf(stderr, "=== RL ===\n");
1436 h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]),
1437 c->stats[DS_RL], E_INT, NULL,
1438 fd->version);
1439
1440 //fprintf(stderr, "=== FN ===\n");
1441 h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]),
1442 c->stats[DS_FN], E_INT, NULL,
1443 fd->version);
1444
1445 //fprintf(stderr, "=== FC ===\n");
1446 h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]),
1447 c->stats[DS_FC], E_BYTE, NULL,
1448 fd->version);
1449
1450 //fprintf(stderr, "=== FP ===\n");
1451 h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]),
1452 c->stats[DS_FP], E_INT, NULL,
1453 fd->version);
1454
1455 //fprintf(stderr, "=== DL ===\n");
1456 h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]),
1457 c->stats[DS_DL], E_INT, NULL,
1458 fd->version);
1459
1460 //fprintf(stderr, "=== BA ===\n");
1461 h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]),
1462 c->stats[DS_BA], E_BYTE, NULL,
1463 fd->version);
1464
1465 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1466 cram_byte_array_len_encoder e;
1467
1468 e.len_encoding = E_EXTERNAL;
1469 e.len_dat = (void *)DS_BB_len;
1470 //e.len_dat = (void *)DS_BB;
1471
1472 e.val_encoding = E_EXTERNAL;
1473 e.val_dat = (void *)DS_BB;
1474
1475 h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
1476 E_BYTE_ARRAY, (void *)&e,
1477 fd->version);
1478 } else {
1479 h->codecs[DS_BB] = NULL;
1480 }
1481
1482 //fprintf(stderr, "=== BS ===\n");
1483 h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]),
1484 c->stats[DS_BS], E_BYTE, NULL,
1485 fd->version);
1486
1487 if (CRAM_MAJOR_VERS(fd->version) == 1) {
1488 h->codecs[DS_TL] = NULL;
1489 h->codecs[DS_RI] = NULL;
1490 h->codecs[DS_RS] = NULL;
1491 h->codecs[DS_PD] = NULL;
1492 h->codecs[DS_HC] = NULL;
1493 h->codecs[DS_SC] = NULL;
1494
1495 //fprintf(stderr, "=== TC ===\n");
1496 h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]),
1497 c->stats[DS_TC], E_BYTE, NULL,
1498 fd->version);
1499
1500 //fprintf(stderr, "=== TN ===\n");
1501 h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]),
1502 c->stats[DS_TN], E_INT, NULL,
1503 fd->version);
1504 } else {
1505 h->codecs[DS_TC] = NULL;
1506 h->codecs[DS_TN] = NULL;
1507
1508 //fprintf(stderr, "=== TL ===\n");
1509 h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]),
1510 c->stats[DS_TL], E_INT, NULL,
1511 fd->version);
1512
1513
1514 //fprintf(stderr, "=== RI ===\n");
1515 h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]),
1516 c->stats[DS_RI], E_INT, NULL,
1517 fd->version);
1518
1519 //fprintf(stderr, "=== RS ===\n");
1520 h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]),
1521 c->stats[DS_RS], E_INT, NULL,
1522 fd->version);
1523
1524 //fprintf(stderr, "=== PD ===\n");
1525 h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]),
1526 c->stats[DS_PD], E_INT, NULL,
1527 fd->version);
1528
1529 //fprintf(stderr, "=== HC ===\n");
1530 h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]),
1531 c->stats[DS_HC], E_INT, NULL,
1532 fd->version);
1533
1534 //fprintf(stderr, "=== SC ===\n");
1535 if (1) {
1536 int i2[2] = {0, DS_SC};
1537
1538 h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
1539 E_BYTE_ARRAY, (void *)i2,
1540 fd->version);
1541 } else {
1542 // Appears to be no practical benefit to using this method,
1543 // but it may work better if we start mixing SC, IN and BB
1544 // elements into the same external block.
1545 cram_byte_array_len_encoder e;
1546
1547 e.len_encoding = E_EXTERNAL;
1548 e.len_dat = (void *)DS_SC_len;
1549
1550 e.val_encoding = E_EXTERNAL;
1551 e.val_dat = (void *)DS_SC;
1552
1553 h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
1554 E_BYTE_ARRAY, (void *)&e,
1555 fd->version);
1556 }
1557 }
1558
1559 //fprintf(stderr, "=== IN ===\n");
1560 {
1561 int i2[2] = {0, DS_IN};
1562 h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
1563 E_BYTE_ARRAY, (void *)i2,
1564 fd->version);
1565 }
1566
1567 h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE,
1568 (void *)DS_QS,
1569 fd->version);
1570 {
1571 int i2[2] = {0, DS_RN};
1572 h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
1573 E_BYTE_ARRAY, (void *)i2,
1574 fd->version);
1575 }
1576
1577
1578 /* Encode slices */
1579 for (i = 0; i < c->curr_slice; i++) {
1580 if (fd->verbose)
1581 fprintf(stderr, "Encode slice %d\n", i);
1582 if (cram_encode_slice(fd, c, h, c->slices[i]) != 0)
1583 return -1;
1584 }
1585
1586 /* Create compression header */
1587 {
1588 h->ref_seq_id = c->ref_seq_id;
1589 h->ref_seq_start = c->ref_seq_start;
1590 h->ref_seq_span = c->ref_seq_span;
1591 h->num_records = c->num_records;
1592
1593 h->mapped_qs_included = 0; // fixme
1594 h->unmapped_qs_included = 0; // fixme
1595 // h->... fixme
1596 memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20);
1597
1598 if (!(c_hdr = cram_encode_compression_header(fd, c, h)))
1599 return -1;
1600 }
1601
1602 /* Compute landmarks */
1603 /* Fill out slice landmarks */
1604 c->num_landmarks = c->curr_slice;
1605 c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark));
1606 if (!c->landmark)
1607 return -1;
1608
1609 /*
1610 * Slice offset starts after the first block, so we need to simulate
1611 * writing it to work out the correct offset
1612 */
1613 {
1614 slice_offset = c_hdr->method == RAW
1615 ? c_hdr->uncomp_size
1616 : c_hdr->comp_size;
1617 slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
1618 itf8_size(c_hdr->content_id) +
1619 itf8_size(c_hdr->comp_size) +
1620 itf8_size(c_hdr->uncomp_size);
1621 }
1622
1623 c->ref_seq_id = c->slices[0]->hdr->ref_seq_id;
1624 c->ref_seq_start = c->slices[0]->hdr->ref_seq_start;
1625 c->ref_seq_span = c->slices[0]->hdr->ref_seq_span;
1626 for (i = 0; i < c->curr_slice; i++) {
1627 cram_slice *s = c->slices[i];
1628
1629 c->num_blocks += s->hdr->num_blocks + 2;
1630 c->landmark[i] = slice_offset;
1631
1632 if (s->hdr->ref_seq_start + s->hdr->ref_seq_span >
1633 c->ref_seq_start + c->ref_seq_span) {
1634 c->ref_seq_span = s->hdr->ref_seq_start + s->hdr->ref_seq_span
1635 - c->ref_seq_start;
1636 }
1637
1638 slice_offset += s->hdr_block->method == RAW
1639 ? s->hdr_block->uncomp_size
1640 : s->hdr_block->comp_size;
1641
1642 slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
1643 itf8_size(s->hdr_block->content_id) +
1644 itf8_size(s->hdr_block->comp_size) +
1645 itf8_size(s->hdr_block->uncomp_size);
1646
1647 for (j = 0; j < s->hdr->num_blocks; j++) {
1648 slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
1649 itf8_size(s->block[j]->content_id) +
1650 itf8_size(s->block[j]->comp_size) +
1651 itf8_size(s->block[j]->uncomp_size);
1652
1653 slice_offset += s->block[j]->method == RAW
1654 ? s->block[j]->uncomp_size
1655 : s->block[j]->comp_size;
1656 }
1657 }
1658 c->length += slice_offset; // just past the final slice
1659
1660 c->comp_hdr_block = c_hdr;
1661
1662 if (c->ref_seq_id >= 0) {
1663 cram_ref_decr(fd->refs, c->ref_seq_id);
1664 }
1665
1666 /* Cache references up-front if we have unsorted access patterns */
1667 if (!fd->no_ref && c->refs_used) {
1668 for (i = 0; i < fd->refs->nref; i++) {
1669 if (c->refs_used[i])
1670 cram_ref_decr(fd->refs, i);
1671 }
1672 }
1673
1674 return 0;
1675 }
1676
1677
1678 /*
1679 * Adds a feature code to a read within a slice. For purposes of minimising
1680 * memory allocations and fragmentation we have one array of features for all
1681 * reads within the slice. We return the index into this array for this new
1682 * feature.
1683 *
1684 * Returns feature index on success
1685 * -1 on failure.
1686 */
1687 static int cram_add_feature(cram_container *c, cram_slice *s,
1688 cram_record *r, cram_feature *f) {
1689 if (s->nfeatures >= s->afeatures) {
1690 s->afeatures = s->afeatures ? s->afeatures*2 : 1024;
1691 s->features = realloc(s->features, s->afeatures*sizeof(*s->features));
1692 if (!s->features)
1693 return -1;
1694 }
1695
1696 if (!r->nfeature++) {
1697 r->feature = s->nfeatures;
1698 cram_stats_add(c->stats[DS_FP], f->X.pos);
1699 } else {
1700 cram_stats_add(c->stats[DS_FP],
1701 f->X.pos - s->features[r->feature + r->nfeature-2].X.pos);
1702 }
1703 cram_stats_add(c->stats[DS_FC], f->X.code);
1704
1705 s->features[s->nfeatures++] = *f;
1706
1707 return 0;
1708 }
1709
1710 static int cram_add_substitution(cram_fd *fd, cram_container *c,
1711 cram_slice *s, cram_record *r,
1712 int pos, char base, char qual, char ref) {
1713 cram_feature f;
1714
1715 // seq=ACGTN vs ref=ACGT or seq=ACGT vs ref=ACGTN
1716 if (fd->L2[(uc)base]<4 || (fd->L2[(uc)base]<5 && fd->L2[(uc)ref]<4)) {
1717 f.X.pos = pos+1;
1718 f.X.code = 'X';
1719 f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f];
1720 cram_stats_add(c->stats[DS_BS], f.X.base);
1721 } else {
1722 f.B.pos = pos+1;
1723 f.B.code = 'B';
1724 f.B.base = base;
1725 f.B.qual = qual;
1726 cram_stats_add(c->stats[DS_BA], f.B.base);
1727 cram_stats_add(c->stats[DS_QS], f.B.qual);
1728 BLOCK_APPEND_CHAR(s->qual_blk, qual);
1729 }
1730 return cram_add_feature(c, s, r, &f);
1731 }
1732
1733 static int cram_add_bases(cram_fd *fd, cram_container *c,
1734 cram_slice *s, cram_record *r,
1735 int pos, int len, char *base) {
1736 cram_feature f;
1737
1738 f.b.pos = pos+1;
1739 f.b.code = 'b';
1740 f.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk);
1741 f.b.len = len;
1742
1743 return cram_add_feature(c, s, r, &f);
1744 }
1745
1746 static int cram_add_base(cram_fd *fd, cram_container *c,
1747 cram_slice *s, cram_record *r,
1748 int pos, char base, char qual) {
1749 cram_feature f;
1750 f.B.pos = pos+1;
1751 f.B.code = 'B';
1752 f.B.base = base;
1753 f.B.qual = qual;
1754 cram_stats_add(c->stats[DS_BA], base);
1755 cram_stats_add(c->stats[DS_QS], qual);
1756 BLOCK_APPEND_CHAR(s->qual_blk, qual);
1757 return cram_add_feature(c, s, r, &f);
1758 }
1759
1760 static int cram_add_quality(cram_fd *fd, cram_container *c,
1761 cram_slice *s, cram_record *r,
1762 int pos, char qual) {
1763 cram_feature f;
1764 f.Q.pos = pos+1;
1765 f.Q.code = 'Q';
1766 f.Q.qual = qual;
1767 cram_stats_add(c->stats[DS_QS], qual);
1768 BLOCK_APPEND_CHAR(s->qual_blk, qual);
1769 return cram_add_feature(c, s, r, &f);
1770 }
1771
1772 static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r,
1773 int pos, int len, char *base) {
1774 cram_feature f;
1775 f.D.pos = pos+1;
1776 f.D.code = 'D';
1777 f.D.len = len;
1778 cram_stats_add(c->stats[DS_DL], len);
1779 return cram_add_feature(c, s, r, &f);
1780 }
1781
1782 static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r,
1783 int pos, int len, char *base, int version) {
1784 cram_feature f;
1785 f.S.pos = pos+1;
1786 f.S.code = 'S';
1787 f.S.len = len;
1788 switch (CRAM_MAJOR_VERS(version)) {
1789 case 1:
1790 f.S.seq_idx = BLOCK_SIZE(s->base_blk);
1791 BLOCK_APPEND(s->base_blk, base, len);
1792 BLOCK_APPEND_CHAR(s->base_blk, '\0');
1793 break;
1794
1795 case 2:
1796 default:
1797 f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
1798 if (base) {
1799 BLOCK_APPEND(s->soft_blk, base, len);
1800 } else {
1801 int i;
1802 for (i = 0; i < len; i++)
1803 BLOCK_APPEND_CHAR(s->soft_blk, 'N');
1804 }
1805 BLOCK_APPEND_CHAR(s->soft_blk, '\0');
1806 break;
1807
1808 // default:
1809 // // v3.0 onwards uses BB data-series
1810 // f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
1811 }
1812 return cram_add_feature(c, s, r, &f);
1813 }
1814
1815 static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r,
1816 int pos, int len, char *base) {
1817 cram_feature f;
1818 f.S.pos = pos+1;
1819 f.S.code = 'H';
1820 f.S.len = len;
1821 cram_stats_add(c->stats[DS_HC], len);
1822 return cram_add_feature(c, s, r, &f);
1823 }
1824
1825 static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r,
1826 int pos, int len, char *base) {
1827 cram_feature f;
1828 f.S.pos = pos+1;
1829 f.S.code = 'N';
1830 f.S.len = len;
1831 cram_stats_add(c->stats[DS_RS], len);
1832 return cram_add_feature(c, s, r, &f);
1833 }
1834
1835 static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r,
1836 int pos, int len, char *base) {
1837 cram_feature f;
1838 f.S.pos = pos+1;
1839 f.S.code = 'P';
1840 f.S.len = len;
1841 cram_stats_add(c->stats[DS_PD], len);
1842 return cram_add_feature(c, s, r, &f);
1843 }
1844
1845 static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
1846 int pos, int len, char *base) {
1847 cram_feature f;
1848 f.I.pos = pos+1;
1849 if (len == 1) {
1850 char b = base ? *base : 'N';
1851 f.i.code = 'i';
1852 f.i.base = b;
1853 cram_stats_add(c->stats[DS_BA], b);
1854 } else {
1855 f.I.code = 'I';
1856 f.I.len = len;
1857 f.S.seq_idx = BLOCK_SIZE(s->base_blk);
1858 if (base) {
1859 BLOCK_APPEND(s->base_blk, base, len);
1860 } else {
1861 int i;
1862 for (i = 0; i < len; i++)
1863 BLOCK_APPEND_CHAR(s->base_blk, 'N');
1864 }
1865 BLOCK_APPEND_CHAR(s->base_blk, '\0');
1866 }
1867 return cram_add_feature(c, s, r, &f);
1868 }
1869
1870 /*
1871 * Encodes auxiliary data.
1872 * Returns the read-group parsed out of the BAM aux fields on success
1873 * NULL on failure or no rg present (FIXME)
1874 */
1875 static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c,
1876 cram_slice *s, cram_record *cr) {
1877 char *aux, *tmp, *rg = NULL;
1878 int aux_size = bam_blk_size(b) -
1879 ((char *)bam_aux(b) - (char *)&bam_ref(b));
1880
1881 /* Worst case is 1 nul char on every ??:Z: string, so +33% */
1882 BLOCK_GROW(s->aux_blk, aux_size*1.34+1);
1883 tmp = (char *)BLOCK_END(s->aux_blk);
1884
1885 aux = (char *)bam_aux(b);
1886 cr->TN_idx = s->nTN;
1887
1888 while (aux[0] != 0) {
1889 int32_t i32;
1890 int r;
1891
1892 if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
1893 rg = &aux[3];
1894 while (*aux++);
1895 continue;
1896 }
1897 if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
1898 while (*aux++);
1899 continue;
1900 }
1901 if (aux[0] == 'N' && aux[1] == 'M') {
1902 switch(aux[2]) {
1903 case 'A': case 'C': case 'c': aux+=4; break;
1904 case 'I': case 'i': case 'f': aux+=7; break;
1905 default:
1906 fprintf(stderr, "Unhandled type code for NM tag\n");
1907 return NULL;
1908 }
1909 continue;
1910 }
1911
1912 cr->ntags++;
1913
1914 i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2];
1915 kh_put(s_i2i, c->tags_used, i32, &r);
1916 if (-1 == r)
1917 return NULL;
1918
1919 if (s->nTN >= s->aTN) {
1920 s->aTN = s->aTN ? s->aTN*2 : 1024;
1921 if (!(s->TN = realloc(s->TN, s->aTN * sizeof(*s->TN))))
1922 return NULL;
1923 }
1924 s->TN[s->nTN++] = i32;
1925 cram_stats_add(c->stats[DS_TN], i32);
1926
1927 switch(aux[2]) {
1928 case 'A': case 'C': case 'c':
1929 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1930 *tmp++=*aux++;
1931 break;
1932
1933 case 'S': case 's':
1934 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1935 *tmp++=*aux++; *tmp++=*aux++;
1936 break;
1937
1938 case 'I': case 'i': case 'f':
1939 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1940 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1941 break;
1942
1943 case 'd':
1944 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1945 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1946 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1947 break;
1948
1949 case 'Z': case 'H':
1950 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1951 while ((*tmp++=*aux++));
1952 *tmp++ = '\t'; // stop byte
1953 break;
1954
1955 case 'B': {
1956 int type = aux[3], blen;
1957 uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) +
1958 (((unsigned char *)aux)[5]<< 8) +
1959 (((unsigned char *)aux)[6]<<16) +
1960 (((unsigned char *)aux)[7]<<24));
1961 // skip TN field
1962 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1963
1964 // We use BYTE_ARRAY_LEN with external length, so store that first
1965 switch (type) {
1966 case 'c': case 'C':
1967 blen = count;
1968 break;
1969 case 's': case 'S':
1970 blen = 2*count;
1971 break;
1972 case 'i': case 'I': case 'f':
1973 blen = 4*count;
1974 break;
1975 default:
1976 fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n",
1977 type);
1978 return NULL;
1979
1980 }
1981
1982 tmp += itf8_put(tmp, blen+5);
1983
1984 *tmp++=*aux++; // sub-type & length
1985 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1986
1987 // The tag data itself
1988 memcpy(tmp, aux, blen); tmp += blen; aux += blen;
1989
1990 //cram_stats_add(c->aux_B_stats, blen);
1991 break;
1992 }
1993 default:
1994 fprintf(stderr, "Unknown aux type '%c'\n", aux[2]);
1995 return NULL;
1996 }
1997 }
1998 cram_stats_add(c->stats[DS_TC], cr->ntags);
1999
2000 cr->aux = BLOCK_SIZE(s->aux_blk);
2001 cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux);
2002 BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk);
2003 assert(s->aux_blk->byte <= s->aux_blk->alloc);
2004
2005 return rg;
2006 }
2007
2008 /*
2009 * Encodes auxiliary data. Largely duplicated from above, but done so to
2010 * keep it simple and avoid a myriad of version ifs.
2011 *
2012 * Returns the read-group parsed out of the BAM aux fields on success
2013 * NULL on failure or no rg present (FIXME)
2014 */
2015 static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
2016 cram_slice *s, cram_record *cr) {
2017 char *aux, *orig, *tmp, *rg = NULL;
2018 int aux_size = bam_get_l_aux(b);
2019 cram_block *td_b = c->comp_hdr->TD_blk;
2020 int TD_blk_size = BLOCK_SIZE(td_b), new;
2021 char *key;
2022 khint_t k;
2023
2024
2025 /* Worst case is 1 nul char on every ??:Z: string, so +33% */
2026 BLOCK_GROW(s->aux_blk, aux_size*1.34+1);
2027 tmp = (char *)BLOCK_END(s->aux_blk);
2028
2029
2030 orig = aux = (char *)bam_aux(b);
2031
2032 // Copy aux keys to td_b and aux values to s->aux_blk
2033 while (aux - orig < aux_size && aux[0] != 0) {
2034 uint32_t i32;
2035 int r;
2036
2037 if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
2038 rg = &aux[3];
2039 while (*aux++);
2040 continue;
2041 }
2042 if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
2043 while (*aux++);
2044 continue;
2045 }
2046 if (aux[0] == 'N' && aux[1] == 'M') {
2047 switch(aux[2]) {
2048 case 'A': case 'C': case 'c': aux+=4; break;
2049 case 'S': case 's': aux+=5; break;
2050 case 'I': case 'i': case 'f': aux+=7; break;
2051 default:
2052 fprintf(stderr, "Unhandled type code for NM tag\n");
2053 return NULL;
2054 }
2055 continue;
2056 }
2057
2058 BLOCK_APPEND(td_b, aux, 3);
2059
2060 i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2];
2061 kh_put(s_i2i, c->tags_used, i32, &r);
2062 if (-1 == r)
2063 return NULL;
2064
2065 // BQ:Z
2066 if (aux[0] == 'B' && aux[1] == 'Q' && aux[2] == 'Z') {
2067 char *tmp;
2068 if (!s->aux_BQ_blk)
2069 if (!(s->aux_BQ_blk = cram_new_block(EXTERNAL, DS_aux_BQ)))
2070 return NULL;
2071 BLOCK_GROW(s->aux_BQ_blk, aux_size*1.34+1);
2072 tmp = (char *)BLOCK_END(s->aux_BQ_blk);
2073 aux += 3;
2074 while ((*tmp++=*aux++));
2075 *tmp++ = '\t';
2076 BLOCK_SIZE(s->aux_BQ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BQ_blk);
2077 continue;
2078 }
2079
2080 // BD:Z
2081 if (aux[0] == 'B' && aux[1]=='D' && aux[2] == 'Z') {
2082 char *tmp;
2083 if (!s->aux_BD_blk)
2084 if (!(s->aux_BD_blk = cram_new_block(EXTERNAL, DS_aux_BD)))
2085 return NULL;
2086 BLOCK_GROW(s->aux_BD_blk, aux_size*1.34+1);
2087 tmp = (char *)BLOCK_END(s->aux_BD_blk);
2088 aux += 3;
2089 while ((*tmp++=*aux++));
2090 *tmp++ = '\t';
2091 BLOCK_SIZE(s->aux_BD_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BD_blk);
2092 continue;
2093 }
2094
2095 // BI:Z
2096 if (aux[0] == 'B' && aux[1]=='I' && aux[2] == 'Z') {
2097 char *tmp;
2098 if (!s->aux_BI_blk)
2099 if (!(s->aux_BI_blk = cram_new_block(EXTERNAL, DS_aux_BI)))
2100 return NULL;
2101 BLOCK_GROW(s->aux_BI_blk, aux_size*1.34+1);
2102 tmp = (char *)BLOCK_END(s->aux_BI_blk);
2103 aux += 3;
2104 while ((*tmp++=*aux++));
2105 *tmp++ = '\t';
2106 BLOCK_SIZE(s->aux_BI_blk) = (uc *)tmp - BLOCK_DATA(s->aux_BI_blk);
2107 continue;
2108 }
2109
2110 // OQ:Z:
2111 if (aux[0] == 'O' && aux[1] == 'Q' && aux[2] == 'Z') {
2112 char *tmp;
2113 if (!s->aux_OQ_blk)
2114 if (!(s->aux_OQ_blk = cram_new_block(EXTERNAL, DS_aux_OQ)))
2115 return NULL;
2116 BLOCK_GROW(s->aux_OQ_blk, aux_size*1.34+1);
2117 tmp = (char *)BLOCK_END(s->aux_OQ_blk);
2118 aux += 3;
2119 while ((*tmp++=*aux++));
2120 *tmp++ = '\t';
2121 BLOCK_SIZE(s->aux_OQ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_OQ_blk);
2122 continue;
2123 }
2124
2125 // FZ:B or ZM:B
2126 if ((aux[0] == 'F' && aux[1] == 'Z' && aux[2] == 'B') ||
2127 (aux[0] == 'Z' && aux[1] == 'M' && aux[2] == 'B')) {
2128 int type = aux[3], blen;
2129 uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) +
2130 (((unsigned char *)aux)[5]<< 8) +
2131 (((unsigned char *)aux)[6]<<16) +
2132 (((unsigned char *)aux)[7]<<24));
2133 char *tmp;
2134 if (!s->aux_FZ_blk)
2135 if (!(s->aux_FZ_blk = cram_new_block(EXTERNAL, DS_aux_FZ)))
2136 return NULL;
2137 BLOCK_GROW(s->aux_FZ_blk, aux_size*1.34+1);
2138 tmp = (char *)BLOCK_END(s->aux_FZ_blk);
2139
2140 // skip TN field
2141 aux+=3;
2142
2143 // We use BYTE_ARRAY_LEN with external length, so store that first
2144 switch (type) {
2145 case 'c': case 'C':
2146 blen = count;
2147 break;
2148 case 's': case 'S':
2149 blen = 2*count;
2150 break;
2151 case 'i': case 'I': case 'f':
2152 blen = 4*count;
2153 break;
2154 default:
2155 fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n",
2156 type);
2157 return NULL;
2158
2159 }
2160
2161 blen += 5; // sub-type & length
2162 tmp += itf8_put(tmp, blen);
2163
2164 // The tag data itself
2165 memcpy(tmp, aux, blen); tmp += blen; aux += blen;
2166
2167 BLOCK_SIZE(s->aux_FZ_blk) = (uc *)tmp - BLOCK_DATA(s->aux_FZ_blk);
2168 continue;
2169 }
2170
2171 // Other quality data - {Q2,E2,U2,CQ}:Z and similar
2172 if (((aux[0] == 'Q' && aux[1] == '2') ||
2173 (aux[0] == 'U' && aux[1] == '2') ||
2174 (aux[0] == 'Q' && aux[1] == 'T') ||
2175 (aux[0] == 'C' && aux[1] == 'Q')) && aux[2] == 'Z') {
2176 char *tmp;
2177 if (!s->aux_oq_blk)
2178 if (!(s->aux_oq_blk = cram_new_block(EXTERNAL, DS_aux_oq)))
2179 return NULL;
2180 BLOCK_GROW(s->aux_oq_blk, aux_size*1.34+1);
2181 tmp = (char *)BLOCK_END(s->aux_oq_blk);
2182 aux += 3;
2183 while ((*tmp++=*aux++));
2184 *tmp++ = '\t';
2185 BLOCK_SIZE(s->aux_oq_blk) = (uc *)tmp - BLOCK_DATA(s->aux_oq_blk);
2186 continue;
2187 }
2188
2189 // Other sequence data - {R2,E2,CS,BC,RT}:Z and similar
2190 if (((aux[0] == 'R' && aux[1] == '2') ||
2191 (aux[0] == 'E' && aux[1] == '2') ||
2192 (aux[0] == 'C' && aux[1] == 'S') ||
2193 (aux[0] == 'B' && aux[1] == 'C') ||
2194 (aux[0] == 'R' && aux[1] == 'T')) && aux[2] == 'Z') {
2195 char *tmp;
2196 if (!s->aux_os_blk)
2197 if (!(s->aux_os_blk = cram_new_block(EXTERNAL, DS_aux_os)))
2198 return NULL;
2199 BLOCK_GROW(s->aux_os_blk, aux_size*1.34+1);
2200 tmp = (char *)BLOCK_END(s->aux_os_blk);
2201 aux += 3;
2202 while ((*tmp++=*aux++));
2203 *tmp++ = '\t';
2204 BLOCK_SIZE(s->aux_os_blk) = (uc *)tmp - BLOCK_DATA(s->aux_os_blk);
2205 continue;
2206 }
2207
2208
2209 switch(aux[2]) {
2210 case 'A': case 'C': case 'c':
2211 aux+=3;
2212 *tmp++=*aux++;
2213 break;
2214
2215 case 'S': case 's':
2216 aux+=3;
2217 *tmp++=*aux++; *tmp++=*aux++;
2218 break;
2219
2220 case 'I': case 'i': case 'f':
2221 aux+=3;
2222 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2223 break;
2224
2225 case 'd':
2226 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2227 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2228 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2229 break;
2230
2231 case 'Z': case 'H':
2232 {
2233 char *tmp;
2234 if (!s->aux_oz_blk)
2235 if (!(s->aux_oz_blk = cram_new_block(EXTERNAL, DS_aux_oz)))
2236 return NULL;
2237 BLOCK_GROW(s->aux_oz_blk, aux_size*1.34+1);
2238 tmp = (char *)BLOCK_END(s->aux_oz_blk);
2239 aux += 3;
2240 while ((*tmp++=*aux++));
2241 *tmp++ = '\t';
2242 BLOCK_SIZE(s->aux_oz_blk) = (uc *)tmp -
2243 BLOCK_DATA(s->aux_oz_blk);
2244 }
2245 break;
2246
2247 case 'B': {
2248 int type = aux[3], blen;
2249 uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) +
2250 (((unsigned char *)aux)[5]<< 8) +
2251 (((unsigned char *)aux)[6]<<16) +
2252 (((unsigned char *)aux)[7]<<24));
2253 // skip TN field
2254 aux+=3;
2255
2256 // We use BYTE_ARRAY_LEN with external length, so store that first
2257 switch (type) {
2258 case 'c': case 'C':
2259 blen = count;
2260 break;
2261 case 's': case 'S':
2262 blen = 2*count;
2263 break;
2264 case 'i': case 'I': case 'f':
2265 blen = 4*count;
2266 break;
2267 default:
2268 fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n",
2269 type);
2270 return NULL;
2271
2272 }
2273
2274 blen += 5; // sub-type & length
2275 tmp += itf8_put(tmp, blen);
2276
2277 // The tag data itself
2278 memcpy(tmp, aux, blen); tmp += blen; aux += blen;
2279
2280 //cram_stats_add(c->aux_B_stats, blen);
2281 break;
2282 }
2283 default:
2284 fprintf(stderr, "Unknown aux type '%c'\n", aux[2]);
2285 return NULL;
2286 }
2287 }
2288
2289 // FIXME: sort BLOCK_DATA(td_b) by char[3] triples
2290
2291 // And and increment TD hash entry
2292 BLOCK_APPEND_CHAR(td_b, 0);
2293
2294 // Duplicate key as BLOCK_DATA() can be realloced to a new pointer.
2295 key = string_ndup(c->comp_hdr->TD_keys,
2296 (char *)BLOCK_DATA(td_b) + TD_blk_size,
2297 BLOCK_SIZE(td_b) - TD_blk_size);
2298 k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new);
2299 if (new < 0) {
2300 return NULL;
2301 } else if (new == 0) {
2302 BLOCK_SIZE(td_b) = TD_blk_size;
2303 } else {
2304 kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL;
2305 c->comp_hdr->nTL++;
2306 }
2307
2308 cr->TL = kh_val(c->comp_hdr->TD_hash, k);
2309 cram_stats_add(c->stats[DS_TL], cr->TL);
2310
2311 cr->aux = BLOCK_SIZE(s->aux_blk);
2312 cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux);
2313 BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk);
2314 assert(s->aux_blk->byte <= s->aux_blk->alloc);
2315
2316 return rg;
2317 }
2318
2319
2320 /*
2321 * Handles creation of a new container or new slice, flushing any
2322 * existing containers when appropriate.
2323 *
2324 * Really this is next slice, which may or may not lead to a new container.
2325 *
2326 * Returns cram_container pointer on success
2327 * NULL on failure.
2328 */
2329 static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
2330 cram_container *c = fd->ctr;
2331 cram_slice *s;
2332 int i;
2333
2334 /* First occurence */
2335 if (c->curr_ref == -2)
2336 c->curr_ref = bam_ref(b);
2337
2338 if (c->slice) {
2339 s = c->slice;
2340 if (c->multi_seq) {
2341 s->hdr->ref_seq_id = -2;
2342 s->hdr->ref_seq_start = 0;
2343 s->hdr->ref_seq_span = 0;
2344 } else {
2345 s->hdr->ref_seq_id = c->curr_ref;
2346 s->hdr->ref_seq_start = c->first_base;
2347 s->hdr->ref_seq_span = c->last_base - c->first_base + 1;
2348 }
2349 s->hdr->num_records = c->curr_rec;
2350
2351 if (c->curr_slice == 0) {
2352 if (c->ref_seq_id != s->hdr->ref_seq_id)
2353 c->ref_seq_id = s->hdr->ref_seq_id;
2354 c->ref_seq_start = c->first_base;
2355 }
2356
2357 c->curr_slice++;
2358 }
2359
2360 /* Flush container */
2361 if (c->curr_slice == c->max_slice ||
2362 (bam_ref(b) != c->curr_ref && !c->multi_seq)) {
2363 c->ref_seq_span = fd->last_base - c->ref_seq_start + 1;
2364 if (fd->verbose)
2365 fprintf(stderr, "Flush container %d/%d..%d\n",
2366 c->ref_seq_id, c->ref_seq_start,
2367 c->ref_seq_start + c->ref_seq_span -1);
2368
2369 /* Encode slices */
2370 if (fd->pool) {
2371 if (-1 == cram_flush_container_mt(fd, c))
2372 return NULL;
2373 } else {
2374 if (-1 == cram_flush_container(fd, c))
2375 return NULL;
2376
2377 // Move to sep func, as we need cram_flush_container for
2378 // the closing phase to flush the partial container.
2379 for (i = 0; i < c->max_slice; i++) {
2380 cram_free_slice(c->slices[i]);
2381 c->slices[i] = NULL;
2382 }
2383
2384 c->slice = NULL;
2385 c->curr_slice = 0;
2386
2387 /* Easy approach for purposes of freeing stats */
2388 cram_free_container(c);
2389 }
2390
2391 c = fd->ctr = cram_new_container(fd->seqs_per_slice,
2392 fd->slices_per_container);
2393 if (!c)
2394 return NULL;
2395 c->record_counter = fd->record_counter;
2396 c->curr_ref = bam_ref(b);
2397 }
2398
2399 c->last_pos = c->first_base = c->last_base = bam_pos(b)+1;
2400
2401 /* New slice */
2402 c->slice = c->slices[c->curr_slice] =
2403 cram_new_slice(MAPPED_SLICE, c->max_rec);
2404 if (!c->slice)
2405 return NULL;
2406
2407 if (c->multi_seq) {
2408 c->slice->hdr->ref_seq_id = -2;
2409 c->slice->hdr->ref_seq_start = 0;
2410 c->slice->last_apos = 1;
2411 } else {
2412 c->slice->hdr->ref_seq_id = bam_ref(b);
2413 // wrong for unsorted data, will fix during encoding.
2414 c->slice->hdr->ref_seq_start = bam_pos(b)+1;
2415 c->slice->last_apos = bam_pos(b)+1;
2416 }
2417
2418 c->curr_rec = 0;
2419
2420 return c;
2421 }
2422
2423 /*
2424 * Converts a single bam record into a cram record.
2425 * Possibly used within a thread.
2426 *
2427 * Returns 0 on success;
2428 * -1 on failure
2429 */
2430 static int process_one_read(cram_fd *fd, cram_container *c,
2431 cram_slice *s, cram_record *cr,
2432 bam_seq_t *b, int rnum) {
2433 int i, fake_qual = -1;
2434 char *cp, *rg;
2435 char *ref, *seq, *qual;
2436
2437 // FIXME: multi-ref containers
2438
2439 ref = c->ref;
2440 cr->len = bam_seq_len(b); cram_stats_add(c->stats[DS_RL], cr->len);
2441
2442 //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg);
2443
2444 // Fields to resolve later
2445 //cr->mate_line; // index to another cram_record
2446 //cr->mate_flags; // MF
2447 //cr->ntags; // TC
2448 cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
2449 if (CRAM_MAJOR_VERS(fd->version) == 1)
2450 rg = cram_encode_aux_1_0(fd, b, c, s, cr);
2451 else
2452 rg = cram_encode_aux(fd, b, c, s, cr);
2453
2454 //cr->aux_size = b->blk_size - ((char *)bam_aux(b) - (char *)&bam_ref(b));
2455 //cr->aux = DSTRING_LEN(s->aux_ds);
2456 //dstring_nappend(s->aux_ds, bam_aux(b), cr->aux_size);
2457
2458 /* Read group, identified earlier */
2459 if (rg) {
2460 SAM_RG *brg = sam_hdr_find_rg(fd->header, rg);
2461 cr->rg = brg ? brg->id : -1;
2462 } else if (CRAM_MAJOR_VERS(fd->version) == 1) {
2463 SAM_RG *brg = sam_hdr_find_rg(fd->header, "UNKNOWN");
2464 assert(brg);
2465 } else {
2466 cr->rg = -1;
2467 }
2468 cram_stats_add(c->stats[DS_RG], cr->rg);
2469
2470
2471 cr->ref_id = bam_ref(b); cram_stats_add(c->stats[DS_RI], cr->ref_id);
2472 cr->flags = bam_flag(b);
2473 if (bam_cigar_len(b) == 0)
2474 cr->flags |= BAM_FUNMAP;
2475 cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]);
2476
2477 // Non reference based encoding means storing the bases verbatim as features, which in
2478 // turn means every base also has a quality already stored.
2479 if (!fd->no_ref || CRAM_MAJOR_VERS(fd->version) >= 3)
2480 cr->cram_flags = CRAM_FLAG_PRESERVE_QUAL_SCORES;
2481 else
2482 cr->cram_flags = 0;
2483 //cram_stats_add(c->stats[DS_CF], cr->cram_flags);
2484
2485 c->num_bases += cr->len;
2486 cr->apos = bam_pos(b)+1;
2487 if (c->pos_sorted) {
2488 if (cr->apos < s->last_apos) {
2489 c->pos_sorted = 0;
2490 } else {
2491 cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos);
2492 s->last_apos = cr->apos;
2493 }
2494 } else {
2495 //cram_stats_add(c->stats[DS_AP], cr->apos);
2496 }
2497 c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos);
2498
2499 cr->name = BLOCK_SIZE(s->name_blk);
2500 cr->name_len = bam_name_len(b);
2501 cram_stats_add(c->stats[DS_RN], cr->name_len);
2502
2503 BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b));
2504
2505
2506 /*
2507 * This seqs_ds is largely pointless and it could reuse the same memory
2508 * over and over.
2509 * s->base_blk is what we need for encoding.
2510 */
2511 cr->seq = BLOCK_SIZE(s->seqs_blk);
2512 cr->qual = BLOCK_SIZE(s->qual_blk);
2513 BLOCK_GROW(s->seqs_blk, cr->len+1);
2514 BLOCK_GROW(s->qual_blk, cr->len);
2515 seq = cp = (char *)BLOCK_END(s->seqs_blk);
2516
2517 *seq = 0;
2518 #ifdef ALLOW_UAC
2519 {
2520 // Convert seq 2 bases at a time for speed.
2521 static const uint16_t code2base[256] = {
2522 15677, 16701, 17213, 19773, 18237, 21053, 21309, 22077,
2523 21565, 22333, 22845, 18493, 19261, 17469, 16957, 20029,
2524 15681, 16705, 17217, 19777, 18241, 21057, 21313, 22081,
2525 21569, 22337, 22849, 18497, 19265, 17473, 16961, 20033,
2526 15683, 16707, 17219, 19779, 18243, 21059, 21315, 22083,
2527 21571, 22339, 22851, 18499, 19267, 17475, 16963, 20035,
2528 15693, 16717, 17229, 19789, 18253, 21069, 21325, 22093,
2529 21581, 22349, 22861, 18509, 19277, 17485, 16973, 20045,
2530 15687, 16711, 17223, 19783, 18247, 21063, 21319, 22087,
2531 21575, 22343, 22855, 18503, 19271, 17479, 16967, 20039,
2532 15698, 16722, 17234, 19794, 18258, 21074, 21330, 22098,
2533 21586, 22354, 22866, 18514, 19282, 17490, 16978, 20050,
2534 15699, 16723, 17235, 19795, 18259, 21075, 21331, 22099,
2535 21587, 22355, 22867, 18515, 19283, 17491, 16979, 20051,
2536 15702, 16726, 17238, 19798, 18262, 21078, 21334, 22102,
2537 21590, 22358, 22870, 18518, 19286, 17494, 16982, 20054,
2538 15700, 16724, 17236, 19796, 18260, 21076, 21332, 22100,
2539 21588, 22356, 22868, 18516, 19284, 17492, 16980, 20052,
2540 15703, 16727, 17239, 19799, 18263, 21079, 21335, 22103,
2541 21591, 22359, 22871, 18519, 19287, 17495, 16983, 20055,
2542 15705, 16729, 17241, 19801, 18265, 21081, 21337, 22105,
2543 21593, 22361, 22873, 18521, 19289, 17497, 16985, 20057,
2544 15688, 16712, 17224, 19784, 18248, 21064, 21320, 22088,
2545 21576, 22344, 22856, 18504, 19272, 17480, 16968, 20040,
2546 15691, 16715, 17227, 19787, 18251, 21067, 21323, 22091,
2547 21579, 22347, 22859, 18507, 19275, 17483, 16971, 20043,
2548 15684, 16708, 17220, 19780, 18244, 21060, 21316, 22084,
2549 21572, 22340, 22852, 18500, 19268, 17476, 16964, 20036,
2550 15682, 16706, 17218, 19778, 18242, 21058, 21314, 22082,
2551 21570, 22338, 22850, 18498, 19266, 17474, 16962, 20034,
2552 15694, 16718, 17230, 19790, 18254, 21070, 21326, 22094,
2553 21582, 22350, 22862, 18510, 19278, 17486, 16974, 20046
2554 };
2555
2556 int l2 = cr->len / 2;
2557 unsigned char *from = (unsigned char *)bam_seq(b);
2558 uint16_t *cpi = (uint16_t *)cp;
2559 cp[0] = 0;
2560 for (i = 0; i < l2; i++)
2561 cpi[i] = le_int2(code2base[from[i]]);
2562 if ((i *= 2) < cr->len)
2563 cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)];
2564 }
2565 #else
2566 for (i = 0; i < cr->len; i++)
2567 cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)];
2568 #endif
2569 BLOCK_SIZE(s->seqs_blk) += cr->len;
2570
2571 qual = cp = (char *)bam_qual(b);
2572
2573 /* Copy and parse */
2574 if (!(cr->flags & BAM_FUNMAP)) {
2575 int32_t *cig_to, *cig_from;
2576 int apos = cr->apos-1, spos = 0;
2577
2578 cr->cigar = s->ncigar;
2579 cr->ncigar = bam_cigar_len(b);
2580 while (cr->cigar + cr->ncigar >= s->cigar_alloc) {
2581 s->cigar_alloc = s->cigar_alloc ? s->cigar_alloc*2 : 1024;
2582 s->cigar = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar));
2583 if (!s->cigar)
2584 return -1;
2585 }
2586
2587 cig_to = (int32_t *)s->cigar;
2588 cig_from = (int32_t *)bam_cigar(b);
2589
2590 cr->feature = 0;
2591 cr->nfeature = 0;
2592 for (i = 0; i < cr->ncigar; i++) {
2593 enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK;
2594 int cig_len = cig_from[i] >> BAM_CIGAR_SHIFT;
2595 cig_to[i] = cig_from[i];
2596
2597 /* Can also generate events from here for CRAM diffs */
2598
2599 switch (cig_op) {
2600 int l;
2601
2602 // Don't trust = and X ops to be correct.
2603 case BAM_CMATCH:
2604 case BAM_CBASE_MATCH:
2605 case BAM_CBASE_MISMATCH:
2606 //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n",
2607 // cig_len, &ref[apos], cig_len, &seq[spos]);
2608 l = 0;
2609 if (!fd->no_ref && cr->len) {
2610 int end = cig_len+apos < c->ref_end
2611 ? cig_len : c->ref_end - apos;
2612 char *sp = &seq[spos];
2613 char *rp = &ref[apos];
2614 char *qp = &qual[spos];
2615 for (l = 0; l < end; l++) {
2616 if (rp[l] != sp[l]) {
2617 if (!sp[l])
2618 break;
2619 if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) {
2620 // Disabled for the time being as it doesn't
2621 // seem to gain us much.
2622 int ol=l;
2623 while (l<end && rp[l] != sp[l])
2624 l++;
2625 if (l-ol > 1) {
2626 if (cram_add_bases(fd, c, s, cr, spos+ol,
2627 l-ol, &seq[spos+ol]))
2628 return -1;
2629 l--;
2630 } else {
2631 l = ol;
2632 if (cram_add_substitution(fd, c, s, cr,
2633 spos+l, sp[l],
2634 qp[l], rp[l]))
2635 return -1;
2636 }
2637 } else {
2638 if (cram_add_substitution(fd, c, s, cr, spos+l,
2639 sp[l], qp[l], rp[l]))
2640 return -1;
2641 }
2642 }
2643 }
2644 spos += l;
2645 apos += l;
2646 }
2647
2648 if (l < cig_len && cr->len) {
2649 if (fd->no_ref) {
2650 if (CRAM_MAJOR_VERS(fd->version) == 3) {
2651 if (cram_add_bases(fd, c, s, cr, spos,
2652 cig_len-l, &seq[spos]))
2653 return -1;
2654 spos += cig_len-l;
2655 } else {
2656 for (; l < cig_len && seq[spos]; l++, spos++) {
2657 if (cram_add_base(fd, c, s, cr, spos,
2658 seq[spos], qual[spos]))
2659 return -1;
2660 }
2661 }
2662 } else {
2663 /* off end of sequence or non-ref based output */
2664 for (; l < cig_len && seq[spos]; l++, spos++) {
2665 if (cram_add_base(fd, c, s, cr, spos,
2666 seq[spos], qual[spos]))
2667 return -1;
2668 }
2669 }
2670 apos += cig_len;
2671 } else if (!cr->len) {
2672 /* Seq "*" */
2673 apos += cig_len;
2674 spos += cig_len;
2675 }
2676 break;
2677
2678 case BAM_CDEL:
2679 if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos]))
2680 return -1;
2681 apos += cig_len;
2682 break;
2683
2684 case BAM_CREF_SKIP:
2685 if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos]))
2686 return -1;
2687 apos += cig_len;
2688 break;
2689
2690 case BAM_CINS:
2691 if (cram_add_insertion(c, s, cr, spos, cig_len,
2692 cr->len ? &seq[spos] : NULL))
2693 return -1;
2694 if (fd->no_ref && cr->len) {
2695 for (l = 0; l < cig_len; l++, spos++) {
2696 cram_add_quality(fd, c, s, cr, spos, qual[spos]);
2697 }
2698 } else {
2699 spos += cig_len;
2700 }
2701 break;
2702
2703 case BAM_CSOFT_CLIP:
2704 if (cram_add_softclip(c, s, cr, spos, cig_len,
2705 cr->len ? &seq[spos] : NULL,
2706 fd->version))
2707 return -1;
2708 if (fd->no_ref &&
2709 !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
2710 if (cr->len) {
2711 for (l = 0; l < cig_len; l++, spos++) {
2712 cram_add_quality(fd, c, s, cr, spos, qual[spos]);
2713 }
2714 } else {
2715 for (l = 0; l < cig_len; l++, spos++) {
2716 cram_add_quality(fd, c, s, cr, spos, -1);
2717 }
2718 }
2719 } else {
2720 spos += cig_len;
2721 }
2722 break;
2723
2724 case BAM_CHARD_CLIP:
2725 if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos]))
2726 return -1;
2727 break;
2728
2729 case BAM_CPAD:
2730 if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos]))
2731 return -1;
2732 break;
2733 }
2734 }
2735 fake_qual = spos;
2736 cr->aend = MIN(apos, c->ref_end);
2737 cram_stats_add(c->stats[DS_FN], cr->nfeature);
2738 } else {
2739 // Unmapped
2740 cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
2741 cr->cigar = 0;
2742 cr->ncigar = 0;
2743 cr->nfeature = 0;
2744 cr->aend = cr->apos;
2745 for (i = 0; i < cr->len; i++)
2746 cram_stats_add(c->stats[DS_BA], seq[i]);
2747 }
2748
2749 /*
2750 * Append to the qual block now. We do this here as
2751 * cram_add_substitution() can generate BA/QS events which need to
2752 * be in the qual block before we append the rest of the data.
2753 */
2754 if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
2755 /* Special case of seq "*" */
2756 if (cr->len == 0) {
2757 cram_stats_add(c->stats[DS_RL], cr->len = fake_qual);
2758 BLOCK_GROW(s->qual_blk, cr->len);
2759 cp = (char *)BLOCK_END(s->qual_blk);
2760 memset(cp, 255, cr->len);
2761 } else {
2762 BLOCK_GROW(s->qual_blk, cr->len);
2763 cp = (char *)BLOCK_END(s->qual_blk);
2764 char *from = (char *)&bam_qual(b)[0];
2765 char *to = &cp[0];
2766 memcpy(to, from, cr->len);
2767 //for (i = 0; i < cr->len; i++) cp[i] = from[i];
2768 }
2769 BLOCK_SIZE(s->qual_blk) += cr->len;
2770 } else {
2771 if (cr->len == 0) {
2772 cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1;
2773 cram_stats_add(c->stats[DS_RL], cr->len);
2774 }
2775 }
2776
2777 /* Now we know apos and aend both, update mate-pair information */
2778 {
2779 int new;
2780 khint_t k;
2781 int sec = (cr->flags & BAM_FSECONDARY) ? 1 : 0;
2782
2783 //fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum,
2784 // cr->name_len, DSTRING_STR(s->name_ds)+cr->name);
2785 if (cr->flags & BAM_FPAIRED) {
2786 char *key = string_ndup(s->pair_keys,
2787 (char *)BLOCK_DATA(s->name_blk)+cr->name,
2788 cr->name_len);
2789 if (!key)
2790 return -1;
2791
2792 k = kh_put(m_s2i, s->pair[sec], key, &new);
2793 if (-1 == new)
2794 return -1;
2795 else if (new > 0)
2796 kh_val(s->pair[sec], k) = rnum;
2797 } else {
2798 new = 1;
2799 }
2800
2801 if (new == 0) {
2802 cram_record *p = &s->crecs[kh_val(s->pair[sec], k)];
2803 int aleft, aright, sign;
2804
2805 aleft = MIN(cr->apos, p->apos);
2806 aright = MAX(cr->aend, p->aend);
2807 if (cr->apos < p->apos) {
2808 sign = 1;
2809 } else if (cr->apos > p->apos) {
2810 sign = -1;
2811 } else if (cr->flags & BAM_FREAD1) {
2812 sign = 1;
2813 } else {
2814 sign = -1;
2815 }
2816
2817 //fprintf(stderr, "paired %"PRId64"\n", kh_val(s->pair[sec], k));
2818
2819 // This vs p: tlen, matepos, flags
2820 if (bam_ins_size(b) != sign*(aright-aleft+1))
2821 goto detached;
2822
2823 if (MAX(bam_mate_pos(b)+1, 0) != p->apos)
2824 goto detached;
2825
2826 if (((bam_flag(b) & BAM_FMUNMAP) != 0) !=
2827 ((p->flags & BAM_FUNMAP) != 0))
2828 goto detached;
2829
2830 if (((bam_flag(b) & BAM_FMREVERSE) != 0) !=
2831 ((p->flags & BAM_FREVERSE) != 0))
2832 goto detached;
2833
2834
2835 // p vs this: tlen, matepos, flags
2836 if (p->tlen != -sign*(aright-aleft+1))
2837 goto detached;
2838
2839 if (p->mate_pos != cr->apos)
2840 goto detached;
2841
2842 if (((p->flags & BAM_FMUNMAP) != 0) !=
2843 ((p->mate_flags & CRAM_M_UNMAP) != 0))
2844 goto detached;
2845
2846 if (((p->flags & BAM_FMREVERSE) != 0) !=
2847 ((p->mate_flags & CRAM_M_REVERSE) != 0))
2848 goto detached;
2849
2850 // Supplementary reads are just too ill defined
2851 if ((cr->flags & BAM_FSUPPLEMENTARY) ||
2852 (p->flags & BAM_FSUPPLEMENTARY))
2853 goto detached;
2854
2855 /*
2856 * The fields below are unused when encoding this read as it is
2857 * no longer detached. In theory they may get referred to when
2858 * processing a 3rd or 4th read in this template?, so we set them
2859 * here just to be sure.
2860 *
2861 * They do not need cram_stats_add() calls those as they are
2862 * not emitted.
2863 */
2864 cr->mate_pos = p->apos;
2865 cr->tlen = sign*(aright-aleft+1);
2866 cr->mate_flags =
2867 ((p->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP +
2868 ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE;
2869
2870 // Decrement statistics aggregated earlier
2871 cram_stats_del(c->stats[DS_NP], p->mate_pos);
2872 cram_stats_del(c->stats[DS_MF], p->mate_flags);
2873 cram_stats_del(c->stats[DS_TS], p->tlen);
2874 cram_stats_del(c->stats[DS_NS], p->mate_ref_id);
2875
2876 /* Similarly we could correct the p-> values too, but these will no
2877 * longer have any code that refers back to them as the new 'p'
2878 * for this template is our current 'cr'.
2879 */
2880 //p->mate_pos = cr->apos;
2881 //p->mate_flags =
2882 // ((cr->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP +
2883 // ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE;
2884 //p->tlen = p->apos - cr->aend;
2885
2886 // Clear detached from cr flags
2887 cr->cram_flags &= ~CRAM_FLAG_DETACHED;
2888 cram_stats_add(c->stats[DS_CF], cr->cram_flags);
2889
2890 // Clear detached from p flags and set downstream
2891 cram_stats_del(c->stats[DS_CF], p->cram_flags);
2892 p->cram_flags &= ~CRAM_FLAG_DETACHED;
2893 p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM;
2894 cram_stats_add(c->stats[DS_CF], p->cram_flags);
2895
2896 p->mate_line = rnum - (kh_val(s->pair[sec], k) + 1);
2897 cram_stats_add(c->stats[DS_NF], p->mate_line);
2898
2899 kh_val(s->pair[sec], k) = rnum;
2900 } else {
2901 detached:
2902 //fprintf(stderr, "unpaired\n");
2903
2904 /* Derive mate flags from this flag */
2905 cr->mate_flags = 0;
2906 if (bam_flag(b) & BAM_FMUNMAP)
2907 cr->mate_flags |= CRAM_M_UNMAP;
2908 if (bam_flag(b) & BAM_FMREVERSE)
2909 cr->mate_flags |= CRAM_M_REVERSE;
2910
2911 cram_stats_add(c->stats[DS_MF], cr->mate_flags);
2912
2913 cr->mate_pos = MAX(bam_mate_pos(b)+1, 0);
2914 cram_stats_add(c->stats[DS_NP], cr->mate_pos);
2915
2916 cr->tlen = bam_ins_size(b);
2917 cram_stats_add(c->stats[DS_TS], cr->tlen);
2918
2919 cr->cram_flags |= CRAM_FLAG_DETACHED;
2920 cram_stats_add(c->stats[DS_CF], cr->cram_flags);
2921 cram_stats_add(c->stats[DS_NS], bam_mate_ref(b));
2922 }
2923 }
2924
2925 cr->mqual = bam_map_qual(b);
2926 cram_stats_add(c->stats[DS_MQ], cr->mqual);
2927
2928 cr->mate_ref_id = bam_mate_ref(b);
2929
2930 if (!(bam_flag(b) & BAM_FUNMAP)) {
2931 if (c->first_base > cr->apos)
2932 c->first_base = cr->apos;
2933
2934 if (c->last_base < cr->aend)
2935 c->last_base = cr->aend;
2936 }
2937
2938 return 0;
2939 }
2940
2941 /*
2942 * Write iterator: put BAM format sequences into a CRAM file.
2943 * We buffer up a containers worth of data at a time.
2944 *
2945 * Returns 0 on success
2946 * -1 on failure
2947 */
2948 int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
2949 cram_container *c;
2950
2951 if (!fd->ctr) {
2952 fd->ctr = cram_new_container(fd->seqs_per_slice,
2953 fd->slices_per_container);
2954 if (!fd->ctr)
2955 return -1;
2956 fd->ctr->record_counter = fd->record_counter;
2957 }
2958 c = fd->ctr;
2959
2960 if (!c->slice || c->curr_rec == c->max_rec ||
2961 (bam_ref(b) != c->curr_ref && c->curr_ref >= -1)) {
2962 int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1;
2963 int curr_ref = c->slice ? c->curr_ref : bam_ref(b);
2964
2965
2966 /*
2967 * Start packing slices when we routinely have under 1/4tr full.
2968 *
2969 * This option isn't available if we choose to embed references
2970 * since we can only have one per slice.
2971 */
2972 if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 &&
2973 fd->last_slice && fd->last_slice < c->max_rec/4+10 &&
2974 !fd->embed_ref) {
2975 if (fd->verbose && !c->multi_seq)
2976 fprintf(stderr, "Multi-ref enabled for this container\n");
2977 multi_seq = 1;
2978 }
2979
2980 slice_rec = c->slice_rec;
2981 curr_rec = c->curr_rec;
2982
2983 if (CRAM_MAJOR_VERS(fd->version) == 1 ||
2984 c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice) {
2985 if (NULL == (c = cram_next_container(fd, b))) {
2986 if (fd->ctr) {
2987 // prevent cram_close attempting to flush
2988 cram_free_container(fd->ctr);
2989 fd->ctr = NULL;
2990 }
2991 return -1;
2992 }
2993 }
2994
2995 /*
2996 * Due to our processing order, some things we've already done we
2997 * cannot easily undo. So when we first notice we should be packing
2998 * multiple sequences per container we emit the small partial
2999 * container as-is and then start a fresh one in a different mode.
3000 */
3001 if (multi_seq) {
3002 fd->multi_seq = 1;
3003 c->multi_seq = 1;
3004 c->pos_sorted = 0; // required atm for multi_seq slices
3005
3006 if (!c->refs_used) {
3007 pthread_mutex_lock(&fd->ref_lock);
3008 c->refs_used = calloc(fd->refs->nref, sizeof(int));
3009 pthread_mutex_unlock(&fd->ref_lock);
3010 if (!c->refs_used)
3011 return -1;
3012 }
3013 }
3014
3015 fd->last_slice = curr_rec - slice_rec;
3016 c->slice_rec = c->curr_rec;
3017
3018 // Have we seen this reference before?
3019 if (bam_ref(b) >= 0 && bam_ref(b) != curr_ref && !fd->embed_ref &&
3020 !fd->unsorted && multi_seq) {
3021
3022 if (!c->refs_used) {
3023 pthread_mutex_lock(&fd->ref_lock);
3024 c->refs_used = calloc(fd->refs->nref, sizeof(int));
3025 pthread_mutex_unlock(&fd->ref_lock);
3026 if (!c->refs_used)
3027 return -1;
3028 } else if (c->refs_used && c->refs_used[bam_ref(b)]) {
3029 fprintf(stderr, "Unsorted mode enabled\n");
3030 pthread_mutex_lock(&fd->ref_lock);
3031 fd->unsorted = 1;
3032 pthread_mutex_unlock(&fd->ref_lock);
3033 fd->multi_seq = 1;
3034 }
3035 }
3036
3037 c->curr_ref = bam_ref(b);
3038 if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++;
3039 }
3040
3041 if (!c->bams) {
3042 /* First time through, allocate a set of bam pointers */
3043 pthread_mutex_lock(&fd->bam_list_lock);
3044 if (fd->bl) {
3045 spare_bams *spare = fd->bl;
3046 c->bams = spare->bams;
3047 fd->bl = spare->next;
3048 free(spare);
3049 } else {
3050 c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *));
3051 if (!c->bams)
3052 return -1;
3053 }
3054 pthread_mutex_unlock(&fd->bam_list_lock);
3055 }
3056
3057 /* Copy or alloc+copy the bam record, for later encoding */
3058 if (c->bams[c->curr_c_rec])
3059 bam_copy1(c->bams[c->curr_c_rec], b);
3060 else
3061 c->bams[c->curr_c_rec] = bam_dup(b);
3062
3063 c->curr_rec++;
3064 c->curr_c_rec++;
3065 fd->record_counter++;
3066
3067 return 0;
3068 }