comparison ezBAMQC/src/htslib/sam.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /* sam.c -- SAM and BAM file I/O and manipulation.
2
3 Copyright (C) 2008-2010, 2012-2014 Genome Research Ltd.
4 Copyright (C) 2010, 2012, 2013 Broad Institute.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <errno.h>
30 #include <ctype.h>
31 #include <zlib.h>
32 #include "htslib/sam.h"
33 #include "htslib/bgzf.h"
34 #include "cram/cram.h"
35 #include "htslib/hfile.h"
36
37 #include "htslib/khash.h"
38 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
39
40 typedef khash_t(s2i) sdict_t;
41
42 /**********************
43 *** BAM header I/O ***
44 **********************/
45
46 bam_hdr_t *bam_hdr_init()
47 {
48 return (bam_hdr_t*)calloc(1, sizeof(bam_hdr_t));
49 }
50
51 void bam_hdr_destroy(bam_hdr_t *h)
52 {
53 int32_t i;
54 if (h == NULL) return;
55 if (h->target_name) {
56 for (i = 0; i < h->n_targets; ++i)
57 free(h->target_name[i]);
58 free(h->target_name);
59 free(h->target_len);
60 }
61 free(h->text); free(h->cigar_tab);
62 if (h->sdict) kh_destroy(s2i, (sdict_t*)h->sdict);
63 free(h);
64 }
65
66 bam_hdr_t *bam_hdr_dup(const bam_hdr_t *h0)
67 {
68 if (h0 == NULL) return NULL;
69 bam_hdr_t *h;
70 if ((h = bam_hdr_init()) == NULL) return NULL;
71 // copy the simple data
72 h->n_targets = h0->n_targets;
73 h->ignore_sam_err = h0->ignore_sam_err;
74 h->l_text = h0->l_text;
75 // Then the pointery stuff
76 h->cigar_tab = NULL;
77 h->sdict = NULL;
78 h->text = (char*)calloc(h->l_text + 1, 1);
79 memcpy(h->text, h0->text, h->l_text);
80 h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
81 h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
82 int i;
83 for (i = 0; i < h->n_targets; ++i) {
84 h->target_len[i] = h0->target_len[i];
85 h->target_name[i] = strdup(h0->target_name[i]);
86 }
87 return h;
88 }
89
90
91 static bam_hdr_t *hdr_from_dict(sdict_t *d)
92 {
93 bam_hdr_t *h;
94 khint_t k;
95 h = bam_hdr_init();
96 h->sdict = d;
97 h->n_targets = kh_size(d);
98 h->target_len = (uint32_t*)malloc(sizeof(uint32_t) * h->n_targets);
99 h->target_name = (char**)malloc(sizeof(char*) * h->n_targets);
100 for (k = kh_begin(d); k != kh_end(d); ++k) {
101 if (!kh_exist(d, k)) continue;
102 h->target_name[kh_val(d, k)>>32] = (char*)kh_key(d, k);
103 h->target_len[kh_val(d, k)>>32] = kh_val(d, k)<<32>>32;
104 kh_val(d, k) >>= 32;
105 }
106 return h;
107 }
108
109 bam_hdr_t *bam_hdr_read(BGZF *fp)
110 {
111 bam_hdr_t *h;
112 char buf[4];
113 int magic_len, has_EOF;
114 int32_t i = 1, name_len;
115 // check EOF
116 has_EOF = bgzf_check_EOF(fp);
117 if (has_EOF < 0) {
118 perror("[W::sam_hdr_read] bgzf_check_EOF");
119 } else if (has_EOF == 0 && hts_verbose >= 2)
120 fprintf(stderr, "[W::%s] EOF marker is absent. The input is probably truncated.\n", __func__);
121 // read "BAM1"
122 magic_len = bgzf_read(fp, buf, 4);
123 if (magic_len != 4 || strncmp(buf, "BAM\1", 4)) {
124 if (hts_verbose >= 1) fprintf(stderr, "[E::%s] invalid BAM binary header\n", __func__);
125 return 0;
126 }
127 h = bam_hdr_init();
128 // read plain text and the number of reference sequences
129 bgzf_read(fp, &h->l_text, 4);
130 if (fp->is_be) ed_swap_4p(&h->l_text);
131 h->text = (char*)malloc(h->l_text + 1);
132 h->text[h->l_text] = 0; // make sure it is NULL terminated
133 bgzf_read(fp, h->text, h->l_text);
134 bgzf_read(fp, &h->n_targets, 4);
135 if (fp->is_be) ed_swap_4p(&h->n_targets);
136 // read reference sequence names and lengths
137 h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
138 h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
139 for (i = 0; i != h->n_targets; ++i) {
140 bgzf_read(fp, &name_len, 4);
141 if (fp->is_be) ed_swap_4p(&name_len);
142 h->target_name[i] = (char*)calloc(name_len, 1);
143 bgzf_read(fp, h->target_name[i], name_len);
144 bgzf_read(fp, &h->target_len[i], 4);
145 if (fp->is_be) ed_swap_4p(&h->target_len[i]);
146 }
147 return h;
148 }
149
150 int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
151 {
152 char buf[4];
153 int32_t i, name_len, x;
154 // write "BAM1"
155 strncpy(buf, "BAM\1", 4);
156 bgzf_write(fp, buf, 4);
157 // write plain text and the number of reference sequences
158 if (fp->is_be) {
159 x = ed_swap_4(h->l_text);
160 bgzf_write(fp, &x, 4);
161 if (h->l_text) bgzf_write(fp, h->text, h->l_text);
162 x = ed_swap_4(h->n_targets);
163 bgzf_write(fp, &x, 4);
164 } else {
165 bgzf_write(fp, &h->l_text, 4);
166 if (h->l_text) bgzf_write(fp, h->text, h->l_text);
167 bgzf_write(fp, &h->n_targets, 4);
168 }
169 // write sequence names and lengths
170 for (i = 0; i != h->n_targets; ++i) {
171 char *p = h->target_name[i];
172 name_len = strlen(p) + 1;
173 if (fp->is_be) {
174 x = ed_swap_4(name_len);
175 bgzf_write(fp, &x, 4);
176 } else bgzf_write(fp, &name_len, 4);
177 bgzf_write(fp, p, name_len);
178 if (fp->is_be) {
179 x = ed_swap_4(h->target_len[i]);
180 bgzf_write(fp, &x, 4);
181 } else bgzf_write(fp, &h->target_len[i], 4);
182 }
183 bgzf_flush(fp);
184 return 0;
185 }
186
187 int bam_name2id(bam_hdr_t *h, const char *ref)
188 {
189 sdict_t *d = (sdict_t*)h->sdict;
190 khint_t k;
191 if (h->sdict == 0) {
192 int i, absent;
193 d = kh_init(s2i);
194 for (i = 0; i < h->n_targets; ++i) {
195 k = kh_put(s2i, d, h->target_name[i], &absent);
196 kh_val(d, k) = i;
197 }
198 h->sdict = d;
199 }
200 k = kh_get(s2i, d, ref);
201 return k == kh_end(d)? -1 : kh_val(d, k);
202 }
203
204 /*************************
205 *** BAM alignment I/O ***
206 *************************/
207
208 bam1_t *bam_init1()
209 {
210 return (bam1_t*)calloc(1, sizeof(bam1_t));
211 }
212
213 void bam_destroy1(bam1_t *b)
214 {
215 if (b == 0) return;
216 free(b->data); free(b);
217 }
218
219 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
220 {
221 uint8_t *data = bdst->data;
222 int m_data = bdst->m_data; // backup data and m_data
223 if (m_data < bsrc->l_data) { // double the capacity
224 m_data = bsrc->l_data; kroundup32(m_data);
225 data = (uint8_t*)realloc(data, m_data);
226 }
227 memcpy(data, bsrc->data, bsrc->l_data); // copy var-len data
228 *bdst = *bsrc; // copy the rest
229 // restore the backup
230 bdst->m_data = m_data;
231 bdst->data = data;
232 return bdst;
233 }
234
235 bam1_t *bam_dup1(const bam1_t *bsrc)
236 {
237 if (bsrc == NULL) return NULL;
238 bam1_t *bdst = bam_init1();
239 if (bdst == NULL) return NULL;
240 return bam_copy1(bdst, bsrc);
241 }
242
243 int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
244 {
245 int k, l;
246 for (k = l = 0; k < n_cigar; ++k)
247 if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
248 l += bam_cigar_oplen(cigar[k]);
249 return l;
250 }
251
252 int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
253 {
254 int k, l;
255 for (k = l = 0; k < n_cigar; ++k)
256 if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
257 l += bam_cigar_oplen(cigar[k]);
258 return l;
259 }
260
261 int32_t bam_endpos(const bam1_t *b)
262 {
263 if (!(b->core.flag & BAM_FUNMAP) && b->core.n_cigar > 0)
264 return b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
265 else
266 return b->core.pos + 1;
267 }
268
269 static inline int aux_type2size(uint8_t type)
270 {
271 switch (type) {
272 case 'A': case 'c': case 'C':
273 return 1;
274 case 's': case 'S':
275 return 2;
276 case 'i': case 'I': case 'f':
277 return 4;
278 case 'd':
279 return 8;
280 case 'Z': case 'H': case 'B':
281 return type;
282 default:
283 return 0;
284 }
285 }
286
287 static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
288 {
289 uint8_t *s;
290 uint32_t *cigar = (uint32_t*)(data + c->l_qname);
291 uint32_t i, n;
292 s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
293 for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
294 while (s < data + l_data) {
295 int size;
296 s += 2; // skip key
297 size = aux_type2size(*s); ++s; // skip type
298 switch (size) {
299 case 1: ++s; break;
300 case 2: ed_swap_2p(s); s += 2; break;
301 case 4: ed_swap_4p(s); s += 4; break;
302 case 8: ed_swap_8p(s); s += 8; break;
303 case 'Z':
304 case 'H':
305 while (*s) ++s;
306 ++s;
307 break;
308 case 'B':
309 size = aux_type2size(*s); ++s;
310 if (is_host) memcpy(&n, s, 4), ed_swap_4p(s);
311 else ed_swap_4p(s), memcpy(&n, s, 4);
312 s += 4;
313 switch (size) {
314 case 1: s += n; break;
315 case 2: for (i = 0; i < n; ++i, s += 2) ed_swap_2p(s); break;
316 case 4: for (i = 0; i < n; ++i, s += 4) ed_swap_4p(s); break;
317 case 8: for (i = 0; i < n; ++i, s += 8) ed_swap_8p(s); break;
318 }
319 break;
320 }
321 }
322 }
323
324 int bam_read1(BGZF *fp, bam1_t *b)
325 {
326 bam1_core_t *c = &b->core;
327 int32_t block_len, ret, i;
328 uint32_t x[8];
329 if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
330 if (ret == 0) return -1; // normal end-of-file
331 else return -2; // truncated
332 }
333 if (bgzf_read(fp, x, 32) != 32) return -3;
334 if (fp->is_be) {
335 ed_swap_4p(&block_len);
336 for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
337 }
338 c->tid = x[0]; c->pos = x[1];
339 c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
340 c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
341 c->l_qseq = x[4];
342 c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
343 b->l_data = block_len - 32;
344 if (b->l_data < 0 || c->l_qseq < 0) return -4;
345 if ((char *)bam_get_aux(b) - (char *)b->data > b->l_data)
346 return -4;
347 if (b->m_data < b->l_data) {
348 b->m_data = b->l_data;
349 kroundup32(b->m_data);
350 b->data = (uint8_t*)realloc(b->data, b->m_data);
351 if (!b->data)
352 return -4;
353 }
354 if (bgzf_read(fp, b->data, b->l_data) != b->l_data) return -4;
355 //b->l_aux = b->l_data - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
356 if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
357 return 4 + block_len;
358 }
359
360 int bam_write1(BGZF *fp, const bam1_t *b)
361 {
362 const bam1_core_t *c = &b->core;
363 uint32_t x[8], block_len = b->l_data + 32, y;
364 int i, ok;
365 x[0] = c->tid;
366 x[1] = c->pos;
367 x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname;
368 x[3] = (uint32_t)c->flag<<16 | c->n_cigar;
369 x[4] = c->l_qseq;
370 x[5] = c->mtid;
371 x[6] = c->mpos;
372 x[7] = c->isize;
373 ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
374 if (fp->is_be) {
375 for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
376 y = block_len;
377 if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
378 swap_data(c, b->l_data, b->data, 1);
379 } else {
380 if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
381 }
382 if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
383 if (ok) ok = (bgzf_write(fp, b->data, b->l_data) >= 0);
384 if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
385 return ok? 4 + block_len : -1;
386 }
387
388 /********************
389 *** BAM indexing ***
390 ********************/
391
392 static hts_idx_t *bam_index(BGZF *fp, int min_shift)
393 {
394 int n_lvls, i, fmt;
395 bam1_t *b;
396 hts_idx_t *idx;
397 bam_hdr_t *h;
398 h = bam_hdr_read(fp);
399 if (min_shift > 0) {
400 int64_t max_len = 0, s;
401 for (i = 0; i < h->n_targets; ++i)
402 if (max_len < h->target_len[i]) max_len = h->target_len[i];
403 max_len += 256;
404 for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
405 fmt = HTS_FMT_CSI;
406 } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
407 idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp), min_shift, n_lvls);
408 bam_hdr_destroy(h);
409 b = bam_init1();
410 while (bam_read1(fp, b) >= 0) {
411 int l, ret;
412 l = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
413 if (l == 0) l = 1; // no zero-length records
414 ret = hts_idx_push(idx, b->core.tid, b->core.pos, b->core.pos + l, bgzf_tell(fp), !(b->core.flag&BAM_FUNMAP));
415 if (ret < 0)
416 {
417 // unsorted
418 bam_destroy1(b);
419 hts_idx_destroy(idx);
420 return NULL;
421 }
422 }
423 hts_idx_finish(idx, bgzf_tell(fp));
424 bam_destroy1(b);
425 return idx;
426 }
427
428 int bam_index_build(const char *fn, int min_shift)
429 {
430 hts_idx_t *idx;
431 htsFile *fp;
432 int ret = 0;
433
434 if ((fp = hts_open(fn, "r")) == 0) return -1;
435 switch (fp->format.format) {
436 case cram:
437 ret = cram_index_build(fp->fp.cram, fn);
438 break;
439
440 case bam:
441 idx = bam_index(fp->fp.bgzf, min_shift);
442 if (idx) {
443 hts_idx_save(idx, fn, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
444 hts_idx_destroy(idx);
445 }
446 else ret = -1;
447 break;
448
449 default:
450 ret = -1;
451 break;
452 }
453 hts_close(fp);
454
455 return ret;
456 }
457
458 static int bam_readrec(BGZF *fp, void *ignored, void *bv, int *tid, int *beg, int *end)
459 {
460 bam1_t *b = bv;
461 int ret;
462 if ((ret = bam_read1(fp, b)) >= 0) {
463 *tid = b->core.tid; *beg = b->core.pos;
464 *end = b->core.pos + (b->core.n_cigar? bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)) : 1);
465 }
466 return ret;
467 }
468
469 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
470 static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end)
471 {
472 htsFile *fp = fpv;
473 bam1_t *b = bv;
474 return cram_get_bam_seq(fp->fp.cram, &b);
475 }
476
477 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
478 static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int *beg, int *end)
479 {
480 htsFile *fp = fpv;
481 bam1_t *b = bv;
482 switch (fp->format.format) {
483 case bam: return bam_read1(bgzfp, b);
484 case cram: return cram_get_bam_seq(fp->fp.cram, &b);
485 default:
486 // TODO Need headers available to implement this for SAM files
487 fprintf(stderr, "[sam_bam_cram_readrec] Not implemented for SAM files -- Exiting\n");
488 abort();
489 }
490 }
491
492 // The CRAM implementation stores the loaded index within the cram_fd rather
493 // than separately as is done elsewhere in htslib. So if p is a pointer to
494 // an hts_idx_t with p->fmt == HTS_FMT_CRAI, then it actually points to an
495 // hts_cram_idx_t and should be cast accordingly.
496 typedef struct hts_cram_idx_t {
497 int fmt;
498 cram_fd *cram;
499 } hts_cram_idx_t;
500
501 hts_idx_t *sam_index_load(samFile *fp, const char *fn)
502 {
503 switch (fp->format.format) {
504 case bam:
505 return bam_index_load(fn);
506
507 case cram: {
508 if (cram_index_load(fp->fp.cram, fn) < 0) return NULL;
509 // Cons up a fake "index" just pointing at the associated cram_fd:
510 hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
511 if (idx == NULL) return NULL;
512 idx->fmt = HTS_FMT_CRAI;
513 idx->cram = fp->fp.cram;
514 return (hts_idx_t *) idx;
515 }
516
517 default:
518 return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
519 }
520 }
521
522 static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
523 {
524 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
525 hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
526 if (iter == NULL) return NULL;
527
528 // Cons up a dummy iterator for which hts_itr_next() will simply invoke
529 // the readrec function:
530 iter->read_rest = 1;
531 iter->off = NULL;
532 iter->bins.a = NULL;
533 iter->readrec = readrec;
534
535 if (tid >= 0) {
536 cram_range r = { tid, beg+1, end };
537 if (cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r) != 0) { free(iter); return NULL; }
538 iter->curr_off = 0;
539 // The following fields are not required by hts_itr_next(), but are
540 // filled in in case user code wants to look at them.
541 iter->tid = tid;
542 iter->beg = beg;
543 iter->end = end;
544 }
545 else switch (tid) {
546 case HTS_IDX_REST:
547 iter->curr_off = 0;
548 break;
549 case HTS_IDX_NONE:
550 iter->curr_off = 0;
551 iter->finished = 1;
552 break;
553 default:
554 fprintf(stderr, "[cram_itr_query] tid=%d not implemented for CRAM files -- Exiting\n", tid);
555 abort();
556 break;
557 }
558
559 return iter;
560 }
561
562 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
563 {
564 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
565 if (idx == NULL)
566 return hts_itr_query(NULL, tid, beg, end, sam_bam_cram_readrec);
567 else if (cidx->fmt == HTS_FMT_CRAI)
568 return cram_itr_query(idx, tid, beg, end, cram_readrec);
569 else
570 return hts_itr_query(idx, tid, beg, end, bam_readrec);
571 }
572
573 static int cram_name2id(void *fdv, const char *ref)
574 {
575 cram_fd *fd = (cram_fd *) fdv;
576 return sam_hdr_name2ref(fd->header, ref);
577 }
578
579 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
580 {
581 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
582 if (cidx->fmt == HTS_FMT_CRAI)
583 return hts_itr_querys(idx, region, cram_name2id, cidx->cram, cram_itr_query, cram_readrec);
584 else
585 return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, hts_itr_query, bam_readrec);
586 }
587
588 /**********************
589 *** SAM header I/O ***
590 **********************/
591
592 #include "htslib/kseq.h"
593 #include "htslib/kstring.h"
594
595 bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
596 {
597 const char *q, *r, *p;
598 khash_t(s2i) *d;
599 d = kh_init(s2i);
600 for (p = text; *p; ++p) {
601 if (strncmp(p, "@SQ", 3) == 0) {
602 char *sn = 0;
603 int ln = -1;
604 for (q = p + 4;; ++q) {
605 if (strncmp(q, "SN:", 3) == 0) {
606 q += 3;
607 for (r = q; *r != '\t' && *r != '\n'; ++r);
608 sn = (char*)calloc(r - q + 1, 1);
609 strncpy(sn, q, r - q);
610 q = r;
611 } else if (strncmp(q, "LN:", 3) == 0)
612 ln = strtol(q + 3, (char**)&q, 10);
613 while (*q != '\t' && *q != '\n') ++q;
614 if (*q == '\n') break;
615 }
616 p = q;
617 if (sn && ln >= 0) {
618 khint_t k;
619 int absent;
620 k = kh_put(s2i, d, sn, &absent);
621 if (!absent) {
622 if (hts_verbose >= 2)
623 fprintf(stderr, "[W::%s] duplicated sequence '%s'\n", __func__, sn);
624 free(sn);
625 } else kh_val(d, k) = (int64_t)(kh_size(d) - 1)<<32 | ln;
626 }
627 }
628 while (*p != '\n') ++p;
629 }
630 return hdr_from_dict(d);
631 }
632
633 bam_hdr_t *sam_hdr_read(htsFile *fp)
634 {
635 switch (fp->format.format) {
636 case bam:
637 return bam_hdr_read(fp->fp.bgzf);
638
639 case cram:
640 return cram_header_to_bam(fp->fp.cram->header);
641
642 case sam: {
643 kstring_t str;
644 bam_hdr_t *h;
645 int has_SQ = 0;
646 str.l = str.m = 0; str.s = 0;
647 while (hts_getline(fp, KS_SEP_LINE, &fp->line) >= 0) {
648 if (fp->line.s[0] != '@') break;
649 if (fp->line.l > 3 && strncmp(fp->line.s,"@SQ",3) == 0) has_SQ = 1;
650 kputsn(fp->line.s, fp->line.l, &str);
651 kputc('\n', &str);
652 }
653 if (! has_SQ && fp->fn_aux) {
654 char line[2048];
655 FILE *f = fopen(fp->fn_aux, "r");
656 if (f == NULL) return NULL;
657 while (fgets(line, sizeof line, f)) {
658 const char *name = strtok(line, "\t");
659 const char *length = strtok(NULL, "\t");
660 ksprintf(&str, "@SQ\tSN:%s\tLN:%s\n", name, length);
661 }
662 fclose(f);
663 }
664 if (str.l == 0) kputsn("", 0, &str);
665 h = sam_hdr_parse(str.l, str.s);
666 h->l_text = str.l; h->text = str.s;
667 return h;
668 }
669
670 default:
671 abort();
672 }
673 }
674
675 int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
676 {
677 switch (fp->format.format) {
678 case binary_format:
679 fp->format.category = sequence_data;
680 fp->format.format = bam;
681 /* fall-through */
682 case bam:
683 bam_hdr_write(fp->fp.bgzf, h);
684 break;
685
686 case cram: {
687 cram_fd *fd = fp->fp.cram;
688 if (cram_set_header(fd, bam_header_to_cram((bam_hdr_t *)h)) < 0) return -1;
689 if (fp->fn_aux)
690 cram_load_reference(fd, fp->fn_aux);
691 if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
692 }
693 break;
694
695 case text_format:
696 fp->format.category = sequence_data;
697 fp->format.format = sam;
698 /* fall-through */
699 case sam: {
700 char *p;
701 hputs(h->text, fp->fp.hfile);
702 p = strstr(h->text, "@SQ\t"); // FIXME: we need a loop to make sure "@SQ\t" does not match something unwanted!!!
703 if (p == 0) {
704 int i;
705 for (i = 0; i < h->n_targets; ++i) {
706 fp->line.l = 0;
707 kputsn("@SQ\tSN:", 7, &fp->line); kputs(h->target_name[i], &fp->line);
708 kputsn("\tLN:", 4, &fp->line); kputw(h->target_len[i], &fp->line); kputc('\n', &fp->line);
709 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
710 }
711 }
712 if ( hflush(fp->fp.hfile) != 0 ) return -1;
713 }
714 break;
715
716 default:
717 abort();
718 }
719 return 0;
720 }
721
722 /**********************
723 *** SAM record I/O ***
724 **********************/
725
726 int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
727 {
728 #define _read_token(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); if (*(_p) != '\t') goto err_ret; *(_p)++ = 0
729 #define _read_token_aux(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); *(_p)++ = 0 // this is different in that it does not test *(_p)=='\t'
730 #define _get_mem(type_t, _x, _s, _l) ks_resize((_s), (_s)->l + (_l)); *(_x) = (type_t*)((_s)->s + (_s)->l); (_s)->l += (_l)
731 #define _parse_err(cond, msg) do { if ((cond) && hts_verbose >= 1) { fprintf(stderr, "[E::%s] " msg "\n", __func__); goto err_ret; } } while (0)
732 #define _parse_warn(cond, msg) if ((cond) && hts_verbose >= 2) fprintf(stderr, "[W::%s] " msg "\n", __func__)
733
734 uint8_t *t;
735 char *p = s->s, *q;
736 int i;
737 kstring_t str;
738 bam1_core_t *c = &b->core;
739
740 str.l = b->l_data = 0;
741 str.s = (char*)b->data; str.m = b->m_data;
742 memset(c, 0, 32);
743 if (h->cigar_tab == 0) {
744 h->cigar_tab = (int8_t*) malloc(128);
745 for (i = 0; i < 128; ++i)
746 h->cigar_tab[i] = -1;
747 for (i = 0; BAM_CIGAR_STR[i]; ++i)
748 h->cigar_tab[(int)BAM_CIGAR_STR[i]] = i;
749 }
750 // qname
751 q = _read_token(p);
752 kputsn_(q, p - q, &str);
753 c->l_qname = p - q;
754 // flag
755 c->flag = strtol(p, &p, 0);
756 if (*p++ != '\t') goto err_ret; // malformated flag
757 // chr
758 q = _read_token(p);
759 if (strcmp(q, "*")) {
760 _parse_err(h->n_targets == 0, "missing SAM header");
761 c->tid = bam_name2id(h, q);
762 _parse_warn(c->tid < 0, "urecognized reference name; treated as unmapped");
763 } else c->tid = -1;
764 // pos
765 c->pos = strtol(p, &p, 10) - 1;
766 if (*p++ != '\t') goto err_ret;
767 if (c->pos < 0 && c->tid >= 0) {
768 _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
769 c->tid = -1;
770 }
771 if (c->tid < 0) c->flag |= BAM_FUNMAP;
772 // mapq
773 c->qual = strtol(p, &p, 10);
774 if (*p++ != '\t') goto err_ret;
775 // cigar
776 if (*p != '*') {
777 uint32_t *cigar;
778 size_t n_cigar = 0;
779 for (q = p; *p && *p != '\t'; ++p)
780 if (!isdigit(*p)) ++n_cigar;
781 if (*p++ != '\t') goto err_ret;
782 _parse_err(n_cigar >= 65536, "too many CIGAR operations");
783 c->n_cigar = n_cigar;
784 _get_mem(uint32_t, &cigar, &str, c->n_cigar<<2);
785 for (i = 0; i < c->n_cigar; ++i, ++q) {
786 int op;
787 cigar[i] = strtol(q, &q, 10)<<BAM_CIGAR_SHIFT;
788 op = (uint8_t)*q >= 128? -1 : h->cigar_tab[(int)*q];
789 _parse_err(op < 0, "unrecognized CIGAR operator");
790 cigar[i] |= op;
791 }
792 i = bam_cigar2rlen(c->n_cigar, cigar);
793 } else {
794 _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
795 c->flag |= BAM_FUNMAP;
796 q = _read_token(p);
797 i = 1;
798 }
799 c->bin = hts_reg2bin(c->pos, c->pos + i, 14, 5);
800 // mate chr
801 q = _read_token(p);
802 if (strcmp(q, "=") == 0) c->mtid = c->tid;
803 else if (strcmp(q, "*") == 0) c->mtid = -1;
804 else c->mtid = bam_name2id(h, q);
805 // mpos
806 c->mpos = strtol(p, &p, 10) - 1;
807 if (*p++ != '\t') goto err_ret;
808 if (c->mpos < 0 && c->mtid >= 0) {
809 _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
810 c->mtid = -1;
811 }
812 // tlen
813 c->isize = strtol(p, &p, 10);
814 if (*p++ != '\t') goto err_ret;
815 // seq
816 q = _read_token(p);
817 if (strcmp(q, "*")) {
818 c->l_qseq = p - q - 1;
819 i = bam_cigar2qlen(c->n_cigar, (uint32_t*)(str.s + c->l_qname));
820 _parse_err(c->n_cigar && i != c->l_qseq, "CIGAR and query sequence are of different length");
821 i = (c->l_qseq + 1) >> 1;
822 _get_mem(uint8_t, &t, &str, i);
823 memset(t, 0, i);
824 for (i = 0; i < c->l_qseq; ++i)
825 t[i>>1] |= seq_nt16_table[(int)q[i]] << ((~i&1)<<2);
826 } else c->l_qseq = 0;
827 // qual
828 q = _read_token_aux(p);
829 _get_mem(uint8_t, &t, &str, c->l_qseq);
830 if (strcmp(q, "*")) {
831 _parse_err(p - q - 1 != c->l_qseq, "SEQ and QUAL are of different length");
832 for (i = 0; i < c->l_qseq; ++i) t[i] = q[i] - 33;
833 } else memset(t, 0xff, c->l_qseq);
834 // aux
835 // Note that (like the bam1_core_t fields) this aux data in b->data is
836 // stored in host endianness; so there is no byte swapping needed here.
837 while (p < s->s + s->l) {
838 uint8_t type;
839 q = _read_token_aux(p); // FIXME: can be accelerated for long 'B' arrays
840 _parse_err(p - q - 1 < 6, "incomplete aux field");
841 kputsn_(q, 2, &str);
842 q += 3; type = *q++; ++q; // q points to value
843 if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
844 kputc_('A', &str);
845 kputc_(*q, &str);
846 } else if (type == 'i' || type == 'I') {
847 if (*q == '-') {
848 long x = strtol(q, &q, 10);
849 if (x >= INT8_MIN) {
850 kputc_('c', &str); kputc_(x, &str);
851 } else if (x >= INT16_MIN) {
852 int16_t y = x;
853 kputc_('s', &str); kputsn_((char*)&y, 2, &str);
854 } else {
855 int32_t y = x;
856 kputc_('i', &str); kputsn_(&y, 4, &str);
857 }
858 } else {
859 unsigned long x = strtoul(q, &q, 10);
860 if (x <= UINT8_MAX) {
861 kputc_('C', &str); kputc_(x, &str);
862 } else if (x <= UINT16_MAX) {
863 uint16_t y = x;
864 kputc_('S', &str); kputsn_(&y, 2, &str);
865 } else {
866 uint32_t y = x;
867 kputc_('I', &str); kputsn_(&y, 4, &str);
868 }
869 }
870 } else if (type == 'f') {
871 float x;
872 x = strtod(q, &q);
873 kputc_('f', &str); kputsn_(&x, 4, &str);
874 } else if (type == 'd') {
875 double x;
876 x = strtod(q, &q);
877 kputc_('d', &str); kputsn_(&x, 8, &str);
878 } else if (type == 'Z' || type == 'H') {
879 kputc_(type, &str);kputsn_(q, p - q, &str); // note that this include the trailing NULL
880 } else if (type == 'B') {
881 int32_t n;
882 char *r;
883 _parse_err(p - q - 1 < 3, "incomplete B-typed aux field");
884 type = *q++; // q points to the first ',' following the typing byte
885 for (r = q, n = 0; *r; ++r)
886 if (*r == ',') ++n;
887 kputc_('B', &str); kputc_(type, &str); kputsn_(&n, 4, &str);
888 // FIXME: to evaluate which is faster: a) aligned array and then memmove(); b) unaligned array; c) kputsn_()
889 if (type == 'c') while (q + 1 < p) { int8_t x = strtol(q + 1, &q, 0); kputc_(x, &str); }
890 else if (type == 'C') while (q + 1 < p) { uint8_t x = strtoul(q + 1, &q, 0); kputc_(x, &str); }
891 else if (type == 's') while (q + 1 < p) { int16_t x = strtol(q + 1, &q, 0); kputsn_(&x, 2, &str); }
892 else if (type == 'S') while (q + 1 < p) { uint16_t x = strtoul(q + 1, &q, 0); kputsn_(&x, 2, &str); }
893 else if (type == 'i') while (q + 1 < p) { int32_t x = strtol(q + 1, &q, 0); kputsn_(&x, 4, &str); }
894 else if (type == 'I') while (q + 1 < p) { uint32_t x = strtoul(q + 1, &q, 0); kputsn_(&x, 4, &str); }
895 else if (type == 'f') while (q + 1 < p) { float x = strtod(q + 1, &q); kputsn_(&x, 4, &str); }
896 else _parse_err(1, "unrecognized type");
897 } else _parse_err(1, "unrecognized type");
898 }
899 b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
900 return 0;
901
902 #undef _parse_warn
903 #undef _parse_err
904 #undef _get_mem
905 #undef _read_token_aux
906 #undef _read_token
907 err_ret:
908 b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
909 return -2;
910 }
911
912 int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
913 {
914 switch (fp->format.format) {
915 case bam: {
916 int r = bam_read1(fp->fp.bgzf, b);
917 if (r >= 0) {
918 if (b->core.tid >= h->n_targets || b->core.tid < -1 ||
919 b->core.mtid >= h->n_targets || b->core.mtid < -1)
920 return -3;
921 }
922 return r;
923 }
924
925 case cram:
926 return cram_get_bam_seq(fp->fp.cram, &b);
927
928 case sam: {
929 int ret;
930 err_recover:
931 if (fp->line.l == 0) {
932 ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
933 if (ret < 0) return -1;
934 }
935 ret = sam_parse1(&fp->line, h, b);
936 fp->line.l = 0;
937 if (ret < 0) {
938 if (hts_verbose >= 1)
939 fprintf(stderr, "[W::%s] parse error at line %lld\n", __func__, (long long)fp->lineno);
940 if (h->ignore_sam_err) goto err_recover;
941 }
942 return ret;
943 }
944
945 default:
946 abort();
947 }
948 }
949
950 int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
951 {
952 int i;
953 uint8_t *s;
954 const bam1_core_t *c = &b->core;
955
956 str->l = 0;
957 kputsn(bam_get_qname(b), c->l_qname-1, str); kputc('\t', str); // query name
958 kputw(c->flag, str); kputc('\t', str); // flag
959 if (c->tid >= 0) { // chr
960 kputs(h->target_name[c->tid] , str);
961 kputc('\t', str);
962 } else kputsn("*\t", 2, str);
963 kputw(c->pos + 1, str); kputc('\t', str); // pos
964 kputw(c->qual, str); kputc('\t', str); // qual
965 if (c->n_cigar) { // cigar
966 uint32_t *cigar = bam_get_cigar(b);
967 for (i = 0; i < c->n_cigar; ++i) {
968 kputw(bam_cigar_oplen(cigar[i]), str);
969 kputc(bam_cigar_opchr(cigar[i]), str);
970 }
971 } else kputc('*', str);
972 kputc('\t', str);
973 if (c->mtid < 0) kputsn("*\t", 2, str); // mate chr
974 else if (c->mtid == c->tid) kputsn("=\t", 2, str);
975 else {
976 kputs(h->target_name[c->mtid], str);
977 kputc('\t', str);
978 }
979 kputw(c->mpos + 1, str); kputc('\t', str); // mate pos
980 kputw(c->isize, str); kputc('\t', str); // template len
981 if (c->l_qseq) { // seq and qual
982 uint8_t *s = bam_get_seq(b);
983 for (i = 0; i < c->l_qseq; ++i) kputc("=ACMGRSVTWYHKDBN"[bam_seqi(s, i)], str);
984 kputc('\t', str);
985 s = bam_get_qual(b);
986 if (s[0] == 0xff) kputc('*', str);
987 else for (i = 0; i < c->l_qseq; ++i) kputc(s[i] + 33, str);
988 } else kputsn("*\t*", 3, str);
989 s = bam_get_aux(b); // aux
990 while (s+4 <= b->data + b->l_data) {
991 uint8_t type, key[2];
992 key[0] = s[0]; key[1] = s[1];
993 s += 2; type = *s++;
994 kputc('\t', str); kputsn((char*)key, 2, str); kputc(':', str);
995 if (type == 'A') {
996 kputsn("A:", 2, str);
997 kputc(*s, str);
998 ++s;
999 } else if (type == 'C') {
1000 kputsn("i:", 2, str);
1001 kputw(*s, str);
1002 ++s;
1003 } else if (type == 'c') {
1004 kputsn("i:", 2, str);
1005 kputw(*(int8_t*)s, str);
1006 ++s;
1007 } else if (type == 'S') {
1008 if (s+2 <= b->data + b->l_data) {
1009 kputsn("i:", 2, str);
1010 kputw(*(uint16_t*)s, str);
1011 s += 2;
1012 } else return -1;
1013 } else if (type == 's') {
1014 if (s+2 <= b->data + b->l_data) {
1015 kputsn("i:", 2, str);
1016 kputw(*(int16_t*)s, str);
1017 s += 2;
1018 } else return -1;
1019 } else if (type == 'I') {
1020 if (s+4 <= b->data + b->l_data) {
1021 kputsn("i:", 2, str);
1022 kputuw(*(uint32_t*)s, str);
1023 s += 4;
1024 } else return -1;
1025 } else if (type == 'i') {
1026 if (s+4 <= b->data + b->l_data) {
1027 kputsn("i:", 2, str);
1028 kputw(*(int32_t*)s, str);
1029 s += 4;
1030 } else return -1;
1031 } else if (type == 'f') {
1032 if (s+4 <= b->data + b->l_data) {
1033 ksprintf(str, "f:%g", *(float*)s);
1034 s += 4;
1035 } else return -1;
1036
1037 } else if (type == 'd') {
1038 if (s+8 <= b->data + b->l_data) {
1039 ksprintf(str, "d:%g", *(double*)s);
1040 s += 8;
1041 } else return -1;
1042 } else if (type == 'Z' || type == 'H') {
1043 kputc(type, str); kputc(':', str);
1044 while (s < b->data + b->l_data && *s) kputc(*s++, str);
1045 if (s >= b->data + b->l_data)
1046 return -1;
1047 ++s;
1048 } else if (type == 'B') {
1049 uint8_t sub_type = *(s++);
1050 int32_t n;
1051 memcpy(&n, s, 4);
1052 s += 4; // no point to the start of the array
1053 if (s + n >= b->data + b->l_data)
1054 return -1;
1055 kputsn("B:", 2, str); kputc(sub_type, str); // write the typing
1056 for (i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if"
1057 kputc(',', str);
1058 if ('c' == sub_type) { kputw(*(int8_t*)s, str); ++s; }
1059 else if ('C' == sub_type) { kputw(*(uint8_t*)s, str); ++s; }
1060 else if ('s' == sub_type) { kputw(*(int16_t*)s, str); s += 2; }
1061 else if ('S' == sub_type) { kputw(*(uint16_t*)s, str); s += 2; }
1062 else if ('i' == sub_type) { kputw(*(int32_t*)s, str); s += 4; }
1063 else if ('I' == sub_type) { kputuw(*(uint32_t*)s, str); s += 4; }
1064 else if ('f' == sub_type) { ksprintf(str, "%g", *(float*)s); s += 4; }
1065 }
1066 }
1067 }
1068 return str->l;
1069 }
1070
1071 int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
1072 {
1073 switch (fp->format.format) {
1074 case binary_format:
1075 fp->format.category = sequence_data;
1076 fp->format.format = bam;
1077 /* fall-through */
1078 case bam:
1079 return bam_write1(fp->fp.bgzf, b);
1080
1081 case cram:
1082 return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
1083
1084 case text_format:
1085 fp->format.category = sequence_data;
1086 fp->format.format = sam;
1087 /* fall-through */
1088 case sam:
1089 if (sam_format1(h, b, &fp->line) < 0) return -1;
1090 kputc('\n', &fp->line);
1091 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
1092 return fp->line.l;
1093
1094 default:
1095 abort();
1096 }
1097 }
1098
1099 /************************
1100 *** Auxiliary fields ***
1101 ************************/
1102
1103 void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data)
1104 {
1105 int ori_len = b->l_data;
1106 b->l_data += 3 + len;
1107 if (b->m_data < b->l_data) {
1108 b->m_data = b->l_data;
1109 kroundup32(b->m_data);
1110 b->data = (uint8_t*)realloc(b->data, b->m_data);
1111 }
1112 b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
1113 b->data[ori_len + 2] = type;
1114 memcpy(b->data + ori_len + 3, data, len);
1115 }
1116
1117 static inline uint8_t *skip_aux(uint8_t *s)
1118 {
1119 int size = aux_type2size(*s); ++s; // skip type
1120 uint32_t n;
1121 switch (size) {
1122 case 'Z':
1123 case 'H':
1124 while (*s) ++s;
1125 return s + 1;
1126 case 'B':
1127 size = aux_type2size(*s); ++s;
1128 memcpy(&n, s, 4); s += 4;
1129 return s + size * n;
1130 case 0:
1131 abort();
1132 break;
1133 default:
1134 return s + size;
1135 }
1136 }
1137
1138 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
1139 {
1140 uint8_t *s;
1141 int y = tag[0]<<8 | tag[1];
1142 s = bam_get_aux(b);
1143 while (s < b->data + b->l_data) {
1144 int x = (int)s[0]<<8 | s[1];
1145 s += 2;
1146 if (x == y) return s;
1147 s = skip_aux(s);
1148 }
1149 return 0;
1150 }
1151 // s MUST BE returned by bam_aux_get()
1152 int bam_aux_del(bam1_t *b, uint8_t *s)
1153 {
1154 uint8_t *p, *aux;
1155 int l_aux = bam_get_l_aux(b);
1156 aux = bam_get_aux(b);
1157 p = s - 2;
1158 s = skip_aux(s);
1159 memmove(p, s, l_aux - (s - aux));
1160 b->l_data -= s - p;
1161 return 0;
1162 }
1163
1164 int32_t bam_aux2i(const uint8_t *s)
1165 {
1166 int type;
1167 type = *s++;
1168 if (type == 'c') return (int32_t)*(int8_t*)s;
1169 else if (type == 'C') return (int32_t)*(uint8_t*)s;
1170 else if (type == 's') return (int32_t)*(int16_t*)s;
1171 else if (type == 'S') return (int32_t)*(uint16_t*)s;
1172 else if (type == 'i' || type == 'I') return *(int32_t*)s;
1173 else return 0;
1174 }
1175
1176 double bam_aux2f(const uint8_t *s)
1177 {
1178 int type;
1179 type = *s++;
1180 if (type == 'd') return *(double*)s;
1181 else if (type == 'f') return *(float*)s;
1182 else return 0.0;
1183 }
1184
1185 char bam_aux2A(const uint8_t *s)
1186 {
1187 int type;
1188 type = *s++;
1189 if (type == 'A') return *(char*)s;
1190 else return 0;
1191 }
1192
1193 char *bam_aux2Z(const uint8_t *s)
1194 {
1195 int type;
1196 type = *s++;
1197 if (type == 'Z' || type == 'H') return (char*)s;
1198 else return 0;
1199 }
1200
1201 int sam_open_mode(char *mode, const char *fn, const char *format)
1202 {
1203 // TODO Parse "bam5" etc for compression level
1204 if (format == NULL) {
1205 // Try to pick a format based on the filename extension
1206 const char *ext = fn? strrchr(fn, '.') : NULL;
1207 if (ext == NULL || strchr(ext, '/')) return -1;
1208 return sam_open_mode(mode, fn, ext+1);
1209 }
1210 else if (strcmp(format, "bam") == 0) strcpy(mode, "b");
1211 else if (strcmp(format, "cram") == 0) strcpy(mode, "c");
1212 else if (strcmp(format, "sam") == 0) strcpy(mode, "");
1213 else return -1;
1214
1215 return 0;
1216 }
1217
1218 #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
1219 int bam_str2flag(const char *str)
1220 {
1221 char *end, *beg = (char*) str;
1222 long int flag = strtol(str, &end, 0);
1223 if ( end!=str ) return flag; // the conversion was successful
1224 flag = 0;
1225 while ( *str )
1226 {
1227 end = beg;
1228 while ( *end && *end!=',' ) end++;
1229 if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
1230 else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
1231 else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
1232 else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
1233 else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
1234 else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
1235 else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
1236 else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
1237 else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
1238 else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
1239 else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
1240 else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
1241 else return -1;
1242 if ( !*end ) break;
1243 beg = end + 1;
1244 }
1245 return flag;
1246 }
1247
1248 char *bam_flag2str(int flag)
1249 {
1250 kstring_t str = {0,0,0};
1251 if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
1252 if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
1253 if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
1254 if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
1255 if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
1256 if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
1257 if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
1258 if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
1259 if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
1260 if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
1261 if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
1262 if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
1263 if ( str.l == 0 ) kputsn("", 0, &str);
1264 return str.s;
1265 }
1266
1267
1268 /**************************
1269 *** Pileup and Mpileup ***
1270 **************************/
1271
1272 #if !defined(BAM_NO_PILEUP)
1273
1274 #include <assert.h>
1275
1276 /*******************
1277 *** Memory pool ***
1278 *******************/
1279
1280 typedef struct {
1281 int k, x, y, end;
1282 } cstate_t;
1283
1284 static cstate_t g_cstate_null = { -1, 0, 0, 0 };
1285
1286 typedef struct __linkbuf_t {
1287 bam1_t b;
1288 int32_t beg, end;
1289 cstate_t s;
1290 struct __linkbuf_t *next;
1291 } lbnode_t;
1292
1293 typedef struct {
1294 int cnt, n, max;
1295 lbnode_t **buf;
1296 } mempool_t;
1297
1298 static mempool_t *mp_init(void)
1299 {
1300 mempool_t *mp;
1301 mp = (mempool_t*)calloc(1, sizeof(mempool_t));
1302 return mp;
1303 }
1304 static void mp_destroy(mempool_t *mp)
1305 {
1306 int k;
1307 for (k = 0; k < mp->n; ++k) {
1308 free(mp->buf[k]->b.data);
1309 free(mp->buf[k]);
1310 }
1311 free(mp->buf);
1312 free(mp);
1313 }
1314 static inline lbnode_t *mp_alloc(mempool_t *mp)
1315 {
1316 ++mp->cnt;
1317 if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
1318 else return mp->buf[--mp->n];
1319 }
1320 static inline void mp_free(mempool_t *mp, lbnode_t *p)
1321 {
1322 --mp->cnt; p->next = 0; // clear lbnode_t::next here
1323 if (mp->n == mp->max) {
1324 mp->max = mp->max? mp->max<<1 : 256;
1325 mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
1326 }
1327 mp->buf[mp->n++] = p;
1328 }
1329
1330 /**********************
1331 *** CIGAR resolver ***
1332 **********************/
1333
1334 /* s->k: the index of the CIGAR operator that has just been processed.
1335 s->x: the reference coordinate of the start of s->k
1336 s->y: the query coordiante of the start of s->k
1337 */
1338 static inline int resolve_cigar2(bam_pileup1_t *p, int32_t pos, cstate_t *s)
1339 {
1340 #define _cop(c) ((c)&BAM_CIGAR_MASK)
1341 #define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
1342
1343 bam1_t *b = p->b;
1344 bam1_core_t *c = &b->core;
1345 uint32_t *cigar = bam_get_cigar(b);
1346 int k;
1347 // determine the current CIGAR operation
1348 // fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
1349 if (s->k == -1) { // never processed
1350 if (c->n_cigar == 1) { // just one operation, save a loop
1351 if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
1352 } else { // find the first match or deletion
1353 for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
1354 int op = _cop(cigar[k]);
1355 int l = _cln(cigar[k]);
1356 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
1357 else if (op == BAM_CREF_SKIP) s->x += l;
1358 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
1359 }
1360 assert(k < c->n_cigar);
1361 s->k = k;
1362 }
1363 } else { // the read has been processed before
1364 int op, l = _cln(cigar[s->k]);
1365 if (pos - s->x >= l) { // jump to the next operation
1366 assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
1367 op = _cop(cigar[s->k+1]);
1368 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
1369 if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
1370 s->x += l;
1371 ++s->k;
1372 } else { // find the next M/D/N/=/X
1373 if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
1374 s->x += l;
1375 for (k = s->k + 1; k < c->n_cigar; ++k) {
1376 op = _cop(cigar[k]), l = _cln(cigar[k]);
1377 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
1378 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
1379 }
1380 s->k = k;
1381 }
1382 assert(s->k < c->n_cigar); // otherwise a bug
1383 } // else, do nothing
1384 }
1385 { // collect pileup information
1386 int op, l;
1387 op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
1388 p->is_del = p->indel = p->is_refskip = 0;
1389 if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
1390 int op2 = _cop(cigar[s->k+1]);
1391 int l2 = _cln(cigar[s->k+1]);
1392 if (op2 == BAM_CDEL) p->indel = -(int)l2;
1393 else if (op2 == BAM_CINS) p->indel = l2;
1394 else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
1395 int l3 = 0;
1396 for (k = s->k + 2; k < c->n_cigar; ++k) {
1397 op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
1398 if (op2 == BAM_CINS) l3 += l2;
1399 else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
1400 }
1401 if (l3 > 0) p->indel = l3;
1402 }
1403 }
1404 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
1405 p->qpos = s->y + (pos - s->x);
1406 } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
1407 p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
1408 p->is_refskip = (op == BAM_CREF_SKIP);
1409 } // cannot be other operations; otherwise a bug
1410 p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
1411 }
1412 return 1;
1413 }
1414
1415 /***********************
1416 *** Pileup iterator ***
1417 ***********************/
1418
1419 // Dictionary of overlapping reads
1420 KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
1421 typedef khash_t(olap_hash) olap_hash_t;
1422
1423 struct __bam_plp_t {
1424 mempool_t *mp;
1425 lbnode_t *head, *tail, *dummy;
1426 int32_t tid, pos, max_tid, max_pos;
1427 int is_eof, max_plp, error, maxcnt;
1428 uint64_t id;
1429 bam_pileup1_t *plp;
1430 // for the "auto" interface only
1431 bam1_t *b;
1432 bam_plp_auto_f func;
1433 void *data;
1434 olap_hash_t *overlaps;
1435 };
1436
1437 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
1438 {
1439 bam_plp_t iter;
1440 iter = (bam_plp_t)calloc(1, sizeof(struct __bam_plp_t));
1441 iter->mp = mp_init();
1442 iter->head = iter->tail = mp_alloc(iter->mp);
1443 iter->dummy = mp_alloc(iter->mp);
1444 iter->max_tid = iter->max_pos = -1;
1445 iter->maxcnt = 8000;
1446 if (func) {
1447 iter->func = func;
1448 iter->data = data;
1449 iter->b = bam_init1();
1450 }
1451 return iter;
1452 }
1453
1454 void bam_plp_init_overlaps(bam_plp_t iter)
1455 {
1456 iter->overlaps = kh_init(olap_hash); // hash for tweaking quality of bases in overlapping reads
1457 }
1458
1459 void bam_plp_destroy(bam_plp_t iter)
1460 {
1461 if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
1462 mp_free(iter->mp, iter->dummy);
1463 mp_free(iter->mp, iter->head);
1464 if (iter->mp->cnt != 0)
1465 fprintf(stderr, "[bam_plp_destroy] memory leak: %d. Continue anyway.\n", iter->mp->cnt);
1466 mp_destroy(iter->mp);
1467 if (iter->b) bam_destroy1(iter->b);
1468 free(iter->plp);
1469 free(iter);
1470 }
1471
1472
1473 //---------------------------------
1474 //--- Tweak overlapping reads
1475 //---------------------------------
1476
1477 /**
1478 * cigar_iref2iseq_set() - find the first CMATCH setting the ref and the read index
1479 * cigar_iref2iseq_next() - get the next CMATCH base
1480 * @cigar: pointer to current cigar block (rw)
1481 * @cigar_max: pointer just beyond the last cigar block
1482 * @icig: position within the current cigar block (rw)
1483 * @iseq: position in the sequence (rw)
1484 * @iref: position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
1485 *
1486 * Returns BAM_CMATCH or -1 when there is no more cigar to process or the requested position is not covered.
1487 */
1488 static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
1489 {
1490 int pos = *iref;
1491 if ( pos < 0 ) return -1;
1492 *icig = 0;
1493 *iseq = 0;
1494 *iref = 0;
1495 while ( *cigar<cigar_max )
1496 {
1497 int cig = (**cigar) & BAM_CIGAR_MASK;
1498 int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
1499
1500 if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
1501 if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
1502 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
1503 {
1504 pos -= ncig;
1505 if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
1506 (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
1507 continue;
1508 }
1509 if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
1510 if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
1511 {
1512 pos -= ncig;
1513 if ( pos<0 ) pos = 0;
1514 (*cigar)++; *icig = 0; *iref += ncig;
1515 continue;
1516 }
1517 fprintf(stderr,"todo: cigar %d\n", cig);
1518 assert(0);
1519 }
1520 *iseq = -1;
1521 return -1;
1522 }
1523 static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
1524 {
1525 while ( *cigar < cigar_max )
1526 {
1527 int cig = (**cigar) & BAM_CIGAR_MASK;
1528 int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
1529
1530 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
1531 {
1532 if ( *icig >= ncig - 1 ) { *icig = 0; (*cigar)++; continue; }
1533 (*iseq)++; (*icig)++; (*iref)++;
1534 return BAM_CMATCH;
1535 }
1536 if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; }
1537 if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
1538 if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
1539 if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
1540 fprintf(stderr,"todo: cigar %d\n", cig);
1541 assert(0);
1542 }
1543 *iseq = -1;
1544 *iref = -1;
1545 return -1;
1546 }
1547
1548 static void tweak_overlap_quality(bam1_t *a, bam1_t *b)
1549 {
1550 uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar;
1551 uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar;
1552 int a_icig = 0, a_iseq = 0;
1553 int b_icig = 0, b_iseq = 0;
1554 uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
1555 uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b);
1556
1557 int iref = b->core.pos;
1558 int a_iref = iref - a->core.pos;
1559 int b_iref = iref - b->core.pos;
1560 int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
1561 if ( a_ret<0 ) return; // no overlap
1562 int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
1563 if ( b_ret<0 ) return; // no overlap
1564
1565 #if DBG
1566 fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %d-%d\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar,
1567 a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b)));
1568 #endif
1569
1570 while ( 1 )
1571 {
1572 // Increment reference position
1573 while ( a_iref>=0 && a_iref < iref - a->core.pos )
1574 a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
1575 if ( a_ret<0 ) break; // done
1576 if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos;
1577
1578 while ( b_iref>=0 && b_iref < iref - b->core.pos )
1579 b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
1580 if ( b_ret<0 ) break; // done
1581 if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos;
1582
1583 iref++;
1584 if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue; // only CMATCH positions, don't know what to do with indels
1585
1586 if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) )
1587 {
1588 #if DBG
1589 fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]);
1590 #endif
1591 // we are very confident about this base
1592 int qual = a_qual[a_iseq] + b_qual[b_iseq];
1593 a_qual[a_iseq] = qual>200 ? 200 : qual;
1594 b_qual[b_iseq] = 0;
1595 }
1596 else
1597 {
1598 if ( a_qual[a_iseq] >= b_qual[b_iseq] )
1599 {
1600 #if DBG
1601 fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower(seq_nt16_str[bam_seqi(b_seq,b_iseq)]));
1602 #endif
1603 a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; // not so confident about a_qual anymore given the mismatch
1604 b_qual[b_iseq] = 0;
1605 }
1606 else
1607 {
1608 #if DBG
1609 fprintf(stderr,"[%c/%c]",tolower(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]);
1610 #endif
1611 b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
1612 a_qual[a_iseq] = 0;
1613 }
1614 }
1615 }
1616 #if DBG
1617 fprintf(stderr,"\n");
1618 #endif
1619 }
1620
1621 // Fix overlapping reads. Simple soft-clipping did not give good results.
1622 // Lowering qualities of unwanted bases is more selective and works better.
1623 //
1624 static void overlap_push(bam_plp_t iter, lbnode_t *node)
1625 {
1626 if ( !iter->overlaps ) return;
1627
1628 // mapped mates and paired reads only
1629 if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return;
1630
1631 // no overlap possible, unless some wild cigar
1632 if ( abs(node->b.core.isize) >= 2*node->b.core.l_qseq ) return;
1633
1634 khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
1635 if ( kitr==kh_end(iter->overlaps) )
1636 {
1637 int ret;
1638 kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
1639 kh_value(iter->overlaps, kitr) = node;
1640 }
1641 else
1642 {
1643 lbnode_t *a = kh_value(iter->overlaps, kitr);
1644 tweak_overlap_quality(&a->b, &node->b);
1645 kh_del(olap_hash, iter->overlaps, kitr);
1646 assert(a->end-1 == a->s.end);
1647 a->end = a->b.core.pos + bam_cigar2rlen(a->b.core.n_cigar, bam_get_cigar(&a->b));
1648 a->s.end = a->end - 1;
1649 }
1650 }
1651
1652 static void overlap_remove(bam_plp_t iter, const bam1_t *b)
1653 {
1654 if ( !iter->overlaps ) return;
1655
1656 khiter_t kitr;
1657 if ( b )
1658 {
1659 kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
1660 if ( kitr!=kh_end(iter->overlaps) )
1661 kh_del(olap_hash, iter->overlaps, kitr);
1662 }
1663 else
1664 {
1665 // remove all
1666 for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
1667 if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
1668 }
1669 }
1670
1671
1672
1673 // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
1674 // pointer to the piled records if next position is ready or NULL if there is not enough records in the
1675 // buffer yet (the current position is still the maximum position across all buffered reads).
1676 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
1677 {
1678 if (iter->error) { *_n_plp = -1; return 0; }
1679 *_n_plp = 0;
1680 if (iter->is_eof && iter->head->next == 0) return 0;
1681 while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
1682 int n_plp = 0;
1683 lbnode_t *p, *q;
1684 // write iter->plp at iter->pos
1685 iter->dummy->next = iter->head;
1686 for (p = iter->head, q = iter->dummy; p->next; q = p, p = p->next) {
1687 if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
1688 overlap_remove(iter, &p->b);
1689 q->next = p->next; mp_free(iter->mp, p); p = q;
1690 } else if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
1691 if (n_plp == iter->max_plp) { // then double the capacity
1692 iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
1693 iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
1694 }
1695 iter->plp[n_plp].b = &p->b;
1696 if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
1697 }
1698 }
1699 iter->head = iter->dummy->next; // dummy->next may be changed
1700 *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
1701 // update iter->tid and iter->pos
1702 if (iter->head->next) {
1703 if (iter->tid > iter->head->b.core.tid) {
1704 fprintf(stderr, "[%s] unsorted input. Pileup aborts.\n", __func__);
1705 iter->error = 1;
1706 *_n_plp = -1;
1707 return 0;
1708 }
1709 }
1710 if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
1711 iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
1712 } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
1713 iter->pos = iter->head->beg; // jump to the next position
1714 } else ++iter->pos; // scan contiguously
1715 // return
1716 if (n_plp) return iter->plp;
1717 if (iter->is_eof && iter->head->next == 0) break;
1718 }
1719 return 0;
1720 }
1721
1722 int bam_plp_push(bam_plp_t iter, const bam1_t *b)
1723 {
1724 if (iter->error) return -1;
1725 if (b) {
1726 if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
1727 // Skip only unmapped reads here, any additional filtering must be done in iter->func
1728 if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
1729 if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
1730 {
1731 overlap_remove(iter, b);
1732 return 0;
1733 }
1734 bam_copy1(&iter->tail->b, b);
1735 overlap_push(iter, iter->tail);
1736 #ifndef BAM_NO_ID
1737 iter->tail->b.id = iter->id++;
1738 #endif
1739 iter->tail->beg = b->core.pos;
1740 iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1741 iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
1742 if (b->core.tid < iter->max_tid) {
1743 fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
1744 iter->error = 1;
1745 return -1;
1746 }
1747 if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
1748 fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n");
1749 iter->error = 1;
1750 return -1;
1751 }
1752 iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
1753 if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
1754 iter->tail->next = mp_alloc(iter->mp);
1755 iter->tail = iter->tail->next;
1756 }
1757 } else iter->is_eof = 1;
1758 return 0;
1759 }
1760
1761 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
1762 {
1763 const bam_pileup1_t *plp;
1764 if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
1765 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
1766 else { // no pileup line can be obtained; read alignments
1767 *_n_plp = 0;
1768 if (iter->is_eof) return 0;
1769 int ret;
1770 while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
1771 if (bam_plp_push(iter, iter->b) < 0) {
1772 *_n_plp = -1;
1773 return 0;
1774 }
1775 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
1776 // otherwise no pileup line can be returned; read the next alignment.
1777 }
1778 if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
1779 bam_plp_push(iter, 0);
1780 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
1781 return 0;
1782 }
1783 }
1784
1785 void bam_plp_reset(bam_plp_t iter)
1786 {
1787 lbnode_t *p, *q;
1788 iter->max_tid = iter->max_pos = -1;
1789 iter->tid = iter->pos = 0;
1790 iter->is_eof = 0;
1791 for (p = iter->head; p->next;) {
1792 overlap_remove(iter, NULL);
1793 q = p->next;
1794 mp_free(iter->mp, p);
1795 p = q;
1796 }
1797 iter->head = iter->tail;
1798 }
1799
1800 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
1801 {
1802 iter->maxcnt = maxcnt;
1803 }
1804
1805 /************************
1806 *** Mpileup iterator ***
1807 ************************/
1808
1809 struct __bam_mplp_t {
1810 int n;
1811 uint64_t min, *pos;
1812 bam_plp_t *iter;
1813 int *n_plp;
1814 const bam_pileup1_t **plp;
1815 };
1816
1817 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
1818 {
1819 int i;
1820 bam_mplp_t iter;
1821 iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t));
1822 iter->pos = (uint64_t*)calloc(n, sizeof(uint64_t));
1823 iter->n_plp = (int*)calloc(n, sizeof(int));
1824 iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
1825 iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
1826 iter->n = n;
1827 iter->min = (uint64_t)-1;
1828 for (i = 0; i < n; ++i) {
1829 iter->iter[i] = bam_plp_init(func, data[i]);
1830 iter->pos[i] = iter->min;
1831 }
1832 return iter;
1833 }
1834
1835 void bam_mplp_init_overlaps(bam_mplp_t iter)
1836 {
1837 int i;
1838 for (i = 0; i < iter->n; ++i)
1839 bam_plp_init_overlaps(iter->iter[i]);
1840 }
1841
1842 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
1843 {
1844 int i;
1845 for (i = 0; i < iter->n; ++i)
1846 iter->iter[i]->maxcnt = maxcnt;
1847 }
1848
1849 void bam_mplp_destroy(bam_mplp_t iter)
1850 {
1851 int i;
1852 for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
1853 free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp);
1854 free(iter);
1855 }
1856
1857 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
1858 {
1859 int i, ret = 0;
1860 uint64_t new_min = (uint64_t)-1;
1861 for (i = 0; i < iter->n; ++i) {
1862 if (iter->pos[i] == iter->min) {
1863 int tid, pos;
1864 iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
1865 if ( iter->iter[i]->error ) return -1;
1866 iter->pos[i] = iter->plp[i] ? (uint64_t)tid<<32 | pos : 0;
1867 }
1868 if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];
1869 }
1870 iter->min = new_min;
1871 if (new_min == (uint64_t)-1) return 0;
1872 *_tid = new_min>>32; *_pos = (uint32_t)new_min;
1873 for (i = 0; i < iter->n; ++i) {
1874 if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line"
1875 n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
1876 ++ret;
1877 } else n_plp[i] = 0, plp[i] = 0;
1878 }
1879 return ret;
1880 }
1881
1882 #endif // ~!defined(BAM_NO_PILEUP)