comparison pyPRADA_1.2/tools/samtools-0.1.16/bam_index.c @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc2ca1a3ba4
1 #include <ctype.h>
2 #include <assert.h>
3 #include "bam.h"
4 #include "khash.h"
5 #include "ksort.h"
6 #include "bam_endian.h"
7 #ifdef _USE_KNETFILE
8 #include "knetfile.h"
9 #endif
10
11 /*!
12 @header
13
14 Alignment indexing. Before indexing, BAM must be sorted based on the
15 leftmost coordinate of alignments. In indexing, BAM uses two indices:
16 a UCSC binning index and a simple linear index. The binning index is
17 efficient for alignments spanning long distance, while the auxiliary
18 linear index helps to reduce unnecessary seek calls especially for
19 short alignments.
20
21 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
22 Stein and is explained by Kent et al. (2002). In this scheme, each bin
23 represents a contiguous genomic region which can be fully contained in
24 another bin; each alignment is associated with a bin which represents
25 the smallest region containing the entire alignment. The binning
26 scheme is essentially another representation of R-tree. A distinct bin
27 uniquely corresponds to a distinct internal node in a R-tree. Bin A is
28 a child of Bin B if region A is contained in B.
29
30 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
31 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
32 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
33 find the alignments overlapped with a region [rbeg,rend), we need to
34 calculate the list of bins that may be overlapped the region and test
35 the alignments in the bins to confirm the overlaps. If the specified
36 region is short, typically only a few alignments in six bins need to
37 be retrieved. The overlapping alignments can be quickly fetched.
38
39 */
40
41 #define BAM_MIN_CHUNK_GAP 32768
42 // 1<<14 is the size of minimum bin.
43 #define BAM_LIDX_SHIFT 14
44
45 #define BAM_MAX_BIN 37450 // =(8^6-1)/7+1
46
47 typedef struct {
48 uint64_t u, v;
49 } pair64_t;
50
51 #define pair64_lt(a,b) ((a).u < (b).u)
52 KSORT_INIT(off, pair64_t, pair64_lt)
53
54 typedef struct {
55 uint32_t m, n;
56 pair64_t *list;
57 } bam_binlist_t;
58
59 typedef struct {
60 int32_t n, m;
61 uint64_t *offset;
62 } bam_lidx_t;
63
64 KHASH_MAP_INIT_INT(i, bam_binlist_t)
65
66 struct __bam_index_t {
67 int32_t n;
68 uint64_t n_no_coor; // unmapped reads without coordinate
69 khash_t(i) **index;
70 bam_lidx_t *index2;
71 };
72
73 // requirement: len <= LEN_MASK
74 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
75 {
76 khint_t k;
77 bam_binlist_t *l;
78 int ret;
79 k = kh_put(i, h, bin, &ret);
80 l = &kh_value(h, k);
81 if (ret) { // not present
82 l->m = 1; l->n = 0;
83 l->list = (pair64_t*)calloc(l->m, 16);
84 }
85 if (l->n == l->m) {
86 l->m <<= 1;
87 l->list = (pair64_t*)realloc(l->list, l->m * 16);
88 }
89 l->list[l->n].u = beg; l->list[l->n++].v = end;
90 }
91
92 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
93 {
94 int i, beg, end;
95 beg = b->core.pos >> BAM_LIDX_SHIFT;
96 end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
97 if (index2->m < end + 1) {
98 int old_m = index2->m;
99 index2->m = end + 1;
100 kroundup32(index2->m);
101 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
102 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
103 }
104 if (beg == end) {
105 if (index2->offset[beg] == 0) index2->offset[beg] = offset;
106 } else {
107 for (i = beg; i <= end; ++i)
108 if (index2->offset[i] == 0) index2->offset[i] = offset;
109 }
110 index2->n = end + 1;
111 }
112
113 static void merge_chunks(bam_index_t *idx)
114 {
115 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
116 khash_t(i) *index;
117 int i, l, m;
118 khint_t k;
119 for (i = 0; i < idx->n; ++i) {
120 index = idx->index[i];
121 for (k = kh_begin(index); k != kh_end(index); ++k) {
122 bam_binlist_t *p;
123 if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
124 p = &kh_value(index, k);
125 m = 0;
126 for (l = 1; l < p->n; ++l) {
127 #ifdef BAM_TRUE_OFFSET
128 if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
129 #else
130 if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
131 #endif
132 else p->list[++m] = p->list[l];
133 } // ~for(l)
134 p->n = m + 1;
135 } // ~for(k)
136 } // ~for(i)
137 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
138 }
139
140 static void fill_missing(bam_index_t *idx)
141 {
142 int i, j;
143 for (i = 0; i < idx->n; ++i) {
144 bam_lidx_t *idx2 = &idx->index2[i];
145 for (j = 1; j < idx2->n; ++j)
146 if (idx2->offset[j] == 0)
147 idx2->offset[j] = idx2->offset[j-1];
148 }
149 }
150
151 bam_index_t *bam_index_core(bamFile fp)
152 {
153 bam1_t *b;
154 bam_header_t *h;
155 int i, ret;
156 bam_index_t *idx;
157 uint32_t last_bin, save_bin;
158 int32_t last_coor, last_tid, save_tid;
159 bam1_core_t *c;
160 uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
161
162 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
163 b = (bam1_t*)calloc(1, sizeof(bam1_t));
164 h = bam_header_read(fp);
165 c = &b->core;
166
167 idx->n = h->n_targets;
168 bam_header_destroy(h);
169 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
170 for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
171 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
172
173 save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
174 save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
175 n_mapped = n_unmapped = n_no_coor = off_end = 0;
176 off_beg = off_end = bam_tell(fp);
177 while ((ret = bam_read1(fp, b)) >= 0) {
178 if (c->tid < 0) ++n_no_coor;
179 if (last_tid != c->tid) { // change of chromosomes
180 last_tid = c->tid;
181 last_bin = 0xffffffffu;
182 } else if (last_coor > c->pos) {
183 fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
184 bam1_qname(b), last_coor, c->pos, c->tid+1);
185 exit(1);
186 }
187 if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off);
188 if (c->bin != last_bin) { // then possibly write the binning index
189 if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
190 insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
191 if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
192 off_end = last_off;
193 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
194 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
195 n_mapped = n_unmapped = 0;
196 off_beg = off_end;
197 }
198 save_off = last_off;
199 save_bin = last_bin = c->bin;
200 save_tid = c->tid;
201 if (save_tid < 0) break;
202 }
203 if (bam_tell(fp) <= last_off) {
204 fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
205 (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
206 exit(1);
207 }
208 if (c->flag & BAM_FUNMAP) ++n_unmapped;
209 else ++n_mapped;
210 last_off = bam_tell(fp);
211 last_coor = b->core.pos;
212 }
213 if (save_tid >= 0) {
214 insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
215 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp));
216 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
217 }
218 merge_chunks(idx);
219 fill_missing(idx);
220 if (ret >= 0) {
221 while ((ret = bam_read1(fp, b)) >= 0) {
222 ++n_no_coor;
223 if (c->tid >= 0 && n_no_coor) {
224 fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n");
225 exit(1);
226 }
227 }
228 }
229 if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
230 free(b->data); free(b);
231 idx->n_no_coor = n_no_coor;
232 return idx;
233 }
234
235 void bam_index_destroy(bam_index_t *idx)
236 {
237 khint_t k;
238 int i;
239 if (idx == 0) return;
240 for (i = 0; i < idx->n; ++i) {
241 khash_t(i) *index = idx->index[i];
242 bam_lidx_t *index2 = idx->index2 + i;
243 for (k = kh_begin(index); k != kh_end(index); ++k) {
244 if (kh_exist(index, k))
245 free(kh_value(index, k).list);
246 }
247 kh_destroy(i, index);
248 free(index2->offset);
249 }
250 free(idx->index); free(idx->index2);
251 free(idx);
252 }
253
254 void bam_index_save(const bam_index_t *idx, FILE *fp)
255 {
256 int32_t i, size;
257 khint_t k;
258 fwrite("BAI\1", 1, 4, fp);
259 if (bam_is_be) {
260 uint32_t x = idx->n;
261 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
262 } else fwrite(&idx->n, 4, 1, fp);
263 for (i = 0; i < idx->n; ++i) {
264 khash_t(i) *index = idx->index[i];
265 bam_lidx_t *index2 = idx->index2 + i;
266 // write binning index
267 size = kh_size(index);
268 if (bam_is_be) { // big endian
269 uint32_t x = size;
270 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
271 } else fwrite(&size, 4, 1, fp);
272 for (k = kh_begin(index); k != kh_end(index); ++k) {
273 if (kh_exist(index, k)) {
274 bam_binlist_t *p = &kh_value(index, k);
275 if (bam_is_be) { // big endian
276 uint32_t x;
277 x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
278 x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
279 for (x = 0; (int)x < p->n; ++x) {
280 bam_swap_endian_8p(&p->list[x].u);
281 bam_swap_endian_8p(&p->list[x].v);
282 }
283 fwrite(p->list, 16, p->n, fp);
284 for (x = 0; (int)x < p->n; ++x) {
285 bam_swap_endian_8p(&p->list[x].u);
286 bam_swap_endian_8p(&p->list[x].v);
287 }
288 } else {
289 fwrite(&kh_key(index, k), 4, 1, fp);
290 fwrite(&p->n, 4, 1, fp);
291 fwrite(p->list, 16, p->n, fp);
292 }
293 }
294 }
295 // write linear index (index2)
296 if (bam_is_be) {
297 int x = index2->n;
298 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
299 } else fwrite(&index2->n, 4, 1, fp);
300 if (bam_is_be) { // big endian
301 int x;
302 for (x = 0; (int)x < index2->n; ++x)
303 bam_swap_endian_8p(&index2->offset[x]);
304 fwrite(index2->offset, 8, index2->n, fp);
305 for (x = 0; (int)x < index2->n; ++x)
306 bam_swap_endian_8p(&index2->offset[x]);
307 } else fwrite(index2->offset, 8, index2->n, fp);
308 }
309 { // write the number of reads coor-less records.
310 uint64_t x = idx->n_no_coor;
311 if (bam_is_be) bam_swap_endian_8p(&x);
312 fwrite(&x, 8, 1, fp);
313 }
314 fflush(fp);
315 }
316
317 static bam_index_t *bam_index_load_core(FILE *fp)
318 {
319 int i;
320 char magic[4];
321 bam_index_t *idx;
322 if (fp == 0) {
323 fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
324 return 0;
325 }
326 fread(magic, 1, 4, fp);
327 if (strncmp(magic, "BAI\1", 4)) {
328 fprintf(stderr, "[bam_index_load] wrong magic number.\n");
329 fclose(fp);
330 return 0;
331 }
332 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
333 fread(&idx->n, 4, 1, fp);
334 if (bam_is_be) bam_swap_endian_4p(&idx->n);
335 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
336 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
337 for (i = 0; i < idx->n; ++i) {
338 khash_t(i) *index;
339 bam_lidx_t *index2 = idx->index2 + i;
340 uint32_t key, size;
341 khint_t k;
342 int j, ret;
343 bam_binlist_t *p;
344 index = idx->index[i] = kh_init(i);
345 // load binning index
346 fread(&size, 4, 1, fp);
347 if (bam_is_be) bam_swap_endian_4p(&size);
348 for (j = 0; j < (int)size; ++j) {
349 fread(&key, 4, 1, fp);
350 if (bam_is_be) bam_swap_endian_4p(&key);
351 k = kh_put(i, index, key, &ret);
352 p = &kh_value(index, k);
353 fread(&p->n, 4, 1, fp);
354 if (bam_is_be) bam_swap_endian_4p(&p->n);
355 p->m = p->n;
356 p->list = (pair64_t*)malloc(p->m * 16);
357 fread(p->list, 16, p->n, fp);
358 if (bam_is_be) {
359 int x;
360 for (x = 0; x < p->n; ++x) {
361 bam_swap_endian_8p(&p->list[x].u);
362 bam_swap_endian_8p(&p->list[x].v);
363 }
364 }
365 }
366 // load linear index
367 fread(&index2->n, 4, 1, fp);
368 if (bam_is_be) bam_swap_endian_4p(&index2->n);
369 index2->m = index2->n;
370 index2->offset = (uint64_t*)calloc(index2->m, 8);
371 fread(index2->offset, index2->n, 8, fp);
372 if (bam_is_be)
373 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
374 }
375 if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
376 if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
377 return idx;
378 }
379
380 bam_index_t *bam_index_load_local(const char *_fn)
381 {
382 FILE *fp;
383 char *fnidx, *fn;
384
385 if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
386 const char *p;
387 int l = strlen(_fn);
388 for (p = _fn + l - 1; p >= _fn; --p)
389 if (*p == '/') break;
390 fn = strdup(p + 1);
391 } else fn = strdup(_fn);
392 fnidx = (char*)calloc(strlen(fn) + 5, 1);
393 strcpy(fnidx, fn); strcat(fnidx, ".bai");
394 fp = fopen(fnidx, "rb");
395 if (fp == 0) { // try "{base}.bai"
396 char *s = strstr(fn, "bam");
397 if (s == fn + strlen(fn) - 3) {
398 strcpy(fnidx, fn);
399 fnidx[strlen(fn)-1] = 'i';
400 fp = fopen(fnidx, "rb");
401 }
402 }
403 free(fnidx); free(fn);
404 if (fp) {
405 bam_index_t *idx = bam_index_load_core(fp);
406 fclose(fp);
407 return idx;
408 } else return 0;
409 }
410
411 #ifdef _USE_KNETFILE
412 static void download_from_remote(const char *url)
413 {
414 const int buf_size = 1 * 1024 * 1024;
415 char *fn;
416 FILE *fp;
417 uint8_t *buf;
418 knetFile *fp_remote;
419 int l;
420 if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
421 l = strlen(url);
422 for (fn = (char*)url + l - 1; fn >= url; --fn)
423 if (*fn == '/') break;
424 ++fn; // fn now points to the file name
425 fp_remote = knet_open(url, "r");
426 if (fp_remote == 0) {
427 fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
428 return;
429 }
430 if ((fp = fopen(fn, "wb")) == 0) {
431 fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
432 knet_close(fp_remote);
433 return;
434 }
435 buf = (uint8_t*)calloc(buf_size, 1);
436 while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
437 fwrite(buf, 1, l, fp);
438 free(buf);
439 fclose(fp);
440 knet_close(fp_remote);
441 }
442 #else
443 static void download_from_remote(const char *url)
444 {
445 return;
446 }
447 #endif
448
449 bam_index_t *bam_index_load(const char *fn)
450 {
451 bam_index_t *idx;
452 idx = bam_index_load_local(fn);
453 if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
454 char *fnidx = calloc(strlen(fn) + 5, 1);
455 strcat(strcpy(fnidx, fn), ".bai");
456 fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
457 download_from_remote(fnidx);
458 idx = bam_index_load_local(fn);
459 }
460 if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
461 return idx;
462 }
463
464 int bam_index_build2(const char *fn, const char *_fnidx)
465 {
466 char *fnidx;
467 FILE *fpidx;
468 bamFile fp;
469 bam_index_t *idx;
470 if ((fp = bam_open(fn, "r")) == 0) {
471 fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
472 return -1;
473 }
474 idx = bam_index_core(fp);
475 bam_close(fp);
476 if (_fnidx == 0) {
477 fnidx = (char*)calloc(strlen(fn) + 5, 1);
478 strcpy(fnidx, fn); strcat(fnidx, ".bai");
479 } else fnidx = strdup(_fnidx);
480 fpidx = fopen(fnidx, "wb");
481 if (fpidx == 0) {
482 fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
483 free(fnidx);
484 return -1;
485 }
486 bam_index_save(idx, fpidx);
487 bam_index_destroy(idx);
488 fclose(fpidx);
489 free(fnidx);
490 return 0;
491 }
492
493 int bam_index_build(const char *fn)
494 {
495 return bam_index_build2(fn, 0);
496 }
497
498 int bam_index(int argc, char *argv[])
499 {
500 if (argc < 2) {
501 fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n");
502 return 1;
503 }
504 if (argc >= 3) bam_index_build2(argv[1], argv[2]);
505 else bam_index_build(argv[1]);
506 return 0;
507 }
508
509 int bam_idxstats(int argc, char *argv[])
510 {
511 bam_index_t *idx;
512 bam_header_t *header;
513 bamFile fp;
514 int i;
515 if (argc < 2) {
516 fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
517 return 1;
518 }
519 fp = bam_open(argv[1], "r");
520 if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
521 header = bam_header_read(fp);
522 bam_close(fp);
523 idx = bam_index_load(argv[1]);
524 if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; }
525 for (i = 0; i < idx->n; ++i) {
526 khint_t k;
527 khash_t(i) *h = idx->index[i];
528 printf("%s\t%d", header->target_name[i], header->target_len[i]);
529 k = kh_get(i, h, BAM_MAX_BIN);
530 if (k != kh_end(h))
531 printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v);
532 else printf("\t0\t0");
533 putchar('\n');
534 }
535 printf("*\t0\t0\t%llu\n", (long long)idx->n_no_coor);
536 bam_header_destroy(header);
537 bam_index_destroy(idx);
538 return 0;
539 }
540
541 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
542 {
543 int i = 0, k;
544 if (beg >= end) return 0;
545 if (end >= 1u<<29) end = 1u<<29;
546 --end;
547 list[i++] = 0;
548 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
549 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
550 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
551 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
552 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
553 return i;
554 }
555
556 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
557 {
558 uint32_t rbeg = b->core.pos;
559 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
560 return (rend > beg && rbeg < end);
561 }
562
563 struct __bam_iter_t {
564 int from_first; // read from the first record; no random access
565 int tid, beg, end, n_off, i, finished;
566 uint64_t curr_off;
567 pair64_t *off;
568 };
569
570 // bam_fetch helper function retrieves
571 bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
572 {
573 uint16_t *bins;
574 int i, n_bins, n_off;
575 pair64_t *off;
576 khint_t k;
577 khash_t(i) *index;
578 uint64_t min_off;
579 bam_iter_t iter = 0;
580
581 if (beg < 0) beg = 0;
582 if (end < beg) return 0;
583 // initialize iter
584 iter = calloc(1, sizeof(struct __bam_iter_t));
585 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
586 //
587 bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
588 n_bins = reg2bins(beg, end, bins);
589 index = idx->index[tid];
590 if (idx->index2[tid].n > 0) {
591 min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
592 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
593 if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
594 int n = beg>>BAM_LIDX_SHIFT;
595 if (n > idx->index2[tid].n) n = idx->index2[tid].n;
596 for (i = n - 1; i >= 0; --i)
597 if (idx->index2[tid].offset[i] != 0) break;
598 if (i >= 0) min_off = idx->index2[tid].offset[i];
599 }
600 } else min_off = 0; // tabix 0.1.2 may produce such index files
601 for (i = n_off = 0; i < n_bins; ++i) {
602 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
603 n_off += kh_value(index, k).n;
604 }
605 if (n_off == 0) {
606 free(bins); return iter;
607 }
608 off = (pair64_t*)calloc(n_off, 16);
609 for (i = n_off = 0; i < n_bins; ++i) {
610 if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
611 int j;
612 bam_binlist_t *p = &kh_value(index, k);
613 for (j = 0; j < p->n; ++j)
614 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
615 }
616 }
617 free(bins);
618 if (n_off == 0) {
619 free(off); return iter;
620 }
621 {
622 bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
623 int l;
624 ks_introsort(off, n_off, off);
625 // resolve completely contained adjacent blocks
626 for (i = 1, l = 0; i < n_off; ++i)
627 if (off[l].v < off[i].v)
628 off[++l] = off[i];
629 n_off = l + 1;
630 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
631 for (i = 1; i < n_off; ++i)
632 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
633 { // merge adjacent blocks
634 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
635 for (i = 1, l = 0; i < n_off; ++i) {
636 #ifdef BAM_TRUE_OFFSET
637 if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
638 #else
639 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
640 #endif
641 else off[++l] = off[i];
642 }
643 n_off = l + 1;
644 #endif
645 }
646 bam_destroy1(b);
647 }
648 iter->n_off = n_off; iter->off = off;
649 return iter;
650 }
651
652 pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
653 { // for pysam compatibility
654 bam_iter_t iter;
655 pair64_t *off;
656 iter = bam_iter_query(idx, tid, beg, end);
657 off = iter->off; *cnt_off = iter->n_off;
658 free(iter);
659 return off;
660 }
661
662 void bam_iter_destroy(bam_iter_t iter)
663 {
664 if (iter) { free(iter->off); free(iter); }
665 }
666
667 int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
668 {
669 int ret;
670 if (iter && iter->finished) return -1;
671 if (iter == 0 || iter->from_first) {
672 ret = bam_read1(fp, b);
673 if (ret < 0 && iter) iter->finished = 1;
674 return ret;
675 }
676 if (iter->off == 0) return -1;
677 for (;;) {
678 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
679 if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
680 if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
681 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
682 bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
683 iter->curr_off = bam_tell(fp);
684 }
685 ++iter->i;
686 }
687 if ((ret = bam_read1(fp, b)) >= 0) {
688 iter->curr_off = bam_tell(fp);
689 if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed
690 ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error
691 break;
692 }
693 else if (is_overlap(iter->beg, iter->end, b)) return ret;
694 } else break; // end of file or error
695 }
696 iter->finished = 1;
697 return ret;
698 }
699
700 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
701 {
702 int ret;
703 bam_iter_t iter;
704 bam1_t *b;
705 b = bam_init1();
706 iter = bam_iter_query(idx, tid, beg, end);
707 while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data);
708 bam_iter_destroy(iter);
709 bam_destroy1(b);
710 return (ret == -1)? 0 : ret;
711 }