PsiCLASS-1.0.2/samtools-0.1.19/bam_index.c @ 0:903fc43d6227 (draft, default, tip)

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
1 #include <ctype.h>
2 #include <assert.h>
3 #include "bam.h"
4 #include "khash.h"
5 #include "ksort.h"
6 #include "bam_endian.h"
7 #ifdef _USE_KNETFILE
8 #include "knetfile.h"
9 #endif
10
11 /*!
12 @header
13
14 Alignment indexing. Before indexing, BAM must be sorted based on the
15 leftmost coordinate of the alignments. The index consists of two parts:
16 a UCSC binning index and a simple linear index. The binning index is
17 efficient for alignments spanning long distances, while the auxiliary
18 linear index helps to reduce unnecessary seek calls, especially for
19 short alignments.
20
21 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
22 Stein and is explained by Kent et al. (2002). In this scheme, each bin
23 represents a contiguous genomic region which can be fully contained in
24 another bin; each alignment is associated with a bin which represents
25 the smallest region containing the entire alignment. The binning
26 scheme is essentially another representation of an R-tree. A distinct bin
27 uniquely corresponds to a distinct internal node in an R-tree. Bin A is
28 a child of bin B if region A is contained in region B.
29
30 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
31 0 spans a 512Mbp region, bins 1-8 span 64Mbp, bins 9-72 span 8Mbp, bins
32 73-584 span 1Mbp, bins 585-4680 span 128Kbp, and bins 4681-37449 span
33 16Kbp regions. To find the alignments overlapping a region [rbeg,rend),
34 we calculate the list of bins that may overlap the region and then test
35 the alignments in those bins to confirm the overlaps. If the specified
36 region is short, typically only a few alignments in six bins need to
37 be retrieved, so the overlapping alignments can be fetched quickly.
38
39 */
40
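/*
  Illustrative sketch: the standard SAM/BAM calculation of the smallest bin
  that fully contains a zero-based, half-open region [beg,end), spelled out
  here only to make the binning arithmetic described above concrete. An
  equivalent helper (bam_reg2bin) lives in bam.h; this copy is unused by the
  code below and the name is ours.

  For example, [0,100) maps to bin 4681 (the first 16Kbp bin), while
  [16382,16386) crosses a 16Kbp boundary and is pushed up to the 128Kbp
  bin 585.
*/
static inline int example_reg2bin(uint32_t beg, uint32_t end)
{
	--end; /* switch to the last base covered by the region */
	if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14); /* 16Kbp bins, 4681-37449 */
	if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17); /* 128Kbp bins, 585-4680 */
	if (beg>>20 == end>>20) return ((1<<9)-1)/7  + (beg>>20); /* 1Mbp bins, 73-584 */
	if (beg>>23 == end>>23) return ((1<<6)-1)/7  + (beg>>23); /* 8Mbp bins, 9-72 */
	if (beg>>26 == end>>26) return ((1<<3)-1)/7  + (beg>>26); /* 64Mbp bins, 1-8 */
	return 0; /* bin 0 covers the whole 512Mbp range */
}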
41 #define BAM_MIN_CHUNK_GAP 32768
42 // 1<<14 is the size of the minimum bin.
43 #define BAM_LIDX_SHIFT 14
44
45 #define BAM_MAX_BIN 37450 // =(8^6-1)/7+1
46
47 typedef struct {
48 uint64_t u, v;
49 } pair64_t;
50
51 #define pair64_lt(a,b) ((a).u < (b).u)
52 KSORT_INIT(off, pair64_t, pair64_lt)
53
54 typedef struct {
55 uint32_t m, n;
56 pair64_t *list;
57 } bam_binlist_t;
58
59 typedef struct {
60 int32_t n, m;
61 uint64_t *offset;
62 } bam_lidx_t;
63
64 KHASH_MAP_INIT_INT(i, bam_binlist_t)
65
66 struct __bam_index_t {
67 int32_t n;
68 uint64_t n_no_coor; // unmapped reads without coordinate
69 khash_t(i) **index;
70 bam_lidx_t *index2;
71 };
72
73 // requirement: len <= LEN_MASK
74 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
75 {
76 khint_t k;
77 bam_binlist_t *l;
78 int ret;
79 k = kh_put(i, h, bin, &ret);
80 l = &kh_value(h, k);
81 if (ret) { // not present
82 l->m = 1; l->n = 0;
83 l->list = (pair64_t*)calloc(l->m, 16);
84 }
85 if (l->n == l->m) {
86 l->m <<= 1;
87 l->list = (pair64_t*)realloc(l->list, l->m * 16);
88 }
89 l->list[l->n].u = beg; l->list[l->n++].v = end;
90 }
91
92 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
93 {
94 int i, beg, end;
95 beg = b->core.pos >> BAM_LIDX_SHIFT;
96 end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
97 if (index2->m < end + 1) {
98 int old_m = index2->m;
99 index2->m = end + 1;
100 kroundup32(index2->m);
101 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
102 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
103 }
104 if (beg == end) {
105 if (index2->offset[beg] == 0) index2->offset[beg] = offset;
106 } else {
107 for (i = beg; i <= end; ++i)
108 if (index2->offset[i] == 0) index2->offset[i] = offset;
109 }
110 index2->n = end + 1;
111 }
112
113 static void merge_chunks(bam_index_t *idx)
114 {
115 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
116 khash_t(i) *index;
117 int i, l, m;
118 khint_t k;
119 for (i = 0; i < idx->n; ++i) {
120 index = idx->index[i];
121 for (k = kh_begin(index); k != kh_end(index); ++k) {
122 bam_binlist_t *p;
123 if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
124 p = &kh_value(index, k);
125 m = 0;
126 for (l = 1; l < p->n; ++l) {
127 #ifdef BAM_TRUE_OFFSET
128 if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
129 #else
130 if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
131 #endif
132 else p->list[++m] = p->list[l];
133 } // ~for(l)
134 p->n = m + 1;
135 } // ~for(k)
136 } // ~for(i)
137 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
138 }
139
140 static void fill_missing(bam_index_t *idx)
141 {
142 int i, j;
143 for (i = 0; i < idx->n; ++i) {
144 bam_lidx_t *idx2 = &idx->index2[i];
145 for (j = 1; j < idx2->n; ++j)
146 if (idx2->offset[j] == 0)
147 idx2->offset[j] = idx2->offset[j-1];
148 }
149 }
150
151 bam_index_t *bam_index_core(bamFile fp)
152 {
153 bam1_t *b;
154 bam_header_t *h;
155 int i, ret;
156 bam_index_t *idx;
157 uint32_t last_bin, save_bin;
158 int32_t last_coor, last_tid, save_tid;
159 bam1_core_t *c;
160 uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
161
162 h = bam_header_read(fp);
163 if (h == 0) {
164 fprintf(stderr, "[bam_index_core] Invalid BAM header.\n");
165 return NULL;
166 }
167
168 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
169 b = (bam1_t*)calloc(1, sizeof(bam1_t));
170 c = &b->core;
171
172 idx->n = h->n_targets;
173 bam_header_destroy(h);
174 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
175 for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
176 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
177
178 save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
179 save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
180 n_mapped = n_unmapped = n_no_coor = off_end = 0;
181 off_beg = off_end = bam_tell(fp);
182 while ((ret = bam_read1(fp, b)) >= 0) {
183 if (c->tid < 0) ++n_no_coor;
184 if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0)) { // change of chromosomes
185 last_tid = c->tid;
186 last_bin = 0xffffffffu;
187 } else if ((uint32_t)last_tid > (uint32_t)c->tid) {
188 fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th chr\n",
189 bam1_qname(b), last_tid+1, c->tid+1);
190 return NULL;
191 } else if ((int32_t)c->tid >= 0 && last_coor > c->pos) {
192 fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
193 bam1_qname(b), last_coor, c->pos, c->tid+1);
194 return NULL;
195 }
196 if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off);
197 if (c->bin != last_bin) { // then possibly write the binning index
198 if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
199 insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
200 if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
201 off_end = last_off;
202 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
203 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
204 n_mapped = n_unmapped = 0;
205 off_beg = off_end;
206 }
207 save_off = last_off;
208 save_bin = last_bin = c->bin;
209 save_tid = c->tid;
210 if (save_tid < 0) break;
211 }
212 if (bam_tell(fp) <= last_off) {
213 fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
214 (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
215 return NULL;
216 }
217 if (c->flag & BAM_FUNMAP) ++n_unmapped;
218 else ++n_mapped;
219 last_off = bam_tell(fp);
220 last_coor = b->core.pos;
221 }
222 if (save_tid >= 0) {
223 insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
224 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp));
225 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
226 }
227 merge_chunks(idx);
228 fill_missing(idx);
229 if (ret >= 0) {
230 while ((ret = bam_read1(fp, b)) >= 0) {
231 ++n_no_coor;
232 if (c->tid >= 0 && n_no_coor) {
233 fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n");
234 return NULL;
235 }
236 }
237 }
238 if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
239 free(b->data); free(b);
240 idx->n_no_coor = n_no_coor;
241 return idx;
242 }
243
244 void bam_index_destroy(bam_index_t *idx)
245 {
246 khint_t k;
247 int i;
248 if (idx == 0) return;
249 for (i = 0; i < idx->n; ++i) {
250 khash_t(i) *index = idx->index[i];
251 bam_lidx_t *index2 = idx->index2 + i;
252 for (k = kh_begin(index); k != kh_end(index); ++k) {
253 if (kh_exist(index, k))
254 free(kh_value(index, k).list);
255 }
256 kh_destroy(i, index);
257 free(index2->offset);
258 }
259 free(idx->index); free(idx->index2);
260 free(idx);
261 }
262
263 void bam_index_save(const bam_index_t *idx, FILE *fp)
264 {
265 int32_t i, size;
266 khint_t k;
267 fwrite("BAI\1", 1, 4, fp);
268 if (bam_is_be) {
269 uint32_t x = idx->n;
270 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
271 } else fwrite(&idx->n, 4, 1, fp);
272 for (i = 0; i < idx->n; ++i) {
273 khash_t(i) *index = idx->index[i];
274 bam_lidx_t *index2 = idx->index2 + i;
275 // write binning index
276 size = kh_size(index);
277 if (bam_is_be) { // big endian
278 uint32_t x = size;
279 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
280 } else fwrite(&size, 4, 1, fp);
281 for (k = kh_begin(index); k != kh_end(index); ++k) {
282 if (kh_exist(index, k)) {
283 bam_binlist_t *p = &kh_value(index, k);
284 if (bam_is_be) { // big endian
285 uint32_t x;
286 x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
287 x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
288 for (x = 0; (int)x < p->n; ++x) {
289 bam_swap_endian_8p(&p->list[x].u);
290 bam_swap_endian_8p(&p->list[x].v);
291 }
292 fwrite(p->list, 16, p->n, fp);
293 for (x = 0; (int)x < p->n; ++x) {
294 bam_swap_endian_8p(&p->list[x].u);
295 bam_swap_endian_8p(&p->list[x].v);
296 }
297 } else {
298 fwrite(&kh_key(index, k), 4, 1, fp);
299 fwrite(&p->n, 4, 1, fp);
300 fwrite(p->list, 16, p->n, fp);
301 }
302 }
303 }
304 // write linear index (index2)
305 if (bam_is_be) {
306 int x = index2->n;
307 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
308 } else fwrite(&index2->n, 4, 1, fp);
309 if (bam_is_be) { // big endian
310 int x;
311 for (x = 0; (int)x < index2->n; ++x)
312 bam_swap_endian_8p(&index2->offset[x]);
313 fwrite(index2->offset, 8, index2->n, fp);
314 for (x = 0; (int)x < index2->n; ++x)
315 bam_swap_endian_8p(&index2->offset[x]);
316 } else fwrite(index2->offset, 8, index2->n, fp);
317 }
318 { // write the number of coordinate-less (unplaced) records
319 uint64_t x = idx->n_no_coor;
320 if (bam_is_be) bam_swap_endian_8p(&x);
321 fwrite(&x, 8, 1, fp);
322 }
323 fflush(fp);
324 }
325
326 static bam_index_t *bam_index_load_core(FILE *fp)
327 {
328 int i;
329 char magic[4];
330 bam_index_t *idx;
331 if (fp == 0) {
332 fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
333 return 0;
334 }
335 fread(magic, 1, 4, fp);
336 if (strncmp(magic, "BAI\1", 4)) {
337 fprintf(stderr, "[bam_index_load] wrong magic number.\n");
338 fclose(fp);
339 return 0;
340 }
341 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
342 fread(&idx->n, 4, 1, fp);
343 if (bam_is_be) bam_swap_endian_4p(&idx->n);
344 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
345 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
346 for (i = 0; i < idx->n; ++i) {
347 khash_t(i) *index;
348 bam_lidx_t *index2 = idx->index2 + i;
349 uint32_t key, size;
350 khint_t k;
351 int j, ret;
352 bam_binlist_t *p;
353 index = idx->index[i] = kh_init(i);
354 // load binning index
355 fread(&size, 4, 1, fp);
356 if (bam_is_be) bam_swap_endian_4p(&size);
357 for (j = 0; j < (int)size; ++j) {
358 fread(&key, 4, 1, fp);
359 if (bam_is_be) bam_swap_endian_4p(&key);
360 k = kh_put(i, index, key, &ret);
361 p = &kh_value(index, k);
362 fread(&p->n, 4, 1, fp);
363 if (bam_is_be) bam_swap_endian_4p(&p->n);
364 p->m = p->n;
365 p->list = (pair64_t*)malloc(p->m * 16);
366 fread(p->list, 16, p->n, fp);
367 if (bam_is_be) {
368 int x;
369 for (x = 0; x < p->n; ++x) {
370 bam_swap_endian_8p(&p->list[x].u);
371 bam_swap_endian_8p(&p->list[x].v);
372 }
373 }
374 }
375 // load linear index
376 fread(&index2->n, 4, 1, fp);
377 if (bam_is_be) bam_swap_endian_4p(&index2->n);
378 index2->m = index2->n;
379 index2->offset = (uint64_t*)calloc(index2->m, 8);
380 fread(index2->offset, index2->n, 8, fp);
381 if (bam_is_be)
382 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
383 }
384 if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
385 if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
386 return idx;
387 }
388
389 bam_index_t *bam_index_load_local(const char *_fn)
390 {
391 FILE *fp;
392 char *fnidx, *fn;
393
394 if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
395 const char *p;
396 int l = strlen(_fn);
397 for (p = _fn + l - 1; p >= _fn; --p)
398 if (*p == '/') break;
399 fn = strdup(p + 1);
400 } else fn = strdup(_fn);
401 fnidx = (char*)calloc(strlen(fn) + 5, 1);
402 strcpy(fnidx, fn); strcat(fnidx, ".bai");
403 fp = fopen(fnidx, "rb");
404 if (fp == 0) { // try "{base}.bai"
405 char *s = strstr(fn, "bam");
406 if (s == fn + strlen(fn) - 3) {
407 strcpy(fnidx, fn);
408 fnidx[strlen(fn)-1] = 'i';
409 fp = fopen(fnidx, "rb");
410 }
411 }
412 free(fnidx); free(fn);
413 if (fp) {
414 bam_index_t *idx = bam_index_load_core(fp);
415 fclose(fp);
416 return idx;
417 } else return 0;
418 }
419
420 #ifdef _USE_KNETFILE
421 static void download_from_remote(const char *url)
422 {
423 const int buf_size = 1 * 1024 * 1024;
424 char *fn;
425 FILE *fp;
426 uint8_t *buf;
427 knetFile *fp_remote;
428 int l;
429 if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
430 l = strlen(url);
431 for (fn = (char*)url + l - 1; fn >= url; --fn)
432 if (*fn == '/') break;
433 ++fn; // fn now points to the file name
434 fp_remote = knet_open(url, "r");
435 if (fp_remote == 0) {
436 fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
437 return;
438 }
439 if ((fp = fopen(fn, "wb")) == 0) {
440 fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
441 knet_close(fp_remote);
442 return;
443 }
444 buf = (uint8_t*)calloc(buf_size, 1);
445 while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
446 fwrite(buf, 1, l, fp);
447 free(buf);
448 fclose(fp);
449 knet_close(fp_remote);
450 }
451 #else
452 static void download_from_remote(const char *url)
453 {
454 return;
455 }
456 #endif
457
458 bam_index_t *bam_index_load(const char *fn)
459 {
460 bam_index_t *idx;
461 idx = bam_index_load_local(fn);
462 if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
463 char *fnidx = calloc(strlen(fn) + 5, 1);
464 strcat(strcpy(fnidx, fn), ".bai");
465 fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
466 download_from_remote(fnidx);
467 free(fnidx);
468 idx = bam_index_load_local(fn);
469 }
470 if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
471 return idx;
472 }
473
474 int bam_index_build2(const char *fn, const char *_fnidx)
475 {
476 char *fnidx;
477 FILE *fpidx;
478 bamFile fp;
479 bam_index_t *idx;
480 if ((fp = bam_open(fn, "r")) == 0) {
481 fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
482 return -1;
483 }
484 idx = bam_index_core(fp);
485 bam_close(fp);
486 if (idx == 0) {
487 fprintf(stderr, "[bam_index_build2] fail to index the BAM file.\n");
488 return -1;
489 }
490 if (_fnidx == 0) {
491 fnidx = (char*)calloc(strlen(fn) + 5, 1);
492 strcpy(fnidx, fn); strcat(fnidx, ".bai");
493 } else fnidx = strdup(_fnidx);
494 fpidx = fopen(fnidx, "wb");
495 if (fpidx == 0) {
496 fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
497 free(fnidx);
498 bam_index_destroy(idx);
499 return -1;
500 }
501 bam_index_save(idx, fpidx);
502 bam_index_destroy(idx);
503 fclose(fpidx);
504 free(fnidx);
505 return 0;
506 }
507
508 int bam_index_build(const char *fn)
509 {
510 return bam_index_build2(fn, 0);
511 }
512
513 int bam_index(int argc, char *argv[])
514 {
515 if (argc < 2) {
516 fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n");
517 return 1;
518 }
519 if (argc >= 3) bam_index_build2(argv[1], argv[2]);
520 else bam_index_build(argv[1]);
521 return 0;
522 }
523
524 int bam_idxstats(int argc, char *argv[])
525 {
526 bam_index_t *idx;
527 bam_header_t *header;
528 bamFile fp;
529 int i;
530 if (argc < 2) {
531 fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
532 return 1;
533 }
534 fp = bam_open(argv[1], "r");
535 if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
536 header = bam_header_read(fp);
537 bam_close(fp);
538 idx = bam_index_load(argv[1]);
539 if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; }
540 for (i = 0; i < idx->n; ++i) {
541 khint_t k;
542 khash_t(i) *h = idx->index[i];
543 printf("%s\t%d", header->target_name[i], header->target_len[i]);
544 k = kh_get(i, h, BAM_MAX_BIN);
545 if (k != kh_end(h))
546 printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v);
547 else printf("\t0\t0");
548 putchar('\n');
549 }
550 printf("*\t0\t0\t%llu\n", (long long)idx->n_no_coor);
551 bam_header_destroy(header);
552 bam_index_destroy(idx);
553 return 0;
554 }
555
556 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
557 {
558 int i = 0, k;
559 if (beg >= end) return 0;
560 if (end >= 1u<<29) end = 1u<<29;
561 --end;
562 list[i++] = 0;
563 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
564 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
565 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
566 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
567 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
568 return i;
569 }
570
571 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
572 {
573 uint32_t rbeg = b->core.pos;
574 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
575 return (rend > beg && rbeg < end);
576 }
577
578 struct __bam_iter_t {
579 int from_first; // read from the first record; no random access
580 int tid, beg, end, n_off, i, finished;
581 uint64_t curr_off;
582 pair64_t *off;
583 };
584
585 // bam_fetch helper function: retrieves the list of index chunks overlapping [beg,end) on reference tid
586 bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
587 {
588 uint16_t *bins;
589 int i, n_bins, n_off;
590 pair64_t *off;
591 khint_t k;
592 khash_t(i) *index;
593 uint64_t min_off;
594 bam_iter_t iter = 0;
595
596 if (beg < 0) beg = 0;
597 if (end < beg) return 0;
598 // initialize iter
599 iter = calloc(1, sizeof(struct __bam_iter_t));
600 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
601 //
602 bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
603 n_bins = reg2bins(beg, end, bins);
604 index = idx->index[tid];
605 if (idx->index2[tid].n > 0) {
606 min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
607 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
608 if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
609 int n = beg>>BAM_LIDX_SHIFT;
610 if (n > idx->index2[tid].n) n = idx->index2[tid].n;
611 for (i = n - 1; i >= 0; --i)
612 if (idx->index2[tid].offset[i] != 0) break;
613 if (i >= 0) min_off = idx->index2[tid].offset[i];
614 }
615 } else min_off = 0; // tabix 0.1.2 may produce such index files
616 for (i = n_off = 0; i < n_bins; ++i) {
617 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
618 n_off += kh_value(index, k).n;
619 }
620 if (n_off == 0) {
621 free(bins); return iter;
622 }
623 off = (pair64_t*)calloc(n_off, 16);
624 for (i = n_off = 0; i < n_bins; ++i) {
625 if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
626 int j;
627 bam_binlist_t *p = &kh_value(index, k);
628 for (j = 0; j < p->n; ++j)
629 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
630 }
631 }
632 free(bins);
633 if (n_off == 0) {
634 free(off); return iter;
635 }
636 {
637 bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
638 int l;
639 ks_introsort(off, n_off, off);
640 // resolve completely contained adjacent blocks
641 for (i = 1, l = 0; i < n_off; ++i)
642 if (off[l].v < off[i].v)
643 off[++l] = off[i];
644 n_off = l + 1;
645 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
646 for (i = 1; i < n_off; ++i)
647 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
648 { // merge adjacent blocks
649 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
650 for (i = 1, l = 0; i < n_off; ++i) {
651 #ifdef BAM_TRUE_OFFSET
652 if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
653 #else
654 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
655 #endif
656 else off[++l] = off[i];
657 }
658 n_off = l + 1;
659 #endif
660 }
661 bam_destroy1(b);
662 }
663 iter->n_off = n_off; iter->off = off;
664 return iter;
665 }
666
667 pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
668 { // for pysam compatibility
669 bam_iter_t iter;
670 pair64_t *off;
671 iter = bam_iter_query(idx, tid, beg, end);
672 off = iter->off; *cnt_off = iter->n_off;
673 free(iter);
674 return off;
675 }
676
677 void bam_iter_destroy(bam_iter_t iter)
678 {
679 if (iter) { free(iter->off); free(iter); }
680 }
681
682 int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
683 {
684 int ret;
685 if (iter && iter->finished) return -1;
686 if (iter == 0 || iter->from_first) {
687 ret = bam_read1(fp, b);
688 if (ret < 0 && iter) iter->finished = 1;
689 return ret;
690 }
691 if (iter->off == 0) return -1;
692 for (;;) {
693 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
694 if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
695 if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
696 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
697 bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
698 iter->curr_off = bam_tell(fp);
699 }
700 ++iter->i;
701 }
702 if ((ret = bam_read1(fp, b)) >= 0) {
703 iter->curr_off = bam_tell(fp);
704 if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed
705 ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error
706 break;
707 }
708 else if (is_overlap(iter->beg, iter->end, b)) return ret;
709 } else break; // end of file or error
710 }
711 iter->finished = 1;
712 return ret;
713 }
714
715 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
716 {
717 int ret;
718 bam_iter_t iter;
719 bam1_t *b;
720 b = bam_init1();
721 iter = bam_iter_query(idx, tid, beg, end);
722 while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data);
723 bam_iter_destroy(iter);
724 bam_destroy1(b);
725 return (ret == -1)? 0 : ret;
726 }
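/*
  Usage sketch for the fetch API above (illustrative only; the callback and
  the file name "example.bam" are hypothetical): count the alignments
  overlapping the zero-based, half-open interval [1000,2000) on the first
  reference of a coordinate-sorted, indexed BAM.

	static int count_cb(const bam1_t *b, void *data)
	{
		++*(int*)data; // each call sees one overlapping record
		return 0;
	}

	bamFile fp = bam_open("example.bam", "r");
	bam_index_t *idx = bam_index_load("example.bam"); // expects example.bam.bai
	int n = 0;
	if (fp && idx) bam_fetch(fp, idx, 0, 1000, 2000, &n, count_cb);
	bam_index_destroy(idx);
	bam_close(fp);
*/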