comparison ezBAMQC/src/htslib/hts.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /* hts.c -- format-neutral I/O, indexing, and iterator API functions.
2
3 Copyright (C) 2008, 2009, 2012-2015 Genome Research Ltd.
4 Copyright (C) 2012, 2013 Broad Institute.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <zlib.h>
27 #include <ctype.h>
28 #include <stdio.h>
29 #include <string.h>
30 #include <stdlib.h>
31 #include <limits.h>
32 #include <fcntl.h>
33 #include <errno.h>
34 #include <sys/stat.h>
35 #include "htslib/bgzf.h"
36 #include "htslib/hts.h"
37 #include "cram/cram.h"
38 #include "htslib/hfile.h"
39 #include "version.h"
40
41 #include "htslib/kseq.h"
42 #define KS_BGZF 1
43 #if KS_BGZF
44 // bgzf now supports gzip-compressed files, the gzFile branch can be removed
45 KSTREAM_INIT2(, BGZF*, bgzf_read, 65536)
46 #else
47 KSTREAM_INIT2(, gzFile, gzread, 16384)
48 #endif
49
50 #include "htslib/khash.h"
51 KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
52
53 int hts_verbose = 3;
54
55 const char *hts_version()
56 {
57 return HTS_VERSION;
58 }
59
60 const unsigned char seq_nt16_table[256] = {
61 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
62 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
63 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
64 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
65 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
66 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
67 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
68 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
69
70 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
71 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
72 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
73 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
74 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
75 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
76 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
77 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
78 };
79
80 const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
81
82 const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
83
84 /**********************
85 *** Basic file I/O ***
86 **********************/
87
88 static enum htsFormatCategory format_category(enum htsExactFormat fmt)
89 {
90 switch (fmt) {
91 case bam:
92 case sam:
93 case cram:
94 return sequence_data;
95
96 case vcf:
97 case bcf:
98 return variant_data;
99
100 case bai:
101 case crai:
102 case csi:
103 case gzi:
104 case tbi:
105 return index_file;
106
107 case bed:
108 return region_list;
109
110 case unknown_format:
111 case binary_format:
112 case text_format:
113 case format_maximum:
114 break;
115 }
116
117 return unknown_category;
118 }
119
120 // Decompress up to ten or so bytes by peeking at the file, which must be
121 // positioned at the start of a GZIP block.
122 static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize)
123 {
124 // Typically at most a couple of hundred bytes of input are required
125 // to get a few bytes of output from inflate(), so hopefully this buffer
126 // size suffices in general.
127 unsigned char buffer[512];
128 z_stream zs;
129 ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
130
131 if (npeek < 0) return 0;
132
133 zs.zalloc = NULL;
134 zs.zfree = NULL;
135 zs.next_in = buffer;
136 zs.avail_in = npeek;
137 zs.next_out = dest;
138 zs.avail_out = destsize;
139 if (inflateInit2(&zs, 31) != Z_OK) return 0;
140
141 while (zs.total_out < destsize)
142 if (inflate(&zs, Z_SYNC_FLUSH) != Z_OK) break;
143
144 destsize = zs.total_out;
145 inflateEnd(&zs);
146
147 return destsize;
148 }
149
150 // Parse "x.y" text, taking care because the string is not NUL-terminated
151 // and filling in major/minor only when the digits are followed by a delimiter,
152 // so we don't misread "1.10" as "1.1" due to reaching the end of the buffer.
153 static void
154 parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim)
155 {
156 const char *str = (const char *) u;
157 const char *slim = (const char *) ulim;
158 const char *s;
159
160 fmt->version.major = fmt->version.minor = -1;
161
162 for (s = str; s < slim; s++) if (!isdigit(*s)) break;
163 if (s < slim) {
164 fmt->version.major = atoi(str);
165 if (*s == '.') {
166 str = &s[1];
167 for (s = str; s < slim; s++) if (!isdigit(*s)) break;
168 if (s < slim)
169 fmt->version.minor = atoi(str);
170 }
171 else
172 fmt->version.minor = 0;
173 }
174 }
175
176 int hts_detect_format(hFILE *hfile, htsFormat *fmt)
177 {
178 unsigned char s[21];
179 ssize_t len = hpeek(hfile, s, 18);
180 if (len < 0) return -1;
181
182 if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) {
183 // The stream is either gzip-compressed or BGZF-compressed.
184 // Determine which, and decompress the first few bytes.
185 fmt->compression = (len >= 18 && (s[3] & 4) &&
186 memcmp(&s[12], "BC\2\0", 4) == 0)? bgzf : gzip;
187 len = decompress_peek(hfile, s, sizeof s);
188 }
189 else {
190 fmt->compression = no_compression;
191 len = hpeek(hfile, s, sizeof s);
192 }
193 if (len < 0) return -1;
194
195 fmt->compression_level = -1;
196 fmt->specific = NULL;
197
198 if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=3 && s[5]<=1) {
199 fmt->category = sequence_data;
200 fmt->format = cram;
201 fmt->version.major = s[4], fmt->version.minor = s[5];
202 fmt->compression = custom;
203 return 0;
204 }
205 else if (len >= 4 && s[3] <= '\4') {
206 if (memcmp(s, "BAM\1", 4) == 0) {
207 fmt->category = sequence_data;
208 fmt->format = bam;
209 // TODO Decompress enough to pick version from @HD-VN header
210 fmt->version.major = 1, fmt->version.minor = -1;
211 return 0;
212 }
213 else if (memcmp(s, "BAI\1", 4) == 0) {
214 fmt->category = index_file;
215 fmt->format = bai;
216 fmt->version.major = -1, fmt->version.minor = -1;
217 return 0;
218 }
219 else if (memcmp(s, "BCF\4", 4) == 0) {
220 fmt->category = variant_data;
221 fmt->format = bcf;
222 fmt->version.major = 1, fmt->version.minor = -1;
223 return 0;
224 }
225 else if (memcmp(s, "BCF\2", 4) == 0) {
226 fmt->category = variant_data;
227 fmt->format = bcf;
228 fmt->version.major = s[3];
229 fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0;
230 return 0;
231 }
232 else if (memcmp(s, "CSI\1", 4) == 0) {
233 fmt->category = index_file;
234 fmt->format = csi;
235 fmt->version.major = 1, fmt->version.minor = -1;
236 return 0;
237 }
238 else if (memcmp(s, "TBI\1", 4) == 0) {
239 fmt->category = index_file;
240 fmt->format = tbi;
241 fmt->version.major = -1, fmt->version.minor = -1;
242 return 0;
243 }
244 }
245 else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) {
246 fmt->category = variant_data;
247 fmt->format = vcf;
248 if (len >= 21 && s[16] == 'v')
249 parse_version(fmt, &s[17], &s[len]);
250 else
251 fmt->version.major = fmt->version.minor = -1;
252 return 0;
253 }
254 else if (len >= 4 && s[0] == '@' &&
255 (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 ||
256 memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0)) {
257 fmt->category = sequence_data;
258 fmt->format = sam;
259 // @HD-VN is not guaranteed to be the first tag, but then @HD is
260 // not guaranteed to be present at all...
261 if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0)
262 parse_version(fmt, &s[7], &s[len]);
263 else
264 fmt->version.major = 1, fmt->version.minor = -1;
265 return 0;
266 }
267 else {
268 // Various possibilities for tab-delimited text:
269 // .crai (gzipped tab-delimited six columns: seqid 5*number)
270 // .bed ([3..12] tab-delimited columns)
271 // .bedpe (>= 10 tab-delimited columns)
272 // .sam (tab-delimited >= 11 columns: seqid number seqid...)
273 // FIXME For now, assume it's SAM
274 fmt->category = sequence_data;
275 fmt->format = sam;
276 fmt->version.major = 1, fmt->version.minor = -1;
277 return 0;
278 }
279
280 fmt->category = unknown_category;
281 fmt->format = unknown_format;
282 fmt->version.major = fmt->version.minor = -1;
283 fmt->compression = no_compression;
284 return 0;
285 }
286
287 char *hts_format_description(const htsFormat *format)
288 {
289 kstring_t str = { 0, 0, NULL };
290
291 switch (format->format) {
292 case sam: kputs("SAM", &str); break;
293 case bam: kputs("BAM", &str); break;
294 case cram: kputs("CRAM", &str); break;
295 case vcf: kputs("VCF", &str); break;
296 case bcf:
297 if (format->version.major == 1) kputs("Legacy BCF", &str);
298 else kputs("BCF", &str);
299 break;
300 case bai: kputs("BAI", &str); break;
301 case crai: kputs("CRAI", &str); break;
302 case csi: kputs("CSI", &str); break;
303 case tbi: kputs("Tabix", &str); break;
304 default: kputs("unknown", &str); break;
305 }
306
307 if (format->version.major >= 0) {
308 kputs(" version ", &str);
309 kputw(format->version.major, &str);
310 if (format->version.minor >= 0) {
311 kputc('.', &str);
312 kputw(format->version.minor, &str);
313 }
314 }
315
316 switch (format->compression) {
317 case custom: kputs(" compressed", &str); break;
318 case gzip: kputs(" gzip-compressed", &str); break;
319 case bgzf:
320 switch (format->format) {
321 case bam:
322 case bcf:
323 case csi:
324 case tbi:
325 // These are by definition BGZF, so just use the generic term
326 kputs(" compressed", &str);
327 break;
328 default:
329 kputs(" BGZF-compressed", &str);
330 break;
331 }
332 break;
333 default: break;
334 }
335
336 switch (format->category) {
337 case sequence_data: kputs(" sequence", &str); break;
338 case variant_data: kputs(" variant calling", &str); break;
339 case index_file: kputs(" index", &str); break;
340 case region_list: kputs(" genomic region", &str); break;
341 default: break;
342 }
343
344 if (format->compression == no_compression)
345 switch (format->format) {
346 case sam:
347 case crai:
348 case vcf:
349 case bed:
350 kputs(" text", &str);
351 break;
352
353 default:
354 kputs(" data", &str);
355 break;
356 }
357 else
358 kputs(" data", &str);
359
360 return ks_release(&str);
361 }
362
363 htsFile *hts_open(const char *fn, const char *mode)
364 {
365 htsFile *fp = NULL;
366 hFILE *hfile = hopen(fn, mode);
367 if (hfile == NULL) goto error;
368
369 fp = hts_hopen(hfile, fn, mode);
370 if (fp == NULL) goto error;
371
372 return fp;
373
374 error:
375 if (hts_verbose >= 2)
376 fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn);
377
378 if (hfile)
379 hclose_abruptly(hfile);
380
381 return NULL;
382 }
383
384 htsFile *hts_hopen(struct hFILE *hfile, const char *fn, const char *mode)
385 {
386 htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
387 if (fp == NULL) goto error;
388
389 fp->fn = strdup(fn);
390 fp->is_be = ed_is_big();
391
392 if (strchr(mode, 'r')) {
393 if (hts_detect_format(hfile, &fp->format) < 0) goto error;
394 }
395 else if (strchr(mode, 'w') || strchr(mode, 'a')) {
396 htsFormat *fmt = &fp->format;
397 fp->is_write = 1;
398
399 if (strchr(mode, 'b')) fmt->format = binary_format;
400 else if (strchr(mode, 'c')) fmt->format = cram;
401 else fmt->format = text_format;
402
403 if (strchr(mode, 'z')) fmt->compression = bgzf;
404 else if (strchr(mode, 'g')) fmt->compression = gzip;
405 else if (strchr(mode, 'u')) fmt->compression = no_compression;
406 else {
407 // No compression mode specified, set to the default for the format
408 switch (fmt->format) {
409 case binary_format: fmt->compression = bgzf; break;
410 case cram: fmt->compression = custom; break;
411 case text_format: fmt->compression = no_compression; break;
412 default: abort();
413 }
414 }
415
416 // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
417 fmt->category = format_category(fmt->format);
418
419 fmt->version.major = fmt->version.minor = -1;
420 fmt->compression_level = -1;
421 fmt->specific = NULL;
422 }
423 else goto error;
424
425 switch (fp->format.format) {
426 case binary_format:
427 case bam:
428 case bcf:
429 fp->fp.bgzf = bgzf_hopen(hfile, mode);
430 if (fp->fp.bgzf == NULL) goto error;
431 fp->is_bin = 1;
432 break;
433
434 case cram:
435 fp->fp.cram = cram_dopen(hfile, fn, mode);
436 if (fp->fp.cram == NULL) goto error;
437 if (!fp->is_write)
438 cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1);
439 fp->is_cram = 1;
440 break;
441
442 case text_format:
443 case sam:
444 case vcf:
445 if (!fp->is_write) {
446 #if KS_BGZF
447 BGZF *gzfp = bgzf_hopen(hfile, mode);
448 #else
449 // TODO Implement gzip hFILE adaptor
450 hclose(hfile); // This won't work, especially for stdin
451 gzFile gzfp = strcmp(fn, "-")? gzopen(fn, "rb") : gzdopen(fileno(stdin), "rb");
452 #endif
453 if (gzfp) fp->fp.voidp = ks_init(gzfp);
454 else goto error;
455 }
456 else if (fp->format.compression != no_compression) {
457 fp->fp.bgzf = bgzf_hopen(hfile, mode);
458 if (fp->fp.bgzf == NULL) goto error;
459 }
460 else
461 fp->fp.hfile = hfile;
462 break;
463
464 default:
465 goto error;
466 }
467
468 return fp;
469
470 error:
471 if (hts_verbose >= 2)
472 fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn);
473
474 if (fp) {
475 free(fp->fn);
476 free(fp->fn_aux);
477 free(fp);
478 }
479 return NULL;
480 }
481
482 int hts_close(htsFile *fp)
483 {
484 int ret, save;
485
486 switch (fp->format.format) {
487 case binary_format:
488 case bam:
489 case bcf:
490 ret = bgzf_close(fp->fp.bgzf);
491 break;
492
493 case cram:
494 if (!fp->is_write) {
495 switch (cram_eof(fp->fp.cram)) {
496 case 0:
497 fprintf(stderr, "[E::%s] Failed to decode sequence.\n", __func__);
498 return -1;
499 case 2:
500 fprintf(stderr, "[W::%s] EOF marker is absent. The input is probably truncated.\n", __func__);
501 break;
502 default: /* case 1, expected EOF */
503 break;
504 }
505 }
506 ret = cram_close(fp->fp.cram);
507 break;
508
509 case text_format:
510 case sam:
511 case vcf:
512 if (!fp->is_write) {
513 #if KS_BGZF
514 BGZF *gzfp = ((kstream_t*)fp->fp.voidp)->f;
515 ret = bgzf_close(gzfp);
516 #else
517 gzFile gzfp = ((kstream_t*)fp->fp.voidp)->f;
518 ret = gzclose(gzfp);
519 #endif
520 ks_destroy((kstream_t*)fp->fp.voidp);
521 }
522 else if (fp->format.compression != no_compression)
523 ret = bgzf_close(fp->fp.bgzf);
524 else
525 ret = hclose(fp->fp.hfile);
526 break;
527
528 default:
529 ret = -1;
530 break;
531 }
532
533 save = errno;
534 free(fp->fn);
535 free(fp->fn_aux);
536 free(fp->line.s);
537 free(fp);
538 errno = save;
539 return ret;
540 }
541
542 const htsFormat *hts_get_format(htsFile *fp)
543 {
544 return fp? &fp->format : NULL;
545 }
546
547 int hts_set_opt(htsFile *fp, enum cram_option opt, ...) {
548 int r;
549 va_list args;
550
551 if (fp->format.format != cram)
552 return 0;
553
554 va_start(args, opt);
555 r = cram_set_voption(fp->fp.cram, opt, args);
556 va_end(args);
557
558 return r;
559 }
560
561 int hts_set_threads(htsFile *fp, int n)
562 {
563 if (fp->format.compression == bgzf) {
564 return bgzf_mt(fp->fp.bgzf, n, 256);
565 } else if (fp->format.format == cram) {
566 return hts_set_opt(fp, CRAM_OPT_NTHREADS, n);
567 }
568 else return 0;
569 }
570
571 int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
572 {
573 free(fp->fn_aux);
574 if (fn_aux) {
575 fp->fn_aux = strdup(fn_aux);
576 if (fp->fn_aux == NULL) return -1;
577 }
578 else fp->fn_aux = NULL;
579
580 if (fp->format.format == cram)
581 cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux);
582
583 return 0;
584 }
585
586 // For VCF/BCF backward sweeper. Not exposing these functions because their
587 // future is uncertain. Things will probably have to change with hFILE...
588 BGZF *hts_get_bgzfp(htsFile *fp)
589 {
590 if ( fp->is_bin )
591 return fp->fp.bgzf;
592 else
593 return ((kstream_t*)fp->fp.voidp)->f;
594 }
595 int hts_useek(htsFile *fp, long uoffset, int where)
596 {
597 if ( fp->is_bin )
598 return bgzf_useek(fp->fp.bgzf, uoffset, where);
599 else
600 {
601 ks_rewind((kstream_t*)fp->fp.voidp);
602 ((kstream_t*)fp->fp.voidp)->seek_pos = uoffset;
603 return bgzf_useek(((kstream_t*)fp->fp.voidp)->f, uoffset, where);
604 }
605 }
606 long hts_utell(htsFile *fp)
607 {
608 if ( fp->is_bin )
609 return bgzf_utell(fp->fp.bgzf);
610 else
611 return ((kstream_t*)fp->fp.voidp)->seek_pos;
612 }
613
614 int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
615 {
616 int ret, dret;
617 ret = ks_getuntil((kstream_t*)fp->fp.voidp, delimiter, str, &dret);
618 ++fp->lineno;
619 return ret;
620 }
621
622 char **hts_readlist(const char *string, int is_file, int *_n)
623 {
624 int m = 0, n = 0, dret;
625 char **s = 0;
626 if ( is_file )
627 {
628 #if KS_BGZF
629 BGZF *fp = bgzf_open(string, "r");
630 #else
631 gzFile fp = gzopen(string, "r");
632 #endif
633 if ( !fp ) return NULL;
634
635 kstream_t *ks;
636 kstring_t str;
637 str.s = 0; str.l = str.m = 0;
638 ks = ks_init(fp);
639 while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0)
640 {
641 if (str.l == 0) continue;
642 n++;
643 hts_expand(char*,n,m,s);
644 s[n-1] = strdup(str.s);
645 }
646 ks_destroy(ks);
647 #if KS_BGZF
648 bgzf_close(fp);
649 #else
650 gzclose(fp);
651 #endif
652 free(str.s);
653 }
654 else
655 {
656 const char *q = string, *p = string;
657 while ( 1 )
658 {
659 if (*p == ',' || *p == 0)
660 {
661 n++;
662 hts_expand(char*,n,m,s);
663 s[n-1] = (char*)calloc(p - q + 1, 1);
664 strncpy(s[n-1], q, p - q);
665 q = p + 1;
666 }
667 if ( !*p ) break;
668 p++;
669 }
670 }
671 s = (char**)realloc(s, n * sizeof(char*));
672 *_n = n;
673 return s;
674 }
675
676 char **hts_readlines(const char *fn, int *_n)
677 {
678 int m = 0, n = 0, dret;
679 char **s = 0;
680 #if KS_BGZF
681 BGZF *fp = bgzf_open(fn, "r");
682 #else
683 gzFile fp = gzopen(fn, "r");
684 #endif
685 if ( fp ) { // read from file
686 kstream_t *ks;
687 kstring_t str;
688 str.s = 0; str.l = str.m = 0;
689 ks = ks_init(fp);
690 while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
691 if (str.l == 0) continue;
692 if (m == n) {
693 m = m? m<<1 : 16;
694 s = (char**)realloc(s, m * sizeof(char*));
695 }
696 s[n++] = strdup(str.s);
697 }
698 ks_destroy(ks);
699 #if KS_BGZF
700 bgzf_close(fp);
701 #else
702 gzclose(fp);
703 #endif
704 s = (char**)realloc(s, n * sizeof(char*));
705 free(str.s);
706 } else if (*fn == ':') { // read from string
707 const char *q, *p;
708 for (q = p = fn + 1;; ++p)
709 if (*p == ',' || *p == 0) {
710 if (m == n) {
711 m = m? m<<1 : 16;
712 s = (char**)realloc(s, m * sizeof(char*));
713 }
714 s[n] = (char*)calloc(p - q + 1, 1);
715 strncpy(s[n++], q, p - q);
716 q = p + 1;
717 if (*p == 0) break;
718 }
719 } else return 0;
720 s = (char**)realloc(s, n * sizeof(char*));
721 *_n = n;
722 return s;
723 }
724
725 // DEPRECATED: To be removed in a future HTSlib release
726 int hts_file_type(const char *fname)
727 {
728 int len = strlen(fname);
729 if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
730 if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
731 if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
732 if ( !strcmp("-",fname) ) return FT_STDIN;
733
734 hFILE *f = hopen(fname, "r");
735 if (f == NULL) return 0;
736
737 htsFormat fmt;
738 if (hts_detect_format(f, &fmt) < 0) { hclose_abruptly(f); return 0; }
739 if (hclose(f) < 0) return 0;
740
741 switch (fmt.format) {
742 case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ;
743 case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ;
744 default: return 0;
745 }
746 }
747
748 /****************
749 *** Indexing ***
750 ****************/
751
752 #define HTS_MIN_MARKER_DIST 0x10000
753
754 // Finds the special meta bin
755 // ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
756 #define META_BIN(idx) ((idx)->n_bins + 1)
757
758 #define pair64_lt(a,b) ((a).u < (b).u)
759
760 #include "htslib/ksort.h"
761 KSORT_INIT(_off, hts_pair64_t, pair64_lt)
762
763 typedef struct {
764 int32_t m, n;
765 uint64_t loff;
766 hts_pair64_t *list;
767 } bins_t;
768
769 #include "htslib/khash.h"
770 KHASH_MAP_INIT_INT(bin, bins_t)
771 typedef khash_t(bin) bidx_t;
772
773 typedef struct {
774 int32_t n, m;
775 uint64_t *offset;
776 } lidx_t;
777
778 struct __hts_idx_t {
779 int fmt, min_shift, n_lvls, n_bins;
780 uint32_t l_meta;
781 int32_t n, m;
782 uint64_t n_no_coor;
783 bidx_t **bidx;
784 lidx_t *lidx;
785 uint8_t *meta;
786 struct {
787 uint32_t last_bin, save_bin;
788 int last_coor, last_tid, save_tid, finished;
789 uint64_t last_off, save_off;
790 uint64_t off_beg, off_end;
791 uint64_t n_mapped, n_unmapped;
792 } z; // keep internal states
793 };
794
795 static inline void insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
796 {
797 khint_t k;
798 bins_t *l;
799 int absent;
800 k = kh_put(bin, b, bin, &absent);
801 l = &kh_value(b, k);
802 if (absent) {
803 l->m = 1; l->n = 0;
804 l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t));
805 }
806 if (l->n == l->m) {
807 l->m <<= 1;
808 l->list = (hts_pair64_t*)realloc(l->list, l->m * sizeof(hts_pair64_t));
809 }
810 l->list[l->n].u = beg;
811 l->list[l->n++].v = end;
812 }
813
814 static inline void insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
815 {
816 int i, beg, end;
817 beg = _beg >> min_shift;
818 end = (_end - 1) >> min_shift;
819 if (l->m < end + 1) {
820 int old_m = l->m;
821 l->m = end + 1;
822 kroundup32(l->m);
823 l->offset = (uint64_t*)realloc(l->offset, l->m * sizeof(uint64_t));
824 memset(l->offset + old_m, 0xff, 8 * (l->m - old_m)); // fill l->offset with (uint64_t)-1
825 }
826 if (beg == end) { // to save a loop in this case
827 if (l->offset[beg] == (uint64_t)-1) l->offset[beg] = offset;
828 } else {
829 for (i = beg; i <= end; ++i)
830 if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
831 }
832 if (l->n < end + 1) l->n = end + 1;
833 }
834
835 hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
836 {
837 hts_idx_t *idx;
838 idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
839 if (idx == NULL) return NULL;
840 idx->fmt = fmt;
841 idx->min_shift = min_shift;
842 idx->n_lvls = n_lvls;
843 idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
844 idx->z.save_bin = idx->z.save_tid = idx->z.last_tid = idx->z.last_bin = 0xffffffffu;
845 idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0;
846 idx->z.last_coor = 0xffffffffu;
847 if (n) {
848 idx->n = idx->m = n;
849 idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
850 if (idx->bidx == NULL) { free(idx); return NULL; }
851 idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
852 if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; }
853 }
854 return idx;
855 }
856
857 static void update_loff(hts_idx_t *idx, int i, int free_lidx)
858 {
859 bidx_t *bidx = idx->bidx[i];
860 lidx_t *lidx = &idx->lidx[i];
861 khint_t k;
862 int l;
863 uint64_t offset0 = 0;
864 if (bidx) {
865 k = kh_get(bin, bidx, META_BIN(idx));
866 if (k != kh_end(bidx))
867 offset0 = kh_val(bidx, k).list[0].u;
868 for (l = 0; l < lidx->n && lidx->offset[l] == (uint64_t)-1; ++l)
869 lidx->offset[l] = offset0;
870 } else l = 1;
871 for (; l < lidx->n; ++l) // fill missing values
872 if (lidx->offset[l] == (uint64_t)-1)
873 lidx->offset[l] = lidx->offset[l-1];
874 if (bidx == 0) return;
875 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
876 if (kh_exist(bidx, k))
877 {
878 if ( kh_key(bidx, k) < idx->n_bins )
879 {
880 int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
881 // disable linear index if bot_bin out of bounds
882 kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
883 }
884 else
885 kh_val(bidx, k).loff = 0;
886 }
887 if (free_lidx) {
888 free(lidx->offset);
889 lidx->m = lidx->n = 0;
890 lidx->offset = 0;
891 }
892 }
893
894 static void compress_binning(hts_idx_t *idx, int i)
895 {
896 bidx_t *bidx = idx->bidx[i];
897 khint_t k;
898 int l, m;
899 if (bidx == 0) return;
900 // merge a bin to its parent if the bin is too small
901 for (l = idx->n_lvls; l > 0; --l) {
902 unsigned start = hts_bin_first(l);
903 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
904 bins_t *p, *q;
905 if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
906 p = &kh_value(bidx, k);
907 if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
908 if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
909 khint_t kp;
910 kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
911 if (kp == kh_end(bidx)) continue;
912 q = &kh_val(bidx, kp);
913 if (q->n + p->n > q->m) {
914 q->m = q->n + p->n;
915 kroundup32(q->m);
916 q->list = (hts_pair64_t*)realloc(q->list, q->m * sizeof(hts_pair64_t));
917 }
918 memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t));
919 q->n += p->n;
920 free(p->list);
921 kh_del(bin, bidx, k);
922 }
923 }
924 }
925 k = kh_get(bin, bidx, 0);
926 if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
927 // merge adjacent chunks that start from the same BGZF block
928 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
929 bins_t *p;
930 if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
931 p = &kh_value(bidx, k);
932 for (l = 1, m = 0; l < p->n; ++l) {
933 if (p->list[m].v>>16 >= p->list[l].u>>16) {
934 if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
935 } else p->list[++m] = p->list[l];
936 }
937 p->n = m + 1;
938 }
939 }
940
941 void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
942 {
943 int i;
944 if (idx == NULL || idx->z.finished) return; // do not run this function on an empty index or multiple times
945 if (idx->z.save_tid >= 0) {
946 insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset);
947 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset);
948 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
949 }
950 for (i = 0; i < idx->n; ++i) {
951 update_loff(idx, i, (idx->fmt == HTS_FMT_CSI));
952 compress_binning(idx, i);
953 }
954 idx->z.finished = 1;
955 }
956
957 int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
958 {
959 int bin;
960 if (tid<0) beg = -1, end = 0;
961 if (tid >= idx->m) { // enlarge the index
962 int32_t oldm = idx->m;
963 idx->m = idx->m? idx->m<<1 : 2;
964 idx->bidx = (bidx_t**)realloc(idx->bidx, idx->m * sizeof(bidx_t*));
965 idx->lidx = (lidx_t*) realloc(idx->lidx, idx->m * sizeof(lidx_t));
966 memset(&idx->bidx[oldm], 0, (idx->m - oldm) * sizeof(bidx_t*));
967 memset(&idx->lidx[oldm], 0, (idx->m - oldm) * sizeof(lidx_t));
968 }
969 if (idx->n < tid + 1) idx->n = tid + 1;
970 if (idx->z.finished) return 0;
971 if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
972 if ( tid>=0 && idx->n_no_coor )
973 {
974 if (hts_verbose >= 1) fprintf(stderr,"[E::%s] NO_COOR reads not in a single block at the end %d %d\n", __func__, tid,idx->z.last_tid);
975 return -1;
976 }
977 if (tid>=0 && idx->bidx[tid] != 0)
978 {
979 if (hts_verbose >= 1) fprintf(stderr, "[E::%s] chromosome blocks not continuous\n", __func__);
980 return -1;
981 }
982 idx->z.last_tid = tid;
983 idx->z.last_bin = 0xffffffffu;
984 } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
985 if (hts_verbose >= 1) fprintf(stderr, "[E::%s] unsorted positions\n", __func__);
986 return -1;
987 }
988 if ( tid>=0 )
989 {
990 if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin);
991 if ( is_mapped)
992 insert_to_l(&idx->lidx[tid], beg, end, idx->z.last_off, idx->min_shift); // last_off points to the start of the current record
993 }
994 else idx->n_no_coor++;
995 bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
996 if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
997 if (idx->z.save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
998 insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, idx->z.last_off);
999 if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
1000 idx->z.off_end = idx->z.last_off;
1001 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, idx->z.off_end);
1002 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
1003 idx->z.n_mapped = idx->z.n_unmapped = 0;
1004 idx->z.off_beg = idx->z.off_end;
1005 }
1006 idx->z.save_off = idx->z.last_off;
1007 idx->z.save_bin = idx->z.last_bin = bin;
1008 idx->z.save_tid = tid;
1009 }
1010 if (is_mapped) ++idx->z.n_mapped;
1011 else ++idx->z.n_unmapped;
1012 idx->z.last_off = offset;
1013 idx->z.last_coor = beg;
1014 return 0;
1015 }
1016
1017 void hts_idx_destroy(hts_idx_t *idx)
1018 {
1019 khint_t k;
1020 int i;
1021 if (idx == 0) return;
1022 // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
1023 if (idx->fmt == HTS_FMT_CRAI) { free(idx); return; }
1024
1025 for (i = 0; i < idx->m; ++i) {
1026 bidx_t *bidx = idx->bidx[i];
1027 free(idx->lidx[i].offset);
1028 if (bidx == 0) continue;
1029 for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
1030 if (kh_exist(bidx, k))
1031 free(kh_value(bidx, k).list);
1032 kh_destroy(bin, bidx);
1033 }
1034 free(idx->bidx); free(idx->lidx); free(idx->meta);
1035 free(idx);
1036 }
1037
1038 static inline long idx_read(int is_bgzf, void *fp, void *buf, long l)
1039 {
1040 if (is_bgzf) return bgzf_read((BGZF*)fp, buf, l);
1041 else return (long)fread(buf, 1, l, (FILE*)fp);
1042 }
1043
1044 static inline long idx_write(int is_bgzf, void *fp, const void *buf, long l)
1045 {
1046 if (is_bgzf) return bgzf_write((BGZF*)fp, buf, l);
1047 else return (long)fwrite(buf, 1, l, (FILE*)fp);
1048 }
1049
1050 static inline void swap_bins(bins_t *p)
1051 {
1052 int i;
1053 for (i = 0; i < p->n; ++i) {
1054 ed_swap_8p(&p->list[i].u);
1055 ed_swap_8p(&p->list[i].v);
1056 }
1057 }
1058
1059 static void hts_idx_save_core(const hts_idx_t *idx, void *fp, int fmt)
1060 {
1061 int32_t i, size, is_be;
1062 int is_bgzf = (fmt != HTS_FMT_BAI);
1063 is_be = ed_is_big();
1064 if (is_be) {
1065 uint32_t x = idx->n;
1066 idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
1067 } else idx_write(is_bgzf, fp, &idx->n, 4);
1068 if (fmt == HTS_FMT_TBI && idx->l_meta) idx_write(is_bgzf, fp, idx->meta, idx->l_meta);
1069 for (i = 0; i < idx->n; ++i) {
1070 khint_t k;
1071 bidx_t *bidx = idx->bidx[i];
1072 lidx_t *lidx = &idx->lidx[i];
1073 // write binning index
1074 size = bidx? kh_size(bidx) : 0;
1075 if (is_be) { // big endian
1076 uint32_t x = size;
1077 idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
1078 } else idx_write(is_bgzf, fp, &size, 4);
1079 if (bidx == 0) goto write_lidx;
1080 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
1081 bins_t *p;
1082 if (!kh_exist(bidx, k)) continue;
1083 p = &kh_value(bidx, k);
1084 if (is_be) { // big endian
1085 uint32_t x;
1086 x = kh_key(bidx, k); idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
1087 if (fmt == HTS_FMT_CSI) {
1088 uint64_t y = kh_val(bidx, k).loff;
1089 idx_write(is_bgzf, fp, ed_swap_4p(&y), 8);
1090 }
1091 x = p->n; idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
1092 swap_bins(p);
1093 idx_write(is_bgzf, fp, p->list, 16 * p->n);
1094 swap_bins(p);
1095 } else {
1096 idx_write(is_bgzf, fp, &kh_key(bidx, k), 4);
1097 if (fmt == HTS_FMT_CSI) idx_write(is_bgzf, fp, &kh_val(bidx, k).loff, 8);
1098 //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
1099 idx_write(is_bgzf, fp, &p->n, 4);
1100 idx_write(is_bgzf, fp, p->list, p->n << 4);
1101 }
1102 }
1103 write_lidx:
1104 if (fmt != HTS_FMT_CSI) {
1105 if (is_be) {
1106 int32_t x = lidx->n;
1107 idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
1108 for (x = 0; x < lidx->n; ++x) ed_swap_8p(&lidx->offset[x]);
1109 idx_write(is_bgzf, fp, lidx->offset, lidx->n << 3);
1110 for (x = 0; x < lidx->n; ++x) ed_swap_8p(&lidx->offset[x]);
1111 } else {
1112 idx_write(is_bgzf, fp, &lidx->n, 4);
1113 idx_write(is_bgzf, fp, lidx->offset, lidx->n << 3);
1114 }
1115 }
1116 }
1117 if (is_be) { // write the number of reads without coordinates
1118 uint64_t x = idx->n_no_coor;
1119 idx_write(is_bgzf, fp, &x, 8);
1120 } else idx_write(is_bgzf, fp, &idx->n_no_coor, 8);
1121 }
1122
1123 void hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
1124 {
1125 char *fnidx;
1126 fnidx = (char*)calloc(1, strlen(fn) + 5);
1127 strcpy(fnidx, fn);
1128 if (fmt == HTS_FMT_CSI) {
1129 BGZF *fp;
1130 uint32_t x[3];
1131 int is_be, i;
1132 is_be = ed_is_big();
1133 fp = bgzf_open(strcat(fnidx, ".csi"), "w");
1134 bgzf_write(fp, "CSI\1", 4);
1135 x[0] = idx->min_shift; x[1] = idx->n_lvls; x[2] = idx->l_meta;
1136 if (is_be) {
1137 for (i = 0; i < 3; ++i)
1138 bgzf_write(fp, ed_swap_4p(&x[i]), 4);
1139 } else bgzf_write(fp, &x, 12);
1140 if (idx->l_meta) bgzf_write(fp, idx->meta, idx->l_meta);
1141 hts_idx_save_core(idx, fp, HTS_FMT_CSI);
1142 bgzf_close(fp);
1143 } else if (fmt == HTS_FMT_TBI) {
1144 BGZF *fp;
1145 fp = bgzf_open(strcat(fnidx, ".tbi"), "w");
1146 bgzf_write(fp, "TBI\1", 4);
1147 hts_idx_save_core(idx, fp, HTS_FMT_TBI);
1148 bgzf_close(fp);
1149 } else if (fmt == HTS_FMT_BAI) {
1150 FILE *fp;
1151 fp = fopen(strcat(fnidx, ".bai"), "w");
1152 fwrite("BAI\1", 1, 4, fp);
1153 hts_idx_save_core(idx, fp, HTS_FMT_BAI);
1154 fclose(fp);
1155 } else abort();
1156 free(fnidx);
1157 }
1158
1159 static int hts_idx_load_core(hts_idx_t *idx, void *fp, int fmt)
1160 {
1161 int32_t i, n, is_be;
1162 int is_bgzf = (fmt != HTS_FMT_BAI);
1163 is_be = ed_is_big();
1164 if (idx == NULL) return -4;
1165 for (i = 0; i < idx->n; ++i) {
1166 bidx_t *h;
1167 lidx_t *l = &idx->lidx[i];
1168 uint32_t key;
1169 int j, absent;
1170 bins_t *p;
1171 h = idx->bidx[i] = kh_init(bin);
1172 if (idx_read(is_bgzf, fp, &n, 4) != 4) return -1;
1173 if (is_be) ed_swap_4p(&n);
1174 for (j = 0; j < n; ++j) {
1175 khint_t k;
1176 if (idx_read(is_bgzf, fp, &key, 4) != 4) return -1;
1177 if (is_be) ed_swap_4p(&key);
1178 k = kh_put(bin, h, key, &absent);
1179 if (absent <= 0) return -3; // Duplicate bin number
1180 p = &kh_val(h, k);
1181 if (fmt == HTS_FMT_CSI) {
1182 if (idx_read(is_bgzf, fp, &p->loff, 8) != 8) return -1;
1183 if (is_be) ed_swap_8p(&p->loff);
1184 } else p->loff = 0;
1185 if (idx_read(is_bgzf, fp, &p->n, 4) != 4) return -1;
1186 if (is_be) ed_swap_4p(&p->n);
1187 p->m = p->n;
1188 p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t));
1189 if (p->list == NULL) return -2;
1190 if (idx_read(is_bgzf, fp, p->list, p->n<<4) != p->n<<4) return -1;
1191 if (is_be) swap_bins(p);
1192 }
1193 if (fmt != HTS_FMT_CSI) { // load linear index
1194 int j;
1195 if (idx_read(is_bgzf, fp, &l->n, 4) != 4) return -1;
1196 if (is_be) ed_swap_4p(&l->n);
1197 l->m = l->n;
1198 l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t));
1199 if (l->offset == NULL) return -2;
1200 if (idx_read(is_bgzf, fp, l->offset, l->n << 3) != l->n << 3) return -1;
1201 if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
1202 for (j = 1; j < l->n; ++j) // fill missing values; may happen given older samtools and tabix
1203 if (l->offset[j] == 0) l->offset[j] = l->offset[j-1];
1204 update_loff(idx, i, 1);
1205 }
1206 }
1207 if (idx_read(is_bgzf, fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
1208 if (is_be) ed_swap_8p(&idx->n_no_coor);
1209 return 0;
1210 }
1211
1212 hts_idx_t *hts_idx_load_local(const char *fn, int fmt)
1213 {
1214 uint8_t magic[4];
1215 int i, is_be;
1216 hts_idx_t *idx = NULL;
1217 is_be = ed_is_big();
1218 if (fmt == HTS_FMT_CSI) {
1219 BGZF *fp;
1220 uint32_t x[3], n;
1221 uint8_t *meta = 0;
1222 if ((fp = bgzf_open(fn, "r")) == 0) return NULL;
1223 if (bgzf_read(fp, magic, 4) != 4) goto csi_fail;
1224 if (memcmp(magic, "CSI\1", 4) != 0) goto csi_fail;
1225 if (bgzf_read(fp, x, 12) != 12) goto csi_fail;
1226 if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
1227 if (x[2]) {
1228 if ((meta = (uint8_t*)malloc(x[2])) == NULL) goto csi_fail;
1229 if (bgzf_read(fp, meta, x[2]) != x[2]) goto csi_fail;
1230 }
1231 if (bgzf_read(fp, &n, 4) != 4) goto csi_fail;
1232 if (is_be) ed_swap_4p(&n);
1233 if ((idx = hts_idx_init(n, fmt, 0, x[0], x[1])) == NULL) goto csi_fail;
1234 idx->l_meta = x[2];
1235 idx->meta = meta;
1236 meta = NULL;
1237 if (hts_idx_load_core(idx, fp, HTS_FMT_CSI) < 0) goto csi_fail;
1238 bgzf_close(fp);
1239 return idx;
1240
1241 csi_fail:
1242 bgzf_close(fp);
1243 hts_idx_destroy(idx);
1244 free(meta);
1245 return NULL;
1246
1247 } else if (fmt == HTS_FMT_TBI) {
1248 BGZF *fp;
1249 uint32_t x[8];
1250 if ((fp = bgzf_open(fn, "r")) == 0) return NULL;
1251 if (bgzf_read(fp, magic, 4) != 4) goto tbi_fail;
1252 if (memcmp(magic, "TBI\1", 4) != 0) goto tbi_fail;
1253 if (bgzf_read(fp, x, 32) != 32) goto tbi_fail;
1254 if (is_be) for (i = 0; i < 8; ++i) ed_swap_4p(&x[i]);
1255 if ((idx = hts_idx_init(x[0], fmt, 0, 14, 5)) == NULL) goto tbi_fail;
1256 idx->l_meta = 28 + x[7];
1257 if ((idx->meta = (uint8_t*)malloc(idx->l_meta)) == NULL) goto tbi_fail;
1258 memcpy(idx->meta, &x[1], 28);
1259 if (bgzf_read(fp, idx->meta + 28, x[7]) != x[7]) goto tbi_fail;
1260 if (hts_idx_load_core(idx, fp, HTS_FMT_TBI) < 0) goto tbi_fail;
1261 bgzf_close(fp);
1262 return idx;
1263
1264 tbi_fail:
1265 bgzf_close(fp);
1266 hts_idx_destroy(idx);
1267 return NULL;
1268
1269 } else if (fmt == HTS_FMT_BAI) {
1270 uint32_t n;
1271 FILE *fp;
1272 if ((fp = fopen(fn, "rb")) == 0) return NULL;
1273 if (fread(magic, 1, 4, fp) != 4) goto bai_fail;
1274 if (memcmp(magic, "BAI\1", 4) != 0) goto bai_fail;
1275 if (fread(&n, 4, 1, fp) != 1) goto bai_fail;
1276 if (is_be) ed_swap_4p(&n);
1277 idx = hts_idx_init(n, fmt, 0, 14, 5);
1278 if (hts_idx_load_core(idx, fp, HTS_FMT_BAI) < 0) goto bai_fail;
1279 fclose(fp);
1280 return idx;
1281
1282 bai_fail:
1283 fclose(fp);
1284 hts_idx_destroy(idx);
1285 return NULL;
1286
1287 } else abort();
1288 }
1289
1290 void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy)
1291 {
1292 if (idx->meta) free(idx->meta);
1293 idx->l_meta = l_meta;
1294 if (is_copy) {
1295 idx->meta = (uint8_t*)malloc(l_meta);
1296 memcpy(idx->meta, meta, l_meta);
1297 } else idx->meta = meta;
1298 }
1299
1300 uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta)
1301 {
1302 *l_meta = idx->l_meta;
1303 return idx->meta;
1304 }
1305
1306 const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
1307 {
1308 if ( !idx->n )
1309 {
1310 *n = 0;
1311 return NULL;
1312 }
1313
1314 int tid = 0, i;
1315 const char **names = (const char**) calloc(idx->n,sizeof(const char*));
1316 for (i=0; i<idx->n; i++)
1317 {
1318 bidx_t *bidx = idx->bidx[i];
1319 if ( !bidx ) continue;
1320 names[tid++] = getid(hdr,i);
1321 }
1322 *n = tid;
1323 return names;
1324 }
1325
1326 int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
1327 {
1328 if ( idx->fmt == HTS_FMT_CRAI ) {
1329 *mapped = 0; *unmapped = 0;
1330 return -1;
1331 }
1332
1333 bidx_t *h = idx->bidx[tid];
1334 khint_t k = kh_get(bin, h, META_BIN(idx));
1335 if (k != kh_end(h)) {
1336 *mapped = kh_val(h, k).list[1].u;
1337 *unmapped = kh_val(h, k).list[1].v;
1338 return 0;
1339 } else {
1340 *mapped = 0; *unmapped = 0;
1341 return -1;
1342 }
1343 }
1344
1345 uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
1346 {
1347 return idx->n_no_coor;
1348 }
1349
1350 /****************
1351 *** Iterator ***
1352 ****************/
1353
1354 static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls)
1355 {
1356 int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
1357 if (beg >= end) return 0;
1358 if (end >= 1LL<<s) end = 1LL<<s;
1359 for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
1360 int b, e, n, i;
1361 b = t + (beg>>s); e = t + (end>>s); n = e - b + 1;
1362 if (itr->bins.n + n > itr->bins.m) {
1363 itr->bins.m = itr->bins.n + n;
1364 kroundup32(itr->bins.m);
1365 itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m);
1366 }
1367 for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i;
1368 }
1369 return itr->bins.n;
1370 }
1371
1372 hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
1373 {
1374 int i, n_off, l, bin;
1375 hts_pair64_t *off;
1376 khint_t k;
1377 bidx_t *bidx;
1378 uint64_t min_off;
1379 hts_itr_t *iter = 0;
1380 if (tid < 0) {
1381 int finished0 = 0;
1382 uint64_t off0 = (uint64_t)-1;
1383 khint_t k;
1384 switch (tid) {
1385 case HTS_IDX_START:
1386 // Find the smallest offset, note that sequence ids may not be ordered sequentially
1387 for (i=0; i<idx->n; i++)
1388 {
1389 bidx = idx->bidx[i];
1390 k = kh_get(bin, bidx, META_BIN(idx));
1391 if (k == kh_end(bidx)) continue;
1392 if ( off0 > kh_val(bidx, k).list[0].u ) off0 = kh_val(bidx, k).list[0].u;
1393 }
1394 if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam
1395 break;
1396
1397 case HTS_IDX_NOCOOR:
1398 if ( idx->n>0 )
1399 {
1400 bidx = idx->bidx[idx->n - 1];
1401 k = kh_get(bin, bidx, META_BIN(idx));
1402 if (k != kh_end(bidx)) off0 = kh_val(bidx, k).list[0].v;
1403 }
1404 if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam
1405 break;
1406
1407 case HTS_IDX_REST:
1408 off0 = 0;
1409 break;
1410
1411 case HTS_IDX_NONE:
1412 finished0 = 1;
1413 off0 = 0;
1414 break;
1415
1416 default:
1417 return 0;
1418 }
1419 if (off0 != (uint64_t)-1) {
1420 iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
1421 iter->read_rest = 1;
1422 iter->finished = finished0;
1423 iter->curr_off = off0;
1424 iter->readrec = readrec;
1425 return iter;
1426 } else return 0;
1427 }
1428
1429 if (beg < 0) beg = 0;
1430 if (end < beg) return 0;
1431 if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) return 0;
1432
1433 iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
1434 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
1435 iter->readrec = readrec;
1436
1437 // compute min_off
1438 bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift);
1439 do {
1440 int first;
1441 k = kh_get(bin, bidx, bin);
1442 if (k != kh_end(bidx)) break;
1443 first = (hts_bin_parent(bin)<<3) + 1;
1444 if (bin > first) --bin;
1445 else bin = hts_bin_parent(bin);
1446 } while (bin);
1447 if (bin == 0) k = kh_get(bin, bidx, bin);
1448 min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
1449 // retrieve bins
1450 reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls);
1451 for (i = n_off = 0; i < iter->bins.n; ++i)
1452 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
1453 n_off += kh_value(bidx, k).n;
1454 if (n_off == 0) return iter;
1455 off = (hts_pair64_t*)calloc(n_off, sizeof(hts_pair64_t));
1456 for (i = n_off = 0; i < iter->bins.n; ++i) {
1457 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
1458 int j;
1459 bins_t *p = &kh_value(bidx, k);
1460 for (j = 0; j < p->n; ++j)
1461 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
1462 }
1463 }
1464 if (n_off == 0) {
1465 free(off); return iter;
1466 }
1467 ks_introsort(_off, n_off, off);
1468 // resolve completely contained adjacent blocks
1469 for (i = 1, l = 0; i < n_off; ++i)
1470 if (off[l].v < off[i].v) off[++l] = off[i];
1471 n_off = l + 1;
1472 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
1473 for (i = 1; i < n_off; ++i)
1474 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
1475 // merge adjacent blocks
1476 for (i = 1, l = 0; i < n_off; ++i) {
1477 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
1478 else off[++l] = off[i];
1479 }
1480 n_off = l + 1;
1481 iter->n_off = n_off; iter->off = off;
1482 return iter;
1483 }
1484
1485 void hts_itr_destroy(hts_itr_t *iter)
1486 {
1487 if (iter) { free(iter->off); free(iter->bins.a); free(iter); }
1488 }
1489
1490 const char *hts_parse_reg(const char *s, int *beg, int *end)
1491 {
1492 int i, k, l, name_end;
1493 *beg = *end = -1;
1494 name_end = l = strlen(s);
1495 // determine the sequence name
1496 for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
1497 if (i >= 0) name_end = i;
1498 if (name_end < l) { // check if this is really the end
1499 int n_hyphen = 0;
1500 for (i = name_end + 1; i < l; ++i) {
1501 if (s[i] == '-') ++n_hyphen;
1502 else if (!isdigit(s[i]) && s[i] != ',') break;
1503 }
1504 if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
1505 }
1506 // parse the interval
1507 if (name_end < l) {
1508 char *tmp;
1509 tmp = (char*)alloca(l - name_end + 1);
1510 for (i = name_end + 1, k = 0; i < l; ++i)
1511 if (s[i] != ',') tmp[k++] = s[i];
1512 tmp[k] = 0;
1513 if ((*beg = strtol(tmp, &tmp, 10) - 1) < 0) *beg = 0;
1514 *end = *tmp? strtol(tmp + 1, &tmp, 10) : INT_MAX;
1515 if (*beg > *end) name_end = l;
1516 }
1517 if (name_end == l) *beg = 0, *end = INT_MAX;
1518 return s + name_end;
1519 }
1520
1521 hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
1522 {
1523 int tid, beg, end;
1524 char *q, *tmp;
1525 if (strcmp(reg, ".") == 0)
1526 return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
1527 else if (strcmp(reg, "*") != 0) {
1528 q = (char*)hts_parse_reg(reg, &beg, &end);
1529 tmp = (char*)alloca(q - reg + 1);
1530 strncpy(tmp, reg, q - reg);
1531 tmp[q - reg] = 0;
1532 if ((tid = getid(hdr, tmp)) < 0)
1533 tid = getid(hdr, reg);
1534 if (tid < 0) return 0;
1535 return itr_query(idx, tid, beg, end, readrec);
1536 } else return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
1537 }
1538
1539 int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
1540 {
1541 int ret, tid, beg, end;
1542 if (iter == NULL || iter->finished) return -1;
1543 if (iter->read_rest) {
1544 if (iter->curr_off) { // seek to the start
1545 bgzf_seek(fp, iter->curr_off, SEEK_SET);
1546 iter->curr_off = 0; // only seek once
1547 }
1548 ret = iter->readrec(fp, data, r, &tid, &beg, &end);
1549 if (ret < 0) iter->finished = 1;
1550 iter->curr_tid = tid;
1551 iter->curr_beg = beg;
1552 iter->curr_end = end;
1553 return ret;
1554 }
1555 if (iter->off == 0) return -1;
1556 for (;;) {
1557 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
1558 if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
1559 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
1560 bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
1561 iter->curr_off = bgzf_tell(fp);
1562 }
1563 ++iter->i;
1564 }
1565 if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
1566 iter->curr_off = bgzf_tell(fp);
1567 if (tid != iter->tid || beg >= iter->end) { // no need to proceed
1568 ret = -1; break;
1569 } else if (end > iter->beg && iter->end > beg) {
1570 iter->curr_tid = tid;
1571 iter->curr_beg = beg;
1572 iter->curr_end = end;
1573 return ret;
1574 }
1575 } else break; // end of file or error
1576 }
1577 iter->finished = 1;
1578 return ret;
1579 }
1580
1581 /**********************
1582 *** Retrieve index ***
1583 **********************/
1584
1585 static char *test_and_fetch(const char *fn)
1586 {
1587 FILE *fp;
1588 if (hisremote(fn)) {
1589 const int buf_size = 1 * 1024 * 1024;
1590 hFILE *fp_remote;
1591 uint8_t *buf;
1592 int l;
1593 const char *p;
1594 for (p = fn + strlen(fn) - 1; p >= fn; --p)
1595 if (*p == '/') break;
1596 ++p; // p now points to the local file name
1597 // Attempt to open local file first
1598 if ((fp = fopen((char*)p, "rb")) != 0)
1599 {
1600 fclose(fp);
1601 return (char*)p;
1602 }
1603 // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .tbi index.
1604 if ((fp_remote = hopen(fn, "r")) == 0) return 0;
1605 if ((fp = fopen(p, "w")) == 0) {
1606 if (hts_verbose >= 1) fprintf(stderr, "[E::%s] fail to create file '%s' in the working directory\n", __func__, p);
1607 hclose_abruptly(fp_remote);
1608 return 0;
1609 }
1610 if (hts_verbose >= 3) fprintf(stderr, "[M::%s] downloading file '%s' to local directory\n", __func__, fn);
1611 buf = (uint8_t*)calloc(buf_size, 1);
1612 while ((l = hread(fp_remote, buf, buf_size)) > 0) fwrite(buf, 1, l, fp);
1613 free(buf);
1614 fclose(fp);
1615 if (hclose(fp_remote) != 0) fprintf(stderr, "[E::%s] fail to close remote file '%s'\n", __func__, fn);
1616 return (char*)p;
1617 } else {
1618 if ((fp = fopen(fn, "rb")) == 0) return 0;
1619 fclose(fp);
1620 return (char*)fn;
1621 }
1622 }
1623
1624 char *hts_idx_getfn(const char *fn, const char *ext)
1625 {
1626 int i, l_fn, l_ext;
1627 char *fnidx, *ret;
1628 l_fn = strlen(fn); l_ext = strlen(ext);
1629 fnidx = (char*)calloc(l_fn + l_ext + 1, 1);
1630 strcpy(fnidx, fn); strcpy(fnidx + l_fn, ext);
1631 if ((ret = test_and_fetch(fnidx)) == 0) {
1632 for (i = l_fn - 1; i > 0; --i)
1633 if (fnidx[i] == '.') break;
1634 strcpy(fnidx + i, ext);
1635 ret = test_and_fetch(fnidx);
1636 }
1637 if (ret == 0) {
1638 free(fnidx);
1639 return 0;
1640 }
1641 l_fn = strlen(ret);
1642 memmove(fnidx, ret, l_fn + 1);
1643 return fnidx;
1644 }
1645
1646 hts_idx_t *hts_idx_load(const char *fn, int fmt)
1647 {
1648 char *fnidx;
1649 hts_idx_t *idx;
1650 fnidx = hts_idx_getfn(fn, ".csi");
1651 if (fnidx) fmt = HTS_FMT_CSI;
1652 else fnidx = hts_idx_getfn(fn, fmt == HTS_FMT_BAI? ".bai" : ".tbi");
1653 if (fnidx == 0) return 0;
1654
1655 // Check that the index file is up to date, the main file might have changed
1656 struct stat stat_idx,stat_main;
1657 if ( !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) )
1658 {
1659 if ( stat_idx.st_mtime < stat_main.st_mtime )
1660 fprintf(stderr, "Warning: The index file is older than the data file: %s\n", fnidx);
1661 }
1662 idx = hts_idx_load_local(fnidx, fmt);
1663 free(fnidx);
1664 return idx;
1665 }