annotate ezBAMQC/src/htslib/tbx.c @ 8:82bb8c455761

Uploaded
author cshl-bsr
date Wed, 30 Mar 2016 12:03:10 -0400
parents dfa3745e5fd8
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1 /* tbx.c -- tabix API functions.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3 Copyright (C) 2009, 2010, 2012-2014 Genome Research Ltd.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
4 Copyright (C) 2010-2012 Broad Institute.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
5
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
6 Author: Heng Li <lh3@sanger.ac.uk>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
7
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
8 Permission is hereby granted, free of charge, to any person obtaining a copy
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
9 of this software and associated documentation files (the "Software"), to deal
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
10 in the Software without restriction, including without limitation the rights
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
12 copies of the Software, and to permit persons to whom the Software is
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
13 furnished to do so, subject to the following conditions:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
14
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
15 The above copyright notice and this permission notice shall be included in
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
16 all copies or substantial portions of the Software.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
17
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
24 DEALINGS IN THE SOFTWARE. */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
25
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
26 #include <stdlib.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
27 #include <string.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
28 #include <ctype.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
29 #include <stdio.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
30 #include <assert.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
31 #include "htslib/tbx.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
32 #include "htslib/bgzf.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
33
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
34 #include "htslib/khash.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
35 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
36
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
37 tbx_conf_t tbx_conf_gff = { 0, 1, 4, 5, '#', 0 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
38 tbx_conf_t tbx_conf_bed = { TBX_UCSC, 1, 2, 3, '#', 0 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
39 tbx_conf_t tbx_conf_psltbl = { TBX_UCSC, 15, 17, 18, '#', 0 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
40 tbx_conf_t tbx_conf_sam = { TBX_SAM, 3, 4, 0, '@', 0 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
41 tbx_conf_t tbx_conf_vcf = { TBX_VCF, 1, 2, 0, '#', 0 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
42
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
43 typedef struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
44 int64_t beg, end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
45 char *ss, *se;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
46 int tid;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
47 } tbx_intv_t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
48
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
49 static inline int get_tid(tbx_t *tbx, const char *ss, int is_add)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
50 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
51 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
52 khash_t(s2i) *d;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
53 if (tbx->dict == 0) tbx->dict = kh_init(s2i);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
54 d = (khash_t(s2i)*)tbx->dict;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
55 if (is_add) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
56 int absent;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
57 k = kh_put(s2i, d, ss, &absent);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
58 if (absent) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
59 kh_key(d, k) = strdup(ss);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
60 kh_val(d, k) = kh_size(d) - 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
61 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
62 } else k = kh_get(s2i, d, ss);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
63 return k == kh_end(d)? -1 : kh_val(d, k);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
64 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
65
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
66 int tbx_name2id(tbx_t *tbx, const char *ss)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
67 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
68 return get_tid(tbx, ss, 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
69 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
70
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
71 int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
72 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
73 int i, b = 0, id = 1, ncols = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
74 char *s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
75 intv->ss = intv->se = 0; intv->beg = intv->end = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
76 for (i = 0; i <= len; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
77 if (line[i] == '\t' || line[i] == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
78 ++ncols;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
79 if (id == conf->sc) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
80 intv->ss = line + b; intv->se = line + i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
81 } else if (id == conf->bc) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
82 // here ->beg is 0-based.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
83 intv->beg = intv->end = strtol(line + b, &s, 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
84 if ( s==line+b ) return -1; // expected int
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
85 if (!(conf->preset&TBX_UCSC)) --intv->beg;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
86 else ++intv->end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
87 if (intv->beg < 0) intv->beg = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
88 if (intv->end < 1) intv->end = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
89 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
90 if ((conf->preset&0xffff) == TBX_GENERIC) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
91 if (id == conf->ec)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
92 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
93 intv->end = strtol(line + b, &s, 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
94 if ( s==line+b ) return -1; // expected int
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
95 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
96 } else if ((conf->preset&0xffff) == TBX_SAM) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
97 if (id == 6) { // CIGAR
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
98 int l = 0, op;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
99 char *t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
100 for (s = line + b; s < line + i;) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
101 long x = strtol(s, &t, 10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
102 op = toupper(*t);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
103 if (op == 'M' || op == 'D' || op == 'N') l += x;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
104 s = t + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
105 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
106 if (l == 0) l = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
107 intv->end = intv->beg + l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
108 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
109 } else if ((conf->preset&0xffff) == TBX_VCF) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
110 if (id == 4) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
111 if (b < i) intv->end = intv->beg + (i - b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
112 } else if (id == 8) { // look for "END="
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
113 int c = line[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
114 line[i] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
115 s = strstr(line + b, "END=");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
116 if (s == line + b) s += 4;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
117 else if (s) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
118 s = strstr(line + b, ";END=");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
119 if (s) s += 5;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
120 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
121 if (s) intv->end = strtol(s, &s, 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
122 line[i] = c;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
123 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
124 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
125 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
126 b = i + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
127 ++id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
128 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
129 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
130 if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
131 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
132 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
133
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
134 static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_add)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
135 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
136 if (tbx_parse1(&tbx->conf, str->l, str->s, intv) == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
137 int c = *intv->se;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
138 *intv->se = '\0'; intv->tid = get_tid(tbx, intv->ss, is_add); *intv->se = c;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
139 return (intv->tid >= 0 && intv->beg >= 0 && intv->end >= 0)? 0 : -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
140 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
141 char *type = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
142 switch (tbx->conf.preset&0xffff)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
143 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
144 case TBX_SAM: type = "TBX_SAM"; break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
145 case TBX_VCF: type = "TBX_VCF"; break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
146 case TBX_UCSC: type = "TBX_UCSC"; break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
147 default: type = "TBX_GENERIC"; break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
148 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
149 fprintf(stderr, "[E::%s] failed to parse %s, was wrong -p [type] used?\nThe offending line was: \"%s\"\n", __func__, type, str->s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
150 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
151 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
152 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
153
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
154 int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
155 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
156 tbx_t *tbx = (tbx_t *) tbxv;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
157 kstring_t *s = (kstring_t *) sv;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
158 int ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
159 if ((ret = bgzf_getline(fp, '\n', s)) >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
160 tbx_intv_t intv;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
161 get_intv(tbx, s, &intv, 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
162 *tid = intv.tid; *beg = intv.beg; *end = intv.end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
163 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
164 return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
165 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
166
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
167 void tbx_set_meta(tbx_t *tbx)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
168 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
169 int i, l = 0, l_nm;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
170 uint32_t x[7];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
171 char **name;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
172 uint8_t *meta;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
173 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
174 khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
175
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
176 memcpy(x, &tbx->conf, 24);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
177 name = (char**)malloc(sizeof(char*) * kh_size(d));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
178 for (k = kh_begin(d), l = 0; k != kh_end(d); ++k) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
179 if (!kh_exist(d, k)) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
180 name[kh_val(d, k)] = (char*)kh_key(d, k);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
181 l += strlen(kh_key(d, k)) + 1; // +1 to include '\0'
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
182 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
183 l_nm = x[6] = l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
184 meta = (uint8_t*)malloc(l_nm + 28);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
185 if (ed_is_big())
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
186 for (i = 0; i < 7; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
187 x[i] = ed_swap_4(x[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
188 memcpy(meta, x, 28);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
189 for (l = 28, i = 0; i < (int)kh_size(d); ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
190 int x = strlen(name[i]) + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
191 memcpy(meta + l, name[i], x);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
192 l += x;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
193 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
194 free(name);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
195 hts_idx_set_meta(tbx->idx, l, meta, 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
196 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
197
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
198 tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
199 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
200 tbx_t *tbx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
201 kstring_t str;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
202 int ret, first = 0, n_lvls, fmt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
203 int64_t lineno = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
204 uint64_t last_off = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
205 tbx_intv_t intv;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
206
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
207 str.s = 0; str.l = str.m = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
208 tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
209 tbx->conf = *conf;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
210 if (min_shift > 0) n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3, fmt = HTS_FMT_CSI;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
211 else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_TBI;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
212 while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
213 ++lineno;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
214 if (lineno <= tbx->conf.line_skip || str.s[0] == tbx->conf.meta_char) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
215 last_off = bgzf_tell(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
216 continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
217 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
218 if (first == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
219 tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
220 first = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
221 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
222 get_intv(tbx, &str, &intv, 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
223 ret = hts_idx_push(tbx->idx, intv.tid, intv.beg, intv.end, bgzf_tell(fp), 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
224 if (ret < 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
225 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
226 free(str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
227 tbx_destroy(tbx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
228 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
229 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
230 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
231 if ( !tbx->idx ) tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); // empty file
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
232 if ( !tbx->dict ) tbx->dict = kh_init(s2i);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
233 hts_idx_finish(tbx->idx, bgzf_tell(fp));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
234 tbx_set_meta(tbx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
235 free(str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
236 return tbx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
237 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
238
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
239 void tbx_destroy(tbx_t *tbx)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
240 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
241 khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
242 if (d != NULL)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
243 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
244 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
245 for (k = kh_begin(d); k != kh_end(d); ++k)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
246 if (kh_exist(d, k)) free((char*)kh_key(d, k));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
247 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
248 hts_idx_destroy(tbx->idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
249 kh_destroy(s2i, d);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
250 free(tbx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
251 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
252
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
253 int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
254 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
255 tbx_t *tbx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
256 BGZF *fp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
257 if ( bgzf_is_bgzf(fn)!=1 ) { fprintf(stderr,"Not a BGZF file: %s\n", fn); return -1; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
258 if ((fp = bgzf_open(fn, "r")) == 0) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
259 if ( !fp->is_compressed ) { bgzf_close(fp); return -1; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
260 tbx = tbx_index(fp, min_shift, conf);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
261 bgzf_close(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
262 if ( !tbx ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
263 hts_idx_save(tbx->idx, fn, min_shift > 0? HTS_FMT_CSI : HTS_FMT_TBI);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
264 tbx_destroy(tbx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
265 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
266 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
267
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
268 tbx_t *tbx_index_load(const char *fn)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
269 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
270 tbx_t *tbx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
271 uint8_t *meta;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
272 char *nm, *p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
273 uint32_t x[7];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
274 int l_meta, l_nm;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
275 tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
276 tbx->idx = hts_idx_load(fn, HTS_FMT_TBI);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
277 if ( !tbx->idx )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
278 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
279 free(tbx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
280 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
281 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
282 meta = hts_idx_get_meta(tbx->idx, &l_meta);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
283 if ( !meta )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
284 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
285 free(tbx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
286 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
287 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
288 memcpy(x, meta, 28);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
289 memcpy(&tbx->conf, x, 24);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
290 p = nm = (char*)meta + 28;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
291 l_nm = x[6];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
292 for (; p - nm < l_nm; p += strlen(p) + 1) get_tid(tbx, p, 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
293 return tbx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
294 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
295
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
296 const char **tbx_seqnames(tbx_t *tbx, int *n)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
297 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
298 khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
299 if (d == NULL)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
300 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
301 *n = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
302 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
303 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
304 int tid, m = kh_size(d);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
305 const char **names = (const char**) calloc(m,sizeof(const char*));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
306 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
307 for (k=kh_begin(d); k<kh_end(d); k++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
308 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
309 if ( !kh_exist(d,k) ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
310 tid = kh_val(d,k);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
311 assert( tid<m );
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
312 names[tid] = kh_key(d,k);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
313 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
314 // sanity check: there should be no gaps
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
315 for (tid=0; tid<m; tid++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
316 assert(names[tid]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
317 *n = m;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
318 return names;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
319 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
320