comparison ezBAMQC/src/htslib/tbx.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /* tbx.c -- tabix API functions.
2
3 Copyright (C) 2009, 2010, 2012-2014 Genome Research Ltd.
4 Copyright (C) 2010-2012 Broad Institute.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include <stdio.h>
30 #include <assert.h>
31 #include "htslib/tbx.h"
32 #include "htslib/bgzf.h"
33
34 #include "htslib/khash.h"
35 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
36
37 tbx_conf_t tbx_conf_gff = { 0, 1, 4, 5, '#', 0 };
38 tbx_conf_t tbx_conf_bed = { TBX_UCSC, 1, 2, 3, '#', 0 };
39 tbx_conf_t tbx_conf_psltbl = { TBX_UCSC, 15, 17, 18, '#', 0 };
40 tbx_conf_t tbx_conf_sam = { TBX_SAM, 3, 4, 0, '@', 0 };
41 tbx_conf_t tbx_conf_vcf = { TBX_VCF, 1, 2, 0, '#', 0 };
42
43 typedef struct {
44 int64_t beg, end;
45 char *ss, *se;
46 int tid;
47 } tbx_intv_t;
48
49 static inline int get_tid(tbx_t *tbx, const char *ss, int is_add)
50 {
51 khint_t k;
52 khash_t(s2i) *d;
53 if (tbx->dict == 0) tbx->dict = kh_init(s2i);
54 d = (khash_t(s2i)*)tbx->dict;
55 if (is_add) {
56 int absent;
57 k = kh_put(s2i, d, ss, &absent);
58 if (absent) {
59 kh_key(d, k) = strdup(ss);
60 kh_val(d, k) = kh_size(d) - 1;
61 }
62 } else k = kh_get(s2i, d, ss);
63 return k == kh_end(d)? -1 : kh_val(d, k);
64 }
65
66 int tbx_name2id(tbx_t *tbx, const char *ss)
67 {
68 return get_tid(tbx, ss, 0);
69 }
70
71 int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv)
72 {
73 int i, b = 0, id = 1, ncols = 0;
74 char *s;
75 intv->ss = intv->se = 0; intv->beg = intv->end = -1;
76 for (i = 0; i <= len; ++i) {
77 if (line[i] == '\t' || line[i] == 0) {
78 ++ncols;
79 if (id == conf->sc) {
80 intv->ss = line + b; intv->se = line + i;
81 } else if (id == conf->bc) {
82 // here ->beg is 0-based.
83 intv->beg = intv->end = strtol(line + b, &s, 0);
84 if ( s==line+b ) return -1; // expected int
85 if (!(conf->preset&TBX_UCSC)) --intv->beg;
86 else ++intv->end;
87 if (intv->beg < 0) intv->beg = 0;
88 if (intv->end < 1) intv->end = 1;
89 } else {
90 if ((conf->preset&0xffff) == TBX_GENERIC) {
91 if (id == conf->ec)
92 {
93 intv->end = strtol(line + b, &s, 0);
94 if ( s==line+b ) return -1; // expected int
95 }
96 } else if ((conf->preset&0xffff) == TBX_SAM) {
97 if (id == 6) { // CIGAR
98 int l = 0, op;
99 char *t;
100 for (s = line + b; s < line + i;) {
101 long x = strtol(s, &t, 10);
102 op = toupper(*t);
103 if (op == 'M' || op == 'D' || op == 'N') l += x;
104 s = t + 1;
105 }
106 if (l == 0) l = 1;
107 intv->end = intv->beg + l;
108 }
109 } else if ((conf->preset&0xffff) == TBX_VCF) {
110 if (id == 4) {
111 if (b < i) intv->end = intv->beg + (i - b);
112 } else if (id == 8) { // look for "END="
113 int c = line[i];
114 line[i] = 0;
115 s = strstr(line + b, "END=");
116 if (s == line + b) s += 4;
117 else if (s) {
118 s = strstr(line + b, ";END=");
119 if (s) s += 5;
120 }
121 if (s) intv->end = strtol(s, &s, 0);
122 line[i] = c;
123 }
124 }
125 }
126 b = i + 1;
127 ++id;
128 }
129 }
130 if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1;
131 return 0;
132 }
133
134 static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_add)
135 {
136 if (tbx_parse1(&tbx->conf, str->l, str->s, intv) == 0) {
137 int c = *intv->se;
138 *intv->se = '\0'; intv->tid = get_tid(tbx, intv->ss, is_add); *intv->se = c;
139 return (intv->tid >= 0 && intv->beg >= 0 && intv->end >= 0)? 0 : -1;
140 } else {
141 char *type = NULL;
142 switch (tbx->conf.preset&0xffff)
143 {
144 case TBX_SAM: type = "TBX_SAM"; break;
145 case TBX_VCF: type = "TBX_VCF"; break;
146 case TBX_UCSC: type = "TBX_UCSC"; break;
147 default: type = "TBX_GENERIC"; break;
148 }
149 fprintf(stderr, "[E::%s] failed to parse %s, was wrong -p [type] used?\nThe offending line was: \"%s\"\n", __func__, type, str->s);
150 return -1;
151 }
152 }
153
154 int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end)
155 {
156 tbx_t *tbx = (tbx_t *) tbxv;
157 kstring_t *s = (kstring_t *) sv;
158 int ret;
159 if ((ret = bgzf_getline(fp, '\n', s)) >= 0) {
160 tbx_intv_t intv;
161 get_intv(tbx, s, &intv, 0);
162 *tid = intv.tid; *beg = intv.beg; *end = intv.end;
163 }
164 return ret;
165 }
166
167 void tbx_set_meta(tbx_t *tbx)
168 {
169 int i, l = 0, l_nm;
170 uint32_t x[7];
171 char **name;
172 uint8_t *meta;
173 khint_t k;
174 khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
175
176 memcpy(x, &tbx->conf, 24);
177 name = (char**)malloc(sizeof(char*) * kh_size(d));
178 for (k = kh_begin(d), l = 0; k != kh_end(d); ++k) {
179 if (!kh_exist(d, k)) continue;
180 name[kh_val(d, k)] = (char*)kh_key(d, k);
181 l += strlen(kh_key(d, k)) + 1; // +1 to include '\0'
182 }
183 l_nm = x[6] = l;
184 meta = (uint8_t*)malloc(l_nm + 28);
185 if (ed_is_big())
186 for (i = 0; i < 7; ++i)
187 x[i] = ed_swap_4(x[i]);
188 memcpy(meta, x, 28);
189 for (l = 28, i = 0; i < (int)kh_size(d); ++i) {
190 int x = strlen(name[i]) + 1;
191 memcpy(meta + l, name[i], x);
192 l += x;
193 }
194 free(name);
195 hts_idx_set_meta(tbx->idx, l, meta, 0);
196 }
197
198 tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf)
199 {
200 tbx_t *tbx;
201 kstring_t str;
202 int ret, first = 0, n_lvls, fmt;
203 int64_t lineno = 0;
204 uint64_t last_off = 0;
205 tbx_intv_t intv;
206
207 str.s = 0; str.l = str.m = 0;
208 tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
209 tbx->conf = *conf;
210 if (min_shift > 0) n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3, fmt = HTS_FMT_CSI;
211 else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_TBI;
212 while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
213 ++lineno;
214 if (lineno <= tbx->conf.line_skip || str.s[0] == tbx->conf.meta_char) {
215 last_off = bgzf_tell(fp);
216 continue;
217 }
218 if (first == 0) {
219 tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls);
220 first = 1;
221 }
222 get_intv(tbx, &str, &intv, 1);
223 ret = hts_idx_push(tbx->idx, intv.tid, intv.beg, intv.end, bgzf_tell(fp), 1);
224 if (ret < 0)
225 {
226 free(str.s);
227 tbx_destroy(tbx);
228 return NULL;
229 }
230 }
231 if ( !tbx->idx ) tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); // empty file
232 if ( !tbx->dict ) tbx->dict = kh_init(s2i);
233 hts_idx_finish(tbx->idx, bgzf_tell(fp));
234 tbx_set_meta(tbx);
235 free(str.s);
236 return tbx;
237 }
238
239 void tbx_destroy(tbx_t *tbx)
240 {
241 khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
242 if (d != NULL)
243 {
244 khint_t k;
245 for (k = kh_begin(d); k != kh_end(d); ++k)
246 if (kh_exist(d, k)) free((char*)kh_key(d, k));
247 }
248 hts_idx_destroy(tbx->idx);
249 kh_destroy(s2i, d);
250 free(tbx);
251 }
252
253 int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf)
254 {
255 tbx_t *tbx;
256 BGZF *fp;
257 if ( bgzf_is_bgzf(fn)!=1 ) { fprintf(stderr,"Not a BGZF file: %s\n", fn); return -1; }
258 if ((fp = bgzf_open(fn, "r")) == 0) return -1;
259 if ( !fp->is_compressed ) { bgzf_close(fp); return -1; }
260 tbx = tbx_index(fp, min_shift, conf);
261 bgzf_close(fp);
262 if ( !tbx ) return -1;
263 hts_idx_save(tbx->idx, fn, min_shift > 0? HTS_FMT_CSI : HTS_FMT_TBI);
264 tbx_destroy(tbx);
265 return 0;
266 }
267
268 tbx_t *tbx_index_load(const char *fn)
269 {
270 tbx_t *tbx;
271 uint8_t *meta;
272 char *nm, *p;
273 uint32_t x[7];
274 int l_meta, l_nm;
275 tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
276 tbx->idx = hts_idx_load(fn, HTS_FMT_TBI);
277 if ( !tbx->idx )
278 {
279 free(tbx);
280 return NULL;
281 }
282 meta = hts_idx_get_meta(tbx->idx, &l_meta);
283 if ( !meta )
284 {
285 free(tbx);
286 return NULL;
287 }
288 memcpy(x, meta, 28);
289 memcpy(&tbx->conf, x, 24);
290 p = nm = (char*)meta + 28;
291 l_nm = x[6];
292 for (; p - nm < l_nm; p += strlen(p) + 1) get_tid(tbx, p, 1);
293 return tbx;
294 }
295
296 const char **tbx_seqnames(tbx_t *tbx, int *n)
297 {
298 khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
299 if (d == NULL)
300 {
301 *n = 0;
302 return NULL;
303 }
304 int tid, m = kh_size(d);
305 const char **names = (const char**) calloc(m,sizeof(const char*));
306 khint_t k;
307 for (k=kh_begin(d); k<kh_end(d); k++)
308 {
309 if ( !kh_exist(d,k) ) continue;
310 tid = kh_val(d,k);
311 assert( tid<m );
312 names[tid] = kh_key(d,k);
313 }
314 // sanity check: there should be no gaps
315 for (tid=0; tid<m; tid++)
316 assert(names[tid]);
317 *n = m;
318 return names;
319 }
320