Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/tbx.c @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfa3745e5fd8 |
---|---|
1 /* tbx.c -- tabix API functions. | |
2 | |
3 Copyright (C) 2009, 2010, 2012-2014 Genome Research Ltd. | |
4 Copyright (C) 2010-2012 Broad Institute. | |
5 | |
6 Author: Heng Li <lh3@sanger.ac.uk> | |
7 | |
8 Permission is hereby granted, free of charge, to any person obtaining a copy | |
9 of this software and associated documentation files (the "Software"), to deal | |
10 in the Software without restriction, including without limitation the rights | |
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
12 copies of the Software, and to permit persons to whom the Software is | |
13 furnished to do so, subject to the following conditions: | |
14 | |
15 The above copyright notice and this permission notice shall be included in | |
16 all copies or substantial portions of the Software. | |
17 | |
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
24 DEALINGS IN THE SOFTWARE. */ | |
25 | |
26 #include <stdlib.h> | |
27 #include <string.h> | |
28 #include <ctype.h> | |
29 #include <stdio.h> | |
30 #include <assert.h> | |
31 #include "htslib/tbx.h" | |
32 #include "htslib/bgzf.h" | |
33 | |
34 #include "htslib/khash.h" | |
35 KHASH_DECLARE(s2i, kh_cstr_t, int64_t) | |
36 | |
37 tbx_conf_t tbx_conf_gff = { 0, 1, 4, 5, '#', 0 }; | |
38 tbx_conf_t tbx_conf_bed = { TBX_UCSC, 1, 2, 3, '#', 0 }; | |
39 tbx_conf_t tbx_conf_psltbl = { TBX_UCSC, 15, 17, 18, '#', 0 }; | |
40 tbx_conf_t tbx_conf_sam = { TBX_SAM, 3, 4, 0, '@', 0 }; | |
41 tbx_conf_t tbx_conf_vcf = { TBX_VCF, 1, 2, 0, '#', 0 }; | |
42 | |
43 typedef struct { | |
44 int64_t beg, end; | |
45 char *ss, *se; | |
46 int tid; | |
47 } tbx_intv_t; | |
48 | |
49 static inline int get_tid(tbx_t *tbx, const char *ss, int is_add) | |
50 { | |
51 khint_t k; | |
52 khash_t(s2i) *d; | |
53 if (tbx->dict == 0) tbx->dict = kh_init(s2i); | |
54 d = (khash_t(s2i)*)tbx->dict; | |
55 if (is_add) { | |
56 int absent; | |
57 k = kh_put(s2i, d, ss, &absent); | |
58 if (absent) { | |
59 kh_key(d, k) = strdup(ss); | |
60 kh_val(d, k) = kh_size(d) - 1; | |
61 } | |
62 } else k = kh_get(s2i, d, ss); | |
63 return k == kh_end(d)? -1 : kh_val(d, k); | |
64 } | |
65 | |
66 int tbx_name2id(tbx_t *tbx, const char *ss) | |
67 { | |
68 return get_tid(tbx, ss, 0); | |
69 } | |
70 | |
71 int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv) | |
72 { | |
73 int i, b = 0, id = 1, ncols = 0; | |
74 char *s; | |
75 intv->ss = intv->se = 0; intv->beg = intv->end = -1; | |
76 for (i = 0; i <= len; ++i) { | |
77 if (line[i] == '\t' || line[i] == 0) { | |
78 ++ncols; | |
79 if (id == conf->sc) { | |
80 intv->ss = line + b; intv->se = line + i; | |
81 } else if (id == conf->bc) { | |
82 // here ->beg is 0-based. | |
83 intv->beg = intv->end = strtol(line + b, &s, 0); | |
84 if ( s==line+b ) return -1; // expected int | |
85 if (!(conf->preset&TBX_UCSC)) --intv->beg; | |
86 else ++intv->end; | |
87 if (intv->beg < 0) intv->beg = 0; | |
88 if (intv->end < 1) intv->end = 1; | |
89 } else { | |
90 if ((conf->preset&0xffff) == TBX_GENERIC) { | |
91 if (id == conf->ec) | |
92 { | |
93 intv->end = strtol(line + b, &s, 0); | |
94 if ( s==line+b ) return -1; // expected int | |
95 } | |
96 } else if ((conf->preset&0xffff) == TBX_SAM) { | |
97 if (id == 6) { // CIGAR | |
98 int l = 0, op; | |
99 char *t; | |
100 for (s = line + b; s < line + i;) { | |
101 long x = strtol(s, &t, 10); | |
102 op = toupper(*t); | |
103 if (op == 'M' || op == 'D' || op == 'N') l += x; | |
104 s = t + 1; | |
105 } | |
106 if (l == 0) l = 1; | |
107 intv->end = intv->beg + l; | |
108 } | |
109 } else if ((conf->preset&0xffff) == TBX_VCF) { | |
110 if (id == 4) { | |
111 if (b < i) intv->end = intv->beg + (i - b); | |
112 } else if (id == 8) { // look for "END=" | |
113 int c = line[i]; | |
114 line[i] = 0; | |
115 s = strstr(line + b, "END="); | |
116 if (s == line + b) s += 4; | |
117 else if (s) { | |
118 s = strstr(line + b, ";END="); | |
119 if (s) s += 5; | |
120 } | |
121 if (s) intv->end = strtol(s, &s, 0); | |
122 line[i] = c; | |
123 } | |
124 } | |
125 } | |
126 b = i + 1; | |
127 ++id; | |
128 } | |
129 } | |
130 if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1; | |
131 return 0; | |
132 } | |
133 | |
134 static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_add) | |
135 { | |
136 if (tbx_parse1(&tbx->conf, str->l, str->s, intv) == 0) { | |
137 int c = *intv->se; | |
138 *intv->se = '\0'; intv->tid = get_tid(tbx, intv->ss, is_add); *intv->se = c; | |
139 return (intv->tid >= 0 && intv->beg >= 0 && intv->end >= 0)? 0 : -1; | |
140 } else { | |
141 char *type = NULL; | |
142 switch (tbx->conf.preset&0xffff) | |
143 { | |
144 case TBX_SAM: type = "TBX_SAM"; break; | |
145 case TBX_VCF: type = "TBX_VCF"; break; | |
146 case TBX_UCSC: type = "TBX_UCSC"; break; | |
147 default: type = "TBX_GENERIC"; break; | |
148 } | |
149 fprintf(stderr, "[E::%s] failed to parse %s, was wrong -p [type] used?\nThe offending line was: \"%s\"\n", __func__, type, str->s); | |
150 return -1; | |
151 } | |
152 } | |
153 | |
154 int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end) | |
155 { | |
156 tbx_t *tbx = (tbx_t *) tbxv; | |
157 kstring_t *s = (kstring_t *) sv; | |
158 int ret; | |
159 if ((ret = bgzf_getline(fp, '\n', s)) >= 0) { | |
160 tbx_intv_t intv; | |
161 get_intv(tbx, s, &intv, 0); | |
162 *tid = intv.tid; *beg = intv.beg; *end = intv.end; | |
163 } | |
164 return ret; | |
165 } | |
166 | |
167 void tbx_set_meta(tbx_t *tbx) | |
168 { | |
169 int i, l = 0, l_nm; | |
170 uint32_t x[7]; | |
171 char **name; | |
172 uint8_t *meta; | |
173 khint_t k; | |
174 khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict; | |
175 | |
176 memcpy(x, &tbx->conf, 24); | |
177 name = (char**)malloc(sizeof(char*) * kh_size(d)); | |
178 for (k = kh_begin(d), l = 0; k != kh_end(d); ++k) { | |
179 if (!kh_exist(d, k)) continue; | |
180 name[kh_val(d, k)] = (char*)kh_key(d, k); | |
181 l += strlen(kh_key(d, k)) + 1; // +1 to include '\0' | |
182 } | |
183 l_nm = x[6] = l; | |
184 meta = (uint8_t*)malloc(l_nm + 28); | |
185 if (ed_is_big()) | |
186 for (i = 0; i < 7; ++i) | |
187 x[i] = ed_swap_4(x[i]); | |
188 memcpy(meta, x, 28); | |
189 for (l = 28, i = 0; i < (int)kh_size(d); ++i) { | |
190 int x = strlen(name[i]) + 1; | |
191 memcpy(meta + l, name[i], x); | |
192 l += x; | |
193 } | |
194 free(name); | |
195 hts_idx_set_meta(tbx->idx, l, meta, 0); | |
196 } | |
197 | |
198 tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf) | |
199 { | |
200 tbx_t *tbx; | |
201 kstring_t str; | |
202 int ret, first = 0, n_lvls, fmt; | |
203 int64_t lineno = 0; | |
204 uint64_t last_off = 0; | |
205 tbx_intv_t intv; | |
206 | |
207 str.s = 0; str.l = str.m = 0; | |
208 tbx = (tbx_t*)calloc(1, sizeof(tbx_t)); | |
209 tbx->conf = *conf; | |
210 if (min_shift > 0) n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3, fmt = HTS_FMT_CSI; | |
211 else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_TBI; | |
212 while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) { | |
213 ++lineno; | |
214 if (lineno <= tbx->conf.line_skip || str.s[0] == tbx->conf.meta_char) { | |
215 last_off = bgzf_tell(fp); | |
216 continue; | |
217 } | |
218 if (first == 0) { | |
219 tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); | |
220 first = 1; | |
221 } | |
222 get_intv(tbx, &str, &intv, 1); | |
223 ret = hts_idx_push(tbx->idx, intv.tid, intv.beg, intv.end, bgzf_tell(fp), 1); | |
224 if (ret < 0) | |
225 { | |
226 free(str.s); | |
227 tbx_destroy(tbx); | |
228 return NULL; | |
229 } | |
230 } | |
231 if ( !tbx->idx ) tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); // empty file | |
232 if ( !tbx->dict ) tbx->dict = kh_init(s2i); | |
233 hts_idx_finish(tbx->idx, bgzf_tell(fp)); | |
234 tbx_set_meta(tbx); | |
235 free(str.s); | |
236 return tbx; | |
237 } | |
238 | |
239 void tbx_destroy(tbx_t *tbx) | |
240 { | |
241 khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict; | |
242 if (d != NULL) | |
243 { | |
244 khint_t k; | |
245 for (k = kh_begin(d); k != kh_end(d); ++k) | |
246 if (kh_exist(d, k)) free((char*)kh_key(d, k)); | |
247 } | |
248 hts_idx_destroy(tbx->idx); | |
249 kh_destroy(s2i, d); | |
250 free(tbx); | |
251 } | |
252 | |
253 int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf) | |
254 { | |
255 tbx_t *tbx; | |
256 BGZF *fp; | |
257 if ( bgzf_is_bgzf(fn)!=1 ) { fprintf(stderr,"Not a BGZF file: %s\n", fn); return -1; } | |
258 if ((fp = bgzf_open(fn, "r")) == 0) return -1; | |
259 if ( !fp->is_compressed ) { bgzf_close(fp); return -1; } | |
260 tbx = tbx_index(fp, min_shift, conf); | |
261 bgzf_close(fp); | |
262 if ( !tbx ) return -1; | |
263 hts_idx_save(tbx->idx, fn, min_shift > 0? HTS_FMT_CSI : HTS_FMT_TBI); | |
264 tbx_destroy(tbx); | |
265 return 0; | |
266 } | |
267 | |
268 tbx_t *tbx_index_load(const char *fn) | |
269 { | |
270 tbx_t *tbx; | |
271 uint8_t *meta; | |
272 char *nm, *p; | |
273 uint32_t x[7]; | |
274 int l_meta, l_nm; | |
275 tbx = (tbx_t*)calloc(1, sizeof(tbx_t)); | |
276 tbx->idx = hts_idx_load(fn, HTS_FMT_TBI); | |
277 if ( !tbx->idx ) | |
278 { | |
279 free(tbx); | |
280 return NULL; | |
281 } | |
282 meta = hts_idx_get_meta(tbx->idx, &l_meta); | |
283 if ( !meta ) | |
284 { | |
285 free(tbx); | |
286 return NULL; | |
287 } | |
288 memcpy(x, meta, 28); | |
289 memcpy(&tbx->conf, x, 24); | |
290 p = nm = (char*)meta + 28; | |
291 l_nm = x[6]; | |
292 for (; p - nm < l_nm; p += strlen(p) + 1) get_tid(tbx, p, 1); | |
293 return tbx; | |
294 } | |
295 | |
296 const char **tbx_seqnames(tbx_t *tbx, int *n) | |
297 { | |
298 khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict; | |
299 if (d == NULL) | |
300 { | |
301 *n = 0; | |
302 return NULL; | |
303 } | |
304 int tid, m = kh_size(d); | |
305 const char **names = (const char**) calloc(m,sizeof(const char*)); | |
306 khint_t k; | |
307 for (k=kh_begin(d); k<kh_end(d); k++) | |
308 { | |
309 if ( !kh_exist(d,k) ) continue; | |
310 tid = kh_val(d,k); | |
311 assert( tid<m ); | |
312 names[tid] = kh_key(d,k); | |
313 } | |
314 // sanity check: there should be no gaps | |
315 for (tid=0; tid<m; tid++) | |
316 assert(names[tid]); | |
317 *n = m; | |
318 return names; | |
319 } | |
320 |