0
|
1 /* faidx.c -- FASTA random access.
|
|
2
|
|
3 Copyright (C) 2008, 2009, 2013-2015 Genome Research Ltd.
|
|
4 Portions copyright (C) 2011 Broad Institute.
|
|
5
|
|
6 Author: Heng Li <lh3@sanger.ac.uk>
|
|
7
|
|
8 Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
9 of this software and associated documentation files (the "Software"), to deal
|
|
10 in the Software without restriction, including without limitation the rights
|
|
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
12 copies of the Software, and to permit persons to whom the Software is
|
|
13 furnished to do so, subject to the following conditions:
|
|
14
|
|
15 The above copyright notice and this permission notice shall be included in
|
|
16 all copies or substantial portions of the Software.
|
|
17
|
|
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
24 DEALINGS IN THE SOFTWARE. */
|
|
25
|
|
26 #include <ctype.h>
|
|
27 #include <string.h>
|
|
28 #include <stdlib.h>
|
|
29 #include <stdio.h>
|
|
30 #include <stdint.h>
|
|
31
|
|
32 #include "htslib/bgzf.h"
|
|
33 #include "htslib/faidx.h"
|
|
34 #include "htslib/hfile.h"
|
|
35 #include "htslib/khash.h"
|
|
36
|
|
37 typedef struct {
|
|
38 int32_t line_len, line_blen;
|
|
39 int64_t len;
|
|
40 uint64_t offset;
|
|
41 } faidx1_t;
|
|
42 KHASH_MAP_INIT_STR(s, faidx1_t)
|
|
43
|
|
44 struct __faidx_t {
|
|
45 BGZF *bgzf;
|
|
46 int n, m;
|
|
47 char **name;
|
|
48 khash_t(s) *hash;
|
|
49 };
|
|
50
|
|
51 #ifndef kroundup32
|
|
52 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
|
53 #endif
|
|
54
|
|
55 static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset)
|
|
56 {
|
|
57 khint_t k;
|
|
58 int ret;
|
|
59 faidx1_t t;
|
|
60 if (idx->n == idx->m) {
|
|
61 idx->m = idx->m? idx->m<<1 : 16;
|
|
62 idx->name = (char**)realloc(idx->name, sizeof(char*) * idx->m);
|
|
63 }
|
|
64 idx->name[idx->n] = strdup(name);
|
|
65 k = kh_put(s, idx->hash, idx->name[idx->n], &ret);
|
|
66 t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset;
|
|
67 kh_value(idx->hash, k) = t;
|
|
68 ++idx->n;
|
|
69 }
|
|
70
|
|
71 faidx_t *fai_build_core(BGZF *bgzf)
|
|
72 {
|
|
73 char *name;
|
|
74 int c;
|
|
75 int l_name, m_name;
|
|
76 int line_len, line_blen, state;
|
|
77 int l1, l2;
|
|
78 faidx_t *idx;
|
|
79 uint64_t offset;
|
|
80 int64_t len;
|
|
81
|
|
82 idx = (faidx_t*)calloc(1, sizeof(faidx_t));
|
|
83 idx->hash = kh_init(s);
|
|
84 name = 0; l_name = m_name = 0;
|
|
85 len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
|
|
86 while ( (c=bgzf_getc(bgzf))>=0 ) {
|
|
87 if (c == '\n') { // an empty line
|
|
88 if (state == 1) {
|
|
89 offset = bgzf_utell(bgzf);
|
|
90 continue;
|
|
91 } else if ((state == 0 && len < 0) || state == 2) continue;
|
|
92 }
|
|
93 if (c == '>') { // fasta header
|
|
94 if (len >= 0)
|
|
95 fai_insert_index(idx, name, len, line_len, line_blen, offset);
|
|
96 l_name = 0;
|
|
97 while ( (c=bgzf_getc(bgzf))>=0 && !isspace(c)) {
|
|
98 if (m_name < l_name + 2) {
|
|
99 m_name = l_name + 2;
|
|
100 kroundup32(m_name);
|
|
101 name = (char*)realloc(name, m_name);
|
|
102 }
|
|
103 name[l_name++] = c;
|
|
104 }
|
|
105 name[l_name] = '\0';
|
|
106 if ( c<0 ) {
|
|
107 fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
|
|
108 free(name); fai_destroy(idx);
|
|
109 return 0;
|
|
110 }
|
|
111 if (c != '\n') while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
|
|
112 state = 1; len = 0;
|
|
113 offset = bgzf_utell(bgzf);
|
|
114 } else {
|
|
115 if (state == 3) {
|
|
116 fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
|
|
117 free(name); fai_destroy(idx);
|
|
118 return 0;
|
|
119 }
|
|
120 if (state == 2) state = 3;
|
|
121 l1 = l2 = 0;
|
|
122 do {
|
|
123 ++l1;
|
|
124 if (isgraph(c)) ++l2;
|
|
125 } while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
|
|
126 if (state == 3 && l2) {
|
|
127 fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
|
|
128 free(name); fai_destroy(idx);
|
|
129 return 0;
|
|
130 }
|
|
131 ++l1; len += l2;
|
|
132 if (state == 1) line_len = l1, line_blen = l2, state = 0;
|
|
133 else if (state == 0) {
|
|
134 if (l1 != line_len || l2 != line_blen) state = 2;
|
|
135 }
|
|
136 }
|
|
137 }
|
|
138 if ( name )
|
|
139 fai_insert_index(idx, name, len, line_len, line_blen, offset);
|
|
140 else
|
|
141 {
|
|
142 free(idx);
|
|
143 return NULL;
|
|
144 }
|
|
145 free(name);
|
|
146 return idx;
|
|
147 }
|
|
148
|
|
149 void fai_save(const faidx_t *fai, FILE *fp)
|
|
150 {
|
|
151 khint_t k;
|
|
152 int i;
|
|
153 for (i = 0; i < fai->n; ++i) {
|
|
154 faidx1_t x;
|
|
155 k = kh_get(s, fai->hash, fai->name[i]);
|
|
156 x = kh_value(fai->hash, k);
|
|
157 #ifdef _WIN32
|
|
158 fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len);
|
|
159 #else
|
|
160 fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len);
|
|
161 #endif
|
|
162 }
|
|
163 }
|
|
164
|
|
165 faidx_t *fai_read(FILE *fp)
|
|
166 {
|
|
167 faidx_t *fai;
|
|
168 char *buf, *p;
|
|
169 int len, line_len, line_blen;
|
|
170 #ifdef _WIN32
|
|
171 long offset;
|
|
172 #else
|
|
173 long long offset;
|
|
174 #endif
|
|
175 fai = (faidx_t*)calloc(1, sizeof(faidx_t));
|
|
176 fai->hash = kh_init(s);
|
|
177 buf = (char*)calloc(0x10000, 1);
|
|
178 while (!feof(fp) && fgets(buf, 0x10000, fp)) {
|
|
179 for (p = buf; *p && isgraph(*p); ++p);
|
|
180 *p = 0; ++p;
|
|
181 #ifdef _WIN32
|
|
182 sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len);
|
|
183 #else
|
|
184 sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len);
|
|
185 #endif
|
|
186 fai_insert_index(fai, buf, len, line_len, line_blen, offset);
|
|
187 }
|
|
188 free(buf);
|
|
189 return fai;
|
|
190 }
|
|
191
|
|
192 void fai_destroy(faidx_t *fai)
|
|
193 {
|
|
194 int i;
|
|
195 for (i = 0; i < fai->n; ++i) free(fai->name[i]);
|
|
196 free(fai->name);
|
|
197 kh_destroy(s, fai->hash);
|
|
198 if (fai->bgzf) bgzf_close(fai->bgzf);
|
|
199 free(fai);
|
|
200 }
|
|
201
|
|
202 int fai_build(const char *fn)
|
|
203 {
|
|
204 char *str;
|
|
205 BGZF *bgzf;
|
|
206 FILE *fp;
|
|
207 faidx_t *fai;
|
|
208 str = (char*)calloc(strlen(fn) + 5, 1);
|
|
209 sprintf(str, "%s.fai", fn);
|
|
210 bgzf = bgzf_open(fn, "r");
|
|
211 if ( !bgzf ) {
|
|
212 fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn);
|
|
213 free(str);
|
|
214 return -1;
|
|
215 }
|
|
216 if ( bgzf->is_compressed ) bgzf_index_build_init(bgzf);
|
|
217 fai = fai_build_core(bgzf);
|
|
218 if ( !fai )
|
|
219 {
|
|
220 if ( bgzf->is_compressed && bgzf->is_gzip ) fprintf(stderr,"Cannot index files compressed with gzip, please use bgzip\n");
|
|
221 free(str);
|
|
222 return -1;
|
|
223 }
|
|
224 if ( bgzf->is_compressed ) bgzf_index_dump(bgzf, fn, ".gzi");
|
|
225 bgzf_close(bgzf);
|
|
226 fp = fopen(str, "wb");
|
|
227 if ( !fp ) {
|
|
228 fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str);
|
|
229 fai_destroy(fai); free(str);
|
|
230 return -1;
|
|
231 }
|
|
232 fai_save(fai, fp);
|
|
233 fclose(fp);
|
|
234 free(str);
|
|
235 fai_destroy(fai);
|
|
236 return 0;
|
|
237 }
|
|
238
|
|
239 static FILE *download_and_open(const char *fn)
|
|
240 {
|
|
241 const int buf_size = 1 * 1024 * 1024;
|
|
242 uint8_t *buf;
|
|
243 FILE *fp;
|
|
244 hFILE *fp_remote;
|
|
245 const char *url = fn;
|
|
246 const char *p;
|
|
247 int l = strlen(fn);
|
|
248 for (p = fn + l - 1; p >= fn; --p)
|
|
249 if (*p == '/') break;
|
|
250 fn = p + 1;
|
|
251
|
|
252 // First try to open a local copy
|
|
253 fp = fopen(fn, "r");
|
|
254 if (fp)
|
|
255 return fp;
|
|
256
|
|
257 // If failed, download from remote and open
|
|
258 fp_remote = hopen(url, "rb");
|
|
259 if (fp_remote == 0) {
|
|
260 fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url);
|
|
261 return NULL;
|
|
262 }
|
|
263 if ((fp = fopen(fn, "wb")) == 0) {
|
|
264 fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn);
|
|
265 hclose_abruptly(fp_remote);
|
|
266 return NULL;
|
|
267 }
|
|
268 buf = (uint8_t*)calloc(buf_size, 1);
|
|
269 while ((l = hread(fp_remote, buf, buf_size)) > 0)
|
|
270 fwrite(buf, 1, l, fp);
|
|
271 free(buf);
|
|
272 fclose(fp);
|
|
273 if (hclose(fp_remote) != 0)
|
|
274 fprintf(stderr, "[download_from_remote] fail to close remote file %s\n", url);
|
|
275
|
|
276 return fopen(fn, "r");
|
|
277 }
|
|
278
|
|
279 faidx_t *fai_load(const char *fn)
|
|
280 {
|
|
281 char *str;
|
|
282 FILE *fp;
|
|
283 faidx_t *fai;
|
|
284 str = (char*)calloc(strlen(fn) + 5, 1);
|
|
285 sprintf(str, "%s.fai", fn);
|
|
286
|
|
287 if (hisremote(str))
|
|
288 {
|
|
289 fp = download_and_open(str);
|
|
290 if ( !fp )
|
|
291 {
|
|
292 fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str);
|
|
293 free(str);
|
|
294 return 0;
|
|
295 }
|
|
296 }
|
|
297 else
|
|
298 fp = fopen(str, "rb");
|
|
299
|
|
300 if (fp == 0) {
|
|
301 fprintf(stderr, "[fai_load] build FASTA index.\n");
|
|
302 fai_build(fn);
|
|
303 fp = fopen(str, "rb");
|
|
304 if (fp == 0) {
|
|
305 fprintf(stderr, "[fai_load] fail to open FASTA index.\n");
|
|
306 free(str);
|
|
307 return 0;
|
|
308 }
|
|
309 }
|
|
310
|
|
311 fai = fai_read(fp);
|
|
312 fclose(fp);
|
|
313
|
|
314 fai->bgzf = bgzf_open(fn, "rb");
|
|
315 free(str);
|
|
316 if (fai->bgzf == 0) {
|
|
317 fprintf(stderr, "[fai_load] fail to open FASTA file.\n");
|
|
318 return 0;
|
|
319 }
|
|
320 if ( fai->bgzf->is_compressed==1 )
|
|
321 {
|
|
322 if ( bgzf_index_load(fai->bgzf, fn, ".gzi") < 0 )
|
|
323 {
|
|
324 fprintf(stderr, "[fai_load] failed to load .gzi index: %s[.gzi]\n", fn);
|
|
325 fai_destroy(fai);
|
|
326 return NULL;
|
|
327 }
|
|
328 }
|
|
329 return fai;
|
|
330 }
|
|
331
|
|
332 char *fai_fetch(const faidx_t *fai, const char *str, int *len)
|
|
333 {
|
|
334 char *s;
|
|
335 int c, i, l, k, name_end;
|
|
336 khiter_t iter;
|
|
337 faidx1_t val;
|
|
338 khash_t(s) *h;
|
|
339 int beg, end;
|
|
340
|
|
341 beg = end = -1;
|
|
342 h = fai->hash;
|
|
343 name_end = l = strlen(str);
|
|
344 s = (char*)malloc(l+1);
|
|
345 // remove space
|
|
346 for (i = k = 0; i < l; ++i)
|
|
347 if (!isspace(str[i])) s[k++] = str[i];
|
|
348 s[k] = 0; l = k;
|
|
349 // determine the sequence name
|
|
350 for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
|
|
351 if (i >= 0) name_end = i;
|
|
352 if (name_end < l) { // check if this is really the end
|
|
353 int n_hyphen = 0;
|
|
354 for (i = name_end + 1; i < l; ++i) {
|
|
355 if (s[i] == '-') ++n_hyphen;
|
|
356 else if (!isdigit(s[i]) && s[i] != ',') break;
|
|
357 }
|
|
358 if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
|
|
359 s[name_end] = 0;
|
|
360 iter = kh_get(s, h, s);
|
|
361 if (iter == kh_end(h)) { // cannot find the sequence name
|
|
362 iter = kh_get(s, h, str); // try str as the name
|
|
363 if (iter == kh_end(h)) {
|
|
364 *len = 0;
|
|
365 free(s); return 0;
|
|
366 } else s[name_end] = ':', name_end = l;
|
|
367 }
|
|
368 } else iter = kh_get(s, h, str);
|
|
369 if(iter == kh_end(h)) {
|
|
370 fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str);
|
|
371 free(s);
|
|
372 *len = -2;
|
|
373 return 0;
|
|
374 };
|
|
375 val = kh_value(h, iter);
|
|
376 // parse the interval
|
|
377 if (name_end < l) {
|
|
378 for (i = k = name_end + 1; i < l; ++i)
|
|
379 if (s[i] != ',') s[k++] = s[i];
|
|
380 s[k] = 0;
|
|
381 beg = atoi(s + name_end + 1);
|
|
382 for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
|
|
383 end = i < k? atoi(s + i + 1) : val.len;
|
|
384 if (beg > 0) --beg;
|
|
385 } else beg = 0, end = val.len;
|
|
386 if (beg >= val.len) beg = val.len;
|
|
387 if (end >= val.len) end = val.len;
|
|
388 if (beg > end) beg = end;
|
|
389 free(s);
|
|
390
|
|
391 // now retrieve the sequence
|
|
392 int ret = bgzf_useek(fai->bgzf, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
|
|
393 if ( ret<0 )
|
|
394 {
|
|
395 *len = -1;
|
|
396 fprintf(stderr, "[fai_fetch] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
|
|
397 return NULL;
|
|
398 }
|
|
399 l = 0;
|
|
400 s = (char*)malloc(end - beg + 2);
|
|
401 while ( (c=bgzf_getc(fai->bgzf))>=0 && l < end - beg )
|
|
402 if (isgraph(c)) s[l++] = c;
|
|
403 s[l] = '\0';
|
|
404 *len = l;
|
|
405 return s;
|
|
406 }
|
|
407
|
|
408 int faidx_fetch_nseq(const faidx_t *fai)
|
|
409 {
|
|
410 return fai->n;
|
|
411 }
|
|
412
|
|
413 int faidx_nseq(const faidx_t *fai)
|
|
414 {
|
|
415 return fai->n;
|
|
416 }
|
|
417
|
|
418 const char *faidx_iseq(const faidx_t *fai, int i)
|
|
419 {
|
|
420 return fai->name[i];
|
|
421 }
|
|
422
|
|
423 int faidx_seq_len(const faidx_t *fai, const char *seq)
|
|
424 {
|
|
425 khint_t k = kh_get(s, fai->hash, seq);
|
|
426 if ( k == kh_end(fai->hash) ) return -1;
|
|
427 return kh_val(fai->hash, k).len;
|
|
428 }
|
|
429
|
|
430 char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
|
|
431 {
|
|
432 int l, c;
|
|
433 khiter_t iter;
|
|
434 faidx1_t val;
|
|
435 char *seq=NULL;
|
|
436
|
|
437 // Adjust position
|
|
438 iter = kh_get(s, fai->hash, c_name);
|
|
439 if (iter == kh_end(fai->hash))
|
|
440 {
|
|
441 *len = -2;
|
|
442 fprintf(stderr, "[fai_fetch_seq] The sequence \"%s\" not found\n", c_name);
|
|
443 return NULL;
|
|
444 }
|
|
445 val = kh_value(fai->hash, iter);
|
|
446 if(p_end_i < p_beg_i) p_beg_i = p_end_i;
|
|
447 if(p_beg_i < 0) p_beg_i = 0;
|
|
448 else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
|
|
449 if(p_end_i < 0) p_end_i = 0;
|
|
450 else if(val.len <= p_end_i) p_end_i = val.len - 1;
|
|
451
|
|
452 // Now retrieve the sequence
|
|
453 int ret = bgzf_useek(fai->bgzf, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);
|
|
454 if ( ret<0 )
|
|
455 {
|
|
456 *len = -1;
|
|
457 fprintf(stderr, "[fai_fetch_seq] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
|
|
458 return NULL;
|
|
459 }
|
|
460 l = 0;
|
|
461 seq = (char*)malloc(p_end_i - p_beg_i + 2);
|
|
462 while ( (c=bgzf_getc(fai->bgzf))>=0 && l < p_end_i - p_beg_i + 1)
|
|
463 if (isgraph(c)) seq[l++] = c;
|
|
464 seq[l] = '\0';
|
|
465 *len = l;
|
|
466 return seq;
|
|
467 }
|
|
468
|
|
469 int faidx_has_seq(const faidx_t *fai, const char *seq)
|
|
470 {
|
|
471 khiter_t iter = kh_get(s, fai->hash, seq);
|
|
472 if (iter == kh_end(fai->hash)) return 0;
|
|
473 return 1;
|
|
474 }
|
|
475
|