comparison ezBAMQC/src/htslib/faidx.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /* faidx.c -- FASTA random access.
2
3 Copyright (C) 2008, 2009, 2013-2015 Genome Research Ltd.
4 Portions copyright (C) 2011 Broad Institute.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <ctype.h>
27 #include <string.h>
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <stdint.h>
31
32 #include "htslib/bgzf.h"
33 #include "htslib/faidx.h"
34 #include "htslib/hfile.h"
35 #include "htslib/khash.h"
36
37 typedef struct {
38 int32_t line_len, line_blen;
39 int64_t len;
40 uint64_t offset;
41 } faidx1_t;
42 KHASH_MAP_INIT_STR(s, faidx1_t)
43
44 struct __faidx_t {
45 BGZF *bgzf;
46 int n, m;
47 char **name;
48 khash_t(s) *hash;
49 };
50
51 #ifndef kroundup32
52 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
53 #endif
54
55 static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset)
56 {
57 khint_t k;
58 int ret;
59 faidx1_t t;
60 if (idx->n == idx->m) {
61 idx->m = idx->m? idx->m<<1 : 16;
62 idx->name = (char**)realloc(idx->name, sizeof(char*) * idx->m);
63 }
64 idx->name[idx->n] = strdup(name);
65 k = kh_put(s, idx->hash, idx->name[idx->n], &ret);
66 t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset;
67 kh_value(idx->hash, k) = t;
68 ++idx->n;
69 }
70
71 faidx_t *fai_build_core(BGZF *bgzf)
72 {
73 char *name;
74 int c;
75 int l_name, m_name;
76 int line_len, line_blen, state;
77 int l1, l2;
78 faidx_t *idx;
79 uint64_t offset;
80 int64_t len;
81
82 idx = (faidx_t*)calloc(1, sizeof(faidx_t));
83 idx->hash = kh_init(s);
84 name = 0; l_name = m_name = 0;
85 len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
86 while ( (c=bgzf_getc(bgzf))>=0 ) {
87 if (c == '\n') { // an empty line
88 if (state == 1) {
89 offset = bgzf_utell(bgzf);
90 continue;
91 } else if ((state == 0 && len < 0) || state == 2) continue;
92 }
93 if (c == '>') { // fasta header
94 if (len >= 0)
95 fai_insert_index(idx, name, len, line_len, line_blen, offset);
96 l_name = 0;
97 while ( (c=bgzf_getc(bgzf))>=0 && !isspace(c)) {
98 if (m_name < l_name + 2) {
99 m_name = l_name + 2;
100 kroundup32(m_name);
101 name = (char*)realloc(name, m_name);
102 }
103 name[l_name++] = c;
104 }
105 name[l_name] = '\0';
106 if ( c<0 ) {
107 fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
108 free(name); fai_destroy(idx);
109 return 0;
110 }
111 if (c != '\n') while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
112 state = 1; len = 0;
113 offset = bgzf_utell(bgzf);
114 } else {
115 if (state == 3) {
116 fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
117 free(name); fai_destroy(idx);
118 return 0;
119 }
120 if (state == 2) state = 3;
121 l1 = l2 = 0;
122 do {
123 ++l1;
124 if (isgraph(c)) ++l2;
125 } while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
126 if (state == 3 && l2) {
127 fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
128 free(name); fai_destroy(idx);
129 return 0;
130 }
131 ++l1; len += l2;
132 if (state == 1) line_len = l1, line_blen = l2, state = 0;
133 else if (state == 0) {
134 if (l1 != line_len || l2 != line_blen) state = 2;
135 }
136 }
137 }
138 if ( name )
139 fai_insert_index(idx, name, len, line_len, line_blen, offset);
140 else
141 {
142 free(idx);
143 return NULL;
144 }
145 free(name);
146 return idx;
147 }
148
149 void fai_save(const faidx_t *fai, FILE *fp)
150 {
151 khint_t k;
152 int i;
153 for (i = 0; i < fai->n; ++i) {
154 faidx1_t x;
155 k = kh_get(s, fai->hash, fai->name[i]);
156 x = kh_value(fai->hash, k);
157 #ifdef _WIN32
158 fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len);
159 #else
160 fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len);
161 #endif
162 }
163 }
164
165 faidx_t *fai_read(FILE *fp)
166 {
167 faidx_t *fai;
168 char *buf, *p;
169 int len, line_len, line_blen;
170 #ifdef _WIN32
171 long offset;
172 #else
173 long long offset;
174 #endif
175 fai = (faidx_t*)calloc(1, sizeof(faidx_t));
176 fai->hash = kh_init(s);
177 buf = (char*)calloc(0x10000, 1);
178 while (!feof(fp) && fgets(buf, 0x10000, fp)) {
179 for (p = buf; *p && isgraph(*p); ++p);
180 *p = 0; ++p;
181 #ifdef _WIN32
182 sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len);
183 #else
184 sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len);
185 #endif
186 fai_insert_index(fai, buf, len, line_len, line_blen, offset);
187 }
188 free(buf);
189 return fai;
190 }
191
192 void fai_destroy(faidx_t *fai)
193 {
194 int i;
195 for (i = 0; i < fai->n; ++i) free(fai->name[i]);
196 free(fai->name);
197 kh_destroy(s, fai->hash);
198 if (fai->bgzf) bgzf_close(fai->bgzf);
199 free(fai);
200 }
201
202 int fai_build(const char *fn)
203 {
204 char *str;
205 BGZF *bgzf;
206 FILE *fp;
207 faidx_t *fai;
208 str = (char*)calloc(strlen(fn) + 5, 1);
209 sprintf(str, "%s.fai", fn);
210 bgzf = bgzf_open(fn, "r");
211 if ( !bgzf ) {
212 fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn);
213 free(str);
214 return -1;
215 }
216 if ( bgzf->is_compressed ) bgzf_index_build_init(bgzf);
217 fai = fai_build_core(bgzf);
218 if ( !fai )
219 {
220 if ( bgzf->is_compressed && bgzf->is_gzip ) fprintf(stderr,"Cannot index files compressed with gzip, please use bgzip\n");
221 free(str);
222 return -1;
223 }
224 if ( bgzf->is_compressed ) bgzf_index_dump(bgzf, fn, ".gzi");
225 bgzf_close(bgzf);
226 fp = fopen(str, "wb");
227 if ( !fp ) {
228 fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str);
229 fai_destroy(fai); free(str);
230 return -1;
231 }
232 fai_save(fai, fp);
233 fclose(fp);
234 free(str);
235 fai_destroy(fai);
236 return 0;
237 }
238
239 static FILE *download_and_open(const char *fn)
240 {
241 const int buf_size = 1 * 1024 * 1024;
242 uint8_t *buf;
243 FILE *fp;
244 hFILE *fp_remote;
245 const char *url = fn;
246 const char *p;
247 int l = strlen(fn);
248 for (p = fn + l - 1; p >= fn; --p)
249 if (*p == '/') break;
250 fn = p + 1;
251
252 // First try to open a local copy
253 fp = fopen(fn, "r");
254 if (fp)
255 return fp;
256
257 // If failed, download from remote and open
258 fp_remote = hopen(url, "rb");
259 if (fp_remote == 0) {
260 fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url);
261 return NULL;
262 }
263 if ((fp = fopen(fn, "wb")) == 0) {
264 fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn);
265 hclose_abruptly(fp_remote);
266 return NULL;
267 }
268 buf = (uint8_t*)calloc(buf_size, 1);
269 while ((l = hread(fp_remote, buf, buf_size)) > 0)
270 fwrite(buf, 1, l, fp);
271 free(buf);
272 fclose(fp);
273 if (hclose(fp_remote) != 0)
274 fprintf(stderr, "[download_from_remote] fail to close remote file %s\n", url);
275
276 return fopen(fn, "r");
277 }
278
279 faidx_t *fai_load(const char *fn)
280 {
281 char *str;
282 FILE *fp;
283 faidx_t *fai;
284 str = (char*)calloc(strlen(fn) + 5, 1);
285 sprintf(str, "%s.fai", fn);
286
287 if (hisremote(str))
288 {
289 fp = download_and_open(str);
290 if ( !fp )
291 {
292 fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str);
293 free(str);
294 return 0;
295 }
296 }
297 else
298 fp = fopen(str, "rb");
299
300 if (fp == 0) {
301 fprintf(stderr, "[fai_load] build FASTA index.\n");
302 fai_build(fn);
303 fp = fopen(str, "rb");
304 if (fp == 0) {
305 fprintf(stderr, "[fai_load] fail to open FASTA index.\n");
306 free(str);
307 return 0;
308 }
309 }
310
311 fai = fai_read(fp);
312 fclose(fp);
313
314 fai->bgzf = bgzf_open(fn, "rb");
315 free(str);
316 if (fai->bgzf == 0) {
317 fprintf(stderr, "[fai_load] fail to open FASTA file.\n");
318 return 0;
319 }
320 if ( fai->bgzf->is_compressed==1 )
321 {
322 if ( bgzf_index_load(fai->bgzf, fn, ".gzi") < 0 )
323 {
324 fprintf(stderr, "[fai_load] failed to load .gzi index: %s[.gzi]\n", fn);
325 fai_destroy(fai);
326 return NULL;
327 }
328 }
329 return fai;
330 }
331
332 char *fai_fetch(const faidx_t *fai, const char *str, int *len)
333 {
334 char *s;
335 int c, i, l, k, name_end;
336 khiter_t iter;
337 faidx1_t val;
338 khash_t(s) *h;
339 int beg, end;
340
341 beg = end = -1;
342 h = fai->hash;
343 name_end = l = strlen(str);
344 s = (char*)malloc(l+1);
345 // remove space
346 for (i = k = 0; i < l; ++i)
347 if (!isspace(str[i])) s[k++] = str[i];
348 s[k] = 0; l = k;
349 // determine the sequence name
350 for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
351 if (i >= 0) name_end = i;
352 if (name_end < l) { // check if this is really the end
353 int n_hyphen = 0;
354 for (i = name_end + 1; i < l; ++i) {
355 if (s[i] == '-') ++n_hyphen;
356 else if (!isdigit(s[i]) && s[i] != ',') break;
357 }
358 if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
359 s[name_end] = 0;
360 iter = kh_get(s, h, s);
361 if (iter == kh_end(h)) { // cannot find the sequence name
362 iter = kh_get(s, h, str); // try str as the name
363 if (iter == kh_end(h)) {
364 *len = 0;
365 free(s); return 0;
366 } else s[name_end] = ':', name_end = l;
367 }
368 } else iter = kh_get(s, h, str);
369 if(iter == kh_end(h)) {
370 fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str);
371 free(s);
372 *len = -2;
373 return 0;
374 };
375 val = kh_value(h, iter);
376 // parse the interval
377 if (name_end < l) {
378 for (i = k = name_end + 1; i < l; ++i)
379 if (s[i] != ',') s[k++] = s[i];
380 s[k] = 0;
381 beg = atoi(s + name_end + 1);
382 for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
383 end = i < k? atoi(s + i + 1) : val.len;
384 if (beg > 0) --beg;
385 } else beg = 0, end = val.len;
386 if (beg >= val.len) beg = val.len;
387 if (end >= val.len) end = val.len;
388 if (beg > end) beg = end;
389 free(s);
390
391 // now retrieve the sequence
392 int ret = bgzf_useek(fai->bgzf, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
393 if ( ret<0 )
394 {
395 *len = -1;
396 fprintf(stderr, "[fai_fetch] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
397 return NULL;
398 }
399 l = 0;
400 s = (char*)malloc(end - beg + 2);
401 while ( (c=bgzf_getc(fai->bgzf))>=0 && l < end - beg )
402 if (isgraph(c)) s[l++] = c;
403 s[l] = '\0';
404 *len = l;
405 return s;
406 }
407
408 int faidx_fetch_nseq(const faidx_t *fai)
409 {
410 return fai->n;
411 }
412
413 int faidx_nseq(const faidx_t *fai)
414 {
415 return fai->n;
416 }
417
418 const char *faidx_iseq(const faidx_t *fai, int i)
419 {
420 return fai->name[i];
421 }
422
423 int faidx_seq_len(const faidx_t *fai, const char *seq)
424 {
425 khint_t k = kh_get(s, fai->hash, seq);
426 if ( k == kh_end(fai->hash) ) return -1;
427 return kh_val(fai->hash, k).len;
428 }
429
430 char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
431 {
432 int l, c;
433 khiter_t iter;
434 faidx1_t val;
435 char *seq=NULL;
436
437 // Adjust position
438 iter = kh_get(s, fai->hash, c_name);
439 if (iter == kh_end(fai->hash))
440 {
441 *len = -2;
442 fprintf(stderr, "[fai_fetch_seq] The sequence \"%s\" not found\n", c_name);
443 return NULL;
444 }
445 val = kh_value(fai->hash, iter);
446 if(p_end_i < p_beg_i) p_beg_i = p_end_i;
447 if(p_beg_i < 0) p_beg_i = 0;
448 else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
449 if(p_end_i < 0) p_end_i = 0;
450 else if(val.len <= p_end_i) p_end_i = val.len - 1;
451
452 // Now retrieve the sequence
453 int ret = bgzf_useek(fai->bgzf, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);
454 if ( ret<0 )
455 {
456 *len = -1;
457 fprintf(stderr, "[fai_fetch_seq] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
458 return NULL;
459 }
460 l = 0;
461 seq = (char*)malloc(p_end_i - p_beg_i + 2);
462 while ( (c=bgzf_getc(fai->bgzf))>=0 && l < p_end_i - p_beg_i + 1)
463 if (isgraph(c)) seq[l++] = c;
464 seq[l] = '\0';
465 *len = l;
466 return seq;
467 }
468
469 int faidx_has_seq(const faidx_t *fai, const char *seq)
470 {
471 khiter_t iter = kh_get(s, fai->hash, seq);
472 if (iter == kh_end(fai->hash)) return 0;
473 return 1;
474 }
475