Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/faidx.c @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfa3745e5fd8 |
---|---|
1 /* faidx.c -- FASTA random access. | |
2 | |
3 Copyright (C) 2008, 2009, 2013-2015 Genome Research Ltd. | |
4 Portions copyright (C) 2011 Broad Institute. | |
5 | |
6 Author: Heng Li <lh3@sanger.ac.uk> | |
7 | |
8 Permission is hereby granted, free of charge, to any person obtaining a copy | |
9 of this software and associated documentation files (the "Software"), to deal | |
10 in the Software without restriction, including without limitation the rights | |
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
12 copies of the Software, and to permit persons to whom the Software is | |
13 furnished to do so, subject to the following conditions: | |
14 | |
15 The above copyright notice and this permission notice shall be included in | |
16 all copies or substantial portions of the Software. | |
17 | |
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
24 DEALINGS IN THE SOFTWARE. */ | |
25 | |
26 #include <ctype.h> | |
27 #include <string.h> | |
28 #include <stdlib.h> | |
29 #include <stdio.h> | |
30 #include <stdint.h> | |
31 | |
32 #include "htslib/bgzf.h" | |
33 #include "htslib/faidx.h" | |
34 #include "htslib/hfile.h" | |
35 #include "htslib/khash.h" | |
36 | |
37 typedef struct { | |
38 int32_t line_len, line_blen; | |
39 int64_t len; | |
40 uint64_t offset; | |
41 } faidx1_t; | |
42 KHASH_MAP_INIT_STR(s, faidx1_t) | |
43 | |
44 struct __faidx_t { | |
45 BGZF *bgzf; | |
46 int n, m; | |
47 char **name; | |
48 khash_t(s) *hash; | |
49 }; | |
50 | |
51 #ifndef kroundup32 | |
52 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) | |
53 #endif | |
54 | |
55 static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset) | |
56 { | |
57 khint_t k; | |
58 int ret; | |
59 faidx1_t t; | |
60 if (idx->n == idx->m) { | |
61 idx->m = idx->m? idx->m<<1 : 16; | |
62 idx->name = (char**)realloc(idx->name, sizeof(char*) * idx->m); | |
63 } | |
64 idx->name[idx->n] = strdup(name); | |
65 k = kh_put(s, idx->hash, idx->name[idx->n], &ret); | |
66 t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset; | |
67 kh_value(idx->hash, k) = t; | |
68 ++idx->n; | |
69 } | |
70 | |
71 faidx_t *fai_build_core(BGZF *bgzf) | |
72 { | |
73 char *name; | |
74 int c; | |
75 int l_name, m_name; | |
76 int line_len, line_blen, state; | |
77 int l1, l2; | |
78 faidx_t *idx; | |
79 uint64_t offset; | |
80 int64_t len; | |
81 | |
82 idx = (faidx_t*)calloc(1, sizeof(faidx_t)); | |
83 idx->hash = kh_init(s); | |
84 name = 0; l_name = m_name = 0; | |
85 len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0; | |
86 while ( (c=bgzf_getc(bgzf))>=0 ) { | |
87 if (c == '\n') { // an empty line | |
88 if (state == 1) { | |
89 offset = bgzf_utell(bgzf); | |
90 continue; | |
91 } else if ((state == 0 && len < 0) || state == 2) continue; | |
92 } | |
93 if (c == '>') { // fasta header | |
94 if (len >= 0) | |
95 fai_insert_index(idx, name, len, line_len, line_blen, offset); | |
96 l_name = 0; | |
97 while ( (c=bgzf_getc(bgzf))>=0 && !isspace(c)) { | |
98 if (m_name < l_name + 2) { | |
99 m_name = l_name + 2; | |
100 kroundup32(m_name); | |
101 name = (char*)realloc(name, m_name); | |
102 } | |
103 name[l_name++] = c; | |
104 } | |
105 name[l_name] = '\0'; | |
106 if ( c<0 ) { | |
107 fprintf(stderr, "[fai_build_core] the last entry has no sequence\n"); | |
108 free(name); fai_destroy(idx); | |
109 return 0; | |
110 } | |
111 if (c != '\n') while ( (c=bgzf_getc(bgzf))>=0 && c != '\n'); | |
112 state = 1; len = 0; | |
113 offset = bgzf_utell(bgzf); | |
114 } else { | |
115 if (state == 3) { | |
116 fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name); | |
117 free(name); fai_destroy(idx); | |
118 return 0; | |
119 } | |
120 if (state == 2) state = 3; | |
121 l1 = l2 = 0; | |
122 do { | |
123 ++l1; | |
124 if (isgraph(c)) ++l2; | |
125 } while ( (c=bgzf_getc(bgzf))>=0 && c != '\n'); | |
126 if (state == 3 && l2) { | |
127 fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name); | |
128 free(name); fai_destroy(idx); | |
129 return 0; | |
130 } | |
131 ++l1; len += l2; | |
132 if (state == 1) line_len = l1, line_blen = l2, state = 0; | |
133 else if (state == 0) { | |
134 if (l1 != line_len || l2 != line_blen) state = 2; | |
135 } | |
136 } | |
137 } | |
138 if ( name ) | |
139 fai_insert_index(idx, name, len, line_len, line_blen, offset); | |
140 else | |
141 { | |
142 free(idx); | |
143 return NULL; | |
144 } | |
145 free(name); | |
146 return idx; | |
147 } | |
148 | |
149 void fai_save(const faidx_t *fai, FILE *fp) | |
150 { | |
151 khint_t k; | |
152 int i; | |
153 for (i = 0; i < fai->n; ++i) { | |
154 faidx1_t x; | |
155 k = kh_get(s, fai->hash, fai->name[i]); | |
156 x = kh_value(fai->hash, k); | |
157 #ifdef _WIN32 | |
158 fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len); | |
159 #else | |
160 fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len); | |
161 #endif | |
162 } | |
163 } | |
164 | |
165 faidx_t *fai_read(FILE *fp) | |
166 { | |
167 faidx_t *fai; | |
168 char *buf, *p; | |
169 int len, line_len, line_blen; | |
170 #ifdef _WIN32 | |
171 long offset; | |
172 #else | |
173 long long offset; | |
174 #endif | |
175 fai = (faidx_t*)calloc(1, sizeof(faidx_t)); | |
176 fai->hash = kh_init(s); | |
177 buf = (char*)calloc(0x10000, 1); | |
178 while (!feof(fp) && fgets(buf, 0x10000, fp)) { | |
179 for (p = buf; *p && isgraph(*p); ++p); | |
180 *p = 0; ++p; | |
181 #ifdef _WIN32 | |
182 sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len); | |
183 #else | |
184 sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len); | |
185 #endif | |
186 fai_insert_index(fai, buf, len, line_len, line_blen, offset); | |
187 } | |
188 free(buf); | |
189 return fai; | |
190 } | |
191 | |
192 void fai_destroy(faidx_t *fai) | |
193 { | |
194 int i; | |
195 for (i = 0; i < fai->n; ++i) free(fai->name[i]); | |
196 free(fai->name); | |
197 kh_destroy(s, fai->hash); | |
198 if (fai->bgzf) bgzf_close(fai->bgzf); | |
199 free(fai); | |
200 } | |
201 | |
202 int fai_build(const char *fn) | |
203 { | |
204 char *str; | |
205 BGZF *bgzf; | |
206 FILE *fp; | |
207 faidx_t *fai; | |
208 str = (char*)calloc(strlen(fn) + 5, 1); | |
209 sprintf(str, "%s.fai", fn); | |
210 bgzf = bgzf_open(fn, "r"); | |
211 if ( !bgzf ) { | |
212 fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn); | |
213 free(str); | |
214 return -1; | |
215 } | |
216 if ( bgzf->is_compressed ) bgzf_index_build_init(bgzf); | |
217 fai = fai_build_core(bgzf); | |
218 if ( !fai ) | |
219 { | |
220 if ( bgzf->is_compressed && bgzf->is_gzip ) fprintf(stderr,"Cannot index files compressed with gzip, please use bgzip\n"); | |
221 free(str); | |
222 return -1; | |
223 } | |
224 if ( bgzf->is_compressed ) bgzf_index_dump(bgzf, fn, ".gzi"); | |
225 bgzf_close(bgzf); | |
226 fp = fopen(str, "wb"); | |
227 if ( !fp ) { | |
228 fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str); | |
229 fai_destroy(fai); free(str); | |
230 return -1; | |
231 } | |
232 fai_save(fai, fp); | |
233 fclose(fp); | |
234 free(str); | |
235 fai_destroy(fai); | |
236 return 0; | |
237 } | |
238 | |
239 static FILE *download_and_open(const char *fn) | |
240 { | |
241 const int buf_size = 1 * 1024 * 1024; | |
242 uint8_t *buf; | |
243 FILE *fp; | |
244 hFILE *fp_remote; | |
245 const char *url = fn; | |
246 const char *p; | |
247 int l = strlen(fn); | |
248 for (p = fn + l - 1; p >= fn; --p) | |
249 if (*p == '/') break; | |
250 fn = p + 1; | |
251 | |
252 // First try to open a local copy | |
253 fp = fopen(fn, "r"); | |
254 if (fp) | |
255 return fp; | |
256 | |
257 // If failed, download from remote and open | |
258 fp_remote = hopen(url, "rb"); | |
259 if (fp_remote == 0) { | |
260 fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url); | |
261 return NULL; | |
262 } | |
263 if ((fp = fopen(fn, "wb")) == 0) { | |
264 fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn); | |
265 hclose_abruptly(fp_remote); | |
266 return NULL; | |
267 } | |
268 buf = (uint8_t*)calloc(buf_size, 1); | |
269 while ((l = hread(fp_remote, buf, buf_size)) > 0) | |
270 fwrite(buf, 1, l, fp); | |
271 free(buf); | |
272 fclose(fp); | |
273 if (hclose(fp_remote) != 0) | |
274 fprintf(stderr, "[download_from_remote] fail to close remote file %s\n", url); | |
275 | |
276 return fopen(fn, "r"); | |
277 } | |
278 | |
279 faidx_t *fai_load(const char *fn) | |
280 { | |
281 char *str; | |
282 FILE *fp; | |
283 faidx_t *fai; | |
284 str = (char*)calloc(strlen(fn) + 5, 1); | |
285 sprintf(str, "%s.fai", fn); | |
286 | |
287 if (hisremote(str)) | |
288 { | |
289 fp = download_and_open(str); | |
290 if ( !fp ) | |
291 { | |
292 fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str); | |
293 free(str); | |
294 return 0; | |
295 } | |
296 } | |
297 else | |
298 fp = fopen(str, "rb"); | |
299 | |
300 if (fp == 0) { | |
301 fprintf(stderr, "[fai_load] build FASTA index.\n"); | |
302 fai_build(fn); | |
303 fp = fopen(str, "rb"); | |
304 if (fp == 0) { | |
305 fprintf(stderr, "[fai_load] fail to open FASTA index.\n"); | |
306 free(str); | |
307 return 0; | |
308 } | |
309 } | |
310 | |
311 fai = fai_read(fp); | |
312 fclose(fp); | |
313 | |
314 fai->bgzf = bgzf_open(fn, "rb"); | |
315 free(str); | |
316 if (fai->bgzf == 0) { | |
317 fprintf(stderr, "[fai_load] fail to open FASTA file.\n"); | |
318 return 0; | |
319 } | |
320 if ( fai->bgzf->is_compressed==1 ) | |
321 { | |
322 if ( bgzf_index_load(fai->bgzf, fn, ".gzi") < 0 ) | |
323 { | |
324 fprintf(stderr, "[fai_load] failed to load .gzi index: %s[.gzi]\n", fn); | |
325 fai_destroy(fai); | |
326 return NULL; | |
327 } | |
328 } | |
329 return fai; | |
330 } | |
331 | |
332 char *fai_fetch(const faidx_t *fai, const char *str, int *len) | |
333 { | |
334 char *s; | |
335 int c, i, l, k, name_end; | |
336 khiter_t iter; | |
337 faidx1_t val; | |
338 khash_t(s) *h; | |
339 int beg, end; | |
340 | |
341 beg = end = -1; | |
342 h = fai->hash; | |
343 name_end = l = strlen(str); | |
344 s = (char*)malloc(l+1); | |
345 // remove space | |
346 for (i = k = 0; i < l; ++i) | |
347 if (!isspace(str[i])) s[k++] = str[i]; | |
348 s[k] = 0; l = k; | |
349 // determine the sequence name | |
350 for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end | |
351 if (i >= 0) name_end = i; | |
352 if (name_end < l) { // check if this is really the end | |
353 int n_hyphen = 0; | |
354 for (i = name_end + 1; i < l; ++i) { | |
355 if (s[i] == '-') ++n_hyphen; | |
356 else if (!isdigit(s[i]) && s[i] != ',') break; | |
357 } | |
358 if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name | |
359 s[name_end] = 0; | |
360 iter = kh_get(s, h, s); | |
361 if (iter == kh_end(h)) { // cannot find the sequence name | |
362 iter = kh_get(s, h, str); // try str as the name | |
363 if (iter == kh_end(h)) { | |
364 *len = 0; | |
365 free(s); return 0; | |
366 } else s[name_end] = ':', name_end = l; | |
367 } | |
368 } else iter = kh_get(s, h, str); | |
369 if(iter == kh_end(h)) { | |
370 fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str); | |
371 free(s); | |
372 *len = -2; | |
373 return 0; | |
374 }; | |
375 val = kh_value(h, iter); | |
376 // parse the interval | |
377 if (name_end < l) { | |
378 for (i = k = name_end + 1; i < l; ++i) | |
379 if (s[i] != ',') s[k++] = s[i]; | |
380 s[k] = 0; | |
381 beg = atoi(s + name_end + 1); | |
382 for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; | |
383 end = i < k? atoi(s + i + 1) : val.len; | |
384 if (beg > 0) --beg; | |
385 } else beg = 0, end = val.len; | |
386 if (beg >= val.len) beg = val.len; | |
387 if (end >= val.len) end = val.len; | |
388 if (beg > end) beg = end; | |
389 free(s); | |
390 | |
391 // now retrieve the sequence | |
392 int ret = bgzf_useek(fai->bgzf, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET); | |
393 if ( ret<0 ) | |
394 { | |
395 *len = -1; | |
396 fprintf(stderr, "[fai_fetch] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n"); | |
397 return NULL; | |
398 } | |
399 l = 0; | |
400 s = (char*)malloc(end - beg + 2); | |
401 while ( (c=bgzf_getc(fai->bgzf))>=0 && l < end - beg ) | |
402 if (isgraph(c)) s[l++] = c; | |
403 s[l] = '\0'; | |
404 *len = l; | |
405 return s; | |
406 } | |
407 | |
408 int faidx_fetch_nseq(const faidx_t *fai) | |
409 { | |
410 return fai->n; | |
411 } | |
412 | |
413 int faidx_nseq(const faidx_t *fai) | |
414 { | |
415 return fai->n; | |
416 } | |
417 | |
418 const char *faidx_iseq(const faidx_t *fai, int i) | |
419 { | |
420 return fai->name[i]; | |
421 } | |
422 | |
423 int faidx_seq_len(const faidx_t *fai, const char *seq) | |
424 { | |
425 khint_t k = kh_get(s, fai->hash, seq); | |
426 if ( k == kh_end(fai->hash) ) return -1; | |
427 return kh_val(fai->hash, k).len; | |
428 } | |
429 | |
430 char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) | |
431 { | |
432 int l, c; | |
433 khiter_t iter; | |
434 faidx1_t val; | |
435 char *seq=NULL; | |
436 | |
437 // Adjust position | |
438 iter = kh_get(s, fai->hash, c_name); | |
439 if (iter == kh_end(fai->hash)) | |
440 { | |
441 *len = -2; | |
442 fprintf(stderr, "[fai_fetch_seq] The sequence \"%s\" not found\n", c_name); | |
443 return NULL; | |
444 } | |
445 val = kh_value(fai->hash, iter); | |
446 if(p_end_i < p_beg_i) p_beg_i = p_end_i; | |
447 if(p_beg_i < 0) p_beg_i = 0; | |
448 else if(val.len <= p_beg_i) p_beg_i = val.len - 1; | |
449 if(p_end_i < 0) p_end_i = 0; | |
450 else if(val.len <= p_end_i) p_end_i = val.len - 1; | |
451 | |
452 // Now retrieve the sequence | |
453 int ret = bgzf_useek(fai->bgzf, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET); | |
454 if ( ret<0 ) | |
455 { | |
456 *len = -1; | |
457 fprintf(stderr, "[fai_fetch_seq] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n"); | |
458 return NULL; | |
459 } | |
460 l = 0; | |
461 seq = (char*)malloc(p_end_i - p_beg_i + 2); | |
462 while ( (c=bgzf_getc(fai->bgzf))>=0 && l < p_end_i - p_beg_i + 1) | |
463 if (isgraph(c)) seq[l++] = c; | |
464 seq[l] = '\0'; | |
465 *len = l; | |
466 return seq; | |
467 } | |
468 | |
469 int faidx_has_seq(const faidx_t *fai, const char *seq) | |
470 { | |
471 khiter_t iter = kh_get(s, fai->hash, seq); | |
472 if (iter == kh_end(fai->hash)) return 0; | |
473 return 1; | |
474 } | |
475 |