Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/faidx.c @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #include <ctype.h> | |
2 #include <string.h> | |
3 #include <stdlib.h> | |
4 #include <stdio.h> | |
5 #include <stdint.h> | |
6 #include "faidx.h" | |
7 #include "khash.h" | |
8 | |
9 typedef struct { | |
10 int32_t line_len, line_blen; | |
11 int64_t len; | |
12 uint64_t offset; | |
13 } faidx1_t; | |
14 KHASH_MAP_INIT_STR(s, faidx1_t) | |
15 | |
16 #ifndef _NO_RAZF | |
17 #include "razf.h" | |
18 #else | |
19 #ifdef _WIN32 | |
20 #define ftello(fp) ftell(fp) | |
21 #define fseeko(fp, offset, whence) fseek(fp, offset, whence) | |
22 #else | |
23 extern off_t ftello(FILE *stream); | |
24 extern int fseeko(FILE *stream, off_t offset, int whence); | |
25 #endif | |
26 #define RAZF FILE | |
27 #define razf_read(fp, buf, size) fread(buf, 1, size, fp) | |
28 #define razf_open(fn, mode) fopen(fn, mode) | |
29 #define razf_close(fp) fclose(fp) | |
30 #define razf_seek(fp, offset, whence) fseeko(fp, offset, whence) | |
31 #define razf_tell(fp) ftello(fp) | |
32 #endif | |
33 #ifdef _USE_KNETFILE | |
34 #include "knetfile.h" | |
35 #endif | |
36 | |
37 struct __faidx_t { | |
38 RAZF *rz; | |
39 int n, m; | |
40 char **name; | |
41 khash_t(s) *hash; | |
42 }; | |
43 | |
44 #ifndef kroundup32 | |
45 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) | |
46 #endif | |
47 | |
48 static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset) | |
49 { | |
50 khint_t k; | |
51 int ret; | |
52 faidx1_t t; | |
53 if (idx->n == idx->m) { | |
54 idx->m = idx->m? idx->m<<1 : 16; | |
55 idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m); | |
56 } | |
57 idx->name[idx->n] = strdup(name); | |
58 k = kh_put(s, idx->hash, idx->name[idx->n], &ret); | |
59 t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset; | |
60 kh_value(idx->hash, k) = t; | |
61 ++idx->n; | |
62 } | |
63 | |
64 faidx_t *fai_build_core(RAZF *rz) | |
65 { | |
66 char c, *name; | |
67 int l_name, m_name, ret; | |
68 int line_len, line_blen, state; | |
69 int l1, l2; | |
70 faidx_t *idx; | |
71 uint64_t offset; | |
72 int64_t len; | |
73 | |
74 idx = (faidx_t*)calloc(1, sizeof(faidx_t)); | |
75 idx->hash = kh_init(s); | |
76 name = 0; l_name = m_name = 0; | |
77 len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0; | |
78 while (razf_read(rz, &c, 1)) { | |
79 if (c == '\n') { // an empty line | |
80 if (state == 1) { | |
81 offset = razf_tell(rz); | |
82 continue; | |
83 } else if ((state == 0 && len < 0) || state == 2) continue; | |
84 } | |
85 if (c == '>') { // fasta header | |
86 if (len >= 0) | |
87 fai_insert_index(idx, name, len, line_len, line_blen, offset); | |
88 l_name = 0; | |
89 while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace(c)) { | |
90 if (m_name < l_name + 2) { | |
91 m_name = l_name + 2; | |
92 kroundup32(m_name); | |
93 name = (char*)realloc(name, m_name); | |
94 } | |
95 name[l_name++] = c; | |
96 } | |
97 name[l_name] = '\0'; | |
98 if (ret == 0) { | |
99 fprintf(stderr, "[fai_build_core] the last entry has no sequence\n"); | |
100 free(name); fai_destroy(idx); | |
101 return 0; | |
102 } | |
103 if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n'); | |
104 state = 1; len = 0; | |
105 offset = razf_tell(rz); | |
106 } else { | |
107 if (state == 3) { | |
108 fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name); | |
109 free(name); fai_destroy(idx); | |
110 return 0; | |
111 } | |
112 if (state == 2) state = 3; | |
113 l1 = l2 = 0; | |
114 do { | |
115 ++l1; | |
116 if (isgraph(c)) ++l2; | |
117 } while ((ret = razf_read(rz, &c, 1)) && c != '\n'); | |
118 if (state == 3 && l2) { | |
119 fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name); | |
120 free(name); fai_destroy(idx); | |
121 return 0; | |
122 } | |
123 ++l1; len += l2; | |
124 if (state == 1) line_len = l1, line_blen = l2, state = 0; | |
125 else if (state == 0) { | |
126 if (l1 != line_len || l2 != line_blen) state = 2; | |
127 } | |
128 } | |
129 } | |
130 fai_insert_index(idx, name, len, line_len, line_blen, offset); | |
131 free(name); | |
132 return idx; | |
133 } | |
134 | |
135 void fai_save(const faidx_t *fai, FILE *fp) | |
136 { | |
137 khint_t k; | |
138 int i; | |
139 for (i = 0; i < fai->n; ++i) { | |
140 faidx1_t x; | |
141 k = kh_get(s, fai->hash, fai->name[i]); | |
142 x = kh_value(fai->hash, k); | |
143 #ifdef _WIN32 | |
144 fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len); | |
145 #else | |
146 fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len); | |
147 #endif | |
148 } | |
149 } | |
150 | |
151 faidx_t *fai_read(FILE *fp) | |
152 { | |
153 faidx_t *fai; | |
154 char *buf, *p; | |
155 int len, line_len, line_blen; | |
156 #ifdef _WIN32 | |
157 long offset; | |
158 #else | |
159 long long offset; | |
160 #endif | |
161 fai = (faidx_t*)calloc(1, sizeof(faidx_t)); | |
162 fai->hash = kh_init(s); | |
163 buf = (char*)calloc(0x10000, 1); | |
164 while (!feof(fp) && fgets(buf, 0x10000, fp)) { | |
165 for (p = buf; *p && isgraph(*p); ++p); | |
166 *p = 0; ++p; | |
167 #ifdef _WIN32 | |
168 sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len); | |
169 #else | |
170 sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len); | |
171 #endif | |
172 fai_insert_index(fai, buf, len, line_len, line_blen, offset); | |
173 } | |
174 free(buf); | |
175 return fai; | |
176 } | |
177 | |
178 void fai_destroy(faidx_t *fai) | |
179 { | |
180 int i; | |
181 for (i = 0; i < fai->n; ++i) free(fai->name[i]); | |
182 free(fai->name); | |
183 kh_destroy(s, fai->hash); | |
184 if (fai->rz) razf_close(fai->rz); | |
185 free(fai); | |
186 } | |
187 | |
188 int fai_build(const char *fn) | |
189 { | |
190 char *str; | |
191 RAZF *rz; | |
192 FILE *fp; | |
193 faidx_t *fai; | |
194 str = (char*)calloc(strlen(fn) + 5, 1); | |
195 sprintf(str, "%s.fai", fn); | |
196 rz = razf_open(fn, "r"); | |
197 if (rz == 0) { | |
198 fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn); | |
199 free(str); | |
200 return -1; | |
201 } | |
202 fai = fai_build_core(rz); | |
203 razf_close(rz); | |
204 fp = fopen(str, "wb"); | |
205 if (fp == 0) { | |
206 fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str); | |
207 fai_destroy(fai); free(str); | |
208 return -1; | |
209 } | |
210 fai_save(fai, fp); | |
211 fclose(fp); | |
212 free(str); | |
213 fai_destroy(fai); | |
214 return 0; | |
215 } | |
216 | |
217 #ifdef _USE_KNETFILE | |
218 FILE *download_and_open(const char *fn) | |
219 { | |
220 const int buf_size = 1 * 1024 * 1024; | |
221 uint8_t *buf; | |
222 FILE *fp; | |
223 knetFile *fp_remote; | |
224 const char *url = fn; | |
225 const char *p; | |
226 int l = strlen(fn); | |
227 for (p = fn + l - 1; p >= fn; --p) | |
228 if (*p == '/') break; | |
229 fn = p + 1; | |
230 | |
231 // First try to open a local copy | |
232 fp = fopen(fn, "r"); | |
233 if (fp) | |
234 return fp; | |
235 | |
236 // If failed, download from remote and open | |
237 fp_remote = knet_open(url, "rb"); | |
238 if (fp_remote == 0) { | |
239 fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url); | |
240 return NULL; | |
241 } | |
242 if ((fp = fopen(fn, "wb")) == 0) { | |
243 fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn); | |
244 knet_close(fp_remote); | |
245 return NULL; | |
246 } | |
247 buf = (uint8_t*)calloc(buf_size, 1); | |
248 while ((l = knet_read(fp_remote, buf, buf_size)) != 0) | |
249 fwrite(buf, 1, l, fp); | |
250 free(buf); | |
251 fclose(fp); | |
252 knet_close(fp_remote); | |
253 | |
254 return fopen(fn, "r"); | |
255 } | |
256 #endif | |
257 | |
258 faidx_t *fai_load(const char *fn) | |
259 { | |
260 char *str; | |
261 FILE *fp; | |
262 faidx_t *fai; | |
263 str = (char*)calloc(strlen(fn) + 5, 1); | |
264 sprintf(str, "%s.fai", fn); | |
265 | |
266 #ifdef _USE_KNETFILE | |
267 if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn) | |
268 { | |
269 fp = download_and_open(str); | |
270 if ( !fp ) | |
271 { | |
272 fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str); | |
273 free(str); | |
274 return 0; | |
275 } | |
276 } | |
277 else | |
278 #endif | |
279 fp = fopen(str, "rb"); | |
280 if (fp == 0) { | |
281 fprintf(stderr, "[fai_load] build FASTA index.\n"); | |
282 fai_build(fn); | |
283 fp = fopen(str, "rb"); | |
284 if (fp == 0) { | |
285 fprintf(stderr, "[fai_load] fail to open FASTA index.\n"); | |
286 free(str); | |
287 return 0; | |
288 } | |
289 } | |
290 | |
291 fai = fai_read(fp); | |
292 fclose(fp); | |
293 | |
294 fai->rz = razf_open(fn, "rb"); | |
295 free(str); | |
296 if (fai->rz == 0) { | |
297 fprintf(stderr, "[fai_load] fail to open FASTA file.\n"); | |
298 return 0; | |
299 } | |
300 return fai; | |
301 } | |
302 | |
303 char *fai_fetch(const faidx_t *fai, const char *str, int *len) | |
304 { | |
305 char *s, c; | |
306 int i, l, k, name_end; | |
307 khiter_t iter; | |
308 faidx1_t val; | |
309 khash_t(s) *h; | |
310 int beg, end; | |
311 | |
312 beg = end = -1; | |
313 h = fai->hash; | |
314 name_end = l = strlen(str); | |
315 s = (char*)malloc(l+1); | |
316 // remove space | |
317 for (i = k = 0; i < l; ++i) | |
318 if (!isspace(str[i])) s[k++] = str[i]; | |
319 s[k] = 0; l = k; | |
320 // determine the sequence name | |
321 for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end | |
322 if (i >= 0) name_end = i; | |
323 if (name_end < l) { // check if this is really the end | |
324 int n_hyphen = 0; | |
325 for (i = name_end + 1; i < l; ++i) { | |
326 if (s[i] == '-') ++n_hyphen; | |
327 else if (!isdigit(s[i]) && s[i] != ',') break; | |
328 } | |
329 if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name | |
330 s[name_end] = 0; | |
331 iter = kh_get(s, h, s); | |
332 if (iter == kh_end(h)) { // cannot find the sequence name | |
333 iter = kh_get(s, h, str); // try str as the name | |
334 if (iter == kh_end(h)) { | |
335 *len = 0; | |
336 free(s); return 0; | |
337 } else s[name_end] = ':', name_end = l; | |
338 } | |
339 } else iter = kh_get(s, h, str); | |
340 if(iter == kh_end(h)) { | |
341 fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str); | |
342 free(s); | |
343 return 0; | |
344 }; | |
345 val = kh_value(h, iter); | |
346 // parse the interval | |
347 if (name_end < l) { | |
348 for (i = k = name_end + 1; i < l; ++i) | |
349 if (s[i] != ',') s[k++] = s[i]; | |
350 s[k] = 0; | |
351 beg = atoi(s + name_end + 1); | |
352 for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; | |
353 end = i < k? atoi(s + i + 1) : val.len; | |
354 if (beg > 0) --beg; | |
355 } else beg = 0, end = val.len; | |
356 if (beg >= val.len) beg = val.len; | |
357 if (end >= val.len) end = val.len; | |
358 if (beg > end) beg = end; | |
359 free(s); | |
360 | |
361 // now retrieve the sequence | |
362 l = 0; | |
363 s = (char*)malloc(end - beg + 2); | |
364 razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET); | |
365 while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err) | |
366 if (isgraph(c)) s[l++] = c; | |
367 s[l] = '\0'; | |
368 *len = l; | |
369 return s; | |
370 } | |
371 | |
372 int faidx_main(int argc, char *argv[]) | |
373 { | |
374 if (argc == 1) { | |
375 fprintf(stderr, "Usage: faidx <in.fasta> [<reg> [...]]\n"); | |
376 return 1; | |
377 } else { | |
378 if (argc == 2) fai_build(argv[1]); | |
379 else { | |
380 int i, j, k, l; | |
381 char *s; | |
382 faidx_t *fai; | |
383 fai = fai_load(argv[1]); | |
384 if (fai == 0) return 1; | |
385 for (i = 2; i != argc; ++i) { | |
386 printf(">%s\n", argv[i]); | |
387 s = fai_fetch(fai, argv[i], &l); | |
388 for (j = 0; j < l; j += 60) { | |
389 for (k = 0; k < 60 && k < l - j; ++k) | |
390 putchar(s[j + k]); | |
391 putchar('\n'); | |
392 } | |
393 free(s); | |
394 } | |
395 fai_destroy(fai); | |
396 } | |
397 } | |
398 return 0; | |
399 } | |
400 | |
401 int faidx_fetch_nseq(const faidx_t *fai) | |
402 { | |
403 return fai->n; | |
404 } | |
405 | |
406 char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len) | |
407 { | |
408 int l; | |
409 char c; | |
410 khiter_t iter; | |
411 faidx1_t val; | |
412 char *seq=NULL; | |
413 | |
414 // Adjust position | |
415 iter = kh_get(s, fai->hash, c_name); | |
416 if(iter == kh_end(fai->hash)) return 0; | |
417 val = kh_value(fai->hash, iter); | |
418 if(p_end_i < p_beg_i) p_beg_i = p_end_i; | |
419 if(p_beg_i < 0) p_beg_i = 0; | |
420 else if(val.len <= p_beg_i) p_beg_i = val.len - 1; | |
421 if(p_end_i < 0) p_end_i = 0; | |
422 else if(val.len <= p_end_i) p_end_i = val.len - 1; | |
423 | |
424 // Now retrieve the sequence | |
425 l = 0; | |
426 seq = (char*)malloc(p_end_i - p_beg_i + 2); | |
427 razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET); | |
428 while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1) | |
429 if (isgraph(c)) seq[l++] = c; | |
430 seq[l] = '\0'; | |
431 *len = l; | |
432 return seq; | |
433 } | |
434 | |
435 #ifdef FAIDX_MAIN | |
436 int main(int argc, char *argv[]) { return faidx_main(argc, argv); } | |
437 #endif |