annotate pyPRADA_1.2/tools/samtools-0.1.16/faidx.c @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
1 #include <ctype.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
2 #include <string.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
3 #include <stdlib.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
4 #include <stdio.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
5 #include <stdint.h>
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
6 #include "faidx.h"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
7 #include "khash.h"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
8
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
/*
 * One index record describing a single FASTA sequence.
 * The three bit-fields share one 64-bit storage unit.
 */
typedef struct {
	uint64_t len:32;       /* number of printable (isgraph) characters in the sequence */
	uint64_t line_len:16;  /* bytes per sequence line, line terminator included */
	uint64_t line_blen:16; /* printable characters per sequence line */
	uint64_t offset;       /* file position of the first sequence line */
} faidx1_t;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
13 KHASH_MAP_INIT_STR(s, faidx1_t)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
14
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
15 #ifndef _NO_RAZF
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
16 #include "razf.h"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
17 #else
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
18 #ifdef _WIN32
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
19 #define ftello(fp) ftell(fp)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
20 #define fseeko(fp, offset, whence) fseek(fp, offset, whence)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
21 #else
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
22 extern off_t ftello(FILE *stream);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
23 extern int fseeko(FILE *stream, off_t offset, int whence);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
24 #endif
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
25 #define RAZF FILE
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
26 #define razf_read(fp, buf, size) fread(buf, 1, size, fp)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
27 #define razf_open(fn, mode) fopen(fn, mode)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
28 #define razf_close(fp) fclose(fp)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
29 #define razf_seek(fp, offset, whence) fseeko(fp, offset, whence)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
30 #define razf_tell(fp) ftello(fp)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
31 #endif
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
32 #ifdef _USE_KNETFILE
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
33 #include "knetfile.h"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
34 #endif
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
35
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
36 struct __faidx_t {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
37 RAZF *rz;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
38 int n, m;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
39 char **name;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
40 khash_t(s) *hash;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
41 };
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
42
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
#ifndef kroundup32
/*
 * Round the lvalue x up, in place, to the next power of two
 * (a value that is already a power of two is left unchanged).
 * x is evaluated several times, so it must be free of side effects.
 */
#define kroundup32(x) \
	(--(x), \
	 (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, \
	 ++(x))
#endif
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
46
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
47 static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
48 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
49 khint_t k;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
50 int ret;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
51 faidx1_t t;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
52 if (idx->n == idx->m) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
53 idx->m = idx->m? idx->m<<1 : 16;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
54 idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
55 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
56 idx->name[idx->n] = strdup(name);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
57 k = kh_put(s, idx->hash, idx->name[idx->n], &ret);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
58 t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
59 kh_value(idx->hash, k) = t;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
60 ++idx->n;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
61 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
62
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
63 faidx_t *fai_build_core(RAZF *rz)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
64 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
65 char c, *name;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
66 int l_name, m_name, ret;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
67 int len, line_len, line_blen, state;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
68 int l1, l2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
69 faidx_t *idx;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
70 uint64_t offset;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
71
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
72 idx = (faidx_t*)calloc(1, sizeof(faidx_t));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
73 idx->hash = kh_init(s);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
74 name = 0; l_name = m_name = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
75 len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
76 while (razf_read(rz, &c, 1)) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
77 if (c == '\n') { // an empty line
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
78 if (state == 1) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
79 offset = razf_tell(rz);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
80 continue;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
81 } else if ((state == 0 && len < 0) || state == 2) continue;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
82 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
83 if (c == '>') { // fasta header
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
84 if (len >= 0)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
85 fai_insert_index(idx, name, len, line_len, line_blen, offset);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
86 l_name = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
87 while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace(c)) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
88 if (m_name < l_name + 2) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
89 m_name = l_name + 2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
90 kroundup32(m_name);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
91 name = (char*)realloc(name, m_name);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
92 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
93 name[l_name++] = c;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
94 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
95 name[l_name] = '\0';
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
96 if (ret == 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
97 fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
98 free(name); fai_destroy(idx);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
99 return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
100 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
101 if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n');
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
102 state = 1; len = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
103 offset = razf_tell(rz);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
104 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
105 if (state == 3) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
106 fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
107 free(name); fai_destroy(idx);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
108 return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
109 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
110 if (state == 2) state = 3;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
111 l1 = l2 = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
112 do {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
113 ++l1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
114 if (isgraph(c)) ++l2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
115 } while ((ret = razf_read(rz, &c, 1)) && c != '\n');
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
116 if (state == 3 && l2) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
117 fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
118 free(name); fai_destroy(idx);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
119 return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
120 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
121 ++l1; len += l2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
122 if (l2 >= 0x10000) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
123 fprintf(stderr, "[fai_build_core] line length exceeds 65535 in sequence '%s'.\n", name);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
124 free(name); fai_destroy(idx);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
125 return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
126 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
127 if (state == 1) line_len = l1, line_blen = l2, state = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
128 else if (state == 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
129 if (l1 != line_len || l2 != line_blen) state = 2;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
130 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
131 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
132 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
133 fai_insert_index(idx, name, len, line_len, line_blen, offset);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
134 free(name);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
135 return idx;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
136 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
137
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
138 void fai_save(const faidx_t *fai, FILE *fp)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
139 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
140 khint_t k;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
141 int i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
142 for (i = 0; i < fai->n; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
143 faidx1_t x;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
144 k = kh_get(s, fai->hash, fai->name[i]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
145 x = kh_value(fai->hash, k);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
146 #ifdef _WIN32
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
147 fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
148 #else
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
149 fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
150 #endif
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
151 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
152 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
153
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
154 faidx_t *fai_read(FILE *fp)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
155 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
156 faidx_t *fai;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
157 char *buf, *p;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
158 int len, line_len, line_blen;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
159 #ifdef _WIN32
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
160 long offset;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
161 #else
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
162 long long offset;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
163 #endif
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
164 fai = (faidx_t*)calloc(1, sizeof(faidx_t));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
165 fai->hash = kh_init(s);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
166 buf = (char*)calloc(0x10000, 1);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
167 while (!feof(fp) && fgets(buf, 0x10000, fp)) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
168 for (p = buf; *p && isgraph(*p); ++p);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
169 *p = 0; ++p;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
170 #ifdef _WIN32
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
171 sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
172 #else
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
173 sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
174 #endif
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
175 fai_insert_index(fai, buf, len, line_len, line_blen, offset);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
176 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
177 free(buf);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
178 return fai;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
179 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
180
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
181 void fai_destroy(faidx_t *fai)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
182 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
183 int i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
184 for (i = 0; i < fai->n; ++i) free(fai->name[i]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
185 free(fai->name);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
186 kh_destroy(s, fai->hash);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
187 if (fai->rz) razf_close(fai->rz);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
188 free(fai);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
189 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
190
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
191 int fai_build(const char *fn)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
192 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
193 char *str;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
194 RAZF *rz;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
195 FILE *fp;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
196 faidx_t *fai;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
197 str = (char*)calloc(strlen(fn) + 5, 1);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
198 sprintf(str, "%s.fai", fn);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
199 rz = razf_open(fn, "r");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
200 if (rz == 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
201 fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
202 free(str);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
203 return -1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
204 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
205 fai = fai_build_core(rz);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
206 razf_close(rz);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
207 fp = fopen(str, "wb");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
208 if (fp == 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
209 fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
210 fai_destroy(fai); free(str);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
211 return -1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
212 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
213 fai_save(fai, fp);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
214 fclose(fp);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
215 free(str);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
216 fai_destroy(fai);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
217 return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
218 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
219
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
220 #ifdef _USE_KNETFILE
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
221 FILE *download_and_open(const char *fn)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
222 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
223 const int buf_size = 1 * 1024 * 1024;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
224 uint8_t *buf;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
225 FILE *fp;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
226 knetFile *fp_remote;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
227 const char *url = fn;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
228 const char *p;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
229 int l = strlen(fn);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
230 for (p = fn + l - 1; p >= fn; --p)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
231 if (*p == '/') break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
232 fn = p + 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
233
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
234 // First try to open a local copy
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
235 fp = fopen(fn, "r");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
236 if (fp)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
237 return fp;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
238
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
239 // If failed, download from remote and open
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
240 fp_remote = knet_open(url, "rb");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
241 if (fp_remote == 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
242 fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
243 return NULL;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
244 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
245 if ((fp = fopen(fn, "wb")) == 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
246 fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
247 knet_close(fp_remote);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
248 return NULL;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
249 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
250 buf = (uint8_t*)calloc(buf_size, 1);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
251 while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
252 fwrite(buf, 1, l, fp);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
253 free(buf);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
254 fclose(fp);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
255 knet_close(fp_remote);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
256
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
257 return fopen(fn, "r");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
258 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
259 #endif
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
260
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
261 faidx_t *fai_load(const char *fn)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
262 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
263 char *str;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
264 FILE *fp;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
265 faidx_t *fai;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
266 str = (char*)calloc(strlen(fn) + 5, 1);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
267 sprintf(str, "%s.fai", fn);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
268
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
269 #ifdef _USE_KNETFILE
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
270 if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
271 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
272 fp = download_and_open(str);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
273 if ( !fp )
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
274 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
275 fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
276 free(str);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
277 return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
278 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
279 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
280 else
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
281 #endif
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
282 fp = fopen(str, "rb");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
283 if (fp == 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
284 fprintf(stderr, "[fai_load] build FASTA index.\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
285 fai_build(fn);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
286 fp = fopen(str, "rb");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
287 if (fp == 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
288 fprintf(stderr, "[fai_load] fail to open FASTA index.\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
289 free(str);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
290 return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
291 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
292 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
293
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
294 fai = fai_read(fp);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
295 fclose(fp);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
296
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
297 fai->rz = razf_open(fn, "rb");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
298 free(str);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
299 if (fai->rz == 0) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
300 fprintf(stderr, "[fai_load] fail to open FASTA file.\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
301 return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
302 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
303 return fai;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
304 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
305
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
306 char *fai_fetch(const faidx_t *fai, const char *str, int *len)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
307 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
308 char *s, *p, c;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
309 int i, l, k;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
310 khiter_t iter;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
311 faidx1_t val;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
312 khash_t(s) *h;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
313 int beg, end;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
314
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
315 beg = end = -1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
316 h = fai->hash;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
317 l = strlen(str);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
318 p = s = (char*)malloc(l+1);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
319 /* squeeze out "," */
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
320 for (i = k = 0; i != l; ++i)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
321 if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
322 s[k] = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
323 for (i = 0; i != k; ++i) if (s[i] == ':') break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
324 s[i] = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
325 iter = kh_get(s, h, s); /* get the ref_id */
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
326 if (iter == kh_end(h)) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
327 *len = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
328 free(s); return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
329 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
330 val = kh_value(h, iter);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
331 if (i == k) { /* dump the whole sequence */
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
332 beg = 0; end = val.len;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
333 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
334 for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
335 beg = atoi(p);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
336 if (i < k) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
337 p = s + i + 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
338 end = atoi(p);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
339 } else end = val.len;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
340 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
341 if (beg > 0) --beg;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
342 if (beg >= val.len) beg = val.len;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
343 if (end >= val.len) end = val.len;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
344 if (beg > end) beg = end;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
345 free(s);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
346
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
347 // now retrieve the sequence
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
348 l = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
349 s = (char*)malloc(end - beg + 2);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
350 razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
351 while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
352 if (isgraph(c)) s[l++] = c;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
353 s[l] = '\0';
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
354 *len = l;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
355 return s;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
356 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
357
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
358 int faidx_main(int argc, char *argv[])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
359 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
360 if (argc == 1) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
361 fprintf(stderr, "Usage: faidx <in.fasta> [<reg> [...]]\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
362 return 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
363 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
364 if (argc == 2) fai_build(argv[1]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
365 else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
366 int i, j, k, l;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
367 char *s;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
368 faidx_t *fai;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
369 fai = fai_load(argv[1]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
370 if (fai == 0) return 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
371 for (i = 2; i != argc; ++i) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
372 printf(">%s\n", argv[i]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
373 s = fai_fetch(fai, argv[i], &l);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
374 for (j = 0; j < l; j += 60) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
375 for (k = 0; k < 60 && k < l - j; ++k)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
376 putchar(s[j + k]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
377 putchar('\n');
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
378 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
379 free(s);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
380 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
381 fai_destroy(fai);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
382 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
383 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
384 return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
385 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
386
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
387 int faidx_fetch_nseq(const faidx_t *fai)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
388 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
389 return fai->n;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
390 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
391
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
392 char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
393 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
394 int l;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
395 char c;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
396 khiter_t iter;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
397 faidx1_t val;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
398 char *seq=NULL;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
399
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
400 // Adjust position
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
401 iter = kh_get(s, fai->hash, c_name);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
402 if(iter == kh_end(fai->hash)) return 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
403 val = kh_value(fai->hash, iter);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
404 if(p_end_i < p_beg_i) p_beg_i = p_end_i;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
405 if(p_beg_i < 0) p_beg_i = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
406 else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
407 if(p_end_i < 0) p_end_i = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
408 else if(val.len <= p_end_i) p_end_i = val.len - 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
409
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
410 // Now retrieve the sequence
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
411 l = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
412 seq = (char*)malloc(p_end_i - p_beg_i + 2);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
413 razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
414 while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
415 if (isgraph(c)) seq[l++] = c;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
416 seq[l] = '\0';
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
417 *len = l;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
418 return seq;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
419 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
420
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
#ifdef FAIDX_MAIN
/* Stand-alone build: forward the command line to faidx_main(). */
int main(int argc, char *argv[])
{
	const int status = faidx_main(argc, argv);
	return status;
}
#endif