Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/bcftools/index.c @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #include <assert.h> | |
2 #include <ctype.h> | |
3 #include <sys/stat.h> | |
4 #include "bam_endian.h" | |
5 #include "kstring.h" | |
6 #include "bcf.h" | |
7 #ifdef _USE_KNETFILE | |
8 #include "knetfile.h" | |
9 #endif | |
10 | |
11 #define TAD_LIDX_SHIFT 13 | |
12 | |
13 typedef struct { | |
14 int32_t n, m; | |
15 uint64_t *offset; | |
16 } bcf_lidx_t; | |
17 | |
18 struct __bcf_idx_t { | |
19 int32_t n; | |
20 bcf_lidx_t *index2; | |
21 }; | |
22 | |
23 /************ | |
24 * indexing * | |
25 ************/ | |
26 | |
27 static inline void insert_offset2(bcf_lidx_t *index2, int _beg, int _end, uint64_t offset) | |
28 { | |
29 int i, beg, end; | |
30 beg = _beg >> TAD_LIDX_SHIFT; | |
31 end = (_end - 1) >> TAD_LIDX_SHIFT; | |
32 if (index2->m < end + 1) { | |
33 int old_m = index2->m; | |
34 index2->m = end + 1; | |
35 kroundup32(index2->m); | |
36 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); | |
37 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); | |
38 } | |
39 if (beg == end) { | |
40 if (index2->offset[beg] == 0) index2->offset[beg] = offset; | |
41 } else { | |
42 for (i = beg; i <= end; ++i) | |
43 if (index2->offset[i] == 0) index2->offset[i] = offset; | |
44 } | |
45 if (index2->n < end + 1) index2->n = end + 1; | |
46 } | |
47 | |
48 bcf_idx_t *bcf_idx_core(bcf_t *bp, bcf_hdr_t *h) | |
49 { | |
50 bcf_idx_t *idx; | |
51 int32_t last_coor, last_tid; | |
52 uint64_t last_off; | |
53 kstring_t *str; | |
54 BGZF *fp = bp->fp; | |
55 bcf1_t *b; | |
56 int ret; | |
57 | |
58 b = calloc(1, sizeof(bcf1_t)); | |
59 str = calloc(1, sizeof(kstring_t)); | |
60 idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t)); | |
61 idx->n = h->n_ref; | |
62 idx->index2 = calloc(h->n_ref, sizeof(bcf_lidx_t)); | |
63 | |
64 last_tid = 0xffffffffu; | |
65 last_off = bgzf_tell(fp); last_coor = 0xffffffffu; | |
66 while ((ret = bcf_read(bp, h, b)) > 0) { | |
67 int end, tmp; | |
68 if (last_tid != b->tid) { // change of chromosomes | |
69 last_tid = b->tid; | |
70 } else if (last_coor > b->pos) { | |
71 fprintf(stderr, "[bcf_idx_core] the input is out of order\n"); | |
72 free(str->s); free(str); free(idx); bcf_destroy(b); | |
73 return 0; | |
74 } | |
75 tmp = strlen(b->ref); | |
76 end = b->pos + (tmp > 0? tmp : 1); | |
77 insert_offset2(&idx->index2[b->tid], b->pos, end, last_off); | |
78 last_off = bgzf_tell(fp); | |
79 last_coor = b->pos; | |
80 } | |
81 free(str->s); free(str); bcf_destroy(b); | |
82 return idx; | |
83 } | |
84 | |
85 void bcf_idx_destroy(bcf_idx_t *idx) | |
86 { | |
87 int i; | |
88 if (idx == 0) return; | |
89 for (i = 0; i < idx->n; ++i) free(idx->index2[i].offset); | |
90 free(idx->index2); | |
91 free(idx); | |
92 } | |
93 | |
94 /****************** | |
95 * index file I/O * | |
96 ******************/ | |
97 | |
98 void bcf_idx_save(const bcf_idx_t *idx, BGZF *fp) | |
99 { | |
100 int32_t i, ti_is_be; | |
101 ti_is_be = bam_is_big_endian(); | |
102 bgzf_write(fp, "BCI\4", 4); | |
103 if (ti_is_be) { | |
104 uint32_t x = idx->n; | |
105 bgzf_write(fp, bam_swap_endian_4p(&x), 4); | |
106 } else bgzf_write(fp, &idx->n, 4); | |
107 for (i = 0; i < idx->n; ++i) { | |
108 bcf_lidx_t *index2 = idx->index2 + i; | |
109 // write linear index (index2) | |
110 if (ti_is_be) { | |
111 int x = index2->n; | |
112 bgzf_write(fp, bam_swap_endian_4p(&x), 4); | |
113 } else bgzf_write(fp, &index2->n, 4); | |
114 if (ti_is_be) { // big endian | |
115 int x; | |
116 for (x = 0; (int)x < index2->n; ++x) | |
117 bam_swap_endian_8p(&index2->offset[x]); | |
118 bgzf_write(fp, index2->offset, 8 * index2->n); | |
119 for (x = 0; (int)x < index2->n; ++x) | |
120 bam_swap_endian_8p(&index2->offset[x]); | |
121 } else bgzf_write(fp, index2->offset, 8 * index2->n); | |
122 } | |
123 } | |
124 | |
125 static bcf_idx_t *bcf_idx_load_core(BGZF *fp) | |
126 { | |
127 int i, ti_is_be; | |
128 char magic[4]; | |
129 bcf_idx_t *idx; | |
130 ti_is_be = bam_is_big_endian(); | |
131 if (fp == 0) { | |
132 fprintf(stderr, "[%s] fail to load index.\n", __func__); | |
133 return 0; | |
134 } | |
135 bgzf_read(fp, magic, 4); | |
136 if (strncmp(magic, "BCI\4", 4)) { | |
137 fprintf(stderr, "[%s] wrong magic number.\n", __func__); | |
138 return 0; | |
139 } | |
140 idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t)); | |
141 bgzf_read(fp, &idx->n, 4); | |
142 if (ti_is_be) bam_swap_endian_4p(&idx->n); | |
143 idx->index2 = (bcf_lidx_t*)calloc(idx->n, sizeof(bcf_lidx_t)); | |
144 for (i = 0; i < idx->n; ++i) { | |
145 bcf_lidx_t *index2 = idx->index2 + i; | |
146 int j; | |
147 bgzf_read(fp, &index2->n, 4); | |
148 if (ti_is_be) bam_swap_endian_4p(&index2->n); | |
149 index2->m = index2->n; | |
150 index2->offset = (uint64_t*)calloc(index2->m, 8); | |
151 bgzf_read(fp, index2->offset, index2->n * 8); | |
152 if (ti_is_be) | |
153 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]); | |
154 } | |
155 return idx; | |
156 } | |
157 | |
158 bcf_idx_t *bcf_idx_load_local(const char *fnidx) | |
159 { | |
160 BGZF *fp; | |
161 fp = bgzf_open(fnidx, "r"); | |
162 if (fp) { | |
163 bcf_idx_t *idx = bcf_idx_load_core(fp); | |
164 bgzf_close(fp); | |
165 return idx; | |
166 } else return 0; | |
167 } | |
168 | |
169 #ifdef _USE_KNETFILE | |
170 static void download_from_remote(const char *url) | |
171 { | |
172 const int buf_size = 1 * 1024 * 1024; | |
173 char *fn; | |
174 FILE *fp; | |
175 uint8_t *buf; | |
176 knetFile *fp_remote; | |
177 int l; | |
178 if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return; | |
179 l = strlen(url); | |
180 for (fn = (char*)url + l - 1; fn >= url; --fn) | |
181 if (*fn == '/') break; | |
182 ++fn; // fn now points to the file name | |
183 fp_remote = knet_open(url, "r"); | |
184 if (fp_remote == 0) { | |
185 fprintf(stderr, "[download_from_remote] fail to open remote file.\n"); | |
186 return; | |
187 } | |
188 if ((fp = fopen(fn, "w")) == 0) { | |
189 fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n"); | |
190 knet_close(fp_remote); | |
191 return; | |
192 } | |
193 buf = (uint8_t*)calloc(buf_size, 1); | |
194 while ((l = knet_read(fp_remote, buf, buf_size)) != 0) | |
195 fwrite(buf, 1, l, fp); | |
196 free(buf); | |
197 fclose(fp); | |
198 knet_close(fp_remote); | |
199 } | |
200 #else | |
201 static void download_from_remote(const char *url) | |
202 { | |
203 return; | |
204 } | |
205 #endif | |
206 | |
207 static char *get_local_version(const char *fn) | |
208 { | |
209 struct stat sbuf; | |
210 char *fnidx = (char*)calloc(strlen(fn) + 5, 1); | |
211 strcat(strcpy(fnidx, fn), ".bci"); | |
212 if ((strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx)) { | |
213 char *p, *url; | |
214 int l = strlen(fnidx); | |
215 for (p = fnidx + l - 1; p >= fnidx; --p) | |
216 if (*p == '/') break; | |
217 url = fnidx; fnidx = strdup(p + 1); | |
218 if (stat(fnidx, &sbuf) == 0) { | |
219 free(url); | |
220 return fnidx; | |
221 } | |
222 fprintf(stderr, "[%s] downloading the index file...\n", __func__); | |
223 download_from_remote(url); | |
224 free(url); | |
225 } | |
226 if (stat(fnidx, &sbuf) == 0) return fnidx; | |
227 free(fnidx); return 0; | |
228 } | |
229 | |
230 bcf_idx_t *bcf_idx_load(const char *fn) | |
231 { | |
232 bcf_idx_t *idx; | |
233 char *fname = get_local_version(fn); | |
234 if (fname == 0) return 0; | |
235 idx = bcf_idx_load_local(fname); | |
236 free(fname); | |
237 return idx; | |
238 } | |
239 | |
240 int bcf_idx_build2(const char *fn, const char *_fnidx) | |
241 { | |
242 char *fnidx; | |
243 BGZF *fpidx; | |
244 bcf_t *bp; | |
245 bcf_idx_t *idx; | |
246 bcf_hdr_t *h; | |
247 if ((bp = bcf_open(fn, "r")) == 0) { | |
248 fprintf(stderr, "[bcf_idx_build2] fail to open the BAM file.\n"); | |
249 return -1; | |
250 } | |
251 h = bcf_hdr_read(bp); | |
252 idx = bcf_idx_core(bp, h); | |
253 bcf_close(bp); | |
254 if (_fnidx == 0) { | |
255 fnidx = (char*)calloc(strlen(fn) + 5, 1); | |
256 strcpy(fnidx, fn); strcat(fnidx, ".bci"); | |
257 } else fnidx = strdup(_fnidx); | |
258 fpidx = bgzf_open(fnidx, "w"); | |
259 if (fpidx == 0) { | |
260 fprintf(stderr, "[bcf_idx_build2] fail to create the index file.\n"); | |
261 free(fnidx); | |
262 bcf_idx_destroy(idx); | |
263 return -1; | |
264 } | |
265 bcf_idx_save(idx, fpidx); | |
266 bcf_idx_destroy(idx); | |
267 bgzf_close(fpidx); | |
268 free(fnidx); | |
269 return 0; | |
270 } | |
271 | |
272 int bcf_idx_build(const char *fn) | |
273 { | |
274 return bcf_idx_build2(fn, 0); | |
275 } | |
276 | |
277 /******************************************** | |
278 * parse a region in the format chr:beg-end * | |
279 ********************************************/ | |
280 | |
281 int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end) | |
282 { | |
283 char *s, *p; | |
284 int i, l, k; | |
285 l = strlen(str); | |
286 p = s = (char*)malloc(l+1); | |
287 /* squeeze out "," */ | |
288 for (i = k = 0; i != l; ++i) | |
289 if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; | |
290 s[k] = 0; | |
291 for (i = 0; i != k; ++i) if (s[i] == ':') break; | |
292 s[i] = 0; | |
293 if ((*tid = bcf_str2id(str2id, s)) < 0) { | |
294 free(s); | |
295 return -1; | |
296 } | |
297 if (i == k) { /* dump the whole sequence */ | |
298 *begin = 0; *end = 1<<29; free(s); | |
299 return 0; | |
300 } | |
301 for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; | |
302 *begin = atoi(p); | |
303 if (i < k) { | |
304 p = s + i + 1; | |
305 *end = atoi(p); | |
306 } else *end = 1<<29; | |
307 if (*begin > 0) --*begin; | |
308 free(s); | |
309 if (*begin > *end) return -1; | |
310 return 0; | |
311 } | |
312 | |
313 /******************************* | |
314 * retrieve a specified region * | |
315 *******************************/ | |
316 | |
317 uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg) | |
318 { | |
319 uint64_t min_off, *offset; | |
320 int i; | |
321 if (beg < 0) beg = 0; | |
322 offset = idx->index2[tid].offset; | |
323 for (i = beg>>TAD_LIDX_SHIFT; i < idx->index2[tid].n && offset[i] == 0; ++i); | |
324 min_off = (i == idx->index2[tid].n)? offset[idx->index2[tid].n-1] : offset[i]; | |
325 return min_off; | |
326 } | |
327 | |
328 int bcf_main_index(int argc, char *argv[]) | |
329 { | |
330 if (argc == 1) { | |
331 fprintf(stderr, "Usage: bcftools index <in.bcf>\n"); | |
332 return 1; | |
333 } | |
334 bcf_idx_build(argv[1]); | |
335 return 0; | |
336 } |