comparison ezBAMQC/src/htslib/cram/cram_index.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /*
2 Copyright (c) 2013-2014 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * The index is a gzipped tab-delimited text file with one line per slice.
33 * The columns are:
34 * 1: reference number (0 to N-1, as per BAM ref_id)
35 * 2: reference position of 1st read in slice (1..?)
36 * 3: number of reads in slice
37 * 4: offset of container start (relative to end of SAM header, so 1st
38 * container is offset 0).
39 * 5: slice number within container (ie which landmark).
40 *
41 * In memory, we hold this in a nested containment list. Each list element is
42 * a cram_index struct. Each element in turn can contain its own list of
43 * cram_index structs.
44 *
45 * Any start..end range which is entirely contained within another (and
46 * earlier as it is sorted) range will be held within it. This ensures that
47 * the outer list will never have containments and we can safely do a
48 * binary search to find the first range which overlaps any given coordinate.
49 */
50
51 #ifdef HAVE_CONFIG_H
52 #include "io_lib_config.h"
53 #endif
54
55 #include <stdio.h>
56 #include <errno.h>
57 #include <assert.h>
58 #include <stdlib.h>
59 #include <string.h>
60 #include <zlib.h>
61 #include <sys/types.h>
62 #include <sys/stat.h>
63 #include <math.h>
64 #include <ctype.h>
65
66 #include "htslib/hfile.h"
67 #include "cram/cram.h"
68 #include "cram/os.h"
69 #include "cram/zfio.h"
70
71 #if 0
72 static void dump_index_(cram_index *e, int level) {
73 int i, n;
74 n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end);
75 printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset);
76 for (i = 0; i < e->nslice; i++) {
77 dump_index_(&e->e[i], level+1);
78 }
79 }
80
81 static void dump_index(cram_fd *fd) {
82 int i;
83 for (i = 0; i < fd->index_sz; i++) {
84 dump_index_(&fd->index[i], 0);
85 }
86 }
87 #endif
88
89 static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
90 int sign = 1;
91 int32_t val = 0;
92 size_t p = *pos;
93
94 while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
95 p++;
96
97 if (p < k->l && k->s[p] == '-')
98 sign = -1, p++;
99
100 if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
101 return -1;
102
103 while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9')
104 val = val*10 + k->s[p++]-'0';
105
106 *pos = p;
107 *val_p = sign*val;
108
109 return 0;
110 }
111
112 static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) {
113 int sign = 1;
114 int64_t val = 0;
115 size_t p = *pos;
116
117 while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
118 p++;
119
120 if (p < k->l && k->s[p] == '-')
121 sign = -1, p++;
122
123 if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
124 return -1;
125
126 while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9')
127 val = val*10 + k->s[p++]-'0';
128
129 *pos = p;
130 *val_p = sign*val;
131
132 return 0;
133 }
134
135 /*
136 * Loads a CRAM .crai index into memory.
137 *
138 * Returns 0 for success
139 * -1 for failure
140 */
141 int cram_index_load(cram_fd *fd, const char *fn) {
142 char fn2[PATH_MAX];
143 char buf[65536];
144 ssize_t len;
145 kstring_t kstr = {0};
146 hFILE *fp;
147 cram_index *idx;
148 cram_index **idx_stack = NULL, *ep, e;
149 int idx_stack_alloc = 0, idx_stack_ptr = 0;
150 size_t pos = 0;
151
152 /* Check if already loaded */
153 if (fd->index)
154 return 0;
155
156 fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index));
157 if (!fd->index)
158 return -1;
159
160 idx = &fd->index[0];
161 idx->refid = -1;
162 idx->start = INT_MIN;
163 idx->end = INT_MAX;
164
165 idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
166 idx_stack[idx_stack_ptr] = idx;
167
168 sprintf(fn2, "%s.crai", fn);
169 if (!(fp = hopen(fn2, "r"))) {
170 perror(fn2);
171 free(idx_stack);
172 return -1;
173 }
174
175 // Load the file into memory
176 while ((len = hread(fp, buf, 65536)) > 0)
177 kputsn(buf, len, &kstr);
178 if (len < 0 || kstr.l < 2) {
179 if (kstr.s)
180 free(kstr.s);
181 free(idx_stack);
182 return -1;
183 }
184
185 if (hclose(fp)) {
186 if (kstr.s)
187 free(kstr.s);
188 free(idx_stack);
189 return -1;
190 }
191
192
193 // Uncompress if required
194 if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
195 size_t l;
196 char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);
197 free(kstr.s);
198 if (!s) {
199 free(idx_stack);
200 return -1;
201 }
202 kstr.s = s;
203 kstr.l = l;
204 kstr.m = l; // conservative estimate of the size allocated
205 kputsn("", 0, &kstr); // ensure kstr.s is NUL-terminated
206 }
207
208
209 // Parse it line at a time
210 do {
211 /* 1.1 layout */
212 if (kget_int32(&kstr, &pos, &e.refid) == -1) {
213 free(kstr.s); free(idx_stack); return -1;
214 }
215 if (kget_int32(&kstr, &pos, &e.start) == -1) {
216 free(kstr.s); free(idx_stack); return -1;
217 }
218 if (kget_int32(&kstr, &pos, &e.end) == -1) {
219 free(kstr.s); free(idx_stack); return -1;
220 }
221 if (kget_int64(&kstr, &pos, &e.offset) == -1) {
222 free(kstr.s); free(idx_stack); return -1;
223 }
224 if (kget_int32(&kstr, &pos, &e.slice) == -1) {
225 free(kstr.s); free(idx_stack); return -1;
226 }
227 if (kget_int32(&kstr, &pos, &e.len) == -1) {
228 free(kstr.s); free(idx_stack); return -1;
229 }
230
231 e.end += e.start-1;
232 //printf("%d/%d..%d\n", e.refid, e.start, e.end);
233
234 if (e.refid < -1) {
235 free(kstr.s);
236 free(idx_stack);
237 fprintf(stderr, "Malformed index file, refid %d\n", e.refid);
238 return -1;
239 }
240
241 if (e.refid != idx->refid) {
242 if (fd->index_sz < e.refid+2) {
243 size_t index_end = fd->index_sz * sizeof(*fd->index);
244 fd->index_sz = e.refid+2;
245 fd->index = realloc(fd->index,
246 fd->index_sz * sizeof(*fd->index));
247 memset(((char *)fd->index) + index_end, 0,
248 fd->index_sz * sizeof(*fd->index) - index_end);
249 }
250 idx = &fd->index[e.refid+1];
251 idx->refid = e.refid;
252 idx->start = INT_MIN;
253 idx->end = INT_MAX;
254 idx->nslice = idx->nalloc = 0;
255 idx->e = NULL;
256 idx_stack[(idx_stack_ptr = 0)] = idx;
257 }
258
259 while (!(e.start >= idx->start && e.end <= idx->end)) {
260 idx = idx_stack[--idx_stack_ptr];
261 }
262
263 // Now contains, so append
264 if (idx->nslice+1 >= idx->nalloc) {
265 idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16;
266 idx->e = realloc(idx->e, idx->nalloc * sizeof(*idx->e));
267 }
268
269 e.nalloc = e.nslice = 0; e.e = NULL;
270 *(ep = &idx->e[idx->nslice++]) = e;
271 idx = ep;
272
273 if (++idx_stack_ptr >= idx_stack_alloc) {
274 idx_stack_alloc *= 2;
275 idx_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack));
276 }
277 idx_stack[idx_stack_ptr] = idx;
278
279 while (pos < kstr.l && kstr.s[pos] != '\n')
280 pos++;
281 pos++;
282 } while (pos < kstr.l);
283
284 free(idx_stack);
285 free(kstr.s);
286
287 // dump_index(fd);
288
289 return 0;
290 }
291
292 static void cram_index_free_recurse(cram_index *e) {
293 if (e->e) {
294 int i;
295 for (i = 0; i < e->nslice; i++) {
296 cram_index_free_recurse(&e->e[i]);
297 }
298 free(e->e);
299 }
300 }
301
302 void cram_index_free(cram_fd *fd) {
303 int i;
304
305 if (!fd->index)
306 return;
307
308 for (i = 0; i < fd->index_sz; i++) {
309 cram_index_free_recurse(&fd->index[i]);
310 }
311 free(fd->index);
312
313 fd->index = NULL;
314 }
315
316 /*
317 * Searches the index for the first slice overlapping a reference ID
318 * and position, or one immediately preceding it if none is found in
319 * the index to overlap this position. (Our index may have missing
320 * entries, but we require at least one per reference.)
321 *
322 * If the index finds multiple slices overlapping this position we
323 * return the first one only. Subsequent calls should specifying
324 * "from" as the last slice we checked to find the next one. Otherwise
325 * set "from" to be NULL to find the first one.
326 *
327 * Returns the cram_index pointer on sucess
328 * NULL on failure
329 */
330 cram_index *cram_index_query(cram_fd *fd, int refid, int pos,
331 cram_index *from) {
332 int i, j, k;
333 cram_index *e;
334
335 if (refid+1 < 0 || refid+1 >= fd->index_sz)
336 return NULL;
337
338 i = 0, j = fd->index[refid+1].nslice-1;
339
340 if (!from)
341 from = &fd->index[refid+1];
342
343 for (k = j/2; k != i; k = (j-i)/2 + i) {
344 if (from->e[k].refid > refid) {
345 j = k;
346 continue;
347 }
348
349 if (from->e[k].refid < refid) {
350 i = k;
351 continue;
352 }
353
354 if (from->e[k].start >= pos) {
355 j = k;
356 continue;
357 }
358
359 if (from->e[k].start < pos) {
360 i = k;
361 continue;
362 }
363 }
364 // i==j or i==j-1. Check if j is better.
365 if (from->e[j].start < pos && from->e[j].refid == refid)
366 i = j;
367
368 /* The above found *a* bin overlapping, but not necessarily the first */
369 while (i > 0 && from->e[i-1].end >= pos)
370 i--;
371
372 /* Special case for matching a start pos */
373 if (i+1 < from->nslice &&
374 from->e[i+1].start == pos &&
375 from->e[i+1].refid == refid)
376 i++;
377
378 e = &from->e[i];
379
380 return e;
381 }
382
383
384 /*
385 * Skips to a container overlapping the start coordinate listed in
386 * cram_range.
387 *
388 * In theory we call cram_index_query multiple times, once per slice
389 * overlapping the range. However slices may be absent from the index
390 * which makes this problematic. Instead we find the left-most slice
391 * and then read from then on, skipping decoding of slices and/or
392 * whole containers when they don't overlap the specified cram_range.
393 *
394 * Returns 0 on success
395 * -1 on failure
396 */
397 int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
398 cram_index *e;
399
400 // Ideally use an index, so see if we have one.
401 if ((e = cram_index_query(fd, r->refid, r->start, NULL))) {
402 if (0 != cram_seek(fd, e->offset, SEEK_SET))
403 if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR))
404 return -1;
405 } else {
406 fprintf(stderr, "Unknown reference ID. Missing from index?\n");
407 return -1;
408 }
409
410 if (fd->ctr) {
411 cram_free_container(fd->ctr);
412 fd->ctr = NULL;
413 fd->ooc = 0;
414 }
415
416 return 0;
417 }
418
419
420 /*
421 * A specialised form of cram_index_build (below) that deals with slices
422 * having multiple references in this (ref_id -2). In this scenario we
423 * decode the slice to look at the RI data series instead.
424 *
425 * Returns 0 on success
426 * -1 on failure
427 */
428 static int cram_index_build_multiref(cram_fd *fd,
429 cram_container *c,
430 cram_slice *s,
431 zfp *fp,
432 off_t cpos,
433 int32_t landmark,
434 int sz) {
435 int i, ref = -2, ref_start = 0, ref_end;
436 char buf[1024];
437
438 if (0 != cram_decode_slice(fd, c, s, fd->header))
439 return -1;
440
441 ref_end = INT_MIN;
442 for (i = 0; i < s->hdr->num_records; i++) {
443 if (s->crecs[i].ref_id == ref) {
444 if (ref_end < s->crecs[i].aend)
445 ref_end = s->crecs[i].aend;
446 continue;
447 }
448
449 if (ref != -2) {
450 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
451 ref, ref_start, ref_end - ref_start + 1,
452 (int64_t)cpos, landmark, sz);
453 zfputs(buf, fp);
454 }
455
456 ref = s->crecs[i].ref_id;
457 ref_start = s->crecs[i].apos;
458 ref_end = INT_MIN;
459 }
460
461 if (ref != -2) {
462 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
463 ref, ref_start, ref_end - ref_start + 1,
464 (int64_t)cpos, landmark, sz);
465 zfputs(buf, fp);
466 }
467
468 return 0;
469 }
470
471 /*
472 * Builds an index file.
473 *
474 * fd is a newly opened cram file that we wish to index.
475 * fn_base is the filename of the associated CRAM file. Internally we
476 * add ".crai" to this to get the index filename.
477 *
478 * Returns 0 on success
479 * -1 on failure
480 */
481 int cram_index_build(cram_fd *fd, const char *fn_base) {
482 cram_container *c;
483 off_t cpos, spos, hpos;
484 zfp *fp;
485 char fn_idx[PATH_MAX];
486
487 if (strlen(fn_base) > PATH_MAX-6)
488 return -1;
489
490 sprintf(fn_idx, "%s.crai", fn_base);
491 if (!(fp = zfopen(fn_idx, "wz"))) {
492 perror(fn_idx);
493 return -1;
494 }
495
496 cpos = htell(fd->fp);
497 while ((c = cram_read_container(fd))) {
498 int j;
499
500 if (fd->err) {
501 perror("Cram container read");
502 return 1;
503 }
504
505 hpos = htell(fd->fp);
506
507 if (!(c->comp_hdr_block = cram_read_block(fd)))
508 return 1;
509 assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);
510
511 c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
512 if (!c->comp_hdr)
513 return -1;
514
515 // 2.0 format
516 for (j = 0; j < c->num_landmarks; j++) {
517 char buf[1024];
518 cram_slice *s;
519 int sz;
520
521 spos = htell(fd->fp);
522 assert(spos - cpos - c->offset == c->landmark[j]);
523
524 if (!(s = cram_read_slice(fd))) {
525 zfclose(fp);
526 return -1;
527 }
528
529 sz = (int)(htell(fd->fp) - spos);
530
531 if (s->hdr->ref_seq_id == -2) {
532 cram_index_build_multiref(fd, c, s, fp,
533 cpos, c->landmark[j], sz);
534 } else {
535 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
536 s->hdr->ref_seq_id, s->hdr->ref_seq_start,
537 s->hdr->ref_seq_span, (int64_t)cpos,
538 c->landmark[j], sz);
539 zfputs(buf, fp);
540 }
541
542 cram_free_slice(s);
543 }
544
545 cpos = htell(fd->fp);
546 assert(cpos == hpos + c->length);
547
548 cram_free_container(c);
549 }
550 if (fd->err) {
551 zfclose(fp);
552 return -1;
553 }
554
555
556 return zfclose(fp);
557 }