0
|
1 /*
|
|
2 Copyright (c) 2013-2014 Genome Research Ltd.
|
|
3 Author: James Bonfield <jkb@sanger.ac.uk>
|
|
4
|
|
5 Redistribution and use in source and binary forms, with or without
|
|
6 modification, are permitted provided that the following conditions are met:
|
|
7
|
|
8 1. Redistributions of source code must retain the above copyright notice,
|
|
9 this list of conditions and the following disclaimer.
|
|
10
|
|
11 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
12 this list of conditions and the following disclaimer in the documentation
|
|
13 and/or other materials provided with the distribution.
|
|
14
|
|
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
|
|
16 Institute nor the names of its contributors may be used to endorse or promote
|
|
17 products derived from this software without specific prior written permission.
|
|
18
|
|
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
|
|
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
|
|
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
29 */
|
|
30
|
|
31 /*
|
|
32 * The index is a gzipped tab-delimited text file with one line per slice.
|
|
33 * The columns are:
|
|
34 * 1: reference number (0 to N-1, as per BAM ref_id)
|
|
35 * 2: reference position of 1st read in slice (1..?)
|
|
36 * 3: number of reads in slice
|
|
37 * 4: offset of container start (relative to end of SAM header, so 1st
|
|
38 * container is offset 0).
|
|
39 * 5: slice number within container (ie which landmark).
|
|
40 *
|
|
41 * In memory, we hold this in a nested containment list. Each list element is
|
|
42 * a cram_index struct. Each element in turn can contain its own list of
|
|
43 * cram_index structs.
|
|
44 *
|
|
45 * Any start..end range which is entirely contained within another (and
|
|
46 * earlier as it is sorted) range will be held within it. This ensures that
|
|
47 * the outer list will never have containments and we can safely do a
|
|
48 * binary search to find the first range which overlaps any given coordinate.
|
|
49 */
|
|
50
|
|
51 #ifdef HAVE_CONFIG_H
|
|
52 #include "io_lib_config.h"
|
|
53 #endif
|
|
54
|
|
55 #include <stdio.h>
|
|
56 #include <errno.h>
|
|
57 #include <assert.h>
|
|
58 #include <stdlib.h>
|
|
59 #include <string.h>
|
|
60 #include <zlib.h>
|
|
61 #include <sys/types.h>
|
|
62 #include <sys/stat.h>
|
|
63 #include <math.h>
|
|
64 #include <ctype.h>
|
|
65
|
|
66 #include "htslib/hfile.h"
|
|
67 #include "cram/cram.h"
|
|
68 #include "cram/os.h"
|
|
69 #include "cram/zfio.h"
|
|
70
|
|
71 #if 0
|
|
72 static void dump_index_(cram_index *e, int level) {
|
|
73 int i, n;
|
|
74 n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end);
|
|
75 printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset);
|
|
76 for (i = 0; i < e->nslice; i++) {
|
|
77 dump_index_(&e->e[i], level+1);
|
|
78 }
|
|
79 }
|
|
80
|
|
81 static void dump_index(cram_fd *fd) {
|
|
82 int i;
|
|
83 for (i = 0; i < fd->index_sz; i++) {
|
|
84 dump_index_(&fd->index[i], 0);
|
|
85 }
|
|
86 }
|
|
87 #endif
|
|
88
|
|
89 static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
|
|
90 int sign = 1;
|
|
91 int32_t val = 0;
|
|
92 size_t p = *pos;
|
|
93
|
|
94 while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
|
|
95 p++;
|
|
96
|
|
97 if (p < k->l && k->s[p] == '-')
|
|
98 sign = -1, p++;
|
|
99
|
|
100 if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
|
|
101 return -1;
|
|
102
|
|
103 while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9')
|
|
104 val = val*10 + k->s[p++]-'0';
|
|
105
|
|
106 *pos = p;
|
|
107 *val_p = sign*val;
|
|
108
|
|
109 return 0;
|
|
110 }
|
|
111
|
|
112 static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) {
|
|
113 int sign = 1;
|
|
114 int64_t val = 0;
|
|
115 size_t p = *pos;
|
|
116
|
|
117 while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
|
|
118 p++;
|
|
119
|
|
120 if (p < k->l && k->s[p] == '-')
|
|
121 sign = -1, p++;
|
|
122
|
|
123 if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
|
|
124 return -1;
|
|
125
|
|
126 while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9')
|
|
127 val = val*10 + k->s[p++]-'0';
|
|
128
|
|
129 *pos = p;
|
|
130 *val_p = sign*val;
|
|
131
|
|
132 return 0;
|
|
133 }
|
|
134
|
|
135 /*
|
|
136 * Loads a CRAM .crai index into memory.
|
|
137 *
|
|
138 * Returns 0 for success
|
|
139 * -1 for failure
|
|
140 */
|
|
141 int cram_index_load(cram_fd *fd, const char *fn) {
|
|
142 char fn2[PATH_MAX];
|
|
143 char buf[65536];
|
|
144 ssize_t len;
|
|
145 kstring_t kstr = {0};
|
|
146 hFILE *fp;
|
|
147 cram_index *idx;
|
|
148 cram_index **idx_stack = NULL, *ep, e;
|
|
149 int idx_stack_alloc = 0, idx_stack_ptr = 0;
|
|
150 size_t pos = 0;
|
|
151
|
|
152 /* Check if already loaded */
|
|
153 if (fd->index)
|
|
154 return 0;
|
|
155
|
|
156 fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index));
|
|
157 if (!fd->index)
|
|
158 return -1;
|
|
159
|
|
160 idx = &fd->index[0];
|
|
161 idx->refid = -1;
|
|
162 idx->start = INT_MIN;
|
|
163 idx->end = INT_MAX;
|
|
164
|
|
165 idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
|
|
166 idx_stack[idx_stack_ptr] = idx;
|
|
167
|
|
168 sprintf(fn2, "%s.crai", fn);
|
|
169 if (!(fp = hopen(fn2, "r"))) {
|
|
170 perror(fn2);
|
|
171 free(idx_stack);
|
|
172 return -1;
|
|
173 }
|
|
174
|
|
175 // Load the file into memory
|
|
176 while ((len = hread(fp, buf, 65536)) > 0)
|
|
177 kputsn(buf, len, &kstr);
|
|
178 if (len < 0 || kstr.l < 2) {
|
|
179 if (kstr.s)
|
|
180 free(kstr.s);
|
|
181 free(idx_stack);
|
|
182 return -1;
|
|
183 }
|
|
184
|
|
185 if (hclose(fp)) {
|
|
186 if (kstr.s)
|
|
187 free(kstr.s);
|
|
188 free(idx_stack);
|
|
189 return -1;
|
|
190 }
|
|
191
|
|
192
|
|
193 // Uncompress if required
|
|
194 if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
|
|
195 size_t l;
|
|
196 char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);
|
|
197 free(kstr.s);
|
|
198 if (!s) {
|
|
199 free(idx_stack);
|
|
200 return -1;
|
|
201 }
|
|
202 kstr.s = s;
|
|
203 kstr.l = l;
|
|
204 kstr.m = l; // conservative estimate of the size allocated
|
|
205 kputsn("", 0, &kstr); // ensure kstr.s is NUL-terminated
|
|
206 }
|
|
207
|
|
208
|
|
209 // Parse it line at a time
|
|
210 do {
|
|
211 /* 1.1 layout */
|
|
212 if (kget_int32(&kstr, &pos, &e.refid) == -1) {
|
|
213 free(kstr.s); free(idx_stack); return -1;
|
|
214 }
|
|
215 if (kget_int32(&kstr, &pos, &e.start) == -1) {
|
|
216 free(kstr.s); free(idx_stack); return -1;
|
|
217 }
|
|
218 if (kget_int32(&kstr, &pos, &e.end) == -1) {
|
|
219 free(kstr.s); free(idx_stack); return -1;
|
|
220 }
|
|
221 if (kget_int64(&kstr, &pos, &e.offset) == -1) {
|
|
222 free(kstr.s); free(idx_stack); return -1;
|
|
223 }
|
|
224 if (kget_int32(&kstr, &pos, &e.slice) == -1) {
|
|
225 free(kstr.s); free(idx_stack); return -1;
|
|
226 }
|
|
227 if (kget_int32(&kstr, &pos, &e.len) == -1) {
|
|
228 free(kstr.s); free(idx_stack); return -1;
|
|
229 }
|
|
230
|
|
231 e.end += e.start-1;
|
|
232 //printf("%d/%d..%d\n", e.refid, e.start, e.end);
|
|
233
|
|
234 if (e.refid < -1) {
|
|
235 free(kstr.s);
|
|
236 free(idx_stack);
|
|
237 fprintf(stderr, "Malformed index file, refid %d\n", e.refid);
|
|
238 return -1;
|
|
239 }
|
|
240
|
|
241 if (e.refid != idx->refid) {
|
|
242 if (fd->index_sz < e.refid+2) {
|
|
243 size_t index_end = fd->index_sz * sizeof(*fd->index);
|
|
244 fd->index_sz = e.refid+2;
|
|
245 fd->index = realloc(fd->index,
|
|
246 fd->index_sz * sizeof(*fd->index));
|
|
247 memset(((char *)fd->index) + index_end, 0,
|
|
248 fd->index_sz * sizeof(*fd->index) - index_end);
|
|
249 }
|
|
250 idx = &fd->index[e.refid+1];
|
|
251 idx->refid = e.refid;
|
|
252 idx->start = INT_MIN;
|
|
253 idx->end = INT_MAX;
|
|
254 idx->nslice = idx->nalloc = 0;
|
|
255 idx->e = NULL;
|
|
256 idx_stack[(idx_stack_ptr = 0)] = idx;
|
|
257 }
|
|
258
|
|
259 while (!(e.start >= idx->start && e.end <= idx->end)) {
|
|
260 idx = idx_stack[--idx_stack_ptr];
|
|
261 }
|
|
262
|
|
263 // Now contains, so append
|
|
264 if (idx->nslice+1 >= idx->nalloc) {
|
|
265 idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16;
|
|
266 idx->e = realloc(idx->e, idx->nalloc * sizeof(*idx->e));
|
|
267 }
|
|
268
|
|
269 e.nalloc = e.nslice = 0; e.e = NULL;
|
|
270 *(ep = &idx->e[idx->nslice++]) = e;
|
|
271 idx = ep;
|
|
272
|
|
273 if (++idx_stack_ptr >= idx_stack_alloc) {
|
|
274 idx_stack_alloc *= 2;
|
|
275 idx_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack));
|
|
276 }
|
|
277 idx_stack[idx_stack_ptr] = idx;
|
|
278
|
|
279 while (pos < kstr.l && kstr.s[pos] != '\n')
|
|
280 pos++;
|
|
281 pos++;
|
|
282 } while (pos < kstr.l);
|
|
283
|
|
284 free(idx_stack);
|
|
285 free(kstr.s);
|
|
286
|
|
287 // dump_index(fd);
|
|
288
|
|
289 return 0;
|
|
290 }
|
|
291
|
|
292 static void cram_index_free_recurse(cram_index *e) {
|
|
293 if (e->e) {
|
|
294 int i;
|
|
295 for (i = 0; i < e->nslice; i++) {
|
|
296 cram_index_free_recurse(&e->e[i]);
|
|
297 }
|
|
298 free(e->e);
|
|
299 }
|
|
300 }
|
|
301
|
|
302 void cram_index_free(cram_fd *fd) {
|
|
303 int i;
|
|
304
|
|
305 if (!fd->index)
|
|
306 return;
|
|
307
|
|
308 for (i = 0; i < fd->index_sz; i++) {
|
|
309 cram_index_free_recurse(&fd->index[i]);
|
|
310 }
|
|
311 free(fd->index);
|
|
312
|
|
313 fd->index = NULL;
|
|
314 }
|
|
315
|
|
316 /*
|
|
317 * Searches the index for the first slice overlapping a reference ID
|
|
318 * and position, or one immediately preceding it if none is found in
|
|
319 * the index to overlap this position. (Our index may have missing
|
|
320 * entries, but we require at least one per reference.)
|
|
321 *
|
|
322 * If the index finds multiple slices overlapping this position we
|
|
323 * return the first one only. Subsequent calls should specifying
|
|
324 * "from" as the last slice we checked to find the next one. Otherwise
|
|
325 * set "from" to be NULL to find the first one.
|
|
326 *
|
|
327 * Returns the cram_index pointer on sucess
|
|
328 * NULL on failure
|
|
329 */
|
|
330 cram_index *cram_index_query(cram_fd *fd, int refid, int pos,
|
|
331 cram_index *from) {
|
|
332 int i, j, k;
|
|
333 cram_index *e;
|
|
334
|
|
335 if (refid+1 < 0 || refid+1 >= fd->index_sz)
|
|
336 return NULL;
|
|
337
|
|
338 i = 0, j = fd->index[refid+1].nslice-1;
|
|
339
|
|
340 if (!from)
|
|
341 from = &fd->index[refid+1];
|
|
342
|
|
343 for (k = j/2; k != i; k = (j-i)/2 + i) {
|
|
344 if (from->e[k].refid > refid) {
|
|
345 j = k;
|
|
346 continue;
|
|
347 }
|
|
348
|
|
349 if (from->e[k].refid < refid) {
|
|
350 i = k;
|
|
351 continue;
|
|
352 }
|
|
353
|
|
354 if (from->e[k].start >= pos) {
|
|
355 j = k;
|
|
356 continue;
|
|
357 }
|
|
358
|
|
359 if (from->e[k].start < pos) {
|
|
360 i = k;
|
|
361 continue;
|
|
362 }
|
|
363 }
|
|
364 // i==j or i==j-1. Check if j is better.
|
|
365 if (from->e[j].start < pos && from->e[j].refid == refid)
|
|
366 i = j;
|
|
367
|
|
368 /* The above found *a* bin overlapping, but not necessarily the first */
|
|
369 while (i > 0 && from->e[i-1].end >= pos)
|
|
370 i--;
|
|
371
|
|
372 /* Special case for matching a start pos */
|
|
373 if (i+1 < from->nslice &&
|
|
374 from->e[i+1].start == pos &&
|
|
375 from->e[i+1].refid == refid)
|
|
376 i++;
|
|
377
|
|
378 e = &from->e[i];
|
|
379
|
|
380 return e;
|
|
381 }
|
|
382
|
|
383
|
|
384 /*
|
|
385 * Skips to a container overlapping the start coordinate listed in
|
|
386 * cram_range.
|
|
387 *
|
|
388 * In theory we call cram_index_query multiple times, once per slice
|
|
389 * overlapping the range. However slices may be absent from the index
|
|
390 * which makes this problematic. Instead we find the left-most slice
|
|
391 * and then read from then on, skipping decoding of slices and/or
|
|
392 * whole containers when they don't overlap the specified cram_range.
|
|
393 *
|
|
394 * Returns 0 on success
|
|
395 * -1 on failure
|
|
396 */
|
|
397 int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
|
|
398 cram_index *e;
|
|
399
|
|
400 // Ideally use an index, so see if we have one.
|
|
401 if ((e = cram_index_query(fd, r->refid, r->start, NULL))) {
|
|
402 if (0 != cram_seek(fd, e->offset, SEEK_SET))
|
|
403 if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR))
|
|
404 return -1;
|
|
405 } else {
|
|
406 fprintf(stderr, "Unknown reference ID. Missing from index?\n");
|
|
407 return -1;
|
|
408 }
|
|
409
|
|
410 if (fd->ctr) {
|
|
411 cram_free_container(fd->ctr);
|
|
412 fd->ctr = NULL;
|
|
413 fd->ooc = 0;
|
|
414 }
|
|
415
|
|
416 return 0;
|
|
417 }
|
|
418
|
|
419
|
|
420 /*
|
|
421 * A specialised form of cram_index_build (below) that deals with slices
|
|
422 * having multiple references in this (ref_id -2). In this scenario we
|
|
423 * decode the slice to look at the RI data series instead.
|
|
424 *
|
|
425 * Returns 0 on success
|
|
426 * -1 on failure
|
|
427 */
|
|
428 static int cram_index_build_multiref(cram_fd *fd,
|
|
429 cram_container *c,
|
|
430 cram_slice *s,
|
|
431 zfp *fp,
|
|
432 off_t cpos,
|
|
433 int32_t landmark,
|
|
434 int sz) {
|
|
435 int i, ref = -2, ref_start = 0, ref_end;
|
|
436 char buf[1024];
|
|
437
|
|
438 if (0 != cram_decode_slice(fd, c, s, fd->header))
|
|
439 return -1;
|
|
440
|
|
441 ref_end = INT_MIN;
|
|
442 for (i = 0; i < s->hdr->num_records; i++) {
|
|
443 if (s->crecs[i].ref_id == ref) {
|
|
444 if (ref_end < s->crecs[i].aend)
|
|
445 ref_end = s->crecs[i].aend;
|
|
446 continue;
|
|
447 }
|
|
448
|
|
449 if (ref != -2) {
|
|
450 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
|
|
451 ref, ref_start, ref_end - ref_start + 1,
|
|
452 (int64_t)cpos, landmark, sz);
|
|
453 zfputs(buf, fp);
|
|
454 }
|
|
455
|
|
456 ref = s->crecs[i].ref_id;
|
|
457 ref_start = s->crecs[i].apos;
|
|
458 ref_end = INT_MIN;
|
|
459 }
|
|
460
|
|
461 if (ref != -2) {
|
|
462 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
|
|
463 ref, ref_start, ref_end - ref_start + 1,
|
|
464 (int64_t)cpos, landmark, sz);
|
|
465 zfputs(buf, fp);
|
|
466 }
|
|
467
|
|
468 return 0;
|
|
469 }
|
|
470
|
|
471 /*
|
|
472 * Builds an index file.
|
|
473 *
|
|
474 * fd is a newly opened cram file that we wish to index.
|
|
475 * fn_base is the filename of the associated CRAM file. Internally we
|
|
476 * add ".crai" to this to get the index filename.
|
|
477 *
|
|
478 * Returns 0 on success
|
|
479 * -1 on failure
|
|
480 */
|
|
481 int cram_index_build(cram_fd *fd, const char *fn_base) {
|
|
482 cram_container *c;
|
|
483 off_t cpos, spos, hpos;
|
|
484 zfp *fp;
|
|
485 char fn_idx[PATH_MAX];
|
|
486
|
|
487 if (strlen(fn_base) > PATH_MAX-6)
|
|
488 return -1;
|
|
489
|
|
490 sprintf(fn_idx, "%s.crai", fn_base);
|
|
491 if (!(fp = zfopen(fn_idx, "wz"))) {
|
|
492 perror(fn_idx);
|
|
493 return -1;
|
|
494 }
|
|
495
|
|
496 cpos = htell(fd->fp);
|
|
497 while ((c = cram_read_container(fd))) {
|
|
498 int j;
|
|
499
|
|
500 if (fd->err) {
|
|
501 perror("Cram container read");
|
|
502 return 1;
|
|
503 }
|
|
504
|
|
505 hpos = htell(fd->fp);
|
|
506
|
|
507 if (!(c->comp_hdr_block = cram_read_block(fd)))
|
|
508 return 1;
|
|
509 assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);
|
|
510
|
|
511 c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
|
|
512 if (!c->comp_hdr)
|
|
513 return -1;
|
|
514
|
|
515 // 2.0 format
|
|
516 for (j = 0; j < c->num_landmarks; j++) {
|
|
517 char buf[1024];
|
|
518 cram_slice *s;
|
|
519 int sz;
|
|
520
|
|
521 spos = htell(fd->fp);
|
|
522 assert(spos - cpos - c->offset == c->landmark[j]);
|
|
523
|
|
524 if (!(s = cram_read_slice(fd))) {
|
|
525 zfclose(fp);
|
|
526 return -1;
|
|
527 }
|
|
528
|
|
529 sz = (int)(htell(fd->fp) - spos);
|
|
530
|
|
531 if (s->hdr->ref_seq_id == -2) {
|
|
532 cram_index_build_multiref(fd, c, s, fp,
|
|
533 cpos, c->landmark[j], sz);
|
|
534 } else {
|
|
535 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
|
|
536 s->hdr->ref_seq_id, s->hdr->ref_seq_start,
|
|
537 s->hdr->ref_seq_span, (int64_t)cpos,
|
|
538 c->landmark[j], sz);
|
|
539 zfputs(buf, fp);
|
|
540 }
|
|
541
|
|
542 cram_free_slice(s);
|
|
543 }
|
|
544
|
|
545 cpos = htell(fd->fp);
|
|
546 assert(cpos == hpos + c->length);
|
|
547
|
|
548 cram_free_container(c);
|
|
549 }
|
|
550 if (fd->err) {
|
|
551 zfclose(fp);
|
|
552 return -1;
|
|
553 }
|
|
554
|
|
555
|
|
556 return zfclose(fp);
|
|
557 }
|