0
|
1 #include <stdlib.h>
|
|
2 #include <stdint.h>
|
|
3 #include <string.h>
|
|
4 #include <stdio.h>
|
|
5 #include <zlib.h>
|
|
6
|
|
7 #include "ksort.h"
|
|
8 KSORT_INIT_GENERIC(uint64_t)
|
|
9
|
|
10 #include "kseq.h"
|
|
11 KSTREAM_INIT(gzFile, gzread, 8192)
|
|
12
|
|
13 typedef struct {
|
|
14 int n, m;
|
|
15 uint64_t *a;
|
|
16 int *idx;
|
|
17 } bed_reglist_t;
|
|
18
|
|
19 #include "khash.h"
|
|
20 KHASH_MAP_INIT_STR(reg, bed_reglist_t)
|
|
21
|
|
22 #define LIDX_SHIFT 13
|
|
23
|
|
24 typedef kh_reg_t reghash_t;
|
|
25
|
|
26 int *bed_index_core(int n, uint64_t *a, int *n_idx)
|
|
27 {
|
|
28 int i, j, m, *idx;
|
|
29 m = *n_idx = 0; idx = 0;
|
|
30 for (i = 0; i < n; ++i) {
|
|
31 int beg, end;
|
|
32 beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT;
|
|
33 if (m < end + 1) {
|
|
34 int oldm = m;
|
|
35 m = end + 1;
|
|
36 kroundup32(m);
|
|
37 idx = realloc(idx, m * sizeof(int));
|
|
38 for (j = oldm; j < m; ++j) idx[j] = -1;
|
|
39 }
|
|
40 if (beg == end) {
|
|
41 if (idx[beg] < 0) idx[beg] = i;
|
|
42 } else {
|
|
43 for (j = beg; j <= end; ++j)
|
|
44 if (idx[j] < 0) idx[j] = i;
|
|
45 }
|
|
46 *n_idx = end + 1;
|
|
47 }
|
|
48 return idx;
|
|
49 }
|
|
50
|
|
51 void bed_index(void *_h)
|
|
52 {
|
|
53 reghash_t *h = (reghash_t*)_h;
|
|
54 khint_t k;
|
|
55 for (k = 0; k < kh_end(h); ++k) {
|
|
56 if (kh_exist(h, k)) {
|
|
57 bed_reglist_t *p = &kh_val(h, k);
|
|
58 if (p->idx) free(p->idx);
|
|
59 ks_introsort(uint64_t, p->n, p->a);
|
|
60 p->idx = bed_index_core(p->n, p->a, &p->m);
|
|
61 }
|
|
62 }
|
|
63 }
|
|
64
|
|
65 int bed_overlap_core(const bed_reglist_t *p, int beg, int end)
|
|
66 {
|
|
67 int i, min_off;
|
|
68 if (p->n == 0) return 0;
|
|
69 min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT];
|
|
70 if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here
|
|
71 int n = beg>>LIDX_SHIFT;
|
|
72 if (n > p->n) n = p->n;
|
|
73 for (i = n - 1; i >= 0; --i)
|
|
74 if (p->idx[i] >= 0) break;
|
|
75 min_off = i >= 0? p->idx[i] : 0;
|
|
76 }
|
|
77 for (i = min_off; i < p->n; ++i) {
|
|
78 if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed
|
|
79 if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end)
|
|
80 return 1; // find the overlap; return
|
|
81 }
|
|
82 return 0;
|
|
83 }
|
|
84
|
|
85 int bed_overlap(const void *_h, const char *chr, int beg, int end)
|
|
86 {
|
|
87 const reghash_t *h = (const reghash_t*)_h;
|
|
88 khint_t k;
|
|
89 if (!h) return 0;
|
|
90 k = kh_get(reg, h, chr);
|
|
91 if (k == kh_end(h)) return 0;
|
|
92 return bed_overlap_core(&kh_val(h, k), beg, end);
|
|
93 }
|
|
94
|
|
95 void *bed_read(const char *fn)
|
|
96 {
|
|
97 reghash_t *h = kh_init(reg);
|
|
98 gzFile fp;
|
|
99 kstream_t *ks;
|
|
100 int dret;
|
|
101 kstring_t *str;
|
|
102 // read the list
|
|
103 fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
|
|
104 if (fp == 0) return 0;
|
|
105 str = calloc(1, sizeof(kstring_t));
|
|
106 ks = ks_init(fp);
|
|
107 while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name
|
|
108 int beg = -1, end = -1;
|
|
109 bed_reglist_t *p;
|
|
110 khint_t k = kh_get(reg, h, str->s);
|
|
111 if (k == kh_end(h)) { // absent from the hash table
|
|
112 int ret;
|
|
113 char *s = strdup(str->s);
|
|
114 k = kh_put(reg, h, s, &ret);
|
|
115 memset(&kh_val(h, k), 0, sizeof(bed_reglist_t));
|
|
116 }
|
|
117 p = &kh_val(h, k);
|
|
118 if (dret != '\n') { // if the lines has other characters
|
|
119 if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
|
|
120 beg = atoi(str->s); // begin
|
|
121 if (dret != '\n') {
|
|
122 if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0]))
|
|
123 end = atoi(str->s); // end
|
|
124 }
|
|
125 }
|
|
126 }
|
|
127 if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line
|
|
128 if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
|
|
129 if (beg >= 0 && end > beg) {
|
|
130 if (p->n == p->m) {
|
|
131 p->m = p->m? p->m<<1 : 4;
|
|
132 p->a = realloc(p->a, p->m * 8);
|
|
133 }
|
|
134 p->a[p->n++] = (uint64_t)beg<<32 | end;
|
|
135 }
|
|
136 }
|
|
137 ks_destroy(ks);
|
|
138 gzclose(fp);
|
|
139 free(str->s); free(str);
|
|
140 bed_index(h);
|
|
141 return h;
|
|
142 }
|
|
143
|
|
144 void bed_destroy(void *_h)
|
|
145 {
|
|
146 reghash_t *h = (reghash_t*)_h;
|
|
147 khint_t k;
|
|
148 for (k = 0; k < kh_end(h); ++k) {
|
|
149 if (kh_exist(h, k)) {
|
|
150 free(kh_val(h, k).a);
|
|
151 free(kh_val(h, k).idx);
|
|
152 free((char*)kh_key(h, k));
|
|
153 }
|
|
154 }
|
|
155 kh_destroy(reg, h);
|
|
156 }
|