Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/bedidx.c @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #include <stdlib.h> | |
2 #include <stdint.h> | |
3 #include <string.h> | |
4 #include <stdio.h> | |
5 #include <zlib.h> | |
6 | |
7 #ifdef _WIN32 | |
8 #define drand48() ((double)rand() / RAND_MAX) | |
9 #endif | |
10 | |
11 #include "ksort.h" | |
12 KSORT_INIT_GENERIC(uint64_t) | |
13 | |
14 #include "kseq.h" | |
15 KSTREAM_INIT(gzFile, gzread, 8192) | |
16 | |
17 typedef struct { | |
18 int n, m; | |
19 uint64_t *a; | |
20 int *idx; | |
21 } bed_reglist_t; | |
22 | |
23 #include "khash.h" | |
24 KHASH_MAP_INIT_STR(reg, bed_reglist_t) | |
25 | |
26 #define LIDX_SHIFT 13 | |
27 | |
28 typedef kh_reg_t reghash_t; | |
29 | |
30 int *bed_index_core(int n, uint64_t *a, int *n_idx) | |
31 { | |
32 int i, j, m, *idx; | |
33 m = *n_idx = 0; idx = 0; | |
34 for (i = 0; i < n; ++i) { | |
35 int beg, end; | |
36 beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; | |
37 if (m < end + 1) { | |
38 int oldm = m; | |
39 m = end + 1; | |
40 kroundup32(m); | |
41 idx = realloc(idx, m * sizeof(int)); | |
42 for (j = oldm; j < m; ++j) idx[j] = -1; | |
43 } | |
44 if (beg == end) { | |
45 if (idx[beg] < 0) idx[beg] = i; | |
46 } else { | |
47 for (j = beg; j <= end; ++j) | |
48 if (idx[j] < 0) idx[j] = i; | |
49 } | |
50 *n_idx = end + 1; | |
51 } | |
52 return idx; | |
53 } | |
54 | |
55 void bed_index(void *_h) | |
56 { | |
57 reghash_t *h = (reghash_t*)_h; | |
58 khint_t k; | |
59 for (k = 0; k < kh_end(h); ++k) { | |
60 if (kh_exist(h, k)) { | |
61 bed_reglist_t *p = &kh_val(h, k); | |
62 if (p->idx) free(p->idx); | |
63 ks_introsort(uint64_t, p->n, p->a); | |
64 p->idx = bed_index_core(p->n, p->a, &p->m); | |
65 } | |
66 } | |
67 } | |
68 | |
69 int bed_overlap_core(const bed_reglist_t *p, int beg, int end) | |
70 { | |
71 int i, min_off; | |
72 if (p->n == 0) return 0; | |
73 min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT]; | |
74 if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here | |
75 int n = beg>>LIDX_SHIFT; | |
76 if (n > p->n) n = p->n; | |
77 for (i = n - 1; i >= 0; --i) | |
78 if (p->idx[i] >= 0) break; | |
79 min_off = i >= 0? p->idx[i] : 0; | |
80 } | |
81 for (i = min_off; i < p->n; ++i) { | |
82 if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed | |
83 if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end) | |
84 return 1; // find the overlap; return | |
85 } | |
86 return 0; | |
87 } | |
88 | |
89 int bed_overlap(const void *_h, const char *chr, int beg, int end) | |
90 { | |
91 const reghash_t *h = (const reghash_t*)_h; | |
92 khint_t k; | |
93 if (!h) return 0; | |
94 k = kh_get(reg, h, chr); | |
95 if (k == kh_end(h)) return 0; | |
96 return bed_overlap_core(&kh_val(h, k), beg, end); | |
97 } | |
98 | |
99 void *bed_read(const char *fn) | |
100 { | |
101 reghash_t *h = kh_init(reg); | |
102 gzFile fp; | |
103 kstream_t *ks; | |
104 int dret; | |
105 kstring_t *str; | |
106 // read the list | |
107 fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); | |
108 if (fp == 0) return 0; | |
109 str = calloc(1, sizeof(kstring_t)); | |
110 ks = ks_init(fp); | |
111 while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name | |
112 int beg = -1, end = -1; | |
113 bed_reglist_t *p; | |
114 khint_t k = kh_get(reg, h, str->s); | |
115 if (k == kh_end(h)) { // absent from the hash table | |
116 int ret; | |
117 char *s = strdup(str->s); | |
118 k = kh_put(reg, h, s, &ret); | |
119 memset(&kh_val(h, k), 0, sizeof(bed_reglist_t)); | |
120 } | |
121 p = &kh_val(h, k); | |
122 if (dret != '\n') { // if the lines has other characters | |
123 if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { | |
124 beg = atoi(str->s); // begin | |
125 if (dret != '\n') { | |
126 if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { | |
127 end = atoi(str->s); // end | |
128 if (end < beg) end = -1; | |
129 } | |
130 } | |
131 } | |
132 } | |
133 if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line | |
134 if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column | |
135 if (beg >= 0 && end > beg) { | |
136 if (p->n == p->m) { | |
137 p->m = p->m? p->m<<1 : 4; | |
138 p->a = realloc(p->a, p->m * 8); | |
139 } | |
140 p->a[p->n++] = (uint64_t)beg<<32 | end; | |
141 } | |
142 } | |
143 ks_destroy(ks); | |
144 gzclose(fp); | |
145 free(str->s); free(str); | |
146 bed_index(h); | |
147 return h; | |
148 } | |
149 | |
150 void bed_destroy(void *_h) | |
151 { | |
152 reghash_t *h = (reghash_t*)_h; | |
153 khint_t k; | |
154 for (k = 0; k < kh_end(h); ++k) { | |
155 if (kh_exist(h, k)) { | |
156 free(kh_val(h, k).a); | |
157 free(kh_val(h, k).idx); | |
158 free((char*)kh_key(h, k)); | |
159 } | |
160 } | |
161 kh_destroy(reg, h); | |
162 } |