annotate PsiCLASS-1.0.2/samtools-0.1.19/bedidx.c @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1 #include <stdlib.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
2 #include <stdint.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
3 #include <string.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
4 #include <stdio.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
5 #include <zlib.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
6
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
7 #ifdef _WIN32
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
8 #define drand48() ((double)rand() / RAND_MAX)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
9 #endif
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
10
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
11 #include "ksort.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
12 KSORT_INIT_GENERIC(uint64_t)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
13
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
14 #include "kseq.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
15 KSTREAM_INIT(gzFile, gzread, 8192)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
16
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
17 typedef struct {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
18 int n, m;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
19 uint64_t *a;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
20 int *idx;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
21 } bed_reglist_t;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
22
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
23 #include "khash.h"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
24 KHASH_MAP_INIT_STR(reg, bed_reglist_t)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
25
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
26 #define LIDX_SHIFT 13
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
27
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
28 typedef kh_reg_t reghash_t;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
29
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
30 int *bed_index_core(int n, uint64_t *a, int *n_idx)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
31 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
32 int i, j, m, *idx;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
33 m = *n_idx = 0; idx = 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
34 for (i = 0; i < n; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
35 int beg, end;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
36 beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
37 if (m < end + 1) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
38 int oldm = m;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
39 m = end + 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
40 kroundup32(m);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
41 idx = realloc(idx, m * sizeof(int));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
42 for (j = oldm; j < m; ++j) idx[j] = -1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
43 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
44 if (beg == end) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
45 if (idx[beg] < 0) idx[beg] = i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
46 } else {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
47 for (j = beg; j <= end; ++j)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
48 if (idx[j] < 0) idx[j] = i;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
49 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
50 *n_idx = end + 1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
51 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
52 return idx;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
53 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
54
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
55 void bed_index(void *_h)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
56 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
57 reghash_t *h = (reghash_t*)_h;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
58 khint_t k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
59 for (k = 0; k < kh_end(h); ++k) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
60 if (kh_exist(h, k)) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
61 bed_reglist_t *p = &kh_val(h, k);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
62 if (p->idx) free(p->idx);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
63 ks_introsort(uint64_t, p->n, p->a);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
64 p->idx = bed_index_core(p->n, p->a, &p->m);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
65 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
66 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
67 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
68
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
69 int bed_overlap_core(const bed_reglist_t *p, int beg, int end)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
70 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
71 int i, min_off;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
72 if (p->n == 0) return 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
73 min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT];
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
74 if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
75 int n = beg>>LIDX_SHIFT;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
76 if (n > p->n) n = p->n;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
77 for (i = n - 1; i >= 0; --i)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
78 if (p->idx[i] >= 0) break;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
79 min_off = i >= 0? p->idx[i] : 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
80 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
81 for (i = min_off; i < p->n; ++i) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
82 if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
83 if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
84 return 1; // find the overlap; return
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
85 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
86 return 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
87 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
88
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
89 int bed_overlap(const void *_h, const char *chr, int beg, int end)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
90 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
91 const reghash_t *h = (const reghash_t*)_h;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
92 khint_t k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
93 if (!h) return 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
94 k = kh_get(reg, h, chr);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
95 if (k == kh_end(h)) return 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
96 return bed_overlap_core(&kh_val(h, k), beg, end);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
97 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
98
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
99 void *bed_read(const char *fn)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
100 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
101 reghash_t *h = kh_init(reg);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
102 gzFile fp;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
103 kstream_t *ks;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
104 int dret;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
105 kstring_t *str;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
106 // read the list
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
107 fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
108 if (fp == 0) return 0;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
109 str = calloc(1, sizeof(kstring_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
110 ks = ks_init(fp);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
111 while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
112 int beg = -1, end = -1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
113 bed_reglist_t *p;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
114 khint_t k = kh_get(reg, h, str->s);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
115 if (k == kh_end(h)) { // absent from the hash table
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
116 int ret;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
117 char *s = strdup(str->s);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
118 k = kh_put(reg, h, s, &ret);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
119 memset(&kh_val(h, k), 0, sizeof(bed_reglist_t));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
120 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
121 p = &kh_val(h, k);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
122 if (dret != '\n') { // if the lines has other characters
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
123 if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
124 beg = atoi(str->s); // begin
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
125 if (dret != '\n') {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
126 if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
127 end = atoi(str->s); // end
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
128 if (end < beg) end = -1;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
129 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
130 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
131 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
132 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
133 if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
134 if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
135 if (beg >= 0 && end > beg) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
136 if (p->n == p->m) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
137 p->m = p->m? p->m<<1 : 4;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
138 p->a = realloc(p->a, p->m * 8);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
139 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
140 p->a[p->n++] = (uint64_t)beg<<32 | end;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
141 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
142 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
143 ks_destroy(ks);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
144 gzclose(fp);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
145 free(str->s); free(str);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
146 bed_index(h);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
147 return h;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
148 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
149
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
150 void bed_destroy(void *_h)
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
151 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
152 reghash_t *h = (reghash_t*)_h;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
153 khint_t k;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
154 for (k = 0; k < kh_end(h); ++k) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
155 if (kh_exist(h, k)) {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
156 free(kh_val(h, k).a);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
157 free(kh_val(h, k).idx);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
158 free((char*)kh_key(h, k));
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
159 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
160 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
161 kh_destroy(reg, h);
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
162 }