Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/bcftools/index.c @ 0:903fc43d6227 draft default tip
Uploaded
| author | lsong10 |
|---|---|
| date | Fri, 26 Mar 2021 16:52:45 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:903fc43d6227 |
|---|---|
| 1 #include <assert.h> | |
| 2 #include <ctype.h> | |
| 3 #include <sys/stat.h> | |
| 4 #include "bam_endian.h" | |
| 5 #include "kstring.h" | |
| 6 #include "bcf.h" | |
| 7 #ifdef _USE_KNETFILE | |
| 8 #include "knetfile.h" | |
| 9 #endif | |
| 10 | |
| 11 #define TAD_LIDX_SHIFT 13 | |
| 12 | |
| 13 typedef struct { | |
| 14 int32_t n, m; | |
| 15 uint64_t *offset; | |
| 16 } bcf_lidx_t; | |
| 17 | |
| 18 struct __bcf_idx_t { | |
| 19 int32_t n; | |
| 20 bcf_lidx_t *index2; | |
| 21 }; | |
| 22 | |
| 23 /************ | |
| 24 * indexing * | |
| 25 ************/ | |
| 26 | |
| 27 static inline void insert_offset2(bcf_lidx_t *index2, int _beg, int _end, uint64_t offset) | |
| 28 { | |
| 29 int i, beg, end; | |
| 30 beg = _beg >> TAD_LIDX_SHIFT; | |
| 31 end = (_end - 1) >> TAD_LIDX_SHIFT; | |
| 32 if (index2->m < end + 1) { | |
| 33 int old_m = index2->m; | |
| 34 index2->m = end + 1; | |
| 35 kroundup32(index2->m); | |
| 36 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); | |
| 37 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); | |
| 38 } | |
| 39 if (beg == end) { | |
| 40 if (index2->offset[beg] == 0) index2->offset[beg] = offset; | |
| 41 } else { | |
| 42 for (i = beg; i <= end; ++i) | |
| 43 if (index2->offset[i] == 0) index2->offset[i] = offset; | |
| 44 } | |
| 45 if (index2->n < end + 1) index2->n = end + 1; | |
| 46 } | |
| 47 | |
| 48 bcf_idx_t *bcf_idx_core(bcf_t *bp, bcf_hdr_t *h) | |
| 49 { | |
| 50 bcf_idx_t *idx; | |
| 51 int32_t last_coor, last_tid; | |
| 52 uint64_t last_off; | |
| 53 kstring_t *str; | |
| 54 BGZF *fp = bp->fp; | |
| 55 bcf1_t *b; | |
| 56 int ret; | |
| 57 | |
| 58 b = calloc(1, sizeof(bcf1_t)); | |
| 59 str = calloc(1, sizeof(kstring_t)); | |
| 60 idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t)); | |
| 61 idx->n = h->n_ref; | |
| 62 idx->index2 = calloc(h->n_ref, sizeof(bcf_lidx_t)); | |
| 63 | |
| 64 last_tid = 0xffffffffu; | |
| 65 last_off = bgzf_tell(fp); last_coor = 0xffffffffu; | |
| 66 while ((ret = bcf_read(bp, h, b)) > 0) { | |
| 67 int end, tmp; | |
| 68 if (last_tid != b->tid) { // change of chromosomes | |
| 69 last_tid = b->tid; | |
| 70 } else if (last_coor > b->pos) { | |
| 71 fprintf(stderr, "[bcf_idx_core] the input is out of order\n"); | |
| 72 free(str->s); free(str); free(idx); bcf_destroy(b); | |
| 73 return 0; | |
| 74 } | |
| 75 tmp = strlen(b->ref); | |
| 76 end = b->pos + (tmp > 0? tmp : 1); | |
| 77 insert_offset2(&idx->index2[b->tid], b->pos, end, last_off); | |
| 78 last_off = bgzf_tell(fp); | |
| 79 last_coor = b->pos; | |
| 80 } | |
| 81 free(str->s); free(str); bcf_destroy(b); | |
| 82 return idx; | |
| 83 } | |
| 84 | |
| 85 void bcf_idx_destroy(bcf_idx_t *idx) | |
| 86 { | |
| 87 int i; | |
| 88 if (idx == 0) return; | |
| 89 for (i = 0; i < idx->n; ++i) free(idx->index2[i].offset); | |
| 90 free(idx->index2); | |
| 91 free(idx); | |
| 92 } | |
| 93 | |
| 94 /****************** | |
| 95 * index file I/O * | |
| 96 ******************/ | |
| 97 | |
| 98 void bcf_idx_save(const bcf_idx_t *idx, BGZF *fp) | |
| 99 { | |
| 100 int32_t i, ti_is_be; | |
| 101 ti_is_be = bam_is_big_endian(); | |
| 102 bgzf_write(fp, "BCI\4", 4); | |
| 103 if (ti_is_be) { | |
| 104 uint32_t x = idx->n; | |
| 105 bgzf_write(fp, bam_swap_endian_4p(&x), 4); | |
| 106 } else bgzf_write(fp, &idx->n, 4); | |
| 107 for (i = 0; i < idx->n; ++i) { | |
| 108 bcf_lidx_t *index2 = idx->index2 + i; | |
| 109 // write linear index (index2) | |
| 110 if (ti_is_be) { | |
| 111 int x = index2->n; | |
| 112 bgzf_write(fp, bam_swap_endian_4p(&x), 4); | |
| 113 } else bgzf_write(fp, &index2->n, 4); | |
| 114 if (ti_is_be) { // big endian | |
| 115 int x; | |
| 116 for (x = 0; (int)x < index2->n; ++x) | |
| 117 bam_swap_endian_8p(&index2->offset[x]); | |
| 118 bgzf_write(fp, index2->offset, 8 * index2->n); | |
| 119 for (x = 0; (int)x < index2->n; ++x) | |
| 120 bam_swap_endian_8p(&index2->offset[x]); | |
| 121 } else bgzf_write(fp, index2->offset, 8 * index2->n); | |
| 122 } | |
| 123 } | |
| 124 | |
| 125 static bcf_idx_t *bcf_idx_load_core(BGZF *fp) | |
| 126 { | |
| 127 int i, ti_is_be; | |
| 128 char magic[4]; | |
| 129 bcf_idx_t *idx; | |
| 130 ti_is_be = bam_is_big_endian(); | |
| 131 if (fp == 0) { | |
| 132 fprintf(stderr, "[%s] fail to load index.\n", __func__); | |
| 133 return 0; | |
| 134 } | |
| 135 bgzf_read(fp, magic, 4); | |
| 136 if (strncmp(magic, "BCI\4", 4)) { | |
| 137 fprintf(stderr, "[%s] wrong magic number.\n", __func__); | |
| 138 return 0; | |
| 139 } | |
| 140 idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t)); | |
| 141 bgzf_read(fp, &idx->n, 4); | |
| 142 if (ti_is_be) bam_swap_endian_4p(&idx->n); | |
| 143 idx->index2 = (bcf_lidx_t*)calloc(idx->n, sizeof(bcf_lidx_t)); | |
| 144 for (i = 0; i < idx->n; ++i) { | |
| 145 bcf_lidx_t *index2 = idx->index2 + i; | |
| 146 int j; | |
| 147 bgzf_read(fp, &index2->n, 4); | |
| 148 if (ti_is_be) bam_swap_endian_4p(&index2->n); | |
| 149 index2->m = index2->n; | |
| 150 index2->offset = (uint64_t*)calloc(index2->m, 8); | |
| 151 bgzf_read(fp, index2->offset, index2->n * 8); | |
| 152 if (ti_is_be) | |
| 153 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]); | |
| 154 } | |
| 155 return idx; | |
| 156 } | |
| 157 | |
| 158 bcf_idx_t *bcf_idx_load_local(const char *fnidx) | |
| 159 { | |
| 160 BGZF *fp; | |
| 161 fp = bgzf_open(fnidx, "r"); | |
| 162 if (fp) { | |
| 163 bcf_idx_t *idx = bcf_idx_load_core(fp); | |
| 164 bgzf_close(fp); | |
| 165 return idx; | |
| 166 } else return 0; | |
| 167 } | |
| 168 | |
| 169 #ifdef _USE_KNETFILE | |
| 170 static void download_from_remote(const char *url) | |
| 171 { | |
| 172 const int buf_size = 1 * 1024 * 1024; | |
| 173 char *fn; | |
| 174 FILE *fp; | |
| 175 uint8_t *buf; | |
| 176 knetFile *fp_remote; | |
| 177 int l; | |
| 178 if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return; | |
| 179 l = strlen(url); | |
| 180 for (fn = (char*)url + l - 1; fn >= url; --fn) | |
| 181 if (*fn == '/') break; | |
| 182 ++fn; // fn now points to the file name | |
| 183 fp_remote = knet_open(url, "r"); | |
| 184 if (fp_remote == 0) { | |
| 185 fprintf(stderr, "[download_from_remote] fail to open remote file.\n"); | |
| 186 return; | |
| 187 } | |
| 188 if ((fp = fopen(fn, "w")) == 0) { | |
| 189 fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n"); | |
| 190 knet_close(fp_remote); | |
| 191 return; | |
| 192 } | |
| 193 buf = (uint8_t*)calloc(buf_size, 1); | |
| 194 while ((l = knet_read(fp_remote, buf, buf_size)) != 0) | |
| 195 fwrite(buf, 1, l, fp); | |
| 196 free(buf); | |
| 197 fclose(fp); | |
| 198 knet_close(fp_remote); | |
| 199 } | |
| 200 #else | |
| 201 static void download_from_remote(const char *url) | |
| 202 { | |
| 203 return; | |
| 204 } | |
| 205 #endif | |
| 206 | |
| 207 static char *get_local_version(const char *fn) | |
| 208 { | |
| 209 struct stat sbuf; | |
| 210 char *fnidx = (char*)calloc(strlen(fn) + 5, 1); | |
| 211 strcat(strcpy(fnidx, fn), ".bci"); | |
| 212 if ((strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx)) { | |
| 213 char *p, *url; | |
| 214 int l = strlen(fnidx); | |
| 215 for (p = fnidx + l - 1; p >= fnidx; --p) | |
| 216 if (*p == '/') break; | |
| 217 url = fnidx; fnidx = strdup(p + 1); | |
| 218 if (stat(fnidx, &sbuf) == 0) { | |
| 219 free(url); | |
| 220 return fnidx; | |
| 221 } | |
| 222 fprintf(stderr, "[%s] downloading the index file...\n", __func__); | |
| 223 download_from_remote(url); | |
| 224 free(url); | |
| 225 } | |
| 226 if (stat(fnidx, &sbuf) == 0) return fnidx; | |
| 227 free(fnidx); return 0; | |
| 228 } | |
| 229 | |
| 230 bcf_idx_t *bcf_idx_load(const char *fn) | |
| 231 { | |
| 232 bcf_idx_t *idx; | |
| 233 char *fname = get_local_version(fn); | |
| 234 if (fname == 0) return 0; | |
| 235 idx = bcf_idx_load_local(fname); | |
| 236 free(fname); | |
| 237 return idx; | |
| 238 } | |
| 239 | |
| 240 int bcf_idx_build2(const char *fn, const char *_fnidx) | |
| 241 { | |
| 242 char *fnidx; | |
| 243 BGZF *fpidx; | |
| 244 bcf_t *bp; | |
| 245 bcf_idx_t *idx; | |
| 246 bcf_hdr_t *h; | |
| 247 if ((bp = bcf_open(fn, "r")) == 0) { | |
| 248 fprintf(stderr, "[bcf_idx_build2] fail to open the BAM file.\n"); | |
| 249 return -1; | |
| 250 } | |
| 251 h = bcf_hdr_read(bp); | |
| 252 idx = bcf_idx_core(bp, h); | |
| 253 bcf_close(bp); | |
| 254 if (_fnidx == 0) { | |
| 255 fnidx = (char*)calloc(strlen(fn) + 5, 1); | |
| 256 strcpy(fnidx, fn); strcat(fnidx, ".bci"); | |
| 257 } else fnidx = strdup(_fnidx); | |
| 258 fpidx = bgzf_open(fnidx, "w"); | |
| 259 if (fpidx == 0) { | |
| 260 fprintf(stderr, "[bcf_idx_build2] fail to create the index file.\n"); | |
| 261 free(fnidx); | |
| 262 bcf_idx_destroy(idx); | |
| 263 return -1; | |
| 264 } | |
| 265 bcf_idx_save(idx, fpidx); | |
| 266 bcf_idx_destroy(idx); | |
| 267 bgzf_close(fpidx); | |
| 268 free(fnidx); | |
| 269 return 0; | |
| 270 } | |
| 271 | |
| 272 int bcf_idx_build(const char *fn) | |
| 273 { | |
| 274 return bcf_idx_build2(fn, 0); | |
| 275 } | |
| 276 | |
| 277 /******************************************** | |
| 278 * parse a region in the format chr:beg-end * | |
| 279 ********************************************/ | |
| 280 | |
| 281 int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end) | |
| 282 { | |
| 283 char *s, *p; | |
| 284 int i, l, k; | |
| 285 l = strlen(str); | |
| 286 p = s = (char*)malloc(l+1); | |
| 287 /* squeeze out "," */ | |
| 288 for (i = k = 0; i != l; ++i) | |
| 289 if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; | |
| 290 s[k] = 0; | |
| 291 for (i = 0; i != k; ++i) if (s[i] == ':') break; | |
| 292 s[i] = 0; | |
| 293 if ((*tid = bcf_str2id(str2id, s)) < 0) { | |
| 294 free(s); | |
| 295 return -1; | |
| 296 } | |
| 297 if (i == k) { /* dump the whole sequence */ | |
| 298 *begin = 0; *end = 1<<29; free(s); | |
| 299 return 0; | |
| 300 } | |
| 301 for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; | |
| 302 *begin = atoi(p); | |
| 303 if (i < k) { | |
| 304 p = s + i + 1; | |
| 305 *end = atoi(p); | |
| 306 } else *end = 1<<29; | |
| 307 if (*begin > 0) --*begin; | |
| 308 free(s); | |
| 309 if (*begin > *end) return -1; | |
| 310 return 0; | |
| 311 } | |
| 312 | |
| 313 /******************************* | |
| 314 * retrieve a specified region * | |
| 315 *******************************/ | |
| 316 | |
| 317 uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg) | |
| 318 { | |
| 319 uint64_t min_off, *offset; | |
| 320 int i; | |
| 321 if (beg < 0) beg = 0; | |
| 322 offset = idx->index2[tid].offset; | |
| 323 for (i = beg>>TAD_LIDX_SHIFT; i < idx->index2[tid].n && offset[i] == 0; ++i); | |
| 324 min_off = (i == idx->index2[tid].n)? offset[idx->index2[tid].n-1] : offset[i]; | |
| 325 return min_off; | |
| 326 } | |
| 327 | |
| 328 int bcf_main_index(int argc, char *argv[]) | |
| 329 { | |
| 330 if (argc == 1) { | |
| 331 fprintf(stderr, "Usage: bcftools index <in.bcf>\n"); | |
| 332 return 1; | |
| 333 } | |
| 334 bcf_idx_build(argv[1]); | |
| 335 return 0; | |
| 336 } |
