Mercurial > repos > lsong10 > psiclass
diff PsiCLASS-1.0.2/samtools-0.1.19/bcftools/bcfutils.c @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/bcfutils.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,504 @@ +#include <string.h> +#include <math.h> +#include <assert.h> +#include "bcf.h" +#include "kstring.h" +#include "khash.h" +KHASH_MAP_INIT_STR(str2id, int) + +#ifdef _WIN32 +#define srand48(x) srand(x) +#define drand48() ((double)rand() / RAND_MAX) +#endif + +// FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated... +void *bcf_build_refhash(bcf_hdr_t *h) +{ + khash_t(str2id) *hash; + int i, ret; + hash = kh_init(str2id); + for (i = 0; i < h->n_ref; ++i) { + khint_t k; + k = kh_put(str2id, hash, h->ns[i], &ret); // FIXME: check ret + kh_val(hash, k) = i; + } + return hash; +} + +void *bcf_str2id_init() +{ + return kh_init(str2id); +} + +void bcf_str2id_destroy(void *_hash) +{ + khash_t(str2id) *hash = (khash_t(str2id)*)_hash; + if (hash) kh_destroy(str2id, hash); // Note that strings are not freed. +} + +void bcf_str2id_thorough_destroy(void *_hash) +{ + khash_t(str2id) *hash = (khash_t(str2id)*)_hash; + khint_t k; + if (hash == 0) return; + for (k = 0; k < kh_end(hash); ++k) + if (kh_exist(hash, k)) free((char*)kh_key(hash, k)); + kh_destroy(str2id, hash); +} + +int bcf_str2id(void *_hash, const char *str) +{ + khash_t(str2id) *hash = (khash_t(str2id)*)_hash; + khint_t k; + if (!hash) return -1; + k = kh_get(str2id, hash, str); + return k == kh_end(hash)? -1 : kh_val(hash, k); +} + +int bcf_str2id_add(void *_hash, const char *str) +{ + khint_t k; + int ret; + khash_t(str2id) *hash = (khash_t(str2id)*)_hash; + if (!hash) return -1; + k = kh_put(str2id, hash, str, &ret); + if (ret == 0) return kh_val(hash, k); + kh_val(hash, k) = kh_size(hash) - 1; + return kh_val(hash, k); +} + +void bcf_fit_alt(bcf1_t *b, int mask) +{ + mask |= 1; // REF must be always present + + int i,j,nals=0; + for (i=0; i<sizeof(int); i++) + if ( mask&1<<i) nals++; + if ( b->n_alleles <= nals ) return; + + // update ALT, in principle any of the alleles can be removed + char *p; + if ( nals>1 ) + { + char *dst, *src; + int n=0, nalts=nals-1; + for (src=dst=p=b->alt, i=1; *p; p++) + { + if ( *p!=',' ) continue; + + if ( mask&1<<i ) + { + n++; + if ( src!=dst ) + { + memmove(dst,src,p-src); + dst += p-src; + } + else dst = p; + if ( n<nalts ) { *dst=','; dst++; } + } + i++; + + if ( n>=nalts ) { *dst=0; break; } + src = p+1; + } + if ( n<nalts ) + { + memmove(dst,src,p-src); + dst += p-src; + *dst = 0; + } + p = dst; + } + else p = b->alt, *p = '\0'; + p++; + memmove(p, b->flt, b->str + b->l_str - b->flt); + b->l_str -= b->flt - p; + + // update PL and GT + int ipl=-1, igt=-1; + for (i = 0; i < b->n_gi; ++i) + { + bcf_ginfo_t *g = b->gi + i; + if (g->fmt == bcf_str2int("PL", 2)) ipl = i; + if (g->fmt == bcf_str2int("GT", 2)) igt = i; + } + + // .. create mapping between old and new indexes + int npl = nals * (nals+1) / 2; + int *map = malloc(sizeof(int)*(npl>b->n_alleles ? npl : b->n_alleles)); + int kori=0,knew=0; + for (i=0; i<b->n_alleles; i++) + { + for (j=0; j<=i; j++) + { + int skip=0; + if ( i && !(mask&1<<i) ) skip=1; + if ( j && !(mask&1<<j) ) skip=1; + if ( !skip ) { map[knew++] = kori; } + kori++; + } + } + // .. apply to all samples + int n_smpl = b->n_smpl; + for (i = 0; i < b->n_gi; ++i) + { + bcf_ginfo_t *g = b->gi + i; + if (g->fmt == bcf_str2int("PL", 2)) + { + g->len = npl; + uint8_t *d = (uint8_t*)g->data; + int ismpl, npl_ori = b->n_alleles * (b->n_alleles + 1) / 2; + for (knew=ismpl=0; ismpl<n_smpl; ismpl++) + { + uint8_t *dl = d + ismpl * npl_ori; + for (j=0; j<npl; j++) d[knew++] = dl[map[j]]; + } + } // FIXME: to add GL + } + // update GTs + map[0] = 0; + for (i=1, knew=0; i<b->n_alleles; i++) + map[i] = mask&1<<i ? ++knew : -1; + for (i=0; i<n_smpl; i++) + { + uint8_t gt = ((uint8_t*)b->gi[igt].data)[i]; + int a1 = (gt>>3)&7; + int a2 = gt&7; + assert( map[a1]>=0 && map[a2]>=0 ); + ((uint8_t*)b->gi[igt].data)[i] = ((1<<7|1<<6)>) | map[a1]<<3 | map[a2]; + } + free(map); + b->n_alleles = nals; + bcf_sync(b); +} + +int bcf_shrink_alt(bcf1_t *b, int n) +{ + char *p; + int i, j, k, n_smpl = b->n_smpl; + if (b->n_alleles <= n) return -1; + // update ALT + if (n > 1) { + for (p = b->alt, k = 1; *p; ++p) + if (*p == ',' && ++k == n) break; + *p = '\0'; + } else p = b->alt, *p = '\0'; + ++p; + memmove(p, b->flt, b->str + b->l_str - b->flt); + b->l_str -= b->flt - p; + // update PL + for (i = 0; i < b->n_gi; ++i) { + bcf_ginfo_t *g = b->gi + i; + if (g->fmt == bcf_str2int("PL", 2)) { + int l, x = b->n_alleles * (b->n_alleles + 1) / 2; + uint8_t *d = (uint8_t*)g->data; + g->len = n * (n + 1) / 2; + for (l = k = 0; l < n_smpl; ++l) { + uint8_t *dl = d + l * x; + for (j = 0; j < g->len; ++j) d[k++] = dl[j]; + } + } // FIXME: to add GL + } + b->n_alleles = n; + bcf_sync(b); + return 0; +} + +int bcf_gl2pl(bcf1_t *b) +{ + char *p; + int i, n_smpl = b->n_smpl; + bcf_ginfo_t *g; + float *d0; + uint8_t *d1; + if (strstr(b->fmt, "PL")) return -1; + if ((p = strstr(b->fmt, "GL")) == 0) return -1; + *p = 'P'; + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == bcf_str2int("GL", 2)) + break; + g = b->gi + i; + g->fmt = bcf_str2int("PL", 2); + g->len /= 4; // 4 == sizeof(float) + d0 = (float*)g->data; d1 = (uint8_t*)g->data; + for (i = 0; i < n_smpl * g->len; ++i) { + int x = (int)(-10. * d0[i] + .499); + if (x > 255) x = 255; + if (x < 0) x = 0; + d1[i] = x; + } + return 0; +} +/* FIXME: this function will fail given AB:GTX:GT. BCFtools never + * produces such FMT, but others may do. */ +int bcf_fix_gt(bcf1_t *b) +{ + char *s; + int i; + uint32_t tmp; + bcf_ginfo_t gt; + // check the presence of the GT FMT + if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first + assert(s[3] == '\0' || s[3] == ':'); // :GTX in fact + tmp = bcf_str2int("GT", 2); + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + if (i == b->n_gi) return 0; // no GT in b->gi; probably a bug... + gt = b->gi[i]; + // move GT to the first + for (; i > 0; --i) b->gi[i] = b->gi[i-1]; + b->gi[0] = gt; + if ( s[3]==0 ) + memmove(b->fmt + 3, b->fmt, s - b->fmt); // :GT + else + memmove(b->fmt + 3, b->fmt, s - b->fmt + 1); // :GT: + b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':'; + return 0; +} + +int bcf_fix_pl(bcf1_t *b) +{ + int i; + uint32_t tmp; + uint8_t *PL, *swap; + bcf_ginfo_t *gi; + // pinpoint PL + tmp = bcf_str2int("PL", 2); + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + if (i == b->n_gi) return 0; + // prepare + gi = b->gi + i; + PL = (uint8_t*)gi->data; + swap = alloca(gi->len); + // loop through individuals + for (i = 0; i < b->n_smpl; ++i) { + int k, l, x; + uint8_t *PLi = PL + i * gi->len; + memcpy(swap, PLi, gi->len); + for (k = x = 0; k < b->n_alleles; ++k) + for (l = k; l < b->n_alleles; ++l) + PLi[l*(l+1)/2 + k] = swap[x++]; + } + return 0; +} + +int bcf_smpl_covered(const bcf1_t *b) +{ + int i, j, n = 0; + uint32_t tmp; + bcf_ginfo_t *gi; + // pinpoint PL + tmp = bcf_str2int("PL", 2); + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + if (i == b->n_gi) return 0; + // count how many samples having PL!=[0..0] + gi = b->gi + i; + for (i = 0; i < b->n_smpl; ++i) { + uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len; + for (j = 0; j < gi->len; ++j) + if (PLi[j]) break; + if (j < gi->len) ++n; + } + return n; +} + +static void *locate_field(const bcf1_t *b, const char *fmt, int l) +{ + int i; + uint32_t tmp; + tmp = bcf_str2int(fmt, l); + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + return i == b->n_gi? 0 : b->gi[i].data; +} + +int bcf_anno_max(bcf1_t *b) +{ + int k, max_gq, max_sp, n_het; + kstring_t str; + uint8_t *gt, *gq; + int32_t *sp; + max_gq = max_sp = n_het = 0; + gt = locate_field(b, "GT", 2); + if (gt == 0) return -1; + gq = locate_field(b, "GQ", 2); + sp = locate_field(b, "SP", 2); + if (sp) + for (k = 0; k < b->n_smpl; ++k) + if (gt[k]&0x3f) + max_sp = max_sp > (int)sp[k]? max_sp : sp[k]; + if (gq) + for (k = 0; k < b->n_smpl; ++k) + if (gt[k]&0x3f) + max_gq = max_gq > (int)gq[k]? max_gq : gq[k]; + for (k = 0; k < b->n_smpl; ++k) { + int a1, a2; + a1 = gt[k]&7; a2 = gt[k]>>3&7; + if ((!a1 && a2) || (!a2 && a1)) { // a het + if (gq == 0) ++n_het; + else if (gq[k] >= 20) ++n_het; + } + } + if (n_het) max_sp -= (int)(4.343 * log(n_het) + .499); + if (max_sp < 0) max_sp = 0; + memset(&str, 0, sizeof(kstring_t)); + if (*b->info) kputc(';', &str); + ksprintf(&str, "MXSP=%d;MXGQ=%d", max_sp, max_gq); + bcf_append_info(b, str.s, str.l); + free(str.s); + return 0; +} + +// FIXME: only data are shuffled; the header is NOT +int bcf_shuffle(bcf1_t *b, int seed) +{ + int i, j, *a; + if (seed > 0) srand48(seed); + a = malloc(b->n_smpl * sizeof(int)); + for (i = 0; i < b->n_smpl; ++i) a[i] = i; + for (i = b->n_smpl; i > 1; --i) { + int tmp; + j = (int)(drand48() * i); + tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; + } + for (j = 0; j < b->n_gi; ++j) { + bcf_ginfo_t *gi = b->gi + j; + uint8_t *swap, *data = (uint8_t*)gi->data; + swap = malloc(gi->len * b->n_smpl); + for (i = 0; i < b->n_smpl; ++i) + memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len); + free(gi->data); + gi->data = swap; + } + free(a); + return 0; +} + +bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list) +{ + int i, ret, j; + khint_t k; + bcf_hdr_t *h; + khash_t(str2id) *hash; + kstring_t s; + s.l = s.m = 0; s.s = 0; + hash = kh_init(str2id); + for (i = 0; i < h0->n_smpl; ++i) { + k = kh_put(str2id, hash, h0->sns[i], &ret); + kh_val(hash, k) = i; + } + for (i = j = 0; i < n; ++i) { + k = kh_get(str2id, hash, samples[i]); + if (k != kh_end(hash)) { + list[j++] = kh_val(hash, k); + kputs(samples[i], &s); kputc('\0', &s); + } + } + if (j < n) + { + fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); + exit(1); + } + kh_destroy(str2id, hash); + h = calloc(1, sizeof(bcf_hdr_t)); + *h = *h0; + h->ns = 0; h->sns = 0; + h->name = malloc(h->l_nm); memcpy(h->name, h0->name, h->l_nm); + h->txt = calloc(1, h->l_txt + 1); memcpy(h->txt, h0->txt, h->l_txt); + h->l_smpl = s.l; h->sname = s.s; + bcf_hdr_sync(h); + return h; +} + +int bcf_subsam(int n_smpl, int *list, bcf1_t *b) +{ + int i, j; + for (j = 0; j < b->n_gi; ++j) { + bcf_ginfo_t *gi = b->gi + j; + uint8_t *swap; + swap = malloc(gi->len * b->n_smpl); + for (i = 0; i < n_smpl; ++i) + memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len); + free(gi->data); + gi->data = swap; + } + b->n_smpl = n_smpl; + return 0; +} + +static int8_t nt4_table[128] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4 +}; + +int bcf_gl10(const bcf1_t *b, uint8_t *gl) +{ + int a[4], k, l, map[4], k1, j, i; + const bcf_ginfo_t *PL; + char *s; + if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base or >4 alleles + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; + if (i == b->n_gi) return -1; // no PL + PL = b->gi + i; + a[0] = nt4_table[(int)b->ref[0]]; + if (a[0] > 3 || a[0] < 0) return -1; // ref is not A/C/G/T + a[1] = a[2] = a[3] = -2; // -1 has a special meaning + if (b->alt[0] == 0) return -1; // no alternate allele + map[0] = map[1] = map[2] = map[3] = -2; + map[a[0]] = 0; + for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) { + if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base + a[k+1] = nt4_table[(int)*s]; + if (a[k+1] >= 0) map[a[k+1]] = k+1; + else k1 = k + 1; + if (s[1] == 0) break; // the end of the ALT string + } + for (k = 0; k < 4; ++k) + if (map[k] < 0) map[k] = k1; + for (i = 0; i < b->n_smpl; ++i) { + const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual + uint8_t *g = gl + 10 * i; + for (k = j = 0; k < 4; ++k) { + for (l = k; l < 4; ++l) { + int t, x = map[k], y = map[l]; + if (x > y) t = x, x = y, y = t; // make sure x is the smaller + g[j++] = p[y * (y+1) / 2 + x]; + } + } + } + return 0; +} + +int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl) +{ + int k, l, j, i; + const bcf_ginfo_t *PL; + if (b->alt[0] == 0) return -1; // no alternate allele + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; + if (i == b->n_gi) return -1; // no PL + PL = b->gi + i; + for (i = 0; i < b->n_smpl; ++i) { + const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual + uint8_t *g = gl + 10 * i; + for (k = j = 0; k < 4; ++k) { + for (l = k; l < 4; ++l) { + int t, x = k, y = l; + if (x > y) t = x, x = y, y = t; // make sure x is the smaller + x = y * (y+1) / 2 + x; + g[j++] = x < PL->len? p[x] : 255; + } + } + } + return 0; +}