Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/tools/samtools-0.1.16/bcftools/bcfutils.c @ 0:acc2ca1a3ba4
Uploaded
| author | siyuan |
|---|---|
| date | Thu, 20 Feb 2014 00:44:58 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:acc2ca1a3ba4 |
|---|---|
| 1 #include <string.h> | |
| 2 #include <math.h> | |
| 3 #include "bcf.h" | |
| 4 #include "kstring.h" | |
| 5 #include "khash.h" | |
| 6 KHASH_MAP_INIT_STR(str2id, int) | |
| 7 | |
| 8 void *bcf_build_refhash(bcf_hdr_t *h) | |
| 9 { | |
| 10 khash_t(str2id) *hash; | |
| 11 int i, ret; | |
| 12 hash = kh_init(str2id); | |
| 13 for (i = 0; i < h->n_ref; ++i) { | |
| 14 khint_t k; | |
| 15 k = kh_put(str2id, hash, h->ns[i], &ret); // FIXME: check ret | |
| 16 kh_val(hash, k) = i; | |
| 17 } | |
| 18 return hash; | |
| 19 } | |
| 20 | |
| 21 void *bcf_str2id_init() | |
| 22 { | |
| 23 return kh_init(str2id); | |
| 24 } | |
| 25 | |
| 26 void bcf_str2id_destroy(void *_hash) | |
| 27 { | |
| 28 khash_t(str2id) *hash = (khash_t(str2id)*)_hash; | |
| 29 if (hash) kh_destroy(str2id, hash); // Note that strings are not freed. | |
| 30 } | |
| 31 | |
| 32 void bcf_str2id_thorough_destroy(void *_hash) | |
| 33 { | |
| 34 khash_t(str2id) *hash = (khash_t(str2id)*)_hash; | |
| 35 khint_t k; | |
| 36 if (hash == 0) return; | |
| 37 for (k = 0; k < kh_end(hash); ++k) | |
| 38 if (kh_exist(hash, k)) free((char*)kh_key(hash, k)); | |
| 39 kh_destroy(str2id, hash); | |
| 40 } | |
| 41 | |
| 42 int bcf_str2id(void *_hash, const char *str) | |
| 43 { | |
| 44 khash_t(str2id) *hash = (khash_t(str2id)*)_hash; | |
| 45 khint_t k; | |
| 46 if (!hash) return -1; | |
| 47 k = kh_get(str2id, hash, str); | |
| 48 return k == kh_end(hash)? -1 : kh_val(hash, k); | |
| 49 } | |
| 50 | |
| 51 int bcf_str2id_add(void *_hash, const char *str) | |
| 52 { | |
| 53 khint_t k; | |
| 54 int ret; | |
| 55 khash_t(str2id) *hash = (khash_t(str2id)*)_hash; | |
| 56 if (!hash) return -1; | |
| 57 k = kh_put(str2id, hash, str, &ret); | |
| 58 if (ret == 0) return kh_val(hash, k); | |
| 59 kh_val(hash, k) = kh_size(hash) - 1; | |
| 60 return kh_val(hash, k); | |
| 61 } | |
| 62 | |
| 63 int bcf_shrink_alt(bcf1_t *b, int n) | |
| 64 { | |
| 65 char *p; | |
| 66 int i, j, k, n_smpl = b->n_smpl; | |
| 67 if (b->n_alleles <= n) return -1; | |
| 68 // update ALT | |
| 69 if (n > 1) { | |
| 70 for (p = b->alt, k = 1; *p; ++p) | |
| 71 if (*p == ',' && ++k == n) break; | |
| 72 *p = '\0'; | |
| 73 } else p = b->alt, *p = '\0'; | |
| 74 ++p; | |
| 75 memmove(p, b->flt, b->str + b->l_str - b->flt); | |
| 76 b->l_str -= b->flt - p; | |
| 77 // update PL | |
| 78 for (i = 0; i < b->n_gi; ++i) { | |
| 79 bcf_ginfo_t *g = b->gi + i; | |
| 80 if (g->fmt == bcf_str2int("PL", 2)) { | |
| 81 int l, x = b->n_alleles * (b->n_alleles + 1) / 2; | |
| 82 uint8_t *d = (uint8_t*)g->data; | |
| 83 g->len = n * (n + 1) / 2; | |
| 84 for (l = k = 0; l < n_smpl; ++l) { | |
| 85 uint8_t *dl = d + l * x; | |
| 86 for (j = 0; j < g->len; ++j) d[k++] = dl[j]; | |
| 87 } | |
| 88 } // FIXME: to add GL | |
| 89 } | |
| 90 b->n_alleles = n; | |
| 91 bcf_sync(b); | |
| 92 return 0; | |
| 93 } | |
| 94 | |
| 95 int bcf_gl2pl(bcf1_t *b) | |
| 96 { | |
| 97 char *p; | |
| 98 int i, n_smpl = b->n_smpl; | |
| 99 bcf_ginfo_t *g; | |
| 100 float *d0; | |
| 101 uint8_t *d1; | |
| 102 if (strstr(b->fmt, "PL")) return -1; | |
| 103 if ((p = strstr(b->fmt, "GL")) == 0) return -1; | |
| 104 *p = 'P'; | |
| 105 for (i = 0; i < b->n_gi; ++i) | |
| 106 if (b->gi[i].fmt == bcf_str2int("GL", 2)) | |
| 107 break; | |
| 108 g = b->gi + i; | |
| 109 g->fmt = bcf_str2int("PL", 2); | |
| 110 g->len /= 4; // 4 == sizeof(float) | |
| 111 d0 = (float*)g->data; d1 = (uint8_t*)g->data; | |
| 112 for (i = 0; i < n_smpl * g->len; ++i) { | |
| 113 int x = (int)(-10. * d0[i] + .499); | |
| 114 if (x > 255) x = 255; | |
| 115 if (x < 0) x = 0; | |
| 116 d1[i] = x; | |
| 117 } | |
| 118 return 0; | |
| 119 } | |
| 120 /* FIXME: this function will fail given AB:GTX:GT. BCFtools never | |
| 121 * produces such FMT, but others may do. */ | |
| 122 int bcf_fix_gt(bcf1_t *b) | |
| 123 { | |
| 124 char *s; | |
| 125 int i; | |
| 126 uint32_t tmp; | |
| 127 bcf_ginfo_t gt; | |
| 128 // check the presence of the GT FMT | |
| 129 if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first | |
| 130 if (s[3] != '\0' && s[3] != ':') return 0; // :GTX in fact | |
| 131 tmp = bcf_str2int("GT", 2); | |
| 132 for (i = 0; i < b->n_gi; ++i) | |
| 133 if (b->gi[i].fmt == tmp) break; | |
| 134 if (i == b->n_gi) return 0; // no GT in b->gi; probably a bug... | |
| 135 gt = b->gi[i]; | |
| 136 // move GT to the first | |
| 137 for (; i > 0; --i) b->gi[i] = b->gi[i-1]; | |
| 138 b->gi[0] = gt; | |
| 139 memmove(b->fmt + 3, b->fmt, s + 1 - b->fmt); | |
| 140 b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':'; | |
| 141 return 0; | |
| 142 } | |
| 143 | |
| 144 int bcf_fix_pl(bcf1_t *b) | |
| 145 { | |
| 146 int i; | |
| 147 uint32_t tmp; | |
| 148 uint8_t *PL, *swap; | |
| 149 bcf_ginfo_t *gi; | |
| 150 // pinpoint PL | |
| 151 tmp = bcf_str2int("PL", 2); | |
| 152 for (i = 0; i < b->n_gi; ++i) | |
| 153 if (b->gi[i].fmt == tmp) break; | |
| 154 if (i == b->n_gi) return 0; | |
| 155 // prepare | |
| 156 gi = b->gi + i; | |
| 157 PL = (uint8_t*)gi->data; | |
| 158 swap = alloca(gi->len); | |
| 159 // loop through individuals | |
| 160 for (i = 0; i < b->n_smpl; ++i) { | |
| 161 int k, l, x; | |
| 162 uint8_t *PLi = PL + i * gi->len; | |
| 163 memcpy(swap, PLi, gi->len); | |
| 164 for (k = x = 0; k < b->n_alleles; ++k) | |
| 165 for (l = k; l < b->n_alleles; ++l) | |
| 166 PLi[l*(l+1)/2 + k] = swap[x++]; | |
| 167 } | |
| 168 return 0; | |
| 169 } | |
| 170 | |
| 171 int bcf_smpl_covered(const bcf1_t *b) | |
| 172 { | |
| 173 int i, j, n = 0; | |
| 174 uint32_t tmp; | |
| 175 bcf_ginfo_t *gi; | |
| 176 // pinpoint PL | |
| 177 tmp = bcf_str2int("PL", 2); | |
| 178 for (i = 0; i < b->n_gi; ++i) | |
| 179 if (b->gi[i].fmt == tmp) break; | |
| 180 if (i == b->n_gi) return 0; | |
| 181 // count how many samples having PL!=[0..0] | |
| 182 gi = b->gi + i; | |
| 183 for (i = 0; i < b->n_smpl; ++i) { | |
| 184 uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len; | |
| 185 for (j = 0; j < gi->len; ++j) | |
| 186 if (PLi[j]) break; | |
| 187 if (j < gi->len) ++n; | |
| 188 } | |
| 189 return n; | |
| 190 } | |
| 191 | |
| 192 static void *locate_field(const bcf1_t *b, const char *fmt, int l) | |
| 193 { | |
| 194 int i; | |
| 195 uint32_t tmp; | |
| 196 tmp = bcf_str2int(fmt, l); | |
| 197 for (i = 0; i < b->n_gi; ++i) | |
| 198 if (b->gi[i].fmt == tmp) break; | |
| 199 return i == b->n_gi? 0 : b->gi[i].data; | |
| 200 } | |
| 201 | |
| 202 int bcf_anno_max(bcf1_t *b) | |
| 203 { | |
| 204 int k, max_gq, max_sp, n_het; | |
| 205 kstring_t str; | |
| 206 uint8_t *gt, *gq; | |
| 207 int32_t *sp; | |
| 208 max_gq = max_sp = n_het = 0; | |
| 209 gt = locate_field(b, "GT", 2); | |
| 210 if (gt == 0) return -1; | |
| 211 gq = locate_field(b, "GQ", 2); | |
| 212 sp = locate_field(b, "SP", 2); | |
| 213 if (sp) | |
| 214 for (k = 0; k < b->n_smpl; ++k) | |
| 215 if (gt[k]&0x3f) | |
| 216 max_sp = max_sp > (int)sp[k]? max_sp : sp[k]; | |
| 217 if (gq) | |
| 218 for (k = 0; k < b->n_smpl; ++k) | |
| 219 if (gt[k]&0x3f) | |
| 220 max_gq = max_gq > (int)gq[k]? max_gq : gq[k]; | |
| 221 for (k = 0; k < b->n_smpl; ++k) { | |
| 222 int a1, a2; | |
| 223 a1 = gt[k]&7; a2 = gt[k]>>3&7; | |
| 224 if ((!a1 && a2) || (!a2 && a1)) { // a het | |
| 225 if (gq == 0) ++n_het; | |
| 226 else if (gq[k] >= 20) ++n_het; | |
| 227 } | |
| 228 } | |
| 229 if (n_het) max_sp -= (int)(4.343 * log(n_het) + .499); | |
| 230 if (max_sp < 0) max_sp = 0; | |
| 231 memset(&str, 0, sizeof(kstring_t)); | |
| 232 if (*b->info) kputc(';', &str); | |
| 233 ksprintf(&str, "MXSP=%d;MXGQ=%d", max_sp, max_gq); | |
| 234 bcf_append_info(b, str.s, str.l); | |
| 235 free(str.s); | |
| 236 return 0; | |
| 237 } | |
| 238 | |
| 239 // FIXME: only data are shuffled; the header is NOT | |
| 240 int bcf_shuffle(bcf1_t *b, int seed) | |
| 241 { | |
| 242 int i, j, *a; | |
| 243 if (seed > 0) srand48(seed); | |
| 244 a = malloc(b->n_smpl * sizeof(int)); | |
| 245 for (i = 0; i < b->n_smpl; ++i) a[i] = i; | |
| 246 for (i = b->n_smpl; i > 1; --i) { | |
| 247 int tmp; | |
| 248 j = (int)(drand48() * i); | |
| 249 tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; | |
| 250 } | |
| 251 for (j = 0; j < b->n_gi; ++j) { | |
| 252 bcf_ginfo_t *gi = b->gi + j; | |
| 253 uint8_t *swap, *data = (uint8_t*)gi->data; | |
| 254 swap = malloc(gi->len * b->n_smpl); | |
| 255 for (i = 0; i < b->n_smpl; ++i) | |
| 256 memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len); | |
| 257 free(gi->data); | |
| 258 gi->data = swap; | |
| 259 } | |
| 260 free(a); | |
| 261 return 0; | |
| 262 } | |
| 263 | |
| 264 bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list) | |
| 265 { | |
| 266 int i, ret, j; | |
| 267 khint_t k; | |
| 268 bcf_hdr_t *h; | |
| 269 khash_t(str2id) *hash; | |
| 270 kstring_t s; | |
| 271 s.l = s.m = 0; s.s = 0; | |
| 272 hash = kh_init(str2id); | |
| 273 for (i = 0; i < h0->n_smpl; ++i) { | |
| 274 k = kh_put(str2id, hash, h0->sns[i], &ret); | |
| 275 kh_val(hash, k) = i; | |
| 276 } | |
| 277 for (i = j = 0; i < n; ++i) { | |
| 278 k = kh_get(str2id, hash, samples[i]); | |
| 279 if (k != kh_end(hash)) { | |
| 280 list[j++] = kh_val(hash, k); | |
| 281 kputs(samples[i], &s); kputc('\0', &s); | |
| 282 } | |
| 283 } | |
| 284 if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); | |
| 285 kh_destroy(str2id, hash); | |
| 286 h = calloc(1, sizeof(bcf_hdr_t)); | |
| 287 *h = *h0; | |
| 288 h->ns = 0; h->sns = 0; | |
| 289 h->name = malloc(h->l_nm); memcpy(h->name, h0->name, h->l_nm); | |
| 290 h->txt = calloc(1, h->l_txt + 1); memcpy(h->txt, h0->txt, h->l_txt); | |
| 291 h->l_smpl = s.l; h->sname = s.s; | |
| 292 bcf_hdr_sync(h); | |
| 293 return h; | |
| 294 } | |
| 295 | |
| 296 int bcf_subsam(int n_smpl, int *list, bcf1_t *b) | |
| 297 { | |
| 298 int i, j; | |
| 299 for (j = 0; j < b->n_gi; ++j) { | |
| 300 bcf_ginfo_t *gi = b->gi + j; | |
| 301 uint8_t *swap; | |
| 302 swap = malloc(gi->len * b->n_smpl); | |
| 303 for (i = 0; i < n_smpl; ++i) | |
| 304 memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len); | |
| 305 free(gi->data); | |
| 306 gi->data = swap; | |
| 307 } | |
| 308 b->n_smpl = n_smpl; | |
| 309 return 0; | |
| 310 } |
