comparison PsiCLASS-1.0.2/samtools-0.1.19/bcftools/bcfutils.c @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 #include <string.h>
2 #include <math.h>
3 #include <assert.h>
4 #include "bcf.h"
5 #include "kstring.h"
6 #include "khash.h"
7 KHASH_MAP_INIT_STR(str2id, int)
8
9 #ifdef _WIN32
10 #define srand48(x) srand(x)
11 #define drand48() ((double)rand() / RAND_MAX)
12 #endif
13
14 // FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated...
15 void *bcf_build_refhash(bcf_hdr_t *h)
16 {
17 khash_t(str2id) *hash;
18 int i, ret;
19 hash = kh_init(str2id);
20 for (i = 0; i < h->n_ref; ++i) {
21 khint_t k;
22 k = kh_put(str2id, hash, h->ns[i], &ret); // FIXME: check ret
23 kh_val(hash, k) = i;
24 }
25 return hash;
26 }
27
28 void *bcf_str2id_init()
29 {
30 return kh_init(str2id);
31 }
32
33 void bcf_str2id_destroy(void *_hash)
34 {
35 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
36 if (hash) kh_destroy(str2id, hash); // Note that strings are not freed.
37 }
38
39 void bcf_str2id_thorough_destroy(void *_hash)
40 {
41 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
42 khint_t k;
43 if (hash == 0) return;
44 for (k = 0; k < kh_end(hash); ++k)
45 if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
46 kh_destroy(str2id, hash);
47 }
48
49 int bcf_str2id(void *_hash, const char *str)
50 {
51 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
52 khint_t k;
53 if (!hash) return -1;
54 k = kh_get(str2id, hash, str);
55 return k == kh_end(hash)? -1 : kh_val(hash, k);
56 }
57
58 int bcf_str2id_add(void *_hash, const char *str)
59 {
60 khint_t k;
61 int ret;
62 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
63 if (!hash) return -1;
64 k = kh_put(str2id, hash, str, &ret);
65 if (ret == 0) return kh_val(hash, k);
66 kh_val(hash, k) = kh_size(hash) - 1;
67 return kh_val(hash, k);
68 }
69
70 void bcf_fit_alt(bcf1_t *b, int mask)
71 {
72 mask |= 1; // REF must be always present
73
74 int i,j,nals=0;
75 for (i=0; i<sizeof(int); i++)
76 if ( mask&1<<i) nals++;
77 if ( b->n_alleles <= nals ) return;
78
79 // update ALT, in principle any of the alleles can be removed
80 char *p;
81 if ( nals>1 )
82 {
83 char *dst, *src;
84 int n=0, nalts=nals-1;
85 for (src=dst=p=b->alt, i=1; *p; p++)
86 {
87 if ( *p!=',' ) continue;
88
89 if ( mask&1<<i )
90 {
91 n++;
92 if ( src!=dst )
93 {
94 memmove(dst,src,p-src);
95 dst += p-src;
96 }
97 else dst = p;
98 if ( n<nalts ) { *dst=','; dst++; }
99 }
100 i++;
101
102 if ( n>=nalts ) { *dst=0; break; }
103 src = p+1;
104 }
105 if ( n<nalts )
106 {
107 memmove(dst,src,p-src);
108 dst += p-src;
109 *dst = 0;
110 }
111 p = dst;
112 }
113 else p = b->alt, *p = '\0';
114 p++;
115 memmove(p, b->flt, b->str + b->l_str - b->flt);
116 b->l_str -= b->flt - p;
117
118 // update PL and GT
119 int ipl=-1, igt=-1;
120 for (i = 0; i < b->n_gi; ++i)
121 {
122 bcf_ginfo_t *g = b->gi + i;
123 if (g->fmt == bcf_str2int("PL", 2)) ipl = i;
124 if (g->fmt == bcf_str2int("GT", 2)) igt = i;
125 }
126
127 // .. create mapping between old and new indexes
128 int npl = nals * (nals+1) / 2;
129 int *map = malloc(sizeof(int)*(npl>b->n_alleles ? npl : b->n_alleles));
130 int kori=0,knew=0;
131 for (i=0; i<b->n_alleles; i++)
132 {
133 for (j=0; j<=i; j++)
134 {
135 int skip=0;
136 if ( i && !(mask&1<<i) ) skip=1;
137 if ( j && !(mask&1<<j) ) skip=1;
138 if ( !skip ) { map[knew++] = kori; }
139 kori++;
140 }
141 }
142 // .. apply to all samples
143 int n_smpl = b->n_smpl;
144 for (i = 0; i < b->n_gi; ++i)
145 {
146 bcf_ginfo_t *g = b->gi + i;
147 if (g->fmt == bcf_str2int("PL", 2))
148 {
149 g->len = npl;
150 uint8_t *d = (uint8_t*)g->data;
151 int ismpl, npl_ori = b->n_alleles * (b->n_alleles + 1) / 2;
152 for (knew=ismpl=0; ismpl<n_smpl; ismpl++)
153 {
154 uint8_t *dl = d + ismpl * npl_ori;
155 for (j=0; j<npl; j++) d[knew++] = dl[map[j]];
156 }
157 } // FIXME: to add GL
158 }
159 // update GTs
160 map[0] = 0;
161 for (i=1, knew=0; i<b->n_alleles; i++)
162 map[i] = mask&1<<i ? ++knew : -1;
163 for (i=0; i<n_smpl; i++)
164 {
165 uint8_t gt = ((uint8_t*)b->gi[igt].data)[i];
166 int a1 = (gt>>3)&7;
167 int a2 = gt&7;
168 assert( map[a1]>=0 && map[a2]>=0 );
169 ((uint8_t*)b->gi[igt].data)[i] = ((1<<7|1<<6)&gt) | map[a1]<<3 | map[a2];
170 }
171 free(map);
172 b->n_alleles = nals;
173 bcf_sync(b);
174 }
175
176 int bcf_shrink_alt(bcf1_t *b, int n)
177 {
178 char *p;
179 int i, j, k, n_smpl = b->n_smpl;
180 if (b->n_alleles <= n) return -1;
181 // update ALT
182 if (n > 1) {
183 for (p = b->alt, k = 1; *p; ++p)
184 if (*p == ',' && ++k == n) break;
185 *p = '\0';
186 } else p = b->alt, *p = '\0';
187 ++p;
188 memmove(p, b->flt, b->str + b->l_str - b->flt);
189 b->l_str -= b->flt - p;
190 // update PL
191 for (i = 0; i < b->n_gi; ++i) {
192 bcf_ginfo_t *g = b->gi + i;
193 if (g->fmt == bcf_str2int("PL", 2)) {
194 int l, x = b->n_alleles * (b->n_alleles + 1) / 2;
195 uint8_t *d = (uint8_t*)g->data;
196 g->len = n * (n + 1) / 2;
197 for (l = k = 0; l < n_smpl; ++l) {
198 uint8_t *dl = d + l * x;
199 for (j = 0; j < g->len; ++j) d[k++] = dl[j];
200 }
201 } // FIXME: to add GL
202 }
203 b->n_alleles = n;
204 bcf_sync(b);
205 return 0;
206 }
207
208 int bcf_gl2pl(bcf1_t *b)
209 {
210 char *p;
211 int i, n_smpl = b->n_smpl;
212 bcf_ginfo_t *g;
213 float *d0;
214 uint8_t *d1;
215 if (strstr(b->fmt, "PL")) return -1;
216 if ((p = strstr(b->fmt, "GL")) == 0) return -1;
217 *p = 'P';
218 for (i = 0; i < b->n_gi; ++i)
219 if (b->gi[i].fmt == bcf_str2int("GL", 2))
220 break;
221 g = b->gi + i;
222 g->fmt = bcf_str2int("PL", 2);
223 g->len /= 4; // 4 == sizeof(float)
224 d0 = (float*)g->data; d1 = (uint8_t*)g->data;
225 for (i = 0; i < n_smpl * g->len; ++i) {
226 int x = (int)(-10. * d0[i] + .499);
227 if (x > 255) x = 255;
228 if (x < 0) x = 0;
229 d1[i] = x;
230 }
231 return 0;
232 }
233 /* FIXME: this function will fail given AB:GTX:GT. BCFtools never
234 * produces such FMT, but others may do. */
235 int bcf_fix_gt(bcf1_t *b)
236 {
237 char *s;
238 int i;
239 uint32_t tmp;
240 bcf_ginfo_t gt;
241 // check the presence of the GT FMT
242 if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first
243 assert(s[3] == '\0' || s[3] == ':'); // :GTX in fact
244 tmp = bcf_str2int("GT", 2);
245 for (i = 0; i < b->n_gi; ++i)
246 if (b->gi[i].fmt == tmp) break;
247 if (i == b->n_gi) return 0; // no GT in b->gi; probably a bug...
248 gt = b->gi[i];
249 // move GT to the first
250 for (; i > 0; --i) b->gi[i] = b->gi[i-1];
251 b->gi[0] = gt;
252 if ( s[3]==0 )
253 memmove(b->fmt + 3, b->fmt, s - b->fmt); // :GT
254 else
255 memmove(b->fmt + 3, b->fmt, s - b->fmt + 1); // :GT:
256 b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':';
257 return 0;
258 }
259
260 int bcf_fix_pl(bcf1_t *b)
261 {
262 int i;
263 uint32_t tmp;
264 uint8_t *PL, *swap;
265 bcf_ginfo_t *gi;
266 // pinpoint PL
267 tmp = bcf_str2int("PL", 2);
268 for (i = 0; i < b->n_gi; ++i)
269 if (b->gi[i].fmt == tmp) break;
270 if (i == b->n_gi) return 0;
271 // prepare
272 gi = b->gi + i;
273 PL = (uint8_t*)gi->data;
274 swap = alloca(gi->len);
275 // loop through individuals
276 for (i = 0; i < b->n_smpl; ++i) {
277 int k, l, x;
278 uint8_t *PLi = PL + i * gi->len;
279 memcpy(swap, PLi, gi->len);
280 for (k = x = 0; k < b->n_alleles; ++k)
281 for (l = k; l < b->n_alleles; ++l)
282 PLi[l*(l+1)/2 + k] = swap[x++];
283 }
284 return 0;
285 }
286
287 int bcf_smpl_covered(const bcf1_t *b)
288 {
289 int i, j, n = 0;
290 uint32_t tmp;
291 bcf_ginfo_t *gi;
292 // pinpoint PL
293 tmp = bcf_str2int("PL", 2);
294 for (i = 0; i < b->n_gi; ++i)
295 if (b->gi[i].fmt == tmp) break;
296 if (i == b->n_gi) return 0;
297 // count how many samples having PL!=[0..0]
298 gi = b->gi + i;
299 for (i = 0; i < b->n_smpl; ++i) {
300 uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len;
301 for (j = 0; j < gi->len; ++j)
302 if (PLi[j]) break;
303 if (j < gi->len) ++n;
304 }
305 return n;
306 }
307
308 static void *locate_field(const bcf1_t *b, const char *fmt, int l)
309 {
310 int i;
311 uint32_t tmp;
312 tmp = bcf_str2int(fmt, l);
313 for (i = 0; i < b->n_gi; ++i)
314 if (b->gi[i].fmt == tmp) break;
315 return i == b->n_gi? 0 : b->gi[i].data;
316 }
317
318 int bcf_anno_max(bcf1_t *b)
319 {
320 int k, max_gq, max_sp, n_het;
321 kstring_t str;
322 uint8_t *gt, *gq;
323 int32_t *sp;
324 max_gq = max_sp = n_het = 0;
325 gt = locate_field(b, "GT", 2);
326 if (gt == 0) return -1;
327 gq = locate_field(b, "GQ", 2);
328 sp = locate_field(b, "SP", 2);
329 if (sp)
330 for (k = 0; k < b->n_smpl; ++k)
331 if (gt[k]&0x3f)
332 max_sp = max_sp > (int)sp[k]? max_sp : sp[k];
333 if (gq)
334 for (k = 0; k < b->n_smpl; ++k)
335 if (gt[k]&0x3f)
336 max_gq = max_gq > (int)gq[k]? max_gq : gq[k];
337 for (k = 0; k < b->n_smpl; ++k) {
338 int a1, a2;
339 a1 = gt[k]&7; a2 = gt[k]>>3&7;
340 if ((!a1 && a2) || (!a2 && a1)) { // a het
341 if (gq == 0) ++n_het;
342 else if (gq[k] >= 20) ++n_het;
343 }
344 }
345 if (n_het) max_sp -= (int)(4.343 * log(n_het) + .499);
346 if (max_sp < 0) max_sp = 0;
347 memset(&str, 0, sizeof(kstring_t));
348 if (*b->info) kputc(';', &str);
349 ksprintf(&str, "MXSP=%d;MXGQ=%d", max_sp, max_gq);
350 bcf_append_info(b, str.s, str.l);
351 free(str.s);
352 return 0;
353 }
354
355 // FIXME: only data are shuffled; the header is NOT
356 int bcf_shuffle(bcf1_t *b, int seed)
357 {
358 int i, j, *a;
359 if (seed > 0) srand48(seed);
360 a = malloc(b->n_smpl * sizeof(int));
361 for (i = 0; i < b->n_smpl; ++i) a[i] = i;
362 for (i = b->n_smpl; i > 1; --i) {
363 int tmp;
364 j = (int)(drand48() * i);
365 tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp;
366 }
367 for (j = 0; j < b->n_gi; ++j) {
368 bcf_ginfo_t *gi = b->gi + j;
369 uint8_t *swap, *data = (uint8_t*)gi->data;
370 swap = malloc(gi->len * b->n_smpl);
371 for (i = 0; i < b->n_smpl; ++i)
372 memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len);
373 free(gi->data);
374 gi->data = swap;
375 }
376 free(a);
377 return 0;
378 }
379
380 bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list)
381 {
382 int i, ret, j;
383 khint_t k;
384 bcf_hdr_t *h;
385 khash_t(str2id) *hash;
386 kstring_t s;
387 s.l = s.m = 0; s.s = 0;
388 hash = kh_init(str2id);
389 for (i = 0; i < h0->n_smpl; ++i) {
390 k = kh_put(str2id, hash, h0->sns[i], &ret);
391 kh_val(hash, k) = i;
392 }
393 for (i = j = 0; i < n; ++i) {
394 k = kh_get(str2id, hash, samples[i]);
395 if (k != kh_end(hash)) {
396 list[j++] = kh_val(hash, k);
397 kputs(samples[i], &s); kputc('\0', &s);
398 }
399 }
400 if (j < n)
401 {
402 fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j);
403 exit(1);
404 }
405 kh_destroy(str2id, hash);
406 h = calloc(1, sizeof(bcf_hdr_t));
407 *h = *h0;
408 h->ns = 0; h->sns = 0;
409 h->name = malloc(h->l_nm); memcpy(h->name, h0->name, h->l_nm);
410 h->txt = calloc(1, h->l_txt + 1); memcpy(h->txt, h0->txt, h->l_txt);
411 h->l_smpl = s.l; h->sname = s.s;
412 bcf_hdr_sync(h);
413 return h;
414 }
415
416 int bcf_subsam(int n_smpl, int *list, bcf1_t *b)
417 {
418 int i, j;
419 for (j = 0; j < b->n_gi; ++j) {
420 bcf_ginfo_t *gi = b->gi + j;
421 uint8_t *swap;
422 swap = malloc(gi->len * b->n_smpl);
423 for (i = 0; i < n_smpl; ++i)
424 memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len);
425 free(gi->data);
426 gi->data = swap;
427 }
428 b->n_smpl = n_smpl;
429 return 0;
430 }
431
432 static int8_t nt4_table[128] = {
433 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
434 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
435 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4,
436 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
437 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
438 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4,
439 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
440 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4
441 };
442
443 int bcf_gl10(const bcf1_t *b, uint8_t *gl)
444 {
445 int a[4], k, l, map[4], k1, j, i;
446 const bcf_ginfo_t *PL;
447 char *s;
448 if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base or >4 alleles
449 for (i = 0; i < b->n_gi; ++i)
450 if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
451 if (i == b->n_gi) return -1; // no PL
452 PL = b->gi + i;
453 a[0] = nt4_table[(int)b->ref[0]];
454 if (a[0] > 3 || a[0] < 0) return -1; // ref is not A/C/G/T
455 a[1] = a[2] = a[3] = -2; // -1 has a special meaning
456 if (b->alt[0] == 0) return -1; // no alternate allele
457 map[0] = map[1] = map[2] = map[3] = -2;
458 map[a[0]] = 0;
459 for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) {
460 if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base
461 a[k+1] = nt4_table[(int)*s];
462 if (a[k+1] >= 0) map[a[k+1]] = k+1;
463 else k1 = k + 1;
464 if (s[1] == 0) break; // the end of the ALT string
465 }
466 for (k = 0; k < 4; ++k)
467 if (map[k] < 0) map[k] = k1;
468 for (i = 0; i < b->n_smpl; ++i) {
469 const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
470 uint8_t *g = gl + 10 * i;
471 for (k = j = 0; k < 4; ++k) {
472 for (l = k; l < 4; ++l) {
473 int t, x = map[k], y = map[l];
474 if (x > y) t = x, x = y, y = t; // make sure x is the smaller
475 g[j++] = p[y * (y+1) / 2 + x];
476 }
477 }
478 }
479 return 0;
480 }
481
482 int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl)
483 {
484 int k, l, j, i;
485 const bcf_ginfo_t *PL;
486 if (b->alt[0] == 0) return -1; // no alternate allele
487 for (i = 0; i < b->n_gi; ++i)
488 if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
489 if (i == b->n_gi) return -1; // no PL
490 PL = b->gi + i;
491 for (i = 0; i < b->n_smpl; ++i) {
492 const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
493 uint8_t *g = gl + 10 * i;
494 for (k = j = 0; k < 4; ++k) {
495 for (l = k; l < 4; ++l) {
496 int t, x = k, y = l;
497 if (x > y) t = x, x = y, y = t; // make sure x is the smaller
498 x = y * (y+1) / 2 + x;
499 g[j++] = x < PL->len? p[x] : 255;
500 }
501 }
502 }
503 return 0;
504 }