Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/bcftools/bcfutils.c @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #include <string.h> | |
2 #include <math.h> | |
3 #include <assert.h> | |
4 #include "bcf.h" | |
5 #include "kstring.h" | |
6 #include "khash.h" | |
7 KHASH_MAP_INIT_STR(str2id, int) | |
8 | |
9 #ifdef _WIN32 | |
10 #define srand48(x) srand(x) | |
11 #define drand48() ((double)rand() / RAND_MAX) | |
12 #endif | |
13 | |
14 // FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated... | |
15 void *bcf_build_refhash(bcf_hdr_t *h) | |
16 { | |
17 khash_t(str2id) *hash; | |
18 int i, ret; | |
19 hash = kh_init(str2id); | |
20 for (i = 0; i < h->n_ref; ++i) { | |
21 khint_t k; | |
22 k = kh_put(str2id, hash, h->ns[i], &ret); // FIXME: check ret | |
23 kh_val(hash, k) = i; | |
24 } | |
25 return hash; | |
26 } | |
27 | |
28 void *bcf_str2id_init() | |
29 { | |
30 return kh_init(str2id); | |
31 } | |
32 | |
33 void bcf_str2id_destroy(void *_hash) | |
34 { | |
35 khash_t(str2id) *hash = (khash_t(str2id)*)_hash; | |
36 if (hash) kh_destroy(str2id, hash); // Note that strings are not freed. | |
37 } | |
38 | |
39 void bcf_str2id_thorough_destroy(void *_hash) | |
40 { | |
41 khash_t(str2id) *hash = (khash_t(str2id)*)_hash; | |
42 khint_t k; | |
43 if (hash == 0) return; | |
44 for (k = 0; k < kh_end(hash); ++k) | |
45 if (kh_exist(hash, k)) free((char*)kh_key(hash, k)); | |
46 kh_destroy(str2id, hash); | |
47 } | |
48 | |
49 int bcf_str2id(void *_hash, const char *str) | |
50 { | |
51 khash_t(str2id) *hash = (khash_t(str2id)*)_hash; | |
52 khint_t k; | |
53 if (!hash) return -1; | |
54 k = kh_get(str2id, hash, str); | |
55 return k == kh_end(hash)? -1 : kh_val(hash, k); | |
56 } | |
57 | |
58 int bcf_str2id_add(void *_hash, const char *str) | |
59 { | |
60 khint_t k; | |
61 int ret; | |
62 khash_t(str2id) *hash = (khash_t(str2id)*)_hash; | |
63 if (!hash) return -1; | |
64 k = kh_put(str2id, hash, str, &ret); | |
65 if (ret == 0) return kh_val(hash, k); | |
66 kh_val(hash, k) = kh_size(hash) - 1; | |
67 return kh_val(hash, k); | |
68 } | |
69 | |
70 void bcf_fit_alt(bcf1_t *b, int mask) | |
71 { | |
72 mask |= 1; // REF must be always present | |
73 | |
74 int i,j,nals=0; | |
75 for (i=0; i<sizeof(int); i++) | |
76 if ( mask&1<<i) nals++; | |
77 if ( b->n_alleles <= nals ) return; | |
78 | |
79 // update ALT, in principle any of the alleles can be removed | |
80 char *p; | |
81 if ( nals>1 ) | |
82 { | |
83 char *dst, *src; | |
84 int n=0, nalts=nals-1; | |
85 for (src=dst=p=b->alt, i=1; *p; p++) | |
86 { | |
87 if ( *p!=',' ) continue; | |
88 | |
89 if ( mask&1<<i ) | |
90 { | |
91 n++; | |
92 if ( src!=dst ) | |
93 { | |
94 memmove(dst,src,p-src); | |
95 dst += p-src; | |
96 } | |
97 else dst = p; | |
98 if ( n<nalts ) { *dst=','; dst++; } | |
99 } | |
100 i++; | |
101 | |
102 if ( n>=nalts ) { *dst=0; break; } | |
103 src = p+1; | |
104 } | |
105 if ( n<nalts ) | |
106 { | |
107 memmove(dst,src,p-src); | |
108 dst += p-src; | |
109 *dst = 0; | |
110 } | |
111 p = dst; | |
112 } | |
113 else p = b->alt, *p = '\0'; | |
114 p++; | |
115 memmove(p, b->flt, b->str + b->l_str - b->flt); | |
116 b->l_str -= b->flt - p; | |
117 | |
118 // update PL and GT | |
119 int ipl=-1, igt=-1; | |
120 for (i = 0; i < b->n_gi; ++i) | |
121 { | |
122 bcf_ginfo_t *g = b->gi + i; | |
123 if (g->fmt == bcf_str2int("PL", 2)) ipl = i; | |
124 if (g->fmt == bcf_str2int("GT", 2)) igt = i; | |
125 } | |
126 | |
127 // .. create mapping between old and new indexes | |
128 int npl = nals * (nals+1) / 2; | |
129 int *map = malloc(sizeof(int)*(npl>b->n_alleles ? npl : b->n_alleles)); | |
130 int kori=0,knew=0; | |
131 for (i=0; i<b->n_alleles; i++) | |
132 { | |
133 for (j=0; j<=i; j++) | |
134 { | |
135 int skip=0; | |
136 if ( i && !(mask&1<<i) ) skip=1; | |
137 if ( j && !(mask&1<<j) ) skip=1; | |
138 if ( !skip ) { map[knew++] = kori; } | |
139 kori++; | |
140 } | |
141 } | |
142 // .. apply to all samples | |
143 int n_smpl = b->n_smpl; | |
144 for (i = 0; i < b->n_gi; ++i) | |
145 { | |
146 bcf_ginfo_t *g = b->gi + i; | |
147 if (g->fmt == bcf_str2int("PL", 2)) | |
148 { | |
149 g->len = npl; | |
150 uint8_t *d = (uint8_t*)g->data; | |
151 int ismpl, npl_ori = b->n_alleles * (b->n_alleles + 1) / 2; | |
152 for (knew=ismpl=0; ismpl<n_smpl; ismpl++) | |
153 { | |
154 uint8_t *dl = d + ismpl * npl_ori; | |
155 for (j=0; j<npl; j++) d[knew++] = dl[map[j]]; | |
156 } | |
157 } // FIXME: to add GL | |
158 } | |
159 // update GTs | |
160 map[0] = 0; | |
161 for (i=1, knew=0; i<b->n_alleles; i++) | |
162 map[i] = mask&1<<i ? ++knew : -1; | |
163 for (i=0; i<n_smpl; i++) | |
164 { | |
165 uint8_t gt = ((uint8_t*)b->gi[igt].data)[i]; | |
166 int a1 = (gt>>3)&7; | |
167 int a2 = gt&7; | |
168 assert( map[a1]>=0 && map[a2]>=0 ); | |
169 ((uint8_t*)b->gi[igt].data)[i] = ((1<<7|1<<6)>) | map[a1]<<3 | map[a2]; | |
170 } | |
171 free(map); | |
172 b->n_alleles = nals; | |
173 bcf_sync(b); | |
174 } | |
175 | |
176 int bcf_shrink_alt(bcf1_t *b, int n) | |
177 { | |
178 char *p; | |
179 int i, j, k, n_smpl = b->n_smpl; | |
180 if (b->n_alleles <= n) return -1; | |
181 // update ALT | |
182 if (n > 1) { | |
183 for (p = b->alt, k = 1; *p; ++p) | |
184 if (*p == ',' && ++k == n) break; | |
185 *p = '\0'; | |
186 } else p = b->alt, *p = '\0'; | |
187 ++p; | |
188 memmove(p, b->flt, b->str + b->l_str - b->flt); | |
189 b->l_str -= b->flt - p; | |
190 // update PL | |
191 for (i = 0; i < b->n_gi; ++i) { | |
192 bcf_ginfo_t *g = b->gi + i; | |
193 if (g->fmt == bcf_str2int("PL", 2)) { | |
194 int l, x = b->n_alleles * (b->n_alleles + 1) / 2; | |
195 uint8_t *d = (uint8_t*)g->data; | |
196 g->len = n * (n + 1) / 2; | |
197 for (l = k = 0; l < n_smpl; ++l) { | |
198 uint8_t *dl = d + l * x; | |
199 for (j = 0; j < g->len; ++j) d[k++] = dl[j]; | |
200 } | |
201 } // FIXME: to add GL | |
202 } | |
203 b->n_alleles = n; | |
204 bcf_sync(b); | |
205 return 0; | |
206 } | |
207 | |
208 int bcf_gl2pl(bcf1_t *b) | |
209 { | |
210 char *p; | |
211 int i, n_smpl = b->n_smpl; | |
212 bcf_ginfo_t *g; | |
213 float *d0; | |
214 uint8_t *d1; | |
215 if (strstr(b->fmt, "PL")) return -1; | |
216 if ((p = strstr(b->fmt, "GL")) == 0) return -1; | |
217 *p = 'P'; | |
218 for (i = 0; i < b->n_gi; ++i) | |
219 if (b->gi[i].fmt == bcf_str2int("GL", 2)) | |
220 break; | |
221 g = b->gi + i; | |
222 g->fmt = bcf_str2int("PL", 2); | |
223 g->len /= 4; // 4 == sizeof(float) | |
224 d0 = (float*)g->data; d1 = (uint8_t*)g->data; | |
225 for (i = 0; i < n_smpl * g->len; ++i) { | |
226 int x = (int)(-10. * d0[i] + .499); | |
227 if (x > 255) x = 255; | |
228 if (x < 0) x = 0; | |
229 d1[i] = x; | |
230 } | |
231 return 0; | |
232 } | |
233 /* FIXME: this function will fail given AB:GTX:GT. BCFtools never | |
234 * produces such FMT, but others may do. */ | |
235 int bcf_fix_gt(bcf1_t *b) | |
236 { | |
237 char *s; | |
238 int i; | |
239 uint32_t tmp; | |
240 bcf_ginfo_t gt; | |
241 // check the presence of the GT FMT | |
242 if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first | |
243 assert(s[3] == '\0' || s[3] == ':'); // :GTX in fact | |
244 tmp = bcf_str2int("GT", 2); | |
245 for (i = 0; i < b->n_gi; ++i) | |
246 if (b->gi[i].fmt == tmp) break; | |
247 if (i == b->n_gi) return 0; // no GT in b->gi; probably a bug... | |
248 gt = b->gi[i]; | |
249 // move GT to the first | |
250 for (; i > 0; --i) b->gi[i] = b->gi[i-1]; | |
251 b->gi[0] = gt; | |
252 if ( s[3]==0 ) | |
253 memmove(b->fmt + 3, b->fmt, s - b->fmt); // :GT | |
254 else | |
255 memmove(b->fmt + 3, b->fmt, s - b->fmt + 1); // :GT: | |
256 b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':'; | |
257 return 0; | |
258 } | |
259 | |
260 int bcf_fix_pl(bcf1_t *b) | |
261 { | |
262 int i; | |
263 uint32_t tmp; | |
264 uint8_t *PL, *swap; | |
265 bcf_ginfo_t *gi; | |
266 // pinpoint PL | |
267 tmp = bcf_str2int("PL", 2); | |
268 for (i = 0; i < b->n_gi; ++i) | |
269 if (b->gi[i].fmt == tmp) break; | |
270 if (i == b->n_gi) return 0; | |
271 // prepare | |
272 gi = b->gi + i; | |
273 PL = (uint8_t*)gi->data; | |
274 swap = alloca(gi->len); | |
275 // loop through individuals | |
276 for (i = 0; i < b->n_smpl; ++i) { | |
277 int k, l, x; | |
278 uint8_t *PLi = PL + i * gi->len; | |
279 memcpy(swap, PLi, gi->len); | |
280 for (k = x = 0; k < b->n_alleles; ++k) | |
281 for (l = k; l < b->n_alleles; ++l) | |
282 PLi[l*(l+1)/2 + k] = swap[x++]; | |
283 } | |
284 return 0; | |
285 } | |
286 | |
287 int bcf_smpl_covered(const bcf1_t *b) | |
288 { | |
289 int i, j, n = 0; | |
290 uint32_t tmp; | |
291 bcf_ginfo_t *gi; | |
292 // pinpoint PL | |
293 tmp = bcf_str2int("PL", 2); | |
294 for (i = 0; i < b->n_gi; ++i) | |
295 if (b->gi[i].fmt == tmp) break; | |
296 if (i == b->n_gi) return 0; | |
297 // count how many samples having PL!=[0..0] | |
298 gi = b->gi + i; | |
299 for (i = 0; i < b->n_smpl; ++i) { | |
300 uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len; | |
301 for (j = 0; j < gi->len; ++j) | |
302 if (PLi[j]) break; | |
303 if (j < gi->len) ++n; | |
304 } | |
305 return n; | |
306 } | |
307 | |
308 static void *locate_field(const bcf1_t *b, const char *fmt, int l) | |
309 { | |
310 int i; | |
311 uint32_t tmp; | |
312 tmp = bcf_str2int(fmt, l); | |
313 for (i = 0; i < b->n_gi; ++i) | |
314 if (b->gi[i].fmt == tmp) break; | |
315 return i == b->n_gi? 0 : b->gi[i].data; | |
316 } | |
317 | |
318 int bcf_anno_max(bcf1_t *b) | |
319 { | |
320 int k, max_gq, max_sp, n_het; | |
321 kstring_t str; | |
322 uint8_t *gt, *gq; | |
323 int32_t *sp; | |
324 max_gq = max_sp = n_het = 0; | |
325 gt = locate_field(b, "GT", 2); | |
326 if (gt == 0) return -1; | |
327 gq = locate_field(b, "GQ", 2); | |
328 sp = locate_field(b, "SP", 2); | |
329 if (sp) | |
330 for (k = 0; k < b->n_smpl; ++k) | |
331 if (gt[k]&0x3f) | |
332 max_sp = max_sp > (int)sp[k]? max_sp : sp[k]; | |
333 if (gq) | |
334 for (k = 0; k < b->n_smpl; ++k) | |
335 if (gt[k]&0x3f) | |
336 max_gq = max_gq > (int)gq[k]? max_gq : gq[k]; | |
337 for (k = 0; k < b->n_smpl; ++k) { | |
338 int a1, a2; | |
339 a1 = gt[k]&7; a2 = gt[k]>>3&7; | |
340 if ((!a1 && a2) || (!a2 && a1)) { // a het | |
341 if (gq == 0) ++n_het; | |
342 else if (gq[k] >= 20) ++n_het; | |
343 } | |
344 } | |
345 if (n_het) max_sp -= (int)(4.343 * log(n_het) + .499); | |
346 if (max_sp < 0) max_sp = 0; | |
347 memset(&str, 0, sizeof(kstring_t)); | |
348 if (*b->info) kputc(';', &str); | |
349 ksprintf(&str, "MXSP=%d;MXGQ=%d", max_sp, max_gq); | |
350 bcf_append_info(b, str.s, str.l); | |
351 free(str.s); | |
352 return 0; | |
353 } | |
354 | |
355 // FIXME: only data are shuffled; the header is NOT | |
356 int bcf_shuffle(bcf1_t *b, int seed) | |
357 { | |
358 int i, j, *a; | |
359 if (seed > 0) srand48(seed); | |
360 a = malloc(b->n_smpl * sizeof(int)); | |
361 for (i = 0; i < b->n_smpl; ++i) a[i] = i; | |
362 for (i = b->n_smpl; i > 1; --i) { | |
363 int tmp; | |
364 j = (int)(drand48() * i); | |
365 tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; | |
366 } | |
367 for (j = 0; j < b->n_gi; ++j) { | |
368 bcf_ginfo_t *gi = b->gi + j; | |
369 uint8_t *swap, *data = (uint8_t*)gi->data; | |
370 swap = malloc(gi->len * b->n_smpl); | |
371 for (i = 0; i < b->n_smpl; ++i) | |
372 memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len); | |
373 free(gi->data); | |
374 gi->data = swap; | |
375 } | |
376 free(a); | |
377 return 0; | |
378 } | |
379 | |
380 bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list) | |
381 { | |
382 int i, ret, j; | |
383 khint_t k; | |
384 bcf_hdr_t *h; | |
385 khash_t(str2id) *hash; | |
386 kstring_t s; | |
387 s.l = s.m = 0; s.s = 0; | |
388 hash = kh_init(str2id); | |
389 for (i = 0; i < h0->n_smpl; ++i) { | |
390 k = kh_put(str2id, hash, h0->sns[i], &ret); | |
391 kh_val(hash, k) = i; | |
392 } | |
393 for (i = j = 0; i < n; ++i) { | |
394 k = kh_get(str2id, hash, samples[i]); | |
395 if (k != kh_end(hash)) { | |
396 list[j++] = kh_val(hash, k); | |
397 kputs(samples[i], &s); kputc('\0', &s); | |
398 } | |
399 } | |
400 if (j < n) | |
401 { | |
402 fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); | |
403 exit(1); | |
404 } | |
405 kh_destroy(str2id, hash); | |
406 h = calloc(1, sizeof(bcf_hdr_t)); | |
407 *h = *h0; | |
408 h->ns = 0; h->sns = 0; | |
409 h->name = malloc(h->l_nm); memcpy(h->name, h0->name, h->l_nm); | |
410 h->txt = calloc(1, h->l_txt + 1); memcpy(h->txt, h0->txt, h->l_txt); | |
411 h->l_smpl = s.l; h->sname = s.s; | |
412 bcf_hdr_sync(h); | |
413 return h; | |
414 } | |
415 | |
416 int bcf_subsam(int n_smpl, int *list, bcf1_t *b) | |
417 { | |
418 int i, j; | |
419 for (j = 0; j < b->n_gi; ++j) { | |
420 bcf_ginfo_t *gi = b->gi + j; | |
421 uint8_t *swap; | |
422 swap = malloc(gi->len * b->n_smpl); | |
423 for (i = 0; i < n_smpl; ++i) | |
424 memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len); | |
425 free(gi->data); | |
426 gi->data = swap; | |
427 } | |
428 b->n_smpl = n_smpl; | |
429 return 0; | |
430 } | |
431 | |
432 static int8_t nt4_table[128] = { | |
433 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
434 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
435 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4, | |
436 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, | |
437 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, | |
438 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4, | |
439 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, | |
440 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4 | |
441 }; | |
442 | |
443 int bcf_gl10(const bcf1_t *b, uint8_t *gl) | |
444 { | |
445 int a[4], k, l, map[4], k1, j, i; | |
446 const bcf_ginfo_t *PL; | |
447 char *s; | |
448 if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base or >4 alleles | |
449 for (i = 0; i < b->n_gi; ++i) | |
450 if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; | |
451 if (i == b->n_gi) return -1; // no PL | |
452 PL = b->gi + i; | |
453 a[0] = nt4_table[(int)b->ref[0]]; | |
454 if (a[0] > 3 || a[0] < 0) return -1; // ref is not A/C/G/T | |
455 a[1] = a[2] = a[3] = -2; // -1 has a special meaning | |
456 if (b->alt[0] == 0) return -1; // no alternate allele | |
457 map[0] = map[1] = map[2] = map[3] = -2; | |
458 map[a[0]] = 0; | |
459 for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) { | |
460 if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base | |
461 a[k+1] = nt4_table[(int)*s]; | |
462 if (a[k+1] >= 0) map[a[k+1]] = k+1; | |
463 else k1 = k + 1; | |
464 if (s[1] == 0) break; // the end of the ALT string | |
465 } | |
466 for (k = 0; k < 4; ++k) | |
467 if (map[k] < 0) map[k] = k1; | |
468 for (i = 0; i < b->n_smpl; ++i) { | |
469 const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual | |
470 uint8_t *g = gl + 10 * i; | |
471 for (k = j = 0; k < 4; ++k) { | |
472 for (l = k; l < 4; ++l) { | |
473 int t, x = map[k], y = map[l]; | |
474 if (x > y) t = x, x = y, y = t; // make sure x is the smaller | |
475 g[j++] = p[y * (y+1) / 2 + x]; | |
476 } | |
477 } | |
478 } | |
479 return 0; | |
480 } | |
481 | |
482 int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl) | |
483 { | |
484 int k, l, j, i; | |
485 const bcf_ginfo_t *PL; | |
486 if (b->alt[0] == 0) return -1; // no alternate allele | |
487 for (i = 0; i < b->n_gi; ++i) | |
488 if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; | |
489 if (i == b->n_gi) return -1; // no PL | |
490 PL = b->gi + i; | |
491 for (i = 0; i < b->n_smpl; ++i) { | |
492 const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual | |
493 uint8_t *g = gl + 10 * i; | |
494 for (k = j = 0; k < 4; ++k) { | |
495 for (l = k; l < 4; ++l) { | |
496 int t, x = k, y = l; | |
497 if (x > y) t = x, x = y, y = t; // make sure x is the smaller | |
498 x = y * (y+1) / 2 + x; | |
499 g[j++] = x < PL->len? p[x] : 255; | |
500 } | |
501 } | |
502 } | |
503 return 0; | |
504 } |