comparison ezBAMQC/src/htslib/vcf.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /* vcf.c -- VCF/BCF API functions.
2
3 Copyright (C) 2012, 2013 Broad Institute.
4 Copyright (C) 2012-2014 Genome Research Ltd.
5 Portions copyright (C) 2014 Intel Corporation.
6
7 Author: Heng Li <lh3@sanger.ac.uk>
8
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 DEALINGS IN THE SOFTWARE. */
26
27 #include <zlib.h>
28 #include <stdio.h>
29 #include <ctype.h>
30 #include <assert.h>
31 #include <string.h>
32 #include <stdlib.h>
33 #include <limits.h>
34 #include "htslib/kstring.h"
35 #include "htslib/bgzf.h"
36 #include "htslib/vcf.h"
37 #include "htslib/tbx.h"
38 #include "htslib/hfile.h"
39 #include "htslib/khash_str2int.h"
40
41 #include "htslib/khash.h"
42 KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
43 typedef khash_t(vdict) vdict_t;
44
45 #include "htslib/kseq.h"
46 KSTREAM_DECLARE(gzFile, gzread)
47
48 uint32_t bcf_float_missing = 0x7F800001;
49 uint32_t bcf_float_vector_end = 0x7F800002;
50 uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
51 static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
52
53 /*************************
54 *** VCF header parser ***
55 *************************/
56
57 int bcf_hdr_sync(bcf_hdr_t *h);
58
59 int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
60 {
61 if ( !s ) return 0;
62
63 const char *ss = s;
64 while ( !*ss && isspace(*ss) ) ss++;
65 if ( !*ss )
66 {
67 fprintf(stderr,"[E::%s] Empty sample name: trailing spaces/tabs in the header line?\n", __func__);
68 abort();
69 }
70
71 vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
72 int ret;
73 char *sdup = strdup(s);
74 int k = kh_put(vdict, d, sdup, &ret);
75 if (ret) { // absent
76 kh_val(d, k) = bcf_idinfo_def;
77 kh_val(d, k).id = kh_size(d) - 1;
78 } else {
79 if (hts_verbose >= 2)
80 {
81 fprintf(stderr, "[E::%s] Duplicated sample name '%s'\n", __func__, s);
82 abort();
83 }
84 free(sdup);
85 return -1;
86 }
87 int n = kh_size(d);
88 h->samples = (char**) realloc(h->samples,sizeof(char*)*n);
89 h->samples[n-1] = sdup;
90 h->dirty = 1;
91 return 0;
92 }
93
94 int bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str)
95 {
96 int ret = 0;
97 int i = 0;
98 const char *p, *q;
99 // add samples
100 for (p = q = str;; ++q) {
101 if (*q != '\t' && *q != 0 && *q != '\n') continue;
102 if (++i > 9) {
103 char *s = (char*)malloc(q - p + 1);
104 strncpy(s, p, q - p);
105 s[q - p] = 0;
106 if ( bcf_hdr_add_sample(h,s) < 0 ) ret = -1;
107 free(s);
108 }
109 if (*q == 0 || *q == '\n') break;
110 p = q + 1;
111 }
112 bcf_hdr_add_sample(h,NULL);
113 return ret;
114 }
115
116 int bcf_hdr_sync(bcf_hdr_t *h)
117 {
118 int i;
119 for (i = 0; i < 3; i++)
120 {
121 vdict_t *d = (vdict_t*)h->dict[i];
122 khint_t k;
123
124 // find out the largest id, there may be holes because of IDX
125 int max_id = -1;
126 for (k=kh_begin(d); k<kh_end(d); k++)
127 {
128 if (!kh_exist(d,k)) continue;
129 if ( max_id < kh_val(d,k).id ) max_id = kh_val(d,k).id;
130 }
131 if ( max_id >= h->n[i] )
132 {
133 h->id[i] = (bcf_idpair_t*)realloc(h->id[i], (max_id+1)*sizeof(bcf_idpair_t));
134 for (k=h->n[i]; k<=max_id; k++)
135 {
136 h->id[i][k].key = NULL;
137 h->id[i][k].val = NULL;
138 }
139 h->n[i] = max_id+1;
140 }
141 for (k=kh_begin(d); k<kh_end(d); k++)
142 {
143 if (!kh_exist(d,k)) continue;
144 h->id[i][kh_val(d,k).id].key = kh_key(d,k);
145 h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
146 }
147 }
148 h->dirty = 0;
149 return 0;
150 }
151
152 void bcf_hrec_destroy(bcf_hrec_t *hrec)
153 {
154 free(hrec->key);
155 if ( hrec->value ) free(hrec->value);
156 int i;
157 for (i=0; i<hrec->nkeys; i++)
158 {
159 free(hrec->keys[i]);
160 free(hrec->vals[i]);
161 }
162 free(hrec->keys);
163 free(hrec->vals);
164 free(hrec);
165 }
166
167 // Copies all fields except IDX.
168 bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
169 {
170 bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
171 out->type = hrec->type;
172 if ( hrec->key ) out->key = strdup(hrec->key);
173 if ( hrec->value ) out->value = strdup(hrec->value);
174 out->nkeys = hrec->nkeys;
175 out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
176 out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
177 int i, j = 0;
178 for (i=0; i<hrec->nkeys; i++)
179 {
180 if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
181 if ( hrec->keys[i] ) out->keys[j] = strdup(hrec->keys[i]);
182 if ( hrec->vals[i] ) out->vals[j] = strdup(hrec->vals[i]);
183 j++;
184 }
185 if ( i!=j ) out->nkeys -= i-j; // IDX was omitted
186 return out;
187 }
188
189 void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
190 {
191 fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
192 int i;
193 for (i=0; i<hrec->nkeys; i++)
194 fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
195 fprintf(fp, "\n");
196 }
197
198 void bcf_header_debug(bcf_hdr_t *hdr)
199 {
200 int i, j;
201 for (i=0; i<hdr->nhrec; i++)
202 {
203 if ( !hdr->hrec[i]->value )
204 {
205 fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
206 fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
207 for (j=1; j<hdr->hrec[i]->nkeys; j++)
208 fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
209 fprintf(stderr,">\n");
210 }
211 else
212 fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
213 }
214 }
215
216 void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len)
217 {
218 int n = ++hrec->nkeys;
219 hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*n);
220 hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*n);
221 assert( len );
222 hrec->keys[n-1] = (char*) malloc((len+1)*sizeof(char));
223 memcpy(hrec->keys[n-1],str,len);
224 hrec->keys[n-1][len] = 0;
225 hrec->vals[n-1] = NULL;
226 }
227
228 void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted)
229 {
230 if ( !str ) { hrec->vals[i] = NULL; return; }
231 if ( hrec->vals[i] ) free(hrec->vals[i]);
232 if ( is_quoted )
233 {
234 hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
235 hrec->vals[i][0] = '"';
236 memcpy(&hrec->vals[i][1],str,len);
237 hrec->vals[i][len+1] = '"';
238 hrec->vals[i][len+2] = 0;
239 }
240 else
241 {
242 hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
243 memcpy(hrec->vals[i],str,len);
244 hrec->vals[i][len] = 0;
245 }
246 }
247
248 void hrec_add_idx(bcf_hrec_t *hrec, int idx)
249 {
250 int n = ++hrec->nkeys;
251 hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*n);
252 hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*n);
253 hrec->keys[n-1] = strdup("IDX");
254 kstring_t str = {0,0,0};
255 kputw(idx, &str);
256 hrec->vals[n-1] = str.s;
257 }
258
259 int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
260 {
261 int i;
262 for (i=0; i<hrec->nkeys; i++)
263 if ( !strcasecmp(key,hrec->keys[i]) ) return i;
264 return -1;
265 }
266
267 static inline int is_escaped(const char *min, const char *str)
268 {
269 int n = 0;
270 while ( --str>=min && *str=='\\' ) n++;
271 return n%2;
272 }
273
274 bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
275 {
276 const char *p = line;
277 if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
278 p += 2;
279
280 const char *q = p;
281 while ( *q && *q!='=' ) q++;
282 int n = q-p;
283 if ( *q!='=' || !n ) { *len = q-line+1; return NULL; } // wrong format
284
285 bcf_hrec_t *hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
286 hrec->key = (char*) malloc(sizeof(char)*(n+1));
287 memcpy(hrec->key,p,n);
288 hrec->key[n] = 0;
289
290 p = ++q;
291 if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
292 {
293 while ( *q && *q!='\n' ) q++;
294 hrec->value = (char*) malloc((q-p+1)*sizeof(char));
295 memcpy(hrec->value, p, q-p);
296 hrec->value[q-p] = 0;
297 *len = q-line+1;
298 return hrec;
299 }
300
301 // structured line, e.g. ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
302 int nopen = 1;
303 while ( *q && *q!='\n' && nopen )
304 {
305 p = ++q;
306 while ( *q && isalnum(*q) ) q++;
307 n = q-p;
308 if ( *q!='=' || !n )
309 {
310 // wrong format
311 while ( *q && *q!='\n' ) q++;
312 kstring_t tmp = {0,0,0};
313 kputsn(line,q-line,&tmp);
314 fprintf(stderr,"Could not parse the header line: \"%s\"\n", tmp.s);
315 free(tmp.s);
316 *len = q-line+1;
317 bcf_hrec_destroy(hrec);
318 return NULL;
319 }
320 bcf_hrec_add_key(hrec, p, q-p);
321 p = ++q;
322 int quoted = *p=='"' ? 1 : 0;
323 if ( quoted ) p++, q++;
324 while (1)
325 {
326 if ( !*q ) break;
327 if ( quoted ) { if ( *q=='"' && !is_escaped(p,q) ) break; }
328 else
329 {
330 if ( *q=='<' ) nopen++;
331 if ( *q=='>' ) nopen--;
332 if ( !nopen ) break;
333 if ( *q==',' && nopen==1 ) break;
334 }
335 q++;
336 }
337 bcf_hrec_set_val(hrec, hrec->nkeys-1, p, q-p, quoted);
338 if ( quoted ) q++;
339 if ( *q=='>' ) { nopen--; q++; }
340 }
341 *len = q-line+1;
342 return hrec;
343 }
344
345 // returns: 1 when hdr needs to be synced, 0 otherwise
346 int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
347 {
348 // contig
349 int i,j,k, ret;
350 char *str;
351 if ( !strcmp(hrec->key, "contig") )
352 {
353 hrec->type = BCF_HL_CTG;
354
355 // Get the contig ID ($str) and length ($j)
356 i = bcf_hrec_find_key(hrec,"length");
357 if ( i<0 ) j = 0;
358 else if ( sscanf(hrec->vals[i],"%d",&j)!=1 ) return 0;
359
360 i = bcf_hrec_find_key(hrec,"ID");
361 if ( i<0 ) return 0;
362 str = strdup(hrec->vals[i]);
363
364 // Register in the dictionary
365 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
366 k = kh_put(vdict, d, str, &ret);
367 if ( !ret ) { free(str); return 0; } // already present
368
369 int idx = bcf_hrec_find_key(hrec,"IDX");
370 if ( idx!=-1 )
371 {
372 char *tmp = hrec->vals[idx];
373 idx = strtol(hrec->vals[idx], &tmp, 10);
374 if ( *tmp )
375 {
376 fprintf(stderr,"[%s:%d %s] Error parsing the IDX tag, skipping.\n", __FILE__,__LINE__,__FUNCTION__);
377 return 0;
378 }
379 }
380 else
381 {
382 idx = kh_size(d) - 1;
383 hrec_add_idx(hrec, idx);
384 }
385
386 kh_val(d, k) = bcf_idinfo_def;
387 kh_val(d, k).id = idx;
388 kh_val(d, k).info[0] = j;
389 kh_val(d, k).hrec[0] = hrec;
390
391 return 1;
392 }
393
394 if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
395 else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
396 else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
397 else if ( hrec->nkeys>0 ) { hrec->type = BCF_HL_STR; return 1; }
398 else return 0;
399
400 // INFO/FILTER/FORMAT
401 char *id = NULL;
402 int type = -1, num = -1, var = -1, idx = -1;
403 for (i=0; i<hrec->nkeys; i++)
404 {
405 if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
406 else if ( !strcmp(hrec->keys[i], "IDX") )
407 {
408 char *tmp = hrec->vals[i];
409 idx = strtol(hrec->vals[i], &tmp, 10);
410 if ( *tmp )
411 {
412 fprintf(stderr,"[%s:%d %s] Error parsing the IDX tag, skipping.\n", __FILE__,__LINE__,__FUNCTION__);
413 return 0;
414 }
415 }
416 else if ( !strcmp(hrec->keys[i], "Type") )
417 {
418 if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
419 else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
420 else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
421 else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
422 else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
423 else
424 {
425 fprintf(stderr, "[E::%s] The type \"%s\" not supported, assuming \"String\"\n", __func__, hrec->vals[i]);
426 type = BCF_HT_STR;
427 }
428 }
429 else if ( !strcmp(hrec->keys[i], "Number") )
430 {
431 if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
432 else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
433 else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
434 else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
435 else
436 {
437 sscanf(hrec->vals[i],"%d",&num);
438 var = BCF_VL_FIXED;
439 }
440 if (var != BCF_VL_FIXED) num = 0xfffff;
441 }
442 }
443 uint32_t info = (uint32_t)num<<12 | var<<8 | type<<4 | hrec->type;
444
445 if ( !id ) return 0;
446 str = strdup(id);
447
448 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
449 k = kh_put(vdict, d, str, &ret);
450 if ( !ret )
451 {
452 // already present
453 free(str);
454 if ( kh_val(d, k).hrec[info&0xf] ) return 0;
455 kh_val(d, k).info[info&0xf] = info;
456 kh_val(d, k).hrec[info&0xf] = hrec;
457 if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d, k).id);
458 return 1;
459 }
460 kh_val(d, k) = bcf_idinfo_def;
461 kh_val(d, k).info[info&0xf] = info;
462 kh_val(d, k).hrec[info&0xf] = hrec;
463 kh_val(d, k).id = idx==-1 ? kh_size(d) - 1 : idx;
464
465 if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d, k).id);
466
467 return 1;
468 }
469
470 int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
471 {
472 if ( !hrec ) return 0;
473
474 hrec->type = BCF_HL_GEN;
475 if ( !bcf_hdr_register_hrec(hdr,hrec) )
476 {
477 // If one of the hashed field, then it is already present
478 if ( hrec->type != BCF_HL_GEN )
479 {
480 bcf_hrec_destroy(hrec);
481 return 0;
482 }
483
484 // Is one of the generic fields and already present?
485 int i;
486 for (i=0; i<hdr->nhrec; i++)
487 {
488 if ( hdr->hrec[i]->type!=BCF_HL_GEN ) continue;
489 if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hrec->key,"fileformat") ) break;
490 if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hdr->hrec[i]->value,hrec->value) ) break;
491 }
492 if ( i<hdr->nhrec )
493 {
494 bcf_hrec_destroy(hrec);
495 return 0;
496 }
497 }
498
499 // New record, needs to be added
500 int n = ++hdr->nhrec;
501 hdr->hrec = (bcf_hrec_t**) realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
502 hdr->hrec[n-1] = hrec;
503 hdr->dirty = 1;
504
505 return hrec->type==BCF_HL_GEN ? 0 : 1;
506 }
507
508 /*
509 * Note that while querying of FLT,INFO,FMT,CTG lines is fast (the keys are hashed),
510 * the STR,GEN lines are searched for linearly in a linked list of all header lines.
511 * This may become a problem for VCFs with huge headers, we might need to build a
512 * dictionary for these lines as well.
513 */
514 bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
515 {
516 int i;
517 if ( type==BCF_HL_GEN )
518 {
519 for (i=0; i<hdr->nhrec; i++)
520 {
521 if ( hdr->hrec[i]->type!=type ) continue;
522 if ( strcmp(hdr->hrec[i]->key,key) ) continue;
523 if ( !value || !strcmp(hdr->hrec[i]->value,value) ) return hdr->hrec[i];
524 }
525 return NULL;
526 }
527 else if ( type==BCF_HL_STR )
528 {
529 for (i=0; i<hdr->nhrec; i++)
530 {
531 if ( hdr->hrec[i]->type!=type ) continue;
532 if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
533 int j = bcf_hrec_find_key(hdr->hrec[i],key);
534 if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
535 }
536 return NULL;
537 }
538 vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
539 khint_t k = kh_get(vdict, d, value);
540 if ( k == kh_end(d) ) return NULL;
541 return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
542 }
543
544 void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
545 {
546 static int PL_warned = 0, GL_warned = 0;
547
548 if ( !PL_warned )
549 {
550 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL");
551 if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
552 {
553 fprintf(stderr,"[W::%s] PL should be declared as Number=G\n", __func__);
554 PL_warned = 1;
555 }
556 }
557 if ( !GL_warned )
558 {
559 int id = bcf_hdr_id2int(hdr, BCF_HL_FMT, "GL");
560 if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
561 {
562 fprintf(stderr,"[W::%s] GL should be declared as Number=G\n", __func__);
563 PL_warned = 1;
564 }
565 }
566 }
567
568 int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
569 {
570 int len, needs_sync = 0;
571 char *p = htxt;
572
573 // Check sanity: "fileformat" string must come as first
574 bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
575 if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
576 fprintf(stderr, "[W::%s] The first line should be ##fileformat; is the VCF/BCF header broken?\n", __func__);
577 needs_sync += bcf_hdr_add_hrec(hdr, hrec);
578
579 // The filter PASS must appear first in the dictionary
580 hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
581 needs_sync += bcf_hdr_add_hrec(hdr, hrec);
582
583 // Parse the whole header
584 while ( (hrec=bcf_hdr_parse_line(hdr,p,&len)) )
585 {
586 needs_sync += bcf_hdr_add_hrec(hdr, hrec);
587 p += len;
588 }
589 int ret = bcf_hdr_parse_sample_line(hdr,p);
590 bcf_hdr_sync(hdr);
591 bcf_hdr_check_sanity(hdr);
592 return ret;
593 }
594
595 int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
596 {
597 int len;
598 bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
599 if ( !hrec ) return -1;
600 bcf_hdr_add_hrec(hdr, hrec);
601 return 0;
602 }
603
604 void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
605 {
606 int i;
607 bcf_hrec_t *hrec;
608 while (1)
609 {
610 if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
611 {
612 hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
613 if ( !hrec ) return;
614
615 for (i=0; i<hdr->nhrec; i++)
616 if ( hdr->hrec[i]==hrec ) break;
617 assert( i<hdr->nhrec );
618
619 vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
620 khint_t k = kh_get(vdict, d, key);
621 kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
622 }
623 else
624 {
625 for (i=0; i<hdr->nhrec; i++)
626 {
627 if ( hdr->hrec[i]->type!=type ) continue;
628 if ( type==BCF_HL_GEN )
629 {
630 if ( !strcmp(hdr->hrec[i]->key,key) ) break;
631 }
632 else
633 {
634 // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
635 int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
636 if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
637 }
638 }
639 if ( i==hdr->nhrec ) return;
640 hrec = hdr->hrec[i];
641 }
642
643 hdr->nhrec--;
644 if ( i < hdr->nhrec )
645 memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
646 bcf_hrec_destroy(hrec);
647 hdr->dirty = 1;
648 }
649 }
650
651 int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
652 {
653 va_list ap;
654 va_start(ap, fmt);
655 int n = vsnprintf(NULL, 0, fmt, ap) + 2;
656 va_end(ap);
657
658 char *line = (char*)malloc(n);
659 va_start(ap, fmt);
660 vsnprintf(line, n, fmt, ap);
661 va_end(ap);
662
663 int ret = bcf_hdr_append(hdr, line);
664
665 free(line);
666 return ret;
667 }
668
669
670 /**********************
671 *** BCF header I/O ***
672 **********************/
673
674 const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
675 {
676 bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
677 if ( !hrec )
678 {
679 fprintf(stderr,"No version string found, assuming VCFv4.2\n");
680 return "VCFv4.2";
681 }
682 return hrec->value;
683 }
684
685 void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
686 {
687 bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
688 if ( !hrec )
689 {
690 int len;
691 kstring_t str = {0,0,0};
692 ksprintf(&str,"##fileformat=%s", version);
693 hrec = bcf_hdr_parse_line(hdr, str.s, &len);
694 free(str.s);
695 }
696 else
697 {
698 free(hrec->value);
699 hrec->value = strdup(version);
700 }
701 hdr->dirty = 1;
702 }
703
704 bcf_hdr_t *bcf_hdr_init(const char *mode)
705 {
706 int i;
707 bcf_hdr_t *h;
708 h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
709 for (i = 0; i < 3; ++i)
710 h->dict[i] = kh_init(vdict);
711 if ( strchr(mode,'w') )
712 {
713 bcf_hdr_append(h, "##fileformat=VCFv4.2");
714 // The filter PASS must appear first in the dictionary
715 bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
716 }
717 return h;
718 }
719
720 void bcf_hdr_destroy(bcf_hdr_t *h)
721 {
722 int i;
723 khint_t k;
724 for (i = 0; i < 3; ++i) {
725 vdict_t *d = (vdict_t*)h->dict[i];
726 if (d == 0) continue;
727 for (k = kh_begin(d); k != kh_end(d); ++k)
728 if (kh_exist(d, k)) free((char*)kh_key(d, k));
729 kh_destroy(vdict, d);
730 free(h->id[i]);
731 }
732 for (i=0; i<h->nhrec; i++)
733 bcf_hrec_destroy(h->hrec[i]);
734 if (h->nhrec) free(h->hrec);
735 if (h->samples) free(h->samples);
736 free(h->keep_samples);
737 free(h->transl[0]); free(h->transl[1]);
738 free(h->mem.s);
739 free(h);
740 }
741
742 bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
743 {
744 if (hfp->format.format == vcf)
745 return vcf_hdr_read(hfp);
746
747 BGZF *fp = hfp->fp.bgzf;
748 uint8_t magic[5];
749 bcf_hdr_t *h;
750 h = bcf_hdr_init("r");
751 if ( bgzf_read(fp, magic, 5)<0 )
752 {
753 fprintf(stderr,"[%s:%d %s] Failed to read the header (reading BCF in text mode?)\n", __FILE__,__LINE__,__FUNCTION__);
754 return NULL;
755 }
756 if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
757 {
758 if (!strncmp((char*)magic, "BCF", 3))
759 fprintf(stderr,"[%s:%d %s] invalid BCF2 magic string: only BCFv2.2 is supported.\n", __FILE__,__LINE__,__FUNCTION__);
760 else if (hts_verbose >= 2)
761 fprintf(stderr, "[E::%s] invalid BCF2 magic string\n", __func__);
762 bcf_hdr_destroy(h);
763 return 0;
764 }
765 int hlen;
766 char *htxt;
767 bgzf_read(fp, &hlen, 4);
768 htxt = (char*)malloc(hlen);
769 bgzf_read(fp, htxt, hlen);
770 bcf_hdr_parse(h, htxt);
771 free(htxt);
772 return h;
773 }
774
775 int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
776 {
777 if ( h->dirty ) bcf_hdr_sync(h);
778 if (hfp->format.format == vcf || hfp->format.format == text_format)
779 return vcf_hdr_write(hfp, h);
780
781 int hlen;
782 char *htxt = bcf_hdr_fmt_text(h, 1, &hlen);
783 hlen++; // include the \0 byte
784
785 BGZF *fp = hfp->fp.bgzf;
786 if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
787 if ( bgzf_write(fp, &hlen, 4) !=4 ) return -1;
788 if ( bgzf_write(fp, htxt, hlen) != hlen ) return -1;
789
790 free(htxt);
791 return 0;
792 }
793
794 /********************
795 *** BCF site I/O ***
796 ********************/
797
798 bcf1_t *bcf_init1()
799 {
800 bcf1_t *v;
801 v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
802 return v;
803 }
804
805 void bcf_clear(bcf1_t *v)
806 {
807 int i;
808 for (i=0; i<v->d.m_info; i++)
809 {
810 if ( v->d.info[i].vptr_free )
811 {
812 free(v->d.info[i].vptr - v->d.info[i].vptr_off);
813 v->d.info[i].vptr_free = 0;
814 }
815 }
816 for (i=0; i<v->d.m_fmt; i++)
817 {
818 if ( v->d.fmt[i].p_free )
819 {
820 free(v->d.fmt[i].p - v->d.fmt[i].p_off);
821 v->d.fmt[i].p_free = 0;
822 }
823 }
824 v->rid = v->pos = v->rlen = v->unpacked = 0;
825 bcf_float_set_missing(v->qual);
826 v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
827 v->shared.l = v->indiv.l = 0;
828 v->d.var_type = -1;
829 v->d.shared_dirty = 0;
830 v->d.indiv_dirty = 0;
831 v->d.n_flt = 0;
832 v->errcode = 0;
833 if (v->d.m_als) v->d.als[0] = 0;
834 if (v->d.m_id) v->d.id[0] = 0;
835 }
836
837 void bcf_empty1(bcf1_t *v)
838 {
839 bcf_clear1(v);
840 free(v->d.id);
841 free(v->d.als);
842 free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
843 if (v->d.var ) free(v->d.var);
844 free(v->shared.s); free(v->indiv.s);
845 }
846
847 void bcf_destroy1(bcf1_t *v)
848 {
849 bcf_empty1(v);
850 free(v);
851 }
852
853 static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
854 {
855 uint32_t x[8];
856 int ret;
857 if ((ret = bgzf_read(fp, x, 32)) != 32) {
858 if (ret == 0) return -1;
859 return -2;
860 }
861 bcf_clear1(v);
862 x[0] -= 24; // to exclude six 32-bit integers
863 ks_resize(&v->shared, x[0]);
864 ks_resize(&v->indiv, x[1]);
865 memcpy(v, x + 2, 16);
866 v->n_allele = x[6]>>16; v->n_info = x[6]&0xffff;
867 v->n_fmt = x[7]>>24; v->n_sample = x[7]&0xffffff;
868 v->shared.l = x[0], v->indiv.l = x[1];
869
870 // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
871 if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
872
873 bgzf_read(fp, v->shared.s, v->shared.l);
874 bgzf_read(fp, v->indiv.s, v->indiv.l);
875 return 0;
876 }
877
878 #define bit_array_size(n) ((n)/8+1)
879 #define bit_array_set(a,i) ((a)[(i)/8] |= 1 << ((i)%8))
880 #define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
881 #define bit_array_test(a,i) ((a)[(i)/8] & (1 << ((i)%8)))
882
883 static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
884 int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
885 {
886 if ( !hdr->keep_samples ) return 0;
887 if ( !bcf_hdr_nsamples(hdr) )
888 {
889 rec->indiv.l = rec->n_sample = 0;
890 return 0;
891 }
892
893 int i, j;
894 uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
895 bcf_dec_t *dec = &rec->d;
896 hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
897 for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
898
899 for (i=0; i<rec->n_fmt; i++)
900 {
901 ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
902 src = dec->fmt[i].p - dec->fmt[i].size;
903 if ( dst )
904 {
905 memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
906 dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
907 }
908 dst = dec->fmt[i].p;
909 for (j=0; j<hdr->nsamples_ori; j++)
910 {
911 src += dec->fmt[i].size;
912 if ( !bit_array_test(hdr->keep_samples,j) ) continue;
913 memmove(dst, src, dec->fmt[i].size);
914 dst += dec->fmt[i].size;
915 }
916 rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
917 dec->fmt[i].p_len = dst - dec->fmt[i].p;
918 }
919 rec->unpacked |= BCF_UN_FMT;
920
921 rec->n_sample = bcf_hdr_nsamples(hdr);
922 return 0;
923 }
924
925 int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
926 {
927 if (fp->format.format == vcf) return vcf_read(fp,h,v);
928 int ret = bcf_read1_core(fp->fp.bgzf, v);
929 if ( ret!=0 || !h->keep_samples ) return ret;
930 return bcf_subset_format(h,v);
931 }
932
933 int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, int *beg, int *end)
934 {
935 bcf1_t *v = (bcf1_t *) vv;
936 int ret;
937 if ((ret = bcf_read1_core(fp, v)) >= 0)
938 *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
939 return ret;
940 }
941
942 static inline void bcf1_sync_id(bcf1_t *line, kstring_t *str)
943 {
944 // single typed string
945 if ( line->d.id && strcmp(line->d.id, ".") ) bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
946 else bcf_enc_size(str, 0, BCF_BT_CHAR);
947 }
948 static inline void bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
949 {
950 // list of typed strings
951 int i;
952 for (i=0; i<line->n_allele; i++)
953 bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]);
954 if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
955 }
956 static inline void bcf1_sync_filter(bcf1_t *line, kstring_t *str)
957 {
958 // typed vector of integers
959 if ( line->d.n_flt ) bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
960 else bcf_enc_vint(str, 0, 0, -1);
961 }
962
963 static inline void bcf1_sync_info(bcf1_t *line, kstring_t *str)
964 {
965 // pairs of typed vectors
966 int i, irm = -1;
967 for (i=0; i<line->n_info; i++)
968 {
969 bcf_info_t *info = &line->d.info[i];
970 if ( !info->vptr )
971 {
972 // marked for removal
973 if ( irm < 0 ) irm = i;
974 continue;
975 }
976 kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str);
977 if ( irm >=0 )
978 {
979 bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
980 while ( irm<=i && line->d.info[irm].vptr ) irm++;
981 }
982 }
983 if ( irm>=0 ) line->n_info = irm;
984 }
985
986 static int bcf1_sync(bcf1_t *line)
987 {
988 char *shared_ori = line->shared.s;
989 size_t prev_len;
990
991 kstring_t tmp = {0,0,0};
992 if ( !line->shared.l )
993 {
994 // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
995 tmp = line->shared;
996 bcf1_sync_id(line, &tmp);
997 line->unpack_size[0] = tmp.l; prev_len = tmp.l;
998
999 bcf1_sync_alleles(line, &tmp);
1000 line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
1001
1002 bcf1_sync_filter(line, &tmp);
1003 line->unpack_size[2] = tmp.l - prev_len;
1004
1005 bcf1_sync_info(line, &tmp);
1006 line->shared = tmp;
1007 }
1008 else if ( line->d.shared_dirty )
1009 {
1010 // The line was edited, update the BCF data block, ptr_ori points
1011 // to the original unchanged BCF data.
1012 uint8_t *ptr_ori = (uint8_t *) line->shared.s;
1013
1014 assert( line->unpacked & BCF_UN_STR );
1015
1016 // ID: single typed string
1017 if ( line->d.shared_dirty & BCF1_DIRTY_ID )
1018 bcf1_sync_id(line, &tmp);
1019 else
1020 kputsn_(ptr_ori, line->unpack_size[0], &tmp);
1021 ptr_ori += line->unpack_size[0];
1022 line->unpack_size[0] = tmp.l; prev_len = tmp.l;
1023
1024 // REF+ALT: list of typed strings
1025 if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
1026 bcf1_sync_alleles(line, &tmp);
1027 else
1028 {
1029 kputsn_(ptr_ori, line->unpack_size[1], &tmp);
1030 if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
1031 }
1032 ptr_ori += line->unpack_size[1];
1033 line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
1034
1035 if ( line->unpacked & BCF_UN_FLT )
1036 {
1037 // FILTER: typed vector of integers
1038 if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
1039 bcf1_sync_filter(line, &tmp);
1040 else if ( line->d.n_flt )
1041 kputsn_(ptr_ori, line->unpack_size[2], &tmp);
1042 else
1043 bcf_enc_vint(&tmp, 0, 0, -1);
1044 ptr_ori += line->unpack_size[2];
1045 line->unpack_size[2] = tmp.l - prev_len;
1046
1047 if ( line->unpacked & BCF_UN_INFO )
1048 {
1049 // INFO: pairs of typed vectors
1050 if ( line->d.shared_dirty & BCF1_DIRTY_INF )
1051 {
1052 bcf1_sync_info(line, &tmp);
1053 ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
1054 }
1055 }
1056 }
1057
1058 int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
1059 if ( size ) kputsn_(ptr_ori, size, &tmp);
1060
1061 free(line->shared.s);
1062 line->shared = tmp;
1063 }
1064 if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
1065 {
1066 // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
1067 size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
1068 int i;
1069 for (i=0; i<line->n_info; i++)
1070 {
1071 uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
1072 line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
1073 off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
1074 if ( vptr_free )
1075 {
1076 free(vptr_free);
1077 line->d.info[i].vptr_free = 0;
1078 }
1079 }
1080 }
1081
1082 if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
1083 {
1084 // The genotype fields changed or are not present
1085 tmp.l = tmp.m = 0; tmp.s = NULL;
1086 int i, irm = -1;
1087 for (i=0; i<line->n_fmt; i++)
1088 {
1089 bcf_fmt_t *fmt = &line->d.fmt[i];
1090 if ( !fmt->p )
1091 {
1092 // marked for removal
1093 if ( irm < 0 ) irm = i;
1094 continue;
1095 }
1096 kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
1097 if ( irm >=0 )
1098 {
1099 bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
1100 while ( irm<=i && line->d.fmt[irm].p ) irm++;
1101 }
1102
1103 }
1104 if ( irm>=0 ) line->n_fmt = irm;
1105 free(line->indiv.s);
1106 line->indiv = tmp;
1107
1108 // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
1109 size_t off_new = 0;
1110 for (i=0; i<line->n_fmt; i++)
1111 {
1112 uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
1113 line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
1114 off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
1115 if ( p_free )
1116 {
1117 free(p_free);
1118 line->d.fmt[i].p_free = 0;
1119 }
1120 }
1121 }
1122 if ( !line->n_sample ) line->n_fmt = 0;
1123 line->d.shared_dirty = line->d.indiv_dirty = 0;
1124 return 0;
1125 }
1126
1127 bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
1128 {
1129 bcf1_sync(src);
1130
1131 bcf_clear(dst);
1132 dst->rid = src->rid;
1133 dst->pos = src->pos;
1134 dst->rlen = src->rlen;
1135 dst->qual = src->qual;
1136 dst->n_info = src->n_info; dst->n_allele = src->n_allele;
1137 dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
1138
1139 dst->shared.m = dst->shared.l = src->shared.l;
1140 dst->shared.s = (char*) malloc(dst->shared.l);
1141 memcpy(dst->shared.s,src->shared.s,dst->shared.l);
1142
1143 dst->indiv.m = dst->indiv.l = src->indiv.l;
1144 dst->indiv.s = (char*) malloc(dst->indiv.l);
1145 memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
1146
1147 return dst;
1148 }
1149 bcf1_t *bcf_dup(bcf1_t *src)
1150 {
1151 bcf1_t *out = bcf_init1();
1152 return bcf_copy(out, src);
1153 }
1154
1155 int bcf_write(htsFile *hfp, const bcf_hdr_t *h, bcf1_t *v)
1156 {
1157 if ( h->dirty )
1158 {
1159 // we could as well call bcf_hdr_sync here, not sure
1160 fprintf(stderr,"FIXME: dirty header not synced\n");
1161 exit(1);
1162 }
1163 if ( bcf_hdr_nsamples(h)!=v->n_sample )
1164 {
1165 fprintf(stderr,"[%s:%d %s] Broken VCF record, the number of columns at %s:%d does not match the number of samples (%d vs %d).\n",
1166 __FILE__,__LINE__,__FUNCTION__,bcf_seqname(h,v),v->pos+1, v->n_sample,bcf_hdr_nsamples(h));
1167 return -1;
1168 }
1169
1170 if ( hfp->format.format == vcf || hfp->format.format == text_format )
1171 return vcf_write(hfp,h,v);
1172
1173 if ( v->errcode )
1174 {
1175 // vcf_parse1() encountered a new contig or tag, undeclared in the
1176 // header. At this point, the header must have been printed,
1177 // proceeding would lead to a broken BCF file. Errors must be checked
1178 // and cleared by the caller before we can proceed.
1179 fprintf(stderr,"[%s:%d %s] Unchecked error (%d), exiting.\n", __FILE__,__LINE__,__FUNCTION__,v->errcode);
1180 exit(1);
1181 }
1182 bcf1_sync(v); // check if the BCF record was modified
1183
1184 BGZF *fp = hfp->fp.bgzf;
1185 uint32_t x[8];
1186 x[0] = v->shared.l + 24; // to include six 32-bit integers
1187 x[1] = v->indiv.l;
1188 memcpy(x + 2, v, 16);
1189 x[6] = (uint32_t)v->n_allele<<16 | v->n_info;
1190 x[7] = (uint32_t)v->n_fmt<<24 | v->n_sample;
1191 if ( bgzf_write(fp, x, 32) != 32 ) return -1;
1192 if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
1193 if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
1194 return 0;
1195 }
1196
1197 /**********************
1198 *** VCF header I/O ***
1199 **********************/
1200
1201 bcf_hdr_t *vcf_hdr_read(htsFile *fp)
1202 {
1203 kstring_t txt, *s = &fp->line;
1204 bcf_hdr_t *h;
1205 h = bcf_hdr_init("r");
1206 txt.l = txt.m = 0; txt.s = 0;
1207 while (hts_getline(fp, KS_SEP_LINE, s) >= 0) {
1208 if (s->l == 0) continue;
1209 if (s->s[0] != '#') {
1210 if (hts_verbose >= 2)
1211 fprintf(stderr, "[E::%s] no sample line\n", __func__);
1212 free(txt.s);
1213 bcf_hdr_destroy(h);
1214 return 0;
1215 }
1216 if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
1217 int dret;
1218 gzFile f;
1219 kstream_t *ks;
1220 kstring_t tmp;
1221 tmp.l = tmp.m = 0; tmp.s = 0;
1222 f = gzopen(fp->fn_aux, "r");
1223 ks = ks_init(f);
1224 while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
1225 int c;
1226 kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt);
1227 ks_getuntil(ks, 0, &tmp, &dret);
1228 kputs(",length=", &txt); kputw(atol(tmp.s), &txt);
1229 kputsn(">\n", 2, &txt);
1230 if (dret != '\n')
1231 while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line
1232 }
1233 free(tmp.s);
1234 ks_destroy(ks);
1235 gzclose(f);
1236 }
1237 kputsn(s->s, s->l, &txt);
1238 kputc('\n', &txt);
1239 if (s->s[1] != '#') break;
1240 }
1241 if ( !txt.s )
1242 {
1243 fprintf(stderr,"[%s:%d %s] Could not read the header\n", __FILE__,__LINE__,__FUNCTION__);
1244 return NULL;
1245 }
1246 bcf_hdr_parse(h, txt.s);
1247
1248 // check tabix index, are all contigs listed in the header? add the missing ones
1249 tbx_t *idx = tbx_index_load(fp->fn);
1250 if ( idx )
1251 {
1252 int i, n, need_sync = 0;
1253 const char **names = tbx_seqnames(idx, &n);
1254 for (i=0; i<n; i++)
1255 {
1256 bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
1257 if ( hrec ) continue;
1258 hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
1259 hrec->key = strdup("contig");
1260 bcf_hrec_add_key(hrec, "ID", strlen("ID"));
1261 bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
1262 bcf_hdr_add_hrec(h, hrec);
1263 need_sync = 1;
1264 }
1265 free(names);
1266 tbx_destroy(idx);
1267 if ( need_sync )
1268 bcf_hdr_sync(h);
1269 }
1270 free(txt.s);
1271 return h;
1272 }
1273
1274 int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
1275 {
1276 int i, n;
1277 char **lines = hts_readlines(fname, &n);
1278 if ( !lines ) return 1;
1279 for (i=0; i<n-1; i++)
1280 {
1281 int k;
1282 bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
1283 if ( hrec ) bcf_hdr_add_hrec(hdr, hrec);
1284 free(lines[i]);
1285 }
1286 bcf_hdr_parse_sample_line(hdr,lines[n-1]);
1287 free(lines[n-1]);
1288 free(lines);
1289 bcf_hdr_sync(hdr);
1290 return 0;
1291 }
1292
1293 static void _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
1294 {
1295 if ( !hrec->value )
1296 {
1297 int j, nout = 0;
1298 ksprintf(str, "##%s=<", hrec->key);
1299 for (j=0; j<hrec->nkeys; j++)
1300 {
1301 // do not output IDX if output is VCF
1302 if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
1303 if ( nout ) kputc(',',str);
1304 ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]);
1305 nout++;
1306 }
1307 ksprintf(str,">\n");
1308 }
1309 else
1310 ksprintf(str,"##%s=%s\n", hrec->key,hrec->value);
1311 }
1312
1313 void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
1314 {
1315 _bcf_hrec_format(hrec,0,str);
1316 }
1317 char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
1318 {
1319 int i;
1320 kstring_t txt = {0,0,0};
1321 for (i=0; i<hdr->nhrec; i++)
1322 _bcf_hrec_format(hdr->hrec[i], is_bcf, &txt);
1323
1324 ksprintf(&txt,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO");
1325 if ( bcf_hdr_nsamples(hdr) )
1326 {
1327 ksprintf(&txt,"\tFORMAT");
1328 for (i=0; i<bcf_hdr_nsamples(hdr); i++)
1329 ksprintf(&txt,"\t%s", hdr->samples[i]);
1330 }
1331 ksprintf(&txt,"\n");
1332
1333 if ( len ) *len = txt.l;
1334 return txt.s;
1335 }
1336
1337 const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
1338 {
1339 vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
1340 int tid, m = kh_size(d);
1341 const char **names = (const char**) calloc(m,sizeof(const char*));
1342 khint_t k;
1343 for (k=kh_begin(d); k<kh_end(d); k++)
1344 {
1345 if ( !kh_exist(d,k) ) continue;
1346 tid = kh_val(d,k).id;
1347 assert( tid<m );
1348 names[tid] = kh_key(d,k);
1349 }
1350 // sanity check: there should be no gaps
1351 for (tid=0; tid<m; tid++)
1352 assert(names[tid]);
1353 *n = m;
1354 return names;
1355 }
1356
1357 int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
1358 {
1359 int hlen;
1360 char *htxt = bcf_hdr_fmt_text(h, 0, &hlen);
1361 while (hlen && htxt[hlen-1] == 0) --hlen; // kill trailing zeros
1362 int ret;
1363 if ( fp->format.compression!=no_compression )
1364 ret = bgzf_write(fp->fp.bgzf, htxt, hlen);
1365 else
1366 ret = hwrite(fp->fp.hfile, htxt, hlen);
1367 free(htxt);
1368 return ret<0 ? -1 : 0;
1369 }
1370
1371 /***********************
1372 *** Typed value I/O ***
1373 ***********************/
1374
1375 void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
1376 {
1377 int32_t max = INT32_MIN + 1, min = INT32_MAX;
1378 int i;
1379 if (n == 0) bcf_enc_size(s, 0, BCF_BT_NULL);
1380 else if (n == 1) bcf_enc_int1(s, a[0]);
1381 else {
1382 if (wsize <= 0) wsize = n;
1383 for (i = 0; i < n; ++i) {
1384 if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) continue;
1385 if (max < a[i]) max = a[i];
1386 if (min > a[i]) min = a[i];
1387 }
1388 if (max <= INT8_MAX && min > bcf_int8_vector_end) {
1389 bcf_enc_size(s, wsize, BCF_BT_INT8);
1390 for (i = 0; i < n; ++i)
1391 if ( a[i]==bcf_int32_vector_end ) kputc(bcf_int8_vector_end, s);
1392 else if ( a[i]==bcf_int32_missing ) kputc(bcf_int8_missing, s);
1393 else kputc(a[i], s);
1394 } else if (max <= INT16_MAX && min > bcf_int16_vector_end) {
1395 bcf_enc_size(s, wsize, BCF_BT_INT16);
1396 for (i = 0; i < n; ++i)
1397 {
1398 int16_t x;
1399 if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
1400 else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
1401 else x = a[i];
1402 kputsn((char*)&x, 2, s);
1403 }
1404 } else {
1405 bcf_enc_size(s, wsize, BCF_BT_INT32);
1406 for (i = 0; i < n; ++i) {
1407 int32_t x = a[i];
1408 kputsn((char*)&x, 4, s);
1409 }
1410 }
1411 }
1412 }
1413
1414 void bcf_enc_vfloat(kstring_t *s, int n, float *a)
1415 {
1416 bcf_enc_size(s, n, BCF_BT_FLOAT);
1417 kputsn((char*)a, n << 2, s);
1418 }
1419
1420 void bcf_enc_vchar(kstring_t *s, int l, const char *a)
1421 {
1422 bcf_enc_size(s, l, BCF_BT_CHAR);
1423 kputsn(a, l, s);
1424 }
1425
1426 void bcf_fmt_array(kstring_t *s, int n, int type, void *data)
1427 {
1428 int j = 0;
1429 if (n == 0) {
1430 kputc('.', s);
1431 return;
1432 }
1433 if (type == BCF_BT_CHAR)
1434 {
1435 char *p = (char*)data;
1436 for (j = 0; j < n && *p; ++j, ++p)
1437 {
1438 if ( *p==bcf_str_missing ) kputc('.', s);
1439 else kputc(*p, s);
1440 }
1441 }
1442 else
1443 {
1444 #define BRANCH(type_t, is_missing, is_vector_end, kprint) { \
1445 type_t *p = (type_t *) data; \
1446 for (j=0; j<n; j++) \
1447 { \
1448 if ( is_vector_end ) break; \
1449 if ( j ) kputc(',', s); \
1450 if ( is_missing ) kputc('.', s); \
1451 else kprint; \
1452 } \
1453 }
1454 switch (type) {
1455 case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, kputw(p[j], s)); break;
1456 case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, kputw(p[j], s)); break;
1457 case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, kputw(p[j], s)); break;
1458 case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), ksprintf(s, "%g", p[j])); break;
1459 default: fprintf(stderr,"todo: type %d\n", type); exit(1); break;
1460 }
1461 #undef BRANCH
1462 }
1463 }
1464
1465 uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
1466 {
1467 int x, type;
1468 x = bcf_dec_size(ptr, &ptr, &type);
1469 bcf_fmt_array(s, x, type, ptr);
1470 return ptr + (x << bcf_type_shift[type]);
1471 }
1472
1473 /********************
1474 *** VCF site I/O ***
1475 ********************/
1476
1477 typedef struct {
1478 int key, max_m, size, offset;
1479 uint32_t is_gt:1, max_g:15, max_l:16;
1480 uint32_t y;
1481 uint8_t *buf;
1482 } fmt_aux_t;
1483
1484 static inline void align_mem(kstring_t *s)
1485 {
1486 if (s->l&7) {
1487 uint64_t zero = 0;
1488 int l = ((s->l + 7)>>3<<3) - s->l;
1489 kputsn((char*)&zero, l, s);
1490 }
1491 }
1492
1493 // p,q is the start and the end of the FORMAT field
1494 int _vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q)
1495 {
1496 if ( !bcf_hdr_nsamples(h) ) return 0;
1497
1498 char *r, *t;
1499 int j, l, m, g;
1500 khint_t k;
1501 ks_tokaux_t aux1;
1502 vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
1503 kstring_t *mem = (kstring_t*)&h->mem;
1504 mem->l = 0;
1505
1506 // count the number of format fields
1507 for (r = p, v->n_fmt = 1; *r; ++r)
1508 if (*r == ':') ++v->n_fmt;
1509 char *end = s->s + s->l;
1510 if ( q>=end )
1511 {
1512 fprintf(stderr,"[%s:%d %s] Error: FORMAT column with no sample columns starting at %s:%d\n", __FILE__,__LINE__,__FUNCTION__,s->s,v->pos+1);
1513 return -1;
1514 }
1515
1516 fmt_aux_t *fmt = (fmt_aux_t*)alloca(v->n_fmt * sizeof(fmt_aux_t));
1517 // get format information from the dictionary
1518 for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
1519 *(char*)aux1.p = 0;
1520 k = kh_get(vdict, d, t);
1521 if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
1522 fprintf(stderr, "[W::%s] FORMAT '%s' is not defined in the header, assuming Type=String\n", __func__, t);
1523 kstring_t tmp = {0,0,0};
1524 int l;
1525 ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
1526 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
1527 free(tmp.s);
1528 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
1529 k = kh_get(vdict, d, t);
1530 v->errcode = BCF_ERR_TAG_UNDEF;
1531 }
1532 fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
1533 fmt[j].key = kh_val(d, k).id;
1534 fmt[j].is_gt = !strcmp(t, "GT");
1535 fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
1536 }
1537 // compute max
1538 int n_sample_ori = -1;
1539 r = q + 1; // r: position in the format string
1540 m = l = g = 1, v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles
1541 while ( r<end )
1542 {
1543 // can we skip some samples?
1544 if ( h->keep_samples )
1545 {
1546 n_sample_ori++;
1547 if ( !bit_array_test(h->keep_samples,n_sample_ori) )
1548 {
1549 while ( *r!='\t' && r<end ) r++;
1550 if ( *r=='\t' ) { *r = 0; r++; }
1551 continue;
1552 }
1553 }
1554
1555 // collect fmt stats: max vector size, length, number of alleles
1556 j = 0; // j-th format field
1557 for (;;)
1558 {
1559 if ( *r == '\t' ) *r = 0;
1560 if ( *r == ':' || !*r ) // end of field or end of sample
1561 {
1562 if (fmt[j].max_m < m) fmt[j].max_m = m;
1563 if (fmt[j].max_l < l - 1) fmt[j].max_l = l - 1;
1564 if (fmt[j].is_gt && fmt[j].max_g < g) fmt[j].max_g = g;
1565 l = 0, m = g = 1;
1566 if ( *r==':' )
1567 {
1568 j++;
1569 if ( j>=v->n_fmt )
1570 {
1571 fprintf(stderr,"Incorrect number of FORMAT fields at %s:%d\n", h->id[BCF_DT_CTG][v->rid].key,v->pos+1);
1572 exit(1);
1573 }
1574 }
1575 else break;
1576 }
1577 else if ( *r== ',' ) m++;
1578 else if ( fmt[j].is_gt && (*r == '|' || *r == '/') ) g++;
1579 if ( r>=end ) break;
1580 r++; l++;
1581 }
1582 v->n_sample++;
1583 if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
1584 r++;
1585 }
1586
1587 // allocate memory for arrays
1588 for (j = 0; j < v->n_fmt; ++j) {
1589 fmt_aux_t *f = &fmt[j];
1590 if ( !f->max_m ) f->max_m = 1; // omitted trailing format field
1591 if ((f->y>>4&0xf) == BCF_HT_STR) {
1592 f->size = f->is_gt? f->max_g << 2 : f->max_l;
1593 } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
1594 f->size = f->max_m << 2;
1595 } else
1596 {
1597 fprintf(stderr, "[E::%s] the format type %d currently not supported\n", __func__, f->y>>4&0xf);
1598 abort(); // I do not know how to do with Flag in the genotype fields
1599 }
1600 align_mem(mem);
1601 f->offset = mem->l;
1602 ks_resize(mem, mem->l + v->n_sample * f->size);
1603 mem->l += v->n_sample * f->size;
1604 }
1605 for (j = 0; j < v->n_fmt; ++j)
1606 fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
1607 // fill the sample fields; at beginning of the loop, t points to the first char of a format
1608 n_sample_ori = -1;
1609 t = q + 1; m = 0; // m: sample id
1610 while ( t<end )
1611 {
1612 // can we skip some samples?
1613 if ( h->keep_samples )
1614 {
1615 n_sample_ori++;
1616 if ( !bit_array_test(h->keep_samples,n_sample_ori) )
1617 {
1618 while ( *t && t<end ) t++;
1619 t++;
1620 continue;
1621 }
1622 }
1623 if ( m == bcf_hdr_nsamples(h) ) break;
1624
1625 j = 0; // j-th format field, m-th sample
1626 while ( *t )
1627 {
1628 fmt_aux_t *z = &fmt[j];
1629 if ((z->y>>4&0xf) == BCF_HT_STR) {
1630 if (z->is_gt) { // genotypes
1631 int32_t is_phased = 0, *x = (int32_t*)(z->buf + z->size * m);
1632 for (l = 0;; ++t) {
1633 if (*t == '.') ++t, x[l++] = is_phased;
1634 else x[l++] = (strtol(t, &t, 10) + 1) << 1 | is_phased;
1635 #if THOROUGH_SANITY_CHECKS
1636 assert( 0 ); // success of strtol,strtod not checked
1637 #endif
1638 is_phased = (*t == '|');
1639 if (*t == ':' || *t == 0) break;
1640 }
1641 if ( !l ) x[l++] = 0; // An empty field, insert missing value
1642 for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
1643 } else {
1644 char *x = (char*)z->buf + z->size * m;
1645 for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t;
1646 for (; l < z->size; ++l) x[l] = 0;
1647 }
1648 } else if ((z->y>>4&0xf) == BCF_HT_INT) {
1649 int32_t *x = (int32_t*)(z->buf + z->size * m);
1650 for (l = 0;; ++t) {
1651 if (*t == '.') x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
1652 else x[l++] = strtol(t, &t, 10);
1653 if (*t == ':' || *t == 0) break;
1654 }
1655 if ( !l ) x[l++] = bcf_int32_missing;
1656 for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
1657 } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
1658 float *x = (float*)(z->buf + z->size * m);
1659 for (l = 0;; ++t) {
1660 if (*t == '.' && !isdigit(t[1])) bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
1661 else x[l++] = strtod(t, &t);
1662 if (*t == ':' || *t == 0) break;
1663 }
1664 if ( !l ) bcf_float_set_missing(x[l++]); // An empty field, insert missing value
1665 for (; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
1666 } else abort();
1667 if (*t == 0) {
1668 for (++j; j < v->n_fmt; ++j) { // fill end-of-vector values
1669 z = &fmt[j];
1670 if ((z->y>>4&0xf) == BCF_HT_STR) {
1671 if (z->is_gt) {
1672 int32_t *x = (int32_t*)(z->buf + z->size * m);
1673 x[0] = bcf_int32_missing;
1674 for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
1675 } else {
1676 char *x = (char*)z->buf + z->size * m;
1677 if ( z->size ) x[0] = '.';
1678 for (l = 1; l < z->size; ++l) x[l] = 0;
1679 }
1680 } else if ((z->y>>4&0xf) == BCF_HT_INT) {
1681 int32_t *x = (int32_t*)(z->buf + z->size * m);
1682 x[0] = bcf_int32_missing;
1683 for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
1684 } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
1685 float *x = (float*)(z->buf + z->size * m);
1686 bcf_float_set_missing(x[0]);
1687 for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
1688 }
1689 }
1690 break;
1691 }
1692 else
1693 {
1694 if (*t == ':') ++j;
1695 t++;
1696 }
1697 }
1698 m++; t++;
1699 }
1700
1701 // write individual genotype information
1702 kstring_t *str = &v->indiv;
1703 int i;
1704 if (v->n_sample > 0) {
1705 for (i = 0; i < v->n_fmt; ++i) {
1706 fmt_aux_t *z = &fmt[i];
1707 bcf_enc_int1(str, z->key);
1708 if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
1709 bcf_enc_size(str, z->size, BCF_BT_CHAR);
1710 kputsn((char*)z->buf, z->size * v->n_sample, str);
1711 } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
1712 bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
1713 } else {
1714 bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
1715 kputsn((char*)z->buf, z->size * v->n_sample, str);
1716 }
1717 }
1718 }
1719
1720 if ( v->n_sample!=bcf_hdr_nsamples(h) )
1721 {
1722 fprintf(stderr,"[%s:%d %s] Number of columns at %s:%d does not match the number of samples (%d vs %d).\n",
1723 __FILE__,__LINE__,__FUNCTION__,bcf_seqname(h,v),v->pos+1, v->n_sample,bcf_hdr_nsamples(h));
1724 v->errcode |= BCF_ERR_NCOLS;
1725 return -1;
1726 }
1727
1728 return 0;
1729 }
1730
1731 int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
1732 {
1733 int i = 0;
1734 char *p, *q, *r, *t;
1735 kstring_t *str;
1736 khint_t k;
1737 ks_tokaux_t aux;
1738
1739 bcf_clear1(v);
1740 str = &v->shared;
1741 memset(&aux, 0, sizeof(ks_tokaux_t));
1742 for (p = kstrtok(s->s, "\t", &aux), i = 0; p; p = kstrtok(0, 0, &aux), ++i) {
1743 q = (char*)aux.p;
1744 *q = 0;
1745 if (i == 0) { // CHROM
1746 vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
1747 k = kh_get(vdict, d, p);
1748 if (k == kh_end(d))
1749 {
1750 // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
1751 // been already printed, but will enable tools like vcfcheck to proceed.
1752 fprintf(stderr, "[W::%s] contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)\n", __func__, p);
1753 kstring_t tmp = {0,0,0};
1754 int l;
1755 ksprintf(&tmp, "##contig=<ID=%s>", p);
1756 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
1757 free(tmp.s);
1758 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
1759 k = kh_get(vdict, d, p);
1760 v->errcode = BCF_ERR_CTG_UNDEF;
1761 }
1762 v->rid = kh_val(d, k).id;
1763 } else if (i == 1) { // POS
1764 v->pos = atoi(p) - 1;
1765 } else if (i == 2) { // ID
1766 if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p);
1767 else bcf_enc_size(str, 0, BCF_BT_CHAR);
1768 } else if (i == 3) { // REF
1769 bcf_enc_vchar(str, q - p, p);
1770 v->n_allele = 1, v->rlen = q - p;
1771 } else if (i == 4) { // ALT
1772 if (strcmp(p, ".")) {
1773 for (r = t = p;; ++r) {
1774 if (*r == ',' || *r == 0) {
1775 bcf_enc_vchar(str, r - t, t);
1776 t = r + 1;
1777 ++v->n_allele;
1778 }
1779 if (r == q) break;
1780 }
1781 }
1782 } else if (i == 5) { // QUAL
1783 if (strcmp(p, ".")) v->qual = atof(p);
1784 else memcpy(&v->qual, &bcf_float_missing, 4);
1785 if ( v->max_unpack && !(v->max_unpack>>1) ) return 0; // BCF_UN_STR
1786 } else if (i == 6) { // FILTER
1787 if (strcmp(p, ".")) {
1788 int32_t *a;
1789 int n_flt = 1, i;
1790 ks_tokaux_t aux1;
1791 vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
1792 // count the number of filters
1793 if (*(q-1) == ';') *(q-1) = 0;
1794 for (r = p; *r; ++r)
1795 if (*r == ';') ++n_flt;
1796 a = (int32_t*)alloca(n_flt * sizeof(int32_t));
1797 // add filters
1798 for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
1799 *(char*)aux1.p = 0;
1800 k = kh_get(vdict, d, t);
1801 if (k == kh_end(d))
1802 {
1803 // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
1804 // been already printed, but will enable tools like vcfcheck to proceed.
1805 fprintf(stderr, "[W::%s] FILTER '%s' is not defined in the header\n", __func__, t);
1806 kstring_t tmp = {0,0,0};
1807 int l;
1808 ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
1809 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
1810 free(tmp.s);
1811 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
1812 k = kh_get(vdict, d, t);
1813 v->errcode = BCF_ERR_TAG_UNDEF;
1814 }
1815 a[i++] = kh_val(d, k).id;
1816 }
1817 n_flt = i;
1818 bcf_enc_vint(str, n_flt, a, -1);
1819 } else bcf_enc_vint(str, 0, 0, -1);
1820 if ( v->max_unpack && !(v->max_unpack>>2) ) return 0; // BCF_UN_FLT
1821 } else if (i == 7) { // INFO
1822 char *key;
1823 vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
1824 v->n_info = 0;
1825 if (strcmp(p, ".")) {
1826 if (*(q-1) == ';') *(q-1) = 0;
1827 for (r = key = p;; ++r) {
1828 int c;
1829 char *val, *end;
1830 if (*r != ';' && *r != '=' && *r != 0) continue;
1831 val = end = 0;
1832 c = *r; *r = 0;
1833 if (c == '=') {
1834 val = r + 1;
1835 for (end = val; *end != ';' && *end != 0; ++end);
1836 c = *end; *end = 0;
1837 } else end = r;
1838 if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; } // faulty VCF, ";;" in the INFO
1839 k = kh_get(vdict, d, key);
1840 if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
1841 {
1842 fprintf(stderr, "[W::%s] INFO '%s' is not defined in the header, assuming Type=String\n", __func__, key);
1843 kstring_t tmp = {0,0,0};
1844 int l;
1845 ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
1846 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
1847 free(tmp.s);
1848 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
1849 k = kh_get(vdict, d, key);
1850 v->errcode = BCF_ERR_TAG_UNDEF;
1851 }
1852 uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
1853 ++v->n_info;
1854 bcf_enc_int1(str, kh_val(d, k).id);
1855 if (val == 0) {
1856 bcf_enc_size(str, 0, BCF_BT_NULL);
1857 } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
1858 bcf_enc_vchar(str, end - val, val);
1859 } else { // int/float value/array
1860 int i, n_val;
1861 char *t, *te;
1862 for (t = val, n_val = 1; *t; ++t) // count the number of values
1863 if (*t == ',') ++n_val;
1864 if ((y>>4&0xf) == BCF_HT_INT) {
1865 int32_t *z;
1866 z = (int32_t*)alloca(n_val * sizeof(int32_t));
1867 for (i = 0, t = val; i < n_val; ++i, ++t)
1868 {
1869 z[i] = strtol(t, &te, 10);
1870 if ( te==t ) // conversion failed
1871 {
1872 z[i] = bcf_int32_missing;
1873 while ( *te && *te!=',' ) te++;
1874 }
1875 t = te;
1876 }
1877 bcf_enc_vint(str, n_val, z, -1);
1878 if (strcmp(key, "END") == 0) v->rlen = z[0] - v->pos;
1879 } else if ((y>>4&0xf) == BCF_HT_REAL) {
1880 float *z;
1881 z = (float*)alloca(n_val * sizeof(float));
1882 for (i = 0, t = val; i < n_val; ++i, ++t)
1883 {
1884 z[i] = strtod(t, &te);
1885 if ( te==t ) // conversion failed
1886 {
1887 bcf_float_set_missing(z[i]);
1888 while ( *te && *te!=',' ) te++;
1889 }
1890 t = te;
1891 }
1892 bcf_enc_vfloat(str, n_val, z);
1893 }
1894 }
1895 if (c == 0) break;
1896 r = end;
1897 key = r + 1;
1898 }
1899 }
1900 if ( v->max_unpack && !(v->max_unpack>>3) ) return 0;
1901 } else if (i == 8) // FORMAT
1902 return _vcf_parse_format(s, h, v, p, q);
1903 }
1904 return 0;
1905 }
1906
1907 int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
1908 {
1909 int ret;
1910 ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
1911 if (ret < 0) return -1;
1912 return vcf_parse1(&fp->line, h, v);
1913 }
1914
1915 static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
1916 {
1917 uint8_t *ptr_start = ptr;
1918 fmt->id = bcf_dec_typed_int1(ptr, &ptr);
1919 fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
1920 fmt->size = fmt->n << bcf_type_shift[fmt->type];
1921 fmt->p = ptr;
1922 fmt->p_off = ptr - ptr_start;
1923 fmt->p_free = 0;
1924 ptr += n_sample * fmt->size;
1925 fmt->p_len = ptr - fmt->p;
1926 return ptr;
1927 }
1928
1929 static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
1930 {
1931 uint8_t *ptr_start = ptr;
1932 info->key = bcf_dec_typed_int1(ptr, &ptr);
1933 info->len = bcf_dec_size(ptr, &ptr, &info->type);
1934 info->vptr = ptr;
1935 info->vptr_off = ptr - ptr_start;
1936 info->vptr_free = 0;
1937 info->v1.i = 0;
1938 if (info->len == 1) {
1939 if (info->type == BCF_BT_INT8 || info->type == BCF_BT_CHAR) info->v1.i = *(int8_t*)ptr;
1940 else if (info->type == BCF_BT_INT32) info->v1.i = *(int32_t*)ptr;
1941 else if (info->type == BCF_BT_FLOAT) info->v1.f = *(float*)ptr;
1942 else if (info->type == BCF_BT_INT16) info->v1.i = *(int16_t*)ptr;
1943 }
1944 ptr += info->len << bcf_type_shift[info->type];
1945 info->vptr_len = ptr - info->vptr;
1946 return ptr;
1947 }
1948
1949 int bcf_unpack(bcf1_t *b, int which)
1950 {
1951 if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
1952 uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
1953 int *offset, i;
1954 bcf_dec_t *d = &b->d;
1955 if (which & BCF_UN_FLT) which |= BCF_UN_STR;
1956 if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
1957 if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
1958 {
1959 kstring_t tmp;
1960
1961 // ID
1962 tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
1963 ptr_ori = ptr;
1964 ptr = bcf_fmt_sized_array(&tmp, ptr);
1965 b->unpack_size[0] = ptr - ptr_ori;
1966 kputc('\0', &tmp);
1967 d->id = tmp.s; d->m_id = tmp.m;
1968
1969 // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
1970 tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
1971 offset = (int*)alloca(b->n_allele * sizeof(int));
1972 ptr_ori = ptr;
1973 for (i = 0; i < b->n_allele; ++i) {
1974 offset[i] = tmp.l;
1975 ptr = bcf_fmt_sized_array(&tmp, ptr);
1976 kputc('\0', &tmp);
1977 }
1978 b->unpack_size[1] = ptr - ptr_ori;
1979 d->als = tmp.s; d->m_als = tmp.m;
1980
1981 hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
1982 for (i = 0; i < b->n_allele; ++i)
1983 d->allele[i] = d->als + offset[i];
1984 b->unpacked |= BCF_UN_STR;
1985 }
1986 if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
1987 ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
1988 ptr_ori = ptr;
1989 if (*ptr>>4) {
1990 int type;
1991 d->n_flt = bcf_dec_size(ptr, &ptr, &type);
1992 hts_expand(int, d->n_flt, d->m_flt, d->flt);
1993 for (i = 0; i < d->n_flt; ++i)
1994 d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
1995 } else ++ptr, d->n_flt = 0;
1996 b->unpack_size[2] = ptr - ptr_ori;
1997 b->unpacked |= BCF_UN_FLT;
1998 }
1999 if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
2000 ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
2001 hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
2002 for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
2003 for (i = 0; i < b->n_info; ++i)
2004 ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
2005 b->unpacked |= BCF_UN_INFO;
2006 }
2007 if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
2008 ptr = (uint8_t*)b->indiv.s;
2009 hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
2010 for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
2011 for (i = 0; i < b->n_fmt; ++i)
2012 ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
2013 b->unpacked |= BCF_UN_FMT;
2014 }
2015 return 0;
2016 }
2017
2018 int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
2019 {
2020 int i;
2021 bcf_unpack((bcf1_t*)v, BCF_UN_ALL);
2022 kputs(h->id[BCF_DT_CTG][v->rid].key, s); // CHROM
2023 kputc('\t', s); kputw(v->pos + 1, s); // POS
2024 kputc('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
2025 kputc('\t', s); // REF
2026 if (v->n_allele > 0) kputs(v->d.allele[0], s);
2027 else kputc('.', s);
2028 kputc('\t', s); // ALT
2029 if (v->n_allele > 1) {
2030 for (i = 1; i < v->n_allele; ++i) {
2031 if (i > 1) kputc(',', s);
2032 kputs(v->d.allele[i], s);
2033 }
2034 } else kputc('.', s);
2035 kputc('\t', s); // QUAL
2036 if ( bcf_float_is_missing(v->qual) ) kputc('.', s); // QUAL
2037 else ksprintf(s, "%g", v->qual);
2038 kputc('\t', s); // FILTER
2039 if (v->d.n_flt) {
2040 for (i = 0; i < v->d.n_flt; ++i) {
2041 if (i) kputc(';', s);
2042 kputs(h->id[BCF_DT_ID][v->d.flt[i]].key, s);
2043 }
2044 } else kputc('.', s);
2045 kputc('\t', s); // INFO
2046 if (v->n_info) {
2047 int first = 1;
2048 for (i = 0; i < v->n_info; ++i) {
2049 bcf_info_t *z = &v->d.info[i];
2050 if ( !z->vptr ) continue;
2051 if ( !first ) kputc(';', s); first = 0;
2052 kputs(h->id[BCF_DT_ID][z->key].key, s);
2053 if (z->len <= 0) continue;
2054 kputc('=', s);
2055 if (z->len == 1)
2056 {
2057 switch (z->type)
2058 {
2059 case BCF_BT_INT8: if ( z->v1.i==bcf_int8_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
2060 case BCF_BT_INT16: if ( z->v1.i==bcf_int16_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
2061 case BCF_BT_INT32: if ( z->v1.i==bcf_int32_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
2062 case BCF_BT_FLOAT: if ( bcf_float_is_missing(z->v1.f) ) kputc('.', s); else ksprintf(s, "%g", z->v1.f); break;
2063 case BCF_BT_CHAR: kputc(z->v1.i, s); break;
2064 default: fprintf(stderr,"todo: type %d\n", z->type); exit(1); break;
2065 }
2066 }
2067 else bcf_fmt_array(s, z->len, z->type, z->vptr);
2068 }
2069 if ( first ) kputc('.', s);
2070 } else kputc('.', s);
2071 // FORMAT and individual information
2072 if (v->n_sample)
2073 {
2074 int i,j;
2075 if ( v->n_fmt)
2076 {
2077 int gt_i = -1;
2078 bcf_fmt_t *fmt = v->d.fmt;
2079 int first = 1;
2080 for (i = 0; i < (int)v->n_fmt; ++i) {
2081 if ( !fmt[i].p ) continue;
2082 kputc(!first ? ':' : '\t', s); first = 0;
2083 if ( fmt[i].id<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
2084 {
2085 fprintf(stderr, "[E::%s] invalid BCF, the FORMAT tag id=%d not present in the header.\n", __func__, fmt[i].id);
2086 abort();
2087 }
2088 kputs(h->id[BCF_DT_ID][fmt[i].id].key, s);
2089 if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i;
2090 }
2091 if ( first ) kputs("\t.", s);
2092 for (j = 0; j < v->n_sample; ++j) {
2093 kputc('\t', s);
2094 first = 1;
2095 for (i = 0; i < (int)v->n_fmt; ++i) {
2096 bcf_fmt_t *f = &fmt[i];
2097 if ( !f->p ) continue;
2098 if (!first) kputc(':', s); first = 0;
2099 if (gt_i == i)
2100 bcf_format_gt(f,j,s);
2101 else
2102 bcf_fmt_array(s, f->n, f->type, f->p + j * f->size);
2103 }
2104 if ( first ) kputc('.', s);
2105 }
2106 }
2107 else
2108 for (j=0; j<=v->n_sample; j++)
2109 kputs("\t.", s);
2110 }
2111 kputc('\n', s);
2112 return 0;
2113 }
2114
2115 int vcf_write_line(htsFile *fp, kstring_t *line)
2116 {
2117 int ret;
2118 if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
2119 if ( fp->format.compression!=no_compression )
2120 ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
2121 else
2122 ret = hwrite(fp->fp.hfile, line->s, line->l);
2123 return ret==line->l ? 0 : -1;
2124 }
2125
2126 int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
2127 {
2128 int ret;
2129 fp->line.l = 0;
2130 vcf_format1(h, v, &fp->line);
2131 if ( fp->format.compression!=no_compression )
2132 ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
2133 else
2134 ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
2135 return ret==fp->line.l ? 0 : -1;
2136 }
2137
2138 /************************
2139 * Data access routines *
2140 ************************/
2141
2142 int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
2143 {
2144 khint_t k;
2145 vdict_t *d = (vdict_t*)h->dict[which];
2146 k = kh_get(vdict, d, id);
2147 return k == kh_end(d)? -1 : kh_val(d, k).id;
2148 }
2149
2150
2151 /********************
2152 *** BCF indexing ***
2153 ********************/
2154
2155 hts_idx_t *bcf_index(htsFile *fp, int min_shift)
2156 {
2157 int n_lvls, i;
2158 bcf1_t *b;
2159 hts_idx_t *idx;
2160 bcf_hdr_t *h;
2161 int64_t max_len = 0, s;
2162 h = bcf_hdr_read(fp);
2163 if ( !h ) return NULL;
2164 int nids = 0;
2165 for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
2166 {
2167 if ( !h->id[BCF_DT_CTG][i].val ) continue;
2168 if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] ) max_len = h->id[BCF_DT_CTG][i].val->info[0];
2169 nids++;
2170 }
2171 if ( !max_len ) max_len = ((int64_t)1<<31) - 1; // In case contig line is broken.
2172 max_len += 256;
2173 for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
2174 idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
2175 b = bcf_init1();
2176 while (bcf_read1(fp,h, b) >= 0) {
2177 int ret;
2178 ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
2179 if (ret < 0)
2180 {
2181 bcf_destroy1(b);
2182 hts_idx_destroy(idx);
2183 return NULL;
2184 }
2185 }
2186 hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
2187 bcf_destroy1(b);
2188 bcf_hdr_destroy(h);
2189 return idx;
2190 }
2191
2192 int bcf_index_build(const char *fn, int min_shift)
2193 {
2194 htsFile *fp;
2195 hts_idx_t *idx;
2196 if ((fp = hts_open(fn, "rb")) == 0) return -1;
2197 if ( fp->format.compression!=bgzf ) { hts_close(fp); return -1; }
2198 idx = bcf_index(fp, min_shift);
2199 hts_close(fp);
2200 if ( !idx ) return -1;
2201 hts_idx_save(idx, fn, HTS_FMT_CSI);
2202 hts_idx_destroy(idx);
2203 return 0;
2204 }
2205
2206 /*****************
2207 *** Utilities ***
2208 *****************/
2209
2210 int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
2211 {
2212 int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0;
2213 for (i=0; i<src->nhrec; i++)
2214 {
2215 if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
2216 {
2217 int j;
2218 for (j=0; j<ndst_ori; j++)
2219 {
2220 if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
2221
2222 // Checking only the key part of generic lines, otherwise
2223 // the VCFs are too verbose. Should we perhaps add a flag
2224 // to bcf_hdr_combine() and make this optional?
2225 if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
2226 }
2227 if ( j>=ndst_ori )
2228 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
2229 }
2230 else if ( src->hrec[i]->type==BCF_HL_STR )
2231 {
2232 // NB: we are ignoring fields without ID
2233 int j = bcf_hrec_find_key(src->hrec[i],"ID");
2234 if ( j>=0 )
2235 {
2236 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
2237 if ( !rec )
2238 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
2239 }
2240 }
2241 else
2242 {
2243 int j = bcf_hrec_find_key(src->hrec[i],"ID");
2244 assert( j>=0 ); // this should always be true for valid VCFs
2245
2246 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
2247 if ( !rec )
2248 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
2249 else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
2250 {
2251 // Check that both records are of the same type. The bcf_hdr_id2length
2252 // macro cannot be used here because dst header is not synced yet.
2253 vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
2254 vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
2255 khint_t k_src = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
2256 khint_t k_dst = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
2257 if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
2258 {
2259 fprintf(stderr,"Warning: trying to combine \"%s\" tag definitions of different lengths\n", src->hrec[i]->vals[0]);
2260 ret |= 1;
2261 }
2262 if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
2263 {
2264 fprintf(stderr,"Warning: trying to combine \"%s\" tag definitions of different types\n", src->hrec[i]->vals[0]);
2265 ret |= 1;
2266 }
2267 }
2268 }
2269 }
2270 if ( need_sync ) bcf_hdr_sync(dst);
2271 return ret;
2272 }
2273 int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
2274 {
2275 int i;
2276 if ( line->errcode )
2277 {
2278 fprintf(stderr,"[%s:%d %s] Unchecked error (%d), exiting.\n", __FILE__,__LINE__,__FUNCTION__,line->errcode);
2279 exit(1);
2280 }
2281 if ( src_hdr->ntransl==-1 ) return 0; // no need to translate, all tags have the same id
2282 if ( !src_hdr->ntransl ) // called for the first time, see what needs translating
2283 {
2284 int dict;
2285 for (dict=0; dict<2; dict++) // BCF_DT_ID and BCF_DT_CTG
2286 {
2287 src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
2288 for (i=0; i<src_hdr->n[dict]; i++)
2289 {
2290 if ( !src_hdr->id[dict][i].key || !dst_hdr->id[dict][i].key ) // gap left after removed BCF header lines
2291 src_hdr->transl[dict][i] = -1;
2292 else if ( i>=dst_hdr->n[dict] || strcmp(src_hdr->id[dict][i].key,dst_hdr->id[dict][i].key) )
2293 {
2294 src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
2295 src_hdr->ntransl++;
2296 }
2297 else
2298 src_hdr->transl[dict][i] = -1;
2299 }
2300 }
2301 if ( !src_hdr->ntransl )
2302 {
2303 free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
2304 free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
2305 src_hdr->ntransl = -1;
2306 }
2307 if ( src_hdr->ntransl==-1 ) return 0;
2308 }
2309 bcf_unpack(line,BCF_UN_ALL);
2310
2311 // CHROM
2312 if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
2313
2314 // FILTER
2315 for (i=0; i<line->d.n_flt; i++)
2316 {
2317 int src_id = line->d.flt[i];
2318 if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
2319 line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
2320 line->d.shared_dirty |= BCF1_DIRTY_FLT;
2321 }
2322
2323 // INFO
2324 for (i=0; i<line->n_info; i++)
2325 {
2326 int src_id = line->d.info[i].key;
2327 int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
2328 if ( dst_id<0 ) continue;
2329 int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
2330 int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
2331 if ( src_size==dst_size ) // can overwrite
2332 {
2333 line->d.info[i].key = dst_id;
2334 uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
2335 if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
2336 else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
2337 else { *(uint32_t*)vptr = (uint32_t)dst_id; }
2338 }
2339 else // must realloc
2340 {
2341 bcf_info_t *info = &line->d.info[i];
2342 assert( !info->vptr_free );
2343 kstring_t str = {0,0,0};
2344 bcf_enc_int1(&str, dst_id);
2345 bcf_enc_size(&str, info->len,info->type);
2346 info->vptr_off = str.l;
2347 kputsn((char*)info->vptr, info->vptr_len, &str);
2348 info->vptr = (uint8_t*)str.s + info->vptr_off;
2349 info->vptr_free = 1;
2350 info->key = dst_id;
2351 line->d.shared_dirty |= BCF1_DIRTY_INF;
2352 }
2353 }
2354
2355 // FORMAT
2356 for (i=0; i<line->n_fmt; i++)
2357 {
2358 int src_id = line->d.fmt[i].id;
2359 int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
2360 if ( dst_id<0 ) continue;
2361 int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
2362 int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
2363 if ( src_size==dst_size ) // can overwrite
2364 {
2365 line->d.fmt[i].id = dst_id;
2366 uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off; // pointer to the vector size (4bits) and BT type (4bits)
2367 if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
2368 else if ( dst_size==BCF_BT_INT16 ) { uint8_t *x = (uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; }
2369 else { uint8_t *x = (uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; p[3] = x[2]; p[4] = x[3]; }
2370 }
2371 else // must realloc
2372 {
2373 bcf_fmt_t *fmt = &line->d.fmt[i];
2374 assert( !fmt->p_free );
2375 kstring_t str = {0,0,0};
2376 bcf_enc_int1(&str, dst_id);
2377 bcf_enc_size(&str, fmt->n, fmt->type);
2378 fmt->p_off = str.l;
2379 kputsn((char*)fmt->p, fmt->p_len, &str);
2380 fmt->p = (uint8_t*)str.s + fmt->p_off;
2381 fmt->p_free = 1;
2382 fmt->id = dst_id;
2383 line->d.indiv_dirty = 1;
2384 }
2385 }
2386 return 0;
2387 }
2388
2389 bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
2390 {
2391 bcf_hdr_t *hout = bcf_hdr_init("r");
2392 char *htxt = bcf_hdr_fmt_text(hdr, 1, NULL);
2393 bcf_hdr_parse(hout, htxt);
2394 free(htxt);
2395 return hout;
2396 }
2397
2398 bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
2399 {
2400 int hlen;
2401 void *names_hash = khash_str2int_init();
2402 char *htxt = bcf_hdr_fmt_text(h0, 1, &hlen);
2403 kstring_t str;
2404 bcf_hdr_t *h;
2405 str.l = str.m = 0; str.s = 0;
2406 h = bcf_hdr_init("w");
2407 bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
2408 int j;
2409 for (j=0; j<n; j++) imap[j] = -1;
2410 if ( bcf_hdr_nsamples(h0) > 0) {
2411 char *p;
2412 int i = 0, end = n? 8 : 7;
2413 while ((p = strstr(htxt, "#CHROM\t")) != 0)
2414 if (p > htxt && *(p-1) == '\n') break;
2415 while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
2416 if (i != end) {
2417 free(h); free(str.s);
2418 return 0; // malformated header
2419 }
2420 kputsn(htxt, p - htxt, &str);
2421 for (i = 0; i < n; ++i) {
2422 if ( khash_str2int_has_key(names_hash,samples[i]) )
2423 {
2424 fprintf(stderr,"[E::bcf_hdr_subset] Duplicate sample name \"%s\".\n", samples[i]);
2425 free(str.s);
2426 free(htxt);
2427 khash_str2int_destroy(names_hash);
2428 bcf_hdr_destroy(h);
2429 return NULL;
2430 }
2431 imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
2432 if (imap[i] < 0) continue;
2433 kputc('\t', &str);
2434 kputs(samples[i], &str);
2435 khash_str2int_inc(names_hash,samples[i]);
2436 }
2437 } else kputsn(htxt, hlen, &str);
2438 while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
2439 kputc('\n',&str);
2440 bcf_hdr_parse(h, str.s);
2441 free(str.s);
2442 free(htxt);
2443 khash_str2int_destroy(names_hash);
2444 return h;
2445 }
2446
2447 int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
2448 {
2449 if ( samples && !strcmp("-",samples) ) return 0; // keep all samples
2450
2451 hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
2452 if ( !samples ) { bcf_hdr_nsamples(hdr) = 0; return 0; } // exclude all samples
2453
2454 int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
2455 hdr->keep_samples = (uint8_t*) calloc(narr,1);
2456 if ( samples[0]=='^' )
2457 for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
2458
2459 int idx, n, ret = 0;
2460 char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
2461 if ( !smpls ) return -1;
2462 for (i=0; i<n; i++)
2463 {
2464 idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
2465 if ( idx<0 )
2466 {
2467 if ( !ret ) ret = i+1;
2468 continue;
2469 }
2470 assert( idx<bcf_hdr_nsamples(hdr) );
2471 if ( samples[0]=='^' )
2472 bit_array_clear(hdr->keep_samples, idx);
2473 else
2474 bit_array_set(hdr->keep_samples, idx);
2475 }
2476 for (i=0; i<n; i++) free(smpls[i]);
2477 free(smpls);
2478
2479 bcf_hdr_nsamples(hdr) = 0;
2480 for (i=0; i<hdr->nsamples_ori; i++)
2481 if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
2482 if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
2483 else
2484 {
2485 char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
2486 idx = 0;
2487 for (i=0; i<hdr->nsamples_ori; i++)
2488 if ( bit_array_test(hdr->keep_samples,i) ) samples[idx++] = strdup(hdr->samples[i]);
2489 free(hdr->samples);
2490 hdr->samples = samples;
2491
2492 // delete original samples from the dictionary
2493 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
2494 int k;
2495 for (k = kh_begin(d); k != kh_end(d); ++k)
2496 if (kh_exist(d, k)) free((char*)kh_key(d, k));
2497 kh_destroy(vdict, d);
2498
2499 // add the subset back
2500 hdr->dict[BCF_DT_SAMPLE] = d = kh_init(vdict);
2501 for (i=0; i<bcf_hdr_nsamples(hdr); i++)
2502 {
2503 int ignore, k = kh_put(vdict, d, hdr->samples[i], &ignore);
2504 kh_val(d, k) = bcf_idinfo_def;
2505 kh_val(d, k).id = kh_size(d) - 1;
2506 }
2507 bcf_hdr_sync(hdr);
2508 }
2509
2510 return ret;
2511 }
2512
2513 int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
2514 {
2515 kstring_t ind;
2516 ind.s = 0; ind.l = ind.m = 0;
2517 if (n) {
2518 bcf_fmt_t *fmt;
2519 int i, j;
2520 fmt = (bcf_fmt_t*)alloca(v->n_fmt * sizeof(bcf_fmt_t));
2521 uint8_t *ptr = (uint8_t*)v->indiv.s;
2522 for (i = 0; i < v->n_fmt; ++i)
2523 ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
2524 for (i = 0; i < (int)v->n_fmt; ++i) {
2525 bcf_fmt_t *f = &fmt[i];
2526 bcf_enc_int1(&ind, f->id);
2527 bcf_enc_size(&ind, f->n, f->type);
2528 for (j = 0; j < n; ++j)
2529 if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
2530 }
2531 for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
2532 v->n_sample = i;
2533 } else v->n_sample = 0;
2534 if ( !v->n_sample ) v->n_fmt = 0;
2535 free(v->indiv.s);
2536 v->indiv = ind;
2537 v->unpacked &= ~BCF_UN_FMT; // only BCF is ready for output, VCF will need to unpack again
2538 return 0;
2539 }
2540
2541 int bcf_is_snp(bcf1_t *v)
2542 {
2543 int i;
2544 bcf_unpack(v, BCF_UN_STR);
2545 for (i = 0; i < v->n_allele; ++i)
2546 {
2547 if ( v->d.allele[i][1]==0 ) continue;
2548
2549 // mpileup's <X> allele, see also below. This is not completely satisfactory,
2550 // a general library is here narrowly tailored to fit samtools.
2551 if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue;
2552
2553 break;
2554 }
2555 return i == v->n_allele;
2556 }
2557
2558 static void bcf_set_variant_type(const char *ref, const char *alt, variant_t *var)
2559 {
2560 // The most frequent case
2561 if ( !ref[1] && !alt[1] )
2562 {
2563 if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
2564 if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant
2565 var->n = 1; var->type = VCF_SNP; return;
2566 }
2567 if ( alt[0]=='<' )
2568 {
2569 if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant
2570 var->type = VCF_OTHER;
2571 return;
2572 }
2573
2574 const char *r = ref, *a = alt;
2575 while (*r && *a && *r==*a ) { r++; a++; }
2576
2577 if ( *a && !*r )
2578 {
2579 while ( *a ) a++;
2580 var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
2581 }
2582 else if ( *r && !*a )
2583 {
2584 while ( *r ) r++;
2585 var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
2586 }
2587 else if ( !*r && !*a )
2588 {
2589 var->n = 0; var->type = VCF_REF; return;
2590 }
2591
2592 const char *re = r, *ae = a;
2593 while ( re[1] ) re++;
2594 while ( ae[1] ) ae++;
2595 while ( *re==*ae && re>r && ae>a ) { re--; ae--; }
2596 if ( ae==a )
2597 {
2598 if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
2599 var->n = -(re-r);
2600 if ( *re==*ae ) { var->type = VCF_INDEL; return; }
2601 var->type = VCF_OTHER; return;
2602 }
2603 else if ( re==r )
2604 {
2605 var->n = ae-a;
2606 if ( *re==*ae ) { var->type = VCF_INDEL; return; }
2607 var->type = VCF_OTHER; return;
2608 }
2609
2610 var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
2611 var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
2612
2613 // should do also complex events, SVs, etc...
2614 }
2615
2616 static void bcf_set_variant_types(bcf1_t *b)
2617 {
2618 if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
2619 bcf_dec_t *d = &b->d;
2620 if ( d->n_var < b->n_allele )
2621 {
2622 d->var = (variant_t *) realloc(d->var, sizeof(variant_t)*b->n_allele);
2623 d->n_var = b->n_allele;
2624 }
2625 int i;
2626 b->d.var_type = 0;
2627 for (i=1; i<b->n_allele; i++)
2628 {
2629 bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
2630 b->d.var_type |= d->var[i].type;
2631 //fprintf(stderr,"[set_variant_type] %d %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
2632 }
2633 }
2634
2635 int bcf_get_variant_types(bcf1_t *rec)
2636 {
2637 if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
2638 return rec->d.var_type;
2639 }
2640 int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
2641 {
2642 if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
2643 return rec->d.var[ith_allele].type;
2644 }
2645
2646 int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
2647 {
2648 // Is the field already present?
2649 int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
2650 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1; // No such INFO field in the header
2651 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
2652
2653 for (i=0; i<line->n_info; i++)
2654 if ( inf_id==line->d.info[i].key ) break;
2655 bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
2656
2657 if ( !n || (type==BCF_HT_STR && !values) )
2658 {
2659 if ( inf )
2660 {
2661 // Mark the tag for removal, free existing memory if necessary
2662 if ( inf->vptr_free )
2663 {
2664 free(inf->vptr - inf->vptr_off);
2665 inf->vptr_free = 0;
2666 }
2667 line->d.shared_dirty |= BCF1_DIRTY_INF;
2668 inf->vptr = NULL;
2669 }
2670 return 0;
2671 }
2672
2673 // Encode the values and determine the size required to accommodate the values
2674 kstring_t str = {0,0,0};
2675 bcf_enc_int1(&str, inf_id);
2676 if ( type==BCF_HT_INT )
2677 bcf_enc_vint(&str, n, (int32_t*)values, -1);
2678 else if ( type==BCF_HT_REAL )
2679 bcf_enc_vfloat(&str, n, (float*)values);
2680 else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
2681 {
2682 if ( values==NULL )
2683 bcf_enc_size(&str, 0, BCF_BT_NULL);
2684 else
2685 bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
2686 }
2687 else
2688 {
2689 fprintf(stderr, "[E::%s] the type %d not implemented yet\n", __func__, type);
2690 abort();
2691 }
2692
2693 // Is the INFO tag already present
2694 if ( inf )
2695 {
2696 // Is it big enough to accommodate new block?
2697 if ( str.l <= inf->vptr_len + inf->vptr_off )
2698 {
2699 if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
2700 uint8_t *ptr = inf->vptr - inf->vptr_off;
2701 memcpy(ptr, str.s, str.l);
2702 free(str.s);
2703 int vptr_free = inf->vptr_free;
2704 bcf_unpack_info_core1(ptr, inf);
2705 inf->vptr_free = vptr_free;
2706 }
2707 else
2708 {
2709 assert( !inf->vptr_free ); // fix the caller or improve here: this has been modified before
2710 bcf_unpack_info_core1((uint8_t*)str.s, inf);
2711 inf->vptr_free = 1;
2712 line->d.shared_dirty |= BCF1_DIRTY_INF;
2713 }
2714 }
2715 else
2716 {
2717 // The tag is not present, create new one
2718 line->n_info++;
2719 hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
2720 inf = &line->d.info[line->n_info-1];
2721 bcf_unpack_info_core1((uint8_t*)str.s, inf);
2722 inf->vptr_free = 1;
2723 line->d.shared_dirty |= BCF1_DIRTY_INF;
2724 }
2725 line->unpacked |= BCF_UN_INFO;
2726 return 0;
2727 }
2728
2729 int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
2730 {
2731 if ( !n )
2732 return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
2733
2734 int i, max_len = 0;
2735 for (i=0; i<n; i++)
2736 {
2737 int len = strlen(values[i]);
2738 if ( len > max_len ) max_len = len;
2739 }
2740 char *out = (char*) malloc(max_len*n);
2741 if ( !out ) return -2;
2742 for (i=0; i<n; i++)
2743 {
2744 char *dst = out+i*max_len;
2745 const char *src = values[i];
2746 int j = 0;
2747 while ( src[j] ) { dst[j] = src[j]; j++; }
2748 for (; j<max_len; j++) dst[j] = 0;
2749 }
2750 int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
2751 free(out);
2752 return ret;
2753 }
2754
2755 int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
2756 {
2757 // Is the field already present?
2758 int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
2759 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
2760 {
2761 if ( !n ) return 0;
2762 return -1; // the key not present in the header
2763 }
2764
2765 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
2766
2767 for (i=0; i<line->n_fmt; i++)
2768 if ( line->d.fmt[i].id==fmt_id ) break;
2769 bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
2770
2771 if ( !n )
2772 {
2773 if ( fmt )
2774 {
2775 // Mark the tag for removal, free existing memory if necessary
2776 if ( fmt->p_free )
2777 {
2778 free(fmt->p - fmt->p_off);
2779 fmt->p_free = 0;
2780 }
2781 line->d.indiv_dirty = 1;
2782 fmt->p = NULL;
2783 }
2784 return 0;
2785 }
2786
2787 line->n_sample = bcf_hdr_nsamples(hdr);
2788 int nps = n / line->n_sample; // number of values per sample
2789 assert( nps && nps*line->n_sample==n ); // must be divisible by n_sample
2790
2791 // Encode the values and determine the size required to accommodate the values
2792 kstring_t str = {0,0,0};
2793 bcf_enc_int1(&str, fmt_id);
2794 if ( type==BCF_HT_INT )
2795 bcf_enc_vint(&str, n, (int32_t*)values, nps);
2796 else if ( type==BCF_HT_REAL )
2797 {
2798 bcf_enc_size(&str, nps, BCF_BT_FLOAT);
2799 kputsn((char*)values, nps*line->n_sample*sizeof(float), &str);
2800 }
2801 else if ( type==BCF_HT_STR )
2802 {
2803 bcf_enc_size(&str, nps, BCF_BT_CHAR);
2804 kputsn((char*)values, nps*line->n_sample, &str);
2805 }
2806 else
2807 {
2808 fprintf(stderr, "[E::%s] the type %d not implemented yet\n", __func__, type);
2809 abort();
2810 }
2811
2812 if ( !fmt )
2813 {
2814 // Not present, new format field
2815 line->n_fmt++;
2816 hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
2817
2818 // Special case: VCF specification requires that GT is always first
2819 if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
2820 {
2821 for (i=line->n_fmt-1; i>0; i--)
2822 line->d.fmt[i] = line->d.fmt[i-1];
2823 fmt = &line->d.fmt[0];
2824 }
2825 else
2826 fmt = &line->d.fmt[line->n_fmt-1];
2827 bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
2828 line->d.indiv_dirty = 1;
2829 fmt->p_free = 1;
2830 }
2831 else
2832 {
2833 // The tag is already present, check if it is big enough to accomodate the new block
2834 if ( str.l <= fmt->p_len + fmt->p_off )
2835 {
2836 // good, the block is big enough
2837 if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
2838 uint8_t *ptr = fmt->p - fmt->p_off;
2839 memcpy(ptr, str.s, str.l);
2840 free(str.s);
2841 int p_free = fmt->p_free;
2842 bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
2843 fmt->p_free = p_free;
2844 }
2845 else
2846 {
2847 assert( !fmt->p_free ); // fix the caller or improve here: this has been modified before
2848 bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
2849 fmt->p_free = 1;
2850 line->d.indiv_dirty = 1;
2851 }
2852 }
2853 line->unpacked |= BCF_UN_FMT;
2854 return 0;
2855 }
2856
2857
2858 int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
2859 {
2860 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
2861 line->d.shared_dirty |= BCF1_DIRTY_FLT;
2862 line->d.n_flt = n;
2863 if ( !n ) return 0;
2864 hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
2865 int i;
2866 for (i=0; i<n; i++)
2867 line->d.flt[i] = flt_ids[i];
2868 return 0;
2869 }
2870
2871 int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
2872 {
2873 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
2874 int i;
2875 for (i=0; i<line->d.n_flt; i++)
2876 if ( flt_id==line->d.flt[i] ) break;
2877 if ( i<line->d.n_flt ) return 0; // this filter is already set
2878 line->d.shared_dirty |= BCF1_DIRTY_FLT;
2879 if ( flt_id==0 ) // set to PASS
2880 line->d.n_flt = 1;
2881 else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
2882 line->d.n_flt = 1;
2883 else
2884 line->d.n_flt++;
2885 hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
2886 line->d.flt[line->d.n_flt-1] = flt_id;
2887 return 1;
2888 }
2889 int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
2890 {
2891 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
2892 int i;
2893 for (i=0; i<line->d.n_flt; i++)
2894 if ( flt_id==line->d.flt[i] ) break;
2895 if ( i==line->d.n_flt ) return 0; // the filter is not present
2896 line->d.shared_dirty |= BCF1_DIRTY_FLT;
2897 if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt));
2898 line->d.n_flt--;
2899 if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
2900 return 0;
2901 }
2902
2903 int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
2904 {
2905 if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
2906 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
2907 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1; // not defined in the header
2908
2909 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
2910 if ( id==0 && !line->d.n_flt) return 1; // PASS
2911
2912 int i;
2913 for (i=0; i<line->d.n_flt; i++)
2914 if ( line->d.flt[i]==id ) return 1;
2915 return 0;
2916 }
2917
2918 static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
2919 {
2920 line->d.shared_dirty |= BCF1_DIRTY_ALS;
2921
2922 line->n_allele = nals;
2923 hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
2924
2925 char *als = line->d.als;
2926 int n = 0;
2927 while (n<nals)
2928 {
2929 line->d.allele[n] = als;
2930 while ( *als ) als++;
2931 als++;
2932 n++;
2933 }
2934 return 0;
2935 }
2936 int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
2937 {
2938 kstring_t tmp = {0,0,0};
2939 char *free_old = NULL;
2940
2941 // If the supplied alleles are not pointers to line->d.als, the existing block can be reused.
2942 int i;
2943 for (i=0; i<nals; i++)
2944 if ( alleles[i]>=line->d.als && alleles[i]<line->d.als+line->d.m_als ) break;
2945 if ( i==nals )
2946 {
2947 // all alleles point elsewhere, reuse the existing block
2948 tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
2949 }
2950 else
2951 free_old = line->d.als;
2952
2953 for (i=0; i<nals; i++)
2954 {
2955 kputs(alleles[i], &tmp);
2956 kputc(0, &tmp);
2957 }
2958 line->d.als = tmp.s; line->d.m_als = tmp.m;
2959 free(free_old);
2960 return _bcf1_sync_alleles(hdr,line,nals);
2961 }
2962
2963 int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
2964 {
2965 kstring_t tmp;
2966 tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
2967 kputs(alleles_string, &tmp);
2968 line->d.als = tmp.s; line->d.m_als = tmp.m;
2969
2970 int nals = 1;
2971 char *t = line->d.als;
2972 while (*t)
2973 {
2974 if ( *t==',' ) { *t = 0; nals++; }
2975 t++;
2976 }
2977 return _bcf1_sync_alleles(hdr, line, nals);
2978 }
2979
2980 int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
2981 {
2982 kstring_t tmp;
2983 tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
2984 if ( id )
2985 kputs(id, &tmp);
2986 else
2987 kputs(".", &tmp);
2988 line->d.id = tmp.s; line->d.m_id = tmp.m;
2989 line->d.shared_dirty |= BCF1_DIRTY_ID;
2990 return 0;
2991 }
2992
2993 bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
2994 {
2995 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
2996 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL; // no such FMT field in the header
2997 return bcf_get_fmt_id(line, id);
2998 }
2999
3000 bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
3001 {
3002 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
3003 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL; // no such INFO field in the header
3004 return bcf_get_info_id(line, id);
3005 }
3006
3007 bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
3008 {
3009 int i;
3010 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
3011 for (i=0; i<line->n_fmt; i++)
3012 {
3013 if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
3014 }
3015 return NULL;
3016 }
3017
3018 bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
3019 {
3020 int i;
3021 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
3022 for (i=0; i<line->n_info; i++)
3023 {
3024 if ( line->d.info[i].key==id ) return &line->d.info[i];
3025 }
3026 return NULL;
3027 }
3028
3029
3030 int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
3031 {
3032 int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
3033 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header
3034 if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=type ) return -2; // expected different type
3035
3036 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
3037
3038 for (i=0; i<line->n_info; i++)
3039 if ( line->d.info[i].key==tag_id ) break;
3040 if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3; // the tag is not present in this record
3041 if ( type==BCF_HT_FLAG ) return 1;
3042
3043 bcf_info_t *info = &line->d.info[i];
3044 if ( type==BCF_HT_STR )
3045 {
3046 if ( *ndst < info->len+1 )
3047 {
3048 *ndst = info->len + 1;
3049 *dst = realloc(*dst, *ndst);
3050 }
3051 memcpy(*dst,info->vptr,info->len);
3052 ((uint8_t*)*dst)[info->len] = 0;
3053 return info->len;
3054 }
3055
3056 // Make sure the buffer is big enough
3057 int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
3058 if ( *ndst < info->len )
3059 {
3060 *ndst = info->len;
3061 *dst = realloc(*dst, *ndst * size1);
3062 }
3063
3064 if ( info->len == 1 )
3065 {
3066 if ( info->type==BCF_BT_FLOAT ) *((float*)*dst) = info->v1.f;
3067 else
3068 {
3069 #define BRANCH(type_t, missing) { \
3070 if ( info->v1.i==missing ) *((int32_t*)*dst) = bcf_int32_missing; \
3071 else *((int32_t*)*dst) = info->v1.i; \
3072 }
3073 switch (info->type)
3074 {
3075 case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing ); break;
3076 case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing); break;
3077 case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing); break;
3078 }
3079 #undef BRANCH
3080 }
3081 return 1;
3082 }
3083
3084 #define BRANCH(type_t, is_missing, is_vector_end, set_missing, out_type_t) { \
3085 out_type_t *tmp = (out_type_t *) *dst; \
3086 type_t *p = (type_t *) info->vptr; \
3087 for (j=0; j<info->len; j++) \
3088 { \
3089 if ( is_vector_end ) return j; \
3090 if ( is_missing ) set_missing; \
3091 else *tmp = p[j]; \
3092 tmp++; \
3093 } \
3094 return j; \
3095 }
3096 switch (info->type) {
3097 case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, *tmp=bcf_int32_missing, int32_t); break;
3098 case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, *tmp=bcf_int32_missing, int32_t); break;
3099 case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, *tmp=bcf_int32_missing, int32_t); break;
3100 case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), float); break;
3101 default: fprintf(stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
3102 }
3103 #undef BRANCH
3104 return -4; // this can never happen
3105 }
3106
3107 int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
3108 {
3109 int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
3110 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
3111 if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2; // expected different type
3112
3113 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
3114
3115 for (i=0; i<line->n_fmt; i++)
3116 if ( line->d.fmt[i].id==tag_id ) break;
3117 if ( i==line->n_fmt ) return -3; // the tag is not present in this record
3118 bcf_fmt_t *fmt = &line->d.fmt[i];
3119
3120 int nsmpl = bcf_hdr_nsamples(hdr);
3121 if ( !*dst )
3122 {
3123 *dst = (char**) malloc(sizeof(char*)*nsmpl);
3124 if ( !*dst ) return -4; // could not alloc
3125 (*dst)[0] = NULL;
3126 }
3127 int n = (fmt->n+1)*nsmpl;
3128 if ( *ndst < n )
3129 {
3130 (*dst)[0] = realloc((*dst)[0], n);
3131 if ( !(*dst)[0] ) return -4; // could not alloc
3132 *ndst = n;
3133 }
3134 for (i=0; i<nsmpl; i++)
3135 {
3136 uint8_t *src = fmt->p + i*fmt->n;
3137 uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
3138 memcpy(tmp,src,fmt->n);
3139 tmp[fmt->n] = 0;
3140 (*dst)[i] = (char*) tmp;
3141 }
3142 return n;
3143 }
3144
3145 int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
3146 {
3147 int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
3148 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
3149 if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
3150 {
3151 // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
3152 if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
3153 }
3154 else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2; // expected different type
3155
3156 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
3157
3158 for (i=0; i<line->n_fmt; i++)
3159 if ( line->d.fmt[i].id==tag_id ) break;
3160 if ( i==line->n_fmt ) return -3; // the tag is not present in this record
3161 bcf_fmt_t *fmt = &line->d.fmt[i];
3162
3163 if ( type==BCF_HT_STR )
3164 {
3165 int n = fmt->n*bcf_hdr_nsamples(hdr);
3166 if ( *ndst < n )
3167 {
3168 *dst = realloc(*dst, n);
3169 if ( !*dst ) return -4; // could not alloc
3170 *ndst = n;
3171 }
3172 memcpy(*dst,fmt->p,n);
3173 return n;
3174 }
3175
3176 // Make sure the buffer is big enough
3177 int nsmpl = bcf_hdr_nsamples(hdr);
3178 int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
3179 if ( *ndst < fmt->n*nsmpl )
3180 {
3181 *ndst = fmt->n*nsmpl;
3182 *dst = realloc(*dst, *ndst*size1);
3183 if ( !dst ) return -4; // could not alloc
3184 }
3185
3186 #define BRANCH(type_t, is_missing, is_vector_end, set_missing, set_vector_end, out_type_t) { \
3187 out_type_t *tmp = (out_type_t *) *dst; \
3188 type_t *p = (type_t*) fmt->p; \
3189 for (i=0; i<nsmpl; i++) \
3190 { \
3191 for (j=0; j<fmt->n; j++) \
3192 { \
3193 if ( is_missing ) set_missing; \
3194 else if ( is_vector_end ) { set_vector_end; break; } \
3195 else *tmp = p[j]; \
3196 tmp++; \
3197 } \
3198 for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
3199 p = (type_t *)((char *)p + fmt->size); \
3200 } \
3201 }
3202 switch (fmt->type) {
3203 case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
3204 case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
3205 case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
3206 case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), float); break;
3207 default: fprintf(stderr,"TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->type); exit(1);
3208 }
3209 #undef BRANCH
3210 return nsmpl*fmt->n;
3211 }
3212