0
|
1 /* vcf.c -- VCF/BCF API functions.
|
|
2
|
|
3 Copyright (C) 2012, 2013 Broad Institute.
|
|
4 Copyright (C) 2012-2014 Genome Research Ltd.
|
|
5 Portions copyright (C) 2014 Intel Corporation.
|
|
6
|
|
7 Author: Heng Li <lh3@sanger.ac.uk>
|
|
8
|
|
9 Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
10 of this software and associated documentation files (the "Software"), to deal
|
|
11 in the Software without restriction, including without limitation the rights
|
|
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
13 copies of the Software, and to permit persons to whom the Software is
|
|
14 furnished to do so, subject to the following conditions:
|
|
15
|
|
16 The above copyright notice and this permission notice shall be included in
|
|
17 all copies or substantial portions of the Software.
|
|
18
|
|
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
25 DEALINGS IN THE SOFTWARE. */
|
|
26
|
|
27 #include <zlib.h>
|
|
28 #include <stdio.h>
|
|
29 #include <ctype.h>
|
|
30 #include <assert.h>
|
|
31 #include <string.h>
|
|
32 #include <stdlib.h>
|
|
33 #include <limits.h>
|
|
34 #include "htslib/kstring.h"
|
|
35 #include "htslib/bgzf.h"
|
|
36 #include "htslib/vcf.h"
|
|
37 #include "htslib/tbx.h"
|
|
38 #include "htslib/hfile.h"
|
|
39 #include "htslib/khash_str2int.h"
|
|
40
|
|
41 #include "htslib/khash.h"
|
|
42 KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
|
|
43 typedef khash_t(vdict) vdict_t;
|
|
44
|
|
45 #include "htslib/kseq.h"
|
|
46 KSTREAM_DECLARE(gzFile, gzread)
|
|
47
|
|
48 uint32_t bcf_float_missing = 0x7F800001;
|
|
49 uint32_t bcf_float_vector_end = 0x7F800002;
|
|
50 uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
|
|
51 static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
|
|
52
|
|
53 /*************************
|
|
54 *** VCF header parser ***
|
|
55 *************************/
|
|
56
|
|
57 int bcf_hdr_sync(bcf_hdr_t *h);
|
|
58
|
|
59 int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
|
|
60 {
|
|
61 if ( !s ) return 0;
|
|
62
|
|
63 const char *ss = s;
|
|
64 while ( !*ss && isspace(*ss) ) ss++;
|
|
65 if ( !*ss )
|
|
66 {
|
|
67 fprintf(stderr,"[E::%s] Empty sample name: trailing spaces/tabs in the header line?\n", __func__);
|
|
68 abort();
|
|
69 }
|
|
70
|
|
71 vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
|
|
72 int ret;
|
|
73 char *sdup = strdup(s);
|
|
74 int k = kh_put(vdict, d, sdup, &ret);
|
|
75 if (ret) { // absent
|
|
76 kh_val(d, k) = bcf_idinfo_def;
|
|
77 kh_val(d, k).id = kh_size(d) - 1;
|
|
78 } else {
|
|
79 if (hts_verbose >= 2)
|
|
80 {
|
|
81 fprintf(stderr, "[E::%s] Duplicated sample name '%s'\n", __func__, s);
|
|
82 abort();
|
|
83 }
|
|
84 free(sdup);
|
|
85 return -1;
|
|
86 }
|
|
87 int n = kh_size(d);
|
|
88 h->samples = (char**) realloc(h->samples,sizeof(char*)*n);
|
|
89 h->samples[n-1] = sdup;
|
|
90 h->dirty = 1;
|
|
91 return 0;
|
|
92 }
|
|
93
|
|
94 int bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str)
|
|
95 {
|
|
96 int ret = 0;
|
|
97 int i = 0;
|
|
98 const char *p, *q;
|
|
99 // add samples
|
|
100 for (p = q = str;; ++q) {
|
|
101 if (*q != '\t' && *q != 0 && *q != '\n') continue;
|
|
102 if (++i > 9) {
|
|
103 char *s = (char*)malloc(q - p + 1);
|
|
104 strncpy(s, p, q - p);
|
|
105 s[q - p] = 0;
|
|
106 if ( bcf_hdr_add_sample(h,s) < 0 ) ret = -1;
|
|
107 free(s);
|
|
108 }
|
|
109 if (*q == 0 || *q == '\n') break;
|
|
110 p = q + 1;
|
|
111 }
|
|
112 bcf_hdr_add_sample(h,NULL);
|
|
113 return ret;
|
|
114 }
|
|
115
|
|
116 int bcf_hdr_sync(bcf_hdr_t *h)
|
|
117 {
|
|
118 int i;
|
|
119 for (i = 0; i < 3; i++)
|
|
120 {
|
|
121 vdict_t *d = (vdict_t*)h->dict[i];
|
|
122 khint_t k;
|
|
123
|
|
124 // find out the largest id, there may be holes because of IDX
|
|
125 int max_id = -1;
|
|
126 for (k=kh_begin(d); k<kh_end(d); k++)
|
|
127 {
|
|
128 if (!kh_exist(d,k)) continue;
|
|
129 if ( max_id < kh_val(d,k).id ) max_id = kh_val(d,k).id;
|
|
130 }
|
|
131 if ( max_id >= h->n[i] )
|
|
132 {
|
|
133 h->id[i] = (bcf_idpair_t*)realloc(h->id[i], (max_id+1)*sizeof(bcf_idpair_t));
|
|
134 for (k=h->n[i]; k<=max_id; k++)
|
|
135 {
|
|
136 h->id[i][k].key = NULL;
|
|
137 h->id[i][k].val = NULL;
|
|
138 }
|
|
139 h->n[i] = max_id+1;
|
|
140 }
|
|
141 for (k=kh_begin(d); k<kh_end(d); k++)
|
|
142 {
|
|
143 if (!kh_exist(d,k)) continue;
|
|
144 h->id[i][kh_val(d,k).id].key = kh_key(d,k);
|
|
145 h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
|
|
146 }
|
|
147 }
|
|
148 h->dirty = 0;
|
|
149 return 0;
|
|
150 }
|
|
151
|
|
152 void bcf_hrec_destroy(bcf_hrec_t *hrec)
|
|
153 {
|
|
154 free(hrec->key);
|
|
155 if ( hrec->value ) free(hrec->value);
|
|
156 int i;
|
|
157 for (i=0; i<hrec->nkeys; i++)
|
|
158 {
|
|
159 free(hrec->keys[i]);
|
|
160 free(hrec->vals[i]);
|
|
161 }
|
|
162 free(hrec->keys);
|
|
163 free(hrec->vals);
|
|
164 free(hrec);
|
|
165 }
|
|
166
|
|
167 // Copies all fields except IDX.
|
|
168 bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
|
|
169 {
|
|
170 bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
|
|
171 out->type = hrec->type;
|
|
172 if ( hrec->key ) out->key = strdup(hrec->key);
|
|
173 if ( hrec->value ) out->value = strdup(hrec->value);
|
|
174 out->nkeys = hrec->nkeys;
|
|
175 out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
|
|
176 out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
|
|
177 int i, j = 0;
|
|
178 for (i=0; i<hrec->nkeys; i++)
|
|
179 {
|
|
180 if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
|
|
181 if ( hrec->keys[i] ) out->keys[j] = strdup(hrec->keys[i]);
|
|
182 if ( hrec->vals[i] ) out->vals[j] = strdup(hrec->vals[i]);
|
|
183 j++;
|
|
184 }
|
|
185 if ( i!=j ) out->nkeys -= i-j; // IDX was omitted
|
|
186 return out;
|
|
187 }
|
|
188
|
|
189 void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
|
|
190 {
|
|
191 fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
|
|
192 int i;
|
|
193 for (i=0; i<hrec->nkeys; i++)
|
|
194 fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
|
|
195 fprintf(fp, "\n");
|
|
196 }
|
|
197
|
|
198 void bcf_header_debug(bcf_hdr_t *hdr)
|
|
199 {
|
|
200 int i, j;
|
|
201 for (i=0; i<hdr->nhrec; i++)
|
|
202 {
|
|
203 if ( !hdr->hrec[i]->value )
|
|
204 {
|
|
205 fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
|
|
206 fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
|
|
207 for (j=1; j<hdr->hrec[i]->nkeys; j++)
|
|
208 fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
|
|
209 fprintf(stderr,">\n");
|
|
210 }
|
|
211 else
|
|
212 fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
|
|
213 }
|
|
214 }
|
|
215
|
|
216 void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len)
|
|
217 {
|
|
218 int n = ++hrec->nkeys;
|
|
219 hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*n);
|
|
220 hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*n);
|
|
221 assert( len );
|
|
222 hrec->keys[n-1] = (char*) malloc((len+1)*sizeof(char));
|
|
223 memcpy(hrec->keys[n-1],str,len);
|
|
224 hrec->keys[n-1][len] = 0;
|
|
225 hrec->vals[n-1] = NULL;
|
|
226 }
|
|
227
|
|
228 void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted)
|
|
229 {
|
|
230 if ( !str ) { hrec->vals[i] = NULL; return; }
|
|
231 if ( hrec->vals[i] ) free(hrec->vals[i]);
|
|
232 if ( is_quoted )
|
|
233 {
|
|
234 hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
|
|
235 hrec->vals[i][0] = '"';
|
|
236 memcpy(&hrec->vals[i][1],str,len);
|
|
237 hrec->vals[i][len+1] = '"';
|
|
238 hrec->vals[i][len+2] = 0;
|
|
239 }
|
|
240 else
|
|
241 {
|
|
242 hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
|
|
243 memcpy(hrec->vals[i],str,len);
|
|
244 hrec->vals[i][len] = 0;
|
|
245 }
|
|
246 }
|
|
247
|
|
248 void hrec_add_idx(bcf_hrec_t *hrec, int idx)
|
|
249 {
|
|
250 int n = ++hrec->nkeys;
|
|
251 hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*n);
|
|
252 hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*n);
|
|
253 hrec->keys[n-1] = strdup("IDX");
|
|
254 kstring_t str = {0,0,0};
|
|
255 kputw(idx, &str);
|
|
256 hrec->vals[n-1] = str.s;
|
|
257 }
|
|
258
|
|
259 int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
|
|
260 {
|
|
261 int i;
|
|
262 for (i=0; i<hrec->nkeys; i++)
|
|
263 if ( !strcasecmp(key,hrec->keys[i]) ) return i;
|
|
264 return -1;
|
|
265 }
|
|
266
|
|
267 static inline int is_escaped(const char *min, const char *str)
|
|
268 {
|
|
269 int n = 0;
|
|
270 while ( --str>=min && *str=='\\' ) n++;
|
|
271 return n%2;
|
|
272 }
|
|
273
|
|
274 bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
|
|
275 {
|
|
276 const char *p = line;
|
|
277 if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
|
|
278 p += 2;
|
|
279
|
|
280 const char *q = p;
|
|
281 while ( *q && *q!='=' ) q++;
|
|
282 int n = q-p;
|
|
283 if ( *q!='=' || !n ) { *len = q-line+1; return NULL; } // wrong format
|
|
284
|
|
285 bcf_hrec_t *hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
|
|
286 hrec->key = (char*) malloc(sizeof(char)*(n+1));
|
|
287 memcpy(hrec->key,p,n);
|
|
288 hrec->key[n] = 0;
|
|
289
|
|
290 p = ++q;
|
|
291 if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
|
|
292 {
|
|
293 while ( *q && *q!='\n' ) q++;
|
|
294 hrec->value = (char*) malloc((q-p+1)*sizeof(char));
|
|
295 memcpy(hrec->value, p, q-p);
|
|
296 hrec->value[q-p] = 0;
|
|
297 *len = q-line+1;
|
|
298 return hrec;
|
|
299 }
|
|
300
|
|
301 // structured line, e.g. ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
|
|
302 int nopen = 1;
|
|
303 while ( *q && *q!='\n' && nopen )
|
|
304 {
|
|
305 p = ++q;
|
|
306 while ( *q && isalnum(*q) ) q++;
|
|
307 n = q-p;
|
|
308 if ( *q!='=' || !n )
|
|
309 {
|
|
310 // wrong format
|
|
311 while ( *q && *q!='\n' ) q++;
|
|
312 kstring_t tmp = {0,0,0};
|
|
313 kputsn(line,q-line,&tmp);
|
|
314 fprintf(stderr,"Could not parse the header line: \"%s\"\n", tmp.s);
|
|
315 free(tmp.s);
|
|
316 *len = q-line+1;
|
|
317 bcf_hrec_destroy(hrec);
|
|
318 return NULL;
|
|
319 }
|
|
320 bcf_hrec_add_key(hrec, p, q-p);
|
|
321 p = ++q;
|
|
322 int quoted = *p=='"' ? 1 : 0;
|
|
323 if ( quoted ) p++, q++;
|
|
324 while (1)
|
|
325 {
|
|
326 if ( !*q ) break;
|
|
327 if ( quoted ) { if ( *q=='"' && !is_escaped(p,q) ) break; }
|
|
328 else
|
|
329 {
|
|
330 if ( *q=='<' ) nopen++;
|
|
331 if ( *q=='>' ) nopen--;
|
|
332 if ( !nopen ) break;
|
|
333 if ( *q==',' && nopen==1 ) break;
|
|
334 }
|
|
335 q++;
|
|
336 }
|
|
337 bcf_hrec_set_val(hrec, hrec->nkeys-1, p, q-p, quoted);
|
|
338 if ( quoted ) q++;
|
|
339 if ( *q=='>' ) { nopen--; q++; }
|
|
340 }
|
|
341 *len = q-line+1;
|
|
342 return hrec;
|
|
343 }
|
|
344
|
|
345 // returns: 1 when hdr needs to be synced, 0 otherwise
|
|
346 int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
|
|
347 {
|
|
348 // contig
|
|
349 int i,j,k, ret;
|
|
350 char *str;
|
|
351 if ( !strcmp(hrec->key, "contig") )
|
|
352 {
|
|
353 hrec->type = BCF_HL_CTG;
|
|
354
|
|
355 // Get the contig ID ($str) and length ($j)
|
|
356 i = bcf_hrec_find_key(hrec,"length");
|
|
357 if ( i<0 ) j = 0;
|
|
358 else if ( sscanf(hrec->vals[i],"%d",&j)!=1 ) return 0;
|
|
359
|
|
360 i = bcf_hrec_find_key(hrec,"ID");
|
|
361 if ( i<0 ) return 0;
|
|
362 str = strdup(hrec->vals[i]);
|
|
363
|
|
364 // Register in the dictionary
|
|
365 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
|
|
366 k = kh_put(vdict, d, str, &ret);
|
|
367 if ( !ret ) { free(str); return 0; } // already present
|
|
368
|
|
369 int idx = bcf_hrec_find_key(hrec,"IDX");
|
|
370 if ( idx!=-1 )
|
|
371 {
|
|
372 char *tmp = hrec->vals[idx];
|
|
373 idx = strtol(hrec->vals[idx], &tmp, 10);
|
|
374 if ( *tmp )
|
|
375 {
|
|
376 fprintf(stderr,"[%s:%d %s] Error parsing the IDX tag, skipping.\n", __FILE__,__LINE__,__FUNCTION__);
|
|
377 return 0;
|
|
378 }
|
|
379 }
|
|
380 else
|
|
381 {
|
|
382 idx = kh_size(d) - 1;
|
|
383 hrec_add_idx(hrec, idx);
|
|
384 }
|
|
385
|
|
386 kh_val(d, k) = bcf_idinfo_def;
|
|
387 kh_val(d, k).id = idx;
|
|
388 kh_val(d, k).info[0] = j;
|
|
389 kh_val(d, k).hrec[0] = hrec;
|
|
390
|
|
391 return 1;
|
|
392 }
|
|
393
|
|
394 if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
|
|
395 else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
|
|
396 else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
|
|
397 else if ( hrec->nkeys>0 ) { hrec->type = BCF_HL_STR; return 1; }
|
|
398 else return 0;
|
|
399
|
|
400 // INFO/FILTER/FORMAT
|
|
401 char *id = NULL;
|
|
402 int type = -1, num = -1, var = -1, idx = -1;
|
|
403 for (i=0; i<hrec->nkeys; i++)
|
|
404 {
|
|
405 if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
|
|
406 else if ( !strcmp(hrec->keys[i], "IDX") )
|
|
407 {
|
|
408 char *tmp = hrec->vals[i];
|
|
409 idx = strtol(hrec->vals[i], &tmp, 10);
|
|
410 if ( *tmp )
|
|
411 {
|
|
412 fprintf(stderr,"[%s:%d %s] Error parsing the IDX tag, skipping.\n", __FILE__,__LINE__,__FUNCTION__);
|
|
413 return 0;
|
|
414 }
|
|
415 }
|
|
416 else if ( !strcmp(hrec->keys[i], "Type") )
|
|
417 {
|
|
418 if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
|
|
419 else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
|
|
420 else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
|
|
421 else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
|
|
422 else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
|
|
423 else
|
|
424 {
|
|
425 fprintf(stderr, "[E::%s] The type \"%s\" not supported, assuming \"String\"\n", __func__, hrec->vals[i]);
|
|
426 type = BCF_HT_STR;
|
|
427 }
|
|
428 }
|
|
429 else if ( !strcmp(hrec->keys[i], "Number") )
|
|
430 {
|
|
431 if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
|
|
432 else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
|
|
433 else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
|
|
434 else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
|
|
435 else
|
|
436 {
|
|
437 sscanf(hrec->vals[i],"%d",&num);
|
|
438 var = BCF_VL_FIXED;
|
|
439 }
|
|
440 if (var != BCF_VL_FIXED) num = 0xfffff;
|
|
441 }
|
|
442 }
|
|
443 uint32_t info = (uint32_t)num<<12 | var<<8 | type<<4 | hrec->type;
|
|
444
|
|
445 if ( !id ) return 0;
|
|
446 str = strdup(id);
|
|
447
|
|
448 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
|
|
449 k = kh_put(vdict, d, str, &ret);
|
|
450 if ( !ret )
|
|
451 {
|
|
452 // already present
|
|
453 free(str);
|
|
454 if ( kh_val(d, k).hrec[info&0xf] ) return 0;
|
|
455 kh_val(d, k).info[info&0xf] = info;
|
|
456 kh_val(d, k).hrec[info&0xf] = hrec;
|
|
457 if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d, k).id);
|
|
458 return 1;
|
|
459 }
|
|
460 kh_val(d, k) = bcf_idinfo_def;
|
|
461 kh_val(d, k).info[info&0xf] = info;
|
|
462 kh_val(d, k).hrec[info&0xf] = hrec;
|
|
463 kh_val(d, k).id = idx==-1 ? kh_size(d) - 1 : idx;
|
|
464
|
|
465 if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d, k).id);
|
|
466
|
|
467 return 1;
|
|
468 }
|
|
469
|
|
470 int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
|
|
471 {
|
|
472 if ( !hrec ) return 0;
|
|
473
|
|
474 hrec->type = BCF_HL_GEN;
|
|
475 if ( !bcf_hdr_register_hrec(hdr,hrec) )
|
|
476 {
|
|
477 // If one of the hashed field, then it is already present
|
|
478 if ( hrec->type != BCF_HL_GEN )
|
|
479 {
|
|
480 bcf_hrec_destroy(hrec);
|
|
481 return 0;
|
|
482 }
|
|
483
|
|
484 // Is one of the generic fields and already present?
|
|
485 int i;
|
|
486 for (i=0; i<hdr->nhrec; i++)
|
|
487 {
|
|
488 if ( hdr->hrec[i]->type!=BCF_HL_GEN ) continue;
|
|
489 if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hrec->key,"fileformat") ) break;
|
|
490 if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hdr->hrec[i]->value,hrec->value) ) break;
|
|
491 }
|
|
492 if ( i<hdr->nhrec )
|
|
493 {
|
|
494 bcf_hrec_destroy(hrec);
|
|
495 return 0;
|
|
496 }
|
|
497 }
|
|
498
|
|
499 // New record, needs to be added
|
|
500 int n = ++hdr->nhrec;
|
|
501 hdr->hrec = (bcf_hrec_t**) realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
|
|
502 hdr->hrec[n-1] = hrec;
|
|
503 hdr->dirty = 1;
|
|
504
|
|
505 return hrec->type==BCF_HL_GEN ? 0 : 1;
|
|
506 }
|
|
507
|
|
508 /*
|
|
509 * Note that while querying of FLT,INFO,FMT,CTG lines is fast (the keys are hashed),
|
|
510 * the STR,GEN lines are searched for linearly in a linked list of all header lines.
|
|
511 * This may become a problem for VCFs with huge headers, we might need to build a
|
|
512 * dictionary for these lines as well.
|
|
513 */
|
|
514 bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
|
|
515 {
|
|
516 int i;
|
|
517 if ( type==BCF_HL_GEN )
|
|
518 {
|
|
519 for (i=0; i<hdr->nhrec; i++)
|
|
520 {
|
|
521 if ( hdr->hrec[i]->type!=type ) continue;
|
|
522 if ( strcmp(hdr->hrec[i]->key,key) ) continue;
|
|
523 if ( !value || !strcmp(hdr->hrec[i]->value,value) ) return hdr->hrec[i];
|
|
524 }
|
|
525 return NULL;
|
|
526 }
|
|
527 else if ( type==BCF_HL_STR )
|
|
528 {
|
|
529 for (i=0; i<hdr->nhrec; i++)
|
|
530 {
|
|
531 if ( hdr->hrec[i]->type!=type ) continue;
|
|
532 if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
|
|
533 int j = bcf_hrec_find_key(hdr->hrec[i],key);
|
|
534 if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
|
|
535 }
|
|
536 return NULL;
|
|
537 }
|
|
538 vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
|
|
539 khint_t k = kh_get(vdict, d, value);
|
|
540 if ( k == kh_end(d) ) return NULL;
|
|
541 return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
|
|
542 }
|
|
543
|
|
544 void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
|
|
545 {
|
|
546 static int PL_warned = 0, GL_warned = 0;
|
|
547
|
|
548 if ( !PL_warned )
|
|
549 {
|
|
550 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL");
|
|
551 if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
|
|
552 {
|
|
553 fprintf(stderr,"[W::%s] PL should be declared as Number=G\n", __func__);
|
|
554 PL_warned = 1;
|
|
555 }
|
|
556 }
|
|
557 if ( !GL_warned )
|
|
558 {
|
|
559 int id = bcf_hdr_id2int(hdr, BCF_HL_FMT, "GL");
|
|
560 if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
|
|
561 {
|
|
562 fprintf(stderr,"[W::%s] GL should be declared as Number=G\n", __func__);
|
|
563 PL_warned = 1;
|
|
564 }
|
|
565 }
|
|
566 }
|
|
567
|
|
568 int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
|
|
569 {
|
|
570 int len, needs_sync = 0;
|
|
571 char *p = htxt;
|
|
572
|
|
573 // Check sanity: "fileformat" string must come as first
|
|
574 bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
|
|
575 if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
|
|
576 fprintf(stderr, "[W::%s] The first line should be ##fileformat; is the VCF/BCF header broken?\n", __func__);
|
|
577 needs_sync += bcf_hdr_add_hrec(hdr, hrec);
|
|
578
|
|
579 // The filter PASS must appear first in the dictionary
|
|
580 hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
|
|
581 needs_sync += bcf_hdr_add_hrec(hdr, hrec);
|
|
582
|
|
583 // Parse the whole header
|
|
584 while ( (hrec=bcf_hdr_parse_line(hdr,p,&len)) )
|
|
585 {
|
|
586 needs_sync += bcf_hdr_add_hrec(hdr, hrec);
|
|
587 p += len;
|
|
588 }
|
|
589 int ret = bcf_hdr_parse_sample_line(hdr,p);
|
|
590 bcf_hdr_sync(hdr);
|
|
591 bcf_hdr_check_sanity(hdr);
|
|
592 return ret;
|
|
593 }
|
|
594
|
|
595 int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
|
|
596 {
|
|
597 int len;
|
|
598 bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
|
|
599 if ( !hrec ) return -1;
|
|
600 bcf_hdr_add_hrec(hdr, hrec);
|
|
601 return 0;
|
|
602 }
|
|
603
|
|
604 void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
|
|
605 {
|
|
606 int i;
|
|
607 bcf_hrec_t *hrec;
|
|
608 while (1)
|
|
609 {
|
|
610 if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
|
|
611 {
|
|
612 hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
|
|
613 if ( !hrec ) return;
|
|
614
|
|
615 for (i=0; i<hdr->nhrec; i++)
|
|
616 if ( hdr->hrec[i]==hrec ) break;
|
|
617 assert( i<hdr->nhrec );
|
|
618
|
|
619 vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
|
|
620 khint_t k = kh_get(vdict, d, key);
|
|
621 kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
|
|
622 }
|
|
623 else
|
|
624 {
|
|
625 for (i=0; i<hdr->nhrec; i++)
|
|
626 {
|
|
627 if ( hdr->hrec[i]->type!=type ) continue;
|
|
628 if ( type==BCF_HL_GEN )
|
|
629 {
|
|
630 if ( !strcmp(hdr->hrec[i]->key,key) ) break;
|
|
631 }
|
|
632 else
|
|
633 {
|
|
634 // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
|
|
635 int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
|
|
636 if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
|
|
637 }
|
|
638 }
|
|
639 if ( i==hdr->nhrec ) return;
|
|
640 hrec = hdr->hrec[i];
|
|
641 }
|
|
642
|
|
643 hdr->nhrec--;
|
|
644 if ( i < hdr->nhrec )
|
|
645 memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
|
|
646 bcf_hrec_destroy(hrec);
|
|
647 hdr->dirty = 1;
|
|
648 }
|
|
649 }
|
|
650
|
|
651 int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
|
|
652 {
|
|
653 va_list ap;
|
|
654 va_start(ap, fmt);
|
|
655 int n = vsnprintf(NULL, 0, fmt, ap) + 2;
|
|
656 va_end(ap);
|
|
657
|
|
658 char *line = (char*)malloc(n);
|
|
659 va_start(ap, fmt);
|
|
660 vsnprintf(line, n, fmt, ap);
|
|
661 va_end(ap);
|
|
662
|
|
663 int ret = bcf_hdr_append(hdr, line);
|
|
664
|
|
665 free(line);
|
|
666 return ret;
|
|
667 }
|
|
668
|
|
669
|
|
670 /**********************
|
|
671 *** BCF header I/O ***
|
|
672 **********************/
|
|
673
|
|
674 const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
|
|
675 {
|
|
676 bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
|
|
677 if ( !hrec )
|
|
678 {
|
|
679 fprintf(stderr,"No version string found, assuming VCFv4.2\n");
|
|
680 return "VCFv4.2";
|
|
681 }
|
|
682 return hrec->value;
|
|
683 }
|
|
684
|
|
685 void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
|
|
686 {
|
|
687 bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
|
|
688 if ( !hrec )
|
|
689 {
|
|
690 int len;
|
|
691 kstring_t str = {0,0,0};
|
|
692 ksprintf(&str,"##fileformat=%s", version);
|
|
693 hrec = bcf_hdr_parse_line(hdr, str.s, &len);
|
|
694 free(str.s);
|
|
695 }
|
|
696 else
|
|
697 {
|
|
698 free(hrec->value);
|
|
699 hrec->value = strdup(version);
|
|
700 }
|
|
701 hdr->dirty = 1;
|
|
702 }
|
|
703
|
|
704 bcf_hdr_t *bcf_hdr_init(const char *mode)
|
|
705 {
|
|
706 int i;
|
|
707 bcf_hdr_t *h;
|
|
708 h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
|
|
709 for (i = 0; i < 3; ++i)
|
|
710 h->dict[i] = kh_init(vdict);
|
|
711 if ( strchr(mode,'w') )
|
|
712 {
|
|
713 bcf_hdr_append(h, "##fileformat=VCFv4.2");
|
|
714 // The filter PASS must appear first in the dictionary
|
|
715 bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
|
|
716 }
|
|
717 return h;
|
|
718 }
|
|
719
|
|
720 void bcf_hdr_destroy(bcf_hdr_t *h)
|
|
721 {
|
|
722 int i;
|
|
723 khint_t k;
|
|
724 for (i = 0; i < 3; ++i) {
|
|
725 vdict_t *d = (vdict_t*)h->dict[i];
|
|
726 if (d == 0) continue;
|
|
727 for (k = kh_begin(d); k != kh_end(d); ++k)
|
|
728 if (kh_exist(d, k)) free((char*)kh_key(d, k));
|
|
729 kh_destroy(vdict, d);
|
|
730 free(h->id[i]);
|
|
731 }
|
|
732 for (i=0; i<h->nhrec; i++)
|
|
733 bcf_hrec_destroy(h->hrec[i]);
|
|
734 if (h->nhrec) free(h->hrec);
|
|
735 if (h->samples) free(h->samples);
|
|
736 free(h->keep_samples);
|
|
737 free(h->transl[0]); free(h->transl[1]);
|
|
738 free(h->mem.s);
|
|
739 free(h);
|
|
740 }
|
|
741
|
|
742 bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
|
|
743 {
|
|
744 if (hfp->format.format == vcf)
|
|
745 return vcf_hdr_read(hfp);
|
|
746
|
|
747 BGZF *fp = hfp->fp.bgzf;
|
|
748 uint8_t magic[5];
|
|
749 bcf_hdr_t *h;
|
|
750 h = bcf_hdr_init("r");
|
|
751 if ( bgzf_read(fp, magic, 5)<0 )
|
|
752 {
|
|
753 fprintf(stderr,"[%s:%d %s] Failed to read the header (reading BCF in text mode?)\n", __FILE__,__LINE__,__FUNCTION__);
|
|
754 return NULL;
|
|
755 }
|
|
756 if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
|
|
757 {
|
|
758 if (!strncmp((char*)magic, "BCF", 3))
|
|
759 fprintf(stderr,"[%s:%d %s] invalid BCF2 magic string: only BCFv2.2 is supported.\n", __FILE__,__LINE__,__FUNCTION__);
|
|
760 else if (hts_verbose >= 2)
|
|
761 fprintf(stderr, "[E::%s] invalid BCF2 magic string\n", __func__);
|
|
762 bcf_hdr_destroy(h);
|
|
763 return 0;
|
|
764 }
|
|
765 int hlen;
|
|
766 char *htxt;
|
|
767 bgzf_read(fp, &hlen, 4);
|
|
768 htxt = (char*)malloc(hlen);
|
|
769 bgzf_read(fp, htxt, hlen);
|
|
770 bcf_hdr_parse(h, htxt);
|
|
771 free(htxt);
|
|
772 return h;
|
|
773 }
|
|
774
|
|
775 int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
|
|
776 {
|
|
777 if ( h->dirty ) bcf_hdr_sync(h);
|
|
778 if (hfp->format.format == vcf || hfp->format.format == text_format)
|
|
779 return vcf_hdr_write(hfp, h);
|
|
780
|
|
781 int hlen;
|
|
782 char *htxt = bcf_hdr_fmt_text(h, 1, &hlen);
|
|
783 hlen++; // include the \0 byte
|
|
784
|
|
785 BGZF *fp = hfp->fp.bgzf;
|
|
786 if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
|
|
787 if ( bgzf_write(fp, &hlen, 4) !=4 ) return -1;
|
|
788 if ( bgzf_write(fp, htxt, hlen) != hlen ) return -1;
|
|
789
|
|
790 free(htxt);
|
|
791 return 0;
|
|
792 }
|
|
793
|
|
794 /********************
|
|
795 *** BCF site I/O ***
|
|
796 ********************/
|
|
797
|
|
798 bcf1_t *bcf_init1()
|
|
799 {
|
|
800 bcf1_t *v;
|
|
801 v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
|
|
802 return v;
|
|
803 }
|
|
804
|
|
805 void bcf_clear(bcf1_t *v)
|
|
806 {
|
|
807 int i;
|
|
808 for (i=0; i<v->d.m_info; i++)
|
|
809 {
|
|
810 if ( v->d.info[i].vptr_free )
|
|
811 {
|
|
812 free(v->d.info[i].vptr - v->d.info[i].vptr_off);
|
|
813 v->d.info[i].vptr_free = 0;
|
|
814 }
|
|
815 }
|
|
816 for (i=0; i<v->d.m_fmt; i++)
|
|
817 {
|
|
818 if ( v->d.fmt[i].p_free )
|
|
819 {
|
|
820 free(v->d.fmt[i].p - v->d.fmt[i].p_off);
|
|
821 v->d.fmt[i].p_free = 0;
|
|
822 }
|
|
823 }
|
|
824 v->rid = v->pos = v->rlen = v->unpacked = 0;
|
|
825 bcf_float_set_missing(v->qual);
|
|
826 v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
|
|
827 v->shared.l = v->indiv.l = 0;
|
|
828 v->d.var_type = -1;
|
|
829 v->d.shared_dirty = 0;
|
|
830 v->d.indiv_dirty = 0;
|
|
831 v->d.n_flt = 0;
|
|
832 v->errcode = 0;
|
|
833 if (v->d.m_als) v->d.als[0] = 0;
|
|
834 if (v->d.m_id) v->d.id[0] = 0;
|
|
835 }
|
|
836
|
|
837 void bcf_empty1(bcf1_t *v)
|
|
838 {
|
|
839 bcf_clear1(v);
|
|
840 free(v->d.id);
|
|
841 free(v->d.als);
|
|
842 free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
|
|
843 if (v->d.var ) free(v->d.var);
|
|
844 free(v->shared.s); free(v->indiv.s);
|
|
845 }
|
|
846
|
|
847 void bcf_destroy1(bcf1_t *v)
|
|
848 {
|
|
849 bcf_empty1(v);
|
|
850 free(v);
|
|
851 }
|
|
852
|
|
853 static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
|
|
854 {
|
|
855 uint32_t x[8];
|
|
856 int ret;
|
|
857 if ((ret = bgzf_read(fp, x, 32)) != 32) {
|
|
858 if (ret == 0) return -1;
|
|
859 return -2;
|
|
860 }
|
|
861 bcf_clear1(v);
|
|
862 x[0] -= 24; // to exclude six 32-bit integers
|
|
863 ks_resize(&v->shared, x[0]);
|
|
864 ks_resize(&v->indiv, x[1]);
|
|
865 memcpy(v, x + 2, 16);
|
|
866 v->n_allele = x[6]>>16; v->n_info = x[6]&0xffff;
|
|
867 v->n_fmt = x[7]>>24; v->n_sample = x[7]&0xffffff;
|
|
868 v->shared.l = x[0], v->indiv.l = x[1];
|
|
869
|
|
870 // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
|
|
871 if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
|
|
872
|
|
873 bgzf_read(fp, v->shared.s, v->shared.l);
|
|
874 bgzf_read(fp, v->indiv.s, v->indiv.l);
|
|
875 return 0;
|
|
876 }
|
|
877
|
|
878 #define bit_array_size(n) ((n)/8+1)
|
|
879 #define bit_array_set(a,i) ((a)[(i)/8] |= 1 << ((i)%8))
|
|
880 #define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
|
|
881 #define bit_array_test(a,i) ((a)[(i)/8] & (1 << ((i)%8)))
|
|
882
|
|
883 static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
|
|
884 int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
|
|
885 {
|
|
886 if ( !hdr->keep_samples ) return 0;
|
|
887 if ( !bcf_hdr_nsamples(hdr) )
|
|
888 {
|
|
889 rec->indiv.l = rec->n_sample = 0;
|
|
890 return 0;
|
|
891 }
|
|
892
|
|
893 int i, j;
|
|
894 uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
|
|
895 bcf_dec_t *dec = &rec->d;
|
|
896 hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
|
|
897 for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
|
|
898
|
|
899 for (i=0; i<rec->n_fmt; i++)
|
|
900 {
|
|
901 ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
|
|
902 src = dec->fmt[i].p - dec->fmt[i].size;
|
|
903 if ( dst )
|
|
904 {
|
|
905 memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
|
|
906 dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
|
|
907 }
|
|
908 dst = dec->fmt[i].p;
|
|
909 for (j=0; j<hdr->nsamples_ori; j++)
|
|
910 {
|
|
911 src += dec->fmt[i].size;
|
|
912 if ( !bit_array_test(hdr->keep_samples,j) ) continue;
|
|
913 memmove(dst, src, dec->fmt[i].size);
|
|
914 dst += dec->fmt[i].size;
|
|
915 }
|
|
916 rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
|
|
917 dec->fmt[i].p_len = dst - dec->fmt[i].p;
|
|
918 }
|
|
919 rec->unpacked |= BCF_UN_FMT;
|
|
920
|
|
921 rec->n_sample = bcf_hdr_nsamples(hdr);
|
|
922 return 0;
|
|
923 }
|
|
924
|
|
925 int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
|
|
926 {
|
|
927 if (fp->format.format == vcf) return vcf_read(fp,h,v);
|
|
928 int ret = bcf_read1_core(fp->fp.bgzf, v);
|
|
929 if ( ret!=0 || !h->keep_samples ) return ret;
|
|
930 return bcf_subset_format(h,v);
|
|
931 }
|
|
932
|
|
933 int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, int *beg, int *end)
|
|
934 {
|
|
935 bcf1_t *v = (bcf1_t *) vv;
|
|
936 int ret;
|
|
937 if ((ret = bcf_read1_core(fp, v)) >= 0)
|
|
938 *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
|
|
939 return ret;
|
|
940 }
|
|
941
|
|
942 static inline void bcf1_sync_id(bcf1_t *line, kstring_t *str)
|
|
943 {
|
|
944 // single typed string
|
|
945 if ( line->d.id && strcmp(line->d.id, ".") ) bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
|
|
946 else bcf_enc_size(str, 0, BCF_BT_CHAR);
|
|
947 }
|
|
948 static inline void bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
|
|
949 {
|
|
950 // list of typed strings
|
|
951 int i;
|
|
952 for (i=0; i<line->n_allele; i++)
|
|
953 bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]);
|
|
954 if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
|
|
955 }
|
|
956 static inline void bcf1_sync_filter(bcf1_t *line, kstring_t *str)
|
|
957 {
|
|
958 // typed vector of integers
|
|
959 if ( line->d.n_flt ) bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
|
|
960 else bcf_enc_vint(str, 0, 0, -1);
|
|
961 }
|
|
962
|
|
963 static inline void bcf1_sync_info(bcf1_t *line, kstring_t *str)
|
|
964 {
|
|
965 // pairs of typed vectors
|
|
966 int i, irm = -1;
|
|
967 for (i=0; i<line->n_info; i++)
|
|
968 {
|
|
969 bcf_info_t *info = &line->d.info[i];
|
|
970 if ( !info->vptr )
|
|
971 {
|
|
972 // marked for removal
|
|
973 if ( irm < 0 ) irm = i;
|
|
974 continue;
|
|
975 }
|
|
976 kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str);
|
|
977 if ( irm >=0 )
|
|
978 {
|
|
979 bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
|
|
980 while ( irm<=i && line->d.info[irm].vptr ) irm++;
|
|
981 }
|
|
982 }
|
|
983 if ( irm>=0 ) line->n_info = irm;
|
|
984 }
|
|
985
|
|
986 static int bcf1_sync(bcf1_t *line)
|
|
987 {
|
|
988 char *shared_ori = line->shared.s;
|
|
989 size_t prev_len;
|
|
990
|
|
991 kstring_t tmp = {0,0,0};
|
|
992 if ( !line->shared.l )
|
|
993 {
|
|
994 // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
|
|
995 tmp = line->shared;
|
|
996 bcf1_sync_id(line, &tmp);
|
|
997 line->unpack_size[0] = tmp.l; prev_len = tmp.l;
|
|
998
|
|
999 bcf1_sync_alleles(line, &tmp);
|
|
1000 line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
|
|
1001
|
|
1002 bcf1_sync_filter(line, &tmp);
|
|
1003 line->unpack_size[2] = tmp.l - prev_len;
|
|
1004
|
|
1005 bcf1_sync_info(line, &tmp);
|
|
1006 line->shared = tmp;
|
|
1007 }
|
|
1008 else if ( line->d.shared_dirty )
|
|
1009 {
|
|
1010 // The line was edited, update the BCF data block, ptr_ori points
|
|
1011 // to the original unchanged BCF data.
|
|
1012 uint8_t *ptr_ori = (uint8_t *) line->shared.s;
|
|
1013
|
|
1014 assert( line->unpacked & BCF_UN_STR );
|
|
1015
|
|
1016 // ID: single typed string
|
|
1017 if ( line->d.shared_dirty & BCF1_DIRTY_ID )
|
|
1018 bcf1_sync_id(line, &tmp);
|
|
1019 else
|
|
1020 kputsn_(ptr_ori, line->unpack_size[0], &tmp);
|
|
1021 ptr_ori += line->unpack_size[0];
|
|
1022 line->unpack_size[0] = tmp.l; prev_len = tmp.l;
|
|
1023
|
|
1024 // REF+ALT: list of typed strings
|
|
1025 if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
|
|
1026 bcf1_sync_alleles(line, &tmp);
|
|
1027 else
|
|
1028 {
|
|
1029 kputsn_(ptr_ori, line->unpack_size[1], &tmp);
|
|
1030 if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
|
|
1031 }
|
|
1032 ptr_ori += line->unpack_size[1];
|
|
1033 line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
|
|
1034
|
|
1035 if ( line->unpacked & BCF_UN_FLT )
|
|
1036 {
|
|
1037 // FILTER: typed vector of integers
|
|
1038 if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
|
|
1039 bcf1_sync_filter(line, &tmp);
|
|
1040 else if ( line->d.n_flt )
|
|
1041 kputsn_(ptr_ori, line->unpack_size[2], &tmp);
|
|
1042 else
|
|
1043 bcf_enc_vint(&tmp, 0, 0, -1);
|
|
1044 ptr_ori += line->unpack_size[2];
|
|
1045 line->unpack_size[2] = tmp.l - prev_len;
|
|
1046
|
|
1047 if ( line->unpacked & BCF_UN_INFO )
|
|
1048 {
|
|
1049 // INFO: pairs of typed vectors
|
|
1050 if ( line->d.shared_dirty & BCF1_DIRTY_INF )
|
|
1051 {
|
|
1052 bcf1_sync_info(line, &tmp);
|
|
1053 ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
|
|
1054 }
|
|
1055 }
|
|
1056 }
|
|
1057
|
|
1058 int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
|
|
1059 if ( size ) kputsn_(ptr_ori, size, &tmp);
|
|
1060
|
|
1061 free(line->shared.s);
|
|
1062 line->shared = tmp;
|
|
1063 }
|
|
1064 if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
|
|
1065 {
|
|
1066 // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
|
|
1067 size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
|
|
1068 int i;
|
|
1069 for (i=0; i<line->n_info; i++)
|
|
1070 {
|
|
1071 uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
|
|
1072 line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
|
|
1073 off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
|
|
1074 if ( vptr_free )
|
|
1075 {
|
|
1076 free(vptr_free);
|
|
1077 line->d.info[i].vptr_free = 0;
|
|
1078 }
|
|
1079 }
|
|
1080 }
|
|
1081
|
|
1082 if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
|
|
1083 {
|
|
1084 // The genotype fields changed or are not present
|
|
1085 tmp.l = tmp.m = 0; tmp.s = NULL;
|
|
1086 int i, irm = -1;
|
|
1087 for (i=0; i<line->n_fmt; i++)
|
|
1088 {
|
|
1089 bcf_fmt_t *fmt = &line->d.fmt[i];
|
|
1090 if ( !fmt->p )
|
|
1091 {
|
|
1092 // marked for removal
|
|
1093 if ( irm < 0 ) irm = i;
|
|
1094 continue;
|
|
1095 }
|
|
1096 kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
|
|
1097 if ( irm >=0 )
|
|
1098 {
|
|
1099 bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
|
|
1100 while ( irm<=i && line->d.fmt[irm].p ) irm++;
|
|
1101 }
|
|
1102
|
|
1103 }
|
|
1104 if ( irm>=0 ) line->n_fmt = irm;
|
|
1105 free(line->indiv.s);
|
|
1106 line->indiv = tmp;
|
|
1107
|
|
1108 // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
|
|
1109 size_t off_new = 0;
|
|
1110 for (i=0; i<line->n_fmt; i++)
|
|
1111 {
|
|
1112 uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
|
|
1113 line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
|
|
1114 off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
|
|
1115 if ( p_free )
|
|
1116 {
|
|
1117 free(p_free);
|
|
1118 line->d.fmt[i].p_free = 0;
|
|
1119 }
|
|
1120 }
|
|
1121 }
|
|
1122 if ( !line->n_sample ) line->n_fmt = 0;
|
|
1123 line->d.shared_dirty = line->d.indiv_dirty = 0;
|
|
1124 return 0;
|
|
1125 }
|
|
1126
|
|
1127 bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
|
|
1128 {
|
|
1129 bcf1_sync(src);
|
|
1130
|
|
1131 bcf_clear(dst);
|
|
1132 dst->rid = src->rid;
|
|
1133 dst->pos = src->pos;
|
|
1134 dst->rlen = src->rlen;
|
|
1135 dst->qual = src->qual;
|
|
1136 dst->n_info = src->n_info; dst->n_allele = src->n_allele;
|
|
1137 dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
|
|
1138
|
|
1139 dst->shared.m = dst->shared.l = src->shared.l;
|
|
1140 dst->shared.s = (char*) malloc(dst->shared.l);
|
|
1141 memcpy(dst->shared.s,src->shared.s,dst->shared.l);
|
|
1142
|
|
1143 dst->indiv.m = dst->indiv.l = src->indiv.l;
|
|
1144 dst->indiv.s = (char*) malloc(dst->indiv.l);
|
|
1145 memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
|
|
1146
|
|
1147 return dst;
|
|
1148 }
|
|
1149 bcf1_t *bcf_dup(bcf1_t *src)
|
|
1150 {
|
|
1151 bcf1_t *out = bcf_init1();
|
|
1152 return bcf_copy(out, src);
|
|
1153 }
|
|
1154
|
|
1155 int bcf_write(htsFile *hfp, const bcf_hdr_t *h, bcf1_t *v)
|
|
1156 {
|
|
1157 if ( h->dirty )
|
|
1158 {
|
|
1159 // we could as well call bcf_hdr_sync here, not sure
|
|
1160 fprintf(stderr,"FIXME: dirty header not synced\n");
|
|
1161 exit(1);
|
|
1162 }
|
|
1163 if ( bcf_hdr_nsamples(h)!=v->n_sample )
|
|
1164 {
|
|
1165 fprintf(stderr,"[%s:%d %s] Broken VCF record, the number of columns at %s:%d does not match the number of samples (%d vs %d).\n",
|
|
1166 __FILE__,__LINE__,__FUNCTION__,bcf_seqname(h,v),v->pos+1, v->n_sample,bcf_hdr_nsamples(h));
|
|
1167 return -1;
|
|
1168 }
|
|
1169
|
|
1170 if ( hfp->format.format == vcf || hfp->format.format == text_format )
|
|
1171 return vcf_write(hfp,h,v);
|
|
1172
|
|
1173 if ( v->errcode )
|
|
1174 {
|
|
1175 // vcf_parse1() encountered a new contig or tag, undeclared in the
|
|
1176 // header. At this point, the header must have been printed,
|
|
1177 // proceeding would lead to a broken BCF file. Errors must be checked
|
|
1178 // and cleared by the caller before we can proceed.
|
|
1179 fprintf(stderr,"[%s:%d %s] Unchecked error (%d), exiting.\n", __FILE__,__LINE__,__FUNCTION__,v->errcode);
|
|
1180 exit(1);
|
|
1181 }
|
|
1182 bcf1_sync(v); // check if the BCF record was modified
|
|
1183
|
|
1184 BGZF *fp = hfp->fp.bgzf;
|
|
1185 uint32_t x[8];
|
|
1186 x[0] = v->shared.l + 24; // to include six 32-bit integers
|
|
1187 x[1] = v->indiv.l;
|
|
1188 memcpy(x + 2, v, 16);
|
|
1189 x[6] = (uint32_t)v->n_allele<<16 | v->n_info;
|
|
1190 x[7] = (uint32_t)v->n_fmt<<24 | v->n_sample;
|
|
1191 if ( bgzf_write(fp, x, 32) != 32 ) return -1;
|
|
1192 if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
|
|
1193 if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
|
|
1194 return 0;
|
|
1195 }
|
|
1196
|
|
1197 /**********************
|
|
1198 *** VCF header I/O ***
|
|
1199 **********************/
|
|
1200
|
|
1201 bcf_hdr_t *vcf_hdr_read(htsFile *fp)
|
|
1202 {
|
|
1203 kstring_t txt, *s = &fp->line;
|
|
1204 bcf_hdr_t *h;
|
|
1205 h = bcf_hdr_init("r");
|
|
1206 txt.l = txt.m = 0; txt.s = 0;
|
|
1207 while (hts_getline(fp, KS_SEP_LINE, s) >= 0) {
|
|
1208 if (s->l == 0) continue;
|
|
1209 if (s->s[0] != '#') {
|
|
1210 if (hts_verbose >= 2)
|
|
1211 fprintf(stderr, "[E::%s] no sample line\n", __func__);
|
|
1212 free(txt.s);
|
|
1213 bcf_hdr_destroy(h);
|
|
1214 return 0;
|
|
1215 }
|
|
1216 if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
|
|
1217 int dret;
|
|
1218 gzFile f;
|
|
1219 kstream_t *ks;
|
|
1220 kstring_t tmp;
|
|
1221 tmp.l = tmp.m = 0; tmp.s = 0;
|
|
1222 f = gzopen(fp->fn_aux, "r");
|
|
1223 ks = ks_init(f);
|
|
1224 while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
|
|
1225 int c;
|
|
1226 kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt);
|
|
1227 ks_getuntil(ks, 0, &tmp, &dret);
|
|
1228 kputs(",length=", &txt); kputw(atol(tmp.s), &txt);
|
|
1229 kputsn(">\n", 2, &txt);
|
|
1230 if (dret != '\n')
|
|
1231 while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line
|
|
1232 }
|
|
1233 free(tmp.s);
|
|
1234 ks_destroy(ks);
|
|
1235 gzclose(f);
|
|
1236 }
|
|
1237 kputsn(s->s, s->l, &txt);
|
|
1238 kputc('\n', &txt);
|
|
1239 if (s->s[1] != '#') break;
|
|
1240 }
|
|
1241 if ( !txt.s )
|
|
1242 {
|
|
1243 fprintf(stderr,"[%s:%d %s] Could not read the header\n", __FILE__,__LINE__,__FUNCTION__);
|
|
1244 return NULL;
|
|
1245 }
|
|
1246 bcf_hdr_parse(h, txt.s);
|
|
1247
|
|
1248 // check tabix index, are all contigs listed in the header? add the missing ones
|
|
1249 tbx_t *idx = tbx_index_load(fp->fn);
|
|
1250 if ( idx )
|
|
1251 {
|
|
1252 int i, n, need_sync = 0;
|
|
1253 const char **names = tbx_seqnames(idx, &n);
|
|
1254 for (i=0; i<n; i++)
|
|
1255 {
|
|
1256 bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
|
|
1257 if ( hrec ) continue;
|
|
1258 hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
|
|
1259 hrec->key = strdup("contig");
|
|
1260 bcf_hrec_add_key(hrec, "ID", strlen("ID"));
|
|
1261 bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
|
|
1262 bcf_hdr_add_hrec(h, hrec);
|
|
1263 need_sync = 1;
|
|
1264 }
|
|
1265 free(names);
|
|
1266 tbx_destroy(idx);
|
|
1267 if ( need_sync )
|
|
1268 bcf_hdr_sync(h);
|
|
1269 }
|
|
1270 free(txt.s);
|
|
1271 return h;
|
|
1272 }
|
|
1273
|
|
1274 int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
|
|
1275 {
|
|
1276 int i, n;
|
|
1277 char **lines = hts_readlines(fname, &n);
|
|
1278 if ( !lines ) return 1;
|
|
1279 for (i=0; i<n-1; i++)
|
|
1280 {
|
|
1281 int k;
|
|
1282 bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
|
|
1283 if ( hrec ) bcf_hdr_add_hrec(hdr, hrec);
|
|
1284 free(lines[i]);
|
|
1285 }
|
|
1286 bcf_hdr_parse_sample_line(hdr,lines[n-1]);
|
|
1287 free(lines[n-1]);
|
|
1288 free(lines);
|
|
1289 bcf_hdr_sync(hdr);
|
|
1290 return 0;
|
|
1291 }
|
|
1292
|
|
1293 static void _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
|
|
1294 {
|
|
1295 if ( !hrec->value )
|
|
1296 {
|
|
1297 int j, nout = 0;
|
|
1298 ksprintf(str, "##%s=<", hrec->key);
|
|
1299 for (j=0; j<hrec->nkeys; j++)
|
|
1300 {
|
|
1301 // do not output IDX if output is VCF
|
|
1302 if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
|
|
1303 if ( nout ) kputc(',',str);
|
|
1304 ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]);
|
|
1305 nout++;
|
|
1306 }
|
|
1307 ksprintf(str,">\n");
|
|
1308 }
|
|
1309 else
|
|
1310 ksprintf(str,"##%s=%s\n", hrec->key,hrec->value);
|
|
1311 }
|
|
1312
|
|
1313 void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
|
|
1314 {
|
|
1315 _bcf_hrec_format(hrec,0,str);
|
|
1316 }
|
|
1317 char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
|
|
1318 {
|
|
1319 int i;
|
|
1320 kstring_t txt = {0,0,0};
|
|
1321 for (i=0; i<hdr->nhrec; i++)
|
|
1322 _bcf_hrec_format(hdr->hrec[i], is_bcf, &txt);
|
|
1323
|
|
1324 ksprintf(&txt,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO");
|
|
1325 if ( bcf_hdr_nsamples(hdr) )
|
|
1326 {
|
|
1327 ksprintf(&txt,"\tFORMAT");
|
|
1328 for (i=0; i<bcf_hdr_nsamples(hdr); i++)
|
|
1329 ksprintf(&txt,"\t%s", hdr->samples[i]);
|
|
1330 }
|
|
1331 ksprintf(&txt,"\n");
|
|
1332
|
|
1333 if ( len ) *len = txt.l;
|
|
1334 return txt.s;
|
|
1335 }
|
|
1336
|
|
1337 const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
|
|
1338 {
|
|
1339 vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
|
|
1340 int tid, m = kh_size(d);
|
|
1341 const char **names = (const char**) calloc(m,sizeof(const char*));
|
|
1342 khint_t k;
|
|
1343 for (k=kh_begin(d); k<kh_end(d); k++)
|
|
1344 {
|
|
1345 if ( !kh_exist(d,k) ) continue;
|
|
1346 tid = kh_val(d,k).id;
|
|
1347 assert( tid<m );
|
|
1348 names[tid] = kh_key(d,k);
|
|
1349 }
|
|
1350 // sanity check: there should be no gaps
|
|
1351 for (tid=0; tid<m; tid++)
|
|
1352 assert(names[tid]);
|
|
1353 *n = m;
|
|
1354 return names;
|
|
1355 }
|
|
1356
|
|
1357 int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
|
|
1358 {
|
|
1359 int hlen;
|
|
1360 char *htxt = bcf_hdr_fmt_text(h, 0, &hlen);
|
|
1361 while (hlen && htxt[hlen-1] == 0) --hlen; // kill trailing zeros
|
|
1362 int ret;
|
|
1363 if ( fp->format.compression!=no_compression )
|
|
1364 ret = bgzf_write(fp->fp.bgzf, htxt, hlen);
|
|
1365 else
|
|
1366 ret = hwrite(fp->fp.hfile, htxt, hlen);
|
|
1367 free(htxt);
|
|
1368 return ret<0 ? -1 : 0;
|
|
1369 }
|
|
1370
|
|
1371 /***********************
|
|
1372 *** Typed value I/O ***
|
|
1373 ***********************/
|
|
1374
|
|
1375 void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
|
|
1376 {
|
|
1377 int32_t max = INT32_MIN + 1, min = INT32_MAX;
|
|
1378 int i;
|
|
1379 if (n == 0) bcf_enc_size(s, 0, BCF_BT_NULL);
|
|
1380 else if (n == 1) bcf_enc_int1(s, a[0]);
|
|
1381 else {
|
|
1382 if (wsize <= 0) wsize = n;
|
|
1383 for (i = 0; i < n; ++i) {
|
|
1384 if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) continue;
|
|
1385 if (max < a[i]) max = a[i];
|
|
1386 if (min > a[i]) min = a[i];
|
|
1387 }
|
|
1388 if (max <= INT8_MAX && min > bcf_int8_vector_end) {
|
|
1389 bcf_enc_size(s, wsize, BCF_BT_INT8);
|
|
1390 for (i = 0; i < n; ++i)
|
|
1391 if ( a[i]==bcf_int32_vector_end ) kputc(bcf_int8_vector_end, s);
|
|
1392 else if ( a[i]==bcf_int32_missing ) kputc(bcf_int8_missing, s);
|
|
1393 else kputc(a[i], s);
|
|
1394 } else if (max <= INT16_MAX && min > bcf_int16_vector_end) {
|
|
1395 bcf_enc_size(s, wsize, BCF_BT_INT16);
|
|
1396 for (i = 0; i < n; ++i)
|
|
1397 {
|
|
1398 int16_t x;
|
|
1399 if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
|
|
1400 else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
|
|
1401 else x = a[i];
|
|
1402 kputsn((char*)&x, 2, s);
|
|
1403 }
|
|
1404 } else {
|
|
1405 bcf_enc_size(s, wsize, BCF_BT_INT32);
|
|
1406 for (i = 0; i < n; ++i) {
|
|
1407 int32_t x = a[i];
|
|
1408 kputsn((char*)&x, 4, s);
|
|
1409 }
|
|
1410 }
|
|
1411 }
|
|
1412 }
|
|
1413
|
|
1414 void bcf_enc_vfloat(kstring_t *s, int n, float *a)
|
|
1415 {
|
|
1416 bcf_enc_size(s, n, BCF_BT_FLOAT);
|
|
1417 kputsn((char*)a, n << 2, s);
|
|
1418 }
|
|
1419
|
|
1420 void bcf_enc_vchar(kstring_t *s, int l, const char *a)
|
|
1421 {
|
|
1422 bcf_enc_size(s, l, BCF_BT_CHAR);
|
|
1423 kputsn(a, l, s);
|
|
1424 }
|
|
1425
|
|
1426 void bcf_fmt_array(kstring_t *s, int n, int type, void *data)
|
|
1427 {
|
|
1428 int j = 0;
|
|
1429 if (n == 0) {
|
|
1430 kputc('.', s);
|
|
1431 return;
|
|
1432 }
|
|
1433 if (type == BCF_BT_CHAR)
|
|
1434 {
|
|
1435 char *p = (char*)data;
|
|
1436 for (j = 0; j < n && *p; ++j, ++p)
|
|
1437 {
|
|
1438 if ( *p==bcf_str_missing ) kputc('.', s);
|
|
1439 else kputc(*p, s);
|
|
1440 }
|
|
1441 }
|
|
1442 else
|
|
1443 {
|
|
1444 #define BRANCH(type_t, is_missing, is_vector_end, kprint) { \
|
|
1445 type_t *p = (type_t *) data; \
|
|
1446 for (j=0; j<n; j++) \
|
|
1447 { \
|
|
1448 if ( is_vector_end ) break; \
|
|
1449 if ( j ) kputc(',', s); \
|
|
1450 if ( is_missing ) kputc('.', s); \
|
|
1451 else kprint; \
|
|
1452 } \
|
|
1453 }
|
|
1454 switch (type) {
|
|
1455 case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, kputw(p[j], s)); break;
|
|
1456 case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, kputw(p[j], s)); break;
|
|
1457 case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, kputw(p[j], s)); break;
|
|
1458 case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), ksprintf(s, "%g", p[j])); break;
|
|
1459 default: fprintf(stderr,"todo: type %d\n", type); exit(1); break;
|
|
1460 }
|
|
1461 #undef BRANCH
|
|
1462 }
|
|
1463 }
|
|
1464
|
|
1465 uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
|
|
1466 {
|
|
1467 int x, type;
|
|
1468 x = bcf_dec_size(ptr, &ptr, &type);
|
|
1469 bcf_fmt_array(s, x, type, ptr);
|
|
1470 return ptr + (x << bcf_type_shift[type]);
|
|
1471 }
|
|
1472
|
|
1473 /********************
|
|
1474 *** VCF site I/O ***
|
|
1475 ********************/
|
|
1476
|
|
1477 typedef struct {
|
|
1478 int key, max_m, size, offset;
|
|
1479 uint32_t is_gt:1, max_g:15, max_l:16;
|
|
1480 uint32_t y;
|
|
1481 uint8_t *buf;
|
|
1482 } fmt_aux_t;
|
|
1483
|
|
1484 static inline void align_mem(kstring_t *s)
|
|
1485 {
|
|
1486 if (s->l&7) {
|
|
1487 uint64_t zero = 0;
|
|
1488 int l = ((s->l + 7)>>3<<3) - s->l;
|
|
1489 kputsn((char*)&zero, l, s);
|
|
1490 }
|
|
1491 }
|
|
1492
|
|
1493 // p,q is the start and the end of the FORMAT field
|
|
1494 int _vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q)
|
|
1495 {
|
|
1496 if ( !bcf_hdr_nsamples(h) ) return 0;
|
|
1497
|
|
1498 char *r, *t;
|
|
1499 int j, l, m, g;
|
|
1500 khint_t k;
|
|
1501 ks_tokaux_t aux1;
|
|
1502 vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
|
|
1503 kstring_t *mem = (kstring_t*)&h->mem;
|
|
1504 mem->l = 0;
|
|
1505
|
|
1506 // count the number of format fields
|
|
1507 for (r = p, v->n_fmt = 1; *r; ++r)
|
|
1508 if (*r == ':') ++v->n_fmt;
|
|
1509 char *end = s->s + s->l;
|
|
1510 if ( q>=end )
|
|
1511 {
|
|
1512 fprintf(stderr,"[%s:%d %s] Error: FORMAT column with no sample columns starting at %s:%d\n", __FILE__,__LINE__,__FUNCTION__,s->s,v->pos+1);
|
|
1513 return -1;
|
|
1514 }
|
|
1515
|
|
1516 fmt_aux_t *fmt = (fmt_aux_t*)alloca(v->n_fmt * sizeof(fmt_aux_t));
|
|
1517 // get format information from the dictionary
|
|
1518 for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
|
|
1519 *(char*)aux1.p = 0;
|
|
1520 k = kh_get(vdict, d, t);
|
|
1521 if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
|
|
1522 fprintf(stderr, "[W::%s] FORMAT '%s' is not defined in the header, assuming Type=String\n", __func__, t);
|
|
1523 kstring_t tmp = {0,0,0};
|
|
1524 int l;
|
|
1525 ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
|
|
1526 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
|
|
1527 free(tmp.s);
|
|
1528 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
|
|
1529 k = kh_get(vdict, d, t);
|
|
1530 v->errcode = BCF_ERR_TAG_UNDEF;
|
|
1531 }
|
|
1532 fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
|
|
1533 fmt[j].key = kh_val(d, k).id;
|
|
1534 fmt[j].is_gt = !strcmp(t, "GT");
|
|
1535 fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
|
|
1536 }
|
|
1537 // compute max
|
|
1538 int n_sample_ori = -1;
|
|
1539 r = q + 1; // r: position in the format string
|
|
1540 m = l = g = 1, v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles
|
|
1541 while ( r<end )
|
|
1542 {
|
|
1543 // can we skip some samples?
|
|
1544 if ( h->keep_samples )
|
|
1545 {
|
|
1546 n_sample_ori++;
|
|
1547 if ( !bit_array_test(h->keep_samples,n_sample_ori) )
|
|
1548 {
|
|
1549 while ( *r!='\t' && r<end ) r++;
|
|
1550 if ( *r=='\t' ) { *r = 0; r++; }
|
|
1551 continue;
|
|
1552 }
|
|
1553 }
|
|
1554
|
|
1555 // collect fmt stats: max vector size, length, number of alleles
|
|
1556 j = 0; // j-th format field
|
|
1557 for (;;)
|
|
1558 {
|
|
1559 if ( *r == '\t' ) *r = 0;
|
|
1560 if ( *r == ':' || !*r ) // end of field or end of sample
|
|
1561 {
|
|
1562 if (fmt[j].max_m < m) fmt[j].max_m = m;
|
|
1563 if (fmt[j].max_l < l - 1) fmt[j].max_l = l - 1;
|
|
1564 if (fmt[j].is_gt && fmt[j].max_g < g) fmt[j].max_g = g;
|
|
1565 l = 0, m = g = 1;
|
|
1566 if ( *r==':' )
|
|
1567 {
|
|
1568 j++;
|
|
1569 if ( j>=v->n_fmt )
|
|
1570 {
|
|
1571 fprintf(stderr,"Incorrect number of FORMAT fields at %s:%d\n", h->id[BCF_DT_CTG][v->rid].key,v->pos+1);
|
|
1572 exit(1);
|
|
1573 }
|
|
1574 }
|
|
1575 else break;
|
|
1576 }
|
|
1577 else if ( *r== ',' ) m++;
|
|
1578 else if ( fmt[j].is_gt && (*r == '|' || *r == '/') ) g++;
|
|
1579 if ( r>=end ) break;
|
|
1580 r++; l++;
|
|
1581 }
|
|
1582 v->n_sample++;
|
|
1583 if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
|
|
1584 r++;
|
|
1585 }
|
|
1586
|
|
1587 // allocate memory for arrays
|
|
1588 for (j = 0; j < v->n_fmt; ++j) {
|
|
1589 fmt_aux_t *f = &fmt[j];
|
|
1590 if ( !f->max_m ) f->max_m = 1; // omitted trailing format field
|
|
1591 if ((f->y>>4&0xf) == BCF_HT_STR) {
|
|
1592 f->size = f->is_gt? f->max_g << 2 : f->max_l;
|
|
1593 } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
|
|
1594 f->size = f->max_m << 2;
|
|
1595 } else
|
|
1596 {
|
|
1597 fprintf(stderr, "[E::%s] the format type %d currently not supported\n", __func__, f->y>>4&0xf);
|
|
1598 abort(); // I do not know how to do with Flag in the genotype fields
|
|
1599 }
|
|
1600 align_mem(mem);
|
|
1601 f->offset = mem->l;
|
|
1602 ks_resize(mem, mem->l + v->n_sample * f->size);
|
|
1603 mem->l += v->n_sample * f->size;
|
|
1604 }
|
|
1605 for (j = 0; j < v->n_fmt; ++j)
|
|
1606 fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
|
|
1607 // fill the sample fields; at beginning of the loop, t points to the first char of a format
|
|
1608 n_sample_ori = -1;
|
|
1609 t = q + 1; m = 0; // m: sample id
|
|
1610 while ( t<end )
|
|
1611 {
|
|
1612 // can we skip some samples?
|
|
1613 if ( h->keep_samples )
|
|
1614 {
|
|
1615 n_sample_ori++;
|
|
1616 if ( !bit_array_test(h->keep_samples,n_sample_ori) )
|
|
1617 {
|
|
1618 while ( *t && t<end ) t++;
|
|
1619 t++;
|
|
1620 continue;
|
|
1621 }
|
|
1622 }
|
|
1623 if ( m == bcf_hdr_nsamples(h) ) break;
|
|
1624
|
|
1625 j = 0; // j-th format field, m-th sample
|
|
1626 while ( *t )
|
|
1627 {
|
|
1628 fmt_aux_t *z = &fmt[j];
|
|
1629 if ((z->y>>4&0xf) == BCF_HT_STR) {
|
|
1630 if (z->is_gt) { // genotypes
|
|
1631 int32_t is_phased = 0, *x = (int32_t*)(z->buf + z->size * m);
|
|
1632 for (l = 0;; ++t) {
|
|
1633 if (*t == '.') ++t, x[l++] = is_phased;
|
|
1634 else x[l++] = (strtol(t, &t, 10) + 1) << 1 | is_phased;
|
|
1635 #if THOROUGH_SANITY_CHECKS
|
|
1636 assert( 0 ); // success of strtol,strtod not checked
|
|
1637 #endif
|
|
1638 is_phased = (*t == '|');
|
|
1639 if (*t == ':' || *t == 0) break;
|
|
1640 }
|
|
1641 if ( !l ) x[l++] = 0; // An empty field, insert missing value
|
|
1642 for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
|
|
1643 } else {
|
|
1644 char *x = (char*)z->buf + z->size * m;
|
|
1645 for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t;
|
|
1646 for (; l < z->size; ++l) x[l] = 0;
|
|
1647 }
|
|
1648 } else if ((z->y>>4&0xf) == BCF_HT_INT) {
|
|
1649 int32_t *x = (int32_t*)(z->buf + z->size * m);
|
|
1650 for (l = 0;; ++t) {
|
|
1651 if (*t == '.') x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
|
|
1652 else x[l++] = strtol(t, &t, 10);
|
|
1653 if (*t == ':' || *t == 0) break;
|
|
1654 }
|
|
1655 if ( !l ) x[l++] = bcf_int32_missing;
|
|
1656 for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
|
|
1657 } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
|
|
1658 float *x = (float*)(z->buf + z->size * m);
|
|
1659 for (l = 0;; ++t) {
|
|
1660 if (*t == '.' && !isdigit(t[1])) bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
|
|
1661 else x[l++] = strtod(t, &t);
|
|
1662 if (*t == ':' || *t == 0) break;
|
|
1663 }
|
|
1664 if ( !l ) bcf_float_set_missing(x[l++]); // An empty field, insert missing value
|
|
1665 for (; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
|
|
1666 } else abort();
|
|
1667 if (*t == 0) {
|
|
1668 for (++j; j < v->n_fmt; ++j) { // fill end-of-vector values
|
|
1669 z = &fmt[j];
|
|
1670 if ((z->y>>4&0xf) == BCF_HT_STR) {
|
|
1671 if (z->is_gt) {
|
|
1672 int32_t *x = (int32_t*)(z->buf + z->size * m);
|
|
1673 x[0] = bcf_int32_missing;
|
|
1674 for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
|
|
1675 } else {
|
|
1676 char *x = (char*)z->buf + z->size * m;
|
|
1677 if ( z->size ) x[0] = '.';
|
|
1678 for (l = 1; l < z->size; ++l) x[l] = 0;
|
|
1679 }
|
|
1680 } else if ((z->y>>4&0xf) == BCF_HT_INT) {
|
|
1681 int32_t *x = (int32_t*)(z->buf + z->size * m);
|
|
1682 x[0] = bcf_int32_missing;
|
|
1683 for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
|
|
1684 } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
|
|
1685 float *x = (float*)(z->buf + z->size * m);
|
|
1686 bcf_float_set_missing(x[0]);
|
|
1687 for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
|
|
1688 }
|
|
1689 }
|
|
1690 break;
|
|
1691 }
|
|
1692 else
|
|
1693 {
|
|
1694 if (*t == ':') ++j;
|
|
1695 t++;
|
|
1696 }
|
|
1697 }
|
|
1698 m++; t++;
|
|
1699 }
|
|
1700
|
|
1701 // write individual genotype information
|
|
1702 kstring_t *str = &v->indiv;
|
|
1703 int i;
|
|
1704 if (v->n_sample > 0) {
|
|
1705 for (i = 0; i < v->n_fmt; ++i) {
|
|
1706 fmt_aux_t *z = &fmt[i];
|
|
1707 bcf_enc_int1(str, z->key);
|
|
1708 if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
|
|
1709 bcf_enc_size(str, z->size, BCF_BT_CHAR);
|
|
1710 kputsn((char*)z->buf, z->size * v->n_sample, str);
|
|
1711 } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
|
|
1712 bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
|
|
1713 } else {
|
|
1714 bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
|
|
1715 kputsn((char*)z->buf, z->size * v->n_sample, str);
|
|
1716 }
|
|
1717 }
|
|
1718 }
|
|
1719
|
|
1720 if ( v->n_sample!=bcf_hdr_nsamples(h) )
|
|
1721 {
|
|
1722 fprintf(stderr,"[%s:%d %s] Number of columns at %s:%d does not match the number of samples (%d vs %d).\n",
|
|
1723 __FILE__,__LINE__,__FUNCTION__,bcf_seqname(h,v),v->pos+1, v->n_sample,bcf_hdr_nsamples(h));
|
|
1724 v->errcode |= BCF_ERR_NCOLS;
|
|
1725 return -1;
|
|
1726 }
|
|
1727
|
|
1728 return 0;
|
|
1729 }
|
|
1730
|
|
1731 int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
|
|
1732 {
|
|
1733 int i = 0;
|
|
1734 char *p, *q, *r, *t;
|
|
1735 kstring_t *str;
|
|
1736 khint_t k;
|
|
1737 ks_tokaux_t aux;
|
|
1738
|
|
1739 bcf_clear1(v);
|
|
1740 str = &v->shared;
|
|
1741 memset(&aux, 0, sizeof(ks_tokaux_t));
|
|
1742 for (p = kstrtok(s->s, "\t", &aux), i = 0; p; p = kstrtok(0, 0, &aux), ++i) {
|
|
1743 q = (char*)aux.p;
|
|
1744 *q = 0;
|
|
1745 if (i == 0) { // CHROM
|
|
1746 vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
|
|
1747 k = kh_get(vdict, d, p);
|
|
1748 if (k == kh_end(d))
|
|
1749 {
|
|
1750 // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
|
|
1751 // been already printed, but will enable tools like vcfcheck to proceed.
|
|
1752 fprintf(stderr, "[W::%s] contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)\n", __func__, p);
|
|
1753 kstring_t tmp = {0,0,0};
|
|
1754 int l;
|
|
1755 ksprintf(&tmp, "##contig=<ID=%s>", p);
|
|
1756 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
|
|
1757 free(tmp.s);
|
|
1758 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
|
|
1759 k = kh_get(vdict, d, p);
|
|
1760 v->errcode = BCF_ERR_CTG_UNDEF;
|
|
1761 }
|
|
1762 v->rid = kh_val(d, k).id;
|
|
1763 } else if (i == 1) { // POS
|
|
1764 v->pos = atoi(p) - 1;
|
|
1765 } else if (i == 2) { // ID
|
|
1766 if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p);
|
|
1767 else bcf_enc_size(str, 0, BCF_BT_CHAR);
|
|
1768 } else if (i == 3) { // REF
|
|
1769 bcf_enc_vchar(str, q - p, p);
|
|
1770 v->n_allele = 1, v->rlen = q - p;
|
|
1771 } else if (i == 4) { // ALT
|
|
1772 if (strcmp(p, ".")) {
|
|
1773 for (r = t = p;; ++r) {
|
|
1774 if (*r == ',' || *r == 0) {
|
|
1775 bcf_enc_vchar(str, r - t, t);
|
|
1776 t = r + 1;
|
|
1777 ++v->n_allele;
|
|
1778 }
|
|
1779 if (r == q) break;
|
|
1780 }
|
|
1781 }
|
|
1782 } else if (i == 5) { // QUAL
|
|
1783 if (strcmp(p, ".")) v->qual = atof(p);
|
|
1784 else memcpy(&v->qual, &bcf_float_missing, 4);
|
|
1785 if ( v->max_unpack && !(v->max_unpack>>1) ) return 0; // BCF_UN_STR
|
|
1786 } else if (i == 6) { // FILTER
|
|
1787 if (strcmp(p, ".")) {
|
|
1788 int32_t *a;
|
|
1789 int n_flt = 1, i;
|
|
1790 ks_tokaux_t aux1;
|
|
1791 vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
|
|
1792 // count the number of filters
|
|
1793 if (*(q-1) == ';') *(q-1) = 0;
|
|
1794 for (r = p; *r; ++r)
|
|
1795 if (*r == ';') ++n_flt;
|
|
1796 a = (int32_t*)alloca(n_flt * sizeof(int32_t));
|
|
1797 // add filters
|
|
1798 for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
|
|
1799 *(char*)aux1.p = 0;
|
|
1800 k = kh_get(vdict, d, t);
|
|
1801 if (k == kh_end(d))
|
|
1802 {
|
|
1803 // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
|
|
1804 // been already printed, but will enable tools like vcfcheck to proceed.
|
|
1805 fprintf(stderr, "[W::%s] FILTER '%s' is not defined in the header\n", __func__, t);
|
|
1806 kstring_t tmp = {0,0,0};
|
|
1807 int l;
|
|
1808 ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
|
|
1809 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
|
|
1810 free(tmp.s);
|
|
1811 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
|
|
1812 k = kh_get(vdict, d, t);
|
|
1813 v->errcode = BCF_ERR_TAG_UNDEF;
|
|
1814 }
|
|
1815 a[i++] = kh_val(d, k).id;
|
|
1816 }
|
|
1817 n_flt = i;
|
|
1818 bcf_enc_vint(str, n_flt, a, -1);
|
|
1819 } else bcf_enc_vint(str, 0, 0, -1);
|
|
1820 if ( v->max_unpack && !(v->max_unpack>>2) ) return 0; // BCF_UN_FLT
|
|
1821 } else if (i == 7) { // INFO
|
|
1822 char *key;
|
|
1823 vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
|
|
1824 v->n_info = 0;
|
|
1825 if (strcmp(p, ".")) {
|
|
1826 if (*(q-1) == ';') *(q-1) = 0;
|
|
1827 for (r = key = p;; ++r) {
|
|
1828 int c;
|
|
1829 char *val, *end;
|
|
1830 if (*r != ';' && *r != '=' && *r != 0) continue;
|
|
1831 val = end = 0;
|
|
1832 c = *r; *r = 0;
|
|
1833 if (c == '=') {
|
|
1834 val = r + 1;
|
|
1835 for (end = val; *end != ';' && *end != 0; ++end);
|
|
1836 c = *end; *end = 0;
|
|
1837 } else end = r;
|
|
1838 if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; } // faulty VCF, ";;" in the INFO
|
|
1839 k = kh_get(vdict, d, key);
|
|
1840 if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
|
|
1841 {
|
|
1842 fprintf(stderr, "[W::%s] INFO '%s' is not defined in the header, assuming Type=String\n", __func__, key);
|
|
1843 kstring_t tmp = {0,0,0};
|
|
1844 int l;
|
|
1845 ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
|
|
1846 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
|
|
1847 free(tmp.s);
|
|
1848 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
|
|
1849 k = kh_get(vdict, d, key);
|
|
1850 v->errcode = BCF_ERR_TAG_UNDEF;
|
|
1851 }
|
|
1852 uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
|
|
1853 ++v->n_info;
|
|
1854 bcf_enc_int1(str, kh_val(d, k).id);
|
|
1855 if (val == 0) {
|
|
1856 bcf_enc_size(str, 0, BCF_BT_NULL);
|
|
1857 } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
|
|
1858 bcf_enc_vchar(str, end - val, val);
|
|
1859 } else { // int/float value/array
|
|
1860 int i, n_val;
|
|
1861 char *t, *te;
|
|
1862 for (t = val, n_val = 1; *t; ++t) // count the number of values
|
|
1863 if (*t == ',') ++n_val;
|
|
1864 if ((y>>4&0xf) == BCF_HT_INT) {
|
|
1865 int32_t *z;
|
|
1866 z = (int32_t*)alloca(n_val * sizeof(int32_t));
|
|
1867 for (i = 0, t = val; i < n_val; ++i, ++t)
|
|
1868 {
|
|
1869 z[i] = strtol(t, &te, 10);
|
|
1870 if ( te==t ) // conversion failed
|
|
1871 {
|
|
1872 z[i] = bcf_int32_missing;
|
|
1873 while ( *te && *te!=',' ) te++;
|
|
1874 }
|
|
1875 t = te;
|
|
1876 }
|
|
1877 bcf_enc_vint(str, n_val, z, -1);
|
|
1878 if (strcmp(key, "END") == 0) v->rlen = z[0] - v->pos;
|
|
1879 } else if ((y>>4&0xf) == BCF_HT_REAL) {
|
|
1880 float *z;
|
|
1881 z = (float*)alloca(n_val * sizeof(float));
|
|
1882 for (i = 0, t = val; i < n_val; ++i, ++t)
|
|
1883 {
|
|
1884 z[i] = strtod(t, &te);
|
|
1885 if ( te==t ) // conversion failed
|
|
1886 {
|
|
1887 bcf_float_set_missing(z[i]);
|
|
1888 while ( *te && *te!=',' ) te++;
|
|
1889 }
|
|
1890 t = te;
|
|
1891 }
|
|
1892 bcf_enc_vfloat(str, n_val, z);
|
|
1893 }
|
|
1894 }
|
|
1895 if (c == 0) break;
|
|
1896 r = end;
|
|
1897 key = r + 1;
|
|
1898 }
|
|
1899 }
|
|
1900 if ( v->max_unpack && !(v->max_unpack>>3) ) return 0;
|
|
1901 } else if (i == 8) // FORMAT
|
|
1902 return _vcf_parse_format(s, h, v, p, q);
|
|
1903 }
|
|
1904 return 0;
|
|
1905 }
|
|
1906
|
|
1907 int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
|
|
1908 {
|
|
1909 int ret;
|
|
1910 ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
|
|
1911 if (ret < 0) return -1;
|
|
1912 return vcf_parse1(&fp->line, h, v);
|
|
1913 }
|
|
1914
|
|
1915 static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
|
|
1916 {
|
|
1917 uint8_t *ptr_start = ptr;
|
|
1918 fmt->id = bcf_dec_typed_int1(ptr, &ptr);
|
|
1919 fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
|
|
1920 fmt->size = fmt->n << bcf_type_shift[fmt->type];
|
|
1921 fmt->p = ptr;
|
|
1922 fmt->p_off = ptr - ptr_start;
|
|
1923 fmt->p_free = 0;
|
|
1924 ptr += n_sample * fmt->size;
|
|
1925 fmt->p_len = ptr - fmt->p;
|
|
1926 return ptr;
|
|
1927 }
|
|
1928
|
|
1929 static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
|
|
1930 {
|
|
1931 uint8_t *ptr_start = ptr;
|
|
1932 info->key = bcf_dec_typed_int1(ptr, &ptr);
|
|
1933 info->len = bcf_dec_size(ptr, &ptr, &info->type);
|
|
1934 info->vptr = ptr;
|
|
1935 info->vptr_off = ptr - ptr_start;
|
|
1936 info->vptr_free = 0;
|
|
1937 info->v1.i = 0;
|
|
1938 if (info->len == 1) {
|
|
1939 if (info->type == BCF_BT_INT8 || info->type == BCF_BT_CHAR) info->v1.i = *(int8_t*)ptr;
|
|
1940 else if (info->type == BCF_BT_INT32) info->v1.i = *(int32_t*)ptr;
|
|
1941 else if (info->type == BCF_BT_FLOAT) info->v1.f = *(float*)ptr;
|
|
1942 else if (info->type == BCF_BT_INT16) info->v1.i = *(int16_t*)ptr;
|
|
1943 }
|
|
1944 ptr += info->len << bcf_type_shift[info->type];
|
|
1945 info->vptr_len = ptr - info->vptr;
|
|
1946 return ptr;
|
|
1947 }
|
|
1948
|
|
1949 int bcf_unpack(bcf1_t *b, int which)
|
|
1950 {
|
|
1951 if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
|
|
1952 uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
|
|
1953 int *offset, i;
|
|
1954 bcf_dec_t *d = &b->d;
|
|
1955 if (which & BCF_UN_FLT) which |= BCF_UN_STR;
|
|
1956 if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
|
|
1957 if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
|
|
1958 {
|
|
1959 kstring_t tmp;
|
|
1960
|
|
1961 // ID
|
|
1962 tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
|
|
1963 ptr_ori = ptr;
|
|
1964 ptr = bcf_fmt_sized_array(&tmp, ptr);
|
|
1965 b->unpack_size[0] = ptr - ptr_ori;
|
|
1966 kputc('\0', &tmp);
|
|
1967 d->id = tmp.s; d->m_id = tmp.m;
|
|
1968
|
|
1969 // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
|
|
1970 tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
|
|
1971 offset = (int*)alloca(b->n_allele * sizeof(int));
|
|
1972 ptr_ori = ptr;
|
|
1973 for (i = 0; i < b->n_allele; ++i) {
|
|
1974 offset[i] = tmp.l;
|
|
1975 ptr = bcf_fmt_sized_array(&tmp, ptr);
|
|
1976 kputc('\0', &tmp);
|
|
1977 }
|
|
1978 b->unpack_size[1] = ptr - ptr_ori;
|
|
1979 d->als = tmp.s; d->m_als = tmp.m;
|
|
1980
|
|
1981 hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
|
|
1982 for (i = 0; i < b->n_allele; ++i)
|
|
1983 d->allele[i] = d->als + offset[i];
|
|
1984 b->unpacked |= BCF_UN_STR;
|
|
1985 }
|
|
1986 if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
|
|
1987 ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
|
|
1988 ptr_ori = ptr;
|
|
1989 if (*ptr>>4) {
|
|
1990 int type;
|
|
1991 d->n_flt = bcf_dec_size(ptr, &ptr, &type);
|
|
1992 hts_expand(int, d->n_flt, d->m_flt, d->flt);
|
|
1993 for (i = 0; i < d->n_flt; ++i)
|
|
1994 d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
|
|
1995 } else ++ptr, d->n_flt = 0;
|
|
1996 b->unpack_size[2] = ptr - ptr_ori;
|
|
1997 b->unpacked |= BCF_UN_FLT;
|
|
1998 }
|
|
1999 if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
|
|
2000 ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
|
|
2001 hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
|
|
2002 for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
|
|
2003 for (i = 0; i < b->n_info; ++i)
|
|
2004 ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
|
|
2005 b->unpacked |= BCF_UN_INFO;
|
|
2006 }
|
|
2007 if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
|
|
2008 ptr = (uint8_t*)b->indiv.s;
|
|
2009 hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
|
|
2010 for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
|
|
2011 for (i = 0; i < b->n_fmt; ++i)
|
|
2012 ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
|
|
2013 b->unpacked |= BCF_UN_FMT;
|
|
2014 }
|
|
2015 return 0;
|
|
2016 }
|
|
2017
|
|
2018 int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
|
|
2019 {
|
|
2020 int i;
|
|
2021 bcf_unpack((bcf1_t*)v, BCF_UN_ALL);
|
|
2022 kputs(h->id[BCF_DT_CTG][v->rid].key, s); // CHROM
|
|
2023 kputc('\t', s); kputw(v->pos + 1, s); // POS
|
|
2024 kputc('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
|
|
2025 kputc('\t', s); // REF
|
|
2026 if (v->n_allele > 0) kputs(v->d.allele[0], s);
|
|
2027 else kputc('.', s);
|
|
2028 kputc('\t', s); // ALT
|
|
2029 if (v->n_allele > 1) {
|
|
2030 for (i = 1; i < v->n_allele; ++i) {
|
|
2031 if (i > 1) kputc(',', s);
|
|
2032 kputs(v->d.allele[i], s);
|
|
2033 }
|
|
2034 } else kputc('.', s);
|
|
2035 kputc('\t', s); // QUAL
|
|
2036 if ( bcf_float_is_missing(v->qual) ) kputc('.', s); // QUAL
|
|
2037 else ksprintf(s, "%g", v->qual);
|
|
2038 kputc('\t', s); // FILTER
|
|
2039 if (v->d.n_flt) {
|
|
2040 for (i = 0; i < v->d.n_flt; ++i) {
|
|
2041 if (i) kputc(';', s);
|
|
2042 kputs(h->id[BCF_DT_ID][v->d.flt[i]].key, s);
|
|
2043 }
|
|
2044 } else kputc('.', s);
|
|
2045 kputc('\t', s); // INFO
|
|
2046 if (v->n_info) {
|
|
2047 int first = 1;
|
|
2048 for (i = 0; i < v->n_info; ++i) {
|
|
2049 bcf_info_t *z = &v->d.info[i];
|
|
2050 if ( !z->vptr ) continue;
|
|
2051 if ( !first ) kputc(';', s); first = 0;
|
|
2052 kputs(h->id[BCF_DT_ID][z->key].key, s);
|
|
2053 if (z->len <= 0) continue;
|
|
2054 kputc('=', s);
|
|
2055 if (z->len == 1)
|
|
2056 {
|
|
2057 switch (z->type)
|
|
2058 {
|
|
2059 case BCF_BT_INT8: if ( z->v1.i==bcf_int8_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
|
|
2060 case BCF_BT_INT16: if ( z->v1.i==bcf_int16_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
|
|
2061 case BCF_BT_INT32: if ( z->v1.i==bcf_int32_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
|
|
2062 case BCF_BT_FLOAT: if ( bcf_float_is_missing(z->v1.f) ) kputc('.', s); else ksprintf(s, "%g", z->v1.f); break;
|
|
2063 case BCF_BT_CHAR: kputc(z->v1.i, s); break;
|
|
2064 default: fprintf(stderr,"todo: type %d\n", z->type); exit(1); break;
|
|
2065 }
|
|
2066 }
|
|
2067 else bcf_fmt_array(s, z->len, z->type, z->vptr);
|
|
2068 }
|
|
2069 if ( first ) kputc('.', s);
|
|
2070 } else kputc('.', s);
|
|
2071 // FORMAT and individual information
|
|
2072 if (v->n_sample)
|
|
2073 {
|
|
2074 int i,j;
|
|
2075 if ( v->n_fmt)
|
|
2076 {
|
|
2077 int gt_i = -1;
|
|
2078 bcf_fmt_t *fmt = v->d.fmt;
|
|
2079 int first = 1;
|
|
2080 for (i = 0; i < (int)v->n_fmt; ++i) {
|
|
2081 if ( !fmt[i].p ) continue;
|
|
2082 kputc(!first ? ':' : '\t', s); first = 0;
|
|
2083 if ( fmt[i].id<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
|
|
2084 {
|
|
2085 fprintf(stderr, "[E::%s] invalid BCF, the FORMAT tag id=%d not present in the header.\n", __func__, fmt[i].id);
|
|
2086 abort();
|
|
2087 }
|
|
2088 kputs(h->id[BCF_DT_ID][fmt[i].id].key, s);
|
|
2089 if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i;
|
|
2090 }
|
|
2091 if ( first ) kputs("\t.", s);
|
|
2092 for (j = 0; j < v->n_sample; ++j) {
|
|
2093 kputc('\t', s);
|
|
2094 first = 1;
|
|
2095 for (i = 0; i < (int)v->n_fmt; ++i) {
|
|
2096 bcf_fmt_t *f = &fmt[i];
|
|
2097 if ( !f->p ) continue;
|
|
2098 if (!first) kputc(':', s); first = 0;
|
|
2099 if (gt_i == i)
|
|
2100 bcf_format_gt(f,j,s);
|
|
2101 else
|
|
2102 bcf_fmt_array(s, f->n, f->type, f->p + j * f->size);
|
|
2103 }
|
|
2104 if ( first ) kputc('.', s);
|
|
2105 }
|
|
2106 }
|
|
2107 else
|
|
2108 for (j=0; j<=v->n_sample; j++)
|
|
2109 kputs("\t.", s);
|
|
2110 }
|
|
2111 kputc('\n', s);
|
|
2112 return 0;
|
|
2113 }
|
|
2114
|
|
2115 int vcf_write_line(htsFile *fp, kstring_t *line)
|
|
2116 {
|
|
2117 int ret;
|
|
2118 if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
|
|
2119 if ( fp->format.compression!=no_compression )
|
|
2120 ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
|
|
2121 else
|
|
2122 ret = hwrite(fp->fp.hfile, line->s, line->l);
|
|
2123 return ret==line->l ? 0 : -1;
|
|
2124 }
|
|
2125
|
|
2126 int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
|
|
2127 {
|
|
2128 int ret;
|
|
2129 fp->line.l = 0;
|
|
2130 vcf_format1(h, v, &fp->line);
|
|
2131 if ( fp->format.compression!=no_compression )
|
|
2132 ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
|
|
2133 else
|
|
2134 ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
|
|
2135 return ret==fp->line.l ? 0 : -1;
|
|
2136 }
|
|
2137
|
|
2138 /************************
|
|
2139 * Data access routines *
|
|
2140 ************************/
|
|
2141
|
|
2142 int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
|
|
2143 {
|
|
2144 khint_t k;
|
|
2145 vdict_t *d = (vdict_t*)h->dict[which];
|
|
2146 k = kh_get(vdict, d, id);
|
|
2147 return k == kh_end(d)? -1 : kh_val(d, k).id;
|
|
2148 }
|
|
2149
|
|
2150
|
|
2151 /********************
|
|
2152 *** BCF indexing ***
|
|
2153 ********************/
|
|
2154
|
|
2155 hts_idx_t *bcf_index(htsFile *fp, int min_shift)
|
|
2156 {
|
|
2157 int n_lvls, i;
|
|
2158 bcf1_t *b;
|
|
2159 hts_idx_t *idx;
|
|
2160 bcf_hdr_t *h;
|
|
2161 int64_t max_len = 0, s;
|
|
2162 h = bcf_hdr_read(fp);
|
|
2163 if ( !h ) return NULL;
|
|
2164 int nids = 0;
|
|
2165 for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
|
|
2166 {
|
|
2167 if ( !h->id[BCF_DT_CTG][i].val ) continue;
|
|
2168 if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] ) max_len = h->id[BCF_DT_CTG][i].val->info[0];
|
|
2169 nids++;
|
|
2170 }
|
|
2171 if ( !max_len ) max_len = ((int64_t)1<<31) - 1; // In case contig line is broken.
|
|
2172 max_len += 256;
|
|
2173 for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
|
|
2174 idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
|
|
2175 b = bcf_init1();
|
|
2176 while (bcf_read1(fp,h, b) >= 0) {
|
|
2177 int ret;
|
|
2178 ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
|
|
2179 if (ret < 0)
|
|
2180 {
|
|
2181 bcf_destroy1(b);
|
|
2182 hts_idx_destroy(idx);
|
|
2183 return NULL;
|
|
2184 }
|
|
2185 }
|
|
2186 hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
|
|
2187 bcf_destroy1(b);
|
|
2188 bcf_hdr_destroy(h);
|
|
2189 return idx;
|
|
2190 }
|
|
2191
|
|
2192 int bcf_index_build(const char *fn, int min_shift)
|
|
2193 {
|
|
2194 htsFile *fp;
|
|
2195 hts_idx_t *idx;
|
|
2196 if ((fp = hts_open(fn, "rb")) == 0) return -1;
|
|
2197 if ( fp->format.compression!=bgzf ) { hts_close(fp); return -1; }
|
|
2198 idx = bcf_index(fp, min_shift);
|
|
2199 hts_close(fp);
|
|
2200 if ( !idx ) return -1;
|
|
2201 hts_idx_save(idx, fn, HTS_FMT_CSI);
|
|
2202 hts_idx_destroy(idx);
|
|
2203 return 0;
|
|
2204 }
|
|
2205
|
|
2206 /*****************
|
|
2207 *** Utilities ***
|
|
2208 *****************/
|
|
2209
|
|
2210 int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
|
|
2211 {
|
|
2212 int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0;
|
|
2213 for (i=0; i<src->nhrec; i++)
|
|
2214 {
|
|
2215 if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
|
|
2216 {
|
|
2217 int j;
|
|
2218 for (j=0; j<ndst_ori; j++)
|
|
2219 {
|
|
2220 if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
|
|
2221
|
|
2222 // Checking only the key part of generic lines, otherwise
|
|
2223 // the VCFs are too verbose. Should we perhaps add a flag
|
|
2224 // to bcf_hdr_combine() and make this optional?
|
|
2225 if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
|
|
2226 }
|
|
2227 if ( j>=ndst_ori )
|
|
2228 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
|
|
2229 }
|
|
2230 else if ( src->hrec[i]->type==BCF_HL_STR )
|
|
2231 {
|
|
2232 // NB: we are ignoring fields without ID
|
|
2233 int j = bcf_hrec_find_key(src->hrec[i],"ID");
|
|
2234 if ( j>=0 )
|
|
2235 {
|
|
2236 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
|
|
2237 if ( !rec )
|
|
2238 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
|
|
2239 }
|
|
2240 }
|
|
2241 else
|
|
2242 {
|
|
2243 int j = bcf_hrec_find_key(src->hrec[i],"ID");
|
|
2244 assert( j>=0 ); // this should always be true for valid VCFs
|
|
2245
|
|
2246 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
|
|
2247 if ( !rec )
|
|
2248 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
|
|
2249 else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
|
|
2250 {
|
|
2251 // Check that both records are of the same type. The bcf_hdr_id2length
|
|
2252 // macro cannot be used here because dst header is not synced yet.
|
|
2253 vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
|
|
2254 vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
|
|
2255 khint_t k_src = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
|
|
2256 khint_t k_dst = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
|
|
2257 if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
|
|
2258 {
|
|
2259 fprintf(stderr,"Warning: trying to combine \"%s\" tag definitions of different lengths\n", src->hrec[i]->vals[0]);
|
|
2260 ret |= 1;
|
|
2261 }
|
|
2262 if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
|
|
2263 {
|
|
2264 fprintf(stderr,"Warning: trying to combine \"%s\" tag definitions of different types\n", src->hrec[i]->vals[0]);
|
|
2265 ret |= 1;
|
|
2266 }
|
|
2267 }
|
|
2268 }
|
|
2269 }
|
|
2270 if ( need_sync ) bcf_hdr_sync(dst);
|
|
2271 return ret;
|
|
2272 }
|
|
2273 int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
|
|
2274 {
|
|
2275 int i;
|
|
2276 if ( line->errcode )
|
|
2277 {
|
|
2278 fprintf(stderr,"[%s:%d %s] Unchecked error (%d), exiting.\n", __FILE__,__LINE__,__FUNCTION__,line->errcode);
|
|
2279 exit(1);
|
|
2280 }
|
|
2281 if ( src_hdr->ntransl==-1 ) return 0; // no need to translate, all tags have the same id
|
|
2282 if ( !src_hdr->ntransl ) // called for the first time, see what needs translating
|
|
2283 {
|
|
2284 int dict;
|
|
2285 for (dict=0; dict<2; dict++) // BCF_DT_ID and BCF_DT_CTG
|
|
2286 {
|
|
2287 src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
|
|
2288 for (i=0; i<src_hdr->n[dict]; i++)
|
|
2289 {
|
|
2290 if ( !src_hdr->id[dict][i].key || !dst_hdr->id[dict][i].key ) // gap left after removed BCF header lines
|
|
2291 src_hdr->transl[dict][i] = -1;
|
|
2292 else if ( i>=dst_hdr->n[dict] || strcmp(src_hdr->id[dict][i].key,dst_hdr->id[dict][i].key) )
|
|
2293 {
|
|
2294 src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
|
|
2295 src_hdr->ntransl++;
|
|
2296 }
|
|
2297 else
|
|
2298 src_hdr->transl[dict][i] = -1;
|
|
2299 }
|
|
2300 }
|
|
2301 if ( !src_hdr->ntransl )
|
|
2302 {
|
|
2303 free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
|
|
2304 free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
|
|
2305 src_hdr->ntransl = -1;
|
|
2306 }
|
|
2307 if ( src_hdr->ntransl==-1 ) return 0;
|
|
2308 }
|
|
2309 bcf_unpack(line,BCF_UN_ALL);
|
|
2310
|
|
2311 // CHROM
|
|
2312 if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
|
|
2313
|
|
2314 // FILTER
|
|
2315 for (i=0; i<line->d.n_flt; i++)
|
|
2316 {
|
|
2317 int src_id = line->d.flt[i];
|
|
2318 if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
|
|
2319 line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
|
|
2320 line->d.shared_dirty |= BCF1_DIRTY_FLT;
|
|
2321 }
|
|
2322
|
|
2323 // INFO
|
|
2324 for (i=0; i<line->n_info; i++)
|
|
2325 {
|
|
2326 int src_id = line->d.info[i].key;
|
|
2327 int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
|
|
2328 if ( dst_id<0 ) continue;
|
|
2329 int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
|
|
2330 int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
|
|
2331 if ( src_size==dst_size ) // can overwrite
|
|
2332 {
|
|
2333 line->d.info[i].key = dst_id;
|
|
2334 uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
|
|
2335 if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
|
|
2336 else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
|
|
2337 else { *(uint32_t*)vptr = (uint32_t)dst_id; }
|
|
2338 }
|
|
2339 else // must realloc
|
|
2340 {
|
|
2341 bcf_info_t *info = &line->d.info[i];
|
|
2342 assert( !info->vptr_free );
|
|
2343 kstring_t str = {0,0,0};
|
|
2344 bcf_enc_int1(&str, dst_id);
|
|
2345 bcf_enc_size(&str, info->len,info->type);
|
|
2346 info->vptr_off = str.l;
|
|
2347 kputsn((char*)info->vptr, info->vptr_len, &str);
|
|
2348 info->vptr = (uint8_t*)str.s + info->vptr_off;
|
|
2349 info->vptr_free = 1;
|
|
2350 info->key = dst_id;
|
|
2351 line->d.shared_dirty |= BCF1_DIRTY_INF;
|
|
2352 }
|
|
2353 }
|
|
2354
|
|
2355 // FORMAT
|
|
2356 for (i=0; i<line->n_fmt; i++)
|
|
2357 {
|
|
2358 int src_id = line->d.fmt[i].id;
|
|
2359 int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
|
|
2360 if ( dst_id<0 ) continue;
|
|
2361 int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
|
|
2362 int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
|
|
2363 if ( src_size==dst_size ) // can overwrite
|
|
2364 {
|
|
2365 line->d.fmt[i].id = dst_id;
|
|
2366 uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off; // pointer to the vector size (4bits) and BT type (4bits)
|
|
2367 if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
|
|
2368 else if ( dst_size==BCF_BT_INT16 ) { uint8_t *x = (uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; }
|
|
2369 else { uint8_t *x = (uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; p[3] = x[2]; p[4] = x[3]; }
|
|
2370 }
|
|
2371 else // must realloc
|
|
2372 {
|
|
2373 bcf_fmt_t *fmt = &line->d.fmt[i];
|
|
2374 assert( !fmt->p_free );
|
|
2375 kstring_t str = {0,0,0};
|
|
2376 bcf_enc_int1(&str, dst_id);
|
|
2377 bcf_enc_size(&str, fmt->n, fmt->type);
|
|
2378 fmt->p_off = str.l;
|
|
2379 kputsn((char*)fmt->p, fmt->p_len, &str);
|
|
2380 fmt->p = (uint8_t*)str.s + fmt->p_off;
|
|
2381 fmt->p_free = 1;
|
|
2382 fmt->id = dst_id;
|
|
2383 line->d.indiv_dirty = 1;
|
|
2384 }
|
|
2385 }
|
|
2386 return 0;
|
|
2387 }
|
|
2388
|
|
2389 bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
|
|
2390 {
|
|
2391 bcf_hdr_t *hout = bcf_hdr_init("r");
|
|
2392 char *htxt = bcf_hdr_fmt_text(hdr, 1, NULL);
|
|
2393 bcf_hdr_parse(hout, htxt);
|
|
2394 free(htxt);
|
|
2395 return hout;
|
|
2396 }
|
|
2397
|
|
2398 bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
|
|
2399 {
|
|
2400 int hlen;
|
|
2401 void *names_hash = khash_str2int_init();
|
|
2402 char *htxt = bcf_hdr_fmt_text(h0, 1, &hlen);
|
|
2403 kstring_t str;
|
|
2404 bcf_hdr_t *h;
|
|
2405 str.l = str.m = 0; str.s = 0;
|
|
2406 h = bcf_hdr_init("w");
|
|
2407 bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
|
|
2408 int j;
|
|
2409 for (j=0; j<n; j++) imap[j] = -1;
|
|
2410 if ( bcf_hdr_nsamples(h0) > 0) {
|
|
2411 char *p;
|
|
2412 int i = 0, end = n? 8 : 7;
|
|
2413 while ((p = strstr(htxt, "#CHROM\t")) != 0)
|
|
2414 if (p > htxt && *(p-1) == '\n') break;
|
|
2415 while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
|
|
2416 if (i != end) {
|
|
2417 free(h); free(str.s);
|
|
2418 return 0; // malformated header
|
|
2419 }
|
|
2420 kputsn(htxt, p - htxt, &str);
|
|
2421 for (i = 0; i < n; ++i) {
|
|
2422 if ( khash_str2int_has_key(names_hash,samples[i]) )
|
|
2423 {
|
|
2424 fprintf(stderr,"[E::bcf_hdr_subset] Duplicate sample name \"%s\".\n", samples[i]);
|
|
2425 free(str.s);
|
|
2426 free(htxt);
|
|
2427 khash_str2int_destroy(names_hash);
|
|
2428 bcf_hdr_destroy(h);
|
|
2429 return NULL;
|
|
2430 }
|
|
2431 imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
|
|
2432 if (imap[i] < 0) continue;
|
|
2433 kputc('\t', &str);
|
|
2434 kputs(samples[i], &str);
|
|
2435 khash_str2int_inc(names_hash,samples[i]);
|
|
2436 }
|
|
2437 } else kputsn(htxt, hlen, &str);
|
|
2438 while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
|
|
2439 kputc('\n',&str);
|
|
2440 bcf_hdr_parse(h, str.s);
|
|
2441 free(str.s);
|
|
2442 free(htxt);
|
|
2443 khash_str2int_destroy(names_hash);
|
|
2444 return h;
|
|
2445 }
|
|
2446
|
|
2447 int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
|
|
2448 {
|
|
2449 if ( samples && !strcmp("-",samples) ) return 0; // keep all samples
|
|
2450
|
|
2451 hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
|
|
2452 if ( !samples ) { bcf_hdr_nsamples(hdr) = 0; return 0; } // exclude all samples
|
|
2453
|
|
2454 int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
|
|
2455 hdr->keep_samples = (uint8_t*) calloc(narr,1);
|
|
2456 if ( samples[0]=='^' )
|
|
2457 for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
|
|
2458
|
|
2459 int idx, n, ret = 0;
|
|
2460 char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
|
|
2461 if ( !smpls ) return -1;
|
|
2462 for (i=0; i<n; i++)
|
|
2463 {
|
|
2464 idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
|
|
2465 if ( idx<0 )
|
|
2466 {
|
|
2467 if ( !ret ) ret = i+1;
|
|
2468 continue;
|
|
2469 }
|
|
2470 assert( idx<bcf_hdr_nsamples(hdr) );
|
|
2471 if ( samples[0]=='^' )
|
|
2472 bit_array_clear(hdr->keep_samples, idx);
|
|
2473 else
|
|
2474 bit_array_set(hdr->keep_samples, idx);
|
|
2475 }
|
|
2476 for (i=0; i<n; i++) free(smpls[i]);
|
|
2477 free(smpls);
|
|
2478
|
|
2479 bcf_hdr_nsamples(hdr) = 0;
|
|
2480 for (i=0; i<hdr->nsamples_ori; i++)
|
|
2481 if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
|
|
2482 if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
|
|
2483 else
|
|
2484 {
|
|
2485 char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
|
|
2486 idx = 0;
|
|
2487 for (i=0; i<hdr->nsamples_ori; i++)
|
|
2488 if ( bit_array_test(hdr->keep_samples,i) ) samples[idx++] = strdup(hdr->samples[i]);
|
|
2489 free(hdr->samples);
|
|
2490 hdr->samples = samples;
|
|
2491
|
|
2492 // delete original samples from the dictionary
|
|
2493 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
|
|
2494 int k;
|
|
2495 for (k = kh_begin(d); k != kh_end(d); ++k)
|
|
2496 if (kh_exist(d, k)) free((char*)kh_key(d, k));
|
|
2497 kh_destroy(vdict, d);
|
|
2498
|
|
2499 // add the subset back
|
|
2500 hdr->dict[BCF_DT_SAMPLE] = d = kh_init(vdict);
|
|
2501 for (i=0; i<bcf_hdr_nsamples(hdr); i++)
|
|
2502 {
|
|
2503 int ignore, k = kh_put(vdict, d, hdr->samples[i], &ignore);
|
|
2504 kh_val(d, k) = bcf_idinfo_def;
|
|
2505 kh_val(d, k).id = kh_size(d) - 1;
|
|
2506 }
|
|
2507 bcf_hdr_sync(hdr);
|
|
2508 }
|
|
2509
|
|
2510 return ret;
|
|
2511 }
|
|
2512
|
|
2513 int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
|
|
2514 {
|
|
2515 kstring_t ind;
|
|
2516 ind.s = 0; ind.l = ind.m = 0;
|
|
2517 if (n) {
|
|
2518 bcf_fmt_t *fmt;
|
|
2519 int i, j;
|
|
2520 fmt = (bcf_fmt_t*)alloca(v->n_fmt * sizeof(bcf_fmt_t));
|
|
2521 uint8_t *ptr = (uint8_t*)v->indiv.s;
|
|
2522 for (i = 0; i < v->n_fmt; ++i)
|
|
2523 ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
|
|
2524 for (i = 0; i < (int)v->n_fmt; ++i) {
|
|
2525 bcf_fmt_t *f = &fmt[i];
|
|
2526 bcf_enc_int1(&ind, f->id);
|
|
2527 bcf_enc_size(&ind, f->n, f->type);
|
|
2528 for (j = 0; j < n; ++j)
|
|
2529 if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
|
|
2530 }
|
|
2531 for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
|
|
2532 v->n_sample = i;
|
|
2533 } else v->n_sample = 0;
|
|
2534 if ( !v->n_sample ) v->n_fmt = 0;
|
|
2535 free(v->indiv.s);
|
|
2536 v->indiv = ind;
|
|
2537 v->unpacked &= ~BCF_UN_FMT; // only BCF is ready for output, VCF will need to unpack again
|
|
2538 return 0;
|
|
2539 }
|
|
2540
|
|
2541 int bcf_is_snp(bcf1_t *v)
|
|
2542 {
|
|
2543 int i;
|
|
2544 bcf_unpack(v, BCF_UN_STR);
|
|
2545 for (i = 0; i < v->n_allele; ++i)
|
|
2546 {
|
|
2547 if ( v->d.allele[i][1]==0 ) continue;
|
|
2548
|
|
2549 // mpileup's <X> allele, see also below. This is not completely satisfactory,
|
|
2550 // a general library is here narrowly tailored to fit samtools.
|
|
2551 if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue;
|
|
2552
|
|
2553 break;
|
|
2554 }
|
|
2555 return i == v->n_allele;
|
|
2556 }
|
|
2557
|
|
2558 static void bcf_set_variant_type(const char *ref, const char *alt, variant_t *var)
|
|
2559 {
|
|
2560 // The most frequent case
|
|
2561 if ( !ref[1] && !alt[1] )
|
|
2562 {
|
|
2563 if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
|
|
2564 if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant
|
|
2565 var->n = 1; var->type = VCF_SNP; return;
|
|
2566 }
|
|
2567 if ( alt[0]=='<' )
|
|
2568 {
|
|
2569 if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant
|
|
2570 var->type = VCF_OTHER;
|
|
2571 return;
|
|
2572 }
|
|
2573
|
|
2574 const char *r = ref, *a = alt;
|
|
2575 while (*r && *a && *r==*a ) { r++; a++; }
|
|
2576
|
|
2577 if ( *a && !*r )
|
|
2578 {
|
|
2579 while ( *a ) a++;
|
|
2580 var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
|
|
2581 }
|
|
2582 else if ( *r && !*a )
|
|
2583 {
|
|
2584 while ( *r ) r++;
|
|
2585 var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
|
|
2586 }
|
|
2587 else if ( !*r && !*a )
|
|
2588 {
|
|
2589 var->n = 0; var->type = VCF_REF; return;
|
|
2590 }
|
|
2591
|
|
2592 const char *re = r, *ae = a;
|
|
2593 while ( re[1] ) re++;
|
|
2594 while ( ae[1] ) ae++;
|
|
2595 while ( *re==*ae && re>r && ae>a ) { re--; ae--; }
|
|
2596 if ( ae==a )
|
|
2597 {
|
|
2598 if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
|
|
2599 var->n = -(re-r);
|
|
2600 if ( *re==*ae ) { var->type = VCF_INDEL; return; }
|
|
2601 var->type = VCF_OTHER; return;
|
|
2602 }
|
|
2603 else if ( re==r )
|
|
2604 {
|
|
2605 var->n = ae-a;
|
|
2606 if ( *re==*ae ) { var->type = VCF_INDEL; return; }
|
|
2607 var->type = VCF_OTHER; return;
|
|
2608 }
|
|
2609
|
|
2610 var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
|
|
2611 var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
|
|
2612
|
|
2613 // should do also complex events, SVs, etc...
|
|
2614 }
|
|
2615
|
|
2616 static void bcf_set_variant_types(bcf1_t *b)
|
|
2617 {
|
|
2618 if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
|
|
2619 bcf_dec_t *d = &b->d;
|
|
2620 if ( d->n_var < b->n_allele )
|
|
2621 {
|
|
2622 d->var = (variant_t *) realloc(d->var, sizeof(variant_t)*b->n_allele);
|
|
2623 d->n_var = b->n_allele;
|
|
2624 }
|
|
2625 int i;
|
|
2626 b->d.var_type = 0;
|
|
2627 for (i=1; i<b->n_allele; i++)
|
|
2628 {
|
|
2629 bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
|
|
2630 b->d.var_type |= d->var[i].type;
|
|
2631 //fprintf(stderr,"[set_variant_type] %d %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
|
|
2632 }
|
|
2633 }
|
|
2634
|
|
2635 int bcf_get_variant_types(bcf1_t *rec)
|
|
2636 {
|
|
2637 if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
|
|
2638 return rec->d.var_type;
|
|
2639 }
|
|
2640 int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
|
|
2641 {
|
|
2642 if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
|
|
2643 return rec->d.var[ith_allele].type;
|
|
2644 }
|
|
2645
|
|
2646 int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
|
|
2647 {
|
|
2648 // Is the field already present?
|
|
2649 int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
|
|
2650 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1; // No such INFO field in the header
|
|
2651 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
|
|
2652
|
|
2653 for (i=0; i<line->n_info; i++)
|
|
2654 if ( inf_id==line->d.info[i].key ) break;
|
|
2655 bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
|
|
2656
|
|
2657 if ( !n || (type==BCF_HT_STR && !values) )
|
|
2658 {
|
|
2659 if ( inf )
|
|
2660 {
|
|
2661 // Mark the tag for removal, free existing memory if necessary
|
|
2662 if ( inf->vptr_free )
|
|
2663 {
|
|
2664 free(inf->vptr - inf->vptr_off);
|
|
2665 inf->vptr_free = 0;
|
|
2666 }
|
|
2667 line->d.shared_dirty |= BCF1_DIRTY_INF;
|
|
2668 inf->vptr = NULL;
|
|
2669 }
|
|
2670 return 0;
|
|
2671 }
|
|
2672
|
|
2673 // Encode the values and determine the size required to accommodate the values
|
|
2674 kstring_t str = {0,0,0};
|
|
2675 bcf_enc_int1(&str, inf_id);
|
|
2676 if ( type==BCF_HT_INT )
|
|
2677 bcf_enc_vint(&str, n, (int32_t*)values, -1);
|
|
2678 else if ( type==BCF_HT_REAL )
|
|
2679 bcf_enc_vfloat(&str, n, (float*)values);
|
|
2680 else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
|
|
2681 {
|
|
2682 if ( values==NULL )
|
|
2683 bcf_enc_size(&str, 0, BCF_BT_NULL);
|
|
2684 else
|
|
2685 bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
|
|
2686 }
|
|
2687 else
|
|
2688 {
|
|
2689 fprintf(stderr, "[E::%s] the type %d not implemented yet\n", __func__, type);
|
|
2690 abort();
|
|
2691 }
|
|
2692
|
|
2693 // Is the INFO tag already present
|
|
2694 if ( inf )
|
|
2695 {
|
|
2696 // Is it big enough to accommodate new block?
|
|
2697 if ( str.l <= inf->vptr_len + inf->vptr_off )
|
|
2698 {
|
|
2699 if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
|
|
2700 uint8_t *ptr = inf->vptr - inf->vptr_off;
|
|
2701 memcpy(ptr, str.s, str.l);
|
|
2702 free(str.s);
|
|
2703 int vptr_free = inf->vptr_free;
|
|
2704 bcf_unpack_info_core1(ptr, inf);
|
|
2705 inf->vptr_free = vptr_free;
|
|
2706 }
|
|
2707 else
|
|
2708 {
|
|
2709 assert( !inf->vptr_free ); // fix the caller or improve here: this has been modified before
|
|
2710 bcf_unpack_info_core1((uint8_t*)str.s, inf);
|
|
2711 inf->vptr_free = 1;
|
|
2712 line->d.shared_dirty |= BCF1_DIRTY_INF;
|
|
2713 }
|
|
2714 }
|
|
2715 else
|
|
2716 {
|
|
2717 // The tag is not present, create new one
|
|
2718 line->n_info++;
|
|
2719 hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
|
|
2720 inf = &line->d.info[line->n_info-1];
|
|
2721 bcf_unpack_info_core1((uint8_t*)str.s, inf);
|
|
2722 inf->vptr_free = 1;
|
|
2723 line->d.shared_dirty |= BCF1_DIRTY_INF;
|
|
2724 }
|
|
2725 line->unpacked |= BCF_UN_INFO;
|
|
2726 return 0;
|
|
2727 }
|
|
2728
|
|
2729 int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
|
|
2730 {
|
|
2731 if ( !n )
|
|
2732 return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
|
|
2733
|
|
2734 int i, max_len = 0;
|
|
2735 for (i=0; i<n; i++)
|
|
2736 {
|
|
2737 int len = strlen(values[i]);
|
|
2738 if ( len > max_len ) max_len = len;
|
|
2739 }
|
|
2740 char *out = (char*) malloc(max_len*n);
|
|
2741 if ( !out ) return -2;
|
|
2742 for (i=0; i<n; i++)
|
|
2743 {
|
|
2744 char *dst = out+i*max_len;
|
|
2745 const char *src = values[i];
|
|
2746 int j = 0;
|
|
2747 while ( src[j] ) { dst[j] = src[j]; j++; }
|
|
2748 for (; j<max_len; j++) dst[j] = 0;
|
|
2749 }
|
|
2750 int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
|
|
2751 free(out);
|
|
2752 return ret;
|
|
2753 }
|
|
2754
|
|
2755 int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
|
|
2756 {
|
|
2757 // Is the field already present?
|
|
2758 int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
|
|
2759 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
|
|
2760 {
|
|
2761 if ( !n ) return 0;
|
|
2762 return -1; // the key not present in the header
|
|
2763 }
|
|
2764
|
|
2765 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
|
|
2766
|
|
2767 for (i=0; i<line->n_fmt; i++)
|
|
2768 if ( line->d.fmt[i].id==fmt_id ) break;
|
|
2769 bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
|
|
2770
|
|
2771 if ( !n )
|
|
2772 {
|
|
2773 if ( fmt )
|
|
2774 {
|
|
2775 // Mark the tag for removal, free existing memory if necessary
|
|
2776 if ( fmt->p_free )
|
|
2777 {
|
|
2778 free(fmt->p - fmt->p_off);
|
|
2779 fmt->p_free = 0;
|
|
2780 }
|
|
2781 line->d.indiv_dirty = 1;
|
|
2782 fmt->p = NULL;
|
|
2783 }
|
|
2784 return 0;
|
|
2785 }
|
|
2786
|
|
2787 line->n_sample = bcf_hdr_nsamples(hdr);
|
|
2788 int nps = n / line->n_sample; // number of values per sample
|
|
2789 assert( nps && nps*line->n_sample==n ); // must be divisible by n_sample
|
|
2790
|
|
2791 // Encode the values and determine the size required to accommodate the values
|
|
2792 kstring_t str = {0,0,0};
|
|
2793 bcf_enc_int1(&str, fmt_id);
|
|
2794 if ( type==BCF_HT_INT )
|
|
2795 bcf_enc_vint(&str, n, (int32_t*)values, nps);
|
|
2796 else if ( type==BCF_HT_REAL )
|
|
2797 {
|
|
2798 bcf_enc_size(&str, nps, BCF_BT_FLOAT);
|
|
2799 kputsn((char*)values, nps*line->n_sample*sizeof(float), &str);
|
|
2800 }
|
|
2801 else if ( type==BCF_HT_STR )
|
|
2802 {
|
|
2803 bcf_enc_size(&str, nps, BCF_BT_CHAR);
|
|
2804 kputsn((char*)values, nps*line->n_sample, &str);
|
|
2805 }
|
|
2806 else
|
|
2807 {
|
|
2808 fprintf(stderr, "[E::%s] the type %d not implemented yet\n", __func__, type);
|
|
2809 abort();
|
|
2810 }
|
|
2811
|
|
2812 if ( !fmt )
|
|
2813 {
|
|
2814 // Not present, new format field
|
|
2815 line->n_fmt++;
|
|
2816 hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
|
|
2817
|
|
2818 // Special case: VCF specification requires that GT is always first
|
|
2819 if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
|
|
2820 {
|
|
2821 for (i=line->n_fmt-1; i>0; i--)
|
|
2822 line->d.fmt[i] = line->d.fmt[i-1];
|
|
2823 fmt = &line->d.fmt[0];
|
|
2824 }
|
|
2825 else
|
|
2826 fmt = &line->d.fmt[line->n_fmt-1];
|
|
2827 bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
|
|
2828 line->d.indiv_dirty = 1;
|
|
2829 fmt->p_free = 1;
|
|
2830 }
|
|
2831 else
|
|
2832 {
|
|
2833 // The tag is already present, check if it is big enough to accomodate the new block
|
|
2834 if ( str.l <= fmt->p_len + fmt->p_off )
|
|
2835 {
|
|
2836 // good, the block is big enough
|
|
2837 if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
|
|
2838 uint8_t *ptr = fmt->p - fmt->p_off;
|
|
2839 memcpy(ptr, str.s, str.l);
|
|
2840 free(str.s);
|
|
2841 int p_free = fmt->p_free;
|
|
2842 bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
|
|
2843 fmt->p_free = p_free;
|
|
2844 }
|
|
2845 else
|
|
2846 {
|
|
2847 assert( !fmt->p_free ); // fix the caller or improve here: this has been modified before
|
|
2848 bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
|
|
2849 fmt->p_free = 1;
|
|
2850 line->d.indiv_dirty = 1;
|
|
2851 }
|
|
2852 }
|
|
2853 line->unpacked |= BCF_UN_FMT;
|
|
2854 return 0;
|
|
2855 }
|
|
2856
|
|
2857
|
|
2858 int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
|
|
2859 {
|
|
2860 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
|
|
2861 line->d.shared_dirty |= BCF1_DIRTY_FLT;
|
|
2862 line->d.n_flt = n;
|
|
2863 if ( !n ) return 0;
|
|
2864 hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
|
|
2865 int i;
|
|
2866 for (i=0; i<n; i++)
|
|
2867 line->d.flt[i] = flt_ids[i];
|
|
2868 return 0;
|
|
2869 }
|
|
2870
|
|
2871 int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
|
|
2872 {
|
|
2873 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
|
|
2874 int i;
|
|
2875 for (i=0; i<line->d.n_flt; i++)
|
|
2876 if ( flt_id==line->d.flt[i] ) break;
|
|
2877 if ( i<line->d.n_flt ) return 0; // this filter is already set
|
|
2878 line->d.shared_dirty |= BCF1_DIRTY_FLT;
|
|
2879 if ( flt_id==0 ) // set to PASS
|
|
2880 line->d.n_flt = 1;
|
|
2881 else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
|
|
2882 line->d.n_flt = 1;
|
|
2883 else
|
|
2884 line->d.n_flt++;
|
|
2885 hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
|
|
2886 line->d.flt[line->d.n_flt-1] = flt_id;
|
|
2887 return 1;
|
|
2888 }
|
|
2889 int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
|
|
2890 {
|
|
2891 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
|
|
2892 int i;
|
|
2893 for (i=0; i<line->d.n_flt; i++)
|
|
2894 if ( flt_id==line->d.flt[i] ) break;
|
|
2895 if ( i==line->d.n_flt ) return 0; // the filter is not present
|
|
2896 line->d.shared_dirty |= BCF1_DIRTY_FLT;
|
|
2897 if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt));
|
|
2898 line->d.n_flt--;
|
|
2899 if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
|
|
2900 return 0;
|
|
2901 }
|
|
2902
|
|
2903 int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
|
|
2904 {
|
|
2905 if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
|
|
2906 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
|
|
2907 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1; // not defined in the header
|
|
2908
|
|
2909 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
|
|
2910 if ( id==0 && !line->d.n_flt) return 1; // PASS
|
|
2911
|
|
2912 int i;
|
|
2913 for (i=0; i<line->d.n_flt; i++)
|
|
2914 if ( line->d.flt[i]==id ) return 1;
|
|
2915 return 0;
|
|
2916 }
|
|
2917
|
|
2918 static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
|
|
2919 {
|
|
2920 line->d.shared_dirty |= BCF1_DIRTY_ALS;
|
|
2921
|
|
2922 line->n_allele = nals;
|
|
2923 hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
|
|
2924
|
|
2925 char *als = line->d.als;
|
|
2926 int n = 0;
|
|
2927 while (n<nals)
|
|
2928 {
|
|
2929 line->d.allele[n] = als;
|
|
2930 while ( *als ) als++;
|
|
2931 als++;
|
|
2932 n++;
|
|
2933 }
|
|
2934 return 0;
|
|
2935 }
|
|
2936 int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
|
|
2937 {
|
|
2938 kstring_t tmp = {0,0,0};
|
|
2939 char *free_old = NULL;
|
|
2940
|
|
2941 // If the supplied alleles are not pointers to line->d.als, the existing block can be reused.
|
|
2942 int i;
|
|
2943 for (i=0; i<nals; i++)
|
|
2944 if ( alleles[i]>=line->d.als && alleles[i]<line->d.als+line->d.m_als ) break;
|
|
2945 if ( i==nals )
|
|
2946 {
|
|
2947 // all alleles point elsewhere, reuse the existing block
|
|
2948 tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
|
|
2949 }
|
|
2950 else
|
|
2951 free_old = line->d.als;
|
|
2952
|
|
2953 for (i=0; i<nals; i++)
|
|
2954 {
|
|
2955 kputs(alleles[i], &tmp);
|
|
2956 kputc(0, &tmp);
|
|
2957 }
|
|
2958 line->d.als = tmp.s; line->d.m_als = tmp.m;
|
|
2959 free(free_old);
|
|
2960 return _bcf1_sync_alleles(hdr,line,nals);
|
|
2961 }
|
|
2962
|
|
2963 int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
|
|
2964 {
|
|
2965 kstring_t tmp;
|
|
2966 tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
|
|
2967 kputs(alleles_string, &tmp);
|
|
2968 line->d.als = tmp.s; line->d.m_als = tmp.m;
|
|
2969
|
|
2970 int nals = 1;
|
|
2971 char *t = line->d.als;
|
|
2972 while (*t)
|
|
2973 {
|
|
2974 if ( *t==',' ) { *t = 0; nals++; }
|
|
2975 t++;
|
|
2976 }
|
|
2977 return _bcf1_sync_alleles(hdr, line, nals);
|
|
2978 }
|
|
2979
|
|
2980 int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
|
|
2981 {
|
|
2982 kstring_t tmp;
|
|
2983 tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
|
|
2984 if ( id )
|
|
2985 kputs(id, &tmp);
|
|
2986 else
|
|
2987 kputs(".", &tmp);
|
|
2988 line->d.id = tmp.s; line->d.m_id = tmp.m;
|
|
2989 line->d.shared_dirty |= BCF1_DIRTY_ID;
|
|
2990 return 0;
|
|
2991 }
|
|
2992
|
|
2993 bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
|
|
2994 {
|
|
2995 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
|
|
2996 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL; // no such FMT field in the header
|
|
2997 return bcf_get_fmt_id(line, id);
|
|
2998 }
|
|
2999
|
|
3000 bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
|
|
3001 {
|
|
3002 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
|
|
3003 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL; // no such INFO field in the header
|
|
3004 return bcf_get_info_id(line, id);
|
|
3005 }
|
|
3006
|
|
3007 bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
|
|
3008 {
|
|
3009 int i;
|
|
3010 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
|
|
3011 for (i=0; i<line->n_fmt; i++)
|
|
3012 {
|
|
3013 if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
|
|
3014 }
|
|
3015 return NULL;
|
|
3016 }
|
|
3017
|
|
3018 bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
|
|
3019 {
|
|
3020 int i;
|
|
3021 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
|
|
3022 for (i=0; i<line->n_info; i++)
|
|
3023 {
|
|
3024 if ( line->d.info[i].key==id ) return &line->d.info[i];
|
|
3025 }
|
|
3026 return NULL;
|
|
3027 }
|
|
3028
|
|
3029
|
|
3030 int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
|
|
3031 {
|
|
3032 int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
|
|
3033 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header
|
|
3034 if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=type ) return -2; // expected different type
|
|
3035
|
|
3036 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
|
|
3037
|
|
3038 for (i=0; i<line->n_info; i++)
|
|
3039 if ( line->d.info[i].key==tag_id ) break;
|
|
3040 if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3; // the tag is not present in this record
|
|
3041 if ( type==BCF_HT_FLAG ) return 1;
|
|
3042
|
|
3043 bcf_info_t *info = &line->d.info[i];
|
|
3044 if ( type==BCF_HT_STR )
|
|
3045 {
|
|
3046 if ( *ndst < info->len+1 )
|
|
3047 {
|
|
3048 *ndst = info->len + 1;
|
|
3049 *dst = realloc(*dst, *ndst);
|
|
3050 }
|
|
3051 memcpy(*dst,info->vptr,info->len);
|
|
3052 ((uint8_t*)*dst)[info->len] = 0;
|
|
3053 return info->len;
|
|
3054 }
|
|
3055
|
|
3056 // Make sure the buffer is big enough
|
|
3057 int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
|
|
3058 if ( *ndst < info->len )
|
|
3059 {
|
|
3060 *ndst = info->len;
|
|
3061 *dst = realloc(*dst, *ndst * size1);
|
|
3062 }
|
|
3063
|
|
3064 if ( info->len == 1 )
|
|
3065 {
|
|
3066 if ( info->type==BCF_BT_FLOAT ) *((float*)*dst) = info->v1.f;
|
|
3067 else
|
|
3068 {
|
|
3069 #define BRANCH(type_t, missing) { \
|
|
3070 if ( info->v1.i==missing ) *((int32_t*)*dst) = bcf_int32_missing; \
|
|
3071 else *((int32_t*)*dst) = info->v1.i; \
|
|
3072 }
|
|
3073 switch (info->type)
|
|
3074 {
|
|
3075 case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing ); break;
|
|
3076 case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing); break;
|
|
3077 case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing); break;
|
|
3078 }
|
|
3079 #undef BRANCH
|
|
3080 }
|
|
3081 return 1;
|
|
3082 }
|
|
3083
|
|
3084 #define BRANCH(type_t, is_missing, is_vector_end, set_missing, out_type_t) { \
|
|
3085 out_type_t *tmp = (out_type_t *) *dst; \
|
|
3086 type_t *p = (type_t *) info->vptr; \
|
|
3087 for (j=0; j<info->len; j++) \
|
|
3088 { \
|
|
3089 if ( is_vector_end ) return j; \
|
|
3090 if ( is_missing ) set_missing; \
|
|
3091 else *tmp = p[j]; \
|
|
3092 tmp++; \
|
|
3093 } \
|
|
3094 return j; \
|
|
3095 }
|
|
3096 switch (info->type) {
|
|
3097 case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, *tmp=bcf_int32_missing, int32_t); break;
|
|
3098 case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, *tmp=bcf_int32_missing, int32_t); break;
|
|
3099 case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, *tmp=bcf_int32_missing, int32_t); break;
|
|
3100 case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), float); break;
|
|
3101 default: fprintf(stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
|
|
3102 }
|
|
3103 #undef BRANCH
|
|
3104 return -4; // this can never happen
|
|
3105 }
|
|
3106
|
|
3107 int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
|
|
3108 {
|
|
3109 int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
|
|
3110 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
|
|
3111 if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2; // expected different type
|
|
3112
|
|
3113 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
|
|
3114
|
|
3115 for (i=0; i<line->n_fmt; i++)
|
|
3116 if ( line->d.fmt[i].id==tag_id ) break;
|
|
3117 if ( i==line->n_fmt ) return -3; // the tag is not present in this record
|
|
3118 bcf_fmt_t *fmt = &line->d.fmt[i];
|
|
3119
|
|
3120 int nsmpl = bcf_hdr_nsamples(hdr);
|
|
3121 if ( !*dst )
|
|
3122 {
|
|
3123 *dst = (char**) malloc(sizeof(char*)*nsmpl);
|
|
3124 if ( !*dst ) return -4; // could not alloc
|
|
3125 (*dst)[0] = NULL;
|
|
3126 }
|
|
3127 int n = (fmt->n+1)*nsmpl;
|
|
3128 if ( *ndst < n )
|
|
3129 {
|
|
3130 (*dst)[0] = realloc((*dst)[0], n);
|
|
3131 if ( !(*dst)[0] ) return -4; // could not alloc
|
|
3132 *ndst = n;
|
|
3133 }
|
|
3134 for (i=0; i<nsmpl; i++)
|
|
3135 {
|
|
3136 uint8_t *src = fmt->p + i*fmt->n;
|
|
3137 uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
|
|
3138 memcpy(tmp,src,fmt->n);
|
|
3139 tmp[fmt->n] = 0;
|
|
3140 (*dst)[i] = (char*) tmp;
|
|
3141 }
|
|
3142 return n;
|
|
3143 }
|
|
3144
|
|
3145 int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
|
|
3146 {
|
|
3147 int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
|
|
3148 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
|
|
3149 if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
|
|
3150 {
|
|
3151 // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
|
|
3152 if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
|
|
3153 }
|
|
3154 else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2; // expected different type
|
|
3155
|
|
3156 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
|
|
3157
|
|
3158 for (i=0; i<line->n_fmt; i++)
|
|
3159 if ( line->d.fmt[i].id==tag_id ) break;
|
|
3160 if ( i==line->n_fmt ) return -3; // the tag is not present in this record
|
|
3161 bcf_fmt_t *fmt = &line->d.fmt[i];
|
|
3162
|
|
3163 if ( type==BCF_HT_STR )
|
|
3164 {
|
|
3165 int n = fmt->n*bcf_hdr_nsamples(hdr);
|
|
3166 if ( *ndst < n )
|
|
3167 {
|
|
3168 *dst = realloc(*dst, n);
|
|
3169 if ( !*dst ) return -4; // could not alloc
|
|
3170 *ndst = n;
|
|
3171 }
|
|
3172 memcpy(*dst,fmt->p,n);
|
|
3173 return n;
|
|
3174 }
|
|
3175
|
|
3176 // Make sure the buffer is big enough
|
|
3177 int nsmpl = bcf_hdr_nsamples(hdr);
|
|
3178 int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
|
|
3179 if ( *ndst < fmt->n*nsmpl )
|
|
3180 {
|
|
3181 *ndst = fmt->n*nsmpl;
|
|
3182 *dst = realloc(*dst, *ndst*size1);
|
|
3183 if ( !dst ) return -4; // could not alloc
|
|
3184 }
|
|
3185
|
|
3186 #define BRANCH(type_t, is_missing, is_vector_end, set_missing, set_vector_end, out_type_t) { \
|
|
3187 out_type_t *tmp = (out_type_t *) *dst; \
|
|
3188 type_t *p = (type_t*) fmt->p; \
|
|
3189 for (i=0; i<nsmpl; i++) \
|
|
3190 { \
|
|
3191 for (j=0; j<fmt->n; j++) \
|
|
3192 { \
|
|
3193 if ( is_missing ) set_missing; \
|
|
3194 else if ( is_vector_end ) { set_vector_end; break; } \
|
|
3195 else *tmp = p[j]; \
|
|
3196 tmp++; \
|
|
3197 } \
|
|
3198 for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
|
|
3199 p = (type_t *)((char *)p + fmt->size); \
|
|
3200 } \
|
|
3201 }
|
|
3202 switch (fmt->type) {
|
|
3203 case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
|
|
3204 case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
|
|
3205 case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
|
|
3206 case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), float); break;
|
|
3207 default: fprintf(stderr,"TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->type); exit(1);
|
|
3208 }
|
|
3209 #undef BRANCH
|
|
3210 return nsmpl*fmt->n;
|
|
3211 }
|
|
3212
|