Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/vcf.c @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfa3745e5fd8 |
---|---|
1 /* vcf.c -- VCF/BCF API functions. | |
2 | |
3 Copyright (C) 2012, 2013 Broad Institute. | |
4 Copyright (C) 2012-2014 Genome Research Ltd. | |
5 Portions copyright (C) 2014 Intel Corporation. | |
6 | |
7 Author: Heng Li <lh3@sanger.ac.uk> | |
8 | |
9 Permission is hereby granted, free of charge, to any person obtaining a copy | |
10 of this software and associated documentation files (the "Software"), to deal | |
11 in the Software without restriction, including without limitation the rights | |
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
13 copies of the Software, and to permit persons to whom the Software is | |
14 furnished to do so, subject to the following conditions: | |
15 | |
16 The above copyright notice and this permission notice shall be included in | |
17 all copies or substantial portions of the Software. | |
18 | |
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
25 DEALINGS IN THE SOFTWARE. */ | |
26 | |
27 #include <zlib.h> | |
28 #include <stdio.h> | |
29 #include <ctype.h> | |
30 #include <assert.h> | |
31 #include <string.h> | |
32 #include <stdlib.h> | |
33 #include <limits.h> | |
34 #include "htslib/kstring.h" | |
35 #include "htslib/bgzf.h" | |
36 #include "htslib/vcf.h" | |
37 #include "htslib/tbx.h" | |
38 #include "htslib/hfile.h" | |
39 #include "htslib/khash_str2int.h" | |
40 | |
41 #include "htslib/khash.h" | |
42 KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t) | |
43 typedef khash_t(vdict) vdict_t; | |
44 | |
45 #include "htslib/kseq.h" | |
46 KSTREAM_DECLARE(gzFile, gzread) | |
47 | |
48 uint32_t bcf_float_missing = 0x7F800001; | |
49 uint32_t bcf_float_vector_end = 0x7F800002; | |
50 uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; | |
51 static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 }; | |
52 | |
53 /************************* | |
54 *** VCF header parser *** | |
55 *************************/ | |
56 | |
57 int bcf_hdr_sync(bcf_hdr_t *h); | |
58 | |
59 int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s) | |
60 { | |
61 if ( !s ) return 0; | |
62 | |
63 const char *ss = s; | |
64 while ( !*ss && isspace(*ss) ) ss++; | |
65 if ( !*ss ) | |
66 { | |
67 fprintf(stderr,"[E::%s] Empty sample name: trailing spaces/tabs in the header line?\n", __func__); | |
68 abort(); | |
69 } | |
70 | |
71 vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE]; | |
72 int ret; | |
73 char *sdup = strdup(s); | |
74 int k = kh_put(vdict, d, sdup, &ret); | |
75 if (ret) { // absent | |
76 kh_val(d, k) = bcf_idinfo_def; | |
77 kh_val(d, k).id = kh_size(d) - 1; | |
78 } else { | |
79 if (hts_verbose >= 2) | |
80 { | |
81 fprintf(stderr, "[E::%s] Duplicated sample name '%s'\n", __func__, s); | |
82 abort(); | |
83 } | |
84 free(sdup); | |
85 return -1; | |
86 } | |
87 int n = kh_size(d); | |
88 h->samples = (char**) realloc(h->samples,sizeof(char*)*n); | |
89 h->samples[n-1] = sdup; | |
90 h->dirty = 1; | |
91 return 0; | |
92 } | |
93 | |
94 int bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str) | |
95 { | |
96 int ret = 0; | |
97 int i = 0; | |
98 const char *p, *q; | |
99 // add samples | |
100 for (p = q = str;; ++q) { | |
101 if (*q != '\t' && *q != 0 && *q != '\n') continue; | |
102 if (++i > 9) { | |
103 char *s = (char*)malloc(q - p + 1); | |
104 strncpy(s, p, q - p); | |
105 s[q - p] = 0; | |
106 if ( bcf_hdr_add_sample(h,s) < 0 ) ret = -1; | |
107 free(s); | |
108 } | |
109 if (*q == 0 || *q == '\n') break; | |
110 p = q + 1; | |
111 } | |
112 bcf_hdr_add_sample(h,NULL); | |
113 return ret; | |
114 } | |
115 | |
116 int bcf_hdr_sync(bcf_hdr_t *h) | |
117 { | |
118 int i; | |
119 for (i = 0; i < 3; i++) | |
120 { | |
121 vdict_t *d = (vdict_t*)h->dict[i]; | |
122 khint_t k; | |
123 | |
124 // find out the largest id, there may be holes because of IDX | |
125 int max_id = -1; | |
126 for (k=kh_begin(d); k<kh_end(d); k++) | |
127 { | |
128 if (!kh_exist(d,k)) continue; | |
129 if ( max_id < kh_val(d,k).id ) max_id = kh_val(d,k).id; | |
130 } | |
131 if ( max_id >= h->n[i] ) | |
132 { | |
133 h->id[i] = (bcf_idpair_t*)realloc(h->id[i], (max_id+1)*sizeof(bcf_idpair_t)); | |
134 for (k=h->n[i]; k<=max_id; k++) | |
135 { | |
136 h->id[i][k].key = NULL; | |
137 h->id[i][k].val = NULL; | |
138 } | |
139 h->n[i] = max_id+1; | |
140 } | |
141 for (k=kh_begin(d); k<kh_end(d); k++) | |
142 { | |
143 if (!kh_exist(d,k)) continue; | |
144 h->id[i][kh_val(d,k).id].key = kh_key(d,k); | |
145 h->id[i][kh_val(d,k).id].val = &kh_val(d,k); | |
146 } | |
147 } | |
148 h->dirty = 0; | |
149 return 0; | |
150 } | |
151 | |
152 void bcf_hrec_destroy(bcf_hrec_t *hrec) | |
153 { | |
154 free(hrec->key); | |
155 if ( hrec->value ) free(hrec->value); | |
156 int i; | |
157 for (i=0; i<hrec->nkeys; i++) | |
158 { | |
159 free(hrec->keys[i]); | |
160 free(hrec->vals[i]); | |
161 } | |
162 free(hrec->keys); | |
163 free(hrec->vals); | |
164 free(hrec); | |
165 } | |
166 | |
167 // Copies all fields except IDX. | |
168 bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec) | |
169 { | |
170 bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t)); | |
171 out->type = hrec->type; | |
172 if ( hrec->key ) out->key = strdup(hrec->key); | |
173 if ( hrec->value ) out->value = strdup(hrec->value); | |
174 out->nkeys = hrec->nkeys; | |
175 out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys); | |
176 out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys); | |
177 int i, j = 0; | |
178 for (i=0; i<hrec->nkeys; i++) | |
179 { | |
180 if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue; | |
181 if ( hrec->keys[i] ) out->keys[j] = strdup(hrec->keys[i]); | |
182 if ( hrec->vals[i] ) out->vals[j] = strdup(hrec->vals[i]); | |
183 j++; | |
184 } | |
185 if ( i!=j ) out->nkeys -= i-j; // IDX was omitted | |
186 return out; | |
187 } | |
188 | |
189 void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec) | |
190 { | |
191 fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:""); | |
192 int i; | |
193 for (i=0; i<hrec->nkeys; i++) | |
194 fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]); | |
195 fprintf(fp, "\n"); | |
196 } | |
197 | |
198 void bcf_header_debug(bcf_hdr_t *hdr) | |
199 { | |
200 int i, j; | |
201 for (i=0; i<hdr->nhrec; i++) | |
202 { | |
203 if ( !hdr->hrec[i]->value ) | |
204 { | |
205 fprintf(stderr, "##%s=<", hdr->hrec[i]->key); | |
206 fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]); | |
207 for (j=1; j<hdr->hrec[i]->nkeys; j++) | |
208 fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]); | |
209 fprintf(stderr,">\n"); | |
210 } | |
211 else | |
212 fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value); | |
213 } | |
214 } | |
215 | |
216 void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len) | |
217 { | |
218 int n = ++hrec->nkeys; | |
219 hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*n); | |
220 hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*n); | |
221 assert( len ); | |
222 hrec->keys[n-1] = (char*) malloc((len+1)*sizeof(char)); | |
223 memcpy(hrec->keys[n-1],str,len); | |
224 hrec->keys[n-1][len] = 0; | |
225 hrec->vals[n-1] = NULL; | |
226 } | |
227 | |
228 void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted) | |
229 { | |
230 if ( !str ) { hrec->vals[i] = NULL; return; } | |
231 if ( hrec->vals[i] ) free(hrec->vals[i]); | |
232 if ( is_quoted ) | |
233 { | |
234 hrec->vals[i] = (char*) malloc((len+3)*sizeof(char)); | |
235 hrec->vals[i][0] = '"'; | |
236 memcpy(&hrec->vals[i][1],str,len); | |
237 hrec->vals[i][len+1] = '"'; | |
238 hrec->vals[i][len+2] = 0; | |
239 } | |
240 else | |
241 { | |
242 hrec->vals[i] = (char*) malloc((len+1)*sizeof(char)); | |
243 memcpy(hrec->vals[i],str,len); | |
244 hrec->vals[i][len] = 0; | |
245 } | |
246 } | |
247 | |
248 void hrec_add_idx(bcf_hrec_t *hrec, int idx) | |
249 { | |
250 int n = ++hrec->nkeys; | |
251 hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*n); | |
252 hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*n); | |
253 hrec->keys[n-1] = strdup("IDX"); | |
254 kstring_t str = {0,0,0}; | |
255 kputw(idx, &str); | |
256 hrec->vals[n-1] = str.s; | |
257 } | |
258 | |
259 int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key) | |
260 { | |
261 int i; | |
262 for (i=0; i<hrec->nkeys; i++) | |
263 if ( !strcasecmp(key,hrec->keys[i]) ) return i; | |
264 return -1; | |
265 } | |
266 | |
267 static inline int is_escaped(const char *min, const char *str) | |
268 { | |
269 int n = 0; | |
270 while ( --str>=min && *str=='\\' ) n++; | |
271 return n%2; | |
272 } | |
273 | |
274 bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) | |
275 { | |
276 const char *p = line; | |
277 if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; } | |
278 p += 2; | |
279 | |
280 const char *q = p; | |
281 while ( *q && *q!='=' ) q++; | |
282 int n = q-p; | |
283 if ( *q!='=' || !n ) { *len = q-line+1; return NULL; } // wrong format | |
284 | |
285 bcf_hrec_t *hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t)); | |
286 hrec->key = (char*) malloc(sizeof(char)*(n+1)); | |
287 memcpy(hrec->key,p,n); | |
288 hrec->key[n] = 0; | |
289 | |
290 p = ++q; | |
291 if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579 | |
292 { | |
293 while ( *q && *q!='\n' ) q++; | |
294 hrec->value = (char*) malloc((q-p+1)*sizeof(char)); | |
295 memcpy(hrec->value, p, q-p); | |
296 hrec->value[q-p] = 0; | |
297 *len = q-line+1; | |
298 return hrec; | |
299 } | |
300 | |
301 // structured line, e.g. ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias"> | |
302 int nopen = 1; | |
303 while ( *q && *q!='\n' && nopen ) | |
304 { | |
305 p = ++q; | |
306 while ( *q && isalnum(*q) ) q++; | |
307 n = q-p; | |
308 if ( *q!='=' || !n ) | |
309 { | |
310 // wrong format | |
311 while ( *q && *q!='\n' ) q++; | |
312 kstring_t tmp = {0,0,0}; | |
313 kputsn(line,q-line,&tmp); | |
314 fprintf(stderr,"Could not parse the header line: \"%s\"\n", tmp.s); | |
315 free(tmp.s); | |
316 *len = q-line+1; | |
317 bcf_hrec_destroy(hrec); | |
318 return NULL; | |
319 } | |
320 bcf_hrec_add_key(hrec, p, q-p); | |
321 p = ++q; | |
322 int quoted = *p=='"' ? 1 : 0; | |
323 if ( quoted ) p++, q++; | |
324 while (1) | |
325 { | |
326 if ( !*q ) break; | |
327 if ( quoted ) { if ( *q=='"' && !is_escaped(p,q) ) break; } | |
328 else | |
329 { | |
330 if ( *q=='<' ) nopen++; | |
331 if ( *q=='>' ) nopen--; | |
332 if ( !nopen ) break; | |
333 if ( *q==',' && nopen==1 ) break; | |
334 } | |
335 q++; | |
336 } | |
337 bcf_hrec_set_val(hrec, hrec->nkeys-1, p, q-p, quoted); | |
338 if ( quoted ) q++; | |
339 if ( *q=='>' ) { nopen--; q++; } | |
340 } | |
341 *len = q-line+1; | |
342 return hrec; | |
343 } | |
344 | |
345 // returns: 1 when hdr needs to be synced, 0 otherwise | |
346 int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) | |
347 { | |
348 // contig | |
349 int i,j,k, ret; | |
350 char *str; | |
351 if ( !strcmp(hrec->key, "contig") ) | |
352 { | |
353 hrec->type = BCF_HL_CTG; | |
354 | |
355 // Get the contig ID ($str) and length ($j) | |
356 i = bcf_hrec_find_key(hrec,"length"); | |
357 if ( i<0 ) j = 0; | |
358 else if ( sscanf(hrec->vals[i],"%d",&j)!=1 ) return 0; | |
359 | |
360 i = bcf_hrec_find_key(hrec,"ID"); | |
361 if ( i<0 ) return 0; | |
362 str = strdup(hrec->vals[i]); | |
363 | |
364 // Register in the dictionary | |
365 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG]; | |
366 k = kh_put(vdict, d, str, &ret); | |
367 if ( !ret ) { free(str); return 0; } // already present | |
368 | |
369 int idx = bcf_hrec_find_key(hrec,"IDX"); | |
370 if ( idx!=-1 ) | |
371 { | |
372 char *tmp = hrec->vals[idx]; | |
373 idx = strtol(hrec->vals[idx], &tmp, 10); | |
374 if ( *tmp ) | |
375 { | |
376 fprintf(stderr,"[%s:%d %s] Error parsing the IDX tag, skipping.\n", __FILE__,__LINE__,__FUNCTION__); | |
377 return 0; | |
378 } | |
379 } | |
380 else | |
381 { | |
382 idx = kh_size(d) - 1; | |
383 hrec_add_idx(hrec, idx); | |
384 } | |
385 | |
386 kh_val(d, k) = bcf_idinfo_def; | |
387 kh_val(d, k).id = idx; | |
388 kh_val(d, k).info[0] = j; | |
389 kh_val(d, k).hrec[0] = hrec; | |
390 | |
391 return 1; | |
392 } | |
393 | |
394 if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO; | |
395 else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT; | |
396 else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT; | |
397 else if ( hrec->nkeys>0 ) { hrec->type = BCF_HL_STR; return 1; } | |
398 else return 0; | |
399 | |
400 // INFO/FILTER/FORMAT | |
401 char *id = NULL; | |
402 int type = -1, num = -1, var = -1, idx = -1; | |
403 for (i=0; i<hrec->nkeys; i++) | |
404 { | |
405 if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i]; | |
406 else if ( !strcmp(hrec->keys[i], "IDX") ) | |
407 { | |
408 char *tmp = hrec->vals[i]; | |
409 idx = strtol(hrec->vals[i], &tmp, 10); | |
410 if ( *tmp ) | |
411 { | |
412 fprintf(stderr,"[%s:%d %s] Error parsing the IDX tag, skipping.\n", __FILE__,__LINE__,__FUNCTION__); | |
413 return 0; | |
414 } | |
415 } | |
416 else if ( !strcmp(hrec->keys[i], "Type") ) | |
417 { | |
418 if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT; | |
419 else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL; | |
420 else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR; | |
421 else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR; | |
422 else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG; | |
423 else | |
424 { | |
425 fprintf(stderr, "[E::%s] The type \"%s\" not supported, assuming \"String\"\n", __func__, hrec->vals[i]); | |
426 type = BCF_HT_STR; | |
427 } | |
428 } | |
429 else if ( !strcmp(hrec->keys[i], "Number") ) | |
430 { | |
431 if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A; | |
432 else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R; | |
433 else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G; | |
434 else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR; | |
435 else | |
436 { | |
437 sscanf(hrec->vals[i],"%d",&num); | |
438 var = BCF_VL_FIXED; | |
439 } | |
440 if (var != BCF_VL_FIXED) num = 0xfffff; | |
441 } | |
442 } | |
443 uint32_t info = (uint32_t)num<<12 | var<<8 | type<<4 | hrec->type; | |
444 | |
445 if ( !id ) return 0; | |
446 str = strdup(id); | |
447 | |
448 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID]; | |
449 k = kh_put(vdict, d, str, &ret); | |
450 if ( !ret ) | |
451 { | |
452 // already present | |
453 free(str); | |
454 if ( kh_val(d, k).hrec[info&0xf] ) return 0; | |
455 kh_val(d, k).info[info&0xf] = info; | |
456 kh_val(d, k).hrec[info&0xf] = hrec; | |
457 if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d, k).id); | |
458 return 1; | |
459 } | |
460 kh_val(d, k) = bcf_idinfo_def; | |
461 kh_val(d, k).info[info&0xf] = info; | |
462 kh_val(d, k).hrec[info&0xf] = hrec; | |
463 kh_val(d, k).id = idx==-1 ? kh_size(d) - 1 : idx; | |
464 | |
465 if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d, k).id); | |
466 | |
467 return 1; | |
468 } | |
469 | |
470 int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) | |
471 { | |
472 if ( !hrec ) return 0; | |
473 | |
474 hrec->type = BCF_HL_GEN; | |
475 if ( !bcf_hdr_register_hrec(hdr,hrec) ) | |
476 { | |
477 // If one of the hashed field, then it is already present | |
478 if ( hrec->type != BCF_HL_GEN ) | |
479 { | |
480 bcf_hrec_destroy(hrec); | |
481 return 0; | |
482 } | |
483 | |
484 // Is one of the generic fields and already present? | |
485 int i; | |
486 for (i=0; i<hdr->nhrec; i++) | |
487 { | |
488 if ( hdr->hrec[i]->type!=BCF_HL_GEN ) continue; | |
489 if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hrec->key,"fileformat") ) break; | |
490 if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hdr->hrec[i]->value,hrec->value) ) break; | |
491 } | |
492 if ( i<hdr->nhrec ) | |
493 { | |
494 bcf_hrec_destroy(hrec); | |
495 return 0; | |
496 } | |
497 } | |
498 | |
499 // New record, needs to be added | |
500 int n = ++hdr->nhrec; | |
501 hdr->hrec = (bcf_hrec_t**) realloc(hdr->hrec, n*sizeof(bcf_hrec_t*)); | |
502 hdr->hrec[n-1] = hrec; | |
503 hdr->dirty = 1; | |
504 | |
505 return hrec->type==BCF_HL_GEN ? 0 : 1; | |
506 } | |
507 | |
508 /* | |
509 * Note that while querying of FLT,INFO,FMT,CTG lines is fast (the keys are hashed), | |
510 * the STR,GEN lines are searched for linearly in a linked list of all header lines. | |
511 * This may become a problem for VCFs with huge headers, we might need to build a | |
512 * dictionary for these lines as well. | |
513 */ | |
514 bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class) | |
515 { | |
516 int i; | |
517 if ( type==BCF_HL_GEN ) | |
518 { | |
519 for (i=0; i<hdr->nhrec; i++) | |
520 { | |
521 if ( hdr->hrec[i]->type!=type ) continue; | |
522 if ( strcmp(hdr->hrec[i]->key,key) ) continue; | |
523 if ( !value || !strcmp(hdr->hrec[i]->value,value) ) return hdr->hrec[i]; | |
524 } | |
525 return NULL; | |
526 } | |
527 else if ( type==BCF_HL_STR ) | |
528 { | |
529 for (i=0; i<hdr->nhrec; i++) | |
530 { | |
531 if ( hdr->hrec[i]->type!=type ) continue; | |
532 if ( strcmp(hdr->hrec[i]->key,str_class) ) continue; | |
533 int j = bcf_hrec_find_key(hdr->hrec[i],key); | |
534 if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i]; | |
535 } | |
536 return NULL; | |
537 } | |
538 vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID]; | |
539 khint_t k = kh_get(vdict, d, value); | |
540 if ( k == kh_end(d) ) return NULL; | |
541 return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type]; | |
542 } | |
543 | |
544 void bcf_hdr_check_sanity(bcf_hdr_t *hdr) | |
545 { | |
546 static int PL_warned = 0, GL_warned = 0; | |
547 | |
548 if ( !PL_warned ) | |
549 { | |
550 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL"); | |
551 if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G ) | |
552 { | |
553 fprintf(stderr,"[W::%s] PL should be declared as Number=G\n", __func__); | |
554 PL_warned = 1; | |
555 } | |
556 } | |
557 if ( !GL_warned ) | |
558 { | |
559 int id = bcf_hdr_id2int(hdr, BCF_HL_FMT, "GL"); | |
560 if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G ) | |
561 { | |
562 fprintf(stderr,"[W::%s] GL should be declared as Number=G\n", __func__); | |
563 PL_warned = 1; | |
564 } | |
565 } | |
566 } | |
567 | |
568 int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) | |
569 { | |
570 int len, needs_sync = 0; | |
571 char *p = htxt; | |
572 | |
573 // Check sanity: "fileformat" string must come as first | |
574 bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len); | |
575 if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") ) | |
576 fprintf(stderr, "[W::%s] The first line should be ##fileformat; is the VCF/BCF header broken?\n", __func__); | |
577 needs_sync += bcf_hdr_add_hrec(hdr, hrec); | |
578 | |
579 // The filter PASS must appear first in the dictionary | |
580 hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len); | |
581 needs_sync += bcf_hdr_add_hrec(hdr, hrec); | |
582 | |
583 // Parse the whole header | |
584 while ( (hrec=bcf_hdr_parse_line(hdr,p,&len)) ) | |
585 { | |
586 needs_sync += bcf_hdr_add_hrec(hdr, hrec); | |
587 p += len; | |
588 } | |
589 int ret = bcf_hdr_parse_sample_line(hdr,p); | |
590 bcf_hdr_sync(hdr); | |
591 bcf_hdr_check_sanity(hdr); | |
592 return ret; | |
593 } | |
594 | |
595 int bcf_hdr_append(bcf_hdr_t *hdr, const char *line) | |
596 { | |
597 int len; | |
598 bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len); | |
599 if ( !hrec ) return -1; | |
600 bcf_hdr_add_hrec(hdr, hrec); | |
601 return 0; | |
602 } | |
603 | |
604 void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key) | |
605 { | |
606 int i; | |
607 bcf_hrec_t *hrec; | |
608 while (1) | |
609 { | |
610 if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG ) | |
611 { | |
612 hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL); | |
613 if ( !hrec ) return; | |
614 | |
615 for (i=0; i<hdr->nhrec; i++) | |
616 if ( hdr->hrec[i]==hrec ) break; | |
617 assert( i<hdr->nhrec ); | |
618 | |
619 vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID]; | |
620 khint_t k = kh_get(vdict, d, key); | |
621 kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL; | |
622 } | |
623 else | |
624 { | |
625 for (i=0; i<hdr->nhrec; i++) | |
626 { | |
627 if ( hdr->hrec[i]->type!=type ) continue; | |
628 if ( type==BCF_HL_GEN ) | |
629 { | |
630 if ( !strcmp(hdr->hrec[i]->key,key) ) break; | |
631 } | |
632 else | |
633 { | |
634 // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec() | |
635 int j = bcf_hrec_find_key(hdr->hrec[i], "ID"); | |
636 if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break; | |
637 } | |
638 } | |
639 if ( i==hdr->nhrec ) return; | |
640 hrec = hdr->hrec[i]; | |
641 } | |
642 | |
643 hdr->nhrec--; | |
644 if ( i < hdr->nhrec ) | |
645 memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*)); | |
646 bcf_hrec_destroy(hrec); | |
647 hdr->dirty = 1; | |
648 } | |
649 } | |
650 | |
651 int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...) | |
652 { | |
653 va_list ap; | |
654 va_start(ap, fmt); | |
655 int n = vsnprintf(NULL, 0, fmt, ap) + 2; | |
656 va_end(ap); | |
657 | |
658 char *line = (char*)malloc(n); | |
659 va_start(ap, fmt); | |
660 vsnprintf(line, n, fmt, ap); | |
661 va_end(ap); | |
662 | |
663 int ret = bcf_hdr_append(hdr, line); | |
664 | |
665 free(line); | |
666 return ret; | |
667 } | |
668 | |
669 | |
670 /********************** | |
671 *** BCF header I/O *** | |
672 **********************/ | |
673 | |
674 const char *bcf_hdr_get_version(const bcf_hdr_t *hdr) | |
675 { | |
676 bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL); | |
677 if ( !hrec ) | |
678 { | |
679 fprintf(stderr,"No version string found, assuming VCFv4.2\n"); | |
680 return "VCFv4.2"; | |
681 } | |
682 return hrec->value; | |
683 } | |
684 | |
685 void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version) | |
686 { | |
687 bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL); | |
688 if ( !hrec ) | |
689 { | |
690 int len; | |
691 kstring_t str = {0,0,0}; | |
692 ksprintf(&str,"##fileformat=%s", version); | |
693 hrec = bcf_hdr_parse_line(hdr, str.s, &len); | |
694 free(str.s); | |
695 } | |
696 else | |
697 { | |
698 free(hrec->value); | |
699 hrec->value = strdup(version); | |
700 } | |
701 hdr->dirty = 1; | |
702 } | |
703 | |
704 bcf_hdr_t *bcf_hdr_init(const char *mode) | |
705 { | |
706 int i; | |
707 bcf_hdr_t *h; | |
708 h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t)); | |
709 for (i = 0; i < 3; ++i) | |
710 h->dict[i] = kh_init(vdict); | |
711 if ( strchr(mode,'w') ) | |
712 { | |
713 bcf_hdr_append(h, "##fileformat=VCFv4.2"); | |
714 // The filter PASS must appear first in the dictionary | |
715 bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">"); | |
716 } | |
717 return h; | |
718 } | |
719 | |
720 void bcf_hdr_destroy(bcf_hdr_t *h) | |
721 { | |
722 int i; | |
723 khint_t k; | |
724 for (i = 0; i < 3; ++i) { | |
725 vdict_t *d = (vdict_t*)h->dict[i]; | |
726 if (d == 0) continue; | |
727 for (k = kh_begin(d); k != kh_end(d); ++k) | |
728 if (kh_exist(d, k)) free((char*)kh_key(d, k)); | |
729 kh_destroy(vdict, d); | |
730 free(h->id[i]); | |
731 } | |
732 for (i=0; i<h->nhrec; i++) | |
733 bcf_hrec_destroy(h->hrec[i]); | |
734 if (h->nhrec) free(h->hrec); | |
735 if (h->samples) free(h->samples); | |
736 free(h->keep_samples); | |
737 free(h->transl[0]); free(h->transl[1]); | |
738 free(h->mem.s); | |
739 free(h); | |
740 } | |
741 | |
742 bcf_hdr_t *bcf_hdr_read(htsFile *hfp) | |
743 { | |
744 if (hfp->format.format == vcf) | |
745 return vcf_hdr_read(hfp); | |
746 | |
747 BGZF *fp = hfp->fp.bgzf; | |
748 uint8_t magic[5]; | |
749 bcf_hdr_t *h; | |
750 h = bcf_hdr_init("r"); | |
751 if ( bgzf_read(fp, magic, 5)<0 ) | |
752 { | |
753 fprintf(stderr,"[%s:%d %s] Failed to read the header (reading BCF in text mode?)\n", __FILE__,__LINE__,__FUNCTION__); | |
754 return NULL; | |
755 } | |
756 if (strncmp((char*)magic, "BCF\2\2", 5) != 0) | |
757 { | |
758 if (!strncmp((char*)magic, "BCF", 3)) | |
759 fprintf(stderr,"[%s:%d %s] invalid BCF2 magic string: only BCFv2.2 is supported.\n", __FILE__,__LINE__,__FUNCTION__); | |
760 else if (hts_verbose >= 2) | |
761 fprintf(stderr, "[E::%s] invalid BCF2 magic string\n", __func__); | |
762 bcf_hdr_destroy(h); | |
763 return 0; | |
764 } | |
765 int hlen; | |
766 char *htxt; | |
767 bgzf_read(fp, &hlen, 4); | |
768 htxt = (char*)malloc(hlen); | |
769 bgzf_read(fp, htxt, hlen); | |
770 bcf_hdr_parse(h, htxt); | |
771 free(htxt); | |
772 return h; | |
773 } | |
774 | |
775 int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h) | |
776 { | |
777 if ( h->dirty ) bcf_hdr_sync(h); | |
778 if (hfp->format.format == vcf || hfp->format.format == text_format) | |
779 return vcf_hdr_write(hfp, h); | |
780 | |
781 int hlen; | |
782 char *htxt = bcf_hdr_fmt_text(h, 1, &hlen); | |
783 hlen++; // include the \0 byte | |
784 | |
785 BGZF *fp = hfp->fp.bgzf; | |
786 if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1; | |
787 if ( bgzf_write(fp, &hlen, 4) !=4 ) return -1; | |
788 if ( bgzf_write(fp, htxt, hlen) != hlen ) return -1; | |
789 | |
790 free(htxt); | |
791 return 0; | |
792 } | |
793 | |
794 /******************** | |
795 *** BCF site I/O *** | |
796 ********************/ | |
797 | |
798 bcf1_t *bcf_init1() | |
799 { | |
800 bcf1_t *v; | |
801 v = (bcf1_t*)calloc(1, sizeof(bcf1_t)); | |
802 return v; | |
803 } | |
804 | |
805 void bcf_clear(bcf1_t *v) | |
806 { | |
807 int i; | |
808 for (i=0; i<v->d.m_info; i++) | |
809 { | |
810 if ( v->d.info[i].vptr_free ) | |
811 { | |
812 free(v->d.info[i].vptr - v->d.info[i].vptr_off); | |
813 v->d.info[i].vptr_free = 0; | |
814 } | |
815 } | |
816 for (i=0; i<v->d.m_fmt; i++) | |
817 { | |
818 if ( v->d.fmt[i].p_free ) | |
819 { | |
820 free(v->d.fmt[i].p - v->d.fmt[i].p_off); | |
821 v->d.fmt[i].p_free = 0; | |
822 } | |
823 } | |
824 v->rid = v->pos = v->rlen = v->unpacked = 0; | |
825 bcf_float_set_missing(v->qual); | |
826 v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0; | |
827 v->shared.l = v->indiv.l = 0; | |
828 v->d.var_type = -1; | |
829 v->d.shared_dirty = 0; | |
830 v->d.indiv_dirty = 0; | |
831 v->d.n_flt = 0; | |
832 v->errcode = 0; | |
833 if (v->d.m_als) v->d.als[0] = 0; | |
834 if (v->d.m_id) v->d.id[0] = 0; | |
835 } | |
836 | |
837 void bcf_empty1(bcf1_t *v) | |
838 { | |
839 bcf_clear1(v); | |
840 free(v->d.id); | |
841 free(v->d.als); | |
842 free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt); | |
843 if (v->d.var ) free(v->d.var); | |
844 free(v->shared.s); free(v->indiv.s); | |
845 } | |
846 | |
847 void bcf_destroy1(bcf1_t *v) | |
848 { | |
849 bcf_empty1(v); | |
850 free(v); | |
851 } | |
852 | |
853 static inline int bcf_read1_core(BGZF *fp, bcf1_t *v) | |
854 { | |
855 uint32_t x[8]; | |
856 int ret; | |
857 if ((ret = bgzf_read(fp, x, 32)) != 32) { | |
858 if (ret == 0) return -1; | |
859 return -2; | |
860 } | |
861 bcf_clear1(v); | |
862 x[0] -= 24; // to exclude six 32-bit integers | |
863 ks_resize(&v->shared, x[0]); | |
864 ks_resize(&v->indiv, x[1]); | |
865 memcpy(v, x + 2, 16); | |
866 v->n_allele = x[6]>>16; v->n_info = x[6]&0xffff; | |
867 v->n_fmt = x[7]>>24; v->n_sample = x[7]&0xffffff; | |
868 v->shared.l = x[0], v->indiv.l = x[1]; | |
869 | |
870 // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4 | |
871 if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0; | |
872 | |
873 bgzf_read(fp, v->shared.s, v->shared.l); | |
874 bgzf_read(fp, v->indiv.s, v->indiv.l); | |
875 return 0; | |
876 } | |
877 | |
878 #define bit_array_size(n) ((n)/8+1) | |
879 #define bit_array_set(a,i) ((a)[(i)/8] |= 1 << ((i)%8)) | |
880 #define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8))) | |
881 #define bit_array_test(a,i) ((a)[(i)/8] & (1 << ((i)%8))) | |
882 | |
883 static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt); | |
884 int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec) | |
885 { | |
886 if ( !hdr->keep_samples ) return 0; | |
887 if ( !bcf_hdr_nsamples(hdr) ) | |
888 { | |
889 rec->indiv.l = rec->n_sample = 0; | |
890 return 0; | |
891 } | |
892 | |
893 int i, j; | |
894 uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src; | |
895 bcf_dec_t *dec = &rec->d; | |
896 hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt); | |
897 for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0; | |
898 | |
899 for (i=0; i<rec->n_fmt; i++) | |
900 { | |
901 ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]); | |
902 src = dec->fmt[i].p - dec->fmt[i].size; | |
903 if ( dst ) | |
904 { | |
905 memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off); | |
906 dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off; | |
907 } | |
908 dst = dec->fmt[i].p; | |
909 for (j=0; j<hdr->nsamples_ori; j++) | |
910 { | |
911 src += dec->fmt[i].size; | |
912 if ( !bit_array_test(hdr->keep_samples,j) ) continue; | |
913 memmove(dst, src, dec->fmt[i].size); | |
914 dst += dec->fmt[i].size; | |
915 } | |
916 rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p); | |
917 dec->fmt[i].p_len = dst - dec->fmt[i].p; | |
918 } | |
919 rec->unpacked |= BCF_UN_FMT; | |
920 | |
921 rec->n_sample = bcf_hdr_nsamples(hdr); | |
922 return 0; | |
923 } | |
924 | |
925 int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) | |
926 { | |
927 if (fp->format.format == vcf) return vcf_read(fp,h,v); | |
928 int ret = bcf_read1_core(fp->fp.bgzf, v); | |
929 if ( ret!=0 || !h->keep_samples ) return ret; | |
930 return bcf_subset_format(h,v); | |
931 } | |
932 | |
933 int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, int *beg, int *end) | |
934 { | |
935 bcf1_t *v = (bcf1_t *) vv; | |
936 int ret; | |
937 if ((ret = bcf_read1_core(fp, v)) >= 0) | |
938 *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen; | |
939 return ret; | |
940 } | |
941 | |
942 static inline void bcf1_sync_id(bcf1_t *line, kstring_t *str) | |
943 { | |
944 // single typed string | |
945 if ( line->d.id && strcmp(line->d.id, ".") ) bcf_enc_vchar(str, strlen(line->d.id), line->d.id); | |
946 else bcf_enc_size(str, 0, BCF_BT_CHAR); | |
947 } | |
948 static inline void bcf1_sync_alleles(bcf1_t *line, kstring_t *str) | |
949 { | |
950 // list of typed strings | |
951 int i; | |
952 for (i=0; i<line->n_allele; i++) | |
953 bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]); | |
954 if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]); | |
955 } | |
956 static inline void bcf1_sync_filter(bcf1_t *line, kstring_t *str) | |
957 { | |
958 // typed vector of integers | |
959 if ( line->d.n_flt ) bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1); | |
960 else bcf_enc_vint(str, 0, 0, -1); | |
961 } | |
962 | |
963 static inline void bcf1_sync_info(bcf1_t *line, kstring_t *str) | |
964 { | |
965 // pairs of typed vectors | |
966 int i, irm = -1; | |
967 for (i=0; i<line->n_info; i++) | |
968 { | |
969 bcf_info_t *info = &line->d.info[i]; | |
970 if ( !info->vptr ) | |
971 { | |
972 // marked for removal | |
973 if ( irm < 0 ) irm = i; | |
974 continue; | |
975 } | |
976 kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str); | |
977 if ( irm >=0 ) | |
978 { | |
979 bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp; | |
980 while ( irm<=i && line->d.info[irm].vptr ) irm++; | |
981 } | |
982 } | |
983 if ( irm>=0 ) line->n_info = irm; | |
984 } | |
985 | |
986 static int bcf1_sync(bcf1_t *line) | |
987 { | |
988 char *shared_ori = line->shared.s; | |
989 size_t prev_len; | |
990 | |
991 kstring_t tmp = {0,0,0}; | |
992 if ( !line->shared.l ) | |
993 { | |
994 // New line created via API, BCF data blocks do not exist. Get it ready for BCF output | |
995 tmp = line->shared; | |
996 bcf1_sync_id(line, &tmp); | |
997 line->unpack_size[0] = tmp.l; prev_len = tmp.l; | |
998 | |
999 bcf1_sync_alleles(line, &tmp); | |
1000 line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l; | |
1001 | |
1002 bcf1_sync_filter(line, &tmp); | |
1003 line->unpack_size[2] = tmp.l - prev_len; | |
1004 | |
1005 bcf1_sync_info(line, &tmp); | |
1006 line->shared = tmp; | |
1007 } | |
1008 else if ( line->d.shared_dirty ) | |
1009 { | |
1010 // The line was edited, update the BCF data block, ptr_ori points | |
1011 // to the original unchanged BCF data. | |
1012 uint8_t *ptr_ori = (uint8_t *) line->shared.s; | |
1013 | |
1014 assert( line->unpacked & BCF_UN_STR ); | |
1015 | |
1016 // ID: single typed string | |
1017 if ( line->d.shared_dirty & BCF1_DIRTY_ID ) | |
1018 bcf1_sync_id(line, &tmp); | |
1019 else | |
1020 kputsn_(ptr_ori, line->unpack_size[0], &tmp); | |
1021 ptr_ori += line->unpack_size[0]; | |
1022 line->unpack_size[0] = tmp.l; prev_len = tmp.l; | |
1023 | |
1024 // REF+ALT: list of typed strings | |
1025 if ( line->d.shared_dirty & BCF1_DIRTY_ALS ) | |
1026 bcf1_sync_alleles(line, &tmp); | |
1027 else | |
1028 { | |
1029 kputsn_(ptr_ori, line->unpack_size[1], &tmp); | |
1030 if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]); | |
1031 } | |
1032 ptr_ori += line->unpack_size[1]; | |
1033 line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l; | |
1034 | |
1035 if ( line->unpacked & BCF_UN_FLT ) | |
1036 { | |
1037 // FILTER: typed vector of integers | |
1038 if ( line->d.shared_dirty & BCF1_DIRTY_FLT ) | |
1039 bcf1_sync_filter(line, &tmp); | |
1040 else if ( line->d.n_flt ) | |
1041 kputsn_(ptr_ori, line->unpack_size[2], &tmp); | |
1042 else | |
1043 bcf_enc_vint(&tmp, 0, 0, -1); | |
1044 ptr_ori += line->unpack_size[2]; | |
1045 line->unpack_size[2] = tmp.l - prev_len; | |
1046 | |
1047 if ( line->unpacked & BCF_UN_INFO ) | |
1048 { | |
1049 // INFO: pairs of typed vectors | |
1050 if ( line->d.shared_dirty & BCF1_DIRTY_INF ) | |
1051 { | |
1052 bcf1_sync_info(line, &tmp); | |
1053 ptr_ori = (uint8_t*)line->shared.s + line->shared.l; | |
1054 } | |
1055 } | |
1056 } | |
1057 | |
1058 int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s; | |
1059 if ( size ) kputsn_(ptr_ori, size, &tmp); | |
1060 | |
1061 free(line->shared.s); | |
1062 line->shared = tmp; | |
1063 } | |
1064 if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO ) | |
1065 { | |
1066 // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers | |
1067 size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2]; | |
1068 int i; | |
1069 for (i=0; i<line->n_info; i++) | |
1070 { | |
1071 uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL; | |
1072 line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off; | |
1073 off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off; | |
1074 if ( vptr_free ) | |
1075 { | |
1076 free(vptr_free); | |
1077 line->d.info[i].vptr_free = 0; | |
1078 } | |
1079 } | |
1080 } | |
1081 | |
1082 if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) ) | |
1083 { | |
1084 // The genotype fields changed or are not present | |
1085 tmp.l = tmp.m = 0; tmp.s = NULL; | |
1086 int i, irm = -1; | |
1087 for (i=0; i<line->n_fmt; i++) | |
1088 { | |
1089 bcf_fmt_t *fmt = &line->d.fmt[i]; | |
1090 if ( !fmt->p ) | |
1091 { | |
1092 // marked for removal | |
1093 if ( irm < 0 ) irm = i; | |
1094 continue; | |
1095 } | |
1096 kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp); | |
1097 if ( irm >=0 ) | |
1098 { | |
1099 bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt; | |
1100 while ( irm<=i && line->d.fmt[irm].p ) irm++; | |
1101 } | |
1102 | |
1103 } | |
1104 if ( irm>=0 ) line->n_fmt = irm; | |
1105 free(line->indiv.s); | |
1106 line->indiv = tmp; | |
1107 | |
1108 // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers | |
1109 size_t off_new = 0; | |
1110 for (i=0; i<line->n_fmt; i++) | |
1111 { | |
1112 uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL; | |
1113 line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off; | |
1114 off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off; | |
1115 if ( p_free ) | |
1116 { | |
1117 free(p_free); | |
1118 line->d.fmt[i].p_free = 0; | |
1119 } | |
1120 } | |
1121 } | |
1122 if ( !line->n_sample ) line->n_fmt = 0; | |
1123 line->d.shared_dirty = line->d.indiv_dirty = 0; | |
1124 return 0; | |
1125 } | |
1126 | |
1127 bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src) | |
1128 { | |
1129 bcf1_sync(src); | |
1130 | |
1131 bcf_clear(dst); | |
1132 dst->rid = src->rid; | |
1133 dst->pos = src->pos; | |
1134 dst->rlen = src->rlen; | |
1135 dst->qual = src->qual; | |
1136 dst->n_info = src->n_info; dst->n_allele = src->n_allele; | |
1137 dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample; | |
1138 | |
1139 dst->shared.m = dst->shared.l = src->shared.l; | |
1140 dst->shared.s = (char*) malloc(dst->shared.l); | |
1141 memcpy(dst->shared.s,src->shared.s,dst->shared.l); | |
1142 | |
1143 dst->indiv.m = dst->indiv.l = src->indiv.l; | |
1144 dst->indiv.s = (char*) malloc(dst->indiv.l); | |
1145 memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l); | |
1146 | |
1147 return dst; | |
1148 } | |
1149 bcf1_t *bcf_dup(bcf1_t *src) | |
1150 { | |
1151 bcf1_t *out = bcf_init1(); | |
1152 return bcf_copy(out, src); | |
1153 } | |
1154 | |
1155 int bcf_write(htsFile *hfp, const bcf_hdr_t *h, bcf1_t *v) | |
1156 { | |
1157 if ( h->dirty ) | |
1158 { | |
1159 // we could as well call bcf_hdr_sync here, not sure | |
1160 fprintf(stderr,"FIXME: dirty header not synced\n"); | |
1161 exit(1); | |
1162 } | |
1163 if ( bcf_hdr_nsamples(h)!=v->n_sample ) | |
1164 { | |
1165 fprintf(stderr,"[%s:%d %s] Broken VCF record, the number of columns at %s:%d does not match the number of samples (%d vs %d).\n", | |
1166 __FILE__,__LINE__,__FUNCTION__,bcf_seqname(h,v),v->pos+1, v->n_sample,bcf_hdr_nsamples(h)); | |
1167 return -1; | |
1168 } | |
1169 | |
1170 if ( hfp->format.format == vcf || hfp->format.format == text_format ) | |
1171 return vcf_write(hfp,h,v); | |
1172 | |
1173 if ( v->errcode ) | |
1174 { | |
1175 // vcf_parse1() encountered a new contig or tag, undeclared in the | |
1176 // header. At this point, the header must have been printed, | |
1177 // proceeding would lead to a broken BCF file. Errors must be checked | |
1178 // and cleared by the caller before we can proceed. | |
1179 fprintf(stderr,"[%s:%d %s] Unchecked error (%d), exiting.\n", __FILE__,__LINE__,__FUNCTION__,v->errcode); | |
1180 exit(1); | |
1181 } | |
1182 bcf1_sync(v); // check if the BCF record was modified | |
1183 | |
1184 BGZF *fp = hfp->fp.bgzf; | |
1185 uint32_t x[8]; | |
1186 x[0] = v->shared.l + 24; // to include six 32-bit integers | |
1187 x[1] = v->indiv.l; | |
1188 memcpy(x + 2, v, 16); | |
1189 x[6] = (uint32_t)v->n_allele<<16 | v->n_info; | |
1190 x[7] = (uint32_t)v->n_fmt<<24 | v->n_sample; | |
1191 if ( bgzf_write(fp, x, 32) != 32 ) return -1; | |
1192 if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1; | |
1193 if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1; | |
1194 return 0; | |
1195 } | |
1196 | |
1197 /********************** | |
1198 *** VCF header I/O *** | |
1199 **********************/ | |
1200 | |
1201 bcf_hdr_t *vcf_hdr_read(htsFile *fp) | |
1202 { | |
1203 kstring_t txt, *s = &fp->line; | |
1204 bcf_hdr_t *h; | |
1205 h = bcf_hdr_init("r"); | |
1206 txt.l = txt.m = 0; txt.s = 0; | |
1207 while (hts_getline(fp, KS_SEP_LINE, s) >= 0) { | |
1208 if (s->l == 0) continue; | |
1209 if (s->s[0] != '#') { | |
1210 if (hts_verbose >= 2) | |
1211 fprintf(stderr, "[E::%s] no sample line\n", __func__); | |
1212 free(txt.s); | |
1213 bcf_hdr_destroy(h); | |
1214 return 0; | |
1215 } | |
1216 if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here | |
1217 int dret; | |
1218 gzFile f; | |
1219 kstream_t *ks; | |
1220 kstring_t tmp; | |
1221 tmp.l = tmp.m = 0; tmp.s = 0; | |
1222 f = gzopen(fp->fn_aux, "r"); | |
1223 ks = ks_init(f); | |
1224 while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) { | |
1225 int c; | |
1226 kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt); | |
1227 ks_getuntil(ks, 0, &tmp, &dret); | |
1228 kputs(",length=", &txt); kputw(atol(tmp.s), &txt); | |
1229 kputsn(">\n", 2, &txt); | |
1230 if (dret != '\n') | |
1231 while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line | |
1232 } | |
1233 free(tmp.s); | |
1234 ks_destroy(ks); | |
1235 gzclose(f); | |
1236 } | |
1237 kputsn(s->s, s->l, &txt); | |
1238 kputc('\n', &txt); | |
1239 if (s->s[1] != '#') break; | |
1240 } | |
1241 if ( !txt.s ) | |
1242 { | |
1243 fprintf(stderr,"[%s:%d %s] Could not read the header\n", __FILE__,__LINE__,__FUNCTION__); | |
1244 return NULL; | |
1245 } | |
1246 bcf_hdr_parse(h, txt.s); | |
1247 | |
1248 // check tabix index, are all contigs listed in the header? add the missing ones | |
1249 tbx_t *idx = tbx_index_load(fp->fn); | |
1250 if ( idx ) | |
1251 { | |
1252 int i, n, need_sync = 0; | |
1253 const char **names = tbx_seqnames(idx, &n); | |
1254 for (i=0; i<n; i++) | |
1255 { | |
1256 bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL); | |
1257 if ( hrec ) continue; | |
1258 hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t)); | |
1259 hrec->key = strdup("contig"); | |
1260 bcf_hrec_add_key(hrec, "ID", strlen("ID")); | |
1261 bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0); | |
1262 bcf_hdr_add_hrec(h, hrec); | |
1263 need_sync = 1; | |
1264 } | |
1265 free(names); | |
1266 tbx_destroy(idx); | |
1267 if ( need_sync ) | |
1268 bcf_hdr_sync(h); | |
1269 } | |
1270 free(txt.s); | |
1271 return h; | |
1272 } | |
1273 | |
1274 int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname) | |
1275 { | |
1276 int i, n; | |
1277 char **lines = hts_readlines(fname, &n); | |
1278 if ( !lines ) return 1; | |
1279 for (i=0; i<n-1; i++) | |
1280 { | |
1281 int k; | |
1282 bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k); | |
1283 if ( hrec ) bcf_hdr_add_hrec(hdr, hrec); | |
1284 free(lines[i]); | |
1285 } | |
1286 bcf_hdr_parse_sample_line(hdr,lines[n-1]); | |
1287 free(lines[n-1]); | |
1288 free(lines); | |
1289 bcf_hdr_sync(hdr); | |
1290 return 0; | |
1291 } | |
1292 | |
1293 static void _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str) | |
1294 { | |
1295 if ( !hrec->value ) | |
1296 { | |
1297 int j, nout = 0; | |
1298 ksprintf(str, "##%s=<", hrec->key); | |
1299 for (j=0; j<hrec->nkeys; j++) | |
1300 { | |
1301 // do not output IDX if output is VCF | |
1302 if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue; | |
1303 if ( nout ) kputc(',',str); | |
1304 ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]); | |
1305 nout++; | |
1306 } | |
1307 ksprintf(str,">\n"); | |
1308 } | |
1309 else | |
1310 ksprintf(str,"##%s=%s\n", hrec->key,hrec->value); | |
1311 } | |
1312 | |
1313 void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str) | |
1314 { | |
1315 _bcf_hrec_format(hrec,0,str); | |
1316 } | |
1317 char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len) | |
1318 { | |
1319 int i; | |
1320 kstring_t txt = {0,0,0}; | |
1321 for (i=0; i<hdr->nhrec; i++) | |
1322 _bcf_hrec_format(hdr->hrec[i], is_bcf, &txt); | |
1323 | |
1324 ksprintf(&txt,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); | |
1325 if ( bcf_hdr_nsamples(hdr) ) | |
1326 { | |
1327 ksprintf(&txt,"\tFORMAT"); | |
1328 for (i=0; i<bcf_hdr_nsamples(hdr); i++) | |
1329 ksprintf(&txt,"\t%s", hdr->samples[i]); | |
1330 } | |
1331 ksprintf(&txt,"\n"); | |
1332 | |
1333 if ( len ) *len = txt.l; | |
1334 return txt.s; | |
1335 } | |
1336 | |
1337 const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n) | |
1338 { | |
1339 vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG]; | |
1340 int tid, m = kh_size(d); | |
1341 const char **names = (const char**) calloc(m,sizeof(const char*)); | |
1342 khint_t k; | |
1343 for (k=kh_begin(d); k<kh_end(d); k++) | |
1344 { | |
1345 if ( !kh_exist(d,k) ) continue; | |
1346 tid = kh_val(d,k).id; | |
1347 assert( tid<m ); | |
1348 names[tid] = kh_key(d,k); | |
1349 } | |
1350 // sanity check: there should be no gaps | |
1351 for (tid=0; tid<m; tid++) | |
1352 assert(names[tid]); | |
1353 *n = m; | |
1354 return names; | |
1355 } | |
1356 | |
1357 int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h) | |
1358 { | |
1359 int hlen; | |
1360 char *htxt = bcf_hdr_fmt_text(h, 0, &hlen); | |
1361 while (hlen && htxt[hlen-1] == 0) --hlen; // kill trailing zeros | |
1362 int ret; | |
1363 if ( fp->format.compression!=no_compression ) | |
1364 ret = bgzf_write(fp->fp.bgzf, htxt, hlen); | |
1365 else | |
1366 ret = hwrite(fp->fp.hfile, htxt, hlen); | |
1367 free(htxt); | |
1368 return ret<0 ? -1 : 0; | |
1369 } | |
1370 | |
1371 /*********************** | |
1372 *** Typed value I/O *** | |
1373 ***********************/ | |
1374 | |
1375 void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) | |
1376 { | |
1377 int32_t max = INT32_MIN + 1, min = INT32_MAX; | |
1378 int i; | |
1379 if (n == 0) bcf_enc_size(s, 0, BCF_BT_NULL); | |
1380 else if (n == 1) bcf_enc_int1(s, a[0]); | |
1381 else { | |
1382 if (wsize <= 0) wsize = n; | |
1383 for (i = 0; i < n; ++i) { | |
1384 if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) continue; | |
1385 if (max < a[i]) max = a[i]; | |
1386 if (min > a[i]) min = a[i]; | |
1387 } | |
1388 if (max <= INT8_MAX && min > bcf_int8_vector_end) { | |
1389 bcf_enc_size(s, wsize, BCF_BT_INT8); | |
1390 for (i = 0; i < n; ++i) | |
1391 if ( a[i]==bcf_int32_vector_end ) kputc(bcf_int8_vector_end, s); | |
1392 else if ( a[i]==bcf_int32_missing ) kputc(bcf_int8_missing, s); | |
1393 else kputc(a[i], s); | |
1394 } else if (max <= INT16_MAX && min > bcf_int16_vector_end) { | |
1395 bcf_enc_size(s, wsize, BCF_BT_INT16); | |
1396 for (i = 0; i < n; ++i) | |
1397 { | |
1398 int16_t x; | |
1399 if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end; | |
1400 else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing; | |
1401 else x = a[i]; | |
1402 kputsn((char*)&x, 2, s); | |
1403 } | |
1404 } else { | |
1405 bcf_enc_size(s, wsize, BCF_BT_INT32); | |
1406 for (i = 0; i < n; ++i) { | |
1407 int32_t x = a[i]; | |
1408 kputsn((char*)&x, 4, s); | |
1409 } | |
1410 } | |
1411 } | |
1412 } | |
1413 | |
1414 void bcf_enc_vfloat(kstring_t *s, int n, float *a) | |
1415 { | |
1416 bcf_enc_size(s, n, BCF_BT_FLOAT); | |
1417 kputsn((char*)a, n << 2, s); | |
1418 } | |
1419 | |
1420 void bcf_enc_vchar(kstring_t *s, int l, const char *a) | |
1421 { | |
1422 bcf_enc_size(s, l, BCF_BT_CHAR); | |
1423 kputsn(a, l, s); | |
1424 } | |
1425 | |
1426 void bcf_fmt_array(kstring_t *s, int n, int type, void *data) | |
1427 { | |
1428 int j = 0; | |
1429 if (n == 0) { | |
1430 kputc('.', s); | |
1431 return; | |
1432 } | |
1433 if (type == BCF_BT_CHAR) | |
1434 { | |
1435 char *p = (char*)data; | |
1436 for (j = 0; j < n && *p; ++j, ++p) | |
1437 { | |
1438 if ( *p==bcf_str_missing ) kputc('.', s); | |
1439 else kputc(*p, s); | |
1440 } | |
1441 } | |
1442 else | |
1443 { | |
1444 #define BRANCH(type_t, is_missing, is_vector_end, kprint) { \ | |
1445 type_t *p = (type_t *) data; \ | |
1446 for (j=0; j<n; j++) \ | |
1447 { \ | |
1448 if ( is_vector_end ) break; \ | |
1449 if ( j ) kputc(',', s); \ | |
1450 if ( is_missing ) kputc('.', s); \ | |
1451 else kprint; \ | |
1452 } \ | |
1453 } | |
1454 switch (type) { | |
1455 case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, kputw(p[j], s)); break; | |
1456 case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, kputw(p[j], s)); break; | |
1457 case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, kputw(p[j], s)); break; | |
1458 case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), ksprintf(s, "%g", p[j])); break; | |
1459 default: fprintf(stderr,"todo: type %d\n", type); exit(1); break; | |
1460 } | |
1461 #undef BRANCH | |
1462 } | |
1463 } | |
1464 | |
1465 uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr) | |
1466 { | |
1467 int x, type; | |
1468 x = bcf_dec_size(ptr, &ptr, &type); | |
1469 bcf_fmt_array(s, x, type, ptr); | |
1470 return ptr + (x << bcf_type_shift[type]); | |
1471 } | |
1472 | |
1473 /******************** | |
1474 *** VCF site I/O *** | |
1475 ********************/ | |
1476 | |
1477 typedef struct { | |
1478 int key, max_m, size, offset; | |
1479 uint32_t is_gt:1, max_g:15, max_l:16; | |
1480 uint32_t y; | |
1481 uint8_t *buf; | |
1482 } fmt_aux_t; | |
1483 | |
1484 static inline void align_mem(kstring_t *s) | |
1485 { | |
1486 if (s->l&7) { | |
1487 uint64_t zero = 0; | |
1488 int l = ((s->l + 7)>>3<<3) - s->l; | |
1489 kputsn((char*)&zero, l, s); | |
1490 } | |
1491 } | |
1492 | |
1493 // p,q is the start and the end of the FORMAT field | |
1494 int _vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) | |
1495 { | |
1496 if ( !bcf_hdr_nsamples(h) ) return 0; | |
1497 | |
1498 char *r, *t; | |
1499 int j, l, m, g; | |
1500 khint_t k; | |
1501 ks_tokaux_t aux1; | |
1502 vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; | |
1503 kstring_t *mem = (kstring_t*)&h->mem; | |
1504 mem->l = 0; | |
1505 | |
1506 // count the number of format fields | |
1507 for (r = p, v->n_fmt = 1; *r; ++r) | |
1508 if (*r == ':') ++v->n_fmt; | |
1509 char *end = s->s + s->l; | |
1510 if ( q>=end ) | |
1511 { | |
1512 fprintf(stderr,"[%s:%d %s] Error: FORMAT column with no sample columns starting at %s:%d\n", __FILE__,__LINE__,__FUNCTION__,s->s,v->pos+1); | |
1513 return -1; | |
1514 } | |
1515 | |
1516 fmt_aux_t *fmt = (fmt_aux_t*)alloca(v->n_fmt * sizeof(fmt_aux_t)); | |
1517 // get format information from the dictionary | |
1518 for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) { | |
1519 *(char*)aux1.p = 0; | |
1520 k = kh_get(vdict, d, t); | |
1521 if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) { | |
1522 fprintf(stderr, "[W::%s] FORMAT '%s' is not defined in the header, assuming Type=String\n", __func__, t); | |
1523 kstring_t tmp = {0,0,0}; | |
1524 int l; | |
1525 ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t); | |
1526 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l); | |
1527 free(tmp.s); | |
1528 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h); | |
1529 k = kh_get(vdict, d, t); | |
1530 v->errcode = BCF_ERR_TAG_UNDEF; | |
1531 } | |
1532 fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0; | |
1533 fmt[j].key = kh_val(d, k).id; | |
1534 fmt[j].is_gt = !strcmp(t, "GT"); | |
1535 fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT]; | |
1536 } | |
1537 // compute max | |
1538 int n_sample_ori = -1; | |
1539 r = q + 1; // r: position in the format string | |
1540 m = l = g = 1, v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles | |
1541 while ( r<end ) | |
1542 { | |
1543 // can we skip some samples? | |
1544 if ( h->keep_samples ) | |
1545 { | |
1546 n_sample_ori++; | |
1547 if ( !bit_array_test(h->keep_samples,n_sample_ori) ) | |
1548 { | |
1549 while ( *r!='\t' && r<end ) r++; | |
1550 if ( *r=='\t' ) { *r = 0; r++; } | |
1551 continue; | |
1552 } | |
1553 } | |
1554 | |
1555 // collect fmt stats: max vector size, length, number of alleles | |
1556 j = 0; // j-th format field | |
1557 for (;;) | |
1558 { | |
1559 if ( *r == '\t' ) *r = 0; | |
1560 if ( *r == ':' || !*r ) // end of field or end of sample | |
1561 { | |
1562 if (fmt[j].max_m < m) fmt[j].max_m = m; | |
1563 if (fmt[j].max_l < l - 1) fmt[j].max_l = l - 1; | |
1564 if (fmt[j].is_gt && fmt[j].max_g < g) fmt[j].max_g = g; | |
1565 l = 0, m = g = 1; | |
1566 if ( *r==':' ) | |
1567 { | |
1568 j++; | |
1569 if ( j>=v->n_fmt ) | |
1570 { | |
1571 fprintf(stderr,"Incorrect number of FORMAT fields at %s:%d\n", h->id[BCF_DT_CTG][v->rid].key,v->pos+1); | |
1572 exit(1); | |
1573 } | |
1574 } | |
1575 else break; | |
1576 } | |
1577 else if ( *r== ',' ) m++; | |
1578 else if ( fmt[j].is_gt && (*r == '|' || *r == '/') ) g++; | |
1579 if ( r>=end ) break; | |
1580 r++; l++; | |
1581 } | |
1582 v->n_sample++; | |
1583 if ( v->n_sample == bcf_hdr_nsamples(h) ) break; | |
1584 r++; | |
1585 } | |
1586 | |
1587 // allocate memory for arrays | |
1588 for (j = 0; j < v->n_fmt; ++j) { | |
1589 fmt_aux_t *f = &fmt[j]; | |
1590 if ( !f->max_m ) f->max_m = 1; // omitted trailing format field | |
1591 if ((f->y>>4&0xf) == BCF_HT_STR) { | |
1592 f->size = f->is_gt? f->max_g << 2 : f->max_l; | |
1593 } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) { | |
1594 f->size = f->max_m << 2; | |
1595 } else | |
1596 { | |
1597 fprintf(stderr, "[E::%s] the format type %d currently not supported\n", __func__, f->y>>4&0xf); | |
1598 abort(); // I do not know how to do with Flag in the genotype fields | |
1599 } | |
1600 align_mem(mem); | |
1601 f->offset = mem->l; | |
1602 ks_resize(mem, mem->l + v->n_sample * f->size); | |
1603 mem->l += v->n_sample * f->size; | |
1604 } | |
1605 for (j = 0; j < v->n_fmt; ++j) | |
1606 fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset; | |
1607 // fill the sample fields; at beginning of the loop, t points to the first char of a format | |
1608 n_sample_ori = -1; | |
1609 t = q + 1; m = 0; // m: sample id | |
1610 while ( t<end ) | |
1611 { | |
1612 // can we skip some samples? | |
1613 if ( h->keep_samples ) | |
1614 { | |
1615 n_sample_ori++; | |
1616 if ( !bit_array_test(h->keep_samples,n_sample_ori) ) | |
1617 { | |
1618 while ( *t && t<end ) t++; | |
1619 t++; | |
1620 continue; | |
1621 } | |
1622 } | |
1623 if ( m == bcf_hdr_nsamples(h) ) break; | |
1624 | |
1625 j = 0; // j-th format field, m-th sample | |
1626 while ( *t ) | |
1627 { | |
1628 fmt_aux_t *z = &fmt[j]; | |
1629 if ((z->y>>4&0xf) == BCF_HT_STR) { | |
1630 if (z->is_gt) { // genotypes | |
1631 int32_t is_phased = 0, *x = (int32_t*)(z->buf + z->size * m); | |
1632 for (l = 0;; ++t) { | |
1633 if (*t == '.') ++t, x[l++] = is_phased; | |
1634 else x[l++] = (strtol(t, &t, 10) + 1) << 1 | is_phased; | |
1635 #if THOROUGH_SANITY_CHECKS | |
1636 assert( 0 ); // success of strtol,strtod not checked | |
1637 #endif | |
1638 is_phased = (*t == '|'); | |
1639 if (*t == ':' || *t == 0) break; | |
1640 } | |
1641 if ( !l ) x[l++] = 0; // An empty field, insert missing value | |
1642 for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; | |
1643 } else { | |
1644 char *x = (char*)z->buf + z->size * m; | |
1645 for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t; | |
1646 for (; l < z->size; ++l) x[l] = 0; | |
1647 } | |
1648 } else if ((z->y>>4&0xf) == BCF_HT_INT) { | |
1649 int32_t *x = (int32_t*)(z->buf + z->size * m); | |
1650 for (l = 0;; ++t) { | |
1651 if (*t == '.') x[l++] = bcf_int32_missing, ++t; // ++t to skip "." | |
1652 else x[l++] = strtol(t, &t, 10); | |
1653 if (*t == ':' || *t == 0) break; | |
1654 } | |
1655 if ( !l ) x[l++] = bcf_int32_missing; | |
1656 for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; | |
1657 } else if ((z->y>>4&0xf) == BCF_HT_REAL) { | |
1658 float *x = (float*)(z->buf + z->size * m); | |
1659 for (l = 0;; ++t) { | |
1660 if (*t == '.' && !isdigit(t[1])) bcf_float_set_missing(x[l++]), ++t; // ++t to skip "." | |
1661 else x[l++] = strtod(t, &t); | |
1662 if (*t == ':' || *t == 0) break; | |
1663 } | |
1664 if ( !l ) bcf_float_set_missing(x[l++]); // An empty field, insert missing value | |
1665 for (; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]); | |
1666 } else abort(); | |
1667 if (*t == 0) { | |
1668 for (++j; j < v->n_fmt; ++j) { // fill end-of-vector values | |
1669 z = &fmt[j]; | |
1670 if ((z->y>>4&0xf) == BCF_HT_STR) { | |
1671 if (z->is_gt) { | |
1672 int32_t *x = (int32_t*)(z->buf + z->size * m); | |
1673 x[0] = bcf_int32_missing; | |
1674 for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; | |
1675 } else { | |
1676 char *x = (char*)z->buf + z->size * m; | |
1677 if ( z->size ) x[0] = '.'; | |
1678 for (l = 1; l < z->size; ++l) x[l] = 0; | |
1679 } | |
1680 } else if ((z->y>>4&0xf) == BCF_HT_INT) { | |
1681 int32_t *x = (int32_t*)(z->buf + z->size * m); | |
1682 x[0] = bcf_int32_missing; | |
1683 for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; | |
1684 } else if ((z->y>>4&0xf) == BCF_HT_REAL) { | |
1685 float *x = (float*)(z->buf + z->size * m); | |
1686 bcf_float_set_missing(x[0]); | |
1687 for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]); | |
1688 } | |
1689 } | |
1690 break; | |
1691 } | |
1692 else | |
1693 { | |
1694 if (*t == ':') ++j; | |
1695 t++; | |
1696 } | |
1697 } | |
1698 m++; t++; | |
1699 } | |
1700 | |
1701 // write individual genotype information | |
1702 kstring_t *str = &v->indiv; | |
1703 int i; | |
1704 if (v->n_sample > 0) { | |
1705 for (i = 0; i < v->n_fmt; ++i) { | |
1706 fmt_aux_t *z = &fmt[i]; | |
1707 bcf_enc_int1(str, z->key); | |
1708 if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) { | |
1709 bcf_enc_size(str, z->size, BCF_BT_CHAR); | |
1710 kputsn((char*)z->buf, z->size * v->n_sample, str); | |
1711 } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) { | |
1712 bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2); | |
1713 } else { | |
1714 bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT); | |
1715 kputsn((char*)z->buf, z->size * v->n_sample, str); | |
1716 } | |
1717 } | |
1718 } | |
1719 | |
1720 if ( v->n_sample!=bcf_hdr_nsamples(h) ) | |
1721 { | |
1722 fprintf(stderr,"[%s:%d %s] Number of columns at %s:%d does not match the number of samples (%d vs %d).\n", | |
1723 __FILE__,__LINE__,__FUNCTION__,bcf_seqname(h,v),v->pos+1, v->n_sample,bcf_hdr_nsamples(h)); | |
1724 v->errcode |= BCF_ERR_NCOLS; | |
1725 return -1; | |
1726 } | |
1727 | |
1728 return 0; | |
1729 } | |
1730 | |
1731 int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) | |
1732 { | |
1733 int i = 0; | |
1734 char *p, *q, *r, *t; | |
1735 kstring_t *str; | |
1736 khint_t k; | |
1737 ks_tokaux_t aux; | |
1738 | |
1739 bcf_clear1(v); | |
1740 str = &v->shared; | |
1741 memset(&aux, 0, sizeof(ks_tokaux_t)); | |
1742 for (p = kstrtok(s->s, "\t", &aux), i = 0; p; p = kstrtok(0, 0, &aux), ++i) { | |
1743 q = (char*)aux.p; | |
1744 *q = 0; | |
1745 if (i == 0) { // CHROM | |
1746 vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG]; | |
1747 k = kh_get(vdict, d, p); | |
1748 if (k == kh_end(d)) | |
1749 { | |
1750 // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has | |
1751 // been already printed, but will enable tools like vcfcheck to proceed. | |
1752 fprintf(stderr, "[W::%s] contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)\n", __func__, p); | |
1753 kstring_t tmp = {0,0,0}; | |
1754 int l; | |
1755 ksprintf(&tmp, "##contig=<ID=%s>", p); | |
1756 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l); | |
1757 free(tmp.s); | |
1758 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h); | |
1759 k = kh_get(vdict, d, p); | |
1760 v->errcode = BCF_ERR_CTG_UNDEF; | |
1761 } | |
1762 v->rid = kh_val(d, k).id; | |
1763 } else if (i == 1) { // POS | |
1764 v->pos = atoi(p) - 1; | |
1765 } else if (i == 2) { // ID | |
1766 if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p); | |
1767 else bcf_enc_size(str, 0, BCF_BT_CHAR); | |
1768 } else if (i == 3) { // REF | |
1769 bcf_enc_vchar(str, q - p, p); | |
1770 v->n_allele = 1, v->rlen = q - p; | |
1771 } else if (i == 4) { // ALT | |
1772 if (strcmp(p, ".")) { | |
1773 for (r = t = p;; ++r) { | |
1774 if (*r == ',' || *r == 0) { | |
1775 bcf_enc_vchar(str, r - t, t); | |
1776 t = r + 1; | |
1777 ++v->n_allele; | |
1778 } | |
1779 if (r == q) break; | |
1780 } | |
1781 } | |
1782 } else if (i == 5) { // QUAL | |
1783 if (strcmp(p, ".")) v->qual = atof(p); | |
1784 else memcpy(&v->qual, &bcf_float_missing, 4); | |
1785 if ( v->max_unpack && !(v->max_unpack>>1) ) return 0; // BCF_UN_STR | |
1786 } else if (i == 6) { // FILTER | |
1787 if (strcmp(p, ".")) { | |
1788 int32_t *a; | |
1789 int n_flt = 1, i; | |
1790 ks_tokaux_t aux1; | |
1791 vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; | |
1792 // count the number of filters | |
1793 if (*(q-1) == ';') *(q-1) = 0; | |
1794 for (r = p; *r; ++r) | |
1795 if (*r == ';') ++n_flt; | |
1796 a = (int32_t*)alloca(n_flt * sizeof(int32_t)); | |
1797 // add filters | |
1798 for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) { | |
1799 *(char*)aux1.p = 0; | |
1800 k = kh_get(vdict, d, t); | |
1801 if (k == kh_end(d)) | |
1802 { | |
1803 // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has | |
1804 // been already printed, but will enable tools like vcfcheck to proceed. | |
1805 fprintf(stderr, "[W::%s] FILTER '%s' is not defined in the header\n", __func__, t); | |
1806 kstring_t tmp = {0,0,0}; | |
1807 int l; | |
1808 ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t); | |
1809 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l); | |
1810 free(tmp.s); | |
1811 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h); | |
1812 k = kh_get(vdict, d, t); | |
1813 v->errcode = BCF_ERR_TAG_UNDEF; | |
1814 } | |
1815 a[i++] = kh_val(d, k).id; | |
1816 } | |
1817 n_flt = i; | |
1818 bcf_enc_vint(str, n_flt, a, -1); | |
1819 } else bcf_enc_vint(str, 0, 0, -1); | |
1820 if ( v->max_unpack && !(v->max_unpack>>2) ) return 0; // BCF_UN_FLT | |
1821 } else if (i == 7) { // INFO | |
1822 char *key; | |
1823 vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; | |
1824 v->n_info = 0; | |
1825 if (strcmp(p, ".")) { | |
1826 if (*(q-1) == ';') *(q-1) = 0; | |
1827 for (r = key = p;; ++r) { | |
1828 int c; | |
1829 char *val, *end; | |
1830 if (*r != ';' && *r != '=' && *r != 0) continue; | |
1831 val = end = 0; | |
1832 c = *r; *r = 0; | |
1833 if (c == '=') { | |
1834 val = r + 1; | |
1835 for (end = val; *end != ';' && *end != 0; ++end); | |
1836 c = *end; *end = 0; | |
1837 } else end = r; | |
1838 if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; } // faulty VCF, ";;" in the INFO | |
1839 k = kh_get(vdict, d, key); | |
1840 if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15) | |
1841 { | |
1842 fprintf(stderr, "[W::%s] INFO '%s' is not defined in the header, assuming Type=String\n", __func__, key); | |
1843 kstring_t tmp = {0,0,0}; | |
1844 int l; | |
1845 ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key); | |
1846 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l); | |
1847 free(tmp.s); | |
1848 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h); | |
1849 k = kh_get(vdict, d, key); | |
1850 v->errcode = BCF_ERR_TAG_UNDEF; | |
1851 } | |
1852 uint32_t y = kh_val(d, k).info[BCF_HL_INFO]; | |
1853 ++v->n_info; | |
1854 bcf_enc_int1(str, kh_val(d, k).id); | |
1855 if (val == 0) { | |
1856 bcf_enc_size(str, 0, BCF_BT_NULL); | |
1857 } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string | |
1858 bcf_enc_vchar(str, end - val, val); | |
1859 } else { // int/float value/array | |
1860 int i, n_val; | |
1861 char *t, *te; | |
1862 for (t = val, n_val = 1; *t; ++t) // count the number of values | |
1863 if (*t == ',') ++n_val; | |
1864 if ((y>>4&0xf) == BCF_HT_INT) { | |
1865 int32_t *z; | |
1866 z = (int32_t*)alloca(n_val * sizeof(int32_t)); | |
1867 for (i = 0, t = val; i < n_val; ++i, ++t) | |
1868 { | |
1869 z[i] = strtol(t, &te, 10); | |
1870 if ( te==t ) // conversion failed | |
1871 { | |
1872 z[i] = bcf_int32_missing; | |
1873 while ( *te && *te!=',' ) te++; | |
1874 } | |
1875 t = te; | |
1876 } | |
1877 bcf_enc_vint(str, n_val, z, -1); | |
1878 if (strcmp(key, "END") == 0) v->rlen = z[0] - v->pos; | |
1879 } else if ((y>>4&0xf) == BCF_HT_REAL) { | |
1880 float *z; | |
1881 z = (float*)alloca(n_val * sizeof(float)); | |
1882 for (i = 0, t = val; i < n_val; ++i, ++t) | |
1883 { | |
1884 z[i] = strtod(t, &te); | |
1885 if ( te==t ) // conversion failed | |
1886 { | |
1887 bcf_float_set_missing(z[i]); | |
1888 while ( *te && *te!=',' ) te++; | |
1889 } | |
1890 t = te; | |
1891 } | |
1892 bcf_enc_vfloat(str, n_val, z); | |
1893 } | |
1894 } | |
1895 if (c == 0) break; | |
1896 r = end; | |
1897 key = r + 1; | |
1898 } | |
1899 } | |
1900 if ( v->max_unpack && !(v->max_unpack>>3) ) return 0; | |
1901 } else if (i == 8) // FORMAT | |
1902 return _vcf_parse_format(s, h, v, p, q); | |
1903 } | |
1904 return 0; | |
1905 } | |
1906 | |
1907 int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) | |
1908 { | |
1909 int ret; | |
1910 ret = hts_getline(fp, KS_SEP_LINE, &fp->line); | |
1911 if (ret < 0) return -1; | |
1912 return vcf_parse1(&fp->line, h, v); | |
1913 } | |
1914 | |
1915 static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt) | |
1916 { | |
1917 uint8_t *ptr_start = ptr; | |
1918 fmt->id = bcf_dec_typed_int1(ptr, &ptr); | |
1919 fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type); | |
1920 fmt->size = fmt->n << bcf_type_shift[fmt->type]; | |
1921 fmt->p = ptr; | |
1922 fmt->p_off = ptr - ptr_start; | |
1923 fmt->p_free = 0; | |
1924 ptr += n_sample * fmt->size; | |
1925 fmt->p_len = ptr - fmt->p; | |
1926 return ptr; | |
1927 } | |
1928 | |
1929 static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info) | |
1930 { | |
1931 uint8_t *ptr_start = ptr; | |
1932 info->key = bcf_dec_typed_int1(ptr, &ptr); | |
1933 info->len = bcf_dec_size(ptr, &ptr, &info->type); | |
1934 info->vptr = ptr; | |
1935 info->vptr_off = ptr - ptr_start; | |
1936 info->vptr_free = 0; | |
1937 info->v1.i = 0; | |
1938 if (info->len == 1) { | |
1939 if (info->type == BCF_BT_INT8 || info->type == BCF_BT_CHAR) info->v1.i = *(int8_t*)ptr; | |
1940 else if (info->type == BCF_BT_INT32) info->v1.i = *(int32_t*)ptr; | |
1941 else if (info->type == BCF_BT_FLOAT) info->v1.f = *(float*)ptr; | |
1942 else if (info->type == BCF_BT_INT16) info->v1.i = *(int16_t*)ptr; | |
1943 } | |
1944 ptr += info->len << bcf_type_shift[info->type]; | |
1945 info->vptr_len = ptr - info->vptr; | |
1946 return ptr; | |
1947 } | |
1948 | |
1949 int bcf_unpack(bcf1_t *b, int which) | |
1950 { | |
1951 if ( !b->shared.l ) return 0; // Building a new BCF record from scratch | |
1952 uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori; | |
1953 int *offset, i; | |
1954 bcf_dec_t *d = &b->d; | |
1955 if (which & BCF_UN_FLT) which |= BCF_UN_STR; | |
1956 if (which & BCF_UN_INFO) which |= BCF_UN_SHR; | |
1957 if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR)) | |
1958 { | |
1959 kstring_t tmp; | |
1960 | |
1961 // ID | |
1962 tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id; | |
1963 ptr_ori = ptr; | |
1964 ptr = bcf_fmt_sized_array(&tmp, ptr); | |
1965 b->unpack_size[0] = ptr - ptr_ori; | |
1966 kputc('\0', &tmp); | |
1967 d->id = tmp.s; d->m_id = tmp.m; | |
1968 | |
1969 // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block | |
1970 tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als; | |
1971 offset = (int*)alloca(b->n_allele * sizeof(int)); | |
1972 ptr_ori = ptr; | |
1973 for (i = 0; i < b->n_allele; ++i) { | |
1974 offset[i] = tmp.l; | |
1975 ptr = bcf_fmt_sized_array(&tmp, ptr); | |
1976 kputc('\0', &tmp); | |
1977 } | |
1978 b->unpack_size[1] = ptr - ptr_ori; | |
1979 d->als = tmp.s; d->m_als = tmp.m; | |
1980 | |
1981 hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro | |
1982 for (i = 0; i < b->n_allele; ++i) | |
1983 d->allele[i] = d->als + offset[i]; | |
1984 b->unpacked |= BCF_UN_STR; | |
1985 } | |
1986 if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER | |
1987 ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1]; | |
1988 ptr_ori = ptr; | |
1989 if (*ptr>>4) { | |
1990 int type; | |
1991 d->n_flt = bcf_dec_size(ptr, &ptr, &type); | |
1992 hts_expand(int, d->n_flt, d->m_flt, d->flt); | |
1993 for (i = 0; i < d->n_flt; ++i) | |
1994 d->flt[i] = bcf_dec_int1(ptr, type, &ptr); | |
1995 } else ++ptr, d->n_flt = 0; | |
1996 b->unpack_size[2] = ptr - ptr_ori; | |
1997 b->unpacked |= BCF_UN_FLT; | |
1998 } | |
1999 if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO | |
2000 ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2]; | |
2001 hts_expand(bcf_info_t, b->n_info, d->m_info, d->info); | |
2002 for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0; | |
2003 for (i = 0; i < b->n_info; ++i) | |
2004 ptr = bcf_unpack_info_core1(ptr, &d->info[i]); | |
2005 b->unpacked |= BCF_UN_INFO; | |
2006 } | |
2007 if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT | |
2008 ptr = (uint8_t*)b->indiv.s; | |
2009 hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt); | |
2010 for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0; | |
2011 for (i = 0; i < b->n_fmt; ++i) | |
2012 ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]); | |
2013 b->unpacked |= BCF_UN_FMT; | |
2014 } | |
2015 return 0; | |
2016 } | |
2017 | |
2018 int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) | |
2019 { | |
2020 int i; | |
2021 bcf_unpack((bcf1_t*)v, BCF_UN_ALL); | |
2022 kputs(h->id[BCF_DT_CTG][v->rid].key, s); // CHROM | |
2023 kputc('\t', s); kputw(v->pos + 1, s); // POS | |
2024 kputc('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID | |
2025 kputc('\t', s); // REF | |
2026 if (v->n_allele > 0) kputs(v->d.allele[0], s); | |
2027 else kputc('.', s); | |
2028 kputc('\t', s); // ALT | |
2029 if (v->n_allele > 1) { | |
2030 for (i = 1; i < v->n_allele; ++i) { | |
2031 if (i > 1) kputc(',', s); | |
2032 kputs(v->d.allele[i], s); | |
2033 } | |
2034 } else kputc('.', s); | |
2035 kputc('\t', s); // QUAL | |
2036 if ( bcf_float_is_missing(v->qual) ) kputc('.', s); // QUAL | |
2037 else ksprintf(s, "%g", v->qual); | |
2038 kputc('\t', s); // FILTER | |
2039 if (v->d.n_flt) { | |
2040 for (i = 0; i < v->d.n_flt; ++i) { | |
2041 if (i) kputc(';', s); | |
2042 kputs(h->id[BCF_DT_ID][v->d.flt[i]].key, s); | |
2043 } | |
2044 } else kputc('.', s); | |
2045 kputc('\t', s); // INFO | |
2046 if (v->n_info) { | |
2047 int first = 1; | |
2048 for (i = 0; i < v->n_info; ++i) { | |
2049 bcf_info_t *z = &v->d.info[i]; | |
2050 if ( !z->vptr ) continue; | |
2051 if ( !first ) kputc(';', s); first = 0; | |
2052 kputs(h->id[BCF_DT_ID][z->key].key, s); | |
2053 if (z->len <= 0) continue; | |
2054 kputc('=', s); | |
2055 if (z->len == 1) | |
2056 { | |
2057 switch (z->type) | |
2058 { | |
2059 case BCF_BT_INT8: if ( z->v1.i==bcf_int8_missing ) kputc('.', s); else kputw(z->v1.i, s); break; | |
2060 case BCF_BT_INT16: if ( z->v1.i==bcf_int16_missing ) kputc('.', s); else kputw(z->v1.i, s); break; | |
2061 case BCF_BT_INT32: if ( z->v1.i==bcf_int32_missing ) kputc('.', s); else kputw(z->v1.i, s); break; | |
2062 case BCF_BT_FLOAT: if ( bcf_float_is_missing(z->v1.f) ) kputc('.', s); else ksprintf(s, "%g", z->v1.f); break; | |
2063 case BCF_BT_CHAR: kputc(z->v1.i, s); break; | |
2064 default: fprintf(stderr,"todo: type %d\n", z->type); exit(1); break; | |
2065 } | |
2066 } | |
2067 else bcf_fmt_array(s, z->len, z->type, z->vptr); | |
2068 } | |
2069 if ( first ) kputc('.', s); | |
2070 } else kputc('.', s); | |
2071 // FORMAT and individual information | |
2072 if (v->n_sample) | |
2073 { | |
2074 int i,j; | |
2075 if ( v->n_fmt) | |
2076 { | |
2077 int gt_i = -1; | |
2078 bcf_fmt_t *fmt = v->d.fmt; | |
2079 int first = 1; | |
2080 for (i = 0; i < (int)v->n_fmt; ++i) { | |
2081 if ( !fmt[i].p ) continue; | |
2082 kputc(!first ? ':' : '\t', s); first = 0; | |
2083 if ( fmt[i].id<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) ) | |
2084 { | |
2085 fprintf(stderr, "[E::%s] invalid BCF, the FORMAT tag id=%d not present in the header.\n", __func__, fmt[i].id); | |
2086 abort(); | |
2087 } | |
2088 kputs(h->id[BCF_DT_ID][fmt[i].id].key, s); | |
2089 if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i; | |
2090 } | |
2091 if ( first ) kputs("\t.", s); | |
2092 for (j = 0; j < v->n_sample; ++j) { | |
2093 kputc('\t', s); | |
2094 first = 1; | |
2095 for (i = 0; i < (int)v->n_fmt; ++i) { | |
2096 bcf_fmt_t *f = &fmt[i]; | |
2097 if ( !f->p ) continue; | |
2098 if (!first) kputc(':', s); first = 0; | |
2099 if (gt_i == i) | |
2100 bcf_format_gt(f,j,s); | |
2101 else | |
2102 bcf_fmt_array(s, f->n, f->type, f->p + j * f->size); | |
2103 } | |
2104 if ( first ) kputc('.', s); | |
2105 } | |
2106 } | |
2107 else | |
2108 for (j=0; j<=v->n_sample; j++) | |
2109 kputs("\t.", s); | |
2110 } | |
2111 kputc('\n', s); | |
2112 return 0; | |
2113 } | |
2114 | |
2115 int vcf_write_line(htsFile *fp, kstring_t *line) | |
2116 { | |
2117 int ret; | |
2118 if ( line->s[line->l-1]!='\n' ) kputc('\n',line); | |
2119 if ( fp->format.compression!=no_compression ) | |
2120 ret = bgzf_write(fp->fp.bgzf, line->s, line->l); | |
2121 else | |
2122 ret = hwrite(fp->fp.hfile, line->s, line->l); | |
2123 return ret==line->l ? 0 : -1; | |
2124 } | |
2125 | |
2126 int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) | |
2127 { | |
2128 int ret; | |
2129 fp->line.l = 0; | |
2130 vcf_format1(h, v, &fp->line); | |
2131 if ( fp->format.compression!=no_compression ) | |
2132 ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l); | |
2133 else | |
2134 ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l); | |
2135 return ret==fp->line.l ? 0 : -1; | |
2136 } | |
2137 | |
2138 /************************ | |
2139 * Data access routines * | |
2140 ************************/ | |
2141 | |
2142 int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id) | |
2143 { | |
2144 khint_t k; | |
2145 vdict_t *d = (vdict_t*)h->dict[which]; | |
2146 k = kh_get(vdict, d, id); | |
2147 return k == kh_end(d)? -1 : kh_val(d, k).id; | |
2148 } | |
2149 | |
2150 | |
2151 /******************** | |
2152 *** BCF indexing *** | |
2153 ********************/ | |
2154 | |
2155 hts_idx_t *bcf_index(htsFile *fp, int min_shift) | |
2156 { | |
2157 int n_lvls, i; | |
2158 bcf1_t *b; | |
2159 hts_idx_t *idx; | |
2160 bcf_hdr_t *h; | |
2161 int64_t max_len = 0, s; | |
2162 h = bcf_hdr_read(fp); | |
2163 if ( !h ) return NULL; | |
2164 int nids = 0; | |
2165 for (i = 0; i < h->n[BCF_DT_CTG]; ++i) | |
2166 { | |
2167 if ( !h->id[BCF_DT_CTG][i].val ) continue; | |
2168 if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] ) max_len = h->id[BCF_DT_CTG][i].val->info[0]; | |
2169 nids++; | |
2170 } | |
2171 if ( !max_len ) max_len = ((int64_t)1<<31) - 1; // In case contig line is broken. | |
2172 max_len += 256; | |
2173 for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3); | |
2174 idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); | |
2175 b = bcf_init1(); | |
2176 while (bcf_read1(fp,h, b) >= 0) { | |
2177 int ret; | |
2178 ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1); | |
2179 if (ret < 0) | |
2180 { | |
2181 bcf_destroy1(b); | |
2182 hts_idx_destroy(idx); | |
2183 return NULL; | |
2184 } | |
2185 } | |
2186 hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf)); | |
2187 bcf_destroy1(b); | |
2188 bcf_hdr_destroy(h); | |
2189 return idx; | |
2190 } | |
2191 | |
2192 int bcf_index_build(const char *fn, int min_shift) | |
2193 { | |
2194 htsFile *fp; | |
2195 hts_idx_t *idx; | |
2196 if ((fp = hts_open(fn, "rb")) == 0) return -1; | |
2197 if ( fp->format.compression!=bgzf ) { hts_close(fp); return -1; } | |
2198 idx = bcf_index(fp, min_shift); | |
2199 hts_close(fp); | |
2200 if ( !idx ) return -1; | |
2201 hts_idx_save(idx, fn, HTS_FMT_CSI); | |
2202 hts_idx_destroy(idx); | |
2203 return 0; | |
2204 } | |
2205 | |
2206 /***************** | |
2207 *** Utilities *** | |
2208 *****************/ | |
2209 | |
2210 int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) | |
2211 { | |
2212 int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0; | |
2213 for (i=0; i<src->nhrec; i++) | |
2214 { | |
2215 if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value ) | |
2216 { | |
2217 int j; | |
2218 for (j=0; j<ndst_ori; j++) | |
2219 { | |
2220 if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue; | |
2221 | |
2222 // Checking only the key part of generic lines, otherwise | |
2223 // the VCFs are too verbose. Should we perhaps add a flag | |
2224 // to bcf_hdr_combine() and make this optional? | |
2225 if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break; | |
2226 } | |
2227 if ( j>=ndst_ori ) | |
2228 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); | |
2229 } | |
2230 else if ( src->hrec[i]->type==BCF_HL_STR ) | |
2231 { | |
2232 // NB: we are ignoring fields without ID | |
2233 int j = bcf_hrec_find_key(src->hrec[i],"ID"); | |
2234 if ( j>=0 ) | |
2235 { | |
2236 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key); | |
2237 if ( !rec ) | |
2238 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); | |
2239 } | |
2240 } | |
2241 else | |
2242 { | |
2243 int j = bcf_hrec_find_key(src->hrec[i],"ID"); | |
2244 assert( j>=0 ); // this should always be true for valid VCFs | |
2245 | |
2246 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL); | |
2247 if ( !rec ) | |
2248 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); | |
2249 else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT ) | |
2250 { | |
2251 // Check that both records are of the same type. The bcf_hdr_id2length | |
2252 // macro cannot be used here because dst header is not synced yet. | |
2253 vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID]; | |
2254 vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID]; | |
2255 khint_t k_src = kh_get(vdict, d_src, src->hrec[i]->vals[0]); | |
2256 khint_t k_dst = kh_get(vdict, d_dst, src->hrec[i]->vals[0]); | |
2257 if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) ) | |
2258 { | |
2259 fprintf(stderr,"Warning: trying to combine \"%s\" tag definitions of different lengths\n", src->hrec[i]->vals[0]); | |
2260 ret |= 1; | |
2261 } | |
2262 if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) ) | |
2263 { | |
2264 fprintf(stderr,"Warning: trying to combine \"%s\" tag definitions of different types\n", src->hrec[i]->vals[0]); | |
2265 ret |= 1; | |
2266 } | |
2267 } | |
2268 } | |
2269 } | |
2270 if ( need_sync ) bcf_hdr_sync(dst); | |
2271 return ret; | |
2272 } | |
2273 int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line) | |
2274 { | |
2275 int i; | |
2276 if ( line->errcode ) | |
2277 { | |
2278 fprintf(stderr,"[%s:%d %s] Unchecked error (%d), exiting.\n", __FILE__,__LINE__,__FUNCTION__,line->errcode); | |
2279 exit(1); | |
2280 } | |
2281 if ( src_hdr->ntransl==-1 ) return 0; // no need to translate, all tags have the same id | |
2282 if ( !src_hdr->ntransl ) // called for the first time, see what needs translating | |
2283 { | |
2284 int dict; | |
2285 for (dict=0; dict<2; dict++) // BCF_DT_ID and BCF_DT_CTG | |
2286 { | |
2287 src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int)); | |
2288 for (i=0; i<src_hdr->n[dict]; i++) | |
2289 { | |
2290 if ( !src_hdr->id[dict][i].key || !dst_hdr->id[dict][i].key ) // gap left after removed BCF header lines | |
2291 src_hdr->transl[dict][i] = -1; | |
2292 else if ( i>=dst_hdr->n[dict] || strcmp(src_hdr->id[dict][i].key,dst_hdr->id[dict][i].key) ) | |
2293 { | |
2294 src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key); | |
2295 src_hdr->ntransl++; | |
2296 } | |
2297 else | |
2298 src_hdr->transl[dict][i] = -1; | |
2299 } | |
2300 } | |
2301 if ( !src_hdr->ntransl ) | |
2302 { | |
2303 free(src_hdr->transl[0]); src_hdr->transl[0] = NULL; | |
2304 free(src_hdr->transl[1]); src_hdr->transl[1] = NULL; | |
2305 src_hdr->ntransl = -1; | |
2306 } | |
2307 if ( src_hdr->ntransl==-1 ) return 0; | |
2308 } | |
2309 bcf_unpack(line,BCF_UN_ALL); | |
2310 | |
2311 // CHROM | |
2312 if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid]; | |
2313 | |
2314 // FILTER | |
2315 for (i=0; i<line->d.n_flt; i++) | |
2316 { | |
2317 int src_id = line->d.flt[i]; | |
2318 if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 ) | |
2319 line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id]; | |
2320 line->d.shared_dirty |= BCF1_DIRTY_FLT; | |
2321 } | |
2322 | |
2323 // INFO | |
2324 for (i=0; i<line->n_info; i++) | |
2325 { | |
2326 int src_id = line->d.info[i].key; | |
2327 int dst_id = src_hdr->transl[BCF_DT_ID][src_id]; | |
2328 if ( dst_id<0 ) continue; | |
2329 int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8; | |
2330 int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8; | |
2331 if ( src_size==dst_size ) // can overwrite | |
2332 { | |
2333 line->d.info[i].key = dst_id; | |
2334 uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off; | |
2335 if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; } | |
2336 else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; } | |
2337 else { *(uint32_t*)vptr = (uint32_t)dst_id; } | |
2338 } | |
2339 else // must realloc | |
2340 { | |
2341 bcf_info_t *info = &line->d.info[i]; | |
2342 assert( !info->vptr_free ); | |
2343 kstring_t str = {0,0,0}; | |
2344 bcf_enc_int1(&str, dst_id); | |
2345 bcf_enc_size(&str, info->len,info->type); | |
2346 info->vptr_off = str.l; | |
2347 kputsn((char*)info->vptr, info->vptr_len, &str); | |
2348 info->vptr = (uint8_t*)str.s + info->vptr_off; | |
2349 info->vptr_free = 1; | |
2350 info->key = dst_id; | |
2351 line->d.shared_dirty |= BCF1_DIRTY_INF; | |
2352 } | |
2353 } | |
2354 | |
2355 // FORMAT | |
2356 for (i=0; i<line->n_fmt; i++) | |
2357 { | |
2358 int src_id = line->d.fmt[i].id; | |
2359 int dst_id = src_hdr->transl[BCF_DT_ID][src_id]; | |
2360 if ( dst_id<0 ) continue; | |
2361 int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8; | |
2362 int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8; | |
2363 if ( src_size==dst_size ) // can overwrite | |
2364 { | |
2365 line->d.fmt[i].id = dst_id; | |
2366 uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off; // pointer to the vector size (4bits) and BT type (4bits) | |
2367 if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; } | |
2368 else if ( dst_size==BCF_BT_INT16 ) { uint8_t *x = (uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; } | |
2369 else { uint8_t *x = (uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; p[3] = x[2]; p[4] = x[3]; } | |
2370 } | |
2371 else // must realloc | |
2372 { | |
2373 bcf_fmt_t *fmt = &line->d.fmt[i]; | |
2374 assert( !fmt->p_free ); | |
2375 kstring_t str = {0,0,0}; | |
2376 bcf_enc_int1(&str, dst_id); | |
2377 bcf_enc_size(&str, fmt->n, fmt->type); | |
2378 fmt->p_off = str.l; | |
2379 kputsn((char*)fmt->p, fmt->p_len, &str); | |
2380 fmt->p = (uint8_t*)str.s + fmt->p_off; | |
2381 fmt->p_free = 1; | |
2382 fmt->id = dst_id; | |
2383 line->d.indiv_dirty = 1; | |
2384 } | |
2385 } | |
2386 return 0; | |
2387 } | |
2388 | |
2389 bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr) | |
2390 { | |
2391 bcf_hdr_t *hout = bcf_hdr_init("r"); | |
2392 char *htxt = bcf_hdr_fmt_text(hdr, 1, NULL); | |
2393 bcf_hdr_parse(hout, htxt); | |
2394 free(htxt); | |
2395 return hout; | |
2396 } | |
2397 | |
2398 bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap) | |
2399 { | |
2400 int hlen; | |
2401 void *names_hash = khash_str2int_init(); | |
2402 char *htxt = bcf_hdr_fmt_text(h0, 1, &hlen); | |
2403 kstring_t str; | |
2404 bcf_hdr_t *h; | |
2405 str.l = str.m = 0; str.s = 0; | |
2406 h = bcf_hdr_init("w"); | |
2407 bcf_hdr_set_version(h,bcf_hdr_get_version(h0)); | |
2408 int j; | |
2409 for (j=0; j<n; j++) imap[j] = -1; | |
2410 if ( bcf_hdr_nsamples(h0) > 0) { | |
2411 char *p; | |
2412 int i = 0, end = n? 8 : 7; | |
2413 while ((p = strstr(htxt, "#CHROM\t")) != 0) | |
2414 if (p > htxt && *(p-1) == '\n') break; | |
2415 while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p; | |
2416 if (i != end) { | |
2417 free(h); free(str.s); | |
2418 return 0; // malformated header | |
2419 } | |
2420 kputsn(htxt, p - htxt, &str); | |
2421 for (i = 0; i < n; ++i) { | |
2422 if ( khash_str2int_has_key(names_hash,samples[i]) ) | |
2423 { | |
2424 fprintf(stderr,"[E::bcf_hdr_subset] Duplicate sample name \"%s\".\n", samples[i]); | |
2425 free(str.s); | |
2426 free(htxt); | |
2427 khash_str2int_destroy(names_hash); | |
2428 bcf_hdr_destroy(h); | |
2429 return NULL; | |
2430 } | |
2431 imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]); | |
2432 if (imap[i] < 0) continue; | |
2433 kputc('\t', &str); | |
2434 kputs(samples[i], &str); | |
2435 khash_str2int_inc(names_hash,samples[i]); | |
2436 } | |
2437 } else kputsn(htxt, hlen, &str); | |
2438 while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines | |
2439 kputc('\n',&str); | |
2440 bcf_hdr_parse(h, str.s); | |
2441 free(str.s); | |
2442 free(htxt); | |
2443 khash_str2int_destroy(names_hash); | |
2444 return h; | |
2445 } | |
2446 | |
2447 int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file) | |
2448 { | |
2449 if ( samples && !strcmp("-",samples) ) return 0; // keep all samples | |
2450 | |
2451 hdr->nsamples_ori = bcf_hdr_nsamples(hdr); | |
2452 if ( !samples ) { bcf_hdr_nsamples(hdr) = 0; return 0; } // exclude all samples | |
2453 | |
2454 int i, narr = bit_array_size(bcf_hdr_nsamples(hdr)); | |
2455 hdr->keep_samples = (uint8_t*) calloc(narr,1); | |
2456 if ( samples[0]=='^' ) | |
2457 for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i); | |
2458 | |
2459 int idx, n, ret = 0; | |
2460 char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n); | |
2461 if ( !smpls ) return -1; | |
2462 for (i=0; i<n; i++) | |
2463 { | |
2464 idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]); | |
2465 if ( idx<0 ) | |
2466 { | |
2467 if ( !ret ) ret = i+1; | |
2468 continue; | |
2469 } | |
2470 assert( idx<bcf_hdr_nsamples(hdr) ); | |
2471 if ( samples[0]=='^' ) | |
2472 bit_array_clear(hdr->keep_samples, idx); | |
2473 else | |
2474 bit_array_set(hdr->keep_samples, idx); | |
2475 } | |
2476 for (i=0; i<n; i++) free(smpls[i]); | |
2477 free(smpls); | |
2478 | |
2479 bcf_hdr_nsamples(hdr) = 0; | |
2480 for (i=0; i<hdr->nsamples_ori; i++) | |
2481 if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++; | |
2482 if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; } | |
2483 else | |
2484 { | |
2485 char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr)); | |
2486 idx = 0; | |
2487 for (i=0; i<hdr->nsamples_ori; i++) | |
2488 if ( bit_array_test(hdr->keep_samples,i) ) samples[idx++] = strdup(hdr->samples[i]); | |
2489 free(hdr->samples); | |
2490 hdr->samples = samples; | |
2491 | |
2492 // delete original samples from the dictionary | |
2493 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE]; | |
2494 int k; | |
2495 for (k = kh_begin(d); k != kh_end(d); ++k) | |
2496 if (kh_exist(d, k)) free((char*)kh_key(d, k)); | |
2497 kh_destroy(vdict, d); | |
2498 | |
2499 // add the subset back | |
2500 hdr->dict[BCF_DT_SAMPLE] = d = kh_init(vdict); | |
2501 for (i=0; i<bcf_hdr_nsamples(hdr); i++) | |
2502 { | |
2503 int ignore, k = kh_put(vdict, d, hdr->samples[i], &ignore); | |
2504 kh_val(d, k) = bcf_idinfo_def; | |
2505 kh_val(d, k).id = kh_size(d) - 1; | |
2506 } | |
2507 bcf_hdr_sync(hdr); | |
2508 } | |
2509 | |
2510 return ret; | |
2511 } | |
2512 | |
2513 int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap) | |
2514 { | |
2515 kstring_t ind; | |
2516 ind.s = 0; ind.l = ind.m = 0; | |
2517 if (n) { | |
2518 bcf_fmt_t *fmt; | |
2519 int i, j; | |
2520 fmt = (bcf_fmt_t*)alloca(v->n_fmt * sizeof(bcf_fmt_t)); | |
2521 uint8_t *ptr = (uint8_t*)v->indiv.s; | |
2522 for (i = 0; i < v->n_fmt; ++i) | |
2523 ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]); | |
2524 for (i = 0; i < (int)v->n_fmt; ++i) { | |
2525 bcf_fmt_t *f = &fmt[i]; | |
2526 bcf_enc_int1(&ind, f->id); | |
2527 bcf_enc_size(&ind, f->n, f->type); | |
2528 for (j = 0; j < n; ++j) | |
2529 if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind); | |
2530 } | |
2531 for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i; | |
2532 v->n_sample = i; | |
2533 } else v->n_sample = 0; | |
2534 if ( !v->n_sample ) v->n_fmt = 0; | |
2535 free(v->indiv.s); | |
2536 v->indiv = ind; | |
2537 v->unpacked &= ~BCF_UN_FMT; // only BCF is ready for output, VCF will need to unpack again | |
2538 return 0; | |
2539 } | |
2540 | |
2541 int bcf_is_snp(bcf1_t *v) | |
2542 { | |
2543 int i; | |
2544 bcf_unpack(v, BCF_UN_STR); | |
2545 for (i = 0; i < v->n_allele; ++i) | |
2546 { | |
2547 if ( v->d.allele[i][1]==0 ) continue; | |
2548 | |
2549 // mpileup's <X> allele, see also below. This is not completely satisfactory, | |
2550 // a general library is here narrowly tailored to fit samtools. | |
2551 if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue; | |
2552 | |
2553 break; | |
2554 } | |
2555 return i == v->n_allele; | |
2556 } | |
2557 | |
2558 static void bcf_set_variant_type(const char *ref, const char *alt, variant_t *var) | |
2559 { | |
2560 // The most frequent case | |
2561 if ( !ref[1] && !alt[1] ) | |
2562 { | |
2563 if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; } | |
2564 if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant | |
2565 var->n = 1; var->type = VCF_SNP; return; | |
2566 } | |
2567 if ( alt[0]=='<' ) | |
2568 { | |
2569 if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant | |
2570 var->type = VCF_OTHER; | |
2571 return; | |
2572 } | |
2573 | |
2574 const char *r = ref, *a = alt; | |
2575 while (*r && *a && *r==*a ) { r++; a++; } | |
2576 | |
2577 if ( *a && !*r ) | |
2578 { | |
2579 while ( *a ) a++; | |
2580 var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return; | |
2581 } | |
2582 else if ( *r && !*a ) | |
2583 { | |
2584 while ( *r ) r++; | |
2585 var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return; | |
2586 } | |
2587 else if ( !*r && !*a ) | |
2588 { | |
2589 var->n = 0; var->type = VCF_REF; return; | |
2590 } | |
2591 | |
2592 const char *re = r, *ae = a; | |
2593 while ( re[1] ) re++; | |
2594 while ( ae[1] ) ae++; | |
2595 while ( *re==*ae && re>r && ae>a ) { re--; ae--; } | |
2596 if ( ae==a ) | |
2597 { | |
2598 if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; } | |
2599 var->n = -(re-r); | |
2600 if ( *re==*ae ) { var->type = VCF_INDEL; return; } | |
2601 var->type = VCF_OTHER; return; | |
2602 } | |
2603 else if ( re==r ) | |
2604 { | |
2605 var->n = ae-a; | |
2606 if ( *re==*ae ) { var->type = VCF_INDEL; return; } | |
2607 var->type = VCF_OTHER; return; | |
2608 } | |
2609 | |
2610 var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER; | |
2611 var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1; | |
2612 | |
2613 // should do also complex events, SVs, etc... | |
2614 } | |
2615 | |
2616 static void bcf_set_variant_types(bcf1_t *b) | |
2617 { | |
2618 if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR); | |
2619 bcf_dec_t *d = &b->d; | |
2620 if ( d->n_var < b->n_allele ) | |
2621 { | |
2622 d->var = (variant_t *) realloc(d->var, sizeof(variant_t)*b->n_allele); | |
2623 d->n_var = b->n_allele; | |
2624 } | |
2625 int i; | |
2626 b->d.var_type = 0; | |
2627 for (i=1; i<b->n_allele; i++) | |
2628 { | |
2629 bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]); | |
2630 b->d.var_type |= d->var[i].type; | |
2631 //fprintf(stderr,"[set_variant_type] %d %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type); | |
2632 } | |
2633 } | |
2634 | |
2635 int bcf_get_variant_types(bcf1_t *rec) | |
2636 { | |
2637 if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec); | |
2638 return rec->d.var_type; | |
2639 } | |
2640 int bcf_get_variant_type(bcf1_t *rec, int ith_allele) | |
2641 { | |
2642 if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec); | |
2643 return rec->d.var[ith_allele].type; | |
2644 } | |
2645 | |
2646 int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type) | |
2647 { | |
2648 // Is the field already present? | |
2649 int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key); | |
2650 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1; // No such INFO field in the header | |
2651 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO); | |
2652 | |
2653 for (i=0; i<line->n_info; i++) | |
2654 if ( inf_id==line->d.info[i].key ) break; | |
2655 bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i]; | |
2656 | |
2657 if ( !n || (type==BCF_HT_STR && !values) ) | |
2658 { | |
2659 if ( inf ) | |
2660 { | |
2661 // Mark the tag for removal, free existing memory if necessary | |
2662 if ( inf->vptr_free ) | |
2663 { | |
2664 free(inf->vptr - inf->vptr_off); | |
2665 inf->vptr_free = 0; | |
2666 } | |
2667 line->d.shared_dirty |= BCF1_DIRTY_INF; | |
2668 inf->vptr = NULL; | |
2669 } | |
2670 return 0; | |
2671 } | |
2672 | |
2673 // Encode the values and determine the size required to accommodate the values | |
2674 kstring_t str = {0,0,0}; | |
2675 bcf_enc_int1(&str, inf_id); | |
2676 if ( type==BCF_HT_INT ) | |
2677 bcf_enc_vint(&str, n, (int32_t*)values, -1); | |
2678 else if ( type==BCF_HT_REAL ) | |
2679 bcf_enc_vfloat(&str, n, (float*)values); | |
2680 else if ( type==BCF_HT_FLAG || type==BCF_HT_STR ) | |
2681 { | |
2682 if ( values==NULL ) | |
2683 bcf_enc_size(&str, 0, BCF_BT_NULL); | |
2684 else | |
2685 bcf_enc_vchar(&str, strlen((char*)values), (char*)values); | |
2686 } | |
2687 else | |
2688 { | |
2689 fprintf(stderr, "[E::%s] the type %d not implemented yet\n", __func__, type); | |
2690 abort(); | |
2691 } | |
2692 | |
2693 // Is the INFO tag already present | |
2694 if ( inf ) | |
2695 { | |
2696 // Is it big enough to accommodate new block? | |
2697 if ( str.l <= inf->vptr_len + inf->vptr_off ) | |
2698 { | |
2699 if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF; | |
2700 uint8_t *ptr = inf->vptr - inf->vptr_off; | |
2701 memcpy(ptr, str.s, str.l); | |
2702 free(str.s); | |
2703 int vptr_free = inf->vptr_free; | |
2704 bcf_unpack_info_core1(ptr, inf); | |
2705 inf->vptr_free = vptr_free; | |
2706 } | |
2707 else | |
2708 { | |
2709 assert( !inf->vptr_free ); // fix the caller or improve here: this has been modified before | |
2710 bcf_unpack_info_core1((uint8_t*)str.s, inf); | |
2711 inf->vptr_free = 1; | |
2712 line->d.shared_dirty |= BCF1_DIRTY_INF; | |
2713 } | |
2714 } | |
2715 else | |
2716 { | |
2717 // The tag is not present, create new one | |
2718 line->n_info++; | |
2719 hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info); | |
2720 inf = &line->d.info[line->n_info-1]; | |
2721 bcf_unpack_info_core1((uint8_t*)str.s, inf); | |
2722 inf->vptr_free = 1; | |
2723 line->d.shared_dirty |= BCF1_DIRTY_INF; | |
2724 } | |
2725 line->unpacked |= BCF_UN_INFO; | |
2726 return 0; | |
2727 } | |
2728 | |
2729 int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n) | |
2730 { | |
2731 if ( !n ) | |
2732 return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR); | |
2733 | |
2734 int i, max_len = 0; | |
2735 for (i=0; i<n; i++) | |
2736 { | |
2737 int len = strlen(values[i]); | |
2738 if ( len > max_len ) max_len = len; | |
2739 } | |
2740 char *out = (char*) malloc(max_len*n); | |
2741 if ( !out ) return -2; | |
2742 for (i=0; i<n; i++) | |
2743 { | |
2744 char *dst = out+i*max_len; | |
2745 const char *src = values[i]; | |
2746 int j = 0; | |
2747 while ( src[j] ) { dst[j] = src[j]; j++; } | |
2748 for (; j<max_len; j++) dst[j] = 0; | |
2749 } | |
2750 int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR); | |
2751 free(out); | |
2752 return ret; | |
2753 } | |
2754 | |
2755 int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type) | |
2756 { | |
2757 // Is the field already present? | |
2758 int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key); | |
2759 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) ) | |
2760 { | |
2761 if ( !n ) return 0; | |
2762 return -1; // the key not present in the header | |
2763 } | |
2764 | |
2765 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); | |
2766 | |
2767 for (i=0; i<line->n_fmt; i++) | |
2768 if ( line->d.fmt[i].id==fmt_id ) break; | |
2769 bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i]; | |
2770 | |
2771 if ( !n ) | |
2772 { | |
2773 if ( fmt ) | |
2774 { | |
2775 // Mark the tag for removal, free existing memory if necessary | |
2776 if ( fmt->p_free ) | |
2777 { | |
2778 free(fmt->p - fmt->p_off); | |
2779 fmt->p_free = 0; | |
2780 } | |
2781 line->d.indiv_dirty = 1; | |
2782 fmt->p = NULL; | |
2783 } | |
2784 return 0; | |
2785 } | |
2786 | |
2787 line->n_sample = bcf_hdr_nsamples(hdr); | |
2788 int nps = n / line->n_sample; // number of values per sample | |
2789 assert( nps && nps*line->n_sample==n ); // must be divisible by n_sample | |
2790 | |
2791 // Encode the values and determine the size required to accommodate the values | |
2792 kstring_t str = {0,0,0}; | |
2793 bcf_enc_int1(&str, fmt_id); | |
2794 if ( type==BCF_HT_INT ) | |
2795 bcf_enc_vint(&str, n, (int32_t*)values, nps); | |
2796 else if ( type==BCF_HT_REAL ) | |
2797 { | |
2798 bcf_enc_size(&str, nps, BCF_BT_FLOAT); | |
2799 kputsn((char*)values, nps*line->n_sample*sizeof(float), &str); | |
2800 } | |
2801 else if ( type==BCF_HT_STR ) | |
2802 { | |
2803 bcf_enc_size(&str, nps, BCF_BT_CHAR); | |
2804 kputsn((char*)values, nps*line->n_sample, &str); | |
2805 } | |
2806 else | |
2807 { | |
2808 fprintf(stderr, "[E::%s] the type %d not implemented yet\n", __func__, type); | |
2809 abort(); | |
2810 } | |
2811 | |
2812 if ( !fmt ) | |
2813 { | |
2814 // Not present, new format field | |
2815 line->n_fmt++; | |
2816 hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt); | |
2817 | |
2818 // Special case: VCF specification requires that GT is always first | |
2819 if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] ) | |
2820 { | |
2821 for (i=line->n_fmt-1; i>0; i--) | |
2822 line->d.fmt[i] = line->d.fmt[i-1]; | |
2823 fmt = &line->d.fmt[0]; | |
2824 } | |
2825 else | |
2826 fmt = &line->d.fmt[line->n_fmt-1]; | |
2827 bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt); | |
2828 line->d.indiv_dirty = 1; | |
2829 fmt->p_free = 1; | |
2830 } | |
2831 else | |
2832 { | |
2833 // The tag is already present, check if it is big enough to accomodate the new block | |
2834 if ( str.l <= fmt->p_len + fmt->p_off ) | |
2835 { | |
2836 // good, the block is big enough | |
2837 if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1; | |
2838 uint8_t *ptr = fmt->p - fmt->p_off; | |
2839 memcpy(ptr, str.s, str.l); | |
2840 free(str.s); | |
2841 int p_free = fmt->p_free; | |
2842 bcf_unpack_fmt_core1(ptr, line->n_sample, fmt); | |
2843 fmt->p_free = p_free; | |
2844 } | |
2845 else | |
2846 { | |
2847 assert( !fmt->p_free ); // fix the caller or improve here: this has been modified before | |
2848 bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt); | |
2849 fmt->p_free = 1; | |
2850 line->d.indiv_dirty = 1; | |
2851 } | |
2852 } | |
2853 line->unpacked |= BCF_UN_FMT; | |
2854 return 0; | |
2855 } | |
2856 | |
2857 | |
2858 int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n) | |
2859 { | |
2860 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); | |
2861 line->d.shared_dirty |= BCF1_DIRTY_FLT; | |
2862 line->d.n_flt = n; | |
2863 if ( !n ) return 0; | |
2864 hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt); | |
2865 int i; | |
2866 for (i=0; i<n; i++) | |
2867 line->d.flt[i] = flt_ids[i]; | |
2868 return 0; | |
2869 } | |
2870 | |
2871 int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id) | |
2872 { | |
2873 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); | |
2874 int i; | |
2875 for (i=0; i<line->d.n_flt; i++) | |
2876 if ( flt_id==line->d.flt[i] ) break; | |
2877 if ( i<line->d.n_flt ) return 0; // this filter is already set | |
2878 line->d.shared_dirty |= BCF1_DIRTY_FLT; | |
2879 if ( flt_id==0 ) // set to PASS | |
2880 line->d.n_flt = 1; | |
2881 else if ( line->d.n_flt==1 && line->d.flt[0]==0 ) | |
2882 line->d.n_flt = 1; | |
2883 else | |
2884 line->d.n_flt++; | |
2885 hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt); | |
2886 line->d.flt[line->d.n_flt-1] = flt_id; | |
2887 return 1; | |
2888 } | |
2889 int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass) | |
2890 { | |
2891 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); | |
2892 int i; | |
2893 for (i=0; i<line->d.n_flt; i++) | |
2894 if ( flt_id==line->d.flt[i] ) break; | |
2895 if ( i==line->d.n_flt ) return 0; // the filter is not present | |
2896 line->d.shared_dirty |= BCF1_DIRTY_FLT; | |
2897 if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt)); | |
2898 line->d.n_flt--; | |
2899 if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0); | |
2900 return 0; | |
2901 } | |
2902 | |
2903 int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter) | |
2904 { | |
2905 if ( filter[0]=='.' && !filter[1] ) filter = "PASS"; | |
2906 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter); | |
2907 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1; // not defined in the header | |
2908 | |
2909 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); | |
2910 if ( id==0 && !line->d.n_flt) return 1; // PASS | |
2911 | |
2912 int i; | |
2913 for (i=0; i<line->d.n_flt; i++) | |
2914 if ( line->d.flt[i]==id ) return 1; | |
2915 return 0; | |
2916 } | |
2917 | |
2918 static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals) | |
2919 { | |
2920 line->d.shared_dirty |= BCF1_DIRTY_ALS; | |
2921 | |
2922 line->n_allele = nals; | |
2923 hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele); | |
2924 | |
2925 char *als = line->d.als; | |
2926 int n = 0; | |
2927 while (n<nals) | |
2928 { | |
2929 line->d.allele[n] = als; | |
2930 while ( *als ) als++; | |
2931 als++; | |
2932 n++; | |
2933 } | |
2934 return 0; | |
2935 } | |
2936 int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals) | |
2937 { | |
2938 kstring_t tmp = {0,0,0}; | |
2939 char *free_old = NULL; | |
2940 | |
2941 // If the supplied alleles are not pointers to line->d.als, the existing block can be reused. | |
2942 int i; | |
2943 for (i=0; i<nals; i++) | |
2944 if ( alleles[i]>=line->d.als && alleles[i]<line->d.als+line->d.m_als ) break; | |
2945 if ( i==nals ) | |
2946 { | |
2947 // all alleles point elsewhere, reuse the existing block | |
2948 tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als; | |
2949 } | |
2950 else | |
2951 free_old = line->d.als; | |
2952 | |
2953 for (i=0; i<nals; i++) | |
2954 { | |
2955 kputs(alleles[i], &tmp); | |
2956 kputc(0, &tmp); | |
2957 } | |
2958 line->d.als = tmp.s; line->d.m_als = tmp.m; | |
2959 free(free_old); | |
2960 return _bcf1_sync_alleles(hdr,line,nals); | |
2961 } | |
2962 | |
2963 int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string) | |
2964 { | |
2965 kstring_t tmp; | |
2966 tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als; | |
2967 kputs(alleles_string, &tmp); | |
2968 line->d.als = tmp.s; line->d.m_als = tmp.m; | |
2969 | |
2970 int nals = 1; | |
2971 char *t = line->d.als; | |
2972 while (*t) | |
2973 { | |
2974 if ( *t==',' ) { *t = 0; nals++; } | |
2975 t++; | |
2976 } | |
2977 return _bcf1_sync_alleles(hdr, line, nals); | |
2978 } | |
2979 | |
2980 int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id) | |
2981 { | |
2982 kstring_t tmp; | |
2983 tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id; | |
2984 if ( id ) | |
2985 kputs(id, &tmp); | |
2986 else | |
2987 kputs(".", &tmp); | |
2988 line->d.id = tmp.s; line->d.m_id = tmp.m; | |
2989 line->d.shared_dirty |= BCF1_DIRTY_ID; | |
2990 return 0; | |
2991 } | |
2992 | |
2993 bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key) | |
2994 { | |
2995 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key); | |
2996 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL; // no such FMT field in the header | |
2997 return bcf_get_fmt_id(line, id); | |
2998 } | |
2999 | |
3000 bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key) | |
3001 { | |
3002 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key); | |
3003 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL; // no such INFO field in the header | |
3004 return bcf_get_info_id(line, id); | |
3005 } | |
3006 | |
3007 bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id) | |
3008 { | |
3009 int i; | |
3010 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); | |
3011 for (i=0; i<line->n_fmt; i++) | |
3012 { | |
3013 if ( line->d.fmt[i].id==id ) return &line->d.fmt[i]; | |
3014 } | |
3015 return NULL; | |
3016 } | |
3017 | |
3018 bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id) | |
3019 { | |
3020 int i; | |
3021 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO); | |
3022 for (i=0; i<line->n_info; i++) | |
3023 { | |
3024 if ( line->d.info[i].key==id ) return &line->d.info[i]; | |
3025 } | |
3026 return NULL; | |
3027 } | |
3028 | |
3029 | |
3030 int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type) | |
3031 { | |
3032 int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag); | |
3033 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header | |
3034 if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=type ) return -2; // expected different type | |
3035 | |
3036 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO); | |
3037 | |
3038 for (i=0; i<line->n_info; i++) | |
3039 if ( line->d.info[i].key==tag_id ) break; | |
3040 if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3; // the tag is not present in this record | |
3041 if ( type==BCF_HT_FLAG ) return 1; | |
3042 | |
3043 bcf_info_t *info = &line->d.info[i]; | |
3044 if ( type==BCF_HT_STR ) | |
3045 { | |
3046 if ( *ndst < info->len+1 ) | |
3047 { | |
3048 *ndst = info->len + 1; | |
3049 *dst = realloc(*dst, *ndst); | |
3050 } | |
3051 memcpy(*dst,info->vptr,info->len); | |
3052 ((uint8_t*)*dst)[info->len] = 0; | |
3053 return info->len; | |
3054 } | |
3055 | |
3056 // Make sure the buffer is big enough | |
3057 int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float); | |
3058 if ( *ndst < info->len ) | |
3059 { | |
3060 *ndst = info->len; | |
3061 *dst = realloc(*dst, *ndst * size1); | |
3062 } | |
3063 | |
3064 if ( info->len == 1 ) | |
3065 { | |
3066 if ( info->type==BCF_BT_FLOAT ) *((float*)*dst) = info->v1.f; | |
3067 else | |
3068 { | |
3069 #define BRANCH(type_t, missing) { \ | |
3070 if ( info->v1.i==missing ) *((int32_t*)*dst) = bcf_int32_missing; \ | |
3071 else *((int32_t*)*dst) = info->v1.i; \ | |
3072 } | |
3073 switch (info->type) | |
3074 { | |
3075 case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing ); break; | |
3076 case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing); break; | |
3077 case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing); break; | |
3078 } | |
3079 #undef BRANCH | |
3080 } | |
3081 return 1; | |
3082 } | |
3083 | |
3084 #define BRANCH(type_t, is_missing, is_vector_end, set_missing, out_type_t) { \ | |
3085 out_type_t *tmp = (out_type_t *) *dst; \ | |
3086 type_t *p = (type_t *) info->vptr; \ | |
3087 for (j=0; j<info->len; j++) \ | |
3088 { \ | |
3089 if ( is_vector_end ) return j; \ | |
3090 if ( is_missing ) set_missing; \ | |
3091 else *tmp = p[j]; \ | |
3092 tmp++; \ | |
3093 } \ | |
3094 return j; \ | |
3095 } | |
3096 switch (info->type) { | |
3097 case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, *tmp=bcf_int32_missing, int32_t); break; | |
3098 case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, *tmp=bcf_int32_missing, int32_t); break; | |
3099 case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, *tmp=bcf_int32_missing, int32_t); break; | |
3100 case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), float); break; | |
3101 default: fprintf(stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1); | |
3102 } | |
3103 #undef BRANCH | |
3104 return -4; // this can never happen | |
3105 } | |
3106 | |
3107 int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst) | |
3108 { | |
3109 int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag); | |
3110 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header | |
3111 if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2; // expected different type | |
3112 | |
3113 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); | |
3114 | |
3115 for (i=0; i<line->n_fmt; i++) | |
3116 if ( line->d.fmt[i].id==tag_id ) break; | |
3117 if ( i==line->n_fmt ) return -3; // the tag is not present in this record | |
3118 bcf_fmt_t *fmt = &line->d.fmt[i]; | |
3119 | |
3120 int nsmpl = bcf_hdr_nsamples(hdr); | |
3121 if ( !*dst ) | |
3122 { | |
3123 *dst = (char**) malloc(sizeof(char*)*nsmpl); | |
3124 if ( !*dst ) return -4; // could not alloc | |
3125 (*dst)[0] = NULL; | |
3126 } | |
3127 int n = (fmt->n+1)*nsmpl; | |
3128 if ( *ndst < n ) | |
3129 { | |
3130 (*dst)[0] = realloc((*dst)[0], n); | |
3131 if ( !(*dst)[0] ) return -4; // could not alloc | |
3132 *ndst = n; | |
3133 } | |
3134 for (i=0; i<nsmpl; i++) | |
3135 { | |
3136 uint8_t *src = fmt->p + i*fmt->n; | |
3137 uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1); | |
3138 memcpy(tmp,src,fmt->n); | |
3139 tmp[fmt->n] = 0; | |
3140 (*dst)[i] = (char*) tmp; | |
3141 } | |
3142 return n; | |
3143 } | |
3144 | |
3145 int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type) | |
3146 { | |
3147 int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag); | |
3148 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header | |
3149 if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 ) | |
3150 { | |
3151 // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT. | |
3152 if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2; | |
3153 } | |
3154 else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2; // expected different type | |
3155 | |
3156 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); | |
3157 | |
3158 for (i=0; i<line->n_fmt; i++) | |
3159 if ( line->d.fmt[i].id==tag_id ) break; | |
3160 if ( i==line->n_fmt ) return -3; // the tag is not present in this record | |
3161 bcf_fmt_t *fmt = &line->d.fmt[i]; | |
3162 | |
3163 if ( type==BCF_HT_STR ) | |
3164 { | |
3165 int n = fmt->n*bcf_hdr_nsamples(hdr); | |
3166 if ( *ndst < n ) | |
3167 { | |
3168 *dst = realloc(*dst, n); | |
3169 if ( !*dst ) return -4; // could not alloc | |
3170 *ndst = n; | |
3171 } | |
3172 memcpy(*dst,fmt->p,n); | |
3173 return n; | |
3174 } | |
3175 | |
3176 // Make sure the buffer is big enough | |
3177 int nsmpl = bcf_hdr_nsamples(hdr); | |
3178 int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float); | |
3179 if ( *ndst < fmt->n*nsmpl ) | |
3180 { | |
3181 *ndst = fmt->n*nsmpl; | |
3182 *dst = realloc(*dst, *ndst*size1); | |
3183 if ( !dst ) return -4; // could not alloc | |
3184 } | |
3185 | |
3186 #define BRANCH(type_t, is_missing, is_vector_end, set_missing, set_vector_end, out_type_t) { \ | |
3187 out_type_t *tmp = (out_type_t *) *dst; \ | |
3188 type_t *p = (type_t*) fmt->p; \ | |
3189 for (i=0; i<nsmpl; i++) \ | |
3190 { \ | |
3191 for (j=0; j<fmt->n; j++) \ | |
3192 { \ | |
3193 if ( is_missing ) set_missing; \ | |
3194 else if ( is_vector_end ) { set_vector_end; break; } \ | |
3195 else *tmp = p[j]; \ | |
3196 tmp++; \ | |
3197 } \ | |
3198 for (; j<fmt->n; j++) { set_vector_end; tmp++; } \ | |
3199 p = (type_t *)((char *)p + fmt->size); \ | |
3200 } \ | |
3201 } | |
3202 switch (fmt->type) { | |
3203 case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break; | |
3204 case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break; | |
3205 case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break; | |
3206 case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), float); break; | |
3207 default: fprintf(stderr,"TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->type); exit(1); | |
3208 } | |
3209 #undef BRANCH | |
3210 return nsmpl*fmt->n; | |
3211 } | |
3212 |