annotate ezBAMQC/src/htslib/vcf.c @ 15:28cebcc7f774

Uploaded
author cshl-bsr
date Wed, 30 Mar 2016 12:15:18 -0400
parents dfa3745e5fd8
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1 /* vcf.c -- VCF/BCF API functions.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3 Copyright (C) 2012, 2013 Broad Institute.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
4 Copyright (C) 2012-2014 Genome Research Ltd.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
5 Portions copyright (C) 2014 Intel Corporation.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
6
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
7 Author: Heng Li <lh3@sanger.ac.uk>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
8
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
9 Permission is hereby granted, free of charge, to any person obtaining a copy
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
10 of this software and associated documentation files (the "Software"), to deal
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
11 in the Software without restriction, including without limitation the rights
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
13 copies of the Software, and to permit persons to whom the Software is
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
14 furnished to do so, subject to the following conditions:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
15
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
16 The above copyright notice and this permission notice shall be included in
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
17 all copies or substantial portions of the Software.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
18
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
25 DEALINGS IN THE SOFTWARE. */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
26
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
27 #include <zlib.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
28 #include <stdio.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
29 #include <ctype.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
30 #include <assert.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
31 #include <string.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
32 #include <stdlib.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
33 #include <limits.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
34 #include "htslib/kstring.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
35 #include "htslib/bgzf.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
36 #include "htslib/vcf.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
37 #include "htslib/tbx.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
38 #include "htslib/hfile.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
39 #include "htslib/khash_str2int.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
40
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
41 #include "htslib/khash.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
/* Instantiate a khash map named "vdict" with C-string keys and bcf_idinfo_t
   values; vdict_t is the map type the header dictionaries are cast to below. */
42 KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
43 typedef khash_t(vdict) vdict_t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
44
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
45 #include "htslib/kseq.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
/* Declare the kseq stream type over gzFile, read with gzread. */
46 KSTREAM_DECLARE(gzFile, gzread)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
47
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
/* 32-bit patterns; judging by the names they mark a missing float and the end
   of a float vector -- NOTE(review): their use is outside this view, confirm. */
48 uint32_t bcf_float_missing = 0x7F800001;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
49 uint32_t bcf_float_vector_end = 0x7F800002;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
/* 16-entry table; presumably indexed by BCF type code and holding log2 of the
   element size in bytes -- TODO confirm against the users of this table. */
50 uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
/* Template value assigned to every newly inserted dictionary entry before its
   id (and, for contigs, info[0]/hrec[0]) is filled in. */
51 static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
52
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
53 /*************************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
54 *** VCF header parser ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
55 *************************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
56
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
57 int bcf_hdr_sync(bcf_hdr_t *h);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
58
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
59 int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
60 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
61 if ( !s ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
62
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
63 const char *ss = s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
64 while ( !*ss && isspace(*ss) ) ss++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
65 if ( !*ss )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
66 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
67 fprintf(stderr,"[E::%s] Empty sample name: trailing spaces/tabs in the header line?\n", __func__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
68 abort();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
69 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
70
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
71 vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
72 int ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
73 char *sdup = strdup(s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
74 int k = kh_put(vdict, d, sdup, &ret);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
75 if (ret) { // absent
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
76 kh_val(d, k) = bcf_idinfo_def;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
77 kh_val(d, k).id = kh_size(d) - 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
78 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
79 if (hts_verbose >= 2)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
80 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
81 fprintf(stderr, "[E::%s] Duplicated sample name '%s'\n", __func__, s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
82 abort();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
83 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
84 free(sdup);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
85 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
86 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
87 int n = kh_size(d);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
88 h->samples = (char**) realloc(h->samples,sizeof(char*)*n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
89 h->samples[n-1] = sdup;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
90 h->dirty = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
91 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
92 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
93
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
94 int bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
95 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
96 int ret = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
97 int i = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
98 const char *p, *q;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
99 // add samples
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
100 for (p = q = str;; ++q) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
101 if (*q != '\t' && *q != 0 && *q != '\n') continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
102 if (++i > 9) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
103 char *s = (char*)malloc(q - p + 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
104 strncpy(s, p, q - p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
105 s[q - p] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
106 if ( bcf_hdr_add_sample(h,s) < 0 ) ret = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
107 free(s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
108 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
109 if (*q == 0 || *q == '\n') break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
110 p = q + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
111 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
112 bcf_hdr_add_sample(h,NULL);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
113 return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
114 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
115
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
116 int bcf_hdr_sync(bcf_hdr_t *h)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
117 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
118 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
119 for (i = 0; i < 3; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
120 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
121 vdict_t *d = (vdict_t*)h->dict[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
122 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
123
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
124 // find out the largest id, there may be holes because of IDX
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
125 int max_id = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
126 for (k=kh_begin(d); k<kh_end(d); k++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
127 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
128 if (!kh_exist(d,k)) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
129 if ( max_id < kh_val(d,k).id ) max_id = kh_val(d,k).id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
130 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
131 if ( max_id >= h->n[i] )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
132 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
133 h->id[i] = (bcf_idpair_t*)realloc(h->id[i], (max_id+1)*sizeof(bcf_idpair_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
134 for (k=h->n[i]; k<=max_id; k++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
135 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
136 h->id[i][k].key = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
137 h->id[i][k].val = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
138 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
139 h->n[i] = max_id+1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
140 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
141 for (k=kh_begin(d); k<kh_end(d); k++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
142 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
143 if (!kh_exist(d,k)) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
144 h->id[i][kh_val(d,k).id].key = kh_key(d,k);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
145 h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
146 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
147 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
148 h->dirty = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
149 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
150 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
151
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
152 void bcf_hrec_destroy(bcf_hrec_t *hrec)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
153 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
154 free(hrec->key);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
155 if ( hrec->value ) free(hrec->value);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
156 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
157 for (i=0; i<hrec->nkeys; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
158 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
159 free(hrec->keys[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
160 free(hrec->vals[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
161 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
162 free(hrec->keys);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
163 free(hrec->vals);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
164 free(hrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
165 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
166
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
167 // Copies all fields except IDX.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
168 bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
169 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
170 bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
171 out->type = hrec->type;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
172 if ( hrec->key ) out->key = strdup(hrec->key);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
173 if ( hrec->value ) out->value = strdup(hrec->value);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
174 out->nkeys = hrec->nkeys;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
175 out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
176 out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
177 int i, j = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
178 for (i=0; i<hrec->nkeys; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
179 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
180 if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
181 if ( hrec->keys[i] ) out->keys[j] = strdup(hrec->keys[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
182 if ( hrec->vals[i] ) out->vals[j] = strdup(hrec->vals[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
183 j++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
184 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
185 if ( i!=j ) out->nkeys -= i-j; // IDX was omitted
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
186 return out;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
187 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
188
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
189 void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
190 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
191 fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
192 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
193 for (i=0; i<hrec->nkeys; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
194 fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
195 fprintf(fp, "\n");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
196 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
197
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
198 void bcf_header_debug(bcf_hdr_t *hdr)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
199 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
200 int i, j;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
201 for (i=0; i<hdr->nhrec; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
202 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
203 if ( !hdr->hrec[i]->value )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
204 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
205 fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
206 fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
207 for (j=1; j<hdr->hrec[i]->nkeys; j++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
208 fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
209 fprintf(stderr,">\n");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
210 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
211 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
212 fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
213 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
214 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
215
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
216 void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
217 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
218 int n = ++hrec->nkeys;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
219 hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
220 hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
221 assert( len );
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
222 hrec->keys[n-1] = (char*) malloc((len+1)*sizeof(char));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
223 memcpy(hrec->keys[n-1],str,len);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
224 hrec->keys[n-1][len] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
225 hrec->vals[n-1] = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
226 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
227
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
228 void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
229 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
230 if ( !str ) { hrec->vals[i] = NULL; return; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
231 if ( hrec->vals[i] ) free(hrec->vals[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
232 if ( is_quoted )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
233 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
234 hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
235 hrec->vals[i][0] = '"';
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
236 memcpy(&hrec->vals[i][1],str,len);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
237 hrec->vals[i][len+1] = '"';
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
238 hrec->vals[i][len+2] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
239 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
240 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
241 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
242 hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
243 memcpy(hrec->vals[i],str,len);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
244 hrec->vals[i][len] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
245 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
246 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
247
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
248 void hrec_add_idx(bcf_hrec_t *hrec, int idx)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
249 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
250 int n = ++hrec->nkeys;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
251 hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
252 hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
253 hrec->keys[n-1] = strdup("IDX");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
254 kstring_t str = {0,0,0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
255 kputw(idx, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
256 hrec->vals[n-1] = str.s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
257 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
258
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
259 int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
260 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
261 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
262 for (i=0; i<hrec->nkeys; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
263 if ( !strcasecmp(key,hrec->keys[i]) ) return i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
264 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
265 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
266
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
/*
 *  is_escaped() - tell whether the character at str is backslash-escaped.
 *
 *  min: first character that may be inspected
 *  str: position of the character in question (min <= str)
 *
 *  Counts the run of backslashes immediately before str, never looking
 *  before min. Returns 1 if that count is odd (the character is escaped)
 *  and 0 if it is even.
 */
static inline int is_escaped(const char *min, const char *str)
{
    int n = 0;
    // Test str > min before stepping back so that no pointer below min is formed.
    while ( str > min && str[-1] == '\\' ) { str--; n++; }
    return n%2;
}
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
273
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
274 bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
275 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
276 const char *p = line;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
277 if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
278 p += 2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
279
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
280 const char *q = p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
281 while ( *q && *q!='=' ) q++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
282 int n = q-p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
283 if ( *q!='=' || !n ) { *len = q-line+1; return NULL; } // wrong format
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
284
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
285 bcf_hrec_t *hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
286 hrec->key = (char*) malloc(sizeof(char)*(n+1));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
287 memcpy(hrec->key,p,n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
288 hrec->key[n] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
289
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
290 p = ++q;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
291 if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
292 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
293 while ( *q && *q!='\n' ) q++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
294 hrec->value = (char*) malloc((q-p+1)*sizeof(char));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
295 memcpy(hrec->value, p, q-p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
296 hrec->value[q-p] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
297 *len = q-line+1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
298 return hrec;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
299 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
300
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
301 // structured line, e.g. ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
302 int nopen = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
303 while ( *q && *q!='\n' && nopen )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
304 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
305 p = ++q;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
306 while ( *q && isalnum(*q) ) q++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
307 n = q-p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
308 if ( *q!='=' || !n )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
309 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
310 // wrong format
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
311 while ( *q && *q!='\n' ) q++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
312 kstring_t tmp = {0,0,0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
313 kputsn(line,q-line,&tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
314 fprintf(stderr,"Could not parse the header line: \"%s\"\n", tmp.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
315 free(tmp.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
316 *len = q-line+1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
317 bcf_hrec_destroy(hrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
318 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
319 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
320 bcf_hrec_add_key(hrec, p, q-p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
321 p = ++q;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
322 int quoted = *p=='"' ? 1 : 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
323 if ( quoted ) p++, q++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
324 while (1)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
325 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
326 if ( !*q ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
327 if ( quoted ) { if ( *q=='"' && !is_escaped(p,q) ) break; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
328 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
329 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
330 if ( *q=='<' ) nopen++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
331 if ( *q=='>' ) nopen--;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
332 if ( !nopen ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
333 if ( *q==',' && nopen==1 ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
334 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
335 q++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
336 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
337 bcf_hrec_set_val(hrec, hrec->nkeys-1, p, q-p, quoted);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
338 if ( quoted ) q++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
339 if ( *q=='>' ) { nopen--; q++; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
340 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
341 *len = q-line+1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
342 return hrec;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
343 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
344
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
345 // returns: 1 when hdr needs to be synced, 0 otherwise
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
346 int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
347 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
348 // contig
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
349 int i,j,k, ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
350 char *str;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
351 if ( !strcmp(hrec->key, "contig") )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
352 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
353 hrec->type = BCF_HL_CTG;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
354
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
355 // Get the contig ID ($str) and length ($j)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
356 i = bcf_hrec_find_key(hrec,"length");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
357 if ( i<0 ) j = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
358 else if ( sscanf(hrec->vals[i],"%d",&j)!=1 ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
359
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
360 i = bcf_hrec_find_key(hrec,"ID");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
361 if ( i<0 ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
362 str = strdup(hrec->vals[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
363
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
364 // Register in the dictionary
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
365 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
366 k = kh_put(vdict, d, str, &ret);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
367 if ( !ret ) { free(str); return 0; } // already present
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
368
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
369 int idx = bcf_hrec_find_key(hrec,"IDX");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
370 if ( idx!=-1 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
371 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
372 char *tmp = hrec->vals[idx];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
373 idx = strtol(hrec->vals[idx], &tmp, 10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
374 if ( *tmp )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
375 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
376 fprintf(stderr,"[%s:%d %s] Error parsing the IDX tag, skipping.\n", __FILE__,__LINE__,__FUNCTION__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
377 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
378 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
379 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
380 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
381 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
382 idx = kh_size(d) - 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
383 hrec_add_idx(hrec, idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
384 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
385
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
386 kh_val(d, k) = bcf_idinfo_def;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
387 kh_val(d, k).id = idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
388 kh_val(d, k).info[0] = j;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
389 kh_val(d, k).hrec[0] = hrec;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
390
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
391 return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
392 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
393
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
394 if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
395 else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
396 else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
397 else if ( hrec->nkeys>0 ) { hrec->type = BCF_HL_STR; return 1; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
398 else return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
399
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
400 // INFO/FILTER/FORMAT
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
401 char *id = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
402 int type = -1, num = -1, var = -1, idx = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
403 for (i=0; i<hrec->nkeys; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
404 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
405 if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
406 else if ( !strcmp(hrec->keys[i], "IDX") )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
407 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
408 char *tmp = hrec->vals[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
409 idx = strtol(hrec->vals[i], &tmp, 10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
410 if ( *tmp )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
411 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
412 fprintf(stderr,"[%s:%d %s] Error parsing the IDX tag, skipping.\n", __FILE__,__LINE__,__FUNCTION__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
413 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
414 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
415 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
416 else if ( !strcmp(hrec->keys[i], "Type") )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
417 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
418 if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
419 else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
420 else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
421 else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
422 else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
423 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
424 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
425 fprintf(stderr, "[E::%s] The type \"%s\" not supported, assuming \"String\"\n", __func__, hrec->vals[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
426 type = BCF_HT_STR;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
427 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
428 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
429 else if ( !strcmp(hrec->keys[i], "Number") )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
430 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
431 if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
432 else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
433 else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
434 else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
435 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
436 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
437 sscanf(hrec->vals[i],"%d",&num);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
438 var = BCF_VL_FIXED;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
439 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
440 if (var != BCF_VL_FIXED) num = 0xfffff;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
441 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
442 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
443 uint32_t info = (uint32_t)num<<12 | var<<8 | type<<4 | hrec->type;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
444
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
445 if ( !id ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
446 str = strdup(id);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
447
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
448 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
449 k = kh_put(vdict, d, str, &ret);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
450 if ( !ret )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
451 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
452 // already present
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
453 free(str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
454 if ( kh_val(d, k).hrec[info&0xf] ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
455 kh_val(d, k).info[info&0xf] = info;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
456 kh_val(d, k).hrec[info&0xf] = hrec;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
457 if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d, k).id);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
458 return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
459 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
460 kh_val(d, k) = bcf_idinfo_def;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
461 kh_val(d, k).info[info&0xf] = info;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
462 kh_val(d, k).hrec[info&0xf] = hrec;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
463 kh_val(d, k).id = idx==-1 ? kh_size(d) - 1 : idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
464
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
465 if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d, k).id);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
466
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
467 return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
468 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
469
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
470 int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
471 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
472 if ( !hrec ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
473
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
474 hrec->type = BCF_HL_GEN;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
475 if ( !bcf_hdr_register_hrec(hdr,hrec) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
476 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
477 // If one of the hashed field, then it is already present
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
478 if ( hrec->type != BCF_HL_GEN )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
479 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
480 bcf_hrec_destroy(hrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
481 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
482 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
483
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
484 // Is one of the generic fields and already present?
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
485 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
486 for (i=0; i<hdr->nhrec; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
487 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
488 if ( hdr->hrec[i]->type!=BCF_HL_GEN ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
489 if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hrec->key,"fileformat") ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
490 if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hdr->hrec[i]->value,hrec->value) ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
491 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
492 if ( i<hdr->nhrec )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
493 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
494 bcf_hrec_destroy(hrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
495 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
496 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
497 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
498
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
499 // New record, needs to be added
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
500 int n = ++hdr->nhrec;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
501 hdr->hrec = (bcf_hrec_t**) realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
502 hdr->hrec[n-1] = hrec;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
503 hdr->dirty = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
504
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
505 return hrec->type==BCF_HL_GEN ? 0 : 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
506 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
507
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
508 /*
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
509 * Note that while querying of FLT,INFO,FMT,CTG lines is fast (the keys are hashed),
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
510 * the STR,GEN lines are searched for linearly in a linked list of all header lines.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
511 * This may become a problem for VCFs with huge headers, we might need to build a
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
512 * dictionary for these lines as well.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
513 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
514 bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
515 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
516 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
517 if ( type==BCF_HL_GEN )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
518 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
519 for (i=0; i<hdr->nhrec; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
520 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
521 if ( hdr->hrec[i]->type!=type ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
522 if ( strcmp(hdr->hrec[i]->key,key) ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
523 if ( !value || !strcmp(hdr->hrec[i]->value,value) ) return hdr->hrec[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
524 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
525 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
526 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
527 else if ( type==BCF_HL_STR )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
528 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
529 for (i=0; i<hdr->nhrec; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
530 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
531 if ( hdr->hrec[i]->type!=type ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
532 if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
533 int j = bcf_hrec_find_key(hdr->hrec[i],key);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
534 if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
535 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
536 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
537 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
538 vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
539 khint_t k = kh_get(vdict, d, value);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
540 if ( k == kh_end(d) ) return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
541 return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
542 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
543
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
544 void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
545 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
546 static int PL_warned = 0, GL_warned = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
547
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
548 if ( !PL_warned )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
549 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
550 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
551 if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
552 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
553 fprintf(stderr,"[W::%s] PL should be declared as Number=G\n", __func__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
554 PL_warned = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
555 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
556 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
557 if ( !GL_warned )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
558 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
559 int id = bcf_hdr_id2int(hdr, BCF_HL_FMT, "GL");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
560 if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
561 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
562 fprintf(stderr,"[W::%s] GL should be declared as Number=G\n", __func__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
563 PL_warned = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
564 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
565 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
566 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
567
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
568 int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
569 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
570 int len, needs_sync = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
571 char *p = htxt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
572
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
573 // Check sanity: "fileformat" string must come as first
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
574 bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
575 if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
576 fprintf(stderr, "[W::%s] The first line should be ##fileformat; is the VCF/BCF header broken?\n", __func__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
577 needs_sync += bcf_hdr_add_hrec(hdr, hrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
578
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
579 // The filter PASS must appear first in the dictionary
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
580 hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
581 needs_sync += bcf_hdr_add_hrec(hdr, hrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
582
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
583 // Parse the whole header
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
584 while ( (hrec=bcf_hdr_parse_line(hdr,p,&len)) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
585 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
586 needs_sync += bcf_hdr_add_hrec(hdr, hrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
587 p += len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
588 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
589 int ret = bcf_hdr_parse_sample_line(hdr,p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
590 bcf_hdr_sync(hdr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
591 bcf_hdr_check_sanity(hdr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
592 return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
593 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
594
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
595 int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
596 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
597 int len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
598 bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
599 if ( !hrec ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
600 bcf_hdr_add_hrec(hdr, hrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
601 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
602 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
603
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
604 void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
605 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
606 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
607 bcf_hrec_t *hrec;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
608 while (1)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
609 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
610 if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
611 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
612 hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
613 if ( !hrec ) return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
614
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
615 for (i=0; i<hdr->nhrec; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
616 if ( hdr->hrec[i]==hrec ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
617 assert( i<hdr->nhrec );
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
618
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
619 vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
620 khint_t k = kh_get(vdict, d, key);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
621 kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
622 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
623 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
624 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
625 for (i=0; i<hdr->nhrec; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
626 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
627 if ( hdr->hrec[i]->type!=type ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
628 if ( type==BCF_HL_GEN )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
629 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
630 if ( !strcmp(hdr->hrec[i]->key,key) ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
631 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
632 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
633 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
634 // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
635 int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
636 if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
637 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
638 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
639 if ( i==hdr->nhrec ) return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
640 hrec = hdr->hrec[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
641 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
642
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
643 hdr->nhrec--;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
644 if ( i < hdr->nhrec )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
645 memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
646 bcf_hrec_destroy(hrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
647 hdr->dirty = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
648 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
649 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
650
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
651 int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
652 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
653 va_list ap;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
654 va_start(ap, fmt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
655 int n = vsnprintf(NULL, 0, fmt, ap) + 2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
656 va_end(ap);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
657
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
658 char *line = (char*)malloc(n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
659 va_start(ap, fmt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
660 vsnprintf(line, n, fmt, ap);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
661 va_end(ap);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
662
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
663 int ret = bcf_hdr_append(hdr, line);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
664
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
665 free(line);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
666 return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
667 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
668
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
669
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
670 /**********************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
671 *** BCF header I/O ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
672 **********************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
673
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
674 const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
675 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
676 bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
677 if ( !hrec )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
678 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
679 fprintf(stderr,"No version string found, assuming VCFv4.2\n");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
680 return "VCFv4.2";
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
681 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
682 return hrec->value;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
683 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
684
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
685 void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
686 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
687 bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
688 if ( !hrec )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
689 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
690 int len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
691 kstring_t str = {0,0,0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
692 ksprintf(&str,"##fileformat=%s", version);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
693 hrec = bcf_hdr_parse_line(hdr, str.s, &len);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
694 free(str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
695 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
696 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
697 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
698 free(hrec->value);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
699 hrec->value = strdup(version);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
700 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
701 hdr->dirty = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
702 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
703
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
704 bcf_hdr_t *bcf_hdr_init(const char *mode)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
705 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
706 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
707 bcf_hdr_t *h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
708 h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
709 for (i = 0; i < 3; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
710 h->dict[i] = kh_init(vdict);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
711 if ( strchr(mode,'w') )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
712 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
713 bcf_hdr_append(h, "##fileformat=VCFv4.2");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
714 // The filter PASS must appear first in the dictionary
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
715 bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
716 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
717 return h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
718 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
719
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
720 void bcf_hdr_destroy(bcf_hdr_t *h)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
721 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
722 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
723 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
724 for (i = 0; i < 3; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
725 vdict_t *d = (vdict_t*)h->dict[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
726 if (d == 0) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
727 for (k = kh_begin(d); k != kh_end(d); ++k)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
728 if (kh_exist(d, k)) free((char*)kh_key(d, k));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
729 kh_destroy(vdict, d);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
730 free(h->id[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
731 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
732 for (i=0; i<h->nhrec; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
733 bcf_hrec_destroy(h->hrec[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
734 if (h->nhrec) free(h->hrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
735 if (h->samples) free(h->samples);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
736 free(h->keep_samples);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
737 free(h->transl[0]); free(h->transl[1]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
738 free(h->mem.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
739 free(h);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
740 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
741
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
742 bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
743 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
744 if (hfp->format.format == vcf)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
745 return vcf_hdr_read(hfp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
746
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
747 BGZF *fp = hfp->fp.bgzf;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
748 uint8_t magic[5];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
749 bcf_hdr_t *h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
750 h = bcf_hdr_init("r");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
751 if ( bgzf_read(fp, magic, 5)<0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
752 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
753 fprintf(stderr,"[%s:%d %s] Failed to read the header (reading BCF in text mode?)\n", __FILE__,__LINE__,__FUNCTION__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
754 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
755 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
756 if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
757 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
758 if (!strncmp((char*)magic, "BCF", 3))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
759 fprintf(stderr,"[%s:%d %s] invalid BCF2 magic string: only BCFv2.2 is supported.\n", __FILE__,__LINE__,__FUNCTION__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
760 else if (hts_verbose >= 2)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
761 fprintf(stderr, "[E::%s] invalid BCF2 magic string\n", __func__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
762 bcf_hdr_destroy(h);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
763 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
764 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
765 int hlen;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
766 char *htxt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
767 bgzf_read(fp, &hlen, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
768 htxt = (char*)malloc(hlen);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
769 bgzf_read(fp, htxt, hlen);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
770 bcf_hdr_parse(h, htxt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
771 free(htxt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
772 return h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
773 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
774
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
775 int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
776 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
777 if ( h->dirty ) bcf_hdr_sync(h);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
778 if (hfp->format.format == vcf || hfp->format.format == text_format)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
779 return vcf_hdr_write(hfp, h);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
780
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
781 int hlen;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
782 char *htxt = bcf_hdr_fmt_text(h, 1, &hlen);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
783 hlen++; // include the \0 byte
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
784
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
785 BGZF *fp = hfp->fp.bgzf;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
786 if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
787 if ( bgzf_write(fp, &hlen, 4) !=4 ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
788 if ( bgzf_write(fp, htxt, hlen) != hlen ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
789
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
790 free(htxt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
791 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
792 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
793
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
794 /********************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
795 *** BCF site I/O ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
796 ********************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
797
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
798 bcf1_t *bcf_init1()
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
799 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
800 bcf1_t *v;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
801 v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
802 return v;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
803 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
804
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
805 void bcf_clear(bcf1_t *v)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
806 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
807 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
808 for (i=0; i<v->d.m_info; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
809 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
810 if ( v->d.info[i].vptr_free )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
811 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
812 free(v->d.info[i].vptr - v->d.info[i].vptr_off);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
813 v->d.info[i].vptr_free = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
814 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
815 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
816 for (i=0; i<v->d.m_fmt; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
817 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
818 if ( v->d.fmt[i].p_free )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
819 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
820 free(v->d.fmt[i].p - v->d.fmt[i].p_off);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
821 v->d.fmt[i].p_free = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
822 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
823 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
824 v->rid = v->pos = v->rlen = v->unpacked = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
825 bcf_float_set_missing(v->qual);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
826 v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
827 v->shared.l = v->indiv.l = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
828 v->d.var_type = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
829 v->d.shared_dirty = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
830 v->d.indiv_dirty = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
831 v->d.n_flt = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
832 v->errcode = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
833 if (v->d.m_als) v->d.als[0] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
834 if (v->d.m_id) v->d.id[0] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
835 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
836
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
837 void bcf_empty1(bcf1_t *v)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
838 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
839 bcf_clear1(v);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
840 free(v->d.id);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
841 free(v->d.als);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
842 free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
843 if (v->d.var ) free(v->d.var);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
844 free(v->shared.s); free(v->indiv.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
845 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
846
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
847 void bcf_destroy1(bcf1_t *v)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
848 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
849 bcf_empty1(v);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
850 free(v);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
851 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
852
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
853 static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
854 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
855 uint32_t x[8];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
856 int ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
857 if ((ret = bgzf_read(fp, x, 32)) != 32) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
858 if (ret == 0) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
859 return -2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
860 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
861 bcf_clear1(v);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
862 x[0] -= 24; // to exclude six 32-bit integers
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
863 ks_resize(&v->shared, x[0]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
864 ks_resize(&v->indiv, x[1]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
865 memcpy(v, x + 2, 16);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
866 v->n_allele = x[6]>>16; v->n_info = x[6]&0xffff;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
867 v->n_fmt = x[7]>>24; v->n_sample = x[7]&0xffffff;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
868 v->shared.l = x[0], v->indiv.l = x[1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
869
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
870 // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
871 if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
872
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
873 bgzf_read(fp, v->shared.s, v->shared.l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
874 bgzf_read(fp, v->indiv.s, v->indiv.l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
875 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
876 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
877
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
// Bit-array helpers over an array of bytes: bit number `bit` is stored in
// byte bit/8 under the mask 1<<(bit%8). bit_array_size() gives the number of
// bytes needed for `nbits` bits (always at least one).
#define bit_array_size(nbits)       ((nbits)/8+1)
#define bit_array_set(arr,bit)      ((arr)[(bit)/8] |= 1 << ((bit)%8))
#define bit_array_clear(arr,bit)    ((arr)[(bit)/8] &= ~(1 << ((bit)%8)))
#define bit_array_test(arr,bit)     ((arr)[(bit)/8] & (1 << ((bit)%8)))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
882
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
883 static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
884 int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
885 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
886 if ( !hdr->keep_samples ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
887 if ( !bcf_hdr_nsamples(hdr) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
888 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
889 rec->indiv.l = rec->n_sample = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
890 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
891 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
892
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
893 int i, j;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
894 uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
895 bcf_dec_t *dec = &rec->d;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
896 hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
897 for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
898
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
899 for (i=0; i<rec->n_fmt; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
900 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
901 ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
902 src = dec->fmt[i].p - dec->fmt[i].size;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
903 if ( dst )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
904 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
905 memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
906 dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
907 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
908 dst = dec->fmt[i].p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
909 for (j=0; j<hdr->nsamples_ori; j++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
910 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
911 src += dec->fmt[i].size;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
912 if ( !bit_array_test(hdr->keep_samples,j) ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
913 memmove(dst, src, dec->fmt[i].size);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
914 dst += dec->fmt[i].size;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
915 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
916 rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
917 dec->fmt[i].p_len = dst - dec->fmt[i].p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
918 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
919 rec->unpacked |= BCF_UN_FMT;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
920
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
921 rec->n_sample = bcf_hdr_nsamples(hdr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
922 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
923 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
924
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
925 int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
926 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
927 if (fp->format.format == vcf) return vcf_read(fp,h,v);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
928 int ret = bcf_read1_core(fp->fp.bgzf, v);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
929 if ( ret!=0 || !h->keep_samples ) return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
930 return bcf_subset_format(h,v);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
931 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
932
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
933 int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, int *beg, int *end)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
934 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
935 bcf1_t *v = (bcf1_t *) vv;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
936 int ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
937 if ((ret = bcf_read1_core(fp, v)) >= 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
938 *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
939 return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
940 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
941
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
942 static inline void bcf1_sync_id(bcf1_t *line, kstring_t *str)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
943 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
944 // single typed string
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
945 if ( line->d.id && strcmp(line->d.id, ".") ) bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
946 else bcf_enc_size(str, 0, BCF_BT_CHAR);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
947 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
948 static inline void bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
949 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
950 // list of typed strings
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
951 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
952 for (i=0; i<line->n_allele; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
953 bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
954 if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
955 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
956 static inline void bcf1_sync_filter(bcf1_t *line, kstring_t *str)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
957 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
958 // typed vector of integers
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
959 if ( line->d.n_flt ) bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
960 else bcf_enc_vint(str, 0, 0, -1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
961 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
962
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
963 static inline void bcf1_sync_info(bcf1_t *line, kstring_t *str)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
964 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
965 // pairs of typed vectors
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
966 int i, irm = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
967 for (i=0; i<line->n_info; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
968 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
969 bcf_info_t *info = &line->d.info[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
970 if ( !info->vptr )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
971 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
972 // marked for removal
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
973 if ( irm < 0 ) irm = i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
974 continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
975 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
976 kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
977 if ( irm >=0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
978 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
979 bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
980 while ( irm<=i && line->d.info[irm].vptr ) irm++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
981 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
982 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
983 if ( irm>=0 ) line->n_info = irm;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
984 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
985
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
986 static int bcf1_sync(bcf1_t *line)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
987 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
988 char *shared_ori = line->shared.s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
989 size_t prev_len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
990
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
991 kstring_t tmp = {0,0,0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
992 if ( !line->shared.l )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
993 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
994 // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
995 tmp = line->shared;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
996 bcf1_sync_id(line, &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
997 line->unpack_size[0] = tmp.l; prev_len = tmp.l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
998
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
999 bcf1_sync_alleles(line, &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1000 line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1001
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1002 bcf1_sync_filter(line, &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1003 line->unpack_size[2] = tmp.l - prev_len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1004
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1005 bcf1_sync_info(line, &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1006 line->shared = tmp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1007 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1008 else if ( line->d.shared_dirty )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1009 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1010 // The line was edited, update the BCF data block, ptr_ori points
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1011 // to the original unchanged BCF data.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1012 uint8_t *ptr_ori = (uint8_t *) line->shared.s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1013
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1014 assert( line->unpacked & BCF_UN_STR );
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1015
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1016 // ID: single typed string
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1017 if ( line->d.shared_dirty & BCF1_DIRTY_ID )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1018 bcf1_sync_id(line, &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1019 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1020 kputsn_(ptr_ori, line->unpack_size[0], &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1021 ptr_ori += line->unpack_size[0];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1022 line->unpack_size[0] = tmp.l; prev_len = tmp.l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1023
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1024 // REF+ALT: list of typed strings
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1025 if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1026 bcf1_sync_alleles(line, &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1027 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1028 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1029 kputsn_(ptr_ori, line->unpack_size[1], &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1030 if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1031 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1032 ptr_ori += line->unpack_size[1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1033 line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1034
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1035 if ( line->unpacked & BCF_UN_FLT )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1036 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1037 // FILTER: typed vector of integers
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1038 if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1039 bcf1_sync_filter(line, &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1040 else if ( line->d.n_flt )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1041 kputsn_(ptr_ori, line->unpack_size[2], &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1042 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1043 bcf_enc_vint(&tmp, 0, 0, -1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1044 ptr_ori += line->unpack_size[2];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1045 line->unpack_size[2] = tmp.l - prev_len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1046
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1047 if ( line->unpacked & BCF_UN_INFO )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1048 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1049 // INFO: pairs of typed vectors
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1050 if ( line->d.shared_dirty & BCF1_DIRTY_INF )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1051 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1052 bcf1_sync_info(line, &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1053 ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1054 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1055 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1056 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1057
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1058 int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1059 if ( size ) kputsn_(ptr_ori, size, &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1060
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1061 free(line->shared.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1062 line->shared = tmp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1063 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1064 if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1065 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1066 // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1067 size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1068 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1069 for (i=0; i<line->n_info; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1070 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1071 uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1072 line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1073 off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1074 if ( vptr_free )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1075 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1076 free(vptr_free);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1077 line->d.info[i].vptr_free = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1078 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1079 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1080 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1081
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1082 if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1083 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1084 // The genotype fields changed or are not present
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1085 tmp.l = tmp.m = 0; tmp.s = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1086 int i, irm = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1087 for (i=0; i<line->n_fmt; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1088 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1089 bcf_fmt_t *fmt = &line->d.fmt[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1090 if ( !fmt->p )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1091 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1092 // marked for removal
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1093 if ( irm < 0 ) irm = i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1094 continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1095 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1096 kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1097 if ( irm >=0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1098 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1099 bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1100 while ( irm<=i && line->d.fmt[irm].p ) irm++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1101 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1102
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1103 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1104 if ( irm>=0 ) line->n_fmt = irm;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1105 free(line->indiv.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1106 line->indiv = tmp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1107
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1108 // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1109 size_t off_new = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1110 for (i=0; i<line->n_fmt; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1111 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1112 uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1113 line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1114 off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1115 if ( p_free )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1116 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1117 free(p_free);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1118 line->d.fmt[i].p_free = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1119 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1120 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1121 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1122 if ( !line->n_sample ) line->n_fmt = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1123 line->d.shared_dirty = line->d.indiv_dirty = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1124 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1125 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1126
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1127 bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1128 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1129 bcf1_sync(src);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1130
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1131 bcf_clear(dst);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1132 dst->rid = src->rid;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1133 dst->pos = src->pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1134 dst->rlen = src->rlen;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1135 dst->qual = src->qual;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1136 dst->n_info = src->n_info; dst->n_allele = src->n_allele;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1137 dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1138
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1139 dst->shared.m = dst->shared.l = src->shared.l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1140 dst->shared.s = (char*) malloc(dst->shared.l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1141 memcpy(dst->shared.s,src->shared.s,dst->shared.l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1142
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1143 dst->indiv.m = dst->indiv.l = src->indiv.l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1144 dst->indiv.s = (char*) malloc(dst->indiv.l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1145 memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1146
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1147 return dst;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1148 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1149 bcf1_t *bcf_dup(bcf1_t *src)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1150 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1151 bcf1_t *out = bcf_init1();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1152 return bcf_copy(out, src);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1153 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1154
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1155 int bcf_write(htsFile *hfp, const bcf_hdr_t *h, bcf1_t *v)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1156 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1157 if ( h->dirty )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1158 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1159 // we could as well call bcf_hdr_sync here, not sure
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1160 fprintf(stderr,"FIXME: dirty header not synced\n");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1161 exit(1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1162 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1163 if ( bcf_hdr_nsamples(h)!=v->n_sample )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1164 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1165 fprintf(stderr,"[%s:%d %s] Broken VCF record, the number of columns at %s:%d does not match the number of samples (%d vs %d).\n",
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1166 __FILE__,__LINE__,__FUNCTION__,bcf_seqname(h,v),v->pos+1, v->n_sample,bcf_hdr_nsamples(h));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1167 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1168 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1169
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1170 if ( hfp->format.format == vcf || hfp->format.format == text_format )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1171 return vcf_write(hfp,h,v);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1172
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1173 if ( v->errcode )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1174 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1175 // vcf_parse1() encountered a new contig or tag, undeclared in the
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1176 // header. At this point, the header must have been printed,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1177 // proceeding would lead to a broken BCF file. Errors must be checked
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1178 // and cleared by the caller before we can proceed.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1179 fprintf(stderr,"[%s:%d %s] Unchecked error (%d), exiting.\n", __FILE__,__LINE__,__FUNCTION__,v->errcode);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1180 exit(1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1181 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1182 bcf1_sync(v); // check if the BCF record was modified
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1183
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1184 BGZF *fp = hfp->fp.bgzf;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1185 uint32_t x[8];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1186 x[0] = v->shared.l + 24; // to include six 32-bit integers
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1187 x[1] = v->indiv.l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1188 memcpy(x + 2, v, 16);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1189 x[6] = (uint32_t)v->n_allele<<16 | v->n_info;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1190 x[7] = (uint32_t)v->n_fmt<<24 | v->n_sample;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1191 if ( bgzf_write(fp, x, 32) != 32 ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1192 if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1193 if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1194 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1195 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1196
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1197 /**********************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1198 *** VCF header I/O ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1199 **********************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1200
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1201 bcf_hdr_t *vcf_hdr_read(htsFile *fp)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1202 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1203 kstring_t txt, *s = &fp->line;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1204 bcf_hdr_t *h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1205 h = bcf_hdr_init("r");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1206 txt.l = txt.m = 0; txt.s = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1207 while (hts_getline(fp, KS_SEP_LINE, s) >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1208 if (s->l == 0) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1209 if (s->s[0] != '#') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1210 if (hts_verbose >= 2)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1211 fprintf(stderr, "[E::%s] no sample line\n", __func__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1212 free(txt.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1213 bcf_hdr_destroy(h);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1214 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1215 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1216 if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1217 int dret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1218 gzFile f;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1219 kstream_t *ks;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1220 kstring_t tmp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1221 tmp.l = tmp.m = 0; tmp.s = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1222 f = gzopen(fp->fn_aux, "r");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1223 ks = ks_init(f);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1224 while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1225 int c;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1226 kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1227 ks_getuntil(ks, 0, &tmp, &dret);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1228 kputs(",length=", &txt); kputw(atol(tmp.s), &txt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1229 kputsn(">\n", 2, &txt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1230 if (dret != '\n')
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1231 while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1232 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1233 free(tmp.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1234 ks_destroy(ks);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1235 gzclose(f);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1236 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1237 kputsn(s->s, s->l, &txt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1238 kputc('\n', &txt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1239 if (s->s[1] != '#') break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1240 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1241 if ( !txt.s )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1242 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1243 fprintf(stderr,"[%s:%d %s] Could not read the header\n", __FILE__,__LINE__,__FUNCTION__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1244 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1245 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1246 bcf_hdr_parse(h, txt.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1247
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1248 // check tabix index, are all contigs listed in the header? add the missing ones
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1249 tbx_t *idx = tbx_index_load(fp->fn);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1250 if ( idx )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1251 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1252 int i, n, need_sync = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1253 const char **names = tbx_seqnames(idx, &n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1254 for (i=0; i<n; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1255 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1256 bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1257 if ( hrec ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1258 hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1259 hrec->key = strdup("contig");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1260 bcf_hrec_add_key(hrec, "ID", strlen("ID"));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1261 bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1262 bcf_hdr_add_hrec(h, hrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1263 need_sync = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1264 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1265 free(names);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1266 tbx_destroy(idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1267 if ( need_sync )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1268 bcf_hdr_sync(h);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1269 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1270 free(txt.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1271 return h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1272 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1273
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1274 int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1275 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1276 int i, n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1277 char **lines = hts_readlines(fname, &n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1278 if ( !lines ) return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1279 for (i=0; i<n-1; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1280 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1281 int k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1282 bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1283 if ( hrec ) bcf_hdr_add_hrec(hdr, hrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1284 free(lines[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1285 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1286 bcf_hdr_parse_sample_line(hdr,lines[n-1]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1287 free(lines[n-1]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1288 free(lines);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1289 bcf_hdr_sync(hdr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1290 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1291 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1292
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1293 static void _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1294 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1295 if ( !hrec->value )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1296 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1297 int j, nout = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1298 ksprintf(str, "##%s=<", hrec->key);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1299 for (j=0; j<hrec->nkeys; j++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1300 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1301 // do not output IDX if output is VCF
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1302 if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1303 if ( nout ) kputc(',',str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1304 ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1305 nout++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1306 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1307 ksprintf(str,">\n");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1308 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1309 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1310 ksprintf(str,"##%s=%s\n", hrec->key,hrec->value);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1311 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1312
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1313 void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1314 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1315 _bcf_hrec_format(hrec,0,str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1316 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1317 char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1318 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1319 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1320 kstring_t txt = {0,0,0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1321 for (i=0; i<hdr->nhrec; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1322 _bcf_hrec_format(hdr->hrec[i], is_bcf, &txt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1323
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1324 ksprintf(&txt,"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1325 if ( bcf_hdr_nsamples(hdr) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1326 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1327 ksprintf(&txt,"\tFORMAT");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1328 for (i=0; i<bcf_hdr_nsamples(hdr); i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1329 ksprintf(&txt,"\t%s", hdr->samples[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1330 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1331 ksprintf(&txt,"\n");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1332
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1333 if ( len ) *len = txt.l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1334 return txt.s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1335 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1336
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1337 const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1338 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1339 vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1340 int tid, m = kh_size(d);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1341 const char **names = (const char**) calloc(m,sizeof(const char*));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1342 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1343 for (k=kh_begin(d); k<kh_end(d); k++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1344 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1345 if ( !kh_exist(d,k) ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1346 tid = kh_val(d,k).id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1347 assert( tid<m );
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1348 names[tid] = kh_key(d,k);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1349 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1350 // sanity check: there should be no gaps
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1351 for (tid=0; tid<m; tid++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1352 assert(names[tid]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1353 *n = m;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1354 return names;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1355 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1356
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1357 int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1358 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1359 int hlen;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1360 char *htxt = bcf_hdr_fmt_text(h, 0, &hlen);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1361 while (hlen && htxt[hlen-1] == 0) --hlen; // kill trailing zeros
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1362 int ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1363 if ( fp->format.compression!=no_compression )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1364 ret = bgzf_write(fp->fp.bgzf, htxt, hlen);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1365 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1366 ret = hwrite(fp->fp.hfile, htxt, hlen);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1367 free(htxt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1368 return ret<0 ? -1 : 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1369 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1370
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1371 /***********************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1372 *** Typed value I/O ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1373 ***********************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1374
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1375 void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1376 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1377 int32_t max = INT32_MIN + 1, min = INT32_MAX;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1378 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1379 if (n == 0) bcf_enc_size(s, 0, BCF_BT_NULL);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1380 else if (n == 1) bcf_enc_int1(s, a[0]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1381 else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1382 if (wsize <= 0) wsize = n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1383 for (i = 0; i < n; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1384 if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1385 if (max < a[i]) max = a[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1386 if (min > a[i]) min = a[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1387 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1388 if (max <= INT8_MAX && min > bcf_int8_vector_end) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1389 bcf_enc_size(s, wsize, BCF_BT_INT8);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1390 for (i = 0; i < n; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1391 if ( a[i]==bcf_int32_vector_end ) kputc(bcf_int8_vector_end, s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1392 else if ( a[i]==bcf_int32_missing ) kputc(bcf_int8_missing, s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1393 else kputc(a[i], s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1394 } else if (max <= INT16_MAX && min > bcf_int16_vector_end) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1395 bcf_enc_size(s, wsize, BCF_BT_INT16);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1396 for (i = 0; i < n; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1397 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1398 int16_t x;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1399 if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1400 else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1401 else x = a[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1402 kputsn((char*)&x, 2, s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1403 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1404 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1405 bcf_enc_size(s, wsize, BCF_BT_INT32);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1406 for (i = 0; i < n; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1407 int32_t x = a[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1408 kputsn((char*)&x, 4, s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1409 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1410 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1411 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1412 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1413
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1414 void bcf_enc_vfloat(kstring_t *s, int n, float *a)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1415 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1416 bcf_enc_size(s, n, BCF_BT_FLOAT);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1417 kputsn((char*)a, n << 2, s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1418 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1419
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1420 void bcf_enc_vchar(kstring_t *s, int l, const char *a)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1421 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1422 bcf_enc_size(s, l, BCF_BT_CHAR);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1423 kputsn(a, l, s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1424 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1425
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1426 void bcf_fmt_array(kstring_t *s, int n, int type, void *data)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1427 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1428 int j = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1429 if (n == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1430 kputc('.', s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1431 return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1432 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1433 if (type == BCF_BT_CHAR)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1434 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1435 char *p = (char*)data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1436 for (j = 0; j < n && *p; ++j, ++p)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1437 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1438 if ( *p==bcf_str_missing ) kputc('.', s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1439 else kputc(*p, s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1440 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1441 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1442 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1443 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1444 #define BRANCH(type_t, is_missing, is_vector_end, kprint) { \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1445 type_t *p = (type_t *) data; \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1446 for (j=0; j<n; j++) \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1447 { \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1448 if ( is_vector_end ) break; \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1449 if ( j ) kputc(',', s); \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1450 if ( is_missing ) kputc('.', s); \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1451 else kprint; \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1452 } \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1453 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1454 switch (type) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1455 case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, kputw(p[j], s)); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1456 case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, kputw(p[j], s)); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1457 case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, kputw(p[j], s)); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1458 case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), ksprintf(s, "%g", p[j])); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1459 default: fprintf(stderr,"todo: type %d\n", type); exit(1); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1460 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1461 #undef BRANCH
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1462 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1463 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1464
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1465 uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1466 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1467 int x, type;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1468 x = bcf_dec_size(ptr, &ptr, &type);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1469 bcf_fmt_array(s, x, type, ptr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1470 return ptr + (x << bcf_type_shift[type]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1471 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1472
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1473 /********************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1474 *** VCF site I/O ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1475 ********************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1476
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
/*
 * Per-FORMAT-tag scratch data used while parsing the sample columns of a
 * VCF line.  Member order and widths are unchanged.
 */
typedef struct {
    int key;            // id of the tag in the header's ID dictionary
    int max_m;          // reset to 0 per tag; NOTE(review): looks like a per-sample maximum count -- confirm
    int size;           // NOTE(review): meaning not visible here -- confirm in the parser
    int offset;         // NOTE(review): meaning not visible here -- confirm in the parser
    uint32_t is_gt:1;   // set when the tag name is "GT"
    uint32_t max_g:15;  // reset to 0 per tag
    uint32_t max_l:16;  // reset to 0 per tag
    uint32_t y;         // NOTE(review): meaning not visible here -- confirm in the parser
    uint8_t *buf;       // NOTE(review): presumably the tag's data buffer -- confirm
} fmt_aux_t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1483
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1484 static inline void align_mem(kstring_t *s)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1485 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1486 if (s->l&7) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1487 uint64_t zero = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1488 int l = ((s->l + 7)>>3<<3) - s->l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1489 kputsn((char*)&zero, l, s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1490 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1491 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1492
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1493 // p,q is the start and the end of the FORMAT field
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1494 int _vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1495 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1496 if ( !bcf_hdr_nsamples(h) ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1497
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1498 char *r, *t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1499 int j, l, m, g;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1500 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1501 ks_tokaux_t aux1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1502 vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1503 kstring_t *mem = (kstring_t*)&h->mem;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1504 mem->l = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1505
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1506 // count the number of format fields
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1507 for (r = p, v->n_fmt = 1; *r; ++r)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1508 if (*r == ':') ++v->n_fmt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1509 char *end = s->s + s->l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1510 if ( q>=end )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1511 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1512 fprintf(stderr,"[%s:%d %s] Error: FORMAT column with no sample columns starting at %s:%d\n", __FILE__,__LINE__,__FUNCTION__,s->s,v->pos+1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1513 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1514 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1515
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1516 fmt_aux_t *fmt = (fmt_aux_t*)alloca(v->n_fmt * sizeof(fmt_aux_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1517 // get format information from the dictionary
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1518 for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1519 *(char*)aux1.p = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1520 k = kh_get(vdict, d, t);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1521 if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1522 fprintf(stderr, "[W::%s] FORMAT '%s' is not defined in the header, assuming Type=String\n", __func__, t);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1523 kstring_t tmp = {0,0,0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1524 int l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1525 ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1526 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1527 free(tmp.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1528 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1529 k = kh_get(vdict, d, t);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1530 v->errcode = BCF_ERR_TAG_UNDEF;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1531 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1532 fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1533 fmt[j].key = kh_val(d, k).id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1534 fmt[j].is_gt = !strcmp(t, "GT");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1535 fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1536 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1537 // compute max
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1538 int n_sample_ori = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1539 r = q + 1; // r: position in the format string
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1540 m = l = g = 1, v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1541 while ( r<end )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1542 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1543 // can we skip some samples?
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1544 if ( h->keep_samples )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1545 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1546 n_sample_ori++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1547 if ( !bit_array_test(h->keep_samples,n_sample_ori) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1548 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1549 while ( *r!='\t' && r<end ) r++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1550 if ( *r=='\t' ) { *r = 0; r++; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1551 continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1552 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1553 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1554
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1555 // collect fmt stats: max vector size, length, number of alleles
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1556 j = 0; // j-th format field
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1557 for (;;)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1558 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1559 if ( *r == '\t' ) *r = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1560 if ( *r == ':' || !*r ) // end of field or end of sample
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1561 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1562 if (fmt[j].max_m < m) fmt[j].max_m = m;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1563 if (fmt[j].max_l < l - 1) fmt[j].max_l = l - 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1564 if (fmt[j].is_gt && fmt[j].max_g < g) fmt[j].max_g = g;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1565 l = 0, m = g = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1566 if ( *r==':' )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1567 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1568 j++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1569 if ( j>=v->n_fmt )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1570 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1571 fprintf(stderr,"Incorrect number of FORMAT fields at %s:%d\n", h->id[BCF_DT_CTG][v->rid].key,v->pos+1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1572 exit(1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1573 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1574 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1575 else break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1576 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1577 else if ( *r== ',' ) m++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1578 else if ( fmt[j].is_gt && (*r == '|' || *r == '/') ) g++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1579 if ( r>=end ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1580 r++; l++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1581 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1582 v->n_sample++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1583 if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1584 r++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1585 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1586
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1587 // allocate memory for arrays
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1588 for (j = 0; j < v->n_fmt; ++j) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1589 fmt_aux_t *f = &fmt[j];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1590 if ( !f->max_m ) f->max_m = 1; // omitted trailing format field
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1591 if ((f->y>>4&0xf) == BCF_HT_STR) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1592 f->size = f->is_gt? f->max_g << 2 : f->max_l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1593 } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1594 f->size = f->max_m << 2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1595 } else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1596 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1597 fprintf(stderr, "[E::%s] the format type %d currently not supported\n", __func__, f->y>>4&0xf);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1598 abort(); // I do not know how to do with Flag in the genotype fields
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1599 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1600 align_mem(mem);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1601 f->offset = mem->l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1602 ks_resize(mem, mem->l + v->n_sample * f->size);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1603 mem->l += v->n_sample * f->size;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1604 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1605 for (j = 0; j < v->n_fmt; ++j)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1606 fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1607 // fill the sample fields; at beginning of the loop, t points to the first char of a format
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1608 n_sample_ori = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1609 t = q + 1; m = 0; // m: sample id
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1610 while ( t<end )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1611 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1612 // can we skip some samples?
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1613 if ( h->keep_samples )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1614 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1615 n_sample_ori++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1616 if ( !bit_array_test(h->keep_samples,n_sample_ori) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1617 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1618 while ( *t && t<end ) t++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1619 t++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1620 continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1621 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1622 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1623 if ( m == bcf_hdr_nsamples(h) ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1624
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1625 j = 0; // j-th format field, m-th sample
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1626 while ( *t )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1627 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1628 fmt_aux_t *z = &fmt[j];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1629 if ((z->y>>4&0xf) == BCF_HT_STR) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1630 if (z->is_gt) { // genotypes
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1631 int32_t is_phased = 0, *x = (int32_t*)(z->buf + z->size * m);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1632 for (l = 0;; ++t) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1633 if (*t == '.') ++t, x[l++] = is_phased;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1634 else x[l++] = (strtol(t, &t, 10) + 1) << 1 | is_phased;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1635 #if THOROUGH_SANITY_CHECKS
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1636 assert( 0 ); // success of strtol,strtod not checked
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1637 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1638 is_phased = (*t == '|');
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1639 if (*t == ':' || *t == 0) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1640 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1641 if ( !l ) x[l++] = 0; // An empty field, insert missing value
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1642 for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1643 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1644 char *x = (char*)z->buf + z->size * m;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1645 for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1646 for (; l < z->size; ++l) x[l] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1647 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1648 } else if ((z->y>>4&0xf) == BCF_HT_INT) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1649 int32_t *x = (int32_t*)(z->buf + z->size * m);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1650 for (l = 0;; ++t) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1651 if (*t == '.') x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1652 else x[l++] = strtol(t, &t, 10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1653 if (*t == ':' || *t == 0) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1654 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1655 if ( !l ) x[l++] = bcf_int32_missing;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1656 for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1657 } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1658 float *x = (float*)(z->buf + z->size * m);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1659 for (l = 0;; ++t) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1660 if (*t == '.' && !isdigit(t[1])) bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1661 else x[l++] = strtod(t, &t);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1662 if (*t == ':' || *t == 0) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1663 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1664 if ( !l ) bcf_float_set_missing(x[l++]); // An empty field, insert missing value
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1665 for (; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1666 } else abort();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1667 if (*t == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1668 for (++j; j < v->n_fmt; ++j) { // fill end-of-vector values
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1669 z = &fmt[j];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1670 if ((z->y>>4&0xf) == BCF_HT_STR) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1671 if (z->is_gt) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1672 int32_t *x = (int32_t*)(z->buf + z->size * m);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1673 x[0] = bcf_int32_missing;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1674 for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1675 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1676 char *x = (char*)z->buf + z->size * m;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1677 if ( z->size ) x[0] = '.';
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1678 for (l = 1; l < z->size; ++l) x[l] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1679 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1680 } else if ((z->y>>4&0xf) == BCF_HT_INT) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1681 int32_t *x = (int32_t*)(z->buf + z->size * m);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1682 x[0] = bcf_int32_missing;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1683 for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1684 } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1685 float *x = (float*)(z->buf + z->size * m);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1686 bcf_float_set_missing(x[0]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1687 for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1688 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1689 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1690 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1691 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1692 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1693 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1694 if (*t == ':') ++j;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1695 t++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1696 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1697 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1698 m++; t++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1699 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1700
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1701 // write individual genotype information
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1702 kstring_t *str = &v->indiv;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1703 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1704 if (v->n_sample > 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1705 for (i = 0; i < v->n_fmt; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1706 fmt_aux_t *z = &fmt[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1707 bcf_enc_int1(str, z->key);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1708 if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1709 bcf_enc_size(str, z->size, BCF_BT_CHAR);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1710 kputsn((char*)z->buf, z->size * v->n_sample, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1711 } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1712 bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1713 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1714 bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1715 kputsn((char*)z->buf, z->size * v->n_sample, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1716 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1717 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1718 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1719
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1720 if ( v->n_sample!=bcf_hdr_nsamples(h) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1721 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1722 fprintf(stderr,"[%s:%d %s] Number of columns at %s:%d does not match the number of samples (%d vs %d).\n",
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1723 __FILE__,__LINE__,__FUNCTION__,bcf_seqname(h,v),v->pos+1, v->n_sample,bcf_hdr_nsamples(h));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1724 v->errcode |= BCF_ERR_NCOLS;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1725 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1726 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1727
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1728 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1729 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1730
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1731 int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1732 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1733 int i = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1734 char *p, *q, *r, *t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1735 kstring_t *str;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1736 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1737 ks_tokaux_t aux;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1738
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1739 bcf_clear1(v);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1740 str = &v->shared;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1741 memset(&aux, 0, sizeof(ks_tokaux_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1742 for (p = kstrtok(s->s, "\t", &aux), i = 0; p; p = kstrtok(0, 0, &aux), ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1743 q = (char*)aux.p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1744 *q = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1745 if (i == 0) { // CHROM
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1746 vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1747 k = kh_get(vdict, d, p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1748 if (k == kh_end(d))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1749 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1750 // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1751 // been already printed, but will enable tools like vcfcheck to proceed.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1752 fprintf(stderr, "[W::%s] contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)\n", __func__, p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1753 kstring_t tmp = {0,0,0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1754 int l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1755 ksprintf(&tmp, "##contig=<ID=%s>", p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1756 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1757 free(tmp.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1758 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1759 k = kh_get(vdict, d, p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1760 v->errcode = BCF_ERR_CTG_UNDEF;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1761 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1762 v->rid = kh_val(d, k).id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1763 } else if (i == 1) { // POS
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1764 v->pos = atoi(p) - 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1765 } else if (i == 2) { // ID
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1766 if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1767 else bcf_enc_size(str, 0, BCF_BT_CHAR);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1768 } else if (i == 3) { // REF
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1769 bcf_enc_vchar(str, q - p, p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1770 v->n_allele = 1, v->rlen = q - p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1771 } else if (i == 4) { // ALT
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1772 if (strcmp(p, ".")) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1773 for (r = t = p;; ++r) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1774 if (*r == ',' || *r == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1775 bcf_enc_vchar(str, r - t, t);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1776 t = r + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1777 ++v->n_allele;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1778 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1779 if (r == q) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1780 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1781 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1782 } else if (i == 5) { // QUAL
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1783 if (strcmp(p, ".")) v->qual = atof(p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1784 else memcpy(&v->qual, &bcf_float_missing, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1785 if ( v->max_unpack && !(v->max_unpack>>1) ) return 0; // BCF_UN_STR
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1786 } else if (i == 6) { // FILTER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1787 if (strcmp(p, ".")) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1788 int32_t *a;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1789 int n_flt = 1, i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1790 ks_tokaux_t aux1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1791 vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1792 // count the number of filters
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1793 if (*(q-1) == ';') *(q-1) = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1794 for (r = p; *r; ++r)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1795 if (*r == ';') ++n_flt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1796 a = (int32_t*)alloca(n_flt * sizeof(int32_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1797 // add filters
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1798 for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1799 *(char*)aux1.p = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1800 k = kh_get(vdict, d, t);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1801 if (k == kh_end(d))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1802 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1803 // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1804 // been already printed, but will enable tools like vcfcheck to proceed.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1805 fprintf(stderr, "[W::%s] FILTER '%s' is not defined in the header\n", __func__, t);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1806 kstring_t tmp = {0,0,0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1807 int l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1808 ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1809 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1810 free(tmp.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1811 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1812 k = kh_get(vdict, d, t);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1813 v->errcode = BCF_ERR_TAG_UNDEF;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1814 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1815 a[i++] = kh_val(d, k).id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1816 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1817 n_flt = i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1818 bcf_enc_vint(str, n_flt, a, -1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1819 } else bcf_enc_vint(str, 0, 0, -1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1820 if ( v->max_unpack && !(v->max_unpack>>2) ) return 0; // BCF_UN_FLT
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1821 } else if (i == 7) { // INFO
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1822 char *key;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1823 vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1824 v->n_info = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1825 if (strcmp(p, ".")) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1826 if (*(q-1) == ';') *(q-1) = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1827 for (r = key = p;; ++r) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1828 int c;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1829 char *val, *end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1830 if (*r != ';' && *r != '=' && *r != 0) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1831 val = end = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1832 c = *r; *r = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1833 if (c == '=') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1834 val = r + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1835 for (end = val; *end != ';' && *end != 0; ++end);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1836 c = *end; *end = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1837 } else end = r;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1838 if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; } // faulty VCF, ";;" in the INFO
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1839 k = kh_get(vdict, d, key);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1840 if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1841 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1842 fprintf(stderr, "[W::%s] INFO '%s' is not defined in the header, assuming Type=String\n", __func__, key);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1843 kstring_t tmp = {0,0,0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1844 int l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1845 ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1846 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1847 free(tmp.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1848 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1849 k = kh_get(vdict, d, key);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1850 v->errcode = BCF_ERR_TAG_UNDEF;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1851 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1852 uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1853 ++v->n_info;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1854 bcf_enc_int1(str, kh_val(d, k).id);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1855 if (val == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1856 bcf_enc_size(str, 0, BCF_BT_NULL);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1857 } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1858 bcf_enc_vchar(str, end - val, val);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1859 } else { // int/float value/array
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1860 int i, n_val;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1861 char *t, *te;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1862 for (t = val, n_val = 1; *t; ++t) // count the number of values
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1863 if (*t == ',') ++n_val;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1864 if ((y>>4&0xf) == BCF_HT_INT) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1865 int32_t *z;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1866 z = (int32_t*)alloca(n_val * sizeof(int32_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1867 for (i = 0, t = val; i < n_val; ++i, ++t)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1868 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1869 z[i] = strtol(t, &te, 10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1870 if ( te==t ) // conversion failed
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1871 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1872 z[i] = bcf_int32_missing;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1873 while ( *te && *te!=',' ) te++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1874 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1875 t = te;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1876 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1877 bcf_enc_vint(str, n_val, z, -1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1878 if (strcmp(key, "END") == 0) v->rlen = z[0] - v->pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1879 } else if ((y>>4&0xf) == BCF_HT_REAL) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1880 float *z;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1881 z = (float*)alloca(n_val * sizeof(float));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1882 for (i = 0, t = val; i < n_val; ++i, ++t)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1883 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1884 z[i] = strtod(t, &te);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1885 if ( te==t ) // conversion failed
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1886 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1887 bcf_float_set_missing(z[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1888 while ( *te && *te!=',' ) te++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1889 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1890 t = te;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1891 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1892 bcf_enc_vfloat(str, n_val, z);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1893 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1894 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1895 if (c == 0) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1896 r = end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1897 key = r + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1898 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1899 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1900 if ( v->max_unpack && !(v->max_unpack>>3) ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1901 } else if (i == 8) // FORMAT
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1902 return _vcf_parse_format(s, h, v, p, q);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1903 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1904 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1905 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1906
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1907 int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1908 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1909 int ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1910 ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1911 if (ret < 0) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1912 return vcf_parse1(&fp->line, h, v);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1913 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1914
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1915 static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1916 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1917 uint8_t *ptr_start = ptr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1918 fmt->id = bcf_dec_typed_int1(ptr, &ptr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1919 fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1920 fmt->size = fmt->n << bcf_type_shift[fmt->type];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1921 fmt->p = ptr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1922 fmt->p_off = ptr - ptr_start;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1923 fmt->p_free = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1924 ptr += n_sample * fmt->size;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1925 fmt->p_len = ptr - fmt->p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1926 return ptr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1927 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1928
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1929 static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1930 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1931 uint8_t *ptr_start = ptr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1932 info->key = bcf_dec_typed_int1(ptr, &ptr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1933 info->len = bcf_dec_size(ptr, &ptr, &info->type);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1934 info->vptr = ptr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1935 info->vptr_off = ptr - ptr_start;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1936 info->vptr_free = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1937 info->v1.i = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1938 if (info->len == 1) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1939 if (info->type == BCF_BT_INT8 || info->type == BCF_BT_CHAR) info->v1.i = *(int8_t*)ptr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1940 else if (info->type == BCF_BT_INT32) info->v1.i = *(int32_t*)ptr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1941 else if (info->type == BCF_BT_FLOAT) info->v1.f = *(float*)ptr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1942 else if (info->type == BCF_BT_INT16) info->v1.i = *(int16_t*)ptr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1943 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1944 ptr += info->len << bcf_type_shift[info->type];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1945 info->vptr_len = ptr - info->vptr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1946 return ptr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1947 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1948
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1949 int bcf_unpack(bcf1_t *b, int which)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1950 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1951 if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1952 uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1953 int *offset, i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1954 bcf_dec_t *d = &b->d;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1955 if (which & BCF_UN_FLT) which |= BCF_UN_STR;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1956 if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1957 if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1958 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1959 kstring_t tmp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1960
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1961 // ID
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1962 tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1963 ptr_ori = ptr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1964 ptr = bcf_fmt_sized_array(&tmp, ptr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1965 b->unpack_size[0] = ptr - ptr_ori;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1966 kputc('\0', &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1967 d->id = tmp.s; d->m_id = tmp.m;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1968
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1969 // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1970 tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1971 offset = (int*)alloca(b->n_allele * sizeof(int));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1972 ptr_ori = ptr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1973 for (i = 0; i < b->n_allele; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1974 offset[i] = tmp.l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1975 ptr = bcf_fmt_sized_array(&tmp, ptr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1976 kputc('\0', &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1977 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1978 b->unpack_size[1] = ptr - ptr_ori;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1979 d->als = tmp.s; d->m_als = tmp.m;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1980
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1981 hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1982 for (i = 0; i < b->n_allele; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1983 d->allele[i] = d->als + offset[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1984 b->unpacked |= BCF_UN_STR;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1985 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1986 if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1987 ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1988 ptr_ori = ptr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1989 if (*ptr>>4) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1990 int type;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1991 d->n_flt = bcf_dec_size(ptr, &ptr, &type);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1992 hts_expand(int, d->n_flt, d->m_flt, d->flt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1993 for (i = 0; i < d->n_flt; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1994 d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1995 } else ++ptr, d->n_flt = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1996 b->unpack_size[2] = ptr - ptr_ori;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1997 b->unpacked |= BCF_UN_FLT;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1998 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1999 if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2000 ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2001 hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2002 for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2003 for (i = 0; i < b->n_info; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2004 ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2005 b->unpacked |= BCF_UN_INFO;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2006 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2007 if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2008 ptr = (uint8_t*)b->indiv.s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2009 hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2010 for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2011 for (i = 0; i < b->n_fmt; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2012 ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2013 b->unpacked |= BCF_UN_FMT;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2014 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2015 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2016 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2017
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2018 int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2019 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2020 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2021 bcf_unpack((bcf1_t*)v, BCF_UN_ALL);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2022 kputs(h->id[BCF_DT_CTG][v->rid].key, s); // CHROM
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2023 kputc('\t', s); kputw(v->pos + 1, s); // POS
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2024 kputc('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2025 kputc('\t', s); // REF
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2026 if (v->n_allele > 0) kputs(v->d.allele[0], s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2027 else kputc('.', s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2028 kputc('\t', s); // ALT
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2029 if (v->n_allele > 1) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2030 for (i = 1; i < v->n_allele; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2031 if (i > 1) kputc(',', s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2032 kputs(v->d.allele[i], s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2033 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2034 } else kputc('.', s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2035 kputc('\t', s); // QUAL
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2036 if ( bcf_float_is_missing(v->qual) ) kputc('.', s); // QUAL
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2037 else ksprintf(s, "%g", v->qual);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2038 kputc('\t', s); // FILTER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2039 if (v->d.n_flt) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2040 for (i = 0; i < v->d.n_flt; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2041 if (i) kputc(';', s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2042 kputs(h->id[BCF_DT_ID][v->d.flt[i]].key, s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2043 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2044 } else kputc('.', s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2045 kputc('\t', s); // INFO
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2046 if (v->n_info) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2047 int first = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2048 for (i = 0; i < v->n_info; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2049 bcf_info_t *z = &v->d.info[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2050 if ( !z->vptr ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2051 if ( !first ) kputc(';', s); first = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2052 kputs(h->id[BCF_DT_ID][z->key].key, s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2053 if (z->len <= 0) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2054 kputc('=', s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2055 if (z->len == 1)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2056 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2057 switch (z->type)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2058 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2059 case BCF_BT_INT8: if ( z->v1.i==bcf_int8_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2060 case BCF_BT_INT16: if ( z->v1.i==bcf_int16_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2061 case BCF_BT_INT32: if ( z->v1.i==bcf_int32_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2062 case BCF_BT_FLOAT: if ( bcf_float_is_missing(z->v1.f) ) kputc('.', s); else ksprintf(s, "%g", z->v1.f); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2063 case BCF_BT_CHAR: kputc(z->v1.i, s); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2064 default: fprintf(stderr,"todo: type %d\n", z->type); exit(1); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2065 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2066 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2067 else bcf_fmt_array(s, z->len, z->type, z->vptr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2068 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2069 if ( first ) kputc('.', s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2070 } else kputc('.', s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2071 // FORMAT and individual information
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2072 if (v->n_sample)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2073 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2074 int i,j;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2075 if ( v->n_fmt)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2076 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2077 int gt_i = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2078 bcf_fmt_t *fmt = v->d.fmt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2079 int first = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2080 for (i = 0; i < (int)v->n_fmt; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2081 if ( !fmt[i].p ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2082 kputc(!first ? ':' : '\t', s); first = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2083 if ( fmt[i].id<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2084 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2085 fprintf(stderr, "[E::%s] invalid BCF, the FORMAT tag id=%d not present in the header.\n", __func__, fmt[i].id);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2086 abort();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2087 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2088 kputs(h->id[BCF_DT_ID][fmt[i].id].key, s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2089 if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2090 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2091 if ( first ) kputs("\t.", s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2092 for (j = 0; j < v->n_sample; ++j) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2093 kputc('\t', s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2094 first = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2095 for (i = 0; i < (int)v->n_fmt; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2096 bcf_fmt_t *f = &fmt[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2097 if ( !f->p ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2098 if (!first) kputc(':', s); first = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2099 if (gt_i == i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2100 bcf_format_gt(f,j,s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2101 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2102 bcf_fmt_array(s, f->n, f->type, f->p + j * f->size);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2103 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2104 if ( first ) kputc('.', s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2105 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2106 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2107 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2108 for (j=0; j<=v->n_sample; j++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2109 kputs("\t.", s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2110 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2111 kputc('\n', s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2112 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2113 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2114
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2115 int vcf_write_line(htsFile *fp, kstring_t *line)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2116 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2117 int ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2118 if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2119 if ( fp->format.compression!=no_compression )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2120 ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2121 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2122 ret = hwrite(fp->fp.hfile, line->s, line->l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2123 return ret==line->l ? 0 : -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2124 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2125
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2126 int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2127 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2128 int ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2129 fp->line.l = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2130 vcf_format1(h, v, &fp->line);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2131 if ( fp->format.compression!=no_compression )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2132 ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2133 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2134 ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2135 return ret==fp->line.l ? 0 : -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2136 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2137
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2138 /************************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2139 * Data access routines *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2140 ************************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2141
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2142 int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2143 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2144 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2145 vdict_t *d = (vdict_t*)h->dict[which];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2146 k = kh_get(vdict, d, id);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2147 return k == kh_end(d)? -1 : kh_val(d, k).id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2148 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2149
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2150
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2151 /********************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2152 *** BCF indexing ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2153 ********************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2154
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2155 hts_idx_t *bcf_index(htsFile *fp, int min_shift)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2156 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2157 int n_lvls, i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2158 bcf1_t *b;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2159 hts_idx_t *idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2160 bcf_hdr_t *h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2161 int64_t max_len = 0, s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2162 h = bcf_hdr_read(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2163 if ( !h ) return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2164 int nids = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2165 for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2166 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2167 if ( !h->id[BCF_DT_CTG][i].val ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2168 if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] ) max_len = h->id[BCF_DT_CTG][i].val->info[0];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2169 nids++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2170 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2171 if ( !max_len ) max_len = ((int64_t)1<<31) - 1; // In case contig line is broken.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2172 max_len += 256;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2173 for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2174 idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2175 b = bcf_init1();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2176 while (bcf_read1(fp,h, b) >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2177 int ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2178 ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2179 if (ret < 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2180 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2181 bcf_destroy1(b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2182 hts_idx_destroy(idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2183 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2184 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2185 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2186 hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2187 bcf_destroy1(b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2188 bcf_hdr_destroy(h);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2189 return idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2190 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2191
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2192 int bcf_index_build(const char *fn, int min_shift)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2193 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2194 htsFile *fp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2195 hts_idx_t *idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2196 if ((fp = hts_open(fn, "rb")) == 0) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2197 if ( fp->format.compression!=bgzf ) { hts_close(fp); return -1; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2198 idx = bcf_index(fp, min_shift);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2199 hts_close(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2200 if ( !idx ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2201 hts_idx_save(idx, fn, HTS_FMT_CSI);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2202 hts_idx_destroy(idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2203 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2204 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2205
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2206 /*****************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2207 *** Utilities ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2208 *****************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2209
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2210 int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2211 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2212 int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2213 for (i=0; i<src->nhrec; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2214 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2215 if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2216 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2217 int j;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2218 for (j=0; j<ndst_ori; j++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2219 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2220 if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2221
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2222 // Checking only the key part of generic lines, otherwise
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2223 // the VCFs are too verbose. Should we perhaps add a flag
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2224 // to bcf_hdr_combine() and make this optional?
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2225 if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2226 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2227 if ( j>=ndst_ori )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2228 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2229 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2230 else if ( src->hrec[i]->type==BCF_HL_STR )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2231 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2232 // NB: we are ignoring fields without ID
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2233 int j = bcf_hrec_find_key(src->hrec[i],"ID");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2234 if ( j>=0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2235 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2236 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2237 if ( !rec )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2238 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2239 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2240 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2241 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2242 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2243 int j = bcf_hrec_find_key(src->hrec[i],"ID");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2244 assert( j>=0 ); // this should always be true for valid VCFs
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2245
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2246 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2247 if ( !rec )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2248 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2249 else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2250 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2251 // Check that both records are of the same type. The bcf_hdr_id2length
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2252 // macro cannot be used here because dst header is not synced yet.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2253 vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2254 vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2255 khint_t k_src = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2256 khint_t k_dst = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2257 if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2258 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2259 fprintf(stderr,"Warning: trying to combine \"%s\" tag definitions of different lengths\n", src->hrec[i]->vals[0]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2260 ret |= 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2261 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2262 if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2263 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2264 fprintf(stderr,"Warning: trying to combine \"%s\" tag definitions of different types\n", src->hrec[i]->vals[0]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2265 ret |= 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2266 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2267 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2268 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2269 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2270 if ( need_sync ) bcf_hdr_sync(dst);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2271 return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2272 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2273 int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2274 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2275 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2276 if ( line->errcode )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2277 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2278 fprintf(stderr,"[%s:%d %s] Unchecked error (%d), exiting.\n", __FILE__,__LINE__,__FUNCTION__,line->errcode);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2279 exit(1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2280 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2281 if ( src_hdr->ntransl==-1 ) return 0; // no need to translate, all tags have the same id
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2282 if ( !src_hdr->ntransl ) // called for the first time, see what needs translating
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2283 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2284 int dict;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2285 for (dict=0; dict<2; dict++) // BCF_DT_ID and BCF_DT_CTG
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2286 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2287 src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2288 for (i=0; i<src_hdr->n[dict]; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2289 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2290 if ( !src_hdr->id[dict][i].key || !dst_hdr->id[dict][i].key ) // gap left after removed BCF header lines
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2291 src_hdr->transl[dict][i] = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2292 else if ( i>=dst_hdr->n[dict] || strcmp(src_hdr->id[dict][i].key,dst_hdr->id[dict][i].key) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2293 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2294 src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2295 src_hdr->ntransl++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2296 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2297 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2298 src_hdr->transl[dict][i] = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2299 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2300 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2301 if ( !src_hdr->ntransl )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2302 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2303 free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2304 free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2305 src_hdr->ntransl = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2306 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2307 if ( src_hdr->ntransl==-1 ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2308 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2309 bcf_unpack(line,BCF_UN_ALL);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2310
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2311 // CHROM
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2312 if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2313
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2314 // FILTER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2315 for (i=0; i<line->d.n_flt; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2316 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2317 int src_id = line->d.flt[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2318 if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2319 line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2320 line->d.shared_dirty |= BCF1_DIRTY_FLT;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2321 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2322
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2323 // INFO
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2324 for (i=0; i<line->n_info; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2325 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2326 int src_id = line->d.info[i].key;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2327 int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2328 if ( dst_id<0 ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2329 int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2330 int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2331 if ( src_size==dst_size ) // can overwrite
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2332 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2333 line->d.info[i].key = dst_id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2334 uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2335 if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2336 else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2337 else { *(uint32_t*)vptr = (uint32_t)dst_id; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2338 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2339 else // must realloc
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2340 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2341 bcf_info_t *info = &line->d.info[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2342 assert( !info->vptr_free );
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2343 kstring_t str = {0,0,0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2344 bcf_enc_int1(&str, dst_id);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2345 bcf_enc_size(&str, info->len,info->type);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2346 info->vptr_off = str.l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2347 kputsn((char*)info->vptr, info->vptr_len, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2348 info->vptr = (uint8_t*)str.s + info->vptr_off;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2349 info->vptr_free = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2350 info->key = dst_id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2351 line->d.shared_dirty |= BCF1_DIRTY_INF;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2352 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2353 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2354
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2355 // FORMAT
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2356 for (i=0; i<line->n_fmt; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2357 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2358 int src_id = line->d.fmt[i].id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2359 int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2360 if ( dst_id<0 ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2361 int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2362 int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2363 if ( src_size==dst_size ) // can overwrite
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2364 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2365 line->d.fmt[i].id = dst_id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2366 uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off; // pointer to the vector size (4bits) and BT type (4bits)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2367 if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2368 else if ( dst_size==BCF_BT_INT16 ) { uint8_t *x = (uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2369 else { uint8_t *x = (uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; p[3] = x[2]; p[4] = x[3]; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2370 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2371 else // must realloc
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2372 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2373 bcf_fmt_t *fmt = &line->d.fmt[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2374 assert( !fmt->p_free );
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2375 kstring_t str = {0,0,0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2376 bcf_enc_int1(&str, dst_id);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2377 bcf_enc_size(&str, fmt->n, fmt->type);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2378 fmt->p_off = str.l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2379 kputsn((char*)fmt->p, fmt->p_len, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2380 fmt->p = (uint8_t*)str.s + fmt->p_off;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2381 fmt->p_free = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2382 fmt->id = dst_id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2383 line->d.indiv_dirty = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2384 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2385 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2386 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2387 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2388
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2389 bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2390 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2391 bcf_hdr_t *hout = bcf_hdr_init("r");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2392 char *htxt = bcf_hdr_fmt_text(hdr, 1, NULL);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2393 bcf_hdr_parse(hout, htxt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2394 free(htxt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2395 return hout;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2396 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2397
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2398 bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2399 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2400 int hlen;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2401 void *names_hash = khash_str2int_init();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2402 char *htxt = bcf_hdr_fmt_text(h0, 1, &hlen);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2403 kstring_t str;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2404 bcf_hdr_t *h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2405 str.l = str.m = 0; str.s = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2406 h = bcf_hdr_init("w");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2407 bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2408 int j;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2409 for (j=0; j<n; j++) imap[j] = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2410 if ( bcf_hdr_nsamples(h0) > 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2411 char *p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2412 int i = 0, end = n? 8 : 7;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2413 while ((p = strstr(htxt, "#CHROM\t")) != 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2414 if (p > htxt && *(p-1) == '\n') break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2415 while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2416 if (i != end) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2417 free(h); free(str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2418 return 0; // malformated header
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2419 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2420 kputsn(htxt, p - htxt, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2421 for (i = 0; i < n; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2422 if ( khash_str2int_has_key(names_hash,samples[i]) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2423 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2424 fprintf(stderr,"[E::bcf_hdr_subset] Duplicate sample name \"%s\".\n", samples[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2425 free(str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2426 free(htxt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2427 khash_str2int_destroy(names_hash);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2428 bcf_hdr_destroy(h);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2429 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2430 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2431 imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2432 if (imap[i] < 0) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2433 kputc('\t', &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2434 kputs(samples[i], &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2435 khash_str2int_inc(names_hash,samples[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2436 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2437 } else kputsn(htxt, hlen, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2438 while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2439 kputc('\n',&str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2440 bcf_hdr_parse(h, str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2441 free(str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2442 free(htxt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2443 khash_str2int_destroy(names_hash);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2444 return h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2445 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2446
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2447 int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2448 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2449 if ( samples && !strcmp("-",samples) ) return 0; // keep all samples
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2450
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2451 hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2452 if ( !samples ) { bcf_hdr_nsamples(hdr) = 0; return 0; } // exclude all samples
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2453
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2454 int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2455 hdr->keep_samples = (uint8_t*) calloc(narr,1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2456 if ( samples[0]=='^' )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2457 for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2458
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2459 int idx, n, ret = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2460 char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2461 if ( !smpls ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2462 for (i=0; i<n; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2463 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2464 idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2465 if ( idx<0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2466 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2467 if ( !ret ) ret = i+1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2468 continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2469 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2470 assert( idx<bcf_hdr_nsamples(hdr) );
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2471 if ( samples[0]=='^' )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2472 bit_array_clear(hdr->keep_samples, idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2473 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2474 bit_array_set(hdr->keep_samples, idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2475 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2476 for (i=0; i<n; i++) free(smpls[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2477 free(smpls);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2478
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2479 bcf_hdr_nsamples(hdr) = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2480 for (i=0; i<hdr->nsamples_ori; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2481 if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2482 if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2483 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2484 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2485 char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2486 idx = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2487 for (i=0; i<hdr->nsamples_ori; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2488 if ( bit_array_test(hdr->keep_samples,i) ) samples[idx++] = strdup(hdr->samples[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2489 free(hdr->samples);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2490 hdr->samples = samples;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2491
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2492 // delete original samples from the dictionary
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2493 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2494 int k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2495 for (k = kh_begin(d); k != kh_end(d); ++k)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2496 if (kh_exist(d, k)) free((char*)kh_key(d, k));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2497 kh_destroy(vdict, d);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2498
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2499 // add the subset back
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2500 hdr->dict[BCF_DT_SAMPLE] = d = kh_init(vdict);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2501 for (i=0; i<bcf_hdr_nsamples(hdr); i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2502 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2503 int ignore, k = kh_put(vdict, d, hdr->samples[i], &ignore);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2504 kh_val(d, k) = bcf_idinfo_def;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2505 kh_val(d, k).id = kh_size(d) - 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2506 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2507 bcf_hdr_sync(hdr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2508 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2509
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2510 return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2511 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2512
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2513 int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2514 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2515 kstring_t ind;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2516 ind.s = 0; ind.l = ind.m = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2517 if (n) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2518 bcf_fmt_t *fmt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2519 int i, j;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2520 fmt = (bcf_fmt_t*)alloca(v->n_fmt * sizeof(bcf_fmt_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2521 uint8_t *ptr = (uint8_t*)v->indiv.s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2522 for (i = 0; i < v->n_fmt; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2523 ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2524 for (i = 0; i < (int)v->n_fmt; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2525 bcf_fmt_t *f = &fmt[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2526 bcf_enc_int1(&ind, f->id);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2527 bcf_enc_size(&ind, f->n, f->type);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2528 for (j = 0; j < n; ++j)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2529 if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2530 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2531 for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2532 v->n_sample = i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2533 } else v->n_sample = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2534 if ( !v->n_sample ) v->n_fmt = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2535 free(v->indiv.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2536 v->indiv = ind;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2537 v->unpacked &= ~BCF_UN_FMT; // only BCF is ready for output, VCF will need to unpack again
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2538 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2539 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2540
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2541 int bcf_is_snp(bcf1_t *v)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2542 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2543 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2544 bcf_unpack(v, BCF_UN_STR);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2545 for (i = 0; i < v->n_allele; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2546 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2547 if ( v->d.allele[i][1]==0 ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2548
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2549 // mpileup's <X> allele, see also below. This is not completely satisfactory,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2550 // a general library is here narrowly tailored to fit samtools.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2551 if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2552
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2553 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2554 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2555 return i == v->n_allele;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2556 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2557
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2558 static void bcf_set_variant_type(const char *ref, const char *alt, variant_t *var)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2559 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2560 // The most frequent case
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2561 if ( !ref[1] && !alt[1] )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2562 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2563 if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2564 if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2565 var->n = 1; var->type = VCF_SNP; return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2566 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2567 if ( alt[0]=='<' )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2568 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2569 if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2570 var->type = VCF_OTHER;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2571 return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2572 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2573
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2574 const char *r = ref, *a = alt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2575 while (*r && *a && *r==*a ) { r++; a++; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2576
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2577 if ( *a && !*r )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2578 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2579 while ( *a ) a++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2580 var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2581 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2582 else if ( *r && !*a )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2583 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2584 while ( *r ) r++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2585 var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2586 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2587 else if ( !*r && !*a )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2588 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2589 var->n = 0; var->type = VCF_REF; return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2590 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2591
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2592 const char *re = r, *ae = a;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2593 while ( re[1] ) re++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2594 while ( ae[1] ) ae++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2595 while ( *re==*ae && re>r && ae>a ) { re--; ae--; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2596 if ( ae==a )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2597 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2598 if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2599 var->n = -(re-r);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2600 if ( *re==*ae ) { var->type = VCF_INDEL; return; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2601 var->type = VCF_OTHER; return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2602 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2603 else if ( re==r )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2604 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2605 var->n = ae-a;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2606 if ( *re==*ae ) { var->type = VCF_INDEL; return; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2607 var->type = VCF_OTHER; return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2608 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2609
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2610 var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2611 var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2612
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2613 // should do also complex events, SVs, etc...
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2614 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2615
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2616 static void bcf_set_variant_types(bcf1_t *b)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2617 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2618 if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2619 bcf_dec_t *d = &b->d;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2620 if ( d->n_var < b->n_allele )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2621 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2622 d->var = (variant_t *) realloc(d->var, sizeof(variant_t)*b->n_allele);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2623 d->n_var = b->n_allele;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2624 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2625 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2626 b->d.var_type = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2627 for (i=1; i<b->n_allele; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2628 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2629 bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2630 b->d.var_type |= d->var[i].type;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2631 //fprintf(stderr,"[set_variant_type] %d %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2632 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2633 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2634
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2635 int bcf_get_variant_types(bcf1_t *rec)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2636 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2637 if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2638 return rec->d.var_type;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2639 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2640 int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2641 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2642 if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2643 return rec->d.var[ith_allele].type;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2644 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2645
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2646 int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2647 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2648 // Is the field already present?
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2649 int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2650 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1; // No such INFO field in the header
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2651 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2652
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2653 for (i=0; i<line->n_info; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2654 if ( inf_id==line->d.info[i].key ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2655 bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2656
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2657 if ( !n || (type==BCF_HT_STR && !values) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2658 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2659 if ( inf )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2660 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2661 // Mark the tag for removal, free existing memory if necessary
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2662 if ( inf->vptr_free )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2663 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2664 free(inf->vptr - inf->vptr_off);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2665 inf->vptr_free = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2666 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2667 line->d.shared_dirty |= BCF1_DIRTY_INF;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2668 inf->vptr = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2669 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2670 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2671 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2672
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2673 // Encode the values and determine the size required to accommodate the values
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2674 kstring_t str = {0,0,0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2675 bcf_enc_int1(&str, inf_id);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2676 if ( type==BCF_HT_INT )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2677 bcf_enc_vint(&str, n, (int32_t*)values, -1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2678 else if ( type==BCF_HT_REAL )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2679 bcf_enc_vfloat(&str, n, (float*)values);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2680 else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2681 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2682 if ( values==NULL )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2683 bcf_enc_size(&str, 0, BCF_BT_NULL);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2684 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2685 bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2686 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2687 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2688 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2689 fprintf(stderr, "[E::%s] the type %d not implemented yet\n", __func__, type);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2690 abort();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2691 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2692
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2693 // Is the INFO tag already present
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2694 if ( inf )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2695 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2696 // Is it big enough to accommodate new block?
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2697 if ( str.l <= inf->vptr_len + inf->vptr_off )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2698 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2699 if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2700 uint8_t *ptr = inf->vptr - inf->vptr_off;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2701 memcpy(ptr, str.s, str.l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2702 free(str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2703 int vptr_free = inf->vptr_free;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2704 bcf_unpack_info_core1(ptr, inf);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2705 inf->vptr_free = vptr_free;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2706 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2707 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2708 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2709 assert( !inf->vptr_free ); // fix the caller or improve here: this has been modified before
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2710 bcf_unpack_info_core1((uint8_t*)str.s, inf);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2711 inf->vptr_free = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2712 line->d.shared_dirty |= BCF1_DIRTY_INF;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2713 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2714 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2715 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2716 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2717 // The tag is not present, create new one
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2718 line->n_info++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2719 hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2720 inf = &line->d.info[line->n_info-1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2721 bcf_unpack_info_core1((uint8_t*)str.s, inf);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2722 inf->vptr_free = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2723 line->d.shared_dirty |= BCF1_DIRTY_INF;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2724 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2725 line->unpacked |= BCF_UN_INFO;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2726 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2727 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2728
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2729 int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2730 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2731 if ( !n )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2732 return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2733
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2734 int i, max_len = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2735 for (i=0; i<n; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2736 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2737 int len = strlen(values[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2738 if ( len > max_len ) max_len = len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2739 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2740 char *out = (char*) malloc(max_len*n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2741 if ( !out ) return -2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2742 for (i=0; i<n; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2743 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2744 char *dst = out+i*max_len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2745 const char *src = values[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2746 int j = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2747 while ( src[j] ) { dst[j] = src[j]; j++; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2748 for (; j<max_len; j++) dst[j] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2749 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2750 int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2751 free(out);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2752 return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2753 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2754
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2755 int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2756 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2757 // Is the field already present?
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2758 int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2759 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2760 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2761 if ( !n ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2762 return -1; // the key not present in the header
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2763 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2764
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2765 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2766
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2767 for (i=0; i<line->n_fmt; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2768 if ( line->d.fmt[i].id==fmt_id ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2769 bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2770
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2771 if ( !n )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2772 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2773 if ( fmt )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2774 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2775 // Mark the tag for removal, free existing memory if necessary
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2776 if ( fmt->p_free )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2777 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2778 free(fmt->p - fmt->p_off);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2779 fmt->p_free = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2780 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2781 line->d.indiv_dirty = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2782 fmt->p = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2783 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2784 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2785 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2786
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2787 line->n_sample = bcf_hdr_nsamples(hdr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2788 int nps = n / line->n_sample; // number of values per sample
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2789 assert( nps && nps*line->n_sample==n ); // must be divisible by n_sample
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2790
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2791 // Encode the values and determine the size required to accommodate the values
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2792 kstring_t str = {0,0,0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2793 bcf_enc_int1(&str, fmt_id);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2794 if ( type==BCF_HT_INT )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2795 bcf_enc_vint(&str, n, (int32_t*)values, nps);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2796 else if ( type==BCF_HT_REAL )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2797 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2798 bcf_enc_size(&str, nps, BCF_BT_FLOAT);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2799 kputsn((char*)values, nps*line->n_sample*sizeof(float), &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2800 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2801 else if ( type==BCF_HT_STR )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2802 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2803 bcf_enc_size(&str, nps, BCF_BT_CHAR);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2804 kputsn((char*)values, nps*line->n_sample, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2805 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2806 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2807 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2808 fprintf(stderr, "[E::%s] the type %d not implemented yet\n", __func__, type);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2809 abort();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2810 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2811
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2812 if ( !fmt )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2813 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2814 // Not present, new format field
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2815 line->n_fmt++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2816 hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2817
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2818 // Special case: VCF specification requires that GT is always first
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2819 if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2820 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2821 for (i=line->n_fmt-1; i>0; i--)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2822 line->d.fmt[i] = line->d.fmt[i-1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2823 fmt = &line->d.fmt[0];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2824 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2825 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2826 fmt = &line->d.fmt[line->n_fmt-1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2827 bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2828 line->d.indiv_dirty = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2829 fmt->p_free = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2830 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2831 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2832 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2833 // The tag is already present, check if it is big enough to accomodate the new block
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2834 if ( str.l <= fmt->p_len + fmt->p_off )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2835 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2836 // good, the block is big enough
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2837 if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2838 uint8_t *ptr = fmt->p - fmt->p_off;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2839 memcpy(ptr, str.s, str.l);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2840 free(str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2841 int p_free = fmt->p_free;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2842 bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2843 fmt->p_free = p_free;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2844 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2845 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2846 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2847 assert( !fmt->p_free ); // fix the caller or improve here: this has been modified before
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2848 bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2849 fmt->p_free = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2850 line->d.indiv_dirty = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2851 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2852 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2853 line->unpacked |= BCF_UN_FMT;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2854 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2855 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2856
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2857
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2858 int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2859 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2860 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2861 line->d.shared_dirty |= BCF1_DIRTY_FLT;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2862 line->d.n_flt = n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2863 if ( !n ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2864 hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2865 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2866 for (i=0; i<n; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2867 line->d.flt[i] = flt_ids[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2868 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2869 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2870
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2871 int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2872 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2873 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2874 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2875 for (i=0; i<line->d.n_flt; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2876 if ( flt_id==line->d.flt[i] ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2877 if ( i<line->d.n_flt ) return 0; // this filter is already set
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2878 line->d.shared_dirty |= BCF1_DIRTY_FLT;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2879 if ( flt_id==0 ) // set to PASS
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2880 line->d.n_flt = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2881 else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2882 line->d.n_flt = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2883 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2884 line->d.n_flt++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2885 hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2886 line->d.flt[line->d.n_flt-1] = flt_id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2887 return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2888 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2889 int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2890 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2891 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2892 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2893 for (i=0; i<line->d.n_flt; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2894 if ( flt_id==line->d.flt[i] ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2895 if ( i==line->d.n_flt ) return 0; // the filter is not present
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2896 line->d.shared_dirty |= BCF1_DIRTY_FLT;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2897 if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2898 line->d.n_flt--;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2899 if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2900 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2901 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2902
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2903 int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2904 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2905 if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2906 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2907 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1; // not defined in the header
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2908
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2909 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2910 if ( id==0 && !line->d.n_flt) return 1; // PASS
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2911
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2912 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2913 for (i=0; i<line->d.n_flt; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2914 if ( line->d.flt[i]==id ) return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2915 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2916 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2917
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2918 static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2919 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2920 line->d.shared_dirty |= BCF1_DIRTY_ALS;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2921
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2922 line->n_allele = nals;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2923 hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2924
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2925 char *als = line->d.als;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2926 int n = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2927 while (n<nals)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2928 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2929 line->d.allele[n] = als;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2930 while ( *als ) als++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2931 als++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2932 n++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2933 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2934 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2935 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2936 int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2937 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2938 kstring_t tmp = {0,0,0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2939 char *free_old = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2940
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2941 // If the supplied alleles are not pointers to line->d.als, the existing block can be reused.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2942 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2943 for (i=0; i<nals; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2944 if ( alleles[i]>=line->d.als && alleles[i]<line->d.als+line->d.m_als ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2945 if ( i==nals )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2946 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2947 // all alleles point elsewhere, reuse the existing block
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2948 tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2949 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2950 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2951 free_old = line->d.als;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2952
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2953 for (i=0; i<nals; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2954 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2955 kputs(alleles[i], &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2956 kputc(0, &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2957 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2958 line->d.als = tmp.s; line->d.m_als = tmp.m;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2959 free(free_old);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2960 return _bcf1_sync_alleles(hdr,line,nals);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2961 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2962
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2963 int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2964 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2965 kstring_t tmp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2966 tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2967 kputs(alleles_string, &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2968 line->d.als = tmp.s; line->d.m_als = tmp.m;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2969
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2970 int nals = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2971 char *t = line->d.als;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2972 while (*t)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2973 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2974 if ( *t==',' ) { *t = 0; nals++; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2975 t++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2976 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2977 return _bcf1_sync_alleles(hdr, line, nals);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2978 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2979
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2980 int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2981 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2982 kstring_t tmp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2983 tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2984 if ( id )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2985 kputs(id, &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2986 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2987 kputs(".", &tmp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2988 line->d.id = tmp.s; line->d.m_id = tmp.m;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2989 line->d.shared_dirty |= BCF1_DIRTY_ID;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2990 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2991 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2992
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2993 bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2994 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2995 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2996 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL; // no such FMT field in the header
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2997 return bcf_get_fmt_id(line, id);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2998 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2999
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3000 bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3001 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3002 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3003 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL; // no such INFO field in the header
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3004 return bcf_get_info_id(line, id);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3005 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3006
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3007 bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3008 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3009 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3010 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3011 for (i=0; i<line->n_fmt; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3012 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3013 if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3014 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3015 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3016 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3017
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3018 bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3019 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3020 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3021 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3022 for (i=0; i<line->n_info; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3023 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3024 if ( line->d.info[i].key==id ) return &line->d.info[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3025 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3026 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3027 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3028
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3029
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3030 int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3031 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3032 int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3033 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3034 if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=type ) return -2; // expected different type
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3035
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3036 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3037
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3038 for (i=0; i<line->n_info; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3039 if ( line->d.info[i].key==tag_id ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3040 if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3; // the tag is not present in this record
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3041 if ( type==BCF_HT_FLAG ) return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3042
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3043 bcf_info_t *info = &line->d.info[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3044 if ( type==BCF_HT_STR )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3045 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3046 if ( *ndst < info->len+1 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3047 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3048 *ndst = info->len + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3049 *dst = realloc(*dst, *ndst);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3050 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3051 memcpy(*dst,info->vptr,info->len);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3052 ((uint8_t*)*dst)[info->len] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3053 return info->len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3054 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3055
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3056 // Make sure the buffer is big enough
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3057 int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3058 if ( *ndst < info->len )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3059 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3060 *ndst = info->len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3061 *dst = realloc(*dst, *ndst * size1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3062 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3063
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3064 if ( info->len == 1 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3065 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3066 if ( info->type==BCF_BT_FLOAT ) *((float*)*dst) = info->v1.f;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3067 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3068 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3069 #define BRANCH(type_t, missing) { \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3070 if ( info->v1.i==missing ) *((int32_t*)*dst) = bcf_int32_missing; \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3071 else *((int32_t*)*dst) = info->v1.i; \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3072 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3073 switch (info->type)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3074 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3075 case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing ); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3076 case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3077 case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3078 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3079 #undef BRANCH
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3080 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3081 return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3082 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3083
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3084 #define BRANCH(type_t, is_missing, is_vector_end, set_missing, out_type_t) { \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3085 out_type_t *tmp = (out_type_t *) *dst; \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3086 type_t *p = (type_t *) info->vptr; \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3087 for (j=0; j<info->len; j++) \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3088 { \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3089 if ( is_vector_end ) return j; \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3090 if ( is_missing ) set_missing; \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3091 else *tmp = p[j]; \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3092 tmp++; \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3093 } \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3094 return j; \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3095 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3096 switch (info->type) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3097 case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, *tmp=bcf_int32_missing, int32_t); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3098 case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, *tmp=bcf_int32_missing, int32_t); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3099 case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, *tmp=bcf_int32_missing, int32_t); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3100 case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), float); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3101 default: fprintf(stderr,"TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type); exit(1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3102 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3103 #undef BRANCH
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3104 return -4; // this can never happen
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3105 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3106
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3107 int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3108 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3109 int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3110 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3111 if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2; // expected different type
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3112
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3113 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3114
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3115 for (i=0; i<line->n_fmt; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3116 if ( line->d.fmt[i].id==tag_id ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3117 if ( i==line->n_fmt ) return -3; // the tag is not present in this record
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3118 bcf_fmt_t *fmt = &line->d.fmt[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3119
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3120 int nsmpl = bcf_hdr_nsamples(hdr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3121 if ( !*dst )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3122 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3123 *dst = (char**) malloc(sizeof(char*)*nsmpl);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3124 if ( !*dst ) return -4; // could not alloc
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3125 (*dst)[0] = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3126 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3127 int n = (fmt->n+1)*nsmpl;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3128 if ( *ndst < n )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3129 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3130 (*dst)[0] = realloc((*dst)[0], n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3131 if ( !(*dst)[0] ) return -4; // could not alloc
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3132 *ndst = n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3133 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3134 for (i=0; i<nsmpl; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3135 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3136 uint8_t *src = fmt->p + i*fmt->n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3137 uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3138 memcpy(tmp,src,fmt->n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3139 tmp[fmt->n] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3140 (*dst)[i] = (char*) tmp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3141 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3142 return n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3143 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3144
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3145 int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3146 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3147 int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3148 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3149 if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3150 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3151 // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3152 if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3153 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3154 else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2; // expected different type
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3155
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3156 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3157
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3158 for (i=0; i<line->n_fmt; i++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3159 if ( line->d.fmt[i].id==tag_id ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3160 if ( i==line->n_fmt ) return -3; // the tag is not present in this record
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3161 bcf_fmt_t *fmt = &line->d.fmt[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3162
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3163 if ( type==BCF_HT_STR )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3164 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3165 int n = fmt->n*bcf_hdr_nsamples(hdr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3166 if ( *ndst < n )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3167 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3168 *dst = realloc(*dst, n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3169 if ( !*dst ) return -4; // could not alloc
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3170 *ndst = n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3171 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3172 memcpy(*dst,fmt->p,n);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3173 return n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3174 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3175
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3176 // Make sure the buffer is big enough
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3177 int nsmpl = bcf_hdr_nsamples(hdr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3178 int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3179 if ( *ndst < fmt->n*nsmpl )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3180 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3181 *ndst = fmt->n*nsmpl;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3182 *dst = realloc(*dst, *ndst*size1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3183 if ( !dst ) return -4; // could not alloc
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3184 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3185
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3186 #define BRANCH(type_t, is_missing, is_vector_end, set_missing, set_vector_end, out_type_t) { \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3187 out_type_t *tmp = (out_type_t *) *dst; \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3188 type_t *p = (type_t*) fmt->p; \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3189 for (i=0; i<nsmpl; i++) \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3190 { \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3191 for (j=0; j<fmt->n; j++) \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3192 { \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3193 if ( is_missing ) set_missing; \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3194 else if ( is_vector_end ) { set_vector_end; break; } \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3195 else *tmp = p[j]; \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3196 tmp++; \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3197 } \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3198 for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3199 p = (type_t *)((char *)p + fmt->size); \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3200 } \
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3201 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3202 switch (fmt->type) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3203 case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3204 case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3205 case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, int32_t); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3206 case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), float); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3207 default: fprintf(stderr,"TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->type); exit(1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3208 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3209 #undef BRANCH
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3210 return nsmpl*fmt->n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3211 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3212