0
|
1 /* vcfutils.c -- allele-related utility functions.
|
|
2
|
|
3 Copyright (C) 2012-2014 Genome Research Ltd.
|
|
4
|
|
5 Author: Petr Danecek <pd3@sanger.ac.uk>
|
|
6
|
|
7 Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
8 of this software and associated documentation files (the "Software"), to deal
|
|
9 in the Software without restriction, including without limitation the rights
|
|
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
11 copies of the Software, and to permit persons to whom the Software is
|
|
12 furnished to do so, subject to the following conditions:
|
|
13
|
|
14 The above copyright notice and this permission notice shall be included in
|
|
15 all copies or substantial portions of the Software.
|
|
16
|
|
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
23 DEALINGS IN THE SOFTWARE. */
|
|
24
|
|
25 #include "htslib/vcfutils.h"
|
|
26
|
|
27 int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
|
|
28 {
|
|
29 int i;
|
|
30 for (i=0; i<line->n_allele; i++) ac[i]=0;
|
|
31
|
|
32 // Use INFO/AC,AN field only when asked
|
|
33 if ( which&BCF_UN_INFO )
|
|
34 {
|
|
35 bcf_unpack(line, BCF_UN_INFO);
|
|
36 int an_id = bcf_hdr_id2int(header, BCF_DT_ID, "AN");
|
|
37 int ac_id = bcf_hdr_id2int(header, BCF_DT_ID, "AC");
|
|
38 int i, an=-1, ac_len=0, ac_type=0;
|
|
39 uint8_t *ac_ptr=NULL;
|
|
40 if ( an_id>=0 && ac_id>=0 )
|
|
41 {
|
|
42 for (i=0; i<line->n_info; i++)
|
|
43 {
|
|
44 bcf_info_t *z = &line->d.info[i];
|
|
45 if ( z->key == an_id ) an = z->v1.i;
|
|
46 else if ( z->key == ac_id ) { ac_ptr = z->vptr; ac_len = z->len; ac_type = z->type; }
|
|
47 }
|
|
48 }
|
|
49 if ( an>=0 && ac_ptr )
|
|
50 {
|
|
51 int nac = 0;
|
|
52 #define BRANCH_INT(type_t) { \
|
|
53 type_t *p = (type_t *) ac_ptr; \
|
|
54 for (i=0; i<ac_len; i++) \
|
|
55 { \
|
|
56 ac[i+1] = p[i]; \
|
|
57 nac += p[i]; \
|
|
58 } \
|
|
59 }
|
|
60 switch (ac_type) {
|
|
61 case BCF_BT_INT8: BRANCH_INT(int8_t); break;
|
|
62 case BCF_BT_INT16: BRANCH_INT(int16_t); break;
|
|
63 case BCF_BT_INT32: BRANCH_INT(int32_t); break;
|
|
64 default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
|
|
65 }
|
|
66 #undef BRANCH_INT
|
|
67 if ( an<nac )
|
|
68 {
|
|
69 fprintf(stderr,"[E::%s] Incorrect AN/AC counts at %s:%d\n", __func__,header->id[BCF_DT_CTG][line->rid].key, line->pos+1);
|
|
70 exit(1);
|
|
71 }
|
|
72 ac[0] = an - nac;
|
|
73 return 1;
|
|
74 }
|
|
75 }
|
|
76
|
|
77 // Split genotype fields only when asked
|
|
78 if ( which&BCF_UN_FMT )
|
|
79 {
|
|
80 int i, gt_id = bcf_hdr_id2int(header,BCF_DT_ID,"GT");
|
|
81 if ( gt_id<0 ) return 0;
|
|
82 bcf_unpack(line, BCF_UN_FMT);
|
|
83 bcf_fmt_t *fmt_gt = NULL;
|
|
84 for (i=0; i<(int)line->n_fmt; i++)
|
|
85 if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
|
|
86 if ( !fmt_gt ) return 0;
|
|
87 #define BRANCH_INT(type_t,vector_end) { \
|
|
88 for (i=0; i<line->n_sample; i++) \
|
|
89 { \
|
|
90 type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \
|
|
91 int ial; \
|
|
92 for (ial=0; ial<fmt_gt->n; ial++) \
|
|
93 { \
|
|
94 if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
|
|
95 if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \
|
|
96 if ( p[ial]>>1 > line->n_allele ) \
|
|
97 { \
|
|
98 fprintf(stderr,"[E::%s] Incorrect allele (\"%d\") in %s at %s:%d\n", __func__,(p[ial]>>1)-1, header->samples[i],header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \
|
|
99 exit(1); \
|
|
100 } \
|
|
101 ac[(p[ial]>>1)-1]++; \
|
|
102 } \
|
|
103 } \
|
|
104 }
|
|
105 switch (fmt_gt->type) {
|
|
106 case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
|
|
107 case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
|
|
108 case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
|
|
109 default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
|
|
110 }
|
|
111 #undef BRANCH_INT
|
|
112 return 1;
|
|
113 }
|
|
114 return 0;
|
|
115 }
|
|
116
|
|
117 int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *_ial, int *_jal)
|
|
118 {
|
|
119 int i, nals = 0, has_ref = 0, has_alt = 0, ial = 0, jal = 0;
|
|
120 #define BRANCH_INT(type_t,vector_end) { \
|
|
121 type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \
|
|
122 for (i=0; i<fmt_ptr->n; i++) \
|
|
123 { \
|
|
124 if ( p[i] == vector_end ) break; /* smaller ploidy */ \
|
|
125 if ( bcf_gt_is_missing(p[i]) ) continue; /* missing allele */ \
|
|
126 int tmp = p[i]>>1; \
|
|
127 if ( tmp>1 ) \
|
|
128 { \
|
|
129 if ( !ial ) { ial = tmp; has_alt = 1; } \
|
|
130 else if ( tmp!=ial ) \
|
|
131 { \
|
|
132 if ( tmp<ial ) \
|
|
133 { \
|
|
134 jal = ial; \
|
|
135 ial = tmp; \
|
|
136 } \
|
|
137 else \
|
|
138 { \
|
|
139 jal = tmp; \
|
|
140 } \
|
|
141 has_alt = 2; \
|
|
142 } \
|
|
143 } \
|
|
144 else has_ref = 1; \
|
|
145 nals++; \
|
|
146 } \
|
|
147 }
|
|
148 switch (fmt_ptr->type) {
|
|
149 case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
|
|
150 case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
|
|
151 case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
|
|
152 default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break;
|
|
153 }
|
|
154 #undef BRANCH_INT
|
|
155
|
|
156 if ( _ial ) *_ial = ial>0 ? ial-1 : ial;
|
|
157 if ( _jal ) *_jal = jal>0 ? jal-1 : jal;
|
|
158 if ( !nals ) return GT_UNKN;
|
|
159 if ( nals==1 )
|
|
160 return has_ref ? GT_HAPL_R : GT_HAPL_A;
|
|
161 if ( !has_ref )
|
|
162 return has_alt==1 ? GT_HOM_AA : GT_HET_AA;
|
|
163 if ( !has_alt )
|
|
164 return GT_HOM_RR;
|
|
165 return GT_HET_RA;
|
|
166 }
|
|
167
|
|
168 int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line)
|
|
169 {
|
|
170 int i;
|
|
171 bcf_fmt_t *gt = bcf_get_fmt(header, line, "GT");
|
|
172 if ( !gt ) return 0;
|
|
173
|
|
174 int *ac = (int*) calloc(line->n_allele,sizeof(int));
|
|
175
|
|
176 // check if all alleles are populated
|
|
177 #define BRANCH(type_t,vector_end) { \
|
|
178 for (i=0; i<line->n_sample; i++) \
|
|
179 { \
|
|
180 type_t *p = (type_t*) (gt->p + i*gt->size); \
|
|
181 int ial; \
|
|
182 for (ial=0; ial<gt->n; ial++) \
|
|
183 { \
|
|
184 if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
|
|
185 if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \
|
|
186 if ( (p[ial]>>1)-1 >= line->n_allele ) { free(ac); return -1; } \
|
|
187 ac[(p[ial]>>1)-1]++; \
|
|
188 } \
|
|
189 } \
|
|
190 }
|
|
191 switch (gt->type) {
|
|
192 case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break;
|
|
193 case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
|
|
194 case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
|
|
195 default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
|
|
196 }
|
|
197 #undef BRANCH
|
|
198
|
|
199 int rm_als = 0, nrm = 0;
|
|
200 for (i=1; i<line->n_allele; i++)
|
|
201 {
|
|
202 if ( !ac[i] ) { rm_als |= 1<<i; nrm++; }
|
|
203 }
|
|
204 free(ac);
|
|
205
|
|
206 if ( nrm ) bcf_remove_alleles(header, line, rm_als);
|
|
207 return nrm;
|
|
208 }
|
|
209
|
|
210 void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask)
|
|
211 {
|
|
212 int *map = (int*) calloc(line->n_allele, sizeof(int));
|
|
213
|
|
214 // create map of indexes from old to new ALT numbering and modify ALT
|
|
215 kstring_t str = {0,0,0};
|
|
216 kputs(line->d.allele[0], &str);
|
|
217
|
|
218 int nrm = 0, i,j; // i: ori alleles, j: new alleles
|
|
219 for (i=1, j=1; i<line->n_allele; i++)
|
|
220 {
|
|
221 if ( rm_mask & 1<<i )
|
|
222 {
|
|
223 // remove this allele
|
|
224 line->d.allele[i] = NULL;
|
|
225 nrm++;
|
|
226 continue;
|
|
227 }
|
|
228 kputc(',', &str);
|
|
229 kputs(line->d.allele[i], &str);
|
|
230 map[i] = j;
|
|
231 j++;
|
|
232 }
|
|
233 if ( !nrm ) { free(map); free(str.s); return; }
|
|
234
|
|
235 int nR_ori = line->n_allele;
|
|
236 int nR_new = line->n_allele-nrm;
|
|
237 assert(nR_new > 0); // should not be able to remove reference allele
|
|
238 int nA_ori = nR_ori-1;
|
|
239 int nA_new = nR_new-1;
|
|
240
|
|
241 int nG_ori = nR_ori*(nR_ori + 1)/2;
|
|
242 int nG_new = nR_new*(nR_new + 1)/2;
|
|
243
|
|
244 bcf_update_alleles_str(header, line, str.s);
|
|
245
|
|
246 // remove from Number=G, Number=R and Number=A INFO fields.
|
|
247 uint8_t *dat = NULL;
|
|
248 int mdat = 0, ndat = 0, mdat_bytes = 0, nret;
|
|
249 for (i=0; i<line->n_info; i++)
|
|
250 {
|
|
251 bcf_info_t *info = &line->d.info[i];
|
|
252 int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key);
|
|
253
|
|
254 if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change
|
|
255
|
|
256 int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key);
|
|
257 if ( type==BCF_HT_FLAG ) continue;
|
|
258 int size = 1;
|
|
259 if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;
|
|
260
|
|
261 mdat = mdat_bytes / size;
|
|
262 nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type);
|
|
263 mdat_bytes = mdat * size;
|
|
264 if ( nret<0 )
|
|
265 {
|
|
266 fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
|
|
267 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
|
|
268 exit(1);
|
|
269 }
|
|
270 if ( type==BCF_HT_STR )
|
|
271 {
|
|
272 str.l = 0;
|
|
273 char *ss = (char*) dat, *se = (char*) dat;
|
|
274 if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
|
|
275 {
|
|
276 int nexp, inc = 0;
|
|
277 if ( vlen==BCF_VL_A )
|
|
278 {
|
|
279 nexp = nA_ori;
|
|
280 inc = 1;
|
|
281 }
|
|
282 else
|
|
283 nexp = nR_ori;
|
|
284 for (j=0; j<nexp; j++)
|
|
285 {
|
|
286 if ( !*se ) break;
|
|
287 while ( *se && *se!=',' ) se++;
|
|
288 if ( rm_mask & 1<<(j+inc) )
|
|
289 {
|
|
290 if ( *se ) se++;
|
|
291 ss = se;
|
|
292 continue;
|
|
293 }
|
|
294 if ( str.l ) kputc(',',&str);
|
|
295 kputsn(ss,se-ss,&str);
|
|
296 if ( *se ) se++;
|
|
297 ss = se;
|
|
298 }
|
|
299 assert( j==nexp );
|
|
300 }
|
|
301 else // Number=G, assuming diploid genotype
|
|
302 {
|
|
303 int k = 0, n = 0;
|
|
304 for (j=0; j<nR_ori; j++)
|
|
305 {
|
|
306 for (k=0; k<=j; k++)
|
|
307 {
|
|
308 if ( !*se ) break;
|
|
309 while ( *se && *se!=',' ) se++;
|
|
310 n++;
|
|
311 if ( rm_mask & 1<<j || rm_mask & 1<<k )
|
|
312 {
|
|
313 if ( *se ) se++;
|
|
314 ss = se;
|
|
315 continue;
|
|
316 }
|
|
317 if ( str.l ) kputc(',',&str);
|
|
318 kputsn(ss,se-ss,&str);
|
|
319 if ( *se ) se++;
|
|
320 ss = se;
|
|
321 }
|
|
322 if ( !*se ) break;
|
|
323 }
|
|
324 assert( n=nG_ori );
|
|
325 }
|
|
326
|
|
327 nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type);
|
|
328 if ( nret<0 )
|
|
329 {
|
|
330 fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
|
|
331 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
|
|
332 exit(1);
|
|
333 }
|
|
334 continue;
|
|
335 }
|
|
336
|
|
337 if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
|
|
338 {
|
|
339 int inc = 0, ntop;
|
|
340 if ( vlen==BCF_VL_A )
|
|
341 {
|
|
342 assert( nret==nA_ori );
|
|
343 ntop = nA_ori;
|
|
344 ndat = nA_new;
|
|
345 inc = 1;
|
|
346 }
|
|
347 else
|
|
348 {
|
|
349 assert( nret==nR_ori );
|
|
350 ntop = nR_ori;
|
|
351 ndat = nR_new;
|
|
352 }
|
|
353 int k = 0;
|
|
354
|
|
355 #define BRANCH(type_t,is_vector_end) \
|
|
356 { \
|
|
357 type_t *ptr = (type_t*) dat; \
|
|
358 int size = sizeof(type_t); \
|
|
359 for (j=0; j<ntop; j++) /* j:ori, k:new */ \
|
|
360 { \
|
|
361 if ( is_vector_end ) { memcpy(dat+k*size, dat+j*size, size); break; } \
|
|
362 if ( rm_mask & 1<<(j+inc) ) continue; \
|
|
363 if ( j!=k ) memcpy(dat+k*size, dat+j*size, size); \
|
|
364 k++; \
|
|
365 } \
|
|
366 }
|
|
367 switch (type)
|
|
368 {
|
|
369 case BCF_HT_INT: BRANCH(int32_t,ptr[j]==bcf_int32_vector_end); break;
|
|
370 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[j])); break;
|
|
371 }
|
|
372 #undef BRANCH
|
|
373 }
|
|
374 else // Number=G
|
|
375 {
|
|
376 assert( nret==nG_ori );
|
|
377 int k, l_ori = -1, l_new = 0;
|
|
378 ndat = nG_new;
|
|
379
|
|
380 #define BRANCH(type_t,is_vector_end) \
|
|
381 { \
|
|
382 type_t *ptr = (type_t*) dat; \
|
|
383 int size = sizeof(type_t); \
|
|
384 for (j=0; j<nR_ori; j++) \
|
|
385 { \
|
|
386 for (k=0; k<=j; k++) \
|
|
387 { \
|
|
388 l_ori++; \
|
|
389 if ( is_vector_end ) { memcpy(dat+l_new*size, dat+l_ori*size, size); break; } \
|
|
390 if ( rm_mask & 1<<j || rm_mask & 1<<k ) continue; \
|
|
391 if ( l_ori!=l_new ) memcpy(dat+l_new*size, dat+l_ori*size, size); \
|
|
392 l_new++; \
|
|
393 } \
|
|
394 } \
|
|
395 }
|
|
396 switch (type)
|
|
397 {
|
|
398 case BCF_HT_INT: BRANCH(int32_t,ptr[l_ori]==bcf_int32_vector_end); break;
|
|
399 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[l_ori])); break;
|
|
400 }
|
|
401 #undef BRANCH
|
|
402 }
|
|
403
|
|
404 nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type);
|
|
405 if ( nret<0 )
|
|
406 {
|
|
407 fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
|
|
408 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
|
|
409 exit(1);
|
|
410 }
|
|
411 }
|
|
412
|
|
413 // Update GT fields, the allele indexes might have changed
|
|
414 for (i=1; i<line->n_allele; i++) if ( map[i]!=i ) break;
|
|
415 if ( i<line->n_allele )
|
|
416 {
|
|
417 mdat = mdat_bytes / 4; // sizeof(int32_t)
|
|
418 nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat);
|
|
419 mdat_bytes = mdat * 4;
|
|
420 if ( nret>0 )
|
|
421 {
|
|
422 nret /= line->n_sample;
|
|
423 int32_t *ptr = (int32_t*) dat;
|
|
424 for (i=0; i<line->n_sample; i++)
|
|
425 {
|
|
426 for (j=0; j<nret; j++)
|
|
427 {
|
|
428 if ( bcf_gt_is_missing(ptr[j]) ) continue;
|
|
429 if ( ptr[j]==bcf_int32_vector_end ) break;
|
|
430 int al = bcf_gt_allele(ptr[j]);
|
|
431 assert( al<nR_ori && map[al]>=0 );
|
|
432 ptr[j] = (map[al]+1)<<1 | (ptr[j]&1);
|
|
433 }
|
|
434 ptr += nret;
|
|
435 }
|
|
436 bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample);
|
|
437 }
|
|
438 }
|
|
439
|
|
440 // Remove from Number=G, Number=R and Number=A FORMAT fields.
|
|
441 // Assuming haploid or diploid GTs
|
|
442 for (i=0; i<line->n_fmt; i++)
|
|
443 {
|
|
444 bcf_fmt_t *fmt = &line->d.fmt[i];
|
|
445 int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id);
|
|
446
|
|
447 if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change
|
|
448
|
|
449 int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id);
|
|
450 if ( type==BCF_HT_FLAG ) continue;
|
|
451
|
|
452 int size = 1;
|
|
453 if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;
|
|
454
|
|
455 mdat = mdat_bytes / size;
|
|
456 nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type);
|
|
457 mdat_bytes = mdat * size;
|
|
458 if ( nret<0 )
|
|
459 {
|
|
460 fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
|
|
461 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
|
|
462 exit(1);
|
|
463 }
|
|
464
|
|
465 if ( type==BCF_HT_STR )
|
|
466 {
|
|
467 int size = nret/line->n_sample; // number of bytes per sample
|
|
468 str.l = 0;
|
|
469 if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
|
|
470 {
|
|
471 int nexp, inc = 0;
|
|
472 if ( vlen==BCF_VL_A )
|
|
473 {
|
|
474 nexp = nA_ori;
|
|
475 inc = 1;
|
|
476 }
|
|
477 else
|
|
478 nexp = nR_ori;
|
|
479 for (j=0; j<line->n_sample; j++)
|
|
480 {
|
|
481 char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
|
|
482 int k_src = 0, k_dst = 0, l = str.l;
|
|
483 for (k_src=0; k_src<nexp; k_src++)
|
|
484 {
|
|
485 if ( ptr>=se || !*ptr) break;
|
|
486 while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
|
|
487 if ( rm_mask & 1<<(k_src+inc) )
|
|
488 {
|
|
489 ss = ++ptr;
|
|
490 continue;
|
|
491 }
|
|
492 if ( k_dst ) kputc(',',&str);
|
|
493 kputsn(ss,ptr-ss,&str);
|
|
494 ss = ++ptr;
|
|
495 k_dst++;
|
|
496 }
|
|
497 assert( k_src==nexp );
|
|
498 l = str.l - l;
|
|
499 for (; l<size; l++) kputc(0, &str);
|
|
500 }
|
|
501 }
|
|
502 else // Number=G, diploid or haploid
|
|
503 {
|
|
504 for (j=0; j<line->n_sample; j++)
|
|
505 {
|
|
506 char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
|
|
507 int k_src = 0, k_dst = 0, l = str.l;
|
|
508 int nexp = 0; // diploid or haploid?
|
|
509 while ( ptr<se )
|
|
510 {
|
|
511 if ( !*ptr ) break;
|
|
512 if ( *ptr==',' ) nexp++;
|
|
513 ptr++;
|
|
514 }
|
|
515 if ( ptr!=ss ) nexp++;
|
|
516 assert( nexp==nG_ori || nexp==nR_ori );
|
|
517 ptr = ss;
|
|
518 if ( nexp==nG_ori ) // diploid
|
|
519 {
|
|
520 int ia, ib;
|
|
521 for (ia=0; ia<nR_ori; ia++)
|
|
522 {
|
|
523 for (ib=0; ib<=ia; ib++)
|
|
524 {
|
|
525 if ( ptr>=se || !*ptr ) break;
|
|
526 while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
|
|
527 if ( rm_mask & 1<<ia || rm_mask & 1<<ib )
|
|
528 {
|
|
529 ss = ++ptr;
|
|
530 continue;
|
|
531 }
|
|
532 if ( k_dst ) kputc(',',&str);
|
|
533 kputsn(ss,ptr-ss,&str);
|
|
534 ss = ++ptr;
|
|
535 k_dst++;
|
|
536 }
|
|
537 if ( ptr>=se || !*ptr ) break;
|
|
538 }
|
|
539 }
|
|
540 else // haploid
|
|
541 {
|
|
542 for (k_src=0; k_src<nR_ori; k_src++)
|
|
543 {
|
|
544 if ( ptr>=se || !*ptr ) break;
|
|
545 while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
|
|
546 if ( rm_mask & 1<<k_src )
|
|
547 {
|
|
548 ss = ++ptr;
|
|
549 continue;
|
|
550 }
|
|
551 if ( k_dst ) kputc(',',&str);
|
|
552 kputsn(ss,ptr-ss,&str);
|
|
553 ss = ++ptr;
|
|
554 k_dst++;
|
|
555 }
|
|
556 assert( k_src==nR_ori );
|
|
557 l = str.l - l;
|
|
558 for (; l<size; l++) kputc(0, &str);
|
|
559 }
|
|
560 }
|
|
561 }
|
|
562 nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type);
|
|
563 if ( nret<0 )
|
|
564 {
|
|
565 fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
|
|
566 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
|
|
567 exit(1);
|
|
568 }
|
|
569 continue;
|
|
570 }
|
|
571
|
|
572 int nori = nret / line->n_sample;
|
|
573 if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G
|
|
574 {
|
|
575 int inc = 0, nnew;
|
|
576 if ( vlen==BCF_VL_A )
|
|
577 {
|
|
578 assert( nori==nA_ori ); // todo: will fail if all values are missing
|
|
579 ndat = nA_new*line->n_sample;
|
|
580 nnew = nA_new;
|
|
581 inc = 1;
|
|
582 }
|
|
583 else
|
|
584 {
|
|
585 assert( nori==nR_ori ); // todo: will fail if all values are missing
|
|
586 ndat = nR_new*line->n_sample;
|
|
587 nnew = nR_new;
|
|
588 }
|
|
589
|
|
590 #define BRANCH(type_t,is_vector_end) \
|
|
591 { \
|
|
592 for (j=0; j<line->n_sample; j++) \
|
|
593 { \
|
|
594 type_t *ptr_src = ((type_t*)dat) + j*nori; \
|
|
595 type_t *ptr_dst = ((type_t*)dat) + j*nnew; \
|
|
596 int size = sizeof(type_t); \
|
|
597 int k_src, k_dst = 0; \
|
|
598 for (k_src=0; k_src<nori; k_src++) \
|
|
599 { \
|
|
600 if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \
|
|
601 if ( rm_mask & 1<<(k_src+inc) ) continue; \
|
|
602 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
|
|
603 k_dst++; \
|
|
604 } \
|
|
605 } \
|
|
606 }
|
|
607 switch (type)
|
|
608 {
|
|
609 case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
|
|
610 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
|
|
611 }
|
|
612 #undef BRANCH
|
|
613 }
|
|
614 else // Number=G, diploid or mixture of haploid+diploid
|
|
615 {
|
|
616 assert( nori==nG_ori );
|
|
617 ndat = nG_new*line->n_sample;
|
|
618
|
|
619 #define BRANCH(type_t,is_vector_end) \
|
|
620 { \
|
|
621 for (j=0; j<line->n_sample; j++) \
|
|
622 { \
|
|
623 type_t *ptr_src = ((type_t*)dat) + j*nori; \
|
|
624 type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \
|
|
625 int size = sizeof(type_t); \
|
|
626 int ia, ib, k_dst = 0, k_src; \
|
|
627 int nset = 0; /* haploid or diploid? */ \
|
|
628 for (k_src=0; k_src<nG_ori; k_src++) { if ( is_vector_end ) break; nset++; } \
|
|
629 if ( nset==nR_ori ) /* haploid */ \
|
|
630 { \
|
|
631 for (k_src=0; k_src<nR_ori; k_src++) \
|
|
632 { \
|
|
633 if ( rm_mask & 1<<k_src ) continue; \
|
|
634 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
|
|
635 k_dst++; \
|
|
636 } \
|
|
637 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
|
|
638 } \
|
|
639 else /* diploid */ \
|
|
640 { \
|
|
641 k_src = -1; \
|
|
642 for (ia=0; ia<nR_ori; ia++) \
|
|
643 { \
|
|
644 for (ib=0; ib<=ia; ib++) \
|
|
645 { \
|
|
646 k_src++; \
|
|
647 if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; } \
|
|
648 if ( rm_mask & 1<<ia || rm_mask & 1<<ib ) continue; \
|
|
649 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
|
|
650 k_dst++; \
|
|
651 } \
|
|
652 } \
|
|
653 } \
|
|
654 } \
|
|
655 }
|
|
656 switch (type)
|
|
657 {
|
|
658 case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
|
|
659 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
|
|
660 }
|
|
661 #undef BRANCH
|
|
662 }
|
|
663 nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type);
|
|
664 if ( nret<0 )
|
|
665 {
|
|
666 fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
|
|
667 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
|
|
668 exit(1);
|
|
669 }
|
|
670 }
|
|
671 free(dat);
|
|
672 free(str.s);
|
|
673 free(map);
|
|
674 }
|
|
675
|