comparison ezBAMQC/src/htslib/vcfutils.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /* vcfutils.c -- allele-related utility functions.
2
3 Copyright (C) 2012-2014 Genome Research Ltd.
4
5 Author: Petr Danecek <pd3@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE. */
24
25 #include "htslib/vcfutils.h"
26
27 int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
28 {
29 int i;
30 for (i=0; i<line->n_allele; i++) ac[i]=0;
31
32 // Use INFO/AC,AN field only when asked
33 if ( which&BCF_UN_INFO )
34 {
35 bcf_unpack(line, BCF_UN_INFO);
36 int an_id = bcf_hdr_id2int(header, BCF_DT_ID, "AN");
37 int ac_id = bcf_hdr_id2int(header, BCF_DT_ID, "AC");
38 int i, an=-1, ac_len=0, ac_type=0;
39 uint8_t *ac_ptr=NULL;
40 if ( an_id>=0 && ac_id>=0 )
41 {
42 for (i=0; i<line->n_info; i++)
43 {
44 bcf_info_t *z = &line->d.info[i];
45 if ( z->key == an_id ) an = z->v1.i;
46 else if ( z->key == ac_id ) { ac_ptr = z->vptr; ac_len = z->len; ac_type = z->type; }
47 }
48 }
49 if ( an>=0 && ac_ptr )
50 {
51 int nac = 0;
52 #define BRANCH_INT(type_t) { \
53 type_t *p = (type_t *) ac_ptr; \
54 for (i=0; i<ac_len; i++) \
55 { \
56 ac[i+1] = p[i]; \
57 nac += p[i]; \
58 } \
59 }
60 switch (ac_type) {
61 case BCF_BT_INT8: BRANCH_INT(int8_t); break;
62 case BCF_BT_INT16: BRANCH_INT(int16_t); break;
63 case BCF_BT_INT32: BRANCH_INT(int32_t); break;
64 default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
65 }
66 #undef BRANCH_INT
67 if ( an<nac )
68 {
69 fprintf(stderr,"[E::%s] Incorrect AN/AC counts at %s:%d\n", __func__,header->id[BCF_DT_CTG][line->rid].key, line->pos+1);
70 exit(1);
71 }
72 ac[0] = an - nac;
73 return 1;
74 }
75 }
76
77 // Split genotype fields only when asked
78 if ( which&BCF_UN_FMT )
79 {
80 int i, gt_id = bcf_hdr_id2int(header,BCF_DT_ID,"GT");
81 if ( gt_id<0 ) return 0;
82 bcf_unpack(line, BCF_UN_FMT);
83 bcf_fmt_t *fmt_gt = NULL;
84 for (i=0; i<(int)line->n_fmt; i++)
85 if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
86 if ( !fmt_gt ) return 0;
87 #define BRANCH_INT(type_t,vector_end) { \
88 for (i=0; i<line->n_sample; i++) \
89 { \
90 type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \
91 int ial; \
92 for (ial=0; ial<fmt_gt->n; ial++) \
93 { \
94 if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
95 if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \
96 if ( p[ial]>>1 > line->n_allele ) \
97 { \
98 fprintf(stderr,"[E::%s] Incorrect allele (\"%d\") in %s at %s:%d\n", __func__,(p[ial]>>1)-1, header->samples[i],header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \
99 exit(1); \
100 } \
101 ac[(p[ial]>>1)-1]++; \
102 } \
103 } \
104 }
105 switch (fmt_gt->type) {
106 case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
107 case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
108 case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
109 default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
110 }
111 #undef BRANCH_INT
112 return 1;
113 }
114 return 0;
115 }
116
117 int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *_ial, int *_jal)
118 {
119 int i, nals = 0, has_ref = 0, has_alt = 0, ial = 0, jal = 0;
120 #define BRANCH_INT(type_t,vector_end) { \
121 type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \
122 for (i=0; i<fmt_ptr->n; i++) \
123 { \
124 if ( p[i] == vector_end ) break; /* smaller ploidy */ \
125 if ( bcf_gt_is_missing(p[i]) ) continue; /* missing allele */ \
126 int tmp = p[i]>>1; \
127 if ( tmp>1 ) \
128 { \
129 if ( !ial ) { ial = tmp; has_alt = 1; } \
130 else if ( tmp!=ial ) \
131 { \
132 if ( tmp<ial ) \
133 { \
134 jal = ial; \
135 ial = tmp; \
136 } \
137 else \
138 { \
139 jal = tmp; \
140 } \
141 has_alt = 2; \
142 } \
143 } \
144 else has_ref = 1; \
145 nals++; \
146 } \
147 }
148 switch (fmt_ptr->type) {
149 case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
150 case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
151 case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
152 default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break;
153 }
154 #undef BRANCH_INT
155
156 if ( _ial ) *_ial = ial>0 ? ial-1 : ial;
157 if ( _jal ) *_jal = jal>0 ? jal-1 : jal;
158 if ( !nals ) return GT_UNKN;
159 if ( nals==1 )
160 return has_ref ? GT_HAPL_R : GT_HAPL_A;
161 if ( !has_ref )
162 return has_alt==1 ? GT_HOM_AA : GT_HET_AA;
163 if ( !has_alt )
164 return GT_HOM_RR;
165 return GT_HET_RA;
166 }
167
168 int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line)
169 {
170 int i;
171 bcf_fmt_t *gt = bcf_get_fmt(header, line, "GT");
172 if ( !gt ) return 0;
173
174 int *ac = (int*) calloc(line->n_allele,sizeof(int));
175
176 // check if all alleles are populated
177 #define BRANCH(type_t,vector_end) { \
178 for (i=0; i<line->n_sample; i++) \
179 { \
180 type_t *p = (type_t*) (gt->p + i*gt->size); \
181 int ial; \
182 for (ial=0; ial<gt->n; ial++) \
183 { \
184 if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
185 if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \
186 if ( (p[ial]>>1)-1 >= line->n_allele ) { free(ac); return -1; } \
187 ac[(p[ial]>>1)-1]++; \
188 } \
189 } \
190 }
191 switch (gt->type) {
192 case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break;
193 case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
194 case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
195 default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
196 }
197 #undef BRANCH
198
199 int rm_als = 0, nrm = 0;
200 for (i=1; i<line->n_allele; i++)
201 {
202 if ( !ac[i] ) { rm_als |= 1<<i; nrm++; }
203 }
204 free(ac);
205
206 if ( nrm ) bcf_remove_alleles(header, line, rm_als);
207 return nrm;
208 }
209
210 void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask)
211 {
212 int *map = (int*) calloc(line->n_allele, sizeof(int));
213
214 // create map of indexes from old to new ALT numbering and modify ALT
215 kstring_t str = {0,0,0};
216 kputs(line->d.allele[0], &str);
217
218 int nrm = 0, i,j; // i: ori alleles, j: new alleles
219 for (i=1, j=1; i<line->n_allele; i++)
220 {
221 if ( rm_mask & 1<<i )
222 {
223 // remove this allele
224 line->d.allele[i] = NULL;
225 nrm++;
226 continue;
227 }
228 kputc(',', &str);
229 kputs(line->d.allele[i], &str);
230 map[i] = j;
231 j++;
232 }
233 if ( !nrm ) { free(map); free(str.s); return; }
234
235 int nR_ori = line->n_allele;
236 int nR_new = line->n_allele-nrm;
237 assert(nR_new > 0); // should not be able to remove reference allele
238 int nA_ori = nR_ori-1;
239 int nA_new = nR_new-1;
240
241 int nG_ori = nR_ori*(nR_ori + 1)/2;
242 int nG_new = nR_new*(nR_new + 1)/2;
243
244 bcf_update_alleles_str(header, line, str.s);
245
246 // remove from Number=G, Number=R and Number=A INFO fields.
247 uint8_t *dat = NULL;
248 int mdat = 0, ndat = 0, mdat_bytes = 0, nret;
249 for (i=0; i<line->n_info; i++)
250 {
251 bcf_info_t *info = &line->d.info[i];
252 int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key);
253
254 if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change
255
256 int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key);
257 if ( type==BCF_HT_FLAG ) continue;
258 int size = 1;
259 if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;
260
261 mdat = mdat_bytes / size;
262 nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type);
263 mdat_bytes = mdat * size;
264 if ( nret<0 )
265 {
266 fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
267 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
268 exit(1);
269 }
270 if ( type==BCF_HT_STR )
271 {
272 str.l = 0;
273 char *ss = (char*) dat, *se = (char*) dat;
274 if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
275 {
276 int nexp, inc = 0;
277 if ( vlen==BCF_VL_A )
278 {
279 nexp = nA_ori;
280 inc = 1;
281 }
282 else
283 nexp = nR_ori;
284 for (j=0; j<nexp; j++)
285 {
286 if ( !*se ) break;
287 while ( *se && *se!=',' ) se++;
288 if ( rm_mask & 1<<(j+inc) )
289 {
290 if ( *se ) se++;
291 ss = se;
292 continue;
293 }
294 if ( str.l ) kputc(',',&str);
295 kputsn(ss,se-ss,&str);
296 if ( *se ) se++;
297 ss = se;
298 }
299 assert( j==nexp );
300 }
301 else // Number=G, assuming diploid genotype
302 {
303 int k = 0, n = 0;
304 for (j=0; j<nR_ori; j++)
305 {
306 for (k=0; k<=j; k++)
307 {
308 if ( !*se ) break;
309 while ( *se && *se!=',' ) se++;
310 n++;
311 if ( rm_mask & 1<<j || rm_mask & 1<<k )
312 {
313 if ( *se ) se++;
314 ss = se;
315 continue;
316 }
317 if ( str.l ) kputc(',',&str);
318 kputsn(ss,se-ss,&str);
319 if ( *se ) se++;
320 ss = se;
321 }
322 if ( !*se ) break;
323 }
324 assert( n=nG_ori );
325 }
326
327 nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type);
328 if ( nret<0 )
329 {
330 fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
331 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
332 exit(1);
333 }
334 continue;
335 }
336
337 if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
338 {
339 int inc = 0, ntop;
340 if ( vlen==BCF_VL_A )
341 {
342 assert( nret==nA_ori );
343 ntop = nA_ori;
344 ndat = nA_new;
345 inc = 1;
346 }
347 else
348 {
349 assert( nret==nR_ori );
350 ntop = nR_ori;
351 ndat = nR_new;
352 }
353 int k = 0;
354
355 #define BRANCH(type_t,is_vector_end) \
356 { \
357 type_t *ptr = (type_t*) dat; \
358 int size = sizeof(type_t); \
359 for (j=0; j<ntop; j++) /* j:ori, k:new */ \
360 { \
361 if ( is_vector_end ) { memcpy(dat+k*size, dat+j*size, size); break; } \
362 if ( rm_mask & 1<<(j+inc) ) continue; \
363 if ( j!=k ) memcpy(dat+k*size, dat+j*size, size); \
364 k++; \
365 } \
366 }
367 switch (type)
368 {
369 case BCF_HT_INT: BRANCH(int32_t,ptr[j]==bcf_int32_vector_end); break;
370 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[j])); break;
371 }
372 #undef BRANCH
373 }
374 else // Number=G
375 {
376 assert( nret==nG_ori );
377 int k, l_ori = -1, l_new = 0;
378 ndat = nG_new;
379
380 #define BRANCH(type_t,is_vector_end) \
381 { \
382 type_t *ptr = (type_t*) dat; \
383 int size = sizeof(type_t); \
384 for (j=0; j<nR_ori; j++) \
385 { \
386 for (k=0; k<=j; k++) \
387 { \
388 l_ori++; \
389 if ( is_vector_end ) { memcpy(dat+l_new*size, dat+l_ori*size, size); break; } \
390 if ( rm_mask & 1<<j || rm_mask & 1<<k ) continue; \
391 if ( l_ori!=l_new ) memcpy(dat+l_new*size, dat+l_ori*size, size); \
392 l_new++; \
393 } \
394 } \
395 }
396 switch (type)
397 {
398 case BCF_HT_INT: BRANCH(int32_t,ptr[l_ori]==bcf_int32_vector_end); break;
399 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[l_ori])); break;
400 }
401 #undef BRANCH
402 }
403
404 nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type);
405 if ( nret<0 )
406 {
407 fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
408 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
409 exit(1);
410 }
411 }
412
413 // Update GT fields, the allele indexes might have changed
414 for (i=1; i<line->n_allele; i++) if ( map[i]!=i ) break;
415 if ( i<line->n_allele )
416 {
417 mdat = mdat_bytes / 4; // sizeof(int32_t)
418 nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat);
419 mdat_bytes = mdat * 4;
420 if ( nret>0 )
421 {
422 nret /= line->n_sample;
423 int32_t *ptr = (int32_t*) dat;
424 for (i=0; i<line->n_sample; i++)
425 {
426 for (j=0; j<nret; j++)
427 {
428 if ( bcf_gt_is_missing(ptr[j]) ) continue;
429 if ( ptr[j]==bcf_int32_vector_end ) break;
430 int al = bcf_gt_allele(ptr[j]);
431 assert( al<nR_ori && map[al]>=0 );
432 ptr[j] = (map[al]+1)<<1 | (ptr[j]&1);
433 }
434 ptr += nret;
435 }
436 bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample);
437 }
438 }
439
440 // Remove from Number=G, Number=R and Number=A FORMAT fields.
441 // Assuming haploid or diploid GTs
442 for (i=0; i<line->n_fmt; i++)
443 {
444 bcf_fmt_t *fmt = &line->d.fmt[i];
445 int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id);
446
447 if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change
448
449 int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id);
450 if ( type==BCF_HT_FLAG ) continue;
451
452 int size = 1;
453 if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;
454
455 mdat = mdat_bytes / size;
456 nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type);
457 mdat_bytes = mdat * size;
458 if ( nret<0 )
459 {
460 fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
461 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
462 exit(1);
463 }
464
465 if ( type==BCF_HT_STR )
466 {
467 int size = nret/line->n_sample; // number of bytes per sample
468 str.l = 0;
469 if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
470 {
471 int nexp, inc = 0;
472 if ( vlen==BCF_VL_A )
473 {
474 nexp = nA_ori;
475 inc = 1;
476 }
477 else
478 nexp = nR_ori;
479 for (j=0; j<line->n_sample; j++)
480 {
481 char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
482 int k_src = 0, k_dst = 0, l = str.l;
483 for (k_src=0; k_src<nexp; k_src++)
484 {
485 if ( ptr>=se || !*ptr) break;
486 while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
487 if ( rm_mask & 1<<(k_src+inc) )
488 {
489 ss = ++ptr;
490 continue;
491 }
492 if ( k_dst ) kputc(',',&str);
493 kputsn(ss,ptr-ss,&str);
494 ss = ++ptr;
495 k_dst++;
496 }
497 assert( k_src==nexp );
498 l = str.l - l;
499 for (; l<size; l++) kputc(0, &str);
500 }
501 }
502 else // Number=G, diploid or haploid
503 {
504 for (j=0; j<line->n_sample; j++)
505 {
506 char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
507 int k_src = 0, k_dst = 0, l = str.l;
508 int nexp = 0; // diploid or haploid?
509 while ( ptr<se )
510 {
511 if ( !*ptr ) break;
512 if ( *ptr==',' ) nexp++;
513 ptr++;
514 }
515 if ( ptr!=ss ) nexp++;
516 assert( nexp==nG_ori || nexp==nR_ori );
517 ptr = ss;
518 if ( nexp==nG_ori ) // diploid
519 {
520 int ia, ib;
521 for (ia=0; ia<nR_ori; ia++)
522 {
523 for (ib=0; ib<=ia; ib++)
524 {
525 if ( ptr>=se || !*ptr ) break;
526 while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
527 if ( rm_mask & 1<<ia || rm_mask & 1<<ib )
528 {
529 ss = ++ptr;
530 continue;
531 }
532 if ( k_dst ) kputc(',',&str);
533 kputsn(ss,ptr-ss,&str);
534 ss = ++ptr;
535 k_dst++;
536 }
537 if ( ptr>=se || !*ptr ) break;
538 }
539 }
540 else // haploid
541 {
542 for (k_src=0; k_src<nR_ori; k_src++)
543 {
544 if ( ptr>=se || !*ptr ) break;
545 while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
546 if ( rm_mask & 1<<k_src )
547 {
548 ss = ++ptr;
549 continue;
550 }
551 if ( k_dst ) kputc(',',&str);
552 kputsn(ss,ptr-ss,&str);
553 ss = ++ptr;
554 k_dst++;
555 }
556 assert( k_src==nR_ori );
557 l = str.l - l;
558 for (; l<size; l++) kputc(0, &str);
559 }
560 }
561 }
562 nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type);
563 if ( nret<0 )
564 {
565 fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
566 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
567 exit(1);
568 }
569 continue;
570 }
571
572 int nori = nret / line->n_sample;
573 if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G
574 {
575 int inc = 0, nnew;
576 if ( vlen==BCF_VL_A )
577 {
578 assert( nori==nA_ori ); // todo: will fail if all values are missing
579 ndat = nA_new*line->n_sample;
580 nnew = nA_new;
581 inc = 1;
582 }
583 else
584 {
585 assert( nori==nR_ori ); // todo: will fail if all values are missing
586 ndat = nR_new*line->n_sample;
587 nnew = nR_new;
588 }
589
590 #define BRANCH(type_t,is_vector_end) \
591 { \
592 for (j=0; j<line->n_sample; j++) \
593 { \
594 type_t *ptr_src = ((type_t*)dat) + j*nori; \
595 type_t *ptr_dst = ((type_t*)dat) + j*nnew; \
596 int size = sizeof(type_t); \
597 int k_src, k_dst = 0; \
598 for (k_src=0; k_src<nori; k_src++) \
599 { \
600 if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \
601 if ( rm_mask & 1<<(k_src+inc) ) continue; \
602 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
603 k_dst++; \
604 } \
605 } \
606 }
607 switch (type)
608 {
609 case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
610 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
611 }
612 #undef BRANCH
613 }
614 else // Number=G, diploid or mixture of haploid+diploid
615 {
616 assert( nori==nG_ori );
617 ndat = nG_new*line->n_sample;
618
619 #define BRANCH(type_t,is_vector_end) \
620 { \
621 for (j=0; j<line->n_sample; j++) \
622 { \
623 type_t *ptr_src = ((type_t*)dat) + j*nori; \
624 type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \
625 int size = sizeof(type_t); \
626 int ia, ib, k_dst = 0, k_src; \
627 int nset = 0; /* haploid or diploid? */ \
628 for (k_src=0; k_src<nG_ori; k_src++) { if ( is_vector_end ) break; nset++; } \
629 if ( nset==nR_ori ) /* haploid */ \
630 { \
631 for (k_src=0; k_src<nR_ori; k_src++) \
632 { \
633 if ( rm_mask & 1<<k_src ) continue; \
634 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
635 k_dst++; \
636 } \
637 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
638 } \
639 else /* diploid */ \
640 { \
641 k_src = -1; \
642 for (ia=0; ia<nR_ori; ia++) \
643 { \
644 for (ib=0; ib<=ia; ib++) \
645 { \
646 k_src++; \
647 if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; } \
648 if ( rm_mask & 1<<ia || rm_mask & 1<<ib ) continue; \
649 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
650 k_dst++; \
651 } \
652 } \
653 } \
654 } \
655 }
656 switch (type)
657 {
658 case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
659 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
660 }
661 #undef BRANCH
662 }
663 nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type);
664 if ( nret<0 )
665 {
666 fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
667 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
668 exit(1);
669 }
670 }
671 free(dat);
672 free(str.s);
673 free(map);
674 }
675