Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/vcfutils.c @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfa3745e5fd8 |
---|---|
1 /* vcfutils.c -- allele-related utility functions. | |
2 | |
3 Copyright (C) 2012-2014 Genome Research Ltd. | |
4 | |
5 Author: Petr Danecek <pd3@sanger.ac.uk> | |
6 | |
7 Permission is hereby granted, free of charge, to any person obtaining a copy | |
8 of this software and associated documentation files (the "Software"), to deal | |
9 in the Software without restriction, including without limitation the rights | |
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
11 copies of the Software, and to permit persons to whom the Software is | |
12 furnished to do so, subject to the following conditions: | |
13 | |
14 The above copyright notice and this permission notice shall be included in | |
15 all copies or substantial portions of the Software. | |
16 | |
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
23 DEALINGS IN THE SOFTWARE. */ | |
24 | |
25 #include "htslib/vcfutils.h" | |
26 | |
27 int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) | |
28 { | |
29 int i; | |
30 for (i=0; i<line->n_allele; i++) ac[i]=0; | |
31 | |
32 // Use INFO/AC,AN field only when asked | |
33 if ( which&BCF_UN_INFO ) | |
34 { | |
35 bcf_unpack(line, BCF_UN_INFO); | |
36 int an_id = bcf_hdr_id2int(header, BCF_DT_ID, "AN"); | |
37 int ac_id = bcf_hdr_id2int(header, BCF_DT_ID, "AC"); | |
38 int i, an=-1, ac_len=0, ac_type=0; | |
39 uint8_t *ac_ptr=NULL; | |
40 if ( an_id>=0 && ac_id>=0 ) | |
41 { | |
42 for (i=0; i<line->n_info; i++) | |
43 { | |
44 bcf_info_t *z = &line->d.info[i]; | |
45 if ( z->key == an_id ) an = z->v1.i; | |
46 else if ( z->key == ac_id ) { ac_ptr = z->vptr; ac_len = z->len; ac_type = z->type; } | |
47 } | |
48 } | |
49 if ( an>=0 && ac_ptr ) | |
50 { | |
51 int nac = 0; | |
52 #define BRANCH_INT(type_t) { \ | |
53 type_t *p = (type_t *) ac_ptr; \ | |
54 for (i=0; i<ac_len; i++) \ | |
55 { \ | |
56 ac[i+1] = p[i]; \ | |
57 nac += p[i]; \ | |
58 } \ | |
59 } | |
60 switch (ac_type) { | |
61 case BCF_BT_INT8: BRANCH_INT(int8_t); break; | |
62 case BCF_BT_INT16: BRANCH_INT(int16_t); break; | |
63 case BCF_BT_INT32: BRANCH_INT(int32_t); break; | |
64 default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; | |
65 } | |
66 #undef BRANCH_INT | |
67 if ( an<nac ) | |
68 { | |
69 fprintf(stderr,"[E::%s] Incorrect AN/AC counts at %s:%d\n", __func__,header->id[BCF_DT_CTG][line->rid].key, line->pos+1); | |
70 exit(1); | |
71 } | |
72 ac[0] = an - nac; | |
73 return 1; | |
74 } | |
75 } | |
76 | |
77 // Split genotype fields only when asked | |
78 if ( which&BCF_UN_FMT ) | |
79 { | |
80 int i, gt_id = bcf_hdr_id2int(header,BCF_DT_ID,"GT"); | |
81 if ( gt_id<0 ) return 0; | |
82 bcf_unpack(line, BCF_UN_FMT); | |
83 bcf_fmt_t *fmt_gt = NULL; | |
84 for (i=0; i<(int)line->n_fmt; i++) | |
85 if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; } | |
86 if ( !fmt_gt ) return 0; | |
87 #define BRANCH_INT(type_t,vector_end) { \ | |
88 for (i=0; i<line->n_sample; i++) \ | |
89 { \ | |
90 type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \ | |
91 int ial; \ | |
92 for (ial=0; ial<fmt_gt->n; ial++) \ | |
93 { \ | |
94 if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ | |
95 if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \ | |
96 if ( p[ial]>>1 > line->n_allele ) \ | |
97 { \ | |
98 fprintf(stderr,"[E::%s] Incorrect allele (\"%d\") in %s at %s:%d\n", __func__,(p[ial]>>1)-1, header->samples[i],header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ | |
99 exit(1); \ | |
100 } \ | |
101 ac[(p[ial]>>1)-1]++; \ | |
102 } \ | |
103 } \ | |
104 } | |
105 switch (fmt_gt->type) { | |
106 case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; | |
107 case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; | |
108 case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; | |
109 default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; | |
110 } | |
111 #undef BRANCH_INT | |
112 return 1; | |
113 } | |
114 return 0; | |
115 } | |
116 | |
117 int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *_ial, int *_jal) | |
118 { | |
119 int i, nals = 0, has_ref = 0, has_alt = 0, ial = 0, jal = 0; | |
120 #define BRANCH_INT(type_t,vector_end) { \ | |
121 type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \ | |
122 for (i=0; i<fmt_ptr->n; i++) \ | |
123 { \ | |
124 if ( p[i] == vector_end ) break; /* smaller ploidy */ \ | |
125 if ( bcf_gt_is_missing(p[i]) ) continue; /* missing allele */ \ | |
126 int tmp = p[i]>>1; \ | |
127 if ( tmp>1 ) \ | |
128 { \ | |
129 if ( !ial ) { ial = tmp; has_alt = 1; } \ | |
130 else if ( tmp!=ial ) \ | |
131 { \ | |
132 if ( tmp<ial ) \ | |
133 { \ | |
134 jal = ial; \ | |
135 ial = tmp; \ | |
136 } \ | |
137 else \ | |
138 { \ | |
139 jal = tmp; \ | |
140 } \ | |
141 has_alt = 2; \ | |
142 } \ | |
143 } \ | |
144 else has_ref = 1; \ | |
145 nals++; \ | |
146 } \ | |
147 } | |
148 switch (fmt_ptr->type) { | |
149 case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; | |
150 case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; | |
151 case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; | |
152 default: fprintf(stderr, "[E::%s] todo: fmt_type %d\n", __func__, fmt_ptr->type); exit(1); break; | |
153 } | |
154 #undef BRANCH_INT | |
155 | |
156 if ( _ial ) *_ial = ial>0 ? ial-1 : ial; | |
157 if ( _jal ) *_jal = jal>0 ? jal-1 : jal; | |
158 if ( !nals ) return GT_UNKN; | |
159 if ( nals==1 ) | |
160 return has_ref ? GT_HAPL_R : GT_HAPL_A; | |
161 if ( !has_ref ) | |
162 return has_alt==1 ? GT_HOM_AA : GT_HET_AA; | |
163 if ( !has_alt ) | |
164 return GT_HOM_RR; | |
165 return GT_HET_RA; | |
166 } | |
167 | |
168 int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) | |
169 { | |
170 int i; | |
171 bcf_fmt_t *gt = bcf_get_fmt(header, line, "GT"); | |
172 if ( !gt ) return 0; | |
173 | |
174 int *ac = (int*) calloc(line->n_allele,sizeof(int)); | |
175 | |
176 // check if all alleles are populated | |
177 #define BRANCH(type_t,vector_end) { \ | |
178 for (i=0; i<line->n_sample; i++) \ | |
179 { \ | |
180 type_t *p = (type_t*) (gt->p + i*gt->size); \ | |
181 int ial; \ | |
182 for (ial=0; ial<gt->n; ial++) \ | |
183 { \ | |
184 if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ | |
185 if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \ | |
186 if ( (p[ial]>>1)-1 >= line->n_allele ) { free(ac); return -1; } \ | |
187 ac[(p[ial]>>1)-1]++; \ | |
188 } \ | |
189 } \ | |
190 } | |
191 switch (gt->type) { | |
192 case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break; | |
193 case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break; | |
194 case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break; | |
195 default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; | |
196 } | |
197 #undef BRANCH | |
198 | |
199 int rm_als = 0, nrm = 0; | |
200 for (i=1; i<line->n_allele; i++) | |
201 { | |
202 if ( !ac[i] ) { rm_als |= 1<<i; nrm++; } | |
203 } | |
204 free(ac); | |
205 | |
206 if ( nrm ) bcf_remove_alleles(header, line, rm_als); | |
207 return nrm; | |
208 } | |
209 | |
210 void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask) | |
211 { | |
212 int *map = (int*) calloc(line->n_allele, sizeof(int)); | |
213 | |
214 // create map of indexes from old to new ALT numbering and modify ALT | |
215 kstring_t str = {0,0,0}; | |
216 kputs(line->d.allele[0], &str); | |
217 | |
218 int nrm = 0, i,j; // i: ori alleles, j: new alleles | |
219 for (i=1, j=1; i<line->n_allele; i++) | |
220 { | |
221 if ( rm_mask & 1<<i ) | |
222 { | |
223 // remove this allele | |
224 line->d.allele[i] = NULL; | |
225 nrm++; | |
226 continue; | |
227 } | |
228 kputc(',', &str); | |
229 kputs(line->d.allele[i], &str); | |
230 map[i] = j; | |
231 j++; | |
232 } | |
233 if ( !nrm ) { free(map); free(str.s); return; } | |
234 | |
235 int nR_ori = line->n_allele; | |
236 int nR_new = line->n_allele-nrm; | |
237 assert(nR_new > 0); // should not be able to remove reference allele | |
238 int nA_ori = nR_ori-1; | |
239 int nA_new = nR_new-1; | |
240 | |
241 int nG_ori = nR_ori*(nR_ori + 1)/2; | |
242 int nG_new = nR_new*(nR_new + 1)/2; | |
243 | |
244 bcf_update_alleles_str(header, line, str.s); | |
245 | |
246 // remove from Number=G, Number=R and Number=A INFO fields. | |
247 uint8_t *dat = NULL; | |
248 int mdat = 0, ndat = 0, mdat_bytes = 0, nret; | |
249 for (i=0; i<line->n_info; i++) | |
250 { | |
251 bcf_info_t *info = &line->d.info[i]; | |
252 int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key); | |
253 | |
254 if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change | |
255 | |
256 int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key); | |
257 if ( type==BCF_HT_FLAG ) continue; | |
258 int size = 1; | |
259 if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4; | |
260 | |
261 mdat = mdat_bytes / size; | |
262 nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type); | |
263 mdat_bytes = mdat * size; | |
264 if ( nret<0 ) | |
265 { | |
266 fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, | |
267 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); | |
268 exit(1); | |
269 } | |
270 if ( type==BCF_HT_STR ) | |
271 { | |
272 str.l = 0; | |
273 char *ss = (char*) dat, *se = (char*) dat; | |
274 if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) | |
275 { | |
276 int nexp, inc = 0; | |
277 if ( vlen==BCF_VL_A ) | |
278 { | |
279 nexp = nA_ori; | |
280 inc = 1; | |
281 } | |
282 else | |
283 nexp = nR_ori; | |
284 for (j=0; j<nexp; j++) | |
285 { | |
286 if ( !*se ) break; | |
287 while ( *se && *se!=',' ) se++; | |
288 if ( rm_mask & 1<<(j+inc) ) | |
289 { | |
290 if ( *se ) se++; | |
291 ss = se; | |
292 continue; | |
293 } | |
294 if ( str.l ) kputc(',',&str); | |
295 kputsn(ss,se-ss,&str); | |
296 if ( *se ) se++; | |
297 ss = se; | |
298 } | |
299 assert( j==nexp ); | |
300 } | |
301 else // Number=G, assuming diploid genotype | |
302 { | |
303 int k = 0, n = 0; | |
304 for (j=0; j<nR_ori; j++) | |
305 { | |
306 for (k=0; k<=j; k++) | |
307 { | |
308 if ( !*se ) break; | |
309 while ( *se && *se!=',' ) se++; | |
310 n++; | |
311 if ( rm_mask & 1<<j || rm_mask & 1<<k ) | |
312 { | |
313 if ( *se ) se++; | |
314 ss = se; | |
315 continue; | |
316 } | |
317 if ( str.l ) kputc(',',&str); | |
318 kputsn(ss,se-ss,&str); | |
319 if ( *se ) se++; | |
320 ss = se; | |
321 } | |
322 if ( !*se ) break; | |
323 } | |
324 assert( n=nG_ori ); | |
325 } | |
326 | |
327 nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type); | |
328 if ( nret<0 ) | |
329 { | |
330 fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, | |
331 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); | |
332 exit(1); | |
333 } | |
334 continue; | |
335 } | |
336 | |
337 if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) | |
338 { | |
339 int inc = 0, ntop; | |
340 if ( vlen==BCF_VL_A ) | |
341 { | |
342 assert( nret==nA_ori ); | |
343 ntop = nA_ori; | |
344 ndat = nA_new; | |
345 inc = 1; | |
346 } | |
347 else | |
348 { | |
349 assert( nret==nR_ori ); | |
350 ntop = nR_ori; | |
351 ndat = nR_new; | |
352 } | |
353 int k = 0; | |
354 | |
355 #define BRANCH(type_t,is_vector_end) \ | |
356 { \ | |
357 type_t *ptr = (type_t*) dat; \ | |
358 int size = sizeof(type_t); \ | |
359 for (j=0; j<ntop; j++) /* j:ori, k:new */ \ | |
360 { \ | |
361 if ( is_vector_end ) { memcpy(dat+k*size, dat+j*size, size); break; } \ | |
362 if ( rm_mask & 1<<(j+inc) ) continue; \ | |
363 if ( j!=k ) memcpy(dat+k*size, dat+j*size, size); \ | |
364 k++; \ | |
365 } \ | |
366 } | |
367 switch (type) | |
368 { | |
369 case BCF_HT_INT: BRANCH(int32_t,ptr[j]==bcf_int32_vector_end); break; | |
370 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[j])); break; | |
371 } | |
372 #undef BRANCH | |
373 } | |
374 else // Number=G | |
375 { | |
376 assert( nret==nG_ori ); | |
377 int k, l_ori = -1, l_new = 0; | |
378 ndat = nG_new; | |
379 | |
380 #define BRANCH(type_t,is_vector_end) \ | |
381 { \ | |
382 type_t *ptr = (type_t*) dat; \ | |
383 int size = sizeof(type_t); \ | |
384 for (j=0; j<nR_ori; j++) \ | |
385 { \ | |
386 for (k=0; k<=j; k++) \ | |
387 { \ | |
388 l_ori++; \ | |
389 if ( is_vector_end ) { memcpy(dat+l_new*size, dat+l_ori*size, size); break; } \ | |
390 if ( rm_mask & 1<<j || rm_mask & 1<<k ) continue; \ | |
391 if ( l_ori!=l_new ) memcpy(dat+l_new*size, dat+l_ori*size, size); \ | |
392 l_new++; \ | |
393 } \ | |
394 } \ | |
395 } | |
396 switch (type) | |
397 { | |
398 case BCF_HT_INT: BRANCH(int32_t,ptr[l_ori]==bcf_int32_vector_end); break; | |
399 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[l_ori])); break; | |
400 } | |
401 #undef BRANCH | |
402 } | |
403 | |
404 nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type); | |
405 if ( nret<0 ) | |
406 { | |
407 fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, | |
408 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); | |
409 exit(1); | |
410 } | |
411 } | |
412 | |
413 // Update GT fields, the allele indexes might have changed | |
414 for (i=1; i<line->n_allele; i++) if ( map[i]!=i ) break; | |
415 if ( i<line->n_allele ) | |
416 { | |
417 mdat = mdat_bytes / 4; // sizeof(int32_t) | |
418 nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat); | |
419 mdat_bytes = mdat * 4; | |
420 if ( nret>0 ) | |
421 { | |
422 nret /= line->n_sample; | |
423 int32_t *ptr = (int32_t*) dat; | |
424 for (i=0; i<line->n_sample; i++) | |
425 { | |
426 for (j=0; j<nret; j++) | |
427 { | |
428 if ( bcf_gt_is_missing(ptr[j]) ) continue; | |
429 if ( ptr[j]==bcf_int32_vector_end ) break; | |
430 int al = bcf_gt_allele(ptr[j]); | |
431 assert( al<nR_ori && map[al]>=0 ); | |
432 ptr[j] = (map[al]+1)<<1 | (ptr[j]&1); | |
433 } | |
434 ptr += nret; | |
435 } | |
436 bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample); | |
437 } | |
438 } | |
439 | |
440 // Remove from Number=G, Number=R and Number=A FORMAT fields. | |
441 // Assuming haploid or diploid GTs | |
442 for (i=0; i<line->n_fmt; i++) | |
443 { | |
444 bcf_fmt_t *fmt = &line->d.fmt[i]; | |
445 int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id); | |
446 | |
447 if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change | |
448 | |
449 int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id); | |
450 if ( type==BCF_HT_FLAG ) continue; | |
451 | |
452 int size = 1; | |
453 if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4; | |
454 | |
455 mdat = mdat_bytes / size; | |
456 nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type); | |
457 mdat_bytes = mdat * size; | |
458 if ( nret<0 ) | |
459 { | |
460 fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, | |
461 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); | |
462 exit(1); | |
463 } | |
464 | |
465 if ( type==BCF_HT_STR ) | |
466 { | |
467 int size = nret/line->n_sample; // number of bytes per sample | |
468 str.l = 0; | |
469 if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) | |
470 { | |
471 int nexp, inc = 0; | |
472 if ( vlen==BCF_VL_A ) | |
473 { | |
474 nexp = nA_ori; | |
475 inc = 1; | |
476 } | |
477 else | |
478 nexp = nR_ori; | |
479 for (j=0; j<line->n_sample; j++) | |
480 { | |
481 char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss; | |
482 int k_src = 0, k_dst = 0, l = str.l; | |
483 for (k_src=0; k_src<nexp; k_src++) | |
484 { | |
485 if ( ptr>=se || !*ptr) break; | |
486 while ( ptr<se && *ptr && *ptr!=',' ) ptr++; | |
487 if ( rm_mask & 1<<(k_src+inc) ) | |
488 { | |
489 ss = ++ptr; | |
490 continue; | |
491 } | |
492 if ( k_dst ) kputc(',',&str); | |
493 kputsn(ss,ptr-ss,&str); | |
494 ss = ++ptr; | |
495 k_dst++; | |
496 } | |
497 assert( k_src==nexp ); | |
498 l = str.l - l; | |
499 for (; l<size; l++) kputc(0, &str); | |
500 } | |
501 } | |
502 else // Number=G, diploid or haploid | |
503 { | |
504 for (j=0; j<line->n_sample; j++) | |
505 { | |
506 char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss; | |
507 int k_src = 0, k_dst = 0, l = str.l; | |
508 int nexp = 0; // diploid or haploid? | |
509 while ( ptr<se ) | |
510 { | |
511 if ( !*ptr ) break; | |
512 if ( *ptr==',' ) nexp++; | |
513 ptr++; | |
514 } | |
515 if ( ptr!=ss ) nexp++; | |
516 assert( nexp==nG_ori || nexp==nR_ori ); | |
517 ptr = ss; | |
518 if ( nexp==nG_ori ) // diploid | |
519 { | |
520 int ia, ib; | |
521 for (ia=0; ia<nR_ori; ia++) | |
522 { | |
523 for (ib=0; ib<=ia; ib++) | |
524 { | |
525 if ( ptr>=se || !*ptr ) break; | |
526 while ( ptr<se && *ptr && *ptr!=',' ) ptr++; | |
527 if ( rm_mask & 1<<ia || rm_mask & 1<<ib ) | |
528 { | |
529 ss = ++ptr; | |
530 continue; | |
531 } | |
532 if ( k_dst ) kputc(',',&str); | |
533 kputsn(ss,ptr-ss,&str); | |
534 ss = ++ptr; | |
535 k_dst++; | |
536 } | |
537 if ( ptr>=se || !*ptr ) break; | |
538 } | |
539 } | |
540 else // haploid | |
541 { | |
542 for (k_src=0; k_src<nR_ori; k_src++) | |
543 { | |
544 if ( ptr>=se || !*ptr ) break; | |
545 while ( ptr<se && *ptr && *ptr!=',' ) ptr++; | |
546 if ( rm_mask & 1<<k_src ) | |
547 { | |
548 ss = ++ptr; | |
549 continue; | |
550 } | |
551 if ( k_dst ) kputc(',',&str); | |
552 kputsn(ss,ptr-ss,&str); | |
553 ss = ++ptr; | |
554 k_dst++; | |
555 } | |
556 assert( k_src==nR_ori ); | |
557 l = str.l - l; | |
558 for (; l<size; l++) kputc(0, &str); | |
559 } | |
560 } | |
561 } | |
562 nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type); | |
563 if ( nret<0 ) | |
564 { | |
565 fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, | |
566 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); | |
567 exit(1); | |
568 } | |
569 continue; | |
570 } | |
571 | |
572 int nori = nret / line->n_sample; | |
573 if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G | |
574 { | |
575 int inc = 0, nnew; | |
576 if ( vlen==BCF_VL_A ) | |
577 { | |
578 assert( nori==nA_ori ); // todo: will fail if all values are missing | |
579 ndat = nA_new*line->n_sample; | |
580 nnew = nA_new; | |
581 inc = 1; | |
582 } | |
583 else | |
584 { | |
585 assert( nori==nR_ori ); // todo: will fail if all values are missing | |
586 ndat = nR_new*line->n_sample; | |
587 nnew = nR_new; | |
588 } | |
589 | |
590 #define BRANCH(type_t,is_vector_end) \ | |
591 { \ | |
592 for (j=0; j<line->n_sample; j++) \ | |
593 { \ | |
594 type_t *ptr_src = ((type_t*)dat) + j*nori; \ | |
595 type_t *ptr_dst = ((type_t*)dat) + j*nnew; \ | |
596 int size = sizeof(type_t); \ | |
597 int k_src, k_dst = 0; \ | |
598 for (k_src=0; k_src<nori; k_src++) \ | |
599 { \ | |
600 if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \ | |
601 if ( rm_mask & 1<<(k_src+inc) ) continue; \ | |
602 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \ | |
603 k_dst++; \ | |
604 } \ | |
605 } \ | |
606 } | |
607 switch (type) | |
608 { | |
609 case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break; | |
610 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break; | |
611 } | |
612 #undef BRANCH | |
613 } | |
614 else // Number=G, diploid or mixture of haploid+diploid | |
615 { | |
616 assert( nori==nG_ori ); | |
617 ndat = nG_new*line->n_sample; | |
618 | |
619 #define BRANCH(type_t,is_vector_end) \ | |
620 { \ | |
621 for (j=0; j<line->n_sample; j++) \ | |
622 { \ | |
623 type_t *ptr_src = ((type_t*)dat) + j*nori; \ | |
624 type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \ | |
625 int size = sizeof(type_t); \ | |
626 int ia, ib, k_dst = 0, k_src; \ | |
627 int nset = 0; /* haploid or diploid? */ \ | |
628 for (k_src=0; k_src<nG_ori; k_src++) { if ( is_vector_end ) break; nset++; } \ | |
629 if ( nset==nR_ori ) /* haploid */ \ | |
630 { \ | |
631 for (k_src=0; k_src<nR_ori; k_src++) \ | |
632 { \ | |
633 if ( rm_mask & 1<<k_src ) continue; \ | |
634 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \ | |
635 k_dst++; \ | |
636 } \ | |
637 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \ | |
638 } \ | |
639 else /* diploid */ \ | |
640 { \ | |
641 k_src = -1; \ | |
642 for (ia=0; ia<nR_ori; ia++) \ | |
643 { \ | |
644 for (ib=0; ib<=ia; ib++) \ | |
645 { \ | |
646 k_src++; \ | |
647 if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; } \ | |
648 if ( rm_mask & 1<<ia || rm_mask & 1<<ib ) continue; \ | |
649 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \ | |
650 k_dst++; \ | |
651 } \ | |
652 } \ | |
653 } \ | |
654 } \ | |
655 } | |
656 switch (type) | |
657 { | |
658 case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break; | |
659 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break; | |
660 } | |
661 #undef BRANCH | |
662 } | |
663 nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type); | |
664 if ( nret<0 ) | |
665 { | |
666 fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__, | |
667 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); | |
668 exit(1); | |
669 } | |
670 } | |
671 free(dat); | |
672 free(str.s); | |
673 free(map); | |
674 } | |
675 |