comparison ezBAMQC/src/htslib/test/test-vcf-api.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfa3745e5fd8
1 /* test/test-vcf-api.c -- VCF test harness.
2
3 Copyright (C) 2013, 2014 Genome Research Ltd.
4
5 Author: Petr Danecek <pd3@sanger.ac.uk>
6
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE. */
24
25 #include <stdio.h>
26 #include <htslib/hts.h>
27 #include <htslib/vcf.h>
28 #include <htslib/kstring.h>
29 #include <htslib/kseq.h>
30
31 void write_bcf(char *fname)
32 {
33 // Init
34 htsFile *fp = hts_open(fname,"wb");
35 bcf_hdr_t *hdr = bcf_hdr_init("w");
36 bcf1_t *rec = bcf_init1();
37
38 // Create VCF header
39 kstring_t str = {0,0,0};
40 bcf_hdr_append(hdr, "##fileDate=20090805");
41 bcf_hdr_append(hdr, "##FORMAT=<ID=UF,Number=1,Type=Integer,Description=\"Unused FORMAT\">");
42 bcf_hdr_append(hdr, "##INFO=<ID=UI,Number=1,Type=Integer,Description=\"Unused INFO\">");
43 bcf_hdr_append(hdr, "##FILTER=<ID=Flt,Description=\"Unused FILTER\">");
44 bcf_hdr_append(hdr, "##unused=<XX=AA,Description=\"Unused generic\">");
45 bcf_hdr_append(hdr, "##unused=unformatted text 1");
46 bcf_hdr_append(hdr, "##unused=unformatted text 2");
47 bcf_hdr_append(hdr, "##contig=<ID=Unused,length=62435964>");
48 bcf_hdr_append(hdr, "##source=myImputationProgramV3.1");
49 bcf_hdr_append(hdr, "##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta");
50 bcf_hdr_append(hdr, "##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species=\"Homo sapiens\",taxonomy=x>");
51 bcf_hdr_append(hdr, "##phasing=partial");
52 bcf_hdr_append(hdr, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">");
53 bcf_hdr_append(hdr, "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">");
54 bcf_hdr_append(hdr, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">");
55 bcf_hdr_append(hdr, "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">");
56 bcf_hdr_append(hdr, "##INFO=<ID=DB,Number=0,Type=Flag,Description=\"dbSNP membership, build 129\">");
57 bcf_hdr_append(hdr, "##INFO=<ID=H2,Number=0,Type=Flag,Description=\"HapMap2 membership\">");
58 bcf_hdr_append(hdr, "##FILTER=<ID=q10,Description=\"Quality below 10\">");
59 bcf_hdr_append(hdr, "##FILTER=<ID=s50,Description=\"Less than 50% of samples have data\">");
60 bcf_hdr_append(hdr, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
61 bcf_hdr_append(hdr, "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">");
62 bcf_hdr_append(hdr, "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">");
63 bcf_hdr_append(hdr, "##FORMAT=<ID=HQ,Number=2,Type=Integer,Description=\"Haplotype Quality\">");
64 bcf_hdr_append(hdr, "##FORMAT=<ID=TS,Number=1,Type=String,Description=\"Test String\">");
65
66 bcf_hdr_add_sample(hdr, "NA00001");
67 bcf_hdr_add_sample(hdr, "NA00002");
68 bcf_hdr_add_sample(hdr, "NA00003");
69 bcf_hdr_add_sample(hdr, NULL); // to update internal structures
70 bcf_hdr_write(fp, hdr);
71
72
73 // Add a record
74 // 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
75 // .. CHROM
76 rec->rid = bcf_hdr_name2id(hdr, "20");
77 // .. POS
78 rec->pos = 14369;
79 // .. ID
80 bcf_update_id(hdr, rec, "rs6054257");
81 // .. REF and ALT
82 bcf_update_alleles_str(hdr, rec, "G,A");
83 // .. QUAL
84 rec->qual = 29;
85 // .. FILTER
86 int32_t tmpi = bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS");
87 bcf_update_filter(hdr, rec, &tmpi, 1);
88 // .. INFO
89 tmpi = 3;
90 bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1);
91 tmpi = 14;
92 bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1);
93 float tmpf = 0.5;
94 bcf_update_info_float(hdr, rec, "AF", &tmpf, 1);
95 bcf_update_info_flag(hdr, rec, "DB", NULL, 1);
96 bcf_update_info_flag(hdr, rec, "H2", NULL, 1);
97 // .. FORMAT
98 int32_t *tmpia = (int*)malloc(bcf_hdr_nsamples(hdr)*2*sizeof(int));
99 tmpia[0] = bcf_gt_phased(0);
100 tmpia[1] = bcf_gt_phased(0);
101 tmpia[2] = bcf_gt_phased(1);
102 tmpia[3] = bcf_gt_phased(0);
103 tmpia[4] = bcf_gt_unphased(1);
104 tmpia[5] = bcf_gt_unphased(1);
105 bcf_update_genotypes(hdr, rec, tmpia, bcf_hdr_nsamples(hdr)*2);
106 tmpia[0] = 48;
107 tmpia[1] = 48;
108 tmpia[2] = 43;
109 bcf_update_format_int32(hdr, rec, "GQ", tmpia, bcf_hdr_nsamples(hdr));
110 tmpia[0] = 1;
111 tmpia[1] = 8;
112 tmpia[2] = 5;
113 bcf_update_format_int32(hdr, rec, "DP", tmpia, bcf_hdr_nsamples(hdr));
114 tmpia[0] = 51;
115 tmpia[1] = 51;
116 tmpia[2] = 51;
117 tmpia[3] = 51;
118 tmpia[4] = bcf_int32_missing;
119 tmpia[5] = bcf_int32_missing;
120 bcf_update_format_int32(hdr, rec, "HQ", tmpia, bcf_hdr_nsamples(hdr)*2);
121 char *tmp_str[] = {"String1","SomeOtherString2","YetAnotherString3"};
122 bcf_update_format_string(hdr, rec, "TS", (const char**)tmp_str, 3);
123 bcf_write1(fp, hdr, rec);
124
125 // 20 1110696 . A G,T 67 . NS=2;DP=10;AF=0.333,.;AA=T;DB GT 2 1 ./.
126 bcf_clear1(rec);
127 rec->rid = bcf_hdr_name2id(hdr, "20");
128 rec->pos = 1110695;
129 bcf_update_alleles_str(hdr, rec, "A,G,T");
130 rec->qual = 67;
131 tmpi = 2;
132 bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1);
133 tmpi = 10;
134 bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1);
135 float *tmpfa = (float*)malloc(2*sizeof(float));
136 tmpfa[0] = 0.333;
137 bcf_float_set_missing(tmpfa[1]);
138 bcf_update_info_float(hdr, rec, "AF", tmpfa, 2);
139 bcf_update_info_string(hdr, rec, "AA", "T");
140 bcf_update_info_flag(hdr, rec, "DB", NULL, 1);
141 tmpia[0] = bcf_gt_phased(2);
142 tmpia[1] = bcf_int32_vector_end;
143 tmpia[2] = bcf_gt_phased(1);
144 tmpia[3] = bcf_int32_vector_end;
145 tmpia[4] = bcf_gt_missing;
146 tmpia[5] = bcf_gt_missing;
147 bcf_update_genotypes(hdr, rec, tmpia, bcf_hdr_nsamples(hdr)*2);
148 bcf_write1(fp, hdr, rec);
149
150 free(tmpia);
151 free(tmpfa);
152
153 // Clean
154 free(str.s);
155 bcf_destroy1(rec);
156 bcf_hdr_destroy(hdr);
157 int ret;
158 if ( (ret=hts_close(fp)) )
159 {
160 fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
161 exit(ret);
162 }
163 }
164
165 void bcf_to_vcf(char *fname)
166 {
167 htsFile *fp = hts_open(fname,"rb");
168 bcf_hdr_t *hdr = bcf_hdr_read(fp);
169 bcf1_t *rec = bcf_init1();
170
171 char *gz_fname = (char*) malloc(strlen(fname)+4);
172 snprintf(gz_fname,strlen(fname)+4,"%s.gz",fname);
173 htsFile *out = hts_open(gz_fname,"wg");
174
175 bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr);
176 bcf_hdr_remove(hdr_out,BCF_HL_STR,"unused");
177 bcf_hdr_remove(hdr_out,BCF_HL_GEN,"unused");
178 bcf_hdr_remove(hdr_out,BCF_HL_FLT,"Flt");
179 bcf_hdr_remove(hdr_out,BCF_HL_INFO,"UI");
180 bcf_hdr_remove(hdr_out,BCF_HL_FMT,"UF");
181 bcf_hdr_remove(hdr_out,BCF_HL_CTG,"Unused");
182 bcf_hdr_write(out, hdr_out);
183
184 while ( bcf_read1(fp, hdr, rec)>=0 )
185 {
186 bcf_write1(out, hdr_out, rec);
187
188 // Test problems caused by bcf1_sync: the data block
189 // may be realloced, also the unpacked structures must
190 // get updated.
191 bcf_unpack(rec, BCF_UN_STR);
192 bcf_update_id(hdr, rec, 0);
193 bcf_update_format_int32(hdr, rec, "GQ", NULL, 0);
194
195 bcf1_t *dup = bcf_dup(rec); // force bcf1_sync call
196 bcf_write1(out, hdr_out, dup);
197 bcf_destroy1(dup);
198
199 bcf_update_alleles_str(hdr_out, rec, "G,A");
200 int32_t tmpi = 99;
201 bcf_update_info_int32(hdr_out, rec, "DP", &tmpi, 1);
202 int32_t tmpia[] = {9,9,9};
203 bcf_update_format_int32(hdr_out, rec, "DP", tmpia, 3);
204
205 bcf_write1(out, hdr_out, rec);
206 }
207
208 bcf_destroy1(rec);
209 bcf_hdr_destroy(hdr);
210 bcf_hdr_destroy(hdr_out);
211 int ret;
212 if ( (ret=hts_close(fp)) )
213 {
214 fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
215 exit(ret);
216 }
217 if ( (ret=hts_close(out)) )
218 {
219 fprintf(stderr,"hts_close(%s): non-zero status %d\n",gz_fname,ret);
220 exit(ret);
221 }
222
223
224 // read gzip, write stdout
225 htsFile *gz_in = hts_open(gz_fname, "r");
226 if ( !gz_in )
227 {
228 fprintf(stderr,"Could not read: %s\n", gz_fname);
229 exit(1);
230 }
231
232 kstring_t line = {0,0,0};
233 while ( hts_getline(gz_in, KS_SEP_LINE, &line)>0 )
234 {
235 kputc('\n',&line);
236 fwrite(line.s,1,line.l,stdout);
237 }
238
239 if ( (ret=hts_close(gz_in)) )
240 {
241 fprintf(stderr,"hts_close(%s): non-zero status %d\n",gz_fname,ret);
242 exit(ret);
243 }
244 free(line.s);
245 free(gz_fname);
246 }
247
248 void iterator(const char *fname)
249 {
250 htsFile *fp = hts_open(fname, "r");
251 bcf_hdr_t *hdr = bcf_hdr_read(fp);
252 hts_idx_t *idx;
253 hts_itr_t *iter;
254
255 bcf_index_build(fname, 0);
256 idx = bcf_index_load(fname);
257
258 iter = bcf_itr_queryi(idx, bcf_hdr_name2id(hdr, "20"), 1110600, 1110800);
259 bcf_itr_destroy(iter);
260
261 iter = bcf_itr_querys(idx, hdr, "20:1110600-1110800");
262 bcf_itr_destroy(iter);
263
264 hts_idx_destroy(idx);
265 bcf_hdr_destroy(hdr);
266 int ret;
267 if ( (ret=hts_close(fp)) )
268 {
269 fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
270 exit(ret);
271 }
272 }
273
274 int main(int argc, char **argv)
275 {
276 char *fname = argc>1 ? argv[1] : "rmme.bcf";
277 write_bcf(fname);
278 bcf_to_vcf(fname);
279 iterator(fname);
280 return 0;
281 }
282