annotate ezBAMQC/src/htslib/vcf_sweep.c @ 0:dfa3745e5fd8

Uploaded
author youngkim
date Thu, 24 Mar 2016 17:12:52 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1 /* vcf_sweep.c -- forward/reverse sweep API.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3 Copyright (C) 2013 Genome Research Ltd.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
4
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
5 Author: Petr Danecek <pd3@sanger.ac.uk>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
6
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
7 Permission is hereby granted, free of charge, to any person obtaining a copy
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
8 of this software and associated documentation files (the "Software"), to deal
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
9 in the Software without restriction, including without limitation the rights
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
11 copies of the Software, and to permit persons to whom the Software is
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
12 furnished to do so, subject to the following conditions:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
13
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
14 The above copyright notice and this permission notice shall be included in
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
15 all copies or substantial portions of the Software.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
16
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
23 DEALINGS IN THE SOFTWARE. */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
24
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
25 #include "htslib/vcf_sweep.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
26 #include "htslib/bgzf.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
27
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
28 #define SW_FWD 0
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
29 #define SW_BWD 1
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
30
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
31 struct _bcf_sweep_t
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
32 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
33 htsFile *file;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
34 bcf_hdr_t *hdr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
35 BGZF *fp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
36
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
37 int direction; // to tell if the direction has changed
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
38 int block_size; // the size of uncompressed data to hold in memory
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
39 bcf1_t *rec; // bcf buffer
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
40 int nrec, mrec; // number of used records; total size of the buffer
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
41 int lrid, lpos, lnals, lals_len, mlals; // to check uniqueness of a record
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
42 char *lals;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
43
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
44 uint64_t *idx; // uncompressed offsets of VCF/BCF records
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
45 int iidx, nidx, midx; // i: current offset; n: used; m: allocated
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
46 int idx_done; // the index is built during the first pass
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
47 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
48
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
49 BGZF *hts_get_bgzfp(htsFile *fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
50 int hts_useek(htsFile *file, long uoffset, int where);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
51 long hts_utell(htsFile *file);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
52
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
53 static inline int sw_rec_equal(bcf_sweep_t *sw, bcf1_t *rec)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
54 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
55 if ( sw->lrid!=rec->rid ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
56 if ( sw->lpos!=rec->pos ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
57 if ( sw->lnals!=rec->n_allele ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
58
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
59 char *t = rec->d.allele[sw->lnals-1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
60 int len = t - rec->d.allele[0] + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
61 while ( *t ) { t++; len++; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
62 if ( sw->lals_len!=len ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
63 if ( memcmp(sw->lals,rec->d.allele[0],len) ) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
64 return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
65 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
66
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
67 static void sw_rec_save(bcf_sweep_t *sw, bcf1_t *rec)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
68 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
69 sw->lrid = rec->rid;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
70 sw->lpos = rec->pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
71 sw->lnals = rec->n_allele;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
72
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
73 char *t = rec->d.allele[sw->lnals-1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
74 int len = t - rec->d.allele[0] + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
75 while ( *t ) { t++; len++; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
76 sw->lals_len = len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
77 hts_expand(char, len, sw->mlals, sw->lals);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
78 memcpy(sw->lals, rec->d.allele[0], len);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
79 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
80
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
81 static void sw_fill_buffer(bcf_sweep_t *sw)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
82 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
83 if ( !sw->iidx ) return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
84 sw->iidx--;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
85
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
86 int ret = hts_useek(sw->file, sw->idx[sw->iidx], 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
87 assert( ret==0 );
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
88
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
89 sw->nrec = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
90 bcf1_t *rec = &sw->rec[sw->nrec];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
91 while ( (ret=bcf_read1(sw->file, sw->hdr, rec))==0 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
92 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
93 bcf_unpack(rec, BCF_UN_STR);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
94
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
95 // if not in the last block, stop at the saved record
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
96 if ( sw->iidx+1 < sw->nidx && sw_rec_equal(sw,rec) ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
97
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
98 sw->nrec++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
99 hts_expand0(bcf1_t, sw->nrec+1, sw->mrec, sw->rec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
100 rec = &sw->rec[sw->nrec];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
101 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
102 sw_rec_save(sw, &sw->rec[0]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
103 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
104
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
105 bcf_sweep_t *bcf_sweep_init(const char *fname)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
106 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
107 bcf_sweep_t *sw = (bcf_sweep_t*) calloc(1,sizeof(bcf_sweep_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
108 sw->file = hts_open(fname, "r");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
109 sw->fp = hts_get_bgzfp(sw->file);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
110 bgzf_index_build_init(sw->fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
111 sw->hdr = bcf_hdr_read(sw->file);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
112 sw->mrec = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
113 sw->rec = (bcf1_t*) calloc(sw->mrec,(sizeof(bcf1_t)));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
114 sw->block_size = 1024*1024*3;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
115 sw->direction = SW_FWD;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
116 return sw;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
117 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
118
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
119 void bcf_empty1(bcf1_t *v);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
120 void bcf_sweep_destroy(bcf_sweep_t *sw)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
121 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
122 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
123 for (i=0; i<sw->mrec; i++) bcf_empty1(&sw->rec[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
124 free(sw->idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
125 free(sw->rec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
126 free(sw->lals);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
127 bcf_hdr_destroy(sw->hdr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
128 hts_close(sw->file);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
129 free(sw);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
130 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
131
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
132 static void sw_seek(bcf_sweep_t *sw, int direction)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
133 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
134 sw->direction = direction;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
135 if ( direction==SW_FWD )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
136 hts_useek(sw->file, sw->idx[0], 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
137 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
138 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
139 sw->iidx = sw->nidx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
140 sw->nrec = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
141 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
142 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
143
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
144 bcf1_t *bcf_sweep_fwd(bcf_sweep_t *sw)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
145 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
146 if ( sw->direction==SW_BWD ) sw_seek(sw, SW_FWD);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
147
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
148 long pos = hts_utell(sw->file);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
149
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
150 bcf1_t *rec = &sw->rec[0];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
151 int ret = bcf_read1(sw->file, sw->hdr, rec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
152
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
153 if ( ret!=0 ) // last record, get ready for sweeping backwards
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
154 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
155 sw->idx_done = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
156 sw->fp->idx_build_otf = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
157 sw_seek(sw, SW_BWD);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
158 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
159 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
160
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
161 if ( !sw->idx_done )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
162 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
163 if ( !sw->nidx || pos - sw->idx[sw->nidx-1] > sw->block_size )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
164 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
165 sw->nidx++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
166 hts_expand(uint64_t, sw->nidx, sw->midx, sw->idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
167 sw->idx[sw->nidx-1] = pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
168 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
169 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
170 return rec;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
171 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
172
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
173 bcf1_t *bcf_sweep_bwd(bcf_sweep_t *sw)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
174 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
175 if ( sw->direction==SW_FWD ) sw_seek(sw, SW_BWD);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
176 if ( !sw->nrec ) sw_fill_buffer(sw);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
177 if ( !sw->nrec ) return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
178 return &sw->rec[ --sw->nrec ];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
179 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
180
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
181 bcf_hdr_t *bcf_sweep_hdr(bcf_sweep_t *sw) { return sw->hdr; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
182