0
|
1 /* vcf_sweep.c -- forward/reverse sweep API.
|
|
2
|
|
3 Copyright (C) 2013 Genome Research Ltd.
|
|
4
|
|
5 Author: Petr Danecek <pd3@sanger.ac.uk>
|
|
6
|
|
7 Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
8 of this software and associated documentation files (the "Software"), to deal
|
|
9 in the Software without restriction, including without limitation the rights
|
|
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
11 copies of the Software, and to permit persons to whom the Software is
|
|
12 furnished to do so, subject to the following conditions:
|
|
13
|
|
14 The above copyright notice and this permission notice shall be included in
|
|
15 all copies or substantial portions of the Software.
|
|
16
|
|
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
23 DEALINGS IN THE SOFTWARE. */
|
|
24
|
|
25 #include "htslib/vcf_sweep.h"
|
|
26 #include "htslib/bgzf.h"
|
|
27
|
|
28 #define SW_FWD 0
|
|
29 #define SW_BWD 1
|
|
30
|
|
31 struct _bcf_sweep_t
|
|
32 {
|
|
33 htsFile *file;
|
|
34 bcf_hdr_t *hdr;
|
|
35 BGZF *fp;
|
|
36
|
|
37 int direction; // to tell if the direction has changed
|
|
38 int block_size; // the size of uncompressed data to hold in memory
|
|
39 bcf1_t *rec; // bcf buffer
|
|
40 int nrec, mrec; // number of used records; total size of the buffer
|
|
41 int lrid, lpos, lnals, lals_len, mlals; // to check uniqueness of a record
|
|
42 char *lals;
|
|
43
|
|
44 uint64_t *idx; // uncompressed offsets of VCF/BCF records
|
|
45 int iidx, nidx, midx; // i: current offset; n: used; m: allocated
|
|
46 int idx_done; // the index is built during the first pass
|
|
47 };
|
|
48
|
|
49 BGZF *hts_get_bgzfp(htsFile *fp);
|
|
50 int hts_useek(htsFile *file, long uoffset, int where);
|
|
51 long hts_utell(htsFile *file);
|
|
52
|
|
53 static inline int sw_rec_equal(bcf_sweep_t *sw, bcf1_t *rec)
|
|
54 {
|
|
55 if ( sw->lrid!=rec->rid ) return 0;
|
|
56 if ( sw->lpos!=rec->pos ) return 0;
|
|
57 if ( sw->lnals!=rec->n_allele ) return 0;
|
|
58
|
|
59 char *t = rec->d.allele[sw->lnals-1];
|
|
60 int len = t - rec->d.allele[0] + 1;
|
|
61 while ( *t ) { t++; len++; }
|
|
62 if ( sw->lals_len!=len ) return 0;
|
|
63 if ( memcmp(sw->lals,rec->d.allele[0],len) ) return 0;
|
|
64 return 1;
|
|
65 }
|
|
66
|
|
67 static void sw_rec_save(bcf_sweep_t *sw, bcf1_t *rec)
|
|
68 {
|
|
69 sw->lrid = rec->rid;
|
|
70 sw->lpos = rec->pos;
|
|
71 sw->lnals = rec->n_allele;
|
|
72
|
|
73 char *t = rec->d.allele[sw->lnals-1];
|
|
74 int len = t - rec->d.allele[0] + 1;
|
|
75 while ( *t ) { t++; len++; }
|
|
76 sw->lals_len = len;
|
|
77 hts_expand(char, len, sw->mlals, sw->lals);
|
|
78 memcpy(sw->lals, rec->d.allele[0], len);
|
|
79 }
|
|
80
|
|
81 static void sw_fill_buffer(bcf_sweep_t *sw)
|
|
82 {
|
|
83 if ( !sw->iidx ) return;
|
|
84 sw->iidx--;
|
|
85
|
|
86 int ret = hts_useek(sw->file, sw->idx[sw->iidx], 0);
|
|
87 assert( ret==0 );
|
|
88
|
|
89 sw->nrec = 0;
|
|
90 bcf1_t *rec = &sw->rec[sw->nrec];
|
|
91 while ( (ret=bcf_read1(sw->file, sw->hdr, rec))==0 )
|
|
92 {
|
|
93 bcf_unpack(rec, BCF_UN_STR);
|
|
94
|
|
95 // if not in the last block, stop at the saved record
|
|
96 if ( sw->iidx+1 < sw->nidx && sw_rec_equal(sw,rec) ) break;
|
|
97
|
|
98 sw->nrec++;
|
|
99 hts_expand0(bcf1_t, sw->nrec+1, sw->mrec, sw->rec);
|
|
100 rec = &sw->rec[sw->nrec];
|
|
101 }
|
|
102 sw_rec_save(sw, &sw->rec[0]);
|
|
103 }
|
|
104
|
|
105 bcf_sweep_t *bcf_sweep_init(const char *fname)
|
|
106 {
|
|
107 bcf_sweep_t *sw = (bcf_sweep_t*) calloc(1,sizeof(bcf_sweep_t));
|
|
108 sw->file = hts_open(fname, "r");
|
|
109 sw->fp = hts_get_bgzfp(sw->file);
|
|
110 bgzf_index_build_init(sw->fp);
|
|
111 sw->hdr = bcf_hdr_read(sw->file);
|
|
112 sw->mrec = 1;
|
|
113 sw->rec = (bcf1_t*) calloc(sw->mrec,(sizeof(bcf1_t)));
|
|
114 sw->block_size = 1024*1024*3;
|
|
115 sw->direction = SW_FWD;
|
|
116 return sw;
|
|
117 }
|
|
118
|
|
119 void bcf_empty1(bcf1_t *v);
|
|
120 void bcf_sweep_destroy(bcf_sweep_t *sw)
|
|
121 {
|
|
122 int i;
|
|
123 for (i=0; i<sw->mrec; i++) bcf_empty1(&sw->rec[i]);
|
|
124 free(sw->idx);
|
|
125 free(sw->rec);
|
|
126 free(sw->lals);
|
|
127 bcf_hdr_destroy(sw->hdr);
|
|
128 hts_close(sw->file);
|
|
129 free(sw);
|
|
130 }
|
|
131
|
|
132 static void sw_seek(bcf_sweep_t *sw, int direction)
|
|
133 {
|
|
134 sw->direction = direction;
|
|
135 if ( direction==SW_FWD )
|
|
136 hts_useek(sw->file, sw->idx[0], 0);
|
|
137 else
|
|
138 {
|
|
139 sw->iidx = sw->nidx;
|
|
140 sw->nrec = 0;
|
|
141 }
|
|
142 }
|
|
143
|
|
144 bcf1_t *bcf_sweep_fwd(bcf_sweep_t *sw)
|
|
145 {
|
|
146 if ( sw->direction==SW_BWD ) sw_seek(sw, SW_FWD);
|
|
147
|
|
148 long pos = hts_utell(sw->file);
|
|
149
|
|
150 bcf1_t *rec = &sw->rec[0];
|
|
151 int ret = bcf_read1(sw->file, sw->hdr, rec);
|
|
152
|
|
153 if ( ret!=0 ) // last record, get ready for sweeping backwards
|
|
154 {
|
|
155 sw->idx_done = 1;
|
|
156 sw->fp->idx_build_otf = 0;
|
|
157 sw_seek(sw, SW_BWD);
|
|
158 return NULL;
|
|
159 }
|
|
160
|
|
161 if ( !sw->idx_done )
|
|
162 {
|
|
163 if ( !sw->nidx || pos - sw->idx[sw->nidx-1] > sw->block_size )
|
|
164 {
|
|
165 sw->nidx++;
|
|
166 hts_expand(uint64_t, sw->nidx, sw->midx, sw->idx);
|
|
167 sw->idx[sw->nidx-1] = pos;
|
|
168 }
|
|
169 }
|
|
170 return rec;
|
|
171 }
|
|
172
|
|
173 bcf1_t *bcf_sweep_bwd(bcf_sweep_t *sw)
|
|
174 {
|
|
175 if ( sw->direction==SW_FWD ) sw_seek(sw, SW_BWD);
|
|
176 if ( !sw->nrec ) sw_fill_buffer(sw);
|
|
177 if ( !sw->nrec ) return NULL;
|
|
178 return &sw->rec[ --sw->nrec ];
|
|
179 }
|
|
180
|
|
181 bcf_hdr_t *bcf_sweep_hdr(bcf_sweep_t *sw) { return sw->hdr; }
|
|
182
|