Mercurial > repos > youngkim > ezbamqc
comparison ezBAMQC/src/htslib/vcf_sweep.c @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfa3745e5fd8 |
---|---|
1 /* vcf_sweep.c -- forward/reverse sweep API. | |
2 | |
3 Copyright (C) 2013 Genome Research Ltd. | |
4 | |
5 Author: Petr Danecek <pd3@sanger.ac.uk> | |
6 | |
7 Permission is hereby granted, free of charge, to any person obtaining a copy | |
8 of this software and associated documentation files (the "Software"), to deal | |
9 in the Software without restriction, including without limitation the rights | |
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
11 copies of the Software, and to permit persons to whom the Software is | |
12 furnished to do so, subject to the following conditions: | |
13 | |
14 The above copyright notice and this permission notice shall be included in | |
15 all copies or substantial portions of the Software. | |
16 | |
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
23 DEALINGS IN THE SOFTWARE. */ | |
24 | |
25 #include "htslib/vcf_sweep.h" | |
26 #include "htslib/bgzf.h" | |
27 | |
28 #define SW_FWD 0 | |
29 #define SW_BWD 1 | |
30 | |
31 struct _bcf_sweep_t | |
32 { | |
33 htsFile *file; | |
34 bcf_hdr_t *hdr; | |
35 BGZF *fp; | |
36 | |
37 int direction; // to tell if the direction has changed | |
38 int block_size; // the size of uncompressed data to hold in memory | |
39 bcf1_t *rec; // bcf buffer | |
40 int nrec, mrec; // number of used records; total size of the buffer | |
41 int lrid, lpos, lnals, lals_len, mlals; // to check uniqueness of a record | |
42 char *lals; | |
43 | |
44 uint64_t *idx; // uncompressed offsets of VCF/BCF records | |
45 int iidx, nidx, midx; // i: current offset; n: used; m: allocated | |
46 int idx_done; // the index is built during the first pass | |
47 }; | |
48 | |
49 BGZF *hts_get_bgzfp(htsFile *fp); | |
50 int hts_useek(htsFile *file, long uoffset, int where); | |
51 long hts_utell(htsFile *file); | |
52 | |
53 static inline int sw_rec_equal(bcf_sweep_t *sw, bcf1_t *rec) | |
54 { | |
55 if ( sw->lrid!=rec->rid ) return 0; | |
56 if ( sw->lpos!=rec->pos ) return 0; | |
57 if ( sw->lnals!=rec->n_allele ) return 0; | |
58 | |
59 char *t = rec->d.allele[sw->lnals-1]; | |
60 int len = t - rec->d.allele[0] + 1; | |
61 while ( *t ) { t++; len++; } | |
62 if ( sw->lals_len!=len ) return 0; | |
63 if ( memcmp(sw->lals,rec->d.allele[0],len) ) return 0; | |
64 return 1; | |
65 } | |
66 | |
67 static void sw_rec_save(bcf_sweep_t *sw, bcf1_t *rec) | |
68 { | |
69 sw->lrid = rec->rid; | |
70 sw->lpos = rec->pos; | |
71 sw->lnals = rec->n_allele; | |
72 | |
73 char *t = rec->d.allele[sw->lnals-1]; | |
74 int len = t - rec->d.allele[0] + 1; | |
75 while ( *t ) { t++; len++; } | |
76 sw->lals_len = len; | |
77 hts_expand(char, len, sw->mlals, sw->lals); | |
78 memcpy(sw->lals, rec->d.allele[0], len); | |
79 } | |
80 | |
81 static void sw_fill_buffer(bcf_sweep_t *sw) | |
82 { | |
83 if ( !sw->iidx ) return; | |
84 sw->iidx--; | |
85 | |
86 int ret = hts_useek(sw->file, sw->idx[sw->iidx], 0); | |
87 assert( ret==0 ); | |
88 | |
89 sw->nrec = 0; | |
90 bcf1_t *rec = &sw->rec[sw->nrec]; | |
91 while ( (ret=bcf_read1(sw->file, sw->hdr, rec))==0 ) | |
92 { | |
93 bcf_unpack(rec, BCF_UN_STR); | |
94 | |
95 // if not in the last block, stop at the saved record | |
96 if ( sw->iidx+1 < sw->nidx && sw_rec_equal(sw,rec) ) break; | |
97 | |
98 sw->nrec++; | |
99 hts_expand0(bcf1_t, sw->nrec+1, sw->mrec, sw->rec); | |
100 rec = &sw->rec[sw->nrec]; | |
101 } | |
102 sw_rec_save(sw, &sw->rec[0]); | |
103 } | |
104 | |
105 bcf_sweep_t *bcf_sweep_init(const char *fname) | |
106 { | |
107 bcf_sweep_t *sw = (bcf_sweep_t*) calloc(1,sizeof(bcf_sweep_t)); | |
108 sw->file = hts_open(fname, "r"); | |
109 sw->fp = hts_get_bgzfp(sw->file); | |
110 bgzf_index_build_init(sw->fp); | |
111 sw->hdr = bcf_hdr_read(sw->file); | |
112 sw->mrec = 1; | |
113 sw->rec = (bcf1_t*) calloc(sw->mrec,(sizeof(bcf1_t))); | |
114 sw->block_size = 1024*1024*3; | |
115 sw->direction = SW_FWD; | |
116 return sw; | |
117 } | |
118 | |
119 void bcf_empty1(bcf1_t *v); | |
120 void bcf_sweep_destroy(bcf_sweep_t *sw) | |
121 { | |
122 int i; | |
123 for (i=0; i<sw->mrec; i++) bcf_empty1(&sw->rec[i]); | |
124 free(sw->idx); | |
125 free(sw->rec); | |
126 free(sw->lals); | |
127 bcf_hdr_destroy(sw->hdr); | |
128 hts_close(sw->file); | |
129 free(sw); | |
130 } | |
131 | |
132 static void sw_seek(bcf_sweep_t *sw, int direction) | |
133 { | |
134 sw->direction = direction; | |
135 if ( direction==SW_FWD ) | |
136 hts_useek(sw->file, sw->idx[0], 0); | |
137 else | |
138 { | |
139 sw->iidx = sw->nidx; | |
140 sw->nrec = 0; | |
141 } | |
142 } | |
143 | |
144 bcf1_t *bcf_sweep_fwd(bcf_sweep_t *sw) | |
145 { | |
146 if ( sw->direction==SW_BWD ) sw_seek(sw, SW_FWD); | |
147 | |
148 long pos = hts_utell(sw->file); | |
149 | |
150 bcf1_t *rec = &sw->rec[0]; | |
151 int ret = bcf_read1(sw->file, sw->hdr, rec); | |
152 | |
153 if ( ret!=0 ) // last record, get ready for sweeping backwards | |
154 { | |
155 sw->idx_done = 1; | |
156 sw->fp->idx_build_otf = 0; | |
157 sw_seek(sw, SW_BWD); | |
158 return NULL; | |
159 } | |
160 | |
161 if ( !sw->idx_done ) | |
162 { | |
163 if ( !sw->nidx || pos - sw->idx[sw->nidx-1] > sw->block_size ) | |
164 { | |
165 sw->nidx++; | |
166 hts_expand(uint64_t, sw->nidx, sw->midx, sw->idx); | |
167 sw->idx[sw->nidx-1] = pos; | |
168 } | |
169 } | |
170 return rec; | |
171 } | |
172 | |
173 bcf1_t *bcf_sweep_bwd(bcf_sweep_t *sw) | |
174 { | |
175 if ( sw->direction==SW_FWD ) sw_seek(sw, SW_BWD); | |
176 if ( !sw->nrec ) sw_fill_buffer(sw); | |
177 if ( !sw->nrec ) return NULL; | |
178 return &sw->rec[ --sw->nrec ]; | |
179 } | |
180 | |
181 bcf_hdr_t *bcf_sweep_hdr(bcf_sweep_t *sw) { return sw->hdr; } | |
182 |