Mercurial > repos > youngkim > ezbamqc
diff ezBAMQC/src/htslib/vcf_sweep.c @ 0:dfa3745e5fd8
Uploaded
author | youngkim |
---|---|
date | Thu, 24 Mar 2016 17:12:52 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ezBAMQC/src/htslib/vcf_sweep.c Thu Mar 24 17:12:52 2016 -0400 @@ -0,0 +1,182 @@ +/* vcf_sweep.c -- forward/reverse sweep API. + + Copyright (C) 2013 Genome Research Ltd. + + Author: Petr Danecek <pd3@sanger.ac.uk> + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include "htslib/vcf_sweep.h" +#include "htslib/bgzf.h" + +#define SW_FWD 0 +#define SW_BWD 1 + +struct _bcf_sweep_t +{ + htsFile *file; + bcf_hdr_t *hdr; + BGZF *fp; + + int direction; // to tell if the direction has changed + int block_size; // the size of uncompressed data to hold in memory + bcf1_t *rec; // bcf buffer + int nrec, mrec; // number of used records; total size of the buffer + int lrid, lpos, lnals, lals_len, mlals; // to check uniqueness of a record + char *lals; + + uint64_t *idx; // uncompressed offsets of VCF/BCF records + int iidx, nidx, midx; // i: current offset; n: used; m: allocated + int idx_done; // the index is built during the first pass +}; + +BGZF *hts_get_bgzfp(htsFile *fp); +int hts_useek(htsFile *file, long uoffset, int where); +long hts_utell(htsFile *file); + +static inline int sw_rec_equal(bcf_sweep_t *sw, bcf1_t *rec) +{ + if ( sw->lrid!=rec->rid ) return 0; + if ( sw->lpos!=rec->pos ) return 0; + if ( sw->lnals!=rec->n_allele ) return 0; + + char *t = rec->d.allele[sw->lnals-1]; + int len = t - rec->d.allele[0] + 1; + while ( *t ) { t++; len++; } + if ( sw->lals_len!=len ) return 0; + if ( memcmp(sw->lals,rec->d.allele[0],len) ) return 0; + return 1; +} + +static void sw_rec_save(bcf_sweep_t *sw, bcf1_t *rec) +{ + sw->lrid = rec->rid; + sw->lpos = rec->pos; + sw->lnals = rec->n_allele; + + char *t = rec->d.allele[sw->lnals-1]; + int len = t - rec->d.allele[0] + 1; + while ( *t ) { t++; len++; } + sw->lals_len = len; + hts_expand(char, len, sw->mlals, sw->lals); + memcpy(sw->lals, rec->d.allele[0], len); +} + +static void sw_fill_buffer(bcf_sweep_t *sw) +{ + if ( !sw->iidx ) return; + sw->iidx--; + + int ret = hts_useek(sw->file, sw->idx[sw->iidx], 0); + assert( ret==0 ); + + sw->nrec = 0; + bcf1_t *rec = &sw->rec[sw->nrec]; + while ( (ret=bcf_read1(sw->file, sw->hdr, rec))==0 ) + { + bcf_unpack(rec, BCF_UN_STR); + + // if not in the last block, stop at the saved record + if ( sw->iidx+1 < sw->nidx && sw_rec_equal(sw,rec) ) break; + + sw->nrec++; + hts_expand0(bcf1_t, sw->nrec+1, sw->mrec, sw->rec); + rec = &sw->rec[sw->nrec]; + } + sw_rec_save(sw, &sw->rec[0]); +} + +bcf_sweep_t *bcf_sweep_init(const char *fname) +{ + bcf_sweep_t *sw = (bcf_sweep_t*) calloc(1,sizeof(bcf_sweep_t)); + sw->file = hts_open(fname, "r"); + sw->fp = hts_get_bgzfp(sw->file); + bgzf_index_build_init(sw->fp); + sw->hdr = bcf_hdr_read(sw->file); + sw->mrec = 1; + sw->rec = (bcf1_t*) calloc(sw->mrec,(sizeof(bcf1_t))); + sw->block_size = 1024*1024*3; + sw->direction = SW_FWD; + return sw; +} + +void bcf_empty1(bcf1_t *v); +void bcf_sweep_destroy(bcf_sweep_t *sw) +{ + int i; + for (i=0; i<sw->mrec; i++) bcf_empty1(&sw->rec[i]); + free(sw->idx); + free(sw->rec); + free(sw->lals); + bcf_hdr_destroy(sw->hdr); + hts_close(sw->file); + free(sw); +} + +static void sw_seek(bcf_sweep_t *sw, int direction) +{ + sw->direction = direction; + if ( direction==SW_FWD ) + hts_useek(sw->file, sw->idx[0], 0); + else + { + sw->iidx = sw->nidx; + sw->nrec = 0; + } +} + +bcf1_t *bcf_sweep_fwd(bcf_sweep_t *sw) +{ + if ( sw->direction==SW_BWD ) sw_seek(sw, SW_FWD); + + long pos = hts_utell(sw->file); + + bcf1_t *rec = &sw->rec[0]; + int ret = bcf_read1(sw->file, sw->hdr, rec); + + if ( ret!=0 ) // last record, get ready for sweeping backwards + { + sw->idx_done = 1; + sw->fp->idx_build_otf = 0; + sw_seek(sw, SW_BWD); + return NULL; + } + + if ( !sw->idx_done ) + { + if ( !sw->nidx || pos - sw->idx[sw->nidx-1] > sw->block_size ) + { + sw->nidx++; + hts_expand(uint64_t, sw->nidx, sw->midx, sw->idx); + sw->idx[sw->nidx-1] = pos; + } + } + return rec; +} + +bcf1_t *bcf_sweep_bwd(bcf_sweep_t *sw) +{ + if ( sw->direction==SW_FWD ) sw_seek(sw, SW_BWD); + if ( !sw->nrec ) sw_fill_buffer(sw); + if ( !sw->nrec ) return NULL; + return &sw->rec[ --sw->nrec ]; +} + +bcf_hdr_t *bcf_sweep_hdr(bcf_sweep_t *sw) { return sw->hdr; } +