0
|
1 /* The MIT License
|
|
2
|
|
3 Copyright (c) 2010 Broad Institute
|
|
4
|
|
5 Permission is hereby granted, free of charge, to any person obtaining
|
|
6 a copy of this software and associated documentation files (the
|
|
7 "Software"), to deal in the Software without restriction, including
|
|
8 without limitation the rights to use, copy, modify, merge, publish,
|
|
9 distribute, sublicense, and/or sell copies of the Software, and to
|
|
10 permit persons to whom the Software is furnished to do so, subject to
|
|
11 the following conditions:
|
|
12
|
|
13 The above copyright notice and this permission notice shall be
|
|
14 included in all copies or substantial portions of the Software.
|
|
15
|
|
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
|
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
|
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
23 SOFTWARE.
|
|
24 */
|
|
25
|
|
26 /* Contact: Heng Li <lh3@live.co.uk> */
|
|
27
|
|
28 #ifndef BCF_H
|
|
29 #define BCF_H
|
|
30
|
|
31 #define BCF_VERSION "0.1.19-44428cd"
|
|
32
|
|
33 #include <stdint.h>
|
|
34 #include <zlib.h>
|
|
35
|
|
36 #ifndef BCF_LITE
|
|
37 #include "bgzf.h"
|
|
38 typedef BGZF *bcfFile;
|
|
39 #else
|
|
40 typedef gzFile bcfFile;
|
|
41 #define bgzf_open(fn, mode) gzopen(fn, mode)
|
|
42 #define bgzf_fdopen(fd, mode) gzdopen(fd, mode)
|
|
43 #define bgzf_close(fp) gzclose(fp)
|
|
44 #define bgzf_read(fp, buf, len) gzread(fp, buf, len)
|
|
45 #define bgzf_write(fp, buf, len)
|
|
46 #define bgzf_flush(fp)
|
|
47 #endif
|
|
48
|
|
/*
  A member in the structs below is said to be "primary" if its content
  cannot be inferred from other members in any of the structs below; a
  member is said to be "derived" if its content can be derived from
  other members. For example, bcf1_t::str is primary as this comes from
  the input data, while bcf1_t::info is derived as it can always be
  correctly set if we know bcf1_t::str. Derived members are for quick
  access to the content and must be synchronized with the primary data.
 */
|
|
58
|
|
/* One genotype field of a record (element type of bcf1_t::gi). */
typedef struct {
	uint32_t fmt; // format of the block, set by bcf_str2int().
	int len; // length of data for each individual
	void *data; // concatenated data
	// derived info: fmt, len (<-bcf1_t::fmt)
} bcf_ginfo_t;
|
|
65
|
|
66 typedef struct {
|
|
67 int32_t tid, pos; // refID and 0-based position
|
|
68 int32_t l_str, m_str; // length and the allocated size of ->str
|
|
69 float qual; // SNP quality
|
|
70 char *str; // concatenated string of variable length strings in VCF (from col.2 to col.7)
|
|
71 char *ref, *alt, *flt, *info, *fmt; // they all point to ->str; no memory allocation
|
|
72 int n_gi, m_gi; // number and the allocated size of geno fields
|
|
73 bcf_ginfo_t *gi; // array of geno fields
|
|
74 int n_alleles, n_smpl; // number of alleles and samples
|
|
75 // derived info: ref, alt, flt, info, fmt (<-str), n_gi (<-fmt), n_alleles (<-alt), n_smpl (<-bcf_hdr_t::n_smpl)
|
|
76 uint8_t *ploidy; // ploidy of all samples; if NULL, ploidy of 2 is assumed.
|
|
77 } bcf1_t;
|
|
78
|
|
/* BCF header: reference sequence names, sample names and header text. */
typedef struct {
	int32_t n_ref, n_smpl; // number of reference sequences and samples
	int32_t l_nm; // length of concatenated sequence names; 0 padded
	int32_t l_smpl; // length of concatenated sample names; 0 padded
	int32_t l_txt; // length of header text (lines started with ##)
	char *name, *sname, *txt; // concatenated sequence names, sample names and header text
	char **ns, **sns; // array of sequence and sample names; point to name and sname, respectively
	// derived info: n_ref (<-name), n_smpl (<-sname), ns (<-name), sns (<-sname)
} bcf_hdr_t;
|
|
88
|
|
89 typedef struct {
|
|
90 int is_vcf; // if the file in operation is a VCF
|
|
91 void *v; // auxillary data structure for VCF
|
|
92 bcfFile fp; // file handler for BCF
|
|
93 } bcf_t;
|
|
94
|
|
/* Index type; only declared (incomplete) in this header, so it is used
   through pointers. */
struct __bcf_idx_t;
typedef struct __bcf_idx_t bcf_idx_t;
|
|
97
|
|
98 #ifdef __cplusplus
|
|
99 extern "C" {
|
|
100 #endif
|
|
101
|
|
102 // open a BCF file; for BCF file only
|
|
103 bcf_t *bcf_open(const char *fn, const char *mode);
|
|
104 // close file
|
|
105 int bcf_close(bcf_t *b);
|
|
106 // read one record from BCF; return -1 on end-of-file, and <-1 for errors
|
|
107 int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b);
|
|
108 // call this function if b->str is changed
|
|
109 int bcf_sync(bcf1_t *b);
|
|
110 // write a BCF record
|
|
111 int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b);
|
|
112 // read the BCF header; BCF only
|
|
113 bcf_hdr_t *bcf_hdr_read(bcf_t *b);
|
|
114 // write the BCF header
|
|
115 int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h);
|
|
116 // set bcf_hdr_t::ns and bcf_hdr_t::sns
|
|
117 int bcf_hdr_sync(bcf_hdr_t *b);
|
|
118 // destroy the header
|
|
119 void bcf_hdr_destroy(bcf_hdr_t *h);
|
|
120 // destroy a record
|
|
121 int bcf_destroy(bcf1_t *b);
|
|
122 // BCF->VCF conversion
|
|
123 char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b);
|
|
124 // append more info
|
|
125 int bcf_append_info(bcf1_t *b, const char *info, int l);
|
|
126 // remove tag
|
|
127 int remove_tag(char *string, const char *tag, char delim);
|
|
128 // remove info tag, string is the kstring holder of bcf1_t.str
|
|
129 void rm_info(kstring_t *string, const char *key);
|
|
130 // copy
|
|
131 int bcf_cpy(bcf1_t *r, const bcf1_t *b);
|
|
132
|
|
133 // open a VCF or BCF file if "b" is set in "mode"
|
|
134 bcf_t *vcf_open(const char *fn, const char *mode);
|
|
135 // close a VCF/BCF file
|
|
136 int vcf_close(bcf_t *bp);
|
|
137 // read the VCF/BCF header
|
|
138 bcf_hdr_t *vcf_hdr_read(bcf_t *bp);
|
|
139 // read the sequence dictionary from a separate file; required for VCF->BCF conversion
|
|
140 int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn);
|
|
141 // read a VCF/BCF record; return -1 on end-of-file and <-1 for errors
|
|
142 int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b);
|
|
143 // write the VCF header
|
|
144 int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h);
|
|
145 // write a VCF record
|
|
146 int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b);
|
|
147
|
|
148 // keep the first n alleles and discard the rest
|
|
149 int bcf_shrink_alt(bcf1_t *b, int n);
|
|
150 // keep the masked alleles and discard the rest
|
|
151 void bcf_fit_alt(bcf1_t *b, int mask);
|
|
152 // convert GL to PL
|
|
153 int bcf_gl2pl(bcf1_t *b);
|
|
154 // if the site is an indel
|
|
155 int bcf_is_indel(const bcf1_t *b);
|
|
156 bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list);
|
|
157 int bcf_subsam(int n_smpl, int *list, bcf1_t *b);
|
|
158 // move GT to the first FORMAT field
|
|
159 int bcf_fix_gt(bcf1_t *b);
|
|
160 // update PL generated by old samtools
|
|
161 int bcf_fix_pl(bcf1_t *b);
|
|
162 // convert PL to GLF-like 10-likelihood GL
|
|
163 int bcf_gl10(const bcf1_t *b, uint8_t *gl);
|
|
164 // convert up to 4 INDEL alleles to GLF-like 10-likelihood GL
|
|
165 int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl);
|
|
166
|
|
167 // string hash table
|
|
168 void *bcf_build_refhash(bcf_hdr_t *h);
|
|
// The string hash table is handled through an opaque void pointer.
void bcf_str2id_destroy(void *_hash);
void bcf_str2id_thorough_destroy(void *_hash);
int bcf_str2id_add(void *_hash, const char *str);
int bcf_str2id(void *_hash, const char *str);
// (void) rather than (): an empty parameter list is not a prototype in C and
// would let callers pass arbitrary arguments without a diagnostic.
void *bcf_str2id_init(void);
|
|
174
|
|
175 // indexing related functions
|
|
176 int bcf_idx_build(const char *fn);
|
|
177 uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg);
|
|
178 int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end);
|
|
179 bcf_idx_t *bcf_idx_load(const char *fn);
|
|
180 void bcf_idx_destroy(bcf_idx_t *idx);
|
|
181
|
|
182 #ifdef __cplusplus
|
|
183 }
|
|
184 #endif
|
|
185
|
|
/**
 * Pack the leading characters of a string into a 32-bit integer key.
 *
 * At most four bytes of str are consumed; packing stops early at a NUL byte
 * or once l bytes have been read. Each byte is shifted in from the right, so
 * the first character ends up in the most significant occupied byte
 * (e.g. "PL" with l == 2 gives 0x504C).
 *
 * @param str  characters to pack; must be readable for min(l, 4) bytes or up
 *             to its first NUL byte
 * @param l    number of bytes available in str; values <= 0 yield 0
 * @return     the packed key, or 0 if no byte was consumed
 */
static inline uint32_t bcf_str2int(const char *str, int l)
{
	uint32_t x = 0;
	int i;
	for (i = 0; i < l && i < 4; ++i) {
		if (str[i] == 0) return x;
		// Go through uint8_t: plain char may be signed, and a sign-extended
		// byte >= 0x80 would overwrite the bytes already packed into x.
		x = x << 8 | (uint8_t)str[i];
	}
	return x;
}
|
|
196
|
|
197 #endif
|