0
|
1 /* The MIT License
|
|
2
|
|
3 Copyright (c) 2008-2010 Genome Research Ltd (GRL).
|
|
4
|
|
5 Permission is hereby granted, free of charge, to any person obtaining
|
|
6 a copy of this software and associated documentation files (the
|
|
7 "Software"), to deal in the Software without restriction, including
|
|
8 without limitation the rights to use, copy, modify, merge, publish,
|
|
9 distribute, sublicense, and/or sell copies of the Software, and to
|
|
10 permit persons to whom the Software is furnished to do so, subject to
|
|
11 the following conditions:
|
|
12
|
|
13 The above copyright notice and this permission notice shall be
|
|
14 included in all copies or substantial portions of the Software.
|
|
15
|
|
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
|
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
|
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
23 SOFTWARE.
|
|
24 */
|
|
25
|
|
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
|
|
27
|
|
28 #ifndef BAM_BAM_H
|
|
29 #define BAM_BAM_H
|
|
30
|
|
31 /*!
|
|
32 @header
|
|
33
|
|
34 BAM library provides I/O and various operations on manipulating files
|
|
35 in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map)
|
|
36 format. It now supports importing from or exporting to SAM, sorting,
|
|
37 merging, generating pileup, and quickly retrieval of reads overlapped
|
|
38 with a specified region.
|
|
39
|
|
40 @copyright Genome Research Ltd.
|
|
41 */
|
|
42
|
|
43 #define BAM_VERSION "0.1.19-44428cd"
|
|
44
|
|
45 #include <stdint.h>
|
|
46 #include <stdlib.h>
|
|
47 #include <string.h>
|
|
48 #include <stdio.h>
|
|
49
|
|
50 #ifndef BAM_LITE
|
|
51 #define BAM_VIRTUAL_OFFSET16
|
|
52 #include "bgzf.h"
|
|
53 /*! @abstract BAM file handler */
|
|
54 typedef BGZF *bamFile;
|
|
55 #define bam_open(fn, mode) bgzf_open(fn, mode)
|
|
56 #define bam_dopen(fd, mode) bgzf_fdopen(fd, mode)
|
|
57 #define bam_close(fp) bgzf_close(fp)
|
|
58 #define bam_read(fp, buf, size) bgzf_read(fp, buf, size)
|
|
59 #define bam_write(fp, buf, size) bgzf_write(fp, buf, size)
|
|
60 #define bam_tell(fp) bgzf_tell(fp)
|
|
61 #define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir)
|
|
62 #else
|
|
63 #define BAM_TRUE_OFFSET
|
|
64 #include <zlib.h>
|
|
65 typedef gzFile bamFile;
|
|
66 #define bam_open(fn, mode) gzopen(fn, mode)
|
|
67 #define bam_dopen(fd, mode) gzdopen(fd, mode)
|
|
68 #define bam_close(fp) gzclose(fp)
|
|
69 #define bam_read(fp, buf, size) gzread(fp, buf, size)
|
|
70 /* no bam_write/bam_tell/bam_seek() here */
|
|
71 #endif
|
|
72
|
|
73 /*! @typedef
|
|
74 @abstract Structure for the alignment header.
|
|
75 @field n_targets number of reference sequences
|
|
76 @field target_name names of the reference sequences
|
|
77 @field target_len lengths of the referene sequences
|
|
78 @field dict header dictionary
|
|
79 @field hash hash table for fast name lookup
|
|
80 @field rg2lib hash table for @RG-ID -> LB lookup
|
|
81 @field l_text length of the plain text in the header
|
|
82 @field text plain text
|
|
83
|
|
84 @discussion Field hash points to null by default. It is a private
|
|
85 member.
|
|
86 */
|
|
87 typedef struct {
|
|
88 int32_t n_targets;
|
|
89 char **target_name;
|
|
90 uint32_t *target_len;
|
|
91 void *dict, *hash, *rg2lib;
|
|
92 uint32_t l_text, n_text;
|
|
93 char *text;
|
|
94 } bam_header_t;
|
|
95
|
|
96 /*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
|
|
97 #define BAM_FPAIRED 1
|
|
98 /*! @abstract the read is mapped in a proper pair */
|
|
99 #define BAM_FPROPER_PAIR 2
|
|
100 /*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
|
|
101 #define BAM_FUNMAP 4
|
|
102 /*! @abstract the mate is unmapped */
|
|
103 #define BAM_FMUNMAP 8
|
|
104 /*! @abstract the read is mapped to the reverse strand */
|
|
105 #define BAM_FREVERSE 16
|
|
106 /*! @abstract the mate is mapped to the reverse strand */
|
|
107 #define BAM_FMREVERSE 32
|
|
108 /*! @abstract this is read1 */
|
|
109 #define BAM_FREAD1 64
|
|
110 /*! @abstract this is read2 */
|
|
111 #define BAM_FREAD2 128
|
|
112 /*! @abstract not primary alignment */
|
|
113 #define BAM_FSECONDARY 256
|
|
114 /*! @abstract QC failure */
|
|
115 #define BAM_FQCFAIL 512
|
|
116 /*! @abstract optical or PCR duplicate */
|
|
117 #define BAM_FDUP 1024
|
|
118
|
|
119 #define BAM_OFDEC 0
|
|
120 #define BAM_OFHEX 1
|
|
121 #define BAM_OFSTR 2
|
|
122
|
|
123 /*! @abstract defautl mask for pileup */
|
|
124 #define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
|
|
125
|
|
126 #define BAM_CORE_SIZE sizeof(bam1_core_t)
|
|
127
|
|
128 /**
|
|
129 * Describing how CIGAR operation/length is packed in a 32-bit integer.
|
|
130 */
|
|
131 #define BAM_CIGAR_SHIFT 4
|
|
132 #define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
|
|
133
|
|
134 /*
|
|
135 CIGAR operations.
|
|
136 */
|
|
137 /*! @abstract CIGAR: M = match or mismatch*/
|
|
138 #define BAM_CMATCH 0
|
|
139 /*! @abstract CIGAR: I = insertion to the reference */
|
|
140 #define BAM_CINS 1
|
|
141 /*! @abstract CIGAR: D = deletion from the reference */
|
|
142 #define BAM_CDEL 2
|
|
143 /*! @abstract CIGAR: N = skip on the reference (e.g. spliced alignment) */
|
|
144 #define BAM_CREF_SKIP 3
|
|
145 /*! @abstract CIGAR: S = clip on the read with clipped sequence
|
|
146 present in qseq */
|
|
147 #define BAM_CSOFT_CLIP 4
|
|
148 /*! @abstract CIGAR: H = clip on the read with clipped sequence trimmed off */
|
|
149 #define BAM_CHARD_CLIP 5
|
|
150 /*! @abstract CIGAR: P = padding */
|
|
151 #define BAM_CPAD 6
|
|
152 /*! @abstract CIGAR: equals = match */
|
|
153 #define BAM_CEQUAL 7
|
|
154 /*! @abstract CIGAR: X = mismatch */
|
|
155 #define BAM_CDIFF 8
|
|
156 #define BAM_CBACK 9
|
|
157
|
|
158 #define BAM_CIGAR_STR "MIDNSHP=XB"
|
|
159 #define BAM_CIGAR_TYPE 0x3C1A7
|
|
160
|
|
161 #define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK)
|
|
162 #define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT)
|
|
163 #define bam_cigar_opchr(c) (BAM_CIGAR_STR[bam_cigar_op(c)])
|
|
164 #define bam_cigar_gen(l, o) ((l)<<BAM_CIGAR_SHIFT|(o))
|
|
165 #define bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference
|
|
166
|
|
167 /*! @typedef
|
|
168 @abstract Structure for core alignment information.
|
|
169 @field tid chromosome ID, defined by bam_header_t
|
|
170 @field pos 0-based leftmost coordinate
|
|
171 @field bin bin calculated by bam_reg2bin()
|
|
172 @field qual mapping quality
|
|
173 @field l_qname length of the query name
|
|
174 @field flag bitwise flag
|
|
175 @field n_cigar number of CIGAR operations
|
|
176 @field l_qseq length of the query sequence (read)
|
|
177 */
|
|
178 typedef struct {
|
|
179 int32_t tid;
|
|
180 int32_t pos;
|
|
181 uint32_t bin:16, qual:8, l_qname:8;
|
|
182 uint32_t flag:16, n_cigar:16;
|
|
183 int32_t l_qseq;
|
|
184 int32_t mtid;
|
|
185 int32_t mpos;
|
|
186 int32_t isize;
|
|
187 } bam1_core_t;
|
|
188
|
|
189 /*! @typedef
|
|
190 @abstract Structure for one alignment.
|
|
191 @field core core information about the alignment
|
|
192 @field l_aux length of auxiliary data
|
|
193 @field data_len current length of bam1_t::data
|
|
194 @field m_data maximum length of bam1_t::data
|
|
195 @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
|
|
196
|
|
197 @discussion Notes:
|
|
198
|
|
199 1. qname is zero tailing and core.l_qname includes the tailing '\0'.
|
|
200 2. l_qseq is calculated from the total length of an alignment block
|
|
201 on reading or from CIGAR.
|
|
202 3. cigar data is encoded 4 bytes per CIGAR operation.
|
|
203 4. seq is nybble-encoded according to bam_nt16_table.
|
|
204 */
|
|
205 typedef struct {
|
|
206 bam1_core_t core;
|
|
207 int l_aux, data_len, m_data;
|
|
208 uint8_t *data;
|
|
209 } bam1_t;
|
|
210
|
|
211 typedef struct __bam_iter_t *bam_iter_t;
|
|
212
|
|
213 #define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
|
|
214 #define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
|
|
215
|
|
216 /*! @function
|
|
217 @abstract Get the CIGAR array
|
|
218 @param b pointer to an alignment
|
|
219 @return pointer to the CIGAR array
|
|
220
|
|
221 @discussion In the CIGAR array, each element is a 32-bit integer. The
|
|
222 lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
|
|
223 length of a CIGAR.
|
|
224 */
|
|
225 #define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
|
|
226
|
|
227 /*! @function
|
|
228 @abstract Get the name of the query
|
|
229 @param b pointer to an alignment
|
|
230 @return pointer to the name string, null terminated
|
|
231 */
|
|
232 #define bam1_qname(b) ((char*)((b)->data))
|
|
233
|
|
234 /*! @function
|
|
235 @abstract Get query sequence
|
|
236 @param b pointer to an alignment
|
|
237 @return pointer to sequence
|
|
238
|
|
239 @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
|
|
240 8 for T and 15 for N. Two bases are packed in one byte with the base
|
|
241 at the higher 4 bits having smaller coordinate on the read. It is
|
|
242 recommended to use bam1_seqi() macro to get the base.
|
|
243 */
|
|
244 #define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
|
|
245
|
|
246 /*! @function
|
|
247 @abstract Get query quality
|
|
248 @param b pointer to an alignment
|
|
249 @return pointer to quality string
|
|
250 */
|
|
251 #define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1))
|
|
252
|
|
253 /*! @function
|
|
254 @abstract Get a base on read
|
|
255 @param s Query sequence returned by bam1_seq()
|
|
256 @param i The i-th position, 0-based
|
|
257 @return 4-bit integer representing the base.
|
|
258 */
|
|
259 //#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
|
|
260 #define bam1_seqi(s, i) ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf)
|
|
261
|
|
262 #define bam1_seq_seti(s, i, c) ( (s)[(i)>>1] = ((s)[(i)>>1] & 0xf<<(((i)&1)<<2)) | (c)<<((~(i)&1)<<2) )
|
|
263
|
|
264 /*! @function
|
|
265 @abstract Get query sequence and quality
|
|
266 @param b pointer to an alignment
|
|
267 @return pointer to the concatenated auxiliary data
|
|
268 */
|
|
269 #define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
|
|
270
|
|
271 #ifndef kroundup32
|
|
272 /*! @function
|
|
273 @abstract Round an integer to the next closest power-2 integer.
|
|
274 @param x integer to be rounded (in place)
|
|
275 @discussion x will be modified.
|
|
276 */
|
|
277 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
|
278 #endif
|
|
279
|
|
280 /*!
|
|
281 @abstract Whether the machine is big-endian; modified only in
|
|
282 bam_header_init().
|
|
283 */
|
|
284 extern int bam_is_be;
|
|
285
|
|
286 /*!
|
|
287 @abstract Verbose level between 0 and 3; 0 is supposed to disable all
|
|
288 debugging information, though this may not have been implemented.
|
|
289 */
|
|
290 extern int bam_verbose;
|
|
291
|
|
292 extern int bam_no_B;
|
|
293
|
|
294 /*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */
|
|
295 extern unsigned char bam_nt16_table[256];
|
|
296
|
|
297 /*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */
|
|
298 extern char *bam_nt16_rev_table;
|
|
299
|
|
300 extern char bam_nt16_nt4_table[];
|
|
301
|
|
302 #ifdef __cplusplus
|
|
303 extern "C" {
|
|
304 #endif
|
|
305
|
|
306 /*********************
|
|
307 * Low-level SAM I/O *
|
|
308 *********************/
|
|
309
|
|
310 /*! @abstract TAM file handler */
|
|
311 typedef struct __tamFile_t *tamFile;
|
|
312
|
|
313 /*!
|
|
314 @abstract Open a SAM file for reading, either uncompressed or compressed by gzip/zlib.
|
|
315 @param fn SAM file name
|
|
316 @return SAM file handler
|
|
317 */
|
|
318 tamFile sam_open(const char *fn);
|
|
319
|
|
320 /*!
|
|
321 @abstract Close a SAM file handler
|
|
322 @param fp SAM file handler
|
|
323 */
|
|
324 void sam_close(tamFile fp);
|
|
325
|
|
326 /*!
|
|
327 @abstract Read one alignment from a SAM file handler
|
|
328 @param fp SAM file handler
|
|
329 @param header header information (ordered names of chromosomes)
|
|
330 @param b read alignment; all members in b will be updated
|
|
331 @return 0 if successful; otherwise negative
|
|
332 */
|
|
333 int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b);
|
|
334
|
|
335 /*!
|
|
336 @abstract Read header information from a TAB-delimited list file.
|
|
337 @param fn_list file name for the list
|
|
338 @return a pointer to the header structure
|
|
339
|
|
340 @discussion Each line in this file consists of chromosome name and
|
|
341 the length of chromosome.
|
|
342 */
|
|
343 bam_header_t *sam_header_read2(const char *fn_list);
|
|
344
|
|
345 /*!
|
|
346 @abstract Read header from a SAM file (if present)
|
|
347 @param fp SAM file handler
|
|
348 @return pointer to header struct; 0 if no @SQ lines available
|
|
349 */
|
|
350 bam_header_t *sam_header_read(tamFile fp);
|
|
351
|
|
352 /*!
|
|
353 @abstract Parse @SQ lines a update a header struct
|
|
354 @param h pointer to the header struct to be updated
|
|
355 @return number of target sequences
|
|
356
|
|
357 @discussion bam_header_t::{n_targets,target_len,target_name} will
|
|
358 be destroyed in the first place.
|
|
359 */
|
|
360 int sam_header_parse(bam_header_t *h);
|
|
361 int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
|
|
362
|
|
363 /*!
|
|
364 @abstract Parse @RG lines a update a header struct
|
|
365 @param h pointer to the header struct to be updated
|
|
366 @return number of @RG lines
|
|
367
|
|
368 @discussion bam_header_t::rg2lib will be destroyed in the first
|
|
369 place.
|
|
370 */
|
|
371 int sam_header_parse_rg(bam_header_t *h);
|
|
372
|
|
373 #define sam_write1(header, b) bam_view1(header, b)
|
|
374
|
|
375
|
|
376 /********************************
|
|
377 * APIs for string dictionaries *
|
|
378 ********************************/
|
|
379
|
|
380 int bam_strmap_put(void *strmap, const char *rg, const char *lib);
|
|
381 const char *bam_strmap_get(const void *strmap, const char *rg);
|
|
382 void *bam_strmap_dup(const void*);
|
|
383 void *bam_strmap_init();
|
|
384 void bam_strmap_destroy(void *strmap);
|
|
385
|
|
386
|
|
387 /*********************
|
|
388 * Low-level BAM I/O *
|
|
389 *********************/
|
|
390
|
|
391 /*!
|
|
392 @abstract Initialize a header structure.
|
|
393 @return the pointer to the header structure
|
|
394
|
|
395 @discussion This function also modifies the global variable
|
|
396 bam_is_be.
|
|
397 */
|
|
398 bam_header_t *bam_header_init();
|
|
399
|
|
400 /*!
|
|
401 @abstract Destroy a header structure.
|
|
402 @param header pointer to the header
|
|
403 */
|
|
404 void bam_header_destroy(bam_header_t *header);
|
|
405
|
|
406 /*!
|
|
407 @abstract Read a header structure from BAM.
|
|
408 @param fp BAM file handler, opened by bam_open()
|
|
409 @return pointer to the header structure
|
|
410
|
|
411 @discussion The file position indicator must be placed at the
|
|
412 beginning of the file. Upon success, the position indicator will
|
|
413 be set at the start of the first alignment.
|
|
414 */
|
|
415 bam_header_t *bam_header_read(bamFile fp);
|
|
416
|
|
417 /*!
|
|
418 @abstract Write a header structure to BAM.
|
|
419 @param fp BAM file handler
|
|
420 @param header pointer to the header structure
|
|
421 @return always 0 currently
|
|
422 */
|
|
423 int bam_header_write(bamFile fp, const bam_header_t *header);
|
|
424
|
|
425 /*!
|
|
426 @abstract Read an alignment from BAM.
|
|
427 @param fp BAM file handler
|
|
428 @param b read alignment; all members are updated.
|
|
429 @return number of bytes read from the file
|
|
430
|
|
431 @discussion The file position indicator must be
|
|
432 placed right before an alignment. Upon success, this function
|
|
433 will set the position indicator to the start of the next
|
|
434 alignment. This function is not affected by the machine
|
|
435 endianness.
|
|
436 */
|
|
437 int bam_read1(bamFile fp, bam1_t *b);
|
|
438
|
|
439 int bam_remove_B(bam1_t *b);
|
|
440
|
|
441 /*!
|
|
442 @abstract Write an alignment to BAM.
|
|
443 @param fp BAM file handler
|
|
444 @param c pointer to the bam1_core_t structure
|
|
445 @param data_len total length of variable size data related to
|
|
446 the alignment
|
|
447 @param data pointer to the concatenated data
|
|
448 @return number of bytes written to the file
|
|
449
|
|
450 @discussion This function is not affected by the machine
|
|
451 endianness.
|
|
452 */
|
|
453 int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data);
|
|
454
|
|
455 /*!
|
|
456 @abstract Write an alignment to BAM.
|
|
457 @param fp BAM file handler
|
|
458 @param b alignment to write
|
|
459 @return number of bytes written to the file
|
|
460
|
|
461 @abstract It is equivalent to:
|
|
462 bam_write1_core(fp, &b->core, b->data_len, b->data)
|
|
463 */
|
|
464 int bam_write1(bamFile fp, const bam1_t *b);
|
|
465
|
|
466 /*! @function
|
|
467 @abstract Initiate a pointer to bam1_t struct
|
|
468 */
|
|
469 #define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t)))
|
|
470
|
|
471 /*! @function
|
|
472 @abstract Free the memory allocated for an alignment.
|
|
473 @param b pointer to an alignment
|
|
474 */
|
|
475 #define bam_destroy1(b) do { \
|
|
476 if (b) { free((b)->data); free(b); } \
|
|
477 } while (0)
|
|
478
|
|
479 /*!
|
|
480 @abstract Format a BAM record in the SAM format
|
|
481 @param header pointer to the header structure
|
|
482 @param b alignment to print
|
|
483 @return a pointer to the SAM string
|
|
484 */
|
|
485 char *bam_format1(const bam_header_t *header, const bam1_t *b);
|
|
486
|
|
487 char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of);
|
|
488
|
|
489 /*!
|
|
490 @abstract Check whether a BAM record is plausibly valid
|
|
491 @param header associated header structure, or NULL if unavailable
|
|
492 @param b alignment to validate
|
|
493 @return 0 if the alignment is invalid; non-zero otherwise
|
|
494
|
|
495 @discussion Simple consistency check of some of the fields of the
|
|
496 alignment record. If the header is provided, several additional checks
|
|
497 are made. Not all fields are checked, so a non-zero result is not a
|
|
498 guarantee that the record is valid. However it is usually good enough
|
|
499 to detect when bam_seek() has been called with a virtual file offset
|
|
500 that is not the offset of an alignment record.
|
|
501 */
|
|
502 int bam_validate1(const bam_header_t *header, const bam1_t *b);
|
|
503
|
|
504 const char *bam_get_library(bam_header_t *header, const bam1_t *b);
|
|
505
|
|
506
|
|
507 /***************
|
|
508 * pileup APIs *
|
|
509 ***************/
|
|
510
|
|
511 /*! @typedef
|
|
512 @abstract Structure for one alignment covering the pileup position.
|
|
513 @field b pointer to the alignment
|
|
514 @field qpos position of the read base at the pileup site, 0-based
|
|
515 @field indel indel length; 0 for no indel, positive for ins and negative for del
|
|
516 @field is_del 1 iff the base on the padded read is a deletion
|
|
517 @field level the level of the read in the "viewer" mode
|
|
518
|
|
519 @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
|
|
520 difference between the two functions is that the former does not
|
|
521 set bam_pileup1_t::level, while the later does. Level helps the
|
|
522 implementation of alignment viewers, but calculating this has some
|
|
523 overhead.
|
|
524 */
|
|
525 typedef struct {
|
|
526 bam1_t *b;
|
|
527 int32_t qpos;
|
|
528 int indel, level;
|
|
529 uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28;
|
|
530 } bam_pileup1_t;
|
|
531
|
|
532 typedef int (*bam_plp_auto_f)(void *data, bam1_t *b);
|
|
533
|
|
534 struct __bam_plp_t;
|
|
535 typedef struct __bam_plp_t *bam_plp_t;
|
|
536
|
|
537 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data);
|
|
538 int bam_plp_push(bam_plp_t iter, const bam1_t *b);
|
|
539 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
|
|
540 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
|
|
541 void bam_plp_set_mask(bam_plp_t iter, int mask);
|
|
542 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt);
|
|
543 void bam_plp_reset(bam_plp_t iter);
|
|
544 void bam_plp_destroy(bam_plp_t iter);
|
|
545
|
|
546 struct __bam_mplp_t;
|
|
547 typedef struct __bam_mplp_t *bam_mplp_t;
|
|
548
|
|
549 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data);
|
|
550 void bam_mplp_destroy(bam_mplp_t iter);
|
|
551 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt);
|
|
552 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp);
|
|
553
|
|
554 /*! @typedef
|
|
555 @abstract Type of function to be called by bam_plbuf_push().
|
|
556 @param tid chromosome ID as is defined in the header
|
|
557 @param pos start coordinate of the alignment, 0-based
|
|
558 @param n number of elements in pl array
|
|
559 @param pl array of alignments
|
|
560 @param data user provided data
|
|
561 @discussion See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t.
|
|
562 */
|
|
563 typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
|
|
564
|
|
565 typedef struct {
|
|
566 bam_plp_t iter;
|
|
567 bam_pileup_f func;
|
|
568 void *data;
|
|
569 } bam_plbuf_t;
|
|
570
|
|
571 void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask);
|
|
572 void bam_plbuf_reset(bam_plbuf_t *buf);
|
|
573 bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data);
|
|
574 void bam_plbuf_destroy(bam_plbuf_t *buf);
|
|
575 int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf);
|
|
576
|
|
577 int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
|
|
578
|
|
579 struct __bam_lplbuf_t;
|
|
580 typedef struct __bam_lplbuf_t bam_lplbuf_t;
|
|
581
|
|
582 void bam_lplbuf_reset(bam_lplbuf_t *buf);
|
|
583
|
|
584 /*! @abstract bam_plbuf_init() equivalent with level calculated. */
|
|
585 bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data);
|
|
586
|
|
587 /*! @abstract bam_plbuf_destroy() equivalent with level calculated. */
|
|
588 void bam_lplbuf_destroy(bam_lplbuf_t *tv);
|
|
589
|
|
590 /*! @abstract bam_plbuf_push() equivalent with level calculated. */
|
|
591 int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf);
|
|
592
|
|
593
|
|
594 /*********************
|
|
595 * BAM indexing APIs *
|
|
596 *********************/
|
|
597
|
|
598 struct __bam_index_t;
|
|
599 typedef struct __bam_index_t bam_index_t;
|
|
600
|
|
601 /*!
|
|
602 @abstract Build index for a BAM file.
|
|
603 @discussion Index file "fn.bai" will be created.
|
|
604 @param fn name of the BAM file
|
|
605 @return always 0 currently
|
|
606 */
|
|
607 int bam_index_build(const char *fn);
|
|
608
|
|
609 /*!
|
|
610 @abstract Load index from file "fn.bai".
|
|
611 @param fn name of the BAM file (NOT the index file)
|
|
612 @return pointer to the index structure
|
|
613 */
|
|
614 bam_index_t *bam_index_load(const char *fn);
|
|
615
|
|
616 /*!
|
|
617 @abstract Destroy an index structure.
|
|
618 @param idx pointer to the index structure
|
|
619 */
|
|
620 void bam_index_destroy(bam_index_t *idx);
|
|
621
|
|
622 /*! @typedef
|
|
623 @abstract Type of function to be called by bam_fetch().
|
|
624 @param b the alignment
|
|
625 @param data user provided data
|
|
626 */
|
|
627 typedef int (*bam_fetch_f)(const bam1_t *b, void *data);
|
|
628
|
|
629 /*!
|
|
630 @abstract Retrieve the alignments that are overlapped with the
|
|
631 specified region.
|
|
632
|
|
633 @discussion A user defined function will be called for each
|
|
634 retrieved alignment ordered by its start position.
|
|
635
|
|
636 @param fp BAM file handler
|
|
637 @param idx pointer to the alignment index
|
|
638 @param tid chromosome ID as is defined in the header
|
|
639 @param beg start coordinate, 0-based
|
|
640 @param end end coordinate, 0-based
|
|
641 @param data user provided data (will be transferred to func)
|
|
642 @param func user defined function
|
|
643 */
|
|
644 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func);
|
|
645
|
|
646 bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end);
|
|
647 int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b);
|
|
648 void bam_iter_destroy(bam_iter_t iter);
|
|
649
|
|
650 /*!
|
|
651 @abstract Parse a region in the format: "chr2:100,000-200,000".
|
|
652 @discussion bam_header_t::hash will be initialized if empty.
|
|
653 @param header pointer to the header structure
|
|
654 @param str string to be parsed
|
|
655 @param ref_id the returned chromosome ID
|
|
656 @param begin the returned start coordinate
|
|
657 @param end the returned end coordinate
|
|
658 @return 0 on success; -1 on failure
|
|
659 */
|
|
660 int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);
|
|
661
|
|
662
|
|
663 /**************************
|
|
664 * APIs for optional tags *
|
|
665 **************************/
|
|
666
|
|
667 /*!
|
|
668 @abstract Retrieve data of a tag
|
|
669 @param b pointer to an alignment struct
|
|
670 @param tag two-character tag to be retrieved
|
|
671
|
|
672 @return pointer to the type and data. The first character is the
|
|
673 type that can be 'iIsScCdfAZH'.
|
|
674
|
|
675 @discussion Use bam_aux2?() series to convert the returned data to
|
|
676 the corresponding type.
|
|
677 */
|
|
678 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
|
|
679
|
|
680 int32_t bam_aux2i(const uint8_t *s);
|
|
681 float bam_aux2f(const uint8_t *s);
|
|
682 double bam_aux2d(const uint8_t *s);
|
|
683 char bam_aux2A(const uint8_t *s);
|
|
684 char *bam_aux2Z(const uint8_t *s);
|
|
685
|
|
686 int bam_aux_del(bam1_t *b, uint8_t *s);
|
|
687 void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
|
|
688 uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get()
|
|
689
|
|
690
|
|
691 /*****************
|
|
692 * Miscellaneous *
|
|
693 *****************/
|
|
694
|
|
695 /*!
|
|
696 @abstract Calculate the rightmost coordinate of an alignment on the
|
|
697 reference genome.
|
|
698
|
|
699 @param c pointer to the bam1_core_t structure
|
|
700 @param cigar the corresponding CIGAR array (from bam1_t::cigar)
|
|
701 @return the rightmost coordinate, 0-based
|
|
702 */
|
|
703 uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar);
|
|
704
|
|
705 /*!
|
|
706 @abstract Calculate the length of the query sequence from CIGAR.
|
|
707 @param c pointer to the bam1_core_t structure
|
|
708 @param cigar the corresponding CIGAR array (from bam1_t::cigar)
|
|
709 @return length of the query sequence
|
|
710 */
|
|
711 int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar);
|
|
712
|
|
713 #ifdef __cplusplus
|
|
714 }
|
|
715 #endif
|
|
716
|
|
717 /*!
|
|
718 @abstract Calculate the minimum bin that contains a region [beg,end).
|
|
719 @param beg start of the region, 0-based
|
|
720 @param end end of the region, 0-based
|
|
721 @return bin
|
|
722 */
|
|
723 static inline int bam_reg2bin(uint32_t beg, uint32_t end)
|
|
724 {
|
|
725 --end;
|
|
726 if (beg>>14 == end>>14) return 4681 + (beg>>14);
|
|
727 if (beg>>17 == end>>17) return 585 + (beg>>17);
|
|
728 if (beg>>20 == end>>20) return 73 + (beg>>20);
|
|
729 if (beg>>23 == end>>23) return 9 + (beg>>23);
|
|
730 if (beg>>26 == end>>26) return 1 + (beg>>26);
|
|
731 return 0;
|
|
732 }
|
|
733
|
|
734 /*!
|
|
735 @abstract Copy an alignment
|
|
736 @param bdst destination alignment struct
|
|
737 @param bsrc source alignment struct
|
|
738 @return pointer to the destination alignment struct
|
|
739 */
|
|
740 static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
|
|
741 {
|
|
742 uint8_t *data = bdst->data;
|
|
743 int m_data = bdst->m_data; // backup data and m_data
|
|
744 if (m_data < bsrc->data_len) { // double the capacity
|
|
745 m_data = bsrc->data_len; kroundup32(m_data);
|
|
746 data = (uint8_t*)realloc(data, m_data);
|
|
747 }
|
|
748 memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data
|
|
749 *bdst = *bsrc; // copy the rest
|
|
750 // restore the backup
|
|
751 bdst->m_data = m_data;
|
|
752 bdst->data = data;
|
|
753 return bdst;
|
|
754 }
|
|
755
|
|
756 /*!
|
|
757 @abstract Duplicate an alignment
|
|
758 @param src source alignment struct
|
|
759 @return pointer to the destination alignment struct
|
|
760 */
|
|
761 static inline bam1_t *bam_dup1(const bam1_t *src)
|
|
762 {
|
|
763 bam1_t *b;
|
|
764 b = bam_init1();
|
|
765 *b = *src;
|
|
766 b->m_data = b->data_len;
|
|
767 b->data = (uint8_t*)calloc(b->data_len, 1);
|
|
768 memcpy(b->data, src->data, b->data_len);
|
|
769 return b;
|
|
770 }
|
|
771
|
|
772 static inline int bam_aux_type2size(int x)
|
|
773 {
|
|
774 if (x == 'C' || x == 'c' || x == 'A') return 1;
|
|
775 else if (x == 'S' || x == 's') return 2;
|
|
776 else if (x == 'I' || x == 'i' || x == 'f' || x == 'F') return 4;
|
|
777 else return 0;
|
|
778 }
|
|
779
|
|
780 /*********************************
|
|
781 *** Compatibility with htslib ***
|
|
782 *********************************/
|
|
783
|
|
784 typedef bam_header_t bam_hdr_t;
|
|
785
|
|
786 #define bam_get_qname(b) bam1_qname(b)
|
|
787 #define bam_get_cigar(b) bam1_cigar(b)
|
|
788
|
|
789 #define bam_hdr_read(fp) bam_header_read(fp)
|
|
790 #define bam_hdr_write(fp, h) bam_header_write(fp, h)
|
|
791 #define bam_hdr_destroy(fp) bam_header_destroy(fp)
|
|
792
|
|
793 #endif
|