Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/bam.h @ 0:903fc43d6227 draft default tip
Uploaded
| author | lsong10 | 
|---|---|
| date | Fri, 26 Mar 2021 16:52:45 +0000 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:903fc43d6227 | 
|---|---|
| 1 /* The MIT License | |
| 2 | |
| 3 Copyright (c) 2008-2010 Genome Research Ltd (GRL). | |
| 4 | |
| 5 Permission is hereby granted, free of charge, to any person obtaining | |
| 6 a copy of this software and associated documentation files (the | |
| 7 "Software"), to deal in the Software without restriction, including | |
| 8 without limitation the rights to use, copy, modify, merge, publish, | |
| 9 distribute, sublicense, and/or sell copies of the Software, and to | |
| 10 permit persons to whom the Software is furnished to do so, subject to | |
| 11 the following conditions: | |
| 12 | |
| 13 The above copyright notice and this permission notice shall be | |
| 14 included in all copies or substantial portions of the Software. | |
| 15 | |
| 16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
| 17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
| 18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
| 19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
| 20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
| 21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
| 22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| 23 SOFTWARE. | |
| 24 */ | |
| 25 | |
| 26 /* Contact: Heng Li <lh3@sanger.ac.uk> */ | |
| 27 | |
| 28 #ifndef BAM_BAM_H | |
| 29 #define BAM_BAM_H | |
| 30 | |
| 31 /*! | |
| 32 @header | |
| 33 | |
| 34 BAM library provides I/O and various operations on manipulating files | |
| 35 in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map) | |
| 36 format. It now supports importing from or exporting to SAM, sorting, | |
| 37 merging, generating pileup, and quickly retrieval of reads overlapped | |
| 38 with a specified region. | |
| 39 | |
| 40 @copyright Genome Research Ltd. | |
| 41 */ | |
| 42 | |
| 43 #define BAM_VERSION "0.1.19-44428cd" | |
| 44 | |
| 45 #include <stdint.h> | |
| 46 #include <stdlib.h> | |
| 47 #include <string.h> | |
| 48 #include <stdio.h> | |
| 49 | |
| 50 #ifndef BAM_LITE | |
| 51 #define BAM_VIRTUAL_OFFSET16 | |
| 52 #include "bgzf.h" | |
| 53 /*! @abstract BAM file handler */ | |
| 54 typedef BGZF *bamFile; | |
| 55 #define bam_open(fn, mode) bgzf_open(fn, mode) | |
| 56 #define bam_dopen(fd, mode) bgzf_fdopen(fd, mode) | |
| 57 #define bam_close(fp) bgzf_close(fp) | |
| 58 #define bam_read(fp, buf, size) bgzf_read(fp, buf, size) | |
| 59 #define bam_write(fp, buf, size) bgzf_write(fp, buf, size) | |
| 60 #define bam_tell(fp) bgzf_tell(fp) | |
| 61 #define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir) | |
| 62 #else | |
| 63 #define BAM_TRUE_OFFSET | |
| 64 #include <zlib.h> | |
| 65 typedef gzFile bamFile; | |
| 66 #define bam_open(fn, mode) gzopen(fn, mode) | |
| 67 #define bam_dopen(fd, mode) gzdopen(fd, mode) | |
| 68 #define bam_close(fp) gzclose(fp) | |
| 69 #define bam_read(fp, buf, size) gzread(fp, buf, size) | |
| 70 /* no bam_write/bam_tell/bam_seek() here */ | |
| 71 #endif | |
| 72 | |
| 73 /*! @typedef | |
| 74 @abstract Structure for the alignment header. | |
| 75 @field n_targets number of reference sequences | |
| 76 @field target_name names of the reference sequences | |
| 77 @field target_len lengths of the referene sequences | |
| 78 @field dict header dictionary | |
| 79 @field hash hash table for fast name lookup | |
| 80 @field rg2lib hash table for @RG-ID -> LB lookup | |
| 81 @field l_text length of the plain text in the header | |
| 82 @field text plain text | |
| 83 | |
| 84 @discussion Field hash points to null by default. It is a private | |
| 85 member. | |
| 86 */ | |
| 87 typedef struct { | |
| 88 int32_t n_targets; | |
| 89 char **target_name; | |
| 90 uint32_t *target_len; | |
| 91 void *dict, *hash, *rg2lib; | |
| 92 uint32_t l_text, n_text; | |
| 93 char *text; | |
| 94 } bam_header_t; | |
| 95 | |
| 96 /*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */ | |
| 97 #define BAM_FPAIRED 1 | |
| 98 /*! @abstract the read is mapped in a proper pair */ | |
| 99 #define BAM_FPROPER_PAIR 2 | |
| 100 /*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */ | |
| 101 #define BAM_FUNMAP 4 | |
| 102 /*! @abstract the mate is unmapped */ | |
| 103 #define BAM_FMUNMAP 8 | |
| 104 /*! @abstract the read is mapped to the reverse strand */ | |
| 105 #define BAM_FREVERSE 16 | |
| 106 /*! @abstract the mate is mapped to the reverse strand */ | |
| 107 #define BAM_FMREVERSE 32 | |
| 108 /*! @abstract this is read1 */ | |
| 109 #define BAM_FREAD1 64 | |
| 110 /*! @abstract this is read2 */ | |
| 111 #define BAM_FREAD2 128 | |
| 112 /*! @abstract not primary alignment */ | |
| 113 #define BAM_FSECONDARY 256 | |
| 114 /*! @abstract QC failure */ | |
| 115 #define BAM_FQCFAIL 512 | |
| 116 /*! @abstract optical or PCR duplicate */ | |
| 117 #define BAM_FDUP 1024 | |
| 118 | |
| 119 #define BAM_OFDEC 0 | |
| 120 #define BAM_OFHEX 1 | |
| 121 #define BAM_OFSTR 2 | |
| 122 | |
| 123 /*! @abstract defautl mask for pileup */ | |
| 124 #define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) | |
| 125 | |
| 126 #define BAM_CORE_SIZE sizeof(bam1_core_t) | |
| 127 | |
| 128 /** | |
| 129 * Describing how CIGAR operation/length is packed in a 32-bit integer. | |
| 130 */ | |
| 131 #define BAM_CIGAR_SHIFT 4 | |
| 132 #define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) | |
| 133 | |
| 134 /* | |
| 135 CIGAR operations. | |
| 136 */ | |
| 137 /*! @abstract CIGAR: M = match or mismatch*/ | |
| 138 #define BAM_CMATCH 0 | |
| 139 /*! @abstract CIGAR: I = insertion to the reference */ | |
| 140 #define BAM_CINS 1 | |
| 141 /*! @abstract CIGAR: D = deletion from the reference */ | |
| 142 #define BAM_CDEL 2 | |
| 143 /*! @abstract CIGAR: N = skip on the reference (e.g. spliced alignment) */ | |
| 144 #define BAM_CREF_SKIP 3 | |
| 145 /*! @abstract CIGAR: S = clip on the read with clipped sequence | |
| 146 present in qseq */ | |
| 147 #define BAM_CSOFT_CLIP 4 | |
| 148 /*! @abstract CIGAR: H = clip on the read with clipped sequence trimmed off */ | |
| 149 #define BAM_CHARD_CLIP 5 | |
| 150 /*! @abstract CIGAR: P = padding */ | |
| 151 #define BAM_CPAD 6 | |
| 152 /*! @abstract CIGAR: equals = match */ | |
| 153 #define BAM_CEQUAL 7 | |
| 154 /*! @abstract CIGAR: X = mismatch */ | |
| 155 #define BAM_CDIFF 8 | |
| 156 #define BAM_CBACK 9 | |
| 157 | |
| 158 #define BAM_CIGAR_STR "MIDNSHP=XB" | |
| 159 #define BAM_CIGAR_TYPE 0x3C1A7 | |
| 160 | |
| 161 #define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK) | |
| 162 #define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT) | |
| 163 #define bam_cigar_opchr(c) (BAM_CIGAR_STR[bam_cigar_op(c)]) | |
| 164 #define bam_cigar_gen(l, o) ((l)<<BAM_CIGAR_SHIFT|(o)) | |
| 165 #define bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference | |
| 166 | |
| 167 /*! @typedef | |
| 168 @abstract Structure for core alignment information. | |
| 169 @field tid chromosome ID, defined by bam_header_t | |
| 170 @field pos 0-based leftmost coordinate | |
| 171 @field bin bin calculated by bam_reg2bin() | |
| 172 @field qual mapping quality | |
| 173 @field l_qname length of the query name | |
| 174 @field flag bitwise flag | |
| 175 @field n_cigar number of CIGAR operations | |
| 176 @field l_qseq length of the query sequence (read) | |
| 177 */ | |
| 178 typedef struct { | |
| 179 int32_t tid; | |
| 180 int32_t pos; | |
| 181 uint32_t bin:16, qual:8, l_qname:8; | |
| 182 uint32_t flag:16, n_cigar:16; | |
| 183 int32_t l_qseq; | |
| 184 int32_t mtid; | |
| 185 int32_t mpos; | |
| 186 int32_t isize; | |
| 187 } bam1_core_t; | |
| 188 | |
| 189 /*! @typedef | |
| 190 @abstract Structure for one alignment. | |
| 191 @field core core information about the alignment | |
| 192 @field l_aux length of auxiliary data | |
| 193 @field data_len current length of bam1_t::data | |
| 194 @field m_data maximum length of bam1_t::data | |
| 195 @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux | |
| 196 | |
| 197 @discussion Notes: | |
| 198 | |
| 199 1. qname is zero tailing and core.l_qname includes the tailing '\0'. | |
| 200 2. l_qseq is calculated from the total length of an alignment block | |
| 201 on reading or from CIGAR. | |
| 202 3. cigar data is encoded 4 bytes per CIGAR operation. | |
| 203 4. seq is nybble-encoded according to bam_nt16_table. | |
| 204 */ | |
| 205 typedef struct { | |
| 206 bam1_core_t core; | |
| 207 int l_aux, data_len, m_data; | |
| 208 uint8_t *data; | |
| 209 } bam1_t; | |
| 210 | |
| 211 typedef struct __bam_iter_t *bam_iter_t; | |
| 212 | |
| 213 #define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0) | |
| 214 #define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0) | |
| 215 | |
| 216 /*! @function | |
| 217 @abstract Get the CIGAR array | |
| 218 @param b pointer to an alignment | |
| 219 @return pointer to the CIGAR array | |
| 220 | |
| 221 @discussion In the CIGAR array, each element is a 32-bit integer. The | |
| 222 lower 4 bits gives a CIGAR operation and the higher 28 bits keep the | |
| 223 length of a CIGAR. | |
| 224 */ | |
| 225 #define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname)) | |
| 226 | |
| 227 /*! @function | |
| 228 @abstract Get the name of the query | |
| 229 @param b pointer to an alignment | |
| 230 @return pointer to the name string, null terminated | |
| 231 */ | |
| 232 #define bam1_qname(b) ((char*)((b)->data)) | |
| 233 | |
| 234 /*! @function | |
| 235 @abstract Get query sequence | |
| 236 @param b pointer to an alignment | |
| 237 @return pointer to sequence | |
| 238 | |
| 239 @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, | |
| 240 8 for T and 15 for N. Two bases are packed in one byte with the base | |
| 241 at the higher 4 bits having smaller coordinate on the read. It is | |
| 242 recommended to use bam1_seqi() macro to get the base. | |
| 243 */ | |
| 244 #define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname) | |
| 245 | |
| 246 /*! @function | |
| 247 @abstract Get query quality | |
| 248 @param b pointer to an alignment | |
| 249 @return pointer to quality string | |
| 250 */ | |
| 251 #define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1)) | |
| 252 | |
| 253 /*! @function | |
| 254 @abstract Get a base on read | |
| 255 @param s Query sequence returned by bam1_seq() | |
| 256 @param i The i-th position, 0-based | |
| 257 @return 4-bit integer representing the base. | |
| 258 */ | |
| 259 //#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) | |
| 260 #define bam1_seqi(s, i) ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf) | |
| 261 | |
| 262 #define bam1_seq_seti(s, i, c) ( (s)[(i)>>1] = ((s)[(i)>>1] & 0xf<<(((i)&1)<<2)) | (c)<<((~(i)&1)<<2) ) | |
| 263 | |
| 264 /*! @function | |
| 265 @abstract Get query sequence and quality | |
| 266 @param b pointer to an alignment | |
| 267 @return pointer to the concatenated auxiliary data | |
| 268 */ | |
| 269 #define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2) | |
| 270 | |
| 271 #ifndef kroundup32 | |
| 272 /*! @function | |
| 273 @abstract Round an integer to the next closest power-2 integer. | |
| 274 @param x integer to be rounded (in place) | |
| 275 @discussion x will be modified. | |
| 276 */ | |
| 277 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) | |
| 278 #endif | |
| 279 | |
| 280 /*! | |
| 281 @abstract Whether the machine is big-endian; modified only in | |
| 282 bam_header_init(). | |
| 283 */ | |
| 284 extern int bam_is_be; | |
| 285 | |
| 286 /*! | |
| 287 @abstract Verbose level between 0 and 3; 0 is supposed to disable all | |
| 288 debugging information, though this may not have been implemented. | |
| 289 */ | |
| 290 extern int bam_verbose; | |
| 291 | |
| 292 extern int bam_no_B; | |
| 293 | |
| 294 /*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */ | |
| 295 extern unsigned char bam_nt16_table[256]; | |
| 296 | |
| 297 /*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */ | |
| 298 extern char *bam_nt16_rev_table; | |
| 299 | |
| 300 extern char bam_nt16_nt4_table[]; | |
| 301 | |
| 302 #ifdef __cplusplus | |
| 303 extern "C" { | |
| 304 #endif | |
| 305 | |
| 306 /********************* | |
| 307 * Low-level SAM I/O * | |
| 308 *********************/ | |
| 309 | |
| 310 /*! @abstract TAM file handler */ | |
| 311 typedef struct __tamFile_t *tamFile; | |
| 312 | |
| 313 /*! | |
| 314 @abstract Open a SAM file for reading, either uncompressed or compressed by gzip/zlib. | |
| 315 @param fn SAM file name | |
| 316 @return SAM file handler | |
| 317 */ | |
| 318 tamFile sam_open(const char *fn); | |
| 319 | |
| 320 /*! | |
| 321 @abstract Close a SAM file handler | |
| 322 @param fp SAM file handler | |
| 323 */ | |
| 324 void sam_close(tamFile fp); | |
| 325 | |
| 326 /*! | |
| 327 @abstract Read one alignment from a SAM file handler | |
| 328 @param fp SAM file handler | |
| 329 @param header header information (ordered names of chromosomes) | |
| 330 @param b read alignment; all members in b will be updated | |
| 331 @return 0 if successful; otherwise negative | |
| 332 */ | |
| 333 int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b); | |
| 334 | |
| 335 /*! | |
| 336 @abstract Read header information from a TAB-delimited list file. | |
| 337 @param fn_list file name for the list | |
| 338 @return a pointer to the header structure | |
| 339 | |
| 340 @discussion Each line in this file consists of chromosome name and | |
| 341 the length of chromosome. | |
| 342 */ | |
| 343 bam_header_t *sam_header_read2(const char *fn_list); | |
| 344 | |
| 345 /*! | |
| 346 @abstract Read header from a SAM file (if present) | |
| 347 @param fp SAM file handler | |
| 348 @return pointer to header struct; 0 if no @SQ lines available | |
| 349 */ | |
| 350 bam_header_t *sam_header_read(tamFile fp); | |
| 351 | |
| 352 /*! | |
| 353 @abstract Parse @SQ lines a update a header struct | |
| 354 @param h pointer to the header struct to be updated | |
| 355 @return number of target sequences | |
| 356 | |
| 357 @discussion bam_header_t::{n_targets,target_len,target_name} will | |
| 358 be destroyed in the first place. | |
| 359 */ | |
| 360 int sam_header_parse(bam_header_t *h); | |
| 361 int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); | |
| 362 | |
| 363 /*! | |
| 364 @abstract Parse @RG lines a update a header struct | |
| 365 @param h pointer to the header struct to be updated | |
| 366 @return number of @RG lines | |
| 367 | |
| 368 @discussion bam_header_t::rg2lib will be destroyed in the first | |
| 369 place. | |
| 370 */ | |
| 371 int sam_header_parse_rg(bam_header_t *h); | |
| 372 | |
| 373 #define sam_write1(header, b) bam_view1(header, b) | |
| 374 | |
| 375 | |
| 376 /******************************** | |
| 377 * APIs for string dictionaries * | |
| 378 ********************************/ | |
| 379 | |
| 380 int bam_strmap_put(void *strmap, const char *rg, const char *lib); | |
| 381 const char *bam_strmap_get(const void *strmap, const char *rg); | |
| 382 void *bam_strmap_dup(const void*); | |
| 383 void *bam_strmap_init(); | |
| 384 void bam_strmap_destroy(void *strmap); | |
| 385 | |
| 386 | |
| 387 /********************* | |
| 388 * Low-level BAM I/O * | |
| 389 *********************/ | |
| 390 | |
| 391 /*! | |
| 392 @abstract Initialize a header structure. | |
| 393 @return the pointer to the header structure | |
| 394 | |
| 395 @discussion This function also modifies the global variable | |
| 396 bam_is_be. | |
| 397 */ | |
| 398 bam_header_t *bam_header_init(); | |
| 399 | |
| 400 /*! | |
| 401 @abstract Destroy a header structure. | |
| 402 @param header pointer to the header | |
| 403 */ | |
| 404 void bam_header_destroy(bam_header_t *header); | |
| 405 | |
| 406 /*! | |
| 407 @abstract Read a header structure from BAM. | |
| 408 @param fp BAM file handler, opened by bam_open() | |
| 409 @return pointer to the header structure | |
| 410 | |
| 411 @discussion The file position indicator must be placed at the | |
| 412 beginning of the file. Upon success, the position indicator will | |
| 413 be set at the start of the first alignment. | |
| 414 */ | |
| 415 bam_header_t *bam_header_read(bamFile fp); | |
| 416 | |
| 417 /*! | |
| 418 @abstract Write a header structure to BAM. | |
| 419 @param fp BAM file handler | |
| 420 @param header pointer to the header structure | |
| 421 @return always 0 currently | |
| 422 */ | |
| 423 int bam_header_write(bamFile fp, const bam_header_t *header); | |
| 424 | |
| 425 /*! | |
| 426 @abstract Read an alignment from BAM. | |
| 427 @param fp BAM file handler | |
| 428 @param b read alignment; all members are updated. | |
| 429 @return number of bytes read from the file | |
| 430 | |
| 431 @discussion The file position indicator must be | |
| 432 placed right before an alignment. Upon success, this function | |
| 433 will set the position indicator to the start of the next | |
| 434 alignment. This function is not affected by the machine | |
| 435 endianness. | |
| 436 */ | |
| 437 int bam_read1(bamFile fp, bam1_t *b); | |
| 438 | |
| 439 int bam_remove_B(bam1_t *b); | |
| 440 | |
| 441 /*! | |
| 442 @abstract Write an alignment to BAM. | |
| 443 @param fp BAM file handler | |
| 444 @param c pointer to the bam1_core_t structure | |
| 445 @param data_len total length of variable size data related to | |
| 446 the alignment | |
| 447 @param data pointer to the concatenated data | |
| 448 @return number of bytes written to the file | |
| 449 | |
| 450 @discussion This function is not affected by the machine | |
| 451 endianness. | |
| 452 */ | |
| 453 int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data); | |
| 454 | |
| 455 /*! | |
| 456 @abstract Write an alignment to BAM. | |
| 457 @param fp BAM file handler | |
| 458 @param b alignment to write | |
| 459 @return number of bytes written to the file | |
| 460 | |
| 461 @abstract It is equivalent to: | |
| 462 bam_write1_core(fp, &b->core, b->data_len, b->data) | |
| 463 */ | |
| 464 int bam_write1(bamFile fp, const bam1_t *b); | |
| 465 | |
| 466 /*! @function | |
| 467 @abstract Initiate a pointer to bam1_t struct | |
| 468 */ | |
| 469 #define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t))) | |
| 470 | |
| 471 /*! @function | |
| 472 @abstract Free the memory allocated for an alignment. | |
| 473 @param b pointer to an alignment | |
| 474 */ | |
| 475 #define bam_destroy1(b) do { \ | |
| 476 if (b) { free((b)->data); free(b); } \ | |
| 477 } while (0) | |
| 478 | |
| 479 /*! | |
| 480 @abstract Format a BAM record in the SAM format | |
| 481 @param header pointer to the header structure | |
| 482 @param b alignment to print | |
| 483 @return a pointer to the SAM string | |
| 484 */ | |
| 485 char *bam_format1(const bam_header_t *header, const bam1_t *b); | |
| 486 | |
| 487 char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of); | |
| 488 | |
| 489 /*! | |
| 490 @abstract Check whether a BAM record is plausibly valid | |
| 491 @param header associated header structure, or NULL if unavailable | |
| 492 @param b alignment to validate | |
| 493 @return 0 if the alignment is invalid; non-zero otherwise | |
| 494 | |
| 495 @discussion Simple consistency check of some of the fields of the | |
| 496 alignment record. If the header is provided, several additional checks | |
| 497 are made. Not all fields are checked, so a non-zero result is not a | |
| 498 guarantee that the record is valid. However it is usually good enough | |
| 499 to detect when bam_seek() has been called with a virtual file offset | |
| 500 that is not the offset of an alignment record. | |
| 501 */ | |
| 502 int bam_validate1(const bam_header_t *header, const bam1_t *b); | |
| 503 | |
| 504 const char *bam_get_library(bam_header_t *header, const bam1_t *b); | |
| 505 | |
| 506 | |
| 507 /*************** | |
| 508 * pileup APIs * | |
| 509 ***************/ | |
| 510 | |
| 511 /*! @typedef | |
| 512 @abstract Structure for one alignment covering the pileup position. | |
| 513 @field b pointer to the alignment | |
| 514 @field qpos position of the read base at the pileup site, 0-based | |
| 515 @field indel indel length; 0 for no indel, positive for ins and negative for del | |
| 516 @field is_del 1 iff the base on the padded read is a deletion | |
| 517 @field level the level of the read in the "viewer" mode | |
| 518 | |
| 519 @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The | |
| 520 difference between the two functions is that the former does not | |
| 521 set bam_pileup1_t::level, while the later does. Level helps the | |
| 522 implementation of alignment viewers, but calculating this has some | |
| 523 overhead. | |
| 524 */ | |
| 525 typedef struct { | |
| 526 bam1_t *b; | |
| 527 int32_t qpos; | |
| 528 int indel, level; | |
| 529 uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28; | |
| 530 } bam_pileup1_t; | |
| 531 | |
| 532 typedef int (*bam_plp_auto_f)(void *data, bam1_t *b); | |
| 533 | |
| 534 struct __bam_plp_t; | |
| 535 typedef struct __bam_plp_t *bam_plp_t; | |
| 536 | |
| 537 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data); | |
| 538 int bam_plp_push(bam_plp_t iter, const bam1_t *b); | |
| 539 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); | |
| 540 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); | |
| 541 void bam_plp_set_mask(bam_plp_t iter, int mask); | |
| 542 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt); | |
| 543 void bam_plp_reset(bam_plp_t iter); | |
| 544 void bam_plp_destroy(bam_plp_t iter); | |
| 545 | |
| 546 struct __bam_mplp_t; | |
| 547 typedef struct __bam_mplp_t *bam_mplp_t; | |
| 548 | |
| 549 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data); | |
| 550 void bam_mplp_destroy(bam_mplp_t iter); | |
| 551 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt); | |
| 552 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp); | |
| 553 | |
| 554 /*! @typedef | |
| 555 @abstract Type of function to be called by bam_plbuf_push(). | |
| 556 @param tid chromosome ID as is defined in the header | |
| 557 @param pos start coordinate of the alignment, 0-based | |
| 558 @param n number of elements in pl array | |
| 559 @param pl array of alignments | |
| 560 @param data user provided data | |
| 561 @discussion See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t. | |
| 562 */ | |
| 563 typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data); | |
| 564 | |
| 565 typedef struct { | |
| 566 bam_plp_t iter; | |
| 567 bam_pileup_f func; | |
| 568 void *data; | |
| 569 } bam_plbuf_t; | |
| 570 | |
| 571 void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask); | |
| 572 void bam_plbuf_reset(bam_plbuf_t *buf); | |
| 573 bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data); | |
| 574 void bam_plbuf_destroy(bam_plbuf_t *buf); | |
| 575 int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf); | |
| 576 | |
| 577 int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data); | |
| 578 | |
| 579 struct __bam_lplbuf_t; | |
| 580 typedef struct __bam_lplbuf_t bam_lplbuf_t; | |
| 581 | |
| 582 void bam_lplbuf_reset(bam_lplbuf_t *buf); | |
| 583 | |
| 584 /*! @abstract bam_plbuf_init() equivalent with level calculated. */ | |
| 585 bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data); | |
| 586 | |
| 587 /*! @abstract bam_plbuf_destroy() equivalent with level calculated. */ | |
| 588 void bam_lplbuf_destroy(bam_lplbuf_t *tv); | |
| 589 | |
| 590 /*! @abstract bam_plbuf_push() equivalent with level calculated. */ | |
| 591 int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf); | |
| 592 | |
| 593 | |
| 594 /********************* | |
| 595 * BAM indexing APIs * | |
| 596 *********************/ | |
| 597 | |
| 598 struct __bam_index_t; | |
| 599 typedef struct __bam_index_t bam_index_t; | |
| 600 | |
| 601 /*! | |
| 602 @abstract Build index for a BAM file. | |
| 603 @discussion Index file "fn.bai" will be created. | |
| 604 @param fn name of the BAM file | |
| 605 @return always 0 currently | |
| 606 */ | |
| 607 int bam_index_build(const char *fn); | |
| 608 | |
| 609 /*! | |
| 610 @abstract Load index from file "fn.bai". | |
| 611 @param fn name of the BAM file (NOT the index file) | |
| 612 @return pointer to the index structure | |
| 613 */ | |
| 614 bam_index_t *bam_index_load(const char *fn); | |
| 615 | |
| 616 /*! | |
| 617 @abstract Destroy an index structure. | |
| 618 @param idx pointer to the index structure | |
| 619 */ | |
| 620 void bam_index_destroy(bam_index_t *idx); | |
| 621 | |
| 622 /*! @typedef | |
| 623 @abstract Type of function to be called by bam_fetch(). | |
| 624 @param b the alignment | |
| 625 @param data user provided data | |
| 626 */ | |
| 627 typedef int (*bam_fetch_f)(const bam1_t *b, void *data); | |
| 628 | |
| 629 /*! | |
| 630 @abstract Retrieve the alignments that are overlapped with the | |
| 631 specified region. | |
| 632 | |
| 633 @discussion A user defined function will be called for each | |
| 634 retrieved alignment ordered by its start position. | |
| 635 | |
| 636 @param fp BAM file handler | |
| 637 @param idx pointer to the alignment index | |
| 638 @param tid chromosome ID as is defined in the header | |
| 639 @param beg start coordinate, 0-based | |
| 640 @param end end coordinate, 0-based | |
| 641 @param data user provided data (will be transferred to func) | |
| 642 @param func user defined function | |
| 643 */ | |
| 644 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func); | |
| 645 | |
| 646 bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end); | |
| 647 int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b); | |
| 648 void bam_iter_destroy(bam_iter_t iter); | |
| 649 | |
| 650 /*! | |
| 651 @abstract Parse a region in the format: "chr2:100,000-200,000". | |
| 652 @discussion bam_header_t::hash will be initialized if empty. | |
| 653 @param header pointer to the header structure | |
| 654 @param str string to be parsed | |
| 655 @param ref_id the returned chromosome ID | |
| 656 @param begin the returned start coordinate | |
| 657 @param end the returned end coordinate | |
| 658 @return 0 on success; -1 on failure | |
| 659 */ | |
| 660 int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end); | |
| 661 | |
| 662 | |
| 663 /************************** | |
| 664 * APIs for optional tags * | |
| 665 **************************/ | |
| 666 | |
| 667 /*! | |
| 668 @abstract Retrieve data of a tag | |
| 669 @param b pointer to an alignment struct | |
| 670 @param tag two-character tag to be retrieved | |
| 671 | |
| 672 @return pointer to the type and data. The first character is the | |
| 673 type that can be 'iIsScCdfAZH'. | |
| 674 | |
| 675 @discussion Use bam_aux2?() series to convert the returned data to | |
| 676 the corresponding type. | |
| 677 */ | |
| 678 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]); | |
| 679 | |
| 680 int32_t bam_aux2i(const uint8_t *s); | |
| 681 float bam_aux2f(const uint8_t *s); | |
| 682 double bam_aux2d(const uint8_t *s); | |
| 683 char bam_aux2A(const uint8_t *s); | |
| 684 char *bam_aux2Z(const uint8_t *s); | |
| 685 | |
| 686 int bam_aux_del(bam1_t *b, uint8_t *s); | |
| 687 void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data); | |
| 688 uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get() | |
| 689 | |
| 690 | |
| 691 /***************** | |
| 692 * Miscellaneous * | |
| 693 *****************/ | |
| 694 | |
| 695 /*! | |
| 696 @abstract Calculate the rightmost coordinate of an alignment on the | |
| 697 reference genome. | |
| 698 | |
| 699 @param c pointer to the bam1_core_t structure | |
| 700 @param cigar the corresponding CIGAR array (from bam1_t::cigar) | |
| 701 @return the rightmost coordinate, 0-based | |
| 702 */ | |
| 703 uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar); | |
| 704 | |
| 705 /*! | |
| 706 @abstract Calculate the length of the query sequence from CIGAR. | |
| 707 @param c pointer to the bam1_core_t structure | |
| 708 @param cigar the corresponding CIGAR array (from bam1_t::cigar) | |
| 709 @return length of the query sequence | |
| 710 */ | |
| 711 int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar); | |
| 712 | |
| 713 #ifdef __cplusplus | |
| 714 } | |
| 715 #endif | |
| 716 | |
| 717 /*! | |
| 718 @abstract Calculate the minimum bin that contains a region [beg,end). | |
| 719 @param beg start of the region, 0-based | |
| 720 @param end end of the region, 0-based | |
| 721 @return bin | |
| 722 */ | |
| 723 static inline int bam_reg2bin(uint32_t beg, uint32_t end) | |
| 724 { | |
| 725 --end; | |
| 726 if (beg>>14 == end>>14) return 4681 + (beg>>14); | |
| 727 if (beg>>17 == end>>17) return 585 + (beg>>17); | |
| 728 if (beg>>20 == end>>20) return 73 + (beg>>20); | |
| 729 if (beg>>23 == end>>23) return 9 + (beg>>23); | |
| 730 if (beg>>26 == end>>26) return 1 + (beg>>26); | |
| 731 return 0; | |
| 732 } | |
| 733 | |
| 734 /*! | |
| 735 @abstract Copy an alignment | |
| 736 @param bdst destination alignment struct | |
| 737 @param bsrc source alignment struct | |
| 738 @return pointer to the destination alignment struct | |
| 739 */ | |
| 740 static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) | |
| 741 { | |
| 742 uint8_t *data = bdst->data; | |
| 743 int m_data = bdst->m_data; // backup data and m_data | |
| 744 if (m_data < bsrc->data_len) { // double the capacity | |
| 745 m_data = bsrc->data_len; kroundup32(m_data); | |
| 746 data = (uint8_t*)realloc(data, m_data); | |
| 747 } | |
| 748 memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data | |
| 749 *bdst = *bsrc; // copy the rest | |
| 750 // restore the backup | |
| 751 bdst->m_data = m_data; | |
| 752 bdst->data = data; | |
| 753 return bdst; | |
| 754 } | |
| 755 | |
| 756 /*! | |
| 757 @abstract Duplicate an alignment | |
| 758 @param src source alignment struct | |
| 759 @return pointer to the destination alignment struct | |
| 760 */ | |
| 761 static inline bam1_t *bam_dup1(const bam1_t *src) | |
| 762 { | |
| 763 bam1_t *b; | |
| 764 b = bam_init1(); | |
| 765 *b = *src; | |
| 766 b->m_data = b->data_len; | |
| 767 b->data = (uint8_t*)calloc(b->data_len, 1); | |
| 768 memcpy(b->data, src->data, b->data_len); | |
| 769 return b; | |
| 770 } | |
| 771 | |
| 772 static inline int bam_aux_type2size(int x) | |
| 773 { | |
| 774 if (x == 'C' || x == 'c' || x == 'A') return 1; | |
| 775 else if (x == 'S' || x == 's') return 2; | |
| 776 else if (x == 'I' || x == 'i' || x == 'f' || x == 'F') return 4; | |
| 777 else return 0; | |
| 778 } | |
| 779 | |
| 780 /********************************* | |
| 781 *** Compatibility with htslib *** | |
| 782 *********************************/ | |
| 783 | |
| 784 typedef bam_header_t bam_hdr_t; | |
| 785 | |
| 786 #define bam_get_qname(b) bam1_qname(b) | |
| 787 #define bam_get_cigar(b) bam1_cigar(b) | |
| 788 | |
| 789 #define bam_hdr_read(fp) bam_header_read(fp) | |
| 790 #define bam_hdr_write(fp, h) bam_header_write(fp, h) | |
| 791 #define bam_hdr_destroy(fp) bam_header_destroy(fp) | |
| 792 | |
| 793 #endif | 
