annotate ezBAMQC/src/htslib/sam.c @ 20:9de3bbec2479 draft default tip

Uploaded
author youngkim
date Thu, 31 Mar 2016 10:10:37 -0400
parents dfa3745e5fd8
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1 /* sam.c -- SAM and BAM file I/O and manipulation.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
2
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
3 Copyright (C) 2008-2010, 2012-2014 Genome Research Ltd.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
4 Copyright (C) 2010, 2012, 2013 Broad Institute.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
5
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
6 Author: Heng Li <lh3@sanger.ac.uk>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
7
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
8 Permission is hereby granted, free of charge, to any person obtaining a copy
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
9 of this software and associated documentation files (the "Software"), to deal
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
10 in the Software without restriction, including without limitation the rights
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
12 copies of the Software, and to permit persons to whom the Software is
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
13 furnished to do so, subject to the following conditions:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
14
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
15 The above copyright notice and this permission notice shall be included in
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
16 all copies or substantial portions of the Software.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
17
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
24 DEALINGS IN THE SOFTWARE. */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
25
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
26 #include <stdio.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
27 #include <stdlib.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
28 #include <string.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
29 #include <errno.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
30 #include <ctype.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
31 #include <zlib.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
32 #include "htslib/sam.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
33 #include "htslib/bgzf.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
34 #include "cram/cram.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
35 #include "htslib/hfile.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
36
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
37 #include "htslib/khash.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
38 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
39
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
40 typedef khash_t(s2i) sdict_t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
41
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
42 /**********************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
43 *** BAM header I/O ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
44 **********************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
45
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
46 bam_hdr_t *bam_hdr_init()
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
47 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
48 return (bam_hdr_t*)calloc(1, sizeof(bam_hdr_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
49 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
50
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
51 void bam_hdr_destroy(bam_hdr_t *h)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
52 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
53 int32_t i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
54 if (h == NULL) return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
55 if (h->target_name) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
56 for (i = 0; i < h->n_targets; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
57 free(h->target_name[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
58 free(h->target_name);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
59 free(h->target_len);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
60 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
61 free(h->text); free(h->cigar_tab);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
62 if (h->sdict) kh_destroy(s2i, (sdict_t*)h->sdict);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
63 free(h);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
64 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
65
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
66 bam_hdr_t *bam_hdr_dup(const bam_hdr_t *h0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
67 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
68 if (h0 == NULL) return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
69 bam_hdr_t *h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
70 if ((h = bam_hdr_init()) == NULL) return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
71 // copy the simple data
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
72 h->n_targets = h0->n_targets;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
73 h->ignore_sam_err = h0->ignore_sam_err;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
74 h->l_text = h0->l_text;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
75 // Then the pointery stuff
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
76 h->cigar_tab = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
77 h->sdict = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
78 h->text = (char*)calloc(h->l_text + 1, 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
79 memcpy(h->text, h0->text, h->l_text);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
80 h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
81 h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
82 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
83 for (i = 0; i < h->n_targets; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
84 h->target_len[i] = h0->target_len[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
85 h->target_name[i] = strdup(h0->target_name[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
86 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
87 return h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
88 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
89
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
90
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
91 static bam_hdr_t *hdr_from_dict(sdict_t *d)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
92 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
93 bam_hdr_t *h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
94 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
95 h = bam_hdr_init();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
96 h->sdict = d;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
97 h->n_targets = kh_size(d);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
98 h->target_len = (uint32_t*)malloc(sizeof(uint32_t) * h->n_targets);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
99 h->target_name = (char**)malloc(sizeof(char*) * h->n_targets);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
100 for (k = kh_begin(d); k != kh_end(d); ++k) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
101 if (!kh_exist(d, k)) continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
102 h->target_name[kh_val(d, k)>>32] = (char*)kh_key(d, k);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
103 h->target_len[kh_val(d, k)>>32] = kh_val(d, k)<<32>>32;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
104 kh_val(d, k) >>= 32;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
105 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
106 return h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
107 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
108
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
109 bam_hdr_t *bam_hdr_read(BGZF *fp)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
110 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
111 bam_hdr_t *h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
112 char buf[4];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
113 int magic_len, has_EOF;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
114 int32_t i = 1, name_len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
115 // check EOF
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
116 has_EOF = bgzf_check_EOF(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
117 if (has_EOF < 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
118 perror("[W::sam_hdr_read] bgzf_check_EOF");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
119 } else if (has_EOF == 0 && hts_verbose >= 2)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
120 fprintf(stderr, "[W::%s] EOF marker is absent. The input is probably truncated.\n", __func__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
121 // read "BAM1"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
122 magic_len = bgzf_read(fp, buf, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
123 if (magic_len != 4 || strncmp(buf, "BAM\1", 4)) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
124 if (hts_verbose >= 1) fprintf(stderr, "[E::%s] invalid BAM binary header\n", __func__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
125 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
126 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
127 h = bam_hdr_init();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
128 // read plain text and the number of reference sequences
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
129 bgzf_read(fp, &h->l_text, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
130 if (fp->is_be) ed_swap_4p(&h->l_text);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
131 h->text = (char*)malloc(h->l_text + 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
132 h->text[h->l_text] = 0; // make sure it is NULL terminated
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
133 bgzf_read(fp, h->text, h->l_text);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
134 bgzf_read(fp, &h->n_targets, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
135 if (fp->is_be) ed_swap_4p(&h->n_targets);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
136 // read reference sequence names and lengths
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
137 h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
138 h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
139 for (i = 0; i != h->n_targets; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
140 bgzf_read(fp, &name_len, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
141 if (fp->is_be) ed_swap_4p(&name_len);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
142 h->target_name[i] = (char*)calloc(name_len, 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
143 bgzf_read(fp, h->target_name[i], name_len);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
144 bgzf_read(fp, &h->target_len[i], 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
145 if (fp->is_be) ed_swap_4p(&h->target_len[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
146 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
147 return h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
148 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
149
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
150 int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
151 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
152 char buf[4];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
153 int32_t i, name_len, x;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
154 // write "BAM1"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
155 strncpy(buf, "BAM\1", 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
156 bgzf_write(fp, buf, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
157 // write plain text and the number of reference sequences
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
158 if (fp->is_be) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
159 x = ed_swap_4(h->l_text);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
160 bgzf_write(fp, &x, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
161 if (h->l_text) bgzf_write(fp, h->text, h->l_text);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
162 x = ed_swap_4(h->n_targets);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
163 bgzf_write(fp, &x, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
164 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
165 bgzf_write(fp, &h->l_text, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
166 if (h->l_text) bgzf_write(fp, h->text, h->l_text);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
167 bgzf_write(fp, &h->n_targets, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
168 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
169 // write sequence names and lengths
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
170 for (i = 0; i != h->n_targets; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
171 char *p = h->target_name[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
172 name_len = strlen(p) + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
173 if (fp->is_be) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
174 x = ed_swap_4(name_len);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
175 bgzf_write(fp, &x, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
176 } else bgzf_write(fp, &name_len, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
177 bgzf_write(fp, p, name_len);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
178 if (fp->is_be) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
179 x = ed_swap_4(h->target_len[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
180 bgzf_write(fp, &x, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
181 } else bgzf_write(fp, &h->target_len[i], 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
182 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
183 bgzf_flush(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
184 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
185 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
186
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
187 int bam_name2id(bam_hdr_t *h, const char *ref)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
188 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
189 sdict_t *d = (sdict_t*)h->sdict;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
190 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
191 if (h->sdict == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
192 int i, absent;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
193 d = kh_init(s2i);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
194 for (i = 0; i < h->n_targets; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
195 k = kh_put(s2i, d, h->target_name[i], &absent);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
196 kh_val(d, k) = i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
197 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
198 h->sdict = d;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
199 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
200 k = kh_get(s2i, d, ref);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
201 return k == kh_end(d)? -1 : kh_val(d, k);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
202 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
203
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
204 /*************************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
205 *** BAM alignment I/O ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
206 *************************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
207
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
208 bam1_t *bam_init1()
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
209 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
210 return (bam1_t*)calloc(1, sizeof(bam1_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
211 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
212
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
213 void bam_destroy1(bam1_t *b)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
214 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
215 if (b == 0) return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
216 free(b->data); free(b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
217 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
218
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
219 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
220 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
221 uint8_t *data = bdst->data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
222 int m_data = bdst->m_data; // backup data and m_data
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
223 if (m_data < bsrc->l_data) { // double the capacity
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
224 m_data = bsrc->l_data; kroundup32(m_data);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
225 data = (uint8_t*)realloc(data, m_data);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
226 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
227 memcpy(data, bsrc->data, bsrc->l_data); // copy var-len data
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
228 *bdst = *bsrc; // copy the rest
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
229 // restore the backup
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
230 bdst->m_data = m_data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
231 bdst->data = data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
232 return bdst;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
233 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
234
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
235 bam1_t *bam_dup1(const bam1_t *bsrc)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
236 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
237 if (bsrc == NULL) return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
238 bam1_t *bdst = bam_init1();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
239 if (bdst == NULL) return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
240 return bam_copy1(bdst, bsrc);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
241 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
242
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
243 int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
244 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
245 int k, l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
246 for (k = l = 0; k < n_cigar; ++k)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
247 if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
248 l += bam_cigar_oplen(cigar[k]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
249 return l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
250 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
251
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
252 int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
253 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
254 int k, l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
255 for (k = l = 0; k < n_cigar; ++k)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
256 if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
257 l += bam_cigar_oplen(cigar[k]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
258 return l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
259 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
260
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
261 int32_t bam_endpos(const bam1_t *b)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
262 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
263 if (!(b->core.flag & BAM_FUNMAP) && b->core.n_cigar > 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
264 return b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
265 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
266 return b->core.pos + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
267 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
268
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
269 static inline int aux_type2size(uint8_t type)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
270 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
271 switch (type) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
272 case 'A': case 'c': case 'C':
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
273 return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
274 case 's': case 'S':
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
275 return 2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
276 case 'i': case 'I': case 'f':
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
277 return 4;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
278 case 'd':
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
279 return 8;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
280 case 'Z': case 'H': case 'B':
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
281 return type;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
282 default:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
283 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
284 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
285 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
286
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
287 static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
288 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
289 uint8_t *s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
290 uint32_t *cigar = (uint32_t*)(data + c->l_qname);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
291 uint32_t i, n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
292 s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
293 for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
294 while (s < data + l_data) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
295 int size;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
296 s += 2; // skip key
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
297 size = aux_type2size(*s); ++s; // skip type
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
298 switch (size) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
299 case 1: ++s; break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
300 case 2: ed_swap_2p(s); s += 2; break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
301 case 4: ed_swap_4p(s); s += 4; break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
302 case 8: ed_swap_8p(s); s += 8; break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
303 case 'Z':
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
304 case 'H':
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
305 while (*s) ++s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
306 ++s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
307 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
308 case 'B':
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
309 size = aux_type2size(*s); ++s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
310 if (is_host) memcpy(&n, s, 4), ed_swap_4p(s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
311 else ed_swap_4p(s), memcpy(&n, s, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
312 s += 4;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
313 switch (size) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
314 case 1: s += n; break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
315 case 2: for (i = 0; i < n; ++i, s += 2) ed_swap_2p(s); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
316 case 4: for (i = 0; i < n; ++i, s += 4) ed_swap_4p(s); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
317 case 8: for (i = 0; i < n; ++i, s += 8) ed_swap_8p(s); break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
318 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
319 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
320 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
321 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
322 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
323
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
324 int bam_read1(BGZF *fp, bam1_t *b)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
325 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
326 bam1_core_t *c = &b->core;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
327 int32_t block_len, ret, i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
328 uint32_t x[8];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
329 if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
330 if (ret == 0) return -1; // normal end-of-file
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
331 else return -2; // truncated
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
332 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
333 if (bgzf_read(fp, x, 32) != 32) return -3;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
334 if (fp->is_be) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
335 ed_swap_4p(&block_len);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
336 for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
337 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
338 c->tid = x[0]; c->pos = x[1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
339 c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
340 c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
341 c->l_qseq = x[4];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
342 c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
343 b->l_data = block_len - 32;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
344 if (b->l_data < 0 || c->l_qseq < 0) return -4;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
345 if ((char *)bam_get_aux(b) - (char *)b->data > b->l_data)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
346 return -4;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
347 if (b->m_data < b->l_data) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
348 b->m_data = b->l_data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
349 kroundup32(b->m_data);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
350 b->data = (uint8_t*)realloc(b->data, b->m_data);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
351 if (!b->data)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
352 return -4;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
353 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
354 if (bgzf_read(fp, b->data, b->l_data) != b->l_data) return -4;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
355 //b->l_aux = b->l_data - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
356 if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
357 return 4 + block_len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
358 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
359
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
360 int bam_write1(BGZF *fp, const bam1_t *b)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
361 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
362 const bam1_core_t *c = &b->core;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
363 uint32_t x[8], block_len = b->l_data + 32, y;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
364 int i, ok;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
365 x[0] = c->tid;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
366 x[1] = c->pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
367 x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
368 x[3] = (uint32_t)c->flag<<16 | c->n_cigar;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
369 x[4] = c->l_qseq;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
370 x[5] = c->mtid;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
371 x[6] = c->mpos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
372 x[7] = c->isize;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
373 ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
374 if (fp->is_be) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
375 for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
376 y = block_len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
377 if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
378 swap_data(c, b->l_data, b->data, 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
379 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
380 if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
381 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
382 if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
383 if (ok) ok = (bgzf_write(fp, b->data, b->l_data) >= 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
384 if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
385 return ok? 4 + block_len : -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
386 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
387
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
388 /********************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
389 *** BAM indexing ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
390 ********************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
391
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
392 static hts_idx_t *bam_index(BGZF *fp, int min_shift)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
393 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
394 int n_lvls, i, fmt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
395 bam1_t *b;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
396 hts_idx_t *idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
397 bam_hdr_t *h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
398 h = bam_hdr_read(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
399 if (min_shift > 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
400 int64_t max_len = 0, s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
401 for (i = 0; i < h->n_targets; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
402 if (max_len < h->target_len[i]) max_len = h->target_len[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
403 max_len += 256;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
404 for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
405 fmt = HTS_FMT_CSI;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
406 } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
407 idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp), min_shift, n_lvls);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
408 bam_hdr_destroy(h);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
409 b = bam_init1();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
410 while (bam_read1(fp, b) >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
411 int l, ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
412 l = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
413 if (l == 0) l = 1; // no zero-length records
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
414 ret = hts_idx_push(idx, b->core.tid, b->core.pos, b->core.pos + l, bgzf_tell(fp), !(b->core.flag&BAM_FUNMAP));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
415 if (ret < 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
416 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
417 // unsorted
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
418 bam_destroy1(b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
419 hts_idx_destroy(idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
420 return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
421 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
422 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
423 hts_idx_finish(idx, bgzf_tell(fp));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
424 bam_destroy1(b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
425 return idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
426 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
427
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
428 int bam_index_build(const char *fn, int min_shift)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
429 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
430 hts_idx_t *idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
431 htsFile *fp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
432 int ret = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
433
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
434 if ((fp = hts_open(fn, "r")) == 0) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
435 switch (fp->format.format) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
436 case cram:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
437 ret = cram_index_build(fp->fp.cram, fn);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
438 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
439
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
440 case bam:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
441 idx = bam_index(fp->fp.bgzf, min_shift);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
442 if (idx) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
443 hts_idx_save(idx, fn, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
444 hts_idx_destroy(idx);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
445 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
446 else ret = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
447 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
448
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
449 default:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
450 ret = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
451 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
452 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
453 hts_close(fp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
454
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
455 return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
456 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
457
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
458 static int bam_readrec(BGZF *fp, void *ignored, void *bv, int *tid, int *beg, int *end)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
459 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
460 bam1_t *b = bv;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
461 int ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
462 if ((ret = bam_read1(fp, b)) >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
463 *tid = b->core.tid; *beg = b->core.pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
464 *end = b->core.pos + (b->core.n_cigar? bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)) : 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
465 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
466 return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
467 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
468
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
469 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
470 static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
471 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
472 htsFile *fp = fpv;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
473 bam1_t *b = bv;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
474 return cram_get_bam_seq(fp->fp.cram, &b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
475 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
476
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
477 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
478 static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int *beg, int *end)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
479 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
480 htsFile *fp = fpv;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
481 bam1_t *b = bv;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
482 switch (fp->format.format) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
483 case bam: return bam_read1(bgzfp, b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
484 case cram: return cram_get_bam_seq(fp->fp.cram, &b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
485 default:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
486 // TODO Need headers available to implement this for SAM files
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
487 fprintf(stderr, "[sam_bam_cram_readrec] Not implemented for SAM files -- Exiting\n");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
488 abort();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
489 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
490 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
491
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
492 // The CRAM implementation stores the loaded index within the cram_fd rather
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
493 // than separately as is done elsewhere in htslib. So if p is a pointer to
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
494 // an hts_idx_t with p->fmt == HTS_FMT_CRAI, then it actually points to an
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
495 // hts_cram_idx_t and should be cast accordingly.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
496 typedef struct hts_cram_idx_t {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
497 int fmt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
498 cram_fd *cram;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
499 } hts_cram_idx_t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
500
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
501 hts_idx_t *sam_index_load(samFile *fp, const char *fn)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
502 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
503 switch (fp->format.format) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
504 case bam:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
505 return bam_index_load(fn);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
506
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
507 case cram: {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
508 if (cram_index_load(fp->fp.cram, fn) < 0) return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
509 // Cons up a fake "index" just pointing at the associated cram_fd:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
510 hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
511 if (idx == NULL) return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
512 idx->fmt = HTS_FMT_CRAI;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
513 idx->cram = fp->fp.cram;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
514 return (hts_idx_t *) idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
515 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
516
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
517 default:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
518 return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
519 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
520 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
521
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
522 static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
523 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
524 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
525 hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
526 if (iter == NULL) return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
527
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
528 // Cons up a dummy iterator for which hts_itr_next() will simply invoke
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
529 // the readrec function:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
530 iter->read_rest = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
531 iter->off = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
532 iter->bins.a = NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
533 iter->readrec = readrec;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
534
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
535 if (tid >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
536 cram_range r = { tid, beg+1, end };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
537 if (cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r) != 0) { free(iter); return NULL; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
538 iter->curr_off = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
539 // The following fields are not required by hts_itr_next(), but are
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
540 // filled in in case user code wants to look at them.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
541 iter->tid = tid;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
542 iter->beg = beg;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
543 iter->end = end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
544 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
545 else switch (tid) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
546 case HTS_IDX_REST:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
547 iter->curr_off = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
548 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
549 case HTS_IDX_NONE:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
550 iter->curr_off = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
551 iter->finished = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
552 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
553 default:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
554 fprintf(stderr, "[cram_itr_query] tid=%d not implemented for CRAM files -- Exiting\n", tid);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
555 abort();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
556 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
557 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
558
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
559 return iter;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
560 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
561
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
562 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
563 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
564 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
565 if (idx == NULL)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
566 return hts_itr_query(NULL, tid, beg, end, sam_bam_cram_readrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
567 else if (cidx->fmt == HTS_FMT_CRAI)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
568 return cram_itr_query(idx, tid, beg, end, cram_readrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
569 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
570 return hts_itr_query(idx, tid, beg, end, bam_readrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
571 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
572
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
573 static int cram_name2id(void *fdv, const char *ref)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
574 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
575 cram_fd *fd = (cram_fd *) fdv;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
576 return sam_hdr_name2ref(fd->header, ref);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
577 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
578
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
579 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
580 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
581 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
582 if (cidx->fmt == HTS_FMT_CRAI)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
583 return hts_itr_querys(idx, region, cram_name2id, cidx->cram, cram_itr_query, cram_readrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
584 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
585 return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, hts_itr_query, bam_readrec);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
586 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
587
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
588 /**********************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
589 *** SAM header I/O ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
590 **********************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
591
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
592 #include "htslib/kseq.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
593 #include "htslib/kstring.h"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
594
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
595 bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
596 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
597 const char *q, *r, *p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
598 khash_t(s2i) *d;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
599 d = kh_init(s2i);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
600 for (p = text; *p; ++p) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
601 if (strncmp(p, "@SQ", 3) == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
602 char *sn = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
603 int ln = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
604 for (q = p + 4;; ++q) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
605 if (strncmp(q, "SN:", 3) == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
606 q += 3;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
607 for (r = q; *r != '\t' && *r != '\n'; ++r);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
608 sn = (char*)calloc(r - q + 1, 1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
609 strncpy(sn, q, r - q);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
610 q = r;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
611 } else if (strncmp(q, "LN:", 3) == 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
612 ln = strtol(q + 3, (char**)&q, 10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
613 while (*q != '\t' && *q != '\n') ++q;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
614 if (*q == '\n') break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
615 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
616 p = q;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
617 if (sn && ln >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
618 khint_t k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
619 int absent;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
620 k = kh_put(s2i, d, sn, &absent);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
621 if (!absent) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
622 if (hts_verbose >= 2)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
623 fprintf(stderr, "[W::%s] duplicated sequence '%s'\n", __func__, sn);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
624 free(sn);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
625 } else kh_val(d, k) = (int64_t)(kh_size(d) - 1)<<32 | ln;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
626 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
627 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
628 while (*p != '\n') ++p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
629 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
630 return hdr_from_dict(d);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
631 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
632
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
633 bam_hdr_t *sam_hdr_read(htsFile *fp)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
634 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
635 switch (fp->format.format) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
636 case bam:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
637 return bam_hdr_read(fp->fp.bgzf);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
638
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
639 case cram:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
640 return cram_header_to_bam(fp->fp.cram->header);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
641
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
642 case sam: {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
643 kstring_t str;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
644 bam_hdr_t *h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
645 int has_SQ = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
646 str.l = str.m = 0; str.s = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
647 while (hts_getline(fp, KS_SEP_LINE, &fp->line) >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
648 if (fp->line.s[0] != '@') break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
649 if (fp->line.l > 3 && strncmp(fp->line.s,"@SQ",3) == 0) has_SQ = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
650 kputsn(fp->line.s, fp->line.l, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
651 kputc('\n', &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
652 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
653 if (! has_SQ && fp->fn_aux) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
654 char line[2048];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
655 FILE *f = fopen(fp->fn_aux, "r");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
656 if (f == NULL) return NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
657 while (fgets(line, sizeof line, f)) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
658 const char *name = strtok(line, "\t");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
659 const char *length = strtok(NULL, "\t");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
660 ksprintf(&str, "@SQ\tSN:%s\tLN:%s\n", name, length);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
661 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
662 fclose(f);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
663 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
664 if (str.l == 0) kputsn("", 0, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
665 h = sam_hdr_parse(str.l, str.s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
666 h->l_text = str.l; h->text = str.s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
667 return h;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
668 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
669
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
670 default:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
671 abort();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
672 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
673 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
674
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
675 int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
676 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
677 switch (fp->format.format) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
678 case binary_format:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
679 fp->format.category = sequence_data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
680 fp->format.format = bam;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
681 /* fall-through */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
682 case bam:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
683 bam_hdr_write(fp->fp.bgzf, h);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
684 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
685
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
686 case cram: {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
687 cram_fd *fd = fp->fp.cram;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
688 if (cram_set_header(fd, bam_header_to_cram((bam_hdr_t *)h)) < 0) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
689 if (fp->fn_aux)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
690 cram_load_reference(fd, fp->fn_aux);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
691 if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
692 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
693 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
694
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
695 case text_format:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
696 fp->format.category = sequence_data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
697 fp->format.format = sam;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
698 /* fall-through */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
699 case sam: {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
700 char *p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
701 hputs(h->text, fp->fp.hfile);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
702 p = strstr(h->text, "@SQ\t"); // FIXME: we need a loop to make sure "@SQ\t" does not match something unwanted!!!
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
703 if (p == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
704 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
705 for (i = 0; i < h->n_targets; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
706 fp->line.l = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
707 kputsn("@SQ\tSN:", 7, &fp->line); kputs(h->target_name[i], &fp->line);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
708 kputsn("\tLN:", 4, &fp->line); kputw(h->target_len[i], &fp->line); kputc('\n', &fp->line);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
709 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
710 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
711 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
712 if ( hflush(fp->fp.hfile) != 0 ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
713 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
714 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
715
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
716 default:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
717 abort();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
718 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
719 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
720 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
721
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
722 /**********************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
723 *** SAM record I/O ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
724 **********************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
725
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
726 int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
727 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
728 #define _read_token(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); if (*(_p) != '\t') goto err_ret; *(_p)++ = 0
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
729 #define _read_token_aux(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); *(_p)++ = 0 // this is different in that it does not test *(_p)=='\t'
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
730 #define _get_mem(type_t, _x, _s, _l) ks_resize((_s), (_s)->l + (_l)); *(_x) = (type_t*)((_s)->s + (_s)->l); (_s)->l += (_l)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
731 #define _parse_err(cond, msg) do { if ((cond) && hts_verbose >= 1) { fprintf(stderr, "[E::%s] " msg "\n", __func__); goto err_ret; } } while (0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
732 #define _parse_warn(cond, msg) if ((cond) && hts_verbose >= 2) fprintf(stderr, "[W::%s] " msg "\n", __func__)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
733
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
734 uint8_t *t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
735 char *p = s->s, *q;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
736 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
737 kstring_t str;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
738 bam1_core_t *c = &b->core;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
739
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
740 str.l = b->l_data = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
741 str.s = (char*)b->data; str.m = b->m_data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
742 memset(c, 0, 32);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
743 if (h->cigar_tab == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
744 h->cigar_tab = (int8_t*) malloc(128);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
745 for (i = 0; i < 128; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
746 h->cigar_tab[i] = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
747 for (i = 0; BAM_CIGAR_STR[i]; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
748 h->cigar_tab[(int)BAM_CIGAR_STR[i]] = i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
749 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
750 // qname
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
751 q = _read_token(p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
752 kputsn_(q, p - q, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
753 c->l_qname = p - q;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
754 // flag
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
755 c->flag = strtol(p, &p, 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
756 if (*p++ != '\t') goto err_ret; // malformated flag
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
757 // chr
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
758 q = _read_token(p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
759 if (strcmp(q, "*")) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
760 _parse_err(h->n_targets == 0, "missing SAM header");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
761 c->tid = bam_name2id(h, q);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
762 _parse_warn(c->tid < 0, "urecognized reference name; treated as unmapped");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
763 } else c->tid = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
764 // pos
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
765 c->pos = strtol(p, &p, 10) - 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
766 if (*p++ != '\t') goto err_ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
767 if (c->pos < 0 && c->tid >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
768 _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
769 c->tid = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
770 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
771 if (c->tid < 0) c->flag |= BAM_FUNMAP;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
772 // mapq
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
773 c->qual = strtol(p, &p, 10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
774 if (*p++ != '\t') goto err_ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
775 // cigar
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
776 if (*p != '*') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
777 uint32_t *cigar;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
778 size_t n_cigar = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
779 for (q = p; *p && *p != '\t'; ++p)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
780 if (!isdigit(*p)) ++n_cigar;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
781 if (*p++ != '\t') goto err_ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
782 _parse_err(n_cigar >= 65536, "too many CIGAR operations");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
783 c->n_cigar = n_cigar;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
784 _get_mem(uint32_t, &cigar, &str, c->n_cigar<<2);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
785 for (i = 0; i < c->n_cigar; ++i, ++q) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
786 int op;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
787 cigar[i] = strtol(q, &q, 10)<<BAM_CIGAR_SHIFT;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
788 op = (uint8_t)*q >= 128? -1 : h->cigar_tab[(int)*q];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
789 _parse_err(op < 0, "unrecognized CIGAR operator");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
790 cigar[i] |= op;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
791 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
792 i = bam_cigar2rlen(c->n_cigar, cigar);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
793 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
794 _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
795 c->flag |= BAM_FUNMAP;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
796 q = _read_token(p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
797 i = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
798 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
799 c->bin = hts_reg2bin(c->pos, c->pos + i, 14, 5);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
800 // mate chr
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
801 q = _read_token(p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
802 if (strcmp(q, "=") == 0) c->mtid = c->tid;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
803 else if (strcmp(q, "*") == 0) c->mtid = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
804 else c->mtid = bam_name2id(h, q);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
805 // mpos
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
806 c->mpos = strtol(p, &p, 10) - 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
807 if (*p++ != '\t') goto err_ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
808 if (c->mpos < 0 && c->mtid >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
809 _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
810 c->mtid = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
811 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
812 // tlen
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
813 c->isize = strtol(p, &p, 10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
814 if (*p++ != '\t') goto err_ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
815 // seq
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
816 q = _read_token(p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
817 if (strcmp(q, "*")) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
818 c->l_qseq = p - q - 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
819 i = bam_cigar2qlen(c->n_cigar, (uint32_t*)(str.s + c->l_qname));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
820 _parse_err(c->n_cigar && i != c->l_qseq, "CIGAR and query sequence are of different length");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
821 i = (c->l_qseq + 1) >> 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
822 _get_mem(uint8_t, &t, &str, i);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
823 memset(t, 0, i);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
824 for (i = 0; i < c->l_qseq; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
825 t[i>>1] |= seq_nt16_table[(int)q[i]] << ((~i&1)<<2);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
826 } else c->l_qseq = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
827 // qual
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
828 q = _read_token_aux(p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
829 _get_mem(uint8_t, &t, &str, c->l_qseq);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
830 if (strcmp(q, "*")) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
831 _parse_err(p - q - 1 != c->l_qseq, "SEQ and QUAL are of different length");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
832 for (i = 0; i < c->l_qseq; ++i) t[i] = q[i] - 33;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
833 } else memset(t, 0xff, c->l_qseq);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
834 // aux
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
835 // Note that (like the bam1_core_t fields) this aux data in b->data is
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
836 // stored in host endianness; so there is no byte swapping needed here.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
837 while (p < s->s + s->l) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
838 uint8_t type;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
839 q = _read_token_aux(p); // FIXME: can be accelerated for long 'B' arrays
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
840 _parse_err(p - q - 1 < 6, "incomplete aux field");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
841 kputsn_(q, 2, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
842 q += 3; type = *q++; ++q; // q points to value
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
843 if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
844 kputc_('A', &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
845 kputc_(*q, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
846 } else if (type == 'i' || type == 'I') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
847 if (*q == '-') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
848 long x = strtol(q, &q, 10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
849 if (x >= INT8_MIN) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
850 kputc_('c', &str); kputc_(x, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
851 } else if (x >= INT16_MIN) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
852 int16_t y = x;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
853 kputc_('s', &str); kputsn_((char*)&y, 2, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
854 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
855 int32_t y = x;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
856 kputc_('i', &str); kputsn_(&y, 4, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
857 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
858 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
859 unsigned long x = strtoul(q, &q, 10);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
860 if (x <= UINT8_MAX) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
861 kputc_('C', &str); kputc_(x, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
862 } else if (x <= UINT16_MAX) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
863 uint16_t y = x;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
864 kputc_('S', &str); kputsn_(&y, 2, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
865 } else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
866 uint32_t y = x;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
867 kputc_('I', &str); kputsn_(&y, 4, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
868 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
869 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
870 } else if (type == 'f') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
871 float x;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
872 x = strtod(q, &q);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
873 kputc_('f', &str); kputsn_(&x, 4, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
874 } else if (type == 'd') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
875 double x;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
876 x = strtod(q, &q);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
877 kputc_('d', &str); kputsn_(&x, 8, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
878 } else if (type == 'Z' || type == 'H') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
879 kputc_(type, &str);kputsn_(q, p - q, &str); // note that this include the trailing NULL
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
880 } else if (type == 'B') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
881 int32_t n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
882 char *r;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
883 _parse_err(p - q - 1 < 3, "incomplete B-typed aux field");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
884 type = *q++; // q points to the first ',' following the typing byte
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
885 for (r = q, n = 0; *r; ++r)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
886 if (*r == ',') ++n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
887 kputc_('B', &str); kputc_(type, &str); kputsn_(&n, 4, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
888 // FIXME: to evaluate which is faster: a) aligned array and then memmove(); b) unaligned array; c) kputsn_()
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
889 if (type == 'c') while (q + 1 < p) { int8_t x = strtol(q + 1, &q, 0); kputc_(x, &str); }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
890 else if (type == 'C') while (q + 1 < p) { uint8_t x = strtoul(q + 1, &q, 0); kputc_(x, &str); }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
891 else if (type == 's') while (q + 1 < p) { int16_t x = strtol(q + 1, &q, 0); kputsn_(&x, 2, &str); }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
892 else if (type == 'S') while (q + 1 < p) { uint16_t x = strtoul(q + 1, &q, 0); kputsn_(&x, 2, &str); }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
893 else if (type == 'i') while (q + 1 < p) { int32_t x = strtol(q + 1, &q, 0); kputsn_(&x, 4, &str); }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
894 else if (type == 'I') while (q + 1 < p) { uint32_t x = strtoul(q + 1, &q, 0); kputsn_(&x, 4, &str); }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
895 else if (type == 'f') while (q + 1 < p) { float x = strtod(q + 1, &q); kputsn_(&x, 4, &str); }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
896 else _parse_err(1, "unrecognized type");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
897 } else _parse_err(1, "unrecognized type");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
898 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
899 b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
900 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
901
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
902 #undef _parse_warn
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
903 #undef _parse_err
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
904 #undef _get_mem
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
905 #undef _read_token_aux
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
906 #undef _read_token
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
907 err_ret:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
908 b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
909 return -2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
910 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
911
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
912 int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
913 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
914 switch (fp->format.format) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
915 case bam: {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
916 int r = bam_read1(fp->fp.bgzf, b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
917 if (r >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
918 if (b->core.tid >= h->n_targets || b->core.tid < -1 ||
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
919 b->core.mtid >= h->n_targets || b->core.mtid < -1)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
920 return -3;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
921 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
922 return r;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
923 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
924
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
925 case cram:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
926 return cram_get_bam_seq(fp->fp.cram, &b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
927
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
928 case sam: {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
929 int ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
930 err_recover:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
931 if (fp->line.l == 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
932 ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
933 if (ret < 0) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
934 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
935 ret = sam_parse1(&fp->line, h, b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
936 fp->line.l = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
937 if (ret < 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
938 if (hts_verbose >= 1)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
939 fprintf(stderr, "[W::%s] parse error at line %lld\n", __func__, (long long)fp->lineno);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
940 if (h->ignore_sam_err) goto err_recover;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
941 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
942 return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
943 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
944
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
945 default:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
946 abort();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
947 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
948 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
949
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
950 int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
951 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
952 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
953 uint8_t *s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
954 const bam1_core_t *c = &b->core;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
955
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
956 str->l = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
957 kputsn(bam_get_qname(b), c->l_qname-1, str); kputc('\t', str); // query name
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
958 kputw(c->flag, str); kputc('\t', str); // flag
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
959 if (c->tid >= 0) { // chr
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
960 kputs(h->target_name[c->tid] , str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
961 kputc('\t', str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
962 } else kputsn("*\t", 2, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
963 kputw(c->pos + 1, str); kputc('\t', str); // pos
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
964 kputw(c->qual, str); kputc('\t', str); // qual
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
965 if (c->n_cigar) { // cigar
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
966 uint32_t *cigar = bam_get_cigar(b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
967 for (i = 0; i < c->n_cigar; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
968 kputw(bam_cigar_oplen(cigar[i]), str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
969 kputc(bam_cigar_opchr(cigar[i]), str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
970 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
971 } else kputc('*', str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
972 kputc('\t', str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
973 if (c->mtid < 0) kputsn("*\t", 2, str); // mate chr
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
974 else if (c->mtid == c->tid) kputsn("=\t", 2, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
975 else {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
976 kputs(h->target_name[c->mtid], str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
977 kputc('\t', str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
978 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
979 kputw(c->mpos + 1, str); kputc('\t', str); // mate pos
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
980 kputw(c->isize, str); kputc('\t', str); // template len
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
981 if (c->l_qseq) { // seq and qual
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
982 uint8_t *s = bam_get_seq(b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
983 for (i = 0; i < c->l_qseq; ++i) kputc("=ACMGRSVTWYHKDBN"[bam_seqi(s, i)], str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
984 kputc('\t', str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
985 s = bam_get_qual(b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
986 if (s[0] == 0xff) kputc('*', str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
987 else for (i = 0; i < c->l_qseq; ++i) kputc(s[i] + 33, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
988 } else kputsn("*\t*", 3, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
989 s = bam_get_aux(b); // aux
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
990 while (s+4 <= b->data + b->l_data) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
991 uint8_t type, key[2];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
992 key[0] = s[0]; key[1] = s[1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
993 s += 2; type = *s++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
994 kputc('\t', str); kputsn((char*)key, 2, str); kputc(':', str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
995 if (type == 'A') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
996 kputsn("A:", 2, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
997 kputc(*s, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
998 ++s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
999 } else if (type == 'C') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1000 kputsn("i:", 2, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1001 kputw(*s, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1002 ++s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1003 } else if (type == 'c') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1004 kputsn("i:", 2, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1005 kputw(*(int8_t*)s, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1006 ++s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1007 } else if (type == 'S') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1008 if (s+2 <= b->data + b->l_data) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1009 kputsn("i:", 2, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1010 kputw(*(uint16_t*)s, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1011 s += 2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1012 } else return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1013 } else if (type == 's') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1014 if (s+2 <= b->data + b->l_data) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1015 kputsn("i:", 2, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1016 kputw(*(int16_t*)s, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1017 s += 2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1018 } else return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1019 } else if (type == 'I') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1020 if (s+4 <= b->data + b->l_data) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1021 kputsn("i:", 2, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1022 kputuw(*(uint32_t*)s, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1023 s += 4;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1024 } else return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1025 } else if (type == 'i') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1026 if (s+4 <= b->data + b->l_data) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1027 kputsn("i:", 2, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1028 kputw(*(int32_t*)s, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1029 s += 4;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1030 } else return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1031 } else if (type == 'f') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1032 if (s+4 <= b->data + b->l_data) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1033 ksprintf(str, "f:%g", *(float*)s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1034 s += 4;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1035 } else return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1036
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1037 } else if (type == 'd') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1038 if (s+8 <= b->data + b->l_data) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1039 ksprintf(str, "d:%g", *(double*)s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1040 s += 8;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1041 } else return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1042 } else if (type == 'Z' || type == 'H') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1043 kputc(type, str); kputc(':', str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1044 while (s < b->data + b->l_data && *s) kputc(*s++, str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1045 if (s >= b->data + b->l_data)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1046 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1047 ++s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1048 } else if (type == 'B') {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1049 uint8_t sub_type = *(s++);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1050 int32_t n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1051 memcpy(&n, s, 4);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1052 s += 4; // no point to the start of the array
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1053 if (s + n >= b->data + b->l_data)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1054 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1055 kputsn("B:", 2, str); kputc(sub_type, str); // write the typing
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1056 for (i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1057 kputc(',', str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1058 if ('c' == sub_type) { kputw(*(int8_t*)s, str); ++s; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1059 else if ('C' == sub_type) { kputw(*(uint8_t*)s, str); ++s; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1060 else if ('s' == sub_type) { kputw(*(int16_t*)s, str); s += 2; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1061 else if ('S' == sub_type) { kputw(*(uint16_t*)s, str); s += 2; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1062 else if ('i' == sub_type) { kputw(*(int32_t*)s, str); s += 4; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1063 else if ('I' == sub_type) { kputuw(*(uint32_t*)s, str); s += 4; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1064 else if ('f' == sub_type) { ksprintf(str, "%g", *(float*)s); s += 4; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1065 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1066 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1067 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1068 return str->l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1069 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1070
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1071 int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1072 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1073 switch (fp->format.format) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1074 case binary_format:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1075 fp->format.category = sequence_data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1076 fp->format.format = bam;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1077 /* fall-through */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1078 case bam:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1079 return bam_write1(fp->fp.bgzf, b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1080
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1081 case cram:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1082 return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1083
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1084 case text_format:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1085 fp->format.category = sequence_data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1086 fp->format.format = sam;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1087 /* fall-through */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1088 case sam:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1089 if (sam_format1(h, b, &fp->line) < 0) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1090 kputc('\n', &fp->line);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1091 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1092 return fp->line.l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1093
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1094 default:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1095 abort();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1096 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1097 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1098
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1099 /************************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1100 *** Auxiliary fields ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1101 ************************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1102
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1103 void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1104 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1105 int ori_len = b->l_data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1106 b->l_data += 3 + len;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1107 if (b->m_data < b->l_data) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1108 b->m_data = b->l_data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1109 kroundup32(b->m_data);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1110 b->data = (uint8_t*)realloc(b->data, b->m_data);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1111 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1112 b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1113 b->data[ori_len + 2] = type;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1114 memcpy(b->data + ori_len + 3, data, len);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1115 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1116
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1117 static inline uint8_t *skip_aux(uint8_t *s)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1118 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1119 int size = aux_type2size(*s); ++s; // skip type
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1120 uint32_t n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1121 switch (size) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1122 case 'Z':
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1123 case 'H':
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1124 while (*s) ++s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1125 return s + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1126 case 'B':
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1127 size = aux_type2size(*s); ++s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1128 memcpy(&n, s, 4); s += 4;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1129 return s + size * n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1130 case 0:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1131 abort();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1132 break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1133 default:
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1134 return s + size;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1135 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1136 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1137
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1138 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1139 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1140 uint8_t *s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1141 int y = tag[0]<<8 | tag[1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1142 s = bam_get_aux(b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1143 while (s < b->data + b->l_data) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1144 int x = (int)s[0]<<8 | s[1];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1145 s += 2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1146 if (x == y) return s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1147 s = skip_aux(s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1148 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1149 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1150 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1151 // s MUST BE returned by bam_aux_get()
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1152 int bam_aux_del(bam1_t *b, uint8_t *s)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1153 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1154 uint8_t *p, *aux;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1155 int l_aux = bam_get_l_aux(b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1156 aux = bam_get_aux(b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1157 p = s - 2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1158 s = skip_aux(s);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1159 memmove(p, s, l_aux - (s - aux));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1160 b->l_data -= s - p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1161 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1162 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1163
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1164 int32_t bam_aux2i(const uint8_t *s)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1165 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1166 int type;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1167 type = *s++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1168 if (type == 'c') return (int32_t)*(int8_t*)s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1169 else if (type == 'C') return (int32_t)*(uint8_t*)s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1170 else if (type == 's') return (int32_t)*(int16_t*)s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1171 else if (type == 'S') return (int32_t)*(uint16_t*)s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1172 else if (type == 'i' || type == 'I') return *(int32_t*)s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1173 else return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1174 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1175
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1176 double bam_aux2f(const uint8_t *s)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1177 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1178 int type;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1179 type = *s++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1180 if (type == 'd') return *(double*)s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1181 else if (type == 'f') return *(float*)s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1182 else return 0.0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1183 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1184
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1185 char bam_aux2A(const uint8_t *s)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1186 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1187 int type;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1188 type = *s++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1189 if (type == 'A') return *(char*)s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1190 else return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1191 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1192
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1193 char *bam_aux2Z(const uint8_t *s)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1194 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1195 int type;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1196 type = *s++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1197 if (type == 'Z' || type == 'H') return (char*)s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1198 else return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1199 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1200
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1201 int sam_open_mode(char *mode, const char *fn, const char *format)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1202 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1203 // TODO Parse "bam5" etc for compression level
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1204 if (format == NULL) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1205 // Try to pick a format based on the filename extension
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1206 const char *ext = fn? strrchr(fn, '.') : NULL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1207 if (ext == NULL || strchr(ext, '/')) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1208 return sam_open_mode(mode, fn, ext+1);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1209 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1210 else if (strcmp(format, "bam") == 0) strcpy(mode, "b");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1211 else if (strcmp(format, "cram") == 0) strcpy(mode, "c");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1212 else if (strcmp(format, "sam") == 0) strcpy(mode, "");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1213 else return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1214
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1215 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1216 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1217
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1218 #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1219 int bam_str2flag(const char *str)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1220 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1221 char *end, *beg = (char*) str;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1222 long int flag = strtol(str, &end, 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1223 if ( end!=str ) return flag; // the conversion was successful
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1224 flag = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1225 while ( *str )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1226 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1227 end = beg;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1228 while ( *end && *end!=',' ) end++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1229 if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1230 else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1231 else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1232 else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1233 else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1234 else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1235 else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1236 else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1237 else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1238 else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1239 else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1240 else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1241 else return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1242 if ( !*end ) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1243 beg = end + 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1244 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1245 return flag;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1246 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1247
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1248 char *bam_flag2str(int flag)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1249 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1250 kstring_t str = {0,0,0};
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1251 if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1252 if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1253 if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1254 if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1255 if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1256 if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1257 if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1258 if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1259 if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1260 if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1261 if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1262 if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1263 if ( str.l == 0 ) kputsn("", 0, &str);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1264 return str.s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1265 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1266
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1267
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1268 /**************************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1269 *** Pileup and Mpileup ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1270 **************************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1271
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1272 #if !defined(BAM_NO_PILEUP)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1273
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1274 #include <assert.h>
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1275
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1276 /*******************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1277 *** Memory pool ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1278 *******************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1279
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1280 typedef struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1281 int k, x, y, end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1282 } cstate_t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1283
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1284 static cstate_t g_cstate_null = { -1, 0, 0, 0 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1285
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1286 typedef struct __linkbuf_t {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1287 bam1_t b;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1288 int32_t beg, end;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1289 cstate_t s;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1290 struct __linkbuf_t *next;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1291 } lbnode_t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1292
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1293 typedef struct {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1294 int cnt, n, max;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1295 lbnode_t **buf;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1296 } mempool_t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1297
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1298 static mempool_t *mp_init(void)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1299 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1300 mempool_t *mp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1301 mp = (mempool_t*)calloc(1, sizeof(mempool_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1302 return mp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1303 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1304 static void mp_destroy(mempool_t *mp)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1305 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1306 int k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1307 for (k = 0; k < mp->n; ++k) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1308 free(mp->buf[k]->b.data);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1309 free(mp->buf[k]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1310 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1311 free(mp->buf);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1312 free(mp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1313 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1314 static inline lbnode_t *mp_alloc(mempool_t *mp)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1315 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1316 ++mp->cnt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1317 if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1318 else return mp->buf[--mp->n];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1319 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1320 static inline void mp_free(mempool_t *mp, lbnode_t *p)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1321 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1322 --mp->cnt; p->next = 0; // clear lbnode_t::next here
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1323 if (mp->n == mp->max) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1324 mp->max = mp->max? mp->max<<1 : 256;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1325 mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1326 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1327 mp->buf[mp->n++] = p;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1328 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1329
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1330 /**********************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1331 *** CIGAR resolver ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1332 **********************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1333
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1334 /* s->k: the index of the CIGAR operator that has just been processed.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1335 s->x: the reference coordinate of the start of s->k
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1336 s->y: the query coordiante of the start of s->k
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1337 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1338 static inline int resolve_cigar2(bam_pileup1_t *p, int32_t pos, cstate_t *s)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1339 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1340 #define _cop(c) ((c)&BAM_CIGAR_MASK)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1341 #define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1342
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1343 bam1_t *b = p->b;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1344 bam1_core_t *c = &b->core;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1345 uint32_t *cigar = bam_get_cigar(b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1346 int k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1347 // determine the current CIGAR operation
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1348 // fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1349 if (s->k == -1) { // never processed
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1350 if (c->n_cigar == 1) { // just one operation, save a loop
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1351 if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1352 } else { // find the first match or deletion
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1353 for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1354 int op = _cop(cigar[k]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1355 int l = _cln(cigar[k]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1356 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1357 else if (op == BAM_CREF_SKIP) s->x += l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1358 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1359 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1360 assert(k < c->n_cigar);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1361 s->k = k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1362 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1363 } else { // the read has been processed before
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1364 int op, l = _cln(cigar[s->k]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1365 if (pos - s->x >= l) { // jump to the next operation
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1366 assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1367 op = _cop(cigar[s->k+1]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1368 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1369 if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1370 s->x += l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1371 ++s->k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1372 } else { // find the next M/D/N/=/X
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1373 if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1374 s->x += l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1375 for (k = s->k + 1; k < c->n_cigar; ++k) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1376 op = _cop(cigar[k]), l = _cln(cigar[k]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1377 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1378 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1379 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1380 s->k = k;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1381 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1382 assert(s->k < c->n_cigar); // otherwise a bug
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1383 } // else, do nothing
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1384 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1385 { // collect pileup information
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1386 int op, l;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1387 op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1388 p->is_del = p->indel = p->is_refskip = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1389 if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1390 int op2 = _cop(cigar[s->k+1]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1391 int l2 = _cln(cigar[s->k+1]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1392 if (op2 == BAM_CDEL) p->indel = -(int)l2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1393 else if (op2 == BAM_CINS) p->indel = l2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1394 else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1395 int l3 = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1396 for (k = s->k + 2; k < c->n_cigar; ++k) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1397 op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1398 if (op2 == BAM_CINS) l3 += l2;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1399 else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1400 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1401 if (l3 > 0) p->indel = l3;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1402 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1403 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1404 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1405 p->qpos = s->y + (pos - s->x);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1406 } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1407 p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1408 p->is_refskip = (op == BAM_CREF_SKIP);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1409 } // cannot be other operations; otherwise a bug
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1410 p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1411 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1412 return 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1413 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1414
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1415 /***********************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1416 *** Pileup iterator ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1417 ***********************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1418
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1419 // Dictionary of overlapping reads
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1420 KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1421 typedef khash_t(olap_hash) olap_hash_t;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1422
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1423 struct __bam_plp_t {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1424 mempool_t *mp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1425 lbnode_t *head, *tail, *dummy;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1426 int32_t tid, pos, max_tid, max_pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1427 int is_eof, max_plp, error, maxcnt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1428 uint64_t id;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1429 bam_pileup1_t *plp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1430 // for the "auto" interface only
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1431 bam1_t *b;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1432 bam_plp_auto_f func;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1433 void *data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1434 olap_hash_t *overlaps;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1435 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1436
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1437 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1438 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1439 bam_plp_t iter;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1440 iter = (bam_plp_t)calloc(1, sizeof(struct __bam_plp_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1441 iter->mp = mp_init();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1442 iter->head = iter->tail = mp_alloc(iter->mp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1443 iter->dummy = mp_alloc(iter->mp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1444 iter->max_tid = iter->max_pos = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1445 iter->maxcnt = 8000;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1446 if (func) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1447 iter->func = func;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1448 iter->data = data;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1449 iter->b = bam_init1();
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1450 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1451 return iter;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1452 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1453
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1454 void bam_plp_init_overlaps(bam_plp_t iter)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1455 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1456 iter->overlaps = kh_init(olap_hash); // hash for tweaking quality of bases in overlapping reads
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1457 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1458
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1459 void bam_plp_destroy(bam_plp_t iter)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1460 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1461 if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1462 mp_free(iter->mp, iter->dummy);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1463 mp_free(iter->mp, iter->head);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1464 if (iter->mp->cnt != 0)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1465 fprintf(stderr, "[bam_plp_destroy] memory leak: %d. Continue anyway.\n", iter->mp->cnt);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1466 mp_destroy(iter->mp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1467 if (iter->b) bam_destroy1(iter->b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1468 free(iter->plp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1469 free(iter);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1470 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1471
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1472
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1473 //---------------------------------
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1474 //--- Tweak overlapping reads
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1475 //---------------------------------
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1476
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1477 /**
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1478 * cigar_iref2iseq_set() - find the first CMATCH setting the ref and the read index
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1479 * cigar_iref2iseq_next() - get the next CMATCH base
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1480 * @cigar: pointer to current cigar block (rw)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1481 * @cigar_max: pointer just beyond the last cigar block
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1482 * @icig: position within the current cigar block (rw)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1483 * @iseq: position in the sequence (rw)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1484 * @iref: position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1485 *
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1486 * Returns BAM_CMATCH or -1 when there is no more cigar to process or the requested position is not covered.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1487 */
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1488 static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1489 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1490 int pos = *iref;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1491 if ( pos < 0 ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1492 *icig = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1493 *iseq = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1494 *iref = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1495 while ( *cigar<cigar_max )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1496 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1497 int cig = (**cigar) & BAM_CIGAR_MASK;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1498 int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1499
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1500 if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1501 if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1502 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1503 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1504 pos -= ncig;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1505 if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1506 (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1507 continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1508 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1509 if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1510 if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1511 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1512 pos -= ncig;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1513 if ( pos<0 ) pos = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1514 (*cigar)++; *icig = 0; *iref += ncig;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1515 continue;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1516 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1517 fprintf(stderr,"todo: cigar %d\n", cig);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1518 assert(0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1519 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1520 *iseq = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1521 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1522 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1523 static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1524 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1525 while ( *cigar < cigar_max )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1526 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1527 int cig = (**cigar) & BAM_CIGAR_MASK;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1528 int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1529
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1530 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1531 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1532 if ( *icig >= ncig - 1 ) { *icig = 0; (*cigar)++; continue; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1533 (*iseq)++; (*icig)++; (*iref)++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1534 return BAM_CMATCH;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1535 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1536 if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1537 if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1538 if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1539 if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1540 fprintf(stderr,"todo: cigar %d\n", cig);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1541 assert(0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1542 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1543 *iseq = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1544 *iref = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1545 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1546 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1547
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1548 static void tweak_overlap_quality(bam1_t *a, bam1_t *b)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1549 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1550 uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1551 uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1552 int a_icig = 0, a_iseq = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1553 int b_icig = 0, b_iseq = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1554 uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1555 uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1556
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1557 int iref = b->core.pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1558 int a_iref = iref - a->core.pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1559 int b_iref = iref - b->core.pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1560 int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1561 if ( a_ret<0 ) return; // no overlap
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1562 int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1563 if ( b_ret<0 ) return; // no overlap
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1564
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1565 #if DBG
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1566 fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %d-%d\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar,
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1567 a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b)));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1568 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1569
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1570 while ( 1 )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1571 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1572 // Increment reference position
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1573 while ( a_iref>=0 && a_iref < iref - a->core.pos )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1574 a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1575 if ( a_ret<0 ) break; // done
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1576 if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1577
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1578 while ( b_iref>=0 && b_iref < iref - b->core.pos )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1579 b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1580 if ( b_ret<0 ) break; // done
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1581 if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1582
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1583 iref++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1584 if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue; // only CMATCH positions, don't know what to do with indels
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1585
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1586 if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1587 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1588 #if DBG
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1589 fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1590 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1591 // we are very confident about this base
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1592 int qual = a_qual[a_iseq] + b_qual[b_iseq];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1593 a_qual[a_iseq] = qual>200 ? 200 : qual;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1594 b_qual[b_iseq] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1595 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1596 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1597 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1598 if ( a_qual[a_iseq] >= b_qual[b_iseq] )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1599 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1600 #if DBG
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1601 fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower(seq_nt16_str[bam_seqi(b_seq,b_iseq)]));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1602 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1603 a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; // not so confident about a_qual anymore given the mismatch
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1604 b_qual[b_iseq] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1605 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1606 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1607 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1608 #if DBG
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1609 fprintf(stderr,"[%c/%c]",tolower(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1610 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1611 b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1612 a_qual[a_iseq] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1613 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1614 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1615 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1616 #if DBG
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1617 fprintf(stderr,"\n");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1618 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1619 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1620
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1621 // Fix overlapping reads. Simple soft-clipping did not give good results.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1622 // Lowering qualities of unwanted bases is more selective and works better.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1623 //
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1624 static void overlap_push(bam_plp_t iter, lbnode_t *node)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1625 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1626 if ( !iter->overlaps ) return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1627
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1628 // mapped mates and paired reads only
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1629 if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1630
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1631 // no overlap possible, unless some wild cigar
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1632 if ( abs(node->b.core.isize) >= 2*node->b.core.l_qseq ) return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1633
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1634 khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1635 if ( kitr==kh_end(iter->overlaps) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1636 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1637 int ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1638 kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1639 kh_value(iter->overlaps, kitr) = node;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1640 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1641 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1642 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1643 lbnode_t *a = kh_value(iter->overlaps, kitr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1644 tweak_overlap_quality(&a->b, &node->b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1645 kh_del(olap_hash, iter->overlaps, kitr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1646 assert(a->end-1 == a->s.end);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1647 a->end = a->b.core.pos + bam_cigar2rlen(a->b.core.n_cigar, bam_get_cigar(&a->b));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1648 a->s.end = a->end - 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1649 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1650 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1651
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1652 static void overlap_remove(bam_plp_t iter, const bam1_t *b)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1653 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1654 if ( !iter->overlaps ) return;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1655
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1656 khiter_t kitr;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1657 if ( b )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1658 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1659 kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1660 if ( kitr!=kh_end(iter->overlaps) )
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1661 kh_del(olap_hash, iter->overlaps, kitr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1662 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1663 else
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1664 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1665 // remove all
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1666 for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1667 if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1668 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1669 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1670
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1671
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1672
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1673 // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1674 // pointer to the piled records if next position is ready or NULL if there is not enough records in the
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1675 // buffer yet (the current position is still the maximum position across all buffered reads).
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1676 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1677 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1678 if (iter->error) { *_n_plp = -1; return 0; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1679 *_n_plp = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1680 if (iter->is_eof && iter->head->next == 0) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1681 while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1682 int n_plp = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1683 lbnode_t *p, *q;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1684 // write iter->plp at iter->pos
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1685 iter->dummy->next = iter->head;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1686 for (p = iter->head, q = iter->dummy; p->next; q = p, p = p->next) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1687 if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1688 overlap_remove(iter, &p->b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1689 q->next = p->next; mp_free(iter->mp, p); p = q;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1690 } else if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1691 if (n_plp == iter->max_plp) { // then double the capacity
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1692 iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1693 iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1694 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1695 iter->plp[n_plp].b = &p->b;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1696 if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1697 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1698 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1699 iter->head = iter->dummy->next; // dummy->next may be changed
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1700 *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1701 // update iter->tid and iter->pos
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1702 if (iter->head->next) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1703 if (iter->tid > iter->head->b.core.tid) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1704 fprintf(stderr, "[%s] unsorted input. Pileup aborts.\n", __func__);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1705 iter->error = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1706 *_n_plp = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1707 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1708 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1709 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1710 if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1711 iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1712 } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1713 iter->pos = iter->head->beg; // jump to the next position
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1714 } else ++iter->pos; // scan contiguously
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1715 // return
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1716 if (n_plp) return iter->plp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1717 if (iter->is_eof && iter->head->next == 0) break;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1718 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1719 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1720 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1721
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1722 int bam_plp_push(bam_plp_t iter, const bam1_t *b)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1723 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1724 if (iter->error) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1725 if (b) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1726 if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1727 // Skip only unmapped reads here, any additional filtering must be done in iter->func
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1728 if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1729 if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1730 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1731 overlap_remove(iter, b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1732 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1733 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1734 bam_copy1(&iter->tail->b, b);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1735 overlap_push(iter, iter->tail);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1736 #ifndef BAM_NO_ID
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1737 iter->tail->b.id = iter->id++;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1738 #endif
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1739 iter->tail->beg = b->core.pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1740 iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1741 iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1742 if (b->core.tid < iter->max_tid) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1743 fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1744 iter->error = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1745 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1746 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1747 if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1748 fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n");
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1749 iter->error = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1750 return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1751 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1752 iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1753 if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1754 iter->tail->next = mp_alloc(iter->mp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1755 iter->tail = iter->tail->next;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1756 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1757 } else iter->is_eof = 1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1758 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1759 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1760
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1761 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1762 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1763 const bam_pileup1_t *plp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1764 if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1765 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1766 else { // no pileup line can be obtained; read alignments
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1767 *_n_plp = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1768 if (iter->is_eof) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1769 int ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1770 while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1771 if (bam_plp_push(iter, iter->b) < 0) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1772 *_n_plp = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1773 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1774 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1775 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1776 // otherwise no pileup line can be returned; read the next alignment.
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1777 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1778 if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1779 bam_plp_push(iter, 0);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1780 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1781 return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1782 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1783 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1784
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1785 void bam_plp_reset(bam_plp_t iter)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1786 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1787 lbnode_t *p, *q;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1788 iter->max_tid = iter->max_pos = -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1789 iter->tid = iter->pos = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1790 iter->is_eof = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1791 for (p = iter->head; p->next;) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1792 overlap_remove(iter, NULL);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1793 q = p->next;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1794 mp_free(iter->mp, p);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1795 p = q;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1796 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1797 iter->head = iter->tail;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1798 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1799
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1800 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1801 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1802 iter->maxcnt = maxcnt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1803 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1804
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1805 /************************
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1806 *** Mpileup iterator ***
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1807 ************************/
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1808
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1809 struct __bam_mplp_t {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1810 int n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1811 uint64_t min, *pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1812 bam_plp_t *iter;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1813 int *n_plp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1814 const bam_pileup1_t **plp;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1815 };
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1816
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1817 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1818 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1819 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1820 bam_mplp_t iter;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1821 iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1822 iter->pos = (uint64_t*)calloc(n, sizeof(uint64_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1823 iter->n_plp = (int*)calloc(n, sizeof(int));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1824 iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1825 iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1826 iter->n = n;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1827 iter->min = (uint64_t)-1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1828 for (i = 0; i < n; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1829 iter->iter[i] = bam_plp_init(func, data[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1830 iter->pos[i] = iter->min;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1831 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1832 return iter;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1833 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1834
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1835 void bam_mplp_init_overlaps(bam_mplp_t iter)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1836 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1837 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1838 for (i = 0; i < iter->n; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1839 bam_plp_init_overlaps(iter->iter[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1840 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1841
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1842 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1843 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1844 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1845 for (i = 0; i < iter->n; ++i)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1846 iter->iter[i]->maxcnt = maxcnt;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1847 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1848
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1849 void bam_mplp_destroy(bam_mplp_t iter)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1850 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1851 int i;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1852 for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1853 free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1854 free(iter);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1855 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1856
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1857 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1858 {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1859 int i, ret = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1860 uint64_t new_min = (uint64_t)-1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1861 for (i = 0; i < iter->n; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1862 if (iter->pos[i] == iter->min) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1863 int tid, pos;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1864 iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1865 if ( iter->iter[i]->error ) return -1;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1866 iter->pos[i] = iter->plp[i] ? (uint64_t)tid<<32 | pos : 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1867 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1868 if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1869 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1870 iter->min = new_min;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1871 if (new_min == (uint64_t)-1) return 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1872 *_tid = new_min>>32; *_pos = (uint32_t)new_min;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1873 for (i = 0; i < iter->n; ++i) {
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1874 if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line"
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1875 n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1876 ++ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1877 } else n_plp[i] = 0, plp[i] = 0;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1878 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1879 return ret;
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1880 }
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1881
dfa3745e5fd8 Uploaded
youngkim
parents:
diff changeset
1882 #endif // ~!defined(BAM_NO_PILEUP)