0
|
1 /* written by Jonas Behr, Regina Bohnert and Gunnar Raetsch, FML Tuebingen, Germany, 2010 */
|
|
2
|
|
3 #include <stdio.h>
|
|
4 #include <assert.h>
|
|
5 #include "sam.h"
|
|
6 #include "get_reads_direct.h"
|
|
7
|
|
8 #include <vector>
|
|
9 using std::vector;
|
|
10 #include <string>
|
|
11 using std::string;
|
|
12
|
|
13 #ifdef __cplusplus
|
|
14 extern "C" {
|
|
15 #endif
|
|
16
|
|
17 typedef struct {
|
|
18 int beg, end;
|
|
19 samfile_t *in;
|
|
20 } tmpstruct_t;
|
|
21
|
|
22 typedef struct {
|
|
23 uint64_t u, v;
|
|
24 } pair64_t;
|
|
25
|
|
26 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
|
|
27 {
|
|
28 uint32_t rbeg = b->core.pos;
|
|
29 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
|
|
30 return (rend > beg && rbeg < end);
|
|
31 }
|
|
32
|
|
33 pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int* cnt_off);
|
|
34
|
|
35 int bam_fetch_reads(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_header_t* header, vector<CRead*>* reads, char strand);
|
|
36
|
|
37 // callback for bam_plbuf_init()
|
|
38 static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
|
|
39 {
|
|
40 //tmpstruct_t *tmp = (tmpstruct_t*)data;
|
|
41 //if ((int)pos >= tmp->beg && (int)pos < tmp->end)
|
|
42 // printf("%s\t%d\t%d\n", tmp->in->header->target_name[tid], pos + 1, n);
|
|
43 return 0;
|
|
44 }
|
|
45 #ifdef __cplusplus
|
|
46 }
|
|
47 #endif
|
|
48 int parse_sam_line(char* line, CRead* read);
|
|
49 //int set_strand(char c);
|
|
50 //void parse_cigar(bam1_t* b, CRead* read);
|
|
51
|
|
52
|
|
53 int get_reads_from_bam(char* filename, char* region, vector<CRead*>* reads, char strand, int lsubsample)
|
|
54 {
|
|
55 subsample = lsubsample;
|
|
56 //set_strand(strand);
|
|
57
|
|
58 srand (time(NULL));
|
|
59 //srand (1234);
|
|
60 tmpstruct_t tmp;
|
|
61 tmp.in = samopen(filename, "rb", 0);
|
|
62 if (tmp.in == 0) {
|
|
63 fprintf(stderr, "Fail to open BAM file %s\n", filename);
|
|
64 return 1;
|
|
65 }
|
|
66 int ref;
|
|
67 bam_index_t *idx;
|
|
68 bam_plbuf_t *buf;
|
|
69 idx = bam_index_load(filename); // load BAM index
|
|
70 if (idx == 0) {
|
|
71 fprintf(stderr, "BAM indexing file is not available.\n");
|
|
72 samclose(tmp.in);
|
|
73 return 1;
|
|
74 }
|
|
75 bam_parse_region(tmp.in->header, region, &ref,
|
|
76 &tmp.beg, &tmp.end); // parse the region
|
|
77 if (ref < 0) {
|
|
78 fprintf(stderr, "Invalid region %s\n", region);
|
|
79 bam_index_destroy(idx);
|
|
80 samclose(tmp.in);
|
|
81 return 1;
|
|
82 }
|
|
83
|
|
84 buf = bam_plbuf_init(pileup_func, &tmp); // initialize pileup
|
|
85
|
|
86 bam_fetch_reads(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, buf, tmp.in->header, reads, strand);
|
|
87 //fprintf(stdout, "intron_list: %d \n", intron_list->size());
|
|
88
|
|
89 bam_plbuf_push(0, buf); // finalize pileup
|
|
90 bam_index_destroy(idx);
|
|
91 bam_plbuf_destroy(buf);
|
|
92 samclose(tmp.in);
|
|
93 return 0;
|
|
94 }
|
|
95
|
|
96
|
|
97 int bam_fetch_reads(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_header_t* header, vector<CRead*>* reads, char strand)
|
|
98 {
|
|
99 int n_off;
|
|
100 pair64_t *off = get_chunk_coordinates(idx, tid, beg, end, &n_off);
|
|
101 if (off == 0) return 0;
|
|
102 {
|
|
103 // retrive alignments
|
|
104 uint64_t curr_off;
|
|
105 int i, ret, n_seeks;
|
|
106 n_seeks = 0; i = -1; curr_off = 0;
|
|
107 bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
|
|
108 for (;;) {
|
|
109 if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
|
|
110 if (i == n_off - 1) break; // no more chunks
|
|
111 if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
|
|
112 if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
|
|
113 bam_seek(fp, off[i+1].u, SEEK_SET);
|
|
114 curr_off = bam_tell(fp);
|
|
115 ++n_seeks;
|
|
116 }
|
|
117 ++i;
|
|
118 }
|
|
119 if ((ret = bam_read1(fp, b)) > 0) {
|
|
120 curr_off = bam_tell(fp);
|
|
121 if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
|
|
122 else if (is_overlap(beg, end, b))
|
|
123 {
|
|
124 int rr = rand();
|
|
125 if ((rr%1000 < subsample))
|
|
126 {
|
|
127 CRead* read = new CRead();
|
|
128 parse_cigar(b, read, header);
|
|
129
|
|
130 if (strand == '0' || strand==read->strand[0] || read->strand[0]=='0')
|
|
131 {
|
|
132 reads->push_back(read);
|
|
133 }
|
|
134 else
|
|
135 {
|
|
136 delete read;
|
|
137 }
|
|
138 //else if (read->strand[0]=='0'&&((b->core.flag & g_flag_off) >0))
|
|
139 //{
|
|
140 // //fprintf(stdout, "(-)-strand; read->strand[0]==0, num_exons: %i \n", read->block_starts.size());
|
|
141 // // this flag means that the read has been reversed for alignment
|
|
142 // // flag bit set and (-)-strand requested
|
|
143 // reads->push_back(read);
|
|
144 //}
|
|
145 //else if (read->strand[0]=='0'&&(g_flag_on>0&&(b->core.flag & g_flag_on)==0))
|
|
146 //{
|
|
147 // //fprintf(stdout, "(+)-strand; read->strand[0]==0, num_exons: %i \n", read->block_starts.size());
|
|
148 // // (+)-strand requested and flag bit not set
|
|
149 // reads->push_back(read);
|
|
150 //}
|
|
151 }
|
|
152 }
|
|
153 } else break; // end of file
|
|
154 }
|
|
155 // fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);
|
|
156 bam_destroy1(b);
|
|
157 }
|
|
158 free(off);
|
|
159 return 0;
|
|
160 }
|
|
161
|
|
162 void parse_cigar(bam1_t* b, CRead* read, bam_header_t* header)
|
|
163 {
|
|
164 read->left = (b->core.flag & left_flag_mask) >0;
|
|
165 read->right = (b->core.flag & right_flag_mask) >0;
|
|
166 read->reverse = (b->core.flag & reverse_flag_mask) >0;
|
|
167
|
|
168 read->start_pos = b->core.pos+1;
|
|
169 read->set_strand('0');
|
|
170 read->read_id = new char[1000];
|
|
171 //sprintf(read->read_id, "%s\0", bam1_qname(b));
|
|
172 sprintf(read->read_id, "%s", bam1_qname(b));
|
|
173
|
|
174 for (int k = 0; k < b->core.n_cigar; ++k)
|
|
175 {
|
|
176 int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation
|
|
177 int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length
|
|
178 //int op = bam_cigar_op(bam1_cigar(b)[k]); // operation
|
|
179 //int l = bam_cigar_oplen(bam1_cigar(b)[k]); // length
|
|
180 //fprintf(stdout, "op:%d l:%d\n", op, l);
|
|
181 if (op == BAM_CMATCH)
|
|
182 {
|
|
183 if (k==0)
|
|
184 {
|
|
185 read->block_lengths.push_back(l);
|
|
186 read->block_starts.push_back(0);
|
|
187 }
|
|
188 else
|
|
189 {
|
|
190 int op_prev = bam1_cigar(b)[k-1] & BAM_CIGAR_MASK;
|
|
191 int l_prev = bam1_cigar(b)[k-1] >> BAM_CIGAR_SHIFT;
|
|
192 if (op_prev==BAM_CREF_SKIP)// intron before
|
|
193 {
|
|
194 if (read->block_lengths.size()>=1)
|
|
195 {
|
|
196 int last_block_start = (*(read->block_starts.end()-1));
|
|
197 int intron_start = last_block_start+(*(read->block_lengths.end()-1));
|
|
198 read->block_lengths.push_back(l);
|
|
199 read->block_starts.push_back(intron_start+l_prev);
|
|
200 }
|
|
201 else
|
|
202 {
|
|
203 // start of first block was not a match
|
|
204 read->block_lengths.push_back(l);
|
|
205 read->block_starts.push_back(0);
|
|
206 }
|
|
207 }
|
|
208 else
|
|
209 {
|
|
210 if (read->block_lengths.size()>=1)
|
|
211 (*(read->block_lengths.end()-1))+=l;
|
|
212 else
|
|
213 {
|
|
214 read->block_lengths.push_back(l);
|
|
215 read->block_starts.push_back(0);
|
|
216 }
|
|
217 }
|
|
218 }
|
|
219 }
|
|
220 else if (op == BAM_CDEL)
|
|
221 {
|
|
222 if (k>0 && read->block_lengths.size()>=1)
|
|
223 (*(read->block_lengths.end()-1))+=l;
|
|
224 }
|
|
225 else if (op == BAM_CREF_SKIP)//intron
|
|
226 {}
|
|
227 else if (op == BAM_CINS)
|
|
228 {}
|
|
229 else if (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP)
|
|
230 {
|
|
231 read->is_clipped = true;
|
|
232 }
|
|
233 }
|
|
234 // parse auxiliary data
|
|
235 uint8_t* s = bam1_aux(b);
|
|
236 uint8_t* end = b->data + b->data_len;
|
|
237 while (s < end)
|
|
238 {
|
|
239 uint8_t type, key[2];
|
|
240 key[0] = s[0]; key[1] = s[1];
|
|
241 s += 2; type = *s; ++s;
|
|
242 //fprintf(stdout, "\n%c%c:%c\n", key[0], key[1], type);
|
|
243 if (type == 'A')
|
|
244 {
|
|
245 if ( key[0] =='X' && key[1] == 'S')
|
|
246 {
|
|
247 read->set_strand((char) *s);
|
|
248 }
|
|
249 ++s;
|
|
250 }
|
|
251 else if (type == 'C')
|
|
252 {
|
|
253 if ( key[0] =='H' && key[1] == '0')
|
|
254 {
|
|
255 uint8_t matches = *s;
|
|
256 read->matches = (int) matches;
|
|
257 }
|
|
258 if ( key[0] =='N' && key[1] == 'M')
|
|
259 {
|
|
260 uint8_t mismatches = *s;
|
|
261 read->mismatches = (int) mismatches;
|
|
262 }
|
|
263 if ( key[0] =='H' && key[1] == 'I')
|
|
264 {
|
|
265 uint8_t mai = *s;
|
|
266 read->multiple_alignment_index = (int) mai;
|
|
267 }
|
|
268
|
|
269 ++s;
|
|
270 }
|
|
271 else if (type == 'c') { ++s; }
|
|
272 else if (type == 'S') { s += 2; }
|
|
273 else if (type == 's') { s += 2; }
|
|
274 else if (type == 'I') { s += 4; }
|
|
275 else if (type == 'i') { s += 4; }
|
|
276 else if (type == 'f') { s += 4; }
|
|
277 else if (type == 'd') { s += 8; }
|
|
278 else if (type == 'Z') { ++s; }
|
|
279 else if (type == 'H') { ++s; }
|
|
280 }
|
|
281 }
|
|
282
|
|
283 //int set_strand(char c)
|
|
284 //{
|
|
285 // if (c=='+')
|
|
286 // {
|
|
287 // char* fl = (char*) "0x0010";
|
|
288 // g_flag_on = strtol(fl, 0, 0);
|
|
289 // g_flag_off = 0;
|
|
290 // }
|
|
291 // else if (c=='-')
|
|
292 // {
|
|
293 // char* fl = (char*) "0x0010";
|
|
294 // g_flag_off = strtol(fl, 0, 0);
|
|
295 // g_flag_on = 0;
|
|
296 // }
|
|
297 // return 0;
|
|
298 //}
|
|
299
|