Mercurial > repos > vipints > rdiff
comparison rDiff/src/tools/read_utils/get_reads_direct.cpp @ 0:0f80a5141704
version 0.3 uploaded
| author | vipints |
|---|---|
| date | Thu, 14 Feb 2013 23:38:36 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:0f80a5141704 |
|---|---|
| 1 /* written by Jonas Behr, Regina Bohnert and Gunnar Raetsch, FML Tuebingen, Germany, 2010 */ | |
| 2 | |
| 3 #include <stdio.h> | |
| 4 #include <assert.h> | |
| 5 #include "sam.h" | |
| 6 #include "get_reads_direct.h" | |
| 7 | |
| 8 #include <vector> | |
| 9 using std::vector; | |
| 10 #include <string> | |
| 11 using std::string; | |
| 12 | |
| 13 #ifdef __cplusplus | |
| 14 extern "C" { | |
| 15 #endif | |
| 16 | |
| 17 typedef struct { | |
| 18 int beg, end; | |
| 19 samfile_t *in; | |
| 20 } tmpstruct_t; | |
| 21 | |
| 22 typedef struct { | |
| 23 uint64_t u, v; | |
| 24 } pair64_t; | |
| 25 | |
| 26 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b) | |
| 27 { | |
| 28 uint32_t rbeg = b->core.pos; | |
| 29 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1; | |
| 30 return (rend > beg && rbeg < end); | |
| 31 } | |
| 32 | |
| 33 pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int* cnt_off); | |
| 34 | |
| 35 int bam_fetch_reads(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_header_t* header, vector<CRead*>* reads, char strand); | |
| 36 | |
| 37 // callback for bam_plbuf_init() | |
| 38 static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) | |
| 39 { | |
| 40 //tmpstruct_t *tmp = (tmpstruct_t*)data; | |
| 41 //if ((int)pos >= tmp->beg && (int)pos < tmp->end) | |
| 42 // printf("%s\t%d\t%d\n", tmp->in->header->target_name[tid], pos + 1, n); | |
| 43 return 0; | |
| 44 } | |
| 45 #ifdef __cplusplus | |
| 46 } | |
| 47 #endif | |
| 48 int parse_sam_line(char* line, CRead* read); | |
| 49 //int set_strand(char c); | |
| 50 //void parse_cigar(bam1_t* b, CRead* read); | |
| 51 | |
| 52 | |
| 53 int get_reads_from_bam(char* filename, char* region, vector<CRead*>* reads, char strand, int lsubsample) | |
| 54 { | |
| 55 subsample = lsubsample; | |
| 56 //set_strand(strand); | |
| 57 | |
| 58 srand (time(NULL)); | |
| 59 //srand (1234); | |
| 60 tmpstruct_t tmp; | |
| 61 tmp.in = samopen(filename, "rb", 0); | |
| 62 if (tmp.in == 0) { | |
| 63 fprintf(stderr, "Fail to open BAM file %s\n", filename); | |
| 64 return 1; | |
| 65 } | |
| 66 int ref; | |
| 67 bam_index_t *idx; | |
| 68 bam_plbuf_t *buf; | |
| 69 idx = bam_index_load(filename); // load BAM index | |
| 70 if (idx == 0) { | |
| 71 fprintf(stderr, "BAM indexing file is not available.\n"); | |
| 72 samclose(tmp.in); | |
| 73 return 1; | |
| 74 } | |
| 75 bam_parse_region(tmp.in->header, region, &ref, | |
| 76 &tmp.beg, &tmp.end); // parse the region | |
| 77 if (ref < 0) { | |
| 78 fprintf(stderr, "Invalid region %s\n", region); | |
| 79 bam_index_destroy(idx); | |
| 80 samclose(tmp.in); | |
| 81 return 1; | |
| 82 } | |
| 83 | |
| 84 buf = bam_plbuf_init(pileup_func, &tmp); // initialize pileup | |
| 85 | |
| 86 bam_fetch_reads(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, buf, tmp.in->header, reads, strand); | |
| 87 //fprintf(stdout, "intron_list: %d \n", intron_list->size()); | |
| 88 | |
| 89 bam_plbuf_push(0, buf); // finalize pileup | |
| 90 bam_index_destroy(idx); | |
| 91 bam_plbuf_destroy(buf); | |
| 92 samclose(tmp.in); | |
| 93 return 0; | |
| 94 } | |
| 95 | |
| 96 | |
| 97 int bam_fetch_reads(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_header_t* header, vector<CRead*>* reads, char strand) | |
| 98 { | |
| 99 int n_off; | |
| 100 pair64_t *off = get_chunk_coordinates(idx, tid, beg, end, &n_off); | |
| 101 if (off == 0) return 0; | |
| 102 { | |
| 103 // retrive alignments | |
| 104 uint64_t curr_off; | |
| 105 int i, ret, n_seeks; | |
| 106 n_seeks = 0; i = -1; curr_off = 0; | |
| 107 bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t)); | |
| 108 for (;;) { | |
| 109 if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk | |
| 110 if (i == n_off - 1) break; // no more chunks | |
| 111 if (i >= 0) assert(curr_off == off[i].v); // otherwise bug | |
| 112 if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek | |
| 113 bam_seek(fp, off[i+1].u, SEEK_SET); | |
| 114 curr_off = bam_tell(fp); | |
| 115 ++n_seeks; | |
| 116 } | |
| 117 ++i; | |
| 118 } | |
| 119 if ((ret = bam_read1(fp, b)) > 0) { | |
| 120 curr_off = bam_tell(fp); | |
| 121 if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed | |
| 122 else if (is_overlap(beg, end, b)) | |
| 123 { | |
| 124 int rr = rand(); | |
| 125 if ((rr%1000 < subsample)) | |
| 126 { | |
| 127 CRead* read = new CRead(); | |
| 128 parse_cigar(b, read, header); | |
| 129 | |
| 130 if (strand == '0' || strand==read->strand[0] || read->strand[0]=='0') | |
| 131 { | |
| 132 reads->push_back(read); | |
| 133 } | |
| 134 else | |
| 135 { | |
| 136 delete read; | |
| 137 } | |
| 138 //else if (read->strand[0]=='0'&&((b->core.flag & g_flag_off) >0)) | |
| 139 //{ | |
| 140 // //fprintf(stdout, "(-)-strand; read->strand[0]==0, num_exons: %i \n", read->block_starts.size()); | |
| 141 // // this flag means that the read has been reversed for alignment | |
| 142 // // flag bit set and (-)-strand requested | |
| 143 // reads->push_back(read); | |
| 144 //} | |
| 145 //else if (read->strand[0]=='0'&&(g_flag_on>0&&(b->core.flag & g_flag_on)==0)) | |
| 146 //{ | |
| 147 // //fprintf(stdout, "(+)-strand; read->strand[0]==0, num_exons: %i \n", read->block_starts.size()); | |
| 148 // // (+)-strand requested and flag bit not set | |
| 149 // reads->push_back(read); | |
| 150 //} | |
| 151 } | |
| 152 } | |
| 153 } else break; // end of file | |
| 154 } | |
| 155 // fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks); | |
| 156 bam_destroy1(b); | |
| 157 } | |
| 158 free(off); | |
| 159 return 0; | |
| 160 } | |
| 161 | |
| 162 void parse_cigar(bam1_t* b, CRead* read, bam_header_t* header) | |
| 163 { | |
| 164 read->left = (b->core.flag & left_flag_mask) >0; | |
| 165 read->right = (b->core.flag & right_flag_mask) >0; | |
| 166 read->reverse = (b->core.flag & reverse_flag_mask) >0; | |
| 167 | |
| 168 read->start_pos = b->core.pos+1; | |
| 169 read->set_strand('0'); | |
| 170 read->read_id = new char[1000]; | |
| 171 //sprintf(read->read_id, "%s\0", bam1_qname(b)); | |
| 172 sprintf(read->read_id, "%s", bam1_qname(b)); | |
| 173 | |
| 174 for (int k = 0; k < b->core.n_cigar; ++k) | |
| 175 { | |
| 176 int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation | |
| 177 int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length | |
| 178 //int op = bam_cigar_op(bam1_cigar(b)[k]); // operation | |
| 179 //int l = bam_cigar_oplen(bam1_cigar(b)[k]); // length | |
| 180 //fprintf(stdout, "op:%d l:%d\n", op, l); | |
| 181 if (op == BAM_CMATCH) | |
| 182 { | |
| 183 if (k==0) | |
| 184 { | |
| 185 read->block_lengths.push_back(l); | |
| 186 read->block_starts.push_back(0); | |
| 187 } | |
| 188 else | |
| 189 { | |
| 190 int op_prev = bam1_cigar(b)[k-1] & BAM_CIGAR_MASK; | |
| 191 int l_prev = bam1_cigar(b)[k-1] >> BAM_CIGAR_SHIFT; | |
| 192 if (op_prev==BAM_CREF_SKIP)// intron before | |
| 193 { | |
| 194 if (read->block_lengths.size()>=1) | |
| 195 { | |
| 196 int last_block_start = (*(read->block_starts.end()-1)); | |
| 197 int intron_start = last_block_start+(*(read->block_lengths.end()-1)); | |
| 198 read->block_lengths.push_back(l); | |
| 199 read->block_starts.push_back(intron_start+l_prev); | |
| 200 } | |
| 201 else | |
| 202 { | |
| 203 // start of first block was not a match | |
| 204 read->block_lengths.push_back(l); | |
| 205 read->block_starts.push_back(0); | |
| 206 } | |
| 207 } | |
| 208 else | |
| 209 { | |
| 210 if (read->block_lengths.size()>=1) | |
| 211 (*(read->block_lengths.end()-1))+=l; | |
| 212 else | |
| 213 { | |
| 214 read->block_lengths.push_back(l); | |
| 215 read->block_starts.push_back(0); | |
| 216 } | |
| 217 } | |
| 218 } | |
| 219 } | |
| 220 else if (op == BAM_CDEL) | |
| 221 { | |
| 222 if (k>0 && read->block_lengths.size()>=1) | |
| 223 (*(read->block_lengths.end()-1))+=l; | |
| 224 } | |
| 225 else if (op == BAM_CREF_SKIP)//intron | |
| 226 {} | |
| 227 else if (op == BAM_CINS) | |
| 228 {} | |
| 229 else if (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP) | |
| 230 { | |
| 231 read->is_clipped = true; | |
| 232 } | |
| 233 } | |
| 234 // parse auxiliary data | |
| 235 uint8_t* s = bam1_aux(b); | |
| 236 uint8_t* end = b->data + b->data_len; | |
| 237 while (s < end) | |
| 238 { | |
| 239 uint8_t type, key[2]; | |
| 240 key[0] = s[0]; key[1] = s[1]; | |
| 241 s += 2; type = *s; ++s; | |
| 242 //fprintf(stdout, "\n%c%c:%c\n", key[0], key[1], type); | |
| 243 if (type == 'A') | |
| 244 { | |
| 245 if ( key[0] =='X' && key[1] == 'S') | |
| 246 { | |
| 247 read->set_strand((char) *s); | |
| 248 } | |
| 249 ++s; | |
| 250 } | |
| 251 else if (type == 'C') | |
| 252 { | |
| 253 if ( key[0] =='H' && key[1] == '0') | |
| 254 { | |
| 255 uint8_t matches = *s; | |
| 256 read->matches = (int) matches; | |
| 257 } | |
| 258 if ( key[0] =='N' && key[1] == 'M') | |
| 259 { | |
| 260 uint8_t mismatches = *s; | |
| 261 read->mismatches = (int) mismatches; | |
| 262 } | |
| 263 if ( key[0] =='H' && key[1] == 'I') | |
| 264 { | |
| 265 uint8_t mai = *s; | |
| 266 read->multiple_alignment_index = (int) mai; | |
| 267 } | |
| 268 | |
| 269 ++s; | |
| 270 } | |
| 271 else if (type == 'c') { ++s; } | |
| 272 else if (type == 'S') { s += 2; } | |
| 273 else if (type == 's') { s += 2; } | |
| 274 else if (type == 'I') { s += 4; } | |
| 275 else if (type == 'i') { s += 4; } | |
| 276 else if (type == 'f') { s += 4; } | |
| 277 else if (type == 'd') { s += 8; } | |
| 278 else if (type == 'Z') { ++s; } | |
| 279 else if (type == 'H') { ++s; } | |
| 280 } | |
| 281 } | |
| 282 | |
| 283 //int set_strand(char c) | |
| 284 //{ | |
| 285 // if (c=='+') | |
| 286 // { | |
| 287 // char* fl = (char*) "0x0010"; | |
| 288 // g_flag_on = strtol(fl, 0, 0); | |
| 289 // g_flag_off = 0; | |
| 290 // } | |
| 291 // else if (c=='-') | |
| 292 // { | |
| 293 // char* fl = (char*) "0x0010"; | |
| 294 // g_flag_off = strtol(fl, 0, 0); | |
| 295 // g_flag_on = 0; | |
| 296 // } | |
| 297 // return 0; | |
| 298 //} | |
| 299 |
