Mercurial > repos > vipints > rdiff
comparison rDiff/src/tools/read_utils/get_reads.cpp @ 0:0f80a5141704
version 0.3 uploaded
| author | vipints |
|---|---|
| date | Thu, 14 Feb 2013 23:38:36 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:0f80a5141704 |
|---|---|
| 1 /* written by Jonas Behr, Regina Bohnert and Gunnar Raetsch, FML Tuebingen, Germany, 2010 */ | |
| 2 | |
| 3 #include <stdio.h> | |
| 4 #include <string.h> | |
| 5 #include <signal.h> | |
| 6 #include <mex.h> | |
| 7 #include <algorithm> | |
| 8 #include <vector> | |
| 9 using std::vector; | |
| 10 #include "get_reads_direct.h" | |
| 11 #include "mex_input.h" | |
| 12 #include "read.h" | |
| 13 | |
| 14 #define MAXLINE 10000 | |
| 15 | |
| 16 /* | |
| 17 * input: | |
| 18 * 1 bam file | |
| 19 * 2 chromosome | |
| 20 * 3 region start (1-based index) | |
| 21 * 4 region end (1-based index) | |
| 22 * 5 strand (either '+' or '-' or '0') | |
| 23 * [6] collapse flag: if true the reads are collapsed to a coverage track | |
| 24 * [7] subsample percentage: percentage of reads to be subsampled (in per mill) | |
| 25 * [8] intron length filter | |
| 26 * [9] exon length filter | |
| 27 * [10] mismatch filter | |
| 28 * [11] bool: use mapped reads for coverage | |
| 29 * [12] bool: use spliced reads for coverage | |
| 30 * [13] return maxminlen | |
| 31 * [14] return pair coverage and pair index list | |
| 32 * [15] only_clipped | |
| 33 * [15] switch of pair filter | |
| 34 * | |
| 35 * output: | |
| 36 * 1 coverage | |
| 37 * [2] intron cell array | |
| 38 * [3] pair coverage | |
| 39 * [4] pair list | |
| 40 * | |
| 41 * example call: | |
| 42 * [cov introns] = get_reads('polyA_left_I+_el15_mm1_spliced.bam', 'I', 10000, 12000, '-', 1, 30); | |
| 43 */ | |
| 44 void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { | |
| 45 | |
| 46 if (nrhs<5 || nrhs>16 || (nlhs<1 || nlhs>4)) { | |
| 47 fprintf(stderr, "usage: [x [introns] [pair]] = get_reads(fname, chr, start, end, strand, [collapse], [subsample], [max intron length], [min exonlength], [max mismatches], [mapped], [spliced], [maxminlen], [pair], [only clipped], [all pairs]);\n"); | |
| 48 return; | |
| 49 } | |
| 50 | |
| 51 /* obligatory arguments | |
| 52 * **********************/ | |
| 53 char *fname = get_string(prhs[0]); | |
| 54 //fprintf(stdout, "arg1: %s\n", fname); | |
| 55 char *chr = get_string(prhs[1]); | |
| 56 //fprintf(stdout, "arg2: %s\n", chr); | |
| 57 int from_pos = get_int(prhs[2]); | |
| 58 //fprintf(stdout, "arg3: %d\n", from_pos); | |
| 59 int to_pos = get_int(prhs[3]); | |
| 60 //fprintf(stdout, "arg4: %d\n", to_pos); | |
| 61 char *strand = get_string(prhs[4]); | |
| 62 //fprintf(stdout, "arg5: %s\n", strand); | |
| 63 | |
| 64 if (from_pos>to_pos) | |
| 65 mexErrMsgTxt("Start (arg 3) must be <= end (arg 4)\n"); | |
| 66 | |
| 67 if (strand[0]!='+' && strand[0]!='-' && strand[0]!='0') | |
| 68 mexErrMsgTxt("Unknown strand (arg 5): either + or - or 0"); | |
| 69 | |
| 70 /* optional arguments | |
| 71 * ******************/ | |
| 72 int collapse = 0; | |
| 73 if (nrhs>=6) | |
| 74 collapse = get_int(prhs[5]); | |
| 75 | |
| 76 int subsample = 1000; | |
| 77 if (nrhs>=7) | |
| 78 subsample = get_int(prhs[6]); | |
| 79 | |
| 80 int intron_len_filter = 1e9; | |
| 81 if (nrhs>=8) | |
| 82 intron_len_filter = get_int(prhs[7]); | |
| 83 | |
| 84 int exon_len_filter = -1; | |
| 85 if (nrhs>=9) | |
| 86 exon_len_filter = get_int(prhs[8]); | |
| 87 | |
| 88 int filter_mismatch = 1e9; | |
| 89 if (nrhs>=10) | |
| 90 filter_mismatch = get_int(prhs[9]); | |
| 91 | |
| 92 int mapped = 1; | |
| 93 if (nrhs>=11) | |
| 94 mapped = get_int(prhs[10]); | |
| 95 | |
| 96 int spliced = 1; | |
| 97 if (nrhs>=12) | |
| 98 spliced = get_int(prhs[11]); | |
| 99 | |
| 100 int maxminlen = 0; | |
| 101 if (nrhs>=13) | |
| 102 maxminlen = get_int(prhs[12]); | |
| 103 | |
| 104 int pair_cov = 0; | |
| 105 if (nrhs>=14) | |
| 106 pair_cov = get_int(prhs[13]); | |
| 107 | |
| 108 int only_clipped = 0; | |
| 109 if (nrhs>=15) | |
| 110 only_clipped = get_int(prhs[14]); | |
| 111 | |
| 112 int no_pair_filter = 0; | |
| 113 if (nrhs>=16) | |
| 114 no_pair_filter = get_int(prhs[15]); | |
| 115 | |
| 116 | |
| 117 /* call function to get reads | |
| 118 * **************************/ | |
| 119 char region[MAXLINE]; | |
| 120 sprintf(region, "%s:%i-%i", chr, from_pos, to_pos); | |
| 121 | |
| 122 vector<CRead*> all_reads; | |
| 123 | |
| 124 get_reads_from_bam(fname, region, &all_reads, strand[0], subsample); | |
| 125 | |
| 126 for (int i=0; i<all_reads.size(); i++) { | |
| 127 all_reads[i]->strip_leftright_tag() ; | |
| 128 } | |
| 129 | |
| 130 /* filter reads | |
| 131 * **************/ | |
| 132 int left = 0; | |
| 133 int right = 0; | |
| 134 | |
| 135 vector<CRead*> reads; | |
| 136 for (int i=0; i<all_reads.size(); i++) { | |
| 137 if (all_reads[i]->left) | |
| 138 left++; | |
| 139 if (all_reads[i]->right) | |
| 140 right++; | |
| 141 //if (all_reads[i]->max_intron_len()<intron_len_filter && all_reads[i]->min_exon_len()>exon_len_filter && all_reads[i]->get_mismatches()<=filter_mismatch && all_reads[i]->multiple_alignment_index==0) | |
| 142 if (all_reads[i]->max_intron_len()<intron_len_filter && all_reads[i]->min_exon_len()>exon_len_filter && all_reads[i]->get_mismatches()<=filter_mismatch && (only_clipped==0 || all_reads[i]->is_clipped)) | |
| 143 reads.push_back(all_reads[i]); | |
| 144 } | |
| 145 | |
| 146 | |
| 147 /* prepare output | |
| 148 * **************/ | |
| 149 int num_rows = reads.size(); | |
| 150 int num_pos = to_pos-from_pos+1; | |
| 151 | |
| 152 if (pair_cov==1 && nlhs>=3) { | |
| 153 // sort reads by read_id | |
| 154 //printf("\n\nleft:%i right:%i \n\n", left, right); | |
| 155 //printf("\nreads[0]->read_id: %s\n", reads[0]->read_id); | |
| 156 sort(reads.begin(), reads.end(), CRead::compare_by_read_id); | |
| 157 } | |
| 158 | |
| 159 // read coverages collapsed | |
| 160 if (collapse) { | |
| 161 plhs[0] = mxCreateNumericMatrix(1, num_pos, mxUINT32_CLASS, mxREAL); | |
| 162 uint32_t *mask_ret = (uint32_t*) mxGetData(plhs[0]); | |
| 163 if (num_pos>0 && mask_ret==NULL) | |
| 164 mexErrMsgTxt("Error allocating memory\n"); | |
| 165 if (mapped && spliced) { | |
| 166 for (int i=0; i<reads.size(); i++) { | |
| 167 reads[i]->get_coverage(from_pos, to_pos, mask_ret); | |
| 168 } | |
| 169 } else { | |
| 170 for (int i=0; i<reads.size(); i++) { | |
| 171 ssize_t num_exons = reads[i]->block_starts.size(); | |
| 172 if ((num_exons==1 && mapped) || (num_exons>1 && spliced)) | |
| 173 reads[i]->get_coverage(from_pos, to_pos, mask_ret); | |
| 174 } | |
| 175 } | |
| 176 } | |
| 177 // reads not collapsed | |
| 178 else { | |
| 179 uint32_t nzmax = 0; // maximal number of nonzero elements | |
| 180 int len = to_pos-from_pos+1; | |
| 181 for (uint i=0; i<reads.size(); i++) | |
| 182 { | |
| 183 ssize_t num_exons = reads[i]->block_starts.size(); | |
| 184 if (!((mapped && spliced) || (num_exons==1 && mapped) || (num_exons>1 && spliced))) | |
| 185 { | |
| 186 continue; | |
| 187 } | |
| 188 for (uint n = 0; n < reads[i]->block_starts.size(); n++) | |
| 189 { | |
| 190 uint32_t from, to; | |
| 191 if (reads[i]->block_starts[n]+reads[i]->start_pos-from_pos >= 0) | |
| 192 from = reads[i]->block_starts[n]+reads[i]->start_pos-from_pos; | |
| 193 else | |
| 194 from = 0; | |
| 195 if (reads[i]->block_starts[n]+reads[i]->start_pos-from_pos+reads[i]->block_lengths[n] >= 0) | |
| 196 to = reads[i]->block_starts[n]+reads[i]->start_pos-from_pos+reads[i]->block_lengths[n]; | |
| 197 else | |
| 198 to = 0; | |
| 199 for (int bp=from; bp<to&bp<len; bp++) | |
| 200 { | |
| 201 nzmax++; | |
| 202 } | |
| 203 } | |
| 204 } | |
| 205 // 1st row: row indices | |
| 206 // 2nd row: column indices | |
| 207 plhs[0] = mxCreateDoubleMatrix(2, nzmax, mxREAL); | |
| 208 double *mask_ret = (double*) mxGetData(plhs[0]); | |
| 209 if (nzmax>0 && mask_ret==NULL) | |
| 210 mexErrMsgTxt("Error allocating memory\n"); | |
| 211 uint32_t mask_ret_c = 0; // counter | |
| 212 for (uint i=0; i<reads.size(); i++) | |
| 213 { | |
| 214 ssize_t num_exons = reads[i]->block_starts.size(); | |
| 215 if (!((mapped && spliced) || (num_exons==1 && mapped) || (num_exons>1 && spliced))) | |
| 216 { | |
| 217 continue; | |
| 218 } | |
| 219 reads[i]->get_reads_sparse(from_pos, to_pos, mask_ret, mask_ret_c, i); | |
| 220 } | |
| 221 if (mask_ret_c!=2*nzmax) | |
| 222 mexErrMsgTxt("Error filling index arrays for sparse matrix\n"); | |
| 223 } | |
| 224 // introns | |
| 225 if (maxminlen==0 && nlhs>=2) { | |
| 226 vector<int> intron_list; | |
| 227 for (int i=0; i<reads.size(); i++) { | |
| 228 reads[i]->get_introns(&intron_list); | |
| 229 } | |
| 230 | |
| 231 plhs[1] = mxCreateNumericMatrix(2, intron_list.size()/2, mxUINT32_CLASS, mxREAL); | |
| 232 uint32_t *p_intron_list = (uint32_t*) mxGetData(plhs[1]); | |
| 233 for (int p = 0; p<intron_list.size(); p++) { | |
| 234 p_intron_list[p] = intron_list[p]; | |
| 235 } | |
| 236 intron_list.clear(); | |
| 237 } else if (nlhs>=2) { | |
| 238 vector<uint32_t> intron_starts; | |
| 239 vector<uint32_t> intron_ends; | |
| 240 vector<uint32_t> block_len1; | |
| 241 vector<uint32_t> block_len2; | |
| 242 for (int i=0; i<reads.size(); i++) { | |
| 243 reads[i]->get_introns(&intron_starts, &intron_ends, &block_len1, &block_len2); | |
| 244 } | |
| 245 | |
| 246 plhs[1] = mxCreateNumericMatrix(4, intron_starts.size(), mxINT32_CLASS, mxREAL); | |
| 247 uint32_t *p_intron_list = (uint32_t*) mxGetData(plhs[1]); | |
| 248 for (int p = 0; p<intron_starts.size(); p++) { | |
| 249 p_intron_list[4*p] = intron_starts[p]; | |
| 250 p_intron_list[(4*p)+1] = intron_ends[p]; | |
| 251 p_intron_list[(4*p)+2] = block_len1[p]; | |
| 252 p_intron_list[(4*p)+3] = block_len2[p]; | |
| 253 } | |
| 254 intron_starts.clear() ; | |
| 255 intron_ends.clear() ; | |
| 256 block_len1.clear() ; | |
| 257 block_len2.clear() ; | |
| 258 } | |
| 259 if (pair_cov==1 && nlhs>=3) { | |
| 260 plhs[2] = mxCreateNumericMatrix(1, num_pos, mxUINT32_CLASS, mxREAL); | |
| 261 uint32_t *p_pair_map = (uint32_t*) mxGetData(plhs[2]); | |
| 262 if (num_pos>0 && p_pair_map==NULL) | |
| 263 mexErrMsgTxt("Error allocating memory\n"); | |
| 264 | |
| 265 vector<int> pair_ids; | |
| 266 | |
| 267 int take_cnt = 0; | |
| 268 int discard_cnt = 0; | |
| 269 int discard_strand_cnt = 0; | |
| 270 //printf("reads.size(): %i\n", reads.size()); | |
| 271 // find consecutive reads with the same id | |
| 272 for (int i=0; i<((int) reads.size())-1; i++) | |
| 273 { | |
| 274 //printf("reads[%i]->read_id: %s\n", i, reads[i]->read_id); | |
| 275 int j = i+1; | |
| 276 while(j<reads.size() && strcmp(reads[i]->read_id, reads[j]->read_id) == 0) | |
| 277 { | |
| 278 if ((reads[i]->left && reads[j]->right) || (reads[j]->left && reads[i]->right) && (reads[i]->reverse != reads[j]->reverse)) | |
| 279 { | |
| 280 if (reads[i]->get_last_position()==-1 || reads[j]->get_last_position()==-1) | |
| 281 break; | |
| 282 if (false)//(reads[i]->strand[0]=='0' && reads[j]->strand[0]=='0' ) | |
| 283 { | |
| 284 // discard pairs without strand information | |
| 285 discard_strand_cnt++; | |
| 286 } | |
| 287 else if (reads[i]->get_last_position()<reads[j]->start_pos && reads[j]->start_pos-reads[i]->get_last_position()<60000) | |
| 288 { | |
| 289 int from = std::max(0, reads[i]->get_last_position()-from_pos); | |
| 290 int to = std::min(num_pos-1, reads[j]->start_pos-from_pos); | |
| 291 pair_ids.push_back(i); | |
| 292 pair_ids.push_back(j); | |
| 293 for (int k=from; k<to; k++) | |
| 294 p_pair_map[k]++; | |
| 295 take_cnt++; | |
| 296 } | |
| 297 else if (reads[i]->start_pos>reads[j]->get_last_position() && reads[j]->get_last_position()-reads[i]->start_pos<60000) | |
| 298 { | |
| 299 int from = std::max(0, reads[j]->get_last_position()-from_pos); | |
| 300 int to = std::min(num_pos-1, reads[i]->start_pos-from_pos); | |
| 301 pair_ids.push_back(i); | |
| 302 pair_ids.push_back(j); | |
| 303 for (int k=from; k<to; k++) | |
| 304 p_pair_map[k]++; | |
| 305 take_cnt++; | |
| 306 } | |
| 307 else | |
| 308 { | |
| 309 if (no_pair_filter>0 && reads[i]->start_pos<reads[j]->start_pos) | |
| 310 { | |
| 311 pair_ids.push_back(i); | |
| 312 pair_ids.push_back(j); | |
| 313 take_cnt++; | |
| 314 } | |
| 315 else if (no_pair_filter>0) | |
| 316 { | |
| 317 pair_ids.push_back(j); | |
| 318 pair_ids.push_back(i); | |
| 319 take_cnt++; | |
| 320 } | |
| 321 else | |
| 322 discard_cnt++; | |
| 323 //printf("istart:%i, ilast:%i jstart:%i, jlast: %i\n", reads[i]->start_pos, reads[i]->get_last_position(), reads[j]->start_pos, reads[j]->get_last_position()); | |
| 324 } | |
| 325 } | |
| 326 else | |
| 327 discard_cnt++; | |
| 328 j++; | |
| 329 } | |
| 330 } | |
| 331 //printf("take:%i, discard:%i discard_strand_cnt:%i\n", take_cnt, discard_cnt+discard_strand_cnt, discard_strand_cnt); | |
| 332 | |
| 333 if (nlhs>=4) { | |
| 334 plhs[3] = mxCreateNumericMatrix(2, pair_ids.size()/2, mxUINT32_CLASS, mxREAL); | |
| 335 uint32_t *pair_ids_ret = (uint32_t*) mxGetData(plhs[3]); | |
| 336 if (pair_ids.size()>0 && pair_ids_ret==NULL) | |
| 337 mexErrMsgTxt("Error allocating memory\n"); | |
| 338 for (int i=0; i<pair_ids.size(); i++) { | |
| 339 pair_ids_ret[i] = pair_ids[i]; | |
| 340 } | |
| 341 } | |
| 342 } | |
| 343 for (int i=0; i<all_reads.size(); i++) | |
| 344 delete all_reads[i]; | |
| 345 } | |
| 346 |
