Mercurial > repos > vipints > deseq_hts
comparison deseq-hts_1.0/mex/get_reads.cpp @ 0:94a108763d9e draft
deseq-hts version 1.0 wraps the DESeq 1.6.0
author | vipints |
---|---|
date | Wed, 09 May 2012 20:43:47 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:94a108763d9e |
---|---|
1 /* | |
2 * This program is free software; you can redistribute it and/or modify | |
3 * it under the terms of the GNU General Public License as published by | |
4 * the Free Software Foundation; either version 3 of the License, or | |
5 * (at your option) any later version. | |
6 * | |
7 * Written (W) 2010-2011 Jonas Behr, Regina Bohnert, Gunnar Raetsch | |
8 * Copyright (C) 2010-2011 Max Planck Society | |
9 */ | |
10 | |
11 | |
12 #include <stdio.h> | |
13 #include <string.h> | |
14 #include <signal.h> | |
15 #include <mex.h> | |
16 #include <algorithm> | |
17 #include <vector> | |
18 using std::vector; | |
19 #include "get_reads_direct.h" | |
20 #include "mex_input.h" | |
21 #include "read.h" | |
22 | |
23 #define MAXLINE 10000 | |
24 | |
25 /* | |
26 * input: | |
27 * 1 bam file | |
28 * 2 chromosome | |
29 * 3 region start (1-based index) | |
30 * 4 region end (1-based index) | |
31 * 5 strand (either '+' or '-' or '0') | |
32 * [6] collapse flag: if true the reads are collapsed to a coverage track | |
33 * [7] subsample percentage: percentage of reads to be subsampled (in per mill) | |
34 * [8] intron length filter | |
35 * [9] exon length filter | |
36 * [10] mismatch filter | |
37 * [11] bool: use mapped reads for coverage | |
38 * [12] bool: use spliced reads for coverage | |
39 * [13] return maxminlen | |
40 * [14] return pair coverage | |
41 * | |
42 * output: | |
43 * 1 coverage | |
44 * [2] intron cell array | |
45 * [3] pair coverage | |
46 * [4] pair list | |
47 * | |
48 * example call: | |
49 * [cov introns] = get_reads('polyA_left_I+_el15_mm1_spliced.bam', 'I', 10000, 12000, '-', 1, 30); | |
50 */ | |
51 void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { | |
52 | |
53 if (nrhs<5 || nrhs>14 || (nlhs<1 || nlhs>4)) { | |
54 fprintf(stderr, "usage: [x [introns] [pair]] = get_reads(fname, chr, start, end, strand, [collapse], [subsample], [max intron length], [min exonlength], [max mismatches], [mapped], [spliced], [maxminlen], [pair]);\n"); | |
55 return; | |
56 } | |
57 | |
58 /* obligatory arguments | |
59 * **********************/ | |
60 char *fname = get_string(prhs[0]); | |
61 //fprintf(stdout, "arg1: %s\n", fname); | |
62 char *chr = get_string(prhs[1]); | |
63 //fprintf(stdout, "arg2: %s\n", chr); | |
64 int from_pos = get_int(prhs[2]); | |
65 //fprintf(stdout, "arg3: %d\n", from_pos); | |
66 int to_pos = get_int(prhs[3]); | |
67 //fprintf(stdout, "arg4: %d\n", to_pos); | |
68 char *strand = get_string(prhs[4]); | |
69 //fprintf(stdout, "arg5: %s\n", strand); | |
70 | |
71 if (from_pos>to_pos) | |
72 mexErrMsgTxt("Start (arg 3) must be <= end (arg 4)\n"); | |
73 | |
74 if (strand[0]!='+' && strand[0]!='-' && strand[0]!='0') | |
75 mexErrMsgTxt("Unknown strand (arg 5): either + or - or 0"); | |
76 | |
77 /* optional arguments | |
78 * ******************/ | |
79 int collapse = 0; | |
80 if (nrhs>=6) | |
81 collapse = get_int(prhs[5]); | |
82 | |
83 int subsample = 1000; | |
84 if (nrhs>=7) | |
85 subsample = get_int(prhs[6]); | |
86 | |
87 int intron_len_filter = 1e9; | |
88 if (nrhs>=8) | |
89 intron_len_filter = get_int(prhs[7]); | |
90 | |
91 int exon_len_filter = -1; | |
92 if (nrhs>=9) | |
93 exon_len_filter = get_int(prhs[8]); | |
94 | |
95 int filter_mismatch = 1e9; | |
96 if (nrhs>=10) | |
97 filter_mismatch = get_int(prhs[9]); | |
98 | |
99 int mapped = 1; | |
100 if (nrhs>=11) | |
101 mapped = get_int(prhs[10]); | |
102 | |
103 int spliced = 1; | |
104 if (nrhs>=12) | |
105 spliced = get_int(prhs[11]); | |
106 | |
107 int maxminlen = 0; | |
108 if (nrhs>=13) | |
109 maxminlen = get_int(prhs[12]); | |
110 | |
111 int pair_cov = 0; | |
112 if (nrhs>=14) | |
113 pair_cov = get_int(prhs[13]); | |
114 | |
115 /* call function to get reads | |
116 * **************************/ | |
117 char region[MAXLINE]; | |
118 sprintf(region, "%s:%i-%i", chr, from_pos, to_pos); | |
119 | |
120 vector<CRead*> all_reads; | |
121 | |
122 get_reads_from_bam(fname, region, &all_reads, strand[0], subsample); | |
123 | |
124 /* filter reads | |
125 * **************/ | |
126 int left = 0; | |
127 int right = 0; | |
128 | |
129 vector<CRead*> reads; | |
130 for (int i=0; i<all_reads.size(); i++) { | |
131 if (all_reads[i]->left) | |
132 left++; | |
133 if (all_reads[i]->right) | |
134 right++; | |
135 if (all_reads[i]->max_intron_len()<intron_len_filter && all_reads[i]->min_exon_len()>exon_len_filter && all_reads[i]->get_mismatches()<=filter_mismatch) | |
136 reads.push_back(all_reads[i]); | |
137 } | |
138 | |
139 | |
140 /* prepare output | |
141 * **************/ | |
142 int num_rows = reads.size(); | |
143 int num_pos = to_pos-from_pos+1; | |
144 | |
145 if (pair_cov==1 && nlhs>=3) { | |
146 // sort reads by read_id | |
147 printf("\n\nleft:%i right:%i \n\n", left, right); | |
148 sort(reads.begin(), reads.end(), CRead::compare_by_read_id); | |
149 } | |
150 | |
151 // read coverages collapsed | |
152 if (collapse) { | |
153 plhs[0] = mxCreateNumericMatrix(1, num_pos, mxUINT32_CLASS, mxREAL); | |
154 uint32_t *mask_ret = (uint32_t*) mxGetData(plhs[0]); | |
155 if (num_pos>0 && mask_ret==NULL) | |
156 mexErrMsgTxt("Error allocating memory\n"); | |
157 if (mapped && spliced) { | |
158 for (int i=0; i<reads.size(); i++) { | |
159 reads[i]->get_coverage(from_pos, to_pos, mask_ret); | |
160 } | |
161 } else { | |
162 for (int i=0; i<reads.size(); i++) { | |
163 ssize_t num_exons = reads[i]->block_starts.size(); | |
164 if ((num_exons==1 && mapped) || (num_exons>1 && spliced)) | |
165 reads[i]->get_coverage(from_pos, to_pos, mask_ret); | |
166 } | |
167 } | |
168 } | |
169 // reads not collapsed | |
170 else { | |
171 uint32_t nzmax = 0; // maximal number of nonzero elements | |
172 int len = to_pos-from_pos+1; | |
173 for (uint i=0; i<reads.size(); i++) { | |
174 for (uint n = 0; n < reads[i]->block_starts.size(); n++) { | |
175 uint32_t from, to; | |
176 if (reads[i]->block_starts[n]+reads[i]->start_pos-from_pos >= 0) | |
177 from = reads[i]->block_starts[n]+reads[i]->start_pos-from_pos; | |
178 else | |
179 from = 0; | |
180 if (reads[i]->block_starts[n]+reads[i]->start_pos-from_pos+reads[i]->block_lengths[n] >= 0) | |
181 to = reads[i]->block_starts[n]+reads[i]->start_pos-from_pos+reads[i]->block_lengths[n]; | |
182 else | |
183 to = 0; | |
184 for (int bp=from; bp<to&bp<len; bp++) { | |
185 nzmax++; | |
186 } | |
187 } | |
188 } | |
189 // 1st row: row indices | |
190 // 2nd row: column indices | |
191 plhs[0] = mxCreateDoubleMatrix(2, nzmax, mxREAL); | |
192 double *mask_ret = (double*) mxGetData(plhs[0]); | |
193 if (nzmax>0 && mask_ret==NULL) | |
194 mexErrMsgTxt("Error allocating memory\n"); | |
195 uint32_t mask_ret_c = 0; // counter | |
196 for (uint i=0; i<reads.size(); i++) { | |
197 reads[i]->get_reads_sparse(from_pos, to_pos, mask_ret, mask_ret_c, i); | |
198 } | |
199 if (mask_ret_c!=2*nzmax) | |
200 mexErrMsgTxt("Error filling index arrays for sparse matrix\n"); | |
201 } | |
202 // introns | |
203 if (maxminlen==0 && nlhs>=2) { | |
204 vector<int> intron_list; | |
205 for (int i=0; i<reads.size(); i++) { | |
206 reads[i]->get_introns(&intron_list); | |
207 } | |
208 | |
209 plhs[1] = mxCreateNumericMatrix(2, intron_list.size()/2, mxUINT32_CLASS, mxREAL); | |
210 uint32_t *p_intron_list = (uint32_t*) mxGetData(plhs[1]); | |
211 for (int p = 0; p<intron_list.size(); p++) { | |
212 p_intron_list[p] = intron_list[p]; | |
213 } | |
214 intron_list.clear(); | |
215 } else if (nlhs>=2) { | |
216 vector<uint32_t> intron_starts; | |
217 vector<uint32_t> intron_ends; | |
218 vector<uint32_t> block_len1; | |
219 vector<uint32_t> block_len2; | |
220 for (int i=0; i<reads.size(); i++) { | |
221 reads[i]->get_introns(&intron_starts, &intron_ends, &block_len1, &block_len2); | |
222 } | |
223 | |
224 plhs[1] = mxCreateNumericMatrix(4, intron_starts.size(), mxINT32_CLASS, mxREAL); | |
225 uint32_t *p_intron_list = (uint32_t*) mxGetData(plhs[1]); | |
226 for (int p = 0; p<intron_starts.size(); p++) { | |
227 p_intron_list[4*p] = intron_starts[p]; | |
228 p_intron_list[(4*p)+1] = intron_ends[p]; | |
229 p_intron_list[(4*p)+2] = block_len1[p]; | |
230 p_intron_list[(4*p)+3] = block_len2[p]; | |
231 } | |
232 intron_starts.clear() ; | |
233 intron_ends.clear() ; | |
234 block_len1.clear() ; | |
235 block_len2.clear() ; | |
236 } | |
237 if (pair_cov==1 && nlhs>=3) { | |
238 plhs[2] = mxCreateNumericMatrix(1, num_pos, mxUINT32_CLASS, mxREAL); | |
239 uint32_t *p_pair_map = (uint32_t*) mxGetData(plhs[2]); | |
240 if (num_pos>0 && p_pair_map==NULL) | |
241 mexErrMsgTxt("Error allocating memory\n"); | |
242 | |
243 vector<int> pair_ids; | |
244 | |
245 int take_cnt = 0; | |
246 int discard_cnt = 0; | |
247 // find consecutive reads with the same id | |
248 for (int i=0; i<((int) reads.size())-1; i++) { | |
249 int j = i+1; | |
250 while(j<reads.size() && strcmp(reads[i]->read_id, reads[j]->read_id) == 0) { | |
251 if ((reads[i]->left && reads[j]->right) || (reads[j]->left && reads[i]->right) && (reads[i]->reverse != reads[j]->reverse)) { | |
252 if (reads[i]->get_last_position()==-1 || reads[j]->get_last_position()==-1) | |
253 break; | |
254 if (reads[i]->get_last_position()<reads[j]->start_pos && reads[j]->start_pos-reads[i]->get_last_position()<60000) { | |
255 int from = std::max(0, reads[i]->get_last_position()-from_pos); | |
256 int to = std::min(num_pos-1, reads[j]->start_pos-from_pos); | |
257 pair_ids.push_back(i); | |
258 pair_ids.push_back(j); | |
259 for (int k=from; k<to; k++) | |
260 p_pair_map[k]++; | |
261 take_cnt++; | |
262 } else if (reads[i]->start_pos>reads[j]->get_last_position() && reads[j]->get_last_position()-reads[i]->start_pos<60000) { | |
263 int from = std::max(0, reads[j]->get_last_position()-from_pos); | |
264 int to = std::min(num_pos-1, reads[i]->start_pos-from_pos); | |
265 pair_ids.push_back(i); | |
266 pair_ids.push_back(j); | |
267 for (int k=from; k<to; k++) | |
268 p_pair_map[k]++; | |
269 take_cnt++; | |
270 } else | |
271 discard_cnt++; | |
272 } | |
273 else | |
274 discard_cnt++; | |
275 j++; | |
276 } | |
277 } | |
278 printf("take:%i, discard:%i \n", take_cnt, discard_cnt); | |
279 | |
280 if (nlhs>=4) { | |
281 plhs[3] = mxCreateNumericMatrix(2, pair_ids.size()/2, mxUINT32_CLASS, mxREAL); | |
282 uint32_t *pair_ids_ret = (uint32_t*) mxGetData(plhs[3]); | |
283 if (pair_ids.size()>0 && pair_ids_ret==NULL) | |
284 mexErrMsgTxt("Error allocating memory\n"); | |
285 for (int i=0; i<pair_ids.size(); i++) { | |
286 pair_ids_ret[i] = pair_ids[i]; | |
287 } | |
288 } | |
289 } | |
290 for (int i=0; i<all_reads.size(); i++) | |
291 delete all_reads[i]; | |
292 } | |
293 |