comparison rDiff/src/rdiff_caller.m @ 0:0f80a5141704

version 0.3 uploaded
author vipints
date Thu, 14 Feb 2013 23:38:36 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0f80a5141704
1 function STAT = rDiff_caller(PAR)
2 % genes = opt_transcripts_caller(PAR)
3 %
4 % -- input --
5 % PAR contains
6 % CFG: configuration struct
7 % genes: struct defining genes with start, stops, exons etc.
8 % profile_weights: weights of profile functions
9 % intron_dists: distances to closest intron
10 %
11 % -- output --
12 % genes: struct with additional fields of eg. estimated transcript weights
13 addpath ../../tests
14 addpath ../../variance/final
15 addpath ../../experimental
16
17 samtools='/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/samtools/';
18 CFG = PAR.CFG;
19 SUFF_NAME=CFG.SUFF_NAME;
20 %load(CFG.genes_path , 'genes');
21 %[genes]=mask_dubl(genes,0);
22
23 load(CFG.genes_path , 'genes');
24 [genes]=mask_dubl(genes,0);
25
26 %[genes]=remove_transcripts(genes);
27 genes=genes(PAR.genes_idx);
28 BAM_FILES=CFG.BAM_FILES;
29 JB_NR=CFG.JB_NR;
30
31
32
33 %clear PAR;
34
35 %%%% paths
36 addpath(CFG.paths);
37 addpath('/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/variance/nbin');
38 addpath('/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/experiments/');
39 addpath('/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/tests/');
40 addpath('/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/read_utils/');
41 addpath('/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/tests/sequence_tools/');
42 addpath(genpath('/fml/ag-raetsch/home/drewe/svn/tools/chronux/'))
43 addpath('/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/tests/functions/');
44
45 out_base=CFG.out_base;
46 %sim_data_base=CFG.sim_data_base;
47 real_data_base=CFG.real_data_base;
48
49 expr12_bam=CFG.expr12_bam;
50 expr11_bam=CFG.expr11_bam;
51 expr22_bam=CFG.expr22_bam;
52 expr21_bam=CFG.expr21_bam;
53
54 CFG.SEQUENCED_LENGTH=80;
55 SEQUENCED_LENGTH=80;
56 %make map: gene->file
57
58 IDX1=CFG.IDX11;
59 IDX2=CFG.IDX22;
60
61 %BAM_FILES{IDX1}
62 %BAM_FILES{IDX2}
63
64
65 %genes=PAR.genes;
66 clear PAR
67
68 STAT=cell(size(genes));
69 do_save = 0;
70 out_base = '/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/out';
71 save_dir=fullfile(out_base,'cache');
72
73 for i=1:size(genes,2)
74 %try
75 RESULT=cell(1,7)
76 gene = genes(i);
77
78 RESULT{7}=JB_NR;
79 RESULT{1}=gene.name
80 load_only = false;
81
82 if or(isempty(gene.exons),gene.stop-gene.start<=SEQUENCED_LENGTH)
83 PV=1;
84 RESULT{2}={PV,''};
85 RESULT{3}={PV,''};
86 RESULT{4}={PV,''};
87 RESULT{5}=[Inf,Inf];
88 RESULT{6}=[Inf,Inf];
89 RESULT{7}={'empty gene exons'};
90 RESULT{8}={PV,''};
91 continue;
92 end
93
94 PV1=1;
95 for l=1:size(gene.transcripts,2)
96 EXONS=gene.exons{l}
97 if sum(EXONS(:,2)-EXONS(:,1))<=SEQUENCED_LENGTH
98 PV=1;
99 RESULT{2}={PV,''};
100 RESULT{3}={PV,''};
101 RESULT{4}={PV,''};
102 RESULT{5}=[Inf,Inf];
103 RESULT{6}=[Inf,Inf];
104 RESULT{7}={'empty gene exons'};
105 RESULT{8}={PV,''};
106 PV1=2;
107
108 continue;
109 end
110 end
111 if PV1==2
112 continue;
113 end
114 gene.name
115
116
117
118 %adjust start and stop positions on the negative strand
119 if strcmp(gene.strand,'-')
120 gene.start=gene.start+1;
121 gene.stop=gene.stop+1;
122 [reads11,reads12,intron11,intron12] = get_reads_dual2_intron(expr11_bam,expr12_bam,gene,samtools,false,10);
123 [reads11,FLAG]=remove_reads_from_other_genes(reads11,gene);
124 [reads12,FLAG]=remove_reads_from_other_genes(reads12,gene);
125
126
127 [reads21,reads22,intron21,intron22] = get_reads_dual2_intron(expr21_bam,expr22_bam,gene,samtools,false,10);
128 [reads21,FLAG]=remove_reads_from_other_genes(reads21,gene);
129 [reads22,FLAG]=remove_reads_from_other_genes(reads22,gene);
130 gene.start=gene.start-1;
131 gene.stop=gene.stop-1;
132 else
133 [reads11,reads12,intron11,intron12] = get_reads_dual2_intron(expr11_bam,expr12_bam,gene,samtools,false,10);
134 [reads11,FLAG]=remove_reads_from_other_genes(reads11,gene);
135 [reads12,FLAG]=remove_reads_from_other_genes(reads12,gene);
136
137 [reads21,reads22,intron21,intron22] = get_reads_dual2_intron(expr21_bam,expr22_bam,gene,samtools,false,10);
138 [reads21,FLAG]=remove_reads_from_other_genes(reads21,gene);
139 [reads22,FLAG]=remove_reads_from_other_genes(reads22,gene);
140 end
141
142 reads11=reads11(sum(reads11,2)>70,:);
143 reads12=reads12(sum(reads12,2)>70,:);
144 reads21=reads21(sum(reads21,2)>70,:);
145 reads22=reads22(sum(reads22,2)>70,:);
146 %% TEST with read cliping
147
148 READS1={reads11,reads12};
149 READS2={reads21,reads22};
150 reads1=[reads11;reads12];
151 reads2=[reads21;reads22];
152
153 TOTAL_SIZE=(size(reads12,1)+size(reads12,1)+(size(reads21,1)+size(reads22,1)));
154 MIN_SIZE=min(size(reads12,1)+size(reads12,1),(size(reads21,1)+size(reads22,1)));
155 %keyboard
156
157 CLIP=0;
158 if 1==0
159 if (size(reads1,1)>0)
160 for i=1:size(reads1,1)
161 reads1(i,find(reads1(i,:),2,'first'))=0;
162 end
163 for i=1:size(reads1,1)
164 reads1(i,find(reads1(i,:),2,'last'))=0;
165 end
166 end
167 if (size(reads2,1)>0)
168 for i=1:size(reads2,1)
169 reads2(i,find(reads2(i,:),2,'first'))=0;
170 end
171 for i=1:size(reads2,1)
172 reads2(i,find(reads2(i,:),2,'last'))=0;
173 end
174 end
175 CLIP=4;
176 end
177
178 % keybloard
179 %% END TEST
180
181 COUNTER=3;
182 FLAG=0;
183 for k=1:size(gene.transcripts,2)
184 if sum(gene.exons{k}(:,2)-gene.exons{k}(:,1))<75
185 FLAG=1;
186 end
187 end
188
189 if FLAG==1
190 RESULT{2}={'gene shorter than 75 bp'};
191 STAT{i}=RESULT;
192 continue
193 end
194
195 if 1==0
196
197 for V_COUNTER=2%:length(CFG.VARIANCES1)
198 if (TOTAL_SIZE>0)
199 if(MIN_SIZE>0)
200 PV=1;
201 INFO='';
202 [PV,INFO] =diff_mmd([reads11;reads12],[reads21;reads22],gene);
203 PV
204 RESULT{COUNTER}={PV,INFO};
205 COUNTER=COUNTER+1;
206 else
207 PV=1;
208 RESULT{COUNTER}={PV,2};
209 COUNTER=COUNTER+1;
210 end
211 else
212 PV=1;
213 RESULT{COUNTER}={PV,1};
214 COUNTER=COUNTER+1;
215 end
216 end
217 end
218 % keyboard
219
220 VAR_FACT=1;
221 COV_POS=1;
222 REWEIGHT=0;
223 if 1==1
224 for V_COUNTER=2%:length(CFG.VARIANCES1)
225 if (TOTAL_SIZE>0)
226 if(MIN_SIZE>0)
227 cen_ARR=0.1:0.1:1;
228 REWEIGHT_WEIGHTS=zeros(length( cen_ARR),size(reads1,2));
229 cen_count=1;
230 for censor_frac= cen_ARR
231 %if strcmp(gene.name,'AT1G01073')
232 % keyboard
233 %end
234 censor_frac
235 temp_reads1=reads1;
236 temp_reads2=reads2;
237 %cut to relevant positions
238 COVERAGE=sum(temp_reads1,1)+sum(temp_reads2,1);
239 NONZERO=COVERAGE>0;
240 SORTED_COVERAGES=sort(COVERAGE(NONZERO));
241 NR_OF_NONZERO=sum(NONZERO);
242 CHOSEN_POSITIONS=COVERAGE<=SORTED_COVERAGES(ceil(NR_OF_NONZERO*censor_frac));
243 REWEIGHT_WEIGHTS(cen_count,CHOSEN_POSITIONS)=1;
244 cen_count=cen_count+1;
245 %temp_reads1=temp_reads1(:,CHOSEN_POSITIONS);
246 %temp_reads2=temp_reads2(:,CHOSEN_POSITIONS);
247 %remove reads which have no coverage;
248 %temp_reads1=temp_reads1(sum(temp_reads1,2)>0,:);
249 %temp_reads2=temp_reads2(sum(temp_reads2,2)>0,:);
250 %SIZE1=size(temp_reads1,1);
251 %SIZE2=size(temp_reads2,1);
252 %MIN_SIZE=min(SIZE1,SIZE2);
253 %TOTAL_SIZE=SIZE1+SIZE2;
254 end
255 if (TOTAL_SIZE>0)
256 if(MIN_SIZE>0)
257 PV=1;
258 INFO='';
259 REWEIGHT=1;
260 [PV,INFO] =diff_mmd_variance_subsample3({reads11,reads12},{reads21,reads22},gene,CFG.VARIANCES1{V_COUNTER},CFG.VARIANCES2{V_COUNTER},1,1,REWEIGHT, REWEIGHT_WEIGHTS)
261 RESULT{COUNTER}={PV,INFO};
262 COUNTER=COUNTER+1;
263 else
264 PV=1;
265 RESULT{COUNTER}={PV,2};
266 COUNTER=COUNTER+1;
267 end
268 else
269 PV=1;
270 RESULT{COUNTER}={PV,1};
271 COUNTER=COUNTER+1;
272 end
273 end
274 end
275 end
276 end
277
278 if 1==0
279 VAR_FACT=1;
280 COV_POS=1;
281 REWEIGHT=0;
282 for V_COUNTER=2%:length(CFG.VARIANCES1)
283 for cov_fact_ix=1:length(COV_POS)
284
285 if (TOTAL_SIZE>0)
286 if(MIN_SIZE>0)
287 PV=1;
288 INFO='';
289 [PV,INFO] =diff_mmd_variance_subsample2({reads11,reads12},{reads21,reads22},gene,CFG.VARIANCES1{V_COUNTER},CFG.VARIANCES2{V_COUNTER},VAR_FACT(var_fact_ix),COV_POS,REWEIGHT);
290 RESULT{COUNTER}={PV,INFO};
291 COUNTER=COUNTER+1;
292 else
293 PV=1;
294 RESULT{COUNTER}={PV,2};
295 COUNTER=COUNTER+1;
296 end
297 else
298 PV=1;
299 RESULT{COUNTER}={PV,1};
300 COUNTER=COUNTER+1;
301 end
302 end
303 end
304
305 if 1==1
306 VAR_FACT=1;
307 COV_POS=1;
308 REWEIGHT=1;
309 for V_COUNTER=2%:length(CFG.VARIANCES1)
310 for var_fact_ix=1:length(VAR_FACT)
311
312 if (TOTAL_SIZE>0)
313 if(MIN_SIZE>0)
314 PV=1;
315 INFO='';
316 [PV,INFO] =diff_mmd_variance_subsample2({reads11,reads12},{reads21,reads22},gene,CFG.VARIANCES1{V_COUNTER},CFG.VARIANCES2{V_COUNTER},VAR_FACT(var_fact_ix),COV_POS,REWEIGHT)
317 RESULT{COUNTER}={PV,INFO};
318 COUNTER=COUNTER+1;
319 else
320 PV=1;
321 RESULT{COUNTER}={PV,2};
322 COUNTER=COUNTER+1;
323 end
324 else
325 PV=1;
326 RESULT{COUNTER}={PV,1};
327 COUNTER=COUNTER+1;
328 end
329 end
330 end
331 end
332 end
333
334 %keyboard
335 TOTAL_SIZE=(size(reads12,1)+size(reads12,1)+(size(reads21,1)+size(reads22,1)));
336 MIN_SIZE=min(size(reads12,1)+size(reads12,1),(size(reads21,1)+size(reads22,1)));
337 if 1==0
338 for V_COUNTER=1
339 if (TOTAL_SIZE>0)
340 if(MIN_SIZE>0)
341 PV=1;
342 INFO='';
343 if isempty([intron11,intron12])|isempty([intron21,intron22])
344 [PV,INFO] =diff_mmd_variance_NB_NB_simple([reads11;reads12],[reads21;reads22],gene,CFG.VARIANCES1{V_COUNTER},CFG.VARIANCES2{V_COUNTER});
345 else
346 [PV,INFO] =diff_mmd_variance_splice([reads11;reads12],[reads21;reads22],0.5,[intron11,intron12],[intron21,intron22],gene,CFG.VARIANCES1{V_COUNTER},CFG.VARIANCES2{V_COUNTER});
347 end
348 RESULT{COUNTER}={PV,INFO};
349 COUNTER=COUNTER+1;
350 else
351 PV=1;
352 RESULT{COUNTER}={PV,2};
353 COUNTER=COUNTER+1;
354 end
355 else
356 PV=1;
357 RESULT{COUNTER}={PV,1};
358 COUNTER=COUNTER+1;
359 end
360 end
361 end
362 PV=1;
363 %keyboard
364 for V_COUNTER=length(CFG.VARIANCES1)
365 if (TOTAL_SIZE>0)
366
367 if(MIN_SIZE>0)
368 [P_VALUE, INFO]= diff_nbin7(READS1,READS2,gene,SEQUENCED_LENGTH,CFG.VARIANCES1{V_COUNTER},CFG.VARIANCES2{V_COUNTER});
369 RESULT{COUNTER}={P_VALUE,INFO};
370 if not(isempty(INFO))
371 if iscell(INFO)
372 RESULT{COUNTER}=INFO{5};
373 COUNTER=COUNTER+1;
374 end
375 end
376 else
377 PV=1;
378 RESULT{COUNTER}={PV,2};
379 COUNTER=COUNTER+1;
380 end
381 else
382 PV=1;
383 RESULT{COUNTER}={PV,1};
384 COUNTER=COUNTER+1;
385 end
386 end
387
388
389 [SPLICINGEVENTS,SEQUENCE,EXONSEQUENCE]=splicingsequence(gene);
390 [UNIQUE_NEW_EXONS,GRAPHNODES,ORDER_OF_GRAPHNODE,EIRS_IN_SEQ]=transform_single_end_reads(SPLICINGEVENTS,SEQUENCE,EXONSEQUENCE,CFG.SEQUENCED_LENGTH-CLIP);
391 [NEW_READS1,UNEXPLAINED_READS1,UNEXPLAINED_INDEX1]= convert_reads_to_region_indicators([reads11;reads12],UNIQUE_NEW_EXONS,GRAPHNODES,ORDER_OF_GRAPHNODE,EIRS_IN_SEQ,gene);
392 [NEW_READS2,UNEXPLAINED_READS2,UNEXPLAINED_INDEX2]= convert_reads_to_region_indicators([reads21;reads22],UNIQUE_NEW_EXONS,GRAPHNODES,ORDER_OF_GRAPHNODE,EIRS_IN_SEQ,gene);
393 % keyboard
394 TOTAL_SIZE=(size(NEW_READS1,1)+(size(NEW_READS2,1)));
395 MIN_SIZE=min(size(NEW_READS1,1),(size(NEW_READS2,1)));
396
397 if (TOTAL_SIZE>0)
398 if(MIN_SIZE>0)
399 [PV,INFO] = diff_poisson_bonf_3_unequal_segment(NEW_READS1,NEW_READS2,gene,CFG.SEQUENCED_LENGTH-CLIP);
400 RESULT{COUNTER}={PV,INFO};
401 COUNTER=COUNTER+1;
402 else
403 PV=1;
404 RESULT{COUNTER}={PV,2};
405 COUNTER=COUNTER+1;
406 end
407 else
408 PV=1;
409 RESULT{COUNTER}={PV,1};
410 COUNTER=COUNTER+1;
411 end
412 PV=1
413 if 1==0
414 if (TOTAL_SIZE>0)
415 if(MIN_SIZE>0)
416 [PV,INFO] = diff_poisson_bonf_4_unequal_segment(NEW_READS1,NEW_READS2,gene,CFG.SEQUENCED_LENGTH-CLIP);
417 RESULT{COUNTER}={PV,INFO};
418 COUNTER=COUNTER+1;
419 else
420 PV=1;
421 RESULT{COUNTER}={PV,2};
422 COUNTER=COUNTER+1;
423 end
424 else
425 PV=1;
426 RESULT{COUNTER}={PV,1};
427 COUNTER=COUNTER+1;
428 end
429 %keyboard
430 end
431 STAT{i}=RESULT;
432
433 end;
434 %keyboard
435 OUT_FILENAME=['/fml/ag-raetsch/home/drewe/svn/projects/RNASeq/difftest/out/analysis_artificial_variance_03_08_2012/' SUFF_NAME '_rep_mmd_07_07_2012_' int2str(JB_NR) '.mat'];
436 save(OUT_FILENAME,'STAT')