comparison rDiff/src/tests/rDiff_nonparametric.m @ 0:0f80a5141704

version 0.3 uploaded
author vipints
date Thu, 14 Feb 2013 23:38:36 -0500

function [pval,info] = rDiff_nonparametric(CFG,READS1,READS2,variance_function_nonparametric_1, variance_function_nonparametric_2)

bootstraps=CFG.bootstraps;

%Initialize the read containers
SUMS1=[]; %the read coverage of each replicate of sample 1
SUMS2=[]; %the read coverage of each replicate of sample 2
reads1=[]; %all reads of sample 1, pooled over replicates
reads2=[]; %all reads of sample 2, pooled over replicates
NON_EMPTY=[];
for i=1:length(READS1)
    if not(isempty(READS1{i}))
        NON_EMPTY=[NON_EMPTY,i];
    end
end
READS1={READS1{NON_EMPTY}};
NON_EMPTY=[];
for i=1:length(READS2)
    if not(isempty(READS2{i}))
        NON_EMPTY=[NON_EMPTY,i];
    end
end
READS2={READS2{NON_EMPTY}};
for i=1:length(READS1)
    SUMS1=[SUMS1;sum(READS1{i},1)];
    reads1=[reads1;READS1{i}];
end
for i=1:length(READS2)
    SUMS2=[SUMS2;sum(READS2{i},1)];
    reads2=[reads2;READS2{i}];
end

if size(reads1,1)==0 || size(reads2,1)==0
    %No reads in at least one of the samples: nothing to test
    pval=1;
    info=1;
    return
end


%Get masks for the highly covered positions
[MASKS]=get_nonparametric_masks(CFG,reads1,reads2);


%Determine the number of masks
NR_OF_MASKS=size(MASKS,1);
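%Reading of the mask layout (inferred from how MASKS is used below; the
%construction itself lives in get_nonparametric_masks): each of the
%NR_OF_MASKS rows is a per-position weight/indicator vector of length
%size(reads1,2), e.g. a row such as [1 1 0 0 1] would restrict the weighted
%distance to positions 1, 2 and 5 (illustrative values only).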

% Predict the per-position variance from the coverage of each replicate
VARIANCE1=(1e-8)*ones(size(reads1,2),1);
for i=1:length(READS1)
    TEMP_VARIANCE1=predict_variance(sum(READS1{i},1)',variance_function_nonparametric_1);
    TEMP_VARIANCE1(isnan(TEMP_VARIANCE1))=0;
    TEMP_VARIANCE1(isinf(TEMP_VARIANCE1))=0;
    VARIANCE1=VARIANCE1+TEMP_VARIANCE1;
end

VARIANCE2=(1e-8)*ones(size(reads1,2),1);
for i=1:length(READS2)
    TEMP_VARIANCE2=predict_variance(sum(READS2{i},1)',variance_function_nonparametric_2);
    TEMP_VARIANCE2(isnan(TEMP_VARIANCE2))=0;
    TEMP_VARIANCE2(isinf(TEMP_VARIANCE2))=0;
    VARIANCE2=VARIANCE2+TEMP_VARIANCE2;
end

%Get the mean variance across the replicates of each sample
VARIANCE1=VARIANCE1/length(READS1);
VARIANCE2=VARIANCE2/length(READS2);
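%In other words: predict_variance maps a replicate's coverage profile to a
%per-position variance estimate via the fitted non-parametric variance
%function; NaN/Inf predictions are zeroed, the replicate-wise estimates are
%averaged, and the 1e-8 offset keeps the variances strictly positive.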

%Concatenation of all reads from both samples
allreads = [reads1;reads2];


%Determine the subsampling rate for sample 1 ...
p=sum(allreads,1)/size(allreads,1);
R=size(reads1,1);
c=(p.*(1-p))/(size(allreads,1)-1);
N1s=size(allreads,1)*c./(c+(VARIANCE1'/(R^2)));

%... and for sample 2
p=sum(allreads,1)/size(allreads,1);
R=size(reads2,1);
c=(p.*(1-p))/(size(allreads,1)-1);
N2s=size(allreads,1)*c./(c+(VARIANCE2'/(R^2)));

%Round up to get integer values
N1s=ceil(full(N1s));
N2s=ceil(full(N2s));
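%One way to read the two blocks above (an interpretation, not part of the
%original comments): with n = size(allreads,1) pooled reads, p is the
%per-position read proportion and c = p.*(1-p)/(n-1) a binomial-type
%sampling-variance term, so
%    N1s = n*c ./ (c + VARIANCE1'/R^2)
%stays close to n where the predicted biological variance is negligible and
%shrinks towards 0 where it dominates; the permutation samples drawn below
%are subsampled accordingly so that they mimic the estimated
%between-replicate variability.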


%Total number of reads in each sample (pooled over replicates)
N1 = size(reads1,1);
N2 = size(reads2,1);
N = N1 + N2;


bootstrap_results=ones(NR_OF_MASKS,bootstraps);
COUNTER=0;
R1s=[];
R2s=[];
statistic=ones(NR_OF_MASKS,bootstraps);
nr_of_slices=CFG.nr_of_slices;

TOTAL_SUM1=sum(reads1,1);
TOTAL_SUM2=sum(reads2,1);
TOTAL_SUM=TOTAL_SUM1+TOTAL_SUM2;

clear reads1
clear reads2
%Use the transpose to make the selection of columns faster
all_reads_trans=allreads';
clear allreads
%Too few distinct coverage values to build the slices: do not test
if length(unique(TOTAL_SUM))<nr_of_slices+5
    pval=1;
    bootstrap_results=[];
    statistic=[];
    info = 1;
    return
end

[SUM_SORTED,SUM_POS]=sort(TOTAL_SUM);
NR_OF_ZEROS=sum(TOTAL_SUM==0);

%precompute this sum once
all_reads_trans_row_sum=sum(all_reads_trans,1);

%These matrices hold the per-slice test statistics for the observed data
%(STAT_DIST) and for the current permutation (TEMP_DIST)
STAT_DIST=zeros(NR_OF_MASKS,nr_of_slices);
TEMP_DIST=zeros(NR_OF_MASKS,nr_of_slices);

%Precompute normalizing constants
SUM_TOTAL_SUM1=sum(TOTAL_SUM1);
SUM_TOTAL_SUM2=sum(TOTAL_SUM2);
SUM_SUM_TOTAL_SUM=SUM_TOTAL_SUM1+SUM_TOTAL_SUM2;

%Precompute some of the values for the slices
SLICES=zeros(nr_of_slices,size(TOTAL_SUM,2))==1;
N1_arr=zeros(nr_of_slices,1);
N2_arr=zeros(nr_of_slices,1);
FACT_arr=zeros(nr_of_slices,1);
V1_cell=cell(nr_of_slices,1);
V2_cell=cell(nr_of_slices,1);

%Determine the coverage regions (slices) in which to match the variances
for s=nr_of_slices:(-1):1
    LOWER_POS=min(NR_OF_ZEROS+ceil(((nr_of_slices-s)/nr_of_slices)*(length(TOTAL_SUM)-NR_OF_ZEROS))+1,length(TOTAL_SUM));
    UPPER_POS=min(NR_OF_ZEROS+ceil(((nr_of_slices-s+1)/nr_of_slices)*(length(TOTAL_SUM)-NR_OF_ZEROS))+1,length(TOTAL_SUM));

    SLICE_LOWER_BOUND=SUM_SORTED(LOWER_POS);
    SLICE_UPPER_BOUND=SUM_SORTED(UPPER_POS);
    if s==nr_of_slices
        SLICE=and(TOTAL_SUM>=SLICE_LOWER_BOUND,TOTAL_SUM<=SLICE_UPPER_BOUND);
    else
        SLICE=and(TOTAL_SUM>SLICE_LOWER_BOUND,TOTAL_SUM<=SLICE_UPPER_BOUND);
    end
    SLICES(s,:)=SLICE;

    N1s_temp=ceil(median(N1s(SLICE)));
    N2s_temp=ceil(median(N2s(SLICE)));
    N1s_temp=min(N1s_temp,N1);
    N2s_temp=min(N2s_temp,N2);

    N1_arr(s)=N1s_temp;
    N2_arr(s)=N2s_temp;

    FACT_arr(s)=sum(TOTAL_SUM1(SLICE)+TOTAL_SUM2(SLICE))/(SUM_SUM_TOTAL_SUM);

    V1_cell{s}=TOTAL_SUM1(SLICE)/SUM_TOTAL_SUM1; %temporary variable to save time
    V2_cell{s}=TOTAL_SUM2(SLICE)/SUM_TOTAL_SUM2; %temporary variable to save time
    for mask_ix=1:NR_OF_MASKS
        STAT_DIST(mask_ix,s)=eucl_dist_weigthed(V1_cell{s},V2_cell{s},MASKS(mask_ix,SLICES(s,:)));
    end
end
SUM_SLICES=sum(SLICES,2);
STAT_DIST(isnan(STAT_DIST))=0;
all_reads_trans_slices=cell(nr_of_slices,1);
for s=nr_of_slices:(-1):1
    all_reads_trans_slices{s}=all_reads_trans(SLICES(s,:),:);
end
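%Summary of the loop above: within each coverage slice s the observed
%statistic per mask m is the mask-weighted Euclidean distance between the
%normalised coverage profiles of the two samples,
%    STAT_DIST(m,s) = sqrt( sum_j MASK_m(j)*(V1(j)-V2(j))^2 ),
%with V1 = TOTAL_SUM1(SLICE)/sum(TOTAL_SUM1) and
%     V2 = TOTAL_SUM2(SLICE)/sum(TOTAL_SUM2).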

for i = 1:bootstraps
    %Random permutation of the pooled reads
    read_per = randperm(N);

    %Perform the computation for each slice in which the variances are matched
    for s=nr_of_slices:(-1):1
        if SUM_SLICES(s)==0
            continue;
        end
        %Create random samples 1 and 2
        sample1 = sum(all_reads_trans_slices{s}(:,read_per(1:N1_arr(s))),2);
        sample2 = sum(all_reads_trans_slices{s}(:,read_per((N1_arr(s)+1):(N1_arr(s)+N2_arr(s)))),2);

        W1=sample1/sum(sample1)*FACT_arr(s); %temporary variable to save time
        W2=sample2/sum(sample2)*FACT_arr(s); %temporary variable to save time

        for mask_ix=1:NR_OF_MASKS
            TEMP_DIST(mask_ix,s)=eucl_dist_weigthed(W1,W2,MASKS(mask_ix,SLICES(s,:))');
        end

    end

    %Make sure the normalisation does not introduce NaNs
    TEMP_DIST(isnan(TEMP_DIST))=0;

    COUNTER=COUNTER+1;
    %Compute the average over the different matching regions (slices)
    statistic(:,COUNTER)=mean(STAT_DIST,2);
    bootstrap_results(:,COUNTER)=mean(TEMP_DIST,2);
end

bootstrap_results=bootstrap_results(:,1:COUNTER);
statistic=statistic(:,1:COUNTER);

%Empirical p-value per mask: the fraction of permutations whose statistic is
%at least as large as the observed one
pval=double(sum(bootstrap_results>=statistic,2)) / COUNTER;

info = {bootstrap_results,statistic,pval};
pval=min(pval)*10;

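%Worked micro-example of the permutation p-value (illustrative numbers only):
%with bootstraps = 4 permutations of a single mask, an observed statistic of
%0.30 and permutation statistics [0.10 0.35 0.05 0.30], two permutations are
%>= 0.30, so pval = 2/4 = 0.5 before the final min(.)*10 rescaling over masks.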
%Weighted Euclidean distance between the vectors A and B, using the
%per-position weights W (typically a row of MASKS)
function result = eucl_dist_weigthed(A,B,W)

result = sqrt(sum( W.*((A - B) .^ 2) ));
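%Example with hypothetical values, for illustration only:
%  A = [0.5 0.3 0.2]; B = [0.2 0.3 0.5]; W = [1 0 1];
%  eucl_dist_weigthed(A,B,W) = sqrt(1*0.09 + 0*0 + 1*0.09) = sqrt(0.18), about 0.424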