| 0 | 1 function [pval,info] = rDiff_nonparametric(CFG,READS1,READS2,variance_function_nonparametric_1, variance_function_nonparametric_2) | 
|  | 2 | 
|  | 3 bootstraps=CFG.bootstraps; | 
|  | 4 | 
|  | 5 %Initizialize the reads | 
|  | 6 SUMS1=[]; %the readcoverage for sample 1 | 
|  | 7 SUMS2=[]; %the readcoverage for sample 2 | 
|  | 8 reads1=[]; %the total of reads in sample 1 | 
|  | 9 reads2=[]; %the total of reads in sample 2 | 
|  | 10 NON_EMPTY=[]; | 
|  | 11 for i=1:length(READS1) | 
|  | 12   if not(isempty(READS1{i})) | 
|  | 13     NON_EMPTY=[NON_EMPTY,i]; | 
|  | 14   end | 
|  | 15 end | 
|  | 16 READS1={READS1{ NON_EMPTY}}; | 
|  | 17 NON_EMPTY=[]; | 
|  | 18 for i=1:length(READS2) | 
|  | 19   if not(isempty(READS2{i})) | 
|  | 20     NON_EMPTY=[NON_EMPTY,i]; | 
|  | 21   end | 
|  | 22 end | 
|  | 23 READS2={READS2{NON_EMPTY}}; | 
|  | 24 for i=1:length(READS1) | 
|  | 25   SUMS1=[SUMS1;sum(READS1{i},1)]; | 
|  | 26   reads1=[reads1;READS1{i}]; | 
|  | 27 end | 
|  | 28 for i=1:length(READS2) | 
|  | 29   SUMS2=[SUMS2;sum(READS2{i},1)]; | 
|  | 30   reads2=[reads2;READS2{i}]; | 
|  | 31 end | 
|  | 32 | 
|  | 33 if size(reads1,1)==0 || size(reads2,1)==0 | 
|  | 34   pval=1; | 
|  | 35   info=1; | 
|  | 36   bootstrap_results=1; | 
|  | 37   statistic=1; | 
|  | 38   return | 
|  | 39 end | 
|  | 40 | 
|  | 41 | 
|  | 42 %Get masks for the highly covered positions | 
|  | 43 [MASKS]=get_nonparametric_masks(CFG,reads1,reads2); | 
|  | 44 | 
|  | 45 | 
|  | 46 %Determine number of masks | 
|  | 47 NR_OF_MASKS=size(MASKS,1); | 
|  | 48 | 
|  | 49 % Predict variance | 
|  | 50 VARIANCE1=(1e-8)*ones(size(reads1,2),1); | 
|  | 51 for i=1:length(READS1) | 
|  | 52     TEMP_VARIANCE1=predict_variance(sum(READS1{i},1)',variance_function_nonparametric_1); | 
|  | 53     TEMP_VARIANCE1(isnan(TEMP_VARIANCE1))=0; | 
|  | 54     TEMP_VARIANCE1(isinf(TEMP_VARIANCE1))=0; | 
|  | 55     VARIANCE1=VARIANCE1+TEMP_VARIANCE1; | 
|  | 56 end | 
|  | 57 | 
|  | 58 VARIANCE2=(1e-8)*ones(size(reads1,2),1); | 
|  | 59 for i=1:length(READS2) | 
|  | 60     TEMP_VARIANCE2=predict_variance(sum(READS2{i},1)',variance_function_nonparametric_2); | 
|  | 61     TEMP_VARIANCE2(isnan(TEMP_VARIANCE2))=0; | 
|  | 62     TEMP_VARIANCE2(isinf(TEMP_VARIANCE2))=0; | 
|  | 63     VARIANCE2=VARIANCE2+TEMP_VARIANCE2; | 
|  | 64 end | 
|  | 65 | 
|  | 66 %Get the mean variance | 
|  | 67 VARIANCE1=VARIANCE1/length(READS1); | 
|  | 68 VARIANCE2=VARIANCE2/length(READS2); | 
|  | 69 | 
|  | 70 %Concatenation of all reads | 
|  | 71 allreads = [reads1;reads2]; | 
|  | 72 | 
|  | 73 | 
|  | 74 %Determine the subsampling rate | 
|  | 75 p=sum(allreads,1)/size(allreads,1); | 
|  | 76 R=size(reads1,1); | 
|  | 77 c=(p.*(1-p))/(size(allreads,1)-1); | 
|  | 78 N1s=size(allreads,1)*c./(c+(VARIANCE1'/(R^2))); | 
|  | 79 | 
|  | 80 p=sum(allreads,1)/size(allreads,1); | 
|  | 81 R=size(reads2,1); | 
|  | 82 c=(p.*(1-p))/(size(allreads,1)-1); | 
|  | 83 N2s=size(allreads,1)*c./(c+(VARIANCE2'/(R^2))); | 
|  | 84 | 
|  | 85 %Round to get integer values | 
|  | 86 N1s=ceil(full(N1s)); | 
|  | 87 N2s=ceil(full(N2s)); | 
|  | 88 | 
|  | 89 | 
|  | 90 %Total number of reads in each replicate | 
|  | 91 N1 = size(reads1,1); | 
|  | 92 N2 = size(reads2,1); | 
|  | 93 N = N1 + N2; | 
|  | 94 | 
|  | 95 | 
|  | 96 bootstrap_results=ones(NR_OF_MASKS,bootstraps); | 
|  | 97 COUNTER=0; | 
|  | 98 R1s=[]; | 
|  | 99 R2s=[]; | 
|  | 100 statistic=ones(NR_OF_MASKS,bootstraps); | 
|  | 101 nr_of_slices=CFG.nr_of_slices; | 
|  | 102 | 
|  | 103 TOTAL_SUM1=sum(reads1,1); | 
|  | 104 TOTAL_SUM2=sum(reads2,1); | 
|  | 105 TOTAL_SUM=TOTAL_SUM1+TOTAL_SUM2; | 
|  | 106 | 
|  | 107 clear reads1 | 
|  | 108 clear reads2 | 
|  | 109 %Use the transpose to make the selection of columms faster | 
|  | 110 all_reads_trans=allreads'; | 
|  | 111 clear allreads | 
|  | 112 if length(unique(TOTAL_SUM))<nr_of_slices+5 | 
|  | 113     pval=1; | 
|  | 114     bootstrap_results=[]; | 
|  | 115     statistic=[]; | 
|  | 116     pval=1; | 
|  | 117     info = 1; | 
|  | 118     return | 
|  | 119 end | 
|  | 120 | 
|  | 121 [SUM_SORTED,SUM_POS]=sort(TOTAL_SUM); | 
|  | 122 NR_OF_ZEROS=sum(TOTAL_SUM==0); | 
|  | 123 | 
|  | 124 %precompute this sum once | 
|  | 125 all_reads_trans_row_sum=sum(all_reads_trans,1); | 
|  | 126 | 
|  | 127 %These field contain | 
|  | 128 STAT_DIST=zeros(NR_OF_MASKS,nr_of_slices); | 
|  | 129 TEMP_DIST=zeros(1,nr_of_slices); | 
|  | 130 | 
|  | 131 %Precompute normalizing constants | 
|  | 132 SUM_TOTAL_SUM1=sum(TOTAL_SUM1); | 
|  | 133 SUM_TOTAL_SUM2=sum(TOTAL_SUM2); | 
|  | 134 SUM_SUM_TOTAL_SUM=SUM_TOTAL_SUM1+SUM_TOTAL_SUM2; | 
|  | 135 | 
|  | 136 %Precompute the some of the values or the slices | 
|  | 137 SLICES=zeros(nr_of_slices,size(TOTAL_SUM,2))==1; | 
|  | 138 N1_arr=zeros(nr_of_slices,1); | 
|  | 139 N2_arr=zeros(nr_of_slices,1); | 
|  | 140 FACT_arr=zeros(nr_of_slices,1); | 
|  | 141 V1_cell=cell(nr_of_slices,1); | 
|  | 142 V2_cell=cell(nr_of_slices,1); | 
|  | 143 | 
|  | 144 %Detemine regions wher to match the variances | 
|  | 145 for s=nr_of_slices:(-1):1 | 
|  | 146         LOWER_POS=min(NR_OF_ZEROS+ceil(((nr_of_slices-s)/nr_of_slices)*(length(TOTAL_SUM)-NR_OF_ZEROS))+1,length(TOTAL_SUM)); | 
|  | 147         UPPER_POS=min(NR_OF_ZEROS+ceil(((nr_of_slices-s+1)/nr_of_slices)*(length(TOTAL_SUM)-NR_OF_ZEROS))+1,length(TOTAL_SUM)); | 
|  | 148 | 
|  | 149         SLICE_LOWER_BOUND=SUM_SORTED(LOWER_POS); | 
|  | 150         SLICE_UPPER_BOUND=SUM_SORTED(UPPER_POS); | 
|  | 151         if s==nr_of_slices | 
|  | 152 	  SLICE=and(TOTAL_SUM>=SLICE_LOWER_BOUND,TOTAL_SUM<=SLICE_UPPER_BOUND); | 
|  | 153 	else | 
|  | 154 	  SLICE=and(TOTAL_SUM>SLICE_LOWER_BOUND,TOTAL_SUM<=SLICE_UPPER_BOUND); | 
|  | 155         end | 
|  | 156 	SLICES(s,:)=SLICE; | 
|  | 157 | 
|  | 158         N1s_temp=ceil(median(N1s(SLICE))); | 
|  | 159         N2s_temp=ceil(median(N2s(SLICE))); | 
|  | 160         N1s_temp=min(N1s_temp,N1); | 
|  | 161         N2s_temp=min(N2s_temp,N2); | 
|  | 162 | 
|  | 163         N1_arr(s)=N1s_temp; | 
|  | 164         N2_arr(s)=N2s_temp; | 
|  | 165 | 
|  | 166         FACT_arr(s)=sum(TOTAL_SUM1(SLICE)+TOTAL_SUM2(SLICE))/(SUM_SUM_TOTAL_SUM); | 
|  | 167 | 
|  | 168         V1_cell{s}=TOTAL_SUM1(SLICE)/SUM_TOTAL_SUM1;%temporary variable to safe time | 
|  | 169         V2_cell{s}=TOTAL_SUM2(SLICE)/SUM_TOTAL_SUM2;%temporary variable to safe time | 
|  | 170         for mask_ix=1:NR_OF_MASKS | 
|  | 171             STAT_DIST(mask_ix,s)=eucl_dist_weigthed(V1_cell{s},V2_cell{s},MASKS(mask_ix,SLICES(s,:))); | 
|  | 172         end | 
|  | 173 end | 
|  | 174 SUM_SLICES=sum(SLICES,2); | 
|  | 175 STAT_DIST(isnan(TEMP_DIST))=0; | 
|  | 176 all_reads_trans_slices=cell(nr_of_slices,1); | 
|  | 177 for s=nr_of_slices:(-1):1 | 
|  | 178     all_reads_trans_slices{s}=all_reads_trans(SLICES(s,:),:); | 
|  | 179 end | 
|  | 180 | 
|  | 181 for i = 1:bootstraps | 
|  | 182     % permutation of the reads | 
|  | 183     read_per = randperm(N); | 
|  | 184 | 
|  | 185     %Peform the computation for each region where the variances are matched | 
|  | 186     for s=nr_of_slices:(-1):1 | 
|  | 187         if SUM_SLICES(s)==0; | 
|  | 188             continue; | 
|  | 189         end | 
|  | 190 	%Create random samples 1 and 2 | 
|  | 191         sample1 = sum(all_reads_trans_slices{s}(:,read_per(1:N1_arr(s))),2); | 
|  | 192         sample2 = sum(all_reads_trans_slices{s}(:,read_per((N1_arr(s)+1):(N1_arr(s)+N2_arr(s)))),2); | 
|  | 193 | 
|  | 194         W1=sample1/sum(sample1)*FACT_arr(s);%temporary variable to safe time | 
|  | 195         W2=sample2/sum(sample2)*FACT_arr(s);%temporary variable to safe time | 
|  | 196 | 
|  | 197         for mask_ix=1:NR_OF_MASKS | 
|  | 198             TEMP_DIST(mask_ix,s)=eucl_dist_weigthed(W1,W2,MASKS(mask_ix,SLICES(s,:))'); | 
|  | 199         end | 
|  | 200 | 
|  | 201     end | 
|  | 202 | 
|  | 203     %make sure the normalisation doe not intruduces nan's | 
|  | 204     TEMP_DIST(isnan(TEMP_DIST))=0; | 
|  | 205 | 
|  | 206     COUNTER=COUNTER+1; | 
|  | 207     %Compute the average from the different matching regionon | 
|  | 208     statistic(:,COUNTER)=mean(STAT_DIST,2); | 
|  | 209     bootstrap_results(:,COUNTER)=mean(TEMP_DIST,2); | 
|  | 210 end | 
|  | 211 | 
|  | 212 bootstrap_results=bootstrap_results(:,1:COUNTER); | 
|  | 213 statistic=statistic(:,1:COUNTER); | 
|  | 214 | 
|  | 215 pval=double(sum(bootstrap_results>=statistic,2)) / COUNTER; | 
|  | 216 | 
|  | 217 info = {bootstrap_results,statistic,pval}; | 
|  | 218 pval=min(pval)*10; | 
|  | 219 | 
|  | 220 function result = eucl_dist_weigthed(A,B,W) | 
|  | 221 | 
|  | 222 result = sqrt(sum( W.*((A - B) .^ 2) )); |