diff rDiff/src/tests/rDiff_nonparametric.m @ 0:0f80a5141704

version 0.3 uploaded
author vipints
date Thu, 14 Feb 2013 23:38:36 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rDiff/src/tests/rDiff_nonparametric.m	Thu Feb 14 23:38:36 2013 -0500
@@ -0,0 +1,222 @@
+function [pval,info] = rDiff_nonparametric(CFG,READS1,READS2,variance_function_nonparametric_1, variance_function_nonparametric_2)
+% RDIFF_NONPARAMETRIC  Permutation (bootstrap) test for differential read
+% coverage between two samples.
+%
+% Inputs:
+%   CFG     - configuration struct; fields used here: CFG.bootstraps
+%             (number of permutation rounds) and CFG.nr_of_slices
+%             (number of coverage bins in which the variances are matched).
+%   READS1  - cell array with one read matrix per replicate of sample 1
+%             (rows are reads, columns are positions); empty cells allowed.
+%   READS2  - cell array with one read matrix per replicate of sample 2.
+%   variance_function_nonparametric_1/2 - variance models handed to
+%             predict_variance (defined elsewhere in the project).
+%
+% Outputs:
+%   pval    - minimum over the masks of the per-mask permutation p-values,
+%             multiplied by 10 (correction for the number of masks tested
+%             - NOTE(review): confirm the intended correction factor).
+%   info    - {bootstrap_results, statistic, pval} with the raw permutation
+%             results (the pval entry is the uncorrected per-mask vector),
+%             or the scalar 1 when the test could not be performed.
+
+bootstraps=CFG.bootstraps;
+
+%Initialize the read containers
+SUMS1=[]; %the read coverage per replicate for sample 1
+SUMS2=[]; %the read coverage per replicate for sample 2
+reads1=[]; %all reads of sample 1, stacked over replicates
+reads2=[]; %all reads of sample 2, stacked over replicates
+
+%Drop empty replicates from both samples
+NON_EMPTY=[];
+for i=1:length(READS1)
+  if not(isempty(READS1{i}))
+    NON_EMPTY=[NON_EMPTY,i];
+  end
+end
+READS1={READS1{ NON_EMPTY}};
+NON_EMPTY=[];
+for i=1:length(READS2)
+  if not(isempty(READS2{i}))
+    NON_EMPTY=[NON_EMPTY,i];
+  end
+end
+READS2={READS2{NON_EMPTY}};
+
+%Stack the reads and collect the per-replicate coverage sums
+for i=1:length(READS1)
+  SUMS1=[SUMS1;sum(READS1{i},1)];
+  reads1=[reads1;READS1{i}];
+end
+for i=1:length(READS2)
+  SUMS2=[SUMS2;sum(READS2{i},1)];
+  reads2=[reads2;READS2{i}];
+end
+
+%If either sample has no reads at all, the test is uninformative
+if size(reads1,1)==0 || size(reads2,1)==0
+  pval=1;
+  info=1;
+  return
+end
+
+
+%Get masks for the highly covered positions
+[MASKS]=get_nonparametric_masks(CFG,reads1,reads2);
+
+
+%Determine number of masks
+NR_OF_MASKS=size(MASKS,1);
+
+%Predict the positional variance for each sample; NaN/Inf predictions
+%are treated as zero. The small constant keeps the vector strictly positive.
+VARIANCE1=(1e-8)*ones(size(reads1,2),1);
+for i=1:length(READS1)
+    TEMP_VARIANCE1=predict_variance(sum(READS1{i},1)',variance_function_nonparametric_1);
+    TEMP_VARIANCE1(isnan(TEMP_VARIANCE1))=0;
+    TEMP_VARIANCE1(isinf(TEMP_VARIANCE1))=0;
+    VARIANCE1=VARIANCE1+TEMP_VARIANCE1;
+end
+
+VARIANCE2=(1e-8)*ones(size(reads1,2),1);
+for i=1:length(READS2)
+    TEMP_VARIANCE2=predict_variance(sum(READS2{i},1)',variance_function_nonparametric_2);
+    TEMP_VARIANCE2(isnan(TEMP_VARIANCE2))=0;
+    TEMP_VARIANCE2(isinf(TEMP_VARIANCE2))=0;
+    VARIANCE2=VARIANCE2+TEMP_VARIANCE2;
+end
+
+%Average the variance over the replicates of each sample
+VARIANCE1=VARIANCE1/length(READS1);
+VARIANCE2=VARIANCE2/length(READS2);
+
+%Concatenation of all reads from both samples
+allreads = [reads1;reads2];
+
+
+%Determine the per-position subsampling rate for each sample
+p=sum(allreads,1)/size(allreads,1);
+R=size(reads1,1);
+c=(p.*(1-p))/(size(allreads,1)-1);
+N1s=size(allreads,1)*c./(c+(VARIANCE1'/(R^2)));
+
+p=sum(allreads,1)/size(allreads,1);
+R=size(reads2,1);
+c=(p.*(1-p))/(size(allreads,1)-1);
+N2s=size(allreads,1)*c./(c+(VARIANCE2'/(R^2)));
+
+%Round up to get integer sample sizes
+N1s=ceil(full(N1s));
+N2s=ceil(full(N2s));
+
+
+%Total number of reads in each sample
+N1 = size(reads1,1);
+N2 = size(reads2,1);
+N = N1 + N2;
+
+
+bootstrap_results=ones(NR_OF_MASKS,bootstraps);
+COUNTER=0;
+statistic=ones(NR_OF_MASKS,bootstraps);
+nr_of_slices=CFG.nr_of_slices;
+
+TOTAL_SUM1=sum(reads1,1);
+TOTAL_SUM2=sum(reads2,1);
+TOTAL_SUM=TOTAL_SUM1+TOTAL_SUM2;
+
+clear reads1
+clear reads2
+%Use the transpose to make the selection of columns faster
+all_reads_trans=allreads';
+clear allreads
+%Not enough distinct coverage values to build the slices: give up
+if length(unique(TOTAL_SUM))<nr_of_slices+5
+    pval=1;
+    info = 1;
+    return
+end
+
+[SUM_SORTED,SUM_POS]=sort(TOTAL_SUM);
+NR_OF_ZEROS=sum(TOTAL_SUM==0);
+
+%STAT_DIST holds the observed statistic per mask and slice;
+%TEMP_DIST holds the permuted statistic per mask and slice.
+%(Was zeros(1,nr_of_slices); preallocate the full matrix instead of
+%letting the first bootstrap iteration grow it implicitly.)
+STAT_DIST=zeros(NR_OF_MASKS,nr_of_slices);
+TEMP_DIST=zeros(NR_OF_MASKS,nr_of_slices);
+
+%Precompute normalizing constants
+SUM_TOTAL_SUM1=sum(TOTAL_SUM1);
+SUM_TOTAL_SUM2=sum(TOTAL_SUM2);
+SUM_SUM_TOTAL_SUM=SUM_TOTAL_SUM1+SUM_TOTAL_SUM2;
+
+%Precompute the slice memberships and the per-slice constants
+SLICES=zeros(nr_of_slices,size(TOTAL_SUM,2))==1;
+N1_arr=zeros(nr_of_slices,1);
+N2_arr=zeros(nr_of_slices,1);
+FACT_arr=zeros(nr_of_slices,1);
+V1_cell=cell(nr_of_slices,1);
+V2_cell=cell(nr_of_slices,1);
+
+%Determine the coverage regions (slices) in which the variances are matched
+for s=nr_of_slices:(-1):1
+        LOWER_POS=min(NR_OF_ZEROS+ceil(((nr_of_slices-s)/nr_of_slices)*(length(TOTAL_SUM)-NR_OF_ZEROS))+1,length(TOTAL_SUM));
+        UPPER_POS=min(NR_OF_ZEROS+ceil(((nr_of_slices-s+1)/nr_of_slices)*(length(TOTAL_SUM)-NR_OF_ZEROS))+1,length(TOTAL_SUM));
+        
+        SLICE_LOWER_BOUND=SUM_SORTED(LOWER_POS);
+        SLICE_UPPER_BOUND=SUM_SORTED(UPPER_POS);
+        %The top slice includes its lower bound so every position is
+        %assigned to some slice
+        if s==nr_of_slices
+	  SLICE=and(TOTAL_SUM>=SLICE_LOWER_BOUND,TOTAL_SUM<=SLICE_UPPER_BOUND);
+	else
+	  SLICE=and(TOTAL_SUM>SLICE_LOWER_BOUND,TOTAL_SUM<=SLICE_UPPER_BOUND);
+        end
+	SLICES(s,:)=SLICE;
+        
+        %Subsample sizes for this slice, capped by the available reads
+        N1s_temp=ceil(median(N1s(SLICE)));
+        N2s_temp=ceil(median(N2s(SLICE)));
+        N1s_temp=min(N1s_temp,N1);
+        N2s_temp=min(N2s_temp,N2);
+        
+        N1_arr(s)=N1s_temp;
+        N2_arr(s)=N2s_temp;
+        
+        %Weight of this slice: its share of the total coverage
+        FACT_arr(s)=sum(TOTAL_SUM1(SLICE)+TOTAL_SUM2(SLICE))/(SUM_SUM_TOTAL_SUM);
+        
+        V1_cell{s}=TOTAL_SUM1(SLICE)/SUM_TOTAL_SUM1;%temporary variable to save time
+        V2_cell{s}=TOTAL_SUM2(SLICE)/SUM_TOTAL_SUM2;%temporary variable to save time
+        %Observed test statistic for each mask in this slice
+        for mask_ix=1:NR_OF_MASKS
+            STAT_DIST(mask_ix,s)=eucl_dist_weigthed(V1_cell{s},V2_cell{s},MASKS(mask_ix,SLICES(s,:)));
+        end
+end
+SUM_SLICES=sum(SLICES,2);
+%Make sure the normalisation does not introduce NaN's into the observed
+%statistic. (Bug fix: was STAT_DIST(isnan(TEMP_DIST))=0, a no-op since
+%TEMP_DIST is all zeros here, mirroring the TEMP_DIST cleanup below.)
+STAT_DIST(isnan(STAT_DIST))=0;
+all_reads_trans_slices=cell(nr_of_slices,1);
+for s=nr_of_slices:(-1):1
+    all_reads_trans_slices{s}=all_reads_trans(SLICES(s,:),:);
+end
+
+for i = 1:bootstraps
+    %Random permutation of the pooled reads
+    read_per = randperm(N);
+     
+    %Perform the computation for each region where the variances are matched
+    for s=nr_of_slices:(-1):1
+        if SUM_SLICES(s)==0
+            continue;
+        end 
+	%Draw the two pseudo-samples from the permuted reads
+        sample1 = sum(all_reads_trans_slices{s}(:,read_per(1:N1_arr(s))),2);
+        sample2 = sum(all_reads_trans_slices{s}(:,read_per((N1_arr(s)+1):(N1_arr(s)+N2_arr(s)))),2);
+        
+        W1=sample1/sum(sample1)*FACT_arr(s);%temporary variable to save time
+        W2=sample2/sum(sample2)*FACT_arr(s);%temporary variable to save time
+       
+        for mask_ix=1:NR_OF_MASKS
+            TEMP_DIST(mask_ix,s)=eucl_dist_weigthed(W1,W2,MASKS(mask_ix,SLICES(s,:))');
+        end
+        
+    end 
+    
+    %Make sure the normalisation does not introduce NaN's
+    TEMP_DIST(isnan(TEMP_DIST))=0;
+   
+    COUNTER=COUNTER+1;  
+    %Average the statistics over the matching regions
+    statistic(:,COUNTER)=mean(STAT_DIST,2);
+    bootstrap_results(:,COUNTER)=mean(TEMP_DIST,2);
+end
+
+bootstrap_results=bootstrap_results(:,1:COUNTER);
+statistic=statistic(:,1:COUNTER);
+
+%Permutation p-value per mask: fraction of permutations whose statistic
+%is at least as large as the observed one
+pval=double(sum(bootstrap_results>=statistic,2)) / COUNTER;
+
+info = {bootstrap_results,statistic,pval};
+pval=min(pval)*10;
+
+function result = eucl_dist_weigthed(A,B,W)
+%Weighted Euclidean distance between the vectors A and B, where W gives
+%the per-position weight of each squared difference.
+diff_vec = A - B;
+result = sqrt(sum(W .* diff_vec .* diff_vec));