Mercurial > repos > vipints > rdiff
diff rDiff/src/get_reads_caller.m @ 0:0f80a5141704
version 0.3 uploaded
author | vipints |
---|---|
date | Thu, 14 Feb 2013 23:38:36 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rDiff/src/get_reads_caller.m Thu Feb 14 23:38:36 2013 -0500 @@ -0,0 +1,150 @@ +function [COUNTS]=get_reads_caller(PAR) + +CFG = PAR.CFG; +genes = PAR.genes; +clear PAR; + +% add paths +addpath(CFG.paths); + +data_dir=CFG.data_dir; +OUT_STR=[]; + +COUNTS=cell(size(genes,2),1); + + +% NON_PARAM_SAMPLE contains the read start density +if CFG.perform_nonparametric + NON_PARAM_SAMPLE=sparse([],[],[],10000,1,5000); +end + + +for i=1:size(genes,2) + + + + %TEMP_COUNT contins the counts for the current gene + TEMP_COUNT=cell(1,7); + gene = genes(i); + + %set default return values + TEMP_COUNT{1}=gene.name; + TEMP_COUNT{2}=[]; + TEMP_COUNT{3}=[]; + TEMP_COUNT{4}=[]; + TEMP_COUNT{5}=[]; + TEMP_COUNT{6}=[]; + + %check that the gene has exons defined + if isempty(gene.exons) + STAT{i}=TEMP_COUNT; + continue; + end + + %check that the gene is longer than the Reads. Otherwise the + %definition of regions does not makes sense + if gene.stop-gene.start<CFG.sequenced_length+3 + STAT{i}=TEMP_COUNT; + continue; + end + + %Get the reads from gene + [reads] = get_reads_for_gene(CFG,gene); + + + %Get total number of reads + NR_OF_READS=size(reads,1); + TEMP_COUNT{2}=NR_OF_READS; + + + NR_OF_TRANS=size(gene.transcripts,2); + %check that the gene has more than one isoform + if NR_OF_TRANS<=1 + STAT{i}=TEMP_COUNT; + continue; + end + + + EXON_IDX=sum(gene.exonsequence,1)<NR_OF_TRANS; + + %Transform the reads in to counts per region + [NEW_READS,UNEXPLAINED_READS,UNEXPLAINED_INDEX]= convert_reads_to_region_indicators(reads,gene); + + if length(unique(sum(NEW_READS,2)))>1 + warning(['Assignment of reads to regions is not unique for gene:' gene.name '\n']); + end + + + %Calulate gene expression + %Get the non_unique_regions + NON_ALT_EIRS=sum(gene.eirs_in_seq,1)==NR_OF_TRANS; + TEMP_COUNT{3}=sum(sum(NEW_READS(:,NON_ALT_EIRS),2)>0); + + + %Get Counts for nonparametric variance function + if CFG.perform_nonparametric + %Get the read starts + [TEMP,START]=max(reads,[],2); + read_starts=sparse((1:NR_OF_READS)',START,ones(NR_OF_READS,1),NR_OF_READS,size(reads,2),NR_OF_READS); + + %Get the index of the alternative reads + ALT_READ_IX=zeros(size(reads,1),1); + ALT_EIRS=and(sum(gene.eirs_in_seq,1)<NR_OF_TRANS,sum(gene.eirs_in_seq,1)>0); + ALT_READ_IX(UNEXPLAINED_INDEX==0)=sum(NEW_READS(:,ALT_EIRS),2)>0; + + %Get the coverage of the read starts + %COVERAGE=sum(read_starts(find(ALT_READ_IX>0),:),1); + if CFG.only_gene_start + COVERAGE=sum(reads,1); + else + COVERAGE=sum(reads(find(ALT_READ_IX>0),:),1); + end + if max(COVERAGE)>0 + TEMP_COUNT{4}=COVERAGE; + else + TEMP_COUNT{4}=[]; + end + end + + + %Get counts for parametric settting + if or(CFG.perform_parametric,CFG.perform_poisson) + %Get the alternative reads + ALT_EIRS=and(sum(gene.eirs_in_seq,1)<NR_OF_TRANS,sum(gene.eirs_in_seq,1)>0); + + %Return the Counts in the alternative EIRS + + COUNTS_PER_EIR=sum(NEW_READS(:,ALT_EIRS),1); + EXS_SEQ=gene.eirs_in_seq(:,ALT_EIRS); + [NEWCOLS,IDX2,POS]=unique(EXS_SEQ','rows'); + NEWCOLS=NEWCOLS'; + EIR_COUNTS=zeros(1,length(IDX2)); + for j=1:max(POS) + EIR_COUNTS(j)=sum(COUNTS_PER_EIR(POS==j)); + end + TEMP_COUNT{6}=EIR_COUNTS; + end + + clear NEW_READS + clear reads; + COUNTS{i}=TEMP_COUNT; + + OLD_OUT_STR=OUT_STR; + OUT_STR=['Finished ' num2str(i) ' out of ' num2str(size(genes,2)) ' genes']; + %print progress + if CFG.use_rproc + fprintf([OUT_STR '\n']) + else + % Erase old progress + fprintf(repmat('\b',1,length(OLD_OUT_STR))); + fprintf([OUT_STR]) + end + + +end +fprintf('\n') +%Save the counts +OUT_FILENAME=[CFG.outfile_prefix]; +save(OUT_FILENAME,'COUNTS') +%Save the nonparametric histogram +