annotate rDiff/src/get_reads_caller.m @ 2:233c30f91d66

updated python based GFF parsing module which will handle GTF/GFF/GFF3 file types
author vipints <vipin@cbio.mskcc.org>
date Tue, 08 Oct 2013 07:15:44 -0400
parents 0f80a5141704
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
function [COUNTS]=get_reads_caller(PAR)
% GET_READS_CALLER  Collect per-gene read counts and save them to disk.
%
%   COUNTS = get_reads_caller(PAR)
%
%   Input:
%     PAR.CFG   - configuration struct. Fields read directly in this
%                 function: paths, sequenced_length, perform_nonparametric,
%                 perform_parametric, perform_poisson, only_gene_start,
%                 use_rproc, outfile_prefix. CFG is also handed unchanged to
%                 get_reads_for_gene, which may read further fields.
%     PAR.genes - struct array of genes, iterated along its 2nd dimension.
%                 Fields read directly here: name, exons, start, stop,
%                 transcripts, eirs_in_seq.
%
%   Output:
%     COUNTS    - size(genes,2)-by-1 cell array. COUNTS{i} is a 1x7 cell
%                 for gene i:
%                   {1} gene.name
%                   {2} number of rows of the read matrix returned by
%                       get_reads_for_gene ([] if the gene was skipped
%                       before the reads were loaded)
%                   {3} number of rows of NEW_READS with at least one entry
%                       in a region used by all transcripts
%                   {4} column sums of the read matrix (all reads if
%                       CFG.only_gene_start, otherwise only reads flagged
%                       as hitting an alternative region); [] when the
%                       maximum is not positive or when
%                       CFG.perform_nonparametric is false
%                   {5} never filled in by this function
%                   {6} counts per unique alternative-region pattern, only
%                       when CFG.perform_parametric or CFG.perform_poisson
%                   {7} never filled in by this function
%
%   Side effects: calls addpath(CFG.paths), prints progress with fprintf,
%   and saves the variable COUNTS to the file CFG.outfile_prefix.
%
%   Genes without exons, genes not longer than the reads, and genes with
%   at most one transcript get the default record (name plus whatever was
%   computed before the check). Previously those records were written to
%   an unused variable and therefore lost; they are now stored in COUNTS.

CFG = PAR.CFG;
genes = PAR.genes;
clear PAR;

% add paths
addpath(CFG.paths);

NR_OF_GENES=size(genes,2);
COUNTS=cell(NR_OF_GENES,1);

% Last progress message that was printed; needed to erase it in place.
OUT_STR='';

for i=1:NR_OF_GENES

    gene = genes(i);

    % TEMP_COUNT contains the counts for the current gene. cell() already
    % initialises every slot to [], which serves as the default value.
    TEMP_COUNT=cell(1,7);
    TEMP_COUNT{1}=gene.name;

    % check that the gene has exons defined
    if isempty(gene.exons)
        COUNTS{i}=TEMP_COUNT;
        continue;
    end

    % check that the gene is longer than the reads. Otherwise the
    % definition of regions does not make sense
    if gene.stop-gene.start<CFG.sequenced_length+3
        COUNTS{i}=TEMP_COUNT;
        continue;
    end

    % Get the reads from the gene
    reads = get_reads_for_gene(CFG,gene);

    % Get total number of reads (one row per read)
    NR_OF_READS=size(reads,1);
    TEMP_COUNT{2}=NR_OF_READS;

    % check that the gene has more than one isoform
    NR_OF_TRANS=size(gene.transcripts,2);
    if NR_OF_TRANS<=1
        COUNTS{i}=TEMP_COUNT;
        continue;
    end

    % Transform the reads into counts per region
    [NEW_READS,UNEXPLAINED_READS,UNEXPLAINED_INDEX]=convert_reads_to_region_indicators(reads,gene);

    if length(unique(sum(NEW_READS,2)))>1
        % Format string + argument so that the escape sequence is
        % interpreted and special characters in the gene name are safe.
        warning('Assignment of reads to regions is not unique for gene:%s\n',gene.name);
    end

    % Number of transcripts using each region (column of eirs_in_seq).
    EIR_USAGE=sum(gene.eirs_in_seq,1);
    % Regions used by every transcript
    NON_ALT_EIRS=EIR_USAGE==NR_OF_TRANS;
    % Regions used by some, but not all, transcripts
    ALT_EIRS=and(EIR_USAGE<NR_OF_TRANS,EIR_USAGE>0);

    % Calculate gene expression: reads with at least one entry in the
    % regions shared by all transcripts
    TEMP_COUNT{3}=sum(sum(NEW_READS(:,NON_ALT_EIRS),2)>0);

    % Get counts for the nonparametric variance function
    if CFG.perform_nonparametric
        % Flag the reads that hit an alternative region. Only entries with
        % UNEXPLAINED_INDEX==0 are assigned (presumably these are the reads
        % that have a row in NEW_READS -- confirm against
        % convert_reads_to_region_indicators).
        ALT_READ_IX=zeros(NR_OF_READS,1);
        ALT_READ_IX(UNEXPLAINED_INDEX==0)=sum(NEW_READS(:,ALT_EIRS),2)>0;

        % Get the coverage
        if CFG.only_gene_start
            COVERAGE=sum(reads,1);
        else
            COVERAGE=sum(reads(ALT_READ_IX>0,:),1);
        end

        % An empty or all-zero coverage leaves the default [] in slot 4.
        if max(COVERAGE)>0
            TEMP_COUNT{4}=COVERAGE;
        else
            TEMP_COUNT{4}=[];
        end
    end

    % Get counts for the parametric setting
    if or(CFG.perform_parametric,CFG.perform_poisson)
        % Counts in each alternative region
        COUNTS_PER_EIR=sum(NEW_READS(:,ALT_EIRS),1);

        % Alternative regions with the same transcript-usage pattern are
        % merged: POS maps every alternative region to its unique pattern.
        EXS_SEQ=gene.eirs_in_seq(:,ALT_EIRS);
        [UNIQUE_PATTERNS,IDX2,POS]=unique(EXS_SEQ','rows');

        EIR_COUNTS=zeros(1,length(IDX2));
        for j=1:max(POS)
            EIR_COUNTS(j)=sum(COUNTS_PER_EIR(POS==j));
        end
        TEMP_COUNT{6}=EIR_COUNTS;
    end

    % Release the per-gene matrices before the next iteration
    clear NEW_READS
    clear reads;
    COUNTS{i}=TEMP_COUNT;

    % print progress
    OLD_OUT_STR=OUT_STR;
    OUT_STR=['Finished ' num2str(i) ' out of ' num2str(NR_OF_GENES) ' genes'];
    if CFG.use_rproc
        fprintf('%s\n',OUT_STR);
    else
        % Erase old progress and overwrite it in place
        fprintf(repmat('\b',1,length(OLD_OUT_STR)));
        fprintf('%s',OUT_STR);
    end

end
fprintf('\n')

% Save the counts
OUT_FILENAME=CFG.outfile_prefix;
save(OUT_FILENAME,'COUNTS')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
149 %Save the nonparametric histogram
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
150