annotate rDiff/src/get_read_counts.m @ 2:233c30f91d66

updated python based GFF parsing module which will handle GTF/GFF/GFF3 file types
author vipints <vipin@cbio.mskcc.org>
date Tue, 08 Oct 2013 07:15:44 -0400
parents 0f80a5141704
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
1 function []=get_read_counts(CFG,genes)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
2
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
3 if CFG.use_rproc
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
4 JB_NR=1;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
5 JOB_INFO = rproc_empty();
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
6 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
7
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
8 %%% Get the read counts
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
9 if CFG.estimate_gene_expression==1
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
10 for RUN=1:size(CFG.BAM_FILES,2)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
11 % configuration
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
12 CFG.curr_bamfile = CFG.BAM_FILES{RUN};
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
13 if not(CFG.use_rproc)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
14 fprintf('Getting gene expression for: %s\n', CFG.curr_bamfile);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
15 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
16 tic
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
17 %define the splits of the genes for the jobs
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
18 idx=[(1:size(genes,2))',ceil((1:size(genes,2))*CFG.rproc_num_jobs/size(genes,2))'];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
19 % submit jobs to cluster
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
20 for i = 1:CFG.rproc_num_jobs
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
21 PAR.genes = genes(idx(idx(:,2)==i,1));
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
22 CFG.rproc_memreq = 5000;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
23 CFG.rproc_par.mem_req_resubmit = [10000 20000 32000];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
24 CFG.rproc_par.identifier = sprintf('Exp.%i-',i);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
25 CFG.outfile_prefix=[CFG.out_base_temp CFG.NAMES{RUN} '_' num2str(i) '_of_' num2str(CFG.rproc_num_jobs) '.mat'];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
26 PAR.CFG=CFG;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
27 if CFG.use_rproc
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
28 fprintf(1, 'Submitting job %i to cluster\n',i);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
29 JOB_INFO(JB_NR) = rproc('get_reads_caller', PAR,CFG.rproc_memreq, CFG.rproc_par, CFG.rproc_time);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
30 JB_NR=JB_NR+1;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
31 else
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
32 get_reads_caller(PAR);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
33 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
34 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
35 toc
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
36 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
37 if CFG.use_rproc
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
38 [JOB_INFO num_crashed] = rproc_wait(JOB_INFO, 60, 1, -1);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
39 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
40 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
41
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
42 %%% Generate the output files
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
43 %load the count files
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
44 %load the count files
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
45 READS_PER_GENE=zeros(size(genes,2),size(CFG.BAM_FILES,2));
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
46 GENE_EXPR=zeros(size(genes,2),size(CFG.BAM_FILES,2));
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
47
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
48 Counts_rDiff_parametric=cell(size(genes,2),size(CFG.BAM_FILES,2));
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
49 Counts_rDiff_nonparametric=cell(size(genes,2),size(CFG.BAM_FILES,2));
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
50
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
51
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
52 %Field containing the errors
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
53 ERRORS_NR=[];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
54 idx=[(1:size(genes,2))',ceil((1:size(genes,2))*CFG.rproc_num_jobs/size(genes,2))'];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
55 % Iterate over the result files to load the data from the count files
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
56 for RUN=1:size(CFG.BAM_FILES,2)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
57 for j = 1:CFG.rproc_num_jobs
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
58 IN_FILENAME=[CFG.out_base_temp CFG.NAMES{RUN} '_' num2str(j) '_of_' num2str(CFG.rproc_num_jobs) '.mat'];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
59 IDX=idx(idx(:,2)==j,1);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
60 try
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
61 load(IN_FILENAME)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
62 for k=1:length(IDX)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
63 %Check wether COUNTS is empty
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
64 if isempty(COUNTS{k})
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
65 continue
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
66 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
67 %Get the number of reads mapping to a gene
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
68 if not(isempty(COUNTS{k}{2}))
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
69 READS_PER_GENE(IDX(k),RUN)=COUNTS{k}{2};
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
70 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
71
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
72 %get the number of nonalternative reads if
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
73 %possible. Otherwise use the number of reads
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
74 %mapping to a gene as gene expression
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
75 if not(isempty(COUNTS{k}{3}))
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
76 GENE_EXPE(IDX(k),RUN)=COUNTS{k}{3};
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
77 else
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
78 GENE_EXPE(IDX(k),RUN)=READS_PER_GENE(IDX(k),RUN);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
79 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
80
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
81 %get the Counts for rDiff.parametric
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
82 if not(isempty(COUNTS{k}{6}))
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
83 Counts_rDiff_parametric{IDX(k),RUN}=COUNTS{k}{6};
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
84 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
85
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
86 %get the counts for rDiff.nonparametric
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
87 if not(isempty(COUNTS{k}{4}))
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
88 Counts_rDiff_nonparametric{IDX(k),RUN}=COUNTS{k}{4};
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
89 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
90 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
91 catch
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
92 warning(['There was a problem loading: ' IN_FILENAME ])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
93 % If something went wrong
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
94 for k=1:length(IDX)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
95 READS_PER_GENE(IDX(k),RUN)=0;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
96 GENE_EXPR(IDX(k),RUN)=0;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
97 Counts_rDiff_parametric{IDX(k),RUN}={};
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
98 Counts_rDiff_nonparametric{IDX(k),RUN}={};
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
99 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
100 ERRORS_NR=[ERRORS_NR; [RUN,i]];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
101 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
102 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
103 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
104 if not(isempty(ERRORS_NR))
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
105 warning('There have been problems loading some of the raw count files');
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
106 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
107
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
108 %If less than 10 reads use abulute number of reads
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
109 GENE_EXPR(GENE_EXPR<10)=READS_PER_GENE(GENE_EXPR<10);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
110
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
111 %Generate gene expression tables
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
112
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
113 %Open file handler for the gene expression table
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
114 EXPR_TAB_FILENAME=[CFG.out_base 'Gene_expression.tab'];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
115 fid=fopen(EXPR_TAB_FILENAME,'w');
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
116
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
117 %print header
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
118 fprintf(fid,'gene');
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
119 for i=1:length(CFG.NAMES)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
120 fprintf(fid,'\t%s',CFG.NAMES{i});
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
121 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
122 fprintf(fid,'\n');
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
123
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
124 for j=1:size(genes,2)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
125 fprintf(fid,'%s',genes(j).name);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
126 for i=1:length(CFG.NAMES)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
127 fprintf(fid,'\t%i',GENE_EXPR(j,i));
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
128 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
129 fprintf(fid,'\n');
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
130 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
131
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
132 fclose(fid)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
133
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
134 %Determine interpreter
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
135 if size(ver('Octave'),1)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
136 INTERPR = 1;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
137 else
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
138 INTERPR = 0;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
139 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
140
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
141
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
142 %Save alternative region count file for rDiff.parametric
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
143 OUT_FILENAME=[CFG.out_base 'Alternative_region_counts.mat'];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
144 if INTERPR
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
145 save('-mat7-binary',OUT_FILENAME,'Counts_rDiff_parametric')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
146 else
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
147 save(OUT_FILENAME,'Counts_rDiff_parametric','-v7.3')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
148 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
149
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
150 %Save alternative region count file for rDiff.nonparametric
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
151 OUT_FILENAME=[CFG.out_base 'Nonparametric_region_counts.mat'];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
152 if INTERPR
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
153 save('-mat7-binary',OUT_FILENAME,'Counts_rDiff_nonparametric')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
154 else
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
155 save(OUT_FILENAME,'Counts_rDiff_nonparametric','-v7.3')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
156 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
157
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
158 return