comparison rDiff/src/get_read_counts.m @ 0:0f80a5141704

version 0.3 uploaded
author vipints
date Thu, 14 Feb 2013 23:38:36 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0f80a5141704
1 function []=get_read_counts(CFG,genes)
2
3 if CFG.use_rproc
4 JB_NR=1;
5 JOB_INFO = rproc_empty();
6 end
7
8 %%% Get the read counts
9 if CFG.estimate_gene_expression==1
10 for RUN=1:size(CFG.BAM_FILES,2)
11 % configuration
12 CFG.curr_bamfile = CFG.BAM_FILES{RUN};
13 if not(CFG.use_rproc)
14 fprintf('Getting gene expression for: %s\n', CFG.curr_bamfile);
15 end
16 tic
17 %define the splits of the genes for the jobs
18 idx=[(1:size(genes,2))',ceil((1:size(genes,2))*CFG.rproc_num_jobs/size(genes,2))'];
19 % submit jobs to cluster
20 for i = 1:CFG.rproc_num_jobs
21 PAR.genes = genes(idx(idx(:,2)==i,1));
22 CFG.rproc_memreq = 5000;
23 CFG.rproc_par.mem_req_resubmit = [10000 20000 32000];
24 CFG.rproc_par.identifier = sprintf('Exp.%i-',i);
25 CFG.outfile_prefix=[CFG.out_base_temp CFG.NAMES{RUN} '_' num2str(i) '_of_' num2str(CFG.rproc_num_jobs) '.mat'];
26 PAR.CFG=CFG;
27 if CFG.use_rproc
28 fprintf(1, 'Submitting job %i to cluster\n',i);
29 JOB_INFO(JB_NR) = rproc('get_reads_caller', PAR,CFG.rproc_memreq, CFG.rproc_par, CFG.rproc_time);
30 JB_NR=JB_NR+1;
31 else
32 get_reads_caller(PAR);
33 end
34 end
35 toc
36 end
37 if CFG.use_rproc
38 [JOB_INFO num_crashed] = rproc_wait(JOB_INFO, 60, 1, -1);
39 end
40 end
41
42 %%% Generate the output files
43 %load the count files
44 %load the count files
45 READS_PER_GENE=zeros(size(genes,2),size(CFG.BAM_FILES,2));
46 GENE_EXPR=zeros(size(genes,2),size(CFG.BAM_FILES,2));
47
48 Counts_rDiff_parametric=cell(size(genes,2),size(CFG.BAM_FILES,2));
49 Counts_rDiff_nonparametric=cell(size(genes,2),size(CFG.BAM_FILES,2));
50
51
52 %Field containing the errors
53 ERRORS_NR=[];
54 idx=[(1:size(genes,2))',ceil((1:size(genes,2))*CFG.rproc_num_jobs/size(genes,2))'];
55 % Iterate over the result files to load the data from the count files
56 for RUN=1:size(CFG.BAM_FILES,2)
57 for j = 1:CFG.rproc_num_jobs
58 IN_FILENAME=[CFG.out_base_temp CFG.NAMES{RUN} '_' num2str(j) '_of_' num2str(CFG.rproc_num_jobs) '.mat'];
59 IDX=idx(idx(:,2)==j,1);
60 try
61 load(IN_FILENAME)
62 for k=1:length(IDX)
63 %Check wether COUNTS is empty
64 if isempty(COUNTS{k})
65 continue
66 end
67 %Get the number of reads mapping to a gene
68 if not(isempty(COUNTS{k}{2}))
69 READS_PER_GENE(IDX(k),RUN)=COUNTS{k}{2};
70 end
71
72 %get the number of nonalternative reads if
73 %possible. Otherwise use the number of reads
74 %mapping to a gene as gene expression
75 if not(isempty(COUNTS{k}{3}))
76 GENE_EXPE(IDX(k),RUN)=COUNTS{k}{3};
77 else
78 GENE_EXPE(IDX(k),RUN)=READS_PER_GENE(IDX(k),RUN);
79 end
80
81 %get the Counts for rDiff.parametric
82 if not(isempty(COUNTS{k}{6}))
83 Counts_rDiff_parametric{IDX(k),RUN}=COUNTS{k}{6};
84 end
85
86 %get the counts for rDiff.nonparametric
87 if not(isempty(COUNTS{k}{4}))
88 Counts_rDiff_nonparametric{IDX(k),RUN}=COUNTS{k}{4};
89 end
90 end
91 catch
92 warning(['There was a problem loading: ' IN_FILENAME ])
93 % If something went wrong
94 for k=1:length(IDX)
95 READS_PER_GENE(IDX(k),RUN)=0;
96 GENE_EXPR(IDX(k),RUN)=0;
97 Counts_rDiff_parametric{IDX(k),RUN}={};
98 Counts_rDiff_nonparametric{IDX(k),RUN}={};
99 end
100 ERRORS_NR=[ERRORS_NR; [RUN,i]];
101 end
102 end
103 end
104 if not(isempty(ERRORS_NR))
105 warning('There have been problems loading some of the raw count files');
106 end
107
108 %If less than 10 reads use abulute number of reads
109 GENE_EXPR(GENE_EXPR<10)=READS_PER_GENE(GENE_EXPR<10);
110
111 %Generate gene expression tables
112
113 %Open file handler for the gene expression table
114 EXPR_TAB_FILENAME=[CFG.out_base 'Gene_expression.tab'];
115 fid=fopen(EXPR_TAB_FILENAME,'w');
116
117 %print header
118 fprintf(fid,'gene');
119 for i=1:length(CFG.NAMES)
120 fprintf(fid,'\t%s',CFG.NAMES{i});
121 end
122 fprintf(fid,'\n');
123
124 for j=1:size(genes,2)
125 fprintf(fid,'%s',genes(j).name);
126 for i=1:length(CFG.NAMES)
127 fprintf(fid,'\t%i',GENE_EXPR(j,i));
128 end
129 fprintf(fid,'\n');
130 end
131
132 fclose(fid)
133
134 %Determine interpreter
135 if size(ver('Octave'),1)
136 INTERPR = 1;
137 else
138 INTERPR = 0;
139 end
140
141
142 %Save alternative region count file for rDiff.parametric
143 OUT_FILENAME=[CFG.out_base 'Alternative_region_counts.mat'];
144 if INTERPR
145 save('-mat7-binary',OUT_FILENAME,'Counts_rDiff_parametric')
146 else
147 save(OUT_FILENAME,'Counts_rDiff_parametric','-v7.3')
148 end
149
150 %Save alternative region count file for rDiff.nonparametric
151 OUT_FILENAME=[CFG.out_base 'Nonparametric_region_counts.mat'];
152 if INTERPR
153 save('-mat7-binary',OUT_FILENAME,'Counts_rDiff_nonparametric')
154 else
155 save(OUT_FILENAME,'Counts_rDiff_nonparametric','-v7.3')
156 end
157
158 return