0
+ − 1 function get_read_counts(anno_dir, outfile, varargin)
+ − 2 %
+ − 3 % -- input --
+ − 4 % anno_dir: directory of genes
+ − 5 % outfile: output file
+ − 6 % varargin: list of BAM files (at least two)
+ − 7
+ − 8 % DESeq paths
+ − 9 global DESEQ_PATH DESEQ_SRC_PATH
+ − 10
+ − 11 % interpreter paths
+ − 12 global INTERPRETER MATLAB_BIN_PATH OCTAVE_BIN_PATH
+ − 13
+ − 14 % SAMTools path
+ − 15 global SAMTOOLS_DIR
+ − 16
+ − 17 %%%% paths
+ − 18 addpath(sprintf('%s/tools', DESEQ_PATH));
+ − 19 addpath(sprintf('%s/mex', DESEQ_PATH));
+ − 20 addpath(sprintf('%s', DESEQ_SRC_PATH));
+ − 21
+ − 22 deseq_config;
+ − 23
+ − 24 %%% read list of replicate groups from variable length argument list
+ − 25 rg_list = cell(1,size(varargin, 2));
+ − 26 file_list = cell();
+ − 27 file_cond_ids = [];
+ − 28 file_rep_ids = [];
+ − 29 for idx = 1:size(varargin, 2)
+ − 30 rg_list(idx) = varargin(idx);
+ − 31 end
+ − 32 idx = strmatch('', rg_list, 'exact');
+ − 33 rg_list(idx) = [];
+ − 34 for idx = 1:length(rg_list),
+ − 35 items = separate(rg_list{idx}, ':');
+ − 36 for idx2 = 1:length(items)
+ − 37 if isempty(deblank(items{idx2})),
+ − 38 continue;
+ − 39 end;
+ − 40 file_list{end + 1} = items{idx2};
+ − 41 file_cond_ids(end + 1) = idx;
+ − 42 file_rep_ids(end + 1) = idx2;
+ − 43 end;
+ − 44 end;
+ − 45 clear idx idx2;
+ − 46
+ − 47 %%%%% adapt to number of input arguments
+ − 48 file_num = length(file_list);
+ − 49 RESULTS = cell(1, file_num);
+ − 50
+ − 51 %%%% get annotation file
+ − 52 load(sprintf('%s', anno_dir));
+ − 53
+ − 54 %%%%% mask overlapping gene regions -> later not counted
+ − 55 [genes] = mask_dubl(genes,0);
+ − 56
+ − 57 %%%% remove genes with no annotated exons or where no
+ − 58 idx = find(arrayfun(@(x)(~isempty(x.exons)*~isempty(x.start)*~isempty(x.stop)), genes));
+ − 59 fprintf('removed %i of %i genes, which had either no exons annotated or lacked a start or stop position\n', size(genes, 2) - size(idx, 2), size(genes, 2))
+ − 60 genes = genes(idx);
+ − 61 clear idx;
+ − 62
+ − 63 %%%% check if genes have field chr_num
+ − 64 if ~isfield(genes, 'chr_num')
+ − 65 chrms = unique({genes(:).chr});
+ − 66 for i = 1:length(genes)
+ − 67 genes(i).chr_num = strmatch(genes(i).chr, chrms, 'exact');
+ − 68 end;
+ − 69 end;
+ − 70
+ − 71 %%%% iterate over all given bam files
+ − 72 for f_idx = 1:file_num
+ − 73 expr1_bam = fullfile('', file_list{f_idx});
+ − 74 STAT = cell(size(genes, 2),1);
+ − 75 for i=1:size(genes,2)
+ − 76 RESULT = cell(1,7);
+ − 77 gene = genes(i);
+ − 78 RESULT{4} = f_idx;
+ − 79 RESULT{1} = gene.name;
+ − 80 if isempty(gene.exons)
+ − 81 RESULT{2} = inf;
+ − 82 RESULT{3} = inf;
+ − 83 RESULT{5} = [inf,inf];
+ − 84 STAT{i} = RESULT;
+ − 85 continue;
+ − 86 elseif or(isempty(gene.start),isempty(gene.stop))
+ − 87 RESULT{2} = inf;
+ − 88 RESULT{3} = inf;
+ − 89 RESULT{5} = [inf,inf];
+ − 90 STAT{i} = RESULT;
+ − 91 continue;
+ − 92 end
+ − 93 if ~isempty(gene.chr_num),
+ − 94 [mask1, read_intron_list] = get_reads(expr1_bam, gene.chr, gene.start, gene.stop, '0');
+ − 95 clear read_intron_list;
+ − 96 else
+ − 97 mask1 = [];
+ − 98 end;
+ − 99
+ − 100 if isempty(mask1)
+ − 101 reads1 = zeros(0,gene.stop-gene.start+1);
+ − 102 else
+ − 103 reads1 = sparse(mask1(1,:)',mask1(2,:)',ones(size(mask1,2),1),max(mask1(1,:)),gene.stop-gene.start+1);
+ − 104 end
+ − 105 if ~isempty(reads1);
+ − 106 [reads1,FLAG] = remove_reads_from_other_genes(reads1,gene);
+ − 107 end
+ − 108 L = size(reads1);
+ − 109 RESULT{2}=[size(reads1,1)]; % number of all reads falling in that gene
+ − 110 EXON_IDX=zeros(1,gene.stop-gene.start+1);
+ − 111 for t=1:size(gene.transcripts,2)
+ − 112 for e=1:size(gene.exons{t},1)
+ − 113 EXON_IDX((gene.exons{t}(e,1)-gene.start+1):(gene.exons{t}(e,2)-gene.start+1))=1;
+ − 114 end
+ − 115 end
+ − 116 reads1 = reads1(sum(reads1(:,find(EXON_IDX)),2)>0,:);
+ − 117 L1 = sum(EXON_IDX);
+ − 118 RESULT{3}=[size(reads1,1)]; % number of reads overlapping to exons
+ − 119 RESULT{5}=[L, L1]; % size of reads1, number of exonic positions
+ − 120 % old and weighted poisson new ,weighted regions reads and
+ − 121 % unexplained reads
+ − 122 clear reads1;
+ − 123 STAT{i} = RESULT;
+ − 124 end;
+ − 125 RESULTS{f_idx} = STAT;
+ − 126 end;
+ − 127
+ − 128 S=size(genes,2);
+ − 129 READCOUNTS_ALL=zeros(S, file_num);
+ − 130 READCOUNTS_EXON=zeros(S, file_num);
+ − 131 LENGTH_ALL=zeros(S,file_num);
+ − 132 LEN_EXON=zeros(S, file_num);
+ − 133
+ − 134 for j=1:file_num,
+ − 135 for i=1:S
+ − 136 T=RESULTS{j}{i};
+ − 137 if isempty(T)
+ − 138 continue
+ − 139 else
+ − 140 READCOUNTS_ALL(i,j)=T{2};
+ − 141 READCOUNTS_EXON(i,j)=T{3};
+ − 142 LENGTH_ALL(i,j)=T{5}(1);
+ − 143 LEN_EXON(i,j)=T{5}(2);
+ − 144 end
+ − 145 end
+ − 146 end
+ − 147
+ − 148 %%%%% write results for all bam files
+ − 149 fid_conditions = fopen(sprintf('%s_CONDITIONS.tab', outfile), 'w');
+ − 150 fid_counts = fopen(sprintf('%s_COUNTS.tab', outfile) ,'w');
+ − 151 fprintf(fid_counts,'gene');
+ − 152 fprintf(fid_conditions, 'file\tcondition\treplicate\n');
+ − 153 for j = 1:length(file_list)
+ − 154 fname = file_list{j} ;
+ − 155 fname = separate(fname, '/');
+ − 156 fname = fname{end};
+ − 157 fname = strrep(fname, '.bam', '') ;
+ − 158 fprintf(fid_counts,'\t%s', fname);
+ − 159 fprintf(fid_conditions, '%s\t%i\t%i\n', fname, file_cond_ids(j), file_rep_ids(j));
+ − 160 end;
+ − 161 fprintf(fid_counts,'\n') ;
+ − 162
+ − 163 for i = 1:size(genes,2)
+ − 164 fprintf(fid_counts,'%s',genes(i).name);
+ − 165 for j = 1:length(file_list),
+ − 166 fprintf(fid_counts,'\t%i', READCOUNTS_EXON(i,j));
+ − 167 end
+ − 168 fprintf(fid_counts,'\n');
+ − 169 end
+ − 170 fclose(fid_counts);
+ − 171 fclose(fid_conditions);
+ − 172 exit;