Mercurial > repos > vipints > deseq_hts
view deseq-hts_1.0/src/get_read_counts.m @ 9:e27b4f7811c2 draft
Updated DESeq version 1.12
author | vipints <vipin@cbio.mskcc.org> |
---|---|
date | Tue, 08 Oct 2013 08:09:28 -0400 |
parents | 94a108763d9e |
children |
line wrap: on
line source
function get_read_counts(anno_dir, outfile, varargin)
% GET_READ_COUNTS  Count reads per gene in a set of BAM files and write
% the count and condition tables.
%
% -- input --
% anno_dir: annotation file handed to load(); it has to define the
%           struct array 'genes'
% outfile:  prefix of the two output files, '<outfile>_COUNTS.tab' and
%           '<outfile>_CONDITIONS.tab'
% varargin: one string per condition, each a ':'-separated list of BAM
%           files (the replicates of that condition)
%
% -- output --
% <outfile>_COUNTS.tab:     one row per gene, one column per BAM file,
%                           holding the number of reads overlapping exons
% <outfile>_CONDITIONS.tab: one row per BAM file with its condition and
%                           replicate index
%
% NOTE: the function finishes with 'exit', i.e. it terminates the
% interpreter session it is running in.

% DESeq paths
global DESEQ_PATH DESEQ_SRC_PATH

% interpreter paths
global INTERPRETER MATLAB_BIN_PATH OCTAVE_BIN_PATH

% SAMTools path
global SAMTOOLS_DIR

%%%% paths
addpath(sprintf('%s/tools', DESEQ_PATH));
addpath(sprintf('%s/mex', DESEQ_PATH));
addpath(sprintf('%s', DESEQ_SRC_PATH));

deseq_config;

%%% read list of replicate groups from variable length argument list
rg_list = cell(1, size(varargin, 2));
file_list = {};
file_cond_ids = [];
file_rep_ids = [];
for idx = 1:size(varargin, 2)
  rg_list(idx) = varargin(idx);
end
% drop empty arguments before numbering the conditions
idx = strmatch('', rg_list, 'exact');
rg_list(idx) = [];
for idx = 1:length(rg_list)
  items = separate(rg_list{idx}, ':');
  for idx2 = 1:length(items)
    if isempty(deblank(items{idx2}))
      continue;
    end
    file_list{end + 1} = items{idx2};
    file_cond_ids(end + 1) = idx;   % condition = position of the argument
    file_rep_ids(end + 1) = idx2;   % replicate = position inside the argument
  end
end
clear idx idx2;

%%%%% adapt to number of input arguments
file_num = length(file_list);
RESULTS = cell(1, file_num);

%%%% get annotation file (brings the variable 'genes' into the workspace)
load(sprintf('%s', anno_dir));

%%%%% mask overlapping gene regions -> later not counted
[genes] = mask_dubl(genes, 0);

%%%% remove genes with no annotated exons or where no start or stop
%%%% position is annotated
idx = find(arrayfun(@(x)(~isempty(x.exons)*~isempty(x.start)*~isempty(x.stop)), genes));
fprintf('removed %i of %i genes, which had either no exons annotated or lacked a start or stop position\n', size(genes, 2) - size(idx, 2), size(genes, 2))
genes = genes(idx);
clear idx;

%%%% check if genes have field chr_num
if ~isfield(genes, 'chr_num')
  chrms = unique({genes(:).chr});
  for i = 1:length(genes)
    genes(i).chr_num = strmatch(genes(i).chr, chrms, 'exact');
  end
end

%%%% iterate over all given bam files
for f_idx = 1:file_num
  expr1_bam = fullfile('', file_list{f_idx});
  STAT = cell(size(genes, 2), 1);
  for i = 1:size(genes, 2)
    % RESULT layout: {1} gene name, {2} reads in gene, {3} reads on exons,
    %                {4} file index, {5} [gene length, exonic length]
    RESULT = cell(1, 7);
    gene = genes(i);
    RESULT{4} = f_idx;
    RESULT{1} = gene.name;
    if isempty(gene.exons)
      RESULT{2} = inf;
      RESULT{3} = inf;
      RESULT{5} = [inf, inf];
      STAT{i} = RESULT;
      continue;
    elseif or(isempty(gene.start), isempty(gene.stop))
      RESULT{2} = inf;
      RESULT{3} = inf;
      RESULT{5} = [inf, inf];
      STAT{i} = RESULT;
      continue;
    end

    if ~isempty(gene.chr_num)
      [mask1, read_intron_list] = get_reads(expr1_bam, gene.chr, gene.start, gene.stop, '0');
      clear read_intron_list;
    else
      mask1 = [];
    end

    % reads1: one row per read, one column per position of the gene
    if isempty(mask1)
      reads1 = zeros(0, gene.stop - gene.start + 1);
    else
      reads1 = sparse(mask1(1,:)', mask1(2,:)', ones(size(mask1, 2), 1), max(mask1(1,:)), gene.stop - gene.start + 1);
    end
    if ~isempty(reads1)
      [reads1, FLAG] = remove_reads_from_other_genes(reads1, gene);
    end

    % Number of columns only: the original stored the full size vector here,
    % which made RESULT{5} three elements long and shifted the exonic length
    % out of the slot it is read from further down.
    L = size(reads1, 2);
    RESULT{2} = [size(reads1, 1)]; % number of all reads falling in that gene

    % mark every position covered by an exon of any transcript
    EXON_IDX = zeros(1, gene.stop - gene.start + 1);
    for t = 1:size(gene.transcripts, 2)
      for e = 1:size(gene.exons{t}, 1)
        EXON_IDX((gene.exons{t}(e,1) - gene.start + 1):(gene.exons{t}(e,2) - gene.start + 1)) = 1;
      end
    end

    % keep only reads that cover at least one exonic position
    reads1 = reads1(sum(reads1(:, find(EXON_IDX)), 2) > 0, :);
    L1 = sum(EXON_IDX);
    RESULT{3} = [size(reads1, 1)]; % number of reads overlapping to exons
    RESULT{5} = [L, L1];           % number of gene positions, number of exonic positions
    clear reads1;
    STAT{i} = RESULT;
  end
  RESULTS{f_idx} = STAT;
end

%%%% collect the per-file results into gene x file matrices
S = size(genes, 2);
READCOUNTS_ALL = zeros(S, file_num);
READCOUNTS_EXON = zeros(S, file_num);
LENGTH_ALL = zeros(S, file_num);
LEN_EXON = zeros(S, file_num);
for j = 1:file_num
  for i = 1:S
    T = RESULTS{j}{i};
    if isempty(T)
      continue
    else
      READCOUNTS_ALL(i,j) = T{2};
      READCOUNTS_EXON(i,j) = T{3};
      LENGTH_ALL(i,j) = T{5}(1);
      LEN_EXON(i,j) = T{5}(2);
    end
  end
end

%%%%% write results for all bam files
conditions_file = sprintf('%s_CONDITIONS.tab', outfile);
counts_file = sprintf('%s_COUNTS.tab', outfile);

% fopen returns -1 on failure; stop with a clear message instead of
% failing later inside fprintf with an invalid file identifier
fid_conditions = fopen(conditions_file, 'w');
if fid_conditions < 0
  error('get_read_counts: could not open %s for writing', conditions_file);
end
fid_counts = fopen(counts_file, 'w');
if fid_counts < 0
  fclose(fid_conditions);
  error('get_read_counts: could not open %s for writing', counts_file);
end

fprintf(fid_counts, 'gene');
fprintf(fid_conditions, 'file\tcondition\treplicate\n');
for j = 1:length(file_list)
  % column label: file name without directory and without '.bam'
  fname = file_list{j};
  fname = separate(fname, '/');
  fname = fname{end};
  fname = strrep(fname, '.bam', '');
  fprintf(fid_counts, '\t%s', fname);
  fprintf(fid_conditions, '%s\t%i\t%i\n', fname, file_cond_ids(j), file_rep_ids(j));
end
fprintf(fid_counts, '\n');

for i = 1:size(genes, 2)
  fprintf(fid_counts, '%s', genes(i).name);
  for j = 1:length(file_list)
    fprintf(fid_counts, '\t%i', READCOUNTS_EXON(i,j));
  end
  fprintf(fid_counts, '\n');
end
fclose(fid_counts);
fclose(fid_conditions);

% terminate the interpreter session
exit;