view rDiff/src/tools/sanitize_genes.m @ 2:233c30f91d66

updated python based GFF parsing module which will handle GTF/GFF/GFF3 file types
author vipints <vipin@cbio.mskcc.org>
date Tue, 08 Oct 2013 07:15:44 -0400
parents 0f80a5141704
children
line wrap: on
line source

function [genes]=sanitize_genes(genes,CFG)
%SANITIZE_GENES  Drop too-short transcripts, flag invalid genes and rebuild
%the splicegraph of every gene.
%
%   genes = sanitize_genes(genes, CFG)
%
%   Inputs
%     genes  struct array (iterated along its 2nd dimension). The fields
%            read here are:
%              .transcripts  1xT list of transcripts
%              .exons        1xT cell, exons{j} is an Ex2 matrix of
%                            [start stop] rows for transcript j
%              .start/.stop  gene boundaries
%     CFG    struct; only CFG.sequenced_length is used.
%
%   Output
%     genes  the same array with
%              - transcripts whose summed exon length (stop-start per
%                exon) is below CFG.sequenced_length removed,
%              - .start/.stop adjusted to the remaining transcripts,
%              - .do_not_quant set to 1 for genes with an invalid
%                structure and 0 otherwise,
%              - .splicegraph{1} the unique exons (2xN) and
%                .splicegraph{2} the symmetric NxN exon adjacency matrix.
%
%   Genes are only flagged through .do_not_quant; they are never removed
%   from the array, so indices into genes stay valid for the caller.


% Pass 1: filter transcripts and mark genes with erroneous exon definitions
for i=1:size(genes,2)
  % Collect transcripts shorter than the read length and, at the same
  % time, the extent covered by the transcripts that are kept.
  short_tr_idx=[];
  kept_start=inf;
  kept_stop=0;
  for j=1:size(genes(i).transcripts,2)
    tr_exons=genes(i).exons{j};
    if sum(tr_exons(:,2)-tr_exons(:,1))<CFG.sequenced_length
      short_tr_idx=[short_tr_idx,j];
    else
      % Uses first-row start / last-row stop, i.e. assumes the exon rows
      % are in ascending order (verified by the ordering check below).
      kept_start=min(kept_start,tr_exons(1,1));
      kept_stop=max(kept_stop,tr_exons(end,2));
    end
  end

  if ~isempty(short_tr_idx)
    genes(i).exons(short_tr_idx)=[];
    genes(i).transcripts(short_tr_idx)=[];
    % Shrink the gene to the surviving transcripts. If none survived the
    % sentinels (inf/0) are meaningless, so the annotated boundaries are
    % left untouched; such a gene is flagged via do_not_quant below.
    if ~isempty(genes(i).transcripts)
      genes(i).start=kept_start;
      genes(i).stop=kept_stop;
    end
  end

  % Make sure the gene spans all of its remaining transcripts.
  if genes(i).start>kept_start
    genes(i).start=kept_start;
  end
  if genes(i).stop<kept_stop
    genes(i).stop=kept_stop;
  end

  % A gene is valid only if it has at least one transcript, every
  % transcript has exons, and no exon ends after the next one starts.
  is_valid=size(genes(i).transcripts,2)>0;
  for j=1:size(genes(i).transcripts,2)
    tr_exons=genes(i).exons{j};
    if isempty(tr_exons) || any(tr_exons(1:end-1,2)>tr_exons(2:end,1))
      is_valid=false;
      break;
    end
  end

  % The gene must also be longer than a single read.
  if genes(i).stop-genes(i).start<=CFG.sequenced_length
    is_valid=false;
  end

  if is_valid
    genes(i).do_not_quant=0;
  else
    genes(i).do_not_quant=1;
  end
end

% Pass 2: create the splicegraph of every gene (flagged ones included)
for i=1:size(genes,2)
  gene=genes(i);

  % Node set: every distinct [start stop] exon over all transcripts.
  ALL_EXO=[];
  for j=1:size(gene.exons,2)
    ALL_EXO=[ALL_EXO;gene.exons{j}];
  end
  ALL_EXO=unique(ALL_EXO,'rows');

  % Edge set: consecutive exons within a transcript are connected.
  GRAPH=zeros(size(ALL_EXO,1),size(ALL_EXO,1));
  for j=1:size(gene.exons,2)
    for k=1:(size(gene.exons{j},1)-1)
      [A,B]=intersect(ALL_EXO,gene.exons{j}(k,:),'rows');
      [A,C]=intersect(ALL_EXO,gene.exons{j}(k+1,:),'rows');
      GRAPH(B,C)=1;
    end
  end
  % Mirror the directed edges so the adjacency matrix is symmetric.
  GRAPH=GRAPH+GRAPH';

  genes(i).splicegraph{1}=ALL_EXO';
  genes(i).splicegraph{2}=GRAPH;
end