view NEUMA-1.2.1/combine_gFVKM_iFVKM_max.pl @ 0:c44c43d185ef draft default tip

NEUMA-1.2.1 Uploaded
author chawhwa
date Thu, 08 Aug 2013 00:46:13 -0400
parents
children
line wrap: on
line source

#!/usr/bin/perl

#This script was adopted and modified from ``scripts/combine_gFVKM_iFVKM_5.pl`.


if(@ARGV<4) { print "usage: $0 gFVKM_file iFVKM_file gene2NM_file EUMA_CUT(eg.50, in bp) output.final.gFVKM output.final.iFVKM\n"; exit; }
my ($gFVKM_file, $iFVKM_file, $gene2NM_file, $CUT, $final_gFVKM_file, $final_iFVKM_file ) = @ARGV;


my %gFVKM=();
my %gEUMA=();
my %allgenes=();
&read_gFVKM($gFVKM_file,\%gFVKM,\%gEUMA,\%allgenes);


my %iFVKM=();
my %iEUMA=();
my %Alltranscirpts=();
&read_iFVKM($iFVKM_file,\%iFVKM,\%iEUMA,\%Allisoforms);


my %gene2NM=();
&read_gene2NM($gene2NM_file,\%gene2NM);


my %derived_gFVKM = ();
&compute_derived_gFVKM(\%derived_gFVKM,\%iFVKM,\%allgenes);

my %final_gFVKM=();
&compute_final_gFVKM(\%allgenes,\%gFVKM,\%derived_gFVKM,\%final_gFVKM);

my %final_iFVKM=();
my %iFVKM_factor=();
&compute_final_iFVKM(\%final_iFVKM,\%iFVKM,\%final_gFVKM,\%iFVKM_factor,\%Allisoforms)

&generate_final_gFVKM_file($final_gFVKM_file,\%allgenes,\%gFVKM,\%derived_gFVKM,\%gEUMA,\%iFVKM,\%gene2NM,\%final_gFVKM);
&generate_final_iFVKM_file($final_iFVKM_file,\%iFVKM,\%final_iFVKM,\%iFVKM_factor,\%iEUMA,\%gene2NM,\%Allisoforms);



#################
## subroutines ##
#################


sub read_gFVKM {
 my $gFVKM_file = shift @_;
 my $pGFVKM = shift @_;
 my $pgEUMA = shift @_;
 my $pAllgenes = shift @_;

 open IN, $gFVKM_file or die "Can't open gFVKM file $gFVKM_file\n";
 while(<IN>){
  chomp;
  my ($gene,$gFVKM_0,$gEUMA_0) = (split/\t/)[0,1,3];
  next if $gEUMA_0 < $CUT;
  $pGFVKM->{$gene} = $gFVKM_0;
  $pgEUMA->{$gene} = $gEUMA_0;
  $pAllgenes->{$gene}=1;
 }
 close IN;
}


sub read_iFVKM {
 my $iFVKM_file = shift @_;
 my $pIFVKM = shift @_;
 my $piEUMA = shift @_;
 my $pAllisoforms = shift @_;

 open IN, $iFVKM_file or die "Can't open iFVKM file $iFVKM_file\n";
 while(<IN>){
  chomp;
  my ($gid,$isoform,$iFVKM_0,$iEUMA_0) = (split/\t/)[0,1,2,4];
  next if $iEUMA_0 < $CUT;
  $pIFVKM->{$gid}{$isoform} = $iFVKM_0;
  $piEUMA->{$gid}{$isoform} = $iEUMA_0;
  $pAllisoforms->{$gid}{$isoform} = 1;
 }
 close IN;
}


sub read_gene2NM {
 my $gene2NM_file = shift @_;
 my $pGene2NM = shift @_;

 open IN, $gene2NM_file or die "Can't open gene2NM file $gene2NM_file\n";
 while(<IN>){
  chomp;
  my ($gid,$isoform) = split/\t/;
  $pGene2NM->{$gid}{$isoform}=1;
 }
 close IN;
}



sub compute_derived_gFVKM {
 my $pDerived_gFVKM = shift @_;
 my $pIFVKM = shift @_;
 my $pAllgenes = shift @_;

 #compute derived gFVKM
 for my $gene (keys %$pIFVKM){
   my $sumiFVKM = &sum(values(%{$pIFVKM->{$gene}}));
   $pDerived_gFVKM->{$gene} = $sumiFVKM;
   $pAllgenes->{$gene}=1;
 }
}


sub compute_final_iFVKM {
 my $pFinal_iFVKM = shift @_;
 my $pIFVKM = shift @_;
 my $pFinalGFVKM = shift @_;
 my $pIFVKM_factor = shift @_;
 my $pAllisoforms = shift @_;

 #rescale iFVKM based on final gFVKM
 for my $gene (keys %$pIFVKM){
  my $sumiFVKM = &sum(values %{$pIFVKM->{$gene}});
  if($sumiFVKM == 0) { $pIFVKM_factor->{$gene} = "inf"; }
  else { $pIFVKM_factor->{$gene} = $pFinalGFVKM->{$gene} / $sumiFVKM; } 
  for my $isoform (keys %{$pIFVKM->{$gene}}) {
     $pFinal_iFVKM->{$gene} = $pIFVKM->{$gene};
     $pAllisoforms->{$gene}{$isoform} =1;
  }
 }

}


# using max of gFVKM and sumiFVKM, instead of average.
sub compute_final_gFVKM {
 my $pAllgenes = shift @_;
 my $pGFVKM = shift @_;
 my $pDerived_gFVKM = shift @_;
 my $pFinal_gFVKM = shift @_;

 for my $gene (sort keys %$pAllgenes){
  my $orig_gFVKM = (exists $pGFVKM->{$gene})?$pGFVKM->{$gene}:'NA';
  my $derived_gFVKM_val = (exists $pDerived_gFVKM->{$gene})?$pDerived_gFVKM->{$gene}:'NA';
  my $final_gFVKM = 'NA';
  if($orig_gFVKM ne 'NA' && $derived_gFVKM_val ne 'NA'){
     #$final_gFVKM = ($orig_gFVKM+$derived_gFVKM_val)/2;   #average
     $final_gFVKM = ($orig_gFVKM>=$derived_gFVKM_val?$orig_gFVKM:$derived_gFVKM_val);   #maximum
  }
  elsif($orig_gFVKM eq 'NA') { $final_gFVKM = $derived_gFVKM_val; }
  elsif($derived_gFVKM_val eq 'NA') { $final_gFVKM = $orig_gFVKM; }

  $pFinal_gFVKM->{$gene} = $final_gFVKM;
 }
}




sub generate_final_gFVKM_file {
 my $final_gFVKM_file = shift @_;
 my $pAllgenes = shift @_;
 my $pGFVKM = shift @_;
 my $pDerived_gFVKM = shift @_;
 my $pgEUMA = shift @_;
 my $pIFVKM = shift @_;
 my $pGene2NM = shift @_;
 my $pFinal_gFVKM = shift @_;

 open FG_FVKM, ">$final_gFVKM_file" or die "Can't write to final gFVKM file $final_gFVKM_file\n";
 print FG_FVKM "gene\tfinal.gFVK\torig.gFVK\tderived_gFVK\tgEUMA\t#isoforms\t#measured_isoforms\n";
 for my $gene (sort keys %$pAllgenes){

  my $orig_gFVKM = (exists $pGFVKM->{$gene})?$pGFVKM->{$gene}:'NA';
  my $derived_gFVKM = (exists $pDerived_gFVKM->{$gene})?$pDerived_gFVKM->{$gene}:'NA';
  my $final_gFVKM = (exists $pFinal_gFVKM->{$gene})?$pFinal_gFVKM->{$gene}:'NA';

  my $gEUMA='';
  if(exists $pgEUMA->{$gene}){ $gEUMA = $pgEUMA->{$gene}; }
  else { $gEUMA = 'below_cut'; }

  my $nTr = scalar keys %{$pGene2NM->{$gene}};
  my $nTr_measured = scalar keys %{$pIFVKM->{$gene}};

  print FG_FVKM "$gene\t$final_gFVKM\t$orig_gFVKM\t$derived_gFVKM\t$gEUMA\t$nTr\t$nTr_measured\n";
 }
 close FG_FVKM;
}



sub generate_final_iFVKM_file {
 my $final_iFVKM_file = shift @_;
 my $pIFVKM = shift @_;
 my $pFinal_iFVKM = shift @_;
 my $pIFVKM_factor = shift @_;
 my $piEUMA = shift @_;
 my $pGene2NM = shift @_;
 my $pAllisoforms = shift @_;

 #print scalar keys %$pDerived_iFVKM;  ##DEBUGGING

 open FI_FVKM, ">$final_iFVKM_file" or die "Can't write to final iFVKM file $final_iFVKM_file\n";
 print FI_FVKM "gene\tisoform\tfinal.iFVK\torig.iFVK\tiFVKM.factor\tiEUMA\t#isoforms\t#measured_isoforms\n";
 for my $gene (sort keys %$pAllisoforms){

  my $nTr = scalar keys %{$pGene2NM->{$gene}};
  my $nTr_measured = scalar keys %{$pIFVKM->{$gene}};

  my $iFVKM_factor = (exists $pIFVKM_factor->{$gene})?$pIFVKM_factor->{$gene}:'NA';

  for my $isoform (sort keys %{$pAllisoforms->{$gene}}){

   my $iEUMA='';
   if(exists $piEUMA->{$gene}{$isoform}) { $iEUMA = $piEUMA->{$gene}{$isoform}; }
   else { $iEUMA = 'below_cut'; }
 
   my $final_iFVKM = (exists $pFinal_iFVKM->{$gene} && exists ${$pFinal_iFVKM->{$gene}}{$isoform})?$pFinal_iFVKM->{$gene}{$isoform}:$pIFVKM->{$gene}{$isoform};
   print FI_FVKM "$gene\t$isoform\t$final_iFVKM\t$pIFVKM->{$gene}{$isoform}\t$iFVKM_factor\t$iEUMA\t$nTr\t$nTr_measured\n";
  }
 }
 close FI_FVKM;
}

sub sum {
 my $s=0;
 while(defined(my $x=shift @_)){
  $s+=$x;
 }
 return $s;
}