diff NEUMA-1.2.1/combine_gFVKM_iFVKM_max.pl @ 0:c44c43d185ef draft default tip

NEUMA-1.2.1 Uploaded
author chawhwa
date Thu, 08 Aug 2013 00:46:13 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/NEUMA-1.2.1/combine_gFVKM_iFVKM_max.pl	Thu Aug 08 00:46:13 2013 -0400
@@ -0,0 +1,233 @@
+#!/usr/bin/perl
+
+#This script was adopted and modified from ``scripts/combine_gFVKM_iFVKM_5.pl`.
+
+
+if(@ARGV<4) { print "usage: $0 gFVKM_file iFVKM_file gene2NM_file EUMA_CUT(eg.50, in bp) output.final.gFVKM output.final.iFVKM\n"; exit; }
+my ($gFVKM_file, $iFVKM_file, $gene2NM_file, $CUT, $final_gFVKM_file, $final_iFVKM_file ) = @ARGV;
+
+
+my %gFVKM=();
+my %gEUMA=();
+my %allgenes=();
+&read_gFVKM($gFVKM_file,\%gFVKM,\%gEUMA,\%allgenes);
+
+
+my %iFVKM=();
+my %iEUMA=();
+my %Alltranscirpts=();
+&read_iFVKM($iFVKM_file,\%iFVKM,\%iEUMA,\%Allisoforms);
+
+
+my %gene2NM=();
+&read_gene2NM($gene2NM_file,\%gene2NM);
+
+
+my %derived_gFVKM = ();
+&compute_derived_gFVKM(\%derived_gFVKM,\%iFVKM,\%allgenes);
+
+my %final_gFVKM=();
+&compute_final_gFVKM(\%allgenes,\%gFVKM,\%derived_gFVKM,\%final_gFVKM);
+
+my %final_iFVKM=();
+my %iFVKM_factor=();
+&compute_final_iFVKM(\%final_iFVKM,\%iFVKM,\%final_gFVKM,\%iFVKM_factor,\%Allisoforms)
+
+&generate_final_gFVKM_file($final_gFVKM_file,\%allgenes,\%gFVKM,\%derived_gFVKM,\%gEUMA,\%iFVKM,\%gene2NM,\%final_gFVKM);
+&generate_final_iFVKM_file($final_iFVKM_file,\%iFVKM,\%final_iFVKM,\%iFVKM_factor,\%iEUMA,\%gene2NM,\%Allisoforms);
+
+
+
+#################
+## subroutines ##
+#################
+
+
+sub read_gFVKM {
+ my $gFVKM_file = shift @_;
+ my $pGFVKM = shift @_;
+ my $pgEUMA = shift @_;
+ my $pAllgenes = shift @_;
+
+ open IN, $gFVKM_file or die "Can't open gFVKM file $gFVKM_file\n";
+ while(<IN>){
+  chomp;
+  my ($gene,$gFVKM_0,$gEUMA_0) = (split/\t/)[0,1,3];
+  next if $gEUMA_0 < $CUT;
+  $pGFVKM->{$gene} = $gFVKM_0;
+  $pgEUMA->{$gene} = $gEUMA_0;
+  $pAllgenes->{$gene}=1;
+ }
+ close IN;
+}
+
+
+sub read_iFVKM {
+ my $iFVKM_file = shift @_;
+ my $pIFVKM = shift @_;
+ my $piEUMA = shift @_;
+ my $pAllisoforms = shift @_;
+
+ open IN, $iFVKM_file or die "Can't open iFVKM file $iFVKM_file\n";
+ while(<IN>){
+  chomp;
+  my ($gid,$isoform,$iFVKM_0,$iEUMA_0) = (split/\t/)[0,1,2,4];
+  next if $iEUMA_0 < $CUT;
+  $pIFVKM->{$gid}{$isoform} = $iFVKM_0;
+  $piEUMA->{$gid}{$isoform} = $iEUMA_0;
+  $pAllisoforms->{$gid}{$isoform} = 1;
+ }
+ close IN;
+}
+
+
+sub read_gene2NM {
+ my $gene2NM_file = shift @_;
+ my $pGene2NM = shift @_;
+
+ open IN, $gene2NM_file or die "Can't open gene2NM file $gene2NM_file\n";
+ while(<IN>){
+  chomp;
+  my ($gid,$isoform) = split/\t/;
+  $pGene2NM->{$gid}{$isoform}=1;
+ }
+ close IN;
+}
+
+
+
+sub compute_derived_gFVKM {
+ my $pDerived_gFVKM = shift @_;
+ my $pIFVKM = shift @_;
+ my $pAllgenes = shift @_;
+
+ #compute derived gFVKM
+ for my $gene (keys %$pIFVKM){
+   my $sumiFVKM = &sum(values(%{$pIFVKM->{$gene}}));
+   $pDerived_gFVKM->{$gene} = $sumiFVKM;
+   $pAllgenes->{$gene}=1;
+ }
+}
+
+
+sub compute_final_iFVKM {
+ my $pFinal_iFVKM = shift @_;
+ my $pIFVKM = shift @_;
+ my $pFinalGFVKM = shift @_;
+ my $pIFVKM_factor = shift @_;
+ my $pAllisoforms = shift @_;
+
+ #rescale iFVKM based on final gFVKM
+ for my $gene (keys %$pIFVKM){
+  my $sumiFVKM = &sum(values %{$pIFVKM->{$gene}});
+  if($sumiFVKM == 0) { $pIFVKM_factor->{$gene} = "inf"; }
+  else { $pIFVKM_factor->{$gene} = $pFinalGFVKM->{$gene} / $sumiFVKM; } 
+  for my $isoform (keys %{$pIFVKM->{$gene}}) {
+     $pFinal_iFVKM->{$gene} = $pIFVKM->{$gene};
+     $pAllisoforms->{$gene}{$isoform} =1;
+  }
+ }
+
+}
+
+
+# using max of gFVKM and sumiFVKM, instead of average.
+sub compute_final_gFVKM {
+ my $pAllgenes = shift @_;
+ my $pGFVKM = shift @_;
+ my $pDerived_gFVKM = shift @_;
+ my $pFinal_gFVKM = shift @_;
+
+ for my $gene (sort keys %$pAllgenes){
+  my $orig_gFVKM = (exists $pGFVKM->{$gene})?$pGFVKM->{$gene}:'NA';
+  my $derived_gFVKM_val = (exists $pDerived_gFVKM->{$gene})?$pDerived_gFVKM->{$gene}:'NA';
+  my $final_gFVKM = 'NA';
+  if($orig_gFVKM ne 'NA' && $derived_gFVKM_val ne 'NA'){
+     #$final_gFVKM = ($orig_gFVKM+$derived_gFVKM_val)/2;   #average
+     $final_gFVKM = ($orig_gFVKM>=$derived_gFVKM_val?$orig_gFVKM:$derived_gFVKM_val);   #maximum
+  }
+  elsif($orig_gFVKM eq 'NA') { $final_gFVKM = $derived_gFVKM_val; }
+  elsif($derived_gFVKM_val eq 'NA') { $final_gFVKM = $orig_gFVKM; }
+
+  $pFinal_gFVKM->{$gene} = $final_gFVKM;
+ }
+}
+
+
+
+
+sub generate_final_gFVKM_file {
+ my $final_gFVKM_file = shift @_;
+ my $pAllgenes = shift @_;
+ my $pGFVKM = shift @_;
+ my $pDerived_gFVKM = shift @_;
+ my $pgEUMA = shift @_;
+ my $pIFVKM = shift @_;
+ my $pGene2NM = shift @_;
+ my $pFinal_gFVKM = shift @_;
+
+ open FG_FVKM, ">$final_gFVKM_file" or die "Can't write to final gFVKM file $final_gFVKM_file\n";
+ print FG_FVKM "gene\tfinal.gFVK\torig.gFVK\tderived_gFVK\tgEUMA\t#isoforms\t#measured_isoforms\n";
+ for my $gene (sort keys %$pAllgenes){
+
+  my $orig_gFVKM = (exists $pGFVKM->{$gene})?$pGFVKM->{$gene}:'NA';
+  my $derived_gFVKM = (exists $pDerived_gFVKM->{$gene})?$pDerived_gFVKM->{$gene}:'NA';
+  my $final_gFVKM = (exists $pFinal_gFVKM->{$gene})?$pFinal_gFVKM->{$gene}:'NA';
+
+  my $gEUMA='';
+  if(exists $pgEUMA->{$gene}){ $gEUMA = $pgEUMA->{$gene}; }
+  else { $gEUMA = 'below_cut'; }
+
+  my $nTr = scalar keys %{$pGene2NM->{$gene}};
+  my $nTr_measured = scalar keys %{$pIFVKM->{$gene}};
+
+  print FG_FVKM "$gene\t$final_gFVKM\t$orig_gFVKM\t$derived_gFVKM\t$gEUMA\t$nTr\t$nTr_measured\n";
+ }
+ close FG_FVKM;
+}
+
+
+
+sub generate_final_iFVKM_file {
+ my $final_iFVKM_file = shift @_;
+ my $pIFVKM = shift @_;
+ my $pFinal_iFVKM = shift @_;
+ my $pIFVKM_factor = shift @_;
+ my $piEUMA = shift @_;
+ my $pGene2NM = shift @_;
+ my $pAllisoforms = shift @_;
+
+ #print scalar keys %$pDerived_iFVKM;  ##DEBUGGING
+
+ open FI_FVKM, ">$final_iFVKM_file" or die "Can't write to final iFVKM file $final_iFVKM_file\n";
+ print FI_FVKM "gene\tisoform\tfinal.iFVK\torig.iFVK\tiFVKM.factor\tiEUMA\t#isoforms\t#measured_isoforms\n";
+ for my $gene (sort keys %$pAllisoforms){
+
+  my $nTr = scalar keys %{$pGene2NM->{$gene}};
+  my $nTr_measured = scalar keys %{$pIFVKM->{$gene}};
+
+  my $iFVKM_factor = (exists $pIFVKM_factor->{$gene})?$pIFVKM_factor->{$gene}:'NA';
+
+  for my $isoform (sort keys %{$pAllisoforms->{$gene}}){
+
+   my $iEUMA='';
+   if(exists $piEUMA->{$gene}{$isoform}) { $iEUMA = $piEUMA->{$gene}{$isoform}; }
+   else { $iEUMA = 'below_cut'; }
+ 
+   my $final_iFVKM = (exists $pFinal_iFVKM->{$gene} && exists ${$pFinal_iFVKM->{$gene}}{$isoform})?$pFinal_iFVKM->{$gene}{$isoform}:$pIFVKM->{$gene}{$isoform};
+   print FI_FVKM "$gene\t$isoform\t$final_iFVKM\t$pIFVKM->{$gene}{$isoform}\t$iFVKM_factor\t$iEUMA\t$nTr\t$nTr_measured\n";
+  }
+ }
+ close FI_FVKM;
+}
+
+sub sum {
+ my $s=0;
+ while(defined(my $x=shift @_)){
+  $s+=$x;
+ }
+ return $s;
+}
+
+
+