Mercurial > repos > chawhwa > neuma
diff NEUMA-1.2.1/combine_gFVKM_iFVKM_max.pl @ 0:c44c43d185ef draft default tip
NEUMA-1.2.1 Uploaded
author | chawhwa |
---|---|
date | Thu, 08 Aug 2013 00:46:13 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/NEUMA-1.2.1/combine_gFVKM_iFVKM_max.pl Thu Aug 08 00:46:13 2013 -0400 @@ -0,0 +1,233 @@ +#!/usr/bin/perl + +#This script was adopted and modified from ``scripts/combine_gFVKM_iFVKM_5.pl`. + + +if(@ARGV<4) { print "usage: $0 gFVKM_file iFVKM_file gene2NM_file EUMA_CUT(eg.50, in bp) output.final.gFVKM output.final.iFVKM\n"; exit; } +my ($gFVKM_file, $iFVKM_file, $gene2NM_file, $CUT, $final_gFVKM_file, $final_iFVKM_file ) = @ARGV; + + +my %gFVKM=(); +my %gEUMA=(); +my %allgenes=(); +&read_gFVKM($gFVKM_file,\%gFVKM,\%gEUMA,\%allgenes); + + +my %iFVKM=(); +my %iEUMA=(); +my %Alltranscirpts=(); +&read_iFVKM($iFVKM_file,\%iFVKM,\%iEUMA,\%Allisoforms); + + +my %gene2NM=(); +&read_gene2NM($gene2NM_file,\%gene2NM); + + +my %derived_gFVKM = (); +&compute_derived_gFVKM(\%derived_gFVKM,\%iFVKM,\%allgenes); + +my %final_gFVKM=(); +&compute_final_gFVKM(\%allgenes,\%gFVKM,\%derived_gFVKM,\%final_gFVKM); + +my %final_iFVKM=(); +my %iFVKM_factor=(); +&compute_final_iFVKM(\%final_iFVKM,\%iFVKM,\%final_gFVKM,\%iFVKM_factor,\%Allisoforms) + +&generate_final_gFVKM_file($final_gFVKM_file,\%allgenes,\%gFVKM,\%derived_gFVKM,\%gEUMA,\%iFVKM,\%gene2NM,\%final_gFVKM); +&generate_final_iFVKM_file($final_iFVKM_file,\%iFVKM,\%final_iFVKM,\%iFVKM_factor,\%iEUMA,\%gene2NM,\%Allisoforms); + + + +################# +## subroutines ## +################# + + +sub read_gFVKM { + my $gFVKM_file = shift @_; + my $pGFVKM = shift @_; + my $pgEUMA = shift @_; + my $pAllgenes = shift @_; + + open IN, $gFVKM_file or die "Can't open gFVKM file $gFVKM_file\n"; + while(<IN>){ + chomp; + my ($gene,$gFVKM_0,$gEUMA_0) = (split/\t/)[0,1,3]; + next if $gEUMA_0 < $CUT; + $pGFVKM->{$gene} = $gFVKM_0; + $pgEUMA->{$gene} = $gEUMA_0; + $pAllgenes->{$gene}=1; + } + close IN; +} + + +sub read_iFVKM { + my $iFVKM_file = shift @_; + my $pIFVKM = shift @_; + my $piEUMA = shift @_; + my $pAllisoforms = shift @_; + + open IN, $iFVKM_file or die "Can't open iFVKM file $iFVKM_file\n"; + while(<IN>){ + chomp; + my ($gid,$isoform,$iFVKM_0,$iEUMA_0) = (split/\t/)[0,1,2,4]; + next if $iEUMA_0 < $CUT; + $pIFVKM->{$gid}{$isoform} = $iFVKM_0; + $piEUMA->{$gid}{$isoform} = $iEUMA_0; + $pAllisoforms->{$gid}{$isoform} = 1; + } + close IN; +} + + +sub read_gene2NM { + my $gene2NM_file = shift @_; + my $pGene2NM = shift @_; + + open IN, $gene2NM_file or die "Can't open gene2NM file $gene2NM_file\n"; + while(<IN>){ + chomp; + my ($gid,$isoform) = split/\t/; + $pGene2NM->{$gid}{$isoform}=1; + } + close IN; +} + + + +sub compute_derived_gFVKM { + my $pDerived_gFVKM = shift @_; + my $pIFVKM = shift @_; + my $pAllgenes = shift @_; + + #compute derived gFVKM + for my $gene (keys %$pIFVKM){ + my $sumiFVKM = &sum(values(%{$pIFVKM->{$gene}})); + $pDerived_gFVKM->{$gene} = $sumiFVKM; + $pAllgenes->{$gene}=1; + } +} + + +sub compute_final_iFVKM { + my $pFinal_iFVKM = shift @_; + my $pIFVKM = shift @_; + my $pFinalGFVKM = shift @_; + my $pIFVKM_factor = shift @_; + my $pAllisoforms = shift @_; + + #rescale iFVKM based on final gFVKM + for my $gene (keys %$pIFVKM){ + my $sumiFVKM = &sum(values %{$pIFVKM->{$gene}}); + if($sumiFVKM == 0) { $pIFVKM_factor->{$gene} = "inf"; } + else { $pIFVKM_factor->{$gene} = $pFinalGFVKM->{$gene} / $sumiFVKM; } + for my $isoform (keys %{$pIFVKM->{$gene}}) { + $pFinal_iFVKM->{$gene} = $pIFVKM->{$gene}; + $pAllisoforms->{$gene}{$isoform} =1; + } + } + +} + + +# using max of gFVKM and sumiFVKM, instead of average. +sub compute_final_gFVKM { + my $pAllgenes = shift @_; + my $pGFVKM = shift @_; + my $pDerived_gFVKM = shift @_; + my $pFinal_gFVKM = shift @_; + + for my $gene (sort keys %$pAllgenes){ + my $orig_gFVKM = (exists $pGFVKM->{$gene})?$pGFVKM->{$gene}:'NA'; + my $derived_gFVKM_val = (exists $pDerived_gFVKM->{$gene})?$pDerived_gFVKM->{$gene}:'NA'; + my $final_gFVKM = 'NA'; + if($orig_gFVKM ne 'NA' && $derived_gFVKM_val ne 'NA'){ + #$final_gFVKM = ($orig_gFVKM+$derived_gFVKM_val)/2; #average + $final_gFVKM = ($orig_gFVKM>=$derived_gFVKM_val?$orig_gFVKM:$derived_gFVKM_val); #maximum + } + elsif($orig_gFVKM eq 'NA') { $final_gFVKM = $derived_gFVKM_val; } + elsif($derived_gFVKM_val eq 'NA') { $final_gFVKM = $orig_gFVKM; } + + $pFinal_gFVKM->{$gene} = $final_gFVKM; + } +} + + + + +sub generate_final_gFVKM_file { + my $final_gFVKM_file = shift @_; + my $pAllgenes = shift @_; + my $pGFVKM = shift @_; + my $pDerived_gFVKM = shift @_; + my $pgEUMA = shift @_; + my $pIFVKM = shift @_; + my $pGene2NM = shift @_; + my $pFinal_gFVKM = shift @_; + + open FG_FVKM, ">$final_gFVKM_file" or die "Can't write to final gFVKM file $final_gFVKM_file\n"; + print FG_FVKM "gene\tfinal.gFVK\torig.gFVK\tderived_gFVK\tgEUMA\t#isoforms\t#measured_isoforms\n"; + for my $gene (sort keys %$pAllgenes){ + + my $orig_gFVKM = (exists $pGFVKM->{$gene})?$pGFVKM->{$gene}:'NA'; + my $derived_gFVKM = (exists $pDerived_gFVKM->{$gene})?$pDerived_gFVKM->{$gene}:'NA'; + my $final_gFVKM = (exists $pFinal_gFVKM->{$gene})?$pFinal_gFVKM->{$gene}:'NA'; + + my $gEUMA=''; + if(exists $pgEUMA->{$gene}){ $gEUMA = $pgEUMA->{$gene}; } + else { $gEUMA = 'below_cut'; } + + my $nTr = scalar keys %{$pGene2NM->{$gene}}; + my $nTr_measured = scalar keys %{$pIFVKM->{$gene}}; + + print FG_FVKM "$gene\t$final_gFVKM\t$orig_gFVKM\t$derived_gFVKM\t$gEUMA\t$nTr\t$nTr_measured\n"; + } + close FG_FVKM; +} + + + +sub generate_final_iFVKM_file { + my $final_iFVKM_file = shift @_; + my $pIFVKM = shift @_; + my $pFinal_iFVKM = shift @_; + my $pIFVKM_factor = shift @_; + my $piEUMA = shift @_; + my $pGene2NM = shift @_; + my $pAllisoforms = shift @_; + + #print scalar keys %$pDerived_iFVKM; ##DEBUGGING + + open FI_FVKM, ">$final_iFVKM_file" or die "Can't write to final iFVKM file $final_iFVKM_file\n"; + print FI_FVKM "gene\tisoform\tfinal.iFVK\torig.iFVK\tiFVKM.factor\tiEUMA\t#isoforms\t#measured_isoforms\n"; + for my $gene (sort keys %$pAllisoforms){ + + my $nTr = scalar keys %{$pGene2NM->{$gene}}; + my $nTr_measured = scalar keys %{$pIFVKM->{$gene}}; + + my $iFVKM_factor = (exists $pIFVKM_factor->{$gene})?$pIFVKM_factor->{$gene}:'NA'; + + for my $isoform (sort keys %{$pAllisoforms->{$gene}}){ + + my $iEUMA=''; + if(exists $piEUMA->{$gene}{$isoform}) { $iEUMA = $piEUMA->{$gene}{$isoform}; } + else { $iEUMA = 'below_cut'; } + + my $final_iFVKM = (exists $pFinal_iFVKM->{$gene} && exists ${$pFinal_iFVKM->{$gene}}{$isoform})?$pFinal_iFVKM->{$gene}{$isoform}:$pIFVKM->{$gene}{$isoform}; + print FI_FVKM "$gene\t$isoform\t$final_iFVKM\t$pIFVKM->{$gene}{$isoform}\t$iFVKM_factor\t$iEUMA\t$nTr\t$nTr_measured\n"; + } + } + close FI_FVKM; +} + +sub sum { + my $s=0; + while(defined(my $x=shift @_)){ + $s+=$x; + } + return $s; +} + + +