comparison NEUMA-1.2.1/combine_gFVKM_iFVKM_max.pl @ 0:c44c43d185ef draft default tip

NEUMA-1.2.1 Uploaded
author chawhwa
date Thu, 08 Aug 2013 00:46:13 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c44c43d185ef
1 #!/usr/bin/perl
2
3 #This script was adopted and modified from ``scripts/combine_gFVKM_iFVKM_5.pl`.
4
5
6 if(@ARGV<4) { print "usage: $0 gFVKM_file iFVKM_file gene2NM_file EUMA_CUT(eg.50, in bp) output.final.gFVKM output.final.iFVKM\n"; exit; }
7 my ($gFVKM_file, $iFVKM_file, $gene2NM_file, $CUT, $final_gFVKM_file, $final_iFVKM_file ) = @ARGV;
8
9
10 my %gFVKM=();
11 my %gEUMA=();
12 my %allgenes=();
13 &read_gFVKM($gFVKM_file,\%gFVKM,\%gEUMA,\%allgenes);
14
15
16 my %iFVKM=();
17 my %iEUMA=();
18 my %Alltranscirpts=();
19 &read_iFVKM($iFVKM_file,\%iFVKM,\%iEUMA,\%Allisoforms);
20
21
22 my %gene2NM=();
23 &read_gene2NM($gene2NM_file,\%gene2NM);
24
25
26 my %derived_gFVKM = ();
27 &compute_derived_gFVKM(\%derived_gFVKM,\%iFVKM,\%allgenes);
28
29 my %final_gFVKM=();
30 &compute_final_gFVKM(\%allgenes,\%gFVKM,\%derived_gFVKM,\%final_gFVKM);
31
32 my %final_iFVKM=();
33 my %iFVKM_factor=();
34 &compute_final_iFVKM(\%final_iFVKM,\%iFVKM,\%final_gFVKM,\%iFVKM_factor,\%Allisoforms)
35
36 &generate_final_gFVKM_file($final_gFVKM_file,\%allgenes,\%gFVKM,\%derived_gFVKM,\%gEUMA,\%iFVKM,\%gene2NM,\%final_gFVKM);
37 &generate_final_iFVKM_file($final_iFVKM_file,\%iFVKM,\%final_iFVKM,\%iFVKM_factor,\%iEUMA,\%gene2NM,\%Allisoforms);
38
39
40
41 #################
42 ## subroutines ##
43 #################
44
45
46 sub read_gFVKM {
47 my $gFVKM_file = shift @_;
48 my $pGFVKM = shift @_;
49 my $pgEUMA = shift @_;
50 my $pAllgenes = shift @_;
51
52 open IN, $gFVKM_file or die "Can't open gFVKM file $gFVKM_file\n";
53 while(<IN>){
54 chomp;
55 my ($gene,$gFVKM_0,$gEUMA_0) = (split/\t/)[0,1,3];
56 next if $gEUMA_0 < $CUT;
57 $pGFVKM->{$gene} = $gFVKM_0;
58 $pgEUMA->{$gene} = $gEUMA_0;
59 $pAllgenes->{$gene}=1;
60 }
61 close IN;
62 }
63
64
65 sub read_iFVKM {
66 my $iFVKM_file = shift @_;
67 my $pIFVKM = shift @_;
68 my $piEUMA = shift @_;
69 my $pAllisoforms = shift @_;
70
71 open IN, $iFVKM_file or die "Can't open iFVKM file $iFVKM_file\n";
72 while(<IN>){
73 chomp;
74 my ($gid,$isoform,$iFVKM_0,$iEUMA_0) = (split/\t/)[0,1,2,4];
75 next if $iEUMA_0 < $CUT;
76 $pIFVKM->{$gid}{$isoform} = $iFVKM_0;
77 $piEUMA->{$gid}{$isoform} = $iEUMA_0;
78 $pAllisoforms->{$gid}{$isoform} = 1;
79 }
80 close IN;
81 }
82
83
84 sub read_gene2NM {
85 my $gene2NM_file = shift @_;
86 my $pGene2NM = shift @_;
87
88 open IN, $gene2NM_file or die "Can't open gene2NM file $gene2NM_file\n";
89 while(<IN>){
90 chomp;
91 my ($gid,$isoform) = split/\t/;
92 $pGene2NM->{$gid}{$isoform}=1;
93 }
94 close IN;
95 }
96
97
98
99 sub compute_derived_gFVKM {
100 my $pDerived_gFVKM = shift @_;
101 my $pIFVKM = shift @_;
102 my $pAllgenes = shift @_;
103
104 #compute derived gFVKM
105 for my $gene (keys %$pIFVKM){
106 my $sumiFVKM = &sum(values(%{$pIFVKM->{$gene}}));
107 $pDerived_gFVKM->{$gene} = $sumiFVKM;
108 $pAllgenes->{$gene}=1;
109 }
110 }
111
112
113 sub compute_final_iFVKM {
114 my $pFinal_iFVKM = shift @_;
115 my $pIFVKM = shift @_;
116 my $pFinalGFVKM = shift @_;
117 my $pIFVKM_factor = shift @_;
118 my $pAllisoforms = shift @_;
119
120 #rescale iFVKM based on final gFVKM
121 for my $gene (keys %$pIFVKM){
122 my $sumiFVKM = &sum(values %{$pIFVKM->{$gene}});
123 if($sumiFVKM == 0) { $pIFVKM_factor->{$gene} = "inf"; }
124 else { $pIFVKM_factor->{$gene} = $pFinalGFVKM->{$gene} / $sumiFVKM; }
125 for my $isoform (keys %{$pIFVKM->{$gene}}) {
126 $pFinal_iFVKM->{$gene} = $pIFVKM->{$gene};
127 $pAllisoforms->{$gene}{$isoform} =1;
128 }
129 }
130
131 }
132
133
134 # using max of gFVKM and sumiFVKM, instead of average.
135 sub compute_final_gFVKM {
136 my $pAllgenes = shift @_;
137 my $pGFVKM = shift @_;
138 my $pDerived_gFVKM = shift @_;
139 my $pFinal_gFVKM = shift @_;
140
141 for my $gene (sort keys %$pAllgenes){
142 my $orig_gFVKM = (exists $pGFVKM->{$gene})?$pGFVKM->{$gene}:'NA';
143 my $derived_gFVKM_val = (exists $pDerived_gFVKM->{$gene})?$pDerived_gFVKM->{$gene}:'NA';
144 my $final_gFVKM = 'NA';
145 if($orig_gFVKM ne 'NA' && $derived_gFVKM_val ne 'NA'){
146 #$final_gFVKM = ($orig_gFVKM+$derived_gFVKM_val)/2; #average
147 $final_gFVKM = ($orig_gFVKM>=$derived_gFVKM_val?$orig_gFVKM:$derived_gFVKM_val); #maximum
148 }
149 elsif($orig_gFVKM eq 'NA') { $final_gFVKM = $derived_gFVKM_val; }
150 elsif($derived_gFVKM_val eq 'NA') { $final_gFVKM = $orig_gFVKM; }
151
152 $pFinal_gFVKM->{$gene} = $final_gFVKM;
153 }
154 }
155
156
157
158
159 sub generate_final_gFVKM_file {
160 my $final_gFVKM_file = shift @_;
161 my $pAllgenes = shift @_;
162 my $pGFVKM = shift @_;
163 my $pDerived_gFVKM = shift @_;
164 my $pgEUMA = shift @_;
165 my $pIFVKM = shift @_;
166 my $pGene2NM = shift @_;
167 my $pFinal_gFVKM = shift @_;
168
169 open FG_FVKM, ">$final_gFVKM_file" or die "Can't write to final gFVKM file $final_gFVKM_file\n";
170 print FG_FVKM "gene\tfinal.gFVK\torig.gFVK\tderived_gFVK\tgEUMA\t#isoforms\t#measured_isoforms\n";
171 for my $gene (sort keys %$pAllgenes){
172
173 my $orig_gFVKM = (exists $pGFVKM->{$gene})?$pGFVKM->{$gene}:'NA';
174 my $derived_gFVKM = (exists $pDerived_gFVKM->{$gene})?$pDerived_gFVKM->{$gene}:'NA';
175 my $final_gFVKM = (exists $pFinal_gFVKM->{$gene})?$pFinal_gFVKM->{$gene}:'NA';
176
177 my $gEUMA='';
178 if(exists $pgEUMA->{$gene}){ $gEUMA = $pgEUMA->{$gene}; }
179 else { $gEUMA = 'below_cut'; }
180
181 my $nTr = scalar keys %{$pGene2NM->{$gene}};
182 my $nTr_measured = scalar keys %{$pIFVKM->{$gene}};
183
184 print FG_FVKM "$gene\t$final_gFVKM\t$orig_gFVKM\t$derived_gFVKM\t$gEUMA\t$nTr\t$nTr_measured\n";
185 }
186 close FG_FVKM;
187 }
188
189
190
191 sub generate_final_iFVKM_file {
192 my $final_iFVKM_file = shift @_;
193 my $pIFVKM = shift @_;
194 my $pFinal_iFVKM = shift @_;
195 my $pIFVKM_factor = shift @_;
196 my $piEUMA = shift @_;
197 my $pGene2NM = shift @_;
198 my $pAllisoforms = shift @_;
199
200 #print scalar keys %$pDerived_iFVKM; ##DEBUGGING
201
202 open FI_FVKM, ">$final_iFVKM_file" or die "Can't write to final iFVKM file $final_iFVKM_file\n";
203 print FI_FVKM "gene\tisoform\tfinal.iFVK\torig.iFVK\tiFVKM.factor\tiEUMA\t#isoforms\t#measured_isoforms\n";
204 for my $gene (sort keys %$pAllisoforms){
205
206 my $nTr = scalar keys %{$pGene2NM->{$gene}};
207 my $nTr_measured = scalar keys %{$pIFVKM->{$gene}};
208
209 my $iFVKM_factor = (exists $pIFVKM_factor->{$gene})?$pIFVKM_factor->{$gene}:'NA';
210
211 for my $isoform (sort keys %{$pAllisoforms->{$gene}}){
212
213 my $iEUMA='';
214 if(exists $piEUMA->{$gene}{$isoform}) { $iEUMA = $piEUMA->{$gene}{$isoform}; }
215 else { $iEUMA = 'below_cut'; }
216
217 my $final_iFVKM = (exists $pFinal_iFVKM->{$gene} && exists ${$pFinal_iFVKM->{$gene}}{$isoform})?$pFinal_iFVKM->{$gene}{$isoform}:$pIFVKM->{$gene}{$isoform};
218 print FI_FVKM "$gene\t$isoform\t$final_iFVKM\t$pIFVKM->{$gene}{$isoform}\t$iFVKM_factor\t$iEUMA\t$nTr\t$nTr_measured\n";
219 }
220 }
221 close FI_FVKM;
222 }
223
224 sub sum {
225 my $s=0;
226 while(defined(my $x=shift @_)){
227 $s+=$x;
228 }
229 return $s;
230 }
231
232
233