Mercurial > repos > chawhwa > neuma
comparison NEUMA-1.2.1/combine_gFVKM_iFVKM_max.pl @ 0:c44c43d185ef draft default tip
NEUMA-1.2.1 Uploaded
author | chawhwa |
---|---|
date | Thu, 08 Aug 2013 00:46:13 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c44c43d185ef |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 #This script was adopted and modified from ``scripts/combine_gFVKM_iFVKM_5.pl`. | |
4 | |
5 | |
6 if(@ARGV<4) { print "usage: $0 gFVKM_file iFVKM_file gene2NM_file EUMA_CUT(eg.50, in bp) output.final.gFVKM output.final.iFVKM\n"; exit; } | |
7 my ($gFVKM_file, $iFVKM_file, $gene2NM_file, $CUT, $final_gFVKM_file, $final_iFVKM_file ) = @ARGV; | |
8 | |
9 | |
10 my %gFVKM=(); | |
11 my %gEUMA=(); | |
12 my %allgenes=(); | |
13 &read_gFVKM($gFVKM_file,\%gFVKM,\%gEUMA,\%allgenes); | |
14 | |
15 | |
16 my %iFVKM=(); | |
17 my %iEUMA=(); | |
18 my %Alltranscirpts=(); | |
19 &read_iFVKM($iFVKM_file,\%iFVKM,\%iEUMA,\%Allisoforms); | |
20 | |
21 | |
22 my %gene2NM=(); | |
23 &read_gene2NM($gene2NM_file,\%gene2NM); | |
24 | |
25 | |
26 my %derived_gFVKM = (); | |
27 &compute_derived_gFVKM(\%derived_gFVKM,\%iFVKM,\%allgenes); | |
28 | |
29 my %final_gFVKM=(); | |
30 &compute_final_gFVKM(\%allgenes,\%gFVKM,\%derived_gFVKM,\%final_gFVKM); | |
31 | |
32 my %final_iFVKM=(); | |
33 my %iFVKM_factor=(); | |
34 &compute_final_iFVKM(\%final_iFVKM,\%iFVKM,\%final_gFVKM,\%iFVKM_factor,\%Allisoforms) | |
35 | |
36 &generate_final_gFVKM_file($final_gFVKM_file,\%allgenes,\%gFVKM,\%derived_gFVKM,\%gEUMA,\%iFVKM,\%gene2NM,\%final_gFVKM); | |
37 &generate_final_iFVKM_file($final_iFVKM_file,\%iFVKM,\%final_iFVKM,\%iFVKM_factor,\%iEUMA,\%gene2NM,\%Allisoforms); | |
38 | |
39 | |
40 | |
41 ################# | |
42 ## subroutines ## | |
43 ################# | |
44 | |
45 | |
46 sub read_gFVKM { | |
47 my $gFVKM_file = shift @_; | |
48 my $pGFVKM = shift @_; | |
49 my $pgEUMA = shift @_; | |
50 my $pAllgenes = shift @_; | |
51 | |
52 open IN, $gFVKM_file or die "Can't open gFVKM file $gFVKM_file\n"; | |
53 while(<IN>){ | |
54 chomp; | |
55 my ($gene,$gFVKM_0,$gEUMA_0) = (split/\t/)[0,1,3]; | |
56 next if $gEUMA_0 < $CUT; | |
57 $pGFVKM->{$gene} = $gFVKM_0; | |
58 $pgEUMA->{$gene} = $gEUMA_0; | |
59 $pAllgenes->{$gene}=1; | |
60 } | |
61 close IN; | |
62 } | |
63 | |
64 | |
65 sub read_iFVKM { | |
66 my $iFVKM_file = shift @_; | |
67 my $pIFVKM = shift @_; | |
68 my $piEUMA = shift @_; | |
69 my $pAllisoforms = shift @_; | |
70 | |
71 open IN, $iFVKM_file or die "Can't open iFVKM file $iFVKM_file\n"; | |
72 while(<IN>){ | |
73 chomp; | |
74 my ($gid,$isoform,$iFVKM_0,$iEUMA_0) = (split/\t/)[0,1,2,4]; | |
75 next if $iEUMA_0 < $CUT; | |
76 $pIFVKM->{$gid}{$isoform} = $iFVKM_0; | |
77 $piEUMA->{$gid}{$isoform} = $iEUMA_0; | |
78 $pAllisoforms->{$gid}{$isoform} = 1; | |
79 } | |
80 close IN; | |
81 } | |
82 | |
83 | |
84 sub read_gene2NM { | |
85 my $gene2NM_file = shift @_; | |
86 my $pGene2NM = shift @_; | |
87 | |
88 open IN, $gene2NM_file or die "Can't open gene2NM file $gene2NM_file\n"; | |
89 while(<IN>){ | |
90 chomp; | |
91 my ($gid,$isoform) = split/\t/; | |
92 $pGene2NM->{$gid}{$isoform}=1; | |
93 } | |
94 close IN; | |
95 } | |
96 | |
97 | |
98 | |
99 sub compute_derived_gFVKM { | |
100 my $pDerived_gFVKM = shift @_; | |
101 my $pIFVKM = shift @_; | |
102 my $pAllgenes = shift @_; | |
103 | |
104 #compute derived gFVKM | |
105 for my $gene (keys %$pIFVKM){ | |
106 my $sumiFVKM = &sum(values(%{$pIFVKM->{$gene}})); | |
107 $pDerived_gFVKM->{$gene} = $sumiFVKM; | |
108 $pAllgenes->{$gene}=1; | |
109 } | |
110 } | |
111 | |
112 | |
113 sub compute_final_iFVKM { | |
114 my $pFinal_iFVKM = shift @_; | |
115 my $pIFVKM = shift @_; | |
116 my $pFinalGFVKM = shift @_; | |
117 my $pIFVKM_factor = shift @_; | |
118 my $pAllisoforms = shift @_; | |
119 | |
120 #rescale iFVKM based on final gFVKM | |
121 for my $gene (keys %$pIFVKM){ | |
122 my $sumiFVKM = &sum(values %{$pIFVKM->{$gene}}); | |
123 if($sumiFVKM == 0) { $pIFVKM_factor->{$gene} = "inf"; } | |
124 else { $pIFVKM_factor->{$gene} = $pFinalGFVKM->{$gene} / $sumiFVKM; } | |
125 for my $isoform (keys %{$pIFVKM->{$gene}}) { | |
126 $pFinal_iFVKM->{$gene} = $pIFVKM->{$gene}; | |
127 $pAllisoforms->{$gene}{$isoform} =1; | |
128 } | |
129 } | |
130 | |
131 } | |
132 | |
133 | |
134 # using max of gFVKM and sumiFVKM, instead of average. | |
135 sub compute_final_gFVKM { | |
136 my $pAllgenes = shift @_; | |
137 my $pGFVKM = shift @_; | |
138 my $pDerived_gFVKM = shift @_; | |
139 my $pFinal_gFVKM = shift @_; | |
140 | |
141 for my $gene (sort keys %$pAllgenes){ | |
142 my $orig_gFVKM = (exists $pGFVKM->{$gene})?$pGFVKM->{$gene}:'NA'; | |
143 my $derived_gFVKM_val = (exists $pDerived_gFVKM->{$gene})?$pDerived_gFVKM->{$gene}:'NA'; | |
144 my $final_gFVKM = 'NA'; | |
145 if($orig_gFVKM ne 'NA' && $derived_gFVKM_val ne 'NA'){ | |
146 #$final_gFVKM = ($orig_gFVKM+$derived_gFVKM_val)/2; #average | |
147 $final_gFVKM = ($orig_gFVKM>=$derived_gFVKM_val?$orig_gFVKM:$derived_gFVKM_val); #maximum | |
148 } | |
149 elsif($orig_gFVKM eq 'NA') { $final_gFVKM = $derived_gFVKM_val; } | |
150 elsif($derived_gFVKM_val eq 'NA') { $final_gFVKM = $orig_gFVKM; } | |
151 | |
152 $pFinal_gFVKM->{$gene} = $final_gFVKM; | |
153 } | |
154 } | |
155 | |
156 | |
157 | |
158 | |
159 sub generate_final_gFVKM_file { | |
160 my $final_gFVKM_file = shift @_; | |
161 my $pAllgenes = shift @_; | |
162 my $pGFVKM = shift @_; | |
163 my $pDerived_gFVKM = shift @_; | |
164 my $pgEUMA = shift @_; | |
165 my $pIFVKM = shift @_; | |
166 my $pGene2NM = shift @_; | |
167 my $pFinal_gFVKM = shift @_; | |
168 | |
169 open FG_FVKM, ">$final_gFVKM_file" or die "Can't write to final gFVKM file $final_gFVKM_file\n"; | |
170 print FG_FVKM "gene\tfinal.gFVK\torig.gFVK\tderived_gFVK\tgEUMA\t#isoforms\t#measured_isoforms\n"; | |
171 for my $gene (sort keys %$pAllgenes){ | |
172 | |
173 my $orig_gFVKM = (exists $pGFVKM->{$gene})?$pGFVKM->{$gene}:'NA'; | |
174 my $derived_gFVKM = (exists $pDerived_gFVKM->{$gene})?$pDerived_gFVKM->{$gene}:'NA'; | |
175 my $final_gFVKM = (exists $pFinal_gFVKM->{$gene})?$pFinal_gFVKM->{$gene}:'NA'; | |
176 | |
177 my $gEUMA=''; | |
178 if(exists $pgEUMA->{$gene}){ $gEUMA = $pgEUMA->{$gene}; } | |
179 else { $gEUMA = 'below_cut'; } | |
180 | |
181 my $nTr = scalar keys %{$pGene2NM->{$gene}}; | |
182 my $nTr_measured = scalar keys %{$pIFVKM->{$gene}}; | |
183 | |
184 print FG_FVKM "$gene\t$final_gFVKM\t$orig_gFVKM\t$derived_gFVKM\t$gEUMA\t$nTr\t$nTr_measured\n"; | |
185 } | |
186 close FG_FVKM; | |
187 } | |
188 | |
189 | |
190 | |
191 sub generate_final_iFVKM_file { | |
192 my $final_iFVKM_file = shift @_; | |
193 my $pIFVKM = shift @_; | |
194 my $pFinal_iFVKM = shift @_; | |
195 my $pIFVKM_factor = shift @_; | |
196 my $piEUMA = shift @_; | |
197 my $pGene2NM = shift @_; | |
198 my $pAllisoforms = shift @_; | |
199 | |
200 #print scalar keys %$pDerived_iFVKM; ##DEBUGGING | |
201 | |
202 open FI_FVKM, ">$final_iFVKM_file" or die "Can't write to final iFVKM file $final_iFVKM_file\n"; | |
203 print FI_FVKM "gene\tisoform\tfinal.iFVK\torig.iFVK\tiFVKM.factor\tiEUMA\t#isoforms\t#measured_isoforms\n"; | |
204 for my $gene (sort keys %$pAllisoforms){ | |
205 | |
206 my $nTr = scalar keys %{$pGene2NM->{$gene}}; | |
207 my $nTr_measured = scalar keys %{$pIFVKM->{$gene}}; | |
208 | |
209 my $iFVKM_factor = (exists $pIFVKM_factor->{$gene})?$pIFVKM_factor->{$gene}:'NA'; | |
210 | |
211 for my $isoform (sort keys %{$pAllisoforms->{$gene}}){ | |
212 | |
213 my $iEUMA=''; | |
214 if(exists $piEUMA->{$gene}{$isoform}) { $iEUMA = $piEUMA->{$gene}{$isoform}; } | |
215 else { $iEUMA = 'below_cut'; } | |
216 | |
217 my $final_iFVKM = (exists $pFinal_iFVKM->{$gene} && exists ${$pFinal_iFVKM->{$gene}}{$isoform})?$pFinal_iFVKM->{$gene}{$isoform}:$pIFVKM->{$gene}{$isoform}; | |
218 print FI_FVKM "$gene\t$isoform\t$final_iFVKM\t$pIFVKM->{$gene}{$isoform}\t$iFVKM_factor\t$iEUMA\t$nTr\t$nTr_measured\n"; | |
219 } | |
220 } | |
221 close FI_FVKM; | |
222 } | |
223 | |
224 sub sum { | |
225 my $s=0; | |
226 while(defined(my $x=shift @_)){ | |
227 $s+=$x; | |
228 } | |
229 return $s; | |
230 } | |
231 | |
232 | |
233 |