0
|
1 #!/usr/bin/perl
|
|
2
|
|
3 #This script was adopted and modified from ``scripts/combine_gFVKM_iFVKM_5.pl`.
|
|
4
|
|
5
|
|
6 if(@ARGV<4) { print "usage: $0 gFVKM_file iFVKM_file gene2NM_file EUMA_CUT(eg.50, in bp) output.final.gFVKM output.final.iFVKM\n"; exit; }
|
|
7 my ($gFVKM_file, $iFVKM_file, $gene2NM_file, $CUT, $final_gFVKM_file, $final_iFVKM_file ) = @ARGV;
|
|
8
|
|
9
|
|
10 my %gFVKM=();
|
|
11 my %gEUMA=();
|
|
12 my %allgenes=();
|
|
13 &read_gFVKM($gFVKM_file,\%gFVKM,\%gEUMA,\%allgenes);
|
|
14
|
|
15
|
|
16 my %iFVKM=();
|
|
17 my %iEUMA=();
|
|
18 my %Alltranscirpts=();
|
|
19 &read_iFVKM($iFVKM_file,\%iFVKM,\%iEUMA,\%Allisoforms);
|
|
20
|
|
21
|
|
22 my %gene2NM=();
|
|
23 &read_gene2NM($gene2NM_file,\%gene2NM);
|
|
24
|
|
25
|
|
26 my %derived_gFVKM = ();
|
|
27 &compute_derived_gFVKM(\%derived_gFVKM,\%iFVKM,\%allgenes);
|
|
28
|
|
29 my %final_gFVKM=();
|
|
30 &compute_final_gFVKM(\%allgenes,\%gFVKM,\%derived_gFVKM,\%final_gFVKM);
|
|
31
|
|
32 my %final_iFVKM=();
|
|
33 my %iFVKM_factor=();
|
|
34 &compute_final_iFVKM(\%final_iFVKM,\%iFVKM,\%final_gFVKM,\%iFVKM_factor,\%Allisoforms)
|
|
35
|
|
36 &generate_final_gFVKM_file($final_gFVKM_file,\%allgenes,\%gFVKM,\%derived_gFVKM,\%gEUMA,\%iFVKM,\%gene2NM,\%final_gFVKM);
|
|
37 &generate_final_iFVKM_file($final_iFVKM_file,\%iFVKM,\%final_iFVKM,\%iFVKM_factor,\%iEUMA,\%gene2NM,\%Allisoforms);
|
|
38
|
|
39
|
|
40
|
|
41 #################
|
|
42 ## subroutines ##
|
|
43 #################
|
|
44
|
|
45
|
|
46 sub read_gFVKM {
|
|
47 my $gFVKM_file = shift @_;
|
|
48 my $pGFVKM = shift @_;
|
|
49 my $pgEUMA = shift @_;
|
|
50 my $pAllgenes = shift @_;
|
|
51
|
|
52 open IN, $gFVKM_file or die "Can't open gFVKM file $gFVKM_file\n";
|
|
53 while(<IN>){
|
|
54 chomp;
|
|
55 my ($gene,$gFVKM_0,$gEUMA_0) = (split/\t/)[0,1,3];
|
|
56 next if $gEUMA_0 < $CUT;
|
|
57 $pGFVKM->{$gene} = $gFVKM_0;
|
|
58 $pgEUMA->{$gene} = $gEUMA_0;
|
|
59 $pAllgenes->{$gene}=1;
|
|
60 }
|
|
61 close IN;
|
|
62 }
|
|
63
|
|
64
|
|
65 sub read_iFVKM {
|
|
66 my $iFVKM_file = shift @_;
|
|
67 my $pIFVKM = shift @_;
|
|
68 my $piEUMA = shift @_;
|
|
69 my $pAllisoforms = shift @_;
|
|
70
|
|
71 open IN, $iFVKM_file or die "Can't open iFVKM file $iFVKM_file\n";
|
|
72 while(<IN>){
|
|
73 chomp;
|
|
74 my ($gid,$isoform,$iFVKM_0,$iEUMA_0) = (split/\t/)[0,1,2,4];
|
|
75 next if $iEUMA_0 < $CUT;
|
|
76 $pIFVKM->{$gid}{$isoform} = $iFVKM_0;
|
|
77 $piEUMA->{$gid}{$isoform} = $iEUMA_0;
|
|
78 $pAllisoforms->{$gid}{$isoform} = 1;
|
|
79 }
|
|
80 close IN;
|
|
81 }
|
|
82
|
|
83
|
|
84 sub read_gene2NM {
|
|
85 my $gene2NM_file = shift @_;
|
|
86 my $pGene2NM = shift @_;
|
|
87
|
|
88 open IN, $gene2NM_file or die "Can't open gene2NM file $gene2NM_file\n";
|
|
89 while(<IN>){
|
|
90 chomp;
|
|
91 my ($gid,$isoform) = split/\t/;
|
|
92 $pGene2NM->{$gid}{$isoform}=1;
|
|
93 }
|
|
94 close IN;
|
|
95 }
|
|
96
|
|
97
|
|
98
|
|
99 sub compute_derived_gFVKM {
|
|
100 my $pDerived_gFVKM = shift @_;
|
|
101 my $pIFVKM = shift @_;
|
|
102 my $pAllgenes = shift @_;
|
|
103
|
|
104 #compute derived gFVKM
|
|
105 for my $gene (keys %$pIFVKM){
|
|
106 my $sumiFVKM = &sum(values(%{$pIFVKM->{$gene}}));
|
|
107 $pDerived_gFVKM->{$gene} = $sumiFVKM;
|
|
108 $pAllgenes->{$gene}=1;
|
|
109 }
|
|
110 }
|
|
111
|
|
112
|
|
113 sub compute_final_iFVKM {
|
|
114 my $pFinal_iFVKM = shift @_;
|
|
115 my $pIFVKM = shift @_;
|
|
116 my $pFinalGFVKM = shift @_;
|
|
117 my $pIFVKM_factor = shift @_;
|
|
118 my $pAllisoforms = shift @_;
|
|
119
|
|
120 #rescale iFVKM based on final gFVKM
|
|
121 for my $gene (keys %$pIFVKM){
|
|
122 my $sumiFVKM = &sum(values %{$pIFVKM->{$gene}});
|
|
123 if($sumiFVKM == 0) { $pIFVKM_factor->{$gene} = "inf"; }
|
|
124 else { $pIFVKM_factor->{$gene} = $pFinalGFVKM->{$gene} / $sumiFVKM; }
|
|
125 for my $isoform (keys %{$pIFVKM->{$gene}}) {
|
|
126 $pFinal_iFVKM->{$gene} = $pIFVKM->{$gene};
|
|
127 $pAllisoforms->{$gene}{$isoform} =1;
|
|
128 }
|
|
129 }
|
|
130
|
|
131 }
|
|
132
|
|
133
|
|
134 # using max of gFVKM and sumiFVKM, instead of average.
|
|
135 sub compute_final_gFVKM {
|
|
136 my $pAllgenes = shift @_;
|
|
137 my $pGFVKM = shift @_;
|
|
138 my $pDerived_gFVKM = shift @_;
|
|
139 my $pFinal_gFVKM = shift @_;
|
|
140
|
|
141 for my $gene (sort keys %$pAllgenes){
|
|
142 my $orig_gFVKM = (exists $pGFVKM->{$gene})?$pGFVKM->{$gene}:'NA';
|
|
143 my $derived_gFVKM_val = (exists $pDerived_gFVKM->{$gene})?$pDerived_gFVKM->{$gene}:'NA';
|
|
144 my $final_gFVKM = 'NA';
|
|
145 if($orig_gFVKM ne 'NA' && $derived_gFVKM_val ne 'NA'){
|
|
146 #$final_gFVKM = ($orig_gFVKM+$derived_gFVKM_val)/2; #average
|
|
147 $final_gFVKM = ($orig_gFVKM>=$derived_gFVKM_val?$orig_gFVKM:$derived_gFVKM_val); #maximum
|
|
148 }
|
|
149 elsif($orig_gFVKM eq 'NA') { $final_gFVKM = $derived_gFVKM_val; }
|
|
150 elsif($derived_gFVKM_val eq 'NA') { $final_gFVKM = $orig_gFVKM; }
|
|
151
|
|
152 $pFinal_gFVKM->{$gene} = $final_gFVKM;
|
|
153 }
|
|
154 }
|
|
155
|
|
156
|
|
157
|
|
158
|
|
159 sub generate_final_gFVKM_file {
|
|
160 my $final_gFVKM_file = shift @_;
|
|
161 my $pAllgenes = shift @_;
|
|
162 my $pGFVKM = shift @_;
|
|
163 my $pDerived_gFVKM = shift @_;
|
|
164 my $pgEUMA = shift @_;
|
|
165 my $pIFVKM = shift @_;
|
|
166 my $pGene2NM = shift @_;
|
|
167 my $pFinal_gFVKM = shift @_;
|
|
168
|
|
169 open FG_FVKM, ">$final_gFVKM_file" or die "Can't write to final gFVKM file $final_gFVKM_file\n";
|
|
170 print FG_FVKM "gene\tfinal.gFVK\torig.gFVK\tderived_gFVK\tgEUMA\t#isoforms\t#measured_isoforms\n";
|
|
171 for my $gene (sort keys %$pAllgenes){
|
|
172
|
|
173 my $orig_gFVKM = (exists $pGFVKM->{$gene})?$pGFVKM->{$gene}:'NA';
|
|
174 my $derived_gFVKM = (exists $pDerived_gFVKM->{$gene})?$pDerived_gFVKM->{$gene}:'NA';
|
|
175 my $final_gFVKM = (exists $pFinal_gFVKM->{$gene})?$pFinal_gFVKM->{$gene}:'NA';
|
|
176
|
|
177 my $gEUMA='';
|
|
178 if(exists $pgEUMA->{$gene}){ $gEUMA = $pgEUMA->{$gene}; }
|
|
179 else { $gEUMA = 'below_cut'; }
|
|
180
|
|
181 my $nTr = scalar keys %{$pGene2NM->{$gene}};
|
|
182 my $nTr_measured = scalar keys %{$pIFVKM->{$gene}};
|
|
183
|
|
184 print FG_FVKM "$gene\t$final_gFVKM\t$orig_gFVKM\t$derived_gFVKM\t$gEUMA\t$nTr\t$nTr_measured\n";
|
|
185 }
|
|
186 close FG_FVKM;
|
|
187 }
|
|
188
|
|
189
|
|
190
|
|
191 sub generate_final_iFVKM_file {
|
|
192 my $final_iFVKM_file = shift @_;
|
|
193 my $pIFVKM = shift @_;
|
|
194 my $pFinal_iFVKM = shift @_;
|
|
195 my $pIFVKM_factor = shift @_;
|
|
196 my $piEUMA = shift @_;
|
|
197 my $pGene2NM = shift @_;
|
|
198 my $pAllisoforms = shift @_;
|
|
199
|
|
200 #print scalar keys %$pDerived_iFVKM; ##DEBUGGING
|
|
201
|
|
202 open FI_FVKM, ">$final_iFVKM_file" or die "Can't write to final iFVKM file $final_iFVKM_file\n";
|
|
203 print FI_FVKM "gene\tisoform\tfinal.iFVK\torig.iFVK\tiFVKM.factor\tiEUMA\t#isoforms\t#measured_isoforms\n";
|
|
204 for my $gene (sort keys %$pAllisoforms){
|
|
205
|
|
206 my $nTr = scalar keys %{$pGene2NM->{$gene}};
|
|
207 my $nTr_measured = scalar keys %{$pIFVKM->{$gene}};
|
|
208
|
|
209 my $iFVKM_factor = (exists $pIFVKM_factor->{$gene})?$pIFVKM_factor->{$gene}:'NA';
|
|
210
|
|
211 for my $isoform (sort keys %{$pAllisoforms->{$gene}}){
|
|
212
|
|
213 my $iEUMA='';
|
|
214 if(exists $piEUMA->{$gene}{$isoform}) { $iEUMA = $piEUMA->{$gene}{$isoform}; }
|
|
215 else { $iEUMA = 'below_cut'; }
|
|
216
|
|
217 my $final_iFVKM = (exists $pFinal_iFVKM->{$gene} && exists ${$pFinal_iFVKM->{$gene}}{$isoform})?$pFinal_iFVKM->{$gene}{$isoform}:$pIFVKM->{$gene}{$isoform};
|
|
218 print FI_FVKM "$gene\t$isoform\t$final_iFVKM\t$pIFVKM->{$gene}{$isoform}\t$iFVKM_factor\t$iEUMA\t$nTr\t$nTr_measured\n";
|
|
219 }
|
|
220 }
|
|
221 close FI_FVKM;
|
|
222 }
|
|
223
|
|
224 sub sum {
|
|
225 my $s=0;
|
|
226 while(defined(my $x=shift @_)){
|
|
227 $s+=$x;
|
|
228 }
|
|
229 return $s;
|
|
230 }
|
|
231
|
|
232
|
|
233
|