19
|
1 #!/usr/bin/perl -w
|
|
2
|
|
3 #Margaret Antonio 16.01.13
|
|
4
|
|
5 #DESCRIPTION: Takes two aggregate.pl outputs and compares them using mean difference, pval for each
|
|
6 #gene. Can compare, for example, 19F in glucose and TIGR4 in glucose.
|
|
7 #DIFFERENT GENOMES (ie. diff. strains).
|
|
8 #Requires CONVERSION FILE
|
|
9
|
|
10
|
|
11 use Data::Dumper;
|
|
12 use strict;
|
|
13 use Getopt::Long;
|
|
14 #use warnings;
|
|
15 use File::Path;
|
|
16 use File::Basename;
|
|
17 use Statistics::Distributions;
|
|
18
|
|
19 #ASSIGN INPUTS TO VARIABLES USING FLAGS
|
|
20 our ($input1,$input2,$out,$sortkey,$round,$l1,$l2,$cfile);
|
|
21 GetOptions(
|
|
22 'o:s' =>\$out,
|
|
23 's:i' => \$sortkey,
|
|
24 'r:i'=> \$round,
|
|
25 'l1:s'=> \$l1,
|
|
26 'l2:s'=> \$l2,
|
|
27 'c:s'=> \$cfile,
|
|
28 'input1:s'=>\$input1,
|
|
29 'input2:s'=>\$input2
|
|
30 );
|
|
31
|
|
32
|
|
33 #THE @files ARRAY WILL CONTAIN INPUT FILE NAMES
|
|
34 my @files=($input1,$input2);
|
|
35
|
|
36 #GET LABELS:
|
|
37 my @labels = ($l1,$l2);
|
|
38
|
|
39 sub cleaner{
|
|
40 my $line=$_[0];
|
|
41 chomp($line);
|
|
42 $line =~ s/\x0d{0,1}\x0a{0,1}\Z//s;
|
|
43 return $line;
|
|
44 }
|
|
45
|
|
46
|
|
47 #CHECK IF REQ. VARIABLES WERE DEFINED USING FLAGS. IF NOT THEN USE DEFAULT VALUES
|
|
48 if (!$out) {$out="comp.".$labels[0].$labels[1].".csv"}
|
|
49 if (!$round){$round='%.4f'}
|
|
50
|
|
51 #OPEN INPUTTED AGGREGATE GENE FILES AND STORE THEIR CONTENTS INTO TWO HASHES
|
|
52 #FILE1 GOES INTO HASH %ONE AND FILE2 GOES INTO HASH %TWO.
|
|
53
|
|
54 #FILE1 OPENING ---> %one WHERE KEY:VALUE IS GENE_ID:(GENE_ID,INSERTIONS,MEAN,ETC.)
|
|
55 my @header;
|
|
56 my %one;
|
|
57
|
|
58 open (F1,'<',$files[0]);
|
|
59
|
|
60 #STORE COLUMN NAMES (FIRST LINE OF FILE1) FOR HEADER AND APPEND LABELS
|
|
61 my $head=<F1>; #the header in the file
|
|
62 my @cols=split(',',$head);
|
|
63 @cols=@cols[0,1,2,3,4,5]; #get rid of blank columns
|
|
64 for (my $j=0;$j<scalar @cols;$j++){
|
|
65 $cols[$j]=cleaner($cols[$j]).'-'.$labels[0]; #mark each column name with file it comes from
|
|
66 }
|
|
67 push (@header,@cols);
|
|
68
|
|
69 while (my $line=<F1>){
|
|
70 chomp $line;
|
|
71 my @info=split(",",$line);
|
|
72 #Only keep the first 6 columns (Ones about blanks aren't needed for comparisons)
|
|
73 @info=@info[0,1,2,3,4,5];
|
|
74 #Sometimes genes that don't have a gene name can't be blank, so fill with NA
|
|
75 if (!$info[5]){
|
|
76 $info[5]="";
|
|
77 }
|
|
78
|
|
79 $one{$info[0]}=\@info;
|
|
80 }
|
|
81 close F1;
|
|
82
|
|
83 #FILE2 OPENING ---> %two WHERE KEY:VALUE IS GENE_ID:(GENE_ID,INSERTIONS,MEAN,ETC.)
|
|
84
|
|
85 my %two;
|
|
86 open (F2,'<',$files[1]);
|
|
87
|
|
88 #STORE COLUMN NAMES (FIRST LINE OF FILE2) FOR HEADER AND APPEND LABELS
|
|
89 $head=<F2>; #the header in the file
|
|
90 @cols=split(',',$head);
|
|
91 @cols=@cols[0,1,2,3,4,5]; #get rid of blank columns
|
|
92 for (my $j=0;$j<scalar @cols;$j++){
|
|
93 $cols[$j]=cleaner($cols[$j]).'-'.$labels[1]; #mark each column name with file it comes from
|
|
94 }
|
|
95 push (@header,@cols);
|
|
96
|
|
97 while (my $line=<F2>){
|
|
98 $line = cleaner($line);
|
|
99 my @info=split(",",$line);
|
|
100 @info=@info[0,1,2,3,4,5];
|
|
101 if (!$info[5]){
|
|
102 $info[5]="";
|
|
103 }
|
|
104
|
|
105 $two{$info[0]}=\@info;
|
|
106 }
|
|
107 close F2;
|
|
108
|
|
109
|
|
110 #READ CONVERSION FILE INTO ARRAY.
|
|
111 #Conversion file must have strain 1 for file 1 in column 1 (index 0) and
|
|
112 #strain 2 for file 2 in column 2 (index 1)
|
|
113 #conversion file must be tab delimited with no NA fields
|
|
114 #If homologs (exist then take info from hashes (%one and %two) by referring to gene_id in KEY
|
|
115
|
|
116 my @all; #store all homologs in this hash
|
|
117 open (CONV,'<',$cfile);
|
|
118 while (my $line=<CONV>){
|
|
119 $line = cleaner($line);
|
|
120 my @genes=split("\t",$line); #Array @genes will contain two genes (SP_0000,SPT_0000)
|
|
121 if (scalar @genes==2 and (exists $one{$genes[0]}) and (exists $two{$genes[1]})){
|
|
122 my @info;
|
|
123 my @oneArray=@{$one{$genes[0]}};
|
|
124 my @twoArray=@{$two{$genes[1]}};
|
|
125 push (@info,@oneArray,@twoArray);
|
|
126 # Fitness values at index 1 and 7
|
|
127 my $diff=sprintf("$round",($info[1]-$info[7]));
|
|
128 my $total1=$info[2];
|
|
129 my $total2=$info[8];
|
|
130 if (!$info[3] or $info[3] eq ""){$info[3]="NA"};
|
|
131 if (!$info[4] or $info[4] eq ""){$info[4]="NA"};
|
|
132 if (!$info[9] or $info[9] eq ""){$info[9]="NA"};
|
|
133 if (!$info[10] or $info[10] eq ""){$info[10]="NA"};
|
|
134 my $sd1=$info[3];
|
|
135 my $se1=$info[4];
|
|
136 my $sd2=$info[9];
|
|
137 my $se2=$info[10];
|
|
138 my $df=$total1+$total2-2;
|
|
139 my ($tdist,$pval);
|
|
140 #TDIST, PVAL calculations with fail if standard dev, error, or counts are not real numbers
|
|
141 #or if 0 ends up in denominator
|
|
142 if ($sd1 eq "NA" or $sd2 eq "NA" or $total1==0 or $total2==0 or $sd1==0 or $sd2==0){
|
|
143 ($tdist,$pval)=("NA","NA");
|
|
144 }
|
|
145 else{
|
|
146 $tdist=sqrt((($diff)/(sqrt((($sd1**2)/$total1)+(($sd2**2)/$total2))))**2);
|
|
147 $pval=Statistics::Distributions::tprob($df,$tdist);
|
|
148 }
|
|
149 push (@info,$diff,$df,$tdist,$pval);
|
|
150 push (@all,\@info);
|
|
151 }
|
|
152 }
|
|
153 close CONV;
|
|
154
|
|
155 #SORT THE HOMOLOGS BY THE SORTKEY OR BY DEFAULT DIFFERENCE IN MEAN FITNESSES
|
|
156 if (!$sortkey){
|
|
157 $sortkey=12; #for mean difference
|
|
158 }
|
|
159 my @sorted = sort { $b->[$sortkey] <=> $a->[$sortkey] } @all;
|
|
160
|
|
161 #FINISH THE HEADER BY ADDING COLUMN NAMES FOR MEAN-DIFF, DOF, TDIST, AND PVALUE
|
|
162 my $field="MeanDiff(".$labels[0].'.'.$labels[1].")";
|
|
163 push (@header,$field,"DOF","TDIST","PVALUE");
|
|
164
|
|
165 #PRINT MATCHED HOMOLOG INFORMATION INTO A SINGLE OUTPUT FILE
|
|
166 open OUT, '>',"$out";
|
|
167 print OUT (join(',',@header),"\n");
|
|
168 foreach (@sorted){
|
|
169 my @comparison=@{$_};
|
|
170 print OUT join(',',@comparison),"\n";
|
|
171 }
|
|
172
|
|
173 close OUT;
|
|
174
|
|
175
|