0
|
1 #!/usr/bin/perl -w
|
|
2
|
|
3 #Margaret Antonio updated 16.10.03
|
|
4
|
|
5 use strict;
|
|
6 use Getopt::Long;
|
|
7 use warnings;
|
|
8 use Bio::SeqIO;
|
|
9
|
|
10 #AVAILABLE OPTIONS. WILL PRINT UPON ERROR
|
|
11 sub print_usage() {
|
|
12 print "\n####################################################################\n";
|
|
13 print "singleVal: creates a wig file with position and insertion count OR fitness\n";
|
|
14 print "\nDESCRIPTION: ";
|
|
15 print "Integrates multiple files of transposon insertion data and outputs\n";
|
|
16 print "aggregate fitness within a sliding window (specified by size and step).\n";
|
|
17
|
|
18 print "\nUSAGE:\n";
|
|
19 print "perl singleVal.pl <OPTIONS> <REQ OUTPUT TYPE(S)> <INPUT FILE(S)>\n\n";
|
|
20
|
|
21 print "\nREQUIRED:\n";
|
|
22 print " -d\tDirectory containing all input files (files from\n";
|
|
23 print " \taggregate script)\n";
|
|
24 print " \tOR\n";
|
|
25 print " \tIn the command line (without a flag), input the name(s) of\n";
|
|
26 print " \tfiles containing gene fitness values (output of calcFit). \n\n";
|
|
27 print " -x\tCutoff: Don't include fitness scores with average counts (c1+c2)/2 < x (default: 10)\n";
|
|
28 print " -b\tBlanks: Exclude -b % of blank fitness scores (scores where c2 = 0) (default: 0 = 0%)\n";
|
|
29 print " -w\tUse weighted algorithm to calculate averages, variance, sd, se\n";
|
|
30 print " -l\tWeight ceiling: maximum value to use as a weight (default: 50)\n";
|
|
31
|
|
32 print "OPTIONAL:\n";
|
|
33 print " -h\tPrints usage and exits program\n";
|
|
34 print " -o\tOutput file for comparison data. Default: singleVal.wig\n";
|
|
35 print " -v\tString value for output: 'fit' for fitness OR 'count' for count\n";
|
|
36 print " \tDefault: fit for fitness\n";
|
|
37 print " -n\tName of the reference genome, to be included in the wig header\n";
|
|
38 print " \tDefault: genome\n";
|
|
39
|
|
40 print " \n~~~~Always check that file paths are correctly specified~~~~\n";
|
|
41 print "\n##################################################################\n";
|
|
42
|
|
43 }
|
|
44
|
|
45
|
|
46 #ASSIGN INPUTS TO VARIABLES
|
|
47 our ($help,$ref_genome,$indir,$out,$cutoff,$blank_pc,$weight_ceiling);
|
|
48 GetOptions(
|
|
49 'h'=>\$help,
|
|
50 'o:s' =>\$out,
|
|
51 'c:i' =>\$cutoff,
|
|
52 'b:i' =>\$blank_pc,
|
|
53 'l:i' =>\$weight_ceiling,
|
|
54 );
|
|
55
|
|
56 sub get_time() {
|
|
57 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time);
|
|
58 return "$hour:$min:$sec";
|
|
59 }
|
|
60
|
|
61 if ($help){
|
|
62 print_usage();
|
|
63 exit;
|
|
64 }
|
|
65 if (!$indir and (scalar @ARGV==0)){
|
|
66 print "\nERROR: Please correctly specify input files or directory\n";
|
|
67 print_usage();
|
|
68 print "\n";
|
|
69 exit;
|
|
70 }
|
|
71
|
|
72 #ADDED BY MLA allows input to be directory---good for inputting L1-L6
|
|
73 my @files=@ARGV;
|
|
74
|
|
75
|
|
76 #SET DEFAULTS
|
|
77 if (!$cutoff){$cutoff=10}
|
|
78 if (!$blank_pc){$blank_pc=0}
|
|
79 if (!$weight_ceiling){$weight_ceiling=50}
|
|
80 if (!$out){$out="singleVal.csv"}
|
|
81
|
|
82 # Returns mean, variance, sd, se
|
|
83 sub average {
|
|
84 my $scoreref = shift @_;
|
|
85 my @scores = @{$scoreref};
|
|
86
|
|
87 my $sum=0;
|
|
88 my $num=0;
|
|
89
|
|
90 # Get the average.
|
|
91 foreach my $w (@scores) {
|
|
92 $sum += $w;
|
|
93 $num++;
|
|
94 }
|
|
95 my $average= $sum/$num;
|
|
96 my $xminusxbars = 0;
|
|
97
|
|
98 # Get the variance.
|
|
99 foreach my $w (@scores) {
|
|
100 $xminusxbars += ($w-$average)**2;
|
|
101 }
|
|
102 my $variance = (1/($num-1)) * $xminusxbars;
|
|
103 my $sd = sqrt($variance);
|
|
104 my $se = $sd / sqrt($num);
|
|
105
|
|
106 return ($average, $variance, $sd, $se);
|
|
107
|
|
108 }
|
|
109
|
|
110 # Takes two parameters, both hashrefs to lists.
|
|
111 # 1) hashref to list of scores
|
|
112 # 2) hashref to list of weights, equal in length to the scores.
|
|
113 sub weighted_average {
|
|
114
|
|
115 my $scoreref = shift @_;
|
|
116 my $weightref = shift @_;
|
|
117 my @scores = @{$scoreref};
|
|
118 my @weights = @{$weightref};
|
|
119
|
|
120 my $sum=0;
|
|
121 my ($weighted_average, $weighted_variance)=(0,0);
|
|
122 my $v2;
|
|
123
|
|
124 # Go through once to get total, calculate V2.
|
|
125 for (my $i=0; $i<@weights; $i++) {
|
|
126 $sum += $weights[$i];
|
|
127 $v2 += $weights[$i]**2;
|
|
128 }
|
|
129 if ($sum == 0) { return 0; } # No scores given?
|
|
130
|
|
131 my $scor = join (' ', @scores);
|
|
132 my $wght = join (' ', @weights);
|
|
133
|
|
134 # Now calculated weighted average.
|
|
135 my ($top, $bottom) = (0,0);
|
|
136 for (my $i=0; $i<@weights; $i++) {
|
|
137 $top += $weights[$i] * $scores[$i];
|
|
138 $bottom += $weights[$i];
|
|
139 }
|
|
140 $weighted_average = $top/$bottom;
|
|
141 #print "WA: $weighted_average\n";
|
|
142
|
|
143 ($top, $bottom) = (0,0);
|
|
144 # Now calculate weighted sample variance.
|
|
145 for (my $i=0; $i<@weights; $i++) {
|
|
146 $top += ( $weights[$i] * ($scores[$i] - $weighted_average)**2);
|
|
147 $bottom += $weights[$i];
|
|
148 }
|
|
149 $weighted_variance = $top/$bottom;
|
|
150
|
|
151 my $weighted_stdev = sqrt($weighted_variance);
|
|
152 my $weighted_stder = $weighted_stdev / sqrt(@scores); # / length scores.
|
|
153
|
|
154 return ($weighted_average, $weighted_variance, $weighted_stdev, $weighted_stder);
|
|
155 }
|
|
156
|
|
157 my %pos_summary;
|
|
158
|
|
159
|
|
160 foreach my $filename (@files) {
|
|
161 print "\n",$filename,"\n";
|
|
162 open IN, $filename;
|
|
163 my %hash;
|
|
164 while (my $line = <IN>) {
|
|
165 chomp $line;
|
|
166 my @lines = split(/,/,$line);
|
|
167 my $pos = $lines[0];
|
|
168 my $w = $lines[12];
|
|
169 if ($w and $w eq 'nW') {next;}
|
|
170 if (!$w) { $w = 0 } # For blanks
|
|
171 my $c1 = $lines[2];
|
|
172 my $c2 = $lines[3];
|
|
173 my $avg = ($c1+$c2)/2; # Later: change which function to use? C1? AVG(C1+C2)?
|
|
174 if ($avg < $cutoff) { next; } # Skip cutoff genes.
|
|
175 if ($avg >= $weight_ceiling) { $avg = $weight_ceiling; } # Maximum weight.
|
|
176
|
|
177 my @empty;
|
|
178 if (!$pos_summary{$pos}) {
|
|
179 $pos_summary{$pos}{w} = [@empty];
|
|
180 $pos_summary{$pos}{s} = [@empty];
|
|
181 }
|
|
182 $pos_summary{$pos}{w} = [@{$pos_summary{$pos}{w}}, $w]; # List of Fitness scores.
|
|
183 $pos_summary{$pos}{s} = [@{$pos_summary{$pos}{s}}, $avg]; # List of counts used to generate those fitness scores.
|
|
184 }
|
|
185 close IN;
|
|
186
|
|
187 }
|
|
188
|
|
189 open SUMMARY, ">",$out;
|
|
190
|
|
191
|
|
192 print SUMMARY "pos,fitness,ins_count,fitness_sd,fitness_se\n";
|
|
193 # Now print out summary stats.
|
|
194 foreach my $key (sort {$a<=>$b} keys %pos_summary) {
|
|
195 if (!$key) {next}
|
|
196 my $sum=0;
|
|
197 my $num=0;
|
|
198 my $avgsum = 0;
|
|
199
|
|
200 # Get the average.
|
|
201 foreach my $w (@{$pos_summary{$key}{w}}) {
|
|
202 $sum += $w;
|
|
203 $num++;
|
|
204 }
|
|
205 my $average= $sum/$num;
|
|
206 my $xminusxbars = 0;
|
|
207
|
|
208 # Get the standard deviation.
|
|
209 foreach my $w (@{$pos_summary{$key}{w}}) {
|
|
210 $xminusxbars += ($w-$average)**2;
|
|
211 }
|
|
212 my ($sd, $se) = ('','');
|
|
213 if ($num > 1) {
|
|
214 $sd = sqrt($xminusxbars/($num-1));
|
|
215 $se = $sd / sqrt($num);
|
|
216 }
|
|
217
|
|
218 print SUMMARY "$key,$average,$num,$sd,$se\n";
|
|
219
|
|
220 }
|
|
221
|
|
222
|
|
223 close SUMMARY;
|