annotate singleFitness.pl @ 6:f4b2c15caf52 draft

Uploaded
author antmarge
date Sat, 18 Mar 2017 19:07:12 -0400
parents d3227d69c228
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
1 #!/usr/bin/perl -w
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
2
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
3 #Margaret Antonio updated 16.10.03
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
4
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
5 use strict;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
6 use Getopt::Long;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
7 use warnings;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
8 use Bio::SeqIO;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
9
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
10 #AVAILABLE OPTIONS. WILL PRINT UPON ERROR
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
11 sub print_usage() {
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
12 print "\n####################################################################\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
13 print "singleVal: creates a wig file with position and insertion count OR fitness\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
14 print "\nDESCRIPTION: ";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
15 print "Integrates multiple files of transposon insertion data and outputs\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
16 print "aggregate fitness within a sliding window (specified by size and step).\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
17
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
18 print "\nUSAGE:\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
19 print "perl singleVal.pl <OPTIONS> <REQ OUTPUT TYPE(S)> <INPUT FILE(S)>\n\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
20
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
21 print "\nREQUIRED:\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
22 print " -d\tDirectory containing all input files (files from\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
23 print " \taggregate script)\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
24 print " \tOR\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
25 print " \tIn the command line (without a flag), input the name(s) of\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
26 print " \tfiles containing gene fitness values (output of calcFit). \n\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
27 print " -x\tCutoff: Don't include fitness scores with average counts (c1+c2)/2 < x (default: 10)\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
28 print " -b\tBlanks: Exclude -b % of blank fitness scores (scores where c2 = 0) (default: 0 = 0%)\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
29 print " -w\tUse weighted algorithm to calculate averages, variance, sd, se\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
30 print " -l\tWeight ceiling: maximum value to use as a weight (default: 50)\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
31
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
32 print "OPTIONAL:\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
33 print " -h\tPrints usage and exits program\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
34 print " -o\tOutput file for comparison data. Default: singleVal.wig\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
35 print " -v\tString value for output: 'fit' for fitness OR 'count' for count\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
36 print " \tDefault: fit for fitness\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
37 print " -n\tName of the reference genome, to be included in the wig header\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
38 print " \tDefault: genome\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
39
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
40 print " \n~~~~Always check that file paths are correctly specified~~~~\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
41 print "\n##################################################################\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
42
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
43 }
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
44
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
45
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
46 #ASSIGN INPUTS TO VARIABLES
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
47 our ($help,$ref_genome,$indir,$out,$cutoff,$blank_pc,$weight_ceiling);
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
48 GetOptions(
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
49 'h'=>\$help,
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
50 'o:s' =>\$out,
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
51 'c:i' =>\$cutoff,
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
52 'b:i' =>\$blank_pc,
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
53 'l:i' =>\$weight_ceiling,
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
54 );
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
55
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
56 sub get_time() {
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
57 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time);
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
58 return "$hour:$min:$sec";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
59 }
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
60
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
61 if ($help){
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
62 print_usage();
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
63 exit;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
64 }
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
65 if (!$indir and (scalar @ARGV==0)){
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
66 print "\nERROR: Please correctly specify input files or directory\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
67 print_usage();
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
68 print "\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
69 exit;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
70 }
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
71
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
72 #ADDED BY MLA allows input to be directory---good for inputting L1-L6
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
73 my @files=@ARGV;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
74
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
75
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
76 #SET DEFAULTS
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
77 if (!$cutoff){$cutoff=10}
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
78 if (!$blank_pc){$blank_pc=0}
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
79 if (!$weight_ceiling){$weight_ceiling=50}
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
80 if (!$out){$out="singleVal.csv"}
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
81
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
82 # Returns mean, variance, sd, se
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
83 sub average {
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
84 my $scoreref = shift @_;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
85 my @scores = @{$scoreref};
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
86
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
87 my $sum=0;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
88 my $num=0;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
89
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
90 # Get the average.
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
91 foreach my $w (@scores) {
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
92 $sum += $w;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
93 $num++;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
94 }
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
95 my $average= $sum/$num;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
96 my $xminusxbars = 0;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
97
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
98 # Get the variance.
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
99 foreach my $w (@scores) {
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
100 $xminusxbars += ($w-$average)**2;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
101 }
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
102 my $variance = (1/($num-1)) * $xminusxbars;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
103 my $sd = sqrt($variance);
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
104 my $se = $sd / sqrt($num);
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
105
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
106 return ($average, $variance, $sd, $se);
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
107
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
108 }
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
109
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
110 # Takes two parameters, both hashrefs to lists.
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
111 # 1) hashref to list of scores
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
112 # 2) hashref to list of weights, equal in length to the scores.
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
113 sub weighted_average {
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
114
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
115 my $scoreref = shift @_;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
116 my $weightref = shift @_;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
117 my @scores = @{$scoreref};
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
118 my @weights = @{$weightref};
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
119
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
120 my $sum=0;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
121 my ($weighted_average, $weighted_variance)=(0,0);
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
122 my $v2;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
123
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
124 # Go through once to get total, calculate V2.
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
125 for (my $i=0; $i<@weights; $i++) {
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
126 $sum += $weights[$i];
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
127 $v2 += $weights[$i]**2;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
128 }
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
129 if ($sum == 0) { return 0; } # No scores given?
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
130
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
131 my $scor = join (' ', @scores);
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
132 my $wght = join (' ', @weights);
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
133
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
134 # Now calculated weighted average.
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
135 my ($top, $bottom) = (0,0);
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
136 for (my $i=0; $i<@weights; $i++) {
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
137 $top += $weights[$i] * $scores[$i];
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
138 $bottom += $weights[$i];
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
139 }
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
140 $weighted_average = $top/$bottom;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
141 #print "WA: $weighted_average\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
142
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
143 ($top, $bottom) = (0,0);
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
144 # Now calculate weighted sample variance.
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
145 for (my $i=0; $i<@weights; $i++) {
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
146 $top += ( $weights[$i] * ($scores[$i] - $weighted_average)**2);
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
147 $bottom += $weights[$i];
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
148 }
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
149 $weighted_variance = $top/$bottom;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
150
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
151 my $weighted_stdev = sqrt($weighted_variance);
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
152 my $weighted_stder = $weighted_stdev / sqrt(@scores); # / length scores.
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
153
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
154 return ($weighted_average, $weighted_variance, $weighted_stdev, $weighted_stder);
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
155 }
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
156
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
157 my %pos_summary;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
158
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
159
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
160 foreach my $filename (@files) {
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
161 print "\n",$filename,"\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
162 open IN, $filename;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
163 my %hash;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
164 while (my $line = <IN>) {
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
165 chomp $line;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
166 my @lines = split(/,/,$line);
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
167 my $pos = $lines[0];
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
168 my $w = $lines[12];
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
169 if ($w and $w eq 'nW') {next;}
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
170 if (!$w) { $w = 0 } # For blanks
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
171 my $c1 = $lines[2];
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
172 my $c2 = $lines[3];
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
173 my $avg = ($c1+$c2)/2; # Later: change which function to use? C1? AVG(C1+C2)?
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
174 if ($avg < $cutoff) { next; } # Skip cutoff genes.
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
175 if ($avg >= $weight_ceiling) { $avg = $weight_ceiling; } # Maximum weight.
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
176
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
177 my @empty;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
178 if (!$pos_summary{$pos}) {
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
179 $pos_summary{$pos}{w} = [@empty];
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
180 $pos_summary{$pos}{s} = [@empty];
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
181 }
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
182 $pos_summary{$pos}{w} = [@{$pos_summary{$pos}{w}}, $w]; # List of Fitness scores.
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
183 $pos_summary{$pos}{s} = [@{$pos_summary{$pos}{s}}, $avg]; # List of counts used to generate those fitness scores.
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
184 }
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
185 close IN;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
186
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
187 }
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
188
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
189 open SUMMARY, ">",$out;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
190
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
191
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
192 print SUMMARY "pos,fitness,ins_count,fitness_sd,fitness_se\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
193 # Now print out summary stats.
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
194 foreach my $key (sort {$a<=>$b} keys %pos_summary) {
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
195 if (!$key) {next}
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
196 my $sum=0;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
197 my $num=0;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
198 my $avgsum = 0;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
199
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
200 # Get the average.
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
201 foreach my $w (@{$pos_summary{$key}{w}}) {
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
202 $sum += $w;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
203 $num++;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
204 }
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
205 my $average= $sum/$num;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
206 my $xminusxbars = 0;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
207
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
208 # Get the standard deviation.
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
209 foreach my $w (@{$pos_summary{$key}{w}}) {
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
210 $xminusxbars += ($w-$average)**2;
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
211 }
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
212 my ($sd, $se) = ('','');
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
213 if ($num > 1) {
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
214 $sd = sqrt($xminusxbars/($num-1));
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
215 $se = $sd / sqrt($num);
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
216 }
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
217
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
218 print SUMMARY "$key,$average,$num,$sd,$se\n";
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
219
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
220 }
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
221
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
222
d3227d69c228 Uploaded
antmarge
parents:
diff changeset
223 close SUMMARY;