Mercurial > repos > antmarge > singlefitness
comparison singleFitness.pl @ 0:d3227d69c228 draft
Uploaded
author | antmarge |
---|---|
date | Sat, 18 Mar 2017 16:23:05 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d3227d69c228 |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 #Margaret Antonio updated 16.10.03 | |
4 | |
5 use strict; | |
6 use Getopt::Long; | |
7 use warnings; | |
8 use Bio::SeqIO; | |
9 | |
10 #AVAILABLE OPTIONS. WILL PRINT UPON ERROR | |
11 sub print_usage() { | |
12 print "\n####################################################################\n"; | |
13 print "singleVal: creates a wig file with position and insertion count OR fitness\n"; | |
14 print "\nDESCRIPTION: "; | |
15 print "Integrates multiple files of transposon insertion data and outputs\n"; | |
16 print "aggregate fitness within a sliding window (specified by size and step).\n"; | |
17 | |
18 print "\nUSAGE:\n"; | |
19 print "perl singleVal.pl <OPTIONS> <REQ OUTPUT TYPE(S)> <INPUT FILE(S)>\n\n"; | |
20 | |
21 print "\nREQUIRED:\n"; | |
22 print " -d\tDirectory containing all input files (files from\n"; | |
23 print " \taggregate script)\n"; | |
24 print " \tOR\n"; | |
25 print " \tIn the command line (without a flag), input the name(s) of\n"; | |
26 print " \tfiles containing gene fitness values (output of calcFit). \n\n"; | |
27 print " -x\tCutoff: Don't include fitness scores with average counts (c1+c2)/2 < x (default: 10)\n"; | |
28 print " -b\tBlanks: Exclude -b % of blank fitness scores (scores where c2 = 0) (default: 0 = 0%)\n"; | |
29 print " -w\tUse weighted algorithm to calculate averages, variance, sd, se\n"; | |
30 print " -l\tWeight ceiling: maximum value to use as a weight (default: 50)\n"; | |
31 | |
32 print "OPTIONAL:\n"; | |
33 print " -h\tPrints usage and exits program\n"; | |
34 print " -o\tOutput file for comparison data. Default: singleVal.wig\n"; | |
35 print " -v\tString value for output: 'fit' for fitness OR 'count' for count\n"; | |
36 print " \tDefault: fit for fitness\n"; | |
37 print " -n\tName of the reference genome, to be included in the wig header\n"; | |
38 print " \tDefault: genome\n"; | |
39 | |
40 print " \n~~~~Always check that file paths are correctly specified~~~~\n"; | |
41 print "\n##################################################################\n"; | |
42 | |
43 } | |
44 | |
45 | |
46 #ASSIGN INPUTS TO VARIABLES | |
47 our ($help,$ref_genome,$indir,$out,$cutoff,$blank_pc,$weight_ceiling); | |
48 GetOptions( | |
49 'h'=>\$help, | |
50 'o:s' =>\$out, | |
51 'c:i' =>\$cutoff, | |
52 'b:i' =>\$blank_pc, | |
53 'l:i' =>\$weight_ceiling, | |
54 ); | |
55 | |
56 sub get_time() { | |
57 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time); | |
58 return "$hour:$min:$sec"; | |
59 } | |
60 | |
61 if ($help){ | |
62 print_usage(); | |
63 exit; | |
64 } | |
65 if (!$indir and (scalar @ARGV==0)){ | |
66 print "\nERROR: Please correctly specify input files or directory\n"; | |
67 print_usage(); | |
68 print "\n"; | |
69 exit; | |
70 } | |
71 | |
72 #ADDED BY MLA allows input to be directory---good for inputting L1-L6 | |
73 my @files=@ARGV; | |
74 | |
75 | |
76 #SET DEFAULTS | |
77 if (!$cutoff){$cutoff=10} | |
78 if (!$blank_pc){$blank_pc=0} | |
79 if (!$weight_ceiling){$weight_ceiling=50} | |
80 if (!$out){$out="singleVal.csv"} | |
81 | |
82 # Returns mean, variance, sd, se | |
83 sub average { | |
84 my $scoreref = shift @_; | |
85 my @scores = @{$scoreref}; | |
86 | |
87 my $sum=0; | |
88 my $num=0; | |
89 | |
90 # Get the average. | |
91 foreach my $w (@scores) { | |
92 $sum += $w; | |
93 $num++; | |
94 } | |
95 my $average= $sum/$num; | |
96 my $xminusxbars = 0; | |
97 | |
98 # Get the variance. | |
99 foreach my $w (@scores) { | |
100 $xminusxbars += ($w-$average)**2; | |
101 } | |
102 my $variance = (1/($num-1)) * $xminusxbars; | |
103 my $sd = sqrt($variance); | |
104 my $se = $sd / sqrt($num); | |
105 | |
106 return ($average, $variance, $sd, $se); | |
107 | |
108 } | |
109 | |
110 # Takes two parameters, both hashrefs to lists. | |
111 # 1) hashref to list of scores | |
112 # 2) hashref to list of weights, equal in length to the scores. | |
113 sub weighted_average { | |
114 | |
115 my $scoreref = shift @_; | |
116 my $weightref = shift @_; | |
117 my @scores = @{$scoreref}; | |
118 my @weights = @{$weightref}; | |
119 | |
120 my $sum=0; | |
121 my ($weighted_average, $weighted_variance)=(0,0); | |
122 my $v2; | |
123 | |
124 # Go through once to get total, calculate V2. | |
125 for (my $i=0; $i<@weights; $i++) { | |
126 $sum += $weights[$i]; | |
127 $v2 += $weights[$i]**2; | |
128 } | |
129 if ($sum == 0) { return 0; } # No scores given? | |
130 | |
131 my $scor = join (' ', @scores); | |
132 my $wght = join (' ', @weights); | |
133 | |
134 # Now calculated weighted average. | |
135 my ($top, $bottom) = (0,0); | |
136 for (my $i=0; $i<@weights; $i++) { | |
137 $top += $weights[$i] * $scores[$i]; | |
138 $bottom += $weights[$i]; | |
139 } | |
140 $weighted_average = $top/$bottom; | |
141 #print "WA: $weighted_average\n"; | |
142 | |
143 ($top, $bottom) = (0,0); | |
144 # Now calculate weighted sample variance. | |
145 for (my $i=0; $i<@weights; $i++) { | |
146 $top += ( $weights[$i] * ($scores[$i] - $weighted_average)**2); | |
147 $bottom += $weights[$i]; | |
148 } | |
149 $weighted_variance = $top/$bottom; | |
150 | |
151 my $weighted_stdev = sqrt($weighted_variance); | |
152 my $weighted_stder = $weighted_stdev / sqrt(@scores); # / length scores. | |
153 | |
154 return ($weighted_average, $weighted_variance, $weighted_stdev, $weighted_stder); | |
155 } | |
156 | |
157 my %pos_summary; | |
158 | |
159 | |
160 foreach my $filename (@files) { | |
161 print "\n",$filename,"\n"; | |
162 open IN, $filename; | |
163 my %hash; | |
164 while (my $line = <IN>) { | |
165 chomp $line; | |
166 my @lines = split(/,/,$line); | |
167 my $pos = $lines[0]; | |
168 my $w = $lines[12]; | |
169 if ($w and $w eq 'nW') {next;} | |
170 if (!$w) { $w = 0 } # For blanks | |
171 my $c1 = $lines[2]; | |
172 my $c2 = $lines[3]; | |
173 my $avg = ($c1+$c2)/2; # Later: change which function to use? C1? AVG(C1+C2)? | |
174 if ($avg < $cutoff) { next; } # Skip cutoff genes. | |
175 if ($avg >= $weight_ceiling) { $avg = $weight_ceiling; } # Maximum weight. | |
176 | |
177 my @empty; | |
178 if (!$pos_summary{$pos}) { | |
179 $pos_summary{$pos}{w} = [@empty]; | |
180 $pos_summary{$pos}{s} = [@empty]; | |
181 } | |
182 $pos_summary{$pos}{w} = [@{$pos_summary{$pos}{w}}, $w]; # List of Fitness scores. | |
183 $pos_summary{$pos}{s} = [@{$pos_summary{$pos}{s}}, $avg]; # List of counts used to generate those fitness scores. | |
184 } | |
185 close IN; | |
186 | |
187 } | |
188 | |
189 open SUMMARY, ">",$out; | |
190 | |
191 | |
192 print SUMMARY "pos,fitness,ins_count,fitness_sd,fitness_se\n"; | |
193 # Now print out summary stats. | |
194 foreach my $key (sort {$a<=>$b} keys %pos_summary) { | |
195 if (!$key) {next} | |
196 my $sum=0; | |
197 my $num=0; | |
198 my $avgsum = 0; | |
199 | |
200 # Get the average. | |
201 foreach my $w (@{$pos_summary{$key}{w}}) { | |
202 $sum += $w; | |
203 $num++; | |
204 } | |
205 my $average= $sum/$num; | |
206 my $xminusxbars = 0; | |
207 | |
208 # Get the standard deviation. | |
209 foreach my $w (@{$pos_summary{$key}{w}}) { | |
210 $xminusxbars += ($w-$average)**2; | |
211 } | |
212 my ($sd, $se) = ('',''); | |
213 if ($num > 1) { | |
214 $sd = sqrt($xminusxbars/($num-1)); | |
215 $se = $sd / sqrt($num); | |
216 } | |
217 | |
218 print SUMMARY "$key,$average,$num,$sd,$se\n"; | |
219 | |
220 } | |
221 | |
222 | |
223 close SUMMARY; |