comparison singleFitness.pl @ 0:d3227d69c228 draft

Uploaded
author antmarge
date Sat, 18 Mar 2017 16:23:05 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d3227d69c228
1 #!/usr/bin/perl -w
2
3 #Margaret Antonio updated 16.10.03
4
5 use strict;
6 use Getopt::Long;
7 use warnings;
8 use Bio::SeqIO;
9
10 #AVAILABLE OPTIONS. WILL PRINT UPON ERROR
11 sub print_usage() {
12 print "\n####################################################################\n";
13 print "singleVal: creates a wig file with position and insertion count OR fitness\n";
14 print "\nDESCRIPTION: ";
15 print "Integrates multiple files of transposon insertion data and outputs\n";
16 print "aggregate fitness within a sliding window (specified by size and step).\n";
17
18 print "\nUSAGE:\n";
19 print "perl singleVal.pl <OPTIONS> <REQ OUTPUT TYPE(S)> <INPUT FILE(S)>\n\n";
20
21 print "\nREQUIRED:\n";
22 print " -d\tDirectory containing all input files (files from\n";
23 print " \taggregate script)\n";
24 print " \tOR\n";
25 print " \tIn the command line (without a flag), input the name(s) of\n";
26 print " \tfiles containing gene fitness values (output of calcFit). \n\n";
27 print " -x\tCutoff: Don't include fitness scores with average counts (c1+c2)/2 < x (default: 10)\n";
28 print " -b\tBlanks: Exclude -b % of blank fitness scores (scores where c2 = 0) (default: 0 = 0%)\n";
29 print " -w\tUse weighted algorithm to calculate averages, variance, sd, se\n";
30 print " -l\tWeight ceiling: maximum value to use as a weight (default: 50)\n";
31
32 print "OPTIONAL:\n";
33 print " -h\tPrints usage and exits program\n";
34 print " -o\tOutput file for comparison data. Default: singleVal.wig\n";
35 print " -v\tString value for output: 'fit' for fitness OR 'count' for count\n";
36 print " \tDefault: fit for fitness\n";
37 print " -n\tName of the reference genome, to be included in the wig header\n";
38 print " \tDefault: genome\n";
39
40 print " \n~~~~Always check that file paths are correctly specified~~~~\n";
41 print "\n##################################################################\n";
42
43 }
44
45
46 #ASSIGN INPUTS TO VARIABLES
47 our ($help,$ref_genome,$indir,$out,$cutoff,$blank_pc,$weight_ceiling);
48 GetOptions(
49 'h'=>\$help,
50 'o:s' =>\$out,
51 'c:i' =>\$cutoff,
52 'b:i' =>\$blank_pc,
53 'l:i' =>\$weight_ceiling,
54 );
55
56 sub get_time() {
57 my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time);
58 return "$hour:$min:$sec";
59 }
60
61 if ($help){
62 print_usage();
63 exit;
64 }
65 if (!$indir and (scalar @ARGV==0)){
66 print "\nERROR: Please correctly specify input files or directory\n";
67 print_usage();
68 print "\n";
69 exit;
70 }
71
72 #ADDED BY MLA allows input to be directory---good for inputting L1-L6
73 my @files=@ARGV;
74
75
76 #SET DEFAULTS
77 if (!$cutoff){$cutoff=10}
78 if (!$blank_pc){$blank_pc=0}
79 if (!$weight_ceiling){$weight_ceiling=50}
80 if (!$out){$out="singleVal.csv"}
81
82 # Returns mean, variance, sd, se
83 sub average {
84 my $scoreref = shift @_;
85 my @scores = @{$scoreref};
86
87 my $sum=0;
88 my $num=0;
89
90 # Get the average.
91 foreach my $w (@scores) {
92 $sum += $w;
93 $num++;
94 }
95 my $average= $sum/$num;
96 my $xminusxbars = 0;
97
98 # Get the variance.
99 foreach my $w (@scores) {
100 $xminusxbars += ($w-$average)**2;
101 }
102 my $variance = (1/($num-1)) * $xminusxbars;
103 my $sd = sqrt($variance);
104 my $se = $sd / sqrt($num);
105
106 return ($average, $variance, $sd, $se);
107
108 }
109
110 # Takes two parameters, both hashrefs to lists.
111 # 1) hashref to list of scores
112 # 2) hashref to list of weights, equal in length to the scores.
113 sub weighted_average {
114
115 my $scoreref = shift @_;
116 my $weightref = shift @_;
117 my @scores = @{$scoreref};
118 my @weights = @{$weightref};
119
120 my $sum=0;
121 my ($weighted_average, $weighted_variance)=(0,0);
122 my $v2;
123
124 # Go through once to get total, calculate V2.
125 for (my $i=0; $i<@weights; $i++) {
126 $sum += $weights[$i];
127 $v2 += $weights[$i]**2;
128 }
129 if ($sum == 0) { return 0; } # No scores given?
130
131 my $scor = join (' ', @scores);
132 my $wght = join (' ', @weights);
133
134 # Now calculated weighted average.
135 my ($top, $bottom) = (0,0);
136 for (my $i=0; $i<@weights; $i++) {
137 $top += $weights[$i] * $scores[$i];
138 $bottom += $weights[$i];
139 }
140 $weighted_average = $top/$bottom;
141 #print "WA: $weighted_average\n";
142
143 ($top, $bottom) = (0,0);
144 # Now calculate weighted sample variance.
145 for (my $i=0; $i<@weights; $i++) {
146 $top += ( $weights[$i] * ($scores[$i] - $weighted_average)**2);
147 $bottom += $weights[$i];
148 }
149 $weighted_variance = $top/$bottom;
150
151 my $weighted_stdev = sqrt($weighted_variance);
152 my $weighted_stder = $weighted_stdev / sqrt(@scores); # / length scores.
153
154 return ($weighted_average, $weighted_variance, $weighted_stdev, $weighted_stder);
155 }
156
157 my %pos_summary;
158
159
160 foreach my $filename (@files) {
161 print "\n",$filename,"\n";
162 open IN, $filename;
163 my %hash;
164 while (my $line = <IN>) {
165 chomp $line;
166 my @lines = split(/,/,$line);
167 my $pos = $lines[0];
168 my $w = $lines[12];
169 if ($w and $w eq 'nW') {next;}
170 if (!$w) { $w = 0 } # For blanks
171 my $c1 = $lines[2];
172 my $c2 = $lines[3];
173 my $avg = ($c1+$c2)/2; # Later: change which function to use? C1? AVG(C1+C2)?
174 if ($avg < $cutoff) { next; } # Skip cutoff genes.
175 if ($avg >= $weight_ceiling) { $avg = $weight_ceiling; } # Maximum weight.
176
177 my @empty;
178 if (!$pos_summary{$pos}) {
179 $pos_summary{$pos}{w} = [@empty];
180 $pos_summary{$pos}{s} = [@empty];
181 }
182 $pos_summary{$pos}{w} = [@{$pos_summary{$pos}{w}}, $w]; # List of Fitness scores.
183 $pos_summary{$pos}{s} = [@{$pos_summary{$pos}{s}}, $avg]; # List of counts used to generate those fitness scores.
184 }
185 close IN;
186
187 }
188
189 open SUMMARY, ">",$out;
190
191
192 print SUMMARY "pos,fitness,ins_count,fitness_sd,fitness_se\n";
193 # Now print out summary stats.
194 foreach my $key (sort {$a<=>$b} keys %pos_summary) {
195 if (!$key) {next}
196 my $sum=0;
197 my $num=0;
198 my $avgsum = 0;
199
200 # Get the average.
201 foreach my $w (@{$pos_summary{$key}{w}}) {
202 $sum += $w;
203 $num++;
204 }
205 my $average= $sum/$num;
206 my $xminusxbars = 0;
207
208 # Get the standard deviation.
209 foreach my $w (@{$pos_summary{$key}{w}}) {
210 $xminusxbars += ($w-$average)**2;
211 }
212 my ($sd, $se) = ('','');
213 if ($num > 1) {
214 $sd = sqrt($xminusxbars/($num-1));
215 $se = $sd / sqrt($num);
216 }
217
218 print SUMMARY "$key,$average,$num,$sd,$se\n";
219
220 }
221
222
223 close SUMMARY;