singlefitness: singleFitness.pl comparison

comparison singleFitness.pl @ 0:d3227d69c228 draft

Uploaded

author	antmarge
date	Sat, 18 Mar 2017 16:23:05 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:d3227d69c228
+#!/usr/bin/perl -w
+#Margaret Antonio updated 16.10.03
+use strict;
+use Getopt::Long;
+use warnings;
+use Bio::SeqIO;
+#AVAILABLE OPTIONS. WILL PRINT UPON ERROR
+sub print_usage() {
+	print "\n####################################################################\n";
+print "singleVal: creates a wig file with position and insertion count OR fitness\n";
+print "\nDESCRIPTION: ";
+print "Integrates multiple files of transposon insertion data and outputs\n";
+print "aggregate fitness within a sliding window (specified by size and step).\n";
+print "\nUSAGE:\n";
+print "perl singleVal.pl <OPTIONS> <REQ OUTPUT TYPE(S)> <INPUT FILE(S)>\n\n";
+print "\nREQUIRED:\n";
+print " -d\tDirectory containing all input files (files from\n";
+print "   \taggregate script)\n";
+print "   \tOR\n";
+print "   \tIn the command line (without a flag), input the name(s) of\n";
+print "   \tfiles containing gene fitness values (output of calcFit). \n\n";
+print " -x\tCutoff: Don't include fitness scores with average counts (c1+c2)/2 < x (default: 10)\n";
+print " -b\tBlanks: Exclude -b % of blank fitness scores (scores where c2 = 0) (default: 0 = 0%)\n";
+print " -w\tUse weighted algorithm to calculate averages, variance, sd, se\n";
+print " -l\tWeight ceiling: maximum value to use as a weight (default: 50)\n";
+print "OPTIONAL:\n";
+print " -h\tPrints usage and exits program\n";
+print " -o\tOutput file for comparison data. Default: singleVal.wig\n";
+print " -v\tString value for output: 'fit' for fitness OR 'count' for count\n";
+print "   \tDefault: fit for fitness\n";
+print " -n\tName of the reference genome, to be included in the wig header\n";
+print "   \tDefault: genome\n";
+print " \n~~~~Always check that file paths are correctly specified~~~~\n";
+print "\n##################################################################\n";
+}
+#ASSIGN INPUTS TO VARIABLES
+our ($help,$ref_genome,$indir,$out,$cutoff,$blank_pc,$weight_ceiling);
+GetOptions(
+'h'=>\$help,
+'o:s' =>\$out,
+'c:i' =>\$cutoff,
+'b:i' =>\$blank_pc,
+'l:i' =>\$weight_ceiling,
+);
+sub get_time() {
+my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(time);
+return "$hour:$min:$sec";
+}
+if ($help){
+print_usage();
+exit;
+}
+if (!$indir and (scalar @ARGV==0)){
+	print "\nERROR: Please correctly specify input files or directory\n";
+print_usage();
+	print "\n";
+	exit;
+}
+#ADDED BY MLA allows input to be directory---good for inputting L1-L6
+my @files=@ARGV;
+#SET DEFAULTS
+if (!$cutoff){$cutoff=10}
+if (!$blank_pc){$blank_pc=0}
+if (!$weight_ceiling){$weight_ceiling=50}
+if (!$out){$out="singleVal.csv"}
+# Returns mean, variance, sd, se
+sub average {
+my $scoreref = shift @_;
+my @scores = @{$scoreref};
+my $sum=0;
+my $num=0;
+# Get the average.
+foreach my $w (@scores) {
+$sum += $w;
+$num++;
+}
+my $average= $sum/$num;
+my $xminusxbars = 0;
+# Get the variance.
+foreach my $w (@scores) {
+$xminusxbars += ($w-$average)**2;
+}
+my $variance = (1/($num-1)) * $xminusxbars;
+my $sd = sqrt($variance);
+my $se = $sd / sqrt($num);
+return ($average, $variance, $sd, $se);
+}
+# Takes two parameters, both hashrefs to lists.
+# 1) hashref to list of scores
+# 2) hashref to list of weights, equal in length to the scores.
+sub weighted_average {
+my $scoreref = shift @_;
+my $weightref = shift @_;
+my @scores = @{$scoreref};
+my @weights = @{$weightref};
+my $sum=0;
+my ($weighted_average, $weighted_variance)=(0,0);
+my $v2;
+# Go through once to get total, calculate V2.
+for (my $i=0; $i<@weights; $i++) {
+$sum += $weights[$i];
+$v2 += $weights[$i]**2;
+}
+if ($sum == 0) { return 0; } # No scores given?
+my $scor = join (' ', @scores);
+my $wght = join (' ', @weights);
+# Now calculated weighted average.
+my ($top, $bottom) = (0,0);
+for (my $i=0; $i<@weights; $i++) {
+$top += $weights[$i] * $scores[$i];
+$bottom += $weights[$i];
+}
+$weighted_average = $top/$bottom;
+#print "WA: $weighted_average\n";
+($top, $bottom) = (0,0);
+# Now calculate weighted sample variance.
+for (my $i=0; $i<@weights; $i++) {
+$top += ( $weights[$i] * ($scores[$i] - $weighted_average)**2);
+$bottom += $weights[$i];
+}
+$weighted_variance = $top/$bottom;
+my $weighted_stdev = sqrt($weighted_variance);
+my $weighted_stder = $weighted_stdev / sqrt(@scores);  # / length scores.
+return ($weighted_average, $weighted_variance, $weighted_stdev, $weighted_stder);
+}
+my %pos_summary;
+foreach my $filename (@files) {
+print "\n",$filename,"\n";
+open IN, $filename;
+my %hash;
+while (my $line = <IN>) {
+chomp $line;
+my @lines = split(/,/,$line);
+my $pos = $lines[0];
+my $w = $lines[12];
+if ($w and $w eq 'nW') {next;}
+if (!$w) { $w = 0 } # For blanks
+my $c1 = $lines[2];
+my $c2 = $lines[3];
+my $avg = ($c1+$c2)/2; # Later: change which function to use? C1? AVG(C1+C2)?
+if ($avg < $cutoff) { next; } # Skip cutoff genes.
+if ($avg >= $weight_ceiling) { $avg = $weight_ceiling; } # Maximum weight.
+my @empty;
+if (!$pos_summary{$pos}) {
+$pos_summary{$pos}{w} = [@empty];
+$pos_summary{$pos}{s} = [@empty];
+}
+$pos_summary{$pos}{w} = [@{$pos_summary{$pos}{w}}, $w];  # List of Fitness scores.
+$pos_summary{$pos}{s} = [@{$pos_summary{$pos}{s}}, $avg]; # List of counts used to generate those fitness scores.
+}
+close IN;
+}
+open SUMMARY, ">",$out;
+print SUMMARY "pos,fitness,ins_count,fitness_sd,fitness_se\n";
+# Now print out summary stats.
+foreach my $key (sort {$a<=>$b} keys %pos_summary) {
+if (!$key) {next}
+my $sum=0;
+my $num=0;
+my $avgsum = 0;
+# Get the average.
+foreach my $w (@{$pos_summary{$key}{w}}) {
+$sum += $w;
+$num++;
+}
+my $average= $sum/$num;
+my $xminusxbars = 0;
+# Get the standard deviation.
+foreach my $w (@{$pos_summary{$key}{w}}) {
+$xminusxbars += ($w-$average)**2;
+}
+my ($sd, $se) = ('','');
+if ($num > 1) {
+$sd = sqrt($xminusxbars/($num-1));
+$se = $sd / sqrt($num);
+}
+print SUMMARY "$key,$average,$num,$sd,$se\n";
+}
+close SUMMARY;

Mercurial > repos > antmarge > singlefitness

comparison singleFitness.pl @ 0:d3227d69c228 draft