diff pileup.pl @ 2:5f4cb6ee3427 draft

Uploaded
author ziru-zhou
date Tue, 18 Dec 2012 09:20:03 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pileup.pl	Tue Dec 18 09:20:03 2012 -0500
@@ -0,0 +1,90 @@
+#!/usr/bin/perl
+
+#purpose: script which calculates the percent of genome coverage and average coverage of bases
+#author: Ziru Zhou
+#date: October, 2012
+
+
+use strict;
+use warnings;
+use File::Basename;
+
+#globals
+#======================================================================
+my $bamfile = $ARGV[0];
+my $reffile = $ARGV[1];
+my $outputfile = $ARGV[2];
+my $bamname = $ARGV[3];
+my $refname = $ARGV[4];
+
+my $genomesize = 0;
+my $pileupwc = 0;
+my $pileupc4 = 0;
+
+my $percentofcover = 0;
+my $averagebasecoverage = 0;
+
+
+#function declarations
+#======================================================================
+#get the pileupwc value
+sub Getpileupwc()
+{
+	#system ("samtools pileup -f ${reffile} ${bamfile} | cut --fields=4 > tmp");
+	#$pileupwc = `cat tmp | wc -l`;
+	$pileupwc = `samtools pileup -f ${reffile} ${bamfile} | cut --fields=4 | wc -l`;
+}
+
+#get the pileupc4 value
+sub Getpileupc4()
+{
+	$pileupc4 = `samtools pileup -f ${reffile} ${bamfile} | cut --fields=4 | awk '{ total += \$1 } END { print total }'`;
+}
+
+#get the genomesize value
+sub Getgenomesize()
+{
+	$genomesize = `tail -n 1 "$reffile.fai"| cut --fields=3`;
+}
+
+#function to calculate the final 2 values for output
+sub Calculate()
+{
+	$percentofcover = ( int($pileupwc) / int($genomesize) ) * 100;
+	$averagebasecoverage = int($pileupc4) / int($pileupwc);
+}
+
+#function to write to output file
+sub Output()
+{
+	open FILE, '>'.$outputfile or die "unable to create $outputfile\n";
+
+	print FILE "\n#";
+	print FILE "\n# Generated by BAMEdit.  Please send your questions/comments to modENCODE DCC at help\@modencode.org.";
+	print FILE "\n#";
+	print FILE "\n# coverage: % of bases in genome covered";
+	print FILE "\n# avg coverage: average coverage of bases covered in genome ";
+	print FILE "\n#\n";
+	print FILE "input file\t$bamname\n";
+	print FILE "genome file\t$refname\n";
+	print FILE "genome length\t$genomesize";
+	printf FILE "coverage\t%.2f\n", $percentofcover;
+	printf FILE "avg coverage\t%.2f\n", $averagebasecoverage;
+
+	close FILE;
+}
+
+#function to clean up, the tmp file and the .fai file 
+sub Cleanup()
+{
+	system ("sudo rm ${reffile}.dat.fai");
+}
+
+#function calls
+#======================================================================
+Getpileupwc();
+Getpileupc4();
+Getgenomesize();
+Calculate();
+Output();
+Cleanup();