view CoverageReport.pl @ 26:859999cb135b draft

revised routines. Better handling of collapsing, isoforms and mutiple mapppings. Barplots revised to visualise average base coverage in exon instead of total number of reads in exon
author geert-vandeweyer
date Wed, 29 Nov 2017 08:12:27 -0500
parents 6cb012c8497a
children 576bfc1586f9
line wrap: on
line source

#!/usr/bin/perl

# load modules
use Getopt::Std;
use File::Basename;
use Number::Format;

# number format
my $de = new Number::Format(-thousands_sep =>',',-decimal_point => '.');

##########
## opts ##
##########
## input files
# b : path to input (b)am file
# t : path to input (t)arget regions in BED format
## output files
# o : report pdf (o)utput file
# z : all plots and tables in tar.g(z) format
## entries in the report
# r : Coverage per (r)egion (boolean)
# s : (s)ubregion coverage if average < specified (plots for positions along target region) (boolean)
# S : (S)ubregion coverage for ALL failed exons  => use either s OR S or you will have double plots. 
# A : (A)ll exons will be plotted. 
# L : (L)ist failed exons instead of plotting
# m : (m)inimal Coverage threshold
# f : fraction of average as threshold
# n : sample (n)ame.
# T : collapse overlapping Target regions. 

getopts('b:t:o:z:rsSALm:n:f:T', \%opts) ;

# make output directory in (tmp) working dir
our $wd = "/tmp/Coverage.".int(rand(1000));

while (-d $wd) {
	$wd = "/tmp/Coverage.".int(rand(1000));

}
system("mkdir $wd");
$wd = "/tmp/Coverage.993";
print "wd : $wd\n";
## variables
our %commandsrun = ();

if (!exists($opts{'b'}) || !-e $opts{'b'}) {
	die('Bam File not found');
}
if (!exists($opts{'t'}) || !-e $opts{'t'}) {
	die('Target File (BED) not found');
}

if (exists($opts{'m'})) {
	$thresh = $opts{'m'};
}
else {
	$thresh = 40;
}

if (exists($opts{'f'})) {
	$frac = $opts{'f'};
}
else {
	$frac = 0.2;
}

if (exists($opts{'o'})) {
	$pdffile = $opts{'o'};
}
else {
	$pdffile = "$wd/CoverageReport.pdf";
}

if (exists($opts{'z'})) {
	$tarfile = $opts{'z'};
}
else {
	$tarfile = "$wd/Results.tar.gz";
}

## 0. Collapse overlapping target regions.
if (defined($opts{'T'})) {
	## check BED format. Must have 6 cols if using this.
	my $head = `head -n 1 $opts{'t'}`;
	chomp;
	my @c = split(/\t/,$head);
	if (scalar(@c) < 6) {
		die("Targets BED file must be in 6-column format for collapsings. See tool documentation for more info.\n");
	}
	my $targets = $opts{'t'};
	my $tmptargets = "$wd/collapsedtargets.bed";
	system("sort -k1,1 -k2,2n $targets > $wd/sorted.targets.bed");
	system("bedtools merge -s -scores max -nms -i $wd/sorted.targets.bed > $tmptargets");
	open IN, $tmptargets;
	open OUT, ">$wd/collapsed.targets.renamed.bed";
	# we assume that overlapping fragments come from isoforms, not from different genes.
	my %counters = ();
	my @genes = ();
	while (<IN>) {
		chomp;
		my @p = split(/\t/,$_);
		my @g = split(/,/,$p[3]);
		$g[0] =~ m/(\S+)\(.*/;
		my $gene = $1;
		if (!defined($counters{$gene})) {
			push(@genes,$gene);
			$counters{$gene}{'lines'} = ();
			$counters{$gene}{'orient'} = $p[5];
		}
		$p[3] = $gene."(COLLAPSED)";
		push(@{$counters{$gene}{'lines'}},\@p);
	}
	close IN;
	foreach my $gene (@genes) {
		if ($counters{$gene}{'orient'} eq '-') {
			my $idx = scalar(@{$counters{$gene}{'lines'}}) + 1;
			foreach my $line (@{$counters{$gene}{'lines'}}) {
				$idx--;
				$line->[3] .= "|Region_$idx";
				print OUT join("\t",@$line)."\n";
			}
		}
		else {
			my $idx = 0;
                        foreach my $line (@{$counters{$gene}{'lines'}}) {
                                $idx++;
                                $line->[3] .= "|Region_$idx";
                                print OUT join("\t",@$line)."\n";
                        }
		}
	}
	close OUT;
	$opts{'t'} = "$wd/collapsed.targets.renamed.bed";
}
# 1. Coverage per exon
# included in 2.

# 2. Coverage per position
&SubRegionCoverage($opts{'b'}, $opts{'t'});
our %filehash;
our $tcov;
if (exists($opts{'s'}) || exists($opts{'S'}) || exists($opts{'A'}) || exists($opts{'L'})) {
	system("mkdir -p $wd/SplitFiles");
	system("rm $wd/SplitFiles/*");
	## get position coverages
	## split input files
	open IN, "$wd/Targets.Position.Coverage";
	open BCOVSUM, ">$wd/Results/".$opts{'n'}.".Average_Region_Coverage.txt";
	my $fileidx = 0;
	my $currreg = '';
	my $elength = 0;
	my $esum = 0;
	my $eline = "";
	my %out = ();
	while (<IN>) {
		my $line = $_;
		chomp($line);
		my @p = split(/\t/,$line);
		my $reg = $p[0].'-'.$p[1].'-'.$p[2]. ": $p[3]";
		my $ex = $p[3];
		my $epos = $p[1];
		# average exon coverage calculation.
		if (!defined($out{$ex})) {
			$out{$ex} = ();
		}
		if (!defined($out{$ex}{$p[0]})) {
			$out{$ex}{$p[0]} = ();
		}
		# needs to be transcript specific ($ex) and position specific ($epos) to handle both isoforms and PAR/multiple mapping situations.
		if (!defined($out{$ex}{$p[0]}{$epos})) {
			$out{$ex}{$p[0]}{$epos}{'r'} = "$p[0]\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$p[5]";
			$out{$ex}{$p[0]}{$epos}{'c'} = ();

		}
		push(@{$out{$ex}{$p[0]}{$epos}{'c'}},$p[-1]);
		# splitting files 
		if ($reg ne $currreg) {
			## new exon open new outfile
			if ($currreg ne '') {
				print BCOVSUM "$eline\t".($esum/$elength)."\n";
				## filehandle is open. close it
				close OUT; 
			}
			$eline = "$p[0]\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$p[5]"; #\t$p[6]\t$p[7]\t$p[8]";
			$esum = 0;
			$elength = 0;
			if (!exists($filehash{$reg})) {
				$fileidx++;
				$filehash{$reg}{'idx'} = $fileidx;
				$filehash{$reg}{'exon'} = $reg;
				open OUT, ">> $wd/SplitFiles/File_$fileidx.txt";
				$currreg = $reg;
			}
			else {
				open OUT, ">> $wd/SplitFiles/File_".$filehash{$reg}{'idx'}.".txt";
				$currreg = $reg;
			}
		}
		## print the line to the open filehandle. 	
		print OUT "$line\n";
		$esum += $p[-1];
		$elength++;
	}
	close OUT;
	close IN;
	if ($esum > 0) {
		print BCOVSUM "$eline\t".($esum/$elength)."\n";
	}
	close BCOVSUM;
	open OUT, ">$wd/avg.tcov.txt";

	foreach my $tr_ex (sort {$a cmp $b} keys(%out)) {
	   foreach my $chr (sort {$a cmp $b} keys(%{$out{$tr_ex}})) {	
		foreach(sort {$a <=> $b} keys(%{$out{$tr_ex}{$chr}})) {
			my ($avg,$nr,$nrcov) = GetStats(\@{$out{$tr_ex}{$chr}{$_}{'c'}});
			my $frac = 0;
			if ($nr > 0) {
				$frac = ($nrcov / $nr);
			}
			print OUT $out{$tr_ex}{$chr}{$_}{'r'}."\t".$avg."\t".$nrcov."\t".$nr."\t".$frac."\n";
		}
	   }
	}
	close OUT;
	$tcov = "$wd/avg.tcov.txt";

}
## sort output files according to targets file
my %hash = ();
open IN, $tcov;
while (<IN>) {
	my @p = split(/\t/,$_) ;
	$hash{$p[3].':'.$p[1]} = $_;
}
close IN;
open OUT, ">$tcov";
open IN, $opts{'t'};
while (<IN>) {
	my @p = split(/\t/,$_) ;
	print OUT $hash{$p[3].':'.$p[1]};
}
close IN;
close OUT;


####################################
## PROCESS RESULTS & CREATE PLOTS ##
####################################
system("mkdir $wd/Report");

system("mkdir $wd/Rout");
system("mkdir $wd/Plots");

$samplename = $opts{'n'};
$samplename =~ s/_/\\_/g;

# 0. Preamble
## compose preamble
open OUT, ">$wd/Report/Report.tex";
print OUT '\documentclass[a4paper,10pt]{article}'."\n";
print OUT '\usepackage[left=2cm,top=1.5cm,right=1.5cm,bottom=2.5cm,nohead]{geometry}'."\n";
print OUT '\usepackage{longtable}'."\n";
print OUT '\usepackage[T1]{fontenc}'."\n";
print OUT '\usepackage{fancyhdr}'."\n";
print OUT '\usepackage[latin9]{inputenc}'."\n";
print OUT '\usepackage{color}'."\n";
print OUT '\usepackage[pdftex]{graphicx}'."\n";
print OUT '\definecolor{grey}{RGB}{160,160,160}'."\n";
print OUT '\definecolor{darkgrey}{RGB}{100,100,100}'."\n";
print OUT '\definecolor{red}{RGB}{255,0,0}'."\n";
print OUT '\definecolor{orange}{RGB}{238,118,0}'."\n";
print OUT '\setlength\LTleft{0pt}'."\n";
print OUT '\setlength\LTright{0pt}'."\n";
print OUT '\begin{document}'."\n";
print OUT '\pagestyle{fancy}'."\n";
print OUT '\fancyhead{}'."\n";
print OUT '\renewcommand{\footrulewidth}{0.4pt}'."\n";
print OUT '\renewcommand{\headrulewidth}{0pt}'."\n";
print OUT '\fancyfoot[R]{\today\hspace{2cm}\thepage\ of \pageref{endofdoc}}'."\n";
print OUT '\fancyfoot[C]{}'."\n";
print OUT '\fancyfoot[L]{Coverage Report for ``'.$samplename.'"}'."\n";
print OUT '\let\oldsubsubsection=\subsubsection'."\n";
print OUT '\renewcommand{\subsubsection}{%'."\n";
print OUT '  \filbreak'."\n";
print OUT '  \oldsubsubsection'."\n";
print OUT '}'."\n";
# main title
print OUT '\section*{Coverage Report for ``'.$samplename.'"}'."\n";
close OUT; 

# 1. Summary Report
# Get samtools flagstat summary of BAM file	
my $flagstat = `samtools flagstat $opts{'b'}`;
my @s = split(/\n/,$flagstat);
# Get number of reads mapped in total 
## updated on 2012-10-1 !!
$totalmapped = $s[2];
$totalmapped =~ s/^(\d+)(\s.+)/$1/;
# count columns
my $head = `head -n 1 $tcov`;
chomp($head);
my @cols = split(/\t/,$head);
my $nrcols = scalar(@cols);
my $covcol = $nrcols - 3;
# get min/max/median/average coverage => values
my $covs = `cut -f $covcol $tcov`;
my @coverages = split(/\n/,$covs);
my ($eavg,$med,$min,$max,$first,$third,$ontarget) = arraystats(@coverages);
my $spec = '';
if ($totalmapped != 0 && $totalmapped ne '') {
	$spec = sprintf("%.1f",($ontarget / $totalmapped)*100);
}
# get min/max/median/average coverage => boxplot in R
open OUT, ">$wd/Rout/boxplot.R";
print OUT 'coverage <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n";
print OUT 'coverage <- coverage[,'.$covcol.']'."\n";
print OUT 'png(file="../Plots/CoverageBoxPlot.png", bg="white", width=240, height=480,type=c("cairo"))'."\n";
print OUT 'boxplot(coverage,range=1.5,main="Target Region Coverage")'."\n";
print OUT 'graphics.off()'."\n";
close OUT;
print "Running boxplot.R : \n";
system("cd $wd/Rout && Rscript boxplot.R");

## global nt coverage plot
## use perl to make histogram (lower memory)
open IN, "$wd/Targets.Position.Coverage";
my %dens;
my $counter = 0;
my $sum = 0;
while (<IN>) {
	chomp();
	my @p = split(/\t/);
	$sum += $p[-1];
	$counter++;
	if (defined($dens{$p[-1]})) {
		$dens{$p[-1]}++;
	}
	else {
		$dens{$p[-1]} = 1;
	}
}
$avg = $sum/$counter;
close IN;
open OUT, ">$wd/Rout/hist.txt";
if (!defined($dens{'0'})) {
	$dens{'0'} = 0;
}
foreach (keys(%dens)) {
	print OUT "$_;$dens{$_}\n";
}
close OUT;
open OUT, ">$wd/Rout/ntplot.R";
# read coverage hist in R to plot
print OUT 'coverage <- read.table("hist.txt" , as.is = TRUE, header=FALSE,sep=";")'."\n";
print OUT 'mincov <- '."$thresh \n";
print OUT "avg <- round($avg)\n";
print OUT "colnames(coverage) <- c('cov','count')\n";
print OUT 'coverage$cov <- coverage$cov / avg'."\n";
print OUT 'rep <- which(coverage$cov > 1)'."\n";
print OUT 'coverage[coverage$cov > 1,1] <- 1'."\n";
print OUT 'values <- coverage[coverage$cov < 1,]'."\n";
print OUT 'values <- rbind(values,c(1,sum(coverage[coverage$cov == 1,"count"])))'."\n";
print OUT 'values <- values[order(values$cov),]'."\n";
print OUT 'prevcount <- 0'."\n";
# make cumulative count data frame
print OUT 'for (i in rev(values$cov)) {'."\n";
print OUT '  values[values$cov == i,"count"] <- prevcount + values[values$cov == i,"count"]'."\n";
print OUT '  prevcount <- values[values$cov == i,"count"]'."\n";
print OUT '}'."\n";
print OUT 'values$count <- values$count / (values[values$cov == 0,"count"] / 100)'."\n";
# get some values to plot lines.
print OUT 'mincov.x <- mincov/avg'."\n";
print OUT 'if (mincov/avg <= 1) {'."\n";
print OUT '  ii <- which(values$cov == mincov.x)'."\n";
print OUT '  if (length(ii) == 1) {'."\n";
print OUT '    mincov.y <- values[ii[1],"count"]'."\n";
print OUT '  } else {'."\n";
print OUT '    i1 <- max(which(values$cov < mincov.x))'."\n";
print OUT '    i2 <- min(which(values$cov > mincov.x))'."\n";
print OUT '    mincov.y <- ((values[i2,"count"] - values[i1,"count"])/(values[i2,"cov"] - values[i1,"cov"]))*(mincov.x - values[i1,"cov"]) + values[i1,"count"]'."\n";
print OUT '  }'."\n";
print OUT '}'."\n";
# open output image and create plot
print OUT 'png(file="../Plots/CoverageNtPlot.png", bg="white", width=540, height=480,type=c("cairo"))'."\n";
print OUT 'par(xaxs="i",yaxs="i")'."\n";
print OUT 'plot(values$cov,values$count,ylim=c(0,100),pch=".",main="Cumulative Normalised Base-Coverage Plot",xlab="Normalizalised Coverage",ylab="Cumulative Nr. Of Bases")'."\n";
print OUT 'lines(values$cov,values$count)'."\n";
print OUT 'if (mincov.x <= 1) {'."\n";
print OUT '  lines(c(mincov.x,mincov.x),c(0,mincov.y),lty=2,col="darkgreen")'."\n";
print OUT '  lines(c(0,mincov.x),c(mincov.y,mincov.y),lty=2,col="darkgreen")'."\n";
print OUT '  text(1,(95),pos=2,col="darkgreen",labels="Threshold: '.$thresh.'x")'."\n";
print OUT '  text(1,(91),pos=2,col="darkgreen",labels=paste("%Bases: ",round(mincov.y,2),"%",sep=""))'."\n";
print OUT '} else {'."\n";
print OUT '  text(1,(95),pos=2,col="darkgreen",labels="Threshold ('.$thresh.'x) > Average")'."\n";
print OUT '  text(1,(91),pos=2,col="darkgreen",labels="Plotting impossible")'."\n";
print OUT '}'."\n";
print OUT 'frac.x <- '."$frac\n";
print OUT 'ii <- which(values$cov == frac.x)'."\n";
print OUT 'if (length(ii) == 1) {'."\n";
print OUT '  frac.y <- values[ii[1],"count"]'."\n";
print OUT '} else {'."\n";
print OUT '  i1 <- max(which(values$cov < frac.x))'."\n";
print OUT '  i2 <- min(which(values$cov > frac.x))'."\n";
print OUT '  frac.y <- ((values[i2,"count"] - values[i1,"count"])/(values[i2,"cov"] - values[i1,"cov"]))*(frac.x - values[i1,"cov"]) + values[i1,"count"]'."\n";
print OUT '}'."\n";
print OUT 'lines(c(frac.x,frac.x),c(0,frac.y),lty=2,col="red")'."\n";
print OUT 'lines(c(0,frac.x),c(frac.y,frac.y),lty=2,col="red")'."\n";
#iprint OUT 'text((frac.x+0.05),(frac.y - 2),pos=4,col="red",labels=paste(frac.x," x Avg.Cov : ",round(frac.x * avg,2),"x",sep="" ))'."\n";
#print OUT 'text((frac.x+0.05),(frac.y-5),pos=4,col="red",labels=paste("%Bases: ",round(frac.y,2),"%",sep=""))'."\n";
print OUT 'text(1,86,pos=2,col="red",labels=paste(frac.x," x Avg.Cov : ",round(frac.x * avg,2),"x",sep="" ))'."\n";
print OUT 'text(1,82,pos=2,col="red",labels=paste("%Bases: ",round(frac.y,2),"%",sep=""))'."\n";

print OUT 'graphics.off()'."\n";

close OUT;
print "Running ntplot.r\n";
system("cd $wd/Rout && Rscript ntplot.R");
## PRINT TO .TEX FILE
open OUT, ">>$wd/Report/Report.tex";
# average coverage overviews
print OUT '\subsection*{Overall Summary}'."\n";
print OUT '{\small ';
# left : boxplot
print OUT '\begin{minipage}{0.3\linewidth}\centering'."\n";
print OUT '\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/CoverageBoxPlot.png}'."\n";
print OUT '\end{minipage}'."\n";
# right : cum.cov.plot
print OUT '\hspace{0.6cm}'."\n";
print OUT '\begin{minipage}{0.65\linewidth}\centering'."\n";
print OUT '\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/CoverageNtPlot.png}'."\n";
print OUT '\end{minipage} \\\\'."\n";
## next line
print OUT '\begin{minipage}{0.48\linewidth}'."\n";
print OUT '\vspace{-1.2em}'."\n";
print OUT '\begin{tabular}{ll}'."\n";
# bam statistics
print OUT '\multicolumn{2}{l}{\textbf{\underline{Samtools Flagstat Summary}}} \\\\'."\n";
foreach (@s) {
	$_ =~ m/^(\d+)\s(.+)$/;
	my $one = $1;
	my $two = $2;
	$two =~ s/\s\+\s0\s//;
	$two = ucfirst($two);
	$one =~ s/%/\\%/g;
	# remove '+ 0 ' from front
	$two =~ s/\+\s0\s//;
	# remove trailing from end
	$two =~ s/(\s\+.*)|(:.*)/\)/;
	$two =~ s/%/\\%/g;
	$two =~ s/>=/\$\\ge\$/g;
	$two = ucfirst($two);
	print OUT '\textbf{'.$two.'} & '.$one.' \\\\'."\n";
	

}
print OUT '\end{tabular}\end{minipage}'."\n";
print OUT '\hspace{1.5cm}'."\n";
# target coverage statistics
print OUT '\begin{minipage}{0.4\linewidth}'."\n";
#print OUT '\vspace{-4.8em}'."\n";
print OUT '\begin{tabular}{ll}'."\n";
print OUT '\multicolumn{2}{l}{\textbf{\underline{Target Region Coverage}}} \\\\'."\n";
print OUT '\textbf{Number of Target Regions} & '.scalar(@coverages).' \\\\'."\n";
print OUT '\textbf{Minimal Region Coverage} & '.$min.' \\\\'."\n";
print OUT '\textbf{25\% Region Coverage} & '.$first.' \\\\'. "\n";
print OUT '\textbf{50\% (Median) Region Coverage} & '.$med.' \\\\'. "\n";
print OUT '\textbf{75\% Region Coverage} & '.$third.' \\\\'. "\n";
print OUT '\textbf{Maximal Region Coverage} & '.$max.' \\\\'. "\n";
print OUT '\textbf{Average Region Coverage} & '.int($eavg).' \\\\'. "\n";
print OUT '\textbf{Mapped On Target} & '.$spec.' \\\\'."\n";
print OUT '\multicolumn{2}{l}{\textbf{\underline{Target Base Coverage }}} \\\\'."\n";
print OUT '\textbf{Number of Target Bases} & '.$counter.' \\\\'."\n";
print OUT '\textbf{Average Base Coverage} & '.int($avg).' \\\\'. "\n";
print OUT '\textbf{Non-Covered Bases} & '.$dens{'0'}.' \\\\'."\n";
#print OUT '\textbf{Bases Covered $ge$ '.$frac.'xAvg.Cov} & '.
print OUT '\end{tabular}\end{minipage}}'."\n";
close OUT;

# 2. GLOBAL COVERAGE OVERVIEW PER GENE
@failedexons;
@allexons;
@allregions;
@failedregions;
%failednames;
%allnames;

if (exists($opts{'r'}) || exists($opts{'s'}) || exists($opts{'S'})|| exists($opts{'A'})) {
	# count columns
	my $head = `head -n 1 '$tcov'`;
	chomp($head);
	my @cols = split(/\t/,$head);
	my $nrcols = scalar(@cols);
	my $covcol = $nrcols - 3;
	# Coverage Plots for each gene => barplots in R, table here.
	open IN, "$tcov";
	my $currgroup = '';
	my $startline = 0;
	my $stopline = 0;
	$linecounter = 0;
	while (<IN>) {
		$linecounter++;
		chomp($_);
		my @c = split(/\t/,$_);
		my $reg = $c[0].'-'.$c[1].'-'.$c[2];
		push(@allregions,$reg);
		my $group = $reg .": ".$c[3];
		#my $gene = $c[3];
		## coverage failure?
		if ($c[$nrcol-1] < 1 || $c[$covcol-1] < $thresh) {
			push(@failedexons,$group);
			push(@failedregions,$c[0].'-'.$c[1].'-'.$c[2]);
			$failednames{$group} = $c[0].'-'.$c[1].'-'.$c[2];
		}
		## store exon
		push(@allexons,$group);
		$allnames{$group} = $c[0].'-'.$c[1].'-'.$c[2];
		if (!exists($opts{'r'}) && !exists($opts{'s'}) && !exists($opts{'S'}) && exists($opts{'A'})) {
			## no need for barplots
			next;
		}
		## extract and check gene
		my $gene = $group;
		$gene =~ s/^chr\S+: (\S+)[\|\s](.+)/$1/;
		if ($gene ne $currgroup ) {
		    if ($currgroup ne '') {
			# new gene, make plot. 
			open OUT, ">$wd/Rout/barplot.R";
			print OUT 'coveragetable <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n";
			print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n";
			print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n";
			print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n";
			print OUT 'coverage[coverage < 1] <- 1'."\n";
			print OUT 'colors <- c(rep("grey",length(coverage)))'."\n";
			# coverage not whole target region => orange
			print OUT 'covperc <- coveragetable[c('.$startline.':'.$stopline.'),'.$nrcols.']'."\n";
			print OUT 'colors[covperc<1] <- "orange"'."\n";
			# coverage below threshold => red
			print OUT 'colors[coverage<'.$thresh.'] <- "red"'."\n";

			if ($stopline - $startline > 20) {
				$scale = 2;
			}
			else {
				$scale = 1;
			}
			my $width = 480 * $scale;
			my $height = 240 * $scale;
			print OUT 'png(file="../Plots/Coverage_'.$currgroup.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n";
			print OUT 'ylim = c(0,max(max(log10(coverage),log10('.($thresh+20).'))))'."\n";
			print OUT 'mp <- barplot(log10(coverage),col=colors,main="Exon Coverage for '.$currgroup.'",ylab="Log10(Coverage)",ylim=ylim)'."\n";
			print OUT 'text(mp, log10(coverage) + '.(0.4/$scale).',format(coverage),xpd = TRUE,srt=90)'."\n";
			print OUT 'text(mp,par("usr")[3]-0.05,labels=entries,srt=45,adj=1,xpd=TRUE)'."\n";
			print OUT 'abline(h=log10('.$thresh.'),lwd=4,col=rgb(255,0,0,100,maxColorValue=255))'."\n";
			print OUT 'graphics.off()'."\n";
			close OUT;
			system("cd $wd/Rout && Rscript barplot.R");
			if ($scale == 1) {
				push(@small,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}');
			}
			else {
				push(@large,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}');
			}

		    }
		    $currgroup = $gene;
		    $startline = $linecounter;
		}
		$stopline = $linecounter;
	}
	close IN;
	if ($currgroup ne '') {
		# last gene, make plot. 
		open OUT, ">$wd/Rout/barplot.R";
		print OUT 'coveragetable <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n";
		print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n";
		print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n";
		print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n";
		print OUT 'coverage[coverage < 1] <- 1'."\n";
		print OUT 'colors <- c(rep("grey",length(coverage)))'."\n";
		print OUT 'colors[coverage<'.$thresh.'] <- "red"'."\n";

		if ($stopline - $startline > 20) {
			$scale = 2;
		}
		else {
			$scale = 1;
		}
		my $width = 480 * $scale;
		my $height = 240 * $scale;
		print OUT 'png(file="../Plots/Coverage_'.$currgroup.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n";
		print OUT 'ylim = c(0,max(max(log10(coverage),log10('.($thresh+20).'))))'."\n";
		print OUT 'mp <- barplot(log10(coverage),col=colors,main="Exon Coverage for '.$currgroup.'",ylab="Log10(Coverage)", ylim=ylim)'."\n";
		print OUT 'text(mp, log10(coverage) + log10(2),format(coverage),xpd = TRUE,srt=90)'."\n";
		print OUT 'text(mp,par("usr")[3]-0.1,labels=entries,srt=45,adj=1,xpd=TRUE)'."\n";
		print OUT 'abline(h=log10('.$thresh.'),lwd=4,col=rgb(255,0,0,100,maxColorValue=255))'."\n";
		print OUT 'graphics.off()'."\n";
		close OUT;
		system("cd $wd/Rout && Rscript barplot.R");
		if ($scale == 1) {
			push(@small,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}');
		}
		else {
			push(@large,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}');
		}
	}
	## print to TEX
	open OUT, ">>$wd/Report/Report.tex";
	print OUT '\subsection*{Gene Summaries}'."\n";
	print OUT '\underline{Legend:} \\\\'."\n";
	print OUT '{\color{red}\textbf{RED:} Average coverage did not reach set threshold of '.$thresh.'} \\\\'."\n";
	print OUT '{\color{orange}\textbf{ORANGE:} Coverage was incomplete for the exon (section with zero coverage found). Overruled by red.} \\\\' ."\n";
	$col = 1;
	foreach (@small) {
		if ($col > 2) {
			$col = 1;
			print OUT "\n";
		}
		print OUT '\begin{minipage}{0.5\linewidth}\centering'."\n";
		print OUT $_."\n";
		print OUT '\end{minipage}'."\n";
		$col++;
	}
	## new line 
	if ($col == 2) {
		print OUT '\\\\'." \n";
	}
	foreach(@large) {
		print OUT $_."\n";
	}
	close OUT;
	
}

# 3. Detailed overview of failed exons (globally failed)
if (exists($opts{'s'})) {
	# count columns
	my $head = `head -n 1 $wd/Targets.Position.Coverage`;
	chomp($head);
	my @cols = split(/\t/,$head);
	my $nrcols = scalar(@cols);
	my $covcol = $nrcols;
	my $poscol = $nrcols -1;
	# tex section header
	open TEX, ">>$wd/Report/Report.tex";
	print TEX '\subsection*{Failed Exon Plots}'."\n";
	$col = 1;
	print TEX '\underline{NOTE:} Only exons with global coverage $<$'.$thresh.' or incomplete coverage were plotted \\\\'."\n";
	foreach(sort(keys(%failednames)) ) {
	#foreach(@failedregions) {
		if ($col > 2) {
			$col = 1;
			print TEX "\n";
		}
		# which exon
		my $group = $_;
		my ($region,$name) = split(/: /,$group);
		#my $region = $failednames{$_};
		my $exon = $filehash{$group}{'exon'};
		# link exon to tmp file
		my $exonfile = "$wd/SplitFiles/File_".$filehash{$group}{'idx'}.".txt";
		## determine transcript orientation and location
		my $firstline = `head -n 1 $exonfile`;
		my @firstcols = split(/\t/,$firstline);
		my $orient = $firstcols[5];
		my $genomicchr = $firstcols[0];
		my $genomicstart = $firstcols[1];
		my $genomicstop = $firstcols[2];
		if ($orient eq '+') {
			$bps = $genomicstop - $genomicstart + 1;
			$subtitle = "Region 0-$bps: $genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop);
		}
		else {
			$bps = $genomicstop - $genomicstart + 1;
			$subtitle = "Region 0-$bps: $genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop);
		}
		# print Rscript
		open OUT, ">$wd/Rout/exonplot.R";
		print OUT 'coveragetable <- read.table("'.$exonfile.'",as.is=TRUE,sep="\t",header=FALSE)'."\n";
		print OUT 'coverage <- coveragetable[,'.$covcol.']'."\n";
		print OUT 'coverage[coverage < 1] <- 1'."\n";
		print OUT 'positions <- coveragetable[,'.$poscol.']'."\n";
		
		my $width = 480 ;
		my $height = 240 ;
		my $exonstr = $exon;
		$exonstr =~ s/\s/_/g;
		$exonstr =~ s/:/_/g;
		$exon =~ s/_/ /g;
		$exon =~ s/\|/ /g;
		$exon =~ s/chr.*: (.*)$/$1/;
		print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n";
		print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n";
		if ($orient eq '-') {
			print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n";
			print OUT 'mtext("'.$subtitle.'")'."\n";
		}
		else {
			print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",sub="(Transcribed from plus strand)")'."\n";
			print OUT 'mtext("'.$subtitle.'")'."\n";
		}
		print OUT 'lines(positions,log10(coverage))'."\n";
		print OUT 'abline(h=log10('.$thresh.'),lwd=4,col=rgb(255,0,0,100,maxColorValue=255))'."\n";
		print OUT 'failedpos <- positions[coverage<'.$thresh.']'."\n";
		print OUT 'failedcov <- coverage[coverage<'.$thresh.']'."\n";
		print OUT 'points(failedpos,log10(failedcov),col="red",pch=19)'."\n";
		print OUT 'graphics.off()'."\n";
		close OUT;
		# run R script
		system("cd $wd/Rout && Rscript exonplot.R");
		# Add to .TEX
		print TEX '\begin{minipage}{0.5\linewidth}\centering'."\n";
		print TEX '\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$exonstr.'.png}'."\n";
		print TEX '\end{minipage}'."\n";
		$col++;
	}
}

## plot failed (subregion) or all exons
if (exists($opts{'S'}) || exists($opts{'A'})) {
	# count columns
	my $head = `head -n 1 $wd/Targets.Position.Coverage`;
	chomp($head);
	my @cols = split(/\t/,$head);
	my $nrcols = scalar(@cols);
	my $covcol = $nrcols;
	my $poscol = $nrcols -1;
	# tex section header
	open TEX, ">>$wd/Report/Report.tex";
	print TEX '\subsection*{Failed Exon Plots}'."\n";
	if (exists($opts{'S'})) {
		print TEX '\underline{NOTE:} ALL exons were tested for local coverage $<$'.$thresh.' \\\\'."\n";
	}
	elsif (exists($opts{'A'})) {
		print TEX '\underline{NOTE:} ALL exons are plotted, regardless of coverage \\\\'."\n";
	}
	$col = 1;
	foreach(sort(keys(%allnames))) {

	#foreach(@allregions) {
		if ($col > 2) {
			$col = 1;
			print TEX "\n";
		}
		my $group = $_;
		my ($region,$name) = split(/: /,$group);
		# which exon
		#my $region = $_;
		#my $region = $allnames{$_};
		my $exon = $filehash{$group}{'exon'};
		# grep exon to tmp file
		my $exonfile = "$wd/SplitFiles/File_".$filehash{$group}{'idx'}.".txt";
		## determine transcript orientation.
                my $firstline = `head -n 1 $exonfile`;
                my @firstcols = split(/\t/,$firstline);
                my $orient = $firstcols[5];
		my $genomicchr = $firstcols[0];
		my $genomicstart = $firstcols[1];
		my $genomicstop = $firstcols[2];

		if ($orient eq '+') {
			$bps = $genomicstop - $genomicstart + 1;
			$subtitle = "Region 0-$bps: $genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop);

		}
		else {
			$bps = $genomicstop - $genomicstart + 1;
			$subtitle = "Region 0-$bps: $genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop);

		}

		# check if failed
		if (exists($opts{'S'})) {
			my $cs = `cut -f $covcol '$exonfile' `;
			my @c = split(/\n/,$cs);
			@c = sort { $a <=> $b } @c;
			if ($c[0] >= $thresh) {
				# lowest coverage > threshold => skip
				next;
			}
		}
		# print Rscript
		open OUT, ">$wd/Rout/exonplot.R";
		print OUT 'coveragetable <- read.table("'.$exonfile.'",as.is=TRUE,sep="\t",header=FALSE)'."\n";
		print OUT 'coverage <- coveragetable[,'.$covcol.']'."\n";
		print OUT 'coverage[coverage < 1] <- 1'."\n";
		print OUT 'positions <- coveragetable[,'.$poscol.']'."\n";
		my $width = 480 ;
		my $height = 240 ;
		my $exonstr = $exon;
		$exonstr =~ s/\s/_/g;
		$exonstr =~ s/:/_/g;
		$exon =~ s/_/ /g;
		$exon =~ s/\|/ /g;
		$exon =~ s/^chr.*: (.*)$/$1/;
		print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n";
		print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n";
		if ($orient eq '-') {
			print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n";
			print OUT 'mtext("'.$subtitle.'")'."\n";
                }
                else {
			print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",sub="(Transcribed from plus strand)")'."\n";
			print OUT 'mtext("'.$subtitle.'")'."\n";
                }
		
		print OUT 'lines(positions,log10(coverage))'."\n";
		print OUT 'abline(h=log10('.$thresh.'),lwd=4,col=rgb(255,0,0,100,maxColorValue=255))'."\n";
		print OUT 'failedpos <- positions[coverage<'.$thresh.']'."\n";
		print OUT 'failedcov <- coverage[coverage<'.$thresh.']'."\n";
		print OUT 'points(failedpos,log10(failedcov),col="red",pch=19)'."\n";
		print OUT 'graphics.off()'."\n";
		close OUT;
		# run R script
		system("cd $wd/Rout && Rscript exonplot.R");
		# Add to .TEX
		print TEX '\begin{minipage}{0.5\linewidth}\centering'."\n";
		print TEX '\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$exonstr.'.png}'."\n";
		print TEX '\end{minipage}'."\n";
		$col++;
	}
}
## list failed exons
if (exists($opts{'L'})) {
	# count columns
	my $head = `head -n 1 $wd/Targets.Position.Coverage`;
	chomp($head);
	my @cols = split(/\t/,$head);
	my $nrcols = scalar(@cols);
	my $covcol = $nrcols;
	my $poscol = $nrcols -1;
	## hash to print
	# tex section header
	open TEX, ">>$wd/Report/Report.tex";
	print TEX '\subsection*{List of Failed Exons}'."\n";
	print TEX '\underline{NOTE:} ALL exons were tested for local coverage $<$'.$thresh.' \\\\'."\n";
	print TEX '{\footnotesize\begin{longtable}[l]{@{\extracolsep{\fill}}llll}'."\n".'\hline'."\n";
	print TEX '\textbf{Target Name} & \textbf{Genomic Position} & \textbf{Avg.Coverage} & \textbf{Min.Coverage} \\\\'."\n".'\hline'."\n";
	print TEX '\endhead'."\n";
	print TEX '\hline '."\n".'\multicolumn{4}{r}{{\textsl{\footnotesize Continued on next page}}} \\\\ '."\n".'\hline' ."\n". '\endfoot' . "\n". '\endlastfoot' . "\n";
 
	$col = 1;
	open IN, "$wd/Targets.Global.Coverage";
	while (<IN>) {
		chomp($_);
		my @p = split(/\t/,$_);
		my $region = $p[0].'-'.$p[1].'-'.$p[2];
		my $exon = $filehash{$region}{'exon'};
		# grep exon to tmp file
		my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt";
		## determine transcript orientation.
                my $firstline = `head -n 1 $exonfile`;
                my @firstcols = split(/\t/,$firstline);
                my $orient = $firstcols[5];
		my $genomicchr = $firstcols[0];
		my $genomicstart = $firstcols[1];
		my $genomicstop = $firstcols[2];

		if ($orient eq '+') {
			#$bps = $genomicstop - $genomicstart + 1;
			$subtitle = "$genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop);

		}
		else {
			#$bps = $genomicstop - $genomicstart + 1;
			$subtitle = "$genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop);
		}

		# check if failed
		my $cs = `cut -f $covcol '$exonfile' `;
		my @c = split(/\n/,$cs);
		my ($avg,$med,$min,$max,$first,$third,$ontarget) = arraystats(@c);
		
		if ($min >= $thresh) {
			# lowest coverage > threshold => skip
			next;
		}
		
		# print to .tex table 
		if (length($exon) > 30) {
			$exon = substr($exon,0,27) . '...';
		}
		$exon =~ s/_/ /g;
		$exon =~ s/\|/ /g;

		print TEX "$exon & $subtitle & ".int($avg)." & $min ".'\\\\'."\n"; 
	}
	close IN;
	print TEX '\hline'."\n";
	print TEX '\end{longtable}}'."\n";
	close TEX;
}


## Close document
open OUT, ">>$wd/Report/Report.tex";
print OUT '\label{endofdoc}'."\n";
print OUT '\end{document}'."\n";
close OUT;
system("cd $wd/Report && pdflatex Report.tex > /dev/null 2>&1 && pdflatex Report.tex > /dev/null 2>&1 ");

## mv report to output file
system("cp -f $wd/Report/Report.pdf '$pdffile'");
##create tar.gz file
system("mkdir $wd/Results");
system("cp -Rf $wd/Plots $wd/Results/");
system("cp -Rf $wd/Report/ $wd/Results/");
if (-e "$wd/Targets.Global.Coverage") {
	system("cp -Rf $wd/Targets.Global.Coverage $wd/Results/");
}
if (-e "$wd/Targets.Position.Coverage") {
	system("cp -Rf $wd/Targets.Position.Coverage $wd/Results/");
}

system("cd $wd  && tar czf '$tarfile' Results/");
## clean up (galaxy stores outside wd)
#system("rm -Rf $wd");
###############
## FUNCTIONS ##
###############
sub arraystats{
	my @array = @_;
	my $count = scalar(@array);
	if ($count == 0 ) {
		return (0,0,0,0,0,0,0);
	}
	@array = sort { $a <=> $b } @array;
	# median
	my $median = 0;
	if ($count % 2) { 
		$median = $array[int($count/2)]; 
	} else { 
		$median = ($array[$count/2] + $array[$count/2 - 1]) / 2; 
	} 
	# average 
	my $sum = 0;
	foreach (@array) { $sum += $_; } 
	my $average = $sum / $count;
	# quantiles (rounded)
	my $quart = int($count/4) ;	
	my $first = $array[$quart];
	my $third = $array[($quart*3)];
	my $min = $array[0];
	my $max = $array[($count-1)];
	return ($average,$median,$min,$max,$first,$third,$sum);
}

sub GetStats {
	my ($aref) = @_;
	if (scalar(@$aref) == 0) {
		return qw/0 0/;
	}
	# median
	my @s = sort {$a <=> $b } @$aref;
	my $nrzero = 0;
	my $len = scalar(@s);
	for (my $i = 0; $i< $len;$i++) {
		if ($s[$i] == 0) {
			$nrzero++;
		}
		else {
			last;
		}
	}
	my $nrcov = $len - $nrzero;
	# avg
	my $avg = 0;
    	foreach (@s) { $avg += $_ };
	$avg = sprintf("%.1f",($avg / scalar(@s)));
	return($avg,$len,$nrcov);
}


sub SubRegionCoverage {
	my ($bam,$targets) = @_;
	my $command = "cd $wd && coverageBed -abam $bam -b $targets -d > $wd/Targets.Position.Coverage";
	system($command);
	$commandsrun{$command} = 1;
}