view sum_fastqc.pl @ 5:7df018757d26 draft

Added additional fields, max %N sites and average length
author estrain
date Thu, 18 Oct 2018 17:14:47 -0400
parents d47775122e78
children 53bfb3b2c026
line wrap: on
line source

#!/usr/bin/perl

####################################################
## 
## sum_fastqc.pl
## 
## Errol Strain (estrain@gmail.com) 
##
## Description: Takes raw FASTQC output and produces
## simple table summary
##
#################################################### 

my($inname)=shift(@ARGV);
my($qscore)=shift(@ARGV);
$qscore=~s/\s+//g;
my(@qlist)=split(/\,/,$qscore);

print "Input\tFile\tFastQC\tPass-Fail\tReads\tPoor_Reads\tGC\%\tMaxN\%\tMeanLen\tMeanQ";
foreach(@qlist) {
  print "\tQ".$_;
}
print "\n";

foreach (@ARGV) {
  print_stats($_);
}

sub print_stats {
  $infile = shift;
  # First 10 lines of raw FASTQC contain basic overview
  @sumlines=`head -n 10 $infile`;
  chomp(@sumlines);

  # Sequence level Q scores are buried in the middle of the file
  @qlines=`awk '/#Quality\tCount/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`;
  chomp(@qlines);

  @nlines=`awk '/#Base\tN\-Count/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`;
  chomp(@nlines);

  @lenlines=`awk '/#Length\tCount/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`;
  chomp(@lenlines);

  @fastqc = split(/[\n\t]/,shift(@sumlines));
  @pass = split(/\t/,shift(@sumlines));
  shift(@sumlines);
  @fn = split(/\t/,shift(@sumlines));
  shift(@sumlines);
  shift(@sumlines);
  @nreads = split(/\t/,shift(@sumlines));
  @npoor = split(/\t/,shift(@sumlines));
  shift(@sumlines);
  @gc = split(/\t/,shift(@sumlines));

  print $inname."\t";
  print $fn[1]."\t";
  print $fastqc[1]."\t";
  print $pass[1]."\t";
  print $nreads[1]."\t";
  print $npoor[1]."\t";
  print $gc[1]."\t";
  print maxn(\@nlines)."\t";
  print meanlen($nreads[1],\@lenlines)."\t";
  print readmean($nreads[1],\@qlines);
  foreach $qs (@qlist) {
    print "\t";
    print qcal($nreads[1],$qs,\@qlines);
  }
  print "\n";
}

# Sum reads w/ Q scores > cutoff and divide by number of reads
sub qcal {
   $nreads=shift(@_);
   $cutoff=shift(@_);
   @qarray=@{$_[0]};
   $sum = 0;
  
   foreach $item (@qarray) {
     my($qval,$q)=split(/\t/,$item);
     if($qval>=$cutoff) {
       $sum += $q;
     }
   }
   $qmean = sprintf("%.2f", 100 * $sum / $nreads);
   return $qmean;
}

sub readmean {
   $nreads=shift(@_);
   @qarray=@{$_[0]};
   my($sum) = 0;
 
   foreach $item (@qarray) {
      my($qval,$q)=split(/\t/,$item);
      $sum += $q*$qval;
   }

   $readq = sprintf("%.2f", $sum / $nreads);
   return $readq;
}

sub maxn {
   @narray=@{$_[0]};
   my($max_nval)=0;

   foreach $item (@narray) {
     my($plist,$nval)=split(/\t/,$item);
     if($nval>$max_nval) {
       $max_nval=$nval;
     }
   }
   $max_nval = sprintf("%.2f", 100*$max_nval);
   return $max_nval;
}  

sub meanlen {
   $nreads=shift(@_);
   @larray=@{$_[0]};
   my($sum) = 0;

   foreach $item (@larray) {
     my($lenrange,$count)=split(/\t/,$item);
     my($l1,$l2)=split(/\-/,$lenrange);
     $sum+=(($l1+$l2)/2)*$count; 
   }
   $sum = sprintf("%.1f",$sum/$nreads);
   return $sum;
}