Mercurial > repos > estrain > sum_fastqc
view sum_fastqc.pl @ 5:7df018757d26 draft
Added additional fields, max %N sites and average length
author | estrain |
---|---|
date | Thu, 18 Oct 2018 17:14:47 -0400 |
parents | d47775122e78 |
children | 53bfb3b2c026 |
line wrap: on
line source
#!/usr/bin/perl #################################################### ## ## sum_fastqc.pl ## ## Errol Strain (estrain@gmail.com) ## ## Description: Takes raw FASTQC output and produces ## simple table summary ## #################################################### my($inname)=shift(@ARGV); my($qscore)=shift(@ARGV); $qscore=~s/\s+//g; my(@qlist)=split(/\,/,$qscore); print "Input\tFile\tFastQC\tPass-Fail\tReads\tPoor_Reads\tGC\%\tMaxN\%\tMeanLen\tMeanQ"; foreach(@qlist) { print "\tQ".$_; } print "\n"; foreach (@ARGV) { print_stats($_); } sub print_stats { $infile = shift; # First 10 lines of raw FASTQC contain basic overview @sumlines=`head -n 10 $infile`; chomp(@sumlines); # Sequence level Q scores are buried in the middle of the file @qlines=`awk '/#Quality\tCount/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`; chomp(@qlines); @nlines=`awk '/#Base\tN\-Count/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`; chomp(@nlines); @lenlines=`awk '/#Length\tCount/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`; chomp(@lenlines); @fastqc = split(/[\n\t]/,shift(@sumlines)); @pass = split(/\t/,shift(@sumlines)); shift(@sumlines); @fn = split(/\t/,shift(@sumlines)); shift(@sumlines); shift(@sumlines); @nreads = split(/\t/,shift(@sumlines)); @npoor = split(/\t/,shift(@sumlines)); shift(@sumlines); @gc = split(/\t/,shift(@sumlines)); print $inname."\t"; print $fn[1]."\t"; print $fastqc[1]."\t"; print $pass[1]."\t"; print $nreads[1]."\t"; print $npoor[1]."\t"; print $gc[1]."\t"; print maxn(\@nlines)."\t"; print meanlen($nreads[1],\@lenlines)."\t"; print readmean($nreads[1],\@qlines); foreach $qs (@qlist) { print "\t"; print qcal($nreads[1],$qs,\@qlines); } print "\n"; } # Sum reads w/ Q scores > cutoff and divide by number of reads sub qcal { $nreads=shift(@_); $cutoff=shift(@_); @qarray=@{$_[0]}; $sum = 0; foreach $item (@qarray) { my($qval,$q)=split(/\t/,$item); if($qval>=$cutoff) { $sum += $q; } } $qmean = sprintf("%.2f", 100 * $sum / $nreads); return $qmean; } sub readmean { $nreads=shift(@_); @qarray=@{$_[0]}; my($sum) = 0; foreach $item (@qarray) { my($qval,$q)=split(/\t/,$item); $sum += $q*$qval; } $readq = sprintf("%.2f", $sum / $nreads); return $readq; } sub maxn { @narray=@{$_[0]}; my($max_nval)=0; foreach $item (@narray) { my($plist,$nval)=split(/\t/,$item); if($nval>$max_nval) { $max_nval=$nval; } } $max_nval = sprintf("%.2f", 100*$max_nval); return $max_nval; } sub meanlen { $nreads=shift(@_); @larray=@{$_[0]}; my($sum) = 0; foreach $item (@larray) { my($lenrange,$count)=split(/\t/,$item); my($l1,$l2)=split(/\-/,$lenrange); $sum+=(($l1+$l2)/2)*$count; } $sum = sprintf("%.1f",$sum/$nreads); return $sum; }