Mercurial > repos > estrain > sum_fastqc
changeset 10:8c2ad48a2eba draft
Uploaded
author | estrain |
---|---|
date | Thu, 18 Sep 2025 15:14:19 +0000 |
parents | b3d943bc70ae |
children | d8ff95d96848 |
files | sum_fastqc.pl sum_fastqc.xml sum_fastqc/sum_fastqc.pl sum_fastqc/sum_fastqc.xml |
diffstat | 4 files changed, 193 insertions(+), 191 deletions(-) [+] |
line wrap: on
line diff
--- a/sum_fastqc.pl Wed Feb 23 14:35:40 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,135 +0,0 @@ -#!/usr/bin/perl - -#################################################### -## -## sum_fastqc.pl -## -## Errol Strain (estrain@gmail.com) -## -## Description: Takes raw FASTQC output and produces -## simple table summary -## -#################################################### - -my($inname)=shift(@ARGV); -my($qscore)=shift(@ARGV); -$qscore=~s/\s+//g; -my(@qlist)=split(/\,/,$qscore); - -print "Input\tFile\tFastQC\tReads\tGC\%\tAvg_Len\tMax\_N\%\tMean_Q"; -foreach(@qlist) { - print "\tQ".$_."\%"; -} -print "\n"; - -foreach (@ARGV) { - print_stats($_); -} - -sub print_stats { - $infile = shift; - # First 10 lines of raw FASTQC contain basic overview - @sumlines=`head -n 10 $infile`; - chomp(@sumlines); - - # Sequence level Q scores are buried in the middle of the file - @qlines=`awk '/#Quality\tCount/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`; - chomp(@qlines); - - @nlines=`awk '/#Base\tN\-Count/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`; - chomp(@nlines); - - @lenlines=`awk '/#Length\tCount/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`; - chomp(@lenlines); - - @fastqc = split(/[\n\t]/,shift(@sumlines)); - @pass = split(/\t/,shift(@sumlines)); - shift(@sumlines); - @fn = split(/\t/,shift(@sumlines)); - shift(@sumlines); - shift(@sumlines); - @nreads = split(/\t/,shift(@sumlines)); - @npoor = split(/\t/,shift(@sumlines)); - shift(@sumlines); - @gc = split(/\t/,shift(@sumlines)); - - print $inname."\t"; - print $fn[1]."\t"; - print $fastqc[1]."\t"; - print $nreads[1]."\t"; - print $gc[1]."\t"; - print meanlen($nreads[1],\@lenlines)."\t"; - print maxn(\@nlines)."\t"; - print readmean($nreads[1],\@qlines); - foreach $qs (@qlist) { - print "\t"; - print qcal($nreads[1],$qs,\@qlines); - } - print "\n"; -} - -# Sum reads w/ Q scores > cutoff and divide by number of reads -sub qcal { - $nreads=shift(@_); - $cutoff=shift(@_); - 
@qarray=@{$_[0]}; - $sum = 0; - - foreach $item (@qarray) { - my($qval,$q)=split(/\t/,$item); - if($qval>=$cutoff) { - $sum += $q; - } - } - $qmean = sprintf("%.2f", 100 * $sum / $nreads); - return $qmean; -} - -# Calculate mean read Q score -sub readmean { - $nreads=shift(@_); - @qarray=@{$_[0]}; - my($sum) = 0; - - foreach $item (@qarray) { - my($qval,$q)=split(/\t/,$item); - $sum += $q*$qval; - } - - $readq = sprintf("%.2f", $sum / $nreads); - return $readq; -} - -# Find position with hights fraction of Ns -sub maxn { - @narray=@{$_[0]}; - my($max_nval)=0; - - foreach $item (@narray) { - my($plist,$nval)=split(/\t/,$item); - if($nval>$max_nval) { - $max_nval=$nval; - } - } - $max_nval = sprintf("%.4f", $max_nval); - return $max_nval; -} - -# Calculate mean read length -sub meanlen { - $nreads=shift(@_); - @larray=@{$_[0]}; - my($sum) = 0; - - foreach $item (@larray) { - my($lenrange,$count)=split(/\t/,$item); - my(@lvals)=split(/\-/,$lenrange); - if(@lvals==2) { - $sum+=(($lvals[0]+$lvals[1])/2)*$count; - } else { - $sum+=($lvals[0])*$count; - } - } - $sum = sprintf("%.1f",$sum/$nreads); - return $sum; -}
--- a/sum_fastqc.xml Wed Feb 23 14:35:40 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -<tool id="sum_fastqc" name="sum_fastqc" version="0.21"> - <description>summarizes raw FASTQC output</description> - <requirements> - </requirements> - <command detect_errors="exit_code"><![CDATA[ - - #if $jobtype.select == "single" - #set inname = $jobtype.file - #set infile = $jobtype.file - perl $__tool_directory__/sum_fastqc.pl "$inname" $qset.qscore $infile > sum_fastqc.tab - #else if $jobtype.select == "col" - #set inname = $jobtype.coll.name - #set infile = $jobtype.coll.forward - #set infile2 = $jobtype.coll.reverse - perl $__tool_directory__/sum_fastqc.pl "$inname" $qset.qscore $infile $infile2 > sum_fastqc.tab; - #end if - - ]]></command> - <inputs> - <conditional name="jobtype"> - <param name="select" type="select" label="Select Input"> - <option value="single">Raw FASTQC output File</option> - <option value="col">Pair of raw FASTQC Files</option> - </param> - <when value="single"> - <param name="file" type="data" format="txt" label="Raw FASTQC" /> - </when> - <when value="col"> - <param name="coll" label="Raw FASTQC pair" type="data_collection" format="txt" collection_type="paired" /> - </when> - </conditional> - <conditional name="qset"> - <param name="selectq" type="select" label="Single or multiple Q scores"> - <option value="single">Single Q score</option> - <option value="mul">Multiple Q scores</option> - </param> - <when value="single"> - <param name="qscore" type="integer" label="Q score threshold (i.e. reads >= Q score)" value="30"> - <validator type="in_range" message="Must be integer(0,40)." min="0" max="40"/> - </param> - </when> - <when value="mul"> - <param name="qscore" type="text" label="Comma delimited Q score list (e.g. 
25,30,35)" value="30"/> - </when> - </conditional> - </inputs> - <outputs> - <data format="tabular" name="FASTQC Summary" label="${tool.name} on ${on_string}: Contigs" from_work_dir="*.tab"/> - </outputs> - - <help><![CDATA[ - - ]]></help> - <citations> - </citations> -</tool>
#!/usr/bin/perl

####################################################
##
## sum_fastqc.pl
##
## Errol Strain (estrain@gmail.com)
##
## Description: Takes raw FASTQC output and produces
## simple table summary
##
## Usage:
##   sum_fastqc.pl NAME QLIST FASTQC_DATA [FASTQC_DATA ...]
##
##   NAME         label printed in the "Input" column of every row
##   QLIST        comma-delimited Q score cutoffs, e.g. 25,30,35
##   FASTQC_DATA  raw FastQC text report(s); one output row per file
##
####################################################

use strict;
use warnings;

# Label shared by every output row.
my $inname = shift(@ARGV);
$inname = '' unless defined $inname;

# Q score cutoffs: whitespace is ignored and empty entries ("30,,35") are
# dropped so they do not produce a bogus "Q%" column.
my $qscore = shift(@ARGV);
$qscore = '' unless defined $qscore;
$qscore =~ s/\s+//g;
my @qlist = grep { length } split(/,/, $qscore);

# Header: fixed columns followed by one "Q<cutoff>%" column per cutoff.
print join("\t", 'Input', 'File', 'FastQC', 'Reads', 'Bases', 'GC%',
    'Avg_Len', 'Max_N%', 'Mean_Q');
print "\tQ" . $_ . '%' foreach @qlist;
print "\n";

print_stats($_) foreach @ARGV;

# Print one tab-delimited summary row for a raw FastQC report.
#
#   $infile - path to the raw FastQC text report
#
# The "Basic Statistics" values are looked up by measure name rather than by
# line position. Reports that contain a "Total Bases" line are one line longer
# than those that do not, and a fixed-position parse silently loses %GC when
# the layout shifts. Measures that are absent are printed as empty fields.
# Dies if the report cannot be opened.
sub print_stats {
    my ($infile) = @_;

    # Read the report directly instead of shelling out to head/awk/tail, so
    # paths containing spaces or shell metacharacters are handled safely.
    open(my $fh, '<', $infile) or die "Cannot open $infile: $!\n";
    my @lines = <$fh>;
    close($fh);
    s/\r?\n\z// foreach @lines;

    # First line looks like "##FastQC<TAB>version".
    my (undef, $version) = split(/\t/, @lines ? $lines[0] : '');

    # Basic Statistics: "measure<TAB>value" rows; '#' rows are column headers.
    my %basic;
    foreach my $row (_module_rows(\@lines, qr/^>>Basic Statistics/)) {
        next if $row =~ /^#/;
        my ($measure, $value) = split(/\t/, $row, 2);
        $basic{$measure} = $value if defined $value;
    }

    # Data rows of the three modules the summary is computed from.
    my @qlines   = _module_rows(\@lines, qr/#Quality\tCount/);
    my @nlines   = _module_rows(\@lines, qr/#Base\tN-Count/);
    my @lenlines = _module_rows(\@lines, qr/#Length\tCount/);

    # Numeric read count for the calculations; the raw text is what is printed.
    my $nreads = $basic{'Total Sequences'};
    my $count = (defined $nreads && $nreads =~ /^\d+$/) ? $nreads : 0;

    my @fields = (
        $inname,
        $basic{'Filename'},
        $version,
        $nreads,
        $basic{'Total Bases'},
        $basic{'%GC'},
        meanlen($count, \@lenlines),
        maxn(\@nlines),
        readmean($count, \@qlines),
        map { qcal($count, $_, \@qlines) } @qlist,
    );
    print join("\t", map { defined $_ ? $_ : '' } @fields), "\n";
    return;
}

# Return the lines that follow the first line matching $start_re, up to but
# not including the next ">>END_MODULE" line (or end of input).
#
#   $lines    - array ref of chomped report lines
#   $start_re - compiled regex identifying the line that precedes the data
sub _module_rows {
    my ($lines, $start_re) = @_;
    my @rows;
    my $inside = 0;

    foreach my $line (@{$lines}) {
        if ($inside) {
            last if $line =~ /^>>END_MODULE/;
            push @rows, $line;
        }
        elsif ($line =~ $start_re) {
            $inside = 1;
        }
    }
    return @rows;
}

# Percentage of reads whose Q score is >= cutoff.
#
#   $nreads - total number of reads
#   $cutoff - Q score threshold (inclusive)
#   $qrows  - array ref of "quality<TAB>count" rows
#
# Returns the percentage formatted to two decimals ("0.00" when $nreads is 0).
sub qcal {
    my ($nreads, $cutoff, $qrows) = @_;
    my $sum = 0;

    foreach my $row (@{$qrows}) {
        my ($qval, $count) = split(/\t/, $row);
        next unless defined $count;
        $sum += $count if $qval >= $cutoff;
    }
    return sprintf("%.2f", $nreads ? 100 * $sum / $nreads : 0);
}

# Calculate mean read Q score (count-weighted).
#
#   $nreads - total number of reads
#   $qrows  - array ref of "quality<TAB>count" rows
#
# Returns the mean formatted to two decimals ("0.00" when $nreads is 0).
sub readmean {
    my ($nreads, $qrows) = @_;
    my $sum = 0;

    foreach my $row (@{$qrows}) {
        my ($qval, $count) = split(/\t/, $row);
        next unless defined $count;
        $sum += $count * $qval;
    }
    return sprintf("%.2f", $nreads ? $sum / $nreads : 0);
}

# Find the position with the highest N value.
#
#   $nrows - array ref of "position<TAB>value" rows
#
# Returns the largest value formatted to four decimals ("0.0000" if no rows).
sub maxn {
    my ($nrows) = @_;
    my $max_nval = 0;

    foreach my $row (@{$nrows}) {
        my (undef, $nval) = split(/\t/, $row);
        next unless defined $nval;
        $max_nval = $nval if $nval > $max_nval;
    }
    return sprintf("%.4f", $max_nval);
}

# Calculate mean read length from the length distribution.
#
#   $nreads - total number of reads
#   $lrows  - array ref of "length<TAB>count" rows, where length is either a
#             single value ("150") or a binned range ("100-104")
#
# A binned range contributes its midpoint. Returns the mean formatted to one
# decimal ("0.0" when $nreads is 0).
sub meanlen {
    my ($nreads, $lrows) = @_;
    my $sum = 0;

    foreach my $row (@{$lrows}) {
        my ($lenrange, $count) = split(/\t/, $row);
        next unless defined $count;
        my @lvals = split(/-/, $lenrange);
        my $len = @lvals == 2 ? ($lvals[0] + $lvals[1]) / 2 : $lvals[0];
        $sum += $len * $count;
    }
    return sprintf("%.1f", $nreads ? $sum / $nreads : 0);
}
<!-- Galaxy wrapper for sum_fastqc.pl: turns one raw FastQC text report, or a
     forward/reverse pair of reports, into a tab-delimited summary table. -->
<tool id="sum_fastqc" name="sum_fastqc" version="0.22">
    <description>summarizes raw FASTQC output</description>
    <requirements>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
        ## Script arguments: row label, Q score cutoff(s), then one report per
        ## output row. Every value is single-quoted so that paths and labels
        ## reach the script as single arguments.
        #if $jobtype.select == "single"
            #set inname = $jobtype.file
            #set infile = $jobtype.file
            perl '$__tool_directory__/sum_fastqc.pl' '$inname' '$qset.qscore' '$infile' > sum_fastqc.tab
        #else if $jobtype.select == "col"
            #set inname = $jobtype.coll.name
            #set infile = $jobtype.coll.forward
            #set infile2 = $jobtype.coll.reverse
            perl '$__tool_directory__/sum_fastqc.pl' '$inname' '$qset.qscore' '$infile' '$infile2' > sum_fastqc.tab
        #end if
    ]]></command>
    <inputs>
        <!-- Input: a single report, or a paired collection (one row per mate). -->
        <conditional name="jobtype">
            <param name="select" type="select" label="Select Input">
                <option value="single">Raw FASTQC output File</option>
                <option value="col">Pair of raw FASTQC Files</option>
            </param>
            <when value="single">
                <param name="file" type="data" format="txt" label="Raw FASTQC" />
            </when>
            <when value="col">
                <param name="coll" label="Raw FASTQC pair" type="data_collection" format="txt" collection_type="paired" />
            </when>
        </conditional>
        <!-- Q score cutoff(s): each cutoff adds one "Q<cutoff>%" column. -->
        <conditional name="qset">
            <param name="selectq" type="select" label="Single or multiple Q scores">
                <option value="single">Single Q score</option>
                <option value="mul">Multiple Q scores</option>
            </param>
            <when value="single">
                <param name="qscore" type="integer" label="Q score threshold (i.e. reads >= Q score)" value="30">
                    <validator type="in_range" message="Must be integer(0,40)." min="0" max="40"/>
                </param>
            </when>
            <when value="mul">
                <param name="qscore" type="text" label="Comma delimited Q score list (e.g. 25,30,35)" value="30">
                    <!-- Only numbers, commas and whitespace may reach the command line. -->
                    <validator type="regex" message="Enter numbers separated by commas, e.g. 25,30,35.">^[0-9.,\s]+$</validator>
                </param>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- from_work_dir names the exact file the command writes. -->
        <data format="tabular" name="FASTQC Summary" label="${tool.name} on ${on_string}: Summary" from_work_dir="sum_fastqc.tab"/>
    </outputs>

    <help><![CDATA[
**What it does**

Summarizes raw FastQC text output as a tab-delimited table with one row per
input report.

**Output columns**

- *Input*: the label passed for the job (dataset or collection name)
- *File*: file name recorded in the FastQC report
- *FastQC*: FastQC version recorded in the report
- *Reads*: total number of sequences
- *Bases*: total bases, when the report provides it
- *GC%*: overall GC content
- *Avg_Len*: mean read length computed from the sequence length distribution (binned ranges contribute their midpoint)
- *Max_N%*: highest value in the per-base N content table
- *Mean_Q*: mean of the per-sequence quality scores, weighted by read count
- *Q<cutoff>%*: percentage of reads whose per-sequence quality score is greater than or equal to the cutoff; one column per requested cutoff
    ]]></help>
    <citations>
    </citations>
</tool>