Mercurial > repos > estrain > sum_fastqc

--- a/sum_fastqc.pl	Wed Feb 23 14:35:40 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,135 +0,0 @@
-#!/usr/bin/perl
-
-####################################################
-##
-## sum_fastqc.pl
-##
-## Errol Strain (estrain@gmail.com)
-##
-## Description: Takes raw FASTQC output and produces
-## simple table summary
-##
-####################################################
-
-my($inname)=shift(@ARGV);
-my($qscore)=shift(@ARGV);
-$qscore=~s/\s+//g;
-my(@qlist)=split(/\,/,$qscore);
-
-print "Input\tFile\tFastQC\tReads\tGC\%\tAvg_Len\tMax\_N\%\tMean_Q";
-foreach(@qlist) {
-  print "\tQ".$_."\%";
-}
-print "\n";
-
-foreach (@ARGV) {
-  print_stats($_);
-}
-
-sub print_stats {
-  $infile = shift;
-  # First 10 lines of raw FASTQC contain basic overview
-  @sumlines=`head -n 10 $infile`;
-  chomp(@sumlines);
-
-  # Sequence level Q scores are buried in the middle of the file
-  @qlines=`awk '/#Quality\tCount/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`;
-  chomp(@qlines);
-
-  @nlines=`awk '/#Base\tN\-Count/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`;
-  chomp(@nlines);
-
-  @lenlines=`awk '/#Length\tCount/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`;
-  chomp(@lenlines);
-
-  @fastqc = split(/[\n\t]/,shift(@sumlines));
-  @pass = split(/\t/,shift(@sumlines));
-  shift(@sumlines);
-  @fn = split(/\t/,shift(@sumlines));
-  shift(@sumlines);
-  shift(@sumlines);
-  @nreads = split(/\t/,shift(@sumlines));
-  @npoor = split(/\t/,shift(@sumlines));
-  shift(@sumlines);
-  @gc = split(/\t/,shift(@sumlines));
-
-  print $inname."\t";
-  print $fn[1]."\t";
-  print $fastqc[1]."\t";
-  print $nreads[1]."\t";
-  print $gc[1]."\t";
-  print meanlen($nreads[1],\@lenlines)."\t";
-  print maxn(\@nlines)."\t";
-  print readmean($nreads[1],\@qlines);
-  foreach $qs (@qlist) {
-    print "\t";
-    print qcal($nreads[1],$qs,\@qlines);
-  }
-  print "\n";
-}
-
-# Sum reads w/ Q scores > cutoff and divide by number of reads
-sub qcal {
-   $nreads=shift(@_);
-   $cutoff=shift(@_);
-   @qarray=@{$_[0]};
-   $sum = 0;
-
-   foreach $item (@qarray) {
-     my($qval,$q)=split(/\t/,$item);
-     if($qval>=$cutoff) {
-       $sum += $q;
-     }
-   }
-   $qmean = sprintf("%.2f", 100 * $sum / $nreads);
-   return $qmean;
-}
-
-# Calculate mean read Q score
-sub readmean {
-   $nreads=shift(@_);
-   @qarray=@{$_[0]};
-   my($sum) = 0;
-
-   foreach $item (@qarray) {
-      my($qval,$q)=split(/\t/,$item);
-      $sum += $q*$qval;
-   }
-
-   $readq = sprintf("%.2f", $sum / $nreads);
-   return $readq;
-}
-
-# Find position with hights fraction of Ns
-sub maxn {
-   @narray=@{$_[0]};
-   my($max_nval)=0;
-
-   foreach $item (@narray) {
-     my($plist,$nval)=split(/\t/,$item);
-     if($nval>$max_nval) {
-       $max_nval=$nval;
-     }
-   }
-   $max_nval = sprintf("%.4f", $max_nval);
-   return $max_nval;
-}
-
-# Calculate mean read length
-sub meanlen {
-   $nreads=shift(@_);
-   @larray=@{$_[0]};
-   my($sum) = 0;
-
-   foreach $item (@larray) {
-     my($lenrange,$count)=split(/\t/,$item);
-     my(@lvals)=split(/\-/,$lenrange);
-     if(@lvals==2) {
-       $sum+=(($lvals[0]+$lvals[1])/2)*$count;
-     } else {
-       $sum+=($lvals[0])*$count;
-     }
-   }
-   $sum = sprintf("%.1f",$sum/$nreads);
-   return $sum;
-}
--- a/sum_fastqc.xml	Wed Feb 23 14:35:40 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,56 +0,0 @@
-<tool id="sum_fastqc" name="sum_fastqc" version="0.21">
-    <description>summarizes raw FASTQC output</description>
-    <requirements>
-    </requirements>
-    <command detect_errors="exit_code"><![CDATA[
-
-      #if $jobtype.select == "single"
-        #set inname = $jobtype.file
-        #set infile = $jobtype.file
-        perl $__tool_directory__/sum_fastqc.pl "$inname" $qset.qscore $infile > sum_fastqc.tab
-      #else if $jobtype.select == "col"
-        #set inname = $jobtype.coll.name
-        #set infile = $jobtype.coll.forward
-        #set infile2 = $jobtype.coll.reverse
-        perl $__tool_directory__/sum_fastqc.pl "$inname" $qset.qscore $infile $infile2 > sum_fastqc.tab;
-      #end if
-
-    ]]></command>
-    <inputs>
-      <conditional name="jobtype">
-        <param name="select" type="select" label="Select Input">
-          <option value="single">Raw FASTQC output File</option>
-          <option value="col">Pair of raw FASTQC Files</option>
-        </param>
-        <when value="single">
-          <param name="file" type="data" format="txt" label="Raw FASTQC" />
-        </when>
-        <when value="col">
-          <param name="coll" label="Raw FASTQC pair" type="data_collection" format="txt" collection_type="paired" />
-        </when>
-      </conditional>
-      <conditional name="qset">
-        <param name="selectq" type="select" label="Single or multiple Q scores">
-          <option value="single">Single Q score</option>
-          <option value="mul">Multiple Q scores</option>
-        </param>
-        <when value="single">
-          <param name="qscore" type="integer" label="Q score threshold (i.e. reads >= Q score)" value="30">
-            <validator type="in_range" message="Must be integer(0,40)." min="0" max="40"/>
-          </param>
-        </when>
-        <when value="mul">
-          <param name="qscore" type="text" label="Comma delimited Q score list (e.g. 25,30,35)" value="30"/>
-        </when>
-      </conditional>
-    </inputs>
-    <outputs>
-      <data format="tabular" name="FASTQC Summary" label="${tool.name} on ${on_string}: Contigs" from_work_dir="*.tab"/>
-    </outputs>
-
-    <help><![CDATA[
-
-    ]]></help>
-     <citations>
-    </citations>
-</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sum_fastqc/sum_fastqc.pl	Thu Sep 18 15:14:19 2025 +0000
@@ -0,0 +1,137 @@
+#!/usr/bin/perl
+
+####################################################
+##
+## sum_fastqc.pl
+##
+## Errol Strain (estrain@gmail.com)
+##
+## Description: Takes raw FASTQC output and produces
+## simple table summary
+##
+####################################################
+
+my $inname = shift @ARGV;
+my $qscore = shift @ARGV;
+$qscore =~ s/\s+//g;
+my @qlist = split /\,/, $qscore;
+
+# Header row: fixed columns first, then one Q<cutoff>% column per cutoff
+print "Input\tFile\tFastQC\tReads\tBases\tGC\%\tAvg_Len\tMax\_N\%\tMean_Q";
+print "\tQ".$_."\%" foreach @qlist;
+print "\n";
+
+# Every remaining argument is a raw FASTQC file; each yields one row
+foreach my $path (@ARGV) {
+  print_stats($path);
+}
+
+# Print one tab-separated summary row for a single raw FASTQC file.
+# Takes the file path as its only argument; also reads the file-level
+# $inname (row label) and @qlist (Q score cutoffs).
+sub print_stats {
+  my $infile = shift;
+  # Leading lines of raw FASTQC contain the basic overview. The positional
+  # parse below consumes 11 lines, so 11 must be read or %GC comes up empty.
+  my @sumlines=`head -n 11 $infile`;
+  chomp(@sumlines);
+
+  # Sequence level Q scores are buried in the middle of the file; head/tail
+  # strip the >>END_MODULE line and the column header that awk includes.
+  my @qlines=`awk '/#Quality\tCount/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`;
+  chomp(@qlines);
+
+  my @nlines=`awk '/#Base\tN\-Count/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`;
+  chomp(@nlines);
+
+  my @lenlines=`awk '/#Length\tCount/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`;
+  chomp(@lenlines);
+
+  # Overview rows are taken strictly by position; unused rows are dropped.
+  my @fastqc = split(/[\n\t]/,shift(@sumlines));
+  splice(@sumlines,0,2);
+  my @fn = split(/\t/,shift(@sumlines));
+  splice(@sumlines,0,2);
+  my @nreads = split(/\t/,shift(@sumlines));
+  my @tb = split(/\t/,shift(@sumlines));
+  splice(@sumlines,0,2);
+  my @gc = split(/\t/,shift(@sumlines));
+
+  print $inname."\t";
+  print $fn[1]."\t";
+  print $fastqc[1]."\t";
+  print $nreads[1]."\t";
+  print $tb[1]."\t";
+  print $gc[1]."\t";
+  print meanlen($nreads[1],\@lenlines)."\t";
+  print maxn(\@nlines)."\t";
+  print readmean($nreads[1],\@qlines);
+  foreach my $qs (@qlist) { print "\t"; print qcal($nreads[1],$qs,\@qlines); }
+  print "\n";
+}
+
+# Sum the counts of rows whose Q value is >= cutoff, as a percentage of reads.
+#   args: read count, Q score cutoff, ref to "Q value<TAB>count" rows
+#   returns: percentage as a string with two decimals
+sub qcal {
+   my($nreads,$cutoff,$qrows)=@_;
+   my($sum) = 0;
+
+   foreach my $item (@{$qrows}) {
+     my($qval,$q)=split(/\t/,$item);
+     $sum += $q if($qval>=$cutoff);
+   }
+   # No reads (zero or missing count): report 0 instead of dying on a
+   # division by zero, so the remaining columns and rows are still written.
+   return sprintf("%.2f", 0) unless $nreads;
+   return sprintf("%.2f", 100 * $sum / $nreads);
+}
+
+# Calculate mean read Q score: sum of (Q value * count) over the read count.
+#   args: read count, ref to "Q value<TAB>count" rows
+#   returns: mean as a string with two decimals ("0.00" when no reads)
+sub readmean {
+   my($nreads,$qrows)=@_;
+   my($sum) = 0;
+
+   foreach my $item (@{$qrows}) {
+      my($qval,$q)=split(/\t/,$item);
+      $sum += $q*$qval;
+   }
+   # Guard the division: a zero/missing read count would otherwise die here.
+   return sprintf("%.2f", $nreads ? $sum / $nreads : 0);
+}
+
+# Find the position with the highest fraction of Ns.
+#   args: ref to "position<TAB>N value" rows
+#   returns: largest N value seen (0 if there are no rows), four decimals
+sub maxn {
+   my($nrows)=@_;
+   my($max_nval)=0;
+
+   foreach my $item (@{$nrows}) {
+     # First field (the base position label) is not needed here
+     my(undef,$nval)=split(/\t/,$item);
+     $max_nval=$nval if($nval>$max_nval);
+   }
+   return sprintf("%.4f", $max_nval);
+}
+
+# Calculate mean read length from the length distribution rows.
+#   args: read count, ref to "length<TAB>count" rows; a length may be a
+#         single value or a "low-high" bin, which is weighted by its midpoint
+#   returns: mean length as a string with one decimal ("0.0" when no reads)
+sub meanlen {
+   my($nreads,$lrows)=@_;
+   my($sum) = 0;
+
+   foreach my $item (@{$lrows}) {
+     my($lenrange,$count)=split(/\t/,$item);
+     my(@lvals)=split(/\-/,$lenrange);
+     my($len) = (@lvals==2) ? ($lvals[0]+$lvals[1])/2 : $lvals[0];
+     $sum += $len*$count;
+   }
+
+   # Guard the division: a zero/missing read count would otherwise die here.
+   return sprintf("%.1f", $nreads ? $sum / $nreads : 0);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sum_fastqc/sum_fastqc.xml	Thu Sep 18 15:14:19 2025 +0000
@@ -0,0 +1,56 @@
+<tool id="sum_fastqc" name="sum_fastqc" version="0.22">
+    <description>summarizes raw FASTQC output</description>
+    <requirements>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+      ## Arguments: row label, Q score cutoff list, then one raw FASTQC file per output row
+      #if $jobtype.select == "single"
+        #set inname = $jobtype.file
+        #set infile = $jobtype.file
+        perl '$__tool_directory__/sum_fastqc.pl' "$inname" '$qset.qscore' '$infile' > sum_fastqc.tab
+      #else if $jobtype.select == "col"
+        #set inname = $jobtype.coll.name
+        #set infile = $jobtype.coll.forward
+        #set infile2 = $jobtype.coll.reverse
+        perl '$__tool_directory__/sum_fastqc.pl' "$inname" '$qset.qscore' '$infile' '$infile2' > sum_fastqc.tab
+      #end if
+
+    ]]></command>
+    <inputs>
+      <conditional name="jobtype">
+        <param name="select" type="select" label="Select Input">
+          <option value="single">Raw FASTQC output File</option>
+          <option value="col">Pair of raw FASTQC Files</option>
+        </param>
+        <when value="single">
+          <param name="file" type="data" format="txt" label="Raw FASTQC" />
+        </when>
+        <when value="col">
+          <param name="coll" label="Raw FASTQC pair" type="data_collection" format="txt" collection_type="paired" />
+        </when>
+      </conditional>
+      <conditional name="qset">
+        <param name="selectq" type="select" label="Single or multiple Q scores">
+          <option value="single">Single Q score</option>
+          <option value="mul">Multiple Q scores</option>
+        </param>
+        <when value="single">
+          <param name="qscore" type="integer" label="Q score threshold (i.e. reads >= Q score)" value="30">
+            <validator type="in_range" message="Must be integer(0,40)." min="0" max="40"/>
+          </param>
+        </when>
+        <when value="mul">
+          <param name="qscore" type="text" label="Comma delimited Q score list (e.g. 25,30,35)" value="30"/>
+        </when>
+      </conditional>
+    </inputs>
+    <outputs>
+      <data format="tabular" name="FASTQC Summary" label="${tool.name} on ${on_string}: Contigs" from_work_dir="*.tab"/>
+    </outputs>
+
+    <help><![CDATA[
+
+    ]]></help>
+     <citations>
+    </citations>
+</tool>