comparison sum_fastqc.pl @ 5:7df018757d26 draft

Added additional fields, max %N sites and average length
author estrain
date Thu, 18 Oct 2018 17:14:47 -0400
parents d47775122e78
children 53bfb3b2c026
comparison
equal deleted inserted replaced
4:d47775122e78 5:7df018757d26
14 my($inname)=shift(@ARGV); 14 my($inname)=shift(@ARGV);
15 my($qscore)=shift(@ARGV); 15 my($qscore)=shift(@ARGV);
16 $qscore=~s/\s+//g; 16 $qscore=~s/\s+//g;
17 my(@qlist)=split(/\,/,$qscore); 17 my(@qlist)=split(/\,/,$qscore);
18 18
19 print "Input\tFile\tFastQC\tPass-Fail\tReads\tPoor_Reads\tGC\tMeanQ"; 19 print "Input\tFile\tFastQC\tPass-Fail\tReads\tPoor_Reads\tGC\%\tMaxN\%\tMeanLen\tMeanQ";
20 foreach(@qlist) { 20 foreach(@qlist) {
21 print "\tQ".$_; 21 print "\tQ".$_;
22 } 22 }
23 print "\n"; 23 print "\n";
24 24
34 34
35 # Sequence level Q scores are buried in the middle of the file 35 # Sequence level Q scores are buried in the middle of the file
36 @qlines=`awk '/#Quality\tCount/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`; 36 @qlines=`awk '/#Quality\tCount/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`;
37 chomp(@qlines); 37 chomp(@qlines);
38 38
39 @nlines=`awk '/#Base\tN\-Count/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`;
40 chomp(@nlines);
41
42 @lenlines=`awk '/#Length\tCount/,/>>END_MODULE/' $infile | head -n -1 | tail -n +2`;
43 chomp(@lenlines);
44
39 @fastqc = split(/[\n\t]/,shift(@sumlines)); 45 @fastqc = split(/[\n\t]/,shift(@sumlines));
40 @pass = split(/\t/,shift(@sumlines)); 46 @pass = split(/\t/,shift(@sumlines));
41 shift(@sumlines); 47 shift(@sumlines);
42 @fn = split(/\t/,shift(@sumlines)); 48 @fn = split(/\t/,shift(@sumlines));
43 shift(@sumlines); 49 shift(@sumlines);
52 print $fastqc[1]."\t"; 58 print $fastqc[1]."\t";
53 print $pass[1]."\t"; 59 print $pass[1]."\t";
54 print $nreads[1]."\t"; 60 print $nreads[1]."\t";
55 print $npoor[1]."\t"; 61 print $npoor[1]."\t";
56 print $gc[1]."\t"; 62 print $gc[1]."\t";
63 print maxn(\@nlines)."\t";
64 print meanlen($nreads[1],\@lenlines)."\t";
57 print readmean($nreads[1],\@qlines); 65 print readmean($nreads[1],\@qlines);
58 foreach $qs (@qlist) { 66 foreach $qs (@qlist) {
59 print "\t"; 67 print "\t";
60 print qcal($nreads[1],$qs,\@qlines); 68 print qcal($nreads[1],$qs,\@qlines);
61 } 69 }
68 $cutoff=shift(@_); 76 $cutoff=shift(@_);
69 @qarray=@{$_[0]}; 77 @qarray=@{$_[0]};
70 $sum = 0; 78 $sum = 0;
71 79
72 foreach $item (@qarray) { 80 foreach $item (@qarray) {
73 my($qval,$q)=split(/\t/,$item); 81 my($qval,$q)=split(/\t/,$item);
74 if($qval>=$cutoff) { 82 if($qval>=$cutoff) {
75 $sum += $q; 83 $sum += $q;
76 } 84 }
77 } 85 }
78 $qmean = sprintf("%.2f", 100 * $sum / $nreads); 86 $qmean = sprintf("%.2f", 100 * $sum / $nreads);
79 return $qmean; 87 return $qmean;
80 } 88 }
81 89
90 } 98 }
91 99
92 $readq = sprintf("%.2f", $sum / $nreads); 100 $readq = sprintf("%.2f", $sum / $nreads);
93 return $readq; 101 return $readq;
94 } 102 }
103
# maxn(\@lines)
#
# Scans tab-separated lines and returns the largest value found in the
# second column, multiplied by 100 and formatted with two decimals.
#
#   \@lines - reference to an array of "label<TAB>value" strings
#             (the script passes the rows it extracts from the section
#             starting at "#Base<TAB>N-Count").
#
# Returns "0.00" when the list is empty or no row has a second column.
#
# NOTE(review): the result is scaled by 100, which assumes the value column
# holds fractions rather than percentages - confirm against the FastQC
# "Per base N content" output before relying on the magnitude.
sub maxn {
    my ($lines_ref) = @_;
    my $max_nval = 0;

    # All working variables are lexical so nothing leaks into package
    # globals shared with the other summary subs.
    foreach my $line (@{ $lines_ref || [] }) {
        my (undef, $nval) = split(/\t/, $line);

        # Rows without a value column carry no information; skip them
        # instead of comparing undef numerically.
        next unless defined $nval && length $nval;

        $max_nval = $nval if $nval > $max_nval;
    }

    return sprintf("%.2f", 100 * $max_nval);
}
117
# meanlen($nreads, \@lines)
#
# Computes the mean read length from a length-distribution table and
# returns it formatted with one decimal.
#
#   $nreads - total number of reads (the divisor)
#   \@lines - reference to an array of "length<TAB>count" strings, where
#             length is either a single value ("150") or a range written
#             as "low-high" ("150-154"); a range contributes its midpoint.
#
# Returns "0.0" when $nreads is zero/undefined, rather than dying with a
# division-by-zero error.
sub meanlen {
    my ($nreads, $lines_ref) = @_;
    my $weighted_sum = 0;

    foreach my $line (@{ $lines_ref || [] }) {
        my ($lenrange, $count) = split(/\t/, $line);
        next unless defined $lenrange && defined $count;

        my ($low, $high) = split(/-/, $lenrange);
        next unless defined $low && length $low;

        # A label without "-" is a single length, not a range. Treat it as
        # low == high; averaging it with an undefined upper bound would
        # halve the length (150 would be counted as 75).
        $high = $low unless defined $high && length $high;

        $weighted_sum += (($low + $high) / 2) * $count;
    }

    my $total = $nreads || 0;
    return sprintf("%.1f", 0) if $total == 0;

    return sprintf("%.1f", $weighted_sum / $total);
}