annotate sum_fastqc.pl @ 7:53bfb3b2c026 draft

Uploaded
author estrain
date Thu, 18 Oct 2018 22:04:04 -0400
parents 7df018757d26
children 5a9a44e23dad
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
8256c1d0d63b Uploaded
estrain
parents:
diff changeset
1 #!/usr/bin/perl
8256c1d0d63b Uploaded
estrain
parents:
diff changeset
2
8256c1d0d63b Uploaded
estrain
parents:
diff changeset
3 ####################################################
8256c1d0d63b Uploaded
estrain
parents:
diff changeset
4 ##
8256c1d0d63b Uploaded
estrain
parents:
diff changeset
5 ## sum_fastqc.pl
8256c1d0d63b Uploaded
estrain
parents:
diff changeset
6 ##
8256c1d0d63b Uploaded
estrain
parents:
diff changeset
7 ## Errol Strain (estrain@gmail.com)
8256c1d0d63b Uploaded
estrain
parents:
diff changeset
8 ##
8256c1d0d63b Uploaded
estrain
parents:
diff changeset
9 ## Description: Takes raw FASTQC output and produces
8256c1d0d63b Uploaded
estrain
parents:
diff changeset
10 ## simple table summary
8256c1d0d63b Uploaded
estrain
parents:
diff changeset
11 ##
8256c1d0d63b Uploaded
estrain
parents:
diff changeset
12 ####################################################
8256c1d0d63b Uploaded
estrain
parents:
diff changeset
13
8256c1d0d63b Uploaded
estrain
parents:
diff changeset
# Command line: <input label> <comma-separated Q cutoffs> <FastQC data file>...
my $inname = shift @ARGV;
my $qscore = shift @ARGV;

# Cutoffs may contain stray blanks (e.g. "20, 30"); strip all whitespace
# before splitting on commas.
$qscore =~ s/\s+//g;
my @qlist = split /,/, $qscore;

# Header row: the fixed columns first, then one "Q<cutoff>%" column for
# every requested cutoff.
my @columns = (
    'Input', 'File', 'FastQC', 'Pass-Fail', 'Reads',
    'Poor_Reads', 'GC%', 'Max_N%', 'Avg_Len', 'Mean_Q',
);
push @columns, map { 'Q' . $_ . '%' } @qlist;
print join("\t", @columns), "\n";

# Every remaining argument is a report file; each one yields a table row.
print_stats($_) for @ARGV;
8256c1d0d63b Uploaded
estrain
parents:
diff changeset
28
8256c1d0d63b Uploaded
estrain
parents:
diff changeset
# Print one tab-separated summary row for a single raw FastQC data file.
#
# Arguments:
#   $infile - path to the FastQC text report to summarise
#
# Reads the file-level $inname (row label) and @qlist (Q-score cutoffs),
# and writes the row to STDOUT. Dies if the report cannot be opened.
#
# The report is read once in Perl instead of being piped through
# head/awk/tail: interpolating the file name into a shell command breaks on
# names containing spaces or shell metacharacters, and "head -n -1" is a
# GNU extension.
sub print_stats {
    my ($infile) = @_;

    open(my $fh, '<', $infile)
        or die "Cannot open FastQC report '$infile': $!\n";
    my @lines = <$fh>;
    close($fh);
    chomp(@lines);

    # Second tab-separated field of the given (0-based) report line, or ''
    # when the line or the field is absent.
    my $field = sub {
        my ($idx) = @_;
        return '' unless defined $lines[$idx];
        my @parts = split(/\t/, $lines[$idx]);
        return defined $parts[1] ? $parts[1] : '';
    };

    # The first 10 lines carry the basic overview. The positions below
    # assume the usual "Basic Statistics" layout of a FastQC report.
    my $version  = $field->(0);    # printed under "FastQC"
    my $status   = $field->(1);    # printed under "Pass-Fail"
    my $filename = $field->(3);    # printed under "File"
    my $nreads   = $field->(6);    # printed under "Reads"
    my $npoor    = $field->(7);    # printed under "Poor_Reads"
    my $gc       = $field->(9);    # printed under "GC%"

    # Per-module tables are located by their column-header line.
    my @qlines   = _module_rows(\@lines, qr/#Quality\tCount/);
    my @nlines   = _module_rows(\@lines, qr/#Base\tN-Count/);
    my @lenlines = _module_rows(\@lines, qr/#Length\tCount/);

    my @row = (
        $inname, $filename, $version, $status, $nreads, $npoor, $gc,
        maxn(\@nlines),
        meanlen($nreads, \@lenlines),
        readmean($nreads, \@qlines),
    );
    push(@row, map { qcal($nreads, $_, \@qlines) } @qlist);

    print join("\t", @row), "\n";
}

# Return the data rows of one report module: every line after the first
# line matching $header_re, up to (not including) the next >>END_MODULE.
# Returns an empty list when the header is not found.
sub _module_rows {
    my ($lines, $header_re) = @_;
    my @rows;
    my $inside = 0;

    foreach my $line (@{$lines}) {
        if (!$inside) {
            $inside = 1 if $line =~ $header_re;
            next;    # skip everything up to and including the header
        }
        last if $line =~ />>END_MODULE/;
        push(@rows, $line);
    }
    return @rows;
}
8256c1d0d63b Uploaded
estrain
parents:
diff changeset
72
8256c1d0d63b Uploaded
estrain
parents:
diff changeset
# Percentage of reads whose Q score is at or above a cutoff.
#
# Arguments:
#   $nreads - total number of reads (the denominator)
#   $cutoff - minimum Q score to count (inclusive)
#   $rows   - array ref of "qscore<TAB>count" rows
#
# Returns the percentage formatted with two decimals, or "NA" when $nreads
# is missing or zero (the original died with a division-by-zero error).
sub qcal {
    my ($nreads, $cutoff, $rows) = @_;

    return 'NA' unless defined $nreads && $nreads != 0;

    my $sum = 0;
    foreach my $row (@{$rows}) {
        my ($qval, $count) = split(/\t/, $row);
        $sum += $count if $qval >= $cutoff;
    }
    return sprintf("%.2f", 100 * $sum / $nreads);
}
4
d47775122e78 Uploaded
estrain
parents: 3
diff changeset
89
7
53bfb3b2c026 Uploaded
estrain
parents: 5
diff changeset
# Calculate the mean read Q score.
#
# Arguments:
#   $nreads - total number of reads (the denominator)
#   $rows   - array ref of "qscore<TAB>count" rows
#
# Returns the count-weighted mean Q score formatted with two decimals, or
# "NA" when $nreads is missing or zero (the original died with a
# division-by-zero error).
sub readmean {
    my ($nreads, $rows) = @_;

    return 'NA' unless defined $nreads && $nreads != 0;

    my $sum = 0;
    foreach my $row (@{$rows}) {
        my ($qval, $count) = split(/\t/, $row);
        $sum += $count * $qval;
    }
    return sprintf("%.2f", $sum / $nreads);
}
5
7df018757d26 Added additional fields, max %N sites and average length
estrain
parents: 4
diff changeset
104
7
53bfb3b2c026 Uploaded
estrain
parents: 5
diff changeset
# Find the highest N value reported at any read position.
#
# Arguments:
#   $rows - array ref of "position<TAB>value" rows
#
# Returns the largest value formatted with four decimals; "0.0000" when
# there are no rows or no value exceeds zero. All working variables are
# lexical so the sub no longer writes to package globals.
sub maxn {
    my ($rows) = @_;
    my $max_nval = 0;

    foreach my $row (@{$rows}) {
        my (undef, $nval) = split(/\t/, $row);
        next unless defined $nval;
        $max_nval = $nval if $nval > $max_nval;
    }
    return sprintf("%.4f", $max_nval);
}
7df018757d26 Added additional fields, max %N sites and average length
estrain
parents: 4
diff changeset
119
7
53bfb3b2c026 Uploaded
estrain
parents: 5
diff changeset
# Calculate the mean read length.
#
# Arguments:
#   $nreads - total number of reads (the denominator)
#   $rows   - array ref of "length<TAB>count" rows, where length is either
#             a single value ("101") or a binned range ("30-39")
#
# Returns the count-weighted mean length formatted with one decimal, or
# "NA" when $nreads is missing or zero (the original died with a
# division-by-zero error). A binned range contributes its midpoint.
sub meanlen {
    my ($nreads, $rows) = @_;

    return 'NA' unless defined $nreads && $nreads != 0;

    my $sum = 0;
    foreach my $row (@{$rows}) {
        my ($lenrange, $count) = split(/\t/, $row);
        my ($low, $high) = split(/-/, $lenrange);

        # A single length has no upper bound; without this the midpoint
        # would be computed as length/2 and halve the reported mean.
        $high = $low unless defined $high && length $high;

        $sum += (($low + $high) / 2) * $count;
    }
    return sprintf("%.1f", $sum / $nreads);
}