Mercurial > repos > estrain > sum_fastqc
annotate sum_fastqc.pl @ 5:7df018757d26 draft
Added additional fields, max %N sites and average length
author | estrain |
---|---|
date | Thu, 18 Oct 2018 17:14:47 -0400 |
parents | d47775122e78 |
children | 53bfb3b2c026 |
rev | line source |
---|---|
3 | 1 #!/usr/bin/perl |
2 | |
3 #################################################### | |
4 ## | |
5 ## sum_fastqc.pl | |
6 ## | |
7 ## Errol Strain (estrain@gmail.com) | |
8 ## | |
9 ## Description: Takes raw FASTQC output and produces | |
10 ## simple table summary | |
11 ## | |
12 #################################################### | |
13 | |
# Command line: <input name> <comma-separated Q cutoffs> <FastQC data file>...
my $inname = shift @ARGV;
my $qscore = shift @ARGV;
$qscore =~ s/\s+//g;    # drop any whitespace inside the cutoff list
my @qlist = split /\,/, $qscore;

# Header row: the fixed columns, then one "Q<cutoff>" column per cutoff.
print "Input\tFile\tFastQC\tPass-Fail\tReads\tPoor_Reads\tGC\%\tMaxN\%\tMeanLen\tMeanQ",
      ( map { "\tQ" . $_ } @qlist ),
      "\n";

# Every remaining argument is a raw FastQC file; emit one row for each.
print_stats($_) for @ARGV;
28 | |
# Return the data rows of one FastQC module.
#
# Arguments:
#   $lines     - array ref of the (chomped) lines of the FastQC file
#   $header_re - compiled regex matching the module's column-header line
#
# Returns every line after the first header match up to, but not including,
# the next line containing ">>END_MODULE". Returns an empty list when the
# header is not found.
sub _fastqc_module_rows {
    my ($lines, $header_re) = @_;

    my @rows;
    my $in_module = 0;
    foreach my $line (@{$lines}) {
        if ($in_module) {
            last if $line =~ />>END_MODULE/;
            push(@rows, $line);
        }
        elsif ($line =~ $header_re) {
            $in_module = 1;
        }
    }
    return @rows;
}

# Print one tab-separated summary row for a raw FastQC data file.
#
# Arguments:
#   $infile - path to the raw FastQC output file
#
# Output (STDOUT), matching the header row: input name ($inname), file name,
# FastQC version, pass/fail flag, read count, poor-read count, GC%, max %N,
# mean length, mean quality, then one column per cutoff in @qlist.
#
# The file is read directly in Perl instead of through head/awk/tail shell
# pipelines, so a path containing spaces or shell metacharacters cannot break
# or alter a command, and no GNU-specific options (head -n -1) are needed.
# Dies with a message if the file cannot be opened.
sub print_stats {
    my ($infile) = @_;

    open(my $fh, '<', $infile)
        or die "Cannot open FastQC data file '$infile': $!\n";
    my @lines = <$fh>;
    close($fh);
    chomp(@lines);

    # Second tab-separated column of the given 0-based line, '' when absent.
    my $value_at = sub {
        my ($idx) = @_;
        return '' unless defined $lines[$idx];
        my @cols = split(/\t/, $lines[$idx]);
        return defined $cols[1] ? $cols[1] : '';
    };

    # The first 10 lines of raw FASTQC output contain the basic overview.
    # Fields are taken by fixed line position.
    # NOTE(review): a FastQC release that adds or reorders overview lines
    # would shift these positions - confirm against the version in use.
    my $fastqc = $value_at->(0);
    my $pass   = $value_at->(1);
    my $fn     = $value_at->(3);
    my $nreads = $value_at->(6);
    my $npoor  = $value_at->(7);
    my $gc     = $value_at->(9);

    # Per-module tables are located by their column-header line.
    my @qlines   = _fastqc_module_rows(\@lines, qr/#Quality\tCount/);
    my @nlines   = _fastqc_module_rows(\@lines, qr/#Base\tN-Count/);
    my @lenlines = _fastqc_module_rows(\@lines, qr/#Length\tCount/);

    my @row = (
        $inname, $fn, $fastqc, $pass, $nreads, $npoor, $gc,
        scalar(maxn(\@nlines)),
        scalar(meanlen($nreads, \@lenlines)),
        scalar(readmean($nreads, \@qlines)),
    );
    foreach my $qs (@qlist) {
        push(@row, scalar(qcal($nreads, $qs, \@qlines)));
    }

    print join("\t", @row), "\n";
}
72 | |
# Sum reads w/ Q scores >= cutoff and divide by number of reads.
#
# Arguments:
#   $nreads - total number of reads (denominator)
#   $cutoff - minimum quality score to count (inclusive)
#   $rows   - array ref of "quality<TAB>count" rows
#
# Returns the percentage of reads at or above the cutoff, formatted with two
# decimals, or "NA" when the read count is missing or zero (which would
# otherwise be a fatal division by zero).
sub qcal {
    my ($nreads, $cutoff, $rows) = @_;

    return "NA" if !defined $nreads || $nreads == 0;

    my $sum = 0;
    foreach my $row (@{$rows}) {
        my ($qval, $count) = split(/\t/, $row);
        next unless defined $qval && defined $count;
        if ($qval >= $cutoff) {
            $sum += $count;
        }
    }

    return sprintf("%.2f", 100 * $sum / $nreads);
}
4 | 89 |
# Mean quality score across reads.
#
# Arguments:
#   $nreads - total number of reads (denominator)
#   $rows   - array ref of "quality<TAB>count" rows
#
# Returns sum(quality * count) / $nreads formatted with two decimals, or
# "NA" when the read count is missing or zero (which would otherwise be a
# fatal division by zero).
sub readmean {
    my ($nreads, $rows) = @_;

    return "NA" if !defined $nreads || $nreads == 0;

    my $sum = 0;
    foreach my $row (@{$rows}) {
        my ($qval, $count) = split(/\t/, $row);
        next unless defined $qval && defined $count;
        $sum += $count * $qval;
    }

    return sprintf("%.2f", $sum / $nreads);
}
5
7df018757d26
Added additional fields, max %N sites and average length
estrain
parents:
4
diff
changeset
|
103 |
7df018757d26
Added additional fields, max %N sites and average length
estrain
parents:
4
diff
changeset
|
# Largest N value over all rows, scaled by 100.
#
# Arguments:
#   $rows - array ref of "position<TAB>value" rows; the position column
#           is ignored
#
# Returns 100 * (largest value seen, starting from 0) formatted with two
# decimals; "0.00" for an empty list.
#
# NOTE(review): the factor of 100 is kept from the original code. If the
# FastQC N-Count column is already a percentage, this over-scales the
# result - confirm against the FastQC output format.
sub maxn {
    my ($rows) = @_;

    my $max_nval = 0;
    foreach my $row (@{$rows}) {
        my ($position, $nval) = split(/\t/, $row);
        next unless defined $nval;
        $max_nval = $nval if $nval > $max_nval;
    }

    return sprintf("%.2f", 100 * $max_nval);
}
7df018757d26
Added additional fields, max %N sites and average length
estrain
parents:
4
diff
changeset
|
117 |
7df018757d26
Added additional fields, max %N sites and average length
estrain
parents:
4
diff
changeset
|
# Mean read length from a length-distribution table.
#
# Arguments:
#   $nreads - total number of reads (denominator)
#   $rows   - array ref of "length<TAB>count" rows, where length is either
#             a single value ("151") or a range ("150-159")
#
# Each row contributes its length times its count; a range contributes its
# midpoint. A single-valued length is used as-is (previously the missing
# upper bound was treated as 0, halving the reported length).
#
# Returns the mean formatted with one decimal, or "NA" when the read count
# is missing or zero (which would otherwise be a fatal division by zero).
sub meanlen {
    my ($nreads, $rows) = @_;

    return "NA" if !defined $nreads || $nreads == 0;

    my $sum = 0;
    foreach my $row (@{$rows}) {
        my ($lenrange, $count) = split(/\t/, $row);
        next unless defined $lenrange && defined $count;

        my ($low, $high) = split(/\-/, $lenrange);
        next unless defined $low;
        # No "-" in the bin label: the bin is a single length.
        $high = $low unless defined $high;

        $sum += (($low + $high) / 2) * $count;
    }

    return sprintf("%.1f", $sum / $nreads);
}