Mercurial > repos > nml > fastqc_stats
comparison fastqc_stats.pl @ 0:2c74c5c70520 draft default tip
planemo upload for repository https://github.com/phac-nml/galaxy_tools commit d5b7cb71616c0ac20e02ff8cb8c147b9d4a31691
author | nml |
---|---|
date | Wed, 11 Oct 2017 16:22:08 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2c74c5c70520 |
---|---|
1 #!/usr/bin/env perl | |
2 package nml_fastqc_stats; | |
3 use strict; | |
4 use warnings; | |
5 use Pod::Usage; | |
6 use Getopt::Long; | |
7 use autodie; | |
8 use File::Basename; | |
9 use Bio::SeqIO; | |
10 use File::Temp qw/ tempdir /; | |
11 __PACKAGE__->run unless caller; | |
12 | |
13 | |
14 | |
15 sub run { | |
16 #grabbing arguments either from command line or from another module/script | |
17 my ( $rawdatas, $fastqs,$num_bps,$sample,$out) = get_parameters(@_); | |
18 | |
19 print_header($out,scalar @$fastqs); | |
20 my @results; | |
21 foreach my $rawdata ( @$rawdatas) { | |
22 my %results = %{ parse_FastQC($rawdata) } ; | |
23 push @results,\%results; | |
24 } | |
25 | |
26 | |
27 my %results; | |
28 | |
29 | |
30 if ( scalar @$rawdatas ==1 ) { | |
31 %results = % { $results[0]}; | |
32 | |
33 } | |
34 else { | |
35 #need to combine the info into a SINGLE results file | |
36 | |
37 my ($r1,$r2) = ($results[0],$results[1]); | |
38 | |
39 | |
40 foreach my $key( keys %$r1) { | |
41 if ( exists $r2->{$key}) { | |
42 my ($value1,$value2) = ($r1->{$key},$r2->{$key}); | |
43 | |
44 #check to see if data that could/should be different | |
45 my %diff_ignore = ('Filename'=>1); | |
46 my %combine = ('duplicate_lvl'=>1,'dist_length'=>1,'overre'=>1,'Total Sequences'=>1,'Sequence length'=>1,'%GC'=>1); | |
47 if (exists $combine{$key} ) { | |
48 if ( $key eq 'duplicate_lvl') { | |
49 $results{'Duplicate level R1'} = $value1; | |
50 $results{'Duplicate level R2'} = $value2; | |
51 } | |
52 elsif ( $key eq 'dist_length') { | |
53 my %dist=%{$value1}; | |
54 foreach my $key( keys %$value2) { | |
55 #adding the number of reads to either existing range or new | |
56 if ( exists $dist{$key}) { | |
57 $dist{$key}{'count'}+=$value2->{$key}{'count'}; | |
58 | |
59 } | |
60 else { | |
61 $dist{$key} = $value2->{$key}; | |
62 } | |
63 } | |
64 | |
65 $results{$key} = \%dist; | |
66 | |
67 } | |
68 elsif ( $key eq 'Total Sequences') { | |
69 $results{$key}= $value1 + $value2; | |
70 } | |
71 elsif ( $key eq 'overre') { | |
72 $results{$key}=$value1 + $value2; | |
73 } | |
74 elsif ( $key eq '%GC') { | |
75 $results{$key}=($value1 + $value2)/2; | |
76 } | |
77 elsif ( $key eq 'Sequence length') { | |
78 if ( $value1 eq $value2) { | |
79 $results{$key}= $value1; | |
80 } | |
81 else { | |
82 #figure out the range by splitting all values into array then sort | |
83 my @number = split /-/,$value1; | |
84 map { push @number,$_ } split'-',$value2; | |
85 @number = sort {$a <=> $b } @number; | |
86 $results{$key} = $number[0] . '-' . $number[$#number]; | |
87 } | |
88 } | |
89 | |
90 } | |
91 #both the same we are good | |
92 elsif ( $value1 eq $value2) { | |
93 $results{$key}=$value1; | |
94 } | |
95 elsif ( ! exists $diff_ignore{$key}) { | |
96 die "Have different value between fastqc files for '$key'\n"; | |
97 } | |
98 | |
99 | |
100 | |
101 } | |
102 else { | |
103 die "Cannot find key : '$key' in reverse fastqc file\n"; | |
104 } | |
105 } | |
106 | |
107 | |
108 } | |
109 | |
110 #add sample name given by user | |
111 $results{'filename'} = $sample; | |
112 | |
113 #perform coverage, total base pair and SE or PE | |
114 if ( $fastqs) { | |
115 my $result = determine_stats($fastqs,$num_bps); | |
116 if ( $result) { | |
117 foreach ( keys %$result) { | |
118 if ( exists $results{$_}) { | |
119 die "Have two functions with key: '$_'\n"; | |
120 } | |
121 else { | |
122 $results{$_}= $result->{$_}; | |
123 } | |
124 } | |
125 } | |
126 } | |
127 | |
128 print_csv($out,\%results); | |
129 | |
130 | |
131 | |
132 return 1; | |
133 } | |
134 | |
135 sub determine_stats { | |
136 my ($fastqs,$num_bps) = @_; | |
137 | |
138 | |
139 my %results; | |
140 my $number = scalar @$fastqs; | |
141 my $status; | |
142 | |
143 if ( $number ==1) { | |
144 $status='SE'; | |
145 } | |
146 elsif ( $number ==2) { | |
147 $status='PE'; | |
148 } | |
149 else { | |
150 $status='N/A'; | |
151 } | |
152 | |
153 $results{'pe_status'} = $status; | |
154 | |
155 #determine total base pair | |
156 | |
157 my $all_fastq = join (' ' , map { "\"$_\"" } @$fastqs); | |
158 | |
159 #one liner from : http://onetipperday.blogspot.ca/2012/05/simple-way-to-get-reads-length.html with some modification by Philip Mabon | |
160 my $total=`cat $all_fastq | perl -ne '\$s=<>;<>;<>;chomp(\$s);print length(\$s)."\n";' | sort | uniq -c | perl -e 'while(my \$line=<>){chomp \$line; \$line =~s/^\\s+//;(\$l,\$n)=split/\\s+/,\$line; \$t+= \$l*\$n;} print "\$t\n"'`; | |
161 chomp $total; | |
162 | |
163 $results{'total_bp'} = $total; | |
164 | |
165 if ($total) { | |
166 | |
167 $results{'coverage'} = sprintf "%.2f", ($total/$num_bps); | |
168 | |
169 #reference name stuff | |
170 #my $name = basename($reference); | |
171 #$name =~ s/\.fasta//; | |
172 | |
173 $results{'reference'} = $num_bps; | |
174 | |
175 | |
176 } | |
177 | |
178 return \%results; | |
179 | |
180 } | |
181 | |
182 | |
183 sub print_csv { | |
184 my ($out_fh,$results) = @_; | |
185 my %results = %{$results}; | |
186 | |
187 my @line; | |
188 | |
189 #Name | |
190 my $name = $results{'filename'} || 'N/A'; | |
191 $name =~ s/.fastq//; | |
192 push @line,$name; | |
193 | |
194 #Indicate if PE,SE or multiple | |
195 my $status= $results{'pe_status'} || 'N/A'; | |
196 push @line,$status; | |
197 | |
198 #encoding of fastq file | |
199 push @line, $results{'Encoding'} || 'N/A'; | |
200 | |
201 #number of reads found | |
202 my $reads = $results{'Total Sequences'} || 'N/A'; | |
203 | |
204 push @line,$reads || 'N/A'; | |
205 | |
206 #number of Total Base Pairs | |
207 push @line, $results{'total_bp'} || 'N/A'; | |
208 | |
209 #sequence read length range | |
210 push @line, $results{'Sequence length'} || 'N/A'; | |
211 | |
212 #most abundant read length | |
213 | |
214 my ($most_abundant) = sort {$results{'dist_length'}{$b}{'count'} <=> $results{'dist_length'}{$a}{'count'} } keys %{$results{'dist_length'}}; | |
215 my ($most_count) = $results{'dist_length'}{$most_abundant}{'count'} || 'N/A'; | |
216 | |
217 push @line, $most_abundant; | |
218 push @line, $most_count; | |
219 | |
220 #coverage against reference | |
221 push @line, $results{'coverage'} || 'N/A'; | |
222 push @line, $results{'reference'} || 'N/A'; | |
223 | |
224 | |
225 #duplicate level | |
226 if ( $results{'pe_status'} eq 'SE') { | |
227 push @line, $results{'duplicate_lvl'} || 'N/A'; | |
228 } | |
229 elsif ( $results{'pe_status'} eq 'PE') { | |
230 push @line, $results{'Duplicate level R1'} || 'N/A'; | |
231 push @line, $results{'Duplicate level R2'} || 'N/A'; | |
232 } | |
233 | |
234 | |
235 | |
236 #determine percentage of reads that are over-represented sequences | |
237 push @line, exists $results{'overre'} ? $results{'overre'} : 'N/A'; | |
238 | |
239 print $out_fh join("\t",@line) . "\n"; | |
240 | |
241 return; | |
242 } | |
243 | |
244 sub print_header { | |
245 my ($out_fh,$fastqs) = @_; | |
246 | |
247 my @headers; | |
248 if ( $fastqs==2) { | |
249 @headers = ('Name','SE/PE','Encoding' , '# of Reads', 'Total # Base Pairs', 'Sequence length range','Most abundant read length','# of reads for abundant','Estimated Coverage','Reference length','Duplicate % R1','Duplicate % R2','# of Overrepresented sequences'); | |
250 } | |
251 else { | |
252 @headers = ('Name','SE/PE','Encoding' , '# of Reads', 'Total # Base Pairs', 'Sequence length range','Most abundant read length','# of reads for abundant','Estimated Coverage','Reference length','Duplicate %','# of Overrepresented sequences'); | |
253 } | |
254 | |
255 print $out_fh join("\t",@headers) . "\n"; | |
256 | |
257 | |
258 return; | |
259 } | |
260 | |
261 | |
262 | |
263 sub parse_FastQC { | |
264 my ($file) = @_; | |
265 | |
266 my %results; | |
267 | |
268 my %todo = ( | |
269 'Per base sequence quality' => \&per_base_quality, | |
270 'Per sequence quality scores' => \&per_seq_quality, | |
271 'Per base sequence content' => \&per_seq_content, | |
272 'Per base GC content' => \&per_base_gc_content, | |
273 'Per sequence GC content' => \&per_seq_gc_content, | |
274 'Per base N content' => \&per_base_n_content, | |
275 'Sequence Length Distribution' => \&seq_length_dis, | |
276 'Sequence Duplication Levels' => \&seq_dupli_lvl, | |
277 'Overrepresented sequences' => \&overrepresented_seq, | |
278 'Kmer Content' => \&kmer_content, | |
279 'Basic Statistics' => \&basic_stats | |
280 | |
281 ); | |
282 | |
283 | |
284 open my $in , '<', $file; | |
285 | |
286 #get header | |
287 my $version = <$in>; | |
288 | |
289 #create functional code | |
290 my $next_sec = next_section($in); | |
291 | |
292 my %sections; | |
293 | |
294 while ( my ($sec_name,$fh_lines) = $next_sec->()) { | |
295 if (! ($sec_name || $fh_lines)) { | |
296 last; | |
297 } | |
298 | |
299 | |
300 #see if we are at a beginning of new section | |
301 if ($sec_name =~ /^>>(.*)\s+(warn|fail|pass)$/) { | |
302 my ($name,$status) = ($1,$2); | |
303 $sections{$name}= $status; | |
304 | |
305 if ( exists $todo{$name}) { | |
306 my $result =$todo{$name}->($fh_lines); | |
307 | |
308 if ( $result) { | |
309 #combine results together | |
310 foreach ( keys %$result) { | |
311 if ( exists $results{$_}) { | |
312 die "Have two functions with key: '$_'\n"; | |
313 } | |
314 else { | |
315 $results{$_}= $result->{$_}; | |
316 } | |
317 } | |
318 } | |
319 } | |
320 } | |
321 } | |
322 | |
323 | |
324 return \%results; | |
325 } | |
326 | |
327 | |
328 sub basic_stats { | |
329 my ($lines) = @_; | |
330 my $header = <$lines>; | |
331 | |
332 my %stats; | |
333 | |
334 while (<$lines>) { | |
335 chomp; | |
336 my @data = split/\t/; | |
337 my $value = pop @data; | |
338 if ( $value eq '>>END_MODULE') { | |
339 last; | |
340 } | |
341 my $key = join(' ',@data); | |
342 $stats{$key}=$value; | |
343 } | |
344 | |
345 return \%stats; | |
346 } | |
347 | |
348 | |
349 | |
350 sub per_base_quality { | |
351 my ($lines) = @_; | |
352 my %results; | |
353 | |
354 return \%results; | |
355 } | |
356 | |
357 sub per_seq_quality { | |
358 my ($lines) = @_; | |
359 my %results; | |
360 | |
361 return \%results; | |
362 } | |
363 | |
364 sub per_seq_content { | |
365 my ($lines) = @_; | |
366 my %results; | |
367 | |
368 return \%results; | |
369 } | |
370 sub per_base_gc_content { | |
371 my ($lines) = @_; | |
372 my %results; | |
373 | |
374 return \%results; | |
375 } | |
376 | |
377 sub per_seq_gc_content { | |
378 my ($lines) = @_; | |
379 my %results; | |
380 | |
381 return \%results; | |
382 } | |
383 | |
384 sub per_base_n_content { | |
385 my ($lines) = @_; | |
386 my %results; | |
387 | |
388 return \%results; | |
389 } | |
390 | |
391 sub seq_length_dis { | |
392 my ($lines) = @_; | |
393 my %results; | |
394 | |
395 my $header = <$lines>; | |
396 my %lengths; | |
397 while (<$lines>) { | |
398 chomp; | |
399 if ( $_ =~ /^>>/) { | |
400 next; | |
401 } | |
402 | |
403 my ($key,$value) = split/\s+/; | |
404 if ( $key =~ /(\d+)-(\d+)/) { | |
405 $lengths{$1}{'count'} =$value; | |
406 $lengths{$1}{'key'} =$key; | |
407 | |
408 } | |
409 else { | |
410 $lengths{$key}{'count'} =$value; | |
411 $lengths{$key}{'key'} =$key; | |
412 } | |
413 | |
414 } | |
415 | |
416 | |
417 return {'dist_length' => \%lengths}; | |
418 } | |
419 sub seq_dupli_lvl { | |
420 my ($lines) = @_; | |
421 | |
422 my $perc_dupl = <$lines>; | |
423 my $perc; | |
424 if ( $perc_dupl =~ /^\#Total Duplicate Percentage\s+(.*)$/ ) { | |
425 $perc = $1; | |
426 } | |
427 elsif ($perc_dupl =~ /^\#Total Deduplicated Percentage\s+(.*)$/ ) { | |
428 $perc = $1; | |
429 #version 0.11.2 and above, it indicates as Deduplicate insetad of Duplicated | |
430 #we want to know % of Duplicate sequences instead | |
431 $perc = $perc - 100.00; | |
432 } | |
433 | |
434 my $header = <$lines>; | |
435 #ignoring the results of the graph for now | |
436 | |
437 $perc = sprintf "%.2f",$perc; | |
438 | |
439 return {'duplicate_lvl' => $perc}; | |
440 } | |
441 | |
442 sub overrepresented_seq { | |
443 my ($lines) = @_; | |
444 | |
445 my %results; | |
446 | |
447 my $header = <$lines>; | |
448 my %seqs; | |
449 my $total; | |
450 | |
451 while ( <$lines>) { | |
452 chomp; | |
453 if ( $_=~ />>END_MODULE/) { | |
454 last; | |
455 } | |
456 my @data = split/\s+/; | |
457 | |
458 $seqs{$data[0]} =$data[2]; | |
459 $total+=$data[2]; | |
460 | |
461 } | |
462 | |
463 if ( ! $total) { | |
464 $results{'overre'}=0; | |
465 | |
466 } | |
467 else { | |
468 $results{'overre'} = sprintf "%.2f",$total; | |
469 } | |
470 | |
471 | |
472 | |
473 return \%results; | |
474 } | |
475 | |
476 sub kmer_content { | |
477 my ($lines) = @_; | |
478 my %results; | |
479 | |
480 return \%results; | |
481 } | |
482 | |
483 | |
484 | |
485 sub next_section { | |
486 my ($in)=@_; | |
487 my $lines; | |
488 | |
489 return sub { | |
490 local $/ = ">>END_MODULE\n"; | |
491 $lines= <$in>; | |
492 my ($name,$sec); | |
493 { | |
494 if ($lines) { | |
495 local $/ = "\n"; | |
496 open $sec ,'<',\$lines; | |
497 $name = <$sec>; | |
498 chomp $name; | |
499 } | |
500 | |
501 } | |
502 | |
503 | |
504 return ($name,$sec); | |
505 } | |
506 } | |
507 | |
508 | |
509 sub get_parameters { | |
510 my ($out,$sample,@rawdatas,@fastq,$ref,$rawdataSE,$rawdataPE_R1,$rawdataPE_R2,$fastqSE,$fastqPE_R1,$fastqPE_R2,$galaxy,$num_bps); | |
511 #determine if our input are as sub arguments or getopt::long | |
512 if ( @_ && $_[0] eq __PACKAGE__ ) { | |
513 # Get command line options | |
514 GetOptions( | |
515 'o|out=s' => \$out, | |
516 'fastq_se=s' => \$fastqSE, | |
517 'fastq_pe_1=s' => \$fastqPE_R1, | |
518 'fastq_pe_2=s' => \$fastqPE_R2, | |
519 'g_rawdata_se=s' => \$rawdataSE, | |
520 'g_rawdata_pe_1=s' => \$rawdataPE_R1, | |
521 'g_rawdata_pe_2=s' => \$rawdataPE_R2, | |
522 'sample=s' => \$sample, | |
523 'ref=s' => \$ref, | |
524 'num_bps=s' => \$num_bps | |
525 ); | |
526 } | |
527 else { | |
528 die "NYI\n"; | |
529 #( $file, $out ) = @_; | |
530 } | |
531 | |
532 if ( !$sample) { | |
533 $sample = "Unknown sample"; | |
534 } | |
535 | |
536 if ( $fastqSE) { | |
537 if ( ! (-e $fastqSE) ) { | |
538 print "ERROR: Was given a fastq SE file but could not find it: '$fastqSE'\n"; | |
539 pod2usage( -verbose => 1 ); | |
540 } | |
541 else { | |
542 push @fastq,$fastqSE; | |
543 } | |
544 if ( !$rawdataSE || !( -e $rawdataSE)) { | |
545 print "ERROR: Was not given or could not rawdata file: '$rawdataSE'\n"; | |
546 pod2usage( -verbose => 1 ); | |
547 } | |
548 else { | |
549 push @rawdatas,$rawdataSE; | |
550 } | |
551 | |
552 } | |
553 elsif ( !$fastqPE_R1 || ! (-e $fastqPE_R1) || !$fastqPE_R2 || ! (-e $fastqPE_R2) ) { | |
554 print "ERROR: Was given a fastqPE R1 or R2 file but could not find it: '$fastqPE_R1' , '$fastqPE_R2'\n"; | |
555 pod2usage( -verbose => 1 ); | |
556 | |
557 } | |
558 else { | |
559 push @fastq,$fastqPE_R1; | |
560 push @fastq,$fastqPE_R2; | |
561 | |
562 | |
563 for my $rawdata ( ($rawdataPE_R1,$rawdataPE_R2)) { | |
564 if (!$rawdata || !(-e $rawdata)) { | |
565 print "ERROR: Was not given or could not rawdata file: '$rawdata'\n"; | |
566 pod2usage( -verbose => 1 ); | |
567 } | |
568 else { | |
569 push @rawdatas,$rawdata; | |
570 } | |
571 } | |
572 } | |
573 | |
574 if ( $ref && !( -e $ref ) ) { | |
575 print "ERROR: Was given a reference file but could not find it: '$ref'\n"; | |
576 pod2usage( -verbose => 1 ); | |
577 } | |
578 | |
579 if ($ref && $num_bps) | |
580 { | |
581 print "ERROR: Was given both a reference file and number of base pairs. One or the other please."; | |
582 pod2usage( -verbose => 1 ); | |
583 } | |
584 | |
585 if ($ref) | |
586 { | |
587 $num_bps = 0; | |
588 my $in = Bio::SeqIO->new(-format=>'fasta' , -file => $ref); | |
589 while ( my $seq = $in->next_seq()) { | |
590 $num_bps += $seq->length(); | |
591 } | |
592 | |
593 if ($num_bps == 0) | |
594 { | |
595 print "ERROR: number of base pairs read from reference file is 0. Please check validity of reference file."; | |
596 pod2usage( -verbose=> 1 ); | |
597 } | |
598 } | |
599 | |
600 $out = _set_out_fh($out); | |
601 | |
602 return (\@rawdatas,\@fastq,$num_bps,$sample,$out); | |
603 } | |
604 | |
605 | |
606 | |
607 sub _set_out_fh { | |
608 my ($output) = @_; | |
609 my $out_fh; | |
610 | |
611 if ( defined $output && ref $output && ref $output eq 'GLOB' ) { | |
612 $out_fh = $output; | |
613 } | |
614 elsif ( defined $output ) { | |
615 open( $out_fh, '>', $output ); | |
616 } | |
617 else { | |
618 $out_fh = \*STDOUT; | |
619 } | |
620 | |
621 return $out_fh; | |
622 } | |
623 | |
624 | |
625 | |
626 1; | |
627 | |
628 =head1 NAME | |
629 | |
630 nml_fastqc_stats.pl.pl - (Galaxy only script) Parse fastqc runs into a csv summary report | |
631 | |
632 | |
633 =head1 SYNOPSIS | |
634 | |
635 nml_fastqc_stats.pl.pl -z fastqc.txt | |
636 nml_fastqc_stats.pl.pl -z fastqc.txt -o results.csv --ref NC_33333.fa | |
637 nml_fastqc_stats.pl.pl -z fastqc.txt -o results.csv --ref NC_33333.fa --fastq ya_R1.fastq --fatq ya_R2.fastq | |
638 | |
639 =head1 OPTIONS | |
640 | |
641 =over | |
642 | |
643 | |
644 =item B<-z> | |
645 | |
646 FastQC txt rawdata file | |
647 | |
648 | |
649 =item B<-o> B<--out> | |
650 | |
651 Output csv file | |
652 | |
653 =item B<--fastq> | |
654 | |
655 Path to fastq file(s) . Used for providing total base pairs and coverage information | |
656 | |
657 | |
658 =item B<--ref> | |
659 | |
660 Path to reference file. Needed for providing coverage information | |
661 | |
662 =back | |
663 | |
664 =head1 DESCRIPTION | |
665 | |
666 | |
667 | |
668 | |
669 =cut |