comparison fastqc_stats.pl @ 0:2c74c5c70520 draft default tip

planemo upload for repository https://github.com/phac-nml/galaxy_tools commit d5b7cb71616c0ac20e02ff8cb8c147b9d4a31691
author nml
date Wed, 11 Oct 2017 16:22:08 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2c74c5c70520
1 #!/usr/bin/env perl
2 package nml_fastqc_stats;
3 use strict;
4 use warnings;
5 use Pod::Usage;
6 use Getopt::Long;
7 use autodie;
8 use File::Basename;
9 use Bio::SeqIO;
10 use File::Temp qw/ tempdir /;
11 __PACKAGE__->run unless caller;
12
13
14
15 sub run {
16 #grabbing arguments either from command line or from another module/script
17 my ( $rawdatas, $fastqs,$num_bps,$sample,$out) = get_parameters(@_);
18
19 print_header($out,scalar @$fastqs);
20 my @results;
21 foreach my $rawdata ( @$rawdatas) {
22 my %results = %{ parse_FastQC($rawdata) } ;
23 push @results,\%results;
24 }
25
26
27 my %results;
28
29
30 if ( scalar @$rawdatas ==1 ) {
31 %results = % { $results[0]};
32
33 }
34 else {
35 #need to combine the info into a SINGLE results file
36
37 my ($r1,$r2) = ($results[0],$results[1]);
38
39
40 foreach my $key( keys %$r1) {
41 if ( exists $r2->{$key}) {
42 my ($value1,$value2) = ($r1->{$key},$r2->{$key});
43
44 #check to see if data that could/should be different
45 my %diff_ignore = ('Filename'=>1);
46 my %combine = ('duplicate_lvl'=>1,'dist_length'=>1,'overre'=>1,'Total Sequences'=>1,'Sequence length'=>1,'%GC'=>1);
47 if (exists $combine{$key} ) {
48 if ( $key eq 'duplicate_lvl') {
49 $results{'Duplicate level R1'} = $value1;
50 $results{'Duplicate level R2'} = $value2;
51 }
52 elsif ( $key eq 'dist_length') {
53 my %dist=%{$value1};
54 foreach my $key( keys %$value2) {
55 #adding the number of reads to either existing range or new
56 if ( exists $dist{$key}) {
57 $dist{$key}{'count'}+=$value2->{$key}{'count'};
58
59 }
60 else {
61 $dist{$key} = $value2->{$key};
62 }
63 }
64
65 $results{$key} = \%dist;
66
67 }
68 elsif ( $key eq 'Total Sequences') {
69 $results{$key}= $value1 + $value2;
70 }
71 elsif ( $key eq 'overre') {
72 $results{$key}=$value1 + $value2;
73 }
74 elsif ( $key eq '%GC') {
75 $results{$key}=($value1 + $value2)/2;
76 }
77 elsif ( $key eq 'Sequence length') {
78 if ( $value1 eq $value2) {
79 $results{$key}= $value1;
80 }
81 else {
82 #figure out the range by splitting all values into array then sort
83 my @number = split /-/,$value1;
84 map { push @number,$_ } split'-',$value2;
85 @number = sort {$a <=> $b } @number;
86 $results{$key} = $number[0] . '-' . $number[$#number];
87 }
88 }
89
90 }
91 #both the same we are good
92 elsif ( $value1 eq $value2) {
93 $results{$key}=$value1;
94 }
95 elsif ( ! exists $diff_ignore{$key}) {
96 die "Have different value between fastqc files for '$key'\n";
97 }
98
99
100
101 }
102 else {
103 die "Cannot find key : '$key' in reverse fastqc file\n";
104 }
105 }
106
107
108 }
109
110 #add sample name given by user
111 $results{'filename'} = $sample;
112
113 #perform coverage, total base pair and SE or PE
114 if ( $fastqs) {
115 my $result = determine_stats($fastqs,$num_bps);
116 if ( $result) {
117 foreach ( keys %$result) {
118 if ( exists $results{$_}) {
119 die "Have two functions with key: '$_'\n";
120 }
121 else {
122 $results{$_}= $result->{$_};
123 }
124 }
125 }
126 }
127
128 print_csv($out,\%results);
129
130
131
132 return 1;
133 }
134
135 sub determine_stats {
136 my ($fastqs,$num_bps) = @_;
137
138
139 my %results;
140 my $number = scalar @$fastqs;
141 my $status;
142
143 if ( $number ==1) {
144 $status='SE';
145 }
146 elsif ( $number ==2) {
147 $status='PE';
148 }
149 else {
150 $status='N/A';
151 }
152
153 $results{'pe_status'} = $status;
154
155 #determine total base pair
156
157 my $all_fastq = join (' ' , map { "\"$_\"" } @$fastqs);
158
159 #one liner from : http://onetipperday.blogspot.ca/2012/05/simple-way-to-get-reads-length.html with some modification by Philip Mabon
160 my $total=`cat $all_fastq | perl -ne '\$s=<>;<>;<>;chomp(\$s);print length(\$s)."\n";' | sort | uniq -c | perl -e 'while(my \$line=<>){chomp \$line; \$line =~s/^\\s+//;(\$l,\$n)=split/\\s+/,\$line; \$t+= \$l*\$n;} print "\$t\n"'`;
161 chomp $total;
162
163 $results{'total_bp'} = $total;
164
165 if ($total) {
166
167 $results{'coverage'} = sprintf "%.2f", ($total/$num_bps);
168
169 #reference name stuff
170 #my $name = basename($reference);
171 #$name =~ s/\.fasta//;
172
173 $results{'reference'} = $num_bps;
174
175
176 }
177
178 return \%results;
179
180 }
181
182
183 sub print_csv {
184 my ($out_fh,$results) = @_;
185 my %results = %{$results};
186
187 my @line;
188
189 #Name
190 my $name = $results{'filename'} || 'N/A';
191 $name =~ s/.fastq//;
192 push @line,$name;
193
194 #Indicate if PE,SE or multiple
195 my $status= $results{'pe_status'} || 'N/A';
196 push @line,$status;
197
198 #encoding of fastq file
199 push @line, $results{'Encoding'} || 'N/A';
200
201 #number of reads found
202 my $reads = $results{'Total Sequences'} || 'N/A';
203
204 push @line,$reads || 'N/A';
205
206 #number of Total Base Pairs
207 push @line, $results{'total_bp'} || 'N/A';
208
209 #sequence read length range
210 push @line, $results{'Sequence length'} || 'N/A';
211
212 #most abundant read length
213
214 my ($most_abundant) = sort {$results{'dist_length'}{$b}{'count'} <=> $results{'dist_length'}{$a}{'count'} } keys %{$results{'dist_length'}};
215 my ($most_count) = $results{'dist_length'}{$most_abundant}{'count'} || 'N/A';
216
217 push @line, $most_abundant;
218 push @line, $most_count;
219
220 #coverage against reference
221 push @line, $results{'coverage'} || 'N/A';
222 push @line, $results{'reference'} || 'N/A';
223
224
225 #duplicate level
226 if ( $results{'pe_status'} eq 'SE') {
227 push @line, $results{'duplicate_lvl'} || 'N/A';
228 }
229 elsif ( $results{'pe_status'} eq 'PE') {
230 push @line, $results{'Duplicate level R1'} || 'N/A';
231 push @line, $results{'Duplicate level R2'} || 'N/A';
232 }
233
234
235
236 #determine percentage of reads that are over-represented sequences
237 push @line, exists $results{'overre'} ? $results{'overre'} : 'N/A';
238
239 print $out_fh join("\t",@line) . "\n";
240
241 return;
242 }
243
244 sub print_header {
245 my ($out_fh,$fastqs) = @_;
246
247 my @headers;
248 if ( $fastqs==2) {
249 @headers = ('Name','SE/PE','Encoding' , '# of Reads', 'Total # Base Pairs', 'Sequence length range','Most abundant read length','# of reads for abundant','Estimated Coverage','Reference length','Duplicate % R1','Duplicate % R2','# of Overrepresented sequences');
250 }
251 else {
252 @headers = ('Name','SE/PE','Encoding' , '# of Reads', 'Total # Base Pairs', 'Sequence length range','Most abundant read length','# of reads for abundant','Estimated Coverage','Reference length','Duplicate %','# of Overrepresented sequences');
253 }
254
255 print $out_fh join("\t",@headers) . "\n";
256
257
258 return;
259 }
260
261
262
263 sub parse_FastQC {
264 my ($file) = @_;
265
266 my %results;
267
268 my %todo = (
269 'Per base sequence quality' => \&per_base_quality,
270 'Per sequence quality scores' => \&per_seq_quality,
271 'Per base sequence content' => \&per_seq_content,
272 'Per base GC content' => \&per_base_gc_content,
273 'Per sequence GC content' => \&per_seq_gc_content,
274 'Per base N content' => \&per_base_n_content,
275 'Sequence Length Distribution' => \&seq_length_dis,
276 'Sequence Duplication Levels' => \&seq_dupli_lvl,
277 'Overrepresented sequences' => \&overrepresented_seq,
278 'Kmer Content' => \&kmer_content,
279 'Basic Statistics' => \&basic_stats
280
281 );
282
283
284 open my $in , '<', $file;
285
286 #get header
287 my $version = <$in>;
288
289 #create functional code
290 my $next_sec = next_section($in);
291
292 my %sections;
293
294 while ( my ($sec_name,$fh_lines) = $next_sec->()) {
295 if (! ($sec_name || $fh_lines)) {
296 last;
297 }
298
299
300 #see if we are at a beginning of new section
301 if ($sec_name =~ /^>>(.*)\s+(warn|fail|pass)$/) {
302 my ($name,$status) = ($1,$2);
303 $sections{$name}= $status;
304
305 if ( exists $todo{$name}) {
306 my $result =$todo{$name}->($fh_lines);
307
308 if ( $result) {
309 #combine results together
310 foreach ( keys %$result) {
311 if ( exists $results{$_}) {
312 die "Have two functions with key: '$_'\n";
313 }
314 else {
315 $results{$_}= $result->{$_};
316 }
317 }
318 }
319 }
320 }
321 }
322
323
324 return \%results;
325 }
326
327
328 sub basic_stats {
329 my ($lines) = @_;
330 my $header = <$lines>;
331
332 my %stats;
333
334 while (<$lines>) {
335 chomp;
336 my @data = split/\t/;
337 my $value = pop @data;
338 if ( $value eq '>>END_MODULE') {
339 last;
340 }
341 my $key = join(' ',@data);
342 $stats{$key}=$value;
343 }
344
345 return \%stats;
346 }
347
348
349
350 sub per_base_quality {
351 my ($lines) = @_;
352 my %results;
353
354 return \%results;
355 }
356
357 sub per_seq_quality {
358 my ($lines) = @_;
359 my %results;
360
361 return \%results;
362 }
363
364 sub per_seq_content {
365 my ($lines) = @_;
366 my %results;
367
368 return \%results;
369 }
370 sub per_base_gc_content {
371 my ($lines) = @_;
372 my %results;
373
374 return \%results;
375 }
376
377 sub per_seq_gc_content {
378 my ($lines) = @_;
379 my %results;
380
381 return \%results;
382 }
383
384 sub per_base_n_content {
385 my ($lines) = @_;
386 my %results;
387
388 return \%results;
389 }
390
391 sub seq_length_dis {
392 my ($lines) = @_;
393 my %results;
394
395 my $header = <$lines>;
396 my %lengths;
397 while (<$lines>) {
398 chomp;
399 if ( $_ =~ /^>>/) {
400 next;
401 }
402
403 my ($key,$value) = split/\s+/;
404 if ( $key =~ /(\d+)-(\d+)/) {
405 $lengths{$1}{'count'} =$value;
406 $lengths{$1}{'key'} =$key;
407
408 }
409 else {
410 $lengths{$key}{'count'} =$value;
411 $lengths{$key}{'key'} =$key;
412 }
413
414 }
415
416
417 return {'dist_length' => \%lengths};
418 }
419 sub seq_dupli_lvl {
420 my ($lines) = @_;
421
422 my $perc_dupl = <$lines>;
423 my $perc;
424 if ( $perc_dupl =~ /^\#Total Duplicate Percentage\s+(.*)$/ ) {
425 $perc = $1;
426 }
427 elsif ($perc_dupl =~ /^\#Total Deduplicated Percentage\s+(.*)$/ ) {
428 $perc = $1;
429 #version 0.11.2 and above, it indicates as Deduplicate insetad of Duplicated
430 #we want to know % of Duplicate sequences instead
431 $perc = $perc - 100.00;
432 }
433
434 my $header = <$lines>;
435 #ignoring the results of the graph for now
436
437 $perc = sprintf "%.2f",$perc;
438
439 return {'duplicate_lvl' => $perc};
440 }
441
442 sub overrepresented_seq {
443 my ($lines) = @_;
444
445 my %results;
446
447 my $header = <$lines>;
448 my %seqs;
449 my $total;
450
451 while ( <$lines>) {
452 chomp;
453 if ( $_=~ />>END_MODULE/) {
454 last;
455 }
456 my @data = split/\s+/;
457
458 $seqs{$data[0]} =$data[2];
459 $total+=$data[2];
460
461 }
462
463 if ( ! $total) {
464 $results{'overre'}=0;
465
466 }
467 else {
468 $results{'overre'} = sprintf "%.2f",$total;
469 }
470
471
472
473 return \%results;
474 }
475
476 sub kmer_content {
477 my ($lines) = @_;
478 my %results;
479
480 return \%results;
481 }
482
483
484
485 sub next_section {
486 my ($in)=@_;
487 my $lines;
488
489 return sub {
490 local $/ = ">>END_MODULE\n";
491 $lines= <$in>;
492 my ($name,$sec);
493 {
494 if ($lines) {
495 local $/ = "\n";
496 open $sec ,'<',\$lines;
497 $name = <$sec>;
498 chomp $name;
499 }
500
501 }
502
503
504 return ($name,$sec);
505 }
506 }
507
508
509 sub get_parameters {
510 my ($out,$sample,@rawdatas,@fastq,$ref,$rawdataSE,$rawdataPE_R1,$rawdataPE_R2,$fastqSE,$fastqPE_R1,$fastqPE_R2,$galaxy,$num_bps);
511 #determine if our input are as sub arguments or getopt::long
512 if ( @_ && $_[0] eq __PACKAGE__ ) {
513 # Get command line options
514 GetOptions(
515 'o|out=s' => \$out,
516 'fastq_se=s' => \$fastqSE,
517 'fastq_pe_1=s' => \$fastqPE_R1,
518 'fastq_pe_2=s' => \$fastqPE_R2,
519 'g_rawdata_se=s' => \$rawdataSE,
520 'g_rawdata_pe_1=s' => \$rawdataPE_R1,
521 'g_rawdata_pe_2=s' => \$rawdataPE_R2,
522 'sample=s' => \$sample,
523 'ref=s' => \$ref,
524 'num_bps=s' => \$num_bps
525 );
526 }
527 else {
528 die "NYI\n";
529 #( $file, $out ) = @_;
530 }
531
532 if ( !$sample) {
533 $sample = "Unknown sample";
534 }
535
536 if ( $fastqSE) {
537 if ( ! (-e $fastqSE) ) {
538 print "ERROR: Was given a fastq SE file but could not find it: '$fastqSE'\n";
539 pod2usage( -verbose => 1 );
540 }
541 else {
542 push @fastq,$fastqSE;
543 }
544 if ( !$rawdataSE || !( -e $rawdataSE)) {
545 print "ERROR: Was not given or could not rawdata file: '$rawdataSE'\n";
546 pod2usage( -verbose => 1 );
547 }
548 else {
549 push @rawdatas,$rawdataSE;
550 }
551
552 }
553 elsif ( !$fastqPE_R1 || ! (-e $fastqPE_R1) || !$fastqPE_R2 || ! (-e $fastqPE_R2) ) {
554 print "ERROR: Was given a fastqPE R1 or R2 file but could not find it: '$fastqPE_R1' , '$fastqPE_R2'\n";
555 pod2usage( -verbose => 1 );
556
557 }
558 else {
559 push @fastq,$fastqPE_R1;
560 push @fastq,$fastqPE_R2;
561
562
563 for my $rawdata ( ($rawdataPE_R1,$rawdataPE_R2)) {
564 if (!$rawdata || !(-e $rawdata)) {
565 print "ERROR: Was not given or could not rawdata file: '$rawdata'\n";
566 pod2usage( -verbose => 1 );
567 }
568 else {
569 push @rawdatas,$rawdata;
570 }
571 }
572 }
573
574 if ( $ref && !( -e $ref ) ) {
575 print "ERROR: Was given a reference file but could not find it: '$ref'\n";
576 pod2usage( -verbose => 1 );
577 }
578
579 if ($ref && $num_bps)
580 {
581 print "ERROR: Was given both a reference file and number of base pairs. One or the other please.";
582 pod2usage( -verbose => 1 );
583 }
584
585 if ($ref)
586 {
587 $num_bps = 0;
588 my $in = Bio::SeqIO->new(-format=>'fasta' , -file => $ref);
589 while ( my $seq = $in->next_seq()) {
590 $num_bps += $seq->length();
591 }
592
593 if ($num_bps == 0)
594 {
595 print "ERROR: number of base pairs read from reference file is 0. Please check validity of reference file.";
596 pod2usage( -verbose=> 1 );
597 }
598 }
599
600 $out = _set_out_fh($out);
601
602 return (\@rawdatas,\@fastq,$num_bps,$sample,$out);
603 }
604
605
606
607 sub _set_out_fh {
608 my ($output) = @_;
609 my $out_fh;
610
611 if ( defined $output && ref $output && ref $output eq 'GLOB' ) {
612 $out_fh = $output;
613 }
614 elsif ( defined $output ) {
615 open( $out_fh, '>', $output );
616 }
617 else {
618 $out_fh = \*STDOUT;
619 }
620
621 return $out_fh;
622 }
623
624
625
626 1;
627
628 =head1 NAME
629
630 nml_fastqc_stats.pl.pl - (Galaxy only script) Parse fastqc runs into a csv summary report
631
632
633 =head1 SYNOPSIS
634
635 nml_fastqc_stats.pl.pl -z fastqc.txt
636 nml_fastqc_stats.pl.pl -z fastqc.txt -o results.csv --ref NC_33333.fa
637 nml_fastqc_stats.pl.pl -z fastqc.txt -o results.csv --ref NC_33333.fa --fastq ya_R1.fastq --fatq ya_R2.fastq
638
639 =head1 OPTIONS
640
641 =over
642
643
644 =item B<-z>
645
646 FastQC txt rawdata file
647
648
649 =item B<-o> B<--out>
650
651 Output csv file
652
653 =item B<--fastq>
654
655 Path to fastq file(s) . Used for providing total base pairs and coverage information
656
657
658 =item B<--ref>
659
660 Path to reference file. Needed for providing coverage information
661
662 =back
663
664 =head1 DESCRIPTION
665
666
667
668
669 =cut