Mercurial > repos > nml > fastqc_stats

diff fastqc_stats.pl @ 0:2c74c5c70520 draft default tip
planemo upload for repository https://github.com/phac-nml/galaxy_tools commit d5b7cb71616c0ac20e02ff8cb8c147b9d4a31691
author: nml
date: Wed, 11 Oct 2017 16:22:08 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastqc_stats.pl	Wed Oct 11 16:22:08 2017 -0400
@@ -0,0 +1,669 @@
+#!/usr/bin/env perl
+package nml_fastqc_stats;
+use strict;
+use warnings;
+use Pod::Usage;
+use Getopt::Long;
+use autodie;
+use File::Basename;
+use Bio::SeqIO;
+use File::Temp qw/ tempdir /;
+__PACKAGE__->run unless caller;
+
+
+
+sub run {
+    #grabbing arguments either from command line or from another module/script
+    my ( $rawdatas, $fastqs,$num_bps,$sample,$out) = get_parameters(@_);
+
+    print_header($out,scalar @$fastqs);
+    my @results;
+    foreach my $rawdata ( @$rawdatas) {
+        my %results = %{ parse_FastQC($rawdata) } ;
+        push @results,\%results;
+    }
+
+
+    my %results;
+    
+    
+    if ( scalar @$rawdatas ==1  ) {
+        %results = % { $results[0]};
+        
+    }
+    else {
+        #need to combine the info into a SINGLE results file
+
+        my ($r1,$r2) = ($results[0],$results[1]);
+
+        
+        foreach my $key( keys %$r1) {
+            if ( exists $r2->{$key}) {
+                my ($value1,$value2) = ($r1->{$key},$r2->{$key});
+
+                #check to see if data that could/should be different
+                my %diff_ignore = ('Filename'=>1);
+                my %combine = ('duplicate_lvl'=>1,'dist_length'=>1,'overre'=>1,'Total Sequences'=>1,'Sequence length'=>1,'%GC'=>1);
+                if (exists $combine{$key} ) {
+                    if ( $key eq 'duplicate_lvl') {
+                        $results{'Duplicate level R1'} = $value1;
+                        $results{'Duplicate level R2'} = $value2;
+                    }
+                    elsif ( $key eq 'dist_length') {
+                        my %dist=%{$value1};
+                        foreach my $key( keys %$value2) {
+                            #adding the number of reads to either existing range or new
+                            if ( exists $dist{$key}) {
+                                $dist{$key}{'count'}+=$value2->{$key}{'count'};
+                                
+                            }
+                            else {
+                                $dist{$key} = $value2->{$key};
+                            }
+                        }
+
+                        $results{$key} = \%dist;
+                        
+                    }
+                    elsif ( $key eq 'Total Sequences') {
+                        $results{$key}= $value1 + $value2;
+                    }
+                    elsif ( $key eq 'overre') {
+                        $results{$key}=$value1 + $value2;
+                    }
+                    elsif ( $key eq '%GC') {
+                        $results{$key}=($value1 + $value2)/2;
+                    }
+                    elsif ( $key eq 'Sequence length') {
+                        if ( $value1 eq $value2) {
+                            $results{$key}= $value1;
+                        }
+                        else {
+                            #figure out the range by splitting all values into array then sort
+                            my @number = split /-/,$value1;
+                            map { push @number,$_ }  split'-',$value2;
+                            @number = sort {$a <=> $b } @number;
+                            $results{$key} = $number[0] . '-' . $number[$#number];
+                        }
+                    }
+                    
+                }
+                #both the same we are good
+                elsif ( $value1 eq $value2) {
+                    $results{$key}=$value1;
+                }
+                elsif ( ! exists $diff_ignore{$key}) {
+                    die "Have different value between fastqc files for '$key'\n";
+                }
+
+                 
+                
+            }
+            else {
+                die "Cannot find key : '$key' in reverse fastqc file\n";
+            }
+        }
+        
+        
+    }
+
+    #add sample name given by user
+    $results{'filename'} = $sample;
+    
+    #perform coverage, total base pair and SE or PE
+    if ( $fastqs) {
+        my $result = determine_stats($fastqs,$num_bps);
+        if ( $result) {
+            foreach ( keys %$result) {
+                if ( exists $results{$_}) {
+                    die "Have two functions with key: '$_'\n";
+                }
+                else {
+                    $results{$_}= $result->{$_};
+                    }
+            }
+        }
+    }
+
+    print_csv($out,\%results);
+
+    
+    
+    return 1;
+}
+
+sub determine_stats {
+    my ($fastqs,$num_bps) = @_;
+    
+    
+    my %results;
+    my $number = scalar @$fastqs;
+    my $status;
+    
+    if ( $number ==1) {
+        $status='SE';
+    }
+    elsif ( $number ==2) {
+        $status='PE';
+    }
+    else {
+        $status='N/A';
+    }
+
+    $results{'pe_status'} = $status;
+
+    #determine total base pair
+
+    my $all_fastq = join (' ' , map { "\"$_\"" } @$fastqs);
+
+    #one liner from : http://onetipperday.blogspot.ca/2012/05/simple-way-to-get-reads-length.html with some modification by Philip Mabon
+    my $total=`cat $all_fastq | perl -ne '\$s=<>;<>;<>;chomp(\$s);print length(\$s)."\n";' | sort | uniq -c | perl -e 'while(my \$line=<>){chomp \$line; \$line =~s/^\\s+//;(\$l,\$n)=split/\\s+/,\$line; \$t+= \$l*\$n;} print "\$t\n"'`;
+    chomp $total;
+    
+    $results{'total_bp'} = $total;
+    
+    if ($total) {
+
+        $results{'coverage'} = sprintf "%.2f", ($total/$num_bps);
+       
+	#reference name stuff
+	#my $name = basename($reference);
+        #$name =~ s/\.fasta//;
+        
+        $results{'reference'} = $num_bps;
+        
+        
+    }
+    
+    return \%results;
+    
+}
+
+
+sub print_csv {
+    my ($out_fh,$results) = @_;
+    my %results = %{$results};
+
+    my @line;
+    
+    #Name
+    my $name = $results{'filename'} || 'N/A';
+    $name =~ s/.fastq//;
+    push @line,$name;
+
+    #Indicate if PE,SE or multiple
+    my $status= $results{'pe_status'} || 'N/A';
+    push @line,$status;
+
+    #encoding of fastq file
+    push @line, $results{'Encoding'} || 'N/A';
+    
+    #number of reads found
+    my $reads = $results{'Total Sequences'} || 'N/A';
+
+    push @line,$reads || 'N/A';
+
+    #number of Total Base Pairs
+    push @line, $results{'total_bp'} || 'N/A';
+
+    #sequence read length range
+    push @line, $results{'Sequence length'} || 'N/A';
+    
+    #most abundant read length
+
+    my ($most_abundant) = sort {$results{'dist_length'}{$b}{'count'} <=> $results{'dist_length'}{$a}{'count'} } keys %{$results{'dist_length'}};
+    my ($most_count) = $results{'dist_length'}{$most_abundant}{'count'} || 'N/A';
+    
+    push @line, $most_abundant;
+    push @line, $most_count;
+
+    #coverage against reference
+    push @line, $results{'coverage'} || 'N/A';
+    push @line, $results{'reference'} || 'N/A';
+
+    
+    #duplicate level
+    if ( $results{'pe_status'} eq 'SE') {
+        push @line, $results{'duplicate_lvl'} || 'N/A';
+    }
+    elsif (  $results{'pe_status'} eq 'PE') {
+        push @line, $results{'Duplicate level R1'} || 'N/A';
+        push @line, $results{'Duplicate level R2'} || 'N/A';
+    }
+
+
+
+    #determine percentage of reads that are over-represented sequences
+    push @line, exists $results{'overre'} ? $results{'overre'} : 'N/A';
+    
+    print $out_fh join("\t",@line) . "\n";
+    
+    return;
+}
+    
+sub print_header {
+    my ($out_fh,$fastqs) = @_;
+
+    my @headers;
+    if ( $fastqs==2) {
+        @headers = ('Name','SE/PE','Encoding' , '# of Reads', 'Total # Base Pairs', 'Sequence length range','Most abundant read length','# of reads for abundant','Estimated Coverage','Reference length','Duplicate % R1','Duplicate % R2','# of Overrepresented sequences');  
+    }
+    else {
+        @headers = ('Name','SE/PE','Encoding' , '# of Reads', 'Total # Base Pairs', 'Sequence length range','Most abundant read length','# of reads for abundant','Estimated Coverage','Reference length','Duplicate %','# of Overrepresented sequences');  
+    } 
+
+    print $out_fh join("\t",@headers) . "\n";
+    
+
+    return;
+}
+    
+
+
+sub parse_FastQC {
+    my ($file) = @_;
+
+    my %results;
+    
+    my %todo = (
+        'Per base sequence quality' => \&per_base_quality,
+        'Per sequence quality scores' => \&per_seq_quality,
+        'Per base sequence content' => \&per_seq_content,
+        'Per base GC content' => \&per_base_gc_content,
+        'Per sequence GC content' => \&per_seq_gc_content,
+        'Per base N content' => \&per_base_n_content,
+        'Sequence Length Distribution' => \&seq_length_dis,
+        'Sequence Duplication Levels' => \&seq_dupli_lvl,
+        'Overrepresented sequences' => \&overrepresented_seq,
+        'Kmer Content' => \&kmer_content,
+        'Basic Statistics' => \&basic_stats
+   
+    );
+    
+    
+    open my $in , '<', $file;
+
+    #get header
+    my $version = <$in>;
+
+    #create functional code
+    my $next_sec = next_section($in);
+    
+    my %sections;
+    
+    while ( my ($sec_name,$fh_lines) = $next_sec->()) {
+        if (! ($sec_name || $fh_lines)) {
+            last;
+        }
+        
+
+        #see if we are at a beginning of new section
+        if ($sec_name =~ /^>>(.*)\s+(warn|fail|pass)$/) {
+            my ($name,$status) = ($1,$2);
+            $sections{$name}= $status;
+            
+            if ( exists $todo{$name}) {
+                my $result =$todo{$name}->($fh_lines);
+            
+                if ( $result) {
+                    #combine results together
+                    foreach ( keys %$result) {
+                        if ( exists $results{$_}) {
+                            die "Have two functions with key: '$_'\n";
+                        }
+                        else {
+                            $results{$_}= $result->{$_};
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    
+    return \%results;
+}
+
+
+sub basic_stats {
+    my ($lines) = @_;
+    my $header = <$lines>;
+
+    my %stats;
+    
+    while (<$lines>) {
+        chomp;
+        my @data = split/\t/;
+        my $value = pop @data;
+        if ( $value eq '>>END_MODULE') {
+            last;
+        }
+        my $key = join(' ',@data);
+        $stats{$key}=$value;
+    }
+    
+    return \%stats;
+}
+
+
+
+sub per_base_quality {
+	my ($lines) = @_;
+	my %results;
+
+	return \%results;
+}
+
+sub per_seq_quality {
+	my ($lines) = @_;
+	my %results;
+
+	return \%results;
+}
+
+sub per_seq_content {
+	my ($lines) = @_;
+	my %results;
+
+	return \%results;
+}
+sub per_base_gc_content {
+	my ($lines) = @_;
+	my %results;
+
+	return \%results;
+}
+
+sub per_seq_gc_content {
+	my ($lines) = @_;
+	my %results;
+
+	return \%results;
+}
+
+sub per_base_n_content {
+	my ($lines) = @_;
+	my %results;
+
+	return \%results;
+}
+
+sub seq_length_dis {
+    my ($lines) = @_;
+    my %results;
+    
+    my $header = <$lines>;
+    my %lengths;
+    while (<$lines>) {
+        chomp;
+        if ( $_ =~ /^>>/) {
+            next;
+        }
+        
+        my ($key,$value) = split/\s+/;
+        if ( $key =~ /(\d+)-(\d+)/) {
+            $lengths{$1}{'count'} =$value;
+            $lengths{$1}{'key'} =$key;
+            
+        }
+        else {
+            $lengths{$key}{'count'} =$value;
+                $lengths{$key}{'key'} =$key;
+        }
+        
+    }
+        
+    
+    return {'dist_length' => \%lengths};
+}
+sub seq_dupli_lvl {
+    my ($lines) = @_;
+
+    my $perc_dupl = <$lines>;
+    my $perc;
+    if ( $perc_dupl =~ /^\#Total Duplicate Percentage\s+(.*)$/ ) {
+        $perc = $1;
+    }
+    elsif ($perc_dupl =~ /^\#Total Deduplicated Percentage\s+(.*)$/ ) {
+        $perc = $1;
+        #version 0.11.2 and above, it indicates as Deduplicate insetad of Duplicated
+        #we want to know % of Duplicate sequences instead
+        $perc = $perc - 100.00;
+    }
+
+    my $header = <$lines>;
+    #ignoring the results of the graph for now
+
+    $perc = sprintf "%.2f",$perc;
+
+    return {'duplicate_lvl' => $perc};
+}
+
+sub overrepresented_seq {
+    my ($lines) = @_;
+    
+    my %results;
+    
+    my $header = <$lines>;
+    my %seqs;
+    my $total;
+    
+    while ( <$lines>) {
+        chomp;
+        if ( $_=~ />>END_MODULE/) {
+            last;
+        }
+        my @data = split/\s+/;
+        
+        $seqs{$data[0]} =$data[2];
+        $total+=$data[2];
+    
+    }
+
+    if ( ! $total) {
+        $results{'overre'}=0;
+        
+    }
+    else {
+        $results{'overre'} = sprintf "%.2f",$total;        
+    }
+
+    
+
+    return \%results;
+}
+
+sub kmer_content {
+	my ($lines) = @_;
+	my %results;
+
+	return \%results;
+}
+
+
+
+sub next_section {
+    my ($in)=@_;
+    my $lines;
+    
+    return sub {
+        local $/ = ">>END_MODULE\n";    
+        $lines= <$in>;
+        my ($name,$sec);
+        {
+            if ($lines) {
+                local $/ = "\n";
+                open $sec ,'<',\$lines;
+                $name = <$sec>;
+                chomp $name;
+            }
+            
+        }
+        
+        
+        return ($name,$sec);
+    }
+}
+
+
+sub get_parameters {
+    my ($out,$sample,@rawdatas,@fastq,$ref,$rawdataSE,$rawdataPE_R1,$rawdataPE_R2,$fastqSE,$fastqPE_R1,$fastqPE_R2,$galaxy,$num_bps);
+    #determine if our input are as sub arguments or getopt::long
+    if ( @_ && $_[0] eq __PACKAGE__ ) {
+        # Get command line options
+        GetOptions(
+            'o|out=s'   => \$out,
+            'fastq_se=s' => \$fastqSE,
+            'fastq_pe_1=s' => \$fastqPE_R1,
+            'fastq_pe_2=s' => \$fastqPE_R2,
+            'g_rawdata_se=s' => \$rawdataSE,
+            'g_rawdata_pe_1=s' => \$rawdataPE_R1,
+            'g_rawdata_pe_2=s' => \$rawdataPE_R2,
+            'sample=s' => \$sample,
+            'ref=s' => \$ref,
+	    'num_bps=s' => \$num_bps
+        );
+    }
+    else {
+        die "NYI\n";
+        #( $file, $out ) = @_;
+    }
+
+    if ( !$sample) {
+        $sample = "Unknown sample";
+    }
+    
+    if ( $fastqSE) {
+        if ( ! (-e $fastqSE) ) {
+            print "ERROR: Was given a fastq SE file but could not find it: '$fastqSE'\n";
+            pod2usage( -verbose => 1 );
+        }
+        else {
+            push @fastq,$fastqSE;
+        }
+        if ( !$rawdataSE || !( -e $rawdataSE)) {
+            print "ERROR: Was not given or could not rawdata file: '$rawdataSE'\n";
+            pod2usage( -verbose => 1 );
+        }
+        else {
+            push @rawdatas,$rawdataSE;
+        }
+        
+    }
+    elsif (  !$fastqPE_R1 || ! (-e $fastqPE_R1)  || !$fastqPE_R2 || ! (-e $fastqPE_R2) ) {
+        print "ERROR: Was given a fastqPE R1 or R2 file but could not find it: '$fastqPE_R1' , '$fastqPE_R2'\n";
+        pod2usage( -verbose => 1 );
+
+    }
+    else {
+        push @fastq,$fastqPE_R1;
+        push @fastq,$fastqPE_R2;
+
+        
+        for my $rawdata ( ($rawdataPE_R1,$rawdataPE_R2)) {
+            if (!$rawdata || !(-e $rawdata)) {
+                print "ERROR: Was not given or could not rawdata file: '$rawdata'\n";
+                pod2usage( -verbose => 1 );                
+            }
+            else {
+                push @rawdatas,$rawdata;
+            }
+        }        
+    }
+
+    if ( $ref && !( -e $ref ) ) {
+        print "ERROR: Was given a reference file but could not find it: '$ref'\n";
+        pod2usage( -verbose => 1 );
+    }
+
+    if ($ref && $num_bps)
+    {
+	print "ERROR: Was given both a reference file and number of base pairs. One or the other please.";
+	pod2usage( -verbose => 1 );
+    }
+
+    if ($ref)
+    {
+	$num_bps = 0;
+        my $in = Bio::SeqIO->new(-format=>'fasta' , -file => $ref);
+        while ( my $seq = $in->next_seq()) {
+            $num_bps += $seq->length();
+        }
+
+	if ($num_bps == 0)
+	{
+	    print "ERROR: number of base pairs read from reference file is 0. Please check validity of reference file.";
+	    pod2usage( -verbose=> 1 );
+	}
+    }
+
+    $out = _set_out_fh($out);
+    
+    return (\@rawdatas,\@fastq,$num_bps,$sample,$out);
+}
+
+
+
+sub _set_out_fh {
+    my ($output) = @_;
+    my $out_fh;
+    
+    if ( defined $output && ref $output && ref $output eq 'GLOB' ) {
+        $out_fh = $output;
+    }
+    elsif ( defined $output ) {
+        open( $out_fh, '>', $output );
+    }
+    else {
+        $out_fh = \*STDOUT;
+    }
+
+    return $out_fh;
+}
+
+
+    
+1;
+
+=head1 NAME
+
+nml_fastqc_stats.pl.pl - (Galaxy only script) Parse fastqc runs into a csv summary report
+
+
+=head1 SYNOPSIS
+
+     nml_fastqc_stats.pl.pl -z fastqc.txt
+     nml_fastqc_stats.pl.pl -z fastqc.txt -o results.csv --ref NC_33333.fa
+     nml_fastqc_stats.pl.pl -z fastqc.txt -o results.csv --ref NC_33333.fa --fastq ya_R1.fastq --fatq ya_R2.fastq
+
+=head1 OPTIONS
+
+=over
+
+
+=item B<-z>
+
+FastQC txt rawdata file
+
+
+=item B<-o> B<--out>
+
+Output csv file
+
+=item  B<--fastq>
+
+Path to fastq file(s) . Used for providing total base pairs and coverage information
+
+
+=item  B<--ref>
+
+Path to reference file. Needed for providing coverage information
+
+=back
+
+=head1 DESCRIPTION
+
+
+
+
+=cut