Mercurial > repos > nml > seqtk_nml
diff seqtk_nml.pl @ 0:e1867440ed36 draft
planemo upload for repository https://github.com/phac-nml/snvphyl-galaxy commit 008f4667b70be22e9ddf496738b3f74bb942ed28
author | nml |
---|---|
date | Tue, 19 Sep 2017 16:37:42 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/seqtk_nml.pl Tue Sep 19 16:37:42 2017 -0400 @@ -0,0 +1,268 @@ +#!/usr/bin/env perl +package seqtk_nml; +use warnings; +use strict; +use Bio::SeqIO; +use Getopt::Long; +use Pod::Usage; +use File::Copy; +__PACKAGE__->run unless caller; + + +my $rv; + +sub get_parameters { + my ($fastaref, $type, $coverage, $length,$log); + my ($for,$rev,$out_for,$out_rev); + #determine if our input are as sub arguments or getopt::long + if ( @_ && $_[0] eq __PACKAGE__ ) { + Getopt::Long::Configure('bundling'); + GetOptions( + 'ref=s' => \$fastaref, + 'type=s' => \$type, + 'forward=s' => \$for, + 'reverse=s' => \$rev, + 'out_forward=s' => \$out_for, + 'out_reverse=s' => \$out_rev, + 'log=s'=> \$log, + 'cov=s' => \$coverage + ); + } + + if ( !$for || !( -e $for ) ) { + print "ERROR: Was not given or could not find fastq file: '$for'\n"; + pod2usage( -verbose => 1 ); + } + + if ( !$out_for ) { + print "ERROR: Was not given output file path for fastq\n"; + pod2usage( -verbose => 1 ); + } + + if ( $type eq 'paired') { + if ( !$rev || !( -e $rev ) ) { + print "ERROR: Was not given or could not find reverse fastq file: '$rev'\n"; + pod2usage( -verbose => 1 ); + } + + if ( !$out_rev ) { + print "ERROR: Was not given output file path for reverse fastq\n"; + pod2usage( -verbose => 1 ); + } + } + + if ( !$coverage ) { + print "ERROR: Was not given a coverage number\n"; + pod2usage( -verbose => 1 ); + } + + if ( $coverage <=0 ) { + print "ERROR: Was given a coverage less than 0\n"; + pod2usage( -verbose => 1 ); + } + + + + if ( !$log ) { + print "ERROR: Was not given a log file\n"; + pod2usage( -verbose => 1 ); + } + + + return ($fastaref, $type, $coverage, $length, $log,$for,$rev,$out_for,$out_rev); +} + + +sub run { + my ($fastaref, $type, $coverage, $length, $log,$for,$rev,$out_for,$out_rev) = get_parameters(@_); + my $subsample_size; + + + #open log fh here + open my $log_fh,">" ,"$log"; + + + my @in_fastqs; + my @out_fastqs; + + + if ($type eq "single"){ + $in_fastqs[0] = $for; + $out_fastqs[0] = $out_for; + }elsif ($type eq 'paired' ) { + $in_fastqs[0] = $for; + $in_fastqs[1] = $rev; + $out_fastqs[0] = $out_for; + $out_fastqs[1] = $out_rev; + } + else { + die "Given unknown read type of '$type'"; + } + + #get total read lengths from all fastq files given + my $total= get_total_length(@in_fastqs); + + + if (!($coverage)){ + $coverage = 50; + } + + my $seq_in = Bio::SeqIO->new( + -format => 'fasta', + -file => $fastaref, + ); + + while ( my $seq = $seq_in->next_seq()) { + $length += $seq->length(); + } + + + print $log_fh "Downsampling to coverage of: $coverage\n"; + print $log_fh "Total number of Basepairs: $total\n"; + print $log_fh "Length of Reference: $length\n"; + + my $rawcoverage = $total/$length; + printf $log_fh "Raw Coverage: %.3f\n",$rawcoverage; + + if($rawcoverage > $coverage){ + #need to downsample + #calculate $subsample_size + $subsample_size = $coverage/$rawcoverage; + printf $log_fh "subsample: %.3f",$subsample_size; + + foreach my $fastq (@in_fastqs){ + my $out = shift @out_fastqs; + #seed always set to 42 for reproducibility + my $seqCommand = "seqtk sample -s42 $fastq $subsample_size > $out"; + $rv = system($seqCommand); + #need to bit shift 8 bit because seqtk exit code for some reason are greater then standard 0-255 values that most unix application expect + die "Error when running '$seqCommand' command" if $rv >>8; + } + } else { + #no sampling needed, just copy the fastq's to the output + print "Subsampling not required\n"; + foreach my $fastq (@in_fastqs){ + my $out = shift @out_fastqs; + copy($fastq,$out) || die "Not able to copy '$fastq' to '$out' with error $!"; + } + } + + + +} + + +sub get_total_length { + my (@files) = @_; + my $total; + foreach my $fastq( @files) { + + open my $in, "<",$fastq || die "Could not open file '$fastq'"; + #skip first 3 lines + for ( 0..2) { + my $line = <$in>; + } + + while ( <$in>) { + chomp; + $total+=length($_); + #skip first 3 lines + for ( 0..2) { + my $line = <$in>; + } + } + close $in; + + + } + + return $total; + +} + +1; + + +=head1 NAME + + + +seqtk_nml.pl - Down sample fastq(s) if raw coverage is above user provided level + + +=head1 SYNOPSIS + + seqtk_nml.pl --ref reference.fasta --forward first_R1.fastq --reverse --reverse_R2.fastq --out_forward answer_R1.fastq --out_reverse answer_R2.fastq --log log-file + + +=head1 OPTIONS + +=over + +=item + +=item B<--ref> + +Reference fasta file that we getting the expected length [Required] + + +=item B<--cov> + +Coverage desired i.e 50.0 + + +=item B<--forward> + +Forward fastq read file. [Required] + +=item B<--reverse> + +Reverse fastq read file. Can be optional + + +=item B<--out_forward> + +Downsampled forward fastq read file. [Required] + +=item B<--out_reverse> + +Downsampled reverse fastq read file. Can be optional + + +=item B<--log> + +Log file that indicate what has happen. [Required] + +=item B<--type> + +Indicate to application if we are receiving one or two fastq files [Required] ['paired','single'] + + + +=back + +=head1 DESCRIPTION + + +Downsample fastq(s) reads based on the raw coverage from reference fasta file. Needed when we have too much data to run correctly in downstream analyses tools. i.e spades , snvphyl , etc.. + + +=cut + + + + + +=back + + +=head1 SYNOPSIS + + + +=head1 DESCRIPTION + + + +=cut +