Mercurial > repos > nml > seqtk_nml
view seqtk_nml.pl @ 1:f49992c79fe4 draft default tip
planemo upload for repository https://github.com/phac-nml/snvphyl-galaxy commit 969557932bff35913d93068d16facb8da4d64123
author | nml |
---|---|
date | Thu, 02 Nov 2017 14:09:07 -0400 |
parents | e1867440ed36 |
children |
line wrap: on
line source
#!/usr/bin/env perl package seqtk_nml; use warnings; use strict; use Bio::SeqIO; use Getopt::Long; use Pod::Usage; use File::Copy; __PACKAGE__->run unless caller; my $rv; sub get_parameters { my ($fastaref, $type, $coverage, $length,$log); my ($for,$rev,$out_for,$out_rev); #determine if our input are as sub arguments or getopt::long if ( @_ && $_[0] eq __PACKAGE__ ) { Getopt::Long::Configure('bundling'); GetOptions( 'ref=s' => \$fastaref, 'type=s' => \$type, 'forward=s' => \$for, 'reverse=s' => \$rev, 'out_forward=s' => \$out_for, 'out_reverse=s' => \$out_rev, 'log=s'=> \$log, 'cov=s' => \$coverage ); } if ( !$for || !( -e $for ) ) { print "ERROR: Was not given or could not find fastq file: '$for'\n"; pod2usage( -verbose => 1 ); } if ( !$out_for ) { print "ERROR: Was not given output file path for fastq\n"; pod2usage( -verbose => 1 ); } if ( $type eq 'paired') { if ( !$rev || !( -e $rev ) ) { print "ERROR: Was not given or could not find reverse fastq file: '$rev'\n"; pod2usage( -verbose => 1 ); } if ( !$out_rev ) { print "ERROR: Was not given output file path for reverse fastq\n"; pod2usage( -verbose => 1 ); } } if ( !$coverage ) { print "ERROR: Was not given a coverage number\n"; pod2usage( -verbose => 1 ); } if ( $coverage <=0 ) { print "ERROR: Was given a coverage less than 0\n"; pod2usage( -verbose => 1 ); } if ( !$log ) { print "ERROR: Was not given a log file\n"; pod2usage( -verbose => 1 ); } return ($fastaref, $type, $coverage, $length, $log,$for,$rev,$out_for,$out_rev); } sub run { my ($fastaref, $type, $coverage, $length, $log,$for,$rev,$out_for,$out_rev) = get_parameters(@_); my $subsample_size; #open log fh here open my $log_fh,">" ,"$log"; my @in_fastqs; my @out_fastqs; if ($type eq "single"){ $in_fastqs[0] = $for; $out_fastqs[0] = $out_for; }elsif ($type eq 'paired' ) { $in_fastqs[0] = $for; $in_fastqs[1] = $rev; $out_fastqs[0] = $out_for; $out_fastqs[1] = $out_rev; } else { die "Given unknown read type of '$type'"; } #get total read lengths from all fastq files given my $total= get_total_length(@in_fastqs); if (!($coverage)){ $coverage = 50; } my $seq_in = Bio::SeqIO->new( -format => 'fasta', -file => $fastaref, ); while ( my $seq = $seq_in->next_seq()) { $length += $seq->length(); } print $log_fh "Downsampling to coverage of: $coverage\n"; print $log_fh "Total number of Basepairs: $total\n"; print $log_fh "Length of Reference: $length\n"; my $rawcoverage = $total/$length; printf $log_fh "Raw Coverage: %.3f\n",$rawcoverage; if($rawcoverage > $coverage){ #need to downsample #calculate $subsample_size $subsample_size = $coverage/$rawcoverage; printf $log_fh "subsample: %.3f",$subsample_size; foreach my $fastq (@in_fastqs){ my $out = shift @out_fastqs; #seed always set to 42 for reproducibility my $seqCommand = "seqtk sample -s42 $fastq $subsample_size > $out"; $rv = system($seqCommand); #need to bit shift 8 bit because seqtk exit code for some reason are greater then standard 0-255 values that most unix application expect die "Error when running '$seqCommand' command" if $rv >>8; } } else { #no sampling needed, just copy the fastq's to the output print "Subsampling not required\n"; foreach my $fastq (@in_fastqs){ my $out = shift @out_fastqs; copy($fastq,$out) || die "Not able to copy '$fastq' to '$out' with error $!"; } } } sub get_total_length { my (@files) = @_; my $total; foreach my $fastq( @files) { open my $in, "<",$fastq || die "Could not open file '$fastq'"; #skip first 3 lines for ( 0..2) { my $line = <$in>; } while ( <$in>) { chomp; $total+=length($_); #skip first 3 lines for ( 0..2) { my $line = <$in>; } } close $in; } return $total; } 1; =head1 NAME seqtk_nml.pl - Down sample fastq(s) if raw coverage is above user provided level =head1 SYNOPSIS seqtk_nml.pl --ref reference.fasta --forward first_R1.fastq --reverse --reverse_R2.fastq --out_forward answer_R1.fastq --out_reverse answer_R2.fastq --log log-file =head1 OPTIONS =over =item =item B<--ref> Reference fasta file that we getting the expected length [Required] =item B<--cov> Coverage desired i.e 50.0 =item B<--forward> Forward fastq read file. [Required] =item B<--reverse> Reverse fastq read file. Can be optional =item B<--out_forward> Downsampled forward fastq read file. [Required] =item B<--out_reverse> Downsampled reverse fastq read file. Can be optional =item B<--log> Log file that indicate what has happen. [Required] =item B<--type> Indicate to application if we are receiving one or two fastq files [Required] ['paired','single'] =back =head1 DESCRIPTION Downsample fastq(s) reads based on the raw coverage from reference fasta file. Needed when we have too much data to run correctly in downstream analyses tools. i.e spades , snvphyl , etc.. =cut =back =head1 SYNOPSIS =head1 DESCRIPTION =cut