comparison seqtk_nml.pl @ 0:e1867440ed36 draft

planemo upload for repository https://github.com/phac-nml/snvphyl-galaxy commit 008f4667b70be22e9ddf496738b3f74bb942ed28
author nml
date Tue, 19 Sep 2017 16:37:42 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e1867440ed36
1 #!/usr/bin/env perl
2 package seqtk_nml;
3 use warnings;
4 use strict;
5 use Bio::SeqIO;
6 use Getopt::Long;
7 use Pod::Usage;
8 use File::Copy;
9 __PACKAGE__->run unless caller;
10
11
12 my $rv;
13
14 sub get_parameters {
15 my ($fastaref, $type, $coverage, $length,$log);
16 my ($for,$rev,$out_for,$out_rev);
17 #determine if our input are as sub arguments or getopt::long
18 if ( @_ && $_[0] eq __PACKAGE__ ) {
19 Getopt::Long::Configure('bundling');
20 GetOptions(
21 'ref=s' => \$fastaref,
22 'type=s' => \$type,
23 'forward=s' => \$for,
24 'reverse=s' => \$rev,
25 'out_forward=s' => \$out_for,
26 'out_reverse=s' => \$out_rev,
27 'log=s'=> \$log,
28 'cov=s' => \$coverage
29 );
30 }
31
32 if ( !$for || !( -e $for ) ) {
33 print "ERROR: Was not given or could not find fastq file: '$for'\n";
34 pod2usage( -verbose => 1 );
35 }
36
37 if ( !$out_for ) {
38 print "ERROR: Was not given output file path for fastq\n";
39 pod2usage( -verbose => 1 );
40 }
41
42 if ( $type eq 'paired') {
43 if ( !$rev || !( -e $rev ) ) {
44 print "ERROR: Was not given or could not find reverse fastq file: '$rev'\n";
45 pod2usage( -verbose => 1 );
46 }
47
48 if ( !$out_rev ) {
49 print "ERROR: Was not given output file path for reverse fastq\n";
50 pod2usage( -verbose => 1 );
51 }
52 }
53
54 if ( !$coverage ) {
55 print "ERROR: Was not given a coverage number\n";
56 pod2usage( -verbose => 1 );
57 }
58
59 if ( $coverage <=0 ) {
60 print "ERROR: Was given a coverage less than 0\n";
61 pod2usage( -verbose => 1 );
62 }
63
64
65
66 if ( !$log ) {
67 print "ERROR: Was not given a log file\n";
68 pod2usage( -verbose => 1 );
69 }
70
71
72 return ($fastaref, $type, $coverage, $length, $log,$for,$rev,$out_for,$out_rev);
73 }
74
75
76 sub run {
77 my ($fastaref, $type, $coverage, $length, $log,$for,$rev,$out_for,$out_rev) = get_parameters(@_);
78 my $subsample_size;
79
80
81 #open log fh here
82 open my $log_fh,">" ,"$log";
83
84
85 my @in_fastqs;
86 my @out_fastqs;
87
88
89 if ($type eq "single"){
90 $in_fastqs[0] = $for;
91 $out_fastqs[0] = $out_for;
92 }elsif ($type eq 'paired' ) {
93 $in_fastqs[0] = $for;
94 $in_fastqs[1] = $rev;
95 $out_fastqs[0] = $out_for;
96 $out_fastqs[1] = $out_rev;
97 }
98 else {
99 die "Given unknown read type of '$type'";
100 }
101
102 #get total read lengths from all fastq files given
103 my $total= get_total_length(@in_fastqs);
104
105
106 if (!($coverage)){
107 $coverage = 50;
108 }
109
110 my $seq_in = Bio::SeqIO->new(
111 -format => 'fasta',
112 -file => $fastaref,
113 );
114
115 while ( my $seq = $seq_in->next_seq()) {
116 $length += $seq->length();
117 }
118
119
120 print $log_fh "Downsampling to coverage of: $coverage\n";
121 print $log_fh "Total number of Basepairs: $total\n";
122 print $log_fh "Length of Reference: $length\n";
123
124 my $rawcoverage = $total/$length;
125 printf $log_fh "Raw Coverage: %.3f\n",$rawcoverage;
126
127 if($rawcoverage > $coverage){
128 #need to downsample
129 #calculate $subsample_size
130 $subsample_size = $coverage/$rawcoverage;
131 printf $log_fh "subsample: %.3f",$subsample_size;
132
133 foreach my $fastq (@in_fastqs){
134 my $out = shift @out_fastqs;
135 #seed always set to 42 for reproducibility
136 my $seqCommand = "seqtk sample -s42 $fastq $subsample_size > $out";
137 $rv = system($seqCommand);
138 #need to bit shift 8 bit because seqtk exit code for some reason are greater then standard 0-255 values that most unix application expect
139 die "Error when running '$seqCommand' command" if $rv >>8;
140 }
141 } else {
142 #no sampling needed, just copy the fastq's to the output
143 print "Subsampling not required\n";
144 foreach my $fastq (@in_fastqs){
145 my $out = shift @out_fastqs;
146 copy($fastq,$out) || die "Not able to copy '$fastq' to '$out' with error $!";
147 }
148 }
149
150
151
152 }
153
154
155 sub get_total_length {
156 my (@files) = @_;
157 my $total;
158 foreach my $fastq( @files) {
159
160 open my $in, "<",$fastq || die "Could not open file '$fastq'";
161 #skip first 3 lines
162 for ( 0..2) {
163 my $line = <$in>;
164 }
165
166 while ( <$in>) {
167 chomp;
168 $total+=length($_);
169 #skip first 3 lines
170 for ( 0..2) {
171 my $line = <$in>;
172 }
173 }
174 close $in;
175
176
177 }
178
179 return $total;
180
181 }
182
183 1;
184
185
186 =head1 NAME
187
188
189
190 seqtk_nml.pl - Down sample fastq(s) if raw coverage is above user provided level
191
192
193 =head1 SYNOPSIS
194
195 seqtk_nml.pl --ref reference.fasta --forward first_R1.fastq --reverse --reverse_R2.fastq --out_forward answer_R1.fastq --out_reverse answer_R2.fastq --log log-file
196
197
198 =head1 OPTIONS
199
200 =over
201
202 =item
203
204 =item B<--ref>
205
206 Reference fasta file that we getting the expected length [Required]
207
208
209 =item B<--cov>
210
211 Coverage desired i.e 50.0
212
213
214 =item B<--forward>
215
216 Forward fastq read file. [Required]
217
218 =item B<--reverse>
219
220 Reverse fastq read file. Can be optional
221
222
223 =item B<--out_forward>
224
225 Downsampled forward fastq read file. [Required]
226
227 =item B<--out_reverse>
228
229 Downsampled reverse fastq read file. Can be optional
230
231
232 =item B<--log>
233
234 Log file that indicate what has happen. [Required]
235
236 =item B<--type>
237
238 Indicate to application if we are receiving one or two fastq files [Required] ['paired','single']
239
240
241
242 =back
243
244 =head1 DESCRIPTION
245
246
247 Downsample fastq(s) reads based on the raw coverage from reference fasta file. Needed when we have too much data to run correctly in downstream analyses tools. i.e spades , snvphyl , etc..
248
249
250 =cut
251
252
253
254
255
256 =back
257
258
259 =head1 SYNOPSIS
260
261
262
263 =head1 DESCRIPTION
264
265
266
267 =cut
268