Mercurial > repos > nml > seqtk_nml
comparison seqtk_nml.pl @ 0:e1867440ed36 draft
planemo upload for repository https://github.com/phac-nml/snvphyl-galaxy commit 008f4667b70be22e9ddf496738b3f74bb942ed28
author | nml |
---|---|
date | Tue, 19 Sep 2017 16:37:42 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e1867440ed36 |
---|---|
1 #!/usr/bin/env perl | |
2 package seqtk_nml; | |
3 use warnings; | |
4 use strict; | |
5 use Bio::SeqIO; | |
6 use Getopt::Long; | |
7 use Pod::Usage; | |
8 use File::Copy; | |
9 __PACKAGE__->run unless caller; | |
10 | |
11 | |
12 my $rv; | |
13 | |
14 sub get_parameters { | |
15 my ($fastaref, $type, $coverage, $length,$log); | |
16 my ($for,$rev,$out_for,$out_rev); | |
17 #determine if our input are as sub arguments or getopt::long | |
18 if ( @_ && $_[0] eq __PACKAGE__ ) { | |
19 Getopt::Long::Configure('bundling'); | |
20 GetOptions( | |
21 'ref=s' => \$fastaref, | |
22 'type=s' => \$type, | |
23 'forward=s' => \$for, | |
24 'reverse=s' => \$rev, | |
25 'out_forward=s' => \$out_for, | |
26 'out_reverse=s' => \$out_rev, | |
27 'log=s'=> \$log, | |
28 'cov=s' => \$coverage | |
29 ); | |
30 } | |
31 | |
32 if ( !$for || !( -e $for ) ) { | |
33 print "ERROR: Was not given or could not find fastq file: '$for'\n"; | |
34 pod2usage( -verbose => 1 ); | |
35 } | |
36 | |
37 if ( !$out_for ) { | |
38 print "ERROR: Was not given output file path for fastq\n"; | |
39 pod2usage( -verbose => 1 ); | |
40 } | |
41 | |
42 if ( $type eq 'paired') { | |
43 if ( !$rev || !( -e $rev ) ) { | |
44 print "ERROR: Was not given or could not find reverse fastq file: '$rev'\n"; | |
45 pod2usage( -verbose => 1 ); | |
46 } | |
47 | |
48 if ( !$out_rev ) { | |
49 print "ERROR: Was not given output file path for reverse fastq\n"; | |
50 pod2usage( -verbose => 1 ); | |
51 } | |
52 } | |
53 | |
54 if ( !$coverage ) { | |
55 print "ERROR: Was not given a coverage number\n"; | |
56 pod2usage( -verbose => 1 ); | |
57 } | |
58 | |
59 if ( $coverage <=0 ) { | |
60 print "ERROR: Was given a coverage less than 0\n"; | |
61 pod2usage( -verbose => 1 ); | |
62 } | |
63 | |
64 | |
65 | |
66 if ( !$log ) { | |
67 print "ERROR: Was not given a log file\n"; | |
68 pod2usage( -verbose => 1 ); | |
69 } | |
70 | |
71 | |
72 return ($fastaref, $type, $coverage, $length, $log,$for,$rev,$out_for,$out_rev); | |
73 } | |
74 | |
75 | |
76 sub run { | |
77 my ($fastaref, $type, $coverage, $length, $log,$for,$rev,$out_for,$out_rev) = get_parameters(@_); | |
78 my $subsample_size; | |
79 | |
80 | |
81 #open log fh here | |
82 open my $log_fh,">" ,"$log"; | |
83 | |
84 | |
85 my @in_fastqs; | |
86 my @out_fastqs; | |
87 | |
88 | |
89 if ($type eq "single"){ | |
90 $in_fastqs[0] = $for; | |
91 $out_fastqs[0] = $out_for; | |
92 }elsif ($type eq 'paired' ) { | |
93 $in_fastqs[0] = $for; | |
94 $in_fastqs[1] = $rev; | |
95 $out_fastqs[0] = $out_for; | |
96 $out_fastqs[1] = $out_rev; | |
97 } | |
98 else { | |
99 die "Given unknown read type of '$type'"; | |
100 } | |
101 | |
102 #get total read lengths from all fastq files given | |
103 my $total= get_total_length(@in_fastqs); | |
104 | |
105 | |
106 if (!($coverage)){ | |
107 $coverage = 50; | |
108 } | |
109 | |
110 my $seq_in = Bio::SeqIO->new( | |
111 -format => 'fasta', | |
112 -file => $fastaref, | |
113 ); | |
114 | |
115 while ( my $seq = $seq_in->next_seq()) { | |
116 $length += $seq->length(); | |
117 } | |
118 | |
119 | |
120 print $log_fh "Downsampling to coverage of: $coverage\n"; | |
121 print $log_fh "Total number of Basepairs: $total\n"; | |
122 print $log_fh "Length of Reference: $length\n"; | |
123 | |
124 my $rawcoverage = $total/$length; | |
125 printf $log_fh "Raw Coverage: %.3f\n",$rawcoverage; | |
126 | |
127 if($rawcoverage > $coverage){ | |
128 #need to downsample | |
129 #calculate $subsample_size | |
130 $subsample_size = $coverage/$rawcoverage; | |
131 printf $log_fh "subsample: %.3f",$subsample_size; | |
132 | |
133 foreach my $fastq (@in_fastqs){ | |
134 my $out = shift @out_fastqs; | |
135 #seed always set to 42 for reproducibility | |
136 my $seqCommand = "seqtk sample -s42 $fastq $subsample_size > $out"; | |
137 $rv = system($seqCommand); | |
138 #need to bit shift 8 bit because seqtk exit code for some reason are greater then standard 0-255 values that most unix application expect | |
139 die "Error when running '$seqCommand' command" if $rv >>8; | |
140 } | |
141 } else { | |
142 #no sampling needed, just copy the fastq's to the output | |
143 print "Subsampling not required\n"; | |
144 foreach my $fastq (@in_fastqs){ | |
145 my $out = shift @out_fastqs; | |
146 copy($fastq,$out) || die "Not able to copy '$fastq' to '$out' with error $!"; | |
147 } | |
148 } | |
149 | |
150 | |
151 | |
152 } | |
153 | |
154 | |
155 sub get_total_length { | |
156 my (@files) = @_; | |
157 my $total; | |
158 foreach my $fastq( @files) { | |
159 | |
160 open my $in, "<",$fastq || die "Could not open file '$fastq'"; | |
161 #skip first 3 lines | |
162 for ( 0..2) { | |
163 my $line = <$in>; | |
164 } | |
165 | |
166 while ( <$in>) { | |
167 chomp; | |
168 $total+=length($_); | |
169 #skip first 3 lines | |
170 for ( 0..2) { | |
171 my $line = <$in>; | |
172 } | |
173 } | |
174 close $in; | |
175 | |
176 | |
177 } | |
178 | |
179 return $total; | |
180 | |
181 } | |
182 | |
183 1; | |
184 | |
185 | |
186 =head1 NAME | |
187 | |
188 | |
189 | |
190 seqtk_nml.pl - Down sample fastq(s) if raw coverage is above user provided level | |
191 | |
192 | |
193 =head1 SYNOPSIS | |
194 | |
195 seqtk_nml.pl --ref reference.fasta --forward first_R1.fastq --reverse --reverse_R2.fastq --out_forward answer_R1.fastq --out_reverse answer_R2.fastq --log log-file | |
196 | |
197 | |
198 =head1 OPTIONS | |
199 | |
200 =over | |
201 | |
202 =item | |
203 | |
204 =item B<--ref> | |
205 | |
206 Reference fasta file that we getting the expected length [Required] | |
207 | |
208 | |
209 =item B<--cov> | |
210 | |
211 Coverage desired i.e 50.0 | |
212 | |
213 | |
214 =item B<--forward> | |
215 | |
216 Forward fastq read file. [Required] | |
217 | |
218 =item B<--reverse> | |
219 | |
220 Reverse fastq read file. Can be optional | |
221 | |
222 | |
223 =item B<--out_forward> | |
224 | |
225 Downsampled forward fastq read file. [Required] | |
226 | |
227 =item B<--out_reverse> | |
228 | |
229 Downsampled reverse fastq read file. Can be optional | |
230 | |
231 | |
232 =item B<--log> | |
233 | |
234 Log file that indicate what has happen. [Required] | |
235 | |
236 =item B<--type> | |
237 | |
238 Indicate to application if we are receiving one or two fastq files [Required] ['paired','single'] | |
239 | |
240 | |
241 | |
242 =back | |
243 | |
244 =head1 DESCRIPTION | |
245 | |
246 | |
247 Downsample fastq(s) reads based on the raw coverage from reference fasta file. Needed when we have too much data to run correctly in downstream analyses tools. i.e spades , snvphyl , etc.. | |
248 | |
249 | |
250 =cut | |
251 | |
252 | |
253 | |
254 | |
255 | |
256 =back | |
257 | |
258 | |
259 =head1 SYNOPSIS | |
260 | |
261 | |
262 | |
263 =head1 DESCRIPTION | |
264 | |
265 | |
266 | |
267 =cut | |
268 |