comparison rapsodyn/PrepareFastqLight.pl @ 7:3f7b0788a1c4 draft

Uploaded
author mcharles
date Tue, 07 Oct 2014 10:34:34 -0400
parents b0cbb9d21aa9
children d857538d9fea
comparison
equal deleted inserted replaced
6:1776b8ddd87e 7:3f7b0788a1c4
1 #!/usr/bin/perl 1 #!/usr/bin/perl
2 #V1.0.1 added log, option parameters
2 use strict; 3 use strict;
3 use warnings; 4 use warnings;
4 5 use Getopt::Long;
5 my $read1 = $ARGV[0]; 6
6 my $read2 = $ARGV[1]; 7 my $read1_file;
7 8 my $read2_file;
8 my $output1 = $ARGV[2]; 9 my $log_file;
9 my $output2 = $ARGV[3]; 10 my $output1_file;
10 11 my $output2_file;
11 my $TYPE = $ARGV[4]; 12
12 my $MIN_LENGTH = $ARGV[5]; 13 my $TYPE="sanger";
13 my $MIN_QUALITY = $ARGV[6]; 14 my $MIN_LENGTH=30;
14 15 my $MIN_QUALITY=30;
15 my $VERBOSE = $ARGV[7]; 16
16 17 my $VERBOSE = "OFF";
17 if (!$VERBOSE){ 18
18 $VERBOSE ="OFF"; 19 GetOptions (
19 } 20 "read1_file=s" => \$read1_file,
20 21 "read2_file=s" => \$read2_file,
21 open(READ1, $read1) or die ("Can't open $read1\n"); 22 "log_file=s" => \$log_file,
22 open(READ2, $read2) or die ("Can't open $read2\n"); 23 "output1_file=s" => \$output1_file,
23 open(OUT1, ">$output1") or die ("Can't open $output1\n"); 24 "output2_file=s" => \$output2_file,
24 open(OUT2, ">$output2") or die ("Can't open $output2\n"); 25 "type=s" => \$TYPE,
25 26 "min_length=i" => \$MIN_LENGTH,
27 "min_quality=i" => \$MIN_QUALITY,
28 "verbose=s" => \$VERBOSE
29 ) or die("Error in command line arguments\n");
30
31
32 my $nb_read1=0;
33 my $nb_base_read1=0;
34 my $nb_read2=0;
35 my $nb_base_read2=0;
36
37 my $nb_read1_t=0;
38 my $nb_base_read1_t=0;
39 my $nb_read2_t=0;
40 my $nb_base_read2_t=0;
41
42 my $nb_base_current_t=0;
43
44
45 open(READ1, $read1_file) or die ("Can't open $read1_file\n");
46 open(READ2, $read2_file) or die ("Can't open $read2_file\n");
47 open(OUT1, ">$output1_file") or die ("Can't open $output1_file\n");
48 open(OUT2, ">$output2_file") or die ("Can't open $output2_file\n");
26 49
27 my $error1=0; 50 my $error1=0;
28 my $error2=0; 51 my $error2=0;
29 my $error3=0; 52 my $error3=0;
30 my $error4=0; 53 my $error4=0;
41 my $ligne4_r1 =<READ1>; 64 my $ligne4_r1 =<READ1>;
42 my $ligne1_r2 =<READ2>; 65 my $ligne1_r2 =<READ2>;
43 my $ligne2_r2 =<READ2>; 66 my $ligne2_r2 =<READ2>;
44 my $ligne3_r2 =<READ2>; 67 my $ligne3_r2 =<READ2>;
45 my $ligne4_r2 =<READ2>; 68 my $ligne4_r2 =<READ2>;
69
70 $nb_read1++;
71 $nb_read2++;
46 72
47 #@ 1 sec 73 #@ 1 sec
48 if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){ 74 if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){
49 if ($VERBOSE eq "ON"){ 75 if ($VERBOSE eq "ON"){
50 print "Error in file format"; 76 print "Error in file format";
89 my $qual2; 115 my $qual2;
90 my $header1=""; 116 my $header1="";
91 my $header2=""; 117 my $header2="";
92 my $repheader1=""; 118 my $repheader1="";
93 my $repheader2=""; 119 my $repheader2="";
120
94 121
95 if ($ligne1_r1 =~/^\@(.*?)\#/){ 122 if ($ligne1_r1 =~/^\@(.*?)\#/){
96 $header1 = $1; 123 $header1 = $1;
97 } 124 }
98 125
188 #@ 1 - 2 sec 215 #@ 1 - 2 sec
189 else { 216 else {
190 ### Parsing sequence & qualité 217 ### Parsing sequence & qualité
191 if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){ 218 if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){
192 $seq1 = $1; 219 $seq1 = $1;
220 $nb_base_read1 += length($seq1);
193 } 221 }
194 if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){ 222 if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){
195 $seq2 = $1; 223 $seq2 = $1;
224 $nb_base_read2 += length($seq2);
196 } 225 }
197 if ($ligne4_r1 =~ /^(.*)\s*$/i){ 226 if ($ligne4_r1 =~ /^(.*)\s*$/i){
198 $qual1 = $1; 227 $qual1 = $1;
199 } 228 }
200 if ($ligne4_r2 =~ /^(.*)\s*$/i){ 229 if ($ligne4_r2 =~ /^(.*)\s*$/i){
234 } 263 }
235 #@ <1 sec 264 #@ <1 sec
236 else { 265 else {
237 my $fastq_lines_r1=""; 266 my $fastq_lines_r1="";
238 my $fastq_lines_r2=""; 267 my $fastq_lines_r2="";
268 my $nb_base_current_read1_t = 0;
269 my $nb_base_current_read2_t = 0;
270
239 $fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1); 271 $fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1);
272 $nb_base_current_read1_t = $nb_base_current_t;
240 if ($fastq_lines_r1){ 273 if ($fastq_lines_r1){
241 $fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2); 274 $fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2);
275 $nb_base_current_read2_t = $nb_base_current_t;
242 } 276 }
243 if ($fastq_lines_r2){ 277 if ($fastq_lines_r2){
244 print OUT1 $fastq_lines_r1; 278 print OUT1 $fastq_lines_r1;
245 print OUT2 $fastq_lines_r2; 279 print OUT2 $fastq_lines_r2;
280
281 $nb_read1_t++;
282 $nb_read2_t++;
283 $nb_base_read1_t += $nb_base_current_read1_t;
284 $nb_base_read2_t += $nb_base_current_read2_t;
285
286
246 } 287 }
247 } 288 }
248 } 289 }
249 290
250 # print OUT1 $ligne1_r1; 291
251 # print OUT1 $ligne2_r1;
252 # print OUT1 $ligne3_r1;
253 # print OUT1 $ligne4_r1;
254 # print OUT2 $ligne1_r2;
255 # print OUT2 $ligne2_r2;
256 # print OUT2 $ligne3_r2;
257 # print OUT2 $ligne4_r2;
258
259 #@ 7 sec 292 #@ 7 sec
260 } 293 }
261 } 294 }
262
263
264 295
265 close (READ1); 296 close (READ1);
266 close (READ2); 297 close (READ2);
267 close (OUT1); 298 close (OUT1);
268 close (OUT2); 299 close (OUT2);
269 300
301 open (LF,">$log_file") or die("Can't open $log_file\n");
302 print LF "\n####\t Fastq preparation \n";
303 print LF "## Before preparation\n";
304 print LF "#Read1 :\t$nb_read1\t#Base :\t$nb_base_read1\n";
305 print LF "#Read2 :\t$nb_read2\t#Base :\t$nb_base_read2\n";
306 print LF "## After preparation\n";
307 print LF "#Read1 :\t$nb_read1_t\t#Base :\t$nb_base_read1_t\n";
308 print LF "#Read2 :\t$nb_read2_t\t#Base :\t$nb_base_read2_t\n";
309 close (LF);
310
270 311
271 sub grooming_and_trimming{ 312 sub grooming_and_trimming{
272 my $header = shift; 313 my $header = shift;
273 my $seq = shift; 314 my $seq = shift;
274 my $quality = shift; 315 my $quality = shift;
324 $stopTrim = $coord{"stop"}; 365 $stopTrim = $coord{"stop"};
325 #print "$startTrim .. $stopTrim\n"; 366 #print "$startTrim .. $stopTrim\n";
326 367
327 } 368 }
328 my $lengthTrim = $stopTrim - $startTrim +1; 369 my $lengthTrim = $stopTrim - $startTrim +1;
329 370
371 #if ($stats_length{$lengthTrim}){
372 # $stats_length{$lengthTrim} = 1;
373 #}
374 #else {
375 # $stats_length{$lengthTrim}++;
376 #}
330 my $fastq_lines=""; 377 my $fastq_lines="";
331 378
332 # if ($header =~ /GA8\-EAS671_0005\:3\:1\:1043\:4432/){ 379 # if ($header =~ /GA8\-EAS671_0005\:3\:1\:1043\:4432/){
333 # print "HEAD:\t$header"; 380 # print "HEAD:\t$header";
334 # print "SEQ:\n$seq\n"; 381 # print "SEQ:\n$seq\n";
341 # print "$startTrim .. $stopTrim / $lengthTrim \n"; 388 # print "$startTrim .. $stopTrim / $lengthTrim \n";
342 # print $fastq_lines; 389 # print $fastq_lines;
343 # print "\n"; 390 # print "\n";
344 # } 391 # }
345 392
393 #for (my $i=$startTrim;$i<=$stopTrim;$i++){
394 # if ($stats_quality{ord($q{$i])}){
395 # $stats_quality{ord($q{$i])}=1;
396 # }
397 # else {
398 # $stats_quality{ord($q{$i])}++;
399 # }
400 #}
401
346 if ($lengthTrim >= $MIN_LENGTH){ 402 if ($lengthTrim >= $MIN_LENGTH){
347 $fastq_lines .= $header; 403 $fastq_lines .= $header;
348 $fastq_lines .= substr($seq,$startTrim,$lengthTrim)."\n"; 404 my $new_seq = substr($seq,$startTrim,$lengthTrim);
405 $nb_base_current_t = length($new_seq);
406 $fastq_lines .= $new_seq."\n";
349 $fastq_lines .= "+\n"; 407 $fastq_lines .= "+\n";
350 $fastq_lines .= substr($quality,$startTrim,$lengthTrim)."\n"; 408 my $new_q = substr($quality,$startTrim,$lengthTrim);
409 $fastq_lines .= $new_q."\n";
351 return $fastq_lines; 410 return $fastq_lines;
352 411
353 } 412 }
354 else { 413 else {
355 #print "Insufficient length after trimming\n"; 414 #print "Insufficient length after trimming\n";