comparison trim_galore @ 4:2c1f0fe810f7 draft

Uploaded
author bgruening
date Wed, 15 Apr 2015 11:32:11 -0400
parents 898db63d2e84
children 11962ce40855
comparison
equal deleted inserted replaced
3:eb546ac2aab2 4:2c1f0fe810f7
5 use IPC::Open3; 5 use IPC::Open3;
6 use File::Spec; 6 use File::Spec;
7 use File::Basename; 7 use File::Basename;
8 use Cwd; 8 use Cwd;
9 9
10 ## This program is Copyright (C) 2012-13, Felix Krueger (felix.krueger@babraham.ac.uk) 10 ## This program is Copyright (C) 2012-14, Felix Krueger (felix.krueger@babraham.ac.uk)
11 11
12 ## This program is free software: you can redistribute it and/or modify 12 ## This program is free software: you can redistribute it and/or modify
13 ## it under the terms of the GNU General Public License as published by 13 ## it under the terms of the GNU General Public License as published by
14 ## the Free Software Foundation, either version 3 of the License, or 14 ## the Free Software Foundation, either version 3 of the License, or
15 ## (at your option) any later version. 15 ## (at your option) any later version.
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>. 23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
24 24
25 25
26 26
27 ## this script is taking in FastQ sequences and trims them with Cutadapt 27 ## this script is taking in FastQ sequences and trims them with Cutadapt
28 ## last modified on 10 April 2013 28
29 ## last modified on 16 July 2014
30
31
29 32
30 ######################################################################## 33 ########################################################################
31 34
32 # change these paths if needed 35 # change these paths if needed
33 36
34 my $path_to_cutadapt = 'cutadapt'; 37 my $path_to_cutadapt = 'cutadapt';
35 my $path_to_fastqc = 'fastqc'; 38 my $path_to_fastqc = 'fastqc';
36 39
37 ######################################################################## 40 ########################################################################
38 41
39 42 my $trimmer_version = '0.3.7';
40 my $trimmer_version = '0.2.8';
41 my $DOWARN = 1; # print on screen warning and text by default 43 my $DOWARN = 1; # print on screen warning and text by default
42 BEGIN { $SIG{'__WARN__'} = sub { warn $_[0] if $DOWARN } }; 44 BEGIN { $SIG{'__WARN__'} = sub { warn $_[0] if $DOWARN } };
43 45
44 my ($cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2) = process_commandline(); 46 my ($cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2,$three_prime_clip_r1,$three_prime_clip_r2) = process_commandline();
45 47
46 my @filenames = @ARGV; 48 my @filenames = @ARGV;
49
47 50
48 die "\nPlease provide the filename(s) of one or more FastQ file(s) to launch Trim Galore!\n 51 die "\nPlease provide the filename(s) of one or more FastQ file(s) to launch Trim Galore!\n
49 USAGE: 'trim_galore [options] <filename(s)>' or 'trim_galore --help' for more options\n\n" unless (@filenames); 52 USAGE: 'trim_galore [options] <filename(s)>' or 'trim_galore --help' for more options\n\n" unless (@filenames);
50 53
51 54
64 67
65 unless (defined $stringency){ 68 unless (defined $stringency){
66 $stringency = 1; 69 $stringency = 1;
67 } 70 }
68 71
69 unless (defined $length_cutoff){
70 $length_cutoff = 20;
71 }
72
73 if ($phred_encoding == 64){ 72 if ($phred_encoding == 64){
74 $cutoff += 31; 73 $cutoff += 31;
75 } 74 }
76 75
77 my $file_1; 76 my $file_1;
84 83
85 sub trim{ 84 sub trim{
86 my $filename = shift; 85 my $filename = shift;
87 86
88 my $output_filename = (split (/\//,$filename))[-1]; 87 my $output_filename = (split (/\//,$filename))[-1];
89 # warn "Here is the outputfile name: $output_filename\n";
90 88
91 my $report = $output_filename; 89 my $report = $output_filename;
92 $report =~ s/$/_trimming_report.txt/; 90 $report =~ s/$/_trimming_report.txt/;
93 91
94 if ($no_report_file) { 92 if ($no_report_file) {
95 $report = File::Spec->devnull; 93 $report = File::Spec->devnull;
96 open (REPORT,'>',$report) or die "Failed to write to file: $!\n"; 94 open (REPORT,'>',$report) or die "Failed to write to file '$report': $!\n";
97 # warn "Redirecting report output to /dev/null\n"; 95 # warn "Redirecting report output to /dev/null\n";
98 } 96 }
99 else{ 97 else{
100 open (REPORT,'>',$output_dir.$report) or die "Failed to write to file: $!\n"; 98 open (REPORT,'>',$output_dir.$report) or die "Failed to write to file '$report': $!\n";
101 warn "Writing report to '$output_dir$report'\n"; 99 warn "Writing report to '$output_dir$report'\n";
102 } 100 }
103 101
104 warn "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n"; 102 warn "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n";
105 print REPORT "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n"; 103 print REPORT "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n";
104
105 if ($validate){ # paired-end mode
106 warn "Trimming mode: paired-end\n";
107 print REPORT "Trimming mode: paired-end\n";
108 }
109 else{
110 warn "Trimming mode: single-end\n";
111 print REPORT "Trimming mode: single-end\n";
112 }
113
114
115 warn "Trim Galore version: $trimmer_version\n";
116 print REPORT "Trim Galore version: $trimmer_version\n";
106 117
107 warn "Quality Phred score cutoff: $phred_score_cutoff\n"; 118 warn "Quality Phred score cutoff: $phred_score_cutoff\n";
108 print REPORT "Quality Phred score cutoff: $phred_score_cutoff\n"; 119 print REPORT "Quality Phred score cutoff: $phred_score_cutoff\n";
109 120
110 warn "Quality encoding type selected: ASCII+$phred_encoding\n"; 121 warn "Quality encoding type selected: ASCII+$phred_encoding\n";
191 if ($clip_r2){ 202 if ($clip_r2){
192 warn "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n"; 203 warn "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n";
193 print REPORT "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n"; 204 print REPORT "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n";
194 } 205 }
195 206
207 if ($three_prime_clip_r1){
208 warn "All Read 1 sequences will be trimmed by $three_prime_clip_r1 bp from their 3' end to avoid poor qualities or biases\n";
209 print REPORT "All Read 1 sequences will be trimmed by $three_prime_clip_r1 bp from their 3' end to avoid poor qualities or biases\n";
210 }
211 if ($three_prime_clip_r2){
212 warn "All Read 2 sequences will be trimmed by $three_prime_clip_r2 bp from their 3' end to avoid poor qualities or biases\n";
213 print REPORT "All Read 2 sequences will be trimmed by $three_prime_clip_r2 bp from their 3' end to avoid poor qualities or biases\n";
214 }
196 215
197 if ($fastqc){ 216 if ($fastqc){
198 warn "Running FastQC on the data once trimming has completed\n"; 217 warn "Running FastQC on the data once trimming has completed\n";
199 print REPORT "Running FastQC on the data once trimming has completed\n"; 218 print REPORT "Running FastQC on the data once trimming has completed\n";
200 219
233 sleep (3); 252 sleep (3);
234 } 253 }
235 else{ 254 else{
236 255
237 $temp = $filename; 256 $temp = $filename;
257 $temp =~ s/^.*\///; # replacing optional file path information
238 $temp =~ s/$/_qual_trimmed.fastq/; 258 $temp =~ s/$/_qual_trimmed.fastq/;
239 open (TEMP,'>',$output_dir.$temp) or die "Can't write to $temp: $!"; 259 open (TEMP,'>',$output_dir.$temp) or die "Can't write to '$temp': $!";
240 260
241 warn " >>> Now performing adaptive quality trimming with a Phred-score cutoff of: $cutoff <<<\n\n"; 261 warn " >>> Now performing adaptive quality trimming with a Phred-score cutoff of: $cutoff <<<\n\n";
242 sleep (3); 262 sleep (3);
243 263
244 open (QUAL,"$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -a X $filename |") or die "Can't open pipe to Cutadapt: $!"; 264 open (QUAL,"$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -a X $filename |") or die "Can't open pipe to Cutadapt: $!";
258 } 278 }
259 print TEMP "$l1$seq$l3$qual"; 279 print TEMP "$l1$seq$l3$qual";
260 } 280 }
261 281
262 warn "\n >>> Quality trimming completed <<<\n$qual_count sequences processed in total\n\n"; 282 warn "\n >>> Quality trimming completed <<<\n$qual_count sequences processed in total\n\n";
263 close QUAL or die "Unable to close filehandle: $!\n"; 283 close QUAL or die "Unable to close QUAL filehandle: $!\n";
264 sleep (3); 284 sleep (3);
265 285
266 } 286 }
267 } 287 }
268 288
282 else{ 302 else{
283 $output_filename =~ s/$/_trimmed.fq/; 303 $output_filename =~ s/$/_trimmed.fq/;
284 } 304 }
285 305
286 if ($gzip or $filename =~ /\.gz$/){ 306 if ($gzip or $filename =~ /\.gz$/){
287 unless ($dont_gzip){ 307 if ($dont_gzip){
288 if ($validate){ 308 open (OUT,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; # don't need to gzip intermediate file
289 open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n"; # don't need to gzip intermediate file
290 }
291 else{
292 $output_filename .= '.gz';
293 open (OUT,"| gzip -c - > ${output_dir}${output_filename}") or die "Can't write to $output_filename: $!\n";
294 }
295 } 309 }
296 else{ 310 else{
297 open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n"; # don't need to gzip intermediate file 311 ### 6 Jan 2014: had a request to also gzip intermediate files to save disk space
298 } 312 # if ($validate){
299 } 313 # open (OUT,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; # don't need to gzip intermediate file
300 else{ 314 # }
301 open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n"; 315 $output_filename .= '.gz';
316 open (OUT,"| gzip -c - > ${output_dir}${output_filename}") or die "Can't write to '$output_filename': $!\n";
317 }
318 }
319 else{
320 open (OUT,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n";
302 } 321 }
303 warn "Writing final adapter and quality trimmed output to $output_filename\n\n"; 322 warn "Writing final adapter and quality trimmed output to $output_filename\n\n";
304 323
305 my $count = 0; 324 my $count = 0;
306 my $too_short = 0; 325 my $too_short = 0;
308 my $rrbs_trimmed = 0; 327 my $rrbs_trimmed = 0;
309 my $rrbs_trimmed_start = 0; 328 my $rrbs_trimmed_start = 0;
310 my $CAA = 0; 329 my $CAA = 0;
311 my $CGA = 0; 330 my $CGA = 0;
312 331
332 my $pid;
333
313 if ($rrbs and $cutoff != 0){ 334 if ($rrbs and $cutoff != 0){
314 335
315 ### optionally using 2 different adapters for read 1 and read 2 336 ### optionally using 2 different adapters for read 1 and read 2
316 if ($validate and $a2){ 337 if ($validate and $a2){
317 ### Figure out whether current file counts as read 1 or read 2 of paired-end files 338 ### Figure out whether current file counts as read 1 or read 2 of paired-end files
318 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair 339 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
319 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n"; 340 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n";
320 sleep (3); 341 sleep (3);
321 open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n"; 342 $pid = open (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
322 } 343 }
323 else{ # this is read 2 of a pair 344 else{ # this is read 2 of a pair
324 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$a2' from file $temp <<< \n"; 345 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$a2' from file $temp <<< \n";
325 sleep (3); 346 sleep (3);
326 open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $a2 $output_dir$temp") or die "Failed to launch Cutadapt: $!\n"; 347 $pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $a2 $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
327 } 348 }
328 } 349 }
329 ### Using the same adapter for both read 1 and read 2 350 ### Using the same adapter for both read 1 and read 2
330 else{ 351 else{
331 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n"; 352 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n";
332 sleep (3); 353 sleep (3);
333 open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n"; 354 $pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
334 } 355 }
335 356
336 close WRITER or die $!; # not needed 357 close WRITER or die $!; # not needed
337 358
338 open (QUAL,"$output_dir$temp") or die $!; # quality trimmed file 359 open (QUAL,"$output_dir$temp") or die $!; # quality trimmed file
422 print OUT "$l1$seq\n$l3$qual\n"; 443 print OUT "$l1$seq\n$l3$qual\n";
423 } 444 }
424 else{ # single end 445 else{ # single end
425 446
426 if ($clip_r1){ 447 if ($clip_r1){
427 $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence 448 if (length $seq > $clip_r1){ # sequences that are already too short won't be clipped again
428 $qual = substr($qual,$clip_r1); 449 $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
450 $qual = substr($qual,$clip_r1);
451 }
452 }
453
454 if ($three_prime_clip_r1){
455
456 if (length $seq > $three_prime_clip_r1){ # sequences that are already too short won't be clipped again
457 # warn "seq/qual before/after trimming:\n$seq\n$qual\n";
458 $seq = substr($seq,0,(length($seq) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence
459 $qual = substr($qual,0,(length($qual) - $three_prime_clip_r1 ));
460 # warn "$seq\n$qual\n";
461 }
462
429 } 463 }
430 464
431 if (length $seq < $length_cutoff){ 465 if (length $seq < $length_cutoff){
432 ++$too_short; 466 ++$too_short;
433 next; 467 next;
446 480
447 close IN or die "Unable to close IN filehandle: $!"; 481 close IN or die "Unable to close IN filehandle: $!";
448 close QUAL or die "Unable to close QUAL filehandle: $!"; 482 close QUAL or die "Unable to close QUAL filehandle: $!";
449 close TRIM or die "Unable to close TRIM filehandle: $!"; 483 close TRIM or die "Unable to close TRIM filehandle: $!";
450 close OUT or die "Unable to close OUT filehandle: $!"; 484 close OUT or die "Unable to close OUT filehandle: $!";
485
451 } 486 }
452 else{ 487 else{
453 488
454 ### optionally using 2 different adapters for read 1 and read 2 489 ### optionally using 2 different adapters for read 1 and read 2
455 if ($validate and $a2){ 490 if ($validate and $a2){
456 ### Figure out whether current file counts as read 1 or read 2 of paired-end files 491 ### Figure out whether current file counts as read 1 or read 2 of paired-end files
457 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair 492 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
458 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n"; 493 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n";
459 sleep (3); 494 sleep (3);
460 open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!"; 495 $pid = open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!";
461 } 496 }
462 else{ # this is read 2 of a pair 497 else{ # this is read 2 of a pair
463 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$a2' from file $filename <<< \n"; 498 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$a2' from file $filename <<< \n";
464 sleep (3); 499 sleep (3);
465 open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $a2 $filename") or die "Failed to launch Cutadapt: $!"; 500 $pid = open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $a2 $filename") or die "Failed to launch Cutadapt: $!";
466 } 501 }
467 } 502 }
468 ### Using the same adapter for both read 1 and read 2 503 ### Using the same adapter for both read 1 and read 2
469 else{ 504 else{
470 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n"; 505 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n";
471 sleep (3); 506 sleep (3);
472 open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!"; 507 $pid = open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!";
473 } 508 }
474 509
475 close WRITER or die $!; # not needed 510 close WRITER or die $!; # not needed
476 511
477 while (1){ 512 while (1){
500 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE 535 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE
501 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later) 536 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later)
502 print OUT "$l1$seq\n$l3$qual\n"; 537 print OUT "$l1$seq\n$l3$qual\n";
503 } 538 }
504 else{ # single end 539 else{ # single end
505 540
506 if ($clip_r1){ 541 if ($clip_r1){
507 $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence 542 if (length $seq > $clip_r1){ # sequences that are already too short won't be clipped again
508 $qual = substr($qual,$clip_r1); 543 $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
509 } 544 $qual = substr($qual,$clip_r1);
510 545 }
546 }
547
548 if ($three_prime_clip_r1){
549 if (length $seq > $three_prime_clip_r1){ # sequences that are already too short won't be clipped again
550 # warn "seq/qual before/after trimming:\n$seq\n$qual\n";
551 $seq = substr($seq,0,(length($seq) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence
552 $qual = substr($qual,0,(length($qual) - $three_prime_clip_r1));
553 # warn "$seq\n$qual\n";sleep(1);
554 }
555 }
556
511 if (length $seq < $length_cutoff){ 557 if (length $seq < $length_cutoff){
512 ++$too_short; 558 ++$too_short;
513 next; 559 next;
514 } 560 }
515 else{ 561 else{
528 close ERROR or die "Unable to close ERROR filehandle: $!\n"; 574 close ERROR or die "Unable to close ERROR filehandle: $!\n";
529 close OUT or die "Unable to close OUT filehandle: $!\n"; 575 close OUT or die "Unable to close OUT filehandle: $!\n";
530 576
531 } 577 }
532 578
579
533 if ($rrbs){ 580 if ($rrbs){
534 unless ($keep){ # keeping the quality trimmed intermediate file for RRBS files 581 unless ($keep){ # keeping the quality trimmed intermediate file for RRBS files
535 582
536 # deleting temporary quality trimmed file 583 # deleting temporary quality trimmed file
537 my $deleted = unlink "$output_dir$temp"; 584 my $deleted = unlink "$output_dir$temp";
543 warn "Could not delete temporary file $temp"; 590 warn "Could not delete temporary file $temp";
544 } 591 }
545 } 592 }
546 } 593 }
547 594
595 ### Wait and reap the child process (Cutadapt) so that it doesn't become a zombie process
596 waitpid $pid, 0;
597 unless ($? == 0){
598 die "\n\nCutadapt terminated with exit signal: '$?'.\nTerminating Trim Galore run, please check error message(s) to get an idea what went wrong...\n\n";
599 }
600
548 warn "\nRUN STATISTICS FOR INPUT FILE: $filename\n"; 601 warn "\nRUN STATISTICS FOR INPUT FILE: $filename\n";
549 print REPORT "\nRUN STATISTICS FOR INPUT FILE: $filename\n"; 602 print REPORT "\nRUN STATISTICS FOR INPUT FILE: $filename\n";
550 603
551 warn "="x 45,"\n"; 604 warn "="x 45,"\n";
552 print REPORT "="x 45,"\n"; 605 print REPORT "="x 45,"\n";
554 warn "$count sequences processed in total\n"; 607 warn "$count sequences processed in total\n";
555 print REPORT "$count sequences processed in total\n"; 608 print REPORT "$count sequences processed in total\n";
556 609
557 ### only reporting this separately if quality and adapter trimming were performed separately 610 ### only reporting this separately if quality and adapter trimming were performed separately
558 if ($rrbs){ 611 if ($rrbs){
559 my $percentage_shortened = sprintf ("%.1f",$quality_trimmed/$count*100); 612 my $percentage_shortened;
613 if ($count){
614 $percentage_shortened = sprintf ("%.1f",$quality_trimmed/$count*100);
615 warn "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
616 print REPORT "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
617 }
618 else{
619 warn "Unable to determine percentage of reads that were shortened because 0 lines were processed\n\n";
620 print REPORT "Unable to determine percentage of reads that were shortened because 0 lines were processed\n\n";
621 }
560 warn "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n"; 622 warn "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
561 print REPORT "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n"; 623 print REPORT "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
562 } 624 }
563 625
564 my $percentage_too_short = sprintf ("%.1f",$too_short/$count*100); 626 my $percentage_too_short;
565 warn "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n"; 627 if ($count){
566 print REPORT "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n"; 628 $percentage_too_short = sprintf ("%.1f",$too_short/$count*100);
629 }
630 else{
631 $percentage_too_short = 'N/A';
632 }
633
634 if ($validate){ ### only for paired-end files
635 warn "The length threshold of paired-end sequences gets evaluated later on (in the validation step)\n";
636 }
637 else{ ### Single-end file
638 warn "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n";
639 print REPORT "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n";
640 }
567 641
568 if ($rrbs){ 642 if ($rrbs){
569 my $percentage_rrbs_trimmed = sprintf ("%.1f",$rrbs_trimmed/$count*100); 643 my $percentage_rrbs_trimmed = sprintf ("%.1f",$rrbs_trimmed/$count*100);
570 warn "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n"; 644 warn "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n";
571 print REPORT "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n"; 645 print REPORT "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n";
809 } 883 }
810 } 884 }
811 885
812 chomp $seq_1; 886 chomp $seq_1;
813 chomp $seq_2; 887 chomp $seq_2;
888 chomp $qual_1;
889 chomp $qual_2;
814 890
815 if ($clip_r1){ 891 if ($clip_r1){
816 $seq_1 = substr($seq_1,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence 892 if (length $seq_1 > $clip_r1){ # sequences that are already too short won't be trimmed again
817 $qual_1 = substr($qual_1,$clip_r1); 893 $seq_1 = substr($seq_1,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
894 $qual_1 = substr($qual_1,$clip_r1);
895 }
818 } 896 }
819 if ($clip_r2){ 897 if ($clip_r2){
820 $seq_2 = substr($seq_2,$clip_r2); # starting after the sequences to be trimmed until the end of the sequence 898 if (length $seq_2 > $clip_r2){ # sequences that are already too short won't be trimmed again
821 $qual_2 = substr($qual_2,$clip_r2); 899 $seq_2 = substr($seq_2,$clip_r2); # starting after the sequences to be trimmed until the end of the sequence
822 } 900 $qual_2 = substr($qual_2,$clip_r2);
901 }
902 }
903
904 if ($three_prime_clip_r1){
905 if (length $seq_1 > $three_prime_clip_r1){ # sequences that are already too short won't be clipped again
906 $seq_1 = substr($seq_1,0,(length($seq_1) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence
907 $qual_1 = substr($qual_1,0,(length($qual_1) - $three_prime_clip_r1));
908 }
909 }
910 if ($three_prime_clip_r2){
911 if (length $seq_2 > $three_prime_clip_r2){ # sequences that are already too short won't be clipped again
912 $seq_2 = substr($seq_2,0,(length($seq_2) - $three_prime_clip_r2)); # starting after the sequences to be trimmed until the end of the sequence
913 $qual_2 = substr($qual_2,0,(length($qual_2) - $three_prime_clip_r2));
914 }
915 }
916
917
823 918
824 ### making sure that the reads do have a sensible length 919 ### making sure that the reads do have a sensible length
825 if ( (length($seq_1) < $length_cutoff) or (length($seq_2) < $length_cutoff) ){ 920 if ( (length($seq_1) < $length_cutoff) or (length($seq_2) < $length_cutoff) ){
826 ++$sequence_pairs_removed; 921 ++$sequence_pairs_removed;
827 if ($retain){ # writing out single-end reads if they are longer than the cutoff 922 if ($retain){ # writing out single-end reads if they are longer than the cutoff
828 923
829 if ( length($seq_1) >= $length_read_1){ # read 1 is long enough 924 if ( length($seq_1) >= $length_read_1){ # read 1 is long enough
830 print UNPAIRED1 $id_1; 925 print UNPAIRED1 $id_1;
831 print UNPAIRED1 "$seq_1\n"; 926 print UNPAIRED1 "$seq_1\n";
832 print UNPAIRED1 $l3_1; 927 print UNPAIRED1 $l3_1;
833 print UNPAIRED1 $qual_1; 928 print UNPAIRED1 "$qual_1\n";
834 ++$read_1_printed; 929 ++$read_1_printed;
835 } 930 }
836 931
837 if ( length($seq_2) >= $length_read_2){ # read 2 is long enough 932 if ( length($seq_2) >= $length_read_2){ # read 2 is long enough
838 print UNPAIRED2 $id_2; 933 print UNPAIRED2 $id_2;
839 print UNPAIRED2 "$seq_2\n"; 934 print UNPAIRED2 "$seq_2\n";
840 print UNPAIRED2 $l3_2; 935 print UNPAIRED2 $l3_2;
841 print UNPAIRED2 $qual_2; 936 print UNPAIRED2 "$qual_2\n";
842 ++$read_2_printed; 937 ++$read_2_printed;
843 } 938 }
844 939
845 } 940 }
846 } 941 }
847 else{ 942 else{
848 print R1 $id_1; 943 print R1 $id_1;
849 print R1 "$seq_1\n"; 944 print R1 "$seq_1\n";
850 print R1 $l3_1; 945 print R1 $l3_1;
851 print R1 $qual_1; 946 print R1 "$qual_1\n";
852 947
853 print R2 $id_2; 948 print R2 $id_2;
854 print R2 "$seq_2\n"; 949 print R2 "$seq_2\n";
855 print R2 $l3_2; 950 print R2 $l3_2;
856 print R2 $qual_2; 951 print R2 "$qual_2\n";
857 } 952 }
858 953
859 } 954 }
860 my $percentage = sprintf("%.2f",$sequence_pairs_removed/$count*100); 955
956
957 my $percentage;
958
959 if ($count){
960 $percentage = sprintf("%.2f",$sequence_pairs_removed/$count*100);
961 }
962 else{
963 $percentage = 'N/A';
964 }
965
861 warn "Total number of sequences analysed: $count\n\n"; 966 warn "Total number of sequences analysed: $count\n\n";
862 warn "Number of sequence pairs removed: $sequence_pairs_removed ($percentage%)\n"; 967 warn "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n";
863 968
864 print REPORT "Total number of sequences analysed for the sequence pair length validation: $count\n\n"; 969 print REPORT "Total number of sequences analysed for the sequence pair length validation: $count\n\n";
865 print REPORT "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n"; 970 print REPORT "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n";
866 971
867 if ($keep){ 972 if ($keep){
913 my $no_report_file; 1018 my $no_report_file;
914 my $suppress_warn; 1019 my $suppress_warn;
915 my $dont_gzip; 1020 my $dont_gzip;
916 my $clip_r1; 1021 my $clip_r1;
917 my $clip_r2; 1022 my $clip_r2;
1023 my $three_prime_clip_r1;
1024 my $three_prime_clip_r2;
918 1025
919 my $command_line = GetOptions ('help|man' => \$help, 1026 my $command_line = GetOptions ('help|man' => \$help,
920 'q|quality=i' => \$quality, 1027 'q|quality=i' => \$quality,
921 'a|adapter=s' => \$adapter, 1028 'a|adapter=s' => \$adapter,
922 'a2|adapter2=s' => \$adapter2, 1029 'a2|adapter2=s' => \$adapter2,
942 'no_report_file' => \$no_report_file, 1049 'no_report_file' => \$no_report_file,
943 'suppress_warn' => \$suppress_warn, 1050 'suppress_warn' => \$suppress_warn,
944 'dont_gzip' => \$dont_gzip, 1051 'dont_gzip' => \$dont_gzip,
945 'clip_R1=i' => \$clip_r1, 1052 'clip_R1=i' => \$clip_r1,
946 'clip_R2=i' => \$clip_r2, 1053 'clip_R2=i' => \$clip_r2,
1054 'three_prime_clip_R1=i' => \$three_prime_clip_r1,
1055 'three_prime_clip_R2=i' => \$three_prime_clip_r2,
947 ); 1056 );
948
949 1057
950 ### EXIT ON ERROR if there were errors with any of the supplied options 1058 ### EXIT ON ERROR if there were errors with any of the supplied options
951 unless ($command_line){ 1059 unless ($command_line){
952 die "Please respecify command line options\n"; 1060 die "Please respecify command line options\n";
953 } 1061 }
966 1074
967 Quality-/Adapter-/RRBS-Trimming 1075 Quality-/Adapter-/RRBS-Trimming
968 (powered by Cutadapt) 1076 (powered by Cutadapt)
969 version $trimmer_version 1077 version $trimmer_version
970 1078
971 Last update: 10 04 2013 1079 Last update: 15 04 2014
972 1080
973 VERSION 1081 VERSION
974 exit; 1082 exit;
975 } 1083 }
976 1084
1028 else{ 1136 else{
1029 $error_rate = 0.1; # (default) 1137 $error_rate = 0.1; # (default)
1030 } 1138 }
1031 1139
1032 if (defined $adapter){ 1140 if (defined $adapter){
1033 unless ($adapter =~ /^[ACTGNactgn]+$/){ 1141 unless ($adapter =~ /^[ACTGNXactgnx]+$/){
1034 die "Adapter sequence must contain DNA characters only (A,C,T,G or N)!\n"; 1142 die "Adapter sequence must contain DNA characters only (A,C,T,G or N)!\n";
1035 } 1143 }
1036 $adapter = uc$adapter; 1144 $adapter = uc$adapter;
1037 } 1145 }
1038 1146
1044 die "Optional adapter 2 sequence must contain DNA characters only (A,C,T,G or N)!\n"; 1152 die "Optional adapter 2 sequence must contain DNA characters only (A,C,T,G or N)!\n";
1045 } 1153 }
1046 $adapter2 = uc$adapter2; 1154 $adapter2 = uc$adapter2;
1047 } 1155 }
1048 1156
1157 ### LENGTH CUTOFF
1158 unless (defined $length_cutoff){
1159 $length_cutoff = 20;
1160 }
1161
1049 ### files are supposed to be paired-end files 1162 ### files are supposed to be paired-end files
1050 if ($validate){ 1163 if ($validate){
1051 1164
1052 # making sure that an even number of reads has been supplied 1165 # making sure that an even number of reads has been supplied
1053 unless ((scalar@ARGV)%2 == 0){ 1166 unless ((scalar@ARGV)%2 == 0){
1054 die "Please provide an even number of input files for paired-end FastQ trimming! Aborting ...\n"; 1167 die "Please provide an even number of input files for paired-end FastQ trimming! Aborting ...\n";
1055 } 1168 }
1056 1169
1057 ## CUTOFF FOR VALIDATED READ-PAIRS 1170 ## CUTOFF FOR VALIDATED READ-PAIRS
1058
1059 if (defined $length_read_1 or defined $length_read_2){ 1171 if (defined $length_read_1 or defined $length_read_2){
1172
1060 unless ($retain){ 1173 unless ($retain){
1061 die "Please specify --keep_unpaired to alter the unpaired single-end read length cut off(s)\n\n"; 1174 die "Please specify --keep_unpaired to alter the unpaired single-end read length cut off(s)\n\n";
1062 } 1175 }
1063 1176
1064 if (defined $length_read_1){ 1177 if (defined $length_read_1){
1065 unless ($length_read_1 >= 15 and $length_read_1 <= 100){ 1178 unless ($length_read_1 >= 15 and $length_read_1 <= 100){
1066 die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n"; 1179 die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n";
1067 } 1180 }
1068 unless ($length_read_1 > $length_cutoff){ 1181 unless ($length_read_1 > $length_cutoff){
1069 die "The single-end unpaired read length needs to be longer than the paired-end cut-off value\n\n"; 1182 die "The single-end unpaired read length needs to be longer than the paired-end cut-off value ($length_cutoff bp)\n\n";
1070 } 1183 }
1071 } 1184 }
1072 1185
1073 if (defined $length_read_2){ 1186 if (defined $length_read_2){
1074 unless ($length_read_2 >= 15 and $length_read_2 <= 100){ 1187 unless ($length_read_2 >= 15 and $length_read_2 <= 100){
1075 die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n"; 1188 die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n";
1076 } 1189 }
1077 unless ($length_read_2 > $length_cutoff){ 1190 unless ($length_read_2 > $length_cutoff){
1078 die "The single-end unpaired read length needs to be longer than the paired-end cut-off value\n\n"; 1191 die "The single-end unpaired read length needs to be longer than the paired-end cut-off value ($length_cutoff bp)\n\n";
1079 } 1192 }
1080 } 1193 }
1081 } 1194 }
1082 1195
1083 if ($retain){ 1196 if ($retain){
1115 unless ($clip_r2 > 0 and $clip_r2 < 100){ 1228 unless ($clip_r2 > 0 and $clip_r2 < 100){
1116 die "The 5' clipping value for read 2 should have a sensible value (> 0 and < read length)\n\n"; 1229 die "The 5' clipping value for read 2 should have a sensible value (> 0 and < read length)\n\n";
1117 } 1230 }
1118 } 1231 }
1119 1232
1120 1233 ### Trimming at the 3' end
1121 return ($quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2); 1234 if (defined $three_prime_clip_r1){ # trimming 3' bases of read 1
1235 unless ($three_prime_clip_r1 > 0 and $three_prime_clip_r1 < 100){
1236 die "The 3' clipping value for read 1 should have a sensible value (> 0 and < read length)\n\n";
1237 }
1238 }
1239
1240 if (defined $three_prime_clip_r2){ # trimming 3' bases of read 2
1241 unless ($three_prime_clip_r2 > 0 and $three_prime_clip_r2 < 100){
1242 die "The 3' clipping value for read 2 should have a sensible value (> 0 and < read length)\n\n";
1243 }
1244 }
1245
1246
1247 return ($quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2,$three_prime_clip_r1,$three_prime_clip_r2);
1122 } 1248 }
1123 1249
1124 1250
1125 1251
1126 1252
1163 1289
1164 -a2/--adapter2 <STRING> Optional adapter sequence to be trimmed off read 2 of paired-end files. This 1290 -a2/--adapter2 <STRING> Optional adapter sequence to be trimmed off read 2 of paired-end files. This
1165 option requires '--paired' to be specified as well. 1291 option requires '--paired' to be specified as well.
1166 1292
1167 1293
1168 -s/--stringency <INT> Overlap with adapter sequence required to trim a sequence. Defaults to a 1294 --stringency <INT> Overlap with adapter sequence required to trim a sequence. Defaults to a
1169 very stringent setting of '1', i.e. even a single bp of overlapping sequence 1295 very stringent setting of 1, i.e. even a single bp of overlapping sequence
1170 will be trimmed of the 3' end of any read. 1296 will be trimmed off from the 3' end of any read.
1171 1297
1172 -e <ERROR RATE> Maximum allowed error rate (no. of errors divided by the length of the matching 1298 -e <ERROR RATE> Maximum allowed error rate (no. of errors divided by the length of the matching
1173 region) (default: 0.1) 1299 region) (default: 0.1)
1174 1300
1175 --gzip Compress the output file with GZIP. If the input files are GZIP-compressed 1301 --gzip Compress the output file with GZIP. If the input files are GZIP-compressed
1203 of unwanted bias at the 5' end. For paired-end BS-Seq, it is recommended to remove 1329 of unwanted bias at the 5' end. For paired-end BS-Seq, it is recommended to remove
1204 the first few bp because the end-repair reaction may introduce a bias towards low 1330 the first few bp because the end-repair reaction may introduce a bias towards low
1205 methylation. Please refer to the M-bias plot section in the Bismark User Guide for 1331 methylation. Please refer to the M-bias plot section in the Bismark User Guide for
1206 some examples. Default: OFF. 1332 some examples. Default: OFF.
1207 1333
1334 --three_prime_clip_R1 <int> Instructs Trim Galore to remove <int> bp from the 3' end of read 1 (or single-end
1335 reads) AFTER adapter/quality trimming has been performed. This may remove some unwanted
1336 bias from the 3' end that is not directly related to adapter sequence or basecall quality.
1337 Default: OFF.
1338
1339 --three_prime_clip_R2 <int> Instructs Trim Galore to remove <int> bp from the 3' end of read 2 AFTER
1340 adapter/quality trimming has been performed. This may remove some unwanted bias from
1341 the 3' end that is not directly related to adapter sequence or basecall quality.
1342 Default: OFF.
1208 1343
1209 1344
1210 RRBS-specific options (MspI digested material): 1345 RRBS-specific options (MspI digested material):
1211 1346
1212 --rrbs Specifies that the input file was an MspI digested RRBS sample (recognition 1347 --rrbs Specifies that the input file was an MspI digested RRBS sample (recognition
1271 -r2/--length_2 <INT> Unpaired single-end read length cutoff needed for read 2 to be written to 1406 -r2/--length_2 <INT> Unpaired single-end read length cutoff needed for read 2 to be written to
1272 '.unpaired_2.fq' output file. These reads may be mapped in single-end mode. 1407 '.unpaired_2.fq' output file. These reads may be mapped in single-end mode.
1273 Default: 35 bp. 1408 Default: 35 bp.
1274 1409
1275 1410
1276 Last modified on 15 July 2013. 1411 Last modified on 16 July 2014.
1277 1412
1278 HELP 1413 HELP
1279 exit; 1414 exit;
1280 } 1415 }