Mercurial > repos > bgruening > trim_galore
diff trim_galore @ 1:898db63d2e84 draft
upgrade to new version
author | bgruening |
---|---|
date | Wed, 17 Jul 2013 15:05:43 -0400 |
parents | 3c1664caa8e3 |
children | 2c1f0fe810f7 |
line wrap: on
line diff
--- a/trim_galore Sat Jul 06 09:52:23 2013 -0400 +++ b/trim_galore Wed Jul 17 15:05:43 2013 -0400 @@ -7,7 +7,7 @@ use File::Basename; use Cwd; -## This program is Copyright (C) 2012, Felix Krueger (felix.krueger@babraham.ac.uk) +## This program is Copyright (C) 2012-13, Felix Krueger (felix.krueger@babraham.ac.uk) ## This program is free software: you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by @@ -25,7 +25,7 @@ ## this script is taking in FastQ sequences and trims them with Cutadapt -## last modified on 18 10 2012 +## last modified on 10 April 2013 ######################################################################## @@ -37,11 +37,17 @@ ######################################################################## -my $trimmer_version = '0.2.5'; +my $trimmer_version = '0.2.8'; my $DOWARN = 1; # print on screen warning and text by default BEGIN { $SIG{'__WARN__'} = sub { warn $_[0] if $DOWARN } }; -my ($cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file) = process_commandline(); +my ($cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2) = process_commandline(); + +my @filenames = @ARGV; + +die "\nPlease provide the filename(s) of one or more FastQ file(s) to launch Trim Galore!\n +USAGE: 'trim_galore [options] <filename(s)>' or 'trim_galore --help' for more options\n\n" unless (@filenames); + ### SETTING DEFAULTS UNLESS THEY WERE SPECIFIED unless (defined $cutoff){ @@ -68,8 +74,6 @@ $cutoff += 31; } -my @filenames = @ARGV; - my $file_1; my $file_2; @@ -155,7 +159,7 @@ } if ($length_read_2 == 35){ - warn "Length cut-off for read 2: $length_read_2 b (default)\n"; + warn "Length cut-off for read 2: $length_read_2 bb (default)\n"; print REPORT "Length cut-off for read 2: $length_read_2 bp (default)\n"; } else{ @@ -180,6 +184,16 @@ print REPORT "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n"; } + if ($clip_r1){ + warn "All Read 1 sequences will be trimmed by $clip_r1 bp from their 5' end to avoid poor qualities or biases\n"; + print REPORT "All Read 1 sequences will be trimmed by $clip_r1 bp from their 5' end to avoid poor qualities or biases\n"; + } + if ($clip_r2){ + warn "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n"; + print REPORT "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n"; + } + + if ($fastqc){ warn "Running FastQC on the data once trimming has completed\n"; print REPORT "Running FastQC on the data once trimming has completed\n"; @@ -195,9 +209,13 @@ print REPORT "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n"; } + if ($gzip or $filename =~ /\.gz$/){ - warn "Output file will be GZIP compressed\n"; - print REPORT "Output file will be GZIP compressed\n"; + $gzip = 1; + unless ($dont_gzip){ + warn "Output file(s) will be GZIP compressed\n"; + print REPORT "Output file will be GZIP compressed\n"; + } } warn "\n"; @@ -265,9 +283,24 @@ $output_filename =~ s/$/_trimmed.fq/; } + if ($gzip or $filename =~ /\.gz$/){ + unless ($dont_gzip){ + if ($validate){ + open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n"; # don't need to gzip intermediate file + } + else{ + $output_filename .= '.gz'; + open (OUT,"| gzip -c - > ${output_dir}${output_filename}") or die "Can't write to $output_filename: $!\n"; + } + } + else{ + open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n"; # don't need to gzip intermediate file + } + } + else{ + open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n"; + } warn "Writing final adapter and quality trimmed output to $output_filename\n\n"; - open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n"; - sleep (2); my $count = 0; my $too_short = 0; @@ -389,6 +422,12 @@ print OUT "$l1$seq\n$l3$qual\n"; } else{ # single end + + if ($clip_r1){ + $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence + $qual = substr($qual,$clip_r1); + } + if (length $seq < $length_cutoff){ ++$too_short; next; @@ -463,6 +502,12 @@ print OUT "$l1$seq\n$l3$qual\n"; } else{ # single end + + if ($clip_r1){ + $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence + $qual = substr($qual,$clip_r1); + } + if (length $seq < $length_cutoff){ ++$too_short; next; @@ -535,24 +580,17 @@ warn "\n"; print REPORT "\n"; - ### RUNNING FASTQC - if ($fastqc){ - warn "\n >>> Now running FastQC on the data <<<\n\n"; - sleep (5); - if ($fastqc_args){ - system ("$path_to_fastqc $fastqc_args $output_filename"); - } - else{ - system ("$path_to_fastqc $output_filename"); - } - } - - ### COMPRESSING OUTPUT FILE - unless ($validate){ # not gzipping intermediate files for paired-end files - if ($gzip or $filename =~ /\.gz$/){ - warn "\n >>> GZIP-ing the output file <<<\n\n"; - system ("gzip -f $output_filename"); - $output_filename = $output_filename.'.gz'; + ### RUNNING FASTQC unless we are dealing with paired-end files + unless($validate){ + if ($fastqc){ + warn "\n >>> Now running FastQC on the data <<<\n\n"; + sleep (5); + if ($fastqc_args){ + system ("$path_to_fastqc $fastqc_args $output_dir$output_filename"); + } + else{ + system ("$path_to_fastqc $output_dir$output_filename"); + } } } @@ -585,43 +623,24 @@ sleep (3); if ($fastqc_args){ - system ("$path_to_fastqc $fastqc_args $val_1"); + system ("$path_to_fastqc $fastqc_args $output_dir$val_1"); } else{ - system ("$path_to_fastqc $val_1"); + system ("$path_to_fastqc $output_dir$val_1"); } warn "\n >>> Now running FastQC on the validated data $val_2<<<\n\n"; sleep (3); if ($fastqc_args){ - system ("$path_to_fastqc $fastqc_args $val_2"); + system ("$path_to_fastqc $fastqc_args $output_dir$val_2"); } else{ - system ("$path_to_fastqc $val_2"); + system ("$path_to_fastqc $output_dir$val_2"); } } - if ($gzip or $filename =~ /\.gz$/){ - - # compressing main fastQ output files - warn "Compressing the validated output file $val_1 ...\n"; - system ("gzip -f $val_1"); - - warn "Compressing the validated output file $val_2 ...\n"; - system ("gzip -f $val_2"); - - - if ($retain){ # compressing unpaired reads - warn "Compressing the unpaired read output $un_1 ...\n"; - system ("gzip -f $un_1"); - - warn "Compressing the unpaired read output $un_2 ...\n"; - system ("gzip -f $un_2"); - } - } - warn "Deleting both intermediate output files $file_1 and $file_2\n"; unlink "$output_dir$file_1"; unlink "$output_dir$file_2"; @@ -641,7 +660,7 @@ my $file_1 = shift; my $file_2 = shift; - warn "file_1 $file_1 file_2 $file_2\n\n"; + warn "file_1: $file_1, file_2: $file_2\n\n"; if ($file_1 =~ /\.gz$/){ open (IN1,"zcat $output_dir$file_1 |") or die "Couldn't read from file $file_1: $!\n"; @@ -677,8 +696,32 @@ $out_2 =~ s/trimmed\.fq$/val_2.fq/; } - open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n"; - open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n"; + if ($gzip){ + if ($dont_gzip){ + open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n"; + } + else{ + $out_1 .= '.gz'; + open (R1,"| gzip -c - > ${output_dir}${out_1}") or die "Can't write to $out_1: $!\n"; + } + } + else{ + open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n"; + } + + if ($gzip){ + if ($dont_gzip){ + open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n"; + } + else{ + $out_2 .= '.gz'; + open (R2,"| gzip -c - > ${output_dir}${out_2}") or die "Can't write to $out_2: $!\n"; + } + } + else{ + open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n"; + } + warn "Writing validated paired-end read 1 reads to $out_1\n"; warn "Writing validated paired-end read 2 reads to $out_2\n\n"; @@ -704,8 +747,31 @@ $unpaired_2 =~ s/trimmed\.fq$/unpaired_2.fq/; } - open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n"; - open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n"; + if ($gzip){ + if ($dont_gzip){ + open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n"; + } + else{ + $unpaired_1 .= '.gz'; + open (UNPAIRED1,"| gzip -c - > ${output_dir}${unpaired_1}") or die "Can't write to $unpaired_1: $!\n"; + } + } + else{ + open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n"; + } + + if ($gzip){ + if ($dont_gzip){ + open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n"; + } + else{ + $unpaired_2 .= '.gz'; + open (UNPAIRED2,"| gzip -c - > ${output_dir}${unpaired_2}") or die "Can't write to $unpaired_2: $!\n"; + } + } + else{ + open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n"; + } warn "Writing unpaired read 1 reads to $unpaired_1\n"; warn "Writing unpaired read 2 reads to $unpaired_2\n\n"; @@ -733,7 +799,7 @@ ++$count; - ## small check if the sequence files appear to FastQ files + ## small check if the sequence files appear to be FastQ files if ($count == 1){ # performed just once if ($id_1 !~ /^\@/ or $l3_1 !~ /^\+/){ die "Input file doesn't seem to be in FastQ format at sequence $count\n"; @@ -746,6 +812,14 @@ chomp $seq_1; chomp $seq_2; + if ($clip_r1){ + $seq_1 = substr($seq_1,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence + $qual_1 = substr($qual_1,$clip_r1); + } + if ($clip_r2){ + $seq_2 = substr($seq_2,$clip_r2); # starting after the sequences to be trimmed until the end of the sequence + $qual_2 = substr($qual_2,$clip_r2); + } ### making sure that the reads do have a sensible length if ( (length($seq_1) < $length_cutoff) or (length($seq_2) < $length_cutoff) ){ @@ -795,6 +869,14 @@ warn "Number of unpaired read 2 reads printed: $read_2_printed\n"; } + close R1 or die $!; + close R2 or die $!; + + if ($retain){ + close UNPAIRED1 or die $!; + close UNPAIRED2 or die $!; + } + warn "\n"; if ($retain){ return ($out_1,$out_2,$unpaired_1,$unpaired_2); @@ -830,6 +912,9 @@ my $output_dir; my $no_report_file; my $suppress_warn; + my $dont_gzip; + my $clip_r1; + my $clip_r2; my $command_line = GetOptions ('help|man' => \$help, 'q|quality=i' => \$quality, @@ -856,8 +941,12 @@ 'o|output_dir=s' => \$output_dir, 'no_report_file' => \$no_report_file, 'suppress_warn' => \$suppress_warn, + 'dont_gzip' => \$dont_gzip, + 'clip_R1=i' => \$clip_r1, + 'clip_R2=i' => \$clip_r2, ); - + + ### EXIT ON ERROR if there were errors with any of the supplied options unless ($command_line){ die "Please respecify command line options\n"; @@ -879,7 +968,7 @@ (powered by Cutadapt) version $trimmer_version - Last update: 18 10 2012 + Last update: 10 04 2013 VERSION exit; @@ -1011,7 +1100,25 @@ $output_dir = ''; } - return ($quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file); + ### Trimming at the 5' end + if (defined $clip_r2){ # trimming 5' bases of read 1 + die "Clipping the 5' end of read 2 is only allowed for paired-end files (--paired)\n" unless ($validate); + } + + if (defined $clip_r1){ # trimming 5' bases of read 1 + unless ($clip_r1 > 0 and $clip_r1 < 100){ + die "The 5' clipping value for read 1 should have a sensible value (> 0 and < read length)\n\n"; + } + } + + if (defined $clip_r2){ # trimming 5' bases of read 2 + unless ($clip_r2 > 0 and $clip_r2 < 100){ + die "The 5' clipping value for read 2 should have a sensible value (> 0 and < read length)\n\n"; + } + } + + + return ($quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2); } @@ -1065,8 +1172,11 @@ -e <ERROR RATE> Maximum allowed error rate (no. of errors divided by the length of the matching region) (default: 0.1) ---gzip Compress the output file with gzip. If the input files are gzip-compressed - the output files will be automatically gzip compressed as well. +--gzip Compress the output file with GZIP. If the input files are GZIP-compressed + the output files will automatically be GZIP compressed as well. As of v0.2.8 the + compression will take place on the fly. + +--dont_gzip Output files won't be compressed with GZIP. This option overrides --gzip. --length <INT> Discard reads that became shorter than length INT because of either quality or adapter trimming. A value of '0' effectively disables @@ -1084,6 +1194,17 @@ --suppress_warn If specified any output to STDOUT or STDERR will be suppressed. +--clip_R1 <int> Instructs Trim Galore to remove <int> bp from the 5' end of read 1 (or single-end + reads). This may be useful if the qualities were very poor, or if there is some + sort of unwanted bias at the 5' end. Default: OFF. + +--clip_R2 <int> Instructs Trim Galore to remove <int> bp from the 5' end of read 2 (paired-end reads + only). This may be useful if the qualities were very poor, or if there is some sort + of unwanted bias at the 5' end. For paired-end BS-Seq, it is recommended to remove + the first few bp because the end-repair reaction may introduce a bias towards low + methylation. Please refer to the M-bias plot section in the Bismark User Guide for + some examples. Default: OFF. + RRBS-specific options (MspI digested material): @@ -1152,7 +1273,7 @@ Default: 35 bp. -Last modified on 18 Oct 2012. +Last modified on 15 July 2013. HELP exit;