Mercurial > repos > bgruening > trim_galore

diff trim_galore @ 0:3c1664caa8e3 draft
Uploaded
author: bgruening
date: Sat, 06 Jul 2013 09:52:23 -0400
children: 898db63d2e84
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trim_galore	Sat Jul 06 09:52:23 2013 -0400
@@ -0,0 +1,1159 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+use Getopt::Long;
+use IPC::Open3;
+use File::Spec;
+use File::Basename;
+use Cwd;
+
+## This program is Copyright (C) 2012, Felix Krueger (felix.krueger@babraham.ac.uk)
+
+## This program is free software: you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation, either version 3 of the License, or
+## (at your option) any later version.
+
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+
+## You should have received a copy of the GNU General Public License
+## along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+
+## this script is taking in FastQ sequences and trims them with Cutadapt
+## last modified on 18 10 2012
+
+########################################################################
+
+# change these paths if needed
+
+my $path_to_cutadapt = 'cutadapt';
+my $path_to_fastqc = 'fastqc';
+
+########################################################################
+
+
+my $trimmer_version = '0.2.5';
+my $DOWARN = 1; # print on screen warning and text by default
+BEGIN { $SIG{'__WARN__'} = sub { warn $_[0] if $DOWARN } };
+
+my ($cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file) = process_commandline();
+
+### SETTING DEFAULTS UNLESS THEY WERE SPECIFIED
+unless (defined $cutoff){
+  $cutoff = 20;
+}
+my $phred_score_cutoff = $cutoff; # only relevant for report
+
+unless (defined $adapter){
+  $adapter = 'AGATCGGAAGAGC';
+}
+unless (defined $a2){ # optional adapter for the second read in a pair. Only works for --paired trimming
+  $a2 = '';
+}
+
+unless (defined $stringency){
+  $stringency = 1;
+}
+
+unless (defined $length_cutoff){
+  $length_cutoff = 20;
+}
+
+if ($phred_encoding == 64){
+  $cutoff += 31;
+}
+
+my @filenames = @ARGV;
+
+my $file_1;
+my $file_2;
+
+foreach my $filename (@ARGV){
+  trim ($filename);
+}
+
+
+sub trim{
+  my $filename = shift;
+
+  my $output_filename = (split (/\//,$filename))[-1];
+  # warn "Here is the outputfile name: $output_filename\n";
+
+  my $report = $output_filename;
+  $report =~ s/$/_trimming_report.txt/;
+
+  if ($no_report_file) {
+    $report = File::Spec->devnull;
+    open (REPORT,'>',$report) or die "Failed to write to file: $!\n";
+    # warn "Redirecting report output to /dev/null\n";
+  }
+  else{
+    open (REPORT,'>',$output_dir.$report) or die "Failed to write to file: $!\n";
+    warn "Writing report to '$output_dir$report'\n";
+  }
+
+  warn "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n";
+  print REPORT "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n";
+
+  warn "Quality Phred score cutoff: $phred_score_cutoff\n";
+  print REPORT "Quality Phred score cutoff: $phred_score_cutoff\n";
+
+  warn "Quality encoding type selected: ASCII+$phred_encoding\n";
+  print REPORT "Quality encoding type selected: ASCII+$phred_encoding\n";
+
+  warn "Adapter sequence: '$adapter'\n";
+  print REPORT "Adapter sequence: '$adapter'\n";
+
+  if ($error_rate == 0.1){
+    warn "Maximum trimming error rate: $error_rate (default)\n";
+  }
+  else{
+    warn "Maximum trimming error rate: $error_rate\n";
+  }
+
+  print REPORT "Maximum trimming error rate: $error_rate";
+  if ($error_rate == 0.1){
+    print REPORT " (default)\n";
+  }
+  else{
+    print REPORT "\n";
+  }
+
+  if ($a2){
+    warn "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n";
+    print REPORT "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n";
+  }
+
+  warn "Minimum required adapter overlap (stringency): $stringency bp\n";
+  print REPORT "Minimum required adapter overlap (stringency): $stringency bp\n";
+
+  if ($validate){
+    warn "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n";
+    print REPORT "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n";
+  }
+  else{
+    warn "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n";
+    print REPORT "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n";
+  }
+
+  if ($validate){ # only for paired-end files
+
+    if ($retain){ # keeping single-end reads if only one end is long enough
+
+      if ($length_read_1 == 35){
+	warn "Length cut-off for read 1: $length_read_1 bp (default)\n";
+	print REPORT "Length cut-off for read 1: $length_read_1 bp (default)\n";
+      }
+      else{
+	warn "Length cut-off for read 1: $length_read_1 bp\n";
+	print REPORT "Length cut-off for read 1: $length_read_1 bp\n";
+      }
+
+      if ($length_read_2 == 35){
+	warn "Length cut-off for read 2: $length_read_2 b (default)\n";
+	print REPORT "Length cut-off for read 2: $length_read_2 bp (default)\n";
+      }
+      else{
+	warn "Length cut-off for read 2: $length_read_2 bp\n";
+	print REPORT "Length cut-off for read 2: $length_read_2 bp\n";
+      }
+    }
+  }
+
+  if ($rrbs){
+    warn "File was specified to be an MspI-digested RRBS sample. Sequences with adapter contamination will be trimmed a further 2 bp to remove potential methylation-biased bases from the end-repair reaction\n";
+    print REPORT "File was specified to be an MspI-digested RRBS sample. Sequences with adapter contamination will be trimmed a further 2 bp to remove potential methylation-biased bases from the end-repair reaction\n";
+  }
+
+  if ($non_directional){
+    warn "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n";
+    print REPORT "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n";
+  }
+
+  if ($trim){
+    warn "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n";
+    print REPORT "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n";
+  }
+
+  if ($fastqc){
+    warn "Running FastQC on the data once trimming has completed\n";
+    print REPORT "Running FastQC on the data once trimming has completed\n";
+
+    if ($fastqc_args){
+      warn "Running FastQC with the following extra arguments: '$fastqc_args'\n";
+      print REPORT  "Running FastQC with the following extra arguments: $fastqc_args\n";
+    }
+  }
+
+  if ($keep and $rrbs){
+    warn "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n";
+    print REPORT "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n";
+  }
+
+  if ($gzip or $filename =~ /\.gz$/){
+    warn "Output file will be GZIP compressed\n";
+    print REPORT "Output file will be GZIP compressed\n";
+  }
+
+  warn "\n";
+  print REPORT "\n";
+  sleep (3);
+
+  my $temp;
+
+  ### Proceeding differently for RRBS and other type of libraries
+  if ($rrbs){
+
+    ### Skipping quality filtering for RRBS libraries if a quality cutoff of 0 was specified
+    if ($cutoff == 0){
+      warn "Quality cutoff selected was 0    -    Skipping quality trimming altogether\n\n";
+      sleep (3);
+    }
+    else{
+
+      $temp = $filename;
+      $temp =~ s/$/_qual_trimmed.fastq/;
+      open (TEMP,'>',$output_dir.$temp) or die "Can't write to $temp: $!";
+
+      warn "  >>> Now performing adaptive quality trimming with a Phred-score cutoff of: $cutoff <<<\n\n";
+      sleep (3);
+
+      open (QUAL,"$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -a X $filename |") or die "Can't open pipe to Cutadapt: $!";
+
+      my $qual_count = 0;
+
+      while (1){
+	my $l1 = <QUAL>;
+	my $seq = <QUAL>;
+	my $l3 = <QUAL>;
+	my $qual = <QUAL>;
+	last unless (defined $qual);
+	
+	$qual_count++;
+	if ($qual_count%10000000 == 0){
+	  warn "$qual_count sequences processed\n";
+	}
+	print TEMP "$l1$seq$l3$qual";
+      }
+
+      warn "\n  >>> Quality trimming completed <<<\n$qual_count sequences processed in total\n\n";
+      close QUAL or die "Unable to close filehandle: $!\n";
+      sleep (3);
+
+    }
+  }
+
+
+  if ($output_filename =~ /\.fastq$/){
+    $output_filename =~ s/\.fastq$/_trimmed.fq/;
+  }
+  elsif ($output_filename =~ /\.fastq\.gz$/){
+    $output_filename =~ s/\.fastq\.gz$/_trimmed.fq/;
+  }
+  elsif ($output_filename =~ /\.fq$/){
+    $output_filename =~ s/\.fq$/_trimmed.fq/;
+  }
+  elsif ($output_filename =~ /\.fq\.gz$/){
+    $output_filename =~ s/\.fq\.gz$/_trimmed.fq/;
+  }
+  else{
+    $output_filename =~ s/$/_trimmed.fq/;
+  }
+
+  warn "Writing final adapter and quality trimmed output to $output_filename\n\n";
+  open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n";
+  sleep (2);
+
+  my $count = 0;
+  my $too_short = 0;
+  my $quality_trimmed = 0;
+  my $rrbs_trimmed = 0;
+  my $rrbs_trimmed_start = 0;
+  my $CAA = 0;
+  my $CGA = 0;
+
+  if ($rrbs and $cutoff != 0){
+
+    ### optionally using 2 different adapters for read 1 and read 2
+    if ($validate and $a2){
+      ### Figure out whether current file counts as read 1 or read 2 of paired-end files
+      if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
+	warn "\n  >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n";
+	sleep (3);
+	open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
+      }
+      else{                            # this is read 2 of a pair
+	warn "\n  >>> Now performing adapter trimming for the adapter sequence: '$a2' from file $temp <<< \n";
+	sleep (3);
+    	open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $a2 $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
+      }
+    }
+    ### Using the same adapter for both read 1 and read 2
+    else{
+      warn "\n  >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n";
+      sleep (3);
+      open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
+    }
+
+    close WRITER or die $!; # not needed
+
+    open (QUAL,"$output_dir$temp") or die $!; # quality trimmed file
+
+    if ($filename =~ /\.gz$/){
+      open (IN,"zcat $filename |") or die $!; # original, untrimmed file
+    }
+    else{
+      open (IN,$filename) or die $!; # original, untrimmed file
+    }
+
+    while (1){
+
+      # we can process the output from Cutadapt and the original input 1 by 1 to decide if the adapter has been removed or not
+      my $l1 = <TRIM>;
+      my $seq = <TRIM>; # adapter trimmed sequence
+      my $l3 = <TRIM>;
+      my $qual = <TRIM>;
+
+      $_ = <IN>;   # irrelevant
+      my $original_seq = <IN>;
+      $_ = <IN>;   # irrelevant
+      $_ = <IN>;   # irrelevant
+
+      $_ = <QUAL>; # irrelevant
+      my $qual_trimmed_seq = <QUAL>;
+      $_ = <QUAL>; # irrelevant
+      my $qual_trimmed_qual = <QUAL>;
+
+      last unless (defined $qual and defined $qual_trimmed_qual); # could be empty strings
+
+      $count++;
+      if ($count%10000000 == 0){
+	warn "$count sequences processed\n";
+      }
+
+      chomp $seq;
+      chomp $qual;
+      chomp $qual_trimmed_seq;
+      chomp $original_seq;
+
+      my $quality_trimmed_seq_length = length $qual_trimmed_seq;
+
+      if (length $original_seq > length $qual_trimmed_seq){
+	++$quality_trimmed;
+      }
+
+      my $nd = 0;
+
+      ### NON-DIRECTIONAL RRBS
+      if ($non_directional){
+	if (length$seq > 2){
+	  if ($seq =~ /^CAA/){
+	    ++$CAA;
+	    $seq = substr ($seq,2,length($seq)-2);
+	    $qual = substr ($qual,2,length($qual)-2);
+	    ++$rrbs_trimmed_start;
+	    $nd = 1;
+	  }
+	  elsif ($seq =~ /^CGA/){
+	    $seq = substr ($seq,2,length($seq)-2);
+	    $qual = substr ($qual,2,length($qual)-2);
+	    ++$CGA;
+	    ++$rrbs_trimmed_start;
+	    $nd = 1;
+	  }
+	}
+      }
+
+      ### directional read
+      unless ($nd == 1){
+	if (length $seq >= 2 and length$seq < $quality_trimmed_seq_length){
+	  $seq = substr ($seq,0,length($seq)-2);
+	  $qual = substr ($qual,0,length($qual)-2);
+	  ++$rrbs_trimmed;
+	}
+      }
+
+      ### Shortening all sequences by 1 bp on the 3' end
+      if ($trim){
+	$seq = substr($seq,0,length($seq)-1);
+	$qual = substr($qual,0,length($qual)-1);
+      }
+
+      ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE
+      if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later)
+	print OUT "$l1$seq\n$l3$qual\n";
+      }
+      else{ # single end
+	if (length $seq < $length_cutoff){
+	  ++$too_short;
+	  next;
+	}
+	else{
+	  print OUT "$l1$seq\n$l3$qual\n";
+	}
+      }
+    }
+
+    print REPORT "\n";
+    while (<ERROR>){
+      warn $_;
+      print REPORT $_;
+    }
+
+    close IN or die "Unable to close IN filehandle: $!";
+    close QUAL or die "Unable to close QUAL filehandle: $!";
+    close TRIM or die "Unable to close TRIM filehandle: $!";
+    close OUT or die  "Unable to close OUT filehandle: $!";
+  }
+  else{
+
+    ### optionally using 2 different adapters for read 1 and read 2
+    if ($validate and $a2){
+      ### Figure out whether current file counts as read 1 or read 2 of paired-end files
+      if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
+	warn "\n  >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n";
+	sleep (3);
+	open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!";
+      }
+      else{                            # this is read 2 of a pair
+	warn "\n  >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$a2' from file $filename <<< \n";
+	sleep (3);
+	open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $a2 $filename") or die "Failed to launch Cutadapt: $!";
+      }
+    }
+    ### Using the same adapter for both read 1 and read 2
+    else{
+      warn "\n  >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n";
+      sleep (3);
+      open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!";
+    }
+
+    close WRITER or die $!; # not needed
+
+    while (1){
+
+      my $l1 = <TRIM>;
+      my $seq = <TRIM>; # quality and/or adapter trimmed sequence
+      my $l3 = <TRIM>;
+      my $qual = <TRIM>;
+      # print "$l1$seq\n$l3$qual\n";
+      last unless (defined $qual); # could be an empty string
+
+      $count++;
+      if ($count%10000000 == 0){
+	warn "$count sequences processed\n";
+      }
+
+      chomp $seq;
+      chomp $qual;
+
+      ### Shortening all sequences by 1 bp on the 3' end
+      if ($trim){
+	$seq = substr($seq,0,length($seq)-1);
+	$qual = substr($qual,0,length($qual)-1);
+      }
+
+      ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE
+      if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later)
+	print OUT "$l1$seq\n$l3$qual\n";
+      }
+      else{ # single end
+	if (length $seq < $length_cutoff){
+	  ++$too_short;
+	  next;
+	}
+	else{
+	  print OUT "$l1$seq\n$l3$qual\n";
+	}
+      }
+    }
+
+    print REPORT "\n";
+    while (<ERROR>){
+      warn $_;
+      print REPORT $_;
+    }
+
+    close TRIM or die "Unable to close TRIM filehandle: $!\n";
+    close ERROR or die "Unable to close ERROR filehandle: $!\n";
+    close OUT or die  "Unable to close OUT filehandle: $!\n";
+
+  }
+
+  if ($rrbs){
+    unless ($keep){ # keeping the quality trimmed intermediate file for RRBS files
+
+      # deleting temporary quality trimmed file
+      my $deleted = unlink "$output_dir$temp";
+
+      if ($deleted){
+	warn "Successfully deleted temporary file $temp\n\n";
+      }
+      else{
+	warn "Could not delete temporary file $temp";
+      }
+    }
+  }
+
+  warn "\nRUN STATISTICS FOR INPUT FILE: $filename\n";
+  print REPORT "\nRUN STATISTICS FOR INPUT FILE: $filename\n";
+
+  warn "="x 45,"\n";
+  print REPORT "="x 45,"\n";
+
+  warn "$count sequences processed in total\n";
+  print REPORT "$count sequences processed in total\n";
+
+  ###  only reporting this separately if quality and adapter trimming were performed separately
+  if ($rrbs){
+    my $percentage_shortened = sprintf ("%.1f",$quality_trimmed/$count*100);
+    warn "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
+    print REPORT "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
+  }
+
+  my $percentage_too_short = sprintf ("%.1f",$too_short/$count*100);
+  warn "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n";
+  print REPORT "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n";
+
+  if ($rrbs){
+    my $percentage_rrbs_trimmed = sprintf ("%.1f",$rrbs_trimmed/$count*100);
+    warn "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n";
+    print REPORT "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n";
+  }
+
+  if ($non_directional){
+    my $percentage_rrbs_trimmed_at_start = sprintf ("%.1f",$rrbs_trimmed_start/$count*100);
+    warn "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n";
+    print REPORT "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n";
+  }
+
+  warn "\n";
+  print REPORT "\n";
+
+  ### RUNNING FASTQC
+  if ($fastqc){
+    warn "\n  >>> Now running FastQC on the data <<<\n\n";
+    sleep (5);
+    if ($fastqc_args){
+      system ("$path_to_fastqc $fastqc_args $output_filename");
+    }
+    else{
+      system ("$path_to_fastqc $output_filename");
+    }
+  }
+
+  ### COMPRESSING OUTPUT FILE
+  unless ($validate){ # not gzipping intermediate files for paired-end files
+    if ($gzip or $filename =~ /\.gz$/){
+      warn "\n  >>> GZIP-ing the output file <<<\n\n";
+      system ("gzip -f $output_filename");
+      $output_filename = $output_filename.'.gz';
+    }
+  }
+
+  ### VALIDATE PAIRED-END FILES
+  if ($validate){
+
+    ### Figure out whether current file counts as read 1 or read 2 of paired-end files
+
+    if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
+      $file_1 = $output_filename;
+      shift @filenames;
+      # warn "This is read 1: $file_1\n\n";
+    }
+    else{                            # this is read 2 of a pair
+      $file_2 = $output_filename;
+      shift @filenames;
+      # warn "This is read 2: $file_2\n\n";
+    }
+
+    if ($file_1 and $file_2){
+      warn "Validate paired-end files $file_1 and $file_2\n";
+      sleep (1);
+
+      my ($val_1,$val_2,$un_1,$un_2) = validate_paired_end_files($file_1,$file_2);
+
+      ### RUNNING FASTQC
+      if ($fastqc){
+
+	warn "\n  >>> Now running FastQC on the validated data $val_1<<<\n\n";
+	sleep (3);
+
+	if ($fastqc_args){
+	  system ("$path_to_fastqc $fastqc_args $val_1");
+	}
+	else{
+	  system ("$path_to_fastqc $val_1");
+	}
+
+	warn "\n  >>> Now running FastQC on the validated data $val_2<<<\n\n";
+	sleep (3);
+
+	if ($fastqc_args){
+	  system ("$path_to_fastqc $fastqc_args $val_2");
+	}
+	else{
+	  system ("$path_to_fastqc $val_2");
+	}
+	
+      }
+
+      if ($gzip or $filename =~ /\.gz$/){
+		
+	# compressing main fastQ output files
+	warn "Compressing the validated output file $val_1 ...\n";
+	system ("gzip -f $val_1");
+	
+	warn "Compressing the validated output file $val_2 ...\n";
+	system ("gzip -f $val_2");
+
+
+	if ($retain){ # compressing unpaired reads
+	  warn "Compressing the unpaired read output $un_1 ...\n";
+	  system ("gzip -f $un_1");
+	
+	  warn "Compressing the unpaired read output $un_2 ...\n";
+	  system ("gzip -f $un_2");
+	}
+      }
+
+      warn "Deleting both intermediate output files $file_1 and $file_2\n";
+      unlink "$output_dir$file_1";
+      unlink "$output_dir$file_2";
+
+      warn "\n",'='x100,"\n\n";
+      sleep (3);
+
+      $file_1 = undef; # setting file_1 and file_2 to undef once validation is completed
+      $file_2 = undef;
+    }
+  }
+
+}
+
+sub validate_paired_end_files{
+
+  my $file_1 = shift;
+  my $file_2 = shift;
+
+  warn "file_1 $file_1 file_2 $file_2\n\n";
+
+  if ($file_1 =~ /\.gz$/){
+    open (IN1,"zcat $output_dir$file_1 |") or die "Couldn't read from file $file_1: $!\n";
+  }
+  else{
+    open (IN1, "$output_dir$file_1") or die "Couldn't read from file $file_1: $!\n";
+  }
+
+  if ($file_2 =~ /\.gz$/){
+    open (IN2,"zcat $output_dir$file_2 |") or die "Couldn't read from file $file_2: $!\n";
+  }
+  else{
+    open (IN2, "$output_dir$file_2") or die "Couldn't read from file $file_2: $!\n";
+  }
+
+  warn "\n>>>>> Now validing the length of the 2 paired-end infiles: $file_1 and $file_2 <<<<<\n";
+  sleep (3);
+
+  my $out_1 = $file_1;
+  my $out_2 = $file_2;
+
+  if ($out_1 =~ /gz$/){
+    $out_1 =~ s/trimmed\.fq\.gz$/val_1.fq/;
+  }
+  else{
+    $out_1 =~ s/trimmed\.fq$/val_1.fq/;
+  }
+
+  if ($out_2 =~ /gz$/){
+    $out_2 =~ s/trimmed\.fq\.gz$/val_2.fq/;
+  }
+  else{
+    $out_2 =~ s/trimmed\.fq$/val_2.fq/;
+  }
+
+  open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n";
+  open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n";
+  warn "Writing validated paired-end read 1 reads to $out_1\n";
+  warn "Writing validated paired-end read 2 reads to $out_2\n\n";
+
+  my $unpaired_1;
+  my $unpaired_2;
+
+  if ($retain){
+
+    $unpaired_1 = $file_1;
+    $unpaired_2 = $file_2;
+
+    if ($unpaired_1 =~ /gz$/){
+      $unpaired_1 =~ s/trimmed\.fq\.gz$/unpaired_1.fq/;
+    }
+    else{
+      $unpaired_1 =~ s/trimmed\.fq$/unpaired_1.fq/;
+    }
+
+    if ($unpaired_2 =~ /gz$/){
+      $unpaired_2 =~ s/trimmed\.fq\.gz$/unpaired_2.fq/;
+    }
+    else{
+      $unpaired_2 =~ s/trimmed\.fq$/unpaired_2.fq/;
+    }
+
+    open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n";
+    open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n";
+
+    warn "Writing unpaired read 1 reads to $unpaired_1\n";
+    warn "Writing unpaired read 2 reads to $unpaired_2\n\n";
+  }
+
+  my $sequence_pairs_removed = 0;
+  my $read_1_printed = 0;
+  my $read_2_printed = 0;
+
+  my $count = 0;
+
+  while (1){
+    my $id_1   = <IN1>;
+    my $seq_1  = <IN1>;
+    my $l3_1   = <IN1>;
+    my $qual_1 = <IN1>;
+    last unless ($id_1 and $seq_1 and $l3_1 and $qual_1);
+
+    my $id_2   = <IN2>;
+    my $seq_2  = <IN2>;
+    my $l3_2   = <IN2>;
+    my $qual_2 = <IN2>;
+    last unless ($id_2 and $seq_2 and $l3_2 and $qual_2);
+
+    ++$count;
+
+
+    ## small check if the sequence files appear to FastQ files
+    if ($count == 1){ # performed just once
+      if ($id_1 !~ /^\@/ or $l3_1 !~ /^\+/){
+	die "Input file doesn't seem to be in FastQ format at sequence $count\n";
+      }
+      if ($id_2 !~ /^\@/ or $l3_2 !~ /^\+/){
+	die "Input file doesn't seem to be in FastQ format at sequence $count\n";
+      }
+    }
+
+    chomp $seq_1;
+    chomp $seq_2;
+
+
+    ### making sure that the reads do have a sensible length
+    if ( (length($seq_1) < $length_cutoff) or (length($seq_2) < $length_cutoff) ){
+      ++$sequence_pairs_removed;
+      if ($retain){ # writing out single-end reads if they are longer than the cutoff
+	
+	if ( length($seq_1) >= $length_read_1){ # read 1 is long enough
+	  print UNPAIRED1 $id_1;
+	  print UNPAIRED1 "$seq_1\n";
+	  print UNPAIRED1 $l3_1;
+	  print UNPAIRED1 $qual_1;
+	  ++$read_1_printed;
+	}
+	
+	if ( length($seq_2) >= $length_read_2){ # read 2 is long enough
+	  print UNPAIRED2 $id_2;
+	  print UNPAIRED2 "$seq_2\n";
+	  print UNPAIRED2 $l3_2;
+	  print UNPAIRED2 $qual_2;
+	  ++$read_2_printed;
+	}
+	
+      }
+    }
+    else{
+      print R1 $id_1;
+      print R1 "$seq_1\n";
+      print R1 $l3_1;
+      print R1 $qual_1;
+
+      print R2 $id_2;
+      print R2 "$seq_2\n";
+      print R2 $l3_2;
+      print R2 $qual_2;
+    }
+
+  }
+  my $percentage = sprintf("%.2f",$sequence_pairs_removed/$count*100);
+  warn "Total number of sequences analysed: $count\n\n";
+  warn "Number of sequence pairs removed: $sequence_pairs_removed ($percentage%)\n";
+
+  print REPORT "Total number of sequences analysed for the sequence pair length validation: $count\n\n";
+  print REPORT "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n";
+
+  if ($keep){
+    warn "Number of unpaired read 1 reads printed: $read_1_printed\n";
+    warn "Number of unpaired read 2 reads printed: $read_2_printed\n";
+  }
+
+  warn "\n";
+  if ($retain){
+    return ($out_1,$out_2,$unpaired_1,$unpaired_2);
+  }
+  else{
+    return ($out_1,$out_2);
+  }
+}
+
+sub process_commandline{
+  my $help;
+  my $quality;
+  my $adapter;
+  my $adapter2;
+  my $stringency;
+  my $report;
+  my $version;
+  my $rrbs;
+  my $length_cutoff;
+  my $keep;
+  my $fastqc;
+  my $non_directional;
+  my $phred33;
+  my $phred64;
+  my $fastqc_args;
+  my $trim;
+  my $gzip;
+  my $validate;
+  my $retain;
+  my $length_read_1;
+  my $length_read_2;
+  my $error_rate;
+  my $output_dir;
+  my $no_report_file;
+  my $suppress_warn;
+
+  my $command_line = GetOptions ('help|man' => \$help,
+				 'q|quality=i' => \$quality,
+				 'a|adapter=s' => \$adapter,
+				 'a2|adapter2=s' => \$adapter2,
+				 'report' => \$report,
+				 'version' => \$version,
+				 'stringency=i' => \$stringency,
+				 'fastqc' => \$fastqc,
+				 'RRBS' => \$rrbs,	
+				 'keep' => \$keep,
+				 'length=i' => \$length_cutoff,
+				 'non_directional' => \$non_directional,
+				 'phred33' => \$phred33,
+				 'phred64' => \$phred64,
+				 'fastqc_args=s' => \$fastqc_args,
+				 'trim1' => \$trim,
+				 'gzip' => \$gzip,
+				 'paired_end' => \$validate,
+				 'retain_unpaired' => \$retain,
+				 'length_1|r1=i' => \$length_read_1,
+				 'length_2|r2=i' => \$length_read_2,
+				 'e|error_rate=s' => \$error_rate,
+				 'o|output_dir=s' => \$output_dir,
+				 'no_report_file' => \$no_report_file,
+				 'suppress_warn' => \$suppress_warn,
+				);
+  
+  ### EXIT ON ERROR if there were errors with any of the supplied options
+  unless ($command_line){
+    die "Please respecify command line options\n";
+  }
+
+  ### HELPFILE
+  if ($help){
+    print_helpfile();
+    exit;
+  }
+
+
+
+
+  if ($version){
+    print << "VERSION";
+
+                          Quality-/Adapter-/RRBS-Trimming
+                               (powered by Cutadapt)
+                                  version $trimmer_version
+
+                             Last update: 18 10 2012
+
+VERSION
+    exit;
+  }
+
+  ### RRBS
+  unless ($rrbs){
+    $rrbs = 0;
+  }
+
+  ### SUPRESS WARNINGS
+  if (defined $suppress_warn){
+    $DOWARN = 0;
+  }
+
+  ### QUALITY SCORES
+  my $phred_encoding;
+  if ($phred33){
+    if ($phred64){
+      die "Please specify only a single quality encoding type (--phred33 or --phred64)\n\n";
+    }
+    $phred_encoding = 33;
+  }
+  elsif ($phred64){
+    $phred_encoding = 64;
+  }
+  unless ($phred33 or $phred64){
+    warn "No quality encoding type selected. Assuming that the data provided uses Sanger encoded Phred scores (default)\n\n";
+    $phred_encoding = 33;
+    sleep (3);
+  }
+
+  ### NON-DIRECTIONAL RRBS
+  if ($non_directional){
+    unless ($rrbs){
+      die "Option '--non_directional' requires '--rrbs' to be specified as well. Please re-specify!\n";
+    }
+  }
+  else{
+    $non_directional = 0;
+  }
+
+  if ($fastqc_args){
+    $fastqc = 1; # specifying fastqc extra arguments automatically means that FastQC will be executed
+  }
+  else{
+    $fastqc_args = 0;
+  }
+
+  ### CUSTOM ERROR RATE
+  if (defined $error_rate){
+    # make sure that the error rate is between 0 and 1
+    unless ($error_rate >= 0 and $error_rate <= 1){
+      die "Please specify an error rate between 0 and 1 (the default is 0.1)\n";
+    }
+  }
+  else{
+    $error_rate = 0.1; # (default)
+  }
+
+  if (defined $adapter){
+    unless ($adapter =~ /^[ACTGNactgn]+$/){
+      die "Adapter sequence must contain DNA characters only (A,C,T,G or N)!\n";
+    }
+    $adapter = uc$adapter;
+  }
+
+  if (defined $adapter2){
+    unless ($validate){
+      die "An optional adapter for read 2 of paired-end files requires '--paired' to be specified as well! Please re-specify\n";
+    }
+    unless ($adapter2 =~ /^[ACTGNactgn]+$/){
+      die "Optional adapter 2 sequence must contain DNA characters only (A,C,T,G or N)!\n";
+    }
+    $adapter2 = uc$adapter2;
+  }
+
+  ### files are supposed to be paired-end files
+  if ($validate){
+
+    # making sure that an even number of reads has been supplied
+    unless ((scalar@ARGV)%2 == 0){
+      die "Please provide an even number of input files for paired-end FastQ trimming! Aborting ...\n";
+    }
+
+    ## CUTOFF FOR VALIDATED READ-PAIRS
+
+    if (defined $length_read_1 or defined $length_read_2){
+      unless ($retain){
+	die "Please specify --keep_unpaired to alter the unpaired single-end read length cut off(s)\n\n";
+      }
+
+      if (defined $length_read_1){
+	unless ($length_read_1 >= 15 and $length_read_1 <= 100){
+	  die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n";
+	}
+	unless ($length_read_1 > $length_cutoff){
+	  die "The single-end unpaired read length needs to be longer than the paired-end cut-off value\n\n";
+	}
+      }
+
+      if (defined $length_read_2){
+	unless ($length_read_2 >= 15 and $length_read_2 <= 100){
+	  die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n";
+	}
+	unless ($length_read_2 > $length_cutoff){
+	  die "The single-end unpaired read length needs to be longer than the paired-end cut-off value\n\n";
+	}
+      }
+    }
+
+    if ($retain){
+      $length_read_1 = 35 unless (defined $length_read_1);
+      $length_read_2 = 35 unless (defined $length_read_2);
+    }
+  }
+
+  unless ($no_report_file){
+    $no_report_file = 0;
+  }
+
+  ### OUTPUT DIR PATH
+  if ($output_dir){
+    unless ($output_dir =~ /\/$/){
+      $output_dir =~ s/$/\//;
+    }
+  }
+  else{
+    $output_dir = '';
+  }
+
+  return ($quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file);
+}
+
+
+
+
+sub print_helpfile{
+  print << "HELP";
+
+ USAGE:
+
+trim_galore [options] <filename(s)>
+
+
+-h/--help               Print this help message and exits.
+
+-v/--version            Print the version information and exits.
+
+-q/--quality <INT>      Trim low-quality ends from reads in addition to adapter removal. For
+                        RRBS samples, quality trimming will be performed first, and adapter
+                        trimming is carried in a second round. Other files are quality and adapter
+                        trimmed in a single pass. The algorithm is the same as the one used by BWA
+                        (Subtract INT from all qualities; compute partial sums from all indices
+                        to the end of the sequence; cut sequence at the index at which the sum is
+                        minimal). Default Phred score: 20.
+
+--phred33               Instructs Cutadapt to use ASCII+33 quality scores as Phred scores
+                        (Sanger/Illumina 1.9+ encoding) for quality trimming. Default: ON.
+
+--phred64               Instructs Cutadapt to use ASCII+64 quality scores as Phred scores
+                        (Illumina 1.5 encoding) for quality trimming.
+
+--fastqc                Run FastQC in the default mode on the FastQ file once trimming is complete.
+
+--fastqc_args "<ARGS>"  Passes extra arguments to FastQC. If more than one argument is to be passed
+                        to FastQC they must be in the form "arg1 arg2 etc.". An example would be:
+                        --fastqc_args "--nogroup --outdir /home/". Passing extra arguments will
+                        automatically invoke FastQC, so --fastqc does not have to be specified
+                        separately.
+
+-a/--adapter <STRING>   Adapter sequence to be trimmed. If not specified explicitely, the first 13
+                        bp of the Illumina adapter 'AGATCGGAAGAGC' will be used by default.
+
+-a2/--adapter2 <STRING> Optional adapter sequence to be trimmed off read 2 of paired-end files. This
+                        option requires '--paired' to be specified as well.
+
+
+-s/--stringency <INT>   Overlap with adapter sequence required to trim a sequence. Defaults to a
+                        very stringent setting of '1', i.e. even a single bp of overlapping sequence
+                        will be trimmed of the 3' end of any read.
+
+-e <ERROR RATE>         Maximum allowed error rate (no. of errors divided by the length of the matching
+                        region) (default: 0.1)
+
+--gzip                  Compress the output file with gzip. If the input files are gzip-compressed
+                        the output files will be automatically gzip compressed as well.
+
+--length <INT>          Discard reads that became shorter than length INT because of either
+                        quality or adapter trimming. A value of '0' effectively disables
+                        this behaviour. Default: 20 bp.
+
+                        For paired-end files, both reads of a read-pair need to be longer than
+                        <INT> bp to be printed out to validated paired-end files (see option --paired).
+                        If only one read became too short there is the possibility of keeping such
+                        unpaired single-end reads (see --retain_unpaired). Default pair-cutoff: 20 bp.
+
+-o/--output_dir <DIR>   If specified all output will be written to this directory instead of the current
+                        directory.
+
+--no_report_file        If specified no report file will be generated.
+
+--suppress_warn         If specified any output to STDOUT or STDERR will be suppressed.
+
+
+
+RRBS-specific options (MspI digested material):
+
+--rrbs                  Specifies that the input file was an MspI digested RRBS sample (recognition
+                        site: CCGG). Sequences which were adapter-trimmed will have a further 2 bp
+                        removed from their 3' end. This is to avoid that the filled-in C close to the
+                        second MspI site in a sequence is used for methylation calls. Sequences which
+                        were merely trimmed because of poor quality will not be shortened further.
+
+--non_directional       Selecting this option for non-directional RRBS libraries will screen
+                        quality-trimmed sequences for 'CAA' or 'CGA' at the start of the read
+                        and, if found, removes the first two basepairs. Like with the option
+                        '--rrbs' this avoids using cytosine positions that were filled-in
+                        during the end-repair step. '--non_directional' requires '--rrbs' to
+                        be specified as well.
+
+--keep                  Keep the quality trimmed intermediate file. Default: off, which means
+                        the temporary file is being deleted after adapter trimming. Only has
+                        an effect for RRBS samples since other FastQ files are not trimmed
+                        for poor qualities separately.
+
+
+Note for RRBS using MseI:
+
+If your DNA material was digested with MseI (recognition motif: TTAA) instead of MspI it is NOT necessary
+to specify --rrbs or --non_directional since virtually all reads should start with the sequence
+'TAA', and this holds true for both directional and non-directional libraries. As the end-repair of 'TAA'
+restricted sites does not involve any cytosines it does not need to be treated especially. Instead, simply
+run Trim Galore! in the standard (i.e. non-RRBS) mode.
+
+
+Paired-end specific options:
+
+--paired                This option performs length trimming of quality/adapter/RRBS trimmed reads for
+                        paired-end files. To pass the validation test, both sequences of a sequence pair
+                        are required to have a certain minimum length which is governed by the option
+                        --length (see above). If only one read passes this length threshold the
+                        other read can be rescued (see option --retain_unpaired). Using this option lets
+                        you discard too short read pairs without disturbing the sequence-by-sequence order
+                        of FastQ files which is required by many aligners.
+
+                        Trim Galore! expects paired-end files to be supplied in a pairwise fashion, e.g.
+                        file1_1.fq file1_2.fq SRR2_1.fq.gz SRR2_2.fq.gz ... .
+
+-t/--trim1              Trims 1 bp off every read from its 3' end. This may be needed for FastQ files that
+                        are to be aligned as paired-end data with Bowtie. This is because Bowtie (1) regards
+                        alignments like this:
+
+                          R1 --------------------------->     or this:    ----------------------->  R1
+                          R2 <---------------------------                       <-----------------  R2
+
+                        as invalid (whenever a start/end coordinate is contained within the other read).
+
+--retain_unpaired       If only one of the two paired-end reads became too short, the longer
+                        read will be written to either '.unpaired_1.fq' or '.unpaired_2.fq'
+                        output files. The length cutoff for unpaired single-end reads is
+                        governed by the parameters -r1/--length_1 and -r2/--length_2. Default: OFF.
+
+-r1/--length_1 <INT>    Unpaired single-end read length cutoff needed for read 1 to be written to
+                        '.unpaired_1.fq' output file. These reads may be mapped in single-end mode.
+                        Default: 35 bp.
+
+-r2/--length_2 <INT>    Unpaired single-end read length cutoff needed for read 2 to be written to
+                        '.unpaired_2.fq' output file. These reads may be mapped in single-end mode.
+                        Default: 35 bp.
+
+
+Last modified on 18 Oct 2012.
+
+HELP
+  exit;
+}
author	bgruening
date	Sat, 06 Jul 2013 09:52:23 -0400
parents
children	898db63d2e84