# HG changeset patch # User bgruening # Date 1493058607 14400 # Node ID 80cd83b11214401e0bd26e774dab095020dbe1fe # Parent b4e39d993fc8ce7506fc01d828e1d430c03cf070 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/trim_galore commit 78bee2b2efd36fe9399ce574159fc007cb6bdfbf diff -r b4e39d993fc8 -r 80cd83b11214 test-data/paired_collection_example_results3.txt --- a/test-data/paired_collection_example_results3.txt Thu Apr 20 09:14:30 2017 -0400 +++ b/test-data/paired_collection_example_results3.txt Mon Apr 24 14:30:07 2017 -0400 @@ -3,8 +3,8 @@ ========================== Input filename: input_1.fastq Trimming mode: paired-end -Trim Galore version: 0.4.0 -Cutadapt version: 1.8 +Trim Galore version: 0.4.3 +Cutadapt version: 1.13 Quality Phred score cutoff: 20 Quality encoding type selected: ASCII+33 Adapter sequence: 'CTGTCTCTTATA' (Nextera Transposase sequence; auto-detected) @@ -15,10 +15,10 @@ Length cut-off for read 2: 35 bp (default) -This is cutadapt 1.8 with Python 3.5.3 +This is cutadapt 1.13 with Python 3.5.3 Command line parameters: -f fastq -e 0.1 -q 20 -O 1 -a CTGTCTCTTATA input_1.fastq -Trimming 1 adapter(s) with at most 10.0% errors in single-end mode ... -Finished in 0.10 s (1010 us/read; 0.06 M reads/minute). +Trimming 1 adapter with at most 10.0% errors in single-end mode ... +Finished in 0.01 s (101 us/read; 0.59 M reads/minute). === Summary === @@ -85,8 +85,8 @@ ========================== Input filename: input_2.fastq Trimming mode: paired-end -Trim Galore version: 0.4.0 -Cutadapt version: 1.8 +Trim Galore version: 0.4.3 +Cutadapt version: 1.13 Quality Phred score cutoff: 20 Quality encoding type selected: ASCII+33 Adapter sequence: 'CTGTCTCTTATA' (Nextera Transposase sequence; auto-detected) @@ -97,10 +97,10 @@ Length cut-off for read 2: 35 bp (default) -This is cutadapt 1.8 with Python 3.5.3 +This is cutadapt 1.13 with Python 3.5.3 Command line parameters: -f fastq -e 0.1 -q 20 -O 1 -a CTGTCTCTTATA input_2.fastq -Trimming 1 adapter(s) with at most 10.0% errors in single-end mode ... -Finished in 0.10 s (1000 us/read; 0.06 M reads/minute). +Trimming 1 adapter with at most 10.0% errors in single-end mode ... +Finished in 0.01 s (100 us/read; 0.60 M reads/minute). === Summary === diff -r b4e39d993fc8 -r 80cd83b11214 test-data/paired_collection_example_results3gz.txt --- a/test-data/paired_collection_example_results3gz.txt Thu Apr 20 09:14:30 2017 -0400 +++ b/test-data/paired_collection_example_results3gz.txt Mon Apr 24 14:30:07 2017 -0400 @@ -3,8 +3,8 @@ ========================== Input filename: input_1.fastq.gz Trimming mode: paired-end -Trim Galore version: 0.4.0 -Cutadapt version: 1.8 +Trim Galore version: 0.4.3 +Cutadapt version: 1.13 Quality Phred score cutoff: 20 Quality encoding type selected: ASCII+33 Adapter sequence: 'CTGTCTCTTATA' (Nextera Transposase sequence; auto-detected) @@ -16,10 +16,10 @@ Output file will be GZIP compressed -This is cutadapt 1.8 with Python 3.5.3 +This is cutadapt 1.13 with Python 3.5.3 Command line parameters: -f fastq -e 0.1 -q 20 -O 1 -a CTGTCTCTTATA input_1.fastq.gz -Trimming 1 adapter(s) with at most 10.0% errors in single-end mode ... -Finished in 0.10 s (1010 us/read; 0.06 M reads/minute). +Trimming 1 adapter with at most 10.0% errors in single-end mode ... +Finished in 0.01 s (101 us/read; 0.59 M reads/minute). === Summary === @@ -86,8 +86,8 @@ ========================== Input filename: input_2.fastq.gz Trimming mode: paired-end -Trim Galore version: 0.4.0 -Cutadapt version: 1.8 +Trim Galore version: 0.4.3 +Cutadapt version: 1.13 Quality Phred score cutoff: 20 Quality encoding type selected: ASCII+33 Adapter sequence: 'CTGTCTCTTATA' (Nextera Transposase sequence; auto-detected) @@ -99,10 +99,10 @@ Output file will be GZIP compressed -This is cutadapt 1.8 with Python 3.5.3 +This is cutadapt 1.13 with Python 3.5.3 Command line parameters: -f fastq -e 0.1 -q 20 -O 1 -a CTGTCTCTTATA input_2.fastq.gz -Trimming 1 adapter(s) with at most 10.0% errors in single-end mode ... -Finished in 0.10 s (1000 us/read; 0.06 M reads/minute). +Trimming 1 adapter with at most 10.0% errors in single-end mode ... +Finished in 0.01 s (100 us/read; 0.60 M reads/minute). === Summary === diff -r b4e39d993fc8 -r 80cd83b11214 test-data/paired_example_results2.txt --- a/test-data/paired_example_results2.txt Thu Apr 20 09:14:30 2017 -0400 +++ b/test-data/paired_example_results2.txt Mon Apr 24 14:30:07 2017 -0400 @@ -3,8 +3,8 @@ ========================== Input filename: input_1.fastq Trimming mode: paired-end -Trim Galore version: 0.4.0 -Cutadapt version: 1.8 +Trim Galore version: 0.4.3 +Cutadapt version: 1.13 Quality Phred score cutoff: 20 Quality encoding type selected: ASCII+33 Adapter sequence: 'CTGTCTCTTATA' (Nextera Transposase sequence; auto-detected) @@ -13,10 +13,10 @@ Minimum required sequence length for both reads before a sequence pair gets removed: 20 bp -This is cutadapt 1.8 with Python 3.5.3 +This is cutadapt 1.13 with Python 3.5.3 Command line parameters: -f fastq -e 0.1 -q 20 -O 1 -a CTGTCTCTTATA input_1.fastq -Trimming 1 adapter(s) with at most 10.0% errors in single-end mode ... -Finished in 0.10 s (1010 us/read; 0.06 M reads/minute). +Trimming 1 adapter with at most 10.0% errors in single-end mode ... +Finished in 0.01 s (101 us/read; 0.59 M reads/minute). === Summary === @@ -83,8 +83,8 @@ ========================== Input filename: input_2.fastq Trimming mode: paired-end -Trim Galore version: 0.4.0 -Cutadapt version: 1.8 +Trim Galore version: 0.4.3 +Cutadapt version: 1.13 Quality Phred score cutoff: 20 Quality encoding type selected: ASCII+33 Adapter sequence: 'CTGTCTCTTATA' (Nextera Transposase sequence; auto-detected) @@ -93,10 +93,10 @@ Minimum required sequence length for both reads before a sequence pair gets removed: 20 bp -This is cutadapt 1.8 with Python 3.5.3 +This is cutadapt 1.13 with Python 3.5.3 Command line parameters: -f fastq -e 0.1 -q 20 -O 1 -a CTGTCTCTTATA input_2.fastq -Trimming 1 adapter(s) with at most 10.0% errors in single-end mode ... -Finished in 0.10 s (1000 us/read; 0.06 M reads/minute). +Trimming 1 adapter with at most 10.0% errors in single-end mode ... +Finished in 0.01 s (100 us/read; 0.60 M reads/minute). === Summary === diff -r b4e39d993fc8 -r 80cd83b11214 test-data/paired_example_results2gz.txt --- a/test-data/paired_example_results2gz.txt Thu Apr 20 09:14:30 2017 -0400 +++ b/test-data/paired_example_results2gz.txt Mon Apr 24 14:30:07 2017 -0400 @@ -3,8 +3,8 @@ ========================== Input filename: input_1.fastq.gz Trimming mode: paired-end -Trim Galore version: 0.4.0 -Cutadapt version: 1.8 +Trim Galore version: 0.4.3 +Cutadapt version: 1.13 Quality Phred score cutoff: 20 Quality encoding type selected: ASCII+33 Adapter sequence: 'CTGTCTCTTATA' (Nextera Transposase sequence; auto-detected) @@ -14,10 +14,10 @@ Output file will be GZIP compressed -This is cutadapt 1.8 with Python 3.5.3 +This is cutadapt 1.13 with Python 3.5.3 Command line parameters: -f fastq -e 0.1 -q 20 -O 1 -a CTGTCTCTTATA input_1.fastq.gz -Trimming 1 adapter(s) with at most 10.0% errors in single-end mode ... -Finished in 0.10 s (1010 us/read; 0.06 M reads/minute). +Trimming 1 adapter with at most 10.0% errors in single-end mode ... +Finished in 0.01 s (101 us/read; 0.59 M reads/minute). === Summary === @@ -84,8 +84,8 @@ ========================== Input filename: input_2.fastq.gz Trimming mode: paired-end -Trim Galore version: 0.4.0 -Cutadapt version: 1.8 +Trim Galore version: 0.4.3 +Cutadapt version: 1.13 Quality Phred score cutoff: 20 Quality encoding type selected: ASCII+33 Adapter sequence: 'CTGTCTCTTATA' (Nextera Transposase sequence; auto-detected) @@ -95,10 +95,10 @@ Output file will be GZIP compressed -This is cutadapt 1.8 with Python 3.5.3 +This is cutadapt 1.13 with Python 3.5.3 Command line parameters: -f fastq -e 0.1 -q 20 -O 1 -a CTGTCTCTTATA input_2.fastq.gz -Trimming 1 adapter(s) with at most 10.0% errors in single-end mode ... -Finished in 0.10 s (1000 us/read; 0.06 M reads/minute). +Trimming 1 adapter with at most 10.0% errors in single-end mode ... +Finished in 0.01 s (100 us/read; 0.60 M reads/minute). === Summary === diff -r b4e39d993fc8 -r 80cd83b11214 test-data/sanger_full_range_report_results1.txt --- a/test-data/sanger_full_range_report_results1.txt Thu Apr 20 09:14:30 2017 -0400 +++ b/test-data/sanger_full_range_report_results1.txt Mon Apr 24 14:30:07 2017 -0400 @@ -3,8 +3,8 @@ ========================== Input filename: input_1.fastq Trimming mode: single-end -Trim Galore version: 0.4.0 -Cutadapt version: 1.8 +Trim Galore version: 0.4.3 +Cutadapt version: 1.13 Quality Phred score cutoff: 20 Quality encoding type selected: ASCII+33 Adapter sequence: 'AGATCGGAAGAGC' (Illumina TruSeq, Sanger iPCR; default (inconclusive auto-detection)) @@ -13,10 +13,10 @@ Minimum required sequence length before a sequence gets removed: 20 bp -This is cutadapt 1.8 with Python 3.5.3 +This is cutadapt 1.13 with Python 3.5.3 Command line parameters: -f fastq -e 0.1 -q 20 -O 1 -a AGATCGGAAGAGC input_1.fastq -Trimming 1 adapter(s) with at most 10.0% errors in single-end mode ... -Finished in 0.10 s (50000 us/read; 0.00 M reads/minute). +Trimming 1 adapter with at most 10.0% errors in single-end mode ... +Finished in 0.01 s (5000 us/read; 0.01 M reads/minute). === Summary === diff -r b4e39d993fc8 -r 80cd83b11214 test-data/sanger_full_range_report_results1gz.txt --- a/test-data/sanger_full_range_report_results1gz.txt Thu Apr 20 09:14:30 2017 -0400 +++ b/test-data/sanger_full_range_report_results1gz.txt Mon Apr 24 14:30:07 2017 -0400 @@ -3,8 +3,8 @@ ========================== Input filename: input_1.fastq.gz Trimming mode: single-end -Trim Galore version: 0.4.0 -Cutadapt version: 1.8 +Trim Galore version: 0.4.3 +Cutadapt version: 1.13 Quality Phred score cutoff: 20 Quality encoding type selected: ASCII+33 Adapter sequence: 'AGATCGGAAGAGC' (Illumina TruSeq, Sanger iPCR; default (inconclusive auto-detection)) @@ -14,10 +14,10 @@ Output file will be GZIP compressed -This is cutadapt 1.8 with Python 3.5.3 +This is cutadapt 1.13 with Python 3.5.3 Command line parameters: -f fastq -e 0.1 -q 20 -O 1 -a AGATCGGAAGAGC input_1.fastq.gz -Trimming 1 adapter(s) with at most 10.0% errors in single-end mode ... -Finished in 0.10 s (50000 us/read; 0.00 M reads/minute). +Trimming 1 adapter with at most 10.0% errors in single-end mode ... +Finished in 0.01 s (5000 us/read; 0.01 M reads/minute). === Summary === diff -r b4e39d993fc8 -r 80cd83b11214 tool_dependencies.xml --- a/tool_dependencies.xml Thu Apr 20 09:14:30 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ - - - - - - diff -r b4e39d993fc8 -r 80cd83b11214 trim_galore --- a/trim_galore Thu Apr 20 09:14:30 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1609 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; -use Getopt::Long; -use IPC::Open3; -use File::Spec; -use File::Basename; -use Cwd; - -## This program is Copyright (C) 2012-14, Felix Krueger (felix.krueger@babraham.ac.uk) - -## This program is free software: you can redistribute it and/or modify -## it under the terms of the GNU General Public License as published by -## the Free Software Foundation, either version 3 of the License, or -## (at your option) any later version. - -## This program is distributed in the hope that it will be useful, -## but WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -## GNU General Public License for more details. - -## You should have received a copy of the GNU General Public License -## along with this program. If not, see . - - -## this script is taking in FastQ sequences and trims them using Cutadapt - -## last modified on 01 May 2015 - -my $DOWARN = 1; # print on screen warning and text by default -BEGIN { $SIG{'__WARN__'} = sub { warn $_[0] if $DOWARN } }; - -my $trimmer_version = '0.4.0'; - - -my ($cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2,$three_prime_clip_r1,$three_prime_clip_r2,$nextera,$small_rna,$path_to_cutadapt,$illumina) = process_commandline(); - -my @filenames = @ARGV; -die "\nPlease provide the filename(s) of one or more FastQ file(s) to launch Trim Galore!\n -USAGE: 'trim_galore [options] ' or 'trim_galore --help' for more options\n\n" unless (@filenames); -file_sanity_check($filenames[0]); - - -######################################################################## - -my $path_to_fastqc = 'fastqc'; - -# Before we start let's have quick look if Cutadapt seems to be working with the path information provided -# To change the path to Cutadapt use --path_to_cutadapt /full/path/to/the/Cutadapt/executable - -if(defined $path_to_cutadapt){ - warn "Path to Cutadapt set as: '$path_to_cutadapt' (user defined)\n"; - # we'll simply use this -} -else{ - $path_to_cutadapt = 'cutadapt'; # default, assuming it is in the PATH - warn "Path to Cutadapt set as: '$path_to_cutadapt' (default)\n"; -} -my $cutadapt_version; -my $return = system "$path_to_cutadapt --version"; #>/dev/null 2>&1"; -if ($return == -1){ - die "Failed to execute Cutadapt porperly. Please install Cutadapt first and make sure it is in the PATH, or specify the path to the Cutadapt executable using --path_to_cutadapt /path/to/cutadapt\n\n"; -} -else{ - warn "Cutadapt seems to be working fine (tested command '$path_to_cutadapt --version')\n"; - $cutadapt_version = `$path_to_cutadapt --version`; - chomp $cutadapt_version; - # warn "Cutadapt version: $cutadapt_version\n"; -} - - -######################################################################## - -sub autodetect_adapter_type{ - warn "\n\nAUTO-DETECTING ADAPTER TYPE\n===========================\n"; - warn "Attempting to auto-detect adapter type from the first 1 million sequences of the first file (>> $ARGV[0] <<)\n\n"; - - if ($ARGV[0] =~ /gz$/){ - open (AUTODETECT,"zcat $ARGV[0] |") or die "Failed to read from file $ARGV[0]\n"; - } - else{ - open (AUTODETECT,$ARGV[0]) or die "Failed to read from file $ARGV[0]\n"; - } - - my %adapters; - - $adapters{'Illumina'} -> {seq} = 'AGATCGGAAGAGC'; - $adapters{'Illumina'} -> {count}= 0; - $adapters{'Illumina'} -> {name}= 'Illumina TruSeq, Sanger iPCR; auto-detected'; - - $adapters{'Nextera'} -> {seq} = 'CTGTCTCTTATA'; - $adapters{'Nextera'} -> {count}= 0; - $adapters{'Nextera'} -> {name}= 'Nextera Transposase sequence; auto-detected'; - - $adapters{'smallRNA'} -> {seq} = 'ATGGAATTCTCG'; - $adapters{'smallRNA'} -> {count}= 0; - $adapters{'smallRNA'} -> {name}= 'Illumina small RNA adapter; auto-detected'; - - - # we will read the first 1 million sequences, or until the end of the file whatever comes first, and then use the adapter that for trimming which was found to occcur most often - my $count = 0; - while (1){ - - my $line1 = ; - my $line2 = ; - my $line3 = ; - my $line4 = ; - last unless ($line4); - $count++; - last if ($count == 1000000); - - chomp $line2; - $adapters{'Illumina'}->{count}++ unless (index($line2,'AGATCGGAAGAGC')== -1); - $adapters{'Nextera'} ->{count}++ unless (index($line2,'CTGTCTCTTATA') == -1); - $adapters{'smallRNA'}->{count}++ unless (index($line2,'ATGGAATTCTCG') == -1); - - } - - my $highest; - my $second; - my $seq; - my $adapter_name; - - warn "Found perfect matches for the following adapter sequences:\nAdapter type\tCount\tSequence\tSequences analysed\tPercentage\n"; - foreach my $adapter (sort {$adapters{$b}->{count}<=>$adapters{$a}->{count}} keys %adapters){ - - my $percentage = sprintf("%.2f",$adapters{$adapter}->{count}/$count*100); - - warn "$adapter\t$adapters{$adapter}->{count}\t$adapters{$adapter}->{seq}\t$count\t$percentage\n"; - - unless (defined $highest){ - $highest = $adapter; - $seq = $adapters{$adapter}->{seq}; - $adapter_name = $adapters{$adapter}->{name}; - next; - } - unless (defined $second){ - $second = $adapter; - } - } - - - # using the highest occurrence as adapter to look out for - if ($adapters{$highest}->{count} == $adapters{$second}->{count}){ - warn "Unable to auto-detect most prominent adapter from the first specified file (count $highest: $adapters{$highest}->{count}, count $second: $adapters{$second}->{second})\n"; - - if ($adapters{$highest}->{count} == 0){ - warn "Defaulting to Illumina universal adapter ( AGATCGGAAGAGC ). Specify -a SEQUENCE to avoid this behavior).\n\n"; - $adapter_name = 'Illumina TruSeq, Sanger iPCR; default (inconclusive auto-detection)'; - $seq = 'AGATCGGAAGAGC'; - } - else{ - warn "Using $highest adapter for trimming (count: $adapters{$highest}->{count}). Second best hit was $second (count: $adapters{$second}->{count})\n\n"; - } - } - else{ - warn "Using $highest adapter for trimming (count: $adapters{$highest}->{count}). Second best hit was $second (count: $adapters{$second}->{count})\n\n"; - } - - close AUTODETECT; - - return ($seq,$adapter_name); - -} - - - -### SETTING DEFAULTS UNLESS THEY WERE SPECIFIED -unless (defined $cutoff){ - $cutoff = 20; -} -my $phred_score_cutoff = $cutoff; # only relevant for report -my $adapter_name = ''; -unless (defined $adapter){ - if ($nextera){ - $adapter = 'CTGTCTCTTATA'; - $adapter_name = 'Nextera Transposase sequence; user defined'; - } - elsif($small_rna){ - $adapter = 'ATGGAATTCTCG'; - $adapter_name = 'Illumina small RNA adapter; user defined'; - } - elsif($illumina){ - $adapter = 'AGATCGGAAGAGC'; - $adapter_name = 'Illumina TruSeq, Sanger iPCR; user defined'; - } - else{ # default - ($adapter,$adapter_name) = autodetect_adapter_type(); - } -} -unless (defined $a2){ # optional adapter for the second read in a pair. Only works for --paired trimming - $a2 = ''; -} - -unless (defined $stringency){ - $stringency = 1; -} - -if ($phred_encoding == 64){ - $cutoff += 31; -} - -my $file_1; -my $file_2; - -foreach my $filename (@ARGV){ - trim ($filename); -} - - -sub trim{ - my $filename = shift; - - my $output_filename = (split (/\//,$filename))[-1]; - - my $report = $output_filename; - $report =~ s/$/_trimming_report.txt/; - - if ($no_report_file) { - $report = File::Spec->devnull; - open (REPORT,'>',$report) or die "Failed to write to file '$report': $!\n"; - # warn "Redirecting report output to /dev/null\n"; - } - else{ - open (REPORT,'>',$output_dir.$report) or die "Failed to write to file '$report': $!\n"; - warn "Writing report to '$output_dir$report'\n"; - } - - warn "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n"; - print REPORT "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n"; - - if ($validate){ # paired-end mode - warn "Trimming mode: paired-end\n"; - print REPORT "Trimming mode: paired-end\n"; - } - else{ - warn "Trimming mode: single-end\n"; - print REPORT "Trimming mode: single-end\n"; - } - - - warn "Trim Galore version: $trimmer_version\n"; - print REPORT "Trim Galore version: $trimmer_version\n"; - - warn "Cutadapt version: $cutadapt_version\n"; - print REPORT "Cutadapt version: $cutadapt_version\n"; - - warn "Quality Phred score cutoff: $phred_score_cutoff\n"; - print REPORT "Quality Phred score cutoff: $phred_score_cutoff\n"; - - warn "Quality encoding type selected: ASCII+$phred_encoding\n"; - print REPORT "Quality encoding type selected: ASCII+$phred_encoding\n"; - - warn "Adapter sequence: '$adapter' ($adapter_name)\n"; - print REPORT "Adapter sequence: '$adapter' ($adapter_name)\n"; - - if ($error_rate == 0.1){ - warn "Maximum trimming error rate: $error_rate (default)\n"; - } - else{ - warn "Maximum trimming error rate: $error_rate\n"; - } - - print REPORT "Maximum trimming error rate: $error_rate"; - if ($error_rate == 0.1){ - print REPORT " (default)\n"; - } - else{ - print REPORT "\n"; - } - - if ($a2){ - warn "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n"; - print REPORT "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n"; - } - - warn "Minimum required adapter overlap (stringency): $stringency bp\n"; - print REPORT "Minimum required adapter overlap (stringency): $stringency bp\n"; - - if ($validate){ - warn "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n"; - print REPORT "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n"; - } - else{ - warn "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n"; - print REPORT "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n"; - } - - if ($validate){ # only for paired-end files - - if ($retain){ # keeping single-end reads if only one end is long enough - - if ($length_read_1 == 35){ - warn "Length cut-off for read 1: $length_read_1 bp (default)\n"; - print REPORT "Length cut-off for read 1: $length_read_1 bp (default)\n"; - } - else{ - warn "Length cut-off for read 1: $length_read_1 bp\n"; - print REPORT "Length cut-off for read 1: $length_read_1 bp\n"; - } - - if ($length_read_2 == 35){ - warn "Length cut-off for read 2: $length_read_2 bb (default)\n"; - print REPORT "Length cut-off for read 2: $length_read_2 bp (default)\n"; - } - else{ - warn "Length cut-off for read 2: $length_read_2 bp\n"; - print REPORT "Length cut-off for read 2: $length_read_2 bp\n"; - } - } - } - - if ($rrbs){ - warn "File was specified to be an MspI-digested RRBS sample. Sequences with adapter contamination will be trimmed a further 2 bp to remove potential methylation-biased bases from the end-repair reaction\n"; - print REPORT "File was specified to be an MspI-digested RRBS sample. Sequences with adapter contamination will be trimmed a further 2 bp to remove potential methylation-biased bases from the end-repair reaction\n"; - } - - if ($non_directional){ - warn "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n"; - print REPORT "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n"; - } - - if ($trim){ - warn "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n"; - print REPORT "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n"; - } - - if ($clip_r1){ - warn "All Read 1 sequences will be trimmed by $clip_r1 bp from their 5' end to avoid poor qualities or biases\n"; - print REPORT "All Read 1 sequences will be trimmed by $clip_r1 bp from their 5' end to avoid poor qualities or biases\n"; - } - if ($clip_r2){ - warn "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n"; - print REPORT "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n"; - } - - if ($three_prime_clip_r1){ - warn "All Read 1 sequences will be trimmed by $three_prime_clip_r1 bp from their 3' end to avoid poor qualities or biases\n"; - print REPORT "All Read 1 sequences will be trimmed by $three_prime_clip_r1 bp from their 3' end to avoid poor qualities or biases\n"; - } - if ($three_prime_clip_r2){ - warn "All Read 2 sequences will be trimmed by $three_prime_clip_r2 bp from their 3' end to avoid poor qualities or biases\n"; - print REPORT "All Read 2 sequences will be trimmed by $three_prime_clip_r2 bp from their 3' end to avoid poor qualities or biases\n"; - } - - if ($fastqc){ - warn "Running FastQC on the data once trimming has completed\n"; - print REPORT "Running FastQC on the data once trimming has completed\n"; - - if ($fastqc_args){ - warn "Running FastQC with the following extra arguments: '$fastqc_args'\n"; - print REPORT "Running FastQC with the following extra arguments: $fastqc_args\n"; - } - } - - if ($keep and $rrbs){ - warn "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n"; - print REPORT "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n"; - } - - - if ($gzip or $filename =~ /\.gz$/){ - $gzip = 1; - unless ($dont_gzip){ - warn "Output file(s) will be GZIP compressed\n"; - print REPORT "Output file will be GZIP compressed\n"; - } - } - - warn "\n"; - print REPORT "\n"; - sleep (3); - - my $temp; - - ### Proceeding differently for RRBS and other type of libraries - if ($rrbs){ - - ### Skipping quality filtering for RRBS libraries if a quality cutoff of 0 was specified - if ($cutoff == 0){ - warn "Quality cutoff selected was 0 - Skipping quality trimming altogether\n\n"; - sleep (3); - } - else{ - - $temp = $filename; - $temp =~ s/^.*\///; # replacing optional file path information - $temp =~ s/$/_qual_trimmed.fastq/; - open (TEMP,'>',$output_dir.$temp) or die "Can't write to '$temp': $!"; - - warn " >>> Now performing adaptive quality trimming with a Phred-score cutoff of: $cutoff <<<\n\n"; - sleep (3); - - open (QUAL,"$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -a X $filename |") or die "Can't open pipe to Cutadapt: $!"; - - my $qual_count = 0; - - while (1){ - my $l1 = ; - my $seq = ; - my $l3 = ; - my $qual = ; - last unless (defined $qual); - - $qual_count++; - if ($qual_count%10000000 == 0){ - warn "$qual_count sequences processed\n"; - } - print TEMP "$l1$seq$l3$qual"; - } - - warn "\n >>> Quality trimming completed <<<\n$qual_count sequences processed in total\n\n"; - close QUAL or die "Unable to close QUAL filehandle: $!\n"; - sleep (3); - - } - } - - - if ($output_filename =~ /\.fastq$/){ - $output_filename =~ s/\.fastq$/_trimmed.fq/; - } - elsif ($output_filename =~ /\.fastq\.gz$/){ - $output_filename =~ s/\.fastq\.gz$/_trimmed.fq/; - } - elsif ($output_filename =~ /\.fq$/){ - $output_filename =~ s/\.fq$/_trimmed.fq/; - } - elsif ($output_filename =~ /\.fq\.gz$/){ - $output_filename =~ s/\.fq\.gz$/_trimmed.fq/; - } - else{ - $output_filename =~ s/$/_trimmed.fq/; - } - - if ($gzip or $filename =~ /\.gz$/){ - if ($dont_gzip){ - open (OUT,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; # don't need to gzip intermediate file - } - else{ - ### 6 Jan 2014: had a request to also gzip intermediate files to save disk space - # if ($validate){ - # open (OUT,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; # don't need to gzip intermediate file - # } - $output_filename .= '.gz'; - open (OUT,"| gzip -c - > ${output_dir}${output_filename}") or die "Can't write to '$output_filename': $!\n"; - } - } - else{ - open (OUT,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; - } - warn "Writing final adapter and quality trimmed output to $output_filename\n\n"; - - my $count = 0; - my $too_short = 0; - my $quality_trimmed = 0; - my $rrbs_trimmed = 0; - my $rrbs_trimmed_start = 0; - my $CAA = 0; - my $CGA = 0; - - my $pid; - - if ($rrbs and $cutoff != 0){ - - ### optionally using 2 different adapters for read 1 and read 2 - if ($validate and $a2){ - ### Figure out whether current file counts as read 1 or read 2 of paired-end files - if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair - warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n"; - sleep (3); - $pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n"; - } - else{ # this is read 2 of a pair - warn "\n >>> Now performing adapter trimming for the adapter sequence: '$a2' from file $temp <<< \n"; - sleep (3); - $pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $a2 $output_dir$temp") or die "Failed to launch Cutadapt: $!\n"; - } - } - ### Using the same adapter for both read 1 and read 2 - else{ - warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n"; - sleep (3); - $pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n"; - } - - close WRITER or die $!; # not needed - - open (QUAL,"$output_dir$temp") or die $!; # quality trimmed file - - if ($filename =~ /\.gz$/){ - open (IN,"zcat $filename |") or die $!; # original, untrimmed file - } - else{ - open (IN,$filename) or die $!; # original, untrimmed file - } - - while (1){ - - # we can process the output from Cutadapt and the original input 1 by 1 to decide if the adapter has been removed or not - my $l1 = ; - my $seq = ; # adapter trimmed sequence - my $l3 = ; - my $qual = ; - - $_ = ; # irrelevant - my $original_seq = ; - $_ = ; # irrelevant - $_ = ; # irrelevant - - $_ = ; # irrelevant - my $qual_trimmed_seq = ; - $_ = ; # irrelevant - my $qual_trimmed_qual = ; - - last unless (defined $qual and defined $qual_trimmed_qual); # could be empty strings - - $count++; - if ($count%10000000 == 0){ - warn "$count sequences processed\n"; - } - - chomp $seq; - chomp $qual; - chomp $qual_trimmed_seq; - chomp $original_seq; - - my $quality_trimmed_seq_length = length $qual_trimmed_seq; - - if (length $original_seq > length $qual_trimmed_seq){ - ++$quality_trimmed; - } - - my $nd = 0; - - ### NON-DIRECTIONAL RRBS - if ($non_directional){ - if (length$seq > 2){ - if ($seq =~ /^CAA/){ - ++$CAA; - $seq = substr ($seq,2,length($seq)-2); - $qual = substr ($qual,2,length($qual)-2); - ++$rrbs_trimmed_start; - $nd = 1; - } - elsif ($seq =~ /^CGA/){ - $seq = substr ($seq,2,length($seq)-2); - $qual = substr ($qual,2,length($qual)-2); - ++$CGA; - ++$rrbs_trimmed_start; - $nd = 1; - } - } - } - - ### directional read - unless ($nd == 1){ - if (length $seq >= 2 and length$seq < $quality_trimmed_seq_length){ - $seq = substr ($seq,0,length($seq)-2); - $qual = substr ($qual,0,length($qual)-2); - ++$rrbs_trimmed; - } - } - - ### Shortening all sequences by 1 bp on the 3' end - if ($trim){ - $seq = substr($seq,0,length($seq)-1); - $qual = substr($qual,0,length($qual)-1); - } - - ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE - if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later) - print OUT "$l1$seq\n$l3$qual\n"; - } - else{ # single end - - if ($clip_r1){ - if (length $seq > $clip_r1){ # sequences that are already too short won't be clipped again - $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence - $qual = substr($qual,$clip_r1); - } - } - - if ($three_prime_clip_r1){ - - if (length $seq > $three_prime_clip_r1){ # sequences that are already too short won't be clipped again - # warn "seq/qual before/after trimming:\n$seq\n$qual\n"; - $seq = substr($seq,0,(length($seq) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence - $qual = substr($qual,0,(length($qual) - $three_prime_clip_r1 )); - # warn "$seq\n$qual\n"; - } - - } - - if (length $seq < $length_cutoff){ - ++$too_short; - next; - } - else{ - print OUT "$l1$seq\n$l3$qual\n"; - } - } - } - - print REPORT "\n"; - while (){ - warn $_; - print REPORT $_; - } - - close IN or die "Unable to close IN filehandle: $!"; - close QUAL or die "Unable to close QUAL filehandle: $!"; - close TRIM or die "Unable to close TRIM filehandle: $!"; - close OUT or die "Unable to close OUT filehandle: $!"; - - } - else{ - - ### optionally using 2 different adapters for read 1 and read 2 - if ($validate and $a2){ - ### Figure out whether current file counts as read 1 or read 2 of paired-end files - if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair - warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n"; - sleep (3); - $pid = open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!"; - } - else{ # this is read 2 of a pair - warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$a2' from file $filename <<< \n"; - sleep (3); - $pid = open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $a2 $filename") or die "Failed to launch Cutadapt: $!"; - } - } - ### Using the same adapter for both read 1 and read 2 - else{ - warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n"; - sleep (3); - $pid = open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!"; - } - - close WRITER or die $!; # not needed - - while (1){ - - my $l1 = ; - my $seq = ; # quality and/or adapter trimmed sequence - my $l3 = ; - my $qual = ; - # print "$l1$seq\n$l3$qual\n"; - last unless (defined $qual); # could be an empty string - - $count++; - if ($count%10000000 == 0){ - warn "$count sequences processed\n"; - } - - chomp $seq; - chomp $qual; - - ### Shortening all sequences by 1 bp on the 3' end - if ($trim){ - $seq = substr($seq,0,length($seq)-1); - $qual = substr($qual,0,length($qual)-1); - } - - ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE - if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later) - print OUT "$l1$seq\n$l3$qual\n"; - } - else{ # single end - - if ($clip_r1){ - if (length $seq > $clip_r1){ # sequences that are already too short won't be clipped again - $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence - $qual = substr($qual,$clip_r1); - } - } - - if ($three_prime_clip_r1){ - if (length $seq > $three_prime_clip_r1){ # sequences that are already too short won't be clipped again - # warn "seq/qual before/after trimming:\n$seq\n$qual\n"; - $seq = substr($seq,0,(length($seq) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence - $qual = substr($qual,0,(length($qual) - $three_prime_clip_r1)); - # warn "$seq\n$qual\n";sleep(1); - } - } - - if (length $seq < $length_cutoff){ - ++$too_short; - next; - } - else{ - print OUT "$l1$seq\n$l3$qual\n"; - } - } - } - - print REPORT "\n"; - while (){ - warn $_; - print REPORT $_; - } - - close TRIM or die "Unable to close TRIM filehandle: $!\n"; - close ERROR or die "Unable to close ERROR filehandle: $!\n"; - close OUT or die "Unable to close OUT filehandle: $!\n"; - - } - - - if ($rrbs){ - unless ($keep){ # keeping the quality trimmed intermediate file for RRBS files - - # deleting temporary quality trimmed file - my $deleted = unlink "$output_dir$temp"; - - if ($deleted){ - warn "Successfully deleted temporary file $temp\n\n"; - } - else{ - warn "Could not delete temporary file $temp"; - } - } - } - - ### Wait and reap the child process (Cutadapt) so that it doesn't become a zombie process - waitpid $pid, 0; - unless ($? == 0){ - die "\n\nCutadapt terminated with exit signal: '$?'.\nTerminating Trim Galore run, please check error message(s) to get an idea what went wrong...\n\n"; - } - - warn "\nRUN STATISTICS FOR INPUT FILE: $filename\n"; - print REPORT "\nRUN STATISTICS FOR INPUT FILE: $filename\n"; - - warn "="x 45,"\n"; - print REPORT "="x 45,"\n"; - - warn "$count sequences processed in total\n"; - print REPORT "$count sequences processed in total\n"; - - ### only reporting this separately if quality and adapter trimming were performed separately - if ($rrbs){ - my $percentage_shortened; - if ($count){ - $percentage_shortened = sprintf ("%.1f",$quality_trimmed/$count*100); - warn "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n"; - print REPORT "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n"; - } - else{ - warn "Unable to determine percentage of reads that were shortened because 0 lines were processed\n\n"; - print REPORT "Unable to determine percentage of reads that were shortened because 0 lines were processed\n\n"; - } - } - - my $percentage_too_short; - if ($count){ - $percentage_too_short = sprintf ("%.1f",$too_short/$count*100); - } - else{ - $percentage_too_short = 'N/A'; - } - - if ($validate){ ### only for paired-end files - warn "The length threshold of paired-end sequences gets evaluated later on (in the validation step)\n"; - } - else{ ### Single-end file - warn "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n"; - print REPORT "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n"; - } - - if ($rrbs){ - my $percentage_rrbs_trimmed = sprintf ("%.1f",$rrbs_trimmed/$count*100); - warn "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n"; - print REPORT "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n"; - } - - if ($non_directional){ - my $percentage_rrbs_trimmed_at_start = sprintf ("%.1f",$rrbs_trimmed_start/$count*100); - warn "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n"; - print REPORT "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n"; - } - - warn "\n"; - print REPORT "\n"; - - ### RUNNING FASTQC unless we are dealing with paired-end files - unless($validate){ - if ($fastqc){ - warn "\n >>> Now running FastQC on the data <<<\n\n"; - sleep (5); - if ($fastqc_args){ - system ("$path_to_fastqc $fastqc_args $output_dir$output_filename"); - } - else{ - system ("$path_to_fastqc $output_dir$output_filename"); - } - } - } - - ### VALIDATE PAIRED-END FILES - if ($validate){ - - ### Figure out whether current file counts as read 1 or read 2 of paired-end files - - if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair - $file_1 = $output_filename; - shift @filenames; - # warn "This is read 1: $file_1\n\n"; - } - else{ # this is read 2 of a pair - $file_2 = $output_filename; - shift @filenames; - # warn "This is read 2: $file_2\n\n"; - } - - if ($file_1 and $file_2){ - warn "Validate paired-end files $file_1 and $file_2\n"; - sleep (1); - - my ($val_1,$val_2,$un_1,$un_2) = validate_paired_end_files($file_1,$file_2); - - ### RUNNING FASTQC - if ($fastqc){ - - warn "\n >>> Now running FastQC on the validated data $val_1<<<\n\n"; - sleep (3); - - if ($fastqc_args){ - system ("$path_to_fastqc $fastqc_args $output_dir$val_1"); - } - else{ - system ("$path_to_fastqc $output_dir$val_1"); - } - - warn "\n >>> Now running FastQC on the validated data $val_2<<<\n\n"; - sleep (3); - - if ($fastqc_args){ - system ("$path_to_fastqc $fastqc_args $output_dir$val_2"); - } - else{ - system ("$path_to_fastqc $output_dir$val_2"); - } - - } - - warn "Deleting both intermediate output files $file_1 and $file_2\n"; - unlink "$output_dir$file_1"; - unlink "$output_dir$file_2"; - - warn "\n",'='x100,"\n\n"; - sleep (3); - - $file_1 = undef; # setting file_1 and file_2 to undef once validation is completed - $file_2 = undef; - } - } - -} - -sub validate_paired_end_files{ - - my $file_1 = shift; - my $file_2 = shift; - - warn "file_1: $file_1, file_2: $file_2\n\n"; - - if ($file_1 =~ /\.gz$/){ - open (IN1,"zcat $output_dir$file_1 |") or die "Couldn't read from file $file_1: $!\n"; - } - else{ - open (IN1, "$output_dir$file_1") or die "Couldn't read from file $file_1: $!\n"; - } - - if ($file_2 =~ /\.gz$/){ - open (IN2,"zcat $output_dir$file_2 |") or die "Couldn't read from file $file_2: $!\n"; - } - else{ - open (IN2, "$output_dir$file_2") or die "Couldn't read from file $file_2: $!\n"; - } - - warn "\n>>>>> Now validing the length of the 2 paired-end infiles: $file_1 and $file_2 <<<<<\n"; - sleep (3); - - my $out_1 = $file_1; - my $out_2 = $file_2; - - if ($out_1 =~ /gz$/){ - $out_1 =~ s/trimmed\.fq\.gz$/val_1.fq/; - } - else{ - $out_1 =~ s/trimmed\.fq$/val_1.fq/; - } - - if ($out_2 =~ /gz$/){ - $out_2 =~ s/trimmed\.fq\.gz$/val_2.fq/; - } - else{ - $out_2 =~ s/trimmed\.fq$/val_2.fq/; - } - - if ($gzip){ - if ($dont_gzip){ - open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n"; - } - else{ - $out_1 .= '.gz'; - open (R1,"| gzip -c - > ${output_dir}${out_1}") or die "Can't write to $out_1: $!\n"; - } - } - else{ - open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n"; - } - - if ($gzip){ - if ($dont_gzip){ - open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n"; - } - else{ - $out_2 .= '.gz'; - open (R2,"| gzip -c - > ${output_dir}${out_2}") or die "Can't write to $out_2: $!\n"; - } - } - else{ - open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n"; - } - - warn "Writing validated paired-end read 1 reads to $out_1\n"; - warn "Writing validated paired-end read 2 reads to $out_2\n\n"; - - my $unpaired_1; - my $unpaired_2; - - if ($retain){ - - $unpaired_1 = $file_1; - $unpaired_2 = $file_2; - - if ($unpaired_1 =~ /gz$/){ - $unpaired_1 =~ s/trimmed\.fq\.gz$/unpaired_1.fq/; - } - else{ - $unpaired_1 =~ s/trimmed\.fq$/unpaired_1.fq/; - } - - if ($unpaired_2 =~ /gz$/){ - $unpaired_2 =~ s/trimmed\.fq\.gz$/unpaired_2.fq/; - } - else{ - $unpaired_2 =~ s/trimmed\.fq$/unpaired_2.fq/; - } - - if ($gzip){ - if ($dont_gzip){ - open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n"; - } - else{ - $unpaired_1 .= '.gz'; - open (UNPAIRED1,"| gzip -c - > ${output_dir}${unpaired_1}") or die "Can't write to $unpaired_1: $!\n"; - } - } - else{ - open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n"; - } - - if ($gzip){ - if ($dont_gzip){ - open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n"; - } - else{ - $unpaired_2 .= '.gz'; - open (UNPAIRED2,"| gzip -c - > ${output_dir}${unpaired_2}") or die "Can't write to $unpaired_2: $!\n"; - } - } - else{ - open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n"; - } - - warn "Writing unpaired read 1 reads to $unpaired_1\n"; - warn "Writing unpaired read 2 reads to $unpaired_2\n\n"; - } - - my $sequence_pairs_removed = 0; - my $read_1_printed = 0; - my $read_2_printed = 0; - - my $count = 0; - - while (1){ - my $id_1 = ; - my $seq_1 = ; - my $l3_1 = ; - my $qual_1 = ; - last unless ($id_1 and $seq_1 and $l3_1 and $qual_1); - - my $id_2 = ; - my $seq_2 = ; - my $l3_2 = ; - my $qual_2 = ; - last unless ($id_2 and $seq_2 and $l3_2 and $qual_2); - - ++$count; - - - ## small check if the sequence files appear to be FastQ files - if ($count == 1){ # performed just once - if ($id_1 !~ /^\@/ or $l3_1 !~ /^\+/){ - die "Input file doesn't seem to be in FastQ format at sequence $count\n"; - } - if ($id_2 !~ /^\@/ or $l3_2 !~ /^\+/){ - die "Input file doesn't seem to be in FastQ format at sequence $count\n"; - } - } - - chomp $seq_1; - chomp $seq_2; - chomp $qual_1; - chomp $qual_2; - - if ($clip_r1){ - if (length $seq_1 > $clip_r1){ # sequences that are already too short won't be trimmed again - $seq_1 = substr($seq_1,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence - $qual_1 = substr($qual_1,$clip_r1); - } - } - if ($clip_r2){ - if (length $seq_2 > $clip_r2){ # sequences that are already too short won't be trimmed again - $seq_2 = substr($seq_2,$clip_r2); # starting after the sequences to be trimmed until the end of the sequence - $qual_2 = substr($qual_2,$clip_r2); - } - } - - if ($three_prime_clip_r1){ - if (length $seq_1 > $three_prime_clip_r1){ # sequences that are already too short won't be clipped again - $seq_1 = substr($seq_1,0,(length($seq_1) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence - $qual_1 = substr($qual_1,0,(length($qual_1) - $three_prime_clip_r1)); - } - } - if ($three_prime_clip_r2){ - if (length $seq_2 > $three_prime_clip_r2){ # sequences that are already too short won't be clipped again - $seq_2 = substr($seq_2,0,(length($seq_2) - $three_prime_clip_r2)); # starting after the sequences to be trimmed until the end of the sequence - $qual_2 = substr($qual_2,0,(length($qual_2) - $three_prime_clip_r2)); - } - } - - - - ### making sure that the reads do have a sensible length - if ( (length($seq_1) < $length_cutoff) or (length($seq_2) < $length_cutoff) ){ - ++$sequence_pairs_removed; - if ($retain){ # writing out single-end reads if they are longer than the cutoff - - if ( length($seq_1) >= $length_read_1){ # read 1 is long enough - print UNPAIRED1 $id_1; - print UNPAIRED1 "$seq_1\n"; - print UNPAIRED1 $l3_1; - print UNPAIRED1 "$qual_1\n"; - ++$read_1_printed; - } - - if ( length($seq_2) >= $length_read_2){ # read 2 is long enough - print UNPAIRED2 $id_2; - print UNPAIRED2 "$seq_2\n"; - print UNPAIRED2 $l3_2; - print UNPAIRED2 "$qual_2\n"; - ++$read_2_printed; - } - - } - } - else{ - print R1 $id_1; - print R1 "$seq_1\n"; - print R1 $l3_1; - print R1 "$qual_1\n"; - - print R2 $id_2; - print R2 "$seq_2\n"; - print R2 $l3_2; - print R2 "$qual_2\n"; - } - - } - - - my $percentage; - - if ($count){ - $percentage = sprintf("%.2f",$sequence_pairs_removed/$count*100); - } - else{ - $percentage = 'N/A'; - } - - warn "Total number of sequences analysed: $count\n\n"; - warn "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n"; - - print REPORT "Total number of sequences analysed for the sequence pair length validation: $count\n\n"; - print REPORT "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n"; - - if ($keep){ - warn "Number of unpaired read 1 reads printed: $read_1_printed\n"; - warn "Number of unpaired read 2 reads printed: $read_2_printed\n"; - } - - close R1 or die $!; - close R2 or die $!; - - if ($retain){ - close UNPAIRED1 or die $!; - close UNPAIRED2 or die $!; - } - - warn "\n"; - if ($retain){ - return ($out_1,$out_2,$unpaired_1,$unpaired_2); - } - else{ - return ($out_1,$out_2); - } -} - - -sub file_sanity_check{ - - my $file = shift; - open (SANITY,$file) or die "Failed to read from file '$file' to perform sanity check\n"; - - # just processing a single FastQ entry - my $id = ; - my $seq = ; - my $three = ; - my $qual = ; - - unless ($id and $seq and $three and $qual){ - warn "Input file '$file' seems to be completely empty. Consider respecifying!\n\n"; - } - return; - chomp $seq; - - # testing if the file is a colorspace file in which case we bail - if ($seq =~ /\d+/){ - die "File seems to be in SOLiD colorspace format which is not supported by Trim Galore (sequence is: '$seq')! Please use Cutadapt on colorspace files separately and check its documentation!\n\n"; - } - - close SANITY; -} - - -sub process_commandline{ - my $help; - my $quality; - my $adapter; - my $adapter2; - my $stringency; - my $report; - my $version; - my $rrbs; - my $length_cutoff; - my $keep; - my $fastqc; - my $non_directional; - my $phred33; - my $phred64; - my $fastqc_args; - my $trim; - my $gzip; - my $validate; - my $retain; - my $length_read_1; - my $length_read_2; - my $error_rate; - my $output_dir; - my $no_report_file; - my $suppress_warn; - my $dont_gzip; - my $clip_r1; - my $clip_r2; - my $three_prime_clip_r1; - my $three_prime_clip_r2; - my $nextera; - my $small_rna; - my $illumina; - my $path_to_cutadapt; - - my $command_line = GetOptions ('help|man' => \$help, - 'q|quality=i' => \$quality, - 'a|adapter=s' => \$adapter, - 'a2|adapter2=s' => \$adapter2, - 'report' => \$report, - 'version' => \$version, - 'stringency=i' => \$stringency, - 'fastqc' => \$fastqc, - 'RRBS' => \$rrbs, - 'keep' => \$keep, - 'length=i' => \$length_cutoff, - 'non_directional' => \$non_directional, - 'phred33' => \$phred33, - 'phred64' => \$phred64, - 'fastqc_args=s' => \$fastqc_args, - 'trim1' => \$trim, - 'gzip' => \$gzip, - 'paired_end' => \$validate, - 'retain_unpaired' => \$retain, - 'length_1|r1=i' => \$length_read_1, - 'length_2|r2=i' => \$length_read_2, - 'e|error_rate=s' => \$error_rate, - 'o|output_dir=s' => \$output_dir, - 'no_report_file' => \$no_report_file, - 'suppress_warn' => \$suppress_warn, - 'dont_gzip' => \$dont_gzip, - 'clip_R1=i' => \$clip_r1, - 'clip_R2=i' => \$clip_r2, - 'three_prime_clip_R1=i' => \$three_prime_clip_r1, - 'three_prime_clip_R2=i' => \$three_prime_clip_r2, - 'illumina' => \$illumina, - 'nextera' => \$nextera, - 'small_rna' => \$small_rna, - 'path_to_cutadapt=s' => \$path_to_cutadapt, - ); - - ### EXIT ON ERROR if there were errors with any of the supplied options - unless ($command_line){ - die "Please respecify command line options\n"; - } - - ### HELPFILE - if ($help){ - print_helpfile(); - exit; - } - - - - - - if ($version){ - print << "VERSION"; - - Quality-/Adapter-/RRBS-Trimming - (powered by Cutadapt) - version $trimmer_version - - Last update: 06 05 2015 - -VERSION - exit; - } - - ### RRBS - unless ($rrbs){ - $rrbs = 0; - } - - ### SUPRESS WARNINGS - if (defined $suppress_warn){ - $DOWARN = 0; - } - - ### QUALITY SCORES - my $phred_encoding; - if ($phred33){ - if ($phred64){ - die "Please specify only a single quality encoding type (--phred33 or --phred64)\n\n"; - } - $phred_encoding = 33; - } - elsif ($phred64){ - $phred_encoding = 64; - } - unless ($phred33 or $phred64){ - warn "No quality encoding type selected. Assuming that the data provided uses Sanger encoded Phred scores (default)\n\n"; - $phred_encoding = 33; - sleep (1); - } - - ### NON-DIRECTIONAL RRBS - if ($non_directional){ - unless ($rrbs){ - die "Option '--non_directional' requires '--rrbs' to be specified as well. Please re-specify!\n"; - } - } - else{ - $non_directional = 0; - } - - if ($fastqc_args){ - $fastqc = 1; # specifying fastqc extra arguments automatically means that FastQC will be executed - } - else{ - $fastqc_args = 0; - } - - ### CUSTOM ERROR RATE - if (defined $error_rate){ - # make sure that the error rate is between 0 and 1 - unless ($error_rate >= 0 and $error_rate <= 1){ - die "Please specify an error rate between 0 and 1 (the default is 0.1)\n"; - } - } - else{ - $error_rate = 0.1; # (default) - } - - if ($nextera and $small_rna or $nextera and $illumina or $illumina and $small_rna ){ - die "You can't use several different adapter types at the same time. Make your choice or consider using -a and -a2\n\n"; - } - - if (defined $adapter){ - unless ($adapter =~ /^[ACTGNXactgnx]+$/){ - die "Adapter sequence must contain DNA characters only (A,C,T,G or N)!\n"; - } - $adapter = uc$adapter; - - if ($illumina){ - die "You can't supply an adapter sequence AND use the Illumina universal adapter sequence. Make your choice.\n\n"; - } - if ($nextera){ - die "You can't supply an adapter sequence AND use the Nextera transposase adapter sequence. Make your choice.\n\n"; - } - if ($small_rna){ - die "You can't supply an adapter sequence AND use the Illumina small RNA adapter sequence. Make your choice.\n\n"; - } - } - - if (defined $adapter2){ - unless ($validate){ - die "An optional adapter for read 2 of paired-end files requires '--paired' to be specified as well! Please re-specify\n"; - } - unless ($adapter2 =~ /^[ACTGNactgn]+$/){ - die "Optional adapter 2 sequence must contain DNA characters only (A,C,T,G or N)!\n"; - } - $adapter2 = uc$adapter2; - } - - ### LENGTH CUTOFF - unless (defined $length_cutoff){ - $length_cutoff = 20; - } - - ### files are supposed to be paired-end files - if ($validate){ - - # making sure that an even number of reads has been supplied - unless ((scalar@ARGV)%2 == 0){ - die "Please provide an even number of input files for paired-end FastQ trimming! Aborting ...\n"; - } - - ## CUTOFF FOR VALIDATED READ-PAIRS - if (defined $length_read_1 or defined $length_read_2){ - - unless ($retain){ - die "Please specify --keep_unpaired to alter the unpaired single-end read length cut off(s)\n\n"; - } - - if (defined $length_read_1){ - unless ($length_read_1 >= 15 and $length_read_1 <= 100){ - die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n"; - } - unless ($length_read_1 > $length_cutoff){ - die "The single-end unpaired read length needs to be longer than the paired-end cut-off value ($length_cutoff bp)\n\n"; - } - } - - if (defined $length_read_2){ - unless ($length_read_2 >= 15 and $length_read_2 <= 100){ - die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n"; - } - unless ($length_read_2 > $length_cutoff){ - die "The single-end unpaired read length needs to be longer than the paired-end cut-off value ($length_cutoff bp)\n\n"; - } - } - } - - if ($retain){ - $length_read_1 = 35 unless (defined $length_read_1); - $length_read_2 = 35 unless (defined $length_read_2); - } - } - - unless ($no_report_file){ - $no_report_file = 0; - } - - ### OUTPUT DIR PATH - if ($output_dir){ - unless ($output_dir =~ /\/$/){ - $output_dir =~ s/$/\//; - } - } - else{ - $output_dir = ''; - } - - ### Trimming at the 5' end - if (defined $clip_r2){ # trimming 5' bases of read 1 - die "Clipping the 5' end of read 2 is only allowed for paired-end files (--paired)\n" unless ($validate); - } - - if (defined $clip_r1){ # trimming 5' bases of read 1 - unless ($clip_r1 > 0 and $clip_r1 < 100){ - die "The 5' clipping value for read 1 should have a sensible value (> 0 and < read length)\n\n"; - } - } - - if (defined $clip_r2){ # trimming 5' bases of read 2 - unless ($clip_r2 > 0 and $clip_r2 < 100){ - die "The 5' clipping value for read 2 should have a sensible value (> 0 and < read length)\n\n"; - } - } - - ### Trimming at the 3' end - if (defined $three_prime_clip_r1){ # trimming 3' bases of read 1 - unless ($three_prime_clip_r1 > 0 and $three_prime_clip_r1 < 100){ - die "The 3' clipping value for read 1 should have a sensible value (> 0 and < read length)\n\n"; - } - } - - if (defined $three_prime_clip_r2){ # trimming 3' bases of read 2 - unless ($three_prime_clip_r2 > 0 and $three_prime_clip_r2 < 100){ - die "The 3' clipping value for read 2 should have a sensible value (> 0 and < read length)\n\n"; - } - } - - - return ($quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2,$three_prime_clip_r1,$three_prime_clip_r2,$nextera,$small_rna,$path_to_cutadapt,$illumina); -} - - - - -sub print_helpfile{ - print << "HELP"; - - USAGE: - -trim_galore [options] - - --h/--help Print this help message and exits. - --v/--version Print the version information and exits. - --q/--quality Trim low-quality ends from reads in addition to adapter removal. For - RRBS samples, quality trimming will be performed first, and adapter - trimming is carried in a second round. Other files are quality and adapter - trimmed in a single pass. The algorithm is the same as the one used by BWA - (Subtract INT from all qualities; compute partial sums from all indices - to the end of the sequence; cut sequence at the index at which the sum is - minimal). Default Phred score: 20. - ---phred33 Instructs Cutadapt to use ASCII+33 quality scores as Phred scores - (Sanger/Illumina 1.9+ encoding) for quality trimming. Default: ON. - ---phred64 Instructs Cutadapt to use ASCII+64 quality scores as Phred scores - (Illumina 1.5 encoding) for quality trimming. - ---fastqc Run FastQC in the default mode on the FastQ file once trimming is complete. - ---fastqc_args "" Passes extra arguments to FastQC. If more than one argument is to be passed - to FastQC they must be in the form "arg1 arg2 etc.". An example would be: - --fastqc_args "--nogroup --outdir /home/". Passing extra arguments will - automatically invoke FastQC, so --fastqc does not have to be specified - separately. - --a/--adapter Adapter sequence to be trimmed. If not specified explicitly, Trim Galore will - try to auto-detect whether the Illumina universal, Nextera transposase or Illumina - small RNA adapter sequence was used. Also see '--illumina', '--nextera' and - '--small_rna'. If no adapter can be detected within the first 1 million sequences - of the first file specified Trim Galore defaults to '--illumina'. - --a2/--adapter2 Optional adapter sequence to be trimmed off read 2 of paired-end files. This - option requires '--paired' to be specified as well. - ---illumina Adapter sequence to be trimmed is the first 13bp of the Illumina universal adapter - 'AGATCGGAAGAGC' instead of the default auto-detection of adapter sequence. - ---nextera Adapter sequence to be trimmed is the first 12bp of the Nextera adapter - 'CTGTCTCTTATA' instead of the default auto-detection of adapter sequence. - ---small_rna Adapter sequence to be trimmed is the first 12bp of the Illumina Small RNA Adapter - 'ATGGAATTCTCG' instead of the default auto-detection of adapter sequence. - - ---stringency Overlap with adapter sequence required to trim a sequence. Defaults to a - very stringent setting of 1, i.e. even a single bp of overlapping sequence - will be trimmed off from the 3' end of any read. - --e Maximum allowed error rate (no. of errors divided by the length of the matching - region) (default: 0.1) - ---gzip Compress the output file with GZIP. If the input files are GZIP-compressed - the output files will automatically be GZIP compressed as well. As of v0.2.8 the - compression will take place on the fly. - ---dont_gzip Output files won't be compressed with GZIP. This option overrides --gzip. - ---length Discard reads that became shorter than length INT because of either - quality or adapter trimming. A value of '0' effectively disables - this behaviour. Default: 20 bp. - - For paired-end files, both reads of a read-pair need to be longer than - bp to be printed out to validated paired-end files (see option --paired). - If only one read became too short there is the possibility of keeping such - unpaired single-end reads (see --retain_unpaired). Default pair-cutoff: 20 bp. - --o/--output_dir If specified all output will be written to this directory instead of the current - directory. - ---no_report_file If specified no report file will be generated. - ---suppress_warn If specified any output to STDOUT or STDERR will be suppressed. - ---clip_R1 Instructs Trim Galore to remove bp from the 5' end of read 1 (or single-end - reads). This may be useful if the qualities were very poor, or if there is some - sort of unwanted bias at the 5' end. Default: OFF. - ---clip_R2 Instructs Trim Galore to remove bp from the 5' end of read 2 (paired-end reads - only). This may be useful if the qualities were very poor, or if there is some sort - of unwanted bias at the 5' end. For paired-end BS-Seq, it is recommended to remove - the first few bp because the end-repair reaction may introduce a bias towards low - methylation. Please refer to the M-bias plot section in the Bismark User Guide for - some examples. Default: OFF. - ---three_prime_clip_R1 Instructs Trim Galore to remove bp from the 3' end of read 1 (or single-end - reads) AFTER adapter/quality trimming has been performed. This may remove some unwanted - bias from the 3' end that is not directly related to adapter sequence or basecall quality. - Default: OFF. - ---three_prime_clip_R2 Instructs Trim Galore to remove bp from the 3' end of read 2 AFTER - adapter/quality trimming has been performed. This may remove some unwanted bias from - the 3' end that is not directly related to adapter sequence or basecall quality. - Default: OFF. - ---path_to_cutadapt You may use this option to specify a path to the Cutadapt executable, - e.g. /my/home/cutadapt-1.7.1/bin/cutadapt. Else it is assumed that Cutadapt is in - the PATH. - - -RRBS-specific options (MspI digested material): - ---rrbs Specifies that the input file was an MspI digested RRBS sample (recognition - site: CCGG). Sequences which were adapter-trimmed will have a further 2 bp - removed from their 3' end. This is to avoid that the filled-in C close to the - second MspI site in a sequence is used for methylation calls. Sequences which - were merely trimmed because of poor quality will not be shortened further. - ---non_directional Selecting this option for non-directional RRBS libraries will screen - quality-trimmed sequences for 'CAA' or 'CGA' at the start of the read - and, if found, removes the first two basepairs. Like with the option - '--rrbs' this avoids using cytosine positions that were filled-in - during the end-repair step. '--non_directional' requires '--rrbs' to - be specified as well. - ---keep Keep the quality trimmed intermediate file. Default: off, which means - the temporary file is being deleted after adapter trimming. Only has - an effect for RRBS samples since other FastQ files are not trimmed - for poor qualities separately. - - -Note for RRBS using MseI: - -If your DNA material was digested with MseI (recognition motif: TTAA) instead of MspI it is NOT necessary -to specify --rrbs or --non_directional since virtually all reads should start with the sequence -'TAA', and this holds true for both directional and non-directional libraries. As the end-repair of 'TAA' -restricted sites does not involve any cytosines it does not need to be treated especially. Instead, simply -run Trim Galore! in the standard (i.e. non-RRBS) mode. - - -Paired-end specific options: - ---paired This option performs length trimming of quality/adapter/RRBS trimmed reads for - paired-end files. To pass the validation test, both sequences of a sequence pair - are required to have a certain minimum length which is governed by the option - --length (see above). If only one read passes this length threshold the - other read can be rescued (see option --retain_unpaired). Using this option lets - you discard too short read pairs without disturbing the sequence-by-sequence order - of FastQ files which is required by many aligners. - - Trim Galore! expects paired-end files to be supplied in a pairwise fashion, e.g. - file1_1.fq file1_2.fq SRR2_1.fq.gz SRR2_2.fq.gz ... . - --t/--trim1 Trims 1 bp off every read from its 3' end. This may be needed for FastQ files that - are to be aligned as paired-end data with Bowtie. This is because Bowtie (1) regards - alignments like this: - - R1 ---------------------------> or this: -----------------------> R1 - R2 <--------------------------- <----------------- R2 - - as invalid (whenever a start/end coordinate is contained within the other read). - ---retain_unpaired If only one of the two paired-end reads became too short, the longer - read will be written to either '.unpaired_1.fq' or '.unpaired_2.fq' - output files. The length cutoff for unpaired single-end reads is - governed by the parameters -r1/--length_1 and -r2/--length_2. Default: OFF. - --r1/--length_1 Unpaired single-end read length cutoff needed for read 1 to be written to - '.unpaired_1.fq' output file. These reads may be mapped in single-end mode. - Default: 35 bp. - --r2/--length_2 Unpaired single-end read length cutoff needed for read 2 to be written to - '.unpaired_2.fq' output file. These reads may be mapped in single-end mode. - Default: 35 bp. - - -Last modified on 06 May 2015. - -HELP - exit; -} diff -r b4e39d993fc8 -r 80cd83b11214 trim_galore.xml --- a/trim_galore.xml Thu Apr 20 09:14:30 2017 -0400 +++ b/trim_galore.xml Mon Apr 24 14:30:07 2017 -0400 @@ -1,5 +1,5 @@ - - + + Quality and adapter trimmer of reads @@ -24,7 +24,6 @@ - ^[ACTGNactgn]*$ @@ -44,16 +43,12 @@ - - cutadapt - cutadapt + trim-galore - perl '$__tool_directory__/trim_galore' --version + trim_galore --version - - $report_file; + cat ./*_trimming_report.txt > '$report_file' #end if - -]]> - + ]]> @@ -283,8 +275,8 @@ label="Screen quality-trimmed sequences for 'CAA' or 'CGA' at the start of the read and, if found, removes the first two basepairs" /> + - singlePaired['sPaired'] == "single" @@ -331,8 +323,8 @@ params['settingsType'] == "custom" params['report'] == True + - @@ -449,8 +441,7 @@ - - | R2 <----------------- | @@ -600,7 +591,7 @@ | Selecting this option for non-directional RRBS libraries will screen quality-trimmed sequences for 'CAA' or 'CGA' at the start of the read and, if found, removes the first two basepairs. Like with the option '--rrbs' this avoids using cytosine positions that were filled-in during the end-repair step. '--non_directional' requires '--rrbs' to be specified as well. | - | *option --non_directional*]]> - + | *option --non_directional* + ]]>