view ucsb_phylogenetics/trimmingreads/TrimmingReads.pl @ 4:5b23f3eb3f09 draft

Added sequence gap remover and Trimming Reads.
author ucsb-phylogenetics
date Sun, 24 Jun 2012 16:02:29 -0400
parents
children
line wrap: on
line source

#! /usr/bin/perl

use strict;
use warnings;
use Getopt::Long;
use File::Basename;
use List::Util qw(sum min max);

my $seqFormat = "a";		# 1: Sanger; 2: Solexa; 3: Illumina 1.3+; 4: Illumina 1.5+;
my $subVal;

# Parameter variables
my $file;
my $helpAsked;
my $rTrimBases = 0;
my $lTrimBases = 0;
my $qCutOff = 0;
my $lenCutOff = -1;
my $outFile = "";
my $isQualTrimming = 1;

GetOptions(
			"i=s" => \$file,
			"h|help" => \$helpAsked,
			"l|leftTrimBases=i" => \$lTrimBases,
			"o|outputFile=s" => \$outFile,
			"r|rightTrimBases=i" => \$rTrimBases,
			"q|qualCutOff=i" => \$qCutOff,
			"n|lenCutOff=i" => \$lenCutOff,
		  );
if(defined($helpAsked)) {
	prtUsage();
	exit;
}
if(!defined($file)) {
	prtError("No input files are provided");
}

my ($fileName, $filePath) = fileparse($file);
$outFile = $file . "_trimmed" if($outFile eq "");

open(I, "$file") or die "Can not open file $file\n";
open(O, ">$outFile") or die "Can not create file $outFile\n";
my $tmpLine = <I>;
close(I);
if($tmpLine =~ /^@/) {
	print "Input read/sequence format: FASTQ\n";
	if($lTrimBases == 0 && $rTrimBases == 0 && $qCutOff == 0 && $lenCutOff == -1) {
		print "Trimming parameters are not set.\nNothing to do.\nExiting...\n";
		unlink($outFile);
		exit;
	}
	print "Checking FASTQ variant: File $file...\n";
	my $nLines = checkFastQFormat($file, 1);
	if($seqFormat == 1) {
		$subVal = 33;
		print "Input FASTQ file format: Sanger\n";
	}
	if($seqFormat == 2) {
		$subVal = 64;
		print "Input FASTQ file format: Solexa\n";
	}
	if($seqFormat == 3) {
		$subVal = 64;
		print "Input FASTQ file format: Illumina 1.3+\n";
	}
	if($seqFormat == 4) {
		$subVal = 64;
		print "Input FASTQ file format: Illumina 1.5+\n";
	}
	if($seqFormat == 5) {
		$subVal = 33;
		print "Input FASTQ file format: Illumina 1.8+\n";
	}
	if($lTrimBases != 0 || $rTrimBases != 0) {
		print "Trimming $lTrimBases bases from left end and $rTrimBases bases from right end";
		$isQualTrimming = 0;
	}
	else {
		print "Trimming based on PHRED quality score (< $qCutOff)";
	}
	print " followed by length filtering (< $lenCutOff bp)\n";

	open(I, "$file") or die "Can not open file $file\n";	
	my $c = 0;
	my $currId1 = "";
	my $currId2 = "";
	my $currSeq = "";
	my $currQual = "";
	
	
	while(my $line = <I>) {
		chomp $line;
        $c++;
        if($c == 5) {
                $c = 1;
        }
        if($c == 1) {
        	$currId1 = $line;
        }
        if($c == 3) {
        	$currId2 = $line;
        }
        if($isQualTrimming == 0) {
	        if($c == 2) {
	        	$currSeq = trimSeq($line);
	        }
	        elsif($c == 4) {
	        	$currQual = trimSeq($line);
	        	print O "$currId1\n$currSeq\n$currId2\n$currQual\n" if(length $currSeq >= $lenCutOff);
	        }
        }
        else {
	        if($c == 4) {
				$currQual = trimSeq4Qual($line);
				if(defined($currQual)) {
					my $len = length $currQual;
					$currSeq =~ /^(.{$len})/;
					$currSeq = $1;
		        	print O "$currId1\n$currSeq\n$currId2\n$currQual\n" if(length $currSeq >= $lenCutOff);
				}
	        }
	        elsif($c == 2) {
	        	$currSeq = $line;
	        }
        }
	}
	print "Trimmed file is generated: $outFile\n";
}
else {
	print "Input read/sequence format: FASTA\n";
	if($qCutOff != 0) {
		print "Warning: Quality trimming can not be performed for FASTA files\n";
		$qCutOff = 0;
	}
	if($lTrimBases == 0 && $rTrimBases == 0 && $lenCutOff == -1) {
		print "Trimming parameters are not set.\nNothing to do.\nExiting...\n";
		unlink($outFile);
		exit;
	}
	print "Trimming $lTrimBases bases from left end and $rTrimBases bases from right end followed by length filtering (< $lenCutOff bp)\n";
	open(I, "$file") or die "Can not open file $file\n";
	my $prevFastaSeqId = "";
	my $fastaSeqId = "";
	my $fastaSeq = "";
	
	while(my $line = <I>) {
		chomp $line;
		if($line =~ /^>/) {
			$prevFastaSeqId = $fastaSeqId;
			$fastaSeqId = $line;
			if($fastaSeq ne "") {
				my $outSeq = trimSeq($fastaSeq);
				print O "$prevFastaSeqId\n", formatSeq($outSeq), "\n" if(length $outSeq >= $lenCutOff);
			}
			$fastaSeq = "";
		}
		else {
			$fastaSeq .= $line;
		}
	}
	if($fastaSeq ne "") {
		$prevFastaSeqId = $fastaSeqId;
		my $outSeq = trimSeq($fastaSeq);
		print O "$prevFastaSeqId\n", formatSeq($outSeq), "\n" if(length $outSeq >= $lenCutOff);
	}
	print "Trimmed read/sequence file is generated: $outFile\n";
}
close(O);
close(I);


sub trimSeq {
	my $seq = $_[0];
	$seq =~ /^.{$lTrimBases}(.+).{$rTrimBases}$/;
	return $1;
}

sub trimSeq4Qual {
	my $qual = $_[0];
	my @ASCII = unpack("C*", $qual);
	my $trimCount = 0;
	for(my $i=@ASCII; $i>0; $i--) {
		my $val = $ASCII[$i-1];
		$val -= $subVal;
		if($val < $qCutOff) {
			$trimCount++;
		}
		else {
			last;
		}
	}
	$qual =~ /^(.+).{$trimCount}$/;
	return $1;
}

sub formatSeq {
	my $seq = $_[0];
    my $newSeq = "";
    my $ch = 60;
    for(my $i=0; $i<length $seq; $i+=$ch) {
        $newSeq .= substr($seq, $i, $ch) . "\n";
    }
    chomp($newSeq);         # To remove \n at the end of the whole sequence..
    return $newSeq;
}

sub prtHelp {
	print "\n$0 options:\n\n";
	print "### Input reads/sequences (FASTQ/FASTA) (Required)\n";
	print "  -i <Read/Sequence file>\n";
	print "    File containing reads/sequences in either FASTQ or FASTA format\n";
	print "\n";
	print "### Other options [Optional]\n";
	print "  -h | -help\n";
	print "    Prints this help\n";
	print "--------------------------------- Trimming Options ---------------------------------\n";
	print "  -l | -leftTrimBases <Integer>\n";
	print "    Number of bases to be trimmed from left end (5' end)\n";
	print "    default: 0\n";
	print "  -r | -rightTrimBases <Integer>\n";
	print "    Number of bases to be trimmed from right end (3' end)\n";
	print "    default: 0\n";
	print "  -q | -qualCutOff <Integer> (Only for FASTQ files)\n";
	print "    Cut-off PHRED quality score for trimming reads from right end (3' end)\n";
	print "      For eg.: -q 20, will trim bases having PHRED quality score less than 20 at 3' end of the read\n";
	print "      Note: Quality trimming can be performed only if -l and -r are not used\n";
	print "    default: 0 (i.e. quality trimming is OFF)\n";
	print "  -n | -lenCutOff <Integer>\n";
	print "    Read length cut-off\n";
	print "    Reads shorter than given length will be discarded\n";
	print "    default: -1 (i.e. length filtering is OFF)\n";
	print "--------------------------------- Output Options ---------------------------------\n";
	print "  -o | -outputFile <Output file name>\n";
	print "    Output will be stored in the given file\n";
	print "    default: By default, output file will be stored where the input file is\n";
	print "\n";
}

sub prtError {
	my $msg = $_[0];
	print STDERR "+======================================================================+\n";
	printf STDERR "|%-70s|\n", "  Error:";
	printf STDERR "|%-70s|\n", "       $msg";
	print STDERR "+======================================================================+\n";
	prtUsage();
	exit;
}

sub prtUsage {
	print "\nUsage: perl $0 <options>\n";
	prtHelp();
}

sub checkFastQFormat {				# Takes FASTQ file as an input and if the format is incorrect it will print error and exit, otherwise it will return the number of lines in the file.
	my $file = $_[0];
	my $isVariantIdntfcntOn = $_[1];
	my $lines = 0;
	open(F, "<$file") or die "Can not open file $file\n";
	my $counter = 0;
	my $minVal = 1000;
	my $maxVal = 0;
	while(my $line = <F>) {
		$lines++;
		$counter++;
		next if($line =~ /^\n$/);
		if($counter == 1 && $line !~ /^\@/) {
			prtErrorExit("Invalid FASTQ file format.\n\t\tFile: $file");
		}
		if($counter == 3 && $line !~ /^\+/) {
			prtErrorExit("Invalid FASTQ file format.\n\t\tFile: $file");
		}
		if($counter == 4 && $lines < 1000000) {
			chomp $line;
			my @ASCII = unpack("C*", $line);
			$minVal = min(min(@ASCII), $minVal);
			$maxVal = max(max(@ASCII), $maxVal);
		}
		if($counter == 4) {
			$counter = 0;
		}
	}
	close(F);
	my $tseqFormat = 0;
	if($minVal >= 33 && $minVal <= 73 && $maxVal >= 33 && $maxVal <= 73) {
		$tseqFormat = 1;
	}
	elsif($minVal >= 66 && $minVal <= 105 && $maxVal >= 66 && $maxVal <= 105) {
		$tseqFormat = 4;			# Illumina 1.5+
	}
	elsif($minVal >= 64 && $minVal <= 105 && $maxVal >= 64 && $maxVal <= 105) {
		$tseqFormat = 3;			# Illumina 1.3+
	}
	elsif($minVal >= 59 && $minVal <= 105 && $maxVal >= 59 && $maxVal <= 105) {
		$tseqFormat = 2;			# Solexa
	}
	elsif($minVal >= 33 && $minVal <= 74 && $maxVal >= 33 && $maxVal <= 74) {
		$tseqFormat = 5;			# Illumina 1.8+
	}
	if($isVariantIdntfcntOn) {
		$seqFormat = $tseqFormat;
	}
	else {
		if($tseqFormat != $seqFormat) {
			print STDERR "Warning: It seems the specified variant of FASTQ doesn't match the quality values in input FASTQ files.\n";
		}
	}
	return $lines;
}