Mercurial > repos > geert-vandeweyer > multiplicom_primer_trimming

#!/usr/bin/perl

## needs twoBitToFa v285, or other versions supporting the -bed option.

# load modules
use Getopt::Std;

##########
## opts ##
##########
## input files
# i : input fastq 1
# I : input fastq 2 (if paired)
# b : bed file with amplicon positions
# r : read length (default 250)
# o : output fastq1
# O : output fastq2
# F : failed readpairs
# R : short report
# t : 2bit file location
# w : working directory (defaults to tmp)

getopts('i:I:b:r:o:O:F:R:t:', \%opts) ;

# check input values
if (!exists($opts{'i'}) || !-e $opts{'i'}) {
	die('Fastq File not found');
}
if (!exists($opts{'I'}) || !-e $opts{'I'}) {
	die('FastQ for paired end reads not found');
}
if (!exists($opts{'o'})) {
	die('No output file specified for forward reads');
}
if (!exists($opts{'O'})) {
	die('No output file specified for reverse reads');
}
if (!exists($opts{'F'})) {
	die('No output file specified for failed pairs');
}

if (!exists($opts{'b'}) || !-e $opts{'b'}) {
	die('BED-File not found');
}

#if (exists($opts{'m'})) {
#	$minmap = $opts{'m'};
#}
#else {
#	$minmap = 3;
#}
if (exists($opts{'r'})) {
	$readl = $opts{'r'};
}
else {
	print "Assuming default read length of 2x250bp\n";
	$readl = 250;
}
if (exists($opts{'t'})  && -e $opts{'t'}) {
	$tobit = $opts{'t'};
}
else {
	die("2BIT reference not found");
}

my $tbtf = `which twoBitToFa`;
chomp($tbtf) ;
if ($tbtf eq '') {
	if (-e "/opt/software/bin/twoBitToFa") {
		$tbtf = "/opt/software/bin/twoBitToFa";
	}
	else {
		die("Could not find a twoBitToFa executable.\n");
	}
}


# make output directory in (tmp) working dir
if (exists($opts{'w'}) && -d $opts{'w'}) {
	our $wd = $opts{'w'};
}
else {
	our $wd = "/tmp/Trim.".int(rand(1000));
	while (-d $wd) {
		$wd = "/tmp/Trim.".int(rand(1000));
	}
	system("mkdir $wd");
}
#print "Using wd : $wd\n";


## build sequence hash.
my %alen = %flen = %rlen = ();
open BED, $opts{'b'};
open OUT, ">$wd/bedfile.zero.bed";
my $minf = 999;
my $minr = 999;
while (<BED>) {
	if ($_ !~ m/^(chr.{1,2}\t)(\d+)(\t.*)/) {
		next;
	}
	chomp($_);
	my @p = split(/\t/,$_);
	my $fl = $p[6] - $p[1]; # p6 holds first non-primer position
	my $rl = $p[2] - $p[7]; # p7 hold last non-primer position
	## lengths
	$alen{"$p[0]:$p[1]-$p[2]"} = $p[7] - $p[6] + 1;
	$flen{"F:$p[0]:$p[1]-$p[2]"} = $fl;
	if ($fl < $minf) {
		$minf = $fl;
	}
	if ($rl < $minr) {
		$minr = $rl;
	}
	$rlen{"R:$p[0]:$p[1]-$p[2]"} = $rl;
	print OUT "$p[0]\t".($p[1]-1)."\t".($p[6]-1)."\tF:$p[0]:$p[1]-$p[2]\n";
	print OUT "$p[0]\t".$p[7]."\t".$p[2]."\tR:$p[0]:$p[1]-$p[2]\n";

}
close BED;
close OUT;

system("cd $wd && $tbtf -noMask -bed=bedfile.zero.bed $tobit amplicons.zero.fa");

open IN, "$wd/amplicons.zero.fa";
my %fseq = %rseq = ();
my %rmm = %fmm = ();
my @nts = ('A','C','T','G');

while(<IN>) {
	my $pr = $_;
	my $seq = <IN>;
	chomp($pr);
	chomp($seq);
	$pr = substr($pr,1);
	if (substr($pr,0,1) eq 'F') {
		$fseq{$pr} = $seq;
		for ($i = 0; $i < 10; $i++) {
			foreach(@nts) {
				my $mut = substr($fseq{$pr},0,$i).$_.substr($rseq{$pr},$i+1,9-$i);
				$fmm{$pr}{$mut} = $fseq{$pr};
			}
		}
	}
	else {
		$rseq{$pr} = rc($seq);
		for ($i = 0; $i< 10;$i++){
			foreach(@nts) {
				my $mut = substr($rseq{$pr},0,$i).$_.substr($rseq{$pr},$i+1,9-$i);
				$rmm{$pr}{$mut} = $rseq{$pr};
			}
		}

	}
}
close IN;

###############################
## generate smallest overlap F##
###############################
$ntf = $minf;
BUILDMIN:
my %fmin = ();
my %fpairs = ();
foreach( keys(%fseq)) {
	my $sub = substr($fseq{$_},0,$ntf);
	## clash => increase nt.
	if (exists($fmin{$sub})) {
		## check if not identical (yes, this is possible...) (same start + same length.)
		$_ =~ m/F:chr(.{1,2}):(\d+)-(\d+)/;
		my $cchr = $1;
		my $cstart = $2;
		my $cl = $flen{$_};
		my @prev = split(/\|/,$fmin{$sub});
		my $pprim = $prev[0];
		$pprim =~ m/F:chr(.{1,2}):(\d+)-(\d+)/;
		my $pchr = $1;
		my $pstart = $2;
		my $pl = $flen{$pprim};
		if ("$cchr" ne "$pchr" || "$cstart" ne "$pstart" || "$cl" ne "$pl") {
			$ntf++;
			goto BUILDMIN;
		}
		else {
			$fmin{$sub} .= "|$_";
			my $rn = $_;
			$rn =~ s/F:/R:/;
			$fpairs{$_} .= "|$rn";
		}
	}
	else {
		$fmin{$sub} = $_;
		my $rn = $_;
		$rn =~ s/F:/R:/;
		$fpairs{$_} = "$rn";

	}
}

print "Minimal number of nucleotides needed for forward: $ntf\n";

## allow one mismatch.
my %mmhash =();
foreach (keys(%fmin)) {
	my $orig = $_;
	for ($i = 0; $i< length($orig);$i++){
		foreach(@nts) {
			if ($_ eq substr($orig,$i,1)) {
				$mmhash{$orig} = $orig;
				next;
			}
			my $mut = substr($orig,0,$i).$_.substr($orig,$i+1);
			## if in mmhash && not in $fmin => clash after mutation => delete from mmhash.
			if (exists($mmhash{$mut}) ) {
				if (!exists($fmin{$mut})) {
					delete($mmhash{$mut});
					next;
				}
				else {
					## mut == original from oth primer => do not add reference.
					next;
				}
			}
			else {
				$mmhash{$mut} = $orig;
			}
		}
	}
}

###############################
## generate smallest overlap R##
###############################
$ntr = $minr;
BUILDMINR:
my %rmin = ();
my %frairs = ();
foreach( keys(%rseq)) {
	my $sub = substr($rseq{$_},0,$ntr);
	## clash => increase nt.
	if (exists($rmin{$sub})) {
		## check if not identical (yes, this is possible...) (same start + same length.)
		$_ =~ m/R:chr(.{1,2}):(\d+)-(\d+)/;
		my $cchr = $1;
		my $cstart = $2;
		my $cl = $rlen{$_};
		my @prev = split(/\|/,$rmin{$sub});
		my $pprim = $prev[0];
		$pprim =~ m/R:chr(.{1,2}):(\d+)-(\d+)/;
		my $pchr = $1;
		my $pstart = $2;
		my $pl = $Rlen{$pprim};
		if ("$cchr" ne "$pchr" || "$cstart" ne "$pstart" || "$cl" ne "$pl") {
			$ntr++;
			goto BUILDMINR;
		}
		else {
			$Rmin{$sub} .= "|$_";
			my $fn = $_;
			$fn =~ s/R:/F:/;
			$rpairs{$_} .= "|$fn";
		}
	}
	else {
		$rmin{$sub} = $_;
		my $fn = $_;
		$fn =~ s/R:/F:/;
		$rpairs{$_} = "$fn";

	}
}

print "Minimal number of nucleotides needed for reverse: $ntr\n";

## allow one mismatch.
my %rmmhash =();
foreach (keys(%rmin)) {
	my $orig = $_;
	for ($i = 0; $i< length($orig);$i++){
		foreach(@nts) {
			if ($_ eq substr($orig,$i,1)) {
				$rmmhash{$orig} = $orig;
				next;
			}
			my $mut = substr($orig,0,$i).$_.substr($orig,$i+1);
			## if in mmhash && not in $fmin => clash after mutation => delete from mmhash.
			if (exists($rmmhash{$mut}) ) {
				if (!exists($rmin{$mut})) {
					delete($rmmhash{$mut});
					next;
				}
				else {
					## mut == original from oth primer => do not add reference.
					next;
				}
			}
			else {
				$rmmhash{$mut} = $orig;
			}
		}
	}
}

##########################
## process fastq files. ##
##########################
open IN, $opts{'i'};						# open forward reads
open INR, $opts{'I'};						# open reverse reads
open OUTF, ">".$opts{'o'};					# where to put trimmed forward reads
open OUTR, ">".$opts{'O'};					## where to put trimmed Reverse reads
open FAIL, ">$opts{'F'}"; #$wd/failed.interlaced.fq";				# keep non-matches to seperate file => for debug?
my $outf = $outr = $failout ='';					# buffer output to speedup I/O
my $count = $failcount = 0 ;					# track output buffer
my $nrmissed = $nrfound = 0;					# statistics
my $foundboth = $byf = $byr = $toolongf = $toolongr = 0;	# statistics
my $bptrimmed = $totalbp = 0;					# statistics
while (my $r1 = <IN>) {
	# read in forward
	my $s1 = <IN>;
	chomp($s1);
	$totalbp += length($s1);
	my $d = <IN>;
	my $q1 = <IN>;
	chomp($q1);
	# read in reverse
	my $r2 = <INR>;
	my $s2 = <INR>;
	chomp($s2);
	$totalbp += length($s2);
	my $d = <INR>;
	my $q2 = <INR>;
	chomp($q2);
	## the seeds : first positions of reads, lenght is determined above.
	my $fseed = substr($s1,0,$ntf);
	my $rseed = substr($s2,0,$ntr);
	## check if the seed exists in the "mutated" forward primer list
	if (!exists($mmhash{$fseed})) {
		## not found : try the reverse primers.
		if (!exists($rmmhash{$rseed})) {
			# not found either : print out to failed reads in interlaced format
			$nrmissed++;
			$failout .= "$r1$s1\n+\n$q1\n$r2$s2\n+\n$q2\n";
			$failcount++;
			if ($failcount > 50000) {
				chomp($failout);
				print FAIL $failout;
				$failout = "\n";
				$failcount = 0;
			}
			next;
		}
		## trim reverse 5'
		$s2 = substr($s2,$rlen{$rmin{$rmmhash{$rseed}}});  # from position *length of reverse primer* : rseed points to the original primer in the rmm hash
		$q2 = substr($q2,$rlen{$rmin{$rmmhash{$rseed}}});
		## statistics
		$bptrimmed += $rlen{$rmin{$rmmhash{$rseed}}};
		## trim if readlength > amplicon size (without primers)
		if ($readl > $rlen{$rmin{$rmmhash{$rseed}}} + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) {
			# trim to alength (incorrect if indels !)
			# statistics
			$toolongr++;
			$bptrimmed += length($s2) - $alen{substr($rmin{$rmmhash{$rseed}},2)};
			## 5' primer is already trimmed. This removes fragments of 3' primer.
			$s2 = substr($s2,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});
			$q2 = substr($q2,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});
		}
		$byr++;
		## trim forward 5'
		my @fps = split(/\|/,$rpairs{$rmin{$rmmhash{$rseed}}});
		my $forok = 0;
		my $forl = 0;
		foreach(@fps) {
			if (exists($fmm{$_}{substr($s1,0,10)})) {
				$s1 = substr($s1,$flen{$_});
				$q1 = substr($q1,$flen{$_});
				#statistics
				$bptrimmed += $flen{$_};
				if ($readl > $flen{$_} + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) {
					#  trim to alength (incorrect if indels !)
					# statistics
					$toolongf++;
					$bptrimmed += length($s1) - $alen{substr($rmin{$rmmhash{$rseed}},2)};

					$s1 = substr($s1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});
					$q1 = substr($q1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});
				}

				$forok++;
				$foundboth++;
				last;
			}
			else {
				if ($forl < $flen{$_}) {
					$forl = $flen{$_};
				}
			}
		}
		if ($forok == 0) {
			## trim by max length of should be forwards.
			$s1 = substr($s1,$forl);
			$q1 = substr($q1,$forl);
			# statistics
			$bptrimmed += $forl;
			if ($readl > $forl + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) {
				# trim to alength (incorrect if indels !)
				# statistics
				$toolongf++;
				$bptrimmed += length($s1) - $alen{substr($rmin{$rmmhash{$rseed}},2)};
				$s1 = substr($s1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});
				$q1 = substr($q1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});
			}

		}
		$nrfound++;
	}
	else {
		## trim forward 5'
		$s1 = substr($s1,$flen{$fmin{$mmhash{$fseed}}});
		$q1 = substr($q1,$flen{$fmin{$mmhash{$fseed}}});
		# statistics
		$bptrimmed += $flen{$fmin{$mmhash{$fseed}}};

		if ($readl > $flen{$fmin{$mmhash{$fseed}}} + $alen{substr($fmin{$mmhash{$fseed}},2)}) {
			#  trim to alength (incorrect if indels !)
			# statistics
			$toolongf++;
			$bptrimmed += length($s1) - $alen{substr($fmin{$mmhash{$fseed}},2)};

			$s1 = substr($s1,0,$alen{substr($fmin{$mmhash{$fseed}},2)});
			$q1 = substr($q1,0,$alen{substr($fmin{$mmhash{$fseed}},2)});
		}
		$byf++;
		## trim reverse 5'
		my @rps = split(/\|/,$fpairs{$fmin{$mmhash{$fseed}}});
		$revok = 0;
		my $revl = 0;
		foreach(@rps) {
			if (exists($rmm{$_}{substr($s2,0,10)})) {
				$s2 = substr($s2,$rlen{$_});
				$q2 = substr($q2,$rlen{$_});
				# statistics
				$bptrimmed += $rlen{$_};
				if ($readl > $rlen{$_} + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) {
					#  trim to alength (incorrect if indels !)
					# statistics
					$toolongr++;
					$bptrimmed += length($s2) - $alen{substr($fmin{$mmhash{$fseed}},2)};

					$s2 = substr($s2,0,$alen{substr($fmin{$mmhash{$fseed}},2)});
					$q2 = substr($q2,0,$alen{substr($fmin{$mmhash{$fseed}},2)});
				}

				$revok++;
				$foundboth++;
				last;
			}
			else {
				if ($revl < $rlen{$_}) {
					$revl = $rlen{$_};
				}
			}
		}
		if ($revok == 0) {
			# trim by max length of should be reverses.
			$s2 = substr($s2,$revl);
			$q2 = substr($q2,$revl);
			# statistics
			$bptrimmed += $revl;
			if ($readl > $revl + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) {
				# trim to alength (incorrect if indels !)
				# statistics
				$toolongr++;
				$bptrimmed += length($s2) - $alen{substr($fmin{$mmhash{$fseed}},2)};
				$s2 = substr($s2,0,$alen{substr($fmin{$mmhash{$fseed}},2)});
				$q2 = substr($q2,0,$alen{substr($fmin{$mmhash{$fseed}},2)});
			}

		}
		$nrfound++;
	}
	$outf .= "$r1$s1\n+\n$q1\n";
	$outr .= "$r2$s2\n+\n$q2\n";
	$count++;
	if ($count > 100000) {
		print OUTF $outf;
		print OUTR $outr;

		$outf = "\n";
		$outr = "\n";
		$count = 0;
	}


}
chomp($outf);
chomp($outr);
chomp($failout);
print OUTF $outf;
print OUTR $outr;
print FAIL $failout;
close IN;
close INR;
close OUTF;
close OUTR;
close FAIL;
open REPORT, ">$opts{'R'}" or die ("Could not open report file");
print REPORT "Results: \n";
print REPORT "########\n";
print REPORT " Read pairs without match: $nrmissed\n";
print REPORT " Read pairs with a valid match: $nrfound\n";
print REPORT " Initial match on Forward: $byf\n";
print REPORT " Initial match on Reverse: $byr\n";
print REPORT " Both F and R Matched: $foundboth\n";
print REPORT " Forward reads trimmed to amplicon length: $toolongf\n";
print REPORT " Reverse reads trimmed to amplicon length: $toolongr\n";
print REPORT " Total basepairs in fastq files: $totalbp\n";
print REPORT " Total basepairs trimmed: $bptrimmed (".sprintf("%.2f",($bptrimmed/$totalbp)*100)."%)\n";
close REPORT;


if (!exists($opts{'w'})) {
	## clean up
	system("rm -Rf $wd");
}


sub rc {
	my $seq = shift;
	$seq =~ tr/ACGT/TGCA/;
	$seq = reverse($seq);
	return $seq;

}
author	geert-vandeweyer
date	Fri, 22 May 2015 08:27:03 -0400
parents
children