diff multiplicom_primer_trimming.pl @ 0:fadef644b886 draft

Uploaded
author geert-vandeweyer
date Fri, 22 May 2015 08:27:03 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/multiplicom_primer_trimming.pl	Fri May 22 08:27:03 2015 -0400
@@ -0,0 +1,537 @@
+#!/usr/bin/perl
+
+## needs twoBitToFa v285, or other versions supporting the -bed option.
+
+# load modules
+use Getopt::Std;
+
+##########
+## opts ##
+##########
+## input files
+# i : input fastq 1
+# I : input fastq 2 (if paired)
+# b : bed file with amplicon positions
+# r : read length (default 250)
+# o : output fastq1
+# O : output fastq2
+# F : failed readpairs
+# R : short report
+# t : 2bit file location
+# w : working directory (defaults to tmp)
+
+getopts('i:I:b:r:o:O:F:R:t:', \%opts) ;
+ 
+# check input values
+if (!exists($opts{'i'}) || !-e $opts{'i'}) {
+	die('Fastq File not found');
+}
+if (!exists($opts{'I'}) || !-e $opts{'I'}) {
+	die('FastQ for paired end reads not found');
+}
+if (!exists($opts{'o'})) {
+	die('No output file specified for forward reads');
+}
+if (!exists($opts{'O'})) {
+	die('No output file specified for reverse reads');
+}
+if (!exists($opts{'F'})) {
+	die('No output file specified for failed pairs');
+}
+
+if (!exists($opts{'b'}) || !-e $opts{'b'}) {
+	die('BED-File not found');
+}
+
+#if (exists($opts{'m'})) {
+#	$minmap = $opts{'m'};
+#}
+#else {
+#	$minmap = 3;
+#}
+if (exists($opts{'r'})) {
+	$readl = $opts{'r'};
+}
+else {
+	print "Assuming default read length of 2x250bp\n";
+	$readl = 250;
+}
+if (exists($opts{'t'})  && -e $opts{'t'}) {
+	$tobit = $opts{'t'};
+}
+else {
+	die("2BIT reference not found");
+}
+
+my $tbtf = `which twoBitToFa`;
+chomp($tbtf) ;
+if ($tbtf eq '') {
+	if (-e "/opt/software/bin/twoBitToFa") {
+		$tbtf = "/opt/software/bin/twoBitToFa";
+	}
+	else {
+		die("Could not find a twoBitToFa executable.\n");
+	}
+}
+
+
+# make output directory in (tmp) working dir
+if (exists($opts{'w'}) && -d $opts{'w'}) {
+	our $wd = $opts{'w'};
+}
+else {
+	our $wd = "/tmp/Trim.".int(rand(1000));
+	while (-d $wd) {
+		$wd = "/tmp/Trim.".int(rand(1000));
+	}
+	system("mkdir $wd");
+}
+#print "Using wd : $wd\n";
+
+
+## build sequence hash.
+my %alen = %flen = %rlen = ();
+open BED, $opts{'b'};
+open OUT, ">$wd/bedfile.zero.bed";
+my $minf = 999;
+my $minr = 999;
+while (<BED>) {
+	if ($_ !~ m/^(chr.{1,2}\t)(\d+)(\t.*)/) {
+		next;
+	}
+	chomp($_);
+	my @p = split(/\t/,$_);
+	my $fl = $p[6] - $p[1]; # p6 holds first non-primer position
+	my $rl = $p[2] - $p[7]; # p7 hold last non-primer position
+	## lengths  
+	$alen{"$p[0]:$p[1]-$p[2]"} = $p[7] - $p[6] + 1;
+	$flen{"F:$p[0]:$p[1]-$p[2]"} = $fl;
+	if ($fl < $minf) {
+		$minf = $fl;
+	}
+	if ($rl < $minr) {
+		$minr = $rl;
+	}
+	$rlen{"R:$p[0]:$p[1]-$p[2]"} = $rl;
+	print OUT "$p[0]\t".($p[1]-1)."\t".($p[6]-1)."\tF:$p[0]:$p[1]-$p[2]\n";
+	print OUT "$p[0]\t".$p[7]."\t".$p[2]."\tR:$p[0]:$p[1]-$p[2]\n";
+
+}
+close BED;
+close OUT;
+
+system("cd $wd && $tbtf -noMask -bed=bedfile.zero.bed $tobit amplicons.zero.fa");
+
+open IN, "$wd/amplicons.zero.fa";
+my %fseq = %rseq = ();
+my %rmm = %fmm = ();
+my @nts = ('A','C','T','G');
+
+while(<IN>) {
+	my $pr = $_;
+	my $seq = <IN>;
+	chomp($pr);
+	chomp($seq);
+	$pr = substr($pr,1);
+	if (substr($pr,0,1) eq 'F') {
+		$fseq{$pr} = $seq;
+		for ($i = 0; $i < 10; $i++) {
+			foreach(@nts) {
+				my $mut = substr($fseq{$pr},0,$i).$_.substr($rseq{$pr},$i+1,9-$i);
+				$fmm{$pr}{$mut} = $fseq{$pr};
+			}
+		}
+	}
+	else {
+		$rseq{$pr} = rc($seq);
+		for ($i = 0; $i< 10;$i++){
+			foreach(@nts) {
+				my $mut = substr($rseq{$pr},0,$i).$_.substr($rseq{$pr},$i+1,9-$i);
+				$rmm{$pr}{$mut} = $rseq{$pr};
+			}
+		}
+
+	}
+}
+close IN;
+
+###############################
+## generate smallest overlap F##
+###############################
+$ntf = $minf;
+BUILDMIN:
+my %fmin = ();
+my %fpairs = ();
+foreach( keys(%fseq)) {
+	my $sub = substr($fseq{$_},0,$ntf);
+	## clash => increase nt.
+	if (exists($fmin{$sub})) {
+		## check if not identical (yes, this is possible...) (same start + same length.)
+		$_ =~ m/F:chr(.{1,2}):(\d+)-(\d+)/;
+		my $cchr = $1;
+		my $cstart = $2;
+		my $cl = $flen{$_};
+		my @prev = split(/\|/,$fmin{$sub});
+		my $pprim = $prev[0];
+		$pprim =~ m/F:chr(.{1,2}):(\d+)-(\d+)/;
+		my $pchr = $1;
+		my $pstart = $2;
+		my $pl = $flen{$pprim};
+		if ("$cchr" ne "$pchr" || "$cstart" ne "$pstart" || "$cl" ne "$pl") {
+			$ntf++;
+			goto BUILDMIN;
+		}
+		else {
+			$fmin{$sub} .= "|$_";
+			my $rn = $_;
+			$rn =~ s/F:/R:/;
+			$fpairs{$_} .= "|$rn";
+		}
+	}
+	else {
+		$fmin{$sub} = $_;
+		my $rn = $_;
+		$rn =~ s/F:/R:/;
+		$fpairs{$_} = "$rn";
+
+	}
+}
+
+print "Minimal number of nucleotides needed for forward: $ntf\n";
+
+## allow one mismatch.
+my %mmhash =();
+foreach (keys(%fmin)) {
+	my $orig = $_;
+	for ($i = 0; $i< length($orig);$i++){
+		foreach(@nts) {
+			if ($_ eq substr($orig,$i,1)) {
+				$mmhash{$orig} = $orig;
+				next;
+			}
+			my $mut = substr($orig,0,$i).$_.substr($orig,$i+1);
+			## if in mmhash && not in $fmin => clash after mutation => delete from mmhash.
+			if (exists($mmhash{$mut}) ) {
+				if (!exists($fmin{$mut})) {
+					delete($mmhash{$mut});
+					next;
+				}
+				else {
+					## mut == original from oth primer => do not add reference.
+					next;
+				}
+			}
+			else {
+				$mmhash{$mut} = $orig;
+			}
+		}
+	}
+}
+			
+###############################
+## generate smallest overlap R##
+###############################
+$ntr = $minr;
+BUILDMINR:
+my %rmin = ();
+my %frairs = ();
+foreach( keys(%rseq)) {
+	my $sub = substr($rseq{$_},0,$ntr);
+	## clash => increase nt.
+	if (exists($rmin{$sub})) {
+		## check if not identical (yes, this is possible...) (same start + same length.)
+		$_ =~ m/R:chr(.{1,2}):(\d+)-(\d+)/;
+		my $cchr = $1;
+		my $cstart = $2;
+		my $cl = $rlen{$_};
+		my @prev = split(/\|/,$rmin{$sub});
+		my $pprim = $prev[0];
+		$pprim =~ m/R:chr(.{1,2}):(\d+)-(\d+)/;
+		my $pchr = $1;
+		my $pstart = $2;
+		my $pl = $Rlen{$pprim};
+		if ("$cchr" ne "$pchr" || "$cstart" ne "$pstart" || "$cl" ne "$pl") {
+			$ntr++;
+			goto BUILDMINR;
+		}
+		else {
+			$Rmin{$sub} .= "|$_";
+			my $fn = $_;
+			$fn =~ s/R:/F:/;
+			$rpairs{$_} .= "|$fn";
+		}
+	}
+	else {
+		$rmin{$sub} = $_;
+		my $fn = $_;
+		$fn =~ s/R:/F:/;
+		$rpairs{$_} = "$fn";
+
+	}
+}
+
+print "Minimal number of nucleotides needed for reverse: $ntr\n";
+
+## allow one mismatch.
+my %rmmhash =();
+foreach (keys(%rmin)) {
+	my $orig = $_;
+	for ($i = 0; $i< length($orig);$i++){
+		foreach(@nts) {
+			if ($_ eq substr($orig,$i,1)) {
+				$rmmhash{$orig} = $orig;
+				next;
+			}
+			my $mut = substr($orig,0,$i).$_.substr($orig,$i+1);
+			## if in mmhash && not in $fmin => clash after mutation => delete from mmhash.
+			if (exists($rmmhash{$mut}) ) {
+				if (!exists($rmin{$mut})) {
+					delete($rmmhash{$mut});
+					next;
+				}
+				else {
+					## mut == original from oth primer => do not add reference.
+					next;
+				}
+			}
+			else {
+				$rmmhash{$mut} = $orig;
+			}
+		}
+	}
+}
+			
+##########################
+## process fastq files. ##
+##########################
+open IN, $opts{'i'};						# open forward reads
+open INR, $opts{'I'};						# open reverse reads
+open OUTF, ">".$opts{'o'};					# where to put trimmed forward reads
+open OUTR, ">".$opts{'O'};					## where to put trimmed Reverse reads
+open FAIL, ">$opts{'F'}"; #$wd/failed.interlaced.fq";				# keep non-matches to seperate file => for debug?
+my $outf = $outr = $failout ='';					# buffer output to speedup I/O
+my $count = $failcount = 0 ;					# track output buffer
+my $nrmissed = $nrfound = 0;					# statistics
+my $foundboth = $byf = $byr = $toolongf = $toolongr = 0;	# statistics
+my $bptrimmed = $totalbp = 0;					# statistics
+while (my $r1 = <IN>) {
+	# read in forward
+	my $s1 = <IN>;
+	chomp($s1);
+	$totalbp += length($s1);
+	my $d = <IN>;
+	my $q1 = <IN>;
+	chomp($q1);
+	# read in reverse
+	my $r2 = <INR>;
+	my $s2 = <INR>;
+	chomp($s2);
+	$totalbp += length($s2);
+	my $d = <INR>;
+	my $q2 = <INR>;
+	chomp($q2);
+	## the seeds : first positions of reads, lenght is determined above.
+	my $fseed = substr($s1,0,$ntf);
+	my $rseed = substr($s2,0,$ntr);
+	## check if the seed exists in the "mutated" forward primer list	
+	if (!exists($mmhash{$fseed})) {
+		## not found : try the reverse primers.
+		if (!exists($rmmhash{$rseed})) {
+			# not found either : print out to failed reads in interlaced format
+			$nrmissed++;
+			$failout .= "$r1$s1\n+\n$q1\n$r2$s2\n+\n$q2\n";
+			$failcount++;
+			if ($failcount > 50000) { 
+				chomp($failout);
+				print FAIL $failout; 
+				$failout = "\n";
+				$failcount = 0;
+			}
+			next;
+		}
+		## trim reverse 5'
+		$s2 = substr($s2,$rlen{$rmin{$rmmhash{$rseed}}});  # from position *length of reverse primer* : rseed points to the original primer in the rmm hash
+		$q2 = substr($q2,$rlen{$rmin{$rmmhash{$rseed}}});
+		## statistics
+		$bptrimmed += $rlen{$rmin{$rmmhash{$rseed}}};
+		## trim if readlength > amplicon size (without primers)
+		if ($readl > $rlen{$rmin{$rmmhash{$rseed}}} + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) {
+			# trim to alength (incorrect if indels !)
+			# statistics
+			$toolongr++;
+			$bptrimmed += length($s2) - $alen{substr($rmin{$rmmhash{$rseed}},2)};
+			## 5' primer is already trimmed. This removes fragments of 3' primer.
+			$s2 = substr($s2,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});
+			$q2 = substr($q2,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});
+		}
+		$byr++;
+		## trim forward 5'
+		my @fps = split(/\|/,$rpairs{$rmin{$rmmhash{$rseed}}});
+		my $forok = 0;
+		my $forl = 0;
+		foreach(@fps) {
+			if (exists($fmm{$_}{substr($s1,0,10)})) {
+				$s1 = substr($s1,$flen{$_});
+				$q1 = substr($q1,$flen{$_});
+				#statistics
+				$bptrimmed += $flen{$_};
+				if ($readl > $flen{$_} + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) {
+					#  trim to alength (incorrect if indels !)
+					# statistics
+					$toolongf++;
+					$bptrimmed += length($s1) - $alen{substr($rmin{$rmmhash{$rseed}},2)};
+					
+					$s1 = substr($s1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});
+					$q1 = substr($q1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});
+				}
+
+				$forok++;
+				$foundboth++;
+				last;
+			}
+			else {
+				if ($forl < $flen{$_}) {
+					$forl = $flen{$_};
+				}
+			}
+		}
+		if ($forok == 0) {
+			## trim by max length of should be forwards.
+			$s1 = substr($s1,$forl);
+			$q1 = substr($q1,$forl);
+			# statistics
+			$bptrimmed += $forl; 
+			if ($readl > $forl + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) {
+				# trim to alength (incorrect if indels !)
+				# statistics
+				$toolongf++;
+				$bptrimmed += length($s1) - $alen{substr($rmin{$rmmhash{$rseed}},2)};
+				$s1 = substr($s1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});
+				$q1 = substr($q1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)});
+			}
+
+		}
+		$nrfound++;
+	}
+	else {
+		## trim forward 5'
+		$s1 = substr($s1,$flen{$fmin{$mmhash{$fseed}}});
+		$q1 = substr($q1,$flen{$fmin{$mmhash{$fseed}}});
+		# statistics
+		$bptrimmed += $flen{$fmin{$mmhash{$fseed}}};
+
+		if ($readl > $flen{$fmin{$mmhash{$fseed}}} + $alen{substr($fmin{$mmhash{$fseed}},2)}) {
+			#  trim to alength (incorrect if indels !)
+			# statistics
+			$toolongf++;
+			$bptrimmed += length($s1) - $alen{substr($fmin{$mmhash{$fseed}},2)};
+
+			$s1 = substr($s1,0,$alen{substr($fmin{$mmhash{$fseed}},2)});
+			$q1 = substr($q1,0,$alen{substr($fmin{$mmhash{$fseed}},2)});
+		}
+		$byf++;
+		## trim reverse 5' 
+		my @rps = split(/\|/,$fpairs{$fmin{$mmhash{$fseed}}});
+		$revok = 0;
+		my $revl = 0;
+		foreach(@rps) {
+			if (exists($rmm{$_}{substr($s2,0,10)})) {
+				$s2 = substr($s2,$rlen{$_});
+				$q2 = substr($q2,$rlen{$_});
+				# statistics
+				$bptrimmed += $rlen{$_};
+				if ($readl > $rlen{$_} + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) {
+					#  trim to alength (incorrect if indels !)
+					# statistics
+					$toolongr++;
+					$bptrimmed += length($s2) - $alen{substr($fmin{$mmhash{$fseed}},2)};
+
+					$s2 = substr($s2,0,$alen{substr($fmin{$mmhash{$fseed}},2)});
+					$q2 = substr($q2,0,$alen{substr($fmin{$mmhash{$fseed}},2)});
+				}
+
+				$revok++;
+				$foundboth++;
+				last;
+			}
+			else {
+				if ($revl < $rlen{$_}) {
+					$revl = $rlen{$_};
+				}	
+			}
+		}
+		if ($revok == 0) {
+			# trim by max length of should be reverses.
+			$s2 = substr($s2,$revl);
+			$q2 = substr($q2,$revl);
+			# statistics
+			$bptrimmed += $revl;
+			if ($readl > $revl + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) {
+				# trim to alength (incorrect if indels !)
+				# statistics
+				$toolongr++;
+				$bptrimmed += length($s2) - $alen{substr($fmin{$mmhash{$fseed}},2)};
+				$s2 = substr($s2,0,$alen{substr($fmin{$mmhash{$fseed}},2)});
+				$q2 = substr($q2,0,$alen{substr($fmin{$mmhash{$fseed}},2)});
+			}
+
+		}
+		$nrfound++;
+	}
+	$outf .= "$r1$s1\n+\n$q1\n";
+	$outr .= "$r2$s2\n+\n$q2\n";
+	$count++;
+	if ($count > 100000) {
+		print OUTF $outf;
+		print OUTR $outr; 
+	
+		$outf = "\n";
+		$outr = "\n";
+		$count = 0;
+	}
+
+	
+}
+chomp($outf);
+chomp($outr);
+chomp($failout);
+print OUTF $outf;
+print OUTR $outr; 
+print FAIL $failout;
+close IN;
+close INR;
+close OUTF;
+close OUTR;
+close FAIL;
+open REPORT, ">$opts{'R'}" or die ("Could not open report file");
+print REPORT "Results: \n";
+print REPORT "########\n";
+print REPORT " Read pairs without match: $nrmissed\n";
+print REPORT " Read pairs with a valid match: $nrfound\n";
+print REPORT " Initial match on Forward: $byf\n";
+print REPORT " Initial match on Reverse: $byr\n";
+print REPORT " Both F and R Matched: $foundboth\n";
+print REPORT " Forward reads trimmed to amplicon length: $toolongf\n";
+print REPORT " Reverse reads trimmed to amplicon length: $toolongr\n";	
+print REPORT " Total basepairs in fastq files: $totalbp\n";
+print REPORT " Total basepairs trimmed: $bptrimmed (".sprintf("%.2f",($bptrimmed/$totalbp)*100)."%)\n"; 
+close REPORT;
+
+
+
+
+
+
+if (!exists($opts{'w'})) {
+	## clean up
+	system("rm -Rf $wd");
+}
+
+
+sub rc {
+	my $seq = shift;
+	$seq =~ tr/ACGT/TGCA/;
+	$seq = reverse($seq);
+	return $seq;
+
+}