Mercurial > repos > geert-vandeweyer > multiplicom_primer_trimming
diff multiplicom_primer_trimming.pl @ 0:fadef644b886 draft
Uploaded
author | geert-vandeweyer |
---|---|
date | Fri, 22 May 2015 08:27:03 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multiplicom_primer_trimming.pl Fri May 22 08:27:03 2015 -0400 @@ -0,0 +1,537 @@ +#!/usr/bin/perl + +## needs twoBitToFa v285, or other versions supporting the -bed option. + +# load modules +use Getopt::Std; + +########## +## opts ## +########## +## input files +# i : input fastq 1 +# I : input fastq 2 (if paired) +# b : bed file with amplicon positions +# r : read length (default 250) +# o : output fastq1 +# O : output fastq2 +# F : failed readpairs +# R : short report +# t : 2bit file location +# w : working directory (defaults to tmp) + +getopts('i:I:b:r:o:O:F:R:t:', \%opts) ; + +# check input values +if (!exists($opts{'i'}) || !-e $opts{'i'}) { + die('Fastq File not found'); +} +if (!exists($opts{'I'}) || !-e $opts{'I'}) { + die('FastQ for paired end reads not found'); +} +if (!exists($opts{'o'})) { + die('No output file specified for forward reads'); +} +if (!exists($opts{'O'})) { + die('No output file specified for reverse reads'); +} +if (!exists($opts{'F'})) { + die('No output file specified for failed pairs'); +} + +if (!exists($opts{'b'}) || !-e $opts{'b'}) { + die('BED-File not found'); +} + +#if (exists($opts{'m'})) { +# $minmap = $opts{'m'}; +#} +#else { +# $minmap = 3; +#} +if (exists($opts{'r'})) { + $readl = $opts{'r'}; +} +else { + print "Assuming default read length of 2x250bp\n"; + $readl = 250; +} +if (exists($opts{'t'}) && -e $opts{'t'}) { + $tobit = $opts{'t'}; +} +else { + die("2BIT reference not found"); +} + +my $tbtf = `which twoBitToFa`; +chomp($tbtf) ; +if ($tbtf eq '') { + if (-e "/opt/software/bin/twoBitToFa") { + $tbtf = "/opt/software/bin/twoBitToFa"; + } + else { + die("Could not find a twoBitToFa executable.\n"); + } +} + + +# make output directory in (tmp) working dir +if (exists($opts{'w'}) && -d $opts{'w'}) { + our $wd = $opts{'w'}; +} +else { + our $wd = "/tmp/Trim.".int(rand(1000)); + while (-d $wd) { + $wd = "/tmp/Trim.".int(rand(1000)); + } + system("mkdir $wd"); +} +#print "Using wd : $wd\n"; + + +## build sequence hash. +my %alen = %flen = %rlen = (); +open BED, $opts{'b'}; +open OUT, ">$wd/bedfile.zero.bed"; +my $minf = 999; +my $minr = 999; +while (<BED>) { + if ($_ !~ m/^(chr.{1,2}\t)(\d+)(\t.*)/) { + next; + } + chomp($_); + my @p = split(/\t/,$_); + my $fl = $p[6] - $p[1]; # p6 holds first non-primer position + my $rl = $p[2] - $p[7]; # p7 hold last non-primer position + ## lengths + $alen{"$p[0]:$p[1]-$p[2]"} = $p[7] - $p[6] + 1; + $flen{"F:$p[0]:$p[1]-$p[2]"} = $fl; + if ($fl < $minf) { + $minf = $fl; + } + if ($rl < $minr) { + $minr = $rl; + } + $rlen{"R:$p[0]:$p[1]-$p[2]"} = $rl; + print OUT "$p[0]\t".($p[1]-1)."\t".($p[6]-1)."\tF:$p[0]:$p[1]-$p[2]\n"; + print OUT "$p[0]\t".$p[7]."\t".$p[2]."\tR:$p[0]:$p[1]-$p[2]\n"; + +} +close BED; +close OUT; + +system("cd $wd && $tbtf -noMask -bed=bedfile.zero.bed $tobit amplicons.zero.fa"); + +open IN, "$wd/amplicons.zero.fa"; +my %fseq = %rseq = (); +my %rmm = %fmm = (); +my @nts = ('A','C','T','G'); + +while(<IN>) { + my $pr = $_; + my $seq = <IN>; + chomp($pr); + chomp($seq); + $pr = substr($pr,1); + if (substr($pr,0,1) eq 'F') { + $fseq{$pr} = $seq; + for ($i = 0; $i < 10; $i++) { + foreach(@nts) { + my $mut = substr($fseq{$pr},0,$i).$_.substr($rseq{$pr},$i+1,9-$i); + $fmm{$pr}{$mut} = $fseq{$pr}; + } + } + } + else { + $rseq{$pr} = rc($seq); + for ($i = 0; $i< 10;$i++){ + foreach(@nts) { + my $mut = substr($rseq{$pr},0,$i).$_.substr($rseq{$pr},$i+1,9-$i); + $rmm{$pr}{$mut} = $rseq{$pr}; + } + } + + } +} +close IN; + +############################### +## generate smallest overlap F## +############################### +$ntf = $minf; +BUILDMIN: +my %fmin = (); +my %fpairs = (); +foreach( keys(%fseq)) { + my $sub = substr($fseq{$_},0,$ntf); + ## clash => increase nt. + if (exists($fmin{$sub})) { + ## check if not identical (yes, this is possible...) (same start + same length.) + $_ =~ m/F:chr(.{1,2}):(\d+)-(\d+)/; + my $cchr = $1; + my $cstart = $2; + my $cl = $flen{$_}; + my @prev = split(/\|/,$fmin{$sub}); + my $pprim = $prev[0]; + $pprim =~ m/F:chr(.{1,2}):(\d+)-(\d+)/; + my $pchr = $1; + my $pstart = $2; + my $pl = $flen{$pprim}; + if ("$cchr" ne "$pchr" || "$cstart" ne "$pstart" || "$cl" ne "$pl") { + $ntf++; + goto BUILDMIN; + } + else { + $fmin{$sub} .= "|$_"; + my $rn = $_; + $rn =~ s/F:/R:/; + $fpairs{$_} .= "|$rn"; + } + } + else { + $fmin{$sub} = $_; + my $rn = $_; + $rn =~ s/F:/R:/; + $fpairs{$_} = "$rn"; + + } +} + +print "Minimal number of nucleotides needed for forward: $ntf\n"; + +## allow one mismatch. +my %mmhash =(); +foreach (keys(%fmin)) { + my $orig = $_; + for ($i = 0; $i< length($orig);$i++){ + foreach(@nts) { + if ($_ eq substr($orig,$i,1)) { + $mmhash{$orig} = $orig; + next; + } + my $mut = substr($orig,0,$i).$_.substr($orig,$i+1); + ## if in mmhash && not in $fmin => clash after mutation => delete from mmhash. + if (exists($mmhash{$mut}) ) { + if (!exists($fmin{$mut})) { + delete($mmhash{$mut}); + next; + } + else { + ## mut == original from oth primer => do not add reference. + next; + } + } + else { + $mmhash{$mut} = $orig; + } + } + } +} + +############################### +## generate smallest overlap R## +############################### +$ntr = $minr; +BUILDMINR: +my %rmin = (); +my %frairs = (); +foreach( keys(%rseq)) { + my $sub = substr($rseq{$_},0,$ntr); + ## clash => increase nt. + if (exists($rmin{$sub})) { + ## check if not identical (yes, this is possible...) (same start + same length.) + $_ =~ m/R:chr(.{1,2}):(\d+)-(\d+)/; + my $cchr = $1; + my $cstart = $2; + my $cl = $rlen{$_}; + my @prev = split(/\|/,$rmin{$sub}); + my $pprim = $prev[0]; + $pprim =~ m/R:chr(.{1,2}):(\d+)-(\d+)/; + my $pchr = $1; + my $pstart = $2; + my $pl = $Rlen{$pprim}; + if ("$cchr" ne "$pchr" || "$cstart" ne "$pstart" || "$cl" ne "$pl") { + $ntr++; + goto BUILDMINR; + } + else { + $Rmin{$sub} .= "|$_"; + my $fn = $_; + $fn =~ s/R:/F:/; + $rpairs{$_} .= "|$fn"; + } + } + else { + $rmin{$sub} = $_; + my $fn = $_; + $fn =~ s/R:/F:/; + $rpairs{$_} = "$fn"; + + } +} + +print "Minimal number of nucleotides needed for reverse: $ntr\n"; + +## allow one mismatch. +my %rmmhash =(); +foreach (keys(%rmin)) { + my $orig = $_; + for ($i = 0; $i< length($orig);$i++){ + foreach(@nts) { + if ($_ eq substr($orig,$i,1)) { + $rmmhash{$orig} = $orig; + next; + } + my $mut = substr($orig,0,$i).$_.substr($orig,$i+1); + ## if in mmhash && not in $fmin => clash after mutation => delete from mmhash. + if (exists($rmmhash{$mut}) ) { + if (!exists($rmin{$mut})) { + delete($rmmhash{$mut}); + next; + } + else { + ## mut == original from oth primer => do not add reference. + next; + } + } + else { + $rmmhash{$mut} = $orig; + } + } + } +} + +########################## +## process fastq files. ## +########################## +open IN, $opts{'i'}; # open forward reads +open INR, $opts{'I'}; # open reverse reads +open OUTF, ">".$opts{'o'}; # where to put trimmed forward reads +open OUTR, ">".$opts{'O'}; ## where to put trimmed Reverse reads +open FAIL, ">$opts{'F'}"; #$wd/failed.interlaced.fq"; # keep non-matches to seperate file => for debug? +my $outf = $outr = $failout =''; # buffer output to speedup I/O +my $count = $failcount = 0 ; # track output buffer +my $nrmissed = $nrfound = 0; # statistics +my $foundboth = $byf = $byr = $toolongf = $toolongr = 0; # statistics +my $bptrimmed = $totalbp = 0; # statistics +while (my $r1 = <IN>) { + # read in forward + my $s1 = <IN>; + chomp($s1); + $totalbp += length($s1); + my $d = <IN>; + my $q1 = <IN>; + chomp($q1); + # read in reverse + my $r2 = <INR>; + my $s2 = <INR>; + chomp($s2); + $totalbp += length($s2); + my $d = <INR>; + my $q2 = <INR>; + chomp($q2); + ## the seeds : first positions of reads, lenght is determined above. + my $fseed = substr($s1,0,$ntf); + my $rseed = substr($s2,0,$ntr); + ## check if the seed exists in the "mutated" forward primer list + if (!exists($mmhash{$fseed})) { + ## not found : try the reverse primers. + if (!exists($rmmhash{$rseed})) { + # not found either : print out to failed reads in interlaced format + $nrmissed++; + $failout .= "$r1$s1\n+\n$q1\n$r2$s2\n+\n$q2\n"; + $failcount++; + if ($failcount > 50000) { + chomp($failout); + print FAIL $failout; + $failout = "\n"; + $failcount = 0; + } + next; + } + ## trim reverse 5' + $s2 = substr($s2,$rlen{$rmin{$rmmhash{$rseed}}}); # from position *length of reverse primer* : rseed points to the original primer in the rmm hash + $q2 = substr($q2,$rlen{$rmin{$rmmhash{$rseed}}}); + ## statistics + $bptrimmed += $rlen{$rmin{$rmmhash{$rseed}}}; + ## trim if readlength > amplicon size (without primers) + if ($readl > $rlen{$rmin{$rmmhash{$rseed}}} + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) { + # trim to alength (incorrect if indels !) + # statistics + $toolongr++; + $bptrimmed += length($s2) - $alen{substr($rmin{$rmmhash{$rseed}},2)}; + ## 5' primer is already trimmed. This removes fragments of 3' primer. + $s2 = substr($s2,0,$alen{substr($rmin{$rmmhash{$rseed}},2)}); + $q2 = substr($q2,0,$alen{substr($rmin{$rmmhash{$rseed}},2)}); + } + $byr++; + ## trim forward 5' + my @fps = split(/\|/,$rpairs{$rmin{$rmmhash{$rseed}}}); + my $forok = 0; + my $forl = 0; + foreach(@fps) { + if (exists($fmm{$_}{substr($s1,0,10)})) { + $s1 = substr($s1,$flen{$_}); + $q1 = substr($q1,$flen{$_}); + #statistics + $bptrimmed += $flen{$_}; + if ($readl > $flen{$_} + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) { + # trim to alength (incorrect if indels !) + # statistics + $toolongf++; + $bptrimmed += length($s1) - $alen{substr($rmin{$rmmhash{$rseed}},2)}; + + $s1 = substr($s1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)}); + $q1 = substr($q1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)}); + } + + $forok++; + $foundboth++; + last; + } + else { + if ($forl < $flen{$_}) { + $forl = $flen{$_}; + } + } + } + if ($forok == 0) { + ## trim by max length of should be forwards. + $s1 = substr($s1,$forl); + $q1 = substr($q1,$forl); + # statistics + $bptrimmed += $forl; + if ($readl > $forl + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) { + # trim to alength (incorrect if indels !) + # statistics + $toolongf++; + $bptrimmed += length($s1) - $alen{substr($rmin{$rmmhash{$rseed}},2)}; + $s1 = substr($s1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)}); + $q1 = substr($q1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)}); + } + + } + $nrfound++; + } + else { + ## trim forward 5' + $s1 = substr($s1,$flen{$fmin{$mmhash{$fseed}}}); + $q1 = substr($q1,$flen{$fmin{$mmhash{$fseed}}}); + # statistics + $bptrimmed += $flen{$fmin{$mmhash{$fseed}}}; + + if ($readl > $flen{$fmin{$mmhash{$fseed}}} + $alen{substr($fmin{$mmhash{$fseed}},2)}) { + # trim to alength (incorrect if indels !) + # statistics + $toolongf++; + $bptrimmed += length($s1) - $alen{substr($fmin{$mmhash{$fseed}},2)}; + + $s1 = substr($s1,0,$alen{substr($fmin{$mmhash{$fseed}},2)}); + $q1 = substr($q1,0,$alen{substr($fmin{$mmhash{$fseed}},2)}); + } + $byf++; + ## trim reverse 5' + my @rps = split(/\|/,$fpairs{$fmin{$mmhash{$fseed}}}); + $revok = 0; + my $revl = 0; + foreach(@rps) { + if (exists($rmm{$_}{substr($s2,0,10)})) { + $s2 = substr($s2,$rlen{$_}); + $q2 = substr($q2,$rlen{$_}); + # statistics + $bptrimmed += $rlen{$_}; + if ($readl > $rlen{$_} + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) { + # trim to alength (incorrect if indels !) + # statistics + $toolongr++; + $bptrimmed += length($s2) - $alen{substr($fmin{$mmhash{$fseed}},2)}; + + $s2 = substr($s2,0,$alen{substr($fmin{$mmhash{$fseed}},2)}); + $q2 = substr($q2,0,$alen{substr($fmin{$mmhash{$fseed}},2)}); + } + + $revok++; + $foundboth++; + last; + } + else { + if ($revl < $rlen{$_}) { + $revl = $rlen{$_}; + } + } + } + if ($revok == 0) { + # trim by max length of should be reverses. + $s2 = substr($s2,$revl); + $q2 = substr($q2,$revl); + # statistics + $bptrimmed += $revl; + if ($readl > $revl + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) { + # trim to alength (incorrect if indels !) + # statistics + $toolongr++; + $bptrimmed += length($s2) - $alen{substr($fmin{$mmhash{$fseed}},2)}; + $s2 = substr($s2,0,$alen{substr($fmin{$mmhash{$fseed}},2)}); + $q2 = substr($q2,0,$alen{substr($fmin{$mmhash{$fseed}},2)}); + } + + } + $nrfound++; + } + $outf .= "$r1$s1\n+\n$q1\n"; + $outr .= "$r2$s2\n+\n$q2\n"; + $count++; + if ($count > 100000) { + print OUTF $outf; + print OUTR $outr; + + $outf = "\n"; + $outr = "\n"; + $count = 0; + } + + +} +chomp($outf); +chomp($outr); +chomp($failout); +print OUTF $outf; +print OUTR $outr; +print FAIL $failout; +close IN; +close INR; +close OUTF; +close OUTR; +close FAIL; +open REPORT, ">$opts{'R'}" or die ("Could not open report file"); +print REPORT "Results: \n"; +print REPORT "########\n"; +print REPORT " Read pairs without match: $nrmissed\n"; +print REPORT " Read pairs with a valid match: $nrfound\n"; +print REPORT " Initial match on Forward: $byf\n"; +print REPORT " Initial match on Reverse: $byr\n"; +print REPORT " Both F and R Matched: $foundboth\n"; +print REPORT " Forward reads trimmed to amplicon length: $toolongf\n"; +print REPORT " Reverse reads trimmed to amplicon length: $toolongr\n"; +print REPORT " Total basepairs in fastq files: $totalbp\n"; +print REPORT " Total basepairs trimmed: $bptrimmed (".sprintf("%.2f",($bptrimmed/$totalbp)*100)."%)\n"; +close REPORT; + + + + + + +if (!exists($opts{'w'})) { + ## clean up + system("rm -Rf $wd"); +} + + +sub rc { + my $seq = shift; + $seq =~ tr/ACGT/TGCA/; + $seq = reverse($seq); + return $seq; + +}