Mercurial > repos > geert-vandeweyer > multiplicom_primer_trimming
view multiplicom_primer_trimming.pl @ 1:b2125910c8fd draft default tip
changed ref genome table name
author | geert-vandeweyer |
---|---|
date | Fri, 22 May 2015 09:07:53 -0400 |
parents | fadef644b886 |
children |
line wrap: on
line source
#!/usr/bin/perl ## needs twoBitToFa v285, or other versions supporting the -bed option. # load modules use Getopt::Std; ########## ## opts ## ########## ## input files # i : input fastq 1 # I : input fastq 2 (if paired) # b : bed file with amplicon positions # r : read length (default 250) # o : output fastq1 # O : output fastq2 # F : failed readpairs # R : short report # t : 2bit file location # w : working directory (defaults to tmp) getopts('i:I:b:r:o:O:F:R:t:', \%opts) ; # check input values if (!exists($opts{'i'}) || !-e $opts{'i'}) { die('Fastq File not found'); } if (!exists($opts{'I'}) || !-e $opts{'I'}) { die('FastQ for paired end reads not found'); } if (!exists($opts{'o'})) { die('No output file specified for forward reads'); } if (!exists($opts{'O'})) { die('No output file specified for reverse reads'); } if (!exists($opts{'F'})) { die('No output file specified for failed pairs'); } if (!exists($opts{'b'}) || !-e $opts{'b'}) { die('BED-File not found'); } #if (exists($opts{'m'})) { # $minmap = $opts{'m'}; #} #else { # $minmap = 3; #} if (exists($opts{'r'})) { $readl = $opts{'r'}; } else { print "Assuming default read length of 2x250bp\n"; $readl = 250; } if (exists($opts{'t'}) && -e $opts{'t'}) { $tobit = $opts{'t'}; } else { die("2BIT reference not found"); } my $tbtf = `which twoBitToFa`; chomp($tbtf) ; if ($tbtf eq '') { if (-e "/opt/software/bin/twoBitToFa") { $tbtf = "/opt/software/bin/twoBitToFa"; } else { die("Could not find a twoBitToFa executable.\n"); } } # make output directory in (tmp) working dir if (exists($opts{'w'}) && -d $opts{'w'}) { our $wd = $opts{'w'}; } else { our $wd = "/tmp/Trim.".int(rand(1000)); while (-d $wd) { $wd = "/tmp/Trim.".int(rand(1000)); } system("mkdir $wd"); } #print "Using wd : $wd\n"; ## build sequence hash. my %alen = %flen = %rlen = (); open BED, $opts{'b'}; open OUT, ">$wd/bedfile.zero.bed"; my $minf = 999; my $minr = 999; while (<BED>) { if ($_ !~ m/^(chr.{1,2}\t)(\d+)(\t.*)/) { next; } chomp($_); my @p = split(/\t/,$_); my $fl = $p[6] - $p[1]; # p6 holds first non-primer position my $rl = $p[2] - $p[7]; # p7 hold last non-primer position ## lengths $alen{"$p[0]:$p[1]-$p[2]"} = $p[7] - $p[6] + 1; $flen{"F:$p[0]:$p[1]-$p[2]"} = $fl; if ($fl < $minf) { $minf = $fl; } if ($rl < $minr) { $minr = $rl; } $rlen{"R:$p[0]:$p[1]-$p[2]"} = $rl; print OUT "$p[0]\t".($p[1]-1)."\t".($p[6]-1)."\tF:$p[0]:$p[1]-$p[2]\n"; print OUT "$p[0]\t".$p[7]."\t".$p[2]."\tR:$p[0]:$p[1]-$p[2]\n"; } close BED; close OUT; system("cd $wd && $tbtf -noMask -bed=bedfile.zero.bed $tobit amplicons.zero.fa"); open IN, "$wd/amplicons.zero.fa"; my %fseq = %rseq = (); my %rmm = %fmm = (); my @nts = ('A','C','T','G'); while(<IN>) { my $pr = $_; my $seq = <IN>; chomp($pr); chomp($seq); $pr = substr($pr,1); if (substr($pr,0,1) eq 'F') { $fseq{$pr} = $seq; for ($i = 0; $i < 10; $i++) { foreach(@nts) { my $mut = substr($fseq{$pr},0,$i).$_.substr($rseq{$pr},$i+1,9-$i); $fmm{$pr}{$mut} = $fseq{$pr}; } } } else { $rseq{$pr} = rc($seq); for ($i = 0; $i< 10;$i++){ foreach(@nts) { my $mut = substr($rseq{$pr},0,$i).$_.substr($rseq{$pr},$i+1,9-$i); $rmm{$pr}{$mut} = $rseq{$pr}; } } } } close IN; ############################### ## generate smallest overlap F## ############################### $ntf = $minf; BUILDMIN: my %fmin = (); my %fpairs = (); foreach( keys(%fseq)) { my $sub = substr($fseq{$_},0,$ntf); ## clash => increase nt. if (exists($fmin{$sub})) { ## check if not identical (yes, this is possible...) (same start + same length.) $_ =~ m/F:chr(.{1,2}):(\d+)-(\d+)/; my $cchr = $1; my $cstart = $2; my $cl = $flen{$_}; my @prev = split(/\|/,$fmin{$sub}); my $pprim = $prev[0]; $pprim =~ m/F:chr(.{1,2}):(\d+)-(\d+)/; my $pchr = $1; my $pstart = $2; my $pl = $flen{$pprim}; if ("$cchr" ne "$pchr" || "$cstart" ne "$pstart" || "$cl" ne "$pl") { $ntf++; goto BUILDMIN; } else { $fmin{$sub} .= "|$_"; my $rn = $_; $rn =~ s/F:/R:/; $fpairs{$_} .= "|$rn"; } } else { $fmin{$sub} = $_; my $rn = $_; $rn =~ s/F:/R:/; $fpairs{$_} = "$rn"; } } print "Minimal number of nucleotides needed for forward: $ntf\n"; ## allow one mismatch. my %mmhash =(); foreach (keys(%fmin)) { my $orig = $_; for ($i = 0; $i< length($orig);$i++){ foreach(@nts) { if ($_ eq substr($orig,$i,1)) { $mmhash{$orig} = $orig; next; } my $mut = substr($orig,0,$i).$_.substr($orig,$i+1); ## if in mmhash && not in $fmin => clash after mutation => delete from mmhash. if (exists($mmhash{$mut}) ) { if (!exists($fmin{$mut})) { delete($mmhash{$mut}); next; } else { ## mut == original from oth primer => do not add reference. next; } } else { $mmhash{$mut} = $orig; } } } } ############################### ## generate smallest overlap R## ############################### $ntr = $minr; BUILDMINR: my %rmin = (); my %frairs = (); foreach( keys(%rseq)) { my $sub = substr($rseq{$_},0,$ntr); ## clash => increase nt. if (exists($rmin{$sub})) { ## check if not identical (yes, this is possible...) (same start + same length.) $_ =~ m/R:chr(.{1,2}):(\d+)-(\d+)/; my $cchr = $1; my $cstart = $2; my $cl = $rlen{$_}; my @prev = split(/\|/,$rmin{$sub}); my $pprim = $prev[0]; $pprim =~ m/R:chr(.{1,2}):(\d+)-(\d+)/; my $pchr = $1; my $pstart = $2; my $pl = $Rlen{$pprim}; if ("$cchr" ne "$pchr" || "$cstart" ne "$pstart" || "$cl" ne "$pl") { $ntr++; goto BUILDMINR; } else { $Rmin{$sub} .= "|$_"; my $fn = $_; $fn =~ s/R:/F:/; $rpairs{$_} .= "|$fn"; } } else { $rmin{$sub} = $_; my $fn = $_; $fn =~ s/R:/F:/; $rpairs{$_} = "$fn"; } } print "Minimal number of nucleotides needed for reverse: $ntr\n"; ## allow one mismatch. my %rmmhash =(); foreach (keys(%rmin)) { my $orig = $_; for ($i = 0; $i< length($orig);$i++){ foreach(@nts) { if ($_ eq substr($orig,$i,1)) { $rmmhash{$orig} = $orig; next; } my $mut = substr($orig,0,$i).$_.substr($orig,$i+1); ## if in mmhash && not in $fmin => clash after mutation => delete from mmhash. if (exists($rmmhash{$mut}) ) { if (!exists($rmin{$mut})) { delete($rmmhash{$mut}); next; } else { ## mut == original from oth primer => do not add reference. next; } } else { $rmmhash{$mut} = $orig; } } } } ########################## ## process fastq files. ## ########################## open IN, $opts{'i'}; # open forward reads open INR, $opts{'I'}; # open reverse reads open OUTF, ">".$opts{'o'}; # where to put trimmed forward reads open OUTR, ">".$opts{'O'}; ## where to put trimmed Reverse reads open FAIL, ">$opts{'F'}"; #$wd/failed.interlaced.fq"; # keep non-matches to seperate file => for debug? my $outf = $outr = $failout =''; # buffer output to speedup I/O my $count = $failcount = 0 ; # track output buffer my $nrmissed = $nrfound = 0; # statistics my $foundboth = $byf = $byr = $toolongf = $toolongr = 0; # statistics my $bptrimmed = $totalbp = 0; # statistics while (my $r1 = <IN>) { # read in forward my $s1 = <IN>; chomp($s1); $totalbp += length($s1); my $d = <IN>; my $q1 = <IN>; chomp($q1); # read in reverse my $r2 = <INR>; my $s2 = <INR>; chomp($s2); $totalbp += length($s2); my $d = <INR>; my $q2 = <INR>; chomp($q2); ## the seeds : first positions of reads, lenght is determined above. my $fseed = substr($s1,0,$ntf); my $rseed = substr($s2,0,$ntr); ## check if the seed exists in the "mutated" forward primer list if (!exists($mmhash{$fseed})) { ## not found : try the reverse primers. if (!exists($rmmhash{$rseed})) { # not found either : print out to failed reads in interlaced format $nrmissed++; $failout .= "$r1$s1\n+\n$q1\n$r2$s2\n+\n$q2\n"; $failcount++; if ($failcount > 50000) { chomp($failout); print FAIL $failout; $failout = "\n"; $failcount = 0; } next; } ## trim reverse 5' $s2 = substr($s2,$rlen{$rmin{$rmmhash{$rseed}}}); # from position *length of reverse primer* : rseed points to the original primer in the rmm hash $q2 = substr($q2,$rlen{$rmin{$rmmhash{$rseed}}}); ## statistics $bptrimmed += $rlen{$rmin{$rmmhash{$rseed}}}; ## trim if readlength > amplicon size (without primers) if ($readl > $rlen{$rmin{$rmmhash{$rseed}}} + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) { # trim to alength (incorrect if indels !) # statistics $toolongr++; $bptrimmed += length($s2) - $alen{substr($rmin{$rmmhash{$rseed}},2)}; ## 5' primer is already trimmed. This removes fragments of 3' primer. $s2 = substr($s2,0,$alen{substr($rmin{$rmmhash{$rseed}},2)}); $q2 = substr($q2,0,$alen{substr($rmin{$rmmhash{$rseed}},2)}); } $byr++; ## trim forward 5' my @fps = split(/\|/,$rpairs{$rmin{$rmmhash{$rseed}}}); my $forok = 0; my $forl = 0; foreach(@fps) { if (exists($fmm{$_}{substr($s1,0,10)})) { $s1 = substr($s1,$flen{$_}); $q1 = substr($q1,$flen{$_}); #statistics $bptrimmed += $flen{$_}; if ($readl > $flen{$_} + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) { # trim to alength (incorrect if indels !) # statistics $toolongf++; $bptrimmed += length($s1) - $alen{substr($rmin{$rmmhash{$rseed}},2)}; $s1 = substr($s1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)}); $q1 = substr($q1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)}); } $forok++; $foundboth++; last; } else { if ($forl < $flen{$_}) { $forl = $flen{$_}; } } } if ($forok == 0) { ## trim by max length of should be forwards. $s1 = substr($s1,$forl); $q1 = substr($q1,$forl); # statistics $bptrimmed += $forl; if ($readl > $forl + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) { # trim to alength (incorrect if indels !) # statistics $toolongf++; $bptrimmed += length($s1) - $alen{substr($rmin{$rmmhash{$rseed}},2)}; $s1 = substr($s1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)}); $q1 = substr($q1,0,$alen{substr($rmin{$rmmhash{$rseed}},2)}); } } $nrfound++; } else { ## trim forward 5' $s1 = substr($s1,$flen{$fmin{$mmhash{$fseed}}}); $q1 = substr($q1,$flen{$fmin{$mmhash{$fseed}}}); # statistics $bptrimmed += $flen{$fmin{$mmhash{$fseed}}}; if ($readl > $flen{$fmin{$mmhash{$fseed}}} + $alen{substr($fmin{$mmhash{$fseed}},2)}) { # trim to alength (incorrect if indels !) # statistics $toolongf++; $bptrimmed += length($s1) - $alen{substr($fmin{$mmhash{$fseed}},2)}; $s1 = substr($s1,0,$alen{substr($fmin{$mmhash{$fseed}},2)}); $q1 = substr($q1,0,$alen{substr($fmin{$mmhash{$fseed}},2)}); } $byf++; ## trim reverse 5' my @rps = split(/\|/,$fpairs{$fmin{$mmhash{$fseed}}}); $revok = 0; my $revl = 0; foreach(@rps) { if (exists($rmm{$_}{substr($s2,0,10)})) { $s2 = substr($s2,$rlen{$_}); $q2 = substr($q2,$rlen{$_}); # statistics $bptrimmed += $rlen{$_}; if ($readl > $rlen{$_} + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) { # trim to alength (incorrect if indels !) # statistics $toolongr++; $bptrimmed += length($s2) - $alen{substr($fmin{$mmhash{$fseed}},2)}; $s2 = substr($s2,0,$alen{substr($fmin{$mmhash{$fseed}},2)}); $q2 = substr($q2,0,$alen{substr($fmin{$mmhash{$fseed}},2)}); } $revok++; $foundboth++; last; } else { if ($revl < $rlen{$_}) { $revl = $rlen{$_}; } } } if ($revok == 0) { # trim by max length of should be reverses. $s2 = substr($s2,$revl); $q2 = substr($q2,$revl); # statistics $bptrimmed += $revl; if ($readl > $revl + $alen{substr($rmin{$rmmhash{$rseed}},2)} ) { # trim to alength (incorrect if indels !) # statistics $toolongr++; $bptrimmed += length($s2) - $alen{substr($fmin{$mmhash{$fseed}},2)}; $s2 = substr($s2,0,$alen{substr($fmin{$mmhash{$fseed}},2)}); $q2 = substr($q2,0,$alen{substr($fmin{$mmhash{$fseed}},2)}); } } $nrfound++; } $outf .= "$r1$s1\n+\n$q1\n"; $outr .= "$r2$s2\n+\n$q2\n"; $count++; if ($count > 100000) { print OUTF $outf; print OUTR $outr; $outf = "\n"; $outr = "\n"; $count = 0; } } chomp($outf); chomp($outr); chomp($failout); print OUTF $outf; print OUTR $outr; print FAIL $failout; close IN; close INR; close OUTF; close OUTR; close FAIL; open REPORT, ">$opts{'R'}" or die ("Could not open report file"); print REPORT "Results: \n"; print REPORT "########\n"; print REPORT " Read pairs without match: $nrmissed\n"; print REPORT " Read pairs with a valid match: $nrfound\n"; print REPORT " Initial match on Forward: $byf\n"; print REPORT " Initial match on Reverse: $byr\n"; print REPORT " Both F and R Matched: $foundboth\n"; print REPORT " Forward reads trimmed to amplicon length: $toolongf\n"; print REPORT " Reverse reads trimmed to amplicon length: $toolongr\n"; print REPORT " Total basepairs in fastq files: $totalbp\n"; print REPORT " Total basepairs trimmed: $bptrimmed (".sprintf("%.2f",($bptrimmed/$totalbp)*100)."%)\n"; close REPORT; if (!exists($opts{'w'})) { ## clean up system("rm -Rf $wd"); } sub rc { my $seq = shift; $seq =~ tr/ACGT/TGCA/; $seq = reverse($seq); return $seq; }