# HG changeset patch # User fcaramia # Date 1357709049 18000 # Node ID d32bddcff685dc0c328182f9635bf67e0b7e897e Uploaded diff -r 000000000000 -r d32bddcff685 alignCustomAmplicon/alignCustomAmplicon.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignCustomAmplicon/alignCustomAmplicon.pl Wed Jan 09 00:24:09 2013 -0500 @@ -0,0 +1,1825 @@ +#!/usr/bin/perl + +# script to convert paired end reads to a concensus sequence + +# Created by Jason Ellul, Feb 2012 +# Modified by Franco Caramia and Jason Ellul, July 2012 and November 2012 +# 0.4 Fixed mapping of concensus region (large insertions and deletions now picked up correctly) +# Added missalignment penalty +# Read 1 and 2 are now aligned first then they are aligned to the reference +# 0.5 Added ability to name amplicons by includinfo the info in the primers file +# Added the ability to output the alignments in a particular region using the new flag -a +# Added some extra stats to the output + +use strict; +use warnings; +use Getopt::Std; +use File::Basename; +use File::Find; +use File::Copy; +use File::MMagic; +use File::Temp qw/ tempfile tempdir /; +use Cwd 'abs_path'; +use threads; +use threads::shared; +use List::Util qw(sum); +use Thread::Queue; +use String::Approx 'adistr'; +use POSIX qw(floor ceil); +String::Approx::cache_max(25000); +String::Approx::cache_purge(0.25); +use constant false => 0; +use constant true => 1; +use Inline CPP => Config => AUTO_INCLUDE => '#include "utils.cpp"'; +use Inline CPP => <<'END_CPP'; + + + + +int levenshtein(char* seq1, char* seq2) +{ + // Returns edit distance of 2 sequences + string sam1(seq1); + string sam2(seq2); + return levenshteinDistance(sam1, sam2); +} + +double align2(char* seq1, char* seq2, SV* res1, SV* res2, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool noFrontGapPenalty, bool noEndGapPenalty, bool debug ) +{ + // Returns Sequence alignment of 2 sequences + // res1, res2 contain the alignment + //seq1: reference sequence + //seq2: amplicon sequence + //matchScore: reward for matching a nucleotide + //gapPenOpen: penalty for opening a gap + //gapPenext: penalty for extending a gap + //noFrontGapPenalty: No penalty will be impose for gaps at the beggining of the alignment + //noEndGapPenalty: No penalty will be impose for gaps at the end of the alignment + //debug: prints scores, alignments, etc... + + string sam1(seq1); + string sam2(seq2); + double ret = nw_align2(sam1, sam2, matchScore, missmatch_penalty, gapPenOpen, gapPenExt, noFrontGapPenalty, noEndGapPenalty, debug); + //Setting the return alignments + sv_setpvn(res1, (char*)sam1.c_str(), sam1.length()); + sv_setpvn(res2, (char*)sam2.c_str(), sam2.length()); + return ret; +} + +void adjustSequence(int refSize, string& seq, char dir) +{ + + int adjust = refSize - seq.length(); + if(dir == 'r') + for(int i=0; i msa; + vector partial; + vector seq; + + partial.push_back(rightSeq); + partial.push_back(leftSeq); + seq.push_back(ref); + + msa = nw_alignAlignments(partial, seq , matchScore, missmatch_penalty, gapPenOpen, gapPenExt, false, false, debug); + sv_setpvn(res1, (char*)msa[2].c_str(), msa[2].length()); + sv_setpvn(res2, (char*)msa[1].c_str(), msa[1].length()); + sv_setpvn(res3, (char*)msa[0].c_str(), msa[0].length()); + return ret; +} + +END_CPP + +$| = 1; + +# Grab and set all options +my $version = "0.5"; +my %OPTIONS = (p => 1); +getopts('a:dl:L:o:p:rsv', \%OPTIONS); + +die qq(version $version +Usage: alignCustomAmplicon.pl [OPTIONS] + +OPTIONS: -a STR print the alignment for any amplicons covering a particular region [chr:loc || chr:start-stop] + -d print debug info + -l STR filename of the log file [null] + -L STR append to an existing log file [null] + -o STR filename for bam file + -p INT number of threads [$OPTIONS{p}] + -r do not create stats report + -s sort the bam file + -v verbose progress +) if(@ARGV != 4); + +my $Script = 'alignCustomAmplicon'; +my $ref = shift @ARGV; +my $fastq1 = shift @ARGV; +my $fastq2 = shift @ARGV; +my $primers = shift @ARGV; +my $now; + +#create log file +if(defined $OPTIONS{l}) { + my $logfile = $OPTIONS{l}; + open (FH, ">$logfile") or die 'Couldn\'t create log file!\n'; + close (FH); + # Open the log file and redirect output to it + open (STDERR, ">>$logfile"); + open (STDOUT, ">>$logfile"); + my $now = localtime time; + print "Log File Created $now\n\n"; +} elsif(defined $OPTIONS{L}) { + #append to a log file + my $logfile = $OPTIONS{L}; + # Open the log file and redirect output to it + open (STDERR, ">>$logfile") or die 'Couldn\'t create log file!\n'; + open (STDOUT, ">>$logfile") or die 'Couldn\'t create log file!\n'; + my $now = localtime time; + print "Appending To Log File $now\n\n"; +} + +# print version of this script +print "Using $Script version $version \n\n"; + +if(defined $OPTIONS{d}) { + print "Setting threads to 1 as using debug\n"; + $OPTIONS{p} = 1; +} + +# grab sample name, set output file name and open some files +my ($name, $path, $suffix) = fileparse($fastq1, ('.fastq', '.fastq.gz')); +my $sampleName :shared = $name; +$sampleName =~ s/^(.*?_.*?)_.*/$1/; +open(PRIM, $primers) or die "can't open $primers: $!\n"; + +# set samfile name +my $bamfile_out = "${sampleName}_aligned.bam"; +if(defined $OPTIONS{o}) { + $bamfile_out = $OPTIONS{o}; +} +my $bam_to_sort = $bamfile_out; +if(defined $OPTIONS{s}) { + $bam_to_sort .= '.tmp'; +} + +my $stats = $bamfile_out; +if(!defined $OPTIONS{r}) { + $stats =~ s/\..*?$//; + open(STATS, ">${stats}_stats.csv"); +} + +# read in primer lengths +my %refFrags; +$refFrags{min} = 5000000; +while(my $line = ) { + chomp $line; + my @line_sp = split("\t", $line); + $refFrags{primers}{$line_sp[0]}{U}{F}{length} = $line_sp[1]; + $refFrags{primers}{$line_sp[0]}{D}{F}{length} = $line_sp[2]; + $refFrags{primers}{$line_sp[0]}{U}{R}{length} = $line_sp[2]; + $refFrags{primers}{$line_sp[0]}{D}{R}{length} = $line_sp[1]; + $refFrags{name}{$line_sp[0]} = $line_sp[3]; + $refFrags{min} = min([$refFrags{min}, $line_sp[1], $line_sp[2]]); +} +close(PRIM); +my @OrderedContigs :shared = sort sortChrom keys %{ $refFrags{primers} }; + +my %regions; +if(defined $OPTIONS{a}) { + my ($chr, $start, $end) = split_region($OPTIONS{a}); + foreach my $reg (@OrderedContigs) { + my ($chr_reg, $start_reg, $end_reg) = split_region($reg); + $regions{$reg} = 1 if $chr eq $chr_reg && $start_reg <= $end && $end_reg >= $start; + } +} + +# grab fasta file +my %refSeq; +read_fasta_file($ref, \%refSeq); + +# calculate read length and read group +my $readGroup :shared; +my $readLength; +my $mm = new File::MMagic; +my $filetype = $mm->checktype_filename(abs_path($fastq1)); +if($filetype eq "application/x-gzip") { + my $sampleRead = `gunzip -c $fastq1 | head -n 2 | tail -n 1`; + chomp $sampleRead; + $readLength = length($sampleRead); + $readGroup = `gunzip -c $fastq1 | head -n 1`; + chomp $readGroup; + $readGroup =~ s/.*?:.*?:(.*?):.*/${1}_L001/; +} elsif($filetype eq "text/plain") { + my $sampleRead = `head -n 2 $fastq1 | tail -n 1`; + chomp $sampleRead; + $readLength = length($sampleRead); + $readGroup = `head $fastq1 -n 1`; + chomp $readGroup; + $readGroup =~ s/.*?:.*?:(.*?):.*/${1}_L001/; +} else { + die("$fastq1 is filetype $filetype and not text/plain or application/x-gzip\n"); +} + +# create hash to keep track of maxdiff and readlengths +my %maxdiff :shared; + +# create hash to keep track of stats +my %stats :shared; +$stats{Amb} = 0; +$stats{PoorQualMates} = 0; +$stats{PoorMapQual} = 0; +$stats{PoorQual1} = 0; +$stats{PoorQual2} = 0; + +# setup other shared variables and caches +my %cache :shared; +my $cacheHits :shared = 0; +my %cacheAlign :shared; +$cacheAlign{Conc} = &share( {} ); +my $cacheAlignHits :shared = 0; +my %refLength :shared; + +# extract primers and fragments on forward and reverse strand and initialize counts +foreach my $key (@OrderedContigs) { + push @{ $refFrags{seqU} }, substr($refSeq{$key}, 0, $refFrags{primers}{$key}{U}{F}{length}); + push @{ $refFrags{seqD} }, substr($refSeq{$key}, -$refFrags{primers}{$key}{D}{F}{length}); + push @{ $refFrags{seqU} }, revcompl(substr($refSeq{$key}, -$refFrags{primers}{$key}{U}{R}{length})); + push @{ $refFrags{seqD} }, revcompl(substr($refSeq{$key}, 0, $refFrags{primers}{$key}{D}{R}{length})); + push @{ $refFrags{lenU} }, ($refFrags{primers}{$key}{U}{F}{length}, $refFrags{primers}{$key}{U}{R}{length}); + push @{ $refFrags{lenD} }, ($refFrags{primers}{$key}{D}{F}{length}, $refFrags{primers}{$key}{D}{R}{length}); + push @{ $refFrags{reg} }, ($key, $key); + push @{ $refFrags{strand} }, ('F', 'R'); + + # calculate new contig name with primers removed + my ($chr, $start, $stop) = split(":|-", $key); + $start += $refFrags{primers}{$key}{U}{F}{length}; + $stop -= $refFrags{primers}{$key}{D}{F}{length}; + $refLength{$key} = &share( {} ); + $refLength{$key}{newReg} = "$chr:$start-$stop"; + $refLength{$key}{length} = $stop - $start + 1; + + $cacheAlign{$key} = &share( {} ); + $cacheAlign{$key}{R1} = &share( {} ); + $cacheAlign{$key}{R2} = &share( {} ); + + $stats{$key} = &share( {} ); + $stats{$key}{Reads} = 0; + $stats{$key}{Bases} = 0; + $stats{$key}{Quality} = 0; + $stats{$key}{Mapping} = 0; + $stats{$key}{Read1} = 0; + $stats{$key}{Read2} = 0; + $stats{$key}{ConcMapping} = 0; + $stats{$key}{ConcReads} = 0; + $stats{$key}{NumberForward} = 0; + $stats{$key}{NumberReverse} = 0; + $stats{$key}{ConcErrorsBases} = 0; + $stats{$key}{ConcCorrectBases} = 0; + $stats{$key}{MinLength} = 100000; + $stats{$key}{MaxLength} = 0; + $stats{$key}{TotalA} = 0; + $stats{$key}{TotalC} = 0; + $stats{$key}{TotalG} = 0; + $stats{$key}{TotalT} = 0; + $stats{$key}{TotalN} = 0; + $stats{$key}{PoorMapQual} = 0; +} + +# start up worker threads +my $FindContig = Thread::Queue->new(); +my $ToAlign = Thread::Queue->new(); +my $Aligned = Thread::Queue->new(); +my $Samfile = Thread::Queue->new(); +my $WriteBam = Thread::Queue->new(); +my @threadArray; +for(my $index = 1; $index <= $OPTIONS{p}; $index++) { + push @threadArray, threads->create(\&thread_find_contig, $FindContig, \%refFrags, \%refSeq, $ToAlign, $Aligned, $WriteBam); + push @threadArray, threads->create(\&thread_process_align, $Aligned, \%refFrags, \%refSeq, $Samfile, $WriteBam) if $index <= max([1, $OPTIONS{p}/2]); + push @threadArray, threads->create(\&samfile_line, $Samfile, $WriteBam, \%refSeq) if $index <= max([1, $OPTIONS{p}/2]); +} +push @threadArray, threads->create(\&nwAlign, $ToAlign, $Aligned, \%regions); +push @threadArray, threads->create(\&writebamfile, $bam_to_sort, $WriteBam); + +# read in each sequence and its mate from the fastq file +my $read1; +my $read2; +my $seq1; +my $seq2; +my $qual1; +my $qual2; +my $desc; +my @strand; +my $cont = 1; + +# open fastq files +my $gz1; +my $gz2; +my $f1gz = 0; +my $f2gz = 0; +if($filetype eq "application/x-gzip") { + open(FQ1, "gunzip -dc $fastq1 |") or die "Can't fork: $!\n"; +} else { + open(FQ1, $fastq1) or die "$fastq1: $!"; +} + +$filetype = $mm->checktype_filename(abs_path($fastq2));; +if($filetype eq "application/x-gzip") { + open(FQ2, "gunzip -dc $fastq2 |") or die "Can't fork: $!\n"; +} elsif($filetype eq "text/plain") { + open(FQ2, $fastq2) or die "$fastq2: $!"; +} else { + die("$fastq2 is filetype $filetype and not text/plain or application/x-gzip\n"); +} + +# add reads to the queue +my $readNo = 0; +while ($cont) { + if($read1 = ) { # read 1 and 2 read name + $seq1 = ; # read 1 sequence + $desc = ; # read 1 desc + $qual1 = ; # read 1 quality + } else { + last; + } + + if($read2 = ) { # read 2 read name + $seq2 = ; # read 2 sequence + $desc = ; # read 2 sdesc + $qual2 = ; # read 2 quality + } else { + last; + } + + # read 1 the forward read + chomp $read1; + chomp $seq1; + chomp $qual1; + $read1 =~ s/^\@(.*)\s?.*/$1/; + + # read 2 the reverse read + chomp $read2; + chomp $seq2; + $seq2 = revcompl($seq2); + chomp $qual2; + $qual2 = reverse($qual2); + + $read2 =~ s/^\@(.*)\s?.*/$1/; + $readNo++; + + # keep at most 20 reads in buffer + while(1) { + if(defined $OPTIONS{v}) { + my $percAmb = $stats{Amb} / $readNo * 100; + my $percPoor = $stats{PoorQualMates} / $readNo * 100; + my $percPoorMap = $stats{PoorMapQual} / $readNo * 100; + my $percPoor1 = $stats{PoorQual1} / $readNo * 100; + my $percPoor2 = $stats{PoorQual2} / $readNo * 100; + $percAmb = sprintf("%.2f", $percAmb); + $percPoor = sprintf("%.2f", $percPoor); + $percPoorMap = sprintf("%.2f", $percPoorMap); + $percPoor1 = sprintf("%.2f", $percPoor1); + $percPoor2 = sprintf("%.2f", $percPoor2); + print STDERR "\r$readNo processed - Amb $percAmb - PoorQual $percPoor - PoorQual1 $percPoor1 - PoorQual2 $percPoor2 - PoorMappingQual $percPoorMap - FindContig ".$FindContig->pending()." - ToAlign ".$ToAlign->pending()." - Aligned ".$Aligned->pending()." - Sam ".$Samfile->pending()." - Writebam ".$WriteBam->pending()." "; + } + last if $FindContig->pending() <= 100; + } + my @readArray = ($read1, $seq1, $qual1, $read2, $seq2, $qual2); + { + lock($FindContig); # Keep other threads from changing the queue's contents + $FindContig->enqueue(\@readArray); + } +} +close(FQ1); +close(FQ2); + +# wait for all records to be processed and close down threads +$FindContig->enqueue( (undef) x $OPTIONS{p} ); +while($FindContig->pending() > 0 ) { + next; +} +$ToAlign->enqueue( (undef) ); +while($ToAlign->pending() > 0 ) { + next; +} +$Aligned->enqueue( (undef) x floor(max([1, $OPTIONS{p}/2])) ); +while($Aligned->pending() > 0 ) { + next; +} +$Samfile->enqueue( (undef) x floor(max([1, $OPTIONS{p}/2])) ); +while($Samfile->pending() > 0 ) { + next; +} +$WriteBam->enqueue( undef ); + +$_->join() for @threadArray; + +# sort the bam file +if(defined $OPTIONS{s}) { + system("MergeSamFiles.sh MAX_RECORDS_IN_RAM=2000000 I=$bam_to_sort O=$bamfile_out VALIDATION_STRINGENCY=LENIENT SO=coordinate USE_THREADING=true CREATE_INDEX=true"); + unlink $bam_to_sort; +} + +if(!defined $OPTIONS{r}) { + # print out statistics + # setup variables + my $totalRegionLengths = 0; + my $totalReads = 0; + my $totalFwd = 0; + my $totalRev = 0; + my $totalMatch = 0; + my $totalMiss = 0; + my $totalBases = 0; + my $totalRead1 = 0; + my $totalRead2 = 0; + my $totalA = 0; + my $totalC = 0; + my $totalG = 0; + my $totalT = 0; + my $totalN = 0; + my $totalConc = 0; + my $totalQuality = 0; + my $AllmaxRead = 0; + my $AllminRead = 100000; + my $avequal = 'NA'; + my $aveRead = 'NA'; + my $perA = 'NA'; + my $perC = 'NA'; + my $perG = 'NA'; + my $perT = 'NA'; + my $perN = 'NA'; + my $readsUsed = 0; + my @coverage; + my @off_coverage; + + # print column headers + print STATS 'AmpliconName,Region,RegionLengthMinusPrimers,TotalReadsPoorMapping,TotalReadsUsed,Reads1Kept,Reads2Kept,ReadsOut,ReadsFwd,ReadsRev,ReadsOverlap,'; + print STATS "Bases,AveQuality,MatchBases,ErrorBases,ErrorRate,MinReadLength,AveReadLength,MaxReadLength,PercA,PercC,PercG,PercT,PercN\n"; + + # for each region + foreach my $key (@OrderedContigs){ + # if we have some reads calculate averages + if($stats{$key}{Reads} > 0) { + $avequal = $stats{$key}{Quality} / $stats{$key}{Bases}; + $avequal = sprintf("%.4f", $avequal); + $aveRead = $stats{$key}{Bases} / $stats{$key}{Reads}; + $aveRead = sprintf("%.4f", $aveRead); + $perA = $stats{$key}{TotalA} / $stats{$key}{Bases} * 100; + $perA = sprintf("%.2f", $perA); + $perC = $stats{$key}{TotalC} / $stats{$key}{Bases} * 100; + $perC = sprintf("%.2f", $perC); + $perG = $stats{$key}{TotalG} / $stats{$key}{Bases} * 100; + $perG = sprintf("%.2f", $perG); + $perT = $stats{$key}{TotalT} / $stats{$key}{Bases} * 100; + $perT = sprintf("%.2f", $perT); + $perN = $stats{$key}{TotalN} / $stats{$key}{Bases} * 100; + $perN = sprintf("%.2f", $perN); + $AllminRead = min([$AllminRead, $stats{$key}{MinLength}]); + $AllmaxRead = max([$AllmaxRead, $stats{$key}{MaxLength}]); + $readsUsed = $stats{$key}{Reads} + $stats{$key}{ConcReads}; + } else { + $stats{$key}{MaxLength} = 'NA'; + $stats{$key}{MinLength} = 'NA'; + } + + # if we have reads which overlapped calculate an error rate + my $errRate = 'NA'; + if($stats{$key}{ConcReads} > 0) { + $errRate = $stats{$key}{ConcErrorsBases} / ($stats{$key}{ConcErrorsBases} + $stats{$key}{ConcCorrectBases}); + } + + # print the stats for this region + print STATS "$refFrags{name}{$key}," if defined $refFrags{name}{$key}; + print STATS "Unknown," unless defined $refFrags{name}{$key}; + print STATS "$refLength{$key}{newReg},$refLength{$key}{length},$stats{$key}{PoorMapQual},$readsUsed,$stats{$key}{Read1},$stats{$key}{Read2},$stats{$key}{Reads},$stats{$key}{NumberForward},"; + print STATS "$stats{$key}{NumberReverse},$stats{$key}{ConcReads},$stats{$key}{Bases},$avequal,$stats{$key}{ConcCorrectBases},"; + print STATS "$stats{$key}{ConcErrorsBases},$errRate,$stats{$key}{MinLength},$aveRead,$stats{$key}{MaxLength},$perA,$perC,$perG,$perT,$perN\n"; + + # add stats to grand totals + $totalRegionLengths += $refLength{$key}{length}; + $totalQuality += $stats{$key}{Quality}; + $totalReads += $stats{$key}{Reads}; + $totalFwd += $stats{$key}{NumberForward}; + $totalRev += $stats{$key}{NumberReverse}; + $totalBases += $stats{$key}{Bases}; + $totalRead1 += $stats{$key}{Read1}; + $totalRead2 += $stats{$key}{Read2}; + $totalA += $stats{$key}{TotalA}; + $totalC += $stats{$key}{TotalC}; + $totalT += $stats{$key}{TotalT}; + $totalG += $stats{$key}{TotalG}; + $totalN += $stats{$key}{TotalN}; + $totalMatch += $stats{$key}{ConcCorrectBases}; + $totalMiss += $stats{$key}{ConcErrorsBases}; + $totalConc += $stats{$key}{ConcReads}; + + # reset ave scores + $avequal = 'NA'; + $aveRead = 'NA'; + $perA = 'NA'; + $perC = 'NA'; + $perG = 'NA'; + $perT = 'NA'; + $perN = 'NA'; + $readsUsed = 0; + + if(defined $refFrags{name}{$key} && grep /^Off_target/, $refFrags{name}{$key}) { + push @off_coverage, $stats{$key}{Reads}; + } else { + push @coverage, $stats{$key}{Reads}; + } + } + + # if we have any reads calculate the averages + if($totalReads > 0) { + $avequal = $totalQuality / $totalBases; + $avequal = sprintf("%.4f", $avequal); + $aveRead = $totalBases / $totalReads; + $aveRead = sprintf("%.4f", $aveRead); + $perA = $totalA / $totalBases * 100; + $perA = sprintf("%.2f", $perA); + $perC = $totalC / $totalBases * 100; + $perC = sprintf("%.2f", $perC); + $perG = $totalG / $totalBases * 100; + $perG = sprintf("%.2f", $perG); + $perT = $totalT / $totalBases * 100; + $perT = sprintf("%.2f", $perT); + $perN = $totalN / $totalBases * 100; + $perN = sprintf("%.2f", $perN); + $readsUsed = $totalReads + $totalConc; + } else { + $AllmaxRead = 'NA'; + $AllminRead = 'NA'; + } + + # if we had any reads which overlapped calculate error rate + my $errRate = 'NA'; + if($totalConc > 0) { + $errRate = $totalMiss / ($totalMiss + $totalMatch); + } + + # print out totals + my $pcAmb = sprintf("%.2f", $stats{Amb} / $readNo * 100); + my $pcPQ = sprintf("%.2f", $stats{PoorQualMates} / $readNo * 100); + my $TotalReads = $readNo * 2; + my $Unmapped = $TotalReads - $readsUsed; + my $pcUnmapped = sprintf("%.2f", $Unmapped/$TotalReads * 100); + my $avCov = sprintf("%.0f", sum(@coverage)/@coverage); + my $avOff = sprintf("%.2f", sum(@off_coverage)/@off_coverage) if @off_coverage; + my $greater = scalar (grep { $_ > 0.2*$avCov } @coverage); + my $pcgreater = sprintf("%.2f", $greater/@coverage * 100); + + print STATS ",Total,$totalRegionLengths,$stats{PoorMapQual},$readsUsed,$totalRead1,$totalRead2,$totalReads,$totalFwd,$totalRev,$totalConc,$totalBases,$avequal,$totalMatch,"; + print STATS "$totalMiss,$errRate,$AllminRead,$aveRead,$AllmaxRead,$perA,$perC,$perG,$perT,$perN\n"; + print STATS "\nNo mate pairs:,$readNo\n"; + print STATS "No ambiguous mates:,$stats{Amb},$pcAmb\%\n"; + print STATS "No poor quality mates:,$stats{PoorQualMates},$pcPQ\%\n\n"; + print STATS "Total Number Of Reads:,$TotalReads\n"; + print STATS "Unmapped Reads:,$Unmapped,$pcUnmapped\%\n\n"; + print STATS "Min Mean Max ReadsOut:,".min([@coverage]).",$avCov,".max([@coverage])."\n"; + print STATS "No. Amplicons > 0.2 x ReadsOut:,$greater,$pcgreater\%\n"; + print STATS "Average ReadsOut Off Target Regions:,$avOff\n" if @off_coverage; + print STATS "\nAmplicons\nNo. Reads < 200, 200 <= No. Reads < 1000, No. Reads >= 1000\n"; + $greater = scalar (grep { $_ < 200 } @coverage); + print STATS "$greater,"; + $greater = scalar (grep { $_ >= 200 && $_ < 1000 } @coverage); + print STATS "$greater,"; + $greater = scalar (grep { $_ >= 1000 } @coverage); + print STATS "$greater\n"; + + print "Cache hits align contig: $cacheHits\n"; + print "Alignment cache hits: $cacheAlignHits\n"; + close(STATS); +} + +# close the log file +if(defined $OPTIONS{l} || defined $OPTIONS{L}) { + my $now = localtime time; + print "\nLog File Closed $now\n"; + close(STDERR); + close(STDOUT); +} + +# thread to find matching amplicon +sub thread_find_contig { + my $FindContig = shift; + my $refFrags = shift; + my $refSeq = shift; + my $ToAlign = shift; + my $Aligned = shift; + my $WriteBam = shift; + my $readArrayRef; + { + lock($FindContig); # Keep other threads from changing the queue's contents + $readArrayRef = $FindContig->dequeue(); + } + + while(defined $readArrayRef) { + my ($read1, $seq1, $qual1, $read2, $seq2, $qual2) = @{ $readArrayRef }; + my $contig = ''; + my $readRef = ''; + my $strand = ''; + + # trim the reads + my ($trimmed1, $qtrim1) = trim($seq1, $qual1, 30, 1); + my ($trimmed2, $qtrim2) = trim($seq2, $qual2, 30, 2); + + # if we have already mapped the reads use the results in the cache + if(defined $cacheAlign{Conc}{"$trimmed1.$trimmed2"}) { + my @array; + { lock(%cacheAlign); @array = @{ $cacheAlign{Conc}{"$trimmed1.$trimmed2"} } }; + $array[0] = $read1; + $array[3] = $qtrim1; + $array[6] = $qtrim2; + { lock($cacheAlignHits); $cacheAlignHits++ }; + + # add aligned reads to queue + { + lock($Aligned); # Keep other threads from changing the queue's contents + $Aligned->enqueue(\@array); + } + } else { + # Use approx match to find most similar amplicon primer to reads + my @index1; + my @index2; + my $index_read1 = -2; + my $index_read2 = -2; + my $index_found = 0; + + if(length($trimmed1) > length($seq1)/2 || length($trimmed2) > length($seq2)/2) { + # we have at least 1 good read + # find closest matching amplicon for read 1 + my $string = substr($seq1, 0, $$refFrags{min}); + if(defined $cache{"$string.1"}) { + $cacheHits++; + { lock(%cache); @index1 = @{ $cache{"$string.1"} } }; + } else { + @index1 = amatch($string, \@{ $$refFrags{seqU} }); + { + lock(%cache); + $cache{"$string.1"} = &share( [] ); + @{ $cache{"$string.1"} } = @index1; + } + } + + # find closest matching amplicon for read 2 + $string = substr($seq2, -$$refFrags{min}); + if(defined $cache{"$string.2"}) { + $cacheHits++; + { lock(%cache); @index2 = @{ $cache{"$string.2"} } }; + } else { + @index2 = amatch($string, \@{ $$refFrags{seqD} }); + { + lock(%cache); + $cache{"$string.2"} = &share( [] ); + @{ $cache{"$string.2"} } = @index2; + } + } + + if($#index1 > 0 || $#index2 > 0) { + # if one read maps to multiple amplicons and the other read matches to just one of them use that + my %in_array2 = map { $_ => 1 } @index2; + my @common = grep { $in_array2{$_} } @index1; + if($#common == 0) { + @index1 = @common; + @index2 = @common; + } elsif($#common > 0) { + if($#index1 > 0) { + # matched to multiple primers so try and use whole read + my @refs; + foreach(@index1) { + if($$refFrags{strand}[$_] eq "F") { + push @refs, $$refSeq{${ $$refFrags{reg} }[$_]}; + } else { + push @refs, revcompl($$refSeq{${ $$refFrags{reg} }[$_]}); + } + } + my @index1b = amatch($seq1, \@refs); + if($#index1b >= 0) { + @index1 = @index1[@index1b]; + } + } + if($#index2 > 0) { + # matched to multiple primers so try and use whole read + my @refs; + foreach(@index2) { + if($$refFrags{strand}[$_] eq "F") { + push @refs, $$refSeq{${ $$refFrags{reg} }[$_]}; + } else { + push @refs, revcompl($$refSeq{${ $$refFrags{reg} }[$_]}); + } + } + my @index2b = amatch($seq2, \@refs); + if($#index2b >= 0) { + @index2 = @index2[@index2b]; + } + } + + # if one read still maps to multiple amplicons and the other read matches to just one of them use that + %in_array2 = map { $_ => 1 } @index2; + @common = grep { $in_array2{$_} } @index1; + if($#common == 0) { + @index1 = @common; + @index2 = @common; + } + } + } + + # if read 1 and read 2 match to the same amplicon assign it + if($#index1 == 0 && @index1 ~~ @index2 ) { + $index_read1 = $index1[0]; + $index_read2 = $index2[0]; + } else { + $index_read1 = -1; + $index_read2 = -1; + } + + # if any read after trimming is less than half the original length discard + if(length($trimmed1) < length($seq1)/2) { + $index_read1 = -2; + } + + if(length($trimmed2) < length($seq2)/2) { + $index_read2 = -2; + } + } + + if($index_read1 > -1 || $index_read2 > -1) { + my $conc = 0; + my %align; + + # at least 1 read matched to an amplicon + if($index_read1 > -1) { + $contig = $$refFrags{reg}[$index_read1]; + $readRef = $$refSeq{$contig}; + $strand = $$refFrags{strand}[$index_read1]; + } elsif($index_read2 > -1) { + $contig = $$refFrags{reg}[$index_read2]; + $readRef = $$refSeq{$contig}; + $strand = $$refFrags{strand}[$index_read2]; + } + + if($index_read1 == $index_read2 && length($readRef) < length($trimmed1) + length($trimmed2)) { + # reads must overlap so can use concensus + $conc = 1; + } else { + my $substr1; + my $substr2; + for(my $length = 5; $length <= min([length($trimmed1), length($trimmed2)]); $length++) { + $substr1 = substr($trimmed1, -$length); + $substr2 = substr($trimmed2, 0, $length); + if($substr1 eq $substr2) { + $conc = 1; + last; + } else { + # calculate max diff allowed + if(!defined $maxdiff{$length}) { + { lock(%maxdiff); $maxdiff{$length} = maxdiff($length, 0.02, 0.04); } + } + + my $diff = levenshtein($substr1, $substr2 ); + if($diff < $maxdiff{$length}) { + $conc = 1; + last; + } + } + } + } + + if($conc) { + # remove any trailing Ns + $trimmed1 = $seq1; + $trimmed2 = $seq2; + $trimmed1 =~ s/N*$//; + $trimmed2 =~ s/^N*//; + $qtrim1 = $qual1; + $qtrim2 = $qual2; + $qtrim1 = substr($qtrim1, 0, length($trimmed1)); + $qtrim2 = substr($qtrim2 , -length($trimmed2)); + } + + if($strand eq "R") { + my $trimmed1_tmp = revcompl($trimmed1); + my $qtrim1_tmp = reverse($qtrim1); + $trimmed1 = revcompl($trimmed2); + $qtrim1 = reverse($qtrim2); + $trimmed2 = $trimmed1_tmp; + $qtrim2 = $qtrim1_tmp; + my $index_read_tmp = $index_read1; + $index_read1 = $index_read2; + $index_read2 = $index_read_tmp; + } + + if($conc) { + my $align; + my $search1 = $trimmed1; + $search1 =~ s/N/./g; + my $search2 = $trimmed2; + $search2 =~ s/N/./g; + if($$refSeq{$contig} =~ /$search1/ && $$refSeq{$contig} =~ /$search2/) { + # the read matches exactly to the reference + $align{Ref1} = $$refSeq{$contig}; + $align{Read1} = $$refSeq{$contig}; + $align{Read1} =~ s/^(.*?)$search1(.*)$/${1}X$2/; + $align{Read1} =~ s/[^X]/-/g; + $align{Read1} = modify($align{Read1}, sub { $_[0] =~ s/X+/$trimmed1/ }); + $align{Read2} = $$refSeq{$contig}; + $align{Read2} =~ s/^(.*)$search2(.*?)$/${1}XX$2/; + $align{Read2} =~ s/[^X]/-/g; + $align{Read2} = modify($align{Read2}, sub { $_[0] =~ s/X+/$trimmed2/ }); + + # add aligned reads to queue + my @array; + { + lock($Aligned); # Keep other threads from changing the queue's contents + @array = ($read1, $align{Ref1}, $align{Read1}, $qtrim1, $align{Ref1}, $align{Read2}, $qtrim2, $contig, $strand, $conc); + $Aligned->enqueue(\@array); + } + + # add alignment to cache + { + lock(%cacheAlign); + $cacheAlign{Conc}{$name} = &share( [] ); + @{ $cacheAlign{Conc}{$name} } = @array; + } + } elsif(defined $cacheAlign{Conc}{"$trimmed1.$trimmed2"}) { + my @array; + { lock(%cacheAlign); @array = @{ $cacheAlign{Conc}{"$trimmed1.$trimmed2"} } }; + $array[0] = $read1; + $array[3] = $qtrim1; + $array[6] = $qtrim2; + { lock($cacheAlignHits); $cacheAlignHits++ }; + + # add aligned reads to queue + { + lock($Aligned); # Keep other threads from changing the queue's contents + $Aligned->enqueue(\@array); + } + } else { + + # align reads using NW2 + { + lock($ToAlign); # Keep other threads from changing the queue's contents + my @array = ($$refSeq{$contig}, $trimmed1, $trimmed2, $read1, $qtrim1, $qtrim2, $contig, $strand, $conc); + #my @array = ($fastaIn, $read1, $qtrim1, $qtrim2, $contig, $strand, $conc, "$trimmed1.$trimmed2"); + $ToAlign->enqueue(\@array); + } + } + } + + if(!$conc && $index_read1 > -1) { + my $search1 = $trimmed1; + $search1 =~ s/N/./g; + if($$refSeq{$contig} =~ /$search1/) { + # the read matches exactly to the reference + $align{Ref1} = $$refSeq{$contig}; + $align{Read1} = $$refSeq{$contig}; + $align{Read1} =~ s/^(.*?)$search1(.*)$/${1}X$2/; + $align{Read1} =~ s/[^X]/-/g; + $align{Read1} = modify($align{Read1}, sub { $_[0] =~ s/X/$trimmed1/ }); + + # add aligned reads to queue + { + lock($Aligned); # Keep other threads from changing the queue's contents + my @array = ($read1, $align{Ref1}, $align{Read1}, $qtrim1, undef, undef, undef, $contig, $strand, $conc); + $Aligned->enqueue(\@array); + } + } elsif(defined $cacheAlign{$contig}{R1}{$trimmed1}) { + my @array; + { lock(%cacheAlign); @array = @{ $cacheAlign{$contig}{R1}{$trimmed1} } }; + $array[0] = $read1; + $array[3] = $qtrim1; + { lock($cacheAlignHits); $cacheAlignHits++ }; + + # add aligned reads to queue + { + lock($Aligned); # Keep other threads from changing the queue's contents + $Aligned->enqueue(\@array); + } + } else { + # align read using NW2 + { + lock($ToAlign); # Keep other threads from changing the queue's contents + my @array = ($$refSeq{$contig}, $trimmed1, undef, $read1, $qtrim1, undef, $contig, $strand, $conc); + #my @array = ($fastaIn, $read1, $qtrim1, undef, $contig, $strand, $conc, $trimmed1); + $ToAlign->enqueue(\@array); + } + } + } + + if(!$conc && $index_read2 > -1) { + my $search2 = $trimmed2; + $search2 =~ s/N/./g; + if($$refSeq{$contig} =~ /$search2/) { + # the read matches exactly to the reference + $align{Ref2} = $$refSeq{$contig}; + $align{Read2} = $$refSeq{$contig}; + $align{Read2} =~ s/^(.*)$search2(.*?)$/${1}X$2/; + $align{Read2} =~ s/[^X]/-/g; + $align{Read2} = modify($align{Read2}, sub { $_[0] =~ s/X/$trimmed2/ }); + + # add aligned reads to queue + { + lock($Aligned); # Keep other threads from changing the queue's contents + my @array = ($read2, undef, undef, undef, $align{Ref2}, $align{Read2}, $qtrim2, $contig, $strand, $conc); + $Aligned->enqueue(\@array); + } + } elsif(defined $cacheAlign{$contig}{R2}{$trimmed2}) { + my @array; + { lock(%cacheAlign); @array = @{ $cacheAlign{$contig}{R2}{$trimmed2} } }; + $array[0] = $read2; + $array[6] = $qtrim2; + { lock($cacheAlignHits); $cacheAlignHits++ }; + + # add aligned reads to queue + { + lock($Aligned); # Keep other threads from changing the queue's contents + $Aligned->enqueue(\@array); + } + } else { + # align read using NW2 + { + lock($ToAlign); # Keep other threads from changing the queue's contents + my @array = ($$refSeq{$contig}, undef, $trimmed2, $read2, undef, $qtrim2, $contig, $strand, $conc); + #my @array = ($fastaIn, $read2, undef, $qtrim2, $contig, $strand, $conc, $trimmed2); + $ToAlign->enqueue(\@array); + } + } + } + } + + if($index_read1 < 0) { + # discard read 1 + # send to writebamfile thread + { + lock($WriteBam); # Keep other threads from changing the queue's contents + $WriteBam->enqueue("$read1\t4\t*\t0\t0\t*\t*\t0\t0\t$seq1\t$qual1\tRG:Z:$readGroup\n"); + } + { lock(%stats); $stats{PoorQual1}++ }; + } + + if($index_read2 < 0) { + # discard read 2 + # send to writebamfile thread + { + lock($WriteBam); # Keep other threads from changing the queue's contents + $WriteBam->enqueue("$read2\t4\t*\t0\t0\t*\t*\t0\t0\t$seq2\t$qual2\tRG:Z:$readGroup\n"); + } + { lock(%stats); $stats{PoorQual2}++ }; + } + + if($index_read1 == -1 && $index_read2 == -1) { + { lock(%stats); $stats{Amb}++ }; + } elsif($index_read1 == -2 && $index_read2 == -2) { + { lock(%stats); $stats{PoorQualMates}++ }; + } + } + + # grab the next reads in queue + { + lock($FindContig); # Keep other threads from changing the queue's contents + $readArrayRef = $FindContig->dequeue(); + } + } + print "Closing Find Amplicon Thread\n"; + return; +} + +sub writebamfile { + my $bamfile = shift; + my $WriteBam = shift; + + # write samfile header + open(PIPE_TO_SAMTOOLS, "| samtools view -Sb -o $bamfile - 2>&1") or die "Can't fork: $!\n"; + foreach my $key (@OrderedContigs) { + print PIPE_TO_SAMTOOLS "\@SQ\tSN:$refLength{$key}{newReg}\tLN:$refLength{$key}{length}\n"; + } + print PIPE_TO_SAMTOOLS "\@RG\tID:$readGroup\tPL:Illumina\tSM:$sampleName\n"; + + my $read; + { + lock($WriteBam); # Keep other threads from changing the queue's contents + $read = $WriteBam->dequeue(); + } + while(defined $read) { + + # write read to bamfile + print PIPE_TO_SAMTOOLS $read; + + # grab next item in queue + { + lock($WriteBam); # Keep other threads from changing the queue's contents + $read = $WriteBam->dequeue(); + } + } + print "Closing Bam Writer\n"; + close PIPE_TO_SAMTOOLS or die "pipe could not be closed"; + return; +} + +sub thread_process_align { + my $Aligned = shift; + my $refFrags = shift; + my $refSeq = shift; + my $Samfile = shift; + my $WriteBam = shift; + + my $readArrayRef; + { + lock($Aligned); # Keep other threads from changing the queue's contents + $readArrayRef = $Aligned->dequeue(); + } + while(defined $readArrayRef) { + # grab the alignment + my ($read, $ref1, $read1, $qtrim1, $ref2, $read2, $qtrim2, $contig, $strand, $conc) = @{ $readArrayRef }; + # grab primer lengths + my $PU = $$refFrags{primers}{$contig}{U}{F}{length}; + my $PD = $$refFrags{primers}{$contig}{D}{F}{length}; + my %readStats; + my $orig_read1 = $read1; + my $orig_read2 = $read2; + my $orig_qtrim1 = $qtrim1; + my $orig_qtrim2 = $qtrim2; + $orig_read1 =~ s/-//g if defined $orig_read1; + $orig_read2 =~ s/-//g if defined $orig_read2; + + if(defined $OPTIONS{d}) { + # debug info + print "Alignment With Primers\n"; + print "$ref1\n$read1\n" if defined $read1; + print "$ref2\n$read2\n" if defined $read2; + print "\nFinal Alignment:\n"; + } + + # calculate where primers begin and end + if(defined $read1) { + if($strand eq 'F') { + $readStats{$contig}{Read1} = 1; + } else { + $readStats{$contig}{Read2} = 1; + } + + # determine the primer locations + my $offset = 0; + for(my $occurance = 0; $occurance < $PU; $occurance++) { + substr($ref1, $offset) =~ /^.*?([^-])/; + $offset += $-[1] + 1; + } + my $length = 0; + for(my $occurance = 0; $occurance < $PD; $occurance++) { + substr($ref1, 0, length($ref1) - $length) =~ /.*([^-]).*?$/; + $length = length($ref1) - $-[1]; + } + $length = length($ref1) - $length - $offset; + + # remove primers and adjust quality scores + $ref1 = substr($ref1, $offset, $length); + my $qualOffset = ( substr($read1, 0, $offset) =~ tr/ACTGN// ); + $read1 = substr($read1, $offset, $length); + $length = ( $read1 =~ tr/ACTGN// ); + $qtrim1 = substr($qtrim1, $qualOffset, $length); + } + + if(defined $read2) { + if($strand eq 'F') { + $readStats{$contig}{Read2} = 1; + } else { + $readStats{$contig}{Read1} = 1; + } + + # determine the primer locations + my $offset = 0; + for(my $occurance = 0; $occurance < $PU; $occurance++) { + substr($ref2, $offset) =~ /^.*?([^-])/; + $offset += $-[1] + 1; + } + my $length = 0; + for(my $occurance = 0; $occurance < $PD; $occurance++) { + substr($ref2, 0, length($ref2) - $length) =~ /.*([^-]).*?$/; + $length = length($ref2) - $-[1]; + } + $length = length($ref2) - $length - $offset; + + # remove primers and adjust quality scores + $ref2 = substr($ref2, $offset, $length); + my $qualOffset = ( substr($read2, 0, $offset) =~ tr/ACTGN// ); + $read2 = substr($read2, $offset, $length); + $length = ( $read2 =~ tr/ACTGN// ); + $qtrim2 = substr($qtrim2, $qualOffset, $length); + } + + # generate a concensus sequence if we have both read1 and read2 and they overlap + if($conc) { + # the following are 0 based indexes + my $R1qual = 0; # index for sanger qual read 1 + my $R2qual = 0; # index for sanger qual read 2 + my $aindex = 0; # index for alignment + my $q1; + my $q2; + my $concensus; + my $concqual; + + $readStats{$contig}{ConcReads} = 1; + $readStats{$contig}{ConcCorrectBases} = 0; + $readStats{$contig}{ConcErrorsBases} = 0; + while($aindex < length($ref1)) { + # base1 is current base from read 1 + # base2 is current base from read 2 + my $base1 = substr($read1, $aindex, 1); + my $base2 = substr($read2, $aindex, 1); + + # grab the next quality score + if($R1qual < length($qtrim1)) { + $q1 = substr($qtrim1, $R1qual, 1); + } + if($R2qual < length($qtrim2)) { + $q2 = substr($qtrim2, $R2qual, 1); + } + + # if we have a base in either + if(($base1 ne '-' && $base1 ne 'N') || ($base2 ne '-' && $base2 ne 'N')) { + if(($base2 eq "-" || $base2 eq 'N') && $base1 ne "-" && $base1 ne 'N') { + # we are only in read 1 or read 2 is a N + $concensus .= $base1; + $concqual .= $q1; + $R1qual++; + if($base2 eq 'N') { + $R2qual++; + } + } elsif(($base1 eq "-" || $base1 eq 'N') && $base2 ne "-" && $base2 ne 'N') { + # we are only in read 2 or read 1 is a N + $concensus .= $base2; + $concqual .= $q2; + $R2qual++; + if($base1 eq 'N') { + $R1qual++; + } + } else { + # we are in the overlap region + my $maxqual = sangerMax($q1, $q2); + if($base1 eq $base2) { + # both reads agree + $readStats{$contig}{ConcCorrectBases}++; + $concensus .= $base1; + $concqual .= $maxqual; + } elsif($q1 eq $q2) { + # reads have different bases but the same score + $readStats{$contig}{ConcErrorsBases}++; + $concensus .= 'N'; + $concqual .= '#'; + } elsif($q1 eq $maxqual) { + # reads have different bases but read 1 has the best score + $concensus .= $base1; + $concqual .= $maxqual; + $readStats{$contig}{ConcErrorsBases}++; + } else { + # reads have different bases but read 2 has the best score + $concensus .= $base2; + $concqual .= $maxqual; + $readStats{$contig}{ConcErrorsBases}++; + } + $R1qual++; + $R2qual++; + } + } elsif($base1 eq 'N' || $base2 eq 'N') { + if($base2 eq '-') { + # one is a deletion the other is an N + $concensus .= $base1; + $concqual .= $q1; + $R1qual++; + } elsif($base1 eq '-') { + # one is a deletion the other is an N + $concensus .= $base2; + $concqual .= $q2; + $R2qual++; + } elsif($base1 eq 'N' && $base2 eq 'N') { + # both reads are an N + $concensus .= 'N'; + $concqual .= sangerMax($q1, $q2); + $R1qual++; + $R2qual++; + } + } else { + $concensus .= '-'; + } + $aindex++; + } + $read1 = $concensus; + $qtrim1 = $concqual; + $read2 = undef; + $qtrim2 = undef; + } + + if(defined $read1) { + # remove any trailing - and send to samfile_line + $read1 =~ s/^-*?([^-].*)/$1/; + $ref1 = substr($ref1, -length($read1)); + $read1 =~ s/(.*[^-])-*?$/$1/; + $ref1 = substr($ref1, 0, length($read1)); + + my $tmp = $ref1; + $tmp =~ s/-//g; + my $rstart = index($$refSeq{$contig}, $tmp) - $PU + 1; + + # generate sam file line + { + lock($Samfile); # Keep other threads from changing the queue's contents + my @array = ($read, $qtrim1, $rstart, $read1, $ref1, $contig, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, \%readStats); + $Samfile->enqueue(\@array); + } + } + + if(defined $read2) { + # remove any trailing - and send to samfile_line + $read2 =~ s/^-*?([^-].*)/$1/; + $ref2 = substr($ref2, -length($read2)); + $read2 =~ s/(.*[^-])-*?$/$1/; + $ref2 = substr($ref2, 0, length($read2)); + + my $tmp = $ref2; + $tmp =~ s/-//g; + my $rstart = rindex($$refSeq{$contig}, $tmp) - $PU + 1; + + # generate sam file line + { + lock($Samfile); # Keep other threads from changing the queue's contents + my @array = ($read, $qtrim2, $rstart, $read2, $ref2, $contig, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, \%readStats); + $Samfile->enqueue(\@array); + } + } + + if(defined $OPTIONS{d}) { + # debug info + print "$ref1\n$read1\n" if defined $read1; + print "$ref2\n$read2\n" if defined $read2; + print "$qtrim1\n" if defined $read1; + print "$qtrim2\n" if defined $read2; + print "\n\n"; + } + + # grab next item in queue + { + lock($Aligned); # Keep other threads from changing the queue's contents + $readArrayRef = $Aligned->dequeue(); + } + } + print "Closing Alignment Processor\n"; + return; +} + +# function to take reverse compliment +sub revcompl { + my $string = shift @_; + my $revcomp = reverse($string); + $revcomp =~ tr/ACGTacgtKMSWRYBDHVkmswrybdhv/TGCAtgcaMKSWYRVHDBmkswyrvhdb/; + return $revcomp; +} + +# function to convert sanger fastq quality scores to phred scores +sub sanger2phred { + my $sang = shift @_; + my @sanger = split("", $sang); + my @phred; + foreach my $q (@sanger) { + my $Q = ord($q) - 33; + push @phred, $Q; + } + return @phred; +} + +# function to return the largest sanger quality score +sub sangerMax { + my ($s1, $s2) = @_; + my $Q1 = ord($s1) - 33; + my $Q2 = ord($s2) - 33; + if($Q1 > $Q2) { + return $s1; + } else { + return $s2; + } +} + +# function to read a fasta file as a hash +sub read_fasta_file { + my ($fasta_file, $seqs) = @_; + my $header; + my $seq = ""; + open (IN, $fasta_file) or die "can't open $fasta_file: $!\n"; + while (my $line = ) { + chomp $line; + if (grep />/, $line) { + if ($seq ne "") { + $seqs->{$header} = $seq; + } + $header = $line; + + $header =~ s/^>//; # remove ">" + $header =~ s/\s+$//; # remove trailing whitespace + $seq = ""; # clear out old sequence + } else { + $line =~ s/\s+//g; # remove whitespace + $seq .= $line; # add sequence + } + } + close IN; # finished with file + + if ($seq ne "") { # handle last sequence + $seqs->{$header} = $seq; + } + + return; +} + +# minimal element of a list +# +sub min +{ + my @list = @{$_[0]}; + my $min = $list[0]; + + foreach my $i (@list) + { + $min = $i if ($i < $min); + } + + return $min; +} + +# maximal element of a list +# +sub max +{ + my @list = @{$_[0]}; + my $max = $list[0]; + + foreach my $i (@list) + { + $max = $i if ($i > $max); + } + + return $max; +} + +# trim sequences using the BWA algorithm +sub trim { + + my ($s, $q, $threshold, $read) = @_; # read sequence and quality scores + if($read == 2) { + $q = reverse($q); + $s = reverse($s); + } + + my @array = sanger2phred($q); + my $length = scalar @array; + + # only calculate if quality fails near end + if( $array[$#array] >= $threshold ){ + if($read == 2) { + $q = reverse($q); + $s = reverse($s); + } + return ($s, $q); + } + + # run bwa equation + my @arg; + for( my $i = 0; $i < $length - 1; $i++ ){ + my $x = $i + 1; + for( my $j = $x; $j < $length; $j++ ){ + $arg[$x] += $threshold - $array[$j]; + } + } + + # find number of 5' bases to retain + my $index = 0; + my $maxval = 0; + for ( 1 .. $#arg ){ + if ( $maxval < $arg[$_] ){ + $index = $_; + $maxval = $arg[$_]; + } + } + + # trim reads + $s = substr($s,0,$index); + $q = substr($q,0,$index); + if($read == 2) { + $s = reverse($s); + $q = reverse($q); + } + + # return seq and quality + return ($s, $q); +} + + +sub samfile_line { + # grab the variables + my ($Samfile, $WriteBam, $refSeq) = @_; + my %SamStats; + my %minLength; + my %maxLength; + my $readArrayRef; + { + lock($Samfile); # Keep other threads from changing the queue's contents + $readArrayRef = $Samfile->dequeue(); + } + + while(defined $readArrayRef) { + my ($read, $qual, $rstart, $align, $ref, $reg, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, $readStats) = @{ $readArrayRef }; + my $seq = $align; + $seq =~ s/-//g; + + # calculate the edit difference - count indels as only 1 missmatch hence tranform the strings so that all indels have length 1 + my $sam; + my $s1; + my $s2; + my @s1 = split("", $align); + my @s2 = split("", $ref); + + my $indel = 0; + for(my $i = 0; $i <= $#s1; $i++) { + if(($s1[$i] ne "-" && $s2[$i] ne "-") || !$indel) { + $s1 .= $s1[$i]; + $s2 .= $s2[$i]; + $indel = 0; + if($s1[$i] eq "-" || $s2[$i] eq "-") { + $indel = 1; + } + } + } + my $mq = levenshtein($s1, $s2); + + my $cl = length($seq); + # calculate max diff allowed + if(!defined $maxdiff{$cl}) { + { lock(%maxdiff); $maxdiff{$cl} = maxdiff($cl, 0.02, 0.04); } + } + + if($mq > $maxdiff{$cl} || $cl == 0) { + { lock(%stats); $stats{$reg}{PoorMapQual}++; $stats{PoorMapQual}++; } + if(defined $orig_read1) { + $sam = "$read\t4\t*\t0\t0\t*\t*\t0\t0\t$orig_read1\t$orig_qtrim1\tRG:Z:$readGroup\n"; + # send to writebamfile thread + { + lock($WriteBam); # Keep other threads from changing the queue's contents + $WriteBam->enqueue($sam); + } + } + if(defined $orig_read2) { + $read =~ s/[1](:[A-Z]+:[0-9]+:[ACTGN]+)$/2$1/; + $sam = "$read\t4\t*\t0\t0\t*\t*\t0\t0\t$orig_read2\t$orig_qtrim2\tRG:Z:$readGroup\n"; + # send to writebamfile thread + { + lock($WriteBam); # Keep other threads from changing the queue's contents + $WriteBam->enqueue($sam); + } + } + } else { + $mq = 60; + + # add any stats from earlier + foreach my $key1 (keys %$readStats) { + foreach my $key2 (keys % { $$readStats{$key1} }) { + $SamStats{$key1}{$key2} += $$readStats{$key1}{$key2}; + } + } + + # keep track of quality scores + $SamStats{$reg}{Bases} += $cl; + $SamStats{$reg}{Quality} += $_ for sanger2phred($qual); + + # keep track of read lengths + if(defined $minLength{$reg}) { + $minLength{$reg} = min([$cl, $minLength{$reg}]); + $maxLength{$reg} = max([$cl, $maxLength{$reg}]); + } else { + $minLength{$reg} = $cl; + $maxLength{$reg} = $cl; + } + + # keep track of strand and bases seen + if($strand eq 'F') { + $SamStats{$reg}{NumberForward}++; + } else { + $SamStats{$reg}{NumberReverse}++; + } + $SamStats{$reg}{TotalA} += ($seq =~ tr/A//); + $SamStats{$reg}{TotalC} += ($seq =~ tr/C//); + $SamStats{$reg}{TotalG} += ($seq =~ tr/G//); + $SamStats{$reg}{TotalT} += ($seq =~ tr/T//); + $SamStats{$reg}{TotalN} += ($seq =~ tr/N//); + + # keep count the number of reads for each contig + $SamStats{$reg}{Reads}++; + + # calculate strand info + if($strand eq 'F') { + $strand = 0; + } else { + $strand = 16; + } + # calculate cigar + my $cigar = ""; + my $M = 0; + my $D = 0; + my $I = 0; + my $Subs = 0; + for(my $i = 0; $i < length($align); $i++) { + if(substr($align, $i, 1) eq "-") { + $D++; + if($M) { + $cigar .= "${M}M"; + $M = 0; + } elsif($I) { + $cigar .= "${I}I"; + $I = 0; + } + } elsif(substr($ref, $i, 1) eq "-") { + $I++; + if($M) { + $cigar .= "${M}M"; + $M = 0; + } elsif($D) { + $cigar .= "${D}D"; + $D = 0; + } + } else { + $M++; + if($D) { + $cigar .= "${D}D"; + $D = 0; + } elsif($I) { + $cigar .= "${I}I"; + $I = 0; + } + $Subs++ if substr($ref, $i, 1) ne substr($align, $i, 1); + } + } + if($M) { + $cigar .= "${M}M"; + } elsif($I) { + $cigar .= "${I}I"; + } elsif($D) { + $cigar .= "${D}D"; + } + + # construct sam file line + $sam = "$read\t$strand\t$refLength{$reg}{newReg}\t$rstart\t$mq\t$cigar\t*\t0\t0\t$seq\t$qual\tRG:Z:$readGroup\n"; + + # send to writebamfile thread + { + lock($WriteBam); # Keep other threads from changing the queue's contents + $WriteBam->enqueue($sam); + } + } + + #grab the next read + { + lock($Samfile); # Keep other threads from changing the queue's contents + $readArrayRef = $Samfile->dequeue(); + } + } + + { + # add stats from earlier + lock(%stats); + foreach my $key1 (keys %SamStats) { + $stats{$key1}{MinLength} = min([$minLength{$key1}, $stats{$key1}{MinLength}]); + $stats{$key1}{MaxLength} = max([$maxLength{$key1}, $stats{$key1}{MaxLength}]); + foreach my $key2 (keys % { $SamStats{$key1} }) { + $stats{$key1}{$key2} += $SamStats{$key1}{$key2}; + } + } + + } + print "Closing Sam File Line Generator\n"; + + return; +} + +# perform the alignment +sub nwAlign { + my ($ToAlign, $Aligned, $regions) = @_; + my $readArrayRef; + { + lock($ToAlign); # Keep other threads from changing the queue's contents + $readArrayRef = $ToAlign->dequeue(); + } + + while(defined $readArrayRef) { + # we have picked a contig + # check strand and take rev comp of ref if necessary + my ($ref, $left, $right, $read, $qtrim1, $qtrim2, $contig, $strand, $conc) = @{ $readArrayRef }; + my %align; + my $name; + #! Parameters + #! ClustalW Parameters + my $matchScore = 1.9; + my $missmatch_penalty = -0.4; + my $gapPenOpen = -10.0; + my $gapPenExt = -0.4; + my $leftAndRightGapPen = true; + my $debug = false; + my $cons = ""; + my $align1= ""; + my $align2= ""; + my $align3= ""; + my $score = 0.0; + my $singleAlign = false; #aligning 2 or 3 sequences + if($conc) + { + #got to concatenate seqs and align + $score = align3($ref, $right, $left, $align1, $align2, $align3, $matchScore, $missmatch_penalty, $gapPenOpen, $gapPenExt, $leftAndRightGapPen, $debug); + $name = "$left.$right"; + print "$contig\n$align1\n$align2\n$align3\n\n" if defined $$regions{$contig}; + } + elsif(!defined $right) + { + #align with left + $score = align2($ref, $left, $align1, $align2, $matchScore, $missmatch_penalty, $gapPenOpen, $gapPenExt, false, true, $debug); + $singleAlign = true; + $name = $left; + print "$contig\n$align1\n$align2\n\n" if defined $$regions{$contig}; + } + elsif(!defined $left) + { + #align with right + $score = align2($ref, $right, $align1, $align2, $matchScore, $missmatch_penalty, $gapPenOpen ,$gapPenExt ,true, false, $debug); + $singleAlign = true; + $name = $right; + print "$contig\n$align1\n$align2\n\n" if defined $$regions{$contig}; + } + + # add aligned reads to queue + # cache results + my $type; + { + lock($Aligned); # Keep other threads from changing the queue's contents + + my @array; + if($singleAlign && !defined $right) + { + @array = ($read, $align1, $align2, $qtrim1, undef, undef, undef, $contig, $strand, $conc); + { + lock(%cacheAlign); + $cacheAlign{$contig}{R1}{$name} = &share( [] ); + @{ $cacheAlign{$contig}{R1}{$name} } = @array; + } + } + elsif($singleAlign && !defined $left ) + { + @array = ($read, undef, undef, undef, $align1, $align2, $qtrim2, $contig, $strand, $conc); + { + lock(%cacheAlign); + $cacheAlign{$contig}{R2}{$name} = &share( [] ); + @{ $cacheAlign{$contig}{R2}{$name} } = @array; + } + } + else + { + @array = ($read, $align1, $align2, $qtrim1, $align1, $align3, $qtrim2, $contig, $strand, $conc); + { + lock(%cacheAlign); + $cacheAlign{Conc}{$name} = &share( [] ); + @{ $cacheAlign{Conc}{$name} } = @array; + } + } + $Aligned->enqueue(\@array); + } + + #grab the next read + { + lock($ToAlign); # Keep other threads from changing the queue's contents + $readArrayRef = $ToAlign->dequeue(); + } + } + # clean up + print "Closing NW2 Aligner\n"; + return; +} + +# function to calculat the minimum edit distance threshold (taken from bwa) +sub maxdiff { + my ($l, $err, $thres) = @_; + my $lambda = exp(-$l*$err); + my $y = 1.0; + my $x = 1; + my $sum = $lambda; + for(my $k = 1; $k < 1000; $k++) { + $y *= $l * $err; + $x *= $k; + $sum += $lambda * $y / $x; + if(1.0 - $sum < $thres) { + return $k; + } + } + return 2; +} + +# using approx matching return the positions in $list of those most closely matching $string +sub amatch { + my ($string, $list) = @_; + my @d = adistr($string, @$list); + my @d_index = sort { abs($d[$a]) <=> abs($d[$b]) } 0..$#$list; + + return if $d[$d_index[0]] > 0.25; + + my @pos = ($d_index[0]); + my $i = 1; + while($i <= $#d_index && $d[$d_index[0]] == $d[$d_index[$i]]) { + push @pos, $d_index[$i]; + $i++; + } + return @pos; +} + +# a sorting algorithm to sort the chromosomes +sub sortChrom { + my ($achr, $astart, $aend) = split(":|-", $a); + my ($bchr, $bstart, $bend) = split(":|-", $b); + $achr =~ s/X/23/; + $achr =~ s/Y/24/; + $achr =~ s/MT?/25/; + $bchr =~ s/X/23/; + $bchr =~ s/Y/24/; + $bchr =~ s/MT?/25/; + if($achr < $bchr) { + return -1; + } elsif($bchr < $achr) { + return 1; + } elsif($astart < $bstart) { + return -1; + } elsif($bstart < $astart) { + return 1; + } elsif($aend < $bend) { + return -1; + } elsif($bend < $aend) { + return 1; + } else { + return 0; + } +} + +# pass a code reference +sub modify { + my($text, $code) = @_; + $code->($text); + return $text; +} + +sub split_region { + my ($chr, $start, $end) = split(":|-", shift); + $end = $start unless defined $end; + return($chr, $start, $end); +} + +sub error_rate { + my ($x, $y) = @_; + my @x = split("", $x); + my @y = split("", $y); + my $s = 0; + my $l = 0; + for(my $i = 0; $i <= $#x; $i++) { + if($x[$i] ne "-" && $y[$i] ne "-") { + $s++ if $x[$i] ne $y[$i]; + $l++; + } + } + return($s / $l); +} + + diff -r 000000000000 -r d32bddcff685 alignCustomAmplicon/alignCustomAmplicon_wrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignCustomAmplicon/alignCustomAmplicon_wrapper.xml Wed Jan 09 00:24:09 2013 -0500 @@ -0,0 +1,82 @@ + + align amplicon to reference with primers + picard + samtools + gzip + Inline-CPP + + alignCustomAmplicon.pl -s -r -p 4 -o $output $refFile.fields.path $read1 $read2 $primers + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +**What it does** + +It is an amplicon aligner that uses primers for higher accuracy. + +Reads with primers are aligned to the reference, then primers are discarded. + +If both reads are long enough, they are aligned with the reference and a consensus alignment is generated. + +Otherwise, each read is aligned separately. + +Sequences with bad quality reads are discarded. + + + +**Input** + +ref: + + Fasta file of ref gnome + +read1: + + Fastq file of left to right read + (Can also be compressed [fastq.gz]) + +read2: + + Fastq file of right to left read + (Can also be compressed [fastq.gz]) + +primers: + + Text file with primers name and length (see example) + +Example primers format:: + + #Name_of_amplicon length_left length_right + 1:115256345-115256520 23 23 + 1:115256436-115256606 25 22 + 1:115256530-115256724 23 23 + 1:115256532-115256723 23 23 + 4:55151914-55152086 21 23 + 4:55151935-55152132 20 23 + 4:55151991-55152182 23 24 + 4:55591944-55592136 23 24 + 4:55592065-55592263 20 23 + 4:55593504-55593674 24 25 + ... + + + + diff -r 000000000000 -r d32bddcff685 alignCustomAmplicon/all_fasta.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignCustomAmplicon/all_fasta.loc.sample Wed Jan 09 00:24:09 2013 -0500 @@ -0,0 +1,18 @@ +#This file lists the locations and dbkeys of all the fasta files +#under the "genome" directory (a directory that contains a directory +#for each build). The script extract_fasta.py will generate the file +#all_fasta.loc. This file has the format (white space characters are +#TAB characters): +# +# +# +#So, all_fasta.loc could look something like this: +# +#apiMel3 apiMel3 Honeybee (Apis mellifera): apiMel3 /path/to/genome/apiMel3/apiMel3.fa +#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /path/to/genome/hg19/hg19canon.fa +#hg19full hg19 Human (Homo sapiens): hg19 Full /path/to/genome/hg19/hg19full.fa +# +#Your all_fasta.loc file should contain an entry for each individual +#fasta file. So there will be multiple fasta files for each build, +#such as with hg19 above. +# diff -r 000000000000 -r d32bddcff685 alignCustomAmplicon/tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignCustomAmplicon/tool_data_table_conf.xml.sample Wed Jan 09 00:24:09 2013 -0500 @@ -0,0 +1,8 @@ + + + + + value, dbkey, name, path + +
+
diff -r 000000000000 -r d32bddcff685 alignCustomAmplicon/tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignCustomAmplicon/tool_dependencies.xml Wed Jan 09 00:24:09 2013 -0500 @@ -0,0 +1,19 @@ + + + + + + http://downloads.sourceforge.net/project/picard/picard-tools/1.56/picard-tools-1.56.zip + + picard-tools-1.56 + $INSTALL_DIR/jars + + + $INSTALL_DIR/jars + + + + + + + diff -r 000000000000 -r d32bddcff685 alignCustomAmplicon/utils.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignCustomAmplicon/utils.cpp Wed Jan 09 00:24:09 2013 -0500 @@ -0,0 +1,498 @@ +#include +#include +#include + +#define max(a,b,c) a > b ? ( a > c ? a : c ) : ( b > c ? b : c ) ; +#define min(a,b,c) a < b ? ( a < c ? a : c ) : ( b < c ? b : c ) ; +using namespace std; + +void printMatrix (double* m, int rows, int cols) +{ + for(int i = 0; i < rows; i++) + { + for(int j = 0; j < cols; j++) + { + printf("%.2f ", *(m + i*cols +j)); + } + printf("\n"); + } + +} + +int levenshteinDistance(string s, string t) +{ + //Calculate Edit distance of 2 strings + if(s == t) + return 0; + int size1 = s.length()+1; + int size2 = t.length()+1; + int d[size1][size2]; + + for (int i=0; idel && match > insert) + { + scores[i][j] = match; + dir[i][j] = 'D'; + gaps[i][j] = 'N'; + } + else if(del > insert) + { + scores[i][j] = del; + dir[i][j] = 'U'; + gaps[i][j] = 'O'; + } + else + { + scores[i][j] = insert; + dir[i][j] = 'L'; + gaps[i][j] = 'O'; + } + } + + //debug + //if (debug) + // printMatrix(*scores, size1+1, size2+1); + //printMatrix(*dir, size1+1, size2+1); + + string alignment1=""; + string alignment2=""; + + + int cont1 = size1; + int cont2 = size2; + int max1 = size1; + int max2 = size2; + double num = scores[size1][size2];; + + if (noEndGapPenalty) + { + //Search Maxes + + for(int i = 1; i <= size1; i++) + { + if(num 0 && cont2 > 0) + { + if(dir[cont1][cont2] == 'D') + { + alignment1+= seq1[cont1-1]; + alignment2+= seq2[cont2-1]; + cont1--; + cont2--; + + } + else if(dir[cont1][cont2] == 'L') + { + alignment1+= '-'; + alignment2+= seq2[cont2-1]; + cont2--; + } + else + { + alignment1+= seq1[cont1-1]; + alignment2+= '-'; + cont1--; + + } + } + + while (cont1 > 0) + { + alignment1+= seq1[cont1-1]; + alignment2+= '-'; + cont1--; + } + + while (cont2 > 0) + { + alignment1+= '-'; + alignment2+= seq2[cont2-1]; + cont2--; + } + + alignment1 = string (alignment1.rbegin(),alignment1.rend()); + alignment2 = string (alignment2.rbegin(),alignment2.rend()); + + + if (noEndGapPenalty) + { + //Need to fill out rest of alignment... + if (max1 != size1) + { + for (int i=max1; i nw_alignAlignments (vector a1, vector a2, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool noFrontGapPenalty, bool noEndGapPenalty, bool debug) +{ + //Do NW alignment of 2 strings... + int size1 = a1[0].length(); + int size2 = a2[0].length(); + int v1 = a1.size(); + int v2 = a2.size(); + int combs = v1 * v2; + + vector msa; + msa.clear(); + vector alignment1, alignment2; + + for (int i=0;i del && match > insert) + { + scores[i][j] = match; + dir[i][j] = 'D'; + gaps[i][j] = 'N'; + } + else if(del > insert) + { + scores[i][j] = del; + dir[i][j] = 'U'; + gaps[i][j] = 'O'; + } + else + { + scores[i][j] = insert; + dir[i][j] = 'L'; + gaps[i][j] = 'O'; + } + } + + //debug +// if (debug) +// printMatrix(*scores, size1+1, size2+1); + //printMatrix(*dir, size1+1, size2+1); + + + int cont1 = size1; + int cont2 = size2; + int max1 = size1; + int max2 = size2; + double num = scores[size1][size2]; + + if (noEndGapPenalty) + { + //Search Maxes + + for(int i = 1; i <= size1; i++) + { + if(num 0 && cont2 > 0) + { + if(dir[cont1][cont2] == 'D') + { + + for(int z=0;z 0) + { + for(int z=0;z 0) + { + for(int z=0;z