Mercurial > repos > fcaramia > custom_amplicon_alignment
comparison alignCustomAmplicon/alignCustomAmplicon.pl @ 0:d32bddcff685 draft
Uploaded
author | fcaramia |
---|---|
date | Wed, 09 Jan 2013 00:24:09 -0500 |
parents | |
children | 22f35f830f48 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d32bddcff685 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 # script to convert paired end reads to a concensus sequence | |
4 | |
5 # Created by Jason Ellul, Feb 2012 | |
6 # Modified by Franco Caramia and Jason Ellul, July 2012 and November 2012 | |
7 # 0.4 Fixed mapping of concensus region (large insertions and deletions now picked up correctly) | |
8 # Added missalignment penalty | |
9 # Read 1 and 2 are now aligned first then they are aligned to the reference | |
10 # 0.5 Added ability to name amplicons by includinfo the info in the primers file | |
11 # Added the ability to output the alignments in a particular region using the new flag -a | |
12 # Added some extra stats to the output | |
13 | |
14 use strict; | |
15 use warnings; | |
16 use Getopt::Std; | |
17 use File::Basename; | |
18 use File::Find; | |
19 use File::Copy; | |
20 use File::MMagic; | |
21 use File::Temp qw/ tempfile tempdir /; | |
22 use Cwd 'abs_path'; | |
23 use threads; | |
24 use threads::shared; | |
25 use List::Util qw(sum); | |
26 use Thread::Queue; | |
27 use String::Approx 'adistr'; | |
28 use POSIX qw(floor ceil); | |
29 String::Approx::cache_max(25000); | |
30 String::Approx::cache_purge(0.25); | |
31 use constant false => 0; | |
32 use constant true => 1; | |
33 use Inline CPP => Config => AUTO_INCLUDE => '#include "utils.cpp"'; | |
34 use Inline CPP => <<'END_CPP'; | |
35 | |
36 | |
37 | |
38 | |
39 int levenshtein(char* seq1, char* seq2) | |
40 { | |
41 // Returns edit distance of 2 sequences | |
42 string sam1(seq1); | |
43 string sam2(seq2); | |
44 return levenshteinDistance(sam1, sam2); | |
45 } | |
46 | |
47 double align2(char* seq1, char* seq2, SV* res1, SV* res2, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool noFrontGapPenalty, bool noEndGapPenalty, bool debug ) | |
48 { | |
49 // Returns Sequence alignment of 2 sequences | |
50 // res1, res2 contain the alignment | |
51 //seq1: reference sequence | |
52 //seq2: amplicon sequence | |
53 //matchScore: reward for matching a nucleotide | |
54 //gapPenOpen: penalty for opening a gap | |
55 //gapPenext: penalty for extending a gap | |
56 //noFrontGapPenalty: No penalty will be impose for gaps at the beggining of the alignment | |
57 //noEndGapPenalty: No penalty will be impose for gaps at the end of the alignment | |
58 //debug: prints scores, alignments, etc... | |
59 | |
60 string sam1(seq1); | |
61 string sam2(seq2); | |
62 double ret = nw_align2(sam1, sam2, matchScore, missmatch_penalty, gapPenOpen, gapPenExt, noFrontGapPenalty, noEndGapPenalty, debug); | |
63 //Setting the return alignments | |
64 sv_setpvn(res1, (char*)sam1.c_str(), sam1.length()); | |
65 sv_setpvn(res2, (char*)sam2.c_str(), sam2.length()); | |
66 return ret; | |
67 } | |
68 | |
69 void adjustSequence(int refSize, string& seq, char dir) | |
70 { | |
71 | |
72 int adjust = refSize - seq.length(); | |
73 if(dir == 'r') | |
74 for(int i=0; i<adjust; ++i) seq = '-' + seq; | |
75 else | |
76 for(int i=0; i<adjust; ++i) seq += '-'; | |
77 } | |
78 | |
79 double align3(char* ref, char* right,char* left , SV* res1, SV* res2, SV* res3, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool leftAndRightGapPen, bool debug ) | |
80 { | |
81 // Returns Sequence alignment of 3 sequences | |
82 // res1, res2, res3 contain the alignment | |
83 //ref: reference sequence | |
84 //left: left amplicon sequence | |
85 //right: right amplicon sequence | |
86 //matchScore: reward for matching a nucleotide | |
87 //gapPenOpen: penalty for opening a gap | |
88 //gapPenext: penalty for extending a gap | |
89 //leftAndRightGapPen: Smart way to penalise according to the amplicon | |
90 //debug: prints scores, alignments, etc... | |
91 | |
92 | |
93 string leftSeq(left); | |
94 string rightSeq(right); | |
95 | |
96 bool noFrontGapPenalty = false; | |
97 bool noEndGapPenalty = false; | |
98 | |
99 //Align right with left reads | |
100 if(leftAndRightGapPen) | |
101 { | |
102 noFrontGapPenalty = true; | |
103 noEndGapPenalty = true; | |
104 } | |
105 | |
106 //first alignment the 2 reads | |
107 double ret = nw_align2(rightSeq, leftSeq, matchScore, missmatch_penalty*8, gapPenOpen*5, gapPenExt, noFrontGapPenalty, noEndGapPenalty, debug); | |
108 | |
109 //Setting the return alignments | |
110 vector <string> msa; | |
111 vector <string> partial; | |
112 vector <string> seq; | |
113 | |
114 partial.push_back(rightSeq); | |
115 partial.push_back(leftSeq); | |
116 seq.push_back(ref); | |
117 | |
118 msa = nw_alignAlignments(partial, seq , matchScore, missmatch_penalty, gapPenOpen, gapPenExt, false, false, debug); | |
119 sv_setpvn(res1, (char*)msa[2].c_str(), msa[2].length()); | |
120 sv_setpvn(res2, (char*)msa[1].c_str(), msa[1].length()); | |
121 sv_setpvn(res3, (char*)msa[0].c_str(), msa[0].length()); | |
122 return ret; | |
123 } | |
124 | |
125 END_CPP | |
126 | |
127 $| = 1; | |
128 | |
129 # Grab and set all options | |
130 my $version = "0.5"; | |
131 my %OPTIONS = (p => 1); | |
132 getopts('a:dl:L:o:p:rsv', \%OPTIONS); | |
133 | |
134 die qq(version $version | |
135 Usage: alignCustomAmplicon.pl [OPTIONS] <ref> <fastq[.gz] 1> <fastq[.gz] 2> <primers> | |
136 | |
137 OPTIONS: -a STR print the alignment for any amplicons covering a particular region [chr:loc || chr:start-stop] | |
138 -d print debug info | |
139 -l STR filename of the log file [null] | |
140 -L STR append to an existing log file [null] | |
141 -o STR filename for bam file | |
142 -p INT number of threads [$OPTIONS{p}] | |
143 -r do not create stats report | |
144 -s sort the bam file | |
145 -v verbose progress | |
146 ) if(@ARGV != 4); | |
147 | |
148 my $Script = 'alignCustomAmplicon'; | |
149 my $ref = shift @ARGV; | |
150 my $fastq1 = shift @ARGV; | |
151 my $fastq2 = shift @ARGV; | |
152 my $primers = shift @ARGV; | |
153 my $now; | |
154 | |
155 #create log file | |
156 if(defined $OPTIONS{l}) { | |
157 my $logfile = $OPTIONS{l}; | |
158 open (FH, ">$logfile") or die 'Couldn\'t create log file!\n'; | |
159 close (FH); | |
160 # Open the log file and redirect output to it | |
161 open (STDERR, ">>$logfile"); | |
162 open (STDOUT, ">>$logfile"); | |
163 my $now = localtime time; | |
164 print "Log File Created $now\n\n"; | |
165 } elsif(defined $OPTIONS{L}) { | |
166 #append to a log file | |
167 my $logfile = $OPTIONS{L}; | |
168 # Open the log file and redirect output to it | |
169 open (STDERR, ">>$logfile") or die 'Couldn\'t create log file!\n'; | |
170 open (STDOUT, ">>$logfile") or die 'Couldn\'t create log file!\n'; | |
171 my $now = localtime time; | |
172 print "Appending To Log File $now\n\n"; | |
173 } | |
174 | |
175 # print version of this script | |
176 print "Using $Script version $version \n\n"; | |
177 | |
178 if(defined $OPTIONS{d}) { | |
179 print "Setting threads to 1 as using debug\n"; | |
180 $OPTIONS{p} = 1; | |
181 } | |
182 | |
183 # grab sample name, set output file name and open some files | |
184 my ($name, $path, $suffix) = fileparse($fastq1, ('.fastq', '.fastq.gz')); | |
185 my $sampleName :shared = $name; | |
186 $sampleName =~ s/^(.*?_.*?)_.*/$1/; | |
187 open(PRIM, $primers) or die "can't open $primers: $!\n"; | |
188 | |
189 # set samfile name | |
190 my $bamfile_out = "${sampleName}_aligned.bam"; | |
191 if(defined $OPTIONS{o}) { | |
192 $bamfile_out = $OPTIONS{o}; | |
193 } | |
194 my $bam_to_sort = $bamfile_out; | |
195 if(defined $OPTIONS{s}) { | |
196 $bam_to_sort .= '.tmp'; | |
197 } | |
198 | |
199 my $stats = $bamfile_out; | |
200 if(!defined $OPTIONS{r}) { | |
201 $stats =~ s/\..*?$//; | |
202 open(STATS, ">${stats}_stats.csv"); | |
203 } | |
204 | |
205 # read in primer lengths | |
206 my %refFrags; | |
207 $refFrags{min} = 5000000; | |
208 while(my $line = <PRIM>) { | |
209 chomp $line; | |
210 my @line_sp = split("\t", $line); | |
211 $refFrags{primers}{$line_sp[0]}{U}{F}{length} = $line_sp[1]; | |
212 $refFrags{primers}{$line_sp[0]}{D}{F}{length} = $line_sp[2]; | |
213 $refFrags{primers}{$line_sp[0]}{U}{R}{length} = $line_sp[2]; | |
214 $refFrags{primers}{$line_sp[0]}{D}{R}{length} = $line_sp[1]; | |
215 $refFrags{name}{$line_sp[0]} = $line_sp[3]; | |
216 $refFrags{min} = min([$refFrags{min}, $line_sp[1], $line_sp[2]]); | |
217 } | |
218 close(PRIM); | |
219 my @OrderedContigs :shared = sort sortChrom keys %{ $refFrags{primers} }; | |
220 | |
221 my %regions; | |
222 if(defined $OPTIONS{a}) { | |
223 my ($chr, $start, $end) = split_region($OPTIONS{a}); | |
224 foreach my $reg (@OrderedContigs) { | |
225 my ($chr_reg, $start_reg, $end_reg) = split_region($reg); | |
226 $regions{$reg} = 1 if $chr eq $chr_reg && $start_reg <= $end && $end_reg >= $start; | |
227 } | |
228 } | |
229 | |
230 # grab fasta file | |
231 my %refSeq; | |
232 read_fasta_file($ref, \%refSeq); | |
233 | |
234 # calculate read length and read group | |
235 my $readGroup :shared; | |
236 my $readLength; | |
237 my $mm = new File::MMagic; | |
238 my $filetype = $mm->checktype_filename(abs_path($fastq1)); | |
239 if($filetype eq "application/x-gzip") { | |
240 my $sampleRead = `gunzip -c $fastq1 | head -n 2 | tail -n 1`; | |
241 chomp $sampleRead; | |
242 $readLength = length($sampleRead); | |
243 $readGroup = `gunzip -c $fastq1 | head -n 1`; | |
244 chomp $readGroup; | |
245 $readGroup =~ s/.*?:.*?:(.*?):.*/${1}_L001/; | |
246 } elsif($filetype eq "text/plain") { | |
247 my $sampleRead = `head -n 2 $fastq1 | tail -n 1`; | |
248 chomp $sampleRead; | |
249 $readLength = length($sampleRead); | |
250 $readGroup = `head $fastq1 -n 1`; | |
251 chomp $readGroup; | |
252 $readGroup =~ s/.*?:.*?:(.*?):.*/${1}_L001/; | |
253 } else { | |
254 die("$fastq1 is filetype $filetype and not text/plain or application/x-gzip\n"); | |
255 } | |
256 | |
257 # create hash to keep track of maxdiff and readlengths | |
258 my %maxdiff :shared; | |
259 | |
260 # create hash to keep track of stats | |
261 my %stats :shared; | |
262 $stats{Amb} = 0; | |
263 $stats{PoorQualMates} = 0; | |
264 $stats{PoorMapQual} = 0; | |
265 $stats{PoorQual1} = 0; | |
266 $stats{PoorQual2} = 0; | |
267 | |
268 # setup other shared variables and caches | |
269 my %cache :shared; | |
270 my $cacheHits :shared = 0; | |
271 my %cacheAlign :shared; | |
272 $cacheAlign{Conc} = &share( {} ); | |
273 my $cacheAlignHits :shared = 0; | |
274 my %refLength :shared; | |
275 | |
276 # extract primers and fragments on forward and reverse strand and initialize counts | |
277 foreach my $key (@OrderedContigs) { | |
278 push @{ $refFrags{seqU} }, substr($refSeq{$key}, 0, $refFrags{primers}{$key}{U}{F}{length}); | |
279 push @{ $refFrags{seqD} }, substr($refSeq{$key}, -$refFrags{primers}{$key}{D}{F}{length}); | |
280 push @{ $refFrags{seqU} }, revcompl(substr($refSeq{$key}, -$refFrags{primers}{$key}{U}{R}{length})); | |
281 push @{ $refFrags{seqD} }, revcompl(substr($refSeq{$key}, 0, $refFrags{primers}{$key}{D}{R}{length})); | |
282 push @{ $refFrags{lenU} }, ($refFrags{primers}{$key}{U}{F}{length}, $refFrags{primers}{$key}{U}{R}{length}); | |
283 push @{ $refFrags{lenD} }, ($refFrags{primers}{$key}{D}{F}{length}, $refFrags{primers}{$key}{D}{R}{length}); | |
284 push @{ $refFrags{reg} }, ($key, $key); | |
285 push @{ $refFrags{strand} }, ('F', 'R'); | |
286 | |
287 # calculate new contig name with primers removed | |
288 my ($chr, $start, $stop) = split(":|-", $key); | |
289 $start += $refFrags{primers}{$key}{U}{F}{length}; | |
290 $stop -= $refFrags{primers}{$key}{D}{F}{length}; | |
291 $refLength{$key} = &share( {} ); | |
292 $refLength{$key}{newReg} = "$chr:$start-$stop"; | |
293 $refLength{$key}{length} = $stop - $start + 1; | |
294 | |
295 $cacheAlign{$key} = &share( {} ); | |
296 $cacheAlign{$key}{R1} = &share( {} ); | |
297 $cacheAlign{$key}{R2} = &share( {} ); | |
298 | |
299 $stats{$key} = &share( {} ); | |
300 $stats{$key}{Reads} = 0; | |
301 $stats{$key}{Bases} = 0; | |
302 $stats{$key}{Quality} = 0; | |
303 $stats{$key}{Mapping} = 0; | |
304 $stats{$key}{Read1} = 0; | |
305 $stats{$key}{Read2} = 0; | |
306 $stats{$key}{ConcMapping} = 0; | |
307 $stats{$key}{ConcReads} = 0; | |
308 $stats{$key}{NumberForward} = 0; | |
309 $stats{$key}{NumberReverse} = 0; | |
310 $stats{$key}{ConcErrorsBases} = 0; | |
311 $stats{$key}{ConcCorrectBases} = 0; | |
312 $stats{$key}{MinLength} = 100000; | |
313 $stats{$key}{MaxLength} = 0; | |
314 $stats{$key}{TotalA} = 0; | |
315 $stats{$key}{TotalC} = 0; | |
316 $stats{$key}{TotalG} = 0; | |
317 $stats{$key}{TotalT} = 0; | |
318 $stats{$key}{TotalN} = 0; | |
319 $stats{$key}{PoorMapQual} = 0; | |
320 } | |
321 | |
322 # start up worker threads | |
323 my $FindContig = Thread::Queue->new(); | |
324 my $ToAlign = Thread::Queue->new(); | |
325 my $Aligned = Thread::Queue->new(); | |
326 my $Samfile = Thread::Queue->new(); | |
327 my $WriteBam = Thread::Queue->new(); | |
328 my @threadArray; | |
329 for(my $index = 1; $index <= $OPTIONS{p}; $index++) { | |
330 push @threadArray, threads->create(\&thread_find_contig, $FindContig, \%refFrags, \%refSeq, $ToAlign, $Aligned, $WriteBam); | |
331 push @threadArray, threads->create(\&thread_process_align, $Aligned, \%refFrags, \%refSeq, $Samfile, $WriteBam) if $index <= max([1, $OPTIONS{p}/2]); | |
332 push @threadArray, threads->create(\&samfile_line, $Samfile, $WriteBam, \%refSeq) if $index <= max([1, $OPTIONS{p}/2]); | |
333 } | |
334 push @threadArray, threads->create(\&nwAlign, $ToAlign, $Aligned, \%regions); | |
335 push @threadArray, threads->create(\&writebamfile, $bam_to_sort, $WriteBam); | |
336 | |
337 # read in each sequence and its mate from the fastq file | |
338 my $read1; | |
339 my $read2; | |
340 my $seq1; | |
341 my $seq2; | |
342 my $qual1; | |
343 my $qual2; | |
344 my $desc; | |
345 my @strand; | |
346 my $cont = 1; | |
347 | |
348 # open fastq files | |
349 my $gz1; | |
350 my $gz2; | |
351 my $f1gz = 0; | |
352 my $f2gz = 0; | |
353 if($filetype eq "application/x-gzip") { | |
354 open(FQ1, "gunzip -dc $fastq1 |") or die "Can't fork: $!\n"; | |
355 } else { | |
356 open(FQ1, $fastq1) or die "$fastq1: $!"; | |
357 } | |
358 | |
359 $filetype = $mm->checktype_filename(abs_path($fastq2));; | |
360 if($filetype eq "application/x-gzip") { | |
361 open(FQ2, "gunzip -dc $fastq2 |") or die "Can't fork: $!\n"; | |
362 } elsif($filetype eq "text/plain") { | |
363 open(FQ2, $fastq2) or die "$fastq2: $!"; | |
364 } else { | |
365 die("$fastq2 is filetype $filetype and not text/plain or application/x-gzip\n"); | |
366 } | |
367 | |
368 # add reads to the queue | |
369 my $readNo = 0; | |
370 while ($cont) { | |
371 if($read1 = <FQ1>) { # read 1 and 2 read name | |
372 $seq1 = <FQ1>; # read 1 sequence | |
373 $desc = <FQ1>; # read 1 desc | |
374 $qual1 = <FQ1>; # read 1 quality | |
375 } else { | |
376 last; | |
377 } | |
378 | |
379 if($read2 = <FQ2>) { # read 2 read name | |
380 $seq2 = <FQ2>; # read 2 sequence | |
381 $desc = <FQ2>; # read 2 sdesc | |
382 $qual2 = <FQ2>; # read 2 quality | |
383 } else { | |
384 last; | |
385 } | |
386 | |
387 # read 1 the forward read | |
388 chomp $read1; | |
389 chomp $seq1; | |
390 chomp $qual1; | |
391 $read1 =~ s/^\@(.*)\s?.*/$1/; | |
392 | |
393 # read 2 the reverse read | |
394 chomp $read2; | |
395 chomp $seq2; | |
396 $seq2 = revcompl($seq2); | |
397 chomp $qual2; | |
398 $qual2 = reverse($qual2); | |
399 | |
400 $read2 =~ s/^\@(.*)\s?.*/$1/; | |
401 $readNo++; | |
402 | |
403 # keep at most 20 reads in buffer | |
404 while(1) { | |
405 if(defined $OPTIONS{v}) { | |
406 my $percAmb = $stats{Amb} / $readNo * 100; | |
407 my $percPoor = $stats{PoorQualMates} / $readNo * 100; | |
408 my $percPoorMap = $stats{PoorMapQual} / $readNo * 100; | |
409 my $percPoor1 = $stats{PoorQual1} / $readNo * 100; | |
410 my $percPoor2 = $stats{PoorQual2} / $readNo * 100; | |
411 $percAmb = sprintf("%.2f", $percAmb); | |
412 $percPoor = sprintf("%.2f", $percPoor); | |
413 $percPoorMap = sprintf("%.2f", $percPoorMap); | |
414 $percPoor1 = sprintf("%.2f", $percPoor1); | |
415 $percPoor2 = sprintf("%.2f", $percPoor2); | |
416 print STDERR "\r$readNo processed - Amb $percAmb - PoorQual $percPoor - PoorQual1 $percPoor1 - PoorQual2 $percPoor2 - PoorMappingQual $percPoorMap - FindContig ".$FindContig->pending()." - ToAlign ".$ToAlign->pending()." - Aligned ".$Aligned->pending()." - Sam ".$Samfile->pending()." - Writebam ".$WriteBam->pending()." "; | |
417 } | |
418 last if $FindContig->pending() <= 100; | |
419 } | |
420 my @readArray = ($read1, $seq1, $qual1, $read2, $seq2, $qual2); | |
421 { | |
422 lock($FindContig); # Keep other threads from changing the queue's contents | |
423 $FindContig->enqueue(\@readArray); | |
424 } | |
425 } | |
426 close(FQ1); | |
427 close(FQ2); | |
428 | |
429 # wait for all records to be processed and close down threads | |
430 $FindContig->enqueue( (undef) x $OPTIONS{p} ); | |
431 while($FindContig->pending() > 0 ) { | |
432 next; | |
433 } | |
434 $ToAlign->enqueue( (undef) ); | |
435 while($ToAlign->pending() > 0 ) { | |
436 next; | |
437 } | |
438 $Aligned->enqueue( (undef) x floor(max([1, $OPTIONS{p}/2])) ); | |
439 while($Aligned->pending() > 0 ) { | |
440 next; | |
441 } | |
442 $Samfile->enqueue( (undef) x floor(max([1, $OPTIONS{p}/2])) ); | |
443 while($Samfile->pending() > 0 ) { | |
444 next; | |
445 } | |
446 $WriteBam->enqueue( undef ); | |
447 | |
448 $_->join() for @threadArray; | |
449 | |
450 # sort the bam file | |
451 if(defined $OPTIONS{s}) { | |
452 system("MergeSamFiles.sh MAX_RECORDS_IN_RAM=2000000 I=$bam_to_sort O=$bamfile_out VALIDATION_STRINGENCY=LENIENT SO=coordinate USE_THREADING=true CREATE_INDEX=true"); | |
453 unlink $bam_to_sort; | |
454 } | |
455 | |
456 if(!defined $OPTIONS{r}) { | |
457 # print out statistics | |
458 # setup variables | |
459 my $totalRegionLengths = 0; | |
460 my $totalReads = 0; | |
461 my $totalFwd = 0; | |
462 my $totalRev = 0; | |
463 my $totalMatch = 0; | |
464 my $totalMiss = 0; | |
465 my $totalBases = 0; | |
466 my $totalRead1 = 0; | |
467 my $totalRead2 = 0; | |
468 my $totalA = 0; | |
469 my $totalC = 0; | |
470 my $totalG = 0; | |
471 my $totalT = 0; | |
472 my $totalN = 0; | |
473 my $totalConc = 0; | |
474 my $totalQuality = 0; | |
475 my $AllmaxRead = 0; | |
476 my $AllminRead = 100000; | |
477 my $avequal = 'NA'; | |
478 my $aveRead = 'NA'; | |
479 my $perA = 'NA'; | |
480 my $perC = 'NA'; | |
481 my $perG = 'NA'; | |
482 my $perT = 'NA'; | |
483 my $perN = 'NA'; | |
484 my $readsUsed = 0; | |
485 my @coverage; | |
486 my @off_coverage; | |
487 | |
488 # print column headers | |
489 print STATS 'AmpliconName,Region,RegionLengthMinusPrimers,TotalReadsPoorMapping,TotalReadsUsed,Reads1Kept,Reads2Kept,ReadsOut,ReadsFwd,ReadsRev,ReadsOverlap,'; | |
490 print STATS "Bases,AveQuality,MatchBases,ErrorBases,ErrorRate,MinReadLength,AveReadLength,MaxReadLength,PercA,PercC,PercG,PercT,PercN\n"; | |
491 | |
492 # for each region | |
493 foreach my $key (@OrderedContigs){ | |
494 # if we have some reads calculate averages | |
495 if($stats{$key}{Reads} > 0) { | |
496 $avequal = $stats{$key}{Quality} / $stats{$key}{Bases}; | |
497 $avequal = sprintf("%.4f", $avequal); | |
498 $aveRead = $stats{$key}{Bases} / $stats{$key}{Reads}; | |
499 $aveRead = sprintf("%.4f", $aveRead); | |
500 $perA = $stats{$key}{TotalA} / $stats{$key}{Bases} * 100; | |
501 $perA = sprintf("%.2f", $perA); | |
502 $perC = $stats{$key}{TotalC} / $stats{$key}{Bases} * 100; | |
503 $perC = sprintf("%.2f", $perC); | |
504 $perG = $stats{$key}{TotalG} / $stats{$key}{Bases} * 100; | |
505 $perG = sprintf("%.2f", $perG); | |
506 $perT = $stats{$key}{TotalT} / $stats{$key}{Bases} * 100; | |
507 $perT = sprintf("%.2f", $perT); | |
508 $perN = $stats{$key}{TotalN} / $stats{$key}{Bases} * 100; | |
509 $perN = sprintf("%.2f", $perN); | |
510 $AllminRead = min([$AllminRead, $stats{$key}{MinLength}]); | |
511 $AllmaxRead = max([$AllmaxRead, $stats{$key}{MaxLength}]); | |
512 $readsUsed = $stats{$key}{Reads} + $stats{$key}{ConcReads}; | |
513 } else { | |
514 $stats{$key}{MaxLength} = 'NA'; | |
515 $stats{$key}{MinLength} = 'NA'; | |
516 } | |
517 | |
518 # if we have reads which overlapped calculate an error rate | |
519 my $errRate = 'NA'; | |
520 if($stats{$key}{ConcReads} > 0) { | |
521 $errRate = $stats{$key}{ConcErrorsBases} / ($stats{$key}{ConcErrorsBases} + $stats{$key}{ConcCorrectBases}); | |
522 } | |
523 | |
524 # print the stats for this region | |
525 print STATS "$refFrags{name}{$key}," if defined $refFrags{name}{$key}; | |
526 print STATS "Unknown," unless defined $refFrags{name}{$key}; | |
527 print STATS "$refLength{$key}{newReg},$refLength{$key}{length},$stats{$key}{PoorMapQual},$readsUsed,$stats{$key}{Read1},$stats{$key}{Read2},$stats{$key}{Reads},$stats{$key}{NumberForward},"; | |
528 print STATS "$stats{$key}{NumberReverse},$stats{$key}{ConcReads},$stats{$key}{Bases},$avequal,$stats{$key}{ConcCorrectBases},"; | |
529 print STATS "$stats{$key}{ConcErrorsBases},$errRate,$stats{$key}{MinLength},$aveRead,$stats{$key}{MaxLength},$perA,$perC,$perG,$perT,$perN\n"; | |
530 | |
531 # add stats to grand totals | |
532 $totalRegionLengths += $refLength{$key}{length}; | |
533 $totalQuality += $stats{$key}{Quality}; | |
534 $totalReads += $stats{$key}{Reads}; | |
535 $totalFwd += $stats{$key}{NumberForward}; | |
536 $totalRev += $stats{$key}{NumberReverse}; | |
537 $totalBases += $stats{$key}{Bases}; | |
538 $totalRead1 += $stats{$key}{Read1}; | |
539 $totalRead2 += $stats{$key}{Read2}; | |
540 $totalA += $stats{$key}{TotalA}; | |
541 $totalC += $stats{$key}{TotalC}; | |
542 $totalT += $stats{$key}{TotalT}; | |
543 $totalG += $stats{$key}{TotalG}; | |
544 $totalN += $stats{$key}{TotalN}; | |
545 $totalMatch += $stats{$key}{ConcCorrectBases}; | |
546 $totalMiss += $stats{$key}{ConcErrorsBases}; | |
547 $totalConc += $stats{$key}{ConcReads}; | |
548 | |
549 # reset ave scores | |
550 $avequal = 'NA'; | |
551 $aveRead = 'NA'; | |
552 $perA = 'NA'; | |
553 $perC = 'NA'; | |
554 $perG = 'NA'; | |
555 $perT = 'NA'; | |
556 $perN = 'NA'; | |
557 $readsUsed = 0; | |
558 | |
559 if(defined $refFrags{name}{$key} && grep /^Off_target/, $refFrags{name}{$key}) { | |
560 push @off_coverage, $stats{$key}{Reads}; | |
561 } else { | |
562 push @coverage, $stats{$key}{Reads}; | |
563 } | |
564 } | |
565 | |
566 # if we have any reads calculate the averages | |
567 if($totalReads > 0) { | |
568 $avequal = $totalQuality / $totalBases; | |
569 $avequal = sprintf("%.4f", $avequal); | |
570 $aveRead = $totalBases / $totalReads; | |
571 $aveRead = sprintf("%.4f", $aveRead); | |
572 $perA = $totalA / $totalBases * 100; | |
573 $perA = sprintf("%.2f", $perA); | |
574 $perC = $totalC / $totalBases * 100; | |
575 $perC = sprintf("%.2f", $perC); | |
576 $perG = $totalG / $totalBases * 100; | |
577 $perG = sprintf("%.2f", $perG); | |
578 $perT = $totalT / $totalBases * 100; | |
579 $perT = sprintf("%.2f", $perT); | |
580 $perN = $totalN / $totalBases * 100; | |
581 $perN = sprintf("%.2f", $perN); | |
582 $readsUsed = $totalReads + $totalConc; | |
583 } else { | |
584 $AllmaxRead = 'NA'; | |
585 $AllminRead = 'NA'; | |
586 } | |
587 | |
588 # if we had any reads which overlapped calculate error rate | |
589 my $errRate = 'NA'; | |
590 if($totalConc > 0) { | |
591 $errRate = $totalMiss / ($totalMiss + $totalMatch); | |
592 } | |
593 | |
594 # print out totals | |
595 my $pcAmb = sprintf("%.2f", $stats{Amb} / $readNo * 100); | |
596 my $pcPQ = sprintf("%.2f", $stats{PoorQualMates} / $readNo * 100); | |
597 my $TotalReads = $readNo * 2; | |
598 my $Unmapped = $TotalReads - $readsUsed; | |
599 my $pcUnmapped = sprintf("%.2f", $Unmapped/$TotalReads * 100); | |
600 my $avCov = sprintf("%.0f", sum(@coverage)/@coverage); | |
601 my $avOff = sprintf("%.2f", sum(@off_coverage)/@off_coverage) if @off_coverage; | |
602 my $greater = scalar (grep { $_ > 0.2*$avCov } @coverage); | |
603 my $pcgreater = sprintf("%.2f", $greater/@coverage * 100); | |
604 | |
605 print STATS ",Total,$totalRegionLengths,$stats{PoorMapQual},$readsUsed,$totalRead1,$totalRead2,$totalReads,$totalFwd,$totalRev,$totalConc,$totalBases,$avequal,$totalMatch,"; | |
606 print STATS "$totalMiss,$errRate,$AllminRead,$aveRead,$AllmaxRead,$perA,$perC,$perG,$perT,$perN\n"; | |
607 print STATS "\nNo mate pairs:,$readNo\n"; | |
608 print STATS "No ambiguous mates:,$stats{Amb},$pcAmb\%\n"; | |
609 print STATS "No poor quality mates:,$stats{PoorQualMates},$pcPQ\%\n\n"; | |
610 print STATS "Total Number Of Reads:,$TotalReads\n"; | |
611 print STATS "Unmapped Reads:,$Unmapped,$pcUnmapped\%\n\n"; | |
612 print STATS "Min Mean Max ReadsOut:,".min([@coverage]).",$avCov,".max([@coverage])."\n"; | |
613 print STATS "No. Amplicons > 0.2 x ReadsOut:,$greater,$pcgreater\%\n"; | |
614 print STATS "Average ReadsOut Off Target Regions:,$avOff\n" if @off_coverage; | |
615 print STATS "\nAmplicons\nNo. Reads < 200, 200 <= No. Reads < 1000, No. Reads >= 1000\n"; | |
616 $greater = scalar (grep { $_ < 200 } @coverage); | |
617 print STATS "$greater,"; | |
618 $greater = scalar (grep { $_ >= 200 && $_ < 1000 } @coverage); | |
619 print STATS "$greater,"; | |
620 $greater = scalar (grep { $_ >= 1000 } @coverage); | |
621 print STATS "$greater\n"; | |
622 | |
623 print "Cache hits align contig: $cacheHits\n"; | |
624 print "Alignment cache hits: $cacheAlignHits\n"; | |
625 close(STATS); | |
626 } | |
627 | |
628 # close the log file | |
629 if(defined $OPTIONS{l} || defined $OPTIONS{L}) { | |
630 my $now = localtime time; | |
631 print "\nLog File Closed $now\n"; | |
632 close(STDERR); | |
633 close(STDOUT); | |
634 } | |
635 | |
636 # thread to find matching amplicon | |
637 sub thread_find_contig { | |
638 my $FindContig = shift; | |
639 my $refFrags = shift; | |
640 my $refSeq = shift; | |
641 my $ToAlign = shift; | |
642 my $Aligned = shift; | |
643 my $WriteBam = shift; | |
644 my $readArrayRef; | |
645 { | |
646 lock($FindContig); # Keep other threads from changing the queue's contents | |
647 $readArrayRef = $FindContig->dequeue(); | |
648 } | |
649 | |
650 while(defined $readArrayRef) { | |
651 my ($read1, $seq1, $qual1, $read2, $seq2, $qual2) = @{ $readArrayRef }; | |
652 my $contig = ''; | |
653 my $readRef = ''; | |
654 my $strand = ''; | |
655 | |
656 # trim the reads | |
657 my ($trimmed1, $qtrim1) = trim($seq1, $qual1, 30, 1); | |
658 my ($trimmed2, $qtrim2) = trim($seq2, $qual2, 30, 2); | |
659 | |
660 # if we have already mapped the reads use the results in the cache | |
661 if(defined $cacheAlign{Conc}{"$trimmed1.$trimmed2"}) { | |
662 my @array; | |
663 { lock(%cacheAlign); @array = @{ $cacheAlign{Conc}{"$trimmed1.$trimmed2"} } }; | |
664 $array[0] = $read1; | |
665 $array[3] = $qtrim1; | |
666 $array[6] = $qtrim2; | |
667 { lock($cacheAlignHits); $cacheAlignHits++ }; | |
668 | |
669 # add aligned reads to queue | |
670 { | |
671 lock($Aligned); # Keep other threads from changing the queue's contents | |
672 $Aligned->enqueue(\@array); | |
673 } | |
674 } else { | |
675 # Use approx match to find most similar amplicon primer to reads | |
676 my @index1; | |
677 my @index2; | |
678 my $index_read1 = -2; | |
679 my $index_read2 = -2; | |
680 my $index_found = 0; | |
681 | |
682 if(length($trimmed1) > length($seq1)/2 || length($trimmed2) > length($seq2)/2) { | |
683 # we have at least 1 good read | |
684 # find closest matching amplicon for read 1 | |
685 my $string = substr($seq1, 0, $$refFrags{min}); | |
686 if(defined $cache{"$string.1"}) { | |
687 $cacheHits++; | |
688 { lock(%cache); @index1 = @{ $cache{"$string.1"} } }; | |
689 } else { | |
690 @index1 = amatch($string, \@{ $$refFrags{seqU} }); | |
691 { | |
692 lock(%cache); | |
693 $cache{"$string.1"} = &share( [] ); | |
694 @{ $cache{"$string.1"} } = @index1; | |
695 } | |
696 } | |
697 | |
698 # find closest matching amplicon for read 2 | |
699 $string = substr($seq2, -$$refFrags{min}); | |
700 if(defined $cache{"$string.2"}) { | |
701 $cacheHits++; | |
702 { lock(%cache); @index2 = @{ $cache{"$string.2"} } }; | |
703 } else { | |
704 @index2 = amatch($string, \@{ $$refFrags{seqD} }); | |
705 { | |
706 lock(%cache); | |
707 $cache{"$string.2"} = &share( [] ); | |
708 @{ $cache{"$string.2"} } = @index2; | |
709 } | |
710 } | |
711 | |
712 if($#index1 > 0 || $#index2 > 0) { | |
713 # if one read maps to multiple amplicons and the other read matches to just one of them use that | |
714 my %in_array2 = map { $_ => 1 } @index2; | |
715 my @common = grep { $in_array2{$_} } @index1; | |
716 if($#common == 0) { | |
717 @index1 = @common; | |
718 @index2 = @common; | |
719 } elsif($#common > 0) { | |
720 if($#index1 > 0) { | |
721 # matched to multiple primers so try and use whole read | |
722 my @refs; | |
723 foreach(@index1) { | |
724 if($$refFrags{strand}[$_] eq "F") { | |
725 push @refs, $$refSeq{${ $$refFrags{reg} }[$_]}; | |
726 } else { | |
727 push @refs, revcompl($$refSeq{${ $$refFrags{reg} }[$_]}); | |
728 } | |
729 } | |
730 my @index1b = amatch($seq1, \@refs); | |
731 if($#index1b >= 0) { | |
732 @index1 = @index1[@index1b]; | |
733 } | |
734 } | |
735 if($#index2 > 0) { | |
736 # matched to multiple primers so try and use whole read | |
737 my @refs; | |
738 foreach(@index2) { | |
739 if($$refFrags{strand}[$_] eq "F") { | |
740 push @refs, $$refSeq{${ $$refFrags{reg} }[$_]}; | |
741 } else { | |
742 push @refs, revcompl($$refSeq{${ $$refFrags{reg} }[$_]}); | |
743 } | |
744 } | |
745 my @index2b = amatch($seq2, \@refs); | |
746 if($#index2b >= 0) { | |
747 @index2 = @index2[@index2b]; | |
748 } | |
749 } | |
750 | |
751 # if one read still maps to multiple amplicons and the other read matches to just one of them use that | |
752 %in_array2 = map { $_ => 1 } @index2; | |
753 @common = grep { $in_array2{$_} } @index1; | |
754 if($#common == 0) { | |
755 @index1 = @common; | |
756 @index2 = @common; | |
757 } | |
758 } | |
759 } | |
760 | |
761 # if read 1 and read 2 match to the same amplicon assign it | |
762 if($#index1 == 0 && @index1 ~~ @index2 ) { | |
763 $index_read1 = $index1[0]; | |
764 $index_read2 = $index2[0]; | |
765 } else { | |
766 $index_read1 = -1; | |
767 $index_read2 = -1; | |
768 } | |
769 | |
770 # if any read after trimming is less than half the original length discard | |
771 if(length($trimmed1) < length($seq1)/2) { | |
772 $index_read1 = -2; | |
773 } | |
774 | |
775 if(length($trimmed2) < length($seq2)/2) { | |
776 $index_read2 = -2; | |
777 } | |
778 } | |
779 | |
780 if($index_read1 > -1 || $index_read2 > -1) { | |
781 my $conc = 0; | |
782 my %align; | |
783 | |
784 # at least 1 read matched to an amplicon | |
785 if($index_read1 > -1) { | |
786 $contig = $$refFrags{reg}[$index_read1]; | |
787 $readRef = $$refSeq{$contig}; | |
788 $strand = $$refFrags{strand}[$index_read1]; | |
789 } elsif($index_read2 > -1) { | |
790 $contig = $$refFrags{reg}[$index_read2]; | |
791 $readRef = $$refSeq{$contig}; | |
792 $strand = $$refFrags{strand}[$index_read2]; | |
793 } | |
794 | |
795 if($index_read1 == $index_read2 && length($readRef) < length($trimmed1) + length($trimmed2)) { | |
796 # reads must overlap so can use concensus | |
797 $conc = 1; | |
798 } else { | |
799 my $substr1; | |
800 my $substr2; | |
801 for(my $length = 5; $length <= min([length($trimmed1), length($trimmed2)]); $length++) { | |
802 $substr1 = substr($trimmed1, -$length); | |
803 $substr2 = substr($trimmed2, 0, $length); | |
804 if($substr1 eq $substr2) { | |
805 $conc = 1; | |
806 last; | |
807 } else { | |
808 # calculate max diff allowed | |
809 if(!defined $maxdiff{$length}) { | |
810 { lock(%maxdiff); $maxdiff{$length} = maxdiff($length, 0.02, 0.04); } | |
811 } | |
812 | |
813 my $diff = levenshtein($substr1, $substr2 ); | |
814 if($diff < $maxdiff{$length}) { | |
815 $conc = 1; | |
816 last; | |
817 } | |
818 } | |
819 } | |
820 } | |
821 | |
822 if($conc) { | |
823 # remove any trailing Ns | |
824 $trimmed1 = $seq1; | |
825 $trimmed2 = $seq2; | |
826 $trimmed1 =~ s/N*$//; | |
827 $trimmed2 =~ s/^N*//; | |
828 $qtrim1 = $qual1; | |
829 $qtrim2 = $qual2; | |
830 $qtrim1 = substr($qtrim1, 0, length($trimmed1)); | |
831 $qtrim2 = substr($qtrim2 , -length($trimmed2)); | |
832 } | |
833 | |
834 if($strand eq "R") { | |
835 my $trimmed1_tmp = revcompl($trimmed1); | |
836 my $qtrim1_tmp = reverse($qtrim1); | |
837 $trimmed1 = revcompl($trimmed2); | |
838 $qtrim1 = reverse($qtrim2); | |
839 $trimmed2 = $trimmed1_tmp; | |
840 $qtrim2 = $qtrim1_tmp; | |
841 my $index_read_tmp = $index_read1; | |
842 $index_read1 = $index_read2; | |
843 $index_read2 = $index_read_tmp; | |
844 } | |
845 | |
846 if($conc) { | |
847 my $align; | |
848 my $search1 = $trimmed1; | |
849 $search1 =~ s/N/./g; | |
850 my $search2 = $trimmed2; | |
851 $search2 =~ s/N/./g; | |
852 if($$refSeq{$contig} =~ /$search1/ && $$refSeq{$contig} =~ /$search2/) { | |
853 # the read matches exactly to the reference | |
854 $align{Ref1} = $$refSeq{$contig}; | |
855 $align{Read1} = $$refSeq{$contig}; | |
856 $align{Read1} =~ s/^(.*?)$search1(.*)$/${1}X$2/; | |
857 $align{Read1} =~ s/[^X]/-/g; | |
858 $align{Read1} = modify($align{Read1}, sub { $_[0] =~ s/X+/$trimmed1/ }); | |
859 $align{Read2} = $$refSeq{$contig}; | |
860 $align{Read2} =~ s/^(.*)$search2(.*?)$/${1}XX$2/; | |
861 $align{Read2} =~ s/[^X]/-/g; | |
862 $align{Read2} = modify($align{Read2}, sub { $_[0] =~ s/X+/$trimmed2/ }); | |
863 | |
864 # add aligned reads to queue | |
865 my @array; | |
866 { | |
867 lock($Aligned); # Keep other threads from changing the queue's contents | |
868 @array = ($read1, $align{Ref1}, $align{Read1}, $qtrim1, $align{Ref1}, $align{Read2}, $qtrim2, $contig, $strand, $conc); | |
869 $Aligned->enqueue(\@array); | |
870 } | |
871 | |
872 # add alignment to cache | |
873 { | |
874 lock(%cacheAlign); | |
875 $cacheAlign{Conc}{$name} = &share( [] ); | |
876 @{ $cacheAlign{Conc}{$name} } = @array; | |
877 } | |
878 } elsif(defined $cacheAlign{Conc}{"$trimmed1.$trimmed2"}) { | |
879 my @array; | |
880 { lock(%cacheAlign); @array = @{ $cacheAlign{Conc}{"$trimmed1.$trimmed2"} } }; | |
881 $array[0] = $read1; | |
882 $array[3] = $qtrim1; | |
883 $array[6] = $qtrim2; | |
884 { lock($cacheAlignHits); $cacheAlignHits++ }; | |
885 | |
886 # add aligned reads to queue | |
887 { | |
888 lock($Aligned); # Keep other threads from changing the queue's contents | |
889 $Aligned->enqueue(\@array); | |
890 } | |
891 } else { | |
892 | |
893 # align reads using NW2 | |
894 { | |
895 lock($ToAlign); # Keep other threads from changing the queue's contents | |
896 my @array = ($$refSeq{$contig}, $trimmed1, $trimmed2, $read1, $qtrim1, $qtrim2, $contig, $strand, $conc); | |
897 #my @array = ($fastaIn, $read1, $qtrim1, $qtrim2, $contig, $strand, $conc, "$trimmed1.$trimmed2"); | |
898 $ToAlign->enqueue(\@array); | |
899 } | |
900 } | |
901 } | |
902 | |
903 if(!$conc && $index_read1 > -1) { | |
904 my $search1 = $trimmed1; | |
905 $search1 =~ s/N/./g; | |
906 if($$refSeq{$contig} =~ /$search1/) { | |
907 # the read matches exactly to the reference | |
908 $align{Ref1} = $$refSeq{$contig}; | |
909 $align{Read1} = $$refSeq{$contig}; | |
910 $align{Read1} =~ s/^(.*?)$search1(.*)$/${1}X$2/; | |
911 $align{Read1} =~ s/[^X]/-/g; | |
912 $align{Read1} = modify($align{Read1}, sub { $_[0] =~ s/X/$trimmed1/ }); | |
913 | |
914 # add aligned reads to queue | |
915 { | |
916 lock($Aligned); # Keep other threads from changing the queue's contents | |
917 my @array = ($read1, $align{Ref1}, $align{Read1}, $qtrim1, undef, undef, undef, $contig, $strand, $conc); | |
918 $Aligned->enqueue(\@array); | |
919 } | |
920 } elsif(defined $cacheAlign{$contig}{R1}{$trimmed1}) { | |
921 my @array; | |
922 { lock(%cacheAlign); @array = @{ $cacheAlign{$contig}{R1}{$trimmed1} } }; | |
923 $array[0] = $read1; | |
924 $array[3] = $qtrim1; | |
925 { lock($cacheAlignHits); $cacheAlignHits++ }; | |
926 | |
927 # add aligned reads to queue | |
928 { | |
929 lock($Aligned); # Keep other threads from changing the queue's contents | |
930 $Aligned->enqueue(\@array); | |
931 } | |
932 } else { | |
933 # align read using NW2 | |
934 { | |
935 lock($ToAlign); # Keep other threads from changing the queue's contents | |
936 my @array = ($$refSeq{$contig}, $trimmed1, undef, $read1, $qtrim1, undef, $contig, $strand, $conc); | |
937 #my @array = ($fastaIn, $read1, $qtrim1, undef, $contig, $strand, $conc, $trimmed1); | |
938 $ToAlign->enqueue(\@array); | |
939 } | |
940 } | |
941 } | |
942 | |
943 if(!$conc && $index_read2 > -1) { | |
944 my $search2 = $trimmed2; | |
945 $search2 =~ s/N/./g; | |
946 if($$refSeq{$contig} =~ /$search2/) { | |
947 # the read matches exactly to the reference | |
948 $align{Ref2} = $$refSeq{$contig}; | |
949 $align{Read2} = $$refSeq{$contig}; | |
950 $align{Read2} =~ s/^(.*)$search2(.*?)$/${1}X$2/; | |
951 $align{Read2} =~ s/[^X]/-/g; | |
952 $align{Read2} = modify($align{Read2}, sub { $_[0] =~ s/X/$trimmed2/ }); | |
953 | |
954 # add aligned reads to queue | |
955 { | |
956 lock($Aligned); # Keep other threads from changing the queue's contents | |
957 my @array = ($read2, undef, undef, undef, $align{Ref2}, $align{Read2}, $qtrim2, $contig, $strand, $conc); | |
958 $Aligned->enqueue(\@array); | |
959 } | |
960 } elsif(defined $cacheAlign{$contig}{R2}{$trimmed2}) { | |
961 my @array; | |
962 { lock(%cacheAlign); @array = @{ $cacheAlign{$contig}{R2}{$trimmed2} } }; | |
963 $array[0] = $read2; | |
964 $array[6] = $qtrim2; | |
965 { lock($cacheAlignHits); $cacheAlignHits++ }; | |
966 | |
967 # add aligned reads to queue | |
968 { | |
969 lock($Aligned); # Keep other threads from changing the queue's contents | |
970 $Aligned->enqueue(\@array); | |
971 } | |
972 } else { | |
973 # align read using NW2 | |
974 { | |
975 lock($ToAlign); # Keep other threads from changing the queue's contents | |
976 my @array = ($$refSeq{$contig}, undef, $trimmed2, $read2, undef, $qtrim2, $contig, $strand, $conc); | |
977 #my @array = ($fastaIn, $read2, undef, $qtrim2, $contig, $strand, $conc, $trimmed2); | |
978 $ToAlign->enqueue(\@array); | |
979 } | |
980 } | |
981 } | |
982 } | |
983 | |
984 if($index_read1 < 0) { | |
985 # discard read 1 | |
986 # send to writebamfile thread | |
987 { | |
988 lock($WriteBam); # Keep other threads from changing the queue's contents | |
989 $WriteBam->enqueue("$read1\t4\t*\t0\t0\t*\t*\t0\t0\t$seq1\t$qual1\tRG:Z:$readGroup\n"); | |
990 } | |
991 { lock(%stats); $stats{PoorQual1}++ }; | |
992 } | |
993 | |
994 if($index_read2 < 0) { | |
995 # discard read 2 | |
996 # send to writebamfile thread | |
997 { | |
998 lock($WriteBam); # Keep other threads from changing the queue's contents | |
999 $WriteBam->enqueue("$read2\t4\t*\t0\t0\t*\t*\t0\t0\t$seq2\t$qual2\tRG:Z:$readGroup\n"); | |
1000 } | |
1001 { lock(%stats); $stats{PoorQual2}++ }; | |
1002 } | |
1003 | |
1004 if($index_read1 == -1 && $index_read2 == -1) { | |
1005 { lock(%stats); $stats{Amb}++ }; | |
1006 } elsif($index_read1 == -2 && $index_read2 == -2) { | |
1007 { lock(%stats); $stats{PoorQualMates}++ }; | |
1008 } | |
1009 } | |
1010 | |
1011 # grab the next reads in queue | |
1012 { | |
1013 lock($FindContig); # Keep other threads from changing the queue's contents | |
1014 $readArrayRef = $FindContig->dequeue(); | |
1015 } | |
1016 } | |
1017 print "Closing Find Amplicon Thread\n"; | |
1018 return; | |
1019 } | |
1020 | |
1021 sub writebamfile { | |
1022 my $bamfile = shift; | |
1023 my $WriteBam = shift; | |
1024 | |
1025 # write samfile header | |
1026 open(PIPE_TO_SAMTOOLS, "| samtools view -Sb -o $bamfile - 2>&1") or die "Can't fork: $!\n"; | |
1027 foreach my $key (@OrderedContigs) { | |
1028 print PIPE_TO_SAMTOOLS "\@SQ\tSN:$refLength{$key}{newReg}\tLN:$refLength{$key}{length}\n"; | |
1029 } | |
1030 print PIPE_TO_SAMTOOLS "\@RG\tID:$readGroup\tPL:Illumina\tSM:$sampleName\n"; | |
1031 | |
1032 my $read; | |
1033 { | |
1034 lock($WriteBam); # Keep other threads from changing the queue's contents | |
1035 $read = $WriteBam->dequeue(); | |
1036 } | |
1037 while(defined $read) { | |
1038 | |
1039 # write read to bamfile | |
1040 print PIPE_TO_SAMTOOLS $read; | |
1041 | |
1042 # grab next item in queue | |
1043 { | |
1044 lock($WriteBam); # Keep other threads from changing the queue's contents | |
1045 $read = $WriteBam->dequeue(); | |
1046 } | |
1047 } | |
1048 print "Closing Bam Writer\n"; | |
1049 close PIPE_TO_SAMTOOLS or die "pipe could not be closed"; | |
1050 return; | |
1051 } | |
1052 | |
1053 sub thread_process_align { | |
1054 my $Aligned = shift; | |
1055 my $refFrags = shift; | |
1056 my $refSeq = shift; | |
1057 my $Samfile = shift; | |
1058 my $WriteBam = shift; | |
1059 | |
1060 my $readArrayRef; | |
1061 { | |
1062 lock($Aligned); # Keep other threads from changing the queue's contents | |
1063 $readArrayRef = $Aligned->dequeue(); | |
1064 } | |
1065 while(defined $readArrayRef) { | |
1066 # grab the alignment | |
1067 my ($read, $ref1, $read1, $qtrim1, $ref2, $read2, $qtrim2, $contig, $strand, $conc) = @{ $readArrayRef }; | |
1068 # grab primer lengths | |
1069 my $PU = $$refFrags{primers}{$contig}{U}{F}{length}; | |
1070 my $PD = $$refFrags{primers}{$contig}{D}{F}{length}; | |
1071 my %readStats; | |
1072 my $orig_read1 = $read1; | |
1073 my $orig_read2 = $read2; | |
1074 my $orig_qtrim1 = $qtrim1; | |
1075 my $orig_qtrim2 = $qtrim2; | |
1076 $orig_read1 =~ s/-//g if defined $orig_read1; | |
1077 $orig_read2 =~ s/-//g if defined $orig_read2; | |
1078 | |
1079 if(defined $OPTIONS{d}) { | |
1080 # debug info | |
1081 print "Alignment With Primers\n"; | |
1082 print "$ref1\n$read1\n" if defined $read1; | |
1083 print "$ref2\n$read2\n" if defined $read2; | |
1084 print "\nFinal Alignment:\n"; | |
1085 } | |
1086 | |
1087 # calculate where primers begin and end | |
1088 if(defined $read1) { | |
1089 if($strand eq 'F') { | |
1090 $readStats{$contig}{Read1} = 1; | |
1091 } else { | |
1092 $readStats{$contig}{Read2} = 1; | |
1093 } | |
1094 | |
1095 # determine the primer locations | |
1096 my $offset = 0; | |
1097 for(my $occurance = 0; $occurance < $PU; $occurance++) { | |
1098 substr($ref1, $offset) =~ /^.*?([^-])/; | |
1099 $offset += $-[1] + 1; | |
1100 } | |
1101 my $length = 0; | |
1102 for(my $occurance = 0; $occurance < $PD; $occurance++) { | |
1103 substr($ref1, 0, length($ref1) - $length) =~ /.*([^-]).*?$/; | |
1104 $length = length($ref1) - $-[1]; | |
1105 } | |
1106 $length = length($ref1) - $length - $offset; | |
1107 | |
1108 # remove primers and adjust quality scores | |
1109 $ref1 = substr($ref1, $offset, $length); | |
1110 my $qualOffset = ( substr($read1, 0, $offset) =~ tr/ACTGN// ); | |
1111 $read1 = substr($read1, $offset, $length); | |
1112 $length = ( $read1 =~ tr/ACTGN// ); | |
1113 $qtrim1 = substr($qtrim1, $qualOffset, $length); | |
1114 } | |
1115 | |
1116 if(defined $read2) { | |
1117 if($strand eq 'F') { | |
1118 $readStats{$contig}{Read2} = 1; | |
1119 } else { | |
1120 $readStats{$contig}{Read1} = 1; | |
1121 } | |
1122 | |
1123 # determine the primer locations | |
1124 my $offset = 0; | |
1125 for(my $occurance = 0; $occurance < $PU; $occurance++) { | |
1126 substr($ref2, $offset) =~ /^.*?([^-])/; | |
1127 $offset += $-[1] + 1; | |
1128 } | |
1129 my $length = 0; | |
1130 for(my $occurance = 0; $occurance < $PD; $occurance++) { | |
1131 substr($ref2, 0, length($ref2) - $length) =~ /.*([^-]).*?$/; | |
1132 $length = length($ref2) - $-[1]; | |
1133 } | |
1134 $length = length($ref2) - $length - $offset; | |
1135 | |
1136 # remove primers and adjust quality scores | |
1137 $ref2 = substr($ref2, $offset, $length); | |
1138 my $qualOffset = ( substr($read2, 0, $offset) =~ tr/ACTGN// ); | |
1139 $read2 = substr($read2, $offset, $length); | |
1140 $length = ( $read2 =~ tr/ACTGN// ); | |
1141 $qtrim2 = substr($qtrim2, $qualOffset, $length); | |
1142 } | |
1143 | |
1144 # generate a concensus sequence if we have both read1 and read2 and they overlap | |
1145 if($conc) { | |
1146 # the following are 0 based indexes | |
1147 my $R1qual = 0; # index for sanger qual read 1 | |
1148 my $R2qual = 0; # index for sanger qual read 2 | |
1149 my $aindex = 0; # index for alignment | |
1150 my $q1; | |
1151 my $q2; | |
1152 my $concensus; | |
1153 my $concqual; | |
1154 | |
1155 $readStats{$contig}{ConcReads} = 1; | |
1156 $readStats{$contig}{ConcCorrectBases} = 0; | |
1157 $readStats{$contig}{ConcErrorsBases} = 0; | |
1158 while($aindex < length($ref1)) { | |
1159 # base1 is current base from read 1 | |
1160 # base2 is current base from read 2 | |
1161 my $base1 = substr($read1, $aindex, 1); | |
1162 my $base2 = substr($read2, $aindex, 1); | |
1163 | |
1164 # grab the next quality score | |
1165 if($R1qual < length($qtrim1)) { | |
1166 $q1 = substr($qtrim1, $R1qual, 1); | |
1167 } | |
1168 if($R2qual < length($qtrim2)) { | |
1169 $q2 = substr($qtrim2, $R2qual, 1); | |
1170 } | |
1171 | |
1172 # if we have a base in either | |
1173 if(($base1 ne '-' && $base1 ne 'N') || ($base2 ne '-' && $base2 ne 'N')) { | |
1174 if(($base2 eq "-" || $base2 eq 'N') && $base1 ne "-" && $base1 ne 'N') { | |
1175 # we are only in read 1 or read 2 is a N | |
1176 $concensus .= $base1; | |
1177 $concqual .= $q1; | |
1178 $R1qual++; | |
1179 if($base2 eq 'N') { | |
1180 $R2qual++; | |
1181 } | |
1182 } elsif(($base1 eq "-" || $base1 eq 'N') && $base2 ne "-" && $base2 ne 'N') { | |
1183 # we are only in read 2 or read 1 is a N | |
1184 $concensus .= $base2; | |
1185 $concqual .= $q2; | |
1186 $R2qual++; | |
1187 if($base1 eq 'N') { | |
1188 $R1qual++; | |
1189 } | |
1190 } else { | |
1191 # we are in the overlap region | |
1192 my $maxqual = sangerMax($q1, $q2); | |
1193 if($base1 eq $base2) { | |
1194 # both reads agree | |
1195 $readStats{$contig}{ConcCorrectBases}++; | |
1196 $concensus .= $base1; | |
1197 $concqual .= $maxqual; | |
1198 } elsif($q1 eq $q2) { | |
1199 # reads have different bases but the same score | |
1200 $readStats{$contig}{ConcErrorsBases}++; | |
1201 $concensus .= 'N'; | |
1202 $concqual .= '#'; | |
1203 } elsif($q1 eq $maxqual) { | |
1204 # reads have different bases but read 1 has the best score | |
1205 $concensus .= $base1; | |
1206 $concqual .= $maxqual; | |
1207 $readStats{$contig}{ConcErrorsBases}++; | |
1208 } else { | |
1209 # reads have different bases but read 2 has the best score | |
1210 $concensus .= $base2; | |
1211 $concqual .= $maxqual; | |
1212 $readStats{$contig}{ConcErrorsBases}++; | |
1213 } | |
1214 $R1qual++; | |
1215 $R2qual++; | |
1216 } | |
1217 } elsif($base1 eq 'N' || $base2 eq 'N') { | |
1218 if($base2 eq '-') { | |
1219 # one is a deletion the other is an N | |
1220 $concensus .= $base1; | |
1221 $concqual .= $q1; | |
1222 $R1qual++; | |
1223 } elsif($base1 eq '-') { | |
1224 # one is a deletion the other is an N | |
1225 $concensus .= $base2; | |
1226 $concqual .= $q2; | |
1227 $R2qual++; | |
1228 } elsif($base1 eq 'N' && $base2 eq 'N') { | |
1229 # both reads are an N | |
1230 $concensus .= 'N'; | |
1231 $concqual .= sangerMax($q1, $q2); | |
1232 $R1qual++; | |
1233 $R2qual++; | |
1234 } | |
1235 } else { | |
1236 $concensus .= '-'; | |
1237 } | |
1238 $aindex++; | |
1239 } | |
1240 $read1 = $concensus; | |
1241 $qtrim1 = $concqual; | |
1242 $read2 = undef; | |
1243 $qtrim2 = undef; | |
1244 } | |
1245 | |
1246 if(defined $read1) { | |
1247 # remove any trailing - and send to samfile_line | |
1248 $read1 =~ s/^-*?([^-].*)/$1/; | |
1249 $ref1 = substr($ref1, -length($read1)); | |
1250 $read1 =~ s/(.*[^-])-*?$/$1/; | |
1251 $ref1 = substr($ref1, 0, length($read1)); | |
1252 | |
1253 my $tmp = $ref1; | |
1254 $tmp =~ s/-//g; | |
1255 my $rstart = index($$refSeq{$contig}, $tmp) - $PU + 1; | |
1256 | |
1257 # generate sam file line | |
1258 { | |
1259 lock($Samfile); # Keep other threads from changing the queue's contents | |
1260 my @array = ($read, $qtrim1, $rstart, $read1, $ref1, $contig, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, \%readStats); | |
1261 $Samfile->enqueue(\@array); | |
1262 } | |
1263 } | |
1264 | |
1265 if(defined $read2) { | |
1266 # remove any trailing - and send to samfile_line | |
1267 $read2 =~ s/^-*?([^-].*)/$1/; | |
1268 $ref2 = substr($ref2, -length($read2)); | |
1269 $read2 =~ s/(.*[^-])-*?$/$1/; | |
1270 $ref2 = substr($ref2, 0, length($read2)); | |
1271 | |
1272 my $tmp = $ref2; | |
1273 $tmp =~ s/-//g; | |
1274 my $rstart = rindex($$refSeq{$contig}, $tmp) - $PU + 1; | |
1275 | |
1276 # generate sam file line | |
1277 { | |
1278 lock($Samfile); # Keep other threads from changing the queue's contents | |
1279 my @array = ($read, $qtrim2, $rstart, $read2, $ref2, $contig, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, \%readStats); | |
1280 $Samfile->enqueue(\@array); | |
1281 } | |
1282 } | |
1283 | |
1284 if(defined $OPTIONS{d}) { | |
1285 # debug info | |
1286 print "$ref1\n$read1\n" if defined $read1; | |
1287 print "$ref2\n$read2\n" if defined $read2; | |
1288 print "$qtrim1\n" if defined $read1; | |
1289 print "$qtrim2\n" if defined $read2; | |
1290 print "\n\n"; | |
1291 } | |
1292 | |
1293 # grab next item in queue | |
1294 { | |
1295 lock($Aligned); # Keep other threads from changing the queue's contents | |
1296 $readArrayRef = $Aligned->dequeue(); | |
1297 } | |
1298 } | |
1299 print "Closing Alignment Processor\n"; | |
1300 return; | |
1301 } | |
1302 | |
1303 # function to take reverse compliment | |
1304 sub revcompl { | |
1305 my $string = shift @_; | |
1306 my $revcomp = reverse($string); | |
1307 $revcomp =~ tr/ACGTacgtKMSWRYBDHVkmswrybdhv/TGCAtgcaMKSWYRVHDBmkswyrvhdb/; | |
1308 return $revcomp; | |
1309 } | |
1310 | |
1311 # function to convert sanger fastq quality scores to phred scores | |
1312 sub sanger2phred { | |
1313 my $sang = shift @_; | |
1314 my @sanger = split("", $sang); | |
1315 my @phred; | |
1316 foreach my $q (@sanger) { | |
1317 my $Q = ord($q) - 33; | |
1318 push @phred, $Q; | |
1319 } | |
1320 return @phred; | |
1321 } | |
1322 | |
1323 # function to return the largest sanger quality score | |
1324 sub sangerMax { | |
1325 my ($s1, $s2) = @_; | |
1326 my $Q1 = ord($s1) - 33; | |
1327 my $Q2 = ord($s2) - 33; | |
1328 if($Q1 > $Q2) { | |
1329 return $s1; | |
1330 } else { | |
1331 return $s2; | |
1332 } | |
1333 } | |
1334 | |
1335 # function to read a fasta file as a hash | |
1336 sub read_fasta_file { | |
1337 my ($fasta_file, $seqs) = @_; | |
1338 my $header; | |
1339 my $seq = ""; | |
1340 open (IN, $fasta_file) or die "can't open $fasta_file: $!\n"; | |
1341 while (my $line = <IN>) { | |
1342 chomp $line; | |
1343 if (grep />/, $line) { | |
1344 if ($seq ne "") { | |
1345 $seqs->{$header} = $seq; | |
1346 } | |
1347 $header = $line; | |
1348 | |
1349 $header =~ s/^>//; # remove ">" | |
1350 $header =~ s/\s+$//; # remove trailing whitespace | |
1351 $seq = ""; # clear out old sequence | |
1352 } else { | |
1353 $line =~ s/\s+//g; # remove whitespace | |
1354 $seq .= $line; # add sequence | |
1355 } | |
1356 } | |
1357 close IN; # finished with file | |
1358 | |
1359 if ($seq ne "") { # handle last sequence | |
1360 $seqs->{$header} = $seq; | |
1361 } | |
1362 | |
1363 return; | |
1364 } | |
1365 | |
1366 # minimal element of a list | |
1367 # | |
1368 sub min | |
1369 { | |
1370 my @list = @{$_[0]}; | |
1371 my $min = $list[0]; | |
1372 | |
1373 foreach my $i (@list) | |
1374 { | |
1375 $min = $i if ($i < $min); | |
1376 } | |
1377 | |
1378 return $min; | |
1379 } | |
1380 | |
1381 # maximal element of a list | |
1382 # | |
1383 sub max | |
1384 { | |
1385 my @list = @{$_[0]}; | |
1386 my $max = $list[0]; | |
1387 | |
1388 foreach my $i (@list) | |
1389 { | |
1390 $max = $i if ($i > $max); | |
1391 } | |
1392 | |
1393 return $max; | |
1394 } | |
1395 | |
1396 # trim sequences using the BWA algorithm | |
1397 sub trim { | |
1398 | |
1399 my ($s, $q, $threshold, $read) = @_; # read sequence and quality scores | |
1400 if($read == 2) { | |
1401 $q = reverse($q); | |
1402 $s = reverse($s); | |
1403 } | |
1404 | |
1405 my @array = sanger2phred($q); | |
1406 my $length = scalar @array; | |
1407 | |
1408 # only calculate if quality fails near end | |
1409 if( $array[$#array] >= $threshold ){ | |
1410 if($read == 2) { | |
1411 $q = reverse($q); | |
1412 $s = reverse($s); | |
1413 } | |
1414 return ($s, $q); | |
1415 } | |
1416 | |
1417 # run bwa equation | |
1418 my @arg; | |
1419 for( my $i = 0; $i < $length - 1; $i++ ){ | |
1420 my $x = $i + 1; | |
1421 for( my $j = $x; $j < $length; $j++ ){ | |
1422 $arg[$x] += $threshold - $array[$j]; | |
1423 } | |
1424 } | |
1425 | |
1426 # find number of 5' bases to retain | |
1427 my $index = 0; | |
1428 my $maxval = 0; | |
1429 for ( 1 .. $#arg ){ | |
1430 if ( $maxval < $arg[$_] ){ | |
1431 $index = $_; | |
1432 $maxval = $arg[$_]; | |
1433 } | |
1434 } | |
1435 | |
1436 # trim reads | |
1437 $s = substr($s,0,$index); | |
1438 $q = substr($q,0,$index); | |
1439 if($read == 2) { | |
1440 $s = reverse($s); | |
1441 $q = reverse($q); | |
1442 } | |
1443 | |
1444 # return seq and quality | |
1445 return ($s, $q); | |
1446 } | |
1447 | |
1448 | |
1449 sub samfile_line { | |
1450 # grab the variables | |
1451 my ($Samfile, $WriteBam, $refSeq) = @_; | |
1452 my %SamStats; | |
1453 my %minLength; | |
1454 my %maxLength; | |
1455 my $readArrayRef; | |
1456 { | |
1457 lock($Samfile); # Keep other threads from changing the queue's contents | |
1458 $readArrayRef = $Samfile->dequeue(); | |
1459 } | |
1460 | |
1461 while(defined $readArrayRef) { | |
1462 my ($read, $qual, $rstart, $align, $ref, $reg, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, $readStats) = @{ $readArrayRef }; | |
1463 my $seq = $align; | |
1464 $seq =~ s/-//g; | |
1465 | |
1466 # calculate the edit difference - count indels as only 1 missmatch hence tranform the strings so that all indels have length 1 | |
1467 my $sam; | |
1468 my $s1; | |
1469 my $s2; | |
1470 my @s1 = split("", $align); | |
1471 my @s2 = split("", $ref); | |
1472 | |
1473 my $indel = 0; | |
1474 for(my $i = 0; $i <= $#s1; $i++) { | |
1475 if(($s1[$i] ne "-" && $s2[$i] ne "-") || !$indel) { | |
1476 $s1 .= $s1[$i]; | |
1477 $s2 .= $s2[$i]; | |
1478 $indel = 0; | |
1479 if($s1[$i] eq "-" || $s2[$i] eq "-") { | |
1480 $indel = 1; | |
1481 } | |
1482 } | |
1483 } | |
1484 my $mq = levenshtein($s1, $s2); | |
1485 | |
1486 my $cl = length($seq); | |
1487 # calculate max diff allowed | |
1488 if(!defined $maxdiff{$cl}) { | |
1489 { lock(%maxdiff); $maxdiff{$cl} = maxdiff($cl, 0.02, 0.04); } | |
1490 } | |
1491 | |
1492 if($mq > $maxdiff{$cl} || $cl == 0) { | |
1493 { lock(%stats); $stats{$reg}{PoorMapQual}++; $stats{PoorMapQual}++; } | |
1494 if(defined $orig_read1) { | |
1495 $sam = "$read\t4\t*\t0\t0\t*\t*\t0\t0\t$orig_read1\t$orig_qtrim1\tRG:Z:$readGroup\n"; | |
1496 # send to writebamfile thread | |
1497 { | |
1498 lock($WriteBam); # Keep other threads from changing the queue's contents | |
1499 $WriteBam->enqueue($sam); | |
1500 } | |
1501 } | |
1502 if(defined $orig_read2) { | |
1503 $read =~ s/[1](:[A-Z]+:[0-9]+:[ACTGN]+)$/2$1/; | |
1504 $sam = "$read\t4\t*\t0\t0\t*\t*\t0\t0\t$orig_read2\t$orig_qtrim2\tRG:Z:$readGroup\n"; | |
1505 # send to writebamfile thread | |
1506 { | |
1507 lock($WriteBam); # Keep other threads from changing the queue's contents | |
1508 $WriteBam->enqueue($sam); | |
1509 } | |
1510 } | |
1511 } else { | |
1512 $mq = 60; | |
1513 | |
1514 # add any stats from earlier | |
1515 foreach my $key1 (keys %$readStats) { | |
1516 foreach my $key2 (keys % { $$readStats{$key1} }) { | |
1517 $SamStats{$key1}{$key2} += $$readStats{$key1}{$key2}; | |
1518 } | |
1519 } | |
1520 | |
1521 # keep track of quality scores | |
1522 $SamStats{$reg}{Bases} += $cl; | |
1523 $SamStats{$reg}{Quality} += $_ for sanger2phred($qual); | |
1524 | |
1525 # keep track of read lengths | |
1526 if(defined $minLength{$reg}) { | |
1527 $minLength{$reg} = min([$cl, $minLength{$reg}]); | |
1528 $maxLength{$reg} = max([$cl, $maxLength{$reg}]); | |
1529 } else { | |
1530 $minLength{$reg} = $cl; | |
1531 $maxLength{$reg} = $cl; | |
1532 } | |
1533 | |
1534 # keep track of strand and bases seen | |
1535 if($strand eq 'F') { | |
1536 $SamStats{$reg}{NumberForward}++; | |
1537 } else { | |
1538 $SamStats{$reg}{NumberReverse}++; | |
1539 } | |
1540 $SamStats{$reg}{TotalA} += ($seq =~ tr/A//); | |
1541 $SamStats{$reg}{TotalC} += ($seq =~ tr/C//); | |
1542 $SamStats{$reg}{TotalG} += ($seq =~ tr/G//); | |
1543 $SamStats{$reg}{TotalT} += ($seq =~ tr/T//); | |
1544 $SamStats{$reg}{TotalN} += ($seq =~ tr/N//); | |
1545 | |
1546 # keep count the number of reads for each contig | |
1547 $SamStats{$reg}{Reads}++; | |
1548 | |
1549 # calculate strand info | |
1550 if($strand eq 'F') { | |
1551 $strand = 0; | |
1552 } else { | |
1553 $strand = 16; | |
1554 } | |
1555 # calculate cigar | |
1556 my $cigar = ""; | |
1557 my $M = 0; | |
1558 my $D = 0; | |
1559 my $I = 0; | |
1560 my $Subs = 0; | |
1561 for(my $i = 0; $i < length($align); $i++) { | |
1562 if(substr($align, $i, 1) eq "-") { | |
1563 $D++; | |
1564 if($M) { | |
1565 $cigar .= "${M}M"; | |
1566 $M = 0; | |
1567 } elsif($I) { | |
1568 $cigar .= "${I}I"; | |
1569 $I = 0; | |
1570 } | |
1571 } elsif(substr($ref, $i, 1) eq "-") { | |
1572 $I++; | |
1573 if($M) { | |
1574 $cigar .= "${M}M"; | |
1575 $M = 0; | |
1576 } elsif($D) { | |
1577 $cigar .= "${D}D"; | |
1578 $D = 0; | |
1579 } | |
1580 } else { | |
1581 $M++; | |
1582 if($D) { | |
1583 $cigar .= "${D}D"; | |
1584 $D = 0; | |
1585 } elsif($I) { | |
1586 $cigar .= "${I}I"; | |
1587 $I = 0; | |
1588 } | |
1589 $Subs++ if substr($ref, $i, 1) ne substr($align, $i, 1); | |
1590 } | |
1591 } | |
1592 if($M) { | |
1593 $cigar .= "${M}M"; | |
1594 } elsif($I) { | |
1595 $cigar .= "${I}I"; | |
1596 } elsif($D) { | |
1597 $cigar .= "${D}D"; | |
1598 } | |
1599 | |
1600 # construct sam file line | |
1601 $sam = "$read\t$strand\t$refLength{$reg}{newReg}\t$rstart\t$mq\t$cigar\t*\t0\t0\t$seq\t$qual\tRG:Z:$readGroup\n"; | |
1602 | |
1603 # send to writebamfile thread | |
1604 { | |
1605 lock($WriteBam); # Keep other threads from changing the queue's contents | |
1606 $WriteBam->enqueue($sam); | |
1607 } | |
1608 } | |
1609 | |
1610 #grab the next read | |
1611 { | |
1612 lock($Samfile); # Keep other threads from changing the queue's contents | |
1613 $readArrayRef = $Samfile->dequeue(); | |
1614 } | |
1615 } | |
1616 | |
1617 { | |
1618 # add stats from earlier | |
1619 lock(%stats); | |
1620 foreach my $key1 (keys %SamStats) { | |
1621 $stats{$key1}{MinLength} = min([$minLength{$key1}, $stats{$key1}{MinLength}]); | |
1622 $stats{$key1}{MaxLength} = max([$maxLength{$key1}, $stats{$key1}{MaxLength}]); | |
1623 foreach my $key2 (keys % { $SamStats{$key1} }) { | |
1624 $stats{$key1}{$key2} += $SamStats{$key1}{$key2}; | |
1625 } | |
1626 } | |
1627 | |
1628 } | |
1629 print "Closing Sam File Line Generator\n"; | |
1630 | |
1631 return; | |
1632 } | |
1633 | |
1634 # perform the alignment | |
1635 sub nwAlign { | |
1636 my ($ToAlign, $Aligned, $regions) = @_; | |
1637 my $readArrayRef; | |
1638 { | |
1639 lock($ToAlign); # Keep other threads from changing the queue's contents | |
1640 $readArrayRef = $ToAlign->dequeue(); | |
1641 } | |
1642 | |
1643 while(defined $readArrayRef) { | |
1644 # we have picked a contig | |
1645 # check strand and take rev comp of ref if necessary | |
1646 my ($ref, $left, $right, $read, $qtrim1, $qtrim2, $contig, $strand, $conc) = @{ $readArrayRef }; | |
1647 my %align; | |
1648 my $name; | |
1649 #! Parameters | |
1650 #! ClustalW Parameters | |
1651 my $matchScore = 1.9; | |
1652 my $missmatch_penalty = -0.4; | |
1653 my $gapPenOpen = -10.0; | |
1654 my $gapPenExt = -0.4; | |
1655 my $leftAndRightGapPen = true; | |
1656 my $debug = false; | |
1657 my $cons = ""; | |
1658 my $align1= ""; | |
1659 my $align2= ""; | |
1660 my $align3= ""; | |
1661 my $score = 0.0; | |
1662 my $singleAlign = false; #aligning 2 or 3 sequences | |
1663 if($conc) | |
1664 { | |
1665 #got to concatenate seqs and align | |
1666 $score = align3($ref, $right, $left, $align1, $align2, $align3, $matchScore, $missmatch_penalty, $gapPenOpen, $gapPenExt, $leftAndRightGapPen, $debug); | |
1667 $name = "$left.$right"; | |
1668 print "$contig\n$align1\n$align2\n$align3\n\n" if defined $$regions{$contig}; | |
1669 } | |
1670 elsif(!defined $right) | |
1671 { | |
1672 #align with left | |
1673 $score = align2($ref, $left, $align1, $align2, $matchScore, $missmatch_penalty, $gapPenOpen, $gapPenExt, false, true, $debug); | |
1674 $singleAlign = true; | |
1675 $name = $left; | |
1676 print "$contig\n$align1\n$align2\n\n" if defined $$regions{$contig}; | |
1677 } | |
1678 elsif(!defined $left) | |
1679 { | |
1680 #align with right | |
1681 $score = align2($ref, $right, $align1, $align2, $matchScore, $missmatch_penalty, $gapPenOpen ,$gapPenExt ,true, false, $debug); | |
1682 $singleAlign = true; | |
1683 $name = $right; | |
1684 print "$contig\n$align1\n$align2\n\n" if defined $$regions{$contig}; | |
1685 } | |
1686 | |
1687 # add aligned reads to queue | |
1688 # cache results | |
1689 my $type; | |
1690 { | |
1691 lock($Aligned); # Keep other threads from changing the queue's contents | |
1692 | |
1693 my @array; | |
1694 if($singleAlign && !defined $right) | |
1695 { | |
1696 @array = ($read, $align1, $align2, $qtrim1, undef, undef, undef, $contig, $strand, $conc); | |
1697 { | |
1698 lock(%cacheAlign); | |
1699 $cacheAlign{$contig}{R1}{$name} = &share( [] ); | |
1700 @{ $cacheAlign{$contig}{R1}{$name} } = @array; | |
1701 } | |
1702 } | |
1703 elsif($singleAlign && !defined $left ) | |
1704 { | |
1705 @array = ($read, undef, undef, undef, $align1, $align2, $qtrim2, $contig, $strand, $conc); | |
1706 { | |
1707 lock(%cacheAlign); | |
1708 $cacheAlign{$contig}{R2}{$name} = &share( [] ); | |
1709 @{ $cacheAlign{$contig}{R2}{$name} } = @array; | |
1710 } | |
1711 } | |
1712 else | |
1713 { | |
1714 @array = ($read, $align1, $align2, $qtrim1, $align1, $align3, $qtrim2, $contig, $strand, $conc); | |
1715 { | |
1716 lock(%cacheAlign); | |
1717 $cacheAlign{Conc}{$name} = &share( [] ); | |
1718 @{ $cacheAlign{Conc}{$name} } = @array; | |
1719 } | |
1720 } | |
1721 $Aligned->enqueue(\@array); | |
1722 } | |
1723 | |
1724 #grab the next read | |
1725 { | |
1726 lock($ToAlign); # Keep other threads from changing the queue's contents | |
1727 $readArrayRef = $ToAlign->dequeue(); | |
1728 } | |
1729 } | |
1730 # clean up | |
1731 print "Closing NW2 Aligner\n"; | |
1732 return; | |
1733 } | |
1734 | |
1735 # function to calculat the minimum edit distance threshold (taken from bwa) | |
1736 sub maxdiff { | |
1737 my ($l, $err, $thres) = @_; | |
1738 my $lambda = exp(-$l*$err); | |
1739 my $y = 1.0; | |
1740 my $x = 1; | |
1741 my $sum = $lambda; | |
1742 for(my $k = 1; $k < 1000; $k++) { | |
1743 $y *= $l * $err; | |
1744 $x *= $k; | |
1745 $sum += $lambda * $y / $x; | |
1746 if(1.0 - $sum < $thres) { | |
1747 return $k; | |
1748 } | |
1749 } | |
1750 return 2; | |
1751 } | |
1752 | |
1753 # using approx matching return the positions in $list of those most closely matching $string | |
1754 sub amatch { | |
1755 my ($string, $list) = @_; | |
1756 my @d = adistr($string, @$list); | |
1757 my @d_index = sort { abs($d[$a]) <=> abs($d[$b]) } 0..$#$list; | |
1758 | |
1759 return if $d[$d_index[0]] > 0.25; | |
1760 | |
1761 my @pos = ($d_index[0]); | |
1762 my $i = 1; | |
1763 while($i <= $#d_index && $d[$d_index[0]] == $d[$d_index[$i]]) { | |
1764 push @pos, $d_index[$i]; | |
1765 $i++; | |
1766 } | |
1767 return @pos; | |
1768 } | |
1769 | |
1770 # a sorting algorithm to sort the chromosomes | |
1771 sub sortChrom { | |
1772 my ($achr, $astart, $aend) = split(":|-", $a); | |
1773 my ($bchr, $bstart, $bend) = split(":|-", $b); | |
1774 $achr =~ s/X/23/; | |
1775 $achr =~ s/Y/24/; | |
1776 $achr =~ s/MT?/25/; | |
1777 $bchr =~ s/X/23/; | |
1778 $bchr =~ s/Y/24/; | |
1779 $bchr =~ s/MT?/25/; | |
1780 if($achr < $bchr) { | |
1781 return -1; | |
1782 } elsif($bchr < $achr) { | |
1783 return 1; | |
1784 } elsif($astart < $bstart) { | |
1785 return -1; | |
1786 } elsif($bstart < $astart) { | |
1787 return 1; | |
1788 } elsif($aend < $bend) { | |
1789 return -1; | |
1790 } elsif($bend < $aend) { | |
1791 return 1; | |
1792 } else { | |
1793 return 0; | |
1794 } | |
1795 } | |
1796 | |
1797 # pass a code reference | |
1798 sub modify { | |
1799 my($text, $code) = @_; | |
1800 $code->($text); | |
1801 return $text; | |
1802 } | |
1803 | |
1804 sub split_region { | |
1805 my ($chr, $start, $end) = split(":|-", shift); | |
1806 $end = $start unless defined $end; | |
1807 return($chr, $start, $end); | |
1808 } | |
1809 | |
1810 sub error_rate { | |
1811 my ($x, $y) = @_; | |
1812 my @x = split("", $x); | |
1813 my @y = split("", $y); | |
1814 my $s = 0; | |
1815 my $l = 0; | |
1816 for(my $i = 0; $i <= $#x; $i++) { | |
1817 if($x[$i] ne "-" && $y[$i] ne "-") { | |
1818 $s++ if $x[$i] ne $y[$i]; | |
1819 $l++; | |
1820 } | |
1821 } | |
1822 return($s / $l); | |
1823 } | |
1824 | |
1825 |