comparison alignCustomAmplicon/alignCustomAmplicon.pl @ 0:d32bddcff685 draft

Uploaded
author fcaramia
date Wed, 09 Jan 2013 00:24:09 -0500
parents
children 22f35f830f48
comparison
equal deleted inserted replaced
-1:000000000000 0:d32bddcff685
1 #!/usr/bin/perl
2
3 # script to convert paired end reads to a concensus sequence
4
5 # Created by Jason Ellul, Feb 2012
6 # Modified by Franco Caramia and Jason Ellul, July 2012 and November 2012
7 # 0.4 Fixed mapping of concensus region (large insertions and deletions now picked up correctly)
8 # Added missalignment penalty
9 # Read 1 and 2 are now aligned first then they are aligned to the reference
10 # 0.5 Added ability to name amplicons by includinfo the info in the primers file
11 # Added the ability to output the alignments in a particular region using the new flag -a
12 # Added some extra stats to the output
13
14 use strict;
15 use warnings;
16 use Getopt::Std;
17 use File::Basename;
18 use File::Find;
19 use File::Copy;
20 use File::MMagic;
21 use File::Temp qw/ tempfile tempdir /;
22 use Cwd 'abs_path';
23 use threads;
24 use threads::shared;
25 use List::Util qw(sum);
26 use Thread::Queue;
27 use String::Approx 'adistr';
28 use POSIX qw(floor ceil);
29 String::Approx::cache_max(25000);
30 String::Approx::cache_purge(0.25);
31 use constant false => 0;
32 use constant true => 1;
33 use Inline CPP => Config => AUTO_INCLUDE => '#include "utils.cpp"';
34 use Inline CPP => <<'END_CPP';
35
36
37
38
39 int levenshtein(char* seq1, char* seq2)
40 {
41 // Returns edit distance of 2 sequences
42 string sam1(seq1);
43 string sam2(seq2);
44 return levenshteinDistance(sam1, sam2);
45 }
46
47 double align2(char* seq1, char* seq2, SV* res1, SV* res2, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool noFrontGapPenalty, bool noEndGapPenalty, bool debug )
48 {
49 // Returns Sequence alignment of 2 sequences
50 // res1, res2 contain the alignment
51 //seq1: reference sequence
52 //seq2: amplicon sequence
53 //matchScore: reward for matching a nucleotide
54 //gapPenOpen: penalty for opening a gap
55 //gapPenext: penalty for extending a gap
56 //noFrontGapPenalty: No penalty will be impose for gaps at the beggining of the alignment
57 //noEndGapPenalty: No penalty will be impose for gaps at the end of the alignment
58 //debug: prints scores, alignments, etc...
59
60 string sam1(seq1);
61 string sam2(seq2);
62 double ret = nw_align2(sam1, sam2, matchScore, missmatch_penalty, gapPenOpen, gapPenExt, noFrontGapPenalty, noEndGapPenalty, debug);
63 //Setting the return alignments
64 sv_setpvn(res1, (char*)sam1.c_str(), sam1.length());
65 sv_setpvn(res2, (char*)sam2.c_str(), sam2.length());
66 return ret;
67 }
68
69 void adjustSequence(int refSize, string& seq, char dir)
70 {
71
72 int adjust = refSize - seq.length();
73 if(dir == 'r')
74 for(int i=0; i<adjust; ++i) seq = '-' + seq;
75 else
76 for(int i=0; i<adjust; ++i) seq += '-';
77 }
78
79 double align3(char* ref, char* right,char* left , SV* res1, SV* res2, SV* res3, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool leftAndRightGapPen, bool debug )
80 {
81 // Returns Sequence alignment of 3 sequences
82 // res1, res2, res3 contain the alignment
83 //ref: reference sequence
84 //left: left amplicon sequence
85 //right: right amplicon sequence
86 //matchScore: reward for matching a nucleotide
87 //gapPenOpen: penalty for opening a gap
88 //gapPenext: penalty for extending a gap
89 //leftAndRightGapPen: Smart way to penalise according to the amplicon
90 //debug: prints scores, alignments, etc...
91
92
93 string leftSeq(left);
94 string rightSeq(right);
95
96 bool noFrontGapPenalty = false;
97 bool noEndGapPenalty = false;
98
99 //Align right with left reads
100 if(leftAndRightGapPen)
101 {
102 noFrontGapPenalty = true;
103 noEndGapPenalty = true;
104 }
105
106 //first alignment the 2 reads
107 double ret = nw_align2(rightSeq, leftSeq, matchScore, missmatch_penalty*8, gapPenOpen*5, gapPenExt, noFrontGapPenalty, noEndGapPenalty, debug);
108
109 //Setting the return alignments
110 vector <string> msa;
111 vector <string> partial;
112 vector <string> seq;
113
114 partial.push_back(rightSeq);
115 partial.push_back(leftSeq);
116 seq.push_back(ref);
117
118 msa = nw_alignAlignments(partial, seq , matchScore, missmatch_penalty, gapPenOpen, gapPenExt, false, false, debug);
119 sv_setpvn(res1, (char*)msa[2].c_str(), msa[2].length());
120 sv_setpvn(res2, (char*)msa[1].c_str(), msa[1].length());
121 sv_setpvn(res3, (char*)msa[0].c_str(), msa[0].length());
122 return ret;
123 }
124
125 END_CPP
126
127 $| = 1;
128
129 # Grab and set all options
130 my $version = "0.5";
131 my %OPTIONS = (p => 1);
132 getopts('a:dl:L:o:p:rsv', \%OPTIONS);
133
134 die qq(version $version
135 Usage: alignCustomAmplicon.pl [OPTIONS] <ref> <fastq[.gz] 1> <fastq[.gz] 2> <primers>
136
137 OPTIONS: -a STR print the alignment for any amplicons covering a particular region [chr:loc || chr:start-stop]
138 -d print debug info
139 -l STR filename of the log file [null]
140 -L STR append to an existing log file [null]
141 -o STR filename for bam file
142 -p INT number of threads [$OPTIONS{p}]
143 -r do not create stats report
144 -s sort the bam file
145 -v verbose progress
146 ) if(@ARGV != 4);
147
148 my $Script = 'alignCustomAmplicon';
149 my $ref = shift @ARGV;
150 my $fastq1 = shift @ARGV;
151 my $fastq2 = shift @ARGV;
152 my $primers = shift @ARGV;
153 my $now;
154
155 #create log file
156 if(defined $OPTIONS{l}) {
157 my $logfile = $OPTIONS{l};
158 open (FH, ">$logfile") or die 'Couldn\'t create log file!\n';
159 close (FH);
160 # Open the log file and redirect output to it
161 open (STDERR, ">>$logfile");
162 open (STDOUT, ">>$logfile");
163 my $now = localtime time;
164 print "Log File Created $now\n\n";
165 } elsif(defined $OPTIONS{L}) {
166 #append to a log file
167 my $logfile = $OPTIONS{L};
168 # Open the log file and redirect output to it
169 open (STDERR, ">>$logfile") or die 'Couldn\'t create log file!\n';
170 open (STDOUT, ">>$logfile") or die 'Couldn\'t create log file!\n';
171 my $now = localtime time;
172 print "Appending To Log File $now\n\n";
173 }
174
175 # print version of this script
176 print "Using $Script version $version \n\n";
177
178 if(defined $OPTIONS{d}) {
179 print "Setting threads to 1 as using debug\n";
180 $OPTIONS{p} = 1;
181 }
182
183 # grab sample name, set output file name and open some files
184 my ($name, $path, $suffix) = fileparse($fastq1, ('.fastq', '.fastq.gz'));
185 my $sampleName :shared = $name;
186 $sampleName =~ s/^(.*?_.*?)_.*/$1/;
187 open(PRIM, $primers) or die "can't open $primers: $!\n";
188
189 # set samfile name
190 my $bamfile_out = "${sampleName}_aligned.bam";
191 if(defined $OPTIONS{o}) {
192 $bamfile_out = $OPTIONS{o};
193 }
194 my $bam_to_sort = $bamfile_out;
195 if(defined $OPTIONS{s}) {
196 $bam_to_sort .= '.tmp';
197 }
198
199 my $stats = $bamfile_out;
200 if(!defined $OPTIONS{r}) {
201 $stats =~ s/\..*?$//;
202 open(STATS, ">${stats}_stats.csv");
203 }
204
205 # read in primer lengths
206 my %refFrags;
207 $refFrags{min} = 5000000;
208 while(my $line = <PRIM>) {
209 chomp $line;
210 my @line_sp = split("\t", $line);
211 $refFrags{primers}{$line_sp[0]}{U}{F}{length} = $line_sp[1];
212 $refFrags{primers}{$line_sp[0]}{D}{F}{length} = $line_sp[2];
213 $refFrags{primers}{$line_sp[0]}{U}{R}{length} = $line_sp[2];
214 $refFrags{primers}{$line_sp[0]}{D}{R}{length} = $line_sp[1];
215 $refFrags{name}{$line_sp[0]} = $line_sp[3];
216 $refFrags{min} = min([$refFrags{min}, $line_sp[1], $line_sp[2]]);
217 }
218 close(PRIM);
219 my @OrderedContigs :shared = sort sortChrom keys %{ $refFrags{primers} };
220
221 my %regions;
222 if(defined $OPTIONS{a}) {
223 my ($chr, $start, $end) = split_region($OPTIONS{a});
224 foreach my $reg (@OrderedContigs) {
225 my ($chr_reg, $start_reg, $end_reg) = split_region($reg);
226 $regions{$reg} = 1 if $chr eq $chr_reg && $start_reg <= $end && $end_reg >= $start;
227 }
228 }
229
230 # grab fasta file
231 my %refSeq;
232 read_fasta_file($ref, \%refSeq);
233
234 # calculate read length and read group
235 my $readGroup :shared;
236 my $readLength;
237 my $mm = new File::MMagic;
238 my $filetype = $mm->checktype_filename(abs_path($fastq1));
239 if($filetype eq "application/x-gzip") {
240 my $sampleRead = `gunzip -c $fastq1 | head -n 2 | tail -n 1`;
241 chomp $sampleRead;
242 $readLength = length($sampleRead);
243 $readGroup = `gunzip -c $fastq1 | head -n 1`;
244 chomp $readGroup;
245 $readGroup =~ s/.*?:.*?:(.*?):.*/${1}_L001/;
246 } elsif($filetype eq "text/plain") {
247 my $sampleRead = `head -n 2 $fastq1 | tail -n 1`;
248 chomp $sampleRead;
249 $readLength = length($sampleRead);
250 $readGroup = `head $fastq1 -n 1`;
251 chomp $readGroup;
252 $readGroup =~ s/.*?:.*?:(.*?):.*/${1}_L001/;
253 } else {
254 die("$fastq1 is filetype $filetype and not text/plain or application/x-gzip\n");
255 }
256
257 # create hash to keep track of maxdiff and readlengths
258 my %maxdiff :shared;
259
260 # create hash to keep track of stats
261 my %stats :shared;
262 $stats{Amb} = 0;
263 $stats{PoorQualMates} = 0;
264 $stats{PoorMapQual} = 0;
265 $stats{PoorQual1} = 0;
266 $stats{PoorQual2} = 0;
267
268 # setup other shared variables and caches
269 my %cache :shared;
270 my $cacheHits :shared = 0;
271 my %cacheAlign :shared;
272 $cacheAlign{Conc} = &share( {} );
273 my $cacheAlignHits :shared = 0;
274 my %refLength :shared;
275
276 # extract primers and fragments on forward and reverse strand and initialize counts
277 foreach my $key (@OrderedContigs) {
278 push @{ $refFrags{seqU} }, substr($refSeq{$key}, 0, $refFrags{primers}{$key}{U}{F}{length});
279 push @{ $refFrags{seqD} }, substr($refSeq{$key}, -$refFrags{primers}{$key}{D}{F}{length});
280 push @{ $refFrags{seqU} }, revcompl(substr($refSeq{$key}, -$refFrags{primers}{$key}{U}{R}{length}));
281 push @{ $refFrags{seqD} }, revcompl(substr($refSeq{$key}, 0, $refFrags{primers}{$key}{D}{R}{length}));
282 push @{ $refFrags{lenU} }, ($refFrags{primers}{$key}{U}{F}{length}, $refFrags{primers}{$key}{U}{R}{length});
283 push @{ $refFrags{lenD} }, ($refFrags{primers}{$key}{D}{F}{length}, $refFrags{primers}{$key}{D}{R}{length});
284 push @{ $refFrags{reg} }, ($key, $key);
285 push @{ $refFrags{strand} }, ('F', 'R');
286
287 # calculate new contig name with primers removed
288 my ($chr, $start, $stop) = split(":|-", $key);
289 $start += $refFrags{primers}{$key}{U}{F}{length};
290 $stop -= $refFrags{primers}{$key}{D}{F}{length};
291 $refLength{$key} = &share( {} );
292 $refLength{$key}{newReg} = "$chr:$start-$stop";
293 $refLength{$key}{length} = $stop - $start + 1;
294
295 $cacheAlign{$key} = &share( {} );
296 $cacheAlign{$key}{R1} = &share( {} );
297 $cacheAlign{$key}{R2} = &share( {} );
298
299 $stats{$key} = &share( {} );
300 $stats{$key}{Reads} = 0;
301 $stats{$key}{Bases} = 0;
302 $stats{$key}{Quality} = 0;
303 $stats{$key}{Mapping} = 0;
304 $stats{$key}{Read1} = 0;
305 $stats{$key}{Read2} = 0;
306 $stats{$key}{ConcMapping} = 0;
307 $stats{$key}{ConcReads} = 0;
308 $stats{$key}{NumberForward} = 0;
309 $stats{$key}{NumberReverse} = 0;
310 $stats{$key}{ConcErrorsBases} = 0;
311 $stats{$key}{ConcCorrectBases} = 0;
312 $stats{$key}{MinLength} = 100000;
313 $stats{$key}{MaxLength} = 0;
314 $stats{$key}{TotalA} = 0;
315 $stats{$key}{TotalC} = 0;
316 $stats{$key}{TotalG} = 0;
317 $stats{$key}{TotalT} = 0;
318 $stats{$key}{TotalN} = 0;
319 $stats{$key}{PoorMapQual} = 0;
320 }
321
322 # start up worker threads
323 my $FindContig = Thread::Queue->new();
324 my $ToAlign = Thread::Queue->new();
325 my $Aligned = Thread::Queue->new();
326 my $Samfile = Thread::Queue->new();
327 my $WriteBam = Thread::Queue->new();
328 my @threadArray;
329 for(my $index = 1; $index <= $OPTIONS{p}; $index++) {
330 push @threadArray, threads->create(\&thread_find_contig, $FindContig, \%refFrags, \%refSeq, $ToAlign, $Aligned, $WriteBam);
331 push @threadArray, threads->create(\&thread_process_align, $Aligned, \%refFrags, \%refSeq, $Samfile, $WriteBam) if $index <= max([1, $OPTIONS{p}/2]);
332 push @threadArray, threads->create(\&samfile_line, $Samfile, $WriteBam, \%refSeq) if $index <= max([1, $OPTIONS{p}/2]);
333 }
334 push @threadArray, threads->create(\&nwAlign, $ToAlign, $Aligned, \%regions);
335 push @threadArray, threads->create(\&writebamfile, $bam_to_sort, $WriteBam);
336
337 # read in each sequence and its mate from the fastq file
338 my $read1;
339 my $read2;
340 my $seq1;
341 my $seq2;
342 my $qual1;
343 my $qual2;
344 my $desc;
345 my @strand;
346 my $cont = 1;
347
348 # open fastq files
349 my $gz1;
350 my $gz2;
351 my $f1gz = 0;
352 my $f2gz = 0;
353 if($filetype eq "application/x-gzip") {
354 open(FQ1, "gunzip -dc $fastq1 |") or die "Can't fork: $!\n";
355 } else {
356 open(FQ1, $fastq1) or die "$fastq1: $!";
357 }
358
359 $filetype = $mm->checktype_filename(abs_path($fastq2));;
360 if($filetype eq "application/x-gzip") {
361 open(FQ2, "gunzip -dc $fastq2 |") or die "Can't fork: $!\n";
362 } elsif($filetype eq "text/plain") {
363 open(FQ2, $fastq2) or die "$fastq2: $!";
364 } else {
365 die("$fastq2 is filetype $filetype and not text/plain or application/x-gzip\n");
366 }
367
368 # add reads to the queue
369 my $readNo = 0;
370 while ($cont) {
371 if($read1 = <FQ1>) { # read 1 and 2 read name
372 $seq1 = <FQ1>; # read 1 sequence
373 $desc = <FQ1>; # read 1 desc
374 $qual1 = <FQ1>; # read 1 quality
375 } else {
376 last;
377 }
378
379 if($read2 = <FQ2>) { # read 2 read name
380 $seq2 = <FQ2>; # read 2 sequence
381 $desc = <FQ2>; # read 2 sdesc
382 $qual2 = <FQ2>; # read 2 quality
383 } else {
384 last;
385 }
386
387 # read 1 the forward read
388 chomp $read1;
389 chomp $seq1;
390 chomp $qual1;
391 $read1 =~ s/^\@(.*)\s?.*/$1/;
392
393 # read 2 the reverse read
394 chomp $read2;
395 chomp $seq2;
396 $seq2 = revcompl($seq2);
397 chomp $qual2;
398 $qual2 = reverse($qual2);
399
400 $read2 =~ s/^\@(.*)\s?.*/$1/;
401 $readNo++;
402
403 # keep at most 20 reads in buffer
404 while(1) {
405 if(defined $OPTIONS{v}) {
406 my $percAmb = $stats{Amb} / $readNo * 100;
407 my $percPoor = $stats{PoorQualMates} / $readNo * 100;
408 my $percPoorMap = $stats{PoorMapQual} / $readNo * 100;
409 my $percPoor1 = $stats{PoorQual1} / $readNo * 100;
410 my $percPoor2 = $stats{PoorQual2} / $readNo * 100;
411 $percAmb = sprintf("%.2f", $percAmb);
412 $percPoor = sprintf("%.2f", $percPoor);
413 $percPoorMap = sprintf("%.2f", $percPoorMap);
414 $percPoor1 = sprintf("%.2f", $percPoor1);
415 $percPoor2 = sprintf("%.2f", $percPoor2);
416 print STDERR "\r$readNo processed - Amb $percAmb - PoorQual $percPoor - PoorQual1 $percPoor1 - PoorQual2 $percPoor2 - PoorMappingQual $percPoorMap - FindContig ".$FindContig->pending()." - ToAlign ".$ToAlign->pending()." - Aligned ".$Aligned->pending()." - Sam ".$Samfile->pending()." - Writebam ".$WriteBam->pending()." ";
417 }
418 last if $FindContig->pending() <= 100;
419 }
420 my @readArray = ($read1, $seq1, $qual1, $read2, $seq2, $qual2);
421 {
422 lock($FindContig); # Keep other threads from changing the queue's contents
423 $FindContig->enqueue(\@readArray);
424 }
425 }
426 close(FQ1);
427 close(FQ2);
428
429 # wait for all records to be processed and close down threads
430 $FindContig->enqueue( (undef) x $OPTIONS{p} );
431 while($FindContig->pending() > 0 ) {
432 next;
433 }
434 $ToAlign->enqueue( (undef) );
435 while($ToAlign->pending() > 0 ) {
436 next;
437 }
438 $Aligned->enqueue( (undef) x floor(max([1, $OPTIONS{p}/2])) );
439 while($Aligned->pending() > 0 ) {
440 next;
441 }
442 $Samfile->enqueue( (undef) x floor(max([1, $OPTIONS{p}/2])) );
443 while($Samfile->pending() > 0 ) {
444 next;
445 }
446 $WriteBam->enqueue( undef );
447
448 $_->join() for @threadArray;
449
450 # sort the bam file
451 if(defined $OPTIONS{s}) {
452 system("MergeSamFiles.sh MAX_RECORDS_IN_RAM=2000000 I=$bam_to_sort O=$bamfile_out VALIDATION_STRINGENCY=LENIENT SO=coordinate USE_THREADING=true CREATE_INDEX=true");
453 unlink $bam_to_sort;
454 }
455
456 if(!defined $OPTIONS{r}) {
457 # print out statistics
458 # setup variables
459 my $totalRegionLengths = 0;
460 my $totalReads = 0;
461 my $totalFwd = 0;
462 my $totalRev = 0;
463 my $totalMatch = 0;
464 my $totalMiss = 0;
465 my $totalBases = 0;
466 my $totalRead1 = 0;
467 my $totalRead2 = 0;
468 my $totalA = 0;
469 my $totalC = 0;
470 my $totalG = 0;
471 my $totalT = 0;
472 my $totalN = 0;
473 my $totalConc = 0;
474 my $totalQuality = 0;
475 my $AllmaxRead = 0;
476 my $AllminRead = 100000;
477 my $avequal = 'NA';
478 my $aveRead = 'NA';
479 my $perA = 'NA';
480 my $perC = 'NA';
481 my $perG = 'NA';
482 my $perT = 'NA';
483 my $perN = 'NA';
484 my $readsUsed = 0;
485 my @coverage;
486 my @off_coverage;
487
488 # print column headers
489 print STATS 'AmpliconName,Region,RegionLengthMinusPrimers,TotalReadsPoorMapping,TotalReadsUsed,Reads1Kept,Reads2Kept,ReadsOut,ReadsFwd,ReadsRev,ReadsOverlap,';
490 print STATS "Bases,AveQuality,MatchBases,ErrorBases,ErrorRate,MinReadLength,AveReadLength,MaxReadLength,PercA,PercC,PercG,PercT,PercN\n";
491
492 # for each region
493 foreach my $key (@OrderedContigs){
494 # if we have some reads calculate averages
495 if($stats{$key}{Reads} > 0) {
496 $avequal = $stats{$key}{Quality} / $stats{$key}{Bases};
497 $avequal = sprintf("%.4f", $avequal);
498 $aveRead = $stats{$key}{Bases} / $stats{$key}{Reads};
499 $aveRead = sprintf("%.4f", $aveRead);
500 $perA = $stats{$key}{TotalA} / $stats{$key}{Bases} * 100;
501 $perA = sprintf("%.2f", $perA);
502 $perC = $stats{$key}{TotalC} / $stats{$key}{Bases} * 100;
503 $perC = sprintf("%.2f", $perC);
504 $perG = $stats{$key}{TotalG} / $stats{$key}{Bases} * 100;
505 $perG = sprintf("%.2f", $perG);
506 $perT = $stats{$key}{TotalT} / $stats{$key}{Bases} * 100;
507 $perT = sprintf("%.2f", $perT);
508 $perN = $stats{$key}{TotalN} / $stats{$key}{Bases} * 100;
509 $perN = sprintf("%.2f", $perN);
510 $AllminRead = min([$AllminRead, $stats{$key}{MinLength}]);
511 $AllmaxRead = max([$AllmaxRead, $stats{$key}{MaxLength}]);
512 $readsUsed = $stats{$key}{Reads} + $stats{$key}{ConcReads};
513 } else {
514 $stats{$key}{MaxLength} = 'NA';
515 $stats{$key}{MinLength} = 'NA';
516 }
517
518 # if we have reads which overlapped calculate an error rate
519 my $errRate = 'NA';
520 if($stats{$key}{ConcReads} > 0) {
521 $errRate = $stats{$key}{ConcErrorsBases} / ($stats{$key}{ConcErrorsBases} + $stats{$key}{ConcCorrectBases});
522 }
523
524 # print the stats for this region
525 print STATS "$refFrags{name}{$key}," if defined $refFrags{name}{$key};
526 print STATS "Unknown," unless defined $refFrags{name}{$key};
527 print STATS "$refLength{$key}{newReg},$refLength{$key}{length},$stats{$key}{PoorMapQual},$readsUsed,$stats{$key}{Read1},$stats{$key}{Read2},$stats{$key}{Reads},$stats{$key}{NumberForward},";
528 print STATS "$stats{$key}{NumberReverse},$stats{$key}{ConcReads},$stats{$key}{Bases},$avequal,$stats{$key}{ConcCorrectBases},";
529 print STATS "$stats{$key}{ConcErrorsBases},$errRate,$stats{$key}{MinLength},$aveRead,$stats{$key}{MaxLength},$perA,$perC,$perG,$perT,$perN\n";
530
531 # add stats to grand totals
532 $totalRegionLengths += $refLength{$key}{length};
533 $totalQuality += $stats{$key}{Quality};
534 $totalReads += $stats{$key}{Reads};
535 $totalFwd += $stats{$key}{NumberForward};
536 $totalRev += $stats{$key}{NumberReverse};
537 $totalBases += $stats{$key}{Bases};
538 $totalRead1 += $stats{$key}{Read1};
539 $totalRead2 += $stats{$key}{Read2};
540 $totalA += $stats{$key}{TotalA};
541 $totalC += $stats{$key}{TotalC};
542 $totalT += $stats{$key}{TotalT};
543 $totalG += $stats{$key}{TotalG};
544 $totalN += $stats{$key}{TotalN};
545 $totalMatch += $stats{$key}{ConcCorrectBases};
546 $totalMiss += $stats{$key}{ConcErrorsBases};
547 $totalConc += $stats{$key}{ConcReads};
548
549 # reset ave scores
550 $avequal = 'NA';
551 $aveRead = 'NA';
552 $perA = 'NA';
553 $perC = 'NA';
554 $perG = 'NA';
555 $perT = 'NA';
556 $perN = 'NA';
557 $readsUsed = 0;
558
559 if(defined $refFrags{name}{$key} && grep /^Off_target/, $refFrags{name}{$key}) {
560 push @off_coverage, $stats{$key}{Reads};
561 } else {
562 push @coverage, $stats{$key}{Reads};
563 }
564 }
565
566 # if we have any reads calculate the averages
567 if($totalReads > 0) {
568 $avequal = $totalQuality / $totalBases;
569 $avequal = sprintf("%.4f", $avequal);
570 $aveRead = $totalBases / $totalReads;
571 $aveRead = sprintf("%.4f", $aveRead);
572 $perA = $totalA / $totalBases * 100;
573 $perA = sprintf("%.2f", $perA);
574 $perC = $totalC / $totalBases * 100;
575 $perC = sprintf("%.2f", $perC);
576 $perG = $totalG / $totalBases * 100;
577 $perG = sprintf("%.2f", $perG);
578 $perT = $totalT / $totalBases * 100;
579 $perT = sprintf("%.2f", $perT);
580 $perN = $totalN / $totalBases * 100;
581 $perN = sprintf("%.2f", $perN);
582 $readsUsed = $totalReads + $totalConc;
583 } else {
584 $AllmaxRead = 'NA';
585 $AllminRead = 'NA';
586 }
587
588 # if we had any reads which overlapped calculate error rate
589 my $errRate = 'NA';
590 if($totalConc > 0) {
591 $errRate = $totalMiss / ($totalMiss + $totalMatch);
592 }
593
594 # print out totals
595 my $pcAmb = sprintf("%.2f", $stats{Amb} / $readNo * 100);
596 my $pcPQ = sprintf("%.2f", $stats{PoorQualMates} / $readNo * 100);
597 my $TotalReads = $readNo * 2;
598 my $Unmapped = $TotalReads - $readsUsed;
599 my $pcUnmapped = sprintf("%.2f", $Unmapped/$TotalReads * 100);
600 my $avCov = sprintf("%.0f", sum(@coverage)/@coverage);
601 my $avOff = sprintf("%.2f", sum(@off_coverage)/@off_coverage) if @off_coverage;
602 my $greater = scalar (grep { $_ > 0.2*$avCov } @coverage);
603 my $pcgreater = sprintf("%.2f", $greater/@coverage * 100);
604
605 print STATS ",Total,$totalRegionLengths,$stats{PoorMapQual},$readsUsed,$totalRead1,$totalRead2,$totalReads,$totalFwd,$totalRev,$totalConc,$totalBases,$avequal,$totalMatch,";
606 print STATS "$totalMiss,$errRate,$AllminRead,$aveRead,$AllmaxRead,$perA,$perC,$perG,$perT,$perN\n";
607 print STATS "\nNo mate pairs:,$readNo\n";
608 print STATS "No ambiguous mates:,$stats{Amb},$pcAmb\%\n";
609 print STATS "No poor quality mates:,$stats{PoorQualMates},$pcPQ\%\n\n";
610 print STATS "Total Number Of Reads:,$TotalReads\n";
611 print STATS "Unmapped Reads:,$Unmapped,$pcUnmapped\%\n\n";
612 print STATS "Min Mean Max ReadsOut:,".min([@coverage]).",$avCov,".max([@coverage])."\n";
613 print STATS "No. Amplicons > 0.2 x ReadsOut:,$greater,$pcgreater\%\n";
614 print STATS "Average ReadsOut Off Target Regions:,$avOff\n" if @off_coverage;
615 print STATS "\nAmplicons\nNo. Reads < 200, 200 <= No. Reads < 1000, No. Reads >= 1000\n";
616 $greater = scalar (grep { $_ < 200 } @coverage);
617 print STATS "$greater,";
618 $greater = scalar (grep { $_ >= 200 && $_ < 1000 } @coverage);
619 print STATS "$greater,";
620 $greater = scalar (grep { $_ >= 1000 } @coverage);
621 print STATS "$greater\n";
622
623 print "Cache hits align contig: $cacheHits\n";
624 print "Alignment cache hits: $cacheAlignHits\n";
625 close(STATS);
626 }
627
628 # close the log file
629 if(defined $OPTIONS{l} || defined $OPTIONS{L}) {
630 my $now = localtime time;
631 print "\nLog File Closed $now\n";
632 close(STDERR);
633 close(STDOUT);
634 }
635
636 # thread to find matching amplicon
637 sub thread_find_contig {
638 my $FindContig = shift;
639 my $refFrags = shift;
640 my $refSeq = shift;
641 my $ToAlign = shift;
642 my $Aligned = shift;
643 my $WriteBam = shift;
644 my $readArrayRef;
645 {
646 lock($FindContig); # Keep other threads from changing the queue's contents
647 $readArrayRef = $FindContig->dequeue();
648 }
649
650 while(defined $readArrayRef) {
651 my ($read1, $seq1, $qual1, $read2, $seq2, $qual2) = @{ $readArrayRef };
652 my $contig = '';
653 my $readRef = '';
654 my $strand = '';
655
656 # trim the reads
657 my ($trimmed1, $qtrim1) = trim($seq1, $qual1, 30, 1);
658 my ($trimmed2, $qtrim2) = trim($seq2, $qual2, 30, 2);
659
660 # if we have already mapped the reads use the results in the cache
661 if(defined $cacheAlign{Conc}{"$trimmed1.$trimmed2"}) {
662 my @array;
663 { lock(%cacheAlign); @array = @{ $cacheAlign{Conc}{"$trimmed1.$trimmed2"} } };
664 $array[0] = $read1;
665 $array[3] = $qtrim1;
666 $array[6] = $qtrim2;
667 { lock($cacheAlignHits); $cacheAlignHits++ };
668
669 # add aligned reads to queue
670 {
671 lock($Aligned); # Keep other threads from changing the queue's contents
672 $Aligned->enqueue(\@array);
673 }
674 } else {
675 # Use approx match to find most similar amplicon primer to reads
676 my @index1;
677 my @index2;
678 my $index_read1 = -2;
679 my $index_read2 = -2;
680 my $index_found = 0;
681
682 if(length($trimmed1) > length($seq1)/2 || length($trimmed2) > length($seq2)/2) {
683 # we have at least 1 good read
684 # find closest matching amplicon for read 1
685 my $string = substr($seq1, 0, $$refFrags{min});
686 if(defined $cache{"$string.1"}) {
687 $cacheHits++;
688 { lock(%cache); @index1 = @{ $cache{"$string.1"} } };
689 } else {
690 @index1 = amatch($string, \@{ $$refFrags{seqU} });
691 {
692 lock(%cache);
693 $cache{"$string.1"} = &share( [] );
694 @{ $cache{"$string.1"} } = @index1;
695 }
696 }
697
698 # find closest matching amplicon for read 2
699 $string = substr($seq2, -$$refFrags{min});
700 if(defined $cache{"$string.2"}) {
701 $cacheHits++;
702 { lock(%cache); @index2 = @{ $cache{"$string.2"} } };
703 } else {
704 @index2 = amatch($string, \@{ $$refFrags{seqD} });
705 {
706 lock(%cache);
707 $cache{"$string.2"} = &share( [] );
708 @{ $cache{"$string.2"} } = @index2;
709 }
710 }
711
712 if($#index1 > 0 || $#index2 > 0) {
713 # if one read maps to multiple amplicons and the other read matches to just one of them use that
714 my %in_array2 = map { $_ => 1 } @index2;
715 my @common = grep { $in_array2{$_} } @index1;
716 if($#common == 0) {
717 @index1 = @common;
718 @index2 = @common;
719 } elsif($#common > 0) {
720 if($#index1 > 0) {
721 # matched to multiple primers so try and use whole read
722 my @refs;
723 foreach(@index1) {
724 if($$refFrags{strand}[$_] eq "F") {
725 push @refs, $$refSeq{${ $$refFrags{reg} }[$_]};
726 } else {
727 push @refs, revcompl($$refSeq{${ $$refFrags{reg} }[$_]});
728 }
729 }
730 my @index1b = amatch($seq1, \@refs);
731 if($#index1b >= 0) {
732 @index1 = @index1[@index1b];
733 }
734 }
735 if($#index2 > 0) {
736 # matched to multiple primers so try and use whole read
737 my @refs;
738 foreach(@index2) {
739 if($$refFrags{strand}[$_] eq "F") {
740 push @refs, $$refSeq{${ $$refFrags{reg} }[$_]};
741 } else {
742 push @refs, revcompl($$refSeq{${ $$refFrags{reg} }[$_]});
743 }
744 }
745 my @index2b = amatch($seq2, \@refs);
746 if($#index2b >= 0) {
747 @index2 = @index2[@index2b];
748 }
749 }
750
751 # if one read still maps to multiple amplicons and the other read matches to just one of them use that
752 %in_array2 = map { $_ => 1 } @index2;
753 @common = grep { $in_array2{$_} } @index1;
754 if($#common == 0) {
755 @index1 = @common;
756 @index2 = @common;
757 }
758 }
759 }
760
761 # if read 1 and read 2 match to the same amplicon assign it
762 if($#index1 == 0 && @index1 ~~ @index2 ) {
763 $index_read1 = $index1[0];
764 $index_read2 = $index2[0];
765 } else {
766 $index_read1 = -1;
767 $index_read2 = -1;
768 }
769
770 # if any read after trimming is less than half the original length discard
771 if(length($trimmed1) < length($seq1)/2) {
772 $index_read1 = -2;
773 }
774
775 if(length($trimmed2) < length($seq2)/2) {
776 $index_read2 = -2;
777 }
778 }
779
780 if($index_read1 > -1 || $index_read2 > -1) {
781 my $conc = 0;
782 my %align;
783
784 # at least 1 read matched to an amplicon
785 if($index_read1 > -1) {
786 $contig = $$refFrags{reg}[$index_read1];
787 $readRef = $$refSeq{$contig};
788 $strand = $$refFrags{strand}[$index_read1];
789 } elsif($index_read2 > -1) {
790 $contig = $$refFrags{reg}[$index_read2];
791 $readRef = $$refSeq{$contig};
792 $strand = $$refFrags{strand}[$index_read2];
793 }
794
795 if($index_read1 == $index_read2 && length($readRef) < length($trimmed1) + length($trimmed2)) {
796 # reads must overlap so can use concensus
797 $conc = 1;
798 } else {
799 my $substr1;
800 my $substr2;
801 for(my $length = 5; $length <= min([length($trimmed1), length($trimmed2)]); $length++) {
802 $substr1 = substr($trimmed1, -$length);
803 $substr2 = substr($trimmed2, 0, $length);
804 if($substr1 eq $substr2) {
805 $conc = 1;
806 last;
807 } else {
808 # calculate max diff allowed
809 if(!defined $maxdiff{$length}) {
810 { lock(%maxdiff); $maxdiff{$length} = maxdiff($length, 0.02, 0.04); }
811 }
812
813 my $diff = levenshtein($substr1, $substr2 );
814 if($diff < $maxdiff{$length}) {
815 $conc = 1;
816 last;
817 }
818 }
819 }
820 }
821
822 if($conc) {
823 # remove any trailing Ns
824 $trimmed1 = $seq1;
825 $trimmed2 = $seq2;
826 $trimmed1 =~ s/N*$//;
827 $trimmed2 =~ s/^N*//;
828 $qtrim1 = $qual1;
829 $qtrim2 = $qual2;
830 $qtrim1 = substr($qtrim1, 0, length($trimmed1));
831 $qtrim2 = substr($qtrim2 , -length($trimmed2));
832 }
833
834 if($strand eq "R") {
835 my $trimmed1_tmp = revcompl($trimmed1);
836 my $qtrim1_tmp = reverse($qtrim1);
837 $trimmed1 = revcompl($trimmed2);
838 $qtrim1 = reverse($qtrim2);
839 $trimmed2 = $trimmed1_tmp;
840 $qtrim2 = $qtrim1_tmp;
841 my $index_read_tmp = $index_read1;
842 $index_read1 = $index_read2;
843 $index_read2 = $index_read_tmp;
844 }
845
846 if($conc) {
847 my $align;
848 my $search1 = $trimmed1;
849 $search1 =~ s/N/./g;
850 my $search2 = $trimmed2;
851 $search2 =~ s/N/./g;
852 if($$refSeq{$contig} =~ /$search1/ && $$refSeq{$contig} =~ /$search2/) {
853 # the read matches exactly to the reference
854 $align{Ref1} = $$refSeq{$contig};
855 $align{Read1} = $$refSeq{$contig};
856 $align{Read1} =~ s/^(.*?)$search1(.*)$/${1}X$2/;
857 $align{Read1} =~ s/[^X]/-/g;
858 $align{Read1} = modify($align{Read1}, sub { $_[0] =~ s/X+/$trimmed1/ });
859 $align{Read2} = $$refSeq{$contig};
860 $align{Read2} =~ s/^(.*)$search2(.*?)$/${1}XX$2/;
861 $align{Read2} =~ s/[^X]/-/g;
862 $align{Read2} = modify($align{Read2}, sub { $_[0] =~ s/X+/$trimmed2/ });
863
864 # add aligned reads to queue
865 my @array;
866 {
867 lock($Aligned); # Keep other threads from changing the queue's contents
868 @array = ($read1, $align{Ref1}, $align{Read1}, $qtrim1, $align{Ref1}, $align{Read2}, $qtrim2, $contig, $strand, $conc);
869 $Aligned->enqueue(\@array);
870 }
871
872 # add alignment to cache
873 {
874 lock(%cacheAlign);
875 $cacheAlign{Conc}{$name} = &share( [] );
876 @{ $cacheAlign{Conc}{$name} } = @array;
877 }
878 } elsif(defined $cacheAlign{Conc}{"$trimmed1.$trimmed2"}) {
879 my @array;
880 { lock(%cacheAlign); @array = @{ $cacheAlign{Conc}{"$trimmed1.$trimmed2"} } };
881 $array[0] = $read1;
882 $array[3] = $qtrim1;
883 $array[6] = $qtrim2;
884 { lock($cacheAlignHits); $cacheAlignHits++ };
885
886 # add aligned reads to queue
887 {
888 lock($Aligned); # Keep other threads from changing the queue's contents
889 $Aligned->enqueue(\@array);
890 }
891 } else {
892
893 # align reads using NW2
894 {
895 lock($ToAlign); # Keep other threads from changing the queue's contents
896 my @array = ($$refSeq{$contig}, $trimmed1, $trimmed2, $read1, $qtrim1, $qtrim2, $contig, $strand, $conc);
897 #my @array = ($fastaIn, $read1, $qtrim1, $qtrim2, $contig, $strand, $conc, "$trimmed1.$trimmed2");
898 $ToAlign->enqueue(\@array);
899 }
900 }
901 }
902
903 if(!$conc && $index_read1 > -1) {
904 my $search1 = $trimmed1;
905 $search1 =~ s/N/./g;
906 if($$refSeq{$contig} =~ /$search1/) {
907 # the read matches exactly to the reference
908 $align{Ref1} = $$refSeq{$contig};
909 $align{Read1} = $$refSeq{$contig};
910 $align{Read1} =~ s/^(.*?)$search1(.*)$/${1}X$2/;
911 $align{Read1} =~ s/[^X]/-/g;
912 $align{Read1} = modify($align{Read1}, sub { $_[0] =~ s/X/$trimmed1/ });
913
914 # add aligned reads to queue
915 {
916 lock($Aligned); # Keep other threads from changing the queue's contents
917 my @array = ($read1, $align{Ref1}, $align{Read1}, $qtrim1, undef, undef, undef, $contig, $strand, $conc);
918 $Aligned->enqueue(\@array);
919 }
920 } elsif(defined $cacheAlign{$contig}{R1}{$trimmed1}) {
921 my @array;
922 { lock(%cacheAlign); @array = @{ $cacheAlign{$contig}{R1}{$trimmed1} } };
923 $array[0] = $read1;
924 $array[3] = $qtrim1;
925 { lock($cacheAlignHits); $cacheAlignHits++ };
926
927 # add aligned reads to queue
928 {
929 lock($Aligned); # Keep other threads from changing the queue's contents
930 $Aligned->enqueue(\@array);
931 }
932 } else {
933 # align read using NW2
934 {
935 lock($ToAlign); # Keep other threads from changing the queue's contents
936 my @array = ($$refSeq{$contig}, $trimmed1, undef, $read1, $qtrim1, undef, $contig, $strand, $conc);
937 #my @array = ($fastaIn, $read1, $qtrim1, undef, $contig, $strand, $conc, $trimmed1);
938 $ToAlign->enqueue(\@array);
939 }
940 }
941 }
942
943 if(!$conc && $index_read2 > -1) {
944 my $search2 = $trimmed2;
945 $search2 =~ s/N/./g;
946 if($$refSeq{$contig} =~ /$search2/) {
947 # the read matches exactly to the reference
948 $align{Ref2} = $$refSeq{$contig};
949 $align{Read2} = $$refSeq{$contig};
950 $align{Read2} =~ s/^(.*)$search2(.*?)$/${1}X$2/;
951 $align{Read2} =~ s/[^X]/-/g;
952 $align{Read2} = modify($align{Read2}, sub { $_[0] =~ s/X/$trimmed2/ });
953
954 # add aligned reads to queue
955 {
956 lock($Aligned); # Keep other threads from changing the queue's contents
957 my @array = ($read2, undef, undef, undef, $align{Ref2}, $align{Read2}, $qtrim2, $contig, $strand, $conc);
958 $Aligned->enqueue(\@array);
959 }
960 } elsif(defined $cacheAlign{$contig}{R2}{$trimmed2}) {
961 my @array;
962 { lock(%cacheAlign); @array = @{ $cacheAlign{$contig}{R2}{$trimmed2} } };
963 $array[0] = $read2;
964 $array[6] = $qtrim2;
965 { lock($cacheAlignHits); $cacheAlignHits++ };
966
967 # add aligned reads to queue
968 {
969 lock($Aligned); # Keep other threads from changing the queue's contents
970 $Aligned->enqueue(\@array);
971 }
972 } else {
973 # align read using NW2
974 {
975 lock($ToAlign); # Keep other threads from changing the queue's contents
976 my @array = ($$refSeq{$contig}, undef, $trimmed2, $read2, undef, $qtrim2, $contig, $strand, $conc);
977 #my @array = ($fastaIn, $read2, undef, $qtrim2, $contig, $strand, $conc, $trimmed2);
978 $ToAlign->enqueue(\@array);
979 }
980 }
981 }
982 }
983
984 if($index_read1 < 0) {
985 # discard read 1
986 # send to writebamfile thread
987 {
988 lock($WriteBam); # Keep other threads from changing the queue's contents
989 $WriteBam->enqueue("$read1\t4\t*\t0\t0\t*\t*\t0\t0\t$seq1\t$qual1\tRG:Z:$readGroup\n");
990 }
991 { lock(%stats); $stats{PoorQual1}++ };
992 }
993
994 if($index_read2 < 0) {
995 # discard read 2
996 # send to writebamfile thread
997 {
998 lock($WriteBam); # Keep other threads from changing the queue's contents
999 $WriteBam->enqueue("$read2\t4\t*\t0\t0\t*\t*\t0\t0\t$seq2\t$qual2\tRG:Z:$readGroup\n");
1000 }
1001 { lock(%stats); $stats{PoorQual2}++ };
1002 }
1003
1004 if($index_read1 == -1 && $index_read2 == -1) {
1005 { lock(%stats); $stats{Amb}++ };
1006 } elsif($index_read1 == -2 && $index_read2 == -2) {
1007 { lock(%stats); $stats{PoorQualMates}++ };
1008 }
1009 }
1010
1011 # grab the next reads in queue
1012 {
1013 lock($FindContig); # Keep other threads from changing the queue's contents
1014 $readArrayRef = $FindContig->dequeue();
1015 }
1016 }
1017 print "Closing Find Amplicon Thread\n";
1018 return;
1019 }
1020
1021 sub writebamfile {
1022 my $bamfile = shift;
1023 my $WriteBam = shift;
1024
1025 # write samfile header
1026 open(PIPE_TO_SAMTOOLS, "| samtools view -Sb -o $bamfile - 2>&1") or die "Can't fork: $!\n";
1027 foreach my $key (@OrderedContigs) {
1028 print PIPE_TO_SAMTOOLS "\@SQ\tSN:$refLength{$key}{newReg}\tLN:$refLength{$key}{length}\n";
1029 }
1030 print PIPE_TO_SAMTOOLS "\@RG\tID:$readGroup\tPL:Illumina\tSM:$sampleName\n";
1031
1032 my $read;
1033 {
1034 lock($WriteBam); # Keep other threads from changing the queue's contents
1035 $read = $WriteBam->dequeue();
1036 }
1037 while(defined $read) {
1038
1039 # write read to bamfile
1040 print PIPE_TO_SAMTOOLS $read;
1041
1042 # grab next item in queue
1043 {
1044 lock($WriteBam); # Keep other threads from changing the queue's contents
1045 $read = $WriteBam->dequeue();
1046 }
1047 }
1048 print "Closing Bam Writer\n";
1049 close PIPE_TO_SAMTOOLS or die "pipe could not be closed";
1050 return;
1051 }
1052
1053 sub thread_process_align {
1054 my $Aligned = shift;
1055 my $refFrags = shift;
1056 my $refSeq = shift;
1057 my $Samfile = shift;
1058 my $WriteBam = shift;
1059
1060 my $readArrayRef;
1061 {
1062 lock($Aligned); # Keep other threads from changing the queue's contents
1063 $readArrayRef = $Aligned->dequeue();
1064 }
1065 while(defined $readArrayRef) {
1066 # grab the alignment
1067 my ($read, $ref1, $read1, $qtrim1, $ref2, $read2, $qtrim2, $contig, $strand, $conc) = @{ $readArrayRef };
1068 # grab primer lengths
1069 my $PU = $$refFrags{primers}{$contig}{U}{F}{length};
1070 my $PD = $$refFrags{primers}{$contig}{D}{F}{length};
1071 my %readStats;
1072 my $orig_read1 = $read1;
1073 my $orig_read2 = $read2;
1074 my $orig_qtrim1 = $qtrim1;
1075 my $orig_qtrim2 = $qtrim2;
1076 $orig_read1 =~ s/-//g if defined $orig_read1;
1077 $orig_read2 =~ s/-//g if defined $orig_read2;
1078
1079 if(defined $OPTIONS{d}) {
1080 # debug info
1081 print "Alignment With Primers\n";
1082 print "$ref1\n$read1\n" if defined $read1;
1083 print "$ref2\n$read2\n" if defined $read2;
1084 print "\nFinal Alignment:\n";
1085 }
1086
1087 # calculate where primers begin and end
1088 if(defined $read1) {
1089 if($strand eq 'F') {
1090 $readStats{$contig}{Read1} = 1;
1091 } else {
1092 $readStats{$contig}{Read2} = 1;
1093 }
1094
1095 # determine the primer locations
1096 my $offset = 0;
1097 for(my $occurance = 0; $occurance < $PU; $occurance++) {
1098 substr($ref1, $offset) =~ /^.*?([^-])/;
1099 $offset += $-[1] + 1;
1100 }
1101 my $length = 0;
1102 for(my $occurance = 0; $occurance < $PD; $occurance++) {
1103 substr($ref1, 0, length($ref1) - $length) =~ /.*([^-]).*?$/;
1104 $length = length($ref1) - $-[1];
1105 }
1106 $length = length($ref1) - $length - $offset;
1107
1108 # remove primers and adjust quality scores
1109 $ref1 = substr($ref1, $offset, $length);
1110 my $qualOffset = ( substr($read1, 0, $offset) =~ tr/ACTGN// );
1111 $read1 = substr($read1, $offset, $length);
1112 $length = ( $read1 =~ tr/ACTGN// );
1113 $qtrim1 = substr($qtrim1, $qualOffset, $length);
1114 }
1115
1116 if(defined $read2) {
1117 if($strand eq 'F') {
1118 $readStats{$contig}{Read2} = 1;
1119 } else {
1120 $readStats{$contig}{Read1} = 1;
1121 }
1122
1123 # determine the primer locations
1124 my $offset = 0;
1125 for(my $occurance = 0; $occurance < $PU; $occurance++) {
1126 substr($ref2, $offset) =~ /^.*?([^-])/;
1127 $offset += $-[1] + 1;
1128 }
1129 my $length = 0;
1130 for(my $occurance = 0; $occurance < $PD; $occurance++) {
1131 substr($ref2, 0, length($ref2) - $length) =~ /.*([^-]).*?$/;
1132 $length = length($ref2) - $-[1];
1133 }
1134 $length = length($ref2) - $length - $offset;
1135
1136 # remove primers and adjust quality scores
1137 $ref2 = substr($ref2, $offset, $length);
1138 my $qualOffset = ( substr($read2, 0, $offset) =~ tr/ACTGN// );
1139 $read2 = substr($read2, $offset, $length);
1140 $length = ( $read2 =~ tr/ACTGN// );
1141 $qtrim2 = substr($qtrim2, $qualOffset, $length);
1142 }
1143
1144 # generate a concensus sequence if we have both read1 and read2 and they overlap
1145 if($conc) {
1146 # the following are 0 based indexes
1147 my $R1qual = 0; # index for sanger qual read 1
1148 my $R2qual = 0; # index for sanger qual read 2
1149 my $aindex = 0; # index for alignment
1150 my $q1;
1151 my $q2;
1152 my $concensus;
1153 my $concqual;
1154
1155 $readStats{$contig}{ConcReads} = 1;
1156 $readStats{$contig}{ConcCorrectBases} = 0;
1157 $readStats{$contig}{ConcErrorsBases} = 0;
1158 while($aindex < length($ref1)) {
1159 # base1 is current base from read 1
1160 # base2 is current base from read 2
1161 my $base1 = substr($read1, $aindex, 1);
1162 my $base2 = substr($read2, $aindex, 1);
1163
1164 # grab the next quality score
1165 if($R1qual < length($qtrim1)) {
1166 $q1 = substr($qtrim1, $R1qual, 1);
1167 }
1168 if($R2qual < length($qtrim2)) {
1169 $q2 = substr($qtrim2, $R2qual, 1);
1170 }
1171
1172 # if we have a base in either
1173 if(($base1 ne '-' && $base1 ne 'N') || ($base2 ne '-' && $base2 ne 'N')) {
1174 if(($base2 eq "-" || $base2 eq 'N') && $base1 ne "-" && $base1 ne 'N') {
1175 # we are only in read 1 or read 2 is a N
1176 $concensus .= $base1;
1177 $concqual .= $q1;
1178 $R1qual++;
1179 if($base2 eq 'N') {
1180 $R2qual++;
1181 }
1182 } elsif(($base1 eq "-" || $base1 eq 'N') && $base2 ne "-" && $base2 ne 'N') {
1183 # we are only in read 2 or read 1 is a N
1184 $concensus .= $base2;
1185 $concqual .= $q2;
1186 $R2qual++;
1187 if($base1 eq 'N') {
1188 $R1qual++;
1189 }
1190 } else {
1191 # we are in the overlap region
1192 my $maxqual = sangerMax($q1, $q2);
1193 if($base1 eq $base2) {
1194 # both reads agree
1195 $readStats{$contig}{ConcCorrectBases}++;
1196 $concensus .= $base1;
1197 $concqual .= $maxqual;
1198 } elsif($q1 eq $q2) {
1199 # reads have different bases but the same score
1200 $readStats{$contig}{ConcErrorsBases}++;
1201 $concensus .= 'N';
1202 $concqual .= '#';
1203 } elsif($q1 eq $maxqual) {
1204 # reads have different bases but read 1 has the best score
1205 $concensus .= $base1;
1206 $concqual .= $maxqual;
1207 $readStats{$contig}{ConcErrorsBases}++;
1208 } else {
1209 # reads have different bases but read 2 has the best score
1210 $concensus .= $base2;
1211 $concqual .= $maxqual;
1212 $readStats{$contig}{ConcErrorsBases}++;
1213 }
1214 $R1qual++;
1215 $R2qual++;
1216 }
1217 } elsif($base1 eq 'N' || $base2 eq 'N') {
1218 if($base2 eq '-') {
1219 # one is a deletion the other is an N
1220 $concensus .= $base1;
1221 $concqual .= $q1;
1222 $R1qual++;
1223 } elsif($base1 eq '-') {
1224 # one is a deletion the other is an N
1225 $concensus .= $base2;
1226 $concqual .= $q2;
1227 $R2qual++;
1228 } elsif($base1 eq 'N' && $base2 eq 'N') {
1229 # both reads are an N
1230 $concensus .= 'N';
1231 $concqual .= sangerMax($q1, $q2);
1232 $R1qual++;
1233 $R2qual++;
1234 }
1235 } else {
1236 $concensus .= '-';
1237 }
1238 $aindex++;
1239 }
1240 $read1 = $concensus;
1241 $qtrim1 = $concqual;
1242 $read2 = undef;
1243 $qtrim2 = undef;
1244 }
1245
1246 if(defined $read1) {
1247 # remove any trailing - and send to samfile_line
1248 $read1 =~ s/^-*?([^-].*)/$1/;
1249 $ref1 = substr($ref1, -length($read1));
1250 $read1 =~ s/(.*[^-])-*?$/$1/;
1251 $ref1 = substr($ref1, 0, length($read1));
1252
1253 my $tmp = $ref1;
1254 $tmp =~ s/-//g;
1255 my $rstart = index($$refSeq{$contig}, $tmp) - $PU + 1;
1256
1257 # generate sam file line
1258 {
1259 lock($Samfile); # Keep other threads from changing the queue's contents
1260 my @array = ($read, $qtrim1, $rstart, $read1, $ref1, $contig, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, \%readStats);
1261 $Samfile->enqueue(\@array);
1262 }
1263 }
1264
1265 if(defined $read2) {
1266 # remove any trailing - and send to samfile_line
1267 $read2 =~ s/^-*?([^-].*)/$1/;
1268 $ref2 = substr($ref2, -length($read2));
1269 $read2 =~ s/(.*[^-])-*?$/$1/;
1270 $ref2 = substr($ref2, 0, length($read2));
1271
1272 my $tmp = $ref2;
1273 $tmp =~ s/-//g;
1274 my $rstart = rindex($$refSeq{$contig}, $tmp) - $PU + 1;
1275
1276 # generate sam file line
1277 {
1278 lock($Samfile); # Keep other threads from changing the queue's contents
1279 my @array = ($read, $qtrim2, $rstart, $read2, $ref2, $contig, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, \%readStats);
1280 $Samfile->enqueue(\@array);
1281 }
1282 }
1283
1284 if(defined $OPTIONS{d}) {
1285 # debug info
1286 print "$ref1\n$read1\n" if defined $read1;
1287 print "$ref2\n$read2\n" if defined $read2;
1288 print "$qtrim1\n" if defined $read1;
1289 print "$qtrim2\n" if defined $read2;
1290 print "\n\n";
1291 }
1292
1293 # grab next item in queue
1294 {
1295 lock($Aligned); # Keep other threads from changing the queue's contents
1296 $readArrayRef = $Aligned->dequeue();
1297 }
1298 }
1299 print "Closing Alignment Processor\n";
1300 return;
1301 }
1302
1303 # function to take reverse compliment
1304 sub revcompl {
1305 my $string = shift @_;
1306 my $revcomp = reverse($string);
1307 $revcomp =~ tr/ACGTacgtKMSWRYBDHVkmswrybdhv/TGCAtgcaMKSWYRVHDBmkswyrvhdb/;
1308 return $revcomp;
1309 }
1310
1311 # function to convert sanger fastq quality scores to phred scores
1312 sub sanger2phred {
1313 my $sang = shift @_;
1314 my @sanger = split("", $sang);
1315 my @phred;
1316 foreach my $q (@sanger) {
1317 my $Q = ord($q) - 33;
1318 push @phred, $Q;
1319 }
1320 return @phred;
1321 }
1322
1323 # function to return the largest sanger quality score
1324 sub sangerMax {
1325 my ($s1, $s2) = @_;
1326 my $Q1 = ord($s1) - 33;
1327 my $Q2 = ord($s2) - 33;
1328 if($Q1 > $Q2) {
1329 return $s1;
1330 } else {
1331 return $s2;
1332 }
1333 }
1334
1335 # function to read a fasta file as a hash
1336 sub read_fasta_file {
1337 my ($fasta_file, $seqs) = @_;
1338 my $header;
1339 my $seq = "";
1340 open (IN, $fasta_file) or die "can't open $fasta_file: $!\n";
1341 while (my $line = <IN>) {
1342 chomp $line;
1343 if (grep />/, $line) {
1344 if ($seq ne "") {
1345 $seqs->{$header} = $seq;
1346 }
1347 $header = $line;
1348
1349 $header =~ s/^>//; # remove ">"
1350 $header =~ s/\s+$//; # remove trailing whitespace
1351 $seq = ""; # clear out old sequence
1352 } else {
1353 $line =~ s/\s+//g; # remove whitespace
1354 $seq .= $line; # add sequence
1355 }
1356 }
1357 close IN; # finished with file
1358
1359 if ($seq ne "") { # handle last sequence
1360 $seqs->{$header} = $seq;
1361 }
1362
1363 return;
1364 }
1365
1366 # minimal element of a list
1367 #
1368 sub min
1369 {
1370 my @list = @{$_[0]};
1371 my $min = $list[0];
1372
1373 foreach my $i (@list)
1374 {
1375 $min = $i if ($i < $min);
1376 }
1377
1378 return $min;
1379 }
1380
1381 # maximal element of a list
1382 #
1383 sub max
1384 {
1385 my @list = @{$_[0]};
1386 my $max = $list[0];
1387
1388 foreach my $i (@list)
1389 {
1390 $max = $i if ($i > $max);
1391 }
1392
1393 return $max;
1394 }
1395
1396 # trim sequences using the BWA algorithm
1397 sub trim {
1398
1399 my ($s, $q, $threshold, $read) = @_; # read sequence and quality scores
1400 if($read == 2) {
1401 $q = reverse($q);
1402 $s = reverse($s);
1403 }
1404
1405 my @array = sanger2phred($q);
1406 my $length = scalar @array;
1407
1408 # only calculate if quality fails near end
1409 if( $array[$#array] >= $threshold ){
1410 if($read == 2) {
1411 $q = reverse($q);
1412 $s = reverse($s);
1413 }
1414 return ($s, $q);
1415 }
1416
1417 # run bwa equation
1418 my @arg;
1419 for( my $i = 0; $i < $length - 1; $i++ ){
1420 my $x = $i + 1;
1421 for( my $j = $x; $j < $length; $j++ ){
1422 $arg[$x] += $threshold - $array[$j];
1423 }
1424 }
1425
1426 # find number of 5' bases to retain
1427 my $index = 0;
1428 my $maxval = 0;
1429 for ( 1 .. $#arg ){
1430 if ( $maxval < $arg[$_] ){
1431 $index = $_;
1432 $maxval = $arg[$_];
1433 }
1434 }
1435
1436 # trim reads
1437 $s = substr($s,0,$index);
1438 $q = substr($q,0,$index);
1439 if($read == 2) {
1440 $s = reverse($s);
1441 $q = reverse($q);
1442 }
1443
1444 # return seq and quality
1445 return ($s, $q);
1446 }
1447
1448
1449 sub samfile_line {
1450 # grab the variables
1451 my ($Samfile, $WriteBam, $refSeq) = @_;
1452 my %SamStats;
1453 my %minLength;
1454 my %maxLength;
1455 my $readArrayRef;
1456 {
1457 lock($Samfile); # Keep other threads from changing the queue's contents
1458 $readArrayRef = $Samfile->dequeue();
1459 }
1460
1461 while(defined $readArrayRef) {
1462 my ($read, $qual, $rstart, $align, $ref, $reg, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, $readStats) = @{ $readArrayRef };
1463 my $seq = $align;
1464 $seq =~ s/-//g;
1465
1466 # calculate the edit difference - count indels as only 1 missmatch hence tranform the strings so that all indels have length 1
1467 my $sam;
1468 my $s1;
1469 my $s2;
1470 my @s1 = split("", $align);
1471 my @s2 = split("", $ref);
1472
1473 my $indel = 0;
1474 for(my $i = 0; $i <= $#s1; $i++) {
1475 if(($s1[$i] ne "-" && $s2[$i] ne "-") || !$indel) {
1476 $s1 .= $s1[$i];
1477 $s2 .= $s2[$i];
1478 $indel = 0;
1479 if($s1[$i] eq "-" || $s2[$i] eq "-") {
1480 $indel = 1;
1481 }
1482 }
1483 }
1484 my $mq = levenshtein($s1, $s2);
1485
1486 my $cl = length($seq);
1487 # calculate max diff allowed
1488 if(!defined $maxdiff{$cl}) {
1489 { lock(%maxdiff); $maxdiff{$cl} = maxdiff($cl, 0.02, 0.04); }
1490 }
1491
1492 if($mq > $maxdiff{$cl} || $cl == 0) {
1493 { lock(%stats); $stats{$reg}{PoorMapQual}++; $stats{PoorMapQual}++; }
1494 if(defined $orig_read1) {
1495 $sam = "$read\t4\t*\t0\t0\t*\t*\t0\t0\t$orig_read1\t$orig_qtrim1\tRG:Z:$readGroup\n";
1496 # send to writebamfile thread
1497 {
1498 lock($WriteBam); # Keep other threads from changing the queue's contents
1499 $WriteBam->enqueue($sam);
1500 }
1501 }
1502 if(defined $orig_read2) {
1503 $read =~ s/[1](:[A-Z]+:[0-9]+:[ACTGN]+)$/2$1/;
1504 $sam = "$read\t4\t*\t0\t0\t*\t*\t0\t0\t$orig_read2\t$orig_qtrim2\tRG:Z:$readGroup\n";
1505 # send to writebamfile thread
1506 {
1507 lock($WriteBam); # Keep other threads from changing the queue's contents
1508 $WriteBam->enqueue($sam);
1509 }
1510 }
1511 } else {
1512 $mq = 60;
1513
1514 # add any stats from earlier
1515 foreach my $key1 (keys %$readStats) {
1516 foreach my $key2 (keys % { $$readStats{$key1} }) {
1517 $SamStats{$key1}{$key2} += $$readStats{$key1}{$key2};
1518 }
1519 }
1520
1521 # keep track of quality scores
1522 $SamStats{$reg}{Bases} += $cl;
1523 $SamStats{$reg}{Quality} += $_ for sanger2phred($qual);
1524
1525 # keep track of read lengths
1526 if(defined $minLength{$reg}) {
1527 $minLength{$reg} = min([$cl, $minLength{$reg}]);
1528 $maxLength{$reg} = max([$cl, $maxLength{$reg}]);
1529 } else {
1530 $minLength{$reg} = $cl;
1531 $maxLength{$reg} = $cl;
1532 }
1533
1534 # keep track of strand and bases seen
1535 if($strand eq 'F') {
1536 $SamStats{$reg}{NumberForward}++;
1537 } else {
1538 $SamStats{$reg}{NumberReverse}++;
1539 }
1540 $SamStats{$reg}{TotalA} += ($seq =~ tr/A//);
1541 $SamStats{$reg}{TotalC} += ($seq =~ tr/C//);
1542 $SamStats{$reg}{TotalG} += ($seq =~ tr/G//);
1543 $SamStats{$reg}{TotalT} += ($seq =~ tr/T//);
1544 $SamStats{$reg}{TotalN} += ($seq =~ tr/N//);
1545
1546 # keep count the number of reads for each contig
1547 $SamStats{$reg}{Reads}++;
1548
1549 # calculate strand info
1550 if($strand eq 'F') {
1551 $strand = 0;
1552 } else {
1553 $strand = 16;
1554 }
1555 # calculate cigar
1556 my $cigar = "";
1557 my $M = 0;
1558 my $D = 0;
1559 my $I = 0;
1560 my $Subs = 0;
1561 for(my $i = 0; $i < length($align); $i++) {
1562 if(substr($align, $i, 1) eq "-") {
1563 $D++;
1564 if($M) {
1565 $cigar .= "${M}M";
1566 $M = 0;
1567 } elsif($I) {
1568 $cigar .= "${I}I";
1569 $I = 0;
1570 }
1571 } elsif(substr($ref, $i, 1) eq "-") {
1572 $I++;
1573 if($M) {
1574 $cigar .= "${M}M";
1575 $M = 0;
1576 } elsif($D) {
1577 $cigar .= "${D}D";
1578 $D = 0;
1579 }
1580 } else {
1581 $M++;
1582 if($D) {
1583 $cigar .= "${D}D";
1584 $D = 0;
1585 } elsif($I) {
1586 $cigar .= "${I}I";
1587 $I = 0;
1588 }
1589 $Subs++ if substr($ref, $i, 1) ne substr($align, $i, 1);
1590 }
1591 }
1592 if($M) {
1593 $cigar .= "${M}M";
1594 } elsif($I) {
1595 $cigar .= "${I}I";
1596 } elsif($D) {
1597 $cigar .= "${D}D";
1598 }
1599
1600 # construct sam file line
1601 $sam = "$read\t$strand\t$refLength{$reg}{newReg}\t$rstart\t$mq\t$cigar\t*\t0\t0\t$seq\t$qual\tRG:Z:$readGroup\n";
1602
1603 # send to writebamfile thread
1604 {
1605 lock($WriteBam); # Keep other threads from changing the queue's contents
1606 $WriteBam->enqueue($sam);
1607 }
1608 }
1609
1610 #grab the next read
1611 {
1612 lock($Samfile); # Keep other threads from changing the queue's contents
1613 $readArrayRef = $Samfile->dequeue();
1614 }
1615 }
1616
1617 {
1618 # add stats from earlier
1619 lock(%stats);
1620 foreach my $key1 (keys %SamStats) {
1621 $stats{$key1}{MinLength} = min([$minLength{$key1}, $stats{$key1}{MinLength}]);
1622 $stats{$key1}{MaxLength} = max([$maxLength{$key1}, $stats{$key1}{MaxLength}]);
1623 foreach my $key2 (keys % { $SamStats{$key1} }) {
1624 $stats{$key1}{$key2} += $SamStats{$key1}{$key2};
1625 }
1626 }
1627
1628 }
1629 print "Closing Sam File Line Generator\n";
1630
1631 return;
1632 }
1633
1634 # perform the alignment
1635 sub nwAlign {
1636 my ($ToAlign, $Aligned, $regions) = @_;
1637 my $readArrayRef;
1638 {
1639 lock($ToAlign); # Keep other threads from changing the queue's contents
1640 $readArrayRef = $ToAlign->dequeue();
1641 }
1642
1643 while(defined $readArrayRef) {
1644 # we have picked a contig
1645 # check strand and take rev comp of ref if necessary
1646 my ($ref, $left, $right, $read, $qtrim1, $qtrim2, $contig, $strand, $conc) = @{ $readArrayRef };
1647 my %align;
1648 my $name;
1649 #! Parameters
1650 #! ClustalW Parameters
1651 my $matchScore = 1.9;
1652 my $missmatch_penalty = -0.4;
1653 my $gapPenOpen = -10.0;
1654 my $gapPenExt = -0.4;
1655 my $leftAndRightGapPen = true;
1656 my $debug = false;
1657 my $cons = "";
1658 my $align1= "";
1659 my $align2= "";
1660 my $align3= "";
1661 my $score = 0.0;
1662 my $singleAlign = false; #aligning 2 or 3 sequences
1663 if($conc)
1664 {
1665 #got to concatenate seqs and align
1666 $score = align3($ref, $right, $left, $align1, $align2, $align3, $matchScore, $missmatch_penalty, $gapPenOpen, $gapPenExt, $leftAndRightGapPen, $debug);
1667 $name = "$left.$right";
1668 print "$contig\n$align1\n$align2\n$align3\n\n" if defined $$regions{$contig};
1669 }
1670 elsif(!defined $right)
1671 {
1672 #align with left
1673 $score = align2($ref, $left, $align1, $align2, $matchScore, $missmatch_penalty, $gapPenOpen, $gapPenExt, false, true, $debug);
1674 $singleAlign = true;
1675 $name = $left;
1676 print "$contig\n$align1\n$align2\n\n" if defined $$regions{$contig};
1677 }
1678 elsif(!defined $left)
1679 {
1680 #align with right
1681 $score = align2($ref, $right, $align1, $align2, $matchScore, $missmatch_penalty, $gapPenOpen ,$gapPenExt ,true, false, $debug);
1682 $singleAlign = true;
1683 $name = $right;
1684 print "$contig\n$align1\n$align2\n\n" if defined $$regions{$contig};
1685 }
1686
1687 # add aligned reads to queue
1688 # cache results
1689 my $type;
1690 {
1691 lock($Aligned); # Keep other threads from changing the queue's contents
1692
1693 my @array;
1694 if($singleAlign && !defined $right)
1695 {
1696 @array = ($read, $align1, $align2, $qtrim1, undef, undef, undef, $contig, $strand, $conc);
1697 {
1698 lock(%cacheAlign);
1699 $cacheAlign{$contig}{R1}{$name} = &share( [] );
1700 @{ $cacheAlign{$contig}{R1}{$name} } = @array;
1701 }
1702 }
1703 elsif($singleAlign && !defined $left )
1704 {
1705 @array = ($read, undef, undef, undef, $align1, $align2, $qtrim2, $contig, $strand, $conc);
1706 {
1707 lock(%cacheAlign);
1708 $cacheAlign{$contig}{R2}{$name} = &share( [] );
1709 @{ $cacheAlign{$contig}{R2}{$name} } = @array;
1710 }
1711 }
1712 else
1713 {
1714 @array = ($read, $align1, $align2, $qtrim1, $align1, $align3, $qtrim2, $contig, $strand, $conc);
1715 {
1716 lock(%cacheAlign);
1717 $cacheAlign{Conc}{$name} = &share( [] );
1718 @{ $cacheAlign{Conc}{$name} } = @array;
1719 }
1720 }
1721 $Aligned->enqueue(\@array);
1722 }
1723
1724 #grab the next read
1725 {
1726 lock($ToAlign); # Keep other threads from changing the queue's contents
1727 $readArrayRef = $ToAlign->dequeue();
1728 }
1729 }
1730 # clean up
1731 print "Closing NW2 Aligner\n";
1732 return;
1733 }
1734
1735 # function to calculat the minimum edit distance threshold (taken from bwa)
1736 sub maxdiff {
1737 my ($l, $err, $thres) = @_;
1738 my $lambda = exp(-$l*$err);
1739 my $y = 1.0;
1740 my $x = 1;
1741 my $sum = $lambda;
1742 for(my $k = 1; $k < 1000; $k++) {
1743 $y *= $l * $err;
1744 $x *= $k;
1745 $sum += $lambda * $y / $x;
1746 if(1.0 - $sum < $thres) {
1747 return $k;
1748 }
1749 }
1750 return 2;
1751 }
1752
1753 # using approx matching return the positions in $list of those most closely matching $string
1754 sub amatch {
1755 my ($string, $list) = @_;
1756 my @d = adistr($string, @$list);
1757 my @d_index = sort { abs($d[$a]) <=> abs($d[$b]) } 0..$#$list;
1758
1759 return if $d[$d_index[0]] > 0.25;
1760
1761 my @pos = ($d_index[0]);
1762 my $i = 1;
1763 while($i <= $#d_index && $d[$d_index[0]] == $d[$d_index[$i]]) {
1764 push @pos, $d_index[$i];
1765 $i++;
1766 }
1767 return @pos;
1768 }
1769
1770 # a sorting algorithm to sort the chromosomes
1771 sub sortChrom {
1772 my ($achr, $astart, $aend) = split(":|-", $a);
1773 my ($bchr, $bstart, $bend) = split(":|-", $b);
1774 $achr =~ s/X/23/;
1775 $achr =~ s/Y/24/;
1776 $achr =~ s/MT?/25/;
1777 $bchr =~ s/X/23/;
1778 $bchr =~ s/Y/24/;
1779 $bchr =~ s/MT?/25/;
1780 if($achr < $bchr) {
1781 return -1;
1782 } elsif($bchr < $achr) {
1783 return 1;
1784 } elsif($astart < $bstart) {
1785 return -1;
1786 } elsif($bstart < $astart) {
1787 return 1;
1788 } elsif($aend < $bend) {
1789 return -1;
1790 } elsif($bend < $aend) {
1791 return 1;
1792 } else {
1793 return 0;
1794 }
1795 }
1796
1797 # pass a code reference
1798 sub modify {
1799 my($text, $code) = @_;
1800 $code->($text);
1801 return $text;
1802 }
1803
1804 sub split_region {
1805 my ($chr, $start, $end) = split(":|-", shift);
1806 $end = $start unless defined $end;
1807 return($chr, $start, $end);
1808 }
1809
1810 sub error_rate {
1811 my ($x, $y) = @_;
1812 my @x = split("", $x);
1813 my @y = split("", $y);
1814 my $s = 0;
1815 my $l = 0;
1816 for(my $i = 0; $i <= $#x; $i++) {
1817 if($x[$i] ne "-" && $y[$i] ne "-") {
1818 $s++ if $x[$i] ne $y[$i];
1819 $l++;
1820 }
1821 }
1822 return($s / $l);
1823 }
1824
1825