0
|
1 #!/usr/bin/perl
|
|
2
|
|
3 # script to convert paired end reads to a concensus sequence
|
|
4
|
|
5 # Created by Jason Ellul, Feb 2012
|
|
6 # Modified by Franco Caramia and Jason Ellul, July 2012 and November 2012
|
|
7 # 0.4 Fixed mapping of concensus region (large insertions and deletions now picked up correctly)
|
|
8 # Added missalignment penalty
|
|
9 # Read 1 and 2 are now aligned first then they are aligned to the reference
|
|
10 # 0.5 Added ability to name amplicons by includinfo the info in the primers file
|
|
11 # Added the ability to output the alignments in a particular region using the new flag -a
|
|
12 # Added some extra stats to the output
|
|
13
|
|
14 use strict;
|
|
15 use warnings;
|
|
16 use Getopt::Std;
|
|
17 use File::Basename;
|
|
18 use File::Find;
|
|
19 use File::Copy;
|
|
20 use File::MMagic;
|
|
21 use File::Temp qw/ tempfile tempdir /;
|
|
22 use Cwd 'abs_path';
|
|
23 use threads;
|
|
24 use threads::shared;
|
|
25 use List::Util qw(sum);
|
|
26 use Thread::Queue;
|
|
27 use String::Approx 'adistr';
|
|
28 use POSIX qw(floor ceil);
|
|
29 String::Approx::cache_max(25000);
|
|
30 String::Approx::cache_purge(0.25);
|
|
31 use constant false => 0;
|
|
32 use constant true => 1;
|
|
33 use Inline CPP => Config => AUTO_INCLUDE => '#include "utils.cpp"';
|
|
34 use Inline CPP => <<'END_CPP';
|
|
35
|
|
36
|
|
37
|
|
38
|
|
39 int levenshtein(char* seq1, char* seq2)
|
|
40 {
|
|
41 // Returns edit distance of 2 sequences
|
|
42 string sam1(seq1);
|
|
43 string sam2(seq2);
|
|
44 return levenshteinDistance(sam1, sam2);
|
|
45 }
|
|
46
|
|
47 double align2(char* seq1, char* seq2, SV* res1, SV* res2, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool noFrontGapPenalty, bool noEndGapPenalty, bool debug )
|
|
48 {
|
|
49 // Returns Sequence alignment of 2 sequences
|
|
50 // res1, res2 contain the alignment
|
|
51 //seq1: reference sequence
|
|
52 //seq2: amplicon sequence
|
|
53 //matchScore: reward for matching a nucleotide
|
|
54 //gapPenOpen: penalty for opening a gap
|
|
55 //gapPenext: penalty for extending a gap
|
|
56 //noFrontGapPenalty: No penalty will be impose for gaps at the beggining of the alignment
|
|
57 //noEndGapPenalty: No penalty will be impose for gaps at the end of the alignment
|
|
58 //debug: prints scores, alignments, etc...
|
|
59
|
|
60 string sam1(seq1);
|
|
61 string sam2(seq2);
|
|
62 double ret = nw_align2(sam1, sam2, matchScore, missmatch_penalty, gapPenOpen, gapPenExt, noFrontGapPenalty, noEndGapPenalty, debug);
|
|
63 //Setting the return alignments
|
|
64 sv_setpvn(res1, (char*)sam1.c_str(), sam1.length());
|
|
65 sv_setpvn(res2, (char*)sam2.c_str(), sam2.length());
|
|
66 return ret;
|
|
67 }
|
|
68
|
|
69 void adjustSequence(int refSize, string& seq, char dir)
|
|
70 {
|
|
71
|
|
72 int adjust = refSize - seq.length();
|
|
73 if(dir == 'r')
|
|
74 for(int i=0; i<adjust; ++i) seq = '-' + seq;
|
|
75 else
|
|
76 for(int i=0; i<adjust; ++i) seq += '-';
|
|
77 }
|
|
78
|
|
79 double align3(char* ref, char* right,char* left , SV* res1, SV* res2, SV* res3, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool leftAndRightGapPen, bool debug )
|
|
80 {
|
|
81 // Returns Sequence alignment of 3 sequences
|
|
82 // res1, res2, res3 contain the alignment
|
|
83 //ref: reference sequence
|
|
84 //left: left amplicon sequence
|
|
85 //right: right amplicon sequence
|
|
86 //matchScore: reward for matching a nucleotide
|
|
87 //gapPenOpen: penalty for opening a gap
|
|
88 //gapPenext: penalty for extending a gap
|
|
89 //leftAndRightGapPen: Smart way to penalise according to the amplicon
|
|
90 //debug: prints scores, alignments, etc...
|
|
91
|
|
92
|
|
93 string leftSeq(left);
|
|
94 string rightSeq(right);
|
|
95
|
|
96 bool noFrontGapPenalty = false;
|
|
97 bool noEndGapPenalty = false;
|
|
98
|
|
99 //Align right with left reads
|
|
100 if(leftAndRightGapPen)
|
|
101 {
|
|
102 noFrontGapPenalty = true;
|
|
103 noEndGapPenalty = true;
|
|
104 }
|
|
105
|
|
106 //first alignment the 2 reads
|
|
107 double ret = nw_align2(rightSeq, leftSeq, matchScore, missmatch_penalty*8, gapPenOpen*5, gapPenExt, noFrontGapPenalty, noEndGapPenalty, debug);
|
|
108
|
|
109 //Setting the return alignments
|
|
110 vector <string> msa;
|
|
111 vector <string> partial;
|
|
112 vector <string> seq;
|
|
113
|
|
114 partial.push_back(rightSeq);
|
|
115 partial.push_back(leftSeq);
|
|
116 seq.push_back(ref);
|
|
117
|
|
118 msa = nw_alignAlignments(partial, seq , matchScore, missmatch_penalty, gapPenOpen, gapPenExt, false, false, debug);
|
|
119 sv_setpvn(res1, (char*)msa[2].c_str(), msa[2].length());
|
|
120 sv_setpvn(res2, (char*)msa[1].c_str(), msa[1].length());
|
|
121 sv_setpvn(res3, (char*)msa[0].c_str(), msa[0].length());
|
|
122 return ret;
|
|
123 }
|
|
124
|
|
125 END_CPP
|
|
126
|
|
127 $| = 1;
|
|
128
|
|
129 # Grab and set all options
|
|
130 my $version = "0.5";
|
|
131 my %OPTIONS = (p => 1);
|
|
132 getopts('a:dl:L:o:p:rsv', \%OPTIONS);
|
|
133
|
|
134 die qq(version $version
|
|
135 Usage: alignCustomAmplicon.pl [OPTIONS] <ref> <fastq[.gz] 1> <fastq[.gz] 2> <primers>
|
|
136
|
|
137 OPTIONS: -a STR print the alignment for any amplicons covering a particular region [chr:loc || chr:start-stop]
|
|
138 -d print debug info
|
|
139 -l STR filename of the log file [null]
|
|
140 -L STR append to an existing log file [null]
|
|
141 -o STR filename for bam file
|
|
142 -p INT number of threads [$OPTIONS{p}]
|
|
143 -r do not create stats report
|
|
144 -s sort the bam file
|
|
145 -v verbose progress
|
|
146 ) if(@ARGV != 4);
|
|
147
|
|
148 my $Script = 'alignCustomAmplicon';
|
|
149 my $ref = shift @ARGV;
|
|
150 my $fastq1 = shift @ARGV;
|
|
151 my $fastq2 = shift @ARGV;
|
|
152 my $primers = shift @ARGV;
|
|
153 my $now;
|
|
154
|
|
155 #create log file
|
|
156 if(defined $OPTIONS{l}) {
|
|
157 my $logfile = $OPTIONS{l};
|
|
158 open (FH, ">$logfile") or die 'Couldn\'t create log file!\n';
|
|
159 close (FH);
|
|
160 # Open the log file and redirect output to it
|
|
161 open (STDERR, ">>$logfile");
|
|
162 open (STDOUT, ">>$logfile");
|
|
163 my $now = localtime time;
|
|
164 print "Log File Created $now\n\n";
|
|
165 } elsif(defined $OPTIONS{L}) {
|
|
166 #append to a log file
|
|
167 my $logfile = $OPTIONS{L};
|
|
168 # Open the log file and redirect output to it
|
|
169 open (STDERR, ">>$logfile") or die 'Couldn\'t create log file!\n';
|
|
170 open (STDOUT, ">>$logfile") or die 'Couldn\'t create log file!\n';
|
|
171 my $now = localtime time;
|
|
172 print "Appending To Log File $now\n\n";
|
|
173 }
|
|
174
|
|
175 # print version of this script
|
|
176 print "Using $Script version $version \n\n";
|
|
177
|
|
178 if(defined $OPTIONS{d}) {
|
|
179 print "Setting threads to 1 as using debug\n";
|
|
180 $OPTIONS{p} = 1;
|
|
181 }
|
|
182
|
|
183 # grab sample name, set output file name and open some files
|
|
184 my ($name, $path, $suffix) = fileparse($fastq1, ('.fastq', '.fastq.gz'));
|
|
185 my $sampleName :shared = $name;
|
|
186 $sampleName =~ s/^(.*?_.*?)_.*/$1/;
|
|
187 open(PRIM, $primers) or die "can't open $primers: $!\n";
|
|
188
|
|
189 # set samfile name
|
|
190 my $bamfile_out = "${sampleName}_aligned.bam";
|
|
191 if(defined $OPTIONS{o}) {
|
|
192 $bamfile_out = $OPTIONS{o};
|
|
193 }
|
|
194 my $bam_to_sort = $bamfile_out;
|
|
195 if(defined $OPTIONS{s}) {
|
|
196 $bam_to_sort .= '.tmp';
|
|
197 }
|
|
198
|
|
199 my $stats = $bamfile_out;
|
|
200 if(!defined $OPTIONS{r}) {
|
|
201 $stats =~ s/\..*?$//;
|
|
202 open(STATS, ">${stats}_stats.csv");
|
|
203 }
|
|
204
|
|
205 # read in primer lengths
|
|
206 my %refFrags;
|
|
207 $refFrags{min} = 5000000;
|
|
208 while(my $line = <PRIM>) {
|
|
209 chomp $line;
|
|
210 my @line_sp = split("\t", $line);
|
|
211 $refFrags{primers}{$line_sp[0]}{U}{F}{length} = $line_sp[1];
|
|
212 $refFrags{primers}{$line_sp[0]}{D}{F}{length} = $line_sp[2];
|
|
213 $refFrags{primers}{$line_sp[0]}{U}{R}{length} = $line_sp[2];
|
|
214 $refFrags{primers}{$line_sp[0]}{D}{R}{length} = $line_sp[1];
|
|
215 $refFrags{name}{$line_sp[0]} = $line_sp[3];
|
|
216 $refFrags{min} = min([$refFrags{min}, $line_sp[1], $line_sp[2]]);
|
|
217 }
|
|
218 close(PRIM);
|
|
219 my @OrderedContigs :shared = sort sortChrom keys %{ $refFrags{primers} };
|
|
220
|
|
221 my %regions;
|
|
222 if(defined $OPTIONS{a}) {
|
|
223 my ($chr, $start, $end) = split_region($OPTIONS{a});
|
|
224 foreach my $reg (@OrderedContigs) {
|
|
225 my ($chr_reg, $start_reg, $end_reg) = split_region($reg);
|
|
226 $regions{$reg} = 1 if $chr eq $chr_reg && $start_reg <= $end && $end_reg >= $start;
|
|
227 }
|
|
228 }
|
|
229
|
|
230 # grab fasta file
|
|
231 my %refSeq;
|
|
232 read_fasta_file($ref, \%refSeq);
|
|
233
|
|
234 # calculate read length and read group
|
|
235 my $readGroup :shared;
|
|
236 my $readLength;
|
|
237 my $mm = new File::MMagic;
|
|
238 my $filetype = $mm->checktype_filename(abs_path($fastq1));
|
|
239 if($filetype eq "application/x-gzip") {
|
|
240 my $sampleRead = `gunzip -c $fastq1 | head -n 2 | tail -n 1`;
|
|
241 chomp $sampleRead;
|
|
242 $readLength = length($sampleRead);
|
|
243 $readGroup = `gunzip -c $fastq1 | head -n 1`;
|
|
244 chomp $readGroup;
|
|
245 $readGroup =~ s/.*?:.*?:(.*?):.*/${1}_L001/;
|
|
246 } elsif($filetype eq "text/plain") {
|
|
247 my $sampleRead = `head -n 2 $fastq1 | tail -n 1`;
|
|
248 chomp $sampleRead;
|
|
249 $readLength = length($sampleRead);
|
|
250 $readGroup = `head $fastq1 -n 1`;
|
|
251 chomp $readGroup;
|
|
252 $readGroup =~ s/.*?:.*?:(.*?):.*/${1}_L001/;
|
|
253 } else {
|
|
254 die("$fastq1 is filetype $filetype and not text/plain or application/x-gzip\n");
|
|
255 }
|
|
256
|
|
257 # create hash to keep track of maxdiff and readlengths
|
|
258 my %maxdiff :shared;
|
|
259
|
|
260 # create hash to keep track of stats
|
|
261 my %stats :shared;
|
|
262 $stats{Amb} = 0;
|
|
263 $stats{PoorQualMates} = 0;
|
|
264 $stats{PoorMapQual} = 0;
|
|
265 $stats{PoorQual1} = 0;
|
|
266 $stats{PoorQual2} = 0;
|
|
267
|
|
268 # setup other shared variables and caches
|
|
269 my %cache :shared;
|
|
270 my $cacheHits :shared = 0;
|
|
271 my %cacheAlign :shared;
|
|
272 $cacheAlign{Conc} = &share( {} );
|
|
273 my $cacheAlignHits :shared = 0;
|
|
274 my %refLength :shared;
|
|
275
|
|
276 # extract primers and fragments on forward and reverse strand and initialize counts
|
|
277 foreach my $key (@OrderedContigs) {
|
|
278 push @{ $refFrags{seqU} }, substr($refSeq{$key}, 0, $refFrags{primers}{$key}{U}{F}{length});
|
|
279 push @{ $refFrags{seqD} }, substr($refSeq{$key}, -$refFrags{primers}{$key}{D}{F}{length});
|
|
280 push @{ $refFrags{seqU} }, revcompl(substr($refSeq{$key}, -$refFrags{primers}{$key}{U}{R}{length}));
|
|
281 push @{ $refFrags{seqD} }, revcompl(substr($refSeq{$key}, 0, $refFrags{primers}{$key}{D}{R}{length}));
|
|
282 push @{ $refFrags{lenU} }, ($refFrags{primers}{$key}{U}{F}{length}, $refFrags{primers}{$key}{U}{R}{length});
|
|
283 push @{ $refFrags{lenD} }, ($refFrags{primers}{$key}{D}{F}{length}, $refFrags{primers}{$key}{D}{R}{length});
|
|
284 push @{ $refFrags{reg} }, ($key, $key);
|
|
285 push @{ $refFrags{strand} }, ('F', 'R');
|
|
286
|
|
287 # calculate new contig name with primers removed
|
|
288 my ($chr, $start, $stop) = split(":|-", $key);
|
|
289 $start += $refFrags{primers}{$key}{U}{F}{length};
|
|
290 $stop -= $refFrags{primers}{$key}{D}{F}{length};
|
|
291 $refLength{$key} = &share( {} );
|
|
292 $refLength{$key}{newReg} = "$chr:$start-$stop";
|
|
293 $refLength{$key}{length} = $stop - $start + 1;
|
|
294
|
|
295 $cacheAlign{$key} = &share( {} );
|
|
296 $cacheAlign{$key}{R1} = &share( {} );
|
|
297 $cacheAlign{$key}{R2} = &share( {} );
|
|
298
|
|
299 $stats{$key} = &share( {} );
|
|
300 $stats{$key}{Reads} = 0;
|
|
301 $stats{$key}{Bases} = 0;
|
|
302 $stats{$key}{Quality} = 0;
|
|
303 $stats{$key}{Mapping} = 0;
|
|
304 $stats{$key}{Read1} = 0;
|
|
305 $stats{$key}{Read2} = 0;
|
|
306 $stats{$key}{ConcMapping} = 0;
|
|
307 $stats{$key}{ConcReads} = 0;
|
|
308 $stats{$key}{NumberForward} = 0;
|
|
309 $stats{$key}{NumberReverse} = 0;
|
|
310 $stats{$key}{ConcErrorsBases} = 0;
|
|
311 $stats{$key}{ConcCorrectBases} = 0;
|
|
312 $stats{$key}{MinLength} = 100000;
|
|
313 $stats{$key}{MaxLength} = 0;
|
|
314 $stats{$key}{TotalA} = 0;
|
|
315 $stats{$key}{TotalC} = 0;
|
|
316 $stats{$key}{TotalG} = 0;
|
|
317 $stats{$key}{TotalT} = 0;
|
|
318 $stats{$key}{TotalN} = 0;
|
|
319 $stats{$key}{PoorMapQual} = 0;
|
|
320 }
|
|
321
|
|
322 # start up worker threads
|
|
323 my $FindContig = Thread::Queue->new();
|
|
324 my $ToAlign = Thread::Queue->new();
|
|
325 my $Aligned = Thread::Queue->new();
|
|
326 my $Samfile = Thread::Queue->new();
|
|
327 my $WriteBam = Thread::Queue->new();
|
|
328 my @threadArray;
|
|
329 for(my $index = 1; $index <= $OPTIONS{p}; $index++) {
|
|
330 push @threadArray, threads->create(\&thread_find_contig, $FindContig, \%refFrags, \%refSeq, $ToAlign, $Aligned, $WriteBam);
|
|
331 push @threadArray, threads->create(\&thread_process_align, $Aligned, \%refFrags, \%refSeq, $Samfile, $WriteBam) if $index <= max([1, $OPTIONS{p}/2]);
|
|
332 push @threadArray, threads->create(\&samfile_line, $Samfile, $WriteBam, \%refSeq) if $index <= max([1, $OPTIONS{p}/2]);
|
|
333 }
|
|
334 push @threadArray, threads->create(\&nwAlign, $ToAlign, $Aligned, \%regions);
|
|
335 push @threadArray, threads->create(\&writebamfile, $bam_to_sort, $WriteBam);
|
|
336
|
|
337 # read in each sequence and its mate from the fastq file
|
|
338 my $read1;
|
|
339 my $read2;
|
|
340 my $seq1;
|
|
341 my $seq2;
|
|
342 my $qual1;
|
|
343 my $qual2;
|
|
344 my $desc;
|
|
345 my @strand;
|
|
346 my $cont = 1;
|
|
347
|
|
348 # open fastq files
|
|
349 my $gz1;
|
|
350 my $gz2;
|
|
351 my $f1gz = 0;
|
|
352 my $f2gz = 0;
|
|
353 if($filetype eq "application/x-gzip") {
|
|
354 open(FQ1, "gunzip -dc $fastq1 |") or die "Can't fork: $!\n";
|
|
355 } else {
|
|
356 open(FQ1, $fastq1) or die "$fastq1: $!";
|
|
357 }
|
|
358
|
|
359 $filetype = $mm->checktype_filename(abs_path($fastq2));;
|
|
360 if($filetype eq "application/x-gzip") {
|
|
361 open(FQ2, "gunzip -dc $fastq2 |") or die "Can't fork: $!\n";
|
|
362 } elsif($filetype eq "text/plain") {
|
|
363 open(FQ2, $fastq2) or die "$fastq2: $!";
|
|
364 } else {
|
|
365 die("$fastq2 is filetype $filetype and not text/plain or application/x-gzip\n");
|
|
366 }
|
|
367
|
|
368 # add reads to the queue
|
|
369 my $readNo = 0;
|
|
370 while ($cont) {
|
|
371 if($read1 = <FQ1>) { # read 1 and 2 read name
|
|
372 $seq1 = <FQ1>; # read 1 sequence
|
|
373 $desc = <FQ1>; # read 1 desc
|
|
374 $qual1 = <FQ1>; # read 1 quality
|
|
375 } else {
|
|
376 last;
|
|
377 }
|
|
378
|
|
379 if($read2 = <FQ2>) { # read 2 read name
|
|
380 $seq2 = <FQ2>; # read 2 sequence
|
|
381 $desc = <FQ2>; # read 2 sdesc
|
|
382 $qual2 = <FQ2>; # read 2 quality
|
|
383 } else {
|
|
384 last;
|
|
385 }
|
|
386
|
|
387 # read 1 the forward read
|
|
388 chomp $read1;
|
|
389 chomp $seq1;
|
|
390 chomp $qual1;
|
|
391 $read1 =~ s/^\@(.*)\s?.*/$1/;
|
|
392
|
|
393 # read 2 the reverse read
|
|
394 chomp $read2;
|
|
395 chomp $seq2;
|
|
396 $seq2 = revcompl($seq2);
|
|
397 chomp $qual2;
|
|
398 $qual2 = reverse($qual2);
|
|
399
|
|
400 $read2 =~ s/^\@(.*)\s?.*/$1/;
|
|
401 $readNo++;
|
|
402
|
|
403 # keep at most 20 reads in buffer
|
|
404 while(1) {
|
|
405 if(defined $OPTIONS{v}) {
|
|
406 my $percAmb = $stats{Amb} / $readNo * 100;
|
|
407 my $percPoor = $stats{PoorQualMates} / $readNo * 100;
|
|
408 my $percPoorMap = $stats{PoorMapQual} / $readNo * 100;
|
|
409 my $percPoor1 = $stats{PoorQual1} / $readNo * 100;
|
|
410 my $percPoor2 = $stats{PoorQual2} / $readNo * 100;
|
|
411 $percAmb = sprintf("%.2f", $percAmb);
|
|
412 $percPoor = sprintf("%.2f", $percPoor);
|
|
413 $percPoorMap = sprintf("%.2f", $percPoorMap);
|
|
414 $percPoor1 = sprintf("%.2f", $percPoor1);
|
|
415 $percPoor2 = sprintf("%.2f", $percPoor2);
|
|
416 print STDERR "\r$readNo processed - Amb $percAmb - PoorQual $percPoor - PoorQual1 $percPoor1 - PoorQual2 $percPoor2 - PoorMappingQual $percPoorMap - FindContig ".$FindContig->pending()." - ToAlign ".$ToAlign->pending()." - Aligned ".$Aligned->pending()." - Sam ".$Samfile->pending()." - Writebam ".$WriteBam->pending()." ";
|
|
417 }
|
|
418 last if $FindContig->pending() <= 100;
|
|
419 }
|
|
420 my @readArray = ($read1, $seq1, $qual1, $read2, $seq2, $qual2);
|
|
421 {
|
|
422 lock($FindContig); # Keep other threads from changing the queue's contents
|
|
423 $FindContig->enqueue(\@readArray);
|
|
424 }
|
|
425 }
|
|
426 close(FQ1);
|
|
427 close(FQ2);
|
|
428
|
|
429 # wait for all records to be processed and close down threads
|
|
430 $FindContig->enqueue( (undef) x $OPTIONS{p} );
|
|
431 while($FindContig->pending() > 0 ) {
|
|
432 next;
|
|
433 }
|
|
434 $ToAlign->enqueue( (undef) );
|
|
435 while($ToAlign->pending() > 0 ) {
|
|
436 next;
|
|
437 }
|
|
438 $Aligned->enqueue( (undef) x floor(max([1, $OPTIONS{p}/2])) );
|
|
439 while($Aligned->pending() > 0 ) {
|
|
440 next;
|
|
441 }
|
|
442 $Samfile->enqueue( (undef) x floor(max([1, $OPTIONS{p}/2])) );
|
|
443 while($Samfile->pending() > 0 ) {
|
|
444 next;
|
|
445 }
|
|
446 $WriteBam->enqueue( undef );
|
|
447
|
|
448 $_->join() for @threadArray;
|
|
449
|
|
450 # sort the bam file
|
|
451 if(defined $OPTIONS{s}) {
|
|
452 system("MergeSamFiles.sh MAX_RECORDS_IN_RAM=2000000 I=$bam_to_sort O=$bamfile_out VALIDATION_STRINGENCY=LENIENT SO=coordinate USE_THREADING=true CREATE_INDEX=true");
|
|
453 unlink $bam_to_sort;
|
|
454 }
|
|
455
|
|
456 if(!defined $OPTIONS{r}) {
|
|
457 # print out statistics
|
|
458 # setup variables
|
|
459 my $totalRegionLengths = 0;
|
|
460 my $totalReads = 0;
|
|
461 my $totalFwd = 0;
|
|
462 my $totalRev = 0;
|
|
463 my $totalMatch = 0;
|
|
464 my $totalMiss = 0;
|
|
465 my $totalBases = 0;
|
|
466 my $totalRead1 = 0;
|
|
467 my $totalRead2 = 0;
|
|
468 my $totalA = 0;
|
|
469 my $totalC = 0;
|
|
470 my $totalG = 0;
|
|
471 my $totalT = 0;
|
|
472 my $totalN = 0;
|
|
473 my $totalConc = 0;
|
|
474 my $totalQuality = 0;
|
|
475 my $AllmaxRead = 0;
|
|
476 my $AllminRead = 100000;
|
|
477 my $avequal = 'NA';
|
|
478 my $aveRead = 'NA';
|
|
479 my $perA = 'NA';
|
|
480 my $perC = 'NA';
|
|
481 my $perG = 'NA';
|
|
482 my $perT = 'NA';
|
|
483 my $perN = 'NA';
|
|
484 my $readsUsed = 0;
|
|
485 my @coverage;
|
|
486 my @off_coverage;
|
|
487
|
|
488 # print column headers
|
|
489 print STATS 'AmpliconName,Region,RegionLengthMinusPrimers,TotalReadsPoorMapping,TotalReadsUsed,Reads1Kept,Reads2Kept,ReadsOut,ReadsFwd,ReadsRev,ReadsOverlap,';
|
|
490 print STATS "Bases,AveQuality,MatchBases,ErrorBases,ErrorRate,MinReadLength,AveReadLength,MaxReadLength,PercA,PercC,PercG,PercT,PercN\n";
|
|
491
|
|
492 # for each region
|
|
493 foreach my $key (@OrderedContigs){
|
|
494 # if we have some reads calculate averages
|
|
495 if($stats{$key}{Reads} > 0) {
|
|
496 $avequal = $stats{$key}{Quality} / $stats{$key}{Bases};
|
|
497 $avequal = sprintf("%.4f", $avequal);
|
|
498 $aveRead = $stats{$key}{Bases} / $stats{$key}{Reads};
|
|
499 $aveRead = sprintf("%.4f", $aveRead);
|
|
500 $perA = $stats{$key}{TotalA} / $stats{$key}{Bases} * 100;
|
|
501 $perA = sprintf("%.2f", $perA);
|
|
502 $perC = $stats{$key}{TotalC} / $stats{$key}{Bases} * 100;
|
|
503 $perC = sprintf("%.2f", $perC);
|
|
504 $perG = $stats{$key}{TotalG} / $stats{$key}{Bases} * 100;
|
|
505 $perG = sprintf("%.2f", $perG);
|
|
506 $perT = $stats{$key}{TotalT} / $stats{$key}{Bases} * 100;
|
|
507 $perT = sprintf("%.2f", $perT);
|
|
508 $perN = $stats{$key}{TotalN} / $stats{$key}{Bases} * 100;
|
|
509 $perN = sprintf("%.2f", $perN);
|
|
510 $AllminRead = min([$AllminRead, $stats{$key}{MinLength}]);
|
|
511 $AllmaxRead = max([$AllmaxRead, $stats{$key}{MaxLength}]);
|
|
512 $readsUsed = $stats{$key}{Reads} + $stats{$key}{ConcReads};
|
|
513 } else {
|
|
514 $stats{$key}{MaxLength} = 'NA';
|
|
515 $stats{$key}{MinLength} = 'NA';
|
|
516 }
|
|
517
|
|
518 # if we have reads which overlapped calculate an error rate
|
|
519 my $errRate = 'NA';
|
|
520 if($stats{$key}{ConcReads} > 0) {
|
|
521 $errRate = $stats{$key}{ConcErrorsBases} / ($stats{$key}{ConcErrorsBases} + $stats{$key}{ConcCorrectBases});
|
|
522 }
|
|
523
|
|
524 # print the stats for this region
|
|
525 print STATS "$refFrags{name}{$key}," if defined $refFrags{name}{$key};
|
|
526 print STATS "Unknown," unless defined $refFrags{name}{$key};
|
|
527 print STATS "$refLength{$key}{newReg},$refLength{$key}{length},$stats{$key}{PoorMapQual},$readsUsed,$stats{$key}{Read1},$stats{$key}{Read2},$stats{$key}{Reads},$stats{$key}{NumberForward},";
|
|
528 print STATS "$stats{$key}{NumberReverse},$stats{$key}{ConcReads},$stats{$key}{Bases},$avequal,$stats{$key}{ConcCorrectBases},";
|
|
529 print STATS "$stats{$key}{ConcErrorsBases},$errRate,$stats{$key}{MinLength},$aveRead,$stats{$key}{MaxLength},$perA,$perC,$perG,$perT,$perN\n";
|
|
530
|
|
531 # add stats to grand totals
|
|
532 $totalRegionLengths += $refLength{$key}{length};
|
|
533 $totalQuality += $stats{$key}{Quality};
|
|
534 $totalReads += $stats{$key}{Reads};
|
|
535 $totalFwd += $stats{$key}{NumberForward};
|
|
536 $totalRev += $stats{$key}{NumberReverse};
|
|
537 $totalBases += $stats{$key}{Bases};
|
|
538 $totalRead1 += $stats{$key}{Read1};
|
|
539 $totalRead2 += $stats{$key}{Read2};
|
|
540 $totalA += $stats{$key}{TotalA};
|
|
541 $totalC += $stats{$key}{TotalC};
|
|
542 $totalT += $stats{$key}{TotalT};
|
|
543 $totalG += $stats{$key}{TotalG};
|
|
544 $totalN += $stats{$key}{TotalN};
|
|
545 $totalMatch += $stats{$key}{ConcCorrectBases};
|
|
546 $totalMiss += $stats{$key}{ConcErrorsBases};
|
|
547 $totalConc += $stats{$key}{ConcReads};
|
|
548
|
|
549 # reset ave scores
|
|
550 $avequal = 'NA';
|
|
551 $aveRead = 'NA';
|
|
552 $perA = 'NA';
|
|
553 $perC = 'NA';
|
|
554 $perG = 'NA';
|
|
555 $perT = 'NA';
|
|
556 $perN = 'NA';
|
|
557 $readsUsed = 0;
|
|
558
|
|
559 if(defined $refFrags{name}{$key} && grep /^Off_target/, $refFrags{name}{$key}) {
|
|
560 push @off_coverage, $stats{$key}{Reads};
|
|
561 } else {
|
|
562 push @coverage, $stats{$key}{Reads};
|
|
563 }
|
|
564 }
|
|
565
|
|
566 # if we have any reads calculate the averages
|
|
567 if($totalReads > 0) {
|
|
568 $avequal = $totalQuality / $totalBases;
|
|
569 $avequal = sprintf("%.4f", $avequal);
|
|
570 $aveRead = $totalBases / $totalReads;
|
|
571 $aveRead = sprintf("%.4f", $aveRead);
|
|
572 $perA = $totalA / $totalBases * 100;
|
|
573 $perA = sprintf("%.2f", $perA);
|
|
574 $perC = $totalC / $totalBases * 100;
|
|
575 $perC = sprintf("%.2f", $perC);
|
|
576 $perG = $totalG / $totalBases * 100;
|
|
577 $perG = sprintf("%.2f", $perG);
|
|
578 $perT = $totalT / $totalBases * 100;
|
|
579 $perT = sprintf("%.2f", $perT);
|
|
580 $perN = $totalN / $totalBases * 100;
|
|
581 $perN = sprintf("%.2f", $perN);
|
|
582 $readsUsed = $totalReads + $totalConc;
|
|
583 } else {
|
|
584 $AllmaxRead = 'NA';
|
|
585 $AllminRead = 'NA';
|
|
586 }
|
|
587
|
|
588 # if we had any reads which overlapped calculate error rate
|
|
589 my $errRate = 'NA';
|
|
590 if($totalConc > 0) {
|
|
591 $errRate = $totalMiss / ($totalMiss + $totalMatch);
|
|
592 }
|
|
593
|
|
594 # print out totals
|
|
595 my $pcAmb = sprintf("%.2f", $stats{Amb} / $readNo * 100);
|
|
596 my $pcPQ = sprintf("%.2f", $stats{PoorQualMates} / $readNo * 100);
|
|
597 my $TotalReads = $readNo * 2;
|
|
598 my $Unmapped = $TotalReads - $readsUsed;
|
|
599 my $pcUnmapped = sprintf("%.2f", $Unmapped/$TotalReads * 100);
|
|
600 my $avCov = sprintf("%.0f", sum(@coverage)/@coverage);
|
|
601 my $avOff = sprintf("%.2f", sum(@off_coverage)/@off_coverage) if @off_coverage;
|
|
602 my $greater = scalar (grep { $_ > 0.2*$avCov } @coverage);
|
|
603 my $pcgreater = sprintf("%.2f", $greater/@coverage * 100);
|
|
604
|
|
605 print STATS ",Total,$totalRegionLengths,$stats{PoorMapQual},$readsUsed,$totalRead1,$totalRead2,$totalReads,$totalFwd,$totalRev,$totalConc,$totalBases,$avequal,$totalMatch,";
|
|
606 print STATS "$totalMiss,$errRate,$AllminRead,$aveRead,$AllmaxRead,$perA,$perC,$perG,$perT,$perN\n";
|
|
607 print STATS "\nNo mate pairs:,$readNo\n";
|
|
608 print STATS "No ambiguous mates:,$stats{Amb},$pcAmb\%\n";
|
|
609 print STATS "No poor quality mates:,$stats{PoorQualMates},$pcPQ\%\n\n";
|
|
610 print STATS "Total Number Of Reads:,$TotalReads\n";
|
|
611 print STATS "Unmapped Reads:,$Unmapped,$pcUnmapped\%\n\n";
|
|
612 print STATS "Min Mean Max ReadsOut:,".min([@coverage]).",$avCov,".max([@coverage])."\n";
|
|
613 print STATS "No. Amplicons > 0.2 x ReadsOut:,$greater,$pcgreater\%\n";
|
|
614 print STATS "Average ReadsOut Off Target Regions:,$avOff\n" if @off_coverage;
|
|
615 print STATS "\nAmplicons\nNo. Reads < 200, 200 <= No. Reads < 1000, No. Reads >= 1000\n";
|
|
616 $greater = scalar (grep { $_ < 200 } @coverage);
|
|
617 print STATS "$greater,";
|
|
618 $greater = scalar (grep { $_ >= 200 && $_ < 1000 } @coverage);
|
|
619 print STATS "$greater,";
|
|
620 $greater = scalar (grep { $_ >= 1000 } @coverage);
|
|
621 print STATS "$greater\n";
|
|
622
|
|
623 print "Cache hits align contig: $cacheHits\n";
|
|
624 print "Alignment cache hits: $cacheAlignHits\n";
|
|
625 close(STATS);
|
|
626 }
|
|
627
|
|
628 # close the log file
|
|
629 if(defined $OPTIONS{l} || defined $OPTIONS{L}) {
|
|
630 my $now = localtime time;
|
|
631 print "\nLog File Closed $now\n";
|
|
632 close(STDERR);
|
|
633 close(STDOUT);
|
|
634 }
|
|
635
|
|
636 # thread to find matching amplicon
|
|
637 sub thread_find_contig {
|
|
638 my $FindContig = shift;
|
|
639 my $refFrags = shift;
|
|
640 my $refSeq = shift;
|
|
641 my $ToAlign = shift;
|
|
642 my $Aligned = shift;
|
|
643 my $WriteBam = shift;
|
|
644 my $readArrayRef;
|
|
645 {
|
|
646 lock($FindContig); # Keep other threads from changing the queue's contents
|
|
647 $readArrayRef = $FindContig->dequeue();
|
|
648 }
|
|
649
|
|
650 while(defined $readArrayRef) {
|
|
651 my ($read1, $seq1, $qual1, $read2, $seq2, $qual2) = @{ $readArrayRef };
|
|
652 my $contig = '';
|
|
653 my $readRef = '';
|
|
654 my $strand = '';
|
|
655
|
|
656 # trim the reads
|
|
657 my ($trimmed1, $qtrim1) = trim($seq1, $qual1, 30, 1);
|
|
658 my ($trimmed2, $qtrim2) = trim($seq2, $qual2, 30, 2);
|
|
659
|
|
660 # if we have already mapped the reads use the results in the cache
|
|
661 if(defined $cacheAlign{Conc}{"$trimmed1.$trimmed2"}) {
|
|
662 my @array;
|
|
663 { lock(%cacheAlign); @array = @{ $cacheAlign{Conc}{"$trimmed1.$trimmed2"} } };
|
|
664 $array[0] = $read1;
|
|
665 $array[3] = $qtrim1;
|
|
666 $array[6] = $qtrim2;
|
|
667 { lock($cacheAlignHits); $cacheAlignHits++ };
|
|
668
|
|
669 # add aligned reads to queue
|
|
670 {
|
|
671 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
672 $Aligned->enqueue(\@array);
|
|
673 }
|
|
674 } else {
|
|
675 # Use approx match to find most similar amplicon primer to reads
|
|
676 my @index1;
|
|
677 my @index2;
|
|
678 my $index_read1 = -2;
|
|
679 my $index_read2 = -2;
|
|
680 my $index_found = 0;
|
|
681
|
|
682 if(length($trimmed1) > length($seq1)/2 || length($trimmed2) > length($seq2)/2) {
|
|
683 # we have at least 1 good read
|
|
684 # find closest matching amplicon for read 1
|
|
685 my $string = substr($seq1, 0, $$refFrags{min});
|
|
686 if(defined $cache{"$string.1"}) {
|
|
687 $cacheHits++;
|
|
688 { lock(%cache); @index1 = @{ $cache{"$string.1"} } };
|
|
689 } else {
|
|
690 @index1 = amatch($string, \@{ $$refFrags{seqU} });
|
|
691 {
|
|
692 lock(%cache);
|
|
693 $cache{"$string.1"} = &share( [] );
|
|
694 @{ $cache{"$string.1"} } = @index1;
|
|
695 }
|
|
696 }
|
|
697
|
|
698 # find closest matching amplicon for read 2
|
|
699 $string = substr($seq2, -$$refFrags{min});
|
|
700 if(defined $cache{"$string.2"}) {
|
|
701 $cacheHits++;
|
|
702 { lock(%cache); @index2 = @{ $cache{"$string.2"} } };
|
|
703 } else {
|
|
704 @index2 = amatch($string, \@{ $$refFrags{seqD} });
|
|
705 {
|
|
706 lock(%cache);
|
|
707 $cache{"$string.2"} = &share( [] );
|
|
708 @{ $cache{"$string.2"} } = @index2;
|
|
709 }
|
|
710 }
|
|
711
|
|
712 if($#index1 > 0 || $#index2 > 0) {
|
|
713 # if one read maps to multiple amplicons and the other read matches to just one of them use that
|
|
714 my %in_array2 = map { $_ => 1 } @index2;
|
|
715 my @common = grep { $in_array2{$_} } @index1;
|
|
716 if($#common == 0) {
|
|
717 @index1 = @common;
|
|
718 @index2 = @common;
|
|
719 } elsif($#common > 0) {
|
|
720 if($#index1 > 0) {
|
|
721 # matched to multiple primers so try and use whole read
|
|
722 my @refs;
|
|
723 foreach(@index1) {
|
|
724 if($$refFrags{strand}[$_] eq "F") {
|
|
725 push @refs, $$refSeq{${ $$refFrags{reg} }[$_]};
|
|
726 } else {
|
|
727 push @refs, revcompl($$refSeq{${ $$refFrags{reg} }[$_]});
|
|
728 }
|
|
729 }
|
|
730 my @index1b = amatch($seq1, \@refs);
|
|
731 if($#index1b >= 0) {
|
|
732 @index1 = @index1[@index1b];
|
|
733 }
|
|
734 }
|
|
735 if($#index2 > 0) {
|
|
736 # matched to multiple primers so try and use whole read
|
|
737 my @refs;
|
|
738 foreach(@index2) {
|
|
739 if($$refFrags{strand}[$_] eq "F") {
|
|
740 push @refs, $$refSeq{${ $$refFrags{reg} }[$_]};
|
|
741 } else {
|
|
742 push @refs, revcompl($$refSeq{${ $$refFrags{reg} }[$_]});
|
|
743 }
|
|
744 }
|
|
745 my @index2b = amatch($seq2, \@refs);
|
|
746 if($#index2b >= 0) {
|
|
747 @index2 = @index2[@index2b];
|
|
748 }
|
|
749 }
|
|
750
|
|
751 # if one read still maps to multiple amplicons and the other read matches to just one of them use that
|
|
752 %in_array2 = map { $_ => 1 } @index2;
|
|
753 @common = grep { $in_array2{$_} } @index1;
|
|
754 if($#common == 0) {
|
|
755 @index1 = @common;
|
|
756 @index2 = @common;
|
|
757 }
|
|
758 }
|
|
759 }
|
|
760
|
|
761 # if read 1 and read 2 match to the same amplicon assign it
|
|
762 if($#index1 == 0 && @index1 ~~ @index2 ) {
|
|
763 $index_read1 = $index1[0];
|
|
764 $index_read2 = $index2[0];
|
|
765 } else {
|
|
766 $index_read1 = -1;
|
|
767 $index_read2 = -1;
|
|
768 }
|
|
769
|
|
770 # if any read after trimming is less than half the original length discard
|
|
771 if(length($trimmed1) < length($seq1)/2) {
|
|
772 $index_read1 = -2;
|
|
773 }
|
|
774
|
|
775 if(length($trimmed2) < length($seq2)/2) {
|
|
776 $index_read2 = -2;
|
|
777 }
|
|
778 }
|
|
779
|
|
780 if($index_read1 > -1 || $index_read2 > -1) {
|
|
781 my $conc = 0;
|
|
782 my %align;
|
|
783
|
|
784 # at least 1 read matched to an amplicon
|
|
785 if($index_read1 > -1) {
|
|
786 $contig = $$refFrags{reg}[$index_read1];
|
|
787 $readRef = $$refSeq{$contig};
|
|
788 $strand = $$refFrags{strand}[$index_read1];
|
|
789 } elsif($index_read2 > -1) {
|
|
790 $contig = $$refFrags{reg}[$index_read2];
|
|
791 $readRef = $$refSeq{$contig};
|
|
792 $strand = $$refFrags{strand}[$index_read2];
|
|
793 }
|
|
794
|
|
795 if($index_read1 == $index_read2 && length($readRef) < length($trimmed1) + length($trimmed2)) {
|
|
796 # reads must overlap so can use concensus
|
|
797 $conc = 1;
|
|
798 } else {
|
|
799 my $substr1;
|
|
800 my $substr2;
|
|
801 for(my $length = 5; $length <= min([length($trimmed1), length($trimmed2)]); $length++) {
|
|
802 $substr1 = substr($trimmed1, -$length);
|
|
803 $substr2 = substr($trimmed2, 0, $length);
|
|
804 if($substr1 eq $substr2) {
|
|
805 $conc = 1;
|
|
806 last;
|
|
807 } else {
|
|
808 # calculate max diff allowed
|
|
809 if(!defined $maxdiff{$length}) {
|
|
810 { lock(%maxdiff); $maxdiff{$length} = maxdiff($length, 0.02, 0.04); }
|
|
811 }
|
|
812
|
|
813 my $diff = levenshtein($substr1, $substr2 );
|
|
814 if($diff < $maxdiff{$length}) {
|
|
815 $conc = 1;
|
|
816 last;
|
|
817 }
|
|
818 }
|
|
819 }
|
|
820 }
|
|
821
|
|
822 if($conc) {
|
|
823 # remove any trailing Ns
|
|
824 $trimmed1 = $seq1;
|
|
825 $trimmed2 = $seq2;
|
|
826 $trimmed1 =~ s/N*$//;
|
|
827 $trimmed2 =~ s/^N*//;
|
|
828 $qtrim1 = $qual1;
|
|
829 $qtrim2 = $qual2;
|
|
830 $qtrim1 = substr($qtrim1, 0, length($trimmed1));
|
|
831 $qtrim2 = substr($qtrim2 , -length($trimmed2));
|
|
832 }
|
|
833
|
|
834 if($strand eq "R") {
|
|
835 my $trimmed1_tmp = revcompl($trimmed1);
|
|
836 my $qtrim1_tmp = reverse($qtrim1);
|
|
837 $trimmed1 = revcompl($trimmed2);
|
|
838 $qtrim1 = reverse($qtrim2);
|
|
839 $trimmed2 = $trimmed1_tmp;
|
|
840 $qtrim2 = $qtrim1_tmp;
|
|
841 my $index_read_tmp = $index_read1;
|
|
842 $index_read1 = $index_read2;
|
|
843 $index_read2 = $index_read_tmp;
|
|
844 }
|
|
845
|
|
846 if($conc) {
|
|
847 my $align;
|
|
848 my $search1 = $trimmed1;
|
|
849 $search1 =~ s/N/./g;
|
|
850 my $search2 = $trimmed2;
|
|
851 $search2 =~ s/N/./g;
|
|
852 if($$refSeq{$contig} =~ /$search1/ && $$refSeq{$contig} =~ /$search2/) {
|
|
853 # the read matches exactly to the reference
|
|
854 $align{Ref1} = $$refSeq{$contig};
|
|
855 $align{Read1} = $$refSeq{$contig};
|
|
856 $align{Read1} =~ s/^(.*?)$search1(.*)$/${1}X$2/;
|
|
857 $align{Read1} =~ s/[^X]/-/g;
|
|
858 $align{Read1} = modify($align{Read1}, sub { $_[0] =~ s/X+/$trimmed1/ });
|
|
859 $align{Read2} = $$refSeq{$contig};
|
|
860 $align{Read2} =~ s/^(.*)$search2(.*?)$/${1}XX$2/;
|
|
861 $align{Read2} =~ s/[^X]/-/g;
|
|
862 $align{Read2} = modify($align{Read2}, sub { $_[0] =~ s/X+/$trimmed2/ });
|
|
863
|
|
864 # add aligned reads to queue
|
|
865 my @array;
|
|
866 {
|
|
867 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
868 @array = ($read1, $align{Ref1}, $align{Read1}, $qtrim1, $align{Ref1}, $align{Read2}, $qtrim2, $contig, $strand, $conc);
|
|
869 $Aligned->enqueue(\@array);
|
|
870 }
|
|
871
|
|
872 # add alignment to cache
|
|
873 {
|
|
874 lock(%cacheAlign);
|
|
875 $cacheAlign{Conc}{$name} = &share( [] );
|
|
876 @{ $cacheAlign{Conc}{$name} } = @array;
|
|
877 }
|
|
878 } elsif(defined $cacheAlign{Conc}{"$trimmed1.$trimmed2"}) {
|
|
879 my @array;
|
|
880 { lock(%cacheAlign); @array = @{ $cacheAlign{Conc}{"$trimmed1.$trimmed2"} } };
|
|
881 $array[0] = $read1;
|
|
882 $array[3] = $qtrim1;
|
|
883 $array[6] = $qtrim2;
|
|
884 { lock($cacheAlignHits); $cacheAlignHits++ };
|
|
885
|
|
886 # add aligned reads to queue
|
|
887 {
|
|
888 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
889 $Aligned->enqueue(\@array);
|
|
890 }
|
|
891 } else {
|
|
892
|
|
893 # align reads using NW2
|
|
894 {
|
|
895 lock($ToAlign); # Keep other threads from changing the queue's contents
|
|
896 my @array = ($$refSeq{$contig}, $trimmed1, $trimmed2, $read1, $qtrim1, $qtrim2, $contig, $strand, $conc);
|
|
897 #my @array = ($fastaIn, $read1, $qtrim1, $qtrim2, $contig, $strand, $conc, "$trimmed1.$trimmed2");
|
|
898 $ToAlign->enqueue(\@array);
|
|
899 }
|
|
900 }
|
|
901 }
|
|
902
|
|
903 if(!$conc && $index_read1 > -1) {
|
|
904 my $search1 = $trimmed1;
|
|
905 $search1 =~ s/N/./g;
|
|
906 if($$refSeq{$contig} =~ /$search1/) {
|
|
907 # the read matches exactly to the reference
|
|
908 $align{Ref1} = $$refSeq{$contig};
|
|
909 $align{Read1} = $$refSeq{$contig};
|
|
910 $align{Read1} =~ s/^(.*?)$search1(.*)$/${1}X$2/;
|
|
911 $align{Read1} =~ s/[^X]/-/g;
|
|
912 $align{Read1} = modify($align{Read1}, sub { $_[0] =~ s/X/$trimmed1/ });
|
|
913
|
|
914 # add aligned reads to queue
|
|
915 {
|
|
916 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
917 my @array = ($read1, $align{Ref1}, $align{Read1}, $qtrim1, undef, undef, undef, $contig, $strand, $conc);
|
|
918 $Aligned->enqueue(\@array);
|
|
919 }
|
|
920 } elsif(defined $cacheAlign{$contig}{R1}{$trimmed1}) {
|
|
921 my @array;
|
|
922 { lock(%cacheAlign); @array = @{ $cacheAlign{$contig}{R1}{$trimmed1} } };
|
|
923 $array[0] = $read1;
|
|
924 $array[3] = $qtrim1;
|
|
925 { lock($cacheAlignHits); $cacheAlignHits++ };
|
|
926
|
|
927 # add aligned reads to queue
|
|
928 {
|
|
929 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
930 $Aligned->enqueue(\@array);
|
|
931 }
|
|
932 } else {
|
|
933 # align read using NW2
|
|
934 {
|
|
935 lock($ToAlign); # Keep other threads from changing the queue's contents
|
|
936 my @array = ($$refSeq{$contig}, $trimmed1, undef, $read1, $qtrim1, undef, $contig, $strand, $conc);
|
|
937 #my @array = ($fastaIn, $read1, $qtrim1, undef, $contig, $strand, $conc, $trimmed1);
|
|
938 $ToAlign->enqueue(\@array);
|
|
939 }
|
|
940 }
|
|
941 }
|
|
942
|
|
943 if(!$conc && $index_read2 > -1) {
|
|
944 my $search2 = $trimmed2;
|
|
945 $search2 =~ s/N/./g;
|
|
946 if($$refSeq{$contig} =~ /$search2/) {
|
|
947 # the read matches exactly to the reference
|
|
948 $align{Ref2} = $$refSeq{$contig};
|
|
949 $align{Read2} = $$refSeq{$contig};
|
|
950 $align{Read2} =~ s/^(.*)$search2(.*?)$/${1}X$2/;
|
|
951 $align{Read2} =~ s/[^X]/-/g;
|
|
952 $align{Read2} = modify($align{Read2}, sub { $_[0] =~ s/X/$trimmed2/ });
|
|
953
|
|
954 # add aligned reads to queue
|
|
955 {
|
|
956 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
957 my @array = ($read2, undef, undef, undef, $align{Ref2}, $align{Read2}, $qtrim2, $contig, $strand, $conc);
|
|
958 $Aligned->enqueue(\@array);
|
|
959 }
|
|
960 } elsif(defined $cacheAlign{$contig}{R2}{$trimmed2}) {
|
|
961 my @array;
|
|
962 { lock(%cacheAlign); @array = @{ $cacheAlign{$contig}{R2}{$trimmed2} } };
|
|
963 $array[0] = $read2;
|
|
964 $array[6] = $qtrim2;
|
|
965 { lock($cacheAlignHits); $cacheAlignHits++ };
|
|
966
|
|
967 # add aligned reads to queue
|
|
968 {
|
|
969 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
970 $Aligned->enqueue(\@array);
|
|
971 }
|
|
972 } else {
|
|
973 # align read using NW2
|
|
974 {
|
|
975 lock($ToAlign); # Keep other threads from changing the queue's contents
|
|
976 my @array = ($$refSeq{$contig}, undef, $trimmed2, $read2, undef, $qtrim2, $contig, $strand, $conc);
|
|
977 #my @array = ($fastaIn, $read2, undef, $qtrim2, $contig, $strand, $conc, $trimmed2);
|
|
978 $ToAlign->enqueue(\@array);
|
|
979 }
|
|
980 }
|
|
981 }
|
|
982 }
|
|
983
|
|
984 if($index_read1 < 0) {
|
|
985 # discard read 1
|
|
986 # send to writebamfile thread
|
|
987 {
|
|
988 lock($WriteBam); # Keep other threads from changing the queue's contents
|
|
989 $WriteBam->enqueue("$read1\t4\t*\t0\t0\t*\t*\t0\t0\t$seq1\t$qual1\tRG:Z:$readGroup\n");
|
|
990 }
|
|
991 { lock(%stats); $stats{PoorQual1}++ };
|
|
992 }
|
|
993
|
|
994 if($index_read2 < 0) {
|
|
995 # discard read 2
|
|
996 # send to writebamfile thread
|
|
997 {
|
|
998 lock($WriteBam); # Keep other threads from changing the queue's contents
|
|
999 $WriteBam->enqueue("$read2\t4\t*\t0\t0\t*\t*\t0\t0\t$seq2\t$qual2\tRG:Z:$readGroup\n");
|
|
1000 }
|
|
1001 { lock(%stats); $stats{PoorQual2}++ };
|
|
1002 }
|
|
1003
|
|
1004 if($index_read1 == -1 && $index_read2 == -1) {
|
|
1005 { lock(%stats); $stats{Amb}++ };
|
|
1006 } elsif($index_read1 == -2 && $index_read2 == -2) {
|
|
1007 { lock(%stats); $stats{PoorQualMates}++ };
|
|
1008 }
|
|
1009 }
|
|
1010
|
|
1011 # grab the next reads in queue
|
|
1012 {
|
|
1013 lock($FindContig); # Keep other threads from changing the queue's contents
|
|
1014 $readArrayRef = $FindContig->dequeue();
|
|
1015 }
|
|
1016 }
|
|
1017 print "Closing Find Amplicon Thread\n";
|
|
1018 return;
|
|
1019 }
|
|
1020
|
|
1021 sub writebamfile {
|
|
1022 my $bamfile = shift;
|
|
1023 my $WriteBam = shift;
|
|
1024
|
|
1025 # write samfile header
|
|
1026 open(PIPE_TO_SAMTOOLS, "| samtools view -Sb -o $bamfile - 2>&1") or die "Can't fork: $!\n";
|
|
1027 foreach my $key (@OrderedContigs) {
|
|
1028 print PIPE_TO_SAMTOOLS "\@SQ\tSN:$refLength{$key}{newReg}\tLN:$refLength{$key}{length}\n";
|
|
1029 }
|
|
1030 print PIPE_TO_SAMTOOLS "\@RG\tID:$readGroup\tPL:Illumina\tSM:$sampleName\n";
|
|
1031
|
|
1032 my $read;
|
|
1033 {
|
|
1034 lock($WriteBam); # Keep other threads from changing the queue's contents
|
|
1035 $read = $WriteBam->dequeue();
|
|
1036 }
|
|
1037 while(defined $read) {
|
|
1038
|
|
1039 # write read to bamfile
|
|
1040 print PIPE_TO_SAMTOOLS $read;
|
|
1041
|
|
1042 # grab next item in queue
|
|
1043 {
|
|
1044 lock($WriteBam); # Keep other threads from changing the queue's contents
|
|
1045 $read = $WriteBam->dequeue();
|
|
1046 }
|
|
1047 }
|
|
1048 print "Closing Bam Writer\n";
|
|
1049 close PIPE_TO_SAMTOOLS or die "pipe could not be closed";
|
|
1050 return;
|
|
1051 }
|
|
1052
|
|
1053 sub thread_process_align {
|
|
1054 my $Aligned = shift;
|
|
1055 my $refFrags = shift;
|
|
1056 my $refSeq = shift;
|
|
1057 my $Samfile = shift;
|
|
1058 my $WriteBam = shift;
|
|
1059
|
|
1060 my $readArrayRef;
|
|
1061 {
|
|
1062 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
1063 $readArrayRef = $Aligned->dequeue();
|
|
1064 }
|
|
1065 while(defined $readArrayRef) {
|
|
1066 # grab the alignment
|
|
1067 my ($read, $ref1, $read1, $qtrim1, $ref2, $read2, $qtrim2, $contig, $strand, $conc) = @{ $readArrayRef };
|
|
1068 # grab primer lengths
|
|
1069 my $PU = $$refFrags{primers}{$contig}{U}{F}{length};
|
|
1070 my $PD = $$refFrags{primers}{$contig}{D}{F}{length};
|
|
1071 my %readStats;
|
|
1072 my $orig_read1 = $read1;
|
|
1073 my $orig_read2 = $read2;
|
|
1074 my $orig_qtrim1 = $qtrim1;
|
|
1075 my $orig_qtrim2 = $qtrim2;
|
|
1076 $orig_read1 =~ s/-//g if defined $orig_read1;
|
|
1077 $orig_read2 =~ s/-//g if defined $orig_read2;
|
|
1078
|
|
1079 if(defined $OPTIONS{d}) {
|
|
1080 # debug info
|
|
1081 print "Alignment With Primers\n";
|
|
1082 print "$ref1\n$read1\n" if defined $read1;
|
|
1083 print "$ref2\n$read2\n" if defined $read2;
|
|
1084 print "\nFinal Alignment:\n";
|
|
1085 }
|
|
1086
|
|
1087 # calculate where primers begin and end
|
|
1088 if(defined $read1) {
|
|
1089 if($strand eq 'F') {
|
|
1090 $readStats{$contig}{Read1} = 1;
|
|
1091 } else {
|
|
1092 $readStats{$contig}{Read2} = 1;
|
|
1093 }
|
|
1094
|
|
1095 # determine the primer locations
|
|
1096 my $offset = 0;
|
|
1097 for(my $occurance = 0; $occurance < $PU; $occurance++) {
|
|
1098 substr($ref1, $offset) =~ /^.*?([^-])/;
|
|
1099 $offset += $-[1] + 1;
|
|
1100 }
|
|
1101 my $length = 0;
|
|
1102 for(my $occurance = 0; $occurance < $PD; $occurance++) {
|
|
1103 substr($ref1, 0, length($ref1) - $length) =~ /.*([^-]).*?$/;
|
|
1104 $length = length($ref1) - $-[1];
|
|
1105 }
|
|
1106 $length = length($ref1) - $length - $offset;
|
|
1107
|
|
1108 # remove primers and adjust quality scores
|
|
1109 $ref1 = substr($ref1, $offset, $length);
|
|
1110 my $qualOffset = ( substr($read1, 0, $offset) =~ tr/ACTGN// );
|
|
1111 $read1 = substr($read1, $offset, $length);
|
|
1112 $length = ( $read1 =~ tr/ACTGN// );
|
|
1113 $qtrim1 = substr($qtrim1, $qualOffset, $length);
|
|
1114 }
|
|
1115
|
|
1116 if(defined $read2) {
|
|
1117 if($strand eq 'F') {
|
|
1118 $readStats{$contig}{Read2} = 1;
|
|
1119 } else {
|
|
1120 $readStats{$contig}{Read1} = 1;
|
|
1121 }
|
|
1122
|
|
1123 # determine the primer locations
|
|
1124 my $offset = 0;
|
|
1125 for(my $occurance = 0; $occurance < $PU; $occurance++) {
|
|
1126 substr($ref2, $offset) =~ /^.*?([^-])/;
|
|
1127 $offset += $-[1] + 1;
|
|
1128 }
|
|
1129 my $length = 0;
|
|
1130 for(my $occurance = 0; $occurance < $PD; $occurance++) {
|
|
1131 substr($ref2, 0, length($ref2) - $length) =~ /.*([^-]).*?$/;
|
|
1132 $length = length($ref2) - $-[1];
|
|
1133 }
|
|
1134 $length = length($ref2) - $length - $offset;
|
|
1135
|
|
1136 # remove primers and adjust quality scores
|
|
1137 $ref2 = substr($ref2, $offset, $length);
|
|
1138 my $qualOffset = ( substr($read2, 0, $offset) =~ tr/ACTGN// );
|
|
1139 $read2 = substr($read2, $offset, $length);
|
|
1140 $length = ( $read2 =~ tr/ACTGN// );
|
|
1141 $qtrim2 = substr($qtrim2, $qualOffset, $length);
|
|
1142 }
|
|
1143
|
|
1144 # generate a concensus sequence if we have both read1 and read2 and they overlap
|
|
1145 if($conc) {
|
|
1146 # the following are 0 based indexes
|
|
1147 my $R1qual = 0; # index for sanger qual read 1
|
|
1148 my $R2qual = 0; # index for sanger qual read 2
|
|
1149 my $aindex = 0; # index for alignment
|
|
1150 my $q1;
|
|
1151 my $q2;
|
|
1152 my $concensus;
|
|
1153 my $concqual;
|
|
1154
|
|
1155 $readStats{$contig}{ConcReads} = 1;
|
|
1156 $readStats{$contig}{ConcCorrectBases} = 0;
|
|
1157 $readStats{$contig}{ConcErrorsBases} = 0;
|
|
1158 while($aindex < length($ref1)) {
|
|
1159 # base1 is current base from read 1
|
|
1160 # base2 is current base from read 2
|
|
1161 my $base1 = substr($read1, $aindex, 1);
|
|
1162 my $base2 = substr($read2, $aindex, 1);
|
|
1163
|
|
1164 # grab the next quality score
|
|
1165 if($R1qual < length($qtrim1)) {
|
|
1166 $q1 = substr($qtrim1, $R1qual, 1);
|
|
1167 }
|
|
1168 if($R2qual < length($qtrim2)) {
|
|
1169 $q2 = substr($qtrim2, $R2qual, 1);
|
|
1170 }
|
|
1171
|
|
1172 # if we have a base in either
|
|
1173 if(($base1 ne '-' && $base1 ne 'N') || ($base2 ne '-' && $base2 ne 'N')) {
|
|
1174 if(($base2 eq "-" || $base2 eq 'N') && $base1 ne "-" && $base1 ne 'N') {
|
|
1175 # we are only in read 1 or read 2 is a N
|
|
1176 $concensus .= $base1;
|
|
1177 $concqual .= $q1;
|
|
1178 $R1qual++;
|
|
1179 if($base2 eq 'N') {
|
|
1180 $R2qual++;
|
|
1181 }
|
|
1182 } elsif(($base1 eq "-" || $base1 eq 'N') && $base2 ne "-" && $base2 ne 'N') {
|
|
1183 # we are only in read 2 or read 1 is a N
|
|
1184 $concensus .= $base2;
|
|
1185 $concqual .= $q2;
|
|
1186 $R2qual++;
|
|
1187 if($base1 eq 'N') {
|
|
1188 $R1qual++;
|
|
1189 }
|
|
1190 } else {
|
|
1191 # we are in the overlap region
|
|
1192 my $maxqual = sangerMax($q1, $q2);
|
|
1193 if($base1 eq $base2) {
|
|
1194 # both reads agree
|
|
1195 $readStats{$contig}{ConcCorrectBases}++;
|
|
1196 $concensus .= $base1;
|
|
1197 $concqual .= $maxqual;
|
|
1198 } elsif($q1 eq $q2) {
|
|
1199 # reads have different bases but the same score
|
|
1200 $readStats{$contig}{ConcErrorsBases}++;
|
|
1201 $concensus .= 'N';
|
|
1202 $concqual .= '#';
|
|
1203 } elsif($q1 eq $maxqual) {
|
|
1204 # reads have different bases but read 1 has the best score
|
|
1205 $concensus .= $base1;
|
|
1206 $concqual .= $maxqual;
|
|
1207 $readStats{$contig}{ConcErrorsBases}++;
|
|
1208 } else {
|
|
1209 # reads have different bases but read 2 has the best score
|
|
1210 $concensus .= $base2;
|
|
1211 $concqual .= $maxqual;
|
|
1212 $readStats{$contig}{ConcErrorsBases}++;
|
|
1213 }
|
|
1214 $R1qual++;
|
|
1215 $R2qual++;
|
|
1216 }
|
|
1217 } elsif($base1 eq 'N' || $base2 eq 'N') {
|
|
1218 if($base2 eq '-') {
|
|
1219 # one is a deletion the other is an N
|
|
1220 $concensus .= $base1;
|
|
1221 $concqual .= $q1;
|
|
1222 $R1qual++;
|
|
1223 } elsif($base1 eq '-') {
|
|
1224 # one is a deletion the other is an N
|
|
1225 $concensus .= $base2;
|
|
1226 $concqual .= $q2;
|
|
1227 $R2qual++;
|
|
1228 } elsif($base1 eq 'N' && $base2 eq 'N') {
|
|
1229 # both reads are an N
|
|
1230 $concensus .= 'N';
|
|
1231 $concqual .= sangerMax($q1, $q2);
|
|
1232 $R1qual++;
|
|
1233 $R2qual++;
|
|
1234 }
|
|
1235 } else {
|
|
1236 $concensus .= '-';
|
|
1237 }
|
|
1238 $aindex++;
|
|
1239 }
|
|
1240 $read1 = $concensus;
|
|
1241 $qtrim1 = $concqual;
|
|
1242 $read2 = undef;
|
|
1243 $qtrim2 = undef;
|
|
1244 }
|
|
1245
|
|
1246 if(defined $read1) {
|
|
1247 # remove any trailing - and send to samfile_line
|
|
1248 $read1 =~ s/^-*?([^-].*)/$1/;
|
|
1249 $ref1 = substr($ref1, -length($read1));
|
|
1250 $read1 =~ s/(.*[^-])-*?$/$1/;
|
|
1251 $ref1 = substr($ref1, 0, length($read1));
|
|
1252
|
|
1253 my $tmp = $ref1;
|
|
1254 $tmp =~ s/-//g;
|
|
1255 my $rstart = index($$refSeq{$contig}, $tmp) - $PU + 1;
|
|
1256
|
|
1257 # generate sam file line
|
|
1258 {
|
|
1259 lock($Samfile); # Keep other threads from changing the queue's contents
|
|
1260 my @array = ($read, $qtrim1, $rstart, $read1, $ref1, $contig, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, \%readStats);
|
|
1261 $Samfile->enqueue(\@array);
|
|
1262 }
|
|
1263 }
|
|
1264
|
|
1265 if(defined $read2) {
|
|
1266 # remove any trailing - and send to samfile_line
|
|
1267 $read2 =~ s/^-*?([^-].*)/$1/;
|
|
1268 $ref2 = substr($ref2, -length($read2));
|
|
1269 $read2 =~ s/(.*[^-])-*?$/$1/;
|
|
1270 $ref2 = substr($ref2, 0, length($read2));
|
|
1271
|
|
1272 my $tmp = $ref2;
|
|
1273 $tmp =~ s/-//g;
|
|
1274 my $rstart = rindex($$refSeq{$contig}, $tmp) - $PU + 1;
|
|
1275
|
|
1276 # generate sam file line
|
|
1277 {
|
|
1278 lock($Samfile); # Keep other threads from changing the queue's contents
|
|
1279 my @array = ($read, $qtrim2, $rstart, $read2, $ref2, $contig, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, \%readStats);
|
|
1280 $Samfile->enqueue(\@array);
|
|
1281 }
|
|
1282 }
|
|
1283
|
|
1284 if(defined $OPTIONS{d}) {
|
|
1285 # debug info
|
|
1286 print "$ref1\n$read1\n" if defined $read1;
|
|
1287 print "$ref2\n$read2\n" if defined $read2;
|
|
1288 print "$qtrim1\n" if defined $read1;
|
|
1289 print "$qtrim2\n" if defined $read2;
|
|
1290 print "\n\n";
|
|
1291 }
|
|
1292
|
|
1293 # grab next item in queue
|
|
1294 {
|
|
1295 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
1296 $readArrayRef = $Aligned->dequeue();
|
|
1297 }
|
|
1298 }
|
|
1299 print "Closing Alignment Processor\n";
|
|
1300 return;
|
|
1301 }
|
|
1302
|
|
1303 # function to take reverse compliment
|
|
1304 sub revcompl {
|
|
1305 my $string = shift @_;
|
|
1306 my $revcomp = reverse($string);
|
|
1307 $revcomp =~ tr/ACGTacgtKMSWRYBDHVkmswrybdhv/TGCAtgcaMKSWYRVHDBmkswyrvhdb/;
|
|
1308 return $revcomp;
|
|
1309 }
|
|
1310
|
|
1311 # function to convert sanger fastq quality scores to phred scores
|
|
1312 sub sanger2phred {
|
|
1313 my $sang = shift @_;
|
|
1314 my @sanger = split("", $sang);
|
|
1315 my @phred;
|
|
1316 foreach my $q (@sanger) {
|
|
1317 my $Q = ord($q) - 33;
|
|
1318 push @phred, $Q;
|
|
1319 }
|
|
1320 return @phred;
|
|
1321 }
|
|
1322
|
|
1323 # function to return the largest sanger quality score
|
|
1324 sub sangerMax {
|
|
1325 my ($s1, $s2) = @_;
|
|
1326 my $Q1 = ord($s1) - 33;
|
|
1327 my $Q2 = ord($s2) - 33;
|
|
1328 if($Q1 > $Q2) {
|
|
1329 return $s1;
|
|
1330 } else {
|
|
1331 return $s2;
|
|
1332 }
|
|
1333 }
|
|
1334
|
|
1335 # function to read a fasta file as a hash
|
|
1336 sub read_fasta_file {
|
|
1337 my ($fasta_file, $seqs) = @_;
|
|
1338 my $header;
|
|
1339 my $seq = "";
|
|
1340 open (IN, $fasta_file) or die "can't open $fasta_file: $!\n";
|
|
1341 while (my $line = <IN>) {
|
|
1342 chomp $line;
|
|
1343 if (grep />/, $line) {
|
|
1344 if ($seq ne "") {
|
|
1345 $seqs->{$header} = $seq;
|
|
1346 }
|
|
1347 $header = $line;
|
|
1348
|
|
1349 $header =~ s/^>//; # remove ">"
|
|
1350 $header =~ s/\s+$//; # remove trailing whitespace
|
|
1351 $seq = ""; # clear out old sequence
|
|
1352 } else {
|
|
1353 $line =~ s/\s+//g; # remove whitespace
|
|
1354 $seq .= $line; # add sequence
|
|
1355 }
|
|
1356 }
|
|
1357 close IN; # finished with file
|
|
1358
|
|
1359 if ($seq ne "") { # handle last sequence
|
|
1360 $seqs->{$header} = $seq;
|
|
1361 }
|
|
1362
|
|
1363 return;
|
|
1364 }
|
|
1365
|
|
1366 # minimal element of a list
|
|
1367 #
|
|
1368 sub min
|
|
1369 {
|
|
1370 my @list = @{$_[0]};
|
|
1371 my $min = $list[0];
|
|
1372
|
|
1373 foreach my $i (@list)
|
|
1374 {
|
|
1375 $min = $i if ($i < $min);
|
|
1376 }
|
|
1377
|
|
1378 return $min;
|
|
1379 }
|
|
1380
|
|
1381 # maximal element of a list
|
|
1382 #
|
|
1383 sub max
|
|
1384 {
|
|
1385 my @list = @{$_[0]};
|
|
1386 my $max = $list[0];
|
|
1387
|
|
1388 foreach my $i (@list)
|
|
1389 {
|
|
1390 $max = $i if ($i > $max);
|
|
1391 }
|
|
1392
|
|
1393 return $max;
|
|
1394 }
|
|
1395
|
|
1396 # trim sequences using the BWA algorithm
|
|
1397 sub trim {
|
|
1398
|
|
1399 my ($s, $q, $threshold, $read) = @_; # read sequence and quality scores
|
|
1400 if($read == 2) {
|
|
1401 $q = reverse($q);
|
|
1402 $s = reverse($s);
|
|
1403 }
|
|
1404
|
|
1405 my @array = sanger2phred($q);
|
|
1406 my $length = scalar @array;
|
|
1407
|
|
1408 # only calculate if quality fails near end
|
|
1409 if( $array[$#array] >= $threshold ){
|
|
1410 if($read == 2) {
|
|
1411 $q = reverse($q);
|
|
1412 $s = reverse($s);
|
|
1413 }
|
|
1414 return ($s, $q);
|
|
1415 }
|
|
1416
|
|
1417 # run bwa equation
|
|
1418 my @arg;
|
|
1419 for( my $i = 0; $i < $length - 1; $i++ ){
|
|
1420 my $x = $i + 1;
|
|
1421 for( my $j = $x; $j < $length; $j++ ){
|
|
1422 $arg[$x] += $threshold - $array[$j];
|
|
1423 }
|
|
1424 }
|
|
1425
|
|
1426 # find number of 5' bases to retain
|
|
1427 my $index = 0;
|
|
1428 my $maxval = 0;
|
|
1429 for ( 1 .. $#arg ){
|
|
1430 if ( $maxval < $arg[$_] ){
|
|
1431 $index = $_;
|
|
1432 $maxval = $arg[$_];
|
|
1433 }
|
|
1434 }
|
|
1435
|
|
1436 # trim reads
|
|
1437 $s = substr($s,0,$index);
|
|
1438 $q = substr($q,0,$index);
|
|
1439 if($read == 2) {
|
|
1440 $s = reverse($s);
|
|
1441 $q = reverse($q);
|
|
1442 }
|
|
1443
|
|
1444 # return seq and quality
|
|
1445 return ($s, $q);
|
|
1446 }
|
|
1447
|
|
1448
|
|
1449 sub samfile_line {
|
|
1450 # grab the variables
|
|
1451 my ($Samfile, $WriteBam, $refSeq) = @_;
|
|
1452 my %SamStats;
|
|
1453 my %minLength;
|
|
1454 my %maxLength;
|
|
1455 my $readArrayRef;
|
|
1456 {
|
|
1457 lock($Samfile); # Keep other threads from changing the queue's contents
|
|
1458 $readArrayRef = $Samfile->dequeue();
|
|
1459 }
|
|
1460
|
|
1461 while(defined $readArrayRef) {
|
|
1462 my ($read, $qual, $rstart, $align, $ref, $reg, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, $readStats) = @{ $readArrayRef };
|
|
1463 my $seq = $align;
|
|
1464 $seq =~ s/-//g;
|
|
1465
|
|
1466 # calculate the edit difference - count indels as only 1 missmatch hence tranform the strings so that all indels have length 1
|
|
1467 my $sam;
|
|
1468 my $s1;
|
|
1469 my $s2;
|
|
1470 my @s1 = split("", $align);
|
|
1471 my @s2 = split("", $ref);
|
|
1472
|
|
1473 my $indel = 0;
|
|
1474 for(my $i = 0; $i <= $#s1; $i++) {
|
|
1475 if(($s1[$i] ne "-" && $s2[$i] ne "-") || !$indel) {
|
|
1476 $s1 .= $s1[$i];
|
|
1477 $s2 .= $s2[$i];
|
|
1478 $indel = 0;
|
|
1479 if($s1[$i] eq "-" || $s2[$i] eq "-") {
|
|
1480 $indel = 1;
|
|
1481 }
|
|
1482 }
|
|
1483 }
|
|
1484 my $mq = levenshtein($s1, $s2);
|
|
1485
|
|
1486 my $cl = length($seq);
|
|
1487 # calculate max diff allowed
|
|
1488 if(!defined $maxdiff{$cl}) {
|
|
1489 { lock(%maxdiff); $maxdiff{$cl} = maxdiff($cl, 0.02, 0.04); }
|
|
1490 }
|
|
1491
|
|
1492 if($mq > $maxdiff{$cl} || $cl == 0) {
|
|
1493 { lock(%stats); $stats{$reg}{PoorMapQual}++; $stats{PoorMapQual}++; }
|
|
1494 if(defined $orig_read1) {
|
|
1495 $sam = "$read\t4\t*\t0\t0\t*\t*\t0\t0\t$orig_read1\t$orig_qtrim1\tRG:Z:$readGroup\n";
|
|
1496 # send to writebamfile thread
|
|
1497 {
|
|
1498 lock($WriteBam); # Keep other threads from changing the queue's contents
|
|
1499 $WriteBam->enqueue($sam);
|
|
1500 }
|
|
1501 }
|
|
1502 if(defined $orig_read2) {
|
|
1503 $read =~ s/[1](:[A-Z]+:[0-9]+:[ACTGN]+)$/2$1/;
|
|
1504 $sam = "$read\t4\t*\t0\t0\t*\t*\t0\t0\t$orig_read2\t$orig_qtrim2\tRG:Z:$readGroup\n";
|
|
1505 # send to writebamfile thread
|
|
1506 {
|
|
1507 lock($WriteBam); # Keep other threads from changing the queue's contents
|
|
1508 $WriteBam->enqueue($sam);
|
|
1509 }
|
|
1510 }
|
|
1511 } else {
|
|
1512 $mq = 60;
|
|
1513
|
|
1514 # add any stats from earlier
|
|
1515 foreach my $key1 (keys %$readStats) {
|
|
1516 foreach my $key2 (keys % { $$readStats{$key1} }) {
|
|
1517 $SamStats{$key1}{$key2} += $$readStats{$key1}{$key2};
|
|
1518 }
|
|
1519 }
|
|
1520
|
|
1521 # keep track of quality scores
|
|
1522 $SamStats{$reg}{Bases} += $cl;
|
|
1523 $SamStats{$reg}{Quality} += $_ for sanger2phred($qual);
|
|
1524
|
|
1525 # keep track of read lengths
|
|
1526 if(defined $minLength{$reg}) {
|
|
1527 $minLength{$reg} = min([$cl, $minLength{$reg}]);
|
|
1528 $maxLength{$reg} = max([$cl, $maxLength{$reg}]);
|
|
1529 } else {
|
|
1530 $minLength{$reg} = $cl;
|
|
1531 $maxLength{$reg} = $cl;
|
|
1532 }
|
|
1533
|
|
1534 # keep track of strand and bases seen
|
|
1535 if($strand eq 'F') {
|
|
1536 $SamStats{$reg}{NumberForward}++;
|
|
1537 } else {
|
|
1538 $SamStats{$reg}{NumberReverse}++;
|
|
1539 }
|
|
1540 $SamStats{$reg}{TotalA} += ($seq =~ tr/A//);
|
|
1541 $SamStats{$reg}{TotalC} += ($seq =~ tr/C//);
|
|
1542 $SamStats{$reg}{TotalG} += ($seq =~ tr/G//);
|
|
1543 $SamStats{$reg}{TotalT} += ($seq =~ tr/T//);
|
|
1544 $SamStats{$reg}{TotalN} += ($seq =~ tr/N//);
|
|
1545
|
|
1546 # keep count the number of reads for each contig
|
|
1547 $SamStats{$reg}{Reads}++;
|
|
1548
|
|
1549 # calculate strand info
|
|
1550 if($strand eq 'F') {
|
|
1551 $strand = 0;
|
|
1552 } else {
|
|
1553 $strand = 16;
|
|
1554 }
|
|
1555 # calculate cigar
|
|
1556 my $cigar = "";
|
|
1557 my $M = 0;
|
|
1558 my $D = 0;
|
|
1559 my $I = 0;
|
|
1560 my $Subs = 0;
|
|
1561 for(my $i = 0; $i < length($align); $i++) {
|
|
1562 if(substr($align, $i, 1) eq "-") {
|
|
1563 $D++;
|
|
1564 if($M) {
|
|
1565 $cigar .= "${M}M";
|
|
1566 $M = 0;
|
|
1567 } elsif($I) {
|
|
1568 $cigar .= "${I}I";
|
|
1569 $I = 0;
|
|
1570 }
|
|
1571 } elsif(substr($ref, $i, 1) eq "-") {
|
|
1572 $I++;
|
|
1573 if($M) {
|
|
1574 $cigar .= "${M}M";
|
|
1575 $M = 0;
|
|
1576 } elsif($D) {
|
|
1577 $cigar .= "${D}D";
|
|
1578 $D = 0;
|
|
1579 }
|
|
1580 } else {
|
|
1581 $M++;
|
|
1582 if($D) {
|
|
1583 $cigar .= "${D}D";
|
|
1584 $D = 0;
|
|
1585 } elsif($I) {
|
|
1586 $cigar .= "${I}I";
|
|
1587 $I = 0;
|
|
1588 }
|
|
1589 $Subs++ if substr($ref, $i, 1) ne substr($align, $i, 1);
|
|
1590 }
|
|
1591 }
|
|
1592 if($M) {
|
|
1593 $cigar .= "${M}M";
|
|
1594 } elsif($I) {
|
|
1595 $cigar .= "${I}I";
|
|
1596 } elsif($D) {
|
|
1597 $cigar .= "${D}D";
|
|
1598 }
|
|
1599
|
|
1600 # construct sam file line
|
|
1601 $sam = "$read\t$strand\t$refLength{$reg}{newReg}\t$rstart\t$mq\t$cigar\t*\t0\t0\t$seq\t$qual\tRG:Z:$readGroup\n";
|
|
1602
|
|
1603 # send to writebamfile thread
|
|
1604 {
|
|
1605 lock($WriteBam); # Keep other threads from changing the queue's contents
|
|
1606 $WriteBam->enqueue($sam);
|
|
1607 }
|
|
1608 }
|
|
1609
|
|
1610 #grab the next read
|
|
1611 {
|
|
1612 lock($Samfile); # Keep other threads from changing the queue's contents
|
|
1613 $readArrayRef = $Samfile->dequeue();
|
|
1614 }
|
|
1615 }
|
|
1616
|
|
1617 {
|
|
1618 # add stats from earlier
|
|
1619 lock(%stats);
|
|
1620 foreach my $key1 (keys %SamStats) {
|
|
1621 $stats{$key1}{MinLength} = min([$minLength{$key1}, $stats{$key1}{MinLength}]);
|
|
1622 $stats{$key1}{MaxLength} = max([$maxLength{$key1}, $stats{$key1}{MaxLength}]);
|
|
1623 foreach my $key2 (keys % { $SamStats{$key1} }) {
|
|
1624 $stats{$key1}{$key2} += $SamStats{$key1}{$key2};
|
|
1625 }
|
|
1626 }
|
|
1627
|
|
1628 }
|
|
1629 print "Closing Sam File Line Generator\n";
|
|
1630
|
|
1631 return;
|
|
1632 }
|
|
1633
|
|
1634 # perform the alignment
|
|
1635 sub nwAlign {
|
|
1636 my ($ToAlign, $Aligned, $regions) = @_;
|
|
1637 my $readArrayRef;
|
|
1638 {
|
|
1639 lock($ToAlign); # Keep other threads from changing the queue's contents
|
|
1640 $readArrayRef = $ToAlign->dequeue();
|
|
1641 }
|
|
1642
|
|
1643 while(defined $readArrayRef) {
|
|
1644 # we have picked a contig
|
|
1645 # check strand and take rev comp of ref if necessary
|
|
1646 my ($ref, $left, $right, $read, $qtrim1, $qtrim2, $contig, $strand, $conc) = @{ $readArrayRef };
|
|
1647 my %align;
|
|
1648 my $name;
|
|
1649 #! Parameters
|
|
1650 #! ClustalW Parameters
|
|
1651 my $matchScore = 1.9;
|
|
1652 my $missmatch_penalty = -0.4;
|
|
1653 my $gapPenOpen = -10.0;
|
|
1654 my $gapPenExt = -0.4;
|
|
1655 my $leftAndRightGapPen = true;
|
|
1656 my $debug = false;
|
|
1657 my $cons = "";
|
|
1658 my $align1= "";
|
|
1659 my $align2= "";
|
|
1660 my $align3= "";
|
|
1661 my $score = 0.0;
|
|
1662 my $singleAlign = false; #aligning 2 or 3 sequences
|
|
1663 if($conc)
|
|
1664 {
|
|
1665 #got to concatenate seqs and align
|
|
1666 $score = align3($ref, $right, $left, $align1, $align2, $align3, $matchScore, $missmatch_penalty, $gapPenOpen, $gapPenExt, $leftAndRightGapPen, $debug);
|
|
1667 $name = "$left.$right";
|
|
1668 print "$contig\n$align1\n$align2\n$align3\n\n" if defined $$regions{$contig};
|
|
1669 }
|
|
1670 elsif(!defined $right)
|
|
1671 {
|
|
1672 #align with left
|
|
1673 $score = align2($ref, $left, $align1, $align2, $matchScore, $missmatch_penalty, $gapPenOpen, $gapPenExt, false, true, $debug);
|
|
1674 $singleAlign = true;
|
|
1675 $name = $left;
|
|
1676 print "$contig\n$align1\n$align2\n\n" if defined $$regions{$contig};
|
|
1677 }
|
|
1678 elsif(!defined $left)
|
|
1679 {
|
|
1680 #align with right
|
|
1681 $score = align2($ref, $right, $align1, $align2, $matchScore, $missmatch_penalty, $gapPenOpen ,$gapPenExt ,true, false, $debug);
|
|
1682 $singleAlign = true;
|
|
1683 $name = $right;
|
|
1684 print "$contig\n$align1\n$align2\n\n" if defined $$regions{$contig};
|
|
1685 }
|
|
1686
|
|
1687 # add aligned reads to queue
|
|
1688 # cache results
|
|
1689 my $type;
|
|
1690 {
|
|
1691 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
1692
|
|
1693 my @array;
|
|
1694 if($singleAlign && !defined $right)
|
|
1695 {
|
|
1696 @array = ($read, $align1, $align2, $qtrim1, undef, undef, undef, $contig, $strand, $conc);
|
|
1697 {
|
|
1698 lock(%cacheAlign);
|
|
1699 $cacheAlign{$contig}{R1}{$name} = &share( [] );
|
|
1700 @{ $cacheAlign{$contig}{R1}{$name} } = @array;
|
|
1701 }
|
|
1702 }
|
|
1703 elsif($singleAlign && !defined $left )
|
|
1704 {
|
|
1705 @array = ($read, undef, undef, undef, $align1, $align2, $qtrim2, $contig, $strand, $conc);
|
|
1706 {
|
|
1707 lock(%cacheAlign);
|
|
1708 $cacheAlign{$contig}{R2}{$name} = &share( [] );
|
|
1709 @{ $cacheAlign{$contig}{R2}{$name} } = @array;
|
|
1710 }
|
|
1711 }
|
|
1712 else
|
|
1713 {
|
|
1714 @array = ($read, $align1, $align2, $qtrim1, $align1, $align3, $qtrim2, $contig, $strand, $conc);
|
|
1715 {
|
|
1716 lock(%cacheAlign);
|
|
1717 $cacheAlign{Conc}{$name} = &share( [] );
|
|
1718 @{ $cacheAlign{Conc}{$name} } = @array;
|
|
1719 }
|
|
1720 }
|
|
1721 $Aligned->enqueue(\@array);
|
|
1722 }
|
|
1723
|
|
1724 #grab the next read
|
|
1725 {
|
|
1726 lock($ToAlign); # Keep other threads from changing the queue's contents
|
|
1727 $readArrayRef = $ToAlign->dequeue();
|
|
1728 }
|
|
1729 }
|
|
1730 # clean up
|
|
1731 print "Closing NW2 Aligner\n";
|
|
1732 return;
|
|
1733 }
|
|
1734
|
|
1735 # function to calculat the minimum edit distance threshold (taken from bwa)
|
|
1736 sub maxdiff {
|
|
1737 my ($l, $err, $thres) = @_;
|
|
1738 my $lambda = exp(-$l*$err);
|
|
1739 my $y = 1.0;
|
|
1740 my $x = 1;
|
|
1741 my $sum = $lambda;
|
|
1742 for(my $k = 1; $k < 1000; $k++) {
|
|
1743 $y *= $l * $err;
|
|
1744 $x *= $k;
|
|
1745 $sum += $lambda * $y / $x;
|
|
1746 if(1.0 - $sum < $thres) {
|
|
1747 return $k;
|
|
1748 }
|
|
1749 }
|
|
1750 return 2;
|
|
1751 }
|
|
1752
|
|
1753 # using approx matching return the positions in $list of those most closely matching $string
|
|
1754 sub amatch {
|
|
1755 my ($string, $list) = @_;
|
|
1756 my @d = adistr($string, @$list);
|
|
1757 my @d_index = sort { abs($d[$a]) <=> abs($d[$b]) } 0..$#$list;
|
|
1758
|
|
1759 return if $d[$d_index[0]] > 0.25;
|
|
1760
|
|
1761 my @pos = ($d_index[0]);
|
|
1762 my $i = 1;
|
|
1763 while($i <= $#d_index && $d[$d_index[0]] == $d[$d_index[$i]]) {
|
|
1764 push @pos, $d_index[$i];
|
|
1765 $i++;
|
|
1766 }
|
|
1767 return @pos;
|
|
1768 }
|
|
1769
|
|
1770 # a sorting algorithm to sort the chromosomes
|
|
1771 sub sortChrom {
|
|
1772 my ($achr, $astart, $aend) = split(":|-", $a);
|
|
1773 my ($bchr, $bstart, $bend) = split(":|-", $b);
|
|
1774 $achr =~ s/X/23/;
|
|
1775 $achr =~ s/Y/24/;
|
|
1776 $achr =~ s/MT?/25/;
|
|
1777 $bchr =~ s/X/23/;
|
|
1778 $bchr =~ s/Y/24/;
|
|
1779 $bchr =~ s/MT?/25/;
|
|
1780 if($achr < $bchr) {
|
|
1781 return -1;
|
|
1782 } elsif($bchr < $achr) {
|
|
1783 return 1;
|
|
1784 } elsif($astart < $bstart) {
|
|
1785 return -1;
|
|
1786 } elsif($bstart < $astart) {
|
|
1787 return 1;
|
|
1788 } elsif($aend < $bend) {
|
|
1789 return -1;
|
|
1790 } elsif($bend < $aend) {
|
|
1791 return 1;
|
|
1792 } else {
|
|
1793 return 0;
|
|
1794 }
|
|
1795 }
|
|
1796
|
|
1797 # pass a code reference
|
|
1798 sub modify {
|
|
1799 my($text, $code) = @_;
|
|
1800 $code->($text);
|
|
1801 return $text;
|
|
1802 }
|
|
1803
|
|
1804 sub split_region {
|
|
1805 my ($chr, $start, $end) = split(":|-", shift);
|
|
1806 $end = $start unless defined $end;
|
|
1807 return($chr, $start, $end);
|
|
1808 }
|
|
1809
|
|
1810 sub error_rate {
|
|
1811 my ($x, $y) = @_;
|
|
1812 my @x = split("", $x);
|
|
1813 my @y = split("", $y);
|
|
1814 my $s = 0;
|
|
1815 my $l = 0;
|
|
1816 for(my $i = 0; $i <= $#x; $i++) {
|
|
1817 if($x[$i] ne "-" && $y[$i] ne "-") {
|
|
1818 $s++ if $x[$i] ne $y[$i];
|
|
1819 $l++;
|
|
1820 }
|
|
1821 }
|
|
1822 return($s / $l);
|
|
1823 }
|
|
1824
|
|
1825
|