0
|
1 #!/usr/bin/perl
|
|
2
|
|
3 # script to convert paired end reads to a concensus sequence
|
|
4
|
|
5 # Created by Jason Ellul, Feb 2012
|
|
6 # Modified by Franco Caramia and Jason Ellul, July 2012 and November 2012
|
|
7 # 0.4 Fixed mapping of concensus region (large insertions and deletions now picked up correctly)
|
|
8 # Added missalignment penalty
|
|
9 # Read 1 and 2 are now aligned first then they are aligned to the reference
|
|
10 # 0.5 Added ability to name amplicons by includinfo the info in the primers file
|
|
11 # Added the ability to output the alignments in a particular region using the new flag -a
|
|
12 # Added some extra stats to the output
|
|
13
|
|
14 use strict;
|
|
15 use warnings;
|
|
16 use Getopt::Std;
|
|
17 use File::Basename;
|
|
18 use File::Find;
|
|
19 use File::Copy;
|
|
20 use File::MMagic;
|
|
21 use File::Temp qw/ tempfile tempdir /;
|
|
22 use Cwd 'abs_path';
|
|
23 use threads;
|
|
24 use threads::shared;
|
|
25 use List::Util qw(sum);
|
|
26 use Thread::Queue;
|
|
27 use String::Approx 'adistr';
|
|
28 use POSIX qw(floor ceil);
|
|
29 String::Approx::cache_max(25000);
|
|
30 String::Approx::cache_purge(0.25);
|
|
31 use constant false => 0;
|
|
32 use constant true => 1;
|
|
33 use Inline CPP => Config => AUTO_INCLUDE => '#include "utils.cpp"';
|
|
34 use Inline CPP => <<'END_CPP';
|
|
35
|
|
36
|
|
37
|
|
38
|
|
39 int levenshtein(char* seq1, char* seq2)
|
|
40 {
|
|
41 // Returns edit distance of 2 sequences
|
|
42 string sam1(seq1);
|
|
43 string sam2(seq2);
|
|
44 return levenshteinDistance(sam1, sam2);
|
|
45 }
|
|
46
|
|
47 double align2(char* seq1, char* seq2, SV* res1, SV* res2, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool noFrontGapPenalty, bool noEndGapPenalty, bool debug )
|
|
48 {
|
|
49 // Returns Sequence alignment of 2 sequences
|
|
50 // res1, res2 contain the alignment
|
|
51 //seq1: reference sequence
|
|
52 //seq2: amplicon sequence
|
|
53 //matchScore: reward for matching a nucleotide
|
|
54 //gapPenOpen: penalty for opening a gap
|
|
55 //gapPenext: penalty for extending a gap
|
|
56 //noFrontGapPenalty: No penalty will be impose for gaps at the beggining of the alignment
|
|
57 //noEndGapPenalty: No penalty will be impose for gaps at the end of the alignment
|
|
58 //debug: prints scores, alignments, etc...
|
|
59
|
|
60 string sam1(seq1);
|
|
61 string sam2(seq2);
|
|
62 double ret = nw_align2(sam1, sam2, matchScore, missmatch_penalty, gapPenOpen, gapPenExt, noFrontGapPenalty, noEndGapPenalty, debug);
|
|
63 //Setting the return alignments
|
|
64 sv_setpvn(res1, (char*)sam1.c_str(), sam1.length());
|
|
65 sv_setpvn(res2, (char*)sam2.c_str(), sam2.length());
|
|
66 return ret;
|
|
67 }
|
|
68
|
|
69 void adjustSequence(int refSize, string& seq, char dir)
|
|
70 {
|
|
71
|
|
72 int adjust = refSize - seq.length();
|
|
73 if(dir == 'r')
|
|
74 for(int i=0; i<adjust; ++i) seq = '-' + seq;
|
|
75 else
|
|
76 for(int i=0; i<adjust; ++i) seq += '-';
|
|
77 }
|
|
78
|
|
79 double align3(char* ref, char* right,char* left , SV* res1, SV* res2, SV* res3, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool leftAndRightGapPen, bool debug )
|
|
80 {
|
|
81 // Returns Sequence alignment of 3 sequences
|
|
82 // res1, res2, res3 contain the alignment
|
|
83 //ref: reference sequence
|
|
84 //left: left amplicon sequence
|
|
85 //right: right amplicon sequence
|
|
86 //matchScore: reward for matching a nucleotide
|
|
87 //gapPenOpen: penalty for opening a gap
|
|
88 //gapPenext: penalty for extending a gap
|
|
89 //leftAndRightGapPen: Smart way to penalise according to the amplicon
|
|
90 //debug: prints scores, alignments, etc...
|
|
91
|
|
92
|
|
93 string leftSeq(left);
|
|
94 string rightSeq(right);
|
|
95
|
|
96 bool noFrontGapPenalty = false;
|
|
97 bool noEndGapPenalty = false;
|
|
98
|
|
99 //Align right with left reads
|
|
100 if(leftAndRightGapPen)
|
|
101 {
|
|
102 noFrontGapPenalty = true;
|
|
103 noEndGapPenalty = true;
|
|
104 }
|
|
105
|
|
106 //first alignment the 2 reads
|
|
107 double ret = nw_align2(rightSeq, leftSeq, matchScore, missmatch_penalty*8, gapPenOpen*5, gapPenExt, noFrontGapPenalty, noEndGapPenalty, debug);
|
|
108
|
|
109 //Setting the return alignments
|
|
110 vector <string> msa;
|
|
111 vector <string> partial;
|
|
112 vector <string> seq;
|
|
113
|
|
114 partial.push_back(rightSeq);
|
|
115 partial.push_back(leftSeq);
|
|
116 seq.push_back(ref);
|
|
117
|
|
118 msa = nw_alignAlignments(partial, seq , matchScore, missmatch_penalty, gapPenOpen, gapPenExt, false, false, debug);
|
|
119 sv_setpvn(res1, (char*)msa[2].c_str(), msa[2].length());
|
|
120 sv_setpvn(res2, (char*)msa[1].c_str(), msa[1].length());
|
|
121 sv_setpvn(res3, (char*)msa[0].c_str(), msa[0].length());
|
|
122 return ret;
|
|
123 }
|
|
124
|
|
125 END_CPP
|
|
126
|
|
127 $| = 1;
|
|
128
|
|
129 # Grab and set all options
|
|
130 my $version = "0.5";
|
|
131 my %OPTIONS = (p => 1);
|
|
132 getopts('a:dl:L:o:p:rsv', \%OPTIONS);
|
|
133
|
|
134 die qq(version $version
|
|
135 Usage: alignCustomAmplicon.pl [OPTIONS] <ref> <fastq[.gz] 1> <fastq[.gz] 2> <primers>
|
|
136
|
|
137 OPTIONS: -a STR print the alignment for any amplicons covering a particular region [chr:loc || chr:start-stop]
|
|
138 -d print debug info
|
|
139 -l STR filename of the log file [null]
|
|
140 -L STR append to an existing log file [null]
|
|
141 -o STR filename for bam file
|
|
142 -p INT number of threads [$OPTIONS{p}]
|
|
143 -r do not create stats report
|
|
144 -s sort the bam file
|
|
145 -v verbose progress
|
4
|
146 -j STR picard jar
|
0
|
147 ) if(@ARGV != 4);
|
|
148
|
|
149 my $Script = 'alignCustomAmplicon';
|
|
150 my $ref = shift @ARGV;
|
|
151 my $fastq1 = shift @ARGV;
|
|
152 my $fastq2 = shift @ARGV;
|
|
153 my $primers = shift @ARGV;
|
|
154 my $now;
|
|
155
|
|
156 #create log file
|
|
157 if(defined $OPTIONS{l}) {
|
|
158 my $logfile = $OPTIONS{l};
|
|
159 open (FH, ">$logfile") or die 'Couldn\'t create log file!\n';
|
|
160 close (FH);
|
|
161 # Open the log file and redirect output to it
|
|
162 open (STDERR, ">>$logfile");
|
|
163 open (STDOUT, ">>$logfile");
|
|
164 my $now = localtime time;
|
|
165 print "Log File Created $now\n\n";
|
|
166 } elsif(defined $OPTIONS{L}) {
|
|
167 #append to a log file
|
|
168 my $logfile = $OPTIONS{L};
|
|
169 # Open the log file and redirect output to it
|
|
170 open (STDERR, ">>$logfile") or die 'Couldn\'t create log file!\n';
|
|
171 open (STDOUT, ">>$logfile") or die 'Couldn\'t create log file!\n';
|
|
172 my $now = localtime time;
|
|
173 print "Appending To Log File $now\n\n";
|
|
174 }
|
|
175
|
4
|
176 if (defined $OPTIONS{j})
|
|
177 {
|
|
178 my $JAR = $OPTIONS{j};
|
|
179
|
|
180 }
|
|
181
|
0
|
182 # print version of this script
|
|
183 print "Using $Script version $version \n\n";
|
|
184
|
|
185 if(defined $OPTIONS{d}) {
|
|
186 print "Setting threads to 1 as using debug\n";
|
|
187 $OPTIONS{p} = 1;
|
|
188 }
|
|
189
|
|
190 # grab sample name, set output file name and open some files
|
|
191 my ($name, $path, $suffix) = fileparse($fastq1, ('.fastq', '.fastq.gz'));
|
|
192 my $sampleName :shared = $name;
|
|
193 $sampleName =~ s/^(.*?_.*?)_.*/$1/;
|
|
194 open(PRIM, $primers) or die "can't open $primers: $!\n";
|
|
195
|
|
196 # set samfile name
|
|
197 my $bamfile_out = "${sampleName}_aligned.bam";
|
|
198 if(defined $OPTIONS{o}) {
|
|
199 $bamfile_out = $OPTIONS{o};
|
|
200 }
|
|
201 my $bam_to_sort = $bamfile_out;
|
|
202 if(defined $OPTIONS{s}) {
|
|
203 $bam_to_sort .= '.tmp';
|
|
204 }
|
|
205
|
|
206 my $stats = $bamfile_out;
|
|
207 if(!defined $OPTIONS{r}) {
|
|
208 $stats =~ s/\..*?$//;
|
|
209 open(STATS, ">${stats}_stats.csv");
|
|
210 }
|
|
211
|
|
212 # read in primer lengths
|
|
213 my %refFrags;
|
|
214 $refFrags{min} = 5000000;
|
|
215 while(my $line = <PRIM>) {
|
|
216 chomp $line;
|
|
217 my @line_sp = split("\t", $line);
|
|
218 $refFrags{primers}{$line_sp[0]}{U}{F}{length} = $line_sp[1];
|
|
219 $refFrags{primers}{$line_sp[0]}{D}{F}{length} = $line_sp[2];
|
|
220 $refFrags{primers}{$line_sp[0]}{U}{R}{length} = $line_sp[2];
|
|
221 $refFrags{primers}{$line_sp[0]}{D}{R}{length} = $line_sp[1];
|
|
222 $refFrags{name}{$line_sp[0]} = $line_sp[3];
|
|
223 $refFrags{min} = min([$refFrags{min}, $line_sp[1], $line_sp[2]]);
|
|
224 }
|
|
225 close(PRIM);
|
|
226 my @OrderedContigs :shared = sort sortChrom keys %{ $refFrags{primers} };
|
|
227
|
|
228 my %regions;
|
|
229 if(defined $OPTIONS{a}) {
|
|
230 my ($chr, $start, $end) = split_region($OPTIONS{a});
|
|
231 foreach my $reg (@OrderedContigs) {
|
|
232 my ($chr_reg, $start_reg, $end_reg) = split_region($reg);
|
|
233 $regions{$reg} = 1 if $chr eq $chr_reg && $start_reg <= $end && $end_reg >= $start;
|
|
234 }
|
|
235 }
|
|
236
|
|
237 # grab fasta file
|
|
238 my %refSeq;
|
|
239 read_fasta_file($ref, \%refSeq);
|
|
240
|
|
241 # calculate read length and read group
|
|
242 my $readGroup :shared;
|
|
243 my $readLength;
|
|
244 my $mm = new File::MMagic;
|
|
245 my $filetype = $mm->checktype_filename(abs_path($fastq1));
|
|
246 if($filetype eq "application/x-gzip") {
|
|
247 my $sampleRead = `gunzip -c $fastq1 | head -n 2 | tail -n 1`;
|
|
248 chomp $sampleRead;
|
|
249 $readLength = length($sampleRead);
|
|
250 $readGroup = `gunzip -c $fastq1 | head -n 1`;
|
|
251 chomp $readGroup;
|
|
252 $readGroup =~ s/.*?:.*?:(.*?):.*/${1}_L001/;
|
|
253 } elsif($filetype eq "text/plain") {
|
|
254 my $sampleRead = `head -n 2 $fastq1 | tail -n 1`;
|
|
255 chomp $sampleRead;
|
|
256 $readLength = length($sampleRead);
|
|
257 $readGroup = `head $fastq1 -n 1`;
|
|
258 chomp $readGroup;
|
|
259 $readGroup =~ s/.*?:.*?:(.*?):.*/${1}_L001/;
|
|
260 } else {
|
|
261 die("$fastq1 is filetype $filetype and not text/plain or application/x-gzip\n");
|
|
262 }
|
|
263
|
|
264 # create hash to keep track of maxdiff and readlengths
|
|
265 my %maxdiff :shared;
|
|
266
|
|
267 # create hash to keep track of stats
|
|
268 my %stats :shared;
|
|
269 $stats{Amb} = 0;
|
|
270 $stats{PoorQualMates} = 0;
|
|
271 $stats{PoorMapQual} = 0;
|
|
272 $stats{PoorQual1} = 0;
|
|
273 $stats{PoorQual2} = 0;
|
|
274
|
|
275 # setup other shared variables and caches
|
|
276 my %cache :shared;
|
|
277 my $cacheHits :shared = 0;
|
|
278 my %cacheAlign :shared;
|
|
279 $cacheAlign{Conc} = &share( {} );
|
|
280 my $cacheAlignHits :shared = 0;
|
|
281 my %refLength :shared;
|
|
282
|
|
283 # extract primers and fragments on forward and reverse strand and initialize counts
|
|
284 foreach my $key (@OrderedContigs) {
|
|
285 push @{ $refFrags{seqU} }, substr($refSeq{$key}, 0, $refFrags{primers}{$key}{U}{F}{length});
|
|
286 push @{ $refFrags{seqD} }, substr($refSeq{$key}, -$refFrags{primers}{$key}{D}{F}{length});
|
|
287 push @{ $refFrags{seqU} }, revcompl(substr($refSeq{$key}, -$refFrags{primers}{$key}{U}{R}{length}));
|
|
288 push @{ $refFrags{seqD} }, revcompl(substr($refSeq{$key}, 0, $refFrags{primers}{$key}{D}{R}{length}));
|
|
289 push @{ $refFrags{lenU} }, ($refFrags{primers}{$key}{U}{F}{length}, $refFrags{primers}{$key}{U}{R}{length});
|
|
290 push @{ $refFrags{lenD} }, ($refFrags{primers}{$key}{D}{F}{length}, $refFrags{primers}{$key}{D}{R}{length});
|
|
291 push @{ $refFrags{reg} }, ($key, $key);
|
|
292 push @{ $refFrags{strand} }, ('F', 'R');
|
|
293
|
|
294 # calculate new contig name with primers removed
|
|
295 my ($chr, $start, $stop) = split(":|-", $key);
|
|
296 $start += $refFrags{primers}{$key}{U}{F}{length};
|
|
297 $stop -= $refFrags{primers}{$key}{D}{F}{length};
|
|
298 $refLength{$key} = &share( {} );
|
|
299 $refLength{$key}{newReg} = "$chr:$start-$stop";
|
|
300 $refLength{$key}{length} = $stop - $start + 1;
|
|
301
|
|
302 $cacheAlign{$key} = &share( {} );
|
|
303 $cacheAlign{$key}{R1} = &share( {} );
|
|
304 $cacheAlign{$key}{R2} = &share( {} );
|
|
305
|
|
306 $stats{$key} = &share( {} );
|
|
307 $stats{$key}{Reads} = 0;
|
|
308 $stats{$key}{Bases} = 0;
|
|
309 $stats{$key}{Quality} = 0;
|
|
310 $stats{$key}{Mapping} = 0;
|
|
311 $stats{$key}{Read1} = 0;
|
|
312 $stats{$key}{Read2} = 0;
|
|
313 $stats{$key}{ConcMapping} = 0;
|
|
314 $stats{$key}{ConcReads} = 0;
|
|
315 $stats{$key}{NumberForward} = 0;
|
|
316 $stats{$key}{NumberReverse} = 0;
|
|
317 $stats{$key}{ConcErrorsBases} = 0;
|
|
318 $stats{$key}{ConcCorrectBases} = 0;
|
|
319 $stats{$key}{MinLength} = 100000;
|
|
320 $stats{$key}{MaxLength} = 0;
|
|
321 $stats{$key}{TotalA} = 0;
|
|
322 $stats{$key}{TotalC} = 0;
|
|
323 $stats{$key}{TotalG} = 0;
|
|
324 $stats{$key}{TotalT} = 0;
|
|
325 $stats{$key}{TotalN} = 0;
|
|
326 $stats{$key}{PoorMapQual} = 0;
|
|
327 }
|
|
328
|
|
329 # start up worker threads
|
|
330 my $FindContig = Thread::Queue->new();
|
|
331 my $ToAlign = Thread::Queue->new();
|
|
332 my $Aligned = Thread::Queue->new();
|
|
333 my $Samfile = Thread::Queue->new();
|
|
334 my $WriteBam = Thread::Queue->new();
|
|
335 my @threadArray;
|
|
336 for(my $index = 1; $index <= $OPTIONS{p}; $index++) {
|
|
337 push @threadArray, threads->create(\&thread_find_contig, $FindContig, \%refFrags, \%refSeq, $ToAlign, $Aligned, $WriteBam);
|
|
338 push @threadArray, threads->create(\&thread_process_align, $Aligned, \%refFrags, \%refSeq, $Samfile, $WriteBam) if $index <= max([1, $OPTIONS{p}/2]);
|
|
339 push @threadArray, threads->create(\&samfile_line, $Samfile, $WriteBam, \%refSeq) if $index <= max([1, $OPTIONS{p}/2]);
|
|
340 }
|
|
341 push @threadArray, threads->create(\&nwAlign, $ToAlign, $Aligned, \%regions);
|
|
342 push @threadArray, threads->create(\&writebamfile, $bam_to_sort, $WriteBam);
|
|
343
|
|
344 # read in each sequence and its mate from the fastq file
|
|
345 my $read1;
|
|
346 my $read2;
|
|
347 my $seq1;
|
|
348 my $seq2;
|
|
349 my $qual1;
|
|
350 my $qual2;
|
|
351 my $desc;
|
|
352 my @strand;
|
|
353 my $cont = 1;
|
|
354
|
|
355 # open fastq files
|
|
356 my $gz1;
|
|
357 my $gz2;
|
|
358 my $f1gz = 0;
|
|
359 my $f2gz = 0;
|
|
360 if($filetype eq "application/x-gzip") {
|
|
361 open(FQ1, "gunzip -dc $fastq1 |") or die "Can't fork: $!\n";
|
|
362 } else {
|
|
363 open(FQ1, $fastq1) or die "$fastq1: $!";
|
|
364 }
|
|
365
|
|
366 $filetype = $mm->checktype_filename(abs_path($fastq2));;
|
|
367 if($filetype eq "application/x-gzip") {
|
|
368 open(FQ2, "gunzip -dc $fastq2 |") or die "Can't fork: $!\n";
|
|
369 } elsif($filetype eq "text/plain") {
|
|
370 open(FQ2, $fastq2) or die "$fastq2: $!";
|
|
371 } else {
|
|
372 die("$fastq2 is filetype $filetype and not text/plain or application/x-gzip\n");
|
|
373 }
|
|
374
|
|
375 # add reads to the queue
|
|
376 my $readNo = 0;
|
|
377 while ($cont) {
|
|
378 if($read1 = <FQ1>) { # read 1 and 2 read name
|
|
379 $seq1 = <FQ1>; # read 1 sequence
|
|
380 $desc = <FQ1>; # read 1 desc
|
|
381 $qual1 = <FQ1>; # read 1 quality
|
|
382 } else {
|
|
383 last;
|
|
384 }
|
|
385
|
|
386 if($read2 = <FQ2>) { # read 2 read name
|
|
387 $seq2 = <FQ2>; # read 2 sequence
|
|
388 $desc = <FQ2>; # read 2 sdesc
|
|
389 $qual2 = <FQ2>; # read 2 quality
|
|
390 } else {
|
|
391 last;
|
|
392 }
|
|
393
|
|
394 # read 1 the forward read
|
|
395 chomp $read1;
|
|
396 chomp $seq1;
|
|
397 chomp $qual1;
|
|
398 $read1 =~ s/^\@(.*)\s?.*/$1/;
|
|
399
|
|
400 # read 2 the reverse read
|
|
401 chomp $read2;
|
|
402 chomp $seq2;
|
|
403 $seq2 = revcompl($seq2);
|
|
404 chomp $qual2;
|
|
405 $qual2 = reverse($qual2);
|
|
406
|
|
407 $read2 =~ s/^\@(.*)\s?.*/$1/;
|
|
408 $readNo++;
|
|
409
|
|
410 # keep at most 20 reads in buffer
|
|
411 while(1) {
|
|
412 if(defined $OPTIONS{v}) {
|
|
413 my $percAmb = $stats{Amb} / $readNo * 100;
|
|
414 my $percPoor = $stats{PoorQualMates} / $readNo * 100;
|
|
415 my $percPoorMap = $stats{PoorMapQual} / $readNo * 100;
|
|
416 my $percPoor1 = $stats{PoorQual1} / $readNo * 100;
|
|
417 my $percPoor2 = $stats{PoorQual2} / $readNo * 100;
|
|
418 $percAmb = sprintf("%.2f", $percAmb);
|
|
419 $percPoor = sprintf("%.2f", $percPoor);
|
|
420 $percPoorMap = sprintf("%.2f", $percPoorMap);
|
|
421 $percPoor1 = sprintf("%.2f", $percPoor1);
|
|
422 $percPoor2 = sprintf("%.2f", $percPoor2);
|
|
423 print STDERR "\r$readNo processed - Amb $percAmb - PoorQual $percPoor - PoorQual1 $percPoor1 - PoorQual2 $percPoor2 - PoorMappingQual $percPoorMap - FindContig ".$FindContig->pending()." - ToAlign ".$ToAlign->pending()." - Aligned ".$Aligned->pending()." - Sam ".$Samfile->pending()." - Writebam ".$WriteBam->pending()." ";
|
|
424 }
|
|
425 last if $FindContig->pending() <= 100;
|
|
426 }
|
|
427 my @readArray = ($read1, $seq1, $qual1, $read2, $seq2, $qual2);
|
|
428 {
|
|
429 lock($FindContig); # Keep other threads from changing the queue's contents
|
|
430 $FindContig->enqueue(\@readArray);
|
|
431 }
|
|
432 }
|
|
433 close(FQ1);
|
|
434 close(FQ2);
|
|
435
|
|
436 # wait for all records to be processed and close down threads
|
|
437 $FindContig->enqueue( (undef) x $OPTIONS{p} );
|
|
438 while($FindContig->pending() > 0 ) {
|
|
439 next;
|
|
440 }
|
|
441 $ToAlign->enqueue( (undef) );
|
|
442 while($ToAlign->pending() > 0 ) {
|
|
443 next;
|
|
444 }
|
|
445 $Aligned->enqueue( (undef) x floor(max([1, $OPTIONS{p}/2])) );
|
|
446 while($Aligned->pending() > 0 ) {
|
|
447 next;
|
|
448 }
|
|
449 $Samfile->enqueue( (undef) x floor(max([1, $OPTIONS{p}/2])) );
|
|
450 while($Samfile->pending() > 0 ) {
|
|
451 next;
|
|
452 }
|
|
453 $WriteBam->enqueue( undef );
|
|
454
|
|
455 $_->join() for @threadArray;
|
|
456
|
|
457 # sort the bam file
|
|
458 if(defined $OPTIONS{s}) {
|
4
|
459 system("java -Xmx8g -jar $JAR MAX_RECORDS_IN_RAM=2000000 I=$bam_to_sort O=$bamfile_out VALIDATION_STRINGENCY=LENIENT SO=coordinate USE_THREADING=true CREATE_INDEX=true");
|
0
|
460 unlink $bam_to_sort;
|
|
461 }
|
|
462
|
|
463 if(!defined $OPTIONS{r}) {
|
|
464 # print out statistics
|
|
465 # setup variables
|
|
466 my $totalRegionLengths = 0;
|
|
467 my $totalReads = 0;
|
|
468 my $totalFwd = 0;
|
|
469 my $totalRev = 0;
|
|
470 my $totalMatch = 0;
|
|
471 my $totalMiss = 0;
|
|
472 my $totalBases = 0;
|
|
473 my $totalRead1 = 0;
|
|
474 my $totalRead2 = 0;
|
|
475 my $totalA = 0;
|
|
476 my $totalC = 0;
|
|
477 my $totalG = 0;
|
|
478 my $totalT = 0;
|
|
479 my $totalN = 0;
|
|
480 my $totalConc = 0;
|
|
481 my $totalQuality = 0;
|
|
482 my $AllmaxRead = 0;
|
|
483 my $AllminRead = 100000;
|
|
484 my $avequal = 'NA';
|
|
485 my $aveRead = 'NA';
|
|
486 my $perA = 'NA';
|
|
487 my $perC = 'NA';
|
|
488 my $perG = 'NA';
|
|
489 my $perT = 'NA';
|
|
490 my $perN = 'NA';
|
|
491 my $readsUsed = 0;
|
|
492 my @coverage;
|
|
493 my @off_coverage;
|
|
494
|
|
495 # print column headers
|
|
496 print STATS 'AmpliconName,Region,RegionLengthMinusPrimers,TotalReadsPoorMapping,TotalReadsUsed,Reads1Kept,Reads2Kept,ReadsOut,ReadsFwd,ReadsRev,ReadsOverlap,';
|
|
497 print STATS "Bases,AveQuality,MatchBases,ErrorBases,ErrorRate,MinReadLength,AveReadLength,MaxReadLength,PercA,PercC,PercG,PercT,PercN\n";
|
|
498
|
|
499 # for each region
|
|
500 foreach my $key (@OrderedContigs){
|
|
501 # if we have some reads calculate averages
|
|
502 if($stats{$key}{Reads} > 0) {
|
|
503 $avequal = $stats{$key}{Quality} / $stats{$key}{Bases};
|
|
504 $avequal = sprintf("%.4f", $avequal);
|
|
505 $aveRead = $stats{$key}{Bases} / $stats{$key}{Reads};
|
|
506 $aveRead = sprintf("%.4f", $aveRead);
|
|
507 $perA = $stats{$key}{TotalA} / $stats{$key}{Bases} * 100;
|
|
508 $perA = sprintf("%.2f", $perA);
|
|
509 $perC = $stats{$key}{TotalC} / $stats{$key}{Bases} * 100;
|
|
510 $perC = sprintf("%.2f", $perC);
|
|
511 $perG = $stats{$key}{TotalG} / $stats{$key}{Bases} * 100;
|
|
512 $perG = sprintf("%.2f", $perG);
|
|
513 $perT = $stats{$key}{TotalT} / $stats{$key}{Bases} * 100;
|
|
514 $perT = sprintf("%.2f", $perT);
|
|
515 $perN = $stats{$key}{TotalN} / $stats{$key}{Bases} * 100;
|
|
516 $perN = sprintf("%.2f", $perN);
|
|
517 $AllminRead = min([$AllminRead, $stats{$key}{MinLength}]);
|
|
518 $AllmaxRead = max([$AllmaxRead, $stats{$key}{MaxLength}]);
|
|
519 $readsUsed = $stats{$key}{Reads} + $stats{$key}{ConcReads};
|
|
520 } else {
|
|
521 $stats{$key}{MaxLength} = 'NA';
|
|
522 $stats{$key}{MinLength} = 'NA';
|
|
523 }
|
|
524
|
|
525 # if we have reads which overlapped calculate an error rate
|
|
526 my $errRate = 'NA';
|
|
527 if($stats{$key}{ConcReads} > 0) {
|
|
528 $errRate = $stats{$key}{ConcErrorsBases} / ($stats{$key}{ConcErrorsBases} + $stats{$key}{ConcCorrectBases});
|
|
529 }
|
|
530
|
|
531 # print the stats for this region
|
|
532 print STATS "$refFrags{name}{$key}," if defined $refFrags{name}{$key};
|
|
533 print STATS "Unknown," unless defined $refFrags{name}{$key};
|
|
534 print STATS "$refLength{$key}{newReg},$refLength{$key}{length},$stats{$key}{PoorMapQual},$readsUsed,$stats{$key}{Read1},$stats{$key}{Read2},$stats{$key}{Reads},$stats{$key}{NumberForward},";
|
|
535 print STATS "$stats{$key}{NumberReverse},$stats{$key}{ConcReads},$stats{$key}{Bases},$avequal,$stats{$key}{ConcCorrectBases},";
|
|
536 print STATS "$stats{$key}{ConcErrorsBases},$errRate,$stats{$key}{MinLength},$aveRead,$stats{$key}{MaxLength},$perA,$perC,$perG,$perT,$perN\n";
|
|
537
|
|
538 # add stats to grand totals
|
|
539 $totalRegionLengths += $refLength{$key}{length};
|
|
540 $totalQuality += $stats{$key}{Quality};
|
|
541 $totalReads += $stats{$key}{Reads};
|
|
542 $totalFwd += $stats{$key}{NumberForward};
|
|
543 $totalRev += $stats{$key}{NumberReverse};
|
|
544 $totalBases += $stats{$key}{Bases};
|
|
545 $totalRead1 += $stats{$key}{Read1};
|
|
546 $totalRead2 += $stats{$key}{Read2};
|
|
547 $totalA += $stats{$key}{TotalA};
|
|
548 $totalC += $stats{$key}{TotalC};
|
|
549 $totalT += $stats{$key}{TotalT};
|
|
550 $totalG += $stats{$key}{TotalG};
|
|
551 $totalN += $stats{$key}{TotalN};
|
|
552 $totalMatch += $stats{$key}{ConcCorrectBases};
|
|
553 $totalMiss += $stats{$key}{ConcErrorsBases};
|
|
554 $totalConc += $stats{$key}{ConcReads};
|
|
555
|
|
556 # reset ave scores
|
|
557 $avequal = 'NA';
|
|
558 $aveRead = 'NA';
|
|
559 $perA = 'NA';
|
|
560 $perC = 'NA';
|
|
561 $perG = 'NA';
|
|
562 $perT = 'NA';
|
|
563 $perN = 'NA';
|
|
564 $readsUsed = 0;
|
|
565
|
|
566 if(defined $refFrags{name}{$key} && grep /^Off_target/, $refFrags{name}{$key}) {
|
|
567 push @off_coverage, $stats{$key}{Reads};
|
|
568 } else {
|
|
569 push @coverage, $stats{$key}{Reads};
|
|
570 }
|
|
571 }
|
|
572
|
|
573 # if we have any reads calculate the averages
|
|
574 if($totalReads > 0) {
|
|
575 $avequal = $totalQuality / $totalBases;
|
|
576 $avequal = sprintf("%.4f", $avequal);
|
|
577 $aveRead = $totalBases / $totalReads;
|
|
578 $aveRead = sprintf("%.4f", $aveRead);
|
|
579 $perA = $totalA / $totalBases * 100;
|
|
580 $perA = sprintf("%.2f", $perA);
|
|
581 $perC = $totalC / $totalBases * 100;
|
|
582 $perC = sprintf("%.2f", $perC);
|
|
583 $perG = $totalG / $totalBases * 100;
|
|
584 $perG = sprintf("%.2f", $perG);
|
|
585 $perT = $totalT / $totalBases * 100;
|
|
586 $perT = sprintf("%.2f", $perT);
|
|
587 $perN = $totalN / $totalBases * 100;
|
|
588 $perN = sprintf("%.2f", $perN);
|
|
589 $readsUsed = $totalReads + $totalConc;
|
|
590 } else {
|
|
591 $AllmaxRead = 'NA';
|
|
592 $AllminRead = 'NA';
|
|
593 }
|
|
594
|
|
595 # if we had any reads which overlapped calculate error rate
|
|
596 my $errRate = 'NA';
|
|
597 if($totalConc > 0) {
|
|
598 $errRate = $totalMiss / ($totalMiss + $totalMatch);
|
|
599 }
|
|
600
|
|
601 # print out totals
|
|
602 my $pcAmb = sprintf("%.2f", $stats{Amb} / $readNo * 100);
|
|
603 my $pcPQ = sprintf("%.2f", $stats{PoorQualMates} / $readNo * 100);
|
|
604 my $TotalReads = $readNo * 2;
|
|
605 my $Unmapped = $TotalReads - $readsUsed;
|
|
606 my $pcUnmapped = sprintf("%.2f", $Unmapped/$TotalReads * 100);
|
|
607 my $avCov = sprintf("%.0f", sum(@coverage)/@coverage);
|
|
608 my $avOff = sprintf("%.2f", sum(@off_coverage)/@off_coverage) if @off_coverage;
|
|
609 my $greater = scalar (grep { $_ > 0.2*$avCov } @coverage);
|
|
610 my $pcgreater = sprintf("%.2f", $greater/@coverage * 100);
|
|
611
|
|
612 print STATS ",Total,$totalRegionLengths,$stats{PoorMapQual},$readsUsed,$totalRead1,$totalRead2,$totalReads,$totalFwd,$totalRev,$totalConc,$totalBases,$avequal,$totalMatch,";
|
|
613 print STATS "$totalMiss,$errRate,$AllminRead,$aveRead,$AllmaxRead,$perA,$perC,$perG,$perT,$perN\n";
|
|
614 print STATS "\nNo mate pairs:,$readNo\n";
|
|
615 print STATS "No ambiguous mates:,$stats{Amb},$pcAmb\%\n";
|
|
616 print STATS "No poor quality mates:,$stats{PoorQualMates},$pcPQ\%\n\n";
|
|
617 print STATS "Total Number Of Reads:,$TotalReads\n";
|
|
618 print STATS "Unmapped Reads:,$Unmapped,$pcUnmapped\%\n\n";
|
|
619 print STATS "Min Mean Max ReadsOut:,".min([@coverage]).",$avCov,".max([@coverage])."\n";
|
|
620 print STATS "No. Amplicons > 0.2 x ReadsOut:,$greater,$pcgreater\%\n";
|
|
621 print STATS "Average ReadsOut Off Target Regions:,$avOff\n" if @off_coverage;
|
|
622 print STATS "\nAmplicons\nNo. Reads < 200, 200 <= No. Reads < 1000, No. Reads >= 1000\n";
|
|
623 $greater = scalar (grep { $_ < 200 } @coverage);
|
|
624 print STATS "$greater,";
|
|
625 $greater = scalar (grep { $_ >= 200 && $_ < 1000 } @coverage);
|
|
626 print STATS "$greater,";
|
|
627 $greater = scalar (grep { $_ >= 1000 } @coverage);
|
|
628 print STATS "$greater\n";
|
|
629
|
|
630 print "Cache hits align contig: $cacheHits\n";
|
|
631 print "Alignment cache hits: $cacheAlignHits\n";
|
|
632 close(STATS);
|
|
633 }
|
|
634
|
|
635 # close the log file
|
|
636 if(defined $OPTIONS{l} || defined $OPTIONS{L}) {
|
|
637 my $now = localtime time;
|
|
638 print "\nLog File Closed $now\n";
|
|
639 close(STDERR);
|
|
640 close(STDOUT);
|
|
641 }
|
|
642
|
|
643 # thread to find matching amplicon
|
|
644 sub thread_find_contig {
|
|
645 my $FindContig = shift;
|
|
646 my $refFrags = shift;
|
|
647 my $refSeq = shift;
|
|
648 my $ToAlign = shift;
|
|
649 my $Aligned = shift;
|
|
650 my $WriteBam = shift;
|
|
651 my $readArrayRef;
|
|
652 {
|
|
653 lock($FindContig); # Keep other threads from changing the queue's contents
|
|
654 $readArrayRef = $FindContig->dequeue();
|
|
655 }
|
|
656
|
|
657 while(defined $readArrayRef) {
|
|
658 my ($read1, $seq1, $qual1, $read2, $seq2, $qual2) = @{ $readArrayRef };
|
|
659 my $contig = '';
|
|
660 my $readRef = '';
|
|
661 my $strand = '';
|
|
662
|
|
663 # trim the reads
|
|
664 my ($trimmed1, $qtrim1) = trim($seq1, $qual1, 30, 1);
|
|
665 my ($trimmed2, $qtrim2) = trim($seq2, $qual2, 30, 2);
|
|
666
|
|
667 # if we have already mapped the reads use the results in the cache
|
|
668 if(defined $cacheAlign{Conc}{"$trimmed1.$trimmed2"}) {
|
|
669 my @array;
|
|
670 { lock(%cacheAlign); @array = @{ $cacheAlign{Conc}{"$trimmed1.$trimmed2"} } };
|
|
671 $array[0] = $read1;
|
|
672 $array[3] = $qtrim1;
|
|
673 $array[6] = $qtrim2;
|
|
674 { lock($cacheAlignHits); $cacheAlignHits++ };
|
|
675
|
|
676 # add aligned reads to queue
|
|
677 {
|
|
678 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
679 $Aligned->enqueue(\@array);
|
|
680 }
|
|
681 } else {
|
|
682 # Use approx match to find most similar amplicon primer to reads
|
|
683 my @index1;
|
|
684 my @index2;
|
|
685 my $index_read1 = -2;
|
|
686 my $index_read2 = -2;
|
|
687 my $index_found = 0;
|
|
688
|
|
689 if(length($trimmed1) > length($seq1)/2 || length($trimmed2) > length($seq2)/2) {
|
|
690 # we have at least 1 good read
|
|
691 # find closest matching amplicon for read 1
|
|
692 my $string = substr($seq1, 0, $$refFrags{min});
|
|
693 if(defined $cache{"$string.1"}) {
|
|
694 $cacheHits++;
|
|
695 { lock(%cache); @index1 = @{ $cache{"$string.1"} } };
|
|
696 } else {
|
|
697 @index1 = amatch($string, \@{ $$refFrags{seqU} });
|
|
698 {
|
|
699 lock(%cache);
|
|
700 $cache{"$string.1"} = &share( [] );
|
|
701 @{ $cache{"$string.1"} } = @index1;
|
|
702 }
|
|
703 }
|
|
704
|
|
705 # find closest matching amplicon for read 2
|
|
706 $string = substr($seq2, -$$refFrags{min});
|
|
707 if(defined $cache{"$string.2"}) {
|
|
708 $cacheHits++;
|
|
709 { lock(%cache); @index2 = @{ $cache{"$string.2"} } };
|
|
710 } else {
|
|
711 @index2 = amatch($string, \@{ $$refFrags{seqD} });
|
|
712 {
|
|
713 lock(%cache);
|
|
714 $cache{"$string.2"} = &share( [] );
|
|
715 @{ $cache{"$string.2"} } = @index2;
|
|
716 }
|
|
717 }
|
|
718
|
|
719 if($#index1 > 0 || $#index2 > 0) {
|
|
720 # if one read maps to multiple amplicons and the other read matches to just one of them use that
|
|
721 my %in_array2 = map { $_ => 1 } @index2;
|
|
722 my @common = grep { $in_array2{$_} } @index1;
|
|
723 if($#common == 0) {
|
|
724 @index1 = @common;
|
|
725 @index2 = @common;
|
|
726 } elsif($#common > 0) {
|
|
727 if($#index1 > 0) {
|
|
728 # matched to multiple primers so try and use whole read
|
|
729 my @refs;
|
|
730 foreach(@index1) {
|
|
731 if($$refFrags{strand}[$_] eq "F") {
|
|
732 push @refs, $$refSeq{${ $$refFrags{reg} }[$_]};
|
|
733 } else {
|
|
734 push @refs, revcompl($$refSeq{${ $$refFrags{reg} }[$_]});
|
|
735 }
|
|
736 }
|
|
737 my @index1b = amatch($seq1, \@refs);
|
|
738 if($#index1b >= 0) {
|
|
739 @index1 = @index1[@index1b];
|
|
740 }
|
|
741 }
|
|
742 if($#index2 > 0) {
|
|
743 # matched to multiple primers so try and use whole read
|
|
744 my @refs;
|
|
745 foreach(@index2) {
|
|
746 if($$refFrags{strand}[$_] eq "F") {
|
|
747 push @refs, $$refSeq{${ $$refFrags{reg} }[$_]};
|
|
748 } else {
|
|
749 push @refs, revcompl($$refSeq{${ $$refFrags{reg} }[$_]});
|
|
750 }
|
|
751 }
|
|
752 my @index2b = amatch($seq2, \@refs);
|
|
753 if($#index2b >= 0) {
|
|
754 @index2 = @index2[@index2b];
|
|
755 }
|
|
756 }
|
|
757
|
|
758 # if one read still maps to multiple amplicons and the other read matches to just one of them use that
|
|
759 %in_array2 = map { $_ => 1 } @index2;
|
|
760 @common = grep { $in_array2{$_} } @index1;
|
|
761 if($#common == 0) {
|
|
762 @index1 = @common;
|
|
763 @index2 = @common;
|
|
764 }
|
|
765 }
|
|
766 }
|
|
767
|
|
768 # if read 1 and read 2 match to the same amplicon assign it
|
|
769 if($#index1 == 0 && @index1 ~~ @index2 ) {
|
|
770 $index_read1 = $index1[0];
|
|
771 $index_read2 = $index2[0];
|
|
772 } else {
|
|
773 $index_read1 = -1;
|
|
774 $index_read2 = -1;
|
|
775 }
|
|
776
|
|
777 # if any read after trimming is less than half the original length discard
|
|
778 if(length($trimmed1) < length($seq1)/2) {
|
|
779 $index_read1 = -2;
|
|
780 }
|
|
781
|
|
782 if(length($trimmed2) < length($seq2)/2) {
|
|
783 $index_read2 = -2;
|
|
784 }
|
|
785 }
|
|
786
|
|
787 if($index_read1 > -1 || $index_read2 > -1) {
|
|
788 my $conc = 0;
|
|
789 my %align;
|
|
790
|
|
791 # at least 1 read matched to an amplicon
|
|
792 if($index_read1 > -1) {
|
|
793 $contig = $$refFrags{reg}[$index_read1];
|
|
794 $readRef = $$refSeq{$contig};
|
|
795 $strand = $$refFrags{strand}[$index_read1];
|
|
796 } elsif($index_read2 > -1) {
|
|
797 $contig = $$refFrags{reg}[$index_read2];
|
|
798 $readRef = $$refSeq{$contig};
|
|
799 $strand = $$refFrags{strand}[$index_read2];
|
|
800 }
|
|
801
|
|
802 if($index_read1 == $index_read2 && length($readRef) < length($trimmed1) + length($trimmed2)) {
|
|
803 # reads must overlap so can use concensus
|
|
804 $conc = 1;
|
|
805 } else {
|
|
806 my $substr1;
|
|
807 my $substr2;
|
|
808 for(my $length = 5; $length <= min([length($trimmed1), length($trimmed2)]); $length++) {
|
|
809 $substr1 = substr($trimmed1, -$length);
|
|
810 $substr2 = substr($trimmed2, 0, $length);
|
|
811 if($substr1 eq $substr2) {
|
|
812 $conc = 1;
|
|
813 last;
|
|
814 } else {
|
|
815 # calculate max diff allowed
|
|
816 if(!defined $maxdiff{$length}) {
|
|
817 { lock(%maxdiff); $maxdiff{$length} = maxdiff($length, 0.02, 0.04); }
|
|
818 }
|
|
819
|
|
820 my $diff = levenshtein($substr1, $substr2 );
|
|
821 if($diff < $maxdiff{$length}) {
|
|
822 $conc = 1;
|
|
823 last;
|
|
824 }
|
|
825 }
|
|
826 }
|
|
827 }
|
|
828
|
|
829 if($conc) {
|
|
830 # remove any trailing Ns
|
|
831 $trimmed1 = $seq1;
|
|
832 $trimmed2 = $seq2;
|
|
833 $trimmed1 =~ s/N*$//;
|
|
834 $trimmed2 =~ s/^N*//;
|
|
835 $qtrim1 = $qual1;
|
|
836 $qtrim2 = $qual2;
|
|
837 $qtrim1 = substr($qtrim1, 0, length($trimmed1));
|
|
838 $qtrim2 = substr($qtrim2 , -length($trimmed2));
|
|
839 }
|
|
840
|
|
841 if($strand eq "R") {
|
|
842 my $trimmed1_tmp = revcompl($trimmed1);
|
|
843 my $qtrim1_tmp = reverse($qtrim1);
|
|
844 $trimmed1 = revcompl($trimmed2);
|
|
845 $qtrim1 = reverse($qtrim2);
|
|
846 $trimmed2 = $trimmed1_tmp;
|
|
847 $qtrim2 = $qtrim1_tmp;
|
|
848 my $index_read_tmp = $index_read1;
|
|
849 $index_read1 = $index_read2;
|
|
850 $index_read2 = $index_read_tmp;
|
|
851 }
|
|
852
|
|
853 if($conc) {
|
|
854 my $align;
|
|
855 my $search1 = $trimmed1;
|
|
856 $search1 =~ s/N/./g;
|
|
857 my $search2 = $trimmed2;
|
|
858 $search2 =~ s/N/./g;
|
|
859 if($$refSeq{$contig} =~ /$search1/ && $$refSeq{$contig} =~ /$search2/) {
|
|
860 # the read matches exactly to the reference
|
|
861 $align{Ref1} = $$refSeq{$contig};
|
|
862 $align{Read1} = $$refSeq{$contig};
|
|
863 $align{Read1} =~ s/^(.*?)$search1(.*)$/${1}X$2/;
|
|
864 $align{Read1} =~ s/[^X]/-/g;
|
|
865 $align{Read1} = modify($align{Read1}, sub { $_[0] =~ s/X+/$trimmed1/ });
|
|
866 $align{Read2} = $$refSeq{$contig};
|
|
867 $align{Read2} =~ s/^(.*)$search2(.*?)$/${1}XX$2/;
|
|
868 $align{Read2} =~ s/[^X]/-/g;
|
|
869 $align{Read2} = modify($align{Read2}, sub { $_[0] =~ s/X+/$trimmed2/ });
|
|
870
|
|
871 # add aligned reads to queue
|
|
872 my @array;
|
|
873 {
|
|
874 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
875 @array = ($read1, $align{Ref1}, $align{Read1}, $qtrim1, $align{Ref1}, $align{Read2}, $qtrim2, $contig, $strand, $conc);
|
|
876 $Aligned->enqueue(\@array);
|
|
877 }
|
|
878
|
|
879 # add alignment to cache
|
|
880 {
|
|
881 lock(%cacheAlign);
|
|
882 $cacheAlign{Conc}{$name} = &share( [] );
|
|
883 @{ $cacheAlign{Conc}{$name} } = @array;
|
|
884 }
|
|
885 } elsif(defined $cacheAlign{Conc}{"$trimmed1.$trimmed2"}) {
|
|
886 my @array;
|
|
887 { lock(%cacheAlign); @array = @{ $cacheAlign{Conc}{"$trimmed1.$trimmed2"} } };
|
|
888 $array[0] = $read1;
|
|
889 $array[3] = $qtrim1;
|
|
890 $array[6] = $qtrim2;
|
|
891 { lock($cacheAlignHits); $cacheAlignHits++ };
|
|
892
|
|
893 # add aligned reads to queue
|
|
894 {
|
|
895 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
896 $Aligned->enqueue(\@array);
|
|
897 }
|
|
898 } else {
|
|
899
|
|
900 # align reads using NW2
|
|
901 {
|
|
902 lock($ToAlign); # Keep other threads from changing the queue's contents
|
|
903 my @array = ($$refSeq{$contig}, $trimmed1, $trimmed2, $read1, $qtrim1, $qtrim2, $contig, $strand, $conc);
|
|
904 #my @array = ($fastaIn, $read1, $qtrim1, $qtrim2, $contig, $strand, $conc, "$trimmed1.$trimmed2");
|
|
905 $ToAlign->enqueue(\@array);
|
|
906 }
|
|
907 }
|
|
908 }
|
|
909
|
|
910 if(!$conc && $index_read1 > -1) {
|
|
911 my $search1 = $trimmed1;
|
|
912 $search1 =~ s/N/./g;
|
|
913 if($$refSeq{$contig} =~ /$search1/) {
|
|
914 # the read matches exactly to the reference
|
|
915 $align{Ref1} = $$refSeq{$contig};
|
|
916 $align{Read1} = $$refSeq{$contig};
|
|
917 $align{Read1} =~ s/^(.*?)$search1(.*)$/${1}X$2/;
|
|
918 $align{Read1} =~ s/[^X]/-/g;
|
|
919 $align{Read1} = modify($align{Read1}, sub { $_[0] =~ s/X/$trimmed1/ });
|
|
920
|
|
921 # add aligned reads to queue
|
|
922 {
|
|
923 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
924 my @array = ($read1, $align{Ref1}, $align{Read1}, $qtrim1, undef, undef, undef, $contig, $strand, $conc);
|
|
925 $Aligned->enqueue(\@array);
|
|
926 }
|
|
927 } elsif(defined $cacheAlign{$contig}{R1}{$trimmed1}) {
|
|
928 my @array;
|
|
929 { lock(%cacheAlign); @array = @{ $cacheAlign{$contig}{R1}{$trimmed1} } };
|
|
930 $array[0] = $read1;
|
|
931 $array[3] = $qtrim1;
|
|
932 { lock($cacheAlignHits); $cacheAlignHits++ };
|
|
933
|
|
934 # add aligned reads to queue
|
|
935 {
|
|
936 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
937 $Aligned->enqueue(\@array);
|
|
938 }
|
|
939 } else {
|
|
940 # align read using NW2
|
|
941 {
|
|
942 lock($ToAlign); # Keep other threads from changing the queue's contents
|
|
943 my @array = ($$refSeq{$contig}, $trimmed1, undef, $read1, $qtrim1, undef, $contig, $strand, $conc);
|
|
944 #my @array = ($fastaIn, $read1, $qtrim1, undef, $contig, $strand, $conc, $trimmed1);
|
|
945 $ToAlign->enqueue(\@array);
|
|
946 }
|
|
947 }
|
|
948 }
|
|
949
|
|
950 if(!$conc && $index_read2 > -1) {
|
|
951 my $search2 = $trimmed2;
|
|
952 $search2 =~ s/N/./g;
|
|
953 if($$refSeq{$contig} =~ /$search2/) {
|
|
954 # the read matches exactly to the reference
|
|
955 $align{Ref2} = $$refSeq{$contig};
|
|
956 $align{Read2} = $$refSeq{$contig};
|
|
957 $align{Read2} =~ s/^(.*)$search2(.*?)$/${1}X$2/;
|
|
958 $align{Read2} =~ s/[^X]/-/g;
|
|
959 $align{Read2} = modify($align{Read2}, sub { $_[0] =~ s/X/$trimmed2/ });
|
|
960
|
|
961 # add aligned reads to queue
|
|
962 {
|
|
963 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
964 my @array = ($read2, undef, undef, undef, $align{Ref2}, $align{Read2}, $qtrim2, $contig, $strand, $conc);
|
|
965 $Aligned->enqueue(\@array);
|
|
966 }
|
|
967 } elsif(defined $cacheAlign{$contig}{R2}{$trimmed2}) {
|
|
968 my @array;
|
|
969 { lock(%cacheAlign); @array = @{ $cacheAlign{$contig}{R2}{$trimmed2} } };
|
|
970 $array[0] = $read2;
|
|
971 $array[6] = $qtrim2;
|
|
972 { lock($cacheAlignHits); $cacheAlignHits++ };
|
|
973
|
|
974 # add aligned reads to queue
|
|
975 {
|
|
976 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
977 $Aligned->enqueue(\@array);
|
|
978 }
|
|
979 } else {
|
|
980 # align read using NW2
|
|
981 {
|
|
982 lock($ToAlign); # Keep other threads from changing the queue's contents
|
|
983 my @array = ($$refSeq{$contig}, undef, $trimmed2, $read2, undef, $qtrim2, $contig, $strand, $conc);
|
|
984 #my @array = ($fastaIn, $read2, undef, $qtrim2, $contig, $strand, $conc, $trimmed2);
|
|
985 $ToAlign->enqueue(\@array);
|
|
986 }
|
|
987 }
|
|
988 }
|
|
989 }
|
|
990
|
|
991 if($index_read1 < 0) {
|
|
992 # discard read 1
|
|
993 # send to writebamfile thread
|
|
994 {
|
|
995 lock($WriteBam); # Keep other threads from changing the queue's contents
|
|
996 $WriteBam->enqueue("$read1\t4\t*\t0\t0\t*\t*\t0\t0\t$seq1\t$qual1\tRG:Z:$readGroup\n");
|
|
997 }
|
|
998 { lock(%stats); $stats{PoorQual1}++ };
|
|
999 }
|
|
1000
|
|
1001 if($index_read2 < 0) {
|
|
1002 # discard read 2
|
|
1003 # send to writebamfile thread
|
|
1004 {
|
|
1005 lock($WriteBam); # Keep other threads from changing the queue's contents
|
|
1006 $WriteBam->enqueue("$read2\t4\t*\t0\t0\t*\t*\t0\t0\t$seq2\t$qual2\tRG:Z:$readGroup\n");
|
|
1007 }
|
|
1008 { lock(%stats); $stats{PoorQual2}++ };
|
|
1009 }
|
|
1010
|
|
1011 if($index_read1 == -1 && $index_read2 == -1) {
|
|
1012 { lock(%stats); $stats{Amb}++ };
|
|
1013 } elsif($index_read1 == -2 && $index_read2 == -2) {
|
|
1014 { lock(%stats); $stats{PoorQualMates}++ };
|
|
1015 }
|
|
1016 }
|
|
1017
|
|
1018 # grab the next reads in queue
|
|
1019 {
|
|
1020 lock($FindContig); # Keep other threads from changing the queue's contents
|
|
1021 $readArrayRef = $FindContig->dequeue();
|
|
1022 }
|
|
1023 }
|
|
1024 print "Closing Find Amplicon Thread\n";
|
|
1025 return;
|
|
1026 }
|
|
1027
|
|
1028 sub writebamfile {
|
|
1029 my $bamfile = shift;
|
|
1030 my $WriteBam = shift;
|
|
1031
|
|
1032 # write samfile header
|
|
1033 open(PIPE_TO_SAMTOOLS, "| samtools view -Sb -o $bamfile - 2>&1") or die "Can't fork: $!\n";
|
|
1034 foreach my $key (@OrderedContigs) {
|
|
1035 print PIPE_TO_SAMTOOLS "\@SQ\tSN:$refLength{$key}{newReg}\tLN:$refLength{$key}{length}\n";
|
|
1036 }
|
|
1037 print PIPE_TO_SAMTOOLS "\@RG\tID:$readGroup\tPL:Illumina\tSM:$sampleName\n";
|
|
1038
|
|
1039 my $read;
|
|
1040 {
|
|
1041 lock($WriteBam); # Keep other threads from changing the queue's contents
|
|
1042 $read = $WriteBam->dequeue();
|
|
1043 }
|
|
1044 while(defined $read) {
|
|
1045
|
|
1046 # write read to bamfile
|
|
1047 print PIPE_TO_SAMTOOLS $read;
|
|
1048
|
|
1049 # grab next item in queue
|
|
1050 {
|
|
1051 lock($WriteBam); # Keep other threads from changing the queue's contents
|
|
1052 $read = $WriteBam->dequeue();
|
|
1053 }
|
|
1054 }
|
|
1055 print "Closing Bam Writer\n";
|
|
1056 close PIPE_TO_SAMTOOLS or die "pipe could not be closed";
|
|
1057 return;
|
|
1058 }
|
|
1059
|
|
1060 sub thread_process_align {
|
|
1061 my $Aligned = shift;
|
|
1062 my $refFrags = shift;
|
|
1063 my $refSeq = shift;
|
|
1064 my $Samfile = shift;
|
|
1065 my $WriteBam = shift;
|
|
1066
|
|
1067 my $readArrayRef;
|
|
1068 {
|
|
1069 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
1070 $readArrayRef = $Aligned->dequeue();
|
|
1071 }
|
|
1072 while(defined $readArrayRef) {
|
|
1073 # grab the alignment
|
|
1074 my ($read, $ref1, $read1, $qtrim1, $ref2, $read2, $qtrim2, $contig, $strand, $conc) = @{ $readArrayRef };
|
|
1075 # grab primer lengths
|
|
1076 my $PU = $$refFrags{primers}{$contig}{U}{F}{length};
|
|
1077 my $PD = $$refFrags{primers}{$contig}{D}{F}{length};
|
|
1078 my %readStats;
|
|
1079 my $orig_read1 = $read1;
|
|
1080 my $orig_read2 = $read2;
|
|
1081 my $orig_qtrim1 = $qtrim1;
|
|
1082 my $orig_qtrim2 = $qtrim2;
|
|
1083 $orig_read1 =~ s/-//g if defined $orig_read1;
|
|
1084 $orig_read2 =~ s/-//g if defined $orig_read2;
|
|
1085
|
|
1086 if(defined $OPTIONS{d}) {
|
|
1087 # debug info
|
|
1088 print "Alignment With Primers\n";
|
|
1089 print "$ref1\n$read1\n" if defined $read1;
|
|
1090 print "$ref2\n$read2\n" if defined $read2;
|
|
1091 print "\nFinal Alignment:\n";
|
|
1092 }
|
|
1093
|
|
1094 # calculate where primers begin and end
|
|
1095 if(defined $read1) {
|
|
1096 if($strand eq 'F') {
|
|
1097 $readStats{$contig}{Read1} = 1;
|
|
1098 } else {
|
|
1099 $readStats{$contig}{Read2} = 1;
|
|
1100 }
|
|
1101
|
|
1102 # determine the primer locations
|
|
1103 my $offset = 0;
|
|
1104 for(my $occurance = 0; $occurance < $PU; $occurance++) {
|
|
1105 substr($ref1, $offset) =~ /^.*?([^-])/;
|
|
1106 $offset += $-[1] + 1;
|
|
1107 }
|
|
1108 my $length = 0;
|
|
1109 for(my $occurance = 0; $occurance < $PD; $occurance++) {
|
|
1110 substr($ref1, 0, length($ref1) - $length) =~ /.*([^-]).*?$/;
|
|
1111 $length = length($ref1) - $-[1];
|
|
1112 }
|
|
1113 $length = length($ref1) - $length - $offset;
|
|
1114
|
|
1115 # remove primers and adjust quality scores
|
|
1116 $ref1 = substr($ref1, $offset, $length);
|
|
1117 my $qualOffset = ( substr($read1, 0, $offset) =~ tr/ACTGN// );
|
|
1118 $read1 = substr($read1, $offset, $length);
|
|
1119 $length = ( $read1 =~ tr/ACTGN// );
|
|
1120 $qtrim1 = substr($qtrim1, $qualOffset, $length);
|
|
1121 }
|
|
1122
|
|
1123 if(defined $read2) {
|
|
1124 if($strand eq 'F') {
|
|
1125 $readStats{$contig}{Read2} = 1;
|
|
1126 } else {
|
|
1127 $readStats{$contig}{Read1} = 1;
|
|
1128 }
|
|
1129
|
|
1130 # determine the primer locations
|
|
1131 my $offset = 0;
|
|
1132 for(my $occurance = 0; $occurance < $PU; $occurance++) {
|
|
1133 substr($ref2, $offset) =~ /^.*?([^-])/;
|
|
1134 $offset += $-[1] + 1;
|
|
1135 }
|
|
1136 my $length = 0;
|
|
1137 for(my $occurance = 0; $occurance < $PD; $occurance++) {
|
|
1138 substr($ref2, 0, length($ref2) - $length) =~ /.*([^-]).*?$/;
|
|
1139 $length = length($ref2) - $-[1];
|
|
1140 }
|
|
1141 $length = length($ref2) - $length - $offset;
|
|
1142
|
|
1143 # remove primers and adjust quality scores
|
|
1144 $ref2 = substr($ref2, $offset, $length);
|
|
1145 my $qualOffset = ( substr($read2, 0, $offset) =~ tr/ACTGN// );
|
|
1146 $read2 = substr($read2, $offset, $length);
|
|
1147 $length = ( $read2 =~ tr/ACTGN// );
|
|
1148 $qtrim2 = substr($qtrim2, $qualOffset, $length);
|
|
1149 }
|
|
1150
|
|
1151 # generate a concensus sequence if we have both read1 and read2 and they overlap
|
|
1152 if($conc) {
|
|
1153 # the following are 0 based indexes
|
|
1154 my $R1qual = 0; # index for sanger qual read 1
|
|
1155 my $R2qual = 0; # index for sanger qual read 2
|
|
1156 my $aindex = 0; # index for alignment
|
|
1157 my $q1;
|
|
1158 my $q2;
|
|
1159 my $concensus;
|
|
1160 my $concqual;
|
|
1161
|
|
1162 $readStats{$contig}{ConcReads} = 1;
|
|
1163 $readStats{$contig}{ConcCorrectBases} = 0;
|
|
1164 $readStats{$contig}{ConcErrorsBases} = 0;
|
|
1165 while($aindex < length($ref1)) {
|
|
1166 # base1 is current base from read 1
|
|
1167 # base2 is current base from read 2
|
|
1168 my $base1 = substr($read1, $aindex, 1);
|
|
1169 my $base2 = substr($read2, $aindex, 1);
|
|
1170
|
|
1171 # grab the next quality score
|
|
1172 if($R1qual < length($qtrim1)) {
|
|
1173 $q1 = substr($qtrim1, $R1qual, 1);
|
|
1174 }
|
|
1175 if($R2qual < length($qtrim2)) {
|
|
1176 $q2 = substr($qtrim2, $R2qual, 1);
|
|
1177 }
|
|
1178
|
|
1179 # if we have a base in either
|
|
1180 if(($base1 ne '-' && $base1 ne 'N') || ($base2 ne '-' && $base2 ne 'N')) {
|
|
1181 if(($base2 eq "-" || $base2 eq 'N') && $base1 ne "-" && $base1 ne 'N') {
|
|
1182 # we are only in read 1 or read 2 is a N
|
|
1183 $concensus .= $base1;
|
|
1184 $concqual .= $q1;
|
|
1185 $R1qual++;
|
|
1186 if($base2 eq 'N') {
|
|
1187 $R2qual++;
|
|
1188 }
|
|
1189 } elsif(($base1 eq "-" || $base1 eq 'N') && $base2 ne "-" && $base2 ne 'N') {
|
|
1190 # we are only in read 2 or read 1 is a N
|
|
1191 $concensus .= $base2;
|
|
1192 $concqual .= $q2;
|
|
1193 $R2qual++;
|
|
1194 if($base1 eq 'N') {
|
|
1195 $R1qual++;
|
|
1196 }
|
|
1197 } else {
|
|
1198 # we are in the overlap region
|
|
1199 my $maxqual = sangerMax($q1, $q2);
|
|
1200 if($base1 eq $base2) {
|
|
1201 # both reads agree
|
|
1202 $readStats{$contig}{ConcCorrectBases}++;
|
|
1203 $concensus .= $base1;
|
|
1204 $concqual .= $maxqual;
|
|
1205 } elsif($q1 eq $q2) {
|
|
1206 # reads have different bases but the same score
|
|
1207 $readStats{$contig}{ConcErrorsBases}++;
|
|
1208 $concensus .= 'N';
|
|
1209 $concqual .= '#';
|
|
1210 } elsif($q1 eq $maxqual) {
|
|
1211 # reads have different bases but read 1 has the best score
|
|
1212 $concensus .= $base1;
|
|
1213 $concqual .= $maxqual;
|
|
1214 $readStats{$contig}{ConcErrorsBases}++;
|
|
1215 } else {
|
|
1216 # reads have different bases but read 2 has the best score
|
|
1217 $concensus .= $base2;
|
|
1218 $concqual .= $maxqual;
|
|
1219 $readStats{$contig}{ConcErrorsBases}++;
|
|
1220 }
|
|
1221 $R1qual++;
|
|
1222 $R2qual++;
|
|
1223 }
|
|
1224 } elsif($base1 eq 'N' || $base2 eq 'N') {
|
|
1225 if($base2 eq '-') {
|
|
1226 # one is a deletion the other is an N
|
|
1227 $concensus .= $base1;
|
|
1228 $concqual .= $q1;
|
|
1229 $R1qual++;
|
|
1230 } elsif($base1 eq '-') {
|
|
1231 # one is a deletion the other is an N
|
|
1232 $concensus .= $base2;
|
|
1233 $concqual .= $q2;
|
|
1234 $R2qual++;
|
|
1235 } elsif($base1 eq 'N' && $base2 eq 'N') {
|
|
1236 # both reads are an N
|
|
1237 $concensus .= 'N';
|
|
1238 $concqual .= sangerMax($q1, $q2);
|
|
1239 $R1qual++;
|
|
1240 $R2qual++;
|
|
1241 }
|
|
1242 } else {
|
|
1243 $concensus .= '-';
|
|
1244 }
|
|
1245 $aindex++;
|
|
1246 }
|
|
1247 $read1 = $concensus;
|
|
1248 $qtrim1 = $concqual;
|
|
1249 $read2 = undef;
|
|
1250 $qtrim2 = undef;
|
|
1251 }
|
|
1252
|
|
1253 if(defined $read1) {
|
|
1254 # remove any trailing - and send to samfile_line
|
|
1255 $read1 =~ s/^-*?([^-].*)/$1/;
|
|
1256 $ref1 = substr($ref1, -length($read1));
|
|
1257 $read1 =~ s/(.*[^-])-*?$/$1/;
|
|
1258 $ref1 = substr($ref1, 0, length($read1));
|
|
1259
|
|
1260 my $tmp = $ref1;
|
|
1261 $tmp =~ s/-//g;
|
|
1262 my $rstart = index($$refSeq{$contig}, $tmp) - $PU + 1;
|
|
1263
|
|
1264 # generate sam file line
|
|
1265 {
|
|
1266 lock($Samfile); # Keep other threads from changing the queue's contents
|
|
1267 my @array = ($read, $qtrim1, $rstart, $read1, $ref1, $contig, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, \%readStats);
|
|
1268 $Samfile->enqueue(\@array);
|
|
1269 }
|
|
1270 }
|
|
1271
|
|
1272 if(defined $read2) {
|
|
1273 # remove any trailing - and send to samfile_line
|
|
1274 $read2 =~ s/^-*?([^-].*)/$1/;
|
|
1275 $ref2 = substr($ref2, -length($read2));
|
|
1276 $read2 =~ s/(.*[^-])-*?$/$1/;
|
|
1277 $ref2 = substr($ref2, 0, length($read2));
|
|
1278
|
|
1279 my $tmp = $ref2;
|
|
1280 $tmp =~ s/-//g;
|
|
1281 my $rstart = rindex($$refSeq{$contig}, $tmp) - $PU + 1;
|
|
1282
|
|
1283 # generate sam file line
|
|
1284 {
|
|
1285 lock($Samfile); # Keep other threads from changing the queue's contents
|
|
1286 my @array = ($read, $qtrim2, $rstart, $read2, $ref2, $contig, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, \%readStats);
|
|
1287 $Samfile->enqueue(\@array);
|
|
1288 }
|
|
1289 }
|
|
1290
|
|
1291 if(defined $OPTIONS{d}) {
|
|
1292 # debug info
|
|
1293 print "$ref1\n$read1\n" if defined $read1;
|
|
1294 print "$ref2\n$read2\n" if defined $read2;
|
|
1295 print "$qtrim1\n" if defined $read1;
|
|
1296 print "$qtrim2\n" if defined $read2;
|
|
1297 print "\n\n";
|
|
1298 }
|
|
1299
|
|
1300 # grab next item in queue
|
|
1301 {
|
|
1302 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
1303 $readArrayRef = $Aligned->dequeue();
|
|
1304 }
|
|
1305 }
|
|
1306 print "Closing Alignment Processor\n";
|
|
1307 return;
|
|
1308 }
|
|
1309
|
|
1310 # function to take reverse compliment
|
|
1311 sub revcompl {
|
|
1312 my $string = shift @_;
|
|
1313 my $revcomp = reverse($string);
|
|
1314 $revcomp =~ tr/ACGTacgtKMSWRYBDHVkmswrybdhv/TGCAtgcaMKSWYRVHDBmkswyrvhdb/;
|
|
1315 return $revcomp;
|
|
1316 }
|
|
1317
|
|
1318 # function to convert sanger fastq quality scores to phred scores
|
|
1319 sub sanger2phred {
|
|
1320 my $sang = shift @_;
|
|
1321 my @sanger = split("", $sang);
|
|
1322 my @phred;
|
|
1323 foreach my $q (@sanger) {
|
|
1324 my $Q = ord($q) - 33;
|
|
1325 push @phred, $Q;
|
|
1326 }
|
|
1327 return @phred;
|
|
1328 }
|
|
1329
|
|
1330 # function to return the largest sanger quality score
|
|
1331 sub sangerMax {
|
|
1332 my ($s1, $s2) = @_;
|
|
1333 my $Q1 = ord($s1) - 33;
|
|
1334 my $Q2 = ord($s2) - 33;
|
|
1335 if($Q1 > $Q2) {
|
|
1336 return $s1;
|
|
1337 } else {
|
|
1338 return $s2;
|
|
1339 }
|
|
1340 }
|
|
1341
|
|
1342 # function to read a fasta file as a hash
|
|
1343 sub read_fasta_file {
|
|
1344 my ($fasta_file, $seqs) = @_;
|
|
1345 my $header;
|
|
1346 my $seq = "";
|
|
1347 open (IN, $fasta_file) or die "can't open $fasta_file: $!\n";
|
|
1348 while (my $line = <IN>) {
|
|
1349 chomp $line;
|
|
1350 if (grep />/, $line) {
|
|
1351 if ($seq ne "") {
|
|
1352 $seqs->{$header} = $seq;
|
|
1353 }
|
|
1354 $header = $line;
|
|
1355
|
|
1356 $header =~ s/^>//; # remove ">"
|
|
1357 $header =~ s/\s+$//; # remove trailing whitespace
|
|
1358 $seq = ""; # clear out old sequence
|
|
1359 } else {
|
|
1360 $line =~ s/\s+//g; # remove whitespace
|
|
1361 $seq .= $line; # add sequence
|
|
1362 }
|
|
1363 }
|
|
1364 close IN; # finished with file
|
|
1365
|
|
1366 if ($seq ne "") { # handle last sequence
|
|
1367 $seqs->{$header} = $seq;
|
|
1368 }
|
|
1369
|
|
1370 return;
|
|
1371 }
|
|
1372
|
|
1373 # minimal element of a list
|
|
1374 #
|
|
1375 sub min
|
|
1376 {
|
|
1377 my @list = @{$_[0]};
|
|
1378 my $min = $list[0];
|
|
1379
|
|
1380 foreach my $i (@list)
|
|
1381 {
|
|
1382 $min = $i if ($i < $min);
|
|
1383 }
|
|
1384
|
|
1385 return $min;
|
|
1386 }
|
|
1387
|
|
1388 # maximal element of a list
|
|
1389 #
|
|
1390 sub max
|
|
1391 {
|
|
1392 my @list = @{$_[0]};
|
|
1393 my $max = $list[0];
|
|
1394
|
|
1395 foreach my $i (@list)
|
|
1396 {
|
|
1397 $max = $i if ($i > $max);
|
|
1398 }
|
|
1399
|
|
1400 return $max;
|
|
1401 }
|
|
1402
|
|
1403 # trim sequences using the BWA algorithm
|
|
1404 sub trim {
|
|
1405
|
|
1406 my ($s, $q, $threshold, $read) = @_; # read sequence and quality scores
|
|
1407 if($read == 2) {
|
|
1408 $q = reverse($q);
|
|
1409 $s = reverse($s);
|
|
1410 }
|
|
1411
|
|
1412 my @array = sanger2phred($q);
|
|
1413 my $length = scalar @array;
|
|
1414
|
|
1415 # only calculate if quality fails near end
|
|
1416 if( $array[$#array] >= $threshold ){
|
|
1417 if($read == 2) {
|
|
1418 $q = reverse($q);
|
|
1419 $s = reverse($s);
|
|
1420 }
|
|
1421 return ($s, $q);
|
|
1422 }
|
|
1423
|
|
1424 # run bwa equation
|
|
1425 my @arg;
|
|
1426 for( my $i = 0; $i < $length - 1; $i++ ){
|
|
1427 my $x = $i + 1;
|
|
1428 for( my $j = $x; $j < $length; $j++ ){
|
|
1429 $arg[$x] += $threshold - $array[$j];
|
|
1430 }
|
|
1431 }
|
|
1432
|
|
1433 # find number of 5' bases to retain
|
|
1434 my $index = 0;
|
|
1435 my $maxval = 0;
|
|
1436 for ( 1 .. $#arg ){
|
|
1437 if ( $maxval < $arg[$_] ){
|
|
1438 $index = $_;
|
|
1439 $maxval = $arg[$_];
|
|
1440 }
|
|
1441 }
|
|
1442
|
|
1443 # trim reads
|
|
1444 $s = substr($s,0,$index);
|
|
1445 $q = substr($q,0,$index);
|
|
1446 if($read == 2) {
|
|
1447 $s = reverse($s);
|
|
1448 $q = reverse($q);
|
|
1449 }
|
|
1450
|
|
1451 # return seq and quality
|
|
1452 return ($s, $q);
|
|
1453 }
|
|
1454
|
|
1455
|
|
1456 sub samfile_line {
|
|
1457 # grab the variables
|
|
1458 my ($Samfile, $WriteBam, $refSeq) = @_;
|
|
1459 my %SamStats;
|
|
1460 my %minLength;
|
|
1461 my %maxLength;
|
|
1462 my $readArrayRef;
|
|
1463 {
|
|
1464 lock($Samfile); # Keep other threads from changing the queue's contents
|
|
1465 $readArrayRef = $Samfile->dequeue();
|
|
1466 }
|
|
1467
|
|
1468 while(defined $readArrayRef) {
|
|
1469 my ($read, $qual, $rstart, $align, $ref, $reg, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, $readStats) = @{ $readArrayRef };
|
|
1470 my $seq = $align;
|
|
1471 $seq =~ s/-//g;
|
|
1472
|
|
1473 # calculate the edit difference - count indels as only 1 missmatch hence tranform the strings so that all indels have length 1
|
|
1474 my $sam;
|
|
1475 my $s1;
|
|
1476 my $s2;
|
|
1477 my @s1 = split("", $align);
|
|
1478 my @s2 = split("", $ref);
|
|
1479
|
|
1480 my $indel = 0;
|
|
1481 for(my $i = 0; $i <= $#s1; $i++) {
|
|
1482 if(($s1[$i] ne "-" && $s2[$i] ne "-") || !$indel) {
|
|
1483 $s1 .= $s1[$i];
|
|
1484 $s2 .= $s2[$i];
|
|
1485 $indel = 0;
|
|
1486 if($s1[$i] eq "-" || $s2[$i] eq "-") {
|
|
1487 $indel = 1;
|
|
1488 }
|
|
1489 }
|
|
1490 }
|
|
1491 my $mq = levenshtein($s1, $s2);
|
|
1492
|
|
1493 my $cl = length($seq);
|
|
1494 # calculate max diff allowed
|
|
1495 if(!defined $maxdiff{$cl}) {
|
|
1496 { lock(%maxdiff); $maxdiff{$cl} = maxdiff($cl, 0.02, 0.04); }
|
|
1497 }
|
|
1498
|
|
1499 if($mq > $maxdiff{$cl} || $cl == 0) {
|
|
1500 { lock(%stats); $stats{$reg}{PoorMapQual}++; $stats{PoorMapQual}++; }
|
|
1501 if(defined $orig_read1) {
|
|
1502 $sam = "$read\t4\t*\t0\t0\t*\t*\t0\t0\t$orig_read1\t$orig_qtrim1\tRG:Z:$readGroup\n";
|
|
1503 # send to writebamfile thread
|
|
1504 {
|
|
1505 lock($WriteBam); # Keep other threads from changing the queue's contents
|
|
1506 $WriteBam->enqueue($sam);
|
|
1507 }
|
|
1508 }
|
|
1509 if(defined $orig_read2) {
|
|
1510 $read =~ s/[1](:[A-Z]+:[0-9]+:[ACTGN]+)$/2$1/;
|
|
1511 $sam = "$read\t4\t*\t0\t0\t*\t*\t0\t0\t$orig_read2\t$orig_qtrim2\tRG:Z:$readGroup\n";
|
|
1512 # send to writebamfile thread
|
|
1513 {
|
|
1514 lock($WriteBam); # Keep other threads from changing the queue's contents
|
|
1515 $WriteBam->enqueue($sam);
|
|
1516 }
|
|
1517 }
|
|
1518 } else {
|
|
1519 $mq = 60;
|
|
1520
|
|
1521 # add any stats from earlier
|
|
1522 foreach my $key1 (keys %$readStats) {
|
|
1523 foreach my $key2 (keys % { $$readStats{$key1} }) {
|
|
1524 $SamStats{$key1}{$key2} += $$readStats{$key1}{$key2};
|
|
1525 }
|
|
1526 }
|
|
1527
|
|
1528 # keep track of quality scores
|
|
1529 $SamStats{$reg}{Bases} += $cl;
|
|
1530 $SamStats{$reg}{Quality} += $_ for sanger2phred($qual);
|
|
1531
|
|
1532 # keep track of read lengths
|
|
1533 if(defined $minLength{$reg}) {
|
|
1534 $minLength{$reg} = min([$cl, $minLength{$reg}]);
|
|
1535 $maxLength{$reg} = max([$cl, $maxLength{$reg}]);
|
|
1536 } else {
|
|
1537 $minLength{$reg} = $cl;
|
|
1538 $maxLength{$reg} = $cl;
|
|
1539 }
|
|
1540
|
|
1541 # keep track of strand and bases seen
|
|
1542 if($strand eq 'F') {
|
|
1543 $SamStats{$reg}{NumberForward}++;
|
|
1544 } else {
|
|
1545 $SamStats{$reg}{NumberReverse}++;
|
|
1546 }
|
|
1547 $SamStats{$reg}{TotalA} += ($seq =~ tr/A//);
|
|
1548 $SamStats{$reg}{TotalC} += ($seq =~ tr/C//);
|
|
1549 $SamStats{$reg}{TotalG} += ($seq =~ tr/G//);
|
|
1550 $SamStats{$reg}{TotalT} += ($seq =~ tr/T//);
|
|
1551 $SamStats{$reg}{TotalN} += ($seq =~ tr/N//);
|
|
1552
|
|
1553 # keep count the number of reads for each contig
|
|
1554 $SamStats{$reg}{Reads}++;
|
|
1555
|
|
1556 # calculate strand info
|
|
1557 if($strand eq 'F') {
|
|
1558 $strand = 0;
|
|
1559 } else {
|
|
1560 $strand = 16;
|
|
1561 }
|
|
1562 # calculate cigar
|
|
1563 my $cigar = "";
|
|
1564 my $M = 0;
|
|
1565 my $D = 0;
|
|
1566 my $I = 0;
|
|
1567 my $Subs = 0;
|
|
1568 for(my $i = 0; $i < length($align); $i++) {
|
|
1569 if(substr($align, $i, 1) eq "-") {
|
|
1570 $D++;
|
|
1571 if($M) {
|
|
1572 $cigar .= "${M}M";
|
|
1573 $M = 0;
|
|
1574 } elsif($I) {
|
|
1575 $cigar .= "${I}I";
|
|
1576 $I = 0;
|
|
1577 }
|
|
1578 } elsif(substr($ref, $i, 1) eq "-") {
|
|
1579 $I++;
|
|
1580 if($M) {
|
|
1581 $cigar .= "${M}M";
|
|
1582 $M = 0;
|
|
1583 } elsif($D) {
|
|
1584 $cigar .= "${D}D";
|
|
1585 $D = 0;
|
|
1586 }
|
|
1587 } else {
|
|
1588 $M++;
|
|
1589 if($D) {
|
|
1590 $cigar .= "${D}D";
|
|
1591 $D = 0;
|
|
1592 } elsif($I) {
|
|
1593 $cigar .= "${I}I";
|
|
1594 $I = 0;
|
|
1595 }
|
|
1596 $Subs++ if substr($ref, $i, 1) ne substr($align, $i, 1);
|
|
1597 }
|
|
1598 }
|
|
1599 if($M) {
|
|
1600 $cigar .= "${M}M";
|
|
1601 } elsif($I) {
|
|
1602 $cigar .= "${I}I";
|
|
1603 } elsif($D) {
|
|
1604 $cigar .= "${D}D";
|
|
1605 }
|
|
1606
|
|
1607 # construct sam file line
|
|
1608 $sam = "$read\t$strand\t$refLength{$reg}{newReg}\t$rstart\t$mq\t$cigar\t*\t0\t0\t$seq\t$qual\tRG:Z:$readGroup\n";
|
|
1609
|
|
1610 # send to writebamfile thread
|
|
1611 {
|
|
1612 lock($WriteBam); # Keep other threads from changing the queue's contents
|
|
1613 $WriteBam->enqueue($sam);
|
|
1614 }
|
|
1615 }
|
|
1616
|
|
1617 #grab the next read
|
|
1618 {
|
|
1619 lock($Samfile); # Keep other threads from changing the queue's contents
|
|
1620 $readArrayRef = $Samfile->dequeue();
|
|
1621 }
|
|
1622 }
|
|
1623
|
|
1624 {
|
|
1625 # add stats from earlier
|
|
1626 lock(%stats);
|
|
1627 foreach my $key1 (keys %SamStats) {
|
|
1628 $stats{$key1}{MinLength} = min([$minLength{$key1}, $stats{$key1}{MinLength}]);
|
|
1629 $stats{$key1}{MaxLength} = max([$maxLength{$key1}, $stats{$key1}{MaxLength}]);
|
|
1630 foreach my $key2 (keys % { $SamStats{$key1} }) {
|
|
1631 $stats{$key1}{$key2} += $SamStats{$key1}{$key2};
|
|
1632 }
|
|
1633 }
|
|
1634
|
|
1635 }
|
|
1636 print "Closing Sam File Line Generator\n";
|
|
1637
|
|
1638 return;
|
|
1639 }
|
|
1640
|
|
1641 # perform the alignment
|
|
1642 sub nwAlign {
|
|
1643 my ($ToAlign, $Aligned, $regions) = @_;
|
|
1644 my $readArrayRef;
|
|
1645 {
|
|
1646 lock($ToAlign); # Keep other threads from changing the queue's contents
|
|
1647 $readArrayRef = $ToAlign->dequeue();
|
|
1648 }
|
|
1649
|
|
1650 while(defined $readArrayRef) {
|
|
1651 # we have picked a contig
|
|
1652 # check strand and take rev comp of ref if necessary
|
|
1653 my ($ref, $left, $right, $read, $qtrim1, $qtrim2, $contig, $strand, $conc) = @{ $readArrayRef };
|
|
1654 my %align;
|
|
1655 my $name;
|
|
1656 #! Parameters
|
|
1657 #! ClustalW Parameters
|
|
1658 my $matchScore = 1.9;
|
|
1659 my $missmatch_penalty = -0.4;
|
|
1660 my $gapPenOpen = -10.0;
|
|
1661 my $gapPenExt = -0.4;
|
|
1662 my $leftAndRightGapPen = true;
|
|
1663 my $debug = false;
|
|
1664 my $cons = "";
|
|
1665 my $align1= "";
|
|
1666 my $align2= "";
|
|
1667 my $align3= "";
|
|
1668 my $score = 0.0;
|
|
1669 my $singleAlign = false; #aligning 2 or 3 sequences
|
|
1670 if($conc)
|
|
1671 {
|
|
1672 #got to concatenate seqs and align
|
|
1673 $score = align3($ref, $right, $left, $align1, $align2, $align3, $matchScore, $missmatch_penalty, $gapPenOpen, $gapPenExt, $leftAndRightGapPen, $debug);
|
|
1674 $name = "$left.$right";
|
|
1675 print "$contig\n$align1\n$align2\n$align3\n\n" if defined $$regions{$contig};
|
|
1676 }
|
|
1677 elsif(!defined $right)
|
|
1678 {
|
|
1679 #align with left
|
|
1680 $score = align2($ref, $left, $align1, $align2, $matchScore, $missmatch_penalty, $gapPenOpen, $gapPenExt, false, true, $debug);
|
|
1681 $singleAlign = true;
|
|
1682 $name = $left;
|
|
1683 print "$contig\n$align1\n$align2\n\n" if defined $$regions{$contig};
|
|
1684 }
|
|
1685 elsif(!defined $left)
|
|
1686 {
|
|
1687 #align with right
|
|
1688 $score = align2($ref, $right, $align1, $align2, $matchScore, $missmatch_penalty, $gapPenOpen ,$gapPenExt ,true, false, $debug);
|
|
1689 $singleAlign = true;
|
|
1690 $name = $right;
|
|
1691 print "$contig\n$align1\n$align2\n\n" if defined $$regions{$contig};
|
|
1692 }
|
|
1693
|
|
1694 # add aligned reads to queue
|
|
1695 # cache results
|
|
1696 my $type;
|
|
1697 {
|
|
1698 lock($Aligned); # Keep other threads from changing the queue's contents
|
|
1699
|
|
1700 my @array;
|
|
1701 if($singleAlign && !defined $right)
|
|
1702 {
|
|
1703 @array = ($read, $align1, $align2, $qtrim1, undef, undef, undef, $contig, $strand, $conc);
|
|
1704 {
|
|
1705 lock(%cacheAlign);
|
|
1706 $cacheAlign{$contig}{R1}{$name} = &share( [] );
|
|
1707 @{ $cacheAlign{$contig}{R1}{$name} } = @array;
|
|
1708 }
|
|
1709 }
|
|
1710 elsif($singleAlign && !defined $left )
|
|
1711 {
|
|
1712 @array = ($read, undef, undef, undef, $align1, $align2, $qtrim2, $contig, $strand, $conc);
|
|
1713 {
|
|
1714 lock(%cacheAlign);
|
|
1715 $cacheAlign{$contig}{R2}{$name} = &share( [] );
|
|
1716 @{ $cacheAlign{$contig}{R2}{$name} } = @array;
|
|
1717 }
|
|
1718 }
|
|
1719 else
|
|
1720 {
|
|
1721 @array = ($read, $align1, $align2, $qtrim1, $align1, $align3, $qtrim2, $contig, $strand, $conc);
|
|
1722 {
|
|
1723 lock(%cacheAlign);
|
|
1724 $cacheAlign{Conc}{$name} = &share( [] );
|
|
1725 @{ $cacheAlign{Conc}{$name} } = @array;
|
|
1726 }
|
|
1727 }
|
|
1728 $Aligned->enqueue(\@array);
|
|
1729 }
|
|
1730
|
|
1731 #grab the next read
|
|
1732 {
|
|
1733 lock($ToAlign); # Keep other threads from changing the queue's contents
|
|
1734 $readArrayRef = $ToAlign->dequeue();
|
|
1735 }
|
|
1736 }
|
|
1737 # clean up
|
|
1738 print "Closing NW2 Aligner\n";
|
|
1739 return;
|
|
1740 }
|
|
1741
|
|
1742 # function to calculat the minimum edit distance threshold (taken from bwa)
|
|
1743 sub maxdiff {
|
|
1744 my ($l, $err, $thres) = @_;
|
|
1745 my $lambda = exp(-$l*$err);
|
|
1746 my $y = 1.0;
|
|
1747 my $x = 1;
|
|
1748 my $sum = $lambda;
|
|
1749 for(my $k = 1; $k < 1000; $k++) {
|
|
1750 $y *= $l * $err;
|
|
1751 $x *= $k;
|
|
1752 $sum += $lambda * $y / $x;
|
|
1753 if(1.0 - $sum < $thres) {
|
|
1754 return $k;
|
|
1755 }
|
|
1756 }
|
|
1757 return 2;
|
|
1758 }
|
|
1759
|
|
1760 # using approx matching return the positions in $list of those most closely matching $string
|
|
1761 sub amatch {
|
|
1762 my ($string, $list) = @_;
|
|
1763 my @d = adistr($string, @$list);
|
|
1764 my @d_index = sort { abs($d[$a]) <=> abs($d[$b]) } 0..$#$list;
|
|
1765
|
|
1766 return if $d[$d_index[0]] > 0.25;
|
|
1767
|
|
1768 my @pos = ($d_index[0]);
|
|
1769 my $i = 1;
|
|
1770 while($i <= $#d_index && $d[$d_index[0]] == $d[$d_index[$i]]) {
|
|
1771 push @pos, $d_index[$i];
|
|
1772 $i++;
|
|
1773 }
|
|
1774 return @pos;
|
|
1775 }
|
|
1776
|
|
1777 # a sorting algorithm to sort the chromosomes
|
|
1778 sub sortChrom {
|
|
1779 my ($achr, $astart, $aend) = split(":|-", $a);
|
|
1780 my ($bchr, $bstart, $bend) = split(":|-", $b);
|
|
1781 $achr =~ s/X/23/;
|
|
1782 $achr =~ s/Y/24/;
|
|
1783 $achr =~ s/MT?/25/;
|
|
1784 $bchr =~ s/X/23/;
|
|
1785 $bchr =~ s/Y/24/;
|
|
1786 $bchr =~ s/MT?/25/;
|
|
1787 if($achr < $bchr) {
|
|
1788 return -1;
|
|
1789 } elsif($bchr < $achr) {
|
|
1790 return 1;
|
|
1791 } elsif($astart < $bstart) {
|
|
1792 return -1;
|
|
1793 } elsif($bstart < $astart) {
|
|
1794 return 1;
|
|
1795 } elsif($aend < $bend) {
|
|
1796 return -1;
|
|
1797 } elsif($bend < $aend) {
|
|
1798 return 1;
|
|
1799 } else {
|
|
1800 return 0;
|
|
1801 }
|
|
1802 }
|
|
1803
|
|
1804 # pass a code reference
|
|
1805 sub modify {
|
|
1806 my($text, $code) = @_;
|
|
1807 $code->($text);
|
|
1808 return $text;
|
|
1809 }
|
|
1810
|
|
1811 sub split_region {
|
|
1812 my ($chr, $start, $end) = split(":|-", shift);
|
|
1813 $end = $start unless defined $end;
|
|
1814 return($chr, $start, $end);
|
|
1815 }
|
|
1816
|
|
1817 sub error_rate {
|
|
1818 my ($x, $y) = @_;
|
|
1819 my @x = split("", $x);
|
|
1820 my @y = split("", $y);
|
|
1821 my $s = 0;
|
|
1822 my $l = 0;
|
|
1823 for(my $i = 0; $i <= $#x; $i++) {
|
|
1824 if($x[$i] ne "-" && $y[$i] ne "-") {
|
|
1825 $s++ if $x[$i] ne $y[$i];
|
|
1826 $l++;
|
|
1827 }
|
|
1828 }
|
|
1829 return($s / $l);
|
|
1830 }
|
|
1831
|
|
1832
|