annotate alignCustomAmplicon/alignCustomAmplicon.pl @ 5:0aaf65fbb48a draft default tip

Uploaded
author fcaramia
date Wed, 20 Mar 2013 00:22:08 -0400
parents 22f35f830f48
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1 #!/usr/bin/perl
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
2
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
3 # script to convert paired end reads to a concensus sequence
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
4
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
5 # Created by Jason Ellul, Feb 2012
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
6 # Modified by Franco Caramia and Jason Ellul, July 2012 and November 2012
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
7 # 0.4 Fixed mapping of concensus region (large insertions and deletions now picked up correctly)
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
8 # Added missalignment penalty
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
9 # Read 1 and 2 are now aligned first then they are aligned to the reference
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
10 # 0.5 Added ability to name amplicons by includinfo the info in the primers file
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
11 # Added the ability to output the alignments in a particular region using the new flag -a
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
12 # Added some extra stats to the output
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
13
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
14 use strict;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
15 use warnings;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
16 use Getopt::Std;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
17 use File::Basename;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
18 use File::Find;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
19 use File::Copy;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
20 use File::MMagic;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
21 use File::Temp qw/ tempfile tempdir /;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
22 use Cwd 'abs_path';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
23 use threads;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
24 use threads::shared;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
25 use List::Util qw(sum);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
26 use Thread::Queue;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
27 use String::Approx 'adistr';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
28 use POSIX qw(floor ceil);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
29 String::Approx::cache_max(25000);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
30 String::Approx::cache_purge(0.25);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
31 use constant false => 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
32 use constant true => 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
33 use Inline CPP => Config => AUTO_INCLUDE => '#include "utils.cpp"';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
34 use Inline CPP => <<'END_CPP';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
35
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
36
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
37
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
38
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
39 int levenshtein(char* seq1, char* seq2)
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
40 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
41 // Returns edit distance of 2 sequences
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
42 string sam1(seq1);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
43 string sam2(seq2);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
44 return levenshteinDistance(sam1, sam2);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
45 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
46
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
47 double align2(char* seq1, char* seq2, SV* res1, SV* res2, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool noFrontGapPenalty, bool noEndGapPenalty, bool debug )
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
48 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
49 // Returns Sequence alignment of 2 sequences
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
50 // res1, res2 contain the alignment
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
51 //seq1: reference sequence
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
52 //seq2: amplicon sequence
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
53 //matchScore: reward for matching a nucleotide
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
54 //gapPenOpen: penalty for opening a gap
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
55 //gapPenext: penalty for extending a gap
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
56 //noFrontGapPenalty: No penalty will be impose for gaps at the beggining of the alignment
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
57 //noEndGapPenalty: No penalty will be impose for gaps at the end of the alignment
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
58 //debug: prints scores, alignments, etc...
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
59
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
60 string sam1(seq1);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
61 string sam2(seq2);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
62 double ret = nw_align2(sam1, sam2, matchScore, missmatch_penalty, gapPenOpen, gapPenExt, noFrontGapPenalty, noEndGapPenalty, debug);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
63 //Setting the return alignments
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
64 sv_setpvn(res1, (char*)sam1.c_str(), sam1.length());
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
65 sv_setpvn(res2, (char*)sam2.c_str(), sam2.length());
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
66 return ret;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
67 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
68
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
69 void adjustSequence(int refSize, string& seq, char dir)
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
70 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
71
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
72 int adjust = refSize - seq.length();
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
73 if(dir == 'r')
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
74 for(int i=0; i<adjust; ++i) seq = '-' + seq;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
75 else
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
76 for(int i=0; i<adjust; ++i) seq += '-';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
77 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
78
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
79 double align3(char* ref, char* right,char* left , SV* res1, SV* res2, SV* res3, double matchScore, double missmatch_penalty, double gapPenOpen, double gapPenExt, bool leftAndRightGapPen, bool debug )
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
80 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
81 // Returns Sequence alignment of 3 sequences
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
82 // res1, res2, res3 contain the alignment
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
83 //ref: reference sequence
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
84 //left: left amplicon sequence
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
85 //right: right amplicon sequence
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
86 //matchScore: reward for matching a nucleotide
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
87 //gapPenOpen: penalty for opening a gap
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
88 //gapPenext: penalty for extending a gap
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
89 //leftAndRightGapPen: Smart way to penalise according to the amplicon
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
90 //debug: prints scores, alignments, etc...
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
91
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
92
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
93 string leftSeq(left);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
94 string rightSeq(right);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
95
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
96 bool noFrontGapPenalty = false;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
97 bool noEndGapPenalty = false;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
98
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
99 //Align right with left reads
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
100 if(leftAndRightGapPen)
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
101 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
102 noFrontGapPenalty = true;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
103 noEndGapPenalty = true;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
104 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
105
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
106 //first alignment the 2 reads
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
107 double ret = nw_align2(rightSeq, leftSeq, matchScore, missmatch_penalty*8, gapPenOpen*5, gapPenExt, noFrontGapPenalty, noEndGapPenalty, debug);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
108
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
109 //Setting the return alignments
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
110 vector <string> msa;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
111 vector <string> partial;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
112 vector <string> seq;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
113
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
114 partial.push_back(rightSeq);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
115 partial.push_back(leftSeq);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
116 seq.push_back(ref);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
117
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
118 msa = nw_alignAlignments(partial, seq , matchScore, missmatch_penalty, gapPenOpen, gapPenExt, false, false, debug);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
119 sv_setpvn(res1, (char*)msa[2].c_str(), msa[2].length());
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
120 sv_setpvn(res2, (char*)msa[1].c_str(), msa[1].length());
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
121 sv_setpvn(res3, (char*)msa[0].c_str(), msa[0].length());
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
122 return ret;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
123 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
124
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
125 END_CPP
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
126
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
127 $| = 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
128
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
129 # Grab and set all options
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
130 my $version = "0.5";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
131 my %OPTIONS = (p => 1);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
132 getopts('a:dl:L:o:p:rsv', \%OPTIONS);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
133
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
134 die qq(version $version
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
135 Usage: alignCustomAmplicon.pl [OPTIONS] <ref> <fastq[.gz] 1> <fastq[.gz] 2> <primers>
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
136
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
137 OPTIONS: -a STR print the alignment for any amplicons covering a particular region [chr:loc || chr:start-stop]
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
138 -d print debug info
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
139 -l STR filename of the log file [null]
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
140 -L STR append to an existing log file [null]
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
141 -o STR filename for bam file
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
142 -p INT number of threads [$OPTIONS{p}]
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
143 -r do not create stats report
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
144 -s sort the bam file
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
145 -v verbose progress
4
22f35f830f48 Uploaded
fcaramia
parents: 0
diff changeset
146 -j STR picard jar
0
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
147 ) if(@ARGV != 4);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
148
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
149 my $Script = 'alignCustomAmplicon';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
150 my $ref = shift @ARGV;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
151 my $fastq1 = shift @ARGV;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
152 my $fastq2 = shift @ARGV;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
153 my $primers = shift @ARGV;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
154 my $now;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
155
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
156 #create log file
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
157 if(defined $OPTIONS{l}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
158 my $logfile = $OPTIONS{l};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
159 open (FH, ">$logfile") or die 'Couldn\'t create log file!\n';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
160 close (FH);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
161 # Open the log file and redirect output to it
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
162 open (STDERR, ">>$logfile");
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
163 open (STDOUT, ">>$logfile");
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
164 my $now = localtime time;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
165 print "Log File Created $now\n\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
166 } elsif(defined $OPTIONS{L}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
167 #append to a log file
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
168 my $logfile = $OPTIONS{L};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
169 # Open the log file and redirect output to it
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
170 open (STDERR, ">>$logfile") or die 'Couldn\'t create log file!\n';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
171 open (STDOUT, ">>$logfile") or die 'Couldn\'t create log file!\n';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
172 my $now = localtime time;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
173 print "Appending To Log File $now\n\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
174 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
175
4
22f35f830f48 Uploaded
fcaramia
parents: 0
diff changeset
176 if (defined $OPTIONS{j})
22f35f830f48 Uploaded
fcaramia
parents: 0
diff changeset
177 {
22f35f830f48 Uploaded
fcaramia
parents: 0
diff changeset
178 my $JAR = $OPTIONS{j};
22f35f830f48 Uploaded
fcaramia
parents: 0
diff changeset
179
22f35f830f48 Uploaded
fcaramia
parents: 0
diff changeset
180 }
22f35f830f48 Uploaded
fcaramia
parents: 0
diff changeset
181
0
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
182 # print version of this script
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
183 print "Using $Script version $version \n\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
184
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
185 if(defined $OPTIONS{d}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
186 print "Setting threads to 1 as using debug\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
187 $OPTIONS{p} = 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
188 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
189
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
190 # grab sample name, set output file name and open some files
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
191 my ($name, $path, $suffix) = fileparse($fastq1, ('.fastq', '.fastq.gz'));
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
192 my $sampleName :shared = $name;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
193 $sampleName =~ s/^(.*?_.*?)_.*/$1/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
194 open(PRIM, $primers) or die "can't open $primers: $!\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
195
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
196 # set samfile name
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
197 my $bamfile_out = "${sampleName}_aligned.bam";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
198 if(defined $OPTIONS{o}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
199 $bamfile_out = $OPTIONS{o};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
200 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
201 my $bam_to_sort = $bamfile_out;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
202 if(defined $OPTIONS{s}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
203 $bam_to_sort .= '.tmp';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
204 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
205
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
206 my $stats = $bamfile_out;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
207 if(!defined $OPTIONS{r}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
208 $stats =~ s/\..*?$//;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
209 open(STATS, ">${stats}_stats.csv");
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
210 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
211
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
212 # read in primer lengths
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
213 my %refFrags;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
214 $refFrags{min} = 5000000;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
215 while(my $line = <PRIM>) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
216 chomp $line;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
217 my @line_sp = split("\t", $line);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
218 $refFrags{primers}{$line_sp[0]}{U}{F}{length} = $line_sp[1];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
219 $refFrags{primers}{$line_sp[0]}{D}{F}{length} = $line_sp[2];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
220 $refFrags{primers}{$line_sp[0]}{U}{R}{length} = $line_sp[2];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
221 $refFrags{primers}{$line_sp[0]}{D}{R}{length} = $line_sp[1];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
222 $refFrags{name}{$line_sp[0]} = $line_sp[3];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
223 $refFrags{min} = min([$refFrags{min}, $line_sp[1], $line_sp[2]]);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
224 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
225 close(PRIM);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
226 my @OrderedContigs :shared = sort sortChrom keys %{ $refFrags{primers} };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
227
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
228 my %regions;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
229 if(defined $OPTIONS{a}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
230 my ($chr, $start, $end) = split_region($OPTIONS{a});
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
231 foreach my $reg (@OrderedContigs) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
232 my ($chr_reg, $start_reg, $end_reg) = split_region($reg);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
233 $regions{$reg} = 1 if $chr eq $chr_reg && $start_reg <= $end && $end_reg >= $start;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
234 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
235 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
236
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
237 # grab fasta file
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
238 my %refSeq;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
239 read_fasta_file($ref, \%refSeq);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
240
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
241 # calculate read length and read group
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
242 my $readGroup :shared;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
243 my $readLength;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
244 my $mm = new File::MMagic;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
245 my $filetype = $mm->checktype_filename(abs_path($fastq1));
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
246 if($filetype eq "application/x-gzip") {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
247 my $sampleRead = `gunzip -c $fastq1 | head -n 2 | tail -n 1`;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
248 chomp $sampleRead;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
249 $readLength = length($sampleRead);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
250 $readGroup = `gunzip -c $fastq1 | head -n 1`;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
251 chomp $readGroup;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
252 $readGroup =~ s/.*?:.*?:(.*?):.*/${1}_L001/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
253 } elsif($filetype eq "text/plain") {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
254 my $sampleRead = `head -n 2 $fastq1 | tail -n 1`;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
255 chomp $sampleRead;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
256 $readLength = length($sampleRead);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
257 $readGroup = `head $fastq1 -n 1`;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
258 chomp $readGroup;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
259 $readGroup =~ s/.*?:.*?:(.*?):.*/${1}_L001/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
260 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
261 die("$fastq1 is filetype $filetype and not text/plain or application/x-gzip\n");
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
262 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
263
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
264 # create hash to keep track of maxdiff and readlengths
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
265 my %maxdiff :shared;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
266
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
267 # create hash to keep track of stats
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
268 my %stats :shared;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
269 $stats{Amb} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
270 $stats{PoorQualMates} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
271 $stats{PoorMapQual} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
272 $stats{PoorQual1} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
273 $stats{PoorQual2} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
274
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
275 # setup other shared variables and caches
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
276 my %cache :shared;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
277 my $cacheHits :shared = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
278 my %cacheAlign :shared;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
279 $cacheAlign{Conc} = &share( {} );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
280 my $cacheAlignHits :shared = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
281 my %refLength :shared;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
282
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
283 # extract primers and fragments on forward and reverse strand and initialize counts
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
284 foreach my $key (@OrderedContigs) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
285 push @{ $refFrags{seqU} }, substr($refSeq{$key}, 0, $refFrags{primers}{$key}{U}{F}{length});
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
286 push @{ $refFrags{seqD} }, substr($refSeq{$key}, -$refFrags{primers}{$key}{D}{F}{length});
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
287 push @{ $refFrags{seqU} }, revcompl(substr($refSeq{$key}, -$refFrags{primers}{$key}{U}{R}{length}));
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
288 push @{ $refFrags{seqD} }, revcompl(substr($refSeq{$key}, 0, $refFrags{primers}{$key}{D}{R}{length}));
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
289 push @{ $refFrags{lenU} }, ($refFrags{primers}{$key}{U}{F}{length}, $refFrags{primers}{$key}{U}{R}{length});
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
290 push @{ $refFrags{lenD} }, ($refFrags{primers}{$key}{D}{F}{length}, $refFrags{primers}{$key}{D}{R}{length});
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
291 push @{ $refFrags{reg} }, ($key, $key);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
292 push @{ $refFrags{strand} }, ('F', 'R');
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
293
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
294 # calculate new contig name with primers removed
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
295 my ($chr, $start, $stop) = split(":|-", $key);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
296 $start += $refFrags{primers}{$key}{U}{F}{length};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
297 $stop -= $refFrags{primers}{$key}{D}{F}{length};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
298 $refLength{$key} = &share( {} );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
299 $refLength{$key}{newReg} = "$chr:$start-$stop";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
300 $refLength{$key}{length} = $stop - $start + 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
301
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
302 $cacheAlign{$key} = &share( {} );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
303 $cacheAlign{$key}{R1} = &share( {} );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
304 $cacheAlign{$key}{R2} = &share( {} );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
305
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
306 $stats{$key} = &share( {} );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
307 $stats{$key}{Reads} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
308 $stats{$key}{Bases} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
309 $stats{$key}{Quality} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
310 $stats{$key}{Mapping} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
311 $stats{$key}{Read1} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
312 $stats{$key}{Read2} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
313 $stats{$key}{ConcMapping} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
314 $stats{$key}{ConcReads} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
315 $stats{$key}{NumberForward} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
316 $stats{$key}{NumberReverse} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
317 $stats{$key}{ConcErrorsBases} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
318 $stats{$key}{ConcCorrectBases} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
319 $stats{$key}{MinLength} = 100000;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
320 $stats{$key}{MaxLength} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
321 $stats{$key}{TotalA} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
322 $stats{$key}{TotalC} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
323 $stats{$key}{TotalG} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
324 $stats{$key}{TotalT} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
325 $stats{$key}{TotalN} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
326 $stats{$key}{PoorMapQual} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
327 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
328
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
329 # start up worker threads
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
330 my $FindContig = Thread::Queue->new();
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
331 my $ToAlign = Thread::Queue->new();
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
332 my $Aligned = Thread::Queue->new();
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
333 my $Samfile = Thread::Queue->new();
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
334 my $WriteBam = Thread::Queue->new();
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
335 my @threadArray;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
336 for(my $index = 1; $index <= $OPTIONS{p}; $index++) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
337 push @threadArray, threads->create(\&thread_find_contig, $FindContig, \%refFrags, \%refSeq, $ToAlign, $Aligned, $WriteBam);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
338 push @threadArray, threads->create(\&thread_process_align, $Aligned, \%refFrags, \%refSeq, $Samfile, $WriteBam) if $index <= max([1, $OPTIONS{p}/2]);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
339 push @threadArray, threads->create(\&samfile_line, $Samfile, $WriteBam, \%refSeq) if $index <= max([1, $OPTIONS{p}/2]);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
340 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
341 push @threadArray, threads->create(\&nwAlign, $ToAlign, $Aligned, \%regions);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
342 push @threadArray, threads->create(\&writebamfile, $bam_to_sort, $WriteBam);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
343
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
344 # read in each sequence and its mate from the fastq file
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
345 my $read1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
346 my $read2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
347 my $seq1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
348 my $seq2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
349 my $qual1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
350 my $qual2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
351 my $desc;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
352 my @strand;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
353 my $cont = 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
354
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
355 # open fastq files
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
356 my $gz1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
357 my $gz2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
358 my $f1gz = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
359 my $f2gz = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
360 if($filetype eq "application/x-gzip") {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
361 open(FQ1, "gunzip -dc $fastq1 |") or die "Can't fork: $!\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
362 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
363 open(FQ1, $fastq1) or die "$fastq1: $!";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
364 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
365
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
366 $filetype = $mm->checktype_filename(abs_path($fastq2));;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
367 if($filetype eq "application/x-gzip") {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
368 open(FQ2, "gunzip -dc $fastq2 |") or die "Can't fork: $!\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
369 } elsif($filetype eq "text/plain") {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
370 open(FQ2, $fastq2) or die "$fastq2: $!";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
371 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
372 die("$fastq2 is filetype $filetype and not text/plain or application/x-gzip\n");
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
373 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
374
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
375 # add reads to the queue
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
376 my $readNo = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
377 while ($cont) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
378 if($read1 = <FQ1>) { # read 1 and 2 read name
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
379 $seq1 = <FQ1>; # read 1 sequence
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
380 $desc = <FQ1>; # read 1 desc
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
381 $qual1 = <FQ1>; # read 1 quality
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
382 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
383 last;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
384 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
385
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
386 if($read2 = <FQ2>) { # read 2 read name
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
387 $seq2 = <FQ2>; # read 2 sequence
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
388 $desc = <FQ2>; # read 2 sdesc
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
389 $qual2 = <FQ2>; # read 2 quality
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
390 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
391 last;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
392 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
393
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
394 # read 1 the forward read
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
395 chomp $read1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
396 chomp $seq1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
397 chomp $qual1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
398 $read1 =~ s/^\@(.*)\s?.*/$1/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
399
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
400 # read 2 the reverse read
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
401 chomp $read2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
402 chomp $seq2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
403 $seq2 = revcompl($seq2);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
404 chomp $qual2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
405 $qual2 = reverse($qual2);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
406
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
407 $read2 =~ s/^\@(.*)\s?.*/$1/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
408 $readNo++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
409
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
410 # keep at most 20 reads in buffer
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
411 while(1) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
412 if(defined $OPTIONS{v}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
413 my $percAmb = $stats{Amb} / $readNo * 100;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
414 my $percPoor = $stats{PoorQualMates} / $readNo * 100;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
415 my $percPoorMap = $stats{PoorMapQual} / $readNo * 100;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
416 my $percPoor1 = $stats{PoorQual1} / $readNo * 100;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
417 my $percPoor2 = $stats{PoorQual2} / $readNo * 100;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
418 $percAmb = sprintf("%.2f", $percAmb);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
419 $percPoor = sprintf("%.2f", $percPoor);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
420 $percPoorMap = sprintf("%.2f", $percPoorMap);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
421 $percPoor1 = sprintf("%.2f", $percPoor1);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
422 $percPoor2 = sprintf("%.2f", $percPoor2);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
423 print STDERR "\r$readNo processed - Amb $percAmb - PoorQual $percPoor - PoorQual1 $percPoor1 - PoorQual2 $percPoor2 - PoorMappingQual $percPoorMap - FindContig ".$FindContig->pending()." - ToAlign ".$ToAlign->pending()." - Aligned ".$Aligned->pending()." - Sam ".$Samfile->pending()." - Writebam ".$WriteBam->pending()." ";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
424 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
425 last if $FindContig->pending() <= 100;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
426 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
427 my @readArray = ($read1, $seq1, $qual1, $read2, $seq2, $qual2);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
428 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
429 lock($FindContig); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
430 $FindContig->enqueue(\@readArray);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
431 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
432 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
433 close(FQ1);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
434 close(FQ2);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
435
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
436 # wait for all records to be processed and close down threads
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
437 $FindContig->enqueue( (undef) x $OPTIONS{p} );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
438 while($FindContig->pending() > 0 ) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
439 next;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
440 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
441 $ToAlign->enqueue( (undef) );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
442 while($ToAlign->pending() > 0 ) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
443 next;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
444 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
445 $Aligned->enqueue( (undef) x floor(max([1, $OPTIONS{p}/2])) );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
446 while($Aligned->pending() > 0 ) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
447 next;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
448 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
449 $Samfile->enqueue( (undef) x floor(max([1, $OPTIONS{p}/2])) );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
450 while($Samfile->pending() > 0 ) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
451 next;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
452 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
453 $WriteBam->enqueue( undef );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
454
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
455 $_->join() for @threadArray;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
456
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
457 # sort the bam file
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
458 if(defined $OPTIONS{s}) {
4
22f35f830f48 Uploaded
fcaramia
parents: 0
diff changeset
459 system("java -Xmx8g -jar $JAR MAX_RECORDS_IN_RAM=2000000 I=$bam_to_sort O=$bamfile_out VALIDATION_STRINGENCY=LENIENT SO=coordinate USE_THREADING=true CREATE_INDEX=true");
0
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
460 unlink $bam_to_sort;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
461 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
462
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
463 if(!defined $OPTIONS{r}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
464 # print out statistics
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
465 # setup variables
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
466 my $totalRegionLengths = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
467 my $totalReads = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
468 my $totalFwd = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
469 my $totalRev = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
470 my $totalMatch = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
471 my $totalMiss = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
472 my $totalBases = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
473 my $totalRead1 = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
474 my $totalRead2 = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
475 my $totalA = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
476 my $totalC = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
477 my $totalG = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
478 my $totalT = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
479 my $totalN = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
480 my $totalConc = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
481 my $totalQuality = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
482 my $AllmaxRead = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
483 my $AllminRead = 100000;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
484 my $avequal = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
485 my $aveRead = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
486 my $perA = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
487 my $perC = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
488 my $perG = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
489 my $perT = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
490 my $perN = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
491 my $readsUsed = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
492 my @coverage;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
493 my @off_coverage;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
494
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
495 # print column headers
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
496 print STATS 'AmpliconName,Region,RegionLengthMinusPrimers,TotalReadsPoorMapping,TotalReadsUsed,Reads1Kept,Reads2Kept,ReadsOut,ReadsFwd,ReadsRev,ReadsOverlap,';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
497 print STATS "Bases,AveQuality,MatchBases,ErrorBases,ErrorRate,MinReadLength,AveReadLength,MaxReadLength,PercA,PercC,PercG,PercT,PercN\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
498
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
499 # for each region
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
500 foreach my $key (@OrderedContigs){
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
501 # if we have some reads calculate averages
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
502 if($stats{$key}{Reads} > 0) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
503 $avequal = $stats{$key}{Quality} / $stats{$key}{Bases};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
504 $avequal = sprintf("%.4f", $avequal);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
505 $aveRead = $stats{$key}{Bases} / $stats{$key}{Reads};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
506 $aveRead = sprintf("%.4f", $aveRead);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
507 $perA = $stats{$key}{TotalA} / $stats{$key}{Bases} * 100;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
508 $perA = sprintf("%.2f", $perA);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
509 $perC = $stats{$key}{TotalC} / $stats{$key}{Bases} * 100;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
510 $perC = sprintf("%.2f", $perC);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
511 $perG = $stats{$key}{TotalG} / $stats{$key}{Bases} * 100;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
512 $perG = sprintf("%.2f", $perG);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
513 $perT = $stats{$key}{TotalT} / $stats{$key}{Bases} * 100;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
514 $perT = sprintf("%.2f", $perT);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
515 $perN = $stats{$key}{TotalN} / $stats{$key}{Bases} * 100;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
516 $perN = sprintf("%.2f", $perN);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
517 $AllminRead = min([$AllminRead, $stats{$key}{MinLength}]);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
518 $AllmaxRead = max([$AllmaxRead, $stats{$key}{MaxLength}]);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
519 $readsUsed = $stats{$key}{Reads} + $stats{$key}{ConcReads};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
520 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
521 $stats{$key}{MaxLength} = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
522 $stats{$key}{MinLength} = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
523 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
524
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
525 # if we have reads which overlapped calculate an error rate
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
526 my $errRate = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
527 if($stats{$key}{ConcReads} > 0) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
528 $errRate = $stats{$key}{ConcErrorsBases} / ($stats{$key}{ConcErrorsBases} + $stats{$key}{ConcCorrectBases});
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
529 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
530
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
531 # print the stats for this region
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
532 print STATS "$refFrags{name}{$key}," if defined $refFrags{name}{$key};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
533 print STATS "Unknown," unless defined $refFrags{name}{$key};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
534 print STATS "$refLength{$key}{newReg},$refLength{$key}{length},$stats{$key}{PoorMapQual},$readsUsed,$stats{$key}{Read1},$stats{$key}{Read2},$stats{$key}{Reads},$stats{$key}{NumberForward},";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
535 print STATS "$stats{$key}{NumberReverse},$stats{$key}{ConcReads},$stats{$key}{Bases},$avequal,$stats{$key}{ConcCorrectBases},";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
536 print STATS "$stats{$key}{ConcErrorsBases},$errRate,$stats{$key}{MinLength},$aveRead,$stats{$key}{MaxLength},$perA,$perC,$perG,$perT,$perN\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
537
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
538 # add stats to grand totals
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
539 $totalRegionLengths += $refLength{$key}{length};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
540 $totalQuality += $stats{$key}{Quality};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
541 $totalReads += $stats{$key}{Reads};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
542 $totalFwd += $stats{$key}{NumberForward};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
543 $totalRev += $stats{$key}{NumberReverse};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
544 $totalBases += $stats{$key}{Bases};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
545 $totalRead1 += $stats{$key}{Read1};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
546 $totalRead2 += $stats{$key}{Read2};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
547 $totalA += $stats{$key}{TotalA};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
548 $totalC += $stats{$key}{TotalC};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
549 $totalT += $stats{$key}{TotalT};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
550 $totalG += $stats{$key}{TotalG};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
551 $totalN += $stats{$key}{TotalN};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
552 $totalMatch += $stats{$key}{ConcCorrectBases};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
553 $totalMiss += $stats{$key}{ConcErrorsBases};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
554 $totalConc += $stats{$key}{ConcReads};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
555
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
556 # reset ave scores
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
557 $avequal = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
558 $aveRead = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
559 $perA = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
560 $perC = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
561 $perG = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
562 $perT = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
563 $perN = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
564 $readsUsed = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
565
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
566 if(defined $refFrags{name}{$key} && grep /^Off_target/, $refFrags{name}{$key}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
567 push @off_coverage, $stats{$key}{Reads};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
568 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
569 push @coverage, $stats{$key}{Reads};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
570 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
571 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
572
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
573 # if we have any reads calculate the averages
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
574 if($totalReads > 0) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
575 $avequal = $totalQuality / $totalBases;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
576 $avequal = sprintf("%.4f", $avequal);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
577 $aveRead = $totalBases / $totalReads;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
578 $aveRead = sprintf("%.4f", $aveRead);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
579 $perA = $totalA / $totalBases * 100;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
580 $perA = sprintf("%.2f", $perA);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
581 $perC = $totalC / $totalBases * 100;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
582 $perC = sprintf("%.2f", $perC);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
583 $perG = $totalG / $totalBases * 100;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
584 $perG = sprintf("%.2f", $perG);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
585 $perT = $totalT / $totalBases * 100;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
586 $perT = sprintf("%.2f", $perT);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
587 $perN = $totalN / $totalBases * 100;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
588 $perN = sprintf("%.2f", $perN);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
589 $readsUsed = $totalReads + $totalConc;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
590 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
591 $AllmaxRead = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
592 $AllminRead = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
593 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
594
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
595 # if we had any reads which overlapped calculate error rate
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
596 my $errRate = 'NA';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
597 if($totalConc > 0) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
598 $errRate = $totalMiss / ($totalMiss + $totalMatch);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
599 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
600
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
601 # print out totals
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
602 my $pcAmb = sprintf("%.2f", $stats{Amb} / $readNo * 100);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
603 my $pcPQ = sprintf("%.2f", $stats{PoorQualMates} / $readNo * 100);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
604 my $TotalReads = $readNo * 2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
605 my $Unmapped = $TotalReads - $readsUsed;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
606 my $pcUnmapped = sprintf("%.2f", $Unmapped/$TotalReads * 100);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
607 my $avCov = sprintf("%.0f", sum(@coverage)/@coverage);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
608 my $avOff = sprintf("%.2f", sum(@off_coverage)/@off_coverage) if @off_coverage;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
609 my $greater = scalar (grep { $_ > 0.2*$avCov } @coverage);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
610 my $pcgreater = sprintf("%.2f", $greater/@coverage * 100);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
611
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
612 print STATS ",Total,$totalRegionLengths,$stats{PoorMapQual},$readsUsed,$totalRead1,$totalRead2,$totalReads,$totalFwd,$totalRev,$totalConc,$totalBases,$avequal,$totalMatch,";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
613 print STATS "$totalMiss,$errRate,$AllminRead,$aveRead,$AllmaxRead,$perA,$perC,$perG,$perT,$perN\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
614 print STATS "\nNo mate pairs:,$readNo\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
615 print STATS "No ambiguous mates:,$stats{Amb},$pcAmb\%\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
616 print STATS "No poor quality mates:,$stats{PoorQualMates},$pcPQ\%\n\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
617 print STATS "Total Number Of Reads:,$TotalReads\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
618 print STATS "Unmapped Reads:,$Unmapped,$pcUnmapped\%\n\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
619 print STATS "Min Mean Max ReadsOut:,".min([@coverage]).",$avCov,".max([@coverage])."\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
620 print STATS "No. Amplicons > 0.2 x ReadsOut:,$greater,$pcgreater\%\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
621 print STATS "Average ReadsOut Off Target Regions:,$avOff\n" if @off_coverage;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
622 print STATS "\nAmplicons\nNo. Reads < 200, 200 <= No. Reads < 1000, No. Reads >= 1000\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
623 $greater = scalar (grep { $_ < 200 } @coverage);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
624 print STATS "$greater,";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
625 $greater = scalar (grep { $_ >= 200 && $_ < 1000 } @coverage);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
626 print STATS "$greater,";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
627 $greater = scalar (grep { $_ >= 1000 } @coverage);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
628 print STATS "$greater\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
629
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
630 print "Cache hits align contig: $cacheHits\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
631 print "Alignment cache hits: $cacheAlignHits\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
632 close(STATS);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
633 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
634
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
635 # close the log file
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
636 if(defined $OPTIONS{l} || defined $OPTIONS{L}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
637 my $now = localtime time;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
638 print "\nLog File Closed $now\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
639 close(STDERR);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
640 close(STDOUT);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
641 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
642
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
643 # thread to find matching amplicon
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
644 sub thread_find_contig {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
645 my $FindContig = shift;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
646 my $refFrags = shift;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
647 my $refSeq = shift;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
648 my $ToAlign = shift;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
649 my $Aligned = shift;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
650 my $WriteBam = shift;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
651 my $readArrayRef;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
652 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
653 lock($FindContig); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
654 $readArrayRef = $FindContig->dequeue();
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
655 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
656
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
657 while(defined $readArrayRef) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
658 my ($read1, $seq1, $qual1, $read2, $seq2, $qual2) = @{ $readArrayRef };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
659 my $contig = '';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
660 my $readRef = '';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
661 my $strand = '';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
662
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
663 # trim the reads
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
664 my ($trimmed1, $qtrim1) = trim($seq1, $qual1, 30, 1);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
665 my ($trimmed2, $qtrim2) = trim($seq2, $qual2, 30, 2);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
666
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
667 # if we have already mapped the reads use the results in the cache
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
668 if(defined $cacheAlign{Conc}{"$trimmed1.$trimmed2"}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
669 my @array;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
670 { lock(%cacheAlign); @array = @{ $cacheAlign{Conc}{"$trimmed1.$trimmed2"} } };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
671 $array[0] = $read1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
672 $array[3] = $qtrim1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
673 $array[6] = $qtrim2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
674 { lock($cacheAlignHits); $cacheAlignHits++ };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
675
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
676 # add aligned reads to queue
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
677 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
678 lock($Aligned); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
679 $Aligned->enqueue(\@array);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
680 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
681 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
682 # Use approx match to find most similar amplicon primer to reads
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
683 my @index1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
684 my @index2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
685 my $index_read1 = -2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
686 my $index_read2 = -2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
687 my $index_found = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
688
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
689 if(length($trimmed1) > length($seq1)/2 || length($trimmed2) > length($seq2)/2) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
690 # we have at least 1 good read
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
691 # find closest matching amplicon for read 1
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
692 my $string = substr($seq1, 0, $$refFrags{min});
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
693 if(defined $cache{"$string.1"}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
694 $cacheHits++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
695 { lock(%cache); @index1 = @{ $cache{"$string.1"} } };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
696 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
697 @index1 = amatch($string, \@{ $$refFrags{seqU} });
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
698 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
699 lock(%cache);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
700 $cache{"$string.1"} = &share( [] );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
701 @{ $cache{"$string.1"} } = @index1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
702 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
703 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
704
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
705 # find closest matching amplicon for read 2
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
706 $string = substr($seq2, -$$refFrags{min});
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
707 if(defined $cache{"$string.2"}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
708 $cacheHits++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
709 { lock(%cache); @index2 = @{ $cache{"$string.2"} } };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
710 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
711 @index2 = amatch($string, \@{ $$refFrags{seqD} });
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
712 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
713 lock(%cache);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
714 $cache{"$string.2"} = &share( [] );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
715 @{ $cache{"$string.2"} } = @index2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
716 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
717 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
718
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
719 if($#index1 > 0 || $#index2 > 0) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
720 # if one read maps to multiple amplicons and the other read matches to just one of them use that
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
721 my %in_array2 = map { $_ => 1 } @index2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
722 my @common = grep { $in_array2{$_} } @index1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
723 if($#common == 0) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
724 @index1 = @common;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
725 @index2 = @common;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
726 } elsif($#common > 0) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
727 if($#index1 > 0) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
728 # matched to multiple primers so try and use whole read
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
729 my @refs;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
730 foreach(@index1) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
731 if($$refFrags{strand}[$_] eq "F") {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
732 push @refs, $$refSeq{${ $$refFrags{reg} }[$_]};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
733 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
734 push @refs, revcompl($$refSeq{${ $$refFrags{reg} }[$_]});
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
735 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
736 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
737 my @index1b = amatch($seq1, \@refs);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
738 if($#index1b >= 0) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
739 @index1 = @index1[@index1b];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
740 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
741 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
742 if($#index2 > 0) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
743 # matched to multiple primers so try and use whole read
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
744 my @refs;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
745 foreach(@index2) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
746 if($$refFrags{strand}[$_] eq "F") {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
747 push @refs, $$refSeq{${ $$refFrags{reg} }[$_]};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
748 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
749 push @refs, revcompl($$refSeq{${ $$refFrags{reg} }[$_]});
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
750 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
751 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
752 my @index2b = amatch($seq2, \@refs);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
753 if($#index2b >= 0) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
754 @index2 = @index2[@index2b];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
755 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
756 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
757
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
758 # if one read still maps to multiple amplicons and the other read matches to just one of them use that
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
759 %in_array2 = map { $_ => 1 } @index2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
760 @common = grep { $in_array2{$_} } @index1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
761 if($#common == 0) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
762 @index1 = @common;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
763 @index2 = @common;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
764 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
765 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
766 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
767
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
768 # if read 1 and read 2 match to the same amplicon assign it
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
769 if($#index1 == 0 && @index1 ~~ @index2 ) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
770 $index_read1 = $index1[0];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
771 $index_read2 = $index2[0];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
772 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
773 $index_read1 = -1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
774 $index_read2 = -1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
775 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
776
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
777 # if any read after trimming is less than half the original length discard
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
778 if(length($trimmed1) < length($seq1)/2) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
779 $index_read1 = -2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
780 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
781
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
782 if(length($trimmed2) < length($seq2)/2) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
783 $index_read2 = -2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
784 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
785 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
786
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
787 if($index_read1 > -1 || $index_read2 > -1) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
788 my $conc = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
789 my %align;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
790
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
791 # at least 1 read matched to an amplicon
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
792 if($index_read1 > -1) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
793 $contig = $$refFrags{reg}[$index_read1];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
794 $readRef = $$refSeq{$contig};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
795 $strand = $$refFrags{strand}[$index_read1];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
796 } elsif($index_read2 > -1) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
797 $contig = $$refFrags{reg}[$index_read2];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
798 $readRef = $$refSeq{$contig};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
799 $strand = $$refFrags{strand}[$index_read2];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
800 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
801
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
802 if($index_read1 == $index_read2 && length($readRef) < length($trimmed1) + length($trimmed2)) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
803 # reads must overlap so can use concensus
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
804 $conc = 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
805 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
806 my $substr1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
807 my $substr2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
808 for(my $length = 5; $length <= min([length($trimmed1), length($trimmed2)]); $length++) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
809 $substr1 = substr($trimmed1, -$length);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
810 $substr2 = substr($trimmed2, 0, $length);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
811 if($substr1 eq $substr2) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
812 $conc = 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
813 last;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
814 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
815 # calculate max diff allowed
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
816 if(!defined $maxdiff{$length}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
817 { lock(%maxdiff); $maxdiff{$length} = maxdiff($length, 0.02, 0.04); }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
818 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
819
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
820 my $diff = levenshtein($substr1, $substr2 );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
821 if($diff < $maxdiff{$length}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
822 $conc = 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
823 last;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
824 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
825 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
826 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
827 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
828
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
829 if($conc) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
830 # remove any trailing Ns
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
831 $trimmed1 = $seq1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
832 $trimmed2 = $seq2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
833 $trimmed1 =~ s/N*$//;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
834 $trimmed2 =~ s/^N*//;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
835 $qtrim1 = $qual1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
836 $qtrim2 = $qual2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
837 $qtrim1 = substr($qtrim1, 0, length($trimmed1));
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
838 $qtrim2 = substr($qtrim2 , -length($trimmed2));
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
839 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
840
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
841 if($strand eq "R") {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
842 my $trimmed1_tmp = revcompl($trimmed1);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
843 my $qtrim1_tmp = reverse($qtrim1);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
844 $trimmed1 = revcompl($trimmed2);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
845 $qtrim1 = reverse($qtrim2);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
846 $trimmed2 = $trimmed1_tmp;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
847 $qtrim2 = $qtrim1_tmp;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
848 my $index_read_tmp = $index_read1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
849 $index_read1 = $index_read2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
850 $index_read2 = $index_read_tmp;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
851 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
852
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
853 if($conc) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
854 my $align;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
855 my $search1 = $trimmed1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
856 $search1 =~ s/N/./g;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
857 my $search2 = $trimmed2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
858 $search2 =~ s/N/./g;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
859 if($$refSeq{$contig} =~ /$search1/ && $$refSeq{$contig} =~ /$search2/) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
860 # the read matches exactly to the reference
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
861 $align{Ref1} = $$refSeq{$contig};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
862 $align{Read1} = $$refSeq{$contig};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
863 $align{Read1} =~ s/^(.*?)$search1(.*)$/${1}X$2/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
864 $align{Read1} =~ s/[^X]/-/g;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
865 $align{Read1} = modify($align{Read1}, sub { $_[0] =~ s/X+/$trimmed1/ });
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
866 $align{Read2} = $$refSeq{$contig};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
867 $align{Read2} =~ s/^(.*)$search2(.*?)$/${1}XX$2/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
868 $align{Read2} =~ s/[^X]/-/g;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
869 $align{Read2} = modify($align{Read2}, sub { $_[0] =~ s/X+/$trimmed2/ });
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
870
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
871 # add aligned reads to queue
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
872 my @array;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
873 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
874 lock($Aligned); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
875 @array = ($read1, $align{Ref1}, $align{Read1}, $qtrim1, $align{Ref1}, $align{Read2}, $qtrim2, $contig, $strand, $conc);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
876 $Aligned->enqueue(\@array);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
877 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
878
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
879 # add alignment to cache
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
880 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
881 lock(%cacheAlign);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
882 $cacheAlign{Conc}{$name} = &share( [] );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
883 @{ $cacheAlign{Conc}{$name} } = @array;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
884 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
885 } elsif(defined $cacheAlign{Conc}{"$trimmed1.$trimmed2"}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
886 my @array;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
887 { lock(%cacheAlign); @array = @{ $cacheAlign{Conc}{"$trimmed1.$trimmed2"} } };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
888 $array[0] = $read1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
889 $array[3] = $qtrim1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
890 $array[6] = $qtrim2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
891 { lock($cacheAlignHits); $cacheAlignHits++ };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
892
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
893 # add aligned reads to queue
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
894 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
895 lock($Aligned); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
896 $Aligned->enqueue(\@array);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
897 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
898 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
899
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
900 # align reads using NW2
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
901 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
902 lock($ToAlign); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
903 my @array = ($$refSeq{$contig}, $trimmed1, $trimmed2, $read1, $qtrim1, $qtrim2, $contig, $strand, $conc);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
904 #my @array = ($fastaIn, $read1, $qtrim1, $qtrim2, $contig, $strand, $conc, "$trimmed1.$trimmed2");
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
905 $ToAlign->enqueue(\@array);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
906 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
907 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
908 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
909
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
910 if(!$conc && $index_read1 > -1) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
911 my $search1 = $trimmed1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
912 $search1 =~ s/N/./g;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
913 if($$refSeq{$contig} =~ /$search1/) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
914 # the read matches exactly to the reference
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
915 $align{Ref1} = $$refSeq{$contig};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
916 $align{Read1} = $$refSeq{$contig};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
917 $align{Read1} =~ s/^(.*?)$search1(.*)$/${1}X$2/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
918 $align{Read1} =~ s/[^X]/-/g;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
919 $align{Read1} = modify($align{Read1}, sub { $_[0] =~ s/X/$trimmed1/ });
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
920
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
921 # add aligned reads to queue
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
922 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
923 lock($Aligned); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
924 my @array = ($read1, $align{Ref1}, $align{Read1}, $qtrim1, undef, undef, undef, $contig, $strand, $conc);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
925 $Aligned->enqueue(\@array);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
926 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
927 } elsif(defined $cacheAlign{$contig}{R1}{$trimmed1}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
928 my @array;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
929 { lock(%cacheAlign); @array = @{ $cacheAlign{$contig}{R1}{$trimmed1} } };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
930 $array[0] = $read1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
931 $array[3] = $qtrim1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
932 { lock($cacheAlignHits); $cacheAlignHits++ };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
933
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
934 # add aligned reads to queue
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
935 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
936 lock($Aligned); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
937 $Aligned->enqueue(\@array);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
938 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
939 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
940 # align read using NW2
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
941 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
942 lock($ToAlign); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
943 my @array = ($$refSeq{$contig}, $trimmed1, undef, $read1, $qtrim1, undef, $contig, $strand, $conc);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
944 #my @array = ($fastaIn, $read1, $qtrim1, undef, $contig, $strand, $conc, $trimmed1);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
945 $ToAlign->enqueue(\@array);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
946 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
947 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
948 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
949
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
950 if(!$conc && $index_read2 > -1) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
951 my $search2 = $trimmed2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
952 $search2 =~ s/N/./g;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
953 if($$refSeq{$contig} =~ /$search2/) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
954 # the read matches exactly to the reference
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
955 $align{Ref2} = $$refSeq{$contig};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
956 $align{Read2} = $$refSeq{$contig};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
957 $align{Read2} =~ s/^(.*)$search2(.*?)$/${1}X$2/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
958 $align{Read2} =~ s/[^X]/-/g;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
959 $align{Read2} = modify($align{Read2}, sub { $_[0] =~ s/X/$trimmed2/ });
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
960
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
961 # add aligned reads to queue
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
962 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
963 lock($Aligned); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
964 my @array = ($read2, undef, undef, undef, $align{Ref2}, $align{Read2}, $qtrim2, $contig, $strand, $conc);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
965 $Aligned->enqueue(\@array);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
966 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
967 } elsif(defined $cacheAlign{$contig}{R2}{$trimmed2}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
968 my @array;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
969 { lock(%cacheAlign); @array = @{ $cacheAlign{$contig}{R2}{$trimmed2} } };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
970 $array[0] = $read2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
971 $array[6] = $qtrim2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
972 { lock($cacheAlignHits); $cacheAlignHits++ };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
973
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
974 # add aligned reads to queue
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
975 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
976 lock($Aligned); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
977 $Aligned->enqueue(\@array);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
978 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
979 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
980 # align read using NW2
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
981 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
982 lock($ToAlign); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
983 my @array = ($$refSeq{$contig}, undef, $trimmed2, $read2, undef, $qtrim2, $contig, $strand, $conc);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
984 #my @array = ($fastaIn, $read2, undef, $qtrim2, $contig, $strand, $conc, $trimmed2);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
985 $ToAlign->enqueue(\@array);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
986 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
987 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
988 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
989 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
990
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
991 if($index_read1 < 0) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
992 # discard read 1
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
993 # send to writebamfile thread
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
994 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
995 lock($WriteBam); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
996 $WriteBam->enqueue("$read1\t4\t*\t0\t0\t*\t*\t0\t0\t$seq1\t$qual1\tRG:Z:$readGroup\n");
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
997 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
998 { lock(%stats); $stats{PoorQual1}++ };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
999 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1000
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1001 if($index_read2 < 0) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1002 # discard read 2
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1003 # send to writebamfile thread
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1004 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1005 lock($WriteBam); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1006 $WriteBam->enqueue("$read2\t4\t*\t0\t0\t*\t*\t0\t0\t$seq2\t$qual2\tRG:Z:$readGroup\n");
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1007 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1008 { lock(%stats); $stats{PoorQual2}++ };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1009 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1010
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1011 if($index_read1 == -1 && $index_read2 == -1) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1012 { lock(%stats); $stats{Amb}++ };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1013 } elsif($index_read1 == -2 && $index_read2 == -2) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1014 { lock(%stats); $stats{PoorQualMates}++ };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1015 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1016 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1017
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1018 # grab the next reads in queue
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1019 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1020 lock($FindContig); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1021 $readArrayRef = $FindContig->dequeue();
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1022 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1023 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1024 print "Closing Find Amplicon Thread\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1025 return;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1026 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1027
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1028 sub writebamfile {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1029 my $bamfile = shift;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1030 my $WriteBam = shift;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1031
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1032 # write samfile header
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1033 open(PIPE_TO_SAMTOOLS, "| samtools view -Sb -o $bamfile - 2>&1") or die "Can't fork: $!\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1034 foreach my $key (@OrderedContigs) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1035 print PIPE_TO_SAMTOOLS "\@SQ\tSN:$refLength{$key}{newReg}\tLN:$refLength{$key}{length}\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1036 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1037 print PIPE_TO_SAMTOOLS "\@RG\tID:$readGroup\tPL:Illumina\tSM:$sampleName\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1038
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1039 my $read;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1040 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1041 lock($WriteBam); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1042 $read = $WriteBam->dequeue();
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1043 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1044 while(defined $read) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1045
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1046 # write read to bamfile
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1047 print PIPE_TO_SAMTOOLS $read;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1048
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1049 # grab next item in queue
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1050 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1051 lock($WriteBam); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1052 $read = $WriteBam->dequeue();
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1053 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1054 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1055 print "Closing Bam Writer\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1056 close PIPE_TO_SAMTOOLS or die "pipe could not be closed";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1057 return;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1058 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1059
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1060 sub thread_process_align {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1061 my $Aligned = shift;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1062 my $refFrags = shift;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1063 my $refSeq = shift;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1064 my $Samfile = shift;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1065 my $WriteBam = shift;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1066
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1067 my $readArrayRef;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1068 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1069 lock($Aligned); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1070 $readArrayRef = $Aligned->dequeue();
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1071 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1072 while(defined $readArrayRef) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1073 # grab the alignment
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1074 my ($read, $ref1, $read1, $qtrim1, $ref2, $read2, $qtrim2, $contig, $strand, $conc) = @{ $readArrayRef };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1075 # grab primer lengths
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1076 my $PU = $$refFrags{primers}{$contig}{U}{F}{length};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1077 my $PD = $$refFrags{primers}{$contig}{D}{F}{length};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1078 my %readStats;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1079 my $orig_read1 = $read1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1080 my $orig_read2 = $read2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1081 my $orig_qtrim1 = $qtrim1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1082 my $orig_qtrim2 = $qtrim2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1083 $orig_read1 =~ s/-//g if defined $orig_read1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1084 $orig_read2 =~ s/-//g if defined $orig_read2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1085
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1086 if(defined $OPTIONS{d}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1087 # debug info
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1088 print "Alignment With Primers\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1089 print "$ref1\n$read1\n" if defined $read1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1090 print "$ref2\n$read2\n" if defined $read2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1091 print "\nFinal Alignment:\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1092 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1093
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1094 # calculate where primers begin and end
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1095 if(defined $read1) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1096 if($strand eq 'F') {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1097 $readStats{$contig}{Read1} = 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1098 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1099 $readStats{$contig}{Read2} = 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1100 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1101
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1102 # determine the primer locations
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1103 my $offset = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1104 for(my $occurance = 0; $occurance < $PU; $occurance++) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1105 substr($ref1, $offset) =~ /^.*?([^-])/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1106 $offset += $-[1] + 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1107 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1108 my $length = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1109 for(my $occurance = 0; $occurance < $PD; $occurance++) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1110 substr($ref1, 0, length($ref1) - $length) =~ /.*([^-]).*?$/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1111 $length = length($ref1) - $-[1];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1112 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1113 $length = length($ref1) - $length - $offset;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1114
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1115 # remove primers and adjust quality scores
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1116 $ref1 = substr($ref1, $offset, $length);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1117 my $qualOffset = ( substr($read1, 0, $offset) =~ tr/ACTGN// );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1118 $read1 = substr($read1, $offset, $length);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1119 $length = ( $read1 =~ tr/ACTGN// );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1120 $qtrim1 = substr($qtrim1, $qualOffset, $length);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1121 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1122
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1123 if(defined $read2) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1124 if($strand eq 'F') {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1125 $readStats{$contig}{Read2} = 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1126 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1127 $readStats{$contig}{Read1} = 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1128 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1129
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1130 # determine the primer locations
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1131 my $offset = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1132 for(my $occurance = 0; $occurance < $PU; $occurance++) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1133 substr($ref2, $offset) =~ /^.*?([^-])/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1134 $offset += $-[1] + 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1135 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1136 my $length = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1137 for(my $occurance = 0; $occurance < $PD; $occurance++) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1138 substr($ref2, 0, length($ref2) - $length) =~ /.*([^-]).*?$/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1139 $length = length($ref2) - $-[1];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1140 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1141 $length = length($ref2) - $length - $offset;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1142
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1143 # remove primers and adjust quality scores
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1144 $ref2 = substr($ref2, $offset, $length);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1145 my $qualOffset = ( substr($read2, 0, $offset) =~ tr/ACTGN// );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1146 $read2 = substr($read2, $offset, $length);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1147 $length = ( $read2 =~ tr/ACTGN// );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1148 $qtrim2 = substr($qtrim2, $qualOffset, $length);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1149 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1150
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1151 # generate a concensus sequence if we have both read1 and read2 and they overlap
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1152 if($conc) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1153 # the following are 0 based indexes
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1154 my $R1qual = 0; # index for sanger qual read 1
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1155 my $R2qual = 0; # index for sanger qual read 2
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1156 my $aindex = 0; # index for alignment
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1157 my $q1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1158 my $q2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1159 my $concensus;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1160 my $concqual;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1161
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1162 $readStats{$contig}{ConcReads} = 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1163 $readStats{$contig}{ConcCorrectBases} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1164 $readStats{$contig}{ConcErrorsBases} = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1165 while($aindex < length($ref1)) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1166 # base1 is current base from read 1
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1167 # base2 is current base from read 2
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1168 my $base1 = substr($read1, $aindex, 1);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1169 my $base2 = substr($read2, $aindex, 1);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1170
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1171 # grab the next quality score
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1172 if($R1qual < length($qtrim1)) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1173 $q1 = substr($qtrim1, $R1qual, 1);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1174 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1175 if($R2qual < length($qtrim2)) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1176 $q2 = substr($qtrim2, $R2qual, 1);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1177 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1178
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1179 # if we have a base in either
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1180 if(($base1 ne '-' && $base1 ne 'N') || ($base2 ne '-' && $base2 ne 'N')) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1181 if(($base2 eq "-" || $base2 eq 'N') && $base1 ne "-" && $base1 ne 'N') {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1182 # we are only in read 1 or read 2 is a N
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1183 $concensus .= $base1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1184 $concqual .= $q1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1185 $R1qual++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1186 if($base2 eq 'N') {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1187 $R2qual++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1188 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1189 } elsif(($base1 eq "-" || $base1 eq 'N') && $base2 ne "-" && $base2 ne 'N') {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1190 # we are only in read 2 or read 1 is a N
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1191 $concensus .= $base2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1192 $concqual .= $q2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1193 $R2qual++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1194 if($base1 eq 'N') {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1195 $R1qual++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1196 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1197 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1198 # we are in the overlap region
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1199 my $maxqual = sangerMax($q1, $q2);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1200 if($base1 eq $base2) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1201 # both reads agree
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1202 $readStats{$contig}{ConcCorrectBases}++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1203 $concensus .= $base1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1204 $concqual .= $maxqual;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1205 } elsif($q1 eq $q2) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1206 # reads have different bases but the same score
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1207 $readStats{$contig}{ConcErrorsBases}++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1208 $concensus .= 'N';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1209 $concqual .= '#';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1210 } elsif($q1 eq $maxqual) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1211 # reads have different bases but read 1 has the best score
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1212 $concensus .= $base1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1213 $concqual .= $maxqual;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1214 $readStats{$contig}{ConcErrorsBases}++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1215 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1216 # reads have different bases but read 2 has the best score
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1217 $concensus .= $base2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1218 $concqual .= $maxqual;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1219 $readStats{$contig}{ConcErrorsBases}++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1220 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1221 $R1qual++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1222 $R2qual++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1223 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1224 } elsif($base1 eq 'N' || $base2 eq 'N') {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1225 if($base2 eq '-') {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1226 # one is a deletion the other is an N
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1227 $concensus .= $base1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1228 $concqual .= $q1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1229 $R1qual++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1230 } elsif($base1 eq '-') {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1231 # one is a deletion the other is an N
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1232 $concensus .= $base2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1233 $concqual .= $q2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1234 $R2qual++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1235 } elsif($base1 eq 'N' && $base2 eq 'N') {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1236 # both reads are an N
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1237 $concensus .= 'N';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1238 $concqual .= sangerMax($q1, $q2);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1239 $R1qual++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1240 $R2qual++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1241 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1242 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1243 $concensus .= '-';
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1244 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1245 $aindex++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1246 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1247 $read1 = $concensus;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1248 $qtrim1 = $concqual;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1249 $read2 = undef;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1250 $qtrim2 = undef;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1251 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1252
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1253 if(defined $read1) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1254 # remove any trailing - and send to samfile_line
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1255 $read1 =~ s/^-*?([^-].*)/$1/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1256 $ref1 = substr($ref1, -length($read1));
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1257 $read1 =~ s/(.*[^-])-*?$/$1/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1258 $ref1 = substr($ref1, 0, length($read1));
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1259
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1260 my $tmp = $ref1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1261 $tmp =~ s/-//g;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1262 my $rstart = index($$refSeq{$contig}, $tmp) - $PU + 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1263
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1264 # generate sam file line
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1265 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1266 lock($Samfile); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1267 my @array = ($read, $qtrim1, $rstart, $read1, $ref1, $contig, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, \%readStats);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1268 $Samfile->enqueue(\@array);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1269 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1270 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1271
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1272 if(defined $read2) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1273 # remove any trailing - and send to samfile_line
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1274 $read2 =~ s/^-*?([^-].*)/$1/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1275 $ref2 = substr($ref2, -length($read2));
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1276 $read2 =~ s/(.*[^-])-*?$/$1/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1277 $ref2 = substr($ref2, 0, length($read2));
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1278
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1279 my $tmp = $ref2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1280 $tmp =~ s/-//g;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1281 my $rstart = rindex($$refSeq{$contig}, $tmp) - $PU + 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1282
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1283 # generate sam file line
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1284 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1285 lock($Samfile); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1286 my @array = ($read, $qtrim2, $rstart, $read2, $ref2, $contig, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, \%readStats);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1287 $Samfile->enqueue(\@array);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1288 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1289 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1290
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1291 if(defined $OPTIONS{d}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1292 # debug info
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1293 print "$ref1\n$read1\n" if defined $read1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1294 print "$ref2\n$read2\n" if defined $read2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1295 print "$qtrim1\n" if defined $read1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1296 print "$qtrim2\n" if defined $read2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1297 print "\n\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1298 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1299
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1300 # grab next item in queue
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1301 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1302 lock($Aligned); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1303 $readArrayRef = $Aligned->dequeue();
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1304 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1305 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1306 print "Closing Alignment Processor\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1307 return;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1308 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1309
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1310 # function to take reverse compliment
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1311 sub revcompl {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1312 my $string = shift @_;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1313 my $revcomp = reverse($string);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1314 $revcomp =~ tr/ACGTacgtKMSWRYBDHVkmswrybdhv/TGCAtgcaMKSWYRVHDBmkswyrvhdb/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1315 return $revcomp;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1316 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1317
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1318 # function to convert sanger fastq quality scores to phred scores
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1319 sub sanger2phred {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1320 my $sang = shift @_;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1321 my @sanger = split("", $sang);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1322 my @phred;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1323 foreach my $q (@sanger) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1324 my $Q = ord($q) - 33;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1325 push @phred, $Q;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1326 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1327 return @phred;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1328 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1329
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1330 # function to return the largest sanger quality score
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1331 sub sangerMax {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1332 my ($s1, $s2) = @_;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1333 my $Q1 = ord($s1) - 33;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1334 my $Q2 = ord($s2) - 33;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1335 if($Q1 > $Q2) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1336 return $s1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1337 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1338 return $s2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1339 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1340 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1341
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1342 # function to read a fasta file as a hash
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1343 sub read_fasta_file {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1344 my ($fasta_file, $seqs) = @_;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1345 my $header;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1346 my $seq = "";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1347 open (IN, $fasta_file) or die "can't open $fasta_file: $!\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1348 while (my $line = <IN>) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1349 chomp $line;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1350 if (grep />/, $line) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1351 if ($seq ne "") {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1352 $seqs->{$header} = $seq;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1353 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1354 $header = $line;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1355
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1356 $header =~ s/^>//; # remove ">"
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1357 $header =~ s/\s+$//; # remove trailing whitespace
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1358 $seq = ""; # clear out old sequence
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1359 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1360 $line =~ s/\s+//g; # remove whitespace
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1361 $seq .= $line; # add sequence
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1362 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1363 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1364 close IN; # finished with file
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1365
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1366 if ($seq ne "") { # handle last sequence
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1367 $seqs->{$header} = $seq;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1368 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1369
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1370 return;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1371 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1372
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1373 # minimal element of a list
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1374 #
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1375 sub min
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1376 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1377 my @list = @{$_[0]};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1378 my $min = $list[0];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1379
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1380 foreach my $i (@list)
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1381 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1382 $min = $i if ($i < $min);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1383 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1384
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1385 return $min;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1386 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1387
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1388 # maximal element of a list
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1389 #
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1390 sub max
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1391 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1392 my @list = @{$_[0]};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1393 my $max = $list[0];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1394
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1395 foreach my $i (@list)
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1396 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1397 $max = $i if ($i > $max);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1398 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1399
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1400 return $max;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1401 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1402
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1403 # trim sequences using the BWA algorithm
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1404 sub trim {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1405
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1406 my ($s, $q, $threshold, $read) = @_; # read sequence and quality scores
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1407 if($read == 2) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1408 $q = reverse($q);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1409 $s = reverse($s);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1410 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1411
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1412 my @array = sanger2phred($q);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1413 my $length = scalar @array;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1414
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1415 # only calculate if quality fails near end
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1416 if( $array[$#array] >= $threshold ){
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1417 if($read == 2) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1418 $q = reverse($q);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1419 $s = reverse($s);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1420 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1421 return ($s, $q);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1422 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1423
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1424 # run bwa equation
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1425 my @arg;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1426 for( my $i = 0; $i < $length - 1; $i++ ){
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1427 my $x = $i + 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1428 for( my $j = $x; $j < $length; $j++ ){
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1429 $arg[$x] += $threshold - $array[$j];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1430 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1431 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1432
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1433 # find number of 5' bases to retain
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1434 my $index = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1435 my $maxval = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1436 for ( 1 .. $#arg ){
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1437 if ( $maxval < $arg[$_] ){
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1438 $index = $_;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1439 $maxval = $arg[$_];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1440 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1441 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1442
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1443 # trim reads
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1444 $s = substr($s,0,$index);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1445 $q = substr($q,0,$index);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1446 if($read == 2) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1447 $s = reverse($s);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1448 $q = reverse($q);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1449 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1450
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1451 # return seq and quality
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1452 return ($s, $q);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1453 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1454
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1455
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1456 sub samfile_line {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1457 # grab the variables
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1458 my ($Samfile, $WriteBam, $refSeq) = @_;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1459 my %SamStats;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1460 my %minLength;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1461 my %maxLength;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1462 my $readArrayRef;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1463 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1464 lock($Samfile); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1465 $readArrayRef = $Samfile->dequeue();
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1466 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1467
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1468 while(defined $readArrayRef) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1469 my ($read, $qual, $rstart, $align, $ref, $reg, $strand, $orig_read1, $orig_read2, $orig_qtrim1, $orig_qtrim2, $readStats) = @{ $readArrayRef };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1470 my $seq = $align;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1471 $seq =~ s/-//g;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1472
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1473 # calculate the edit difference - count indels as only 1 missmatch hence tranform the strings so that all indels have length 1
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1474 my $sam;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1475 my $s1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1476 my $s2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1477 my @s1 = split("", $align);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1478 my @s2 = split("", $ref);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1479
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1480 my $indel = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1481 for(my $i = 0; $i <= $#s1; $i++) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1482 if(($s1[$i] ne "-" && $s2[$i] ne "-") || !$indel) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1483 $s1 .= $s1[$i];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1484 $s2 .= $s2[$i];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1485 $indel = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1486 if($s1[$i] eq "-" || $s2[$i] eq "-") {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1487 $indel = 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1488 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1489 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1490 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1491 my $mq = levenshtein($s1, $s2);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1492
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1493 my $cl = length($seq);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1494 # calculate max diff allowed
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1495 if(!defined $maxdiff{$cl}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1496 { lock(%maxdiff); $maxdiff{$cl} = maxdiff($cl, 0.02, 0.04); }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1497 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1498
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1499 if($mq > $maxdiff{$cl} || $cl == 0) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1500 { lock(%stats); $stats{$reg}{PoorMapQual}++; $stats{PoorMapQual}++; }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1501 if(defined $orig_read1) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1502 $sam = "$read\t4\t*\t0\t0\t*\t*\t0\t0\t$orig_read1\t$orig_qtrim1\tRG:Z:$readGroup\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1503 # send to writebamfile thread
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1504 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1505 lock($WriteBam); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1506 $WriteBam->enqueue($sam);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1507 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1508 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1509 if(defined $orig_read2) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1510 $read =~ s/[1](:[A-Z]+:[0-9]+:[ACTGN]+)$/2$1/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1511 $sam = "$read\t4\t*\t0\t0\t*\t*\t0\t0\t$orig_read2\t$orig_qtrim2\tRG:Z:$readGroup\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1512 # send to writebamfile thread
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1513 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1514 lock($WriteBam); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1515 $WriteBam->enqueue($sam);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1516 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1517 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1518 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1519 $mq = 60;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1520
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1521 # add any stats from earlier
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1522 foreach my $key1 (keys %$readStats) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1523 foreach my $key2 (keys % { $$readStats{$key1} }) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1524 $SamStats{$key1}{$key2} += $$readStats{$key1}{$key2};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1525 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1526 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1527
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1528 # keep track of quality scores
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1529 $SamStats{$reg}{Bases} += $cl;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1530 $SamStats{$reg}{Quality} += $_ for sanger2phred($qual);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1531
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1532 # keep track of read lengths
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1533 if(defined $minLength{$reg}) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1534 $minLength{$reg} = min([$cl, $minLength{$reg}]);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1535 $maxLength{$reg} = max([$cl, $maxLength{$reg}]);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1536 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1537 $minLength{$reg} = $cl;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1538 $maxLength{$reg} = $cl;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1539 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1540
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1541 # keep track of strand and bases seen
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1542 if($strand eq 'F') {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1543 $SamStats{$reg}{NumberForward}++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1544 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1545 $SamStats{$reg}{NumberReverse}++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1546 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1547 $SamStats{$reg}{TotalA} += ($seq =~ tr/A//);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1548 $SamStats{$reg}{TotalC} += ($seq =~ tr/C//);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1549 $SamStats{$reg}{TotalG} += ($seq =~ tr/G//);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1550 $SamStats{$reg}{TotalT} += ($seq =~ tr/T//);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1551 $SamStats{$reg}{TotalN} += ($seq =~ tr/N//);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1552
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1553 # keep count the number of reads for each contig
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1554 $SamStats{$reg}{Reads}++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1555
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1556 # calculate strand info
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1557 if($strand eq 'F') {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1558 $strand = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1559 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1560 $strand = 16;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1561 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1562 # calculate cigar
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1563 my $cigar = "";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1564 my $M = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1565 my $D = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1566 my $I = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1567 my $Subs = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1568 for(my $i = 0; $i < length($align); $i++) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1569 if(substr($align, $i, 1) eq "-") {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1570 $D++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1571 if($M) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1572 $cigar .= "${M}M";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1573 $M = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1574 } elsif($I) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1575 $cigar .= "${I}I";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1576 $I = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1577 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1578 } elsif(substr($ref, $i, 1) eq "-") {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1579 $I++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1580 if($M) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1581 $cigar .= "${M}M";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1582 $M = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1583 } elsif($D) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1584 $cigar .= "${D}D";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1585 $D = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1586 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1587 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1588 $M++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1589 if($D) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1590 $cigar .= "${D}D";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1591 $D = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1592 } elsif($I) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1593 $cigar .= "${I}I";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1594 $I = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1595 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1596 $Subs++ if substr($ref, $i, 1) ne substr($align, $i, 1);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1597 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1598 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1599 if($M) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1600 $cigar .= "${M}M";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1601 } elsif($I) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1602 $cigar .= "${I}I";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1603 } elsif($D) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1604 $cigar .= "${D}D";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1605 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1606
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1607 # construct sam file line
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1608 $sam = "$read\t$strand\t$refLength{$reg}{newReg}\t$rstart\t$mq\t$cigar\t*\t0\t0\t$seq\t$qual\tRG:Z:$readGroup\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1609
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1610 # send to writebamfile thread
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1611 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1612 lock($WriteBam); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1613 $WriteBam->enqueue($sam);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1614 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1615 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1616
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1617 #grab the next read
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1618 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1619 lock($Samfile); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1620 $readArrayRef = $Samfile->dequeue();
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1621 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1622 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1623
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1624 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1625 # add stats from earlier
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1626 lock(%stats);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1627 foreach my $key1 (keys %SamStats) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1628 $stats{$key1}{MinLength} = min([$minLength{$key1}, $stats{$key1}{MinLength}]);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1629 $stats{$key1}{MaxLength} = max([$maxLength{$key1}, $stats{$key1}{MaxLength}]);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1630 foreach my $key2 (keys % { $SamStats{$key1} }) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1631 $stats{$key1}{$key2} += $SamStats{$key1}{$key2};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1632 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1633 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1634
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1635 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1636 print "Closing Sam File Line Generator\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1637
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1638 return;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1639 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1640
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1641 # perform the alignment
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1642 sub nwAlign {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1643 my ($ToAlign, $Aligned, $regions) = @_;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1644 my $readArrayRef;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1645 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1646 lock($ToAlign); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1647 $readArrayRef = $ToAlign->dequeue();
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1648 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1649
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1650 while(defined $readArrayRef) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1651 # we have picked a contig
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1652 # check strand and take rev comp of ref if necessary
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1653 my ($ref, $left, $right, $read, $qtrim1, $qtrim2, $contig, $strand, $conc) = @{ $readArrayRef };
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1654 my %align;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1655 my $name;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1656 #! Parameters
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1657 #! ClustalW Parameters
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1658 my $matchScore = 1.9;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1659 my $missmatch_penalty = -0.4;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1660 my $gapPenOpen = -10.0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1661 my $gapPenExt = -0.4;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1662 my $leftAndRightGapPen = true;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1663 my $debug = false;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1664 my $cons = "";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1665 my $align1= "";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1666 my $align2= "";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1667 my $align3= "";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1668 my $score = 0.0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1669 my $singleAlign = false; #aligning 2 or 3 sequences
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1670 if($conc)
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1671 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1672 #got to concatenate seqs and align
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1673 $score = align3($ref, $right, $left, $align1, $align2, $align3, $matchScore, $missmatch_penalty, $gapPenOpen, $gapPenExt, $leftAndRightGapPen, $debug);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1674 $name = "$left.$right";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1675 print "$contig\n$align1\n$align2\n$align3\n\n" if defined $$regions{$contig};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1676 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1677 elsif(!defined $right)
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1678 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1679 #align with left
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1680 $score = align2($ref, $left, $align1, $align2, $matchScore, $missmatch_penalty, $gapPenOpen, $gapPenExt, false, true, $debug);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1681 $singleAlign = true;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1682 $name = $left;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1683 print "$contig\n$align1\n$align2\n\n" if defined $$regions{$contig};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1684 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1685 elsif(!defined $left)
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1686 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1687 #align with right
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1688 $score = align2($ref, $right, $align1, $align2, $matchScore, $missmatch_penalty, $gapPenOpen ,$gapPenExt ,true, false, $debug);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1689 $singleAlign = true;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1690 $name = $right;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1691 print "$contig\n$align1\n$align2\n\n" if defined $$regions{$contig};
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1692 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1693
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1694 # add aligned reads to queue
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1695 # cache results
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1696 my $type;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1697 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1698 lock($Aligned); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1699
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1700 my @array;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1701 if($singleAlign && !defined $right)
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1702 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1703 @array = ($read, $align1, $align2, $qtrim1, undef, undef, undef, $contig, $strand, $conc);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1704 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1705 lock(%cacheAlign);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1706 $cacheAlign{$contig}{R1}{$name} = &share( [] );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1707 @{ $cacheAlign{$contig}{R1}{$name} } = @array;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1708 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1709 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1710 elsif($singleAlign && !defined $left )
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1711 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1712 @array = ($read, undef, undef, undef, $align1, $align2, $qtrim2, $contig, $strand, $conc);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1713 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1714 lock(%cacheAlign);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1715 $cacheAlign{$contig}{R2}{$name} = &share( [] );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1716 @{ $cacheAlign{$contig}{R2}{$name} } = @array;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1717 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1718 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1719 else
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1720 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1721 @array = ($read, $align1, $align2, $qtrim1, $align1, $align3, $qtrim2, $contig, $strand, $conc);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1722 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1723 lock(%cacheAlign);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1724 $cacheAlign{Conc}{$name} = &share( [] );
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1725 @{ $cacheAlign{Conc}{$name} } = @array;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1726 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1727 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1728 $Aligned->enqueue(\@array);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1729 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1730
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1731 #grab the next read
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1732 {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1733 lock($ToAlign); # Keep other threads from changing the queue's contents
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1734 $readArrayRef = $ToAlign->dequeue();
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1735 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1736 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1737 # clean up
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1738 print "Closing NW2 Aligner\n";
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1739 return;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1740 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1741
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1742 # function to calculat the minimum edit distance threshold (taken from bwa)
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1743 sub maxdiff {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1744 my ($l, $err, $thres) = @_;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1745 my $lambda = exp(-$l*$err);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1746 my $y = 1.0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1747 my $x = 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1748 my $sum = $lambda;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1749 for(my $k = 1; $k < 1000; $k++) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1750 $y *= $l * $err;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1751 $x *= $k;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1752 $sum += $lambda * $y / $x;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1753 if(1.0 - $sum < $thres) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1754 return $k;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1755 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1756 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1757 return 2;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1758 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1759
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1760 # using approx matching return the positions in $list of those most closely matching $string
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1761 sub amatch {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1762 my ($string, $list) = @_;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1763 my @d = adistr($string, @$list);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1764 my @d_index = sort { abs($d[$a]) <=> abs($d[$b]) } 0..$#$list;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1765
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1766 return if $d[$d_index[0]] > 0.25;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1767
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1768 my @pos = ($d_index[0]);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1769 my $i = 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1770 while($i <= $#d_index && $d[$d_index[0]] == $d[$d_index[$i]]) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1771 push @pos, $d_index[$i];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1772 $i++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1773 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1774 return @pos;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1775 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1776
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1777 # a sorting algorithm to sort the chromosomes
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1778 sub sortChrom {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1779 my ($achr, $astart, $aend) = split(":|-", $a);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1780 my ($bchr, $bstart, $bend) = split(":|-", $b);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1781 $achr =~ s/X/23/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1782 $achr =~ s/Y/24/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1783 $achr =~ s/MT?/25/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1784 $bchr =~ s/X/23/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1785 $bchr =~ s/Y/24/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1786 $bchr =~ s/MT?/25/;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1787 if($achr < $bchr) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1788 return -1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1789 } elsif($bchr < $achr) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1790 return 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1791 } elsif($astart < $bstart) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1792 return -1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1793 } elsif($bstart < $astart) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1794 return 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1795 } elsif($aend < $bend) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1796 return -1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1797 } elsif($bend < $aend) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1798 return 1;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1799 } else {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1800 return 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1801 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1802 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1803
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1804 # pass a code reference
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1805 sub modify {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1806 my($text, $code) = @_;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1807 $code->($text);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1808 return $text;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1809 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1810
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1811 sub split_region {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1812 my ($chr, $start, $end) = split(":|-", shift);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1813 $end = $start unless defined $end;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1814 return($chr, $start, $end);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1815 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1816
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1817 sub error_rate {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1818 my ($x, $y) = @_;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1819 my @x = split("", $x);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1820 my @y = split("", $y);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1821 my $s = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1822 my $l = 0;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1823 for(my $i = 0; $i <= $#x; $i++) {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1824 if($x[$i] ne "-" && $y[$i] ne "-") {
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1825 $s++ if $x[$i] ne $y[$i];
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1826 $l++;
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1827 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1828 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1829 return($s / $l);
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1830 }
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1831
d32bddcff685 Uploaded
fcaramia
parents:
diff changeset
1832