0
|
1 package SVUtil;
|
|
2
|
|
3 use strict;
|
|
4 use Carp;
|
|
5 use English;
|
|
6 use Bio::SeqIO;
|
|
7 use Bio::DB::Sam;
|
|
8 use File::Spec;
|
|
9 use File::Path qw(remove_tree);
|
|
10 use File::Temp qw/ tempfile tempdir /;
|
|
11 use Cwd;
|
|
12 use SCValidator qw(LEFT_CLIP RIGHT_CLIP);
|
|
13 require Exporter;
|
|
14 our @ISA = ('Exporter');
|
|
15 our @EXPORT = qw(is_new_pair clip_fa_file prepare_reads parse_range
|
|
16 get_input_bam get_work_dir is_PCR_dup read_fa_file find_smallest_cover
|
|
17 get_sclip_reads rev_comp);
|
|
18 our @EXPORT_OK = qw($use_scratch $spacer_seq $spcer_seq_len $work_dir);
|
|
19 our $use_scratch = 1;
|
|
20 our $work_dir;
|
|
21
|
|
22 sub rev_comp {
|
|
23 my $str = shift;
|
|
24 $str = reverse $str;
|
|
25 $str =~ tr/ACTG/TGAC/;
|
|
26 return $str;
|
|
27 }
|
|
28 # something about binormial distribution
|
|
29 sub choose {
|
|
30 my ($n,$k) = @_;
|
|
31 my ($result,$j) = (1,1);
|
|
32
|
|
33 return 0 if $k>$n||$k<0;
|
|
34 $k = ($n - $k) if ($n - $k) <$k;
|
|
35
|
|
36 while ($j <= $k ) {
|
|
37 $result *= $n--;
|
|
38 $result /= $j++;
|
|
39 }
|
|
40 return $result;
|
|
41 }
|
|
42
|
|
43 sub dbinorm { #desity funtion
|
|
44 my($n, $k, $p) = @_;
|
|
45 return unless($n > 0 && $k >= 0 && $n >= $k && $p >= 0 && $p <=1);
|
|
46 my $prob = log(choose($n, $k)) + ($n-$k)*log(1-$p) + $k*log($p);
|
|
47 return exp($prob);
|
|
48 }
|
|
49
|
|
50 sub find_smallest_cover {
|
|
51 my ($c, $p, $crit) = @_;
|
|
52 my @s_cover;
|
|
53 for( my $i = 1; $i < $c; $i++) {
|
|
54 my $n = 1;
|
|
55 while(1) {
|
|
56 my $tmp = 0;
|
|
57 for( my $j = 0; $j <= $i && $j < $n; $j++) {
|
|
58 $tmp += dbinorm($n, $j, $p);
|
|
59 }
|
|
60 if( $tmp < $crit) {
|
|
61 $s_cover[$i] = $n - 1;
|
|
62 last;
|
|
63 }
|
|
64 $n++;
|
|
65 }
|
|
66 }
|
|
67 return \@s_cover;
|
|
68 }
|
|
69
|
|
70
|
|
71
|
|
72 sub read_fa_file {
|
|
73 my $file = shift;
|
|
74 my $in = Bio::SeqIO->new(-file => $file, -format => 'Fasta');
|
|
75 my %seqs;
|
|
76 while( my $seq=$in->next_seq()) {
|
|
77 $seqs{$seq->display_id} = $seq->seq;
|
|
78 }
|
|
79 return \%seqs;
|
|
80 }
|
|
81
|
|
82 # rmdup: remove PCR dumplication related code
|
|
83 # you can replace it using your own criteria
|
|
84 sub is_new_pair {
|
|
85 my ($a, $sclip_len, $pairs) = @_;
|
|
86 foreach my $p (@{$pairs}) {
|
|
87 return undef if($a->start == $p->[0] && $a->end == $p->[1] &&
|
|
88 $a->mate_start == $p->[2] && $a->mate_end == $p->[3] && $sclip_len == $p->[4]);
|
|
89 }
|
|
90 return 1;
|
|
91 }
|
|
92
|
|
93 #
|
|
94 #clipping sequnce related code
|
|
95 #
|
|
96 # the spacer sequence, you can replace it with your own one
|
|
97 my $spacer_seq = 'ACGTACGT' . 'CGTA' x 6 . 'ATGCATGC';
|
|
98 my $spacer_seq_len = length($spacer_seq);
|
|
99
|
|
100 # clip the reads that with spacer_seq at one end
|
|
101 sub clip_fa_file {
|
|
102 my ($file, $ort) = @_;
|
|
103 my $out = $file . ".clip.fa";
|
|
104 my $in = Bio::SeqIO->new( -file => $file, -format => 'Fasta');
|
|
105 open my $OUT, ">$out" or croak "can't open $file.fa:$OS_ERROR";
|
|
106 while( my $seq=$in->next_seq()) {
|
|
107 print $OUT ">", $seq->display_id, "\n";
|
|
108 if($ort eq RIGHT_CLIP) {
|
|
109 print $OUT $seq->subseq($spacer_seq_len + 1, $seq->length), "\n";
|
|
110 }
|
|
111 else {
|
|
112 print $OUT $seq->subseq(1, $seq->length - $spacer_seq_len), "\n";
|
|
113 }
|
|
114 }
|
|
115 close($OUT);
|
|
116 return $out;
|
|
117 }
|
|
118
|
|
119 # prepare the reads into a fasta file
|
|
120 # due to most assembler can't handle short read well, we add
|
|
121 # the spacer_seq to the start/end of each sequence arrording to
|
|
122 # the softclip postion
|
|
123 sub prepare_reads {
|
|
124 my($r_reads, $clip) = @_;
|
|
125 my ($fh, $fa_name) = tempfile( DIR => $work_dir, SUFFIX => '.fa');
|
|
126 my $qual_name = $fa_name . ".qual";
|
|
127 my $spacer_qual = ' 50 ' x $spacer_seq_len;
|
|
128
|
|
129 open my $qual_fh, ">$qual_name" or croak "can't open $qual_name:$OS_ERROR";
|
|
130 foreach my $r (@{$r_reads}) {
|
|
131 print $fh ">", $r->{name}, "\n";
|
|
132 my $tmp = $r->{seq};
|
|
133 $tmp = defined $clip ?
|
|
134 ($clip == RIGHT_CLIP ? $spacer_seq . $tmp : $tmp . $spacer_seq) :
|
|
135 $tmp;
|
|
136 print $fh $tmp, "\n";
|
|
137 next unless $r->{qual};
|
|
138 print $qual_fh ">", $r->{name}, "\n";
|
|
139 $tmp = join(" ", @{$r->{qual}});
|
|
140 $tmp = defined $clip ?
|
|
141 ($clip == RIGHT_CLIP ? $spacer_qual . $tmp : $tmp . $spacer_qual) :
|
|
142 $tmp;
|
|
143 print $qual_fh $tmp, "\n";
|
|
144 }
|
|
145 close $qual_fh;
|
|
146 return $fa_name;
|
|
147 }
|
|
148
|
|
149 # check PCR duplicate
|
|
150 sub is_PCR_dup {
|
|
151 my ($a, $pairs, $sclip_len) = @_;
|
|
152 my ($s, $e, $ms, $me ) = ($a->start, $a->end, $a->mate_start ? $a->mate_start : 0,
|
|
153 $a->mate_end ? $a->mate_end : 0);
|
|
154 foreach my $p (@{$pairs}) {
|
|
155 return 1 if($s == $p->[0] && $e == $p->[1] &&
|
|
156 $ms == $p->[2] && $me == $p->[3] && $sclip_len == $p->[4]);
|
|
157 }
|
|
158 return;
|
|
159 }
|
|
160
|
|
161 # parse the range of input, format is: chr:start-end
|
|
162 # start and end is optional
|
|
163 sub parse_range {
|
|
164 my $range = shift;
|
|
165 my ($chr, $start, $end);
|
|
166 my @field = split /:/, $range;
|
|
167 $chr = $field[0];
|
|
168 # $chr = substr($chr, 3) if($chr =~ /^chr/);
|
|
169 if(@field > 1) {
|
|
170 @field = split /-/, $field[1];
|
|
171 $start = $field[0];
|
|
172 $end = $field[1] if($field[1]);
|
|
173 if($start !~ m/^\d+$/ or $end !~ m/^\d+$/) {
|
|
174 croak "wrong range format, need to be: chr:start-end";
|
|
175 }
|
|
176 }
|
|
177 return ($chr, $start, $end);
|
|
178 }
|
|
179
|
|
180 # St. Jude only
|
|
181 # grab the bam file from the bam dir
|
|
182 my $raw_bam_dir = "/nfs_exports/genomes/1/PCGP/BucketRaw";
|
|
183 sub get_input_bam {
|
|
184 my $sample = shift;
|
|
185 my ($group) = $sample =~ /(.*?)\d+/;
|
|
186 my $tmp_bam_dir = File::Spec->catdir($raw_bam_dir, $group);
|
|
187 opendir(my $dh, $tmp_bam_dir) or croak "can't open directory: $raw_bam_dir: $OS_ERROR";
|
|
188 my @files = grep { /^$sample-.*bam$/ } readdir($dh);
|
|
189 close $dh;
|
|
190 return File::Spec->catfile($raw_bam_dir, $group, $files[0]);
|
|
191 }
|
|
192
|
|
193 #scratch related code
|
|
194 sub get_work_dir {
|
|
195 my %options = @_;
|
|
196 my $scratch_dir;
|
|
197 if($options{-SCRATCH}){
|
|
198 $scratch_dir = (exists $options{-DIR} ? $options{-DIR} :
|
|
199 (-e '/scratch_space' ? '/scratch_space' : undef) );
|
|
200 }
|
|
201
|
|
202 my $dir;
|
|
203 if($scratch_dir) {
|
|
204 $scratch_dir = File::Spec->rel2abs($scratch_dir);
|
|
205 if($ENV{'JOB_ID'}) {
|
|
206 $dir = File::Spec->catdir($scratch_dir, $ENV{'JOB_ID'})
|
|
207 }
|
|
208 else {
|
|
209 $dir = tempdir(DIR => $scratch_dir);
|
|
210 }
|
|
211 }
|
|
212 else {
|
|
213 $dir = tempdir();
|
|
214 }
|
|
215 mkdir($dir) if( ! -e $dir);
|
|
216 $work_dir = $dir;
|
|
217 return $dir;
|
|
218 }
|
|
219
|
|
220
|
|
221 sub get_sclip_reads {
|
|
222 my %args = @_;
|
|
223 my ($sam, $chr, $start, $end, $clip, $min_len, $validator, $paired, $rmdup, $full_seq, $extra_base) =
|
|
224 ($args{-SAM}, $args{-CHR}, $args{-START}, $args{-END}, $args{-CLIP},
|
|
225 $args{-MINCLIPLEN}, $args{-VALIDATOR}, $args{-PAIRED}, $args{-RMDUP},
|
|
226 $args{-FULLSEQ}, $args{-EXTRA});
|
|
227 $extra_base = 0 unless $extra_base;
|
|
228 my @rtn;
|
|
229 $min_len = 0 unless $min_len;
|
|
230 my $max_sclip_len = 0;
|
|
231 my $range = $chr;
|
|
232 $range = $chr . ":" . $start . "-" . $end if($start && $end);
|
|
233 my @pairs;
|
|
234 my $strand_validator = $paired ? 1 : 0;
|
|
235
|
|
236 $sam->fetch($range, sub {
|
|
237 my $a = shift;
|
|
238 my $cigar_str = $a->cigar_str;
|
|
239 my @cigar_array = @{$a->cigar_array};
|
|
240 return if($cigar_str !~ m/S/);
|
|
241 return unless($a->start && $a->end);
|
|
242 #return if(!$a->proper_pair && $paired);
|
|
243 if($paired && !$strand_validator) {
|
|
244 $validator->add_validator("strand_validator");
|
|
245 }
|
|
246
|
|
247 if($paired && !$a->proper_pair) {
|
|
248 $validator->remove_validator("strand_validator");
|
|
249 $strand_validator = 0;
|
|
250 }
|
|
251 my ($sclip_len, $pos, $seq, $qual);
|
|
252 my @tmp = $a->qscore;
|
|
253 if($cigar_array[0]->[0] eq 'S' && $clip == LEFT_CLIP ) {
|
|
254 $pos = $a->start;
|
|
255 return if($pos < $start or $pos > $end); # the softclipping position is not in range
|
|
256 $sclip_len = $cigar_array[0]->[1];
|
|
257 #print $cigar_str, "\t", scalar @pairs, "\n";
|
|
258 return unless($validator && $validator->validate($a, LEFT_CLIP));
|
|
259 $max_sclip_len = $sclip_len if($max_sclip_len < $sclip_len);
|
|
260 return if(@pairs > 0 && $rmdup && is_PCR_dup($a, \@pairs, $sclip_len)); #it's a PCR dumplicate
|
|
261 $seq = $full_seq ? $a->query->dna : substr($a->query->dna, 0, $sclip_len + $extra_base);
|
|
262 @tmp = $full_seq ? @tmp : @tmp[0 .. ($sclip_len -1 + $extra_base)];
|
|
263
|
|
264 push @pairs, [$a->start, $a->end, $a->mate_start ? $a->mate_start : 0,
|
|
265 $a->mate_end ? $a->mate_end : 0, $sclip_len] if($rmdup);
|
|
266 my $qscore = $a->qscore;
|
|
267
|
|
268 push @rtn, {
|
|
269 name => $a->qname,
|
|
270 seq => $seq,
|
|
271 qual => \@tmp,
|
|
272 full_seq => $a->query->dna,
|
|
273 full_qual => $qscore,
|
|
274 pos => $pos,
|
|
275 };
|
|
276 }
|
|
277 if($cigar_array[$#cigar_array]->[0] eq 'S' && $clip == RIGHT_CLIP ) {
|
|
278 $pos = $a->end;
|
|
279 return if($pos < $start or $pos > $end); # the softclipping position is not in range
|
|
280 # print $cigar_str, "\t", scalar @pairs, "\n";
|
|
281 return unless $validator->validate($a, RIGHT_CLIP);
|
|
282 $sclip_len = $cigar_array[$#cigar_array]->[1];
|
|
283 $max_sclip_len = $sclip_len if($max_sclip_len < $sclip_len);
|
|
284 return if(@pairs > 0 && $rmdup && is_PCR_dup($a, \@pairs, $sclip_len)); #it's a PCR dumplicate
|
|
285 $seq = $full_seq ? $a->qseq : substr($a->qseq, $a->l_qseq - $sclip_len - $extra_base );
|
|
286 @tmp = $full_seq ? @tmp : @tmp[($a->l_qseq - $sclip_len - $extra_base) .. ($a->l_qseq - 1)];
|
|
287 push @pairs, [$a->start, $a->end, $a->mate_start ? $a->mate_start : 0,
|
|
288 $a->mate_end ? $a->mate_end : 0, $sclip_len] if($rmdup);
|
|
289 my $qscore = $a->qscore;
|
|
290 push @rtn, {
|
|
291 name => $a->qname,
|
|
292 seq => $seq,
|
|
293 qual => \@tmp,
|
|
294 full_seq => $a->query->dna,
|
|
295 full_qual => $qscore,
|
|
296 pos => $pos,
|
|
297 };
|
|
298 }
|
|
299 }
|
|
300 );
|
|
301 if($max_sclip_len >= $min_len) {
|
|
302 return @rtn;
|
|
303 }
|
|
304 else{
|
|
305 undef @rtn; return @rtn;
|
|
306 }
|
|
307 }
|
|
308
|
|
309 my $start_dir;
|
|
310 BEGIN {
|
|
311 $start_dir = getcwd;
|
|
312 }
|
|
313 END {
|
|
314 chdir($start_dir);
|
|
315 remove_tree($work_dir) if($use_scratch && $work_dir && $start_dir ne $work_dir);
|
|
316 }
|
|
317
|
|
318 1;
|