Mercurial > repos > jjohnson > crest
diff SVUtil.pm @ 0:acc8d8bfeb9a
Uploaded
author | jjohnson |
---|---|
date | Wed, 08 Feb 2012 16:59:24 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SVUtil.pm Wed Feb 08 16:59:24 2012 -0500 @@ -0,0 +1,318 @@ +package SVUtil; + +use strict; +use Carp; +use English; +use Bio::SeqIO; +use Bio::DB::Sam; +use File::Spec; +use File::Path qw(remove_tree); +use File::Temp qw/ tempfile tempdir /; +use Cwd; +use SCValidator qw(LEFT_CLIP RIGHT_CLIP); +require Exporter; +our @ISA = ('Exporter'); +our @EXPORT = qw(is_new_pair clip_fa_file prepare_reads parse_range + get_input_bam get_work_dir is_PCR_dup read_fa_file find_smallest_cover + get_sclip_reads rev_comp); +our @EXPORT_OK = qw($use_scratch $spacer_seq $spcer_seq_len $work_dir); +our $use_scratch = 1; +our $work_dir; + +sub rev_comp { + my $str = shift; + $str = reverse $str; + $str =~ tr/ACTG/TGAC/; + return $str; +} +# something about binormial distribution +sub choose { + my ($n,$k) = @_; + my ($result,$j) = (1,1); + + return 0 if $k>$n||$k<0; + $k = ($n - $k) if ($n - $k) <$k; + + while ($j <= $k ) { + $result *= $n--; + $result /= $j++; + } + return $result; +} + +sub dbinorm { #desity funtion + my($n, $k, $p) = @_; + return unless($n > 0 && $k >= 0 && $n >= $k && $p >= 0 && $p <=1); + my $prob = log(choose($n, $k)) + ($n-$k)*log(1-$p) + $k*log($p); + return exp($prob); +} + +sub find_smallest_cover { + my ($c, $p, $crit) = @_; + my @s_cover; + for( my $i = 1; $i < $c; $i++) { + my $n = 1; + while(1) { + my $tmp = 0; + for( my $j = 0; $j <= $i && $j < $n; $j++) { + $tmp += dbinorm($n, $j, $p); + } + if( $tmp < $crit) { + $s_cover[$i] = $n - 1; + last; + } + $n++; + } + } + return \@s_cover; +} + + + +sub read_fa_file { + my $file = shift; + my $in = Bio::SeqIO->new(-file => $file, -format => 'Fasta'); + my %seqs; + while( my $seq=$in->next_seq()) { + $seqs{$seq->display_id} = $seq->seq; + } + return \%seqs; +} + +# rmdup: remove PCR dumplication related code +# you can replace it using your own criteria +sub is_new_pair { + my ($a, $sclip_len, $pairs) = @_; + foreach my $p (@{$pairs}) { + return undef if($a->start == $p->[0] && $a->end == $p->[1] && + $a->mate_start == $p->[2] && $a->mate_end == $p->[3] && $sclip_len == $p->[4]); + } + return 1; +} + +# +#clipping sequnce related code +# +# the spacer sequence, you can replace it with your own one +my $spacer_seq = 'ACGTACGT' . 'CGTA' x 6 . 'ATGCATGC'; +my $spacer_seq_len = length($spacer_seq); + +# clip the reads that with spacer_seq at one end +sub clip_fa_file { + my ($file, $ort) = @_; + my $out = $file . ".clip.fa"; + my $in = Bio::SeqIO->new( -file => $file, -format => 'Fasta'); + open my $OUT, ">$out" or croak "can't open $file.fa:$OS_ERROR"; + while( my $seq=$in->next_seq()) { + print $OUT ">", $seq->display_id, "\n"; + if($ort eq RIGHT_CLIP) { + print $OUT $seq->subseq($spacer_seq_len + 1, $seq->length), "\n"; + } + else { + print $OUT $seq->subseq(1, $seq->length - $spacer_seq_len), "\n"; + } + } + close($OUT); + return $out; +} + +# prepare the reads into a fasta file +# due to most assembler can't handle short read well, we add +# the spacer_seq to the start/end of each sequence arrording to +# the softclip postion +sub prepare_reads { + my($r_reads, $clip) = @_; + my ($fh, $fa_name) = tempfile( DIR => $work_dir, SUFFIX => '.fa'); + my $qual_name = $fa_name . ".qual"; + my $spacer_qual = ' 50 ' x $spacer_seq_len; + + open my $qual_fh, ">$qual_name" or croak "can't open $qual_name:$OS_ERROR"; + foreach my $r (@{$r_reads}) { + print $fh ">", $r->{name}, "\n"; + my $tmp = $r->{seq}; + $tmp = defined $clip ? + ($clip == RIGHT_CLIP ? $spacer_seq . $tmp : $tmp . $spacer_seq) : + $tmp; + print $fh $tmp, "\n"; + next unless $r->{qual}; + print $qual_fh ">", $r->{name}, "\n"; + $tmp = join(" ", @{$r->{qual}}); + $tmp = defined $clip ? + ($clip == RIGHT_CLIP ? $spacer_qual . $tmp : $tmp . $spacer_qual) : + $tmp; + print $qual_fh $tmp, "\n"; + } + close $qual_fh; + return $fa_name; +} + +# check PCR duplicate +sub is_PCR_dup { + my ($a, $pairs, $sclip_len) = @_; + my ($s, $e, $ms, $me ) = ($a->start, $a->end, $a->mate_start ? $a->mate_start : 0, + $a->mate_end ? $a->mate_end : 0); + foreach my $p (@{$pairs}) { + return 1 if($s == $p->[0] && $e == $p->[1] && + $ms == $p->[2] && $me == $p->[3] && $sclip_len == $p->[4]); + } + return; +} + +# parse the range of input, format is: chr:start-end +# start and end is optional +sub parse_range { + my $range = shift; + my ($chr, $start, $end); + my @field = split /:/, $range; + $chr = $field[0]; +# $chr = substr($chr, 3) if($chr =~ /^chr/); + if(@field > 1) { + @field = split /-/, $field[1]; + $start = $field[0]; + $end = $field[1] if($field[1]); + if($start !~ m/^\d+$/ or $end !~ m/^\d+$/) { + croak "wrong range format, need to be: chr:start-end"; + } + } + return ($chr, $start, $end); +} + +# St. Jude only +# grab the bam file from the bam dir +my $raw_bam_dir = "/nfs_exports/genomes/1/PCGP/BucketRaw"; +sub get_input_bam { + my $sample = shift; + my ($group) = $sample =~ /(.*?)\d+/; + my $tmp_bam_dir = File::Spec->catdir($raw_bam_dir, $group); + opendir(my $dh, $tmp_bam_dir) or croak "can't open directory: $raw_bam_dir: $OS_ERROR"; + my @files = grep { /^$sample-.*bam$/ } readdir($dh); + close $dh; + return File::Spec->catfile($raw_bam_dir, $group, $files[0]); +} + +#scratch related code +sub get_work_dir { + my %options = @_; + my $scratch_dir; + if($options{-SCRATCH}){ + $scratch_dir = (exists $options{-DIR} ? $options{-DIR} : + (-e '/scratch_space' ? '/scratch_space' : undef) ); + } + + my $dir; + if($scratch_dir) { + $scratch_dir = File::Spec->rel2abs($scratch_dir); + if($ENV{'JOB_ID'}) { + $dir = File::Spec->catdir($scratch_dir, $ENV{'JOB_ID'}) + } + else { + $dir = tempdir(DIR => $scratch_dir); + } + } + else { + $dir = tempdir(); + } + mkdir($dir) if( ! -e $dir); + $work_dir = $dir; + return $dir; +} + + +sub get_sclip_reads { + my %args = @_; + my ($sam, $chr, $start, $end, $clip, $min_len, $validator, $paired, $rmdup, $full_seq, $extra_base) = + ($args{-SAM}, $args{-CHR}, $args{-START}, $args{-END}, $args{-CLIP}, + $args{-MINCLIPLEN}, $args{-VALIDATOR}, $args{-PAIRED}, $args{-RMDUP}, + $args{-FULLSEQ}, $args{-EXTRA}); + $extra_base = 0 unless $extra_base; + my @rtn; + $min_len = 0 unless $min_len; + my $max_sclip_len = 0; + my $range = $chr; + $range = $chr . ":" . $start . "-" . $end if($start && $end); + my @pairs; + my $strand_validator = $paired ? 1 : 0; + + $sam->fetch($range, sub { + my $a = shift; + my $cigar_str = $a->cigar_str; + my @cigar_array = @{$a->cigar_array}; + return if($cigar_str !~ m/S/); + return unless($a->start && $a->end); + #return if(!$a->proper_pair && $paired); + if($paired && !$strand_validator) { + $validator->add_validator("strand_validator"); + } + + if($paired && !$a->proper_pair) { + $validator->remove_validator("strand_validator"); + $strand_validator = 0; + } + my ($sclip_len, $pos, $seq, $qual); + my @tmp = $a->qscore; + if($cigar_array[0]->[0] eq 'S' && $clip == LEFT_CLIP ) { + $pos = $a->start; + return if($pos < $start or $pos > $end); # the softclipping position is not in range + $sclip_len = $cigar_array[0]->[1]; + #print $cigar_str, "\t", scalar @pairs, "\n"; + return unless($validator && $validator->validate($a, LEFT_CLIP)); + $max_sclip_len = $sclip_len if($max_sclip_len < $sclip_len); + return if(@pairs > 0 && $rmdup && is_PCR_dup($a, \@pairs, $sclip_len)); #it's a PCR dumplicate + $seq = $full_seq ? $a->query->dna : substr($a->query->dna, 0, $sclip_len + $extra_base); + @tmp = $full_seq ? @tmp : @tmp[0 .. ($sclip_len -1 + $extra_base)]; + + push @pairs, [$a->start, $a->end, $a->mate_start ? $a->mate_start : 0, + $a->mate_end ? $a->mate_end : 0, $sclip_len] if($rmdup); + my $qscore = $a->qscore; + + push @rtn, { + name => $a->qname, + seq => $seq, + qual => \@tmp, + full_seq => $a->query->dna, + full_qual => $qscore, + pos => $pos, + }; + } + if($cigar_array[$#cigar_array]->[0] eq 'S' && $clip == RIGHT_CLIP ) { + $pos = $a->end; + return if($pos < $start or $pos > $end); # the softclipping position is not in range +# print $cigar_str, "\t", scalar @pairs, "\n"; + return unless $validator->validate($a, RIGHT_CLIP); + $sclip_len = $cigar_array[$#cigar_array]->[1]; + $max_sclip_len = $sclip_len if($max_sclip_len < $sclip_len); + return if(@pairs > 0 && $rmdup && is_PCR_dup($a, \@pairs, $sclip_len)); #it's a PCR dumplicate + $seq = $full_seq ? $a->qseq : substr($a->qseq, $a->l_qseq - $sclip_len - $extra_base ); + @tmp = $full_seq ? @tmp : @tmp[($a->l_qseq - $sclip_len - $extra_base) .. ($a->l_qseq - 1)]; + push @pairs, [$a->start, $a->end, $a->mate_start ? $a->mate_start : 0, + $a->mate_end ? $a->mate_end : 0, $sclip_len] if($rmdup); + my $qscore = $a->qscore; + push @rtn, { + name => $a->qname, + seq => $seq, + qual => \@tmp, + full_seq => $a->query->dna, + full_qual => $qscore, + pos => $pos, + }; + } + } + ); + if($max_sclip_len >= $min_len) { + return @rtn; + } + else{ + undef @rtn; return @rtn; + } +} + +my $start_dir; +BEGIN { + $start_dir = getcwd; +} +END { + chdir($start_dir); + remove_tree($work_dir) if($use_scratch && $work_dir && $start_dir ne $work_dir); +} + +1;