Mercurial > repos > jjohnson > crest
view SVExtTools.pm @ 1:4f6952e0af48 default tip
CREST - add crest.loc.sample
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Wed, 08 Feb 2012 16:08:01 -0600 |
parents | acc8d8bfeb9a |
children |
line wrap: on
line source
package SVExtToolsI; # this package has three major tools that the program is going to use: # 1. Assembler, cap3 is used for this purpose # 2. Mapper, blat server version is used and highly recommended # 3. Aligner, any one will work, but the program need to parse the output sub new { my $class = shift; my %param = @_; my $self = {}; $self->{PRG} = undef; $self->{OPTIONS} = undef; foreach my $k (keys %param) { my $k1 = $k; $k1 = substr($k, 1) if($k =~ m/^-/); $self->{$k1} = $param{$k}; } bless $self, ref($class) || $class; return $self; } sub program { my ($self, $value) = @_; $self->{PRG} = $value if($value); return $self->{PRG}; } sub options { my ($self, $value) = @_; $self->{OPTIONS} = $value if($value); return $self->{OPTIONS}; } sub run { print STDERR "you need implement your own run method"; } package Assembler; use strict; use Carp; use English; use base qw(SVExtToolsI); sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->{PRG} = "cap3" if(!$self->{PRG}); $self->{OPTIONS} = "" if(!$self->{OPTIONS}); return $self; } # the run function of Assembler returns the contig file and # a hashref with the number of reads in each contig sub run { my($self, $file) = @_; croak "$self->{PRG} need an input fasta file to do the assembly" if(!$file); if(-s $file == 0) { print STDERR "$file is of size 0"; return; } system(join(" ", ($self->{PRG}, $file, $self->{OPTIONS}))); my( $r_count, $r_reads ) = _count_reads("$file.cap.ace"); return ("$file.cap.contigs", $r_count, $r_reads, "$file.cap.singlets"); } sub _count_reads { my $file = shift; my %count; my %reads; open my $ACE, "<$file" or croak "Can't open $file:$OS_ERROR"; my $contig_name; while( my $line = <$ACE> ) { if($line =~ m/^CO\s(.*?)\s\d+\s(\d+)/){ $contig_name = $1; $count{$contig_name} = $2; $reads{$contig_name} = []; } if($line =~ m/^RD\s(.*?)\s/) { push @{$reads{$contig_name}}, $1; } } close($ACE); return (\%count, \%reads); } package Mapper; use strict; use Carp; use Data::Dumper; use English; use Bio::SearchIO; use Bio::SeqIO; use base qw(SVExtToolsI); sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->{PRG} = "gfClient sjblat 50000" if(!$self->{PRG}); $self->{OPTIONS} = "-out=psl -nohead" if(!$self->{OPTIONS}); $self->{MAX_SCORE_DIFF} = 20 if(!$self->{MAX_SCORE_DIFF}); $self->{MAX_NUM_HITS} = 10 if(!$self->{MAX_NUM_HITS}); return $self; } sub max_score_diff { my $self = shift; my $value = shift; $self->{MAX_SCORE_DIFF} = $value if($value); return $self->{MAX_SCORE_DIFF}; } sub max_num_hits { my $self = shift; my $value = shift; $self->{MAX_NUM_HITS} = $value if($value); return $self->{MAX_NUM_HITS}; } sub run { my $self = shift; my %param = @_; croak "Missing QUERY parameter for $self->{PRG}" if(!$param{-QUERY}); my $output = $param{-OUTPUT} || $param{-QUERY} . ".psl"; my $options = $param{-OPTIONS} || $self->{OPTIONS}; system(join(" ", ($self->{PRG}, $self->{BIT2_DIR}, $param{-QUERY}, $output, $options))); system("sort -k 10g -k 1nr $output -o $output.sorted"); open my $SORTED, "<$output.sorted" or croak "can't open $output.sorted:$OS_ERROR"; open my $PSL, ">$output" or croak "can't open $output:$OS_ERROR"; my $n = 0; while( my $line = <$SORTED>) { print $PSL $line; $n++; last if($n > 10); } close($PSL); close($SORTED); return $self->select_target($output); } sub select_target { my $self = shift; my $file = shift; my %targets; my $parser = Bio::SearchIO->new( -file => $file, -format => 'psl'); while( my $result = $parser->next_result ) { my $qname = $result->query_name; $targets{$qname} = []; # $result->sort_hits(sub {$Bio::Search::Result::ResultI::b -> matches('id') <=> # $Bio::Search::Result::ResultI::a ->matches('id')}); my $n_hits = 0; my $max_score = 0; # my $perfect_only; while( my $hit = $result->next_hit ) { while( my $hsp = $hit->next_hsp) { my ($n_matches) = $hsp->matches; last if($max_score - $n_matches > $self->{MAX_SCORE_DIFF}); #last if($perfect_only && $hit->query_length != $n_matches); my @blocks; foreach my $bl (@{$hsp->gap_blocks('hit')}) { push @blocks, [$bl->[0], $bl->[0] + $bl->[1]]; } push @{$targets{$qname}}, { tchr => $hit->name, tstart => $hsp->start('hit'), tend => $hsp->end('hit'), qstart => $hsp->start('query'), qend => $hsp->end('query'), qstrand => $hsp->strand('query') == 1 ? '+' : '-', matches => $n_matches, blocks => \@blocks, perfect => $hit->query_length == $n_matches ? 1 : 0, }; #$perfect_only = 1 if($hit->query_length == $n_matches); $max_score = $n_matches if($max_score < $n_matches); $n_hits++; } $hit->rewind; last if($n_hits >= $self->{MAX_NUM_HITS}); } } return \%targets; } package Aligner; use strict; use Carp; use English; use Bio::SearchIO; use Bio::SeqIO; use base qw(SVExtToolsI); sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); $self->{PRG} = "blat" if(!$self->{PRG}); $self->{OPTIONS} = "-tileSize=9 -stepSize=1 -out=psl -nohead -minScore=15" if(!$self->{OPTIONS}); return $self; } # return best hit for each query reads sub run { my $self = shift; my %param = @_; croak "Missing TARGET parameter for $self->{PRG}" if(!$param{-TARGET}); croak "Missing QUERY parameter for $self->{PRG}" if(!$param{-QUERY}); my $output = $param{-OUTPUT} || $param{-QUERY} . ".psl"; system( join(" ", ($self->{PRG}, $param{-TARGET}, $param{-QUERY}, $output, $self->{OPTIONS}))); return $output if($param{-FILE}); return _find_best_hit($output); } sub _find_best_hit { my $file = shift; my $parser = Bio::SearchIO->new( -file => $file, -format => 'psl'); my %best_hit; while( my $result = $parser->next_result ) { $result->sort_hits(sub {$Bio::Search::Result::ResultI::b -> matches('id') <=> $Bio::Search::Result::ResultI::a ->matches('id')}); my $hit = $result->next_hit; # the best hit my $hsp = $hit->next_hsp; $best_hit{$result->query_name} = $hit; $hit->rewind; } return \%best_hit; } 1;