comparison SVExtTools.pm @ 0:acc8d8bfeb9a

Uploaded
author jjohnson
date Wed, 08 Feb 2012 16:59:24 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc8d8bfeb9a
1 package SVExtToolsI;
2
3 # this package has three major tools that the program is going to use:
4 # 1. Assembler, cap3 is used for this purpose
5 # 2. Mapper, blat server version is used and highly recommended
6 # 3. Aligner, any one will work, but the program need to parse the output
7 sub new {
8 my $class = shift;
9 my %param = @_;
10 my $self = {};
11 $self->{PRG} = undef;
12 $self->{OPTIONS} = undef;
13 foreach my $k (keys %param) {
14 my $k1 = $k;
15 $k1 = substr($k, 1) if($k =~ m/^-/);
16 $self->{$k1} = $param{$k};
17 }
18
19 bless $self, ref($class) || $class;
20 return $self;
21 }
22
23 sub program {
24 my ($self, $value) = @_;
25 $self->{PRG} = $value if($value);
26 return $self->{PRG};
27 }
28
29 sub options {
30 my ($self, $value) = @_;
31 $self->{OPTIONS} = $value if($value);
32 return $self->{OPTIONS};
33 }
34
35 sub run {
36 print STDERR "you need implement your own run method";
37 }
38
39 package Assembler;
40 use strict;
41 use Carp;
42 use English;
43 use base qw(SVExtToolsI);
44
45 sub new {
46 my ($class, @args) = @_;
47 my $self = $class->SUPER::new(@args);
48 $self->{PRG} = "cap3" if(!$self->{PRG});
49 $self->{OPTIONS} = "" if(!$self->{OPTIONS});
50 return $self;
51 }
52
53 # the run function of Assembler returns the contig file and
54 # a hashref with the number of reads in each contig
55
56 sub run {
57 my($self, $file) = @_;
58 croak "$self->{PRG} need an input fasta file to do the assembly" if(!$file);
59 if(-s $file == 0) {
60 print STDERR "$file is of size 0";
61 return;
62 }
63 system(join(" ", ($self->{PRG}, $file, $self->{OPTIONS})));
64 my( $r_count, $r_reads ) = _count_reads("$file.cap.ace");
65 return ("$file.cap.contigs", $r_count, $r_reads, "$file.cap.singlets");
66 }
67
68 sub _count_reads {
69 my $file = shift;
70 my %count;
71 my %reads;
72 open my $ACE, "<$file" or croak "Can't open $file:$OS_ERROR";
73 my $contig_name;
74 while( my $line = <$ACE> ) {
75 if($line =~ m/^CO\s(.*?)\s\d+\s(\d+)/){
76 $contig_name = $1;
77 $count{$contig_name} = $2;
78 $reads{$contig_name} = [];
79 }
80 if($line =~ m/^RD\s(.*?)\s/) {
81 push @{$reads{$contig_name}}, $1;
82 }
83 }
84 close($ACE);
85 return (\%count, \%reads);
86 }
87
88 package Mapper;
89 use strict;
90 use Carp;
91 use Data::Dumper;
92 use English;
93 use Bio::SearchIO;
94 use Bio::SeqIO;
95 use base qw(SVExtToolsI);
96
97 sub new {
98 my ($class, @args) = @_;
99 my $self = $class->SUPER::new(@args);
100 $self->{PRG} = "gfClient sjblat 50000" if(!$self->{PRG});
101 $self->{OPTIONS} = "-out=psl -nohead" if(!$self->{OPTIONS});
102 $self->{MAX_SCORE_DIFF} = 20 if(!$self->{MAX_SCORE_DIFF});
103 $self->{MAX_NUM_HITS} = 10 if(!$self->{MAX_NUM_HITS});
104 return $self;
105 }
106
107 sub max_score_diff {
108 my $self = shift;
109 my $value = shift;
110 $self->{MAX_SCORE_DIFF} = $value if($value);
111 return $self->{MAX_SCORE_DIFF};
112 }
113
114 sub max_num_hits {
115 my $self = shift;
116 my $value = shift;
117 $self->{MAX_NUM_HITS} = $value if($value);
118 return $self->{MAX_NUM_HITS};
119 }
120
121 sub run {
122 my $self = shift;
123 my %param = @_;
124 croak "Missing QUERY parameter for $self->{PRG}" if(!$param{-QUERY});
125 my $output = $param{-OUTPUT} || $param{-QUERY} . ".psl";
126 my $options = $param{-OPTIONS} || $self->{OPTIONS};
127 system(join(" ", ($self->{PRG}, $self->{BIT2_DIR}, $param{-QUERY}, $output, $options)));
128 system("sort -k 10g -k 1nr $output -o $output.sorted");
129 open my $SORTED, "<$output.sorted" or croak "can't open $output.sorted:$OS_ERROR";
130 open my $PSL, ">$output" or croak "can't open $output:$OS_ERROR";
131 my $n = 0;
132 while( my $line = <$SORTED>) {
133 print $PSL $line;
134 $n++;
135 last if($n > 10);
136 }
137 close($PSL);
138 close($SORTED);
139 return $self->select_target($output);
140 }
141
142
143 sub select_target {
144 my $self = shift;
145 my $file = shift;
146 my %targets;
147 my $parser = Bio::SearchIO->new( -file => $file, -format => 'psl');
148 while( my $result = $parser->next_result ) {
149 my $qname = $result->query_name;
150 $targets{$qname} = [];
151 # $result->sort_hits(sub {$Bio::Search::Result::ResultI::b -> matches('id') <=>
152 # $Bio::Search::Result::ResultI::a ->matches('id')});
153 my $n_hits = 0;
154 my $max_score = 0;
155 # my $perfect_only;
156 while( my $hit = $result->next_hit ) {
157 while( my $hsp = $hit->next_hsp) {
158 my ($n_matches) = $hsp->matches;
159 last if($max_score - $n_matches > $self->{MAX_SCORE_DIFF});
160 #last if($perfect_only && $hit->query_length != $n_matches);
161 my @blocks;
162 foreach my $bl (@{$hsp->gap_blocks('hit')}) {
163 push @blocks, [$bl->[0], $bl->[0] + $bl->[1]];
164 }
165 push @{$targets{$qname}}, {
166 tchr => $hit->name,
167 tstart => $hsp->start('hit'),
168 tend => $hsp->end('hit'),
169 qstart => $hsp->start('query'),
170 qend => $hsp->end('query'),
171 qstrand => $hsp->strand('query') == 1 ? '+' : '-',
172 matches => $n_matches,
173 blocks => \@blocks,
174 perfect => $hit->query_length == $n_matches ? 1 : 0,
175 };
176 #$perfect_only = 1 if($hit->query_length == $n_matches);
177 $max_score = $n_matches if($max_score < $n_matches);
178 $n_hits++;
179 }
180 $hit->rewind;
181 last if($n_hits >= $self->{MAX_NUM_HITS});
182 }
183 }
184 return \%targets;
185 }
186
187 package Aligner;
188 use strict;
189 use Carp;
190 use English;
191 use Bio::SearchIO;
192 use Bio::SeqIO;
193 use base qw(SVExtToolsI);
194
195 sub new {
196 my ($class, @args) = @_;
197 my $self = $class->SUPER::new(@args);
198 $self->{PRG} = "blat" if(!$self->{PRG});
199 $self->{OPTIONS} = "-tileSize=9 -stepSize=1 -out=psl -nohead -minScore=15"
200 if(!$self->{OPTIONS});
201 return $self;
202 }
203
204 # return best hit for each query reads
205 sub run {
206 my $self = shift;
207 my %param = @_;
208 croak "Missing TARGET parameter for $self->{PRG}" if(!$param{-TARGET});
209 croak "Missing QUERY parameter for $self->{PRG}" if(!$param{-QUERY});
210 my $output = $param{-OUTPUT} || $param{-QUERY} . ".psl";
211 system( join(" ", ($self->{PRG}, $param{-TARGET}, $param{-QUERY}, $output, $self->{OPTIONS})));
212 return $output if($param{-FILE});
213 return _find_best_hit($output);
214 }
215
216 sub _find_best_hit {
217 my $file = shift;
218 my $parser = Bio::SearchIO->new( -file => $file, -format => 'psl');
219 my %best_hit;
220 while( my $result = $parser->next_result ) {
221 $result->sort_hits(sub {$Bio::Search::Result::ResultI::b -> matches('id') <=>
222 $Bio::Search::Result::ResultI::a ->matches('id')});
223 my $hit = $result->next_hit; # the best hit
224 my $hsp = $hit->next_hsp;
225 $best_hit{$result->query_name} = $hit;
226 $hit->rewind;
227 }
228 return \%best_hit;
229 }
230
231 1;