Mercurial > repos > jjohnson > crest
comparison SVExtTools.pm @ 0:acc8d8bfeb9a
Uploaded
author | jjohnson |
---|---|
date | Wed, 08 Feb 2012 16:59:24 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc8d8bfeb9a |
---|---|
1 package SVExtToolsI; | |
2 | |
3 # this package has three major tools that the program is going to use: | |
4 # 1. Assembler, cap3 is used for this purpose | |
5 # 2. Mapper, blat server version is used and highly recommended | |
6 # 3. Aligner, any one will work, but the program need to parse the output | |
7 sub new { | |
8 my $class = shift; | |
9 my %param = @_; | |
10 my $self = {}; | |
11 $self->{PRG} = undef; | |
12 $self->{OPTIONS} = undef; | |
13 foreach my $k (keys %param) { | |
14 my $k1 = $k; | |
15 $k1 = substr($k, 1) if($k =~ m/^-/); | |
16 $self->{$k1} = $param{$k}; | |
17 } | |
18 | |
19 bless $self, ref($class) || $class; | |
20 return $self; | |
21 } | |
22 | |
23 sub program { | |
24 my ($self, $value) = @_; | |
25 $self->{PRG} = $value if($value); | |
26 return $self->{PRG}; | |
27 } | |
28 | |
29 sub options { | |
30 my ($self, $value) = @_; | |
31 $self->{OPTIONS} = $value if($value); | |
32 return $self->{OPTIONS}; | |
33 } | |
34 | |
35 sub run { | |
36 print STDERR "you need implement your own run method"; | |
37 } | |
38 | |
39 package Assembler; | |
40 use strict; | |
41 use Carp; | |
42 use English; | |
43 use base qw(SVExtToolsI); | |
44 | |
45 sub new { | |
46 my ($class, @args) = @_; | |
47 my $self = $class->SUPER::new(@args); | |
48 $self->{PRG} = "cap3" if(!$self->{PRG}); | |
49 $self->{OPTIONS} = "" if(!$self->{OPTIONS}); | |
50 return $self; | |
51 } | |
52 | |
53 # the run function of Assembler returns the contig file and | |
54 # a hashref with the number of reads in each contig | |
55 | |
56 sub run { | |
57 my($self, $file) = @_; | |
58 croak "$self->{PRG} need an input fasta file to do the assembly" if(!$file); | |
59 if(-s $file == 0) { | |
60 print STDERR "$file is of size 0"; | |
61 return; | |
62 } | |
63 system(join(" ", ($self->{PRG}, $file, $self->{OPTIONS}))); | |
64 my( $r_count, $r_reads ) = _count_reads("$file.cap.ace"); | |
65 return ("$file.cap.contigs", $r_count, $r_reads, "$file.cap.singlets"); | |
66 } | |
67 | |
68 sub _count_reads { | |
69 my $file = shift; | |
70 my %count; | |
71 my %reads; | |
72 open my $ACE, "<$file" or croak "Can't open $file:$OS_ERROR"; | |
73 my $contig_name; | |
74 while( my $line = <$ACE> ) { | |
75 if($line =~ m/^CO\s(.*?)\s\d+\s(\d+)/){ | |
76 $contig_name = $1; | |
77 $count{$contig_name} = $2; | |
78 $reads{$contig_name} = []; | |
79 } | |
80 if($line =~ m/^RD\s(.*?)\s/) { | |
81 push @{$reads{$contig_name}}, $1; | |
82 } | |
83 } | |
84 close($ACE); | |
85 return (\%count, \%reads); | |
86 } | |
87 | |
88 package Mapper; | |
89 use strict; | |
90 use Carp; | |
91 use Data::Dumper; | |
92 use English; | |
93 use Bio::SearchIO; | |
94 use Bio::SeqIO; | |
95 use base qw(SVExtToolsI); | |
96 | |
97 sub new { | |
98 my ($class, @args) = @_; | |
99 my $self = $class->SUPER::new(@args); | |
100 $self->{PRG} = "gfClient sjblat 50000" if(!$self->{PRG}); | |
101 $self->{OPTIONS} = "-out=psl -nohead" if(!$self->{OPTIONS}); | |
102 $self->{MAX_SCORE_DIFF} = 20 if(!$self->{MAX_SCORE_DIFF}); | |
103 $self->{MAX_NUM_HITS} = 10 if(!$self->{MAX_NUM_HITS}); | |
104 return $self; | |
105 } | |
106 | |
107 sub max_score_diff { | |
108 my $self = shift; | |
109 my $value = shift; | |
110 $self->{MAX_SCORE_DIFF} = $value if($value); | |
111 return $self->{MAX_SCORE_DIFF}; | |
112 } | |
113 | |
114 sub max_num_hits { | |
115 my $self = shift; | |
116 my $value = shift; | |
117 $self->{MAX_NUM_HITS} = $value if($value); | |
118 return $self->{MAX_NUM_HITS}; | |
119 } | |
120 | |
121 sub run { | |
122 my $self = shift; | |
123 my %param = @_; | |
124 croak "Missing QUERY parameter for $self->{PRG}" if(!$param{-QUERY}); | |
125 my $output = $param{-OUTPUT} || $param{-QUERY} . ".psl"; | |
126 my $options = $param{-OPTIONS} || $self->{OPTIONS}; | |
127 system(join(" ", ($self->{PRG}, $self->{BIT2_DIR}, $param{-QUERY}, $output, $options))); | |
128 system("sort -k 10g -k 1nr $output -o $output.sorted"); | |
129 open my $SORTED, "<$output.sorted" or croak "can't open $output.sorted:$OS_ERROR"; | |
130 open my $PSL, ">$output" or croak "can't open $output:$OS_ERROR"; | |
131 my $n = 0; | |
132 while( my $line = <$SORTED>) { | |
133 print $PSL $line; | |
134 $n++; | |
135 last if($n > 10); | |
136 } | |
137 close($PSL); | |
138 close($SORTED); | |
139 return $self->select_target($output); | |
140 } | |
141 | |
142 | |
143 sub select_target { | |
144 my $self = shift; | |
145 my $file = shift; | |
146 my %targets; | |
147 my $parser = Bio::SearchIO->new( -file => $file, -format => 'psl'); | |
148 while( my $result = $parser->next_result ) { | |
149 my $qname = $result->query_name; | |
150 $targets{$qname} = []; | |
151 # $result->sort_hits(sub {$Bio::Search::Result::ResultI::b -> matches('id') <=> | |
152 # $Bio::Search::Result::ResultI::a ->matches('id')}); | |
153 my $n_hits = 0; | |
154 my $max_score = 0; | |
155 # my $perfect_only; | |
156 while( my $hit = $result->next_hit ) { | |
157 while( my $hsp = $hit->next_hsp) { | |
158 my ($n_matches) = $hsp->matches; | |
159 last if($max_score - $n_matches > $self->{MAX_SCORE_DIFF}); | |
160 #last if($perfect_only && $hit->query_length != $n_matches); | |
161 my @blocks; | |
162 foreach my $bl (@{$hsp->gap_blocks('hit')}) { | |
163 push @blocks, [$bl->[0], $bl->[0] + $bl->[1]]; | |
164 } | |
165 push @{$targets{$qname}}, { | |
166 tchr => $hit->name, | |
167 tstart => $hsp->start('hit'), | |
168 tend => $hsp->end('hit'), | |
169 qstart => $hsp->start('query'), | |
170 qend => $hsp->end('query'), | |
171 qstrand => $hsp->strand('query') == 1 ? '+' : '-', | |
172 matches => $n_matches, | |
173 blocks => \@blocks, | |
174 perfect => $hit->query_length == $n_matches ? 1 : 0, | |
175 }; | |
176 #$perfect_only = 1 if($hit->query_length == $n_matches); | |
177 $max_score = $n_matches if($max_score < $n_matches); | |
178 $n_hits++; | |
179 } | |
180 $hit->rewind; | |
181 last if($n_hits >= $self->{MAX_NUM_HITS}); | |
182 } | |
183 } | |
184 return \%targets; | |
185 } | |
186 | |
187 package Aligner; | |
188 use strict; | |
189 use Carp; | |
190 use English; | |
191 use Bio::SearchIO; | |
192 use Bio::SeqIO; | |
193 use base qw(SVExtToolsI); | |
194 | |
195 sub new { | |
196 my ($class, @args) = @_; | |
197 my $self = $class->SUPER::new(@args); | |
198 $self->{PRG} = "blat" if(!$self->{PRG}); | |
199 $self->{OPTIONS} = "-tileSize=9 -stepSize=1 -out=psl -nohead -minScore=15" | |
200 if(!$self->{OPTIONS}); | |
201 return $self; | |
202 } | |
203 | |
204 # return best hit for each query reads | |
205 sub run { | |
206 my $self = shift; | |
207 my %param = @_; | |
208 croak "Missing TARGET parameter for $self->{PRG}" if(!$param{-TARGET}); | |
209 croak "Missing QUERY parameter for $self->{PRG}" if(!$param{-QUERY}); | |
210 my $output = $param{-OUTPUT} || $param{-QUERY} . ".psl"; | |
211 system( join(" ", ($self->{PRG}, $param{-TARGET}, $param{-QUERY}, $output, $self->{OPTIONS}))); | |
212 return $output if($param{-FILE}); | |
213 return _find_best_hit($output); | |
214 } | |
215 | |
216 sub _find_best_hit { | |
217 my $file = shift; | |
218 my $parser = Bio::SearchIO->new( -file => $file, -format => 'psl'); | |
219 my %best_hit; | |
220 while( my $result = $parser->next_result ) { | |
221 $result->sort_hits(sub {$Bio::Search::Result::ResultI::b -> matches('id') <=> | |
222 $Bio::Search::Result::ResultI::a ->matches('id')}); | |
223 my $hit = $result->next_hit; # the best hit | |
224 my $hsp = $hit->next_hsp; | |
225 $best_hit{$result->query_name} = $hit; | |
226 $hit->rewind; | |
227 } | |
228 return \%best_hit; | |
229 } | |
230 | |
231 1; |