0
|
1 package SVExtToolsI;
|
|
2
|
|
3 # this package has three major tools that the program is going to use:
|
|
4 # 1. Assembler, cap3 is used for this purpose
|
|
5 # 2. Mapper, blat server version is used and highly recommended
|
|
6 # 3. Aligner, any one will work, but the program need to parse the output
|
|
7 sub new {
|
|
8 my $class = shift;
|
|
9 my %param = @_;
|
|
10 my $self = {};
|
|
11 $self->{PRG} = undef;
|
|
12 $self->{OPTIONS} = undef;
|
|
13 foreach my $k (keys %param) {
|
|
14 my $k1 = $k;
|
|
15 $k1 = substr($k, 1) if($k =~ m/^-/);
|
|
16 $self->{$k1} = $param{$k};
|
|
17 }
|
|
18
|
|
19 bless $self, ref($class) || $class;
|
|
20 return $self;
|
|
21 }
|
|
22
|
|
23 sub program {
|
|
24 my ($self, $value) = @_;
|
|
25 $self->{PRG} = $value if($value);
|
|
26 return $self->{PRG};
|
|
27 }
|
|
28
|
|
29 sub options {
|
|
30 my ($self, $value) = @_;
|
|
31 $self->{OPTIONS} = $value if($value);
|
|
32 return $self->{OPTIONS};
|
|
33 }
|
|
34
|
|
35 sub run {
|
|
36 print STDERR "you need implement your own run method";
|
|
37 }
|
|
38
|
|
39 package Assembler;
|
|
40 use strict;
|
|
41 use Carp;
|
|
42 use English;
|
|
43 use base qw(SVExtToolsI);
|
|
44
|
|
45 sub new {
|
|
46 my ($class, @args) = @_;
|
|
47 my $self = $class->SUPER::new(@args);
|
|
48 $self->{PRG} = "cap3" if(!$self->{PRG});
|
|
49 $self->{OPTIONS} = "" if(!$self->{OPTIONS});
|
|
50 return $self;
|
|
51 }
|
|
52
|
|
53 # the run function of Assembler returns the contig file and
|
|
54 # a hashref with the number of reads in each contig
|
|
55
|
|
56 sub run {
|
|
57 my($self, $file) = @_;
|
|
58 croak "$self->{PRG} need an input fasta file to do the assembly" if(!$file);
|
|
59 if(-s $file == 0) {
|
|
60 print STDERR "$file is of size 0";
|
|
61 return;
|
|
62 }
|
|
63 system(join(" ", ($self->{PRG}, $file, $self->{OPTIONS})));
|
|
64 my( $r_count, $r_reads ) = _count_reads("$file.cap.ace");
|
|
65 return ("$file.cap.contigs", $r_count, $r_reads, "$file.cap.singlets");
|
|
66 }
|
|
67
|
|
68 sub _count_reads {
|
|
69 my $file = shift;
|
|
70 my %count;
|
|
71 my %reads;
|
|
72 open my $ACE, "<$file" or croak "Can't open $file:$OS_ERROR";
|
|
73 my $contig_name;
|
|
74 while( my $line = <$ACE> ) {
|
|
75 if($line =~ m/^CO\s(.*?)\s\d+\s(\d+)/){
|
|
76 $contig_name = $1;
|
|
77 $count{$contig_name} = $2;
|
|
78 $reads{$contig_name} = [];
|
|
79 }
|
|
80 if($line =~ m/^RD\s(.*?)\s/) {
|
|
81 push @{$reads{$contig_name}}, $1;
|
|
82 }
|
|
83 }
|
|
84 close($ACE);
|
|
85 return (\%count, \%reads);
|
|
86 }
|
|
87
|
|
88 package Mapper;
|
|
89 use strict;
|
|
90 use Carp;
|
|
91 use Data::Dumper;
|
|
92 use English;
|
|
93 use Bio::SearchIO;
|
|
94 use Bio::SeqIO;
|
|
95 use base qw(SVExtToolsI);
|
|
96
|
|
97 sub new {
|
|
98 my ($class, @args) = @_;
|
|
99 my $self = $class->SUPER::new(@args);
|
|
100 $self->{PRG} = "gfClient sjblat 50000" if(!$self->{PRG});
|
|
101 $self->{OPTIONS} = "-out=psl -nohead" if(!$self->{OPTIONS});
|
|
102 $self->{MAX_SCORE_DIFF} = 20 if(!$self->{MAX_SCORE_DIFF});
|
|
103 $self->{MAX_NUM_HITS} = 10 if(!$self->{MAX_NUM_HITS});
|
|
104 return $self;
|
|
105 }
|
|
106
|
|
107 sub max_score_diff {
|
|
108 my $self = shift;
|
|
109 my $value = shift;
|
|
110 $self->{MAX_SCORE_DIFF} = $value if($value);
|
|
111 return $self->{MAX_SCORE_DIFF};
|
|
112 }
|
|
113
|
|
114 sub max_num_hits {
|
|
115 my $self = shift;
|
|
116 my $value = shift;
|
|
117 $self->{MAX_NUM_HITS} = $value if($value);
|
|
118 return $self->{MAX_NUM_HITS};
|
|
119 }
|
|
120
|
|
121 sub run {
|
|
122 my $self = shift;
|
|
123 my %param = @_;
|
|
124 croak "Missing QUERY parameter for $self->{PRG}" if(!$param{-QUERY});
|
|
125 my $output = $param{-OUTPUT} || $param{-QUERY} . ".psl";
|
|
126 my $options = $param{-OPTIONS} || $self->{OPTIONS};
|
|
127 system(join(" ", ($self->{PRG}, $self->{BIT2_DIR}, $param{-QUERY}, $output, $options)));
|
|
128 system("sort -k 10g -k 1nr $output -o $output.sorted");
|
|
129 open my $SORTED, "<$output.sorted" or croak "can't open $output.sorted:$OS_ERROR";
|
|
130 open my $PSL, ">$output" or croak "can't open $output:$OS_ERROR";
|
|
131 my $n = 0;
|
|
132 while( my $line = <$SORTED>) {
|
|
133 print $PSL $line;
|
|
134 $n++;
|
|
135 last if($n > 10);
|
|
136 }
|
|
137 close($PSL);
|
|
138 close($SORTED);
|
|
139 return $self->select_target($output);
|
|
140 }
|
|
141
|
|
142
|
|
143 sub select_target {
|
|
144 my $self = shift;
|
|
145 my $file = shift;
|
|
146 my %targets;
|
|
147 my $parser = Bio::SearchIO->new( -file => $file, -format => 'psl');
|
|
148 while( my $result = $parser->next_result ) {
|
|
149 my $qname = $result->query_name;
|
|
150 $targets{$qname} = [];
|
|
151 # $result->sort_hits(sub {$Bio::Search::Result::ResultI::b -> matches('id') <=>
|
|
152 # $Bio::Search::Result::ResultI::a ->matches('id')});
|
|
153 my $n_hits = 0;
|
|
154 my $max_score = 0;
|
|
155 # my $perfect_only;
|
|
156 while( my $hit = $result->next_hit ) {
|
|
157 while( my $hsp = $hit->next_hsp) {
|
|
158 my ($n_matches) = $hsp->matches;
|
|
159 last if($max_score - $n_matches > $self->{MAX_SCORE_DIFF});
|
|
160 #last if($perfect_only && $hit->query_length != $n_matches);
|
|
161 my @blocks;
|
|
162 foreach my $bl (@{$hsp->gap_blocks('hit')}) {
|
|
163 push @blocks, [$bl->[0], $bl->[0] + $bl->[1]];
|
|
164 }
|
|
165 push @{$targets{$qname}}, {
|
|
166 tchr => $hit->name,
|
|
167 tstart => $hsp->start('hit'),
|
|
168 tend => $hsp->end('hit'),
|
|
169 qstart => $hsp->start('query'),
|
|
170 qend => $hsp->end('query'),
|
|
171 qstrand => $hsp->strand('query') == 1 ? '+' : '-',
|
|
172 matches => $n_matches,
|
|
173 blocks => \@blocks,
|
|
174 perfect => $hit->query_length == $n_matches ? 1 : 0,
|
|
175 };
|
|
176 #$perfect_only = 1 if($hit->query_length == $n_matches);
|
|
177 $max_score = $n_matches if($max_score < $n_matches);
|
|
178 $n_hits++;
|
|
179 }
|
|
180 $hit->rewind;
|
|
181 last if($n_hits >= $self->{MAX_NUM_HITS});
|
|
182 }
|
|
183 }
|
|
184 return \%targets;
|
|
185 }
|
|
186
|
|
187 package Aligner;
|
|
188 use strict;
|
|
189 use Carp;
|
|
190 use English;
|
|
191 use Bio::SearchIO;
|
|
192 use Bio::SeqIO;
|
|
193 use base qw(SVExtToolsI);
|
|
194
|
|
195 sub new {
|
|
196 my ($class, @args) = @_;
|
|
197 my $self = $class->SUPER::new(@args);
|
|
198 $self->{PRG} = "blat" if(!$self->{PRG});
|
|
199 $self->{OPTIONS} = "-tileSize=9 -stepSize=1 -out=psl -nohead -minScore=15"
|
|
200 if(!$self->{OPTIONS});
|
|
201 return $self;
|
|
202 }
|
|
203
|
|
204 # return best hit for each query reads
|
|
205 sub run {
|
|
206 my $self = shift;
|
|
207 my %param = @_;
|
|
208 croak "Missing TARGET parameter for $self->{PRG}" if(!$param{-TARGET});
|
|
209 croak "Missing QUERY parameter for $self->{PRG}" if(!$param{-QUERY});
|
|
210 my $output = $param{-OUTPUT} || $param{-QUERY} . ".psl";
|
|
211 system( join(" ", ($self->{PRG}, $param{-TARGET}, $param{-QUERY}, $output, $self->{OPTIONS})));
|
|
212 return $output if($param{-FILE});
|
|
213 return _find_best_hit($output);
|
|
214 }
|
|
215
|
|
216 sub _find_best_hit {
|
|
217 my $file = shift;
|
|
218 my $parser = Bio::SearchIO->new( -file => $file, -format => 'psl');
|
|
219 my %best_hit;
|
|
220 while( my $result = $parser->next_result ) {
|
|
221 $result->sort_hits(sub {$Bio::Search::Result::ResultI::b -> matches('id') <=>
|
|
222 $Bio::Search::Result::ResultI::a ->matches('id')});
|
|
223 my $hit = $result->next_hit; # the best hit
|
|
224 my $hsp = $hit->next_hsp;
|
|
225 $best_hit{$result->query_name} = $hit;
|
|
226 $hit->rewind;
|
|
227 }
|
|
228 return \%best_hit;
|
|
229 }
|
|
230
|
|
231 1;
|