0
|
1 package Bio::Roary::SplitGroups;
|
|
2
|
|
3 # ABSTRACT: split groups
|
|
4
|
|
5 =head1 SYNOPSIS
|
|
6
|
|
7 use Bio::Roary::SplitGroups;
|
|
8
|
|
9 =cut
|
|
10
|
|
11 use Moose;
|
|
12 use Bio::Roary::AnalyseGroups;
|
|
13 use File::Path qw(make_path remove_tree);
|
|
14 use File::Copy qw(move);
|
|
15 use File::Temp;
|
|
16 use File::Basename;
|
|
17 use File::Slurper 'read_lines';
|
|
18 use Cwd;
|
|
19
|
|
20
|
|
21 has 'groupfile' => ( is => 'ro', isa => 'Str', required => 1 );
|
|
22 has 'fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
|
|
23 has 'outfile' => ( is => 'ro', isa => 'Str', required => 1 );
|
|
24 has 'iterations' => ( is => 'ro', isa => 'Int', default => 5 );
|
|
25 has 'dont_delete' => ( is => 'ro', isa => 'Bool', default => 0 );
|
|
26
|
|
27 has '_neighbourhood_size' => ( is => 'ro', isa => 'Int', default => 5 );
|
|
28
|
|
29 has '_group_filelist' => ( is => 'rw', isa => 'ArrayRef', lazy_build => 1 );
|
|
30 has '_tmp_dir_object' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
|
|
31 has '_tmp_dir' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__tmp_dir' );
|
|
32
|
|
33 has '_analyse_groups_obj' => ( is => 'ro', lazy_build => 1 );
|
|
34 has '_genes_to_files' => ( is => 'ro', lazy_build => 1 );
|
|
35 has '_genes_to_groups' => ( is => 'rw', isa => 'HashRef' );
|
|
36
|
|
37 has '_first_gene_of_group_which_doesnt_have_paralogs' => ( is => 'rw', isa => 'HashRef', default => sub {{}} );
|
|
38
|
|
39 has '_genes_to_neighbourhood' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build__genes_to_neighbourhood' );
|
|
40
|
|
41
|
|
42 has '_gene_files_temp_dir_obj' =>
|
|
43 ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
|
|
44
|
|
45
|
|
46 has '_do_sorting' => ( is => 'rw', isa => 'Bool', default => 0 ); # set to 1 for testing only
|
|
47
|
|
48 sub _build__tmp_dir {
|
|
49 my ($self) = @_;
|
|
50 return $self->_tmp_dir_object->dirname();
|
|
51 }
|
|
52
|
|
53 sub _build__analyse_groups_obj {
|
|
54 my ( $self ) = @_;
|
|
55
|
|
56 return Bio::Roary::AnalyseGroups->new(
|
|
57 fasta_files => $self->fasta_files,
|
|
58 groups_filename => $self->groupfile
|
|
59 );
|
|
60 }
|
|
61
|
|
62 sub _build__genes_to_files {
|
|
63 my ( $self ) = @_;
|
|
64 return $self->_analyse_groups_obj->_genes_to_file;
|
|
65 }
|
|
66
|
|
67 sub _build__group_filelist {
|
|
68 my ( $self ) = @_;
|
|
69 my $tmp = $self->_tmp_dir;
|
|
70
|
|
71 my @filelist = ( $self->groupfile );
|
|
72 for my $i ( 1..($self->iterations - 1) ){
|
|
73 push( @filelist, "$tmp/group_$i" );
|
|
74 }
|
|
75 push( @filelist, $self->outfile );
|
|
76
|
|
77 return \@filelist;
|
|
78 }
|
|
79
|
|
80 sub _build__genes_to_neighbourhood
|
|
81 {
|
|
82 my ( $self ) = @_;
|
|
83 my %genes_to_neighbourhood;
|
|
84 for my $fasta_file( @{$self->fasta_files})
|
|
85 {
|
|
86 my ( $filename, $directories, $suffix ) = fileparse( $fasta_file, qr/\.[^.]*/ );
|
|
87 system('grep \> '.$fasta_file.'| sed \'s/>//\' >'.$self->_gene_files_temp_dir_obj."/".$filename.$suffix ) ;
|
|
88
|
|
89 my @genes = read_lines($self->_gene_files_temp_dir_obj."/".$filename.$suffix );
|
|
90
|
|
91 for(my $i =0; $i< @genes; $i++)
|
|
92 {
|
|
93 for(my $offset = 1; $offset <= $self->_neighbourhood_size; $offset++)
|
|
94 {
|
|
95 if($i -$offset >= 0)
|
|
96 {
|
|
97 push(@{$genes_to_neighbourhood{$genes[$i]}}, $genes[$i - $offset ]);
|
|
98 }
|
|
99 if($i +$offset <@genes)
|
|
100 {
|
|
101 push(@{$genes_to_neighbourhood{$genes[$i]}}, $genes[$i + $offset ]);
|
|
102 }
|
|
103 }
|
|
104 }
|
|
105 }
|
|
106 return \%genes_to_neighbourhood;
|
|
107 }
|
|
108
|
|
109 sub split_groups {
|
|
110 my ( $self ) = @_;
|
|
111
|
|
112 # iteratively
|
|
113 for my $x ( 0..($self->iterations - 1) ){
|
|
114 my ( $in_groups, $out_groups ) = $self->_get_files_for_iteration( $x );
|
|
115
|
|
116 # read in groups, check paralogs and split
|
|
117 my @newgroups;
|
|
118 my $any_paralogs = 0;
|
|
119 $self->_set_genes_to_groups( $in_groups );
|
|
120 open( my $group_handle, '<', $in_groups );
|
|
121 while( my $line = <$group_handle> ){
|
|
122 my @group = split( /\s+/, $line );
|
|
123
|
|
124 if($self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]})
|
|
125 {
|
|
126 push( @newgroups, \@group );
|
|
127 }
|
|
128 elsif(@group == 1)
|
|
129 {
|
|
130 $self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]}++;
|
|
131 push( @newgroups, \@group );
|
|
132 }
|
|
133 elsif( $self->_contains_paralogs( \@group ) ){
|
|
134 my @true_orthologs = @{ $self->_true_orthologs( \@group ) };
|
|
135 push( @newgroups, @true_orthologs);
|
|
136 $any_paralogs = 1;
|
|
137 }
|
|
138 else {
|
|
139 $self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]}++;
|
|
140 push( @newgroups, \@group );
|
|
141 }
|
|
142 }
|
|
143 close( $group_handle );
|
|
144
|
|
145 # check if next iteration required, move output if not
|
|
146 unless ($any_paralogs){
|
|
147 move $in_groups, $self->outfile; # input file will be the same as new output file if no splitting has been performed
|
|
148 last;
|
|
149 }
|
|
150
|
|
151 # write split groups to file
|
|
152 open( my $outfile_handle, '>', $out_groups );
|
|
153 for my $g ( @newgroups ) {
|
|
154 my $group_str = join( "\t", @{ $g } ) . "\n";
|
|
155 print $outfile_handle $group_str;
|
|
156 }
|
|
157 close( $outfile_handle );
|
|
158 }
|
|
159 }
|
|
160
|
|
161 sub _set_genes_to_groups {
|
|
162 my ( $self, $groupfile ) = @_;
|
|
163
|
|
164 my %genes2groups;
|
|
165 my $c = 0;
|
|
166 open( my $gfh, '<', $groupfile );
|
|
167 while( my $line = <$gfh> ){
|
|
168 chomp $line;
|
|
169 my @genes = split( /\s+/, $line );
|
|
170 for my $g ( @genes ){
|
|
171 $genes2groups{$g} = $c;
|
|
172 }
|
|
173 $c++;
|
|
174 }
|
|
175 close($gfh);
|
|
176 $self->_genes_to_groups( \%genes2groups );
|
|
177 }
|
|
178
|
|
179 sub _update_genes_to_groups {
|
|
180 my ( $self, $groups ) = @_;
|
|
181
|
|
182 my %genes2groups = %{ $self->_genes_to_groups };
|
|
183 my $c = 1;
|
|
184 for my $g ( @{ $groups } ){
|
|
185 for my $h ( @{ $g } ){
|
|
186 $genes2groups{$h} .= ".$c";
|
|
187 }
|
|
188 $c++;
|
|
189 }
|
|
190
|
|
191 $self->_genes_to_groups( \%genes2groups );
|
|
192 }
|
|
193
|
|
194 sub _get_files_for_iteration {
|
|
195 my ( $self, $n ) = @_;
|
|
196 my @filelist = @{ $self->_group_filelist };
|
|
197 return ( $filelist[$n], $filelist[$n+1] );
|
|
198 }
|
|
199
|
|
200 sub _contains_paralogs {
|
|
201 my ( $self, $group ) = @_;
|
|
202
|
|
203 return 1 if defined $self->_find_paralogs( $group );
|
|
204 return 0;
|
|
205 }
|
|
206
|
|
207 sub _find_paralogs {
|
|
208 my ( $self, $group ) = @_;
|
|
209
|
|
210 my %occ;
|
|
211 for my $gene ( @{ $group } ){
|
|
212 my $gene_file = $self->_genes_to_files->{ $gene };
|
|
213 push( @{ $occ{$gene_file} }, $gene );
|
|
214 }
|
|
215
|
|
216 # pick the smallest number of paralogs
|
|
217 my $smallest_number = 1000000;
|
|
218 my $smallest_group;
|
|
219 for my $v ( values %occ ){
|
|
220 my $v_len = scalar( @{$v} );
|
|
221 if ( $v_len < $smallest_number && $v_len > 1 ){
|
|
222 $smallest_number = $v_len;
|
|
223 $smallest_group = $v;
|
|
224 }
|
|
225 }
|
|
226 return $smallest_group if ( defined $smallest_group );
|
|
227
|
|
228 return undef;
|
|
229 }
|
|
230
|
|
231 sub _true_orthologs {
|
|
232 my ( $self, $group ) = @_;
|
|
233
|
|
234 # first, create CGN hash for group
|
|
235 my %cgns;
|
|
236 for my $g ( @{ $group } ){
|
|
237 $cgns{$g} = $self->_parse_gene_neighbourhood( $g );
|
|
238 }
|
|
239
|
|
240 # finding paralogs in the group
|
|
241 my @paralogs = @{ $self->_find_paralogs( $group ) };
|
|
242 my @paralog_cgns_groups;
|
|
243 for my $p ( @paralogs ){
|
|
244 my %paralog_groups ;
|
|
245 for my $paralog_gene (@{$cgns{$p}})
|
|
246 {
|
|
247 my $gene_paralog_group = $self->_genes_to_groups->{$paralog_gene};
|
|
248 next unless( defined($gene_paralog_group));
|
|
249 $paralog_groups{$self->_genes_to_groups->{$paralog_gene}}++;
|
|
250 }
|
|
251 push( @paralog_cgns_groups, \%paralog_groups );
|
|
252 }
|
|
253
|
|
254 # create data structure to hold new groups
|
|
255 my @new_groups;
|
|
256 for my $p ( @paralogs ){
|
|
257 push( @new_groups, [ $p ] );
|
|
258 }
|
|
259 push( @new_groups, [] ); # extra "leftovers" array to gather genes that don't share CGN with anything
|
|
260
|
|
261 # cluster other members of the group to their closest match
|
|
262 for my $g ( @{ $group } ){
|
|
263 next if ( grep {$_ eq $g} @paralogs );
|
|
264 my $closest = $self->_closest_cgn( $cgns{$g}, \@paralog_cgns_groups );
|
|
265 push( @{ $new_groups[$closest] }, $g );
|
|
266 }
|
|
267
|
|
268 # check for "leftovers", remove if absent
|
|
269 my $last = pop @new_groups;
|
|
270 push( @new_groups, $last ) if ( @$last > 0 );
|
|
271
|
|
272 # sort
|
|
273 if ( $self->_do_sorting ){
|
|
274 my @sorted_new_groups;
|
|
275 for my $gr ( @new_groups ){
|
|
276 my @s_gr = sort @{ $gr };
|
|
277 push( @sorted_new_groups, \@s_gr );
|
|
278 }
|
|
279 return \@sorted_new_groups;
|
|
280 }
|
|
281
|
|
282 return \@new_groups;
|
|
283 }
|
|
284
|
|
285 sub _closest_cgn {
|
|
286 my ( $self, $cgn, $p_cgns ) = @_;
|
|
287
|
|
288 my @paralog_cgns = @{ $p_cgns };
|
|
289 my $best_score = 0;
|
|
290 my $bs_index = -1; # return -1 to add to "leftovers" array if no better score is found
|
|
291 for my $i ( 0..$#paralog_cgns ){
|
|
292 my $p_cgn = $paralog_cgns[$i];
|
|
293 my $score = $self->_shared_cgn_score( $cgn, $p_cgn );
|
|
294 if ( $score > $best_score ){
|
|
295 $best_score = $score;
|
|
296 $bs_index = $i;
|
|
297 }
|
|
298 }
|
|
299 return $bs_index;
|
|
300 }
|
|
301
|
|
302 sub _shared_cgn_score {
|
|
303 my ( $self, $cgn1, $cgn2 ) = @_;
|
|
304
|
|
305 my $total_shared = 0;
|
|
306 for my $i ( @{ $cgn1 } ){
|
|
307 my $input_group = $self->_genes_to_groups->{$i};
|
|
308 next unless(defined($input_group));
|
|
309 $total_shared++ if($cgn2->{$input_group});
|
|
310 }
|
|
311 if( (scalar @{ $cgn1 }) == 0)
|
|
312 {
|
|
313 return 0;
|
|
314 }
|
|
315 my $score = $total_shared/scalar @{ $cgn1 };
|
|
316 return $score;
|
|
317 }
|
|
318
|
|
319 sub _parse_gene_neighbourhood {
|
|
320 my ( $self, $gene_id ) = @_;
|
|
321
|
|
322 return $self->_genes_to_neighbourhood->{$gene_id };
|
|
323
|
|
324 }
|
|
325
|
|
326 no Moose;
|
|
327 __PACKAGE__->meta->make_immutable;
|
|
328 1;
|