Mercurial > repos > dereeper > roary_plots
comparison Roary/lib/Bio/Roary/SplitGroups.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c47a5f61bc9f |
---|---|
1 package Bio::Roary::SplitGroups; | |
2 | |
3 # ABSTRACT: split groups | |
4 | |
5 =head1 SYNOPSIS | |
6 | |
7 use Bio::Roary::SplitGroups; | |
8 | |
9 =cut | |
10 | |
11 use Moose; | |
12 use Bio::Roary::AnalyseGroups; | |
13 use File::Path qw(make_path remove_tree); | |
14 use File::Copy qw(move); | |
15 use File::Temp; | |
16 use File::Basename; | |
17 use File::Slurper 'read_lines'; | |
18 use Cwd; | |
19 | |
20 | |
21 has 'groupfile' => ( is => 'ro', isa => 'Str', required => 1 ); | |
22 has 'fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); | |
23 has 'outfile' => ( is => 'ro', isa => 'Str', required => 1 ); | |
24 has 'iterations' => ( is => 'ro', isa => 'Int', default => 5 ); | |
25 has 'dont_delete' => ( is => 'ro', isa => 'Bool', default => 0 ); | |
26 | |
27 has '_neighbourhood_size' => ( is => 'ro', isa => 'Int', default => 5 ); | |
28 | |
29 has '_group_filelist' => ( is => 'rw', isa => 'ArrayRef', lazy_build => 1 ); | |
30 has '_tmp_dir_object' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } ); | |
31 has '_tmp_dir' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__tmp_dir' ); | |
32 | |
33 has '_analyse_groups_obj' => ( is => 'ro', lazy_build => 1 ); | |
34 has '_genes_to_files' => ( is => 'ro', lazy_build => 1 ); | |
35 has '_genes_to_groups' => ( is => 'rw', isa => 'HashRef' ); | |
36 | |
37 has '_first_gene_of_group_which_doesnt_have_paralogs' => ( is => 'rw', isa => 'HashRef', default => sub {{}} ); | |
38 | |
39 has '_genes_to_neighbourhood' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build__genes_to_neighbourhood' ); | |
40 | |
41 | |
42 has '_gene_files_temp_dir_obj' => | |
43 ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } ); | |
44 | |
45 | |
46 has '_do_sorting' => ( is => 'rw', isa => 'Bool', default => 0 ); # set to 1 for testing only | |
47 | |
48 sub _build__tmp_dir { | |
49 my ($self) = @_; | |
50 return $self->_tmp_dir_object->dirname(); | |
51 } | |
52 | |
53 sub _build__analyse_groups_obj { | |
54 my ( $self ) = @_; | |
55 | |
56 return Bio::Roary::AnalyseGroups->new( | |
57 fasta_files => $self->fasta_files, | |
58 groups_filename => $self->groupfile | |
59 ); | |
60 } | |
61 | |
62 sub _build__genes_to_files { | |
63 my ( $self ) = @_; | |
64 return $self->_analyse_groups_obj->_genes_to_file; | |
65 } | |
66 | |
67 sub _build__group_filelist { | |
68 my ( $self ) = @_; | |
69 my $tmp = $self->_tmp_dir; | |
70 | |
71 my @filelist = ( $self->groupfile ); | |
72 for my $i ( 1..($self->iterations - 1) ){ | |
73 push( @filelist, "$tmp/group_$i" ); | |
74 } | |
75 push( @filelist, $self->outfile ); | |
76 | |
77 return \@filelist; | |
78 } | |
79 | |
80 sub _build__genes_to_neighbourhood | |
81 { | |
82 my ( $self ) = @_; | |
83 my %genes_to_neighbourhood; | |
84 for my $fasta_file( @{$self->fasta_files}) | |
85 { | |
86 my ( $filename, $directories, $suffix ) = fileparse( $fasta_file, qr/\.[^.]*/ ); | |
87 system('grep \> '.$fasta_file.'| sed \'s/>//\' >'.$self->_gene_files_temp_dir_obj."/".$filename.$suffix ) ; | |
88 | |
89 my @genes = read_lines($self->_gene_files_temp_dir_obj."/".$filename.$suffix ); | |
90 | |
91 for(my $i =0; $i< @genes; $i++) | |
92 { | |
93 for(my $offset = 1; $offset <= $self->_neighbourhood_size; $offset++) | |
94 { | |
95 if($i -$offset >= 0) | |
96 { | |
97 push(@{$genes_to_neighbourhood{$genes[$i]}}, $genes[$i - $offset ]); | |
98 } | |
99 if($i +$offset <@genes) | |
100 { | |
101 push(@{$genes_to_neighbourhood{$genes[$i]}}, $genes[$i + $offset ]); | |
102 } | |
103 } | |
104 } | |
105 } | |
106 return \%genes_to_neighbourhood; | |
107 } | |
108 | |
109 sub split_groups { | |
110 my ( $self ) = @_; | |
111 | |
112 # iteratively | |
113 for my $x ( 0..($self->iterations - 1) ){ | |
114 my ( $in_groups, $out_groups ) = $self->_get_files_for_iteration( $x ); | |
115 | |
116 # read in groups, check paralogs and split | |
117 my @newgroups; | |
118 my $any_paralogs = 0; | |
119 $self->_set_genes_to_groups( $in_groups ); | |
120 open( my $group_handle, '<', $in_groups ); | |
121 while( my $line = <$group_handle> ){ | |
122 my @group = split( /\s+/, $line ); | |
123 | |
124 if($self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]}) | |
125 { | |
126 push( @newgroups, \@group ); | |
127 } | |
128 elsif(@group == 1) | |
129 { | |
130 $self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]}++; | |
131 push( @newgroups, \@group ); | |
132 } | |
133 elsif( $self->_contains_paralogs( \@group ) ){ | |
134 my @true_orthologs = @{ $self->_true_orthologs( \@group ) }; | |
135 push( @newgroups, @true_orthologs); | |
136 $any_paralogs = 1; | |
137 } | |
138 else { | |
139 $self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]}++; | |
140 push( @newgroups, \@group ); | |
141 } | |
142 } | |
143 close( $group_handle ); | |
144 | |
145 # check if next iteration required, move output if not | |
146 unless ($any_paralogs){ | |
147 move $in_groups, $self->outfile; # input file will be the same as new output file if no splitting has been performed | |
148 last; | |
149 } | |
150 | |
151 # write split groups to file | |
152 open( my $outfile_handle, '>', $out_groups ); | |
153 for my $g ( @newgroups ) { | |
154 my $group_str = join( "\t", @{ $g } ) . "\n"; | |
155 print $outfile_handle $group_str; | |
156 } | |
157 close( $outfile_handle ); | |
158 } | |
159 } | |
160 | |
161 sub _set_genes_to_groups { | |
162 my ( $self, $groupfile ) = @_; | |
163 | |
164 my %genes2groups; | |
165 my $c = 0; | |
166 open( my $gfh, '<', $groupfile ); | |
167 while( my $line = <$gfh> ){ | |
168 chomp $line; | |
169 my @genes = split( /\s+/, $line ); | |
170 for my $g ( @genes ){ | |
171 $genes2groups{$g} = $c; | |
172 } | |
173 $c++; | |
174 } | |
175 close($gfh); | |
176 $self->_genes_to_groups( \%genes2groups ); | |
177 } | |
178 | |
179 sub _update_genes_to_groups { | |
180 my ( $self, $groups ) = @_; | |
181 | |
182 my %genes2groups = %{ $self->_genes_to_groups }; | |
183 my $c = 1; | |
184 for my $g ( @{ $groups } ){ | |
185 for my $h ( @{ $g } ){ | |
186 $genes2groups{$h} .= ".$c"; | |
187 } | |
188 $c++; | |
189 } | |
190 | |
191 $self->_genes_to_groups( \%genes2groups ); | |
192 } | |
193 | |
194 sub _get_files_for_iteration { | |
195 my ( $self, $n ) = @_; | |
196 my @filelist = @{ $self->_group_filelist }; | |
197 return ( $filelist[$n], $filelist[$n+1] ); | |
198 } | |
199 | |
200 sub _contains_paralogs { | |
201 my ( $self, $group ) = @_; | |
202 | |
203 return 1 if defined $self->_find_paralogs( $group ); | |
204 return 0; | |
205 } | |
206 | |
207 sub _find_paralogs { | |
208 my ( $self, $group ) = @_; | |
209 | |
210 my %occ; | |
211 for my $gene ( @{ $group } ){ | |
212 my $gene_file = $self->_genes_to_files->{ $gene }; | |
213 push( @{ $occ{$gene_file} }, $gene ); | |
214 } | |
215 | |
216 # pick the smallest number of paralogs | |
217 my $smallest_number = 1000000; | |
218 my $smallest_group; | |
219 for my $v ( values %occ ){ | |
220 my $v_len = scalar( @{$v} ); | |
221 if ( $v_len < $smallest_number && $v_len > 1 ){ | |
222 $smallest_number = $v_len; | |
223 $smallest_group = $v; | |
224 } | |
225 } | |
226 return $smallest_group if ( defined $smallest_group ); | |
227 | |
228 return undef; | |
229 } | |
230 | |
231 sub _true_orthologs { | |
232 my ( $self, $group ) = @_; | |
233 | |
234 # first, create CGN hash for group | |
235 my %cgns; | |
236 for my $g ( @{ $group } ){ | |
237 $cgns{$g} = $self->_parse_gene_neighbourhood( $g ); | |
238 } | |
239 | |
240 # finding paralogs in the group | |
241 my @paralogs = @{ $self->_find_paralogs( $group ) }; | |
242 my @paralog_cgns_groups; | |
243 for my $p ( @paralogs ){ | |
244 my %paralog_groups ; | |
245 for my $paralog_gene (@{$cgns{$p}}) | |
246 { | |
247 my $gene_paralog_group = $self->_genes_to_groups->{$paralog_gene}; | |
248 next unless( defined($gene_paralog_group)); | |
249 $paralog_groups{$self->_genes_to_groups->{$paralog_gene}}++; | |
250 } | |
251 push( @paralog_cgns_groups, \%paralog_groups ); | |
252 } | |
253 | |
254 # create data structure to hold new groups | |
255 my @new_groups; | |
256 for my $p ( @paralogs ){ | |
257 push( @new_groups, [ $p ] ); | |
258 } | |
259 push( @new_groups, [] ); # extra "leftovers" array to gather genes that don't share CGN with anything | |
260 | |
261 # cluster other members of the group to their closest match | |
262 for my $g ( @{ $group } ){ | |
263 next if ( grep {$_ eq $g} @paralogs ); | |
264 my $closest = $self->_closest_cgn( $cgns{$g}, \@paralog_cgns_groups ); | |
265 push( @{ $new_groups[$closest] }, $g ); | |
266 } | |
267 | |
268 # check for "leftovers", remove if absent | |
269 my $last = pop @new_groups; | |
270 push( @new_groups, $last ) if ( @$last > 0 ); | |
271 | |
272 # sort | |
273 if ( $self->_do_sorting ){ | |
274 my @sorted_new_groups; | |
275 for my $gr ( @new_groups ){ | |
276 my @s_gr = sort @{ $gr }; | |
277 push( @sorted_new_groups, \@s_gr ); | |
278 } | |
279 return \@sorted_new_groups; | |
280 } | |
281 | |
282 return \@new_groups; | |
283 } | |
284 | |
285 sub _closest_cgn { | |
286 my ( $self, $cgn, $p_cgns ) = @_; | |
287 | |
288 my @paralog_cgns = @{ $p_cgns }; | |
289 my $best_score = 0; | |
290 my $bs_index = -1; # return -1 to add to "leftovers" array if no better score is found | |
291 for my $i ( 0..$#paralog_cgns ){ | |
292 my $p_cgn = $paralog_cgns[$i]; | |
293 my $score = $self->_shared_cgn_score( $cgn, $p_cgn ); | |
294 if ( $score > $best_score ){ | |
295 $best_score = $score; | |
296 $bs_index = $i; | |
297 } | |
298 } | |
299 return $bs_index; | |
300 } | |
301 | |
302 sub _shared_cgn_score { | |
303 my ( $self, $cgn1, $cgn2 ) = @_; | |
304 | |
305 my $total_shared = 0; | |
306 for my $i ( @{ $cgn1 } ){ | |
307 my $input_group = $self->_genes_to_groups->{$i}; | |
308 next unless(defined($input_group)); | |
309 $total_shared++ if($cgn2->{$input_group}); | |
310 } | |
311 if( (scalar @{ $cgn1 }) == 0) | |
312 { | |
313 return 0; | |
314 } | |
315 my $score = $total_shared/scalar @{ $cgn1 }; | |
316 return $score; | |
317 } | |
318 | |
319 sub _parse_gene_neighbourhood { | |
320 my ( $self, $gene_id ) = @_; | |
321 | |
322 return $self->_genes_to_neighbourhood->{$gene_id }; | |
323 | |
324 } | |
325 | |
326 no Moose; | |
327 __PACKAGE__->meta->make_immutable; | |
328 1; |