annotate Roary/lib/Bio/Roary/SplitGroups.pm @ 0:c47a5f61bc9f draft

Uploaded
author dereeper
date Fri, 14 May 2021 20:27:06 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
1 package Bio::Roary::SplitGroups;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
2
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
3 # ABSTRACT: split groups
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
4
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
5 =head1 SYNOPSIS
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
6
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
7 use Bio::Roary::SplitGroups;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
8
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
9 =cut
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
10
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
11 use Moose;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
12 use Bio::Roary::AnalyseGroups;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
13 use File::Path qw(make_path remove_tree);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
14 use File::Copy qw(move);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
15 use File::Temp;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
16 use File::Basename;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
17 use File::Slurper 'read_lines';
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
18 use Cwd;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
19
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
20
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
21 has 'groupfile' => ( is => 'ro', isa => 'Str', required => 1 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
22 has 'fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
23 has 'outfile' => ( is => 'ro', isa => 'Str', required => 1 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
24 has 'iterations' => ( is => 'ro', isa => 'Int', default => 5 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
25 has 'dont_delete' => ( is => 'ro', isa => 'Bool', default => 0 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
26
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
27 has '_neighbourhood_size' => ( is => 'ro', isa => 'Int', default => 5 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
28
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
29 has '_group_filelist' => ( is => 'rw', isa => 'ArrayRef', lazy_build => 1 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
30 has '_tmp_dir_object' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
31 has '_tmp_dir' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__tmp_dir' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
32
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
33 has '_analyse_groups_obj' => ( is => 'ro', lazy_build => 1 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
34 has '_genes_to_files' => ( is => 'ro', lazy_build => 1 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
35 has '_genes_to_groups' => ( is => 'rw', isa => 'HashRef' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
36
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
37 has '_first_gene_of_group_which_doesnt_have_paralogs' => ( is => 'rw', isa => 'HashRef', default => sub {{}} );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
38
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
39 has '_genes_to_neighbourhood' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build__genes_to_neighbourhood' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
40
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
41
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
42 has '_gene_files_temp_dir_obj' =>
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
43 ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
44
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
45
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
46 has '_do_sorting' => ( is => 'rw', isa => 'Bool', default => 0 ); # set to 1 for testing only
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
47
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
48 sub _build__tmp_dir {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
49 my ($self) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
50 return $self->_tmp_dir_object->dirname();
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
51 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
52
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
53 sub _build__analyse_groups_obj {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
54 my ( $self ) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
55
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
56 return Bio::Roary::AnalyseGroups->new(
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
57 fasta_files => $self->fasta_files,
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
58 groups_filename => $self->groupfile
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
59 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
60 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
61
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
62 sub _build__genes_to_files {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
63 my ( $self ) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
64 return $self->_analyse_groups_obj->_genes_to_file;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
65 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
66
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
67 sub _build__group_filelist {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
68 my ( $self ) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
69 my $tmp = $self->_tmp_dir;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
70
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
71 my @filelist = ( $self->groupfile );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
72 for my $i ( 1..($self->iterations - 1) ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
73 push( @filelist, "$tmp/group_$i" );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
74 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
75 push( @filelist, $self->outfile );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
76
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
77 return \@filelist;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
78 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
79
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
80 sub _build__genes_to_neighbourhood
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
81 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
82 my ( $self ) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
83 my %genes_to_neighbourhood;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
84 for my $fasta_file( @{$self->fasta_files})
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
85 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
86 my ( $filename, $directories, $suffix ) = fileparse( $fasta_file, qr/\.[^.]*/ );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
87 system('grep \> '.$fasta_file.'| sed \'s/>//\' >'.$self->_gene_files_temp_dir_obj."/".$filename.$suffix ) ;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
88
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
89 my @genes = read_lines($self->_gene_files_temp_dir_obj."/".$filename.$suffix );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
90
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
91 for(my $i =0; $i< @genes; $i++)
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
92 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
93 for(my $offset = 1; $offset <= $self->_neighbourhood_size; $offset++)
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
94 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
95 if($i -$offset >= 0)
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
96 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
97 push(@{$genes_to_neighbourhood{$genes[$i]}}, $genes[$i - $offset ]);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
98 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
99 if($i +$offset <@genes)
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
100 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
101 push(@{$genes_to_neighbourhood{$genes[$i]}}, $genes[$i + $offset ]);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
102 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
103 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
104 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
105 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
106 return \%genes_to_neighbourhood;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
107 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
108
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
109 sub split_groups {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
110 my ( $self ) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
111
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
112 # iteratively
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
113 for my $x ( 0..($self->iterations - 1) ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
114 my ( $in_groups, $out_groups ) = $self->_get_files_for_iteration( $x );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
115
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
116 # read in groups, check paralogs and split
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
117 my @newgroups;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
118 my $any_paralogs = 0;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
119 $self->_set_genes_to_groups( $in_groups );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
120 open( my $group_handle, '<', $in_groups );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
121 while( my $line = <$group_handle> ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
122 my @group = split( /\s+/, $line );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
123
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
124 if($self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]})
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
125 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
126 push( @newgroups, \@group );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
127 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
128 elsif(@group == 1)
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
129 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
130 $self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]}++;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
131 push( @newgroups, \@group );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
132 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
133 elsif( $self->_contains_paralogs( \@group ) ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
134 my @true_orthologs = @{ $self->_true_orthologs( \@group ) };
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
135 push( @newgroups, @true_orthologs);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
136 $any_paralogs = 1;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
137 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
138 else {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
139 $self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]}++;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
140 push( @newgroups, \@group );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
141 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
142 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
143 close( $group_handle );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
144
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
145 # check if next iteration required, move output if not
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
146 unless ($any_paralogs){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
147 move $in_groups, $self->outfile; # input file will be the same as new output file if no splitting has been performed
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
148 last;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
149 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
150
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
151 # write split groups to file
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
152 open( my $outfile_handle, '>', $out_groups );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
153 for my $g ( @newgroups ) {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
154 my $group_str = join( "\t", @{ $g } ) . "\n";
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
155 print $outfile_handle $group_str;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
156 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
157 close( $outfile_handle );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
158 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
159 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
160
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
161 sub _set_genes_to_groups {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
162 my ( $self, $groupfile ) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
163
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
164 my %genes2groups;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
165 my $c = 0;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
166 open( my $gfh, '<', $groupfile );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
167 while( my $line = <$gfh> ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
168 chomp $line;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
169 my @genes = split( /\s+/, $line );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
170 for my $g ( @genes ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
171 $genes2groups{$g} = $c;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
172 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
173 $c++;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
174 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
175 close($gfh);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
176 $self->_genes_to_groups( \%genes2groups );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
177 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
178
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
179 sub _update_genes_to_groups {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
180 my ( $self, $groups ) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
181
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
182 my %genes2groups = %{ $self->_genes_to_groups };
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
183 my $c = 1;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
184 for my $g ( @{ $groups } ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
185 for my $h ( @{ $g } ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
186 $genes2groups{$h} .= ".$c";
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
187 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
188 $c++;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
189 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
190
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
191 $self->_genes_to_groups( \%genes2groups );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
192 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
193
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
194 sub _get_files_for_iteration {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
195 my ( $self, $n ) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
196 my @filelist = @{ $self->_group_filelist };
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
197 return ( $filelist[$n], $filelist[$n+1] );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
198 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
199
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
200 sub _contains_paralogs {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
201 my ( $self, $group ) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
202
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
203 return 1 if defined $self->_find_paralogs( $group );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
204 return 0;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
205 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
206
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
207 sub _find_paralogs {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
208 my ( $self, $group ) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
209
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
210 my %occ;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
211 for my $gene ( @{ $group } ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
212 my $gene_file = $self->_genes_to_files->{ $gene };
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
213 push( @{ $occ{$gene_file} }, $gene );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
214 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
215
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
216 # pick the smallest number of paralogs
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
217 my $smallest_number = 1000000;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
218 my $smallest_group;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
219 for my $v ( values %occ ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
220 my $v_len = scalar( @{$v} );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
221 if ( $v_len < $smallest_number && $v_len > 1 ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
222 $smallest_number = $v_len;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
223 $smallest_group = $v;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
224 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
225 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
226 return $smallest_group if ( defined $smallest_group );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
227
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
228 return undef;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
229 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
230
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
231 sub _true_orthologs {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
232 my ( $self, $group ) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
233
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
234 # first, create CGN hash for group
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
235 my %cgns;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
236 for my $g ( @{ $group } ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
237 $cgns{$g} = $self->_parse_gene_neighbourhood( $g );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
238 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
239
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
240 # finding paralogs in the group
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
241 my @paralogs = @{ $self->_find_paralogs( $group ) };
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
242 my @paralog_cgns_groups;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
243 for my $p ( @paralogs ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
244 my %paralog_groups ;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
245 for my $paralog_gene (@{$cgns{$p}})
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
246 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
247 my $gene_paralog_group = $self->_genes_to_groups->{$paralog_gene};
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
248 next unless( defined($gene_paralog_group));
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
249 $paralog_groups{$self->_genes_to_groups->{$paralog_gene}}++;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
250 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
251 push( @paralog_cgns_groups, \%paralog_groups );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
252 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
253
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
254 # create data structure to hold new groups
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
255 my @new_groups;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
256 for my $p ( @paralogs ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
257 push( @new_groups, [ $p ] );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
258 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
259 push( @new_groups, [] ); # extra "leftovers" array to gather genes that don't share CGN with anything
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
260
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
261 # cluster other members of the group to their closest match
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
262 for my $g ( @{ $group } ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
263 next if ( grep {$_ eq $g} @paralogs );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
264 my $closest = $self->_closest_cgn( $cgns{$g}, \@paralog_cgns_groups );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
265 push( @{ $new_groups[$closest] }, $g );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
266 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
267
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
268 # check for "leftovers", remove if absent
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
269 my $last = pop @new_groups;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
270 push( @new_groups, $last ) if ( @$last > 0 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
271
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
272 # sort
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
273 if ( $self->_do_sorting ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
274 my @sorted_new_groups;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
275 for my $gr ( @new_groups ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
276 my @s_gr = sort @{ $gr };
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
277 push( @sorted_new_groups, \@s_gr );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
278 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
279 return \@sorted_new_groups;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
280 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
281
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
282 return \@new_groups;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
283 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
284
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
285 sub _closest_cgn {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
286 my ( $self, $cgn, $p_cgns ) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
287
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
288 my @paralog_cgns = @{ $p_cgns };
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
289 my $best_score = 0;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
290 my $bs_index = -1; # return -1 to add to "leftovers" array if no better score is found
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
291 for my $i ( 0..$#paralog_cgns ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
292 my $p_cgn = $paralog_cgns[$i];
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
293 my $score = $self->_shared_cgn_score( $cgn, $p_cgn );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
294 if ( $score > $best_score ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
295 $best_score = $score;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
296 $bs_index = $i;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
297 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
298 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
299 return $bs_index;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
300 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
301
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
302 sub _shared_cgn_score {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
303 my ( $self, $cgn1, $cgn2 ) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
304
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
305 my $total_shared = 0;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
306 for my $i ( @{ $cgn1 } ){
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
307 my $input_group = $self->_genes_to_groups->{$i};
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
308 next unless(defined($input_group));
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
309 $total_shared++ if($cgn2->{$input_group});
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
310 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
311 if( (scalar @{ $cgn1 }) == 0)
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
312 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
313 return 0;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
314 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
315 my $score = $total_shared/scalar @{ $cgn1 };
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
316 return $score;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
317 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
318
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
319 sub _parse_gene_neighbourhood {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
320 my ( $self, $gene_id ) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
321
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
322 return $self->_genes_to_neighbourhood->{$gene_id };
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
323
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
324 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
325
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
326 no Moose;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
327 __PACKAGE__->meta->make_immutable;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
328 1;