Mercurial > repos > dereeper > roary_plots
diff Roary/lib/Bio/Roary/SplitGroups.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Roary/lib/Bio/Roary/SplitGroups.pm Fri May 14 20:27:06 2021 +0000 @@ -0,0 +1,328 @@ +package Bio::Roary::SplitGroups; + +# ABSTRACT: split groups + +=head1 SYNOPSIS + + use Bio::Roary::SplitGroups; + +=cut + +use Moose; +use Bio::Roary::AnalyseGroups; +use File::Path qw(make_path remove_tree); +use File::Copy qw(move); +use File::Temp; +use File::Basename; +use File::Slurper 'read_lines'; +use Cwd; + + +has 'groupfile' => ( is => 'ro', isa => 'Str', required => 1 ); +has 'fasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); +has 'outfile' => ( is => 'ro', isa => 'Str', required => 1 ); +has 'iterations' => ( is => 'ro', isa => 'Int', default => 5 ); +has 'dont_delete' => ( is => 'ro', isa => 'Bool', default => 0 ); + +has '_neighbourhood_size' => ( is => 'ro', isa => 'Int', default => 5 ); + +has '_group_filelist' => ( is => 'rw', isa => 'ArrayRef', lazy_build => 1 ); +has '_tmp_dir_object' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } ); +has '_tmp_dir' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__tmp_dir' ); + +has '_analyse_groups_obj' => ( is => 'ro', lazy_build => 1 ); +has '_genes_to_files' => ( is => 'ro', lazy_build => 1 ); +has '_genes_to_groups' => ( is => 'rw', isa => 'HashRef' ); + +has '_first_gene_of_group_which_doesnt_have_paralogs' => ( is => 'rw', isa => 'HashRef', default => sub {{}} ); + +has '_genes_to_neighbourhood' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build__genes_to_neighbourhood' ); + + +has '_gene_files_temp_dir_obj' => + ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } ); + + +has '_do_sorting' => ( is => 'rw', isa => 'Bool', default => 0 ); # set to 1 for testing only + +sub _build__tmp_dir { + my ($self) = @_; + return $self->_tmp_dir_object->dirname(); +} + +sub _build__analyse_groups_obj { + my ( $self ) = @_; + + return Bio::Roary::AnalyseGroups->new( + fasta_files => $self->fasta_files, + groups_filename => $self->groupfile + ); +} + +sub _build__genes_to_files { + my ( $self ) = @_; + return $self->_analyse_groups_obj->_genes_to_file; +} + +sub _build__group_filelist { + my ( $self ) = @_; + my $tmp = $self->_tmp_dir; + + my @filelist = ( $self->groupfile ); + for my $i ( 1..($self->iterations - 1) ){ + push( @filelist, "$tmp/group_$i" ); + } + push( @filelist, $self->outfile ); + + return \@filelist; +} + +sub _build__genes_to_neighbourhood +{ + my ( $self ) = @_; + my %genes_to_neighbourhood; + for my $fasta_file( @{$self->fasta_files}) + { + my ( $filename, $directories, $suffix ) = fileparse( $fasta_file, qr/\.[^.]*/ ); + system('grep \> '.$fasta_file.'| sed \'s/>//\' >'.$self->_gene_files_temp_dir_obj."/".$filename.$suffix ) ; + + my @genes = read_lines($self->_gene_files_temp_dir_obj."/".$filename.$suffix ); + + for(my $i =0; $i< @genes; $i++) + { + for(my $offset = 1; $offset <= $self->_neighbourhood_size; $offset++) + { + if($i -$offset >= 0) + { + push(@{$genes_to_neighbourhood{$genes[$i]}}, $genes[$i - $offset ]); + } + if($i +$offset <@genes) + { + push(@{$genes_to_neighbourhood{$genes[$i]}}, $genes[$i + $offset ]); + } + } + } + } + return \%genes_to_neighbourhood; +} + +sub split_groups { + my ( $self ) = @_; + + # iteratively + for my $x ( 0..($self->iterations - 1) ){ + my ( $in_groups, $out_groups ) = $self->_get_files_for_iteration( $x ); + + # read in groups, check paralogs and split + my @newgroups; + my $any_paralogs = 0; + $self->_set_genes_to_groups( $in_groups ); + open( my $group_handle, '<', $in_groups ); + while( my $line = <$group_handle> ){ + my @group = split( /\s+/, $line ); + + if($self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]}) + { + push( @newgroups, \@group ); + } + elsif(@group == 1) + { + $self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]}++; + push( @newgroups, \@group ); + } + elsif( $self->_contains_paralogs( \@group ) ){ + my @true_orthologs = @{ $self->_true_orthologs( \@group ) }; + push( @newgroups, @true_orthologs); + $any_paralogs = 1; + } + else { + $self->_first_gene_of_group_which_doesnt_have_paralogs->{$group[0]}++; + push( @newgroups, \@group ); + } + } + close( $group_handle ); + + # check if next iteration required, move output if not + unless ($any_paralogs){ + move $in_groups, $self->outfile; # input file will be the same as new output file if no splitting has been performed + last; + } + + # write split groups to file + open( my $outfile_handle, '>', $out_groups ); + for my $g ( @newgroups ) { + my $group_str = join( "\t", @{ $g } ) . "\n"; + print $outfile_handle $group_str; + } + close( $outfile_handle ); + } +} + +sub _set_genes_to_groups { + my ( $self, $groupfile ) = @_; + + my %genes2groups; + my $c = 0; + open( my $gfh, '<', $groupfile ); + while( my $line = <$gfh> ){ + chomp $line; + my @genes = split( /\s+/, $line ); + for my $g ( @genes ){ + $genes2groups{$g} = $c; + } + $c++; + } + close($gfh); + $self->_genes_to_groups( \%genes2groups ); +} + +sub _update_genes_to_groups { + my ( $self, $groups ) = @_; + + my %genes2groups = %{ $self->_genes_to_groups }; + my $c = 1; + for my $g ( @{ $groups } ){ + for my $h ( @{ $g } ){ + $genes2groups{$h} .= ".$c"; + } + $c++; + } + + $self->_genes_to_groups( \%genes2groups ); +} + +sub _get_files_for_iteration { + my ( $self, $n ) = @_; + my @filelist = @{ $self->_group_filelist }; + return ( $filelist[$n], $filelist[$n+1] ); +} + +sub _contains_paralogs { + my ( $self, $group ) = @_; + + return 1 if defined $self->_find_paralogs( $group ); + return 0; +} + +sub _find_paralogs { + my ( $self, $group ) = @_; + + my %occ; + for my $gene ( @{ $group } ){ + my $gene_file = $self->_genes_to_files->{ $gene }; + push( @{ $occ{$gene_file} }, $gene ); + } + + # pick the smallest number of paralogs + my $smallest_number = 1000000; + my $smallest_group; + for my $v ( values %occ ){ + my $v_len = scalar( @{$v} ); + if ( $v_len < $smallest_number && $v_len > 1 ){ + $smallest_number = $v_len; + $smallest_group = $v; + } + } + return $smallest_group if ( defined $smallest_group ); + + return undef; +} + +sub _true_orthologs { + my ( $self, $group ) = @_; + + # first, create CGN hash for group + my %cgns; + for my $g ( @{ $group } ){ + $cgns{$g} = $self->_parse_gene_neighbourhood( $g ); + } + + # finding paralogs in the group + my @paralogs = @{ $self->_find_paralogs( $group ) }; + my @paralog_cgns_groups; + for my $p ( @paralogs ){ + my %paralog_groups ; + for my $paralog_gene (@{$cgns{$p}}) + { + my $gene_paralog_group = $self->_genes_to_groups->{$paralog_gene}; + next unless( defined($gene_paralog_group)); + $paralog_groups{$self->_genes_to_groups->{$paralog_gene}}++; + } + push( @paralog_cgns_groups, \%paralog_groups ); + } + + # create data structure to hold new groups + my @new_groups; + for my $p ( @paralogs ){ + push( @new_groups, [ $p ] ); + } + push( @new_groups, [] ); # extra "leftovers" array to gather genes that don't share CGN with anything + + # cluster other members of the group to their closest match + for my $g ( @{ $group } ){ + next if ( grep {$_ eq $g} @paralogs ); + my $closest = $self->_closest_cgn( $cgns{$g}, \@paralog_cgns_groups ); + push( @{ $new_groups[$closest] }, $g ); + } + + # check for "leftovers", remove if absent + my $last = pop @new_groups; + push( @new_groups, $last ) if ( @$last > 0 ); + + # sort + if ( $self->_do_sorting ){ + my @sorted_new_groups; + for my $gr ( @new_groups ){ + my @s_gr = sort @{ $gr }; + push( @sorted_new_groups, \@s_gr ); + } + return \@sorted_new_groups; + } + + return \@new_groups; +} + +sub _closest_cgn { + my ( $self, $cgn, $p_cgns ) = @_; + + my @paralog_cgns = @{ $p_cgns }; + my $best_score = 0; + my $bs_index = -1; # return -1 to add to "leftovers" array if no better score is found + for my $i ( 0..$#paralog_cgns ){ + my $p_cgn = $paralog_cgns[$i]; + my $score = $self->_shared_cgn_score( $cgn, $p_cgn ); + if ( $score > $best_score ){ + $best_score = $score; + $bs_index = $i; + } + } + return $bs_index; +} + +sub _shared_cgn_score { + my ( $self, $cgn1, $cgn2 ) = @_; + + my $total_shared = 0; + for my $i ( @{ $cgn1 } ){ + my $input_group = $self->_genes_to_groups->{$i}; + next unless(defined($input_group)); + $total_shared++ if($cgn2->{$input_group}); + } + if( (scalar @{ $cgn1 }) == 0) + { + return 0; + } + my $score = $total_shared/scalar @{ $cgn1 }; + return $score; +} + +sub _parse_gene_neighbourhood { + my ( $self, $gene_id ) = @_; + + return $self->_genes_to_neighbourhood->{$gene_id }; + +} + +no Moose; +__PACKAGE__->meta->make_immutable; +1;