Mercurial > repos > dereeper > roary_plots
view Roary/lib/Bio/Roary/AnnotateGroups.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
line wrap: on
line source
package Bio::Roary::AnnotateGroups; # ABSTRACT: Take in a group file and associated GFF files for the isolates and update the group name to the gene name =head1 SYNOPSIS Take in a group file and associated GFF files for the isolates and update the group name to the gene name use Bio::Roary::AnnotateGroups; my $obj = Bio::Roary::AnnotateGroups->new( gff_files => ['abc.gff','efg.gff'], output_filename => 'example_output.fa', groups_filename => 'groupsfile', ); $obj->reannotate; =cut use Moose; use Bio::Roary::Exceptions; use Bio::Roary::GeneNamesFromGFF; use Array::Utils qw(array_minus); use List::Util qw(max min sum); use File::Grep qw(fgrep); has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'reannotated_groups_file' ); has 'groups_filename' => ( is => 'ro', isa => 'Str', required => 1 ); has '_ids_to_gene_names' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__ids_to_gene_names' ); has '_ids_to_product' => ( is => 'rw', isa => 'HashRef', default => sub { {} } ); has '_ids_to_gene_size' => ( is => 'rw', isa => 'HashRef', default => sub { {} } ); has 'group_nucleotide_lengths' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_group_nucleotide_lengths'); has '_groups_to_id_names' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_builder__groups_to_id_names' ); has '_output_fh' => ( is => 'ro', lazy => 1, builder => '_build__output_fh' ); has '_groups_to_consensus_gene_names' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build__groups_to_consensus_gene_names' ); has '_filtered_gff_files' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build__filtered_gff_files' ); has '_number_of_files' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__number_of_files' ); has '_ids_to_groups' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_builder__ids_to_groups' ); has '_group_counter' => ( is => 'rw', isa => 'Int', lazy => 1, builder => '_builder__group_counter' ); has '_group_default_prefix' => ( is => 'rw', isa => 'Str', default => 'group_' ); has '_ids_to_verbose_stats' => ( is => 'rw', isa => 'HashRef', lazy_build => 1 ); sub BUILD { my ($self) = @_; $self->_ids_to_gene_names; } sub _builder__group_counter { my ($self) = @_; my $prefix = $self->_group_default_prefix; my $highest_group = 0; for my $group ( @{ $self->_groups } ) { if ( $group =~ /$prefix([\d]+)$/ ) { my $group_id = $1; if ( $group_id > $highest_group ) { $highest_group = $group_id; } } } return $highest_group + 1; } sub _generate__ids_to_groups { my ($self) = @_; my %ids_to_groups; for my $group ( keys %{ $self->_groups_to_id_names } ) { for my $id_name ( @{ $self->_groups_to_id_names->{$group} } ) { $ids_to_groups{$id_name} = $group; } } return \%ids_to_groups; } sub _builder__ids_to_groups { my ($self) = @_; return $self->_generate__ids_to_groups; } sub _build__output_fh { my ($self) = @_; open( my $fh, '>', $self->output_filename ) or Bio::Roary::Exceptions::CouldntWriteToFile->throw( error => "Couldnt write output file:" . $self->output_filename ); return $fh; } sub _build__filtered_gff_files { my ($self) = @_; my @gff_files = grep( /\.gff$/, @{ $self->gff_files } ); return \@gff_files; } sub _build__ids_to_gene_names { my ($self) = @_; my %ids_to_gene_names; my %ids_to_product; my %ids_to_gene_size; for my $filename ( @{ $self->_filtered_gff_files } ) { my $gene_names_from_gff = Bio::Roary::GeneNamesFromGFF->new( gff_file => $filename ); my %id_to_gene_lookup = %{ $gene_names_from_gff->ids_to_gene_name }; @ids_to_gene_names{ keys %id_to_gene_lookup } = values %id_to_gene_lookup; my %id_to_product_lookup = %{ $gene_names_from_gff->ids_to_product }; @ids_to_product{ keys %id_to_product_lookup } = values %id_to_product_lookup; my %ids_to_gene_size_lookup = %{ $gene_names_from_gff->ids_to_gene_size }; @ids_to_gene_size{ keys %ids_to_gene_size_lookup } = values %ids_to_gene_size_lookup; } $self->_ids_to_product( \%ids_to_product ); $self->_ids_to_gene_size( \%ids_to_gene_size ); return \%ids_to_gene_names; } sub _build__ids_to_verbose_stats { my $self = shift; my @matches_hash = fgrep { /ID=/i } @{ $self->_filtered_gff_files }; my @matches; foreach my $m ( @matches_hash ){ push( @matches, values %{$m->{matches}} ); } # chomp @matches; my %verbose; foreach my $line ( @matches ){ my ( $id, $inf, $prod ); if( $line =~ m/ID=["']?([^;"']+)["']?;?/i ){ $id = $1; } else { next; } $inf = $1 if ( $line =~ m/inference=([^;]+);/ ); $prod = $1 if ( $line =~ m/product=([^;]+)[;\n]/ ); my %info = ( 'inference' => $inf, 'product' => $prod ); $verbose{$id} = \%info; } return \%verbose; } sub consensus_product_for_id_names { my ( $self, $id_names ) = @_; my %product_freq; for my $id_name ( @{$id_names} ) { next unless ( defined( $self->_ids_to_product->{$id_name} ) ); $product_freq{ $self->_ids_to_product->{$id_name} }++; } my @sorted_product_keys = sort { $product_freq{$b} <=> $product_freq{$a} } keys(%product_freq); if ( @sorted_product_keys > 0 ) { return $sorted_product_keys[0]; } else { return ''; } } sub _builder__groups_to_id_names { my ($self) = @_; my %groups_to_id_names; open( my $fh, $self->groups_filename ) or Bio::Roary::Exceptions::FileNotFound->throw( error => "Group file not found:" . $self->groups_filename ); while (<$fh>) { chomp; my $line = $_; if ( $line =~ /^(.+): (.+)$/ ) { my $group_name = $1; my $genes = $2; my @elements = split( /[\s\t]+/, $genes ); $groups_to_id_names{$group_name} = \@elements; } } return \%groups_to_id_names; } sub _groups { my ($self) = @_; my @groups = keys %{ $self->_groups_to_id_names }; return \@groups; } sub _ids_grouped_by_gene_name_for_group { my ( $self, $group_name ) = @_; my %gene_name_freq; for my $id_name ( @{ $self->_groups_to_id_names->{$group_name} } ) { if ( defined( $self->_ids_to_gene_names->{$id_name} ) && $self->_ids_to_gene_names->{$id_name} ne "" ) { push( @{ $gene_name_freq{ $self->_ids_to_gene_names->{$id_name} } }, $id_name ); } } return \%gene_name_freq; } sub _consensus_gene_name_for_group { my ( $self, $group_name ) = @_; my $gene_name_freq = $self->_ids_grouped_by_gene_name_for_group($group_name); my @sorted_gene_names = sort { @{ $gene_name_freq->{$b} } <=> @{ $gene_name_freq->{$a} } } keys %{$gene_name_freq}; if ( @sorted_gene_names > 0 ) { return shift(@sorted_gene_names); } else { return $group_name; } } sub _build_group_nucleotide_lengths { my ($self) = @_; my %group_nucleotide_lengths; for my $group_name (keys %{ $self->_groups_to_id_names } ) { my @gene_lengths; for my $gene_id (@{$self->_groups_to_id_names->{$group_name}}) { my $current_gene_size = $self->_ids_to_gene_size->{$gene_id}; next unless(defined($current_gene_size) ); next if($current_gene_size < 1); push(@gene_lengths, $current_gene_size); } next if(@gene_lengths == 0); my $average_gene_size = (int((sum @gene_lengths)/@gene_lengths)) || 0; my $min_gene_size = (min @gene_lengths) || 0; my $max_gene_size = (max @gene_lengths) || 0; $group_nucleotide_lengths{$group_name} = {'min' => $min_gene_size, 'max' =>$max_gene_size , 'average' => $average_gene_size}; } return \%group_nucleotide_lengths; } sub _generate_groups_to_consensus_gene_names { my ($self) = @_; my %groups_to_gene_names; my %gene_name_freq; my $group_prefix = $self->_group_default_prefix; # These are already annotated for my $group_name ( sort { @{ $self->_groups_to_id_names->{$b} } <=> @{ $self->_groups_to_id_names->{$a} } } keys %{ $self->_groups_to_id_names } ) { next if ( $group_name =~ /$group_prefix/ ); $groups_to_gene_names{$group_name} = $group_name; } for my $group_name ( sort { @{ $self->_groups_to_id_names->{$b} } <=> @{ $self->_groups_to_id_names->{$a} } } keys %{ $self->_groups_to_id_names } ) { next unless ( $group_name =~ /$group_prefix/ ); my $consensus_gene_name = $self->_consensus_gene_name_for_group($group_name); if ( defined( $gene_name_freq{$consensus_gene_name} ) ) { $groups_to_gene_names{$group_name} = $group_name; } else { $groups_to_gene_names{$group_name} = $consensus_gene_name; $gene_name_freq{$consensus_gene_name}++; } } return \%groups_to_gene_names; } sub _build__groups_to_consensus_gene_names { my ($self) = @_; return $self->_generate_groups_to_consensus_gene_names; } sub _build__number_of_files { my ($self) = @_; return @{ $self->gff_files }; } sub _split_groups { my ($self) = @_; $self->_groups_to_consensus_gene_names( $self->_generate_groups_to_consensus_gene_names ); $self->_ids_to_groups( $self->_generate__ids_to_groups ); } sub _remove_ids_from_group { my ( $self, $ids_to_remove, $group ) = @_; my @remaining_ids = array_minus( @{ $self->_groups_to_id_names->{$group} }, @{ $ids_to_remove } ); $self->_groups_to_id_names->{$group} = \@remaining_ids; if ( @{ $self->_groups_to_id_names->{$group} } == 0 ) { delete( $self->_groups_to_id_names->{$group} ); } } sub reannotate { my ($self) = @_; $self->_split_groups; my %groups_to_id_names = %{ $self->_groups_to_id_names }; for my $group_name ( sort { @{ $groups_to_id_names{$b} } <=> @{ $groups_to_id_names{$a} } } keys %groups_to_id_names ) { my $consensus_gene_name = $self->_groups_to_consensus_gene_names->{$group_name}; print { $self->_output_fh } $consensus_gene_name . ": " . join( "\t", @{ $self->_groups_to_id_names->{$group_name} } ) . "\n"; } close( $self->_output_fh ); return $self; } sub full_annotation { my ( $self, $group ) = @_; my @id_names = @{ $self->_groups_to_id_names->{$group} }; my %product_freq; for my $id_name ( @id_names ) { next unless ( defined( $self->_ids_to_verbose_stats->{$id_name}->{'product'} ) ); $product_freq{ $self->_ids_to_verbose_stats->{$id_name}->{'product'} }++; } my @sorted_product_keys = sort { $product_freq{$b} <=> $product_freq{$a} } keys(%product_freq); if ( @sorted_product_keys > 0 ) { return join('; ', @sorted_product_keys); } else { return ''; } } sub inference { my ( $self, $group ) = @_; my @infs; foreach my $g ( @{ $self->_groups_to_id_names->{$group} } ){ next unless ( defined $self->_ids_to_verbose_stats->{$g}->{'inference'} ); push( @infs, $self->_ids_to_verbose_stats->{$g}->{'inference'} ); } # maybe make a consensus in the future? return $infs[0]; } no Moose; __PACKAGE__->meta->make_immutable; 1;