Mercurial > repos > dereeper > roary_plots
diff Roary/lib/Bio/Roary/AccessoryBinaryFasta.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Roary/lib/Bio/Roary/AccessoryBinaryFasta.pm Fri May 14 20:27:06 2021 +0000 @@ -0,0 +1,100 @@ +package Bio::Roary::AccessoryBinaryFasta; + +# ABSTRACT: Output a FASTA file which represents the binary presence and absence of genes in the accessory genome + +=head1 SYNOPSIS + +Output a FASTA file which represents the binary presence and absence of genes in the accessory genome + use Bio::Roary::AccessoryBinaryFasta; + my $obj = Bio::Roary::AccessoryBinaryFasta->new(input_files => ['abc','efg'], + groups_to_files => {'group_1' => ['abc'], group_2 => ['abc', 'efg']} + ); + $obj->create_accessory_binary_fasta(); +=cut + +use Moose; +use POSIX; +use Bio::Roary::AnnotateGroups; +use Bio::Roary::AnalyseGroups; +use Bio::Roary::Exceptions; +use Bio::SeqIO; +use File::Basename; + +has 'input_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); +has 'annotate_groups_obj' => ( is => 'ro', isa => 'Bio::Roary::AnnotateGroups', required => 1 ); +has 'analyse_groups_obj' => ( is => 'ro', isa => 'Bio::Roary::AnalyseGroups', required => 1 ); +has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'accessory_binary_genes.fa' ); +has 'lower_bound_percentage' => ( is => 'ro', isa => 'Int', default => 5 ); +has 'upper_bound_percentage' => ( is => 'ro', isa => 'Int', default => 5 ); +has 'max_accessory_to_include' => ( is => 'ro', isa => 'Int', default => 4000 ); +has 'groups_to_files' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__groups_to_files' ); +has '_lower_bound_value' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__lower_bound_value' ); +has '_upper_bound_value' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__upper_bound_value' ); + +sub _build__groups_to_files { + my ($self) = @_; + my %groups_to_files; + for my $group ( @{ $self->annotate_groups_obj->_groups } ) { + my $genes = $self->annotate_groups_obj->_groups_to_id_names->{$group}; + my %filenames; + for my $gene_name ( @{$genes} ) { + my $filename = $self->analyse_groups_obj->_genes_to_file->{$gene_name}; + push( @{ $filenames{$filename} }, $gene_name ); + } + $groups_to_files{$group} = \%filenames; + } + + return \%groups_to_files; +} + +sub _build__lower_bound_value { + my ($self) = @_; + my $num_files = @{ $self->input_files }; + return ceil( $num_files * ( $self->lower_bound_percentage / 100 ) ); +} + +sub _build__upper_bound_value { + my ($self) = @_; + my $num_files = @{ $self->input_files }; + return $num_files - ceil( $num_files * ( $self->upper_bound_percentage / 100 ) ); +} + +sub create_accessory_binary_fasta { + my ($self) = @_; + my $out_seq_io = Bio::SeqIO->new( -file => ">" . $self->output_filename, -format => 'Fasta' ); + + for my $full_filename ( @{ $self->input_files } ) { + my($filename, $dirs, $suffix) = fileparse($full_filename); + + my $output_sequence = ''; + my $sample_name = $filename; + $sample_name =~ s!\.gff\.proteome\.faa!!; + + my $gene_count = 0; + for my $group ( sort keys %{ $self->groups_to_files } ) { + last if($gene_count > $self->max_accessory_to_include); + + my @files = keys %{ $self->groups_to_files->{$group} }; + + next if ( @files <= $self->_lower_bound_value || @files > $self->_upper_bound_value ); + + my $group_to_file_genes = $self->groups_to_files->{$group}->{$full_filename}; + if ( defined($group_to_file_genes) && @{$group_to_file_genes} > 0 ) { + $output_sequence .= 'A'; + } + else { + $output_sequence .= 'C'; + } + $gene_count++; + + } + next if($output_sequence eq ''); + $out_seq_io->write_seq( Bio::Seq->new( -display_id => $sample_name, -seq => $output_sequence ) ); + } + return 1; +} + +no Moose; +__PACKAGE__->meta->make_immutable; + +1;