annotate Roary/lib/Bio/Roary/AccessoryBinaryFasta.pm @ 0:c47a5f61bc9f draft

Uploaded
author dereeper
date Fri, 14 May 2021 20:27:06 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
1 package Bio::Roary::AccessoryBinaryFasta;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
2
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
3 # ABSTRACT: Output a FASTA file which represents the binary presence and absence of genes in the accessory genome
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
4
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
=head1 SYNOPSIS

Output a FASTA file which represents the binary presence and absence of genes in the accessory genome.
Each input file yields one sequence with one character per accessory group: 'A' if the file has a gene
in that group, 'C' otherwise.

   use Bio::Roary::AccessoryBinaryFasta;

   my $obj = Bio::Roary::AccessoryBinaryFasta->new(
       input_files         => [ 'abc', 'efg' ],
       annotate_groups_obj => $annotate_groups_obj,    # required, a Bio::Roary::AnnotateGroups
       analyse_groups_obj  => $analyse_groups_obj,     # required, a Bio::Roary::AnalyseGroups
       # optional; built from the two objects above when omitted
       groups_to_files     => {
           'group_1' => { 'abc' => ['gene_1'] },
           'group_2' => { 'abc' => ['gene_2'], 'efg' => ['gene_3'] },
       },
   );
   $obj->create_accessory_binary_fasta();

=cut
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
14
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
15 use Moose;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
16 use POSIX;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
17 use Bio::Roary::AnnotateGroups;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
18 use Bio::Roary::AnalyseGroups;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
19 use Bio::Roary::Exceptions;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
20 use Bio::SeqIO;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
21 use File::Basename;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
22
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
# --- Attributes supplied by the caller ---------------------------------------

# Input file names. Each entry is used as-is as a key into the per-group file
# table, and its base name (minus '.gff.proteome.faa') becomes the sample name.
has 'input_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );

# Source of the group list and of the gene names belonging to each group.
has 'annotate_groups_obj' => ( is => 'ro', isa => 'Bio::Roary::AnnotateGroups', required => 1 );

# Source of the gene name -> file name lookup.
has 'analyse_groups_obj' => ( is => 'ro', isa => 'Bio::Roary::AnalyseGroups', required => 1 );

# Name of the FASTA file that create_accessory_binary_fasta() writes.
has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'accessory_binary_genes.fa' );

# Percentages of the number of input files used to derive the lower and upper
# bounds on how many files a group may occur in to be written out.
has 'lower_bound_percentage' => ( is => 'ro', isa => 'Int', default => 5 );
has 'upper_bound_percentage' => ( is => 'ro', isa => 'Int', default => 5 );

# Cap on the number of groups (columns) written per sample.
has 'max_accessory_to_include' => ( is => 'ro', isa => 'Int', default => 4000 );

# --- Derived attributes (built lazily) ----------------------------------------

# { group_name => { file_name => [ gene_name, ... ] } }
has 'groups_to_files' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__groups_to_files' );

# Absolute file counts derived from the two percentages above.
has '_lower_bound_value' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__lower_bound_value' );
has '_upper_bound_value' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build__upper_bound_value' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
33
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
# Builder for the 'groups_to_files' attribute.
#
# For every group listed by annotate_groups_obj, looks up the file recorded
# for each of the group's genes in analyse_groups_obj and collects the genes
# per file.
#
# Returns a hash reference shaped as
#     { group_name => { file_name => [ gene_name, ... ], ... }, ... }
# A group with no resolvable genes maps to an empty hash reference.
sub _build__groups_to_files {
    my ($self) = @_;

    my %groups_to_files;
    for my $group ( @{ $self->annotate_groups_obj->_groups } ) {
        my $genes = $self->annotate_groups_obj->_groups_to_id_names->{$group} || [];

        my %genes_in_file;
        for my $gene_name ( @{$genes} ) {
            my $filename = $self->analyse_groups_obj->_genes_to_file->{$gene_name};

            # A gene without a known source file must not be filed under the
            # empty-string key: that would count as an extra "file" when the
            # number of files containing this group is computed.
            next unless defined $filename;

            push( @{ $genes_in_file{$filename} }, $gene_name );
        }
        $groups_to_files{$group} = \%genes_in_file;
    }

    return \%groups_to_files;
}
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
49
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
# Builder for the '_lower_bound_value' attribute.
#
# Returns lower_bound_percentage percent of the number of input files,
# rounded up to a whole number of files.
sub _build__lower_bound_value {
    my ($self) = @_;
    my $num_files = scalar @{ $self->input_files };

    # Multiply the two integers before dividing: computing (percentage / 100)
    # first introduces a binary rounding error (100 * (7 / 100) is slightly
    # above 7) that ceil() would then round up to the wrong integer.
    return ceil( $num_files * $self->lower_bound_percentage / 100 );
}
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
55
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
# Builder for the '_upper_bound_value' attribute.
#
# Returns the number of input files minus upper_bound_percentage percent of
# them (that percentage being rounded up to a whole number of files).
sub _build__upper_bound_value {
    my ($self) = @_;
    my $num_files = scalar @{ $self->input_files };

    # Multiply the two integers before dividing: computing (percentage / 100)
    # first introduces a binary rounding error (100 * (7 / 100) is slightly
    # above 7) that ceil() would then round up to the wrong integer.
    my $excluded = ceil( $num_files * $self->upper_bound_percentage / 100 );

    return $num_files - $excluded;
}
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
61
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
# Write the accessory-genome presence/absence matrix as a FASTA file.
#
# One sequence is written to output_filename per entry in input_files. The
# sequence has one character per selected group, in sorted group-name order:
# 'A' when groups_to_files lists at least one gene for that file in the group,
# 'C' otherwise. A group is selected when the number of files it occurs in is
# greater than _lower_bound_value and not greater than _upper_bound_value; at
# most max_accessory_to_include groups are selected. Files whose sequence
# would be empty are not written.
#
# Returns 1.
sub create_accessory_binary_fasta {
    my ($self) = @_;
    my $out_seq_io = Bio::SeqIO->new( -file => ">" . $self->output_filename, -format => 'Fasta' );

    my $groups_to_files = $self->groups_to_files;
    my $lower_bound     = $self->_lower_bound_value;
    my $upper_bound     = $self->_upper_bound_value;
    my $max_groups      = $self->max_accessory_to_include;

    # The set of groups to report does not depend on the input file, so it is
    # worked out once here instead of being re-sorted and re-filtered per file.
    my @accessory_groups;
    for my $group ( sort keys %{$groups_to_files} ) {

        # '>=' so that no more than max_accessory_to_include groups are used.
        last if @accessory_groups >= $max_groups;

        my $num_files_with_group = keys %{ $groups_to_files->{$group} };
        next if $num_files_with_group <= $lower_bound || $num_files_with_group > $upper_bound;

        push @accessory_groups, $group;
    }

    for my $full_filename ( @{ $self->input_files } ) {
        my ($filename) = fileparse($full_filename);

        my $sample_name = $filename;
        $sample_name =~ s!\.gff\.proteome\.faa!!;

        my $output_sequence = '';
        for my $group (@accessory_groups) {
            my $genes_in_file = $groups_to_files->{$group}->{$full_filename};
            $output_sequence .= ( defined($genes_in_file) && @{$genes_in_file} > 0 ) ? 'A' : 'C';
        }

        next if $output_sequence eq '';
        $out_seq_io->write_seq( Bio::Seq->new( -display_id => $sample_name, -seq => $output_sequence ) );
    }
    return 1;
}
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
96
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
# Remove the Moose sugar (has, etc.) from this namespace and finalise the class.
no Moose;
__PACKAGE__->meta->make_immutable;

# A module file must end with a true value.
1;