comparison Roary/lib/Bio/Roary/MergeMultifastaAlignments.pm @ 0:c47a5f61bc9f draft

Uploaded
author dereeper
date Fri, 14 May 2021 20:27:06 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c47a5f61bc9f
1 package Bio::Roary::MergeMultifastaAlignments;
2
3 # ABSTRACT: Merge multifasta alignment files with equal numbers of sequences.
4
5 =head1 SYNOPSIS
6
7 Merge multifasta alignment files with equal numbers of sequences.So each sequence in each file gets concatenated together. It is assumed the
8 sequences are in the correct order.
9 use Bio::Roary::MergeMultifastaAlignments;
10
11 my $obj = Bio::Roary::MergeMultifastaAlignments->new(
12 multifasta_files => [],
13 output_filename => 'output_merged.aln'
14 );
15 $obj->merge_files;
16
17 =cut
18
19 use Moose;
20 use Bio::SeqIO;
21 use Bio::Roary::Output::CoreGeneAlignmentCoordinatesEMBL;
22
23 has 'multifasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
24 has 'sample_names' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
25 has 'sample_names_to_genes' => ( is => 'rw', isa => 'HashRef', required => 1 );
26 has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'core_alignment.aln' );
27 has 'output_header_filename' => ( is => 'ro', isa => 'Str', default => 'core_alignment_header.embl' );
28 has '_output_seqio_obj' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__output_seqio_obj' );
29 has '_gene_lengths' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build__gene_lengths' );
30 has '_gene_to_sequence' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
31 has '_sorted_multifasta_files' => ( is => 'rw', isa => 'ArrayRef', lazy => 1, builder => '_build__sorted_multifasta_files' );
32
33 sub BUILD {
34 my ($self) = @_;
35 $self->_gene_lengths;
36 }
37
38 sub _input_seq_io_obj {
39 my ( $self, $filename ) = @_;
40 return Bio::SeqIO->new( -file => $filename, -format => 'Fasta' );
41 }
42
43 sub _build__output_seqio_obj {
44 my ($self) = @_;
45 return Bio::SeqIO->new( -file => ">" . $self->output_filename, -format => 'Fasta' );
46 }
47
48 sub _build__gene_lengths {
49 my ($self) = @_;
50 my %gene_lengths;
51 for my $filename ( @{ $self->_sorted_multifasta_files } ) {
52 my $seq_io = $self->_input_seq_io_obj($filename);
53 next unless ( defined($seq_io) );
54 while ( my $seq_record = $seq_io->next_seq ) {
55
56 # Save all of the gene sequences to memory, massive speedup but a bit naughty.
57 $self->_gene_to_sequence->{$filename}->{ $seq_record->display_id } = $seq_record->seq;
58 $gene_lengths{$filename} = $seq_record->length() if ( !defined( $gene_lengths{$filename} ) );
59 }
60 }
61
62 return \%gene_lengths;
63 }
64
65 sub _build__sorted_multifasta_files {
66 my ($self) = @_;
67 my @sorted_gene_files = sort @{ $self->multifasta_files };
68 return \@sorted_gene_files;
69 }
70
71 sub _sequence_for_sample_from_gene_file {
72 my ( $self, $sample_name, $gene_file ) = @_;
73
74 # loop over this to get the geneIDs
75 for my $gene_id ( sort keys %{ $self->_gene_to_sequence->{$gene_file} } ) {
76 if ( defined( $self->sample_names_to_genes->{$sample_name}->{$gene_id} ) ) {
77 return $self->_gene_to_sequence->{$gene_file}->{$gene_id};
78 }
79 }
80 return $self->_padded_string_for_gene_file($gene_file);
81 }
82
83 sub _padded_string_for_gene_file {
84 my ( $self, $gene_file ) = @_;
85 return '' unless ( defined( $self->_gene_lengths->{$gene_file} ) );
86 return '-' x ( $self->_gene_lengths->{$gene_file} );
87 }
88
89 sub _create_merged_sequence_for_sample {
90 my ( $self, $sample_name ) = @_;
91 my $merged_sequence = '';
92 for my $gene_file ( @{ $self->_sorted_multifasta_files } ) {
93 $merged_sequence .= $self->_sequence_for_sample_from_gene_file( $sample_name, $gene_file );
94 }
95 return $merged_sequence;
96 }
97
98 sub merge_files {
99 my ($self) = @_;
100
101 for my $sample_name ( @{ $self->sample_names } ) {
102 my $sequence = $self->_create_merged_sequence_for_sample($sample_name);
103 my $seq_io = Bio::Seq->new( -display_id => $sample_name, -seq => $sequence );
104 $self->_output_seqio_obj->write_seq($seq_io);
105 }
106
107 # Create a header file which gives the coordinates of each gene in the multifasta
108 Bio::Roary::Output::CoreGeneAlignmentCoordinatesEMBL->new(
109 multifasta_files => $self->_sorted_multifasta_files,
110 gene_lengths => $self->_gene_lengths,
111 output_filename => $self->output_header_filename
112 )->create_file();
113
114 return 1;
115 }
116
117 no Moose;
118 __PACKAGE__->meta->make_immutable;
119
120 1;
121