annotate Roary/lib/Bio/Roary/MergeMultifastaAlignments.pm @ 0:c47a5f61bc9f draft

Uploaded
author dereeper
date Fri, 14 May 2021 20:27:06 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
1 package Bio::Roary::MergeMultifastaAlignments;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
2
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
3 # ABSTRACT: Merge multifasta alignment files with equal numbers of sequences.
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
4
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
5 =head1 SYNOPSIS
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
6
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
7 Merge multifasta alignment files with equal numbers of sequences.So each sequence in each file gets concatenated together. It is assumed the
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
8 sequences are in the correct order.
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
9 use Bio::Roary::MergeMultifastaAlignments;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
10
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
11 my $obj = Bio::Roary::MergeMultifastaAlignments->new(
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
12 multifasta_files => [],
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
13 output_filename => 'output_merged.aln'
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
14 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
15 $obj->merge_files;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
16
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
17 =cut
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
18
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
19 use Moose;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
20 use Bio::SeqIO;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
21 use Bio::Roary::Output::CoreGeneAlignmentCoordinatesEMBL;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
22
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
23 has 'multifasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
24 has 'sample_names' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
25 has 'sample_names_to_genes' => ( is => 'rw', isa => 'HashRef', required => 1 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
26 has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'core_alignment.aln' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
27 has 'output_header_filename' => ( is => 'ro', isa => 'Str', default => 'core_alignment_header.embl' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
28 has '_output_seqio_obj' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__output_seqio_obj' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
29 has '_gene_lengths' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build__gene_lengths' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
30 has '_gene_to_sequence' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
31 has '_sorted_multifasta_files' => ( is => 'rw', isa => 'ArrayRef', lazy => 1, builder => '_build__sorted_multifasta_files' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
32
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
33 sub BUILD {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
34 my ($self) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
35 $self->_gene_lengths;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
36 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
37
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
38 sub _input_seq_io_obj {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
39 my ( $self, $filename ) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
40 return Bio::SeqIO->new( -file => $filename, -format => 'Fasta' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
41 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
42
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
43 sub _build__output_seqio_obj {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
44 my ($self) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
45 return Bio::SeqIO->new( -file => ">" . $self->output_filename, -format => 'Fasta' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
46 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
47
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
48 sub _build__gene_lengths {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
49 my ($self) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
50 my %gene_lengths;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
51 for my $filename ( @{ $self->_sorted_multifasta_files } ) {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
52 my $seq_io = $self->_input_seq_io_obj($filename);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
53 next unless ( defined($seq_io) );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
54 while ( my $seq_record = $seq_io->next_seq ) {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
55
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
56 # Save all of the gene sequences to memory, massive speedup but a bit naughty.
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
57 $self->_gene_to_sequence->{$filename}->{ $seq_record->display_id } = $seq_record->seq;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
58 $gene_lengths{$filename} = $seq_record->length() if ( !defined( $gene_lengths{$filename} ) );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
59 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
60 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
61
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
62 return \%gene_lengths;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
63 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
64
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
65 sub _build__sorted_multifasta_files {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
66 my ($self) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
67 my @sorted_gene_files = sort @{ $self->multifasta_files };
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
68 return \@sorted_gene_files;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
69 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
70
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
71 sub _sequence_for_sample_from_gene_file {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
72 my ( $self, $sample_name, $gene_file ) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
73
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
74 # loop over this to get the geneIDs
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
75 for my $gene_id ( sort keys %{ $self->_gene_to_sequence->{$gene_file} } ) {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
76 if ( defined( $self->sample_names_to_genes->{$sample_name}->{$gene_id} ) ) {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
77 return $self->_gene_to_sequence->{$gene_file}->{$gene_id};
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
78 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
79 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
80 return $self->_padded_string_for_gene_file($gene_file);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
81 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
82
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
83 sub _padded_string_for_gene_file {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
84 my ( $self, $gene_file ) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
85 return '' unless ( defined( $self->_gene_lengths->{$gene_file} ) );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
86 return '-' x ( $self->_gene_lengths->{$gene_file} );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
87 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
88
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
89 sub _create_merged_sequence_for_sample {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
90 my ( $self, $sample_name ) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
91 my $merged_sequence = '';
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
92 for my $gene_file ( @{ $self->_sorted_multifasta_files } ) {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
93 $merged_sequence .= $self->_sequence_for_sample_from_gene_file( $sample_name, $gene_file );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
94 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
95 return $merged_sequence;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
96 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
97
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
98 sub merge_files {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
99 my ($self) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
100
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
101 for my $sample_name ( @{ $self->sample_names } ) {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
102 my $sequence = $self->_create_merged_sequence_for_sample($sample_name);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
103 my $seq_io = Bio::Seq->new( -display_id => $sample_name, -seq => $sequence );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
104 $self->_output_seqio_obj->write_seq($seq_io);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
105 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
106
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
107 # Create a header file which gives the coordinates of each gene in the multifasta
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
108 Bio::Roary::Output::CoreGeneAlignmentCoordinatesEMBL->new(
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
109 multifasta_files => $self->_sorted_multifasta_files,
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
110 gene_lengths => $self->_gene_lengths,
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
111 output_filename => $self->output_header_filename
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
112 )->create_file();
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
113
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
114 return 1;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
115 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
116
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
117 no Moose;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
118 __PACKAGE__->meta->make_immutable;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
119
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
120 1;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
121