Mercurial > repos > dereeper > roary_plots
comparison Roary/lib/Bio/Roary/MergeMultifastaAlignments.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c47a5f61bc9f |
---|---|
1 package Bio::Roary::MergeMultifastaAlignments; | |
2 | |
3 # ABSTRACT: Merge multifasta alignment files with equal numbers of sequences. | |
4 | |
5 =head1 SYNOPSIS | |
6 | |
7 Merge multifasta alignment files with equal numbers of sequences.So each sequence in each file gets concatenated together. It is assumed the | |
8 sequences are in the correct order. | |
9 use Bio::Roary::MergeMultifastaAlignments; | |
10 | |
11 my $obj = Bio::Roary::MergeMultifastaAlignments->new( | |
12 multifasta_files => [], | |
13 output_filename => 'output_merged.aln' | |
14 ); | |
15 $obj->merge_files; | |
16 | |
17 =cut | |
18 | |
19 use Moose; | |
20 use Bio::SeqIO; | |
21 use Bio::Roary::Output::CoreGeneAlignmentCoordinatesEMBL; | |
22 | |
23 has 'multifasta_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); | |
24 has 'sample_names' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); | |
25 has 'sample_names_to_genes' => ( is => 'rw', isa => 'HashRef', required => 1 ); | |
26 has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'core_alignment.aln' ); | |
27 has 'output_header_filename' => ( is => 'ro', isa => 'Str', default => 'core_alignment_header.embl' ); | |
28 has '_output_seqio_obj' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__output_seqio_obj' ); | |
29 has '_gene_lengths' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build__gene_lengths' ); | |
30 has '_gene_to_sequence' => ( is => 'rw', isa => 'HashRef', default => sub { {} } ); | |
31 has '_sorted_multifasta_files' => ( is => 'rw', isa => 'ArrayRef', lazy => 1, builder => '_build__sorted_multifasta_files' ); | |
32 | |
33 sub BUILD { | |
34 my ($self) = @_; | |
35 $self->_gene_lengths; | |
36 } | |
37 | |
38 sub _input_seq_io_obj { | |
39 my ( $self, $filename ) = @_; | |
40 return Bio::SeqIO->new( -file => $filename, -format => 'Fasta' ); | |
41 } | |
42 | |
43 sub _build__output_seqio_obj { | |
44 my ($self) = @_; | |
45 return Bio::SeqIO->new( -file => ">" . $self->output_filename, -format => 'Fasta' ); | |
46 } | |
47 | |
48 sub _build__gene_lengths { | |
49 my ($self) = @_; | |
50 my %gene_lengths; | |
51 for my $filename ( @{ $self->_sorted_multifasta_files } ) { | |
52 my $seq_io = $self->_input_seq_io_obj($filename); | |
53 next unless ( defined($seq_io) ); | |
54 while ( my $seq_record = $seq_io->next_seq ) { | |
55 | |
56 # Save all of the gene sequences to memory, massive speedup but a bit naughty. | |
57 $self->_gene_to_sequence->{$filename}->{ $seq_record->display_id } = $seq_record->seq; | |
58 $gene_lengths{$filename} = $seq_record->length() if ( !defined( $gene_lengths{$filename} ) ); | |
59 } | |
60 } | |
61 | |
62 return \%gene_lengths; | |
63 } | |
64 | |
65 sub _build__sorted_multifasta_files { | |
66 my ($self) = @_; | |
67 my @sorted_gene_files = sort @{ $self->multifasta_files }; | |
68 return \@sorted_gene_files; | |
69 } | |
70 | |
71 sub _sequence_for_sample_from_gene_file { | |
72 my ( $self, $sample_name, $gene_file ) = @_; | |
73 | |
74 # loop over this to get the geneIDs | |
75 for my $gene_id ( sort keys %{ $self->_gene_to_sequence->{$gene_file} } ) { | |
76 if ( defined( $self->sample_names_to_genes->{$sample_name}->{$gene_id} ) ) { | |
77 return $self->_gene_to_sequence->{$gene_file}->{$gene_id}; | |
78 } | |
79 } | |
80 return $self->_padded_string_for_gene_file($gene_file); | |
81 } | |
82 | |
83 sub _padded_string_for_gene_file { | |
84 my ( $self, $gene_file ) = @_; | |
85 return '' unless ( defined( $self->_gene_lengths->{$gene_file} ) ); | |
86 return '-' x ( $self->_gene_lengths->{$gene_file} ); | |
87 } | |
88 | |
89 sub _create_merged_sequence_for_sample { | |
90 my ( $self, $sample_name ) = @_; | |
91 my $merged_sequence = ''; | |
92 for my $gene_file ( @{ $self->_sorted_multifasta_files } ) { | |
93 $merged_sequence .= $self->_sequence_for_sample_from_gene_file( $sample_name, $gene_file ); | |
94 } | |
95 return $merged_sequence; | |
96 } | |
97 | |
98 sub merge_files { | |
99 my ($self) = @_; | |
100 | |
101 for my $sample_name ( @{ $self->sample_names } ) { | |
102 my $sequence = $self->_create_merged_sequence_for_sample($sample_name); | |
103 my $seq_io = Bio::Seq->new( -display_id => $sample_name, -seq => $sequence ); | |
104 $self->_output_seqio_obj->write_seq($seq_io); | |
105 } | |
106 | |
107 # Create a header file which gives the coordinates of each gene in the multifasta | |
108 Bio::Roary::Output::CoreGeneAlignmentCoordinatesEMBL->new( | |
109 multifasta_files => $self->_sorted_multifasta_files, | |
110 gene_lengths => $self->_gene_lengths, | |
111 output_filename => $self->output_header_filename | |
112 )->create_file(); | |
113 | |
114 return 1; | |
115 } | |
116 | |
117 no Moose; | |
118 __PACKAGE__->meta->make_immutable; | |
119 | |
120 1; | |
121 |