0
|
1 package Bio::Roary::Output::GroupsMultifastaNucleotide;
|
|
2
|
|
3 # ABSTRACT: Take in a GFF files and a groups file and output one multifasta file per group with nucleotide sequences.
|
|
4
|
|
5 =head1 SYNOPSIS
|
|
6
|
|
7 Take in a GFF files and a groups file and output one multifasta file per group with nucleotide sequences.
|
|
8 use Bio::Roary::Output::GroupsMultifastas;
|
|
9
|
|
10 my $obj = Bio::Roary::Output::GroupsMultifastasNucleotide->new(
|
|
11 group_names => ['aaa','bbb'],
|
|
12 );
|
|
13 $obj->populate_files();
|
|
14
|
|
15 =cut
|
|
16
|
|
17 use Moose;
|
|
18 use Bio::SeqIO;
|
|
19 use File::Path qw(make_path);
|
|
20 use File::Basename;
|
|
21 use File::Copy;
|
|
22 use File::Temp qw/ tempfile /;
|
|
23 use Bio::Roary::Exceptions;
|
|
24 use Bio::Roary::AnalyseGroups;
|
|
25 use Bio::Tools::GFF;
|
|
26 with 'Bio::Roary::BedFromGFFRole';
|
|
27
|
|
28 has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 );
|
|
29 has 'group_names' => ( is => 'ro', isa => 'ArrayRef', required => 0 );
|
|
30 has 'output_directory' => ( is => 'ro', isa => 'Str', required => 1 );
|
|
31 has 'pan_reference_groups_seen' => ( is => 'rw', isa => 'HashRef', required => 1 );
|
|
32 has 'number_of_gff_files' => ( is => 'ro', isa => 'Int', required => 1 );
|
|
33 has 'pan_reference_filename' => ( is => 'ro', isa => 'Str',default => 'pan_genome_reference.fa' );
|
|
34 has 'dont_delete_files' => ( is => 'ro', isa => 'Bool',default => 0 );
|
|
35 has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1.0 );
|
|
36
|
|
37 has 'annotate_groups' => ( is => 'ro', isa => 'Bio::Roary::AnnotateGroups', required => 1 );
|
|
38 has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 );
|
|
39
|
|
40 has 'fasta_file' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_fasta_file' );
|
|
41 has '_input_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__input_seqio' );
|
|
42
|
|
43 has 'output_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_filename' );
|
|
44
|
|
45 sub _build_output_filename
|
|
46 {
|
|
47 my ($self) = @_;
|
|
48 my ( $filename, $directories, $suffix ) = fileparse($self->gff_file);
|
|
49 return join('/',($self->output_directory, $filename.'.tmp_nuc_sequences.fa' ));
|
|
50 }
|
|
51
|
|
52 sub _build__input_seqio {
|
|
53 my ($self) = @_;
|
|
54 return Bio::SeqIO->new( -file => $self->fasta_file, -format => 'Fasta' );
|
|
55 }
|
|
56
|
|
57 sub _bed_output_filename {
|
|
58 my ($self) = @_;
|
|
59 return join( '.', ( $self->output_filename, 'intermediate.bed' ) );
|
|
60 }
|
|
61
|
|
62 sub populate_files {
|
|
63 my ($self) = @_;
|
|
64 while ( my $input_seq = $self->_input_seqio->next_seq() )
|
|
65 {
|
|
66 if ( $self->annotate_groups->_ids_to_groups->{$input_seq->display_id} )
|
|
67 {
|
|
68 my $current_group = $self->annotate_groups->_ids_to_groups->{$input_seq->display_id};
|
|
69 my $gene_name = $self->annotate_groups->_groups_to_consensus_gene_names->{$current_group};
|
|
70
|
|
71 if(! defined($self->pan_reference_groups_seen->{$current_group}))
|
|
72 {
|
|
73 my $pan_output_seq = $self->_pan_genome_reference_io_obj($current_group);
|
|
74 $pan_output_seq->write_seq(Bio::Seq->new( -display_id => $input_seq->display_id, -desc => ($gene_name ? $gene_name : $current_group), -seq => $input_seq->seq ) );
|
|
75 $self->pan_reference_groups_seen->{$current_group} = 1;
|
|
76 }
|
|
77
|
|
78 my $number_of_genes = @{$self->annotate_groups->_groups_to_id_names->{$current_group}};
|
|
79 # Theres no need to align noncore files
|
|
80 next if($self->dont_delete_files == 0 && $number_of_genes < ($self->core_definition * $self->number_of_gff_files ));
|
|
81
|
|
82 my $output_seq = $self->_group_seq_io_obj($current_group,$number_of_genes);
|
|
83 $output_seq->write_seq($input_seq);
|
|
84 }
|
|
85 }
|
|
86
|
|
87 unlink($self->fasta_file);
|
|
88 1;
|
|
89 }
|
|
90
|
|
91 sub _group_file_name
|
|
92 {
|
|
93 my ($self,$group_name,$num_group_genes) = @_;
|
|
94 my $annotated_group_name = $self->annotate_groups->_groups_to_consensus_gene_names->{$group_name};
|
|
95 $annotated_group_name =~ s!\W!_!gi;
|
|
96 my $filename = $annotated_group_name.'.fa';
|
|
97 my $group_file_name = join('/',($self->output_directory, $filename ));
|
|
98 return $group_file_name;
|
|
99 }
|
|
100
|
|
101
|
|
102 sub _pan_genome_reference_io_obj
|
|
103 {
|
|
104 my ($self) = @_;
|
|
105 return Bio::SeqIO->new( -file => ">>".$self->pan_reference_filename, -format => 'Fasta' );
|
|
106 }
|
|
107
|
|
108
|
|
109 sub _group_seq_io_obj
|
|
110 {
|
|
111 my ($self,$group_name,$num_group_genes) = @_;
|
|
112 my $filename = $self->_group_file_name($group_name,$num_group_genes);
|
|
113 return Bio::SeqIO->new( -file => ">>".$filename, -format => 'Fasta' );
|
|
114 }
|
|
115
|
|
116
|
|
117 sub _extracted_nucleotide_fasta_file_from_bed_filename {
|
|
118 my ($self) = @_;
|
|
119 return join( '.', ( $self->output_filename, 'intermediate.extracted.fa' ) );
|
|
120 }
|
|
121
|
|
122 sub _create_nucleotide_fasta_file_from_gff {
|
|
123 my ($self) = @_;
|
|
124
|
|
125 open(my $input_fh, $self->gff_file);
|
|
126 open(my $output_fh, '>', $self->_nucleotide_fasta_file_from_gff_filename);
|
|
127 my $at_sequence = 0;
|
|
128 while(<$input_fh>)
|
|
129 {
|
|
130 my $line = $_;
|
|
131 if($line =~/^>/)
|
|
132 {
|
|
133 $at_sequence = 1;
|
|
134 }
|
|
135
|
|
136 if($at_sequence == 1)
|
|
137 {
|
|
138 print {$output_fh} $line;
|
|
139 }
|
|
140 }
|
|
141 close($input_fh);
|
|
142 close($output_fh);
|
|
143 }
|
|
144
|
|
145 sub _nucleotide_fasta_file_from_gff_filename {
|
|
146 my ($self) = @_;
|
|
147 return join( '.', ( $self->output_filename, 'intermediate.fa' ) );
|
|
148 }
|
|
149
|
|
150 sub _extract_nucleotide_regions {
|
|
151 my ($self) = @_;
|
|
152
|
|
153 $self->_create_nucleotide_fasta_file_from_gff;
|
|
154 $self->_create_bed_file_from_gff;
|
|
155
|
|
156 my $cmd =
|
|
157 'bedtools getfasta -s -fi '
|
|
158 . $self->_nucleotide_fasta_file_from_gff_filename
|
|
159 . ' -bed '
|
|
160 . $self->_bed_output_filename . ' -fo '
|
|
161 . $self->_extracted_nucleotide_fasta_file_from_bed_filename
|
|
162 . ' -name > /dev/null 2>&1';
|
|
163 system($cmd);
|
|
164 unlink( $self->_nucleotide_fasta_file_from_gff_filename );
|
|
165 unlink( $self->_bed_output_filename );
|
|
166 unlink( $self->_nucleotide_fasta_file_from_gff_filename . '.fai' );
|
|
167 return $self->_extracted_nucleotide_fasta_file_from_bed_filename;
|
|
168 }
|
|
169
|
|
170 sub _cleanup_fasta {
|
|
171 my ($self,$infile) = @_;
|
|
172
|
|
173 my($fh, $outfile) = tempfile();
|
|
174 return unless ( -e $infile );
|
|
175
|
|
176 open( my $in, '<', $infile );
|
|
177 open( my $out, '>', $outfile );
|
|
178 while ( my $line = <$in> ) {
|
|
179 chomp $line;
|
|
180 if ( $line =~ /^>/ )
|
|
181 {
|
|
182 $line =~ s/"//g ;
|
|
183 # newer versions of Bedtools add (-) or (+) to the end of the sequence name, remove them
|
|
184 $line =~ s!\([-+]\)!!;
|
|
185 }
|
|
186
|
|
187 if($line =~ /^(>[^:]+)/)
|
|
188 {
|
|
189 $line = $1;
|
|
190 }
|
|
191 print $out "$line\n";
|
|
192 }
|
|
193 close $in;
|
|
194 close $out;
|
|
195
|
|
196 move( $outfile, $infile);
|
|
197 return $infile;
|
|
198 }
|
|
199
|
|
200
|
|
201 sub _build_fasta_file {
|
|
202 my ($self) = @_;
|
|
203 my $fasta_filename = $self->_extract_nucleotide_regions;
|
|
204 return $self->_cleanup_fasta($fasta_filename);
|
|
205 }
|
|
206
|
|
207 no Moose;
|
|
208 __PACKAGE__->meta->make_immutable;
|
|
209
|
|
210 1;
|
|
211
|