annotate Roary/lib/Bio/Roary/Output/GroupsMultifastaNucleotide.pm @ 0:c47a5f61bc9f draft

Uploaded
author dereeper
date Fri, 14 May 2021 20:27:06 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
1 package Bio::Roary::Output::GroupsMultifastaNucleotide;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
2
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
3 # ABSTRACT: Take in a GFF files and a groups file and output one multifasta file per group with nucleotide sequences.
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
4
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
5 =head1 SYNOPSIS
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
6
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
7 Take in a GFF files and a groups file and output one multifasta file per group with nucleotide sequences.
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
8 use Bio::Roary::Output::GroupsMultifastas;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
9
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
10 my $obj = Bio::Roary::Output::GroupsMultifastasNucleotide->new(
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
11 group_names => ['aaa','bbb'],
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
12 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
13 $obj->populate_files();
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
14
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
15 =cut
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
16
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
17 use Moose;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
18 use Bio::SeqIO;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
19 use File::Path qw(make_path);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
20 use File::Basename;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
21 use File::Copy;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
22 use File::Temp qw/ tempfile /;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
23 use Bio::Roary::Exceptions;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
24 use Bio::Roary::AnalyseGroups;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
25 use Bio::Tools::GFF;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
26 with 'Bio::Roary::BedFromGFFRole';
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
27
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
28 has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
29 has 'group_names' => ( is => 'ro', isa => 'ArrayRef', required => 0 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
30 has 'output_directory' => ( is => 'ro', isa => 'Str', required => 1 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
31 has 'pan_reference_groups_seen' => ( is => 'rw', isa => 'HashRef', required => 1 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
32 has 'number_of_gff_files' => ( is => 'ro', isa => 'Int', required => 1 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
33 has 'pan_reference_filename' => ( is => 'ro', isa => 'Str',default => 'pan_genome_reference.fa' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
34 has 'dont_delete_files' => ( is => 'ro', isa => 'Bool',default => 0 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
35 has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1.0 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
36
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
37 has 'annotate_groups' => ( is => 'ro', isa => 'Bio::Roary::AnnotateGroups', required => 1 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
38 has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
39
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
40 has 'fasta_file' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_fasta_file' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
41 has '_input_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__input_seqio' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
42
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
43 has 'output_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_filename' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
44
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
45 sub _build_output_filename
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
46 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
47 my ($self) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
48 my ( $filename, $directories, $suffix ) = fileparse($self->gff_file);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
49 return join('/',($self->output_directory, $filename.'.tmp_nuc_sequences.fa' ));
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
50 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
51
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
52 sub _build__input_seqio {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
53 my ($self) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
54 return Bio::SeqIO->new( -file => $self->fasta_file, -format => 'Fasta' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
55 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
56
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
57 sub _bed_output_filename {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
58 my ($self) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
59 return join( '.', ( $self->output_filename, 'intermediate.bed' ) );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
60 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
61
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
62 sub populate_files {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
63 my ($self) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
64 while ( my $input_seq = $self->_input_seqio->next_seq() )
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
65 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
66 if ( $self->annotate_groups->_ids_to_groups->{$input_seq->display_id} )
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
67 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
68 my $current_group = $self->annotate_groups->_ids_to_groups->{$input_seq->display_id};
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
69 my $gene_name = $self->annotate_groups->_groups_to_consensus_gene_names->{$current_group};
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
70
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
71 if(! defined($self->pan_reference_groups_seen->{$current_group}))
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
72 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
73 my $pan_output_seq = $self->_pan_genome_reference_io_obj($current_group);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
74 $pan_output_seq->write_seq(Bio::Seq->new( -display_id => $input_seq->display_id, -desc => ($gene_name ? $gene_name : $current_group), -seq => $input_seq->seq ) );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
75 $self->pan_reference_groups_seen->{$current_group} = 1;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
76 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
77
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
78 my $number_of_genes = @{$self->annotate_groups->_groups_to_id_names->{$current_group}};
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
79 # Theres no need to align noncore files
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
80 next if($self->dont_delete_files == 0 && $number_of_genes < ($self->core_definition * $self->number_of_gff_files ));
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
81
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
82 my $output_seq = $self->_group_seq_io_obj($current_group,$number_of_genes);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
83 $output_seq->write_seq($input_seq);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
84 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
85 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
86
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
87 unlink($self->fasta_file);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
88 1;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
89 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
90
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
91 sub _group_file_name
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
92 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
93 my ($self,$group_name,$num_group_genes) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
94 my $annotated_group_name = $self->annotate_groups->_groups_to_consensus_gene_names->{$group_name};
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
95 $annotated_group_name =~ s!\W!_!gi;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
96 my $filename = $annotated_group_name.'.fa';
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
97 my $group_file_name = join('/',($self->output_directory, $filename ));
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
98 return $group_file_name;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
99 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
100
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
101
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
102 sub _pan_genome_reference_io_obj
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
103 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
104 my ($self) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
105 return Bio::SeqIO->new( -file => ">>".$self->pan_reference_filename, -format => 'Fasta' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
106 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
107
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
108
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
109 sub _group_seq_io_obj
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
110 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
111 my ($self,$group_name,$num_group_genes) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
112 my $filename = $self->_group_file_name($group_name,$num_group_genes);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
113 return Bio::SeqIO->new( -file => ">>".$filename, -format => 'Fasta' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
114 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
115
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
116
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
117 sub _extracted_nucleotide_fasta_file_from_bed_filename {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
118 my ($self) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
119 return join( '.', ( $self->output_filename, 'intermediate.extracted.fa' ) );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
120 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
121
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
122 sub _create_nucleotide_fasta_file_from_gff {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
123 my ($self) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
124
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
125 open(my $input_fh, $self->gff_file);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
126 open(my $output_fh, '>', $self->_nucleotide_fasta_file_from_gff_filename);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
127 my $at_sequence = 0;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
128 while(<$input_fh>)
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
129 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
130 my $line = $_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
131 if($line =~/^>/)
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
132 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
133 $at_sequence = 1;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
134 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
135
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
136 if($at_sequence == 1)
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
137 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
138 print {$output_fh} $line;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
139 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
140 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
141 close($input_fh);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
142 close($output_fh);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
143 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
144
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
145 sub _nucleotide_fasta_file_from_gff_filename {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
146 my ($self) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
147 return join( '.', ( $self->output_filename, 'intermediate.fa' ) );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
148 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
149
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
150 sub _extract_nucleotide_regions {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
151 my ($self) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
152
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
153 $self->_create_nucleotide_fasta_file_from_gff;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
154 $self->_create_bed_file_from_gff;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
155
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
156 my $cmd =
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
157 'bedtools getfasta -s -fi '
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
158 . $self->_nucleotide_fasta_file_from_gff_filename
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
159 . ' -bed '
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
160 . $self->_bed_output_filename . ' -fo '
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
161 . $self->_extracted_nucleotide_fasta_file_from_bed_filename
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
162 . ' -name > /dev/null 2>&1';
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
163 system($cmd);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
164 unlink( $self->_nucleotide_fasta_file_from_gff_filename );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
165 unlink( $self->_bed_output_filename );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
166 unlink( $self->_nucleotide_fasta_file_from_gff_filename . '.fai' );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
167 return $self->_extracted_nucleotide_fasta_file_from_bed_filename;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
168 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
169
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
170 sub _cleanup_fasta {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
171 my ($self,$infile) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
172
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
173 my($fh, $outfile) = tempfile();
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
174 return unless ( -e $infile );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
175
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
176 open( my $in, '<', $infile );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
177 open( my $out, '>', $outfile );
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
178 while ( my $line = <$in> ) {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
179 chomp $line;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
180 if ( $line =~ /^>/ )
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
181 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
182 $line =~ s/"//g ;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
183 # newer versions of Bedtools add (-) or (+) to the end of the sequence name, remove them
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
184 $line =~ s!\([-+]\)!!;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
185 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
186
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
187 if($line =~ /^(>[^:]+)/)
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
188 {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
189 $line = $1;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
190 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
191 print $out "$line\n";
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
192 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
193 close $in;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
194 close $out;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
195
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
196 move( $outfile, $infile);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
197 return $infile;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
198 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
199
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
200
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
201 sub _build_fasta_file {
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
202 my ($self) = @_;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
203 my $fasta_filename = $self->_extract_nucleotide_regions;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
204 return $self->_cleanup_fasta($fasta_filename);
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
205 }
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
206
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
207 no Moose;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
208 __PACKAGE__->meta->make_immutable;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
209
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
210 1;
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
211