0
|
1 package Bio::Roary::ExtractProteomeFromGFF;
|
|
2
|
|
3 # ABSTRACT: Take in a GFF file and create protein sequences in FASTA format
|
|
4
|
|
5 =head1 SYNOPSIS
|
|
6
|
|
7 Take in GFF files and create protein sequences in FASTA format
|
|
8 use Bio::Roary::ExtractProteomeFromGFF;
|
|
9
|
|
10 my $obj = Bio::Roary::ExtractProteomeFromGFF->new(
|
|
11 gff_file => $fasta_file,
|
|
12 );
|
|
13 $obj->fasta_file();
|
|
14
|
|
15 =cut
|
|
16
|
|
17 use Moose;
|
|
18 use Bio::SeqIO;
|
|
19 use Cwd;
|
|
20 use Bio::Roary::Exceptions;
|
|
21 use File::Basename;
|
|
22 use File::Temp;
|
|
23 use File::Copy;
|
|
24 use Bio::Tools::GFF;
|
|
25 with 'Bio::Roary::JobRunner::Role';
|
|
26 with 'Bio::Roary::BedFromGFFRole';
|
|
27
|
|
28 has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 );
|
|
29 has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 );
|
|
30 has 'maximum_percentage_of_unknowns' => ( is => 'ro', isa => 'Num', default => 5 );
|
|
31 has 'output_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_filename' );
|
|
32 has 'fasta_file' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_fasta_file' );
|
|
33 has '_working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
|
|
34 has '_working_directory_name' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__working_directory_name' );
|
|
35 has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
|
|
36
|
|
37 sub _build_fasta_file {
|
|
38 my ($self) = @_;
|
|
39 $self->_extract_nucleotide_regions;
|
|
40 $self->_convert_nucleotide_to_protein;
|
|
41 $self->_cleanup_fasta;
|
|
42 $self->_cleanup_intermediate_files;
|
|
43 $self->_filter_fasta_sequences( join('/',($self->output_directory,$self->output_filename)) );
|
|
44 return join('/',($self->output_directory,$self->output_filename));
|
|
45 }
|
|
46
|
|
47 sub _build__working_directory_name {
|
|
48 my ($self) = @_;
|
|
49 return $self->_working_directory->dirname();
|
|
50 }
|
|
51
|
|
52 sub _build_output_filename {
|
|
53 my ($self) = @_;
|
|
54 my ( $filename, $directories, $suffix ) = fileparse( $self->gff_file, qr/\.[^.]*/ );
|
|
55 return join( '/', ( $self->_working_directory_name, $filename . '.faa' ) );
|
|
56 }
|
|
57
|
|
58
|
|
59
|
|
60 sub _cleanup_intermediate_files {
|
|
61 my ($self) = @_;
|
|
62 unlink( $self->_unfiltered_output_filename );
|
|
63 unlink( $self->_fastatranslate_filename );
|
|
64 }
|
|
65
|
|
66 sub _nucleotide_fasta_file_from_gff_filename {
|
|
67 my ($self) = @_;
|
|
68 return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'intermediate.fa' ) )));
|
|
69 }
|
|
70
|
|
71 sub _extracted_nucleotide_fasta_file_from_bed_filename {
|
|
72 my ($self) = @_;
|
|
73 return join('/',($self->output_directory,join( '.', ( $self->output_filename,'intermediate.extracted.fa' ) )));
|
|
74 }
|
|
75
|
|
76 sub _unfiltered_output_filename {
|
|
77 my $self = shift;
|
|
78 return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'unfiltered.fa' ) )));
|
|
79 }
|
|
80
|
|
81
|
|
82 sub _create_nucleotide_fasta_file_from_gff {
|
|
83 my ($self) = @_;
|
|
84
|
|
85 open(my $input_fh, $self->gff_file);
|
|
86 open(my $output_fh, '>', $self->_nucleotide_fasta_file_from_gff_filename);
|
|
87 my $at_sequence = 0;
|
|
88 while(<$input_fh>)
|
|
89 {
|
|
90 my $line = $_;
|
|
91 if($line =~/^>/)
|
|
92 {
|
|
93 $at_sequence = 1;
|
|
94 }
|
|
95
|
|
96 if($at_sequence == 1)
|
|
97 {
|
|
98 print {$output_fh} $line;
|
|
99 }
|
|
100 }
|
|
101 close($input_fh);
|
|
102 close($output_fh);
|
|
103 }
|
|
104
|
|
105 sub _extract_nucleotide_regions {
|
|
106 my ($self) = @_;
|
|
107
|
|
108 $self->_create_nucleotide_fasta_file_from_gff;
|
|
109 $self->_create_bed_file_from_gff;
|
|
110
|
|
111 my $cmd =
|
|
112 'bedtools getfasta -s -fi '
|
|
113 . $self->_nucleotide_fasta_file_from_gff_filename
|
|
114 . ' -bed '
|
|
115 . $self->_bed_output_filename . ' -fo '
|
|
116 . $self->_extracted_nucleotide_fasta_file_from_bed_filename
|
|
117 . ' -name > /dev/null 2>&1';
|
|
118
|
|
119 $self->logger->debug($cmd);
|
|
120 system($cmd);
|
|
121 unlink( $self->_nucleotide_fasta_file_from_gff_filename );
|
|
122 unlink( $self->_bed_output_filename );
|
|
123 unlink( $self->_nucleotide_fasta_file_from_gff_filename . '.fai' );
|
|
124 }
|
|
125
|
|
126 sub _cleanup_fasta {
|
|
127 my $self = shift;
|
|
128 my $infile = $self->_unfiltered_output_filename;
|
|
129 my $outfile = join('/',($self->output_directory,$self->output_filename));
|
|
130 return unless ( -e $infile );
|
|
131
|
|
132 open( my $in, '<', $infile );
|
|
133 open( my $out, '>', $outfile );
|
|
134 while ( my $line = <$in> ) {
|
|
135 chomp $line;
|
|
136 if ( $line =~ /^>/ )
|
|
137 {
|
|
138 $line =~ s/"//g;
|
|
139 # newer versions of Bedtools add (-) or (+) to the end of the sequence name, remove them
|
|
140 $line =~ s!\([-+]\)!!;
|
|
141 }
|
|
142
|
|
143 if($line =~ /^(>[^:]+)/)
|
|
144 {
|
|
145 $line = $1;
|
|
146 }
|
|
147 print $out "$line\n";
|
|
148 }
|
|
149 close $in;
|
|
150 close $out;
|
|
151 }
|
|
152
|
|
153 sub _fastatranslate_filename {
|
|
154 my ($self) = @_;
|
|
155 return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'intermediate.translate.fa' ) )));
|
|
156 }
|
|
157
|
|
158 sub _fastatranslate {
|
|
159 my ( $self, $inputfile, $outputfile ) = @_;
|
|
160
|
|
161 my $input_fasta_file_obj = Bio::SeqIO->new( -file => $inputfile, -format => 'Fasta' );
|
|
162 my $output_protein_file_obj = Bio::SeqIO->new( -file => ">" . $outputfile, -format => 'Fasta', -alphabet => 'protein' );
|
|
163
|
|
164 my %protein_sequence_objs;
|
|
165 while ( my $seq = $input_fasta_file_obj->next_seq ) {
|
|
166 $seq->desc(undef);
|
|
167 my $protseq = $seq->translate( -codontable_id => $self->translation_table );
|
|
168 $output_protein_file_obj->write_seq($protseq);
|
|
169 }
|
|
170 return 1;
|
|
171 }
|
|
172
|
|
173 sub _convert_nucleotide_to_protein {
|
|
174 my ($self) = @_;
|
|
175 $self->_fastatranslate( $self->_extracted_nucleotide_fasta_file_from_bed_filename, $self->_unfiltered_output_filename );
|
|
176 unlink( $self->_extracted_nucleotide_fasta_file_from_bed_filename );
|
|
177 }
|
|
178
|
|
179 sub _does_sequence_contain_too_many_unknowns {
|
|
180 my ( $self, $sequence_obj ) = @_;
|
|
181 my $maximum_number_of_Xs = int( ( $sequence_obj->length() * $self->maximum_percentage_of_unknowns ) / 100 );
|
|
182 my $number_of_Xs_found = () = $sequence_obj->seq() =~ /X/g;
|
|
183 if ( $number_of_Xs_found > $maximum_number_of_Xs ) {
|
|
184 return 1;
|
|
185 }
|
|
186 else {
|
|
187 return 0;
|
|
188 }
|
|
189 }
|
|
190
|
|
191 sub _filter_fasta_sequences {
|
|
192 my ( $self, $filename ) = @_;
|
|
193 my $temp_output_file = $filename . '.tmp.filtered.fa';
|
|
194 my $out_fasta_obj = Bio::SeqIO->new( -file => ">" . $temp_output_file, -format => 'Fasta' );
|
|
195 my $fasta_obj = Bio::SeqIO->new( -file => $filename, -format => 'Fasta' );
|
|
196
|
|
197 my $sequence_found = 0;
|
|
198
|
|
199 while ( my $seq = $fasta_obj->next_seq() ) {
|
|
200 if ( $self->_does_sequence_contain_too_many_unknowns($seq) ) {
|
|
201 next;
|
|
202 }
|
|
203 $seq->desc(undef);
|
|
204 $out_fasta_obj->write_seq($seq);
|
|
205 $sequence_found = 1;
|
|
206 }
|
|
207
|
|
208 if ( $sequence_found == 0 ) {
|
|
209 $self->logger->error( "Could not extract any protein sequences from "
|
|
210 . $self->gff_file
|
|
211 . ". Does the file contain the assembly as well as the annotation?" );
|
|
212 }
|
|
213
|
|
214 # Replace the original file.
|
|
215 move( $temp_output_file, $filename );
|
|
216 return 1;
|
|
217 }
|
|
218
|
|
219 no Moose;
|
|
220 __PACKAGE__->meta->make_immutable;
|
|
221
|
|
222 1;
|
|
223
|