Mercurial > repos > dereeper > roary_plots
diff Roary/lib/Bio/Roary/ExtractProteomeFromGFF.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Roary/lib/Bio/Roary/ExtractProteomeFromGFF.pm Fri May 14 20:27:06 2021 +0000 @@ -0,0 +1,223 @@ +package Bio::Roary::ExtractProteomeFromGFF; + +# ABSTRACT: Take in a GFF file and create protein sequences in FASTA format + +=head1 SYNOPSIS + +Take in GFF files and create protein sequences in FASTA format + use Bio::Roary::ExtractProteomeFromGFF; + + my $obj = Bio::Roary::ExtractProteomeFromGFF->new( + gff_file => $fasta_file, + ); + $obj->fasta_file(); + +=cut + +use Moose; +use Bio::SeqIO; +use Cwd; +use Bio::Roary::Exceptions; +use File::Basename; +use File::Temp; +use File::Copy; +use Bio::Tools::GFF; +with 'Bio::Roary::JobRunner::Role'; +with 'Bio::Roary::BedFromGFFRole'; + +has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 ); +has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 ); +has 'maximum_percentage_of_unknowns' => ( is => 'ro', isa => 'Num', default => 5 ); +has 'output_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_filename' ); +has 'fasta_file' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_fasta_file' ); +has '_working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } ); +has '_working_directory_name' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__working_directory_name' ); +has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 ); + +sub _build_fasta_file { + my ($self) = @_; + $self->_extract_nucleotide_regions; + $self->_convert_nucleotide_to_protein; + $self->_cleanup_fasta; + $self->_cleanup_intermediate_files; + $self->_filter_fasta_sequences( join('/',($self->output_directory,$self->output_filename)) ); + return join('/',($self->output_directory,$self->output_filename)); +} + +sub _build__working_directory_name { + my ($self) = @_; + return $self->_working_directory->dirname(); +} + +sub _build_output_filename { + my ($self) = @_; + my ( $filename, $directories, $suffix ) = fileparse( $self->gff_file, qr/\.[^.]*/ ); + return join( '/', ( $self->_working_directory_name, $filename . '.faa' ) ); +} + + + +sub _cleanup_intermediate_files { + my ($self) = @_; + unlink( $self->_unfiltered_output_filename ); + unlink( $self->_fastatranslate_filename ); +} + +sub _nucleotide_fasta_file_from_gff_filename { + my ($self) = @_; + return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'intermediate.fa' ) ))); +} + +sub _extracted_nucleotide_fasta_file_from_bed_filename { + my ($self) = @_; + return join('/',($self->output_directory,join( '.', ( $self->output_filename,'intermediate.extracted.fa' ) ))); +} + +sub _unfiltered_output_filename { + my $self = shift; + return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'unfiltered.fa' ) ))); +} + + +sub _create_nucleotide_fasta_file_from_gff { + my ($self) = @_; + + open(my $input_fh, $self->gff_file); + open(my $output_fh, '>', $self->_nucleotide_fasta_file_from_gff_filename); + my $at_sequence = 0; + while(<$input_fh>) + { + my $line = $_; + if($line =~/^>/) + { + $at_sequence = 1; + } + + if($at_sequence == 1) + { + print {$output_fh} $line; + } + } + close($input_fh); + close($output_fh); +} + +sub _extract_nucleotide_regions { + my ($self) = @_; + + $self->_create_nucleotide_fasta_file_from_gff; + $self->_create_bed_file_from_gff; + + my $cmd = + 'bedtools getfasta -s -fi ' + . $self->_nucleotide_fasta_file_from_gff_filename + . ' -bed ' + . $self->_bed_output_filename . ' -fo ' + . $self->_extracted_nucleotide_fasta_file_from_bed_filename + . ' -name > /dev/null 2>&1'; + + $self->logger->debug($cmd); + system($cmd); + unlink( $self->_nucleotide_fasta_file_from_gff_filename ); + unlink( $self->_bed_output_filename ); + unlink( $self->_nucleotide_fasta_file_from_gff_filename . '.fai' ); +} + +sub _cleanup_fasta { + my $self = shift; + my $infile = $self->_unfiltered_output_filename; + my $outfile = join('/',($self->output_directory,$self->output_filename)); + return unless ( -e $infile ); + + open( my $in, '<', $infile ); + open( my $out, '>', $outfile ); + while ( my $line = <$in> ) { + chomp $line; + if ( $line =~ /^>/ ) + { + $line =~ s/"//g; + # newer versions of Bedtools add (-) or (+) to the end of the sequence name, remove them + $line =~ s!\([-+]\)!!; + } + + if($line =~ /^(>[^:]+)/) + { + $line = $1; + } + print $out "$line\n"; + } + close $in; + close $out; +} + +sub _fastatranslate_filename { + my ($self) = @_; + return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'intermediate.translate.fa' ) ))); +} + +sub _fastatranslate { + my ( $self, $inputfile, $outputfile ) = @_; + + my $input_fasta_file_obj = Bio::SeqIO->new( -file => $inputfile, -format => 'Fasta' ); + my $output_protein_file_obj = Bio::SeqIO->new( -file => ">" . $outputfile, -format => 'Fasta', -alphabet => 'protein' ); + + my %protein_sequence_objs; + while ( my $seq = $input_fasta_file_obj->next_seq ) { + $seq->desc(undef); + my $protseq = $seq->translate( -codontable_id => $self->translation_table ); + $output_protein_file_obj->write_seq($protseq); + } + return 1; +} + +sub _convert_nucleotide_to_protein { + my ($self) = @_; + $self->_fastatranslate( $self->_extracted_nucleotide_fasta_file_from_bed_filename, $self->_unfiltered_output_filename ); + unlink( $self->_extracted_nucleotide_fasta_file_from_bed_filename ); +} + +sub _does_sequence_contain_too_many_unknowns { + my ( $self, $sequence_obj ) = @_; + my $maximum_number_of_Xs = int( ( $sequence_obj->length() * $self->maximum_percentage_of_unknowns ) / 100 ); + my $number_of_Xs_found = () = $sequence_obj->seq() =~ /X/g; + if ( $number_of_Xs_found > $maximum_number_of_Xs ) { + return 1; + } + else { + return 0; + } +} + +sub _filter_fasta_sequences { + my ( $self, $filename ) = @_; + my $temp_output_file = $filename . '.tmp.filtered.fa'; + my $out_fasta_obj = Bio::SeqIO->new( -file => ">" . $temp_output_file, -format => 'Fasta' ); + my $fasta_obj = Bio::SeqIO->new( -file => $filename, -format => 'Fasta' ); + + my $sequence_found = 0; + + while ( my $seq = $fasta_obj->next_seq() ) { + if ( $self->_does_sequence_contain_too_many_unknowns($seq) ) { + next; + } + $seq->desc(undef); + $out_fasta_obj->write_seq($seq); + $sequence_found = 1; + } + + if ( $sequence_found == 0 ) { + $self->logger->error( "Could not extract any protein sequences from " + . $self->gff_file + . ". Does the file contain the assembly as well as the annotation?" ); + } + + # Replace the original file. + move( $temp_output_file, $filename ); + return 1; +} + +no Moose; +__PACKAGE__->meta->make_immutable; + +1; +