view Roary/lib/Bio/Roary/ExtractProteomeFromGFF.pm @ 0:c47a5f61bc9f draft

Uploaded
author dereeper
date Fri, 14 May 2021 20:27:06 +0000
parents
children
line wrap: on
line source

package Bio::Roary::ExtractProteomeFromGFF;

# ABSTRACT: Take in a GFF file and create protein sequences in FASTA format

=head1 SYNOPSIS

Take in GFF files and create protein sequences in FASTA format
   use Bio::Roary::ExtractProteomeFromGFF;
   
   my $obj = Bio::Roary::ExtractProteomeFromGFF->new(
       gff_file        => $fasta_file,
     );
   $obj->fasta_file();

=cut

use Moose;
use Bio::SeqIO;
use Cwd;
use Bio::Roary::Exceptions;
use File::Basename;
use File::Temp;
use File::Copy;
use Bio::Tools::GFF;
with 'Bio::Roary::JobRunner::Role';
with 'Bio::Roary::BedFromGFFRole';

has 'gff_file'                       => ( is => 'ro', isa => 'Str',  required => 1 );
has 'apply_unknowns_filter'          => ( is => 'rw', isa => 'Bool', default  => 1 );
has 'maximum_percentage_of_unknowns' => ( is => 'ro', isa => 'Num',  default  => 5 );
has 'output_filename'                => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_filename' );
has 'fasta_file'                     => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_fasta_file' );
has '_working_directory'             => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
has '_working_directory_name'        => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__working_directory_name' );
has 'translation_table'              => ( is => 'rw', isa => 'Int', default => 11 );

sub _build_fasta_file {
    my ($self) = @_;
    $self->_extract_nucleotide_regions;
    $self->_convert_nucleotide_to_protein;
    $self->_cleanup_fasta;
    $self->_cleanup_intermediate_files;
    $self->_filter_fasta_sequences( join('/',($self->output_directory,$self->output_filename)) );
    return join('/',($self->output_directory,$self->output_filename));
}

sub _build__working_directory_name {
    my ($self) = @_;
    return $self->_working_directory->dirname();
}

sub _build_output_filename {
    my ($self) = @_;
    my ( $filename, $directories, $suffix ) = fileparse( $self->gff_file, qr/\.[^.]*/ );
    return join( '/', ( $self->_working_directory_name, $filename . '.faa' ) );
}



sub _cleanup_intermediate_files {
    my ($self) = @_;
    unlink( $self->_unfiltered_output_filename );
    unlink( $self->_fastatranslate_filename );
}

sub _nucleotide_fasta_file_from_gff_filename {
    my ($self) = @_;
    return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'intermediate.fa' ) )));
}

sub _extracted_nucleotide_fasta_file_from_bed_filename {
    my ($self) = @_;
    return join('/',($self->output_directory,join( '.', ( $self->output_filename,'intermediate.extracted.fa' ) )));
}

sub _unfiltered_output_filename {
    my $self = shift;
    return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'unfiltered.fa' ) )));
}


sub _create_nucleotide_fasta_file_from_gff {
    my ($self) = @_;
    
    open(my $input_fh, $self->gff_file);
    open(my $output_fh, '>', $self->_nucleotide_fasta_file_from_gff_filename);
    my $at_sequence = 0;
    while(<$input_fh>)
    {
	    my $line = $_;
	    if($line =~/^>/)
	    {
	    	$at_sequence = 1;
	    }
	    
	    if($at_sequence == 1)
	    {
		    print {$output_fh} $line;
	    }
    }
    close($input_fh);
    close($output_fh);
}

sub _extract_nucleotide_regions {
    my ($self) = @_;

    $self->_create_nucleotide_fasta_file_from_gff;
    $self->_create_bed_file_from_gff;

    my $cmd =
        'bedtools getfasta -s -fi '
      . $self->_nucleotide_fasta_file_from_gff_filename
      . ' -bed '
      . $self->_bed_output_filename . ' -fo '
      . $self->_extracted_nucleotide_fasta_file_from_bed_filename
      . ' -name > /dev/null 2>&1';

    $self->logger->debug($cmd);
    system($cmd);
    unlink( $self->_nucleotide_fasta_file_from_gff_filename );
    unlink( $self->_bed_output_filename );
    unlink( $self->_nucleotide_fasta_file_from_gff_filename . '.fai' );
}

sub _cleanup_fasta {
    my $self    = shift;
    my $infile  = $self->_unfiltered_output_filename;
    my $outfile = join('/',($self->output_directory,$self->output_filename));
    return unless ( -e $infile );

    open( my $in,  '<', $infile );
    open( my $out, '>', $outfile );
    while ( my $line = <$in> ) {
       chomp $line;
       if ( $line =~ /^>/ )
	   {
	   	 $line =~ s/"//g;
		 # newer versions of Bedtools add (-) or (+) to the end of the sequence name, remove them
		 $line =~ s!\([-+]\)!!;
	   }
	
	if($line =~ /^(>[^:]+)/)
	{
		$line = $1;
	}
        print $out "$line\n";
    }
    close $in;
    close $out;
}

sub _fastatranslate_filename {
    my ($self) = @_;
    return join('/',($self->output_directory,join( '.', ( $self->output_filename, 'intermediate.translate.fa' ) )));
}

sub _fastatranslate {
    my ( $self, $inputfile, $outputfile ) = @_;

    my $input_fasta_file_obj = Bio::SeqIO->new( -file => $inputfile, -format => 'Fasta' );
    my $output_protein_file_obj = Bio::SeqIO->new( -file => ">" . $outputfile, -format => 'Fasta', -alphabet => 'protein' );

    my %protein_sequence_objs;
    while ( my $seq = $input_fasta_file_obj->next_seq ) {
        $seq->desc(undef);
        my $protseq = $seq->translate( -codontable_id => $self->translation_table );
        $output_protein_file_obj->write_seq($protseq);
    }
    return 1;
}

sub _convert_nucleotide_to_protein {
    my ($self) = @_;
    $self->_fastatranslate( $self->_extracted_nucleotide_fasta_file_from_bed_filename, $self->_unfiltered_output_filename );
    unlink( $self->_extracted_nucleotide_fasta_file_from_bed_filename );
}

sub _does_sequence_contain_too_many_unknowns {
    my ( $self, $sequence_obj ) = @_;
    my $maximum_number_of_Xs = int( ( $sequence_obj->length() * $self->maximum_percentage_of_unknowns ) / 100 );
    my $number_of_Xs_found = () = $sequence_obj->seq() =~ /X/g;
    if ( $number_of_Xs_found > $maximum_number_of_Xs ) {
        return 1;
    }
    else {
        return 0;
    }
}

sub _filter_fasta_sequences {
    my ( $self, $filename ) = @_;
    my $temp_output_file = $filename . '.tmp.filtered.fa';
    my $out_fasta_obj    = Bio::SeqIO->new( -file => ">" . $temp_output_file, -format => 'Fasta' );
    my $fasta_obj        = Bio::SeqIO->new( -file => $filename, -format => 'Fasta' );

    my $sequence_found = 0;

    while ( my $seq = $fasta_obj->next_seq() ) {
        if ( $self->_does_sequence_contain_too_many_unknowns($seq) ) {
            next;
        }
        $seq->desc(undef);
        $out_fasta_obj->write_seq($seq);
        $sequence_found = 1;
    }

    if ( $sequence_found == 0 ) {
        $self->logger->error( "Could not extract any protein sequences from "
              . $self->gff_file
              . ". Does the file contain the assembly as well as the annotation?" );
    }

    # Replace the original file.
    move( $temp_output_file, $filename );
    return 1;
}

no Moose;
__PACKAGE__->meta->make_immutable;

1;