view Roary/lib/Bio/Roary/PrepareInputFiles.pm @ 0:c47a5f61bc9f draft

Uploaded
author dereeper
date Fri, 14 May 2021 20:27:06 +0000
parents
children
line wrap: on
line source

package Bio::Roary::PrepareInputFiles;

# ABSTRACT: Take in a mixture of FASTA and GFF input files and output FASTA proteomes only

=head1 SYNOPSIS

Take in a mixture of FASTA and GFF input files and output FASTA proteomes only
   use Bio::Roary::PrepareInputFiles;
   
   my $obj = Bio::Roary::PrepareInputFiles->new(
     input_files   => ['abc.gff','ddd.faa'],
   );
   $obj->fasta_files;

=cut

use Moose;
use Bio::Roary::Exceptions;
use Bio::Roary::ExtractProteomeFromGFFs;
use Bio::Roary::FilterUnknownsFromFasta;
use Cwd qw(getcwd);
use File::Temp;
use Log::Log4perl qw(:easy);

has 'input_files'        => ( is => 'ro', isa => 'ArrayRef',        required => 1 );
has 'job_runner'         => ( is => 'ro', isa => 'Str',             default  => 'Local' );
has 'cpus'               => ( is => 'ro', isa => 'Int',             default  => 1 );
has '_input_gff_files'   => ( is => 'ro', isa => 'Maybe[ArrayRef]', lazy     => 1, builder => '_build__input_gff_files' );
has '_input_fasta_files' => ( is => 'ro', isa => 'Maybe[ArrayRef]', lazy     => 1, builder => '_build__input_fasta_files' );
has '_input_fasta_files_filtered' => ( is => 'ro', isa => 'Maybe[ArrayRef]', lazy => 1, builder => '_build__input_fasta_files_filtered' );
has '_input_fasta_files_filtered_obj' =>
  ( is => 'ro', isa => 'Bio::Roary::FilterUnknownsFromFasta', lazy => 1, builder => '_build__input_fasta_files_filtered_obj' );

has '_derived_fasta_files' => ( is => 'ro', isa => 'Maybe[ArrayRef]', lazy => 1, builder => '_build__derived_fasta_files' );
has '_extract_proteome_obj' => (
    is      => 'ro',
    isa     => 'Bio::Roary::ExtractProteomeFromGFFs',
    lazy    => 1,
    builder => '_build__extract_proteome_obj'
);
has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool',                               default => 1 );
has 'translation_table'     => ( is => 'rw', isa => 'Int',                                default => 11 );
has 'verbose'               => ( is => 'rw', isa => 'Bool',                               default => 0 );
has '_fasta_filter_obj'     => ( is => 'ro', isa => 'Bio::Roary::FilterUnknowsFromFasta', lazy    => 1, builder => '_fasta_filter_obj' );
has 'working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger' );

sub _build_logger {
    my ($self) = @_;
    Log::Log4perl->easy_init($ERROR);
    my $logger = get_logger();
    return $logger;
}

sub _build__input_gff_files {
    my ($self) = @_;
    my @gff_files = grep( /\.gff$/, @{ $self->input_files } );
    return \@gff_files;
}

sub _build__input_fasta_files {
    my ($self) = @_;
    my @fasta_files = grep( !/\.gff$/, @{ $self->input_files } );

    my @validated_fasta_files;

    for my $fasta_file (@fasta_files) {
        eval {
            my $inseq = Bio::SeqIO->new(
                -file     => $fasta_file,
                -format   => 'fasta',
                -alphabet => 'protein'
            );
            while ( my $seq = $inseq->next_seq ) {

                # do something to force the reading.
                $seq->seq;
            }
        };
        if ($@) {
            $self->logger->warn(
                "Input file doesnt have a .gff extension and isnt a protein FASTA file so excluding it from further analysis: $fasta_file"
            );
        }
        else {
            push( @validated_fasta_files, $fasta_file );
        }

    }

    return \@fasta_files;
}

sub _build__input_fasta_files_filtered_obj {
    my ($self) = @_;
    return Bio::Roary::FilterUnknownsFromFasta->new( fasta_files => $self->_input_fasta_files );
}

sub _build__input_fasta_files_filtered {
    my ($self) = @_;
    return undef if ( !defined( $self->_input_fasta_files ) );
    return $self->_input_fasta_files_filtered_obj->filtered_fasta_files();
}

sub _build__extract_proteome_obj {
    my ($self) = @_;
    return Bio::Roary::ExtractProteomeFromGFFs->new(
        gff_files             => $self->_input_gff_files,
        job_runner            => $self->job_runner,
        apply_unknowns_filter => $self->apply_unknowns_filter,
        translation_table     => $self->translation_table,
        cpus                  => $self->cpus,
        verbose               => $self->verbose,
        working_directory     => $self->working_directory,
    );
}

sub _build__derived_fasta_files {
    my ($self) = @_;
    return undef if ( !defined( $self->_input_gff_files ) );
    return $self->_extract_proteome_obj->fasta_files();
}

sub fasta_files {
    my ($self) = @_;
    my @output_fasta_files = ( @{ $self->_input_fasta_files_filtered }, @{ $self->_derived_fasta_files } );
    return \@output_fasta_files;
}

sub lookup_fasta_files_from_unknown_input_files {
    my ( $self, $input_files ) = @_;
    $self->fasta_files;

    my @output_fasta_files;
    for my $input_file ( @{$input_files} ) {
        if ( defined( $self->_extract_proteome_obj->fasta_files_to_gff_files->{$input_file} ) ) {
            push( @output_fasta_files, $self->_extract_proteome_obj->fasta_files_to_gff_files->{$input_file} );
        }
        else {
            push( @output_fasta_files, $self->_input_fasta_files_filtered_obj->input_fasta_to_output_fasta->{$input_file} );
        }
    }
    return \@output_fasta_files;
}

no Moose;
__PACKAGE__->meta->make_immutable;

1;