Mercurial > repos > dereeper > roary_plots
diff Roary/lib/Bio/Roary/External/Cdhit.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
line wrap: on
line diff
package Bio::Roary::External::Cdhit;

# ABSTRACT: Wrapper to run cd-hit

=head1 SYNOPSIS

Wrapper to run cd-hit

   use Bio::Roary::External::Cdhit;

   my $obj = Bio::Roary::External::Cdhit->new(
     input_file   => 'abc.fa',
     exec         => 'cd-hit',
     output_base  => 'efg',
   );
   $obj->run;

=cut

use Moose;

with 'Bio::Roary::JobRunner::Role';

# Public attributes.
#   input_file  - path handed to the executable via -i (required).
#   output_base - value handed to the executable via -o; clusters_filename()
#                 is derived from it.
#   exec        - preferred executable name.
#   alt_exec    - second candidate name, offered to _find_exe alongside exec.
has 'input_file'  => ( is => 'ro', isa => 'Str', required => 1 );
has 'output_base' => ( is => 'ro', isa => 'Str', default  => 'output' );
has 'exec'        => ( is => 'ro', isa => 'Str', default  => 'cd-hit' );
has 'alt_exec'    => ( is => 'ro', isa => 'Str', default  => 'cdhit' );

# Private tuning attributes; each one is emitted as a command line option by
# _command_to_run (-M, -g, -s, -c and -d respectively).
has '_max_available_memory_in_mb'  => ( is => 'ro', isa => 'Int',  lazy    => 1, builder => '_build__max_available_memory_in_mb' );
has '_use_most_similar_clustering' => ( is => 'ro', isa => 'Bool', default => 1 );
has '_length_difference_cutoff'    => ( is => 'ro', isa => 'Num',  default => 1 );
has '_sequence_identity_threshold' => ( is => 'ro', isa => 'Num',  default => 1 );
has '_description_length'          => ( is => 'ro', isa => 'Int',  default => 256 );

# Shell redirection appended to the command; by default both stdout and
# stderr of the executable are discarded.
has '_logging' => ( is => 'ro', isa => 'Str', default => '> /dev/null 2>&1' );

# Upper bound on the thread count passed via -T.
has '_max_cpus' => ( is => 'ro', isa => 'Int', default => 40 );

# Overload Role: the memory request is derived lazily from the input file size.
has 'memory_in_mb' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build_memory_in_mb' );

# Builder for memory_in_mb.
# Returns five times the input file size in (decimal) megabytes, with a floor
# of 2000. The floor is also the answer when the input file does not exist.
sub _build_memory_in_mb {
    my ($self) = @_;

    my $filename        = $self->input_file;
    my $memory_required = 2000;

    if ( -e $filename ) {
        $memory_required = -s $filename;

        # Convert to mb
        $memory_required = int( $memory_required / 1000000 );

        # Give it bucket loads of memory for the worst case scenario
        $memory_required *= 5;
        $memory_required = 2000 if ( $memory_required < 2000 );
    }

    return $memory_required;
}

# Builder for _max_available_memory_in_mb.
# Returns 90% of memory_in_mb, truncated to an integer, so the -M limit given
# to the executable stays below the amount requested for the job.
sub _build__max_available_memory_in_mb {
    my ($self) = @_;

    my $memory_to_cdhit = int( $self->memory_in_mb * 0.9 );
    return $memory_to_cdhit;
}

# Returns the clusters file name: output_base with a '.clstr' suffix.
sub clusters_filename {
    my ($self) = @_;

    return join( '.', ( $self->output_base, 'clstr' ) );
}

# Assembles the complete command as a single space separated string.
# _find_exe and cpus are not defined in this file (presumably supplied by the
# consumed role); _find_exe is offered exec first and alt_exec second.
# NOTE(review): input_file and output_base are interpolated without quoting,
# so the string is only safe for values free of whitespace and shell
# metacharacters - confirm that callers guarantee this.
sub _command_to_run {
    my ($self) = @_;

    my $executable = $self->_find_exe( [ $self->exec, $self->alt_exec ] );

    # Never ask for more threads than _max_cpus allows.
    my $cpus = ( $self->cpus > $self->_max_cpus ) ? $self->_max_cpus : $self->cpus;

    return join(
        ' ',
        (
            $executable,
            '-i', $self->input_file,
            '-o', $self->output_base,
            '-T', $cpus,
            '-M', $self->_max_available_memory_in_mb,
            '-g', $self->_use_most_similar_clustering,
            '-s', $self->_length_difference_cutoff,
            '-d', $self->_description_length,
            '-c', $self->_sequence_identity_threshold,
            $self->_logging
        )
    );
}

# Logs the command and submits it through the job runner class.
# logger, _queue and _job_runner_class are not defined in this file
# (presumably supplied by the consumed role). Always returns 1.
sub run {
    my ($self) = @_;

    # Build the command once: the string that is logged is then exactly the
    # string that is queued, and the executable lookup is not repeated.
    my $command         = $self->_command_to_run();
    my @commands_to_run = ($command);

    $self->logger->info( "Running command: " . $command );

    # The job is submitted with the uncapped cpus value; only the -T option
    # inside the command string is limited by _max_cpus.
    my $job_runner_obj = $self->_job_runner_class->new(
        commands_to_run => \@commands_to_run,
        memory_in_mb    => $self->memory_in_mb,
        queue           => $self->_queue,
        cpus            => $self->cpus,
    );
    $job_runner_obj->run();

    return 1;
}

no Moose;
__PACKAGE__->meta->make_immutable;

1;