0
|
1 package Bio::Roary::External::Cdhit;
|
|
2
|
|
3 # ABSTRACT: Wrapper to run cd-hit
|
|
4
|
|
5 =head1 SYNOPSIS
|
|
6
|
|
7 Wrapper to run cd-hit
|
|
8 use Bio::Roary::External::Cdhit;
|
|
9
|
|
10 my $obj = Bio::Roary::External::Cdhit->new(
|
|
11 input_file => 'abc.fa',
|
|
12 exec => 'cd-hit',
|
|
13 output_base => 'efg',
|
|
14 );
|
|
15 $obj->run;
|
|
16
|
|
17 =cut
|
|
18
|
|
19 use Moose;
|
|
20
|
|
21 with 'Bio::Roary::JobRunner::Role';
|
|
22
|
|
# --- Public attributes -------------------------------------------------------

# Path of the sequence file handed to cd-hit via '-i' (mandatory).
has 'input_file' => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# Output name handed to cd-hit via '-o'; clusters_filename() appends '.clstr'.
has 'output_base' => (
    is      => 'ro',
    isa     => 'Str',
    default => 'output',
);

# Preferred and alternative executable names; both are offered to _find_exe.
has 'exec' => (
    is      => 'ro',
    isa     => 'Str',
    default => 'cd-hit',
);
has 'alt_exec' => (
    is      => 'ro',
    isa     => 'Str',
    default => 'cdhit',
);

# --- Private tuning attributes (each maps to one cd-hit option) --------------

# Value passed with '-M'; derived lazily from memory_in_mb.
has '_max_available_memory_in_mb' => (
    is      => 'ro',
    isa     => 'Int',
    lazy    => 1,
    builder => '_build__max_available_memory_in_mb',
);

# Value passed with '-g'.
has '_use_most_similar_clustering' => (
    is      => 'ro',
    isa     => 'Bool',
    default => 1,
);

# Value passed with '-s'.
has '_length_difference_cutoff' => (
    is      => 'ro',
    isa     => 'Num',
    default => 1,
);

# Value passed with '-c'.
has '_sequence_identity_threshold' => (
    is      => 'ro',
    isa     => 'Num',
    default => 1,
);

# Value passed with '-d'.
has '_description_length' => (
    is      => 'ro',
    isa     => 'Int',
    default => 256,
);

# Shell redirection appended verbatim to the end of the command line.
has '_logging' => (
    is      => 'ro',
    isa     => 'Str',
    default => '> /dev/null 2>&1',
);

# Upper bound applied to the thread count passed with '-T'.
has '_max_cpus' => (
    is      => 'ro',
    isa     => 'Int',
    default => 40,
);

# Overload Role: this declaration is meant to replace the memory_in_mb
# attribute coming from Bio::Roary::JobRunner::Role (the role itself is not
# visible here -- confirm), sizing the request from the input file instead.
has 'memory_in_mb' => (
    is      => 'ro',
    isa     => 'Int',
    lazy    => 1,
    builder => '_build_memory_in_mb',
);
|
|
38
|
|
# Builder for the 'memory_in_mb' attribute.
#
# Returns the amount of memory (in MB) to request for the job:
#   * 2000 when the input file does not exist;
#   * otherwise five times the input file size in whole megabytes
#     (bytes / 1,000,000, truncated), but never less than 2000.
sub _build_memory_in_mb {
    my ($self) = @_;

    my $floor_mb   = 2000;
    my $input_path = $self->input_file;

    # Nothing to measure: fall back to the minimum request.
    return $floor_mb unless -e $input_path;

    # Bytes -> whole MB, then a generous x5 margin for the worst case scenario.
    my $scaled_mb = int( ( -s $input_path ) / 1000000 ) * 5;

    return $scaled_mb < $floor_mb ? $floor_mb : $scaled_mb;
}
|
|
56
|
|
# Builder for the '_max_available_memory_in_mb' attribute.
#
# Returns 90% of memory_in_mb, truncated to an integer; this is the value
# _command_to_run passes to cd-hit with '-M'.
sub _build__max_available_memory_in_mb {
    my ($self) = @_;

    return int( 0.9 * $self->memory_in_mb );
}
|
|
63
|
|
# Returns the name of the cluster file: output_base with a '.clstr' suffix.
sub clusters_filename {
    my ($self) = @_;

    return $self->output_base . '.clstr';
}
|
|
69
|
|
# Assembles the cd-hit command line as a single space-separated string.
#
# The executable is whichever name _find_exe picks from (exec, alt_exec);
# _find_exe is not defined in this file -- presumably supplied by the
# JobRunner role. The thread count is the object's cpus value capped at
# _max_cpus. Values are interpolated as-is (no shell quoting) and the
# _logging redirection is appended last.
sub _command_to_run {
    my ($self) = @_;

    my $binary = $self->_find_exe( [ $self->exec, $self->alt_exec ] );

    # Never ask cd-hit for more threads than _max_cpus allows.
    my $thread_count = $self->cpus;
    if ( $thread_count > $self->_max_cpus ) {
        $thread_count = $self->_max_cpus;
    }

    my @command_parts = (
        $binary,
        '-i' => $self->input_file,
        '-o' => $self->output_base,
        '-T' => $thread_count,
        '-M' => $self->_max_available_memory_in_mb,
        '-g' => $self->_use_most_similar_clustering,
        '-s' => $self->_length_difference_cutoff,
        '-d' => $self->_description_length,
        '-c' => $self->_sequence_identity_threshold,
        $self->_logging,
    );

    return join( ' ', @command_parts );
}
|
|
87
|
|
# Runs cd-hit through the configured job runner.
#
# Builds the command line once, logs it, and submits it as the only command
# of a new job runner instance together with the memory, queue and cpu
# settings of this object. Building it once guarantees the logged string is
# exactly the one submitted and avoids repeating the executable lookup.
#
# Returns 1 once the job runner's run() has returned. Any exception raised
# by the job runner propagates to the caller.
sub run {
    my ($self) = @_;

    my $command = $self->_command_to_run();
    $self->logger->info( "Running command: " . $command );

    my $job_runner_obj = $self->_job_runner_class->new(
        commands_to_run => [$command],
        memory_in_mb    => $self->memory_in_mb,
        queue           => $self->_queue,
        cpus            => $self->cpus,
    );
    $job_runner_obj->run();

    return 1;
}
|
|
99
|
|
100 no Moose;
|
|
101 __PACKAGE__->meta->make_immutable;
|
|
102
|
|
103 1;
|