0
|
1 undef $VERSION;
|
|
2 package Bio::Roary::CommandLine::RoaryPostAnalysis;
|
|
3
|
|
4 # ABSTRACT: Perform the post analysis on the pan genome
|
|
5
|
|
6 =head1 SYNOPSIS
|
|
7
|
|
8 Perform the post analysis on the pan genome
|
|
9
|
|
10 =cut
|
|
11
|
|
12 use Moose;
|
|
13 use Getopt::Long qw(GetOptionsFromArray);
|
|
14 use Bio::Roary::PostAnalysis;
|
|
15 use File::Find::Rule;
|
|
16 use Bio::Roary::External::GeneAlignmentFromNucleotides;
|
|
17 use File::Path qw(remove_tree);
|
|
18 use Bio::Roary::External::Fasttree;
|
|
19 extends 'Bio::Roary::CommandLine::Common';
|
|
20
|
|
# --- Invocation context ---------------------------------------------------
# 'args' holds the raw command-line arguments that BUILD parses.
has 'args'           => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'script_name'    => ( is => 'ro', isa => 'Str',      required => 1 );

# When 'help' is true, run() dies with the usage text. When '_error_message'
# is defined, run() prints it and then dies with the usage text.
has 'help'           => ( is => 'rw', isa => 'Bool',     default  => 0 );
has '_error_message' => ( is => 'rw', isa => 'Str' );

# --- File names -----------------------------------------------------------
# 'fasta_files' and 'input_files' name files that run() reads line by line
# into arrays; the remaining names are handed to Bio::Roary::PostAnalysis.
has 'fasta_files'                 => ( is => 'rw', isa => 'Str',  default => '_fasta_files' );
has 'input_files'                 => ( is => 'rw', isa => 'Str',  default => '_gff_files' );
has 'output_filename'             => ( is => 'rw', isa => 'Str',  default => 'clustered_proteins' );
has 'output_pan_geneome_filename' => ( is => 'rw', isa => 'Str',  default => 'pan_genome.fa' );
has 'output_statistics_filename'  => ( is => 'rw', isa => 'Str',  default => 'gene_presence_absence.csv' );
has 'output_multifasta_files'     => ( is => 'rw', isa => 'Bool', default => 0 );
has 'clusters_filename'           => ( is => 'rw', isa => 'Str',  default => '_clustered.clstr' );

# --- Execution settings ---------------------------------------------------
# run() only honours a 'job_runner' of "LSF" (and only when the LSF runner
# module loads); any other value results in the 'Parallel' runner.
has 'job_runner'         => ( is => 'rw', isa => 'Str',  default => 'Local' );
has 'cpus'               => ( is => 'rw', isa => 'Int',  default => 1 );
has 'dont_delete_files'  => ( is => 'rw', isa => 'Bool', default => 0 );
has 'dont_create_rplots' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'dont_split_groups'  => ( is => 'rw', isa => 'Bool', default => 0 );
has 'verbose_stats'      => ( is => 'rw', isa => 'Bool', default => 0 );
has 'translation_table'  => ( is => 'rw', isa => 'Int',  default => 11 );
has 'group_limit'        => ( is => 'rw', isa => 'Num',  default => 50000 );

# Stored as a fraction: BUILD divides the command-line value by 100.
has 'core_definition'    => ( is => 'rw', isa => 'Num',  default => 0.99 );
has 'verbose'            => ( is => 'rw', isa => 'Bool', default => 0 );
has 'mafft'              => ( is => 'rw', isa => 'Bool', default => 0 );
has 'allow_paralogs'     => ( is => 'rw', isa => 'Bool', default => 0 );
|
|
45
|
|
# Moose construction hook: parses the command-line arguments held in
# $self->args and copies every option that was actually supplied into the
# attribute of the same purpose. Options that were not supplied leave the
# attribute defaults untouched.
sub BUILD {
    my ($self) = @_;

    # Getopt::Long option spec paired with the attribute that receives it.
    my @option_map = (
        [ 'o|output=s'                => 'output_filename' ],
        [ 'j|job_runner=s'            => 'job_runner' ],
        [ 'm|output_multifasta_files' => 'output_multifasta_files' ],
        [ 'p=s'                       => 'output_pan_geneome_filename' ],
        [ 's=s'                       => 'output_statistics_filename' ],
        [ 'c=s'                       => 'clusters_filename' ],
        [ 'f=s'                       => 'fasta_files' ],
        [ 'i=s'                       => 'input_files' ],
        [ 'a|dont_delete_files'       => 'dont_delete_files' ],
        [ 'b|dont_create_rplots'      => 'dont_create_rplots' ],
        [ 'd|dont_split_groups'       => 'dont_split_groups' ],
        [ 'e|verbose_stats'           => 'verbose_stats' ],
        [ 'z|processors=i'            => 'cpus' ],
        [ 't|translation_table=i'     => 'translation_table' ],
        [ 'g|group_limit=i'           => 'group_limit' ],
        [ 'cd|core_definition=f'      => 'core_definition' ],
        [ 'v|verbose'                 => 'verbose' ],
        [ 'n|mafft'                   => 'mafft' ],
        [ 'q|allow_paralogs'          => 'allow_paralogs' ],
        [ 'h|help'                    => 'help' ],
    );

    # Each option writes into its own slot of %parsed; a slot that stays
    # undefined means the option was not given on the command line.
    my %parsed;
    my @getopt_spec;
    for my $pair (@option_map) {
        my ( $spec, $attribute ) = @{$pair};
        push @getopt_spec, $spec => \$parsed{$attribute};
    }
    GetOptionsFromArray( $self->args, @getopt_spec );

    for my $pair (@option_map) {
        my $attribute = $pair->[1];
        my $value     = $parsed{$attribute};
        next unless defined $value;

        if ( $attribute eq 'core_definition' ) {

            # Given as a percentage on the command line, stored as a fraction.
            $self->core_definition( $value / 100 );
        }
        elsif ( $attribute eq 'verbose' ) {
            $self->verbose($value);
            $self->logger->level(10000);
        }
        else {
            $self->$attribute($value);
        }
    }
}
|
|
104
|
|
# Entry point: runs the pan genome post analysis and, when multi-FASTA
# output was requested, aligns each cluster afterwards.
#
# Dies with the usage text when help was requested, or (after printing the
# message) when an error message has been recorded.
sub run {
    my ($self) = @_;

    die $self->usage_text if $self->help;

    if ( defined( $self->_error_message ) ) {
        print $self->_error_message . "\n";
        die $self->usage_text;
    }

    # Both attributes name files listing one filename per line.
    my $input_files   = $self->_read_file_into_array( $self->input_files );
    my $post_analysis = Bio::Roary::PostAnalysis->new(
        fasta_files                 => $self->_read_file_into_array( $self->fasta_files ),
        input_files                 => $input_files,
        output_filename             => $self->output_filename,
        output_pan_geneome_filename => $self->output_pan_geneome_filename,
        output_statistics_filename  => $self->output_statistics_filename,
        output_multifasta_files     => $self->output_multifasta_files,
        clusters_filename           => $self->clusters_filename,
        dont_delete_files           => $self->dont_delete_files,
        dont_create_rplots          => $self->dont_create_rplots,
        dont_split_groups           => $self->dont_split_groups,
        verbose_stats               => $self->verbose_stats,
        group_limit                 => $self->group_limit,
        verbose                     => $self->verbose,
        cpus                        => $self->cpus,
        logger                      => $self->logger,
        core_definition             => $self->core_definition,
    );
    $post_analysis->run();

    # Remove intermediate artefacts unless the user asked to keep them.
    if ( $self->dont_delete_files == 0 ) {
        unlink('_inflated_unsplit_mcl_groups');
        remove_tree('split_groups');
    }

    if ( $self->output_multifasta_files == 1 ) {
        print "Aligning each cluster\n" if ( $self->verbose );

        # Only an explicitly requested and loadable LSF runner is honoured;
        # every other setting falls back to the 'Parallel' runner.
        my $requested_runner  = $self->job_runner;
        my $lsf_usable        = $self->_is_lsf_job_runner_available && $self->job_runner eq "LSF";
        my $job_runner_to_use = $lsf_usable ? $requested_runner : 'Parallel';

        my $gene_files = $self->_find_input_files;
        my $aligner    = Bio::Roary::External::GeneAlignmentFromNucleotides->new(
            fasta_files       => $gene_files,
            job_runner        => $job_runner_to_use,
            translation_table => $self->translation_table,
            core_definition   => $self->core_definition,
            cpus              => $self->cpus,
            verbose           => $self->verbose,
            mafft             => $self->mafft,
            allow_paralogs    => $self->allow_paralogs,
            dont_delete_files => $self->dont_delete_files,

            # NOTE(review): $#{...} is the last index, i.e. one less than the
            # number of entries read - confirm this is what the consumer expects.
            num_input_files => $#{$input_files},
        );
        $aligner->run();
    }
}
|
|
171
|
|
# Reports whether the optional LSF job runner module can be loaded.
#
# Returns 1 when Bio::Roary::JobRunner::LSF loads successfully, 0 otherwise.
# A load failure is swallowed on purpose: the module is optional.
sub _is_lsf_job_runner_available {
    my ($self) = @_;

    # Block eval rather than string eval: the code is syntax-checked when this
    # module is compiled and no string has to be compiled at run time.
    my $loaded = eval {
        require Bio::Roary::JobRunner::LSF;
        1;
    };

    return $loaded ? 1 : 0;
}
|
|
185
|
|
# Collects the '*.fa' files found under the 'pan_genome_sequences' directory
# (relative to the current working directory).
#
# Returns an array reference of the matching paths.
sub _find_input_files {
    my ($self) = @_;

    my $fasta_rule = File::Find::Rule->file();
    $fasta_rule = $fasta_rule->name('*.fa');

    my @gene_fasta_files = $fasta_rule->in('pan_genome_sequences');
    return \@gene_fasta_files;
}
|
|
194
|
|
# Reads a text file and returns its lines, without trailing newlines, as an
# array reference (one element per line, in file order).
#
# Parameters:
#   $filename - path of the file to read.
#
# Dies with a descriptive message when no filename is given or the file
# cannot be opened, instead of silently returning an empty list.
sub _read_file_into_array {
    my ( $self, $filename ) = @_;

    defined($filename)
      or die "No filename given to read into an array\n";

    # Three-argument open: the filename can never be interpreted as a mode
    # or a command, whatever characters it contains.
    open( my $in_fh, '<', $filename )
      or die "Cannot open '$filename' for reading: $!\n";

    # A lexical loop variable keeps the caller's global $_ intact.
    my @filenames;
    while ( my $line = <$in_fh> ) {
        chomp $line;
        push( @filenames, $line );
    }
    close($in_fh);

    return \@filenames;
}
|
|
208
|
|
# Returns the command-line help text for this script as a single string.
#
# The bracketed values are the defaults as the user would type them; in
# particular -cd is a percentage (BUILD divides it by 100), so the default
# fraction 0.99 is shown as 99.
sub usage_text {
    my ($self) = @_;

    return <<USAGE;
Usage: pan_genome_post_analysis [options]
Perform the post analysis on the pan genome. This script is usually only called by another script.

Options: -a        dont delete intermediate files
         -b        dont create R plots
         -c STR    clusters filename [_clustered.clstr]
         -cd FLOAT percentage of isolates a gene must be in to be core [99]
         -d        dont split groups
         -e        add inference values to gene presence and absence spreadsheet
         -f STR    file of protein filenames [_fasta_files]
         -g INT    maximum number of clusters [50000]
         -i STR    file of GFF filenames [_gff_files]
         -m        core gene alignment with PRANK
         -n        fast core gene alignment with MAFFT instead of PRANK
         -o STR    clusters output filename [clustered_proteins]
         -p STR    output pan genome filename [pan_genome.fa]
         -q        allow paralogs in core alignment
         -s STR    output gene presence and absence filename [gene_presence_absence.csv]
         -t INT    translation table [11]
         -z INT    number of threads [1]
         -v        verbose output to STDOUT
         -h        this help message

For further info see: http://sanger-pathogens.github.io/Roary/
USAGE
}
|
|
239
|
|
240 __PACKAGE__->meta->make_immutable;
|
|
241 no Moose;
|
|
242 1;
|