diff Roary/lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm @ 0:c47a5f61bc9f draft

Uploaded
author dereeper
date Fri, 14 May 2021 20:27:06 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Roary/lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm	Fri May 14 20:27:06 2021 +0000
@@ -0,0 +1,242 @@
+undef $VERSION;
+package Bio::Roary::CommandLine::RoaryPostAnalysis;
+
+# ABSTRACT: Perform the post analysis on the pan genome
+
+=head1 SYNOPSIS
+
+Perform the post analysis on the pan genome
+
+=cut
+
+use Moose;
+use Getopt::Long qw(GetOptionsFromArray);
+use Bio::Roary::PostAnalysis;
+use File::Find::Rule;
+use Bio::Roary::External::GeneAlignmentFromNucleotides;
+use File::Path qw(remove_tree);
+use Bio::Roary::External::Fasttree;
+extends 'Bio::Roary::CommandLine::Common';
+
+has 'args'                        => ( is => 'ro', isa => 'ArrayRef', required => 1 );
+has 'script_name'                 => ( is => 'ro', isa => 'Str',      required => 1 );
+has 'help'                        => ( is => 'rw', isa => 'Bool',     default  => 0 );
+has '_error_message'              => ( is => 'rw', isa => 'Str' );
+
+has 'fasta_files'                 => ( is => 'rw', isa => 'Str',  default  => '_fasta_files' );
+has 'input_files'                 => ( is => 'rw', isa => 'Str',  default  => '_gff_files');
+has 'output_filename'             => ( is => 'rw', isa => 'Str',  default  => 'clustered_proteins' );
+has 'output_pan_geneome_filename' => ( is => 'rw', isa => 'Str',  default  => 'pan_genome.fa' );
+has 'output_statistics_filename'  => ( is => 'rw', isa => 'Str',  default  => 'gene_presence_absence.csv' );
+has 'output_multifasta_files'     => ( is => 'rw', isa => 'Bool', default  => 0 );
+has 'clusters_filename'           => ( is => 'rw', isa => 'Str',  default  => '_clustered.clstr' );
+has 'job_runner'                  => ( is => 'rw', isa => 'Str',  default  => 'Local' );
+has 'cpus'                        => ( is => 'rw', isa => 'Int',  default => 1 );
+has 'dont_delete_files'           => ( is => 'rw', isa => 'Bool', default  => 0 );
+has 'dont_create_rplots'          => ( is => 'rw', isa => 'Bool', default  => 0 );
+has 'dont_split_groups'           => ( is => 'rw', isa => 'Bool', default  => 0 );
+has 'verbose_stats'               => ( is => 'rw', isa => 'Bool', default  => 0 );
+has 'translation_table'           => ( is => 'rw', isa => 'Int',  default => 11 );
+has 'group_limit'                 => ( is => 'rw', isa => 'Num',  default => 50000 );
+has 'core_definition'             => ( is => 'rw', isa => 'Num',  default => 0.99 );
+has 'verbose'                     => ( is => 'rw', isa => 'Bool', default => 0 );
+has 'mafft'                       => ( is => 'rw', isa => 'Bool', default => 0 );
+has 'allow_paralogs'              => ( is => 'rw', isa => 'Bool', default => 0 );
+
+sub BUILD {
+    my ($self) = @_;
+
+    my ( 
+      $output_filename, $dont_create_rplots, $dont_delete_files, $dont_split_groups, $output_pan_geneome_filename, 
+      $job_runner, $output_statistics_filename, $output_multifasta_files, $clusters_filename, $core_definition,
+      $fasta_files, $input_files, $verbose_stats, $translation_table, $help, $cpus,$group_limit,$verbose,$mafft, $allow_paralogs
+    );
+
+
+    GetOptionsFromArray(
+        $self->args,
+        'o|output=s'                => \$output_filename,
+        'j|job_runner=s'            => \$job_runner,
+        'm|output_multifasta_files' => \$output_multifasta_files,
+        'p=s'                       => \$output_pan_geneome_filename,
+        's=s'                       => \$output_statistics_filename,
+        'c=s'                       => \$clusters_filename,
+        'f=s'                       => \$fasta_files,
+        'i=s'                       => \$input_files,
+        'a|dont_delete_files'       => \$dont_delete_files,
+        'b|dont_create_rplots'      => \$dont_create_rplots,
+        'd|dont_split_groups'       => \$dont_split_groups,
+        'e|verbose_stats'           => \$verbose_stats,
+        'z|processors=i'            => \$cpus,
+        't|translation_table=i'     => \$translation_table,
+        'g|group_limit=i'           => \$group_limit,
+        'cd|core_definition=f'      => \$core_definition,
+		'v|verbose'                 => \$verbose,
+		'n|mafft'                   => \$mafft,
+		'q|allow_paralogs'          => \$allow_paralogs,
+        'h|help'                    => \$help,
+    );
+    
+    $self->help($help) if(defined($help));
+    $self->job_runner($job_runner)                                   if ( defined($job_runner) );
+    $self->fasta_files($fasta_files)                                 if ( defined($fasta_files) );
+    $self->input_files($input_files)                                 if ( defined($input_files) );
+    $self->output_filename($output_filename)                         if ( defined($output_filename) );
+    $self->output_pan_geneome_filename($output_pan_geneome_filename) if ( defined($output_pan_geneome_filename) );
+    $self->output_statistics_filename($output_statistics_filename)   if ( defined($output_statistics_filename) );
+    $self->output_multifasta_files($output_multifasta_files)         if ( defined($output_multifasta_files) );
+    $self->clusters_filename($clusters_filename)                     if ( defined($clusters_filename) );
+    $self->dont_delete_files($dont_delete_files)                     if ( defined($dont_delete_files) );
+    $self->dont_create_rplots($dont_create_rplots)                   if ( defined($dont_create_rplots) );
+    $self->dont_split_groups($dont_split_groups)                     if ( defined($dont_split_groups) );
+    $self->verbose_stats($verbose_stats)                             if ( defined($verbose_stats));
+    $self->translation_table($translation_table)                     if ( defined($translation_table) );
+    $self->cpus($cpus)                                               if ( defined($cpus) );
+    $self->group_limit($group_limit)                                 if ( defined($group_limit) );
+    $self->core_definition( $core_definition/100 )                   if ( defined($core_definition) );
+	$self->mafft($mafft)                                             if ( defined($mafft) );
+	$self->allow_paralogs($allow_paralogs)                           if ( defined($allow_paralogs) );
+    if ( defined($verbose) ) {
+        $self->verbose($verbose);
+        $self->logger->level(10000);
+    }
+}
+
+sub run {
+    my ($self) = @_;
+
+    ( !$self->help ) or die $self->usage_text;
+    if ( defined( $self->_error_message ) ) {
+        print $self->_error_message . "\n";
+        die $self->usage_text;
+    }
+
+    my $input_files = $self->_read_file_into_array($self->input_files);
+    my $obj = Bio::Roary::PostAnalysis->new(
+      fasta_files                     =>  $self->_read_file_into_array($self->fasta_files) ,
+      input_files                     =>  $input_files ,
+      output_filename                 =>  $self->output_filename            ,
+      output_pan_geneome_filename     =>  $self->output_pan_geneome_filename,
+      output_statistics_filename      =>  $self->output_statistics_filename ,
+      output_multifasta_files         =>  $self->output_multifasta_files    ,
+      clusters_filename               =>  $self->clusters_filename          ,
+      dont_delete_files               =>  $self->dont_delete_files,
+      dont_create_rplots              =>  $self->dont_create_rplots,
+      dont_split_groups               =>  $self->dont_split_groups,
+      verbose_stats                   =>  $self->verbose_stats,
+      group_limit                     =>  $self->group_limit,
+	  verbose                         =>  $self->verbose,
+	  cpus                            =>  $self->cpus,
+	  logger                          =>  $self->logger,
+	  core_definition                 =>  $self->core_definition,
+      );                                                             
+    $obj->run();
+	
+    if($self->dont_delete_files == 0)
+    {
+		unlink('_inflated_unsplit_mcl_groups');
+        remove_tree('split_groups');
+    }
+
+    if($self->output_multifasta_files == 1)
+    {
+	  print "Aligning each cluster\n" if($self->verbose);
+      
+      my $job_runner_to_use = $self->job_runner;
+      if($self->_is_lsf_job_runner_available && $self->job_runner eq "LSF")
+      {
+          $job_runner_to_use = $self->job_runner;
+      }
+      else
+      {
+          $job_runner_to_use = 'Parallel';
+      }
+      
+      my $output_gene_files = $self->_find_input_files;
+      my $seg = Bio::Roary::External::GeneAlignmentFromNucleotides->new(
+        fasta_files         => $output_gene_files,
+        job_runner          => $job_runner_to_use,
+        translation_table   => $self->translation_table,
+        core_definition     => $self->core_definition,
+        cpus                => $self->cpus,
+		verbose             => $self->verbose,
+		mafft               => $self->mafft,
+		allow_paralogs      => $self->allow_paralogs,
+        dont_delete_files   => $self->dont_delete_files,
+        num_input_files     => $#{$input_files},
+      );
+      $seg->run();
+    }
+}
+
+sub _is_lsf_job_runner_available
+{
+    my ($self) = @_;
+    my $rc = eval "require Bio::Roary::JobRunner::LSF; 1;";
+    if(defined($rc) && $rc == 1)
+    {
+        return 1;
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+sub _find_input_files
+{
+   my ($self) = @_;
+   my @files = File::Find::Rule->file()
+                               ->name( '*.fa' )
+                               ->in('pan_genome_sequences' );
+   return \@files;
+}
+
+sub _read_file_into_array
+{
+  my ($self, $filename) = @_;
+  open(my $in_fh, $filename);
+  
+  my @filenames;
+  while(<$in_fh>){
+    chomp;
+    my $line = $_;
+    push(@filenames, $line);
+  }
+  return \@filenames;
+}
+
+sub usage_text {
+    my ($self) = @_;
+
+    return <<USAGE;
+Usage: pan_genome_post_analysis [options]
+Perform the post analysis on the pan genome. This script is usally only called by another script.
+
+Options: -a        dont delete intermediate files
+         -b        dont create R plots
+         -c STR    clusters filename [_clustered.clstr]
+         -cd FLOAT percentage of isolates a gene must be in to be core [0.99]
+         -d        dont split groups
+         -e        add inference values to gene presence and absence spreadsheet
+         -f STR    file of protein filenames [_fasta_files]
+         -g INT    maximum number of clusters [50000]
+         -i STR    file of GFF filenames [_gff_files]
+         -m        core gene alignement with PRANK
+         -n        fast core gene alignement with MAFFT instead of PRANK
+         -o STR    clusters output filename [clustered_proteins]
+         -p STR    output pan genome filename [pan_genome.fa]
+         -q        allow paralogs in core alignment
+         -s STR    output gene presence and absence filename [gene_presence_absence.csv]
+         -t INT    translation table [11]
+         -z INT    number of threads [1]
+         -v        verbose output to STDOUT
+         -h        this help message
+         
+For further info see: http://sanger-pathogens.github.io/Roary/
+USAGE
+}
+
+__PACKAGE__->meta->make_immutable;
+no Moose;
+1;