Mercurial > repos > dereeper > roary_plots
view Roary/lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
line wrap: on
line source
undef $VERSION; package Bio::Roary::CommandLine::RoaryPostAnalysis; # ABSTRACT: Perform the post analysis on the pan genome =head1 SYNOPSIS Perform the post analysis on the pan genome =cut use Moose; use Getopt::Long qw(GetOptionsFromArray); use Bio::Roary::PostAnalysis; use File::Find::Rule; use Bio::Roary::External::GeneAlignmentFromNucleotides; use File::Path qw(remove_tree); use Bio::Roary::External::Fasttree; extends 'Bio::Roary::CommandLine::Common'; has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 ); has 'help' => ( is => 'rw', isa => 'Bool', default => 0 ); has '_error_message' => ( is => 'rw', isa => 'Str' ); has 'fasta_files' => ( is => 'rw', isa => 'Str', default => '_fasta_files' ); has 'input_files' => ( is => 'rw', isa => 'Str', default => '_gff_files'); has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' ); has 'output_pan_geneome_filename' => ( is => 'rw', isa => 'Str', default => 'pan_genome.fa' ); has 'output_statistics_filename' => ( is => 'rw', isa => 'Str', default => 'gene_presence_absence.csv' ); has 'output_multifasta_files' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'clusters_filename' => ( is => 'rw', isa => 'Str', default => '_clustered.clstr' ); has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'Local' ); has 'cpus' => ( is => 'rw', isa => 'Int', default => 1 ); has 'dont_delete_files' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'dont_create_rplots' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'dont_split_groups' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'verbose_stats' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 ); has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50000 ); has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 ); has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'mafft' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 ); sub BUILD { my ($self) = @_; my ( $output_filename, $dont_create_rplots, $dont_delete_files, $dont_split_groups, $output_pan_geneome_filename, $job_runner, $output_statistics_filename, $output_multifasta_files, $clusters_filename, $core_definition, $fasta_files, $input_files, $verbose_stats, $translation_table, $help, $cpus,$group_limit,$verbose,$mafft, $allow_paralogs ); GetOptionsFromArray( $self->args, 'o|output=s' => \$output_filename, 'j|job_runner=s' => \$job_runner, 'm|output_multifasta_files' => \$output_multifasta_files, 'p=s' => \$output_pan_geneome_filename, 's=s' => \$output_statistics_filename, 'c=s' => \$clusters_filename, 'f=s' => \$fasta_files, 'i=s' => \$input_files, 'a|dont_delete_files' => \$dont_delete_files, 'b|dont_create_rplots' => \$dont_create_rplots, 'd|dont_split_groups' => \$dont_split_groups, 'e|verbose_stats' => \$verbose_stats, 'z|processors=i' => \$cpus, 't|translation_table=i' => \$translation_table, 'g|group_limit=i' => \$group_limit, 'cd|core_definition=f' => \$core_definition, 'v|verbose' => \$verbose, 'n|mafft' => \$mafft, 'q|allow_paralogs' => \$allow_paralogs, 'h|help' => \$help, ); $self->help($help) if(defined($help)); $self->job_runner($job_runner) if ( defined($job_runner) ); $self->fasta_files($fasta_files) if ( defined($fasta_files) ); $self->input_files($input_files) if ( defined($input_files) ); $self->output_filename($output_filename) if ( defined($output_filename) ); $self->output_pan_geneome_filename($output_pan_geneome_filename) if ( defined($output_pan_geneome_filename) ); $self->output_statistics_filename($output_statistics_filename) if ( defined($output_statistics_filename) ); $self->output_multifasta_files($output_multifasta_files) if ( defined($output_multifasta_files) ); $self->clusters_filename($clusters_filename) if ( defined($clusters_filename) ); $self->dont_delete_files($dont_delete_files) if ( defined($dont_delete_files) ); $self->dont_create_rplots($dont_create_rplots) if ( defined($dont_create_rplots) ); $self->dont_split_groups($dont_split_groups) if ( defined($dont_split_groups) ); $self->verbose_stats($verbose_stats) if ( defined($verbose_stats)); $self->translation_table($translation_table) if ( defined($translation_table) ); $self->cpus($cpus) if ( defined($cpus) ); $self->group_limit($group_limit) if ( defined($group_limit) ); $self->core_definition( $core_definition/100 ) if ( defined($core_definition) ); $self->mafft($mafft) if ( defined($mafft) ); $self->allow_paralogs($allow_paralogs) if ( defined($allow_paralogs) ); if ( defined($verbose) ) { $self->verbose($verbose); $self->logger->level(10000); } } sub run { my ($self) = @_; ( !$self->help ) or die $self->usage_text; if ( defined( $self->_error_message ) ) { print $self->_error_message . "\n"; die $self->usage_text; } my $input_files = $self->_read_file_into_array($self->input_files); my $obj = Bio::Roary::PostAnalysis->new( fasta_files => $self->_read_file_into_array($self->fasta_files) , input_files => $input_files , output_filename => $self->output_filename , output_pan_geneome_filename => $self->output_pan_geneome_filename, output_statistics_filename => $self->output_statistics_filename , output_multifasta_files => $self->output_multifasta_files , clusters_filename => $self->clusters_filename , dont_delete_files => $self->dont_delete_files, dont_create_rplots => $self->dont_create_rplots, dont_split_groups => $self->dont_split_groups, verbose_stats => $self->verbose_stats, group_limit => $self->group_limit, verbose => $self->verbose, cpus => $self->cpus, logger => $self->logger, core_definition => $self->core_definition, ); $obj->run(); if($self->dont_delete_files == 0) { unlink('_inflated_unsplit_mcl_groups'); remove_tree('split_groups'); } if($self->output_multifasta_files == 1) { print "Aligning each cluster\n" if($self->verbose); my $job_runner_to_use = $self->job_runner; if($self->_is_lsf_job_runner_available && $self->job_runner eq "LSF") { $job_runner_to_use = $self->job_runner; } else { $job_runner_to_use = 'Parallel'; } my $output_gene_files = $self->_find_input_files; my $seg = Bio::Roary::External::GeneAlignmentFromNucleotides->new( fasta_files => $output_gene_files, job_runner => $job_runner_to_use, translation_table => $self->translation_table, core_definition => $self->core_definition, cpus => $self->cpus, verbose => $self->verbose, mafft => $self->mafft, allow_paralogs => $self->allow_paralogs, dont_delete_files => $self->dont_delete_files, num_input_files => $#{$input_files}, ); $seg->run(); } } sub _is_lsf_job_runner_available { my ($self) = @_; my $rc = eval "require Bio::Roary::JobRunner::LSF; 1;"; if(defined($rc) && $rc == 1) { return 1; } else { return 0; } } sub _find_input_files { my ($self) = @_; my @files = File::Find::Rule->file() ->name( '*.fa' ) ->in('pan_genome_sequences' ); return \@files; } sub _read_file_into_array { my ($self, $filename) = @_; open(my $in_fh, $filename); my @filenames; while(<$in_fh>){ chomp; my $line = $_; push(@filenames, $line); } return \@filenames; } sub usage_text { my ($self) = @_; return <<USAGE; Usage: pan_genome_post_analysis [options] Perform the post analysis on the pan genome. This script is usally only called by another script. Options: -a dont delete intermediate files -b dont create R plots -c STR clusters filename [_clustered.clstr] -cd FLOAT percentage of isolates a gene must be in to be core [0.99] -d dont split groups -e add inference values to gene presence and absence spreadsheet -f STR file of protein filenames [_fasta_files] -g INT maximum number of clusters [50000] -i STR file of GFF filenames [_gff_files] -m core gene alignement with PRANK -n fast core gene alignement with MAFFT instead of PRANK -o STR clusters output filename [clustered_proteins] -p STR output pan genome filename [pan_genome.fa] -q allow paralogs in core alignment -s STR output gene presence and absence filename [gene_presence_absence.csv] -t INT translation table [11] -z INT number of threads [1] -v verbose output to STDOUT -h this help message For further info see: http://sanger-pathogens.github.io/Roary/ USAGE } __PACKAGE__->meta->make_immutable; no Moose; 1;