Mercurial > repos > dereeper > roary_plots
view Roary/lib/Bio/Roary/CommandLine/Roary.pm @ 3:e95344f6dfc5 draft default tip
Uploaded
author | dereeper |
---|---|
date | Fri, 12 Nov 2021 16:32:26 +0000 |
parents | c47a5f61bc9f |
children |
line wrap: on
line source
undef $VERSION; package Bio::Roary::CommandLine::Roary; # ABSTRACT: Take in FASTA files of proteins and cluster them =head1 SYNOPSIS Take in FASTA files of proteins and cluster them =cut use Moose; use Getopt::Long qw(GetOptionsFromArray); use Bio::Roary; use Bio::Roary::PrepareInputFiles; use Bio::Roary::QC::Report; use Bio::Roary::ReformatInputGFFs; use Bio::Roary::External::CheckTools; use File::Which; use File::Path qw(make_path); use Cwd qw(abs_path getcwd); use File::Temp; use File::Basename; extends 'Bio::Roary::CommandLine::Common'; has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 ); has 'help' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'fasta_files' => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } ); has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' ); has 'output_directory' => ( is => 'rw', isa => 'Str', default => '.' ); has '_original_directory' => ( is => 'rw', isa => 'Str', default => '.' ); has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'Local' ); has 'makeblastdb_exec' => ( is => 'rw', isa => 'Str', default => 'makeblastdb' ); has 'blastp_exec' => ( is => 'rw', isa => 'Str', default => 'blastp' ); has 'mcxdeblast_exec' => ( is => 'rw', isa => 'Str', default => 'mcxdeblast' ); has 'mcl_exec' => ( is => 'rw', isa => 'Str', default => 'mcl' ); has 'apply_unknowns_filter' => ( is => 'rw', isa => 'Bool', default => 1 ); has 'cpus' => ( is => 'rw', isa => 'Int', default => 1 ); has 'output_multifasta_files' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'perc_identity' => ( is => 'rw', isa => 'Num', default => 95 ); has 'dont_delete_files' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'check_dependancies' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'dont_create_rplots' => ( is => 'rw', isa => 'Bool', default => 1 ); has 'dont_run_qc' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'dont_split_groups' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'verbose_stats' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 ); has 'mafft' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50000 ); has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 ); has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'kraken_db' => ( is => 'rw', isa => 'Str', default => '/lustre/scratch118/infgen/pathogen/pathpipe/kraken/minikraken_20140330/' ); has 'run_qc' => ( is => 'rw', isa => 'Bool', default => 0 ); has '_working_directory' => ( is => 'rw', isa => 'File::Temp::Dir', lazy => 1, builder => '_build__working_directory' ); has 'inflation_value' => ( is => 'rw', isa => 'Num', default => 1.5 ); sub _build__working_directory { my ($self) = @_; return File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } sub BUILD { my ($self) = @_; my ( $fasta_files, $verbose, $create_rplots, $group_limit, $dont_run_qc, $max_threads, $dont_delete_files, $dont_split_groups, $perc_identity, $output_filename, $job_runner, $makeblastdb_exec, $mcxdeblast_exec, $mcl_exec, $blastp_exec, $apply_unknowns_filter, $cpus, $output_multifasta_files, $verbose_stats, $translation_table, $run_qc, $core_definition, $help, $kraken_db, $cmd_version, $mafft, $output_directory, $check_dependancies, $inflation_value, $allow_paralogs, ); GetOptionsFromArray( $self->args, 'o|output=s' => \$output_filename, 'f|output_directory=s' => \$output_directory, 'j|job_runner=s' => \$job_runner, 'm|makeblastdb_exec=s' => \$makeblastdb_exec, 'b|blastp_exec=s' => \$blastp_exec, 'd|mcxdeblast_exec=s' => \$mcxdeblast_exec, 'c|mcl_exec=s' => \$mcl_exec, 'p|processors=i' => \$cpus, 'u|apply_unknowns_filter=i' => \$apply_unknowns_filter, 'e|output_multifasta_files' => \$output_multifasta_files, 'i|perc_identity=i' => \$perc_identity, 'z|dont_delete_files' => \$dont_delete_files, 's|dont_split_groups' => \$dont_split_groups, 'r|create_rplots' => \$create_rplots, 'y|verbose_stats' => \$verbose_stats, 't|translation_table=i' => \$translation_table, 'g|group_limit=i' => \$group_limit, 'qc|run_qc' => \$run_qc, 'x|dont_run_qc' => \$dont_run_qc, 'cd|core_definition=f' => \$core_definition, 'v|verbose' => \$verbose, 'n|mafft' => \$mafft, 'ap|allow_paralogs' => \$allow_paralogs, 'k|kraken_db=s' => \$kraken_db, 'w|version' => \$cmd_version, 'a|check_dependancies' => \$check_dependancies, 'iv|inflation_value=f' => \$inflation_value, 'h|help' => \$help, ); $self->version($cmd_version) if ( defined($cmd_version) ); if ( $self->version ) { print $self->_version() ; return; } print "\nPlease cite Roary if you use any of the results it produces: Andrew J. Page, Carla A. Cummins, Martin Hunt, Vanessa K. Wong, Sandra Reuter, Matthew T. G. Holden, Maria Fookes, Daniel Falush, Jacqueline A. Keane, Julian Parkhill, \"Roary: Rapid large-scale prokaryote pan genome analysis\", Bioinformatics, 2015 Nov 15;31(22):3691-3693 doi: http://doi.org/10.1093/bioinformatics/btv421 Pubmed: 26198102\n\n"; $self->help($help) if ( defined($help) ); if( $self->help ) { print $self->usage_text; return; } if ( defined($verbose) ) { $self->verbose($verbose); $self->logger->level(10000); } $self->check_dependancies($check_dependancies) if ( defined($check_dependancies) ); if ($self->check_dependancies) { my $check_tools = Bio::Roary::External::CheckTools->new(); $check_tools->check_all_tools; $self->logger->error( "Roary version " . $self->_version() ); if( @{ $self->args } < 1 ) { return; } } if ( @{ $self->args } < 2 ) { $self->logger->error("Error: You need to provide at least 2 files to build a pan genome"); die $self->usage_text; } my %basenames; foreach my $string (@{$self->args}) { my($base, $path, $suf) = fileparse($string); next unless $basenames{$base}++; $self->logger->error("Error: GFF files must have unique basenames."); die $self->usage_text; } if ( @{ $self->args } < 2 ) { $self->logger->error("Error: You need to provide at least 2 files to build a pan genome"); die $self->usage_text; } $self->output_filename($output_filename) if ( defined($output_filename) ); $self->job_runner($job_runner) if ( defined($job_runner) ); $self->makeblastdb_exec($makeblastdb_exec) if ( defined($makeblastdb_exec) ); $self->blastp_exec($blastp_exec) if ( defined($blastp_exec) ); $self->mcxdeblast_exec($mcxdeblast_exec) if ( defined($mcxdeblast_exec) ); $self->mcl_exec($mcl_exec) if ( defined($mcl_exec) ); $self->cpus($cpus) if ( defined($cpus) ); $self->inflation_value($inflation_value) if ( defined($inflation_value)); if ( defined($perc_identity) ) { $self->perc_identity($perc_identity); if ( $perc_identity < 50 ) { $self->logger->error( "The percentage identity is too low. Either something is wrong with your data, like contamination, or your doing something that the software isnt designed to support." ); } } $self->mafft($mafft) if ( defined($mafft) ); $self->apply_unknowns_filter($apply_unknowns_filter) if ( defined($apply_unknowns_filter) ); if ( defined($output_multifasta_files) ) { if ( which('prank') ) { $self->output_multifasta_files($output_multifasta_files); } else { if ( which('mafft') ) { $self->output_multifasta_files($output_multifasta_files); $self->mafft(1); $self->logger->warn("PRANK not found in your PATH so using MAFFT instead to generate multiFASTA alignments."); } else { $self->logger->warn("PRANK (or MAFFT) not found in your PATH so cannot generate multiFASTA alignments, skipping for now."); } } } $self->dont_delete_files($dont_delete_files) if ( defined($dont_delete_files) ); $self->dont_split_groups($dont_split_groups) if ( defined($dont_split_groups) ); $self->dont_create_rplots(0) if ( defined($create_rplots) ); $self->verbose_stats($verbose_stats) if ( defined $verbose_stats ); $self->translation_table($translation_table) if ( defined($translation_table) ); $self->group_limit($group_limit) if ( defined($group_limit) ); $self->kraken_db($kraken_db) if ( defined($kraken_db) ); $self->output_directory($output_directory) if ( defined($output_directory) ); if ( defined $verbose_stats && defined($output_multifasta_files) ) { $self->verbose_stats(0); $self->logger->warn("The verbose stats spreadsheet is not compatible with the core gene alignement so disabling verbose_stats"); } if ( defined($run_qc) ) { if ( which('kraken') && which('kraken-report') ) { $self->run_qc($run_qc); } else { $self->logger->warn("kraken or kraken-report not found in your PATH so cannot run QC, skipping for now."); } } if ( $self->cpus > 1 ) { $self->job_runner('Parallel'); } $self->core_definition( $core_definition / 100 ) if ( defined($core_definition) ); for my $filename ( @{ $self->args } ) { if ( !-e $filename ) { $self->logger->error("Error: Cant access file $filename"); die $self->usage_text; } push( @{ $self->fasta_files }, abs_path($filename) ); } $self->_working_directory( File::Temp->newdir( DIR => getcwd, CLEANUP => 0 ) ) if ( $self->dont_delete_files ); } sub _setup_output_directory { my ($self) = @_; return if ( $self->output_directory eq '.' || $self->output_directory eq '' ); if ( -e $self->output_directory || -d $self->output_directory ) { $self->logger->warn("Output directory name exists already so adding a timestamp to the end"); $self->output_directory( $self->output_directory() . '_' . time() ); if ( -e $self->output_directory || -d $self->output_directory ) { die("Output directory name with time stamp exist so giving up"); } } make_path( $self->output_directory, { error => \my $err } ); if (@$err) { for my $diag (@$err) { my ( $file, $message ) = %$diag; die("Error creating output directory $message"); } } $self->logger->info( "Output directory created: " . $self->output_directory ); $self->_original_directory( getcwd() ); chdir( $self->output_directory ); return $self; } sub run { my ($self) = @_; return if($self->version || $self->help || ($self->check_dependancies && @{$self->fasta_files} < 1) ); $self->_setup_output_directory; $self->logger->info("Fixing input GFF files"); my $reformat_input_files = Bio::Roary::ReformatInputGFFs->new( gff_files => $self->fasta_files, logger => $self->logger ); $reformat_input_files->fix_duplicate_gene_ids(); if ( @{ $reformat_input_files->fixed_gff_files } == 0 ) { die( "All input files have been excluded from analysis. Please check you have valid GFF files, with annotation and a FASTA sequence at the end. Better still, reannotate your FASTA file with PROKKA." ); } $self->fasta_files( $reformat_input_files->fixed_gff_files ); $self->logger->info("Extracting proteins from GFF files"); my $prepare_input_files = Bio::Roary::PrepareInputFiles->new( input_files => $self->fasta_files, job_runner => $self->job_runner, apply_unknowns_filter => $self->apply_unknowns_filter, cpus => $self->cpus, translation_table => $self->translation_table, verbose => $self->verbose, working_directory => $self->_working_directory, ); if ( $self->run_qc ) { $self->logger->info("Running Kraken on each input assembly"); my $qc_input_files = Bio::Roary::QC::Report->new( input_files => $self->fasta_files, job_runner => $self->job_runner, cpus => $self->cpus, verbose => $self->verbose, kraken_db => $self->kraken_db ); $qc_input_files->report; } my $pan_genome_obj = Bio::Roary->new( input_files => $self->fasta_files, fasta_files => $prepare_input_files->fasta_files, output_filename => $self->output_filename, job_runner => $self->job_runner, cpus => $self->cpus, makeblastdb_exec => $self->makeblastdb_exec, blastp_exec => $self->blastp_exec, output_multifasta_files => $self->output_multifasta_files, perc_identity => $self->perc_identity, dont_delete_files => $self->dont_delete_files, dont_create_rplots => $self->dont_create_rplots, dont_split_groups => $self->dont_split_groups, verbose_stats => $self->verbose_stats, translation_table => $self->translation_table, group_limit => $self->group_limit, core_definition => $self->core_definition, verbose => $self->verbose, mafft => $self->mafft, allow_paralogs => $self->allow_paralogs, inflation_value => $self->inflation_value, ); $pan_genome_obj->run(); chdir( $self->_original_directory ); } sub _version { my ($self) = @_; if ( defined($Bio::Roary::CommandLine::Roary::VERSION) ) { return $Bio::Roary::CommandLine::Roary::VERSION . "\n"; } else { return "x.y.z\n"; } } sub usage_text { my ($self) = @_; return <<USAGE; Usage: roary [options] *.gff Options: -p INT number of threads [1] -o STR clusters output filename [clustered_proteins] -f STR output directory [.] -e create a multiFASTA alignment of core genes using PRANK -n fast core gene alignment with MAFFT, use with -e -i minimum percentage identity for blastp [95] -cd FLOAT percentage of isolates a gene must be in to be core [99] -qc generate QC report with Kraken -k STR path to Kraken database for QC, use with -qc -a check dependancies and print versions -b STR blastp executable [blastp] -c STR mcl executable [mcl] -d STR mcxdeblast executable [mcxdeblast] -g INT maximum number of clusters [50000] -m STR makeblastdb executable [makeblastdb] -r create R plots, requires R and ggplot2 -s dont split paralogs -t INT translation table [11] -ap allow paralogs in core alignment -z dont delete intermediate files -v verbose output to STDOUT -w print version and exit -y add gene inference information to spreadsheet, doesnt work with -e -iv STR Change the MCL inflation value [1.5] -h this help message Example: Quickly generate a core gene alignment using 8 threads roary -e --mafft -p 8 *.gff For further info see: http://sanger-pathogens.github.io/Roary/ USAGE } __PACKAGE__->meta->make_immutable; no Moose; 1;