Mercurial > repos > dereeper > roary_plots
diff Roary/lib/Bio/Roary/Output/GroupsMultifastaNucleotide.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Roary/lib/Bio/Roary/Output/GroupsMultifastaNucleotide.pm Fri May 14 20:27:06 2021 +0000 @@ -0,0 +1,211 @@ +package Bio::Roary::Output::GroupsMultifastaNucleotide; + +# ABSTRACT: Take in a GFF files and a groups file and output one multifasta file per group with nucleotide sequences. + +=head1 SYNOPSIS + +Take in a GFF files and a groups file and output one multifasta file per group with nucleotide sequences. + use Bio::Roary::Output::GroupsMultifastas; + + my $obj = Bio::Roary::Output::GroupsMultifastasNucleotide->new( + group_names => ['aaa','bbb'], + ); + $obj->populate_files(); + +=cut + +use Moose; +use Bio::SeqIO; +use File::Path qw(make_path); +use File::Basename; +use File::Copy; +use File::Temp qw/ tempfile /; +use Bio::Roary::Exceptions; +use Bio::Roary::AnalyseGroups; +use Bio::Tools::GFF; +with 'Bio::Roary::BedFromGFFRole'; + +has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 ); +has 'group_names' => ( is => 'ro', isa => 'ArrayRef', required => 0 ); +has 'output_directory' => ( is => 'ro', isa => 'Str', required => 1 ); +has 'pan_reference_groups_seen' => ( is => 'rw', isa => 'HashRef', required => 1 ); +has 'number_of_gff_files' => ( is => 'ro', isa => 'Int', required => 1 ); +has 'pan_reference_filename' => ( is => 'ro', isa => 'Str',default => 'pan_genome_reference.fa' ); +has 'dont_delete_files' => ( is => 'ro', isa => 'Bool',default => 0 ); +has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1.0 ); + +has 'annotate_groups' => ( is => 'ro', isa => 'Bio::Roary::AnnotateGroups', required => 1 ); +has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 ); + +has 'fasta_file' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_fasta_file' ); +has '_input_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__input_seqio' ); + +has 'output_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_filename' ); + +sub _build_output_filename +{ + my ($self) = @_; + my ( $filename, $directories, $suffix ) = fileparse($self->gff_file); + return join('/',($self->output_directory, $filename.'.tmp_nuc_sequences.fa' )); +} + +sub _build__input_seqio { + my ($self) = @_; + return Bio::SeqIO->new( -file => $self->fasta_file, -format => 'Fasta' ); +} + +sub _bed_output_filename { + my ($self) = @_; + return join( '.', ( $self->output_filename, 'intermediate.bed' ) ); +} + +sub populate_files { + my ($self) = @_; + while ( my $input_seq = $self->_input_seqio->next_seq() ) + { + if ( $self->annotate_groups->_ids_to_groups->{$input_seq->display_id} ) + { + my $current_group = $self->annotate_groups->_ids_to_groups->{$input_seq->display_id}; + my $gene_name = $self->annotate_groups->_groups_to_consensus_gene_names->{$current_group}; + + if(! defined($self->pan_reference_groups_seen->{$current_group})) + { + my $pan_output_seq = $self->_pan_genome_reference_io_obj($current_group); + $pan_output_seq->write_seq(Bio::Seq->new( -display_id => $input_seq->display_id, -desc => ($gene_name ? $gene_name : $current_group), -seq => $input_seq->seq ) ); + $self->pan_reference_groups_seen->{$current_group} = 1; + } + + my $number_of_genes = @{$self->annotate_groups->_groups_to_id_names->{$current_group}}; + # Theres no need to align noncore files + next if($self->dont_delete_files == 0 && $number_of_genes < ($self->core_definition * $self->number_of_gff_files )); + + my $output_seq = $self->_group_seq_io_obj($current_group,$number_of_genes); + $output_seq->write_seq($input_seq); + } + } + + unlink($self->fasta_file); + 1; +} + +sub _group_file_name +{ + my ($self,$group_name,$num_group_genes) = @_; + my $annotated_group_name = $self->annotate_groups->_groups_to_consensus_gene_names->{$group_name}; + $annotated_group_name =~ s!\W!_!gi; + my $filename = $annotated_group_name.'.fa'; + my $group_file_name = join('/',($self->output_directory, $filename )); + return $group_file_name; +} + + +sub _pan_genome_reference_io_obj +{ + my ($self) = @_; + return Bio::SeqIO->new( -file => ">>".$self->pan_reference_filename, -format => 'Fasta' ); +} + + +sub _group_seq_io_obj +{ + my ($self,$group_name,$num_group_genes) = @_; + my $filename = $self->_group_file_name($group_name,$num_group_genes); + return Bio::SeqIO->new( -file => ">>".$filename, -format => 'Fasta' ); +} + + +sub _extracted_nucleotide_fasta_file_from_bed_filename { + my ($self) = @_; + return join( '.', ( $self->output_filename, 'intermediate.extracted.fa' ) ); +} + +sub _create_nucleotide_fasta_file_from_gff { + my ($self) = @_; + + open(my $input_fh, $self->gff_file); + open(my $output_fh, '>', $self->_nucleotide_fasta_file_from_gff_filename); + my $at_sequence = 0; + while(<$input_fh>) + { + my $line = $_; + if($line =~/^>/) + { + $at_sequence = 1; + } + + if($at_sequence == 1) + { + print {$output_fh} $line; + } + } + close($input_fh); + close($output_fh); +} + +sub _nucleotide_fasta_file_from_gff_filename { + my ($self) = @_; + return join( '.', ( $self->output_filename, 'intermediate.fa' ) ); +} + +sub _extract_nucleotide_regions { + my ($self) = @_; + + $self->_create_nucleotide_fasta_file_from_gff; + $self->_create_bed_file_from_gff; + + my $cmd = + 'bedtools getfasta -s -fi ' + . $self->_nucleotide_fasta_file_from_gff_filename + . ' -bed ' + . $self->_bed_output_filename . ' -fo ' + . $self->_extracted_nucleotide_fasta_file_from_bed_filename + . ' -name > /dev/null 2>&1'; + system($cmd); + unlink( $self->_nucleotide_fasta_file_from_gff_filename ); + unlink( $self->_bed_output_filename ); + unlink( $self->_nucleotide_fasta_file_from_gff_filename . '.fai' ); + return $self->_extracted_nucleotide_fasta_file_from_bed_filename; +} + +sub _cleanup_fasta { + my ($self,$infile) = @_; + + my($fh, $outfile) = tempfile(); + return unless ( -e $infile ); + + open( my $in, '<', $infile ); + open( my $out, '>', $outfile ); + while ( my $line = <$in> ) { + chomp $line; + if ( $line =~ /^>/ ) + { + $line =~ s/"//g ; + # newer versions of Bedtools add (-) or (+) to the end of the sequence name, remove them + $line =~ s!\([-+]\)!!; + } + + if($line =~ /^(>[^:]+)/) + { + $line = $1; + } + print $out "$line\n"; + } + close $in; + close $out; + + move( $outfile, $infile); + return $infile; +} + + +sub _build_fasta_file { + my ($self) = @_; + my $fasta_filename = $self->_extract_nucleotide_regions; + return $self->_cleanup_fasta($fasta_filename); +} + +no Moose; +__PACKAGE__->meta->make_immutable; + +1; +