Mercurial > repos > dereeper > roary_plots
view Roary/lib/Bio/Roary/Output/GroupsMultifastaNucleotide.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
line wrap: on
line source
package Bio::Roary::Output::GroupsMultifastaNucleotide; # ABSTRACT: Take in a GFF files and a groups file and output one multifasta file per group with nucleotide sequences. =head1 SYNOPSIS Take in a GFF files and a groups file and output one multifasta file per group with nucleotide sequences. use Bio::Roary::Output::GroupsMultifastas; my $obj = Bio::Roary::Output::GroupsMultifastasNucleotide->new( group_names => ['aaa','bbb'], ); $obj->populate_files(); =cut use Moose; use Bio::SeqIO; use File::Path qw(make_path); use File::Basename; use File::Copy; use File::Temp qw/ tempfile /; use Bio::Roary::Exceptions; use Bio::Roary::AnalyseGroups; use Bio::Tools::GFF; with 'Bio::Roary::BedFromGFFRole'; has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 ); has 'group_names' => ( is => 'ro', isa => 'ArrayRef', required => 0 ); has 'output_directory' => ( is => 'ro', isa => 'Str', required => 1 ); has 'pan_reference_groups_seen' => ( is => 'rw', isa => 'HashRef', required => 1 ); has 'number_of_gff_files' => ( is => 'ro', isa => 'Int', required => 1 ); has 'pan_reference_filename' => ( is => 'ro', isa => 'Str',default => 'pan_genome_reference.fa' ); has 'dont_delete_files' => ( is => 'ro', isa => 'Bool',default => 0 ); has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1.0 ); has 'annotate_groups' => ( is => 'ro', isa => 'Bio::Roary::AnnotateGroups', required => 1 ); has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 ); has 'fasta_file' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_fasta_file' ); has '_input_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__input_seqio' ); has 'output_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_filename' ); sub _build_output_filename { my ($self) = @_; my ( $filename, $directories, $suffix ) = fileparse($self->gff_file); return join('/',($self->output_directory, $filename.'.tmp_nuc_sequences.fa' )); } sub _build__input_seqio { my ($self) = @_; return Bio::SeqIO->new( -file => $self->fasta_file, -format => 'Fasta' ); } sub _bed_output_filename { my ($self) = @_; return join( '.', ( $self->output_filename, 'intermediate.bed' ) ); } sub populate_files { my ($self) = @_; while ( my $input_seq = $self->_input_seqio->next_seq() ) { if ( $self->annotate_groups->_ids_to_groups->{$input_seq->display_id} ) { my $current_group = $self->annotate_groups->_ids_to_groups->{$input_seq->display_id}; my $gene_name = $self->annotate_groups->_groups_to_consensus_gene_names->{$current_group}; if(! defined($self->pan_reference_groups_seen->{$current_group})) { my $pan_output_seq = $self->_pan_genome_reference_io_obj($current_group); $pan_output_seq->write_seq(Bio::Seq->new( -display_id => $input_seq->display_id, -desc => ($gene_name ? $gene_name : $current_group), -seq => $input_seq->seq ) ); $self->pan_reference_groups_seen->{$current_group} = 1; } my $number_of_genes = @{$self->annotate_groups->_groups_to_id_names->{$current_group}}; # Theres no need to align noncore files next if($self->dont_delete_files == 0 && $number_of_genes < ($self->core_definition * $self->number_of_gff_files )); my $output_seq = $self->_group_seq_io_obj($current_group,$number_of_genes); $output_seq->write_seq($input_seq); } } unlink($self->fasta_file); 1; } sub _group_file_name { my ($self,$group_name,$num_group_genes) = @_; my $annotated_group_name = $self->annotate_groups->_groups_to_consensus_gene_names->{$group_name}; $annotated_group_name =~ s!\W!_!gi; my $filename = $annotated_group_name.'.fa'; my $group_file_name = join('/',($self->output_directory, $filename )); return $group_file_name; } sub _pan_genome_reference_io_obj { my ($self) = @_; return Bio::SeqIO->new( -file => ">>".$self->pan_reference_filename, -format => 'Fasta' ); } sub _group_seq_io_obj { my ($self,$group_name,$num_group_genes) = @_; my $filename = $self->_group_file_name($group_name,$num_group_genes); return Bio::SeqIO->new( -file => ">>".$filename, -format => 'Fasta' ); } sub _extracted_nucleotide_fasta_file_from_bed_filename { my ($self) = @_; return join( '.', ( $self->output_filename, 'intermediate.extracted.fa' ) ); } sub _create_nucleotide_fasta_file_from_gff { my ($self) = @_; open(my $input_fh, $self->gff_file); open(my $output_fh, '>', $self->_nucleotide_fasta_file_from_gff_filename); my $at_sequence = 0; while(<$input_fh>) { my $line = $_; if($line =~/^>/) { $at_sequence = 1; } if($at_sequence == 1) { print {$output_fh} $line; } } close($input_fh); close($output_fh); } sub _nucleotide_fasta_file_from_gff_filename { my ($self) = @_; return join( '.', ( $self->output_filename, 'intermediate.fa' ) ); } sub _extract_nucleotide_regions { my ($self) = @_; $self->_create_nucleotide_fasta_file_from_gff; $self->_create_bed_file_from_gff; my $cmd = 'bedtools getfasta -s -fi ' . $self->_nucleotide_fasta_file_from_gff_filename . ' -bed ' . $self->_bed_output_filename . ' -fo ' . $self->_extracted_nucleotide_fasta_file_from_bed_filename . ' -name > /dev/null 2>&1'; system($cmd); unlink( $self->_nucleotide_fasta_file_from_gff_filename ); unlink( $self->_bed_output_filename ); unlink( $self->_nucleotide_fasta_file_from_gff_filename . '.fai' ); return $self->_extracted_nucleotide_fasta_file_from_bed_filename; } sub _cleanup_fasta { my ($self,$infile) = @_; my($fh, $outfile) = tempfile(); return unless ( -e $infile ); open( my $in, '<', $infile ); open( my $out, '>', $outfile ); while ( my $line = <$in> ) { chomp $line; if ( $line =~ /^>/ ) { $line =~ s/"//g ; # newer versions of Bedtools add (-) or (+) to the end of the sequence name, remove them $line =~ s!\([-+]\)!!; } if($line =~ /^(>[^:]+)/) { $line = $1; } print $out "$line\n"; } close $in; close $out; move( $outfile, $infile); return $infile; } sub _build_fasta_file { my ($self) = @_; my $fasta_filename = $self->_extract_nucleotide_regions; return $self->_cleanup_fasta($fasta_filename); } no Moose; __PACKAGE__->meta->make_immutable; 1;