Mercurial > repos > dereeper > roary_plots
view Roary/lib/Bio/Roary/ReformatInputGFFs.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
line wrap: on
line source
package Bio::Roary::ReformatInputGFFs; # ABSTRACT: Take in gff files and add suffix where a gene id is seen twice =head1 SYNOPSIS Take in gff files and add suffix where a gene id is seen twice use Bio::Roary::ReformatInputGFFs; my $obj = Bio::Roary::PrepareInputFiles->new( gff_files => ['abc.gff','ddd.faa'], ); $obj->fix_duplicate_gene_ids; $obj->fixed_gff_files; =cut use Moose; use Bio::Roary::Exceptions; use Cwd; use File::Copy; use Log::Log4perl qw(:easy); use Bio::Tools::GFF; use File::Path qw(make_path); use File::Basename; use Digest::MD5::File qw(file_md5_hex); has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger' ); has '_tags_to_filter' => ( is => 'ro', isa => 'Str', default => 'CDS' ); has 'output_directory' => ( is => 'ro', isa => 'Str', default => 'fixed_input_files' ); has 'suffix_counter' => ( is => 'rw', isa => 'Int', default => 1 ); has 'fixed_gff_files' => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } ); sub _build_logger { my ($self) = @_; Log::Log4perl->easy_init( $ERROR ); my $logger = get_logger(); return $logger; } sub fix_duplicate_gene_ids { my ($self) = @_; my %gene_ids_seen_before; my %file_md5s; for my $file ( @{ $self->gff_files } ) { my $digest = file_md5_hex($file); if(defined($file_md5s{$digest})) { $self->logger->warn( "Input files have identical MD5 hashes, only using the first file: ".$file_md5s{$digest}." == ".$file ); next; } else { $file_md5s{$digest} = $file; } my $ids_seen = 0; my $ids_from_file = $self->_get_ids_for_gff_file($file); if ( @{$ids_from_file} < 1 ) { $self->logger->error( "Input GFF file doesnt contain annotation we can use so excluding it from the analysis: $file" ); } else { for my $gene_id ( @{$ids_from_file} ) { if ( $gene_ids_seen_before{$gene_id} ) { $self->logger->error( "Input file contains duplicate gene IDs, attempting to fix by adding a unique suffix, new GFF in the fixed_input_files directory: $file " ); my $updated_file = $self->_add_suffix_to_gene_ids_and_return_new_file($file, $digest); push( @{ $self->fixed_gff_files }, $updated_file ) if ( defined($updated_file) ); $ids_seen = 1; last; } $gene_ids_seen_before{$gene_id}++; } # We know its a valid GFF file since we could open it and extract IDs. # We need to make sure the filenames end in .gff. If it contained duplicate IDs, then they are fixed so nothing to do, but # if they didnt, then we have to double check and repair if necessary. if ( $ids_seen == 0 ) { push( @{ $self->fixed_gff_files }, $self->_fix_gff_file_extension($file) ); } } } return 1; } sub _fix_gff_file_extension { my ( $self, $input_file ) = @_; my ( $filename, $directories, $suffix ) = fileparse( $input_file, qr/\.[^.]*/ ); return $input_file if($suffix eq '.gff'); make_path( $self->output_directory ) if ( !( -d $self->output_directory ) ); my $output_file = $self->output_directory . '/' . $filename . '.gff'; copy($input_file, $output_file) or $self->logger->error("Couldnt copy file with invalid gff extention: $input_file -> $output_file"); return $output_file; } sub _add_suffix_to_gene_ids_and_return_new_file { my ( $self, $input_file, $digest ) = @_; my ( $filename, $directories, $suffix ) = fileparse( $input_file, qr/\.[^.]*/ ); make_path( $self->output_directory ) if ( !( -d $self->output_directory ) ); my $output_file = $self->output_directory . '/' . $filename . '.gff'; open( my $input_gff_fh, $input_file ); open( my $out_gff_fh, '>', $output_file ); # There is a chance that there can be a collision here, but its remote. my $random_locus_tag = "".$digest; $self->logger->warn( "Renamed GFF file from: $input_file -> $output_file" ); $self->logger->warn( "Locus tag used is '$random_locus_tag' for file: $input_file" ); my $found_fasta = 0; my $gene_counter = 1; while (<$input_gff_fh>) { my $line = $_; if ( $line =~ /^\#\#FASTA/ ) { $found_fasta = 1; } if ( $line =~ /\#/ || $found_fasta == 1 ) { print {$out_gff_fh} $line; next; } my @cells = split( /\t/, $line ); my @tags = split( /;/, $cells[8] ); my $found_id = 0; for ( my $i = 0 ; $i < @tags ; $i++ ) { if ( $tags[$i] =~ /^(ID=["']?)([^;"']+)(["']?)/ ) { my $current_id = $2; $current_id .= '___' . $self->suffix_counter; $tags[$i] = $1 .$random_locus_tag.'_'. $gene_counter . $3; $gene_counter++; $found_id++; last; } } if ( $found_id == 0 ) { unshift( @tags, 'ID=' . $random_locus_tag.'_'. $gene_counter ); $gene_counter++; } $cells[8] = join( ';', @tags ); print {$out_gff_fh} join( "\t", @cells ); } if ( $found_fasta == 0 ) { $self->logger->warn( "Input GFF file doesnt appear to have the FASTA sequence at the end of the file so is being excluded from the analysis: $input_file" ); return undef; } close($out_gff_fh); close($input_gff_fh); return $output_file; } sub _get_ids_for_gff_file { my ( $self, $file ) = @_; my @gene_ids; my $tags_regex = $self->_tags_to_filter; my $gffio = Bio::Tools::GFF->new( -file => $file, -gff_version => 3 ); while ( my $feature = $gffio->next_feature() ) { next if !( $feature->primary_tag =~ /$tags_regex/ ); my $gene_id = $self->_get_feature_id($feature); push( @gene_ids, $gene_id ) if ( defined($gene_id) ); } return \@gene_ids; } sub _get_feature_id { my ( $self, $feature ) = @_; my ( $gene_id, @junk ); if ( $feature->has_tag('ID') ) { ( $gene_id, @junk ) = $feature->get_tag_values('ID'); } elsif ( $feature->has_tag('locus_tag') ) { ( $gene_id, @junk ) = $feature->get_tag_values('locus_tag'); } else { return undef; } $gene_id =~ s!["']!!g; return undef if ( $gene_id eq "" ); return $gene_id; } no Moose; __PACKAGE__->meta->make_immutable; 1;