comparison Roary/lib/Bio/Roary/ReformatInputGFFs.pm @ 0:c47a5f61bc9f draft

Uploaded
author dereeper
date Fri, 14 May 2021 20:27:06 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c47a5f61bc9f
1 package Bio::Roary::ReformatInputGFFs;
2
3 # ABSTRACT: Take in gff files and add suffix where a gene id is seen twice
4
5 =head1 SYNOPSIS
6
7 Take in gff files and add suffix where a gene id is seen twice
8 use Bio::Roary::ReformatInputGFFs;
9
10 my $obj = Bio::Roary::PrepareInputFiles->new(
11 gff_files => ['abc.gff','ddd.faa'],
12 );
13 $obj->fix_duplicate_gene_ids;
14 $obj->fixed_gff_files;
15
16 =cut
17
18 use Moose;
19 use Bio::Roary::Exceptions;
20 use Cwd;
21 use File::Copy;
22 use Log::Log4perl qw(:easy);
23 use Bio::Tools::GFF;
24 use File::Path qw(make_path);
25 use File::Basename;
26 use Digest::MD5::File qw(file_md5_hex);
27
28 has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
29 has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger' );
30 has '_tags_to_filter' => ( is => 'ro', isa => 'Str', default => 'CDS' );
31 has 'output_directory' => ( is => 'ro', isa => 'Str', default => 'fixed_input_files' );
32 has 'suffix_counter' => ( is => 'rw', isa => 'Int', default => 1 );
33
34 has 'fixed_gff_files' => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } );
35
36 sub _build_logger {
37 my ($self) = @_;
38 Log::Log4perl->easy_init( $ERROR );
39 my $logger = get_logger();
40 return $logger;
41 }
42
43 sub fix_duplicate_gene_ids {
44 my ($self) = @_;
45
46 my %gene_ids_seen_before;
47
48 my %file_md5s;
49
50 for my $file ( @{ $self->gff_files } ) {
51 my $digest = file_md5_hex($file);
52
53 if(defined($file_md5s{$digest}))
54 {
55 $self->logger->warn(
56 "Input files have identical MD5 hashes, only using the first file: ".$file_md5s{$digest}." == ".$file
57 );
58 next;
59 }
60 else
61 {
62 $file_md5s{$digest} = $file;
63 }
64
65 my $ids_seen = 0;
66 my $ids_from_file = $self->_get_ids_for_gff_file($file);
67
68 if ( @{$ids_from_file} < 1 ) {
69 $self->logger->error(
70 "Input GFF file doesnt contain annotation we can use so excluding it from the analysis: $file"
71 );
72 }
73 else {
74 for my $gene_id ( @{$ids_from_file} ) {
75 if ( $gene_ids_seen_before{$gene_id} ) {
76 $self->logger->error(
77 "Input file contains duplicate gene IDs, attempting to fix by adding a unique suffix, new GFF in the fixed_input_files directory: $file "
78 );
79 my $updated_file = $self->_add_suffix_to_gene_ids_and_return_new_file($file, $digest);
80 push( @{ $self->fixed_gff_files }, $updated_file ) if ( defined($updated_file) );
81 $ids_seen = 1;
82 last;
83 }
84 $gene_ids_seen_before{$gene_id}++;
85 }
86
87 # We know its a valid GFF file since we could open it and extract IDs.
88 # We need to make sure the filenames end in .gff. If it contained duplicate IDs, then they are fixed so nothing to do, but
89 # if they didnt, then we have to double check and repair if necessary.
90 if ( $ids_seen == 0 ) {
91
92
93 push( @{ $self->fixed_gff_files }, $self->_fix_gff_file_extension($file) );
94 }
95 }
96 }
97 return 1;
98 }
99
100 sub _fix_gff_file_extension
101 {
102 my ( $self, $input_file ) = @_;
103
104 my ( $filename, $directories, $suffix ) = fileparse( $input_file, qr/\.[^.]*/ );
105 return $input_file if($suffix eq '.gff');
106
107
108 make_path( $self->output_directory ) if ( !( -d $self->output_directory ) );
109 my $output_file = $self->output_directory . '/' . $filename . '.gff';
110 copy($input_file, $output_file) or $self->logger->error("Couldnt copy file with invalid gff extention: $input_file -> $output_file");
111 return $output_file;
112 }
113
114
115 sub _add_suffix_to_gene_ids_and_return_new_file {
116 my ( $self, $input_file, $digest ) = @_;
117 my ( $filename, $directories, $suffix ) = fileparse( $input_file, qr/\.[^.]*/ );
118 make_path( $self->output_directory ) if ( !( -d $self->output_directory ) );
119 my $output_file = $self->output_directory . '/' . $filename . '.gff';
120
121 open( my $input_gff_fh, $input_file );
122 open( my $out_gff_fh, '>', $output_file );
123
124 # There is a chance that there can be a collision here, but its remote.
125 my $random_locus_tag = "".$digest;
126
127 $self->logger->warn(
128 "Renamed GFF file from: $input_file -> $output_file" );
129 $self->logger->warn(
130 "Locus tag used is '$random_locus_tag' for file: $input_file" );
131
132 my $found_fasta = 0;
133 my $gene_counter = 1;
134 while (<$input_gff_fh>) {
135 my $line = $_;
136
137 if ( $line =~ /^\#\#FASTA/ ) {
138 $found_fasta = 1;
139 }
140
141 if ( $line =~ /\#/ || $found_fasta == 1 ) {
142 print {$out_gff_fh} $line;
143 next;
144 }
145
146 my @cells = split( /\t/, $line );
147 my @tags = split( /;/, $cells[8] );
148 my $found_id = 0;
149 for ( my $i = 0 ; $i < @tags ; $i++ ) {
150 if ( $tags[$i] =~ /^(ID=["']?)([^;"']+)(["']?)/ ) {
151 my $current_id = $2;
152 $current_id .= '___' . $self->suffix_counter;
153 $tags[$i] = $1 .$random_locus_tag.'_'. $gene_counter . $3;
154 $gene_counter++;
155 $found_id++;
156 last;
157 }
158 }
159 if ( $found_id == 0 ) {
160 unshift( @tags, 'ID=' . $random_locus_tag.'_'. $gene_counter );
161 $gene_counter++;
162 }
163 $cells[8] = join( ';', @tags );
164 print {$out_gff_fh} join( "\t", @cells );
165 }
166
167 if ( $found_fasta == 0 ) {
168 $self->logger->warn(
169 "Input GFF file doesnt appear to have the FASTA sequence at the end of the file so is being excluded from the analysis: $input_file" );
170 return undef;
171 }
172 close($out_gff_fh);
173 close($input_gff_fh);
174 return $output_file;
175 }
176
177 sub _get_ids_for_gff_file {
178 my ( $self, $file ) = @_;
179 my @gene_ids;
180 my $tags_regex = $self->_tags_to_filter;
181 my $gffio = Bio::Tools::GFF->new( -file => $file, -gff_version => 3 );
182 while ( my $feature = $gffio->next_feature() ) {
183 next if !( $feature->primary_tag =~ /$tags_regex/ );
184 my $gene_id = $self->_get_feature_id($feature);
185 push( @gene_ids, $gene_id ) if ( defined($gene_id) );
186 }
187 return \@gene_ids;
188 }
189
190 sub _get_feature_id {
191 my ( $self, $feature ) = @_;
192 my ( $gene_id, @junk );
193 if ( $feature->has_tag('ID') ) {
194 ( $gene_id, @junk ) = $feature->get_tag_values('ID');
195 }
196 elsif ( $feature->has_tag('locus_tag') ) {
197 ( $gene_id, @junk ) = $feature->get_tag_values('locus_tag');
198 }
199 else {
200 return undef;
201 }
202 $gene_id =~ s!["']!!g;
203 return undef if ( $gene_id eq "" );
204 return $gene_id;
205 }
206
207 no Moose;
208 __PACKAGE__->meta->make_immutable;
209
210 1;