Mercurial > repos > dereeper > roary_plots
comparison Roary/lib/Bio/Roary/ReformatInputGFFs.pm @ 0:c47a5f61bc9f draft
Uploaded
author | dereeper |
---|---|
date | Fri, 14 May 2021 20:27:06 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c47a5f61bc9f |
---|---|
1 package Bio::Roary::ReformatInputGFFs; | |
2 | |
3 # ABSTRACT: Take in gff files and add suffix where a gene id is seen twice | |
4 | |
5 =head1 SYNOPSIS | |
6 | |
7 Take in gff files and add suffix where a gene id is seen twice | |
8 use Bio::Roary::ReformatInputGFFs; | |
9 | |
10 my $obj = Bio::Roary::PrepareInputFiles->new( | |
11 gff_files => ['abc.gff','ddd.faa'], | |
12 ); | |
13 $obj->fix_duplicate_gene_ids; | |
14 $obj->fixed_gff_files; | |
15 | |
16 =cut | |
17 | |
18 use Moose; | |
19 use Bio::Roary::Exceptions; | |
20 use Cwd; | |
21 use File::Copy; | |
22 use Log::Log4perl qw(:easy); | |
23 use Bio::Tools::GFF; | |
24 use File::Path qw(make_path); | |
25 use File::Basename; | |
26 use Digest::MD5::File qw(file_md5_hex); | |
27 | |
28 has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); | |
29 has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger' ); | |
30 has '_tags_to_filter' => ( is => 'ro', isa => 'Str', default => 'CDS' ); | |
31 has 'output_directory' => ( is => 'ro', isa => 'Str', default => 'fixed_input_files' ); | |
32 has 'suffix_counter' => ( is => 'rw', isa => 'Int', default => 1 ); | |
33 | |
34 has 'fixed_gff_files' => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } ); | |
35 | |
36 sub _build_logger { | |
37 my ($self) = @_; | |
38 Log::Log4perl->easy_init( $ERROR ); | |
39 my $logger = get_logger(); | |
40 return $logger; | |
41 } | |
42 | |
43 sub fix_duplicate_gene_ids { | |
44 my ($self) = @_; | |
45 | |
46 my %gene_ids_seen_before; | |
47 | |
48 my %file_md5s; | |
49 | |
50 for my $file ( @{ $self->gff_files } ) { | |
51 my $digest = file_md5_hex($file); | |
52 | |
53 if(defined($file_md5s{$digest})) | |
54 { | |
55 $self->logger->warn( | |
56 "Input files have identical MD5 hashes, only using the first file: ".$file_md5s{$digest}." == ".$file | |
57 ); | |
58 next; | |
59 } | |
60 else | |
61 { | |
62 $file_md5s{$digest} = $file; | |
63 } | |
64 | |
65 my $ids_seen = 0; | |
66 my $ids_from_file = $self->_get_ids_for_gff_file($file); | |
67 | |
68 if ( @{$ids_from_file} < 1 ) { | |
69 $self->logger->error( | |
70 "Input GFF file doesnt contain annotation we can use so excluding it from the analysis: $file" | |
71 ); | |
72 } | |
73 else { | |
74 for my $gene_id ( @{$ids_from_file} ) { | |
75 if ( $gene_ids_seen_before{$gene_id} ) { | |
76 $self->logger->error( | |
77 "Input file contains duplicate gene IDs, attempting to fix by adding a unique suffix, new GFF in the fixed_input_files directory: $file " | |
78 ); | |
79 my $updated_file = $self->_add_suffix_to_gene_ids_and_return_new_file($file, $digest); | |
80 push( @{ $self->fixed_gff_files }, $updated_file ) if ( defined($updated_file) ); | |
81 $ids_seen = 1; | |
82 last; | |
83 } | |
84 $gene_ids_seen_before{$gene_id}++; | |
85 } | |
86 | |
87 # We know its a valid GFF file since we could open it and extract IDs. | |
88 # We need to make sure the filenames end in .gff. If it contained duplicate IDs, then they are fixed so nothing to do, but | |
89 # if they didnt, then we have to double check and repair if necessary. | |
90 if ( $ids_seen == 0 ) { | |
91 | |
92 | |
93 push( @{ $self->fixed_gff_files }, $self->_fix_gff_file_extension($file) ); | |
94 } | |
95 } | |
96 } | |
97 return 1; | |
98 } | |
99 | |
100 sub _fix_gff_file_extension | |
101 { | |
102 my ( $self, $input_file ) = @_; | |
103 | |
104 my ( $filename, $directories, $suffix ) = fileparse( $input_file, qr/\.[^.]*/ ); | |
105 return $input_file if($suffix eq '.gff'); | |
106 | |
107 | |
108 make_path( $self->output_directory ) if ( !( -d $self->output_directory ) ); | |
109 my $output_file = $self->output_directory . '/' . $filename . '.gff'; | |
110 copy($input_file, $output_file) or $self->logger->error("Couldnt copy file with invalid gff extention: $input_file -> $output_file"); | |
111 return $output_file; | |
112 } | |
113 | |
114 | |
115 sub _add_suffix_to_gene_ids_and_return_new_file { | |
116 my ( $self, $input_file, $digest ) = @_; | |
117 my ( $filename, $directories, $suffix ) = fileparse( $input_file, qr/\.[^.]*/ ); | |
118 make_path( $self->output_directory ) if ( !( -d $self->output_directory ) ); | |
119 my $output_file = $self->output_directory . '/' . $filename . '.gff'; | |
120 | |
121 open( my $input_gff_fh, $input_file ); | |
122 open( my $out_gff_fh, '>', $output_file ); | |
123 | |
124 # There is a chance that there can be a collision here, but its remote. | |
125 my $random_locus_tag = "".$digest; | |
126 | |
127 $self->logger->warn( | |
128 "Renamed GFF file from: $input_file -> $output_file" ); | |
129 $self->logger->warn( | |
130 "Locus tag used is '$random_locus_tag' for file: $input_file" ); | |
131 | |
132 my $found_fasta = 0; | |
133 my $gene_counter = 1; | |
134 while (<$input_gff_fh>) { | |
135 my $line = $_; | |
136 | |
137 if ( $line =~ /^\#\#FASTA/ ) { | |
138 $found_fasta = 1; | |
139 } | |
140 | |
141 if ( $line =~ /\#/ || $found_fasta == 1 ) { | |
142 print {$out_gff_fh} $line; | |
143 next; | |
144 } | |
145 | |
146 my @cells = split( /\t/, $line ); | |
147 my @tags = split( /;/, $cells[8] ); | |
148 my $found_id = 0; | |
149 for ( my $i = 0 ; $i < @tags ; $i++ ) { | |
150 if ( $tags[$i] =~ /^(ID=["']?)([^;"']+)(["']?)/ ) { | |
151 my $current_id = $2; | |
152 $current_id .= '___' . $self->suffix_counter; | |
153 $tags[$i] = $1 .$random_locus_tag.'_'. $gene_counter . $3; | |
154 $gene_counter++; | |
155 $found_id++; | |
156 last; | |
157 } | |
158 } | |
159 if ( $found_id == 0 ) { | |
160 unshift( @tags, 'ID=' . $random_locus_tag.'_'. $gene_counter ); | |
161 $gene_counter++; | |
162 } | |
163 $cells[8] = join( ';', @tags ); | |
164 print {$out_gff_fh} join( "\t", @cells ); | |
165 } | |
166 | |
167 if ( $found_fasta == 0 ) { | |
168 $self->logger->warn( | |
169 "Input GFF file doesnt appear to have the FASTA sequence at the end of the file so is being excluded from the analysis: $input_file" ); | |
170 return undef; | |
171 } | |
172 close($out_gff_fh); | |
173 close($input_gff_fh); | |
174 return $output_file; | |
175 } | |
176 | |
177 sub _get_ids_for_gff_file { | |
178 my ( $self, $file ) = @_; | |
179 my @gene_ids; | |
180 my $tags_regex = $self->_tags_to_filter; | |
181 my $gffio = Bio::Tools::GFF->new( -file => $file, -gff_version => 3 ); | |
182 while ( my $feature = $gffio->next_feature() ) { | |
183 next if !( $feature->primary_tag =~ /$tags_regex/ ); | |
184 my $gene_id = $self->_get_feature_id($feature); | |
185 push( @gene_ids, $gene_id ) if ( defined($gene_id) ); | |
186 } | |
187 return \@gene_ids; | |
188 } | |
189 | |
190 sub _get_feature_id { | |
191 my ( $self, $feature ) = @_; | |
192 my ( $gene_id, @junk ); | |
193 if ( $feature->has_tag('ID') ) { | |
194 ( $gene_id, @junk ) = $feature->get_tag_values('ID'); | |
195 } | |
196 elsif ( $feature->has_tag('locus_tag') ) { | |
197 ( $gene_id, @junk ) = $feature->get_tag_values('locus_tag'); | |
198 } | |
199 else { | |
200 return undef; | |
201 } | |
202 $gene_id =~ s!["']!!g; | |
203 return undef if ( $gene_id eq "" ); | |
204 return $gene_id; | |
205 } | |
206 | |
207 no Moose; | |
208 __PACKAGE__->meta->make_immutable; | |
209 | |
210 1; |