Mercurial > repos > cpt > cpt_psm_recombine
comparison lib/CPT/GBK2GFF3.pm @ 1:97ef96676b48 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:51:26 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:b18e8268bf4e | 1:97ef96676b48 |
---|---|
1 package CPT::GBK2GFF3; | |
2 use strict; | |
3 use warnings; | |
4 use CPT; | |
5 use Data::Dumper; | |
6 use autodie; | |
7 use Bio::SeqIO; | |
8 use Moose; | |
9 | |
10 has data => ( | |
11 is => 'rw', | |
12 isa => 'ArrayRef', | |
13 default => sub { [] }, | |
14 ); | |
15 has header => ( | |
16 is => 'rw', | |
17 isa => 'ArrayRef', | |
18 ); | |
19 has sequence => ( | |
20 is => 'rw', | |
21 isa => 'Str', | |
22 ); | |
23 has seqid => ( | |
24 is => 'rw', | |
25 isa => 'Str', | |
26 ); | |
27 has genbank => ( | |
28 is => 'rw', | |
29 isa => 'Str', | |
30 ); | |
31 has is_circular => ( | |
32 is => 'rw', | |
33 isa => 'Str', | |
34 ); | |
35 has id_prefix => ( is => 'rw', isa => 'Str' ); | |
36 has global_feat_idx => ( is => 'rw', isa => 'Int', default => sub { 0 } ); | |
37 has source => ( is => 'rw', isa => 'ArrayRef'); | |
38 | |
39 my %feat_type_count; | |
40 my %reserved_keys = map { $_ => 1 } | |
41 qw/ID Name Alias Parent Target Gap Derives_from Note Dbxref Ontology_term/; | |
42 | |
43 use CPT::Bio; | |
44 my $bio = CPT::Bio->new(); | |
45 my %key_mapping = ( | |
46 "-" => [ "located_sequence_feature", "SO:0000110" ], | |
47 "-10_signal" => [ "minus_10_signal", "SO:0000175" ], | |
48 "-35_signal" => [ "minus_35_signal", "SO:0000176" ], | |
49 "3'UTR" => [ "three_prime_UTR", "SO:0000205" ], | |
50 "3'clip" => [ "three_prime_clip", "SO:0000557" ], | |
51 "5'UTR" => [ "five_prime_UTR", "SO:0000204" ], | |
52 "5'clip" => [ "five_prime_clip", "SO:0000555" ], | |
53 "CAAT_signal" => [ "CAAT_signal", "SO:0000172" ], | |
54 "CDS" => [ "CDS", "SO:0000316" ], | |
55 "D-loop" => [ "D_loop", "SO:0000297" ], | |
56 "D_segment" => [ "D_gene", "SO:0000458" ], | |
57 "GC_signal" => [ "GC_rich_region", "SO:0000173" ], | |
58 "LTR" => [ "long_terminal_repeat", "SO:0000286" ], | |
59 "RBS" => [ "ribosome_entry_site", "SO:0000139" ], | |
60 "STS" => [ "STS", "SO:0000331" ], | |
61 "TATA_signal" => [ "TATA_box", "SO:0000174" ], | |
62 "attenuator" => [ "attenuator", "SO:0000140" ], | |
63 "enhancer" => [ "enhancer", "SO:0000165" ], | |
64 "exon" => [ "exon", "SO:0000147" ], | |
65 "gap" => [ "gap", "SO:0000730" ], | |
66 "gene" => [ "gene", "SO:0000704" ], | |
67 "iDNA" => [ "iDNA", "SO:0000723" ], | |
68 "intron" => [ "intron", "SO:0000188" ], | |
69 "mRNA" => [ "mRNA", "SO:0000234" ], | |
70 "mat_peptide" => [ "mature_protein_region", "SO:0000419" ], | |
71 "misc_RNA" => [ "transcript", "SO:0000673" ], | |
72 "misc_binding" => [ "binding_site", "SO:0000409" ], | |
73 "misc_difference" => [ "sequence_difference", "SO:0000413" ], | |
74 "misc_feature" => [ "region", "SO:0000001" ], | |
75 "misc_recomb" => [ "recombination_feature", "SO:0000298" ], | |
76 "misc_signal" => [ "regulatory_region", "SO:0005836" ], | |
77 "misc_structure" => [ "sequence_secondary_structure", "SO:0000002" ], | |
78 "modified_base" => [ "modified_DNA_base", "SO:0000305" ], | |
79 "operon" => [ "operon", "SO:0000178" ], | |
80 "oriT" => [ "origin_of_transfer", "SO:0000724" ], | |
81 "polyA_signal" => [ "polyA_signal_sequence", "SO:0000551" ], | |
82 "polyA_site" => [ "polyA_site", "SO:0000553" ], | |
83 "precursor_RNA" => [ "primary_transcript", "SO:0000185" ], | |
84 "prim_transcript" => [ "primary_transcript", "SO:0000185" ], | |
85 "primer_bind" => [ "primer_binding_site", "SO:0005850" ], | |
86 "promoter" => [ "promoter", "SO:0000167" ], | |
87 "protein_bind" => [ "protein_binding_site", "SO:0000410" ], | |
88 "rRNA" => [ "rRNA", "SO:0000252" ], | |
89 "repeat_region" => [ "repeat_region", "SO:0000657" ], | |
90 "repeat_unit" => [ "repeat_unit", "SO:0000726" ], | |
91 "satellite" => [ "satellite_DNA", "SO:0000005" ], | |
92 "scRNA" => [ "scRNA", "SO:0000013" ], | |
93 "sig_peptide" => [ "signal_peptide", "SO:0000418" ], | |
94 "snRNA" => [ "snRNA", "SO:0000274" ], | |
95 "snoRNA" => [ "snoRNA", "SO:0000275" ], | |
96 "source" => [ "contig", "SO:0000149" ], # manually modified | |
97 "stem_loop" => [ "stem_loop", "SO:0000313" ], | |
98 "tRNA" => [ "tRNA", "SO:0000253" ], | |
99 "terminator" => [ "terminator", "SO:0000141" ], | |
100 "transit_peptide" => [ "transit_peptide", "SO:0000725" ], | |
101 "variation" => [ "sequence_variant", "SO:0000109" ], | |
102 ); | |
103 | |
104 | |
105 sub get_gff3_file { | |
106 my ($self) = @_; | |
107 my @output; | |
108 for my $header_line ( @{ $self->header() } ) { | |
109 push( @output, $header_line ); | |
110 } | |
111 push( @output, join( "\t", @{ $self->get_source } ) ); | |
112 for my $data_line ( @{ $self->data() }) { | |
113 push( @output, join( "\t", @{$data_line} ) ); | |
114 } | |
115 push( @output, '##FASTA' ); | |
116 push( @output, '>' . $self->seqid() ); | |
117 my $seq = $self->sequence(); | |
118 $seq =~ s/(.{80})/$1\n/g; | |
119 push( @output, $seq ); | |
120 return join( "\n", @output ); | |
121 } | |
122 | |
123 sub escape { | |
124 my ($self, $str) = @_; | |
125 $str =~ s/,/%2C/g; | |
126 $str =~ s/=/%3D/g; | |
127 $str =~ s/;/%3B/g; | |
128 $str =~ s/\t/%09/g; | |
129 return $str; | |
130 } | |
131 | |
132 sub FT_SO_map { | |
133 my ( $self, $key ) = @_; | |
134 if ( $key_mapping{$key} ) { | |
135 my @result = @{ $key_mapping{$key} }; | |
136 return $result[0]; | |
137 } | |
138 else { | |
139 return 'region'; | |
140 } | |
141 } | |
142 | |
143 sub source_map { | |
144 my ( $self, $type ) = @_; | |
145 if ( $self->{'override_source'} ) { | |
146 return $self->{'override_source'}; | |
147 } | |
148 if ( $self->{'annotation_software'}{$type} ) { | |
149 return $self->{'annotation_software'}{$type}; | |
150 } | |
151 else { | |
152 return '.'; | |
153 } | |
154 } | |
155 | |
156 sub get_attrs { | |
157 my ( $self, %data ) = @_; | |
158 my $feature = $data{'feat'}; | |
159 my $parents_ref = $data{'parents'}; | |
160 | |
161 my %attrs = (); | |
162 $attrs{'ID'} = | |
163 $self->id_prefix() . '.' . ($self->global_feat_idx()); | |
164 | |
165 $self->global_feat_idx($self->global_feat_idx()+1); | |
166 | |
167 # Handle Identifier | |
168 my $identifier = $bio->_getIdentifier($feature); | |
169 if ( $identifier ne 'ERROR' ) { | |
170 $attrs{'Name'} = $identifier . '.' . $feature->primary_tag; | |
171 } | |
172 | |
173 # Handle parents, if there are any | |
174 if ($parents_ref) { | |
175 if ( ref($parents_ref) eq 'ARRAY' ) { | |
176 $attrs{'Parent'} = $parents_ref; | |
177 } | |
178 else { | |
179 $attrs{'Parent'} = [$parents_ref]; | |
180 } | |
181 } | |
182 | |
183 # These are otherwise "Special" keys that need to be handled differently. | |
184 if ( $feature->has_tag('note') ) { | |
185 my @notes = $feature->get_tag_values('note'); | |
186 $attrs{'Note'} = \@notes; | |
187 } | |
188 if ( $feature->has_tag('db_xref') ) { | |
189 my @dbxref = $feature->get_tag_values('db_xref'); | |
190 $attrs{'Dbxref'} = \@dbxref; | |
191 } | |
192 | |
193 # Do the rest | |
194 for my $tag ( $feature->get_all_tags() ) { | |
195 | |
196 # If not one of the specially handled ones | |
197 if ( $tag ne 'name' && $tag ne 'note' && $tag ne 'db_xref' ) { | |
198 | |
199 # If not a reserved_key | |
200 if ( !$reserved_keys{$tag} ) { | |
201 my @vals = $feature->get_tag_values($tag); | |
202 $attrs{lc($tag)} = \@vals; | |
203 } | |
204 else { | |
205 warn | |
206 "Trying to set a reserved key $tag with value $attrs{$tag}"; | |
207 } | |
208 } | |
209 } | |
210 my %response = ( | |
211 id => $attrs{'ID'}, | |
212 attr_str => $self->post_process_attribute_string(%attrs), | |
213 ); | |
214 return %response; | |
215 } | |
216 | |
217 sub post_process_attribute_string { | |
218 my ($self,%attrs) = @_; | |
219 my @parts = (); | |
220 for my $k ( keys %attrs ) { | |
221 | |
222 # IGNORED TAGS | |
223 if ( $k ne 'translation' && $k ne 'product' ) { | |
224 my $joined = | |
225 $self->escape_and_join_attribute_subpart( $attrs{$k} ); | |
226 push @parts, "$k=$joined"; | |
227 } | |
228 } | |
229 | |
230 #print STDERR join("\t",@parts),"\n"; | |
231 return join( ";", @parts ); | |
232 | |
233 } | |
234 | |
235 sub escape_and_join_attribute_subpart { | |
236 my ($self,$ref) = @_; | |
237 if ( ref($ref) eq 'ARRAY' ) { | |
238 my @attrs = @{$ref}; | |
239 return join( ",", map { $self->escape($_) } @attrs ); | |
240 } | |
241 else { #scalar | |
242 return $self->escape($ref); | |
243 } | |
244 } | |
245 | |
246 sub get_source { | |
247 my ($self) = @_; | |
248 if ( !$self->source() ) { | |
249 $self->auto_source(); | |
250 } | |
251 return $self->source(); | |
252 } | |
253 | |
254 sub auto_source { | |
255 | |
256 # Auto generate a source feature, in case there isn't one. | |
257 my ($self) = @_; | |
258 my @region = ( | |
259 $self->seqid(), | |
260 ( $self->genbank() ? 'Genbank' : 'Assembly' ), | |
261 'contig', | |
262 1, | |
263 $self->get_length, | |
264 '.', | |
265 '.', | |
266 '.', | |
267 sprintf( "ID=%s;Name=%s", $self->seqid(), $self->seqid() ) | |
268 . ( $self->is_circular() ? ";Is_circular=True" : "" ) | |
269 ); | |
270 $self->source(\@region); | |
271 } | |
272 | |
273 sub add_feature { | |
274 my ( $self, $feat ) = @_; | |
275 my $primary_tag = $feat->primary_tag; | |
276 if ( $primary_tag eq 'CDS' ) { | |
277 my ( $id, $data_0 ) = $self->_add_gene( feat => $feat, ); | |
278 my ($data_1) = $self->_add_feature( | |
279 feat => $feat, | |
280 parent => $id, | |
281 ); | |
282 $self->_low_level_add_feature($data_0); | |
283 $self->_low_level_add_feature($data_1); | |
284 } | |
285 elsif ( $primary_tag eq 'source' ) { | |
286 my ($data) = $self->_add_feature( feat => $feat, ); | |
287 | |
288 # YUCK. | |
289 my @z = @{$data}; | |
290 my $seqid = $self->seqid(); | |
291 $z[8] =~ s/ID=[^;]*;/ID=$seqid;/g; | |
292 $self->source( \@z ); | |
293 } | |
294 elsif ( $primary_tag ne 'gene' ) { | |
295 my ($data) = $self->_add_feature( feat => $feat, ); | |
296 $self->_low_level_add_feature($data); | |
297 } | |
298 | |
299 } | |
300 | |
301 sub _low_level_add_feature { | |
302 my ( $self, $data ) = @_; | |
303 push( @{ $self->data() }, $data ); | |
304 } | |
305 | |
306 sub _add_feature { | |
307 my ( $self, %data ) = @_; | |
308 my %attrs = | |
309 $self->get_attrs( feat => $data{feat}, parents => $data{parent} ); | |
310 my @data = ( | |
311 $self->seqid(), | |
312 $self->source_map( $data{feat}->primary_tag ), | |
313 $self->FT_SO_map( $data{feat}->primary_tag ), | |
314 $data{feat}->start, | |
315 $data{feat}->end, | |
316 '.', | |
317 ( $data{feat}->strand == 1 ? '+' : '-' ), | |
318 '.', | |
319 $attrs{attr_str} | |
320 ); | |
321 | |
322 #$self->_low_level_add_feature(\@data); | |
323 return \@data; | |
324 } | |
325 | |
326 sub _add_gene { | |
327 my ( $self, %data ) = @_; | |
328 my $id = $self->id_prefix() . '.' . ( $self->global_feat_idx()); | |
329 $self->global_feat_idx($self->global_feat_idx()+1); | |
330 my $gene_count = ++$feat_type_count{'gene'}; | |
331 my @data = ( | |
332 $self->seqid(), | |
333 $self->source_map( $data{feat}->primary_tag ), | |
334 $self->FT_SO_map('gene'), | |
335 $data{feat}->start, | |
336 $data{feat}->end, | |
337 '.', | |
338 ( $data{feat}->strand == 1 ? '+' : '-' ), | |
339 '.', | |
340 "ID=$id;Name=Gene$gene_count" | |
341 ); | |
342 | |
343 #$self->_low_level_add_feature(\@data); | |
344 return ( $id, \@data ); | |
345 } | |
346 | |
347 no Moose; | |
348 1; | |
349 | |
350 __END__ | |
351 | |
352 =pod | |
353 | |
354 =encoding UTF-8 | |
355 | |
356 =head1 NAME | |
357 | |
358 CPT::GBK2GFF3 | |
359 | |
360 =head1 VERSION | |
361 | |
362 version 1.99.4 | |
363 | |
364 =head1 AUTHOR | |
365 | |
366 Eric Rasche <rasche.eric@yandex.ru> | |
367 | |
368 =head1 COPYRIGHT AND LICENSE | |
369 | |
370 This software is Copyright (c) 2014 by Eric Rasche. | |
371 | |
372 This is free software, licensed under: | |
373 | |
374 The GNU General Public License, Version 3, June 2007 | |
375 | |
376 =cut |