Mercurial > repos > cpt > cpt_psm_recombine
diff lib/CPT/GBK2GFF3.pm @ 1:97ef96676b48 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:51:26 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lib/CPT/GBK2GFF3.pm Mon Jun 05 02:51:26 2023 +0000 @@ -0,0 +1,376 @@ +package CPT::GBK2GFF3; +use strict; +use warnings; +use CPT; +use Data::Dumper; +use autodie; +use Bio::SeqIO; +use Moose; + +has data => ( + is => 'rw', + isa => 'ArrayRef', + default => sub { [] }, +); +has header => ( + is => 'rw', + isa => 'ArrayRef', +); +has sequence => ( + is => 'rw', + isa => 'Str', +); +has seqid => ( + is => 'rw', + isa => 'Str', +); +has genbank => ( + is => 'rw', + isa => 'Str', +); +has is_circular => ( + is => 'rw', + isa => 'Str', +); +has id_prefix => ( is => 'rw', isa => 'Str' ); +has global_feat_idx => ( is => 'rw', isa => 'Int', default => sub { 0 } ); +has source => ( is => 'rw', isa => 'ArrayRef'); + +my %feat_type_count; +my %reserved_keys = map { $_ => 1 } + qw/ID Name Alias Parent Target Gap Derives_from Note Dbxref Ontology_term/; + +use CPT::Bio; +my $bio = CPT::Bio->new(); +my %key_mapping = ( + "-" => [ "located_sequence_feature", "SO:0000110" ], + "-10_signal" => [ "minus_10_signal", "SO:0000175" ], + "-35_signal" => [ "minus_35_signal", "SO:0000176" ], + "3'UTR" => [ "three_prime_UTR", "SO:0000205" ], + "3'clip" => [ "three_prime_clip", "SO:0000557" ], + "5'UTR" => [ "five_prime_UTR", "SO:0000204" ], + "5'clip" => [ "five_prime_clip", "SO:0000555" ], + "CAAT_signal" => [ "CAAT_signal", "SO:0000172" ], + "CDS" => [ "CDS", "SO:0000316" ], + "D-loop" => [ "D_loop", "SO:0000297" ], + "D_segment" => [ "D_gene", "SO:0000458" ], + "GC_signal" => [ "GC_rich_region", "SO:0000173" ], + "LTR" => [ "long_terminal_repeat", "SO:0000286" ], + "RBS" => [ "ribosome_entry_site", "SO:0000139" ], + "STS" => [ "STS", "SO:0000331" ], + "TATA_signal" => [ "TATA_box", "SO:0000174" ], + "attenuator" => [ "attenuator", "SO:0000140" ], + "enhancer" => [ "enhancer", "SO:0000165" ], + "exon" => [ "exon", "SO:0000147" ], + "gap" => [ "gap", "SO:0000730" ], + "gene" => [ "gene", "SO:0000704" ], + "iDNA" => [ "iDNA", "SO:0000723" ], + "intron" => [ "intron", "SO:0000188" ], + "mRNA" => [ "mRNA", "SO:0000234" ], + "mat_peptide" => [ "mature_protein_region", "SO:0000419" ], + "misc_RNA" => [ "transcript", "SO:0000673" ], + "misc_binding" => [ "binding_site", "SO:0000409" ], + "misc_difference" => [ "sequence_difference", "SO:0000413" ], + "misc_feature" => [ "region", "SO:0000001" ], + "misc_recomb" => [ "recombination_feature", "SO:0000298" ], + "misc_signal" => [ "regulatory_region", "SO:0005836" ], + "misc_structure" => [ "sequence_secondary_structure", "SO:0000002" ], + "modified_base" => [ "modified_DNA_base", "SO:0000305" ], + "operon" => [ "operon", "SO:0000178" ], + "oriT" => [ "origin_of_transfer", "SO:0000724" ], + "polyA_signal" => [ "polyA_signal_sequence", "SO:0000551" ], + "polyA_site" => [ "polyA_site", "SO:0000553" ], + "precursor_RNA" => [ "primary_transcript", "SO:0000185" ], + "prim_transcript" => [ "primary_transcript", "SO:0000185" ], + "primer_bind" => [ "primer_binding_site", "SO:0005850" ], + "promoter" => [ "promoter", "SO:0000167" ], + "protein_bind" => [ "protein_binding_site", "SO:0000410" ], + "rRNA" => [ "rRNA", "SO:0000252" ], + "repeat_region" => [ "repeat_region", "SO:0000657" ], + "repeat_unit" => [ "repeat_unit", "SO:0000726" ], + "satellite" => [ "satellite_DNA", "SO:0000005" ], + "scRNA" => [ "scRNA", "SO:0000013" ], + "sig_peptide" => [ "signal_peptide", "SO:0000418" ], + "snRNA" => [ "snRNA", "SO:0000274" ], + "snoRNA" => [ "snoRNA", "SO:0000275" ], + "source" => [ "contig", "SO:0000149" ], # manually modified + "stem_loop" => [ "stem_loop", "SO:0000313" ], + "tRNA" => [ "tRNA", "SO:0000253" ], + "terminator" => [ "terminator", "SO:0000141" ], + "transit_peptide" => [ "transit_peptide", "SO:0000725" ], + "variation" => [ "sequence_variant", "SO:0000109" ], +); + + +sub get_gff3_file { + my ($self) = @_; + my @output; + for my $header_line ( @{ $self->header() } ) { + push( @output, $header_line ); + } + push( @output, join( "\t", @{ $self->get_source } ) ); + for my $data_line ( @{ $self->data() }) { + push( @output, join( "\t", @{$data_line} ) ); + } + push( @output, '##FASTA' ); + push( @output, '>' . $self->seqid() ); + my $seq = $self->sequence(); + $seq =~ s/(.{80})/$1\n/g; + push( @output, $seq ); + return join( "\n", @output ); +} + +sub escape { + my ($self, $str) = @_; + $str =~ s/,/%2C/g; + $str =~ s/=/%3D/g; + $str =~ s/;/%3B/g; + $str =~ s/\t/%09/g; + return $str; +} + +sub FT_SO_map { + my ( $self, $key ) = @_; + if ( $key_mapping{$key} ) { + my @result = @{ $key_mapping{$key} }; + return $result[0]; + } + else { + return 'region'; + } +} + +sub source_map { + my ( $self, $type ) = @_; + if ( $self->{'override_source'} ) { + return $self->{'override_source'}; + } + if ( $self->{'annotation_software'}{$type} ) { + return $self->{'annotation_software'}{$type}; + } + else { + return '.'; + } +} + +sub get_attrs { + my ( $self, %data ) = @_; + my $feature = $data{'feat'}; + my $parents_ref = $data{'parents'}; + + my %attrs = (); + $attrs{'ID'} = + $self->id_prefix() . '.' . ($self->global_feat_idx()); + + $self->global_feat_idx($self->global_feat_idx()+1); + + # Handle Identifier + my $identifier = $bio->_getIdentifier($feature); + if ( $identifier ne 'ERROR' ) { + $attrs{'Name'} = $identifier . '.' . $feature->primary_tag; + } + + # Handle parents, if there are any + if ($parents_ref) { + if ( ref($parents_ref) eq 'ARRAY' ) { + $attrs{'Parent'} = $parents_ref; + } + else { + $attrs{'Parent'} = [$parents_ref]; + } + } + + # These are otherwise "Special" keys that need to be handled differently. + if ( $feature->has_tag('note') ) { + my @notes = $feature->get_tag_values('note'); + $attrs{'Note'} = \@notes; + } + if ( $feature->has_tag('db_xref') ) { + my @dbxref = $feature->get_tag_values('db_xref'); + $attrs{'Dbxref'} = \@dbxref; + } + + # Do the rest + for my $tag ( $feature->get_all_tags() ) { + + # If not one of the specially handled ones + if ( $tag ne 'name' && $tag ne 'note' && $tag ne 'db_xref' ) { + + # If not a reserved_key + if ( !$reserved_keys{$tag} ) { + my @vals = $feature->get_tag_values($tag); + $attrs{lc($tag)} = \@vals; + } + else { + warn +"Trying to set a reserved key $tag with value $attrs{$tag}"; + } + } + } + my %response = ( + id => $attrs{'ID'}, + attr_str => $self->post_process_attribute_string(%attrs), + ); + return %response; +} + +sub post_process_attribute_string { + my ($self,%attrs) = @_; + my @parts = (); + for my $k ( keys %attrs ) { + + # IGNORED TAGS + if ( $k ne 'translation' && $k ne 'product' ) { + my $joined = + $self->escape_and_join_attribute_subpart( $attrs{$k} ); + push @parts, "$k=$joined"; + } + } + + #print STDERR join("\t",@parts),"\n"; + return join( ";", @parts ); + +} + +sub escape_and_join_attribute_subpart { + my ($self,$ref) = @_; + if ( ref($ref) eq 'ARRAY' ) { + my @attrs = @{$ref}; + return join( ",", map { $self->escape($_) } @attrs ); + } + else { #scalar + return $self->escape($ref); + } +} + +sub get_source { + my ($self) = @_; + if ( !$self->source() ) { + $self->auto_source(); + } + return $self->source(); +} + +sub auto_source { + + # Auto generate a source feature, in case there isn't one. + my ($self) = @_; + my @region = ( + $self->seqid(), + ( $self->genbank() ? 'Genbank' : 'Assembly' ), + 'contig', + 1, + $self->get_length, + '.', + '.', + '.', + sprintf( "ID=%s;Name=%s", $self->seqid(), $self->seqid() ) + . ( $self->is_circular() ? ";Is_circular=True" : "" ) + ); + $self->source(\@region); +} + +sub add_feature { + my ( $self, $feat ) = @_; + my $primary_tag = $feat->primary_tag; + if ( $primary_tag eq 'CDS' ) { + my ( $id, $data_0 ) = $self->_add_gene( feat => $feat, ); + my ($data_1) = $self->_add_feature( + feat => $feat, + parent => $id, + ); + $self->_low_level_add_feature($data_0); + $self->_low_level_add_feature($data_1); + } + elsif ( $primary_tag eq 'source' ) { + my ($data) = $self->_add_feature( feat => $feat, ); + + # YUCK. + my @z = @{$data}; + my $seqid = $self->seqid(); + $z[8] =~ s/ID=[^;]*;/ID=$seqid;/g; + $self->source( \@z ); + } + elsif ( $primary_tag ne 'gene' ) { + my ($data) = $self->_add_feature( feat => $feat, ); + $self->_low_level_add_feature($data); + } + +} + +sub _low_level_add_feature { + my ( $self, $data ) = @_; + push( @{ $self->data() }, $data ); +} + +sub _add_feature { + my ( $self, %data ) = @_; + my %attrs = + $self->get_attrs( feat => $data{feat}, parents => $data{parent} ); + my @data = ( + $self->seqid(), + $self->source_map( $data{feat}->primary_tag ), + $self->FT_SO_map( $data{feat}->primary_tag ), + $data{feat}->start, + $data{feat}->end, + '.', + ( $data{feat}->strand == 1 ? '+' : '-' ), + '.', + $attrs{attr_str} + ); + + #$self->_low_level_add_feature(\@data); + return \@data; +} + +sub _add_gene { + my ( $self, %data ) = @_; + my $id = $self->id_prefix() . '.' . ( $self->global_feat_idx()); + $self->global_feat_idx($self->global_feat_idx()+1); + my $gene_count = ++$feat_type_count{'gene'}; + my @data = ( + $self->seqid(), + $self->source_map( $data{feat}->primary_tag ), + $self->FT_SO_map('gene'), + $data{feat}->start, + $data{feat}->end, + '.', + ( $data{feat}->strand == 1 ? '+' : '-' ), + '.', + "ID=$id;Name=Gene$gene_count" + ); + + #$self->_low_level_add_feature(\@data); + return ( $id, \@data ); +} + +no Moose; +1; + +__END__ + +=pod + +=encoding UTF-8 + +=head1 NAME + +CPT::GBK2GFF3 + +=head1 VERSION + +version 1.99.4 + +=head1 AUTHOR + +Eric Rasche <rasche.eric@yandex.ru> + +=head1 COPYRIGHT AND LICENSE + +This software is Copyright (c) 2014 by Eric Rasche. + +This is free software, licensed under: + + The GNU General Public License, Version 3, June 2007 + +=cut