Mercurial > repos > cpt > cpt_psm_prep
diff cpt_dnaplotter.pl @ 1:d724f34e671d draft default tip
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:50:07 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_dnaplotter.pl Mon Jun 05 02:50:07 2023 +0000 @@ -0,0 +1,465 @@ +#!/usr/bin/perl +# +# Code written by Eric Rasche +# mailto:rasche.eric@yandex.ru +# tel: 404.692.2048 +# http://eric.rasche.co.uk +# for +# Center for Phage Technology +# + +use strict; +use warnings; + +use CPT::GalaxyGetOpt; +use Data::Dumper; +use CPT::Bio; +use CPT::Circos::Conf; +my $bio = CPT::Bio->new(); + +my @argv_copy; +foreach(@ARGV){push(@argv_copy, "$_");} + +my $ggo = CPT::GalaxyGetOpt->new(); +my $options = $ggo->getOptions( + 'options' => [ + [ 'file|f', 'Input file', { validate => 'File/Input', + file_format => ['genbank', 'embl', 'txt'], + } ], + [], + ['Track Configuration'], + ['track_key', 'Key to select from genbank data', { validate => 'Genomic/Tag', required => 1, multiple => 1 } ], + ['track_feature_filter_invert', 'Should the qualifier search be inverted?', { validate => 'Option', options => { 'yes', 'Yes', 'no', 'No' } , multiple => 1 } ], + ['track_feature_filter_hastag', 'Select a tag which should be present in that qualifier (e.g., signal/tmhelix/pseudo)', { validate => 'String' , multiple => 1 } ], + ['track_feature_filter_textquery', 'Specify text which MUST be in that tag', { validate => 'String' , multiple => 1 } ], + ['track_feature_filter_strand', 'Which strand should the feature appear on?', { validate => 'Option', options => { 'f', 'Forward', 'r', 'Reverse', 'a', 'Any' } , multiple => 1 } ], + [], + ['enable_gc_skew_plot', 'Enable/Disable calculation of GC Skew Plot', { validate => 'Flag' } ], + ['gc_skew_plot_window_size', 'Window size for calculation of GC Skew', { validate => 'Int', min => 1000, default => 10000} ], + ['gc_skew_plot_step_size', 'Step size for calculation of GC Skew', { validate => 'Int', min => 200, default => 200 } ], + ], + 'outputs' => [ + [ + 'output_circos_confs', + 'Circos Configuration Files', + { + validate => 'File/Output', + required => 1, + default => 'out', + data_format => 'archive', + default_format => 'zip', + } + ], + ], + 'defaults' => [ + 'appid' => 'CircularDNAPlotter', + 'appname' => 'Circos based DNAPlotter', + 'appdesc' => 'plots genomes similar to Artemis\'s DNAPlotter', + 'appvers' => '1.94.1', + ], +); + +#perl cpt_dnaplotter.pl \ + #-f ../t/test-files/moon.gbk \ + #--track_key CDS --track_feature_filter_invert yes --track_feature_filter_hastag pseudo --track_feature_filter_strand f \ + #--track_key CDS --track_feature_filter_invert yes --track_feature_filter_hastag pseudo --track_feature_filter_strand r \ + #--track_key CDS --track_feature_filter_hastag pseudo --track_feature_filter_strand a \ + #--track_key tRNA --track_feature_filter_strand a \ + #--track_key CDS --track_feature_filter_hastag signal --track_feature_filter_strand a \ + #--track_key CDS --track_feature_filter_hastag tmhelix --track_feature_filter_strand a + + +my @reorg_args = (); + +my $cum_gc_ske_mean = 0; + + + +my %latest = (); +for(my $i = 0; $i < scalar(@argv_copy); $i++){ + my $c = $argv_copy[$i]; + # We have entered a new one block + if($c eq '--track_key'){ + # If we have loaded data + if(scalar(keys(%latest)) > 0){ + my %copy; + foreach(keys(%latest)){ + $copy{$_} = "" . $latest{$_}; + } + push(@reorg_args, \%copy); + } + + # Clean out latest to prep for new data + foreach(keys(%latest)){ + delete $latest{$_}; + } + } + + if($c =~ /^--track_(.*)/){ + $latest{$1} = $argv_copy[$i+1]; + # Artificially bump so we don't bother looking at the answer to + # this question. We can "get away" with this because none of + # the options are flags. However, I've disabled it in the event + # that flags ARE introduced + #$i++; + } +} +push(@reorg_args, \%latest); +#$VAR1 = [ + #{ + #'feature_filter_invert' => 'yes', + #'feature_plot_color' => '005500', + #'feature_filter_strand' => 'f', + #'feature_filter_hastag' => 'pseudo', + #'key' => 'CDS' + #}, + #{ + #'feature_filter_strand' => 'a', + #'key' => 'RBS' + #} + #]; +my @files = (); + +my $number_of_tracks = 0; +sub register_track { + #my $r0 = ( 90 - (10 * $number_of_tracks - 1)/2) / 100; + #my $r1 = ( 90 - (10 * $number_of_tracks - 9)/2) / 100; + my $r0 = ( 90 - (10 * $number_of_tracks - 1)/1) / 100; + my $r1 = ( 90 - (10 * $number_of_tracks - 9)/1) / 100; + $number_of_tracks++; + return ($r0, $r1); +} + +sub circosconf { + my $cc = CPT::Circos::Conf->new(); + $cc->include('etc/colors_fonts_patterns.conf'); + # Features to plot along the genome + $cc->include('ideogram.conf'); + # markings indicating position along genome + $cc->include('ticks.conf'); + # Genome data + $cc->set('karyotype', 'karyotype.conf'); + # Default image params are fine + $cc->start_block('image'); + $cc->include('etc/image.conf'); + $cc->end_block(); + # ??? + $cc->set('chromosome_units', '1000'); + $cc->set('chromosome_display_default', 'yes'); + #$cc->include('highlights.conf'); + $cc->include('plots.conf'); + + $cc->include('etc/housekeeping.conf'); + my $result = $cc->finalize(); + $cc = CPT::Circos::Conf->new(); + return $result; +} +sub ideogramconf{ + my $cc = CPT::Circos::Conf->new(); + $cc->start_block('ideogram'); + $cc->start_block('spacing'); + $cc->set('default','0u'); + $cc->set('break','0u'); + $cc->end_block(); + + $cc->set('thickness', '20p'); + $cc->set('stroke_thickness', '2'); + $cc->set('stroke_color', 'black'); + $cc->set('fill','no'); + $cc->set('fill_color','black'); + $cc->set('radius','0.85r'); + $cc->set('show_label','yes'); + $cc->set('label_font','default'); + $cc->set('label_radius','dims(ideogram,radius) + 0.05'); + $cc->set('label_size','36'); + $cc->set('label_parallel','yes'); + $cc->set('label_case','upper'); + + $cc->set('band_stroke_thickness','2'); + $cc->set('show_bands','yes'); + $cc->set('fill_bands','yes'); + $cc->end_block(); + + return $cc->finalize(); +} +sub generate_feature_table { + my ($filename, %filter) = @_; + print "Filtering on features\n"; + print Dumper \%filter; + my $seqio_object = Bio::SeqIO->new(-file => $options->{file}, -format=>'genbank'); + # Only handing first sequence. + my $seq_object = $seqio_object->next_seq; + my $parent = $seq_object->display_id(); + # Feature data + my @features; + foreach my $feat($seq_object->get_SeqFeatures()){ + if($feat->primary_tag() eq $filter{key}){ + # If they've said "hastag" AND we do indeed have that tag AND we haven't inverted this filter. + if( + ($filter{feature_filter_hastag} && $feat->has_tag($filter{feature_filter_hastag}) && !$filter{feature_filter_invert}) + || + ($filter{feature_filter_hastag} && $filter{feature_filter_invert} && !$feat->has_tag($filter{feature_filter_hastag})) + || + (! $filter{feature_filter_hastag}) + ){ + if( + !$filter{feature_filter_strand} + || + ($feat->strand() == 1 && ( $filter{feature_filter_strand} eq 'f' || $filter{feature_filter_strand} eq 'a' )) + || + ($feat->strand() == -1 && ( $filter{feature_filter_strand} eq 'r' || $filter{feature_filter_strand} eq 'a' )) + || + ($feat->strand() == 0 && ( $filter{feature_filter_strand} eq 'a' )) + ){ + push(@features, join(' ', $parent, $feat->start, $feat->end)); + } + } + } + } + print "Found " . scalar @features . " features \n"; + push(@files, [ 'data/' . $filename, join("\n", @features) ] ); +} +sub plotsconf{ + my $cc = CPT::Circos::Conf->new(); + + $cc->start_block('plots'); + + + my $idx = 0; + foreach(@reorg_args){ + my %filter = %{$_}; + #{ + #'feature_filter_invert' => 'yes', + #'feature_plot_color' => '005500', + #'feature_filter_strand' => 'f', + #'feature_filter_hastag' => 'pseudo', + #'key' => 'CDS' + #}, + $idx++; + my $filename = sprintf('org.features.%s.txt', $idx); + generate_feature_table($filename, %filter); + + my ($r0,$r1) = register_track(); + $cc->start_block('plot'); + $cc->set('type','tile'); + $cc->set('file',$filename); + $cc->set('orientation', 'center'); + $cc->set('thickness', '30'); + $cc->set('r1', $r1 . 'r');# '0.78r'); + $cc->set('r0', $r0 . 'r');# '0.72r'); + $cc->set('layers','3'); + $cc->set('layers_overflow','collapse'); + $cc->set('color','paired-6-qual-' . $idx); + $cc->end_block(); + } + + if($options->{enable_gc_skew_plot}){ + my ($r0,$r1) = register_track(); + $cc->start_block('plot'); + $cc->set('type','histogram'); + $cc->set('file','gc.txt'); + $cc->set('r1',$r1 . 'r'); + $cc->set('r0',$r0 . 'r'); + $cc->set('fill_color','purple'); + $cc->set('orientation','in'); + $cc->start_block('rules'); + $cc->start_block('rule'); + $cc->set('condition','var(value) < 0'); + $cc->set('fill_color', 'green'); + $cc->end_block(); + $cc->end_block(); + $cc->end_block(); + } + + #$cc->start_block('plot'); + #$cc->set('type','histogram'); + #$cc->set('file','gc_cumulative.txt'); + #$cc->set('r1','0.6r'); + #$cc->set('r0','0.55r'); + #$cc->set('fill_color','purple'); + #$cc->set('orientation','out'); + #$cc->start_block('rules'); + #$cc->start_block('rule'); + #$cc->set('condition','var(value) < ' . $cum_gc_ske_mean); + #$cc->set('fill_color', 'green'); + #$cc->end_block(); + #$cc->end_block(); + #$cc->end_block(); + + $cc->end_block(); + return $cc->finalize(); +} +sub ticksconf{ + my $cc = CPT::Circos::Conf->new(); + + $cc->set('show_ticks','yes'); + $cc->set('show_tick_labels','yes'); + $cc->start_block('ticks'); + $cc->set('radius','1r'); + $cc->set('color','black'); + $cc->set('thickness','2p'); + $cc->set('multiplier','1e-3'); + $cc->set('format','%d'); + + $cc->start_block('tick'); + $cc->set('spacing','1000u'); + $cc->set('size','10p'); + $cc->end_block(); + + $cc->start_block('tick'); + $cc->set('spacing','10000u'); + $cc->set('size','15p'); + $cc->set('show_label','yes'); + $cc->set('label_size','20p'); + $cc->set('label_offset','10p'); + $cc->set('format','%d'); + $cc->end_block(); + $cc->end_block(); + return $cc->finalize(); +} +sub karyotype { + my @karyotype_data = (); + my $seqio_object = Bio::SeqIO->new(-file => $options->{file}, -format=>'genbank'); + # Only handing first sequence. + my $seq_object = $seqio_object->next_seq; + # Main 'chromosome' data + push(@karyotype_data, join(' ', 'chr', '-',$seq_object->display_id(),$seq_object->display_id(),0, $seq_object->length(),'black')); + + return join("\n", @karyotype_data); +} + +sub gcgraph_cumulative { + my @gcdata = (); + my $seqio_object = Bio::SeqIO->new(-file => $options->{file}, -format=>'genbank'); + # Only handing first sequence. + my $seq_object = $seqio_object->next_seq; + + my $parent = $seq_object->display_id(); + + my $seq = $seq_object->seq(); + my $sep = int($options->{gc_skew_plot_window_size}/2); + my $stepsep = int($options->{gc_skew_plot_step_size}/2); + my $cumulative_gc_skew = 0; + my @cumgc_vals; + + my $count = 0; + foreach(my $i = $sep; $i < $seq_object->length() - $sep - $options->{gc_skew_plot_step_size}; $i += $options->{gc_skew_plot_step_size}){ + $count++; + $cumulative_gc_skew += _calculate_gc_skew_for_seq(substr($seq,$i-$sep,$options->{gc_skew_plot_window_size})), + push(@cumgc_vals, $cumulative_gc_skew); + push(@gcdata, join(" ", + $parent, + $i - $stepsep, + $i + $stepsep, + $cumulative_gc_skew + )); + } + + my $sum = 0; + foreach(@cumgc_vals){$sum += $_;} + $cum_gc_ske_mean = $sum / $count; + + return join("\n", @gcdata); + # Main 'chromosome' data +} +sub gcgraph { + my @gcdata = (); + my $seqio_object = Bio::SeqIO->new(-file => $options->{file}, -format=>'genbank'); + # Only handing first sequence. + my $seq_object = $seqio_object->next_seq; + + my $parent = $seq_object->display_id(); + + my $seq = $seq_object->seq(); + my $sep = int($options->{gc_skew_plot_window_size}/2); + my $stepsep = int($options->{gc_skew_plot_step_size}/2); + foreach(my $i = $sep; $i < $seq_object->length() - $sep - $options->{gc_skew_plot_step_size}; $i += $options->{gc_skew_plot_step_size}){ + push(@gcdata, join(" ", + $parent, + $i - $stepsep, + $i + $stepsep, + _calculate_gc_skew_for_seq(substr($seq,$i-$sep,$options->{gc_skew_plot_window_size})), + )); + } + return join("\n", @gcdata); + # Main 'chromosome' data +} +sub _calculate_gc_skew_for_seq { + my ($seq) = @_; + $seq = uc($seq); + my %counts; + foreach(split //,$seq){ + $counts{$_}++; + } + if($counts{G} + $counts{C} > 0){ + return ($counts{G} - $counts{C}) / ($counts{G} + $counts{C}); + } + return 0; +} + +push(@files, [ 'etc/karyotype.conf', karyotype() ]); +push(@files, [ 'etc/circos.conf', circosconf() ]); +push(@files, [ 'etc/ideogram.conf', ideogramconf() ]); +if($options->{enable_gc_skew_plot}){ + push(@files, [ 'data/gc.txt', gcgraph() ]); + push(@files, [ 'data/gc_cumulative.txt', gcgraph_cumulative() ]); +} + +push(@files, [ 'etc/plots.conf', plotsconf() ]); +push(@files, [ 'etc/ticks.conf', ticksconf() ]); + + +use Archive::Any::Create; +my $archive = Archive::Any::Create->new(); +$archive->container('conf'); +foreach(@files){ + my ($location, $content) = @{$_}; + $archive->add_file($location, $content); +} + +use CPT::OutputFiles; +my $crr_output = CPT::OutputFiles->new( + name => 'output_circos_confs', + GGO => $ggo, +); +$crr_output->CRR(data => $archive); + +=head1 NAME + +DNAPlotter + +=head1 DESCRIPTION + +Much like artemis's DNAPlotter, this tool plots genomes in a ciruclar fashion, and can plot gc-deviation tracks as well. The options are somewhat reduced compared to artemis, so if you need something that isn't available in this version please file a bug report. + +=head1 USE + +Each track has several parameters: + +=over 4 + +=item track_key + +This selects a set of features from a genbank file, e.g., CDSs or tRNAs + +=item track_feature_filter_invert + +This parameter will invert (negate) the results of whatever query parameters you specify after it. + +=item track_feature_filter_hastag + +Require that a feature has a specific tag.... + +=item track_feature_filter_textquery + +...with this specific text in it + +=item track_feature_filter_strand + +Which strand should the feature be on (not inverted) + +=back + +Additionally, users are able to enable/disable GC skew plots. it's suggested that these are generally left alone, as they can quickly increase runtime. + +=cut