Mercurial > repos > cpt > cpt_psm_plotter
comparison cpt_dnaplotter.pl @ 1:8691c1c61a8e draft default tip
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
| author | cpt |
|---|---|
| date | Mon, 05 Jun 2023 02:48:47 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:54c7a3ea81e2 | 1:8691c1c61a8e |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 # | |
| 3 # Code written by Eric Rasche | |
| 4 # mailto:rasche.eric@yandex.ru | |
| 5 # tel: 404.692.2048 | |
| 6 # http://eric.rasche.co.uk | |
| 7 # for | |
| 8 # Center for Phage Technology | |
| 9 # | |
| 10 | |
| 11 use strict; | |
| 12 use warnings; | |
| 13 | |
| 14 use CPT::GalaxyGetOpt; | |
| 15 use Data::Dumper; | |
| 16 use CPT::Bio; | |
| 17 use CPT::Circos::Conf; | |
| 18 my $bio = CPT::Bio->new(); | |
| 19 | |
| 20 my @argv_copy; | |
| 21 foreach(@ARGV){push(@argv_copy, "$_");} | |
| 22 | |
| 23 my $ggo = CPT::GalaxyGetOpt->new(); | |
| 24 my $options = $ggo->getOptions( | |
| 25 'options' => [ | |
| 26 [ 'file|f', 'Input file', { validate => 'File/Input', | |
| 27 file_format => ['genbank', 'embl', 'txt'], | |
| 28 } ], | |
| 29 [], | |
| 30 ['Track Configuration'], | |
| 31 ['track_key', 'Key to select from genbank data', { validate => 'Genomic/Tag', required => 1, multiple => 1 } ], | |
| 32 ['track_feature_filter_invert', 'Should the qualifier search be inverted?', { validate => 'Option', options => { 'yes', 'Yes', 'no', 'No' } , multiple => 1 } ], | |
| 33 ['track_feature_filter_hastag', 'Select a tag which should be present in that qualifier (e.g., signal/tmhelix/pseudo)', { validate => 'String' , multiple => 1 } ], | |
| 34 ['track_feature_filter_textquery', 'Specify text which MUST be in that tag', { validate => 'String' , multiple => 1 } ], | |
| 35 ['track_feature_filter_strand', 'Which strand should the feature appear on?', { validate => 'Option', options => { 'f', 'Forward', 'r', 'Reverse', 'a', 'Any' } , multiple => 1 } ], | |
| 36 [], | |
| 37 ['enable_gc_skew_plot', 'Enable/Disable calculation of GC Skew Plot', { validate => 'Flag' } ], | |
| 38 ['gc_skew_plot_window_size', 'Window size for calculation of GC Skew', { validate => 'Int', min => 1000, default => 10000} ], | |
| 39 ['gc_skew_plot_step_size', 'Step size for calculation of GC Skew', { validate => 'Int', min => 200, default => 200 } ], | |
| 40 ], | |
| 41 'outputs' => [ | |
| 42 [ | |
| 43 'output_circos_confs', | |
| 44 'Circos Configuration Files', | |
| 45 { | |
| 46 validate => 'File/Output', | |
| 47 required => 1, | |
| 48 default => 'out', | |
| 49 data_format => 'archive', | |
| 50 default_format => 'zip', | |
| 51 } | |
| 52 ], | |
| 53 ], | |
| 54 'defaults' => [ | |
| 55 'appid' => 'CircularDNAPlotter', | |
| 56 'appname' => 'Circos based DNAPlotter', | |
| 57 'appdesc' => 'plots genomes similar to Artemis\'s DNAPlotter', | |
| 58 'appvers' => '1.94.1', | |
| 59 ], | |
| 60 ); | |
| 61 | |
| 62 #perl cpt_dnaplotter.pl \ | |
| 63 #-f ../t/test-files/moon.gbk \ | |
| 64 #--track_key CDS --track_feature_filter_invert yes --track_feature_filter_hastag pseudo --track_feature_filter_strand f \ | |
| 65 #--track_key CDS --track_feature_filter_invert yes --track_feature_filter_hastag pseudo --track_feature_filter_strand r \ | |
| 66 #--track_key CDS --track_feature_filter_hastag pseudo --track_feature_filter_strand a \ | |
| 67 #--track_key tRNA --track_feature_filter_strand a \ | |
| 68 #--track_key CDS --track_feature_filter_hastag signal --track_feature_filter_strand a \ | |
| 69 #--track_key CDS --track_feature_filter_hastag tmhelix --track_feature_filter_strand a | |
| 70 | |
| 71 | |
| 72 my @reorg_args = (); | |
| 73 | |
| 74 my $cum_gc_ske_mean = 0; | |
| 75 | |
| 76 | |
| 77 | |
| 78 my %latest = (); | |
| 79 for(my $i = 0; $i < scalar(@argv_copy); $i++){ | |
| 80 my $c = $argv_copy[$i]; | |
| 81 # We have entered a new one block | |
| 82 if($c eq '--track_key'){ | |
| 83 # If we have loaded data | |
| 84 if(scalar(keys(%latest)) > 0){ | |
| 85 my %copy; | |
| 86 foreach(keys(%latest)){ | |
| 87 $copy{$_} = "" . $latest{$_}; | |
| 88 } | |
| 89 push(@reorg_args, \%copy); | |
| 90 } | |
| 91 | |
| 92 # Clean out latest to prep for new data | |
| 93 foreach(keys(%latest)){ | |
| 94 delete $latest{$_}; | |
| 95 } | |
| 96 } | |
| 97 | |
| 98 if($c =~ /^--track_(.*)/){ | |
| 99 $latest{$1} = $argv_copy[$i+1]; | |
| 100 # Artificially bump so we don't bother looking at the answer to | |
| 101 # this question. We can "get away" with this because none of | |
| 102 # the options are flags. However, I've disabled it in the event | |
| 103 # that flags ARE introduced | |
| 104 #$i++; | |
| 105 } | |
| 106 } | |
| 107 push(@reorg_args, \%latest); | |
| 108 #$VAR1 = [ | |
| 109 #{ | |
| 110 #'feature_filter_invert' => 'yes', | |
| 111 #'feature_plot_color' => '005500', | |
| 112 #'feature_filter_strand' => 'f', | |
| 113 #'feature_filter_hastag' => 'pseudo', | |
| 114 #'key' => 'CDS' | |
| 115 #}, | |
| 116 #{ | |
| 117 #'feature_filter_strand' => 'a', | |
| 118 #'key' => 'RBS' | |
| 119 #} | |
| 120 #]; | |
| 121 my @files = (); | |
| 122 | |
| 123 my $number_of_tracks = 0; | |
| 124 sub register_track { | |
| 125 #my $r0 = ( 90 - (10 * $number_of_tracks - 1)/2) / 100; | |
| 126 #my $r1 = ( 90 - (10 * $number_of_tracks - 9)/2) / 100; | |
| 127 my $r0 = ( 90 - (10 * $number_of_tracks - 1)/1) / 100; | |
| 128 my $r1 = ( 90 - (10 * $number_of_tracks - 9)/1) / 100; | |
| 129 $number_of_tracks++; | |
| 130 return ($r0, $r1); | |
| 131 } | |
| 132 | |
| 133 sub circosconf { | |
| 134 my $cc = CPT::Circos::Conf->new(); | |
| 135 $cc->include('etc/colors_fonts_patterns.conf'); | |
| 136 # Features to plot along the genome | |
| 137 $cc->include('ideogram.conf'); | |
| 138 # markings indicating position along genome | |
| 139 $cc->include('ticks.conf'); | |
| 140 # Genome data | |
| 141 $cc->set('karyotype', 'karyotype.conf'); | |
| 142 # Default image params are fine | |
| 143 $cc->start_block('image'); | |
| 144 $cc->include('etc/image.conf'); | |
| 145 $cc->end_block(); | |
| 146 # ??? | |
| 147 $cc->set('chromosome_units', '1000'); | |
| 148 $cc->set('chromosome_display_default', 'yes'); | |
| 149 #$cc->include('highlights.conf'); | |
| 150 $cc->include('plots.conf'); | |
| 151 | |
| 152 $cc->include('etc/housekeeping.conf'); | |
| 153 my $result = $cc->finalize(); | |
| 154 $cc = CPT::Circos::Conf->new(); | |
| 155 return $result; | |
| 156 } | |
| 157 sub ideogramconf{ | |
| 158 my $cc = CPT::Circos::Conf->new(); | |
| 159 $cc->start_block('ideogram'); | |
| 160 $cc->start_block('spacing'); | |
| 161 $cc->set('default','0u'); | |
| 162 $cc->set('break','0u'); | |
| 163 $cc->end_block(); | |
| 164 | |
| 165 $cc->set('thickness', '20p'); | |
| 166 $cc->set('stroke_thickness', '2'); | |
| 167 $cc->set('stroke_color', 'black'); | |
| 168 $cc->set('fill','no'); | |
| 169 $cc->set('fill_color','black'); | |
| 170 $cc->set('radius','0.85r'); | |
| 171 $cc->set('show_label','yes'); | |
| 172 $cc->set('label_font','default'); | |
| 173 $cc->set('label_radius','dims(ideogram,radius) + 0.05'); | |
| 174 $cc->set('label_size','36'); | |
| 175 $cc->set('label_parallel','yes'); | |
| 176 $cc->set('label_case','upper'); | |
| 177 | |
| 178 $cc->set('band_stroke_thickness','2'); | |
| 179 $cc->set('show_bands','yes'); | |
| 180 $cc->set('fill_bands','yes'); | |
| 181 $cc->end_block(); | |
| 182 | |
| 183 return $cc->finalize(); | |
| 184 } | |
| 185 sub generate_feature_table { | |
| 186 my ($filename, %filter) = @_; | |
| 187 print "Filtering on features\n"; | |
| 188 print Dumper \%filter; | |
| 189 my $seqio_object = Bio::SeqIO->new(-file => $options->{file}, -format=>'genbank'); | |
| 190 # Only handing first sequence. | |
| 191 my $seq_object = $seqio_object->next_seq; | |
| 192 my $parent = $seq_object->display_id(); | |
| 193 # Feature data | |
| 194 my @features; | |
| 195 foreach my $feat($seq_object->get_SeqFeatures()){ | |
| 196 if($feat->primary_tag() eq $filter{key}){ | |
| 197 # If they've said "hastag" AND we do indeed have that tag AND we haven't inverted this filter. | |
| 198 if( | |
| 199 ($filter{feature_filter_hastag} && $feat->has_tag($filter{feature_filter_hastag}) && !$filter{feature_filter_invert}) | |
| 200 || | |
| 201 ($filter{feature_filter_hastag} && $filter{feature_filter_invert} && !$feat->has_tag($filter{feature_filter_hastag})) | |
| 202 || | |
| 203 (! $filter{feature_filter_hastag}) | |
| 204 ){ | |
| 205 if( | |
| 206 !$filter{feature_filter_strand} | |
| 207 || | |
| 208 ($feat->strand() == 1 && ( $filter{feature_filter_strand} eq 'f' || $filter{feature_filter_strand} eq 'a' )) | |
| 209 || | |
| 210 ($feat->strand() == -1 && ( $filter{feature_filter_strand} eq 'r' || $filter{feature_filter_strand} eq 'a' )) | |
| 211 || | |
| 212 ($feat->strand() == 0 && ( $filter{feature_filter_strand} eq 'a' )) | |
| 213 ){ | |
| 214 push(@features, join(' ', $parent, $feat->start, $feat->end)); | |
| 215 } | |
| 216 } | |
| 217 } | |
| 218 } | |
| 219 print "Found " . scalar @features . " features \n"; | |
| 220 push(@files, [ 'data/' . $filename, join("\n", @features) ] ); | |
| 221 } | |
| 222 sub plotsconf{ | |
| 223 my $cc = CPT::Circos::Conf->new(); | |
| 224 | |
| 225 $cc->start_block('plots'); | |
| 226 | |
| 227 | |
| 228 my $idx = 0; | |
| 229 foreach(@reorg_args){ | |
| 230 my %filter = %{$_}; | |
| 231 #{ | |
| 232 #'feature_filter_invert' => 'yes', | |
| 233 #'feature_plot_color' => '005500', | |
| 234 #'feature_filter_strand' => 'f', | |
| 235 #'feature_filter_hastag' => 'pseudo', | |
| 236 #'key' => 'CDS' | |
| 237 #}, | |
| 238 $idx++; | |
| 239 my $filename = sprintf('org.features.%s.txt', $idx); | |
| 240 generate_feature_table($filename, %filter); | |
| 241 | |
| 242 my ($r0,$r1) = register_track(); | |
| 243 $cc->start_block('plot'); | |
| 244 $cc->set('type','tile'); | |
| 245 $cc->set('file',$filename); | |
| 246 $cc->set('orientation', 'center'); | |
| 247 $cc->set('thickness', '30'); | |
| 248 $cc->set('r1', $r1 . 'r');# '0.78r'); | |
| 249 $cc->set('r0', $r0 . 'r');# '0.72r'); | |
| 250 $cc->set('layers','3'); | |
| 251 $cc->set('layers_overflow','collapse'); | |
| 252 $cc->set('color','paired-6-qual-' . $idx); | |
| 253 $cc->end_block(); | |
| 254 } | |
| 255 | |
| 256 if($options->{enable_gc_skew_plot}){ | |
| 257 my ($r0,$r1) = register_track(); | |
| 258 $cc->start_block('plot'); | |
| 259 $cc->set('type','histogram'); | |
| 260 $cc->set('file','gc.txt'); | |
| 261 $cc->set('r1',$r1 . 'r'); | |
| 262 $cc->set('r0',$r0 . 'r'); | |
| 263 $cc->set('fill_color','purple'); | |
| 264 $cc->set('orientation','in'); | |
| 265 $cc->start_block('rules'); | |
| 266 $cc->start_block('rule'); | |
| 267 $cc->set('condition','var(value) < 0'); | |
| 268 $cc->set('fill_color', 'green'); | |
| 269 $cc->end_block(); | |
| 270 $cc->end_block(); | |
| 271 $cc->end_block(); | |
| 272 } | |
| 273 | |
| 274 #$cc->start_block('plot'); | |
| 275 #$cc->set('type','histogram'); | |
| 276 #$cc->set('file','gc_cumulative.txt'); | |
| 277 #$cc->set('r1','0.6r'); | |
| 278 #$cc->set('r0','0.55r'); | |
| 279 #$cc->set('fill_color','purple'); | |
| 280 #$cc->set('orientation','out'); | |
| 281 #$cc->start_block('rules'); | |
| 282 #$cc->start_block('rule'); | |
| 283 #$cc->set('condition','var(value) < ' . $cum_gc_ske_mean); | |
| 284 #$cc->set('fill_color', 'green'); | |
| 285 #$cc->end_block(); | |
| 286 #$cc->end_block(); | |
| 287 #$cc->end_block(); | |
| 288 | |
| 289 $cc->end_block(); | |
| 290 return $cc->finalize(); | |
| 291 } | |
| 292 sub ticksconf{ | |
| 293 my $cc = CPT::Circos::Conf->new(); | |
| 294 | |
| 295 $cc->set('show_ticks','yes'); | |
| 296 $cc->set('show_tick_labels','yes'); | |
| 297 $cc->start_block('ticks'); | |
| 298 $cc->set('radius','1r'); | |
| 299 $cc->set('color','black'); | |
| 300 $cc->set('thickness','2p'); | |
| 301 $cc->set('multiplier','1e-3'); | |
| 302 $cc->set('format','%d'); | |
| 303 | |
| 304 $cc->start_block('tick'); | |
| 305 $cc->set('spacing','1000u'); | |
| 306 $cc->set('size','10p'); | |
| 307 $cc->end_block(); | |
| 308 | |
| 309 $cc->start_block('tick'); | |
| 310 $cc->set('spacing','10000u'); | |
| 311 $cc->set('size','15p'); | |
| 312 $cc->set('show_label','yes'); | |
| 313 $cc->set('label_size','20p'); | |
| 314 $cc->set('label_offset','10p'); | |
| 315 $cc->set('format','%d'); | |
| 316 $cc->end_block(); | |
| 317 $cc->end_block(); | |
| 318 return $cc->finalize(); | |
| 319 } | |
| 320 sub karyotype { | |
| 321 my @karyotype_data = (); | |
| 322 my $seqio_object = Bio::SeqIO->new(-file => $options->{file}, -format=>'genbank'); | |
| 323 # Only handing first sequence. | |
| 324 my $seq_object = $seqio_object->next_seq; | |
| 325 # Main 'chromosome' data | |
| 326 push(@karyotype_data, join(' ', 'chr', '-',$seq_object->display_id(),$seq_object->display_id(),0, $seq_object->length(),'black')); | |
| 327 | |
| 328 return join("\n", @karyotype_data); | |
| 329 } | |
| 330 | |
| 331 sub gcgraph_cumulative { | |
| 332 my @gcdata = (); | |
| 333 my $seqio_object = Bio::SeqIO->new(-file => $options->{file}, -format=>'genbank'); | |
| 334 # Only handing first sequence. | |
| 335 my $seq_object = $seqio_object->next_seq; | |
| 336 | |
| 337 my $parent = $seq_object->display_id(); | |
| 338 | |
| 339 my $seq = $seq_object->seq(); | |
| 340 my $sep = int($options->{gc_skew_plot_window_size}/2); | |
| 341 my $stepsep = int($options->{gc_skew_plot_step_size}/2); | |
| 342 my $cumulative_gc_skew = 0; | |
| 343 my @cumgc_vals; | |
| 344 | |
| 345 my $count = 0; | |
| 346 foreach(my $i = $sep; $i < $seq_object->length() - $sep - $options->{gc_skew_plot_step_size}; $i += $options->{gc_skew_plot_step_size}){ | |
| 347 $count++; | |
| 348 $cumulative_gc_skew += _calculate_gc_skew_for_seq(substr($seq,$i-$sep,$options->{gc_skew_plot_window_size})), | |
| 349 push(@cumgc_vals, $cumulative_gc_skew); | |
| 350 push(@gcdata, join(" ", | |
| 351 $parent, | |
| 352 $i - $stepsep, | |
| 353 $i + $stepsep, | |
| 354 $cumulative_gc_skew | |
| 355 )); | |
| 356 } | |
| 357 | |
| 358 my $sum = 0; | |
| 359 foreach(@cumgc_vals){$sum += $_;} | |
| 360 $cum_gc_ske_mean = $sum / $count; | |
| 361 | |
| 362 return join("\n", @gcdata); | |
| 363 # Main 'chromosome' data | |
| 364 } | |
| 365 sub gcgraph { | |
| 366 my @gcdata = (); | |
| 367 my $seqio_object = Bio::SeqIO->new(-file => $options->{file}, -format=>'genbank'); | |
| 368 # Only handing first sequence. | |
| 369 my $seq_object = $seqio_object->next_seq; | |
| 370 | |
| 371 my $parent = $seq_object->display_id(); | |
| 372 | |
| 373 my $seq = $seq_object->seq(); | |
| 374 my $sep = int($options->{gc_skew_plot_window_size}/2); | |
| 375 my $stepsep = int($options->{gc_skew_plot_step_size}/2); | |
| 376 foreach(my $i = $sep; $i < $seq_object->length() - $sep - $options->{gc_skew_plot_step_size}; $i += $options->{gc_skew_plot_step_size}){ | |
| 377 push(@gcdata, join(" ", | |
| 378 $parent, | |
| 379 $i - $stepsep, | |
| 380 $i + $stepsep, | |
| 381 _calculate_gc_skew_for_seq(substr($seq,$i-$sep,$options->{gc_skew_plot_window_size})), | |
| 382 )); | |
| 383 } | |
| 384 return join("\n", @gcdata); | |
| 385 # Main 'chromosome' data | |
| 386 } | |
| 387 sub _calculate_gc_skew_for_seq { | |
| 388 my ($seq) = @_; | |
| 389 $seq = uc($seq); | |
| 390 my %counts; | |
| 391 foreach(split //,$seq){ | |
| 392 $counts{$_}++; | |
| 393 } | |
| 394 if($counts{G} + $counts{C} > 0){ | |
| 395 return ($counts{G} - $counts{C}) / ($counts{G} + $counts{C}); | |
| 396 } | |
| 397 return 0; | |
| 398 } | |
| 399 | |
| 400 push(@files, [ 'etc/karyotype.conf', karyotype() ]); | |
| 401 push(@files, [ 'etc/circos.conf', circosconf() ]); | |
| 402 push(@files, [ 'etc/ideogram.conf', ideogramconf() ]); | |
| 403 if($options->{enable_gc_skew_plot}){ | |
| 404 push(@files, [ 'data/gc.txt', gcgraph() ]); | |
| 405 push(@files, [ 'data/gc_cumulative.txt', gcgraph_cumulative() ]); | |
| 406 } | |
| 407 | |
| 408 push(@files, [ 'etc/plots.conf', plotsconf() ]); | |
| 409 push(@files, [ 'etc/ticks.conf', ticksconf() ]); | |
| 410 | |
| 411 | |
| 412 use Archive::Any::Create; | |
| 413 my $archive = Archive::Any::Create->new(); | |
| 414 $archive->container('conf'); | |
| 415 foreach(@files){ | |
| 416 my ($location, $content) = @{$_}; | |
| 417 $archive->add_file($location, $content); | |
| 418 } | |
| 419 | |
| 420 use CPT::OutputFiles; | |
| 421 my $crr_output = CPT::OutputFiles->new( | |
| 422 name => 'output_circos_confs', | |
| 423 GGO => $ggo, | |
| 424 ); | |
| 425 $crr_output->CRR(data => $archive); | |
| 426 | |
| 427 =head1 NAME | |
| 428 | |
| 429 DNAPlotter | |
| 430 | |
| 431 =head1 DESCRIPTION | |
| 432 | |
| 433 Much like artemis's DNAPlotter, this tool plots genomes in a ciruclar fashion, and can plot gc-deviation tracks as well. The options are somewhat reduced compared to artemis, so if you need something that isn't available in this version please file a bug report. | |
| 434 | |
| 435 =head1 USE | |
| 436 | |
| 437 Each track has several parameters: | |
| 438 | |
| 439 =over 4 | |
| 440 | |
| 441 =item track_key | |
| 442 | |
| 443 This selects a set of features from a genbank file, e.g., CDSs or tRNAs | |
| 444 | |
| 445 =item track_feature_filter_invert | |
| 446 | |
| 447 This parameter will invert (negate) the results of whatever query parameters you specify after it. | |
| 448 | |
| 449 =item track_feature_filter_hastag | |
| 450 | |
| 451 Require that a feature has a specific tag.... | |
| 452 | |
| 453 =item track_feature_filter_textquery | |
| 454 | |
| 455 ...with this specific text in it | |
| 456 | |
| 457 =item track_feature_filter_strand | |
| 458 | |
| 459 Which strand should the feature be on (not inverted) | |
| 460 | |
| 461 =back | |
| 462 | |
| 463 Additionally, users are able to enable/disable GC skew plots. it's suggested that these are generally left alone, as they can quickly increase runtime. | |
| 464 | |
| 465 =cut |
