comparison cpt_dnaplotter.pl @ 1:8691c1c61a8e draft default tip

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:48:47 +0000
parents
children
comparison
equal deleted inserted replaced
0:54c7a3ea81e2 1:8691c1c61a8e
1 #!/usr/bin/perl
2 #
3 # Code written by Eric Rasche
4 # mailto:rasche.eric@yandex.ru
5 # tel: 404.692.2048
6 # http://eric.rasche.co.uk
7 # for
8 # Center for Phage Technology
9 #
10
11 use strict;
12 use warnings;
13
14 use CPT::GalaxyGetOpt;
15 use Data::Dumper;
16 use CPT::Bio;
17 use CPT::Circos::Conf;
18 my $bio = CPT::Bio->new();
19
20 my @argv_copy;
21 foreach(@ARGV){push(@argv_copy, "$_");}
22
23 my $ggo = CPT::GalaxyGetOpt->new();
24 my $options = $ggo->getOptions(
25 'options' => [
26 [ 'file|f', 'Input file', { validate => 'File/Input',
27 file_format => ['genbank', 'embl', 'txt'],
28 } ],
29 [],
30 ['Track Configuration'],
31 ['track_key', 'Key to select from genbank data', { validate => 'Genomic/Tag', required => 1, multiple => 1 } ],
32 ['track_feature_filter_invert', 'Should the qualifier search be inverted?', { validate => 'Option', options => { 'yes', 'Yes', 'no', 'No' } , multiple => 1 } ],
33 ['track_feature_filter_hastag', 'Select a tag which should be present in that qualifier (e.g., signal/tmhelix/pseudo)', { validate => 'String' , multiple => 1 } ],
34 ['track_feature_filter_textquery', 'Specify text which MUST be in that tag', { validate => 'String' , multiple => 1 } ],
35 ['track_feature_filter_strand', 'Which strand should the feature appear on?', { validate => 'Option', options => { 'f', 'Forward', 'r', 'Reverse', 'a', 'Any' } , multiple => 1 } ],
36 [],
37 ['enable_gc_skew_plot', 'Enable/Disable calculation of GC Skew Plot', { validate => 'Flag' } ],
38 ['gc_skew_plot_window_size', 'Window size for calculation of GC Skew', { validate => 'Int', min => 1000, default => 10000} ],
39 ['gc_skew_plot_step_size', 'Step size for calculation of GC Skew', { validate => 'Int', min => 200, default => 200 } ],
40 ],
41 'outputs' => [
42 [
43 'output_circos_confs',
44 'Circos Configuration Files',
45 {
46 validate => 'File/Output',
47 required => 1,
48 default => 'out',
49 data_format => 'archive',
50 default_format => 'zip',
51 }
52 ],
53 ],
54 'defaults' => [
55 'appid' => 'CircularDNAPlotter',
56 'appname' => 'Circos based DNAPlotter',
57 'appdesc' => 'plots genomes similar to Artemis\'s DNAPlotter',
58 'appvers' => '1.94.1',
59 ],
60 );
61
62 #perl cpt_dnaplotter.pl \
63 #-f ../t/test-files/moon.gbk \
64 #--track_key CDS --track_feature_filter_invert yes --track_feature_filter_hastag pseudo --track_feature_filter_strand f \
65 #--track_key CDS --track_feature_filter_invert yes --track_feature_filter_hastag pseudo --track_feature_filter_strand r \
66 #--track_key CDS --track_feature_filter_hastag pseudo --track_feature_filter_strand a \
67 #--track_key tRNA --track_feature_filter_strand a \
68 #--track_key CDS --track_feature_filter_hastag signal --track_feature_filter_strand a \
69 #--track_key CDS --track_feature_filter_hastag tmhelix --track_feature_filter_strand a
70
71
72 my @reorg_args = ();
73
74 my $cum_gc_ske_mean = 0;
75
76
77
78 my %latest = ();
79 for(my $i = 0; $i < scalar(@argv_copy); $i++){
80 my $c = $argv_copy[$i];
81 # We have entered a new one block
82 if($c eq '--track_key'){
83 # If we have loaded data
84 if(scalar(keys(%latest)) > 0){
85 my %copy;
86 foreach(keys(%latest)){
87 $copy{$_} = "" . $latest{$_};
88 }
89 push(@reorg_args, \%copy);
90 }
91
92 # Clean out latest to prep for new data
93 foreach(keys(%latest)){
94 delete $latest{$_};
95 }
96 }
97
98 if($c =~ /^--track_(.*)/){
99 $latest{$1} = $argv_copy[$i+1];
100 # Artificially bump so we don't bother looking at the answer to
101 # this question. We can "get away" with this because none of
102 # the options are flags. However, I've disabled it in the event
103 # that flags ARE introduced
104 #$i++;
105 }
106 }
107 push(@reorg_args, \%latest);
108 #$VAR1 = [
109 #{
110 #'feature_filter_invert' => 'yes',
111 #'feature_plot_color' => '005500',
112 #'feature_filter_strand' => 'f',
113 #'feature_filter_hastag' => 'pseudo',
114 #'key' => 'CDS'
115 #},
116 #{
117 #'feature_filter_strand' => 'a',
118 #'key' => 'RBS'
119 #}
120 #];
121 my @files = ();
122
123 my $number_of_tracks = 0;
124 sub register_track {
125 #my $r0 = ( 90 - (10 * $number_of_tracks - 1)/2) / 100;
126 #my $r1 = ( 90 - (10 * $number_of_tracks - 9)/2) / 100;
127 my $r0 = ( 90 - (10 * $number_of_tracks - 1)/1) / 100;
128 my $r1 = ( 90 - (10 * $number_of_tracks - 9)/1) / 100;
129 $number_of_tracks++;
130 return ($r0, $r1);
131 }
132
133 sub circosconf {
134 my $cc = CPT::Circos::Conf->new();
135 $cc->include('etc/colors_fonts_patterns.conf');
136 # Features to plot along the genome
137 $cc->include('ideogram.conf');
138 # markings indicating position along genome
139 $cc->include('ticks.conf');
140 # Genome data
141 $cc->set('karyotype', 'karyotype.conf');
142 # Default image params are fine
143 $cc->start_block('image');
144 $cc->include('etc/image.conf');
145 $cc->end_block();
146 # ???
147 $cc->set('chromosome_units', '1000');
148 $cc->set('chromosome_display_default', 'yes');
149 #$cc->include('highlights.conf');
150 $cc->include('plots.conf');
151
152 $cc->include('etc/housekeeping.conf');
153 my $result = $cc->finalize();
154 $cc = CPT::Circos::Conf->new();
155 return $result;
156 }
157 sub ideogramconf{
158 my $cc = CPT::Circos::Conf->new();
159 $cc->start_block('ideogram');
160 $cc->start_block('spacing');
161 $cc->set('default','0u');
162 $cc->set('break','0u');
163 $cc->end_block();
164
165 $cc->set('thickness', '20p');
166 $cc->set('stroke_thickness', '2');
167 $cc->set('stroke_color', 'black');
168 $cc->set('fill','no');
169 $cc->set('fill_color','black');
170 $cc->set('radius','0.85r');
171 $cc->set('show_label','yes');
172 $cc->set('label_font','default');
173 $cc->set('label_radius','dims(ideogram,radius) + 0.05');
174 $cc->set('label_size','36');
175 $cc->set('label_parallel','yes');
176 $cc->set('label_case','upper');
177
178 $cc->set('band_stroke_thickness','2');
179 $cc->set('show_bands','yes');
180 $cc->set('fill_bands','yes');
181 $cc->end_block();
182
183 return $cc->finalize();
184 }
185 sub generate_feature_table {
186 my ($filename, %filter) = @_;
187 print "Filtering on features\n";
188 print Dumper \%filter;
189 my $seqio_object = Bio::SeqIO->new(-file => $options->{file}, -format=>'genbank');
190 # Only handing first sequence.
191 my $seq_object = $seqio_object->next_seq;
192 my $parent = $seq_object->display_id();
193 # Feature data
194 my @features;
195 foreach my $feat($seq_object->get_SeqFeatures()){
196 if($feat->primary_tag() eq $filter{key}){
197 # If they've said "hastag" AND we do indeed have that tag AND we haven't inverted this filter.
198 if(
199 ($filter{feature_filter_hastag} && $feat->has_tag($filter{feature_filter_hastag}) && !$filter{feature_filter_invert})
200 ||
201 ($filter{feature_filter_hastag} && $filter{feature_filter_invert} && !$feat->has_tag($filter{feature_filter_hastag}))
202 ||
203 (! $filter{feature_filter_hastag})
204 ){
205 if(
206 !$filter{feature_filter_strand}
207 ||
208 ($feat->strand() == 1 && ( $filter{feature_filter_strand} eq 'f' || $filter{feature_filter_strand} eq 'a' ))
209 ||
210 ($feat->strand() == -1 && ( $filter{feature_filter_strand} eq 'r' || $filter{feature_filter_strand} eq 'a' ))
211 ||
212 ($feat->strand() == 0 && ( $filter{feature_filter_strand} eq 'a' ))
213 ){
214 push(@features, join(' ', $parent, $feat->start, $feat->end));
215 }
216 }
217 }
218 }
219 print "Found " . scalar @features . " features \n";
220 push(@files, [ 'data/' . $filename, join("\n", @features) ] );
221 }
222 sub plotsconf{
223 my $cc = CPT::Circos::Conf->new();
224
225 $cc->start_block('plots');
226
227
228 my $idx = 0;
229 foreach(@reorg_args){
230 my %filter = %{$_};
231 #{
232 #'feature_filter_invert' => 'yes',
233 #'feature_plot_color' => '005500',
234 #'feature_filter_strand' => 'f',
235 #'feature_filter_hastag' => 'pseudo',
236 #'key' => 'CDS'
237 #},
238 $idx++;
239 my $filename = sprintf('org.features.%s.txt', $idx);
240 generate_feature_table($filename, %filter);
241
242 my ($r0,$r1) = register_track();
243 $cc->start_block('plot');
244 $cc->set('type','tile');
245 $cc->set('file',$filename);
246 $cc->set('orientation', 'center');
247 $cc->set('thickness', '30');
248 $cc->set('r1', $r1 . 'r');# '0.78r');
249 $cc->set('r0', $r0 . 'r');# '0.72r');
250 $cc->set('layers','3');
251 $cc->set('layers_overflow','collapse');
252 $cc->set('color','paired-6-qual-' . $idx);
253 $cc->end_block();
254 }
255
256 if($options->{enable_gc_skew_plot}){
257 my ($r0,$r1) = register_track();
258 $cc->start_block('plot');
259 $cc->set('type','histogram');
260 $cc->set('file','gc.txt');
261 $cc->set('r1',$r1 . 'r');
262 $cc->set('r0',$r0 . 'r');
263 $cc->set('fill_color','purple');
264 $cc->set('orientation','in');
265 $cc->start_block('rules');
266 $cc->start_block('rule');
267 $cc->set('condition','var(value) < 0');
268 $cc->set('fill_color', 'green');
269 $cc->end_block();
270 $cc->end_block();
271 $cc->end_block();
272 }
273
274 #$cc->start_block('plot');
275 #$cc->set('type','histogram');
276 #$cc->set('file','gc_cumulative.txt');
277 #$cc->set('r1','0.6r');
278 #$cc->set('r0','0.55r');
279 #$cc->set('fill_color','purple');
280 #$cc->set('orientation','out');
281 #$cc->start_block('rules');
282 #$cc->start_block('rule');
283 #$cc->set('condition','var(value) < ' . $cum_gc_ske_mean);
284 #$cc->set('fill_color', 'green');
285 #$cc->end_block();
286 #$cc->end_block();
287 #$cc->end_block();
288
289 $cc->end_block();
290 return $cc->finalize();
291 }
292 sub ticksconf{
293 my $cc = CPT::Circos::Conf->new();
294
295 $cc->set('show_ticks','yes');
296 $cc->set('show_tick_labels','yes');
297 $cc->start_block('ticks');
298 $cc->set('radius','1r');
299 $cc->set('color','black');
300 $cc->set('thickness','2p');
301 $cc->set('multiplier','1e-3');
302 $cc->set('format','%d');
303
304 $cc->start_block('tick');
305 $cc->set('spacing','1000u');
306 $cc->set('size','10p');
307 $cc->end_block();
308
309 $cc->start_block('tick');
310 $cc->set('spacing','10000u');
311 $cc->set('size','15p');
312 $cc->set('show_label','yes');
313 $cc->set('label_size','20p');
314 $cc->set('label_offset','10p');
315 $cc->set('format','%d');
316 $cc->end_block();
317 $cc->end_block();
318 return $cc->finalize();
319 }
320 sub karyotype {
321 my @karyotype_data = ();
322 my $seqio_object = Bio::SeqIO->new(-file => $options->{file}, -format=>'genbank');
323 # Only handing first sequence.
324 my $seq_object = $seqio_object->next_seq;
325 # Main 'chromosome' data
326 push(@karyotype_data, join(' ', 'chr', '-',$seq_object->display_id(),$seq_object->display_id(),0, $seq_object->length(),'black'));
327
328 return join("\n", @karyotype_data);
329 }
330
331 sub gcgraph_cumulative {
332 my @gcdata = ();
333 my $seqio_object = Bio::SeqIO->new(-file => $options->{file}, -format=>'genbank');
334 # Only handing first sequence.
335 my $seq_object = $seqio_object->next_seq;
336
337 my $parent = $seq_object->display_id();
338
339 my $seq = $seq_object->seq();
340 my $sep = int($options->{gc_skew_plot_window_size}/2);
341 my $stepsep = int($options->{gc_skew_plot_step_size}/2);
342 my $cumulative_gc_skew = 0;
343 my @cumgc_vals;
344
345 my $count = 0;
346 foreach(my $i = $sep; $i < $seq_object->length() - $sep - $options->{gc_skew_plot_step_size}; $i += $options->{gc_skew_plot_step_size}){
347 $count++;
348 $cumulative_gc_skew += _calculate_gc_skew_for_seq(substr($seq,$i-$sep,$options->{gc_skew_plot_window_size})),
349 push(@cumgc_vals, $cumulative_gc_skew);
350 push(@gcdata, join(" ",
351 $parent,
352 $i - $stepsep,
353 $i + $stepsep,
354 $cumulative_gc_skew
355 ));
356 }
357
358 my $sum = 0;
359 foreach(@cumgc_vals){$sum += $_;}
360 $cum_gc_ske_mean = $sum / $count;
361
362 return join("\n", @gcdata);
363 # Main 'chromosome' data
364 }
365 sub gcgraph {
366 my @gcdata = ();
367 my $seqio_object = Bio::SeqIO->new(-file => $options->{file}, -format=>'genbank');
368 # Only handing first sequence.
369 my $seq_object = $seqio_object->next_seq;
370
371 my $parent = $seq_object->display_id();
372
373 my $seq = $seq_object->seq();
374 my $sep = int($options->{gc_skew_plot_window_size}/2);
375 my $stepsep = int($options->{gc_skew_plot_step_size}/2);
376 foreach(my $i = $sep; $i < $seq_object->length() - $sep - $options->{gc_skew_plot_step_size}; $i += $options->{gc_skew_plot_step_size}){
377 push(@gcdata, join(" ",
378 $parent,
379 $i - $stepsep,
380 $i + $stepsep,
381 _calculate_gc_skew_for_seq(substr($seq,$i-$sep,$options->{gc_skew_plot_window_size})),
382 ));
383 }
384 return join("\n", @gcdata);
385 # Main 'chromosome' data
386 }
387 sub _calculate_gc_skew_for_seq {
388 my ($seq) = @_;
389 $seq = uc($seq);
390 my %counts;
391 foreach(split //,$seq){
392 $counts{$_}++;
393 }
394 if($counts{G} + $counts{C} > 0){
395 return ($counts{G} - $counts{C}) / ($counts{G} + $counts{C});
396 }
397 return 0;
398 }
399
400 push(@files, [ 'etc/karyotype.conf', karyotype() ]);
401 push(@files, [ 'etc/circos.conf', circosconf() ]);
402 push(@files, [ 'etc/ideogram.conf', ideogramconf() ]);
403 if($options->{enable_gc_skew_plot}){
404 push(@files, [ 'data/gc.txt', gcgraph() ]);
405 push(@files, [ 'data/gc_cumulative.txt', gcgraph_cumulative() ]);
406 }
407
408 push(@files, [ 'etc/plots.conf', plotsconf() ]);
409 push(@files, [ 'etc/ticks.conf', ticksconf() ]);
410
411
412 use Archive::Any::Create;
413 my $archive = Archive::Any::Create->new();
414 $archive->container('conf');
415 foreach(@files){
416 my ($location, $content) = @{$_};
417 $archive->add_file($location, $content);
418 }
419
420 use CPT::OutputFiles;
421 my $crr_output = CPT::OutputFiles->new(
422 name => 'output_circos_confs',
423 GGO => $ggo,
424 );
425 $crr_output->CRR(data => $archive);
426
427 =head1 NAME
428
429 DNAPlotter
430
431 =head1 DESCRIPTION
432
433 Much like artemis's DNAPlotter, this tool plots genomes in a ciruclar fashion, and can plot gc-deviation tracks as well. The options are somewhat reduced compared to artemis, so if you need something that isn't available in this version please file a bug report.
434
435 =head1 USE
436
437 Each track has several parameters:
438
439 =over 4
440
441 =item track_key
442
443 This selects a set of features from a genbank file, e.g., CDSs or tRNAs
444
445 =item track_feature_filter_invert
446
447 This parameter will invert (negate) the results of whatever query parameters you specify after it.
448
449 =item track_feature_filter_hastag
450
451 Require that a feature has a specific tag....
452
453 =item track_feature_filter_textquery
454
455 ...with this specific text in it
456
457 =item track_feature_filter_strand
458
459 Which strand should the feature be on (not inverted)
460
461 =back
462
463 Additionally, users are able to enable/disable GC skew plots. it's suggested that these are generally left alone, as they can quickly increase runtime.
464
465 =cut