0
|
1 #!/usr/bin/perl
|
|
2 #
|
|
3 # Code written by Eric Rasche
|
|
4 # mailto:rasche.eric@yandex.ru
|
|
5 # tel: 404.692.2048
|
|
6 # http://eric.rasche.co.uk
|
|
7 # for
|
|
8 # Center for Phage Technology
|
|
9 #
|
|
10
|
|
11 use strict;
|
|
12 use warnings;
|
|
13
|
|
14 use CPT::GalaxyGetOpt;
|
|
15 use Data::Dumper;
|
|
16 use CPT::Bio;
|
|
17 use CPT::Circos::Conf;
|
|
18 my $bio = CPT::Bio->new();
|
|
19
|
|
20 my @argv_copy;
|
|
21 foreach(@ARGV){push(@argv_copy, "$_");}
|
|
22
|
|
23 my $ggo = CPT::GalaxyGetOpt->new();
|
|
24 my $options = $ggo->getOptions(
|
|
25 'options' => [
|
|
26 [ 'file|f', 'Input file', { validate => 'File/Input',
|
|
27 file_format => ['genbank', 'embl', 'txt'],
|
|
28 } ],
|
|
29 [],
|
|
30 ['Track Configuration'],
|
|
31 ['track_key', 'Key to select from genbank data', { validate => 'Genomic/Tag', required => 1, multiple => 1 } ],
|
|
32 ['track_feature_filter_invert', 'Should the qualifier search be inverted?', { validate => 'Option', options => { 'yes', 'Yes', 'no', 'No' } , multiple => 1 } ],
|
|
33 ['track_feature_filter_hastag', 'Select a tag which should be present in that qualifier (e.g., signal/tmhelix/pseudo)', { validate => 'String' , multiple => 1 } ],
|
|
34 ['track_feature_filter_textquery', 'Specify text which MUST be in that tag', { validate => 'String' , multiple => 1 } ],
|
|
35 ['track_feature_filter_strand', 'Which strand should the feature appear on?', { validate => 'Option', options => { 'f', 'Forward', 'r', 'Reverse', 'a', 'Any' } , multiple => 1 } ],
|
|
36 [],
|
|
37 ['enable_gc_skew_plot', 'Enable/Disable calculation of GC Skew Plot', { validate => 'Flag' } ],
|
|
38 ['gc_skew_plot_window_size', 'Window size for calculation of GC Skew', { validate => 'Int', min => 1000, default => 10000} ],
|
|
39 ['gc_skew_plot_step_size', 'Step size for calculation of GC Skew', { validate => 'Int', min => 200, default => 200 } ],
|
|
40 ],
|
|
41 'outputs' => [
|
|
42 [
|
|
43 'output_circos_confs',
|
|
44 'Circos Configuration Files',
|
|
45 {
|
|
46 validate => 'File/Output',
|
|
47 required => 1,
|
|
48 default => 'out',
|
|
49 data_format => 'archive',
|
|
50 default_format => 'zip',
|
|
51 }
|
|
52 ],
|
|
53 ],
|
|
54 'defaults' => [
|
|
55 'appid' => 'CircularDNAPlotter',
|
|
56 'appname' => 'Circos based DNAPlotter',
|
|
57 'appdesc' => 'plots genomes similar to Artemis\'s DNAPlotter',
|
|
58 'appvers' => '1.94.1',
|
|
59 ],
|
|
60 );
|
|
61
|
|
62 #perl cpt_dnaplotter.pl \
|
|
63 #-f ../t/test-files/moon.gbk \
|
|
64 #--track_key CDS --track_feature_filter_invert yes --track_feature_filter_hastag pseudo --track_feature_filter_strand f \
|
|
65 #--track_key CDS --track_feature_filter_invert yes --track_feature_filter_hastag pseudo --track_feature_filter_strand r \
|
|
66 #--track_key CDS --track_feature_filter_hastag pseudo --track_feature_filter_strand a \
|
|
67 #--track_key tRNA --track_feature_filter_strand a \
|
|
68 #--track_key CDS --track_feature_filter_hastag signal --track_feature_filter_strand a \
|
|
69 #--track_key CDS --track_feature_filter_hastag tmhelix --track_feature_filter_strand a
|
|
70
|
|
71
|
|
72 my @reorg_args = ();
|
|
73
|
|
74 my $cum_gc_ske_mean = 0;
|
|
75
|
|
76
|
|
77
|
|
78 my %latest = ();
|
|
79 for(my $i = 0; $i < scalar(@argv_copy); $i++){
|
|
80 my $c = $argv_copy[$i];
|
|
81 # We have entered a new one block
|
|
82 if($c eq '--track_key'){
|
|
83 # If we have loaded data
|
|
84 if(scalar(keys(%latest)) > 0){
|
|
85 my %copy;
|
|
86 foreach(keys(%latest)){
|
|
87 $copy{$_} = "" . $latest{$_};
|
|
88 }
|
|
89 push(@reorg_args, \%copy);
|
|
90 }
|
|
91
|
|
92 # Clean out latest to prep for new data
|
|
93 foreach(keys(%latest)){
|
|
94 delete $latest{$_};
|
|
95 }
|
|
96 }
|
|
97
|
|
98 if($c =~ /^--track_(.*)/){
|
|
99 $latest{$1} = $argv_copy[$i+1];
|
|
100 # Artificially bump so we don't bother looking at the answer to
|
|
101 # this question. We can "get away" with this because none of
|
|
102 # the options are flags. However, I've disabled it in the event
|
|
103 # that flags ARE introduced
|
|
104 #$i++;
|
|
105 }
|
|
106 }
|
|
107 push(@reorg_args, \%latest);
|
|
108 #$VAR1 = [
|
|
109 #{
|
|
110 #'feature_filter_invert' => 'yes',
|
|
111 #'feature_plot_color' => '005500',
|
|
112 #'feature_filter_strand' => 'f',
|
|
113 #'feature_filter_hastag' => 'pseudo',
|
|
114 #'key' => 'CDS'
|
|
115 #},
|
|
116 #{
|
|
117 #'feature_filter_strand' => 'a',
|
|
118 #'key' => 'RBS'
|
|
119 #}
|
|
120 #];
|
|
121 my @files = ();
|
|
122
|
|
123 my $number_of_tracks = 0;
|
|
124 sub register_track {
|
|
125 #my $r0 = ( 90 - (10 * $number_of_tracks - 1)/2) / 100;
|
|
126 #my $r1 = ( 90 - (10 * $number_of_tracks - 9)/2) / 100;
|
|
127 my $r0 = ( 90 - (10 * $number_of_tracks - 1)/1) / 100;
|
|
128 my $r1 = ( 90 - (10 * $number_of_tracks - 9)/1) / 100;
|
|
129 $number_of_tracks++;
|
|
130 return ($r0, $r1);
|
|
131 }
|
|
132
|
|
133 sub circosconf {
|
|
134 my $cc = CPT::Circos::Conf->new();
|
|
135 $cc->include('etc/colors_fonts_patterns.conf');
|
|
136 # Features to plot along the genome
|
|
137 $cc->include('ideogram.conf');
|
|
138 # markings indicating position along genome
|
|
139 $cc->include('ticks.conf');
|
|
140 # Genome data
|
|
141 $cc->set('karyotype', 'karyotype.conf');
|
|
142 # Default image params are fine
|
|
143 $cc->start_block('image');
|
|
144 $cc->include('etc/image.conf');
|
|
145 $cc->end_block();
|
|
146 # ???
|
|
147 $cc->set('chromosome_units', '1000');
|
|
148 $cc->set('chromosome_display_default', 'yes');
|
|
149 #$cc->include('highlights.conf');
|
|
150 $cc->include('plots.conf');
|
|
151
|
|
152 $cc->include('etc/housekeeping.conf');
|
|
153 my $result = $cc->finalize();
|
|
154 $cc = CPT::Circos::Conf->new();
|
|
155 return $result;
|
|
156 }
|
|
157 sub ideogramconf{
|
|
158 my $cc = CPT::Circos::Conf->new();
|
|
159 $cc->start_block('ideogram');
|
|
160 $cc->start_block('spacing');
|
|
161 $cc->set('default','0u');
|
|
162 $cc->set('break','0u');
|
|
163 $cc->end_block();
|
|
164
|
|
165 $cc->set('thickness', '20p');
|
|
166 $cc->set('stroke_thickness', '2');
|
|
167 $cc->set('stroke_color', 'black');
|
|
168 $cc->set('fill','no');
|
|
169 $cc->set('fill_color','black');
|
|
170 $cc->set('radius','0.85r');
|
|
171 $cc->set('show_label','yes');
|
|
172 $cc->set('label_font','default');
|
|
173 $cc->set('label_radius','dims(ideogram,radius) + 0.05');
|
|
174 $cc->set('label_size','36');
|
|
175 $cc->set('label_parallel','yes');
|
|
176 $cc->set('label_case','upper');
|
|
177
|
|
178 $cc->set('band_stroke_thickness','2');
|
|
179 $cc->set('show_bands','yes');
|
|
180 $cc->set('fill_bands','yes');
|
|
181 $cc->end_block();
|
|
182
|
|
183 return $cc->finalize();
|
|
184 }
|
|
185 sub generate_feature_table {
|
|
186 my ($filename, %filter) = @_;
|
|
187 print "Filtering on features\n";
|
|
188 print Dumper \%filter;
|
|
189 my $seqio_object = Bio::SeqIO->new(-file => $options->{file}, -format=>'genbank');
|
|
190 # Only handing first sequence.
|
|
191 my $seq_object = $seqio_object->next_seq;
|
|
192 my $parent = $seq_object->display_id();
|
|
193 # Feature data
|
|
194 my @features;
|
|
195 foreach my $feat($seq_object->get_SeqFeatures()){
|
|
196 if($feat->primary_tag() eq $filter{key}){
|
|
197 # If they've said "hastag" AND we do indeed have that tag AND we haven't inverted this filter.
|
|
198 if(
|
|
199 ($filter{feature_filter_hastag} && $feat->has_tag($filter{feature_filter_hastag}) && !$filter{feature_filter_invert})
|
|
200 ||
|
|
201 ($filter{feature_filter_hastag} && $filter{feature_filter_invert} && !$feat->has_tag($filter{feature_filter_hastag}))
|
|
202 ||
|
|
203 (! $filter{feature_filter_hastag})
|
|
204 ){
|
|
205 if(
|
|
206 !$filter{feature_filter_strand}
|
|
207 ||
|
|
208 ($feat->strand() == 1 && ( $filter{feature_filter_strand} eq 'f' || $filter{feature_filter_strand} eq 'a' ))
|
|
209 ||
|
|
210 ($feat->strand() == -1 && ( $filter{feature_filter_strand} eq 'r' || $filter{feature_filter_strand} eq 'a' ))
|
|
211 ||
|
|
212 ($feat->strand() == 0 && ( $filter{feature_filter_strand} eq 'a' ))
|
|
213 ){
|
|
214 push(@features, join(' ', $parent, $feat->start, $feat->end));
|
|
215 }
|
|
216 }
|
|
217 }
|
|
218 }
|
|
219 print "Found " . scalar @features . " features \n";
|
|
220 push(@files, [ 'data/' . $filename, join("\n", @features) ] );
|
|
221 }
|
|
222 sub plotsconf{
|
|
223 my $cc = CPT::Circos::Conf->new();
|
|
224
|
|
225 $cc->start_block('plots');
|
|
226
|
|
227
|
|
228 my $idx = 0;
|
|
229 foreach(@reorg_args){
|
|
230 my %filter = %{$_};
|
|
231 #{
|
|
232 #'feature_filter_invert' => 'yes',
|
|
233 #'feature_plot_color' => '005500',
|
|
234 #'feature_filter_strand' => 'f',
|
|
235 #'feature_filter_hastag' => 'pseudo',
|
|
236 #'key' => 'CDS'
|
|
237 #},
|
|
238 $idx++;
|
|
239 my $filename = sprintf('org.features.%s.txt', $idx);
|
|
240 generate_feature_table($filename, %filter);
|
|
241
|
|
242 my ($r0,$r1) = register_track();
|
|
243 $cc->start_block('plot');
|
|
244 $cc->set('type','tile');
|
|
245 $cc->set('file',$filename);
|
|
246 $cc->set('orientation', 'center');
|
|
247 $cc->set('thickness', '30');
|
|
248 $cc->set('r1', $r1 . 'r');# '0.78r');
|
|
249 $cc->set('r0', $r0 . 'r');# '0.72r');
|
|
250 $cc->set('layers','3');
|
|
251 $cc->set('layers_overflow','collapse');
|
|
252 $cc->set('color','paired-6-qual-' . $idx);
|
|
253 $cc->end_block();
|
|
254 }
|
|
255
|
|
256 if($options->{enable_gc_skew_plot}){
|
|
257 my ($r0,$r1) = register_track();
|
|
258 $cc->start_block('plot');
|
|
259 $cc->set('type','histogram');
|
|
260 $cc->set('file','gc.txt');
|
|
261 $cc->set('r1',$r1 . 'r');
|
|
262 $cc->set('r0',$r0 . 'r');
|
|
263 $cc->set('fill_color','purple');
|
|
264 $cc->set('orientation','in');
|
|
265 $cc->start_block('rules');
|
|
266 $cc->start_block('rule');
|
|
267 $cc->set('condition','var(value) < 0');
|
|
268 $cc->set('fill_color', 'green');
|
|
269 $cc->end_block();
|
|
270 $cc->end_block();
|
|
271 $cc->end_block();
|
|
272 }
|
|
273
|
|
274 #$cc->start_block('plot');
|
|
275 #$cc->set('type','histogram');
|
|
276 #$cc->set('file','gc_cumulative.txt');
|
|
277 #$cc->set('r1','0.6r');
|
|
278 #$cc->set('r0','0.55r');
|
|
279 #$cc->set('fill_color','purple');
|
|
280 #$cc->set('orientation','out');
|
|
281 #$cc->start_block('rules');
|
|
282 #$cc->start_block('rule');
|
|
283 #$cc->set('condition','var(value) < ' . $cum_gc_ske_mean);
|
|
284 #$cc->set('fill_color', 'green');
|
|
285 #$cc->end_block();
|
|
286 #$cc->end_block();
|
|
287 #$cc->end_block();
|
|
288
|
|
289 $cc->end_block();
|
|
290 return $cc->finalize();
|
|
291 }
|
|
292 sub ticksconf{
|
|
293 my $cc = CPT::Circos::Conf->new();
|
|
294
|
|
295 $cc->set('show_ticks','yes');
|
|
296 $cc->set('show_tick_labels','yes');
|
|
297 $cc->start_block('ticks');
|
|
298 $cc->set('radius','1r');
|
|
299 $cc->set('color','black');
|
|
300 $cc->set('thickness','2p');
|
|
301 $cc->set('multiplier','1e-3');
|
|
302 $cc->set('format','%d');
|
|
303
|
|
304 $cc->start_block('tick');
|
|
305 $cc->set('spacing','1000u');
|
|
306 $cc->set('size','10p');
|
|
307 $cc->end_block();
|
|
308
|
|
309 $cc->start_block('tick');
|
|
310 $cc->set('spacing','10000u');
|
|
311 $cc->set('size','15p');
|
|
312 $cc->set('show_label','yes');
|
|
313 $cc->set('label_size','20p');
|
|
314 $cc->set('label_offset','10p');
|
|
315 $cc->set('format','%d');
|
|
316 $cc->end_block();
|
|
317 $cc->end_block();
|
|
318 return $cc->finalize();
|
|
319 }
|
|
320 sub karyotype {
|
|
321 my @karyotype_data = ();
|
|
322 my $seqio_object = Bio::SeqIO->new(-file => $options->{file}, -format=>'genbank');
|
|
323 # Only handing first sequence.
|
|
324 my $seq_object = $seqio_object->next_seq;
|
|
325 # Main 'chromosome' data
|
|
326 push(@karyotype_data, join(' ', 'chr', '-',$seq_object->display_id(),$seq_object->display_id(),0, $seq_object->length(),'black'));
|
|
327
|
|
328 return join("\n", @karyotype_data);
|
|
329 }
|
|
330
|
|
331 sub gcgraph_cumulative {
|
|
332 my @gcdata = ();
|
|
333 my $seqio_object = Bio::SeqIO->new(-file => $options->{file}, -format=>'genbank');
|
|
334 # Only handing first sequence.
|
|
335 my $seq_object = $seqio_object->next_seq;
|
|
336
|
|
337 my $parent = $seq_object->display_id();
|
|
338
|
|
339 my $seq = $seq_object->seq();
|
|
340 my $sep = int($options->{gc_skew_plot_window_size}/2);
|
|
341 my $stepsep = int($options->{gc_skew_plot_step_size}/2);
|
|
342 my $cumulative_gc_skew = 0;
|
|
343 my @cumgc_vals;
|
|
344
|
|
345 my $count = 0;
|
|
346 foreach(my $i = $sep; $i < $seq_object->length() - $sep - $options->{gc_skew_plot_step_size}; $i += $options->{gc_skew_plot_step_size}){
|
|
347 $count++;
|
|
348 $cumulative_gc_skew += _calculate_gc_skew_for_seq(substr($seq,$i-$sep,$options->{gc_skew_plot_window_size})),
|
|
349 push(@cumgc_vals, $cumulative_gc_skew);
|
|
350 push(@gcdata, join(" ",
|
|
351 $parent,
|
|
352 $i - $stepsep,
|
|
353 $i + $stepsep,
|
|
354 $cumulative_gc_skew
|
|
355 ));
|
|
356 }
|
|
357
|
|
358 my $sum = 0;
|
|
359 foreach(@cumgc_vals){$sum += $_;}
|
|
360 $cum_gc_ske_mean = $sum / $count;
|
|
361
|
|
362 return join("\n", @gcdata);
|
|
363 # Main 'chromosome' data
|
|
364 }
|
|
365 sub gcgraph {
|
|
366 my @gcdata = ();
|
|
367 my $seqio_object = Bio::SeqIO->new(-file => $options->{file}, -format=>'genbank');
|
|
368 # Only handing first sequence.
|
|
369 my $seq_object = $seqio_object->next_seq;
|
|
370
|
|
371 my $parent = $seq_object->display_id();
|
|
372
|
|
373 my $seq = $seq_object->seq();
|
|
374 my $sep = int($options->{gc_skew_plot_window_size}/2);
|
|
375 my $stepsep = int($options->{gc_skew_plot_step_size}/2);
|
|
376 foreach(my $i = $sep; $i < $seq_object->length() - $sep - $options->{gc_skew_plot_step_size}; $i += $options->{gc_skew_plot_step_size}){
|
|
377 push(@gcdata, join(" ",
|
|
378 $parent,
|
|
379 $i - $stepsep,
|
|
380 $i + $stepsep,
|
|
381 _calculate_gc_skew_for_seq(substr($seq,$i-$sep,$options->{gc_skew_plot_window_size})),
|
|
382 ));
|
|
383 }
|
|
384 return join("\n", @gcdata);
|
|
385 # Main 'chromosome' data
|
|
386 }
|
|
387 sub _calculate_gc_skew_for_seq {
|
|
388 my ($seq) = @_;
|
|
389 $seq = uc($seq);
|
|
390 my %counts;
|
|
391 foreach(split //,$seq){
|
|
392 $counts{$_}++;
|
|
393 }
|
|
394 if($counts{G} + $counts{C} > 0){
|
|
395 return ($counts{G} - $counts{C}) / ($counts{G} + $counts{C});
|
|
396 }
|
|
397 return 0;
|
|
398 }
|
|
399
|
|
400 push(@files, [ 'etc/karyotype.conf', karyotype() ]);
|
|
401 push(@files, [ 'etc/circos.conf', circosconf() ]);
|
|
402 push(@files, [ 'etc/ideogram.conf', ideogramconf() ]);
|
|
403 if($options->{enable_gc_skew_plot}){
|
|
404 push(@files, [ 'data/gc.txt', gcgraph() ]);
|
|
405 push(@files, [ 'data/gc_cumulative.txt', gcgraph_cumulative() ]);
|
|
406 }
|
|
407
|
|
408 push(@files, [ 'etc/plots.conf', plotsconf() ]);
|
|
409 push(@files, [ 'etc/ticks.conf', ticksconf() ]);
|
|
410
|
|
411
|
|
412 use Archive::Any::Create;
|
|
413 my $archive = Archive::Any::Create->new();
|
|
414 $archive->container('conf');
|
|
415 foreach(@files){
|
|
416 my ($location, $content) = @{$_};
|
|
417 $archive->add_file($location, $content);
|
|
418 }
|
|
419
|
|
420 use CPT::OutputFiles;
|
|
421 my $crr_output = CPT::OutputFiles->new(
|
|
422 name => 'output_circos_confs',
|
|
423 GGO => $ggo,
|
|
424 );
|
|
425 $crr_output->CRR(data => $archive);
|
|
426
|
|
427 =head1 NAME
|
|
428
|
|
429 DNAPlotter
|
|
430
|
|
431 =head1 DESCRIPTION
|
|
432
|
|
433 Much like artemis's DNAPlotter, this tool plots genomes in a ciruclar fashion, and can plot gc-deviation tracks as well. The options are somewhat reduced compared to artemis, so if you need something that isn't available in this version please file a bug report.
|
|
434
|
|
435 =head1 USE
|
|
436
|
|
437 Each track has several parameters:
|
|
438
|
|
439 =over 4
|
|
440
|
|
441 =item track_key
|
|
442
|
|
443 This selects a set of features from a genbank file, e.g., CDSs or tRNAs
|
|
444
|
|
445 =item track_feature_filter_invert
|
|
446
|
|
447 This parameter will invert (negate) the results of whatever query parameters you specify after it.
|
|
448
|
|
449 =item track_feature_filter_hastag
|
|
450
|
|
451 Require that a feature has a specific tag....
|
|
452
|
|
453 =item track_feature_filter_textquery
|
|
454
|
|
455 ...with this specific text in it
|
|
456
|
|
457 =item track_feature_filter_strand
|
|
458
|
|
459 Which strand should the feature be on (not inverted)
|
|
460
|
|
461 =back
|
|
462
|
|
463 Additionally, users are able to enable/disable GC skew plots. it's suggested that these are generally left alone, as they can quickly increase runtime.
|
|
464
|
|
465 =cut
|