comparison weblogolib/__init__.py @ 0:c55bdc2fb9fa

Uploaded
author davidmurphy
date Thu, 27 Oct 2011 12:09:09 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c55bdc2fb9fa
1 #!/usr/bin/env python
2
3 # -------------------------------- WebLogo --------------------------------
4
5 # Copyright (c) 2003-2004 The Regents of the University of California.
6 # Copyright (c) 2005 Gavin E. Crooks
7 # Copyright (c) 2006, The Regents of the University of California, through print
8 # Lawrence Berkeley National Laboratory (subject to receipt of any required
9 # approvals from the U.S. Dept. of Energy). All rights reserved.
10
11 # This software is distributed under the new BSD Open Source License.
12 # <http://www.opensource.org/licenses/bsd-license.html>
13 #
14 # Redistribution and use in source and binary forms, with or without
15 # modification, are permitted provided that the following conditions are met:
16 #
17 # (1) Redistributions of source code must retain the above copyright notice,
18 # this list of conditions and the following disclaimer.
19 #
20 # (2) Redistributions in binary form must reproduce the above copyright
21 # notice, this list of conditions and the following disclaimer in the
22 # documentation and or other materials provided with the distribution.
23 #
24 # (3) Neither the name of the University of California, Lawrence Berkeley
25 # National Laboratory, U.S. Dept. of Energy nor the names of its contributors
26 # may be used to endorse or promote products derived from this software
27 # without specific prior written permission.
28 #
29 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39 # POSSIBILITY OF SUCH DAMAGE.
40
41 # Replicates README.txt
42
43 """
44 WebLogo (http://code.google.com/p/weblogo/) is a tool for creating sequence
45 logos from biological sequence alignments. It can be run on the command line,
46 as a standalone webserver, as a CGI webapp, or as a python library.
47
48 The main WebLogo webserver is located at http://bespoke.lbl.gov/weblogo/
49
50
51 Codonlogo is based on Weblogo.
52
53 Please consult the manual for installation instructions and more information:
54 (Also located in the weblogolib/htdocs subdirectory.)
55
56 For help on the command line interface run
57 ./codonlogo --help
58
59 To build a simple logo run
60 ./codonlogo < cap.fa > logo0.eps
61
62 To run as a standalone webserver at localhost:8080
63 ./codonlogo --server
64
65 To create a logo in python code:
66 >>> from weblogolib import *
67 >>> fin = open('cap.fa')
68 >>> seqs = read_seq_data(fin)
69 >>> data = LogoData.from_seqs(seqs)
70 >>> options = LogoOptions()
71 >>> options.title = "A Logo Title"
72 >>> format = LogoFormat(data, options)
73 >>> fout = open('cap.eps', 'w')
74 >>> eps_formatter( data, format, fout)
75
76
77 -- Distribution and Modification --
78 This package is distributed under the new BSD Open Source License.
79 Please see the LICENSE.txt file for details on copyright and licensing.
80 The WebLogo source code can be downloaded from http://code.google.com/p/weblogo/
81 WebLogo requires Python 2.3, 2.4 or 2.5, the corebio python toolkit for computational
82 biology (http://code.google.com/p/corebio), and the python array package
83 'numpy' (http://www.scipy.org/Download)
84
85 # -------------------------------- CodonLogo --------------------------------
86
87
88 """
89
90 from math import *
91 import random
92 from itertools import izip, count
93 import sys
94 import copy
95 import os
96 from itertools import product
97 from datetime import datetime
98 from StringIO import StringIO
99
100
101
102 from corebio.data import rna_letters, dna_letters, amino_acid_letters
103 import random
104
105 # python2.3 compatability
106 from corebio._future import Template
107 from corebio._future.subprocess import *
108 from corebio._future import resource_string, resource_filename
109
110
111 from math import log, sqrt
112
113 # Avoid 'from numpy import *' since numpy has lots of names defined
114 from numpy import array, asarray, float64, ones, zeros, int32,all,any, shape
115 import numpy as na
116
117 from corebio.utils.deoptparse import DeOptionParser
118 from optparse import OptionGroup
119
120 from color import *
121 from colorscheme import *
122 from corebio.seq import Alphabet, Seq, SeqList
123 from corebio import seq_io
124 from corebio.utils import isfloat, find_command, ArgumentError
125 from corebio.moremath import *
126 from corebio.data import amino_acid_composition
127 from corebio.seq import unambiguous_rna_alphabet, unambiguous_dna_alphabet, unambiguous_protein_alphabet
128
129 codon_alphabetU=['AAA', 'AAC', 'AAG', 'AAU', 'ACA', 'ACC', 'ACG', 'ACU', 'AGA', 'AGC', 'AGG', 'AGU', 'AUA', 'AUC', 'AUG', 'AUU', 'CAA', 'CAC', 'CAG', 'CAU', 'CCA', 'CCC', 'CCG', 'CCU', 'CGA', 'CGC', 'CGG', 'CGU', 'CUA', 'CUC', 'CUG', 'CUU', 'GAA', 'GAC', 'GAG', 'GAU', 'GCA', 'GCC', 'GCG', 'GCU', 'GGA', 'GGC', 'GGG', 'GGU', 'GUA', 'GUC', 'GUG', 'GUU', 'UAA', 'UAC', 'UAG', 'UAU', 'UCA', 'UCC', 'UCG', 'UCU', 'UGA', 'UGC', 'UGG', 'UGU', 'UUA', 'UUC', 'UUG', 'UUU']
130 codon_alphabetT=['AAA', 'AAC', 'AAG', 'AAT', 'ACA', 'ACC', 'ACG', 'ACT', 'AGA', 'AGC', 'AGG', 'AGT', 'ATA', 'ATC', 'ATG', 'ATT', 'CAA', 'CAC', 'CAG', 'CAT', 'CCA', 'CCC', 'CCG', 'CCT', 'CGA', 'CGC', 'CGG', 'CGT', 'CTA', 'CTC', 'CTG', 'CTT', 'GAA', 'GAC', 'GAG', 'GAT', 'GCA', 'GCC', 'GCG', 'GCT', 'GGA', 'GGC', 'GGG', 'GGT', 'GTA', 'GTC', 'GTG', 'GTT', 'TAA', 'TAC', 'TAG', 'TAT', 'TCA', 'TCC', 'TCG', 'TCT', 'TGA', 'TGC', 'TGG', 'TGT', 'TTA', 'TTC', 'TTG', 'TTT']
131
132 altype="codonsT"
133 offset=0
134 col=[]
135
136
137
138 __all__ = ['LogoSize',
139 'LogoOptions',
140 'description',
141 '__version__',
142 'LogoFormat',
143 'LogoData',
144 'Dirichlet',
145 'GhostscriptAPI',
146 'std_color_schemes',
147 'default_color_schemes',
148 'classic',
149 'std_units',
150 'std_sizes',
151 'std_alphabets',
152 'std_percentCG',
153 'pdf_formatter',
154 'jpeg_formatter',
155 'png_formatter',
156 'png_print_formatter',
157 'txt_formatter',
158 'eps_formatter',
159 'formatters',
160 'default_formatter',
161 'base_distribution',
162 'equiprobable_distribution',
163 'read_seq_data',
164 'which_alphabet',
165 'color',
166 'colorscheme',
167 ]
168
169 description = "Create sequence logos from biological sequence alignments."
170
171 __version__ = "1.0"
172
173 # These keywords are subsituted by subversion.
174 # The date and revision will only tell the truth after a branch or tag,
175 # since different files in trunk will have been changed at different times
176 release_date ="$Date: 2011-09-17 16:30:00 -0700 (Tue, 14 Oct 2008) $".split()[1]
177 release_build = "$Revision: 53 $".split()[1]
178 release_description = "CodonLogo %s (%s)" % (__version__, release_date)
179
180
181
182 def cgi(htdocs_directory) :
183 import weblogolib._cgi
184 weblogolib._cgi.main(htdocs_directory)
185
186 class GhostscriptAPI(object) :
187 """Interface to the command line program Ghostscript ('gs')"""
188
189 formats = ('png', 'pdf', 'jpeg')
190
191 def __init__(self, path=None) :
192 try:
193 command = find_command('gs', path=path)
194 except EnvironmentError:
195 try:
196 command = find_command('gswin32c.exe', path=path)
197 except EnvironmentError:
198 raise EnvironmentError("Could not find Ghostscript on path."
199 " There should be either a gs executable or a gswin32c.exe on your system's path")
200
201 self.command = command
202
203 def version(self) :
204 args = [self.command, '--version']
205 try :
206 p = Popen(args, stdout=PIPE)
207 (out,err) = p.communicate()
208 except OSError :
209 raise RuntimeError("Cannot communicate with ghostscript.")
210 return out.strip()
211
212 def convert(self, format, fin, fout, width, height, resolution=300) :
213 device_map = { 'png':'png16m', 'pdf':'pdfwrite', 'jpeg':'jpeg'}
214
215 try :
216 device = device_map[format]
217 except KeyError:
218 raise ValueError("Unsupported format.")
219
220 args = [self.command,
221 "-sDEVICE=%s" % device,
222 "-dPDFSETTINGS=/printer",
223 #"-q", # Quite: Do not dump messages to stdout.
224 "-sstdout=%stderr", # Redirect messages and errors to stderr
225 "-sOutputFile=-", # Stdout
226 "-dDEVICEWIDTHPOINTS=%s" % str(width),
227 "-dDEVICEHEIGHTPOINTS=%s" % str(height),
228 "-dSAFER", # For added security
229 "-dNOPAUSE",]
230
231 if device != 'pdf' :
232 args.append("-r%s" % str(resolution) )
233 if resolution < 300 : # Antialias if resolution is Less than 300 DPI
234 args.append("-dGraphicsAlphaBits=4")
235 args.append("-dTextAlphaBits=4")
236 args.append("-dAlignToPixels=0")
237
238 args.append("-") # Read from stdin. Must be last argument.
239
240 error_msg = "Unrecoverable error : Ghostscript conversion failed " \
241 "(Invalid postscript?). %s" % " ".join(args)
242
243 source = fin.read()
244
245 try :
246 p = Popen(args, stdin=PIPE, stdout = PIPE, stderr= PIPE)
247 (out,err) = p.communicate(source)
248 except OSError :
249 raise RuntimeError(error_msg)
250
251 if p.returncode != 0 :
252 error_msg += '\nReturn code: %i\n' % p.returncode
253 if err is not None : error_msg += err
254 raise RuntimeError(error_msg)
255
256 print >>fout, out
257 # end class Ghostscript
258
259
260 aa_composition = [ amino_acid_composition[_k] for _k in
261 unambiguous_protein_alphabet]
262
263
264
265 # ------ DATA ------
266
267 classic = ColorScheme([
268 ColorGroup("G", "orange" ),
269 ColorGroup("TU", "red"),
270 ColorGroup("C", "blue"),
271 ColorGroup("A", "green")
272 ] )
273
274 std_color_schemes = {"auto": None, # Depends on sequence type
275 "monochrome": monochrome,
276 "base pairing": base_pairing,
277 "classic": classic,
278 "hydrophobicity" : hydrophobicity,
279 "chemistry" : chemistry,
280 "charge" : charge,
281 }#
282
283 default_color_schemes = {
284 unambiguous_protein_alphabet: hydrophobicity,
285 unambiguous_rna_alphabet: base_pairing,
286 unambiguous_dna_alphabet: base_pairing
287 #codon_alphabet:codonsU
288 }
289
290
291 std_units = {
292 "bits" : 1./log(2),
293 "nats" : 1.,
294 "digits" : 1./log(10),
295 "kT" : 1.,
296 "kJ/mol" : 8.314472 *298.15 /1000.,
297 "kcal/mol": 1.987 *298.15 /1000.,
298 "probability" : None,
299 }
300
301 class LogoSize(object) :
302 def __init__(self, stack_width, stack_height) :
303 self.stack_width = stack_width
304 self.stack_height = stack_height
305
306 def __repr__(self):
307 return stdrepr(self)
308
309 # The base stack width is set equal to 9pt Courier.
310 # (Courier has a width equal to 3/5 of the point size.)
311 # Check that can get 80 characters in journal page @small
312 # 40 chacaters in a journal column
313 std_sizes = {
314 "small" : LogoSize( stack_width = 10, stack_height = 10*1*5),
315 "medium" : LogoSize( stack_width = 10*2, stack_height = 10*2*5),
316 "large" : LogoSize( stack_width = 10*3, stack_height = 10*3*5),
317 }
318
319
320 std_alphabets = {
321 'protein': unambiguous_protein_alphabet,
322 'rna': unambiguous_rna_alphabet,
323 'dna': unambiguous_dna_alphabet,
324 'codonsU':codon_alphabetU,
325 'codonsT':codon_alphabetT
326 }
327
328 std_percentCG = {
329 'H. sapiens' : 40.,
330 'E. coli' : 50.5,
331 'S. cerevisiae' : 38.,
332 'C. elegans' : 36.,
333 'D. melanogaster': 43.,
334 'M. musculus' : 42.,
335 'T. thermophilus' : 69.4,
336 }
337
338 # Thermus thermophilus: Henne A, Bruggemann H, Raasch C, Wiezer A, Hartsch T,
339 # Liesegang H, Johann A, Lienard T, Gohl O, Martinez-Arias R, Jacobi C,
340 # Starkuviene V, Schlenczeck S, Dencker S, Huber R, Klenk HP, Kramer W,
341 # Merkl R, Gottschalk G, Fritz HJ: The genome sequence of the extreme
342 # thermophile Thermus thermophilus.
343 # Nat Biotechnol 2004, 22:547-53
344
345 def stdrepr(obj) :
346 attr = vars(obj).items()
347
348
349 attr.sort()
350 args = []
351 for a in attr :
352 if a[0][0]=='_' : continue
353 args.append( '%s=%s' % ( a[0], repr(a[1])) )
354 args = ',\n'.join(args).replace('\n', '\n ')
355 return '%s(\n %s\n)' % (obj.__class__.__name__, args)
356
357
358 class LogoOptions(object) :
359 """ A container for all logo formating options. Not all of these
360 are directly accessible through the CLI or web interfaces.
361
362 To display LogoOption defaults:
363 >>> from weblogolib import *
364 >>> LogoOptions()
365
366
367 Attributes:
368 o alphabet
369 o creator_text -- Embedded as comment in figures.
370 o logo_title
371 o logo_label
372 o stacks_per_line
373 o unit_name
374 o show_yaxis
375 o yaxis_label -- Default depends on other settings.
376 o yaxis_tic_interval
377 o yaxis_minor_tic_ratio
378 o yaxis_scale
379 o show_xaxis
380 o xaxis_label
381 o xaxis_tic_interval
382 o rotate_numbers
383 o number_interval
384 o show_ends
385 o show_fineprint
386 o fineprint
387 o show_boxes
388 o shrink_fraction
389 o show_errorbars
390 o errorbar_fraction
391 o errorbar_width_fraction
392 o errorbar_gray
393 o resolution -- Dots per inch
394 o default_color
395 o color_scheme
396 o debug
397 o logo_margin
398 o stroke_width
399 o tic_length
400 o size
401 o stack_margin
402 o pad_right
403 o small_fontsize
404 o fontsize
405 o title_fontsize
406 o number_fontsize
407 o text_font
408 o logo_font
409 o title_font
410 o first_index
411 o logo_start
412 o logo_end
413 o scale_width
414 """
415
416 def __init__(self, **kwargs) :
417 """ Create a new LogoOptions instance.
418
419 >>> L = LogoOptions(logo_title = "Some Title String")
420 >>> L.show_yaxis = False
421 >>> repr(L)
422 """
423
424 self.creator_text = release_description,
425 self.alphabet = None
426
427 self.logo_title = ""
428 self.logo_label = ""
429 self.stacks_per_line = 20
430
431 self.unit_name = "bits"
432
433 self.show_yaxis = True
434 # yaxis_lable default depends on other settings. See LogoFormat
435 self.yaxis_label = None
436 self.yaxis_tic_interval = 1.
437 self.yaxis_minor_tic_ratio = 5
438 self.yaxis_scale = None
439
440 self.show_xaxis = True
441 self.xaxis_label = ""
442 self.xaxis_tic_interval =1
443 self.rotate_numbers = False
444 self.number_interval = 5
445 self.show_ends = False
446
447 self.show_fineprint = True
448 self.fineprint = "CodonLogo "+__version__
449
450 self.show_boxes = False
451 self.shrink_fraction = 0.5
452
453 self.show_errorbars = True
454 self.altype = True
455
456 self.errorbar_fraction = 0.90
457 self.errorbar_width_fraction = 0.25
458 self.errorbar_gray = 0.75
459
460 self.resolution = 96. # Dots per inch
461
462 self.default_color = Color.by_name("black")
463 self.color_scheme = None
464 #self.show_color_key = False # NOT yet implemented
465
466 self.debug = False
467
468 self.logo_margin = 2
469 self.stroke_width = 0.5
470 self.tic_length = 5
471
472 self.size = std_sizes["medium"]
473
474 self.stack_margin = 0.5
475 self.pad_right = False
476
477 self.small_fontsize = 6
478 self.fontsize = 10
479 self.title_fontsize = 12
480 self.number_fontsize = 8
481
482 self.text_font = "ArialMT"
483 self.logo_font = "Arial-BoldMT"
484 self.title_font = "ArialMT"
485
486 self.first_index = 1
487 self.logo_start = None
488 self.logo_end=None
489
490 # Scale width of characters proportional to gaps
491 self.scale_width = True
492
493 from corebio.utils import update
494 update(self, **kwargs)
495
496 def __repr__(self) :
497 attr = vars(self).items()
498 attr.sort()
499 args = []
500 for a in attr :
501 if a[0][0]=='_' : continue
502 args.append( '%s=%s' % ( a[0], repr(a[1])) )
503 args = ',\n'.join(args).replace('\n', '\n ')
504 return '%s(\n %s\n)' % (self.__class__.__name__, args)
505 # End class LogoOptions
506
507
508
509
510 class LogoFormat(LogoOptions) :
511 """ Specifies the format of the logo. Requires a LogoData and LogoOptions
512 objects.
513
514 >>> data = LogoData.from_seqs(seqs )
515 >>> options = LogoOptions()
516 >>> options.title = "A Logo Title"
517 >>> format = LogoFormat(data, options)
518 """
519 # TODO: Raise ArgumentErrors instead of ValueError and document
520 def __init__(self, data, options= None) :
521
522 LogoOptions.__init__(self)
523 #global offset
524 if options is not None :
525 self.__dict__.update(options.__dict__)
526
527 #offset=options.frame
528
529 self.alphabet = data.alphabet
530 self.seqlen = data.length
531 self.altype = True
532 self.show_title = False
533 self.show_xaxis_label = False
534 self.yaxis_minor_tic_interval = None
535 self.lines_per_logo = None
536 self.char_width = None
537 self.line_margin_left = None
538 self.line_margin_right = None
539 self.line_margin_bottom = None
540 self.line_margin_top = None
541 self.title_height = None
542 self.xaxis_label_height = None
543 self.line_height = None
544 self.line_width = None
545 self.logo_height = None
546 self.logo_width = None
547 self.creation_date = None
548 self.end_type = None
549
550 if self.stacks_per_line< 1 :
551 raise ArgumentError("Stacks per line should be greater than zero.",
552 "stacks_per_line" )
553
554 if self.size.stack_height<=0.0 :
555 raise ArgumentError(
556 "Stack height must be greater than zero.", "stack_height")
557 if (self.small_fontsize <= 0 or self.fontsize <=0 or
558 self.title_fontsize<=0 ):
559 raise ValueError("Font sizes must be positive.")
560
561 if self.errorbar_fraction<0.0 or self.errorbar_fraction>1.0 :
562 raise ValueError(
563 "The visible fraction of the error bar must be between zero and one.")
564
565 if self.yaxis_tic_interval<=0.0 :
566 raise ArgumentError( "The yaxis tic interval cannot be negative.",
567 'yaxis_tic_interval')
568
569 if self.size.stack_width <= 0.0 :
570 raise ValueError(
571 "The width of a stack should be a positive number.")
572
573 if self.yaxis_minor_tic_interval and \
574 self.yaxis_minor_tic_interval<=0.0 :
575 raise ValueError("Distances cannot be negative.")
576
577 if self.xaxis_tic_interval<=0 :
578 raise ValueError("Tic interval must be greater than zero.")
579
580 if self.number_interval<=0 :
581 raise ValueError("Invalid interval between numbers.")
582
583 if self.shrink_fraction<0.0 or self.shrink_fraction>1.0 :
584 raise ValueError("Invalid shrink fraction.")
585
586 if self.stack_margin<=0.0 :
587 raise ValueError("Invalid stack margin." )
588
589 if self.logo_margin<=0.0 :
590 raise ValueError("Invalid logo margin." )
591
592 if self.stroke_width<=0.0 :
593 raise ValueError("Invalid stroke width.")
594
595 if self.tic_length<=0.0 :
596 raise ValueError("Invalid tic length.")
597
598 # FIXME: More validation
599
600 # Inclusive upper and lower bounds
601 # FIXME: Validate here. Move from eps_formatter
602 if self.logo_start is None: self.logo_start = self.first_index
603
604 if self.logo_end is None :
605 self.logo_end = self.seqlen + self.first_index -1
606
607 self.total_stacks = self.logo_end - self.logo_start +1
608
609 if self.logo_start - self.first_index <0 :
610 raise ArgumentError(
611 "Logo range extends before start of available sequence.",
612 'logo_range')
613
614 if self.logo_end - self.first_index >= self.seqlen :
615 raise ArgumentError(
616 "Logo range extends beyond end of available sequence.",
617 'logo_range')
618
619 if self.logo_title : self.show_title = True
620 if not self.fineprint : self.show_fineprint = False
621 if self.xaxis_label : self.show_xaxis_label = True
622
623 if self.yaxis_label is None :
624 self.yaxis_label = self.unit_name
625
626 if self.yaxis_label :
627 self.show_yaxis_label = True
628 else :
629 self.show_yaxis_label = False
630 self.show_ends = False
631
632 if not self.yaxis_scale :
633 conversion_factor = std_units[self.unit_name]
634 if conversion_factor :
635 self.yaxis_scale=log(len(self.alphabet))*conversion_factor
636 #self.yaxis_scale=max(data.entropy)*conversion_factor
637 #marker# this is where I handle the max height. needs revision.
638 else :
639 self.yaxis_scale=1.0 # probability units
640
641 if self.yaxis_scale<=0.0 :
642 raise ValueError(('yaxis_scale', "Invalid yaxis scale"))
643 if self.yaxis_tic_interval >= self.yaxis_scale:
644 self.yaxis_tic_interval /= 2.
645
646 self.yaxis_minor_tic_interval \
647 = float(self.yaxis_tic_interval)/self.yaxis_minor_tic_ratio
648
649 if self.color_scheme is None :
650 #if self.alphabet in default_color_schemes :
651 #self.color_scheme = default_color_schemes[self.alphabet]
652 #else :
653 self.color_scheme = codonsT
654 #else:
655 #for color, symbols, desc in options.colors:
656 #try :
657 #self.color_scheme.append( ColorGroup(symbols, color, desc) )
658 #print >>sys.stderr, color_scheme.groups[2]
659 #except ValueError :
660 #raise ValueError(
661 #"error: option --color: invalid value: '%s'" % color )
662
663
664 self.lines_per_logo = 1+ ( (self.total_stacks-1) / self.stacks_per_line)
665
666 if self.lines_per_logo==1 and not self.pad_right:
667 self.stacks_per_line = min(self.stacks_per_line, self.total_stacks)
668
669 self.char_width = self.size.stack_width - 2* self.stack_margin
670
671
672 if self.show_yaxis :
673 self.line_margin_left = self.fontsize * 3.0
674 else :
675 self.line_margin_left = 0
676
677 if self.show_ends :
678 self.line_margin_right = self.fontsize *1.5
679 else :
680 self.line_margin_right = self.fontsize
681
682 if self.show_xaxis :
683 if self.rotate_numbers :
684 self.line_margin_bottom = self.number_fontsize *2.5
685 else:
686 self.line_margin_bottom = self.number_fontsize *1.5
687 else :
688 self.line_margin_bottom = 4
689
690 self.line_margin_top = 4
691
692 if self.show_title :
693 self.title_height = self.title_fontsize
694 else :
695 self.title_height = 0
696
697 self.xaxis_label_height =0.
698 if self.show_xaxis_label :
699 self.xaxis_label_height += self.fontsize
700 if self.show_fineprint :
701 self.xaxis_label_height += self.small_fontsize
702
703 self.line_height = (self.size.stack_height + self.line_margin_top +
704 self.line_margin_bottom )
705 self.line_width = (self.size.stack_width*self.stacks_per_line +
706 self.line_margin_left + self.line_margin_right )
707
708 self.logo_height = int(2*self.logo_margin + self.title_height \
709 + self.xaxis_label_height + self.line_height*self.lines_per_logo)
710 self.logo_width = int(2*self.logo_margin + self.line_width )
711
712
713 self.creation_date = datetime.now().isoformat(' ')
714
715 end_type = '-'
716 end_types = {
717 unambiguous_protein_alphabet: 'p',
718 unambiguous_rna_alphabet: '-',
719 unambiguous_dna_alphabet: 'd'
720 }
721 if self.show_ends and self.alphabet in end_types:
722 end_type = end_types[self.alphabet]
723 self.end_type = end_type
724 # End __init__
725 # End class LogoFormat
726
727
728
729 # ------ Logo Formaters ------
730 # Each formatter is a function f(LogoData, LogoFormat, output file).
731 # that draws a represntation of the logo into the given file.
732 # The main graphical formatter is eps_formatter. A mapping 'formatters'
733 # containing all available formatters is located after the formatter
734 # definitions.
735
736 def pdf_formatter(data, format, fout) :
737 """ Generate a logo in PDF format."""
738
739 feps = StringIO()
740 eps_formatter(data, format, feps)
741 feps.seek(0)
742
743 gs = GhostscriptAPI()
744 gs.convert('pdf', feps, fout, format.logo_width, format.logo_height)
745
746
747 def _bitmap_formatter(data, format, fout, device) :
748 feps = StringIO()
749 eps_formatter(data, format, feps)
750 feps.seek(0)
751
752 gs = GhostscriptAPI()
753 gs.convert(device, feps, fout,
754 format.logo_width, format.logo_height, format.resolution)
755
756
757 def jpeg_formatter(data, format, fout) :
758 """ Generate a logo in JPEG format."""
759 _bitmap_formatter(data, format, fout, device="jpeg")
760
761
762 def png_formatter(data, format, fout) :
763 """ Generate a logo in PNG format."""
764
765 _bitmap_formatter(data, format, fout, device="png")
766
767
768 def png_print_formatter(data, format, fout) :
769 """ Generate a logo in PNG format with print quality (600 DPI) resolution."""
770 format.resolution = 600
771 _bitmap_formatter(data, format, fout, device="png")
772
773
774 def txt_formatter( logodata, format, fout) :
775 """ Create a text representation of the logo data.
776 """
777 print >>fout, str(logodata)
778
779
780
781
782 def eps_formatter( logodata, format, fout) :
783 """ Generate a logo in Encapsulated Postscript (EPS)"""
784
785 subsitutions = {}
786 from_format =[
787 "creation_date", "logo_width", "logo_height",
788 "lines_per_logo", "line_width", "line_height",
789 "line_margin_right","line_margin_left", "line_margin_bottom",
790 "line_margin_top", "title_height", "xaxis_label_height",
791 "creator_text", "logo_title", "logo_margin",
792 "stroke_width", "tic_length",
793 "stacks_per_line", "stack_margin",
794 "yaxis_label", "yaxis_tic_interval", "yaxis_minor_tic_interval",
795 "xaxis_label", "xaxis_tic_interval", "number_interval",
796 "fineprint", "shrink_fraction", "errorbar_fraction",
797 "errorbar_width_fraction",
798 "errorbar_gray", "small_fontsize", "fontsize",
799 "title_fontsize", "number_fontsize", "text_font",
800 "logo_font", "title_font",
801 "logo_label", "yaxis_scale", "end_type",
802 "debug", "show_title", "show_xaxis",
803 "show_xaxis_label", "show_yaxis", "show_yaxis_label",
804 "show_boxes", "show_errorbars", "show_fineprint",
805 "rotate_numbers", "show_ends", "altype",
806
807 ]
808
809 for s in from_format :
810 subsitutions[s] = getattr(format,s)
811
812
813 from_format_size = ["stack_height", "stack_width"]
814 for s in from_format_size :
815 subsitutions[s] = getattr(format.size,s)
816
817 subsitutions["shrink"] = str(format.show_boxes).lower()
818
819
820 # --------- COLORS --------------
821 def format_color(color):
822 return " ".join( ("[",str(color.red) , str(color.green),
823 str(color.blue), "]"))
824
825 subsitutions["default_color"] = format_color(format.default_color)
826 global col
827 colors = []
828
829 if altype=="codonsT" or altype=="codonsU":
830 for group in col:
831 cf = format_color(group.color)
832 colors.append( " ("+group.symbols+") " + cf )
833 for group in format.color_scheme.groups :
834 cf = format_color(group.color)
835
836 colors.append( " ("+group.symbols+") " + cf )
837 #print >>sys.stderr,opts.colors
838 #print >>sys.stderr,logodata.options
839 #print >>sys.stderr, group.symbols
840 #print >>sys.stderr, cf
841
842
843
844 else:
845 for group in format.color_scheme.groups :
846 cf = format_color(group.color)
847 for s in group.symbols :
848 colors.append( " ("+s+") " + cf )
849
850 subsitutions["color_dict"] = "\n".join(colors)
851 data = []
852
853 # Unit conversion. 'None' for probability units
854 conv_factor = std_units[format.unit_name]
855
856 data.append("StartLine")
857
858
859 seq_from = format.logo_start- format.first_index
860 seq_to = format.logo_end - format.first_index +1
861
862 # seq_index : zero based index into sequence data
863 # logo_index : User visible coordinate, first_index based
864 # stack_index : zero based index of visible stacks
865 for seq_index in range(seq_from, seq_to) :
866 logo_index = seq_index + format.first_index
867 stack_index = seq_index - seq_from
868
869 if stack_index!=0 and (stack_index % format.stacks_per_line) ==0 :
870 data.append("")
871 data.append("EndLine")
872 data.append("StartLine")
873 data.append("")
874
875 if logo_index % format.number_interval == 0 :
876 data.append("(%d) StartStack" % logo_index)
877 else :
878 data.append("() StartStack" )
879
880 if conv_factor:
881 stack_height = logodata.entropy[seq_index] * std_units[format.unit_name]
882 else :
883 stack_height = 1.0 # Probability
884
885 # if logodata.entropy_interval is not None and conv_factor:
886 # Draw Error bars
887 # low, high = logodata.entropy_interval[seq_index]
888 # center = logodata.entropy[seq_index]
889
890
891 # down = (center - low) * conv_factor
892 # up = (high - center) * conv_factor
893 # data.append(" %f %f %f DrawErrorbarFirst" % (down, up, stack_height) )
894
895 s = zip(logodata.counts[seq_index], logodata.alphabet)
896 def mycmp( c1, c2 ) :
897 # Sort by frequency. If equal frequency then reverse alphabetic
898 if c1[0] == c2[0] : return cmp(c2[1], c1[1])
899 return cmp(c1[0], c2[0])
900
901 s.sort(mycmp)
902
903 C = float(sum(logodata.counts[seq_index]))
904 if C > 0.0 :
905 fraction_width = 1.0
906 if format.scale_width :
907 fraction_width = logodata.weight[seq_index]
908 for c in s:
909 data.append(" %f %f (%s) ShowSymbol" % (fraction_width, c[0]*stack_height/C, c[1]) )
910
911 # Draw error bar on top of logo. Replaced by DrawErrorbarFirst above.
912 if logodata.entropy_interval is not None and conv_factor:
913 low, high = logodata.entropy_interval[seq_index]
914 center = logodata.entropy[seq_index]
915
916 down = (center - low) * conv_factor
917 up = (high - center) * conv_factor
918 data.append(" %f %f DrawErrorbar" % (down, up) )
919
920 data.append("EndStack")
921 data.append("")
922
923 data.append("EndLine")
924 subsitutions["logo_data"] = "\n".join(data)
925
926
927 # Create and output logo
928 template = resource_string( __name__, 'template.eps', __file__)
929 logo = Template(template).substitute(subsitutions)
930 print >>fout, logo
931
932
933 # map between output format names and logo
934 formatters = {
935 'eps': eps_formatter,
936 'pdf': pdf_formatter,
937 'png': png_formatter,
938 'png_print' : png_print_formatter,
939 'jpeg' : jpeg_formatter,
940 'txt' : txt_formatter,
941 }
942
943 default_formatter = eps_formatter
944
945
946
947
948
949 def parse_prior(composition, alphabet, weight=None) :
950 """ Parse a description of the expected monomer distribution of a sequence.
951
952 Valid compositions:
953
954 - None or 'none' : No composition sepecified
955 - 'auto' or 'automatic': Use the typical average distribution
956 for proteins and an equiprobable distribution for
957 everything else.
958 - 'equiprobable' : All monomers have the same probability.
959 - a percentage, e.g. '45%' or a fraction '0.45':
960 The fraction of CG bases for nucleotide alphabets
961 - a species name, e.g. 'E. coli', 'H. sapiens' :
962 Use the average CG percentage for the specie's
963 genome.
964 - An explicit distribution, e.g. {'A':10, 'C':40, 'G':40, 'T':10}
965 """
966
967
968 if composition is None: return None
969 comp = composition.strip()
970
971 if comp.lower() == 'none': return None
972
973 if weight is None and alphabet is not None: weight = float(len(alphabet))
974
975 if comp.lower() == 'equiprobable' :
976 prior = weight * equiprobable_distribution(len(alphabet))
977 elif comp.lower() == 'auto' or comp.lower() == 'automatic':
978 if alphabet == unambiguous_protein_alphabet :
979 prior = weight * asarray(aa_composition, float64)
980 else :
981 prior = weight * equiprobable_distribution(len(alphabet))
982 elif comp in std_percentCG :
983 prior = weight * base_distribution(std_percentCG[comp])
984
985 elif comp[-1] == '%' :
986 prior = weight * base_distribution( float(comp[:-1]))
987
988 elif isfloat(comp) :
989 prior = weight * base_distribution( float(comp)*100. )
990
991 elif composition[0] == '{' and composition[-1] == '}' :
992 explicit = composition[1: -1]
993 explicit = explicit.replace(',',' ').replace("'", ' ').replace('"',' ').replace(':', ' ').split()
994
995 if len(explicit) != len(alphabet)*2 :
996 #print explicit
997 raise ValueError("Explicit prior does not match length of alphabet")
998 prior = - ones(len(alphabet), float64)
999 try :
1000 for r in range(len(explicit)/2) :
1001 letter = explicit[r*2]
1002 index = alphabet.index(letter)
1003 value = float(explicit[r*2 +1])
1004 prior[index] = value
1005 except ValueError :
1006 raise ValueError("Cannot parse explicit composition")
1007
1008 if any(prior==-1.) :
1009 raise ValueError("Explicit prior does not match alphabet")
1010 prior/= sum(prior)
1011 prior *= weight
1012
1013
1014 else :
1015 raise ValueError("Unknown or malformed composition: %s"%composition)
1016 if len(prior) != len(alphabet) :
1017 raise ValueError(
1018 "The sequence alphabet and composition are incompatible.")
1019
1020 return prior
1021
1022
1023 def base_distribution(percentCG) :
1024 A = (1. - (percentCG/100.))/2.
1025 C = (percentCG/100.)/2.
1026 G = (percentCG/100.)/2.
1027 T = (1. - (percentCG/100))/2.
1028 return asarray((A,C,G,T), float64)
1029
1030 def equiprobable_distribution( length) :
1031 return ones( (length), float64) /length
1032
1033
1034
1035
1036 def read_seq_data(fin, input_parser=seq_io.read,alphabet=None, ignore_lower_case=False, max_file_size=0):
1037 # TODO: Document this method and enviroment variable
1038 max_file_size =int(os.environ.get("WEBLOGO_MAX_FILE_SIZE", max_file_size))
1039
1040 # If max_file_size is set, or if fin==stdin (which is non-seekable), we
1041 # read the data and replace fin with a StringIO object.
1042
1043 if(max_file_size>0) :
1044 data = fin.read(max_file_size)
1045
1046 more_data = fin.read(2)
1047 if more_data != "" :
1048 raise IOError("File exceeds maximum allowed size: %d bytes" % max_file_size)
1049
1050 fin = StringIO(data)
1051 elif fin == sys.stdin:
1052 fin = StringIO(fin.read())
1053
1054 seqs = input_parser(fin)
1055
1056 if seqs is None or len(seqs) ==0 :
1057 raise ValueError("Please provide a multiple sequence alignment")
1058
1059 if ignore_lower_case :
1060 # Case is significant. Do not count lower case letters.
1061 for i,s in enumerate(seqs) :
1062 seqs[i] = s.mask()
1063
1064 global altype
1065 if(altype=="codonsT" or altype=="codonsU"):
1066 if 'T' in seqs[0] or 't' in seqs[0]:
1067 altype="codonsT"
1068 if 'U' in seqs[0] or 'u' in seqs[0]:
1069 altype="codonsU"
1070 global offset
1071 seq2=[""]*len(seqs)
1072 seq2 = []
1073 for i in xrange(len(seqs)):
1074 seq2.append([])
1075 if(offset%6>2):
1076 for x in range(0,len(seqs)):
1077 backs=seqs[x][::-1]
1078 for y in range(0,len(backs)):
1079 seq2[x].append(str(backs[y]))
1080
1081 if(altype=="codonsU"):
1082 for x in range(0,len(seq2)):
1083 for y in range(0,len(seq2[x])):
1084 if(cmp(seq2[x][y],'G')==0):
1085 seq2[x][y]="C"
1086 elif(cmp(seq2[x][y],'A')==0):
1087 seq2[x][y]='U'
1088 elif(cmp(seq2[x][y],'U')==0):
1089 seq2[x][y]='A'
1090 elif(cmp(seq2[x][y],'C')==0):
1091 seq2[x][y]='G'
1092 if(altype=="codonsT"):
1093 for x in range(0,len(seq2)):
1094 for y in range(0,len(seq2[x])):
1095 if(cmp(seq2[x][y],'G')==0):
1096 seq2[x][y]='C'
1097 elif(cmp(seq2[x][y],'A')==0):
1098 seq2[x][y]='T'
1099 elif(cmp(seq2[x][y],'T')==0):
1100 seq2[x][y]='A'
1101 elif(cmp(seq2[x][y],'C')==0):
1102 seq2[x][y]='G'
1103 offset=offset%3
1104 for x in range(0,len(seqs)):
1105 seqs[x]=Seq("".join(seq2[x]))
1106
1107
1108 # Add alphabet to seqs.
1109 if alphabet :
1110 seqs.alphabet = alphabet
1111 else :
1112 seqs.alphabet = which_alphabet(seqs)
1113
1114 return seqs
1115
1116
1117
1118 #TODO: Move to seq_io?
1119 # Would have to check that no symbol outside of full alphabet?
1120 def which_alphabet(seqs) :
1121 """ Returns the most appropriate unambiguous protien, rna or dna alphabet
1122 for a Seq or SeqList.
1123 """
1124 alphabets = (unambiguous_protein_alphabet,
1125 unambiguous_rna_alphabet,
1126 unambiguous_dna_alphabet
1127 )
1128 # Heuristic
1129 # Count occurances of each letter. Downweight longer alphabet.
1130 #for x in seqs:
1131
1132
1133 if( altype=="codonsU"):
1134 return codon_alphabetU
1135 if( altype=="codonsT"):
1136 return codon_alphabetT
1137 else:
1138 score = [1.0*asarray(seqs.tally(a)).sum()/sqrt(len(a)) for a in alphabets]
1139 #print score
1140 best = argmax(score) # Ties go to last in list.
1141 a = alphabets[best]
1142 return a
1143
1144
1145
1146 class LogoData(object) :
1147 """The data needed to generate a sequence logo.
1148
1149 - alphabet
1150 - length
1151 - counts -- An array of character counts
1152 - entropy -- The relative entropy of each column
1153 - entropy_interval -- entropy confidence interval
1154 """
1155
1156 def __init__(self, length=None, alphabet = None, counts =None,
1157 entropy =None, entropy_interval = None, weight=None) :
1158 """Creates a new LogoData object"""
1159 self.length = length
1160 self.alphabet = alphabet
1161 self.counts = counts
1162 self.entropy = entropy
1163 self.entropy_interval = entropy_interval
1164 self.weight = weight
1165
1166
1167
1168 #@classmethod
1169 def from_counts(cls, alphabet, counts, prior= None):
1170
1171 """Build a logodata object from counts."""
1172 seq_length, A = counts.shape
1173
1174 if prior is not None: prior = array(prior, float64)
1175 if prior is None :
1176 R = log(A)
1177 ent = zeros( seq_length, float64)
1178 entropy_interval = None
1179 for i in range (0, seq_length) :
1180 C = sum(counts[i])
1181 #FIXME: fixup corebio.moremath.entropy()?
1182 if C == 0 :
1183 ent[i] = 0.0
1184 else :
1185 ent[i] = R - entropy(counts[i])
1186 else :
1187 ent = zeros( seq_length, float64)
1188 entropy_interval = zeros( (seq_length,2) , float64)
1189
1190 R = log(A)
1191
1192 for i in range (0, seq_length) :
1193 alpha = array(counts[i] , float64)
1194 alpha += prior
1195
1196 posterior = Dirichlet(alpha)
1197 ent[i] = posterior.mean_relative_entropy(prior/sum(prior))
1198 entropy_interval[i][0], entropy_interval[i][1] = \
1199 posterior.interval_relative_entropy(prior/sum(prior), 0.95)
1200 weight = array( na.sum(counts,axis=1) , float)
1201
1202 weight /= max(weight)
1203
1204 return cls(seq_length, alphabet, counts, ent, entropy_interval, weight)
1205 from_counts = classmethod(from_counts)
1206
1207
1208 #@classmethod
1209 def from_seqs(cls, seqs, prior= None):
1210
1211
1212 alphabet=seqs.alphabet
1213
1214 #get the offset and if it's greater than 2 flip the sequences to the negative strand and reverse.
1215 for x in range(0,len(seqs)):
1216 seqs[x]=seqs[x].upper()
1217 counter=0
1218
1219
1220 """Build a 2D array from a SeqList, a list of sequences."""
1221 # --- VALIDATE DATA ---
1222 # check that there is at least one sequence of length 1
1223 if len(seqs)==0 or len(seqs[0]) ==0:
1224 raise ValueError("No sequence data found.")
1225 sys.exit(0)
1226 # Check sequence lengths
1227 seq_length = len(seqs[0])
1228 for i,s in enumerate(seqs) :
1229 #print i,s, len(s)
1230 if seq_length != len(s) :
1231 raise ArgumentError(
1232 "Sequence number %d differs in length from the previous sequences" % (i+1) ,'sequences')
1233 sys.exit(0)
1234
1235 if(altype=="codonsT" or altype=="codonsU"):
1236 x = [[0]*64 for x in xrange(seq_length/3)]
1237 counter=offset
1238
1239 while counter+offset<seq_length:
1240 for i in range(0,len(seqs)):
1241 if len(str(seqs[i][(counter):(counter+3)]))==3 and len(seqs[i][(counter):(counter+3)].strip("GATUC"))==0 :
1242 if(str(seqs[i][(counter):(counter+3)]) in alphabet):
1243 x[counter/3][ (alphabet.index(str(seqs[i][(counter):(counter+3)]))) ]+=1
1244 counter=counter+3
1245 counts=asarray(x)
1246 else:
1247 counts = asarray(seqs.tally())
1248
1249 return cls.from_counts(alphabet, counts, prior)
1250 from_seqs = classmethod(from_seqs)
1251
1252
1253
1254 def __str__(self) :
1255 out = StringIO()
1256 print >>out, '## LogoData'
1257 print >>out, '# First column is position number, couting from zero'
1258 print >>out, '# Subsequent columns are raw symbol counts'
1259 print >>out, '# Entropy is mean entropy measured in nats.'
1260 print >>out, '# Low and High are the 95% confidence limits.'
1261 print >>out, '# Weight is the fraction of non-gap symbols in the column.'
1262 print >>out, '#\t'
1263 print >>out, '#\t',
1264 for a in self.alphabet :
1265 print >>out, a, '\t',
1266 print >>out, 'Entropy\tLow\tHigh\tWeight'
1267
1268 for i in range(self.length) :
1269 print >>out, i, '\t',
1270 for c in self.counts[i] : print >>out, c, '\t',
1271 print >>out, self.entropy[i], '\t',
1272 if self.entropy_interval is not None:
1273 print >>out, self.entropy_interval[i][0], '\t',
1274 print >>out, self.entropy_interval[i][1], '\t',
1275 else :
1276 print >>out, '\t','\t',
1277 if self.weight is not None :
1278 print >>out, self.weight[i],
1279 print >>out, ''
1280 print >>out, '# End LogoData'
1281
1282 return out.getvalue()
1283
1284 # ====================== Main: Parse Command line =============================
1285 def main():
1286 """CodonLogo command line interface """
1287
1288 # ------ Parse Command line ------
1289 parser = _build_option_parser()
1290 (opts, args) = parser.parse_args(sys.argv[1:])
1291 if args : parser.error("Unparsable arguments: %s " % args)
1292
1293 if opts.serve:
1294 httpd_serve_forever(opts.port) # Never returns?
1295 sys.exit(0)
1296
1297
1298 # ------ Create Logo ------
1299 try:
1300 data = _build_logodata(opts)
1301
1302
1303 format = _build_logoformat(data, opts)
1304
1305
1306 formatter = opts.formatter
1307 formatter(data, format, opts.fout)
1308
1309 except ValueError, err :
1310 print >>sys.stderr, 'Error:', err
1311 sys.exit(2)
1312 except KeyboardInterrupt, err:
1313 sys.exit(0)
1314 # End main()
1315
1316
1317 def httpd_serve_forever(port=8080) :
1318 """ Start a webserver on a local port."""
1319 import BaseHTTPServer
1320 import CGIHTTPServer
1321
1322 class __HTTPRequestHandler(CGIHTTPServer.CGIHTTPRequestHandler):
1323 def is_cgi(self) :
1324 if self.path == "/create.cgi":
1325 self.cgi_info = '', 'create.cgi'
1326 return True
1327 return False
1328
1329 # Add current directory to PYTHONPATH. This is
1330 # so that we can run the standalone server
1331 # without having to run the install script.
1332 pythonpath = os.getenv("PYTHONPATH", '')
1333 pythonpath += ":" + os.path.abspath(sys.path[0]).split()[0]
1334 os.environ["PYTHONPATH"] = pythonpath
1335
1336 htdocs = resource_filename(__name__, 'htdocs', __file__)
1337 os.chdir(htdocs)
1338
1339 HandlerClass = __HTTPRequestHandler
1340 ServerClass = BaseHTTPServer.HTTPServer
1341 httpd = ServerClass(('', port), HandlerClass)
1342 print "Serving HTTP on localhost:%d ..." % port
1343
1344 try :
1345 httpd.serve_forever()
1346 except KeyboardInterrupt:
1347 sys.exit(0)
1348 # end httpd_serve_forever()
1349
1350 def read_priors(finp, alphabet ,max_file_size=0):
1351
1352 max_file_size =int(os.environ.get("WEBLOGO_MAX_FILE_SIZE", max_file_size))
1353 if(max_file_size>0) :
1354 data = finp.read(max_file_size)
1355 more_data = finp.read(2)
1356 if more_data != "" :
1357 raise IOError("File exceeds maximum allowed size: %d bytes" % max_file_size)
1358 finp = StringIO(data)
1359 priordict={}
1360 while 1:
1361 line = finp.readline()
1362 if not line:
1363 break
1364 line = line.split()
1365 priordict[line[0]]=line[1]
1366 return priordict
1367
1368 def _build_logodata(options) :
1369 global offset
1370 offset=options.frame
1371 options.alphabet = None
1372 options.ignore_lower_case = False
1373 #options.default_color = Color.by_name("black")
1374 options.color_scheme=None
1375 #options.colors=[]
1376 options.show_ends=False
1377 seqs = read_seq_data(options.fin,
1378 options.input_parser.read,
1379 alphabet=options.alphabet,
1380 ignore_lower_case = options.ignore_lower_case)
1381 if(options.priorfile!=None):
1382 if(altype=="CodonsT"):
1383 options.composition= str(read_priors(options.priorfile,codon_alphabetT))
1384 options.alphabet = codon_alphabetT
1385 else:
1386 options.composition= str(read_priors(options.priorfile,codon_alphabetU))
1387 options.alphabet = codon_alphabetU
1388
1389 prior = parse_prior( options.composition,seqs.alphabet, options.weight)
1390 data = LogoData.from_seqs(seqs, prior)
1391 return data
1392
1393
1394 def _build_logoformat( logodata, opts) :
1395 """ Extract and process relevant option values and return a
1396 LogoFormat object."""
1397
1398 args = {}
1399 direct_from_opts = [
1400 "stacks_per_line",
1401 "logo_title",
1402 "yaxis_label",
1403 "show_xaxis",
1404 "show_yaxis",
1405 "xaxis_label",
1406 "show_ends",
1407 "fineprint",
1408 "show_errorbars",
1409 "altype",
1410 "show_boxes",
1411 "yaxis_tic_interval",
1412 "resolution",
1413 "alphabet",
1414 "debug",
1415 "show_ends",
1416 "default_color",
1417 #"show_color_key",
1418 "color_scheme",
1419 "unit_name",
1420 "logo_label",
1421 "yaxis_scale",
1422 "first_index",
1423 "logo_start",
1424 "logo_end",
1425 "scale_width",
1426 "frame",
1427 ]
1428
1429 for k in direct_from_opts:
1430 args[k] = opts.__dict__[k]
1431 logo_size = copy.copy(opts.__dict__['logo_size'])
1432 size_from_opts = ["stack_width", "stack_height"]
1433 for k in size_from_opts :
1434 length = getattr(opts, k)
1435 if length : setattr( logo_size, k, length )
1436 args["size"] = logo_size
1437
1438 global col
1439
1440 if opts.colors:
1441 color_scheme = ColorScheme()
1442 for color, symbols, desc in opts.colors:
1443 try :
1444 #c = Color.from_string(color)
1445 color_scheme.groups.append( ColorGroup(symbols, color, desc) )
1446 #print >> sys.stderr, color
1447
1448 col.append( ColorGroup(symbols, color, desc) )
1449
1450 except ValueError :
1451 raise ValueError(
1452 "error: option --color: invalid value: '%s'" % color )
1453 if(altype!="codonsU" and altype!="codonsT") :
1454 args["color_scheme"] = color_scheme
1455
1456 #cf = colorscheme.format_color(col[0])
1457 #col.append( " ("+group.symbols+") " + cf )
1458
1459 logooptions = LogoOptions()
1460 for a, v in args.iteritems() :
1461 setattr(logooptions,a,v)
1462
1463
1464
1465 theformat = LogoFormat(logodata, logooptions )
1466
1467 return theformat
1468
1469
1470
1471
1472
1473
1474 # ========================== OPTIONS ==========================
1475 def _build_option_parser() :
1476 defaults = LogoOptions()
1477 parser = DeOptionParser(usage="%prog [options] < sequence_data.fa > sequence_logo.eps",
1478 description = description,
1479 version = __version__ ,
1480 add_verbose_options = False
1481 )
1482
1483 io_grp = OptionGroup(parser, "Input/Output Options",)
1484 data_grp = OptionGroup(parser, "Logo Data Options",)
1485 format_grp = OptionGroup(parser, "Logo Format Options",
1486 "These options control the format and display of the logo.")
1487 color_grp = OptionGroup(parser, "Color Options",
1488 "Colors can be specified using CSS2 syntax. e.g. 'red', '#FF0000', etc.")
1489 advanced_grp = OptionGroup(parser, "Advanced Format Options",
1490 "These options provide fine control over the display of the logo. ")
1491 server_grp = OptionGroup(parser, "CodonLogo Server",
1492 "Run a standalone webserver on a local port.")
1493
1494
1495 parser.add_option_group(io_grp)
1496 parser.add_option_group(data_grp)
1497 parser.add_option_group(format_grp)
1498 parser.add_option_group(color_grp)
1499 parser.add_option_group(advanced_grp)
1500 parser.add_option_group(server_grp)
1501
1502 # ========================== IO OPTIONS ==========================
1503
1504
1505
1506 io_grp.add_option( "-f", "--fin",
1507 dest="fin",
1508 action="store",
1509 type="file_in",
1510 default=sys.stdin,
1511 help="Sequence input file (default: stdin)",
1512 metavar="FILENAME")
1513
1514 io_grp.add_option( "-R", "--prior",
1515 dest="priorfile",
1516 action="store",
1517 type="file_in",
1518 help="A file with 64 codons and their prior probabilities, one per line, each codon followed by a space and it's probability.",
1519 metavar="FILENAME")
1520
1521 io_grp.add_option("", "--fin-format",
1522 dest="input_parser",
1523 action="store", type ="dict",
1524 default = seq_io,
1525 choices = seq_io.format_names(),
1526 help="Multiple sequence alignment format: (%s)" %
1527 ', '.join([ f.names[0] for f in seq_io.formats]),
1528 metavar="FORMAT")
1529
1530 io_grp.add_option("-o", "--fout", dest="fout",
1531 type="file_out",
1532 default=sys.stdout,
1533 help="Output file (default: stdout)",
1534 metavar="FILENAME")
1535
1536
1537
1538 io_grp.add_option( "-F", "--format",
1539 dest="formatter",
1540 action="store",
1541 type="dict",
1542 choices = formatters,
1543 metavar= "FORMAT",
1544 help="Format of output: eps (default), png, png_print, pdf, jpeg, txt",
1545 default = default_formatter)
1546
1547
1548 # ========================== Data OPTIONS ==========================
1549
1550 data_grp.add_option("-m", "--frame",
1551 dest="frame",
1552 action="store",
1553 type="int",
1554 default=0,
1555 help="Offset of the reading frame you wish to look in (default: 0)",
1556 metavar="COUNT")
1557
1558 data_grp.add_option("-T", "--type",
1559 dest="altype",
1560 action="store",
1561 type="boolean",
1562 default=True,
1563 help="Generate a codon logo rather than a sequence logo (default: True)",
1564 metavar="YES/NO")
1565
1566
1567
1568 #data_grp.add_option( "-A", "--sequence-type",
1569 #dest="alphabet",
1570 #action="store",
1571 #type="dict",
1572 #choices = std_alphabets,
1573 #help="The type of sequence data: 'protein', 'rna' or 'dna'.",
1574 #metavar="TYPE")
1575
1576 #data_grp.add_option( "-a", "--alphabet",
1577 #dest="alphabet",
1578 #action="store",
1579 #help="The set of symbols to count, e.g. 'AGTC'. "
1580 #"All characters not in the alphabet are ignored. "
1581 #"If neither the alphabet nor sequence-type are specified then codonlogo will examine the input data and make an educated guess. "
1582 #"See also --sequence-type, --ignore-lower-case" )
1583
1584 # FIXME Add test?
1585 #data_grp.add_option( "", "--ignore-lower-case",
1586 #dest="ignore_lower_case",
1587 #action="store",
1588 #type = "boolean",
1589 #default=False,
1590 #metavar = "YES/NO",
1591 #help="Disregard lower case letters and only count upper case letters in sequences? (Default: No)"
1592 #)
1593
1594 data_grp.add_option( "-U", "--units",
1595 dest="unit_name",
1596 action="store",
1597 choices = std_units.keys(),
1598 type="choice",
1599 default = defaults.unit_name,
1600 help="A unit of entropy ('bits' (default), 'nats', 'digits'), or a unit of free energy ('kT', 'kJ/mol', 'kcal/mol'), or 'probability' for probabilities",
1601 metavar = "NUMBER")
1602
1603
1604 data_grp.add_option( "", "--composition",
1605 dest="composition",
1606 action="store",
1607 type="string",
1608 default = "auto",
1609 help="The expected composition of the sequences: 'auto' (default), 'equiprobable', 'none' (Do not perform any compositional adjustment), a CG percentage, a species name (e.g. 'E. coli', 'H. sapiens'), or an explicit distribution (e.g. {'A':10, 'C':40, 'G':40, 'T':10}). The automatic option uses a typical distribution for proteins and equiprobable distribution for everything else. ",
1610 metavar="COMP.")
1611
1612 data_grp.add_option( "", "--weight",
1613 dest="weight",
1614 action="store",
1615 type="float",
1616 default = None,
1617 help="The weight of prior data. Default: total pseudocounts equal to the number of monomer types.",
1618 metavar="NUMBER")
1619
1620 data_grp.add_option( "-i", "--first-index",
1621 dest="first_index",
1622 action="store",
1623 type="int",
1624 default = 1,
1625 help="Index of first position in sequence data (default: 1)",
1626 metavar="INDEX")
1627
1628 data_grp.add_option( "-l", "--lower",
1629 dest="logo_start",
1630 action="store",
1631 type="int",
1632 help="Lower bound of sequence to display",
1633 metavar="INDEX")
1634
1635 data_grp.add_option( "-u", "--upper",
1636 dest="logo_end",
1637 action="store",
1638 type="int",
1639 help="Upper bound of sequence to display",
1640 metavar="INDEX")
1641
1642 # ========================== FORMAT OPTIONS ==========================
1643
1644 format_grp.add_option( "-s", "--size",
1645 dest="logo_size",
1646 action="store",
1647 type ="dict",
1648 choices = std_sizes,
1649 metavar = "LOGOSIZE",
1650 default = defaults.size,
1651 help="Specify a standard logo size (small, medium (default), large)" )
1652
1653
1654
1655 format_grp.add_option( "-n", "--stacks-per-line",
1656 dest="stacks_per_line",
1657 action="store",
1658 type="int",
1659 help="Maximum number of logo stacks per logo line. (default: %default)",
1660 default = defaults.stacks_per_line,
1661 metavar="COUNT")
1662
1663 format_grp.add_option( "-t", "--title",
1664 dest="logo_title",
1665 action="store",
1666 type="string",
1667 help="Logo title text.",
1668 default = defaults.logo_title,
1669 metavar="TEXT")
1670
1671 format_grp.add_option( "", "--label",
1672 dest="logo_label",
1673 action="store",
1674 type="string",
1675 help="A figure label, e.g. '2a'",
1676 default = defaults.logo_label,
1677 metavar="TEXT")
1678
1679 format_grp.add_option( "-X", "--show-xaxis",
1680 action="store",
1681 type = "boolean",
1682 default= defaults.show_xaxis,
1683 metavar = "YES/NO",
1684 help="Display sequence numbers along x-axis? (default: %default)")
1685
1686 format_grp.add_option( "-x", "--xlabel",
1687 dest="xaxis_label",
1688 action="store",
1689 type="string",
1690 default = defaults.xaxis_label,
1691 help="X-axis label",
1692 metavar="TEXT")
1693
1694 format_grp.add_option( "-S", "--yaxis",
1695 dest="yaxis_scale",
1696 action="store",
1697 type="float",
1698 help="Height of yaxis in units. (Default: Maximum value with uninformative prior.)",
1699 metavar = "UNIT")
1700
1701 format_grp.add_option( "-Y", "--show-yaxis",
1702 action="store",
1703 type = "boolean",
1704 dest = "show_yaxis",
1705 default= defaults.show_yaxis,
1706 metavar = "YES/NO",
1707 help="Display entropy scale along y-axis? (default: %default)")
1708
1709 format_grp.add_option( "-y", "--ylabel",
1710 dest="yaxis_label",
1711 action="store",
1712 type="string",
1713 help="Y-axis label (default depends on plot type and units)",
1714 metavar="TEXT")
1715
1716 #format_grp.add_option( "-E", "--show-ends",
1717 #action="store",
1718 #type = "boolean",
1719 #default= defaults.show_ends,
1720 #metavar = "YES/NO",
1721 #help="Label the ends of the sequence? (default: %default)")
1722
1723 format_grp.add_option( "-P", "--fineprint",
1724 dest="fineprint",
1725 action="store",
1726 type="string",
1727 default= defaults.fineprint,
1728 help="The fine print (default: Codonlogo version)",
1729 metavar="TEXT")
1730
1731 format_grp.add_option( "", "--ticmarks",
1732 dest="yaxis_tic_interval",
1733 action="store",
1734 type="float",
1735 default= defaults.yaxis_tic_interval,
1736 help="Distance between ticmarks (default: %default)",
1737 metavar = "NUMBER")
1738
1739
1740 format_grp.add_option( "", "--errorbars",
1741 dest = "show_errorbars",
1742 action="store",
1743 type = "boolean",
1744 default= defaults.show_errorbars,
1745 metavar = "YES/NO",
1746 help="Display error bars? (default: %default)")
1747
1748
1749
1750 # ========================== Color OPTIONS ==========================
1751 # TODO: Future Feature
1752 # color_grp.add_option( "-K", "--color-key",
1753 # dest= "show_color_key",
1754 # action="store",
1755 # type = "boolean",
1756 # default= defaults.show_color_key,
1757 # metavar = "YES/NO",
1758 # help="Display a color key (default: %default)")
1759
1760
1761 #color_scheme_choices = std_color_schemes.keys()
1762 #color_scheme_choices.sort()
1763 #color_grp.add_option( "-c", "--color-scheme",
1764 #dest="color_scheme",
1765 #action="store",
1766 #type ="dict",
1767 #choices = std_color_schemes,
1768 #metavar = "SCHEME",
1769 #default = None, # Auto
1770 #help="Specify a standard color scheme (%s)" % \
1771 #", ".join(color_scheme_choices) )
1772
1773 color_grp.add_option( "-C", "--color",
1774 dest="colors",
1775 action="append",
1776 metavar="COLOR SYMBOLS DESCRIPTION ",
1777 nargs = 3,
1778 default=[],
1779 help="Specify symbol colors, e.g. --color black AG 'Purine' --color red TC 'Pyrimidine' ")
1780
1781 color_grp.add_option( "", "--default-color",
1782 dest="default_color",
1783 action="store",
1784 metavar="COLOR",
1785 default= defaults.default_color,
1786 help="Symbol color if not otherwise specified.")
1787
1788 # ========================== Advanced options =========================
1789
1790 advanced_grp.add_option( "-W", "--stack-width",
1791 dest="stack_width",
1792 action="store",
1793 type="float",
1794 default= None,
1795 help="Width of a logo stack (default: %s)"% defaults.size.stack_width,
1796 metavar="POINTS" )
1797
1798 advanced_grp.add_option( "-H", "--stack-height",
1799 dest="stack_height",
1800 action="store",
1801 type="float",
1802 default= None,
1803 help="Height of a logo stack (default: %s)"%defaults.size.stack_height,
1804 metavar="POINTS" )
1805
1806 advanced_grp.add_option( "", "--box",
1807 dest="show_boxes",
1808 action="store",
1809 type = "boolean",
1810 default=False,
1811 metavar = "YES/NO",
1812 help="Draw boxes around symbols? (default: no)")
1813
1814 advanced_grp.add_option( "", "--resolution",
1815 dest="resolution",
1816 action="store",
1817 type="float",
1818 default=96,
1819 help="Bitmap resolution in dots per inch (DPI). (default: 96 DPI, except png_print, 600 DPI) Low resolution bitmaps (DPI<300) are antialiased.",
1820 metavar="DPI")
1821
1822 advanced_grp.add_option( "", "--scale-width",
1823 dest="scale_width",
1824 action="store",
1825 type = "boolean",
1826 default= True,
1827 metavar = "YES/NO",
1828 help="Scale the visible stack width by the fraction of symbols in the column? (i.e. columns with many gaps of unknowns are narrow.) (default: yes)")
1829
1830 advanced_grp.add_option( "", "--debug",
1831 action="store",
1832 type = "boolean",
1833 default= defaults.debug,
1834 metavar = "YES/NO",
1835 help="Output additional diagnostic information. (default: %default)")
1836
1837
1838 # ========================== Server options =========================
1839 server_grp.add_option( "", "--serve",
1840 dest="serve",
1841 action="store_true",
1842 default= False,
1843 help="Start a standalone CodonLogo server for creating sequence logos.")
1844
1845 server_grp.add_option( "", "--port",
1846 dest="port",
1847 action="store",
1848 type="int",
1849 default= 8080,
1850 help="Listen to this local port. (Default: %default)",
1851 metavar="PORT")
1852
1853 return parser
1854
1855 # END _build_option_parser
1856
1857
1858 ##############################################################
1859
1860 class Dirichlet(object) :
1861 """The Dirichlet probability distribution. The Dirichlet is a continuous
1862 multivariate probability distribution across non-negative unit length
1863 vectors. In other words, the Dirichlet is a probability distribution of
1864 probability distributions. It is conjugate to the multinomial
1865 distribution and is widely used in Bayesian statistics.
1866
1867 The Dirichlet probability distribution of order K-1 is
1868
1869 p(theta_1,...,theta_K) d theta_1 ... d theta_K =
1870 (1/Z) prod_i=1,K theta_i^{alpha_i - 1} delta(1 -sum_i=1,K theta_i)
1871
1872 The normalization factor Z can be expressed in terms of gamma functions:
1873
1874 Z = {prod_i=1,K Gamma(alpha_i)} / {Gamma( sum_i=1,K alpha_i)}
1875
1876 The K constants, alpha_1,...,alpha_K, must be positive. The K parameters,
1877 theta_1,...,theta_K are nonnegative and sum to 1.
1878
1879 Status:
1880 Alpha
1881 """
1882 __slots__ = 'alpha', '_total', '_mean',
1883
1884
1885
1886
1887 def __init__(self, alpha) :
1888 """
1889 Args:
1890 - alpha -- The parameters of the Dirichlet prior distribution.
1891 A vector of non-negative real numbers.
1892 """
1893 # TODO: Check that alphas are positive
1894 #TODO : what if alpha's not one dimensional?
1895 self.alpha = asarray(alpha, float64)
1896
1897 self._total = sum(alpha)
1898 self._mean = None
1899
1900
1901 def sample(self) :
1902 """Return a randomly generated probability vector.
1903
1904 Random samples are generated by sampling K values from gamma
1905 distributions with parameters a=\alpha_i, b=1, and renormalizing.
1906
1907 Ref:
1908 A.M. Law, W.D. Kelton, Simulation Modeling and Analysis (1991).
1909 Authors:
1910 Gavin E. Crooks <gec@compbio.berkeley.edu> (2002)
1911 """
1912 alpha = self.alpha
1913 K = len(alpha)
1914 theta = zeros( (K,), float64)
1915
1916 for k in range(K):
1917 theta[k] = random.gammavariate(alpha[k], 1.0)
1918 theta /= sum(theta)
1919
1920 return theta
1921
1922 def mean(self) :
1923 if self._mean ==None:
1924 self._mean = self.alpha / self._total
1925 return self._mean
1926
1927 def covariance(self) :
1928 alpha = self.alpha
1929 A = sum(alpha)
1930 #A2 = A * A
1931 K = len(alpha)
1932 cv = zeros( (K,K), float64)
1933
1934 for i in range(K) :
1935 cv[i,i] = alpha[i] * (1. - alpha[i]/A) / (A * (A+1.) )
1936
1937 for i in range(K) :
1938 for j in range(i+1,K) :
1939 v = - alpha[i] * alpha[j] / (A * A * (A+1.) )
1940 cv[i,j] = v
1941 cv[j,i] = v
1942 return cv
1943
1944 def mean_x(self, x) :
1945 x = asarray(x, float64)
1946 if shape(x) != shape(self.alpha) :
1947 raise ValueError("Argument must be same dimension as Dirichlet")
1948 return sum( x * self.mean())
1949
1950 def variance_x(self, x) :
1951 x = asarray(x, float64)
1952 if shape(x) != shape(self.alpha) :
1953 raise ValueError("Argument must be same dimension as Dirichlet")
1954
1955 cv = self.covariance()
1956 var = na.dot(na.dot(na.transpose( x), cv), x)
1957 return var
1958
1959
1960 def mean_entropy(self) :
1961 """Calculate the average entropy of probabilities sampled
1962 from this Dirichlet distribution.
1963
1964 Returns:
1965 The average entropy.
1966
1967 Ref:
1968 Wolpert & Wolf, PRE 53:6841-6854 (1996) Theorem 7
1969 (Warning: this paper contains typos.)
1970 Status:
1971 Alpha
1972 Authors:
1973 GEC 2005
1974
1975 """
1976 # TODO: Optimize
1977 alpha = self.alpha
1978 A = float(sum(alpha))
1979 ent = 0.0
1980 for a in alpha:
1981 if a>0 : ent += - 1.0 * a * digamma( 1.0+a) # FIXME: Check
1982 ent /= A
1983 ent += digamma(A+1.0)
1984 return ent
1985
1986
1987
1988 def variance_entropy(self):
1989 """Calculate the variance of the Dirichlet entropy.
1990
1991 Ref:
1992 Wolpert & Wolf, PRE 53:6841-6854 (1996) Theorem 8
1993 (Warning: this paper contains typos.)
1994 """
1995 alpha = self.alpha
1996 A = float(sum(alpha))
1997 A2 = A * (A+1)
1998 L = len(alpha)
1999
2000 dg1 = zeros( (L) , float64)
2001 dg2 = zeros( (L) , float64)
2002 tg2 = zeros( (L) , float64)
2003
2004 for i in range(L) :
2005 dg1[i] = digamma(alpha[i] + 1.0)
2006 dg2[i] = digamma(alpha[i] + 2.0)
2007 tg2[i] = trigamma(alpha[i] + 2.0)
2008
2009 dg_Ap2 = digamma( A+2. )
2010 tg_Ap2 = trigamma( A+2. )
2011
2012 mean = self.mean_entropy()
2013 var = 0.0
2014
2015 for i in range(L) :
2016 for j in range(L) :
2017 if i != j :
2018 var += (
2019 ( dg1[i] - dg_Ap2 ) * (dg1[j] - dg_Ap2 ) - tg_Ap2
2020 ) * (alpha[i] * alpha[j] ) / A2
2021 else :
2022 var += (
2023 ( dg2[i] - dg_Ap2 ) **2 + ( tg2[i] - tg_Ap2 )
2024 ) * ( alpha[i] * (alpha[i]+1.) ) / A2
2025
2026 var -= mean**2
2027 return var
2028
2029
2030
2031 def mean_relative_entropy(self, pvec) :
2032 ln_p = na.log(pvec)
2033 return - self.mean_x(ln_p) - self.mean_entropy()
2034
2035
2036 def variance_relative_entropy(self, pvec) :
2037 ln_p = na.log(pvec)
2038 return self.variance_x(ln_p) + self.variance_entropy()
2039
2040
2041 def interval_relative_entropy(self, pvec, frac) :
2042 mean = self.mean_relative_entropy(pvec)
2043 variance = self.variance_relative_entropy(pvec)
2044 # If the variance is small, use the standard 95%
2045 # confidence interval: mean +/- 1.96 * sd
2046 if variance< 0.1 :
2047 sd = sqrt(variance)
2048 return max(0.0, mean - sd*1.96), mean + sd*1.96
2049 sd = sqrt(variance)
2050 return max(0.0, mean - sd*1.96), mean + sd*1.96
2051
2052 g = gamma.from_mean_variance(mean, variance)
2053 low_limit = g.inverse_cdf( (1.-frac)/2.)
2054 high_limit = g.inverse_cdf( 1. - (1.-frac)/2. )
2055
2056 return low_limit, high_limit
2057
2058
2059 # Standard python voodoo for CLI
2060 if __name__ == "__main__":
2061 ## Code Profiling. Uncomment these lines
2062 #import hotshot, hotshot.stats
2063 #prof = hotshot.Profile("stones.prof")
2064 #prof.runcall(main)
2065 #prof.close()
2066 #stats = hotshot.stats.load("stones.prof")
2067 #stats.strip_dirs()
2068 #stats.sort_stats('cumulative', 'calls')
2069 #stats.print_stats(40)
2070 #sys.exit()
2071
2072 main()
2073
2074
2075
2076
2077
2078