Mercurial > repos > davidmurphy > codonlogo
view weblogolib/__init__.py @ 15:981eb8c3a756 default tip
Uploaded
author | davidmurphy |
---|---|
date | Sat, 31 Mar 2012 16:07:07 -0400 |
parents | 778f03497adb |
children |
line wrap: on
line source
#!/usr/bin/env python # -------------------------------- WebLogo -------------------------------- # Copyright (c) 2003-2004 The Regents of the University of California. # Copyright (c) 2005 Gavin E. Crooks # Copyright (c) 2006, The Regents of the University of California, through print # Lawrence Berkeley National Laboratory (subject to receipt of any required # approvals from the U.S. Dept. of Energy). All rights reserved. # This software is distributed under the new BSD Open Source License. # <http://www.opensource.org/licenses/bsd-license.html> # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # (1) Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # (2) Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and or other materials provided with the distribution. # # (3) Neither the name of the University of California, Lawrence Berkeley # National Laboratory, U.S. Dept. of Energy nor the names of its contributors # may be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # Replicates README.txt """ WebLogo (http://code.google.com/p/weblogo/) is a tool for creating sequence logos from biological sequence alignments. It can be run on the command line, as a standalone webserver, as a CGI webapp, or as a python library. The main WebLogo webserver is located at http://bespoke.lbl.gov/weblogo/ Codonlogo is based on Weblogo. Please consult the manual for installation instructions and more information: (Also located in the weblogolib/htdocs subdirectory.) For help on the command line interface run ./codonlogo --help To build a simple logo run ./codonlogo < cap.fa > logo0.eps To run as a standalone webserver at localhost:8080 ./codonlogo --server To create a logo in python code: >>> from weblogolib import * >>> fin = open('cap.fa') >>> seqs = read_seq_data(fin) >>> data = LogoData.from_seqs(seqs) >>> options = LogoOptions() >>> options.title = "A Logo Title" >>> format = LogoFormat(data, options) >>> fout = open('cap.eps', 'w') >>> eps_formatter( data, format, fout) -- Distribution and Modification -- This package is distributed under the new BSD Open Source License. Please see the LICENSE.txt file for details on copyright and licensing. The WebLogo source code can be downloaded from http://code.google.com/p/weblogo/ WebLogo requires Python 2.3, 2.4 or 2.5, the corebio python toolkit for computational biology (http://code.google.com/p/corebio), and the python array package 'numpy' (http://www.scipy.org/Download) # -------------------------------- CodonLogo -------------------------------- """ from math import * import random from itertools import izip, count import sys import copy import os from itertools import product from datetime import datetime from StringIO import StringIO from corebio.data import rna_letters, dna_letters, amino_acid_letters import random # python2.3 compatability from corebio._future import Template from corebio._future.subprocess import * from corebio._future import resource_string, resource_filename from math import log, sqrt # Avoid 'from numpy import *' since numpy has lots of names defined from numpy import array, asarray, float64, ones, zeros, int32,all,any, shape import numpy as na from corebio.utils.deoptparse import DeOptionParser from optparse import OptionGroup from color import * from colorscheme import * from corebio.seq import Alphabet, Seq, SeqList from corebio import seq_io from corebio.utils import isfloat, find_command, ArgumentError from corebio.moremath import * from corebio.data import amino_acid_composition from corebio.seq import unambiguous_rna_alphabet, unambiguous_dna_alphabet, unambiguous_protein_alphabet codon_alphabetU=['AAA', 'AAC', 'AAG', 'AAU', 'ACA', 'ACC', 'ACG', 'ACU', 'AGA', 'AGC', 'AGG', 'AGU', 'AUA', 'AUC', 'AUG', 'AUU', 'CAA', 'CAC', 'CAG', 'CAU', 'CCA', 'CCC', 'CCG', 'CCU', 'CGA', 'CGC', 'CGG', 'CGU', 'CUA', 'CUC', 'CUG', 'CUU', 'GAA', 'GAC', 'GAG', 'GAU', 'GCA', 'GCC', 'GCG', 'GCU', 'GGA', 'GGC', 'GGG', 'GGU', 'GUA', 'GUC', 'GUG', 'GUU', 'UAA', 'UAC', 'UAG', 'UAU', 'UCA', 'UCC', 'UCG', 'UCU', 'UGA', 'UGC', 'UGG', 'UGU', 'UUA', 'UUC', 'UUG', 'UUU'] codon_alphabetT=['AAA', 'AAC', 'AAG', 'AAT', 'ACA', 'ACC', 'ACG', 'ACT', 'AGA', 'AGC', 'AGG', 'AGT', 'ATA', 'ATC', 'ATG', 'ATT', 'CAA', 'CAC', 'CAG', 'CAT', 'CCA', 'CCC', 'CCG', 'CCT', 'CGA', 'CGC', 'CGG', 'CGT', 'CTA', 'CTC', 'CTG', 'CTT', 'GAA', 'GAC', 'GAG', 'GAT', 'GCA', 'GCC', 'GCG', 'GCT', 'GGA', 'GGC', 'GGG', 'GGT', 'GTA', 'GTC', 'GTG', 'GTT', 'TAA', 'TAC', 'TAG', 'TAT', 'TCA', 'TCC', 'TCG', 'TCT', 'TGA', 'TGC', 'TGG', 'TGT', 'TTA', 'TTC', 'TTG', 'TTT'] altype="codonsT" offset=0 isreversed=False show_warnings=False col=[] __all__ = ['LogoSize', 'LogoOptions', 'description', '__version__', 'LogoFormat', 'LogoData', 'Dirichlet', 'GhostscriptAPI', 'std_color_schemes', 'default_color_schemes', 'classic', 'std_units', 'std_sizes', 'std_alphabets', 'std_percentCG', 'pdf_formatter', 'jpeg_formatter', 'png_formatter', 'png_print_formatter', 'txt_formatter', 'eps_formatter', 'formatters', 'default_formatter', 'base_distribution', 'equiprobable_distribution', 'read_seq_data', 'which_alphabet', 'color', 'colorscheme', ] description = "Create sequence logos from biological sequence alignments." __version__ = "1.1" # These keywords are subsituted by subversion. # The date and revision will only tell the truth after a branch or tag, # since different files in trunk will have been changed at different times release_date ="$Date: 2011-09-17 16:30:00 -0700 (Tue, 14 Oct 2008) $".split()[1] release_build = "$Revision: 53 $".split()[1] release_description = "CodonLogo %s (%s)" % (__version__, release_date) def cgi(htdocs_directory) : import weblogolib._cgi weblogolib._cgi.main(htdocs_directory) class GhostscriptAPI(object) : """Interface to the command line program Ghostscript ('gs')""" formats = ('png', 'pdf', 'jpeg') def __init__(self, path=None) : try: command = find_command('gs', path=path) except EnvironmentError: try: command = find_command('gswin32c.exe', path=path) except EnvironmentError: raise EnvironmentError("Could not find Ghostscript on path." " There should be either a gs executable or a gswin32c.exe on your system's path") self.command = command def version(self) : args = [self.command, '--version'] try : p = Popen(args, stdout=PIPE) (out,err) = p.communicate() except OSError : raise RuntimeError("Cannot communicate with ghostscript.") return out.strip() def convert(self, format, fin, fout, width, height, resolution=300) : device_map = { 'png':'png16m', 'pdf':'pdfwrite', 'jpeg':'jpeg'} try : device = device_map[format] except KeyError: raise ValueError("Unsupported format.") args = [self.command, "-sDEVICE=%s" % device, "-dPDFSETTINGS=/printer", #"-q", # Quite: Do not dump messages to stdout. "-sstdout=%stderr", # Redirect messages and errors to stderr "-sOutputFile=-", # Stdout "-dDEVICEWIDTHPOINTS=%s" % str(width), "-dDEVICEHEIGHTPOINTS=%s" % str(height), "-dSAFER", # For added security "-dNOPAUSE",] if device != 'pdf' : args.append("-r%s" % str(resolution) ) if resolution < 300 : # Antialias if resolution is Less than 300 DPI args.append("-dGraphicsAlphaBits=4") args.append("-dTextAlphaBits=4") args.append("-dAlignToPixels=0") args.append("-") # Read from stdin. Must be last argument. error_msg = "Unrecoverable error : Ghostscript conversion failed " \ "(Invalid postscript?). %s" % " ".join(args) source = fin.read() try : p = Popen(args, stdin=PIPE, stdout = PIPE, stderr= PIPE) (out,err) = p.communicate(source) except OSError : raise RuntimeError(error_msg) if p.returncode != 0 : error_msg += '\nReturn code: %i\n' % p.returncode if err is not None : error_msg += err raise RuntimeError(error_msg) print >>fout, out # end class Ghostscript aa_composition = [ amino_acid_composition[_k] for _k in unambiguous_protein_alphabet] # ------ DATA ------ classic = ColorScheme([ ColorGroup("G", "orange" ), ColorGroup("TU", "red"), ColorGroup("C", "blue"), ColorGroup("A", "green") ] ) std_color_schemes = {"auto": None, # Depends on sequence type "monochrome": monochrome, "base pairing": base_pairing, "classic": classic, "hydrophobicity" : hydrophobicity, "chemistry" : chemistry, "charge" : charge, }# default_color_schemes = { unambiguous_protein_alphabet: hydrophobicity, unambiguous_rna_alphabet: base_pairing, unambiguous_dna_alphabet: base_pairing #codon_alphabet:codonsU } std_units = { "bits" : 1./log(2), "nats" : 1., "digits" : 1./log(10), "kT" : 1., "kJ/mol" : 8.314472 *298.15 /1000., "kcal/mol": 1.987 *298.15 /1000., "probability" : None, } class LogoSize(object) : def __init__(self, stack_width, stack_height) : self.stack_width = stack_width self.stack_height = stack_height def __repr__(self): return stdrepr(self) # The base stack width is set equal to 9pt Courier. # (Courier has a width equal to 3/5 of the point size.) # Check that can get 80 characters in journal page @small # 40 chacaters in a journal column std_sizes = { "small" : LogoSize( stack_width = 16.2, stack_height = 10*1*5), "medium" : LogoSize( stack_width = 16.2*2, stack_height = 10*2*5), "large" : LogoSize( stack_width = 16.2*3, stack_height = 10*3*5), } std_alphabets = { 'protein': unambiguous_protein_alphabet, 'rna': unambiguous_rna_alphabet, 'dna': unambiguous_dna_alphabet, 'codonsU':codon_alphabetU, 'codonsT':codon_alphabetT } std_percentCG = { 'H. sapiens' : 40., 'E. coli' : 50.5, 'S. cerevisiae' : 38., 'C. elegans' : 36., 'D. melanogaster': 43., 'M. musculus' : 42., 'T. thermophilus' : 69.4, } # Thermus thermophilus: Henne A, Bruggemann H, Raasch C, Wiezer A, Hartsch T, # Liesegang H, Johann A, Lienard T, Gohl O, Martinez-Arias R, Jacobi C, # Starkuviene V, Schlenczeck S, Dencker S, Huber R, Klenk HP, Kramer W, # Merkl R, Gottschalk G, Fritz HJ: The genome sequence of the extreme # thermophile Thermus thermophilus. # Nat Biotechnol 2004, 22:547-53 def stdrepr(obj) : attr = vars(obj).items() attr.sort() args = [] for a in attr : if a[0][0]=='_' : continue args.append( '%s=%s' % ( a[0], repr(a[1])) ) args = ',\n'.join(args).replace('\n', '\n ') return '%s(\n %s\n)' % (obj.__class__.__name__, args) class LogoOptions(object) : """ A container for all logo formating options. Not all of these are directly accessible through the CLI or web interfaces. To display LogoOption defaults: >>> from weblogolib import * >>> LogoOptions() Attributes: o alphabet o creator_text -- Embedded as comment in figures. o logo_title o logo_label o stacks_per_line o unit_name o show_yaxis o yaxis_label -- Default depends on other settings. o yaxis_tic_interval o yaxis_minor_tic_ratio o yaxis_scale o show_xaxis o xaxis_label o xaxis_tic_interval o rotate_numbers o number_interval o show_ends o show_fineprint o fineprint o show_boxes o shrink_fraction o show_errorbars o errorbar_fraction o errorbar_width_fraction o errorbar_gray o resolution -- Dots per inch o default_color o color_scheme o debug o logo_margin o stroke_width o tic_length o size o stack_margin o pad_right o small_fontsize o fontsize o title_fontsize o number_fontsize o text_font o logo_font o title_font o first_index o logo_start o logo_end o scale_width """ def __init__(self, **kwargs) : """ Create a new LogoOptions instance. >>> L = LogoOptions(logo_title = "Some Title String") >>> L.show_yaxis = False >>> repr(L) """ self.creator_text = release_description, self.alphabet = None self.logo_title = "" self.logo_label = "" self.stacks_per_line = 20 self.unit_name = "bits" self.show_yaxis = True # yaxis_lable default depends on other settings. See LogoFormat self.yaxis_label = None self.yaxis_tic_interval = 1. self.yaxis_minor_tic_ratio = 5 self.yaxis_scale = None self.show_xaxis = True self.strict=False self.xaxis_label = "" self.xaxis_tic_interval =1 self.rotate_numbers = False self.number_interval = 5 self.show_ends = False self.show_fineprint = True self.fineprint = "CodonLogo "+__version__ self.show_boxes = False self.shrink_fraction = 0.5 self.show_errorbars = True self.altype = True self.errorbar_fraction = 0.90 self.errorbar_width_fraction = 0.25 self.errorbar_gray = 0.50 self.resolution = 96. # Dots per inch self.default_color = Color.by_name("black") self.color_scheme = None #self.show_color_key = False # NOT yet implemented self.debug = False self.logo_margin = 2 self.stroke_width = 0.5 self.tic_length = 5 self.size = std_sizes["medium"] self.stack_margin = 0.5 self.pad_right = False self.small_fontsize = 6 self.fontsize = 10 self.title_fontsize = 12 self.number_fontsize = 8 self.text_font = "ArialMT" self.logo_font = "Arial-BoldMT" self.title_font = "ArialMT" self.first_index = 1 self.logo_start = None self.logo_end=None # Scale width of characters proportional to gaps self.scale_width = True from corebio.utils import update update(self, **kwargs) def __repr__(self) : attr = vars(self).items() attr.sort() args = [] for a in attr : if a[0][0]=='_' : continue args.append( '%s=%s' % ( a[0], repr(a[1])) ) args = ',\n'.join(args).replace('\n', '\n ') return '%s(\n %s\n)' % (self.__class__.__name__, args) # End class LogoOptions class LogoFormat(LogoOptions) : """ Specifies the format of the logo. Requires a LogoData and LogoOptions objects. >>> data = LogoData.from_seqs(seqs ) >>> options = LogoOptions() >>> options.title = "A Logo Title" >>> format = LogoFormat(data, options) """ # TODO: Raise ArgumentErrors instead of ValueError and document def __init__(self, data, options= None) : LogoOptions.__init__(self) #global offset if options is not None : self.__dict__.update(options.__dict__) #offset=options.frame self.alphabet = data.alphabet self.seqlen = data.length self.altype = True self.show_title = False self.show_xaxis_label = False self.yaxis_minor_tic_interval = None self.lines_per_logo = None self.char_width = None self.line_margin_left = None self.line_margin_right = None self.line_margin_bottom = None self.line_margin_top = None self.title_height = None self.xaxis_label_height = None self.line_height = None self.line_width = None self.logo_height = None self.logo_width = None self.creation_date = None self.end_type = None if self.stacks_per_line< 1 : raise ArgumentError("Stacks per line should be greater than zero.", "stacks_per_line" ) if self.size.stack_height<=0.0 : raise ArgumentError( "Stack height must be greater than zero.", "stack_height") if (self.small_fontsize <= 0 or self.fontsize <=0 or self.title_fontsize<=0 ): raise ValueError("Font sizes must be positive.") if self.errorbar_fraction<0.0 or self.errorbar_fraction>1.0 : raise ValueError( "The visible fraction of the error bar must be between zero and one.") if self.yaxis_tic_interval<=0.0 : raise ArgumentError( "The yaxis tic interval cannot be negative.", 'yaxis_tic_interval') if self.size.stack_width <= 0.0 : raise ValueError( "The width of a stack should be a positive number.") if self.yaxis_minor_tic_interval and \ self.yaxis_minor_tic_interval<=0.0 : raise ValueError("Distances cannot be negative.") if self.xaxis_tic_interval<=0 : raise ValueError("Tic interval must be greater than zero.") if self.number_interval<=0 : raise ValueError("Invalid interval between numbers.") if self.shrink_fraction<0.0 or self.shrink_fraction>1.0 : raise ValueError("Invalid shrink fraction.") if self.stack_margin<=0.0 : raise ValueError("Invalid stack margin." ) if self.logo_margin<=0.0 : raise ValueError("Invalid logo margin." ) if self.stroke_width<=0.0 : raise ValueError("Invalid stroke width.") if self.tic_length<=0.0 : raise ValueError("Invalid tic length.") # FIXME: More validation # Inclusive upper and lower bounds # FIXME: Validate here. Move from eps_formatter if self.logo_start is None: self.logo_start = self.first_index if self.logo_end is None : self.logo_end = self.seqlen + self.first_index -1 self.total_stacks = self.logo_end - self.logo_start +1 if self.logo_start - self.first_index <0 : raise ArgumentError( "Logo range extends before start of available sequence.", 'logo_range') if self.logo_end - self.first_index >= self.seqlen : raise ArgumentError( "Logo range extends beyond end of available sequence.", 'logo_range') if self.logo_title : self.show_title = True if not self.fineprint : self.show_fineprint = False if self.xaxis_label : self.show_xaxis_label = True if self.yaxis_label is None : self.yaxis_label = self.unit_name if self.yaxis_label : self.show_yaxis_label = True else : self.show_yaxis_label = False self.show_ends = False if not self.yaxis_scale : conversion_factor = std_units[self.unit_name] if conversion_factor : self.yaxis_scale=log(len(self.alphabet))*conversion_factor #self.yaxis_scale=max(data.entropy)*conversion_factor #marker# this is where I handle the max height. needs revision. else : self.yaxis_scale=1.0 # probability units if self.yaxis_scale<=0.0 : raise ValueError(('yaxis_scale', "Invalid yaxis scale")) if self.yaxis_tic_interval >= self.yaxis_scale: self.yaxis_tic_interval /= 2. self.yaxis_minor_tic_interval \ = float(self.yaxis_tic_interval)/self.yaxis_minor_tic_ratio if self.color_scheme is None : #if self.alphabet in default_color_schemes : #self.color_scheme = default_color_schemes[self.alphabet] #else : self.color_scheme = codonsT #else: #for color, symbols, desc in options.colors: #try : #self.color_scheme.append( ColorGroup(symbols, color, desc) ) #print >>sys.stderr, color_scheme.groups[2] #except ValueError : #raise ValueError( #"error: option --color: invalid value: '%s'" % color ) self.lines_per_logo = 1+ ( (self.total_stacks-1) / self.stacks_per_line) if self.lines_per_logo==1 and not self.pad_right: self.stacks_per_line = min(self.stacks_per_line, self.total_stacks) self.char_width = self.size.stack_width - 2* self.stack_margin if self.show_yaxis : self.line_margin_left = self.fontsize * 3.0 else : self.line_margin_left = 0 if self.show_ends : self.line_margin_right = self.fontsize *1.5 else : self.line_margin_right = self.fontsize if self.show_xaxis : if self.rotate_numbers : self.line_margin_bottom = self.number_fontsize *2.5 else: self.line_margin_bottom = self.number_fontsize *1.5 else : self.line_margin_bottom = 4 self.line_margin_top = 4 if self.show_title : self.title_height = self.title_fontsize else : self.title_height = 0 self.xaxis_label_height =0. if self.show_xaxis_label : self.xaxis_label_height += self.fontsize if self.show_fineprint : self.xaxis_label_height += self.small_fontsize self.line_height = (self.size.stack_height + self.line_margin_top + self.line_margin_bottom ) self.line_width = (self.size.stack_width*self.stacks_per_line + self.line_margin_left + self.line_margin_right ) self.logo_height = int(2*self.logo_margin + self.title_height \ + self.xaxis_label_height + self.line_height*self.lines_per_logo) self.logo_width = int(2*self.logo_margin + self.line_width ) self.creation_date = datetime.now().isoformat(' ') end_type = '-' end_types = { unambiguous_protein_alphabet: 'p', unambiguous_rna_alphabet: '-', unambiguous_dna_alphabet: 'd' } if self.show_ends and self.alphabet in end_types: end_type = end_types[self.alphabet] self.end_type = end_type # End __init__ # End class LogoFormat # ------ Logo Formaters ------ # Each formatter is a function f(LogoData, LogoFormat, output file). # that draws a represntation of the logo into the given file. # The main graphical formatter is eps_formatter. A mapping 'formatters' # containing all available formatters is located after the formatter # definitions. def pdf_formatter(data, format, fout) : """ Generate a logo in PDF format.""" feps = StringIO() eps_formatter(data, format, feps) feps.seek(0) gs = GhostscriptAPI() gs.convert('pdf', feps, fout, format.logo_width, format.logo_height) def _bitmap_formatter(data, format, fout, device) : feps = StringIO() eps_formatter(data, format, feps) feps.seek(0) gs = GhostscriptAPI() gs.convert(device, feps, fout, format.logo_width, format.logo_height, format.resolution) def jpeg_formatter(data, format, fout) : """ Generate a logo in JPEG format.""" _bitmap_formatter(data, format, fout, device="jpeg") def png_formatter(data, format, fout) : """ Generate a logo in PNG format.""" _bitmap_formatter(data, format, fout, device="png") def png_print_formatter(data, format, fout) : """ Generate a logo in PNG format with print quality (600 DPI) resolution.""" format.resolution = 600 _bitmap_formatter(data, format, fout, device="png") def txt_formatter( logodata, format, fout) : """ Create a text representation of the logo data. """ print >>fout, str(logodata) def eps_formatter( logodata, format, fout) : """ Generate a logo in Encapsulated Postscript (EPS)""" subsitutions = {} from_format =[ "creation_date", "logo_width", "logo_height", "lines_per_logo", "line_width", "line_height", "line_margin_right","line_margin_left", "line_margin_bottom", "line_margin_top", "title_height", "xaxis_label_height", "creator_text", "logo_title", "logo_margin", "stroke_width", "tic_length", "stacks_per_line", "stack_margin", "yaxis_label", "yaxis_tic_interval", "yaxis_minor_tic_interval", "xaxis_label", "xaxis_tic_interval", "number_interval", "fineprint", "shrink_fraction", "errorbar_fraction", "errorbar_width_fraction", "errorbar_gray", "small_fontsize", "fontsize", "title_fontsize", "number_fontsize", "text_font", "logo_font", "title_font", "strict", "logo_label", "yaxis_scale", "end_type", "debug", "show_title", "show_xaxis", "show_xaxis_label", "show_yaxis", "show_yaxis_label", "show_boxes", "show_errorbars", "show_fineprint", "rotate_numbers", "show_ends", "altype", ] for s in from_format : subsitutions[s] = getattr(format,s) from_format_size = ["stack_height", "stack_width"] for s in from_format_size : subsitutions[s] = getattr(format.size,s) subsitutions["shrink"] = str(format.show_boxes).lower() # --------- COLORS -------------- def format_color(color): return " ".join( ("[",str(color.red) , str(color.green), str(color.blue), "]")) subsitutions["default_color"] = format_color(format.default_color) global col colors = [] if altype=="codonsT" or altype=="codonsU": for group in col: cf = format_color(group.color) colors.append( " ("+group.symbols+") " + cf ) for group in format.color_scheme.groups : cf = format_color(group.color) colors.append( " ("+group.symbols+") " + cf ) #print >>sys.stderr,opts.colors #print >>sys.stderr,logodata.options #print >>sys.stderr, group.symbols #print >>sys.stderr, cf else: for group in format.color_scheme.groups : cf = format_color(group.color) for s in group.symbols : colors.append( " ("+s+") " + cf ) subsitutions["color_dict"] = "\n".join(colors) data = [] # Unit conversion. 'None' for probability units conv_factor = std_units[format.unit_name] data.append("StartLine") seq_from = format.logo_start- format.first_index seq_to = format.logo_end - format.first_index +1 # seq_index : zero based index into sequence data # logo_index : User visible coordinate, first_index based # stack_index : zero based index of visible stacks for seq_index in range(seq_from, seq_to) : logo_index = seq_index + format.first_index stack_index = seq_index - seq_from if stack_index!=0 and (stack_index % format.stacks_per_line) ==0 : data.append("") data.append("EndLine") data.append("StartLine") data.append("") if isreversed : data.append("(%d) StartStack" % logo_index) else : data.append("(%d) StartStack" % logo_index) if conv_factor: stack_height = logodata.entropy[seq_index] * std_units[format.unit_name] else : stack_height = 1.0 # Probability # if logodata.entropy_interval is not None and conv_factor: # Draw Error bars # low, high = logodata.entropy_interval[seq_index] # center = logodata.entropy[seq_index] # down = (center - low) * conv_factor # up = (high - center) * conv_factor # data.append(" %f %f %f DrawErrorbarFirst" % (down, up, stack_height) ) s = zip(logodata.counts[seq_index], logodata.alphabet) def mycmp( c1, c2 ) : # Sort by frequency. If equal frequency then reverse alphabetic if c1[0] == c2[0] : return cmp(c2[1], c1[1]) return cmp(c1[0], c2[0]) s.sort(mycmp) C = float(sum(logodata.counts[seq_index])) if C > 0.0 : fraction_width = 1.0 if format.scale_width : fraction_width = logodata.weight[seq_index] for c in s: data.append(" %f %f (%s) ShowSymbol" % (fraction_width, c[0]*stack_height/C, c[1]) ) # Draw error bar on top of logo. Replaced by DrawErrorbarFirst above. if logodata.entropy_interval is not None and conv_factor: low, high = logodata.entropy_interval[seq_index] center = logodata.entropy[seq_index] down = (center - low) * conv_factor up = (high - center) * conv_factor data.append(" %f %f DrawErrorbar" % (down, up) ) data.append("EndStack") data.append("") data.append("EndLine") subsitutions["logo_data"] = "\n".join(data) # Create and output logo template = resource_string( __name__, 'template.eps', __file__) logo = Template(template).substitute(subsitutions) print >>fout, logo # map between output format names and logo formatters = { 'eps': eps_formatter, 'pdf': pdf_formatter, 'png': png_formatter, 'png_print' : png_print_formatter, 'jpeg' : jpeg_formatter, 'txt' : txt_formatter, } default_formatter = eps_formatter def parse_prior(composition, alphabet, weight=None) : """ Parse a description of the expected monomer distribution of a sequence. Valid compositions: - None or 'none' : No composition sepecified - 'auto' or 'automatic': Use the typical average distribution for proteins and an equiprobable distribution for everything else. - 'equiprobable' : All monomers have the same probability. - a percentage, e.g. '45%' or a fraction '0.45': The fraction of CG bases for nucleotide alphabets - a species name, e.g. 'E. coli', 'H. sapiens' : Use the average CG percentage for the specie's genome. - An explicit distribution, e.g. {'A':10, 'C':40, 'G':40, 'T':10} """ if composition is None: return None comp = composition.strip() if comp.lower() == 'none': return None if weight is None and alphabet is not None: weight = float(len(alphabet)) if comp.lower() == 'equiprobable' : prior = weight * equiprobable_distribution(len(alphabet)) return prior if comp.lower() == 'escherichiacoli' : if(altype=="codonsT"): composition="{'CTT': 0.7616, 'ATG': 1.5872, 'ACA': 0.4096, 'ACG': 0.736, 'ATC': 1.1648, 'AAC': 1.5615999999999999, 'ATA': 0.2368, 'AGG': 0.1024, 'CCT': 0.5376000000000001, 'ACT': 0.512, 'AGC': 1.0624, 'AAG': 0.7744, 'AGA': 0.0896, 'CAT': 1.0112, 'AAT': 1.4016, 'ATT': 1.952, 'CTG': 3.0016, 'CTA': 0.3392, 'CTC': 0.672, 'CAC': 0.8383999999999999, 'AAA': 2.1248, 'CCG': 1.7087999999999999, 'AGT': 0.4608, 'CCA': 0.4224, 'CAA': 0.7744, 'CCC': 0.4096, 'TAT': 1.0752000000000002, 'GGT': 1.3632, 'TGT': 0.37760000000000005, 'CGA': 0.2752, 'CAG': 1.7728, 'TCT': 0.3648, 'GAT': 2.4255999999999998, 'CGG': 0.26239999999999997, 'TTT': 1.2608, 'TGC': 0.512, 'GGG': 0.5504, 'TAG': 1e-06, 'GGA': 0.5888, 'TAA': 0.1152, 'GGC': 2.1376, 'TAC': 0.9344, 'TTC': 0.96, 'TCG': 0.512, 'TTA': 0.9728, 'TTG': 0.7616, 'TCC': 0.352, 'ACC': 1.4592, 'TCA': 0.4992, 'GCA': 1.3504, 'GTA': 0.736, 'GCC': 2.0224, 'GTC': 0.7487999999999999, 'GCG': 2.464, 'GTG': 1.6896, 'GAG': 1.1776, 'GTT': 1.0752000000000002, 'GCT': 0.6848, 'TGA': 0.064, 'GAC': 1.312, 'CGT': 1.3504, 'TGG': 0.6848, 'GAA': 2.7968, 'CGC': 1.664}" else: composition="{'CUU': 0.7616, 'AUG': 1.5872, 'ACA': 0.4096, 'ACG': 0.736, 'AUC': 1.1648, 'AAC': 1.5615999999999999, 'AUA': 0.2368, 'AGG': 0.1024, 'CCU': 0.5376000000000001, 'ACU': 0.512, 'AGC': 1.0624, 'AAG': 0.7744, 'AGA': 0.0896, 'CAU': 1.0112, 'AAU': 1.4016, 'AUU': 1.952, 'CUG': 3.0016, 'CUA': 0.3392, 'CUC': 0.672, 'CAC': 0.8383999999999999, 'AAA': 2.1248, 'CCG': 1.7087999999999999, 'AGU': 0.4608, 'CCA': 0.4224, 'CAA': 0.7744, 'CCC': 0.4096, 'UAU': 1.0752000000000002, 'GGU': 1.3632, 'UGU': 0.37760000000000005, 'CGA': 0.2752, 'CAG': 1.7728, 'UCU': 0.3648, 'GAU': 2.4255999999999998, 'CGG': 0.26239999999999997, 'UUU': 1.2608, 'UGC': 0.512, 'GGG': 0.5504, 'UAG': 1e-06, 'GGA': 0.5888, 'UAA': 0.1152, 'GGC': 2.1376, 'UAC': 0.9344, 'UUC': 0.96, 'UCG': 0.512, 'UUA': 0.9728, 'UUG': 0.7616, 'UCC': 0.352, 'ACC': 1.4592, 'UCA': 0.4992, 'GCA': 1.3504, 'GUA': 0.736, 'GCC': 2.0224, 'GUC': 0.7487999999999999, 'GCG': 2.464, 'GUG': 1.6896, 'GAG': 1.1776, 'GUU': 1.0752000000000002, 'GCU': 0.6848, 'UGA': 0.064, 'GAC': 1.312, 'CGU': 1.3504, 'UGG': 0.6848, 'GAA': 2.7968, 'CGC': 1.664}" elif comp.lower() == 'homosapiens' : if(altype=="codonsT"): composition="{'CTT': 0.8448, 'ATG': 1.408, 'ACA': 0.9663999999999999, 'ACG': 0.39039999999999997, 'ATC': 1.3312, 'AAC': 1.2224000000000002, 'ATA': 0.48, 'AGG': 0.768, 'CCT': 1.12, 'ACT': 0.8383999999999999, 'AGC': 1.248, 'AAG': 2.0416, 'AGA': 0.7807999999999999, 'CAT': 0.6976, 'AAT': 1.088, 'ATT': 1.024, 'CTG': 2.5344, 'CTA': 0.4608, 'CTC': 1.2544000000000002, 'CAC': 0.9663999999999999, 'AAA': 1.5615999999999999, 'CCG': 0.44160000000000005, 'AGT': 0.7744, 'CCA': 1.0816, 'CAA': 0.7872, 'CCC': 1.2672, 'TAT': 0.7807999999999999, 'GGT': 0.6912, 'TGT': 0.6784, 'CGA': 0.3968, 'CAG': 2.1888, 'TCT': 0.9728, 'GAT': 1.3952, 'CGG': 0.7296, 'TTT': 1.1264, 'TGC': 0.8064, 'GGG': 1.056, 'TAG': 0.0512, 'GGA': 1.056, 'TAA': 0.064, 'GGC': 1.4208, 'TAC': 0.9792000000000001, 'TTC': 1.2992000000000001, 'TCG': 0.2816, 'TTA': 0.4928, 'TTG': 0.8256, 'TCC': 1.1328, 'ACC': 1.2096, 'TCA': 0.7807999999999999, 'GCA': 1.0112, 'GTA': 0.45439999999999997, 'GCC': 1.7728, 'GTC': 0.928, 'GCG': 0.4736, 'GTG': 1.7984, 'GAG': 2.5344, 'GTT': 0.704, 'GCT': 1.1776, 'TGA': 0.1024, 'GAC': 1.6064, 'CGT': 0.288, 'TGG': 0.8448, 'GAA': 1.856, 'CGC': 0.6656}" else: composition="{'CUU': 0.8448, 'AUG': 1.408, 'ACA': 0.9663999999999999, 'ACG': 0.39039999999999997, 'AUC': 1.3312, 'AAC': 1.2224000000000002, 'AUA': 0.48, 'AGG': 0.768, 'CCU': 1.12, 'ACU': 0.8383999999999999, 'AGC': 1.248, 'AAG': 2.0416, 'AGA': 0.7807999999999999, 'CAU': 0.6976, 'AAU': 1.088, 'AUU': 1.024, 'CUG': 2.5344, 'CUA': 0.4608, 'CUC': 1.2544000000000002, 'CAC': 0.9663999999999999, 'AAA': 1.5615999999999999, 'CCG': 0.44160000000000005, 'AGU': 0.7744, 'CCA': 1.0816, 'CAA': 0.7872, 'CCC': 1.2672, 'UAU': 0.7807999999999999, 'GGU': 0.6912, 'UGU': 0.6784, 'CGA': 0.3968, 'CAG': 2.1888, 'UCU': 0.9728, 'GAU': 1.3952, 'CGG': 0.7296, 'UUU': 1.1264, 'UGC': 0.8064, 'GGG': 1.056, 'UAG': 0.0512, 'GGA': 1.056, 'UAA': 0.064, 'GGC': 1.4208, 'UAC': 0.9792000000000001, 'UUC': 1.2992000000000001, 'UCG': 0.2816, 'UUA': 0.4928, 'UUG': 0.8256, 'UCC': 1.1328, 'ACC': 1.2096, 'UCA': 0.7807999999999999, 'GCA': 1.0112, 'GUA': 0.45439999999999997, 'GCC': 1.7728, 'GUC': 0.928, 'GCG': 0.4736, 'GUG': 1.7984, 'GAG': 2.5344, 'GUU': 0.704, 'GCU': 1.1776, 'UGA': 0.1024, 'GAC': 1.6064, 'CGU': 0.288, 'UGG': 0.8448, 'GAA': 1.856, 'CGC': 0.6656}" elif comp.lower() == 'saccharomycescerevisiae' : if(altype=="codonsT"): composition="{'CTT': 0.7872, 'ATG': 1.3376, 'ACA': 1.1392, 'ACG': 0.512, 'ATC': 1.1008, 'AAC': 1.5872, 'ATA': 1.1392, 'AGG': 0.5888, 'CCT': 0.864, 'ACT': 1.2992000000000001, 'AGC': 0.6272000000000001, 'AAG': 1.9712, 'AGA': 1.3632, 'CAT': 0.8704, 'AAT': 2.2848, 'ATT': 1.9264000000000001, 'CTG': 0.672, 'CTA': 0.8576, 'CTC': 0.3456, 'CAC': 0.4992, 'AAA': 2.6816, 'CCG': 0.3392, 'AGT': 0.9087999999999999, 'CCA': 1.1712, 'CAA': 1.7472, 'CCC': 0.4352, 'TAT': 1.2032, 'GGT': 1.5295999999999998, 'TGT': 0.5184, 'CGA': 0.192, 'CAG': 0.7744, 'TCT': 1.504, 'GAT': 2.4064, 'CGG': 0.1088, 'TTT': 1.6704, 'TGC': 0.3072, 'GGG': 0.384, 'TAG': 0.032, 'GGA': 0.6976, 'TAA': 0.0704, 'GGC': 0.6272000000000001, 'TAC': 0.9472, 'TTC': 1.1776, 'TCG': 0.5504, 'TTA': 1.6767999999999998, 'TTG': 1.7408, 'TCC': 0.9087999999999999, 'ACC': 0.8128, 'TCA': 1.1967999999999999, 'GCA': 1.0368, 'GTA': 0.7552000000000001, 'GCC': 0.8064, 'GTC': 0.7552000000000001, 'GCG': 0.3968, 'GTG': 0.6912, 'GAG': 1.2288, 'GTT': 1.4144, 'GCT': 1.3568, 'TGA': 0.0448, 'GAC': 1.2928, 'CGT': 0.4096, 'TGG': 0.6656, 'GAA': 2.9184, 'CGC': 0.1664}" else: composition="{'CUU': 0.7872, 'AUG': 1.3376, 'ACA': 1.1392, 'ACG': 0.512, 'AUC': 1.1008, 'AAC': 1.5872, 'AUA': 1.1392, 'AGG': 0.5888, 'CCU': 0.864, 'ACU': 1.2992000000000001, 'AGC': 0.6272000000000001, 'AAG': 1.9712, 'AGA': 1.3632, 'CAU': 0.8704, 'AAU': 2.2848, 'AUU': 1.9264000000000001, 'CUG': 0.672, 'CUA': 0.8576, 'CUC': 0.3456, 'CAC': 0.4992, 'AAA': 2.6816, 'CCG': 0.3392, 'AGU': 0.9087999999999999, 'CCA': 1.1712, 'CAA': 1.7472, 'CCC': 0.4352, 'UAU': 1.2032, 'GGU': 1.5295999999999998, 'UGU': 0.5184, 'CGA': 0.192, 'CAG': 0.7744, 'UCU': 1.504, 'GAU': 2.4064, 'CGG': 0.1088, 'UUU': 1.6704, 'UGC': 0.3072, 'GGG': 0.384, 'UAG': 0.032, 'GGA': 0.6976, 'UAA': 0.0704, 'GGC': 0.6272000000000001, 'UAC': 0.9472, 'UUC': 1.1776, 'UCG': 0.5504, 'UUA': 1.6767999999999998, 'UUG': 1.7408, 'UCC': 0.9087999999999999, 'ACC': 0.8128, 'UCA': 1.1967999999999999, 'GCA': 1.0368, 'GUA': 0.7552000000000001, 'GCC': 0.8064, 'GUC': 0.7552000000000001, 'GCG': 0.3968, 'GUG': 0.6912, 'GAG': 1.2288, 'GUU': 1.4144, 'GCU': 1.3568, 'UGA': 0.0448, 'GAC': 1.2928, 'CGU': 0.4096, 'UGG': 0.6656, 'GAA': 2.9184, 'CGC': 0.1664}" if comp.lower() == 'auto' or comp.lower() == 'automatic': if alphabet == unambiguous_protein_alphabet : prior = weight * asarray(aa_composition, float64) else : prior = weight * equiprobable_distribution(len(alphabet)) elif comp in std_percentCG : prior = weight * base_distribution(std_percentCG[comp]) return prior elif comp[-1] == '%' : prior = weight * base_distribution( float(comp[:-1])) return prior elif isfloat(comp) : prior = weight * base_distribution( float(comp)*100. ) return prior elif composition[0] == '{' and composition[-1] == '}' : explicit = composition[1: -1] explicit = explicit.replace(',',' ').replace("'", ' ').replace('"',' ').replace(':', ' ').split() if len(explicit) != len(alphabet)*2 : #print explicit raise ValueError("Explicit prior does not match length of alphabet") prior = - ones(len(alphabet), float64) try : for r in range(len(explicit)/2) : letter = explicit[r*2] index = alphabet.index(letter) value = float(explicit[r*2 +1]) prior[index] = value except ValueError : raise ValueError("Cannot parse explicit composition") if any(prior==-1.) : raise ValueError("Explicit prior does not match alphabet") prior/= sum(prior) prior *= weight else : raise ValueError("Unknown or malformed composition: %s"%composition) if len(prior) != len(alphabet) : raise ValueError( "The sequence alphabet and composition are incompatible.") return prior def base_distribution(percentCG) : A = (1. - (percentCG/100.))/2. C = (percentCG/100.)/2. G = (percentCG/100.)/2. T = (1. - (percentCG/100))/2. return asarray((A,C,G,T), float64) def equiprobable_distribution( length) : return ones( (length), float64) /length def read_seq_data(fin, input_parser=seq_io.read,alphabet=None, ignore_lower_case=False, max_file_size=0): # TODO: Document this method and enviroment variable max_file_size =int(os.environ.get("WEBLOGO_MAX_FILE_SIZE", max_file_size)) # If max_file_size is set, or if fin==stdin (which is non-seekable), we # read the data and replace fin with a StringIO object. if(max_file_size>0) : data = fin.read(max_file_size) more_data = fin.read(2) if more_data != "" : raise IOError("File exceeds maximum allowed size: %d bytes" % max_file_size) fin = StringIO(data) elif fin == sys.stdin: fin = StringIO(fin.read()) seqs = input_parser(fin) if seqs is None or len(seqs) ==0 : raise ValueError("Please provide a multiple sequence alignment") if ignore_lower_case : # Case is significant. Do not count lower case letters. for i,s in enumerate(seqs) : seqs[i] = s.mask() global altype if(altype=="codonsT" or altype=="codonsU"): if 'T' in seqs[0] or 't' in seqs[0]: altype="codonsT" if 'U' in seqs[0] or 'u' in seqs[0]: altype="codonsU" global offset global isreversed seq2=[""]*len(seqs) seq2 = [] for i in xrange(len(seqs)): seq2.append([]) if(offset%6>2): for x in range(0,len(seqs)): backs=seqs[x][::-1] for y in range(0,len(backs)): seq2[x].append(str(backs[y])) if(altype=="codonsU"): for x in range(0,len(seq2)): for y in range(0,len(seq2[x])): if(cmp(seq2[x][y],'G')==0): seq2[x][y]="C" elif(cmp(seq2[x][y],'A')==0): seq2[x][y]='U' elif(cmp(seq2[x][y],'U')==0): seq2[x][y]='A' elif(cmp(seq2[x][y],'C')==0): seq2[x][y]='G' if(altype=="codonsT"): for x in range(0,len(seq2)): for y in range(0,len(seq2[x])): if(cmp(seq2[x][y],'G')==0): seq2[x][y]='C' elif(cmp(seq2[x][y],'A')==0): seq2[x][y]='T' elif(cmp(seq2[x][y],'T')==0): seq2[x][y]='A' elif(cmp(seq2[x][y],'C')==0): seq2[x][y]='G' offset=offset%3 isreversed=True for x in range(0,len(seqs)): seqs[x]=Seq("".join(seq2[x])) # Add alphabet to seqs. if alphabet : seqs.alphabet = alphabet else : seqs.alphabet = which_alphabet(seqs) return seqs #TODO: Move to seq_io? # Would have to check that no symbol outside of full alphabet? def which_alphabet(seqs) : """ Returns the most appropriate unambiguous protien, rna or dna alphabet for a Seq or SeqList. """ alphabets = (unambiguous_protein_alphabet, unambiguous_rna_alphabet, unambiguous_dna_alphabet ) # Heuristic # Count occurances of each letter. Downweight longer alphabet. #for x in seqs: if( altype=="codonsU"): return codon_alphabetU if( altype=="codonsT"): return codon_alphabetT else: score = [1.0*asarray(seqs.tally(a)).sum()/sqrt(len(a)) for a in alphabets] #print score best = argmax(score) # Ties go to last in list. a = alphabets[best] return a class LogoData(object) : """The data needed to generate a sequence logo. - alphabet - length - counts -- An array of character counts - entropy -- The relative entropy of each column - entropy_interval -- entropy confidence interval """ def __init__(self, length=None, alphabet = None, counts =None, entropy =None, entropy_interval = None, weight=None) : """Creates a new LogoData object""" self.length = length self.alphabet = alphabet self.counts = counts self.entropy = entropy self.entropy_interval = entropy_interval self.weight = weight #@classmethod def from_counts(cls, alphabet, counts, prior= None): """Build a logodata object from counts.""" seq_length, A = counts.shape if prior is not None: prior = array(prior, float64) if prior is None : R = log(A) ent = zeros( seq_length, float64) entropy_interval = None for i in range (0, seq_length) : C = sum(counts[i]) #FIXME: fixup corebio.moremath.entropy()? if C == 0 : ent[i] = 0.0 else : ent[i] = R - entropy(counts[i]) else : ent = zeros( seq_length, float64) entropy_interval = zeros( (seq_length,2) , float64) R = log(A) for i in range (0, seq_length) : alpha = array(counts[i] , float64) alpha += prior posterior = Dirichlet(alpha) ent[i] = posterior.mean_relative_entropy(prior/sum(prior)) entropy_interval[i][0], entropy_interval[i][1] = \ posterior.interval_relative_entropy(prior/sum(prior), 0.95) weight = array( na.sum(counts,axis=1) , float) weight /= max(weight) return cls(seq_length, alphabet, counts, ent, entropy_interval, weight) from_counts = classmethod(from_counts) #@classmethod def from_seqs(cls, seqs, prior= None): alphabet=seqs.alphabet #get the offset and if it's greater than 2 swap bases to get the reverse compliment and reverse. for x in range(0,len(seqs)): seqs[x]=seqs[x].upper() counter=0 """Build a 2D array from a SeqList, a list of sequences.""" # --- VALIDATE DATA --- # check that there is at least one sequence of length 1 if len(seqs)==0 or len(seqs[0]) ==0: raise ValueError("No sequence data found.") sys.exit(0) # Check sequence lengths seq_length = len(seqs[0]) for i,s in enumerate(seqs) : #print i,s, len(s) if seq_length != len(s) : raise ArgumentError( "Sequence number %d differs in length from the previous sequences" % (i+1) ,'sequences') sys.exit(0) if(altype=="codonsT" or altype=="codonsU"): x = [[0]*64 for x in xrange(seq_length/3)] counter=offset while counter+offset<seq_length: for i in range(0,len(seqs)): if len(str(seqs[i][(counter):(counter+3)]))==3 and len(seqs[i][(counter):(counter+3)].strip("GATUC"))==0 : if(str(seqs[i][(counter):(counter+3)]) in alphabet): x[counter/3][ (alphabet.index(str(seqs[i][(counter):(counter+3)]))) ]+=1 elif show_warnings: if len(seqs[i][(counter):(counter+3)].strip("GATUC"))==1 or len(seqs[i][(counter):(counter+3)].strip("GATUC"))==2 : print >>sys.stderr, 'Warning:Incomplete or non GATUC codon detected:', seqs[i][(counter):(counter+3)] print >>sys.stderr, 'Position:',counter print >>sys.stderr, 'Sequence:',(i+1) print >>sys.stderr, 'This will be treated as ---' counter=counter+3 counts=asarray(x) else: counts = asarray(seqs.tally()) return cls.from_counts(alphabet, counts, prior) from_seqs = classmethod(from_seqs) def __str__(self) : out = StringIO() print >>out, '## LogoData' print >>out, '# First column is position number, couting from zero' print >>out, '# Subsequent columns are raw symbol counts' print >>out, '# Entropy is mean entropy measured in nats.' print >>out, '# Low and High are the 95% confidence limits.' print >>out, '# Weight is the fraction of non-gap symbols in the column.' print >>out, '#\t' print >>out, '#\t', for a in self.alphabet : print >>out, a, '\t', print >>out, 'Entropy\tLow\tHigh\tWeight' for i in range(self.length) : print >>out, i, '\t', for c in self.counts[i] : print >>out, c, '\t', print >>out, self.entropy[i], '\t', if self.entropy_interval is not None: print >>out, self.entropy_interval[i][0], '\t', print >>out, self.entropy_interval[i][1], '\t', else : print >>out, '\t','\t', if self.weight is not None : print >>out, self.weight[i], print >>out, '' print >>out, '# End LogoData' return out.getvalue() # ====================== Main: Parse Command line ============================= def main(): """CodonLogo command line interface """ # ------ Parse Command line ------ parser = _build_option_parser() (opts, args) = parser.parse_args(sys.argv[1:]) if args : parser.error("Unparsable arguments: %s " % args) if opts.serve: httpd_serve_forever(opts.port) # Never returns? sys.exit(0) # ------ Create Logo ------ try: data = _build_logodata(opts) format = _build_logoformat(data, opts) formatter = opts.formatter formatter(data, format, opts.fout) except ValueError, err : print >>sys.stderr, 'Error:', err sys.exit(2) except KeyboardInterrupt, err: sys.exit(0) # End main() def httpd_serve_forever(port=8080) : """ Start a webserver on a local port.""" import BaseHTTPServer import CGIHTTPServer class __HTTPRequestHandler(CGIHTTPServer.CGIHTTPRequestHandler): def is_cgi(self) : if self.path == "/create.cgi": self.cgi_info = '', 'create.cgi' return True return False # Add current directory to PYTHONPATH. This is # so that we can run the standalone server # without having to run the install script. pythonpath = os.getenv("PYTHONPATH", '') pythonpath += ":" + os.path.abspath(sys.path[0]).split()[0] os.environ["PYTHONPATH"] = pythonpath htdocs = resource_filename(__name__, 'htdocs', __file__) os.chdir(htdocs) HandlerClass = __HTTPRequestHandler ServerClass = BaseHTTPServer.HTTPServer httpd = ServerClass(('', port), HandlerClass) print "Serving HTTP on localhost:%d ..." % port try : httpd.serve_forever() except KeyboardInterrupt: sys.exit(0) # end httpd_serve_forever() def read_priors(finp, alphabet ,max_file_size=0): max_file_size =int(os.environ.get("WEBLOGO_MAX_FILE_SIZE", max_file_size)) if(max_file_size>0) : data = finp.read(max_file_size) more_data = finp.read(2) if more_data != "" : raise IOError("File exceeds maximum allowed size: %d bytes" % max_file_size) finp = StringIO(data) priordict={} while 1: line = finp.readline() if not line: break line = line.split() if(altype=="codonsT"): priordict[line[0].upper().replace("U", "T")]=(float(line[1])/1000)*64 else: priordict[line[0].upper().replace("T", "U")]=(float(line[1])/1000)*64 if priordict[line[0].upper().replace("U", "T")] == 0: priordict[line[0].upper().replace("U", "T")] = 0.000001 return priordict def _build_logodata(options) : global offset offset=options.frame global show_warnings show_warnings = options.strict options.alphabet = None options.ignore_lower_case = False #options.default_color = Color.by_name("black") options.color_scheme=None #options.colors=[] options.show_ends=False seqs = read_seq_data(options.fin, options.input_parser.read, alphabet=options.alphabet, ignore_lower_case = options.ignore_lower_case) if(options.priorfile!=None): if(altype=="CodonsT"): options.composition= str(read_priors(options.priorfile,codon_alphabetT)) options.alphabet = codon_alphabetT else: options.composition= str(read_priors(options.priorfile,codon_alphabetU)) options.alphabet = codon_alphabetU prior = parse_prior( options.composition,seqs.alphabet, options.weight) data = LogoData.from_seqs(seqs, prior) return data def _build_logoformat( logodata, opts) : """ Extract and process relevant option values and return a LogoFormat object.""" args = {} direct_from_opts = [ "stacks_per_line", "logo_title", "yaxis_label", "show_xaxis", "strict", "show_yaxis", "xaxis_label", "show_ends", "fineprint", "show_errorbars", "show_boxes", "yaxis_tic_interval", "resolution", "alphabet", "debug", "show_ends", "default_color", #"show_color_key", "color_scheme", "unit_name", "logo_label", "yaxis_scale", "first_index", "logo_start", "logo_end", "scale_width", "frame", ] for k in direct_from_opts: args[k] = opts.__dict__[k] logo_size = copy.copy(opts.__dict__['logo_size']) size_from_opts = ["stack_width", "stack_height"] for k in size_from_opts : length = getattr(opts, k) if length : setattr( logo_size, k, length ) args["size"] = logo_size global col if opts.colors: color_scheme = ColorScheme() for color, symbols, desc in opts.colors: try : #c = Color.from_string(color) color_scheme.groups.append( ColorGroup(symbols, color, desc) ) #print >> sys.stderr, color col.append( ColorGroup(symbols, color, desc) ) except ValueError : raise ValueError( "error: option --color: invalid value: '%s'" % color ) if(altype!="codonsU" and altype!="codonsT") : args["color_scheme"] = color_scheme #cf = colorscheme.format_color(col[0]) #col.append( " ("+group.symbols+") " + cf ) logooptions = LogoOptions() for a, v in args.iteritems() : setattr(logooptions,a,v) theformat = LogoFormat(logodata, logooptions ) return theformat # ========================== OPTIONS ========================== def _build_option_parser() : defaults = LogoOptions() parser = DeOptionParser(usage="%prog [options] < sequence_data.fa > sequence_logo.eps", description = description, version = __version__ , add_verbose_options = False ) io_grp = OptionGroup(parser, "Input/Output Options",) data_grp = OptionGroup(parser, "Logo Data Options",) format_grp = OptionGroup(parser, "Logo Format Options", "These options control the format and display of the logo.") color_grp = OptionGroup(parser, "Color Options", "Colors can be specified using CSS2 syntax. e.g. 'red', '#FF0000', etc.") advanced_grp = OptionGroup(parser, "Advanced Format Options", "These options provide fine control over the display of the logo. ") server_grp = OptionGroup(parser, "CodonLogo Server", "Run a standalone webserver on a local port.") parser.add_option_group(io_grp) parser.add_option_group(data_grp) parser.add_option_group(format_grp) parser.add_option_group(color_grp) parser.add_option_group(advanced_grp) parser.add_option_group(server_grp) # ========================== IO OPTIONS ========================== io_grp.add_option( "-f", "--fin", dest="fin", action="store", type="file_in", default=sys.stdin, help="Sequence input file (default: stdin)", metavar="FILENAME") io_grp.add_option( "-R", "--prior", dest="priorfile", action="store", type="file_in", help="A file with 64 codons and their prior probabilities, one per line, each codon followed by a space and it's probability.", metavar="FILENAME") io_grp.add_option("", "--fin-format", dest="input_parser", action="store", type ="dict", default = seq_io, choices = seq_io.format_names(), help="Multiple sequence alignment format: (%s)" % ', '.join([ f.names[0] for f in seq_io.formats]), metavar="FORMAT") io_grp.add_option("-o", "--fout", dest="fout", type="file_out", default=sys.stdout, help="Output file (default: stdout)", metavar="FILENAME") io_grp.add_option( "-F", "--format", dest="formatter", action="store", type="dict", choices = formatters, metavar= "FORMAT", help="Format of output: eps (default), png, png_print, pdf, jpeg, txt", default = default_formatter) # ========================== Data OPTIONS ========================== data_grp.add_option("-m", "--frame", dest="frame", action="store", type="int", default=0, help="Offset of the reading frame you wish to look in (default: 0) 3-6 indicate the reverse complement in which case codonlogo will read from the last symbol in the sequences backwards and replace each base with it's complement.", metavar="NUMBER") #data_grp.add_option("-T", "--type", #dest="altype", #action="store", #type="boolean", #default=True, #help="Generate a codon logo rather than a sequence logo (default: True)", #metavar="YES/NO") #data_grp.add_option( "-A", "--sequence-type", #dest="alphabet", #action="store", #type="dict", #choices = std_alphabets, #help="The type of sequence data: 'protein', 'rna' or 'dna'.", #metavar="TYPE") #data_grp.add_option( "-a", "--alphabet", #dest="alphabet", #action="store", #help="The set of symbols to count, e.g. 'AGTC'. " #"All characters not in the alphabet are ignored. " #"If neither the alphabet nor sequence-type are specified then codonlogo will examine the input data and make an educated guess. " #"See also --sequence-type, --ignore-lower-case" ) # FIXME Add test? #data_grp.add_option( "", "--ignore-lower-case", #dest="ignore_lower_case", #action="store", #type = "boolean", #default=False, #metavar = "YES/NO", #help="Disregard lower case letters and only count upper case letters in sequences? (Default: No)" #) data_grp.add_option( "-U", "--units", dest="unit_name", action="store", choices = std_units.keys(), type="choice", default = defaults.unit_name, help="A unit of entropy ('bits' (default), 'nats', 'digits'), or a unit of free energy ('kT', 'kJ/mol', 'kcal/mol'), or 'probability' for probabilities", metavar = "NUMBER") data_grp.add_option( "", "--composition", dest="composition", action="store", type="string", default = "equiprobable", help="The expected composition of the sequences: 'equiprobable'(default), 'none' (Do not perform any compositional adjustment), or 'escherichiacoli' 'homosapiens' 'saccharomycescerevisiae' for ecoli, human and SC codon frequencies.", metavar="COMP.") data_grp.add_option( "", "--weight", dest="weight", action="store", type="float", default = None, help="The weight of prior data. Default: total pseudocounts equal to the number of monomer types.", metavar="NUMBER") data_grp.add_option( "-i", "--first-index", dest="first_index", action="store", type="int", default = 1, help="Index of first position in sequence data (default: 1)", metavar="INDEX") data_grp.add_option( "-l", "--lower", dest="logo_start", action="store", type="int", help="Lower bound of sequence to display", metavar="INDEX") data_grp.add_option( "-u", "--upper", dest="logo_end", action="store", type="int", help="Upper bound of sequence to display", metavar="INDEX") data_grp.add_option( "-G", "--strict", dest="strict", action="store", type="boolean", help="Issue warnings if partial codons are encountered. Default: %default",default = defaults.strict, metavar="True/False") # ========================== FORMAT OPTIONS ========================== format_grp.add_option( "-s", "--size", dest="logo_size", action="store", type ="dict", choices = std_sizes, metavar = "LOGOSIZE", default = defaults.size, help="Specify a standard logo size (small, medium (default), large)" ) format_grp.add_option( "-n", "--stacks-per-line", dest="stacks_per_line", action="store", type="int", help="Maximum number of logo stacks per logo line. (default: %default)", default = defaults.stacks_per_line, metavar="COUNT") format_grp.add_option( "-t", "--title", dest="logo_title", action="store", type="string", help="Logo title text.", default = defaults.logo_title, metavar="TEXT") format_grp.add_option( "", "--label", dest="logo_label", action="store", type="string", help="A figure label, e.g. '2a'", default = defaults.logo_label, metavar="TEXT") format_grp.add_option( "-X", "--show-xaxis", action="store", type = "boolean", default= defaults.show_xaxis, metavar = "YES/NO", help="Display sequence numbers along x-axis? (default: %default)") format_grp.add_option( "-x", "--xlabel", dest="xaxis_label", action="store", type="string", default = defaults.xaxis_label, help="X-axis label", metavar="TEXT") format_grp.add_option( "-S", "--yaxis", dest="yaxis_scale", action="store", type="float", help="Height of yaxis in units. (Default: Maximum value with uninformative prior.)", metavar = "UNIT") format_grp.add_option( "-Y", "--show-yaxis", action="store", type = "boolean", dest = "show_yaxis", default= defaults.show_yaxis, metavar = "YES/NO", help="Display entropy scale along y-axis? (default: %default)") format_grp.add_option( "-y", "--ylabel", dest="yaxis_label", action="store", type="string", help="Y-axis label (default depends on plot type and units)", metavar="TEXT") #format_grp.add_option( "-E", "--show-ends", #action="store", #type = "boolean", #default= defaults.show_ends, #metavar = "YES/NO", #help="Label the ends of the sequence? (default: %default)") format_grp.add_option( "-P", "--fineprint", dest="fineprint", action="store", type="string", default= defaults.fineprint, help="The fine print (default: Codonlogo version)", metavar="TEXT") format_grp.add_option( "", "--ticmarks", dest="yaxis_tic_interval", action="store", type="float", default= defaults.yaxis_tic_interval, help="Distance between ticmarks (default: %default)", metavar = "NUMBER") format_grp.add_option( "", "--errorbars", dest = "show_errorbars", action="store", type = "boolean", default= defaults.show_errorbars, metavar = "YES/NO", help="Display error bars? (default: %default)") # ========================== Color OPTIONS ========================== # TODO: Future Feature # color_grp.add_option( "-K", "--color-key", # dest= "show_color_key", # action="store", # type = "boolean", # default= defaults.show_color_key, # metavar = "YES/NO", # help="Display a color key (default: %default)") #color_scheme_choices = std_color_schemes.keys() #color_scheme_choices.sort() #color_grp.add_option( "-c", "--color-scheme", #dest="color_scheme", #action="store", #type ="dict", #choices = std_color_schemes, #metavar = "SCHEME", #default = None, # Auto #help="Specify a standard color scheme (%s)" % \ #", ".join(color_scheme_choices) ) color_grp.add_option( "-C", "--color", dest="colors", action="append", metavar="COLOR SYMBOLS DESCRIPTION ", nargs = 3, default=[], help="Specify symbol colors, e.g. --color black AG 'Purine' --color red TC 'Pyrimidine' ") color_grp.add_option( "", "--default-color", dest="default_color", action="store", metavar="COLOR", default= defaults.default_color, help="Symbol color if not otherwise specified.") # ========================== Advanced options ========================= advanced_grp.add_option( "-W", "--stack-width", dest="stack_width", action="store", type="float", default= None, help="Width of a logo stack (default: %s)"% defaults.size.stack_width, metavar="POINTS" ) advanced_grp.add_option( "-H", "--stack-height", dest="stack_height", action="store", type="float", default= None, help="Height of a logo stack (default: %s)"%defaults.size.stack_height, metavar="POINTS" ) advanced_grp.add_option( "", "--box", dest="show_boxes", action="store", type = "boolean", default=False, metavar = "YES/NO", help="Draw boxes around symbols? (default: no)") advanced_grp.add_option( "", "--resolution", dest="resolution", action="store", type="float", default=96, help="Bitmap resolution in dots per inch (DPI). (default: 96 DPI, except png_print, 600 DPI) Low resolution bitmaps (DPI<300) are antialiased.", metavar="DPI") advanced_grp.add_option( "", "--scale-width", dest="scale_width", action="store", type = "boolean", default= True, metavar = "YES/NO", help="Scale the visible stack width by the fraction of symbols in the column? (i.e. columns with many gaps of unknowns are narrow.) (default: yes)") advanced_grp.add_option( "", "--debug", action="store", type = "boolean", default= defaults.debug, metavar = "YES/NO", help="Output additional diagnostic information. (default: %default)") # ========================== Server options ========================= server_grp.add_option( "", "--serve", dest="serve", action="store_true", default= False, help="Start a standalone CodonLogo server for creating sequence logos.") server_grp.add_option( "", "--port", dest="port", action="store", type="int", default= 8080, help="Listen to this local port. (Default: %default)", metavar="PORT") return parser # END _build_option_parser ############################################################## class Dirichlet(object) : """The Dirichlet probability distribution. The Dirichlet is a continuous multivariate probability distribution across non-negative unit length vectors. In other words, the Dirichlet is a probability distribution of probability distributions. It is conjugate to the multinomial distribution and is widely used in Bayesian statistics. The Dirichlet probability distribution of order K-1 is p(theta_1,...,theta_K) d theta_1 ... d theta_K = (1/Z) prod_i=1,K theta_i^{alpha_i - 1} delta(1 -sum_i=1,K theta_i) The normalization factor Z can be expressed in terms of gamma functions: Z = {prod_i=1,K Gamma(alpha_i)} / {Gamma( sum_i=1,K alpha_i)} The K constants, alpha_1,...,alpha_K, must be positive. The K parameters, theta_1,...,theta_K are nonnegative and sum to 1. Status: Alpha """ __slots__ = 'alpha', '_total', '_mean', def __init__(self, alpha) : """ Args: - alpha -- The parameters of the Dirichlet prior distribution. A vector of non-negative real numbers. """ # TODO: Check that alphas are positive #TODO : what if alpha's not one dimensional? self.alpha = asarray(alpha, float64) self._total = sum(alpha) self._mean = None def sample(self) : """Return a randomly generated probability vector. Random samples are generated by sampling K values from gamma distributions with parameters a=\alpha_i, b=1, and renormalizing. Ref: A.M. Law, W.D. Kelton, Simulation Modeling and Analysis (1991). Authors: Gavin E. Crooks <gec@compbio.berkeley.edu> (2002) """ alpha = self.alpha K = len(alpha) theta = zeros( (K,), float64) for k in range(K): theta[k] = random.gammavariate(alpha[k], 1.0) theta /= sum(theta) return theta def mean(self) : if self._mean ==None: self._mean = self.alpha / self._total return self._mean def covariance(self) : alpha = self.alpha A = sum(alpha) #A2 = A * A K = len(alpha) cv = zeros( (K,K), float64) for i in range(K) : cv[i,i] = alpha[i] * (1. - alpha[i]/A) / (A * (A+1.) ) for i in range(K) : for j in range(i+1,K) : v = - alpha[i] * alpha[j] / (A * A * (A+1.) ) cv[i,j] = v cv[j,i] = v return cv def mean_x(self, x) : x = asarray(x, float64) if shape(x) != shape(self.alpha) : raise ValueError("Argument must be same dimension as Dirichlet") return sum( x * self.mean()) def variance_x(self, x) : x = asarray(x, float64) if shape(x) != shape(self.alpha) : raise ValueError("Argument must be same dimension as Dirichlet") cv = self.covariance() var = na.dot(na.dot(na.transpose( x), cv), x) return var def mean_entropy(self) : """Calculate the average entropy of probabilities sampled from this Dirichlet distribution. Returns: The average entropy. Ref: Wolpert & Wolf, PRE 53:6841-6854 (1996) Theorem 7 (Warning: this paper contains typos.) Status: Alpha Authors: GEC 2005 """ # TODO: Optimize alpha = self.alpha A = float(sum(alpha)) ent = 0.0 for a in alpha: if a>0 : ent += - 1.0 * a * digamma( 1.0+a) # FIXME: Check ent /= A ent += digamma(A+1.0) return ent def variance_entropy(self): """Calculate the variance of the Dirichlet entropy. Ref: Wolpert & Wolf, PRE 53:6841-6854 (1996) Theorem 8 (Warning: this paper contains typos.) """ alpha = self.alpha A = float(sum(alpha)) A2 = A * (A+1) L = len(alpha) dg1 = zeros( (L) , float64) dg2 = zeros( (L) , float64) tg2 = zeros( (L) , float64) for i in range(L) : dg1[i] = digamma(alpha[i] + 1.0) dg2[i] = digamma(alpha[i] + 2.0) tg2[i] = trigamma(alpha[i] + 2.0) dg_Ap2 = digamma( A+2. ) tg_Ap2 = trigamma( A+2. ) mean = self.mean_entropy() var = 0.0 for i in range(L) : for j in range(L) : if i != j : var += ( ( dg1[i] - dg_Ap2 ) * (dg1[j] - dg_Ap2 ) - tg_Ap2 ) * (alpha[i] * alpha[j] ) / A2 else : var += ( ( dg2[i] - dg_Ap2 ) **2 + ( tg2[i] - tg_Ap2 ) ) * ( alpha[i] * (alpha[i]+1.) ) / A2 var -= mean**2 return var def mean_relative_entropy(self, pvec) : ln_p = na.log(pvec) return - self.mean_x(ln_p) - self.mean_entropy() def variance_relative_entropy(self, pvec) : ln_p = na.log(pvec) return self.variance_x(ln_p) + self.variance_entropy() def interval_relative_entropy(self, pvec, frac) : mean = self.mean_relative_entropy(pvec) variance = self.variance_relative_entropy(pvec) # If the variance is small, use the standard 95% # confidence interval: mean +/- 1.96 * sd if variance< 0.1 : sd = sqrt(variance) return max(0.0, mean - sd*1.96), mean + sd*1.96 sd = sqrt(variance) return max(0.0, mean - sd*1.96), mean + sd*1.96 g = gamma.from_mean_variance(mean, variance) low_limit = g.inverse_cdf( (1.-frac)/2.) high_limit = g.inverse_cdf( 1. - (1.-frac)/2. ) return low_limit, high_limit # Standard python voodoo for CLI if __name__ == "__main__": ## Code Profiling. Uncomment these lines #import hotshot, hotshot.stats #prof = hotshot.Profile("stones.prof") #prof.runcall(main) #prof.close() #stats = hotshot.stats.load("stones.prof") #stats.strip_dirs() #stats.sort_stats('cumulative', 'calls') #stats.print_stats(40) #sys.exit() main()