Mercurial > repos > xuebing > sharplabtool

diff sequence.py @ 14:76e1b1b21cce default tip
Deleted selected files
author: xuebing
date: Tue, 13 Mar 2012 19:05:10 -0400
parents: 292186c14b08
--- a/sequence.py	Sat Mar 10 08:17:36 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,720 +0,0 @@
-#!@WHICHPYTHON@
-
-import copy, string, sys
-
-#------------------ Alphabet -------------------
-
-class Alphabet(object):
-    """Biological alphabet class.
-    This defines the set of symbols from which various objects can be built, e.g. sequences and motifs.
-    The symbol set is immutable and accessed as a tuple.
-    symstr: symbols in alphabet as either a tuple or string
-    complement: dictionary defining letters and their complement
-    """
-    def __init__(self, symstr, complement = None):
-        """Construct an alphabet from a string or tuple of characters.
-        Lower case characters will be converted to upper case.
-        An optional mapping for complements may be provided.
-        Example:
-        >>> alpha = sequence.Alphabet('ACGTttga', {'A':'C', 'G':'T'})
-        >>> alpha.getSymbols()
-        will construct the DNA alphabet and output:
-        ('A', 'C', 'G', 'T')
-        """
-        symlst = []
-        for s in [str(sym).upper()[0] for sym in symstr]:
-            if not s in symlst:
-                symlst.append(s)
-        self.symbols = tuple(symlst)
-        if complement != None:
-            # expand the mapping and check for contradictions
-            cmap = {}
-            for s in self.symbols:
-                c = complement.get(s, None)
-                if c != None:
-                    if s in cmap and cmap[s] != c:
-                        raise RuntimeError("Alphabet complement map "
-                                "contains contradictory mapping")
-                    cmap[s] = c
-                    cmap[c] = s
-            # replace mapping with indicies
-            cimap = {}
-            for idx in range (len(self.symbols)):
-                s = self.symbols[idx]
-                if s in cmap:
-                    cimap[cmap[s]] = idx
-            # create tuple
-            cidxlst = []
-            for idx in range (len(self.symbols)):
-                cidxlst.append(cimap.get(self.symbols[idx], None))
-            self.complements = tuple(cidxlst)
-        else:
-            self.complements = None
-
-    def getSymbols(self):
-        """Retrieve a tuple with all symbols, immutable membership and order"""
-        return self.symbols
-
-    def getComplements(self):
-        """Retrieve a tuple with all complement indicies, immutable"""
-        return self.complements
-
-    def isValidSymbol(self, sym):
-        """Check if the symbol is a member of alphabet"""
-        return any([s==sym for s in self.symbols])
-
-    def getIndex(self, sym):
-        """Retrieve the index of the symbol (immutable)"""
-        for idx in range (len(self.symbols)):
-            if self.symbols[idx] == sym:
-                return idx
-        raise RuntimeError("Symbol " + sym + " does not exist in alphabet")
-
-    def isComplementable(self):
-        return self.complements != None
-
-    def getComplement(self, sym):
-        """Retrieve the complement of the symbol (immutable)"""
-        return self.symbols[self.complements[self.getIndex(sym)]];
-
-    def isValidString(self, symstr):
-        """Check if the string contains only symbols that belong to the alphabet"""
-        found = True
-        for sym in symstr:
-            if self.isValidSymbol(sym) == False:
-                return False
-        return True
-
-    def getLen(self):
-        """Retrieve the number of symbols in (the length of) the alphabet"""
-        return len(self.symbols)
-
-# pre-defined alphabets that can be specified by their name
-predefAlphabets = [
-    ("DNA"                , Alphabet('ACGT', {'A':'T', 'G':'C'})),
-    ("RNA"                , Alphabet('ACGU')),
-    ("Extended DNA"       , Alphabet('ACGTYRN')),
-    ("Protein"            , Alphabet('ACDEFGHIKLMNPQRSTVWY')),
-    ("Extended Protein"   , Alphabet('ACDEFGHIKLMNPQRSTVWYX')),
-    ("TM Labels"          , Alphabet('MIO'))
-]
-
-def getAlphabet(name):
-    """Retrieve a pre-defined alphabet by name.
-    Currently, "Protein", "DNA", "RNA", "Extended DNA", "Extended Protein" and "TM Labels" are available.
-    Example:
-    >>> alpha = sequence.getAlphabet('Protein')
-    >>> alpha.getSymbols()
-    will retrieve the 20 amino acid alphabet and output the tuple:
-    ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')
-    """
-    for (xname, xalpha) in predefAlphabets:
-        if xname == name:
-            return xalpha
-    return None
-
-#------------------ Sequence -------------------
-
-class Sequence(object):
-    """Biological sequence class. Sequence data is immutable.
-
-    data: the sequence data as a tuple or string
-    alpha: the alphabet from which symbols are taken
-    name: the sequence name, if any
-    info: can contain additional sequence information apart from the name
-    """
-    def __init__(self, sequence, alpha = None, name = "", seqinfo = ""):
-        """Create a sequence with sequence data.
-        Specifying the alphabet is optional, so is the name and info.
-        Example:
-        >>> myseq = sequence.Sequence('MVSAKKVPAIAMSFGVSF')
-        will create a sequence with name "", and assign one of the predefined alphabets on basis of what symbols were used.
-        >>> myseq.getAlphabet().getSymbols()
-        will most likely output the standard protein alphabet:
-        ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')
-        """
-        if type(sequence) is str:
-            self.data = tuple(sequence.upper())
-        elif type(sequence) is tuple:
-            self.data = sequence
-        elif type(sequence) is list:
-            self.data = tuple([s.upper() for s in sequence])
-        else:
-            raise RuntimeError("Sequence data is not specified correctly: must be string or tuple")
-        # Resolve choice of alphabet
-        validAlphabet = False
-        if alpha == None:                                   # Alphabet is not set, attempt to set it automatically...
-            for (xname, xalpha) in predefAlphabets:         # Iterate through each predefined alphabet, in order
-                if xalpha.isValidString( self.data ):        # This alphabet works, go with it
-                    self.alpha = alpha = xalpha
-                    validAlphabet = True
-                    break
-        self.name = name
-        self.info = seqinfo
-        if validAlphabet == False:            # we were either unsuccessful above or the alphabet was specified so test it
-            if type(alpha) is str:            # check if name is a predefined alphabet
-                for (xname, xalpha) in predefAlphabets:   # Iterate through each predefined alphabet, check for name
-                    if (xname == alpha):
-                        alpha = xalpha
-                        break
-            if type(alpha) is Alphabet:       # the alphabet is specified
-                if alpha.isValidString(self.data) == False:
-                    raise RuntimeError("Invalid alphabet specified: "+"".join(alpha.getSymbols())+" is not compatible with sequence '"+"".join(self.data)+"'")
-                else:
-                    self.alpha = alpha
-            else:
-                raise RuntimeError("Could not identify alphabet from sequence")
-
-    #basic getters and setters for the class
-    def getName(self):
-        """Get the name of the sequence"""
-        return self.name
-    def getInfo(self):
-        """Get additional info of the sequence (e.g. from the defline in a FASTA file)"""
-        return self.info
-    def getAlphabet(self):
-        """Retrieve the alphabet that is assigned to this sequence"""
-        return self.alpha
-    def setName(self, name):
-        """Change the name of the sequence"""
-        self.name = name
-    def setAlphabet(self, alpha):
-        """Set the alphabet, throws an exception if it is not compatible with the sequence data"""
-        if type(alpha) is Alphabet:
-            if alpha.isValid( sequence ) == False:
-                raise RuntimeError( "Invalid alphabet specified" )
-    #sequence functions
-    def getSequence(self):
-        """Retrieve the sequence data (a tuple of symbols)"""
-        return self.data
-    def getString(self):
-        """Retrieve the sequence data as a string (copy of actual data)"""
-        return "".join(self.data)
-    def getLen(self):
-        """Get the length of the sequence (number of symbols)"""
-        return len(self.data)
-    def getSite(self, position, length = 1):
-        """Retrieve a site in the sequence of desired length.
-        Note that positions go from 0 to length-1, and that if the requested site
-        extends beyond those the method throws an exception.
-        """
-        if position >= 0 and position <= self.getLen() - length:
-            if length == 1:
-                return self.data[position]
-            else:
-                return self.data[position:position+length]
-        else:
-            raise RuntimeError( "Attempt to access invalid position in sequence "+self.getName() )
-
-    def nice(self):
-        """ A short description of the sequence """
-        print self.getName(), ":", self.getLen()
-
-def readStrings(filename):
-    """ Read one or more lines of text from a file--for example an alignment.
-    Return as a list of strings.
-    filename: name of file
-    """
-    txtlist = []
-    f = open(filename)
-    for line in f.readlines():
-        txtlist.extend(line.split())
-    return txtlist
-
-def readFASTA(filename, alpha = None):
-    """ Read one or more sequences from a file in FASTA format.
-    filename: name of file to load sequences from
-    alpha: alphabet that is used (if left unspecified, an attempt is made to identify the alphabet for each individual sequence)
-    """
-    seqlist = []
-    seqname = None
-    seqinfo = None
-    seqdata = []
-    fh = open(filename)
-    thisline = fh.readline()
-    while (thisline):
-        if (thisline[0] == '>'): # new sequence
-            if (seqname):        # take care of the data that is already in the buffer before processing the new sequence
-                try:
-                    seqnew = Sequence(seqdata, alpha, seqname, seqinfo)
-                    seqlist.append(seqnew)
-                except RuntimeError, e:
-                    print >> sys.stderr, "Warning: "+seqname+" is invalid (ignored): ", e
-            seqinfo = thisline[1:-1]         # everything on the defline is "info"
-            seqname = seqinfo.split()[0]     # up to first space
-            seqdata = []
-        else:  # pull out the sequence data
-            cleanline = thisline.split()
-            for line in cleanline:
-                seqdata.extend(tuple(line.strip('*'))) # sometimes a line ends with an asterisk in FASTA files
-        thisline = fh.readline()
-
-    if (seqname):
-        try:
-            seqnew = Sequence(seqdata, alpha, seqname, seqinfo)
-            seqlist.append(seqnew)
-        except RuntimeError, e:
-            print >> sys.stderr, "Warning: " + seqname + " is invalid (ignored): ", e
-    else:
-        raise RuntimeError("No sequences on FASTA format found in this file")
-    fh.close()
-    return seqlist
-
-def _writeOneFASTA(sequence, filehandle):
-    """Write one sequence in FASTA format to an already open file"""
-    filehandle.write(">" + sequence.getName()+"\n")
-    data = sequence.getSequence()
-    lines = ( sequence.getLen() - 1) / 60 + 1
-    for i in range(lines):
-        #note: python lets us get the last line (var length) free
-        #lineofseq = data[i*60 : (i+1)*60] + "\n"
-        lineofseq = "".join(data[i*60 : (i+1)*60]) + "\n"
-        filehandle.write(lineofseq)
-
-def writeFASTA(sequence, filename):
-    """Write a list (or a single) of sequences to a file in the FASTA format"""
-    fh = open(filename, "w")
-    if isinstance(sequence, Sequence):
-        _writeOneFASTA(sequence, fh)
-    else:
-        for seq in sequence:
-            if isinstance(seq, Sequence):
-                _writeOneFASTA(seq, fh)
-            else:
-                print >> sys.stderr, "Warning: could not write " + seq.getName() + " (ignored)."
-    fh.flush()
-    fh.close()
-
-#------------------ Distrib -------------------
-
-class Distrib(object):
-    """Class for storing a multinomial probability distribution over the symbols in an alphabet"""
-    def __init__(self, alpha, pseudo_count = 0.0):
-        self.alpha = alpha
-        self.tot = pseudo_count * self.alpha.getLen()
-        self.cnt = [pseudo_count for _ in range( self.alpha.getLen() )]
-
-    def __deepcopy__(self, memo):
-        dup = Distrib(self.alpha)
-        dup.tot = copy.deepcopy(self.tot, memo)
-        dup.cnt = copy.deepcopy(self.cnt, memo)
-        return dup
-
-    def count(self, syms = None ):
-        """Count an observation of a symbol"""
-        if syms == None:
-            syms = self.alpha.getSymbols()
-        for sym in syms:
-            idx = self.alpha.getIndex( sym )
-            self.cnt[idx] += 1.0
-            self.tot += 1
-
-    def complement(self):
-        """Complement the counts, throw an error if this is impossible"""
-        if not self.alpha.isComplementable():
-            raise RuntimeError("Attempt to complement a Distrib "
-                    "based on a non-complementable alphabet.")
-        coms = self.alpha.getComplements()
-        new_count = []
-        for idx in range(len(coms)):
-            cidx = coms[idx]
-            if cidx == None:
-                cidx = idx
-            new_count.append(self.cnt[cidx])
-        self.cnt = new_count
-        return self
-
-    def reset(self):
-        """Reset the distribution, that is, restart counting."""
-        self.tot = 0
-        self.cnt = [0.0 for _ in range( self.alpha.getLen() )]
-
-    def getFreq(self, sym = None):
-        """Determine the probability distribution from the current counts.
-        The order in which probabilities are given follow the order of the symbols in the alphabet."""
-        if self.tot > 0:
-            if sym == None:
-                freq = tuple([ y / self.tot for y in self.cnt ])
-                return freq
-            else:
-                idx = self.alpha.getIndex( sym )
-                return self.cnt[idx] / self.tot
-        return None
-
-    def pretty(self):
-        """Retrieve the probabilites for all symbols and return as a pretty table (a list of text strings)"""
-        table = ["".join(["%4s " % s for s in self.alpha.getSymbols()])]
-        table.append("".join(["%3.2f " % y for y in Distrib.getFreq(self)]))
-        return table
-
-    def getSymbols(self):
-        """Get the symbols in the alphabet in the same order as probabilities are given."""
-        return self.alpha.getSymbols()
-
-    def getAlphabet(self):
-        """Get the alphabet over which the distribution is defined."""
-        return self.alpha
-
-#------------------ Motif (and subclasses) -------------------
-
-class Motif(object):
-    """ Sequence motif class--defining a pattern that can be searched in sequences.
-    This class is not intended for direct use. Instead use and develop sub-classes (see below).
-    """
-    def __init__(self, alpha):
-        self.len = 0
-        self.alpha = alpha
-
-    def getLen(self):
-        """Get the length of the motif"""
-        return self.len
-
-    def getAlphabet(self):
-        """Get the alphabet that is used in the motif"""
-        return self.alpha
-
-    def isAlphabet(self, seqstr):
-        """Check if the sequence can be processed by this motif"""
-        mystr = seqstr
-        if type(seqstr) is Sequence:
-            mystr = seqstr.getString()
-        return self.getAlphabet().isValidString(mystr)
-
-import re
-
-class RegExp(Motif):
-    """A motif class that defines the pattern in terms of a regular expression"""
-    def __init__(self, alpha, re_string):
-        Motif.__init__(self, alpha)
-        self.pattern = re.compile(re_string)
-
-    def match(self, seq):
-        """Find matches to the motif in a specified sequence.
-        The method is a generator, hence subsequent hits can be retrieved using next().
-        The returned result is a tuple (position, match-sequence, score), where score is
-        always 1.0 since a regular expression is either true or false (not returned).
-        """
-        myseq = seq
-        if not type(seq) is Sequence:
-            myseq = Sequence(seq, self.alpha)
-        mystr = myseq.getString()
-        if not Motif.isAlphabet(self, mystr):
-            raise RuntimeError("Motif alphabet is not valid for sequence " + myseq.getName())
-        for m in re.finditer(self.pattern, mystr):
-            yield (m.start(), m.group(), 1.0)
-
-import math, time
-
-# Variables used by the PWM for creating an EPS file
-_colour_def = (
-    "/black [0 0 0] def\n"
-    "/red [0.8 0 0] def\n"
-    "/green [0 0.5 0] def\n"
-    "/blue [0 0 0.8] def\n"
-    "/yellow [1 1 0] def\n"
-    "/purple [0.8 0 0.8] def\n"
-    "/magenta [1.0 0 1.0] def\n"
-    "/cyan [0 1.0 1.0] def\n"
-    "/pink [1.0 0.8 0.8] def\n"
-    "/turquoise [0.2 0.9 0.8] def\n"
-    "/orange [1 0.7 0] def\n"
-    "/lightred [0.8 0.56 0.56] def\n"
-    "/lightgreen [0.35 0.5 0.35] def\n"
-    "/lightblue [0.56 0.56 0.8] def\n"
-    "/lightyellow [1 1 0.71] def\n"
-    "/lightpurple [0.8 0.56 0.8] def\n"
-    "/lightmagenta [1.0 0.7 1.0] def\n"
-    "/lightcyan [0.7 1.0 1.0] def\n"
-    "/lightpink [1.0 0.9 0.9] def\n"
-    "/lightturquoise [0.81 0.9 0.89] def\n"
-    "/lightorange [1 0.91 0.7] def\n")
-_colour_dict = (
-    "/fullColourDict <<\n"
-    " (G)  orange\n"
-    " (T)  green\n"
-    " (C)  blue\n"
-    " (A)  red\n"
-    " (U)  green\n"
-    ">> def\n"
-    "/mutedColourDict <<\n"
-    " (G)  lightorange\n"
-    " (T)  lightgreen\n"
-    " (C)  lightblue\n"
-    " (A)  lightred\n"
-    " (U)  lightgreen\n"
-    ">> def\n"
-    "/colorDict fullColourDict def\n")
-
-_eps_defaults = {
-    'LOGOTYPE': 'NA',
-    'FONTSIZE': '12',
-    'TITLEFONTSIZE': '12',
-    'SMALLFONTSIZE': '6',
-    'TOPMARGIN': '0.9',
-    'BOTTOMMARGIN': '0.9',
-    'YAXIS': 'true',
-    'YAXISLABEL': 'bits',
-    'XAXISLABEL': '',
-    'TITLE': '',
-    'ERRORBARFRACTION': '1.0',
-    'SHOWINGBOX': 'false',
-    'BARBITS': '2.0',
-    'TICBITS': '1',
-    'COLORDEF': _colour_def,
-    'COLORDICT': _colour_dict,
-    'SHOWENDS': 'false',
-    'NUMBERING': 'true',
-    'OUTLINE': 'false',
-}
-class PWM(Motif):
-    """This motif subclass defines a pattern in terms of a position weight matrix.
-    An alphabet must be provided. A pseudo-count to be added to each count is
-    optional.  A uniform background distribution is used by default.
-    """
-    def __init__(self, alpha):
-        Motif.__init__(self, alpha)                     # set alphabet of this multinomial distribution
-        self.background = Distrib(alpha)                # the default background ...
-        self.background.count(alpha.getSymbols())       # ... is uniform
-        self.nsites = 0
-
-    def setFromAlignment(self, aligned, pseudo_count = 0.0):
-        """Set the probabilities in the PWM from an alignment.
-        The alignment is a list of equal-length strings (see readStrings), OR
-        a list of Sequence.
-        """
-        self.cols = -1
-        self.nsites = len(aligned)
-        seqs = []
-        # Below we create a list of Sequence from the alignment,
-        # while doing some error checking, and figure out the number of columns
-        for s in aligned:
-            # probably a text string, so we make a nameless sequence from it
-            if not type(s) is Sequence:
-                s=Sequence(s, Motif.getAlphabet(self))
-            else:
-            # it was a sequence, so we check that the alphabet in
-            # this motif will be able to process it
-                if not Motif.isAlphabet(self, s):
-                    raise RuntimeError("Motif alphabet is not valid for sequence " + s.getName())
-            if self.cols == -1:
-                self.cols = s.getLen()
-            elif self.cols != s.getLen():
-                raise RuntimeError("Sequences in alignment are not of equal length")
-            seqs.append(s)
-        # The line below initializes the list of Distrib (one for each column of the alignment)
-        self.counts = [Distrib(Motif.getAlphabet(self), pseudo_count) for _ in range(self.cols)]
-        # Next, we do the counting, column by column
-        for c in range( self.cols ):     # iterate through columns
-            for s in seqs:               # iterate through rows
-                # determine the index of the symbol we find at this position (row, column c)
-                self.counts[c].count(s.getSite(c))
-        # Update the length
-        self.len = self.cols
-
-    def reverseComplement(self):
-        """Reverse complement the PWM"""
-        i = 0
-        j = len(self.counts)-1
-        while (i < j):
-            temp = self.counts[i];
-            self.counts[i] = self.counts[j]
-            self.counts[j] = temp
-            self.counts[i].complement()
-            self.counts[j].complement()
-            i += 1;
-            j -= 1;
-        if i == j:
-            self.counts[i].complement()
-        return self
-
-    def getNSites(self):
-        """Get the number of sites that made the PWM"""
-        return self.nsites
-
-    def setBackground(self, distrib):
-        """Set the background distribution"""
-        if not distrib.getAlphabet() == Motif.getAlphabet(self):
-            raise RuntimeError("Incompatible alphabets")
-        self.background = distrib
-
-    def getFreq(self, col = None, sym = None):
-        """Get the probabilities for all positions in the PWM (a list of Distribs)"""
-        if (col == None):
-            return [y.getFreq() for y in self.counts]
-        else:
-            return self.counts[col].getFreq(sym)
-
-    def pretty(self):
-        """Retrieve the probabilites for all positions in the PWM as a pretty table (a list of text strings)"""
-        #table = ["".join(["%8s " % s for s in self.alpha.getSymbols()])]
-        table = []
-        for row in PWM.getFreq(self):
-            table.append("".join(["%8.6f " % y for y in row]))
-        return table
-
-    def logoddsPretty(self, bkg):
-        """Retrieve the (base-2) log-odds for all positions in the PWM as a pretty table (a list of text strings)"""
-        table = []
-        for row in PWM.getFreq(self):
-            #table.append("".join(["%8.6f " % (math.log((row[i]+1e-6)/bkg[i])/math.log(2)) for i in range(len(row))]))
-            table.append("".join(["%8.6f " % (math.log((row[i])/bkg[i])/math.log(2)) for i in range(len(row))]))
-            #table.append("".join(["%8.6f " % row[i] for i in range(len(row))]))
-        return table
-
-
-    def consensus_sequence(self):
-        """
-        Get the consensus sequence corresponding to a PWM.
-        Consensus sequence is the letter in each column
-        with the highest probability.
-        """
-        consensus = ""
-        alphabet = Motif.getAlphabet(self).getSymbols()
-        for pos in range(self.cols):
-            best_letter = alphabet[0]
-            best_p = self.counts[pos].getFreq(best_letter)
-            for letter in alphabet[1:]:
-                p = self.counts[pos].getFreq(letter)
-                if p > best_p:
-                    best_p = p
-                    best_letter = letter
-            consensus += best_letter
-        return consensus
-
-
-    def consensus(self):
-        """
-        Get the consensus corresponding to a PWM.
-        Consensus at each column of motif is a list of
-        characters with non-zero probabilities.
-        """
-        consensus = []
-        for pos in range(self.cols):
-            matches = []
-            for letter in Motif.getAlphabet(self).getSymbols():
-                p = self.counts[pos].getFreq(letter)
-                if p > 0:
-                    matches += letter
-            consensus.append(matches)
-        return consensus
-
-
-    def getScore(self, seq, start):
-        """Score this particular list of symbols using the PFM (background needs to be set separately)"""
-        sum = 0.0
-        seqdata = seq.getSequence()[start : start+self.cols]
-        for pos in range(len(seqdata)):
-            q = self.counts[pos].getFreq(seqdata[pos])
-            if q == 0:
-                q = 0.0001 # to avoid log(0) == -Infinity
-            logodds = math.log(q / self.background.getFreq(seqdata[pos]))
-            sum += logodds
-        return sum
-
-    def match(self, seq, _LOG0 = -10):
-        """Find matches to the motif in a specified sequence.
-        The method is a generator, hence subsequent hits can be retrieved using next().
-        The returned result is a tuple (position, match-sequence, score).
-        The optional parameter _LOG0 specifies a lower bound on reported logodds scores.
-        """
-        myseq = seq
-        if not type(seq) is Sequence:
-            myseq = Sequence(seq, self.alpha)
-        if not Motif.isAlphabet(self, myseq):
-            raise RuntimeError("Motif alphabet is not valid for sequence " + myseq.getName())
-        for pos in range(myseq.getLen() - self.cols):
-            score = PWM.getScore(self, myseq, pos)
-            if score > _LOG0:
-                yield (pos, "".join(myseq.getSite(pos, self.cols)), score)
-
-    def writeEPS(self, program, template_file, eps_fh, 
-            timestamp = time.localtime()):
-        """Write out a DNA motif to EPS format."""
-        small_dfmt = "%d.%m.%Y %H:%M"
-        full_dfmt = "%d.%m.%Y %H:%M:%S %Z"
-        small_date = time.strftime(small_dfmt, timestamp)
-        full_date = time.strftime(full_dfmt, timestamp)
-        points_per_cm = 72.0 / 2.54
-        height = 4.5
-        width = self.getLen() * 0.8 + 2
-        width = min(30, width)
-        points_height = int(height * points_per_cm)
-        points_width = int(width * points_per_cm)
-        defaults = _eps_defaults.copy()
-        defaults['CREATOR'] = program
-        defaults['CREATIONDATE'] = full_date
-        defaults['LOGOHEIGHT'] = str(height)
-        defaults['LOGOWIDTH'] = str(width)
-        defaults['FINEPRINT'] = program + ' ' + small_date
-        defaults['CHARSPERLINE'] = str(self.getLen())
-        defaults['BOUNDINGHEIGHT'] = str(points_height)
-        defaults['BOUNDINGWIDTH'] = str(points_width)
-        defaults['LOGOLINEHEIGHT'] = str(height)
-        with open(template_file, 'r') as template_fh:
-            m_var = re.compile("\{\$([A-Z]+)\}")
-            for line in template_fh:
-                last = 0
-                match = m_var.search(line)
-                while (match):
-                    if (last < match.start()):
-                        prev = line[last:match.start()]
-                        eps_fh.write(prev)
-                    key = match.group(1)
-                    if (key == "DATA"):
-                        eps_fh.write("\nStartLine\n")
-                        for pos in range(self.getLen()):
-                            eps_fh.write("({0:d}) startstack\n".format(pos+1))
-                            stack = []
-                            # calculate the stack information content
-                            alpha_ic = 2
-                            h = 0
-                            for sym in self.getAlphabet().getSymbols():
-                                freq = self.getFreq(pos, sym)
-                                if (freq == 0):
-                                    continue
-                                h -= (freq * math.log(freq, 2))
-                            stack_ic = alpha_ic - h
-                            # calculate the heights of each symbol
-                            for sym in self.getAlphabet().getSymbols():
-                                freq = self.getFreq(pos, sym)
-                                if (freq == 0):
-                                    continue
-                                stack.append((freq * stack_ic, sym))
-                            stack.sort();
-                            # output the symbols
-                            for symh, sym in stack:
-                                eps_fh.write(" {0:f} ({1:s}) numchar\n".format(
-                                        symh, sym))
-                            eps_fh.write("endstack\n\n")
-                        eps_fh.write("EndLine\n")
-                    elif (key in defaults):
-                        eps_fh.write(defaults[key])
-                    else:
-                        raise RuntimeError('Unknown variable "' + key + 
-                                '" in EPS template')
-                    last = match.end();
-                    match = m_var.search(line, last)
-                if (last < len(line)):
-                    eps_fh.write(line[last:])
-
-
-#------------------ Main method -------------------
-# Executed if you run this file from the operating system prompt, e.g.
-# > python sequence.py
-
-if __name__=='__main__':
-    alpha = getAlphabet('Extended DNA')
-    #seqs = readFASTA('pos.fasta')
-    seqs = []
-    aln = readStrings('tmp0')
-    #regexp = RegExp(alpha, '[AG]G.[DE]TT[AS].')
-    pwm = PWM(alpha)
-    pwm.setFromAlignment(aln)
-    for row in pwm.pretty():
-        print row
-    for s in seqs:
-        print s.getName(), s.getLen(), s.getAlphabet().getSymbols()
-        for m in regexp.match( s ):
-            print "pos: %d pat: %s %4.2f" % (m[0], m[1], m[2])
-        for m in pwm.match( s ):
-            print "pos: %d pat: %s %4.2f" % (m[0], m[1], m[2])
author	xuebing
date	Tue, 13 Mar 2012 19:05:10 -0400
parents	292186c14b08
children