Mercurial > repos > xuebing > sharplab_seq_motif

diff sequence.py @ 15:0e221dbd17b2 default tip
Uploaded
author: xuebing
date: Sat, 31 Mar 2012 08:53:06 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sequence.py	Sat Mar 31 08:53:06 2012 -0400
@@ -0,0 +1,720 @@
+#!@WHICHPYTHON@
+
+import copy, string, sys
+
+#------------------ Alphabet -------------------
+
+class Alphabet(object):
+    """Biological alphabet class.
+    This defines the set of symbols from which various objects can be built, e.g. sequences and motifs.
+    The symbol set is immutable and accessed as a tuple.
+    symstr: symbols in alphabet as either a tuple or string
+    complement: dictionary defining letters and their complement
+    """
+    def __init__(self, symstr, complement = None):
+        """Construct an alphabet from a string or tuple of characters.
+        Lower case characters will be converted to upper case.
+        An optional mapping for complements may be provided.
+        Example:
+        >>> alpha = sequence.Alphabet('ACGTttga', {'A':'C', 'G':'T'})
+        >>> alpha.getSymbols()
+        will construct the DNA alphabet and output:
+        ('A', 'C', 'G', 'T')
+        """
+        symlst = []
+        for s in [str(sym).upper()[0] for sym in symstr]:
+            if not s in symlst:
+                symlst.append(s)
+        self.symbols = tuple(symlst)
+        if complement != None:
+            # expand the mapping and check for contradictions
+            cmap = {}
+            for s in self.symbols:
+                c = complement.get(s, None)
+                if c != None:
+                    if s in cmap and cmap[s] != c:
+                        raise RuntimeError("Alphabet complement map "
+                                "contains contradictory mapping")
+                    cmap[s] = c
+                    cmap[c] = s
+            # replace mapping with indicies
+            cimap = {}
+            for idx in range (len(self.symbols)):
+                s = self.symbols[idx]
+                if s in cmap:
+                    cimap[cmap[s]] = idx
+            # create tuple
+            cidxlst = []
+            for idx in range (len(self.symbols)):
+                cidxlst.append(cimap.get(self.symbols[idx], None))
+            self.complements = tuple(cidxlst)
+        else:
+            self.complements = None
+
+    def getSymbols(self):
+        """Retrieve a tuple with all symbols, immutable membership and order"""
+        return self.symbols
+
+    def getComplements(self):
+        """Retrieve a tuple with all complement indicies, immutable"""
+        return self.complements
+
+    def isValidSymbol(self, sym):
+        """Check if the symbol is a member of alphabet"""
+        return any([s==sym for s in self.symbols])
+
+    def getIndex(self, sym):
+        """Retrieve the index of the symbol (immutable)"""
+        for idx in range (len(self.symbols)):
+            if self.symbols[idx] == sym:
+                return idx
+        raise RuntimeError("Symbol " + sym + " does not exist in alphabet")
+
+    def isComplementable(self):
+        return self.complements != None
+
+    def getComplement(self, sym):
+        """Retrieve the complement of the symbol (immutable)"""
+        return self.symbols[self.complements[self.getIndex(sym)]];
+
+    def isValidString(self, symstr):
+        """Check if the string contains only symbols that belong to the alphabet"""
+        found = True
+        for sym in symstr:
+            if self.isValidSymbol(sym) == False:
+                return False
+        return True
+
+    def getLen(self):
+        """Retrieve the number of symbols in (the length of) the alphabet"""
+        return len(self.symbols)
+
+# pre-defined alphabets that can be specified by their name
+predefAlphabets = [
+    ("DNA"                , Alphabet('ACGT', {'A':'T', 'G':'C'})),
+    ("RNA"                , Alphabet('ACGU')),
+    ("Extended DNA"       , Alphabet('ACGTYRN')),
+    ("Protein"            , Alphabet('ACDEFGHIKLMNPQRSTVWY')),
+    ("Extended Protein"   , Alphabet('ACDEFGHIKLMNPQRSTVWYX')),
+    ("TM Labels"          , Alphabet('MIO'))
+]
+
+def getAlphabet(name):
+    """Retrieve a pre-defined alphabet by name.
+    Currently, "Protein", "DNA", "RNA", "Extended DNA", "Extended Protein" and "TM Labels" are available.
+    Example:
+    >>> alpha = sequence.getAlphabet('Protein')
+    >>> alpha.getSymbols()
+    will retrieve the 20 amino acid alphabet and output the tuple:
+    ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')
+    """
+    for (xname, xalpha) in predefAlphabets:
+        if xname == name:
+            return xalpha
+    return None
+
+#------------------ Sequence -------------------
+
+class Sequence(object):
+    """Biological sequence class. Sequence data is immutable.
+
+    data: the sequence data as a tuple or string
+    alpha: the alphabet from which symbols are taken
+    name: the sequence name, if any
+    info: can contain additional sequence information apart from the name
+    """
+    def __init__(self, sequence, alpha = None, name = "", seqinfo = ""):
+        """Create a sequence with sequence data.
+        Specifying the alphabet is optional, so is the name and info.
+        Example:
+        >>> myseq = sequence.Sequence('MVSAKKVPAIAMSFGVSF')
+        will create a sequence with name "", and assign one of the predefined alphabets on basis of what symbols were used.
+        >>> myseq.getAlphabet().getSymbols()
+        will most likely output the standard protein alphabet:
+        ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')
+        """
+        if type(sequence) is str:
+            self.data = tuple(sequence.upper())
+        elif type(sequence) is tuple:
+            self.data = sequence
+        elif type(sequence) is list:
+            self.data = tuple([s.upper() for s in sequence])
+        else:
+            raise RuntimeError("Sequence data is not specified correctly: must be string or tuple")
+        # Resolve choice of alphabet
+        validAlphabet = False
+        if alpha == None:                                   # Alphabet is not set, attempt to set it automatically...
+            for (xname, xalpha) in predefAlphabets:         # Iterate through each predefined alphabet, in order
+                if xalpha.isValidString( self.data ):        # This alphabet works, go with it
+                    self.alpha = alpha = xalpha
+                    validAlphabet = True
+                    break
+        self.name = name
+        self.info = seqinfo
+        if validAlphabet == False:            # we were either unsuccessful above or the alphabet was specified so test it
+            if type(alpha) is str:            # check if name is a predefined alphabet
+                for (xname, xalpha) in predefAlphabets:   # Iterate through each predefined alphabet, check for name
+                    if (xname == alpha):
+                        alpha = xalpha
+                        break
+            if type(alpha) is Alphabet:       # the alphabet is specified
+                if alpha.isValidString(self.data) == False:
+                    raise RuntimeError("Invalid alphabet specified: "+"".join(alpha.getSymbols())+" is not compatible with sequence '"+"".join(self.data)+"'")
+                else:
+                    self.alpha = alpha
+            else:
+                raise RuntimeError("Could not identify alphabet from sequence")
+
+    #basic getters and setters for the class
+    def getName(self):
+        """Get the name of the sequence"""
+        return self.name
+    def getInfo(self):
+        """Get additional info of the sequence (e.g. from the defline in a FASTA file)"""
+        return self.info
+    def getAlphabet(self):
+        """Retrieve the alphabet that is assigned to this sequence"""
+        return self.alpha
+    def setName(self, name):
+        """Change the name of the sequence"""
+        self.name = name
+    def setAlphabet(self, alpha):
+        """Set the alphabet, throws an exception if it is not compatible with the sequence data"""
+        if type(alpha) is Alphabet:
+            if alpha.isValid( sequence ) == False:
+                raise RuntimeError( "Invalid alphabet specified" )
+    #sequence functions
+    def getSequence(self):
+        """Retrieve the sequence data (a tuple of symbols)"""
+        return self.data
+    def getString(self):
+        """Retrieve the sequence data as a string (copy of actual data)"""
+        return "".join(self.data)
+    def getLen(self):
+        """Get the length of the sequence (number of symbols)"""
+        return len(self.data)
+    def getSite(self, position, length = 1):
+        """Retrieve a site in the sequence of desired length.
+        Note that positions go from 0 to length-1, and that if the requested site
+        extends beyond those the method throws an exception.
+        """
+        if position >= 0 and position <= self.getLen() - length:
+            if length == 1:
+                return self.data[position]
+            else:
+                return self.data[position:position+length]
+        else:
+            raise RuntimeError( "Attempt to access invalid position in sequence "+self.getName() )
+
+    def nice(self):
+        """ A short description of the sequence """
+        print self.getName(), ":", self.getLen()
+
+def readStrings(filename):
+    """ Read one or more lines of text from a file--for example an alignment.
+    Return as a list of strings.
+    filename: name of file
+    """
+    txtlist = []
+    f = open(filename)
+    for line in f.readlines():
+        txtlist.extend(line.split())
+    return txtlist
+
+def readFASTA(filename, alpha = None):
+    """ Read one or more sequences from a file in FASTA format.
+    filename: name of file to load sequences from
+    alpha: alphabet that is used (if left unspecified, an attempt is made to identify the alphabet for each individual sequence)
+    """
+    seqlist = []
+    seqname = None
+    seqinfo = None
+    seqdata = []
+    fh = open(filename)
+    thisline = fh.readline()
+    while (thisline):
+        if (thisline[0] == '>'): # new sequence
+            if (seqname):        # take care of the data that is already in the buffer before processing the new sequence
+                try:
+                    seqnew = Sequence(seqdata, alpha, seqname, seqinfo)
+                    seqlist.append(seqnew)
+                except RuntimeError, e:
+                    print >> sys.stderr, "Warning: "+seqname+" is invalid (ignored): ", e
+            seqinfo = thisline[1:-1]         # everything on the defline is "info"
+            seqname = seqinfo.split()[0]     # up to first space
+            seqdata = []
+        else:  # pull out the sequence data
+            cleanline = thisline.split()
+            for line in cleanline:
+                seqdata.extend(tuple(line.strip('*'))) # sometimes a line ends with an asterisk in FASTA files
+        thisline = fh.readline()
+
+    if (seqname):
+        try:
+            seqnew = Sequence(seqdata, alpha, seqname, seqinfo)
+            seqlist.append(seqnew)
+        except RuntimeError, e:
+            print >> sys.stderr, "Warning: " + seqname + " is invalid (ignored): ", e
+    else:
+        raise RuntimeError("No sequences on FASTA format found in this file")
+    fh.close()
+    return seqlist
+
+def _writeOneFASTA(sequence, filehandle):
+    """Write one sequence in FASTA format to an already open file"""
+    filehandle.write(">" + sequence.getName()+"\n")
+    data = sequence.getSequence()
+    lines = ( sequence.getLen() - 1) / 60 + 1
+    for i in range(lines):
+        #note: python lets us get the last line (var length) free
+        #lineofseq = data[i*60 : (i+1)*60] + "\n"
+        lineofseq = "".join(data[i*60 : (i+1)*60]) + "\n"
+        filehandle.write(lineofseq)
+
+def writeFASTA(sequence, filename):
+    """Write a list (or a single) of sequences to a file in the FASTA format"""
+    fh = open(filename, "w")
+    if isinstance(sequence, Sequence):
+        _writeOneFASTA(sequence, fh)
+    else:
+        for seq in sequence:
+            if isinstance(seq, Sequence):
+                _writeOneFASTA(seq, fh)
+            else:
+                print >> sys.stderr, "Warning: could not write " + seq.getName() + " (ignored)."
+    fh.flush()
+    fh.close()
+
+#------------------ Distrib -------------------
+
+class Distrib(object):
+    """Class for storing a multinomial probability distribution over the symbols in an alphabet"""
+    def __init__(self, alpha, pseudo_count = 0.0):
+        self.alpha = alpha
+        self.tot = pseudo_count * self.alpha.getLen()
+        self.cnt = [pseudo_count for _ in range( self.alpha.getLen() )]
+
+    def __deepcopy__(self, memo):
+        dup = Distrib(self.alpha)
+        dup.tot = copy.deepcopy(self.tot, memo)
+        dup.cnt = copy.deepcopy(self.cnt, memo)
+        return dup
+
+    def count(self, syms = None ):
+        """Count an observation of a symbol"""
+        if syms == None:
+            syms = self.alpha.getSymbols()
+        for sym in syms:
+            idx = self.alpha.getIndex( sym )
+            self.cnt[idx] += 1.0
+            self.tot += 1
+
+    def complement(self):
+        """Complement the counts, throw an error if this is impossible"""
+        if not self.alpha.isComplementable():
+            raise RuntimeError("Attempt to complement a Distrib "
+                    "based on a non-complementable alphabet.")
+        coms = self.alpha.getComplements()
+        new_count = []
+        for idx in range(len(coms)):
+            cidx = coms[idx]
+            if cidx == None:
+                cidx = idx
+            new_count.append(self.cnt[cidx])
+        self.cnt = new_count
+        return self
+
+    def reset(self):
+        """Reset the distribution, that is, restart counting."""
+        self.tot = 0
+        self.cnt = [0.0 for _ in range( self.alpha.getLen() )]
+
+    def getFreq(self, sym = None):
+        """Determine the probability distribution from the current counts.
+        The order in which probabilities are given follow the order of the symbols in the alphabet."""
+        if self.tot > 0:
+            if sym == None:
+                freq = tuple([ y / self.tot for y in self.cnt ])
+                return freq
+            else:
+                idx = self.alpha.getIndex( sym )
+                return self.cnt[idx] / self.tot
+        return None
+
+    def pretty(self):
+        """Retrieve the probabilites for all symbols and return as a pretty table (a list of text strings)"""
+        table = ["".join(["%4s " % s for s in self.alpha.getSymbols()])]
+        table.append("".join(["%3.2f " % y for y in Distrib.getFreq(self)]))
+        return table
+
+    def getSymbols(self):
+        """Get the symbols in the alphabet in the same order as probabilities are given."""
+        return self.alpha.getSymbols()
+
+    def getAlphabet(self):
+        """Get the alphabet over which the distribution is defined."""
+        return self.alpha
+
+#------------------ Motif (and subclasses) -------------------
+
+class Motif(object):
+    """ Sequence motif class--defining a pattern that can be searched in sequences.
+    This class is not intended for direct use. Instead use and develop sub-classes (see below).
+    """
+    def __init__(self, alpha):
+        self.len = 0
+        self.alpha = alpha
+
+    def getLen(self):
+        """Get the length of the motif"""
+        return self.len
+
+    def getAlphabet(self):
+        """Get the alphabet that is used in the motif"""
+        return self.alpha
+
+    def isAlphabet(self, seqstr):
+        """Check if the sequence can be processed by this motif"""
+        mystr = seqstr
+        if type(seqstr) is Sequence:
+            mystr = seqstr.getString()
+        return self.getAlphabet().isValidString(mystr)
+
+import re
+
+class RegExp(Motif):
+    """A motif class that defines the pattern in terms of a regular expression"""
+    def __init__(self, alpha, re_string):
+        Motif.__init__(self, alpha)
+        self.pattern = re.compile(re_string)
+
+    def match(self, seq):
+        """Find matches to the motif in a specified sequence.
+        The method is a generator, hence subsequent hits can be retrieved using next().
+        The returned result is a tuple (position, match-sequence, score), where score is
+        always 1.0 since a regular expression is either true or false (not returned).
+        """
+        myseq = seq
+        if not type(seq) is Sequence:
+            myseq = Sequence(seq, self.alpha)
+        mystr = myseq.getString()
+        if not Motif.isAlphabet(self, mystr):
+            raise RuntimeError("Motif alphabet is not valid for sequence " + myseq.getName())
+        for m in re.finditer(self.pattern, mystr):
+            yield (m.start(), m.group(), 1.0)
+
+import math, time
+
+# Variables used by the PWM for creating an EPS file
+_colour_def = (
+    "/black [0 0 0] def\n"
+    "/red [0.8 0 0] def\n"
+    "/green [0 0.5 0] def\n"
+    "/blue [0 0 0.8] def\n"
+    "/yellow [1 1 0] def\n"
+    "/purple [0.8 0 0.8] def\n"
+    "/magenta [1.0 0 1.0] def\n"
+    "/cyan [0 1.0 1.0] def\n"
+    "/pink [1.0 0.8 0.8] def\n"
+    "/turquoise [0.2 0.9 0.8] def\n"
+    "/orange [1 0.7 0] def\n"
+    "/lightred [0.8 0.56 0.56] def\n"
+    "/lightgreen [0.35 0.5 0.35] def\n"
+    "/lightblue [0.56 0.56 0.8] def\n"
+    "/lightyellow [1 1 0.71] def\n"
+    "/lightpurple [0.8 0.56 0.8] def\n"
+    "/lightmagenta [1.0 0.7 1.0] def\n"
+    "/lightcyan [0.7 1.0 1.0] def\n"
+    "/lightpink [1.0 0.9 0.9] def\n"
+    "/lightturquoise [0.81 0.9 0.89] def\n"
+    "/lightorange [1 0.91 0.7] def\n")
+_colour_dict = (
+    "/fullColourDict <<\n"
+    " (G)  orange\n"
+    " (T)  green\n"
+    " (C)  blue\n"
+    " (A)  red\n"
+    " (U)  green\n"
+    ">> def\n"
+    "/mutedColourDict <<\n"
+    " (G)  lightorange\n"
+    " (T)  lightgreen\n"
+    " (C)  lightblue\n"
+    " (A)  lightred\n"
+    " (U)  lightgreen\n"
+    ">> def\n"
+    "/colorDict fullColourDict def\n")
+
+_eps_defaults = {
+    'LOGOTYPE': 'NA',
+    'FONTSIZE': '12',
+    'TITLEFONTSIZE': '12',
+    'SMALLFONTSIZE': '6',
+    'TOPMARGIN': '0.9',
+    'BOTTOMMARGIN': '0.9',
+    'YAXIS': 'true',
+    'YAXISLABEL': 'bits',
+    'XAXISLABEL': '',
+    'TITLE': '',
+    'ERRORBARFRACTION': '1.0',
+    'SHOWINGBOX': 'false',
+    'BARBITS': '2.0',
+    'TICBITS': '1',
+    'COLORDEF': _colour_def,
+    'COLORDICT': _colour_dict,
+    'SHOWENDS': 'false',
+    'NUMBERING': 'true',
+    'OUTLINE': 'false',
+}
+class PWM(Motif):
+    """This motif subclass defines a pattern in terms of a position weight matrix.
+    An alphabet must be provided. A pseudo-count to be added to each count is
+    optional.  A uniform background distribution is used by default.
+    """
+    def __init__(self, alpha):
+        Motif.__init__(self, alpha)                     # set alphabet of this multinomial distribution
+        self.background = Distrib(alpha)                # the default background ...
+        self.background.count(alpha.getSymbols())       # ... is uniform
+        self.nsites = 0
+
+    def setFromAlignment(self, aligned, pseudo_count = 0.0):
+        """Set the probabilities in the PWM from an alignment.
+        The alignment is a list of equal-length strings (see readStrings), OR
+        a list of Sequence.
+        """
+        self.cols = -1
+        self.nsites = len(aligned)
+        seqs = []
+        # Below we create a list of Sequence from the alignment,
+        # while doing some error checking, and figure out the number of columns
+        for s in aligned:
+            # probably a text string, so we make a nameless sequence from it
+            if not type(s) is Sequence:
+                s=Sequence(s, Motif.getAlphabet(self))
+            else:
+            # it was a sequence, so we check that the alphabet in
+            # this motif will be able to process it
+                if not Motif.isAlphabet(self, s):
+                    raise RuntimeError("Motif alphabet is not valid for sequence " + s.getName())
+            if self.cols == -1:
+                self.cols = s.getLen()
+            elif self.cols != s.getLen():
+                raise RuntimeError("Sequences in alignment are not of equal length")
+            seqs.append(s)
+        # The line below initializes the list of Distrib (one for each column of the alignment)
+        self.counts = [Distrib(Motif.getAlphabet(self), pseudo_count) for _ in range(self.cols)]
+        # Next, we do the counting, column by column
+        for c in range( self.cols ):     # iterate through columns
+            for s in seqs:               # iterate through rows
+                # determine the index of the symbol we find at this position (row, column c)
+                self.counts[c].count(s.getSite(c))
+        # Update the length
+        self.len = self.cols
+
+    def reverseComplement(self):
+        """Reverse complement the PWM"""
+        i = 0
+        j = len(self.counts)-1
+        while (i < j):
+            temp = self.counts[i];
+            self.counts[i] = self.counts[j]
+            self.counts[j] = temp
+            self.counts[i].complement()
+            self.counts[j].complement()
+            i += 1;
+            j -= 1;
+        if i == j:
+            self.counts[i].complement()
+        return self
+
+    def getNSites(self):
+        """Get the number of sites that made the PWM"""
+        return self.nsites
+
+    def setBackground(self, distrib):
+        """Set the background distribution"""
+        if not distrib.getAlphabet() == Motif.getAlphabet(self):
+            raise RuntimeError("Incompatible alphabets")
+        self.background = distrib
+
+    def getFreq(self, col = None, sym = None):
+        """Get the probabilities for all positions in the PWM (a list of Distribs)"""
+        if (col == None):
+            return [y.getFreq() for y in self.counts]
+        else:
+            return self.counts[col].getFreq(sym)
+
+    def pretty(self):
+        """Retrieve the probabilites for all positions in the PWM as a pretty table (a list of text strings)"""
+        #table = ["".join(["%8s " % s for s in self.alpha.getSymbols()])]
+        table = []
+        for row in PWM.getFreq(self):
+            table.append("".join(["%8.6f " % y for y in row]))
+        return table
+
+    def logoddsPretty(self, bkg):
+        """Retrieve the (base-2) log-odds for all positions in the PWM as a pretty table (a list of text strings)"""
+        table = []
+        for row in PWM.getFreq(self):
+            #table.append("".join(["%8.6f " % (math.log((row[i]+1e-6)/bkg[i])/math.log(2)) for i in range(len(row))]))
+            table.append("".join(["%8.6f " % (math.log((row[i])/bkg[i])/math.log(2)) for i in range(len(row))]))
+            #table.append("".join(["%8.6f " % row[i] for i in range(len(row))]))
+        return table
+
+
+    def consensus_sequence(self):
+        """
+        Get the consensus sequence corresponding to a PWM.
+        Consensus sequence is the letter in each column
+        with the highest probability.
+        """
+        consensus = ""
+        alphabet = Motif.getAlphabet(self).getSymbols()
+        for pos in range(self.cols):
+            best_letter = alphabet[0]
+            best_p = self.counts[pos].getFreq(best_letter)
+            for letter in alphabet[1:]:
+                p = self.counts[pos].getFreq(letter)
+                if p > best_p:
+                    best_p = p
+                    best_letter = letter
+            consensus += best_letter
+        return consensus
+
+
+    def consensus(self):
+        """
+        Get the consensus corresponding to a PWM.
+        Consensus at each column of motif is a list of
+        characters with non-zero probabilities.
+        """
+        consensus = []
+        for pos in range(self.cols):
+            matches = []
+            for letter in Motif.getAlphabet(self).getSymbols():
+                p = self.counts[pos].getFreq(letter)
+                if p > 0:
+                    matches += letter
+            consensus.append(matches)
+        return consensus
+
+
+    def getScore(self, seq, start):
+        """Score this particular list of symbols using the PFM (background needs to be set separately)"""
+        sum = 0.0
+        seqdata = seq.getSequence()[start : start+self.cols]
+        for pos in range(len(seqdata)):
+            q = self.counts[pos].getFreq(seqdata[pos])
+            if q == 0:
+                q = 0.0001 # to avoid log(0) == -Infinity
+            logodds = math.log(q / self.background.getFreq(seqdata[pos]))
+            sum += logodds
+        return sum
+
+    def match(self, seq, _LOG0 = -10):
+        """Find matches to the motif in a specified sequence.
+        The method is a generator, hence subsequent hits can be retrieved using next().
+        The returned result is a tuple (position, match-sequence, score).
+        The optional parameter _LOG0 specifies a lower bound on reported logodds scores.
+        """
+        myseq = seq
+        if not type(seq) is Sequence:
+            myseq = Sequence(seq, self.alpha)
+        if not Motif.isAlphabet(self, myseq):
+            raise RuntimeError("Motif alphabet is not valid for sequence " + myseq.getName())
+        for pos in range(myseq.getLen() - self.cols):
+            score = PWM.getScore(self, myseq, pos)
+            if score > _LOG0:
+                yield (pos, "".join(myseq.getSite(pos, self.cols)), score)
+
+    def writeEPS(self, program, template_file, eps_fh, 
+            timestamp = time.localtime()):
+        """Write out a DNA motif to EPS format."""
+        small_dfmt = "%d.%m.%Y %H:%M"
+        full_dfmt = "%d.%m.%Y %H:%M:%S %Z"
+        small_date = time.strftime(small_dfmt, timestamp)
+        full_date = time.strftime(full_dfmt, timestamp)
+        points_per_cm = 72.0 / 2.54
+        height = 4.5
+        width = self.getLen() * 0.8 + 2
+        width = min(30, width)
+        points_height = int(height * points_per_cm)
+        points_width = int(width * points_per_cm)
+        defaults = _eps_defaults.copy()
+        defaults['CREATOR'] = program
+        defaults['CREATIONDATE'] = full_date
+        defaults['LOGOHEIGHT'] = str(height)
+        defaults['LOGOWIDTH'] = str(width)
+        defaults['FINEPRINT'] = program + ' ' + small_date
+        defaults['CHARSPERLINE'] = str(self.getLen())
+        defaults['BOUNDINGHEIGHT'] = str(points_height)
+        defaults['BOUNDINGWIDTH'] = str(points_width)
+        defaults['LOGOLINEHEIGHT'] = str(height)
+        with open(template_file, 'r') as template_fh:
+            m_var = re.compile("\{\$([A-Z]+)\}")
+            for line in template_fh:
+                last = 0
+                match = m_var.search(line)
+                while (match):
+                    if (last < match.start()):
+                        prev = line[last:match.start()]
+                        eps_fh.write(prev)
+                    key = match.group(1)
+                    if (key == "DATA"):
+                        eps_fh.write("\nStartLine\n")
+                        for pos in range(self.getLen()):
+                            eps_fh.write("({0:d}) startstack\n".format(pos+1))
+                            stack = []
+                            # calculate the stack information content
+                            alpha_ic = 2
+                            h = 0
+                            for sym in self.getAlphabet().getSymbols():
+                                freq = self.getFreq(pos, sym)
+                                if (freq == 0):
+                                    continue
+                                h -= (freq * math.log(freq, 2))
+                            stack_ic = alpha_ic - h
+                            # calculate the heights of each symbol
+                            for sym in self.getAlphabet().getSymbols():
+                                freq = self.getFreq(pos, sym)
+                                if (freq == 0):
+                                    continue
+                                stack.append((freq * stack_ic, sym))
+                            stack.sort();
+                            # output the symbols
+                            for symh, sym in stack:
+                                eps_fh.write(" {0:f} ({1:s}) numchar\n".format(
+                                        symh, sym))
+                            eps_fh.write("endstack\n\n")
+                        eps_fh.write("EndLine\n")
+                    elif (key in defaults):
+                        eps_fh.write(defaults[key])
+                    else:
+                        raise RuntimeError('Unknown variable "' + key + 
+                                '" in EPS template')
+                    last = match.end();
+                    match = m_var.search(line, last)
+                if (last < len(line)):
+                    eps_fh.write(line[last:])
+
+
+#------------------ Main method -------------------
+# Executed if you run this file from the operating system prompt, e.g.
+# > python sequence.py
+
+if __name__=='__main__':
+    alpha = getAlphabet('Extended DNA')
+    #seqs = readFASTA('pos.fasta')
+    seqs = []
+    aln = readStrings('tmp0')
+    #regexp = RegExp(alpha, '[AG]G.[DE]TT[AS].')
+    pwm = PWM(alpha)
+    pwm.setFromAlignment(aln)
+    for row in pwm.pretty():
+        print row
+    for s in seqs:
+        print s.getName(), s.getLen(), s.getAlphabet().getSymbols()
+        for m in regexp.match( s ):
+            print "pos: %d pat: %s %4.2f" % (m[0], m[1], m[2])
+        for m in pwm.match( s ):
+            print "pos: %d pat: %s %4.2f" % (m[0], m[1], m[2])
author	xuebing
date	Sat, 31 Mar 2012 08:53:06 -0400
parents
children