Mercurial > repos > xuebing > sharplab_seq_motif
view mytools/sequence.py @ 9:8cec2078632a
Uploaded
author | xuebing |
---|---|
date | Fri, 16 Mar 2012 14:15:10 -0400 |
parents | 39217fa39ff2 |
children |
line wrap: on
line source
#!@WHICHPYTHON@ import copy, string, sys #------------------ Alphabet ------------------- class Alphabet(object): """Biological alphabet class. This defines the set of symbols from which various objects can be built, e.g. sequences and motifs. The symbol set is immutable and accessed as a tuple. symstr: symbols in alphabet as either a tuple or string complement: dictionary defining letters and their complement """ def __init__(self, symstr, complement = None): """Construct an alphabet from a string or tuple of characters. Lower case characters will be converted to upper case. An optional mapping for complements may be provided. Example: >>> alpha = sequence.Alphabet('ACGTttga', {'A':'C', 'G':'T'}) >>> alpha.getSymbols() will construct the DNA alphabet and output: ('A', 'C', 'G', 'T') """ symlst = [] for s in [str(sym).upper()[0] for sym in symstr]: if not s in symlst: symlst.append(s) self.symbols = tuple(symlst) if complement != None: # expand the mapping and check for contradictions cmap = {} for s in self.symbols: c = complement.get(s, None) if c != None: if s in cmap and cmap[s] != c: raise RuntimeError("Alphabet complement map " "contains contradictory mapping") cmap[s] = c cmap[c] = s # replace mapping with indicies cimap = {} for idx in range (len(self.symbols)): s = self.symbols[idx] if s in cmap: cimap[cmap[s]] = idx # create tuple cidxlst = [] for idx in range (len(self.symbols)): cidxlst.append(cimap.get(self.symbols[idx], None)) self.complements = tuple(cidxlst) else: self.complements = None def getSymbols(self): """Retrieve a tuple with all symbols, immutable membership and order""" return self.symbols def getComplements(self): """Retrieve a tuple with all complement indicies, immutable""" return self.complements def isValidSymbol(self, sym): """Check if the symbol is a member of alphabet""" return any([s==sym for s in self.symbols]) def getIndex(self, sym): """Retrieve the index of the symbol (immutable)""" for idx in range (len(self.symbols)): if self.symbols[idx] == sym: return idx raise RuntimeError("Symbol " + sym + " does not exist in alphabet") def isComplementable(self): return self.complements != None def getComplement(self, sym): """Retrieve the complement of the symbol (immutable)""" return self.symbols[self.complements[self.getIndex(sym)]]; def isValidString(self, symstr): """Check if the string contains only symbols that belong to the alphabet""" found = True for sym in symstr: if self.isValidSymbol(sym) == False: return False return True def getLen(self): """Retrieve the number of symbols in (the length of) the alphabet""" return len(self.symbols) # pre-defined alphabets that can be specified by their name predefAlphabets = [ ("DNA" , Alphabet('ACGT', {'A':'T', 'G':'C'})), ("RNA" , Alphabet('ACGU')), ("Extended DNA" , Alphabet('ACGTYRN')), ("Protein" , Alphabet('ACDEFGHIKLMNPQRSTVWY')), ("Extended Protein" , Alphabet('ACDEFGHIKLMNPQRSTVWYX')), ("TM Labels" , Alphabet('MIO')) ] def getAlphabet(name): """Retrieve a pre-defined alphabet by name. Currently, "Protein", "DNA", "RNA", "Extended DNA", "Extended Protein" and "TM Labels" are available. Example: >>> alpha = sequence.getAlphabet('Protein') >>> alpha.getSymbols() will retrieve the 20 amino acid alphabet and output the tuple: ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y') """ for (xname, xalpha) in predefAlphabets: if xname == name: return xalpha return None #------------------ Sequence ------------------- class Sequence(object): """Biological sequence class. Sequence data is immutable. data: the sequence data as a tuple or string alpha: the alphabet from which symbols are taken name: the sequence name, if any info: can contain additional sequence information apart from the name """ def __init__(self, sequence, alpha = None, name = "", seqinfo = ""): """Create a sequence with sequence data. Specifying the alphabet is optional, so is the name and info. Example: >>> myseq = sequence.Sequence('MVSAKKVPAIAMSFGVSF') will create a sequence with name "", and assign one of the predefined alphabets on basis of what symbols were used. >>> myseq.getAlphabet().getSymbols() will most likely output the standard protein alphabet: ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y') """ if type(sequence) is str: self.data = tuple(sequence.upper()) elif type(sequence) is tuple: self.data = sequence elif type(sequence) is list: self.data = tuple([s.upper() for s in sequence]) else: raise RuntimeError("Sequence data is not specified correctly: must be string or tuple") # Resolve choice of alphabet validAlphabet = False if alpha == None: # Alphabet is not set, attempt to set it automatically... for (xname, xalpha) in predefAlphabets: # Iterate through each predefined alphabet, in order if xalpha.isValidString( self.data ): # This alphabet works, go with it self.alpha = alpha = xalpha validAlphabet = True break self.name = name self.info = seqinfo if validAlphabet == False: # we were either unsuccessful above or the alphabet was specified so test it if type(alpha) is str: # check if name is a predefined alphabet for (xname, xalpha) in predefAlphabets: # Iterate through each predefined alphabet, check for name if (xname == alpha): alpha = xalpha break if type(alpha) is Alphabet: # the alphabet is specified if alpha.isValidString(self.data) == False: raise RuntimeError("Invalid alphabet specified: "+"".join(alpha.getSymbols())+" is not compatible with sequence '"+"".join(self.data)+"'") else: self.alpha = alpha else: raise RuntimeError("Could not identify alphabet from sequence") #basic getters and setters for the class def getName(self): """Get the name of the sequence""" return self.name def getInfo(self): """Get additional info of the sequence (e.g. from the defline in a FASTA file)""" return self.info def getAlphabet(self): """Retrieve the alphabet that is assigned to this sequence""" return self.alpha def setName(self, name): """Change the name of the sequence""" self.name = name def setAlphabet(self, alpha): """Set the alphabet, throws an exception if it is not compatible with the sequence data""" if type(alpha) is Alphabet: if alpha.isValid( sequence ) == False: raise RuntimeError( "Invalid alphabet specified" ) #sequence functions def getSequence(self): """Retrieve the sequence data (a tuple of symbols)""" return self.data def getString(self): """Retrieve the sequence data as a string (copy of actual data)""" return "".join(self.data) def getLen(self): """Get the length of the sequence (number of symbols)""" return len(self.data) def getSite(self, position, length = 1): """Retrieve a site in the sequence of desired length. Note that positions go from 0 to length-1, and that if the requested site extends beyond those the method throws an exception. """ if position >= 0 and position <= self.getLen() - length: if length == 1: return self.data[position] else: return self.data[position:position+length] else: raise RuntimeError( "Attempt to access invalid position in sequence "+self.getName() ) def nice(self): """ A short description of the sequence """ print self.getName(), ":", self.getLen() def readStrings(filename): """ Read one or more lines of text from a file--for example an alignment. Return as a list of strings. filename: name of file """ txtlist = [] f = open(filename) for line in f.readlines(): txtlist.extend(line.split()) return txtlist def readFASTA(filename, alpha = None): """ Read one or more sequences from a file in FASTA format. filename: name of file to load sequences from alpha: alphabet that is used (if left unspecified, an attempt is made to identify the alphabet for each individual sequence) """ seqlist = [] seqname = None seqinfo = None seqdata = [] fh = open(filename) thisline = fh.readline() while (thisline): if (thisline[0] == '>'): # new sequence if (seqname): # take care of the data that is already in the buffer before processing the new sequence try: seqnew = Sequence(seqdata, alpha, seqname, seqinfo) seqlist.append(seqnew) except RuntimeError, e: print >> sys.stderr, "Warning: "+seqname+" is invalid (ignored): ", e seqinfo = thisline[1:-1] # everything on the defline is "info" seqname = seqinfo.split()[0] # up to first space seqdata = [] else: # pull out the sequence data cleanline = thisline.split() for line in cleanline: seqdata.extend(tuple(line.strip('*'))) # sometimes a line ends with an asterisk in FASTA files thisline = fh.readline() if (seqname): try: seqnew = Sequence(seqdata, alpha, seqname, seqinfo) seqlist.append(seqnew) except RuntimeError, e: print >> sys.stderr, "Warning: " + seqname + " is invalid (ignored): ", e else: raise RuntimeError("No sequences on FASTA format found in this file") fh.close() return seqlist def _writeOneFASTA(sequence, filehandle): """Write one sequence in FASTA format to an already open file""" filehandle.write(">" + sequence.getName()+"\n") data = sequence.getSequence() lines = ( sequence.getLen() - 1) / 60 + 1 for i in range(lines): #note: python lets us get the last line (var length) free #lineofseq = data[i*60 : (i+1)*60] + "\n" lineofseq = "".join(data[i*60 : (i+1)*60]) + "\n" filehandle.write(lineofseq) def writeFASTA(sequence, filename): """Write a list (or a single) of sequences to a file in the FASTA format""" fh = open(filename, "w") if isinstance(sequence, Sequence): _writeOneFASTA(sequence, fh) else: for seq in sequence: if isinstance(seq, Sequence): _writeOneFASTA(seq, fh) else: print >> sys.stderr, "Warning: could not write " + seq.getName() + " (ignored)." fh.flush() fh.close() #------------------ Distrib ------------------- class Distrib(object): """Class for storing a multinomial probability distribution over the symbols in an alphabet""" def __init__(self, alpha, pseudo_count = 0.0): self.alpha = alpha self.tot = pseudo_count * self.alpha.getLen() self.cnt = [pseudo_count for _ in range( self.alpha.getLen() )] def __deepcopy__(self, memo): dup = Distrib(self.alpha) dup.tot = copy.deepcopy(self.tot, memo) dup.cnt = copy.deepcopy(self.cnt, memo) return dup def count(self, syms = None ): """Count an observation of a symbol""" if syms == None: syms = self.alpha.getSymbols() for sym in syms: idx = self.alpha.getIndex( sym ) self.cnt[idx] += 1.0 self.tot += 1 def complement(self): """Complement the counts, throw an error if this is impossible""" if not self.alpha.isComplementable(): raise RuntimeError("Attempt to complement a Distrib " "based on a non-complementable alphabet.") coms = self.alpha.getComplements() new_count = [] for idx in range(len(coms)): cidx = coms[idx] if cidx == None: cidx = idx new_count.append(self.cnt[cidx]) self.cnt = new_count return self def reset(self): """Reset the distribution, that is, restart counting.""" self.tot = 0 self.cnt = [0.0 for _ in range( self.alpha.getLen() )] def getFreq(self, sym = None): """Determine the probability distribution from the current counts. The order in which probabilities are given follow the order of the symbols in the alphabet.""" if self.tot > 0: if sym == None: freq = tuple([ y / self.tot for y in self.cnt ]) return freq else: idx = self.alpha.getIndex( sym ) return self.cnt[idx] / self.tot return None def pretty(self): """Retrieve the probabilites for all symbols and return as a pretty table (a list of text strings)""" table = ["".join(["%4s " % s for s in self.alpha.getSymbols()])] table.append("".join(["%3.2f " % y for y in Distrib.getFreq(self)])) return table def getSymbols(self): """Get the symbols in the alphabet in the same order as probabilities are given.""" return self.alpha.getSymbols() def getAlphabet(self): """Get the alphabet over which the distribution is defined.""" return self.alpha #------------------ Motif (and subclasses) ------------------- class Motif(object): """ Sequence motif class--defining a pattern that can be searched in sequences. This class is not intended for direct use. Instead use and develop sub-classes (see below). """ def __init__(self, alpha): self.len = 0 self.alpha = alpha def getLen(self): """Get the length of the motif""" return self.len def getAlphabet(self): """Get the alphabet that is used in the motif""" return self.alpha def isAlphabet(self, seqstr): """Check if the sequence can be processed by this motif""" mystr = seqstr if type(seqstr) is Sequence: mystr = seqstr.getString() return self.getAlphabet().isValidString(mystr) import re class RegExp(Motif): """A motif class that defines the pattern in terms of a regular expression""" def __init__(self, alpha, re_string): Motif.__init__(self, alpha) self.pattern = re.compile(re_string) def match(self, seq): """Find matches to the motif in a specified sequence. The method is a generator, hence subsequent hits can be retrieved using next(). The returned result is a tuple (position, match-sequence, score), where score is always 1.0 since a regular expression is either true or false (not returned). """ myseq = seq if not type(seq) is Sequence: myseq = Sequence(seq, self.alpha) mystr = myseq.getString() if not Motif.isAlphabet(self, mystr): raise RuntimeError("Motif alphabet is not valid for sequence " + myseq.getName()) for m in re.finditer(self.pattern, mystr): yield (m.start(), m.group(), 1.0) import math, time # Variables used by the PWM for creating an EPS file _colour_def = ( "/black [0 0 0] def\n" "/red [0.8 0 0] def\n" "/green [0 0.5 0] def\n" "/blue [0 0 0.8] def\n" "/yellow [1 1 0] def\n" "/purple [0.8 0 0.8] def\n" "/magenta [1.0 0 1.0] def\n" "/cyan [0 1.0 1.0] def\n" "/pink [1.0 0.8 0.8] def\n" "/turquoise [0.2 0.9 0.8] def\n" "/orange [1 0.7 0] def\n" "/lightred [0.8 0.56 0.56] def\n" "/lightgreen [0.35 0.5 0.35] def\n" "/lightblue [0.56 0.56 0.8] def\n" "/lightyellow [1 1 0.71] def\n" "/lightpurple [0.8 0.56 0.8] def\n" "/lightmagenta [1.0 0.7 1.0] def\n" "/lightcyan [0.7 1.0 1.0] def\n" "/lightpink [1.0 0.9 0.9] def\n" "/lightturquoise [0.81 0.9 0.89] def\n" "/lightorange [1 0.91 0.7] def\n") _colour_dict = ( "/fullColourDict <<\n" " (G) orange\n" " (T) green\n" " (C) blue\n" " (A) red\n" " (U) green\n" ">> def\n" "/mutedColourDict <<\n" " (G) lightorange\n" " (T) lightgreen\n" " (C) lightblue\n" " (A) lightred\n" " (U) lightgreen\n" ">> def\n" "/colorDict fullColourDict def\n") _eps_defaults = { 'LOGOTYPE': 'NA', 'FONTSIZE': '12', 'TITLEFONTSIZE': '12', 'SMALLFONTSIZE': '6', 'TOPMARGIN': '0.9', 'BOTTOMMARGIN': '0.9', 'YAXIS': 'true', 'YAXISLABEL': 'bits', 'XAXISLABEL': '', 'TITLE': '', 'ERRORBARFRACTION': '1.0', 'SHOWINGBOX': 'false', 'BARBITS': '2.0', 'TICBITS': '1', 'COLORDEF': _colour_def, 'COLORDICT': _colour_dict, 'SHOWENDS': 'false', 'NUMBERING': 'true', 'OUTLINE': 'false', } class PWM(Motif): """This motif subclass defines a pattern in terms of a position weight matrix. An alphabet must be provided. A pseudo-count to be added to each count is optional. A uniform background distribution is used by default. """ def __init__(self, alpha): Motif.__init__(self, alpha) # set alphabet of this multinomial distribution self.background = Distrib(alpha) # the default background ... self.background.count(alpha.getSymbols()) # ... is uniform self.nsites = 0 def setFromAlignment(self, aligned, pseudo_count = 0.0): """Set the probabilities in the PWM from an alignment. The alignment is a list of equal-length strings (see readStrings), OR a list of Sequence. """ self.cols = -1 self.nsites = len(aligned) seqs = [] # Below we create a list of Sequence from the alignment, # while doing some error checking, and figure out the number of columns for s in aligned: # probably a text string, so we make a nameless sequence from it if not type(s) is Sequence: s=Sequence(s, Motif.getAlphabet(self)) else: # it was a sequence, so we check that the alphabet in # this motif will be able to process it if not Motif.isAlphabet(self, s): raise RuntimeError("Motif alphabet is not valid for sequence " + s.getName()) if self.cols == -1: self.cols = s.getLen() elif self.cols != s.getLen(): raise RuntimeError("Sequences in alignment are not of equal length") seqs.append(s) # The line below initializes the list of Distrib (one for each column of the alignment) self.counts = [Distrib(Motif.getAlphabet(self), pseudo_count) for _ in range(self.cols)] # Next, we do the counting, column by column for c in range( self.cols ): # iterate through columns for s in seqs: # iterate through rows # determine the index of the symbol we find at this position (row, column c) self.counts[c].count(s.getSite(c)) # Update the length self.len = self.cols def reverseComplement(self): """Reverse complement the PWM""" i = 0 j = len(self.counts)-1 while (i < j): temp = self.counts[i]; self.counts[i] = self.counts[j] self.counts[j] = temp self.counts[i].complement() self.counts[j].complement() i += 1; j -= 1; if i == j: self.counts[i].complement() return self def getNSites(self): """Get the number of sites that made the PWM""" return self.nsites def setBackground(self, distrib): """Set the background distribution""" if not distrib.getAlphabet() == Motif.getAlphabet(self): raise RuntimeError("Incompatible alphabets") self.background = distrib def getFreq(self, col = None, sym = None): """Get the probabilities for all positions in the PWM (a list of Distribs)""" if (col == None): return [y.getFreq() for y in self.counts] else: return self.counts[col].getFreq(sym) def pretty(self): """Retrieve the probabilites for all positions in the PWM as a pretty table (a list of text strings)""" #table = ["".join(["%8s " % s for s in self.alpha.getSymbols()])] table = [] for row in PWM.getFreq(self): table.append("".join(["%8.6f " % y for y in row])) return table def logoddsPretty(self, bkg): """Retrieve the (base-2) log-odds for all positions in the PWM as a pretty table (a list of text strings)""" table = [] for row in PWM.getFreq(self): #table.append("".join(["%8.6f " % (math.log((row[i]+1e-6)/bkg[i])/math.log(2)) for i in range(len(row))])) table.append("".join(["%8.6f " % (math.log((row[i])/bkg[i])/math.log(2)) for i in range(len(row))])) #table.append("".join(["%8.6f " % row[i] for i in range(len(row))])) return table def consensus_sequence(self): """ Get the consensus sequence corresponding to a PWM. Consensus sequence is the letter in each column with the highest probability. """ consensus = "" alphabet = Motif.getAlphabet(self).getSymbols() for pos in range(self.cols): best_letter = alphabet[0] best_p = self.counts[pos].getFreq(best_letter) for letter in alphabet[1:]: p = self.counts[pos].getFreq(letter) if p > best_p: best_p = p best_letter = letter consensus += best_letter return consensus def consensus(self): """ Get the consensus corresponding to a PWM. Consensus at each column of motif is a list of characters with non-zero probabilities. """ consensus = [] for pos in range(self.cols): matches = [] for letter in Motif.getAlphabet(self).getSymbols(): p = self.counts[pos].getFreq(letter) if p > 0: matches += letter consensus.append(matches) return consensus def getScore(self, seq, start): """Score this particular list of symbols using the PFM (background needs to be set separately)""" sum = 0.0 seqdata = seq.getSequence()[start : start+self.cols] for pos in range(len(seqdata)): q = self.counts[pos].getFreq(seqdata[pos]) if q == 0: q = 0.0001 # to avoid log(0) == -Infinity logodds = math.log(q / self.background.getFreq(seqdata[pos])) sum += logodds return sum def match(self, seq, _LOG0 = -10): """Find matches to the motif in a specified sequence. The method is a generator, hence subsequent hits can be retrieved using next(). The returned result is a tuple (position, match-sequence, score). The optional parameter _LOG0 specifies a lower bound on reported logodds scores. """ myseq = seq if not type(seq) is Sequence: myseq = Sequence(seq, self.alpha) if not Motif.isAlphabet(self, myseq): raise RuntimeError("Motif alphabet is not valid for sequence " + myseq.getName()) for pos in range(myseq.getLen() - self.cols): score = PWM.getScore(self, myseq, pos) if score > _LOG0: yield (pos, "".join(myseq.getSite(pos, self.cols)), score) def writeEPS(self, program, template_file, eps_fh, timestamp = time.localtime()): """Write out a DNA motif to EPS format.""" small_dfmt = "%d.%m.%Y %H:%M" full_dfmt = "%d.%m.%Y %H:%M:%S %Z" small_date = time.strftime(small_dfmt, timestamp) full_date = time.strftime(full_dfmt, timestamp) points_per_cm = 72.0 / 2.54 height = 4.5 width = self.getLen() * 0.8 + 2 width = min(30, width) points_height = int(height * points_per_cm) points_width = int(width * points_per_cm) defaults = _eps_defaults.copy() defaults['CREATOR'] = program defaults['CREATIONDATE'] = full_date defaults['LOGOHEIGHT'] = str(height) defaults['LOGOWIDTH'] = str(width) defaults['FINEPRINT'] = program + ' ' + small_date defaults['CHARSPERLINE'] = str(self.getLen()) defaults['BOUNDINGHEIGHT'] = str(points_height) defaults['BOUNDINGWIDTH'] = str(points_width) defaults['LOGOLINEHEIGHT'] = str(height) with open(template_file, 'r') as template_fh: m_var = re.compile("\{\$([A-Z]+)\}") for line in template_fh: last = 0 match = m_var.search(line) while (match): if (last < match.start()): prev = line[last:match.start()] eps_fh.write(prev) key = match.group(1) if (key == "DATA"): eps_fh.write("\nStartLine\n") for pos in range(self.getLen()): eps_fh.write("({0:d}) startstack\n".format(pos+1)) stack = [] # calculate the stack information content alpha_ic = 2 h = 0 for sym in self.getAlphabet().getSymbols(): freq = self.getFreq(pos, sym) if (freq == 0): continue h -= (freq * math.log(freq, 2)) stack_ic = alpha_ic - h # calculate the heights of each symbol for sym in self.getAlphabet().getSymbols(): freq = self.getFreq(pos, sym) if (freq == 0): continue stack.append((freq * stack_ic, sym)) stack.sort(); # output the symbols for symh, sym in stack: eps_fh.write(" {0:f} ({1:s}) numchar\n".format( symh, sym)) eps_fh.write("endstack\n\n") eps_fh.write("EndLine\n") elif (key in defaults): eps_fh.write(defaults[key]) else: raise RuntimeError('Unknown variable "' + key + '" in EPS template') last = match.end(); match = m_var.search(line, last) if (last < len(line)): eps_fh.write(line[last:]) #------------------ Main method ------------------- # Executed if you run this file from the operating system prompt, e.g. # > python sequence.py if __name__=='__main__': alpha = getAlphabet('Extended DNA') #seqs = readFASTA('pos.fasta') seqs = [] aln = readStrings('tmp0') #regexp = RegExp(alpha, '[AG]G.[DE]TT[AS].') pwm = PWM(alpha) pwm.setFromAlignment(aln) for row in pwm.pretty(): print row for s in seqs: print s.getName(), s.getLen(), s.getAlphabet().getSymbols() for m in regexp.match( s ): print "pos: %d pat: %s %4.2f" % (m[0], m[1], m[2]) for m in pwm.match( s ): print "pos: %d pat: %s %4.2f" % (m[0], m[1], m[2])