annotate mytools/sequence.py @ 11:92eac44fe40d

Uploaded
author xuebing
date Fri, 16 Mar 2012 15:27:31 -0400
parents 39217fa39ff2
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
1 #!@WHICHPYTHON@
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
2
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
3 import copy, string, sys
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
4
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
5 #------------------ Alphabet -------------------
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
6
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
7 class Alphabet(object):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
8 """Biological alphabet class.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
9 This defines the set of symbols from which various objects can be built, e.g. sequences and motifs.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
10 The symbol set is immutable and accessed as a tuple.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
11 symstr: symbols in alphabet as either a tuple or string
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
12 complement: dictionary defining letters and their complement
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
13 """
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
14 def __init__(self, symstr, complement = None):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
15 """Construct an alphabet from a string or tuple of characters.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
16 Lower case characters will be converted to upper case.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
17 An optional mapping for complements may be provided.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
18 Example:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
19 >>> alpha = sequence.Alphabet('ACGTttga', {'A':'C', 'G':'T'})
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
20 >>> alpha.getSymbols()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
21 will construct the DNA alphabet and output:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
22 ('A', 'C', 'G', 'T')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
23 """
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
24 symlst = []
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
25 for s in [str(sym).upper()[0] for sym in symstr]:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
26 if not s in symlst:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
27 symlst.append(s)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
28 self.symbols = tuple(symlst)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
29 if complement != None:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
30 # expand the mapping and check for contradictions
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
31 cmap = {}
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
32 for s in self.symbols:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
33 c = complement.get(s, None)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
34 if c != None:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
35 if s in cmap and cmap[s] != c:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
36 raise RuntimeError("Alphabet complement map "
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
37 "contains contradictory mapping")
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
38 cmap[s] = c
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
39 cmap[c] = s
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
40 # replace mapping with indicies
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
41 cimap = {}
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
42 for idx in range (len(self.symbols)):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
43 s = self.symbols[idx]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
44 if s in cmap:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
45 cimap[cmap[s]] = idx
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
46 # create tuple
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
47 cidxlst = []
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
48 for idx in range (len(self.symbols)):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
49 cidxlst.append(cimap.get(self.symbols[idx], None))
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
50 self.complements = tuple(cidxlst)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
51 else:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
52 self.complements = None
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
53
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
54 def getSymbols(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
55 """Retrieve a tuple with all symbols, immutable membership and order"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
56 return self.symbols
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
57
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
58 def getComplements(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
59 """Retrieve a tuple with all complement indicies, immutable"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
60 return self.complements
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
61
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
62 def isValidSymbol(self, sym):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
63 """Check if the symbol is a member of alphabet"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
64 return any([s==sym for s in self.symbols])
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
65
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
66 def getIndex(self, sym):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
67 """Retrieve the index of the symbol (immutable)"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
68 for idx in range (len(self.symbols)):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
69 if self.symbols[idx] == sym:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
70 return idx
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
71 raise RuntimeError("Symbol " + sym + " does not exist in alphabet")
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
72
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
73 def isComplementable(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
74 return self.complements != None
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
75
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
76 def getComplement(self, sym):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
77 """Retrieve the complement of the symbol (immutable)"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
78 return self.symbols[self.complements[self.getIndex(sym)]];
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
79
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
80 def isValidString(self, symstr):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
81 """Check if the string contains only symbols that belong to the alphabet"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
82 found = True
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
83 for sym in symstr:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
84 if self.isValidSymbol(sym) == False:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
85 return False
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
86 return True
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
87
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
88 def getLen(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
89 """Retrieve the number of symbols in (the length of) the alphabet"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
90 return len(self.symbols)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
91
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
92 # pre-defined alphabets that can be specified by their name
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
93 predefAlphabets = [
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
94 ("DNA" , Alphabet('ACGT', {'A':'T', 'G':'C'})),
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
95 ("RNA" , Alphabet('ACGU')),
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
96 ("Extended DNA" , Alphabet('ACGTYRN')),
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
97 ("Protein" , Alphabet('ACDEFGHIKLMNPQRSTVWY')),
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
98 ("Extended Protein" , Alphabet('ACDEFGHIKLMNPQRSTVWYX')),
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
99 ("TM Labels" , Alphabet('MIO'))
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
100 ]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
101
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
102 def getAlphabet(name):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
103 """Retrieve a pre-defined alphabet by name.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
104 Currently, "Protein", "DNA", "RNA", "Extended DNA", "Extended Protein" and "TM Labels" are available.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
105 Example:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
106 >>> alpha = sequence.getAlphabet('Protein')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
107 >>> alpha.getSymbols()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
108 will retrieve the 20 amino acid alphabet and output the tuple:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
109 ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
110 """
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
111 for (xname, xalpha) in predefAlphabets:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
112 if xname == name:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
113 return xalpha
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
114 return None
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
115
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
116 #------------------ Sequence -------------------
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
117
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
118 class Sequence(object):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
119 """Biological sequence class. Sequence data is immutable.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
120
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
121 data: the sequence data as a tuple or string
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
122 alpha: the alphabet from which symbols are taken
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
123 name: the sequence name, if any
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
124 info: can contain additional sequence information apart from the name
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
125 """
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
126 def __init__(self, sequence, alpha = None, name = "", seqinfo = ""):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
127 """Create a sequence with sequence data.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
128 Specifying the alphabet is optional, so is the name and info.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
129 Example:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
130 >>> myseq = sequence.Sequence('MVSAKKVPAIAMSFGVSF')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
131 will create a sequence with name "", and assign one of the predefined alphabets on basis of what symbols were used.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
132 >>> myseq.getAlphabet().getSymbols()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
133 will most likely output the standard protein alphabet:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
134 ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
135 """
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
136 if type(sequence) is str:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
137 self.data = tuple(sequence.upper())
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
138 elif type(sequence) is tuple:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
139 self.data = sequence
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
140 elif type(sequence) is list:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
141 self.data = tuple([s.upper() for s in sequence])
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
142 else:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
143 raise RuntimeError("Sequence data is not specified correctly: must be string or tuple")
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
144 # Resolve choice of alphabet
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
145 validAlphabet = False
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
146 if alpha == None: # Alphabet is not set, attempt to set it automatically...
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
147 for (xname, xalpha) in predefAlphabets: # Iterate through each predefined alphabet, in order
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
148 if xalpha.isValidString( self.data ): # This alphabet works, go with it
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
149 self.alpha = alpha = xalpha
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
150 validAlphabet = True
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
151 break
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
152 self.name = name
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
153 self.info = seqinfo
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
154 if validAlphabet == False: # we were either unsuccessful above or the alphabet was specified so test it
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
155 if type(alpha) is str: # check if name is a predefined alphabet
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
156 for (xname, xalpha) in predefAlphabets: # Iterate through each predefined alphabet, check for name
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
157 if (xname == alpha):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
158 alpha = xalpha
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
159 break
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
160 if type(alpha) is Alphabet: # the alphabet is specified
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
161 if alpha.isValidString(self.data) == False:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
162 raise RuntimeError("Invalid alphabet specified: "+"".join(alpha.getSymbols())+" is not compatible with sequence '"+"".join(self.data)+"'")
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
163 else:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
164 self.alpha = alpha
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
165 else:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
166 raise RuntimeError("Could not identify alphabet from sequence")
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
167
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
168 #basic getters and setters for the class
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
169 def getName(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
170 """Get the name of the sequence"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
171 return self.name
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
172 def getInfo(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
173 """Get additional info of the sequence (e.g. from the defline in a FASTA file)"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
174 return self.info
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
175 def getAlphabet(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
176 """Retrieve the alphabet that is assigned to this sequence"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
177 return self.alpha
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
178 def setName(self, name):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
179 """Change the name of the sequence"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
180 self.name = name
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
181 def setAlphabet(self, alpha):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
182 """Set the alphabet, throws an exception if it is not compatible with the sequence data"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
183 if type(alpha) is Alphabet:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
184 if alpha.isValid( sequence ) == False:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
185 raise RuntimeError( "Invalid alphabet specified" )
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
186 #sequence functions
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
187 def getSequence(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
188 """Retrieve the sequence data (a tuple of symbols)"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
189 return self.data
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
190 def getString(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
191 """Retrieve the sequence data as a string (copy of actual data)"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
192 return "".join(self.data)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
193 def getLen(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
194 """Get the length of the sequence (number of symbols)"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
195 return len(self.data)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
196 def getSite(self, position, length = 1):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
197 """Retrieve a site in the sequence of desired length.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
198 Note that positions go from 0 to length-1, and that if the requested site
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
199 extends beyond those the method throws an exception.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
200 """
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
201 if position >= 0 and position <= self.getLen() - length:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
202 if length == 1:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
203 return self.data[position]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
204 else:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
205 return self.data[position:position+length]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
206 else:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
207 raise RuntimeError( "Attempt to access invalid position in sequence "+self.getName() )
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
208
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
209 def nice(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
210 """ A short description of the sequence """
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
211 print self.getName(), ":", self.getLen()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
212
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
213 def readStrings(filename):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
214 """ Read one or more lines of text from a file--for example an alignment.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
215 Return as a list of strings.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
216 filename: name of file
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
217 """
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
218 txtlist = []
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
219 f = open(filename)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
220 for line in f.readlines():
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
221 txtlist.extend(line.split())
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
222 return txtlist
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
223
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
224 def readFASTA(filename, alpha = None):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
225 """ Read one or more sequences from a file in FASTA format.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
226 filename: name of file to load sequences from
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
227 alpha: alphabet that is used (if left unspecified, an attempt is made to identify the alphabet for each individual sequence)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
228 """
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
229 seqlist = []
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
230 seqname = None
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
231 seqinfo = None
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
232 seqdata = []
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
233 fh = open(filename)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
234 thisline = fh.readline()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
235 while (thisline):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
236 if (thisline[0] == '>'): # new sequence
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
237 if (seqname): # take care of the data that is already in the buffer before processing the new sequence
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
238 try:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
239 seqnew = Sequence(seqdata, alpha, seqname, seqinfo)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
240 seqlist.append(seqnew)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
241 except RuntimeError, e:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
242 print >> sys.stderr, "Warning: "+seqname+" is invalid (ignored): ", e
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
243 seqinfo = thisline[1:-1] # everything on the defline is "info"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
244 seqname = seqinfo.split()[0] # up to first space
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
245 seqdata = []
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
246 else: # pull out the sequence data
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
247 cleanline = thisline.split()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
248 for line in cleanline:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
249 seqdata.extend(tuple(line.strip('*'))) # sometimes a line ends with an asterisk in FASTA files
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
250 thisline = fh.readline()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
251
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
252 if (seqname):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
253 try:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
254 seqnew = Sequence(seqdata, alpha, seqname, seqinfo)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
255 seqlist.append(seqnew)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
256 except RuntimeError, e:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
257 print >> sys.stderr, "Warning: " + seqname + " is invalid (ignored): ", e
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
258 else:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
259 raise RuntimeError("No sequences on FASTA format found in this file")
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
260 fh.close()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
261 return seqlist
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
262
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
263 def _writeOneFASTA(sequence, filehandle):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
264 """Write one sequence in FASTA format to an already open file"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
265 filehandle.write(">" + sequence.getName()+"\n")
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
266 data = sequence.getSequence()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
267 lines = ( sequence.getLen() - 1) / 60 + 1
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
268 for i in range(lines):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
269 #note: python lets us get the last line (var length) free
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
270 #lineofseq = data[i*60 : (i+1)*60] + "\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
271 lineofseq = "".join(data[i*60 : (i+1)*60]) + "\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
272 filehandle.write(lineofseq)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
273
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
274 def writeFASTA(sequence, filename):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
275 """Write a list (or a single) of sequences to a file in the FASTA format"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
276 fh = open(filename, "w")
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
277 if isinstance(sequence, Sequence):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
278 _writeOneFASTA(sequence, fh)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
279 else:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
280 for seq in sequence:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
281 if isinstance(seq, Sequence):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
282 _writeOneFASTA(seq, fh)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
283 else:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
284 print >> sys.stderr, "Warning: could not write " + seq.getName() + " (ignored)."
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
285 fh.flush()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
286 fh.close()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
287
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
288 #------------------ Distrib -------------------
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
289
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
290 class Distrib(object):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
291 """Class for storing a multinomial probability distribution over the symbols in an alphabet"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
292 def __init__(self, alpha, pseudo_count = 0.0):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
293 self.alpha = alpha
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
294 self.tot = pseudo_count * self.alpha.getLen()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
295 self.cnt = [pseudo_count for _ in range( self.alpha.getLen() )]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
296
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
297 def __deepcopy__(self, memo):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
298 dup = Distrib(self.alpha)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
299 dup.tot = copy.deepcopy(self.tot, memo)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
300 dup.cnt = copy.deepcopy(self.cnt, memo)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
301 return dup
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
302
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
303 def count(self, syms = None ):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
304 """Count an observation of a symbol"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
305 if syms == None:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
306 syms = self.alpha.getSymbols()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
307 for sym in syms:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
308 idx = self.alpha.getIndex( sym )
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
309 self.cnt[idx] += 1.0
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
310 self.tot += 1
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
311
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
312 def complement(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
313 """Complement the counts, throw an error if this is impossible"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
314 if not self.alpha.isComplementable():
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
315 raise RuntimeError("Attempt to complement a Distrib "
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
316 "based on a non-complementable alphabet.")
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
317 coms = self.alpha.getComplements()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
318 new_count = []
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
319 for idx in range(len(coms)):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
320 cidx = coms[idx]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
321 if cidx == None:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
322 cidx = idx
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
323 new_count.append(self.cnt[cidx])
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
324 self.cnt = new_count
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
325 return self
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
326
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
327 def reset(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
328 """Reset the distribution, that is, restart counting."""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
329 self.tot = 0
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
330 self.cnt = [0.0 for _ in range( self.alpha.getLen() )]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
331
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
332 def getFreq(self, sym = None):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
333 """Determine the probability distribution from the current counts.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
334 The order in which probabilities are given follow the order of the symbols in the alphabet."""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
335 if self.tot > 0:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
336 if sym == None:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
337 freq = tuple([ y / self.tot for y in self.cnt ])
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
338 return freq
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
339 else:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
340 idx = self.alpha.getIndex( sym )
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
341 return self.cnt[idx] / self.tot
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
342 return None
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
343
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
344 def pretty(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
345 """Retrieve the probabilites for all symbols and return as a pretty table (a list of text strings)"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
346 table = ["".join(["%4s " % s for s in self.alpha.getSymbols()])]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
347 table.append("".join(["%3.2f " % y for y in Distrib.getFreq(self)]))
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
348 return table
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
349
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
350 def getSymbols(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
351 """Get the symbols in the alphabet in the same order as probabilities are given."""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
352 return self.alpha.getSymbols()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
353
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
354 def getAlphabet(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
355 """Get the alphabet over which the distribution is defined."""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
356 return self.alpha
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
357
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
358 #------------------ Motif (and subclasses) -------------------
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
359
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
360 class Motif(object):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
361 """ Sequence motif class--defining a pattern that can be searched in sequences.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
362 This class is not intended for direct use. Instead use and develop sub-classes (see below).
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
363 """
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
364 def __init__(self, alpha):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
365 self.len = 0
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
366 self.alpha = alpha
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
367
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
368 def getLen(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
369 """Get the length of the motif"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
370 return self.len
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
371
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
372 def getAlphabet(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
373 """Get the alphabet that is used in the motif"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
374 return self.alpha
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
375
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
376 def isAlphabet(self, seqstr):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
377 """Check if the sequence can be processed by this motif"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
378 mystr = seqstr
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
379 if type(seqstr) is Sequence:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
380 mystr = seqstr.getString()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
381 return self.getAlphabet().isValidString(mystr)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
382
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
383 import re
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
384
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
385 class RegExp(Motif):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
386 """A motif class that defines the pattern in terms of a regular expression"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
387 def __init__(self, alpha, re_string):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
388 Motif.__init__(self, alpha)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
389 self.pattern = re.compile(re_string)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
390
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
391 def match(self, seq):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
392 """Find matches to the motif in a specified sequence.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
393 The method is a generator, hence subsequent hits can be retrieved using next().
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
394 The returned result is a tuple (position, match-sequence, score), where score is
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
395 always 1.0 since a regular expression is either true or false (not returned).
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
396 """
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
397 myseq = seq
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
398 if not type(seq) is Sequence:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
399 myseq = Sequence(seq, self.alpha)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
400 mystr = myseq.getString()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
401 if not Motif.isAlphabet(self, mystr):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
402 raise RuntimeError("Motif alphabet is not valid for sequence " + myseq.getName())
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
403 for m in re.finditer(self.pattern, mystr):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
404 yield (m.start(), m.group(), 1.0)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
405
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
406 import math, time
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
407
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
408 # Variables used by the PWM for creating an EPS file
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
409 _colour_def = (
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
410 "/black [0 0 0] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
411 "/red [0.8 0 0] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
412 "/green [0 0.5 0] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
413 "/blue [0 0 0.8] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
414 "/yellow [1 1 0] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
415 "/purple [0.8 0 0.8] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
416 "/magenta [1.0 0 1.0] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
417 "/cyan [0 1.0 1.0] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
418 "/pink [1.0 0.8 0.8] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
419 "/turquoise [0.2 0.9 0.8] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
420 "/orange [1 0.7 0] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
421 "/lightred [0.8 0.56 0.56] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
422 "/lightgreen [0.35 0.5 0.35] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
423 "/lightblue [0.56 0.56 0.8] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
424 "/lightyellow [1 1 0.71] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
425 "/lightpurple [0.8 0.56 0.8] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
426 "/lightmagenta [1.0 0.7 1.0] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
427 "/lightcyan [0.7 1.0 1.0] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
428 "/lightpink [1.0 0.9 0.9] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
429 "/lightturquoise [0.81 0.9 0.89] def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
430 "/lightorange [1 0.91 0.7] def\n")
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
431 _colour_dict = (
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
432 "/fullColourDict <<\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
433 " (G) orange\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
434 " (T) green\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
435 " (C) blue\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
436 " (A) red\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
437 " (U) green\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
438 ">> def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
439 "/mutedColourDict <<\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
440 " (G) lightorange\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
441 " (T) lightgreen\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
442 " (C) lightblue\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
443 " (A) lightred\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
444 " (U) lightgreen\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
445 ">> def\n"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
446 "/colorDict fullColourDict def\n")
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
447
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
448 _eps_defaults = {
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
449 'LOGOTYPE': 'NA',
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
450 'FONTSIZE': '12',
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
451 'TITLEFONTSIZE': '12',
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
452 'SMALLFONTSIZE': '6',
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
453 'TOPMARGIN': '0.9',
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
454 'BOTTOMMARGIN': '0.9',
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
455 'YAXIS': 'true',
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
456 'YAXISLABEL': 'bits',
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
457 'XAXISLABEL': '',
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
458 'TITLE': '',
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
459 'ERRORBARFRACTION': '1.0',
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
460 'SHOWINGBOX': 'false',
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
461 'BARBITS': '2.0',
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
462 'TICBITS': '1',
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
463 'COLORDEF': _colour_def,
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
464 'COLORDICT': _colour_dict,
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
465 'SHOWENDS': 'false',
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
466 'NUMBERING': 'true',
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
467 'OUTLINE': 'false',
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
468 }
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
469 class PWM(Motif):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
470 """This motif subclass defines a pattern in terms of a position weight matrix.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
471 An alphabet must be provided. A pseudo-count to be added to each count is
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
472 optional. A uniform background distribution is used by default.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
473 """
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
474 def __init__(self, alpha):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
475 Motif.__init__(self, alpha) # set alphabet of this multinomial distribution
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
476 self.background = Distrib(alpha) # the default background ...
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
477 self.background.count(alpha.getSymbols()) # ... is uniform
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
478 self.nsites = 0
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
479
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
480 def setFromAlignment(self, aligned, pseudo_count = 0.0):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
481 """Set the probabilities in the PWM from an alignment.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
482 The alignment is a list of equal-length strings (see readStrings), OR
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
483 a list of Sequence.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
484 """
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
485 self.cols = -1
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
486 self.nsites = len(aligned)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
487 seqs = []
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
488 # Below we create a list of Sequence from the alignment,
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
489 # while doing some error checking, and figure out the number of columns
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
490 for s in aligned:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
491 # probably a text string, so we make a nameless sequence from it
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
492 if not type(s) is Sequence:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
493 s=Sequence(s, Motif.getAlphabet(self))
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
494 else:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
495 # it was a sequence, so we check that the alphabet in
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
496 # this motif will be able to process it
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
497 if not Motif.isAlphabet(self, s):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
498 raise RuntimeError("Motif alphabet is not valid for sequence " + s.getName())
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
499 if self.cols == -1:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
500 self.cols = s.getLen()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
501 elif self.cols != s.getLen():
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
502 raise RuntimeError("Sequences in alignment are not of equal length")
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
503 seqs.append(s)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
504 # The line below initializes the list of Distrib (one for each column of the alignment)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
505 self.counts = [Distrib(Motif.getAlphabet(self), pseudo_count) for _ in range(self.cols)]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
506 # Next, we do the counting, column by column
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
507 for c in range( self.cols ): # iterate through columns
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
508 for s in seqs: # iterate through rows
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
509 # determine the index of the symbol we find at this position (row, column c)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
510 self.counts[c].count(s.getSite(c))
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
511 # Update the length
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
512 self.len = self.cols
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
513
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
514 def reverseComplement(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
515 """Reverse complement the PWM"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
516 i = 0
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
517 j = len(self.counts)-1
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
518 while (i < j):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
519 temp = self.counts[i];
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
520 self.counts[i] = self.counts[j]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
521 self.counts[j] = temp
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
522 self.counts[i].complement()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
523 self.counts[j].complement()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
524 i += 1;
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
525 j -= 1;
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
526 if i == j:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
527 self.counts[i].complement()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
528 return self
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
529
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
530 def getNSites(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
531 """Get the number of sites that made the PWM"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
532 return self.nsites
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
533
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
534 def setBackground(self, distrib):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
535 """Set the background distribution"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
536 if not distrib.getAlphabet() == Motif.getAlphabet(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
537 raise RuntimeError("Incompatible alphabets")
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
538 self.background = distrib
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
539
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
540 def getFreq(self, col = None, sym = None):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
541 """Get the probabilities for all positions in the PWM (a list of Distribs)"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
542 if (col == None):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
543 return [y.getFreq() for y in self.counts]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
544 else:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
545 return self.counts[col].getFreq(sym)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
546
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
547 def pretty(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
548 """Retrieve the probabilites for all positions in the PWM as a pretty table (a list of text strings)"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
549 #table = ["".join(["%8s " % s for s in self.alpha.getSymbols()])]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
550 table = []
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
551 for row in PWM.getFreq(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
552 table.append("".join(["%8.6f " % y for y in row]))
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
553 return table
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
554
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
555 def logoddsPretty(self, bkg):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
556 """Retrieve the (base-2) log-odds for all positions in the PWM as a pretty table (a list of text strings)"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
557 table = []
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
558 for row in PWM.getFreq(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
559 #table.append("".join(["%8.6f " % (math.log((row[i]+1e-6)/bkg[i])/math.log(2)) for i in range(len(row))]))
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
560 table.append("".join(["%8.6f " % (math.log((row[i])/bkg[i])/math.log(2)) for i in range(len(row))]))
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
561 #table.append("".join(["%8.6f " % row[i] for i in range(len(row))]))
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
562 return table
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
563
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
564
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
565 def consensus_sequence(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
566 """
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
567 Get the consensus sequence corresponding to a PWM.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
568 Consensus sequence is the letter in each column
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
569 with the highest probability.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
570 """
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
571 consensus = ""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
572 alphabet = Motif.getAlphabet(self).getSymbols()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
573 for pos in range(self.cols):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
574 best_letter = alphabet[0]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
575 best_p = self.counts[pos].getFreq(best_letter)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
576 for letter in alphabet[1:]:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
577 p = self.counts[pos].getFreq(letter)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
578 if p > best_p:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
579 best_p = p
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
580 best_letter = letter
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
581 consensus += best_letter
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
582 return consensus
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
583
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
584
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
585 def consensus(self):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
586 """
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
587 Get the consensus corresponding to a PWM.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
588 Consensus at each column of motif is a list of
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
589 characters with non-zero probabilities.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
590 """
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
591 consensus = []
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
592 for pos in range(self.cols):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
593 matches = []
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
594 for letter in Motif.getAlphabet(self).getSymbols():
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
595 p = self.counts[pos].getFreq(letter)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
596 if p > 0:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
597 matches += letter
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
598 consensus.append(matches)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
599 return consensus
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
600
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
601
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
602 def getScore(self, seq, start):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
603 """Score this particular list of symbols using the PFM (background needs to be set separately)"""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
604 sum = 0.0
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
605 seqdata = seq.getSequence()[start : start+self.cols]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
606 for pos in range(len(seqdata)):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
607 q = self.counts[pos].getFreq(seqdata[pos])
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
608 if q == 0:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
609 q = 0.0001 # to avoid log(0) == -Infinity
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
610 logodds = math.log(q / self.background.getFreq(seqdata[pos]))
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
611 sum += logodds
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
612 return sum
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
613
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
614 def match(self, seq, _LOG0 = -10):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
615 """Find matches to the motif in a specified sequence.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
616 The method is a generator, hence subsequent hits can be retrieved using next().
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
617 The returned result is a tuple (position, match-sequence, score).
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
618 The optional parameter _LOG0 specifies a lower bound on reported logodds scores.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
619 """
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
620 myseq = seq
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
621 if not type(seq) is Sequence:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
622 myseq = Sequence(seq, self.alpha)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
623 if not Motif.isAlphabet(self, myseq):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
624 raise RuntimeError("Motif alphabet is not valid for sequence " + myseq.getName())
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
625 for pos in range(myseq.getLen() - self.cols):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
626 score = PWM.getScore(self, myseq, pos)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
627 if score > _LOG0:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
628 yield (pos, "".join(myseq.getSite(pos, self.cols)), score)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
629
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
630 def writeEPS(self, program, template_file, eps_fh,
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
631 timestamp = time.localtime()):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
632 """Write out a DNA motif to EPS format."""
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
633 small_dfmt = "%d.%m.%Y %H:%M"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
634 full_dfmt = "%d.%m.%Y %H:%M:%S %Z"
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
635 small_date = time.strftime(small_dfmt, timestamp)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
636 full_date = time.strftime(full_dfmt, timestamp)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
637 points_per_cm = 72.0 / 2.54
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
638 height = 4.5
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
639 width = self.getLen() * 0.8 + 2
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
640 width = min(30, width)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
641 points_height = int(height * points_per_cm)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
642 points_width = int(width * points_per_cm)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
643 defaults = _eps_defaults.copy()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
644 defaults['CREATOR'] = program
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
645 defaults['CREATIONDATE'] = full_date
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
646 defaults['LOGOHEIGHT'] = str(height)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
647 defaults['LOGOWIDTH'] = str(width)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
648 defaults['FINEPRINT'] = program + ' ' + small_date
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
649 defaults['CHARSPERLINE'] = str(self.getLen())
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
650 defaults['BOUNDINGHEIGHT'] = str(points_height)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
651 defaults['BOUNDINGWIDTH'] = str(points_width)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
652 defaults['LOGOLINEHEIGHT'] = str(height)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
653 with open(template_file, 'r') as template_fh:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
654 m_var = re.compile("\{\$([A-Z]+)\}")
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
655 for line in template_fh:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
656 last = 0
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
657 match = m_var.search(line)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
658 while (match):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
659 if (last < match.start()):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
660 prev = line[last:match.start()]
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
661 eps_fh.write(prev)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
662 key = match.group(1)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
663 if (key == "DATA"):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
664 eps_fh.write("\nStartLine\n")
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
665 for pos in range(self.getLen()):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
666 eps_fh.write("({0:d}) startstack\n".format(pos+1))
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
667 stack = []
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
668 # calculate the stack information content
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
669 alpha_ic = 2
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
670 h = 0
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
671 for sym in self.getAlphabet().getSymbols():
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
672 freq = self.getFreq(pos, sym)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
673 if (freq == 0):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
674 continue
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
675 h -= (freq * math.log(freq, 2))
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
676 stack_ic = alpha_ic - h
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
677 # calculate the heights of each symbol
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
678 for sym in self.getAlphabet().getSymbols():
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
679 freq = self.getFreq(pos, sym)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
680 if (freq == 0):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
681 continue
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
682 stack.append((freq * stack_ic, sym))
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
683 stack.sort();
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
684 # output the symbols
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
685 for symh, sym in stack:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
686 eps_fh.write(" {0:f} ({1:s}) numchar\n".format(
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
687 symh, sym))
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
688 eps_fh.write("endstack\n\n")
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
689 eps_fh.write("EndLine\n")
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
690 elif (key in defaults):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
691 eps_fh.write(defaults[key])
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
692 else:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
693 raise RuntimeError('Unknown variable "' + key +
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
694 '" in EPS template')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
695 last = match.end();
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
696 match = m_var.search(line, last)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
697 if (last < len(line)):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
698 eps_fh.write(line[last:])
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
699
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
700
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
701 #------------------ Main method -------------------
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
702 # Executed if you run this file from the operating system prompt, e.g.
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
703 # > python sequence.py
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
704
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
705 if __name__=='__main__':
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
706 alpha = getAlphabet('Extended DNA')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
707 #seqs = readFASTA('pos.fasta')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
708 seqs = []
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
709 aln = readStrings('tmp0')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
710 #regexp = RegExp(alpha, '[AG]G.[DE]TT[AS].')
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
711 pwm = PWM(alpha)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
712 pwm.setFromAlignment(aln)
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
713 for row in pwm.pretty():
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
714 print row
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
715 for s in seqs:
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
716 print s.getName(), s.getLen(), s.getAlphabet().getSymbols()
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
717 for m in regexp.match( s ):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
718 print "pos: %d pat: %s %4.2f" % (m[0], m[1], m[2])
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
719 for m in pwm.match( s ):
39217fa39ff2 Uploaded
xuebing
parents:
diff changeset
720 print "pos: %d pat: %s %4.2f" % (m[0], m[1], m[2])