Mercurial > repos > davidmurphy > codonlogo
comparison corebio/seq.py @ 4:4d47ab2b7bcc
Uploaded
author | davidmurphy |
---|---|
date | Fri, 13 Jan 2012 07:18:19 -0500 |
parents | c55bdc2fb9fa |
children |
comparison
equal
deleted
inserted
replaced
3:09d2dac9ef73 | 4:4d47ab2b7bcc |
---|---|
1 | |
2 # Copyright (c) 2005 Gavin E. Crooks <gec@compbio.berkeley.edu> | |
3 # | |
4 # This software is distributed under the MIT Open Source License. | |
5 # <http://www.opensource.org/licenses/mit-license.html> | |
6 # | |
7 # Permission is hereby granted, free of charge, to any person obtaining a | |
8 # copy of this software and associated documentation files (the "Software"), | |
9 # to deal in the Software without restriction, including without limitation | |
10 # the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
11 # and/or sell copies of the Software, and to permit persons to whom the | |
12 # Software is furnished to do so, subject to the following conditions: | |
13 # | |
14 # The above copyright notice and this permission notice shall be included | |
15 # in all copies or substantial portions of the Software. | |
16 # | |
17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
23 # THE SOFTWARE. | |
24 # | |
25 | |
26 | |
27 | |
28 """ Alphabetic sequences and associated tools and data. | |
29 | |
30 Seq is a subclass of a python string with additional annotation and an alphabet. | |
31 The characters in string must be contained in the alphabet. Various standard | |
32 alphabets are provided. | |
33 | |
34 | |
35 Classes : | |
36 Alphabet -- A subset of non-null ascii characters | |
37 Seq -- An alphabetic string | |
38 SeqList -- A collection of Seq's | |
39 | |
40 Alphabets : | |
41 o generic_alphabet -- A generic alphabet. Any printable ASCII character. | |
42 o protein_alphabet -- IUCAP/IUB Amino Acid one letter codes. | |
43 o nucleic_alphabet -- IUPAC/IUB Nucleic Acid codes 'ACGTURYSWKMBDHVN-' | |
44 o dna_alphabet -- Same as nucleic_alphabet, with 'U' (Uracil) an | |
45 alternative for 'T' (Thymidine). | |
46 o rna_alphabet -- Same as nucleic_alphabet, with 'T' (Thymidine) an | |
47 alternative for 'U' (Uracil). | |
48 o reduced_nucleic_alphabet -- All ambiguous codes in 'nucleic_alphabet' are | |
49 alternative to 'N' (aNy) | |
50 o reduced_protein_alphabet -- All ambiguous ('BZJ') and non-canonical amino | |
51 acids codes ( 'U', Selenocysteine and 'O', Pyrrolysine) in | |
52 'protein_alphabet' are alternative to 'X'. | |
53 o unambiguous_dna_alphabet -- 'ACGT' | |
54 o unambiguous_rna_alphabet -- 'ACGU' | |
55 o unambiguous_protein_alphabet -- The twenty canonical amino acid one letter | |
56 codes, in alphabetic order, 'ACDEFGHIKLMNPQRSTVWY' | |
57 | |
58 Amino Acid Codes: | |
59 Code Alt. Meaning | |
60 ----------------- | |
61 A Alanine | |
62 B Aspartic acid or Asparagine | |
63 C Cysteine | |
64 D Aspartate | |
65 E Glutamate | |
66 F Phenylalanine | |
67 G Glycine | |
68 H Histidine | |
69 I Isoleucine | |
70 J Leucine or Isoleucine | |
71 K Lysine | |
72 L Leucine | |
73 M Methionine | |
74 N Asparagine | |
75 O Pyrrolysine | |
76 P Proline | |
77 Q Glutamine | |
78 R Arginine | |
79 S Serine | |
80 T Threonine | |
81 U Selenocysteine | |
82 V Valine | |
83 W Tryptophan | |
84 Y Tyrosine | |
85 Z Glutamate or Glutamine | |
86 X ? any | |
87 * translation stop | |
88 - .~ gap | |
89 | |
90 Nucleotide Codes: | |
91 Code Alt. Meaning | |
92 ------------------------------ | |
93 A Adenosine | |
94 C Cytidine | |
95 G Guanine | |
96 T Thymidine | |
97 U Uracil | |
98 R G A (puRine) | |
99 Y T C (pYrimidine) | |
100 K G T (Ketone) | |
101 M A C (aMino group) | |
102 S G C (Strong interaction) | |
103 W A T (Weak interaction) | |
104 B G T C (not A) (B comes after A) | |
105 D G A T (not C) (D comes after C) | |
106 H A C T (not G) (H comes after G) | |
107 V G C A (not T, not U) (V comes after U) | |
108 N X? A G C T (aNy) | |
109 - .~ A gap | |
110 | |
111 | |
112 | |
113 | |
114 Refs: | |
115 http://www.chem.qmw.ac.uk/iupac/AminoAcid/A2021.html | |
116 http://www.chem.qmw.ac.uk/iubmb/misc/naseq.html | |
117 Status: | |
118 Beta | |
119 Authors: | |
120 GEC 2004,2005 | |
121 """ | |
122 | |
123 # TODO: Add this to docstring somewhere. | |
124 # To replace all ambiguous nucleic code by 'N', replace alphabet and then n | |
125 # normalize. | |
126 # | |
127 # >>> Seq( 'ACGT-RYKM', reduced_nucleic_alphabet).normalized() | |
128 # 'ACGT-NNNN' | |
129 | |
130 from array import array | |
131 from string import maketrans | |
132 from corebio.moremath import argmax, sqrt | |
133 | |
134 __all__ = [ | |
135 'Alphabet', | |
136 'Seq', | |
137 'rna', 'dna', 'protein', | |
138 'SeqList', | |
139 'generic_alphabet', | |
140 'protein_alphabet', | |
141 'nucleic_alphabet', | |
142 'dna_alphabet', | |
143 'rna_alphabet', | |
144 'reduced_nucleic_alphabet', | |
145 'reduced_protein_alphabet', | |
146 'unambiguous_dna_alphabet', | |
147 'unambiguous_dna_alphabet', | |
148 'unambiguous_rna_alphabet', | |
149 'unambiguous_protein_alphabet', | |
150 'generic_alphabet' | |
151 ] | |
152 | |
153 | |
154 | |
155 class Alphabet(object) : | |
156 """An ordered subset of printable ascii characters. | |
157 | |
158 Status: | |
159 Beta | |
160 Authors: | |
161 - GEC 2005 | |
162 """ | |
163 __slots__ = ['_letters', '_alternatives','_ord_table', '_chr_table'] | |
164 | |
165 # We're immutable, so use __new__ not __init__ | |
166 def __new__(cls, letters, alternatives= None) : | |
167 """Create a new, immutable Alphabet. | |
168 | |
169 arguments: | |
170 - letters -- the letters in the alphabet. The ordering determines | |
171 the ordinal position of each character in this alphabet. | |
172 - alt -- A list of (alternative, canonical) letters. The alternatives | |
173 are given the same ordinal position as the canonical character. | |
174 e.g. (('?','X'),('x', 'X')) states that '?' and 'x' are synomonous | |
175 with 'X'. Values that are not in 'letters' are ignored. Alternatives | |
176 that are already in 'letters' are also ignored. If the same | |
177 alternative character is used twice then the alternative is assigned | |
178 to the canonical character that occurs first in 'letters'. The | |
179 default is to assume that upper and lower case characters are | |
180 equivalent, unless both cases are included in 'letters'. | |
181 raises: | |
182 ValueError : Repetitive or otherwise illegal set of letters. | |
183 """ | |
184 self = object.__new__(cls) | |
185 | |
186 # Printable Ascii characters | |
187 ascii_letters = "".join([chr(__i) for __i in range(32,128)]) | |
188 | |
189 if letters is None : letters = ascii_letters | |
190 self._letters = letters | |
191 | |
192 equivalent_by_case = zip( 'abcdefghijklmnopqrstuvwxyz', | |
193 'ABCDEFGHIJKLMNOPQRSTUVWXYZ') | |
194 | |
195 if alternatives is None : alternatives = equivalent_by_case | |
196 | |
197 | |
198 # The ord_table maps between the ordinal position of a character in ascii | |
199 # and the ordinal position in this alphabet. Characters not in the | |
200 # alphabet are given a position of 255. The ord_table is stored as a | |
201 # string. | |
202 ord_table = ["\xff",] * 256 | |
203 for i,a in enumerate(letters) : | |
204 n = ord(a) | |
205 if n == 0 : | |
206 raise ValueError("Alphabet cannot contain null character \\0") | |
207 if ord_table[ n ] != "\xff": | |
208 raise ValueError("Repetitive alphabet") | |
209 ord_table[ n ] = chr(i) | |
210 | |
211 # Add alternatives | |
212 _from = [] | |
213 _to = [] | |
214 for e, c in alternatives : | |
215 if c in letters : | |
216 n = ord(e) | |
217 if ord_table[ n ] == "\xff" : # empty | |
218 ord_table[ n ] = ord_table[ ord(c)] | |
219 _from.append(e) | |
220 _to.append(c) | |
221 self._alternatives = (''.join(_from), ''.join(_to)) | |
222 | |
223 ord_table = "".join(ord_table) | |
224 assert( ord_table[0] == "\xff") | |
225 self._ord_table = ord_table | |
226 | |
227 # The chr_table maps between ordinal position in the alphabet letters | |
228 # and the ordinal position in ascii. This map is not the inverse of | |
229 # ord_table if there are alternatives. | |
230 chr_table = ["\x00"]*256 | |
231 for i,a in enumerate(letters) : | |
232 chr_table[ i ] = a | |
233 chr_table = "".join(chr_table) | |
234 self._chr_table = chr_table | |
235 | |
236 return self | |
237 | |
238 | |
239 def alphabetic(self, string) : | |
240 """True if all characters of the string are in this alphabet.""" | |
241 table = self._ord_table | |
242 for s in str(string): | |
243 if table[ord(s)] == "\xff" : | |
244 return False | |
245 return True | |
246 | |
247 def chr(self, n) : | |
248 """ The n'th character in the alphabet (zero indexed) or \\0 """ | |
249 return self._chr_table[n] | |
250 | |
251 def ord(self, c) : | |
252 """The ordinal position of the character c in this alphabet, | |
253 or 255 if no such character. | |
254 """ | |
255 return ord(self._ord_table[ord(c)]) | |
256 | |
257 def chrs(self, sequence_of_ints) : | |
258 """Convert a sequence of ordinals into an alphabetic string.""" | |
259 if not isinstance(sequence_of_ints, array) : | |
260 sequence_of_ints = array('B', sequence_of_ints) | |
261 s = sequence_of_ints.tostring().translate(self._chr_table) | |
262 return Seq(s, self) | |
263 | |
264 def ords(self, string) : | |
265 """Convert an alphabetic string into a byte array of ordinals.""" | |
266 string = str(string) | |
267 s = string.translate(self._ord_table) | |
268 a = array('B',s) | |
269 return a | |
270 | |
271 | |
272 def normalize(self, string) : | |
273 """Normalize an alphabetic string by converting all alternative symbols | |
274 to the canonical equivalent in 'letters'. | |
275 """ | |
276 if not self.alphabetic(string) : | |
277 raise ValueError("Not an alphabetic string.") | |
278 return self.chrs(self.ords(string)) | |
279 | |
280 def letters(self) : | |
281 """ Letters of the alphabet as a string.""" | |
282 return str(self) | |
283 | |
284 def _all_letters(self) : | |
285 """ All allowed letters, including alternatives.""" | |
286 let = [] | |
287 let.append(self._letters) | |
288 for key, value in self._alternatives : | |
289 let.append(value) | |
290 return ''.join(let) | |
291 | |
292 def __repr__(self) : | |
293 return "Alphabet( '" + self._letters +"', zip"+ repr(self._alternatives)+" )" | |
294 | |
295 def __str__(self) : | |
296 return str(self._letters) | |
297 | |
298 def __len__(self) : | |
299 return len(self._letters) | |
300 | |
301 def __eq__(self, other) : | |
302 if not hasattr(other, "_ord_table") : return False | |
303 return self._ord_table == other._ord_table | |
304 | |
305 def __ne__(self, other) : | |
306 return not self.__eq__(other) | |
307 | |
308 def __iter__(self) : | |
309 return iter(self._letters) | |
310 | |
311 def __getitem__(self, key) : | |
312 return self._letters[key] | |
313 | |
314 | |
315 # End class Alphabet | |
316 | |
317 # ------------------- Standard ALPHABETS ------------------- | |
318 # Standard alphabets are defined here, after Alphabet class. | |
319 | |
320 generic_alphabet = Alphabet(None, None) | |
321 | |
322 | |
323 protein_alphabet = Alphabet('ACDEFGHIKLMNOPQRSTUVWYBJZX*-', | |
324 zip('acdefghiklmnopqrstuvwybjzx?.~', | |
325 'ACDEFGHIKLMNOPQRSTUVWYBJZXX--') ) | |
326 | |
327 | |
328 nucleic_alphabet = Alphabet("ACGTURYSWKMBDHVN-", | |
329 zip("acgturyswkmbdhvnXx?.~", | |
330 "ACGTURYSWKMBDHVNNNN--") ) | |
331 | |
332 dna_alphabet = Alphabet("ACGTRYSWKMBDHVN-", | |
333 zip('acgtryswkmbdhvnXx?.~Uu', | |
334 'ACGTRYSWKMBDHVNNNN--TT') ) | |
335 | |
336 rna_alphabet = Alphabet("ACGURYSWKMBDHVN-", | |
337 zip('acguryswkmbdhvnXx?.~Tt', | |
338 'ACGURYSWKMBDHVNNNN--UU') ) | |
339 | |
340 reduced_nucleic_alphabet = Alphabet("ACGTN-", | |
341 zip('acgtryswkmbdhvnXx?.~TtRYSWKMBDHV', | |
342 'ACGTNNNNNNNNNNNNNN--TTNNNNNNNNNN') ) | |
343 | |
344 reduced_protein_alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWYX*-', | |
345 zip('acdefghiklmnpqrstvwyx?.~BbZzUu', | |
346 'ACDEFGHIKLMNPQRSTVWYXX--XXXXCC') ) | |
347 | |
348 unambiguous_dna_alphabet = Alphabet("ACGT", zip('acgt','ACGT') ) | |
349 | |
350 unambiguous_rna_alphabet = Alphabet("ACGU", zip('acgu','ACGU') ) | |
351 | |
352 unambiguous_protein_alphabet = Alphabet("ACDEFGHIKLMNPQRSTVWY", | |
353 zip('acdefghiklmnopqrstuvwy', | |
354 'ACDEFGHIKLMNOPQRSTUVWY') ) | |
355 | |
356 | |
357 _complement_table = maketrans("ACGTRYSWKMBDHVN-acgtUuryswkmbdhvnXx?.~", | |
358 "TGCAYRSWMKVHDBN-tgcaAayrswmkvhdbnXx?.~") | |
359 | |
360 | |
361 | |
362 class Seq(str): | |
363 """ An alphabetic string. A subclass of "str" consisting solely of | |
364 letters from the same alphabet. | |
365 | |
366 Attributes: | |
367 alphabet -- A string or Alphabet of allowed characters. | |
368 name -- A short string used to identify the sequence. | |
369 description -- A string describing the sequence | |
370 | |
371 Authors : | |
372 GEC 2005 | |
373 """ | |
374 # TODO: need a method to return a copy of the string with a new alphabet, | |
375 # preserving the sequence, name and alphabet? | |
376 | |
377 def __new__(cls, obj, | |
378 alphabet= generic_alphabet, | |
379 name =None, description=None, | |
380 ): | |
381 self = str.__new__(cls, obj) | |
382 if alphabet is None: | |
383 alphabet = generic_alphabet | |
384 if not isinstance(alphabet, Alphabet): | |
385 alphabet = Alphabet(alphabet) | |
386 if not alphabet.alphabetic(self) : | |
387 raise ValueError("Sequence not alphabetic %s, '%s'" %(alphabet, self)) | |
388 | |
389 self._alphabet=alphabet | |
390 self.name = name | |
391 self.description = description | |
392 | |
393 return self | |
394 | |
395 # BEGIN PROPERTIES | |
396 | |
397 # Make alphabet constant | |
398 def _get_alphabet(self): | |
399 return self._alphabet | |
400 alphabet = property(_get_alphabet) | |
401 | |
402 # END PROPERTIES | |
403 | |
404 | |
405 def ords(self) : | |
406 """ Convert sequence to an array of integers | |
407 in the range [0, len(alphabet) ) | |
408 """ | |
409 return self.alphabet.ords(self) | |
410 | |
411 def tally(self, alphabet = None): | |
412 """Counts the occurrences of alphabetic characters. | |
413 | |
414 Arguments: | |
415 - alphabet -- an optional alternative alphabet | |
416 | |
417 Returns : | |
418 A list of character counts in alphabetic order. | |
419 """ | |
420 # Renamed from count() since this conflicts with str.count(). | |
421 if not alphabet : alphabet = self.alphabet | |
422 L = len(alphabet) | |
423 counts = [0,] * L | |
424 | |
425 ords = alphabet.ords(self) | |
426 | |
427 for n in ords: | |
428 if n<L : counts[n] +=1 | |
429 | |
430 return counts | |
431 | |
432 | |
433 def kmers(self, alphabet = None, k=1): | |
434 """Counts the occurrences of overlapping alphabetic subsequences. | |
435 | |
436 Arguments: | |
437 - alphabet -- an optional alternative alphabet | |
438 - k -- subsequence length. Default: 1 (monomers) | |
439 | |
440 Returns : | |
441 A list of kmers counts in alphabetic order. | |
442 Status : | |
443 Alpha -- Not sure on interface. Will only work for small k | |
444 """ | |
445 # TODO: Refactor? | |
446 # TODO: Rename 'kmers' to 'words' or word_count | |
447 if not alphabet : alphabet = self.alphabet | |
448 | |
449 L = len(alphabet) | |
450 N = L**k | |
451 counts = [0,]*N | |
452 | |
453 ords = alphabet.ords(self) | |
454 | |
455 | |
456 # Easy case | |
457 if k==1 : | |
458 for n in ords: | |
459 if n<N : counts[n] +=1 | |
460 return counts | |
461 | |
462 # Kmer counting. | |
463 # FIXME: This code assumes that k isn't too large. | |
464 | |
465 # e.g. L =10, k = 3, multi = [100,10,1] | |
466 multi = [ L**i for i in range(k-1,-1,-1)] | |
467 | |
468 for i in range(len(ords)-k+1) : | |
469 if ords[i] >= N : # Skip non-alphabetic kmers | |
470 i += k | |
471 continue | |
472 #FIXME: this should be a function of alphabet? | |
473 n = sum([multi[j]* ords[i+j] for j in range(k) ]) | |
474 counts[n] +=1 | |
475 | |
476 return counts | |
477 | |
478 def __getslice__(self, i, j): | |
479 cls = self.__class__ | |
480 return cls( str.__getslice__(self,i,j), self.alphabet) | |
481 | |
482 def __getitem__(self, key) : | |
483 cls = self.__class__ | |
484 return cls( str.__getitem__(self,key), self.alphabet) | |
485 | |
486 def __add__(self, other) : | |
487 # called for "self + other" | |
488 cls = self.__class__ | |
489 return cls( str.__add__(self, other), self.alphabet) | |
490 | |
491 def __radd__(self, other) : | |
492 # Called when "other + self" and other is superclass of self | |
493 cls = self.__class__ | |
494 return cls( str.__add__(self, other), self.alphabet) | |
495 | |
496 def join(self, str_list) : | |
497 cls = self.__class__ | |
498 return cls( super(Seq, self).join(str_list), self.alphabet) | |
499 | |
500 def __eq__(self, other) : | |
501 if not hasattr(other, "alphabet") : return False | |
502 if self.alphabet != other.alphabet : | |
503 return False | |
504 return str.__eq__(self, other) | |
505 | |
506 def __ne__(self, other) : | |
507 return not self.__eq__(other) | |
508 | |
509 def tostring(self) : | |
510 """ Converts Seq to a raw string. | |
511 """ | |
512 # Compatibility with biopython | |
513 return str(self) | |
514 | |
515 # ---- Transformations of Seq ---- | |
516 def reverse(self) : | |
517 """Return the reversed sequence. | |
518 | |
519 Not that this method returns a new object, in contrast to | |
520 the in-place reverse() method of list objects. | |
521 """ | |
522 cls = self.__class__ | |
523 return cls( self[::-1], self.alphabet) | |
524 | |
525 def ungap(self) : | |
526 # FIXME: Gap symbols should be specified by the Alphabet? | |
527 return self.remove( '-.~') | |
528 | |
529 def remove(self, delchars) : | |
530 """Return a new alphabetic sequence with all characters in 'delchars' | |
531 removed. | |
532 """ | |
533 cls = self.__class__ | |
534 return cls( str(self).translate(maketrans('',''), delchars), self.alphabet) | |
535 | |
536 def lower(self) : | |
537 """Return a lower case copy of the sequence. """ | |
538 cls = self.__class__ | |
539 trans = maketrans('ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz') | |
540 return cls(str(self).translate(trans), self.alphabet) | |
541 | |
542 def upper(self) : | |
543 """Return a lower case copy of the sequence. """ | |
544 cls = self.__class__ | |
545 trans = maketrans('abcdefghijklmnopqrstuvwxyz','ABCDEFGHIJKLMNOPQRSTUVWXYZ') | |
546 return cls(str(self).translate(trans), self.alphabet) | |
547 | |
548 def mask(self, letters= 'abcdefghijklmnopqrstuvwxyz', mask='X') : | |
549 """Replace all occurences of letters with the mask character. | |
550 The default is to replace all lower case letters with 'X'. | |
551 """ | |
552 LL = len(letters) | |
553 if len(mask) !=1 : | |
554 raise ValueError("Mask should be single character") | |
555 to = mask * LL | |
556 trans = maketrans( letters, to) | |
557 cls = self.__class__ | |
558 return cls(str(self).translate(trans), self.alphabet) | |
559 | |
560 def translate(self) : | |
561 """Translate a nucleotide sequence to a polypeptide using full | |
562 IUPAC ambiguities in DNA/RNA and amino acid codes, using the | |
563 standard genetic code. See corebio.transform.GeneticCode for | |
564 details and more options. | |
565 """ | |
566 # Note: masks str.translate | |
567 from transform import GeneticCode | |
568 return GeneticCode.std().translate(self) | |
569 | |
570 def back_translate(self) : | |
571 """Translate a protein sequence back into coding DNA, using using the | |
572 standard genetic code. See corebio.transform.GeneticCode for | |
573 details and more options. | |
574 """ | |
575 from transform import GeneticCode | |
576 return GeneticCode.std().back_translate(self) | |
577 | |
578 | |
579 def reverse_complement(self) : | |
580 """Returns reversed complementary nucleic acid sequence (i.e. the other | |
581 strand of a DNA sequence.) | |
582 """ | |
583 return self.reverse().complement() | |
584 | |
585 def complement(self) : | |
586 """Returns complementary nucleic acid sequence.""" | |
587 if not nucleic_alphabet.alphabetic(self.alphabet): | |
588 raise ValueError("Incompatable alphabets") | |
589 s = str.translate(self, _complement_table) | |
590 cls = self.__class__ | |
591 return cls(s, self.alphabet, self.name, self.description) | |
592 | |
593 | |
594 # end class Seq | |
595 | |
596 | |
597 class SeqList(list): | |
598 """ A list of sequences. | |
599 | |
600 Status: | |
601 Beta | |
602 """ | |
603 # TODO: If alphabet given, we should ensure that all sequences conform. | |
604 # TODO: Need an isaligned() method. All seqs same length, same alphabet. | |
605 __slots__ =["alphabet", "name", "description"] | |
606 | |
607 def __init__(self, alist=[], alphabet=None, name=None, description=None): | |
608 list.__init__(self, alist) | |
609 self.alphabet = alphabet | |
610 self.name = name | |
611 self.description = description | |
612 | |
613 # TOOWTDI. Replicates seq_io.read() | |
614 #@classmethod | |
615 #def read(cls, afile, alphabet = None): | |
616 # return corebio.seq_io.read(afile, alphabet) | |
617 #read = classmethod(read) | |
618 | |
619 def ords(self, alphabet=None) : | |
620 """ Convert sequence list into a 2D array of ordinals. | |
621 """ | |
622 if not alphabet : alphabet = self.alphabet | |
623 if not alphabet : raise ValueError("No alphabet") | |
624 k = [] | |
625 for s in self: | |
626 k.append( alphabet.ords(s) ) | |
627 return k | |
628 | |
629 def tally(self, alphabet = None): | |
630 """Counts the occurrences of characters in each column.""" | |
631 if not alphabet : alphabet = self.alphabet | |
632 if not alphabet : raise ValueError("No alphabet") | |
633 | |
634 N = len(alphabet) | |
635 ords = self.ords(alphabet) | |
636 L = len(ords[0]) | |
637 counts = [ [0,]*N for l in range(0,L)] | |
638 | |
639 for o in ords : | |
640 for j,n in enumerate(o) : | |
641 if n<N : counts[ j][n] +=1 | |
642 | |
643 return counts | |
644 # end class SeqList | |
645 | |
646 | |
647 def dna(string) : | |
648 """Create an alphabetic sequence representing a stretch of DNA. | |
649 """ | |
650 return Seq(string, alphabet = dna_alphabet) | |
651 | |
652 def rna(string) : | |
653 """Create an alphabetic sequence representing a stretch of RNA. | |
654 """ | |
655 return Seq(string, alphabet = rna_alphabet) | |
656 | |
657 def protein(string) : | |
658 """Create an alphabetic sequence representing a stretch of polypeptide. | |
659 """ | |
660 return Seq(string, alphabet = protein_alphabet) | |
661 | |
662 | |
663 | |
664 | |
665 |