Mercurial > repos > davidmurphy > codonlogo
comparison corebio/seq.py @ 0:c55bdc2fb9fa
Uploaded
| author | davidmurphy | 
|---|---|
| date | Thu, 27 Oct 2011 12:09:09 -0400 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:c55bdc2fb9fa | 
|---|---|
| 1 | |
| 2 # Copyright (c) 2005 Gavin E. Crooks <gec@compbio.berkeley.edu> | |
| 3 # | |
| 4 # This software is distributed under the MIT Open Source License. | |
| 5 # <http://www.opensource.org/licenses/mit-license.html> | |
| 6 # | |
| 7 # Permission is hereby granted, free of charge, to any person obtaining a | |
| 8 # copy of this software and associated documentation files (the "Software"), | |
| 9 # to deal in the Software without restriction, including without limitation | |
| 10 # the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
| 11 # and/or sell copies of the Software, and to permit persons to whom the | |
| 12 # Software is furnished to do so, subject to the following conditions: | |
| 13 # | |
| 14 # The above copyright notice and this permission notice shall be included | |
| 15 # in all copies or substantial portions of the Software. | |
| 16 # | |
| 17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| 20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| 22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
| 23 # THE SOFTWARE. | |
| 24 # | |
| 25 | |
| 26 | |
| 27 | |
| 28 """ Alphabetic sequences and associated tools and data. | |
| 29 | |
| 30 Seq is a subclass of a python string with additional annotation and an alphabet. | |
| 31 The characters in string must be contained in the alphabet. Various standard | |
| 32 alphabets are provided. | |
| 33 | |
| 34 | |
| 35 Classes : | |
| 36 Alphabet -- A subset of non-null ascii characters | |
| 37 Seq -- An alphabetic string | |
| 38 SeqList -- A collection of Seq's | |
| 39 | |
| 40 Alphabets : | |
| 41 o generic_alphabet -- A generic alphabet. Any printable ASCII character. | |
| 42 o protein_alphabet -- IUCAP/IUB Amino Acid one letter codes. | |
| 43 o nucleic_alphabet -- IUPAC/IUB Nucleic Acid codes 'ACGTURYSWKMBDHVN-' | |
| 44 o dna_alphabet -- Same as nucleic_alphabet, with 'U' (Uracil) an | |
| 45 alternative for 'T' (Thymidine). | |
| 46 o rna_alphabet -- Same as nucleic_alphabet, with 'T' (Thymidine) an | |
| 47 alternative for 'U' (Uracil). | |
| 48 o reduced_nucleic_alphabet -- All ambiguous codes in 'nucleic_alphabet' are | |
| 49 alternative to 'N' (aNy) | |
| 50 o reduced_protein_alphabet -- All ambiguous ('BZJ') and non-canonical amino | |
| 51 acids codes ( 'U', Selenocysteine and 'O', Pyrrolysine) in | |
| 52 'protein_alphabet' are alternative to 'X'. | |
| 53 o unambiguous_dna_alphabet -- 'ACGT' | |
| 54 o unambiguous_rna_alphabet -- 'ACGU' | |
| 55 o unambiguous_protein_alphabet -- The twenty canonical amino acid one letter | |
| 56 codes, in alphabetic order, 'ACDEFGHIKLMNPQRSTVWY' | |
| 57 | |
| 58 Amino Acid Codes: | |
| 59 Code Alt. Meaning | |
| 60 ----------------- | |
| 61 A Alanine | |
| 62 B Aspartic acid or Asparagine | |
| 63 C Cysteine | |
| 64 D Aspartate | |
| 65 E Glutamate | |
| 66 F Phenylalanine | |
| 67 G Glycine | |
| 68 H Histidine | |
| 69 I Isoleucine | |
| 70 J Leucine or Isoleucine | |
| 71 K Lysine | |
| 72 L Leucine | |
| 73 M Methionine | |
| 74 N Asparagine | |
| 75 O Pyrrolysine | |
| 76 P Proline | |
| 77 Q Glutamine | |
| 78 R Arginine | |
| 79 S Serine | |
| 80 T Threonine | |
| 81 U Selenocysteine | |
| 82 V Valine | |
| 83 W Tryptophan | |
| 84 Y Tyrosine | |
| 85 Z Glutamate or Glutamine | |
| 86 X ? any | |
| 87 * translation stop | |
| 88 - .~ gap | |
| 89 | |
| 90 Nucleotide Codes: | |
| 91 Code Alt. Meaning | |
| 92 ------------------------------ | |
| 93 A Adenosine | |
| 94 C Cytidine | |
| 95 G Guanine | |
| 96 T Thymidine | |
| 97 U Uracil | |
| 98 R G A (puRine) | |
| 99 Y T C (pYrimidine) | |
| 100 K G T (Ketone) | |
| 101 M A C (aMino group) | |
| 102 S G C (Strong interaction) | |
| 103 W A T (Weak interaction) | |
| 104 B G T C (not A) (B comes after A) | |
| 105 D G A T (not C) (D comes after C) | |
| 106 H A C T (not G) (H comes after G) | |
| 107 V G C A (not T, not U) (V comes after U) | |
| 108 N X? A G C T (aNy) | |
| 109 - .~ A gap | |
| 110 | |
| 111 | |
| 112 | |
| 113 | |
| 114 Refs: | |
| 115 http://www.chem.qmw.ac.uk/iupac/AminoAcid/A2021.html | |
| 116 http://www.chem.qmw.ac.uk/iubmb/misc/naseq.html | |
| 117 Status: | |
| 118 Beta | |
| 119 Authors: | |
| 120 GEC 2004,2005 | |
| 121 """ | |
| 122 | |
| 123 # TODO: Add this to docstring somewhere. | |
| 124 # To replace all ambiguous nucleic code by 'N', replace alphabet and then n | |
| 125 # normalize. | |
| 126 # | |
| 127 # >>> Seq( 'ACGT-RYKM', reduced_nucleic_alphabet).normalized() | |
| 128 # 'ACGT-NNNN' | |
| 129 | |
| 130 from array import array | |
| 131 from string import maketrans | |
| 132 from corebio.moremath import argmax, sqrt | |
| 133 | |
| 134 __all__ = [ | |
| 135 'Alphabet', | |
| 136 'Seq', | |
| 137 'rna', 'dna', 'protein', | |
| 138 'SeqList', | |
| 139 'generic_alphabet', | |
| 140 'protein_alphabet', | |
| 141 'nucleic_alphabet', | |
| 142 'dna_alphabet', | |
| 143 'rna_alphabet', | |
| 144 'reduced_nucleic_alphabet', | |
| 145 'reduced_protein_alphabet', | |
| 146 'unambiguous_dna_alphabet', | |
| 147 'unambiguous_dna_alphabet', | |
| 148 'unambiguous_rna_alphabet', | |
| 149 'unambiguous_protein_alphabet', | |
| 150 'generic_alphabet' | |
| 151 ] | |
| 152 | |
| 153 | |
| 154 | |
| 155 class Alphabet(object) : | |
| 156 """An ordered subset of printable ascii characters. | |
| 157 | |
| 158 Status: | |
| 159 Beta | |
| 160 Authors: | |
| 161 - GEC 2005 | |
| 162 """ | |
| 163 __slots__ = ['_letters', '_alternatives','_ord_table', '_chr_table'] | |
| 164 | |
| 165 # We're immutable, so use __new__ not __init__ | |
| 166 def __new__(cls, letters, alternatives= None) : | |
| 167 """Create a new, immutable Alphabet. | |
| 168 | |
| 169 arguments: | |
| 170 - letters -- the letters in the alphabet. The ordering determines | |
| 171 the ordinal position of each character in this alphabet. | |
| 172 - alt -- A list of (alternative, canonical) letters. The alternatives | |
| 173 are given the same ordinal position as the canonical character. | |
| 174 e.g. (('?','X'),('x', 'X')) states that '?' and 'x' are synomonous | |
| 175 with 'X'. Values that are not in 'letters' are ignored. Alternatives | |
| 176 that are already in 'letters' are also ignored. If the same | |
| 177 alternative character is used twice then the alternative is assigned | |
| 178 to the canonical character that occurs first in 'letters'. The | |
| 179 default is to assume that upper and lower case characters are | |
| 180 equivalent, unless both cases are included in 'letters'. | |
| 181 raises: | |
| 182 ValueError : Repetitive or otherwise illegal set of letters. | |
| 183 """ | |
| 184 self = object.__new__(cls) | |
| 185 | |
| 186 # Printable Ascii characters | |
| 187 ascii_letters = "".join([chr(__i) for __i in range(32,128)]) | |
| 188 | |
| 189 if letters is None : letters = ascii_letters | |
| 190 self._letters = letters | |
| 191 | |
| 192 equivalent_by_case = zip( 'abcdefghijklmnopqrstuvwxyz', | |
| 193 'ABCDEFGHIJKLMNOPQRSTUVWXYZ') | |
| 194 | |
| 195 if alternatives is None : alternatives = equivalent_by_case | |
| 196 | |
| 197 | |
| 198 # The ord_table maps between the ordinal position of a character in ascii | |
| 199 # and the ordinal position in this alphabet. Characters not in the | |
| 200 # alphabet are given a position of 255. The ord_table is stored as a | |
| 201 # string. | |
| 202 ord_table = ["\xff",] * 256 | |
| 203 for i,a in enumerate(letters) : | |
| 204 n = ord(a) | |
| 205 if n == 0 : | |
| 206 raise ValueError("Alphabet cannot contain null character \\0") | |
| 207 if ord_table[ n ] != "\xff": | |
| 208 raise ValueError("Repetitive alphabet") | |
| 209 ord_table[ n ] = chr(i) | |
| 210 | |
| 211 # Add alternatives | |
| 212 _from = [] | |
| 213 _to = [] | |
| 214 for e, c in alternatives : | |
| 215 if c in letters : | |
| 216 n = ord(e) | |
| 217 if ord_table[ n ] == "\xff" : # empty | |
| 218 ord_table[ n ] = ord_table[ ord(c)] | |
| 219 _from.append(e) | |
| 220 _to.append(c) | |
| 221 self._alternatives = (''.join(_from), ''.join(_to)) | |
| 222 | |
| 223 ord_table = "".join(ord_table) | |
| 224 assert( ord_table[0] == "\xff") | |
| 225 self._ord_table = ord_table | |
| 226 | |
| 227 # The chr_table maps between ordinal position in the alphabet letters | |
| 228 # and the ordinal position in ascii. This map is not the inverse of | |
| 229 # ord_table if there are alternatives. | |
| 230 chr_table = ["\x00"]*256 | |
| 231 for i,a in enumerate(letters) : | |
| 232 chr_table[ i ] = a | |
| 233 chr_table = "".join(chr_table) | |
| 234 self._chr_table = chr_table | |
| 235 | |
| 236 return self | |
| 237 | |
| 238 | |
| 239 def alphabetic(self, string) : | |
| 240 """True if all characters of the string are in this alphabet.""" | |
| 241 table = self._ord_table | |
| 242 for s in str(string): | |
| 243 if table[ord(s)] == "\xff" : | |
| 244 return False | |
| 245 return True | |
| 246 | |
| 247 def chr(self, n) : | |
| 248 """ The n'th character in the alphabet (zero indexed) or \\0 """ | |
| 249 return self._chr_table[n] | |
| 250 | |
| 251 def ord(self, c) : | |
| 252 """The ordinal position of the character c in this alphabet, | |
| 253 or 255 if no such character. | |
| 254 """ | |
| 255 return ord(self._ord_table[ord(c)]) | |
| 256 | |
| 257 def chrs(self, sequence_of_ints) : | |
| 258 """Convert a sequence of ordinals into an alphabetic string.""" | |
| 259 if not isinstance(sequence_of_ints, array) : | |
| 260 sequence_of_ints = array('B', sequence_of_ints) | |
| 261 s = sequence_of_ints.tostring().translate(self._chr_table) | |
| 262 return Seq(s, self) | |
| 263 | |
| 264 def ords(self, string) : | |
| 265 """Convert an alphabetic string into a byte array of ordinals.""" | |
| 266 string = str(string) | |
| 267 s = string.translate(self._ord_table) | |
| 268 a = array('B',s) | |
| 269 return a | |
| 270 | |
| 271 | |
| 272 def normalize(self, string) : | |
| 273 """Normalize an alphabetic string by converting all alternative symbols | |
| 274 to the canonical equivalent in 'letters'. | |
| 275 """ | |
| 276 if not self.alphabetic(string) : | |
| 277 raise ValueError("Not an alphabetic string.") | |
| 278 return self.chrs(self.ords(string)) | |
| 279 | |
| 280 def letters(self) : | |
| 281 """ Letters of the alphabet as a string.""" | |
| 282 return str(self) | |
| 283 | |
| 284 def _all_letters(self) : | |
| 285 """ All allowed letters, including alternatives.""" | |
| 286 let = [] | |
| 287 let.append(self._letters) | |
| 288 for key, value in self._alternatives : | |
| 289 let.append(value) | |
| 290 return ''.join(let) | |
| 291 | |
| 292 def __repr__(self) : | |
| 293 return "Alphabet( '" + self._letters +"', zip"+ repr(self._alternatives)+" )" | |
| 294 | |
| 295 def __str__(self) : | |
| 296 return str(self._letters) | |
| 297 | |
| 298 def __len__(self) : | |
| 299 return len(self._letters) | |
| 300 | |
| 301 def __eq__(self, other) : | |
| 302 if not hasattr(other, "_ord_table") : return False | |
| 303 return self._ord_table == other._ord_table | |
| 304 | |
| 305 def __ne__(self, other) : | |
| 306 return not self.__eq__(other) | |
| 307 | |
| 308 def __iter__(self) : | |
| 309 return iter(self._letters) | |
| 310 | |
| 311 def __getitem__(self, key) : | |
| 312 return self._letters[key] | |
| 313 | |
| 314 | |
| 315 # End class Alphabet | |
| 316 | |
| 317 # ------------------- Standard ALPHABETS ------------------- | |
| 318 # Standard alphabets are defined here, after Alphabet class. | |
| 319 | |
| 320 generic_alphabet = Alphabet(None, None) | |
| 321 | |
| 322 | |
| 323 protein_alphabet = Alphabet('ACDEFGHIKLMNOPQRSTUVWYBJZX*-', | |
| 324 zip('acdefghiklmnopqrstuvwybjzx?.~', | |
| 325 'ACDEFGHIKLMNOPQRSTUVWYBJZXX--') ) | |
| 326 | |
| 327 | |
| 328 nucleic_alphabet = Alphabet("ACGTURYSWKMBDHVN-", | |
| 329 zip("acgturyswkmbdhvnXx?.~", | |
| 330 "ACGTURYSWKMBDHVNNNN--") ) | |
| 331 | |
| 332 dna_alphabet = Alphabet("ACGTRYSWKMBDHVN-", | |
| 333 zip('acgtryswkmbdhvnXx?.~Uu', | |
| 334 'ACGTRYSWKMBDHVNNNN--TT') ) | |
| 335 | |
| 336 rna_alphabet = Alphabet("ACGURYSWKMBDHVN-", | |
| 337 zip('acguryswkmbdhvnXx?.~Tt', | |
| 338 'ACGURYSWKMBDHVNNNN--UU') ) | |
| 339 | |
| 340 reduced_nucleic_alphabet = Alphabet("ACGTN-", | |
| 341 zip('acgtryswkmbdhvnXx?.~TtRYSWKMBDHV', | |
| 342 'ACGTNNNNNNNNNNNNNN--TTNNNNNNNNNN') ) | |
| 343 | |
| 344 reduced_protein_alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWYX*-', | |
| 345 zip('acdefghiklmnpqrstvwyx?.~BbZzUu', | |
| 346 'ACDEFGHIKLMNPQRSTVWYXX--XXXXCC') ) | |
| 347 | |
| 348 unambiguous_dna_alphabet = Alphabet("ACGT", zip('acgt','ACGT') ) | |
| 349 | |
| 350 unambiguous_rna_alphabet = Alphabet("ACGU", zip('acgu','ACGU') ) | |
| 351 | |
| 352 unambiguous_protein_alphabet = Alphabet("ACDEFGHIKLMNPQRSTVWY", | |
| 353 zip('acdefghiklmnopqrstuvwy', | |
| 354 'ACDEFGHIKLMNOPQRSTUVWY') ) | |
| 355 | |
| 356 | |
| 357 _complement_table = maketrans("ACGTRYSWKMBDHVN-acgtUuryswkmbdhvnXx?.~", | |
| 358 "TGCAYRSWMKVHDBN-tgcaAayrswmkvhdbnXx?.~") | |
| 359 | |
| 360 | |
| 361 | |
| 362 class Seq(str): | |
| 363 """ An alphabetic string. A subclass of "str" consisting solely of | |
| 364 letters from the same alphabet. | |
| 365 | |
| 366 Attributes: | |
| 367 alphabet -- A string or Alphabet of allowed characters. | |
| 368 name -- A short string used to identify the sequence. | |
| 369 description -- A string describing the sequence | |
| 370 | |
| 371 Authors : | |
| 372 GEC 2005 | |
| 373 """ | |
| 374 # TODO: need a method to return a copy of the string with a new alphabet, | |
| 375 # preserving the sequence, name and alphabet? | |
| 376 | |
| 377 def __new__(cls, obj, | |
| 378 alphabet= generic_alphabet, | |
| 379 name =None, description=None, | |
| 380 ): | |
| 381 self = str.__new__(cls, obj) | |
| 382 if alphabet is None: | |
| 383 alphabet = generic_alphabet | |
| 384 if not isinstance(alphabet, Alphabet): | |
| 385 alphabet = Alphabet(alphabet) | |
| 386 if not alphabet.alphabetic(self) : | |
| 387 raise ValueError("Sequence not alphabetic %s, '%s'" %(alphabet, self)) | |
| 388 | |
| 389 self._alphabet=alphabet | |
| 390 self.name = name | |
| 391 self.description = description | |
| 392 | |
| 393 return self | |
| 394 | |
| 395 # BEGIN PROPERTIES | |
| 396 | |
| 397 # Make alphabet constant | |
| 398 def _get_alphabet(self): | |
| 399 return self._alphabet | |
| 400 alphabet = property(_get_alphabet) | |
| 401 | |
| 402 # END PROPERTIES | |
| 403 | |
| 404 | |
| 405 def ords(self) : | |
| 406 """ Convert sequence to an array of integers | |
| 407 in the range [0, len(alphabet) ) | |
| 408 """ | |
| 409 return self.alphabet.ords(self) | |
| 410 | |
| 411 def tally(self, alphabet = None): | |
| 412 """Counts the occurrences of alphabetic characters. | |
| 413 | |
| 414 Arguments: | |
| 415 - alphabet -- an optional alternative alphabet | |
| 416 | |
| 417 Returns : | |
| 418 A list of character counts in alphabetic order. | |
| 419 """ | |
| 420 # Renamed from count() since this conflicts with str.count(). | |
| 421 if not alphabet : alphabet = self.alphabet | |
| 422 L = len(alphabet) | |
| 423 counts = [0,] * L | |
| 424 | |
| 425 ords = alphabet.ords(self) | |
| 426 | |
| 427 for n in ords: | |
| 428 if n<L : counts[n] +=1 | |
| 429 | |
| 430 return counts | |
| 431 | |
| 432 | |
| 433 def kmers(self, alphabet = None, k=1): | |
| 434 """Counts the occurrences of overlapping alphabetic subsequences. | |
| 435 | |
| 436 Arguments: | |
| 437 - alphabet -- an optional alternative alphabet | |
| 438 - k -- subsequence length. Default: 1 (monomers) | |
| 439 | |
| 440 Returns : | |
| 441 A list of kmers counts in alphabetic order. | |
| 442 Status : | |
| 443 Alpha -- Not sure on interface. Will only work for small k | |
| 444 """ | |
| 445 # TODO: Refactor? | |
| 446 # TODO: Rename 'kmers' to 'words' or word_count | |
| 447 if not alphabet : alphabet = self.alphabet | |
| 448 | |
| 449 L = len(alphabet) | |
| 450 N = L**k | |
| 451 counts = [0,]*N | |
| 452 | |
| 453 ords = alphabet.ords(self) | |
| 454 | |
| 455 | |
| 456 # Easy case | |
| 457 if k==1 : | |
| 458 for n in ords: | |
| 459 if n<N : counts[n] +=1 | |
| 460 return counts | |
| 461 | |
| 462 # Kmer counting. | |
| 463 # FIXME: This code assumes that k isn't too large. | |
| 464 | |
| 465 # e.g. L =10, k = 3, multi = [100,10,1] | |
| 466 multi = [ L**i for i in range(k-1,-1,-1)] | |
| 467 | |
| 468 for i in range(len(ords)-k+1) : | |
| 469 if ords[i] >= N : # Skip non-alphabetic kmers | |
| 470 i += k | |
| 471 continue | |
| 472 #FIXME: this should be a function of alphabet? | |
| 473 n = sum([multi[j]* ords[i+j] for j in range(k) ]) | |
| 474 counts[n] +=1 | |
| 475 | |
| 476 return counts | |
| 477 | |
| 478 def __getslice__(self, i, j): | |
| 479 cls = self.__class__ | |
| 480 return cls( str.__getslice__(self,i,j), self.alphabet) | |
| 481 | |
| 482 def __getitem__(self, key) : | |
| 483 cls = self.__class__ | |
| 484 return cls( str.__getitem__(self,key), self.alphabet) | |
| 485 | |
| 486 def __add__(self, other) : | |
| 487 # called for "self + other" | |
| 488 cls = self.__class__ | |
| 489 return cls( str.__add__(self, other), self.alphabet) | |
| 490 | |
| 491 def __radd__(self, other) : | |
| 492 # Called when "other + self" and other is superclass of self | |
| 493 cls = self.__class__ | |
| 494 return cls( str.__add__(self, other), self.alphabet) | |
| 495 | |
| 496 def join(self, str_list) : | |
| 497 cls = self.__class__ | |
| 498 return cls( super(Seq, self).join(str_list), self.alphabet) | |
| 499 | |
| 500 def __eq__(self, other) : | |
| 501 if not hasattr(other, "alphabet") : return False | |
| 502 if self.alphabet != other.alphabet : | |
| 503 return False | |
| 504 return str.__eq__(self, other) | |
| 505 | |
| 506 def __ne__(self, other) : | |
| 507 return not self.__eq__(other) | |
| 508 | |
| 509 def tostring(self) : | |
| 510 """ Converts Seq to a raw string. | |
| 511 """ | |
| 512 # Compatibility with biopython | |
| 513 return str(self) | |
| 514 | |
| 515 # ---- Transformations of Seq ---- | |
| 516 def reverse(self) : | |
| 517 """Return the reversed sequence. | |
| 518 | |
| 519 Not that this method returns a new object, in contrast to | |
| 520 the in-place reverse() method of list objects. | |
| 521 """ | |
| 522 cls = self.__class__ | |
| 523 return cls( self[::-1], self.alphabet) | |
| 524 | |
| 525 def ungap(self) : | |
| 526 # FIXME: Gap symbols should be specified by the Alphabet? | |
| 527 return self.remove( '-.~') | |
| 528 | |
| 529 def remove(self, delchars) : | |
| 530 """Return a new alphabetic sequence with all characters in 'delchars' | |
| 531 removed. | |
| 532 """ | |
| 533 cls = self.__class__ | |
| 534 return cls( str(self).translate(maketrans('',''), delchars), self.alphabet) | |
| 535 | |
| 536 def lower(self) : | |
| 537 """Return a lower case copy of the sequence. """ | |
| 538 cls = self.__class__ | |
| 539 trans = maketrans('ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz') | |
| 540 return cls(str(self).translate(trans), self.alphabet) | |
| 541 | |
| 542 def upper(self) : | |
| 543 """Return a lower case copy of the sequence. """ | |
| 544 cls = self.__class__ | |
| 545 trans = maketrans('abcdefghijklmnopqrstuvwxyz','ABCDEFGHIJKLMNOPQRSTUVWXYZ') | |
| 546 return cls(str(self).translate(trans), self.alphabet) | |
| 547 | |
| 548 def mask(self, letters= 'abcdefghijklmnopqrstuvwxyz', mask='X') : | |
| 549 """Replace all occurences of letters with the mask character. | |
| 550 The default is to replace all lower case letters with 'X'. | |
| 551 """ | |
| 552 LL = len(letters) | |
| 553 if len(mask) !=1 : | |
| 554 raise ValueError("Mask should be single character") | |
| 555 to = mask * LL | |
| 556 trans = maketrans( letters, to) | |
| 557 cls = self.__class__ | |
| 558 return cls(str(self).translate(trans), self.alphabet) | |
| 559 | |
| 560 def translate(self) : | |
| 561 """Translate a nucleotide sequence to a polypeptide using full | |
| 562 IUPAC ambiguities in DNA/RNA and amino acid codes, using the | |
| 563 standard genetic code. See corebio.transform.GeneticCode for | |
| 564 details and more options. | |
| 565 """ | |
| 566 # Note: masks str.translate | |
| 567 from transform import GeneticCode | |
| 568 return GeneticCode.std().translate(self) | |
| 569 | |
| 570 def back_translate(self) : | |
| 571 """Translate a protein sequence back into coding DNA, using using the | |
| 572 standard genetic code. See corebio.transform.GeneticCode for | |
| 573 details and more options. | |
| 574 """ | |
| 575 from transform import GeneticCode | |
| 576 return GeneticCode.std().back_translate(self) | |
| 577 | |
| 578 | |
| 579 def reverse_complement(self) : | |
| 580 """Returns reversed complementary nucleic acid sequence (i.e. the other | |
| 581 strand of a DNA sequence.) | |
| 582 """ | |
| 583 return self.reverse().complement() | |
| 584 | |
| 585 def complement(self) : | |
| 586 """Returns complementary nucleic acid sequence.""" | |
| 587 if not nucleic_alphabet.alphabetic(self.alphabet): | |
| 588 raise ValueError("Incompatable alphabets") | |
| 589 s = str.translate(self, _complement_table) | |
| 590 cls = self.__class__ | |
| 591 return cls(s, self.alphabet, self.name, self.description) | |
| 592 | |
| 593 | |
| 594 # end class Seq | |
| 595 | |
| 596 | |
| 597 class SeqList(list): | |
| 598 """ A list of sequences. | |
| 599 | |
| 600 Status: | |
| 601 Beta | |
| 602 """ | |
| 603 # TODO: If alphabet given, we should ensure that all sequences conform. | |
| 604 # TODO: Need an isaligned() method. All seqs same length, same alphabet. | |
| 605 __slots__ =["alphabet", "name", "description"] | |
| 606 | |
| 607 def __init__(self, alist=[], alphabet=None, name=None, description=None): | |
| 608 list.__init__(self, alist) | |
| 609 self.alphabet = alphabet | |
| 610 self.name = name | |
| 611 self.description = description | |
| 612 | |
| 613 # TOOWTDI. Replicates seq_io.read() | |
| 614 #@classmethod | |
| 615 #def read(cls, afile, alphabet = None): | |
| 616 # return corebio.seq_io.read(afile, alphabet) | |
| 617 #read = classmethod(read) | |
| 618 | |
| 619 def ords(self, alphabet=None) : | |
| 620 """ Convert sequence list into a 2D array of ordinals. | |
| 621 """ | |
| 622 if not alphabet : alphabet = self.alphabet | |
| 623 if not alphabet : raise ValueError("No alphabet") | |
| 624 k = [] | |
| 625 for s in self: | |
| 626 k.append( alphabet.ords(s) ) | |
| 627 return k | |
| 628 | |
| 629 def tally(self, alphabet = None): | |
| 630 """Counts the occurrences of characters in each column.""" | |
| 631 if not alphabet : alphabet = self.alphabet | |
| 632 if not alphabet : raise ValueError("No alphabet") | |
| 633 | |
| 634 N = len(alphabet) | |
| 635 ords = self.ords(alphabet) | |
| 636 L = len(ords[0]) | |
| 637 counts = [ [0,]*N for l in range(0,L)] | |
| 638 | |
| 639 for o in ords : | |
| 640 for j,n in enumerate(o) : | |
| 641 if n<N : counts[ j][n] +=1 | |
| 642 | |
| 643 return counts | |
| 644 # end class SeqList | |
| 645 | |
| 646 | |
| 647 def dna(string) : | |
| 648 """Create an alphabetic sequence representing a stretch of DNA. | |
| 649 """ | |
| 650 return Seq(string, alphabet = dna_alphabet) | |
| 651 | |
| 652 def rna(string) : | |
| 653 """Create an alphabetic sequence representing a stretch of RNA. | |
| 654 """ | |
| 655 return Seq(string, alphabet = rna_alphabet) | |
| 656 | |
| 657 def protein(string) : | |
| 658 """Create an alphabetic sequence representing a stretch of polypeptide. | |
| 659 """ | |
| 660 return Seq(string, alphabet = protein_alphabet) | |
| 661 | |
| 662 | |
| 663 | |
| 664 | |
| 665 | 
