comparison commons/core/seq/Bioseq.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children 94ab73e8a190
comparison
equal deleted inserted replaced
5:ea3082881bf8 6:769e306b7933
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31
32 import sys
33 import string
34 import re
35 import random
36 import cStringIO
37 from commons.core.coord.Map import Map
38
39 DNA_ALPHABET_WITH_N = set( ['A','T','G','C','N'] )
40 IUPAC = set(['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N'])
41
42
43 ## Record a sequence with its header
44 #
45 class Bioseq( object ):
46
47 header = ""
48 sequence = ""
49
50 ## constructor
51 #
52 # @param name the header of sequence
53 # @param seq sequence (DNA, RNA, protein)
54 #
55 def __init__( self, name="", seq="" ):
56 self.header = name
57 self.sequence = seq
58
59
60 ## Equal operator
61 #
62 def __eq__( self, o ):
63 if self.header==o.header and self.sequence==o.sequence:
64 return True
65 return False
66
67
68 ## overload __repr__
69 #
70 def __repr__( self ):
71 return "%s;%s" % ( self.header, self.sequence )
72
73
74 ## set attribute header
75 #
76 # @param header a string
77 #
78 def setHeader( self, header ):
79 self.header = header
80
81
82 ## get attribute header
83 #
84 # @return header
85 def getHeader(self):
86 return self.header
87
88
89 ## set attribute sequence
90 #
91 # @param sequence a string
92 #
93 def setSequence( self, sequence ):
94 self.sequence = sequence
95
96
97 def getSequence(self):
98 return self.sequence
99
100 ## reset
101 #
102 def reset( self ):
103 self.setHeader( "" )
104 self.setSequence( "" )
105
106
107 ## Test if bioseq is empty
108 #
109 def isEmpty( self ):
110 return self.header == "" and self.sequence == ""
111
112
113 ## Reverse the sequence
114 #
115 def reverse( self ):
116 tmp = self.sequence
117 self.sequence = tmp[::-1]
118
119
120 ## Turn the sequence into its complement
121 # Force upper case letters
122 # @warning: old name in pyRepet.Bioseq realComplement
123 #
124 def complement( self ):
125 complement = ""
126 self.upCase()
127 for i in xrange(0,len(self.sequence),1):
128 if self.sequence[i] == "A":
129 complement += "T"
130 elif self.sequence[i] == "T":
131 complement += "A"
132 elif self.sequence[i] == "C":
133 complement += "G"
134 elif self.sequence[i] == "G":
135 complement += "C"
136 elif self.sequence[i] == "M":
137 complement += "K"
138 elif self.sequence[i] == "R":
139 complement += "Y"
140 elif self.sequence[i] == "W":
141 complement += "W"
142 elif self.sequence[i] == "S":
143 complement += "S"
144 elif self.sequence[i] == "Y":
145 complement += "R"
146 elif self.sequence[i] == "K":
147 complement += "M"
148 elif self.sequence[i] == "V":
149 complement += "B"
150 elif self.sequence[i] == "H":
151 complement += "D"
152 elif self.sequence[i] == "D":
153 complement += "H"
154 elif self.sequence[i] == "B":
155 complement += "V"
156 elif self.sequence[i] == "N":
157 complement += "N"
158 elif self.sequence[i] == "-":
159 complement += "-"
160 else:
161 print "WARNING: unknown symbol '%s', replacing it by N" % ( self.sequence[i] )
162 complement += "N"
163 self.sequence = complement
164
165
166 ## Reverse and complement the sequence
167 #
168 # Force upper case letters
169 # @warning: old name in pyRepet.Bioseq : complement
170 #
171 def reverseComplement( self ):
172 self.reverse()
173 self.complement()
174
175
176 ## Remove gap in the sequence
177 #
178 def cleanGap(self):
179 self.sequence = self.sequence.replace("-","")
180
181
182 ## Copy current Bioseq Instance
183 #
184 # @return: a Bioseq instance, a copy of current sequence.
185 #
186 def copyBioseqInstance(self):
187 seq = Bioseq()
188 seq.sequence = self.sequence
189 seq.header = self.header
190 return seq
191
192
193 ## Add phase information after the name of sequence in header
194 #
195 # @param phase integer representing phase (1, 2, 3, -1, -2, -3)
196 #
197 def setFrameInfoOnHeader(self, phase):
198 if " " in self.header:
199 name, desc = self.header.split(" ", 1)
200 name = name + "_" + str(phase)
201 self.header = name + " " + desc
202 else:
203 self.header = self.header + "_" + str(phase)
204
205
206 ## Fill Bioseq attributes with fasta file
207 #
208 # @param faFileHandler file handler of a fasta file
209 #
210 def read( self, faFileHandler ):
211 line = faFileHandler.readline()
212 if line == "":
213 self.header = None
214 self.sequence = None
215 return
216 while line == "\n":
217 line = faFileHandler.readline()
218 if line[0] == '>':
219 self.header = string.rstrip(line[1:])
220 else:
221 print "error, line is",string.rstrip(line)
222 return
223 line = " "
224 seq = cStringIO.StringIO()
225 while line:
226 prev_pos = faFileHandler.tell()
227 line = faFileHandler.readline()
228 if line == "":
229 break
230 if line[0] == '>':
231 faFileHandler.seek( prev_pos )
232 break
233 seq.write( string.rstrip(line) )
234 self.sequence = seq.getvalue()
235
236
237 ## Create a subsequence with a modified header
238 #
239 # @param s integer start a required subsequence
240 # @param e integer end a required subsequence
241 #
242 # @return a Bioseq instance, a subsequence of current sequence
243 #
244 def subseq( self, s, e=0 ):
245 if e == 0 :
246 e=len( self.sequence )
247 if s > e :
248 print "error: start must be < or = to end"
249 return
250 if s <= 0 :
251 print "error: start must be > 0"
252 return
253 sub = Bioseq()
254 sub.header = self.header + " fragment " + str(s) + ".." + str(e)
255 sub.sequence = self.sequence[(s-1):e]
256 return sub
257
258
259 ## Get the nucleotide or aminoacid at the given position
260 #
261 # @param pos integer nucleotide or aminoacid position
262 #
263 # @return a string
264 #
265 def getNtFromPosition(self, pos):
266 result = None
267 if not (pos < 1 or pos > self.getLength()):
268 result = self.sequence[pos - 1]
269 return result
270
271
272 ## Print in stdout the Bioseq in fasta format with 60 characters lines
273 #
274 # @param l length of required sequence default is whole sequence
275 #
276 def view(self,l=0):
277 print '>'+self.header
278 i=0
279 if(l==0):
280 l=len(self.sequence)
281 seq=self.sequence[0:l]
282
283 while i<len(seq):
284 print seq[i:i+60]
285 i=i+60
286
287
288 ## Get length of sequence
289 #
290 # @param avoidN boolean don't count 'N' nucleotides
291 #
292 # @return length of current sequence
293 #
294 def getLength( self, countN = True ):
295 if countN:
296 return len(self.sequence)
297 else:
298 return len(self.sequence) - self.countNt('N')
299
300
301 ## Return the proportion of a specific character
302 #
303 # @param nt character that we want to count
304 #
305 def propNt( self, nt ):
306 return self.countNt( nt ) / float( self.getLength() )
307
308
309 ## Count occurrence of specific character
310 #
311 # @param nt character that we want to count
312 #
313 # @return nb of occurrences
314 #
315 def countNt( self, nt ):
316 return self.sequence.count( nt )
317
318
319 ## Count occurrence of each nucleotide in current seq
320 #
321 # @return a dict, keys are nucleotides, values are nb of occurrences
322 #
323 def countAllNt( self ):
324 dNt2Count = {}
325 for nt in ["A","T","G","C","N"]:
326 dNt2Count[ nt ] = self.countNt( nt )
327 return dNt2Count
328
329
330 ## Return a dict with the number of occurrences for each combination of ATGC of specified size and number of word found
331 #
332 # @param size integer required length word
333 #
334 def occ_word( self, size ):
335 occ = {}
336 if size == 0:
337 return occ,0
338 nbword = 0
339 srch = re.compile('[^ATGC]+')
340 wordlist = self._createWordList( size )
341 for i in wordlist:
342 occ[i] = 0
343 lenseq = len(self.sequence)
344 i = 0
345 while i < lenseq-size+1:
346 word = self.sequence[i:i+size].upper()
347 m = srch.search(word)
348 if m == None:
349 occ[word] = occ[word]+1
350 nbword = nbword + 1
351 i = i + 1
352 else:
353 i = i + m.end(0)
354 return occ, nbword
355
356
357 ## Return a dictionary with the frequency of occurs for each combination of ATGC of specified size
358 #
359 # @param size integer required length word
360 #
361 def freq_word( self, size ):
362 dOcc, nbWords = self.occ_word( size )
363 freq = {}
364 for word in dOcc.keys():
365 freq[word] = float(dOcc[word]) / nbWords
366 return freq
367
368
369 ## Find ORF in each phase
370 #
371 # @return: a dict, keys are phases, values are stop codon positions.
372 #
373 def findORF (self):
374 orf = {0:[],1:[],2:[]}
375 length = len (self.sequence)
376 for i in xrange(0,length):
377 triplet = self.sequence[i:i+3]
378 if ( triplet == "TAA" or triplet == "TAG" or triplet == "TGA"):
379 phase = i % 3
380 orf[phase].append(i)
381 return orf
382
383
384 ## Convert the sequence into upper case
385 #
386 def upCase( self ):
387 self.sequence = self.sequence.upper()
388
389
390 ## Convert the sequence into lower case
391 #
392 def lowCase( self ):
393 self.sequence = self.sequence.lower()
394
395
396 ## Extract the cluster of the fragment (output from Grouper)
397 #
398 # @return cluster id (string)
399 #
400 def getClusterID( self ):
401 data = self.header.split()
402 return data[0].split("Cl")[1]
403
404
405 ## Extract the group of the sequence (output from Grouper)
406 #
407 # @return group id (string)
408 #
409 def getGroupID( self ):
410 data = self.header.split()
411 return data[0].split("Gr")[1].split("Cl")[0]
412
413
414 ## Get the header of the full sequence (output from Grouper)
415 #
416 # @example 'Dmel_Grouper_3091_Malign_3:LARD' from '>MbS1566Gr81Cl81 Dmel_Grouper_3091_Malign_3:LARD {Fragment} 1..5203'
417 # @return header (string)
418 #
419 def getHeaderFullSeq( self ):
420 data = self.header.split()
421 return data[1]
422
423
424 ## Get the strand of the fragment (output from Grouper)
425 #
426 # @return: strand (+ or -)
427 #
428 def getFragStrand( self ):
429 data = self.header.split()
430 coord = data[3].split("..")
431 if int(coord[0]) < int(coord[-1]):
432 return "+"
433 else:
434 return "-"
435
436
437 ## Get A, T, G, C or N from an IUPAC letter
438 # IUPAC = ['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N']
439 #
440 # @return A, T, G, C or N
441 #
442 def getATGCNFromIUPAC( self, nt ):
443 subset = ["A","T","G","C","N"]
444
445 if nt in subset:
446 return nt
447 elif nt == "U":
448 return "T"
449 elif nt == "R":
450 return random.choice( "AG" )
451 elif nt == "Y":
452 return random.choice( "CT" )
453 elif nt == "M":
454 return random.choice( "CA" )
455 elif nt == "K":
456 return random.choice( "TG" )
457 elif nt == "W":
458 return random.choice( "TA" )
459 elif nt == "S":
460 return random.choice( "CG" )
461 elif nt == "B":
462 return random.choice( "CTG" )
463 elif nt == "D":
464 return random.choice( "ATG" )
465 elif nt == "H":
466 return random.choice( "ATC" )
467 elif nt == "V":
468 return random.choice( "ACG" )
469 else:
470 return "N"
471
472
473 def getSeqWithOnlyATGCN( self ):
474 newSeq = ""
475 for nt in self.sequence:
476 newSeq += self.getATGCNFromIUPAC( nt )
477 return newSeq
478
479
480 ## Replace any symbol not in (A,T,G,C,N) by another nucleotide it represents
481 #
482 def partialIUPAC( self ):
483 self.sequence = self.getSeqWithOnlyATGCN()
484
485
486 ## Remove non Unix end-of-line symbols, if any
487 #
488 def checkEOF( self ):
489 symbol = "\r" # corresponds to '^M' from Windows
490 if symbol in self.sequence:
491 print "WARNING: Windows EOF removed in '%s'" % ( self.header )
492 sys.stdout.flush()
493 newSeq = self.sequence.replace( symbol, "" )
494 self.sequence = newSeq
495
496
497 ## Write Bioseq instance into a fasta file handler
498 #
499 # @param faFileHandler file handler of a fasta file
500 #
501 def write( self, faFileHandler ):
502 faFileHandler.write( ">%s\n" % ( self.header ) )
503 self.writeSeqInFasta( faFileHandler )
504
505
506 ## Write only the sequence of Bioseq instance into a fasta file handler
507 #
508 # @param faFileHandler file handler of a fasta file
509 #
510 def writeSeqInFasta( self, faFileHandler ):
511 i = 0
512 while i < self.getLength():
513 faFileHandler.write( "%s\n" % ( self.sequence[i:i+60] ) )
514 i += 60
515
516
517 ## Append Bioseq instance to a fasta file
518 #
519 # @param faFile name of a fasta file as a string
520 # @param mode 'write' or 'append'
521 #
522 def save( self, faFile, mode="a" ):
523 faFileHandler = open( faFile, mode )
524 self.write( faFileHandler )
525 faFileHandler.close()
526
527
528 ## Append Bioseq instance to a fasta file
529 #
530 # @param faFile name of a fasta file as a string
531 #
532 def appendBioseqInFile( self, faFile ):
533 self.save( faFile, "a" )
534
535
536 ## Write Bioseq instance into a fasta file handler
537 #
538 # @param faFileHandler file handler on a file with writing right
539 #
540 def writeABioseqInAFastaFile( self, faFileHandler ):
541 self.write( faFileHandler )
542
543
544 ## Write Bioseq instance with other header into a fasta file handler
545 #
546 # @param faFileHandler file handler on a file with writing right
547 # @param otherHeader a string representing a new header (without the > and the \n)
548 #
549 def writeWithOtherHeader( self, faFileHandler, otherHeader ):
550 self.header = otherHeader
551 self.write( faFileHandler )
552
553
554 ## Append Bioseq header and Bioseq sequence in a fasta file
555 #
556 # @param faFileHandler file handler on a file with writing right
557 # @param otherHeader a string representing a new header (without the > and the \n)
558 #
559 def writeABioseqInAFastaFileWithOtherHeader( self, faFileHandler, otherHeader ):
560 self.writeWithOtherHeader( faFileHandler, otherHeader )
561
562
563 ## get the list of Maps corresponding to seq without gap
564 #
565 # @warning This method was called getMap() in pyRepet.Bioseq
566 # @return a list of Map object
567 #
568 def getLMapWhithoutGap( self ):
569 lMaps = []
570 countSite = 1
571 countSubseq = 1
572 inGap = False
573 startMap = -1
574 endMap = -1
575
576 # initialize with the first site
577 if self.sequence[0] == "-":
578 inGap = True
579 else:
580 startMap = countSite
581
582 # for each remaining site
583 for site in self.sequence[1:]:
584 countSite += 1
585
586 # if it is a gap
587 if site == "-":
588
589 # if this is the beginning of a gap, record the previous subsequence
590 if inGap == False:
591 inGap = True
592 endMap = countSite - 1
593 lMaps.append( Map( "%s_subSeq%i" % (self.header,countSubseq), self.header, startMap, endMap ) )
594 countSubseq += 1
595
596 # if it is NOT a gap
597 if site != "-":
598
599 # if it is the end of a gap, begin the next subsequence
600 if inGap == True:
601 inGap = False
602 startMap = countSite
603
604 # if it is the last site
605 if countSite == self.getLength():
606 endMap = countSite
607 lMaps.append( Map( "%s_subSeq%i" % (self.header,countSubseq), self.header, startMap, endMap ) )
608
609 return lMaps
610
611
612 ## get the percentage of GC
613 #
614 # @return a percentage
615 #
616 def getGCpercentage( self ):
617 tmpSeq = self.getSeqWithOnlyATGCN()
618 nbGC = tmpSeq.count( "G" ) + tmpSeq.count( "C" )
619 return 100 * nbGC / float( self.getLength() )
620
621 ## get the percentage of GC of a sequence without counting N in sequence length
622 #
623 # @return a percentage
624 #
625 def getGCpercentageInSequenceWithoutCountNInLength(self):
626 tmpSeq = self.getSeqWithOnlyATGCN()
627 nbGC = tmpSeq.count( "G" ) + tmpSeq.count( "C" )
628 return 100 * nbGC / float( self.getLength() - self.countNt("N") )
629
630 ## get the 5 prime subsequence of a given length at the given position
631 #
632 # @param position integer
633 # @param flankLength integer subsequence length
634 # @return a sequence string
635 #
636 def get5PrimeFlank(self, position, flankLength):
637 if(position == 1):
638 return ""
639 else:
640 startOfFlank = 1
641 endOfFlank = position -1
642
643 if((position - flankLength) > 0):
644 startOfFlank = position - flankLength
645 else:
646 startOfFlank = 1
647
648 return self.subseq(startOfFlank, endOfFlank).sequence
649
650
651 ## get the 3 prime subsequence of a given length at the given position
652 # In the case of indels, the polymorphism length can be specified
653 #
654 # @param position integer
655 # @param flankLength integer subsequence length
656 # @param polymLength integer polymorphism length
657 # @return a sequence string
658 #
659 def get3PrimeFlank(self, position, flankLength, polymLength = 1):
660 if((position + polymLength) > len( self.sequence )):
661 return ""
662 else:
663 startOfFlank = position + polymLength
664
665 if((position+polymLength+flankLength) > len( self.sequence )):
666 endOfFlank = len( self.sequence )
667 else:
668 endOfFlank = position+polymLength+flankLength-1
669
670 return self.subseq(startOfFlank, endOfFlank).sequence
671
672
673 def _createWordList(self,size,l=['A','T','G','C']):
674 if size == 1 :
675 return l
676 else:
677 l2 = []
678 for i in l:
679 for j in ['A','T','G','C']:
680 l2.append( i + j )
681 return self._createWordList(size-1,l2)
682
683
684 def removeSymbol( self, symbol ):
685 tmp = self.sequence.replace( symbol, "" )
686 self.sequence = tmp