36
|
1 # Copyright INRA (Institut National de la Recherche Agronomique)
|
|
2 # http://www.inra.fr
|
|
3 # http://urgi.versailles.inra.fr
|
|
4 #
|
|
5 # This software is governed by the CeCILL license under French law and
|
|
6 # abiding by the rules of distribution of free software. You can use,
|
|
7 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
8 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
9 # "http://www.cecill.info".
|
|
10 #
|
|
11 # As a counterpart to the access to the source code and rights to copy,
|
|
12 # modify and redistribute granted by the license, users are provided only
|
|
13 # with a limited warranty and the software's author, the holder of the
|
|
14 # economic rights, and the successive licensors have only limited
|
|
15 # liability.
|
|
16 #
|
|
17 # In this respect, the user's attention is drawn to the risks associated
|
|
18 # with loading, using, modifying and/or developing or reproducing the
|
|
19 # software by the user in light of its specific status of free software,
|
|
20 # that may mean that it is complicated to manipulate, and that also
|
|
21 # therefore means that it is reserved for developers and experienced
|
|
22 # professionals having in-depth computer knowledge. Users are therefore
|
|
23 # encouraged to load and test the software's suitability as regards their
|
|
24 # requirements in conditions enabling the security of their systems and/or
|
|
25 # data to be ensured and, more generally, to use and operate it in the
|
|
26 # same conditions as regards security.
|
|
27 #
|
|
28 # The fact that you are presently reading this means that you have had
|
|
29 # knowledge of the CeCILL license and that you accept its terms.
|
|
30
|
|
31
|
|
32 import sys
|
|
33 import string
|
|
34 import re
|
|
35 import random
|
|
36 import cStringIO
|
|
37 from commons.core.coord.Map import Map
|
|
38 from commons.core.checker.RepetException import RepetException
|
|
39
|
|
40 DNA_ALPHABET_WITH_N = set( ['A','T','G','C','N'] )
|
|
41 IUPAC = set(['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N'])
|
|
42
|
|
43
|
|
44 ## Record a sequence with its header
|
|
45 #
|
|
46 class Bioseq( object ):
|
|
47
|
|
48 header = ""
|
|
49 sequence = ""
|
|
50
|
|
51 ## constructor
|
|
52 #
|
|
53 # @param name the header of sequence
|
|
54 # @param seq sequence (DNA, RNA, protein)
|
|
55 #
|
|
56 def __init__( self, name="", seq="" ):
|
|
57 self.header = name
|
|
58 self.sequence = seq
|
|
59
|
|
60
|
|
61 ## Equal operator
|
|
62 #
|
|
63 def __eq__( self, o ):
|
|
64 if self.header==o.header and self.sequence==o.sequence:
|
|
65 return True
|
|
66 return False
|
|
67
|
|
68
|
|
69 ## overload __repr__
|
|
70 #
|
|
71 def __repr__( self ):
|
|
72 return "%s;%s" % ( self.header, self.sequence )
|
|
73
|
|
74
|
|
75 ## set attribute header
|
|
76 #
|
|
77 # @param header a string
|
|
78 #
|
|
79 def setHeader( self, header ):
|
|
80 self.header = header
|
|
81
|
|
82
|
|
83 ## get attribute header
|
|
84 #
|
|
85 # @return header
|
|
86 def getHeader(self):
|
|
87 return self.header
|
|
88
|
|
89
|
|
90 ## set attribute sequence
|
|
91 #
|
|
92 # @param sequence a string
|
|
93 #
|
|
94 def setSequence( self, sequence ):
|
|
95 self.sequence = sequence
|
|
96
|
|
97
|
|
98 def getSequence(self):
|
|
99 return self.sequence
|
|
100
|
|
101 ## reset
|
|
102 #
|
|
103 def reset( self ):
|
|
104 self.setHeader( "" )
|
|
105 self.setSequence( "" )
|
|
106
|
|
107
|
|
108 ## Test if bioseq is empty
|
|
109 #
|
|
110 def isEmpty( self ):
|
|
111 return self.header == "" and self.sequence == ""
|
|
112
|
|
113
|
|
114 ## Reverse the sequence
|
|
115 #
|
|
116 def reverse( self ):
|
|
117 tmp = self.sequence
|
|
118 self.sequence = tmp[::-1]
|
|
119
|
|
120
|
|
121 ## Turn the sequence into its complement
|
|
122 # Force upper case letters
|
|
123 # @warning: old name in pyRepet.Bioseq realComplement
|
|
124 #
|
|
125 def complement( self ):
|
|
126 complement = ""
|
|
127 self.upCase()
|
|
128 for i in xrange(0,len(self.sequence),1):
|
|
129 if self.sequence[i] == "A":
|
|
130 complement += "T"
|
|
131 elif self.sequence[i] == "T":
|
|
132 complement += "A"
|
|
133 elif self.sequence[i] == "C":
|
|
134 complement += "G"
|
|
135 elif self.sequence[i] == "G":
|
|
136 complement += "C"
|
|
137 elif self.sequence[i] == "M":
|
|
138 complement += "K"
|
|
139 elif self.sequence[i] == "R":
|
|
140 complement += "Y"
|
|
141 elif self.sequence[i] == "W":
|
|
142 complement += "W"
|
|
143 elif self.sequence[i] == "S":
|
|
144 complement += "S"
|
|
145 elif self.sequence[i] == "Y":
|
|
146 complement += "R"
|
|
147 elif self.sequence[i] == "K":
|
|
148 complement += "M"
|
|
149 elif self.sequence[i] == "V":
|
|
150 complement += "B"
|
|
151 elif self.sequence[i] == "H":
|
|
152 complement += "D"
|
|
153 elif self.sequence[i] == "D":
|
|
154 complement += "H"
|
|
155 elif self.sequence[i] == "B":
|
|
156 complement += "V"
|
|
157 elif self.sequence[i] == "N":
|
|
158 complement += "N"
|
|
159 elif self.sequence[i] == "-":
|
|
160 complement += "-"
|
|
161 else:
|
|
162 print "WARNING: unknown symbol '%s', replacing it by N" % ( self.sequence[i] )
|
|
163 complement += "N"
|
|
164 self.sequence = complement
|
|
165
|
|
166
|
|
167 ## Reverse and complement the sequence
|
|
168 #
|
|
169 # Force upper case letters
|
|
170 # @warning: old name in pyRepet.Bioseq : complement
|
|
171 #
|
|
172 def reverseComplement( self ):
|
|
173 self.reverse()
|
|
174 self.complement()
|
|
175
|
|
176
|
|
177 ## Remove gap in the sequence
|
|
178 #
|
|
179 def cleanGap(self):
|
|
180 self.sequence = self.sequence.replace("-","")
|
|
181
|
|
182
|
|
183 ## Copy current Bioseq Instance
|
|
184 #
|
|
185 # @return: a Bioseq instance, a copy of current sequence.
|
|
186 #
|
|
187 def copyBioseqInstance(self):
|
|
188 seq = Bioseq()
|
|
189 seq.sequence = self.sequence
|
|
190 seq.header = self.header
|
|
191 return seq
|
|
192
|
|
193
|
|
194 ## Add phase information after the name of sequence in header
|
|
195 #
|
|
196 # @param phase integer representing phase (1, 2, 3, -1, -2, -3)
|
|
197 #
|
|
198 def setFrameInfoOnHeader(self, phase):
|
|
199 if " " in self.header:
|
|
200 name, desc = self.header.split(" ", 1)
|
|
201 name = name + "_" + str(phase)
|
|
202 self.header = name + " " + desc
|
|
203 else:
|
|
204 self.header = self.header + "_" + str(phase)
|
|
205
|
|
206
|
|
207 ## Fill Bioseq attributes with fasta file
|
|
208 #
|
|
209 # @param faFileHandler file handler of a fasta file
|
|
210 #
|
|
211 def read( self, faFileHandler ):
|
|
212 line = faFileHandler.readline()
|
|
213 if line == "":
|
|
214 self.header = None
|
|
215 self.sequence = None
|
|
216 return
|
|
217 while line == "\n":
|
|
218 line = faFileHandler.readline()
|
|
219 if line[0] == '>':
|
|
220 self.header = string.rstrip(line[1:])
|
|
221 else:
|
|
222 print "error, line is",string.rstrip(line)
|
|
223 return
|
|
224 line = " "
|
|
225 seq = cStringIO.StringIO()
|
|
226 while line:
|
|
227 prev_pos = faFileHandler.tell()
|
|
228 line = faFileHandler.readline()
|
|
229 if line == "":
|
|
230 break
|
|
231 if line[0] == '>':
|
|
232 faFileHandler.seek( prev_pos )
|
|
233 break
|
|
234 seq.write( string.rstrip(line) )
|
|
235 self.sequence = seq.getvalue()
|
|
236
|
|
237
|
|
238 ## Create a subsequence with a modified header
|
|
239 #
|
|
240 # @param s integer start a required subsequence
|
|
241 # @param e integer end a required subsequence
|
|
242 #
|
|
243 # @return a Bioseq instance, a subsequence of current sequence
|
|
244 #
|
|
245 def subseq( self, s, e=0 ):
|
|
246 if e == 0 :
|
|
247 e=len( self.sequence )
|
|
248 if s > e :
|
|
249 print "error: start must be < or = to end"
|
|
250 return
|
|
251 if s <= 0 :
|
|
252 print "error: start must be > 0"
|
|
253 return
|
|
254 sub = Bioseq()
|
|
255 sub.header = self.header + " fragment " + str(s) + ".." + str(e)
|
|
256 sub.sequence = self.sequence[(s-1):e]
|
|
257 return sub
|
|
258
|
|
259
|
|
260 ## Get the nucleotide or aminoacid at the given position
|
|
261 #
|
|
262 # @param pos integer nucleotide or aminoacid position
|
|
263 #
|
|
264 # @return a string
|
|
265 #
|
|
266 def getNtFromPosition(self, pos):
|
|
267 result = None
|
|
268 if not (pos < 1 or pos > self.getLength()):
|
|
269 result = self.sequence[pos - 1]
|
|
270 return result
|
|
271
|
|
272
|
|
273 ## Print in stdout the Bioseq in fasta format with 60 characters lines
|
|
274 #
|
|
275 # @param l length of required sequence default is whole sequence
|
|
276 #
|
|
277 def view(self,l=0):
|
|
278 print '>'+self.header
|
|
279 i=0
|
|
280 if(l==0):
|
|
281 l=len(self.sequence)
|
|
282 seq=self.sequence[0:l]
|
|
283
|
|
284 while i<len(seq):
|
|
285 print seq[i:i+60]
|
|
286 i=i+60
|
|
287
|
|
288
|
|
289 ## Get length of sequence
|
|
290 #
|
|
291 # @param avoidN boolean don't count 'N' nucleotides
|
|
292 #
|
|
293 # @return length of current sequence
|
|
294 #
|
|
295 def getLength( self, countN = True ):
|
|
296 if countN:
|
|
297 return len(self.sequence)
|
|
298 else:
|
|
299 return len(self.sequence) - self.countNt('N')
|
|
300
|
|
301
|
|
302 ## Return the proportion of a specific character
|
|
303 #
|
|
304 # @param nt character that we want to count
|
|
305 #
|
|
306 def propNt( self, nt ):
|
|
307 return self.countNt( nt ) / float( self.getLength() )
|
|
308
|
|
309
|
|
310 ## Count occurrence of specific character
|
|
311 #
|
|
312 # @param nt character that we want to count
|
|
313 #
|
|
314 # @return nb of occurrences
|
|
315 #
|
|
316 def countNt( self, nt ):
|
|
317 return self.sequence.count( nt )
|
|
318
|
|
319
|
|
320 ## Count occurrence of each nucleotide in current seq
|
|
321 #
|
|
322 # @return a dict, keys are nucleotides, values are nb of occurrences
|
|
323 #
|
|
324 def countAllNt( self ):
|
|
325 dNt2Count = {}
|
|
326 for nt in ["A","T","G","C","N"]:
|
|
327 dNt2Count[ nt ] = self.countNt( nt )
|
|
328 return dNt2Count
|
|
329
|
|
330
|
|
331 ## Return a dict with the number of occurrences for each combination of ATGC of specified size and number of word found
|
|
332 #
|
|
333 # @param size integer required length word
|
|
334 #
|
|
335 def occ_word( self, size ):
|
|
336 occ = {}
|
|
337 if size == 0:
|
|
338 return occ,0
|
|
339 nbword = 0
|
|
340 srch = re.compile('[^ATGC]+')
|
|
341 wordlist = self._createWordList( size )
|
|
342 for i in wordlist:
|
|
343 occ[i] = 0
|
|
344 lenseq = len(self.sequence)
|
|
345 i = 0
|
|
346 while i < lenseq-size+1:
|
|
347 word = self.sequence[i:i+size].upper()
|
|
348 m = srch.search(word)
|
|
349 if m == None:
|
|
350 occ[word] = occ[word]+1
|
|
351 nbword = nbword + 1
|
|
352 i = i + 1
|
|
353 else:
|
|
354 i = i + m.end(0)
|
|
355 return occ, nbword
|
|
356
|
|
357
|
|
358 ## Return a dictionary with the frequency of occurs for each combination of ATGC of specified size
|
|
359 #
|
|
360 # @param size integer required length word
|
|
361 #
|
|
362 def freq_word( self, size ):
|
|
363 dOcc, nbWords = self.occ_word( size )
|
|
364 freq = {}
|
|
365 for word in dOcc.keys():
|
|
366 freq[word] = float(dOcc[word]) / nbWords
|
|
367 return freq
|
|
368
|
|
369
|
|
370 ## Find ORF in each phase
|
|
371 #
|
|
372 # @return: a dict, keys are phases, values are stop codon positions.
|
|
373 #
|
|
374 def findORF (self):
|
|
375 orf = {0:[],1:[],2:[]}
|
|
376 length = len (self.sequence)
|
|
377 for i in xrange(0,length):
|
|
378 triplet = self.sequence[i:i+3]
|
|
379 if ( triplet == "TAA" or triplet == "TAG" or triplet == "TGA"):
|
|
380 phase = i % 3
|
|
381 orf[phase].append(i)
|
|
382 return orf
|
|
383
|
|
384
|
|
385 ## Convert the sequence into upper case
|
|
386 #
|
|
387 def upCase( self ):
|
|
388 self.sequence = self.sequence.upper()
|
|
389
|
|
390
|
|
391 ## Convert the sequence into lower case
|
|
392 #
|
|
393 def lowCase( self ):
|
|
394 self.sequence = self.sequence.lower()
|
|
395
|
|
396
|
|
397 ## Extract the cluster of the fragment (output from Grouper)
|
|
398 #
|
|
399 # @return cluster id (string)
|
|
400 #
|
|
401 def getClusterID( self ):
|
|
402 data = self.header.split()
|
|
403 return data[0].split("Cl")[1]
|
|
404
|
|
405
|
|
406 ## Extract the group of the sequence (output from Grouper)
|
|
407 #
|
|
408 # @return group id (string)
|
|
409 #
|
|
410 def getGroupID( self ):
|
|
411 data = self.header.split()
|
|
412 return data[0].split("Gr")[1].split("Cl")[0]
|
|
413
|
|
414
|
|
415 ## Get the header of the full sequence (output from Grouper)
|
|
416 #
|
|
417 # @example 'Dmel_Grouper_3091_Malign_3:LARD' from '>MbS1566Gr81Cl81 Dmel_Grouper_3091_Malign_3:LARD {Fragment} 1..5203'
|
|
418 # @return header (string)
|
|
419 #
|
|
420 def getHeaderFullSeq( self ):
|
|
421 data = self.header.split()
|
|
422 return data[1]
|
|
423
|
|
424
|
|
425 ## Get the strand of the fragment (output from Grouper)
|
|
426 #
|
|
427 # @return: strand (+ or -)
|
|
428 #
|
|
429 def getFragStrand( self ):
|
|
430 data = self.header.split()
|
|
431 coord = data[3].split("..")
|
|
432 if int(coord[0]) < int(coord[-1]):
|
|
433 return "+"
|
|
434 else:
|
|
435 return "-"
|
|
436
|
|
437
|
|
438 ## Get A, T, G, C or N from an IUPAC letter
|
|
439 # IUPAC = ['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N']
|
|
440 #
|
|
441 # @return A, T, G, C or N
|
|
442 #
|
|
443 def getATGCNFromIUPAC( self, nt ):
|
|
444 subset = ["A","T","G","C","N"]
|
|
445
|
|
446 if nt in subset:
|
|
447 return nt
|
|
448 elif nt == "U":
|
|
449 return "T"
|
|
450 elif nt == "R":
|
|
451 return random.choice( "AG" )
|
|
452 elif nt == "Y":
|
|
453 return random.choice( "CT" )
|
|
454 elif nt == "M":
|
|
455 return random.choice( "CA" )
|
|
456 elif nt == "K":
|
|
457 return random.choice( "TG" )
|
|
458 elif nt == "W":
|
|
459 return random.choice( "TA" )
|
|
460 elif nt == "S":
|
|
461 return random.choice( "CG" )
|
|
462 elif nt == "B":
|
|
463 return random.choice( "CTG" )
|
|
464 elif nt == "D":
|
|
465 return random.choice( "ATG" )
|
|
466 elif nt == "H":
|
|
467 return random.choice( "ATC" )
|
|
468 elif nt == "V":
|
|
469 return random.choice( "ACG" )
|
|
470 else:
|
|
471 return "N"
|
|
472
|
|
473 ## Get nucleotide from an IUPAC letter and a nucleotide
|
|
474 # Works only for IUPAC code with two possibilities ['R','Y','M','K','W','S']
|
|
475 # Examples:
|
|
476 # Y and C returns T
|
|
477 # Y and T returns C
|
|
478 # B and C throws RepetException
|
|
479 #
|
|
480 # @return A, T, G, C
|
|
481 #
|
|
482 def getATGCNFromIUPACandATGCN(self, IUPACCode, nt):
|
|
483 if IUPACCode == "R":
|
|
484 possibleNt = set(["A", "G"])
|
|
485 if nt not in possibleNt:
|
|
486 raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt))
|
|
487 return (possibleNt - set(nt)).pop()
|
|
488
|
|
489 elif IUPACCode == "Y":
|
|
490 possibleNt = set(["C", "T"])
|
|
491 if nt not in possibleNt:
|
|
492 raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt))
|
|
493 return (possibleNt - set(nt)).pop()
|
|
494
|
|
495 elif IUPACCode == "M":
|
|
496 possibleNt = set(["A", "C"])
|
|
497 if nt not in possibleNt:
|
|
498 raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt))
|
|
499 return (possibleNt - set(nt)).pop()
|
|
500
|
|
501 elif IUPACCode == "K":
|
|
502 possibleNt = set(["T", "G"])
|
|
503 if nt not in possibleNt:
|
|
504 raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt))
|
|
505 return (possibleNt - set(nt)).pop()
|
|
506
|
|
507 elif IUPACCode == "W":
|
|
508 possibleNt = set(["A", "T"])
|
|
509 if nt not in possibleNt:
|
|
510 raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt))
|
|
511 return (possibleNt - set(nt)).pop()
|
|
512
|
|
513 elif IUPACCode == "S":
|
|
514 possibleNt = set(["C", "G"])
|
|
515 if nt not in possibleNt:
|
|
516 raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt))
|
|
517 return (possibleNt - set(nt)).pop()
|
|
518
|
|
519 else:
|
|
520 raise RepetException("Can't retrieve the third nucleotide from IUPAC code '%s' and nucleotide '%s'" % (IUPACCode, nt))
|
|
521
|
|
522 def getSeqWithOnlyATGCN( self ):
|
|
523 newSeq = ""
|
|
524 for nt in self.sequence:
|
|
525 newSeq += self.getATGCNFromIUPAC( nt )
|
|
526 return newSeq
|
|
527
|
|
528
|
|
529 ## Replace any symbol not in (A,T,G,C,N) by another nucleotide it represents
|
|
530 #
|
|
531 def partialIUPAC( self ):
|
|
532 self.sequence = self.getSeqWithOnlyATGCN()
|
|
533
|
|
534
|
|
535 ## Remove non Unix end-of-line symbols, if any
|
|
536 #
|
|
537 def checkEOF( self ):
|
|
538 symbol = "\r" # corresponds to '^M' from Windows
|
|
539 if symbol in self.sequence:
|
|
540 print "WARNING: Windows EOF removed in '%s'" % ( self.header )
|
|
541 sys.stdout.flush()
|
|
542 newSeq = self.sequence.replace( symbol, "" )
|
|
543 self.sequence = newSeq
|
|
544
|
|
545
|
|
546 ## Write Bioseq instance into a fasta file handler
|
|
547 #
|
|
548 # @param faFileHandler file handler of a fasta file
|
|
549 #
|
|
550 def write( self, faFileHandler ):
|
|
551 faFileHandler.write( ">%s\n" % ( self.header ) )
|
|
552 self.writeSeqInFasta( faFileHandler )
|
|
553
|
|
554
|
|
555 ## Write only the sequence of Bioseq instance into a fasta file handler
|
|
556 #
|
|
557 # @param faFileHandler file handler of a fasta file
|
|
558 #
|
|
559 def writeSeqInFasta( self, faFileHandler ):
|
|
560 i = 0
|
|
561 while i < self.getLength():
|
|
562 faFileHandler.write( "%s\n" % ( self.sequence[i:i+60] ) )
|
|
563 i += 60
|
|
564
|
|
565
|
|
566 ## Append Bioseq instance to a fasta file
|
|
567 #
|
|
568 # @param faFile name of a fasta file as a string
|
|
569 # @param mode 'write' or 'append'
|
|
570 #
|
|
571 def save( self, faFile, mode="a" ):
|
|
572 faFileHandler = open( faFile, mode )
|
|
573 self.write( faFileHandler )
|
|
574 faFileHandler.close()
|
|
575
|
|
576
|
|
577 ## Append Bioseq instance to a fasta file
|
|
578 #
|
|
579 # @param faFile name of a fasta file as a string
|
|
580 #
|
|
581 def appendBioseqInFile( self, faFile ):
|
|
582 self.save( faFile, "a" )
|
|
583
|
|
584
|
|
585 ## Write Bioseq instance into a fasta file handler
|
|
586 #
|
|
587 # @param faFileHandler file handler on a file with writing right
|
|
588 #
|
|
589 def writeABioseqInAFastaFile( self, faFileHandler ):
|
|
590 self.write( faFileHandler )
|
|
591
|
|
592
|
|
593 ## Write Bioseq instance with other header into a fasta file handler
|
|
594 #
|
|
595 # @param faFileHandler file handler on a file with writing right
|
|
596 # @param otherHeader a string representing a new header (without the > and the \n)
|
|
597 #
|
|
598 def writeWithOtherHeader( self, faFileHandler, otherHeader ):
|
|
599 self.header = otherHeader
|
|
600 self.write( faFileHandler )
|
|
601
|
|
602
|
|
603 ## Append Bioseq header and Bioseq sequence in a fasta file
|
|
604 #
|
|
605 # @param faFileHandler file handler on a file with writing right
|
|
606 # @param otherHeader a string representing a new header (without the > and the \n)
|
|
607 #
|
|
608 def writeABioseqInAFastaFileWithOtherHeader( self, faFileHandler, otherHeader ):
|
|
609 self.writeWithOtherHeader( faFileHandler, otherHeader )
|
|
610
|
|
611
|
|
612 ## get the list of Maps corresponding to seq without gap
|
|
613 #
|
|
614 # @warning This method was called getMap() in pyRepet.Bioseq
|
|
615 # @return a list of Map object
|
|
616 #
|
|
617 def getLMapWhithoutGap( self ):
|
|
618 lMaps = []
|
|
619 countSite = 1
|
|
620 countSubseq = 1
|
|
621 inGap = False
|
|
622 startMap = -1
|
|
623 endMap = -1
|
|
624
|
|
625 # initialize with the first site
|
|
626 if self.sequence[0] == "-":
|
|
627 inGap = True
|
|
628 else:
|
|
629 startMap = countSite
|
|
630
|
|
631 # for each remaining site
|
|
632 for site in self.sequence[1:]:
|
|
633 countSite += 1
|
|
634
|
|
635 # if it is a gap
|
|
636 if site == "-":
|
|
637
|
|
638 # if this is the beginning of a gap, record the previous subsequence
|
|
639 if inGap == False:
|
|
640 inGap = True
|
|
641 endMap = countSite - 1
|
|
642 lMaps.append( Map( "%s_subSeq%i" % (self.header,countSubseq), self.header, startMap, endMap ) )
|
|
643 countSubseq += 1
|
|
644
|
|
645 # if it is NOT a gap
|
|
646 if site != "-":
|
|
647
|
|
648 # if it is the end of a gap, begin the next subsequence
|
|
649 if inGap == True:
|
|
650 inGap = False
|
|
651 startMap = countSite
|
|
652
|
|
653 # if it is the last site
|
|
654 if countSite == self.getLength():
|
|
655 endMap = countSite
|
|
656 lMaps.append( Map( "%s_subSeq%i" % (self.header,countSubseq), self.header, startMap, endMap ) )
|
|
657
|
|
658 return lMaps
|
|
659
|
|
660
|
|
661 ## get the percentage of GC
|
|
662 #
|
|
663 # @return a percentage
|
|
664 #
|
|
665 def getGCpercentage( self ):
|
|
666 tmpSeq = self.getSeqWithOnlyATGCN()
|
|
667 nbGC = tmpSeq.count( "G" ) + tmpSeq.count( "C" )
|
|
668 return 100 * nbGC / float( self.getLength() )
|
|
669
|
|
670 ## get the percentage of GC of a sequence without counting N in sequence length
|
|
671 #
|
|
672 # @return a percentage
|
|
673 #
|
|
674 def getGCpercentageInSequenceWithoutCountNInLength(self):
|
|
675 tmpSeq = self.getSeqWithOnlyATGCN()
|
|
676 nbGC = tmpSeq.count( "G" ) + tmpSeq.count( "C" )
|
|
677 return 100 * nbGC / float( self.getLength() - self.countNt("N") )
|
|
678
|
|
679 ## get the 5 prime subsequence of a given length at the given position
|
|
680 #
|
|
681 # @param position integer
|
|
682 # @param flankLength integer subsequence length
|
|
683 # @return a sequence string
|
|
684 #
|
|
685 def get5PrimeFlank(self, position, flankLength):
|
|
686 if(position == 1):
|
|
687 return ""
|
|
688 else:
|
|
689 startOfFlank = 1
|
|
690 endOfFlank = position -1
|
|
691
|
|
692 if((position - flankLength) > 0):
|
|
693 startOfFlank = position - flankLength
|
|
694 else:
|
|
695 startOfFlank = 1
|
|
696
|
|
697 return self.subseq(startOfFlank, endOfFlank).sequence
|
|
698
|
|
699
|
|
700 ## get the 3 prime subsequence of a given length at the given position
|
|
701 # In the case of indels, the polymorphism length can be specified
|
|
702 #
|
|
703 # @param position integer
|
|
704 # @param flankLength integer subsequence length
|
|
705 # @param polymLength integer polymorphism length
|
|
706 # @return a sequence string
|
|
707 #
|
|
708 def get3PrimeFlank(self, position, flankLength, polymLength = 1):
|
|
709 if((position + polymLength) > len( self.sequence )):
|
|
710 return ""
|
|
711 else:
|
|
712 startOfFlank = position + polymLength
|
|
713
|
|
714 if((position+polymLength+flankLength) > len( self.sequence )):
|
|
715 endOfFlank = len( self.sequence )
|
|
716 else:
|
|
717 endOfFlank = position+polymLength+flankLength-1
|
|
718
|
|
719 return self.subseq(startOfFlank, endOfFlank).sequence
|
|
720
|
|
721
|
|
722 def _createWordList(self,size,l=['A','T','G','C']):
|
|
723 if size == 1 :
|
|
724 return l
|
|
725 else:
|
|
726 l2 = []
|
|
727 for i in l:
|
|
728 for j in ['A','T','G','C']:
|
|
729 l2.append( i + j )
|
|
730 return self._createWordList(size-1,l2)
|
|
731
|
|
732
|
|
733 def removeSymbol( self, symbol ):
|
|
734 tmp = self.sequence.replace( symbol, "" )
|
|
735 self.sequence = tmp
|