6
|
1 # Copyright INRA (Institut National de la Recherche Agronomique)
|
|
2 # http://www.inra.fr
|
|
3 # http://urgi.versailles.inra.fr
|
|
4 #
|
|
5 # This software is governed by the CeCILL license under French law and
|
|
6 # abiding by the rules of distribution of free software. You can use,
|
|
7 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
8 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
9 # "http://www.cecill.info".
|
|
10 #
|
|
11 # As a counterpart to the access to the source code and rights to copy,
|
|
12 # modify and redistribute granted by the license, users are provided only
|
|
13 # with a limited warranty and the software's author, the holder of the
|
|
14 # economic rights, and the successive licensors have only limited
|
|
15 # liability.
|
|
16 #
|
|
17 # In this respect, the user's attention is drawn to the risks associated
|
|
18 # with loading, using, modifying and/or developing or reproducing the
|
|
19 # software by the user in light of its specific status of free software,
|
|
20 # that may mean that it is complicated to manipulate, and that also
|
|
21 # therefore means that it is reserved for developers and experienced
|
|
22 # professionals having in-depth computer knowledge. Users are therefore
|
|
23 # encouraged to load and test the software's suitability as regards their
|
|
24 # requirements in conditions enabling the security of their systems and/or
|
|
25 # data to be ensured and, more generally, to use and operate it in the
|
|
26 # same conditions as regards security.
|
|
27 #
|
|
28 # The fact that you are presently reading this means that you have had
|
|
29 # knowledge of the CeCILL license and that you accept its terms.
|
|
30
|
|
31
|
|
32 import sys
|
|
33 import string
|
|
34 import re
|
|
35 import random
|
|
36 import cStringIO
|
|
37 from commons.core.coord.Map import Map
|
|
38
|
|
39 DNA_ALPHABET_WITH_N = set( ['A','T','G','C','N'] )
|
|
40 IUPAC = set(['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N'])
|
|
41
|
|
42
|
|
43 ## Record a sequence with its header
|
|
44 #
|
|
45 class Bioseq( object ):
|
|
46
|
|
47 header = ""
|
|
48 sequence = ""
|
|
49
|
|
50 ## constructor
|
|
51 #
|
|
52 # @param name the header of sequence
|
|
53 # @param seq sequence (DNA, RNA, protein)
|
|
54 #
|
|
55 def __init__( self, name="", seq="" ):
|
|
56 self.header = name
|
|
57 self.sequence = seq
|
|
58
|
|
59
|
|
60 ## Equal operator
|
|
61 #
|
|
62 def __eq__( self, o ):
|
|
63 if self.header==o.header and self.sequence==o.sequence:
|
|
64 return True
|
|
65 return False
|
|
66
|
|
67
|
|
68 ## overload __repr__
|
|
69 #
|
|
70 def __repr__( self ):
|
|
71 return "%s;%s" % ( self.header, self.sequence )
|
|
72
|
|
73
|
|
74 ## set attribute header
|
|
75 #
|
|
76 # @param header a string
|
|
77 #
|
|
78 def setHeader( self, header ):
|
|
79 self.header = header
|
|
80
|
|
81
|
|
82 ## get attribute header
|
|
83 #
|
|
84 # @return header
|
|
85 def getHeader(self):
|
|
86 return self.header
|
|
87
|
|
88
|
|
89 ## set attribute sequence
|
|
90 #
|
|
91 # @param sequence a string
|
|
92 #
|
|
93 def setSequence( self, sequence ):
|
|
94 self.sequence = sequence
|
|
95
|
|
96
|
|
97 def getSequence(self):
|
|
98 return self.sequence
|
|
99
|
|
100 ## reset
|
|
101 #
|
|
102 def reset( self ):
|
|
103 self.setHeader( "" )
|
|
104 self.setSequence( "" )
|
|
105
|
|
106
|
|
107 ## Test if bioseq is empty
|
|
108 #
|
|
109 def isEmpty( self ):
|
|
110 return self.header == "" and self.sequence == ""
|
|
111
|
|
112
|
|
113 ## Reverse the sequence
|
|
114 #
|
|
115 def reverse( self ):
|
|
116 tmp = self.sequence
|
|
117 self.sequence = tmp[::-1]
|
|
118
|
|
119
|
|
120 ## Turn the sequence into its complement
|
|
121 # Force upper case letters
|
|
122 # @warning: old name in pyRepet.Bioseq realComplement
|
|
123 #
|
|
124 def complement( self ):
|
|
125 complement = ""
|
|
126 self.upCase()
|
|
127 for i in xrange(0,len(self.sequence),1):
|
|
128 if self.sequence[i] == "A":
|
|
129 complement += "T"
|
|
130 elif self.sequence[i] == "T":
|
|
131 complement += "A"
|
|
132 elif self.sequence[i] == "C":
|
|
133 complement += "G"
|
|
134 elif self.sequence[i] == "G":
|
|
135 complement += "C"
|
|
136 elif self.sequence[i] == "M":
|
|
137 complement += "K"
|
|
138 elif self.sequence[i] == "R":
|
|
139 complement += "Y"
|
|
140 elif self.sequence[i] == "W":
|
|
141 complement += "W"
|
|
142 elif self.sequence[i] == "S":
|
|
143 complement += "S"
|
|
144 elif self.sequence[i] == "Y":
|
|
145 complement += "R"
|
|
146 elif self.sequence[i] == "K":
|
|
147 complement += "M"
|
|
148 elif self.sequence[i] == "V":
|
|
149 complement += "B"
|
|
150 elif self.sequence[i] == "H":
|
|
151 complement += "D"
|
|
152 elif self.sequence[i] == "D":
|
|
153 complement += "H"
|
|
154 elif self.sequence[i] == "B":
|
|
155 complement += "V"
|
|
156 elif self.sequence[i] == "N":
|
|
157 complement += "N"
|
|
158 elif self.sequence[i] == "-":
|
|
159 complement += "-"
|
|
160 else:
|
|
161 print "WARNING: unknown symbol '%s', replacing it by N" % ( self.sequence[i] )
|
|
162 complement += "N"
|
|
163 self.sequence = complement
|
|
164
|
|
165
|
|
166 ## Reverse and complement the sequence
|
|
167 #
|
|
168 # Force upper case letters
|
|
169 # @warning: old name in pyRepet.Bioseq : complement
|
|
170 #
|
|
171 def reverseComplement( self ):
|
|
172 self.reverse()
|
|
173 self.complement()
|
|
174
|
|
175
|
|
176 ## Remove gap in the sequence
|
|
177 #
|
|
178 def cleanGap(self):
|
|
179 self.sequence = self.sequence.replace("-","")
|
|
180
|
|
181
|
|
182 ## Copy current Bioseq Instance
|
|
183 #
|
|
184 # @return: a Bioseq instance, a copy of current sequence.
|
|
185 #
|
|
186 def copyBioseqInstance(self):
|
|
187 seq = Bioseq()
|
|
188 seq.sequence = self.sequence
|
|
189 seq.header = self.header
|
|
190 return seq
|
|
191
|
|
192
|
|
193 ## Add phase information after the name of sequence in header
|
|
194 #
|
|
195 # @param phase integer representing phase (1, 2, 3, -1, -2, -3)
|
|
196 #
|
|
197 def setFrameInfoOnHeader(self, phase):
|
|
198 if " " in self.header:
|
|
199 name, desc = self.header.split(" ", 1)
|
|
200 name = name + "_" + str(phase)
|
|
201 self.header = name + " " + desc
|
|
202 else:
|
|
203 self.header = self.header + "_" + str(phase)
|
|
204
|
|
205
|
|
206 ## Fill Bioseq attributes with fasta file
|
|
207 #
|
|
208 # @param faFileHandler file handler of a fasta file
|
|
209 #
|
|
210 def read( self, faFileHandler ):
|
|
211 line = faFileHandler.readline()
|
|
212 if line == "":
|
|
213 self.header = None
|
|
214 self.sequence = None
|
|
215 return
|
|
216 while line == "\n":
|
|
217 line = faFileHandler.readline()
|
|
218 if line[0] == '>':
|
|
219 self.header = string.rstrip(line[1:])
|
|
220 else:
|
|
221 print "error, line is",string.rstrip(line)
|
|
222 return
|
|
223 line = " "
|
|
224 seq = cStringIO.StringIO()
|
|
225 while line:
|
|
226 prev_pos = faFileHandler.tell()
|
|
227 line = faFileHandler.readline()
|
|
228 if line == "":
|
|
229 break
|
|
230 if line[0] == '>':
|
|
231 faFileHandler.seek( prev_pos )
|
|
232 break
|
|
233 seq.write( string.rstrip(line) )
|
|
234 self.sequence = seq.getvalue()
|
|
235
|
|
236
|
|
237 ## Create a subsequence with a modified header
|
|
238 #
|
|
239 # @param s integer start a required subsequence
|
|
240 # @param e integer end a required subsequence
|
|
241 #
|
|
242 # @return a Bioseq instance, a subsequence of current sequence
|
|
243 #
|
|
244 def subseq( self, s, e=0 ):
|
|
245 if e == 0 :
|
|
246 e=len( self.sequence )
|
|
247 if s > e :
|
|
248 print "error: start must be < or = to end"
|
|
249 return
|
|
250 if s <= 0 :
|
|
251 print "error: start must be > 0"
|
|
252 return
|
|
253 sub = Bioseq()
|
|
254 sub.header = self.header + " fragment " + str(s) + ".." + str(e)
|
|
255 sub.sequence = self.sequence[(s-1):e]
|
|
256 return sub
|
|
257
|
|
258
|
|
259 ## Get the nucleotide or aminoacid at the given position
|
|
260 #
|
|
261 # @param pos integer nucleotide or aminoacid position
|
|
262 #
|
|
263 # @return a string
|
|
264 #
|
|
265 def getNtFromPosition(self, pos):
|
|
266 result = None
|
|
267 if not (pos < 1 or pos > self.getLength()):
|
|
268 result = self.sequence[pos - 1]
|
|
269 return result
|
|
270
|
|
271
|
|
272 ## Print in stdout the Bioseq in fasta format with 60 characters lines
|
|
273 #
|
|
274 # @param l length of required sequence default is whole sequence
|
|
275 #
|
|
276 def view(self,l=0):
|
|
277 print '>'+self.header
|
|
278 i=0
|
|
279 if(l==0):
|
|
280 l=len(self.sequence)
|
|
281 seq=self.sequence[0:l]
|
|
282
|
|
283 while i<len(seq):
|
|
284 print seq[i:i+60]
|
|
285 i=i+60
|
|
286
|
|
287
|
|
288 ## Get length of sequence
|
|
289 #
|
|
290 # @param avoidN boolean don't count 'N' nucleotides
|
|
291 #
|
|
292 # @return length of current sequence
|
|
293 #
|
|
294 def getLength( self, countN = True ):
|
|
295 if countN:
|
|
296 return len(self.sequence)
|
|
297 else:
|
|
298 return len(self.sequence) - self.countNt('N')
|
|
299
|
|
300
|
|
301 ## Return the proportion of a specific character
|
|
302 #
|
|
303 # @param nt character that we want to count
|
|
304 #
|
|
305 def propNt( self, nt ):
|
|
306 return self.countNt( nt ) / float( self.getLength() )
|
|
307
|
|
308
|
|
309 ## Count occurrence of specific character
|
|
310 #
|
|
311 # @param nt character that we want to count
|
|
312 #
|
|
313 # @return nb of occurrences
|
|
314 #
|
|
315 def countNt( self, nt ):
|
|
316 return self.sequence.count( nt )
|
|
317
|
|
318
|
|
319 ## Count occurrence of each nucleotide in current seq
|
|
320 #
|
|
321 # @return a dict, keys are nucleotides, values are nb of occurrences
|
|
322 #
|
|
323 def countAllNt( self ):
|
|
324 dNt2Count = {}
|
|
325 for nt in ["A","T","G","C","N"]:
|
|
326 dNt2Count[ nt ] = self.countNt( nt )
|
|
327 return dNt2Count
|
|
328
|
|
329
|
|
330 ## Return a dict with the number of occurrences for each combination of ATGC of specified size and number of word found
|
|
331 #
|
|
332 # @param size integer required length word
|
|
333 #
|
|
334 def occ_word( self, size ):
|
|
335 occ = {}
|
|
336 if size == 0:
|
|
337 return occ,0
|
|
338 nbword = 0
|
|
339 srch = re.compile('[^ATGC]+')
|
|
340 wordlist = self._createWordList( size )
|
|
341 for i in wordlist:
|
|
342 occ[i] = 0
|
|
343 lenseq = len(self.sequence)
|
|
344 i = 0
|
|
345 while i < lenseq-size+1:
|
|
346 word = self.sequence[i:i+size].upper()
|
|
347 m = srch.search(word)
|
|
348 if m == None:
|
|
349 occ[word] = occ[word]+1
|
|
350 nbword = nbword + 1
|
|
351 i = i + 1
|
|
352 else:
|
|
353 i = i + m.end(0)
|
|
354 return occ, nbword
|
|
355
|
|
356
|
|
357 ## Return a dictionary with the frequency of occurs for each combination of ATGC of specified size
|
|
358 #
|
|
359 # @param size integer required length word
|
|
360 #
|
|
361 def freq_word( self, size ):
|
|
362 dOcc, nbWords = self.occ_word( size )
|
|
363 freq = {}
|
|
364 for word in dOcc.keys():
|
|
365 freq[word] = float(dOcc[word]) / nbWords
|
|
366 return freq
|
|
367
|
|
368
|
|
369 ## Find ORF in each phase
|
|
370 #
|
|
371 # @return: a dict, keys are phases, values are stop codon positions.
|
|
372 #
|
|
373 def findORF (self):
|
|
374 orf = {0:[],1:[],2:[]}
|
|
375 length = len (self.sequence)
|
|
376 for i in xrange(0,length):
|
|
377 triplet = self.sequence[i:i+3]
|
|
378 if ( triplet == "TAA" or triplet == "TAG" or triplet == "TGA"):
|
|
379 phase = i % 3
|
|
380 orf[phase].append(i)
|
|
381 return orf
|
|
382
|
|
383
|
|
384 ## Convert the sequence into upper case
|
|
385 #
|
|
386 def upCase( self ):
|
|
387 self.sequence = self.sequence.upper()
|
|
388
|
|
389
|
|
390 ## Convert the sequence into lower case
|
|
391 #
|
|
392 def lowCase( self ):
|
|
393 self.sequence = self.sequence.lower()
|
|
394
|
|
395
|
|
396 ## Extract the cluster of the fragment (output from Grouper)
|
|
397 #
|
|
398 # @return cluster id (string)
|
|
399 #
|
|
400 def getClusterID( self ):
|
|
401 data = self.header.split()
|
|
402 return data[0].split("Cl")[1]
|
|
403
|
|
404
|
|
405 ## Extract the group of the sequence (output from Grouper)
|
|
406 #
|
|
407 # @return group id (string)
|
|
408 #
|
|
409 def getGroupID( self ):
|
|
410 data = self.header.split()
|
|
411 return data[0].split("Gr")[1].split("Cl")[0]
|
|
412
|
|
413
|
|
414 ## Get the header of the full sequence (output from Grouper)
|
|
415 #
|
|
416 # @example 'Dmel_Grouper_3091_Malign_3:LARD' from '>MbS1566Gr81Cl81 Dmel_Grouper_3091_Malign_3:LARD {Fragment} 1..5203'
|
|
417 # @return header (string)
|
|
418 #
|
|
419 def getHeaderFullSeq( self ):
|
|
420 data = self.header.split()
|
|
421 return data[1]
|
|
422
|
|
423
|
|
424 ## Get the strand of the fragment (output from Grouper)
|
|
425 #
|
|
426 # @return: strand (+ or -)
|
|
427 #
|
|
428 def getFragStrand( self ):
|
|
429 data = self.header.split()
|
|
430 coord = data[3].split("..")
|
|
431 if int(coord[0]) < int(coord[-1]):
|
|
432 return "+"
|
|
433 else:
|
|
434 return "-"
|
|
435
|
|
436
|
|
437 ## Get A, T, G, C or N from an IUPAC letter
|
|
438 # IUPAC = ['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N']
|
|
439 #
|
|
440 # @return A, T, G, C or N
|
|
441 #
|
|
442 def getATGCNFromIUPAC( self, nt ):
|
|
443 subset = ["A","T","G","C","N"]
|
|
444
|
|
445 if nt in subset:
|
|
446 return nt
|
|
447 elif nt == "U":
|
|
448 return "T"
|
|
449 elif nt == "R":
|
|
450 return random.choice( "AG" )
|
|
451 elif nt == "Y":
|
|
452 return random.choice( "CT" )
|
|
453 elif nt == "M":
|
|
454 return random.choice( "CA" )
|
|
455 elif nt == "K":
|
|
456 return random.choice( "TG" )
|
|
457 elif nt == "W":
|
|
458 return random.choice( "TA" )
|
|
459 elif nt == "S":
|
|
460 return random.choice( "CG" )
|
|
461 elif nt == "B":
|
|
462 return random.choice( "CTG" )
|
|
463 elif nt == "D":
|
|
464 return random.choice( "ATG" )
|
|
465 elif nt == "H":
|
|
466 return random.choice( "ATC" )
|
|
467 elif nt == "V":
|
|
468 return random.choice( "ACG" )
|
|
469 else:
|
|
470 return "N"
|
|
471
|
|
472
|
|
473 def getSeqWithOnlyATGCN( self ):
|
|
474 newSeq = ""
|
|
475 for nt in self.sequence:
|
|
476 newSeq += self.getATGCNFromIUPAC( nt )
|
|
477 return newSeq
|
|
478
|
|
479
|
|
480 ## Replace any symbol not in (A,T,G,C,N) by another nucleotide it represents
|
|
481 #
|
|
482 def partialIUPAC( self ):
|
|
483 self.sequence = self.getSeqWithOnlyATGCN()
|
|
484
|
|
485
|
|
486 ## Remove non Unix end-of-line symbols, if any
|
|
487 #
|
|
488 def checkEOF( self ):
|
|
489 symbol = "\r" # corresponds to '^M' from Windows
|
|
490 if symbol in self.sequence:
|
|
491 print "WARNING: Windows EOF removed in '%s'" % ( self.header )
|
|
492 sys.stdout.flush()
|
|
493 newSeq = self.sequence.replace( symbol, "" )
|
|
494 self.sequence = newSeq
|
|
495
|
|
496
|
|
497 ## Write Bioseq instance into a fasta file handler
|
|
498 #
|
|
499 # @param faFileHandler file handler of a fasta file
|
|
500 #
|
|
501 def write( self, faFileHandler ):
|
|
502 faFileHandler.write( ">%s\n" % ( self.header ) )
|
|
503 self.writeSeqInFasta( faFileHandler )
|
|
504
|
|
505
|
|
506 ## Write only the sequence of Bioseq instance into a fasta file handler
|
|
507 #
|
|
508 # @param faFileHandler file handler of a fasta file
|
|
509 #
|
|
510 def writeSeqInFasta( self, faFileHandler ):
|
|
511 i = 0
|
|
512 while i < self.getLength():
|
|
513 faFileHandler.write( "%s\n" % ( self.sequence[i:i+60] ) )
|
|
514 i += 60
|
|
515
|
|
516
|
|
517 ## Append Bioseq instance to a fasta file
|
|
518 #
|
|
519 # @param faFile name of a fasta file as a string
|
|
520 # @param mode 'write' or 'append'
|
|
521 #
|
|
522 def save( self, faFile, mode="a" ):
|
|
523 faFileHandler = open( faFile, mode )
|
|
524 self.write( faFileHandler )
|
|
525 faFileHandler.close()
|
|
526
|
|
527
|
|
528 ## Append Bioseq instance to a fasta file
|
|
529 #
|
|
530 # @param faFile name of a fasta file as a string
|
|
531 #
|
|
532 def appendBioseqInFile( self, faFile ):
|
|
533 self.save( faFile, "a" )
|
|
534
|
|
535
|
|
536 ## Write Bioseq instance into a fasta file handler
|
|
537 #
|
|
538 # @param faFileHandler file handler on a file with writing right
|
|
539 #
|
|
540 def writeABioseqInAFastaFile( self, faFileHandler ):
|
|
541 self.write( faFileHandler )
|
|
542
|
|
543
|
|
544 ## Write Bioseq instance with other header into a fasta file handler
|
|
545 #
|
|
546 # @param faFileHandler file handler on a file with writing right
|
|
547 # @param otherHeader a string representing a new header (without the > and the \n)
|
|
548 #
|
|
549 def writeWithOtherHeader( self, faFileHandler, otherHeader ):
|
|
550 self.header = otherHeader
|
|
551 self.write( faFileHandler )
|
|
552
|
|
553
|
|
554 ## Append Bioseq header and Bioseq sequence in a fasta file
|
|
555 #
|
|
556 # @param faFileHandler file handler on a file with writing right
|
|
557 # @param otherHeader a string representing a new header (without the > and the \n)
|
|
558 #
|
|
559 def writeABioseqInAFastaFileWithOtherHeader( self, faFileHandler, otherHeader ):
|
|
560 self.writeWithOtherHeader( faFileHandler, otherHeader )
|
|
561
|
|
562
|
|
563 ## get the list of Maps corresponding to seq without gap
|
|
564 #
|
|
565 # @warning This method was called getMap() in pyRepet.Bioseq
|
|
566 # @return a list of Map object
|
|
567 #
|
|
568 def getLMapWhithoutGap( self ):
|
|
569 lMaps = []
|
|
570 countSite = 1
|
|
571 countSubseq = 1
|
|
572 inGap = False
|
|
573 startMap = -1
|
|
574 endMap = -1
|
|
575
|
|
576 # initialize with the first site
|
|
577 if self.sequence[0] == "-":
|
|
578 inGap = True
|
|
579 else:
|
|
580 startMap = countSite
|
|
581
|
|
582 # for each remaining site
|
|
583 for site in self.sequence[1:]:
|
|
584 countSite += 1
|
|
585
|
|
586 # if it is a gap
|
|
587 if site == "-":
|
|
588
|
|
589 # if this is the beginning of a gap, record the previous subsequence
|
|
590 if inGap == False:
|
|
591 inGap = True
|
|
592 endMap = countSite - 1
|
|
593 lMaps.append( Map( "%s_subSeq%i" % (self.header,countSubseq), self.header, startMap, endMap ) )
|
|
594 countSubseq += 1
|
|
595
|
|
596 # if it is NOT a gap
|
|
597 if site != "-":
|
|
598
|
|
599 # if it is the end of a gap, begin the next subsequence
|
|
600 if inGap == True:
|
|
601 inGap = False
|
|
602 startMap = countSite
|
|
603
|
|
604 # if it is the last site
|
|
605 if countSite == self.getLength():
|
|
606 endMap = countSite
|
|
607 lMaps.append( Map( "%s_subSeq%i" % (self.header,countSubseq), self.header, startMap, endMap ) )
|
|
608
|
|
609 return lMaps
|
|
610
|
|
611
|
|
612 ## get the percentage of GC
|
|
613 #
|
|
614 # @return a percentage
|
|
615 #
|
|
616 def getGCpercentage( self ):
|
|
617 tmpSeq = self.getSeqWithOnlyATGCN()
|
|
618 nbGC = tmpSeq.count( "G" ) + tmpSeq.count( "C" )
|
|
619 return 100 * nbGC / float( self.getLength() )
|
|
620
|
|
621 ## get the percentage of GC of a sequence without counting N in sequence length
|
|
622 #
|
|
623 # @return a percentage
|
|
624 #
|
|
625 def getGCpercentageInSequenceWithoutCountNInLength(self):
|
|
626 tmpSeq = self.getSeqWithOnlyATGCN()
|
|
627 nbGC = tmpSeq.count( "G" ) + tmpSeq.count( "C" )
|
|
628 return 100 * nbGC / float( self.getLength() - self.countNt("N") )
|
|
629
|
|
630 ## get the 5 prime subsequence of a given length at the given position
|
|
631 #
|
|
632 # @param position integer
|
|
633 # @param flankLength integer subsequence length
|
|
634 # @return a sequence string
|
|
635 #
|
|
636 def get5PrimeFlank(self, position, flankLength):
|
|
637 if(position == 1):
|
|
638 return ""
|
|
639 else:
|
|
640 startOfFlank = 1
|
|
641 endOfFlank = position -1
|
|
642
|
|
643 if((position - flankLength) > 0):
|
|
644 startOfFlank = position - flankLength
|
|
645 else:
|
|
646 startOfFlank = 1
|
|
647
|
|
648 return self.subseq(startOfFlank, endOfFlank).sequence
|
|
649
|
|
650
|
|
651 ## get the 3 prime subsequence of a given length at the given position
|
|
652 # In the case of indels, the polymorphism length can be specified
|
|
653 #
|
|
654 # @param position integer
|
|
655 # @param flankLength integer subsequence length
|
|
656 # @param polymLength integer polymorphism length
|
|
657 # @return a sequence string
|
|
658 #
|
|
659 def get3PrimeFlank(self, position, flankLength, polymLength = 1):
|
|
660 if((position + polymLength) > len( self.sequence )):
|
|
661 return ""
|
|
662 else:
|
|
663 startOfFlank = position + polymLength
|
|
664
|
|
665 if((position+polymLength+flankLength) > len( self.sequence )):
|
|
666 endOfFlank = len( self.sequence )
|
|
667 else:
|
|
668 endOfFlank = position+polymLength+flankLength-1
|
|
669
|
|
670 return self.subseq(startOfFlank, endOfFlank).sequence
|
|
671
|
|
672
|
|
673 def _createWordList(self,size,l=['A','T','G','C']):
|
|
674 if size == 1 :
|
|
675 return l
|
|
676 else:
|
|
677 l2 = []
|
|
678 for i in l:
|
|
679 for j in ['A','T','G','C']:
|
|
680 l2.append( i + j )
|
|
681 return self._createWordList(size-1,l2)
|
|
682
|
|
683
|
|
684 def removeSymbol( self, symbol ):
|
|
685 tmp = self.sequence.replace( symbol, "" )
|
|
686 self.sequence = tmp
|