Mercurial > repos > yufei-luo > s_mart
comparison commons/core/seq/Bioseq.py @ 38:2c0c0a89fad7
Uploaded
author | m-zytnicki |
---|---|
date | Thu, 02 May 2013 09:56:47 -0400 |
parents | 44d5973c188c |
children |
comparison
equal
deleted
inserted
replaced
37:d22fadc825e3 | 38:2c0c0a89fad7 |
---|---|
1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
2 # http://www.inra.fr | |
3 # http://urgi.versailles.inra.fr | |
4 # | |
5 # This software is governed by the CeCILL license under French law and | |
6 # abiding by the rules of distribution of free software. You can use, | |
7 # modify and/ or redistribute the software under the terms of the CeCILL | |
8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
9 # "http://www.cecill.info". | |
10 # | |
11 # As a counterpart to the access to the source code and rights to copy, | |
12 # modify and redistribute granted by the license, users are provided only | |
13 # with a limited warranty and the software's author, the holder of the | |
14 # economic rights, and the successive licensors have only limited | |
15 # liability. | |
16 # | |
17 # In this respect, the user's attention is drawn to the risks associated | |
18 # with loading, using, modifying and/or developing or reproducing the | |
19 # software by the user in light of its specific status of free software, | |
20 # that may mean that it is complicated to manipulate, and that also | |
21 # therefore means that it is reserved for developers and experienced | |
22 # professionals having in-depth computer knowledge. Users are therefore | |
23 # encouraged to load and test the software's suitability as regards their | |
24 # requirements in conditions enabling the security of their systems and/or | |
25 # data to be ensured and, more generally, to use and operate it in the | |
26 # same conditions as regards security. | |
27 # | |
28 # The fact that you are presently reading this means that you have had | |
29 # knowledge of the CeCILL license and that you accept its terms. | |
30 | |
31 | |
32 import sys | |
33 import string | |
34 import re | |
35 import random | |
36 import cStringIO | |
37 from commons.core.coord.Map import Map | |
38 from commons.core.checker.RepetException import RepetException | |
39 | |
40 DNA_ALPHABET_WITH_N = set( ['A','T','G','C','N'] ) | |
41 IUPAC = set(['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N']) | |
42 | |
43 | |
44 ## Record a sequence with its header | |
45 # | |
46 class Bioseq( object ): | |
47 | |
48 header = "" | |
49 sequence = "" | |
50 | |
51 ## constructor | |
52 # | |
53 # @param name the header of sequence | |
54 # @param seq sequence (DNA, RNA, protein) | |
55 # | |
56 def __init__( self, name="", seq="" ): | |
57 self.header = name | |
58 self.sequence = seq | |
59 | |
60 | |
61 ## Equal operator | |
62 # | |
63 def __eq__( self, o ): | |
64 if self.header==o.header and self.sequence==o.sequence: | |
65 return True | |
66 return False | |
67 | |
68 | |
69 ## overload __repr__ | |
70 # | |
71 def __repr__( self ): | |
72 return "%s;%s" % ( self.header, self.sequence ) | |
73 | |
74 | |
75 ## set attribute header | |
76 # | |
77 # @param header a string | |
78 # | |
79 def setHeader( self, header ): | |
80 self.header = header | |
81 | |
82 | |
83 ## get attribute header | |
84 # | |
85 # @return header | |
86 def getHeader(self): | |
87 return self.header | |
88 | |
89 | |
90 ## set attribute sequence | |
91 # | |
92 # @param sequence a string | |
93 # | |
94 def setSequence( self, sequence ): | |
95 self.sequence = sequence | |
96 | |
97 | |
98 def getSequence(self): | |
99 return self.sequence | |
100 | |
101 ## reset | |
102 # | |
103 def reset( self ): | |
104 self.setHeader( "" ) | |
105 self.setSequence( "" ) | |
106 | |
107 | |
108 ## Test if bioseq is empty | |
109 # | |
110 def isEmpty( self ): | |
111 return self.header == "" and self.sequence == "" | |
112 | |
113 | |
114 ## Reverse the sequence | |
115 # | |
116 def reverse( self ): | |
117 tmp = self.sequence | |
118 self.sequence = tmp[::-1] | |
119 | |
120 | |
121 ## Turn the sequence into its complement | |
122 # Force upper case letters | |
123 # @warning: old name in pyRepet.Bioseq realComplement | |
124 # | |
125 def complement( self ): | |
126 complement = "" | |
127 self.upCase() | |
128 for i in xrange(0,len(self.sequence),1): | |
129 if self.sequence[i] == "A": | |
130 complement += "T" | |
131 elif self.sequence[i] == "T": | |
132 complement += "A" | |
133 elif self.sequence[i] == "C": | |
134 complement += "G" | |
135 elif self.sequence[i] == "G": | |
136 complement += "C" | |
137 elif self.sequence[i] == "M": | |
138 complement += "K" | |
139 elif self.sequence[i] == "R": | |
140 complement += "Y" | |
141 elif self.sequence[i] == "W": | |
142 complement += "W" | |
143 elif self.sequence[i] == "S": | |
144 complement += "S" | |
145 elif self.sequence[i] == "Y": | |
146 complement += "R" | |
147 elif self.sequence[i] == "K": | |
148 complement += "M" | |
149 elif self.sequence[i] == "V": | |
150 complement += "B" | |
151 elif self.sequence[i] == "H": | |
152 complement += "D" | |
153 elif self.sequence[i] == "D": | |
154 complement += "H" | |
155 elif self.sequence[i] == "B": | |
156 complement += "V" | |
157 elif self.sequence[i] == "N": | |
158 complement += "N" | |
159 elif self.sequence[i] == "-": | |
160 complement += "-" | |
161 else: | |
162 print "WARNING: unknown symbol '%s', replacing it by N" % ( self.sequence[i] ) | |
163 complement += "N" | |
164 self.sequence = complement | |
165 | |
166 | |
167 ## Reverse and complement the sequence | |
168 # | |
169 # Force upper case letters | |
170 # @warning: old name in pyRepet.Bioseq : complement | |
171 # | |
172 def reverseComplement( self ): | |
173 self.reverse() | |
174 self.complement() | |
175 | |
176 | |
177 ## Remove gap in the sequence | |
178 # | |
179 def cleanGap(self): | |
180 self.sequence = self.sequence.replace("-","") | |
181 | |
182 | |
183 ## Copy current Bioseq Instance | |
184 # | |
185 # @return: a Bioseq instance, a copy of current sequence. | |
186 # | |
187 def copyBioseqInstance(self): | |
188 seq = Bioseq() | |
189 seq.sequence = self.sequence | |
190 seq.header = self.header | |
191 return seq | |
192 | |
193 | |
194 ## Add phase information after the name of sequence in header | |
195 # | |
196 # @param phase integer representing phase (1, 2, 3, -1, -2, -3) | |
197 # | |
198 def setFrameInfoOnHeader(self, phase): | |
199 if " " in self.header: | |
200 name, desc = self.header.split(" ", 1) | |
201 name = name + "_" + str(phase) | |
202 self.header = name + " " + desc | |
203 else: | |
204 self.header = self.header + "_" + str(phase) | |
205 | |
206 | |
207 ## Fill Bioseq attributes with fasta file | |
208 # | |
209 # @param faFileHandler file handler of a fasta file | |
210 # | |
211 def read( self, faFileHandler ): | |
212 line = faFileHandler.readline() | |
213 if line == "": | |
214 self.header = None | |
215 self.sequence = None | |
216 return | |
217 while line == "\n": | |
218 line = faFileHandler.readline() | |
219 if line[0] == '>': | |
220 self.header = string.rstrip(line[1:]) | |
221 else: | |
222 print "error, line is",string.rstrip(line) | |
223 return | |
224 line = " " | |
225 seq = cStringIO.StringIO() | |
226 while line: | |
227 prev_pos = faFileHandler.tell() | |
228 line = faFileHandler.readline() | |
229 if line == "": | |
230 break | |
231 if line[0] == '>': | |
232 faFileHandler.seek( prev_pos ) | |
233 break | |
234 seq.write( string.rstrip(line) ) | |
235 self.sequence = seq.getvalue() | |
236 | |
237 | |
238 ## Create a subsequence with a modified header | |
239 # | |
240 # @param s integer start a required subsequence | |
241 # @param e integer end a required subsequence | |
242 # | |
243 # @return a Bioseq instance, a subsequence of current sequence | |
244 # | |
245 def subseq( self, s, e=0 ): | |
246 if e == 0 : | |
247 e=len( self.sequence ) | |
248 if s > e : | |
249 print "error: start must be < or = to end" | |
250 return | |
251 if s <= 0 : | |
252 print "error: start must be > 0" | |
253 return | |
254 sub = Bioseq() | |
255 sub.header = self.header + " fragment " + str(s) + ".." + str(e) | |
256 sub.sequence = self.sequence[(s-1):e] | |
257 return sub | |
258 | |
259 | |
260 ## Get the nucleotide or aminoacid at the given position | |
261 # | |
262 # @param pos integer nucleotide or aminoacid position | |
263 # | |
264 # @return a string | |
265 # | |
266 def getNtFromPosition(self, pos): | |
267 result = None | |
268 if not (pos < 1 or pos > self.getLength()): | |
269 result = self.sequence[pos - 1] | |
270 return result | |
271 | |
272 | |
273 ## Print in stdout the Bioseq in fasta format with 60 characters lines | |
274 # | |
275 # @param l length of required sequence default is whole sequence | |
276 # | |
277 def view(self,l=0): | |
278 print '>'+self.header | |
279 i=0 | |
280 if(l==0): | |
281 l=len(self.sequence) | |
282 seq=self.sequence[0:l] | |
283 | |
284 while i<len(seq): | |
285 print seq[i:i+60] | |
286 i=i+60 | |
287 | |
288 | |
289 ## Get length of sequence | |
290 # | |
291 # @param avoidN boolean don't count 'N' nucleotides | |
292 # | |
293 # @return length of current sequence | |
294 # | |
295 def getLength( self, countN = True ): | |
296 if countN: | |
297 return len(self.sequence) | |
298 else: | |
299 return len(self.sequence) - self.countNt('N') | |
300 | |
301 | |
302 ## Return the proportion of a specific character | |
303 # | |
304 # @param nt character that we want to count | |
305 # | |
306 def propNt( self, nt ): | |
307 return self.countNt( nt ) / float( self.getLength() ) | |
308 | |
309 | |
310 ## Count occurrence of specific character | |
311 # | |
312 # @param nt character that we want to count | |
313 # | |
314 # @return nb of occurrences | |
315 # | |
316 def countNt( self, nt ): | |
317 return self.sequence.count( nt ) | |
318 | |
319 | |
320 ## Count occurrence of each nucleotide in current seq | |
321 # | |
322 # @return a dict, keys are nucleotides, values are nb of occurrences | |
323 # | |
324 def countAllNt( self ): | |
325 dNt2Count = {} | |
326 for nt in ["A","T","G","C","N"]: | |
327 dNt2Count[ nt ] = self.countNt( nt ) | |
328 return dNt2Count | |
329 | |
330 | |
331 ## Return a dict with the number of occurrences for each combination of ATGC of specified size and number of word found | |
332 # | |
333 # @param size integer required length word | |
334 # | |
335 def occ_word( self, size ): | |
336 occ = {} | |
337 if size == 0: | |
338 return occ,0 | |
339 nbword = 0 | |
340 srch = re.compile('[^ATGC]+') | |
341 wordlist = self._createWordList( size ) | |
342 for i in wordlist: | |
343 occ[i] = 0 | |
344 lenseq = len(self.sequence) | |
345 i = 0 | |
346 while i < lenseq-size+1: | |
347 word = self.sequence[i:i+size].upper() | |
348 m = srch.search(word) | |
349 if m == None: | |
350 occ[word] = occ[word]+1 | |
351 nbword = nbword + 1 | |
352 i = i + 1 | |
353 else: | |
354 i = i + m.end(0) | |
355 return occ, nbword | |
356 | |
357 | |
358 ## Return a dictionary with the frequency of occurs for each combination of ATGC of specified size | |
359 # | |
360 # @param size integer required length word | |
361 # | |
362 def freq_word( self, size ): | |
363 dOcc, nbWords = self.occ_word( size ) | |
364 freq = {} | |
365 for word in dOcc.keys(): | |
366 freq[word] = float(dOcc[word]) / nbWords | |
367 return freq | |
368 | |
369 | |
370 ## Find ORF in each phase | |
371 # | |
372 # @return: a dict, keys are phases, values are stop codon positions. | |
373 # | |
374 def findORF (self): | |
375 orf = {0:[],1:[],2:[]} | |
376 length = len (self.sequence) | |
377 for i in xrange(0,length): | |
378 triplet = self.sequence[i:i+3] | |
379 if ( triplet == "TAA" or triplet == "TAG" or triplet == "TGA"): | |
380 phase = i % 3 | |
381 orf[phase].append(i) | |
382 return orf | |
383 | |
384 | |
385 ## Convert the sequence into upper case | |
386 # | |
387 def upCase( self ): | |
388 self.sequence = self.sequence.upper() | |
389 | |
390 | |
391 ## Convert the sequence into lower case | |
392 # | |
393 def lowCase( self ): | |
394 self.sequence = self.sequence.lower() | |
395 | |
396 | |
397 ## Extract the cluster of the fragment (output from Grouper) | |
398 # | |
399 # @return cluster id (string) | |
400 # | |
401 def getClusterID( self ): | |
402 data = self.header.split() | |
403 return data[0].split("Cl")[1] | |
404 | |
405 | |
406 ## Extract the group of the sequence (output from Grouper) | |
407 # | |
408 # @return group id (string) | |
409 # | |
410 def getGroupID( self ): | |
411 data = self.header.split() | |
412 return data[0].split("Gr")[1].split("Cl")[0] | |
413 | |
414 | |
415 ## Get the header of the full sequence (output from Grouper) | |
416 # | |
417 # @example 'Dmel_Grouper_3091_Malign_3:LARD' from '>MbS1566Gr81Cl81 Dmel_Grouper_3091_Malign_3:LARD {Fragment} 1..5203' | |
418 # @return header (string) | |
419 # | |
420 def getHeaderFullSeq( self ): | |
421 data = self.header.split() | |
422 return data[1] | |
423 | |
424 | |
425 ## Get the strand of the fragment (output from Grouper) | |
426 # | |
427 # @return: strand (+ or -) | |
428 # | |
429 def getFragStrand( self ): | |
430 data = self.header.split() | |
431 coord = data[3].split("..") | |
432 if int(coord[0]) < int(coord[-1]): | |
433 return "+" | |
434 else: | |
435 return "-" | |
436 | |
437 | |
438 ## Get A, T, G, C or N from an IUPAC letter | |
439 # IUPAC = ['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N'] | |
440 # | |
441 # @return A, T, G, C or N | |
442 # | |
443 def getATGCNFromIUPAC( self, nt ): | |
444 subset = ["A","T","G","C","N"] | |
445 | |
446 if nt in subset: | |
447 return nt | |
448 elif nt == "U": | |
449 return "T" | |
450 elif nt == "R": | |
451 return random.choice( "AG" ) | |
452 elif nt == "Y": | |
453 return random.choice( "CT" ) | |
454 elif nt == "M": | |
455 return random.choice( "CA" ) | |
456 elif nt == "K": | |
457 return random.choice( "TG" ) | |
458 elif nt == "W": | |
459 return random.choice( "TA" ) | |
460 elif nt == "S": | |
461 return random.choice( "CG" ) | |
462 elif nt == "B": | |
463 return random.choice( "CTG" ) | |
464 elif nt == "D": | |
465 return random.choice( "ATG" ) | |
466 elif nt == "H": | |
467 return random.choice( "ATC" ) | |
468 elif nt == "V": | |
469 return random.choice( "ACG" ) | |
470 else: | |
471 return "N" | |
472 | |
473 ## Get nucleotide from an IUPAC letter and a nucleotide | |
474 # Works only for IUPAC code with two possibilities ['R','Y','M','K','W','S'] | |
475 # Examples: | |
476 # Y and C returns T | |
477 # Y and T returns C | |
478 # B and C throws RepetException | |
479 # | |
480 # @return A, T, G, C | |
481 # | |
482 def getATGCNFromIUPACandATGCN(self, IUPACCode, nt): | |
483 if IUPACCode == "R": | |
484 possibleNt = set(["A", "G"]) | |
485 if nt not in possibleNt: | |
486 raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) | |
487 return (possibleNt - set(nt)).pop() | |
488 | |
489 elif IUPACCode == "Y": | |
490 possibleNt = set(["C", "T"]) | |
491 if nt not in possibleNt: | |
492 raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) | |
493 return (possibleNt - set(nt)).pop() | |
494 | |
495 elif IUPACCode == "M": | |
496 possibleNt = set(["A", "C"]) | |
497 if nt not in possibleNt: | |
498 raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) | |
499 return (possibleNt - set(nt)).pop() | |
500 | |
501 elif IUPACCode == "K": | |
502 possibleNt = set(["T", "G"]) | |
503 if nt not in possibleNt: | |
504 raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) | |
505 return (possibleNt - set(nt)).pop() | |
506 | |
507 elif IUPACCode == "W": | |
508 possibleNt = set(["A", "T"]) | |
509 if nt not in possibleNt: | |
510 raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) | |
511 return (possibleNt - set(nt)).pop() | |
512 | |
513 elif IUPACCode == "S": | |
514 possibleNt = set(["C", "G"]) | |
515 if nt not in possibleNt: | |
516 raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) | |
517 return (possibleNt - set(nt)).pop() | |
518 | |
519 else: | |
520 raise RepetException("Can't retrieve the third nucleotide from IUPAC code '%s' and nucleotide '%s'" % (IUPACCode, nt)) | |
521 | |
522 def getSeqWithOnlyATGCN( self ): | |
523 newSeq = "" | |
524 for nt in self.sequence: | |
525 newSeq += self.getATGCNFromIUPAC( nt ) | |
526 return newSeq | |
527 | |
528 | |
529 ## Replace any symbol not in (A,T,G,C,N) by another nucleotide it represents | |
530 # | |
531 def partialIUPAC( self ): | |
532 self.sequence = self.getSeqWithOnlyATGCN() | |
533 | |
534 | |
535 ## Remove non Unix end-of-line symbols, if any | |
536 # | |
537 def checkEOF( self ): | |
538 symbol = "\r" # corresponds to '^M' from Windows | |
539 if symbol in self.sequence: | |
540 print "WARNING: Windows EOF removed in '%s'" % ( self.header ) | |
541 sys.stdout.flush() | |
542 newSeq = self.sequence.replace( symbol, "" ) | |
543 self.sequence = newSeq | |
544 | |
545 | |
546 ## Write Bioseq instance into a fasta file handler | |
547 # | |
548 # @param faFileHandler file handler of a fasta file | |
549 # | |
550 def write( self, faFileHandler ): | |
551 faFileHandler.write( ">%s\n" % ( self.header ) ) | |
552 self.writeSeqInFasta( faFileHandler ) | |
553 | |
554 | |
555 ## Write only the sequence of Bioseq instance into a fasta file handler | |
556 # | |
557 # @param faFileHandler file handler of a fasta file | |
558 # | |
559 def writeSeqInFasta( self, faFileHandler ): | |
560 i = 0 | |
561 while i < self.getLength(): | |
562 faFileHandler.write( "%s\n" % ( self.sequence[i:i+60] ) ) | |
563 i += 60 | |
564 | |
565 | |
566 ## Append Bioseq instance to a fasta file | |
567 # | |
568 # @param faFile name of a fasta file as a string | |
569 # @param mode 'write' or 'append' | |
570 # | |
571 def save( self, faFile, mode="a" ): | |
572 faFileHandler = open( faFile, mode ) | |
573 self.write( faFileHandler ) | |
574 faFileHandler.close() | |
575 | |
576 | |
577 ## Append Bioseq instance to a fasta file | |
578 # | |
579 # @param faFile name of a fasta file as a string | |
580 # | |
581 def appendBioseqInFile( self, faFile ): | |
582 self.save( faFile, "a" ) | |
583 | |
584 | |
585 ## Write Bioseq instance into a fasta file handler | |
586 # | |
587 # @param faFileHandler file handler on a file with writing right | |
588 # | |
589 def writeABioseqInAFastaFile( self, faFileHandler ): | |
590 self.write( faFileHandler ) | |
591 | |
592 | |
593 ## Write Bioseq instance with other header into a fasta file handler | |
594 # | |
595 # @param faFileHandler file handler on a file with writing right | |
596 # @param otherHeader a string representing a new header (without the > and the \n) | |
597 # | |
598 def writeWithOtherHeader( self, faFileHandler, otherHeader ): | |
599 self.header = otherHeader | |
600 self.write( faFileHandler ) | |
601 | |
602 | |
603 ## Append Bioseq header and Bioseq sequence in a fasta file | |
604 # | |
605 # @param faFileHandler file handler on a file with writing right | |
606 # @param otherHeader a string representing a new header (without the > and the \n) | |
607 # | |
608 def writeABioseqInAFastaFileWithOtherHeader( self, faFileHandler, otherHeader ): | |
609 self.writeWithOtherHeader( faFileHandler, otherHeader ) | |
610 | |
611 | |
612 ## get the list of Maps corresponding to seq without gap | |
613 # | |
614 # @warning This method was called getMap() in pyRepet.Bioseq | |
615 # @return a list of Map object | |
616 # | |
617 def getLMapWhithoutGap( self ): | |
618 lMaps = [] | |
619 countSite = 1 | |
620 countSubseq = 1 | |
621 inGap = False | |
622 startMap = -1 | |
623 endMap = -1 | |
624 | |
625 # initialize with the first site | |
626 if self.sequence[0] == "-": | |
627 inGap = True | |
628 else: | |
629 startMap = countSite | |
630 | |
631 # for each remaining site | |
632 for site in self.sequence[1:]: | |
633 countSite += 1 | |
634 | |
635 # if it is a gap | |
636 if site == "-": | |
637 | |
638 # if this is the beginning of a gap, record the previous subsequence | |
639 if inGap == False: | |
640 inGap = True | |
641 endMap = countSite - 1 | |
642 lMaps.append( Map( "%s_subSeq%i" % (self.header,countSubseq), self.header, startMap, endMap ) ) | |
643 countSubseq += 1 | |
644 | |
645 # if it is NOT a gap | |
646 if site != "-": | |
647 | |
648 # if it is the end of a gap, begin the next subsequence | |
649 if inGap == True: | |
650 inGap = False | |
651 startMap = countSite | |
652 | |
653 # if it is the last site | |
654 if countSite == self.getLength(): | |
655 endMap = countSite | |
656 lMaps.append( Map( "%s_subSeq%i" % (self.header,countSubseq), self.header, startMap, endMap ) ) | |
657 | |
658 return lMaps | |
659 | |
660 | |
661 ## get the percentage of GC | |
662 # | |
663 # @return a percentage | |
664 # | |
665 def getGCpercentage( self ): | |
666 tmpSeq = self.getSeqWithOnlyATGCN() | |
667 nbGC = tmpSeq.count( "G" ) + tmpSeq.count( "C" ) | |
668 return 100 * nbGC / float( self.getLength() ) | |
669 | |
670 ## get the percentage of GC of a sequence without counting N in sequence length | |
671 # | |
672 # @return a percentage | |
673 # | |
674 def getGCpercentageInSequenceWithoutCountNInLength(self): | |
675 tmpSeq = self.getSeqWithOnlyATGCN() | |
676 nbGC = tmpSeq.count( "G" ) + tmpSeq.count( "C" ) | |
677 return 100 * nbGC / float( self.getLength() - self.countNt("N") ) | |
678 | |
679 ## get the 5 prime subsequence of a given length at the given position | |
680 # | |
681 # @param position integer | |
682 # @param flankLength integer subsequence length | |
683 # @return a sequence string | |
684 # | |
685 def get5PrimeFlank(self, position, flankLength): | |
686 if(position == 1): | |
687 return "" | |
688 else: | |
689 startOfFlank = 1 | |
690 endOfFlank = position -1 | |
691 | |
692 if((position - flankLength) > 0): | |
693 startOfFlank = position - flankLength | |
694 else: | |
695 startOfFlank = 1 | |
696 | |
697 return self.subseq(startOfFlank, endOfFlank).sequence | |
698 | |
699 | |
700 ## get the 3 prime subsequence of a given length at the given position | |
701 # In the case of indels, the polymorphism length can be specified | |
702 # | |
703 # @param position integer | |
704 # @param flankLength integer subsequence length | |
705 # @param polymLength integer polymorphism length | |
706 # @return a sequence string | |
707 # | |
708 def get3PrimeFlank(self, position, flankLength, polymLength = 1): | |
709 if((position + polymLength) > len( self.sequence )): | |
710 return "" | |
711 else: | |
712 startOfFlank = position + polymLength | |
713 | |
714 if((position+polymLength+flankLength) > len( self.sequence )): | |
715 endOfFlank = len( self.sequence ) | |
716 else: | |
717 endOfFlank = position+polymLength+flankLength-1 | |
718 | |
719 return self.subseq(startOfFlank, endOfFlank).sequence | |
720 | |
721 | |
722 def _createWordList(self,size,l=['A','T','G','C']): | |
723 if size == 1 : | |
724 return l | |
725 else: | |
726 l2 = [] | |
727 for i in l: | |
728 for j in ['A','T','G','C']: | |
729 l2.append( i + j ) | |
730 return self._createWordList(size-1,l2) | |
731 | |
732 | |
733 def removeSymbol( self, symbol ): | |
734 tmp = self.sequence.replace( symbol, "" ) | |
735 self.sequence = tmp |