annotate commons/core/seq/Bioseq.py @ 58:5f5c9b74c2dd

Uploaded
author m-zytnicki
date Fri, 07 Feb 2014 11:53:36 -0500
parents 44d5973c188c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
36
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1 # Copyright INRA (Institut National de la Recherche Agronomique)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
2 # http://www.inra.fr
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
3 # http://urgi.versailles.inra.fr
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
4 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
5 # This software is governed by the CeCILL license under French law and
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
6 # abiding by the rules of distribution of free software. You can use,
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
7 # modify and/ or redistribute the software under the terms of the CeCILL
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
8 # license as circulated by CEA, CNRS and INRIA at the following URL
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
9 # "http://www.cecill.info".
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
10 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
11 # As a counterpart to the access to the source code and rights to copy,
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
12 # modify and redistribute granted by the license, users are provided only
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
13 # with a limited warranty and the software's author, the holder of the
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
14 # economic rights, and the successive licensors have only limited
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
15 # liability.
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
16 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
17 # In this respect, the user's attention is drawn to the risks associated
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
18 # with loading, using, modifying and/or developing or reproducing the
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
19 # software by the user in light of its specific status of free software,
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
20 # that may mean that it is complicated to manipulate, and that also
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
21 # therefore means that it is reserved for developers and experienced
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
22 # professionals having in-depth computer knowledge. Users are therefore
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
23 # encouraged to load and test the software's suitability as regards their
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
24 # requirements in conditions enabling the security of their systems and/or
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
25 # data to be ensured and, more generally, to use and operate it in the
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
26 # same conditions as regards security.
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
27 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
28 # The fact that you are presently reading this means that you have had
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
29 # knowledge of the CeCILL license and that you accept its terms.
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
30
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
31
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
32 import sys
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
33 import string
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
34 import re
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
35 import random
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
36 import cStringIO
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
37 from commons.core.coord.Map import Map
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
38 from commons.core.checker.RepetException import RepetException
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
39
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
# Nucleotide symbols accepted for plain DNA, including the unknown base 'N'.
DNA_ALPHABET_WITH_N = set("ATGCN")
# Nucleotide symbols listed by this module as the IUPAC alphabet.
IUPAC = set("ATGCURYMKWSBDHVN")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
42
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
43
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
44 ## Record a sequence with its header
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
45 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
46 class Bioseq( object ):
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
47
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
48 header = ""
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
49 sequence = ""
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
50
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
51 ## constructor
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
52 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
53 # @param name the header of sequence
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
54 # @param seq sequence (DNA, RNA, protein)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
55 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def __init__(self, name="", seq=""):
    """Store the header and the sequence of a new record.

    @param name the header of the sequence
    @param seq the sequence itself
    """
    self.sequence = seq
    self.header = name
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
59
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
60
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
61 ## Equal operator
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
62 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def __eq__(self, o):
    """Tell whether two records have the same header and the same sequence.

    @param o the object to compare with
    @return True or False when 'o' exposes 'header' and 'sequence';
            NotImplemented otherwise, so that comparing with an unrelated
            object (e.g. None) no longer raises AttributeError.
    """
    try:
        return self.header == o.header and self.sequence == o.sequence
    except AttributeError:
        # 'o' is not record-like: let Python fall back to its default.
        return NotImplemented
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
67
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
68
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
69 ## overload __repr__
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
70 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def __repr__(self):
    """Return the header and the sequence joined by a semicolon."""
    fields = (self.header, self.sequence)
    return "%s;%s" % fields
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
73
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
74
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
75 ## set attribute header
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
76 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
77 # @param header a string
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
78 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def setHeader(self, header):
    """Replace the header of the record.

    @param header a string
    """
    self.header = header
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
81
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
82
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
83 ## get attribute header
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
84 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
85 # @return header
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def getHeader(self):
    """Return the header of the record."""
    return self.header
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
88
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
89
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
90 ## set attribute sequence
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
91 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
92 # @param sequence a string
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
93 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def setSequence(self, sequence):
    """Replace the sequence of the record.

    @param sequence a string
    """
    self.sequence = sequence
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
96
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
97
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def getSequence(self):
    """Return the sequence of the record."""
    return self.sequence
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
100
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
101 ## reset
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
102 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def reset(self):
    """Empty the record: header first, then sequence, through the setters."""
    empty = ""
    self.setHeader(empty)
    self.setSequence(empty)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
106
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
107
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
108 ## Test if bioseq is empty
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
109 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def isEmpty(self):
    """Return True when both the header and the sequence are empty strings."""
    return (self.header, self.sequence) == ("", "")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
112
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
113
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
114 ## Reverse the sequence
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
115 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def reverse(self):
    """Reverse the sequence in place (no complementation)."""
    self.sequence = self.sequence[::-1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
119
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
120
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
121 ## Turn the sequence into its complement
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
122 # Force upper case letters
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
123 # @warning: old name in pyRepet.Bioseq realComplement
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
124 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def complement(self):
    """Turn the sequence into its complement, in upper case (not reversed).

    The sequence is first upper-cased through upCase(). Each symbol is then
    replaced using the table below; gaps ('-') are kept. Any symbol missing
    from the table (this includes 'U') triggers a warning on stdout and is
    replaced by 'N'.
    """
    pairs = {
        "A": "T", "T": "A", "C": "G", "G": "C",
        "M": "K", "K": "M", "R": "Y", "Y": "R",
        "W": "W", "S": "S",
        "V": "B", "B": "V", "H": "D", "D": "H",
        "N": "N", "-": "-",
    }
    self.upCase()
    complemented = []
    for symbol in self.sequence:
        partner = pairs.get(symbol)
        if partner is None:
            print("WARNING: unknown symbol '%s', replacing it by N" % symbol)
            partner = "N"
        complemented.append(partner)
    # Single join instead of repeated string concatenation.
    self.sequence = "".join(complemented)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
165
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
166
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
167 ## Reverse and complement the sequence
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
168 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
169 # Force upper case letters
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
170 # @warning: old name in pyRepet.Bioseq : complement
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
171 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def reverseComplement(self):
    """Reverse the sequence, then complement it (both done in place)."""
    self.reverse()
    self.complement()
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
175
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
176
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
177 ## Remove gap in the sequence
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
178 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def cleanGap(self):
    """Remove every gap character ('-') from the sequence."""
    ungapped_parts = self.sequence.split("-")
    self.sequence = "".join(ungapped_parts)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
181
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
182
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
183 ## Copy current Bioseq Instance
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
184 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
185 # @return: a Bioseq instance, a copy of current sequence.
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
186 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def copyBioseqInstance(self):
    """Build a new Bioseq holding the same header and sequence.

    @return a Bioseq instance, a copy of the current record
    """
    return Bioseq(self.header, self.sequence)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
192
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
193
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
194 ## Add phase information after the name of sequence in header
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
195 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
196 # @param phase integer representing phase (1, 2, 3, -1, -2, -3)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
197 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def setFrameInfoOnHeader(self, phase):
    """Append '_<phase>' to the sequence name held in the header.

    The name is the part of the header before the first space; whatever
    follows that space is kept unchanged after the suffixed name.

    @param phase value appended after an underscore (e.g. 1, 2, 3, -1, -2, -3)
    """
    name, separator, description = self.header.partition(" ")
    # Without a space, separator and description are both empty.
    self.header = name + "_" + str(phase) + separator + description
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
205
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
206
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
207 ## Fill Bioseq attributes with fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
208 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
209 # @param faFileHandler file handler of a fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
210 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def read(self, faFileHandler):
    """Fill the record with the next entry of an open fasta file.

    Leading empty lines are skipped. At end of file, header and sequence
    are both set to None. If the first non-empty line does not start with
    '>', an error is printed and the record is left untouched. Otherwise
    the lines up to the next '>' (or end of file) are stripped of trailing
    whitespace and concatenated; the handler is rewound so that the next
    call starts on the following header line.

    @param faFileHandler file handler of a fasta file (must support
           readline(), tell() and seek())
    """
    line = faFileHandler.readline()
    while line == "\n":
        line = faFileHandler.readline()
    if line == "":
        # End of file, possibly reached after blank lines only; the former
        # code raised IndexError on line[0] in that second case.
        self.header = None
        self.sequence = None
        return
    if line[0] != '>':
        print("error, line is " + line.rstrip())
        return
    self.header = line[1:].rstrip()

    chunks = []
    while True:
        prev_pos = faFileHandler.tell()
        line = faFileHandler.readline()
        if line == "":
            break
        if line[0] == '>':
            # Next record: give the header line back to the handler.
            faFileHandler.seek(prev_pos)
            break
        chunks.append(line.rstrip())
    self.sequence = "".join(chunks)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
236
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
237
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
238 ## Create a subsequence with a modified header
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
239 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
240 # @param s integer start a required subsequence
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
241 # @param e integer end a required subsequence
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
242 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
243 # @return a Bioseq instance, a subsequence of current sequence
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
244 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def subseq(self, s, e=0):
    """Extract a fragment of the sequence as a new Bioseq.

    Coordinates are 1-based and inclusive. The new header is the current
    one followed by ' fragment <s>..<e>'. On invalid coordinates an error
    is printed and None is returned.

    @param s integer, start of the required subsequence (must be > 0)
    @param e integer, end of the required subsequence; 0 means the end of
           the sequence
    @return a Bioseq instance, or None on invalid coordinates
    """
    end = len(self.sequence) if e == 0 else e
    if s > end:
        print("error: start must be < or = to end")
        return
    if s <= 0:
        print("error: start must be > 0")
        return
    fragment_header = self.header + " fragment " + str(s) + ".." + str(end)
    return Bioseq(fragment_header, self.sequence[(s - 1):end])
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
258
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
259
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
260 ## Get the nucleotide or aminoacid at the given position
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
261 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
262 # @param pos integer nucleotide or aminoacid position
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
263 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
264 # @return a string
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
265 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def getNtFromPosition(self, pos):
    """Return the symbol found at a 1-based position.

    @param pos integer, nucleotide or aminoacid position (first one is 1)
    @return a one-character string, or None when pos is outside
            1..getLength()
    """
    if 1 <= pos <= self.getLength():
        return self.sequence[pos - 1]
    return None
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
271
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
272
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
273 ## Print in stdout the Bioseq in fasta format with 60 characters lines
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
274 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
275 # @param l length of required sequence default is whole sequence
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
276 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def view(self, l=0):
    """Print the record on stdout in fasta format, 60 characters per line.

    @param l number of leading characters of the sequence to print;
           0 (default) means the whole sequence
    """
    print('>' + self.header)
    if l == 0:
        l = len(self.sequence)
    shown = self.sequence[0:l]
    width = 60
    for start in range(0, len(shown), width):
        print(shown[start:start + width])
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
287
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
288
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
289 ## Get length of sequence
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
290 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
291 # @param countN boolean, set it to False to leave 'N' nucleotides out of the count
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
292 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
293 # @return length of current sequence
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
294 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def getLength(self, countN=True):
    """Return the length of the sequence.

    @param countN boolean; when False, the occurrences of 'N' reported by
           countNt('N') are subtracted from the length
    @return an integer
    """
    total = len(self.sequence)
    if not countN:
        total = total - self.countNt('N')
    return total
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
300
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
301
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
302 ## Return the proportion of a specific character
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
303 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
304 # @param nt character that we want to count
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
305 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def propNt(self, nt):
    """Return the proportion of a given character in the sequence.

    The count comes from countNt() and is divided by getLength() as a
    float, so an empty sequence raises ZeroDivisionError.

    @param nt character that we want to count
    @return a float
    """
    occurrences = self.countNt(nt)
    length = float(self.getLength())
    return occurrences / length
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
308
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
309
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
310 ## Count occurrence of specific character
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
311 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
312 # @param nt character that we want to count
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
313 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
314 # @return nb of occurrences
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
315 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def countNt(self, nt):
    """Count the occurrences of a character in the sequence (case-sensitive).

    @param nt character that we want to count
    @return number of occurrences
    """
    return self.sequence.count(nt)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
318
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
319
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
320 ## Count occurrence of each nucleotide in current seq
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
321 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
322 # @return a dict, keys are nucleotides, values are nb of occurrences
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
323 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def countAllNt(self):
    """Count the occurrences of A, T, G, C and N in the sequence.

    @return a dict, keys are the five nucleotides, values are the counts
            given by countNt()
    """
    return dict((nt, self.countNt(nt)) for nt in "ATGCN")
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
329
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
330
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
331 ## Return a dict with the number of occurrences for each combination of ATGC of specified size and number of word found
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
332 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
333 # @param size integer required length word
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
334 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def occ_word(self, size):
    """Count the words of a given size made only of A, T, G and C.

    Every word returned by _createWordList(size) starts with a count of 0.
    The sequence is then scanned with a sliding window (upper-cased); a
    window holding another symbol is not counted and the scan resumes just
    after the run of such symbols.

    @param size integer, required word length
    @return a tuple (dict word -> number of occurrences, number of words
            counted); ({}, 0) when size is 0
    """
    if size == 0:
        return {}, 0
    not_atgc = re.compile('[^ATGC]+')
    counts = dict.fromkeys(self._createWordList(size), 0)
    counted = 0
    last_start = len(self.sequence) - size
    pos = 0
    while pos <= last_start:
        word = self.sequence[pos:pos + size].upper()
        hit = not_atgc.search(word)
        if hit is None:
            counts[word] += 1
            counted += 1
            pos += 1
        else:
            # Jump past the non-ATGC run found inside the window.
            pos += hit.end(0)
    return counts, counted
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
356
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
357
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
358 ## Return a dictionary with the frequency of occurrence of each combination of ATGC of specified size
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
359 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
360 # @param size integer required length word
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
361 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def freq_word(self, size):
    """Return the frequency of each word of a given size.

    Each count given by occ_word(size) is divided by the number of words
    counted; ZeroDivisionError is raised when words are listed but none
    was counted.

    @param size integer, required word length
    @return a dict, keys are words, values are floats
    """
    counts, total = self.occ_word(size)
    return dict((word, float(count) / total) for word, count in counts.items())
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
368
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
369
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
370 ## Find ORF in each phase
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
371 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
372 # @return: a dict, keys are phases, values are stop codon positions.
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
373 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def findORF(self):
    """Locate the stop codons (TAA, TAG, TGA) of the sequence, per phase.

    The match is case-sensitive. Positions are 0-based starts of the stop
    codons and the phase is that position modulo 3.

    @return a dict, keys are phases (0, 1, 2), values are lists of stop
            codon positions in increasing order
    """
    stop_codons = ("TAA", "TAG", "TGA")
    stops_by_phase = {0: [], 1: [], 2: []}
    seq = self.sequence
    for start, _ in enumerate(seq):
        if seq[start:start + 3] in stop_codons:
            stops_by_phase[start % 3].append(start)
    return stops_by_phase
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
383
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
384
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
385 ## Convert the sequence into upper case
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
386 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def upCase( self ):
    """Replace self.sequence by its upper-case version (in place, no return value)."""
    upperSeq = self.sequence.upper()
    self.sequence = upperSeq
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
389
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
390
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
391 ## Convert the sequence into lower case
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
392 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def lowCase( self ):
    """Replace self.sequence by its lower-case version (in place, no return value)."""
    lowerSeq = self.sequence.lower()
    self.sequence = lowerSeq
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
395
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
396
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
397 ## Extract the cluster of the fragment (output from Grouper)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
398 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
399 # @return cluster id (string)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
400 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def getClusterID( self ):
    """Extract the cluster identifier from a Grouper-style header.

    The first whitespace-separated token of self.header is split on "Cl"
    and the piece that follows the first "Cl" is returned. An IndexError
    is raised when the header is empty or its first token holds no "Cl".

    @return cluster id (string)
    """
    firstToken = self.header.split()[0]
    return firstToken.split( "Cl" )[1]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
404
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
405
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
406 ## Extract the group of the sequence (output from Grouper)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
407 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
408 # @return group id (string)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
409 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def getGroupID( self ):
    """Extract the group identifier from a Grouper-style header.

    Within the first whitespace-separated token of self.header, the text
    located after the first "Gr" and before the following "Cl" is returned.
    An IndexError is raised when the header is empty or holds no "Gr".

    @return group id (string)
    """
    firstToken = self.header.split()[0]
    afterGr = firstToken.split( "Gr" )[1]
    return afterGr.split( "Cl" )[0]
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
413
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
414
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
415 ## Get the header of the full sequence (output from Grouper)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
416 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
417 # @example 'Dmel_Grouper_3091_Malign_3:LARD' from '>MbS1566Gr81Cl81 Dmel_Grouper_3091_Malign_3:LARD {Fragment} 1..5203'
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
418 # @return header (string)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
419 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def getHeaderFullSeq( self ):
    """Return the second whitespace-separated token of self.header.

    For a Grouper header such as
    'MbS1566Gr81Cl81 Dmel_Grouper_3091_Malign_3:LARD {Fragment} 1..5203'
    this is 'Dmel_Grouper_3091_Malign_3:LARD'. An IndexError is raised when
    the header has fewer than two tokens.

    @return header (string)
    """
    tokens = self.header.split()
    fullSeqHeader = tokens[1]
    return fullSeqHeader
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
423
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
424
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
425 ## Get the strand of the fragment (output from Grouper)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
426 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
427 # @return: strand (+ or -)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
428 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def getFragStrand( self ):
    """Deduce the strand of the fragment from the coordinates in the header.

    The fourth whitespace-separated token of self.header is split on "..";
    its first and last pieces are compared as integers.

    @return: "+" when the first coordinate is strictly lower than the last
             one, "-" otherwise (equal coordinates therefore give "-")
    """
    bounds = self.header.split()[3].split( ".." )
    first, last = int( bounds[0] ), int( bounds[-1] )
    return "+" if first < last else "-"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
436
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
437
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
438 ## Get A, T, G, C or N from an IUPAC letter
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
439 # IUPAC = ['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N']
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
440 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
441 # @return A, T, G, C or N
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
442 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def getATGCNFromIUPAC( self, nt ):
    """Map one IUPAC nucleotide letter onto A, T, G, C or N.

    A, T, G, C and N are returned unchanged and U becomes T. An ambiguity
    code (R, Y, M, K, W, S, B, D, H, V) is replaced by one of the
    nucleotides it stands for, drawn with random.choice(), so the result is
    not deterministic for those letters. Anything else, lower-case letters
    included, gives N.

    @param nt one symbol of a sequence
    @return A, T, G, C or N
    """
    if nt in ( "A", "T", "G", "C", "N" ):
        return nt
    if nt == "U":
        return "T"

    # ambiguity code -> nucleotides it may stand for
    candidates = {
        "R": "AG",
        "Y": "CT",
        "M": "CA",
        "K": "TG",
        "W": "TA",
        "S": "CG",
        "B": "CTG",
        "D": "ATG",
        "H": "ATC",
        "V": "ACG",
    }
    if nt in candidates:
        return random.choice( candidates[nt] )
    return "N"
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
472
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
473 ## Get nucleotide from an IUPAC letter and a nucleotide
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
474 # Works only for IUPAC code with two possibilities ['R','Y','M','K','W','S']
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
475 # Examples:
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
476 # Y and C returns T
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
477 # Y and T returns C
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
478 # B and C throws RepetException
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
479 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
480 # @return A, T, G, C
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
481 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def getATGCNFromIUPACandATGCN(self, IUPACCode, nt):
    """Return the other nucleotide of a two-fold IUPAC ambiguity code.

    Only the codes standing for exactly two nucleotides are supported
    (R, Y, M, K, W, S). Given such a code and one of its two nucleotides,
    the remaining one is returned, e.g. Y and C give T, Y and T give C.

    @param IUPACCode one of 'R', 'Y', 'M', 'K', 'W', 'S'
    @param nt one of the two nucleotides the code stands for
    @return A, T, G or C
    @raise RepetException when the code is not a two-fold one, or when nt
           is not one of the nucleotides the code stands for
    """
    # two-fold ambiguity code -> the pair of nucleotides it stands for
    pairs = {
        "R": ( "A", "G" ),
        "Y": ( "C", "T" ),
        "M": ( "A", "C" ),
        "K": ( "T", "G" ),
        "W": ( "A", "T" ),
        "S": ( "C", "G" ),
    }
    if IUPACCode not in pairs:
        raise RepetException("Can't retrieve the third nucleotide from IUPAC code '%s' and nucleotide '%s'" % (IUPACCode, nt))

    first, second = pairs[IUPACCode]
    if nt == first:
        return second
    if nt == second:
        return first
    raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt))
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
521
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def getSeqWithOnlyATGCN( self ):
    """Return the sequence with every symbol converted by getATGCNFromIUPAC().

    self.sequence itself is left untouched. Each symbol is handed, in
    order, to self.getATGCNFromIUPAC() and the results are concatenated.

    @return the converted sequence (string)
    """
    # join() assembles the result in a single pass instead of growing a
    # string with += once per nucleotide.
    return "".join( self.getATGCNFromIUPAC( nt ) for nt in self.sequence )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
527
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
528
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
529 ## Replace any symbol not in (A,T,G,C,N) by another nucleotide it represents
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
530 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def partialIUPAC( self ):
    """Overwrite self.sequence with the result of self.getSeqWithOnlyATGCN()."""
    converted = self.getSeqWithOnlyATGCN()
    self.sequence = converted
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
533
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
534
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
535 ## Remove non Unix end-of-line symbols, if any
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
536 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def checkEOF( self ):
    """Strip Windows carriage-return characters from the sequence.

    When self.sequence contains at least one carriage return, a warning
    naming self.header is written on standard output and every carriage
    return is removed from self.sequence. Otherwise nothing happens.
    """
    symbol = "\r"   # corresponds to '^M' from Windows
    if symbol not in self.sequence:
        return
    # sys.stdout.write() emits exactly what the former print statement did,
    # but is valid syntax on both Python 2 and Python 3.
    sys.stdout.write( "WARNING: Windows EOF removed in '%s'\n" % ( self.header ) )
    sys.stdout.flush()
    self.sequence = self.sequence.replace( symbol, "" )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
544
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
545
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
546 ## Write Bioseq instance into a fasta file handler
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
547 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
548 # @param faFileHandler file handler of a fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
549 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def write( self, faFileHandler ):
    """Write this Bioseq in fasta format into an open file handler.

    The header line (">" followed by self.header) is written first, then
    the sequence lines are delegated to self.writeSeqInFasta().

    @param faFileHandler file handler of a fasta file
    """
    headerLine = ">%s\n" % ( self.header )
    faFileHandler.write( headerLine )
    self.writeSeqInFasta( faFileHandler )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
553
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
554
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
555 ## Write only the sequence of Bioseq instance into a fasta file handler
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
556 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
557 # @param faFileHandler file handler of a fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
558 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def writeSeqInFasta( self, faFileHandler ):
    """Write the sequence only, as lines of at most 60 characters.

    Each line is terminated by a newline; nothing is written when
    self.getLength() is 0. The header is not written.

    @param faFileHandler file handler of a fasta file
    """
    lineWidth = 60
    for start in range( 0, self.getLength(), lineWidth ):
        faFileHandler.write( "%s\n" % ( self.sequence[start:start+lineWidth] ) )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
564
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
565
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
566 ## Append Bioseq instance to a fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
567 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
568 # @param faFile name of a fasta file as a string
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
569 # @param mode file opening mode passed to open(): 'w' (write) or 'a' (append, the default)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
570 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def save( self, faFile, mode="a" ):
    """Write this Bioseq (header and sequence) into a fasta file.

    The file is opened with the given mode, self.write() is called on the
    resulting handler, and the file is closed again.

    @param faFile name of a fasta file as a string
    @param mode mode string handed to open(): "a" (default) appends to the
           file, "w" overwrites it
    """
    # The with-statement closes the handler even when self.write() raises,
    # so a failed write no longer leaks an open file.
    with open( faFile, mode ) as faFileHandler:
        self.write( faFileHandler )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
575
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
576
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
577 ## Append Bioseq instance to a fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
578 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
579 # @param faFile name of a fasta file as a string
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
580 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def appendBioseqInFile( self, faFile ):
    """Append this Bioseq to a fasta file by calling self.save() in "a" mode.

    @param faFile name of a fasta file as a string
    """
    appendMode = "a"
    self.save( faFile, appendMode )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
583
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
584
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
585 ## Write Bioseq instance into a fasta file handler
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
586 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
587 # @param faFileHandler file handler on a file with writing right
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
588 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def writeABioseqInAFastaFile( self, faFileHandler ):
    """Write this Bioseq into a fasta file handler; plain alias of self.write().

    @param faFileHandler file handler on a file with writing right
    """
    target = faFileHandler
    self.write( target )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
591
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
592
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
593 ## Write Bioseq instance with other header into a fasta file handler
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
594 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
595 # @param faFileHandler file handler on a file with writing right
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
596 # @param otherHeader a string representing a new header (without the > and the \n)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
597 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def writeWithOtherHeader( self, faFileHandler, otherHeader ):
    """Write this Bioseq into a fasta file handler under another header.

    Note that self.header is overwritten with otherHeader before writing
    and is not restored afterwards.

    @param faFileHandler file handler on a file with writing right
    @param otherHeader a string representing a new header (without the > and the \\n)
    """
    # the instance keeps the new header once the record is written
    self.header = otherHeader
    self.write( faFileHandler )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
601
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
602
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
603 ## Append Bioseq header and Bioseq sequence in a fasta file
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
604 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
605 # @param faFileHandler file handler on a file with writing right
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
606 # @param otherHeader a string representing a new header (without the > and the \n)
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
607 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def writeABioseqInAFastaFileWithOtherHeader( self, faFileHandler, otherHeader ):
    """Plain alias of self.writeWithOtherHeader(), called with the same arguments.

    @param faFileHandler file handler on a file with writing right
    @param otherHeader a string representing a new header (without the > and the \\n)
    """
    target, newHeader = faFileHandler, otherHeader
    self.writeWithOtherHeader( target, newHeader )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
610
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
611
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
612 ## get the list of Maps corresponding to seq without gap
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
613 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
614 # @warning This method was called getMap() in pyRepet.Bioseq
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
615 # @return a list of Map object
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
616 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def getLMapWhithoutGap( self ):
    """Return one Map per maximal stretch of the sequence holding no gap.

    A gap is the "-" symbol. Coordinates are 1-based and inclusive. The
    stretches are numbered from 1 in order of appearance: each Map is named
    "<header>_subSeq<number>" and refers to self.header as its sequence.
    An empty or all-gap sequence gives an empty list, and a sequence made
    of a single ungapped site gives one Map covering that site.

    @warning This method was called getMap() in pyRepet.Bioseq
    @return a list of Map object
    """
    lMaps = []

    def _record( start, end ):
        # the number of the stretch is its rank in the list being built
        name = "%s_subSeq%i" % ( self.header, len( lMaps ) + 1 )
        lMaps.append( Map( name, self.header, start, end ) )

    startMap = None   # 1-based start of the stretch being read, None inside a gap
    for countSite, site in enumerate( self.sequence, 1 ):
        if site == "-":
            # first gap symbol after a stretch: the stretch ended one site before
            if startMap is not None:
                _record( startMap, countSite - 1 )
                startMap = None
        elif startMap is None:
            # first site of the sequence or first site after a gap
            startMap = countSite

    # a stretch reaching the last site is closed by the end of the sequence
    if startMap is not None:
        _record( startMap, len( self.sequence ) )

    return lMaps
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
659
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
660
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
661 ## get the percentage of GC
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
662 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
663 # @return a percentage
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
664 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def getGCpercentage( self ):
    """Return the percentage of G and C in the sequence.

    G and C are counted in the result of self.getSeqWithOnlyATGCN() and
    the count is divided by self.getLength(). An empty sequence gives 0.0
    instead of a division by zero.

    @return a percentage (float between 0 and 100)
    """
    length = self.getLength()
    if length == 0:
        # no site at all: there is nothing to count
        return 0.0
    tmpSeq = self.getSeqWithOnlyATGCN()
    nbGC = tmpSeq.count( "G" ) + tmpSeq.count( "C" )
    return 100 * nbGC / float( length )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
669
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
670 ## get the percentage of GC of a sequence without counting N in sequence length
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
671 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
672 # @return a percentage
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
673 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def getGCpercentageInSequenceWithoutCountNInLength(self):
    """Return the percentage of G and C, the N being left out of the length.

    G and C are counted in the result of self.getSeqWithOnlyATGCN(); the
    count is divided by self.getLength() minus self.countNt( "N" ). When
    that difference is 0 (empty sequence, or nothing but N), 0.0 is
    returned instead of a division by zero.

    @return a percentage (float between 0 and 100)
    """
    lengthWithoutN = self.getLength() - self.countNt( "N" )
    if lengthWithoutN == 0:
        # no informative site: there is nothing to count
        return 0.0
    tmpSeq = self.getSeqWithOnlyATGCN()
    nbGC = tmpSeq.count( "G" ) + tmpSeq.count( "C" )
    return 100 * nbGC / float( lengthWithoutN )
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
678
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
679 ## get the 5 prime subsequence of a given length at the given position
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
680 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
681 # @param position integer
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
682 # @param flankLength integer subsequence length
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
683 # @return a sequence string
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
684 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def get5PrimeFlank(self, position, flankLength):
    """Return the sites located just upstream (5') of a position.

    The flank ends at position - 1 and starts flankLength sites before
    the position, or at site 1 when fewer sites are available. It is
    extracted with self.subseq(), called with 1-based start and end.

    @param position integer
    @param flankLength integer subsequence length
    @return a sequence string, "" when position is 1
    """
    if position == 1:
        # nothing lies upstream of the first site
        return ""
    firstSite = max( 1, position - flankLength )
    lastSite = position - 1
    return self.subseq( firstSite, lastSite ).sequence
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
698
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
699
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
700 ## get the 3 prime subsequence of a given length at the given position
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
701 # In the case of indels, the polymorphism length can be specified
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
702 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
703 # @param position integer
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
704 # @param flankLength integer subsequence length
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
705 # @param polymLength integer polymorphism length
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
706 # @return a sequence string
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
707 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def get3PrimeFlank(self, position, flankLength, polymLength = 1):
    """Return the sites located just downstream (3') of a polymorphism.

    The flank starts at position + polymLength and spans flankLength
    sites, or stops at the last site of the sequence when fewer sites are
    available. It is extracted with self.subseq(), called with 1-based
    start and end.

    @param position integer
    @param flankLength integer subsequence length
    @param polymLength integer polymorphism length
    @return a sequence string, "" when the polymorphism reaches the end
            of the sequence
    """
    seqLength = len( self.sequence )
    firstSite = position + polymLength
    if firstSite > seqLength:
        # nothing lies downstream of the polymorphism
        return ""
    lastSite = min( seqLength, firstSite + flankLength - 1 )
    return self.subseq( firstSite, lastSite ).sequence
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
720
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
721
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def _createWordList(self,size,l=None):
    """Build the list of all words obtained by extending prefixes with A, T, G, C.

    Starting from the prefixes in l (the four nucleotides by default),
    every prefix is extended size - 1 times by each of A, T, G and C. With
    the default prefixes the result is the 4**size words of length size,
    ordered with A < T < G < C.

    @param size integer length of the words when l is left to its default;
           must be at least 1
    @param l list of starting prefixes, None meaning ['A','T','G','C']
    @return a new list of words; neither l nor any shared default is
            returned, so the caller may modify the result freely
    @raise ValueError when size is lower than 1
    """
    alphabet = ['A','T','G','C']
    if size < 1:
        # such a size used to recurse until the interpreter gave up
        raise ValueError( "word size must be at least 1, got %r" % ( size, ) )
    # always work on a copy: a mutable default shared between calls could
    # be corrupted by any caller modifying the list it got back
    words = list( alphabet ) if l is None else list( l )
    remaining = size
    while remaining > 1:
        words = [ prefix + nt for prefix in words for nt in alphabet ]
        remaining -= 1
    return words
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
731
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
732
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
def removeSymbol( self, symbol ):
    """Delete every occurrence of `symbol` from self.sequence (in place).

    @param symbol string to remove from the sequence
    """
    self.sequence = self.sequence.replace( symbol, "" )