6
|
1 # Copyright INRA (Institut National de la Recherche Agronomique)
|
|
2 # http://www.inra.fr
|
|
3 # http://urgi.versailles.inra.fr
|
|
4 #
|
|
5 # This software is governed by the CeCILL license under French law and
|
|
6 # abiding by the rules of distribution of free software. You can use,
|
|
7 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
8 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
9 # "http://www.cecill.info".
|
|
10 #
|
|
11 # As a counterpart to the access to the source code and rights to copy,
|
|
12 # modify and redistribute granted by the license, users are provided only
|
|
13 # with a limited warranty and the software's author, the holder of the
|
|
14 # economic rights, and the successive licensors have only limited
|
|
15 # liability.
|
|
16 #
|
|
17 # In this respect, the user's attention is drawn to the risks associated
|
|
18 # with loading, using, modifying and/or developing or reproducing the
|
|
19 # software by the user in light of its specific status of free software,
|
|
20 # that may mean that it is complicated to manipulate, and that also
|
|
21 # therefore means that it is reserved for developers and experienced
|
|
22 # professionals having in-depth computer knowledge. Users are therefore
|
|
23 # encouraged to load and test the software's suitability as regards their
|
|
24 # requirements in conditions enabling the security of their systems and/or
|
|
25 # data to be ensured and, more generally, to use and operate it in the
|
|
26 # same conditions as regards security.
|
|
27 #
|
|
28 # The fact that you are presently reading this means that you have had
|
|
29 # knowledge of the CeCILL license and that you accept its terms.
|
|
30
|
|
31
|
|
32 import math
|
|
33 import re
|
|
34 from commons.core.seq.Bioseq import Bioseq
|
|
35
|
|
36 ## Static methods for sequences manipulation
|
|
37 #
|
|
38 class BioseqUtils(object):
|
|
39
|
|
40 ## Translate a nucleotide sequence
|
|
41 #
|
|
42 # @param bioSeqInstanceToTranslate a bioseq instance to translate
|
|
43 # @param phase a integer : 1 (default), 2 or 3
|
|
44 #
|
|
45 def translateSequence(bioSeqInstanceToTranslate, phase=1):
|
|
46 pep = ""
|
|
47 #length = math.floor((len(self.sequence)-phase-1)/3)*3
|
|
48 length = int( math.floor( ( len(bioSeqInstanceToTranslate.sequence )-( phase-1 ) )/3 )*3 )
|
|
49 #We need capital letters !
|
|
50 bioSeqInstanceToTranslate.upCase()
|
|
51 sequence = bioSeqInstanceToTranslate.sequence
|
|
52 for i in xrange(phase-1,length,3):
|
|
53 if (sequence[i:i+3] == "TTT" or sequence[i:i+3] == "TTC"):
|
|
54 pep = pep + "F"
|
|
55 elif ( sequence[i:i+3] == "TTA" or sequence[i:i+3] == "TTG" ):
|
|
56 pep = pep + "L"
|
|
57 elif ( sequence[i:i+2] == "CT" ):
|
|
58 pep = pep + "L"
|
|
59 elif ( sequence[i:i+3] == "ATT" or sequence[i:i+3] == "ATC" or sequence[i:i+3] == "ATA" ):
|
|
60 pep = pep + "I"
|
|
61 elif ( sequence[i:i+3] == "ATG" ):
|
|
62 pep = pep + "M"
|
|
63 elif ( sequence[i:i+2] == "GT" ):
|
|
64 pep = pep + "V"
|
|
65 elif ( sequence[i:i+2] == "TC" ) :
|
|
66 pep = pep + "S"
|
|
67 elif ( sequence[i:i+2] == "CC" ) :
|
|
68 pep = pep + "P"
|
|
69 elif ( sequence[i:i+2] == "AC" ) :
|
|
70 pep = pep + "T"
|
|
71 elif ( sequence[i:i+2] == "GC" ) :
|
|
72 pep = pep + "A"
|
|
73 elif ( sequence[i:i+3] == "TAT" or sequence[i:i+3] == "TAC" ) :
|
|
74 pep = pep + "Y"
|
|
75 elif ( sequence[i:i+3] == "TAA" or sequence[i:i+3] == "TAG" ) :
|
|
76 pep = pep + "*"
|
|
77 elif ( sequence[i:i+3] == "CAT" or sequence[i:i+3] == "CAC" ) :
|
|
78 pep = pep + "H"
|
|
79 elif ( sequence[i:i+3] == "CAA" or sequence[i:i+3] == "CAG" ) :
|
|
80 pep = pep + "Q"
|
|
81 elif ( sequence[i:i+3] == "AAT" or sequence[i:i+3] == "AAC" ) :
|
|
82 pep = pep + "N"
|
|
83 elif ( sequence[i:i+3] == "AAA" or sequence[i:i+3] == "AAG" ) :
|
|
84 pep = pep + "K"
|
|
85 elif ( sequence[i:i+3] == "GAT" or sequence[i:i+3] == "GAC" ) :
|
|
86 pep = pep + "D"
|
|
87 elif ( sequence[i:i+3] == "GAA" or sequence[i:i+3] == "GAG" ) :
|
|
88 pep = pep + "E"
|
|
89 elif ( sequence[i:i+3] == "TGT" or sequence[i:i+3] == "TGC" ) :
|
|
90 pep = pep + "C"
|
|
91 elif ( sequence[i:i+3] == "TGA" ) :
|
|
92 pep = pep + "*"
|
|
93 elif ( sequence[i:i+3] == "TGG" ) :
|
|
94 pep = pep + "W"
|
|
95 elif ( sequence[i:i+2] == "CG" ) :
|
|
96 pep = pep + "R"
|
|
97 elif ( sequence[i:i+3] == "AGT" or sequence[i:i+3] == "AGC" ) :
|
|
98 pep = pep + "S"
|
|
99 elif ( sequence[i:i+3] == "AGA" or sequence[i:i+3] == "AGG" ) :
|
|
100 pep = pep + "R"
|
|
101 elif ( sequence[i:i+2] == "GG" ):
|
|
102 pep = pep + "G"
|
|
103 #We don't know the amino acid because we don't have the nucleotide
|
|
104 #R Purine (A or G)
|
|
105 #Y Pyrimidine (C, T, or U)
|
|
106 #M C or A
|
|
107 #K T, U, or G
|
|
108 #W T, U, or A
|
|
109 #S C or G
|
|
110 #B C, T, U, or G (not A)
|
|
111 #D A, T, U, or G (not C)
|
|
112 #H A, T, U, or C (not G)
|
|
113 #V A, C, or G (not T, not U)
|
|
114 #N Unknown nucleotide
|
|
115 elif ( re.search("N|R|Y|M|K|W|S|B|D|H|V", sequence[i:i+3])):
|
|
116 pep = pep + "X"
|
|
117 bioSeqInstanceToTranslate.sequence = pep
|
|
118
|
|
119 translateSequence = staticmethod(translateSequence)
|
|
120
|
|
121 ## Add the frame info in header
|
|
122 #
|
|
123 # @param bioSeqInstance a bioseq instance to translate
|
|
124 # @param phase a integer : 1 , 2 or 3
|
|
125 #
|
|
126 def setFrameInfoOnHeader(bioSeqInstance, phase):
|
|
127 if " " in bioSeqInstance.header:
|
|
128 name, desc = bioSeqInstance.header.split(" ", 1)
|
|
129 name = name + "_" + str(phase)
|
|
130 bioSeqInstance.header = name + " " + desc
|
|
131 else:
|
|
132 bioSeqInstance.header = bioSeqInstance.header + "_" + str(phase)
|
|
133
|
|
134 setFrameInfoOnHeader = staticmethod(setFrameInfoOnHeader)
|
|
135
|
|
136 ## Translate a nucleotide sequence for all frames (positives and negatives)
|
|
137 #
|
|
138 # @param bioSeqInstanceToTranslate a bioseq instance to translate
|
|
139 #
|
|
140 def translateInAllFrame( bioSeqInstanceToTranslate ):
|
|
141 positives = BioseqUtils._translateInPositiveFrames( bioSeqInstanceToTranslate )
|
|
142 negatives = BioseqUtils._translateInNegativeFrames( bioSeqInstanceToTranslate )
|
|
143 listAll6Frames = []
|
|
144 listAll6Frames.extend(positives)
|
|
145 listAll6Frames.extend(negatives)
|
|
146 return listAll6Frames
|
|
147
|
|
148 translateInAllFrame = staticmethod(translateInAllFrame)
|
|
149
|
|
150 ## Replace the stop codons by X in sequence
|
|
151 #
|
|
152 # @param bioSeqInstance a bioseq instance
|
|
153 #
|
|
154 def replaceStopCodonsByX( bioSeqInstance ):
|
|
155 bioSeqInstance.sequence = bioSeqInstance.sequence.replace ("*", "X")
|
|
156
|
|
157 replaceStopCodonsByX = staticmethod(replaceStopCodonsByX)
|
|
158
|
|
159 ## Translate in a list all the frames of all the bioseq of bioseq list
|
|
160 #
|
|
161 # @param bioseqList a list of bioseq instances
|
|
162 # @return a list of translated bioseq instances
|
|
163 #
|
|
164 def translateBioseqListInAllFrames( bioseqList ):
|
|
165 bioseqListInAllFrames = []
|
|
166 for bioseq in bioseqList :
|
|
167 bioseqListInAllFrames.extend(BioseqUtils.translateInAllFrame(bioseq))
|
|
168 return bioseqListInAllFrames
|
|
169
|
|
170 translateBioseqListInAllFrames = staticmethod( translateBioseqListInAllFrames )
|
|
171
|
|
172 ## Replace the stop codons by X for each sequence of a bioseq list
|
|
173 #
|
|
174 # @param lBioseqWithStops a list of bioseq instances
|
|
175 # @return a list of bioseq instances
|
|
176 #
|
|
177 def replaceStopCodonsByXInBioseqList ( lBioseqWithStops ):
|
|
178 bioseqListWithStopsreplaced = []
|
|
179 for bioseq in lBioseqWithStops:
|
|
180 BioseqUtils.replaceStopCodonsByX(bioseq)
|
|
181 bioseqListWithStopsreplaced.append(bioseq)
|
|
182 return bioseqListWithStopsreplaced
|
|
183
|
|
184 replaceStopCodonsByXInBioseqList = staticmethod( replaceStopCodonsByXInBioseqList )
|
|
185
|
|
186 ## Write a list of bioseq instances in a fasta file (60 characters per line)
|
|
187 #
|
|
188 # @param lBioseq a list of bioseq instances
|
|
189 # @param fileName string
|
|
190 #
|
|
191 def writeBioseqListIntoFastaFile( lBioseq, fileName ):
|
|
192 fout = open(fileName, "w")
|
|
193 for bioseq in lBioseq:
|
|
194 bioseq.write(fout)
|
|
195 fout.close()
|
|
196
|
|
197 writeBioseqListIntoFastaFile = staticmethod( writeBioseqListIntoFastaFile )
|
|
198
|
|
199 ## read in a fasta file and create a list of bioseq instances
|
|
200 #
|
|
201 # @param fileName string
|
|
202 # @return a list of bioseq
|
|
203 #
|
|
204 def extractBioseqListFromFastaFile( fileName ):
|
|
205 file = open( fileName )
|
|
206 lBioseq = []
|
|
207 currentHeader = ""
|
|
208 while currentHeader != None:
|
|
209 bioseq = Bioseq()
|
|
210 bioseq.read(file)
|
|
211 currentHeader = bioseq.header
|
|
212 if currentHeader != None:
|
|
213 lBioseq.append(bioseq)
|
|
214 return lBioseq
|
|
215
|
|
216 extractBioseqListFromFastaFile = staticmethod( extractBioseqListFromFastaFile )
|
|
217
|
|
218 ## Give the length of a sequence search by name
|
|
219 #
|
|
220 # @param lBioseq a list of bioseq instances
|
|
221 # @param seqName string
|
|
222 # @return an integer
|
|
223 #
|
|
224 def getSeqLengthWithSeqName( lBioseq, seqName ):
|
|
225 length = 0
|
|
226 for bioseq in lBioseq:
|
|
227 if bioseq.header == seqName:
|
|
228 length = bioseq.getLength()
|
|
229 break
|
|
230 return length
|
|
231
|
|
232 getSeqLengthWithSeqName = staticmethod( getSeqLengthWithSeqName )
|
|
233
|
|
234 def _translateInPositiveFrames( bioSeqInstanceToTranslate ):
|
|
235 seq1 = bioSeqInstanceToTranslate.copyBioseqInstance()
|
|
236 BioseqUtils.setFrameInfoOnHeader(seq1, 1)
|
|
237 BioseqUtils.translateSequence(seq1, 1)
|
|
238 seq2 = bioSeqInstanceToTranslate.copyBioseqInstance()
|
|
239 BioseqUtils.setFrameInfoOnHeader(seq2, 2)
|
|
240 BioseqUtils.translateSequence(seq2, 2)
|
|
241 seq3 = bioSeqInstanceToTranslate.copyBioseqInstance()
|
|
242 BioseqUtils.setFrameInfoOnHeader(seq3, 3)
|
|
243 BioseqUtils.translateSequence(seq3, 3)
|
|
244 return [seq1, seq2, seq3]
|
|
245
|
|
246 _translateInPositiveFrames = staticmethod( _translateInPositiveFrames )
|
|
247
|
|
248 def _translateInNegativeFrames(bioSeqInstanceToTranslate):
|
|
249 seq4 = bioSeqInstanceToTranslate.copyBioseqInstance()
|
|
250 seq4.reverseComplement()
|
|
251 BioseqUtils.setFrameInfoOnHeader(seq4, 4)
|
|
252 BioseqUtils.translateSequence(seq4, 1)
|
|
253 seq5 = bioSeqInstanceToTranslate.copyBioseqInstance()
|
|
254 seq5.reverseComplement()
|
|
255 BioseqUtils.setFrameInfoOnHeader(seq5, 5)
|
|
256 BioseqUtils.translateSequence(seq5, 2)
|
|
257 seq6 = bioSeqInstanceToTranslate.copyBioseqInstance()
|
|
258 seq6.reverseComplement()
|
|
259 BioseqUtils.setFrameInfoOnHeader(seq6, 6)
|
|
260 BioseqUtils.translateSequence(seq6, 3)
|
|
261 return [seq4, seq5, seq6]
|
|
262
|
|
263 _translateInNegativeFrames = staticmethod( _translateInNegativeFrames )
|
|
264
|
|
265
|
|
266 ## Return a dictionary which keys are sequence headers and values sequence lengths.
|
|
267 #
|
|
268 def getLengthPerSeqFromFile( inFile ):
|
|
269 dHeader2Length = {}
|
|
270 inFileHandler = open( inFile, "r" )
|
|
271 while True:
|
|
272 iBs = Bioseq()
|
|
273 iBs.read( inFileHandler )
|
|
274 if iBs.sequence == None:
|
|
275 break
|
|
276 dHeader2Length[ iBs.header ] = iBs.getLength()
|
|
277 inFileHandler.close()
|
|
278 return dHeader2Length
|
|
279
|
|
280 getLengthPerSeqFromFile = staticmethod( getLengthPerSeqFromFile )
|
|
281
|
|
282
|
|
283 ## Return the list of Bioseq instances, these being sorted in decreasing length
|
|
284 #
|
|
285 def getBioseqListSortedByDecreasingLength( lBioseqs ):
|
|
286 return sorted( lBioseqs, key=lambda iBs: ( iBs.getLength() ), reverse=True )
|
|
287
|
|
288 getBioseqListSortedByDecreasingLength = staticmethod( getBioseqListSortedByDecreasingLength )
|
|
289
|
|
290
|
|
291 ## Return the list of Bioseq instances, these being sorted in decreasing length (without gaps)
|
|
292 #
|
|
293 def getBioseqListSortedByDecreasingLengthWithoutGaps( lBioseqs ):
|
|
294 return sorted( lBioseqs, key=lambda iBs: ( len(iBs.sequence.replace("-","")) ), reverse=True )
|
|
295
|
|
296 getBioseqListSortedByDecreasingLengthWithoutGaps = staticmethod( getBioseqListSortedByDecreasingLengthWithoutGaps )
|