Mercurial > repos > yufei-luo > s_mart
comparison commons/core/seq/BioseqUtils.py @ 6:769e306b7933
Change the repository level.
author | yufei-luo |
---|---|
date | Fri, 18 Jan 2013 04:54:14 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:ea3082881bf8 | 6:769e306b7933 |
---|---|
1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
2 # http://www.inra.fr | |
3 # http://urgi.versailles.inra.fr | |
4 # | |
5 # This software is governed by the CeCILL license under French law and | |
6 # abiding by the rules of distribution of free software. You can use, | |
7 # modify and/ or redistribute the software under the terms of the CeCILL | |
8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
9 # "http://www.cecill.info". | |
10 # | |
11 # As a counterpart to the access to the source code and rights to copy, | |
12 # modify and redistribute granted by the license, users are provided only | |
13 # with a limited warranty and the software's author, the holder of the | |
14 # economic rights, and the successive licensors have only limited | |
15 # liability. | |
16 # | |
17 # In this respect, the user's attention is drawn to the risks associated | |
18 # with loading, using, modifying and/or developing or reproducing the | |
19 # software by the user in light of its specific status of free software, | |
20 # that may mean that it is complicated to manipulate, and that also | |
21 # therefore means that it is reserved for developers and experienced | |
22 # professionals having in-depth computer knowledge. Users are therefore | |
23 # encouraged to load and test the software's suitability as regards their | |
24 # requirements in conditions enabling the security of their systems and/or | |
25 # data to be ensured and, more generally, to use and operate it in the | |
26 # same conditions as regards security. | |
27 # | |
28 # The fact that you are presently reading this means that you have had | |
29 # knowledge of the CeCILL license and that you accept its terms. | |
30 | |
31 | |
32 import math | |
33 import re | |
34 from commons.core.seq.Bioseq import Bioseq | |
35 | |
36 ## Static methods for sequences manipulation | |
37 # | |
38 class BioseqUtils(object): | |
39 | |
40 ## Translate a nucleotide sequence | |
41 # | |
42 # @param bioSeqInstanceToTranslate a bioseq instance to translate | |
43 # @param phase a integer : 1 (default), 2 or 3 | |
44 # | |
45 def translateSequence(bioSeqInstanceToTranslate, phase=1): | |
46 pep = "" | |
47 #length = math.floor((len(self.sequence)-phase-1)/3)*3 | |
48 length = int( math.floor( ( len(bioSeqInstanceToTranslate.sequence )-( phase-1 ) )/3 )*3 ) | |
49 #We need capital letters ! | |
50 bioSeqInstanceToTranslate.upCase() | |
51 sequence = bioSeqInstanceToTranslate.sequence | |
52 for i in xrange(phase-1,length,3): | |
53 if (sequence[i:i+3] == "TTT" or sequence[i:i+3] == "TTC"): | |
54 pep = pep + "F" | |
55 elif ( sequence[i:i+3] == "TTA" or sequence[i:i+3] == "TTG" ): | |
56 pep = pep + "L" | |
57 elif ( sequence[i:i+2] == "CT" ): | |
58 pep = pep + "L" | |
59 elif ( sequence[i:i+3] == "ATT" or sequence[i:i+3] == "ATC" or sequence[i:i+3] == "ATA" ): | |
60 pep = pep + "I" | |
61 elif ( sequence[i:i+3] == "ATG" ): | |
62 pep = pep + "M" | |
63 elif ( sequence[i:i+2] == "GT" ): | |
64 pep = pep + "V" | |
65 elif ( sequence[i:i+2] == "TC" ) : | |
66 pep = pep + "S" | |
67 elif ( sequence[i:i+2] == "CC" ) : | |
68 pep = pep + "P" | |
69 elif ( sequence[i:i+2] == "AC" ) : | |
70 pep = pep + "T" | |
71 elif ( sequence[i:i+2] == "GC" ) : | |
72 pep = pep + "A" | |
73 elif ( sequence[i:i+3] == "TAT" or sequence[i:i+3] == "TAC" ) : | |
74 pep = pep + "Y" | |
75 elif ( sequence[i:i+3] == "TAA" or sequence[i:i+3] == "TAG" ) : | |
76 pep = pep + "*" | |
77 elif ( sequence[i:i+3] == "CAT" or sequence[i:i+3] == "CAC" ) : | |
78 pep = pep + "H" | |
79 elif ( sequence[i:i+3] == "CAA" or sequence[i:i+3] == "CAG" ) : | |
80 pep = pep + "Q" | |
81 elif ( sequence[i:i+3] == "AAT" or sequence[i:i+3] == "AAC" ) : | |
82 pep = pep + "N" | |
83 elif ( sequence[i:i+3] == "AAA" or sequence[i:i+3] == "AAG" ) : | |
84 pep = pep + "K" | |
85 elif ( sequence[i:i+3] == "GAT" or sequence[i:i+3] == "GAC" ) : | |
86 pep = pep + "D" | |
87 elif ( sequence[i:i+3] == "GAA" or sequence[i:i+3] == "GAG" ) : | |
88 pep = pep + "E" | |
89 elif ( sequence[i:i+3] == "TGT" or sequence[i:i+3] == "TGC" ) : | |
90 pep = pep + "C" | |
91 elif ( sequence[i:i+3] == "TGA" ) : | |
92 pep = pep + "*" | |
93 elif ( sequence[i:i+3] == "TGG" ) : | |
94 pep = pep + "W" | |
95 elif ( sequence[i:i+2] == "CG" ) : | |
96 pep = pep + "R" | |
97 elif ( sequence[i:i+3] == "AGT" or sequence[i:i+3] == "AGC" ) : | |
98 pep = pep + "S" | |
99 elif ( sequence[i:i+3] == "AGA" or sequence[i:i+3] == "AGG" ) : | |
100 pep = pep + "R" | |
101 elif ( sequence[i:i+2] == "GG" ): | |
102 pep = pep + "G" | |
103 #We don't know the amino acid because we don't have the nucleotide | |
104 #R Purine (A or G) | |
105 #Y Pyrimidine (C, T, or U) | |
106 #M C or A | |
107 #K T, U, or G | |
108 #W T, U, or A | |
109 #S C or G | |
110 #B C, T, U, or G (not A) | |
111 #D A, T, U, or G (not C) | |
112 #H A, T, U, or C (not G) | |
113 #V A, C, or G (not T, not U) | |
114 #N Unknown nucleotide | |
115 elif ( re.search("N|R|Y|M|K|W|S|B|D|H|V", sequence[i:i+3])): | |
116 pep = pep + "X" | |
117 bioSeqInstanceToTranslate.sequence = pep | |
118 | |
119 translateSequence = staticmethod(translateSequence) | |
120 | |
121 ## Add the frame info in header | |
122 # | |
123 # @param bioSeqInstance a bioseq instance to translate | |
124 # @param phase a integer : 1 , 2 or 3 | |
125 # | |
126 def setFrameInfoOnHeader(bioSeqInstance, phase): | |
127 if " " in bioSeqInstance.header: | |
128 name, desc = bioSeqInstance.header.split(" ", 1) | |
129 name = name + "_" + str(phase) | |
130 bioSeqInstance.header = name + " " + desc | |
131 else: | |
132 bioSeqInstance.header = bioSeqInstance.header + "_" + str(phase) | |
133 | |
134 setFrameInfoOnHeader = staticmethod(setFrameInfoOnHeader) | |
135 | |
136 ## Translate a nucleotide sequence for all frames (positives and negatives) | |
137 # | |
138 # @param bioSeqInstanceToTranslate a bioseq instance to translate | |
139 # | |
140 def translateInAllFrame( bioSeqInstanceToTranslate ): | |
141 positives = BioseqUtils._translateInPositiveFrames( bioSeqInstanceToTranslate ) | |
142 negatives = BioseqUtils._translateInNegativeFrames( bioSeqInstanceToTranslate ) | |
143 listAll6Frames = [] | |
144 listAll6Frames.extend(positives) | |
145 listAll6Frames.extend(negatives) | |
146 return listAll6Frames | |
147 | |
148 translateInAllFrame = staticmethod(translateInAllFrame) | |
149 | |
150 ## Replace the stop codons by X in sequence | |
151 # | |
152 # @param bioSeqInstance a bioseq instance | |
153 # | |
154 def replaceStopCodonsByX( bioSeqInstance ): | |
155 bioSeqInstance.sequence = bioSeqInstance.sequence.replace ("*", "X") | |
156 | |
157 replaceStopCodonsByX = staticmethod(replaceStopCodonsByX) | |
158 | |
159 ## Translate in a list all the frames of all the bioseq of bioseq list | |
160 # | |
161 # @param bioseqList a list of bioseq instances | |
162 # @return a list of translated bioseq instances | |
163 # | |
164 def translateBioseqListInAllFrames( bioseqList ): | |
165 bioseqListInAllFrames = [] | |
166 for bioseq in bioseqList : | |
167 bioseqListInAllFrames.extend(BioseqUtils.translateInAllFrame(bioseq)) | |
168 return bioseqListInAllFrames | |
169 | |
170 translateBioseqListInAllFrames = staticmethod( translateBioseqListInAllFrames ) | |
171 | |
172 ## Replace the stop codons by X for each sequence of a bioseq list | |
173 # | |
174 # @param lBioseqWithStops a list of bioseq instances | |
175 # @return a list of bioseq instances | |
176 # | |
177 def replaceStopCodonsByXInBioseqList ( lBioseqWithStops ): | |
178 bioseqListWithStopsreplaced = [] | |
179 for bioseq in lBioseqWithStops: | |
180 BioseqUtils.replaceStopCodonsByX(bioseq) | |
181 bioseqListWithStopsreplaced.append(bioseq) | |
182 return bioseqListWithStopsreplaced | |
183 | |
184 replaceStopCodonsByXInBioseqList = staticmethod( replaceStopCodonsByXInBioseqList ) | |
185 | |
186 ## Write a list of bioseq instances in a fasta file (60 characters per line) | |
187 # | |
188 # @param lBioseq a list of bioseq instances | |
189 # @param fileName string | |
190 # | |
191 def writeBioseqListIntoFastaFile( lBioseq, fileName ): | |
192 fout = open(fileName, "w") | |
193 for bioseq in lBioseq: | |
194 bioseq.write(fout) | |
195 fout.close() | |
196 | |
197 writeBioseqListIntoFastaFile = staticmethod( writeBioseqListIntoFastaFile ) | |
198 | |
199 ## read in a fasta file and create a list of bioseq instances | |
200 # | |
201 # @param fileName string | |
202 # @return a list of bioseq | |
203 # | |
204 def extractBioseqListFromFastaFile( fileName ): | |
205 file = open( fileName ) | |
206 lBioseq = [] | |
207 currentHeader = "" | |
208 while currentHeader != None: | |
209 bioseq = Bioseq() | |
210 bioseq.read(file) | |
211 currentHeader = bioseq.header | |
212 if currentHeader != None: | |
213 lBioseq.append(bioseq) | |
214 return lBioseq | |
215 | |
216 extractBioseqListFromFastaFile = staticmethod( extractBioseqListFromFastaFile ) | |
217 | |
218 ## Give the length of a sequence search by name | |
219 # | |
220 # @param lBioseq a list of bioseq instances | |
221 # @param seqName string | |
222 # @return an integer | |
223 # | |
224 def getSeqLengthWithSeqName( lBioseq, seqName ): | |
225 length = 0 | |
226 for bioseq in lBioseq: | |
227 if bioseq.header == seqName: | |
228 length = bioseq.getLength() | |
229 break | |
230 return length | |
231 | |
232 getSeqLengthWithSeqName = staticmethod( getSeqLengthWithSeqName ) | |
233 | |
234 def _translateInPositiveFrames( bioSeqInstanceToTranslate ): | |
235 seq1 = bioSeqInstanceToTranslate.copyBioseqInstance() | |
236 BioseqUtils.setFrameInfoOnHeader(seq1, 1) | |
237 BioseqUtils.translateSequence(seq1, 1) | |
238 seq2 = bioSeqInstanceToTranslate.copyBioseqInstance() | |
239 BioseqUtils.setFrameInfoOnHeader(seq2, 2) | |
240 BioseqUtils.translateSequence(seq2, 2) | |
241 seq3 = bioSeqInstanceToTranslate.copyBioseqInstance() | |
242 BioseqUtils.setFrameInfoOnHeader(seq3, 3) | |
243 BioseqUtils.translateSequence(seq3, 3) | |
244 return [seq1, seq2, seq3] | |
245 | |
246 _translateInPositiveFrames = staticmethod( _translateInPositiveFrames ) | |
247 | |
248 def _translateInNegativeFrames(bioSeqInstanceToTranslate): | |
249 seq4 = bioSeqInstanceToTranslate.copyBioseqInstance() | |
250 seq4.reverseComplement() | |
251 BioseqUtils.setFrameInfoOnHeader(seq4, 4) | |
252 BioseqUtils.translateSequence(seq4, 1) | |
253 seq5 = bioSeqInstanceToTranslate.copyBioseqInstance() | |
254 seq5.reverseComplement() | |
255 BioseqUtils.setFrameInfoOnHeader(seq5, 5) | |
256 BioseqUtils.translateSequence(seq5, 2) | |
257 seq6 = bioSeqInstanceToTranslate.copyBioseqInstance() | |
258 seq6.reverseComplement() | |
259 BioseqUtils.setFrameInfoOnHeader(seq6, 6) | |
260 BioseqUtils.translateSequence(seq6, 3) | |
261 return [seq4, seq5, seq6] | |
262 | |
263 _translateInNegativeFrames = staticmethod( _translateInNegativeFrames ) | |
264 | |
265 | |
266 ## Return a dictionary which keys are sequence headers and values sequence lengths. | |
267 # | |
268 def getLengthPerSeqFromFile( inFile ): | |
269 dHeader2Length = {} | |
270 inFileHandler = open( inFile, "r" ) | |
271 while True: | |
272 iBs = Bioseq() | |
273 iBs.read( inFileHandler ) | |
274 if iBs.sequence == None: | |
275 break | |
276 dHeader2Length[ iBs.header ] = iBs.getLength() | |
277 inFileHandler.close() | |
278 return dHeader2Length | |
279 | |
280 getLengthPerSeqFromFile = staticmethod( getLengthPerSeqFromFile ) | |
281 | |
282 | |
283 ## Return the list of Bioseq instances, these being sorted in decreasing length | |
284 # | |
285 def getBioseqListSortedByDecreasingLength( lBioseqs ): | |
286 return sorted( lBioseqs, key=lambda iBs: ( iBs.getLength() ), reverse=True ) | |
287 | |
288 getBioseqListSortedByDecreasingLength = staticmethod( getBioseqListSortedByDecreasingLength ) | |
289 | |
290 | |
291 ## Return the list of Bioseq instances, these being sorted in decreasing length (without gaps) | |
292 # | |
293 def getBioseqListSortedByDecreasingLengthWithoutGaps( lBioseqs ): | |
294 return sorted( lBioseqs, key=lambda iBs: ( len(iBs.sequence.replace("-","")) ), reverse=True ) | |
295 | |
296 getBioseqListSortedByDecreasingLengthWithoutGaps = staticmethod( getBioseqListSortedByDecreasingLengthWithoutGaps ) |