comparison commons/core/seq/test/Test_BioseqUtils.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children
comparison
equal deleted inserted replaced
5:ea3082881bf8 6:769e306b7933
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31
32 import unittest
33 import os
34 from commons.core.seq.Bioseq import Bioseq
35 from commons.core.seq.BioseqUtils import BioseqUtils
36 from commons.core.utils.FileUtils import FileUtils
37
38
class Test_BioseqUtils( unittest.TestCase ):
    """Unit tests for the BioseqUtils helpers operating on Bioseq objects.

    Tests are described with ``#`` comments rather than docstrings so that
    the verbose unittest runner keeps reporting the plain method names.
    Tests that need files create them in the current working directory and
    always remove them, even when an assertion fails.
    """

    @staticmethod
    def _newBioseq( header=None, sequence=None ):
        # Build a Bioseq with its no-argument constructor, then assign only
        # the attributes that were supplied (header first, then sequence).
        bioseq = Bioseq()
        if header is not None:
            bioseq.header = header
        if sequence is not None:
            bioseq.sequence = sequence
        return bioseq

    @staticmethod
    def _writeFile( fileName, lLines ):
        # Create (or overwrite) 'fileName' with the given strings written
        # back to back; an empty list yields an empty file.
        with open( fileName, "w" ) as handle:
            handle.writelines( lLines )

    @staticmethod
    def _removeFiles( *lFileNames ):
        # Delete each file that exists; skipping missing ones keeps a
        # clean-up in 'finally' from masking the original test failure.
        for fileName in lFileNames:
            if os.path.exists( fileName ):
                os.remove( fileName )


    def test_translateSequence_one_nt( self ):
        # A one-nucleotide sequence is expected to translate to "".
        bioseq = self._newBioseq( sequence="G" )
        BioseqUtils.translateSequence( bioseq, 1 )
        expSequence = ""
        self.assertEqual( expSequence, bioseq.sequence )


    def test_translateSequence_frame1( self ):
        # Frame 1; the leading 'N' makes the first expected residue 'X'.
        bioseq = self._newBioseq( sequence="NGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTCCGACTAATCAACAATATAATGCGAGTAGAGCTTGA" )
        BioseqUtils.translateSequence( bioseq, 1 )
        expSequence = "XGF*LISL*SQ*FHVGVSWLRLINNIMRVEL"
        self.assertEqual( expSequence, bioseq.sequence )


    def test_translateSequence_frame2( self ):
        # Same nucleotide sequence, translated in frame 2.
        bioseq = self._newBioseq( sequence="NGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTCCGACTAATCAACAATATAATGCGAGTAGAGCTTGA" )
        BioseqUtils.translateSequence( bioseq, 2 )
        expSequence = "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL"
        self.assertEqual( expSequence, bioseq.sequence )


    def test_translateSequence_frame3( self ):
        # Same nucleotide sequence, translated in frame 3.
        bioseq = self._newBioseq( sequence="NGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTCCGACTAATCAACAATATAATGCGAGTAGAGCTTGA" )
        BioseqUtils.translateSequence( bioseq, 3 )
        expSequence = "WLLVDQFMITMISRRCLVAPTNQQYNASRA*"
        self.assertEqual( expSequence, bioseq.sequence )


    def test_setFrameInfoOnHeader( self ):
        # The frame suffix goes on the first word; the description is kept.
        bioseq = self._newBioseq( header="header1 description1 description2" )
        BioseqUtils.setFrameInfoOnHeader( bioseq, 1 )
        expHeader = "header1_1 description1 description2"
        self.assertEqual( expHeader, bioseq.header )


    def test_setFrameInfoOnHeader_header_without_space( self ):
        # A header made of a single word simply gets the suffix appended.
        bioseq = self._newBioseq( header="header" )
        BioseqUtils.setFrameInfoOnHeader( bioseq, 1 )
        expHeader = "header_1"
        self.assertEqual( expHeader, bioseq.header )


    def test_TranslateInAllFrame( self ):
        # One input sequence gives six Bioseq, headers suffixed _1 to _6.
        bioseq = self._newBioseq( "header1", "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTCCGACTAATCAACAATATAATGCGAGTAGAGCTTGA" )

        expLBioseq = [
            self._newBioseq( "header1_1", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
            self._newBioseq( "header1_2", "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL" ),
            self._newBioseq( "header1_3", "WLLVDQFMITMISRRCLVAPTNQQYNASRA*" ),
            self._newBioseq( "header1_4", "SSSTRIILLISRSHETPT*NHCDHKLIN*KP" ),
            self._newBioseq( "header1_5", "QALLALYC*LVGATRHLREIIVIIN*STRSH" ),
            self._newBioseq( "header1_6", "KLYSHYIVD*SEPRDTYVKSL*S*TDQLEAT" ),
        ]
        obsLBioseq = BioseqUtils.translateInAllFrame( bioseq )

        self.assertEqual( expLBioseq, obsLBioseq )


    def test_replaceStopCodonsByX( self ):
        # Every '*' of the protein sequence becomes 'X'.
        bioseq = self._newBioseq( sequence="VASS*SVYDHNDFT*VSRGSD*STI*CE*SL" )
        BioseqUtils.replaceStopCodonsByX( bioseq )
        expSequence = "VASSXSVYDHNDFTXVSRGSDXSTIXCEXSL"
        self.assertEqual( expSequence, bioseq.sequence )


    def test_translateBioseqListInAllFrames_with_empty_list( self ):
        # An empty input list gives an empty output list.
        obsLBioseq = BioseqUtils.translateBioseqListInAllFrames( [] )
        expLBioseq = []
        self.assertEqual( expLBioseq, obsLBioseq )


    def test_translateBioseqListInAllFrames_with_one_item( self ):
        # A single input gives six translations; the description after the
        # first word of the header is preserved.
        lBioseq = [ self._newBioseq( "header1 description", "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTCCGACTAATCAACAATATAATGCGAGTAGAGCTTGA" ) ]
        obsLBioseq = BioseqUtils.translateBioseqListInAllFrames( lBioseq )

        expLBioseq = [
            self._newBioseq( "header1_1 description", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
            self._newBioseq( "header1_2 description", "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL" ),
            self._newBioseq( "header1_3 description", "WLLVDQFMITMISRRCLVAPTNQQYNASRA*" ),
            self._newBioseq( "header1_4 description", "SSSTRIILLISRSHETPT*NHCDHKLIN*KP" ),
            self._newBioseq( "header1_5 description", "QALLALYC*LVGATRHLREIIVIIN*STRSH" ),
            self._newBioseq( "header1_6 description", "KLYSHYIVD*SEPRDTYVKSL*S*TDQLEAT" ),
        ]

        self.assertEqual( expLBioseq, obsLBioseq )


    def test_translateBioseqListInAllFrames( self ):
        # Two inputs give twelve translations, grouped per input sequence.
        lBioseq = [
            self._newBioseq( "header1 description", "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTCCGACTAATCAACAATATAATGCGAGTAGAGCTTGA" ),
            self._newBioseq( "header2", "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTACGACTAATCAACAATATAATGCGAGTAGAGCTTGA" ),
        ]
        obsLBioseq = BioseqUtils.translateBioseqListInAllFrames( lBioseq )

        expLBioseq = [
            self._newBioseq( "header1_1 description", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
            self._newBioseq( "header1_2 description", "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL" ),
            self._newBioseq( "header1_3 description", "WLLVDQFMITMISRRCLVAPTNQQYNASRA*" ),
            self._newBioseq( "header1_4 description", "SSSTRIILLISRSHETPT*NHCDHKLIN*KP" ),
            self._newBioseq( "header1_5 description", "QALLALYC*LVGATRHLREIIVIIN*STRSH" ),
            self._newBioseq( "header1_6 description", "KLYSHYIVD*SEPRDTYVKSL*S*TDQLEAT" ),
            self._newBioseq( "header2_1", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
            self._newBioseq( "header2_2", "VASS*SVYDHNDFT*VSRGYD*STI*CE*SL" ),
            self._newBioseq( "header2_3", "WLLVDQFMITMISRRCLVATTNQQYNASRA*" ),
            self._newBioseq( "header2_4", "SSSTRIILLISRSHETPT*NHCDHKLIN*KP" ),
            self._newBioseq( "header2_5", "QALLALYC*LVVATRHLREIIVIIN*STRSH" ),
            self._newBioseq( "header2_6", "KLYSHYIVD*S*PRDTYVKSL*S*TDQLEAT" ),
        ]
        self.assertEqual( expLBioseq, obsLBioseq )


    def test_replaceStopCodonsByXInBioseqList_empty_list( self ):
        # An empty input list gives an empty output list.
        obsLBioseq = BioseqUtils.replaceStopCodonsByXInBioseqList( [] )
        expLBioseq = []
        self.assertEqual( obsLBioseq, expLBioseq )


    def test_replaceStopCodonsByXInBioseqList_without_stop_codon( self ):
        # A sequence without any '*' comes back unchanged.
        lBioseq = [ self._newBioseq( "header1 description", "CGFLISLSQFHVGVSWLRLINNIMRVEL" ) ]

        obsLBioseq = BioseqUtils.replaceStopCodonsByXInBioseqList( lBioseq )

        expLBioseq = [ self._newBioseq( "header1 description", "CGFLISLSQFHVGVSWLRLINNIMRVEL" ) ]

        self.assertEqual( obsLBioseq, expLBioseq )


    def test_replaceStopCodonsByXInBioseqList( self ):
        # '*' is replaced by 'X' in every sequence; headers are untouched.
        lBioseq = [
            self._newBioseq( "header1 description", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
            self._newBioseq( "header2", "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL" ),
        ]

        obsLBioseq = BioseqUtils.replaceStopCodonsByXInBioseqList( lBioseq )

        expLBioseq = [
            self._newBioseq( "header1 description", "CGFXLISLXSQXFHVGVSWLRLINNIMRVEL" ),
            self._newBioseq( "header2", "VASSXSVYDHNDFTXVSRGSDXSTIXCEXSL" ),
        ]

        self.assertEqual( obsLBioseq, expLBioseq )


    def test_writeBioseqListIntoFastaFile( self ):
        # The written file must be identical to a hand-written FASTA file
        # whose sequences are wrapped at 60 characters per line.
        obsFileName = "dummyWrittenFastaFile.fa"
        expFileName = "dummyFastaFile.fa"

        lBioseq = [
            self._newBioseq( "header1 description", "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTCCGACTAATCAACAATATAATGCGAGTAGAGCTTGA" ),
            self._newBioseq( "header2", "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTACGACTAATCAACAATATAATGCGAGTAGAGCTTGA" ),
        ]

        try:
            BioseqUtils.writeBioseqListIntoFastaFile( lBioseq, obsFileName )

            self._writeFile( expFileName, [
                ">header1 description\n",
                "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTC\n",
                "CGACTAATCAACAATATAATGCGAGTAGAGCTTGA\n",
                ">header2\n",
                "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTA\n",
                "CGACTAATCAACAATATAATGCGAGTAGAGCTTGA\n",
            ] )

            self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) )
        finally:
            # Later tests reuse "dummyFastaFile.fa": never leave it behind.
            self._removeFiles( expFileName, obsFileName )


    def test_extractBioseqListFromFastaFile( self ):
        # Three FASTA records are read back as three Bioseq, in file order.
        fileName = "dummyFastaFile.fa"
        try:
            self._writeFile( fileName, [
                ">header1_1 description1\n",
                "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL\n",
                ">header1_2 description2\n",
                "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL\n",
                ">header1_3 description3\n",
                "CWLLVDQFMITMISRRCLVAPTNQQYNASRA*\n",
            ] )

            expLBioseq = [
                self._newBioseq( "header1_1 description1", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
                self._newBioseq( "header1_2 description2", "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL" ),
                self._newBioseq( "header1_3 description3", "CWLLVDQFMITMISRRCLVAPTNQQYNASRA*" ),
            ]

            obsLBioseq = BioseqUtils.extractBioseqListFromFastaFile( fileName )
            self.assertEqual( expLBioseq, obsLBioseq )
        finally:
            self._removeFiles( fileName )


    def test_extractBioseqListFromFastaFile_empty_seq( self ):
        # A header line without sequence gives a Bioseq with sequence "".
        fileName = "dummyFastaFile.fa"
        try:
            self._writeFile( fileName, [ ">header1_1 description1\n" ] )

            expLBioseq = [ self._newBioseq( "header1_1 description1", "" ) ]

            obsLBioseq = BioseqUtils.extractBioseqListFromFastaFile( fileName )
            self.assertEqual( expLBioseq, obsLBioseq )
        finally:
            self._removeFiles( fileName )


    def test_extractBioseqListFromFastaFile_empty_file( self ):
        # An empty file gives an empty list.
        fileName = "dummyFastaFile.fa"
        try:
            self._writeFile( fileName, [] )

            expLBioseq = []

            obsLBioseq = BioseqUtils.extractBioseqListFromFastaFile( fileName )
            self.assertEqual( expLBioseq, obsLBioseq )
        finally:
            self._removeFiles( fileName )


    def test_getSeqLengthWithSeqName ( self ):
        # The sequence is looked up by its complete header (first item).
        lBioseq = [
            self._newBioseq( "header1 description", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
            self._newBioseq( "header2", "ATGCGTGCGTAAATGCGTATGCGTATGCGTTCGCGAATGCGTGT" ),
        ]

        obsLength = BioseqUtils.getSeqLengthWithSeqName( lBioseq, "header1 description" )
        expLength = 31

        self.assertEqual( expLength, obsLength )


    def test_getSeqLengthWithSeqName_second_item ( self ):
        # Same lookup, the requested sequence being the second of the list.
        lBioseq = [
            self._newBioseq( "header1 description", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
            self._newBioseq( "header2", "ATGCGTGCGTAAATGCGTATGCGTATGCGTTCGCGAATGCGTGT" ),
        ]

        obsLength = BioseqUtils.getSeqLengthWithSeqName( lBioseq, "header2" )
        expLength = 44

        self.assertEqual( expLength, obsLength )


    def test_getSeqLengthWithSeqName_empty_list ( self ):
        # Looking a name up in an empty list gives a length of 0.
        obsLength = BioseqUtils.getSeqLengthWithSeqName( [], "header2" )
        expLength = 0

        self.assertEqual( expLength, obsLength )


    def test_getSeqLengthWithSeqName_empty_sequence ( self ):
        # A matching Bioseq whose sequence is "" gives a length of 0.
        lBioseq = [
            self._newBioseq( "header1 description", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
            self._newBioseq( "header2", "" ),
        ]

        obsLength = BioseqUtils.getSeqLengthWithSeqName( lBioseq, "header2" )
        expLength = 0

        self.assertEqual( expLength, obsLength )


    def test_getSeqLengthWithSeqName_sequence_unknown ( self ):
        # A name absent from the list gives a length of 0.
        lBioseq = [
            self._newBioseq( "header1 description", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
            self._newBioseq( "header2", "ATGCGTGCGTAAATGCGTATGCGTATGCGTTCGCGAATGCGTGT" ),
        ]

        obsLength = BioseqUtils.getSeqLengthWithSeqName( lBioseq, "header3" )
        expLength = 0

        self.assertEqual( expLength, obsLength )


    def test_getLengthPerSeqFromFile( self ):
        # The result maps each sequence name of the file to its length.
        inFile = "dummyInFile"
        try:
            self._writeFile( inFile, [
                ">seq1\nAGCGATGCAGCTA\n",
                ">seq2\nGCGATGCGCATCGACGCGA\n",
            ] )

            dExp = { "seq1": 13, "seq2": 19 }

            dObs = BioseqUtils.getLengthPerSeqFromFile( inFile )

            self.assertEqual( dExp, dObs )
        finally:
            self._removeFiles( inFile )


    def test_getBioseqListSortedByDecreasingLength( self ):
        # Longest sequence first.
        lBioseqs = [ Bioseq( "TE2", "ACC" ),
                     Bioseq( "TE3", "TA" ),
                     Bioseq( "TE1", "AGCG" ) ]
        lExp = [ Bioseq( "TE1", "AGCG" ),
                 Bioseq( "TE2", "ACC" ),
                 Bioseq( "TE3", "TA" ) ]
        lObs = BioseqUtils.getBioseqListSortedByDecreasingLength( lBioseqs )
        self.assertEqual( lExp, lObs )


    def test_getBioseqListSortedByDecreasingLengthWithoutGaps( self ):
        # All inputs are 5 characters long; the expected order follows the
        # number of non-'-' characters (4, 3, 2).
        lBioseqs = [ Bioseq( "TE2", "-ACC-" ),
                     Bioseq( "TE3", "TA---" ),
                     Bioseq( "TE1", "-AGCG" ) ]
        lExp = [ Bioseq( "TE1", "-AGCG" ),
                 Bioseq( "TE2", "-ACC-" ),
                 Bioseq( "TE3", "TA---" ) ]
        lObs = BioseqUtils.getBioseqListSortedByDecreasingLengthWithoutGaps( lBioseqs )
        self.assertEqual( lExp, lObs )
493
494
# Module-level suite holding every test of Test_BioseqUtils. It is built at
# import time, not only when the file is run as a script.
# TestLoader.loadTestsFromTestCase replaces unittest.makeSuite, which is
# deprecated since Python 3.11 and removed in 3.13; both collect the
# "test*" methods of the class.
test_suite = unittest.TestSuite()
test_suite.addTest( unittest.TestLoader().loadTestsFromTestCase( Test_BioseqUtils ) )
if __name__ == "__main__":
    # Run the suite verbosely when the file is executed directly.
    unittest.TextTestRunner(verbosity=2).run( test_suite )