Mercurial > repos > yufei-luo > s_mart
diff smart_toolShed/commons/core/seq/test/Test_BioseqUtils.py @ 0:e0f8dcca02ed
Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author | yufei-luo |
---|---|
date | Thu, 17 Jan 2013 10:52:14 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/commons/core/seq/test/Test_BioseqUtils.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,498 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
import unittest
import os
from commons.core.seq.Bioseq import Bioseq
from commons.core.seq.BioseqUtils import BioseqUtils
from commons.core.utils.FileUtils import FileUtils


# Nucleotide fixtures shared by several tests. _DNA_SEQ_2 differs from
# _DNA_SEQ_1 by a single nucleotide ("...GGCTCCGAC..." vs "...GGCTACGAC...").
_DNA_SEQ_WITH_N = "NGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTCCGACTAATCAACAATATAATGCGAGTAGAGCTTGA"
_DNA_SEQ_1 = "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTCCGACTAATCAACAATATAATGCGAGTAGAGCTTGA"
_DNA_SEQ_2 = "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTACGACTAATCAACAATATAATGCGAGTAGAGCTTGA"

# Expected translations of _DNA_SEQ_1, listed in frame order 1 to 6.
_SIX_FRAMES_SEQ_1 = [
    "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL",
    "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL",
    "WLLVDQFMITMISRRCLVAPTNQQYNASRA*",
    "SSSTRIILLISRSHETPT*NHCDHKLIN*KP",
    "QALLALYC*LVGATRHLREIIVIIN*STRSH",
    "KLYSHYIVD*SEPRDTYVKSL*S*TDQLEAT",
]

# Expected translations of _DNA_SEQ_2, listed in frame order 1 to 6.
_SIX_FRAMES_SEQ_2 = [
    "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL",
    "VASS*SVYDHNDFT*VSRGYD*STI*CE*SL",
    "WLLVDQFMITMISRRCLVATTNQQYNASRA*",
    "SSSTRIILLISRSHETPT*NHCDHKLIN*KP",
    "QALLALYC*LVVATRHLREIIVIIN*STRSH",
    "KLYSHYIVD*S*PRDTYVKSL*S*TDQLEAT",
]

# Protein fixtures used by the stop-codon and sequence-length tests.
_PROTEIN_SEQ = "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL"
_SHORT_DNA_SEQ = "ATGCGTGCGTAAATGCGTATGCGTATGCGTTCGCGAATGCGTGT"


def _makeBioseq(header, sequence):
    """Return a Bioseq whose 'header' and 'sequence' attributes are set."""
    bioseq = Bioseq()
    bioseq.header = header
    bioseq.sequence = sequence
    return bioseq


def _makeFrameBioseqs(name, description, lSequences):
    """Build the expected Bioseq list for a sequence translated frame by frame.

    The i-th sequence (counting from 1) gets the header "<name>_<i>",
    followed by " <description>" when 'description' is not empty.
    """
    lBioseq = []
    for frame, sequence in enumerate(lSequences, 1):
        header = "%s_%d" % (name, frame)
        if description:
            header += " " + description
        lBioseq.append(_makeBioseq(header, sequence))
    return lBioseq


def _writeFile(fileName, lLines):
    """Write the given lines (already newline-terminated) into 'fileName'."""
    with open(fileName, "w") as handle:
        handle.writelines(lLines)


def _removeFiles(*lFileNames):
    """Delete every listed file that exists; missing files are ignored."""
    for fileName in lFileNames:
        # A test may fail before its file is created: do not mask that
        # failure with an OSError raised during cleanup.
        if os.path.exists(fileName):
            os.remove(fileName)


class Test_BioseqUtils(unittest.TestCase):
    """Unit tests for the BioseqUtils helpers."""

    def _checkTranslateSequence(self, sequence, frame, expSequence):
        """Translate 'sequence' in place in 'frame' and compare the result."""
        bioseq = Bioseq()
        bioseq.sequence = sequence
        BioseqUtils.translateSequence(bioseq, frame)
        self.assertEqual(expSequence, bioseq.sequence)

    def _checkSeqLength(self, lBioseq, seqName, expLength):
        """Compare getSeqLengthWithSeqName(lBioseq, seqName) to 'expLength'."""
        obsLength = BioseqUtils.getSeqLengthWithSeqName(lBioseq, seqName)
        self.assertEqual(expLength, obsLength)

    def test_translateSequence_one_nt(self):
        """A single nucleotide translates to an empty sequence."""
        self._checkTranslateSequence("G", 1, "")

    def test_translateSequence_frame1(self):
        """Frame 1: the leading codon containing 'N' is expected as 'X'."""
        self._checkTranslateSequence(
            _DNA_SEQ_WITH_N, 1, "XGF*LISL*SQ*FHVGVSWLRLINNIMRVEL")

    def test_translateSequence_frame2(self):
        """Translation in frame 2."""
        self._checkTranslateSequence(
            _DNA_SEQ_WITH_N, 2, "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL")

    def test_translateSequence_frame3(self):
        """Translation in frame 3."""
        self._checkTranslateSequence(
            _DNA_SEQ_WITH_N, 3, "WLLVDQFMITMISRRCLVAPTNQQYNASRA*")

    def test_setFrameInfoOnHeader(self):
        """The frame suffix is inserted before the header description."""
        bioseq = Bioseq()
        bioseq.header = "header1 description1 description2"
        BioseqUtils.setFrameInfoOnHeader(bioseq, 1)
        expHeader = "header1_1 description1 description2"
        self.assertEqual(expHeader, bioseq.header)

    def test_setFrameInfoOnHeader_header_without_space(self):
        """Without a description the frame suffix is simply appended."""
        bioseq = Bioseq()
        bioseq.header = "header"
        BioseqUtils.setFrameInfoOnHeader(bioseq, 1)
        expHeader = "header_1"
        self.assertEqual(expHeader, bioseq.header)

    def test_TranslateInAllFrame(self):
        """One sequence gives six Bioseq, headers suffixed _1 to _6."""
        bioseq = _makeBioseq("header1", _DNA_SEQ_1)
        expLBioseq = _makeFrameBioseqs("header1", "", _SIX_FRAMES_SEQ_1)
        obsLBioseq = BioseqUtils.translateInAllFrame(bioseq)
        self.assertEqual(expLBioseq, obsLBioseq)

    def test_replaceStopCodonsByX(self):
        """Every '*' of the sequence is replaced by 'X'."""
        bioseq = Bioseq()
        bioseq.sequence = "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL"
        BioseqUtils.replaceStopCodonsByX(bioseq)
        expSequence = "VASSXSVYDHNDFTXVSRGSDXSTIXCEXSL"
        self.assertEqual(expSequence, bioseq.sequence)

    def test_translateBioseqListInAllFrames_with_empty_list(self):
        """An empty input list gives an empty output list."""
        obsLBioseq = BioseqUtils.translateBioseqListInAllFrames([])
        expLBioseq = []
        self.assertEqual(expLBioseq, obsLBioseq)

    def test_translateBioseqListInAllFrames_with_one_item(self):
        """A one-item list gives the six frames of that item."""
        lBioseq = [_makeBioseq("header1 description", _DNA_SEQ_1)]
        obsLBioseq = BioseqUtils.translateBioseqListInAllFrames(lBioseq)
        expLBioseq = _makeFrameBioseqs(
            "header1", "description", _SIX_FRAMES_SEQ_1)
        self.assertEqual(expLBioseq, obsLBioseq)

    def test_translateBioseqListInAllFrames(self):
        """Two items give twelve Bioseq, grouped by input item."""
        lBioseq = [_makeBioseq("header1 description", _DNA_SEQ_1),
                   _makeBioseq("header2", _DNA_SEQ_2)]
        obsLBioseq = BioseqUtils.translateBioseqListInAllFrames(lBioseq)
        expLBioseq = (
            _makeFrameBioseqs("header1", "description", _SIX_FRAMES_SEQ_1)
            + _makeFrameBioseqs("header2", "", _SIX_FRAMES_SEQ_2))
        self.assertEqual(expLBioseq, obsLBioseq)

    def test_replaceStopCodonsByXInBioseqList_empty_list(self):
        """An empty input list gives an empty output list."""
        obsLBioseq = BioseqUtils.replaceStopCodonsByXInBioseqList([])
        expLBioseq = []
        self.assertEqual(obsLBioseq, expLBioseq)

    def test_replaceStopCodonsByXInBioseqList_without_stop_codon(self):
        """A sequence without '*' is returned unchanged."""
        lBioseq = [_makeBioseq("header1 description",
                               "CGFLISLSQFHVGVSWLRLINNIMRVEL")]
        obsLBioseq = BioseqUtils.replaceStopCodonsByXInBioseqList(lBioseq)
        expLBioseq = [_makeBioseq("header1 description",
                                  "CGFLISLSQFHVGVSWLRLINNIMRVEL")]
        self.assertEqual(obsLBioseq, expLBioseq)

    def test_replaceStopCodonsByXInBioseqList(self):
        """Stop codons are replaced in every item; headers are kept."""
        lBioseq = [
            _makeBioseq("header1 description", _PROTEIN_SEQ),
            _makeBioseq("header2", "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL"),
        ]
        obsLBioseq = BioseqUtils.replaceStopCodonsByXInBioseqList(lBioseq)
        expLBioseq = [
            _makeBioseq("header1 description",
                        "CGFXLISLXSQXFHVGVSWLRLINNIMRVEL"),
            _makeBioseq("header2", "VASSXSVYDHNDFTXVSRGSDXSTIXCEXSL"),
        ]
        self.assertEqual(obsLBioseq, expLBioseq)

    def test_writeBioseqListIntoFastaFile(self):
        """The written fasta file wraps sequences at 60 characters per line."""
        obsFileName = "dummyWrittenFastaFile.fa"
        expFileName = "dummyFastaFile.fa"
        lBioseq = [_makeBioseq("header1 description", _DNA_SEQ_1),
                   _makeBioseq("header2", _DNA_SEQ_2)]
        try:
            BioseqUtils.writeBioseqListIntoFastaFile(lBioseq, obsFileName)
            _writeFile(expFileName, [
                ">header1 description\n",
                "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTC\n",
                "CGACTAATCAACAATATAATGCGAGTAGAGCTTGA\n",
                ">header2\n",
                "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTA\n",
                "CGACTAATCAACAATATAATGCGAGTAGAGCTTGA\n",
            ])
            self.assertTrue(
                FileUtils.are2FilesIdentical(expFileName, obsFileName))
        finally:
            # Clean up even when the assertion fails, so that leftover files
            # cannot influence the other tests reusing the same file name.
            _removeFiles(expFileName, obsFileName)

    def test_extractBioseqListFromFastaFile(self):
        """Each fasta record becomes one Bioseq, in file order."""
        fileName = "dummyFastaFile.fa"
        lRecords = [
            ("header1_1 description1", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL"),
            ("header1_2 description2", "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL"),
            ("header1_3 description3", "CWLLVDQFMITMISRRCLVAPTNQQYNASRA*"),
        ]
        expLBioseq = [_makeBioseq(header, sequence)
                      for header, sequence in lRecords]
        try:
            _writeFile(fileName, [">%s\n%s\n" % record
                                  for record in lRecords])
            obsLBioseq = BioseqUtils.extractBioseqListFromFastaFile(fileName)
            self.assertEqual(expLBioseq, obsLBioseq)
        finally:
            _removeFiles(fileName)

    def test_extractBioseqListFromFastaFile_empty_seq(self):
        """A header without sequence gives a Bioseq with an empty sequence."""
        fileName = "dummyFastaFile.fa"
        expLBioseq = [_makeBioseq("header1_1 description1", "")]
        try:
            _writeFile(fileName, [">header1_1 description1\n"])
            obsLBioseq = BioseqUtils.extractBioseqListFromFastaFile(fileName)
            self.assertEqual(expLBioseq, obsLBioseq)
        finally:
            _removeFiles(fileName)

    def test_extractBioseqListFromFastaFile_empty_file(self):
        """An empty file gives an empty list."""
        fileName = "dummyFastaFile.fa"
        expLBioseq = []
        try:
            _writeFile(fileName, [])
            obsLBioseq = BioseqUtils.extractBioseqListFromFastaFile(fileName)
            self.assertEqual(expLBioseq, obsLBioseq)
        finally:
            _removeFiles(fileName)

    def test_getSeqLengthWithSeqName(self):
        """The length of the first item is found from its full header."""
        lBioseq = [_makeBioseq("header1 description", _PROTEIN_SEQ),
                   _makeBioseq("header2", _SHORT_DNA_SEQ)]
        self._checkSeqLength(lBioseq, "header1 description", 31)

    def test_getSeqLengthWithSeqName_second_item(self):
        """The length of the second item is found from its header."""
        lBioseq = [_makeBioseq("header1 description", _PROTEIN_SEQ),
                   _makeBioseq("header2", _SHORT_DNA_SEQ)]
        self._checkSeqLength(lBioseq, "header2", 44)

    def test_getSeqLengthWithSeqName_empty_list(self):
        """An empty list gives a length of 0."""
        self._checkSeqLength([], "header2", 0)

    def test_getSeqLengthWithSeqName_empty_sequence(self):
        """An item with an empty sequence gives a length of 0."""
        lBioseq = [_makeBioseq("header1 description", _PROTEIN_SEQ),
                   _makeBioseq("header2", "")]
        self._checkSeqLength(lBioseq, "header2", 0)

    def test_getSeqLengthWithSeqName_sequence_unknown(self):
        """A name matching no item gives a length of 0."""
        lBioseq = [_makeBioseq("header1 description", _PROTEIN_SEQ),
                   _makeBioseq("header2", _SHORT_DNA_SEQ)]
        self._checkSeqLength(lBioseq, "header3", 0)

    def test_getLengthPerSeqFromFile(self):
        """The result maps each sequence name to its length."""
        inFile = "dummyInFile"
        dExp = {"seq1": 13, "seq2": 19}
        try:
            _writeFile(inFile, [">seq1\nAGCGATGCAGCTA\n",
                                ">seq2\nGCGATGCGCATCGACGCGA\n"])
            dObs = BioseqUtils.getLengthPerSeqFromFile(inFile)
            self.assertEqual(dExp, dObs)
        finally:
            _removeFiles(inFile)

    def test_getBioseqListSortedByDecreasingLength(self):
        """The longest sequence comes first."""
        lBioseqs = [Bioseq("TE2", "ACC"),
                    Bioseq("TE3", "TA"),
                    Bioseq("TE1", "AGCG")]
        lExp = [Bioseq("TE1", "AGCG"),
                Bioseq("TE2", "ACC"),
                Bioseq("TE3", "TA")]
        lObs = BioseqUtils.getBioseqListSortedByDecreasingLength(lBioseqs)
        self.assertEqual(lExp, lObs)

    def test_getBioseqListSortedByDecreasingLengthWithoutGaps(self):
        """Sequences of equal raw length are ordered by length without '-'."""
        lBioseqs = [Bioseq("TE2", "-ACC-"),
                    Bioseq("TE3", "TA---"),
                    Bioseq("TE1", "-AGCG")]
        lExp = [Bioseq("TE1", "-AGCG"),
                Bioseq("TE2", "-ACC-"),
                Bioseq("TE3", "TA---")]
        lObs = BioseqUtils.getBioseqListSortedByDecreasingLengthWithoutGaps(
            lBioseqs)
        self.assertEqual(lExp, lObs)


# unittest.makeSuite() is deprecated; the TestLoader API is the supported
# equivalent and builds the same suite.
test_suite = unittest.TestSuite()
test_suite.addTest(
    unittest.TestLoader().loadTestsFromTestCase(Test_BioseqUtils))
if __name__ == "__main__":
    unittest.TextTestRunner(verbosity=2).run(test_suite)