Mercurial > repos > yufei-luo > s_mart
diff smart_toolShed/commons/core/seq/test/Test_BioseqDB.py @ 0:e0f8dcca02ed
Uploaded the S-MART tool, a toolbox that manages RNA-Seq and ChIP-Seq data.
author | yufei-luo |
---|---|
date | Thu, 17 Jan 2013 10:52:14 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/commons/core/seq/test/Test_BioseqDB.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,974 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+ + +import unittest +import os +import time +from commons.core.seq.BioseqDB import BioseqDB +from commons.core.seq.Bioseq import Bioseq +from commons.core.utils.FileUtils import FileUtils +from commons.core.coord.Map import Map + + +class Test_BioseqDB( unittest.TestCase ): + + def setUp( self ): + self._uniqId = "%s_%s" % ( time.strftime("%Y%m%d%H%M%S") , os.getpid() ) + + + def tearDown( self ): + if os._exists("dummyBioseqDB.fa"): + os.remove("dummyBioseqDB.fa") + + + def test__eq__(self): + iBioseq1 = Bioseq( "seq1", "AGCGGACGATGCAGCATGCGAATGACGAT" ) + iBioseq2 = Bioseq( "seq2", "GCGATGCGATCGATGCGATAGCA" ) + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + + iBioseq3 = Bioseq( "seq1", "AGCGGACGATGCAGCATGCGAATGACGAT" ) + iBioseq4 = Bioseq( "seq2", "GCGATGCGATCGATGCGATAGCA" ) + obsBioseqDB = BioseqDB() + obsBioseqDB.setData( [ iBioseq3, iBioseq4 ] ) + + self.assertEquals( expBioseqDB, obsBioseqDB ) + + + def test__eq__instances_with_different_header(self): + iBioseq1 = Bioseq( "seq1", "AGCGGACGATGCAGCATGCGAATGACGAT" ) + iBioseq2 = Bioseq( "seq2", "GCGATGCGATCGATGCGATAGCA" ) + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + + iBioseq3 = Bioseq( "seq3", "AGCGGACGATGCAGCATGCGAATGACGAT" ) + iBioseq4 = Bioseq( "seq4", "GCGATGCGATCGATGCGATAGCA" ) + obsBioseqDB = BioseqDB() + obsBioseqDB.setData( [ iBioseq3, iBioseq4 ] ) + + self.assertNotEquals( expBioseqDB, obsBioseqDB ) + + + def test__eq__instances_with_different_sequences(self): + iBioseq1 = Bioseq( "seq1", "AGCGGACGATGCAGCATGCGAATGACGAT" ) + iBioseq2 = Bioseq( "seq2", "GCGATGCGATCGATGCGATAGCA" ) + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + + iBioseq3 = Bioseq( "seq1", "AGCGGACGATGCAGCATGCGAATGACGAT" ) + iBioseq4 = Bioseq( "seq2", "GCGATGCGATCGATGCGATAGCATATATATATATATATATATATAT" ) + obsBioseqDB = BioseqDB() + obsBioseqDB.setData( [ iBioseq3, iBioseq4 ] ) + + self.assertNotEquals( expBioseqDB, obsBioseqDB ) + + + def 
test__eq__instances_with_different_sequences_and_headers(self): + iBioseq1 = Bioseq( "seq1", "AGCGGACGATGCAGCATGCGAATGACGAT" ) + iBioseq2 = Bioseq( "seq2", "GCGATGCGATCGATGCGATAGCA" ) + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + + iBioseq3 = Bioseq( "seq3", "AGCGGACGATGCAGCATGCGAATGACGAT" ) + iBioseq4 = Bioseq( "seq4", "GCGATGCGATCGATGCGATAGCATATATATATATATATATATATAT" ) + obsBioseqDB = BioseqDB() + obsBioseqDB.setData( [ iBioseq3, iBioseq4 ] ) + + self.assertNotEquals( expBioseqDB, obsBioseqDB ) + + + def test__eq__instances_with_different_sizeOfBioseq(self): + iBioseq1 = Bioseq( "seq1", "AGCGGACGATGCAGCATGCGAATGACGAT" ) + iBioseq2 = Bioseq( "seq2", "GCGATGCGATCGATGCGATAGCA" ) + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + + iBioseq3 = Bioseq( "seq3", "AGCGGACGATGCAGCATGCGAATGACGAT" ) + obsBioseqDB = BioseqDB() + obsBioseqDB.setData( [ iBioseq3 ] ) + + self.assertNotEquals( expBioseqDB, obsBioseqDB ) + + + def test_setName (self): + expName = "myDataBank" + iBioseqDB = BioseqDB() + self.assertEquals (iBioseqDB.name, "") + + iBioseqDB.setName (expName) + obsName = iBioseqDB.name + self.assertEquals (expName, obsName) + + + def test_read(self): + iBioseq1 = Bioseq("consensus1","GAGATGGCTCATGGAGTACCTGCCT") + iBioseq2 = Bioseq("consensus2","GAGATGGCTCATGGAGTACCGC") + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + + faFN = "dummyFaFile.fa" + faF = open( faFN, "w" ) + faF.write(">consensus1\n") + faF.write("GAGATGGCTCATGGAGTACCTGCCT\n") + faF.write(">consensus2\n") + faF.write("GAGATGGCTCATGGAGTACCGC\n") + faF.close() + + faF = open( faFN, "r" ) + obsBioseqDB = BioseqDB() + obsBioseqDB.read( faF ) + faF.close() + os.remove( faFN ) + self.assertEquals( expBioseqDB, obsBioseqDB ) + + + def test_write(self): + iBioseq1 = Bioseq("consensus1","GAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq2 = 
Bioseq("consensus2","GAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + + expFaFileName = "dummyFaFile.fa" + expFaFile = open( expFaFileName, "w" ) + expFaFile.write(">consensus1\n") + expFaFile.write("GAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACCTGCCTGAGATGGCTC\n") + expFaFile.write("ATGGAGTACCTGCCT\n") + expFaFile.write(">consensus2\n") + expFaFile.write("GAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAG\n") + expFaFile.write("TACCGCGAGATGGCTCATGGAGTACCGC\n") + expFaFile.close() + + obsFaFileName = "obsDummyFastaFile.fa" + obsFaFile = open( obsFaFileName, "w" ) + iBioseqDB.write( obsFaFile ) + obsFaFile.close() + + self.assertTrue( FileUtils.are2FilesIdentical(expFaFileName, obsFaFileName) ) + os.remove( expFaFileName ) + os.remove( obsFaFileName ) + + + def test_save(self): + iBioseq1 = Bioseq("consensus1","GAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq2 = Bioseq("consensus2","GAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + + expFaFileName = "dummyFaFile.fa" + expFaFile = open( expFaFileName, "w" ) + expFaFile.write(">consensus1\n") + expFaFile.write("GAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACCTGCCTGAGATGGCTC\n") + expFaFile.write("ATGGAGTACCTGCCT\n") + expFaFile.write(">consensus2\n") + expFaFile.write("GAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAG\n") + expFaFile.write("TACCGCGAGATGGCTCATGGAGTACCGC\n") + expFaFile.close() + + obsFaFileName = "obsDummyFastaFile.fa" + iBioseqDB.save( obsFaFileName ) + + self.assertTrue( FileUtils.are2FilesIdentical(expFaFileName, obsFaFileName) ) + os.remove( expFaFileName ) + os.remove( obsFaFileName ) + + + def test_load(self): + iBioseq1 = 
Bioseq("consensus1","GAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq2 = Bioseq("consensus2","GAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + + FaFileName = "dummyFaFile.fa" + FaFile = open( FaFileName, "w" ) + FaFile.write(">consensus1\n") + FaFile.write("GAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACCTGCCTGAGATGGCTC\n") + FaFile.write("ATGGAGTACCTGCCT\n") + FaFile.write(">consensus2\n") + FaFile.write("GAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAG\n") + FaFile.write("TACCGCGAGATGGCTCATGGAGTACCGC\n") + FaFile.close() + + obsBioseqDB = BioseqDB() + obsBioseqDB.load( FaFileName ) + + self.assertEquals( expBioseqDB, obsBioseqDB ) + os.remove( FaFileName ) + + + def test_reverse( self ): + iBioseq1 = Bioseq( "seq1", "ATTG" ) + iBioseq2 = Bioseq( "seq2", "CGAAT" ) + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + + iBioseq3 = Bioseq( "seq1", "GTTA" ) + iBioseq4 = Bioseq( "seq2", "TAAGC" ) + obsBioseqDB = BioseqDB() + obsBioseqDB.setData( [ iBioseq3, iBioseq4 ] ) + obsBioseqDB.reverse() + self.assertEquals( expBioseqDB, obsBioseqDB ) + + + def test_complement( self ): + iBioseq1 = Bioseq( "seq1", "ATTG" ) + iBioseq2 = Bioseq( "seq2", "CGAAT" ) + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + + iBioseq3 = Bioseq( "seq1", "TAAC" ) + iBioseq4 = Bioseq( "seq2", "GCTTA" ) + obsBioseqDB = BioseqDB() + obsBioseqDB.setData( [ iBioseq3, iBioseq4 ] ) + + obsBioseqDB.complement() + self.assertEquals( expBioseqDB, obsBioseqDB ) + + + def test_reverseComplement( self ): + iBioseq1 = Bioseq( "seq1", "ATTG" ) + iBioseq2 = Bioseq( "seq2", "CGAAT" ) + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + + iBioseq3 = Bioseq( "seq1", "CAAT" ) + iBioseq4 = Bioseq( "seq2", "ATTCG" ) + obsBioseqDB = BioseqDB() + obsBioseqDB.setData( [ iBioseq3, 
iBioseq4 ] ) + + obsBioseqDB.reverseComplement() + self.assertEquals( expBioseqDB, obsBioseqDB ) + + + def test_setData(self): + iBioseq1 = Bioseq( "seq1", "ATTG" ) + iBioseq2 = Bioseq( "seq2", "CGAAT" ) + iBioseq3 = Bioseq( "seq3", "CAAT" ) + iBioseq4 = Bioseq( "seq4", "ATTCG" ) + + lBioseq = [iBioseq1, iBioseq2, iBioseq3, iBioseq4] + expBioseqDB = BioseqDB() + expBioseqDB.db = lBioseq + + iBioseq5 = Bioseq( "seq1", "ATTG" ) + iBioseq6 = Bioseq( "seq2", "CGAAT" ) + iBioseq7 = Bioseq( "seq3", "CAAT" ) + iBioseq8 = Bioseq( "seq4", "ATTCG" ) + + lBioseq2 = [iBioseq5, iBioseq6, iBioseq7, iBioseq8] + obsBioseqDB = BioseqDB() + obsBioseqDB.setData(lBioseq2) + + self.assertEquals(expBioseqDB, obsBioseqDB) + + + def test_reset( self ): + iBioseq1 = Bioseq( "seq1", "ATTG" ) + iBioseq2 = Bioseq( "seq2", "CGAAT" ) + iBioseq3 = Bioseq( "seq3", "CAAT" ) + iBioseq4 = Bioseq( "seq4", "ATTCG" ) + + lBioseq = [iBioseq1, iBioseq2, iBioseq3, iBioseq4] + obsBioseqDB = BioseqDB() + obsBioseqDB.setData(lBioseq) + obsBioseqDB.reset() + + expBioseqDB = BioseqDB() + + self.assertEquals(expBioseqDB, obsBioseqDB) + + + def testCleanGap(self): + iBioseq1 = Bioseq( "seq1", "ATTG" ) + iBioseq2 = Bioseq( "seq2", "CGAAT" ) + expBioseqDB = BioseqDB() + expBioseqDB.setData([iBioseq1, iBioseq2]) + + iBioseq3 = Bioseq( "seq1", "AT-----TG" ) + iBioseq4 = Bioseq( "seq2", "CGAA----T" ) + + obsBioseqDB = BioseqDB() + obsBioseqDB.setData( [ iBioseq3, iBioseq4 ] ) + obsBioseqDB.cleanGap() + + self.assertEquals(expBioseqDB, obsBioseqDB) + + + def testCleanGap_on_empty_db(self): + expBioseqDB = BioseqDB() + + obsBioseqDB = BioseqDB() + obsBioseqDB.cleanGap() + + self.assertEquals(expBioseqDB, obsBioseqDB) + + + def testCleanGap_on_size_one_db(self): + iBioseq1 = Bioseq( "seq1", "ATTG" ) + expBioseqDB = BioseqDB() + expBioseqDB.setData([iBioseq1]) + + iBioseq2 = Bioseq( "seq1", "AT-----TG" ) + obsBioseqDB = BioseqDB() + obsBioseqDB.setData([iBioseq2]) + + obsBioseqDB.cleanGap() + + 
self.assertEquals(expBioseqDB, obsBioseqDB) + + + def test_add_to_a_empty_bioseqDB_instance (self): + sHeader = "embl::AF332402:AF332402 Arabidopsis thaliana clone C00024 (f)" + sHeader += "(At4g29080) mRNA, complete cds." + + expDictIdx = { sHeader : 0} + + sHeaderRenamed = "embl-AF332402-AF332402_Arabidopsis_thaliana_clone_C00024_(f)" + sHeaderRenamed += "(At4g29080)_mRNA-_complete_cds." + expDictIdxRenamed = {sHeaderRenamed : 0} + + iBioseq1 = Bioseq( sHeader, "ATTG" ) + obsBioseqDB = BioseqDB() + obsBioseqDB.add(iBioseq1) + + obsDictIdx = obsBioseqDB.idx + obsDictIdxRenamed = obsBioseqDB.idx_renamed + + self.assertEquals(expDictIdx,obsDictIdx) + self.assertEquals(expDictIdxRenamed,obsDictIdxRenamed) + + + def test_add_to_a_size_one_bioseqDB_instance (self): + sHeader1 = "embl::AF332402:AF332402 Arabidopsis thaliana clone C00024 (f)" + sHeader1 += "(At4g29080) mRNA, complete cds." + + sHeader2 = "embl::AF332503:AF332402 Arabidopsis thaliana clone C00024 (f)" + sHeader2 += "(At4g29080) mRNA, complete cds." + + expDictIdx = { sHeader1 : 0, sHeader2 : 1} + + sHeaderRenamed1 = "embl-AF332402-AF332402_Arabidopsis_thaliana_clone_C00024_(f)" + sHeaderRenamed1 += "(At4g29080)_mRNA-_complete_cds." + + sHeaderRenamed2 = "embl-AF332503-AF332402_Arabidopsis_thaliana_clone_C00024_(f)" + sHeaderRenamed2 += "(At4g29080)_mRNA-_complete_cds." + + expDictIdxRenamed = {sHeaderRenamed1 : 0, sHeaderRenamed2 : 1} + + iBioseq1 = Bioseq( sHeader1, "ATTG" ) + iBioseq2 = Bioseq( sHeader2, "ATTG" ) + + obsBioseqDB = BioseqDB() + obsBioseqDB.setData([ iBioseq1]) + obsBioseqDB.add(iBioseq2) + + obsDictIdx = obsBioseqDB.idx + obsDictIdxRenamed = obsBioseqDB.idx_renamed + + self.assertEquals(expDictIdx,obsDictIdx) + self.assertEquals(expDictIdxRenamed,obsDictIdxRenamed) + + + def test_add_to_a_size_two_bioseqDB_instance (self): + sHeader1 = "embl::AF332402:AF332402 Arabidopsis thaliana clone C00024 (f)" + sHeader1 += "(At4g29080) mRNA, complete cds." 
+ + sHeader2 = "embl::AF332503:AF332402 Arabidopsis thaliana clone C00024 (f)" + sHeader2 += "(At4g29080) mRNA, complete cds." + + sHeader3 = "embl::AF332604:AF332402 Arabidopsis thaliana clone C00024 (f)" + sHeader3 += "(At4g29080) mRNA, complete cds." + expDictIdx = { sHeader1 : 0, sHeader2 : 1, sHeader3 : 2} + + sHeaderRenamed1 = "embl-AF332402-AF332402_Arabidopsis_thaliana_clone_C00024_(f)" + sHeaderRenamed1 += "(At4g29080)_mRNA-_complete_cds." + + sHeaderRenamed2 = "embl-AF332503-AF332402_Arabidopsis_thaliana_clone_C00024_(f)" + sHeaderRenamed2 += "(At4g29080)_mRNA-_complete_cds." + + sHeaderRenamed3 = "embl-AF332604-AF332402_Arabidopsis_thaliana_clone_C00024_(f)" + sHeaderRenamed3 += "(At4g29080)_mRNA-_complete_cds." + expDictIdxRenamed = {sHeaderRenamed1 : 0, sHeaderRenamed2 : 1, sHeaderRenamed3 :2} + + iBioseq1 = Bioseq( sHeader1, "ATTG" ) + iBioseq2 = Bioseq( sHeader2, "ATTG" ) + iBioseq3 = Bioseq( sHeader3, "ATTG" ) + + obsBioseqDB = BioseqDB() + obsBioseqDB.setData([ iBioseq1, iBioseq2 ]) + obsBioseqDB.add(iBioseq3) + + obsDictIdx = obsBioseqDB.idx + obsDictIdxRenamed = obsBioseqDB.idx_renamed + + self.assertEquals(expDictIdx,obsDictIdx) + self.assertEquals(expDictIdxRenamed,obsDictIdxRenamed) + + + def test__getitem__(self): + iBioseq1 = Bioseq("seq1","ATTG") + iBioseq2 = Bioseq("seq2","CGAAT") + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + expBioseq = Bioseq("seq2","CGAAT") + obsBioseq = iBioseqDB[1] + + self.assertEquals(expBioseq, obsBioseq) + + + def test_getSize(self): + expSize = 4 + + iBioseq1 = Bioseq( "seq1", "ATTG" ) + iBioseq2 = Bioseq( "seq2", "CGAAT" ) + iBioseq3 = Bioseq( "seq3", "AT-----TG" ) + iBioseq4 = Bioseq( "seq4", "CGAA----T" ) + + obsBioseqDB = BioseqDB() + obsBioseqDB.setData( [iBioseq1, iBioseq2 , iBioseq3, iBioseq4 ] ) + obsSize = obsBioseqDB.getSize() + + self.assertEquals(expSize,obsSize) + + + def test_getSize_emptyDB(self): + expSize = 0 + + obsBioseqDB = BioseqDB() + obsSize = 
obsBioseqDB.getSize() + + self.assertEquals(expSize,obsSize) + + + def test_getLength(self): + iBioseq1 = Bioseq("consensus1","GAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq2 = Bioseq("consensus2","GAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + + expLength = 163 + obsLength = iBioseqDB.getLength() + + self.assertEquals( expLength, obsLength) + + def test_getListOfSequencesLength(self): + iBioseq1 = Bioseq("consensus1","GAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq2 = Bioseq("consensus2","GAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + lLength = iBioseqDB.getListOfSequencesLength() + + expLLengh = [75, 88] + self.assertEquals( expLLengh, lLength ) + + + def test_getHeaderList( self ): + lExpHeader = ["seq1", "seq2"] + + iBioseq1 = Bioseq( "seq1", "ATTG" ) + iBioseq2 = Bioseq( "seq2", "CGAAT" ) + + obsBioseqDB = BioseqDB() + obsBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + + lObsHeader = obsBioseqDB.getHeaderList() + + self.assertEquals( lExpHeader, lObsHeader ) + + + def test_getSequencesList( self ): + lExpSeqs = ["ATGC", "AATTCCGG"] + + iBioseq1 = Bioseq("seq1", "ATGC") + iBioseq2 = Bioseq("seq2", "AATTCCGG") + + obsBioseqDB = BioseqDB() + obsBioseqDB.setData([iBioseq1, iBioseq2]) + + lObsSeqs = obsBioseqDB.getSequencesList() + + self.assertEquals(lExpSeqs, lObsSeqs) + + + def test_fetch( self ): + ibioseq1 = Bioseq( "seq1", "ATTG" ) + ibioseq2 = Bioseq( "seq2", "CGAAT" ) + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ ibioseq1, ibioseq2 ] ) + expBioseq = ibioseq1 + obsBioseq = iBioseqDB.fetch( "seq1" ) + self.assertEquals( expBioseq, obsBioseq ) + + + def test_getBioseqByRenamedHeader( self ): + Header1 = "embl::AF332402:AF332402 Arabidopsis thaliana 
clone C00024 (f)" + Header1 += "(At4g29080) mRNA, complete cds." + + Header2 = "embl::AF332503:AF332402 Arabidopsis thaliana clone C00024 (f)" + Header2 += "(At4g29080) mRNA, complete cds." + + Header3 = "embl::AF332604:AF332402 Arabidopsis thaliana clone C00024 (f)" + Header3 += "(At4g29080) mRNA, complete cds." + + HeaderRenamed2 = "embl-AF332503-AF332402_Arabidopsis_thaliana_clone_C00024_(f)" + HeaderRenamed2 += "(At4g29080)_mRNA-_complete_cds." + + ibioseq1 = Bioseq( Header1, "ATTG" ) + ibioseq2 = Bioseq( Header2, "CGAAT" ) + ibioseq3 = Bioseq( Header3, "TGCGAAT" ) + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ ibioseq1, ibioseq2, ibioseq3 ] ) + expBioseq = ibioseq2 + + obsBioseq = iBioseqDB.getBioseqByRenamedHeader( HeaderRenamed2 ) + + self.assertEquals( expBioseq, obsBioseq ) + + + def test_init_with_the_parm_name( self ): + iBioseq1 = Bioseq("seq1","ATTG") + iBioseq2 = Bioseq("seq2","CGAAT") + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + fastaFilename = "dummyBioseqDB.fa" + f = open(fastaFilename, "w") + f.write(">seq1\n") + f.write("ATTG\n") + f.write(">seq2\n") + f.write("CGAAT\n") + f.close() + + obsBioseqDB = BioseqDB(fastaFilename) + os.remove(fastaFilename) + self.assertEquals( expBioseqDB, obsBioseqDB ) + + + def test_countNt(self): + iBioseq1 = Bioseq() + iBioseq1.header = "seq1 description1" + iBioseq1.sequence = "GCGNCGCTGCTTTATTAAGCGCTAGCATGCGNCGCTGCTTTATTAAGCGCTAGCGATTATATAGCAGACGCATATTATATTGCGCGATGCGNCGCTGCTTTATTAAGCGCTAGCGATTATATAGCAGACGCATATTATATTGCGCG" + iBioseq2 = Bioseq() + iBioseq2.header = "seq2 description2" + iBioseq2.sequence = "GCGNCGCTGCTTTATTAAGCGCTAGCATGCGNCGCTGCTTTATTAAGCGCTAGCGATTATATAGCAGACGCATATTATATTGCGCGATGCGNCGCTGCTTTATTAAGCGCTAGCGATTATATAGCAGACGCATATTATATTGCGCG" + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + expCount = 6 + obsCount = iBioseqDB.countNt('N') + self.assertEquals(expCount, obsCount) + + def test_countNt_lowercase(self): + iBioseq1 = Bioseq() + 
iBioseq1.header = "seq1 description1" + iBioseq1.sequence = "gcgncgctgctttattaagcgctagcatgcgncgctgctttattaagcgctagcgattatatagcagacgcatattatattgcgcgatgcgncgctgctttattaagcgctagcgattatatagcagacgcatattatattgcgcg" + iBioseq2 = Bioseq() + iBioseq2.header = "seq2 description2" + iBioseq2.sequence = "gcgncgctgctttattaagcgctagcatgcgncgctgctttattaagcgctagcgattatatagcagacgcatattatattgcgcgatgcgncgctgctttattaagcgctagcgattatatagcagacgcatattatattgcgcg" + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + expCount = 0 + obsCount = iBioseqDB.countNt('N') + self.assertEquals(expCount, obsCount) + + + def test_countNt_withCharacterNotExisting(self): + iBioseq1 = Bioseq() + iBioseq1.header = "seq1 description1" + iBioseq1.sequence = "GCGNCGCTGCTTTATTAAGCGCTAGCATGCGNCGCTGCTTTATTAAGCGCTAGCGATTATATAGCAGACGCATATTATATTGCGCGATGCGNCGCTGCTTTATTAAGCGCTAGCGATTATATAGCAGACGCATATTATATTGCGCG" + iBioseq2 = Bioseq() + iBioseq2.header = "seq2 description2" + iBioseq2.sequence = "GCGNCGCTGCTTTATTAAGCGCTAGCATGCGNCGCTGCTTTATTAAGCGCTAGCGATTATATAGCAGACGCATATTATATTGCGCGATGCGNCGCTGCTTTATTAAGCGCTAGCGATTATATAGCAGACGCATATTATATTGCGCG" + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + expCount = 0 + obsCount = iBioseqDB.countNt('W') + self.assertEquals(expCount, obsCount) + + + def test_countAllNt(self): + iBioseq1 = Bioseq() + iBioseq1.header = "seq1 description1" + iBioseq1.sequence = "GCGNCGCTGCTTTATTAAGCGCTAGCATGCGNCGCTGCTTTATTAAGCGCTAGCGATTATATAGCAGACGCATATTATATTGCGCGATGCGNCGCTGCTTTATTAAGCGCTAGCGATTATATAGCAGACGCATATTATATTGCGCG" + iBioseq2 = Bioseq() + iBioseq2.header = "seq2 description2" + iBioseq2.sequence = "GCGNCGCTGCTTTATTAAGCGCTAGCATGCGNCGCTGCTTTATTAAGCGCTAGCGATTATATAGCAGACGCATATTATATTGCGCGATGCGNCGCTGCTTTATTAAGCGCTAGCGATTATATAGCAGACGCATATTATATTGCGCG" + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + + dExpCount = {'A': 68, 'C': 62, 'T': 86, 'G': 70, 'N': 6} + + dObsCount = iBioseqDB.countAllNt() + self.assertEquals(dExpCount, dObsCount) 
+ + + def test_extractPart(self): + iBioseq1 = Bioseq("consensus1","GAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq2 = Bioseq("consensus2","GAGATGGCTCATGGAGTACCGCGAGACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq3 = Bioseq("consensus3","GAGATGGCTCATGGAGTACCTGCCTTGCATGACTGCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq4 = Bioseq("consensus4","GAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ iBioseq1, iBioseq2, iBioseq3, iBioseq4 ] ) + + iBioseq5 = Bioseq("consensus2","GAGATGGCTCATGGAGTACCGCGAGACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq6 = Bioseq("consensus3","GAGATGGCTCATGGAGTACCTGCCTTGCATGACTGCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACCTGCCT") + + expSubBioseqDB = BioseqDB() + expSubBioseqDB.setData( [ iBioseq5, iBioseq6 ] ) + + obsSubBioseqDB = iBioseqDB.extractPart (1, 2) + + self.assertEquals(expSubBioseqDB, obsSubBioseqDB) + + + def test_bestLength(self): + iBioseq1 = Bioseq("consensus1","GAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACC") + iBioseq2 = Bioseq("consensus2","GAGATGGCTCATGGAGTACCGCGAGACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq3 = Bioseq("consensus3","GAGATGGCTCATGGAGTACC") + iBioseq4 = Bioseq("consensus4","GAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq5 = Bioseq("consensus5","TGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq6 = Bioseq("consensus6","TGCCTTGCATGACTGCATGGAGTACCTGCCTG") + iBioseq7 = Bioseq("consensus7","TGCCTGATGGCTCATGGAGTACCTGCCT") + + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ iBioseq1, iBioseq2, iBioseq3, iBioseq4, iBioseq5, iBioseq6 , iBioseq7] ) + + iBioseq8 = Bioseq("consensus1","GAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACC") + iBioseq9 = Bioseq("consensus2","GAGATGGCTCATGGAGTACCGCGAGACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq10 = 
Bioseq("consensus4","GAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq11 = Bioseq("consensus6","TGCCTTGCATGACTGCATGGAGTACCTGCCTG") + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq8, iBioseq9, iBioseq10, iBioseq11] ) + + obsBioseqDB = iBioseqDB.bestLength (4) + + self.assertEquals(expBioseqDB, obsBioseqDB) + + + def test_bestLength_with_a_none_sequence_include(self): + iBioseq1 = Bioseq("consensus1", None) + iBioseq2 = Bioseq("consensus2","GAGATGGCTCATGGAGTACCGCGAGACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq3 = Bioseq("consensus3","GAGATGGCTCATGGAGTACC") + + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ iBioseq1, iBioseq2, iBioseq3] ) + + iBioseq4 = Bioseq("consensus1", None) + iBioseq5 = Bioseq("consensus2","GAGATGGCTCATGGAGTACCGCGAGACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq6 = Bioseq("consensus3","GAGATGGCTCATGGAGTACC") + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq4, iBioseq5, iBioseq6] ) + + obsBioseqDB = iBioseqDB.bestLength (3) + + self.assertEquals(expBioseqDB, obsBioseqDB) + + + def test_bestLength_with_a_none_sequence_not_include(self): + iBioseq1 = Bioseq("consensus1", None) + iBioseq2 = Bioseq("consensus2","GAGATGGCTCATGGAGTACCGCGAGACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq3 = Bioseq("consensus3","GAGATGGCTCATGGAGTACC") + + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ iBioseq1, iBioseq2, iBioseq3] ) + + iBioseq5 = Bioseq("consensus2","GAGATGGCTCATGGAGTACCGCGAGACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq6 = Bioseq("consensus3","GAGATGGCTCATGGAGTACC") + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq5, iBioseq6] ) + + obsBioseqDB = iBioseqDB.bestLength (2) + + self.assertEquals(expBioseqDB, obsBioseqDB) + + + def test_bestLength_number_of_bioseq_requiered_gt_BioseqDB_size(self): + iBioseq1 = Bioseq("consensus1","GAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACC") + iBioseq2 = 
Bioseq("consensus2","GAGATGGCTCATGGAGTACCGCGAGACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq3 = Bioseq("consensus3","GAGATGGCTCATGGAGTACC") + + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ iBioseq1, iBioseq2, iBioseq3] ) + + iBioseq4 = Bioseq("consensus1","GAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACC") + iBioseq5 = Bioseq("consensus2","GAGATGGCTCATGGAGTACCGCGAGACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq6 = Bioseq("consensus3","GAGATGGCTCATGGAGTACC") + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq4, iBioseq5, iBioseq6] ) + + obsBioseqDB = iBioseqDB.bestLength (15) + + self.assertEquals(expBioseqDB, obsBioseqDB) + + + def test_extractPatternOfFile(self): + fastaFilename = "dummyBioseqDB.fa" + f = open(fastaFilename, "w") + f.write(">consensus1\nGAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACC\n") + f.write(">consensus2\nGAGATGGCTCATGGAGTACCGCGAGACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC\n") + f.write(">consensus3\nGAGATGGCTCATGGAGTACC\n") + f.write(">consensus4\nGAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC\n") + f.write(">consensus11\nTGCCTGAGATGGCTCATGGAGTACCTGCCTTGCCTTGCATGACTGCATGGAGTACCTGCCTGTGCCTGATGGCTCATGGAGTACCTGCCT\n") + f.close() + + iBioseq1 = Bioseq("consensus1","GAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACC") + iBioseq2 = Bioseq("consensus11","TGCCTGAGATGGCTCATGGAGTACCTGCCTTGCCTTGCATGACTGCATGGAGTACCTGCCTGTGCCTGATGGCTCATGGAGTACCTGCCT") + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq1, iBioseq2] ) + + obsBioseqDB = BioseqDB() + obsBioseqDB.extractPatternOfFile("consensus1+" , fastaFilename) + os.remove(fastaFilename) + self.assertEquals(expBioseqDB, obsBioseqDB) + + + def test_extractPatternOfFile_WithNoExistingPattern(self): + fastaFilename = "dummyBioseqDB.fa" + f = open(fastaFilename, "w") + f.write(">consensus1\nGAGATGGCTCATGGAGTACCTGCCTGAGATGGCTCATGGAGTACC\n") + 
f.write(">consensus2\nGAGATGGCTCATGGAGTACCGCGAGACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC\n") + f.write(">consensus3\nGAGATGGCTCATGGAGTACC\n") + f.write(">consensus4\nGAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC\n") + f.write(">consensus11\nTGCCTGAGATGGCTCATGGAGTACCTGCCTTGCCTTGCATGACTGCATGGAGTACCTGCCTGTGCCTGATGGCTCATGGAGTACCTGCCT\n") + f.close() + + expBioseqDB = BioseqDB() + + obsBioseqDB = BioseqDB() + obsBioseqDB.extractPatternOfFile("NoExistingPattern" , fastaFilename) + os.remove(fastaFilename) + self.assertEquals(expBioseqDB, obsBioseqDB) + + + def test_getByPattern (self): + iBioseq1 = Bioseq("consensus4","GAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq2 = Bioseq("consensus1","TGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq3 = Bioseq("consensus6","TGCCTTGCATGACTGCATGGAGTACCTGCCTG") + iBioseq4 = Bioseq("consensus11","TGCCTGATGGCTCATGGAGTACCTGCCT") + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ iBioseq1, iBioseq2, iBioseq3, iBioseq4] ) + + iBioseq5 = Bioseq("consensus1","TGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq6 = Bioseq("consensus11","TGCCTGATGGCTCATGGAGTACCTGCCT") + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq5, iBioseq6] ) + + obsBioseqDB = iBioseqDB.getByPattern("consensus1+") + self.assertEquals(expBioseqDB, obsBioseqDB) + + + def test_getByPattern_with_no_existing_pattern (self): + iBioseq1 = Bioseq("consensus4","GAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq2 = Bioseq("consensus1","TGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq3 = Bioseq("consensus6","TGCCTTGCATGACTGCATGGAGTACCTGCCTG") + iBioseq4 = Bioseq("consensus11","TGCCTGATGGCTCATGGAGTACCTGCCT") + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ iBioseq1, iBioseq2, iBioseq3, iBioseq4] ) + + expBioseqDB = BioseqDB() + + obsBioseqDB = iBioseqDB.getByPattern("noExistingPattern+") + 
self.assertEquals(expBioseqDB, obsBioseqDB) + + + def test_getDiffFromPattern (self): + iBioseq1 = Bioseq("consensus4","GAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq2 = Bioseq("consensus1","TGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq3 = Bioseq("consensus6","TGCCTTGCATGACTGCATGGAGTACCTGCCTG") + iBioseq4 = Bioseq("consensus11","TGCCTGATGGCTCATGGAGTACCTGCCT") + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ iBioseq1, iBioseq2, iBioseq3, iBioseq4] ) + + iBioseq5 = Bioseq("consensus1","TGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq6 = Bioseq("consensus11","TGCCTGATGGCTCATGGAGTACCTGCCT") + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq5, iBioseq6] ) + + obsBioseqDB = iBioseqDB.getDiffFromPattern("consensus[4|6]") + + self.assertEquals(expBioseqDB, obsBioseqDB) + + + def test_getDiffFromPattern_with_no_existing_pattern (self): + iBioseq1 = Bioseq("consensus4","GAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq2 = Bioseq("consensus1","TGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq3 = Bioseq("consensus6","TGCCTTGCATGACTGCATGGAGTACCTGCCTG") + iBioseq4 = Bioseq("consensus11","TGCCTGATGGCTCATGGAGTACCTGCCT") + iBioseqDB = BioseqDB() + iBioseqDB.setData( [ iBioseq1, iBioseq2, iBioseq3, iBioseq4] ) + + iBioseq5 = Bioseq("consensus4","GAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq6 = Bioseq("consensus1","TGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq7 = Bioseq("consensus6","TGCCTTGCATGACTGCATGGAGTACCTGCCTG") + iBioseq8 = Bioseq("consensus11","TGCCTGATGGCTCATGGAGTACCTGCCT") + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq5, iBioseq6, iBioseq7, iBioseq8] ) + + obsBioseqDB = iBioseqDB.getDiffFromPattern("noExistingPattern+") + self.assertEquals(expBioseqDB, obsBioseqDB) + + + def test_rmByPattern (self): + iBioseq1 = 
Bioseq("consensus4","GAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq2 = Bioseq("consensus1","TGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq3 = Bioseq("consensus6","TGCCTTGCATGACTGCATGGAGTACCTGCCTG") + iBioseq4 = Bioseq("consensus11","TGCCTGATGGCTCATGGAGTACCTGCCT") + obsBioseqDB = BioseqDB() + obsBioseqDB.setData( [ iBioseq1, iBioseq2, iBioseq3, iBioseq4] ) + + iBioseq5 = Bioseq("consensus4","GAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq6 = Bioseq("consensus6","TGCCTTGCATGACTGCATGGAGTACCTGCCTG") + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq5, iBioseq6 ] ) + + obsBioseqDB.rmByPattern("consensus1+") + self.assertEquals(expBioseqDB, obsBioseqDB) + + + def test_rmByPattern_with_no_existing_pattern (self): + iBioseq1 = Bioseq("consensus4","GAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq2 = Bioseq("consensus1","TGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq3 = Bioseq("consensus6","TGCCTTGCATGACTGCATGGAGTACCTGCCTG") + iBioseq4 = Bioseq("consensus11","TGCCTGATGGCTCATGGAGTACCTGCCT") + obsBioseqDB = BioseqDB() + obsBioseqDB.setData( [ iBioseq1, iBioseq2, iBioseq3, iBioseq4] ) + + iBioseq5 = Bioseq("consensus4","GAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq6 = Bioseq("consensus1","TGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq7 = Bioseq("consensus6","TGCCTTGCATGACTGCATGGAGTACCTGCCTG") + iBioseq8 = Bioseq("consensus11","TGCCTGATGGCTCATGGAGTACCTGCCT") + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq5, iBioseq6, iBioseq7, iBioseq8 ] ) + obsBioseqDB.rmByPattern("noExistingPattern+") + self.assertEquals(expBioseqDB, obsBioseqDB) + + + def test_addBioseqFromABioseqDBIfHeaderContainPattern (self): + iBioseq1 = 
Bioseq("consensus4","GAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq2 = Bioseq("consensus1","TGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq3 = Bioseq("consensus7","TGCCTTGCATGACTGCATGGAGTACCTGCCTG") + iBioseq4 = Bioseq("consensus11","TGCCTGATGGCTCATGGAGTACCTGCCT") + obsBioseqDB = BioseqDB() + obsBioseqDB.setData( [ iBioseq1, iBioseq2, iBioseq3, iBioseq4] ) + + iBioseq5 = Bioseq("Sequence4","GAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq6 = Bioseq("consensus6","TGCCTTGCATGACTGCATGGAGTACCTGCCTG") + inBioseqDB = BioseqDB() + inBioseqDB.setData( [ iBioseq5, iBioseq6 ]) + + iBioseq7 = Bioseq("consensus4","GAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq8 = Bioseq("consensus1","TGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq9 = Bioseq("consensus7","TGCCTTGCATGACTGCATGGAGTACCTGCCTG") + iBioseq10 = Bioseq("consensus11","TGCCTGATGGCTCATGGAGTACCTGCCT") + iBioseq11 = Bioseq("consensus6","TGCCTTGCATGACTGCATGGAGTACCTGCCTG") + + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq7, iBioseq8, iBioseq9, iBioseq10, iBioseq11] ) + + obsBioseqDB.addBioseqFromABioseqDBIfHeaderContainPattern("consensus.*", inBioseqDB) + self.assertEquals(expBioseqDB, obsBioseqDB) + + + def test_addBioseqFromABioseqDBIfHeaderContainPattern_with_no_existing_pattern (self): + iBioseq1 = Bioseq("consensus4","GAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq2 = Bioseq("consensus1","TGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq3 = Bioseq("consensus7","TGCCTTGCATGACTGCATGGAGTACCTGCCTG") + iBioseq4 = Bioseq("consensus11","TGCCTGATGGCTCATGGAGTACCTGCCT") + obsBioseqDB = BioseqDB() + obsBioseqDB.setData( [ iBioseq1, iBioseq2, iBioseq3, iBioseq4] ) + + iBioseq5 = 
Bioseq("Sequence4","GAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq6 = Bioseq("consensus6","TGCCTTGCATGACTGCATGGAGTACCTGCCTG") + inBioseqDB = BioseqDB() + inBioseqDB.setData( [ iBioseq5, iBioseq6 ]) + + iBioseq7 = Bioseq("consensus4","GAGATGGCTCATGGAGTACCGCGAGTGCGGTACCTATGGCCCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGCGAGATGGCTCATGGAGTACCGC") + iBioseq8 = Bioseq("consensus1","TGCCTGAGATGGCTCATGGAGTACCTGCCT") + iBioseq9 = Bioseq("consensus7","TGCCTTGCATGACTGCATGGAGTACCTGCCTG") + iBioseq10 = Bioseq("consensus11","TGCCTGATGGCTCATGGAGTACCTGCCT") + + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq7, iBioseq8, iBioseq9, iBioseq10] ) + + obsBioseqDB.addBioseqFromABioseqDBIfHeaderContainPattern("noExistingPattern", inBioseqDB) + self.assertEquals(expBioseqDB, obsBioseqDB) + + + def test_upCase (self): + iBioseq1 = Bioseq("consensus4","atgacGatgca") + iBioseq2 = Bioseq("consensus1","atgcgaT") + obsBioseqDB = BioseqDB() + obsBioseqDB.setData( [ iBioseq1, iBioseq2 ] ) + iBioseq3 = Bioseq("consensus4","ATGACGATGCA") + iBioseq4 = Bioseq("consensus1","ATGCGAT") + expBioseqDB = BioseqDB() + expBioseqDB.setData( [ iBioseq3, iBioseq4 ] ) + obsBioseqDB.upCase() + self.assertEquals(expBioseqDB, obsBioseqDB) + + + def test_getMap(self): + iBioseq1 = Bioseq("header1","ATGC-RA-GCT") + iBioseq2 = Bioseq("header2","-TGC-RA-GCT") + iBioseq3 = Bioseq("header3","ATGC-RA-GC-") + + iAlignedBioseqDB = BioseqDB() + iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3]) + + obsDict = iAlignedBioseqDB.getDictOfLMapsWithoutGaps() + + expLMap1 = [Map( "header1_subSeq1", "header1", 1, 4 ), Map( "header1_subSeq2", "header1", 6, 7 ), Map( "header1_subSeq3", "header1", 9, 11 )] + expLMap2 = [Map( "header2_subSeq1", "header2", 2, 4 ), Map( "header2_subSeq2", "header2", 6, 7 ), Map( "header2_subSeq3", "header2", 9, 11 )] + expLMap3 = [Map( "header3_subSeq1", "header3", 1, 4 ), Map( "header3_subSeq2", "header3", 6, 7 ), Map( 
"header3_subSeq3", "header3", 9, 10 )] + + expDict = { + "header1": expLMap1, + "header2": expLMap2, + "header3": expLMap3 + } + + self.assertEquals(expDict, obsDict) + + def test_getSeqLengthByListOfName(self): + iBioseq1 = Bioseq("header1","ATGC-RA-GCT") + iBioseq2 = Bioseq("header2","-TGC-RAR") + iBioseq3 = Bioseq("header3","ATGC") + + iBioseqDB = BioseqDB() + iBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3]) + + expList = [11, 4] + obsList = iBioseqDB.getSeqLengthByListOfName(["header1", "header3"]) + + self.assertEquals( expList, obsList ) + +test_suite = unittest.TestSuite() +test_suite.addTest( unittest.makeSuite( Test_BioseqDB ) ) +if __name__ == "__main__": + unittest.TextTestRunner(verbosity=2).run( test_suite )