view test_repex_pipeline.py @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500
parents
children
line wrap: on
line source

#!/usr/bin/env python3
'''
Basic Tarean and RepeatExplorer tests
'''
import subprocess
import tempfile
import unittest
import os
import shutil

def check_for_missing_files(directory, file_list):
    ''' check if files exists in the directory '''
    missing_files = []
    for f in file_list:
        path = os.path.join(directory, f)
        if os.path.exists(path):
            continue
        else:
            missing_files.append(f)
    return missing_files


class TestBasic(unittest.TestCase):
    ''' basic repex-tarean testcase '''
    EXECUTABLE = "./seqclust"

    # file lists to check
    FILE_LIST_BASIC = [
        "./seqclust/clustering/clusters/dir_CL0001/hitsort_part.csv",
        "./seqclust/clustering/clusters/dir_CL0001/reads.fasta",
        "./seqclust/clustering/clusters/dir_CL0001/reads_selection.fasta",
        "./seqclust/clustering/clusters/dir_CL0001/dna_database_annotation.csv",
        "./seqclust/clustering/clusters/dir_CL0001/graph_layout.GL",
        "./seqclust/clustering/clusters/dir_CL0001/graph_layout.png",
        "./seqclust/clustering/clusters/dir_CL0001/graph_layout_tmb.png",
        "./seqclust/clustering/clusters/dir_CL0001/graph_layout_directed.RData",
        "./logfile.txt", "./style1.css", "./documentation.html",
        "./tarean_report.html", "./cluster_report.html",
        "./summary_histogram.png", "./index.html", "./sequences.db",
        "./hitsort.db", "./TAREAN_consensus_rank_1.fasta",
        "./TAREAN_consensus_rank_2.fasta", "./TAREAN_consensus_rank_3.fasta",
        "./TAREAN_consensus_rank_4.fasta", "./seqclust/clustering/hitsort",
        "./seqclust/clustering/hitsort.cls"
    ]
    FILE_LIST_ASSEMBLY = [
        "./seqclust/small_clusters_assembly/small_clusters.aln",
        "./seqclust/small_clusters_assembly/small_clusters.ace",
        "./seqclust/small_clusters_assembly/small_clusters.fasta"
    ]
    FILE_LIST_FILTERING = ["./seqclust/prerun/filter_sequences.fasta"]
    FILE_LIST_COMPARATIVE = ["COMPARATIVE_ANALYSIS_COUNTS.csv"]
    FILE_LIST_CUSTOM_DATABASE = [
        "./seqclust/custom_databases/extra_database",
        "./seqclust/clustering/clusters/dir_CL0001/custom_db_extra_database_annotation.csv"
    ]
    def setUp(self):
        pass

    # helper function
    def tarean_run(self, cmd_options, file_list):
        ''' Basic taren run '''
        # output goes to tmp directory
        tmpdir = tempfile.mkdtemp()
        logfile = tempfile.NamedTemporaryFile(delete=False)
        print("\n------------------------------------------------------")
        print("Temp files:")
        print("   tmpdir : ", tmpdir)
        print("  logfile : ", logfile.name)
        print("------------------------------------------------------")
        print([self.EXECUTABLE] + ['-l', logfile.name, '-v', tmpdir] + cmd_options)
        p = subprocess.Popen(
            args=[self.EXECUTABLE] + ['-l', logfile.name, '-v', tmpdir
                                     ] + cmd_options)
        p.wait()
        status = p.returncode
        missing_files = check_for_missing_files(directory=tmpdir,
                                                file_list=file_list)
        if status:
            # print log file
            print("Non zero exit status!")
            with open(logfile.name) as f:
                print(f.read())

        self.assertEqual(status, 0)
        self.assertEqual(
            len(missing_files),
            0,
            msg="\n missing files: \n" + "\n".join(missing_files))
        shutil.rmtree(tmpdir)
        os.remove(logfile.name)


    def test_help(self):
        '''Test if help option works '''
        p = subprocess.Popen(args=[self.EXECUTABLE, "-h"],
                             stdout=subprocess.PIPE)
        output = str(p.stdout.readlines())
        p.stdout.close()
        p.wait()
        status = p.returncode
        self.assertRegex(output, "usage")
        self.assertRegex(output, "optional arguments")
        self.assertEqual(status, 0)

    def test_basic_no_merging_tarean(self):
        ''' Basic taren run '''
        cmd_options = ['-t', '-p', '-s', '6000', 'test_data/LAS_paired_10k.fas']
        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC)

    def test_basic_with_merging_tarean(self):
        ''' Basic taren run '''
        cmd_options = ['-t', '-p', '-M', '0.2', '-s', '6000',
                       'test_data/LAS_paired_10k.fas']
        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC)


    def test_basic_with_merging_tarean_dust_off(self):
        ''' Basic taren run '''
        cmd_options = ['-t', '-p', '-M', '0.2', '-s', '6000', "-opt", "ILLUMINA_DUST_OFF",
                       'test_data/LAS_paired_10k.fas']
        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC)

    def test_long_with_merging_tarean(self):
        '''Using more data with tarean'''
        cmd_options = ['-t', '-p', '-M', '0.1', '-m', '0.01',
                       'test_data/LAS_paired_25k.fas']
        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC)

    def test_long_with_merging2_tarean(self):
        '''Using more data with tarean 300k reads'''
        cmd_options = ['-t', '-p', '-M', '0.1', '-m', '0.01',
                       'test_data/LAS_paired_300k.fas']
        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC)

    def test_short_comparative_re(self):
        '''comparative analysis, two species, small run'''
        cmd_options = ['-P','3', '-p', '-m', '0.01',
                       'test_data/sequences_comparative.fasta']
        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC + self.FILE_LIST_COMPARATIVE)

    # REPEATEXPLORER - full runs
    def test_basic_no_merging_re(self):
        ''' Basic taren run '''
        cmd_options = ['-p', '-s', '6000', 'test_data/LAS_paired_10k.fas']
        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC + self.FILE_LIST_ASSEMBLY)

    def test_basic_no_merging_re_diamond(self):
        ''' Basic taren run '''
        cmd_options = ['-p', '-s', '6000','-D','DIAMOND', 'test_data/LAS_paired_10k.fas']
        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC + self.FILE_LIST_ASSEMBLY)



    def test_basic_with_merging_re(self):
        ''' Basic taren run '''
        cmd_options = ['-p', '-M', '0.2', '-s', '6000',
                       'test_data/LAS_paired_10k.fas']
        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC + self.FILE_LIST_ASSEMBLY)

    def test_long_with_merging_re(self):
        '''Using more data with tarean'''
        cmd_options = ['-p', '-M', '0.1', '-m', '0.01',
                       'test_data/LAS_paired_25k.fas']
        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC + self.FILE_LIST_ASSEMBLY)

    def test_long_with_merging_re_diamond(self):
        '''Using more data with tarean and using diamond'''
        cmd_options = ['-p', '-M', '0.1', '-m', '0.01','-D','DIAMOND',
                       'test_data/LAS_paired_25k.fas']
        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC + self.FILE_LIST_ASSEMBLY)

    def test_long_with_merging2_re(self):
        '''Using more data with tarean 300k reads'''
        cmd_options = ['-p', '-M', '0.1', '-m', '0.01',
                       'test_data/LAS_paired_300k.fas']
        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC + self.FILE_LIST_ASSEMBLY)

    def test_long_with_merging_and_filtering_re(self):
        '''Using more data with tarean, test of automatic filtering'''
        cmd_options = ['-A', '-p', '-M', '0.2', '-m', '0.01',
                       'test_data/ceu_200k.fasta']
        self.tarean_run(
            cmd_options,
            file_list=self.FILE_LIST_BASIC + self.FILE_LIST_FILTERING + self.FILE_LIST_ASSEMBLY)

    def test_custom_database_re(self):
        ''' Basic taren run '''
        cmd_options = ['-p', '-d', 'test_data/extra_database', 'extra_database', 'test_data/LAS_paired_10k.fas']
        self.tarean_run(cmd_options, file_list=self.FILE_LIST_BASIC + self.FILE_LIST_CUSTOM_DATABASE)

    def tearDown(self):
        pass


SHORT_TASK_NAME_LIST_TAREAN = ['test_help', 'test_basic_no_merging_tarean',
                               'test_basic_with_merging_tarean',
                               'test_basic_with_merging_tarean_dust_off']
LONG_TASK_NAME_LIST_TAREAN = ['test_long_with_merging_tarean',
                              'test_long_with_merging2_tarean']
SHORT_TASK_NAME_LIST_RE = ['test_basic_no_merging_re',
                           'test_basic_with_merging_re',
                           'test_basic_no_merging_re_diamond']
LONG_TASK_NAME_LIST_RE = ['test_long_with_merging_re',
                          'test_long_with_merging2_re',
                          'test_long_with_merging_and_filtering_re',
                          'test_long_with_merging_re_diamond']

COMPARATIVE_LIST = ['test_short_comparative_re']
CUSTOM_DATABASE_LIST = ['test_short_custom_database']

# Test suites:
SHORT_TAREAN_SUITE = unittest.TestSuite([TestBasic(i)
                                   for i in SHORT_TASK_NAME_LIST_TAREAN])
LONG_TAREAN_SUITE = unittest.TestSuite([TestBasic(i)
                                  for i in LONG_TASK_NAME_LIST_TAREAN])
COMPARATIVE_SUITE = unittest.TestSuite([TestBasic(i) for i in COMPARATIVE_LIST])
CUSTOM_DB_SUITE = unittest.TestSuite([TestBasic('test_custom_database_re')])

SHORT_RE_SUITE = unittest.TestSuite([TestBasic(i) for i in SHORT_TASK_NAME_LIST_RE])
LONG_RE_SUITE = unittest.TestSuite([TestBasic(i) for i in LONG_TASK_NAME_LIST_RE])

SHORT_SUITE = unittest.TestSuite([SHORT_RE_SUITE, SHORT_TAREAN_SUITE,
                                  COMPARATIVE_SUITE, CUSTOM_DB_SUITE])

LONG_LONG = unittest.TestSuite([LONG_RE_SUITE, LONG_TAREAN_SUITE])

# for single test tesing
if __name__ == '__main__':
    unittest.main(verbosity=2)