view test/integration_tests.py @ 26:637830ac8bcd

added validation in metexp to tabular tool; added workaround/fix for L and D compound types
author pieter.lukasse@wur.nl
date Thu, 24 Apr 2014 11:28:38 +0200
parents 53e1eee93430
children
line wrap: on
line source

'''Integration tests for the GCMS project'''

from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
from GCMS import library_lookup, combine_output
from GCMS.rankfilter_GCMS import rankfilter
import os.path
import sys
import unittest
import re


class IntegrationTest(unittest.TestCase):
    '''
    Integration tests that call the main() functions of library_lookup,
    combine_output and rankfilter on the test data shipped with this package.
    '''

    @staticmethod
    def _ensure_dir(path):
        '''
        Create the directory "path" (including parents) when it does not exist yet.
        @param path: directory the test writes its output files to
        '''
        if not os.path.exists(path):
            os.makedirs(path)

    @staticmethod
    def _run_main(main, arguments):
        '''
        Call main() with sys.argv set to ['test'] + arguments.
        The previous sys.argv is restored afterwards, also when main() raises,
        so that one test does not leak its arguments into the next one.
        @param main: argument-less callable that reads its input from sys.argv
        @param arguments: list of command line arguments (without program name)
        '''
        saved_argv = sys.argv
        sys.argv = ['test'] + list(arguments)
        try:
            main()
        finally:
            sys.argv = saved_argv

    def test_library_lookup(self):
        '''
        Run library_lookup.main() for data/NIST_tabular.txt and compare the produced
        file with the reference determined earlier. Tokens that look like decimal
        numbers are compared up to 5 decimal places, all other tokens exactly.
        '''
        # Create out folder
        outdir = "output/"  # tempfile.mkdtemp(prefix='test_library_lookup')
        self._ensure_dir(outdir)
        outfile_base = os.path.join(outdir, 'produced_library_lookup')
        outfile_txt = outfile_base + '.txt'

        # Build up arguments and run
        input_txt = resource_filename(__name__, "data/NIST_tabular.txt")
        library = resource_filename(__name__, "data/RIDB_subset.txt")
        regress_model = resource_filename(__name__, "data/ridb_poly_regression.txt")
        self._run_main(library_lookup.main,
                       [library,
                        input_txt,
                        'Capillary',
                        'Semi-standard non-polar',
                        outfile_txt,
                        'HP-5',
                        regress_model])

        # Compare with reference files
        reference_txt = resource_filename(__name__, 'reference/produced_library_lookup.txt')

        # read both the reference file and the actual output file and
        # convert them to lists of whitespace separated tokens we can compare
        expected = _read_file(reference_txt).split()
        actual = _read_file(outfile_txt).split()

        # zip() below silently stops at the shortest list, so a missing or
        # extra token has to be detected explicitly
        self.assertEqual(len(expected), len(actual))

        for exp, act in zip(expected, actual):
            if re.match('\\d+\\.\\d+', exp):
                try:
                    exp_value = float(exp)
                    act_value = float(act)
                except ValueError:
                    # starts like a number but is not one (or the produced
                    # token is not numeric): fall back to an exact comparison
                    self.assertEqual(exp, act)
                else:
                    self.assertAlmostEqual(exp_value, act_value, places=5)
            else:
                # compare the two tokens themselves; comparing the complete
                # lists here would make the tolerance above meaningless
                self.assertEqual(exp, act)

    def test_combine_output_simple(self):
        '''
        Run combine_output.main() for data/Rankfilter.txt and data/Caslookup.txt.
        Only checks that the run completes; the comparison with the reference
        files is currently disabled (see the commented out lines below).
        '''
        # Create out folder
        outdir = "output/"  # tempfile.mkdtemp(prefix='test_library_lookup')
        self._ensure_dir(outdir)
        outfile_base = os.path.join(outdir, 'produced_combine_output')
        outfile_single_txt = outfile_base + '_single.txt'
        outfile_multi_txt = outfile_base + '_multi.txt'

        # Build up arguments and run
        input_rankfilter = resource_filename(__name__, "data/Rankfilter.txt")
        input_caslookup = resource_filename(__name__, "data/Caslookup.txt")
        self._run_main(combine_output.main,
                       [input_rankfilter,
                        input_caslookup,
                        outfile_single_txt,
                        outfile_multi_txt])
        # Compare with reference files
        # reference_single_txt = resource_filename(__name__, 'reference/produced_combine_output_single.txt')
        # reference_multi_txt = resource_filename(__name__, 'reference/produced_combine_output_multi.txt')
        # self.assertEqual(_read_file(reference_single_txt), _read_file(outfile_single_txt))
        # self.assertEqual(_read_file(reference_multi_txt), _read_file(outfile_multi_txt))

        # Clean up
        # shutil.rmtree(tempdir)

    def def_test_rank_filter_advanced(self):
        '''
        Run main of RankFilter for data/integration/RankFilterInput_conf.txt.
        The name does not start with "test", so it is not collected as a test
        of its own; it is called from test_combine_output_advanced.
        '''
        # Create out folder
        outdir = "output/integration/"
        self._ensure_dir(outdir)

        # Build up arguments and run
        input_txt = resource_filename(__name__, "data/integration/RankFilterInput_conf.txt")
        self._run_main(rankfilter.main, [input_txt])

    def def_test_library_lookup_advanced(self):
        '''
        Run library_lookup.main() for data/integration/NIST_identification_results_tabular.txt
        and write the result to output/integration/produced_library_lookup_ADVANCED.txt.
        The name does not start with "test", so it is not collected as a test
        of its own; it is called from test_combine_output_advanced.
        '''
        # Create out folder
        outdir = "output/integration/"
        self._ensure_dir(outdir)
        outfile_base = os.path.join(outdir, 'produced_library_lookup_ADVANCED')
        outfile_txt = outfile_base + '.txt'

        # Build up arguments and run
        input_txt = resource_filename(__name__, "data/integration/NIST_identification_results_tabular.txt")
        library = resource_filename(__name__, "../repositories/PRIMS-metabolomics/RI_DB_libraries/Library_RI_DB_capillary_columns-noDuplicates.txt")
        regress_model = resource_filename(__name__, "data/integration/regression_MODEL_for_columns.txt")
        self._run_main(library_lookup.main,
                       [library,
                        input_txt,
                        'Capillary',
                        'Semi-standard non-polar',
                        outfile_txt,
                        'DB-5',
                        regress_model])

    def test_combine_output_advanced(self):
        '''
        Variant on test case above, but a bit more complex as some of the centrotypes have
        different NIST hits which should give them different RI values. This test also
        runs not only the combine output, but the other two preceding steps as well,
        so it ensures the integration also works on the current code of all three tools.
        '''
        # Run RankFilter
        self.def_test_rank_filter_advanced()

        # Run library CAS RI lookup
        self.def_test_library_lookup_advanced()

        outdir = "output/integration/"
        outfile_base = os.path.join(outdir, 'produced_combine_output')
        outfile_single_txt = outfile_base + '_single.txt'
        outfile_multi_txt = outfile_base + '_multi.txt'

        # Build up arguments and run
        # NOTE(review): the two inputs are resolved relative to this package while
        # the preceding steps write relative to the working directory; presumably
        # the tests are expected to be run from the package directory - confirm.
        input_rankfilter = resource_filename(__name__, "output/integration/produced_rank_filter_out.txt")
        input_caslookup = resource_filename(__name__, "output/integration/produced_library_lookup_ADVANCED.txt")
        self._run_main(combine_output.main,
                       [input_rankfilter,
                        input_caslookup,
                        outfile_single_txt,
                        outfile_multi_txt])
        # Compare with reference files
        # reference_single_txt = resource_filename(__name__, 'reference/produced_combine_output_single.txt')
        # reference_multi_txt = resource_filename(__name__, 'reference/produced_combine_output_multi.txt')
        # self.assertEqual(_read_file(reference_single_txt), _read_file(outfile_single_txt))
        # self.assertEqual(_read_file(reference_multi_txt), _read_file(outfile_multi_txt))

        # Check 1: output single should have one record per centrotype:
        # TODO: this check is not implemented yet.

        # Check 2: output single has more records than output multi:
        combine_result_single_items = combine_output._process_data(outfile_single_txt)
        combine_result_multi_items = combine_output._process_data(outfile_multi_txt)
        self.assertGreater(len(combine_result_single_items['Centrotype']),
                           len(combine_result_multi_items['Centrotype']))

        # Check 3: library_lookup RI column, centrotype column, ri_svr column are correct:
        caslookup_items = combine_output._process_data(input_caslookup)
        rankfilter_items = combine_output._process_data(input_rankfilter)

        # check that the caslookup RI column is correctly maintained in its original order in
        # the combined file:
        ri_caslookup = caslookup_items['RI']
        ri_combine_single = combine_result_single_items['RI']
        self.assertListEqual(ri_caslookup, ri_combine_single)

        # check the centrotype column's integrity:
        centrotype_caslookup = caslookup_items['Centrotype']
        centrotype_combine_single = combine_result_single_items['Centrotype']
        centrotype_rankfilter = _get_centrotype_rankfilter(rankfilter_items['ID'])
        self.assertListEqual(centrotype_caslookup, centrotype_combine_single)
        self.assertListEqual(centrotype_caslookup, centrotype_rankfilter)

        # integration and integrity checks:
        file_NIST = resource_filename(__name__, "data/integration/NIST_identification_results_tabular.txt")
        file_NIST_items = combine_output._process_data(file_NIST)
        # check that rank filter output has exactly the same ID items as the original NIST input file:
        self.assertListEqual(file_NIST_items['ID'], rankfilter_items['ID'])
        # check the same for the CAS column:
        self.assertListEqual(_get_strippedcas(file_NIST_items['CAS']), rankfilter_items['CAS'])
        # now check the NIST CAS column against the cas lookup results:
        cas_NIST = _get_processedcas(file_NIST_items['CAS'])
        self.assertListEqual(cas_NIST, caslookup_items['CAS'])
        # now check the CAS of the combined result. If all checks are OK, it means the CAS column's order
        # and values remained stable throughout all steps:
        self.assertListEqual(rankfilter_items['CAS'], combine_result_single_items['CAS'])

        # check that the rankfilter RIsvr column is correctly maintained in its original order in
        # the combined file:
        risvr_rankfilter = rankfilter_items['RIsvr']
        risvr_combine_single = combine_result_single_items['RIsvr']
        self.assertListEqual(risvr_rankfilter, risvr_combine_single)

        
   

def _get_centrotype_rankfilter(id_list):
    '''
    Returns the list of centrotype ids given a list of IDs in the
    form e.g. 74-1.0-564-1905200-7, where the characters before the
    first "-" are the centrotype id. An ID without "-" is returned as is.
    @param id_list: iterable of compound ID strings
    @return: list with the centrotype id of each ID, in the same order
    '''
    # iterate over the items directly: no index needed, and it avoids the
    # Python 2 only xrange()
    return [compound_id.split('-', 1)[0] for compound_id in id_list]


def _get_processedcas(cas_list):
    '''
    Returns the list of cas numbers in the form C64175 instead of 64-17-5.
    Surrounding white space is removed as well.
    @param cas_list: iterable of CAS number strings, e.g. " 64-17-5"
    @return: list of "C" + digits strings, in the same order
    '''
    # iterate over the items directly: no index needed, and it avoids the
    # Python 2 only xrange()
    return ['C' + str(cas.replace('-', '').strip()) for cas in cas_list]

def _get_strippedcas(cas_list):
    '''
    Removes the leading and trailing white space from e.g. " 64-17-5".
    @param cas_list: iterable of CAS number strings
    @return: list of the stripped strings, in the same order
    '''
    # iterate over the items directly: no index needed, and it avoids the
    # Python 2 only xrange()
    return [cas.strip() for cas in cas_list]


def _read_file(filename):
    '''
    Helper that returns the complete contents of a file as one string.
    @param filename: path of the file to read
    '''
    with open(filename) as source:
        contents = source.read()
    return contents