Mercurial > repos > fabio > btman
diff query.py @ 19:7f712cc0d3d5 draft
Uploaded 20190304.2
author | fabio |
---|---|
date | Mon, 04 Mar 2019 08:31:28 -0500 |
parents | |
children | c619ad82600e |
line wrap: on
line diff
#!/usr/bin/env python
"""Query a tree stored in a collection with a batch of tab-separated sequences.

The script locates the 'howde' and 'leafnames' files of a collection, builds
a FASTA-like batch file (queries.fa) from the input files, runs the external
``howdesbt query`` command on it and splits the answer into one result file
per query inside the output directory.
"""

import optparse
import os
import shutil
import subprocess
import sys

__version__ = "1.0.0"
VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
# in the case of collections, exitcodes equal to 0 and 1 are not considered errors
ERR_EXIT_CODE = 2
OK_EXIT_CODE = 0


def printLog( logfilepath, message, exitcode=OK_EXIT_CODE, exit=False ):
    """Print message, append it to the log file and optionally terminate.

    logfilepath -- path of the log file; it is opened in append mode.
    message     -- text to print and to log (a newline is added in the log).
    exitcode    -- status passed to sys.exit() when exit is true.
    exit        -- when true, the process is terminated after logging.
    """
    # the parenthesised single-argument form is valid with both the
    # Python 2 print statement and the Python 3 print function
    print( message )
    with open( logfilepath, 'a+' ) as out:
        out.write( message + '\n' )
    if exit:
        sys.exit( exitcode )


def _sanitize( text ):
    """Return text without the characters that are not in VALID_CHARS."""
    return ''.join( c for c in text if c in VALID_CHARS )


def _findTreeFiles( tree_file_names, tree_file_paths ):
    """Return the ( howde, leafnames ) paths of the collection, None when missing."""
    tree_def_filepath = None
    leafnames_filepath = None
    for tree_file_name, tree_file_path in zip( tree_file_names, tree_file_paths ):
        if tree_file_name == 'howde':
            tree_def_filepath = tree_file_path
        elif tree_file_name == 'leafnames':
            leafnames_filepath = tree_file_path
        if tree_def_filepath is not None and leafnames_filepath is not None:
            break
    return tree_def_filepath, leafnames_filepath


def _countNonBlankLines( filepath ):
    """Return the number of lines of filepath that are not blank."""
    with open( filepath ) as handle:
        return sum( 1 for line in handle if line.strip() )


def _writeQueryBatch( options, outlogfile, batch_file_name ):
    """Write every ( id, sequence ) pair of the input files to the batch file.

    The batch file is always created, even when options.files is None.
    Input lines that do not contain exactly two tab-separated fields are
    skipped.
    """
    # the context manager closes the batch file even if an input file fails
    with open( batch_file_name, 'w' ) as batch_file:
        comma_sep_file_paths = options.files
        # check if options.files contains at least one file path
        if comma_sep_file_paths is None:
            return
        file_paths = comma_sep_file_paths.split( "," )
        file_names = options.names.split( "," )
        for idx, file_path in enumerate( file_paths ):
            fixed_file_name = _sanitize( file_names[ idx ] )
            printLog( outlogfile, '> processing file ' + file_names[ idx ] + ' ( fixed_name=\"' + fixed_file_name + '\" ) ' )
            with open( file_path, 'r' ) as content_file:
                for line in content_file:
                    line_split = line.strip().split( "\t" ) # split on tab
                    if len( line_split ) != 2: # 0:id , 1:seq , otherwise skip line
                        continue
                    original_seq_id, seq_text = line_split
                    # fix seq_id using valid chars only
                    seq_id = _sanitize( original_seq_id )
                    printLog( outlogfile, '> sequence ' + original_seq_id + ' ( fixed_name=\"' + seq_id + '\" )' )
                    # write on batch file
                    batch_file.write( '> ' + fixed_file_name + '_' + seq_id + '\n' + seq_text + '\n' )


def _runQuery( tree_def_filepath, batch_file_name, threshold, query_res_file_path, sort ):
    """Run ``howdesbt query`` and return its exit status ( non-zero on failure )."""
    # an argument list (no shell) keeps paths with spaces or shell
    # metacharacters from being re-interpreted
    command = [
        'howdesbt',
        'query',
        '--tree=' + os.path.abspath( tree_def_filepath ),
        os.path.abspath( batch_file_name ) + '=' + str( threshold ),
        '--out=' + query_res_file_path,
    ]
    if sort != 0:
        command.append( '--sort' )
    try:
        return subprocess.call( command )
    except OSError:
        # the executable is missing or cannot be started; 127 is the status
        # a shell reports for "command not found"
        return 127


def _splitResults( query_res_file_path, output_dir_path, leafnames_counter ):
    """Split the answer file into one '<query>_txt' file per query.

    A line starting with '*' opens a new query: its first field is the query
    name and its second field the number of matches. Every following line is
    appended to the file of that query together with the fraction and the
    score computed from the number of matches and leafnames_counter.
    """
    file_path = ''
    fraction = ''
    score = ''
    with open( query_res_file_path ) as query_res_file:
        for line in query_res_file:
            line = line.strip()
            if not line:
                continue
            if line.startswith( '*' ):
                line_split = line.split( ' ' )
                theta_matches = int( line_split[ 1 ] )
                file_name = line_split[ 0 ].replace( '*', '' )
                file_path = os.path.join( output_dir_path, file_name + '_txt' )
                # both values only depend on the header, compute them once
                fraction = str( theta_matches ) + '/' + str( leafnames_counter )
                score = format( round( float( theta_matches ) / float( leafnames_counter ) , 6 ), '6f' )
                # make sure the result file exists even without matches
                with open( file_path, 'a' ):
                    pass
            elif file_path:
                # lines found before the first header belong to no query
                with open( file_path, 'a+' ) as res_file:
                    res_file.write( line + '\t' + fraction + '\t' + score + '\n' )


def querySBT( options, args ):
    """Query the tree of the selected collection with the input sequences.

    options -- parsed command line options; the attributes outputdir,
               outfile, treep, treen, files, names, threshold and sort
               are read.
    args    -- positional command line arguments ( unused ).

    Exits with ERR_EXIT_CODE when the collection does not provide both a
    'howde' and a non-empty 'leafnames' file, or when the query succeeds
    without producing the answer file. A failing query is only logged.
    """
    output_dir_path = options.outputdir
    outlogfile = options.outfile

    # unset options are treated like an empty collection
    tree_file_paths = ( options.treep or '' ).split( ',' )
    tree_file_names = ( options.treen or '' ).split( ',' )
    tree_def_filepath, leafnames_filepath = _findTreeFiles( tree_file_names, tree_file_paths )

    if tree_def_filepath is None or leafnames_filepath is None:
        # printLog terminates the process here
        printLog( outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True )

    leafnames_counter = _countNonBlankLines( leafnames_filepath )
    if leafnames_counter == 0:
        # printLog terminates the process here
        printLog( outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True )

    printLog( outlogfile, 'The selected collection contains a valid tree' )
    # the tree definition and the '.bf' copies are placed in the working directory
    shutil.copyfile( tree_def_filepath, 'howde.txt' )
    tree_def_filepath = 'howde.txt'
    for tree_file_name, tree_file_path in zip( tree_file_names, tree_file_paths ):
        if tree_file_name.endswith( 'detbrief.rrr' ):
            shutil.copyfile( tree_file_path, tree_file_name + '.bf' )

    printLog( outlogfile, 'Creating batch of queries' )
    # create tmp batch file
    batch_file_name = 'queries.fa'
    _writeQueryBatch( options, outlogfile, batch_file_name )

    # query the tree
    printLog( outlogfile, 'Querying the tree' )
    query_res_file_path = os.path.abspath( 'answer.txt' )
    query_exitcode = _runQuery( tree_def_filepath, batch_file_name, options.threshold, query_res_file_path, options.sort )
    if query_exitcode != 0:
        # the whole batch is queried at once, so the failure cannot be
        # attributed to a single sequence or input file
        printLog( outlogfile, '> ERROR: an error has occurred while querying the tree ( exit code ' + str( query_exitcode ) + ' )' )
    elif os.path.exists( query_res_file_path ):
        _splitResults( query_res_file_path, output_dir_path, leafnames_counter )
    else:
        printLog( outlogfile, 'An error has occurred while querying the tree', exitcode=ERR_EXIT_CODE, exit=True )


def __main__():
    """Parse the command line, create the output directory and run the query."""
    # Parse the command line options
    usage = "Usage: query.py --files comma_sep_file_paths --names comma_seq_file_names --treep comma_sep_tree_file_paths --treen comma_sep_tree_file_names --threshold threshold --sort sort_results --outputdir output_dir_path --outfile log_file_path"
    parser = optparse.OptionParser(usage = usage)
    parser.add_option("-v", "--version", action="store_true", dest="version",
                    default=False, help="display version and exit")
    parser.add_option("-f", "--files", type="string",
                    action="store", dest="files", help="comma separated files path")
    parser.add_option("-n", "--names", type="string",
                    action="store", dest="names", help="comma separated names associated to the files specified in --files")
    parser.add_option("-k", "--treep", type="string",
                    action="store", dest="treep", help="paths of files in collection")
    parser.add_option("-m", "--treen", type="string",
                    action="store", dest="treen", help="names of files in collection")
    parser.add_option("-t", "--threshold", type="float", default=0.7,
                    action="store", dest="threshold", help="search threshold")
    parser.add_option("-s", "--sort", type="int", default=1,
                    action="store", dest="sort", help="sort results")
    parser.add_option("-o", "--outputdir", type="string", default="output",
                    action="store", dest="outputdir", help="output directory (collection) path")
    parser.add_option("-r", "--outfile", type="string", default="query.txt",
                    action="store", dest="outfile", help="output log file path")

    (options, args) = parser.parse_args()
    if options.version:
        print( __version__ )
    else:
        # create output dir (collection)
        output_dir_path = options.outputdir
        if not os.path.exists(output_dir_path):
            os.makedirs(output_dir_path)

        querySBT( options, args )


if __name__ == "__main__": __main__()