diff query.py @ 19:7f712cc0d3d5 draft

Uploaded 20190304.2
author fabio
date Mon, 04 Mar 2019 08:31:28 -0500
parents
children c619ad82600e
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/query.py	Mon Mar 04 08:31:28 2019 -0500
@@ -0,0 +1,148 @@
+#!/usr/bin/env python
+
+import sys, os, optparse, shutil
+
+__version__ = "1.0.0"  # printed by the --version option
+VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'  # whitelist used to sanitize file names and sequence ids
+# in the case of collections, exit codes equal to 0 and 1 are not considered errors
+ERR_EXIT_CODE = 2  # exit status used when the collection or the query result is not usable
+OK_EXIT_CODE = 0  # default exit status of printLog
+
+def printLog( logfilepath, message, exitcode=OK_EXIT_CODE, exit=False ):
+    print message
+    with open( logfilepath, 'a+' ) as out:
+        out.write( message + '\n' )
+    if exit:
+        sys.exit( exitcode )
+
+def querySBT( options, args ):
+    output_dir_path = options.outputdir
+    outlogfile = options.outfile
+    
+    tree_file_paths = options.treep.split( ',' )
+    tree_file_names = options.treen.split( ',' )
+    tree_def_filepath = None
+    leafnames_filepath = None
+    for idx, tree_file_name in enumerate( tree_file_names ):
+        if tree_file_name == 'howde':
+            tree_def_filepath = tree_file_paths[ idx ]
+        elif tree_file_name == 'leafnames':
+            leafnames_filepath = tree_file_paths[ idx ]
+        if tree_def_filepath is not None and leafnames_filepath is not None:
+            break
+
+    if tree_def_filepath is not None and leafnames_filepath is not None:
+        leafnames_counter = 0
+        with open( leafnames_filepath ) as leafnames_file:
+            for line in leafnames_file:
+                if line.strip():
+                    leafnames_counter += 1
+        if leafnames_counter > 0:
+            printLog( outlogfile, 'The selected collection contains a valid tree' )
+            shutil.copyfile( tree_def_filepath, 'howde.txt' )
+            tree_def_filepath = 'howde.txt'
+            for idx, tree_file_name in enumerate( tree_file_names ):
+                if tree_file_name.endswith( 'detbrief.rrr' ):
+                    shutil.copyfile( tree_file_paths[ idx ], tree_file_name + '.bf' )
+            
+            printLog( outlogfile, 'Creating batch of queries' )
+            # create tmp batch file
+            batch_file_name = 'queries.fa'
+            batch_file = open( batch_file_name, 'w' )
+
+            comma_sep_file_paths = options.files
+            # check if options.files contains at least one file path
+            if comma_sep_file_paths is not None:
+                # split file paths
+                file_paths = comma_sep_file_paths.split(",")
+                # split file names
+                file_names = options.names.split(",")
+                for idx, file_path in enumerate(file_paths):
+                    fixed_file_name = ''.join( c for c in file_names[ idx ] if c in VALID_CHARS )
+                    printLog( outlogfile, '> processing file ' + file_names[ idx ] + ' ( fixed_name=\"' + fixed_file_name + '\" ) ' )
+                    with open(file_path, 'r') as content_file:
+                        for line in content_file:
+                            line = line.strip()
+                            if line:
+                                line_split = line.strip().split("\t") # split on tab
+                                if len(line_split) == 2: # 0:id , 1:seq , otherwise skip line
+                                    original_seq_id = line_split[0]
+                                    # fix seq_id using valid chars only
+                                    seq_id = ''.join( c for c in original_seq_id if c in VALID_CHARS )
+                                    printLog( outlogfile, '> sequence ' + original_seq_id + ' ( fixed_name=\"' + seq_id + '\" )' )
+                                    seq_text = line_split[1]
+
+                                    # write on batch file
+                                    batch_file.write( '> ' + fixed_file_name + '_' + seq_id + '\n' + seq_text + '\n' )
+            batch_file.close()
+            # query the tree
+            printLog( outlogfile, 'Querying the tree' )
+            query_res_file_path = os.path.abspath( 'answer.txt' )
+            sort_param = '--sort'
+            if options.sort == 0:
+                sort_param = ''
+            query_exitcode = os.system( 'howdesbt query --tree=' + os.path.abspath( tree_def_filepath ) + ' ' + os.path.abspath( batch_file_name ) + '=' + str(options.threshold) + ' --out=' + query_res_file_path ) + ' ' + sort_param
+            if query_exitcode > 0:
+                printLog( outlogfile, '> ERROR: an error has occurred while querying the tree with the sequence [id: ' + seq_id + '] in input file ' + file_names[ idx ] )
+            else:
+                if os.path.exists( query_res_file_path ):
+                    with open( query_res_file_path ) as query_res_file:
+                        file_path = ''
+                        theta_matches = 0
+                        for line in query_res_file:
+                            line = line.strip()
+                            if line:
+                                if line.startswith( '*' ):
+                                    line_split = line.split( ' ' )
+                                    theta_matches = int( line_split[ 1 ] )
+                                    file_name = line_split[ 0 ].replace( '*', '' )
+                                    file_path = os.path.join( output_dir_path, file_name + '_txt' )
+                                    open( file_path, 'a' ).close()
+                                else:
+                                    res_file = open( file_path, 'a+' )
+                                    fraction = str( theta_matches ) + '/' + str( leafnames_counter )
+                                    score = format( round( float( theta_matches ) / float( leafnames_counter ) , 6 ), '6f' )
+                                    res_file.write( line + '\t' + fraction + '\t' + score + '\n' )
+                                    res_file.close()
+                else:
+                    printLog( outlogfile, 'An error has occurred while querying the tree', exitcode=ERR_EXIT_CODE, exit=True )
+        else:
+            printLog( outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True )
+    else:
+        printLog( outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True )
+
+def __main__():
+    # Parse the command line options
+    usage = "Usage: query.py --files comma_sep_file_paths --names comma_seq_file_names --sequences sequences_text --search search_mode --exact exact_alg --sthreshold threshold --outputdir output_dir_path"
+    parser = optparse.OptionParser(usage = usage)
+    parser.add_option("-v", "--version", action="store_true", dest="version",
+                    default=False, help="display version and exit")
+    parser.add_option("-f", "--files", type="string",
+                    action="store", dest="files", help="comma separated files path")
+    parser.add_option("-n", "--names", type="string",
+                    action="store", dest="names", help="comma separated names associated to the files specified in --files")
+    parser.add_option("-k", "--treep", type="string",
+                    action="store", dest="treep", help="paths of files in collection")
+    parser.add_option("-m", "--treen", type="string",
+                    action="store", dest="treen", help="names of files in collection")
+    parser.add_option("-t", "--threshold", type="float", default=0.7,
+                    action="store", dest="threshold", help="search threshold")
+    parser.add_option("-s", "--sort", type="int", default=1,
+                    action="store", dest="sort", help="sort results")
+    parser.add_option("-o", "--outputdir", type="string", default="output",
+                    action="store", dest="outputdir", help="output directory (collection) path")
+    parser.add_option("-r", "--outfile", type="string", default="query.txt",
+                    action="store", dest="outfile", help="output log file path")
+
+    (options, args) = parser.parse_args()
+    if options.version:
+        print __version__
+    else:
+        # create output dir (collection)
+        output_dir_path = options.outputdir
+        if not os.path.exists(output_dir_path):
+            os.makedirs(output_dir_path)
+
+        querySBT( options, args )
+
+if __name__ == "__main__": __main__()