Mercurial > repos > yufei-luo > s_mart
diff commons/tools/srptTableOverlap.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/tools/srptTableOverlap.py Mon Apr 29 03:20:15 2013 -0400 @@ -0,0 +1,319 @@ +#!/usr/bin/env python + +import os +import sys +import getopt +import logging +import string +import ConfigParser + +from pyRepet.sql.TableAdaptator import * +import pyRepet.sql.RepetDBMySQL +import pyRepet.coord.Map +import pyRepet.coord.Path +import pyRepet.coord.Set + + +def help(): + print + print "usage: %s [ options ]" % ( sys.argv[0].split("/")[-1] ) + print "options:" + print " -h: this help" + print " -q: query table" + print " -s: subject table" + print " -p: by path" + print " -t: table type comparison: qtype/stype where qtype=[map,set,path] and stype=[path,set,map]" + print " -c: configuration file from TEdenovo or TEannot pipeline" + print " -H: MySQL host (if no configuration file)" + print " -U: MySQL user (if no configuration file)" + print " -P: MySQL password (if no configuration file)" + print " -D: MySQL database (if no configuration file)" + print + + +def pathOverlapByPath( qtable, qtype, stable, stype, db, fout, verbose=0 ): + + if qtype == "path": + db.create_path_index( qtable ) + qtablePathAdaptator = TablePathAdaptator( db, qtable ) + path_num_list = qtablePathAdaptator.getPath_num() + elif qtype == "set": + db.create_set_index( qtable ) + qtableSetAdaptator = TableSetAdaptator( db, qtable ) + path_num_list = qtableSetAdaptator.getSet_num() + else: + string = "unknown query table type: %s" % ( qtype ) + if verbose > 0: + print string + logging.error( string ) + sys.exit(1) + string = "nb of paths in query table: %i" % (len(path_num_list) ) + if verbose > 0: + print string + logging.info( string ) + + if stype == "path": + stablePathAdaptator = TableBinPathAdaptator( db, stable ) +# stablePathAdaptator=TablePathAdaptator(db,stable) + elif stype == "set": + stableSetAdaptator = TableBinSetAdaptator( db, stable ) +# stableSetAdaptator=TableSetAdaptator(db,stable) + else: + string = "unknown subject table type: %s" % ( stype ) + if verbose > 0: + print string + logging.error( string ) + sys.exit(1) + + count = 0 + for path_num in path_num_list: + if qtype == "path": + qlist = qtablePathAdaptator.getPathList_from_num( path_num ) + qlist = pyRepet.coord.Path.path_list_rangeQ2Set( qlist ) + elif qtype == "set": + qlist = qtableSetAdaptator.getSetList_from_num( path_num ) + + qlist.sort() + qmin, qmax = pyRepet.coord.Set.set_list_boundaries( qlist ) + + qmin = qmin - 1 + qmax = qmax + 1 + if stype == "path": + slist = stablePathAdaptator.getPathList_from_qcoord(qlist[0].seqname.split()[0],qmin,qmax) + slist = pyRepet.coord.Path.path_list_rangeQ2Set( slist ) + elif stype == "set": + slist = stableSetAdaptator.getSetList_from_qcoord(qlist[0].seqname.split()[0],qmin,qmax) + + if len(slist) > 0: + print "----------------------------------------" + print "query:" + pyRepet.coord.Set.set_list_show( qlist ) + qlist=pyRepet.coord.Set.set_list_merge( qlist ) + qsize=pyRepet.coord.Set.set_list_size( qlist ) + print "query size=",qsize + + slist_dict = pyRepet.coord.Set.set_list_split( slist ) + subj_names = "" + for i,l in slist_dict.items(): + if subj_names != "": + subj_names += "|" + subj_names += "%d:%s" % (i,l[0].name) + subj_count = len(slist_dict.keys()) + + print "subject:" + pyRepet.coord.Set.set_list_show( slist ) + slist = pyRepet.coord.Set.set_list_merge( slist ) + ssize = pyRepet.coord.Set.set_list_size( slist ) + osize = pyRepet.coord.Set.set_list_overlap_size( qlist, slist ) + + print "subject size=",ssize + print "overlap size=",osize + + fout.write("%d\t%s\t%d\t%s\t%d\t%d\t%d\t%f\t%f\n"\ + %(path_num,qlist[0].name,\ + qsize,\ + subj_names,\ + subj_count,\ + ssize,\ + osize,\ + float(osize)/qsize,\ + float(osize)/ssize)) + else: + print "----------------------------------------" + print "query:" + pyRepet.coord.Set.set_list_show( qlist ) + qlist = pyRepet.coord.Set.set_list_merge( qlist ) + qsize = pyRepet.coord.Set.set_list_size( qlist ) + print "query size=",qsize + print "No match!" + fout.write("%d\t%s\t%d\t-\t0\t0\t0\t0.0\t0.0\n"\ + %(path_num,qlist[0].name,qsize)) + + +def getOverlapAllPaths( qtable, qtype, stable, stype, db, verbose=0 ): + """ + For each query in qtable, compute the overlap between its matches and the matches in stable. + """ + if qtype =="map": + qtableAdaptator = pyRepet.sql.TableAdaptator.TableMapAdaptator( db, qtable ) + elif qtype == "path": + qtableAdaptator = pyRepet.sql.TableAdaptator.TablePathAdaptator( db, qtable ) + elif qtype == "set": + qtableAdaptator = pyRepet.sql.TableAdaptator.TableSetAdaptator( db, qtable ) + else: + string = "unknown query table type: %s" % ( qtype ) + if verbose > 0: + print string + logging.error( string ) + sys.exit(1) + + string = "fetching query table data..." + contigs = qtableAdaptator.getContig_name() + string += " %i contig(s)" % ( len(contigs) ) + if verbose > 0: + print string; sys.stdout.flush() + logging.info( string ) + + if stype == "map": + stableAdaptator = pyRepet.sql.TableAdaptator.TableMapAdaptator( db, stable ) + elif stype == "path": + stableAdaptator = pyRepet.sql.TableAdaptator.TablePathAdaptator( db, stable ) + elif stype == "set": + stableAdaptator = pyRepet.sql.TableAdaptator.TableSetAdaptator( db, stable ) + else: + string = "unknown subject table type: %s" % ( stype ) + if verbose > 0: + print string + logging.error( string ) + sys.exit(1) + + string = "looking for overlaps with subject data..." + if verbose > 0: + print string; sys.stdout.flush() + logging.info( string ) + sum_qsize = 0 + sum_osize = 0 + sum_non_osize = 0 + for c in contigs: + string = "contig '%s': "%(c) + qlist = qtableAdaptator.getSetList_from_contig( c ) + qlist = pyRepet.coord.Set.set_list_merge( qlist ) + slist = stableAdaptator.getSetList_from_contig( c ) + slist = pyRepet.coord.Set.set_list_merge( slist ) + qsize = pyRepet.coord.Set.set_list_size( qlist ) + osize = pyRepet.coord.Set.set_list_overlap_size( qlist, slist ) + sum_osize += osize + sum_qsize += qsize + sum_non_osize += qsize - osize + string += "qsize=%d osize=%d" % ( qsize, osize ) + logging.debug( string ) + if verbose > 0: + print string; sys.stdout.flush() + + string = "summary:" + string += "\ncumulative coverage of the query table: %i nt" % ( sum_qsize ) + string += "\nsize of overlaps with the subject table: %i nt" % ( sum_osize ) + string += "\n proportion of query: %.3f" % ( float(sum_osize)/sum_qsize ) + string += "\nsize of non-overlaps with the subject table: %i nt" % ( sum_non_osize ) + string += "\n proportion of query: %.3f" % ( float(sum_non_osize)/sum_qsize ) + if verbose > 0: + print string; sys.stdout.flush() + logging.info( string ) + + return sum_osize, sum_non_osize, sum_qsize + + +def main (): + """ + This program computes the overlaps between two tables recording spatial coordinates. + """ + qtable = "" + stable = "" + type = "" + by_path = False + configFileName = "" + host = "" + user = "" + passwd = "" + db = "" + verbose = 0 + try: + opts, args = getopt.getopt( sys.argv[1:], "hq:s:t:pc:H:U:P:D:v:" ) + except getopt.GetoptError: + help() + sys.exit(1) + if len(args) != 0: + help() + sys.exit(1) + for o,a in opts: + if o == "-h": + help() + sys.exit(0) + elif o == "-q": + qtable = a + elif o == "-s": + stable = a + elif o == "-t": + type = a + elif o == "-p": + by_path = True + elif o == "-c": + configFileName = a + elif o == "-H": + host = a + elif o == "-U": + user = a + elif o == "-P": + passwd = a + elif o == "-D": + db = a + elif o == "-v": + verbose = int(a) + if qtable=="" or stable=="" or \ + (configFileName== "" and (host=="" or \ + user=="" or passwd=="" or db=="")): + print "ERROR: missing compulsory options" + help() + sys.exit(1) + if verbose > 0: + print "START %s" % (sys.argv[0].split("/")[-1]) + sys.stdout.flush() + + if configFileName != "": + config = ConfigParser.ConfigParser() + config.readfp( open(configFileName) ) + host = config.get("repet_env","repet_host") + user = config.get("repet_env","repet_user") + passwd = config.get("repet_env","repet_pw") + dbname = config.get("repet_env","repet_db") + + logfilename = qtable + "-" + stable + "-" + str(os.getpid()) + ".log" + handler = logging.FileHandler( logfilename ) + formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s") + handler.setFormatter( formatter ) + logging.getLogger('').addHandler(handler) + logging.getLogger('').setLevel(logging.DEBUG) + logging.info("started") + + db = pyRepet.sql.RepetDBMySQL.RepetDB( user, host, passwd, dbname ) + + qtype, stype = type.split("/") + + if not db.exist( qtable ): + if not os.path.exists( qtable ): + msg = "ERROR: neither table nor file '%s'" % ( qtable ) + sys.stderr.write( "%s\n" % msg ) + sys.exit(1) + tmp = qtable.replace(".","_") + db.create_table( db, tmp, qtable, qtype ) + qtable = tmp + if not db.exist( stable ): + if not os.path.exists( stable ): + msg = "ERROR: neither table nor file '%s'" % ( stable ) + sys.stderr.write( "%s\n" % msg ) + sys.exit(1) + tmp = stable.replace(".","_") + db.create_table( db, tmp, stable, qtype ) + stable = tmp + + string = "input tables:" + string += "\nquery table: %s ('%s' format)" % ( qtable, qtype ) + string += "\nsubject table: %s ('%s' format)" % ( stable, stype ) + logging.info( string ) + + if by_path: + fout = open(qtable+"_vs_"+stable+".dat","w") + pathOverlapByPath( qtable, qtype, stable, stype, db, fout, verbose ) + fout.close() + else: + getOverlapAllPaths( qtable, qtype, stable, stype, db, verbose ) + + logging.info("finished") + + if verbose > 0: + print "END %s" % (sys.argv[0].split("/")[-1]) + sys.stdout.flush() + + +if __name__ == "__main__": + main()