Mercurial > repos > yufei-luo > s_mart
view commons/tools/srptTableOverlap.py @ 19:9bcfa7936eec
Deleted selected files
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:23:29 -0400 |
parents | 94ab73e8a190 |
children |
line wrap: on
line source
#!/usr/bin/env python import os import sys import getopt import logging import string import ConfigParser from pyRepet.sql.TableAdaptator import * import pyRepet.sql.RepetDBMySQL import pyRepet.coord.Map import pyRepet.coord.Path import pyRepet.coord.Set def help(): print print "usage: %s [ options ]" % ( sys.argv[0].split("/")[-1] ) print "options:" print " -h: this help" print " -q: query table" print " -s: subject table" print " -p: by path" print " -t: table type comparison: qtype/stype where qtype=[map,set,path] and stype=[path,set,map]" print " -c: configuration file from TEdenovo or TEannot pipeline" print " -H: MySQL host (if no configuration file)" print " -U: MySQL user (if no configuration file)" print " -P: MySQL password (if no configuration file)" print " -D: MySQL database (if no configuration file)" print def pathOverlapByPath( qtable, qtype, stable, stype, db, fout, verbose=0 ): if qtype == "path": db.create_path_index( qtable ) qtablePathAdaptator = TablePathAdaptator( db, qtable ) path_num_list = qtablePathAdaptator.getPath_num() elif qtype == "set": db.create_set_index( qtable ) qtableSetAdaptator = TableSetAdaptator( db, qtable ) path_num_list = qtableSetAdaptator.getSet_num() else: string = "unknown query table type: %s" % ( qtype ) if verbose > 0: print string logging.error( string ) sys.exit(1) string = "nb of paths in query table: %i" % (len(path_num_list) ) if verbose > 0: print string logging.info( string ) if stype == "path": stablePathAdaptator = TableBinPathAdaptator( db, stable ) # stablePathAdaptator=TablePathAdaptator(db,stable) elif stype == "set": stableSetAdaptator = TableBinSetAdaptator( db, stable ) # stableSetAdaptator=TableSetAdaptator(db,stable) else: string = "unknown subject table type: %s" % ( stype ) if verbose > 0: print string logging.error( string ) sys.exit(1) count = 0 for path_num in path_num_list: if qtype == "path": qlist = qtablePathAdaptator.getPathList_from_num( path_num ) qlist = pyRepet.coord.Path.path_list_rangeQ2Set( qlist ) elif qtype == "set": qlist = qtableSetAdaptator.getSetList_from_num( path_num ) qlist.sort() qmin, qmax = pyRepet.coord.Set.set_list_boundaries( qlist ) qmin = qmin - 1 qmax = qmax + 1 if stype == "path": slist = stablePathAdaptator.getPathList_from_qcoord(qlist[0].seqname.split()[0],qmin,qmax) slist = pyRepet.coord.Path.path_list_rangeQ2Set( slist ) elif stype == "set": slist = stableSetAdaptator.getSetList_from_qcoord(qlist[0].seqname.split()[0],qmin,qmax) if len(slist) > 0: print "----------------------------------------" print "query:" pyRepet.coord.Set.set_list_show( qlist ) qlist=pyRepet.coord.Set.set_list_merge( qlist ) qsize=pyRepet.coord.Set.set_list_size( qlist ) print "query size=",qsize slist_dict = pyRepet.coord.Set.set_list_split( slist ) subj_names = "" for i,l in slist_dict.items(): if subj_names != "": subj_names += "|" subj_names += "%d:%s" % (i,l[0].name) subj_count = len(slist_dict.keys()) print "subject:" pyRepet.coord.Set.set_list_show( slist ) slist = pyRepet.coord.Set.set_list_merge( slist ) ssize = pyRepet.coord.Set.set_list_size( slist ) osize = pyRepet.coord.Set.set_list_overlap_size( qlist, slist ) print "subject size=",ssize print "overlap size=",osize fout.write("%d\t%s\t%d\t%s\t%d\t%d\t%d\t%f\t%f\n"\ %(path_num,qlist[0].name,\ qsize,\ subj_names,\ subj_count,\ ssize,\ osize,\ float(osize)/qsize,\ float(osize)/ssize)) else: print "----------------------------------------" print "query:" pyRepet.coord.Set.set_list_show( qlist ) qlist = pyRepet.coord.Set.set_list_merge( qlist ) qsize = pyRepet.coord.Set.set_list_size( qlist ) print "query size=",qsize print "No match!" fout.write("%d\t%s\t%d\t-\t0\t0\t0\t0.0\t0.0\n"\ %(path_num,qlist[0].name,qsize)) def getOverlapAllPaths( qtable, qtype, stable, stype, db, verbose=0 ): """ For each query in qtable, compute the overlap between its matches and the matches in stable. """ if qtype =="map": qtableAdaptator = pyRepet.sql.TableAdaptator.TableMapAdaptator( db, qtable ) elif qtype == "path": qtableAdaptator = pyRepet.sql.TableAdaptator.TablePathAdaptator( db, qtable ) elif qtype == "set": qtableAdaptator = pyRepet.sql.TableAdaptator.TableSetAdaptator( db, qtable ) else: string = "unknown query table type: %s" % ( qtype ) if verbose > 0: print string logging.error( string ) sys.exit(1) string = "fetching query table data..." contigs = qtableAdaptator.getContig_name() string += " %i contig(s)" % ( len(contigs) ) if verbose > 0: print string; sys.stdout.flush() logging.info( string ) if stype == "map": stableAdaptator = pyRepet.sql.TableAdaptator.TableMapAdaptator( db, stable ) elif stype == "path": stableAdaptator = pyRepet.sql.TableAdaptator.TablePathAdaptator( db, stable ) elif stype == "set": stableAdaptator = pyRepet.sql.TableAdaptator.TableSetAdaptator( db, stable ) else: string = "unknown subject table type: %s" % ( stype ) if verbose > 0: print string logging.error( string ) sys.exit(1) string = "looking for overlaps with subject data..." if verbose > 0: print string; sys.stdout.flush() logging.info( string ) sum_qsize = 0 sum_osize = 0 sum_non_osize = 0 for c in contigs: string = "contig '%s': "%(c) qlist = qtableAdaptator.getSetList_from_contig( c ) qlist = pyRepet.coord.Set.set_list_merge( qlist ) slist = stableAdaptator.getSetList_from_contig( c ) slist = pyRepet.coord.Set.set_list_merge( slist ) qsize = pyRepet.coord.Set.set_list_size( qlist ) osize = pyRepet.coord.Set.set_list_overlap_size( qlist, slist ) sum_osize += osize sum_qsize += qsize sum_non_osize += qsize - osize string += "qsize=%d osize=%d" % ( qsize, osize ) logging.debug( string ) if verbose > 0: print string; sys.stdout.flush() string = "summary:" string += "\ncumulative coverage of the query table: %i nt" % ( sum_qsize ) string += "\nsize of overlaps with the subject table: %i nt" % ( sum_osize ) string += "\n proportion of query: %.3f" % ( float(sum_osize)/sum_qsize ) string += "\nsize of non-overlaps with the subject table: %i nt" % ( sum_non_osize ) string += "\n proportion of query: %.3f" % ( float(sum_non_osize)/sum_qsize ) if verbose > 0: print string; sys.stdout.flush() logging.info( string ) return sum_osize, sum_non_osize, sum_qsize def main (): """ This program computes the overlaps between two tables recording spatial coordinates. """ qtable = "" stable = "" type = "" by_path = False configFileName = "" host = "" user = "" passwd = "" db = "" verbose = 0 try: opts, args = getopt.getopt( sys.argv[1:], "hq:s:t:pc:H:U:P:D:v:" ) except getopt.GetoptError: help() sys.exit(1) if len(args) != 0: help() sys.exit(1) for o,a in opts: if o == "-h": help() sys.exit(0) elif o == "-q": qtable = a elif o == "-s": stable = a elif o == "-t": type = a elif o == "-p": by_path = True elif o == "-c": configFileName = a elif o == "-H": host = a elif o == "-U": user = a elif o == "-P": passwd = a elif o == "-D": db = a elif o == "-v": verbose = int(a) if qtable=="" or stable=="" or \ (configFileName== "" and (host=="" or \ user=="" or passwd=="" or db=="")): print "ERROR: missing compulsory options" help() sys.exit(1) if verbose > 0: print "START %s" % (sys.argv[0].split("/")[-1]) sys.stdout.flush() if configFileName != "": config = ConfigParser.ConfigParser() config.readfp( open(configFileName) ) host = config.get("repet_env","repet_host") user = config.get("repet_env","repet_user") passwd = config.get("repet_env","repet_pw") dbname = config.get("repet_env","repet_db") logfilename = qtable + "-" + stable + "-" + str(os.getpid()) + ".log" handler = logging.FileHandler( logfilename ) formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s") handler.setFormatter( formatter ) logging.getLogger('').addHandler(handler) logging.getLogger('').setLevel(logging.DEBUG) logging.info("started") db = pyRepet.sql.RepetDBMySQL.RepetDB( user, host, passwd, dbname ) qtype, stype = type.split("/") if not db.exist( qtable ): if not os.path.exists( qtable ): msg = "ERROR: neither table nor file '%s'" % ( qtable ) sys.stderr.write( "%s\n" % msg ) sys.exit(1) tmp = qtable.replace(".","_") db.create_table( db, tmp, qtable, qtype ) qtable = tmp if not db.exist( stable ): if not os.path.exists( stable ): msg = "ERROR: neither table nor file '%s'" % ( stable ) sys.stderr.write( "%s\n" % msg ) sys.exit(1) tmp = stable.replace(".","_") db.create_table( db, tmp, stable, qtype ) stable = tmp string = "input tables:" string += "\nquery table: %s ('%s' format)" % ( qtable, qtype ) string += "\nsubject table: %s ('%s' format)" % ( stable, stype ) logging.info( string ) if by_path: fout = open(qtable+"_vs_"+stable+".dat","w") pathOverlapByPath( qtable, qtype, stable, stype, db, fout, verbose ) fout.close() else: getOverlapAllPaths( qtable, qtype, stable, stype, db, verbose ) logging.info("finished") if verbose > 0: print "END %s" % (sys.argv[0].split("/")[-1]) sys.stdout.flush() if __name__ == "__main__": main()