18
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import os
|
|
4 import sys
|
|
5 import getopt
|
|
6 import logging
|
|
7 import string
|
|
8 import ConfigParser
|
|
9
|
|
10 from pyRepet.sql.TableAdaptator import *
|
|
11 import pyRepet.sql.RepetDBMySQL
|
|
12 import pyRepet.coord.Map
|
|
13 import pyRepet.coord.Path
|
|
14 import pyRepet.coord.Set
|
|
15
|
|
16
|
|
17 def help():
|
|
18 print
|
|
19 print "usage: %s [ options ]" % ( sys.argv[0].split("/")[-1] )
|
|
20 print "options:"
|
|
21 print " -h: this help"
|
|
22 print " -q: query table"
|
|
23 print " -s: subject table"
|
|
24 print " -p: by path"
|
|
25 print " -t: table type comparison: qtype/stype where qtype=[map,set,path] and stype=[path,set,map]"
|
|
26 print " -c: configuration file from TEdenovo or TEannot pipeline"
|
|
27 print " -H: MySQL host (if no configuration file)"
|
|
28 print " -U: MySQL user (if no configuration file)"
|
|
29 print " -P: MySQL password (if no configuration file)"
|
|
30 print " -D: MySQL database (if no configuration file)"
|
|
31 print
|
|
32
|
|
33
|
|
34 def pathOverlapByPath( qtable, qtype, stable, stype, db, fout, verbose=0 ):
|
|
35
|
|
36 if qtype == "path":
|
|
37 db.create_path_index( qtable )
|
|
38 qtablePathAdaptator = TablePathAdaptator( db, qtable )
|
|
39 path_num_list = qtablePathAdaptator.getPath_num()
|
|
40 elif qtype == "set":
|
|
41 db.create_set_index( qtable )
|
|
42 qtableSetAdaptator = TableSetAdaptator( db, qtable )
|
|
43 path_num_list = qtableSetAdaptator.getSet_num()
|
|
44 else:
|
|
45 string = "unknown query table type: %s" % ( qtype )
|
|
46 if verbose > 0:
|
|
47 print string
|
|
48 logging.error( string )
|
|
49 sys.exit(1)
|
|
50 string = "nb of paths in query table: %i" % (len(path_num_list) )
|
|
51 if verbose > 0:
|
|
52 print string
|
|
53 logging.info( string )
|
|
54
|
|
55 if stype == "path":
|
|
56 stablePathAdaptator = TableBinPathAdaptator( db, stable )
|
|
57 # stablePathAdaptator=TablePathAdaptator(db,stable)
|
|
58 elif stype == "set":
|
|
59 stableSetAdaptator = TableBinSetAdaptator( db, stable )
|
|
60 # stableSetAdaptator=TableSetAdaptator(db,stable)
|
|
61 else:
|
|
62 string = "unknown subject table type: %s" % ( stype )
|
|
63 if verbose > 0:
|
|
64 print string
|
|
65 logging.error( string )
|
|
66 sys.exit(1)
|
|
67
|
|
68 count = 0
|
|
69 for path_num in path_num_list:
|
|
70 if qtype == "path":
|
|
71 qlist = qtablePathAdaptator.getPathList_from_num( path_num )
|
|
72 qlist = pyRepet.coord.Path.path_list_rangeQ2Set( qlist )
|
|
73 elif qtype == "set":
|
|
74 qlist = qtableSetAdaptator.getSetList_from_num( path_num )
|
|
75
|
|
76 qlist.sort()
|
|
77 qmin, qmax = pyRepet.coord.Set.set_list_boundaries( qlist )
|
|
78
|
|
79 qmin = qmin - 1
|
|
80 qmax = qmax + 1
|
|
81 if stype == "path":
|
|
82 slist = stablePathAdaptator.getPathList_from_qcoord(qlist[0].seqname.split()[0],qmin,qmax)
|
|
83 slist = pyRepet.coord.Path.path_list_rangeQ2Set( slist )
|
|
84 elif stype == "set":
|
|
85 slist = stableSetAdaptator.getSetList_from_qcoord(qlist[0].seqname.split()[0],qmin,qmax)
|
|
86
|
|
87 if len(slist) > 0:
|
|
88 print "----------------------------------------"
|
|
89 print "query:"
|
|
90 pyRepet.coord.Set.set_list_show( qlist )
|
|
91 qlist=pyRepet.coord.Set.set_list_merge( qlist )
|
|
92 qsize=pyRepet.coord.Set.set_list_size( qlist )
|
|
93 print "query size=",qsize
|
|
94
|
|
95 slist_dict = pyRepet.coord.Set.set_list_split( slist )
|
|
96 subj_names = ""
|
|
97 for i,l in slist_dict.items():
|
|
98 if subj_names != "":
|
|
99 subj_names += "|"
|
|
100 subj_names += "%d:%s" % (i,l[0].name)
|
|
101 subj_count = len(slist_dict.keys())
|
|
102
|
|
103 print "subject:"
|
|
104 pyRepet.coord.Set.set_list_show( slist )
|
|
105 slist = pyRepet.coord.Set.set_list_merge( slist )
|
|
106 ssize = pyRepet.coord.Set.set_list_size( slist )
|
|
107 osize = pyRepet.coord.Set.set_list_overlap_size( qlist, slist )
|
|
108
|
|
109 print "subject size=",ssize
|
|
110 print "overlap size=",osize
|
|
111
|
|
112 fout.write("%d\t%s\t%d\t%s\t%d\t%d\t%d\t%f\t%f\n"\
|
|
113 %(path_num,qlist[0].name,\
|
|
114 qsize,\
|
|
115 subj_names,\
|
|
116 subj_count,\
|
|
117 ssize,\
|
|
118 osize,\
|
|
119 float(osize)/qsize,\
|
|
120 float(osize)/ssize))
|
|
121 else:
|
|
122 print "----------------------------------------"
|
|
123 print "query:"
|
|
124 pyRepet.coord.Set.set_list_show( qlist )
|
|
125 qlist = pyRepet.coord.Set.set_list_merge( qlist )
|
|
126 qsize = pyRepet.coord.Set.set_list_size( qlist )
|
|
127 print "query size=",qsize
|
|
128 print "No match!"
|
|
129 fout.write("%d\t%s\t%d\t-\t0\t0\t0\t0.0\t0.0\n"\
|
|
130 %(path_num,qlist[0].name,qsize))
|
|
131
|
|
132
|
|
133 def getOverlapAllPaths( qtable, qtype, stable, stype, db, verbose=0 ):
|
|
134 """
|
|
135 For each query in qtable, compute the overlap between its matches and the matches in stable.
|
|
136 """
|
|
137 if qtype =="map":
|
|
138 qtableAdaptator = pyRepet.sql.TableAdaptator.TableMapAdaptator( db, qtable )
|
|
139 elif qtype == "path":
|
|
140 qtableAdaptator = pyRepet.sql.TableAdaptator.TablePathAdaptator( db, qtable )
|
|
141 elif qtype == "set":
|
|
142 qtableAdaptator = pyRepet.sql.TableAdaptator.TableSetAdaptator( db, qtable )
|
|
143 else:
|
|
144 string = "unknown query table type: %s" % ( qtype )
|
|
145 if verbose > 0:
|
|
146 print string
|
|
147 logging.error( string )
|
|
148 sys.exit(1)
|
|
149
|
|
150 string = "fetching query table data..."
|
|
151 contigs = qtableAdaptator.getContig_name()
|
|
152 string += " %i contig(s)" % ( len(contigs) )
|
|
153 if verbose > 0:
|
|
154 print string; sys.stdout.flush()
|
|
155 logging.info( string )
|
|
156
|
|
157 if stype == "map":
|
|
158 stableAdaptator = pyRepet.sql.TableAdaptator.TableMapAdaptator( db, stable )
|
|
159 elif stype == "path":
|
|
160 stableAdaptator = pyRepet.sql.TableAdaptator.TablePathAdaptator( db, stable )
|
|
161 elif stype == "set":
|
|
162 stableAdaptator = pyRepet.sql.TableAdaptator.TableSetAdaptator( db, stable )
|
|
163 else:
|
|
164 string = "unknown subject table type: %s" % ( stype )
|
|
165 if verbose > 0:
|
|
166 print string
|
|
167 logging.error( string )
|
|
168 sys.exit(1)
|
|
169
|
|
170 string = "looking for overlaps with subject data..."
|
|
171 if verbose > 0:
|
|
172 print string; sys.stdout.flush()
|
|
173 logging.info( string )
|
|
174 sum_qsize = 0
|
|
175 sum_osize = 0
|
|
176 sum_non_osize = 0
|
|
177 for c in contigs:
|
|
178 string = "contig '%s': "%(c)
|
|
179 qlist = qtableAdaptator.getSetList_from_contig( c )
|
|
180 qlist = pyRepet.coord.Set.set_list_merge( qlist )
|
|
181 slist = stableAdaptator.getSetList_from_contig( c )
|
|
182 slist = pyRepet.coord.Set.set_list_merge( slist )
|
|
183 qsize = pyRepet.coord.Set.set_list_size( qlist )
|
|
184 osize = pyRepet.coord.Set.set_list_overlap_size( qlist, slist )
|
|
185 sum_osize += osize
|
|
186 sum_qsize += qsize
|
|
187 sum_non_osize += qsize - osize
|
|
188 string += "qsize=%d osize=%d" % ( qsize, osize )
|
|
189 logging.debug( string )
|
|
190 if verbose > 0:
|
|
191 print string; sys.stdout.flush()
|
|
192
|
|
193 string = "summary:"
|
|
194 string += "\ncumulative coverage of the query table: %i nt" % ( sum_qsize )
|
|
195 string += "\nsize of overlaps with the subject table: %i nt" % ( sum_osize )
|
|
196 string += "\n proportion of query: %.3f" % ( float(sum_osize)/sum_qsize )
|
|
197 string += "\nsize of non-overlaps with the subject table: %i nt" % ( sum_non_osize )
|
|
198 string += "\n proportion of query: %.3f" % ( float(sum_non_osize)/sum_qsize )
|
|
199 if verbose > 0:
|
|
200 print string; sys.stdout.flush()
|
|
201 logging.info( string )
|
|
202
|
|
203 return sum_osize, sum_non_osize, sum_qsize
|
|
204
|
|
205
|
|
206 def main ():
|
|
207 """
|
|
208 This program computes the overlaps between two tables recording spatial coordinates.
|
|
209 """
|
|
210 qtable = ""
|
|
211 stable = ""
|
|
212 type = ""
|
|
213 by_path = False
|
|
214 configFileName = ""
|
|
215 host = ""
|
|
216 user = ""
|
|
217 passwd = ""
|
|
218 db = ""
|
|
219 verbose = 0
|
|
220 try:
|
|
221 opts, args = getopt.getopt( sys.argv[1:], "hq:s:t:pc:H:U:P:D:v:" )
|
|
222 except getopt.GetoptError:
|
|
223 help()
|
|
224 sys.exit(1)
|
|
225 if len(args) != 0:
|
|
226 help()
|
|
227 sys.exit(1)
|
|
228 for o,a in opts:
|
|
229 if o == "-h":
|
|
230 help()
|
|
231 sys.exit(0)
|
|
232 elif o == "-q":
|
|
233 qtable = a
|
|
234 elif o == "-s":
|
|
235 stable = a
|
|
236 elif o == "-t":
|
|
237 type = a
|
|
238 elif o == "-p":
|
|
239 by_path = True
|
|
240 elif o == "-c":
|
|
241 configFileName = a
|
|
242 elif o == "-H":
|
|
243 host = a
|
|
244 elif o == "-U":
|
|
245 user = a
|
|
246 elif o == "-P":
|
|
247 passwd = a
|
|
248 elif o == "-D":
|
|
249 db = a
|
|
250 elif o == "-v":
|
|
251 verbose = int(a)
|
|
252 if qtable=="" or stable=="" or \
|
|
253 (configFileName== "" and (host=="" or \
|
|
254 user=="" or passwd=="" or db=="")):
|
|
255 print "ERROR: missing compulsory options"
|
|
256 help()
|
|
257 sys.exit(1)
|
|
258 if verbose > 0:
|
|
259 print "START %s" % (sys.argv[0].split("/")[-1])
|
|
260 sys.stdout.flush()
|
|
261
|
|
262 if configFileName != "":
|
|
263 config = ConfigParser.ConfigParser()
|
|
264 config.readfp( open(configFileName) )
|
|
265 host = config.get("repet_env","repet_host")
|
|
266 user = config.get("repet_env","repet_user")
|
|
267 passwd = config.get("repet_env","repet_pw")
|
|
268 dbname = config.get("repet_env","repet_db")
|
|
269
|
|
270 logfilename = qtable + "-" + stable + "-" + str(os.getpid()) + ".log"
|
|
271 handler = logging.FileHandler( logfilename )
|
|
272 formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
|
|
273 handler.setFormatter( formatter )
|
|
274 logging.getLogger('').addHandler(handler)
|
|
275 logging.getLogger('').setLevel(logging.DEBUG)
|
|
276 logging.info("started")
|
|
277
|
|
278 db = pyRepet.sql.RepetDBMySQL.RepetDB( user, host, passwd, dbname )
|
|
279
|
|
280 qtype, stype = type.split("/")
|
|
281
|
|
282 if not db.exist( qtable ):
|
|
283 if not os.path.exists( qtable ):
|
|
284 msg = "ERROR: neither table nor file '%s'" % ( qtable )
|
|
285 sys.stderr.write( "%s\n" % msg )
|
|
286 sys.exit(1)
|
|
287 tmp = qtable.replace(".","_")
|
|
288 db.create_table( db, tmp, qtable, qtype )
|
|
289 qtable = tmp
|
|
290 if not db.exist( stable ):
|
|
291 if not os.path.exists( stable ):
|
|
292 msg = "ERROR: neither table nor file '%s'" % ( stable )
|
|
293 sys.stderr.write( "%s\n" % msg )
|
|
294 sys.exit(1)
|
|
295 tmp = stable.replace(".","_")
|
|
296 db.create_table( db, tmp, stable, qtype )
|
|
297 stable = tmp
|
|
298
|
|
299 string = "input tables:"
|
|
300 string += "\nquery table: %s ('%s' format)" % ( qtable, qtype )
|
|
301 string += "\nsubject table: %s ('%s' format)" % ( stable, stype )
|
|
302 logging.info( string )
|
|
303
|
|
304 if by_path:
|
|
305 fout = open(qtable+"_vs_"+stable+".dat","w")
|
|
306 pathOverlapByPath( qtable, qtype, stable, stype, db, fout, verbose )
|
|
307 fout.close()
|
|
308 else:
|
|
309 getOverlapAllPaths( qtable, qtype, stable, stype, db, verbose )
|
|
310
|
|
311 logging.info("finished")
|
|
312
|
|
313 if verbose > 0:
|
|
314 print "END %s" % (sys.argv[0].split("/")[-1])
|
|
315 sys.stdout.flush()
|
|
316
|
|
317
|
|
318 if __name__ == "__main__":
|
|
319 main()
|