Mercurial > repos > yufei-luo > s_mart
comparison commons/tools/srptTableOverlap.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
17:b0e8584489e6 | 18:94ab73e8a190 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import os | |
4 import sys | |
5 import getopt | |
6 import logging | |
7 import string | |
8 import ConfigParser | |
9 | |
10 from pyRepet.sql.TableAdaptator import * | |
11 import pyRepet.sql.RepetDBMySQL | |
12 import pyRepet.coord.Map | |
13 import pyRepet.coord.Path | |
14 import pyRepet.coord.Set | |
15 | |
16 | |
17 def help(): | |
18 print | |
19 print "usage: %s [ options ]" % ( sys.argv[0].split("/")[-1] ) | |
20 print "options:" | |
21 print " -h: this help" | |
22 print " -q: query table" | |
23 print " -s: subject table" | |
24 print " -p: by path" | |
25 print " -t: table type comparison: qtype/stype where qtype=[map,set,path] and stype=[path,set,map]" | |
26 print " -c: configuration file from TEdenovo or TEannot pipeline" | |
27 print " -H: MySQL host (if no configuration file)" | |
28 print " -U: MySQL user (if no configuration file)" | |
29 print " -P: MySQL password (if no configuration file)" | |
30 print " -D: MySQL database (if no configuration file)" | |
31 print | |
32 | |
33 | |
34 def pathOverlapByPath( qtable, qtype, stable, stype, db, fout, verbose=0 ): | |
35 | |
36 if qtype == "path": | |
37 db.create_path_index( qtable ) | |
38 qtablePathAdaptator = TablePathAdaptator( db, qtable ) | |
39 path_num_list = qtablePathAdaptator.getPath_num() | |
40 elif qtype == "set": | |
41 db.create_set_index( qtable ) | |
42 qtableSetAdaptator = TableSetAdaptator( db, qtable ) | |
43 path_num_list = qtableSetAdaptator.getSet_num() | |
44 else: | |
45 string = "unknown query table type: %s" % ( qtype ) | |
46 if verbose > 0: | |
47 print string | |
48 logging.error( string ) | |
49 sys.exit(1) | |
50 string = "nb of paths in query table: %i" % (len(path_num_list) ) | |
51 if verbose > 0: | |
52 print string | |
53 logging.info( string ) | |
54 | |
55 if stype == "path": | |
56 stablePathAdaptator = TableBinPathAdaptator( db, stable ) | |
57 # stablePathAdaptator=TablePathAdaptator(db,stable) | |
58 elif stype == "set": | |
59 stableSetAdaptator = TableBinSetAdaptator( db, stable ) | |
60 # stableSetAdaptator=TableSetAdaptator(db,stable) | |
61 else: | |
62 string = "unknown subject table type: %s" % ( stype ) | |
63 if verbose > 0: | |
64 print string | |
65 logging.error( string ) | |
66 sys.exit(1) | |
67 | |
68 count = 0 | |
69 for path_num in path_num_list: | |
70 if qtype == "path": | |
71 qlist = qtablePathAdaptator.getPathList_from_num( path_num ) | |
72 qlist = pyRepet.coord.Path.path_list_rangeQ2Set( qlist ) | |
73 elif qtype == "set": | |
74 qlist = qtableSetAdaptator.getSetList_from_num( path_num ) | |
75 | |
76 qlist.sort() | |
77 qmin, qmax = pyRepet.coord.Set.set_list_boundaries( qlist ) | |
78 | |
79 qmin = qmin - 1 | |
80 qmax = qmax + 1 | |
81 if stype == "path": | |
82 slist = stablePathAdaptator.getPathList_from_qcoord(qlist[0].seqname.split()[0],qmin,qmax) | |
83 slist = pyRepet.coord.Path.path_list_rangeQ2Set( slist ) | |
84 elif stype == "set": | |
85 slist = stableSetAdaptator.getSetList_from_qcoord(qlist[0].seqname.split()[0],qmin,qmax) | |
86 | |
87 if len(slist) > 0: | |
88 print "----------------------------------------" | |
89 print "query:" | |
90 pyRepet.coord.Set.set_list_show( qlist ) | |
91 qlist=pyRepet.coord.Set.set_list_merge( qlist ) | |
92 qsize=pyRepet.coord.Set.set_list_size( qlist ) | |
93 print "query size=",qsize | |
94 | |
95 slist_dict = pyRepet.coord.Set.set_list_split( slist ) | |
96 subj_names = "" | |
97 for i,l in slist_dict.items(): | |
98 if subj_names != "": | |
99 subj_names += "|" | |
100 subj_names += "%d:%s" % (i,l[0].name) | |
101 subj_count = len(slist_dict.keys()) | |
102 | |
103 print "subject:" | |
104 pyRepet.coord.Set.set_list_show( slist ) | |
105 slist = pyRepet.coord.Set.set_list_merge( slist ) | |
106 ssize = pyRepet.coord.Set.set_list_size( slist ) | |
107 osize = pyRepet.coord.Set.set_list_overlap_size( qlist, slist ) | |
108 | |
109 print "subject size=",ssize | |
110 print "overlap size=",osize | |
111 | |
112 fout.write("%d\t%s\t%d\t%s\t%d\t%d\t%d\t%f\t%f\n"\ | |
113 %(path_num,qlist[0].name,\ | |
114 qsize,\ | |
115 subj_names,\ | |
116 subj_count,\ | |
117 ssize,\ | |
118 osize,\ | |
119 float(osize)/qsize,\ | |
120 float(osize)/ssize)) | |
121 else: | |
122 print "----------------------------------------" | |
123 print "query:" | |
124 pyRepet.coord.Set.set_list_show( qlist ) | |
125 qlist = pyRepet.coord.Set.set_list_merge( qlist ) | |
126 qsize = pyRepet.coord.Set.set_list_size( qlist ) | |
127 print "query size=",qsize | |
128 print "No match!" | |
129 fout.write("%d\t%s\t%d\t-\t0\t0\t0\t0.0\t0.0\n"\ | |
130 %(path_num,qlist[0].name,qsize)) | |
131 | |
132 | |
133 def getOverlapAllPaths( qtable, qtype, stable, stype, db, verbose=0 ): | |
134 """ | |
135 For each query in qtable, compute the overlap between its matches and the matches in stable. | |
136 """ | |
137 if qtype =="map": | |
138 qtableAdaptator = pyRepet.sql.TableAdaptator.TableMapAdaptator( db, qtable ) | |
139 elif qtype == "path": | |
140 qtableAdaptator = pyRepet.sql.TableAdaptator.TablePathAdaptator( db, qtable ) | |
141 elif qtype == "set": | |
142 qtableAdaptator = pyRepet.sql.TableAdaptator.TableSetAdaptator( db, qtable ) | |
143 else: | |
144 string = "unknown query table type: %s" % ( qtype ) | |
145 if verbose > 0: | |
146 print string | |
147 logging.error( string ) | |
148 sys.exit(1) | |
149 | |
150 string = "fetching query table data..." | |
151 contigs = qtableAdaptator.getContig_name() | |
152 string += " %i contig(s)" % ( len(contigs) ) | |
153 if verbose > 0: | |
154 print string; sys.stdout.flush() | |
155 logging.info( string ) | |
156 | |
157 if stype == "map": | |
158 stableAdaptator = pyRepet.sql.TableAdaptator.TableMapAdaptator( db, stable ) | |
159 elif stype == "path": | |
160 stableAdaptator = pyRepet.sql.TableAdaptator.TablePathAdaptator( db, stable ) | |
161 elif stype == "set": | |
162 stableAdaptator = pyRepet.sql.TableAdaptator.TableSetAdaptator( db, stable ) | |
163 else: | |
164 string = "unknown subject table type: %s" % ( stype ) | |
165 if verbose > 0: | |
166 print string | |
167 logging.error( string ) | |
168 sys.exit(1) | |
169 | |
170 string = "looking for overlaps with subject data..." | |
171 if verbose > 0: | |
172 print string; sys.stdout.flush() | |
173 logging.info( string ) | |
174 sum_qsize = 0 | |
175 sum_osize = 0 | |
176 sum_non_osize = 0 | |
177 for c in contigs: | |
178 string = "contig '%s': "%(c) | |
179 qlist = qtableAdaptator.getSetList_from_contig( c ) | |
180 qlist = pyRepet.coord.Set.set_list_merge( qlist ) | |
181 slist = stableAdaptator.getSetList_from_contig( c ) | |
182 slist = pyRepet.coord.Set.set_list_merge( slist ) | |
183 qsize = pyRepet.coord.Set.set_list_size( qlist ) | |
184 osize = pyRepet.coord.Set.set_list_overlap_size( qlist, slist ) | |
185 sum_osize += osize | |
186 sum_qsize += qsize | |
187 sum_non_osize += qsize - osize | |
188 string += "qsize=%d osize=%d" % ( qsize, osize ) | |
189 logging.debug( string ) | |
190 if verbose > 0: | |
191 print string; sys.stdout.flush() | |
192 | |
193 string = "summary:" | |
194 string += "\ncumulative coverage of the query table: %i nt" % ( sum_qsize ) | |
195 string += "\nsize of overlaps with the subject table: %i nt" % ( sum_osize ) | |
196 string += "\n proportion of query: %.3f" % ( float(sum_osize)/sum_qsize ) | |
197 string += "\nsize of non-overlaps with the subject table: %i nt" % ( sum_non_osize ) | |
198 string += "\n proportion of query: %.3f" % ( float(sum_non_osize)/sum_qsize ) | |
199 if verbose > 0: | |
200 print string; sys.stdout.flush() | |
201 logging.info( string ) | |
202 | |
203 return sum_osize, sum_non_osize, sum_qsize | |
204 | |
205 | |
206 def main (): | |
207 """ | |
208 This program computes the overlaps between two tables recording spatial coordinates. | |
209 """ | |
210 qtable = "" | |
211 stable = "" | |
212 type = "" | |
213 by_path = False | |
214 configFileName = "" | |
215 host = "" | |
216 user = "" | |
217 passwd = "" | |
218 db = "" | |
219 verbose = 0 | |
220 try: | |
221 opts, args = getopt.getopt( sys.argv[1:], "hq:s:t:pc:H:U:P:D:v:" ) | |
222 except getopt.GetoptError: | |
223 help() | |
224 sys.exit(1) | |
225 if len(args) != 0: | |
226 help() | |
227 sys.exit(1) | |
228 for o,a in opts: | |
229 if o == "-h": | |
230 help() | |
231 sys.exit(0) | |
232 elif o == "-q": | |
233 qtable = a | |
234 elif o == "-s": | |
235 stable = a | |
236 elif o == "-t": | |
237 type = a | |
238 elif o == "-p": | |
239 by_path = True | |
240 elif o == "-c": | |
241 configFileName = a | |
242 elif o == "-H": | |
243 host = a | |
244 elif o == "-U": | |
245 user = a | |
246 elif o == "-P": | |
247 passwd = a | |
248 elif o == "-D": | |
249 db = a | |
250 elif o == "-v": | |
251 verbose = int(a) | |
252 if qtable=="" or stable=="" or \ | |
253 (configFileName== "" and (host=="" or \ | |
254 user=="" or passwd=="" or db=="")): | |
255 print "ERROR: missing compulsory options" | |
256 help() | |
257 sys.exit(1) | |
258 if verbose > 0: | |
259 print "START %s" % (sys.argv[0].split("/")[-1]) | |
260 sys.stdout.flush() | |
261 | |
262 if configFileName != "": | |
263 config = ConfigParser.ConfigParser() | |
264 config.readfp( open(configFileName) ) | |
265 host = config.get("repet_env","repet_host") | |
266 user = config.get("repet_env","repet_user") | |
267 passwd = config.get("repet_env","repet_pw") | |
268 dbname = config.get("repet_env","repet_db") | |
269 | |
270 logfilename = qtable + "-" + stable + "-" + str(os.getpid()) + ".log" | |
271 handler = logging.FileHandler( logfilename ) | |
272 formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s") | |
273 handler.setFormatter( formatter ) | |
274 logging.getLogger('').addHandler(handler) | |
275 logging.getLogger('').setLevel(logging.DEBUG) | |
276 logging.info("started") | |
277 | |
278 db = pyRepet.sql.RepetDBMySQL.RepetDB( user, host, passwd, dbname ) | |
279 | |
280 qtype, stype = type.split("/") | |
281 | |
282 if not db.exist( qtable ): | |
283 if not os.path.exists( qtable ): | |
284 msg = "ERROR: neither table nor file '%s'" % ( qtable ) | |
285 sys.stderr.write( "%s\n" % msg ) | |
286 sys.exit(1) | |
287 tmp = qtable.replace(".","_") | |
288 db.create_table( db, tmp, qtable, qtype ) | |
289 qtable = tmp | |
290 if not db.exist( stable ): | |
291 if not os.path.exists( stable ): | |
292 msg = "ERROR: neither table nor file '%s'" % ( stable ) | |
293 sys.stderr.write( "%s\n" % msg ) | |
294 sys.exit(1) | |
295 tmp = stable.replace(".","_") | |
296 db.create_table( db, tmp, stable, qtype ) | |
297 stable = tmp | |
298 | |
299 string = "input tables:" | |
300 string += "\nquery table: %s ('%s' format)" % ( qtable, qtype ) | |
301 string += "\nsubject table: %s ('%s' format)" % ( stable, stype ) | |
302 logging.info( string ) | |
303 | |
304 if by_path: | |
305 fout = open(qtable+"_vs_"+stable+".dat","w") | |
306 pathOverlapByPath( qtable, qtype, stable, stype, db, fout, verbose ) | |
307 fout.close() | |
308 else: | |
309 getOverlapAllPaths( qtable, qtype, stable, stype, db, verbose ) | |
310 | |
311 logging.info("finished") | |
312 | |
313 if verbose > 0: | |
314 print "END %s" % (sys.argv[0].split("/")[-1]) | |
315 sys.stdout.flush() | |
316 | |
317 | |
318 if __name__ == "__main__": | |
319 main() |