18
|
1 #!/usr/bin/env python
|
|
2
|
|
3 ###@file
|
|
4 # Read a file recording matches in the 'tab' format (output from Matcher) and return the number of matches between queries and subjects being CC, CI, IC and II.
|
|
5 # A match is said to be CC (for complete-complete) when both query and subject match over x% of their entire respective length. By default, x=95.
|
|
6 #
|
|
7 # usage: tabFileReader.py [ options ]
|
|
8 # options:
|
|
9 # -h: this help
|
|
10 # -m: name of the file recording the matches (format='tab', output from Matcher)
|
|
11 # -q: name of the fasta file recording the queries
|
|
12 # -s: name of the fasta file recording the subjects
|
|
13 # -t: threshold over which the match is 'complete', in % of the seq length (default=95)
|
|
14 # -i: identity below which matches are ignored (default=0)
|
|
15 # -l: length below which matches are ignored (default=0)
|
|
16 # -o: overlap on query and subject below which matches are ignored (default=0)
|
|
17 # -v: verbose (default=0/1)
|
|
18
|
|
19 import sys
|
|
20 import getopt
|
|
21 from string import *
|
|
22
|
|
23 import pyRepet.seq.BioseqDB
|
|
24 import pyRepet.util.Stat
|
|
25
|
|
26 #TODO: remove case changes in headers (4 lower() method calls in this script)
|
|
27
|
|
28 #----------------------------------------------------------------------------
|
|
29
|
|
30 def help():
|
|
31 print
|
|
32 print "usage: %s [ options ]" % ( sys.argv[0].split("/")[-1] )
|
|
33 print "options:"
|
|
34 print " -h: this help"
|
|
35 print " -m: name of the file recording the matches (format='tab', output from Matcher)"
|
|
36 print " -q: name of the fasta file recording the queries"
|
|
37 print " -s: name of the fasta file recording the subjects"
|
|
38 print " -t: coverage threshold over which the match is 'complete' (in %% of the seq length, default=95)"
|
|
39 print " -i: identity below which matches are ignored (default=0)"
|
|
40 print " -l: length below which matches are ignored (default=0)"
|
|
41 print " -o: overlap on query and subject below which matches are ignored (default=0)"
|
|
42 print " -I: identity threshold for 'CC' matches (default=90)"
|
|
43 print " -E: E-value threshold for 'CC' matches (default=1e-10)"
|
|
44 print " -T: coverage threshold for match length on query compare to subject length (default=90)"
|
|
45 print " -v: verbose (default=0/1)"
|
|
46 print
|
|
47
|
|
48 #----------------------------------------------------------------------------
|
|
49
|
|
50 #here are the fields of a '.tab' file:
|
|
51 #[0]: query sequence name
|
|
52 #[1]: whole match start coordinate on the query sequence
|
|
53 #[2]: whole match end coordinate on the query sequence
|
|
54 #[3]: length on the query sequence
|
|
55 #[4]: length in percentage of the query sequence
|
|
56 #[5]: length on the query relative to the subject length in percentage
|
|
57 #[6]: subject sequence name
|
|
58 #[7]: whole match start coordinate on the subject sequence
|
|
59 #[8]: whole match end coordinate on the subject sequence
|
|
60 #[9]: length on the subject sequence
|
|
61 #[10]: length in percentage of the subject sequence
|
|
62 #[11]: BLAST E-value
|
|
63 #[12]: BLAST score
|
|
64 #[13]: identity percentage
|
|
65 #[14]: path
|
|
66
|
|
67 class tabFileReader( object ):
|
|
68
|
|
69 def __init__( self, line ):
|
|
70
|
|
71 columns = line.split("\t")
|
|
72
|
|
73 self.name_sbj = (columns[6])
|
|
74 self.length_sbj = int(round(int(columns[9])/float(columns[10]),0)) #length of the subject
|
|
75 self.prct_sbj = float(columns[10]) * 100 #prct_sbj = length of the match on the subject divided by the length of the subject * 100
|
|
76 if int(columns[7]) < int(columns[8]):
|
|
77 self.start_sbj = int(columns[7]) #start of the match on the subject
|
|
78 self.end_sbj = int(columns[8]) #end of the match on the subject
|
|
79 else:
|
|
80 self.start_sbj = int(columns[8])
|
|
81 self.end_sbj = int(columns[7])
|
|
82 self.sbj_dist_ends = int(columns[9]) #length on the subject that matches with the query
|
|
83
|
|
84 self.name_qry = columns[0]
|
|
85 self.length_qry = int(round(int(columns[3])/float(columns[4]),0)) #length of the query
|
|
86 self.prct_qry = float(columns[4]) * 100 #prct_qry = length of the match on the query divided by the length of the query * 100
|
|
87 if int(columns[1]) < int(columns[2]):
|
|
88 self.start_qry = int(columns[1]) #start of the match on the query
|
|
89 self.end_qry = int(columns[2]) #end of the match on the query
|
|
90 else:
|
|
91 self.start_qry = int(columns[2])
|
|
92 self.end_qry = int(columns[1])
|
|
93 self.qry_dist_ends = int(columns[3]) #length on the query that matches with the subject
|
|
94
|
|
95 self.length_match = int(columns[3])
|
|
96 self.prct_matchQryOverSbj = float(columns[5]) * 100 #length on the query relative to the subject length in percentage
|
|
97 self.identity = float(columns[13])
|
|
98 self.score = int(columns[12])
|
|
99 self.evalue = float(columns[11])
|
|
100
|
|
101 self.sbj2qry = [self.length_sbj,self.prct_sbj,self.start_sbj,self.end_sbj,self.name_qry,self.length_sbj,self.prct_qry,self.start_qry,self.end_qry,self.identity,self.score]
|
|
102
|
|
103 self.qry2sbj = [self.length_qry,self.prct_qry,self.start_qry,self.end_qry,self.name_sbj,self.length_sbj,self.prct_sbj,self.start_sbj,self.end_sbj,self.identity,self.score]
|
|
104
|
|
105 #----------------------------------------------------------------------------
|
|
106
|
|
107 def make_dico( lMatches ):
|
|
108 """
|
|
109 Record the matches in two dictionaries which keys are the queries or the subjects.
|
|
110 """
|
|
111
|
|
112 Sbj2Qry = {}
|
|
113 Qry2Sbj = {}
|
|
114
|
|
115 for match in lMatches:
|
|
116 if Sbj2Qry.has_key( match.name_sbj ):
|
|
117 Sbj2Qry[match.name_sbj].append( match )
|
|
118 else:
|
|
119 Sbj2Qry[match.name_sbj] = [ match ]
|
|
120 if Qry2Sbj.has_key( match.name_qry ):
|
|
121 Qry2Sbj[match.name_qry].append( match )
|
|
122 else:
|
|
123 Qry2Sbj[match.name_qry] = [ match ]
|
|
124
|
|
125 return [ Sbj2Qry, Qry2Sbj ]
|
|
126
|
|
127 #----------------------------------------------------------------------------
|
|
128
|
|
129 def find_UniqRedun( list_matchs ):
|
|
130
|
|
131 list_total_sbj = [];list_total_qry = []
|
|
132 list_uniq_sbj = [];list_redun_sbj = []
|
|
133 list_uniq_qry = [];list_redun_qry = []
|
|
134
|
|
135 for match in list_matchs:
|
|
136 list_total_sbj.append(match.name_sbj)
|
|
137 list_total_qry.append(match.name_qry)
|
|
138
|
|
139 for name_sbj in list_total_sbj:
|
|
140 if list_total_sbj.count(name_sbj) == 1:
|
|
141 list_uniq_sbj.append(name_sbj)
|
|
142 else:
|
|
143 if name_sbj not in list_redun_sbj:
|
|
144 list_redun_sbj.append(name_sbj)
|
|
145
|
|
146 for name_qry in list_total_qry:
|
|
147 if list_total_qry.count(name_qry) == 1:
|
|
148 list_uniq_qry.append(name_qry)
|
|
149 else:
|
|
150 if name_qry not in list_redun_qry:
|
|
151 list_redun_qry.append(name_qry)
|
|
152
|
|
153 return [ list_uniq_sbj, list_redun_sbj, list_uniq_qry, list_redun_qry ]
|
|
154
|
|
155 #----------------------------------------------------------------------------
|
|
156
|
|
157 def remove( all, sup_sbjqry, sup_sbj, sup_qry, inf_sbjqry ):
|
|
158
|
|
159 for name_sbj in all.keys():
|
|
160
|
|
161 if sup_sbjqry.has_key( name_sbj ) and sup_sbj.has_key( name_sbj ):
|
|
162 del sup_sbj[ name_sbj ]
|
|
163
|
|
164 if sup_sbjqry.has_key( name_sbj ) and sup_qry.has_key( name_sbj ):
|
|
165 del sup_qry[ name_sbj ]
|
|
166
|
|
167 if sup_sbjqry.has_key( name_sbj ) and inf_sbjqry.has_key( name_sbj ):
|
|
168 del inf_sbjqry[ name_sbj ]
|
|
169
|
|
170 if sup_sbj.has_key( name_sbj ) and sup_qry.has_key( name_sbj ):
|
|
171 del sup_qry[ name_sbj ]
|
|
172
|
|
173 if sup_sbj.has_key( name_sbj ) and inf_sbjqry.has_key( name_sbj ):
|
|
174 del inf_sbjqry[ name_sbj ]
|
|
175
|
|
176 if sup_qry.has_key( name_sbj ) and inf_sbjqry.has_key( name_sbj ):
|
|
177 del inf_sbjqry[ name_sbj ]
|
|
178
|
|
179 return [ sup_sbj, sup_qry, inf_sbjqry ]
|
|
180
|
|
181 #----------------------------------------------------------------------------
|
|
182
|
|
183 def write_output( outFile, match_type, Sbj2Qry, dSbj2Cat, Qry2Sbj, dQry2Cat ):
|
|
184 """
|
|
185 Save the results (subjects in each category and its matches) in a human-readable way.
|
|
186 """
|
|
187
|
|
188 if match_type == 'CC':
|
|
189 msg = "Matches with L >= %i%% for subject and query (CC)" % ( thresholdCoverage )
|
|
190 elif match_type == 'CI':
|
|
191 msg = "Matches with L >= %i%% for subject and L < %i%% for query (CI)" % ( thresholdCoverage, thresholdCoverage )
|
|
192 elif match_type == 'IC':
|
|
193 msg = "Matches with L < %i%% for subject and L >= %i%% for query (IC)" % ( thresholdCoverage, thresholdCoverage )
|
|
194 elif match_type == 'II':
|
|
195 msg ="Matches with L < %i%% for subject and query (II)" % ( thresholdCoverage )
|
|
196 if verbose > 1:
|
|
197 print "%s: %i subjects" % ( msg, len(Sbj2Qry.keys()) )
|
|
198 outFile.write("\n%s\n" % ( msg ) )
|
|
199
|
|
200 for name_sbj in Sbj2Qry.keys():
|
|
201 matchs = Sbj2Qry[name_sbj]
|
|
202 if len(matchs) == 1:
|
|
203 outFile.write("-> subject %s (%s: %s,%s) matches with query %s (%s: %s,%s): prct_sbj %.3f & prct_qry %.3f (id=%.3f,Eval=%g)\n" % (name_sbj,matchs[0].length_sbj,matchs[0].start_sbj,matchs[0].end_sbj,matchs[0].name_qry,matchs[0].length_qry,matchs[0].start_qry,matchs[0].end_qry,matchs[0].prct_sbj,matchs[0].prct_qry,matchs[0].identity,matchs[0].evalue))
|
|
204 else:
|
|
205 outFile.write("-> subject %s (%s: %s,%s) matches with %s queries:\n" % (name_sbj,matchs[0].length_sbj,matchs[0].start_sbj,matchs[0].end_sbj,len(matchs)))
|
|
206 for match in matchs:
|
|
207 outFile.write("%s versus %s (%s: %s,%s): prct_sbj %.3f & prct_qry %.3f (id=%.3f,Eval=%g)\n"%(name_sbj,match.name_qry,match.length_qry,match.start_qry,match.end_qry,match.prct_sbj,match.prct_qry,match.identity,match.evalue))
|
|
208
|
|
209 tmpList = []
|
|
210 for name_sbj in Sbj2Qry.keys():
|
|
211 tmpList.append( name_sbj.split(" ")[0].lower() )
|
|
212 tmpList.sort()
|
|
213 for name_sbj in tmpList:
|
|
214 outFile.write( name_sbj+"\n" )
|
|
215 dSbj2Cat[ name_sbj ] = match_type
|
|
216
|
|
217 tmpList = []
|
|
218 for name_qry in Qry2Sbj.keys():
|
|
219 tmpList.append( name_qry.split(" ")[0].lower() )
|
|
220 tmpList.sort()
|
|
221 for name_qry in tmpList:
|
|
222 outFile.write( name_qry+"\n" )
|
|
223 dQry2Cat[ name_qry ] = match_type
|
|
224
|
|
225 #----------------------------------------------------------------------------
|
|
226
|
|
227 def writeSubjectCategory( dSbj2Cat ):
|
|
228 """
|
|
229 Save the category (CC/CI/IC/II/NA) in which each subject has been found.
|
|
230
|
|
231 @param dSbj2Cat: dictionary which keys are subject names and values the category of that subject
|
|
232 @type dSbj2Cat: dictionary
|
|
233 """
|
|
234
|
|
235 # sort the subject names in alphabetical order
|
|
236 lSbjSorted = dSbj2Cat.keys()
|
|
237 lSbjSorted.sort()
|
|
238
|
|
239 catFile = open( tabFileName + "_sbjCategories.txt", "w" )
|
|
240 for sbj in lSbjSorted:
|
|
241 string = "%s\t%s\n" % ( sbj, dSbj2Cat[ sbj ] )
|
|
242 catFile.write( string )
|
|
243 catFile.close()
|
|
244
|
|
245 #----------------------------------------------------------------------------
|
|
246
|
|
247 def writeQueryCategory( dQry2Cat ):
|
|
248 """
|
|
249 Save the category (CC/CI/IC/II/NA) in which each query has been found.
|
|
250
|
|
251 @param dQry2Cat: dictionary which keys are query names and values the category of that query
|
|
252 @type dQry2Cat: dictionary
|
|
253 """
|
|
254
|
|
255 # sort the query names in alphabetical order
|
|
256 lQrySorted = dQry2Cat.keys()
|
|
257 lQrySorted.sort()
|
|
258
|
|
259 catFile = open( tabFileName + "_qryCategories.txt", "w" )
|
|
260 for qry in lQrySorted:
|
|
261 string = "%s\t%s\n" % ( qry, dQry2Cat[ qry ] )
|
|
262 catFile.write( string )
|
|
263 catFile.close()
|
|
264
|
|
265 #----------------------------------------------------------------------------
|
|
266
|
|
267 def main():
|
|
268
|
|
269 global tabFileName
|
|
270 tabFileName = ""
|
|
271 qryFileName = ""
|
|
272 sbjFileName = ""
|
|
273 global thresholdCoverage
|
|
274 thresholdCoverage = 95
|
|
275 minIdentity = 0
|
|
276 minLength = 0
|
|
277 minOverlap = 0
|
|
278 global thresholdIdentity
|
|
279 thresholdIdentity = 90
|
|
280 global thresholdEvalue
|
|
281 thresholdEvalue = 1e-10
|
|
282 global thresholdCoverageMatch
|
|
283 thresholdCoverageMatch = 90
|
|
284 global verbose
|
|
285 verbose = 0
|
|
286
|
|
287 try:
|
|
288 opts, args = getopt.getopt(sys.argv[1:],"hm:q:s:t:i:l:I:E:T:o:v:")
|
|
289 except getopt.GetoptError, err:
|
|
290 print str(err); help(); sys.exit(1)
|
|
291 for o,a in opts:
|
|
292 if o == "-h":
|
|
293 help()
|
|
294 sys.exit(0)
|
|
295 elif o == "-m":
|
|
296 tabFileName = a
|
|
297 elif o == "-q":
|
|
298 qryFileName = a
|
|
299 elif o == "-s":
|
|
300 sbjFileName = a
|
|
301 elif o == "-t":
|
|
302 thresholdCoverage = int(a)
|
|
303 elif o == "-i":
|
|
304 minIdentity = float(a)
|
|
305 elif o == "-l":
|
|
306 minLength = int(a)
|
|
307 elif o == "-o":
|
|
308 minOverlap = float(a)
|
|
309 elif o == "-I":
|
|
310 thresholdIdentity = int(a)
|
|
311 elif o == "-E":
|
|
312 thresholdEvalue = float(a)
|
|
313 elif o == "-T":
|
|
314 thresholdCoverageMatch = int(a)
|
|
315 elif o == "-v":
|
|
316 verbose = int(a)
|
|
317
|
|
318 if tabFileName == "":
|
|
319 msg = "ERROR: missing 'tab' file (-m)"
|
|
320 sys.stderr.write( "%s\n" % msg )
|
|
321 help()
|
|
322 sys.exit(1)
|
|
323 if qryFileName == "" or sbjFileName == "":
|
|
324 msg = "ERROR: missing 'fasta' files (-q or -s)"
|
|
325 sys.stderr.write( "%s\n" % msg )
|
|
326 help()
|
|
327 sys.exit(1)
|
|
328
|
|
329 if verbose > 0:
|
|
330 print "START %s" % (sys.argv[0].split("/")[-1])
|
|
331 sys.stdout.flush()
|
|
332
|
|
333 # 4 categories of matchs:
|
|
334 # type 1 (CC): the length of the match on the subject is >= 95% of the total length of the subject, idem for the query
|
|
335 # type 2 (CI): sbj >= 95% & qry < 95%
|
|
336 # type 3 (IC): sbj < 95% & qry >= 95%
|
|
337 # type 4 (II): sbj & qry < 95%
|
|
338 ListMatches_all = []
|
|
339 ListMatches_sup_sbjqry = []
|
|
340 ListMatches_sup_sbj = []
|
|
341 ListMatches_sup_qry = []
|
|
342 ListMatches_inf_sbjqry = []
|
|
343
|
|
344 qryDB = pyRepet.seq.BioseqDB.BioseqDB( qryFileName )
|
|
345 nbQry = qryDB.getSize()
|
|
346 if verbose > 0:
|
|
347 print "nb of queries in '%s': %i" % ( qryFileName, nbQry )
|
|
348 dQry2Cat = {}
|
|
349 for bs in qryDB.db:
|
|
350 dQry2Cat[ bs.header.split(" ")[0].lower() ] = "NA"
|
|
351
|
|
352 sbjDB = pyRepet.seq.BioseqDB.BioseqDB( sbjFileName )
|
|
353 nbSbj = sbjDB.getSize()
|
|
354 if verbose > 0:
|
|
355 print "nb of subjects in '%s': %i" % ( sbjFileName, nbSbj )
|
|
356 dSbj2Cat = {}
|
|
357 for bs in sbjDB.db:
|
|
358 dSbj2Cat[ bs.header.split(" ")[0].lower() ] = "NA"
|
|
359
|
|
360 tabFile = open( tabFileName )
|
|
361 nbMatchesInTab = 0
|
|
362 dSubject2DistinctQueries = {}
|
|
363 dQuery2DistinctSubjects = {}
|
|
364
|
|
365 # For each match, create a 'tabFileReader' object and record it in a list according to the type of the match
|
|
366 if verbose > 0:
|
|
367 print "parse the 'tab' file..."; sys.stdout.flush()
|
|
368 while True:
|
|
369 line = tabFile.readline()
|
|
370 if line == "":
|
|
371 break
|
|
372 if line[0:10] == "query.name":
|
|
373 continue
|
|
374 nbMatchesInTab += 1
|
|
375
|
|
376 match = tabFileReader( line )
|
|
377 if match.identity < minIdentity:
|
|
378 line = tabFile.readline()
|
|
379 continue
|
|
380 if match.length_match < minLength:
|
|
381 line = tabFile.readline()
|
|
382 continue
|
|
383 if match.prct_qry < minOverlap or match.prct_sbj < minOverlap:
|
|
384 line = tabFile.readline()
|
|
385 continue
|
|
386 ListMatches_all.append( match )
|
|
387
|
|
388 # type 1: sbj C & qry C
|
|
389 if match.prct_sbj >= thresholdCoverage and match.prct_qry >= thresholdCoverage:
|
|
390 qsLengthRatio = 100 * match.length_qry / float(match.length_sbj)
|
|
391 if match.identity >= thresholdIdentity \
|
|
392 and match.evalue <= thresholdEvalue \
|
|
393 and qsLengthRatio >= thresholdCoverage - 2 \
|
|
394 and qsLengthRatio <= 100 + (100-thresholdCoverage) + 2 \
|
|
395 and match.prct_matchQryOverSbj >= thresholdCoverageMatch:
|
|
396 ListMatches_sup_sbjqry.append( match )
|
|
397 else:
|
|
398 ListMatches_inf_sbjqry.append( match )
|
|
399
|
|
400 # type 2: sbj C & qry I
|
|
401 elif match.prct_sbj >= thresholdCoverage and match.prct_qry < thresholdCoverage:
|
|
402 ListMatches_sup_sbj.append( match )
|
|
403
|
|
404 # type 3: sbj I & qry C
|
|
405 elif match.prct_qry >= thresholdCoverage and match.prct_sbj < thresholdCoverage:
|
|
406 ListMatches_sup_qry.append( match )
|
|
407
|
|
408 # type 4: sbj I & qry I
|
|
409 elif match.prct_qry < thresholdCoverage and match.prct_sbj < thresholdCoverage:
|
|
410 ListMatches_inf_sbjqry.append( match )
|
|
411
|
|
412 if not dSubject2DistinctQueries.has_key( match.name_sbj ):
|
|
413 dSubject2DistinctQueries[ match.name_sbj ] = []
|
|
414 if not match.name_qry in dSubject2DistinctQueries[ match.name_sbj ]:
|
|
415 dSubject2DistinctQueries[ match.name_sbj ].append( match.name_qry )
|
|
416 if not dQuery2DistinctSubjects.has_key( match.name_qry ):
|
|
417 dQuery2DistinctSubjects[ match.name_qry ] = []
|
|
418 if not match.name_sbj in dQuery2DistinctSubjects[ match.name_qry ]:
|
|
419 dQuery2DistinctSubjects[ match.name_qry ].append( match.name_sbj )
|
|
420
|
|
421 if verbose > 0:
|
|
422 print "parsing done !"; sys.stdout.flush()
|
|
423 print "nb matches in '%s': %i" % ( tabFileName, nbMatchesInTab )
|
|
424 print "nb matches 'CC': %i" % ( len(ListMatches_sup_sbjqry) )
|
|
425 if verbose > 1:
|
|
426 for match in ListMatches_sup_sbjqry:
|
|
427 print "\t%s (%.2f%%) - %s (%.2f%%) id=%.2f" % ( match.name_sbj, match.prct_sbj, match.name_qry, match.prct_qry, match.identity )
|
|
428 print "nb matches 'CI': %i" % ( len(ListMatches_sup_sbj) )
|
|
429 if verbose > 1:
|
|
430 for match in ListMatches_sup_sbj:
|
|
431 print "\t%s (%.2f%%) - %s (%.2f%%) id=%.2f" % ( match.name_sbj, match.prct_sbj, match.name_qry, match.prct_qry, match.identity )
|
|
432 print "nb matches 'IC': %i" % ( len(ListMatches_sup_qry) )
|
|
433 print "nb matches 'II': %i" % ( len(ListMatches_inf_sbjqry) )
|
|
434
|
|
435 if nbMatchesInTab == 0:
|
|
436 print "nothing to do"
|
|
437 sys.exit(0)
|
|
438
|
|
439 # For each type of matchs, record them in 2 dictionaries: Sbj2Qry and Qry2Sbj
|
|
440 D_all = make_dico( ListMatches_all )
|
|
441 Sbj2Qry_all = D_all[0]
|
|
442 Qry2Sbj_all = D_all[1]
|
|
443
|
|
444 D_sup_sbjqry = make_dico(ListMatches_sup_sbjqry)
|
|
445 Sbj2Qry_sup_sbjqry = D_sup_sbjqry[0]
|
|
446 Qry2Sbj_sup_sbjqry = D_sup_sbjqry[1]
|
|
447
|
|
448 D_sup_sbj = make_dico(ListMatches_sup_sbj)
|
|
449 Sbj2Qry_sup_sbj = D_sup_sbj[0]
|
|
450 Qry2Sbj_sup_sbj = D_sup_sbj[1]
|
|
451
|
|
452 D_sup_qry = make_dico(ListMatches_sup_qry)
|
|
453 Sbj2Qry_sup_qry = D_sup_qry[0]
|
|
454 Qry2Sbj_sup_qry = D_sup_qry[1]
|
|
455
|
|
456 D_inf_sbjqry = make_dico(ListMatches_inf_sbjqry)
|
|
457 Sbj2Qry_inf_sbjqry = D_inf_sbjqry[0]
|
|
458 Qry2Sbj_inf_sbjqry = D_inf_sbjqry[1]
|
|
459
|
|
460
|
|
461 # For each type of matches, find the subjects/queries that are involve in one or several match
|
|
462 list_all = find_UniqRedun(ListMatches_all)
|
|
463 UniqSbj_all = list_all[0]
|
|
464 RedunSbj_all = list_all[1]
|
|
465 UniqQry_all = list_all[2]
|
|
466 RedunQry_all = list_all[3]
|
|
467
|
|
468 list1 = find_UniqRedun(ListMatches_sup_sbjqry)
|
|
469 UniqSbj_sup_sbjqry = list1[0]
|
|
470 RedunSbj_sup_sbjqry = list1[1]
|
|
471 UniqQry_sup_sbjqry = list1[2]
|
|
472 RedunQry_sup_sbjqry = list1[3]
|
|
473
|
|
474 list2 = find_UniqRedun(ListMatches_sup_sbj)
|
|
475 UniqSbj_sup_sbj = list2[0]
|
|
476 RedunSbj_sup_sbj = list2[1]
|
|
477 UniqQry_sup_sbj = list2[2]
|
|
478 RedunQry_sup_sbj = list2[3]
|
|
479
|
|
480 list3 = find_UniqRedun(ListMatches_sup_qry)
|
|
481 UniqSbj_sup_qry = list3[0]
|
|
482 RedunSbj_sup_qry = list3[1]
|
|
483 UniqQry_sup_qry = list3[2]
|
|
484 RedunQry_sup_qry = list3[3]
|
|
485
|
|
486 list4 = find_UniqRedun(ListMatches_inf_sbjqry)
|
|
487 UniqSbj_inf_sbjqry = list4[0]
|
|
488 RedunSbj_inf_sbjqry = list4[1]
|
|
489 UniqQry_inf_sbjqry = list4[2]
|
|
490 RedunQry_inf_sbjqry = list4[3]
|
|
491
|
|
492 iStatSbj = pyRepet.util.Stat.Stat()
|
|
493 for subject in dSubject2DistinctQueries.keys():
|
|
494 iStatSbj.add( len( dSubject2DistinctQueries[ subject ] ) )
|
|
495 iStatQry = pyRepet.util.Stat.Stat()
|
|
496 for query in dQuery2DistinctSubjects.keys():
|
|
497 iStatQry.add( len( dQuery2DistinctSubjects[ query ] ) )
|
|
498
|
|
499
|
|
500 # Write the review of the '.tab' file
|
|
501 outFile = open( tabFileName + "_tabFileReader.txt", "w" )
|
|
502 outFile.write( "Input: %s\n" % ( tabFileName ) )
|
|
503
|
|
504 outFile.write( "\n# Number of subjects in '%s': %i\n" % ( sbjFileName, nbSbj ) )
|
|
505 outFile.write( "# Number of queries in '%s': %i\n" % ( qryFileName, nbQry ) )
|
|
506
|
|
507 outFile.write( "\nNumber of matches: %s\n" % (len(ListMatches_all)))
|
|
508 outFile.write( " # Number of different subjects that match: %s (Sn*=%.2f%%)\n" % ( len(Sbj2Qry_all.keys()), 100 * len(Sbj2Qry_all.keys()) / float(nbSbj) ) )
|
|
509 outFile.write( " Among them, number of different subjects having exactly one match: %s (%.2f%%)\n" % ( len(UniqSbj_all), 100 * len(UniqSbj_all) / float(len(Sbj2Qry_all.keys())) ) )
|
|
510 outFile.write( " Among them, number of different subjects having more than one match: %s\n" % (len(RedunSbj_all)))
|
|
511 outFile.write( " Different queries per subject: mean=%.2f sd=%.2f min=%.2f q25=%.2f med=%.2f q75=%.2f max=%.2f\n" % ( iStatSbj.mean(), iStatSbj.sd(), iStatSbj.min, iStatSbj.quantile(0.25), iStatSbj.median(), iStatSbj.quantile(0.75), iStatSbj.max ) )
|
|
512 outFile.write( " # Number of different queries that match: %s (Sp*=%.2f%%)\n" % ( len(Qry2Sbj_all.keys()), 100 * len(Qry2Sbj_all.keys()) / float(nbQry) ) )
|
|
513 outFile.write( " Among them, number of different queries having exactly one match: %s (%.2f%%)\n" % ( len(UniqQry_all), 100 * len(UniqQry_all) / float(len(Qry2Sbj_all.keys())) ) )
|
|
514 outFile.write( " Among them, number of different queries having more than one match: %s\n" % (len(RedunQry_all)) )
|
|
515 outFile.write( " Different subjects per query: mean=%.2f sd=%.2f min=%.2f q25=%.2f med=%.2f q75=%.2f max=%.2f\n" % ( iStatQry.mean(), iStatQry.sd(), iStatQry.min, iStatQry.quantile(0.25), iStatQry.median(), iStatQry.quantile(0.75), iStatQry.max ) )
|
|
516
|
|
517 outFile.write( "\nNumber of matches with L >= %i%% for subject & query: %i\n" % ( thresholdCoverage, len(ListMatches_sup_sbjqry) ) )
|
|
518 outFile.write( " # Number of different subjects in the 'CC' case: %s (%.2f%%)\n" % ( len(Sbj2Qry_sup_sbjqry), 100 * len(Sbj2Qry_sup_sbjqry) / float(nbSbj) ) )
|
|
519 outFile.write( " Among them, number of different subjects having exactly one match: %s\n" % (len(UniqSbj_sup_sbjqry)))
|
|
520 outFile.write( " Among them, number of different subjects having more than one match: %s\n" % (len(RedunSbj_sup_sbjqry)))
|
|
521 outFile.write( " # Number of different queries in the 'CC' case: %s (%.2f%%)\n" % ( len(Qry2Sbj_sup_sbjqry), 100 * len(Qry2Sbj_sup_sbjqry) / float(nbQry) ) )
|
|
522 outFile.write( " Among them, number of different queries having exactly one match: %s\n" % (len(UniqQry_sup_sbjqry)))
|
|
523 outFile.write( " Among them, number of different queries having more than one match: %s\n" % (len(RedunQry_sup_sbjqry)))
|
|
524
|
|
525 outFile.write( "\nNumber of matches with L >= %i%% for subject and L < %i%% for query: %i\n" % ( thresholdCoverage, thresholdCoverage, len(ListMatches_sup_sbj) ) )
|
|
526 outFile.write( " Number of different subjects in that case: %s\n" % (len(Sbj2Qry_sup_sbj)))
|
|
527 outFile.write( " Among them, number of different subjects having exactly one match: %s\n" % (len(UniqSbj_sup_sbj)))
|
|
528 outFile.write( " Among them, number of different subjects having more than one match: %s\n" % (len(RedunSbj_sup_sbj)))
|
|
529 outFile.write( " Number of different queries in that case: %s\n" % (len(Qry2Sbj_sup_sbj)))
|
|
530 outFile.write( " Among them, number of different queries having exactly one match: %s\n" % (len(UniqQry_sup_sbj)))
|
|
531 outFile.write( " Among them, number of different queries having more than one match: %s\n" % (len(RedunQry_sup_sbj)))
|
|
532
|
|
533 outFile.write( "\nNumber of matches with L < %i%% for subject and L >= %i%% for query: %i\n" % ( thresholdCoverage, thresholdCoverage, len(ListMatches_sup_qry) ) )
|
|
534 outFile.write( " Number of different subjects in that case: %s\n" % (len(Sbj2Qry_sup_qry)))
|
|
535 outFile.write( " Among them, number of different subjects having exactly one match: %s\n" % (len(UniqSbj_sup_qry)))
|
|
536 outFile.write( " Among them, number of different subjects having more than one match: %s\n" % (len(RedunSbj_sup_qry)))
|
|
537 outFile.write( " Number of different queries in that case: %s\n" % (len(Qry2Sbj_sup_qry)))
|
|
538 outFile.write( " Among them, number of different queries having exactly one match: %s\n" % (len(UniqQry_sup_qry)))
|
|
539 outFile.write( " Among them, number of different queries having more than one match: %s\n" % (len(RedunQry_sup_qry)))
|
|
540
|
|
541 outFile.write( "\nNumber of matches with L < %i%% for subject & query: %i\n" % ( thresholdCoverage, len(ListMatches_inf_sbjqry) ) )
|
|
542 outFile.write( " Number of different subjects in that case: %s\n" % (len(Sbj2Qry_inf_sbjqry)))
|
|
543 outFile.write( " Among them, number of different subjects having exactly one match: %s\n" % (len(UniqSbj_inf_sbjqry)))
|
|
544 outFile.write( " Among them, number of different subjects having more than one match: %s\n" % (len(RedunSbj_inf_sbjqry)))
|
|
545 outFile.write( " Number of different queries in that case: %s\n" % (len(Qry2Sbj_inf_sbjqry)))
|
|
546 outFile.write( " Among them, number of different queries having exactly one match: %s\n" % (len(UniqQry_inf_sbjqry)))
|
|
547 outFile.write( " Among them, number of different queries having more than one match: %s\n" % (len(RedunQry_inf_sbjqry)))
|
|
548
|
|
549
|
|
550 # For the elements already counted in the matches with L >= 95% for subject & query, remove them from the other dictionnaries
|
|
551 rmv_Sbj2Qry = remove( Sbj2Qry_all, Sbj2Qry_sup_sbjqry, Sbj2Qry_sup_sbj, Sbj2Qry_sup_qry, Sbj2Qry_inf_sbjqry )
|
|
552 rmv_Qry2Sbj = remove( Qry2Sbj_all, Qry2Sbj_sup_sbjqry, Qry2Sbj_sup_sbj, Qry2Sbj_sup_qry, Qry2Sbj_inf_sbjqry )
|
|
553
|
|
554 outFile.write("\n\nAfter removal of the subjects/queries already counted in the matches with L >= %i%% for them:\n" % ( thresholdCoverage ) )
|
|
555
|
|
556 outFile.write( "\nMatches with L >= %i%% for subject and L < %i%% for query:\n" % ( thresholdCoverage, thresholdCoverage ) )
|
|
557 outFile.write( " # Number of different subjects in the 'CI' case: %s (%.2f%%)\n" % ( len(rmv_Sbj2Qry[0]), 100*len(rmv_Sbj2Qry[0])/float(nbSbj) ) )
|
|
558 outFile.write( " # Number of different queries in the 'CI' case: %s (%.2f%%)\n" % ( len(rmv_Qry2Sbj[0]), 100*len(rmv_Qry2Sbj[0])/float(nbQry) ) )
|
|
559
|
|
560 outFile.write( "\nMatches with L < %i%% for subject and L >= %i%% for query:\n" % ( thresholdCoverage, thresholdCoverage ) )
|
|
561 outFile.write( " # Number of different subjects in the 'IC' case: %s (%.2f%%)\n" % (len(rmv_Sbj2Qry[1]), 100*len(rmv_Sbj2Qry[1])/float(nbSbj) ) )
|
|
562 outFile.write( " # Number of different queries in the 'IC' case: %s (%.2f%%)\n" % (len(rmv_Qry2Sbj[1]), 100*len(rmv_Qry2Sbj[1])/float(nbQry) ) )
|
|
563
|
|
564 outFile.write( "\nMatches with L < %i%% for subject & query:\n" % ( thresholdCoverage ) )
|
|
565 outFile.write( " # Number of different subjects in the 'II' case: %s (%.2f%%)\n" % (len(rmv_Sbj2Qry[2]), 100*len(rmv_Sbj2Qry[2])/float(nbSbj) ) )
|
|
566 outFile.write( " # Number of different queries in the 'II' case: %s (%.2f%%)\n" % (len(rmv_Qry2Sbj[2]), 100*len(rmv_Qry2Sbj[2])/float(nbQry) ) )
|
|
567
|
|
568 outFile.write("\n==========================================================================\n")
|
|
569
|
|
570 write_output( outFile, 'CC', Sbj2Qry_sup_sbjqry, dSbj2Cat, Qry2Sbj_sup_sbjqry, dQry2Cat )
|
|
571
|
|
572 outFile.write("\n==========================================================================\n")
|
|
573
|
|
574 write_output( outFile, 'CI', rmv_Sbj2Qry[0], dSbj2Cat, rmv_Qry2Sbj[0], dQry2Cat )
|
|
575
|
|
576 outFile.write("\n==========================================================================\n")
|
|
577
|
|
578 write_output( outFile, 'IC', rmv_Sbj2Qry[1], dSbj2Cat, rmv_Qry2Sbj[1], dQry2Cat )
|
|
579
|
|
580 outFile.write("\n==========================================================================\n")
|
|
581
|
|
582 write_output( outFile, 'II', rmv_Sbj2Qry[2], dSbj2Cat, rmv_Qry2Sbj[2], dQry2Cat )
|
|
583
|
|
584 outFile.write("\n==========================================================================\n")
|
|
585
|
|
586 outFile.close()
|
|
587
|
|
588 writeSubjectCategory( dSbj2Cat )
|
|
589 writeQueryCategory( dQry2Cat )
|
|
590
|
|
591 if verbose > 0:
|
|
592 print "END %s" % (sys.argv[0].split("/")[-1])
|
|
593 sys.stdout.flush()
|
|
594
|
|
595 return 0
|
|
596
|
|
597 #-----------------------------------------------------------------------------------------------------
|
|
598
|
|
599 if __name__ == "__main__":
|
|
600 main()
|