Mercurial > repos > yufei-luo > s_mart
comparison commons/tools/tabFileReader.py @ 18:94ab73e8a190
Uploaded
| author | m-zytnicki |
|---|---|
| date | Mon, 29 Apr 2013 03:20:15 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 17:b0e8584489e6 | 18:94ab73e8a190 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 ###@file | |
| 4 # Read a file recording matches in the 'tab' format (output from Matcher) and return the number of matches between queries and subjects being CC, CI, IC and II. | |
| 5 # A match is said to be CC (for complete-complete) when both query and subject match over x% of their entire respective length. By default, x=95. | |
| 6 # | |
| 7 # usage: tabFileReader.py [ options ] | |
| 8 # options: | |
| 9 # -h: this help | |
| 10 # -m: name of the file recording the matches (format='tab', output from Matcher) | |
| 11 # -q: name of the fasta file recording the queries | |
| 12 # -s: name of the fasta file recording the subjects | |
| 13 # -t: threshold over which the match is 'complete', in % of the seq length (default=95) | |
| 14 # -i: identity below which matches are ignored (default=0) | |
| 15 # -l: length below which matches are ignored (default=0) | |
| 16 # -o: overlap on query and subject below which matches are ignored (default=0) | |
| 17 # -v: verbose (default=0/1) | |
| 18 | |
| 19 import sys | |
| 20 import getopt | |
| 21 from string import * | |
| 22 | |
| 23 import pyRepet.seq.BioseqDB | |
| 24 import pyRepet.util.Stat | |
| 25 | |
| 26 #TODO: remove case changes in headers (4 lower() method calls in this script) | |
| 27 | |
| 28 #---------------------------------------------------------------------------- | |
| 29 | |
| 30 def help(): | |
| 31 print | |
| 32 print "usage: %s [ options ]" % ( sys.argv[0].split("/")[-1] ) | |
| 33 print "options:" | |
| 34 print " -h: this help" | |
| 35 print " -m: name of the file recording the matches (format='tab', output from Matcher)" | |
| 36 print " -q: name of the fasta file recording the queries" | |
| 37 print " -s: name of the fasta file recording the subjects" | |
| 38 print " -t: coverage threshold over which the match is 'complete' (in %% of the seq length, default=95)" | |
| 39 print " -i: identity below which matches are ignored (default=0)" | |
| 40 print " -l: length below which matches are ignored (default=0)" | |
| 41 print " -o: overlap on query and subject below which matches are ignored (default=0)" | |
| 42 print " -I: identity threshold for 'CC' matches (default=90)" | |
| 43 print " -E: E-value threshold for 'CC' matches (default=1e-10)" | |
| 44 print " -T: coverage threshold for match length on query compare to subject length (default=90)" | |
| 45 print " -v: verbose (default=0/1)" | |
| 46 print | |
| 47 | |
| 48 #---------------------------------------------------------------------------- | |
| 49 | |
| 50 #here are the fields of a '.tab' file: | |
| 51 #[0]: query sequence name | |
| 52 #[1]: whole match start coordinate on the query sequence | |
| 53 #[2]: whole match end coordinate on the query sequence | |
| 54 #[3]: length on the query sequence | |
| 55 #[4]: length in percentage of the query sequence | |
| 56 #[5]: length on the query relative to the subject length in percentage | |
| 57 #[6]: subject sequence name | |
| 58 #[7]: whole match start coordinate on the subject sequence | |
| 59 #[8]: whole match end coordinate on the subject sequence | |
| 60 #[9]: length on the subject sequence | |
| 61 #[10]: length in percentage of the subject sequence | |
| 62 #[11]: BLAST E-value | |
| 63 #[12]: BLAST score | |
| 64 #[13]: identity percentage | |
| 65 #[14]: path | |
| 66 | |
| 67 class tabFileReader( object ): | |
| 68 | |
| 69 def __init__( self, line ): | |
| 70 | |
| 71 columns = line.split("\t") | |
| 72 | |
| 73 self.name_sbj = (columns[6]) | |
| 74 self.length_sbj = int(round(int(columns[9])/float(columns[10]),0)) #length of the subject | |
| 75 self.prct_sbj = float(columns[10]) * 100 #prct_sbj = length of the match on the subject divided by the length of the subject * 100 | |
| 76 if int(columns[7]) < int(columns[8]): | |
| 77 self.start_sbj = int(columns[7]) #start of the match on the subject | |
| 78 self.end_sbj = int(columns[8]) #end of the match on the subject | |
| 79 else: | |
| 80 self.start_sbj = int(columns[8]) | |
| 81 self.end_sbj = int(columns[7]) | |
| 82 self.sbj_dist_ends = int(columns[9]) #length on the subject that matches with the query | |
| 83 | |
| 84 self.name_qry = columns[0] | |
| 85 self.length_qry = int(round(int(columns[3])/float(columns[4]),0)) #length of the query | |
| 86 self.prct_qry = float(columns[4]) * 100 #prct_qry = length of the match on the query divided by the length of the query * 100 | |
| 87 if int(columns[1]) < int(columns[2]): | |
| 88 self.start_qry = int(columns[1]) #start of the match on the query | |
| 89 self.end_qry = int(columns[2]) #end of the match on the query | |
| 90 else: | |
| 91 self.start_qry = int(columns[2]) | |
| 92 self.end_qry = int(columns[1]) | |
| 93 self.qry_dist_ends = int(columns[3]) #length on the query that matches with the subject | |
| 94 | |
| 95 self.length_match = int(columns[3]) | |
| 96 self.prct_matchQryOverSbj = float(columns[5]) * 100 #length on the query relative to the subject length in percentage | |
| 97 self.identity = float(columns[13]) | |
| 98 self.score = int(columns[12]) | |
| 99 self.evalue = float(columns[11]) | |
| 100 | |
| 101 self.sbj2qry = [self.length_sbj,self.prct_sbj,self.start_sbj,self.end_sbj,self.name_qry,self.length_sbj,self.prct_qry,self.start_qry,self.end_qry,self.identity,self.score] | |
| 102 | |
| 103 self.qry2sbj = [self.length_qry,self.prct_qry,self.start_qry,self.end_qry,self.name_sbj,self.length_sbj,self.prct_sbj,self.start_sbj,self.end_sbj,self.identity,self.score] | |
| 104 | |
| 105 #---------------------------------------------------------------------------- | |
| 106 | |
| 107 def make_dico( lMatches ): | |
| 108 """ | |
| 109 Record the matches in two dictionaries which keys are the queries or the subjects. | |
| 110 """ | |
| 111 | |
| 112 Sbj2Qry = {} | |
| 113 Qry2Sbj = {} | |
| 114 | |
| 115 for match in lMatches: | |
| 116 if Sbj2Qry.has_key( match.name_sbj ): | |
| 117 Sbj2Qry[match.name_sbj].append( match ) | |
| 118 else: | |
| 119 Sbj2Qry[match.name_sbj] = [ match ] | |
| 120 if Qry2Sbj.has_key( match.name_qry ): | |
| 121 Qry2Sbj[match.name_qry].append( match ) | |
| 122 else: | |
| 123 Qry2Sbj[match.name_qry] = [ match ] | |
| 124 | |
| 125 return [ Sbj2Qry, Qry2Sbj ] | |
| 126 | |
| 127 #---------------------------------------------------------------------------- | |
| 128 | |
| 129 def find_UniqRedun( list_matchs ): | |
| 130 | |
| 131 list_total_sbj = [];list_total_qry = [] | |
| 132 list_uniq_sbj = [];list_redun_sbj = [] | |
| 133 list_uniq_qry = [];list_redun_qry = [] | |
| 134 | |
| 135 for match in list_matchs: | |
| 136 list_total_sbj.append(match.name_sbj) | |
| 137 list_total_qry.append(match.name_qry) | |
| 138 | |
| 139 for name_sbj in list_total_sbj: | |
| 140 if list_total_sbj.count(name_sbj) == 1: | |
| 141 list_uniq_sbj.append(name_sbj) | |
| 142 else: | |
| 143 if name_sbj not in list_redun_sbj: | |
| 144 list_redun_sbj.append(name_sbj) | |
| 145 | |
| 146 for name_qry in list_total_qry: | |
| 147 if list_total_qry.count(name_qry) == 1: | |
| 148 list_uniq_qry.append(name_qry) | |
| 149 else: | |
| 150 if name_qry not in list_redun_qry: | |
| 151 list_redun_qry.append(name_qry) | |
| 152 | |
| 153 return [ list_uniq_sbj, list_redun_sbj, list_uniq_qry, list_redun_qry ] | |
| 154 | |
| 155 #---------------------------------------------------------------------------- | |
| 156 | |
| 157 def remove( all, sup_sbjqry, sup_sbj, sup_qry, inf_sbjqry ): | |
| 158 | |
| 159 for name_sbj in all.keys(): | |
| 160 | |
| 161 if sup_sbjqry.has_key( name_sbj ) and sup_sbj.has_key( name_sbj ): | |
| 162 del sup_sbj[ name_sbj ] | |
| 163 | |
| 164 if sup_sbjqry.has_key( name_sbj ) and sup_qry.has_key( name_sbj ): | |
| 165 del sup_qry[ name_sbj ] | |
| 166 | |
| 167 if sup_sbjqry.has_key( name_sbj ) and inf_sbjqry.has_key( name_sbj ): | |
| 168 del inf_sbjqry[ name_sbj ] | |
| 169 | |
| 170 if sup_sbj.has_key( name_sbj ) and sup_qry.has_key( name_sbj ): | |
| 171 del sup_qry[ name_sbj ] | |
| 172 | |
| 173 if sup_sbj.has_key( name_sbj ) and inf_sbjqry.has_key( name_sbj ): | |
| 174 del inf_sbjqry[ name_sbj ] | |
| 175 | |
| 176 if sup_qry.has_key( name_sbj ) and inf_sbjqry.has_key( name_sbj ): | |
| 177 del inf_sbjqry[ name_sbj ] | |
| 178 | |
| 179 return [ sup_sbj, sup_qry, inf_sbjqry ] | |
| 180 | |
| 181 #---------------------------------------------------------------------------- | |
| 182 | |
| 183 def write_output( outFile, match_type, Sbj2Qry, dSbj2Cat, Qry2Sbj, dQry2Cat ): | |
| 184 """ | |
| 185 Save the results (subjects in each category and its matches) in a human-readable way. | |
| 186 """ | |
| 187 | |
| 188 if match_type == 'CC': | |
| 189 msg = "Matches with L >= %i%% for subject and query (CC)" % ( thresholdCoverage ) | |
| 190 elif match_type == 'CI': | |
| 191 msg = "Matches with L >= %i%% for subject and L < %i%% for query (CI)" % ( thresholdCoverage, thresholdCoverage ) | |
| 192 elif match_type == 'IC': | |
| 193 msg = "Matches with L < %i%% for subject and L >= %i%% for query (IC)" % ( thresholdCoverage, thresholdCoverage ) | |
| 194 elif match_type == 'II': | |
| 195 msg ="Matches with L < %i%% for subject and query (II)" % ( thresholdCoverage ) | |
| 196 if verbose > 1: | |
| 197 print "%s: %i subjects" % ( msg, len(Sbj2Qry.keys()) ) | |
| 198 outFile.write("\n%s\n" % ( msg ) ) | |
| 199 | |
| 200 for name_sbj in Sbj2Qry.keys(): | |
| 201 matchs = Sbj2Qry[name_sbj] | |
| 202 if len(matchs) == 1: | |
| 203 outFile.write("-> subject %s (%s: %s,%s) matches with query %s (%s: %s,%s): prct_sbj %.3f & prct_qry %.3f (id=%.3f,Eval=%g)\n" % (name_sbj,matchs[0].length_sbj,matchs[0].start_sbj,matchs[0].end_sbj,matchs[0].name_qry,matchs[0].length_qry,matchs[0].start_qry,matchs[0].end_qry,matchs[0].prct_sbj,matchs[0].prct_qry,matchs[0].identity,matchs[0].evalue)) | |
| 204 else: | |
| 205 outFile.write("-> subject %s (%s: %s,%s) matches with %s queries:\n" % (name_sbj,matchs[0].length_sbj,matchs[0].start_sbj,matchs[0].end_sbj,len(matchs))) | |
| 206 for match in matchs: | |
| 207 outFile.write("%s versus %s (%s: %s,%s): prct_sbj %.3f & prct_qry %.3f (id=%.3f,Eval=%g)\n"%(name_sbj,match.name_qry,match.length_qry,match.start_qry,match.end_qry,match.prct_sbj,match.prct_qry,match.identity,match.evalue)) | |
| 208 | |
| 209 tmpList = [] | |
| 210 for name_sbj in Sbj2Qry.keys(): | |
| 211 tmpList.append( name_sbj.split(" ")[0].lower() ) | |
| 212 tmpList.sort() | |
| 213 for name_sbj in tmpList: | |
| 214 outFile.write( name_sbj+"\n" ) | |
| 215 dSbj2Cat[ name_sbj ] = match_type | |
| 216 | |
| 217 tmpList = [] | |
| 218 for name_qry in Qry2Sbj.keys(): | |
| 219 tmpList.append( name_qry.split(" ")[0].lower() ) | |
| 220 tmpList.sort() | |
| 221 for name_qry in tmpList: | |
| 222 outFile.write( name_qry+"\n" ) | |
| 223 dQry2Cat[ name_qry ] = match_type | |
| 224 | |
| 225 #---------------------------------------------------------------------------- | |
| 226 | |
| 227 def writeSubjectCategory( dSbj2Cat ): | |
| 228 """ | |
| 229 Save the category (CC/CI/IC/II/NA) in which each subject has been found. | |
| 230 | |
| 231 @param dSbj2Cat: dictionary which keys are subject names and values the category of that subject | |
| 232 @type dSbj2Cat: dictionary | |
| 233 """ | |
| 234 | |
| 235 # sort the subject names in alphabetical order | |
| 236 lSbjSorted = dSbj2Cat.keys() | |
| 237 lSbjSorted.sort() | |
| 238 | |
| 239 catFile = open( tabFileName + "_sbjCategories.txt", "w" ) | |
| 240 for sbj in lSbjSorted: | |
| 241 string = "%s\t%s\n" % ( sbj, dSbj2Cat[ sbj ] ) | |
| 242 catFile.write( string ) | |
| 243 catFile.close() | |
| 244 | |
| 245 #---------------------------------------------------------------------------- | |
| 246 | |
| 247 def writeQueryCategory( dQry2Cat ): | |
| 248 """ | |
| 249 Save the category (CC/CI/IC/II/NA) in which each query has been found. | |
| 250 | |
| 251 @param dQry2Cat: dictionary which keys are query names and values the category of that query | |
| 252 @type dQry2Cat: dictionary | |
| 253 """ | |
| 254 | |
| 255 # sort the query names in alphabetical order | |
| 256 lQrySorted = dQry2Cat.keys() | |
| 257 lQrySorted.sort() | |
| 258 | |
| 259 catFile = open( tabFileName + "_qryCategories.txt", "w" ) | |
| 260 for qry in lQrySorted: | |
| 261 string = "%s\t%s\n" % ( qry, dQry2Cat[ qry ] ) | |
| 262 catFile.write( string ) | |
| 263 catFile.close() | |
| 264 | |
| 265 #---------------------------------------------------------------------------- | |
| 266 | |
| 267 def main(): | |
| 268 | |
| 269 global tabFileName | |
| 270 tabFileName = "" | |
| 271 qryFileName = "" | |
| 272 sbjFileName = "" | |
| 273 global thresholdCoverage | |
| 274 thresholdCoverage = 95 | |
| 275 minIdentity = 0 | |
| 276 minLength = 0 | |
| 277 minOverlap = 0 | |
| 278 global thresholdIdentity | |
| 279 thresholdIdentity = 90 | |
| 280 global thresholdEvalue | |
| 281 thresholdEvalue = 1e-10 | |
| 282 global thresholdCoverageMatch | |
| 283 thresholdCoverageMatch = 90 | |
| 284 global verbose | |
| 285 verbose = 0 | |
| 286 | |
| 287 try: | |
| 288 opts, args = getopt.getopt(sys.argv[1:],"hm:q:s:t:i:l:I:E:T:o:v:") | |
| 289 except getopt.GetoptError, err: | |
| 290 print str(err); help(); sys.exit(1) | |
| 291 for o,a in opts: | |
| 292 if o == "-h": | |
| 293 help() | |
| 294 sys.exit(0) | |
| 295 elif o == "-m": | |
| 296 tabFileName = a | |
| 297 elif o == "-q": | |
| 298 qryFileName = a | |
| 299 elif o == "-s": | |
| 300 sbjFileName = a | |
| 301 elif o == "-t": | |
| 302 thresholdCoverage = int(a) | |
| 303 elif o == "-i": | |
| 304 minIdentity = float(a) | |
| 305 elif o == "-l": | |
| 306 minLength = int(a) | |
| 307 elif o == "-o": | |
| 308 minOverlap = float(a) | |
| 309 elif o == "-I": | |
| 310 thresholdIdentity = int(a) | |
| 311 elif o == "-E": | |
| 312 thresholdEvalue = float(a) | |
| 313 elif o == "-T": | |
| 314 thresholdCoverageMatch = int(a) | |
| 315 elif o == "-v": | |
| 316 verbose = int(a) | |
| 317 | |
| 318 if tabFileName == "": | |
| 319 msg = "ERROR: missing 'tab' file (-m)" | |
| 320 sys.stderr.write( "%s\n" % msg ) | |
| 321 help() | |
| 322 sys.exit(1) | |
| 323 if qryFileName == "" or sbjFileName == "": | |
| 324 msg = "ERROR: missing 'fasta' files (-q or -s)" | |
| 325 sys.stderr.write( "%s\n" % msg ) | |
| 326 help() | |
| 327 sys.exit(1) | |
| 328 | |
| 329 if verbose > 0: | |
| 330 print "START %s" % (sys.argv[0].split("/")[-1]) | |
| 331 sys.stdout.flush() | |
| 332 | |
| 333 # 4 categories of matchs: | |
| 334 # type 1 (CC): the length of the match on the subject is >= 95% of the total length of the subject, idem for the query | |
| 335 # type 2 (CI): sbj >= 95% & qry < 95% | |
| 336 # type 3 (IC): sbj < 95% & qry >= 95% | |
| 337 # type 4 (II): sbj & qry < 95% | |
| 338 ListMatches_all = [] | |
| 339 ListMatches_sup_sbjqry = [] | |
| 340 ListMatches_sup_sbj = [] | |
| 341 ListMatches_sup_qry = [] | |
| 342 ListMatches_inf_sbjqry = [] | |
| 343 | |
| 344 qryDB = pyRepet.seq.BioseqDB.BioseqDB( qryFileName ) | |
| 345 nbQry = qryDB.getSize() | |
| 346 if verbose > 0: | |
| 347 print "nb of queries in '%s': %i" % ( qryFileName, nbQry ) | |
| 348 dQry2Cat = {} | |
| 349 for bs in qryDB.db: | |
| 350 dQry2Cat[ bs.header.split(" ")[0].lower() ] = "NA" | |
| 351 | |
| 352 sbjDB = pyRepet.seq.BioseqDB.BioseqDB( sbjFileName ) | |
| 353 nbSbj = sbjDB.getSize() | |
| 354 if verbose > 0: | |
| 355 print "nb of subjects in '%s': %i" % ( sbjFileName, nbSbj ) | |
| 356 dSbj2Cat = {} | |
| 357 for bs in sbjDB.db: | |
| 358 dSbj2Cat[ bs.header.split(" ")[0].lower() ] = "NA" | |
| 359 | |
| 360 tabFile = open( tabFileName ) | |
| 361 nbMatchesInTab = 0 | |
| 362 dSubject2DistinctQueries = {} | |
| 363 dQuery2DistinctSubjects = {} | |
| 364 | |
| 365 # For each match, create a 'tabFileReader' object and record it in a list according to the type of the match | |
| 366 if verbose > 0: | |
| 367 print "parse the 'tab' file..."; sys.stdout.flush() | |
| 368 while True: | |
| 369 line = tabFile.readline() | |
| 370 if line == "": | |
| 371 break | |
| 372 if line[0:10] == "query.name": | |
| 373 continue | |
| 374 nbMatchesInTab += 1 | |
| 375 | |
| 376 match = tabFileReader( line ) | |
| 377 if match.identity < minIdentity: | |
| 378 line = tabFile.readline() | |
| 379 continue | |
| 380 if match.length_match < minLength: | |
| 381 line = tabFile.readline() | |
| 382 continue | |
| 383 if match.prct_qry < minOverlap or match.prct_sbj < minOverlap: | |
| 384 line = tabFile.readline() | |
| 385 continue | |
| 386 ListMatches_all.append( match ) | |
| 387 | |
| 388 # type 1: sbj C & qry C | |
| 389 if match.prct_sbj >= thresholdCoverage and match.prct_qry >= thresholdCoverage: | |
| 390 qsLengthRatio = 100 * match.length_qry / float(match.length_sbj) | |
| 391 if match.identity >= thresholdIdentity \ | |
| 392 and match.evalue <= thresholdEvalue \ | |
| 393 and qsLengthRatio >= thresholdCoverage - 2 \ | |
| 394 and qsLengthRatio <= 100 + (100-thresholdCoverage) + 2 \ | |
| 395 and match.prct_matchQryOverSbj >= thresholdCoverageMatch: | |
| 396 ListMatches_sup_sbjqry.append( match ) | |
| 397 else: | |
| 398 ListMatches_inf_sbjqry.append( match ) | |
| 399 | |
| 400 # type 2: sbj C & qry I | |
| 401 elif match.prct_sbj >= thresholdCoverage and match.prct_qry < thresholdCoverage: | |
| 402 ListMatches_sup_sbj.append( match ) | |
| 403 | |
| 404 # type 3: sbj I & qry C | |
| 405 elif match.prct_qry >= thresholdCoverage and match.prct_sbj < thresholdCoverage: | |
| 406 ListMatches_sup_qry.append( match ) | |
| 407 | |
| 408 # type 4: sbj I & qry I | |
| 409 elif match.prct_qry < thresholdCoverage and match.prct_sbj < thresholdCoverage: | |
| 410 ListMatches_inf_sbjqry.append( match ) | |
| 411 | |
| 412 if not dSubject2DistinctQueries.has_key( match.name_sbj ): | |
| 413 dSubject2DistinctQueries[ match.name_sbj ] = [] | |
| 414 if not match.name_qry in dSubject2DistinctQueries[ match.name_sbj ]: | |
| 415 dSubject2DistinctQueries[ match.name_sbj ].append( match.name_qry ) | |
| 416 if not dQuery2DistinctSubjects.has_key( match.name_qry ): | |
| 417 dQuery2DistinctSubjects[ match.name_qry ] = [] | |
| 418 if not match.name_sbj in dQuery2DistinctSubjects[ match.name_qry ]: | |
| 419 dQuery2DistinctSubjects[ match.name_qry ].append( match.name_sbj ) | |
| 420 | |
| 421 if verbose > 0: | |
| 422 print "parsing done !"; sys.stdout.flush() | |
| 423 print "nb matches in '%s': %i" % ( tabFileName, nbMatchesInTab ) | |
| 424 print "nb matches 'CC': %i" % ( len(ListMatches_sup_sbjqry) ) | |
| 425 if verbose > 1: | |
| 426 for match in ListMatches_sup_sbjqry: | |
| 427 print "\t%s (%.2f%%) - %s (%.2f%%) id=%.2f" % ( match.name_sbj, match.prct_sbj, match.name_qry, match.prct_qry, match.identity ) | |
| 428 print "nb matches 'CI': %i" % ( len(ListMatches_sup_sbj) ) | |
| 429 if verbose > 1: | |
| 430 for match in ListMatches_sup_sbj: | |
| 431 print "\t%s (%.2f%%) - %s (%.2f%%) id=%.2f" % ( match.name_sbj, match.prct_sbj, match.name_qry, match.prct_qry, match.identity ) | |
| 432 print "nb matches 'IC': %i" % ( len(ListMatches_sup_qry) ) | |
| 433 print "nb matches 'II': %i" % ( len(ListMatches_inf_sbjqry) ) | |
| 434 | |
| 435 if nbMatchesInTab == 0: | |
| 436 print "nothing to do" | |
| 437 sys.exit(0) | |
| 438 | |
| 439 # For each type of matchs, record them in 2 dictionaries: Sbj2Qry and Qry2Sbj | |
| 440 D_all = make_dico( ListMatches_all ) | |
| 441 Sbj2Qry_all = D_all[0] | |
| 442 Qry2Sbj_all = D_all[1] | |
| 443 | |
| 444 D_sup_sbjqry = make_dico(ListMatches_sup_sbjqry) | |
| 445 Sbj2Qry_sup_sbjqry = D_sup_sbjqry[0] | |
| 446 Qry2Sbj_sup_sbjqry = D_sup_sbjqry[1] | |
| 447 | |
| 448 D_sup_sbj = make_dico(ListMatches_sup_sbj) | |
| 449 Sbj2Qry_sup_sbj = D_sup_sbj[0] | |
| 450 Qry2Sbj_sup_sbj = D_sup_sbj[1] | |
| 451 | |
| 452 D_sup_qry = make_dico(ListMatches_sup_qry) | |
| 453 Sbj2Qry_sup_qry = D_sup_qry[0] | |
| 454 Qry2Sbj_sup_qry = D_sup_qry[1] | |
| 455 | |
| 456 D_inf_sbjqry = make_dico(ListMatches_inf_sbjqry) | |
| 457 Sbj2Qry_inf_sbjqry = D_inf_sbjqry[0] | |
| 458 Qry2Sbj_inf_sbjqry = D_inf_sbjqry[1] | |
| 459 | |
| 460 | |
| 461 # For each type of matches, find the subjects/queries that are involve in one or several match | |
| 462 list_all = find_UniqRedun(ListMatches_all) | |
| 463 UniqSbj_all = list_all[0] | |
| 464 RedunSbj_all = list_all[1] | |
| 465 UniqQry_all = list_all[2] | |
| 466 RedunQry_all = list_all[3] | |
| 467 | |
| 468 list1 = find_UniqRedun(ListMatches_sup_sbjqry) | |
| 469 UniqSbj_sup_sbjqry = list1[0] | |
| 470 RedunSbj_sup_sbjqry = list1[1] | |
| 471 UniqQry_sup_sbjqry = list1[2] | |
| 472 RedunQry_sup_sbjqry = list1[3] | |
| 473 | |
| 474 list2 = find_UniqRedun(ListMatches_sup_sbj) | |
| 475 UniqSbj_sup_sbj = list2[0] | |
| 476 RedunSbj_sup_sbj = list2[1] | |
| 477 UniqQry_sup_sbj = list2[2] | |
| 478 RedunQry_sup_sbj = list2[3] | |
| 479 | |
| 480 list3 = find_UniqRedun(ListMatches_sup_qry) | |
| 481 UniqSbj_sup_qry = list3[0] | |
| 482 RedunSbj_sup_qry = list3[1] | |
| 483 UniqQry_sup_qry = list3[2] | |
| 484 RedunQry_sup_qry = list3[3] | |
| 485 | |
| 486 list4 = find_UniqRedun(ListMatches_inf_sbjqry) | |
| 487 UniqSbj_inf_sbjqry = list4[0] | |
| 488 RedunSbj_inf_sbjqry = list4[1] | |
| 489 UniqQry_inf_sbjqry = list4[2] | |
| 490 RedunQry_inf_sbjqry = list4[3] | |
| 491 | |
| 492 iStatSbj = pyRepet.util.Stat.Stat() | |
| 493 for subject in dSubject2DistinctQueries.keys(): | |
| 494 iStatSbj.add( len( dSubject2DistinctQueries[ subject ] ) ) | |
| 495 iStatQry = pyRepet.util.Stat.Stat() | |
| 496 for query in dQuery2DistinctSubjects.keys(): | |
| 497 iStatQry.add( len( dQuery2DistinctSubjects[ query ] ) ) | |
| 498 | |
| 499 | |
| 500 # Write the review of the '.tab' file | |
| 501 outFile = open( tabFileName + "_tabFileReader.txt", "w" ) | |
| 502 outFile.write( "Input: %s\n" % ( tabFileName ) ) | |
| 503 | |
| 504 outFile.write( "\n# Number of subjects in '%s': %i\n" % ( sbjFileName, nbSbj ) ) | |
| 505 outFile.write( "# Number of queries in '%s': %i\n" % ( qryFileName, nbQry ) ) | |
| 506 | |
| 507 outFile.write( "\nNumber of matches: %s\n" % (len(ListMatches_all))) | |
| 508 outFile.write( " # Number of different subjects that match: %s (Sn*=%.2f%%)\n" % ( len(Sbj2Qry_all.keys()), 100 * len(Sbj2Qry_all.keys()) / float(nbSbj) ) ) | |
| 509 outFile.write( " Among them, number of different subjects having exactly one match: %s (%.2f%%)\n" % ( len(UniqSbj_all), 100 * len(UniqSbj_all) / float(len(Sbj2Qry_all.keys())) ) ) | |
| 510 outFile.write( " Among them, number of different subjects having more than one match: %s\n" % (len(RedunSbj_all))) | |
| 511 outFile.write( " Different queries per subject: mean=%.2f sd=%.2f min=%.2f q25=%.2f med=%.2f q75=%.2f max=%.2f\n" % ( iStatSbj.mean(), iStatSbj.sd(), iStatSbj.min, iStatSbj.quantile(0.25), iStatSbj.median(), iStatSbj.quantile(0.75), iStatSbj.max ) ) | |
| 512 outFile.write( " # Number of different queries that match: %s (Sp*=%.2f%%)\n" % ( len(Qry2Sbj_all.keys()), 100 * len(Qry2Sbj_all.keys()) / float(nbQry) ) ) | |
| 513 outFile.write( " Among them, number of different queries having exactly one match: %s (%.2f%%)\n" % ( len(UniqQry_all), 100 * len(UniqQry_all) / float(len(Qry2Sbj_all.keys())) ) ) | |
| 514 outFile.write( " Among them, number of different queries having more than one match: %s\n" % (len(RedunQry_all)) ) | |
| 515 outFile.write( " Different subjects per query: mean=%.2f sd=%.2f min=%.2f q25=%.2f med=%.2f q75=%.2f max=%.2f\n" % ( iStatQry.mean(), iStatQry.sd(), iStatQry.min, iStatQry.quantile(0.25), iStatQry.median(), iStatQry.quantile(0.75), iStatQry.max ) ) | |
| 516 | |
| 517 outFile.write( "\nNumber of matches with L >= %i%% for subject & query: %i\n" % ( thresholdCoverage, len(ListMatches_sup_sbjqry) ) ) | |
| 518 outFile.write( " # Number of different subjects in the 'CC' case: %s (%.2f%%)\n" % ( len(Sbj2Qry_sup_sbjqry), 100 * len(Sbj2Qry_sup_sbjqry) / float(nbSbj) ) ) | |
| 519 outFile.write( " Among them, number of different subjects having exactly one match: %s\n" % (len(UniqSbj_sup_sbjqry))) | |
| 520 outFile.write( " Among them, number of different subjects having more than one match: %s\n" % (len(RedunSbj_sup_sbjqry))) | |
| 521 outFile.write( " # Number of different queries in the 'CC' case: %s (%.2f%%)\n" % ( len(Qry2Sbj_sup_sbjqry), 100 * len(Qry2Sbj_sup_sbjqry) / float(nbQry) ) ) | |
| 522 outFile.write( " Among them, number of different queries having exactly one match: %s\n" % (len(UniqQry_sup_sbjqry))) | |
| 523 outFile.write( " Among them, number of different queries having more than one match: %s\n" % (len(RedunQry_sup_sbjqry))) | |
| 524 | |
| 525 outFile.write( "\nNumber of matches with L >= %i%% for subject and L < %i%% for query: %i\n" % ( thresholdCoverage, thresholdCoverage, len(ListMatches_sup_sbj) ) ) | |
| 526 outFile.write( " Number of different subjects in that case: %s\n" % (len(Sbj2Qry_sup_sbj))) | |
| 527 outFile.write( " Among them, number of different subjects having exactly one match: %s\n" % (len(UniqSbj_sup_sbj))) | |
| 528 outFile.write( " Among them, number of different subjects having more than one match: %s\n" % (len(RedunSbj_sup_sbj))) | |
| 529 outFile.write( " Number of different queries in that case: %s\n" % (len(Qry2Sbj_sup_sbj))) | |
| 530 outFile.write( " Among them, number of different queries having exactly one match: %s\n" % (len(UniqQry_sup_sbj))) | |
| 531 outFile.write( " Among them, number of different queries having more than one match: %s\n" % (len(RedunQry_sup_sbj))) | |
| 532 | |
| 533 outFile.write( "\nNumber of matches with L < %i%% for subject and L >= %i%% for query: %i\n" % ( thresholdCoverage, thresholdCoverage, len(ListMatches_sup_qry) ) ) | |
| 534 outFile.write( " Number of different subjects in that case: %s\n" % (len(Sbj2Qry_sup_qry))) | |
| 535 outFile.write( " Among them, number of different subjects having exactly one match: %s\n" % (len(UniqSbj_sup_qry))) | |
| 536 outFile.write( " Among them, number of different subjects having more than one match: %s\n" % (len(RedunSbj_sup_qry))) | |
| 537 outFile.write( " Number of different queries in that case: %s\n" % (len(Qry2Sbj_sup_qry))) | |
| 538 outFile.write( " Among them, number of different queries having exactly one match: %s\n" % (len(UniqQry_sup_qry))) | |
| 539 outFile.write( " Among them, number of different queries having more than one match: %s\n" % (len(RedunQry_sup_qry))) | |
| 540 | |
| 541 outFile.write( "\nNumber of matches with L < %i%% for subject & query: %i\n" % ( thresholdCoverage, len(ListMatches_inf_sbjqry) ) ) | |
| 542 outFile.write( " Number of different subjects in that case: %s\n" % (len(Sbj2Qry_inf_sbjqry))) | |
| 543 outFile.write( " Among them, number of different subjects having exactly one match: %s\n" % (len(UniqSbj_inf_sbjqry))) | |
| 544 outFile.write( " Among them, number of different subjects having more than one match: %s\n" % (len(RedunSbj_inf_sbjqry))) | |
| 545 outFile.write( " Number of different queries in that case: %s\n" % (len(Qry2Sbj_inf_sbjqry))) | |
| 546 outFile.write( " Among them, number of different queries having exactly one match: %s\n" % (len(UniqQry_inf_sbjqry))) | |
| 547 outFile.write( " Among them, number of different queries having more than one match: %s\n" % (len(RedunQry_inf_sbjqry))) | |
| 548 | |
| 549 | |
| 550 # For the elements already counted in the matches with L >= 95% for subject & query, remove them from the other dictionnaries | |
| 551 rmv_Sbj2Qry = remove( Sbj2Qry_all, Sbj2Qry_sup_sbjqry, Sbj2Qry_sup_sbj, Sbj2Qry_sup_qry, Sbj2Qry_inf_sbjqry ) | |
| 552 rmv_Qry2Sbj = remove( Qry2Sbj_all, Qry2Sbj_sup_sbjqry, Qry2Sbj_sup_sbj, Qry2Sbj_sup_qry, Qry2Sbj_inf_sbjqry ) | |
| 553 | |
| 554 outFile.write("\n\nAfter removal of the subjects/queries already counted in the matches with L >= %i%% for them:\n" % ( thresholdCoverage ) ) | |
| 555 | |
| 556 outFile.write( "\nMatches with L >= %i%% for subject and L < %i%% for query:\n" % ( thresholdCoverage, thresholdCoverage ) ) | |
| 557 outFile.write( " # Number of different subjects in the 'CI' case: %s (%.2f%%)\n" % ( len(rmv_Sbj2Qry[0]), 100*len(rmv_Sbj2Qry[0])/float(nbSbj) ) ) | |
| 558 outFile.write( " # Number of different queries in the 'CI' case: %s (%.2f%%)\n" % ( len(rmv_Qry2Sbj[0]), 100*len(rmv_Qry2Sbj[0])/float(nbQry) ) ) | |
| 559 | |
| 560 outFile.write( "\nMatches with L < %i%% for subject and L >= %i%% for query:\n" % ( thresholdCoverage, thresholdCoverage ) ) | |
| 561 outFile.write( " # Number of different subjects in the 'IC' case: %s (%.2f%%)\n" % (len(rmv_Sbj2Qry[1]), 100*len(rmv_Sbj2Qry[1])/float(nbSbj) ) ) | |
| 562 outFile.write( " # Number of different queries in the 'IC' case: %s (%.2f%%)\n" % (len(rmv_Qry2Sbj[1]), 100*len(rmv_Qry2Sbj[1])/float(nbQry) ) ) | |
| 563 | |
| 564 outFile.write( "\nMatches with L < %i%% for subject & query:\n" % ( thresholdCoverage ) ) | |
| 565 outFile.write( " # Number of different subjects in the 'II' case: %s (%.2f%%)\n" % (len(rmv_Sbj2Qry[2]), 100*len(rmv_Sbj2Qry[2])/float(nbSbj) ) ) | |
| 566 outFile.write( " # Number of different queries in the 'II' case: %s (%.2f%%)\n" % (len(rmv_Qry2Sbj[2]), 100*len(rmv_Qry2Sbj[2])/float(nbQry) ) ) | |
| 567 | |
| 568 outFile.write("\n==========================================================================\n") | |
| 569 | |
| 570 write_output( outFile, 'CC', Sbj2Qry_sup_sbjqry, dSbj2Cat, Qry2Sbj_sup_sbjqry, dQry2Cat ) | |
| 571 | |
| 572 outFile.write("\n==========================================================================\n") | |
| 573 | |
| 574 write_output( outFile, 'CI', rmv_Sbj2Qry[0], dSbj2Cat, rmv_Qry2Sbj[0], dQry2Cat ) | |
| 575 | |
| 576 outFile.write("\n==========================================================================\n") | |
| 577 | |
| 578 write_output( outFile, 'IC', rmv_Sbj2Qry[1], dSbj2Cat, rmv_Qry2Sbj[1], dQry2Cat ) | |
| 579 | |
| 580 outFile.write("\n==========================================================================\n") | |
| 581 | |
| 582 write_output( outFile, 'II', rmv_Sbj2Qry[2], dSbj2Cat, rmv_Qry2Sbj[2], dQry2Cat ) | |
| 583 | |
| 584 outFile.write("\n==========================================================================\n") | |
| 585 | |
| 586 outFile.close() | |
| 587 | |
| 588 writeSubjectCategory( dSbj2Cat ) | |
| 589 writeQueryCategory( dQry2Cat ) | |
| 590 | |
| 591 if verbose > 0: | |
| 592 print "END %s" % (sys.argv[0].split("/")[-1]) | |
| 593 sys.stdout.flush() | |
| 594 | |
| 595 return 0 | |
| 596 | |
| 597 #----------------------------------------------------------------------------------------------------- | |
| 598 | |
| 599 if __name__ == "__main__": | |
| 600 main() |
