Mercurial > repos > yufei-luo > s_mart

diff commons/tools/tabFileReader.py @ 18:94ab73e8a190
Uploaded
author: m-zytnicki
date: Mon, 29 Apr 2013 03:20:15 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/tools/tabFileReader.py	Mon Apr 29 03:20:15 2013 -0400
@@ -0,0 +1,600 @@
+#!/usr/bin/env python
+
+###@file
+# Read a file recording matches in the 'tab' format (output from Matcher) and return the number of matches between queries and subjects being CC, CI, IC and II.
+# A match is said to be CC (for complete-complete) when both query and subject match over x% of their entire respective length. By default, x=95.
+#
+# usage: tabFileReader.py [ options ]
+# options:
+#      -h: this help
+#      -m: name of the file recording the matches (format='tab', output from Matcher)
+#      -q: name of the fasta file recording the queries
+#      -s: name of the fasta file recording the subjects
+#      -t: threshold over which the match is 'complete', in % of the seq length (default=95)
+#      -i: identity below which matches are ignored (default=0)
+#      -l: length below which matches are ignored (default=0)
+#      -o: overlap on query and subject below which matches are ignored (default=0)
+#      -v: verbose (default=0/1)
+
+import sys
+import getopt
+from string import *
+
+import pyRepet.seq.BioseqDB
+import pyRepet.util.Stat
+
+#TODO: remove case changes in headers (4 lower() method calls in this script) 
+
+#----------------------------------------------------------------------------
+
+def help():
+    print
+    print "usage: %s [ options ]" % ( sys.argv[0].split("/")[-1] )
+    print "options:"
+    print "     -h: this help"
+    print "     -m: name of the file recording the matches (format='tab', output from Matcher)"
+    print "     -q: name of the fasta file recording the queries"
+    print "     -s: name of the fasta file recording the subjects"
+    print "     -t: coverage threshold over which the match is 'complete' (in %% of the seq length, default=95)"
+    print "     -i: identity below which matches are ignored (default=0)"
+    print "     -l: length below which matches are ignored (default=0)"
+    print "     -o: overlap on query and subject below which matches are ignored (default=0)"
+    print "     -I: identity threshold for 'CC' matches (default=90)"
+    print "     -E: E-value threshold for 'CC' matches (default=1e-10)"
+    print "     -T: coverage threshold for match length on query compare to subject length (default=90)"
+    print "     -v: verbose (default=0/1)"
+    print
+
+#----------------------------------------------------------------------------
+
+#here are the fields of a '.tab' file:
+#[0]: query sequence name
+#[1]: whole match start coordinate on the query sequence
+#[2]: whole match end coordinate on the query sequence
+#[3]: length on the query sequence
+#[4]: length in percentage of the query sequence
+#[5]: length on the query relative to the subject length in percentage
+#[6]: subject sequence name
+#[7]: whole match start coordinate on the subject sequence
+#[8]: whole match end coordinate on the subject sequence
+#[9]: length on the subject sequence
+#[10]: length in percentage of the subject sequence
+#[11]: BLAST E-value
+#[12]: BLAST score
+#[13]: identity percentage
+#[14]: path
+
+class tabFileReader( object ):
+
+    def __init__( self, line ):
+
+        columns = line.split("\t")
+
+        self.name_sbj = (columns[6])
+        self.length_sbj = int(round(int(columns[9])/float(columns[10]),0))  #length of the subject
+        self.prct_sbj = float(columns[10]) * 100  #prct_sbj = length of the match on the subject divided by the length of the subject * 100
+        if int(columns[7]) < int(columns[8]):
+            self.start_sbj = int(columns[7])                        #start of the match on the subject
+            self.end_sbj = int(columns[8])                          #end of the match on the subject
+        else:
+            self.start_sbj = int(columns[8])
+            self.end_sbj = int(columns[7])
+        self.sbj_dist_ends = int(columns[9])                    #length on the subject that matches with the query
+
+        self.name_qry = columns[0]
+        self.length_qry = int(round(int(columns[3])/float(columns[4]),0))  #length of the query
+        self.prct_qry = float(columns[4]) * 100   #prct_qry = length of the match on the query divided by the length of the query * 100
+        if int(columns[1]) < int(columns[2]):
+            self.start_qry = int(columns[1])                        #start of the match on the query
+            self.end_qry = int(columns[2])                          #end of the match on the query
+        else:
+            self.start_qry = int(columns[2])
+            self.end_qry = int(columns[1])
+        self.qry_dist_ends = int(columns[3])                    #length on the query that matches with the subject
+
+        self.length_match = int(columns[3])
+        self.prct_matchQryOverSbj = float(columns[5]) * 100   #length on the query relative to the subject length in percentage
+        self.identity = float(columns[13])
+        self.score = int(columns[12])
+        self.evalue = float(columns[11])
+
+        self.sbj2qry = [self.length_sbj,self.prct_sbj,self.start_sbj,self.end_sbj,self.name_qry,self.length_sbj,self.prct_qry,self.start_qry,self.end_qry,self.identity,self.score]
+        
+        self.qry2sbj = [self.length_qry,self.prct_qry,self.start_qry,self.end_qry,self.name_sbj,self.length_sbj,self.prct_sbj,self.start_sbj,self.end_sbj,self.identity,self.score]
+
+#----------------------------------------------------------------------------
+
+def make_dico( lMatches ):
+    """
+    Record the matches in two dictionaries which keys are the queries or the subjects.
+    """
+
+    Sbj2Qry = {}
+    Qry2Sbj = {}
+
+    for match in lMatches:
+        if Sbj2Qry.has_key( match.name_sbj ):
+            Sbj2Qry[match.name_sbj].append( match )
+        else:
+            Sbj2Qry[match.name_sbj] = [ match ]
+        if Qry2Sbj.has_key( match.name_qry ):
+            Qry2Sbj[match.name_qry].append( match )
+        else:
+            Qry2Sbj[match.name_qry] = [ match ]
+
+    return [ Sbj2Qry, Qry2Sbj ]
+
+#----------------------------------------------------------------------------
+
+def find_UniqRedun( list_matchs ):
+
+    list_total_sbj = [];list_total_qry = []
+    list_uniq_sbj = [];list_redun_sbj = []
+    list_uniq_qry = [];list_redun_qry = []
+
+    for match in list_matchs:
+        list_total_sbj.append(match.name_sbj)
+        list_total_qry.append(match.name_qry)
+
+    for name_sbj in list_total_sbj:
+        if list_total_sbj.count(name_sbj) == 1:
+            list_uniq_sbj.append(name_sbj)
+        else:
+            if name_sbj not in list_redun_sbj:
+                list_redun_sbj.append(name_sbj)
+
+    for name_qry in list_total_qry:
+        if list_total_qry.count(name_qry) == 1:
+            list_uniq_qry.append(name_qry)
+        else:
+            if name_qry not in list_redun_qry:
+                list_redun_qry.append(name_qry)
+
+    return [ list_uniq_sbj, list_redun_sbj, list_uniq_qry, list_redun_qry ]
+
+#----------------------------------------------------------------------------
+
+def remove( all, sup_sbjqry, sup_sbj, sup_qry, inf_sbjqry ):
+
+    for name_sbj in all.keys():
+
+        if sup_sbjqry.has_key( name_sbj ) and sup_sbj.has_key( name_sbj ):
+            del sup_sbj[ name_sbj ]
+
+        if sup_sbjqry.has_key( name_sbj ) and sup_qry.has_key( name_sbj ):
+            del sup_qry[ name_sbj ]
+
+        if sup_sbjqry.has_key( name_sbj ) and inf_sbjqry.has_key( name_sbj ):
+            del inf_sbjqry[ name_sbj ]
+
+        if sup_sbj.has_key( name_sbj ) and sup_qry.has_key( name_sbj ):
+            del sup_qry[ name_sbj ]
+
+        if sup_sbj.has_key( name_sbj ) and inf_sbjqry.has_key( name_sbj ):
+            del inf_sbjqry[ name_sbj ]
+
+        if sup_qry.has_key( name_sbj ) and inf_sbjqry.has_key( name_sbj ):
+            del inf_sbjqry[ name_sbj ]
+
+    return [ sup_sbj, sup_qry, inf_sbjqry ]
+
+#----------------------------------------------------------------------------
+
+def write_output( outFile, match_type, Sbj2Qry, dSbj2Cat, Qry2Sbj, dQry2Cat ):
+    """
+    Save the results (subjects in each category and its matches) in a human-readable way.
+    """
+
+    if match_type == 'CC':
+        msg = "Matches with L >= %i%% for subject and query (CC)" % ( thresholdCoverage )
+    elif match_type == 'CI':
+        msg = "Matches with L >= %i%% for subject and L < %i%% for query (CI)" % ( thresholdCoverage, thresholdCoverage )
+    elif match_type == 'IC':
+        msg = "Matches with L < %i%% for subject and L >= %i%% for query (IC)" % ( thresholdCoverage, thresholdCoverage )
+    elif match_type == 'II':
+        msg ="Matches with L < %i%% for subject and query (II)" % ( thresholdCoverage )
+    if verbose > 1:
+        print "%s: %i subjects" % ( msg, len(Sbj2Qry.keys()) )
+    outFile.write("\n%s\n" % ( msg ) )
+
+    for name_sbj in Sbj2Qry.keys():
+        matchs = Sbj2Qry[name_sbj]
+        if len(matchs) == 1:
+            outFile.write("-> subject %s (%s: %s,%s) matches with query %s (%s: %s,%s): prct_sbj %.3f & prct_qry %.3f (id=%.3f,Eval=%g)\n" % (name_sbj,matchs[0].length_sbj,matchs[0].start_sbj,matchs[0].end_sbj,matchs[0].name_qry,matchs[0].length_qry,matchs[0].start_qry,matchs[0].end_qry,matchs[0].prct_sbj,matchs[0].prct_qry,matchs[0].identity,matchs[0].evalue))
+        else:
+            outFile.write("-> subject %s (%s: %s,%s) matches with %s queries:\n" % (name_sbj,matchs[0].length_sbj,matchs[0].start_sbj,matchs[0].end_sbj,len(matchs)))
+            for match in matchs:
+                outFile.write("%s versus %s (%s: %s,%s): prct_sbj %.3f & prct_qry %.3f (id=%.3f,Eval=%g)\n"%(name_sbj,match.name_qry,match.length_qry,match.start_qry,match.end_qry,match.prct_sbj,match.prct_qry,match.identity,match.evalue))
+
+    tmpList = []
+    for name_sbj in Sbj2Qry.keys():
+        tmpList.append( name_sbj.split(" ")[0].lower() )
+    tmpList.sort()
+    for name_sbj in tmpList:
+        outFile.write( name_sbj+"\n" )
+        dSbj2Cat[ name_sbj ] = match_type
+
+    tmpList = []
+    for name_qry in Qry2Sbj.keys():
+        tmpList.append( name_qry.split(" ")[0].lower() )
+    tmpList.sort()
+    for name_qry in tmpList:
+        outFile.write( name_qry+"\n" )
+        dQry2Cat[ name_qry ] = match_type
+
+#----------------------------------------------------------------------------
+
+def writeSubjectCategory( dSbj2Cat ):
+    """
+    Save the category (CC/CI/IC/II/NA) in which each subject has been found.
+
+    @param dSbj2Cat: dictionary which keys are subject names and values the category of that subject
+    @type dSbj2Cat: dictionary
+    """
+
+    # sort the subject names in alphabetical order
+    lSbjSorted = dSbj2Cat.keys()
+    lSbjSorted.sort()
+
+    catFile = open( tabFileName + "_sbjCategories.txt", "w" )
+    for sbj in lSbjSorted:
+        string = "%s\t%s\n" % ( sbj, dSbj2Cat[ sbj ] )
+        catFile.write( string )
+    catFile.close()
+
+#----------------------------------------------------------------------------
+
+def writeQueryCategory( dQry2Cat ):
+    """
+    Save the category (CC/CI/IC/II/NA) in which each query has been found.
+
+    @param dQry2Cat: dictionary which keys are query names and values the category of that query
+    @type dQry2Cat: dictionary
+    """
+
+    # sort the query names in alphabetical order
+    lQrySorted = dQry2Cat.keys()
+    lQrySorted.sort()
+
+    catFile = open( tabFileName + "_qryCategories.txt", "w" )
+    for qry in lQrySorted:
+        string = "%s\t%s\n" % ( qry, dQry2Cat[ qry ] )
+        catFile.write( string )
+    catFile.close()
+    
+#----------------------------------------------------------------------------
+
+def main():
+
+    global tabFileName
+    tabFileName = ""
+    qryFileName = ""
+    sbjFileName = ""
+    global thresholdCoverage
+    thresholdCoverage = 95
+    minIdentity = 0
+    minLength = 0
+    minOverlap = 0
+    global thresholdIdentity
+    thresholdIdentity = 90
+    global thresholdEvalue
+    thresholdEvalue = 1e-10
+    global thresholdCoverageMatch
+    thresholdCoverageMatch = 90
+    global verbose
+    verbose = 0
+
+    try:
+        opts, args = getopt.getopt(sys.argv[1:],"hm:q:s:t:i:l:I:E:T:o:v:")
+    except getopt.GetoptError, err:
+        print str(err); help(); sys.exit(1)
+    for o,a in opts:
+        if o == "-h":
+            help()
+            sys.exit(0)
+        elif o == "-m":
+            tabFileName = a
+        elif o == "-q":
+            qryFileName = a
+        elif o == "-s":
+            sbjFileName = a
+        elif o == "-t":
+            thresholdCoverage = int(a)
+        elif o == "-i":
+            minIdentity = float(a)
+        elif o == "-l":
+            minLength = int(a)
+        elif o == "-o":
+            minOverlap = float(a)
+        elif o == "-I":
+            thresholdIdentity = int(a)
+        elif o == "-E":
+            thresholdEvalue = float(a)
+        elif o == "-T":
+            thresholdCoverageMatch = int(a)
+        elif o == "-v":
+            verbose = int(a)
+
+    if tabFileName == "":
+        msg = "ERROR: missing 'tab' file (-m)"
+        sys.stderr.write( "%s\n" % msg )
+        help()
+        sys.exit(1)
+    if qryFileName == "" or sbjFileName == "":
+        msg = "ERROR: missing 'fasta' files (-q or -s)"
+        sys.stderr.write( "%s\n" % msg )
+        help()
+        sys.exit(1)
+
+    if verbose > 0:
+        print "START %s" % (sys.argv[0].split("/")[-1])
+        sys.stdout.flush()
+
+    # 4 categories of matchs:
+    # type 1 (CC): the length of the match on the subject is >= 95% of the total length of the subject, idem for the query
+    # type 2 (CI): sbj >= 95% & qry < 95%
+    # type 3 (IC): sbj < 95% & qry >= 95%
+    # type 4 (II): sbj & qry < 95%
+    ListMatches_all = []
+    ListMatches_sup_sbjqry = []
+    ListMatches_sup_sbj = []
+    ListMatches_sup_qry = []
+    ListMatches_inf_sbjqry = []
+
+    qryDB = pyRepet.seq.BioseqDB.BioseqDB( qryFileName )
+    nbQry = qryDB.getSize()
+    if verbose > 0:
+        print "nb of queries in '%s': %i" % ( qryFileName, nbQry )
+    dQry2Cat = {}
+    for bs in qryDB.db:
+        dQry2Cat[ bs.header.split(" ")[0].lower() ] = "NA"
+
+    sbjDB = pyRepet.seq.BioseqDB.BioseqDB( sbjFileName )
+    nbSbj = sbjDB.getSize()
+    if verbose > 0:
+        print "nb of subjects in '%s': %i" % ( sbjFileName, nbSbj )
+    dSbj2Cat = {}
+    for bs in sbjDB.db:
+        dSbj2Cat[ bs.header.split(" ")[0].lower() ] = "NA"
+
+    tabFile = open( tabFileName )
+    nbMatchesInTab = 0
+    dSubject2DistinctQueries = {}
+    dQuery2DistinctSubjects = {}
+
+    # For each match, create a 'tabFileReader' object and record it in a list according to the type of the match
+    if verbose > 0:
+        print "parse the 'tab' file..."; sys.stdout.flush()
+    while True:
+        line = tabFile.readline()
+        if line == "":
+            break
+        if line[0:10] == "query.name":
+            continue
+        nbMatchesInTab += 1
+
+        match = tabFileReader( line )
+        if match.identity < minIdentity:
+            line = tabFile.readline()
+            continue
+        if match.length_match < minLength:
+            line = tabFile.readline()
+            continue
+        if match.prct_qry < minOverlap or match.prct_sbj < minOverlap:
+            line = tabFile.readline()
+            continue
+        ListMatches_all.append( match )
+
+        # type 1: sbj C & qry C
+        if match.prct_sbj >= thresholdCoverage and match.prct_qry >= thresholdCoverage:
+            qsLengthRatio = 100 * match.length_qry / float(match.length_sbj)
+            if match.identity >= thresholdIdentity \
+            and match.evalue <= thresholdEvalue \
+            and qsLengthRatio >= thresholdCoverage - 2 \
+            and qsLengthRatio <= 100 + (100-thresholdCoverage) + 2 \
+            and match.prct_matchQryOverSbj >= thresholdCoverageMatch:
+                ListMatches_sup_sbjqry.append( match )
+            else:
+                ListMatches_inf_sbjqry.append( match )
+
+        # type 2: sbj C & qry I
+        elif match.prct_sbj >= thresholdCoverage and match.prct_qry < thresholdCoverage:
+            ListMatches_sup_sbj.append( match )
+
+        # type 3: sbj I & qry C
+        elif match.prct_qry >= thresholdCoverage and match.prct_sbj < thresholdCoverage:
+            ListMatches_sup_qry.append( match )
+
+        # type 4: sbj I & qry I
+        elif match.prct_qry < thresholdCoverage and match.prct_sbj < thresholdCoverage:
+            ListMatches_inf_sbjqry.append( match )
+
+        if not dSubject2DistinctQueries.has_key( match.name_sbj ):
+            dSubject2DistinctQueries[ match.name_sbj ] = []
+        if not match.name_qry in dSubject2DistinctQueries[ match.name_sbj ]:
+            dSubject2DistinctQueries[ match.name_sbj ].append( match.name_qry )
+        if not dQuery2DistinctSubjects.has_key( match.name_qry ):
+            dQuery2DistinctSubjects[ match.name_qry ] = []
+        if not match.name_sbj in dQuery2DistinctSubjects[ match.name_qry ]:
+            dQuery2DistinctSubjects[ match.name_qry ].append( match.name_sbj )
+
+    if verbose > 0:
+        print "parsing done !"; sys.stdout.flush()
+        print "nb matches in '%s': %i" % ( tabFileName, nbMatchesInTab )
+        print "nb matches 'CC': %i" % ( len(ListMatches_sup_sbjqry) )
+        if verbose > 1:
+            for match in ListMatches_sup_sbjqry:
+                print "\t%s (%.2f%%) - %s (%.2f%%) id=%.2f" % ( match.name_sbj, match.prct_sbj, match.name_qry, match.prct_qry, match.identity )
+        print "nb matches 'CI': %i" % ( len(ListMatches_sup_sbj) )
+        if verbose > 1:
+            for match in ListMatches_sup_sbj:
+                print "\t%s (%.2f%%) - %s (%.2f%%) id=%.2f" % ( match.name_sbj, match.prct_sbj, match.name_qry, match.prct_qry, match.identity )
+        print "nb matches 'IC': %i" % ( len(ListMatches_sup_qry) )
+        print "nb matches 'II': %i" % ( len(ListMatches_inf_sbjqry) )
+
+    if nbMatchesInTab == 0:
+        print "nothing to do"
+        sys.exit(0)
+
+    # For each type of matchs, record them in 2 dictionaries: Sbj2Qry and Qry2Sbj
+    D_all = make_dico( ListMatches_all )
+    Sbj2Qry_all = D_all[0]
+    Qry2Sbj_all = D_all[1]
+
+    D_sup_sbjqry = make_dico(ListMatches_sup_sbjqry)
+    Sbj2Qry_sup_sbjqry = D_sup_sbjqry[0]
+    Qry2Sbj_sup_sbjqry = D_sup_sbjqry[1]
+
+    D_sup_sbj = make_dico(ListMatches_sup_sbj)
+    Sbj2Qry_sup_sbj = D_sup_sbj[0]
+    Qry2Sbj_sup_sbj = D_sup_sbj[1]
+
+    D_sup_qry = make_dico(ListMatches_sup_qry)
+    Sbj2Qry_sup_qry = D_sup_qry[0]
+    Qry2Sbj_sup_qry = D_sup_qry[1]
+
+    D_inf_sbjqry = make_dico(ListMatches_inf_sbjqry)
+    Sbj2Qry_inf_sbjqry = D_inf_sbjqry[0]
+    Qry2Sbj_inf_sbjqry = D_inf_sbjqry[1]
+
+
+    # For each type of matches, find the subjects/queries that are involve in one or several match
+    list_all = find_UniqRedun(ListMatches_all)
+    UniqSbj_all = list_all[0]
+    RedunSbj_all = list_all[1]
+    UniqQry_all = list_all[2]
+    RedunQry_all = list_all[3]
+
+    list1 = find_UniqRedun(ListMatches_sup_sbjqry)
+    UniqSbj_sup_sbjqry = list1[0]
+    RedunSbj_sup_sbjqry = list1[1]
+    UniqQry_sup_sbjqry = list1[2]
+    RedunQry_sup_sbjqry = list1[3]
+
+    list2 = find_UniqRedun(ListMatches_sup_sbj)
+    UniqSbj_sup_sbj = list2[0]
+    RedunSbj_sup_sbj = list2[1]
+    UniqQry_sup_sbj = list2[2]
+    RedunQry_sup_sbj = list2[3]
+    
+    list3 = find_UniqRedun(ListMatches_sup_qry)
+    UniqSbj_sup_qry = list3[0]
+    RedunSbj_sup_qry = list3[1]
+    UniqQry_sup_qry = list3[2]
+    RedunQry_sup_qry = list3[3]
+    
+    list4 = find_UniqRedun(ListMatches_inf_sbjqry)
+    UniqSbj_inf_sbjqry = list4[0]
+    RedunSbj_inf_sbjqry = list4[1]
+    UniqQry_inf_sbjqry = list4[2]
+    RedunQry_inf_sbjqry = list4[3]
+    
+    iStatSbj = pyRepet.util.Stat.Stat()
+    for subject in dSubject2DistinctQueries.keys():
+        iStatSbj.add( len( dSubject2DistinctQueries[ subject ] ) )
+    iStatQry = pyRepet.util.Stat.Stat()
+    for query in dQuery2DistinctSubjects.keys():
+        iStatQry.add( len( dQuery2DistinctSubjects[ query ] ) )
+
+
+    # Write the review of the '.tab' file
+    outFile = open( tabFileName + "_tabFileReader.txt", "w" )
+    outFile.write( "Input: %s\n" % ( tabFileName ) )
+
+    outFile.write( "\n# Number of subjects in '%s': %i\n" % ( sbjFileName, nbSbj ) )
+    outFile.write( "# Number of queries in '%s': %i\n" % ( qryFileName, nbQry ) )
+
+    outFile.write( "\nNumber of matches: %s\n" % (len(ListMatches_all)))
+    outFile.write( "    # Number of different subjects that match: %s (Sn*=%.2f%%)\n" % ( len(Sbj2Qry_all.keys()), 100 * len(Sbj2Qry_all.keys()) / float(nbSbj) ) )
+    outFile.write( "        Among them, number of different subjects having exactly one match: %s (%.2f%%)\n" % ( len(UniqSbj_all), 100 * len(UniqSbj_all) / float(len(Sbj2Qry_all.keys())) ) )
+    outFile.write( "        Among them, number of different subjects having more than one match: %s\n" % (len(RedunSbj_all)))
+    outFile.write( "        Different queries per subject: mean=%.2f sd=%.2f min=%.2f q25=%.2f med=%.2f q75=%.2f max=%.2f\n" % ( iStatSbj.mean(), iStatSbj.sd(), iStatSbj.min, iStatSbj.quantile(0.25), iStatSbj.median(), iStatSbj.quantile(0.75), iStatSbj.max ) )
+    outFile.write( "    # Number of different queries that match: %s (Sp*=%.2f%%)\n" % ( len(Qry2Sbj_all.keys()), 100 * len(Qry2Sbj_all.keys()) / float(nbQry) ) )
+    outFile.write( "        Among them, number of different queries having exactly one match: %s (%.2f%%)\n" % ( len(UniqQry_all), 100 * len(UniqQry_all) / float(len(Qry2Sbj_all.keys())) ) )
+    outFile.write( "        Among them, number of different queries having more than one match: %s\n" % (len(RedunQry_all)) )
+    outFile.write( "        Different subjects per query: mean=%.2f sd=%.2f min=%.2f q25=%.2f med=%.2f q75=%.2f max=%.2f\n" % ( iStatQry.mean(), iStatQry.sd(), iStatQry.min, iStatQry.quantile(0.25), iStatQry.median(), iStatQry.quantile(0.75), iStatQry.max ) )
+
+    outFile.write( "\nNumber of matches with L >= %i%% for subject & query: %i\n" % ( thresholdCoverage, len(ListMatches_sup_sbjqry) ) )
+    outFile.write( "    # Number of different subjects in the 'CC' case: %s (%.2f%%)\n" % ( len(Sbj2Qry_sup_sbjqry), 100 *  len(Sbj2Qry_sup_sbjqry) / float(nbSbj) ) )
+    outFile.write( "        Among them, number of different subjects having exactly one match: %s\n" % (len(UniqSbj_sup_sbjqry)))
+    outFile.write( "        Among them, number of different subjects having more than one match: %s\n" % (len(RedunSbj_sup_sbjqry)))
+    outFile.write( "    # Number of different queries in the 'CC' case: %s (%.2f%%)\n" % ( len(Qry2Sbj_sup_sbjqry), 100 * len(Qry2Sbj_sup_sbjqry) / float(nbQry) ) )
+    outFile.write( "        Among them, number of different queries having exactly one match: %s\n" % (len(UniqQry_sup_sbjqry)))
+    outFile.write( "        Among them, number of different queries having more than one match: %s\n" % (len(RedunQry_sup_sbjqry)))
+
+    outFile.write( "\nNumber of matches with L >= %i%% for subject and L < %i%% for query: %i\n" % ( thresholdCoverage, thresholdCoverage, len(ListMatches_sup_sbj) ) )
+    outFile.write( "    Number of different subjects in that case: %s\n" % (len(Sbj2Qry_sup_sbj)))
+    outFile.write( "        Among them, number of different subjects having exactly one match: %s\n" % (len(UniqSbj_sup_sbj)))
+    outFile.write( "        Among them, number of different subjects having more than one match: %s\n" % (len(RedunSbj_sup_sbj)))
+    outFile.write( "    Number of different queries in that case: %s\n" % (len(Qry2Sbj_sup_sbj)))
+    outFile.write( "        Among them, number of different queries having exactly one match: %s\n" % (len(UniqQry_sup_sbj)))
+    outFile.write( "        Among them, number of different queries having more than one match: %s\n" % (len(RedunQry_sup_sbj)))
+
+    outFile.write( "\nNumber of matches with L < %i%% for subject and L >= %i%% for query: %i\n" % ( thresholdCoverage, thresholdCoverage, len(ListMatches_sup_qry) ) )
+    outFile.write( "    Number of different subjects in that case: %s\n" % (len(Sbj2Qry_sup_qry)))
+    outFile.write( "        Among them, number of different subjects having exactly one match: %s\n" % (len(UniqSbj_sup_qry)))
+    outFile.write( "        Among them, number of different subjects having more than one match: %s\n" % (len(RedunSbj_sup_qry)))
+    outFile.write( "    Number of different queries in that case: %s\n" % (len(Qry2Sbj_sup_qry)))
+    outFile.write( "        Among them, number of different queries having exactly one match: %s\n" % (len(UniqQry_sup_qry)))
+    outFile.write( "        Among them, number of different queries having more than one match: %s\n" % (len(RedunQry_sup_qry)))
+
+    outFile.write( "\nNumber of matches with L < %i%% for subject & query: %i\n" % ( thresholdCoverage, len(ListMatches_inf_sbjqry) ) )
+    outFile.write( "    Number of different subjects in that case: %s\n" % (len(Sbj2Qry_inf_sbjqry)))
+    outFile.write( "        Among them, number of different subjects having exactly one match: %s\n" % (len(UniqSbj_inf_sbjqry)))
+    outFile.write( "        Among them, number of different subjects having more than one match: %s\n" % (len(RedunSbj_inf_sbjqry)))
+    outFile.write( "    Number of different queries in that case: %s\n" % (len(Qry2Sbj_inf_sbjqry)))
+    outFile.write( "        Among them, number of different queries having exactly one match: %s\n" % (len(UniqQry_inf_sbjqry)))
+    outFile.write( "        Among them, number of different queries having more than one match: %s\n" % (len(RedunQry_inf_sbjqry)))
+    
+    
+    # For the elements already counted in the matches with L >= 95% for subject & query, remove them from the other dictionnaries
+    rmv_Sbj2Qry = remove( Sbj2Qry_all, Sbj2Qry_sup_sbjqry, Sbj2Qry_sup_sbj, Sbj2Qry_sup_qry, Sbj2Qry_inf_sbjqry )
+    rmv_Qry2Sbj = remove( Qry2Sbj_all, Qry2Sbj_sup_sbjqry, Qry2Sbj_sup_sbj, Qry2Sbj_sup_qry, Qry2Sbj_inf_sbjqry )
+    
+    outFile.write("\n\nAfter removal of the subjects/queries already counted in the matches with L >= %i%% for them:\n" % ( thresholdCoverage ) )
+    
+    outFile.write( "\nMatches with L >= %i%% for subject and L < %i%% for query:\n" % ( thresholdCoverage, thresholdCoverage ) )
+    outFile.write( "    # Number of different subjects in the 'CI' case: %s (%.2f%%)\n" % ( len(rmv_Sbj2Qry[0]), 100*len(rmv_Sbj2Qry[0])/float(nbSbj) ) )
+    outFile.write( "    # Number of different queries in the 'CI' case: %s (%.2f%%)\n" % ( len(rmv_Qry2Sbj[0]), 100*len(rmv_Qry2Sbj[0])/float(nbQry) ) )
+    
+    outFile.write( "\nMatches with L < %i%% for subject and L >= %i%% for query:\n" % ( thresholdCoverage, thresholdCoverage ) )
+    outFile.write( "    # Number of different subjects in the 'IC' case: %s (%.2f%%)\n" % (len(rmv_Sbj2Qry[1]), 100*len(rmv_Sbj2Qry[1])/float(nbSbj) ) )
+    outFile.write( "    # Number of different queries in the 'IC' case: %s (%.2f%%)\n" % (len(rmv_Qry2Sbj[1]), 100*len(rmv_Qry2Sbj[1])/float(nbQry) ) )
+    
+    outFile.write( "\nMatches with L < %i%% for subject & query:\n" % ( thresholdCoverage ) )
+    outFile.write( "    # Number of different subjects in the 'II' case: %s (%.2f%%)\n" % (len(rmv_Sbj2Qry[2]), 100*len(rmv_Sbj2Qry[2])/float(nbSbj) ) )
+    outFile.write( "    # Number of different queries in the 'II' case: %s (%.2f%%)\n" % (len(rmv_Qry2Sbj[2]), 100*len(rmv_Qry2Sbj[2])/float(nbQry) ) )
+    
+    outFile.write("\n==========================================================================\n")
+    
+    write_output( outFile, 'CC', Sbj2Qry_sup_sbjqry, dSbj2Cat, Qry2Sbj_sup_sbjqry, dQry2Cat )
+    
+    outFile.write("\n==========================================================================\n")
+    
+    write_output( outFile, 'CI', rmv_Sbj2Qry[0], dSbj2Cat, rmv_Qry2Sbj[0], dQry2Cat )
+    
+    outFile.write("\n==========================================================================\n")
+    
+    write_output( outFile, 'IC', rmv_Sbj2Qry[1], dSbj2Cat, rmv_Qry2Sbj[1], dQry2Cat )
+    
+    outFile.write("\n==========================================================================\n")
+    
+    write_output( outFile, 'II', rmv_Sbj2Qry[2], dSbj2Cat, rmv_Qry2Sbj[2], dQry2Cat )
+    
+    outFile.write("\n==========================================================================\n")
+    
+    outFile.close()
+    
+    writeSubjectCategory( dSbj2Cat )
+    writeQueryCategory( dQry2Cat )
+    
+    if verbose > 0:
+        print "END %s" % (sys.argv[0].split("/")[-1])
+        sys.stdout.flush()
+        
+    return 0
+
+#-----------------------------------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    main()
author	m-zytnicki
date	Mon, 29 Apr 2013 03:20:15 -0400
parents
children