Mercurial > repos > yufei-luo > s_mart
diff SMART/Java/Python/misc/Utils.py @ 38:2c0c0a89fad7
Uploaded
author | m-zytnicki |
---|---|
date | Thu, 02 May 2013 09:56:47 -0400 |
parents | 769e306b7933 |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/misc/Utils.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,271 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +"""Some useful functions""" + +import sys, os +import random +import subprocess + + +def writeFile(fileName, content): + """ + Write the content of a file + """ + handle = open(fileName, "w") + handle.write(content) + handle.close() + +def sumOfLists(list1, list2): + """ + Element by element sum + """ + if len(list1) != len(list2): + sys.exit("Cannot sum list whose sizes are different!") + return [list1[i] + list2[i] for i in range(len(list1))] + + +def protectBackslashes(string): + """ + Protect the backslashes in a path by adding another backslash + """ + return string.replace("\\", "\\\\") + + +def getHammingDistance(string1, string2): + """ + Compute Hamming distance between two strings + """ + if len(string1) != len(string2): + raise Exception("Error, size of %s and %s differ" % (string1, string2)) + return sum(ch1 != ch2 for ch1, ch2 in zip(string1, string2)) + + +def getLevenshteinDistance(string1, string2): + """ + Compute Levenshtein distance between two strings + """ + if len(string1) < len(string2): + return getLevenshteinDistance(string2, string1) + if not string1: + return len(string2) + previousRow = xrange(len(string2) + 1) + for i, c1 in enumerate(string1): + currentRow = [i + 1] + for j, c2 in enumerate(string2): + insertions = previousRow[j + 1] + 1 + deletions = currentRow[j] + 1 + substitutions = previousRow[j] + (c1 != c2) + currentRow.append(min(insertions, deletions, substitutions)) + previousRow = currentRow + return previousRow[-1] + + +def getMinAvgMedMax(values): + """ + Get some stats about a dict + @param values: a distribution (the value being the number of occurrences of the key) + @type values: dict int to int + @return: a tuple + """ + minValues = min(values.keys()) + maxValues = max(values.keys()) + sumValues = sum([value * values[value] for value in values]) + nbValues = sum(values.values()) + allValues = [] + for key in values: + for i in range(values[key]): + allValues.append(key) + sortedValues = sorted(allValues) + sorted(values.values()) + if (nbValues % 2 == 0): + medValues = (sortedValues[nbValues / 2 - 1] + sortedValues[nbValues / 2]) / 2.0 + else: + medValues = sortedValues[(nbValues + 1) / 2 - 1] + return (minValues, float(sumValues) / nbValues, medValues, maxValues) + + +def xor(value1, value2): + """ + Logical xor + @param value1: a value + @type value1: anything + @param value2: a value + @type value2: anything + """ + return bool(value1) != bool(value2) + + +def diff(fileName1, fileName2): + """ + Compare two files + @param fileName1: a file name + @type fileName1: string + @param fileName2: another file name + @type fileName2: string + @return: None if the files are the same, a string otherwise + """ + handle1 = open(fileName1) + lines1 = handle1.readlines() + handle2 = open(fileName2) + lines2 = handle2.readlines() + if len(lines1) != len(lines2): + print "Sizes of files differ (%d != %d)" % (len(lines1), len(lines2)) + return False + for i in xrange(len(lines1)): + if lines1[i] != lines2[i]: + print "Line %d differ ('%s' != '%s')" % (i, lines1[i].strip(), lines2[i].strip()) + return False + return True + + +def binomialCoefficient(a, b): + """ + Compute cumulated product from a to b + @param a: a value + @type a: int + @param b: a value + @type b: int + """ + if a > b / 2: + a = b-a + p = 1.0 + for i in range(b-a+1, b+1): + p *= i + q = 1.0 + for i in range(1, a+1): + q *= i + return p / q + + +memory = {} + +# def fisherExactPValue(a, b, c, d): +# """ +# P-value of Fisher exact test for 2x2 contingency table +# """ +# if (a, b, c, d) in memory: +# return memory[(a, b, c, d)] + +# n = a + b + c + d +# i1 = binomialCoefficient(a, a+b) +# i2 = binomialCoefficient(c, a+c) +# i3 = binomialCoefficient(c+d, n) +# pValue = i1 * i2 / i3 + +# memory[(a, b, c, d)] = pValue + +# return pValue + + +def fisherExactPValue(a, b, c, d): + if (a, b, c, d) in memory: + return memory[(a, b, c, d)] + + scriptFileName = "tmpScript-%d.R" % (random.randint(0, 10000)) + rScript = open(scriptFileName, "w") + rScript.write("data = matrix(c(%d, %d, %d, %d), nr=2)\n" % (a, b, c, d)) + rScript.write("fisher.test(data)\n") + #rScript.write("chisq.test(data)\n") + rScript.close() + + rCommand = "R" + if "SMARTRPATH" in os.environ: + rCommand = os.environ["SMARTRPATH"] + command = "\"%s\" CMD BATCH %s" % (rCommand, scriptFileName) + status = subprocess.call(command, shell=True) + + if status != 0: + sys.exit("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status)) + + outputRFileName = "%sout" % (scriptFileName) + outputRFile = open(outputRFileName) + pValue = None + pValueTag = "p-value " + for line in outputRFile: + line = line.strip() + if line == "": continue + for splittedLine in line.split(","): + splittedLine = splittedLine.strip() + if splittedLine.startswith(pValueTag): + pValue = float(splittedLine.split()[-1]) + break + + if pValue == None: + sys.exit("Problem with the cannot find p-value! File %s, values are: %d, %d, %d, %d" % (scriptFileName, a, b, c, d)) + + os.remove(scriptFileName) + os.remove(outputRFileName) + + memory[(a, b, c, d)] = pValue + + return pValue + + +def fisherExactPValueBulk(list): + + scriptFileName = "tmpScript-%d.R" % (random.randint(0, 10000)) + rScript = open(scriptFileName, "w") + for element in list: + rScript.write("fisher.test(matrix(c(%d, %d, %d, %d), nr=2))$p.value\n" % (int(element[0]), int(element[1]), int(element[2]), int(element[3]))) + rScript.close() + + rCommand = "R" + if "SMARTRPATH" in os.environ: + rCommand = os.environ["SMARTRPATH"] + command = "\"%s\" CMD BATCH %s" % (rCommand, scriptFileName) + status = subprocess.call(command, shell=True) + + if status != 0: + sys.exit("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status)) + + outputRFileName = "%sout" % (scriptFileName) + outputRFile = open(outputRFileName) + pValue = None + pValueTag = "[1] " + results = {} + cpt = 0 + for line in outputRFile: + line = line.strip() + if line == "": continue + if line.startswith(pValueTag): + pValue = float(line.split()[-1]) + results[list[cpt][0:2]] = pValue + cpt += 1 + + if pValue == None: + sys.exit("Problem with the cannot find p-value!") + if cpt != len(list): + sys.exit("Error in the number of p-values computed by R in file '%s'!" % (scriptFileName)) + + os.remove(scriptFileName) + os.remove(outputRFileName) + + return results +