Mercurial > repos > yufei-luo > s_mart
diff SMART/Java/Python/ncList/Benchmark.py @ 36:44d5973c188c
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 15:02:29 -0400 |
parents | 94ab73e8a190 |
children |
line wrap: on
line diff
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/ncList/Benchmark.py Tue Apr 30 15:02:29 2013 -0400
# @@ -0,0 +1,357 @@
"""Benchmark the overlap-finding tools on randomly generated intervals.

For every combination of number of references, query ratio, genome-size ratio
and replicate, two random GFF3 interval files are generated, each tool listed
in TYPES is run on them as an external command (optionally bounded in time and
memory), its trace is parsed, and the collected figures are written to a
tab-separated report.
"""
import os, os.path, random, glob, subprocess, threading, time, resource
from optparse import OptionParser
from SMART.Java.Python.misc.Progress import *
from SMART.Java.Python.getRandomRegions import RandomRegionsGenerator
from commons.core.writer.TranscriptWriter import TranscriptWriter
from SMART.Java.Python.structure.Transcript import Transcript
from commons.core.parsing.GffParser import GffParser

# Tools that are benchmarked; each key must exist in Benchmark._toolName.
# All known keys: ("bin", "has", "seg", "fj", "nc", "new")
TYPES = ("new", )


class RunCmd(threading.Thread):
    """Run a shell command and watch its run time and memory usage.

    Although the class derives from threading.Thread, go() calls run()
    directly, so the command is launched from the calling thread.
    """

    def __init__(self, cmd, out, err, time, memory):
        """Store the command and its limits.

        cmd    -- command line, as a single string
        out    -- file object receiving the standard output of the command
        err    -- file object receiving the standard error of the command
        time   -- maximum run time in seconds, or None for no limit
        memory -- maximum value of the memory column reported by `ps u`,
                  or None for no limit
        """
        threading.Thread.__init__(self)
        self._cmd = cmd
        self._out = out
        self._err = err
        self._time = time
        self._memory = memory
        self._id = os.getpid()
        self._mem = 0.0
        # Scratch file used to capture the output of the `ps`/`kill` helpers.
        self._outputFileName = "tmp_%d.out" % (self._id)

    def run(self):
        """Launch the command through the shell, without waiting for it."""
        self._p = subprocess.Popen(self._cmd, stdout=self._out, stderr=self._err, shell=True)

    def _runShellCommand(self, command):
        """Run a helper command and return its output as split lines.

        The first output line (the header of `ps`) is skipped and blank
        lines are dropped, so every returned item is a non-empty list of
        fields.
        """
        subprocess.call("%s > %s" % (command, self._outputFileName), shell=True)
        with open(self._outputFileName) as handle:
            # "if line.strip()": a raw line always holds at least "\n", so
            # testing the raw line would never filter anything out.
            data = [line.split() for line in handle.readlines()[1:] if line.strip()]
        os.remove(self._outputFileName)
        return data

    def _getPid(self):
        """Look for the process whose command line is exactly self._cmd.

        Scans `ps -o pid,cmd` once per second, for at most 300 attempts.
        Returns True when the PID is found (it is stored in self._pid), and
        False when the search times out or when the launched process has
        already terminated.
        """
        self._pid = None
        expectedCommand = self._cmd.split(" ")
        cpt = 1
        while True:
            commandsFound = []
            for fields in self._runShellCommand("ps -o pid,cmd"):
                if fields[1:] == expectedCommand:
                    self._pid = int(fields[0])
                commandsFound.append(" ".join(fields[1:]))
            if self._pid is not None:
                return True
            if self._p.poll() is not None:
                # The command is already over: it will never show up in `ps`,
                # so there is no point in waiting for the timeout.
                return False
            time.sleep(1)
            if cpt % 100 == 0:
                print("pid of '%s' not found after %d seconds. Found: %s" % (self._cmd, cpt, " --- ".join(commandsFound)))
            cpt += 1
            if cpt > 300:
                return False

    def _fetchMemory(self):
        """Update the memory peak and tell whether the limit is reached.

        Reads the fourth column of `ps u -p <pid>` (presumably %MEM; the
        command-line help documents the limit as a percentage). Returns False
        when `ps` reports nothing for the PID. Must only be called when a
        memory limit is set.
        """
        lines = self._runShellCommand("ps u -p %d" % (self._pid))
        if not lines:
            return False
        for fields in lines:
            self._mem = max(self._mem, float(fields[3]))
        return self._mem >= self._memory

    def getMemory(self):
        """Return the highest memory value observed so far."""
        return self._mem

    def _abort(self):
        """Stop the command and its child processes (best effort)."""
        try:
            self._p.terminate()
        except OSError:
            # The process may already be gone; the kills below still run.
            pass
        self._killSubThreads()

    def _killSubThreads(self):
        """Kill the children of the watched process, then the process itself."""
        for fields in self._runShellCommand("ps --ppid %d -o pid" % (self._pid)):
            self._runShellCommand("kill %s" % (fields[0]))
        self._runShellCommand("kill %s" % (self._pid))

    def go(self):
        """Run the command and wait until it ends or exceeds a limit.

        Returns True when the command terminated by itself, and False when it
        was aborted because of the time or memory limit.
        """
        startTime = time.time()
        self.run()
        while not self._getPid():
            if self._p.poll() is not None:
                # The command ended before its PID could be observed:
                # it is over, do not launch it a second time.
                return True
            self.run()
        while True:
            if self._time is not None and time.time() - startTime > self._time:
                print("\nCommand '%s' did not finish in time. Aborting it." % (self._cmd))
                self._abort()
                break
            if self._memory is not None and self._fetchMemory():
                print("\nCommand '%s' required too much memory (%f). Aborting it." % (self._cmd, self._mem))
                self._abort()
                break
            time.sleep(0.1)
            if self._p.poll() is not None:
                return True
        return False


class DataStructure(object):
    """Results indexed by #refs, #queries, genome size and tool type."""

    def __init__(self):
        # {nbRefs: {nbQueries: {genomeSize: {type: [Group, ...]}}}}
        self._structure = {}

    def addData(self, data):
        """Register the Group of a Data under the parameters of that Data."""
        perQueries = self._structure.setdefault(data._nbRefs, {})
        perGenomeSize = perQueries.setdefault(data._nbQueries, {})
        perType = perGenomeSize.setdefault(data._genomeSize, {})
        perType.setdefault(data._type, []).append(data._group)

    def export(self):
        """Return the tab-separated report: a header, then one line per run.

        For every stored parameter combination, a tool of TYPES without any
        result gets a single line filled with "NA".
        """
        rows = ["#refs\t#queries\tgenome size\ttype\t# written\t# overlaps\tbuild t.\trun t.\tmem\n"]
        for nbRefs in sorted(self._structure.keys()):
            perQueries = self._structure[nbRefs]
            for nbQueries in sorted(perQueries.keys()):
                perGenomeSize = perQueries[nbQueries]
                for genomeSize in sorted(perGenomeSize.keys()):
                    perType = perGenomeSize[genomeSize]
                    for toolType in TYPES:
                        if toolType not in perType:
                            # Nine columns, like the header and the data rows.
                            rows.append("NA\tNA\tNA\t%s\tNA\tNA\tNA\tNA\tNA\n" % (toolType))
                            continue
                        for group in perType[toolType]:
                            rows.append("%d\t%d\t%d\t%s\t%d\t%d\t%f\t%f\t%f\n" % (nbRefs, nbQueries, genomeSize, toolType, group._nbOutputs, group._nbOverlaps, group._buildTime, group._runTime, group._mem))
        return "".join(rows)


class Data(object):
    """Result of one tool run, together with the parameters of the run."""

    def __init__(self, type, buildTime, runTime, mem, nbRefs, nbQueries, nbOutputs, nbOverlaps, genomeSize):
        self._type = type
        self._nbRefs = nbRefs
        self._nbQueries = nbQueries
        self._genomeSize = genomeSize
        self._group = Group(nbOutputs, nbOverlaps, buildTime, runTime, mem)

    def checkConsistency(self, data):
        """Tell whether this run and another Data report the same counts."""
        return self._group.checkConsistency(data._group)


class Group(object):
    """Measurements of one tool run; a field is "NA" when the run failed."""

    def __init__(self, nbOutputs, nbOverlaps, buildTime, runTime, mem):
        self._buildTime = buildTime
        self._runTime = runTime
        self._mem = mem
        self._nbOutputs = nbOutputs
        self._nbOverlaps = nbOverlaps

    def checkConsistency(self, group):
        """Compare the output and overlap counts with those of another Group.

        A group whose build time is "NA" is considered consistent with
        anything.
        """
        if self._buildTime == "NA" or group._buildTime == "NA":
            return True
        return self._nbOutputs == group._nbOutputs and self._nbOverlaps == group._nbOverlaps


class Benchmark(object):
    """Generate random data sets, run the tools on them and report figures."""

    def __init__(self, verbosity = 1):
        """Locate the tools under $SMARTPATH.

        Raises an Exception when the SMARTPATH environment variable is not
        set.
        """
        self._verbosity = verbosity
        self._checkEnvironmentVariable()
        smartPath = os.environ["SMARTPATH"]
        self._toolName = {"bin": os.path.join(smartPath, "ncList", "FindOverlapsWithSeveralIntervalsBin.py"),
                          "has": os.path.join(smartPath, "ncList", "FindOverlapsWithSeveralIntervalsHashBin.py"),
                          "seg": os.path.join(smartPath, "ncList", "FindOverlapsWithSeveralIntervalsBinSegment.py"),
                          "fj": os.path.join(smartPath, "ncList", "FindOverlapsFJoin.py"),
                          "nc": os.path.join(smartPath, "ncList", "FindOverlapsWithSeveralIntervals.py"),
                          "new": os.path.join(smartPath, "FindOverlapsOptim.py")}
        self._structure = DataStructure()
        self._pid = os.getpid()
        self._count = 0
        self._time = None
        self._memory = None

    def _checkEnvironmentVariable(self):
        """Raise an Exception when SMARTPATH is not in the environment."""
        if "SMARTPATH" not in os.environ:
            raise Exception("'SMARTPATH' is not set. Please set it to '<installation-direction>/S-mart/Java/Python'.")

    def _createTmpFileName(self, name, extension):
        """Return a new temporary file name, unique within this instance."""
        self._count += 1
        return "tmp_%d_%d_%s.%s" % (self._pid, self._count, name, extension)

    def _dumpAndReturn(self, fileName, exception):
        """Print the content of a trace file and the last command, then raise.

        Despite its name, this method never returns: it always raises the
        given exception.
        """
        print("Error in parsing file '%s':" % (fileName))
        with open(fileName) as handle:
            for line in handle:
                print(line.strip())
        print("Command is: '%s'" % (self._command))
        raise exception

    def setNbReferences(self, nbReferences):
        """Set the list of numbers of reference intervals."""
        self._nbReferences = nbReferences

    def setNbQueries(self, nbQueries):
        """Set the list of query ratios (#queries = #references * ratio)."""
        self._nbQueries = nbQueries

    def setGenomeSizes(self, nbGenomeSizes):
        """Set the list of genome-size ratios (size = #references * ratio)."""
        self._nbGenomeSizes = nbGenomeSizes

    def setNbReplicates(self, nbReplicates):
        """Set the number of replicates run for each parameter combination."""
        self._nbReplicates = nbReplicates

    def setChromosomeName(self, chromosome):
        """Set the chromosome name given to the random region generator."""
        self._chromosomeName = chromosome

    def setSizes(self, minSize, maxSize):
        """Set the minimum and maximum sizes of the generated intervals."""
        self._minSize = minSize
        self._maxSize = maxSize

    def setOutputFileName(self, fileName):
        """Set the name of the file that receives the report."""
        self._outputFileName = fileName

    def setLimits(self, time, memory):
        """Set the time and memory limits of each tool run (None: no limit)."""
        self._time = time
        self._memory = memory

    def _generateIntervals(self, nbElements, genomeSize):
        """Write nbElements random intervals to a new GFF3 file.

        Returns the name of the file.
        """
        fileName = self._createTmpFileName("intervals", "gff3")
        iRR = RandomRegionsGenerator(0)
        iRR.setMinSize(self._minSize)
        iRR.setMaxSize(self._maxSize)
        iRR.setGenomeSize(genomeSize)
        iRR.setChromosomeName(self._chromosomeName)
        iRR.setStrands(False)
        iRR.setNumber(nbElements)
        iRR.setOutputFile(fileName)
        iRR.run()
        return fileName

    def _startTool(self, type, refFileName, queryFileName, nbReferences, nbQueries, genomeSize):
        """Run one tool on a query file and a reference file.

        Returns the name of the file holding the standard output of the tool,
        or False when the run was aborted because of the time or memory
        limit. The result file of the tool and its error file are deleted;
        the highest memory value observed is kept in self._mem.
        """
        outputFileName = self._createTmpFileName("output", "gff3")
        outFileName = self._createTmpFileName("out", "out")
        errFileName = self._createTmpFileName("err", "err")
        self._command = "python %s -i %s -f gff3 -j %s -g gff3 -o %s -v 3" % (self._toolName[type], queryFileName, refFileName, outputFileName)
        outHandle = open(outFileName, "w")
        errHandle = open(errFileName, "w")
        try:
            thread = RunCmd(self._command, outHandle, errHandle, self._time, self._memory)
            over = thread.go()
            self._mem = thread.getMemory()
        finally:
            # Release both handles even if the watcher fails.
            outHandle.close()
            errHandle.close()
        if os.path.exists(outputFileName):
            os.remove(outputFileName)
        with open(errFileName) as errHandle:
            errData = errHandle.readlines()
        if errData:
            print("Error output: \n%s\n" % ("\n".join(errData)))
        if not over:
            for line in errData:
                print(line.strip())
            print("Previous process failed")
        os.remove(errFileName)
        if not over:
            return False
        return outFileName

    def _parseTrace(self, type, fileName, genomeSize):
        """Build a Data from the trace that a tool wrote on standard output.

        Lines containing "time spent" or "done" add up to the build time; the
        lines containing "# queries", "# refs", "# written" and (otherwise)
        "time" provide the remaining fields. When a value cannot be parsed or
        a field is missing, the trace is printed and an exception is raised.
        """
        buildTime = 0
        fields = {"nbQueries": None, "nbRefs": None, "nbOutputs": None, "nbOverlaps": None, "runTime": None}
        error = None
        with open(fileName) as handle:
            try:
                for line in handle:
                    line = line.strip()
                    # Order matters: "time spent" must be tested before "time".
                    if "time spent" in line:
                        buildTime += float(line.split()[-1][:-1])
                    elif "done" in line:
                        buildTime += float(line.split("(")[1][:-2])
                    elif "# queries" in line:
                        fields["nbQueries"] = int(line.split()[-1])
                    elif "# refs" in line:
                        fields["nbRefs"] = int(line.split()[-1])
                    elif "# written" in line:
                        fields["nbOutputs"] = int(line.split()[2])
                        fields["nbOverlaps"] = int(line.split()[3][1:])
                    elif "time" in line:
                        fields["runTime"] = float(line.split()[-1][:-1])
            except (ValueError, IndexError) as e:
                error = e
        if error is None:
            missing = sorted(name for name in fields if fields[name] is None)
            if missing:
                error = Exception("Cannot find %s in trace '%s'." % (", ".join(missing), fileName))
        if error is not None:
            # Always raises.
            self._dumpAndReturn(fileName, error)
        return Data(type, buildTime, fields["runTime"], self._mem, fields["nbRefs"], fields["nbQueries"], fields["nbOutputs"], fields["nbOverlaps"], genomeSize)

    def _cleanTmpFiles(self, really = False):
        """Remove the temporary .pkl and .bin files of this process.

        With really set, the temporary .gff3 and .out files are removed too.
        """
        extensions = ["pkl", "bin"]
        if really:
            extensions += ["gff3", "out"]
        for extension in extensions:
            for fileName in glob.glob("tmp_%d_*.%s" % (self._pid, extension)):
                os.remove(fileName)

    def _checkConsistency(self, data, refFileName, queryFileName):
        """Raise an Exception when two tools disagree on the same data set.

        data maps each tool type of TYPES to the Data of its run; every tool
        is compared with the first one.
        """
        firstType = TYPES[0]
        first = data[firstType]._group
        for toolType in TYPES[1:]:
            if data[firstType].checkConsistency(data[toolType]):
                continue
            other = data[toolType]._group
            # One argument per placeholder: passing extra values (such as the
            # memory figures) makes the formatting itself fail.
            raise Exception("Outputs are not consistent.\n # outputs: %d vs %d.\n # overlaps: %d vs %d.\n %s: %f + %f; %s: %f + %f.\n Files are %s and %s." % (first._nbOutputs, other._nbOutputs, first._nbOverlaps, other._nbOverlaps, firstType, first._buildTime, first._runTime, toolType, other._buildTime, other._runTime, refFileName, queryFileName))

    def _runReplicate(self, nbReferences, nbQueries, genomeSize):
        """Generate one data set, run every tool of TYPES on it and clean up.

        Only the runs that went to their end are added to the results.
        """
        refFileName = self._generateIntervals(nbReferences, genomeSize)
        queryFileName = self._generateIntervals(nbQueries, genomeSize)
        data = {}
        for toolType in TYPES:
            fileName = self._startTool(toolType, refFileName, queryFileName, nbReferences, nbQueries, genomeSize)
            if not fileName:
                data[toolType] = Data(toolType, "NA", "NA", "NA", nbReferences, nbQueries, "NA", "NA", genomeSize)
            else:
                data[toolType] = self._parseTrace(toolType, fileName, genomeSize)
                self._structure.addData(data[toolType])
                os.remove(fileName)
            self._cleanTmpFiles()
        # Check before the final clean-up: the error message points to the
        # interval files, which must still exist when it is raised.
        self._checkConsistency(data, refFileName, queryFileName)
        self._cleanTmpFiles(True)
        for fileName in (queryFileName, refFileName):
            if os.path.exists(fileName):
                os.remove(fileName)

    def run(self):
        """Run the whole benchmark and write the report.

        Every setter of the class must have been called beforehand.
        """
        nbRuns = len(self._nbReferences) * len(self._nbQueries) * len(self._nbGenomeSizes) * self._nbReplicates
        progress = Progress(nbRuns, "Processing", self._verbosity)
        for nbReferences in self._nbReferences:
            for queriesRatio in self._nbQueries:
                nbQueries = int(nbReferences * queriesRatio)
                for genomeSizeRatio in self._nbGenomeSizes:
                    genomeSize = int(nbReferences * genomeSizeRatio)
                    for replicate in range(self._nbReplicates):
                        self._runReplicate(nbReferences, nbQueries, genomeSize)
                        progress.inc()
        progress.done()
        with open(self._outputFileName, "w") as handle:
            handle.write(self._structure.export())



if __name__ == "__main__":

    description = "Benchmark v1.0.2: Compare NC-List with other tools. Only work under Linux. [Category: Other]"
    parser = OptionParser(description = description)
    parser.add_option("-r", "--nbReferences", dest="nbReferences", action="store", default=None, type="string", help="number of references (list of integers separated by commas) [compulsory] [format: string]")
    parser.add_option("-q", "--nbQueries", dest="nbQueries", action="store", default=None, type="string", help="number of queries as a factor of the number of references (list of floats separated by commas) [compulsory] [format: string]")
    parser.add_option("-R", "--nbReplicates", dest="nbReplicates", action="store", default=None, type="int", help="number of replicates [compulsory] [format: int]")
    parser.add_option("-s", "--genomeSizes", dest="genomeSizes", action="store", default=None, type="string", help="genome size as a factor of the number of references (list of floats separated by commas) [compulsory] [format: string]")
    parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default="chr1", type="string", help="name of the chromosome [default: chr1] [format: string]")
    parser.add_option("-z", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size of the reads [compulsory] [format: int]")
    parser.add_option("-Z", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size of the reads [compulsory] [format: int]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in TXT format]")
    parser.add_option("-t", "--time", dest="time", action="store", default=None, type="int", help="maximum time to wait (in seconds) [default: None] [format: int]")
    parser.add_option("-m", "--memory", dest="memory", action="store", default=None, type="float", help="maximum memory usage (in %) [default: None] [format: float]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # Report a missing compulsory option as a usage error instead of failing
    # later on a None value.
    compulsoryOptions = ("nbReferences", "nbQueries", "nbReplicates", "genomeSizes", "minSize", "maxSize", "outputFileName")
    missingOptions = [name for name in compulsoryOptions if getattr(options, name) is None]
    if missingOptions:
        parser.error("missing compulsory option(s): %s" % (", ".join(missingOptions)))

    benchmark = Benchmark(options.verbosity)
    # Real lists are needed: Benchmark.run() calls len() on them.
    benchmark.setNbReferences([int(value) for value in options.nbReferences.split(",")])
    benchmark.setNbQueries([float(value) for value in options.nbQueries.split(",")])
    benchmark.setGenomeSizes([float(value) for value in options.genomeSizes.split(",")])
    benchmark.setNbReplicates(options.nbReplicates)
    benchmark.setChromosomeName(options.chromosome)
    benchmark.setSizes(options.minSize, options.maxSize)
    benchmark.setLimits(options.time, options.memory)
    benchmark.setOutputFileName(options.outputFileName)
    benchmark.run()