diff SMART/Java/Python/ncList/Benchmark.py @ 38:2c0c0a89fad7

Uploaded
author m-zytnicki
date Thu, 02 May 2013 09:56:47 -0400
parents 94ab73e8a190
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/ncList/Benchmark.py	Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,357 @@
+import os, os.path, random, glob, subprocess, threading, time, resource
+from optparse import OptionParser
+from SMART.Java.Python.misc.Progress import *
+from SMART.Java.Python.getRandomRegions import RandomRegionsGenerator
+from commons.core.writer.TranscriptWriter import TranscriptWriter
+from SMART.Java.Python.structure.Transcript import Transcript
+from commons.core.parsing.GffParser import GffParser
+
+# Identifiers of the tools to benchmark; each one must be a key of Benchmark._toolName.
+# The commented-out tuple lists every identifier registered there; only "new" is enabled.
+#TYPES = ("bin", "has", "seg", "fj", "nc", "new")
+TYPES = ("new", )
+
+class RunCmd(threading.Thread):
+	"""Start a shell command and poll it with `ps` until it ends or exceeds a limit.
+
+	cmd:    command line, started through the shell
+	out:    file handle given to the command as standard output
+	err:    file handle given to the command as standard error
+	time:   maximum elapsed time in seconds, or None for no limit
+	memory: maximum value of the 4th column of `ps u` (%MEM), or None for no limit
+	"""
+
+	def __init__(self, cmd, out, err, time, memory):
+		threading.Thread.__init__(self)
+		self._cmd    = cmd
+		self._out    = out
+		self._err    = err
+		self._time   = time
+		self._memory = memory
+		self._id     = os.getpid()
+		# Highest memory value observed so far.
+		self._mem    = 0.0
+		# Scratch file receiving the output of the `ps` and `kill` helper commands.
+		self._outputFileName = "tmp_%d.out" % (self._id)
+
+	def run(self):
+		"""Start the command without waiting for it; go() does the polling."""
+		self._p = subprocess.Popen(self._cmd, stdout = self._out, stderr = self._err, shell = True)
+
+	def _runShellCommand(self, command):
+		"""Run a helper command and return its output as lists of words.
+
+		The first output line (the header printed by `ps`) is dropped, and so are
+		blank lines. The scratch file is removed whether or not reading succeeds.
+		"""
+		subprocess.call("%s > %s" % (command, self._outputFileName), shell=True)
+		try:
+			handle = open(self._outputFileName)
+			try:
+				lines = handle.readlines()
+			finally:
+				handle.close()
+		finally:
+			if os.path.exists(self._outputFileName):
+				os.remove(self._outputFileName)
+		return [line.split() for line in lines[1:] if line.strip()]
+
+	def _getPid(self):
+		"""Look up the pid of the command in the `ps` listing.
+
+		The listing is read once per second. Returns True as soon as a process
+		whose command words equal those of self._cmd is found (its pid is stored
+		in self._pid), and False after 300 unsuccessful attempts.
+		"""
+		self._pid     = None
+		expectedWords = self._cmd.split(" ")
+		cpt           = 1
+		while True:
+			commandsFound = []
+			for line in self._runShellCommand("ps -o pid,cmd"):
+				if line[1:] == expectedWords:
+					self._pid = int(line[0])
+				commandsFound.append(" ".join(line[1:]))
+			if self._pid is not None:
+				return True
+			time.sleep(1)
+			if cpt % 100 == 0:
+				print "pid of '%s' not found after %d seconds. Found: %s" % (self._cmd, cpt, " --- ".join(commandsFound))
+			cpt += 1
+			if cpt > 300:
+				return False
+
+	def _fetchMemory(self):
+		"""Update the memory peak; return True when it reaches the memory limit.
+
+		Returns False when `ps` reports nothing for the pid (e.g. the process
+		has already ended).
+		"""
+		lines = self._runShellCommand("ps u -p %d" % (self._pid))
+		if not lines:
+			return False
+		# A single pid is queried, so only the first row is relevant.
+		self._mem = max(self._mem, float(lines[0][3]))
+		return self._mem >= self._memory
+	
+	def getMemory(self):
+		"""Return the highest memory value observed while the command was running."""
+		return self._mem
+
+	def _abort(self):
+		"""Stop the command and its children."""
+		try:
+			self._p.terminate()
+		except Exception:
+			# Best effort: the process may already be gone; the kills below still run.
+			pass
+		self._killSubThreads()
+	
+	def _killSubThreads(self):
+		"""Send `kill` to every child process of the command, then to the command itself."""
+		for line in self._runShellCommand("ps --ppid %d -o pid" % (self._pid)):
+			self._runShellCommand("kill %s" % (line[0]))
+		self._runShellCommand("kill %s" % (self._pid))
+
+	def go(self):
+		"""Run the command and watch it.
+
+		Returns True when the command ended by itself, and False when it was
+		aborted because it exceeded the time or the memory limit.
+		"""
+		startTime = time.time()
+		self.run()
+		# NOTE(review): when the pid cannot be found, the command is started again
+		# without stopping the previous attempt -- confirm this is intended.
+		while not self._getPid():
+			self.run()
+		while True:
+			if self._time is not None and time.time() - startTime > self._time:
+				print "\nCommand '%s' did not finish in time. Aborting it." % (self._cmd)
+				self._abort()
+				break
+			if self._memory is not None and self._fetchMemory():
+				print "\nCommand '%s' required too much memory (%f). Aborting it." % (self._cmd, self._mem)
+				self._abort()
+				break
+			time.sleep(0.1)
+			if self._p.poll() is not None:
+				return True
+		return False
+
+
+class DataStructure(object):
+	"""Store benchmark results indexed by #refs, #queries, genome size and tool type."""
+
+	def __init__(self):
+		# {nbRefs: {nbQueries: {genomeSize: {type: [group, ...]}}}}
+		self._structure = {}
+
+	def addData(self, data):
+		"""Append the result group of 'data' under its configuration and tool type."""
+		byQueries = self._structure.setdefault(data._nbRefs, {})
+		bySize    = byQueries.setdefault(data._nbQueries, {})
+		byType    = bySize.setdefault(data._genomeSize, {})
+		byType.setdefault(data._type, []).append(data._group)
+
+	def export(self):
+		"""Return the results as a tab-separated table, header line included.
+
+		Configurations are sorted numerically. For each stored configuration,
+		every tool type of TYPES gets one line per stored group, or a single
+		"NA" line when nothing was stored for it.
+		"""
+		rows = ["#refs\t#queries\tgenome size\ttype\t# written\t# overlaps\tbuild t.\trun t.\tmem\n"]
+		for nbRefs in sorted(self._structure.keys()):
+			byQueries = self._structure[nbRefs]
+			for nbQueries in sorted(byQueries.keys()):
+				bySize = byQueries[nbQueries]
+				for genomeSize in sorted(bySize.keys()):
+					byType = bySize[genomeSize]
+					for toolType in TYPES:
+						if toolType not in byType:
+							# Nine fields, like the header and the data rows.
+							rows.append("NA\tNA\tNA\t%s\tNA\tNA\tNA\tNA\tNA\n" % (toolType))
+							continue
+						for group in byType[toolType]:
+							rows.append("%d\t%d\t%d\t%s\t%d\t%d\t%f\t%f\t%f\n" % (nbRefs, nbQueries, genomeSize, toolType, group._nbOutputs, group._nbOverlaps, group._buildTime, group._runTime, group._mem))
+		return "".join(rows)
+
+
+class Data(object):
+	"""One benchmark measurement: the tested configuration and its result group."""
+
+	def __init__(self, type, buildTime, runTime, mem, nbRefs, nbQueries, nbOutputs, nbOverlaps, genomeSize):
+		# The measured values are bundled in a Group; the configuration stays here.
+		self._group      = Group(nbOutputs, nbOverlaps, buildTime, runTime, mem)
+		self._genomeSize = genomeSize
+		self._nbQueries  = nbQueries
+		self._nbRefs     = nbRefs
+		self._type       = type
+
+	def checkConsistency(self, data):
+		"""Tell whether the result group of 'data' is consistent with this one."""
+		otherGroup = data._group
+		return self._group.checkConsistency(otherGroup)
+
+
+class Group(object):
+	"""Values measured for one tool run: counts, timings and memory."""
+
+	def __init__(self, nbOutputs, nbOverlaps, buildTime, runTime, mem):
+		self._nbOutputs  = nbOutputs
+		self._nbOverlaps = nbOverlaps
+		self._buildTime  = buildTime
+		self._runTime    = runTime
+		self._mem        = mem
+
+	def checkConsistency(self, group):
+		"""Compare the output and overlap counts of two groups.
+
+		A group whose build time is "NA" cannot be compared, so the pair is
+		reported as consistent.
+		"""
+		for buildTime in (self._buildTime, group._buildTime):
+			if buildTime == "NA":
+				return True
+		sameOutputs  = (self._nbOutputs == group._nbOutputs)
+		sameOverlaps = (self._nbOverlaps == group._nbOverlaps)
+		return sameOutputs and sameOverlaps
+		
+
+class Benchmark(object):
+	"""Run the tools listed in TYPES on random intervals and tabulate the results.
+
+	The setters must be called before run(); run() writes the table produced
+	by DataStructure.export() to the output file.
+	"""
+
+	def __init__(self, verbosity = 1):
+		self._verbosity = verbosity
+		self._checkEnvironmentVariable()
+		smartPath  = os.environ["SMARTPATH"]
+		ncListPath = os.path.join(smartPath, "ncList")
+		# Script run for each tool identifier.
+		self._toolName  = {"bin": os.path.join(ncListPath, "FindOverlapsWithSeveralIntervalsBin.py"),
+							"has": os.path.join(ncListPath, "FindOverlapsWithSeveralIntervalsHashBin.py"),
+							"seg": os.path.join(ncListPath, "FindOverlapsWithSeveralIntervalsBinSegment.py"),
+							"fj":  os.path.join(ncListPath, "FindOverlapsFJoin.py"),
+							"nc":  os.path.join(ncListPath, "FindOverlapsWithSeveralIntervals.py"),
+							"new": os.path.join(smartPath,  "FindOverlapsOptim.py")}
+		self._structure = DataStructure()
+		self._pid       = os.getpid()
+		# Counter making every temporary file name unique.
+		self._count     = 0
+		self._time      = None
+		self._memory    = None
+
+	def _checkEnvironmentVariable(self):
+		"""Raise an Exception when the SMARTPATH environment variable is not set."""
+		if "SMARTPATH" not in os.environ:
+			raise Exception("'SMARTPATH' is not set. Please set it to '<installation-direction>/S-mart/Java/Python'.")
+
+	def _createTmpFileName(self, name, extension):
+		"""Return a new temporary file name built from the pid and a counter."""
+		self._count += 1
+		return "tmp_%d_%d_%s.%s" % (self._pid, self._count, name, extension)
+
+	def _dumpAndReturn(self, fileName, exception):
+		"""Print the content of 'fileName' and the last command, then raise 'exception'."""
+		print "Error in parsing file '%s':" % (fileName)
+		handle = open(fileName)
+		try:
+			for line in handle:
+				print line.strip()
+		finally:
+			handle.close()
+		print "Command is: '%s'" % (self._command)
+		raise exception
+
+	def setNbReferences(self, nbReferences):
+		"""Set the list of numbers of reference intervals to test."""
+		self._nbReferences = nbReferences
+
+	def setNbQueries(self, nbQueries):
+		"""Set the list of query counts, given as factors of the number of references."""
+		self._nbQueries = nbQueries
+
+	def setGenomeSizes(self, nbGenomeSizes):
+		"""Set the list of genome sizes, given as factors of the number of references."""
+		self._nbGenomeSizes = nbGenomeSizes
+
+	def setNbReplicates(self, nbReplicates):
+		"""Set the number of runs per configuration."""
+		self._nbReplicates = nbReplicates
+
+	def setChromosomeName(self, chromosome):
+		"""Set the chromosome name given to the random interval generator."""
+		self._chromosomeName = chromosome
+
+	def setSizes(self, minSize, maxSize):
+		"""Set the minimum and maximum sizes given to the random interval generator."""
+		self._minSize = minSize
+		self._maxSize = maxSize
+
+	def setOutputFileName(self, fileName):
+		"""Set the name of the file receiving the result table."""
+		self._outputFileName = fileName
+
+	def setLimits(self, time, memory):
+		"""Set the time and memory limits passed to RunCmd (None means no limit)."""
+		self._time   = time
+		self._memory = memory
+
+	def _generateIntervals(self, nbElements, genomeSize):
+		"""Write 'nbElements' random intervals to a new temporary GFF3 file; return its name."""
+		fileName = self._createTmpFileName("intervals", "gff3")
+		iRR      = RandomRegionsGenerator(0)
+		iRR.setMinSize(self._minSize)
+		iRR.setMaxSize(self._maxSize)
+		iRR.setGenomeSize(genomeSize)
+		iRR.setChromosomeName(self._chromosomeName)
+		iRR.setStrands(False)
+		iRR.setNumber(nbElements)
+		iRR.setOutputFile(fileName)
+		iRR.run()
+		return fileName
+
+	def _startTool(self, type, refFileName, queryFileName, nbReferences, nbQueries, genomeSize):
+		"""Run one tool on the two interval files.
+
+		Returns the name of the file holding the standard output of the tool,
+		or False when the tool was aborted. The memory peak reported by RunCmd
+		is stored in self._mem.
+		"""
+		outputFileName = self._createTmpFileName("output", "gff3")
+		outFileName    = self._createTmpFileName("out", "out")
+		errFileName    = self._createTmpFileName("err", "err")
+		outHandle      = open(outFileName, "w")
+		errHandle      = open(errFileName, "w")
+		try:
+			self._command = "python %s -i %s -f gff3 -j %s -g gff3 -o %s -v 3" % (self._toolName[type], queryFileName, refFileName, outputFileName)
+			thread        = RunCmd(self._command, outHandle, errHandle, self._time, self._memory)
+			over          = thread.go()
+			self._mem     = thread.getMemory()
+		finally:
+			outHandle.close()
+			errHandle.close()
+		# The overlaps themselves are not needed, only the trace on standard output.
+		if os.path.exists(outputFileName):
+			os.remove(outputFileName)
+		errHandle = open(errFileName)
+		try:
+			errData = errHandle.readlines()
+		finally:
+			errHandle.close()
+		os.remove(errFileName)
+		if errData:
+			print "Error output: \n%s\n" % ("\n".join(errData))
+		if over:
+			return outFileName
+		if errData:
+			for line in errData:
+				print line.strip()
+			print "Previous process failed"
+		return False
+
+	def _parseTrace(self, type, fileName, genomeSize):
+		"""Build a Data object from the trace a tool printed on its standard output.
+
+		Any error (unparsable line or missing value) prints the trace through
+		_dumpAndReturn, which raises the error again.
+		"""
+		buildTime = 0
+		handle    = open(fileName)
+		try:
+			try:
+				for line in handle:
+					line = line.strip()
+					# "time spent" must be tested before the more general "time".
+					if "time spent" in line:
+						buildTime += float(line.split()[-1][:-1])
+					elif "done" in line:
+						buildTime += float(line.split("(")[1][:-2])
+					elif "# queries" in line:
+						nbQueries = int(line.split()[-1])
+					elif "# refs" in line:
+						nbRefs = int(line.split()[-1])
+					elif "# written" in line:
+						nbOutputs  = int(line.split()[2])
+						nbOverlaps = int(line.split()[3][1:])
+					elif "time" in line:
+						runTime = float(line.split()[-1][:-1])
+			finally:
+				handle.close()
+			return Data(type, buildTime, runTime, self._mem, nbRefs, nbQueries, nbOutputs, nbOverlaps, genomeSize)
+		except Exception, e:
+			self._dumpAndReturn(fileName, e)
+	
+	def _cleanTmpFiles(self, really = False):
+		"""Remove the .pkl and .bin temporary files; also .gff3 and .out ones if 'really'."""
+		extensions = ["pkl", "bin"]
+		if really:
+			extensions += ["gff3", "out"]
+		for extension in extensions:
+			for fileName in glob.glob("tmp_%d_*.%s" % (self._pid, extension)):
+				os.remove(fileName)
+
+	def _checkDataConsistency(self, data, refFileName, queryFileName):
+		"""Raise an Exception when a tool disagrees with the first tool of TYPES."""
+		firstType  = TYPES[0]
+		firstGroup = data[firstType]._group
+		for otherType in TYPES[1:]:
+			if data[firstType].checkConsistency(data[otherType]):
+				continue
+			otherGroup = data[otherType]._group
+			raise Exception("Outputs are not consistent.\n  # outputs: %d vs %d.\n  # overlaps: %d vs %d.\n  %s: %f + %f (mem: %f); %s: %f + %f (mem: %f).\n  Files are %s and %s." % (firstGroup._nbOutputs, otherGroup._nbOutputs, firstGroup._nbOverlaps, otherGroup._nbOverlaps, firstType, firstGroup._buildTime, firstGroup._runTime, firstGroup._mem, otherType, otherGroup._buildTime, otherGroup._runTime, otherGroup._mem, refFileName, queryFileName))
+		
+	def run(self):
+		"""Run every configuration and replicate, then write the result table."""
+		nbRuns   = len(self._nbReferences) * len(self._nbQueries) * len(self._nbGenomeSizes) * self._nbReplicates
+		progress = Progress(nbRuns, "Processing", self._verbosity)
+		for nbReferences in self._nbReferences:
+			for queriesRatio in self._nbQueries:
+				nbQueries = int(nbReferences * queriesRatio)
+				for genomeSizeRatio in self._nbGenomeSizes:
+					genomeSize = int(nbReferences * genomeSizeRatio)
+					for replicate in range(self._nbReplicates):
+						refFileName   = self._generateIntervals(nbReferences, genomeSize)
+						queryFileName = self._generateIntervals(nbQueries, genomeSize)
+						data          = {}
+						for toolType in TYPES:
+							fileName = self._startTool(toolType, refFileName, queryFileName, nbReferences, nbQueries, genomeSize)
+							if not fileName:
+								# Aborted run: kept for the consistency check, not stored.
+								data[toolType] = Data(toolType, "NA", "NA", "NA", nbReferences, nbQueries, "NA", "NA", genomeSize)
+							else:
+								data[toolType] = self._parseTrace(toolType, fileName, genomeSize)
+								self._structure.addData(data[toolType])
+								os.remove(fileName)
+							self._cleanTmpFiles()
+						# Checked before the full clean-up so that the interval files
+						# named in the error message still exist when it is raised.
+						self._checkDataConsistency(data, refFileName, queryFileName)
+						self._cleanTmpFiles(True)
+						for fileName in (queryFileName, refFileName):
+							if os.path.exists(fileName):
+								os.remove(fileName)
+						progress.inc()
+		progress.done()
+		handle = open(self._outputFileName, "w")
+		try:
+			handle.write(self._structure.export())
+		finally:
+			handle.close()
+
+
+
+if __name__ == "__main__":
+	
+	# Command-line entry point: parse the options, configure a Benchmark and run it.
+	description = "Benchmark v1.0.2: Compare NC-List with other tools. Only work under Linux. [Category: Other]"
+	parser = OptionParser(description = description)
+	parser.add_option("-r", "--nbReferences", dest="nbReferences",   action="store", default=None,   type="string", help="number of references (list of integers separated by commas) [compulsory] [format: string]")
+	parser.add_option("-q", "--nbQueries",    dest="nbQueries",      action="store", default=None,   type="string", help="number of queries as a factor of the number of references (list of floats separated by commas) [compulsory] [format: string]")
+	parser.add_option("-R", "--nbReplicates", dest="nbReplicates",   action="store", default=None,   type="int",    help="number of replicates [compulsory] [format: int]")
+	parser.add_option("-s", "--genomeSizes",  dest="genomeSizes",    action="store", default=None,   type="string", help="genome size as a factor of the number of references (list of floats separated by commas) [compulsory] [format: string]")
+	parser.add_option("-c", "--chromosome",   dest="chromosome",     action="store", default="chr1", type="string", help="name of the chromosome [default: chr1] [format: string]")
+	parser.add_option("-z", "--minSize",      dest="minSize",        action="store", default=None,   type="int",    help="minimum size of the reads [compulsory] [format: int]")
+	parser.add_option("-Z", "--maxSize",      dest="maxSize",        action="store", default=None,   type="int",    help="maximum size of the reads [compulsory] [format: int]")
+	parser.add_option("-o", "--output",       dest="outputFileName", action="store",                 type="string", help="output file [compulsory] [format: output file in TXT format]")
+	parser.add_option("-t", "--time",         dest="time",           action="store", default=None,   type="int",    help="maximum time to wait (in seconds) [default: None] [format: int]")
+	parser.add_option("-m", "--memory",       dest="memory",         action="store", default=None,   type="float",  help="maximum memory usage (in %) [default: None] [format: float]")
+	parser.add_option("-v", "--verbosity",    dest="verbosity",      action="store", default=1,      type="int",    help="trace level [format: int]")
+	(options, args) = parser.parse_args()
+
+	# Report a missing compulsory option with a usage message instead of
+	# failing later on a None value.
+	compulsoryOptions = (("-r/--nbReferences", options.nbReferences),
+						 ("-q/--nbQueries",    options.nbQueries),
+						 ("-R/--nbReplicates", options.nbReplicates),
+						 ("-s/--genomeSizes",  options.genomeSizes),
+						 ("-z/--minSize",      options.minSize),
+						 ("-Z/--maxSize",      options.maxSize),
+						 ("-o/--output",       options.outputFileName))
+	for optionName, optionValue in compulsoryOptions:
+		if optionValue is None:
+			parser.error("option %s is compulsory" % (optionName))
+
+	benchmark = Benchmark(options.verbosity)
+	benchmark.setNbReferences(map(int, options.nbReferences.split(",")))
+	benchmark.setNbQueries(map(float, options.nbQueries.split(",")))
+	benchmark.setGenomeSizes(map(float, options.genomeSizes.split(",")))
+	benchmark.setNbReplicates(options.nbReplicates)
+	benchmark.setChromosomeName(options.chromosome)
+	benchmark.setSizes(options.minSize, options.maxSize)
+	benchmark.setLimits(options.time, options.memory)
+	benchmark.setOutputFileName(options.outputFileName)
+	benchmark.run()
+