import os, os.path, random, glob, subprocess, threading, time, resource
from optparse import OptionParser
from SMART.Java.Python.misc.Progress import *
from SMART.Java.Python.getRandomRegions import RandomRegionsGenerator
from commons.core.writer.TranscriptWriter import TranscriptWriter
from SMART.Java.Python.structure.Transcript import Transcript
from commons.core.parsing.GffParser import GffParser

#TYPES = ("bin", "has", "seg", "fj", "nc", "new")
TYPES = ("new", )

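# RunCmd launches a shell command in a subprocess, looks up its PID with 'ps',
# and polls it until it terminates or until it exceeds the wall-clock time limit
# (in seconds) or the memory limit (a percentage, the %MEM column of 'ps u');
# when a limit is exceeded, the command and its child processes are killed.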
class RunCmd(threading.Thread):
    def __init__(self, cmd, out, err, time, memory):
        threading.Thread.__init__(self)
        self._cmd = cmd
        self._out = out
        self._err = err
        self._time = time
        self._memory = memory
        self._id = os.getpid()
        self._mem = 0.0
        self._outputFileName = "tmp_%d.out" % (self._id)

    def run(self):
        self._p = subprocess.Popen(self._cmd, stdout = self._out, stderr = self._err, shell = True)

    def _runShellCommand(self, command):
        # Run a command, capture its output in a temporary file, and return the
        # split lines (the header line is skipped).
        subprocess.call("%s > %s" % (command, self._outputFileName), shell=True)
        handle = open(self._outputFileName)
        data = [line.split() for line in handle.readlines()[1:] if line]
        handle.close()
        os.remove(self._outputFileName)
        return data

    def _getPid(self):
        # Find the PID of the launched command in the output of 'ps'; give up after 300 tries.
        self._pid = None
        cpt = 1
        while True:
            commandsFound = []
            for line in self._runShellCommand("ps -o pid,cmd"):
                if line[1:] == self._cmd.split(" "):
                    self._pid = int(line[0])
                commandsFound.append(" ".join(line[1:]))
            if self._pid != None:
                return True
            time.sleep(1)
            if cpt % 100 == 0:
                print "pid of '%s' not found after %d seconds. Found: %s" % (self._cmd, cpt, " --- ".join(commandsFound))
            cpt += 1
            if cpt > 300:
                return False

    def _fetchMemory(self):
        # Update the peak memory usage (%MEM column of 'ps u') and report whether it exceeds the limit.
        lines = self._runShellCommand("ps u -p %d" % (self._pid))
        for line in lines:
            self._mem = max(self._mem, float(line[3]))
            return self._mem >= self._memory
        return False

    def getMemory(self):
        return self._mem

    def _abort(self):
        try:
            self._p.terminate()
        except Exception:
            pass
        self._killSubThreads()

    def _killSubThreads(self):
        for line in self._runShellCommand("ps --ppid %d -o pid" % (self._pid)):
            self._runShellCommand("kill %s" % (line[0]))
        self._runShellCommand("kill %s" % (self._pid))

    def go(self):
        # Start the command and poll it until it finishes or exceeds the time or memory limit.
        startTime = time.time()
        self.run()
        while not self._getPid():
            self.run()
        while True:
            if self._time != None and time.time() - startTime > self._time:
                print "\nCommand '%s' did not finish in time. Aborting it." % (self._cmd)
                self._abort()
                break
            if self._memory != None and self._fetchMemory():
                print "\nCommand '%s' required too much memory (%f). Aborting it." % (self._cmd, self._mem)
                self._abort()
                break
            time.sleep(0.1)
            if self._p.poll() != None:
                return True
        return False

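# The results are stored in a three-level hierarchy:
#   DataStructure: nested dictionaries keyed by #references, #queries, genome size, and tool type;
#   Data:          one benchmarked run, identified by its parameters;
#   Group:         the measurements of that run (# written, # overlaps, build time, run time, memory).
# DataStructure.export() flattens the hierarchy into the tab-separated table written at the end.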
class DataStructure(object):
    def __init__(self):
        self._structure = {}

    def addData(self, data):
        if data._nbRefs not in self._structure:
            self._structure[data._nbRefs] = {}
        if data._nbQueries not in self._structure[data._nbRefs]:
            self._structure[data._nbRefs][data._nbQueries] = {}
        if data._genomeSize not in self._structure[data._nbRefs][data._nbQueries]:
            self._structure[data._nbRefs][data._nbQueries][data._genomeSize] = {}
        if data._type not in self._structure[data._nbRefs][data._nbQueries][data._genomeSize]:
            self._structure[data._nbRefs][data._nbQueries][data._genomeSize][data._type] = []
        self._structure[data._nbRefs][data._nbQueries][data._genomeSize][data._type].append(data._group)

    def export(self):
        outputString = "#refs\t#queries\tgenome size\ttype\t# written\t# overlaps\tbuild t.\trun t.\tmem\n"
        for nbRefs in sorted(self._structure.keys()):
            for nbQueries in sorted(self._structure[nbRefs].keys()):
                for genomeSize in sorted(self._structure[nbRefs][nbQueries].keys()):
                    for type in TYPES:
                        if type not in self._structure[nbRefs][nbQueries][genomeSize]:
                            outputString += "NA\tNA\tNA\t%s\tNA\tNA\tNA\tNA\tNA\n" % (type)
                        else:
                            for group in self._structure[nbRefs][nbQueries][genomeSize][type]:
                                # The measured fields may be "NA" when a run was aborted, so format them with %s.
                                outputString += "%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n" % (nbRefs, nbQueries, genomeSize, type, group._nbOutputs, group._nbOverlaps, group._buildTime, group._runTime, group._mem)
        return outputString


class Data(object):
    def __init__(self, type, buildTime, runTime, mem, nbRefs, nbQueries, nbOutputs, nbOverlaps, genomeSize):
        self._type = type
        self._nbRefs = nbRefs
        self._nbQueries = nbQueries
        self._genomeSize = genomeSize
        self._group = Group(nbOutputs, nbOverlaps, buildTime, runTime, mem)

    def checkConsistency(self, data):
        return self._group.checkConsistency(data._group)


class Group(object):
    def __init__(self, nbOutputs, nbOverlaps, buildTime, runTime, mem):
        self._buildTime = buildTime
        self._runTime = runTime
        self._mem = mem
        self._nbOutputs = nbOutputs
        self._nbOverlaps = nbOverlaps

    def checkConsistency(self, group):
        if (self._buildTime == "NA" or group._buildTime == "NA"):
            return True
        return (self._nbOutputs == group._nbOutputs and self._nbOverlaps == group._nbOverlaps)


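# Benchmark drives the comparison: it generates random reference and query intervals
# in GFF3 format, runs each tool listed in TYPES on them through RunCmd, parses the
# trace each tool prints, checks that all tools report the same numbers of written
# elements and overlaps, and exports the collected measurements to a text file.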
class Benchmark(object):

    def __init__(self, verbosity = 1):
        self._verbosity = verbosity
        self._checkEnvironmentVariable()
        self._toolName = {"bin": os.path.join(os.environ["SMARTPATH"], "ncList", "FindOverlapsWithSeveralIntervalsBin.py"),
                          "has": os.path.join(os.environ["SMARTPATH"], "ncList", "FindOverlapsWithSeveralIntervalsHashBin.py"),
                          "seg": os.path.join(os.environ["SMARTPATH"], "ncList", "FindOverlapsWithSeveralIntervalsBinSegment.py"),
                          "fj":  os.path.join(os.environ["SMARTPATH"], "ncList", "FindOverlapsFJoin.py"),
                          "nc":  os.path.join(os.environ["SMARTPATH"], "ncList", "FindOverlapsWithSeveralIntervals.py"),
                          "new": os.path.join(os.environ["SMARTPATH"], "FindOverlapsOptim.py")}
        self._structure = DataStructure()
        self._pid = os.getpid()
        self._count = 0
        self._time = None
        self._memory = None

    def _checkEnvironmentVariable(self):
        if "SMARTPATH" not in os.environ:
            raise Exception("'SMARTPATH' is not set. Please set it to '<installation-directory>/S-mart/Java/Python'.")

    def _createTmpFileName(self, name, extension):
        self._count += 1
        return "tmp_%d_%d_%s.%s" % (self._pid, self._count, name, extension)

    def _dumpAndReturn(self, fileName, exception):
        handle = open(fileName)
        print "Error in parsing file '%s':" % (fileName)
        for line in handle:
            print line.strip()
        print "Command is: '%s'" % (self._command)
        raise exception

    def setNbReferences(self, nbReferences):
        self._nbReferences = nbReferences

    def setNbQueries(self, nbQueries):
        self._nbQueries = nbQueries

    def setGenomeSizes(self, nbGenomeSizes):
        self._nbGenomeSizes = nbGenomeSizes

    def setNbReplicates(self, nbReplicates):
        self._nbReplicates = nbReplicates

    def setChromosomeName(self, chromosome):
        self._chromosomeName = chromosome

    def setSizes(self, minSize, maxSize):
        self._minSize = minSize
        self._maxSize = maxSize

    def setOutputFileName(self, fileName):
        self._outputFileName = fileName

    def setLimits(self, time, memory):
        self._time = time
        self._memory = memory

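    # Each data set is a fresh GFF3 file of random intervals produced by the S-MART
    # RandomRegionsGenerator, constrained by the configured minimum/maximum interval
    # size, genome size, and chromosome name.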
    def _generateIntervals(self, nbElements, genomeSize):
        fileName = self._createTmpFileName("intervals", "gff3")
        iRR = RandomRegionsGenerator(0)
        iRR.setMinSize(self._minSize)
        iRR.setMaxSize(self._maxSize)
        iRR.setGenomeSize(genomeSize)
        iRR.setChromosomeName(self._chromosomeName)
        iRR.setStrands(False)
        iRR.setNumber(nbElements)
        iRR.setOutputFile(fileName)
        iRR.run()
        return fileName

    def _startTool(self, type, refFileName, queryFileName, nbReferences, nbQueries, genomeSize):
        # Run one overlap tool and return the name of the file containing its trace,
        # or False if the run was aborted (time or memory limit exceeded).
        outputFileName = self._createTmpFileName("output", "gff3")
        outFileName = self._createTmpFileName("out", "out")
        errFileName = self._createTmpFileName("err", "err")
        outHandle = open(outFileName, "w")
        errHandle = open(errFileName, "w")
        self._command = "python %s -i %s -f gff3 -j %s -g gff3 -o %s -v 3" % (self._toolName[type], queryFileName, refFileName, outputFileName)
        thread = RunCmd(self._command, outHandle, errHandle, self._time, self._memory)
        over = thread.go()
        self._mem = thread.getMemory()
        if os.path.exists(outputFileName):
            os.remove(outputFileName)
        outHandle.close()
        errHandle.close()
        errData = open(errFileName).readlines()
        if errData:
            print "Error output: \n%s\n" % ("\n".join(errData))
        if not over:
            print "Previous process failed"
        os.remove(errFileName)
        if not over:
            return False
        return outFileName

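    # _parseTrace relies on the trace format of the overlap tools: it accumulates the
    # values of the "time spent" and "done" lines into the build time, reads the counts
    # from the "# queries", "# refs" and "# written" lines, and takes the remaining
    # "time" line as the run time.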
    def _parseTrace(self, type, fileName, genomeSize):
        handle = open(fileName)
        buildTime = 0
        try:
            for line in handle:
                line = line.strip()
                if "time spent" in line:
                    buildTime += float(line.split()[-1][:-1])
                elif "done" in line:
                    buildTime += float(line.split("(")[1][:-2])
                elif "# queries" in line:
                    nbQueries = int(line.split()[-1])
                elif "# refs" in line:
                    nbRefs = int(line.split()[-1])
                elif "# written" in line:
                    nbOutputs = int(line.split()[2])
                    nbOverlaps = int(line.split()[3][1:])
                elif "time" in line:
                    runTime = float(line.split()[-1][:-1])
        except Exception, e:
            handle.close()
            self._dumpAndReturn(fileName, e)
        handle.close()
        try:
            return Data(type, buildTime, runTime, self._mem, nbRefs, nbQueries, nbOutputs, nbOverlaps, genomeSize)
        except Exception, e:
            self._dumpAndReturn(fileName, e)

    def _cleanTmpFiles(self, really = False):
        files = glob.glob("tmp_%d_*.pkl" % (self._pid)) + glob.glob("tmp_%d_*.bin" % (self._pid))
        if really:
            files += glob.glob("tmp_%d_*.gff3" % (self._pid)) + glob.glob("tmp_%d_*.out" % (self._pid))
        for fileName in files:
            os.remove(fileName)

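    # run() performs the nested parameter sweep. The query and genome-size options are
    # ratios: for a reference set of size r, the benchmark uses int(r * queryRatio)
    # queries on a genome of int(r * genomeSizeRatio) bases, for each replicate.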
    def run(self):
        progress = Progress(len(self._nbReferences) * len(self._nbQueries) * len(self._nbGenomeSizes) * self._nbReplicates, "Processing", self._verbosity)
        for nbReferences in self._nbReferences:
            for queriesRatio in self._nbQueries:
                nbQueries = int(nbReferences * queriesRatio)
                for genomeSizeRatio in self._nbGenomeSizes:
                    genomeSize = int(nbReferences * genomeSizeRatio)
                    for replicate in range(self._nbReplicates):
                        refFileName = self._generateIntervals(nbReferences, genomeSize)
                        queryFileName = self._generateIntervals(nbQueries, genomeSize)
                        data = {}
                        for type in TYPES:
                            fileName = self._startTool(type, refFileName, queryFileName, nbReferences, nbQueries, genomeSize)
                            if not fileName:
                                # The run was aborted: record it with "NA" measurements.
                                data[type] = Data(type, "NA", "NA", "NA", nbReferences, nbQueries, "NA", "NA", genomeSize)
                            else:
                                data[type] = self._parseTrace(type, fileName, genomeSize)
                            self._structure.addData(data[type])
                            if fileName:
                                os.remove(fileName)
                            self._cleanTmpFiles()
                        self._cleanTmpFiles(True)
                        firstType = TYPES[0]
                        for type in TYPES[1:]:
                            if not data[firstType].checkConsistency(data[type]):
                                raise Exception("Outputs are not consistent.\n # outputs: %d vs %d.\n # overlaps: %d vs %d.\n %s: %f + %f (mem: %f); %s: %f + %f (mem: %f).\n Files are %s and %s." % (data[firstType]._group._nbOutputs, data[type]._group._nbOutputs, data[firstType]._group._nbOverlaps, data[type]._group._nbOverlaps, firstType, data[firstType]._group._buildTime, data[firstType]._group._runTime, data[firstType]._group._mem, type, data[type]._group._buildTime, data[type]._group._runTime, data[type]._group._mem, refFileName, queryFileName))
                        for fileName in (queryFileName, refFileName):
                            if os.path.exists(fileName):
                                os.remove(fileName)
                        progress.inc()
        progress.done()
        handle = open(self._outputFileName, "w")
        handle.write(self._structure.export())
        handle.close()


if __name__ == "__main__":

    description = "Benchmark v1.0.2: Compare NC-List with other tools. Only works under Linux. [Category: Other]"
    parser = OptionParser(description = description)
    parser.add_option("-r", "--nbReferences", dest="nbReferences", action="store", default=None, type="string", help="number of references (list of integers separated by commas) [compulsory] [format: string]")
    parser.add_option("-q", "--nbQueries", dest="nbQueries", action="store", default=None, type="string", help="number of queries, as a factor of the number of references (list of floats separated by commas) [compulsory] [format: string]")
    parser.add_option("-R", "--nbReplicates", dest="nbReplicates", action="store", default=None, type="int", help="number of replicates [compulsory] [format: int]")
    parser.add_option("-s", "--genomeSizes", dest="genomeSizes", action="store", default=None, type="string", help="genome size, as a factor of the number of references (list of floats separated by commas) [compulsory] [format: string]")
    parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default="chr1", type="string", help="name of the chromosome [default: chr1] [format: string]")
    parser.add_option("-z", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size of the reads [compulsory] [format: int]")
    parser.add_option("-Z", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size of the reads [compulsory] [format: int]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in TXT format]")
    parser.add_option("-t", "--time", dest="time", action="store", default=None, type="int", help="maximum time to wait (in seconds) [default: None] [format: int]")
    parser.add_option("-m", "--memory", dest="memory", action="store", default=None, type="float", help="maximum memory usage (in %) [default: None] [format: float]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    benchmark = Benchmark(options.verbosity)
    benchmark.setNbReferences(map(int, options.nbReferences.split(",")))
    benchmark.setNbQueries(map(float, options.nbQueries.split(",")))
    benchmark.setGenomeSizes(map(float, options.genomeSizes.split(",")))
    benchmark.setNbReplicates(options.nbReplicates)
    benchmark.setChromosomeName(options.chromosome)
    benchmark.setSizes(options.minSize, options.maxSize)
    benchmark.setLimits(options.time, options.memory)
    benchmark.setOutputFileName(options.outputFileName)
    benchmark.run()

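# Example invocation (a sketch: the script file name and the option values below are
# illustrative, not prescribed by S-MART):
#
#   python benchmarkOverlaps.py -r 10000,100000 -q 0.5,1.0 -s 10.0 -R 3 \
#       -z 20 -Z 1000 -o benchmark.txt -t 3600 -m 50.0 -v 1
#
# This would compare the tools in TYPES on 10,000 and 100,000 random reference
# intervals, with half as many and as many queries, a genome ten times larger than
# the number of references, three replicates, interval sizes between 20 and 1000 bp,
# a one-hour time limit and a 50% memory limit, writing the tab-separated results
# to benchmark.txt.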