18
|
1 import os, os.path, random, glob, subprocess, threading, time, resource
|
|
2 from optparse import OptionParser
|
|
3 from SMART.Java.Python.misc.Progress import *
|
|
4 from SMART.Java.Python.getRandomRegions import RandomRegionsGenerator
|
|
5 from commons.core.writer.TranscriptWriter import TranscriptWriter
|
|
6 from SMART.Java.Python.structure.Transcript import Transcript
|
|
7 from commons.core.parsing.GffParser import GffParser
|
|
8
|
|
# Tool flavours that can be benchmarked; only the optimised implementation
# is currently enabled.
#TYPES = ("bin", "has", "seg", "fj", "nc", "new")
TYPES = ("new", )
|
|
class RunCmd(threading.Thread):
    """Run a shell command in a subprocess and monitor it, aborting the
    command when it exceeds a wall-clock time limit or a memory limit
    (memory is the %MEM column reported by 'ps')."""

    def __init__(self, cmd, out, err, time, memory):
        """
        @param cmd:    the shell command line to run
        @param out:    open handle receiving the command's stdout
        @param err:    open handle receiving the command's stderr
        @param time:   maximum wall-clock time in seconds (None: no limit)
                       (NOTE: shadows the 'time' module inside __init__ only)
        @param memory: maximum memory usage in % (None: no limit)
        """
        threading.Thread.__init__(self)
        self._cmd = cmd
        self._out = out
        self._err = err
        self._time = time
        self._memory = memory
        self._id = os.getpid()
        # highest %MEM value observed so far for the monitored command
        self._mem = 0.0
        self._outputFileName = "tmp_%d.out" % (self._id)

    def run(self):
        # Launch the command; the monitoring is done by go(), not by waiting here.
        self._p = subprocess.Popen(self._cmd, stdout = self._out, stderr = self._err, shell = True)

    def _runShellCommand(self, command):
        """Run a helper shell command, and return its output (minus the
        header line) as a list of whitespace-split lines."""
        subprocess.call("%s > %s" % (command, self._outputFileName), shell=True)
        handle = open(self._outputFileName)
        try:
            data = [line.split() for line in handle.readlines()[1:] if line]
        finally:
            # close even if a read fails, so the tmp file can be removed later
            handle.close()
        os.remove(self._outputFileName)
        return data

    def _getPid(self):
        """Scan 'ps' until the PID of the launched command is found.
        Return True when found, False after ~300 unsuccessful attempts."""
        self._pid = None
        cpt = 1
        while True:
            commandsFound = []
            for line in self._runShellCommand("ps -o pid,cmd"):
                if line[1:] == self._cmd.split(" "):
                    self._pid = int(line[0])
                # keep every command seen, for the diagnostic message below
                commandsFound.append(" ".join(line[1:]))
            if self._pid is not None:
                return True
            time.sleep(1)
            if cpt % 100 == 0:
                print("pid of '%s' not found after %d seconds. Found: %s" % (self._cmd, cpt, " --- ".join(commandsFound)))
            cpt += 1
            if cpt > 300:
                return False

    def _fetchMemory(self):
        """Refresh the peak memory usage of the command and return True when
        it exceeds the configured limit."""
        lines = self._runShellCommand("ps u -p %d" % (self._pid))
        for line in lines:
            # column 4 of 'ps u' is %MEM
            self._mem = max(self._mem, float(line[3]))
        return self._mem >= self._memory

    def getMemory(self):
        """Return the peak memory usage (%) observed so far."""
        return self._mem

    def _abort(self):
        """Terminate the command and all its children."""
        try:
            self._p.terminate()
        except Exception:
            # the process may already be gone; children are killed below anyway
            pass
        self._killSubThreads()

    def _killSubThreads(self):
        # kill the children first, then the monitored process itself
        for line in self._runShellCommand("ps --ppid %d -o pid" % (self._pid)):
            self._runShellCommand("kill %s" % (line[0]))
        self._runShellCommand("kill %s" % (self._pid))

    def go(self):
        """Start the command and poll it every 0.1 s.
        Return True when it finished by itself, False when it was aborted
        because of the time or memory limit."""
        startTime = time.time()
        self.run()
        while not self._getPid():
            # the process disappeared before we could find its PID: relaunch
            self.run()
        while True:
            if self._time is not None and time.time() - startTime > self._time:
                print("\nCommand '%s' did not finish in time. Aborting it." % (self._cmd))
                self._abort()
                break
            if self._memory is not None and self._fetchMemory():
                print("\nCommand '%s' required too much memory (%f). Aborting it." % (self._cmd, self._mem))
                self._abort()
                break
            time.sleep(0.1)
            if self._p.poll() is not None:
                return True
        return False
class DataStructure(object):
    """Nested storage of benchmark results, indexed by
    #references -> #queries -> genome size -> tool type -> [Group, ...]."""

    def __init__(self):
        self._structure = {}

    def addData(self, data):
        """Register the Group of a Data object under its
        (#refs, #queries, genome size, tool type) coordinates."""
        perType = self._structure.setdefault(data._nbRefs, {}).setdefault(data._nbQueries, {}).setdefault(data._genomeSize, {})
        perType.setdefault(data._type, []).append(data._group)

    def export(self):
        """Return the whole structure as a tab-separated table, one line per
        replicate, with 'NA' placeholders when a tool type has no result."""
        outputString = "#refs\t#queries\tgenome size\ttype\t# written\t# overlaps\tbuild t.\trun t.\tmem\n"
        for nbRefs in sorted(self._structure.keys()):
            for nbQueries in sorted(self._structure[nbRefs].keys()):
                for genomeSize in sorted(self._structure[nbRefs][nbQueries].keys()):
                    for type in TYPES:
                        if type not in self._structure[nbRefs][nbQueries][genomeSize]:
                            # 9 fields, matching the header and the data rows
                            # (the original emitted a spurious 10th "NA")
                            outputString += "NA\tNA\tNA\t%s\tNA\tNA\tNA\tNA\tNA\n" % (type)
                        else:
                            for group in self._structure[nbRefs][nbQueries][genomeSize][type]:
                                outputString += "%d\t%d\t%d\t%s\t%d\t%d\t%f\t%f\t%f\n" % (nbRefs, nbQueries, genomeSize, type, group._nbOutputs, group._nbOverlaps, group._buildTime, group._runTime, group._mem)
        return outputString
|
|
class Data(object):
    """One benchmark measurement: the parameters of a run, plus its results
    wrapped in a Group."""

    def __init__(self, type, buildTime, runTime, mem, nbRefs, nbQueries, nbOutputs, nbOverlaps, genomeSize):
        # run coordinates
        self._type = type
        self._genomeSize = genomeSize
        self._nbQueries = nbQueries
        self._nbRefs = nbRefs
        # run results
        self._group = Group(nbOutputs, nbOverlaps, buildTime, runTime, mem)

    def checkConsistency(self, data):
        """Delegate the consistency check to the underlying groups."""
        return self._group.checkConsistency(data._group)
|
|
class Group(object):
    """Results of one tool run: element counts and resource usage."""

    def __init__(self, nbOutputs, nbOverlaps, buildTime, runTime, mem):
        # counts
        self._nbOutputs = nbOutputs
        self._nbOverlaps = nbOverlaps
        # resource usage
        self._buildTime = buildTime
        self._runTime = runTime
        self._mem = mem

    def checkConsistency(self, group):
        """Two runs agree when they wrote the same number of elements and
        found the same number of overlaps; a failed run ('NA') always
        passes the check."""
        if "NA" in (self._buildTime, group._buildTime):
            return True
        if self._nbOutputs != group._nbOutputs:
            return False
        return self._nbOverlaps == group._nbOverlaps
|
|
class Benchmark(object):
    """Generate random reference/query interval files, run each overlap tool
    on them through RunCmd, collect time and memory usage into a
    DataStructure, and check that all tools give consistent results."""

    def __init__(self, verbosity = 1):
        self._verbosity = verbosity
        self._checkEnvironmentVariable()
        # path to the script of each benchmarked tool flavour
        self._toolName = {"bin": os.path.join(os.environ["SMARTPATH"], "ncList", "FindOverlapsWithSeveralIntervalsBin.py"), \
                          "has": os.path.join(os.environ["SMARTPATH"], "ncList", "FindOverlapsWithSeveralIntervalsHashBin.py"), \
                          "seg": os.path.join(os.environ["SMARTPATH"], "ncList", "FindOverlapsWithSeveralIntervalsBinSegment.py"), \
                          "fj":  os.path.join(os.environ["SMARTPATH"], "ncList", "FindOverlapsFJoin.py"), \
                          "nc":  os.path.join(os.environ["SMARTPATH"], "ncList", "FindOverlapsWithSeveralIntervals.py"), \
                          "new": os.path.join(os.environ["SMARTPATH"], "FindOverlapsOptim.py")}
        self._structure = DataStructure()
        self._pid = os.getpid()
        self._count = 0         # counter used to make tmp file names unique
        self._time = None       # wall-clock limit per tool run (seconds)
        self._memory = None     # memory limit per tool run (%)

    def _checkEnvironmentVariable(self):
        """Raise when the SMARTPATH environment variable is not set."""
        if "SMARTPATH" not in os.environ:
            raise Exception("'SMARTPATH' is not set. Please set it to '<installation-direction>/S-mart/Java/Python'.")

    def _createTmpFileName(self, name, extension):
        """Return a new, unique temporary file name."""
        self._count += 1
        return "tmp_%d_%d_%s.%s" % (self._pid, self._count, name, extension)

    def _dumpAndReturn(self, fileName, exception):
        """Print the content of a trace file that could not be parsed, then
        re-raise the given exception."""
        handle = open(fileName)
        try:
            print("Error in parsing file '%s':" % (fileName))
            for line in handle:
                print(line.strip())
        finally:
            # the original version leaked this handle
            handle.close()
        print("Command is: '%s'" % (self._command))
        raise exception

    def setNbReferences(self, nbReferences):
        # list of reference-set sizes to benchmark
        self._nbReferences = nbReferences

    def setNbQueries(self, nbQueries):
        # list of query-set sizes, as factors of the number of references
        self._nbQueries = nbQueries

    def setGenomeSizes(self, nbGenomeSizes):
        # list of genome sizes, as factors of the number of references
        self._nbGenomeSizes = nbGenomeSizes

    def setNbReplicates(self, nbReplicates):
        self._nbReplicates = nbReplicates

    def setChromosomeName(self, chromosome):
        self._chromosomeName = chromosome

    def setSizes(self, minSize, maxSize):
        # bounds on the size of the generated intervals
        self._minSize = minSize
        self._maxSize = maxSize

    def setOutputFileName(self, fileName):
        self._outputFileName = fileName

    def setLimits(self, time, memory):
        """Set the per-run time (seconds) and memory (%) limits; None
        disables the corresponding limit."""
        self._time = time
        self._memory = memory

    def _generateIntervals(self, nbElements, genomeSize):
        """Write nbElements random intervals to a temporary GFF3 file and
        return its name."""
        fileName = self._createTmpFileName("intervals", "gff3")
        iRR = RandomRegionsGenerator(0)
        iRR.setMinSize(self._minSize)
        iRR.setMaxSize(self._maxSize)
        iRR.setGenomeSize(genomeSize)
        iRR.setChromosomeName(self._chromosomeName)
        iRR.setStrands(False)
        iRR.setNumber(nbElements)
        iRR.setOutputFile(fileName)
        iRR.run()
        return fileName

    def _startTool(self, type, refFileName, queryFileName, nbReferences, nbQueries, genomeSize):
        """Run one tool on the given files under the time/memory limits.
        Return the name of its trace file, or False when the run was
        aborted."""
        outputFileName = self._createTmpFileName("output", "gff3")
        outFileName = self._createTmpFileName("out", "out")
        errFileName = self._createTmpFileName("err", "err")
        outHandle = open(outFileName, "w")
        errHandle = open(errFileName, "w")
        self._command = "python %s -i %s -f gff3 -j %s -g gff3 -o %s -v 3" % (self._toolName[type], queryFileName, refFileName, outputFileName)
        thread = RunCmd(self._command, outHandle, errHandle, self._time, self._memory)
        over = thread.go()
        self._mem = thread.getMemory()
        # only the trace (stdout) matters; the GFF3 output is discarded
        if os.path.exists(outputFileName):
            os.remove(outputFileName)
        outHandle.close()
        errHandle.close()
        errData = open(errFileName).readlines()
        if errData:
            print("Error output: \n%s\n" % ("\n".join(errData)))
        if not over:
            # the run was aborted: dump its stderr for diagnosis
            errHandle = open(errFileName, "r")
            error = errHandle.readlines()
            errHandle.close()
            if error:
                for line in error:
                    print(line.strip())
            print("Previous process failed")
        os.remove(errFileName)
        if not over:
            return False
        return outFileName

    def _parseTrace(self, type, fileName, genomeSize):
        """Parse the trace (stdout) of a tool run and return the
        corresponding Data object; dump the trace and re-raise on any
        parsing problem."""
        handle = open(fileName)
        buildTime = 0
        try:
            for line in handle:
                line = line.strip()
                if "time spent" in line:
                    buildTime += float(line.split()[-1][:-1])
                elif "done" in line:
                    buildTime += float(line.split("(")[1][:-2])
                elif "# queries" in line:
                    nbQueries = int(line.split()[-1])
                elif "# refs" in line:
                    nbRefs = int(line.split()[-1])
                elif "# written" in line:
                    nbOutputs = int(line.split()[2])
                    nbOverlaps = int(line.split()[3][1:])
                elif "time" in line:
                    runTime = float(line.split()[-1][:-1])
        except Exception as e:
            handle.close()
            self._dumpAndReturn(fileName, e)
        handle.close()
        try:
            # a NameError here means one of the expected trace lines was missing
            return Data(type, buildTime, runTime, self._mem, nbRefs, nbQueries, nbOutputs, nbOverlaps, genomeSize)
        except Exception as e:
            self._dumpAndReturn(fileName, e)

    def _cleanTmpFiles(self, really = False):
        """Remove the temporary index files; with really=True, also remove
        the GFF3 and trace files."""
        files = glob.glob("tmp_%d_*.pkl" % (self._pid)) + glob.glob("tmp_%d_*.bin" % (self._pid))
        if really:
            files += glob.glob("tmp_%d_*.gff3" % (self._pid)) + glob.glob("tmp_%d_*.out" % (self._pid))
        for fileName in files:
            os.remove(fileName)

    def run(self):
        """Run the whole benchmark and write the result table to the output
        file."""
        progress = Progress(len(self._nbReferences) * len(self._nbQueries) * len(self._nbGenomeSizes) * self._nbReplicates, "Processing", self._verbosity)
        for nbReferences in self._nbReferences:
            for queriesRatio in self._nbQueries:
                nbQueries = int(nbReferences * queriesRatio)
                for genomeSizeRatio in self._nbGenomeSizes:
                    genomeSize = int(nbReferences * genomeSizeRatio)
                    for replicate in range(self._nbReplicates):
                        refFileName = self._generateIntervals(nbReferences, genomeSize)
                        queryFileName = self._generateIntervals(nbQueries, genomeSize)
                        data = {}
                        for type in TYPES:
                            fileName = self._startTool(type, refFileName, queryFileName, nbReferences, nbQueries, genomeSize)
                            if not fileName:
                                # aborted run: keep an 'NA' record for the consistency check
                                data[type] = Data(type, "NA", "NA", "NA", nbReferences, nbQueries, "NA", "NA", genomeSize)
                            else:
                                data[type] = self._parseTrace(type, fileName, genomeSize)
                                # only successful runs are exported (export()
                                # cannot format 'NA' groups) and only they
                                # leave a trace file to remove
                                self._structure.addData(data[type])
                                os.remove(fileName)
                            self._cleanTmpFiles()
                        self._cleanTmpFiles(True)
                        # all tools must agree with the first one
                        firstType = TYPES[0]
                        for type in TYPES[1:]:
                            if not data[firstType].checkConsistency(data[type]):
                                # format string fixed: the original had fewer placeholders than arguments
                                raise Exception("Outputs are not consistent.\n # outputs: %d vs %d.\n # overlaps: %d vs %d.\n %s: %f + %f (mem %f); %s: %f + %f (mem %f).\n Files are %s and %s." % (data[firstType]._group._nbOutputs, data[type]._group._nbOutputs, data[firstType]._group._nbOverlaps, data[type]._group._nbOverlaps, firstType, data[firstType]._group._buildTime, data[firstType]._group._runTime, data[firstType]._group._mem, type, data[type]._group._buildTime, data[type]._group._runTime, data[type]._group._mem, refFileName, queryFileName))
                        for fileName in (queryFileName, refFileName):
                            if os.path.exists(fileName):
                                os.remove(fileName)
                        progress.inc()
        progress.done()
        handle = open(self._outputFileName, "w")
        handle.write(self._structure.export())
        handle.close()
|
if __name__ == "__main__":

    # Command-line front-end: parse the options, configure a Benchmark
    # object accordingly, and run it.
    description = "Benchmark v1.0.2: Compare NC-List with other tools. Only work under Linux. [Category: Other]"
    optionParser = OptionParser(description = description)
    optionParser.add_option("-r", "--nbReferences", dest="nbReferences", action="store", default=None, type="string", help="number of references (list of integers separated by commas) [compulsory] [format: string]")
    optionParser.add_option("-q", "--nbQueries", dest="nbQueries", action="store", default=None, type="string", help="number of queries as a factor of the number of references (list of floats separated by commas) [compulsory] [format: string]")
    optionParser.add_option("-R", "--nbReplicates", dest="nbReplicates", action="store", default=None, type="int", help="number of replicates [compulsory] [format: int]")
    optionParser.add_option("-s", "--genomeSizes", dest="genomeSizes", action="store", default=None, type="string", help="genome size as a factor of the number of references (list of floats separated by commas) [compulsory] [format: string]")
    optionParser.add_option("-c", "--chromosome", dest="chromosome", action="store", default="chr1", type="string", help="name of the chromosome [default: chr1] [format: string]")
    optionParser.add_option("-z", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size of the reads [compulsory] [format: int]")
    optionParser.add_option("-Z", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size of the reads [compulsory] [format: int]")
    optionParser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in TXT format]")
    optionParser.add_option("-t", "--time", dest="time", action="store", default=None, type="int", help="maximum time to wait (in seconds) [default: None] [format: int]")
    optionParser.add_option("-m", "--memory", dest="memory", action="store", default=None, type="float", help="maximum memory usage (in %) [default: None] [format: float]")
    optionParser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    options = optionParser.parse_args()[0]

    # Comma-separated option strings are converted to lists of numbers.
    bench = Benchmark(options.verbosity)
    bench.setNbReferences(list(map(int, options.nbReferences.split(","))))
    bench.setNbQueries(list(map(float, options.nbQueries.split(","))))
    bench.setGenomeSizes(list(map(float, options.genomeSizes.split(","))))
    bench.setNbReplicates(options.nbReplicates)
    bench.setChromosomeName(options.chromosome)
    bench.setSizes(options.minSize, options.maxSize)
    bench.setLimits(options.time, options.memory)
    bench.setOutputFileName(options.outputFileName)
    bench.run()