Mercurial > repos > yufei-luo > s_mart
comparison SMART/Java/Python/ncList/Benchmark.py @ 18:94ab73e8a190 ("Uploaded")
author: m-zytnicki
date: Mon, 29 Apr 2013 03:20:15 -0400
parents: (none)
children: (none)
comparison of changesets 17:b0e8584489e6 and 18:94ab73e8a190
1 import os, os.path, random, glob, subprocess, threading, time, resource | |
2 from optparse import OptionParser | |
3 from SMART.Java.Python.misc.Progress import * | |
4 from SMART.Java.Python.getRandomRegions import RandomRegionsGenerator | |
5 from commons.core.writer.TranscriptWriter import TranscriptWriter | |
6 from SMART.Java.Python.structure.Transcript import Transcript | |
7 from commons.core.parsing.GffParser import GffParser | |
8 | |
# Tool implementations to benchmark; restore the full tuple to compare every
# strategy (binning, hashing, segment tree, FJoin, NC-List, optimized NC-List).
#TYPES = ("bin", "has", "seg", "fj", "nc", "new")
TYPES = ("new", )
11 | |
class RunCmd(threading.Thread):
    """Run a shell command in a background process, enforcing optional limits.

    The command is launched with subprocess; its PID is then located through
    'ps' and polled.  If the command exceeds the given wall-clock time
    (seconds) or memory usage (percent, as reported by 'ps u'), it is aborted
    together with its child processes.  Linux only (relies on 'ps').
    """

    def __init__(self, cmd, out, err, time, memory):
        """
        @param cmd:    shell command line to execute
        @param out:    file handle receiving the command's stdout
        @param err:    file handle receiving the command's stderr
        @param time:   maximum wall-clock time in seconds (None: no limit)
        @param memory: maximum memory usage in percent (None: no limit)
        """
        threading.Thread.__init__(self)
        self._cmd = cmd
        self._out = out
        self._err = err
        self._time = time
        self._memory = memory
        self._id = os.getpid()
        self._mem = 0.0  # highest %MEM observed so far
        self._outputFileName = "tmp_%d.out" % (self._id)

    def run(self):
        # shell=True is required since self._cmd is a full command line;
        # the command is built internally, not taken from untrusted input.
        self._p = subprocess.Popen(self._cmd, stdout = self._out, stderr = self._err, shell = True)

    def _runShellCommand(self, command):
        """Run a helper command (ps, kill, ...) and return its output as split
        lines, the header line dropped."""
        subprocess.call("%s > %s" % (command, self._outputFileName), shell = True)
        handle = open(self._outputFileName)
        try:
            data = [line.split() for line in handle.readlines()[1:] if line]
        finally:
            handle.close()
        os.remove(self._outputFileName)
        return data

    def _getPid(self):
        """Locate the PID of the launched command via 'ps'.

        @return: True when the PID was found, False after ~300 attempts.
        """
        self._pid = None
        cpt = 1
        while True:
            commandsFound = []
            for line in self._runShellCommand("ps -o pid,cmd"):
                if line[1:] == self._cmd.split(" "):
                    self._pid = int(line[0])
                # keep every command seen, for the diagnostic message below
                commandsFound.append(" ".join(line[1:]))
            if self._pid is not None:
                return True
            time.sleep(1)
            if cpt % 100 == 0:
                print("pid of '%s' not found after %d seconds. Found: %s" % (self._cmd, cpt, " --- ".join(commandsFound)))
            cpt += 1
            if cpt > 300:
                return False

    def _fetchMemory(self):
        """Update the peak memory usage of the command.

        @return: True when the memory limit is exceeded.
        """
        lines = self._runShellCommand("ps u -p %d" % (self._pid))
        for line in lines:
            # column 3 of 'ps u' output is %MEM
            self._mem = max(self._mem, float(line[3]))
        return self._mem >= self._memory

    def getMemory(self):
        """Return the peak memory usage (in percent) observed so far."""
        return self._mem

    def _abort(self):
        """Terminate the command and all of its sub-processes."""
        try:
            self._p.terminate()
        except Exception:
            # the process may already be gone
            pass
        self._killSubThreads()

    def _killSubThreads(self):
        """Kill every child process of the command, then the command itself."""
        for line in self._runShellCommand("ps --ppid %d -o pid" % (self._pid)):
            self._runShellCommand("kill %s" % (line[0]))
        self._runShellCommand("kill %s" % (self._pid))

    def go(self):
        """Start the command and poll it until it finishes or a limit is hit.

        @return: True when the command completed within the limits, else False.
        """
        startTime = time.time()
        self.run()
        while not self._getPid():
            self.run()
        while True:
            if self._time is not None and time.time() - startTime > self._time:
                print("\nCommand '%s' did not finish in time. Aborting it." % (self._cmd))
                self._abort()
                break
            if self._memory is not None and self._fetchMemory():
                print("\nCommand '%s' required too much memory (%f). Aborting it." % (self._cmd, self._mem))
                self._abort()
                break
            time.sleep(0.1)
            if self._p.poll() is not None:
                return True
        return False
99 | |
100 | |
class DataStructure(object):
    """Accumulate benchmark results indexed by (#refs, #queries, genome size, tool type)."""

    def __init__(self):
        # nested dict: nbRefs -> nbQueries -> genomeSize -> type -> [Group, ...]
        self._structure = {}

    def addData(self, data):
        """Store the result Group of one benchmark run under its parameter combination."""
        perRefs = self._structure.setdefault(data._nbRefs, {})
        perQueries = perRefs.setdefault(data._nbQueries, {})
        perSize = perQueries.setdefault(data._genomeSize, {})
        perSize.setdefault(data._type, []).append(data._group)

    def export(self):
        """Return all collected results as a tab-separated table.

        One header line, then one line per run; tools with no recorded run
        (e.g. aborted) get a row of "NA" values.
        """
        outputString = "#refs\t#queries\tgenome size\ttype\t# written\t# overlaps\tbuild t.\trun t.\tmem\n"
        for nbRefs in sorted(self._structure.keys()):
            for nbQueries in sorted(self._structure[nbRefs].keys()):
                for genomeSize in sorted(self._structure[nbRefs][nbQueries].keys()):
                    for type in TYPES:
                        if type not in self._structure[nbRefs][nbQueries][genomeSize]:
                            # 9 fields, matching the 9-column header
                            # (the original emitted a spurious 10th "NA")
                            outputString += "NA\tNA\tNA\t%s\tNA\tNA\tNA\tNA\tNA\n" % (type)
                        else:
                            for group in self._structure[nbRefs][nbQueries][genomeSize][type]:
                                outputString += "%d\t%d\t%d\t%s\t%d\t%d\t%f\t%f\t%f\n" % (nbRefs, nbQueries, genomeSize, type, group._nbOutputs, group._nbOverlaps, group._buildTime, group._runTime, group._mem)
        return outputString
128 | |
129 | |
class Data(object):
    """Parameters of a single benchmark run together with its measured results."""

    def __init__(self, type, buildTime, runTime, mem, nbRefs, nbQueries, nbOutputs, nbOverlaps, genomeSize):
        # the parameter combination identifying this run
        self._type = type
        self._genomeSize = genomeSize
        self._nbQueries = nbQueries
        self._nbRefs = nbRefs
        # the measured results, held by a Group
        self._group = Group(nbOutputs, nbOverlaps, buildTime, runTime, mem)

    def checkConsistency(self, data):
        """Check that this run and another one produced the same outputs."""
        return self._group.checkConsistency(data._group)
140 | |
141 | |
class Group(object):
    """Result of one tool run: output counts, timings and peak memory."""

    def __init__(self, nbOutputs, nbOverlaps, buildTime, runTime, mem):
        self._nbOutputs = nbOutputs
        self._nbOverlaps = nbOverlaps
        self._buildTime = buildTime
        self._runTime = runTime
        self._mem = mem

    def checkConsistency(self, group):
        """Return True when both runs wrote the same number of outputs and overlaps.

        A run that was aborted (buildTime is "NA") is never reported inconsistent.
        """
        if "NA" in (self._buildTime, group._buildTime):
            return True
        sameOutputs = self._nbOutputs == group._nbOutputs
        sameOverlaps = self._nbOverlaps == group._nbOverlaps
        return sameOutputs and sameOverlaps
154 | |
155 | |
class Benchmark(object):
    """Benchmark interval-overlap tools on randomly generated GFF3 data sets.

    For each combination of #references, #queries ratio, genome-size ratio and
    replicate, random intervals are generated, every tool in TYPES is run on
    them (under optional time/memory limits), its trace file is parsed, and
    the results are collected in a DataStructure then written to a
    tab-separated output file.
    """

    def __init__(self, verbosity = 1):
        """
        @param verbosity: trace level
        """
        self._verbosity = verbosity
        self._checkEnvironmentVariable()
        # script implementing each strategy listed in TYPES
        smartPath = os.environ["SMARTPATH"]
        self._toolName = {"bin": os.path.join(smartPath, "ncList", "FindOverlapsWithSeveralIntervalsBin.py"),
                          "has": os.path.join(smartPath, "ncList", "FindOverlapsWithSeveralIntervalsHashBin.py"),
                          "seg": os.path.join(smartPath, "ncList", "FindOverlapsWithSeveralIntervalsBinSegment.py"),
                          "fj":  os.path.join(smartPath, "ncList", "FindOverlapsFJoin.py"),
                          "nc":  os.path.join(smartPath, "ncList", "FindOverlapsWithSeveralIntervals.py"),
                          "new": os.path.join(smartPath, "FindOverlapsOptim.py")}
        self._structure = DataStructure()
        self._pid = os.getpid()
        self._count = 0       # counter used to build unique temporary file names
        self._time = None     # time limit per tool run, in seconds
        self._memory = None   # memory limit per tool run, in percent

    def _checkEnvironmentVariable(self):
        """Raise when SMARTPATH, the S-MART installation directory, is not set."""
        if "SMARTPATH" not in os.environ:
            raise Exception("'SMARTPATH' is not set. Please set it to '<installation-direction>/S-mart/Java/Python'.")

    def _createTmpFileName(self, name, extension):
        """Return a new unique temporary file name tagged with this process' PID."""
        self._count += 1
        return "tmp_%d_%d_%s.%s" % (self._pid, self._count, name, extension)

    def _dumpAndReturn(self, fileName, exception):
        """Print the content of a trace file that failed to parse, then re-raise."""
        handle = open(fileName)
        try:
            print("Error in parsing file '%s':" % (fileName))
            for line in handle:
                print(line.strip())
        finally:
            handle.close()
        print("Command is: '%s'" % (self._command))
        raise exception

    def setNbReferences(self, nbReferences):
        """Set the list of reference-set sizes to benchmark."""
        self._nbReferences = nbReferences

    def setNbQueries(self, nbQueries):
        """Set the query sizes, as factors of the number of references."""
        self._nbQueries = nbQueries

    def setGenomeSizes(self, nbGenomeSizes):
        """Set the genome sizes, as factors of the number of references."""
        self._nbGenomeSizes = nbGenomeSizes

    def setNbReplicates(self, nbReplicates):
        """Set the number of replicates per parameter combination."""
        self._nbReplicates = nbReplicates

    def setChromosomeName(self, chromosome):
        """Set the chromosome name used for the generated intervals."""
        self._chromosomeName = chromosome

    def setSizes(self, minSize, maxSize):
        """Set the minimum and maximum sizes of the generated intervals."""
        self._minSize = minSize
        self._maxSize = maxSize

    def setOutputFileName(self, fileName):
        """Set the name of the tab-separated result file."""
        self._outputFileName = fileName

    def setLimits(self, time, memory):
        """Set per-run limits: time in seconds, memory in percent (None: no limit)."""
        self._time = time
        self._memory = memory

    def _generateIntervals(self, nbElements, genomeSize):
        """Generate a GFF3 file of random intervals and return its name."""
        fileName = self._createTmpFileName("intervals", "gff3")
        iRR = RandomRegionsGenerator(0)
        iRR.setMinSize(self._minSize)
        iRR.setMaxSize(self._maxSize)
        iRR.setGenomeSize(genomeSize)
        iRR.setChromosomeName(self._chromosomeName)
        iRR.setStrands(False)
        iRR.setNumber(nbElements)
        iRR.setOutputFile(fileName)
        iRR.run()
        return fileName

    def _startTool(self, type, refFileName, queryFileName, nbReferences, nbQueries, genomeSize):
        """Run one tool under the time/memory limits.

        @return: the name of the trace file on success, False on failure/abort.
        """
        outputFileName = self._createTmpFileName("output", "gff3")
        outFileName = self._createTmpFileName("out", "out")
        errFileName = self._createTmpFileName("err", "err")
        outHandle = open(outFileName, "w")
        errHandle = open(errFileName, "w")
        self._command = "python %s -i %s -f gff3 -j %s -g gff3 -o %s -v 3" % (self._toolName[type], queryFileName, refFileName, outputFileName)
        thread = RunCmd(self._command, outHandle, errHandle, self._time, self._memory)
        over = thread.go()
        self._mem = thread.getMemory()
        if os.path.exists(outputFileName):
            os.remove(outputFileName)
        outHandle.close()
        errHandle.close()
        # read the error stream once and reuse it (the original read the file twice)
        errHandle = open(errFileName)
        try:
            errData = errHandle.readlines()
        finally:
            errHandle.close()
        if errData:
            print("Error output: \n%s\n" % ("\n".join(errData)))
        if not over:
            for line in errData:
                print(line.strip())
            print("Previous process failed")
        os.remove(errFileName)
        if not over:
            return False
        return outFileName

    def _parseTrace(self, type, fileName, genomeSize):
        """Extract timings and counts from a tool's trace file.

        @return: a Data instance wrapping the parsed values.
        """
        handle = open(fileName)
        buildTime = 0
        try:
            for line in handle:
                line = line.strip()
                if "time spent" in line:
                    # last token is a duration like "12.3s": strip the unit
                    buildTime += float(line.split()[-1][:-1])
                elif "done" in line:
                    # duration is between parentheses, e.g. "done (12.3s)"
                    buildTime += float(line.split("(")[1][:-2])
                elif "# queries" in line:
                    nbQueries = int(line.split()[-1])
                elif "# refs" in line:
                    nbRefs = int(line.split()[-1])
                elif "# written" in line:
                    # e.g. "# written: N (M overlaps)"
                    nbOutputs = int(line.split()[2])
                    nbOverlaps = int(line.split()[3][1:])
                elif "time" in line:
                    runTime = float(line.split()[-1][:-1])
        except Exception as e:
            handle.close()
            self._dumpAndReturn(fileName, e)
        handle.close()
        try:
            # raises NameError (dumped below) when an expected line was missing
            return Data(type, buildTime, runTime, self._mem, nbRefs, nbQueries, nbOutputs, nbOverlaps, genomeSize)
        except Exception as e:
            self._dumpAndReturn(fileName, e)

    def _cleanTmpFiles(self, really = False):
        """Remove intermediate files; with really=True, also the generated GFF3/out files."""
        files = glob.glob("tmp_%d_*.pkl" % (self._pid)) + glob.glob("tmp_%d_*.bin" % (self._pid))
        if really:
            files += glob.glob("tmp_%d_*.gff3" % (self._pid)) + glob.glob("tmp_%d_*.out" % (self._pid))
        for fileName in files:
            os.remove(fileName)

    def run(self):
        """Run the whole benchmark and write the result table to the output file."""
        progress = Progress(len(self._nbReferences) * len(self._nbQueries) * len(self._nbGenomeSizes) * self._nbReplicates, "Processing", self._verbosity)
        for nbReferences in self._nbReferences:
            for queriesRatio in self._nbQueries:
                nbQueries = int(nbReferences * queriesRatio)
                for genomeSizeRatio in self._nbGenomeSizes:
                    genomeSize = int(nbReferences * genomeSizeRatio)
                    for replicate in range(self._nbReplicates):
                        refFileName = self._generateIntervals(nbReferences, genomeSize)
                        queryFileName = self._generateIntervals(nbQueries, genomeSize)
                        data = {}
                        for type in TYPES:
                            fileName = self._startTool(type, refFileName, queryFileName, nbReferences, nbQueries, genomeSize)
                            if not fileName:
                                # aborted run: keep an "NA" placeholder for the consistency check
                                data[type] = Data(type, "NA", "NA", "NA", nbReferences, nbQueries, "NA", "NA", genomeSize)
                            else:
                                data[type] = self._parseTrace(type, fileName, genomeSize)
                                self._structure.addData(data[type])
                                os.remove(fileName)
                            self._cleanTmpFiles()
                        self._cleanTmpFiles(True)
                        firstType = TYPES[0]
                        for type in TYPES[1:]:
                            if not data[firstType].checkConsistency(data[type]):
                                # NB: placeholder count now matches the argument count
                                # (the original format string had 12 slots for 14 values)
                                raise Exception("Outputs are not consistent.\n # outputs: %d vs %d.\n # overlaps: %d vs %d.\n %s: %f + %f (mem %f); %s: %f + %f (mem %f).\n Files are %s and %s." % (data[firstType]._group._nbOutputs, data[type]._group._nbOutputs, data[firstType]._group._nbOverlaps, data[type]._group._nbOverlaps, firstType, data[firstType]._group._buildTime, data[firstType]._group._runTime, data[firstType]._group._mem, type, data[type]._group._buildTime, data[type]._group._runTime, data[type]._group._mem, refFileName, queryFileName))
                        for fileName in (queryFileName, refFileName):
                            if os.path.exists(fileName):
                                os.remove(fileName)
                        progress.inc()
        progress.done()
        handle = open(self._outputFileName, "w")
        try:
            handle.write(self._structure.export())
        finally:
            handle.close()
327 | |
328 | |
329 | |
if __name__ == "__main__":

    # Command-line entry point: parse the options and run the benchmark.
    description = "Benchmark v1.0.2: Compare NC-List with other tools. Only work under Linux. [Category: Other]"
    parser = OptionParser(description = description)
    parser.add_option("-r", "--nbReferences", dest="nbReferences", action="store", default=None, type="string", help="number of references (list of integers separated by commas) [compulsory] [format: string]")
    parser.add_option("-q", "--nbQueries", dest="nbQueries", action="store", default=None, type="string", help="number of queries as a factor of the number of references (list of floats separated by commas) [compulsory] [format: string]")
    parser.add_option("-R", "--nbReplicates", dest="nbReplicates", action="store", default=None, type="int", help="number of replicates [compulsory] [format: int]")
    parser.add_option("-s", "--genomeSizes", dest="genomeSizes", action="store", default=None, type="string", help="genome size as a factor of the number of references (list of floats separated by commas) [compulsory] [format: string]")
    parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default="chr1", type="string", help="name of the chromosome [default: chr1] [format: string]")
    parser.add_option("-z", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size of the reads [compulsory] [format: int]")
    parser.add_option("-Z", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size of the reads [compulsory] [format: int]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in TXT format]")
    parser.add_option("-t", "--time", dest="time", action="store", default=None, type="int", help="maximum time to wait (in seconds) [default: None] [format: int]")
    parser.add_option("-m", "--memory", dest="memory", action="store", default=None, type="float", help="maximum memory usage (in %) [default: None] [format: float]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    benchmark = Benchmark(options.verbosity)
    # list(...) so the values support len() and repeated iteration in
    # Benchmark.run() (bare map() is a one-shot iterator under Python 3)
    benchmark.setNbReferences(list(map(int, options.nbReferences.split(","))))
    benchmark.setNbQueries(list(map(float, options.nbQueries.split(","))))
    benchmark.setGenomeSizes(list(map(float, options.genomeSizes.split(","))))
    benchmark.setNbReplicates(options.nbReplicates)
    benchmark.setChromosomeName(options.chromosome)
    benchmark.setSizes(options.minSize, options.maxSize)
    benchmark.setLimits(options.time, options.memory)
    benchmark.setOutputFileName(options.outputFileName)
    benchmark.run()
357 |