annotate commons/core/sql/TableJobAdaptator.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
1 # Copyright INRA (Institut National de la Recherche Agronomique)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
2 # http://www.inra.fr
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
3 # http://urgi.versailles.inra.fr
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
4 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
5 # This software is governed by the CeCILL license under French law and
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
6 # abiding by the rules of distribution of free software. You can use,
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
7 # modify and/ or redistribute the software under the terms of the CeCILL
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
8 # license as circulated by CEA, CNRS and INRIA at the following URL
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
9 # "http://www.cecill.info".
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
10 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
11 # As a counterpart to the access to the source code and rights to copy,
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
12 # modify and redistribute granted by the license, users are provided only
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
13 # with a limited warranty and the software's author, the holder of the
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
14 # economic rights, and the successive licensors have only limited
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
15 # liability.
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
16 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
17 # In this respect, the user's attention is drawn to the risks associated
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
18 # with loading, using, modifying and/or developing or reproducing the
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
19 # software by the user in light of its specific status of free software,
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
20 # that may mean that it is complicated to manipulate, and that also
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
21 # therefore means that it is reserved for developers and experienced
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
22 # professionals having in-depth computer knowledge. Users are therefore
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
23 # encouraged to load and test the software's suitability as regards their
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
24 # requirements in conditions enabling the security of their systems and/or
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
25 # data to be ensured and, more generally, to use and operate it in the
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
26 # same conditions as regards security.
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
27 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
28 # The fact that you are presently reading this means that you have had
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
29 # knowledge of the CeCILL license and that you accept its terms.
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
30
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
31
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
32 import os
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
33 import time
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
34 import datetime
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
35 import sys
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
36 from commons.core.sql.Job import Job
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
37 from commons.core.sql.TableAdaptator import TableAdaptator
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
38
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
39 ## Methods for Job persistence
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
40 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
41 class TableJobAdaptator(TableAdaptator):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
42
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
43 ## Record a job
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
44 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
45 # @param job Job instance with the job informations
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
46 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
47 def recordJob(self, job):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
48 self.removeJob(job)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
49 sqlCmd = "INSERT INTO %s" % self._table
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
50 sqlCmd += " VALUES ("
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
51 sqlCmd += " \"%s\"," % job.jobid
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
52 sqlCmd += " \"%s\"," % job.jobname
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
53 sqlCmd += " \"%s\"," % job.groupid
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
54 sqlCmd += " \"%s\"," % job.launcher
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
55 sqlCmd += " \"%s\"," % job.queue
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
56 sqlCmd += " \"%s\"," % job.lResources
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
57 sqlCmd += " \"waiting\","
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
58 sqlCmd += " \"%s\"," % time.strftime("%Y-%m-%d %H:%M:%S")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
59 sqlCmd += " \"?\" );"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
60 self._iDb.execute(sqlCmd)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
61
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
62
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
63 ## Remove a job from the job table
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
64 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
65 # @param job: job instance to remove
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
66 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
67 def removeJob(self, job):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
68 qry = "DELETE FROM %s" % self._table
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
69 qry += " WHERE groupid='%s'" % job.groupid
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
70 qry += " AND jobname='%s'" % job.jobname
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
71 qry += " AND launcher='%s';" % job.launcher
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
72 self._iDb.execute(qry)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
73
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
74
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
75 ## Set the jobid of a job with the id of SGE
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
76 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
77 # @param job job instance
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
78 # @param jobid integer
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
79 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
80 def updateJobIdInDB(self, job, jobid):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
81 #TODO: check if only one job will be updated
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
82 qry = "UPDATE %s" % self._table
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
83 qry += " SET jobid='%i'" % int(jobid)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
84 qry += " WHERE jobname='%s'" % job.jobname
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
85 qry += " AND groupid='%s'" % job.groupid
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
86 qry += " AND launcher='%s';" % job.launcher
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
87 self._iDb.execute(qry)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
88
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
89
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
90 ## Get a job status
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
91 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
92 # @param job: a Job instance with the job informations
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
93 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
94 def getJobStatus(self, job):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
95 if job.jobid != 0 and job.jobname == "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
96 job.jobname = job.jobid
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
97 job.jobid = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
98 qry = "SELECT status FROM %s" % self._table
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
99 qry += " WHERE groupid='%s'" % job.groupid
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
100 qry += " AND jobname='%s'" % job.jobname
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
101 qry += " AND launcher='%s';" % job.launcher
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
102 self._iDb.execute(qry)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
103 res = self._iDb.fetchall()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
104 if len(res) > 1:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
105 sys.stderr.write("ERROR while getting job status: non-unique jobs\n")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
106 sys.stderr.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
107 sys.exit(1)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
108 if res == None or len(res) == 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
109 return "unknown"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
110 return res[0][0]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
111
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
112
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
113 ## Change a job status
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
114 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
115 # @param job: a Job instance with the job informations
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
116 # @param status: the new status (waiting,finished,error)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
117 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
118 def changeJobStatus(self, job, status):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
119 sqlCmd = "UPDATE %s" % self._table
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
120 sqlCmd += " SET status='%s'" % status
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
121 sqlCmd += ", node='%s'" % job.node
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
122 sqlCmd += " WHERE groupid='%s'" % job.groupid
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
123 sqlCmd += " AND jobname='%s'" % job.jobname
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
124 sqlCmd += " AND launcher='%s';" % job.launcher
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
125 self._iDb.execute(sqlCmd)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
126
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
127
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
128 ## Get the number of jobs belonging to the desired groupid with the desired status.
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
129 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
130 # @param groupid string a group identifier to record related job series
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
131 # @param status string job status (waiting, running, finished, error)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
132 # @return int
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
133 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
134 def getCountStatus(self, groupid, status):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
135 qry = "SELECT count(jobname) FROM %s" % self._table
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
136 qry += " WHERE groupid='%s'" % groupid
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
137 qry += " AND status='%s';" % status
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
138 self._iDb.execute(qry)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
139 res = self._iDb.fetchall()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
140 return int(res[0][0])
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
141
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
142
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
143 ## Clean all job from a job group
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
144 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
145 # @param groupid: a group identifier to record related job series
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
146 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
147 def cleanJobGroup(self, groupid):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
148 qry = "DELETE FROM %s WHERE groupid='%s';" % (self._table, groupid)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
149 self._iDb.execute(qry)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
150
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
151
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
152 ## Check if there is unfinished job from a job group.
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
153 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
154 # @param groupid string a group identifier to record related job series
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
155 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
156 def hasUnfinishedJob(self, groupid):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
157 qry = "SELECT * FROM %s" % self._table
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
158 qry += " WHERE groupid='%s'" % groupid
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
159 qry += " and status!='finished';"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
160 self._iDb.execute(qry)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
161 res = self._iDb.fetchall()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
162 if len(res) == 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
163 return False
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
164 return True
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
165
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
166
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
167 ## Wait job finished status from a job group.
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
168 # Job are re-launched if error (max. 3 times)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
169 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
170 # @param groupid string a group identifier to record related job series
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
171 # @param checkInterval integer time laps in seconds between two checks (default = 5)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
172 # @param maxRelaunch integer max nb of times a job in error is relaunch before exiting (default = 3)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
173 # @param exitIfTooManyErrors boolean exit if a job is still in error above maxRelaunch (default = True)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
174 # @param timeOutPerJob integer max nb of seconds after which one tests if a job is still in SGE or not (default = 60*60=1h)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
175 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
176 def waitJobGroup(self, groupid, checkInterval=5, maxRelaunch=3, exitIfTooManyErrors=True, timeOutPerJob=60*60):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
177 dJob2Err = {}
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
178
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
179 # retrieve the total number of jobs belonging to the desired groupid
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
180 qry = "SELECT count(jobname) FROM %s WHERE groupid='%s';" % (self._table, groupid)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
181 self._iDb.execute(qry)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
182 totalNbJobs = int(self._iDb.fetchall()[0][0])
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
183
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
184 nbTimeOuts = 0
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
185
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
186 while True:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
187 time.sleep(checkInterval)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
188 # retrieve the finished jobs and stop if all jobs are finished
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
189 nbFinishedJobs = self.getCountStatus(groupid, "finished")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
190 if nbFinishedJobs == totalNbJobs:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
191 break
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
192
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
193 # retrieve the jobs in error and relaunch them if they are in error (max. 'maxRelaunch' times)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
194 qry = "SELECT * FROM %s" % self._table
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
195 qry += " WHERE groupid='%s'" % groupid
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
196 qry += " AND status ='error';"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
197 self._iDb.execute(qry)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
198 lJobsInError = self._iDb.fetchall()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
199 for job in lJobsInError:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
200 jobName = job[1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
201 if not dJob2Err.has_key(jobName):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
202 dJob2Err[jobName] = 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
203 if dJob2Err[jobName] < maxRelaunch:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
204 print "job '%s' in error, re-submitting (%i)" % (job[1], dJob2Err[job[1]])
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
205 sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
206 lResources = job[5].replace("[", "").replace("]", "").replace("'", "").split(", ")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
207 newJob = Job(jobname=jobName, groupid=job[2], launcherFile=job[3], queue=job[4], lResources=lResources)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
208 self.submitJob(newJob)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
209 dJob2Err[jobName] += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
210 else:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
211 dJob2Err[jobName] += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
212 cmd = "job '%s' in permanent error (>%i)" % (jobName, maxRelaunch)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
213 cmd += "\ngroupid = %s" % groupid
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
214 cmd += "\nnb of jobs = %i" % totalNbJobs
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
215 cmd += "\nnb of finished jobs = %i" % self.getCountStatus(groupid, "finished")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
216 cmd += "\nnb of waiting jobs = %i" % self.getCountStatus(groupid, "waiting")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
217 cmd += "\nnb of running jobs = %i" % self.getCountStatus(groupid, "running")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
218 cmd += "\nnb of jobs in error = %i" % self.getCountStatus(groupid, "error")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
219 sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
220 if exitIfTooManyErrors:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
221 self.cleanJobGroup(groupid)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
222 sys.exit(1)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
223 else:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
224 checkInterval = 60
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
225 nbTimeOuts = self._checkIfJobsTableAndJobsManagerInfoAreConsistent(nbTimeOuts, timeOutPerJob, groupid)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
226
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
227
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
228 ## Submit a job to a queue and record it in job table.
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
229 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
230 # @param job a job instance
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
231 # @param maxNbWaitingJobs integer max nb of waiting jobs before submitting a new one (default = 10000)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
232 # @param checkInterval integer time laps in seconds between two checks (default = 30)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
233 # @param verbose integer (default = 0)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
234 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
235 def submitJob(self, job, verbose=0, maxNbWaitingJobs=10000, checkInterval=30):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
236 if self.getJobStatus(job) in ["waiting", "running", "finished"]:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
237 sys.stderr.write( "WARNING: job '%s' was already submitted\n" % job.jobname)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
238 sys.stderr.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
239 self.cleanJobGroup(job.groupid)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
240 sys.exit(1)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
241
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
242 while self.getCountStatus(job.groupid, "waiting") > maxNbWaitingJobs:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
243 time.sleep(checkInterval)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
244
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
245 self.recordJob(job)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
246 cmd = self._getQsubCommand(job)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
247 returnStatus = os.system(cmd)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
248
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
249 if returnStatus == 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
250 fileName = "jobid.stdout"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
251 jobidFileHandler = open(fileName, "r")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
252 jobid = self._getJobidFromJobManager(jobidFileHandler)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
253 if verbose > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
254 print "job '%i %s' submitted" % (jobid, job.jobname)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
255 sys.stdout.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
256 job.jobid = jobid
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
257 jobidFileHandler.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
258 self.updateJobIdInDB(job, jobid)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
259 os.remove(fileName)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
260 return returnStatus
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
261
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
262
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
263 ## Get the list of nodes where jobs of one group were executed
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
264 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
265 # @param groupid string a group identifier of job series
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
266 # @return lNodes list of nodes names without redundancy
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
267 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
268 def getNodesListByGroupId(self, groupId):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
269 qry = "SELECT DISTINCT node FROM %s" % self._table
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
270 qry += " WHERE groupid='%s'" % groupId
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
271 self._iDb.execute(qry)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
272 res = self._iDb.fetchall()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
273 lNodes = []
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
274 for resTuple in res:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
275 lNodes.append(resTuple[0])
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
276 return lNodes
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
277
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
def checkJobTable(self):
    """Make sure the job table exists and carries the expected columns.

    When the table is missing it is created with the "jobs" type. When it
    exists but its field list differs from the expected one, it is
    re-created with overwrite=True.
    """
    if not self._iDb.doesTableExist(self._table):
        self._iDb.createTable(self._table, "jobs")
        return

    expectedFields = ["jobid", "jobname", "groupid", "launcher", "queue", "resources", "status", "time", "node"]
    expectedFields.sort()
    observedFields = list(self._iDb.getFieldList(self._table))
    observedFields.sort()
    # both lists are sorted, so the comparison ignores column order
    if observedFields != expectedFields:
        self._iDb.createTable(self._table, "jobs", overwrite=True)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
286
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
def close(self):
    """Close the database handle by calling close() on self._iDb."""
    dbHandle = self._iDb
    dbHandle.close()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
289
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
290 def _getJobidAndNbJob(self, jobid) :
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
291 tab = jobid.split(".")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
292 jobid = tab[0]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
293 tab = tab[1].split(":")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
294 nbJob = tab[0]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
295 return jobid, nbJob
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
296
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
297 class TableJobAdaptatorSGE(TableJobAdaptator):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
298
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
299 def _checkIfJobsTableAndJobsManagerInfoAreConsistent(self, nbTimeOuts, timeOutPerJob, groupid):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
300 # retrieve the date and time at which the oldest, still-running job was submitted
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
301 sql = "SELECT jobid,jobname,time FROM %s WHERE groupid='%s' AND status='running' ORDER BY time DESC LIMIT 1" % (self._table, groupid)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
302 self._iDb.execute( sql )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
303 res = self._iDb.fetchall()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
304 if len(res) > 0:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
305 jobid = res[0][0]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
306 jobname = res[0][1]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
307 dateTimeOldestJob = res[0][2]
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
308 dateTimeCurrent = datetime.datetime.now()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
309 # delta is time between (i) first job launched of the given groupid and still in running state and (ii) current time
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
310 delta = dateTimeCurrent - dateTimeOldestJob
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
311 # check if delta is in an interval: 0 <= delta < 1h | 1h <= delta < 2h | 2h <= delta < 3h (timeOutPerJob = 1h)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
312 if delta.seconds >= nbTimeOuts * timeOutPerJob and delta.seconds < (nbTimeOuts+1) * timeOutPerJob:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
313 return nbTimeOuts
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
314 # delta outside the interval: go to next interval (time out)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
315 if delta.seconds >= (nbTimeOuts+1) * timeOutPerJob:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
316 nbTimeOuts += 1
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
317 # Job with 'running' status should be in qstat. Because status in DB is set at 'running' by the job launched.
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
318 if not self.isJobStillHandledBySge(jobid, jobname):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
319 # But if not, let time for the status update (in DB), if the job finished between the query execution and now.
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
320 time.sleep( 5 )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
321 # If no update at 'finished', exit
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
322 #TODO: check status in DB
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
323 if not self.isJobStillHandledBySge(jobid, jobname):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
324 msg = "ERROR: job '%s', supposedly still running, is not handled by SGE anymore" % ( jobid )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
325 msg += "\nit was launched the %s (> %.2f hours ago)" % ( dateTimeOldestJob, timeOutPerJob/3600.0 )
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
326 msg += "\nthis problem can be due to:"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
327 msg += "\n* memory shortage, in that case, decrease the size of your jobs;"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
328 msg += "\n* timeout, in that case, decrease the size of your jobs;"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
329 msg += "\n* node failure or database error, in that case, launch the program again or ask your system administrator."
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
330 sys.stderr.write("%s\n" % msg)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
331 sys.stderr.flush()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
332 self.cleanJobGroup(groupid)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
333 sys.exit(1)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
334 return nbTimeOuts
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
335
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
336 ## Check if a job is still handled by SGE
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
337 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
338 # @param jobid string job identifier
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
339 # @param jobname string job name
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
340 #
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
def isJobStillHandledBySge(self, jobid, jobname):
    """Tell whether a job appears in the output of 'qstat'.

    The output is read through a pipe rather than a fixed-name scratch file
    in the working directory, so concurrent callers cannot overwrite each
    other's listing and nothing is left behind when an error occurs.

    @param jobid string job identifier
    @param jobname string job name
    @return True if a qstat line has more than 3 fields, its first field
            equals str(jobid) and its third field is a prefix of jobname
    If qstat cannot be run or returns a non-zero status, an error is
    written to stderr and the process exits with status 1.
    """
    import subprocess

    try:
        process = subprocess.Popen(["qstat"], stdout=subprocess.PIPE, universal_newlines=True)
        qstatOutput = process.communicate()[0]
        returnStatus = process.returncode
    except OSError:
        # qstat is missing or not executable: handled like a failed run
        qstatOutput = ""
        returnStatus = 1
    if returnStatus != 0:
        msg = "ERROR while launching 'qstat'"
        sys.stderr.write( "%s\n" % msg )
        sys.exit(1)

    for line in qstatOutput.splitlines():
        tokens = line.split()
        # the name column is matched as a prefix of jobname, not for equality
        if len(tokens) > 3 and tokens[0] == str(jobid) and jobname.startswith(tokens[2]):
            return True
    return False
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
360
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
361 def _getQsubCommand(self, job):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
362 cmd = "echo '%s' | " % job.launcher
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
363 prg = "qsub"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
364 cmd += prg
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
365 cmd += " -V"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
366 cmd += " -N %s" % job.jobname
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
367 if job.queue != "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
368 cmd += " -q %s" % job.queue
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
369 cmd += " -cwd"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
370 if job.lResources != []:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
371 cmd += " -l \""
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
372 cmd += " ".join(job.lResources)
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
373 cmd += "\""
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
374 if job.parallelEnvironment != "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
375 cmd += " -pe " + job.parallelEnvironment
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
376 cmd += " > jobid.stdout"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
377 return cmd
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
378
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
379 def _getJobidFromJobManager(self, jobidFileHandler):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
380 return int(jobidFileHandler.readline().split(" ")[2])
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
381
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
382
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
383 class TableJobAdaptatorTorque(TableJobAdaptator):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
384
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
385 def _checkIfJobsTableAndJobsManagerInfoAreConsistent(self, nbTimeOuts, timeOutPerJob, groupid):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
386 return nbTimeOuts
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
387
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
388 def _getQsubCommand(self, job):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
389 cmd = "echo '%s' | " % job.launcher
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
390 prg = "qsub"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
391 cmd += prg
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
392 cmd += " -V"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
393 cmd += " -d %s" % os.getcwd()
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
394 cmd += " -N %s" % job.jobname
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
395 if job.queue != "":
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
396 cmd += " -q %s" % job.queue
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
397 if job.lResources != []:
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
398 cmd += " -l \""
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
399 cmd += " ".join(job.lResources).replace("mem_free","mem")
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
400 cmd += "\""
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
401 cmd += " > jobid.stdout"
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
402 return cmd
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
403
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
404 def _getJobidFromJobManager(self, jobidFileHandler):
769e306b7933 Change the repository level.
yufei-luo
parents:
diff changeset
405 return int(jobidFileHandler.readline().split(".")[0])