14 from __future__
import print_function
15 from builtins
import range
16 import Alignment.MillePedeAlignmentAlgorithm.mpslib.Mpslibclass
as mpslib
# Instantiate the job database object provided by the mpslib module.
21 lib = mpslib.jobdatabase()
# Command that lists (long format) the "binaries" directory below lib.mssDir.
25 command = [
"ls",
"-l", os.path.join(lib.mssDir,
"binaries")]
# Run the command with stderr merged into stdout, decode the output and
# split it into a list of lines stored in eoslsoutput.
27 eoslsoutput = subprocess.check_output(command, stderr=subprocess.STDOUT).
decode().
split(
'\n')
# Handler for a non-zero exit status of the command above.
# NOTE(review): the matching "try:" and the handler body are not visible in
# this listing.
28 except subprocess.CalledProcessError:
# Iterate over all jobs by index; i is used to address the per-job entries
# of lib (here JOBID and JOBSTATUS).
32 for i
in range(len(lib.JOBID)):
# Branch taken when the job status contains 'DISABLED'.
# NOTE(review): the branch body is not visible in this listing.
69 if 'DISABLED' in lib.JOBSTATUS[i]:
# Branch taken when the job status contains 'FETCH'.
# NOTE(review): the branch body is not visible in this listing.
72 if 'FETCH' in lib.JOBSTATUS[i]:
75 stdOut =
'jobData/'+lib.JOBDIR[i]+
'/STDOUT' 77 if os.access(stdOut+
'.gz', os.R_OK):
78 os.system(
'gunzip '+stdOut+
'.gz')
81 with open(stdOut,
"r") as STDFILE: 87 if re.search(re.compile(
'Unable to access quota space',re.M|re.I), line):
89 if re.search(re.compile(
'Unable to get quota space',re.M|re.I), line):
91 if re.search(re.compile(
'Disk quota exceeded',re.M|re.I), line):
93 if re.search(re.compile(
'CERN report: Job Killed',re.M), line):
95 if re.search(re.compile(
'Job finished',re.M), line):
97 if re.search(re.compile(
'connection timed out',re.M), line):
99 if re.search(re.compile(
'ConfigFileReadError',re.M), line):
101 if re.search(re.compile(
'0 bytes transferred',re.M), line):
103 if re.search(re.compile(
'command not found',re.M), line):
106 if re.search(re.compile(
'stage_put: Insufficient user privileges',re.M), line):
108 if re.search(re.compile(
'Give up doing',re.M), line):
110 if re.search(re.compile(
'Directory content before',re.M),line):
116 match = re.search(re.compile(
'This process used .+?(\d+) KSI2K seconds',re.M|re.I), line)
119 cputime =
int(round(
int(match.group(1))/cpuFactor))
122 print(
'gzip -f '+stdOut)
123 os.system(
'gzip -f '+stdOut)
125 if e.args == (2,
"No such file or directory"):
126 print(
"mps_check.py cannot find", stdOut,
"to test")
# Query HTCondor for information about job i.
# log_file is the job's HTCJOB file below jobData/<lib.JOBDIR[i]>.
132 log_file = os.path.join(
"jobData", lib.JOBDIR[i],
"HTCJOB")
# Call condor_q for this job id with -userlog pointing at log_file; stderr is
# merged into stdout and the output is decoded to text.
# NOTE(review): further command-line arguments between "-userlog" and
# "stderr" are not visible in this listing.
133 condor_log = subprocess.check_output([
"condor_q", lib.JOBID[i],
134 "-userlog", log_file,
139 stderr = subprocess.STDOUT).
decode()
# Split the output on whitespace into tokens.
140 condor_log = condor_log.split()
# The first token is converted to float and rounded to an int CPU time.
142 cputime =
int(round(
float(condor_log[0])))
# When the second token is "3", the remaining tokens are joined into the
# kill reason.
# NOTE(review): the meaning of "3" depends on the condor_q format arguments,
# which are not visible here — confirm.
144 if condor_log[1] ==
"3":
146 kill_reason =
" ".
join(condor_log[2:])
# Handler for a failing condor_q call.
# NOTE(review): the matching "try:" and the handler body are not visible.
148 except subprocess.CalledProcessError
as e:
153 eazeLog =
'jobData/'+lib.JOBDIR[i]+
'/cmsRun.out' 154 if os.access(eazeLog, os.R_OK):
156 with open(eazeLog,
"r") as INFILE: 160 if re.search(re.compile(
'<StorageStatistics>',re.M), line):
162 if re.search(re.compile(
'Time limit reached\.',re.M), line):
164 if re.search(re.compile(
'gives I\/O problem',re.M), line):
166 if re.search(re.compile(
'FrameworkError ExitStatus=[\'\"]8001[\'\"]',re.M), line):
168 if re.search(re.compile(
'too many tracks',re.M), line):
170 if re.search(re.compile(
'segmentation violation',re.M), line):
172 if re.search(re.compile(
'failed RFIO error',re.M), line):
174 if re.search(re.compile(
'Request exceeds quota',re.M), line):
178 eazeLog =
'jobData/'+lib.JOBDIR[i]+
'/alignment.log' 181 if os.access(eazeLog+
'.gz', os.R_OK):
182 os.system(
'gunzip '+eazeLog+
'.gz')
185 if os.access(eazeLog, os.R_OK):
187 with open(eazeLog,
'r') as INFILE: 191 if re.search(re.compile(
'<StorageStatistics>',re.M), line):
193 if re.search(re.compile(
'EAZE\. Time limit reached\.',re.M), line):
195 if re.search(re.compile(
'GAF gives I\/O problem',re.M), line):
197 if re.search(re.compile(
'FrameworkError ExitStatus=[\'\"]8001[\'\"]',re.M), line):
199 if re.search(re.compile(
'too many tracks',re.M), line):
201 if re.search(re.compile(
'segmentation violation',re.M), line):
203 if re.search(re.compile(
'failed RFIO error',re.M), line):
205 if re.search(re.compile(
'Request exceeds quota',re.M), line):
208 if re.search(re.compile(
'Fatal Exception',re.M), line):
210 if re.search(re.compile(
'Exception caught in cmsRun',re.M), line):
213 if re.search(re.compile(
'AlignmentProducerAsAnalyzer::endJob\(\)',re.M), line):
215 if re.search(re.compile(
'FwkReport -i main_input:sourc',re.M), line):
217 nEvent =
int(array[5])
218 if nEvent==0
and re.search(re.compile(
'FwkReport -i PostSource',re.M), line):
220 nEvent =
int(array[5])
222 if nEvent==0
and re.search(re.compile(
'FwkReport -i AfterSource',re.M), line):
224 nEvent =
int(array[5])
226 if logZipped ==
'true':
227 os.system(
'gzip -f '+eazeLog)
230 print(
'mps_check.py cannot find',eazeLog,
'to test')
# Name of the binary file expected for job i: the job number is i+1,
# zero-padded to three digits.
236 milleOut =
'milleBinary%03d.dat' % (i+1)
# Walk through the lines collected in eoslsoutput.
# NOTE(review): the test selecting the line that belongs to milleOut is not
# visible in this listing.
243 for line
in eoslsoutput:
# Split the line on whitespace and read the fifth field as an integer size.
245 columns = line.split()
246 mOutSize =
int(columns[4])
253 eazeLog =
'jobData/'+lib.JOBDIR[i]+
'/pede.dump' 254 if os.access(eazeLog+
'.gz', os.R_OK):
256 os.system(
'rm -f /tmp/pede.dump')
257 os.system(
'gunzip -c '+eazeLog+
'.gz > /tmp/pede.dump')
258 eazeLog =
'/tmp/pede.dump' 259 if os.access(eazeLog, os.R_OK):
260 with open(eazeLog,
"r") as INFILE: # open pede.dump 266 if re.search(re.compile(
'Millepede II.* ending',re.M), line):
269 match = re.search(re.compile(
'Peak dynamic memory allocation: (.+) GB',re.I), line)
272 mem = re.sub(
'\s',
'', mem)
274 if re.search(re.compile(
'^\d+\.\d+$',re.M), mem):
275 usedPedeMem =
float(mem)
277 print(
'mps_check.py: Found Pede peak memory allocation but extracted number is not a float:',mem)
281 if lib.pedeMem > 0
and usedPedeMem > 0.:
282 memoryratio = usedPedeMem /(lib.pedeMem/1024.)
285 if lib.pedeMem > 4000
and memoryratio < 0.75 :
286 msg = (
"Warning: {0:.2f} GB of memory for Pede " 287 "requested, but only {1:.1f}% of it has been " 288 "used! Consider to request less memory in order " 289 "to save resources.")
290 print(msg.format(lib.pedeMem/1024.0, memoryratio*100))
291 elif memoryratio > 1 :
292 msg = (
"Warning: {0:.2f} GB of memory for Pede " 293 "requested, but {1:.1f}% of this has been " 294 "used! Consider to request more memory to avoid " 295 "premature removal of the job by the admin.")
296 print(msg.format(lib.pedeMem/1024.0, memoryratio*100))
298 msg = (
"Info: Used {0:.1f}% of {1:.2f} GB of memory " 299 "which has been requested for Pede.")
300 print(msg.format(memoryratio*100, lib.pedeMem/1024.0))
304 if eazeLog ==
'/tmp/pede.dump':
305 os.system(
'rm /tmp/pede.dump')
309 print(
'mps_check.py cannot find',eazeLog,
'to test')
# Check of the job's millepede.log (jobData/<lib.JOBDIR[i]>/millepede.log).
# When a gzipped copy is readable it is unpacked in place with gunzip first.
312 eazeLog =
'jobData/'+lib.JOBDIR[i]+
'/millepede.log' 314 if os.access(eazeLog+
'.gz', os.R_OK):
315 os.system(
'gunzip '+eazeLog+
'.gz')
# Scan the log only when it is readable.
# NOTE(review): the loop that binds `line` is not visible in this listing.
318 if os.access(eazeLog, os.R_OK):
320 with open(eazeLog,
"r") as INFILE: 324 if re.search(re.compile(
'step no descending',re.M), line):
# Matching lines for 'step no descending' and 'Constraint equation
# discrepancies:' are appended to the error text.
326 pedeLogErrStr += line
327 if re.search(re.compile(
'Constraint equation discrepancies:',re.M), line):
329 pedeLogErrStr += line
# Lines reporting 'insufficient constraint equations' are appended to the
# warning text.
331 if re.search(re.compile(
'insufficient constraint equations',re.M), line):
333 pedeLogWrnStr += line
# Compress the log again when logZipped is 'true'.
# NOTE(review): the assignment of logZipped is not visible in this listing.
335 if logZipped ==
'true':
336 os.system(
'gzip -f '+eazeLog)
# Message printed when the log could not be tested.
338 print(
'mps_check.py cannot find',eazeLog,
'to test')
342 eazeLog =
'jobData/'+lib.JOBDIR[i]+
'/millepede.end' 344 if os.access(eazeLog+
'.gz', os.R_OK):
345 os.system(
'gunzip'+eazeLog+
'.gz')
348 if os.access(eazeLog, os.R_OK):
350 with open(eazeLog,
"r") as INFILE: 355 match = re.search(re.compile(
'([-+]?\d+)',re.M), line)
357 if int(match.group(1)) == 1:
359 pedeLogWrnStr += line
360 elif int(match.group(1)) != 0:
362 pedeLogErrStr += line
363 if logZipped ==
'true':
364 os.system(
'gzip -f '+eazeLog)
366 print(
'mps_check.py cannot find',eazeLog,
'to test')
# Report of the problems found for job i.  Each message is printed together
# with the job's directory and id (lib.JOBDIR[i], lib.JOBID[i]); for most of
# them a short text is also stored in `remark`.
# NOTE(review): the conditions guarding most of these messages, and the
# assignment of okStatus, are not visible in this listing.
374 print(lib.JOBDIR[i],lib.JOBID[i],
'did not reach end of file')
377 print(lib.JOBDIR[i],lib.JOBID[i],
'had quota space problem')
379 remark =
'eos quota space problem' 381 print(lib.JOBDIR[i],lib.JOBID[i],
'had I/O problem')
384 print(lib.JOBDIR[i],lib.JOBID[i],
'had Framework error 8001 problem')
385 remark =
'fwk error 8001' 388 print(lib.JOBDIR[i],lib.JOBID[i],
'had connection timed out problem')
389 remark =
'connection timed out' 391 print(lib.JOBDIR[i],lib.JOBID[i],
'had config file error')
# Killed job: without a kill reason the message reads
# "Job killed (probably time exceeded)"; with one it reads "Job killed:" and
# the reason is printed on the following line.
392 remark =
'cfg file error' 395 guess =
" (probably time exceeded)" if kill_reason
is None else ":" 396 print(lib.JOBDIR[i], lib.JOBID[i],
"Job killed" + guess)
397 if kill_reason
is not None:
print(kill_reason)
401 print(lib.JOBDIR[i],lib.JOBID[i],
'ran into time limit')
403 if tooManyTracks == 1:
404 print(lib.JOBDIR[i],lib.JOBID[i],
'too many tracks')
406 print(lib.JOBDIR[i],lib.JOBID[i],
'SEGVIOL encountered')
410 print(lib.JOBDIR[i],lib.JOBID[i],
'RFIO error encountered')
411 remark =
'rfio error' 414 print(lib.JOBDIR[i],lib.JOBID[i],
'Request exceeds quota')
415 if exceptionCaught == 1:
416 print(lib.JOBDIR[i],lib.JOBID[i],
'Exception caught in cmsrun')
# Missing or empty binary output; an extra hint is printed when
# emptyDatOnFarm is positive.
417 remark =
'Exception caught' 420 print(
'milleBinary???.dat file not found or empty')
421 remark =
'empty milleBinary' 422 if emptyDatOnFarm > 0:
423 print(
'...but already empty on farm so OK (or check job',i+1,
'yourself...)')
427 print(lib.JOBDIR[i],lib.JOBID[i],
'Command not found')
428 remark =
'cmd not found' 431 print(lib.JOBDIR[i],lib.JOBID[i],
'Insufficient privileges to rfcp files')
432 remark =
'Could not rfcp files' 435 print(lib.JOBDIR[i],lib.JOBID[i],
'Pede did not end normally')
436 remark =
'pede failed' 439 print(lib.JOBDIR[i],lib.JOBID[i],
'Problems in running Pede:')
441 remark =
'pede error' 445 print(lib.JOBDIR[i],lib.JOBID[i],
'Warnings in running Pede:')
447 remark =
'pede warnings' 450 print(lib.JOBDIR[i],lib.JOBID[i],
'Job not ended')
# A copy error is only reported when ispede is not 1.
451 remark =
'job not ended' 453 if copyerr == 1
and ispede!=1:
455 print(lib.JOBDIR[i],lib.JOBID[i],
'Copy to eos failed')
# The last print shows the job together with okStatus.
456 remark =
'copy to eos failed' 462 print(lib.JOBDIR[i],lib.JOBID[i],
' -------- ',okStatus)
# Store the results for job i in the per-job entries of lib:
# number of events, status, run time and remark.
465 lib.JOBNEVT[i] = nEvent
# The status is the concatenation of `disabled` and `okStatus`.
467 lib.JOBSTATUS[i] = disabled+okStatus
469 lib.JOBRUNTIME[i] = cputime
471 lib.JOBREMARK[i] = remark
void print(TMatrixD &m, const char *label=nullptr, bool mathematicaFormat=false)
bool decode(bool &, std::string const &)
def split(sequence, size)
static std::string join(char **cmd)