CMS 3D CMS Logo

mps_check.py
Go to the documentation of this file.
1 #!/usr/bin/env python3
2 #
3 # This script checks outputs from jobs that have FETCH status and updates if errors occured
4 # -> check STDOUT files
5 # -> check cmsRun.out
6 # -> check alignment.log
7 # -> check if millebinaries are on eos
8 # -> check pede.dump
9 # -> check millepede.log
10 # -> check millepede.end
11 #
12 # It also retirieves number of events from alignment.log and cputime from STDOUT
13 
14 from __future__ import print_function
15 from builtins import range
16 import Alignment.MillePedeAlignmentAlgorithm.mpslib.Mpslibclass as mpslib
17 import subprocess
18 import re
19 import os
20 
21 lib = mpslib.jobdatabase()
22 lib.read_db()
23 
24 # create a list of eos ls entries containing files on eos binary store
25 command = ["ls", "-l", os.path.join(lib.mssDir, "binaries")]
26 try:
27  eoslsoutput = subprocess.check_output(command, stderr=subprocess.STDOUT).decode().split('\n')
28 except subprocess.CalledProcessError:
29  eoslsoutput = ""
30 
31 # loop over FETCH jobs
32 for i in range(len(lib.JOBID)):
33  # FIXME use bools?
34  batchSuccess = 0
35  batchExited = 0
36  finished = 0
37  endofjob = 0
38  eofile = 1 # do not deal with timel yet
39  timel = 0
40  killed = 0
41  ioprob = 0
42  fw8001 = 0
43  tooManyTracks = 0
44  segviol = 0
45  rfioerr = 0
46  quota = 0
47  nEvent = 0
48  cputime = -1
49  pedeAbend = 0
50  pedeLogErr = 0
51  pedeLogWrn = 0
52  exceptionCaught = 0
53  timeout = 0
54  cfgerr = 0
55  emptyDatErr = 0
56  emptyDatOnFarm = 0
57  cmdNotFound = 0
58  insuffPriv = 0
59  quotaspace = 0
60  copyerr=0
61  ispede=0
62 
63  kill_reason = None
64  pedeLogErrStr = ""
65  pedeLogWrnStr = ""
66  remark = ""
67 
68  disabled = "";
69  if 'DISABLED' in lib.JOBSTATUS[i]:
70  disabled = 'DISABLED'
71 
72  if 'FETCH' in lib.JOBSTATUS[i]:
73 
74  # open the STDOUT file
75  stdOut = 'jobData/'+lib.JOBDIR[i]+'/STDOUT'
76  # unzip the STDOUT file if necessary
77  if os.access(stdOut+'.gz', os.R_OK):
78  os.system('gunzip '+stdOut+'.gz')
79 
80  try:
81  with open(stdOut, "r") as STDFILE:
82  # scan records in input file.
83  # use regular expression to search. re.compile needed for options re.M and re.I
84  # re.M=re.MULTILINE enables matching of newline char
85  # re.I=re.IGNORECASE makes matching case-insensitive.
86  for line in STDFILE:
87  if re.search(re.compile('Unable to access quota space',re.M|re.I), line):
88  quotaspace = 1
89  if re.search(re.compile('Unable to get quota space',re.M|re.I), line):
90  quotaspace = 1
91  if re.search(re.compile('Disk quota exceeded',re.M|re.I), line):
92  quotaspace = 1
93  if re.search(re.compile('CERN report: Job Killed',re.M), line):
94  killed = 1
95  if re.search(re.compile('Job finished',re.M), line):
96  finished = 1
97  if re.search(re.compile('connection timed out',re.M), line):
98  timeout = 1
99  if re.search(re.compile('ConfigFileReadError',re.M), line):
100  cfgerr = 1
101  if re.search(re.compile('0 bytes transferred',re.M), line):
102  emptyDatOnFarm = 1
103  if re.search(re.compile('command not found',re.M), line):
104  cmdNotFound = 1
105  # AP 26.11.2009 Insufficient privileges to rfcp files
106  if re.search(re.compile('stage_put: Insufficient user privileges',re.M), line):
107  insuffPriv = 1
108  if re.search(re.compile('Give up doing',re.M), line):
109  copyerr = 1
110  if re.search(re.compile('Directory content before',re.M),line):
111  ispede = 1
112  # AP 05.11.2015 Extract cpu-time.
113  # STDOUT doesn't contain NCU anymore. Now KSI2K and HS06 seconds are displayed.
114  # The ncuFactor is calculated from few samples by comparing KSI2K seconds with
115  # CPU time from email.
116  match = re.search(re.compile('This process used .+?(\d+) KSI2K seconds',re.M|re.I), line)
117  if match:
118  cpuFactor = 2.125
119  cputime = int(round(int(match.group(1))/cpuFactor)) # match.group(1) is the matched digit
120 
121  # gzip it afterwards:
122  print('gzip -f '+stdOut)
123  os.system('gzip -f '+stdOut)
124  except IOError as e:
125  if e.args == (2, "No such file or directory"):
126  print("mps_check.py cannot find", stdOut, "to test")
127  else:
128  raise
129 
130  # check HTCondor log file
131  try:
132  log_file = os.path.join("jobData", lib.JOBDIR[i], "HTCJOB")
133  condor_log = subprocess.check_output(["condor_q", lib.JOBID[i],
134  "-userlog", log_file,
135  "-af",
136  "RemoteSysCpu",
137  "JobStatus",
138  "RemoveReason"],
139  stderr = subprocess.STDOUT).decode()
140  condor_log = condor_log.split()
141 
142  cputime = int(round(float(condor_log[0])))
143 
144  if condor_log[1] == "3": # JobStatus == Removed
145  killed = 1
146  kill_reason = " ".join(condor_log[2:])
147 
148  except subprocess.CalledProcessError as e:
149  pass
150 
151 
152  # GF: This file is not produced (anymore...) -> check for existence and read-access added
153  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/cmsRun.out'
154  if os.access(eazeLog, os.R_OK):
155  # open the input file
156  with open(eazeLog, "r") as INFILE:
157  # scan records in input file
158  for line in INFILE:
159  # check if end of file has been reached
160  if re.search(re.compile('<StorageStatistics>',re.M), line):
161  eofile = 1
162  if re.search(re.compile('Time limit reached\.',re.M), line):
163  timel = 1
164  if re.search(re.compile('gives I\/O problem',re.M), line):
165  ioprob = 1
166  if re.search(re.compile('FrameworkError ExitStatus=[\'\"]8001[\'\"]',re.M), line):
167  fw8001 = 1
168  if re.search(re.compile('too many tracks',re.M), line):
169  tooManyTracks = 1
170  if re.search(re.compile('segmentation violation',re.M), line):
171  segviol = 1
172  if re.search(re.compile('failed RFIO error',re.M), line):
173  rfioerr = 1
174  if re.search(re.compile('Request exceeds quota',re.M), line):
175  quota = 1
176 
177  # if there is an alignment.log[.gz] file, check it as well
178  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/alignment.log'
179  logZipped = 'no'
180  # unzip the logfile if necessary
181  if os.access(eazeLog+'.gz', os.R_OK):
182  os.system('gunzip '+eazeLog+'.gz')
183  logZipped = 'true'
184 
185  if os.access(eazeLog, os.R_OK): # access to alignment.log
186  # open the input file
187  with open(eazeLog,'r') as INFILE:
188  # scan records in input file
189  for line in INFILE:
190  # check if end of file has been reached
191  if re.search(re.compile('<StorageStatistics>',re.M), line):
192  eofile = 1
193  if re.search(re.compile('EAZE\. Time limit reached\.',re.M), line):
194  timel = 1
195  if re.search(re.compile('GAF gives I\/O problem',re.M), line):
196  ioprob = 1
197  if re.search(re.compile('FrameworkError ExitStatus=[\'\"]8001[\'\"]',re.M), line):
198  fw8001 = 1
199  if re.search(re.compile('too many tracks',re.M), line):
200  tooManyTracks = 1
201  if re.search(re.compile('segmentation violation',re.M), line):
202  segviol = 1
203  if re.search(re.compile('failed RFIO error',re.M), line):
204  rfioerr = 1
205  if re.search(re.compile('Request exceeds quota',re.M), line):
206  quota = 1
207  # check for newer (e.g. CMSSW_5_1_X) and older CMSSW:
208  if re.search(re.compile('Fatal Exception',re.M), line):
209  exceptionCaught = 1
210  if re.search(re.compile('Exception caught in cmsRun',re.M), line):
211  exceptionCaught = 1
212  # AP 07.09.2009 - Check that the job got to a normal end
213  if re.search(re.compile('AlignmentProducerAsAnalyzer::endJob\(\)',re.M), line):
214  endofjob = 1
215  if re.search(re.compile('FwkReport -i main_input:sourc',re.M), line):
216  array = line.split()
217  nEvent = int(array[5])
218  if nEvent==0 and re.search(re.compile('FwkReport -i PostSource',re.M), line):
219  array = line.split()
220  nEvent = int(array[5])
221  # AP 31.07.2009 - To read number of events in CMSSW_3_2_2_patch2
222  if nEvent==0 and re.search(re.compile('FwkReport -i AfterSource',re.M), line):
223  array = line.split()
224  nEvent = int(array[5])
225 
226  if logZipped == 'true':
227  os.system('gzip -f '+eazeLog)
228 
229  else: # no access to alignment.log
230  print('mps_check.py cannot find',eazeLog,'to test')
231  # AP 07.09.2009 - The following check cannot be done: set to 1 to avoid fake error type
232  endofjob = 1
233 
234  # for mille jobs checks that milleBinary file is not empty
235  if i<lib.nJobs: # mille job!
236  milleOut = 'milleBinary%03d.dat' % (i+1)
237  # from Perl, should be deprecated because of cmsls and nsls
238  #(not translated to python, yes I'm lazy... use subprocess.checkout if needed):
239  #$mOutSize = `nsls -l $mssDir | grep $milleOut | head -1 | awk '{print \$5}'`;
240  #$mOutSize = `cmsLs -l $mssDir | grep $milleOut | head -1 | awk '{print \$2}'`;
241  mOutSize = 0
242  #print(">>>eoslsoutput:", eoslsoutput, " \ttype(eoslsoutput):", type(eoslsoutput))
243  for line in eoslsoutput:
244  if milleOut in line:
245  columns = line.split()
246  mOutSize = int(columns[4]) # 5th column = size
247  if not (mOutSize>0):
248  emptyDatErr = 1
249 
250  # merge jobs: additional checks for merging job
251  else:
252  # if there is a pede.dump file check it as well
253  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/pede.dump'
254  if os.access(eazeLog+'.gz', os.R_OK):
255  # unzip - but clean before and save to tmp
256  os.system('rm -f /tmp/pede.dump')
257  os.system('gunzip -c '+eazeLog+'.gz > /tmp/pede.dump')
258  eazeLog = '/tmp/pede.dump'
259  if os.access(eazeLog, os.R_OK):
260  with open(eazeLog, "r") as INFILE: # open pede.dump
261  # scan records in INFILE
262  pedeAbend = 1
263  usedPedeMem = 0.
264  for line in INFILE:
265  # check if pede has reached its normal end
266  if re.search(re.compile('Millepede II.* ending',re.M), line):
267  pedeAbend = 0
268  # extract memory usage
269  match = re.search(re.compile('Peak dynamic memory allocation: (.+) GB',re.I), line)
270  if match:
271  mem = match.group(1)
272  mem = re.sub('\s', '', mem)
273  # if mem is a float
274  if re.search(re.compile('^\d+\.\d+$',re.M), mem):
275  usedPedeMem = float(mem)
276  else:
277  print('mps_check.py: Found Pede peak memory allocation but extracted number is not a float:',mem)
278 
279  # check memory usage
280  # no point in asking if lib.pedeMem is defined. Initialized as lib.pedeMem=-1
281  if lib.pedeMem > 0 and usedPedeMem > 0.:
282  memoryratio = usedPedeMem /(lib.pedeMem/1024.)
283  # print a warning if more than approx. 4 GB have been
284  # requested of which less than 75% are used by Pede
285  if lib.pedeMem > 4000 and memoryratio < 0.75 :
286  msg = ("Warning: {0:.2f} GB of memory for Pede "
287  "requested, but only {1:.1f}% of it has been "
288  "used! Consider to request less memory in order "
289  "to save resources.")
290  print(msg.format(lib.pedeMem/1024.0, memoryratio*100))
291  elif memoryratio > 1 :
292  msg = ("Warning: {0:.2f} GB of memory for Pede "
293  "requested, but {1:.1f}% of this has been "
294  "used! Consider to request more memory to avoid "
295  "premature removal of the job by the admin.")
296  print(msg.format(lib.pedeMem/1024.0, memoryratio*100))
297  else:
298  msg = ("Info: Used {0:.1f}% of {1:.2f} GB of memory "
299  "which has been requested for Pede.")
300  print(msg.format(memoryratio*100, lib.pedeMem/1024.0))
301 
302 
303  # clean up /tmp/pede.dump if needed
304  if eazeLog == '/tmp/pede.dump':
305  os.system('rm /tmp/pede.dump')
306 
307  # pede.dump not found or no read-access
308  else:
309  print('mps_check.py cannot find',eazeLog,'to test')
310 
311  # if there is a millepede.log file, check it as well
312  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/millepede.log'
313  logZipped = 'no'
314  if os.access(eazeLog+'.gz', os.R_OK):
315  os.system('gunzip '+eazeLog+'.gz')
316  logZipped = 'true'
317 
318  if os.access(eazeLog, os.R_OK):
319  # open log file
320  with open(eazeLog, "r") as INFILE:
321  # scan records in input file
322  for line in INFILE:
323  # Checks for Pede Errors
324  if re.search(re.compile('step no descending',re.M), line):
325  pedeLogErr = 1
326  pedeLogErrStr += line
327  if re.search(re.compile('Constraint equation discrepancies:',re.M), line):
328  pedeLogErr = 1
329  pedeLogErrStr += line
330  # AP 07.09.2009 - Checks for Pede Warnings:
331  if re.search(re.compile('insufficient constraint equations',re.M), line):
332  pedeLogWrn = 1
333  pedeLogWrnStr += line
334 
335  if logZipped == 'true':
336  os.system('gzip -f '+eazeLog)
337  else:
338  print('mps_check.py cannot find',eazeLog,'to test')
339 
340 
341  # check millepede.end -- added F. Meier 03.03.2015
342  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/millepede.end'
343  logZipped = 'no'
344  if os.access(eazeLog+'.gz', os.R_OK):
345  os.system('gunzip'+eazeLog+'.gz')
346  logZipped = 'true'
347 
348  if os.access(eazeLog, os.R_OK):
349  # open log file
350  with open(eazeLog, "r") as INFILE:
351  # scan records in input file
352  for line in INFILE:
353  # Checks for the output code. 0 is OK, 1 is WARN, anything else is FAIL
354  # searches the line for a number with or without a sign
355  match = re.search(re.compile('([-+]?\d+)',re.M), line)
356  if match:
357  if int(match.group(1)) == 1:
358  pedeLogWrn = 1
359  pedeLogWrnStr += line
360  elif int(match.group(1)) != 0:
361  pedeLogErr = 1
362  pedeLogErrStr += line
363  if logZipped == 'true':
364  os.system('gzip -f '+eazeLog)
365  else:
366  print('mps_check.py cannot find',eazeLog,'to test')
367 
368  # end of merge job checks
369  # evaluate Errors:
370  farmhost = ' '
371 
372  okStatus = 'OK'
373  if not eofile == 1:
374  print(lib.JOBDIR[i],lib.JOBID[i],'did not reach end of file')
375  okStatus = 'ABEND'
376  if quotaspace == 1:
377  print(lib.JOBDIR[i],lib.JOBID[i],'had quota space problem')
378  okStatus = 'FAIL'
379  remark = 'eos quota space problem'
380  if ioprob == 1:
381  print(lib.JOBDIR[i],lib.JOBID[i],'had I/O problem')
382  okStatus = 'FAIL'
383  if fw8001 == 1:
384  print(lib.JOBDIR[i],lib.JOBID[i],'had Framework error 8001 problem')
385  remark = 'fwk error 8001'
386  okStatus = 'FAIL'
387  if timeout == 1:
388  print(lib.JOBDIR[i],lib.JOBID[i],'had connection timed out problem')
389  remark = 'connection timed out'
390  if cfgerr == 1:
391  print(lib.JOBDIR[i],lib.JOBID[i],'had config file error')
392  remark = 'cfg file error'
393  okStatus = 'FAIL'
394  if killed == 1:
395  guess = " (probably time exceeded)" if kill_reason is None else ":"
396  print(lib.JOBDIR[i], lib.JOBID[i], "Job killed" + guess)
397  if kill_reason is not None: print(kill_reason)
398  remark = "killed";
399  okStatus = "FAIL"
400  if timel == 1:
401  print(lib.JOBDIR[i],lib.JOBID[i],'ran into time limit')
402  okStatus = 'TIMEL'
403  if tooManyTracks == 1:
404  print(lib.JOBDIR[i],lib.JOBID[i],'too many tracks')
405  if segviol == 1:
406  print(lib.JOBDIR[i],lib.JOBID[i],'SEGVIOL encountered')
407  remark = 'seg viol'
408  okStatus = 'FAIL'
409  if rfioerr == 1:
410  print(lib.JOBDIR[i],lib.JOBID[i],'RFIO error encountered')
411  remark = 'rfio error'
412  okStatus = 'FAIL'
413  if quota == 1:
414  print(lib.JOBDIR[i],lib.JOBID[i],'Request exceeds quota')
415  if exceptionCaught == 1:
416  print(lib.JOBDIR[i],lib.JOBID[i],'Exception caught in cmsrun')
417  remark = 'Exception caught'
418  okStatus = 'FAIL'
419  if emptyDatErr == 1:
420  print('milleBinary???.dat file not found or empty')
421  remark = 'empty milleBinary'
422  if emptyDatOnFarm > 0:
423  print('...but already empty on farm so OK (or check job',i+1,'yourself...)')
424  else:
425  okStatus = 'FAIL'
426  if cmdNotFound == 1:
427  print(lib.JOBDIR[i],lib.JOBID[i],'Command not found')
428  remark = 'cmd not found'
429  okStatus = 'FAIL'
430  if insuffPriv == 1:
431  print(lib.JOBDIR[i],lib.JOBID[i],'Insufficient privileges to rfcp files')
432  remark = 'Could not rfcp files'
433  okStatus = 'FAIL'
434  if pedeAbend == 1:
435  print(lib.JOBDIR[i],lib.JOBID[i],'Pede did not end normally')
436  remark = 'pede failed'
437  okStatus = 'FAIL'
438  if pedeLogErr == 1:
439  print(lib.JOBDIR[i],lib.JOBID[i],'Problems in running Pede:')
440  print(pedeLogErrStr)
441  remark = 'pede error'
442  okStatus = 'FAIL'
443  if pedeLogWrn == 1:
444  # AP 07.09.2009 - Reports Pede Warnings (but do _not_ set job status to FAIL)
445  print(lib.JOBDIR[i],lib.JOBID[i],'Warnings in running Pede:')
446  print(pedeLogWrnStr)
447  remark = 'pede warnings'
448  okStatus = 'WARN'
449  if endofjob != 1:
450  print(lib.JOBDIR[i],lib.JOBID[i],'Job not ended')
451  remark = 'job not ended'
452  okStatus = 'FAIL'
453  if copyerr == 1 and ispede!=1:
454  #Copy errors in pede job can occur when a nonexistent file is commented in alignment_merge.py but not in theScript.sh, and in that case is *not* a failure
455  print(lib.JOBDIR[i],lib.JOBID[i],'Copy to eos failed')
456  remark = 'copy to eos failed'
457  okStatus = 'FAIL'
458 
459 
460  # print warning line to stdout
461  if okStatus != "OK":
462  print(lib.JOBDIR[i],lib.JOBID[i],' -------- ',okStatus)
463 
464  # update number of events
465  lib.JOBNEVT[i] = nEvent
466  # udate Jobstatus
467  lib.JOBSTATUS[i] = disabled+okStatus
468  # update cputime
469  lib.JOBRUNTIME[i] = cputime
470  # update remark
471  lib.JOBREMARK[i] = remark
472  # update host
473 
474 
475 lib.write_db()
476 
void print(TMatrixD &m, const char *label=nullptr, bool mathematicaFormat=false)
Definition: Utilities.cc:47
bool decode(bool &, std::string const &)
Definition: types.cc:71
static std::string join(char **cmd)
Definition: RemoteFile.cc:19