CMS 3D CMS Logo

mps_check.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 #
3 # This script checks outputs from jobs that have FETCH status and updates if errors occured
4 # -> check STDOUT files
5 # -> check cmsRun.out
6 # -> check alignment.log
7 # -> check if millebinaries are on eos
8 # -> check pede.dump
9 # -> check millepede.log
10 # -> check millepede.end
11 #
12 # It also retirieves number of events from alignment.log and cputime from STDOUT
13 
14 from __future__ import print_function
15 import Alignment.MillePedeAlignmentAlgorithm.mpslib.Mpslibclass as mpslib
16 import subprocess
17 import re
18 import os
19 
20 lib = mpslib.jobdatabase()
21 lib.read_db()
22 
23 # create a list of eos ls entries containing files on eos binary store
24 command = ["ls", "-l", os.path.join(lib.mssDir, "binaries")]
25 try:
26  eoslsoutput = subprocess.check_output(command, stderr=subprocess.STDOUT).split('\n')
27 except subprocess.CalledProcessError:
28  eoslsoutput = ""
29 
30 # loop over FETCH jobs
31 for i in xrange(len(lib.JOBID)):
32  # FIXME use bools?
33  batchSuccess = 0
34  batchExited = 0
35  finished = 0
36  endofjob = 0
37  eofile = 1 # do not deal with timel yet
38  timel = 0
39  killed = 0
40  ioprob = 0
41  fw8001 = 0
42  tooManyTracks = 0
43  segviol = 0
44  rfioerr = 0
45  quota = 0
46  nEvent = 0
47  cputime = -1
48  pedeAbend = 0
49  pedeLogErr = 0
50  pedeLogWrn = 0
51  exceptionCaught = 0
52  timeout = 0
53  cfgerr = 0
54  emptyDatErr = 0
55  emptyDatOnFarm = 0
56  cmdNotFound = 0
57  insuffPriv = 0
58  quotaspace = 0
59 
60  kill_reason = None
61  pedeLogErrStr = ""
62  pedeLogWrnStr = ""
63  remark = ""
64 
65  disabled = "";
66  if 'DISABLED' in lib.JOBSTATUS[i]:
67  disabled = 'DISABLED'
68 
69  if 'FETCH' in lib.JOBSTATUS[i]:
70 
71  # open the STDOUT file
72  stdOut = 'jobData/'+lib.JOBDIR[i]+'/STDOUT'
73  # unzip the STDOUT file if necessary
74  if os.access(stdOut+'.gz', os.R_OK):
75  os.system('gunzip '+stdOut+'.gz')
76 
77  try:
78  with open(stdOut, "r") as STDFILE:
79  # scan records in input file.
80  # use regular expression to search. re.compile needed for options re.M and re.I
81  # re.M=re.MULTILINE enables matching of newline char
82  # re.I=re.IGNORECASE makes matching case-insensitive.
83  for line in STDFILE:
84  if re.search(re.compile('Unable to access quota space',re.M|re.I), line):
85  quotaspace = 1
86  if re.search(re.compile('Unable to get quota space',re.M|re.I), line):
87  quotaspace = 1
88  if re.search(re.compile('Disk quota exceeded',re.M|re.I), line):
89  quotaspace = 1
90  if re.search(re.compile('CERN report: Job Killed',re.M), line):
91  killed = 1
92  if re.search(re.compile('Job finished',re.M), line):
93  finished = 1
94  if re.search(re.compile('connection timed out',re.M), line):
95  timeout = 1
96  if re.search(re.compile('ConfigFileReadError',re.M), line):
97  cfgerr = 1
98  if re.search(re.compile('0 bytes transferred',re.M), line):
99  emptyDatOnFarm = 1
100  if re.search(re.compile('command not found',re.M), line):
101  cmdNotFound = 1
102  # AP 26.11.2009 Insufficient privileges to rfcp files
103  if re.search(re.compile('stage_put: Insufficient user privileges',re.M), line):
104  insuffPriv = 1
105  # AP 05.11.2015 Extract cpu-time.
106  # STDOUT doesn't contain NCU anymore. Now KSI2K and HS06 seconds are displayed.
107  # The ncuFactor is calculated from few samples by comparing KSI2K seconds with
108  # CPU time from email.
109  match = re.search(re.compile('This process used .+?(\d+) KSI2K seconds',re.M|re.I), line)
110  if match:
111  cpuFactor = 2.125
112  cputime = int(round(int(match.group(1))/cpuFactor)) # match.group(1) is the matched digit
113 
114  # gzip it afterwards:
115  print('gzip -f '+stdOut)
116  os.system('gzip -f '+stdOut)
117  except IOError as e:
118  if e.args == (2, "No such file or directory"):
119  print("mps_check.py cannot find", stdOut, "to test")
120  else:
121  raise
122 
123  # check HTCondor log file
124  try:
125  log_file = os.path.join("jobData", lib.JOBDIR[i], "HTCJOB")
126  condor_log = subprocess.check_output(["condor_q", lib.JOBID[i],
127  "-userlog", log_file,
128  "-af",
129  "RemoteSysCpu",
130  "JobStatus",
131  "RemoveReason"],
132  stderr = subprocess.STDOUT)
133  condor_log = condor_log.split()
134 
135  cputime = int(round(float(condor_log[0])))
136 
137  if condor_log[1] == "3": # JobStatus == Removed
138  killed = 1
139  kill_reason = " ".join(condor_log[2:])
140 
141  except subprocess.CalledProcessError as e:
142  pass
143 
144 
145  # GF: This file is not produced (anymore...) -> check for existence and read-access added
146  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/cmsRun.out'
147  if os.access(eazeLog, os.R_OK):
148  # open the input file
149  with open(eazeLog, "r") as INFILE:
150  # scan records in input file
151  for line in INFILE:
152  # check if end of file has been reached
153  if re.search(re.compile('<StorageStatistics>',re.M), line):
154  eofile = 1
155  if re.search(re.compile('Time limit reached\.',re.M), line):
156  timel = 1
157  if re.search(re.compile('gives I\/O problem',re.M), line):
158  ioprob = 1
159  if re.search(re.compile('FrameworkError ExitStatus=[\'\"]8001[\'\"]',re.M), line):
160  fw8001 = 1
161  if re.search(re.compile('too many tracks',re.M), line):
162  tooManyTracks = 1
163  if re.search(re.compile('segmentation violation',re.M), line):
164  segviol = 1
165  if re.search(re.compile('failed RFIO error',re.M), line):
166  rfioerr = 1
167  if re.search(re.compile('Request exceeds quota',re.M), line):
168  quota = 1
169 
170  # if there is an alignment.log[.gz] file, check it as well
171  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/alignment.log'
172  logZipped = 'no'
173  # unzip the logfile if necessary
174  if os.access(eazeLog+'.gz', os.R_OK):
175  os.system('gunzip '+eazeLog+'.gz')
176  logZipped = 'true'
177 
178  if os.access(eazeLog, os.R_OK): # access to alignment.log
179  # open the input file
180  with open(eazeLog,'r') as INFILE:
181  # scan records in input file
182  for line in INFILE:
183  # check if end of file has been reached
184  if re.search(re.compile('<StorageStatistics>',re.M), line):
185  eofile = 1
186  if re.search(re.compile('EAZE\. Time limit reached\.',re.M), line):
187  timel = 1
188  if re.search(re.compile('GAF gives I\/O problem',re.M), line):
189  ioprob = 1
190  if re.search(re.compile('FrameworkError ExitStatus=[\'\"]8001[\'\"]',re.M), line):
191  fw8001 = 1
192  if re.search(re.compile('too many tracks',re.M), line):
193  tooManyTracks = 1
194  if re.search(re.compile('segmentation violation',re.M), line):
195  segviol = 1
196  if re.search(re.compile('failed RFIO error',re.M), line):
197  rfioerr = 1
198  if re.search(re.compile('Request exceeds quota',re.M), line):
199  quota = 1
200  # check for newer (e.g. CMSSW_5_1_X) and older CMSSW:
201  if re.search(re.compile('Fatal Exception',re.M), line):
202  exceptionCaught = 1
203  if re.search(re.compile('Exception caught in cmsRun',re.M), line):
204  exceptionCaught = 1
205  # AP 07.09.2009 - Check that the job got to a normal end
206  if re.search(re.compile('AlignmentProducerAsAnalyzer::endJob\(\)',re.M), line):
207  endofjob = 1
208  if re.search(re.compile('FwkReport -i main_input:sourc',re.M), line):
209  array = line.split()
210  nEvent = int(array[5])
211  if nEvent==0 and re.search(re.compile('FwkReport -i PostSource',re.M), line):
212  array = line.split()
213  nEvent = int(array[5])
214  # AP 31.07.2009 - To read number of events in CMSSW_3_2_2_patch2
215  if nEvent==0 and re.search(re.compile('FwkReport -i AfterSource',re.M), line):
216  array = line.split()
217  nEvent = int(array[5])
218 
219  if logZipped == 'true':
220  os.system('gzip '+eazeLog)
221 
222  else: # no access to alignment.log
223  print('mps_check.py cannot find',eazeLog,'to test')
224  # AP 07.09.2009 - The following check cannot be done: set to 1 to avoid fake error type
225  endofjob = 1
226 
227  # for mille jobs checks that milleBinary file is not empty
228  if i<lib.nJobs: # mille job!
229  milleOut = 'milleBinary%03d.dat' % (i+1)
230  # from Perl, should be deprecated because of cmsls and nsls
231  #(not translated to python, yes I'm lazy... use subprocess.checkout if needed):
232  #$mOutSize = `nsls -l $mssDir | grep $milleOut | head -1 | awk '{print \$5}'`;
233  #$mOutSize = `cmsLs -l $mssDir | grep $milleOut | head -1 | awk '{print \$2}'`;
234  mOutSize = 0
235  for line in eoslsoutput:
236  if milleOut in line:
237  columns = line.split()
238  mOutSize = columns[4] # 5th column = size
239  if not (mOutSize>0):
240  emptyDatErr = 1
241 
242  # merge jobs: additional checks for merging job
243  else:
244  # if there is a pede.dump file check it as well
245  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/pede.dump'
246  if os.access(eazeLog+'.gz', os.R_OK):
247  # unzip - but clean before and save to tmp
248  os.system('rm -f /tmp/pede.dump')
249  os.system('gunzip -c '+eazeLog+'.gz > /tmp/pede.dump')
250  eazeLog = '/tmp/pede.dump'
251  if os.access(eazeLog, os.R_OK):
252  with open(eazeLog, "r") as INFILE: # open pede.dump
253  # scan records in INFILE
254  pedeAbend = 1
255  usedPedeMem = 0.
256  for line in INFILE:
257  # check if pede has reached its normal end
258  if re.search(re.compile('Millepede II.* ending',re.M), line):
259  pedeAbend = 0
260  # extract memory usage
261  match = re.search(re.compile('Peak dynamic memory allocation: (.+) GB',re.I), line)
262  if match:
263  mem = match.group(1)
264  mem = re.sub('\s', '', mem)
265  # if mem is a float
266  if re.search(re.compile('^\d+\.\d+$',re.M), mem):
267  usedPedeMem = float(mem)
268  else:
269  print('mps_check.py: Found Pede peak memory allocation but extracted number is not a float:',mem)
270 
271  # check memory usage
272  # no point in asking if lib.pedeMem is defined. Initialized as lib.pedeMem=-1
273  if lib.pedeMem > 0 and usedPedeMem > 0.:
274  memoryratio = usedPedeMem /(lib.pedeMem/1024.)
275  # print a warning if more than approx. 4 GB have been
276  # requested of which less than 75% are used by Pede
277  if lib.pedeMem > 4000 and memoryratio < 0.75 :
278  msg = ("Warning: {0:.2f} GB of memory for Pede "
279  "requested, but only {1:.1f}% of it has been "
280  "used! Consider to request less memory in order "
281  "to save resources.")
282  print(msg.format(lib.pedeMem/1024.0, memoryratio*100))
283  elif memoryratio > 1 :
284  msg = ("Warning: {0:.2f} GB of memory for Pede "
285  "requested, but {1:.1f}% of this has been "
286  "used! Consider to request more memory to avoid "
287  "premature removal of the job by the admin.")
288  print(msg.format(lib.pedeMem/1024.0, memoryratio*100))
289  else:
290  msg = ("Info: Used {0:.1f}% of {1:.2f} GB of memory "
291  "which has been requested for Pede.")
292  print(msg.format(memoryratio*100, lib.pedeMem/1024.0))
293 
294 
295  # clean up /tmp/pede.dump if needed
296  if eazeLog == '/tmp/pede.dump':
297  os.system('rm /tmp/pede.dump')
298 
299  # pede.dump not found or no read-access
300  else:
301  print('mps_check.py cannot find',eazeLog,'to test')
302 
303  # if there is a millepede.log file, check it as well
304  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/millepede.log'
305  logZipped = 'no'
306  if os.access(eazeLog+'.gz', os.R_OK):
307  os.system('gunzip '+eazeLog+'.gz')
308  logZipped = 'true'
309 
310  if os.access(eazeLog, os.R_OK):
311  # open log file
312  with open(eazeLog, "r") as INFILE:
313  # scan records in input file
314  for line in INFILE:
315  # Checks for Pede Errors
316  if re.search(re.compile('step no descending',re.M), line):
317  pedeLogErr = 1
318  pedeLogErrStr += line
319  if re.search(re.compile('Constraint equation discrepancies:',re.M), line):
320  pedeLogErr = 1
321  pedeLogErrStr += line
322  # AP 07.09.2009 - Checks for Pede Warnings:
323  if re.search(re.compile('insufficient constraint equations',re.M), line):
324  pedeLogWrn = 1
325  pedeLogWrnStr += line
326 
327  if logZipped == 'true':
328  os.system('gzip '+eazeLog)
329  else:
330  print('mps_check.py cannot find',eazeLog,'to test')
331 
332 
333  # check millepede.end -- added F. Meier 03.03.2015
334  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/millepede.end'
335  logZipped = 'no'
336  if os.access(eazeLog+'.gz', os.R_OK):
337  os.system('gunzip'+eazeLog+'.gz')
338  logZipped = 'true'
339 
340  if os.access(eazeLog, os.R_OK):
341  # open log file
342  with open(eazeLog, "r") as INFILE:
343  # scan records in input file
344  for line in INFILE:
345  # Checks for the output code. 0 is OK, 1 is WARN, anything else is FAIL
346  # searches the line for a number with or without a sign
347  match = re.search(re.compile('([-+]?\d+)',re.M), line)
348  if match:
349  if int(match.group(1)) == 1:
350  pedeLogWrn = 1
351  pedeLogWrnStr += line
352  elif int(match.group(1)) != 0:
353  pedeLogErr = 1
354  pedeLogErrStr += line
355  if logZipped == 'true':
356  os.system('gzip '+eazeLog)
357  else:
358  print('mps_check.py cannot find',eazeLog,'to test')
359 
360  # end of merge job checks
361  # evaluate Errors:
362  farmhost = ' '
363 
364  okStatus = 'OK'
365  if not eofile == 1:
366  print(lib.JOBDIR[i],lib.JOBID[i],'did not reach end of file')
367  okStatus = 'ABEND'
368  if quotaspace == 1:
369  print(lib.JOBDIR[i],lib.JOBID[i],'had quota space problem')
370  okStatus = 'FAIL'
371  remark = 'eos quota space problem'
372  if ioprob == 1:
373  print(lib.JOBDIR[i],lib.JOBID[i],'had I/O problem')
374  okStatus = 'FAIL'
375  if fw8001 == 1:
376  print(lib.JOBDIR[i],lib.JOBID[i],'had Framework error 8001 problem')
377  remark = 'fwk error 8001'
378  okStatus = 'FAIL'
379  if timeout == 1:
380  print(lib.JOBDIR[i],lib.JOBID[i],'had connection timed out problem')
381  remark = 'connection timed out'
382  if cfgerr == 1:
383  print(lib.JOBDIR[i],lib.JOBID[i],'had config file error')
384  remark = 'cfg file error'
385  okStatus = 'FAIL'
386  if killed == 1:
387  guess = " (probably time exceeded)" if kill_reason is None else ":"
388  print(lib.JOBDIR[i], lib.JOBID[i], "Job killed" + guess)
389  if kill_reason is not None: print(kill_reason)
390  remark = "killed";
391  okStatus = "FAIL"
392  if timel == 1:
393  print(lib.JOBDIR[i],lib.JOBID[i],'ran into time limit')
394  okStatus = 'TIMEL'
395  if tooManyTracks == 1:
396  print(lib.JOBDIR[i],lib.JOBID[i],'too many tracks')
397  if segviol == 1:
398  print(lib.JOBDIR[i],lib.JOBID[i],'SEGVIOL encountered')
399  remark = 'seg viol'
400  okStatus = 'FAIL'
401  if rfioerr == 1:
402  print(lib.JOBDIR[i],lib.JOBID[i],'RFIO error encountered')
403  remark = 'rfio error'
404  okStatus = 'FAIL'
405  if quota == 1:
406  print(lib.JOBDIR[i],lib.JOBID[i],'Request exceeds quota')
407  if exceptionCaught == 1:
408  print(lib.JOBDIR[i],lib.JOBID[i],'Exception caught in cmsrun')
409  remark = 'Exception caught'
410  okStatus = 'FAIL'
411  if emptyDatErr == 1:
412  print('milleBinary???.dat file not found or empty')
413  remark = 'empty milleBinary'
414  if emptyDatOnFarm > 0:
415  print('...but already empty on farm so OK (or check job',i+1,'yourself...)')
416  else:
417  okStatus = 'FAIL'
418  if cmdNotFound == 1:
419  print(lib.JOBDIR[i],lib.JOBID[i],'Command not found')
420  remark = 'cmd not found'
421  okStatus = 'FAIL'
422  if insuffPriv == 1:
423  print(lib.JOBDIR[i],lib.JOBID[i],'Insufficient privileges to rfcp files')
424  remark = 'Could not rfcp files'
425  okStatus = 'FAIL'
426  if pedeAbend == 1:
427  print(lib.JOBDIR[i],lib.JOBID[i],'Pede did not end normally')
428  remark = 'pede failed'
429  okStatus = 'FAIL'
430  if pedeLogErr == 1:
431  print(lib.JOBDIR[i],lib.JOBID[i],'Problems in running Pede:')
432  print(pedeLogErrStr)
433  remark = 'pede error'
434  okStatus = 'FAIL'
435  if pedeLogWrn == 1:
436  # AP 07.09.2009 - Reports Pede Warnings (but do _not_ set job status to FAIL)
437  print(lib.JOBDIR[i],lib.JOBID[i],'Warnings in running Pede:')
438  print(pedeLogWrnStr)
439  remark = 'pede warnings'
440  okStatus = 'WARN'
441  if endofjob != 1:
442  print(lib.JOBDIR[i],lib.JOBID[i],'Job not ended')
443  remark = 'job not ended'
444  okStatus = 'FAIL'
445 
446  # print warning line to stdout
447  if okStatus != "OK":
448  print(lib.JOBDIR[i],lib.JOBID[i],' -------- ',okStatus)
449 
450  # update number of events
451  lib.JOBNEVT[i] = nEvent
452  # udate Jobstatus
453  lib.JOBSTATUS[i] = disabled+okStatus
454  # update cputime
455  lib.JOBRUNTIME[i] = cputime
456  # update remark
457  lib.JOBREMARK[i] = remark
458  # update host
459  ##lib.JOBHOST[i] = farmhost
460 
461 lib.write_db()
462 
S & print(S &os, JobReport::InputFile const &f)
Definition: JobReport.cc:65
static std::string join(char **cmd)
Definition: RemoteFile.cc:18
double split
Definition: MVATrainer.cc:139