CMS 3D CMS Logo

mps_check.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 #
3 # This script checks outputs from jobs that have FETCH status and updates if errors occured
4 # -> check STDOUT files
5 # -> check cmsRun.out
6 # -> check alignment.log
7 # -> check if millebinaries are on eos
8 # -> check pede.dump
9 # -> check millepede.log
10 # -> check millepede.end
11 #
12 # It also retirieves number of events from alignment.log and cputime from STDOUT
13 
14 import Alignment.MillePedeAlignmentAlgorithm.mpslib.Mpslibclass as mpslib
15 import subprocess
16 import re
17 import os
18 
19 lib = mpslib.jobdatabase()
20 lib.read_db()
21 
22 # create a list of eos ls entries containing files on eos binary store
23 command = ["ls", "-l", os.path.join(lib.mssDir, "binaries")]
24 try:
25  eoslsoutput = subprocess.check_output(command, stderr=subprocess.STDOUT).split('\n')
26 except subprocess.CalledProcessError:
27  eoslsoutput = ""
28 
29 # loop over FETCH jobs
30 for i in xrange(len(lib.JOBID)):
31  # FIXME use bools?
32  batchSuccess = 0
33  batchExited = 0
34  finished = 0
35  endofjob = 0
36  eofile = 1 # do not deal with timel yet
37  timel = 0
38  killed = 0
39  ioprob = 0
40  fw8001 = 0
41  tooManyTracks = 0
42  segviol = 0
43  rfioerr = 0
44  quota = 0
45  nEvent = 0
46  cputime = -1
47  pedeAbend = 0
48  pedeLogErr = 0
49  pedeLogWrn = 0
50  exceptionCaught = 0
51  timeout = 0
52  cfgerr = 0
53  emptyDatErr = 0
54  emptyDatOnFarm = 0
55  cmdNotFound = 0
56  insuffPriv = 0
57  quotaspace = 0
58 
59  kill_reason = None
60  pedeLogErrStr = ""
61  pedeLogWrnStr = ""
62  remark = ""
63 
64  disabled = "";
65  if 'DISABLED' in lib.JOBSTATUS[i]:
66  disabled = 'DISABLED'
67 
68  if 'FETCH' in lib.JOBSTATUS[i]:
69 
70  # open the STDOUT file
71  stdOut = 'jobData/'+lib.JOBDIR[i]+'/STDOUT'
72  # unzip the STDOUT file if necessary
73  if os.access(stdOut+'.gz', os.R_OK):
74  os.system('gunzip '+stdOut+'.gz')
75 
76  try:
77  with open(stdOut, "r") as STDFILE:
78  # scan records in input file.
79  # use regular expression to search. re.compile needed for options re.M and re.I
80  # re.M=re.MULTILINE enables matching of newline char
81  # re.I=re.IGNORECASE makes matching case-insensitive.
82  for line in STDFILE:
83  if re.search(re.compile('Unable to access quota space',re.M|re.I), line):
84  quotaspace = 1
85  if re.search(re.compile('Unable to get quota space',re.M|re.I), line):
86  quotaspace = 1
87  if re.search(re.compile('Disk quota exceeded',re.M|re.I), line):
88  quotaspace = 1
89  if re.search(re.compile('CERN report: Job Killed',re.M), line):
90  killed = 1
91  if re.search(re.compile('Job finished',re.M), line):
92  finished = 1
93  if re.search(re.compile('connection timed out',re.M), line):
94  timeout = 1
95  if re.search(re.compile('ConfigFileReadError',re.M), line):
96  cfgerr = 1
97  if re.search(re.compile('0 bytes transferred',re.M), line):
98  emptyDatOnFarm = 1
99  if re.search(re.compile('command not found',re.M), line):
100  cmdNotFound = 1
101  # AP 26.11.2009 Insufficient privileges to rfcp files
102  if re.search(re.compile('stage_put: Insufficient user privileges',re.M), line):
103  insuffPriv = 1
104  # AP 05.11.2015 Extract cpu-time.
105  # STDOUT doesn't contain NCU anymore. Now KSI2K and HS06 seconds are displayed.
106  # The ncuFactor is calculated from few samples by comparing KSI2K seconds with
107  # CPU time from email.
108  match = re.search(re.compile('This process used .+?(\d+) KSI2K seconds',re.M|re.I), line)
109  if match:
110  cpuFactor = 2.125
111  cputime = int(round(int(match.group(1))/cpuFactor)) # match.group(1) is the matched digit
112 
113  # gzip it afterwards:
114  print 'gzip -f '+stdOut
115  os.system('gzip -f '+stdOut)
116  except IOError as e:
117  if e.args == (2, "No such file or directory"):
118  print "mps_check.py cannot find", stdOut, "to test"
119  else:
120  raise
121 
122  # check HTCondor log file
123  try:
124  log_file = os.path.join("jobData", lib.JOBDIR[i], "HTCJOB")
125  condor_log = subprocess.check_output(["condor_q", lib.JOBID[i],
126  "-userlog", log_file,
127  "-af",
128  "RemoteSysCpu",
129  "JobStatus",
130  "RemoveReason"],
131  stderr = subprocess.STDOUT)
132  condor_log = condor_log.split()
133 
134  cputime = int(round(float(condor_log[0])))
135 
136  if condor_log[1] == "3": # JobStatus == Removed
137  killed = 1
138  kill_reason = " ".join(condor_log[2:])
139 
140  except subprocess.CalledProcessError as e:
141  pass
142 
143 
144  # GF: This file is not produced (anymore...) -> check for existence and read-access added
145  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/cmsRun.out'
146  if os.access(eazeLog, os.R_OK):
147  # open the input file
148  with open(eazeLog, "r") as INFILE:
149  # scan records in input file
150  for line in INFILE:
151  # check if end of file has been reached
152  if re.search(re.compile('<StorageStatistics>',re.M), line):
153  eofile = 1
154  if re.search(re.compile('Time limit reached\.',re.M), line):
155  timel = 1
156  if re.search(re.compile('gives I\/O problem',re.M), line):
157  ioprob = 1
158  if re.search(re.compile('FrameworkError ExitStatus=[\'\"]8001[\'\"]',re.M), line):
159  fw8001 = 1
160  if re.search(re.compile('too many tracks',re.M), line):
161  tooManyTracks = 1
162  if re.search(re.compile('segmentation violation',re.M), line):
163  segviol = 1
164  if re.search(re.compile('failed RFIO error',re.M), line):
165  rfioerr = 1
166  if re.search(re.compile('Request exceeds quota',re.M), line):
167  quota = 1
168 
169  # if there is an alignment.log[.gz] file, check it as well
170  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/alignment.log'
171  logZipped = 'no'
172  # unzip the logfile if necessary
173  if os.access(eazeLog+'.gz', os.R_OK):
174  os.system('gunzip '+eazeLog+'.gz')
175  logZipped = 'true'
176 
177  if os.access(eazeLog, os.R_OK): # access to alignment.log
178  # open the input file
179  with open(eazeLog,'r') as INFILE:
180  # scan records in input file
181  for line in INFILE:
182  # check if end of file has been reached
183  if re.search(re.compile('<StorageStatistics>',re.M), line):
184  eofile = 1
185  if re.search(re.compile('EAZE\. Time limit reached\.',re.M), line):
186  timel = 1
187  if re.search(re.compile('GAF gives I\/O problem',re.M), line):
188  ioprob = 1
189  if re.search(re.compile('FrameworkError ExitStatus=[\'\"]8001[\'\"]',re.M), line):
190  fw8001 = 1
191  if re.search(re.compile('too many tracks',re.M), line):
192  tooManyTracks = 1
193  if re.search(re.compile('segmentation violation',re.M), line):
194  segviol = 1
195  if re.search(re.compile('failed RFIO error',re.M), line):
196  rfioerr = 1
197  if re.search(re.compile('Request exceeds quota',re.M), line):
198  quota = 1
199  # check for newer (e.g. CMSSW_5_1_X) and older CMSSW:
200  if re.search(re.compile('Fatal Exception',re.M), line):
201  exceptionCaught = 1
202  if re.search(re.compile('Exception caught in cmsRun',re.M), line):
203  exceptionCaught = 1
204  # AP 07.09.2009 - Check that the job got to a normal end
205  if re.search(re.compile('AlignmentProducerAsAnalyzer::endJob\(\)',re.M), line):
206  endofjob = 1
207  if re.search(re.compile('FwkReport -i main_input:sourc',re.M), line):
208  array = line.split()
209  nEvent = int(array[5])
210  if nEvent==0 and re.search(re.compile('FwkReport -i PostSource',re.M), line):
211  array = line.split()
212  nEvent = int(array[5])
213  # AP 31.07.2009 - To read number of events in CMSSW_3_2_2_patch2
214  if nEvent==0 and re.search(re.compile('FwkReport -i AfterSource',re.M), line):
215  array = line.split()
216  nEvent = int(array[5])
217 
218  if logZipped == 'true':
219  os.system('gzip '+eazeLog)
220 
221  else: # no access to alignment.log
222  print 'mps_check.py cannot find',eazeLog,'to test'
223  # AP 07.09.2009 - The following check cannot be done: set to 1 to avoid fake error type
224  endofjob = 1
225 
226  # for mille jobs checks that milleBinary file is not empty
227  if i<lib.nJobs: # mille job!
228  milleOut = 'milleBinary%03d.dat' % (i+1)
229  # from Perl, should be deprecated because of cmsls and nsls
230  #(not translated to python, yes I'm lazy... use subprocess.checkout if needed):
231  #$mOutSize = `nsls -l $mssDir | grep $milleOut | head -1 | awk '{print \$5}'`;
232  #$mOutSize = `cmsLs -l $mssDir | grep $milleOut | head -1 | awk '{print \$2}'`;
233  mOutSize = 0
234  for line in eoslsoutput:
235  if milleOut in line:
236  columns = line.split()
237  mOutSize = columns[4] # 5th column = size
238  if not (mOutSize>0):
239  emptyDatErr = 1
240 
241  # merge jobs: additional checks for merging job
242  else:
243  # if there is a pede.dump file check it as well
244  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/pede.dump'
245  if os.access(eazeLog+'.gz', os.R_OK):
246  # unzip - but clean before and save to tmp
247  os.system('rm -f /tmp/pede.dump')
248  os.system('gunzip -c '+eazeLog+'.gz > /tmp/pede.dump')
249  eazeLog = '/tmp/pede.dump'
250  if os.access(eazeLog, os.R_OK):
251  with open(eazeLog, "r") as INFILE: # open pede.dump
252  # scan records in INFILE
253  pedeAbend = 1
254  usedPedeMem = 0.
255  for line in INFILE:
256  # check if pede has reached its normal end
257  if re.search(re.compile('Millepede II.* ending',re.M), line):
258  pedeAbend = 0
259  # extract memory usage
260  match = re.search(re.compile('Peak dynamic memory allocation: (.+) GB',re.I), line)
261  if match:
262  mem = match.group(1)
263  mem = re.sub('\s', '', mem)
264  # if mem is a float
265  if re.search(re.compile('^\d+\.\d+$',re.M), mem):
266  usedPedeMem = float(mem)
267  else:
268  print 'mps_check.py: Found Pede peak memory allocation but extracted number is not a float:',mem
269 
270  # check memory usage
271  # no point in asking if lib.pedeMem is defined. Initialized as lib.pedeMem=-1
272  if lib.pedeMem > 0 and usedPedeMem > 0.:
273  memoryratio = usedPedeMem /(lib.pedeMem/1024.)
274  # print a warning if more than approx. 4 GB have been
275  # requested of which less than 75% are used by Pede
276  if lib.pedeMem > 4000 and memoryratio < 0.75 :
277  msg = ("Warning: {0:.2f} GB of memory for Pede "
278  "requested, but only {1:.1f}% of it has been "
279  "used! Consider to request less memory in order "
280  "to save resources.")
281  print msg.format(lib.pedeMem/1024.0, memoryratio*100)
282  elif memoryratio > 1 :
283  msg = ("Warning: {0:.2f} GB of memory for Pede "
284  "requested, but {1:.1f}% of this has been "
285  "used! Consider to request more memory to avoid "
286  "premature removal of the job by the admin.")
287  print msg.format(lib.pedeMem/1024.0, memoryratio*100)
288  else:
289  msg = ("Info: Used {0:.1f}% of {1:.2f} GB of memory "
290  "which has been requested for Pede.")
291  print msg.format(memoryratio*100, lib.pedeMem/1024.0)
292 
293 
294  # clean up /tmp/pede.dump if needed
295  if eazeLog == '/tmp/pede.dump':
296  os.system('rm /tmp/pede.dump')
297 
298  # pede.dump not found or no read-access
299  else:
300  print 'mps_check.py cannot find',eazeLog,'to test'
301 
302  # if there is a millepede.log file, check it as well
303  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/millepede.log'
304  logZipped = 'no'
305  if os.access(eazeLog+'.gz', os.R_OK):
306  os.system('gunzip '+eazeLog+'.gz')
307  logZipped = 'true'
308 
309  if os.access(eazeLog, os.R_OK):
310  # open log file
311  with open(eazeLog, "r") as INFILE:
312  # scan records in input file
313  for line in INFILE:
314  # Checks for Pede Errors
315  if re.search(re.compile('step no descending',re.M), line):
316  pedeLogErr = 1
317  pedeLogErrStr += line
318  if re.search(re.compile('Constraint equation discrepancies:',re.M), line):
319  pedeLogErr = 1
320  pedeLogErrStr += line
321  # AP 07.09.2009 - Checks for Pede Warnings:
322  if re.search(re.compile('insufficient constraint equations',re.M), line):
323  pedeLogWrn = 1
324  pedeLogWrnStr += line
325 
326  if logZipped == 'true':
327  os.system('gzip '+eazeLog)
328  else:
329  print 'mps_check.py cannot find',eazeLog,'to test'
330 
331 
332  # check millepede.end -- added F. Meier 03.03.2015
333  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/millepede.end'
334  logZipped = 'no'
335  if os.access(eazeLog+'.gz', os.R_OK):
336  os.system('gunzip'+eazeLog+'.gz')
337  logZipped = 'true'
338 
339  if os.access(eazeLog, os.R_OK):
340  # open log file
341  with open(eazeLog, "r") as INFILE:
342  # scan records in input file
343  for line in INFILE:
344  # Checks for the output code. 0 is OK, 1 is WARN, anything else is FAIL
345  # searches the line for a number with or without a sign
346  match = re.search(re.compile('([-+]?\d+)',re.M), line)
347  if match:
348  if int(match.group(1)) == 1:
349  pedeLogWrn = 1
350  pedeLogWrnStr += line
351  elif int(match.group(1)) != 0:
352  pedeLogErr = 1
353  pedeLogErrStr += line
354  if logZipped == 'true':
355  os.system('gzip '+eazeLog)
356  else:
357  print 'mps_check.py cannot find',eazeLog,'to test'
358 
359  # end of merge job checks
360  # evaluate Errors:
361  farmhost = ' '
362 
363  okStatus = 'OK'
364  if not eofile == 1:
365  print lib.JOBDIR[i],lib.JOBID[i],'did not reach end of file'
366  okStatus = 'ABEND'
367  if quotaspace == 1:
368  print lib.JOBDIR[i],lib.JOBID[i],'had quota space problem'
369  okStatus = 'FAIL'
370  remark = 'eos quota space problem'
371  if ioprob == 1:
372  print lib.JOBDIR[i],lib.JOBID[i],'had I/O problem'
373  okStatus = 'FAIL'
374  if fw8001 == 1:
375  print lib.JOBDIR[i],lib.JOBID[i],'had Framework error 8001 problem'
376  remark = 'fwk error 8001'
377  okStatus = 'FAIL'
378  if timeout == 1:
379  print lib.JOBDIR[i],lib.JOBID[i],'had connection timed out problem'
380  remark = 'connection timed out'
381  if cfgerr == 1:
382  print lib.JOBDIR[i],lib.JOBID[i],'had config file error'
383  remark = 'cfg file error'
384  okStatus = 'FAIL'
385  if killed == 1:
386  guess = " (probably time exceeded)" if kill_reason is None else ":"
387  print lib.JOBDIR[i], lib.JOBID[i], "Job killed" + guess
388  if kill_reason is not None: print kill_reason
389  remark = "killed";
390  okStatus = "FAIL"
391  if timel == 1:
392  print lib.JOBDIR[i],lib.JOBID[i],'ran into time limit'
393  okStatus = 'TIMEL'
394  if tooManyTracks == 1:
395  print lib.JOBDIR[i],lib.JOBID[i],'too many tracks'
396  if segviol == 1:
397  print lib.JOBDIR[i],lib.JOBID[i],'SEGVIOL encountered'
398  remark = 'seg viol'
399  okStatus = 'FAIL'
400  if rfioerr == 1:
401  print lib.JOBDIR[i],lib.JOBID[i],'RFIO error encountered'
402  remark = 'rfio error'
403  okStatus = 'FAIL'
404  if quota == 1:
405  print lib.JOBDIR[i],lib.JOBID[i],'Request exceeds quota'
406  if exceptionCaught == 1:
407  print lib.JOBDIR[i],lib.JOBID[i],'Exception caught in cmsrun'
408  remark = 'Exception caught'
409  okStatus = 'FAIL'
410  if emptyDatErr == 1:
411  print 'milleBinary???.dat file not found or empty'
412  remark = 'empty milleBinary'
413  if emptyDatOnFarm > 0:
414  print '...but already empty on farm so OK (or check job',i+1,'yourself...)'
415  else:
416  okStatus = 'FAIL'
417  if cmdNotFound == 1:
418  print lib.JOBDIR[i],lib.JOBID[i],'Command not found'
419  remark = 'cmd not found'
420  okStatus = 'FAIL'
421  if insuffPriv == 1:
422  print lib.JOBDIR[i],lib.JOBID[i],'Insufficient privileges to rfcp files'
423  remark = 'Could not rfcp files'
424  okStatus = 'FAIL'
425  if pedeAbend == 1:
426  print lib.JOBDIR[i],lib.JOBID[i],'Pede did not end normally'
427  remark = 'pede failed'
428  okStatus = 'FAIL'
429  if pedeLogErr == 1:
430  print lib.JOBDIR[i],lib.JOBID[i],'Problems in running Pede:'
431  print pedeLogErrStr
432  remark = 'pede error'
433  okStatus = 'FAIL'
434  if pedeLogWrn == 1:
435  # AP 07.09.2009 - Reports Pede Warnings (but do _not_ set job status to FAIL)
436  print lib.JOBDIR[i],lib.JOBID[i],'Warnings in running Pede:'
437  print pedeLogWrnStr
438  remark = 'pede warnings'
439  okStatus = 'WARN'
440  if endofjob != 1:
441  print lib.JOBDIR[i],lib.JOBID[i],'Job not ended'
442  remark = 'job not ended'
443  okStatus = 'FAIL'
444 
445  # print warning line to stdout
446  if okStatus != "OK":
447  print lib.JOBDIR[i],lib.JOBID[i],' -------- ',okStatus
448 
449  # update number of events
450  lib.JOBNEVT[i] = nEvent
451  # udate Jobstatus
452  lib.JOBSTATUS[i] = disabled+okStatus
453  # update cputime
454  lib.JOBRUNTIME[i] = cputime
455  # update remark
456  lib.JOBREMARK[i] = remark
457  # update host
458  ##lib.JOBHOST[i] = farmhost
459 
460 lib.write_db()
461 
static std::string join(char **cmd)
Definition: RemoteFile.cc:18
double split
Definition: MVATrainer.cc:139