CMS 3D CMS Logo

mps_check.py
Go to the documentation of this file.
#!/usr/bin/env python
#
# This script checks the outputs of jobs that have FETCH status and updates
# their entries in the job database if errors occurred
#  -> check STDOUT files
#  -> check cmsRun.out
#  -> check alignment.log
#  -> check if millebinaries are on eos
#  -> check pede.dump
#  -> check millepede.log
#  -> check millepede.end
#
# It also retrieves the number of events from alignment.log and the cputime from STDOUT
13 
import errno
import os
import re
import subprocess

import Alignment.MillePedeAlignmentAlgorithm.mpslib.Mpslibclass as mpslib
18 
# Load the job database; the per-job checks read and update this object.
lib = mpslib.jobdatabase()
lib.read_db()

# Take one long listing of the directory named by lib.mssDir (the eos area,
# according to the original comment) and keep it as a list of lines, so the
# milleBinary lookup for each job does not need a shell call of its own.
command = ' ls -l ' + lib.mssDir
lsListing = subprocess.check_output(command, shell=True,
                                    stderr=subprocess.STDOUT)
eoslsoutput = lsListing.split('\n')
25 
26 # loop over FETCH jobs
# ---------------------------------------------------------------------------
# Search patterns, compiled once instead of once per scanned line.
#
# Every table entry is (flag name, compiled pattern).  A flag is raised for a
# job as soon as any line of the corresponding file matches its pattern.
# re.M=re.MULTILINE and re.I=re.IGNORECASE are kept exactly as in the
# original per-line calls.
# ---------------------------------------------------------------------------
STDOUT_PATTERNS = (
    ('quotaspace', re.compile(r'Unable to access quota space', re.M | re.I)),
    ('quotaspace', re.compile(r'Unable to get quota space', re.M | re.I)),
    ('quotaspace', re.compile(r'Disk quota exceeded', re.M | re.I)),
    ('killed', re.compile(r'CERN report: Job Killed', re.M)),
    # 'finished' is recorded but not used in the evaluation below.
    ('finished', re.compile(r'Job finished', re.M)),
    ('timeout', re.compile(r'connection timed out', re.M)),
    ('cfgerr', re.compile(r'ConfigFileReadError', re.M)),
    ('emptyDatOnFarm', re.compile(r'0 bytes transferred', re.M)),
    ('cmdNotFound', re.compile(r'command not found', re.M)),
    # AP 26.11.2009 Insufficient privileges to rfcp files
    ('insuffPriv',
     re.compile(r'stage_put: Insufficient user privileges', re.M)),
)

# AP 05.11.2015 Extract cpu-time.
# STDOUT doesn't contain NCU anymore. Now KSI2K and HS06 seconds are displayed.
# The factor below was calculated from few samples by comparing KSI2K seconds
# with CPU time from email.
CPU_TIME_PATTERN = re.compile(r'This process used .+?(\d+) KSI2K seconds',
                              re.M | re.I)
CPU_FACTOR = 2.125

# Problems looked for in both cmsRun.out and alignment.log.
_COMMON_LOG_PATTERNS = (
    ('eofile', re.compile(r'<StorageStatistics>', re.M)),
    ('fw8001',
     re.compile(r"""FrameworkError ExitStatus=['"]8001['"]""", re.M)),
    ('tooManyTracks', re.compile(r'too many tracks', re.M)),
    ('segviol', re.compile(r'segmentation violation', re.M)),
    ('rfioerr', re.compile(r'failed RFIO error', re.M)),
    ('quota', re.compile(r'Request exceeds quota', re.M)),
)

CMSRUN_PATTERNS = _COMMON_LOG_PATTERNS + (
    ('timel', re.compile(r'Time limit reached\.', re.M)),
    ('ioprob', re.compile(r'gives I\/O problem', re.M)),
)

ALIGNMENT_LOG_PATTERNS = _COMMON_LOG_PATTERNS + (
    ('timel', re.compile(r'EAZE\. Time limit reached\.', re.M)),
    ('ioprob', re.compile(r'GAF gives I\/O problem', re.M)),
    # check for newer (e.g. CMSSW_5_1_X) and older CMSSW:
    ('exceptionCaught', re.compile(r'Fatal Exception', re.M)),
    ('exceptionCaught', re.compile(r'Exception caught in cmsRun', re.M)),
    # AP 07.09.2009 - Check that the job got to a normal end
    ('endofjob',
     re.compile(r'AlignmentProducerAsAnalyzer::endJob\(\)', re.M)),
)

# Lines of alignment.log whose 6th whitespace-separated field is read as the
# number of events.
MAIN_INPUT_REPORT = re.compile(r'FwkReport -i main_input:sourc', re.M)
POST_SOURCE_REPORT = re.compile(r'FwkReport -i PostSource', re.M)
# AP 31.07.2009 - To read number of events in CMSSW_3_2_2_patch2
AFTER_SOURCE_REPORT = re.compile(r'FwkReport -i AfterSource', re.M)

PEDE_NORMAL_END = re.compile(r'Millepede II.* ending', re.M)
PEDE_PEAK_MEMORY = re.compile(r'Peak dynamic memory allocation: (.+) GB', re.I)
PLAIN_FLOAT = re.compile(r'^\d+\.\d+$', re.M)

# millepede.log: matching lines are collected and printed with the verdict.
MILLEPEDE_LOG_PATTERNS = (
    ('pedeLogErr', re.compile(r'step no descending', re.M)),
    ('pedeLogErr', re.compile(r'Constraint equation discrepancies:', re.M)),
    # AP 07.09.2009 - Checks for Pede Warnings:
    ('pedeLogWrn', re.compile(r'insufficient constraint equations', re.M)),
)

# millepede.end: a number with or without a sign.
PEDE_EXIT_CODE = re.compile(r'([-+]?\d+)', re.M)


def _raise_flags(found, line, patterns):
    """Add to the set `found` the name of every pattern matching `line`."""
    found.update(name for name, regex in patterns if regex.search(line))


def _gunzip(path):
    """Unzip `path`.gz in place if that file is readable.

    Returns True if the gunzip command was issued, so that the caller knows
    to gzip the file again after reading it, and False otherwise.
    """
    if not os.access(path + '.gz', os.R_OK):
        return False
    # Building the command in one place keeps the separating blank between
    # program and argument (it was missing for millepede.end).
    os.system('gunzip ' + path + '.gz')
    return True


def _report(index, text):
    """Print `text` prefixed by directory and ID of job number `index`."""
    print('{0} {1} {2}'.format(lib.JOBDIR[index], lib.JOBID[index], text))


# loop over FETCH jobs
for i in range(len(lib.JOBID)):
    if 'FETCH' not in lib.JOBSTATUS[i]:
        continue

    # a DISABLED marker in the old status is carried over to the new one
    disabled = 'DISABLED' if 'DISABLED' in lib.JOBSTATUS[i] else ''

    # Names of all problems/markers seen for this job.  'eofile' starts out
    # raised, i.e. the end-of-file check never fires ("do not deal with
    # timel yet" in the original).
    found = set(['eofile'])
    nEvent = 0
    cputime = -1
    pedeAbend = False
    pedeText = {'pedeLogErr': '', 'pedeLogWrn': ''}
    remark = ''
    jobDir = 'jobData/' + lib.JOBDIR[i]

    # ---- STDOUT --------------------------------------------------------
    stdOut = jobDir + '/STDOUT'
    # unzip the STDOUT file if necessary
    _gunzip(stdOut)
    try:
        with open(stdOut, 'r') as STDFILE:
            for line in STDFILE:
                _raise_flags(found, line, STDOUT_PATTERNS)
                match = CPU_TIME_PATTERN.search(line)
                if match:
                    # match.group(1) is the matched digit string
                    cputime = int(round(int(match.group(1)) / CPU_FACTOR))
        # gzip it afterwards:
        print('gzip -f ' + stdOut)
        os.system('gzip -f ' + stdOut)
    except IOError as e:
        # Test the error number, not the message text: the text depends on
        # locale and platform.
        if e.errno == errno.ENOENT:
            print('mps_check.py cannot find {0} to test'.format(stdOut))
        else:
            raise

    # ---- cmsRun.out ----------------------------------------------------
    # GF: This file is not produced (anymore...) -> check for existence and
    # read-access added
    eazeLog = jobDir + '/cmsRun.out'
    if os.access(eazeLog, os.R_OK):
        with open(eazeLog, 'r') as INFILE:
            for line in INFILE:
                _raise_flags(found, line, CMSRUN_PATTERNS)

    # ---- alignment.log[.gz] -------------------------------------------
    eazeLog = jobDir + '/alignment.log'
    logZipped = _gunzip(eazeLog)
    if os.access(eazeLog, os.R_OK):
        with open(eazeLog, 'r') as INFILE:
            for line in INFILE:
                _raise_flags(found, line, ALIGNMENT_LOG_PATTERNS)
                if MAIN_INPUT_REPORT.search(line):
                    nEvent = int(line.split()[5])
                # the other two report lines only count while no event
                # number has been found yet
                if nEvent == 0 and POST_SOURCE_REPORT.search(line):
                    nEvent = int(line.split()[5])
                if nEvent == 0 and AFTER_SOURCE_REPORT.search(line):
                    nEvent = int(line.split()[5])
        if logZipped:
            os.system('gzip ' + eazeLog)
    else:
        print('mps_check.py cannot find {0} to test'.format(eazeLog))
        # AP 07.09.2009 - The end-of-job check cannot be done: raise the
        # marker to avoid a fake error type
        found.add('endofjob')

    if i < lib.nJobs:
        # ---- mille job: the milleBinary file must not be empty ---------
        milleOut = 'milleBinary%03d.dat' % (i + 1)
        mOutSize = 0
        for line in eoslsoutput:
            if milleOut not in line:
                continue
            columns = line.split()
            try:
                # 5th column = size.  It must be converted: under Python 2
                # a string always compares greater than 0, so an empty
                # file was never detected.
                mOutSize = int(columns[4])
            except (IndexError, ValueError):
                # not a regular 'ls -l' entry: ignore this line
                continue
        if not mOutSize > 0:
            found.add('emptyDatErr')

    else:
        # ---- merge job: additional checks ------------------------------
        # if there is a pede.dump file check it as well
        eazeLog = jobDir + '/pede.dump'
        if os.access(eazeLog + '.gz', os.R_OK):
            # unzip - but clean before and save to tmp
            os.system('rm -f /tmp/pede.dump')
            os.system('gunzip -c ' + eazeLog + '.gz > /tmp/pede.dump')
            eazeLog = '/tmp/pede.dump'
        if os.access(eazeLog, os.R_OK):
            # assume an abnormal end until the closing line has been seen
            pedeAbend = True
            usedPedeMem = 0.
            with open(eazeLog, 'r') as INFILE:
                for line in INFILE:
                    # check if pede has reached its normal end
                    if PEDE_NORMAL_END.search(line):
                        pedeAbend = False
                    # extract memory usage
                    match = PEDE_PEAK_MEMORY.search(line)
                    if match:
                        mem = re.sub(r'\s', '', match.group(1))
                        if PLAIN_FLOAT.search(mem):
                            usedPedeMem = float(mem)
                        else:
                            print('mps_check.py: Found Pede peak memory '
                                  'allocation but extracted number is not '
                                  'a float: {0}'.format(mem))

            # check memory usage
            # no point in asking if lib.pedeMem is defined. Initialized as
            # lib.pedeMem=-1
            if lib.pedeMem > 0 and usedPedeMem > 0.:
                memoryratio = usedPedeMem / (lib.pedeMem / 1024.)
                # print a warning if more than approx. 4 GB have been
                # requested of which less than 75% are used by Pede
                if lib.pedeMem > 4000 and memoryratio < 0.75:
                    msg = ("Warning: {0:.2f} GB of memory for Pede "
                           "requested, but only {1:.1f}% of it has been "
                           "used! Consider to request less memory in order "
                           "to save resources.").format(lib.pedeMem / 1024.0,
                                                        memoryratio * 100)
                    print(msg)
                else:
                    msg = ("Info: Used {0:.1f}% of {1:.2f} GB of memory "
                           "which has been requested for Pede.")
                    print(msg.format(memoryratio * 100, lib.pedeMem / 1024.0))

            # clean up /tmp/pede.dump if needed
            if eazeLog == '/tmp/pede.dump':
                os.system('rm /tmp/pede.dump')
        else:
            # pede.dump not found or no read-access
            print('mps_check.py cannot find {0} to test'.format(eazeLog))

        # if there is a millepede.log file, check it as well
        eazeLog = jobDir + '/millepede.log'
        logZipped = _gunzip(eazeLog)
        if os.access(eazeLog, os.R_OK):
            with open(eazeLog, 'r') as INFILE:
                for line in INFILE:
                    for name, regex in MILLEPEDE_LOG_PATTERNS:
                        if regex.search(line):
                            found.add(name)
                            pedeText[name] += line
            if logZipped:
                os.system('gzip ' + eazeLog)
        else:
            print('mps_check.py cannot find {0} to test'.format(eazeLog))

        # check millepede.end -- added F. Meier 03.03.2015
        eazeLog = jobDir + '/millepede.end'
        logZipped = _gunzip(eazeLog)
        if os.access(eazeLog, os.R_OK):
            with open(eazeLog, 'r') as INFILE:
                for line in INFILE:
                    # Output code: 0 is OK, 1 is WARN, anything else is FAIL
                    match = PEDE_EXIT_CODE.search(line)
                    if not match:
                        continue
                    exitCode = int(match.group(1))
                    if exitCode == 1:
                        found.add('pedeLogWrn')
                        pedeText['pedeLogWrn'] += line
                    elif exitCode != 0:
                        found.add('pedeLogErr')
                        pedeText['pedeLogErr'] += line
            if logZipped:
                os.system('gzip ' + eazeLog)
        else:
            print('mps_check.py cannot find {0} to test'.format(eazeLog))
        # end of merge job checks

    # ---- evaluate errors ----------------------------------------------
    # The order matters: a later check overwrites the remark and the status
    # set by an earlier one.
    okStatus = 'OK'
    if 'eofile' not in found:
        _report(i, 'did not reach end of file')
        okStatus = 'ABEND'
    if 'quotaspace' in found:
        _report(i, 'had quota space problem')
        okStatus = 'FAIL'
        remark = 'eos quota space problem'
    if 'ioprob' in found:
        _report(i, 'had I/O problem')
        okStatus = 'FAIL'
    if 'fw8001' in found:
        _report(i, 'had Framework error 8001 problem')
        remark = 'fwk error 8001'
        okStatus = 'FAIL'
    if 'timeout' in found:
        _report(i, 'had connection timed out problem')
        remark = 'connection timed out'
    if 'cfgerr' in found:
        _report(i, 'had config file error')
        remark = 'cfg file error'
        okStatus = 'FAIL'
    if 'killed' in found:
        _report(i, 'Job Killed (probably time exceeded)')
        remark = 'killed'
        okStatus = 'FAIL'
    if 'timel' in found:
        _report(i, 'ran into time limit')
        okStatus = 'TIMEL'
    if 'tooManyTracks' in found:
        _report(i, 'too many tracks')
    if 'segviol' in found:
        _report(i, 'SEGVIOL encountered')
        remark = 'seg viol'
        okStatus = 'FAIL'
    if 'rfioerr' in found:
        _report(i, 'RFIO error encountered')
        remark = 'rfio error'
        okStatus = 'FAIL'
    if 'quota' in found:
        _report(i, 'Request exceeds quota')
    if 'exceptionCaught' in found:
        _report(i, 'Exception caught in cmsrun')
        remark = 'Exception caught'
        okStatus = 'FAIL'
    if 'emptyDatErr' in found:
        print('milleBinary???.dat file not found or empty')
        remark = 'empty milleBinary'
        if 'emptyDatOnFarm' in found:
            print('...but already empty on farm so OK (or check job '
                  '{0} yourself...)'.format(i + 1))
        else:
            okStatus = 'FAIL'
    if 'cmdNotFound' in found:
        _report(i, 'Command not found')
        remark = 'cmd not found'
        okStatus = 'FAIL'
    if 'insuffPriv' in found:
        _report(i, 'Insufficient privileges to rfcp files')
        remark = 'Could not rfcp files'
        okStatus = 'FAIL'
    if pedeAbend:
        _report(i, 'Pede did not end normally')
        remark = 'pede failed'
        okStatus = 'FAIL'
    if 'pedeLogErr' in found:
        _report(i, 'Problems in running Pede:')
        print(pedeText['pedeLogErr'])
        remark = 'pede error'
        okStatus = 'FAIL'
    if 'pedeLogWrn' in found:
        # AP 07.09.2009 - Reports Pede Warnings (but do _not_ set job status
        # to FAIL)
        # NOTE(review): this also replaces a status set by an earlier check
        # (e.g. FAIL) with WARN; kept as in the original - confirm intended.
        _report(i, 'Warnings in running Pede:')
        print(pedeText['pedeLogWrn'])
        remark = 'pede warnings'
        okStatus = 'WARN'
    if 'endofjob' not in found:
        _report(i, 'Job not ended')
        remark = 'job not ended'
        okStatus = 'FAIL'

    # print warning line to stdout
    if okStatus != 'OK':
        print('{0} {1}  --------  {2}'.format(lib.JOBDIR[i], lib.JOBID[i],
                                              okStatus))

    # ---- store the results in the job database --------------------------
    # update number of events
    lib.JOBNEVT[i] = nEvent
    # update job status
    lib.JOBSTATUS[i] = disabled + okStatus
    # update cputime
    print(cputime)
    lib.JOBRUNTIME[i] = cputime
    # update remark
    lib.JOBREMARK[i] = remark

lib.write_db()
429 
double split
Definition: MVATrainer.cc:139