CMS 3D CMS Logo

mps_check.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 #
3 # This script checks outputs from jobs that have FETCH status and updates if errors occured
4 # -> check STDOUT files
5 # -> check cmsRun.out
6 # -> check alignment.log
7 # -> check if millebinaries are on eos
8 # -> check pede.dump
9 # -> check millepede.log
10 # -> check millepede.end
11 #
12 # It also retirieves number of events from alignment.log and cputime from STDOUT
13 
14 import Alignment.MillePedeAlignmentAlgorithm.mpslib.Mpslibclass as mpslib
15 import subprocess
16 import re
17 import os
18 
19 lib = mpslib.jobdatabase()
20 lib.read_db()
21 
22 # create a list of eos ls entries containing files on eos
23 # previously 'cmsLs -l $mssDir' -> deprecated command. Removed in January 2016
24 # the simple command "eos ls -l" doesn't work. Exact location of application must be specified.
25 eos = '/afs/cern.ch/project/eos/installation/cms/bin/eos.select'
26 command = eos+' ls -l '+lib.mssDir
27 eoslsoutput = subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True).split('\n')
28 
29 # loop over FETCH jobs
30 for i in xrange(len(lib.JOBID)):
31  # FIXME use bools?
32  batchSuccess = 0
33  batchExited = 0
34  finished = 0
35  endofjob = 0
36  eofile = 1 # do not deal with timel yet
37  timel = 0
38  killed = 0
39  ioprob = 0
40  fw8001 = 0
41  tooManyTracks = 0
42  segviol = 0
43  rfioerr = 0
44  quota = 0
45  nEvent = 0
46  cputime = -1
47  pedeAbend = 0
48  pedeLogErr = 0
49  pedeLogWrn = 0
50  exceptionCaught = 0
51  timeout = 0
52  cfgerr = 0
53  emptyDatErr = 0
54  emptyDatOnFarm = 0
55  cmdNotFound = 0
56  insuffPriv = 0
57  quotaspace = 0
58 
59  pedeLogErrStr = ""
60  pedeLogWrnStr = ""
61  remark = ""
62 
63  disabled = "";
64  if 'DISABLED' in lib.JOBSTATUS[i]:
65  disabled = 'DISABLED'
66 
67  if 'FETCH' in lib.JOBSTATUS[i]:
68 
69  # open the STDOUT file
70  stdOut = 'jobData/'+lib.JOBDIR[i]+'/STDOUT'
71  # unzip the STDOUT file if necessary
72  if os.access(stdOut+'.gz', os.R_OK):
73  os.system('gunzip '+stdOut+'.gz')
74 
75  try:
76  with open(stdOut, "r") as STDFILE:
77  # scan records in input file.
78  # use regular expression to search. re.compile needed for options re.M and re.I
79  # re.M=re.MULTILINE enables matching of newline char
80  # re.I=re.IGNORECASE makes matching case-insensitive.
81  for line in STDFILE:
82  if re.search(re.compile('Unable to access quota space',re.M|re.I), line):
83  quotaspace = 1
84  if re.search(re.compile('Unable to get quota space',re.M|re.I), line):
85  quotaspace = 1
86  if re.search(re.compile('Disk quota exceeded',re.M|re.I), line):
87  quotaspace = 1
88  if re.search(re.compile('CERN report: Job Killed',re.M), line):
89  killed = 1
90  if re.search(re.compile('Job finished',re.M), line):
91  finished = 1
92  if re.search(re.compile('connection timed out',re.M), line):
93  timeout = 1
94  if re.search(re.compile('ConfigFileReadError',re.M), line):
95  cfgerr = 1
96  if re.search(re.compile('0 bytes transferred',re.M), line):
97  emptyDatOnFarm = 1
98  if re.search(re.compile('command not found',re.M), line):
99  cmdNotFound = 1
100  # AP 26.11.2009 Insufficient privileges to rfcp files
101  if re.search(re.compile('stage_put: Insufficient user privileges',re.M), line):
102  insuffPriv = 1
103  # AP 05.11.2015 Extract cpu-time.
104  # STDOUT doesn't contain NCU anymore. Now KSI2K and HS06 seconds are displayed.
105  # The ncuFactor is calculated from few samples by comparing KSI2K seconds with
106  # CPU time from email.
107  match = re.search(re.compile('This process used .+?(\d+) KSI2K seconds',re.M|re.I), line)
108  if match:
109  cpuFactor = 2.125
110  cputime = int(round(int(match.group(1))/cpuFactor)) # match.group(1) is the matched digit
111 
112  # gzip it afterwards:
113  print 'gzip -f '+stdOut
114  os.system('gzip -f '+stdOut)
115  except IOError as e:
116  if e.args == (2, "No such file or directory"):
117  print "mps_check.py cannot find", stdOut, "to test"
118  else:
119  raise
120 
121  # GF: This file is not produced (anymore...) -> check for existence and read-access added
122  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/cmsRun.out'
123  if os.access(eazeLog, os.R_OK):
124  # open the input file
125  with open(eazeLog, "r") as INFILE:
126  # scan records in input file
127  for line in INFILE:
128  # check if end of file has been reached
129  if re.search(re.compile('<StorageStatistics>',re.M), line):
130  eofile = 1
131  if re.search(re.compile('Time limit reached\.',re.M), line):
132  timel = 1
133  if re.search(re.compile('gives I\/O problem',re.M), line):
134  ioprob = 1
135  if re.search(re.compile('FrameworkError ExitStatus=[\'\"]8001[\'\"]',re.M), line):
136  fw8001 = 1
137  if re.search(re.compile('too many tracks',re.M), line):
138  tooManyTracks = 1
139  if re.search(re.compile('segmentation violation',re.M), line):
140  segviol = 1
141  if re.search(re.compile('failed RFIO error',re.M), line):
142  rfioerr = 1
143  if re.search(re.compile('Request exceeds quota',re.M), line):
144  quota = 1
145 
146  # if there is an alignment.log[.gz] file, check it as well
147  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/alignment.log'
148  logZipped = 'no'
149  # unzip the logfile if necessary
150  if os.access(eazeLog+'.gz', os.R_OK):
151  os.system('gunzip '+eazeLog+'.gz')
152  logZipped = 'true'
153 
154  if os.access(eazeLog, os.R_OK): # access to alignment.log
155  # open the input file
156  with open(eazeLog,'r') as INFILE:
157  # scan records in input file
158  for line in INFILE:
159  # check if end of file has been reached
160  if re.search(re.compile('<StorageStatistics>',re.M), line):
161  eofile = 1
162  if re.search(re.compile('EAZE\. Time limit reached\.',re.M), line):
163  timel = 1
164  if re.search(re.compile('GAF gives I\/O problem',re.M), line):
165  ioprob = 1
166  if re.search(re.compile('FrameworkError ExitStatus=[\'\"]8001[\'\"]',re.M), line):
167  fw8001 = 1
168  if re.search(re.compile('too many tracks',re.M), line):
169  tooManyTracks = 1
170  if re.search(re.compile('segmentation violation',re.M), line):
171  segviol = 1
172  if re.search(re.compile('failed RFIO error',re.M), line):
173  rfioerr = 1
174  if re.search(re.compile('Request exceeds quota',re.M), line):
175  quota = 1
176  # check for newer (e.g. CMSSW_5_1_X) and older CMSSW:
177  if re.search(re.compile('Fatal Exception',re.M), line):
178  exceptionCaught = 1
179  if re.search(re.compile('Exception caught in cmsRun',re.M), line):
180  exceptionCaught = 1
181  # AP 07.09.2009 - Check that the job got to a normal end
182  if re.search(re.compile('AlignmentProducerAsAnalyzer::endJob\(\)',re.M), line):
183  endofjob = 1
184  if re.search(re.compile('FwkReport -i main_input:sourc',re.M), line):
185  array = line.split()
186  nEvent = int(array[5])
187  if nEvent==0 and re.search(re.compile('FwkReport -i PostSource',re.M), line):
188  array = line.split()
189  nEvent = int(array[5])
190  # AP 31.07.2009 - To read number of events in CMSSW_3_2_2_patch2
191  if nEvent==0 and re.search(re.compile('FwkReport -i AfterSource',re.M), line):
192  array = line.split()
193  nEvent = int(array[5])
194 
195  if logZipped == 'true':
196  os.system('gzip '+eazeLog)
197 
198  else: # no access to alignment.log
199  print 'mps_check.py cannot find',eazeLog,'to test'
200  # AP 07.09.2009 - The following check cannot be done: set to 1 to avoid fake error type
201  endofjob = 1
202 
203  # for mille jobs checks that milleBinary file is not empty
204  if i<lib.nJobs: # mille job!
205  milleOut = 'milleBinary%03d.dat' % (i+1)
206  # from Perl, should be deprecated because of cmsls and nsls
207  #(not translated to python, yes I'm lazy... use subprocess.checkout if needed):
208  #$mOutSize = `nsls -l $mssDir | grep $milleOut | head -1 | awk '{print \$5}'`;
209  #$mOutSize = `cmsLs -l $mssDir | grep $milleOut | head -1 | awk '{print \$2}'`;
210  mOutSize = 0
211  for line in eoslsoutput:
212  if milleOut in line:
213  columns = line.split()
214  mOutSize = columns[4] # 5th column = size
215  if not (mOutSize>0):
216  emptyDatErr = 1
217 
218  # merge jobs: additional checks for merging job
219  else:
220  # if there is a pede.dump file check it as well
221  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/pede.dump'
222  if os.access(eazeLog+'.gz', os.R_OK):
223  # unzip - but clean before and save to tmp
224  os.system('rm -f /tmp/pede.dump')
225  os.system('gunzip -c '+eazeLog+'.gz > /tmp/pede.dump')
226  eazeLog = '/tmp/pede.dump'
227  if os.access(eazeLog, os.R_OK):
228  with open(eazeLog, "r") as INFILE: # open pede.dump
229  # scan records in INFILE
230  pedeAbend = 1
231  usedPedeMem = 0.
232  for line in INFILE:
233  # check if pede has reached its normal end
234  if re.search(re.compile('Millepede II.* ending',re.M), line):
235  pedeAbend = 0
236  # extract memory usage
237  match = re.search(re.compile('Peak dynamic memory allocation: (.+) GB',re.I), line)
238  if match:
239  mem = match.group(1)
240  mem = re.sub('\s', '', mem)
241  # if mem is a float
242  if re.search(re.compile('^\d+\.\d+$',re.M), mem):
243  usedPedeMem = float(mem)
244  else:
245  print 'mps_check.py: Found Pede peak memory allocation but extracted number is not a float:',mem
246 
247  # check memory usage
248  # no point in asking if lib.pedeMem is defined. Initialized as lib.pedeMem=-1
249  if lib.pedeMem > 0 and usedPedeMem > 0.:
250  memoryratio = usedPedeMem /(lib.pedeMem/1024.)
251  # print a warning if more than approx. 4 GB have been
252  # requested of which less than 75% are used by Pede
253  if lib.pedeMem > 4000 and memoryratio < 0.75 :
254  msg = ("Warning: {0:.2f} GB of memory for Pede "
255  "requested, but only {1:.1f}% of it has been "
256  "used! Consider to request less memory in order "
257  "to save resources.").format(lib.pedeMem/1024.0,
258  memoryratio*100)
259  print msg
260  else:
261  msg = ("Info: Used {0:.1f}% of {1:.2f} GB of memory "
262  "which has been requested for Pede.")
263  print msg.format(memoryratio*100, lib.pedeMem/1024.0)
264 
265 
266  # clean up /tmp/pede.dump if needed
267  if eazeLog == '/tmp/pede.dump':
268  os.system('rm /tmp/pede.dump')
269 
270  # pede.dump not found or no read-access
271  else:
272  print 'mps_check.py cannot find',eazeLog,'to test'
273 
274  # if there is a millepede.log file, check it as well
275  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/millepede.log'
276  logZipped = 'no'
277  if os.access(eazeLog+'.gz', os.R_OK):
278  os.system('gunzip '+eazeLog+'.gz')
279  logZipped = 'true'
280 
281  if os.access(eazeLog, os.R_OK):
282  # open log file
283  with open(eazeLog, "r") as INFILE:
284  # scan records in input file
285  for line in INFILE:
286  # Checks for Pede Errors
287  if re.search(re.compile('step no descending',re.M), line):
288  pedeLogErr = 1
289  pedeLogErrStr += line
290  if re.search(re.compile('Constraint equation discrepancies:',re.M), line):
291  pedeLogErr = 1
292  pedeLogErrStr += line
293  # AP 07.09.2009 - Checks for Pede Warnings:
294  if re.search(re.compile('insufficient constraint equations',re.M), line):
295  pedeLogWrn = 1
296  pedeLogWrnStr += line
297 
298  if logZipped == 'true':
299  os.system('gzip '+eazeLog)
300  else:
301  print 'mps_check.py cannot find',eazeLog,'to test'
302 
303 
304  # check millepede.end -- added F. Meier 03.03.2015
305  eazeLog = 'jobData/'+lib.JOBDIR[i]+'/millepede.end'
306  logZipped = 'no'
307  if os.access(eazeLog+'.gz', os.R_OK):
308  os.system('gunzip'+eazeLog+'.gz')
309  logZipped = 'true'
310 
311  if os.access(eazeLog, os.R_OK):
312  # open log file
313  with open(eazeLog, "r") as INFILE:
314  # scan records in input file
315  for line in INFILE:
316  # Checks for the output code. 0 is OK, 1 is WARN, anything else is FAIL
317  # searches the line for a number with or without a sign
318  match = re.search(re.compile('([-+]?\d+)',re.M), line)
319  if match:
320  if int(match.group(1)) == 1:
321  pedeLogWrn = 1
322  pedeLogWrnStr += line
323  elif int(match.group(1)) != 0:
324  pedeLogErr = 1
325  pedeLogErrStr += line
326  if logZipped == 'true':
327  os.system('gzip '+eazeLog)
328  else:
329  print 'mps_check.py cannot find',eazeLog,'to test'
330 
331  # end of merge job checks
332  # evaluate Errors:
333  farmhost = ' '
334 
335  okStatus = 'OK'
336  if not eofile == 1:
337  print lib.JOBDIR[i],lib.JOBID[i],'did not reach end of file'
338  okStatus = 'ABEND'
339  if quotaspace == 1:
340  print lib.JOBDIR[i],lib.JOBID[i],'had quota space problem'
341  okStatus = 'FAIL'
342  remark = 'eos quota space problem'
343  if ioprob == 1:
344  print lib.JOBDIR[i],lib.JOBID[i],'had I/O problem'
345  okStatus = 'FAIL'
346  if fw8001 == 1:
347  print lib.JOBDIR[i],lib.JOBID[i],'had Framework error 8001 problem'
348  remark = 'fwk error 8001'
349  okStatus = 'FAIL'
350  if timeout == 1:
351  print lib.JOBDIR[i],lib.JOBID[i],'had connection timed out problem'
352  remark = 'connection timed out'
353  if cfgerr == 1:
354  print lib.JOBDIR[i],lib.JOBID[i],'had config file error'
355  remark = 'cfg file error'
356  okStatus = 'FAIL'
357  if killed == 1:
358  print lib.JOBDIR[i],lib.JOBID[i],'Job Killed (probably time exceeded)'
359  remark = "killed";
360  okStatus = "FAIL"
361  if timel == 1:
362  print lib.JOBDIR[i],lib.JOBID[i],'ran into time limit'
363  okStatus = 'TIMEL'
364  if tooManyTracks == 1:
365  print lib.JOBDIR[i],lib.JOBID[i],'too many tracks'
366  if segviol == 1:
367  print lib.JOBDIR[i],lib.JOBID[i],'SEGVIOL encountered'
368  remark = 'seg viol'
369  okStatus = 'FAIL'
370  if rfioerr == 1:
371  print lib.JOBDIR[i],lib.JOBID[i],'RFIO error encountered'
372  remark = 'rfio error'
373  okStatus = 'FAIL'
374  if quota == 1:
375  print lib.JOBDIR[i],lib.JOBID[i],'Request exceeds quota'
376  if exceptionCaught == 1:
377  print lib.JOBDIR[i],lib.JOBID[i],'Exception caught in cmsrun'
378  remark = 'Exception caught'
379  okStatus = 'FAIL'
380  if emptyDatErr == 1:
381  print 'milleBinary???.dat file not found or empty'
382  remark = 'empty milleBinary'
383  if emptyDatOnFarm > 0:
384  print '...but already empty on farm so OK (or check job',i+1,'yourself...)'
385  else:
386  okStatus = 'FAIL'
387  if cmdNotFound == 1:
388  print lib.JOBDIR[i],lib.JOBID[i],'Command not found'
389  remark = 'cmd not found'
390  okStatus = 'FAIL'
391  if insuffPriv == 1:
392  print lib.JOBDIR[i],lib.JOBID[i],'Insufficient privileges to rfcp files'
393  remark = 'Could not rfcp files'
394  okStatus = 'FAIL'
395  if pedeAbend == 1:
396  print lib.JOBDIR[i],lib.JOBID[i],'Pede did not end normally'
397  remark = 'pede failed'
398  okStatus = 'FAIL'
399  if pedeLogErr == 1:
400  print lib.JOBDIR[i],lib.JOBID[i],'Problems in running Pede:'
401  print pedeLogErrStr
402  remark = 'pede error'
403  okStatus = 'FAIL'
404  if pedeLogWrn == 1:
405  # AP 07.09.2009 - Reports Pede Warnings (but do _not_ set job status to FAIL)
406  print lib.JOBDIR[i],lib.JOBID[i],'Warnings in running Pede:'
407  print pedeLogWrnStr
408  remark = 'pede warnings'
409  okStatus = 'WARN'
410  if endofjob != 1:
411  print lib.JOBDIR[i],lib.JOBID[i],'Job not ended'
412  remark = 'job not ended'
413  okStatus = 'FAIL'
414 
415  # print warning line to stdout
416  if okStatus != "OK":
417  print lib.JOBDIR[i],lib.JOBID[i],' -------- ',okStatus
418 
419  # update number of events
420  lib.JOBNEVT[i] = nEvent
421  # udate Jobstatus
422  lib.JOBSTATUS[i] = disabled+okStatus
423  # update cputime
424  print cputime
425  lib.JOBRUNTIME[i] = cputime
426  # update remark
427  lib.JOBREMARK[i] = remark
428  # update host
429  ##lib.JOBHOST[i] = farmhost
430 
431 lib.write_db()
432 
double split
Definition: MVATrainer.cc:139