CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
mps_check.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 #
3 # This script checks outputs from jobs that have FETCH status and updates if errors occurred
4 # -> check STDOUT files
5 # -> check cmsRun.out
6 # -> check alignment.log
7 # -> check if millebinaries are on eos
8 # -> check pede.dump
9 # -> check millepede.log
10 # -> check millepede.end
11 #
12 # It also retrieves number of events from alignment.log and cputime from STDOUT
13 
14 import Alignment.MillePedeAlignmentAlgorithm.mpslib.Mpslibclass as mpslib
15 import subprocess
16 import re
17 import os
18 
# Load the MPS job database; every per-job list used below (JOBID, JOBDIR,
# JOBSTATUS, ...) is read from this object.
lib = mpslib.jobdatabase()
lib.read_db()

# Build the list of 'eos ls -l' output lines for the mass-storage directory.
# History: 'cmsLs -l $mssDir' was used before; it is deprecated and was
# removed in January 2016. A plain "eos ls -l" does not work either, so the
# exact location of the application has to be given.
eos = '/afs/cern.ch/project/eos/installation/cms/bin/eos.select'
command = ' '.join([eos, 'ls', '-l', lib.mssDir])
# stderr is merged into stdout, so error text from eos ends up in the list too.
listing = subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True)
eoslsoutput = listing.split('\n')
28 
# ---------------------------------------------------------------------------
# Check the output of every job in FETCH status and store the verdict
# (status, remark, number of events, cpu time) in the job database.
#
# All print calls below use the single-argument form print(...), which gives
# the same output under Python 2 (the language of this script) and also
# parses under Python 3.
# ---------------------------------------------------------------------------

# Pattern tables: (flag name, compiled regex). While a file is scanned, a flag
# is set to 1 as soon as one line matches the associated pattern.
_STDOUT_PATTERNS = (
    ('quotaspace', re.compile('Unable to access quota space', re.I)),
    ('quotaspace', re.compile('Unable to get quota space', re.I)),
    ('quotaspace', re.compile('Disk quota exceeded', re.I)),
    ('killed', re.compile('CERN report: Job Killed')),
    ('timeout', re.compile('connection timed out')),
    ('cfgerr', re.compile('ConfigFileReadError')),
    ('emptyDatOnFarm', re.compile('0 bytes transferred')),
    ('cmdNotFound', re.compile('command not found')),
    # AP 26.11.2009 Insufficient privileges to rfcp files
    ('insuffPriv', re.compile('stage_put: Insufficient user privileges')),
)

_CMSRUN_PATTERNS = (
    ('eofile', re.compile('<StorageStatistics>')),
    ('timel', re.compile(r'Time limit reached\.')),
    ('ioprob', re.compile('gives I/O problem')),
    ('fw8001', re.compile('FrameworkError ExitStatus=[\'"]8001[\'"]')),
    ('tooManyTracks', re.compile('too many tracks')),
    ('segviol', re.compile('segmentation violation')),
    ('rfioerr', re.compile('failed RFIO error')),
    ('quota', re.compile('Request exceeds quota')),
)

_ALIGNLOG_PATTERNS = (
    ('eofile', re.compile('<StorageStatistics>')),
    ('timel', re.compile(r'EAZE\. Time limit reached\.')),
    ('ioprob', re.compile('GAF gives I/O problem')),
    ('fw8001', re.compile('FrameworkError ExitStatus=[\'"]8001[\'"]')),
    ('tooManyTracks', re.compile('too many tracks')),
    ('segviol', re.compile('segmentation violation')),
    ('rfioerr', re.compile('failed RFIO error')),
    ('quota', re.compile('Request exceeds quota')),
    # check for newer (e.g. CMSSW_5_1_X) and older CMSSW:
    ('exceptionCaught', re.compile('Fatal Exception')),
    ('exceptionCaught', re.compile('Exception caught in cmsRun')),
    # AP 07.09.2009 - Check that the job got to a normal end
    ('endofjob', re.compile(r'AlignmentProducer::endOfJob\(\)')),
)

# Every flag that is read during the evaluation step; all start at 0.
_FLAG_NAMES = (
    'quotaspace', 'killed', 'timeout', 'cfgerr', 'emptyDatOnFarm',
    'cmdNotFound', 'insuffPriv', 'eofile', 'timel', 'ioprob', 'fw8001',
    'tooManyTracks', 'segviol', 'rfioerr', 'quota', 'exceptionCaught',
    'endofjob', 'pedeAbend',
)

# AP 05.11.2015 Extract cpu-time.
# STDOUT doesn't contain NCU anymore. Now KSI2K and HS06 seconds are displayed.
# The factor is calculated from few samples by comparing KSI2K seconds with
# CPU time from email.
_CPUTIME_RE = re.compile(r'This process used .+?(\d+) KSI2K seconds', re.I)
_CPU_FACTOR = 2.125

_PEDE_END_RE = re.compile('Millepede II.* ending')
_PEDE_MEM_RE = re.compile('Peak dynamic memory allocation: (.+) GB', re.I)
_FLOAT_RE = re.compile(r'^\d+\.\d+$')
# a number with or without a sign
_EXIT_CODE_RE = re.compile(r'([-+]?\d+)')

# Markers of the framework report lines carrying the event count in column 6.
_NEVT_MAIN = 'FwkReport -i main_input:sourc'
# Fallbacks, only used while no event count has been found yet.
# (AfterSource: AP 31.07.2009 - to read number of events in CMSSW_3_2_2_patch2)
_NEVT_FALLBACKS = ('FwkReport -i PostSource', 'FwkReport -i AfterSource')

# Evaluation rules: (flag, message, remark or None, status or None).
# None means "leave the current value untouched". Rules are applied in order,
# so a later rule overrides remark/status set by an earlier one.
_RULES_BEFORE_DAT_CHECK = (
    ('quotaspace', 'had quota space problem', 'eos quota space problem', 'FAIL'),
    ('ioprob', 'had I/O problem', None, 'FAIL'),
    ('fw8001', 'had Framework error 8001 problem', 'fwk error 8001', 'FAIL'),
    ('timeout', 'had connection timed out problem', 'connection timed out', None),
    ('cfgerr', 'had config file error', 'cfg file error', 'FAIL'),
    ('killed', 'Job Killed (probably time exceeded)', 'killed', 'FAIL'),
    ('timel', 'ran into time limit', None, 'TIMEL'),
    ('tooManyTracks', 'too many tracks', None, None),
    ('segviol', 'SEGVIOL encountered', 'seg viol', 'FAIL'),
    ('rfioerr', 'RFIO error encountered', 'rfio error', 'FAIL'),
    ('quota', 'Request exceeds quota', None, None),
    ('exceptionCaught', 'Exception caught in cmsrun', 'Exception caught', 'FAIL'),
)
_RULES_AFTER_DAT_CHECK = (
    ('cmdNotFound', 'Command not found', 'cmd not found', 'FAIL'),
    ('insuffPriv', 'Insufficient privileges to rfcp files', 'Could not rfcp files', 'FAIL'),
    ('pedeAbend', 'Pede did not end normally', 'pede failed', 'FAIL'),
)


def _raise_flags(line, patterns, flags):
    """Set flags[name] = 1 for every (name, regex) of patterns found in line."""
    for name, regex in patterns:
        if regex.search(line):
            flags[name] = 1


def _gunzip_if_zipped(path):
    """Unpack path+'.gz' in place if it is readable; return True if it was."""
    if os.access(path + '.gz', os.R_OK):
        os.system('gunzip ' + path + '.gz')
        return True
    return False


def _apply_rules(rules, flags, jobTag, remark, okStatus):
    """Report every raised flag of rules and return the updated (remark, okStatus)."""
    for name, message, newRemark, newStatus in rules:
        if flags[name] == 1:
            print(jobTag + ' ' + message)
            if newRemark is not None:
                remark = newRemark
            if newStatus is not None:
                okStatus = newStatus
    return remark, okStatus


# loop over FETCH jobs
for i in range(len(lib.JOBID)):
    if 'FETCH' not in lib.JOBSTATUS[i]:
        continue

    # a disabled job keeps its DISABLED prefix in front of the new status
    disabled = 'DISABLED' if 'DISABLED' in lib.JOBSTATUS[i] else ''

    flags = dict.fromkeys(_FLAG_NAMES, 0)
    flags['eofile'] = 1  # do not deal with timel yet
    nEvent = 0
    cputime = -1
    emptyDatErr = 0
    pedeLogErr = 0
    pedeLogWrn = 0
    pedeLogErrStr = ""
    pedeLogWrnStr = ""

    jobDir = 'jobData/' + lib.JOBDIR[i]

    # ---- STDOUT: batch system messages and cpu time ----
    stdOut = jobDir + '/STDOUT'
    _gunzip_if_zipped(stdOut)
    with open(stdOut, 'r') as stdFile:
        for line in stdFile:
            _raise_flags(line, _STDOUT_PATTERNS, flags)
            match = _CPUTIME_RE.search(line)
            if match:
                # match.group(1) is the matched number of KSI2K seconds
                cputime = int(round(int(match.group(1)) / _CPU_FACTOR))

    # gzip it afterwards:
    print('gzip -f ' + stdOut)
    os.system('gzip -f ' + stdOut)

    # ---- cmsRun.out ----
    # GF: This file is not produced (anymore...) -> only read it if accessible
    cmsRunOut = jobDir + '/cmsRun.out'
    if os.access(cmsRunOut, os.R_OK):
        with open(cmsRunOut, 'r') as logFile:
            for line in logFile:
                _raise_flags(line, _CMSRUN_PATTERNS, flags)

    # ---- alignment.log[.gz] ----
    alignLog = jobDir + '/alignment.log'
    wasZipped = _gunzip_if_zipped(alignLog)
    if os.access(alignLog, os.R_OK):
        with open(alignLog, 'r') as logFile:
            for line in logFile:
                _raise_flags(line, _ALIGNLOG_PATTERNS, flags)
                if _NEVT_MAIN in line:
                    nEvent = int(line.split()[5])
                for marker in _NEVT_FALLBACKS:
                    if nEvent == 0 and marker in line:
                        nEvent = int(line.split()[5])
        if wasZipped:
            os.system('gzip ' + alignLog)
    else:
        print('mps_check.py cannot find %s to test' % alignLog)
        # AP 07.09.2009 - The end-of-job check cannot be done: set to 1 to
        # avoid fake error type
        flags['endofjob'] = 1

    if i < lib.nJobs:
        # ---- mille job: the milleBinary file on eos must not be empty ----
        milleOut = 'milleBinary%03d.dat' % (i + 1)
        mOutSize = 0
        for entry in eoslsoutput:
            if milleOut in entry:
                columns = entry.split()
                # 5th column = size. It must be converted to int: comparing
                # the raw string with 0 is always true in Python 2, which hid
                # empty files.
                try:
                    mOutSize = int(columns[4])
                except (IndexError, ValueError):
                    # listing line without a usable size: treat as empty
                    mOutSize = 0
        if mOutSize <= 0:
            emptyDatErr = 1

    else:
        # ---- merge job: additional checks for the pede step ----

        # pede.dump[.gz]
        pedeDump = jobDir + '/pede.dump'
        if os.access(pedeDump + '.gz', os.R_OK):
            # unzip - but clean before and save to tmp
            os.system('rm -f /tmp/pede.dump')
            os.system('gunzip -c ' + pedeDump + '.gz > /tmp/pede.dump')
            pedeDump = '/tmp/pede.dump'
        if os.access(pedeDump, os.R_OK):
            # assume an abnormal end until the closing line of pede is seen
            flags['pedeAbend'] = 1
            usedPedeMem = 0.
            with open(pedeDump, 'r') as dumpFile:
                for line in dumpFile:
                    # check if pede has reached its normal end
                    if _PEDE_END_RE.search(line):
                        flags['pedeAbend'] = 0
                    # extract memory usage
                    match = _PEDE_MEM_RE.search(line)
                    if match:
                        mem = re.sub(r'\s', '', match.group(1))
                        if _FLOAT_RE.search(mem):
                            usedPedeMem = float(mem)
                        else:
                            print('mps_check.py: Found Pede peak memory allocation but extracted number is not a float: %s' % mem)

            # check memory usage
            # no point in asking if lib.pedeMem is defined. Initialized as lib.pedeMem=-1
            if lib.pedeMem > 0 and usedPedeMem > 0.:
                # used GB divided by requested GB (lib.pedeMem / 1024.),
                # i.e. a fraction, not a percentage
                memoryratio = usedPedeMem / (lib.pedeMem / 1024.)
                # print a warning if more than approx. 4 GB have been
                # requested of which less than 75% are used by Pede
                if lib.pedeMem > 4000 and memoryratio < 0.75:
                    print('Warning: %.2f GB of memory for Pede requested, but only %.1f%% of it has been used! Consider to request less memory in order to save resources.'
                          % (lib.pedeMem / 1024., memoryratio * 100.))

            # clean up /tmp/pede.dump if needed
            if pedeDump == '/tmp/pede.dump':
                os.system('rm /tmp/pede.dump')
        else:
            # pede.dump not found or no read-access
            print('mps_check.py cannot find %s to test' % pedeDump)

        # millepede.log[.gz]
        pedeLog = jobDir + '/millepede.log'
        wasZipped = _gunzip_if_zipped(pedeLog)
        if os.access(pedeLog, os.R_OK):
            with open(pedeLog, 'r') as logFile:
                for line in logFile:
                    # Checks for Pede Errors
                    if 'step no descending' in line:
                        pedeLogErr = 1
                        pedeLogErrStr += line
                    if 'Constraint equation discrepancies:' in line:
                        pedeLogErr = 1
                        pedeLogErrStr += line
                    # AP 07.09.2009 - Checks for Pede Warnings:
                    if 'insufficient constraint equations' in line:
                        pedeLogWrn = 1
                        pedeLogWrnStr += line
            if wasZipped:
                os.system('gzip ' + pedeLog)
        else:
            print('mps_check.py cannot find %s to test' % pedeLog)

        # millepede.end[.gz] -- added F. Meier 03.03.2015
        pedeEnd = jobDir + '/millepede.end'
        wasZipped = _gunzip_if_zipped(pedeEnd)
        if os.access(pedeEnd, os.R_OK):
            with open(pedeEnd, 'r') as endFile:
                for line in endFile:
                    # Checks for the output code. 0 is OK, 1 is WARN,
                    # anything else is FAIL
                    match = _EXIT_CODE_RE.search(line)
                    if match:
                        exitCode = int(match.group(1))
                        if exitCode == 1:
                            pedeLogWrn = 1
                            pedeLogWrnStr += line
                        elif exitCode != 0:
                            pedeLogErr = 1
                            pedeLogErrStr += line
            if wasZipped:
                os.system('gzip ' + pedeEnd)
        else:
            print('mps_check.py cannot find %s to test' % pedeEnd)

    # ---- evaluate errors (order matters: later checks override earlier) ----
    jobTag = '%s %s' % (lib.JOBDIR[i], lib.JOBID[i])
    okStatus = 'OK'
    remark = ""

    if flags['eofile'] != 1:
        print(jobTag + ' did not reach end of file')
        okStatus = 'ABEND'

    remark, okStatus = _apply_rules(_RULES_BEFORE_DAT_CHECK, flags, jobTag,
                                    remark, okStatus)

    if emptyDatErr == 1:
        print('milleBinary???.dat file not found or empty')
        remark = 'empty milleBinary'
        if flags['emptyDatOnFarm'] > 0:
            print('...but already empty on farm so OK (or check job %d yourself...)' % (i + 1))
        else:
            okStatus = 'FAIL'

    remark, okStatus = _apply_rules(_RULES_AFTER_DAT_CHECK, flags, jobTag,
                                    remark, okStatus)

    if pedeLogErr == 1:
        print(jobTag + ' Problems in running Pede:')
        print(pedeLogErrStr)
        remark = 'pede error'
        okStatus = 'FAIL'
    if pedeLogWrn == 1:
        # AP 07.09.2009 - Reports Pede Warnings (but do _not_ set job status to FAIL)
        # NOTE(review): this also replaces a FAIL set by an earlier check
        # with WARN - kept as in the original, confirm that this is intended.
        print(jobTag + ' Warnings in running Pede:')
        print(pedeLogWrnStr)
        remark = 'pede warnings'
        okStatus = 'WARN'
    if flags['endofjob'] != 1:
        print(jobTag + ' Job not ended')
        remark = 'job not ended'
        okStatus = 'FAIL'

    # print warning line to stdout
    if okStatus != "OK":
        print('%s  --------  %s' % (jobTag, okStatus))

    # ---- store the results in the job database ----
    lib.JOBNEVT[i] = nEvent
    lib.JOBSTATUS[i] = disabled + okStatus
    print(cputime)
    lib.JOBRUNTIME[i] = cputime
    lib.JOBREMARK[i] = remark

lib.write_db()
422 
double split
Definition: MVATrainer.cc:139