00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 import array
00016 import os
00017 import re
00018 import sys
00019 from cPickle import load
00020 from os.path import dirname,basename,join,isfile
00021 from threading import Thread
00022 from time import asctime
00023
00024 theargv=sys.argv
00025 sys.argv=[]
00026 from ROOT import *
00027 import ROOT
00028 ROOT.gErrorIgnoreLevel=1001
00029 ROOT.gROOT.SetBatch(True)
00030 sys.argv=theargv
00031
00032 from urllib2 import Request,build_opener,urlopen
00033
00034 if os.environ.has_key("RELMON_SA"):
00035 from definitions import *
00036 from authentication import X509CertOpen
00037 from utils import __file__ as this_module_name
00038 else:
00039 from Utilities.RelMon.definitions import *
00040 from Utilities.RelMon.authentication import X509CertOpen
00041 from Utilities.RelMon.utils import __file__ as this_module_name
00042
00043
00044
00045
00046 _log_level=10
00047 def logger(msg_level,message):
00048 if msg_level>=_log_level:
00049 print "[%s] %s" %(asctime(),message)
00050
00051
00052 def setTDRStyle():
00053 this_dir=dirname(this_module_name)
00054 this_dir_one_up=this_dir[:this_dir.rfind("/")+1]
00055
00056 style_file=''
00057 if os.environ.has_key("RELMON_SA"):
00058 style_file=this_dir_one_up+"data/tdrstyle_mod.C"
00059 else:
00060 style_file="%s/src/Utilities/RelMon/data/tdrstyle_mod.C"%(os.environ["CMSSW_BASE"])
00061 try:
00062 gROOT.ProcessLine(".L %s" %style_file)
00063 gROOT.ProcessLine("setTDRStyle()")
00064 except:
00065 "Print could not set the TDR style. File %s not found?" %style_file
00066
00067
00068
00069 def literal2root (literal,rootType):
00070 bitsarray = array.array('B')
00071 bitsarray.fromstring(literal.decode('hex'))
00072
00073 tbuffer=0
00074 try:
00075 tbuffer = TBufferFile(TBufferFile.kRead, len(bitsarray), bitsarray, False,0)
00076 except:
00077 print "could not transform to object array:"
00078 print [ i for i in bitsarray ]
00079
00080
00081 if rootType == 'TPROF':
00082 rootType = 'TProfile'
00083 if rootType == 'TPROF2D':
00084 rootType = 'TProfile2D'
00085
00086 root_class=eval(rootType+'.Class()')
00087
00088 return tbuffer.ReadObject(root_class)
00089
00090
00091
00092 def getNbins(h):
00093 biny=h.GetNbinsY()
00094 if biny>1:biny+=1
00095 binz=h.GetNbinsZ()
00096 if binz>1:binz+=1
00097 return (h.GetNbinsX()+1)*(biny)*(binz)
00098
00099
00100
00101
00102 class StatisticalTest(object):
00103 def __init__(self,threshold):
00104 self.name=""
00105 self.h1=None
00106 self.h2=None
00107 self.threshold=float(threshold)
00108 self.rank=-1
00109 self.is_init=False
00110
00111 def set_operands(self,h1,h2):
00112 self.h1=h1
00113 self.h2=h2
00114
00115 def get_rank(self):
00116 if not self.is_init:
00117 if self.rank<0:
00118 type1=type(self.h1)
00119 type2=type(self.h2)
00120 if (type1 != type2):
00121 logger(1,"*** ERROR: object types in comparison don't match: %s!=%s" %(type1,type2))
00122 self.rank=test_codes["DIFF_TYPES"]
00123 elif not self.h2.InheritsFrom("TH1"):
00124 logger(1,"*** ERROR: object type is not histogram but a %s" %(type1))
00125 self.rank=test_codes["NO_HIST"]
00126
00127
00128
00129
00130 else:
00131 is_empty1=is_empty(self.h1)
00132 is_empty2=is_empty(self.h2)
00133 are_empty=is_empty1 and is_empty2
00134 one_empty=is_empty1 or is_empty2
00135
00136 Nbins1= getNbins(self.h1)
00137 Nbins2= getNbins(self.h2)
00138
00139 if are_empty:
00140
00141
00142 return 1
00143 elif one_empty:
00144
00145
00146 return 1
00147
00148
00149 if Nbins1!=Nbins2:
00150 return test_codes["DIFF_BIN"]
00151
00152 self.rank=self.do_test()
00153 self.is_init=True
00154 return self.rank
00155
00156 def get_status(self):
00157 status = SUCCESS
00158 if self.get_rank()<0:
00159 status=NULL
00160 logger(0,"+++ Test %s FAILED: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))
00161 elif self.get_rank()<=self.threshold:
00162 status=FAIL
00163 logger(0,"+++ Test %s: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))
00164 return status
00165
00166 def do_test(self):
00167 pass
00168
00169
00170
00171 def is_empty(h):
00172 for i in xrange(1,getNbins(h)):
00173 if h.GetBinContent(i)!=0: return False
00174 return True
00175
00176
00177
00178
00179 def is_sparse(h):
00180 filled_bins=0.
00181 nbins=h.GetNbinsX()
00182 for ibin in xrange(nbins):
00183 if h.GetBinContent(ibin)>0:
00184 filled_bins+=1
00185
00186 if filled_bins/nbins < .5:
00187 return True
00188 else:
00189 return False
00190
00191
00192
00193 class KS(StatisticalTest):
00194 def __init__(self, threshold):
00195 StatisticalTest.__init__(self,threshold)
00196 self.name="KS"
00197
00198 def do_test(self):
00199
00200
00201 for h in self.h1,self.h2:
00202 w2s=h.GetSumw2()
00203 if w2s.GetSize()==0:
00204 h.Sumw2()
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216 return self.h1.KolmogorovTest(self.h2)
00217
00218
00219 import array
00220 def profile2histo(profile):
00221 if not profile.InheritsFrom("TH1"):
00222 return profile
00223
00224 bin_low_edges=[]
00225 n_bins=profile.GetNbinsX()
00226
00227 for ibin in xrange(1,n_bins+2):
00228 bin_low_edges.append(profile.GetBinLowEdge(ibin))
00229 bin_low_edges=array.array('f',bin_low_edges)
00230 histo=TH1F(profile.GetName(),profile.GetTitle(),n_bins,bin_low_edges)
00231 for ibin in xrange(0,n_bins+1):
00232 histo.SetBinContent(ibin,profile.GetBinContent(ibin))
00233 histo.SetBinError(ibin,profile.GetBinError(ibin))
00234
00235 return histo
00236
00237
00238 class Chi2(StatisticalTest):
00239 def __init__(self, threshold):
00240 StatisticalTest.__init__(self,threshold)
00241 self.name="Chi2"
00242
00243 def check_filled_bins(self,min_filled):
00244 nbins=self.h1.GetNbinsX()
00245 n_filled_l=[]
00246 for h in self.h1,self.h2:
00247 nfilled=0.
00248 for ibin in xrange(1,nbins+1):
00249 if h.GetBinContent(ibin)>0:
00250 nfilled+=1
00251 n_filled_l.append(nfilled)
00252 return len(filter (lambda x:x>=min_filled,n_filled_l) )>0
00253
00254 def absval(self):
00255 nbins=getNbins(self.h1)
00256 binc=0
00257 for i in xrange(1,nbins):
00258 for h in self.h1,self.h2:
00259 binc=h.GetBinContent(i)
00260 if binc<0:
00261 h.SetBinContent(i,-1*binc)
00262 if h.GetBinError(i)==0 and binc!=0:
00263
00264 h.SetBinContent(i,0)
00265
00266
00267 def do_test(self):
00268 self.absval()
00269 if self.check_filled_bins(3):
00270 if self.h1.InheritsFrom("TProfile") or (self.h1.GetEntries()!=self.h1.GetSumOfWeights()):
00271 chi2=self.h1.Chi2Test(self.h2,'WW')
00272
00273 return chi2
00274 else:
00275 return self.h1.Chi2Test(self.h2,'UU')
00276 else:
00277 return 1
00278
00279
00280
00281
00282 class BinToBin(StatisticalTest):
00283 """The bin to bin comparison builds a fake pvalue. It is 0 if the number of
00284 bins is different. It is % of corresponding bins otherwhise.
00285 A threshold of 1 is needed to require a 1 to 1 correspondance between
00286 hisograms.
00287 """
00288 def __init__(self, threshold=1):
00289 StatisticalTest.__init__(self, threshold)
00290 self.name='BinToBin'
00291 self.epsilon= 0.000001
00292
00293 def checkBinningMatches(self):
00294 if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
00295 or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
00296 or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
00297 or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
00298 or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
00299 or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
00300 or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
00301 or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
00302 or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
00303 return False
00304 return True
00305
00306 def do_test(self):
00307
00308 if not self.checkBinningMatches():
00309 return test_codes["DIFF_BIN"]
00310
00311 equal = 1
00312 nbins = getNbins(self.h1)
00313 n_ok_bins=0.0
00314 for ibin in xrange(0,nbins):
00315 ibin+=1
00316 h1bin=self.h1.GetBinContent(ibin)
00317 h2bin=self.h2.GetBinContent(ibin)
00318 bindiff=h1bin-h2bin
00319
00320 binavg=.5*(h1bin+h2bin)
00321
00322 if binavg==0 or abs(bindiff) < self.epsilon:
00323 n_ok_bins+=1
00324
00325 else:
00326 print "Bin %ibin: bindiff %s" %(ibin,bindiff)
00327
00328
00329
00330
00331 rank=n_ok_bins/nbins
00332
00333 if rank!=1:
00334 print "Histogram %s differs: nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins)
00335
00336 return rank
00337
00338
00339
00340 class BinToBin1percent(StatisticalTest):
00341 """The bin to bin comparison builds a fake pvalue. It is 0 if the number of
00342 bins is different. It is % of corresponding bins otherwhise.
00343 A threshold of 1 is needed to require a 1 to 1 correspondance between
00344 hisograms.
00345 """
00346 def __init__(self, threshold=1):
00347 StatisticalTest.__init__(self, threshold)
00348 self.name='BinToBin1percent'
00349 self.epsilon= 0.000001
00350 self.tolerance= 0.01
00351
00352 def checkBinningMatches(self):
00353 if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
00354 or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
00355 or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
00356 or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
00357 or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
00358 or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
00359 or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
00360 or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
00361 or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
00362 return False
00363 return True
00364
00365 def do_test(self):
00366
00367 if not self.checkBinningMatches():
00368 return test_codes["DIFF_BIN"]
00369
00370 equal = 1
00371 nbins = getNbins(self.h1)
00372 n_ok_bins=0.0
00373 for ibin in xrange(0,nbins):
00374 ibin+=1
00375 h1bin=self.h1.GetBinContent(ibin)
00376 h2bin=self.h2.GetBinContent(ibin)
00377 bindiff=h1bin-h2bin
00378
00379 binavg=.5*(h1bin+h2bin)
00380
00381 if binavg==0 or 100*abs(bindiff)/binavg < self.tolerance:
00382 n_ok_bins+=1
00383
00384 else:
00385 print "-->Bin %i bin: bindiff %s (%s - %s )" %(ibin,bindiff,h1bin,h2bin)
00386
00387
00388
00389
00390 rank=n_ok_bins/nbins
00391
00392 if rank!=1:
00393 print "%s nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins)
00394
00395 return rank
00396
00397 Statistical_Tests={"KS":KS,
00398 "Chi2":Chi2,
00399 "BinToBin":BinToBin,
00400 "BinToBin1percent":BinToBin1percent,
00401 "Bin2Bin":BinToBin,
00402 "b2b":BinToBin,}
00403
00404
00405 def ask_ok(prompt, retries=4, complaint='yes or no'):
00406 while True:
00407 ok = raw_input(prompt)
00408 if ok in ('y', 'ye', 'yes'):
00409 return True
00410 if ok in ('n', 'no'):
00411 return False
00412 retries = retries - 1
00413 if retries < 0:
00414 raise IOError('refusenik user')
00415 print complaint
00416
00417
00418
00419 class unpickler(Thread):
00420 def __init__(self,filename):
00421 Thread.__init__(self)
00422 self.filename=filename
00423 self.directory=""
00424
00425 def run(self):
00426 print "Reading directory from %s" %(self.filename)
00427 ifile=open(self.filename,"rb")
00428 self.directory=load(ifile)
00429 ifile.close()
00430
00431
00432
00433 def wget(url):
00434 """ Fetch the WHOLE file, not in bunches... To be optimised.
00435 """
00436 opener=build_opener(X509CertOpen())
00437 datareq = Request(url)
00438 datareq.add_header('authenticated_wget', "The ultimate wgetter")
00439 bin_content=None
00440 try:
00441 filename=basename(url)
00442 print "Checking existence of file %s on disk..."%filename
00443 if not isfile("./%s"%filename):
00444 bin_content=opener.open(datareq).read()
00445 else:
00446 print "File %s exists, skipping.." %filename
00447 except ValueError:
00448 print "Error: Unknown url %s" %url
00449
00450 if bin_content!=None:
00451 ofile = open(filename, 'wb')
00452 ofile.write(bin_content)
00453 ofile.close()
00454
00455
00456
00457
00458 def get_relvaldata_id(file):
00459 """Returns unique relvaldata ID for a given file."""
00460 run_id = re.search('R\d{9}', file)
00461 run = re.search('_RelVal_([\w\d]*)-v\d__', file)
00462 if not run:
00463 run = re.search('GR_R_\d*_V\d*C?_([\w\d]*)-v\d__', file)
00464 if run_id and run:
00465 return (run_id.group(), run.group(1))
00466 return None
00467
00468 def get_relvaldata_cmssw_version(file):
00469 """Returns tuple (CMSSW release, GR_R version) for specified RelValData file."""
00470 cmssw_release = re.findall('(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
00471 gr_r_version = re.findall('-(GR_R_\d*_V\d*\w?)(?:_RelVal)?_', file)
00472 if not gr_r_version:
00473 gr_r_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-(\w*)_RelVal_', file)
00474 if cmssw_release and gr_r_version:
00475 return (cmssw_release[0], gr_r_version[0])
00476
00477 def get_relvaldata_version(file):
00478 """Returns tuple (CMSSW version, run version) for specified file."""
00479 cmssw_version = re.findall('DQM_V(\d*)_', file)
00480 run_version = re.findall('_RelVal_[\w\d]*-v(\d)__', file)
00481 if not run_version:
00482 run_version = re.findall('GR_R_\d*_V\d*C?_[\w\d]*-v(\d)__', file)
00483 if cmssw_version and run_version:
00484 return (int(cmssw_version[0]), int(run_version[0]))
00485
00486 def get_relvaldata_max_version(files):
00487 """Returns file with maximum version at a) beggining of the file,
00488 e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
00489 max_file = files[0]
00490 max_v = get_relvaldata_version(files[0])
00491 for file in files:
00492 file_v = get_relvaldata_version(file)
00493 if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
00494 max_file = file
00495 max_v = file_v
00496 return max_file
00497
00498
00499 def get_relval_version(file):
00500 """Returns tuple (CMSSW version, run version) for specified file."""
00501 cmssw_version = re.findall('DQM_V(\d*)_', file)
00502 run_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_[\w\d]*)?-v(\d*)__', file)
00503 if cmssw_version and run_version:
00504 return (int(cmssw_version[0]), int(run_version[0]))
00505
00506 def get_relval_max_version(files):
00507 """Returns file with maximum version at a) beggining of the file,
00508 e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
00509 max_file = files[0]
00510 max_v = get_relval_version(files[0])
00511 for file in files:
00512 file_v = get_relval_version(file)
00513 if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
00514 max_file = file
00515 max_v = file_v
00516 return max_file
00517
00518 def get_relval_cmssw_version(file):
00519 cmssw_release = re.findall('(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
00520 gr_r_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(?:_[\w\d]*)?-v', file)
00521 if cmssw_release and gr_r_version:
00522 return (cmssw_release[0], gr_r_version[0])
00523
00524 def get_relval_id(file):
00525 """Returns unique relval ID (dataset name) for a given file."""
00526 dataset_name = re.findall('R\d{9}__([\w\d]*)__CMSSW_', file)
00527 run = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_([\w\d]*))?-v\d*__', file)
00528 if run[0]:
00529 return (dataset_name[0], run[0])
00530 else:
00531 return (dataset_name[0], '_V\d*\w?-v\d*__')
00532
00533
00534 def is_relvaldata(files):
00535 is_relvaldata_re = re.compile('_RelVal_')
00536 return any([is_relvaldata_re.search(filename) for filename in files])
00537
00538 def make_files_pairs(files, verbose=True):
00539
00540 if is_relvaldata(files):
00541 get_cmssw_version = get_relvaldata_cmssw_version
00542 get_id = get_relvaldata_id
00543 get_max_version = get_relvaldata_max_version
00544
00545 else:
00546 get_cmssw_version = get_relval_cmssw_version
00547 get_id = get_relval_id
00548 get_max_version = get_relval_max_version
00549
00550
00551
00552 versions_files = dict()
00553 for file in files:
00554 version = get_cmssw_version(file)
00555 if versions_files.has_key(version):
00556 versions_files[version].append(file)
00557 else:
00558 versions_files[version] = [file]
00559
00560
00561 if verbose:
00562 print '\nFound versions:'
00563 for version in versions_files:
00564 print '%s: %d files' % (str(version), len(versions_files[version]))
00565
00566
00567 versions = versions_files.keys()
00568 sizes = [len(value) for value in versions_files.values()]
00569 v1 = versions[sizes.index(max(sizes))]
00570 versions.remove(v1)
00571 sizes.remove(max(sizes))
00572 v2 = versions[sizes.index(max(sizes))]
00573
00574
00575 if verbose:
00576 print '\nPairing %s (%d files) and %s (%d files)' % (str(v1),
00577 len(versions_files[v1]), str(v2), len(versions_files[v2]))
00578
00579
00580 pairs = []
00581 for unique_id in set([get_id(file) for file in versions_files[v1]]):
00582 dataset_re = re.compile(unique_id[0]+'_')
00583 run = re.compile(unique_id[1])
00584 c1_files = [file for file in versions_files[v1] if dataset_re.search(file) and run.search(file)]
00585 c2_files = [file for file in versions_files[v2] if dataset_re.search(file) and run.search(file)]
00586 if len(c1_files) > 0 and len(c2_files) > 0:
00587 first_file = get_max_version(c1_files)
00588 second_file = get_max_version(c2_files)
00589 pairs.extend((first_file, second_file))
00590 if verbose:
00591 print "\nPaired and got %d files.\n" % len(pairs)
00592 return pairs