00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 import array
00016 import os
00017 import re
00018 import sys
00019 from cPickle import load
00020 from os.path import dirname,basename,join,isfile
00021 from threading import Thread
00022 from time import asctime
00023
00024 theargv=sys.argv
00025 sys.argv=[]
00026 from ROOT import *
00027 import ROOT
00028 ROOT.gErrorIgnoreLevel=1001
00029 ROOT.gROOT.SetBatch(True)
00030 sys.argv=theargv
00031
00032 from urllib2 import Request,build_opener,urlopen
00033
00034 if os.environ.has_key("RELMON_SA"):
00035 from definitions import *
00036 from authentication import X509CertOpen
00037 from utils import __file__ as this_module_name
00038 else:
00039 from Utilities.RelMon.definitions import *
00040 from Utilities.RelMon.authentication import X509CertOpen
00041 from Utilities.RelMon.utils import __file__ as this_module_name
00042
00043
00044
00045
00046 _log_level=10
00047 def logger(msg_level,message):
00048 if msg_level>=_log_level:
00049 print "[%s] %s" %(asctime(),message)
00050
00051
00052 def setTDRStyle():
00053 this_dir=dirname(this_module_name)
00054 this_dir_one_up=this_dir[:this_dir.rfind("/")+1]
00055
00056 style_file=''
00057 if os.environ.has_key("RELMON_SA"):
00058 style_file=this_dir_one_up+"data/tdrstyle_mod.C"
00059 else:
00060 style_file="%s/src/Utilities/RelMon/data/tdrstyle_mod.C"%(os.environ["CMSSW_BASE"])
00061 try:
00062 gROOT.ProcessLine(".L %s" %style_file)
00063 gROOT.ProcessLine("setTDRStyle()")
00064 except:
00065 "Print could not set the TDR style. File %s not found?" %style_file
00066
00067
00068
00069 def literal2root (literal,rootType):
00070 bitsarray = array.array('B')
00071 bitsarray.fromstring(literal.decode('hex'))
00072
00073 tbuffer=0
00074 try:
00075 tbuffer = TBufferFile(TBufferFile.kRead, len(bitsarray), bitsarray, False,0)
00076 except:
00077 print "could not transform to object array:"
00078 print [ i for i in bitsarray ]
00079
00080
00081 if rootType == 'TPROF':
00082 rootType = 'TProfile'
00083 if rootType == 'TPROF2D':
00084 rootType = 'TProfile2D'
00085
00086 root_class=eval(rootType+'.Class()')
00087
00088 return tbuffer.ReadObject(root_class)
00089
00090
00091
00092 def getNbins(h):
00093 biny=h.GetNbinsY()
00094 if biny>1:biny+=1
00095 binz=h.GetNbinsZ()
00096 if binz>1:binz+=1
00097 return (h.GetNbinsX()+1)*(biny)*(binz)
00098
00099
00100
00101
00102 class StatisticalTest(object):
00103 def __init__(self,threshold):
00104 self.name=""
00105 self.h1=None
00106 self.h2=None
00107 self.threshold=float(threshold)
00108 self.rank=-1
00109 self.is_init=False
00110
00111 def set_operands(self,h1,h2):
00112 self.h1=h1
00113 self.h2=h2
00114
00115 def get_rank(self):
00116 if not self.is_init:
00117 if self.rank < 0:
00118 type1=type(self.h1)
00119 type2=type(self.h2)
00120 if (type1 != type2):
00121 logger(1,"*** ERROR: object types in comparison don't match: %s!=%s" %(type1,type2))
00122 self.rank=test_codes["DIFF_TYPES"]
00123 elif not self.h2.InheritsFrom("TH1"):
00124 logger(1,"*** ERROR: object type is not histogram but a %s" %(type1))
00125 self.rank=test_codes["NO_HIST"]
00126
00127
00128
00129
00130 else:
00131 is_empty1=is_empty(self.h1)
00132 is_empty2=is_empty(self.h2)
00133 are_empty=is_empty1 and is_empty2
00134 one_empty=is_empty1 or is_empty2
00135
00136 Nbins1= getNbins(self.h1)
00137 Nbins2= getNbins(self.h2)
00138
00139 if are_empty:
00140
00141
00142 return 1
00143 elif one_empty:
00144
00145
00146 return 1
00147
00148
00149 if Nbins1!=Nbins2:
00150 return test_codes["DIFF_BIN"]
00151
00152 self.rank=self.do_test()
00153 self.is_init=True
00154 return self.rank
00155
00156 def get_status(self):
00157 status = SUCCESS
00158 if self.get_rank()<0:
00159 status=NULL
00160 logger(0,"+++ Test %s FAILED: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))
00161 elif self.get_rank() < self.threshold:
00162 status=FAIL
00163 logger(0,"+++ Test %s: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))
00164 return status
00165
00166 def do_test(self):
00167 pass
00168
00169
00170
00171 def is_empty(h):
00172 for i in xrange(1,getNbins(h)):
00173 if h.GetBinContent(i)!=0: return False
00174 return True
00175
00176
00177
00178
00179 def is_sparse(h):
00180 filled_bins=0.
00181 nbins=h.GetNbinsX()
00182 for ibin in xrange(nbins):
00183 if h.GetBinContent(ibin)>0:
00184 filled_bins+=1
00185
00186 if filled_bins/nbins < .5:
00187 return True
00188 else:
00189 return False
00190
00191
00192
00193 class KS(StatisticalTest):
00194 def __init__(self, threshold):
00195 StatisticalTest.__init__(self,threshold)
00196 self.name="KS"
00197
00198 def do_test(self):
00199
00200
00201 for h in self.h1,self.h2:
00202 w2s=h.GetSumw2()
00203 if w2s.GetSize()==0:
00204 h.Sumw2()
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216 return self.h1.KolmogorovTest(self.h2)
00217
00218
00219 import array
00220 def profile2histo(profile):
00221 if not profile.InheritsFrom("TH1"):
00222 return profile
00223
00224 bin_low_edges=[]
00225 n_bins=profile.GetNbinsX()
00226
00227 for ibin in xrange(1,n_bins+2):
00228 bin_low_edges.append(profile.GetBinLowEdge(ibin))
00229 bin_low_edges=array.array('f',bin_low_edges)
00230 histo=TH1F(profile.GetName(),profile.GetTitle(),n_bins,bin_low_edges)
00231 for ibin in xrange(0,n_bins+1):
00232 histo.SetBinContent(ibin,profile.GetBinContent(ibin))
00233 histo.SetBinError(ibin,profile.GetBinError(ibin))
00234
00235 return histo
00236
00237
00238 class Chi2(StatisticalTest):
00239 def __init__(self, threshold):
00240 StatisticalTest.__init__(self,threshold)
00241 self.name="Chi2"
00242
00243 def check_filled_bins(self,min_filled):
00244 nbins=self.h1.GetNbinsX()
00245 n_filled_l=[]
00246 for h in self.h1,self.h2:
00247 nfilled=0.
00248 for ibin in xrange(1,nbins+1):
00249 if h.GetBinContent(ibin)>0:
00250 nfilled+=1
00251 n_filled_l.append(nfilled)
00252 return len(filter (lambda x:x>=min_filled,n_filled_l) )>0
00253
00254 def absval(self):
00255 nbins=getNbins(self.h1)
00256 binc=0
00257 for i in xrange(1,nbins):
00258 for h in self.h1,self.h2:
00259 binc=h.GetBinContent(i)
00260 if binc<0:
00261 h.SetBinContent(i,-1*binc)
00262 if h.GetBinError(i)==0 and binc!=0:
00263
00264 h.SetBinContent(i,0)
00265
00266 def check_histograms(self, histogram):
00267 if histogram.InheritsFrom("TProfile") or (histogram.GetEntries()!=histogram.GetSumOfWeights()):
00268 return 'W'
00269 else:
00270 return 'U'
00271
00272 def do_test(self):
00273 self.absval()
00274 if self.check_filled_bins(3):
00275
00276
00277
00278
00279
00280
00281 hist1 = self.check_histograms(self.h1)
00282 hist2 = self.check_histograms(self.h2)
00283 if hist1 =='W' and hist2 =='W':
00284 chi2 = self.h1.Chi2Test(self.h2,'WW')
00285 return chi2
00286 elif hist1 == 'U' and hist2 == 'U':
00287 chi2 = self.h1.Chi2Test(self.h2,'UU')
00288 return chi2
00289 elif hist1 == 'U' and hist2 == 'W':
00290 chi2 = self.h1.Chi2Test(self.h2,'UW')
00291 return chi2
00292 elif hist1 == 'W' and hist2 == 'U':
00293 chi2 = self.h2.Chi2Test(self.h1,'UW')
00294 return chi2
00295 else:
00296 return 1
00297
00298
00299
00300
00301 class BinToBin(StatisticalTest):
00302 """The bin to bin comparison builds a fake pvalue. It is 0 if the number of
00303 bins is different. It is % of corresponding bins otherwhise.
00304 A threshold of 1 is needed to require a 1 to 1 correspondance between
00305 hisograms.
00306 """
00307 def __init__(self, threshold=1):
00308 StatisticalTest.__init__(self, threshold)
00309 self.name='BinToBin'
00310 self.epsilon= 0.000001
00311
00312 def checkBinningMatches(self):
00313 if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
00314 or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
00315 or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
00316 or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
00317 or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
00318 or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
00319 or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
00320 or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
00321 or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
00322 return False
00323 return True
00324
00325 def do_test(self):
00326
00327 if not self.checkBinningMatches():
00328 return test_codes["DIFF_BIN"]
00329
00330 equal = 1
00331 nbins = getNbins(self.h1)
00332 n_ok_bins=0.0
00333 for ibin in xrange(0,nbins):
00334 h1bin=self.h1.GetBinContent(ibin)
00335 h2bin=self.h2.GetBinContent(ibin)
00336 bindiff=h1bin-h2bin
00337
00338 binavg=.5*(h1bin+h2bin)
00339
00340 if binavg==0 or abs(bindiff) < self.epsilon:
00341 n_ok_bins+=1
00342
00343 else:
00344 print "Bin %ibin: bindiff %s" %(ibin,bindiff)
00345
00346
00347
00348
00349 rank=n_ok_bins/nbins
00350
00351 if rank!=1:
00352 print "Histogram %s differs: nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins)
00353
00354 return rank
00355
00356
00357
00358 class BinToBin1percent(StatisticalTest):
00359 """The bin to bin comparison builds a fake pvalue. It is 0 if the number of
00360 bins is different. It is % of corresponding bins otherwhise.
00361 A threshold of 1 is needed to require a 1 to 1 correspondance between
00362 hisograms.
00363 """
00364 def __init__(self, threshold=1):
00365 StatisticalTest.__init__(self, threshold)
00366 self.name='BinToBin1percent'
00367 self.epsilon= 0.000001
00368 self.tolerance= 0.01
00369
00370 def checkBinningMatches(self):
00371 if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
00372 or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
00373 or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
00374 or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
00375 or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
00376 or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
00377 or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
00378 or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
00379 or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
00380 return False
00381 return True
00382
00383 def do_test(self):
00384
00385 if not self.checkBinningMatches():
00386 return test_codes["DIFF_BIN"]
00387
00388 equal = 1
00389 nbins = getNbins(self.h1)
00390 n_ok_bins=0.0
00391 for ibin in xrange(0,nbins):
00392 ibin+=1
00393 h1bin=self.h1.GetBinContent(ibin)
00394 h2bin=self.h2.GetBinContent(ibin)
00395 bindiff=h1bin-h2bin
00396
00397 binavg=.5*(h1bin+h2bin)
00398
00399 if binavg==0 or 100*abs(bindiff)/binavg < self.tolerance:
00400 n_ok_bins+=1
00401
00402 else:
00403 print "-->Bin %i bin: bindiff %s (%s - %s )" %(ibin,bindiff,h1bin,h2bin)
00404
00405
00406
00407
00408 rank=n_ok_bins/nbins
00409
00410 if rank!=1:
00411 print "%s nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins)
00412
00413 return rank
00414
00415 Statistical_Tests={"KS":KS,
00416 "Chi2":Chi2,
00417 "BinToBin":BinToBin,
00418 "BinToBin1percent":BinToBin1percent,
00419 "Bin2Bin":BinToBin,
00420 "b2b":BinToBin,}
00421
00422
00423 def ask_ok(prompt, retries=4, complaint='yes or no'):
00424 while True:
00425 ok = raw_input(prompt)
00426 if ok in ('y', 'ye', 'yes'):
00427 return True
00428 if ok in ('n', 'no'):
00429 return False
00430 retries = retries - 1
00431 if retries < 0:
00432 raise IOError('refusenik user')
00433 print complaint
00434
00435
00436
00437 class unpickler(Thread):
00438 def __init__(self,filename):
00439 Thread.__init__(self)
00440 self.filename=filename
00441 self.directory=""
00442
00443 def run(self):
00444 print "Reading directory from %s" %(self.filename)
00445 ifile=open(self.filename,"rb")
00446 self.directory=load(ifile)
00447 ifile.close()
00448
00449
00450
00451 def wget(url):
00452 """ Fetch the WHOLE file, not in bunches... To be optimised.
00453 """
00454 opener=build_opener(X509CertOpen())
00455 datareq = Request(url)
00456 datareq.add_header('authenticated_wget', "The ultimate wgetter")
00457 bin_content=None
00458 try:
00459 filename=basename(url)
00460 print "Checking existence of file %s on disk..."%filename
00461 if not isfile("./%s"%filename):
00462 bin_content=opener.open(datareq).read()
00463 else:
00464 print "File %s exists, skipping.." %filename
00465 except ValueError:
00466 print "Error: Unknown url %s" %url
00467
00468 if bin_content!=None:
00469 ofile = open(filename, 'wb')
00470 ofile.write(bin_content)
00471 ofile.close()
00472
00473
00474
00475
00476 def get_relvaldata_id(file):
00477 """Returns unique relvaldata ID for a given file."""
00478 run_id = re.search('R\d{9}', file)
00479 run = re.search('_RelVal_([\w\d]*)-v\d__', file)
00480 if not run:
00481 run = re.search('GR_R_\d*_V\d*C?_([\w\d]*)-v\d__', file)
00482 if run_id and run:
00483 return (run_id.group(), run.group(1))
00484 return None
00485
00486 def get_relvaldata_cmssw_version(file):
00487 """Returns tuple (CMSSW release, GR_R version) for specified RelValData file."""
00488 cmssw_release = re.findall('(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
00489 gr_r_version = re.findall('-(GR_R_\d*_V\d*\w?)(?:_RelVal)?_', file)
00490 if not gr_r_version:
00491 gr_r_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-(\w*)_RelVal_', file)
00492 if cmssw_release and gr_r_version:
00493 return (cmssw_release[0], gr_r_version[0])
00494
00495 def get_relvaldata_version(file):
00496 """Returns tuple (CMSSW version, run version) for specified file."""
00497 cmssw_version = re.findall('DQM_V(\d*)_', file)
00498 run_version = re.findall('_RelVal_[\w\d]*-v(\d)__', file)
00499 if not run_version:
00500 run_version = re.findall('GR_R_\d*_V\d*C?_[\w\d]*-v(\d)__', file)
00501 if cmssw_version and run_version:
00502 return (int(cmssw_version[0]), int(run_version[0]))
00503
00504 def get_relvaldata_max_version(files):
00505 """Returns file with maximum version at a) beggining of the file,
00506 e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
00507 max_file = files[0]
00508 max_v = get_relvaldata_version(files[0])
00509 for file in files:
00510 file_v = get_relvaldata_version(file)
00511 if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
00512 max_file = file
00513 max_v = file_v
00514 return max_file
00515
00516
00517 def get_relval_version(file):
00518 """Returns tuple (CMSSW version, run version) for specified file."""
00519 cmssw_version = re.findall('DQM_V(\d*)_', file)
00520 run_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_[\w\d]*)?-v(\d*)__', file)
00521 if cmssw_version and run_version:
00522 return (int(cmssw_version[0]), int(run_version[0]))
00523
00524 def get_relval_max_version(files):
00525 """Returns file with maximum version at a) beggining of the file,
00526 e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
00527 max_file = files[0]
00528 max_v = get_relval_version(files[0])
00529 for file in files:
00530 file_v = get_relval_version(file)
00531 if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
00532 max_file = file
00533 max_v = file_v
00534 return max_file
00535
00536 def get_relval_cmssw_version(file):
00537 cmssw_release = re.findall('(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
00538 gr_r_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(_[\w\d]*)?-v', file)
00539 if cmssw_release and gr_r_version:
00540 return (cmssw_release[0], gr_r_version[0])
00541
00542 def get_relval_id(file):
00543 """Returns unique relval ID (dataset name) for a given file."""
00544 dataset_name = re.findall('R\d{9}__([\w\d]*)__CMSSW_', file)
00545 return dataset_name[0]
00546
00547
00548 def is_relvaldata(files):
00549 is_relvaldata_re = re.compile('_RelVal_')
00550 return any([is_relvaldata_re.search(filename) for filename in files])
00551
00552 def make_files_pairs(files, verbose=True):
00553
00554 if is_relvaldata(files):
00555 is_relval_data = True
00556 get_cmssw_version = get_relvaldata_cmssw_version
00557 get_id = get_relvaldata_id
00558 get_max_version = get_relvaldata_max_version
00559
00560 else:
00561 is_relval_data = False
00562 get_cmssw_version = get_relval_cmssw_version
00563 get_id = get_relval_id
00564 get_max_version = get_relval_max_version
00565
00566
00567
00568 versions_files = dict()
00569 for file in files:
00570 version = get_cmssw_version(file)
00571 if versions_files.has_key(version):
00572 versions_files[version].append(file)
00573 else:
00574 versions_files[version] = [file]
00575
00576
00577 if verbose:
00578 print '\nFound versions:'
00579 for version in versions_files:
00580 print '%s: %d files' % (str(version), len(versions_files[version]))
00581
00582 if len(versions_files.keys()) <= 1:
00583 print '\nFound too little versions, there is nothing to pair. Exiting...\n'
00584 exit()
00585
00586
00587 versions = versions_files.keys()
00588 sizes = [len(value) for value in versions_files.values()]
00589 v1 = versions[sizes.index(max(sizes))]
00590 versions.remove(v1)
00591 sizes.remove(max(sizes))
00592 v2 = versions[sizes.index(max(sizes))]
00593
00594
00595 if verbose:
00596 print '\nPairing %s (%d files) and %s (%d files)' % (str(v1),
00597 len(versions_files[v1]), str(v2), len(versions_files[v2]))
00598
00599
00600 print '\nGot pairs:'
00601 pairs = []
00602 for unique_id in set([get_id(file) for file in versions_files[v1]]):
00603 if is_relval_data:
00604 dataset_re = re.compile(unique_id[0]+'_')
00605 run_re = re.compile(unique_id[1])
00606 c1_files = [file for file in versions_files[v1] if dataset_re.search(file) and run_re.search(file)]
00607 c2_files = [file for file in versions_files[v2] if dataset_re.search(file) and run_re.search(file)]
00608 else:
00609 dataset_re = re.compile(unique_id+'_')
00610 c1_files = [file for file in versions_files[v1] if dataset_re.search(file)]
00611 c2_files = [file for file in versions_files[v2] if dataset_re.search(file)]
00612
00613 if len(c1_files) > 0 and len(c2_files) > 0:
00614 first_file = get_max_version(c1_files)
00615 second_file = get_max_version(c2_files)
00616 print '%s\n%s\n' % (first_file, second_file)
00617 pairs.extend((first_file, second_file))
00618 if verbose:
00619 print "Paired and got %d files.\n" % len(pairs)
00620 return pairs