00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 import array
00016 import os
00017 import re
00018 import sys
00019 from cPickle import load
00020 from os.path import dirname,basename,join,isfile
00021 from threading import Thread
00022 from time import asctime
00023
00024 theargv=sys.argv
00025 sys.argv=[]
00026 from ROOT import *
00027 import ROOT
00028 ROOT.gErrorIgnoreLevel=1001
00029 ROOT.gROOT.SetBatch(True)
00030 sys.argv=theargv
00031
00032 from urllib2 import Request,build_opener,urlopen
00033
00034 if os.environ.has_key("RELMON_SA"):
00035 from definitions import *
00036 from authentication import X509CertOpen
00037 from utils import __file__ as this_module_name
00038 else:
00039 from Utilities.RelMon.definitions import *
00040 from Utilities.RelMon.authentication import X509CertOpen
00041 from Utilities.RelMon.utils import __file__ as this_module_name
00042
00043
00044
00045
00046 _log_level=10
00047 def logger(msg_level,message):
00048 if msg_level>=_log_level:
00049 print "[%s] %s" %(asctime(),message)
00050
00051
00052 def setTDRStyle():
00053 this_dir=dirname(this_module_name)
00054 this_dir_one_up=this_dir[:this_dir.rfind("/")+1]
00055
00056 style_file=''
00057 if os.environ.has_key("RELMON_SA"):
00058 style_file=this_dir_one_up+"data/tdrstyle_mod.C"
00059 else:
00060 style_file="%s/src/Utilities/RelMon/data/tdrstyle_mod.C"%(os.environ["CMSSW_BASE"])
00061 try:
00062 gROOT.ProcessLine(".L %s" %style_file)
00063 gROOT.ProcessLine("setTDRStyle()")
00064 except:
00065 "Print could not set the TDR style. File %s not found?" %style_file
00066
00067
00068
00069 def literal2root (literal,rootType):
00070 bitsarray = array.array('B')
00071 bitsarray.fromstring(literal.decode('hex'))
00072
00073 tbuffer=0
00074 try:
00075 tbuffer = TBufferFile(TBufferFile.kRead, len(bitsarray), bitsarray, False,0)
00076 except:
00077 print "could not transform to object array:"
00078 print [ i for i in bitsarray ]
00079
00080
00081 if rootType == 'TPROF':
00082 rootType = 'TProfile'
00083 if rootType == 'TPROF2D':
00084 rootType = 'TProfile2D'
00085
00086 root_class=eval(rootType+'.Class()')
00087
00088 return tbuffer.ReadObject(root_class)
00089
00090
00091
00092 def getNbins(h):
00093 biny=h.GetNbinsY()
00094 if biny>1:biny+=1
00095 binz=h.GetNbinsZ()
00096 if binz>1:binz+=1
00097 return (h.GetNbinsX()+1)*(biny)*(binz)
00098
00099
00100
00101
00102 class StatisticalTest(object):
00103 def __init__(self,threshold):
00104 self.name=""
00105 self.h1=None
00106 self.h2=None
00107 self.threshold=float(threshold)
00108 self.rank=-1
00109 self.is_init=False
00110
00111 def set_operands(self,h1,h2):
00112 self.h1=h1
00113 self.h2=h2
00114
00115 def get_rank(self):
00116 if not self.is_init:
00117 if self.rank < 0:
00118 type1=type(self.h1)
00119 type2=type(self.h2)
00120 if (type1 != type2):
00121 logger(1,"*** ERROR: object types in comparison don't match: %s!=%s" %(type1,type2))
00122 self.rank=test_codes["DIFF_TYPES"]
00123 elif not self.h2.InheritsFrom("TH1"):
00124 logger(1,"*** ERROR: object type is not histogram but a %s" %(type1))
00125 self.rank=test_codes["NO_HIST"]
00126
00127
00128
00129
00130 else:
00131 is_empty1=is_empty(self.h1)
00132 is_empty2=is_empty(self.h2)
00133 are_empty=is_empty1 and is_empty2
00134 one_empty=is_empty1 or is_empty2
00135
00136 Nbins1= getNbins(self.h1)
00137 Nbins2= getNbins(self.h2)
00138
00139 if are_empty:
00140
00141
00142 return 1
00143 elif one_empty:
00144
00145
00146 return 1
00147
00148
00149 if Nbins1!=Nbins2:
00150 return test_codes["DIFF_BIN"]
00151
00152 self.rank=self.do_test()
00153 self.is_init=True
00154 return self.rank
00155
00156 def get_status(self):
00157 status = SUCCESS
00158 if self.get_rank()<0:
00159 status=NULL
00160 logger(0,"+++ Test %s FAILED: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))
00161 elif self.get_rank() < self.threshold:
00162 status=FAIL
00163 logger(0,"+++ Test %s: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))
00164 return status
00165
00166 def do_test(self):
00167 pass
00168
00169
00170
00171 def is_empty(h):
00172 for i in xrange(1,getNbins(h)):
00173 if h.GetBinContent(i)!=0: return False
00174 return True
00175
00176
00177
00178
00179 def is_sparse(h):
00180 filled_bins=0.
00181 nbins=h.GetNbinsX()
00182 for ibin in xrange(nbins):
00183 if h.GetBinContent(ibin)>0:
00184 filled_bins+=1
00185
00186 if filled_bins/nbins < .5:
00187 return True
00188 else:
00189 return False
00190
00191
00192
00193 class KS(StatisticalTest):
00194 def __init__(self, threshold):
00195 StatisticalTest.__init__(self,threshold)
00196 self.name="KS"
00197
00198 def do_test(self):
00199
00200
00201 for h in self.h1,self.h2:
00202 w2s=h.GetSumw2()
00203 if w2s.GetSize()==0:
00204 h.Sumw2()
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216 return self.h1.KolmogorovTest(self.h2)
00217
00218
00219 import array
00220 def profile2histo(profile):
00221 if not profile.InheritsFrom("TH1"):
00222 return profile
00223
00224 bin_low_edges=[]
00225 n_bins=profile.GetNbinsX()
00226
00227 for ibin in xrange(1,n_bins+2):
00228 bin_low_edges.append(profile.GetBinLowEdge(ibin))
00229 bin_low_edges=array.array('f',bin_low_edges)
00230 histo=TH1F(profile.GetName(),profile.GetTitle(),n_bins,bin_low_edges)
00231 for ibin in xrange(0,n_bins+1):
00232 histo.SetBinContent(ibin,profile.GetBinContent(ibin))
00233 histo.SetBinError(ibin,profile.GetBinError(ibin))
00234
00235 return histo
00236
00237
00238 class Chi2(StatisticalTest):
00239 def __init__(self, threshold):
00240 StatisticalTest.__init__(self,threshold)
00241 self.name="Chi2"
00242
00243 def check_filled_bins(self,min_filled):
00244 nbins=self.h1.GetNbinsX()
00245 n_filled_l=[]
00246 for h in self.h1,self.h2:
00247 nfilled=0.
00248 for ibin in xrange(1,nbins+1):
00249 if h.GetBinContent(ibin)>0:
00250 nfilled+=1
00251 n_filled_l.append(nfilled)
00252 return len(filter (lambda x:x>=min_filled,n_filled_l) )>0
00253
00254 def absval(self):
00255 nbins=getNbins(self.h1)
00256 binc=0
00257 for i in xrange(1,nbins):
00258 for h in self.h1,self.h2:
00259 binc=h.GetBinContent(i)
00260 if binc<0:
00261 h.SetBinContent(i,-1*binc)
00262 if h.GetBinError(i)==0 and binc!=0:
00263
00264 h.SetBinContent(i,0)
00265
00266 def check_histograms(self, histogram):
00267 if histogram.InheritsFrom("TProfile") or (histogram.GetEntries()!=histogram.GetSumOfWeights()):
00268 return 'W'
00269 else:
00270 return 'U'
00271
00272 def do_test(self):
00273 self.absval()
00274 if self.check_filled_bins(3):
00275
00276
00277
00278
00279
00280
00281 hist1 = self.check_histograms(self.h1)
00282 hist2 = self.check_histograms(self.h2)
00283 if hist1 =='W' and hist2 =='W':
00284 chi2 = self.h1.Chi2Test(self.h2,'WW')
00285 return chi2
00286 elif hist1 == 'U' and hist2 == 'U':
00287 chi2 = self.h1.Chi2Test(self.h2,'UU')
00288 return chi2
00289 elif hist1 == 'U' and hist2 == 'W':
00290 chi2 = self.h1.Chi2Test(self.h2,'UW')
00291 return chi2
00292 elif hist1 == 'W' and hist2 == 'U':
00293 chi2 = self.h2.Chi2Test(self.h1,'UW')
00294 return chi2
00295 else:
00296 return 1
00297
00298
00299
00300
00301 class BinToBin(StatisticalTest):
00302 """The bin to bin comparison builds a fake pvalue. It is 0 if the number of
00303 bins is different. It is % of corresponding bins otherwhise.
00304 A threshold of 1 is needed to require a 1 to 1 correspondance between
00305 hisograms.
00306 """
00307 def __init__(self, threshold=1):
00308 StatisticalTest.__init__(self, threshold)
00309 self.name='BinToBin'
00310 self.epsilon= 0.000001
00311
00312 def checkBinningMatches(self):
00313 if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
00314 or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
00315 or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
00316 or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
00317 or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
00318 or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
00319 or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
00320 or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
00321 or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
00322 return False
00323 return True
00324
00325 def do_test(self):
00326
00327 if not self.checkBinningMatches():
00328 return test_codes["DIFF_BIN"]
00329
00330 equal = 1
00331 nbins = getNbins(self.h1)
00332 n_ok_bins=0.0
00333 for ibin in xrange(0,nbins):
00334 ibin+=1
00335 h1bin=self.h1.GetBinContent(ibin)
00336 h2bin=self.h2.GetBinContent(ibin)
00337 bindiff=h1bin-h2bin
00338
00339 binavg=.5*(h1bin+h2bin)
00340
00341 if binavg==0 or abs(bindiff) < self.epsilon:
00342 n_ok_bins+=1
00343
00344 else:
00345 print "Bin %ibin: bindiff %s" %(ibin,bindiff)
00346
00347
00348
00349
00350 rank=n_ok_bins/nbins
00351
00352 if rank!=1:
00353 print "Histogram %s differs: nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins)
00354
00355 return rank
00356
00357
00358
00359 class BinToBin1percent(StatisticalTest):
00360 """The bin to bin comparison builds a fake pvalue. It is 0 if the number of
00361 bins is different. It is % of corresponding bins otherwhise.
00362 A threshold of 1 is needed to require a 1 to 1 correspondance between
00363 hisograms.
00364 """
00365 def __init__(self, threshold=1):
00366 StatisticalTest.__init__(self, threshold)
00367 self.name='BinToBin1percent'
00368 self.epsilon= 0.000001
00369 self.tolerance= 0.01
00370
00371 def checkBinningMatches(self):
00372 if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
00373 or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
00374 or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
00375 or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
00376 or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
00377 or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
00378 or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
00379 or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
00380 or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
00381 return False
00382 return True
00383
00384 def do_test(self):
00385
00386 if not self.checkBinningMatches():
00387 return test_codes["DIFF_BIN"]
00388
00389 equal = 1
00390 nbins = getNbins(self.h1)
00391 n_ok_bins=0.0
00392 for ibin in xrange(0,nbins):
00393 ibin+=1
00394 h1bin=self.h1.GetBinContent(ibin)
00395 h2bin=self.h2.GetBinContent(ibin)
00396 bindiff=h1bin-h2bin
00397
00398 binavg=.5*(h1bin+h2bin)
00399
00400 if binavg==0 or 100*abs(bindiff)/binavg < self.tolerance:
00401 n_ok_bins+=1
00402
00403 else:
00404 print "-->Bin %i bin: bindiff %s (%s - %s )" %(ibin,bindiff,h1bin,h2bin)
00405
00406
00407
00408
00409 rank=n_ok_bins/nbins
00410
00411 if rank!=1:
00412 print "%s nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins)
00413
00414 return rank
00415
00416 Statistical_Tests={"KS":KS,
00417 "Chi2":Chi2,
00418 "BinToBin":BinToBin,
00419 "BinToBin1percent":BinToBin1percent,
00420 "Bin2Bin":BinToBin,
00421 "b2b":BinToBin,}
00422
00423
00424 def ask_ok(prompt, retries=4, complaint='yes or no'):
00425 while True:
00426 ok = raw_input(prompt)
00427 if ok in ('y', 'ye', 'yes'):
00428 return True
00429 if ok in ('n', 'no'):
00430 return False
00431 retries = retries - 1
00432 if retries < 0:
00433 raise IOError('refusenik user')
00434 print complaint
00435
00436
00437
00438 class unpickler(Thread):
00439 def __init__(self,filename):
00440 Thread.__init__(self)
00441 self.filename=filename
00442 self.directory=""
00443
00444 def run(self):
00445 print "Reading directory from %s" %(self.filename)
00446 ifile=open(self.filename,"rb")
00447 self.directory=load(ifile)
00448 ifile.close()
00449
00450
00451
00452 def wget(url):
00453 """ Fetch the WHOLE file, not in bunches... To be optimised.
00454 """
00455 opener=build_opener(X509CertOpen())
00456 datareq = Request(url)
00457 datareq.add_header('authenticated_wget', "The ultimate wgetter")
00458 bin_content=None
00459 try:
00460 filename=basename(url)
00461 print "Checking existence of file %s on disk..."%filename
00462 if not isfile("./%s"%filename):
00463 bin_content=opener.open(datareq).read()
00464 else:
00465 print "File %s exists, skipping.." %filename
00466 except ValueError:
00467 print "Error: Unknown url %s" %url
00468
00469 if bin_content!=None:
00470 ofile = open(filename, 'wb')
00471 ofile.write(bin_content)
00472 ofile.close()
00473
00474
00475
00476
00477 def get_relvaldata_id(file):
00478 """Returns unique relvaldata ID for a given file."""
00479 run_id = re.search('R\d{9}', file)
00480 run = re.search('_RelVal_([\w\d]*)-v\d__', file)
00481 if not run:
00482 run = re.search('GR_R_\d*_V\d*C?_([\w\d]*)-v\d__', file)
00483 if run_id and run:
00484 return (run_id.group(), run.group(1))
00485 return None
00486
00487 def get_relvaldata_cmssw_version(file):
00488 """Returns tuple (CMSSW release, GR_R version) for specified RelValData file."""
00489 cmssw_release = re.findall('(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
00490 gr_r_version = re.findall('-(GR_R_\d*_V\d*\w?)(?:_RelVal)?_', file)
00491 if not gr_r_version:
00492 gr_r_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-(\w*)_RelVal_', file)
00493 if cmssw_release and gr_r_version:
00494 return (cmssw_release[0], gr_r_version[0])
00495
00496 def get_relvaldata_version(file):
00497 """Returns tuple (CMSSW version, run version) for specified file."""
00498 cmssw_version = re.findall('DQM_V(\d*)_', file)
00499 run_version = re.findall('_RelVal_[\w\d]*-v(\d)__', file)
00500 if not run_version:
00501 run_version = re.findall('GR_R_\d*_V\d*C?_[\w\d]*-v(\d)__', file)
00502 if cmssw_version and run_version:
00503 return (int(cmssw_version[0]), int(run_version[0]))
00504
00505 def get_relvaldata_max_version(files):
00506 """Returns file with maximum version at a) beggining of the file,
00507 e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
00508 max_file = files[0]
00509 max_v = get_relvaldata_version(files[0])
00510 for file in files:
00511 file_v = get_relvaldata_version(file)
00512 if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
00513 max_file = file
00514 max_v = file_v
00515 return max_file
00516
00517
00518 def get_relval_version(file):
00519 """Returns tuple (CMSSW version, run version) for specified file."""
00520 cmssw_version = re.findall('DQM_V(\d*)_', file)
00521 run_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_[\w\d]*)?-v(\d*)__', file)
00522 if cmssw_version and run_version:
00523 return (int(cmssw_version[0]), int(run_version[0]))
00524
00525 def get_relval_max_version(files):
00526 """Returns file with maximum version at a) beggining of the file,
00527 e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
00528 max_file = files[0]
00529 max_v = get_relval_version(files[0])
00530 for file in files:
00531 file_v = get_relval_version(file)
00532 if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
00533 max_file = file
00534 max_v = file_v
00535 return max_file
00536
00537 def get_relval_cmssw_version(file):
00538 cmssw_release = re.findall('(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
00539 gr_r_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(_[\w\d]*)?-v', file)
00540 if cmssw_release and gr_r_version:
00541 return (cmssw_release[0], gr_r_version[0])
00542
00543 def get_relval_id(file):
00544 """Returns unique relval ID (dataset name) for a given file."""
00545 dataset_name = re.findall('R\d{9}__([\w\d]*)__CMSSW_', file)
00546 return dataset_name[0]
00547
00548
00549 def is_relvaldata(files):
00550 is_relvaldata_re = re.compile('_RelVal_')
00551 return any([is_relvaldata_re.search(filename) for filename in files])
00552
00553 def make_files_pairs(files, verbose=True):
00554
00555 if is_relvaldata(files):
00556 is_relval_data = True
00557 get_cmssw_version = get_relvaldata_cmssw_version
00558 get_id = get_relvaldata_id
00559 get_max_version = get_relvaldata_max_version
00560
00561 else:
00562 is_relval_data = False
00563 get_cmssw_version = get_relval_cmssw_version
00564 get_id = get_relval_id
00565 get_max_version = get_relval_max_version
00566
00567
00568
00569 versions_files = dict()
00570 for file in files:
00571 version = get_cmssw_version(file)
00572 if versions_files.has_key(version):
00573 versions_files[version].append(file)
00574 else:
00575 versions_files[version] = [file]
00576
00577
00578 if verbose:
00579 print '\nFound versions:'
00580 for version in versions_files:
00581 print '%s: %d files' % (str(version), len(versions_files[version]))
00582
00583 if len(versions_files.keys()) <= 1:
00584 print '\nFound too little versions, there is nothing to pair. Exiting...\n'
00585 exit()
00586
00587
00588 versions = versions_files.keys()
00589 sizes = [len(value) for value in versions_files.values()]
00590 v1 = versions[sizes.index(max(sizes))]
00591 versions.remove(v1)
00592 sizes.remove(max(sizes))
00593 v2 = versions[sizes.index(max(sizes))]
00594
00595
00596 if verbose:
00597 print '\nPairing %s (%d files) and %s (%d files)' % (str(v1),
00598 len(versions_files[v1]), str(v2), len(versions_files[v2]))
00599
00600
00601 print '\nGot pairs:'
00602 pairs = []
00603 for unique_id in set([get_id(file) for file in versions_files[v1]]):
00604 if is_relval_data:
00605 dataset_re = re.compile(unique_id[0]+'_')
00606 run_re = re.compile(unique_id[1])
00607 c1_files = [file for file in versions_files[v1] if dataset_re.search(file) and run_re.search(file)]
00608 c2_files = [file for file in versions_files[v2] if dataset_re.search(file) and run_re.search(file)]
00609 else:
00610 dataset_re = re.compile(unique_id+'_')
00611 c1_files = [file for file in versions_files[v1] if dataset_re.search(file)]
00612 c2_files = [file for file in versions_files[v2] if dataset_re.search(file)]
00613
00614 if len(c1_files) > 0 and len(c2_files) > 0:
00615 first_file = get_max_version(c1_files)
00616 second_file = get_max_version(c2_files)
00617 print '%s\n%s\n' % (first_file, second_file)
00618 pairs.extend((first_file, second_file))
00619 if verbose:
00620 print "Paired and got %d files.\n" % len(pairs)
00621 return pairs