00001
00002 '''
00003 Help functions for ValidationMatrix_v2.py.
00004
00005 Author: Albertas Gimbutas, Vilnius University (LT)
00006 e-mail: albertasgim@gmail.com
00007 '''
00008 import sys
00009 import re
00010 import time
00011 import sqlite3
00012 from datetime import datetime
00013 from multiprocessing import Pool, Queue, Process
00014 import subprocess
00015 from optparse import OptionParser, OptionGroup
00016 from os import makedirs, listdir
00017 from os.path import basename, dirname, isfile, splitext, join, exists, getsize
00018 from Queue import Empty
00019 from urllib2 import build_opener, Request, HTTPError
00020 from urlparse import urlparse
00021 from httplib import BadStatusLine
00022
00023 try:
00024 from Utilities.RelMon.authentication import X509CertOpen
00025 except ImportError:
00026 from authentication import X509CertOpen
00027
00028
def get_relvaldata_id(file):
    """Return the ``(run id, run name)`` pair identifying a RelValData file.

    The run id is the ``R`` + nine digits token of the file name.  The run
    name is the text captured between ``_RelVal_`` (or, when that is absent,
    a ``GR_R_<n>_V<n>[C]_`` tag) and the ``-v<digit>__`` suffix.

    Returns ``None`` when either part cannot be found.
    """
    run_id_match = re.search(r'R\d{9}', file)
    # The second pattern is only tried when the ``_RelVal_`` form is absent.
    run_match = (re.search(r'_RelVal_([\w\d]*)-v\d__', file) or
                 re.search(r'GR_R_\d*_V\d*C?_([\w\d]*)-v\d__', file))
    if run_id_match is None or run_match is None:
        return None
    return (run_id_match.group(), run_match.group(1))
00038
def get_relvaldata_cmssw_version(file):
    """Return ``(CMSSW release, GR_R version)`` for a RelValData file name.

    The release is the ``CMSSW_x_y_z[_suffix]`` token that precedes a ``-``.
    The second element is the ``GR_R_<n>_V<n>`` tag following that ``-``; if
    no such tag exists, whatever sits between the release and ``_RelVal_``
    is used instead.

    Returns ``None`` when either element is missing.
    """
    releases = re.findall(r'(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
    tags = re.findall(r'-(GR_R_\d*_V\d*\w?)(?:_RelVal)?_', file)
    if not tags:
        # Fallback for tags that do not follow the GR_R naming scheme.
        tags = re.findall(r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-(\w*)_RelVal_', file)
    if not (releases and tags):
        return None
    return (releases[0], tags[0])
00047
def get_relvaldata_version(file):
    """Return ``(DQM file version, run version)`` for a RelValData file name.

    The first number is the one following ``DQM_V``; the second is the single
    digit of the ``-v<digit>__`` suffix that ends the run name.

    Returns ``None`` when either number cannot be found.
    """
    dqm_versions = re.findall(r'DQM_V(\d*)_', file)
    # An empty first result falls through to the GR_R-style pattern.
    run_versions = (re.findall(r'_RelVal_[\w\d]*-v(\d)__', file) or
                    re.findall(r'GR_R_\d*_V\d*C?_[\w\d]*-v(\d)__', file))
    if dqm_versions and run_versions:
        return (int(dqm_versions[0]), int(run_versions[0]))
    return None
00056
def get_relvaldata_max_version(files):
    """Return the file with the highest version among ``files``.

    Files are ranked first by the run version (the ``M`` at the end of the
    run name, e.g. ``_run2012-vM``) and then by the version at the beginning
    of the file name (e.g. ``DQM_V000M``).  On ties the earliest file in
    ``files`` wins.

    Files whose version cannot be parsed (``get_relvaldata_version`` returns
    ``None``) are ignored instead of aborting the comparison; if no file has
    a parsable version the first file is returned.

    Raises ``IndexError`` when ``files`` is empty.
    """
    max_file = files[0]
    max_key = None
    for candidate in files:
        version = get_relvaldata_version(candidate)
        if version is None:
            continue
        # (run version, DQM version): tuple comparison gives "run version
        # first, DQM version as tie-breaker".
        key = (version[1], version[0])
        if max_key is None or key > max_key:
            max_file, max_key = candidate, key
    return max_file
00068
00069
def get_relval_version(file):
    """Return ``(DQM file version, dataset version)`` for a RelVal file name.

    The first number is the one following ``DQM_V``; the second is the
    ``-v<N>__`` suffix that follows the ``CMSSW_x_y_z-<tag>_V<n>`` part.

    Returns ``None`` when either number cannot be found.
    """
    dqm_hits = re.findall(r'DQM_V(\d*)_', file)
    run_hits = re.findall(
        r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_[\w\d]*)?-v(\d*)__',
        file)
    if not (dqm_hits and run_hits):
        return None
    return (int(dqm_hits[0]), int(run_hits[0]))
00076
def get_relval_max_version(files):
    """Return the file with the highest version among ``files``.

    Files are ranked first by the dataset version (the ``M`` of the ``-vM``
    suffix) and then by the version at the beginning of the file name (e.g.
    ``DQM_V000M``).  On ties the earliest file in ``files`` wins.

    Files whose version cannot be parsed (``get_relval_version`` returns
    ``None``) are ignored instead of aborting the comparison; if no file has
    a parsable version the first file is returned.

    Raises ``IndexError`` when ``files`` is empty.
    """
    max_file = files[0]
    max_key = None
    for candidate in files:
        version = get_relval_version(candidate)
        if version is None:
            continue
        # (dataset version, DQM version): tuple comparison gives "dataset
        # version first, DQM version as tie-breaker".
        key = (version[1], version[0])
        if max_key is None or key > max_key:
            max_file, max_key = candidate, key
    return max_file
00088
def get_relval_cmssw_version(file):
    """Return ``(CMSSW release, (tag, suffix))`` for a RelVal file name.

    ``tag`` is the text between the release's ``-`` and ``_V<n>``; ``suffix``
    is the optional ``_<text>`` that may follow before ``-v`` (an empty
    string when absent).

    Returns ``None`` when the release or the tag cannot be found.
    """
    release_hits = re.findall(r'(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
    tag_hits = re.findall(
        r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(_[\w\d]*)?-v', file)
    if not release_hits or not tag_hits:
        return None
    return (release_hits[0], tag_hits[0])
00094
def get_relval_id(file):
    """Return the dataset name that identifies a RelVal file.

    The dataset name is the text between ``R<9 digits>__`` and ``__CMSSW_``.
    Raises ``IndexError`` when the file name does not contain that pattern.
    """
    return re.findall(r'R\d{9}__([\w\d]*)__CMSSW_', file)[0]
00099
00100
def is_relvaldata(files):
    """Return True when at least one name in ``files`` contains ``_RelVal_``."""
    marker = re.compile('_RelVal_')
    for filename in files:
        if marker.search(filename):
            return True
    return False
00104
def _group_files_by_version(files, get_cmssw_version):
    """Group ``files`` by the version key returned by ``get_cmssw_version``."""
    groups = {}
    for name in files:
        version = get_cmssw_version(name)
        if version:
            groups.setdefault(version, []).append(name)
    return groups


def _print_file_groups(title, groups):
    """Print ``title`` followed by one line per version group."""
    print(title)
    for version in groups:
        print(' %s: %d files' % (str(version), len(groups[version])))
    if not groups:
        print('None.')


def _versions_by_group_size(groups):
    """Return the version keys of ``groups``, largest file group first."""
    return sorted(groups, key=lambda version: len(groups[version]), reverse=True)


def make_file_pairs(files1, files2):
    """Pair the newest matching files of two releases.

    Both file lists are grouped by the version parsed from each file name.
    Version groups are then tried against each other, largest groups first,
    and the first combination of two *different* versions that yields at
    least one pair is returned.  Within a combination, files are matched by
    their unique id (run id and run name for RelValData, dataset name
    otherwise) and only the highest-versioned file per id is kept.

    The RelValData or RelVal helper set is selected by looking at ``files1``.

    Returns a list of ``(file from files1, file from files2)`` tuples.
    Terminates the process (``SystemExit``) when either list yields no
    version group or when no pair can be formed.
    """
    print('\n################# Analyzing files ###################')

    # Select the helper functions matching the kind of files.
    is_relval_data = is_relvaldata(files1)
    if is_relval_data:
        get_cmssw_version = get_relvaldata_cmssw_version
        get_id = get_relvaldata_id
        get_max_version = get_relvaldata_max_version
    else:
        get_cmssw_version = get_relval_cmssw_version
        get_id = get_relval_id
        get_max_version = get_relval_max_version

    # {version: [file, ...]} for each release.
    versions1 = _group_files_by_version(files1, get_cmssw_version)
    versions2 = _group_files_by_version(files2, get_cmssw_version)

    _print_file_groups('For RELEASE1 found file groups:', versions1)
    _print_file_groups('\nFor RELEASE2 found file groups:', versions2)

    if not versions1 or not versions2:
        print('\nNot enough file groups. Exiting...\n')
        sys.exit()

    pairs = []
    for v1 in _versions_by_group_size(versions1):
        for v2 in _versions_by_group_size(versions2):
            if v1 == v2:
                continue

            print('\n################# Pairing the files ###################')
            print('%s (%d files) VS %s (%d files):\n' % (
                str(v1), len(versions1[v1]), str(v2), len(versions2[v2])))

            for unique_id in set(get_id(name) for name in versions1[v1]):
                if unique_id is None:
                    # No id could be parsed from the name: nothing to match on.
                    continue
                if is_relval_data:
                    id_res = [re.compile(unique_id[0] + '_'),
                              re.compile(unique_id[1])]
                else:
                    id_res = [re.compile(unique_id + '_')]

                c1_files = [name for name in versions1[v1]
                            if all(r.search(name) for r in id_res)]
                c2_files = [name for name in versions2[v2]
                            if all(r.search(name) for r in id_res)]

                if c1_files and c2_files:
                    first_file = get_max_version(c1_files)
                    second_file = get_max_version(c2_files)
                    print('%s\n%s\n' % (first_file, second_file))
                    pairs.append((first_file, second_file))

            # NOTE(review): the early return is placed inside the version
            # loops so that the first (largest) combination producing pairs
            # wins, which is what the size-ordered iteration implies.
            print("Got %d pairs." % (len(pairs)))
            if pairs:
                return pairs
    print('Found no file pairs. Exiting..\n')
    sys.exit()
00181
00182
def auth_wget(url):
    """Fetch ``url`` through an opener built from ``X509CertOpen``.

    Returns the response body.  On ``HTTPError`` or ``BadStatusLine`` an
    explanatory message is printed and the process exits.
    """
    try:
        response = build_opener(X509CertOpen()).open(Request(url))
        return response.read()
    except HTTPError as e:
        print('\nError: DQM GUI is temporarily unavailable. Probably maintainance hours. ' +
              'Please try again later. Original error message: ``%s``. \nExiting...\n' % (e,))
        exit()
    except BadStatusLine:
        print('\nYou do not have permissions to access DQM GUI. Please check if your certificates ' +
              'in ``~/.globus`` directory are configured correctly. Exitting...')
        exit()
00195
00196
def auth_download_file(url, chunk_size=1048576):
    """Download ``url`` into ``auth_download_file.work_dir`` chunk by chunk.

    Two attributes must be attached to this function before it is called:
    ``work_dir`` (destination directory) and ``q`` (an object with a ``put``
    method; it receives one ``(1,)`` item per chunk written).

    The file is stored under the base name of ``url``.  Both the local file
    and the remote response are closed even when the transfer fails.
    """
    filename = basename(url)
    file_path = join(auth_download_file.work_dir, filename)

    with open(file_path, 'wb') as out_file:
        opener = build_opener(X509CertOpen())
        url_file = opener.open(Request(url))
        try:
            chunk = url_file.read(chunk_size)
            while chunk:
                out_file.write(chunk)
                auth_download_file.q.put((1,))
                chunk = url_file.read(chunk_size)
        finally:
            url_file.close()
    print('\rDownloaded: %s ' % (filename,))
00211
00212
def recursive_search_online(url, rel1, frags1, rel2, frags2):
    """Crawl a directory listing and collect the files matching each release.

    ``frags1``/``frags2`` are comma separated regular expressions that a file
    name must match; a fragment starting with ``!`` must *not* occur in the
    name.  In addition the name must contain ``<rel>-`` and ``.root``.

    When ``url`` is empty, the RelValData and RelVal listings are both
    searched and their results merged.

    Returns two dicts ``{file name: url}``, one per release.
    """
    if not url:
        data_urls1, data_urls2 = recursive_search_online(
            'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelValData/',
            rel1, frags1, rel2, frags2)
        relval_urls1, relval_urls2 = recursive_search_online(
            'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelVal/',
            rel1, frags1, rel2, frags2)
        data_urls1.update(relval_urls1)
        data_urls2.update(relval_urls2)
        return data_urls1, data_urls2

    # scheme://host, used to turn the listing's paths into full URLs.
    domain = '://'.join(urlparse(url)[:2])

    href_re = re.compile(r"<a href='([-./\w]*)'>([-./\w]*)<")

    def compile_res(rel, frags):
        required, forbidden = [], []
        for frag in frags.split(','):
            if frag.startswith('!'):
                # '!x' becomes a negative lookahead: '^((?!x).)*$'.
                forbidden.append('^((?%s).)*$' % frag)
            else:
                required.append(frag)
        patterns = required + forbidden + [rel + '-', '.root']
        return [re.compile(pattern) for pattern in patterns]

    res1 = compile_res(rel1, frags1)
    res2 = compile_res(rel2, frags2)

    # The first link of every listing is skipped (presumably the link to the
    # parent directory -- TODO confirm).
    pending = [(name, path) for path, name in href_re.findall(auth_wget(url))[1:]]
    files_with_urls1, files_with_urls2 = {}, {}
    position = 0
    while position < len(pending):
        name, path = pending[position]
        position += 1
        if splitext(name)[1]:
            # Entry has an extension: treat it as a file.
            if all(r.search(name) for r in res1):
                files_with_urls1[name] = domain + path
            if all(r.search(name) for r in res2):
                files_with_urls2[name] = domain + path
        else:
            # No extension: treat it as a directory and queue its entries.
            print(domain + path)
            for sub_path, sub_name in href_re.findall(auth_wget(domain + path))[1:]:
                pending.append((sub_name, sub_path))
    return files_with_urls1, files_with_urls2
00252
def search_on_disk(work_path, rel1, frags1, rel2, frags2):
    """List the files in ``work_path`` that match each release.

    ``frags1``/``frags2`` are comma separated regular expressions that a file
    name must match; a fragment starting with ``!`` must *not* occur in the
    name.  In addition the name must contain ``<rel>-`` and ``.root``, and
    must have a file extension.

    Returns two lists of file names, one per release.  Exits the process when
    ``work_path`` is empty.
    """
    if not work_path:
        print('No working directory specified. Use "--dir DIR" option to ' +
              'specify working directory. Exiting...')
        exit()

    def compile_res(rel, frags):
        required, forbidden = [], []
        for frag in frags.split(','):
            if frag.startswith('!'):
                # '!x' becomes a negative lookahead: '^((?!x).)*$'.
                forbidden.append('^((?%s).)*$' % frag)
            else:
                required.append(frag)
        patterns = required + forbidden + [rel + '-', '.root']
        return [re.compile(pattern) for pattern in patterns]

    res1 = compile_res(rel1, frags1)
    res2 = compile_res(rel2, frags2)

    files1, files2 = [], []
    for name in listdir(work_path):
        if not splitext(name)[1]:
            continue
        if all(r.search(name) for r in res1):
            files1.append(name)
        if all(r.search(name) for r in res2):
            files2.append(name)
    return files1, files2
00279
00280
00281
# Error message -> numeric error code used by ComparisonError.
comparison_errors = {
    'Missing histogram': -1,
    'Histograms have different types': -2,
    'Object is not a histogram': -3,
    'Ranges of histograms are different': -4,
}


class ComparisonError(Exception):
    """Raised when two objects cannot be compared.

    ``error_message`` must be one of the keys of ``comparison_errors``; the
    matching code is stored in ``error_code``.  An unknown message raises
    ``KeyError``.
    """

    def __init__(self, error_message, *args, **kwargs):
        # Initialise the base class so that ``args`` is populated; without it
        # the exception cannot be rebuilt from ``args`` (e.g. when pickled to
        # another process) and its repr carries no message.
        super(ComparisonError, self).__init__(error_message, *args)
        self.error_message = error_message
        self.error_code = comparison_errors[error_message]

    def __str__(self):
        return 'Comparison Error: %d' % self.error_code
00296
00297
00298
class StatisticalTest(object):
    """Base class of the histogram comparison tests.

    ``do_test`` performs the checks shared by all tests; subclasses run the
    actual statistical test when it returns ``None``.
    """
    name = None

    def get_N_bins(self, h):
        """Return ``(x + 1) * (y + 1) * (z + 1)`` for the bin counts of ``h``.

        Returns 0 when the Y or Z bin count is zero.
        """
        nx, ny, nz = h.GetNbinsX(), h.GetNbinsY(), h.GetNbinsZ()
        if ny and nz:
            return (nx + 1) * (ny + 1) * (nz + 1)
        return 0

    def is_empty(self, h):
        """Return True when bins ``1 .. get_N_bins(h) - 1`` all hold zero."""
        return not any(h.GetBinContent(index) != 0
                       for index in xrange(1, self.get_N_bins(h)))

    def do_test(self, h1, h2):
        """Run the pre-checks common to all tests.

        Returns -104 when the two objects have different types, -105 when
        ``h1`` does not inherit from ``TH1``, 1 when either histogram is
        empty, -103 when the bin counts differ and ``None`` when the actual
        test can go ahead.  Raises ``ComparisonError`` when either object is
        missing.
        """
        if not (h1 and h2):
            raise ComparisonError('Missing histogram')
        if type(h1) != type(h2):
            return -104
        if not h1.InheritsFrom('TH1'):
            return -105
        if self.is_empty(h1) or self.is_empty(h2):
            return 1
        if self.get_N_bins(h1) != self.get_N_bins(h2):
            return -103
        return None
00328
00329
class KolmogorovTest(StatisticalTest):
    """Kolmogorov test between two histograms."""
    name = 'KS'

    def do_test(self, h1, h2):
        """Return the common pre-check result, else ``h1.KolmogorovTest(h2)``.

        Before the test, ``Sumw2()`` is called on each histogram whose
        ``GetSumw2()`` array has size zero.
        """
        early_result = super(KolmogorovTest, self).do_test(h1, h2)
        if early_result is None:
            for hist in (h1, h2):
                if hist.GetSumw2().GetSize() == 0:
                    hist.Sumw2()
            return h1.KolmogorovTest(h2)
        return early_result
00342
00343
class Chi2Test(StatisticalTest):
    """Chi-square test between two histograms."""
    name = 'Chi2'

    def make_absolute(self, h, bin_count):
        """Make bin contents non-negative, in place.

        Negative contents are negated; a bin with non-zero content but zero
        error is reset to zero.  Bins ``1 .. bin_count - 1`` are visited.
        """
        for index in xrange(1, bin_count):
            value = h.GetBinContent(index)
            if value < 0:
                h.SetBinContent(index, -1 * value)
            if h.GetBinError(index) == 0 and value != 0:
                h.SetBinContent(index, 0)

    def enough_filled_bins(self, h, bin_count, more_than=3):
        """Return True once more than ``more_than`` bins have content > 0."""
        positive = 0
        for index in xrange(1, bin_count):
            positive += 1 if h.GetBinContent(index) > 0 else 0
            if positive > more_than:
                return True
        return False

    def do_test(self, h1, h2):
        """Return the common pre-check result, else the chi-square result.

        Both histograms are modified by ``make_absolute``.  Returns 1 when
        either histogram has too few filled bins.  The ``'WW'`` option is
        used for ``TProfile`` objects and when the entry count of ``h1``
        differs from its sum of weights; ``'UU'`` otherwise.
        """
        early_result = super(Chi2Test, self).do_test(h1, h2)
        if early_result is not None:
            return early_result

        n_bins = self.get_N_bins(h1)

        for hist in (h1, h2):
            self.make_absolute(hist, n_bins)

        if not (self.enough_filled_bins(h1, n_bins) and
                self.enough_filled_bins(h2, n_bins)):
            return 1

        weighted = (h1.InheritsFrom("TProfile") or
                    h1.GetEntries() != h1.GetSumOfWeights())
        if weighted:
            return h1.Chi2Test(h2, 'WW')
        return h1.Chi2Test(h2, 'UU')
00383
00384
# Test name -> test class, for every available statistical test.
tests = dict((test_class.name, test_class)
             for test_class in (KolmogorovTest, Chi2Test))
00386
00387
def init_database(db_path):
    """Create the comparison tables in the SQLite database at ``db_path``.

    The tables ``ReleaseComparison``, ``Directory``, ``RootFileComparison``
    and ``HistogramComparison`` are created if they do not exist yet, so the
    function may be called repeatedly on the same file.  The connection is
    committed and closed before returning, also when a statement fails.

    Returns ``db_path``.
    """
    sys.stdout.write('Initialising DB: %s... ' % basename(db_path))
    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()
        c.execute("""CREATE TABLE IF NOT EXISTS ReleaseComparison (
                        id INTEGER PRIMARY KEY,
                        title TEXT,
                        release1 TEXT,
                        release2 TEXT,
                        statistical_test TEXT
                    );""")
        c.execute("""CREATE TABLE IF NOT EXISTS Directory (
                        id INTEGER PRIMARY KEY,
                        name TEXT,
                        parent_id INTEGER,
                        from_histogram_id INTEGER,
                        till_histogram_id INTEGER,
                        FOREIGN KEY (parent_id) REFERENCES Directory(id),
                        FOREIGN KEY (from_histogram_id) REFERENCES HistogramComparison(id),
                        FOREIGN KEY (till_histogram_id) REFERENCES HistogramComparison(id)
                    )""")
        c.execute("""CREATE TABLE IF NOT EXISTS RootFileComparison (
                        id INTEGER PRIMARY KEY,
                        filename1 TEXT,
                        filename2 TEXT,
                        release_comparison_id INTEGER,
                        directory_id INTEGER,
                        FOREIGN KEY (release_comparison_id) REFERENCES ReleaseComparison(id),
                        FOREIGN KEY (directory_id) REFERENCES Directory(id)
                    )""")
        c.execute("""CREATE TABLE IF NOT EXISTS HistogramComparison (
                        id INTEGER PRIMARY KEY,
                        name TEXT,
                        p_value REAL,
                        directory_id INTEGER,
                        FOREIGN KEY (directory_id) REFERENCES Directory(id)
                    )""")
        conn.commit()
    finally:
        # Release the database file handle in every case.
        conn.close()

    print('Done.')
    return db_path
00430
00431
def _strip_relval_token(elem):
    """Drop surrounding underscores and a leading/trailing ``RelVal`` token."""
    elem = elem.strip('_')
    if elem == 'RelVal':
        return ''
    if elem.startswith('RelVal_'):
        elem = elem[len('RelVal_'):]
    if elem.endswith('_RelVal'):
        elem = elem[:-len('_RelVal')]
    return elem.strip('_')


def get_version(filename):
    """Return the CMSSW and GR_R versions of ``filename`` joined by ``___``.

    The version elements come from ``get_relvaldata_cmssw_version`` for
    RelValData files and from ``get_relval_cmssw_version`` otherwise.  Each
    element is cleaned of surrounding underscores and of a ``RelVal`` token
    at either end; empty elements are left out.

    The ``RelVal`` token is removed as a whole word.  ``str.strip('RelVal_')``
    would instead strip any of those *characters* and damage elements that
    merely begin or end with them (``Full`` would become ``Fu``).

    Raises ``TypeError`` when no version can be parsed from ``filename``.
    """
    if is_relvaldata([filename]):
        version_elems = get_relvaldata_cmssw_version(filename)
    else:
        relval_version = get_relval_cmssw_version(filename)
        version_elems = (relval_version[0], relval_version[1][0], relval_version[1][1])
    cleaned = [_strip_relval_token(elem) for elem in version_elems]
    return '___'.join([elem for elem in cleaned if elem])
00441
00442
def get_size_to_download(work_path, files_with_urls):
    """Work out which files still have to be downloaded.

    ``files_with_urls`` is an iterable of ``(file name, url)`` pairs.  For
    each pair the remote ``Content-Length`` is read; a file is skipped when a
    file of that name already exists in ``work_path`` and has the same size
    in whole kilobytes.

    Returns ``(total size in bytes, list of URLs to download)``.
    """
    opener = build_opener(X509CertOpen())
    size_to_download = 0
    files_to_download = []
    for filename, url in files_with_urls:
        url_file = opener.open(Request(url))
        try:
            size = int(url_file.headers["Content-Length"])
        finally:
            # Only the header is needed; do not keep the connection open.
            url_file.close()
        file_path = join(work_path, filename)
        # Floor division keeps the comparison in whole kilobytes.
        if exists(file_path) and getsize(file_path) // 1024 == size // 1024:
            print("Exists on disk %s." % filename)
        else:
            size_to_download += size
            files_to_download.append(url)
    return size_to_download, files_to_download
00458
def check_disk_for_space(work_path, size_needed):
    """Placeholder for a free-space check on AFS; currently does nothing."""
    return None
00462
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473
00474
00475
00476
def show_status_bar(total_size):
    """Print the download progress until ``total_size`` bytes are reported.

    Progress items are read from ``show_status_bar.q``, which must be
    attached to this function beforehand; every item counts as one megabyte.
    The loop stops early when no item arrives within 20 seconds.
    """
    queue = show_status_bar.q
    total_mb = total_size / (1024*1024)
    done_mb = 0
    while done_mb < total_mb:
        try:
            queue.get(timeout=20)
        except Empty:
            time.sleep(1)
            break
        done_mb += 1
        sys.stdout.write('\r %d/%d MB %d%% ' % (done_mb, total_mb, 100*done_mb/total_mb))
        sys.stdout.flush()