CMS 3D CMS Logo

/afs/cern.ch/work/a/aaltunda/public/www/CMSSW_6_2_7/src/Utilities/RelMon/python/utils_v2.py

Go to the documentation of this file.
00001 #! /usr/bin/env python
00002 '''
00003 Help functions for ValidationMatrix_v2.py.
00004 
00005 Author:  Albertas Gimbutas,  Vilnius University (LT)
00006 e-mail:  albertasgim@gmail.com
00007 '''
00008 import sys
00009 import re
00010 import time
00011 import sqlite3
00012 from datetime import datetime
00013 from multiprocessing import Pool, Queue, Process
00014 import subprocess
00015 from optparse import OptionParser, OptionGroup
00016 from os import makedirs, listdir
00017 from os.path import basename, dirname, isfile, splitext, join, exists, getsize
00018 from Queue import Empty
00019 from urllib2  import build_opener, Request, HTTPError
00020 from urlparse import urlparse
00021 from httplib import BadStatusLine
00022 
00023 try:
00024     from Utilities.RelMon.authentication import X509CertOpen
00025 except ImportError:
00026     from authentication import X509CertOpen
00027 
00028 ##-----------------   Make files pairs:  RelValData utils   --------------------
def get_relvaldata_id(file):
    """Build the identifier used to match RelValData files with each other.

    Returns a ``(run_id, run_name)`` tuple, where ``run_id`` is the
    ``R<nine digits>`` token of the file name and ``run_name`` is the text
    captured in front of the ``-v<digit>__`` suffix.  Returns ``None`` when
    either part cannot be found.
    """
    run_id_match = re.search(r'R\d{9}', file)
    # Prefer the ``_RelVal_<name>`` spelling, fall back to the ``GR_R_..._<name>`` one.
    name_match = (re.search(r'_RelVal_([\w\d]*)-v\d__', file)
                  or re.search(r'GR_R_\d*_V\d*C?_([\w\d]*)-v\d__', file))
    if not (run_id_match and name_match):
        return None
    return (run_id_match.group(), name_match.group(1))
00038 
def get_relvaldata_cmssw_version(file):
    """Extract the release and the tag that follows it from a RelValData file name.

    Returns a ``(release, tag)`` tuple: ``release`` is the ``CMSSW_x_y_z``
    token (with an optional ``_suffix``) found in front of a ``-``, ``tag`` is
    either a ``GR_R_..._V...`` token or, failing that, whatever word sits
    between the release and ``_RelVal_``.  Returns ``None`` when either part
    is missing.
    """
    releases = re.findall(r'(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
    tags = (re.findall(r'-(GR_R_\d*_V\d*\w?)(?:_RelVal)?_', file)
            or re.findall(r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-(\w*)_RelVal_', file))
    if releases and tags:
        return (releases[0], tags[0])
    return None
00047 
def get_relvaldata_version(file):
    """Extract the two version numbers of a RelValData file name.

    Returns ``(dqm_version, run_version)`` as integers: the digits following
    ``DQM_V`` and the single digit of the ``-v<digit>__`` suffix.  Returns
    ``None`` when either number cannot be found.
    """
    dqm_hits = re.findall(r'DQM_V(\d*)_', file)
    # Same two spellings as for the ID: ``_RelVal_<name>`` first, ``GR_R_...`` second.
    run_hits = (re.findall(r'_RelVal_[\w\d]*-v(\d)__', file)
                or re.findall(r'GR_R_\d*_V\d*C?_[\w\d]*-v(\d)__', file))
    if not (dqm_hits and run_hits):
        return None
    return (int(dqm_hits[0]), int(run_hits[0]))
00056 
def get_relvaldata_max_version(files):
    """Return the RelValData file carrying the highest version.

    Files are ordered first by the second element returned by
    ``get_relvaldata_version`` (the ``-vM`` number) and then by its first
    element (the ``DQM_V000M`` number).  On a tie the earliest file wins.

    Files whose name ``get_relvaldata_version`` cannot parse (it returns
    ``None``) are skipped instead of raising ``TypeError``.  When no file can
    be parsed at all, the first file is returned.

    Raises ``IndexError`` when ``files`` is empty.
    """
    max_file = files[0]
    max_key = None
    for candidate in files:
        version = get_relvaldata_version(candidate)
        if version is None:
            continue
        # Tuple comparison: run version first, DQM version as tie-breaker.
        key = (version[1], version[0])
        if max_key is None or key > max_key:
            max_file, max_key = candidate, key
    return max_file
00068 
00069 ## -------------------   Make files pairs:  RelVal utils   ---------------------
def get_relval_version(file):
    """Extract the two version numbers of a RelVal file name.

    Returns ``(dqm_version, dataset_version)`` as integers: the digits
    following ``DQM_V`` and the digits of the ``-v<digits>__`` suffix that
    closes the ``CMSSW_...-<tag>_V...`` part.  Returns ``None`` when either
    number cannot be found.
    """
    dqm_hits = re.findall(r'DQM_V(\d*)_', file)
    dataset_hits = re.findall(
        r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_[\w\d]*)?-v(\d*)__', file)
    if not (dqm_hits and dataset_hits):
        return None
    return (int(dqm_hits[0]), int(dataset_hits[0]))
00076 
def get_relval_max_version(files):
    """Return the RelVal file carrying the highest version.

    Files are ordered first by the second element returned by
    ``get_relval_version`` (the ``-vM`` number) and then by its first element
    (the ``DQM_V000M`` number).  On a tie the earliest file wins.

    Files whose name ``get_relval_version`` cannot parse (it returns ``None``)
    are skipped instead of raising ``TypeError``.  When no file can be parsed
    at all, the first file is returned.

    Raises ``IndexError`` when ``files`` is empty.
    """
    max_file = files[0]
    max_key = None
    for candidate in files:
        version = get_relval_version(candidate)
        if version is None:
            continue
        # Tuple comparison: dataset version first, DQM version as tie-breaker.
        key = (version[1], version[0])
        if max_key is None or key > max_key:
            max_file, max_key = candidate, key
    return max_file
00088 
def get_relval_cmssw_version(file):
    """Extract the release and the tag parts from a RelVal file name.

    Returns ``(release, (tag_prefix, tag_suffix))``: ``release`` is the
    ``CMSSW_x_y_z`` token (with an optional ``_suffix``) found in front of a
    ``-``; ``tag_prefix`` is the text between that ``-`` and ``_V<digits>``;
    ``tag_suffix`` is the optional ``_<word>`` in front of ``-v`` (an empty
    string when absent).  Returns ``None`` when either part is missing.
    """
    releases = re.findall(r'(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
    tags = re.findall(
        r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(_[\w\d]*)?-v', file)
    if not (releases and tags):
        return None
    return (releases[0], tags[0])
00094 
def get_relval_id(file):
    """Return the dataset name that identifies a RelVal file.

    The name is the text found between ``R<nine digits>__`` and ``__CMSSW_``.
    Raises ``IndexError`` when the file name does not contain that pattern.
    """
    return re.findall(r'R\d{9}__([\w\d]*)__CMSSW_', file)[0]
00099 
00100 ## -----------------------  Make file pairs --------------------------
def is_relvaldata(files):
    """Tell whether at least one of the given file names contains ``_RelVal_``."""
    find_marker = re.compile('_RelVal_').search
    # Every name is examined (no short-circuit), exactly like a plain list scan.
    hits = [find_marker(filename) for filename in files]
    return any(hits)
00104 
def _group_files_by_version(files, get_cmssw_version):
    """Map every parsable version to the list of files that carry it."""
    groups = dict()
    for filename in files:
        version = get_cmssw_version(filename)
        if version:
            groups.setdefault(version, []).append(filename)
    return groups


def _print_file_groups(header, groups):
    """Print *header* followed by the file count of every version group."""
    print(header)
    for version in groups:
        print('   %s: %d files' % (str(version), len(groups[version])))
    if not groups:
        print('None.')


def make_file_pairs(files1, files2):
    """Pair the files of two releases that describe the same dataset/run.

    The files of each release are grouped by version.  Version groups are
    then tried against each other, largest groups first, skipping
    combinations where both versions are equal.  Within a combination, files
    sharing the same ID are matched and the highest-versioned file of each
    side is kept.  The pairs of the first combination that yields at least
    one pair are returned.

    Whether the RelValData or the RelVal helper functions are used is decided
    from ``files1`` only.

    Returns a list of ``(file_from_files1, file_from_files2)`` tuples.
    Raises ``SystemExit`` when a release has no version group or when no
    pair can be formed.
    """
    print('\n#################       Analyzing files       ###################')
    ## Select functions to use
    is_relval_data = is_relvaldata(files1)
    if is_relval_data:
        get_cmssw_version = get_relvaldata_cmssw_version
        get_id = get_relvaldata_id
        get_max_version = get_relvaldata_max_version
    else:
        get_cmssw_version = get_relval_cmssw_version
        get_id = get_relval_id
        get_max_version = get_relval_max_version

    ## Divide files into groups: {version1: [file1, file2, ...], version2: [...], ...}
    versions1 = _group_files_by_version(files1, get_cmssw_version)
    versions2 = _group_files_by_version(files2, get_cmssw_version)

    ## Print the division into groups
    _print_file_groups('For RELEASE1 found file groups:', versions1)
    _print_file_groups('\nFor RELEASE2 found file groups:', versions2)

    if not versions1 or not versions2:
        print('\nNot enough file groups. Exiting...\n')
        sys.exit()

    ## Pair till you find pairs.
    pairs = []
    by_size1 = sorted(versions1, key=lambda v: len(versions1[v]), reverse=True)
    by_size2 = sorted(versions2, key=lambda v: len(versions2[v]), reverse=True)
    for v1 in by_size1:
        for v2 in by_size2:
            if v1 == v2:
                continue
            ## Print the groups.
            print('\n#################     Pairing the files     ###################')
            print('%s (%d files)   VS   %s (%d files):\n' % (str(v1),
                    len(versions1[v1]), str(v2), len(versions2[v2])))

            ## Pairing two versions
            unique_ids = set(get_id(name) for name in versions1[v1])
            # get_relvaldata_id() returns None for names it cannot parse;
            # such files cannot be matched, so they are left out.
            unique_ids.discard(None)
            for unique_id in unique_ids:
                if is_relval_data:
                    id_res = [re.compile(unique_id[0] + '_'), re.compile(unique_id[1])]
                else:
                    id_res = [re.compile(unique_id + '_')]
                c1_files = [name for name in versions1[v1]
                            if all(r.search(name) for r in id_res)]
                c2_files = [name for name in versions2[v2]
                            if all(r.search(name) for r in id_res)]

                if c1_files and c2_files:
                    first_file = get_max_version(c1_files)
                    second_file = get_max_version(c2_files)
                    print('%s\n%s\n' % (first_file, second_file))
                    pairs.append((first_file, second_file))

            print("Got %d pairs." % (len(pairs)))
            if pairs:
                return pairs
    print('Found no file pairs. Exiting..\n')
    sys.exit()
00181 
00182 ## --------------------   Recursive file downloader -----------------------
def auth_wget(url):
    """Fetch *url* through an opener built around ``X509CertOpen``.

    Returns the body of the response.  On ``HTTPError`` or ``BadStatusLine``
    an explanatory message is printed and the program exits.
    """
    try:
        response = build_opener(X509CertOpen()).open(Request(url))
        return response.read()
    except HTTPError as e:
        print('\nError: DQM GUI is temporarily unavailable. Probably maintainance hours. ' +
              'Please try again later. Original error message: ``%s``. \nExiting...\n' % (e,))
        exit()
    except BadStatusLine:
        print('\nYou do not have permissions to access DQM GUI. Please check if your certificates ' +
              'in ``~/.globus`` directory are configured correctly. Exitting...')
        exit()
00195 
00196 
def auth_download_file(url, chunk_size=1048576):
    """Download *url* into the working directory, reporting progress.

    The file is stored under ``auth_download_file.work_dir`` with the base
    name of the URL, and one ``(1,)`` item is put on ``auth_download_file.q``
    for every chunk written (1 MB with the default ``chunk_size``).  Both
    attributes are read from the function object and are not set in this
    block, so they have to be assigned before the call.

    The remote resource is opened before the local file, so a failing request
    no longer leaves an empty file behind, and both handles are closed even
    when the transfer is interrupted by an exception.
    """
    filename = basename(url)
    file_path = join(auth_download_file.work_dir, filename)

    opener = build_opener(X509CertOpen())
    url_file = opener.open(Request(url))
    try:
        with open(file_path, 'wb') as local_file:
            chunk = url_file.read(chunk_size)
            while chunk:
                local_file.write(chunk)
                auth_download_file.q.put((1,))   # reports one downloaded chunk
                chunk = url_file.read(chunk_size)
    finally:
        url_file.close()
    print('\rDownloaded: %s  ' % (filename,))
00211 
00212 
def recursive_search_online(url, rel1, frags1, rel2, frags2):
    """Walk the directory listing at *url* and collect matching files.

    ``frags1``/``frags2`` are comma separated regular expression fragments; a
    fragment starting with ``!`` must NOT occur in the file name.  A file is
    kept for a release when its name matches every fragment, contains
    ``<release>-`` and matches ``.root``.

    When *url* is empty, both the ``RelValData`` and the ``RelVal`` listings
    of cmsweb are searched and their results merged.

    Returns two dictionaries ``{file name: file url}``, one per release.
    """
    if not url:
        found1, found2 = recursive_search_online(
            'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelValData/',
            rel1, frags1, rel2, frags2)
        extra1, extra2 = recursive_search_online(
            'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelVal/',
            rel1, frags1, rel2, frags2)
        found1.update(extra1)
        found2.update(extra2)
        return found1, found2

    domain = '://'.join(urlparse(url)[:2])

    ## Compile regular expressions
    href_re = re.compile(r"<a href='([-./\w]*)'>([-./\w]*)<")

    def compile_res(rel, frags):
        required, forbidden = [], []
        for frag in frags.split(','):
            if frag.startswith('!'):
                # The leading '!' of the fragment completes the '(?!...)' lookahead.
                forbidden.append('^((?%s).)*$' % frag)
            else:
                required.append(frag)
        return [re.compile(r) for r in required + forbidden + [rel + '-', '.root']]

    def list_links(page_url):
        # The first anchor of every listing is dropped ([1:]).
        anchors = href_re.findall(auth_wget(page_url))[1:]
        return [(link_name, link_path) for link_path, link_name in anchors]

    res1 = compile_res(rel1, frags1)
    res2 = compile_res(rel2, frags2)

    ## Recursively find files that matches regular expressions
    files_with_urls1, files_with_urls2 = dict(), dict()
    pending = list_links(url)
    # ``pending`` grows while it is being iterated: sub-directories append
    # their own entries, which the same loop then visits.
    for name, path in pending:
        target = domain + path
        if not splitext(name)[1]:   # no extension: treated as a directory
            print(target)
            pending.extend(list_links(target))
            continue
        if all([r.search(name) for r in res1]):
            files_with_urls1[name] = target
        if all([r.search(name) for r in res2]):
            files_with_urls2[name] = target
    return files_with_urls1, files_with_urls2
00252 
def search_on_disk(work_path, rel1, frags1, rel2, frags2):
    """Collect the files of *work_path* that match each release.

    ``frags1``/``frags2`` are comma separated regular expression fragments; a
    fragment starting with ``!`` must NOT occur in the file name.  A file is
    kept for a release when its name has an extension, matches every
    fragment, contains ``<release>-`` and matches ``.root``.  Only the
    directory itself is listed, not its sub-directories.

    Returns two lists of file names, one per release.  Prints a message and
    exits when *work_path* is empty.
    """
    if not work_path:
        print('No working directory specified. Use "--dir DIR" option to ' +
              'specify working directory. Exiting...')
        exit()

    ## Compile regular expressions
    def compile_res(rel, frags):
        required, forbidden = [], []
        for frag in frags.split(','):
            if frag.startswith('!'):
                # The leading '!' of the fragment completes the '(?!...)' lookahead.
                forbidden.append('^((?%s).)*$' % frag)
            else:
                required.append(frag)
        return [re.compile(r) for r in required + forbidden + [rel + '-', '.root']]

    def matches_all(name, regexps):
        return all([r.search(name) for r in regexps])

    res1 = compile_res(rel1, frags1)
    res2 = compile_res(rel2, frags2)

    ## Find files that match the regular expressions
    candidates = [name for name in listdir(work_path) if splitext(name)[1]]
    files1 = [name for name in candidates if matches_all(name, res1)]
    files2 = [name for name in candidates if matches_all(name, res2)]
    return files1, files2
00279 
00280 
00281 ## Exception definitions
# Numeric code of every comparison failure that ComparisonError can report.
comparison_errors = {
        'Missing histogram': -1,
        'Histograms have different types': -2,
        'Object is not a histogram': -3,
        'Ranges of histograms are different': -4
    }


class ComparisonError(Exception):
    """Failure of a histogram comparison, identified by a known message.

    ``error_message`` must be one of the keys of ``comparison_errors``; the
    matching value is stored as ``error_code``.  An unknown message raises
    ``KeyError``.  Keyword arguments are accepted and ignored.
    """

    def __init__(self, error_message, *args, **kwargs):
        # Initialise the base class so that ``args`` carries the message;
        # without it the exception cannot be rebuilt after pickling.
        super(ComparisonError, self).__init__(error_message, *args)
        self.error_message = error_message
        self.error_code = comparison_errors[error_message]

    def __str__(self):
        return 'Comparison Error: %d' % self.error_code
00296 
00297 
00298 ## StatisticalTests
class StatisticalTest(object):
    """Base class of the statistical tests; performs the shared sanity checks.

    Sub-classes set ``name`` and extend ``do_test``.
    """
    name = None

    def get_N_bins(self, h):
        """Return ``(x + 1) * (y + 1) * (z + 1)`` for the bin counts of *h*.

        Returns 0 when the Y or the Z bin count is zero.
        """
        x = h.GetNbinsX()
        y = h.GetNbinsY()
        z = h.GetNbinsZ()
        if not (y and z): # Is this really necessary?
            return 0
        return (x + 1) * (y + 1) * (z + 1)

    def is_empty(self, h):
        """Return True when every bin from 1 up to ``get_N_bins(h) - 1`` is zero.

        All bins of that range are inspected (not only the first one), and a
        histogram without any bin in the range counts as empty.
        """
        bin_count = self.get_N_bins(h)
        # Plain counter loop: runs unchanged on Python 2 and 3 and never
        # materialises the list of bin indices.
        bin_index = 1
        while bin_index < bin_count:
            if h.GetBinContent(bin_index) != 0:
                return False
            bin_index += 1
        return True

    def do_test(self, h1, h2):
        """Run the checks shared by all tests.

        Returns ``None`` when the histograms can be compared, so that the
        sub-class carries on with the real test.  Otherwise returns:

        * ``-104`` when the two objects have different types,
        * ``-105`` when ``h1`` does not inherit from ``TH1``,
        * ``1`` when at least one histogram is empty,
        * ``-103`` when the bin counts differ.

        Raises ``ComparisonError`` when one of the histograms is missing.
        """
        if not h1 or not h2:
            raise ComparisonError('Missing histogram')
        if type(h1) != type(h2):
            return -104     # raise ComparisonError('Histograms have different types')
        if not h1.InheritsFrom('TH1'):
            return -105     # raise ComparisonError('Object is not a histogram')
        if self.is_empty(h1) or self.is_empty(h2):
            return 1
        h1_bins = self.get_N_bins(h1)
        if h1_bins != self.get_N_bins(h2):
            return -103     # raise ComparisonError('Ranges of histograms are different')
00328 
00329 
class KolmogorovTest(StatisticalTest):
    """Statistical test that delegates to the histogram's ``KolmogorovTest``."""
    name = 'KS'

    def do_test(self, h1, h2):
        """Return the outcome of the shared checks or, if they pass, of the KS test."""
        outcome = super(KolmogorovTest, self).do_test(h1, h2)
        if outcome is None:
            for hist in (h1, h2):
                # Call Sumw2() only when the Sumw2 array is still empty.
                if hist.GetSumw2().GetSize() == 0:
                    hist.Sumw2()
            outcome = h1.KolmogorovTest(h2)
        return outcome
00342 
00343 
class Chi2Test(StatisticalTest):
    """Statistical test that delegates to the histogram's ``Chi2Test``."""
    name = 'Chi2'

    def make_absolute(self, h, bin_count):
        """Flip negative bin contents of *h* in place (bins 1 .. bin_count - 1).

        A bin whose original content is non-zero but whose error is zero is
        reset to 0.  The error is read after the sign flip.
        """
        for bin_index in xrange(1, bin_count): # Why here is no +1?
            value = h.GetBinContent(bin_index)
            if value < 0:
                h.SetBinContent(bin_index, -value)
            if h.GetBinError(bin_index) == 0 and value != 0:
                h.SetBinContent(bin_index, 0)

    def enough_filled_bins(self, h, bin_count, more_than=3):
        """Tell whether more than *more_than* bins of *h* have positive content."""
        positive = 0
        for bin_index in xrange(1, bin_count):
            if h.GetBinContent(bin_index) > 0:
                positive += 1
            if positive > more_than:
                return True
        return False

    def do_test(self, h1, h2):
        """Return the outcome of the shared checks or, if they pass, of the Chi2 test.

        Both histograms are modified in place by ``make_absolute``.  Returns 1
        when either histogram has too few filled bins.  The ``'WW'`` option is
        used for a ``TProfile`` or when entries and sum of weights differ,
        ``'UU'`` otherwise.
        """
        outcome = super(Chi2Test, self).do_test(h1, h2)
        if outcome is not None:
            return outcome

        bin_count = self.get_N_bins(h1)

        # Make histograms absolute.
        for hist in (h1, h2):
            self.make_absolute(hist, bin_count)

        # Check if there are enough filled bins in both histograms.
        if not (self.enough_filled_bins(h1, bin_count) and
                self.enough_filled_bins(h2, bin_count)):
            return 1

        weighted = h1.InheritsFrom("TProfile") or (h1.GetEntries() != h1.GetSumOfWeights())
        return h1.Chi2Test(h2, 'WW' if weighted else 'UU')
00383 
00384 
# Registry of the available statistical tests, keyed by their ``name``.
tests = dict((test_class.name, test_class) for test_class in (KolmogorovTest, Chi2Test))
00386 
00387 ## Utils
def init_database(db_path):
    """Create the comparison tables in the SQLite database at *db_path*.

    The four tables (``ReleaseComparison``, ``Directory``,
    ``RootFileComparison``, ``HistogramComparison``) are created only when
    they do not exist yet, so the function can be called repeatedly.  The
    connection is committed and closed before returning, also on failure.

    Returns *db_path*.
    """
    # Written without a newline so that 'Done.' ends up on the same line.
    sys.stdout.write('Initialising DB: %s... ' % basename(db_path))
    conn = sqlite3.connect(db_path)
    try:
        ## Creates tables
        c = conn.cursor()
        c.execute("""CREATE TABLE IF NOT EXISTS ReleaseComparison (
                        id INTEGER PRIMARY KEY,
                        title TEXT,
                        release1 TEXT,
                        release2 TEXT,
                        statistical_test TEXT
                    );""")
        # The table constraints are comma separated, as the standard
        # CREATE TABLE syntax requires.
        c.execute("""CREATE TABLE IF NOT EXISTS Directory (
                        id INTEGER PRIMARY KEY,
                        name TEXT,
                        parent_id INTEGER,
                        from_histogram_id INTEGER,
                        till_histogram_id INTEGER,
                        FOREIGN KEY (parent_id) REFERENCES Directory(id),
                        FOREIGN KEY (from_histogram_id) REFERENCES HistogramComparison(id),
                        FOREIGN KEY (till_histogram_id) REFERENCES HistogramComparison(id)
                    )""")
        c.execute("""CREATE TABLE IF NOT EXISTS RootFileComparison (
                        id INTEGER PRIMARY KEY,
                        filename1 TEXT,
                        filename2 TEXT,
                        release_comparison_id INTEGER,
                        directory_id INTEGER,
                        FOREIGN KEY (release_comparison_id) REFERENCES ReleaseComparison(id),
                        FOREIGN KEY (directory_id) REFERENCES Directory(id)
                    )""")
        c.execute("""CREATE TABLE IF NOT EXISTS HistogramComparison (
                        id INTEGER PRIMARY KEY,
                        name TEXT,
                        p_value REAL,
                        directory_id INTEGER,
                        FOREIGN KEY (directory_id) REFERENCES Directory(id)
                    )""")
        conn.commit()
    finally:
        conn.close()

    print('Done.')
    return db_path
00430 
00431 
def get_version(filename):
    """Return the version label of *filename*, its parts joined by ``___``.

    For RelValData names the parts come from
    ``get_relvaldata_cmssw_version``; otherwise they are the release and the
    two tag groups returned by ``get_relval_cmssw_version``.  Surrounding
    underscores and a leading/trailing ``RelVal`` token are removed from each
    part, and empty parts are dropped.

    Raises ``TypeError`` when the helper cannot parse the name (it returns
    ``None``).
    """
    if is_relvaldata([filename]):
        version_elems = get_relvaldata_cmssw_version(filename)
    else:
        release, (tag_prefix, tag_suffix) = get_relval_cmssw_version(filename)
        version_elems = (release, tag_prefix, tag_suffix)

    # str.strip('RelVal_') would strip any of the characters R, e, l, V, a, _
    # from both ends (turning e.g. 'parallel' into 'par'); only the 'RelVal'
    # token itself is meant to go.  NOTE(review): intent inferred, confirm.
    relval_edge_re = re.compile(r'^(?:RelVal_)+|(?:_RelVal)+$|^RelVal$')
    cleaned = [relval_edge_re.sub('', elem.strip('_')).strip('_')
               for elem in version_elems]
    return '___'.join([elem for elem in cleaned if elem])
00441 
00442 
def get_size_to_download(work_path, files_with_urls):
    """Work out which files still have to be downloaded and how big they are.

    *files_with_urls* is an iterable of ``(filename, url)`` pairs.  For every
    pair the ``Content-Length`` header of the URL is read; the file is
    skipped when a file of the same name exists in *work_path* with the same
    size in whole kilobytes.

    Returns ``(size_to_download, files_to_download)``: the total size in
    bytes and the list of URLs to fetch.
    """
    opener = build_opener(X509CertOpen())
    size_to_download = 0
    files_to_download = []
    for filename, url in files_with_urls:
        url_file = opener.open(Request(url))
        try:
            size = int(url_file.headers["Content-Length"])
        finally:
            # Only the headers are needed; do not keep the response open.
            url_file.close()
        file_path = join(work_path, filename)
        # Sizes are compared in whole kilobytes (floor division).
        if exists(file_path) and getsize(file_path) // 1024 == size // 1024:
            print("Exists on disk %s." % filename)
        else:
            size_to_download += size
            files_to_download.append(url)
    return size_to_download, files_to_download
00458 
def check_disk_for_space(work_path, size_needed):
    """Placeholder for the AFS free-space check; currently does nothing.

    Both arguments are accepted and ignored, and ``None`` is returned.
    """
    # The quota check is disabled; its former implementation is kept below.
    # try:
    #     fs_proc = subprocess.Popen(['fs', 'listquota', work_path], stdout=subprocess.PIPE)
    # except OSError:
    #     return
    # fs_response = fs_proc.communicate()[0]
    # quota, used = re.findall('([\d]+)', fs_response)[:2]
    # free_space = int(quota) - int(used)
    # if free_space * 1024 < size_needed:
    #     print '\nNot enougth free space on disk.',
    #     print 'Free space: %d MB. Need: %d MB. Exiting...\n' % (free_space / 1024, size_needed /1048576)
    #     exit()
    # elif size_needed:
    #     print 'Free space on disk: %d MB.\n' % (free_space / 1024,)
    return None
00475 
00476 
00477 def show_status_bar(total_size):
00478     """Shows download status."""
00479     q = show_status_bar.q
00480     total_size = total_size / (1024*1024)
00481     downloaded = 0
00482     while downloaded < total_size:
00483         try:
00484             o = q.get(timeout=20)
00485             downloaded += 1
00486             print '\r      %d/%d MB     %d%%     ' % (downloaded, total_size, 100*downloaded/total_size),
00487             sys.stdout.flush()
00488         except Empty:
00489             time.sleep(1)
00490             break