3 Help functions for ValidationMatrix_v2.py.
5 Author: Albertas Gimbutas, Vilnius University (LT)
6 e-mail: albertasgim@gmail.com
12 from datetime
import datetime
13 from multiprocessing
import Pool, Queue, Process
15 from optparse
import OptionParser, OptionGroup
16 from os
import makedirs, listdir
17 from os.path
import basename, dirname, isfile, splitext, join, exists, getsize
18 from Queue
import Empty
19 from urllib2
import build_opener, Request, HTTPError
20 from urlparse
import urlparse
21 from httplib
import BadStatusLine
24 from Utilities.RelMon.authentication
import X509CertOpen
26 from authentication
import X509CertOpen
30 """Returns unique relvaldata ID for a given file."""
31 run_id = re.search(
'R\d{9}', file)
32 run = re.search(
'_RelVal_([\w\d]*)-v\d__', file)
34 run = re.search(
'GR_R_\d*_V\d*C?_([\w\d]*)-v\d__', file)
36 return (run_id.group(), run.group(1))
40 """Returns tuple (CMSSW release, GR_R version) for specified RelValData file."""
41 cmssw_release = re.findall(
'(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
42 gr_r_version = re.findall(
'-(GR_R_\d*_V\d*\w?)(?:_RelVal)?_', file)
44 gr_r_version = re.findall(
'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-(\w*)_RelVal_', file)
45 if cmssw_release
and gr_r_version:
46 return (cmssw_release[0], gr_r_version[0])
49 """Returns tuple (CMSSW version, run version) for specified file."""
50 cmssw_version = re.findall(
'DQM_V(\d*)_', file)
51 run_version = re.findall(
'_RelVal_[\w\d]*-v(\d)__', file)
53 run_version = re.findall(
'GR_R_\d*_V\d*C?_[\w\d]*-v(\d)__', file)
54 if cmssw_version
and run_version:
55 return (int(cmssw_version[0]), int(run_version[0]))
58 """Returns file with maximum version at a) beginning of the file,
59 e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
64 if file_v[1] > max_v[1]
or ((file_v[1] == max_v[1])
and (file_v[0] > max_v[0])):
71 """Returns tuple (CMSSW version, run version) for specified file."""
72 cmssw_version = re.findall(
'DQM_V(\d*)_', file)
73 run_version = re.findall(
'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_[\w\d]*)?-v(\d*)__', file)
74 if cmssw_version
and run_version:
75 return (int(cmssw_version[0]), int(run_version[0]))
78 """Returns file with maximum version at a) beginning of the file,
79 e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
84 if file_v[1] > max_v[1]
or ((file_v[1] == max_v[1])
and (file_v[0] > max_v[0])):
90 cmssw_release = re.findall(
'(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
91 gr_r_version = re.findall(
'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(_[\w\d]*)?-v', file)
92 if cmssw_release
and gr_r_version:
93 return (cmssw_release[0], gr_r_version[0])
96 """Returns unique relval ID (dataset name) for a given file."""
97 dataset_name = re.findall(
'R\d{9}__([\w\d]*)__CMSSW_', file)
98 return dataset_name[0]
102 is_relvaldata_re = re.compile(
'_RelVal_')
103 return any([is_relvaldata_re.search(filename)
for filename
in files])
106 print '\n################# Analyzing files ###################'
109 is_relval_data =
True
110 get_cmssw_version = get_relvaldata_cmssw_version
111 get_id = get_relvaldata_id
112 get_max_version = get_relvaldata_max_version
114 is_relval_data =
False
115 get_cmssw_version = get_relval_cmssw_version
116 get_id = get_relval_id
117 get_max_version = get_relval_max_version
120 versions1, versions2 =
dict(),
dict()
121 for files, versions
in (files1, versions1), (files2, versions2):
123 version = get_cmssw_version(file)
125 if versions.has_key(version):
126 versions[version].
append(file)
128 versions[version] = [file]
131 print 'For RELEASE1 found file groups:'
132 for version
in versions1:
133 print ' %s: %d files' % (str(version), len(versions1[version]))
137 print '\nFor RELEASE2 found file groups:'
138 for version
in versions2:
139 print ' %s: %d files' % (str(version), len(versions2[version]))
143 if not len(versions1)
or not len(versions2):
144 print '\nNot enough file groups. Exiting...\n'
149 for v1
in sorted(versions1, key=
lambda x: len(versions1[x]), reverse=
True):
150 for v2
in sorted(versions2, key=
lambda x: len(versions2[x]), reverse=
True):
154 print '\n################# Pairing the files ###################'
155 print '%s (%d files) VS %s (%d files):\n' % (str(v1),
156 len(versions1[v1]), str(v2), len(versions2[v2]))
159 for unique_id
in set([get_id(file)
for file
in versions1[v1]]):
161 dataset_re = re.compile(unique_id[0] +
'_')
162 run_re = re.compile(unique_id[1])
163 c1_files = [file
for file
in versions1[v1]
if dataset_re.search(file)
and run_re.search(file)]
164 c2_files = [file
for file
in versions2[v2]
if dataset_re.search(file)
and run_re.search(file)]
166 dataset_re = re.compile(unique_id +
'_')
167 c1_files = [file
for file
in versions1[v1]
if dataset_re.search(file)]
168 c2_files = [file
for file
in versions2[v2]
if dataset_re.search(file)]
170 if len(c1_files) > 0
and len(c2_files) > 0:
171 first_file = get_max_version(c1_files)
172 second_file = get_max_version(c2_files)
173 print '%s\n%s\n' % (first_file, second_file)
174 pairs.append((first_file, second_file))
176 print "Got %d pairs." % (len(pairs))
179 print 'Found no file pairs. Exiting..\n'
186 return opener.open(Request(url)).
read()
188 print '\nError: DQM GUI is temporarily unavailable. Probably maintainance hours. '+\
189 'Please try again later. Original error message: ``%s``. \nExiting...\n' % (e,)
191 except BadStatusLine, e:
192 print '\nYou do not have permissions to access DQM GUI. Please check if your certificates '+\
193 'in ``~/.globus`` directory are configured correctly. Exitting...'
198 filename = basename(url)
199 file_path =
join(auth_download_file.work_dir, filename)
201 file = open(file_path,
'wb')
203 url_file = opener.open(Request(url))
204 chunk = url_file.read(chunk_size)
207 auth_download_file.q.put((1,))
208 chunk = url_file.read(chunk_size)
209 print '\rDownloaded: %s ' % (filename,)
214 """Recursively searches for files, that matches the pattern."""
216 url =
'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelValData/'
218 url =
'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelVal/'
220 g1.update(g3), g2.update(g4)
223 domain =
'://'.
join(urlparse(url)[:2])
226 href_re = re.compile(
r"<a href='([-./\w]*)'>([-./\w]*)<")
def compile_res(rel, frags):
    """Compile the list of regexps a file name is checked against.

    ``frags`` is a comma separated string of regexp fragments. A fragment
    that does not start with ``!`` is used as a pattern unchanged. A
    fragment that starts with ``!`` is wrapped as ``^((?!...).)*$``, i.e.
    a pattern that only matches strings NOT containing the fragment.
    Two more patterns are always appended: ``rel + '-'`` and ``'.root'``.

    Returns the compiled patterns as a list, ordered: plain fragments,
    negated fragments, release pattern, ``.root`` pattern.
    """
    plain, negated = [], []
    for fragment in frags.split(','):
        if fragment.startswith('!'):
            # The leading '!' of the fragment completes the '(?!' lookahead.
            negated.append('^((?%s).)*$' % fragment)
        else:
            plain.append(fragment)
    patterns = plain + negated + [rel + '-', '.root']
    return list(map(re.compile, patterns))
235 res1 = compile_res(rel1, frags1)
236 res2 = compile_res(rel2, frags2)
239 hrefs = [(name, path)
for path, name
in href_re.findall(
auth_wget(url))[1:]]
240 files_with_urls1, files_with_urls2 =
dict(),
dict()
241 for name, path
in hrefs:
242 if splitext(name)[1]:
243 if all([r.search(name)
for r
in res1]):
244 files_with_urls1[name] = domain + path
245 if all([r.search(name)
for r
in res2]):
246 files_with_urls2[name] = domain + path
249 new_hrefs = href_re.findall(
auth_wget(domain + path))[1:]
250 hrefs.extend([(name, path)
for path, name
in new_hrefs])
251 return files_with_urls1, files_with_urls2
255 print 'No working directory specified. Use "--dir DIR" option to ' +\
256 'specify working directory. Exiting...'
def compile_res(rel, frags):
    """Compile the list of regexps a local file name is checked against.

    ``frags`` is a comma separated string of regexp fragments. Fragments
    without a leading ``!`` are taken as patterns unchanged; fragments with
    a leading ``!`` become ``^((?!...).)*$``, which matches only strings
    that do not contain the fragment. The patterns ``rel + '-'`` and
    ``'.root'`` are always added at the end.

    Returns a list of compiled patterns: plain fragments first, then the
    negated ones, then the release and ``.root`` patterns.
    """
    fragments = frags.split(',')
    positive = [frag for frag in fragments if not frag.startswith('!')]
    # The fragment's own leading '!' turns '(?' into the '(?!' lookahead.
    excluded = ['^((?%s).)*$' % frag for frag in fragments if frag.startswith('!')]
    sources = positive + excluded
    sources.append(rel + '-')
    sources.append('.root')
    compiled = []
    for source in sources:
        compiled.append(re.compile(source))
    return compiled
266 res1 = compile_res(rel1, frags1)
267 res2 = compile_res(rel2, frags2)
270 files = listdir(work_path)
271 files1, files2 = [], []
273 if splitext(name)[1]:
274 if all([r.search(name)
for r
in res1]):
276 if all([r.search(name)
for r
in res2]):
278 return files1, files2
282 comparison_errors = {
283 'Missing histogram': -1,
284 'Histograms have different types': -2,
285 'Object is not a histogram': -3,
286 'Ranges of histograms are different': -4
290 def __init__(self, error_message, *args, **kwargs):
295 return 'Comparison Error: %d' % self.
error_code
308 return (x + 1) * (y + 1) * (z + 1)
312 if h.GetBinContent(i) != 0:
319 if type(h1) != type(h2):
321 if not h1.InheritsFrom(
'TH1'):
334 p_value = super(KolmogorovTest, self).
do_test(h1, h2)
335 if p_value
is not None:
339 if h.GetSumw2().GetSize() == 0:
341 return h1.KolmogorovTest(h2)
348 for i
in xrange(1, bin_count):
349 content = h.GetBinContent(i)
351 h.SetBinContent(i, -1 * content)
352 if h.GetBinError(i) == 0
and content != 0:
353 h.SetBinContent(i, 0)
357 for i
in xrange(1, bin_count):
358 if h.GetBinContent(i) > 0:
360 if filled_bins > more_than:
365 p_value = super(Chi2Test, self).
do_test(h1, h2)
366 if p_value
is not None:
380 if h1.InheritsFrom(
"TProfile")
or (h1.GetEntries() != h1.GetSumOfWeights()):
381 return h1.Chi2Test(h2,
'WW')
382 return h1.Chi2Test(h2,
'UU')
385 tests = {KolmogorovTest.name: KolmogorovTest, Chi2Test.name: Chi2Test}
389 print 'Initialising DB: %s...' % basename(db_path),
390 conn = sqlite3.connect(db_path)
394 c.execute(
"""CREATE TABLE IF NOT EXISTS ReleaseComparison (
395 id INTEGER PRIMARY KEY,
399 statistical_test TEXT
401 c.execute(
"""CREATE TABLE IF NOT EXISTS Directory (
402 id INTEGER PRIMARY KEY,
405 from_histogram_id INTEGER,
406 till_histogram_id INTEGER,
407 FOREIGN KEY (parent_id) REFERENCES Directory(id)
408 FOREIGN KEY (from_histogram_id) REFERENCES HistogramComparison(id)
409 FOREIGN KEY (till_histogram_id) REFERENCES HistogramComparison(id)
411 c.execute(
"""CREATE TABLE IF NOT EXISTS RootFileComparison (
412 id INTEGER PRIMARY KEY,
415 release_comparison_id INTEGER,
416 directory_id INTEGER,
417 FOREIGN KEY (release_comparison_id) REFERENCES ReleaseComparison(id),
418 FOREIGN KEY (directory_id) REFERENCES Directory(id)
420 c.execute(
"""CREATE TABLE IF NOT EXISTS HistogramComparison (
421 id INTEGER PRIMARY KEY,
424 directory_id INTEGER,
425 FOREIGN KEY (directory_id) REFERENCES Directory(id)
433 """Returns CMSSW and GR_R versions for the given filename."""
438 version_elems = (relval_version[0], relval_version[1][0], relval_version[1][1])
439 version_elems = [elem.strip(
'_').strip(
'RelVal_')
for elem
in version_elems]
440 return '___'.
join([elem
for elem
in version_elems
if elem])
444 """Returns file list to download and total size to download."""
447 files_to_download = []
448 for filename, url
in files_with_urls:
449 url_file = opener.open(Request(url))
450 size = int(url_file.headers[
"Content-Length"])
451 file_path =
join(work_path, filename)
452 if exists(file_path)
and getsize(file_path) / 1024 == size / 1024:
453 print "Exists on disk %s." % filename
455 size_to_download += size
456 files_to_download.append(url)
457 return size_to_download, files_to_download
460 '''Checks afs file system for space.'''
478 """Shows download status."""
479 q = show_status_bar.q
480 total_size = total_size / (1024*1024)
482 while downloaded < total_size:
484 o = q.get(timeout=20)
486 print '\r %d/%d MB %d%% ' % (downloaded, total_size, 100*downloaded/total_size),
def get_relval_version
-------------—— Make file pairs: RelVal utils ---------------——
def get_relval_max_version
def get_relvaldata_version
def get_relvaldata_cmssw_version
def get_relvaldata_max_version
def auth_wget
-----------------— Recursive file downloader --------------------—
def get_relvaldata_id
-----------—— Make file pairs: RelValData utils --------------——
static std::string join(char **cmd)
def get_relval_cmssw_version
def is_relvaldata
--------------------— Make file pairs -----------------------—
def recursive_search_online