CMS 3D CMS Logo

utils_v2.py
Go to the documentation of this file.
1 #! /usr/bin/env python
2 '''
3 Help functions for ValidationMatrix_v2.py.
4 
5 Author: Albertas Gimbutas, Vilnius University (LT)
6 e-mail: albertasgim@gmail.com
7 '''
8 from __future__ import print_function
9 from __future__ import absolute_import
10 from builtins import range
11 import sys
12 import re
13 import time
14 import sqlite3
15 from datetime import datetime
16 from multiprocessing import Pool, Queue, Process
17 import subprocess
18 from optparse import OptionParser, OptionGroup
19 from os import makedirs, listdir
20 from os.path import basename, dirname, isfile, splitext, join, exists, getsize
21 from Queue import Empty
22 from urllib2 import build_opener, Request, HTTPError
23 from urlparse import urlparse
24 from httplib import BadStatusLine
25 
26 try:
27  from Utilities.RelMon.authentication import X509CertOpen
28 except ImportError:
29  from .authentication import X509CertOpen
30 
31 
def get_relvaldata_id(file):
    """Return a unique RelValData ID for the given file name.

    The ID is the tuple ``(run_id, run)``: ``run_id`` is the first
    ``R`` + nine digits token in the name, ``run`` is the token captured
    between ``_RelVal_`` (or, failing that, a ``GR_R_*_V*`` tag) and the
    ``-vN__`` suffix.  Returns ``None`` when either part is missing.
    """
    # Raw strings keep the regex escapes (\d, \w) from being treated as
    # (invalid) string escapes by the Python parser.
    run_id = re.search(r'R\d{9}', file)
    run = re.search(r'_RelVal_([\w\d]*)-v\d__', file)
    if not run:
        # Fallback for names that carry no "_RelVal_" marker.
        run = re.search(r'GR_R_\d*_V\d*C?_([\w\d]*)-v\d__', file)
    if run_id and run:
        return (run_id.group(), run.group(1))
    return None
41 
def get_relvaldata_cmssw_version(file):
    """Return ``(CMSSW release, GR_R version)`` for a RelValData file name.

    Returns ``None`` when either component cannot be found in the name.
    """
    cmssw_release = re.findall(r'(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
    gr_r_version = re.findall(r'-(GR_R_\d*_V\d*\w?)(?:_RelVal)?_', file)
    if not gr_r_version:
        # Fallback: take whatever tag sits between the release and "_RelVal_".
        gr_r_version = re.findall(
            r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-(\w*)_RelVal_', file)
    if cmssw_release and gr_r_version:
        return (cmssw_release[0], gr_r_version[0])
    return None
50 
def get_relvaldata_version(file):
    """Return ``(DQM file version, run version)`` as ints for a file name.

    The first number comes from the ``DQM_V<digits>_`` prefix, the second
    from the single digit in the ``-v<digit>__`` suffix.  Returns ``None``
    when either number cannot be found.
    """
    cmssw_version = re.findall(r'DQM_V(\d*)_', file)
    run_version = re.findall(r'_RelVal_[\w\d]*-v(\d)__', file)
    if not run_version:
        # Fallback for names that carry no "_RelVal_" marker.
        run_version = re.findall(r'GR_R_\d*_V\d*C?_[\w\d]*-v(\d)__', file)
    if cmssw_version and run_version:
        return (int(cmssw_version[0]), int(run_version[0]))
    return None
59 
def get_relvaldata_max_version(files):
    """Return the file with the highest version among ``files``.

    Versions come from ``get_relvaldata_version``: a) the number at the
    beginning of the file name, e.g. ``DQM_V000M``, and b) the number at
    the end of the run, e.g. ``_run2012-vM``.  The run version is compared
    first, the leading number breaks ties; on a full tie the earliest file
    wins.  Raises ``IndexError`` for an empty list.
    """
    max_file = files[0]
    max_v = get_relvaldata_version(max_file)
    for candidate in files:
        candidate_v = get_relvaldata_version(candidate)
        # Lexicographic compare on (run version, leading version).
        if (candidate_v[1], candidate_v[0]) > (max_v[1], max_v[0]):
            max_file, max_v = candidate, candidate_v
    return max_file
71 
72 
def get_relval_version(file):
    """Return ``(DQM file version, dataset version)`` as ints for a file name.

    The first number comes from the ``DQM_V<digits>_`` prefix, the second
    from the ``-v<digits>__`` suffix following the release and conditions
    tag.  Returns ``None`` when either number cannot be found.
    """
    cmssw_version = re.findall(r'DQM_V(\d*)_', file)
    run_version = re.findall(
        r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_[\w\d]*)?-v(\d*)__',
        file)
    if cmssw_version and run_version:
        return (int(cmssw_version[0]), int(run_version[0]))
    return None
79 
def get_relval_max_version(files):
    """Return the file with the highest version among ``files``.

    Versions come from ``get_relval_version``: a) the number at the
    beginning of the file name, e.g. ``DQM_V000M``, and b) the number at
    the end of the dataset tag, e.g. ``-vM``.  The trailing version is
    compared first, the leading number breaks ties; on a full tie the
    earliest file wins.  Raises ``IndexError`` for an empty list.
    """
    max_file = files[0]
    max_v = get_relval_version(max_file)
    for candidate in files:
        candidate_v = get_relval_version(candidate)
        # Lexicographic compare on (trailing version, leading version).
        if (candidate_v[1], candidate_v[0]) > (max_v[1], max_v[0]):
            max_file, max_v = candidate, candidate_v
    return max_file
91 
def get_relval_cmssw_version(file):
    """Return ``(CMSSW release, (tag, suffix))`` for a RelVal file name.

    ``tag`` is the text between the release and ``_V<digits>``; ``suffix``
    is the optional ``_xxx`` part before ``-v`` (empty string if absent).
    Returns ``None`` when the name does not match.
    """
    cmssw_release = re.findall(r'(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
    # Two capture groups, so each findall item is a 2-tuple.
    gr_r_version = re.findall(
        r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(_[\w\d]*)?-v', file)
    if cmssw_release and gr_r_version:
        return (cmssw_release[0], gr_r_version[0])
    return None
97 
def get_relval_id(file):
    """Return the unique RelVal ID (dataset name) for a given file name.

    The dataset name is the token between ``R<9 digits>__`` and
    ``__CMSSW_``.  Raises ``IndexError`` when the name does not match.
    """
    # Raw string: \d and \w are regex escapes, not string escapes.
    dataset_name = re.findall(r'R\d{9}__([\w\d]*)__CMSSW_', file)
    return dataset_name[0]
102 
103 
def is_relvaldata(files):
    """Tell whether any of the given file names contains ``_RelVal_``."""
    marker = re.compile('_RelVal_')
    for filename in files:
        if marker.search(filename):
            return True
    return False
107 
def make_file_pairs(files1, files2):
    """Pair files of two releases for comparison.

    Files are grouped by the version returned by the RelValData or RelVal
    helper family (chosen by inspecting ``files1``).  Version groups are
    then tried largest first; for the first combination of two different
    versions that yields any pairs, the list of
    ``(file_from_files1, file_from_files2)`` tuples is returned.  Within a
    combination, files sharing the same ID are matched and the
    highest-version file is taken on each side.

    Calls ``exit()`` when a side has no version group or no pair is found.
    """
    print('\n################# Analyzing files ###################')

    # Select the helper family that understands these file names.
    is_relval_data = is_relvaldata(files1)
    if is_relval_data:
        get_cmssw_version = get_relvaldata_cmssw_version
        get_id = get_relvaldata_id
        get_max_version = get_relvaldata_max_version
    else:
        get_cmssw_version = get_relval_cmssw_version
        get_id = get_relval_id
        get_max_version = get_relval_max_version

    # Group the files: {version1: [file1, file2, ...], version2: [...], ...}
    versions1, versions2 = {}, {}
    for file_list, groups in ((files1, versions1), (files2, versions2)):
        for fname in file_list:
            version = get_cmssw_version(fname)
            if version:
                groups.setdefault(version, []).append(fname)

    # Report the groups found on each side.
    reports = (('For RELEASE1 found file groups:', versions1),
               ('\nFor RELEASE2 found file groups:', versions2))
    for header, groups in reports:
        print(header)
        for version in groups:
            print(' %s: %d files' % (str(version), len(groups[version])))
        if not groups:
            print('None.')

    if not (versions1 and versions2):
        print('\nNot enough file groups. Exiting...\n')
        exit()

    # Try version combinations, biggest groups first, until pairs are found.
    pairs = []
    order1 = sorted(versions1, key=lambda x: len(versions1[x]), reverse=True)
    order2 = sorted(versions2, key=lambda x: len(versions2[x]), reverse=True)
    for v1 in order1:
        group1 = versions1[v1]
        for v2 in order2:
            if v1 == v2:
                continue
            group2 = versions2[v2]

            print('\n################# Pairing the files ###################')
            print('%s (%d files) VS %s (%d files):\n' % (
                str(v1), len(group1), str(v2), len(group2)))

            for unique_id in {get_id(fname) for fname in group1}:
                # A file belongs to the ID if every pattern is found in it.
                if is_relval_data:
                    patterns = [re.compile(unique_id[0] + '_'),
                                re.compile(unique_id[1])]
                else:
                    patterns = [re.compile(unique_id + '_')]
                c1_files = [fname for fname in group1
                            if all(p.search(fname) for p in patterns)]
                c2_files = [fname for fname in group2
                            if all(p.search(fname) for p in patterns)]

                if c1_files and c2_files:
                    first_file = get_max_version(c1_files)
                    second_file = get_max_version(c2_files)
                    print('%s\n%s\n' % (first_file, second_file))
                    pairs.append((first_file, second_file))

            print("Got %d pairs." % (len(pairs)))
            if pairs:
                return pairs
    print('Found no file pairs. Exiting..\n')
    exit()
184 
185 
def auth_wget(url):
    """Fetch ``url`` through an X509-certificate opener and return the body.

    On ``HTTPError`` or ``BadStatusLine`` an explanatory message is printed
    and the program exits via ``exit()``.
    """
    try:
        response = build_opener(X509CertOpen()).open(Request(url))
        return response.read()
    except HTTPError as e:
        message = ('\nError: DQM GUI is temporarily unavailable. Probably maintainance hours. '
                   'Please try again later. Original error message: ``%s``. \nExiting...\n')
        print(message % (e,))
        exit()
    except BadStatusLine:
        print('\nYou do not have permissions to access DQM GUI. Please check if your certificates '
              'in ``~/.globus`` directory are configured correctly. Exitting...')
        exit()
198 
199 
def auth_download_file(url, chunk_size=1048576):
    """Download ``url`` into ``auth_download_file.work_dir`` in chunks.

    The target file is named after the last path component of ``url``.
    After every chunk written, ``(1,)`` is put on ``auth_download_file.q``
    so that a listener can track progress.  Both ``work_dir`` and ``q``
    must be set as attributes on this function before it is called.

    The output file and the HTTP response are closed even when the
    transfer fails part-way.
    """
    filename = basename(url)
    file_path = join(auth_download_file.work_dir, filename)

    with open(file_path, 'wb') as out_file:
        opener = build_opener(X509CertOpen())
        url_file = opener.open(Request(url))
        try:
            chunk = url_file.read(chunk_size)
            while chunk:
                out_file.write(chunk)
                auth_download_file.q.put((1,))  # reports, that downloaded 1MB
                chunk = url_file.read(chunk_size)
        finally:
            url_file.close()
    print('\rDownloaded: %s ' % (filename,))
214 
215 
def recursive_search_online(url, rel1, frags1, rel2, frags2):
    """Walk the DQM GUI listing under ``url`` and collect matching files.

    Returns two dicts ``{file name: file url}``, one per release.  A file
    matches a release when its name contains ``<rel>-``, matches
    ``.root`` and every comma-separated fragment of ``frags``; a fragment
    starting with ``!`` must NOT occur in the name.  When ``url`` is
    empty, both the RelValData and RelVal listings are searched and the
    results merged.
    """
    if not url:
        url = 'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelValData/'
        g1, g2 = recursive_search_online(url, rel1, frags1, rel2, frags2)
        url = 'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelVal/'
        g3, g4 = recursive_search_online(url, rel1, frags1, rel2, frags2)
        g1.update(g3)
        g2.update(g4)
        return g1, g2

    # scheme://host part of the url, used to make the listed paths absolute.
    domain = '://'.join(urlparse(url)[:2])

    href_re = re.compile(r"<a href='([-./\w]*)'>([-./\w]*)<")

    def compile_res(rel, frags):
        positive, negative = [], []
        for frag in frags.split(','):
            if frag.startswith('!'):
                # "!x" becomes the negative lookahead (?!x): no "x" anywhere.
                negative.append('^((?%s).)*$' % frag)
            else:
                positive.append(frag)
        return [re.compile(r) for r in positive + negative + [rel + '-', '.root']]

    res1 = compile_res(rel1, frags1)
    res2 = compile_res(rel2, frags2)

    def listing(page_url):
        # The first link of every page is skipped, as (name, path) tuples.
        return [(name, path) for path, name in href_re.findall(auth_wget(page_url))[1:]]

    # Work list: entries found in sub-directories are appended while walking.
    pending = listing(url)
    files_with_urls1, files_with_urls2 = {}, {}
    position = 0
    while position < len(pending):
        name, path = pending[position]
        position += 1
        if splitext(name)[1]:  # If file
            if all(r.search(name) for r in res1):
                files_with_urls1[name] = domain + path
            if all(r.search(name) for r in res2):
                files_with_urls2[name] = domain + path
        else:
            print(domain + path)
            pending.extend(listing(domain + path))
    return files_with_urls1, files_with_urls2
255 
def search_on_disk(work_path, rel1, frags1, rel2, frags2):
    """List the files in ``work_path`` that match each release.

    Returns ``(files1, files2)``, two lists of file names.  A file matches
    a release when its name has an extension, contains ``<rel>-``, matches
    ``.root`` and every comma-separated fragment of ``frags``; a fragment
    starting with ``!`` must NOT occur in the name.  Exits when
    ``work_path`` is empty.
    """
    if not work_path:
        print('No working directory specified. Use "--dir DIR" option to '
              'specify working directory. Exiting...')
        exit()

    def compile_res(rel, frags):
        positive, negative = [], []
        for frag in frags.split(','):
            if frag.startswith('!'):
                # "!x" becomes the negative lookahead (?!x): no "x" anywhere.
                negative.append('^((?%s).)*$' % frag)
            else:
                positive.append(frag)
        return [re.compile(r) for r in positive + negative + [rel + '-', '.root']]

    res1 = compile_res(rel1, frags1)
    res2 = compile_res(rel2, frags2)

    # Only names with an extension are considered.
    candidates = [name for name in listdir(work_path) if splitext(name)[1]]
    files1 = [name for name in candidates if all(r.search(name) for r in res1)]
    files2 = [name for name in candidates if all(r.search(name) for r in res2)]
    return files1, files2
282 
283 
284 
# Maps each known comparison error message to its numeric error code.
comparison_errors = dict([
    ('Missing histogram', -1),
    ('Histograms have different types', -2),
    ('Object is not a histogram', -3),
    ('Ranges of histograms are different', -4),
])
291 
293  def __init__(self, error_message, *args, **kwargs):
294  self.error_message = error_message
295  self.error_code = comparison_errors[error_message]
296 
297  def __str__(self):
298  return 'Comparison Error: %d' % self.error_code
299 
300 
301 
303  name = None
304 
305  def get_N_bins(self, h):
306  x = h.GetNbinsX()
307  y = h.GetNbinsY()
308  z = h.GetNbinsZ()
309  if not (y and z): # Is this realy necessary?
310  return 0
311  return (x + 1) * (y + 1) * (z + 1)
312 
313  def is_empty(self, h):
314  for i in range(1, self.get_N_bins(h)):
315  if h.GetBinContent(i) != 0:
316  return False
317  return True
318 
319  def do_test(self, h1, h2):
320  if not h1 or not h2:
321  raise ComparisonError('Missing histogram')
322  if not isinstance(h1, type(h2)):
323  return -104 # raise ComparisonError('Histograms have different types')
324  if not h1.InheritsFrom('TH1'):
325  return -105 # raise ComparisonError('Object is not a histogram')
326  if self.is_empty(h1) or self.is_empty(h2):
327  return 1
328  h1_bins = self.get_N_bins(h1)
329  if h1_bins != self.get_N_bins(h2):
330  return -103 # raise CoparisonError('Ranges of histograms are different')
331 
332 
334  name = 'KS'
335 
336  def do_test(self, h1, h2):
337  p_value = super(KolmogorovTest, self).do_test(h1, h2)
338  if p_value is not None:
339  return p_value
340 
341  for h in h1, h2:
342  if h.GetSumw2().GetSize() == 0:
343  h.Sumw2()
344  return h1.KolmogorovTest(h2)
345 
346 
348  name = 'Chi2'
349 
350  def make_absolute(self, h, bin_count):
351  for i in range(1, bin_count): # Why here is no +1?
352  content = h.GetBinContent(i)
353  if content < 0:
354  h.SetBinContent(i, -1 * content)
355  if h.GetBinError(i) == 0 and content != 0:
356  h.SetBinContent(i, 0)
357 
358  def enough_filled_bins(self, h, bin_count, more_than=3):
359  filled_bins = 0
360  for i in range(1, bin_count):
361  if h.GetBinContent(i) > 0:
362  filled_bins += 1
363  if filled_bins > more_than:
364  return True
365  return False
366 
367  def do_test(self, h1, h2):
368  p_value = super(Chi2Test, self).do_test(h1, h2)
369  if p_value is not None:
370  return p_value
371 
372  bin_count = self.get_N_bins(h1)
373 
374  # Make histograms absolute.
375  self.make_absolute(h1, bin_count)
376  self.make_absolute(h2, bin_count)
377 
378  # Check if there is enough filled bins in bouth histograms.
379  if not self.enough_filled_bins(h1, bin_count) or\
380  not self.enough_filled_bins(h2, bin_count):
381  return 1
382 
383  if h1.InheritsFrom("TProfile") or (h1.GetEntries() != h1.GetSumOfWeights()):
384  return h1.Chi2Test(h2, 'WW')
385  return h1.Chi2Test(h2, 'UU')
386 
387 
# Registry of the available statistical tests, keyed by their ``name``.
tests = dict((test_class.name, test_class)
             for test_class in (KolmogorovTest, Chi2Test))
389 
390 
def init_database(db_path):
    """Create the comparison tables in the SQLite file and return its path.

    The tables ``ReleaseComparison``, ``Directory``, ``RootFileComparison``
    and ``HistogramComparison`` are created if they do not exist yet, so
    the call is safe to repeat.  The connection is committed and closed
    before returning.
    """
    print('Initialising DB: %s...' % basename(db_path), end=' ')
    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()
        c.execute("""CREATE TABLE IF NOT EXISTS ReleaseComparison (
            id INTEGER PRIMARY KEY,
            title TEXT,
            release1 TEXT,
            release2 TEXT,
            statistical_test TEXT
            );""")
        c.execute("""CREATE TABLE IF NOT EXISTS Directory (
            id INTEGER PRIMARY KEY,
            name TEXT,
            parent_id INTEGER,
            from_histogram_id INTEGER,
            till_histogram_id INTEGER,
            FOREIGN KEY (parent_id) REFERENCES Directory(id)
            FOREIGN KEY (from_histogram_id) REFERENCES HistogramComparison(id)
            FOREIGN KEY (till_histogram_id) REFERENCES HistogramComparison(id)
            )""")
        c.execute("""CREATE TABLE IF NOT EXISTS RootFileComparison (
            id INTEGER PRIMARY KEY,
            filename1 TEXT,
            filename2 TEXT,
            release_comparison_id INTEGER,
            directory_id INTEGER,
            FOREIGN KEY (release_comparison_id) REFERENCES ReleaseComparison(id),
            FOREIGN KEY (directory_id) REFERENCES Directory(id)
            )""")
        c.execute("""CREATE TABLE IF NOT EXISTS HistogramComparison (
            id INTEGER PRIMARY KEY,
            name TEXT,
            p_value REAL,
            directory_id INTEGER,
            FOREIGN KEY (directory_id) REFERENCES Directory(id)
            )""")
        conn.commit()
    finally:
        # Release the file handle even if a statement fails.
        conn.close()

    print('Done.')
    return db_path
433 
434 
def get_version(filename):
    """Return the CMSSW and GR_R versions of ``filename`` as one string.

    The version elements come from the RelValData or RelVal helper,
    depending on the file name.  Each is stripped of surrounding
    underscores and of a leading ``RelVal_`` / trailing ``_RelVal``
    token; the non-empty elements are joined with ``___``.
    """
    def _clean(elem):
        # Remove the RelVal token as a whole.  str.strip('RelVal_') would
        # instead strip any of the characters R, e, l, V, a, _ from both
        # ends and mangle names such as "Reco".
        elem = elem.strip('_')
        if elem == 'RelVal':
            return ''
        if elem.startswith('RelVal_'):
            elem = elem[len('RelVal_'):]
        if elem.endswith('_RelVal'):
            elem = elem[:-len('_RelVal')]
        return elem.strip('_')

    if is_relvaldata([filename]):
        version_elems = get_relvaldata_cmssw_version(filename)
    else:
        # The RelVal helper returns (release, (tag, suffix)); flatten it.
        relval_version = get_relval_cmssw_version(filename)
        version_elems = (relval_version[0], relval_version[1][0], relval_version[1][1])
    version_elems = [_clean(elem) for elem in version_elems]
    return '___'.join([elem for elem in version_elems if elem])
444 
445 
def get_size_to_download(work_path, files_with_urls):
    """Return ``(total size in bytes, urls)`` of the files still to fetch.

    ``files_with_urls`` is an iterable of ``(filename, url)`` pairs.  A
    file is skipped when a file of the same name already exists in
    ``work_path`` and its size matches the remote ``Content-Length`` at
    whole-KiB granularity.
    """
    opener = build_opener(X509CertOpen())
    size_to_download = 0
    files_to_download = []
    for filename, url in files_with_urls:
        url_file = opener.open(Request(url))
        try:
            size = int(url_file.headers["Content-Length"])
        finally:
            # Only the headers are needed; do not keep the response open.
            url_file.close()
        file_path = join(work_path, filename)
        # Floor division keeps the KiB comparison identical on Python 2 and 3.
        if exists(file_path) and getsize(file_path) // 1024 == size // 1024:
            print("Exists on disk %s." % filename)
        else:
            size_to_download += size
            files_to_download.append(url)
    return size_to_download, files_to_download
461 
def check_disk_for_space(work_path, size_needed):
    """Checks afs file system for space.

    Currently a no-op that always returns ``None``.  The disabled
    implementation ran ``fs listquota <work_path>``, computed the free
    space as quota minus usage, and exited when it was smaller than
    ``size_needed``.
    """
    return None
478 
479 
def show_status_bar(total_size):
    """Print download progress until ``total_size`` bytes are reported.

    Every item taken from ``show_status_bar.q`` counts as one downloaded
    megabyte.  ``q`` must be set as an attribute on this function before
    it is called.  The loop stops when the whole-megabyte total is reached
    or when nothing arrives on the queue for 20 seconds.
    """
    q = show_status_bar.q
    # Floor division keeps the total an integer on both Python 2 and 3.
    total_size = total_size // (1024 * 1024)
    downloaded = 0
    while downloaded < total_size:
        try:
            q.get(timeout=20)
            downloaded += 1
            print('\r %d/%d MB %d%% ' % (downloaded, total_size, 100 * downloaded / total_size), end=' ')
            sys.stdout.flush()
        except Empty:
            time.sleep(1)
            break
def enough_filled_bins(self, h, bin_count, more_than=3)
Definition: utils_v2.py:358
def get_version(filename)
Definition: utils_v2.py:435
def make_file_pairs(files1, files2)
Definition: utils_v2.py:108
def all(container)
workaround iterator generators for ROOT classes
Definition: cmstools.py:25
def get_relval_id(file)
Definition: utils_v2.py:98
bool any(const std::vector< T > &v, const T &what)
Definition: ECalSD.cc:37
def __init__(self, error_message, args, kwargs)
Definition: utils_v2.py:293
def do_test(self, h1, h2)
Definition: utils_v2.py:336
def show_status_bar(total_size)
Definition: utils_v2.py:480
def auth_download_file(url, chunk_size=1048576)
Definition: utils_v2.py:200
StatisticalTests.
Definition: utils_v2.py:302
def do_test(self, h1, h2)
Definition: utils_v2.py:367
def recursive_search_online(url, rel1, frags1, rel2, frags2)
Definition: utils_v2.py:216
def get_relval_max_version(files)
Definition: utils_v2.py:80
def get_size_to_download(work_path, files_with_urls)
Definition: utils_v2.py:446
def auth_wget(url)
-----------------— Recursive file downloader --------------------—
Definition: utils_v2.py:186
void print(TMatrixD &m, const char *label=nullptr, bool mathematicaFormat=false)
Definition: Utilities.cc:47
def is_empty(self, h)
Definition: utils_v2.py:313
def get_relval_cmssw_version(file)
Definition: utils_v2.py:92
def search_on_disk(work_path, rel1, frags1, rel2, frags2)
Definition: utils_v2.py:256
static std::string join(char **cmd)
Definition: RemoteFile.cc:21
def get_N_bins(self, h)
Definition: utils_v2.py:305
def is_relvaldata(files)
--------------------— Make file pairs -----------------------—
Definition: utils_v2.py:104
def get_relvaldata_cmssw_version(file)
Definition: utils_v2.py:42
def get_relvaldata_max_version(files)
Definition: utils_v2.py:60
def get_relvaldata_version(file)
Definition: utils_v2.py:51
def check_disk_for_space(work_path, size_needed)
Definition: utils_v2.py:462
def init_database(db_path)
Utils.
Definition: utils_v2.py:391
def get_relvaldata_id(file)
-----------—— Make files pairs: RelValData utils --------------——
Definition: utils_v2.py:32
def get_relval_version(file)
-------------—— Make files pairs: RelVal utils ---------------——
Definition: utils_v2.py:73
def do_test(self, h1, h2)
Definition: utils_v2.py:319
def make_absolute(self, h, bin_count)
Definition: utils_v2.py:350
#define str(s)
def exit(msg="")