CMS 3D CMS Logo

utils_v2.py
Go to the documentation of this file.
1 #! /usr/bin/env python
2 '''
3 Help functions for ValidationMatrix_v2.py.
4 
5 Author: Albertas Gimbutas, Vilnius University (LT)
6 e-mail: albertasgim@gmail.com
7 '''
8 from __future__ import print_function
9 import sys
10 import re
11 import time
12 import sqlite3
13 from datetime import datetime
14 from multiprocessing import Pool, Queue, Process
15 import subprocess
16 from optparse import OptionParser, OptionGroup
17 from os import makedirs, listdir
18 from os.path import basename, dirname, isfile, splitext, join, exists, getsize
19 from Queue import Empty
20 from urllib2 import build_opener, Request, HTTPError
21 from urlparse import urlparse
22 from httplib import BadStatusLine
23 
24 try:
25  from Utilities.RelMon.authentication import X509CertOpen
26 except ImportError:
27  from authentication import X509CertOpen
28 
29 ##----------------- Make files pairs: RelValData utils --------------------
def get_relvaldata_id(file):
    """Return the unique RelValData ID for a given file name.

    The ID is the tuple ``(run_id, run)``: ``run_id`` is the first ``R`` +
    nine digits token found in the name, ``run`` is the name captured right
    before the ``-v<digit>__`` suffix.

    Returns ``None`` when either part cannot be found.
    """
    # NOTE: the ``def`` line was missing from the listing; the signature is
    # taken from the cross-reference index of the same page.
    run_id = re.search(r'R\d{9}', file)
    # Match objects are always truthy, so ``or`` falls back to the second
    # pattern only when the first one did not match.
    run = (re.search(r'_RelVal_([\w\d]*)-v\d__', file)
           or re.search(r'GR_R_\d*_V\d*C?_([\w\d]*)-v\d__', file))
    if run_id and run:
        return (run_id.group(), run.group(1))
    return None
39 
def get_relvaldata_cmssw_version(file):
    """Return ``(CMSSW release, GR_R version)`` for a RelValData file name.

    The GR_R version is first looked up as a ``GR_R_<n>_V<n>`` token; if that
    fails, whatever sits between the release and ``_RelVal_`` is used.

    Returns ``None`` when either part cannot be found.
    """
    # NOTE: the ``def`` line was missing from the listing; the signature is
    # taken from the cross-reference index of the same page.
    cmssw_release = re.findall(r'(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
    gr_r_version = re.findall(r'-(GR_R_\d*_V\d*\w?)(?:_RelVal)?_', file)
    if not gr_r_version:
        gr_r_version = re.findall(
            r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-(\w*)_RelVal_', file)
    if cmssw_release and gr_r_version:
        return (cmssw_release[0], gr_r_version[0])
    return None
48 
def get_relvaldata_version(file):
    """Return ``(DQM file version, run version)`` for a RelValData file name.

    The first element is the number following ``DQM_V``, the second one the
    single digit following ``-v`` at the end of the run name.  Both are
    returned as ``int``.

    Returns ``None`` when either part cannot be found.
    """
    # NOTE: the ``def`` line was missing from the listing; the signature is
    # taken from the cross-reference index of the same page.
    cmssw_version = re.findall(r'DQM_V(\d*)_', file)
    run_version = re.findall(r'_RelVal_[\w\d]*-v(\d)__', file)
    if not run_version:
        run_version = re.findall(r'GR_R_\d*_V\d*C?_[\w\d]*-v(\d)__', file)
    if cmssw_version and run_version:
        return (int(cmssw_version[0]), int(run_version[0]))
    return None
57 
def get_relvaldata_max_version(files):
    """Return the file with the highest version among *files*.

    Files are ranked by the run version (e.g. ``_run2012-vM``) first and by
    the DQM file version (e.g. ``DQM_V000M``) second.  On ties the earliest
    file in *files* wins.  A file whose version cannot be parsed ranks below
    every file with a parsable version instead of raising ``TypeError``.

    Raises ``IndexError`` when *files* is empty.
    """
    # NOTE: the ``def`` line was missing from the listing; the signature is
    # taken from the cross-reference index of the same page.
    def rank(name):
        version = get_relvaldata_version(name)
        if version is None:
            return (-1, -1)
        # (run version, DQM version): tuple comparison gives the ordering.
        return (version[1], version[0])

    max_file = files[0]
    max_rank = rank(max_file)
    for candidate in files[1:]:
        candidate_rank = rank(candidate)
        if candidate_rank > max_rank:
            max_file, max_rank = candidate, candidate_rank
    return max_file
69 
70 ## ------------------- Make files pairs: RelVal utils ---------------------
def get_relval_version(file):
    """Return ``(DQM file version, run version)`` for a RelVal file name.

    The first element is the number following ``DQM_V``, the second one the
    number following ``-v`` after the release/conditions part of the name.
    Both are returned as ``int``.

    Returns ``None`` when either part cannot be found.
    """
    # NOTE: the ``def`` line was missing from the listing; the signature is
    # taken from the cross-reference index of the same page.
    cmssw_version = re.findall(r'DQM_V(\d*)_', file)
    run_version = re.findall(
        r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_[\w\d]*)?-v(\d*)__',
        file)
    if cmssw_version and run_version:
        return (int(cmssw_version[0]), int(run_version[0]))
    return None
77 
def get_relval_max_version(files):
    """Return the file with the highest version among *files*.

    Files are ranked by the run version (e.g. ``-vM``) first and by the DQM
    file version (e.g. ``DQM_V000M``) second.  On ties the earliest file in
    *files* wins.  A file whose version cannot be parsed ranks below every
    file with a parsable version instead of raising ``TypeError``.

    Raises ``IndexError`` when *files* is empty.
    """
    # NOTE: the ``def`` line was missing from the listing; the signature is
    # taken from the cross-reference index of the same page.
    def rank(name):
        version = get_relval_version(name)
        if version is None:
            return (-1, -1)
        # (run version, DQM version): tuple comparison gives the ordering.
        return (version[1], version[0])

    max_file = files[0]
    max_rank = rank(max_file)
    for candidate in files[1:]:
        candidate_rank = rank(candidate)
        if candidate_rank > max_rank:
            max_file, max_rank = candidate, candidate_rank
    return max_file
89 
def get_relval_cmssw_version(file):
    """Return the release and conditions parts of a RelVal file name.

    The result is ``(release, (conditions, suffix))``: ``release`` is the
    ``CMSSW_x_y_z[_extra]`` token, ``conditions`` the text between the
    release and ``_V<n>``, and ``suffix`` the optional ``_<text>`` part that
    precedes ``-v`` (an empty string when absent).

    Returns ``None`` when either part cannot be found.
    """
    # NOTE: the ``def`` line was missing from the listing; the signature is
    # taken from the cross-reference index of the same page.
    cmssw_release = re.findall(r'(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
    # Two capture groups, so ``findall`` yields 2-tuples here.
    gr_r_version = re.findall(
        r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(_[\w\d]*)?-v', file)
    if cmssw_release and gr_r_version:
        return (cmssw_release[0], gr_r_version[0])
    return None
95 
def get_relval_id(file):
    """Return the unique RelVal ID (dataset name) for a given file name.

    The dataset name is the text between ``R<9 digits>__`` and ``__CMSSW_``.

    Raises ``IndexError`` when the file name does not contain such a part.
    """
    dataset_name = re.findall(r'R\d{9}__([\w\d]*)__CMSSW_', file)
    return dataset_name[0]
100 
101 ## ----------------------- Make file pairs --------------------------
def is_relvaldata(files):
    """Return ``True`` if any name in *files* contains ``_RelVal_``.

    An empty *files* yields ``False``.
    """
    # Plain substring test with a generator: stops at the first hit and
    # needs no compiled regular expression for a literal token.
    return any('_RelVal_' in filename for filename in files)
105 
def make_file_pairs(files1, files2):
    """Pair up the files of two releases that describe the same dataset.

    Both file lists are grouped by the version extracted from the file name.
    Version groups are then tried against each other, largest groups first,
    skipping identical versions.  For each pairing, files with the same ID
    are matched and the file with the highest version is taken on each side.
    The pairs of the first pairing that yields any are returned.

    Whether the RelValData or the RelVal helpers are used is decided from
    *files1* alone.

    Returns a list of ``(file_from_files1, file_from_files2)`` tuples.
    Terminates the program via ``sys.exit()`` when one side has no version
    group or when no pairs are found.
    """
    print('\n################# Analyzing files ###################')
    ## Select functions to use
    is_relval_data = is_relvaldata(files1)
    if is_relval_data:
        get_cmssw_version = get_relvaldata_cmssw_version
        get_id = get_relvaldata_id
        get_max_version = get_relvaldata_max_version
    else:
        get_cmssw_version = get_relval_cmssw_version
        get_id = get_relval_id
        get_max_version = get_relval_max_version

    ## Divide files into groups
    versions1, versions2 = dict(), dict()  # {version1: [file1, file2, ...], version2: [...], ...}
    for files, versions in (files1, versions1), (files2, versions2):
        for file in files:
            version = get_cmssw_version(file)
            if version:
                versions.setdefault(version, []).append(file)

    ## Print the division into groups
    print('For RELEASE1 found file groups:')
    for version in versions1:
        print(' %s: %d files' % (str(version), len(versions1[version])))
    if not versions1:
        print('None.')

    print('\nFor RELEASE2 found file groups:')
    for version in versions2:
        print(' %s: %d files' % (str(version), len(versions2[version])))
    if not versions2:
        print('None.')

    if not versions1 or not versions2:
        print('\nNot enough file groups. Exiting...\n')
        # ``exit()`` is a helper injected by ``site`` for interactive use;
        # ``sys.exit()`` raises the same ``SystemExit`` and always exists.
        sys.exit()

    ## Pair till you find pairs.
    # NOTE(review): the listing lost its indentation.  Returning from inside
    # the pairing loop follows the comment above; confirm against upstream.
    pairs = []
    by_size1 = sorted(versions1, key=lambda x: len(versions1[x]), reverse=True)
    by_size2 = sorted(versions2, key=lambda x: len(versions2[x]), reverse=True)
    for v1 in by_size1:
        for v2 in by_size2:
            if v1 == v2:
                continue
            ## Print the groups.
            print('\n################# Pairing the files ###################')
            print('%s (%d files) VS %s (%d files):\n' % (str(v1),
                    len(versions1[v1]), str(v2), len(versions2[v2])))

            ## Pairing two versions
            for unique_id in set(get_id(file) for file in versions1[v1]):
                if unique_id is None:
                    # get_relvaldata_id() returns None for names it cannot
                    # parse; such files cannot be paired.
                    continue
                if is_relval_data:
                    # unique_id is (run id, run name); both must be present.
                    first_re = re.compile(unique_id[0] + '_')
                    second_re = re.compile(unique_id[1])
                    c1_files = [file for file in versions1[v1]
                                if first_re.search(file) and second_re.search(file)]
                    c2_files = [file for file in versions2[v2]
                                if first_re.search(file) and second_re.search(file)]
                else:
                    dataset_re = re.compile(unique_id + '_')
                    c1_files = [file for file in versions1[v1] if dataset_re.search(file)]
                    c2_files = [file for file in versions2[v2] if dataset_re.search(file)]

                if c1_files and c2_files:
                    first_file = get_max_version(c1_files)
                    second_file = get_max_version(c2_files)
                    print('%s\n%s\n' % (first_file, second_file))
                    pairs.append((first_file, second_file))

            print("Got %d pairs." % (len(pairs)))
            if pairs:
                return pairs
    print('Found no file pairs. Exiting..\n')
    sys.exit()
182 
183 ## -------------------- Recursive file downloader -----------------------
def auth_wget(url):
    """Download *url* with an ``X509CertOpen`` opener and return the body.

    On ``HTTPError`` or ``BadStatusLine`` an explanatory message is printed
    and the program is terminated via ``sys.exit()``.
    """
    try:
        opener = build_opener(X509CertOpen())
        response = opener.open(Request(url))
        try:
            return response.read()
        finally:
            # Release the connection instead of leaving it to the GC.
            response.close()
    except HTTPError as e:
        print('\nError: DQM GUI is temporarily unavailable. Probably maintainance hours. '+\
                'Please try again later. Original error message: ``%s``. \nExiting...\n' % (e,))
        # ``exit()`` is a ``site`` helper meant for interactive sessions.
        sys.exit()
    except BadStatusLine:
        print('\nYou do not have permissions to access DQM GUI. Please check if your certificates '+\
            'in ``~/.globus`` directory are configured correctly. Exitting...')
        sys.exit()
196 
197 
def auth_download_file(url, chunk_size=1048576):
    """Download *url* into ``auth_download_file.work_dir``.

    The file is stored under the base name of *url* and is read in chunks of
    *chunk_size* bytes.  After every chunk written, ``(1,)`` is put on
    ``auth_download_file.q`` as a progress report.  Both function attributes
    must be set by the caller before the first call.
    """
    filename = basename(url)
    file_path = join(auth_download_file.work_dir, filename)

    # Open the remote side first so a failed connection does not leave an
    # empty file behind.
    opener = build_opener(X509CertOpen())
    url_file = opener.open(Request(url))
    try:
        # ``with`` closes the output file even if a read or write fails.
        with open(file_path, 'wb') as out_file:
            chunk = url_file.read(chunk_size)
            while chunk:
                out_file.write(chunk)
                auth_download_file.q.put((1,))   # reports, that downloaded 1MB
                chunk = url_file.read(chunk_size)
    finally:
        url_file.close()
    print('\rDownloaded: %s ' % (filename,))
212 
213 
def recursive_search_online(url, rel1, frags1, rel2, frags2):
    """Search the directory listing at *url*, and below it, for ROOT files.

    A file is selected for a release when its name matches every pattern
    built for that release: each comma-separated fragment of *frags* (a
    fragment starting with ``!`` must NOT occur in the name), ``<rel>-`` and
    ``.root``.

    When *url* is empty, the ``RelValData`` and ``RelVal`` trees on
    ``cmsweb.cern.ch`` are both searched and their results merged.

    Returns two dicts ``{file name: file URL}``, one per release.
    """
    if not url:
        url = 'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelValData/'
        g1, g2 = recursive_search_online(url, rel1, frags1, rel2, frags2)
        url = 'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelVal/'
        g3, g4 = recursive_search_online(url, rel1, frags1, rel2, frags2)
        g1.update(g3)
        g2.update(g4)
        return g1, g2

    domain = '://'.join(urlparse(url)[:2])

    ## Compile regular expressions
    href_re = re.compile(r"<a href='([-./\w]*)'>([-./\w]*)<")

    def compile_res(rel, frags):
        frags = frags.split(',')
        regexps = [s for s in frags if not s.startswith('!')]
        # '!frag' becomes '^((?!frag).)*$': the name must not contain frag.
        regexps += ['^((?%s).)*$' % s for s in frags if s.startswith('!')]
        # The dot is escaped so that only a literal '.root' matches.
        regexps += [rel + '-', r'\.root']
        return [re.compile(r) for r in regexps]

    res1 = compile_res(rel1, frags1)
    res2 = compile_res(rel2, frags2)

    ## Recursively find files that matches regular expressions
    # [1:] drops the first link of every listing (presumably the link back
    # to the parent directory -- verify against the server output).
    hrefs = [(name, path) for path, name in href_re.findall(auth_wget(url))[1:]]
    files_with_urls1, files_with_urls2 = dict(), dict()
    # ``hrefs`` grows while it is being iterated: entries without an
    # extension are treated as directories and their links are appended.
    for name, path in hrefs:
        if splitext(name)[1]:  # If file
            if all(r.search(name) for r in res1):
                files_with_urls1[name] = domain + path
            if all(r.search(name) for r in res2):
                files_with_urls2[name] = domain + path
        else:
            print(domain + path)
            new_hrefs = href_re.findall(auth_wget(domain + path))[1:]
            hrefs.extend([(name, path) for path, name in new_hrefs])
    return files_with_urls1, files_with_urls2
253 
def search_on_disk(work_path, rel1, frags1, rel2, frags2):
    """Search the directory *work_path* for ROOT files of two releases.

    Only the entries directly inside *work_path* are inspected, and entries
    without a file extension are ignored.  A file is selected for a release
    when its name matches every pattern built for that release: each
    comma-separated fragment of *frags* (a fragment starting with ``!`` must
    NOT occur in the name), ``<rel>-`` and ``.root``.

    Returns two lists of file names, one per release.  Terminates the
    program via ``sys.exit()`` when *work_path* is empty.
    """
    if not work_path:
        print('No working directory specified. Use "--dir DIR" option to ' +\
              'specify working directory. Exiting...')
        # ``exit()`` is a ``site`` helper meant for interactive sessions.
        sys.exit()

    ## Compile regular expressions
    def compile_res(rel, frags):
        frags = frags.split(',')
        regexps = [s for s in frags if not s.startswith('!')]
        # '!frag' becomes '^((?!frag).)*$': the name must not contain frag.
        regexps += ['^((?%s).)*$' % s for s in frags if s.startswith('!')]
        # The dot is escaped so that only a literal '.root' matches.
        regexps += [rel + '-', r'\.root']
        return [re.compile(r) for r in regexps]

    res1 = compile_res(rel1, frags1)
    res2 = compile_res(rel2, frags2)

    ## Find files that match the regular expressions
    files1, files2 = [], []
    for name in listdir(work_path):
        if not splitext(name)[1]:
            continue
        if all(r.search(name) for r in res1):
            files1.append(name)
        if all(r.search(name) for r in res2):
            files2.append(name)
    return files1, files2
280 
281 
282 ## Exception definitions
# Maps every known comparison error message to its numeric error code.
comparison_errors = {
        'Missing histogram': -1,
        'Histograms have different types': -2,
        'Object is not a histogram': -3,
        'Ranges of histograms are different': -4
    }


# NOTE(review): the ``class`` line was missing from the listing.  The name
# comes from the ``raise ComparisonError(...)`` call in this file; the
# ``Exception`` base is assumed -- confirm against upstream.
class ComparisonError(Exception):
    """Raised when two histograms cannot be compared.

    ``error_message`` must be a key of ``comparison_errors`` (``KeyError``
    otherwise); the matching code is stored in ``error_code``.
    """

    def __init__(self, error_message, *args, **kwargs):
        # Forward the message to Exception so that ``args`` is populated;
        # without it the exception cannot be re-created when unpickled.
        super(ComparisonError, self).__init__(error_message, *args)
        self.error_message = error_message
        self.error_code = comparison_errors[error_message]

    def __str__(self):
        return 'Comparison Error: %d' % self.error_code
297 
298 
299 ## StatisticalTests
# NOTE(review): the three ``class`` lines of this section were missing from
# the listing.  ``KolmogorovTest`` and ``Chi2Test`` are named by their own
# ``super()`` calls and by the ``tests`` registry below; the base class name
# ``StatisticalTest`` is reconstructed from the section title -- confirm
# against upstream.
class StatisticalTest(object):
    """Common pre-checks shared by the histogram comparison tests."""

    name = None

    def get_N_bins(self, h):
        """Return ``(nx + 1) * (ny + 1) * (nz + 1)`` for histogram *h*.

        Returns 0 when the Y or the Z bin count is 0.
        """
        x = h.GetNbinsX()
        y = h.GetNbinsY()
        z = h.GetNbinsZ()
        if not (y and z):  # Is this really necessary?
            return 0
        return (x + 1) * (y + 1) * (z + 1)

    def is_empty(self, h):
        """Return ``True`` if bins ``1 .. get_N_bins(h) - 1`` are all zero."""
        # ``range`` instead of the Python-2-only ``xrange``.
        for i in range(1, self.get_N_bins(h)):
            if h.GetBinContent(i) != 0:
                return False
        return True

    def do_test(self, h1, h2):
        """Run the checks common to all tests.

        Returns a final result when the comparison is already decided
        (``-104`` different types, ``-105`` not a ``TH1``, ``1`` when either
        histogram is empty, ``-103`` different bin counts) and ``None`` when
        the concrete test still has to run.

        Raises ``ComparisonError`` when a histogram is missing.
        """
        if not h1 or not h2:
            raise ComparisonError('Missing histogram')
        if not isinstance(h1, type(h2)):
            return -104  # raise ComparisonError('Histograms have different types')
        if not h1.InheritsFrom('TH1'):
            return -105  # raise ComparisonError('Object is not a histogram')
        if self.is_empty(h1) or self.is_empty(h2):
            return 1
        if self.get_N_bins(h1) != self.get_N_bins(h2):
            return -103  # raise ComparisonError('Ranges of histograms are different')
        return None


class KolmogorovTest(StatisticalTest):
    """Kolmogorov test, delegating to ``h1.KolmogorovTest(h2)``."""

    name = 'KS'

    def do_test(self, h1, h2):
        """Return the pre-check result, or the Kolmogorov test value."""
        p_value = super(KolmogorovTest, self).do_test(h1, h2)
        if p_value is not None:
            return p_value

        for h in h1, h2:
            # Create the sum-of-squared-weights array if it is still empty.
            if h.GetSumw2().GetSize() == 0:
                h.Sumw2()
        return h1.KolmogorovTest(h2)


class Chi2Test(StatisticalTest):
    """Chi2 test, delegating to ``h1.Chi2Test(h2, option)``."""

    name = 'Chi2'

    def make_absolute(self, h, bin_count):
        """Flip negative bin contents of *h* in place.

        A bin that had non-zero content but zero error is set to 0.
        """
        for i in range(1, bin_count):  # Why here is no +1?
            content = h.GetBinContent(i)
            if content < 0:
                h.SetBinContent(i, -1 * content)
            if h.GetBinError(i) == 0 and content != 0:
                h.SetBinContent(i, 0)

    def enough_filled_bins(self, h, bin_count, more_than=3):
        """Return ``True`` once more than *more_than* bins are positive."""
        filled_bins = 0
        for i in range(1, bin_count):
            if h.GetBinContent(i) > 0:
                filled_bins += 1
            if filled_bins > more_than:
                return True
        return False

    def do_test(self, h1, h2):
        """Return the pre-check result, or the Chi2 test value.

        Both histograms are made absolute first (this modifies them).  When
        either one has too few filled bins the result is 1.  Option ``'WW'``
        is used for ``TProfile`` objects and when entries differ from the sum
        of weights, ``'UU'`` otherwise.
        """
        p_value = super(Chi2Test, self).do_test(h1, h2)
        if p_value is not None:
            return p_value

        bin_count = self.get_N_bins(h1)

        # Make histograms absolute.
        self.make_absolute(h1, bin_count)
        self.make_absolute(h2, bin_count)

        # Check if there are enough filled bins in both histograms.
        if not (self.enough_filled_bins(h1, bin_count)
                and self.enough_filled_bins(h2, bin_count)):
            return 1

        if h1.InheritsFrom("TProfile") or (h1.GetEntries() != h1.GetSumOfWeights()):
            return h1.Chi2Test(h2, 'WW')
        return h1.Chi2Test(h2, 'UU')


# Registry of the available tests, keyed by their ``name`` attribute.
tests = {KolmogorovTest.name: KolmogorovTest, Chi2Test.name: Chi2Test}
387 
388 ## Utils
def init_database(db_path):
    """Create the comparison tables in the SQLite database at *db_path*.

    The tables ``ReleaseComparison``, ``Directory``, ``RootFileComparison``
    and ``HistogramComparison`` are created unless they already exist, so the
    function may be called repeatedly on the same file.

    Returns *db_path*.
    """
    print('Initialising DB: %s...' % basename(db_path), end=' ')
    conn = sqlite3.connect(db_path)
    try:
        ## Creates tables
        c = conn.cursor()
        c.execute("""CREATE TABLE IF NOT EXISTS ReleaseComparison (
                        id INTEGER PRIMARY KEY,
                        title TEXT,
                        release1 TEXT,
                        release2 TEXT,
                        statistical_test TEXT
                    );""")
        # Commas added between the FOREIGN KEY constraints.
        c.execute("""CREATE TABLE IF NOT EXISTS Directory (
                        id INTEGER PRIMARY KEY,
                        name TEXT,
                        parent_id INTEGER,
                        from_histogram_id INTEGER,
                        till_histogram_id INTEGER,
                        FOREIGN KEY (parent_id) REFERENCES Directory(id),
                        FOREIGN KEY (from_histogram_id) REFERENCES HistogramComparison(id),
                        FOREIGN KEY (till_histogram_id) REFERENCES HistogramComparison(id)
                    )""")
        c.execute("""CREATE TABLE IF NOT EXISTS RootFileComparison (
                        id INTEGER PRIMARY KEY,
                        filename1 TEXT,
                        filename2 TEXT,
                        release_comparison_id INTEGER,
                        directory_id INTEGER,
                        FOREIGN KEY (release_comparison_id) REFERENCES ReleaseComparison(id),
                        FOREIGN KEY (directory_id) REFERENCES Directory(id)
                    )""")
        c.execute("""CREATE TABLE IF NOT EXISTS HistogramComparison (
                        id INTEGER PRIMARY KEY,
                        name TEXT,
                        p_value REAL,
                        directory_id INTEGER,
                        FOREIGN KEY (directory_id) REFERENCES Directory(id)
                    )""")
        conn.commit()
    finally:
        # The connection is only needed for the schema setup.
        conn.close()

    print('Done.')
    return db_path
431 
432 
def get_version(filename):
    """Return the CMSSW and GR_R versions of *filename* as one string.

    The version elements are cleaned of surrounding underscores and of a
    leading or trailing ``RelVal`` token; the non-empty ones are joined with
    ``___``.
    """
    if is_relvaldata([filename]):
        version_elems = get_relvaldata_cmssw_version(filename)
    else:
        relval_version = get_relval_cmssw_version(filename)
        version_elems = (relval_version[0], relval_version[1][0], relval_version[1][1])

    def clean(elem):
        # ``str.strip('RelVal_')`` strips any of the characters R, e, l, V,
        # a and _ from both ends, so it would also eat unrelated letters
        # (e.g. 'ecal' -> 'c').  Remove the literal token instead.
        elem = elem.strip('_')
        if elem.startswith('RelVal'):
            elem = elem[len('RelVal'):]
        if elem.endswith('RelVal'):
            elem = elem[:-len('RelVal')]
        return elem.strip('_')

    cleaned = [clean(elem) for elem in version_elems]
    return '___'.join([elem for elem in cleaned if elem])
442 
443 
def get_size_to_download(work_path, files_with_urls):
    """Work out which files still have to be downloaded, and how much.

    *files_with_urls* is iterated as ``(filename, url)`` pairs.  A file is
    skipped when it already exists in *work_path* and its size on disk
    equals the remote ``Content-Length`` once both are rounded down to
    whole KiB.

    Returns ``(size_to_download, files_to_download)``: the total number of
    bytes and the list of URLs to fetch.
    """
    opener = build_opener(X509CertOpen())
    size_to_download = 0
    files_to_download = []
    for filename, url in files_with_urls:
        url_file = opener.open(Request(url))
        try:
            size = int(url_file.headers["Content-Length"])
        finally:
            # Only the header is needed; release the connection.
            url_file.close()
        file_path = join(work_path, filename)
        # ``//`` keeps the whole-KiB comparison under true division too.
        if exists(file_path) and getsize(file_path) // 1024 == size // 1024:
            print("Exists on disk %s." % filename)
        else:
            size_to_download += size
            files_to_download.append(url)
    return size_to_download, files_to_download
459 
def check_disk_for_space(work_path, size_needed):
    """Placeholder for the AFS free-space check; currently does nothing.

    Both arguments are accepted and ignored, and ``None`` is returned.
    """
    # The former implementation ran ``fs listquota <work_path>``, compared
    # the free quota with ``size_needed`` and exited when there was not
    # enough space.  It was already commented out in the original.
    return None
476 
477 
def show_status_bar(total_size):
    """Print the download progress until *total_size* bytes are reported.

    One item is taken from ``show_status_bar.q`` per iteration and counted as
    one downloaded MB.  The loop ends when the whole number of MB in
    *total_size* has been counted, or when no item arrives within 20
    seconds.  ``show_status_bar.q`` must be set by the caller.
    """
    q = show_status_bar.q
    # ``//`` keeps the MB count an integer under true division too.
    total_size = total_size // (1024*1024)
    downloaded = 0
    while downloaded < total_size:
        try:
            q.get(timeout=20)
        except Empty:
            time.sleep(1)
            break
        downloaded += 1
        print('\r %d/%d MB %d%% ' % (downloaded, total_size, 100*downloaded//total_size), end=' ')
        sys.stdout.flush()
def enough_filled_bins(self, h, bin_count, more_than=3)
Definition: utils_v2.py:356
def get_version(filename)
Definition: utils_v2.py:433
def make_file_pairs(files1, files2)
Definition: utils_v2.py:106
def get_relval_id(file)
Definition: utils_v2.py:96
bool any(const std::vector< T > &v, const T &what)
Definition: ECalSD.cc:37
def __init__(self, error_message, args, kwargs)
Definition: utils_v2.py:291
def do_test(self, h1, h2)
Definition: utils_v2.py:334
def show_status_bar(total_size)
Definition: utils_v2.py:478
def auth_download_file(url, chunk_size=1048576)
Definition: utils_v2.py:198
StatisticalTests.
Definition: utils_v2.py:300
S & print(S &os, JobReport::InputFile const &f)
Definition: JobReport.cc:65
def do_test(self, h1, h2)
Definition: utils_v2.py:365
def recursive_search_online(url, rel1, frags1, rel2, frags2)
Definition: utils_v2.py:214
def get_relval_max_version(files)
Definition: utils_v2.py:78
def get_size_to_download(work_path, files_with_urls)
Definition: utils_v2.py:444
def auth_wget(url)
-----------------— Recursife file downloader --------------------—
Definition: utils_v2.py:184
def is_empty(self, h)
Definition: utils_v2.py:311
def get_relval_cmssw_version(file)
Definition: utils_v2.py:90
def search_on_disk(work_path, rel1, frags1, rel2, frags2)
Definition: utils_v2.py:254
static std::string join(char **cmd)
Definition: RemoteFile.cc:18
def get_N_bins(self, h)
Definition: utils_v2.py:303
def is_relvaldata(files)
--------------------— Make file pairs -----------------------—
Definition: utils_v2.py:102
def get_relvaldata_cmssw_version(file)
Definition: utils_v2.py:40
def get_relvaldata_max_version(files)
Definition: utils_v2.py:58
def get_relvaldata_version(file)
Definition: utils_v2.py:49
def check_disk_for_space(work_path, size_needed)
Definition: utils_v2.py:460
def init_database(db_path)
Utils.
Definition: utils_v2.py:389
def get_relvaldata_id(file)
-----------—— Make files pairs: RelValData utils --------------——
Definition: utils_v2.py:30
def get_relval_version(file)
-------------—— Make files pairs: RelVal utils ---------------——
Definition: utils_v2.py:71
def do_test(self, h1, h2)
Definition: utils_v2.py:317
def make_absolute(self, h, bin_count)
Definition: utils_v2.py:348
#define str(s)