CMS 3D CMS Logo

utils_v2.py
Go to the documentation of this file.
1 #! /usr/bin/env python
2 '''
3 Help functions for ValidationMatrix_v2.py.
4 
5 Author: Albertas Gimbutas, Vilnius University (LT)
6 e-mail: albertasgim@gmail.com
7 '''
8 import sys
9 import re
10 import time
11 import sqlite3
12 from datetime import datetime
13 from multiprocessing import Pool, Queue, Process
14 import subprocess
15 from optparse import OptionParser, OptionGroup
16 from os import makedirs, listdir
17 from os.path import basename, dirname, isfile, splitext, join, exists, getsize
18 from Queue import Empty
19 from urllib2 import build_opener, Request, HTTPError
20 from urlparse import urlparse
21 from httplib import BadStatusLine
22 
23 try:
24  from Utilities.RelMon.authentication import X509CertOpen
25 except ImportError:
26  from authentication import X509CertOpen
27 
28 ##----------------- Make files pairs: RelValData utils --------------------
def get_relvaldata_id(file):
    """Return the unique RelValData ID for a given file name.

    The ID is the tuple ``(run_id, run)``: ``run_id`` is the first ``R``
    followed by nine digits found in the name, and ``run`` is the token
    captured just before the ``-v<digit>__`` marker, looked up first after
    ``_RelVal_`` and otherwise after a ``GR_R_<n>_V<n>[C]_`` tag.

    Returns None when either part cannot be found.
    """
    run_id = re.search(r'R\d{9}', file)
    # Prefer the ``_RelVal_`` form; fall back to the bare ``GR_R`` tag.
    run = (re.search(r'_RelVal_([\w\d]*)-v\d__', file) or
           re.search(r'GR_R_\d*_V\d*C?_([\w\d]*)-v\d__', file))
    if run_id and run:
        return (run_id.group(), run.group(1))
    return None
38 
def get_relvaldata_cmssw_version(file):
    """Return ``(CMSSW release, GR_R version)`` for a RelValData file name.

    The release is the ``CMSSW_<n>_<n>_<n>[_suffix]`` token that precedes a
    ``-``.  The second element is the ``GR_R_<n>_V<n>`` tag following that
    ``-``; when no such tag is present, whatever sits between the release
    and ``_RelVal_`` is used instead.

    Returns None when either element cannot be found.
    """
    release = re.findall(r'(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
    tag = (re.findall(r'-(GR_R_\d*_V\d*\w?)(?:_RelVal)?_', file) or
           re.findall(r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-(\w*)_RelVal_', file))
    if release and tag:
        return (release[0], tag[0])
    return None
47 
49  """Returns tuple (CMSSW version, run version) for specified file."""
50  cmssw_version = re.findall('DQM_V(\d*)_', file)
51  run_version = re.findall('_RelVal_[\w\d]*-v(\d)__', file)
52  if not run_version:
53  run_version = re.findall('GR_R_\d*_V\d*C?_[\w\d]*-v(\d)__', file)
54  if cmssw_version and run_version:
55  return (int(cmssw_version[0]), int(run_version[0]))
56 
def get_relvaldata_max_version(files):
    """Return the file from ``files`` that carries the highest version.

    Files are ranked by run version first (the ``-vM`` marker at the end of
    the run, e.g. ``_run2012-vM``) and by the version at the beginning of
    the file name second (e.g. ``DQM_V000M``), both as parsed by
    ``get_relvaldata_version``.  On a tie the earliest file in ``files``
    wins.  A name whose version cannot be parsed ranks below every
    parsable one instead of raising ``TypeError``.

    Raises IndexError when ``files`` is empty.
    """
    def rank(name):
        version = get_relvaldata_version(name)
        if version is None:
            # Parsed versions are never negative, so this sorts last.
            return (-1, -1)
        return (version[1], version[0])

    max_file = files[0]
    max_rank = rank(max_file)
    for name in files:
        name_rank = rank(name)
        if name_rank > max_rank:
            max_file, max_rank = name, name_rank
    return max_file
68 
69 ## ------------------- Make files pairs: RelVal utils ---------------------
def get_relval_version(file):
    """Return ``(DQM file version, run version)`` for a RelVal file name.

    The first element is the number following ``DQM_V``; the second is the
    number of the ``-v<n>__`` marker that closes the
    ``CMSSW_<release>-<tag>_V<n>[_suffix]`` part of the name.  Both are
    returned as ints.

    Returns None when either number cannot be found.
    """
    dqm_version = re.findall(r'DQM_V(\d*)_', file)
    run_version = re.findall(
        r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_[\w\d]*)?-v(\d*)__',
        file)
    if dqm_version and run_version:
        return (int(dqm_version[0]), int(run_version[0]))
    return None
76 
def get_relval_max_version(files):
    """Return the file from ``files`` that carries the highest version.

    Files are ranked by run version first (the ``-vM`` marker at the end of
    the run) and by the version at the beginning of the file name second
    (e.g. ``DQM_V000M``), both as parsed by ``get_relval_version``.  On a
    tie the earliest file in ``files`` wins.  A name whose version cannot
    be parsed ranks below every parsable one instead of raising
    ``TypeError``.

    Raises IndexError when ``files`` is empty.
    """
    def rank(name):
        version = get_relval_version(name)
        if version is None:
            # Parsed versions are never negative, so this sorts last.
            return (-1, -1)
        return (version[1], version[0])

    max_file = files[0]
    max_rank = rank(max_file)
    for name in files:
        name_rank = rank(name)
        if name_rank > max_rank:
            max_file, max_rank = name, name_rank
    return max_file
88 
def get_relval_cmssw_version(file):
    """Return ``(CMSSW release, (tag, suffix))`` for a RelVal file name.

    The release is the ``CMSSW_<n>_<n>_<n>[_suffix]`` token that precedes a
    ``-``.  The second element is itself a 2-tuple: the text between that
    ``-`` and ``_V<n>``, and the optional ``_...`` suffix found before
    ``-v`` (an empty string when there is none).

    Returns None when either element cannot be found.
    """
    release = re.findall(r'(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
    tag = re.findall(
        r'CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(_[\w\d]*)?-v',
        file)
    if release and tag:
        return (release[0], tag[0])
    return None
94 
def get_relval_id(file):
    """Return the unique RelVal ID (dataset name) for a given file name.

    The dataset name is the token enclosed in double underscores between
    the ``R<nine digits>`` run number and ``CMSSW_``.

    Raises IndexError when the name does not contain that pattern.
    """
    dataset_name = re.findall(r'R\d{9}__([\w\d]*)__CMSSW_', file)
    return dataset_name[0]
99 
100 ## ----------------------- Make file pairs --------------------------
def is_relvaldata(files):
    """Return True when at least one name in ``files`` contains ``_RelVal_``.

    An empty ``files`` yields False.
    """
    # A literal substring needs no regex, and the generator lets any()
    # stop at the first hit.
    return any('_RelVal_' in filename for filename in files)
104 
def make_file_pairs(files1, files2):
    """Pair the files of two releases for comparison.

    The RelValData or RelVal helper set is chosen from ``files1`` alone
    (via ``is_relvaldata``).  Both lists are grouped by the version that
    the matching ``get_*_cmssw_version`` helper extracts; files with no
    recognisable version are dropped.  Version groups are then tried
    largest first, skipping identical versions.  Within a combination,
    every unique ID of the first group is matched against the file names
    of both groups, and the highest-version file of each side forms a pair.

    Returns the list of ``(file1, file2)`` tuples of the first version
    combination that produces at least one pair.  Calls ``exit()`` when a
    release has no version group or when no combination produces a pair.
    """
    print('\n################# Analyzing files ###################')
    ## Select functions to use
    is_relval_data = is_relvaldata(files1)
    if is_relval_data:
        get_cmssw_version = get_relvaldata_cmssw_version
        get_id = get_relvaldata_id
        get_max_version = get_relvaldata_max_version
    else:
        get_cmssw_version = get_relval_cmssw_version
        get_id = get_relval_id
        get_max_version = get_relval_max_version

    ## Divide files into groups: {version1: [file1, file2, ...], ...}
    versions1, versions2 = dict(), dict()
    for files, versions in (files1, versions1), (files2, versions2):
        for name in files:
            version = get_cmssw_version(name)
            if version:
                versions.setdefault(version, []).append(name)

    ## Print the division into groups
    print('For RELEASE1 found file groups:')
    for version in versions1:
        print(' %s: %d files' % (str(version), len(versions1[version])))
    if not versions1:
        print('None.')

    print('\nFor RELEASE2 found file groups:')
    for version in versions2:
        print(' %s: %d files' % (str(version), len(versions2[version])))
    if not versions2:
        print('None.')

    if not versions1 or not versions2:
        print('\nNot enough file groups. Exiting...\n')
        exit()

    ## Pair till you find pairs.
    # NOTE(review): the return sits inside the version loops so the search
    # stops at the first combination that yields pairs, as the comment
    # above asks for -- confirm against the upstream file.
    order1 = sorted(versions1, key=lambda x: len(versions1[x]), reverse=True)
    order2 = sorted(versions2, key=lambda x: len(versions2[x]), reverse=True)
    pairs = []
    for v1 in order1:
        for v2 in order2:
            if v1 == v2:
                continue
            ## Print the groups.
            print('\n################# Pairing the files ###################')
            print('%s (%d files) VS %s (%d files):\n' % (str(v1),
                  len(versions1[v1]), str(v2), len(versions2[v2])))

            ## Pairing two versions
            for unique_id in set(get_id(name) for name in versions1[v1]):
                if unique_id is None:
                    # get_relvaldata_id found no ID; nothing to match on.
                    continue
                if is_relval_data:
                    dataset_re = re.compile(unique_id[0] + '_')
                    run_re = re.compile(unique_id[1])
                    c1_files = [name for name in versions1[v1]
                                if dataset_re.search(name) and run_re.search(name)]
                    c2_files = [name for name in versions2[v2]
                                if dataset_re.search(name) and run_re.search(name)]
                else:
                    dataset_re = re.compile(unique_id + '_')
                    c1_files = [name for name in versions1[v1]
                                if dataset_re.search(name)]
                    c2_files = [name for name in versions2[v2]
                                if dataset_re.search(name)]

                if c1_files and c2_files:
                    first_file = get_max_version(c1_files)
                    second_file = get_max_version(c2_files)
                    print('%s\n%s\n' % (first_file, second_file))
                    pairs.append((first_file, second_file))

            print("Got %d pairs." % (len(pairs)))
            if pairs:
                return pairs
    print('Found no file pairs. Exiting..\n')
    exit()
181 
182 ## -------------------- Recursive file downloader -----------------------
def auth_wget(url):
    """Fetch ``url`` through an ``X509CertOpen`` opener and return its body.

    On ``HTTPError`` or ``BadStatusLine`` an explanatory message is printed
    and the process is terminated with ``exit()``.
    """
    try:
        opener = build_opener(X509CertOpen())
        response = opener.open(Request(url))
        try:
            return response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
    except HTTPError as e:
        print('\nError: DQM GUI is temporarily unavailable. Probably maintainance hours. ' +
              'Please try again later. Original error message: ``%s``. \nExiting...\n' % (e,))
        exit()
    except BadStatusLine:
        print('\nYou do not have permissions to access DQM GUI. Please check if your certificates ' +
              'in ``~/.globus`` directory are configured correctly. Exitting...')
        exit()
195 
196 
def auth_download_file(url, chunk_size=1048576):
    """Download ``url`` into ``auth_download_file.work_dir``.

    The file keeps the base name of the URL.  The body is copied in pieces
    of ``chunk_size`` bytes, and one ``(1,)`` item is put on
    ``auth_download_file.q`` per piece written, so a consumer can track
    progress.  Both ``work_dir`` and ``q`` are function attributes that
    must be set before the call.
    """
    filename = basename(url)
    file_path = join(auth_download_file.work_dir, filename)

    opener = build_opener(X509CertOpen())
    # Connect before opening the destination, so a failed request does not
    # truncate a file that is already on disk.
    url_file = opener.open(Request(url))
    try:
        with open(file_path, 'wb') as out:
            for chunk in iter(lambda: url_file.read(chunk_size), b''):
                out.write(chunk)
                auth_download_file.q.put((1,))  # reports one downloaded chunk
    finally:
        url_file.close()
    print('\rDownloaded: %s ' % (filename,))
211 
212 
def recursive_search_online(url, rel1, frags1, rel2, frags2):
    """Search the pages below ``url`` for files matching two patterns.

    ``frags1``/``frags2`` are comma-separated regular-expression fragments;
    a fragment prefixed with ``!`` must NOT occur in the file name.  A file
    belongs to release N when its name matches every fragment of
    ``fragsN``, contains ``relN`` followed by ``-`` and contains ``.root``.

    When ``url`` is empty, both the RelValData and the RelVal trees on
    cmsweb.cern.ch are searched and their results merged.

    Returns two dicts, one per release, mapping file name to full URL.
    """
    if not url:
        url = 'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelValData/'
        g1, g2 = recursive_search_online(url, rel1, frags1, rel2, frags2)
        url = 'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelVal/'
        g3, g4 = recursive_search_online(url, rel1, frags1, rel2, frags2)
        g1.update(g3)
        g2.update(g4)
        return g1, g2

    domain = '://'.join(urlparse(url)[:2])

    ## Compile regular expressions
    href_re = re.compile(r"<a href='([-./\w]*)'>([-./\w]*)<")

    def compile_res(rel, frags):
        frags = frags.split(',')
        regexps = [s for s in frags if not s.startswith('!')]
        # '!frag' becomes '^((?!frag).)*$': no position may start 'frag'.
        regexps += ['^((?%s).)*$' % s for s in frags if s.startswith('!')]
        # The dot is escaped so that only a literal '.root' qualifies.
        regexps += [rel + '-', r'\.root']
        return [re.compile(r) for r in regexps]

    res1 = compile_res(rel1, frags1)
    res2 = compile_res(rel2, frags2)

    ## Recursively find files that match the regular expressions
    # The first link of every page is skipped -- presumably the link to
    # the parent directory; TODO confirm.
    pending = [(name, path) for path, name in href_re.findall(auth_wget(url))[1:]]
    files_with_urls1, files_with_urls2 = dict(), dict()
    index = 0
    # ``pending`` is a FIFO work list: directories append their entries.
    while index < len(pending):
        name, path = pending[index]
        index += 1
        if splitext(name)[1]:  # If file
            if all(r.search(name) for r in res1):
                files_with_urls1[name] = domain + path
            if all(r.search(name) for r in res2):
                files_with_urls2[name] = domain + path
        else:
            print(domain + path)
            new_hrefs = href_re.findall(auth_wget(domain + path))[1:]
            pending.extend([(name, path) for path, name in new_hrefs])
    return files_with_urls1, files_with_urls2
252 
def search_on_disk(work_path, rel1, frags1, rel2, frags2):
    """Select the files in ``work_path`` that match two release patterns.

    Only the direct entries of ``work_path`` are examined, and entries
    without a file extension are ignored.  ``frags1``/``frags2`` are
    comma-separated regular-expression fragments; a fragment prefixed with
    ``!`` must NOT occur in the file name.  A file belongs to release N
    when its name matches every fragment of ``fragsN``, contains ``relN``
    followed by ``-`` and contains ``.root``.

    Returns two lists of file names, one per release.  Prints a message
    and calls ``exit()`` when ``work_path`` is empty.
    """
    if not work_path:
        print('No working directory specified. Use "--dir DIR" option to ' +
              'specify working directory. Exiting...')
        exit()

    ## Compile regular expressions
    def compile_res(rel, frags):
        frags = frags.split(',')
        regexps = [s for s in frags if not s.startswith('!')]
        # '!frag' becomes '^((?!frag).)*$': no position may start 'frag'.
        regexps += ['^((?%s).)*$' % s for s in frags if s.startswith('!')]
        # The dot is escaped so that only a literal '.root' qualifies.
        regexps += [rel + '-', r'\.root']
        return [re.compile(r) for r in regexps]

    res1 = compile_res(rel1, frags1)
    res2 = compile_res(rel2, frags2)

    ## Find files that match the regular expressions
    files1, files2 = [], []
    for name in listdir(work_path):
        if not splitext(name)[1]:
            continue
        if all(r.search(name) for r in res1):
            files1.append(name)
        if all(r.search(name) for r in res2):
            files2.append(name)
    return files1, files2
279 
280 
281 ## Exception definitions
# Maps every known comparison error message to its numeric error code.
comparison_errors = {
    'Missing histogram': -1,
    'Histograms have different types': -2,
    'Object is not a histogram': -3,
    'Ranges of histograms are different': -4
    }


class ComparisonError(Exception):
    """Raised when two histograms cannot be compared.

    ``error_message`` must be one of the keys of ``comparison_errors``;
    the matching code is stored in ``error_code``.  Any other message
    raises ``KeyError``.  Extra keyword arguments are accepted and ignored.
    """

    def __init__(self, error_message, *args, **kwargs):
        # Hand the message to Exception so that ``args`` is populated;
        # with empty ``args`` the exception cannot be re-created when it
        # is unpickled (e.g. on the way back from a worker process).
        super(ComparisonError, self).__init__(error_message, *args)
        self.error_message = error_message
        self.error_code = comparison_errors[error_message]

    def __str__(self):
        return 'Comparison Error: %d' % self.error_code
296 
297 
298 ## StatisticalTests
class StatisticalTest(object):
    """Base class of the histogram comparison tests.

    Holds the bin helpers and the pre-checks shared by all tests.
    Subclasses set ``name`` and extend ``do_test``.
    """
    name = None

    def get_N_bins(self, h):
        """Return the number of cells the tests scan in histogram ``h``.

        The value is ``(x + 1) * (y + 1) * (z + 1)`` for the per-axis bin
        counts reported by ``h``, or 0 when the Y or Z count is zero.
        """
        x = h.GetNbinsX()
        y = h.GetNbinsY()
        z = h.GetNbinsZ()
        if not (y and z):  # Is this really necessary?
            return 0
        return (x + 1) * (y + 1) * (z + 1)

    def is_empty(self, h):
        """Return True when every scanned cell of ``h`` has zero content.

        Cells 1 up to (but excluding) ``get_N_bins(h)`` are inspected.
        """
        return not any(h.GetBinContent(i) != 0
                       for i in xrange(1, self.get_N_bins(h)))

    def do_test(self, h1, h2):
        """Run the checks that every test needs before comparing.

        Returns None when the histograms can be compared, otherwise the
        final result: -104 when the two objects have different types, -105
        when ``h1`` does not inherit from ``TH1``, 1 when either histogram
        is empty and -103 when the bin counts differ.

        Raises ComparisonError when ``h1`` or ``h2`` is missing (falsy).
        """
        if not h1 or not h2:
            raise ComparisonError('Missing histogram')
        if type(h1) != type(h2):
            return -104  # raise ComparisonError('Histograms have different types')
        if not h1.InheritsFrom('TH1'):
            return -105  # raise ComparisonError('Object is not a histogram')
        if self.is_empty(h1) or self.is_empty(h2):
            return 1
        if self.get_N_bins(h1) != self.get_N_bins(h2):
            return -103  # raise ComparisonError('Ranges of histograms are different')
        return None


class KolmogorovTest(StatisticalTest):
    """Histogram comparison through ``h1.KolmogorovTest(h2)``."""
    name = 'KS'

    def do_test(self, h1, h2):
        """Return the shared pre-check result or the Kolmogorov result.

        ``Sumw2()`` is called on each histogram whose ``GetSumw2()`` array
        is still empty before the comparison runs.
        """
        p_value = super(KolmogorovTest, self).do_test(h1, h2)
        if p_value is not None:
            return p_value

        for h in h1, h2:
            if h.GetSumw2().GetSize() == 0:
                h.Sumw2()
        return h1.KolmogorovTest(h2)


class Chi2Test(StatisticalTest):
    """Histogram comparison through ``h1.Chi2Test(h2, option)``."""
    name = 'Chi2'

    def make_absolute(self, h, bin_count):
        """Make the contents of cells 1..``bin_count`` - 1 non-negative.

        A negative content is replaced by its absolute value.  A cell
        whose error is zero although its original content was not is then
        reset to zero.  ``h`` is modified in place.
        """
        for i in xrange(1, bin_count):  # Why is there no +1 here?
            content = h.GetBinContent(i)
            if content < 0:
                h.SetBinContent(i, -1 * content)
            # The error is read after the sign flip on purpose: keep the
            # statement order.
            if h.GetBinError(i) == 0 and content != 0:
                h.SetBinContent(i, 0)

    def enough_filled_bins(self, h, bin_count, more_than=3):
        """Return True when more than ``more_than`` cells are filled.

        A cell counts as filled when its content is positive.  Cells 1 up
        to (but excluding) ``bin_count`` are inspected, and the scan stops
        as soon as the threshold is exceeded.
        """
        filled_bins = 0
        for i in xrange(1, bin_count):
            if h.GetBinContent(i) > 0:
                filled_bins += 1
            if filled_bins > more_than:
                return True
        return False

    def do_test(self, h1, h2):
        """Return the shared pre-check result or the chi-square result.

        Both histograms are made absolute in place first.  The result is 1
        when either has too few filled cells.  Option ``'WW'`` is used
        when ``h1`` is a ``TProfile`` or its entry count differs from its
        sum of weights, ``'UU'`` otherwise.
        """
        p_value = super(Chi2Test, self).do_test(h1, h2)
        if p_value is not None:
            return p_value

        bin_count = self.get_N_bins(h1)

        # Make histograms absolute.
        self.make_absolute(h1, bin_count)
        self.make_absolute(h2, bin_count)

        # Check if there are enough filled bins in both histograms.
        if not self.enough_filled_bins(h1, bin_count) or \
                not self.enough_filled_bins(h2, bin_count):
            return 1

        if h1.InheritsFrom("TProfile") or (h1.GetEntries() != h1.GetSumOfWeights()):
            return h1.Chi2Test(h2, 'WW')
        return h1.Chi2Test(h2, 'UU')


# Registry of the available tests, keyed by their ``name``.
tests = {KolmogorovTest.name: KolmogorovTest, Chi2Test.name: Chi2Test}
386 
387 ## Utils
def init_database(db_path):
    """Create the comparison tables in the SQLite file at ``db_path``.

    The tables ReleaseComparison, Directory, RootFileComparison and
    HistogramComparison are created when they do not exist yet, so the
    call is safe on an already initialised database.  The connection is
    committed and closed before returning.

    Returns ``db_path``.
    """
    # Trailing comma: under the Python 2 print statement this keeps the
    # cursor on the line so that 'Done.' is appended to it.
    print('Initialising DB: %s...' % basename(db_path)),
    conn = sqlite3.connect(db_path)
    try:
        ## Creates tables
        c = conn.cursor()
        c.execute("""CREATE TABLE IF NOT EXISTS ReleaseComparison (
                        id INTEGER PRIMARY KEY,
                        title TEXT,
                        release1 TEXT,
                        release2 TEXT,
                        statistical_test TEXT
                    );""")
        c.execute("""CREATE TABLE IF NOT EXISTS Directory (
                        id INTEGER PRIMARY KEY,
                        name TEXT,
                        parent_id INTEGER,
                        from_histogram_id INTEGER,
                        till_histogram_id INTEGER,
                        FOREIGN KEY (parent_id) REFERENCES Directory(id),
                        FOREIGN KEY (from_histogram_id) REFERENCES HistogramComparison(id),
                        FOREIGN KEY (till_histogram_id) REFERENCES HistogramComparison(id)
                    )""")
        c.execute("""CREATE TABLE IF NOT EXISTS RootFileComparison (
                        id INTEGER PRIMARY KEY,
                        filename1 TEXT,
                        filename2 TEXT,
                        release_comparison_id INTEGER,
                        directory_id INTEGER,
                        FOREIGN KEY (release_comparison_id) REFERENCES ReleaseComparison(id),
                        FOREIGN KEY (directory_id) REFERENCES Directory(id)
                    )""")
        c.execute("""CREATE TABLE IF NOT EXISTS HistogramComparison (
                        id INTEGER PRIMARY KEY,
                        name TEXT,
                        p_value REAL,
                        directory_id INTEGER,
                        FOREIGN KEY (directory_id) REFERENCES Directory(id)
                    )""")
        conn.commit()
    finally:
        conn.close()

    print('Done.')
    return db_path
430 
431 
def get_version(filename):
    """Return the CMSSW and GR_R versions of ``filename`` as one string.

    The elements delivered by ``get_relvaldata_cmssw_version`` (for names
    containing ``_RelVal_``) or ``get_relval_cmssw_version`` (otherwise)
    are cleaned of surrounding underscores and of a leading or trailing
    ``RelVal`` token, and the non-empty ones are joined with ``___``.
    """
    if is_relvaldata([filename]):
        version_elems = get_relvaldata_cmssw_version(filename)
    else:
        release, (tag, suffix) = get_relval_cmssw_version(filename)
        version_elems = (release, tag, suffix)

    # str.strip('RelVal_') would strip any of those *characters* (turning
    # e.g. 'Ideal' into 'Id'); only the whole token is removed here.
    relval_affix = re.compile(r'^(?:RelVal(?:_|$))+|(?:_RelVal)+$')
    cleaned = [relval_affix.sub('', elem.strip('_')).strip('_')
               for elem in version_elems]
    return '___'.join([elem for elem in cleaned if elem])
441 
442 
def get_size_to_download(work_path, files_with_urls):
    """Return the total size to download and the list of URLs to fetch.

    ``files_with_urls`` is iterated as ``(filename, url)`` pairs.  Each
    URL is opened to read its ``Content-Length`` header.  A file that
    already exists in ``work_path`` with the same size in whole KiB is
    reported and skipped; every other URL is scheduled for download.

    Returns ``(size_to_download, files_to_download)``: the summed size in
    bytes and the list of URLs.
    """
    opener = build_opener(X509CertOpen())
    size_to_download = 0
    files_to_download = []
    for filename, url in files_with_urls:
        url_file = opener.open(Request(url))
        try:
            size = int(url_file.headers["Content-Length"])
        finally:
            # Only the headers are needed; do not keep the connection open.
            url_file.close()
        file_path = join(work_path, filename)
        if exists(file_path) and getsize(file_path) // 1024 == size // 1024:
            print("Exists on disk %s." % filename)
        else:
            size_to_download += size
            files_to_download.append(url)
    return size_to_download, files_to_download
458 
def check_disk_for_space(work_path, size_needed):
    """Placeholder for the AFS free-space check; currently does nothing.

    Both arguments are accepted and ignored, and None is returned.  The
    disabled implementation is kept below for reference.
    """
    # try:
    #     fs_proc = subprocess.Popen(['fs', 'listquota', work_path], stdout=subprocess.PIPE)
    # except OSError:
    #     return
    # fs_response = fs_proc.communicate()[0]
    # quota, used = re.findall('([\d]+)', fs_response)[:2]
    # free_space = int(quota) - int(used)
    # if free_space * 1024 < size_needed:
    #     print '\nNot enougth free space on disk.',
    #     print 'Free space: %d MB. Need: %d MB. Exiting...\n' % (free_space / 1024, size_needed /1048576)
    #     exit()
    # elif size_needed:
    #     print 'Free space on disk: %d MB.\n' % (free_space / 1024,)
475 
476 
def show_status_bar(total_size):
    """Show the download progress until ``total_size`` bytes are reported.

    ``total_size`` is converted to whole MB.  Every item taken from
    ``show_status_bar.q`` (a function attribute that must be set before
    the call) counts as one downloaded MB and refreshes the status line.
    The loop ends when the expected amount is reached, or when no item
    arrives within 20 seconds.
    """
    q = show_status_bar.q
    total_size = total_size // (1024 * 1024)
    downloaded = 0
    while downloaded < total_size:
        try:
            q.get(timeout=20)
        except Empty:
            # Nothing reported for 20 s: give up on the progress display.
            time.sleep(1)
            break
        downloaded += 1
        # Trailing comma: under the Python 2 print statement this keeps
        # the cursor on the line so '\r' can overwrite the status.
        print('\r %d/%d MB %d%% ' % (downloaded, total_size,
                                     100 * downloaded // total_size)),
        sys.stdout.flush()
def enough_filled_bins(self, h, bin_count, more_than=3)
Definition: utils_v2.py:355
def get_version(filename)
Definition: utils_v2.py:432
def make_file_pairs(files1, files2)
Definition: utils_v2.py:105
def get_relval_id(file)
Definition: utils_v2.py:95
bool any(const std::vector< T > &v, const T &what)
Definition: ECalSD.cc:37
def __init__(self, error_message, args, kwargs)
Definition: utils_v2.py:290
def do_test(self, h1, h2)
Definition: utils_v2.py:333
def show_status_bar(total_size)
Definition: utils_v2.py:477
def auth_download_file(url, chunk_size=1048576)
Definition: utils_v2.py:197
StatisticalTests.
Definition: utils_v2.py:299
def do_test(self, h1, h2)
Definition: utils_v2.py:364
def recursive_search_online(url, rel1, frags1, rel2, frags2)
Definition: utils_v2.py:213
def get_relval_max_version(files)
Definition: utils_v2.py:77
def get_size_to_download(work_path, files_with_urls)
Definition: utils_v2.py:443
def auth_wget(url)
-----------------— Recursife file downloader --------------------—
Definition: utils_v2.py:183
def is_empty(self, h)
Definition: utils_v2.py:310
def get_relval_cmssw_version(file)
Definition: utils_v2.py:89
def search_on_disk(work_path, rel1, frags1, rel2, frags2)
Definition: utils_v2.py:253
static std::string join(char **cmd)
Definition: RemoteFile.cc:18
def get_N_bins(self, h)
Definition: utils_v2.py:302
def is_relvaldata(files)
--------------------— Make file pairs -----------------------—
Definition: utils_v2.py:101
def get_relvaldata_cmssw_version(file)
Definition: utils_v2.py:39
def get_relvaldata_max_version(files)
Definition: utils_v2.py:57
def get_relvaldata_version(file)
Definition: utils_v2.py:48
def check_disk_for_space(work_path, size_needed)
Definition: utils_v2.py:459
def init_database(db_path)
Utils.
Definition: utils_v2.py:388
def get_relvaldata_id(file)
-----------—— Make files pairs: RelValData utils --------------——
Definition: utils_v2.py:29
def get_relval_version(file)
-------------—— Make files pairs: RelVal utils ---------------——
Definition: utils_v2.py:70
def do_test(self, h1, h2)
Definition: utils_v2.py:316
def make_absolute(self, h, bin_count)
Definition: utils_v2.py:347