CMS 3D CMS Logo

compareHistograms.py
Go to the documentation of this file.
1 #!/bin/env python3
2 
3 from __future__ import print_function
4 import ROOT
5 ROOT.PyConfig.IgnoreCommandLineOptions = True
6 import os
7 import sys
8 import argparse
9 import numpy as np
10 from DQMServices.FileIO.blacklist import get_blacklist
11 import multiprocessing
12 
def create_dif(base_file_path, pr_file_path, pr_number, test_number, cmssw_version, num_processes, output_dir_path):
    """Compare two DQM ROOT files and write the elements that differ.

    Both files are flattened into ``{path tuple: object}`` dictionaries.
    Elements found in both files are compared (in ``num_processes`` + 1 worker
    processes when ``num_processes > 1``, otherwise in-process); elements found
    in only one file are always kept.  The selected elements are written to
    ``<output_dir_path>/base/<name>`` and ``<output_dir_path>/pr/<name>``, and
    a short summary (output file names and changed/removed/added counts) is
    printed to stdout.

    Args:
        base_file_path: Path of the baseline ROOT file.
        pr_file_path: Path of the PR ROOT file; the run number and the output
            file names are derived from it.
        pr_number: PR number, embedded in the output file names.
        test_number: Test number, embedded in the output file names.
        cmssw_version: Release name, embedded in the output file names.
        num_processes: Worker count; values <= 1 select the in-process path.
        output_dir_path: Directory under which ``base/`` and ``pr/`` are written.

    Returns:
        None.  If either input file cannot be opened, a message is printed to
        stderr and the function returns early.
    """
    base_file = ROOT.TFile(base_file_path, 'read')
    ROOT.gROOT.GetListOfFiles().Remove(base_file)

    pr_file = ROOT.TFile(pr_file_path, 'read')
    ROOT.gROOT.GetListOfFiles().Remove(pr_file)

    if base_file.IsOpen():
        print('Baseline file successfully opened', file=sys.stderr)
    else:
        print('Unable to open base file', file=sys.stderr)
        # Both files were opened above: release the PR file before bailing out.
        if pr_file.IsOpen():
            pr_file.Close()
        return

    if pr_file.IsOpen():
        print('PR file successfully opened', file=sys.stderr)
    else:
        print('Unable to open PR file', file=sys.stderr)
        # The baseline file is known to be open here: do not leak it.
        base_file.Close()
        return

    run_nr = get_run_nr(pr_file_path)

    # Get list of paths (lists of directories)
    base_flat_dict = flatten_file(base_file, run_nr)
    pr_flat_dict = flatten_file(pr_file, run_nr)

    # Paths that appear in both baseline and PR data. (Intersection)
    shared_paths = list(set(pr_flat_dict).intersection(set(base_flat_dict)))

    # Paths that appear only in PR data. (Except)
    only_pr_paths = list(set(pr_flat_dict).difference(set(base_flat_dict)))

    # Paths that appear only in baseline data. (Except)
    only_base_paths = list(set(base_flat_dict).difference(set(pr_flat_dict)))

    # Histograms pointed to by these paths will be written to baseline output
    paths_to_save_in_base = []

    # Histograms pointed to by these paths will be written to pr output
    paths_to_save_in_pr = []

    # Make comparison
    if num_processes > 1:
        print("starting comparison using %d process(es)" % num_processes)
        manager = multiprocessing.Manager()
        return_dict = manager.dict()

        # num_processes equally sized slices, plus one trailing slice that
        # picks up the remainder of the integer division.
        block = len(shared_paths) // num_processes
        slices = [shared_paths[i * block:(i + 1) * block] for i in range(num_processes)]
        slices.append(shared_paths[num_processes * block:])

        workers = []
        for index, chunk in enumerate(slices):
            worker = multiprocessing.Process(target=compareMP, args=(chunk, pr_flat_dict, base_flat_dict, index, return_dict))
            workers.append(worker)
            worker.start()

        for index, worker in enumerate(workers):
            worker.join()
            # Each worker publishes its result under its own index.
            paths_to_save_in_pr.extend(return_dict[index]['pr'])
            paths_to_save_in_base.extend(return_dict[index]['base'])

        paths_to_save_in_pr.sort()
        paths_to_save_in_base.sort()
        print("Done")
    else:
        compare(shared_paths, pr_flat_dict, base_flat_dict, paths_to_save_in_pr, paths_to_save_in_base)

    # Collect paths that have to be written to baseline output file.
    # NOTE(review): `== None` is kept on purpose instead of `is None`;
    # presumably null ROOT object proxies compare equal to None without being
    # the None singleton - confirm before changing.
    for path in only_base_paths:
        if base_flat_dict[path] == None:
            continue
        paths_to_save_in_base.append(path)

    # Collect paths that have to be written to PR output file
    for path in only_pr_paths:
        if pr_flat_dict[path] == None:
            continue
        paths_to_save_in_pr.append(path)

    base_output_filename = get_output_filename(pr_file_path, pr_number, test_number, cmssw_version, False)
    pr_output_filename = get_output_filename(pr_file_path, pr_number, test_number, cmssw_version, True)

    # Write baseline output
    save_paths(base_flat_dict, paths_to_save_in_base, os.path.join(output_dir_path, 'base', base_output_filename))

    # Write PR output
    save_paths(pr_flat_dict, paths_to_save_in_pr, os.path.join(output_dir_path, 'pr', pr_output_filename))

    pr_file.Close()
    base_file.Close()

    # Info about changed, added and removed elements
    nr_of_changed_elements = len(set(paths_to_save_in_base).intersection(set(paths_to_save_in_pr)))
    nr_of_removed_elements = len(paths_to_save_in_base) - nr_of_changed_elements
    nr_of_added_elements = len(paths_to_save_in_pr) - nr_of_changed_elements

    print('Base output file. PR output file. Changed elements, removed elements, added elements:')
    print(base_output_filename)
    print(pr_output_filename)
    print('%s %s %s' % (nr_of_changed_elements, nr_of_removed_elements, nr_of_added_elements))
122 
def compareMP(shared_paths, pr_flat_dict, base_flat_dict, iProc, return_dict):
    """Worker entry point: compare one slice of the shared paths.

    The comparison itself is delegated to ``compare`` so that the
    multi-process and the in-process code paths always apply the same rules.

    Args:
        shared_paths: Paths (present in both dictionaries) to compare.
        pr_flat_dict: Mapping of path to the PR-file object.
        base_flat_dict: Mapping of path to the baseline-file object.
        iProc: Key under which the result is stored in ``return_dict``.
        return_dict: Shared mapping; receives
            ``{'pr': [paths], 'base': [paths]}`` under ``iProc``.
    """
    # Prepare output dictionary
    comparisons = {'pr': [], 'base': []}

    # compare() appends every differing path to both lists.
    compare(shared_paths, pr_flat_dict, base_flat_dict, comparisons['pr'], comparisons['base'])

    # Publish once, after the whole slice has been processed.
    return_dict[iProc] = comparisons
161 
def _both_inherit_from(first, second, class_name):
    """Return True when both objects report inheriting from ``class_name``."""
    return first.InheritsFrom(class_name) and second.InheritsFrom(class_name)


def compare(shared_paths, pr_flat_dict, base_flat_dict, paths_to_save_in_pr, paths_to_save_in_base):
    """Append every path whose PR and baseline objects differ to both lists.

    Profiles are compared with ``compare_TProfile``, other histograms bin by
    bin through numpy, and everything else with ``!=``.  Paths for which
    either object compares equal to ``None`` are skipped.

    Args:
        shared_paths: Paths present in both dictionaries.
        pr_flat_dict: Mapping of path to the PR-file object.
        base_flat_dict: Mapping of path to the baseline-file object.
        paths_to_save_in_pr: Output list, extended in place.
        paths_to_save_in_base: Output list, extended in place.
    """
    for path in shared_paths:
        new_item = pr_flat_dict[path]
        ref_item = base_flat_dict[path]

        if new_item == None or ref_item == None:
            continue

        if _both_inherit_from(new_item, ref_item, 'TProfile2D') or _both_inherit_from(new_item, ref_item, 'TProfile'):
            # Profiles: content, entries and errors all have to agree
            changed = not compare_TProfile(new_item, ref_item)
        elif _both_inherit_from(new_item, ref_item, 'TH1'):
            # Plain histograms: compare bin by bin
            new_bins = np.array(new_item)
            ref_bins = np.array(ref_item)
            changed = not (new_bins.shape == ref_bins.shape and np.allclose(new_bins, ref_bins, equal_nan=True))
        else:
            # Non histograms
            changed = new_item != ref_item

        if changed:
            paths_to_save_in_pr.append(path)
            paths_to_save_in_base.append(path)
196 
# Returns False if different, True otherwise
def compare_TProfile(pr_item, base_item):
    """Tell whether two profiles agree in size and in every cell.

    For each index below ``GetSize()`` the bin content, bin entries and bin
    error are compared with ``np.isclose`` (NaN is treated as equal to NaN).
    """
    n_cells = pr_item.GetSize()
    if n_cells != base_item.GetSize():
        return False

    for cell in range(n_cells):
        observables = (
            (pr_item.GetBinContent(cell), base_item.GetBinContent(cell)),
            (pr_item.GetBinEntries(cell), base_item.GetBinEntries(cell)),
            (pr_item.GetBinError(cell), base_item.GetBinError(cell)),
        )
        if not all(np.isclose(new, ref, equal_nan=True) for new, ref in observables):
            return False

    return True
222 
def flatten_file(file, run_nr):
    """Flatten a ROOT file into a ``{path tuple: object}`` dictionary.

    Every top-level key is read and walked with ``traverse_till_end``.  The
    walk is best-effort: a top-level key that fails is reported on stderr and
    skipped (entries collected before the failure are kept), so one broken
    subtree does not abort the whole comparison.

    Args:
        file: Object exposing ``GetListOfKeys()``.
        run_nr: Run number, forwarded to ``traverse_till_end``.

    Returns:
        dict mapping path tuples to the objects found at those paths.
    """
    result = {}
    for key in file.GetListOfKeys():
        try:
            traverse_till_end(key.ReadObj(), [], result, run_nr)
        except Exception as err:
            # Catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit still propagate, and leave a trace of what was skipped.
            print('Skipping top-level key that could not be traversed: %s' % err, file=sys.stderr)

    return result
232 
def traverse_till_end(node, dirs_list, result, run_nr):
    """Recursively collect the leaves below ``node`` into ``result``.

    Nodes exposing ``GetListOfKeys`` are descended into; any other node is
    stored under the tuple of names leading to it, unless that path is
    blacklisted for ``run_nr``.
    """
    path_parts = dirs_list + [get_node_name(node)]

    if not hasattr(node, 'GetListOfKeys'):
        # Leaf: record it unless the blacklist excludes it
        if not is_blacklisted(path_parts, run_nr):
            result[tuple(path_parts)] = node
        return

    for key in node.GetListOfKeys():
        traverse_till_end(key.ReadObj(), path_parts, result, run_nr)
242 
def get_node_name(node):
    """Return the name under which ``node`` is stored in a flattened path.

    For ``TObjString`` nodes the name is the tag name taken from
    ``<name>value</name>`` with the string suffix appended; for every other
    node it is ``GetName()`` unchanged.
    """
    if not node.InheritsFrom('TObjString'):
        return node.GetName()

    # Keep the text before the first '>' and drop its leading '<'
    opening_tag = node.GetName().partition('>')[0]
    return opening_tag[1:] + get_string_suffix()
250 
def get_string_suffix():
    """Return the suffix appended to the names of string monitor elements."""
    return '_string_monitor_element'
253 
def is_blacklisted(dirs_list, run_nr):
    """Tell whether the element at ``dirs_list`` is blacklisted for ``run_nr``.

    The string suffix, if present at the end of the last path component, is
    removed before the lookup.  Only that trailing occurrence is stripped; any
    other occurrence of the same text inside the name is left alone.
    ``dirs_list`` itself is not modified.

    Args:
        dirs_list: Non-empty list of path components.
        run_nr: Run number passed to ``get_blacklist``.

    Returns:
        True if the resulting path tuple is contained in the blacklist.
    """
    suffix = get_string_suffix()
    leaf = dirs_list[-1]
    if leaf.endswith(suffix):
        # Slice instead of str.replace(): replace() would also remove
        # occurrences of the suffix text in the middle of the name.
        leaf = leaf[:len(leaf) - len(suffix)]

    lookup_path = tuple(dirs_list[:-1]) + (leaf,)
    return lookup_path in get_blacklist(run_nr)
262 
def save_paths(flat_dict, paths, result_file_path):
    """Write the objects selected by ``paths`` into a new ROOT file.

    Nothing is written (and no directory is created) when ``paths`` is empty.
    Otherwise the parent directory of ``result_file_path`` is created if
    needed, the file is (re)created and every selected object is saved with
    ``save_to_file``.  Progress and failures are reported on stderr.

    Args:
        flat_dict: Mapping of path tuple to object.
        paths: Paths (keys of ``flat_dict``) to write.
        result_file_path: Destination ROOT file.
    """
    if not paths:
        print('No differences were observed - output will not be written', file=sys.stderr)
        return

    # Make sure output dir exists.  exist_ok avoids the check-then-create race
    # when several comparisons share one output directory; the guard skips
    # os.makedirs('') for a bare file name.
    result_dir = os.path.dirname(result_file_path)
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)

    result_file = ROOT.TFile(result_file_path, 'recreate')
    ROOT.gROOT.GetListOfFiles().Remove(result_file)

    if not result_file.IsOpen():
        print('Unable to open %s output file' % result_file_path, file=sys.stderr)
        return

    for path in paths:
        save_to_file(flat_dict, path, result_file)

    result_file.Close()
    print('Output written to %s file' % result_file_path, file=sys.stderr)
285 
# Saves file from flat_dict in the same dir of currently open file for writing
def save_to_file(flat_dict, path, output_file):
    """Write ``flat_dict[path]`` into ``output_file`` under its directories.

    All components of ``path`` except the last one are treated as nested
    directories and are created on demand; the last component is the element
    itself and needs no directory.
    """
    element = flat_dict[path]

    target_dir = output_file
    parent_names = path[:-1]
    for dir_name in parent_names:
        target_dir = create_dir(target_dir, dir_name)

    # Make the innermost directory current before writing
    target_dir.cd()
    element.Write()
298 
# Create dir in root file if it doesn't exist
def create_dir(parent_dir, name):
    """Return the entry ``name`` of ``parent_dir``, creating it when missing.

    ``parent_dir.Get(name)`` is returned when it is truthy; otherwise the
    result of ``parent_dir.mkdir(name)`` is returned.
    """
    existing = parent_dir.Get(name)
    return existing if existing else parent_dir.mkdir(name)
305 
def get_output_filename(input_file_path, pr_number, test_number, cmssw_version, isPr):
    """Build the name of a comparison output file.

    Samples of correct output file format:
      DQM_V0001_R000320822__wf136_892_pr__CMSSW_10_4_0_pre3-PR25518-1234__DQMIO.root
    When run number is 1 the RelVal naming pattern is used:
      DQM_V0002_R000000001__RelVal_wf136_892_pr__CMSSW_10_4_0_pre3-PR25518-1234__DQMIO.root

    The run is the third '_'-separated token of the input file name.  The
    workflow is the part of the parent directory name before the first '_',
    with '.' replaced by '_', or 'Unknown' when that part is empty.
    """
    run = os.path.basename(input_file_path).split('_')[2]

    parent_dir_name = os.path.basename(os.path.dirname(input_file_path))
    workflow = parent_dir_name.split('_')[0].replace('.', '_') or 'Unknown'

    relval_prefix = 'RelVal_' if run == 'R000000001' else ''
    variant = 'pr' if isPr else 'base'

    return 'DQM_V0001_%s__%swf%s_%s__%s-PR%s-%s__DQMIO.root' % (run, relval_prefix, workflow, variant, cmssw_version, pr_number, test_number)
328 
def get_run_nr(file_path):
    """Return the run number encoded in a DQM file name, as a string.

    The third '_'-separated token of the file name is taken and its leading
    'R' characters, then its leading zeros, are stripped.
    """
    run_token = os.path.basename(file_path).split('_')[2]
    digits = run_token.lstrip('R')
    return digits.lstrip('0')
331 
if __name__ == '__main__':
    # CMSSW_VERSION only provides the default for --release-format.  Looking
    # it up with .get() keeps the tool usable outside a CMSSW environment as
    # long as -r is given; when the variable is unset, -r becomes mandatory.
    default_release = os.environ.get('CMSSW_VERSION')

    # The two sentences of the description are separate string literals, so
    # the first one needs its own trailing space.
    parser = argparse.ArgumentParser(description="This tool compares DQM monitor elements found in base-file with the ones found in pr-file. "
        "Comparison is done bin by bin and output is written to a root file containing only the changes.")
    parser.add_argument('-b', '--base-file', help='Baseline IB DQM root file', required=True)
    parser.add_argument('-p', '--pr-file', help='PR DQM root file', required=True)
    parser.add_argument('-n', '--pr-number', help='PR number under test', default='00001')
    parser.add_argument('-t', '--test-number', help='Unique test number to distinguish different comparisons of the same PR.', default='1')
    parser.add_argument('-r', '--release-format', help='Release format in this format: CMSSW_10_5_X_2019-02-17-0000', default=default_release, required=default_release is None)
    parser.add_argument('-j', '--num-processes', help='Number of processes forked to parallel process the comparison', default=1, type=int)
    parser.add_argument('-o', '--output-dir', help='Comparison root files output directory', default='dqmHistoComparisonOutput')
    args = parser.parse_args()

    # Keep only the first four '_'-separated tokens of the release name
    cmssw_version = '_'.join(args.release_format.split('_')[:4])

    create_dif(args.base_file, args.pr_file, args.pr_number, args.test_number, cmssw_version, args.num_processes, args.output_dir)
def get_output_filename(input_file_path, pr_number, test_number, cmssw_version, isPr)
def save_to_file(flat_dict, path, output_file)
def get_blacklist(RUN_NR)
Definition: blacklist.py:2
def replace(string, replacements)
def flatten_file(file, run_nr)
def traverse_till_end(node, dirs_list, result, run_nr)
void print(TMatrixD &m, const char *label=nullptr, bool mathematicaFormat=false)
Definition: Utilities.cc:47
def is_blacklisted(dirs_list, run_nr)
def create_dir(parent_dir, name)
static std::string join(char **cmd)
Definition: RemoteFile.cc:21
def create_dif(base_file_path, pr_file_path, pr_number, test_number, cmssw_version, num_processes, output_dir_path)
def compare_TProfile(pr_item, base_item)
def save_paths(flat_dict, paths, result_file_path)
def compareMP(shared_paths, pr_flat_dict, base_flat_dict, iProc, return_dict)
def compare(shared_paths, pr_flat_dict, base_flat_dict, paths_to_save_in_pr, paths_to_save_in_base)
def get_run_nr(file_path)