CMS 3D CMS Logo

mergeLHE.py
Go to the documentation of this file.
1 #!/usr/bin/env python3
2 
3 from __future__ import print_function
4 import logging
5 import argparse
6 import math
7 import glob
8 import sys
9 import os
10 import re
11 
13  """Base class of the LHE merge schemes"""
14 
15  def __init__(self, input_files, output_file):
16  self.input_files = input_files
17  self.output_file = output_file
18 
19  def merge(self):
20  """Output the merged LHE"""
21  pass
22 
23 class DefaultLHEMerger(BaseLHEMerger):
24  """Default LHE merge scheme that copies the header of the first LHE file,
25  merges and outputs the init block, then concatenates all event blocks."""
26 
27  def __init__(self, input_files, output_file, **kwargs):
28  super(DefaultLHEMerger, self).__init__(input_files, output_file)
29 
30  self.bypass_check = kwargs.get('bypass_check', False)
31  # line-by-line iterator for each input file
32  self._f = [self.file_iterator(name) for name in self.input_files]
33  self._header_str = []
34  self._is_mglo = False
35  self._xsec_combined = 0.
36  self._uwgt = 0.
37  self._init_str = [] # initiated blocks for each input file
38  self._nevent = [] # number of events for each input file
39 
40  def file_iterator(self, path):
41  """Line-by-line iterator of a txt file"""
42  with open(path, 'r') as f:
43  for line in f:
44  yield line
45 
47  """Check if all headers for input files are consistent."""
48 
49  if self.bypass_check:
50  return
51 
52  inconsistent_error_info = ("Incompatibility found in LHE headers: %s. "
53  "Use -b/--bypass-check to bypass the check.")
54  allow_diff_keys = [
55  'nevent', 'numevts', 'iseed', 'Seed', 'Random', '.log', '.dat', '.lhe',
56  'Number of Events', 'Integrated weight'
57  ]
58  self._header_lines = [header.split('\n') for header in self._header_str]
59 
60  # Iterate over header lines for all input files and check consistency
61  logging.debug('header line number: %s' \
62  % ', '.join([str(len(lines)) for lines in self._header_lines]))
63  assert all([
64  len(self._header_lines[0]) == len(lines) for lines in self._header_lines]
65  ), inconsistent_error_info % "line number not matches"
66  inconsistent_lines_set = [set() for _ in self._header_lines]
67  for line_zip in zip(*self._header_lines):
68  if any([k in line_zip[0] for k in allow_diff_keys]):
69  logging.debug('Captured \'%s\', we allow difference in this line' % line_zip[0])
70  continue
71  if not all([line_zip[0] == line for line in line_zip]):
72  # Ok so meet inconsistency in some lines, then temporarily store them
73  for i, line in enumerate(line_zip):
74  inconsistent_lines_set[i].add(line)
75  # Those inconsistent lines still match, meaning that it is only a change of order
76  assert all([inconsistent_lines_set[0] == lset for lset in inconsistent_lines_set]), \
77  inconsistent_error_info % ('{' + ', '.join(inconsistent_lines_set[0]) + '}')
78 
79  def merge_headers(self):
80  """Merge the headers of input LHEs. Need special handle for the MG5 LO case."""
81 
82  self._is_mglo = all(['MGGenerationInfo' in header for header in self._header_str])
83  if self._is_mglo and not self.bypass_check:
84  # Special handling of MadGraph5 LO LHEs
85  match_geninfo = [
86  re.search(
87  (r"<MGGenerationInfo>\s+#\s*Number of Events\s*\:\s*(\S+)\s+"
88  r"#\s*Integrated weight \(pb\)\s*\:\s*(\S+)\s+<\/MGGenerationInfo>"),
89  header
90  ) for header in self._header_str
91  ]
92  self._xsec_combined = sum(
93  [float(info.group(2)) * nevt for info, nevt in zip(match_geninfo, self._nevent)]
94  ) / sum(self._nevent)
95  geninfo_combined = ("<MGGenerationInfo>\n"
96  "# Number of Events : %d\n"
97  "# Integrated weight (pb) : %.10f\n</MGGenerationInfo>") \
98  % (sum(self._nevent), self._xsec_combined)
99  logging.info('Detected: MG5 LO LHEs. Input <MGGenerationInfo>:\n\tnevt\txsec')
100  for info, nevt in zip(match_geninfo, self._nevent):
101  logging.info('\t%d\t%.10f' % (nevt, float(info.group(2))))
102  logging.info('Combined <MGGenerationInfo>:\n\t%d\t%.10f' \
103  % (sum(self._nevent), self._xsec_combined))
104 
105  header_combined = self._header_str[0].replace(match_geninfo[0].group(), geninfo_combined)
106  return header_combined
107 
108  else:
109  # No need to merge the headers
110  return self._header_str[0]
111 
112  def merge_init_blocks(self):
113  """If all <init> blocks are identical, return the same <init> block
114  (in the case of Powheg LHEs); otherwise, calculate the output <init>
115  blocks by merging the input blocks info using formula (same with the
116  MG5LOLHEMerger scheme):
117  XSECUP = sum(xsecup * no.events) / tot.events
118  XERRUP = sqrt( sum(sigma^2 * no.events^2) ) / tot.events
119  XMAXUP = max(xmaxup)
120  """
121 
122  if self.bypass_check:
123  # If bypass the consistency check, simply use the first LHE <init>
124  # block as the output
125  return self._init_str[0]
126 
127  # Initiate collected init block info. Will be in format of
128  # {iprocess: [xsecup, xerrup, xmaxup]}
129  new_init_block = {}
130  old_init_block = [{} for _ in self._init_str]
131 
132  # Read the xsecup, xerrup, and xmaxup info from the <init> block for
133  # all input LHEs
134  for i, bl in enumerate(self._init_str): # loop over files
135  nline = int(bl.split('\n')[0].strip().split()[-1])
136 
137  # loop over lines in <init> block
138  for bl_line in bl.split('\n')[1:nline + 1]:
139  bl_line_sp = bl_line.split()
140  old_init_block[i][int(bl_line_sp[3])] = [
141  float(bl_line_sp[0]), float(bl_line_sp[1]), float(bl_line_sp[2])]
142 
143  # After reading all subprocesses info, store the rest content in
144  # <init> block for the first file
145  if i == 0:
146  info_after_subprocess = bl.strip().split('\n')[nline + 1:]
147 
148  logging.info('Input file: %s' % self.input_files[i])
149  for ipr in sorted(list(old_init_block[i].keys()), reverse=True):
150  # reverse order: follow the MG5 custom
151  logging.info(' xsecup, xerrup, xmaxup, lprup: %.6E, %.6E, %.6E, %d' \
152  % tuple(old_init_block[i][ipr] + [ipr]))
153 
154  # Adopt smarter <init> block merging method
155  # If all <init> blocks from input files are identical, return the same block;
156  # otherwise combine them based on MG5LOLHEMerger scheme
157  if all([old_init_block[i] == old_init_block[0] for i in range(len(self._f))]):
158  # All <init> blocks are identical
159  logging.info(
160  'All input <init> blocks are identical. Output the same "<init> block.')
161  return self._init_str[0]
162 
163  # Otherwise, calculate merged init block
164  for i in range(len(self._f)):
165  for ipr in old_init_block[i]:
166  # Initiate the subprocess for the new block if it is found for the
167  # first time in one input file
168  if ipr not in new_init_block:
169  new_init_block[ipr] = [0., 0., 0.]
170  new_init_block[ipr][0] += old_init_block[i][ipr][0] * self._nevent[i] # xsecup
171  new_init_block[ipr][1] += old_init_block[i][ipr][1]**2 * self._nevent[i]**2 # xerrup
172  new_init_block[ipr][2] = max(new_init_block[ipr][2], old_init_block[i][ipr][2]) # xmaxup
173  tot_nevent = sum([self._nevent[i] for i in range(len(self._f))])
174 
175  # Write first line of the <init> block (modify the nprocess at the last)
176  self._merged_init_str = self._init_str[0].split('\n')[0].strip().rsplit(' ', 1)[0] \
177  + ' ' + str(len(new_init_block)) + '\n'
178  # Form the merged init block
179  logging.info('Output file: %s' % self.output_file)
180  for ipr in sorted(list(new_init_block.keys()), reverse=True):
181  # reverse order: follow the MG5 custom
182  new_init_block[ipr][0] /= tot_nevent
183  new_init_block[ipr][1] = math.sqrt(new_init_block[ipr][1]) / tot_nevent
184  logging.info(' xsecup, xerrup, xmaxup, lprup: %.6E, %.6E, %.6E, %d' \
185  % tuple(new_init_block[ipr] + [ipr]))
186  self._merged_init_str += '%.6E %.6E %.6E %d\n' % tuple(new_init_block[ipr] + [ipr])
187  self._merged_init_str += '\n'.join(info_after_subprocess)
188  if len(info_after_subprocess):
189  self._merged_init_str += '\n'
190 
191  return self._merged_init_str
192 
193  def merge(self):
194  with open(self.output_file, 'w') as fw:
195  # Read the header for the all input files
196  for i in range(len(self._f)):
197  header = []
198  line = next(self._f[i])
199  while not re.search('\s*<init(>|\s)', line):
200  header.append(line)
201  line = next(self._f[i])
202  # 'header' includes all contents before reaches <init>
203  self._header_str.append(''.join(header))
205 
206  # Read <init> blocks for all input_files
207  for i in range(len(self._f)):
208  init = []
209  line = next(self._f[i])
210  while not re.search('\s*</init>', line):
211  init.append(line)
212  line = next(self._f[i])
213  # 'init_str' includes all contents inside <init>...</init>
214  self._init_str.append(''.join(init))
215 
216  # Iterate over all events file-by-file and write events temporarily
217  # to .tmp.lhe
218  with open('.tmp.lhe', 'w') as _fwtmp:
219  for i in range(len(self._f)):
220  nevent = 0
221  while True:
222  line = next(self._f[i])
223  if re.search('\s*</event>', line):
224  nevent += 1
225  if re.search('\s*</LesHouchesEvents>', line):
226  break
227  _fwtmp.write(line)
228  self._nevent.append(nevent)
229  self._f[i].close()
230 
231  # Merge the header and init blocks and write to the output
232  fw.write(self.merge_headers())
233  fw.write('<init>\n' + self.merge_init_blocks() + '</init>\n')
234 
235  # Write event blocks in .tmp.lhe back to the output
236  # If is MG5 LO LHE, will recalculate the weights based on combined xsec
237  # and nevent read from <MGGenerationInfo>, and the 'event_norm' mode
238  if self._is_mglo and not self.bypass_check:
239  event_norm = re.search(
240  r'\s(\w+)\s*=\s*event_norm\s',
241  self._header_str[0]).group(1)
242  if event_norm == 'sum':
243  self._uwgt = self._xsec_combined / sum(self._nevent)
244  elif event_norm == 'average':
245  self._uwgt = self._xsec_combined
246  logging.info(("MG5 LO LHE with event_norm = %s detected. Will "
247  "recalculate weights in each event block.\n"
248  "Unit weight: %+.7E") % (event_norm, self._uwgt))
249 
250  # Modify event wgt when transfering .tmp.lhe to the output file
251  event_line = -999
252  with open('.tmp.lhe', 'r') as ftmp:
253  sign = lambda x: -1 if x < 0 else 1
254  for line in ftmp:
255  event_line += 1
256  if re.search('\s*<event.*>', line):
257  event_line = 0
258  if event_line == 1:
259  # modify the XWGTUP appeared in the first line of the
260  # <event> block
261  orig_wgt = float(line.split()[2])
262  fw.write(re.sub(r'(^\s*\S+\s+\S+\s+)\S+(.+)', r'\g<1>%+.7E\g<2>' \
263  % (sign(orig_wgt) * self._uwgt), line))
264  elif re.search('\s*<wgt.*>.*</wgt>', line):
265  addi_wgt_str = re.search(r'<wgt.*>\s*(\S+)\s*<\/wgt>', line).group(1)
266  fw.write(line.replace(
267  addi_wgt_str, '%+.7E' % (float(addi_wgt_str) / orig_wgt * self._uwgt)))
268  else:
269  fw.write(line)
270  else:
271  # Simply transfer all lines
272  with open('.tmp.lhe', 'r') as ftmp:
273  for line in ftmp:
274  fw.write(line)
275  fw.write('</LesHouchesEvents>\n')
276  os.remove('.tmp.lhe')
277 
278 
280  """Use the merger script dedicated for MG5 LO LHEs, as introduced in
281  https://github.com/cms-sw/genproductions/blob/master/bin/MadGraph5_aMCatNLO/Utilities/merge.pl
282  """
283 
284  def __init__(self, input_files, output_file, **kwargs):
285  super(MG5LOLHEMerger, self).__init__(input_files, output_file)
287  'https://raw.githubusercontent.com/cms-sw/genproductions/5c1e865a6fbe3a762a28363835d9a804c9cf0dbe/bin/MadGraph5_aMCatNLO/Utilities/merge.pl'
288 
289  def merge(self):
290  logging.info(
291  ('Use the merger script in genproductions dedicated for '
292  'MadGraph5-produced LHEs'))
293  os.system('curl -s -L %s | perl - %s %s.gz banner.txt' \
294  % (self._merger_script_url, ' '.join(self.input_files), self.output_file))
295  os.system('gzip -df %s.gz' % self.output_file)
296  os.system('rm banner.txt')
297 
298 
300  """Use the external mergeLheFiles.cpp file to merge LHE files, as introduced in
301  https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideSubgroupMC#1_2_Using_pLHE_campaigns
302  """
303 
304  def __init__(self, input_files, output_file, **kwargs):
305  super(ExternalCppLHEMerger, self).__init__(input_files, output_file)
307  'https://twiki.cern.ch/twiki/bin/viewfile/CMSPublic/SWGuideSubgroupMC?filename=mergeLheFiles.cpp;rev=2'
308 
309  def merge(self):
310  logging.info(
311  ('Use the external mergeLheFiles.cpp file to merge LHE files.'))
312  os.system('curl -s -o mergeLheFiles.cpp %s' % self._merger_script_url)
313  with open('mergeLheFiles.cpp') as f:
314  script_str = f.read()
315  with open('mergeLheFiles.cpp', 'w') as fw:
316  fw.write(script_str.replace('/tmp/covarell/out.lhe', self.output_file))
317  with open('input_files.txt', 'w') as fw:
318  fw.write('\n'.join(self.input_files) + '\n')
319 
320  os.system('g++ -Wall -o mergeLheFiles mergeLheFiles.cpp')
321  os.system('./mergeLheFiles input_files.txt')
322  os.system('rm mergeLheFiles* input_files.txt')
323 
324 
325 def main(argv = None):
326  """Main routine of the script.
327 
328  Arguments:
329  - `argv`: arguments passed to the main routine
330  """
331 
332  if argv == None:
333  argv = sys.argv[1:]
334 
335  parser = argparse.ArgumentParser(
336  description=("A universal script that merges multiple LHE files for all possible conditions and in the most "
337  "natural way.\n"
338  "A detailed description of the merging step (in the default mode):\n"
339  " 1. Header:\n"
340  " a. assert consistency of the headers (allow difference for the info of e.g. #event, seed);\n"
341  " b. if not MG LO LHEs, will simply use the header from the first LHE; otherwise, reset the "
342  "<MGGenerationInfo> from the headers by merging the #event & xsec info;\n"
343  " 2. Init block: if all <init> blocks are the same, use the same as output; otherwise (the MG LO "
344  "case), merge them by recalculating the # of subprocess (LRPUP) and XSECUP, XERRUP, XMAXUP per "
345  "each subprocess.\n"
346  " 3. Event block: concatenate all event blocks. If for MG LO LHEs, recalculate the per-event "
347  "XWGTUP and all <wgt> tags based on the new XSECUP, #event, and 'event_norm' read from the MG "
348  "run card.\n"
349  "For further development of this script please always validate the merging result on the test "
350  "routines: https://github.com/colizz/mergelhe_validate\n"
351  "Example usage:\n"
352  " mergeLHE.py -i 'thread*/*.lhe,another_file/another.lhe' -o output.lhe"),
353  formatter_class=argparse.RawTextHelpFormatter)
354  parser.add_argument("-i", "--input-files", type=str,
355  help="Input LHE file paths separated by commas. Shell-type wildcards are supported.")
356  parser.add_argument("-o", "--output-file",
357  default='output.lhe', type=str,
358  help="Output LHE file path.")
359  parser.add_argument("--force-mglo-merger", action='store_true',
360  help=("Force to use the merger script dedicated for MG5 LO LHEs, as introduced in "
361  "https://github.com/cms-sw/genproductions/blob/master/bin/MadGraph5_aMCatNLO/Utilities/merge.pl"))
362  parser.add_argument("--force-cpp-merger", action='store_true',
363  help=("Force to use the external mergeLheFiles.cpp file to merge LHE files, as introduced in "
364  "https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideSubgroupMC#1_2_Using_pLHE_campaigns"))
365  parser.add_argument("-b", "--bypass-check", action='store_true',
366  help=("Bypass the compatibility check for the headers. If true, the header and init block "
367  "will be just a duplicate from the first input file, and events are concatenated without "
368  "modification."))
369  parser.add_argument("--debug", action='store_true',
370  help="Use the debug mode.")
371  args = parser.parse_args(argv)
372 
373  logging.basicConfig(
374  format='[%(levelname)s] %(message)s',
375  level=logging.INFO if not args.debug else DEBUG)
376  logging.info('>>> launch mergeLHE.py in %s' % os.path.abspath(os.getcwd()))
377 
378  # Extract input LHE files from the path
379  assert len(args.input_files), \
380  ('Please specify your input LHE files by -i/--input-files. '
381  'Run \'mergeLHE.py -h\' for details.')
382  input_files = [] # each individual input file
383  for path in args.input_files.split(','):
384  find_files = glob.glob(path)
385  if len(find_files) == 0:
386  logging.info('Warning: cannot find files in %s' % path)
387  input_files += find_files
388  input_files.sort()
389  logging.info('>>> Merge %d files: [%s]' % (len(input_files), ', '.join(input_files)))
390  logging.info('>>> Write to output: %s ' % args.output_file)
391 
392  if not os.path.exists(os.path.dirname(os.path.realpath(args.output_file))):
393  os.makedirs(os.path.dirname(os.path.realpath(args.output_file)))
394 
395  # Check arguments
396  assert len(input_files) > 0, 'Input LHE files should be more than 0.'
397  if len(input_files) == 1:
398  logging.warning('Input LHE only has 1 file. Will copy this file to the destination.')
399  import shutil
400  shutil.copy(input_files[0], args.output_file)
401  return
402  assert [args.force_mglo_merger, args.force_cpp_merger].count(True) <= 1, \
403  "Can only specify at most one from --force-mglo-merger or --force-cpp-merger."
404 
405  # Determine the merging scheme
406  if args.force_mglo_merger:
407  lhe_merger = MG5LOLHEMerger(input_files, args.output_file)
408  elif args.force_cpp_merger:
409  lhe_merger = ExternalCppLHEMerger(input_files, args.output_file)
410  else:
411  lhe_merger = DefaultLHEMerger(input_files, args.output_file, bypass_check=args.bypass_check)
412 
413  # Do merging
414  lhe_merger.merge()
415 
416 
417 if __name__=="__main__":
418  main()
def __init__(self, input_files, output_file)
Definition: mergeLHE.py:15
def merge_init_blocks(self)
Definition: mergeLHE.py:112
def main(argv=None)
Definition: mergeLHE.py:325
def all(container)
workaround iterator generators for ROOT classes
Definition: cmstools.py:25
bool any(const std::vector< T > &v, const T &what)
Definition: ECalSD.cc:37
def check_header_compatibility(self)
Definition: mergeLHE.py:46
def replace(string, replacements)
def __init__(self, input_files, output_file, kwargs)
Definition: mergeLHE.py:284
OutputIterator zip(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp)
def __init__(self, input_files, output_file, kwargs)
Definition: mergeLHE.py:304
static std::string join(char **cmd)
Definition: RemoteFile.cc:19
def __init__(self, input_files, output_file, kwargs)
Definition: mergeLHE.py:27
void add(std::map< std::string, TH1 *> &h, TH1 *hist)
Definition: main.py:1
def file_iterator(self, path)
Definition: mergeLHE.py:40
#define str(s)