CMS 3D CMS Logo

mergeLHE.py
Go to the documentation of this file.
1 #!/usr/bin/env python3
2 
3 from __future__ import print_function
4 import logging
5 import argparse
6 import math
7 import glob
8 import sys
9 import os
10 import re
11 
12 import addLHEnumbers
13 
14 
16  """Base class of the LHE merge schemes"""
17 
18  def __init__(self, input_files, output_file):
19  self.input_files = input_files
20  self.output_file = output_file
21 
22  def merge(self):
23  """Output the merged LHE"""
24  pass
25 
26 class DefaultLHEMerger(BaseLHEMerger):
27  """Default LHE merge scheme that copies the header of the first LHE file,
28  merges and outputs the init block, then concatenates all event blocks."""
29 
30  def __init__(self, input_files, output_file, **kwargs):
31  super(DefaultLHEMerger, self).__init__(input_files, output_file)
32 
33  self.bypass_check = kwargs.get('bypass_check', False)
34  # line-by-line iterator for each input file
35  self._f = [self.file_iterator(name) for name in self.input_files]
36  self._header_str = []
37  self._is_mglo = False
38  self._xsec_combined = 0.
39  self._uwgt = 0.
40  self._init_str = [] # initiated blocks for each input file
41  self._nevent = [] # number of events for each input file
42 
43  def file_iterator(self, path):
44  """Line-by-line iterator of a txt file"""
45  with open(path, 'r') as f:
46  for line in f:
47  yield line
48 
50  """Check if all headers for input files are consistent."""
51 
52  if self.bypass_check:
53  return
54 
55  inconsistent_error_info = ("Incompatibility found in LHE headers: %s. "
56  "Use -b/--bypass-check to bypass the check.")
57  allow_diff_keys = [
58  'nevent', 'numevts', 'iseed', 'Seed', 'Random', '.log', '.dat', '.lhe',
59  'Number of Events', 'Integrated weight'
60  ]
61  self._header_lines = [header.split('\n') for header in self._header_str]
62 
63  # Iterate over header lines for all input files and check consistency
64  logging.debug('header line number: %s' \
65  % ', '.join([str(len(lines)) for lines in self._header_lines]))
66  assert all([
67  len(self._header_lines[0]) == len(lines) for lines in self._header_lines]
68  ), inconsistent_error_info % "line number does not match"
69  inconsistent_lines_set = [set() for _ in self._header_lines]
70  for line_zip in zip(*self._header_lines):
71  if any([k in line_zip[0] for k in allow_diff_keys]):
72  logging.debug('Captured \'%s\', we allow difference in this line' % line_zip[0])
73  continue
74  if not all([line_zip[0] == line for line in line_zip]):
75  # Ok so meet inconsistency in some lines, then temporarily store them
76  for i, line in enumerate(line_zip):
77  inconsistent_lines_set[i].add(line)
78  # Those inconsistent lines still match, meaning that it is only a change of order
79  assert all([inconsistent_lines_set[0] == lset for lset in inconsistent_lines_set]), \
80  inconsistent_error_info % ('{' + ', '.join(inconsistent_lines_set[0]) + '}')
81 
82  def merge_headers(self):
83  """Merge the headers of input LHEs. Need special handle for the MG5 LO case."""
84 
85  self._is_mglo = all(['MGGenerationInfo' in header for header in self._header_str])
86  if self._is_mglo and not self.bypass_check:
87  # Special handling of MadGraph5 LO LHEs
88  match_geninfo = [
89  re.search(
90  (r"<MGGenerationInfo>\s+#\s*Number of Events\s*\:\s*(\S+)\s+"
91  r"#\s*Integrated weight \(pb\)\s*\:\s*(\S+)\s+<\/MGGenerationInfo>"),
92  header
93  ) for header in self._header_str
94  ]
95  self._xsec_combined = sum(
96  [float(info.group(2)) * nevt for info, nevt in zip(match_geninfo, self._nevent)]
97  ) / sum(self._nevent)
98  geninfo_combined = ("<MGGenerationInfo>\n"
99  "# Number of Events : %d\n"
100  "# Integrated weight (pb) : %.10f\n</MGGenerationInfo>") \
101  % (sum(self._nevent), self._xsec_combined)
102  logging.info('Detected: MG5 LO LHEs. Input <MGGenerationInfo>:\n\tnevt\txsec')
103  for info, nevt in zip(match_geninfo, self._nevent):
104  logging.info('\t%d\t%.10f' % (nevt, float(info.group(2))))
105  logging.info('Combined <MGGenerationInfo>:\n\t%d\t%.10f' \
106  % (sum(self._nevent), self._xsec_combined))
107 
108  header_combined = self._header_str[0].replace(match_geninfo[0].group(), geninfo_combined)
109  return header_combined
110 
111  else:
112  # No need to merge the headers
113  return self._header_str[0]
114 
115  def merge_init_blocks(self):
116  """If all <init> blocks are identical, return the same <init> block
117  (in the case of Powheg LHEs); otherwise, calculate the output <init>
118  blocks by merging the input blocks info using formula (same with the
119  MG5LOLHEMerger scheme):
120  XSECUP = sum(xsecup * no.events) / tot.events
121  XERRUP = sqrt( sum(sigma^2 * no.events^2) ) / tot.events
122  XMAXUP = max(xmaxup)
123  """
124 
125  if self.bypass_check:
126  # If bypass the consistency check, simply use the first LHE <init>
127  # block as the output
128  return self._init_str[0]
129 
130  # Initiate collected init block info. Will be in format of
131  # {iprocess: [xsecup, xerrup, xmaxup]}
132  new_init_block = {}
133  old_init_block = [{} for _ in self._init_str]
134 
135  # Read the xsecup, xerrup, and xmaxup info from the <init> block for
136  # all input LHEs
137  for i, bl in enumerate(self._init_str): # loop over files
138  nline = int(bl.split('\n')[0].strip().split()[-1])
139 
140  # loop over lines in <init> block
141  for bl_line in bl.split('\n')[1:nline + 1]:
142  bl_line_sp = bl_line.split()
143  old_init_block[i][int(bl_line_sp[3])] = [
144  float(bl_line_sp[0]), float(bl_line_sp[1]), float(bl_line_sp[2])]
145 
146  # After reading all subprocesses info, store the rest content in
147  # <init> block for the first file
148  if i == 0:
149  info_after_subprocess = bl.strip().split('\n')[nline + 1:]
150 
151  logging.info('Input file: %s' % self.input_files[i])
152  for ipr in sorted(list(old_init_block[i].keys()), reverse=True):
153  # reverse order: follow the MG5 custom
154  logging.info(' xsecup, xerrup, xmaxup, lprup: %.6E, %.6E, %.6E, %d' \
155  % tuple(old_init_block[i][ipr] + [ipr]))
156 
157  # Adopt smarter <init> block merging method
158  # If all <init> blocks from input files are identical, return the same block;
159  # otherwise combine them based on MG5LOLHEMerger scheme
160  if all([old_init_block[i] == old_init_block[0] for i in range(len(self._f))]):
161  # All <init> blocks are identical
162  logging.info(
163  'All input <init> blocks are identical. Output the same "<init> block.')
164  return self._init_str[0]
165 
166  # Otherwise, calculate merged init block
167  for i in range(len(self._f)):
168  for ipr in old_init_block[i]:
169  # Initiate the subprocess for the new block if it is found for the
170  # first time in one input file
171  if ipr not in new_init_block:
172  new_init_block[ipr] = [0., 0., 0.]
173  new_init_block[ipr][0] += old_init_block[i][ipr][0] * self._nevent[i] # xsecup
174  new_init_block[ipr][1] += old_init_block[i][ipr][1]**2 * self._nevent[i]**2 # xerrup
175  new_init_block[ipr][2] = max(new_init_block[ipr][2], old_init_block[i][ipr][2]) # xmaxup
176  tot_nevent = sum([self._nevent[i] for i in range(len(self._f))])
177 
178  # Write first line of the <init> block (modify the nprocess at the last)
179  self._merged_init_str = self._init_str[0].split('\n')[0].strip().rsplit(' ', 1)[0] \
180  + ' ' + str(len(new_init_block)) + '\n'
181  # Form the merged init block
182  logging.info('Output file: %s' % self.output_file)
183  for ipr in sorted(list(new_init_block.keys()), reverse=True):
184  # reverse order: follow the MG5 custom
185  new_init_block[ipr][0] /= tot_nevent
186  new_init_block[ipr][1] = math.sqrt(new_init_block[ipr][1]) / tot_nevent
187  logging.info(' xsecup, xerrup, xmaxup, lprup: %.6E, %.6E, %.6E, %d' \
188  % tuple(new_init_block[ipr] + [ipr]))
189  self._merged_init_str += '%.6E %.6E %.6E %d\n' % tuple(new_init_block[ipr] + [ipr])
190  self._merged_init_str += '\n'.join(info_after_subprocess)
191  if len(info_after_subprocess):
192  self._merged_init_str += '\n'
193 
194  return self._merged_init_str
195 
196  def merge(self):
197  with open(self.output_file, 'w') as fw:
198  # Read the header for the all input files
199  for i in range(len(self._f)):
200  header = []
201  line = next(self._f[i])
202  while not re.search('\s*<init(>|\s)', line):
203  header.append(line)
204  line = next(self._f[i])
205  # 'header' includes all contents before reaches <init>
206  self._header_str.append(''.join(header))
208 
209  # Read <init> blocks for all input_files
210  for i in range(len(self._f)):
211  init = []
212  line = next(self._f[i])
213  while not re.search('\s*</init>', line):
214  init.append(line)
215  line = next(self._f[i])
216  # 'init_str' includes all contents inside <init>...</init>
217  self._init_str.append(''.join(init))
218 
219  # Iterate over all events file-by-file and write events temporarily
220  # to .tmp.lhe
221  with open('.tmp.lhe', 'w') as _fwtmp:
222  for i in range(len(self._f)):
223  nevent = 0
224  while True:
225  line = next(self._f[i])
226  if re.search('\s*</event>', line):
227  nevent += 1
228  if re.search('\s*</LesHouchesEvents>', line):
229  break
230  _fwtmp.write(line)
231  self._nevent.append(nevent)
232  self._f[i].close()
233 
234  # Merge the header and init blocks and write to the output
235  fw.write(self.merge_headers())
236  fw.write('<init>\n' + self.merge_init_blocks() + '</init>\n')
237 
238  # Write event blocks in .tmp.lhe back to the output
239  # If is MG5 LO LHE, will recalculate the weights based on combined xsec
240  # and nevent read from <MGGenerationInfo>, and the 'event_norm' mode
241  if self._is_mglo and not self.bypass_check:
242  event_norm = re.search(
243  r'\s(\w+)\s*=\s*event_norm\s',
244  self._header_str[0]).group(1)
245  if event_norm == 'sum':
246  self._uwgt = self._xsec_combined / sum(self._nevent)
247  elif event_norm == 'average':
248  self._uwgt = self._xsec_combined
249  logging.info(("MG5 LO LHE with event_norm = %s detected. Will "
250  "recalculate weights in each event block.\n"
251  "Unit weight: %+.7E") % (event_norm, self._uwgt))
252 
253  # Modify event wgt when transfering .tmp.lhe to the output file
254  event_line = -999
255  with open('.tmp.lhe', 'r') as ftmp:
256  sign = lambda x: -1 if x < 0 else 1
257  for line in ftmp:
258  event_line += 1
259  if re.search('\s*<event.*>', line) and not re.search('\s*<event_num.*>', line):
260  event_line = 0
261  if event_line == 1:
262  # modify the XWGTUP appeared in the first line of the
263  # <event> block
264  orig_wgt = float(line.split()[2])
265  fw.write(re.sub(r'(^\s*\S+\s+\S+\s+)\S+(.+)', r'\g<1>%+.7E\g<2>' \
266  % (sign(orig_wgt) * self._uwgt), line))
267  elif re.search('\s*<wgt.*>.*</wgt>', line):
268  addi_wgt_str = re.search(r'<wgt.*>\s*(\S+)\s*<\/wgt>', line).group(1)
269  fw.write(line.replace(
270  addi_wgt_str, '%+.7E' % (float(addi_wgt_str) / orig_wgt * self._uwgt)))
271  else:
272  fw.write(line)
273  else:
274  # Simply transfer all lines
275  with open('.tmp.lhe', 'r') as ftmp:
276  for line in ftmp:
277  fw.write(line)
278  fw.write('</LesHouchesEvents>\n')
279  os.remove('.tmp.lhe')
280 
281 
283  """Use the merger script dedicated for MG5 LO LHEs, as introduced in
284  https://github.com/cms-sw/genproductions/blob/master/bin/MadGraph5_aMCatNLO/Utilities/merge.pl
285  """
286 
287  def __init__(self, input_files, output_file, **kwargs):
288  super(MG5LOLHEMerger, self).__init__(input_files, output_file)
290  'https://raw.githubusercontent.com/cms-sw/genproductions/5c1e865a6fbe3a762a28363835d9a804c9cf0dbe/bin/MadGraph5_aMCatNLO/Utilities/merge.pl'
291 
292  def merge(self):
293  logging.info(
294  ('Use the merger script in genproductions dedicated for '
295  'MadGraph5-produced LHEs'))
296  os.system('curl -s -L %s | perl - %s %s.gz banner.txt' \
297  % (self._merger_script_url, ' '.join(self.input_files), self.output_file))
298  os.system('gzip -df %s.gz' % self.output_file)
299  os.system('rm banner.txt')
300 
301 
303  """Use the external mergeLheFiles.cpp file to merge LHE files, as introduced in
304  https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideSubgroupMC#1_2_Using_pLHE_campaigns
305  """
306 
307  def __init__(self, input_files, output_file, **kwargs):
308  super(ExternalCppLHEMerger, self).__init__(input_files, output_file)
310  'https://twiki.cern.ch/twiki/bin/viewfile/CMSPublic/SWGuideSubgroupMC?filename=mergeLheFiles.cpp;rev=2'
311 
312  def merge(self):
313  logging.info(
314  ('Use the external mergeLheFiles.cpp file to merge LHE files.'))
315  os.system('curl -s -o mergeLheFiles.cpp %s' % self._merger_script_url)
316  with open('mergeLheFiles.cpp') as f:
317  script_str = f.read()
318  with open('mergeLheFiles.cpp', 'w') as fw:
319  fw.write(script_str.replace('/tmp/covarell/out.lhe', self.output_file))
320  with open('input_files.txt', 'w') as fw:
321  fw.write('\n'.join(self.input_files) + '\n')
322 
323  os.system('g++ -Wall -o mergeLheFiles mergeLheFiles.cpp')
324  os.system('./mergeLheFiles input_files.txt')
325  os.system('rm mergeLheFiles* input_files.txt')
326 
327 
328 def main(argv = None):
329  """Main routine of the script.
330 
331  Arguments:
332  - `argv`: arguments passed to the main routine
333  """
334 
335  if argv == None:
336  argv = sys.argv[1:]
337 
338  parser = argparse.ArgumentParser(
339  description=("A universal script that merges multiple LHE files for all possible conditions and in the most "
340  "natural way.\n"
341  "A detailed description of the merging step (in the default mode):\n"
342  " 1. Header:\n"
343  " a. assert consistency of the headers (allow difference for the info of e.g. #event, seed);\n"
344  " b. if not MG LO LHEs, will simply use the header from the first LHE; otherwise, reset the "
345  "<MGGenerationInfo> from the headers by merging the #event & xsec info;\n"
346  " 2. Init block: if all <init> blocks are the same, use the same as output; otherwise (the MG LO "
347  "case), merge them by recalculating the # of subprocess (LRPUP) and XSECUP, XERRUP, XMAXUP per "
348  "each subprocess.\n"
349  " 3. Event block: concatenate all event blocks. If for MG LO LHEs, recalculate the per-event "
350  "XWGTUP and all <wgt> tags based on the new XSECUP, #event, and 'event_norm' read from the MG "
351  "run card.\n"
352  "For further development of this script please always validate the merging result on the test "
353  "routines: https://github.com/colizz/mergelhe_validate\n"
354  "Example usage:\n"
355  " mergeLHE.py -i 'thread*/*.lhe,another_file/another.lhe' -o output.lhe"),
356  formatter_class=argparse.RawTextHelpFormatter)
357  parser.add_argument("-i", "--input-files", type=str,
358  help="Input LHE file paths separated by commas. Shell-type wildcards are supported.")
359  parser.add_argument("-o", "--output-file",
360  default='output.lhe', type=str,
361  help="Output LHE file path.")
362  parser.add_argument("--force-mglo-merger", action='store_true',
363  help=("Force to use the merger script dedicated for MG5 LO LHEs, as introduced in "
364  "https://github.com/cms-sw/genproductions/blob/master/bin/MadGraph5_aMCatNLO/Utilities/merge.pl"))
365  parser.add_argument("--force-cpp-merger", action='store_true',
366  help=("Force to use the external mergeLheFiles.cpp file to merge LHE files, as introduced in "
367  "https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideSubgroupMC#1_2_Using_pLHE_campaigns"))
368  parser.add_argument("-b", "--bypass-check", action='store_true',
369  help=("Bypass the compatibility check for the headers. If true, the header and init block "
370  "will be just a duplicate from the first input file, and events are concatenated without "
371  "modification."))
372  parser.add_argument("-n", "--number-events", action='store_true',
373  help=("Add a tag to number each lhe event. Needed for Herwig to find correct lhe events"))
374  parser.add_argument("--debug", action='store_true',
375  help="Use the debug mode.")
376  args = parser.parse_args(argv)
377 
378  logging.basicConfig(
379  format='[%(levelname)s] %(message)s',
380  level=logging.INFO if not args.debug else DEBUG)
381  logging.info('>>> launch mergeLHE.py in %s' % os.path.abspath(os.getcwd()))
382 
383  # Extract input LHE files from the path
384  assert len(args.input_files), \
385  ('Please specify your input LHE files by -i/--input-files. '
386  'Run \'mergeLHE.py -h\' for details.')
387  input_files = [] # each individual input file
388  for path in args.input_files.split(','):
389  find_files = glob.glob(path)
390  if len(find_files) == 0:
391  logging.info('Warning: cannot find files in %s' % path)
392  input_files += find_files
393  input_files.sort()
394  logging.info('>>> Merge %d files: [%s]' % (len(input_files), ', '.join(input_files)))
395  logging.info('>>> Write to output: %s ' % args.output_file)
396 
397  if not os.path.exists(os.path.dirname(os.path.realpath(args.output_file))):
398  os.makedirs(os.path.dirname(os.path.realpath(args.output_file)))
399 
400  if args.number_events:
401  offset = 0
402  for input_file in input_files:
403  offset += addLHEnumbers.number_events(input_file, offset=offset)
404 
405  # Check arguments
406  assert len(input_files) > 0, 'Input LHE files should be more than 0.'
407  if len(input_files) == 1:
408  logging.warning('Input LHE only has 1 file. Will copy this file to the destination.')
409  import shutil
410  shutil.copy(input_files[0], args.output_file)
411  return
412  assert [args.force_mglo_merger, args.force_cpp_merger].count(True) <= 1, \
413  "Can only specify at most one from --force-mglo-merger or --force-cpp-merger."
414 
415  # Determine the merging scheme
416  if args.force_mglo_merger:
417  lhe_merger = MG5LOLHEMerger(input_files, args.output_file)
418  elif args.force_cpp_merger:
419  lhe_merger = ExternalCppLHEMerger(input_files, args.output_file)
420  else:
421  lhe_merger = DefaultLHEMerger(
422  input_files, args.output_file, bypass_check=args.bypass_check)
423 
424  # Do merging
425  lhe_merger.merge()
426 
427 
428 if __name__=="__main__":
429  main()
def __init__(self, input_files, output_file)
Definition: mergeLHE.py:18
def merge_init_blocks(self)
Definition: mergeLHE.py:115
def main(argv=None)
Definition: mergeLHE.py:328
def all(container)
workaround iterator generators for ROOT classes
Definition: cmstools.py:25
bool any(const std::vector< T > &v, const T &what)
Definition: ECalSD.cc:37
def check_header_compatibility(self)
Definition: mergeLHE.py:49
def replace(string, replacements)
def __init__(self, input_files, output_file, kwargs)
Definition: mergeLHE.py:287
OutputIterator zip(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp)
def number_events(input_file, output_file=None, offset=0)
def __init__(self, input_files, output_file, kwargs)
Definition: mergeLHE.py:307
static std::string join(char **cmd)
Definition: RemoteFile.cc:19
def __init__(self, input_files, output_file, kwargs)
Definition: mergeLHE.py:30
void add(std::map< std::string, TH1 *> &h, TH1 *hist)
Definition: main.py:1
def file_iterator(self, path)
Definition: mergeLHE.py:43
#define str(s)