CMS 3D CMS Logo

mergeLHE.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 
3 from __future__ import print_function
4 import logging
5 import argparse
6 import math
7 import glob
8 import sys
9 import os
10 import re
11 
12 import addLHEnumbers
13 
14 
16  """Base class of the LHE merge schemes"""
17 
18  def __init__(self, input_files, output_file):
19  self.input_files = input_files
20  self.output_file = output_file
21 
22  def merge(self):
23  """Output the merged LHE"""
24  pass
25 
26 class DefaultLHEMerger(BaseLHEMerger):
27  """Default LHE merge scheme that copies the header of the first LHE file,
28  merges and outputs the init block, then concatenates all event blocks."""
29 
30  def __init__(self, input_files, output_file, **kwargs):
31  super(DefaultLHEMerger, self).__init__(input_files, output_file)
32 
33  self.bypass_check = kwargs.get('bypass_check', False)
34  # line-by-line iterator for each input file
35  self._f = [self.file_iterator(name) for name in self.input_files]
36  self._header_str = []
37  self._is_mglo = False
38  self._xsec_combined = 0.
39  self._uwgt = 0.
40  self._init_str = [] # initiated blocks for each input file
41  self._nevent = [] # number of events for each input file
42 
43  def file_iterator(self, path):
44  """Line-by-line iterator of a txt file"""
45  with open(path, 'r') as f:
46  for line in f:
47  yield line
48 
50  """Check if all headers for input files are consistent."""
51 
52  if self.bypass_check:
53  return
54 
55  inconsistent_error_info = ("Incompatibility found in LHE headers: %s. "
56  "Use -b/--bypass-check to bypass the check.")
57  allow_diff_keys = [
58  'nevent', 'numevts', 'iseed', 'Seed', 'Random', '.log', '.dat', '.lhe',
59  'Number of Events', 'Integrated weight'
60  ]
61  self._header_lines = [header.split('\n') for header in self._header_str]
62 
63  # Iterate over header lines for all input files and check consistency
64  logging.debug('header line number: %s' \
65  % ', '.join([str(len(lines)) for lines in self._header_lines]))
66  assert all([
67  len(self._header_lines[0]) == len(lines) for lines in self._header_lines]
68  ), inconsistent_error_info % "line number does not match"
69  inconsistent_lines_set = [set() for _ in self._header_lines]
70  for line_zip in zip(*self._header_lines):
71  if any([k in line_zip[0] for k in allow_diff_keys]):
72  logging.debug('Captured \'%s\', we allow difference in this line' % line_zip[0])
73  continue
74  if not all([line_zip[0] == line for line in line_zip]):
75  # Ok so meet inconsistency in some lines, then temporarily store them
76  for i, line in enumerate(line_zip):
77  inconsistent_lines_set[i].add(line)
78  # Those inconsistent lines still match, meaning that it is only a change of order
79  assert all([inconsistent_lines_set[0] == lset for lset in inconsistent_lines_set]), \
80  inconsistent_error_info % ('{' + ', '.join(inconsistent_lines_set[0]) + '}')
81 
82  def merge_headers(self):
83  """Merge the headers of input LHEs. Need special handle for the MG5 LO case."""
84 
85  self._is_mglo = all(['MGGenerationInfo' in header for header in self._header_str])
86  if self._is_mglo and not self.bypass_check:
87  # Special handling of MadGraph5 LO LHEs
88  match_geninfo = [
89  re.search(
90  (r"<MGGenerationInfo>\s+#\s*Number of Events\s*\:\s*(\S+)\s+"
91  r"#\s*Integrated weight \(pb\)\s*\:\s*(\S+)\s+<\/MGGenerationInfo>"),
92  header
93  ) for header in self._header_str
94  ]
95  self._xsec_combined = sum(
96  [float(info.group(2)) * nevt for info, nevt in zip(match_geninfo, self._nevent)]
97  ) / sum(self._nevent)
98  geninfo_combined = ("<MGGenerationInfo>\n"
99  "# Number of Events : %d\n"
100  "# Integrated weight (pb) : %.10f\n</MGGenerationInfo>") \
101  % (sum(self._nevent), self._xsec_combined)
102  logging.info('Detected: MG5 LO LHEs. Input <MGGenerationInfo>:\n\tnevt\txsec')
103  for info, nevt in zip(match_geninfo, self._nevent):
104  logging.info('\t%d\t%.10f' % (nevt, float(info.group(2))))
105  logging.info('Combined <MGGenerationInfo>:\n\t%d\t%.10f' \
106  % (sum(self._nevent), self._xsec_combined))
107 
108  header_combined = self._header_str[0].replace(match_geninfo[0].group(), geninfo_combined)
109  return header_combined
110 
111  else:
112  # No need to merge the headers
113  return self._header_str[0]
114 
115  def merge_init_blocks(self):
116  """If all <init> blocks are identical, return the same <init> block
117  (in the case of Powheg LHEs); otherwise, calculate the output <init>
118  blocks by merging the input blocks info using formula (same with the
119  MG5LOLHEMerger scheme):
120  XSECUP = sum(xsecup * no.events) / tot.events
121  XERRUP = sqrt( sum(sigma^2 * no.events^2) ) / tot.events
122  XMAXUP = max(xmaxup)
123  """
124 
125  if self.bypass_check:
126  # If bypass the consistency check, simply use the first LHE <init>
127  # block as the output
128  return self._init_str[0]
129 
130  # Initiate collected init block info. Will be in format of
131  # {iprocess: [xsecup, xerrup, xmaxup]}
132  new_init_block = {}
133  old_init_block = [{} for _ in self._init_str]
134 
135  # Read the xsecup, xerrup, and xmaxup info from the <init> block for
136  # all input LHEs
137  for i, bl in enumerate(self._init_str): # loop over files
138  nline = int(bl.split('\n')[0].strip().split()[-1])
139 
140  # loop over lines in <init> block
141  for bl_line in bl.split('\n')[1:nline + 1]:
142  bl_line_sp = bl_line.split()
143  old_init_block[i][int(bl_line_sp[3])] = [
144  float(bl_line_sp[0]), float(bl_line_sp[1]), float(bl_line_sp[2])]
145 
146  # After reading all subprocesses info, store the rest content in
147  # <init> block for the first file
148  if i == 0:
149  info_after_subprocess = bl.strip().split('\n')[nline + 1:]
150 
151  logging.info('Input file: %s' % self.input_files[i])
152  for ipr in sorted(list(old_init_block[i].keys()), reverse=True):
153  # reverse order: follow the MG5 custom
154  logging.info(' xsecup, xerrup, xmaxup, lprup: %.6E, %.6E, %.6E, %d' \
155  % tuple(old_init_block[i][ipr] + [ipr]))
156 
157  # Adopt smarter <init> block merging method
158  # If all <init> blocks from input files are identical, return the same block;
159  # otherwise combine them based on MG5LOLHEMerger scheme
160  if all([old_init_block[i] == old_init_block[0] for i in range(len(self._f))]):
161  # All <init> blocks are identical
162  logging.info(
163  'All input <init> blocks are identical. Output the same "<init> block.')
164  return self._init_str[0]
165 
166  # Otherwise, calculate merged init block
167  for i in range(len(self._f)):
168  for ipr in old_init_block[i]:
169  # Initiate the subprocess for the new block if it is found for the
170  # first time in one input file
171  if ipr not in new_init_block:
172  new_init_block[ipr] = [0., 0., 0.]
173  new_init_block[ipr][0] += old_init_block[i][ipr][0] * self._nevent[i] # xsecup
174  new_init_block[ipr][1] += old_init_block[i][ipr][1]**2 * self._nevent[i]**2 # xerrup
175  new_init_block[ipr][2] = max(new_init_block[ipr][2], old_init_block[i][ipr][2]) # xmaxup
176  tot_nevent = sum([self._nevent[i] for i in range(len(self._f))])
177 
178  # Write first line of the <init> block (modify the nprocess at the last)
179  self._merged_init_str = self._init_str[0].split('\n')[0].strip().rsplit(' ', 1)[0] \
180  + ' ' + str(len(new_init_block)) + '\n'
181  # Form the merged init block
182  logging.info('Output file: %s' % self.output_file)
183  for ipr in sorted(list(new_init_block.keys()), reverse=True):
184  # reverse order: follow the MG5 custom
185  new_init_block[ipr][0] /= tot_nevent
186  new_init_block[ipr][1] = math.sqrt(new_init_block[ipr][1]) / tot_nevent
187  logging.info(' xsecup, xerrup, xmaxup, lprup: %.6E, %.6E, %.6E, %d' \
188  % tuple(new_init_block[ipr] + [ipr]))
189  self._merged_init_str += '%.6E %.6E %.6E %d\n' % tuple(new_init_block[ipr] + [ipr])
190  self._merged_init_str += '\n'.join(info_after_subprocess)
191  if len(info_after_subprocess):
192  self._merged_init_str += '\n'
193 
194  return self._merged_init_str
195 
196  def merge(self):
197  with open(self.output_file, 'w') as fw:
198  # Read the header for the all input files
199  for i in range(len(self._f)):
200  header = []
201  line = next(self._f[i])
202  while not re.search('\s*<init(>|\s)', line):
203  header.append(line)
204  line = next(self._f[i])
205  # 'header' includes all contents before reaches <init>
206  self._header_str.append(''.join(header))
208 
209  # Read <init> blocks for all input_files
210  for i in range(len(self._f)):
211  init = []
212  line = next(self._f[i])
213  while not re.search('\s*</init>', line):
214  init.append(line)
215  line = next(self._f[i])
216  # 'init_str' includes all contents inside <init>...</init>
217  self._init_str.append(''.join(init))
218 
219  # Iterate over all events file-by-file and write events temporarily
220  # to .tmp.lhe
221  with open('.tmp.lhe', 'w') as _fwtmp:
222  for i in range(len(self._f)):
223  nevent = 0
224  while True:
225  line = next(self._f[i])
226  if re.search('\s*</event>', line):
227  nevent += 1
228  if re.search('\s*</LesHouchesEvents>', line):
229  break
230  _fwtmp.write(line)
231  self._nevent.append(nevent)
232  self._f[i].close()
233 
234  # Merge the header and init blocks and write to the output
235  fw.write(self.merge_headers())
236  fw.write('<init>\n' + self.merge_init_blocks() + '</init>\n')
237 
238  # Write event blocks in .tmp.lhe back to the output
239  # If is MG5 LO LHE, will recalculate the weights based on combined xsec
240  # and nevent read from <MGGenerationInfo>, and the 'event_norm' mode
241  if self._is_mglo and not self.bypass_check:
242  event_norm = re.search(
243  r'\s(\w+)\s*=\s*event_norm\s',
244  self._header_str[0]).group(1)
245  if event_norm == 'sum':
246  self._uwgt = self._xsec_combined / sum(self._nevent)
247  elif event_norm == 'average':
248  self._uwgt = self._xsec_combined
249  logging.info(("MG5 LO LHE with event_norm = %s detected. Will "
250  "recalculate weights in each event block.\n"
251  "Unit weight: %+.7E") % (event_norm, self._uwgt))
252 
253  # Modify event wgt when transfering .tmp.lhe to the output file
254  event_line = -999
255  with open('.tmp.lhe', 'r') as ftmp:
256  sign = lambda x: -1 if x < 0 else 1
257  for line in ftmp:
258  event_line += 1
259  if re.search('\s*<event.*>', line) and not re.search('\s*<event_num.*>', line):
260  event_line = 0
261  if event_line == 1:
262  # modify the XWGTUP appeared in the first line of the
263  # <event> block
264  orig_wgt = float(line.split()[2])
265  fw.write(re.sub(r'(^\s*\S+\s+\S+\s+)\S+(.+)', r'\g<1>%+.7E\g<2>' \
266  % (sign(orig_wgt) * self._uwgt), line))
267  elif re.search('\s*<wgt.*>.*</wgt>', line):
268  addi_wgt_str = re.search(r'<wgt.*>\s*(\S+)\s*<\/wgt>', line).group(1)
269  fw.write(line.replace(
270  addi_wgt_str, '%+.7E' % (float(addi_wgt_str) / orig_wgt * self._uwgt)))
271  else:
272  fw.write(line)
273  else:
274  # Simply transfer all lines
275  with open('.tmp.lhe', 'r') as ftmp:
276  for line in ftmp:
277  fw.write(line)
278  fw.write('</LesHouchesEvents>\n')
279  os.remove('.tmp.lhe')
280 
281 
283  """Use the merger script dedicated for MG5 LO LHEs, as introduced in
284  https://github.com/cms-sw/genproductions/blob/master/bin/MadGraph5_aMCatNLO/Utilities/merge.pl
285  """
286 
287  def __init__(self, input_files, output_file, **kwargs):
288  super(MG5LOLHEMerger, self).__init__(input_files, output_file)
290  'https://raw.githubusercontent.com/cms-sw/genproductions/5c1e865a6fbe3a762a28363835d9a804c9cf0dbe/bin/MadGraph5_aMCatNLO/Utilities/merge.pl'
291 
292  def merge(self):
293  logging.info(
294  ('Use the merger script in genproductions dedicated for '
295  'MadGraph5-produced LHEs'))
296  os.system('curl -s -L %s | perl - %s %s.gz banner.txt' \
297  % (self._merger_script_url, ' '.join(self.input_files), self.output_file))
298  os.system('gzip -df %s.gz' % self.output_file)
299  os.system('rm banner.txt')
300 
301 
303  """Use the external mergeLheFiles.cpp file to merge LHE files, as introduced in
304  https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideSubgroupMC#1_2_Using_pLHE_campaigns
305  """
306 
307  def __init__(self, input_files, output_file, **kwargs):
308  super(ExternalCppLHEMerger, self).__init__(input_files, output_file)
310  'https://twiki.cern.ch/twiki/bin/viewfile/CMSPublic/SWGuideSubgroupMC?filename=mergeLheFiles.cpp;rev=2'
311 
312  def merge(self):
313  logging.info(
314  ('Use the external mergeLheFiles.cpp file to merge LHE files.'))
315  os.system('curl -s -o mergeLheFiles.cpp %s' % self._merger_script_url)
316  with open('mergeLheFiles.cpp') as f:
317  script_str = f.read()
318  with open('mergeLheFiles.cpp', 'w') as fw:
319  fw.write(script_str.replace('/tmp/covarell/out.lhe', self.output_file))
320  with open('input_files.txt', 'w') as fw:
321  fw.write('\n'.join(self.input_files) + '\n')
322 
323  os.system('g++ -Wall -o mergeLheFiles mergeLheFiles.cpp')
324  os.system('./mergeLheFiles input_files.txt')
325  os.system('rm mergeLheFiles* input_files.txt')
326 
327 
328 def main(argv = None):
329  """Main routine of the script.
330 
331  Arguments:
332  - `argv`: arguments passed to the main routine
333  """
334 
335  if argv == None:
336  argv = sys.argv[1:]
337 
338  parser = argparse.ArgumentParser(
339  description=("A universal script that merges multiple LHE files for all possible conditions and in the most "
340  "natural way.\n"
341  "A detailed description of the merging step (in the default mode):\n"
342  " 1. Header:\n"
343  " a. assert consistency of the headers (allow difference for the info of e.g. #event, seed);\n"
344  " b. if not MG LO LHEs, will simply use the header from the first LHE; otherwise, reset the "
345  "<MGGenerationInfo> from the headers by merging the #event & xsec info;\n"
346  " 2. Init block: if all <init> blocks are the same, use the same as output; otherwise (the MG LO "
347  "case), merge them by recalculating the # of subprocess (LRPUP) and XSECUP, XERRUP, XMAXUP per "
348  "each subprocess.\n"
349  " 3. Event block: concatenate all event blocks. If for MG LO LHEs, recalculate the per-event "
350  "XWGTUP and all <wgt> tags based on the new XSECUP, #event, and 'event_norm' read from the MG "
351  "run card.\n"
352  "For further development of this script please always validate the merging result on the test "
353  "routines: https://github.com/colizz/mergelhe_validate\n"
354  "Example usage:\n"
355  " mergeLHE.py -i 'thread*/*.lhe,another_file/another.lhe' -o output.lhe"),
356  formatter_class=argparse.RawTextHelpFormatter)
357  parser.add_argument("-i", "--input-files", type=str,
358  help="Input LHE file paths separated by commas. Shell-type wildcards are supported.")
359  parser.add_argument("-o", "--output-file",
360  default='output.lhe', type=str,
361  help="Output LHE file path.")
362  parser.add_argument("--force-mglo-merger", action='store_true',
363  help=("Force to use the merger script dedicated for MG5 LO LHEs, as introduced in "
364  "https://github.com/cms-sw/genproductions/blob/master/bin/MadGraph5_aMCatNLO/Utilities/merge.pl"))
365  parser.add_argument("--force-cpp-merger", action='store_true',
366  help=("Force to use the external mergeLheFiles.cpp file to merge LHE files, as introduced in "
367  "https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideSubgroupMC#1_2_Using_pLHE_campaigns"))
368  parser.add_argument("-b", "--bypass-check", action='store_true',
369  help=("Bypass the compatibility check for the headers. If true, the header and init block "
370  "will be just a duplicate from the first input file, and events are concatenated without "
371  "modification."))
372  parser.add_argument("-n", "--number-events", action='store_true',
373  help=("Add a tag to number each lhe event. Needed for Herwig to find correct lhe events"))
374  parser.add_argument("--debug", action='store_true',
375  help="Use the debug mode.")
376  args = parser.parse_args(argv)
377 
378  logging.basicConfig(
379  format='[%(levelname)s] %(message)s',
380  level=logging.INFO if not args.debug else DEBUG)
381  logging.info('>>> launch mergeLHE.py in %s' % os.path.abspath(os.getcwd()))
382 
383  # Extract input LHE files from the path
384  assert len(args.input_files), \
385  ('Please specify your input LHE files by -i/--input-files. '
386  'Run \'mergeLHE.py -h\' for details.')
387  input_files = [] # each individual input file
388  for path in args.input_files.split(','):
389  find_files = glob.glob(path)
390  if len(find_files) == 0:
391  logging.info('Warning: cannot find files in %s' % path)
392  input_files += find_files
393  input_files.sort()
394  logging.info('>>> Merge %d files: [%s]' % (len(input_files), ', '.join(input_files)))
395  logging.info('>>> Write to output: %s ' % args.output_file)
396 
397  if not os.path.exists(os.path.dirname(os.path.realpath(args.output_file))):
398  os.makedirs(os.path.dirname(os.path.realpath(args.output_file)))
399 
400  if args.number_events:
401  offset = 0
402  for input_file in input_files:
403  offset += addLHEnumbers.number_events(input_file, offset=offset)
404  # Check arguments
405  assert len(input_files) > 0, 'Input LHE files should be more than 0.'
406  if len(input_files) == 1:
407  logging.warning('Input LHE only has 1 file. Will copy this file to the destination.')
408  import shutil
409  shutil.copy(input_files[0], args.output_file)
410  return
411  assert [args.force_mglo_merger, args.force_cpp_merger].count(True) <= 1, \
412  "Can only specify at most one from --force-mglo-merger or --force-cpp-merger."
413 
414  # Determine the merging scheme
415  if args.force_mglo_merger:
416  lhe_merger = MG5LOLHEMerger(input_files, args.output_file)
417  elif args.force_cpp_merger:
418  lhe_merger = ExternalCppLHEMerger(input_files, args.output_file)
419  else:
420  lhe_merger = DefaultLHEMerger(
421  input_files, args.output_file, bypass_check=args.bypass_check)
422 
423  # Do merging
424  lhe_merger.merge()
425 
426 
427 if __name__=="__main__":
428  main()
def __init__(self, input_files, output_file)
Definition: mergeLHE.py:18
def merge_init_blocks(self)
Definition: mergeLHE.py:115
def main(argv=None)
Definition: mergeLHE.py:328
bool any(const std::vector< T > &v, const T &what)
Definition: ECalSD.cc:37
def check_header_compatibility(self)
Definition: mergeLHE.py:49
def replace(string, replacements)
def __init__(self, input_files, output_file, kwargs)
Definition: mergeLHE.py:287
OutputIterator zip(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp)
def number_events(input_file, output_file=None, offset=0)
def __init__(self, input_files, output_file, kwargs)
Definition: mergeLHE.py:307
void add(std::map< std::string, TH1 * > &h, TH1 *hist)
static std::string join(char **cmd)
Definition: RemoteFile.cc:18
def __init__(self, input_files, output_file, kwargs)
Definition: mergeLHE.py:30
Definition: main.py:1
def file_iterator(self, path)
Definition: mergeLHE.py:43
#define str(s)
double split
Definition: MVATrainer.cc:139
How EventSelector::AcceptEvent() decides whether to accept an event for output otherwise it is excluding the probing of A single or multiple positive and the trigger will pass if any such matching triggers are PASS or EXCEPTION[A criterion thatmatches no triggers at all is detected and causes a throw.] A single negative with an expectation of appropriate bit checking in the decision and the trigger will pass if any such matching triggers are FAIL or EXCEPTION A wildcarded negative criterion that matches more than one trigger in the trigger list("!*","!HLTx*"if it matches 2 triggers or more) will accept the event if all the matching triggers are FAIL.It will reject the event if any of the triggers are PASS or EXCEPTION(this matches the behavior of"!*"before the partial wildcard feature was incorporated).Triggers which are in the READY state are completely ignored.(READY should never be returned since the trigger paths have been run