CMS 3D CMS Logo

mergeLHE.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 
3 from __future__ import print_function
4 import logging
5 import argparse
6 import math
7 import glob
8 import sys
9 import os
10 import re
11 
13  """Base class of the LHE merge schemes"""
14 
15  def __init__(self, input_files, output_file):
16  self.input_files = input_files
17  self.output_file = output_file
18 
19  def merge(self):
20  """Output the merged LHE"""
21  pass
22 
23 class DefaultLHEMerger(BaseLHEMerger):
24  """Default LHE merge scheme that copies the header of the first LHE file,
25  merges and outputs the init block, then concatenates all event blocks."""
26 
27  def __init__(self, input_files, output_file, **kwargs):
28  super(DefaultLHEMerger, self).__init__(input_files, output_file)
29 
30  self.bypass_check = kwargs.get('bypass_check', False)
31  # line-by-line iterator for each input file
32  self._f = [self.file_iterator(name) for name in self.input_files]
33  self._header_str = []
34  self._is_mglo = False
35  self._xsec_combined = 0.
36  self._uwgt = 0.
37  self._init_str = [] # initiated blocks for each input file
38  self._nevent = [] # number of events for each input file
39 
40  def file_iterator(self, path):
41  """Line-by-line iterator of a txt file"""
42  with open(path, 'r') as f:
43  for line in f:
44  yield line
45 
47  """Check if all headers for input files are consistent."""
48 
49  if self.bypass_check:
50  return
51 
52  inconsistent_error_info = ("Incompatibility found in LHE headers: %s. "
53  "Use -b/--bypass-check to bypass the check.")
54  allow_diff_keys = [
55  'nevent', 'numevts', 'iseed', 'Seed', 'Random', '.log', '.dat', '.lhe',
56  'Number of Events', 'Integrated weight'
57  ]
58  self._header_lines = [header.split('\n') for header in self._header_str]
59 
60  # Iterate over header lines for all input files and check consistency
61  logging.debug('header line number: %s' \
62  % ', '.join([str(len(lines)) for lines in self._header_lines]))
63  assert all([
64  len(self._header_lines[0]) == len(lines) for lines in self._header_lines]
65  ), inconsistent_error_info % "line number not matches"
66  inconsistent_lines_set = [set() for _ in self._header_lines]
67  for line_zip in zip(*self._header_lines):
68  if any([k in line_zip[0] for k in allow_diff_keys]):
69  logging.debug('Captured \'%s\', we allow difference in this line' % line_zip[0])
70  continue
71  if not all([line_zip[0] == line for line in line_zip]):
72  # Ok so meet inconsistency in some lines, then temporarily store them
73  for i, line in enumerate(line_zip):
74  inconsistent_lines_set[i].add(line)
75  # Those inconsistent lines still match, meaning that it is only a change of order
76  assert all([inconsistent_lines_set[0] == lset for lset in inconsistent_lines_set]), \
77  inconsistent_error_info % ('{' + ', '.join(inconsistent_lines_set[0]) + '}')
78 
79  def merge_headers(self):
80  """Merge the headers of input LHEs. Need special handle for the MG5 LO case."""
81 
82  self._is_mglo = all(['MGGenerationInfo' in header for header in self._header_str])
83  if self._is_mglo and not self.bypass_check:
84  # Special handling of MadGraph5 LO LHEs
85  match_geninfo = [
86  re.search(
87  (r"<MGGenerationInfo>\s+#\s*Number of Events\s*\:\s*(\S+)\s+"
88  r"#\s*Integrated weight \(pb\)\s*\:\s*(\S+)\s+<\/MGGenerationInfo>"),
89  header
90  ) for header in self._header_str
91  ]
92  self._xsec_combined = sum(
93  [float(info.group(2)) * nevt for info, nevt in zip(match_geninfo, self._nevent)]
94  ) / sum(self._nevent)
95  geninfo_combined = ("<MGGenerationInfo>\n"
96  "# Number of Events : %d\n"
97  "# Integrated weight (pb) : %.10f\n</MGGenerationInfo>") \
98  % (sum(self._nevent), self._xsec_combined)
99  logging.info('Detected: MG5 LO LHEs. Input <MGGenerationInfo>:\n\tnevt\txsec')
100  for info, nevt in zip(match_geninfo, self._nevent):
101  logging.info('\t%d\t%.10f' % (nevt, float(info.group(2))))
102  logging.info('Combined <MGGenerationInfo>:\n\t%d\t%.10f' \
103  % (sum(self._nevent), self._xsec_combined))
104 
105  header_combined = self._header_str[0].replace(match_geninfo[0].group(), geninfo_combined)
106  return header_combined
107 
108  else:
109  # No need to merge the headers
110  return self._header_str[0]
111 
112  def merge_init_blocks(self):
113  """If all <init> blocks are identical, return the same <init> block
114  (in the case of Powheg LHEs); otherwise, calculate the output <init>
115  blocks by merging the input blocks info using formula (same with the
116  MG5LOLHEMerger scheme):
117  XSECUP = sum(xsecup * no.events) / tot.events
118  XERRUP = sqrt( sum(sigma^2 * no.events^2) ) / tot.events
119  XMAXUP = max(xmaxup)
120  """
121 
122  if self.bypass_check:
123  # If bypass the consistency check, simply use the first LHE <init>
124  # block as the output
125  return self._init_str[0]
126 
127  # Initiate collected init block info. Will be in format of
128  # {iprocess: [xsecup, xerrup, xmaxup]}
129  new_init_block = {}
130  old_init_block = [{} for _ in self._init_str]
131 
132  # Read the xsecup, xerrup, and xmaxup info from the <init> block for
133  # all input LHEs
134  for i, bl in enumerate(self._init_str): # loop over files
135  nline = int(bl.split('\n')[0].strip().split()[-1])
136 
137  # loop over lines in <init> block
138  for bl_line in bl.split('\n')[1:nline + 1]:
139  bl_line_sp = bl_line.split()
140  old_init_block[i][int(bl_line_sp[3])] = [
141  float(bl_line_sp[0]), float(bl_line_sp[1]), float(bl_line_sp[2])]
142 
143  # After reading all subprocesses info, store the rest content in
144  # <init> block for the first file
145  if i == 0:
146  info_after_subprocess = bl.strip().split('\n')[nline + 1:]
147 
148  logging.info('Input file: %s' % self.input_files[i])
149  for ipr in sorted(list(old_init_block[i].keys()), reverse=True):
150  # reverse order: follow the MG5 custom
151  logging.info(' xsecup, xerrup, xmaxup, lprup: %.6E, %.6E, %.6E, %d' \
152  % tuple(old_init_block[i][ipr] + [ipr]))
153 
154  # Adopt smarter <init> block merging method
155  # If all <init> blocks from input files are identical, return the same block;
156  # otherwise combine them based on MG5LOLHEMerger scheme
157  if all([old_init_block[i] == old_init_block[0] for i in range(len(self._f))]):
158  # All <init> blocks are identical
159  logging.info(
160  'All input <init> blocks are identical. Output the same "<init> block.')
161  return self._init_str[0]
162 
163  # Otherwise, calculate merged init block
164  for i in range(len(self._f)):
165  for ipr in old_init_block[i]:
166  # Initiate the subprocess for the new block if it is found for the
167  # first time in one input file
168  if ipr not in new_init_block:
169  new_init_block[ipr] = [0., 0., 0.]
170  new_init_block[ipr][0] += old_init_block[i][ipr][0] * self._nevent[i] # xsecup
171  new_init_block[ipr][1] += old_init_block[i][ipr][1]**2 * self._nevent[i]**2 # xerrup
172  new_init_block[ipr][2] = max(new_init_block[ipr][2], old_init_block[i][ipr][2]) # xmaxup
173  tot_nevent = sum([self._nevent[i] for i in range(len(self._f))])
174 
175  # Write first line of the <init> block (modify the nprocess at the last)
176  self._merged_init_str = self._init_str[0].split('\n')[0].strip().rsplit(' ', 1)[0] \
177  + ' ' + str(len(new_init_block)) + '\n'
178  # Form the merged init block
179  logging.info('Output file: %s' % self.output_file)
180  for ipr in sorted(list(new_init_block.keys()), reverse=True):
181  # reverse order: follow the MG5 custom
182  new_init_block[ipr][0] /= tot_nevent
183  new_init_block[ipr][1] = math.sqrt(new_init_block[ipr][1]) / tot_nevent
184  logging.info(' xsecup, xerrup, xmaxup, lprup: %.6E, %.6E, %.6E, %d' \
185  % tuple(new_init_block[ipr] + [ipr]))
186  self._merged_init_str += '%.6E %.6E %.6E %d\n' % tuple(new_init_block[ipr] + [ipr])
187  self._merged_init_str += '\n'.join(info_after_subprocess)
188  if len(info_after_subprocess):
189  self._merged_init_str += '\n'
190 
191  return self._merged_init_str
192 
193  def merge(self):
194  with open(self.output_file, 'w') as fw:
195  # Read the header for the all input files
196  for i in range(len(self._f)):
197  header = []
198  line = next(self._f[i])
199  while not re.search('\s*<init(>|\s)', line):
200  header.append(line)
201  line = next(self._f[i])
202  # 'header' includes all contents before reaches <init>
203  self._header_str.append(''.join(header))
205 
206  # Read <init> blocks for all input_files
207  for i in range(len(self._f)):
208  init = []
209  line = next(self._f[i])
210  while not re.search('\s*</init>', line):
211  init.append(line)
212  line = next(self._f[i])
213  # 'init_str' includes all contents inside <init>...</init>
214  self._init_str.append(''.join(init))
215 
216  # Iterate over all events file-by-file and write events temporarily
217  # to .tmp.lhe
218  with open('.tmp.lhe', 'w') as _fwtmp:
219  for i in range(len(self._f)):
220  nevent = 0
221  while True:
222  line = next(self._f[i])
223  if re.search('\s*</event>', line):
224  nevent += 1
225  if re.search('\s*</LesHouchesEvents>', line):
226  break
227  _fwtmp.write(line)
228  self._nevent.append(nevent)
229  self._f[i].close()
230 
231  # Merge the header and init blocks and write to the output
232  fw.write(self.merge_headers())
233  fw.write('<init>\n' + self.merge_init_blocks() + '</init>\n')
234 
235  # Write event blocks in .tmp.lhe back to the output
236  # If is MG5 LO LHE, will recalculate the weights based on combined xsec
237  # and nevent read from <MGGenerationInfo>, and the 'event_norm' mode
238  if self._is_mglo and not self.bypass_check:
239  event_norm = re.search(
240  r'\s(\w+)\s*=\s*event_norm\s',
241  self._header_str[0]).group(1)
242  if event_norm == 'sum':
243  self._uwgt = self._xsec_combined / sum(self._nevent)
244  elif event_norm == 'average':
245  self._uwgt = self._xsec_combined
246  logging.info(("MG5 LO LHE with event_norm = %s detected. Will "
247  "recalculate weights in each event block.\n"
248  "Unit weight: %+.7E") % (event_norm, self._uwgt))
249 
250  # Modify event wgt when transfering .tmp.lhe to the output file
251  event_line = -999
252  with open('.tmp.lhe', 'r') as ftmp:
253  sign = lambda x: -1 if x < 0 else 1
254  for line in ftmp:
255  event_line += 1
256  if re.search('\s*<event.*>', line):
257  event_line = 0
258  if event_line == 1:
259  # modify the XWGTUP appeared in the first line of the
260  # <event> block
261  orig_wgt = float(line.split()[2])
262  fw.write(re.sub(r'(^\s*\S+\s+\S+\s+)\S+(.+)', r'\g<1>%+.7E\g<2>' \
263  % (sign(orig_wgt) * self._uwgt), line))
264  elif re.search('\s*<wgt.*>.*</wgt>', line):
265  addi_wgt_str = re.search(r'<wgt.*>\s*(\S+)\s*<\/wgt>', line).group(1)
266  fw.write(line.replace(
267  addi_wgt_str, '%+.7E' % (float(addi_wgt_str) / orig_wgt * self._uwgt)))
268  else:
269  fw.write(line)
270  else:
271  # Simply transfer all lines
272  with open('.tmp.lhe', 'r') as ftmp:
273  for line in ftmp:
274  fw.write(line)
275  fw.write('</LesHouchesEvents>\n')
276  os.remove('.tmp.lhe')
277 
278 
280  """Use the merger script dedicated for MG5 LO LHEs, as introduced in
281  https://github.com/cms-sw/genproductions/blob/master/bin/MadGraph5_aMCatNLO/Utilities/merge.pl
282  """
283 
284  def __init__(self, input_files, output_file, **kwargs):
285  super(MG5LOLHEMerger, self).__init__(input_files, output_file)
287  'https://raw.githubusercontent.com/cms-sw/genproductions/5c1e865a6fbe3a762a28363835d9a804c9cf0dbe/bin/MadGraph5_aMCatNLO/Utilities/merge.pl'
288 
289  def merge(self):
290  logging.info(
291  ('Use the merger script in genproductions dedicated for '
292  'MadGraph5-produced LHEs'))
293  os.system('curl -s -L %s | perl - %s %s.gz banner.txt' \
294  % (self._merger_script_url, ' '.join(self.input_files), self.output_file))
295  os.system('gzip -df %s.gz' % self.output_file)
296  os.system('rm banner.txt')
297 
298 
300  """Use the external mergeLheFiles.cpp file to merge LHE files, as introduced in
301  https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideSubgroupMC#1_2_Using_pLHE_campaigns
302  """
303 
304  def __init__(self, input_files, output_file, **kwargs):
305  super(ExternalCppLHEMerger, self).__init__(input_files, output_file)
307  'https://twiki.cern.ch/twiki/bin/viewfile/CMSPublic/SWGuideSubgroupMC?filename=mergeLheFiles.cpp;rev=2'
308 
309  def merge(self):
310  logging.info(
311  ('Use the external mergeLheFiles.cpp file to merge LHE files.'))
312  os.system('curl -s -o mergeLheFiles.cpp %s' % self._merger_script_url)
313  with open('mergeLheFiles.cpp') as f:
314  script_str = f.read()
315  with open('mergeLheFiles.cpp', 'w') as fw:
316  fw.write(script_str.replace('/tmp/covarell/out.lhe', self.output_file))
317  with open('input_files.txt', 'w') as fw:
318  fw.write('\n'.join(self.input_files) + '\n')
319 
320  os.system('g++ -Wall -o mergeLheFiles mergeLheFiles.cpp')
321  os.system('./mergeLheFiles input_files.txt')
322  os.system('rm mergeLheFiles* input_files.txt')
323 
324 
325 def main(argv = None):
326  """Main routine of the script.
327 
328  Arguments:
329  - `argv`: arguments passed to the main routine
330  """
331 
332  if argv == None:
333  argv = sys.argv[1:]
334 
335  parser = argparse.ArgumentParser(
336  description=("A universal script that merges multiple LHE files for all possible conditions and in the most "
337  "natural way.\n"
338  "A detailed description of the merging step (in the default mode):\n"
339  " 1. Header:\n"
340  " a. assert consistency of the headers (allow difference for the info of e.g. #event, seed);\n"
341  " b. if not MG LO LHEs, will simply use the header from the first LHE; otherwise, reset the "
342  "<MGGenerationInfo> from the headers by merging the #event & xsec info;\n"
343  " 2. Init block: if all <init> blocks are the same, use the same as output; otherwise (the MG LO "
344  "case), merge them by recalculating the # of subprocess (LRPUP) and XSECUP, XERRUP, XMAXUP per "
345  "each subprocess.\n"
346  " 3. Event block: concatenate all event blocks. If for MG LO LHEs, recalculate the per-event "
347  "XWGTUP and all <wgt> tags based on the new XSECUP, #event, and 'event_norm' read from the MG "
348  "run card.\n"
349  "For further development of this script please always validate the merging result on the test "
350  "routines: https://github.com/colizz/mergelhe_validate\n"
351  "Example usage:\n"
352  " mergeLHE.py -i 'thread*/*.lhe,another_file/another.lhe' -o output.lhe"),
353  formatter_class=argparse.RawTextHelpFormatter)
354  parser.add_argument("-i", "--input-files", type=str,
355  help="Input LHE file paths separated by commas. Shell-type wildcards are supported.")
356  parser.add_argument("-o", "--output-file",
357  default='output.lhe', type=str,
358  help="Output LHE file path.")
359  parser.add_argument("--force-mglo-merger", action='store_true',
360  help=("Force to use the merger script dedicated for MG5 LO LHEs, as introduced in "
361  "https://github.com/cms-sw/genproductions/blob/master/bin/MadGraph5_aMCatNLO/Utilities/merge.pl"))
362  parser.add_argument("--force-cpp-merger", action='store_true',
363  help=("Force to use the external mergeLheFiles.cpp file to merge LHE files, as introduced in "
364  "https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideSubgroupMC#1_2_Using_pLHE_campaigns"))
365  parser.add_argument("-b", "--bypass-check", action='store_true',
366  help=("Bypass the compatibility check for the headers. If true, the header and init block "
367  "will be just a duplicate from the first input file, and events are concatenated without "
368  "modification."))
369  parser.add_argument("--debug", action='store_true',
370  help="Use the debug mode.")
371  args = parser.parse_args(argv)
372 
373  logging.basicConfig(
374  format='[%(levelname)s] %(message)s',
375  level=logging.INFO if not args.debug else DEBUG)
376  logging.info('>>> launch mergeLHE.py in %s' % os.path.abspath(os.getcwd()))
377 
378  # Extract input LHE files from the path
379  assert len(args.input_files), \
380  ('Please specify your input LHE files by -i/--input-files. '
381  'Run \'mergeLHE.py -h\' for details.')
382  input_files = [] # each individual input file
383  for path in args.input_files.split(','):
384  find_files = glob.glob(path)
385  if len(find_files) == 0:
386  logging.info('Warning: cannot find files in %s' % path)
387  input_files += find_files
388  input_files.sort()
389  logging.info('>>> Merge %d files: [%s]' % (len(input_files), ', '.join(input_files)))
390  logging.info('>>> Write to output: %s ' % args.output_file)
391 
392  if not os.path.exists(os.path.dirname(os.path.realpath(args.output_file))):
393  os.makedirs(os.path.dirname(os.path.realpath(args.output_file)))
394 
395  # Check arguments
396  assert len(input_files) > 0, 'Input LHE files should be more than 0.'
397  if len(input_files) == 1:
398  logging.warning('Input LHE only has 1 file. Will copy this file to the destination.')
399  import shutil
400  shutil.copy(input_files[0], args.output_file)
401  return
402  assert [args.force_mglo_merger, args.force_cpp_merger].count(True) <= 1, \
403  "Can only specify at most one from --force-mglo-merger or --force-cpp-merger."
404 
405  # Determine the merging scheme
406  if args.force_mglo_merger:
407  lhe_merger = MG5LOLHEMerger(input_files, args.output_file)
408  elif args.force_cpp_merger:
409  lhe_merger = ExternalCppLHEMerger(input_files, args.output_file)
410  else:
411  lhe_merger = DefaultLHEMerger(input_files, args.output_file, bypass_check=args.bypass_check)
412 
413  # Do merging
414  lhe_merger.merge()
415 
416 
417 if __name__=="__main__":
418  main()
def __init__(self, input_files, output_file)
Definition: mergeLHE.py:15
def merge_init_blocks(self)
Definition: mergeLHE.py:112
def main(argv=None)
Definition: mergeLHE.py:325
bool any(const std::vector< T > &v, const T &what)
Definition: ECalSD.cc:37
def check_header_compatibility(self)
Definition: mergeLHE.py:46
def replace(string, replacements)
def __init__(self, input_files, output_file, kwargs)
Definition: mergeLHE.py:284
OutputIterator zip(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp)
def __init__(self, input_files, output_file, kwargs)
Definition: mergeLHE.py:304
void add(std::map< std::string, TH1 * > &h, TH1 *hist)
static std::string join(char **cmd)
Definition: RemoteFile.cc:18
def __init__(self, input_files, output_file, kwargs)
Definition: mergeLHE.py:27
Definition: main.py:1
def file_iterator(self, path)
Definition: mergeLHE.py:40
#define str(s)
double split
Definition: MVATrainer.cc:139
How EventSelector::AcceptEvent() decides whether to accept an event for output otherwise it is excluding the probing of A single or multiple positive and the trigger will pass if any such matching triggers are PASS or EXCEPTION[A criterion thatmatches no triggers at all is detected and causes a throw.] A single negative with an expectation of appropriate bit checking in the decision and the trigger will pass if any such matching triggers are FAIL or EXCEPTION A wildcarded negative criterion that matches more than one trigger in the trigger list("!*","!HLTx*"if it matches 2 triggers or more) will accept the event if all the matching triggers are FAIL.It will reject the event if any of the triggers are PASS or EXCEPTION(this matches the behavior of"!*"before the partial wildcard feature was incorporated).Triggers which are in the READY state are completely ignored.(READY should never be returned since the trigger paths have been run