CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
DatacardParser.py
Go to the documentation of this file.
1 import re
2 from sys import stderr
3 
# Compiled pattern matching a fixed family of nuisance-parameter names:
# lumi, pdf_*, QCDscale_*, UEPS, FakeRate and CMS_(eff|fake|trigger|scale|res)_*.
# NOTE(review): not referenced anywhere else in the visible part of this file;
# presumably consumed by importers of this module -- confirm before changing.
globalNuisances = re.compile(
    '(lumi|pdf_(qqbar|gg|qg)|QCDscale_(ggH|qqH|VH|ggH1in|ggH2in|VV)|UEPS|FakeRate|CMS_(eff|fake|trigger|scale|res)_([gemtjb]|met))'
)
5 
def addDatacardParserOptions(parser):
    """Register the datacard-related command-line options on `parser`.

    The function header was missing from this listing; the name is taken from
    the definition index at the end of the file and the parameter name from
    the body, which only ever calls `parser.add_option(...)`.

    parser -- an object exposing an optparse-style `add_option` method
              (e.g. `optparse.OptionParser`).

    Returns None; the options are added to `parser` in place.
    """
    parser.add_option("-s", "--stat", dest="stat", default=False, action="store_true", help="keep only statistical uncertainties, no systematics")
    parser.add_option("-f", "--fix-pars", dest="fixpars", default=False, action="store_true", help="fix all floating parameters of the pdfs except for the POI")
    parser.add_option("-c", "--compiled", dest="cexpr", default=False, action="store_true", help="use compiled expressions (not suggested)")
    # -a and -b write the same destination ('bin'); the last one given wins.
    parser.add_option("-a", "--ascii", dest="bin", default=True, action="store_false", help="produce a Workspace in a rootfile in an HLF file (legacy, unsupported)")
    parser.add_option("-b", "--binary", dest="bin", default=True, action="store_true", help="produce a Workspace in a rootfile (default)")
    parser.add_option("-o", "--out", dest="out", default=None, type="string", help="output file (if none, it will print to stdout). Required for binary mode.")
    parser.add_option("-v", "--verbose", dest="verbose", default=0, type="int", help="Verbosity level (0 = quiet, 1 = verbose, 2+ = more)")
    parser.add_option("-m", "--mass", dest="mass", default=0, type="float", help="Higgs mass to use. Will also be written in the Workspace as RooRealVar 'MH'.")
    parser.add_option("-D", "--dataset", dest="dataname", default="data_obs", type="string", help="Name of the observed dataset")
    # No default given: 'libs' is None unless -L is used at least once.
    parser.add_option("-L", "--LoadLibrary", dest="libs", type="string", action="append", help="Load these libraries")
    parser.add_option("--poisson", dest="poisson", default=0, type="int", help="If set to a positive number, binned datasets wih more than this number of entries will be generated using poissonians")
    parser.add_option("--default-morphing", dest="defMorph", type="string", default="shape2N", help="Default template morphing algorithm (to be used when the datacard has just 'shape')")
    parser.add_option("--X-exclude-nuisance", dest="nuisancesToExclude", type="string", action="append", default=[], help="Exclude nuisances that match these regular expressions.")
    parser.add_option("--X-force-simpdf", dest="forceSimPdf", default=False, action="store_true", help="FOR DEBUG ONLY: Always produce a RooSimultaneous, even for single channels.")
    parser.add_option("--X-no-check-norm", dest="noCheckNorm", default=False, action="store_true", help="FOR DEBUG ONLY: Turn off the consistency check between datacard norms and shape norms. Will give you nonsensical results if you have shape uncertainties.")
    parser.add_option("--X-no-jmax", dest="noJMax", default=False, action="store_true", help="FOR DEBUG ONLY: Turn off the consistency check between jmax and number of processes.")
23 
24 
class Datacard():
    """Plain container for the content of a parsed datacard.

    Every attribute starts out empty/False and is filled in by parseCard().
    """
    def __init__(self):
        self.bins = []        # list of bin (channel) names
        self.obs = []         # empty or map bin -> value
        self.processes = []   # list of process names
        self.signals = []     # names of the processes flagged as signal
        self.isSignal = {}    # map process -> True (signal) / False (background)
        self.keyline = []     # list of (bin, process, isSignal), one per datacard column
        self.exp = {}         # map bin -> (process -> value)
        self.systs = []       # list (name, nofloat, pdf, args, errline)
                              # errline: map bin -> (process -> value)
        self.shapeMap = {}    # map channel -> (process -> [fname, hname, hname_syst])
        self.hasShape = False   # kept for backward compatibility with existing readers
        # parseCard() stores its result in 'hasShapes' (with the trailing 's'),
        # so declare that spelling too instead of creating it on the fly.
        self.hasShapes = False
        # parseCard() records 'flatParam' nuisances here (name -> True); without
        # this initialisation that assignment fails with AttributeError.
        self.flatParamNuisances = {}
37 
def isVetoed(name, vetoList):
    """Return True if `name` matches (from its start) any non-empty pattern in `vetoList`.

    Empty or otherwise falsy patterns are skipped; an empty list vetoes nothing.
    """
    return any(re.match(expr, name) for expr in vetoList if expr)
43 
def parseCard(file, options):
    """Parse a text datacard and return it as a Datacard object.

    file    -- iterable of text lines: an open file, a stream or a list of
               lines. A plain string is rejected.
    options -- object providing the attributes read here: bin, noJMax,
               nuisancesToExclude, verbose and stat.

    The header (imax/jmax/kmax, shapes, observation, bin, process, rate) is
    read up to and including the 'rate' line; everything after it is read as
    nuisance lines. Nuisances without any effect are dropped and the number of
    remaining ones is checked against kmax when kmax was given.

    Raises RuntimeError on malformed or inconsistent datacards.
    """
    if isinstance(file, str):
        raise RuntimeError("You should pass as argument to parseCards a file object, stream or a list of lines, not a string")
    ret = Datacard()
    # The 'flatParam' branch below needs this map; create it if the Datacard
    # object does not provide one.
    if not hasattr(ret, "flatParamNuisances"):
        ret.flatParamNuisances = {}
    # Use ONE iterator for both loops: the nuisance loop must resume right
    # after the 'rate' line. Iterating a list twice would restart from the
    # top and re-read the header as if it were nuisance lines.
    lines = iter(file)
    nbins = -1        # -1 means "not declared" (or declared as '*')
    nprocesses = -1
    nuisances = -1
    binline = []; processline = []; sigline = []
    for l in lines:
        f = l.split()
        if not f: continue
        key = f[0]
        if key == "imax":
            nbins = int(f[1]) if f[1] != "*" else -1
        elif key == "jmax":
            # jmax is the number of processes minus one
            nprocesses = int(f[1]) + 1 if f[1] != "*" else -1
        elif key == "kmax":
            nuisances = int(f[1]) if f[1] != "*" else -1
        elif key == "shapes":
            if not options.bin: raise RuntimeError("Can use shapes only with binary output mode")
            if len(f) < 4: raise RuntimeError("Malformed shapes line")
            channelMap = ret.shapeMap.setdefault(f[2], {})
            if f[1] in channelMap: raise RuntimeError("Duplicate definition for process '%s', channel '%s'" % (f[1], f[2]))
            channelMap[f[1]] = f[3:]
        elif key == "Observation" or key == "observation":
            ret.obs = [float(x) for x in f[1:]]
            if nbins == -1: nbins = len(ret.obs)
            if len(ret.obs) != nbins: raise RuntimeError("Found %d observations but %d bins have been declared" % (len(ret.obs), nbins))
            if binline != []:
                # a 'bin' line came before the observations: it names the bins
                if len(binline) != len(ret.obs): raise RuntimeError("Found %d bins (%s) but %d bins have been declared" % (len(binline), binline, nbins))
                ret.bins = binline
                ret.obs = dict(zip(ret.bins, ret.obs))
                binline = []
        elif key == "bin":
            # names starting with a digit get a "bin" prefix
            binline = [("bin" + b) if re.match("[0-9]+", b) else b for b in f[1:]]
        elif key == "process":
            if processline == []:  # first line contains names
                processline = f[1:]
                if len(binline) != len(processline): raise RuntimeError("'bin' line has a different length than 'process' line.")
                continue
            sigline = f[1:]  # second line contains ids
            # accept the two 'process' lines in either order (ids first, names second)
            if sigline and re.match("-?[0-9]+", processline[0]) and not re.match("-?[0-9]+", sigline[0]):
                processline, sigline = sigline, processline
            if len(sigline) != len(processline): raise RuntimeError("The two 'process' lines (names and ids) have different lengths.")
            hadBins = (len(ret.bins) > 0)
            for i, b in enumerate(binline):
                p = processline[i]
                s = (int(sigline[i]) <= 0)  # <=0 for signals, >0 for backgrounds
                ret.keyline.append((b, p, s))
                if b not in ret.bins:
                    if hadBins: raise RuntimeError("Bin %s not among the declared bins %s" % (b, ret.bins))
                    ret.bins.append(b)
                if p not in ret.processes: ret.processes.append(p)
            if nprocesses == -1: nprocesses = len(ret.processes)
            if nbins == -1: nbins = len(ret.bins)
            # NOTE(review): nesting reconstructed from a listing without
            # indentation; only the jmax check is assumed to be switchable,
            # as the --X-no-jmax help text describes.
            if not options.noJMax:
                if nprocesses != len(ret.processes): raise RuntimeError("Found %d processes (%s), declared jmax = %d" % (len(ret.processes), ret.processes, nprocesses))
            if nbins != len(ret.bins): raise RuntimeError("Found %d bins (%s), declared imax = %d" % (len(ret.bins), ret.bins, nbins))
            ret.exp = dict((b, {}) for b in ret.bins)
            ret.isSignal = dict((p, None) for p in ret.processes)
            if ret.obs != [] and isinstance(ret.obs, list):  # still as list, must change into map with bin names
                ret.obs = dict((b, ret.obs[i]) for i, b in enumerate(ret.bins))
            for (b, p, s) in ret.keyline:
                if ret.isSignal[p] is None:
                    ret.isSignal[p] = s
                elif ret.isSignal[p] != s:
                    raise RuntimeError("Process %s is declared as signal in some bin and as background in some other bin" % p)
            # follow ret.processes so the order does not depend on dict iteration
            ret.signals = [p for p in ret.processes if ret.isSignal[p] == True]
            if len(ret.signals) == 0: raise RuntimeError("You must have at least one signal process (id <= 0)")
        elif key == "rate":
            if processline == []: raise RuntimeError("Missing line with process names before rate line")
            if sigline == []: raise RuntimeError("Missing line with process id before rate line")
            rates = f[1:]
            if len(rates) != len(ret.keyline): raise RuntimeError("Malformed rate line: length %d, while bins and process lines have length %d" % (len(rates), len(ret.keyline)))
            for (b, p, s), r in zip(ret.keyline, rates):
                ret.exp[b][p] = float(r)
            break  # rate is the last line before nuisances
    # parse nuisances (continues on the same iterator, right after 'rate')
    for l in lines:
        if l.startswith("--"): continue
        l = re.sub(r"\s*#.*", "", l)                 # strip trailing comments
        l = re.sub(r"(?<=\s)-+(\s|$)", r" 0\1", l)   # a column of dashes means "no effect" (0)
        f = l.split()
        if len(f) <= 1: continue
        lsyst = f[0]; pdf = f[1]; args = []; numbers = f[2:]
        nofloat = lsyst.endswith("[nofloat]")
        if nofloat:
            lsyst = lsyst.replace("[nofloat]", "")
        if options.nuisancesToExclude and isVetoed(lsyst, options.nuisancesToExclude):
            if options.verbose > 0: stderr.write("Excluding nuisance %s selected by a veto pattern among %s\n" % (lsyst, options.nuisancesToExclude))
            if nuisances != -1: nuisances -= 1
            continue
        if re.match("[0-9]+", lsyst): lsyst = "theta" + lsyst
        if pdf in ("lnN", "lnU", "gmM", "trG") or pdf.startswith("shape"):
            pass  # nothing special to do
        elif pdf == "gmN":
            args = [int(f[2])]; numbers = f[3:]
        elif pdf == "unif":
            args = [float(f[2]), float(f[3])]; numbers = f[4:]
        elif pdf == "param":
            # for parametric uncertainties, there's no line to account per bin/process effects
            # just assume everything else is an argument and move on
            args = f[2:]
            if len(args) <= 1: raise RuntimeError("Uncertainties of type 'param' must have at least two arguments (mean and sigma)")
            ret.systs.append([lsyst, nofloat, pdf, args, []])
            continue
        elif pdf == "flatParam":
            ret.flatParamNuisances[lsyst] = True
            # for flat parametric uncertainties, code already does the right thing as long as they are non-constant RooRealVars linked to the model
            continue
        else:
            raise RuntimeError("Unsupported pdf %s" % pdf)
        if len(numbers) < len(ret.keyline): raise RuntimeError("Malformed systematics line %s of length %d: while bins and process lines have length %d" % (lsyst, len(numbers), len(ret.keyline)))
        errline = dict((b, {}) for b in ret.bins)
        for (b, p, s), r in zip(ret.keyline, numbers):
            if "/" in r:  # "number/number"
                if (pdf not in ["lnN", "lnU"]) and ("?" not in pdf): raise RuntimeError("Asymmetric errors are allowed only for Log-normals")
                errline[b][p] = [float(x) for x in r.split("/")]
            else:
                errline[b][p] = float(r)
            # set the rate to epsilon for backgrounds with zero observed sideband events.
            if pdf == "gmN" and ret.exp[b][p] == 0 and float(r) != 0: ret.exp[b][p] = 1e-6
        ret.systs.append([lsyst, nofloat, pdf, args, errline])
    # check if there are bins with no rate
    for b in ret.bins:
        hasSignal = False
        hasBackground = False
        for (b1, p, s) in ret.keyline:
            if b1 != b or ret.exp[b][p] == 0: continue
            if s == True:
                hasSignal = True
            else:
                hasBackground = True
        if not (hasSignal or hasBackground): raise RuntimeError("Bin %s has no processes contributing to it" % b)
        if not hasSignal: raise RuntimeError("Bin %s has no signal processes contributing to it" % b)
        if not hasBackground: raise RuntimeError("Bin %s has no background processes contributing to it" % b)
    # cleanup systematics that have no effect to avoid zero derivatives
    syst2 = []
    for lsyst, nofloat, pdf, args, errline in ret.systs:
        if pdf == "param":  # this doesn't have an errline
            syst2.append((lsyst, nofloat, pdf, args, errline))
            continue
        nonNullEntries = 0
        for (b, p, s) in ret.keyline:
            r = errline[b][p]
            nullEffect = (r == 0.0 or (pdf == "lnN" and r == 1.0))
            if not nullEffect and ret.exp[b][p] != 0: nonNullEntries += 1  # is this a zero background?
        if nonNullEntries != 0:
            syst2.append((lsyst, nofloat, pdf, args, errline))
        elif nuisances != -1:
            nuisances -= 1  # remove from count of nuisances, since we skipped it
    ret.systs = syst2
    # remove them if options.stat asks so
    if options.stat:
        nuisances = 0
        ret.systs = []
    # check number of nuisances
    if nuisances == -1:
        nuisances = len(ret.systs)
    elif len(ret.systs) != nuisances:
        raise RuntimeError("Found %d systematics, expected %d" % (len(ret.systs), nuisances))
    # set boolean to know about shape
    ret.hasShapes = (len(ret.shapeMap) > 0)
    # return result
    return ret
def addDatacardParserOptions