CMS 3D CMS Logo

Classes | Functions | Variables

DatacardParser Namespace Reference

Classes

class  Datacard

Functions

def addDatacardParserOptions
def isVetoed
def parseCard

Variables

tuple globalNuisances = re.compile('(lumi|pdf_(qqbar|gg|qg)|QCDscale_(ggH|qqH|VH|ggH1in|ggH2in|VV)|UEPS|FakeRate|CMS_(eff|fake|trigger|scale|res)_([gemtjb]|met))')

Function Documentation

def DatacardParser::addDatacardParserOptions (   parser)

Definition at line 6 of file DatacardParser.py.

00007 def addDatacardParserOptions(parser):
00008     parser.add_option("-s", "--stat",   dest="stat",    default=False, action="store_true", help="keep only statistical uncertainties, no systematics") 
00009     parser.add_option("-f", "--fix-pars", dest="fixpars",default=False, action="store_true", help="fix all floating parameters of the pdfs except for the POI") 
00010     parser.add_option("-c", "--compiled", dest="cexpr", default=False, action="store_true", help="use compiled expressions (not suggested)")
00011     parser.add_option("-a", "--ascii",    dest="bin",   default=True, action="store_false", help="produce a Workspace in a rootfile in an HLF file (legacy, unsupported)")
00012     parser.add_option("-b", "--binary",   dest="bin",   default=True, action="store_true",  help="produce a Workspace in a rootfile (default)")
00013     parser.add_option("-o", "--out",      dest="out",   default=None,  type="string", help="output file (if none, it will print to stdout). Required for binary mode.")
00014     parser.add_option("-v", "--verbose",  dest="verbose",  default=0,  type="int",    help="Verbosity level (0 = quiet, 1 = verbose, 2+ = more)")
00015     parser.add_option("-m", "--mass",     dest="mass",     default=0,  type="float",  help="Higgs mass to use. Will also be written in the Workspace as RooRealVar 'MH'.")
00016     parser.add_option("-D", "--dataset",  dest="dataname", default="data_obs",  type="string",  help="Name of the observed dataset")
00017     parser.add_option("-L", "--LoadLibrary", dest="libs",  type="string" , action="append", help="Load these libraries")
00018     parser.add_option("--poisson",  dest="poisson",  default=0,  type="int",    help="If set to a positive number, binned datasets wih more than this number of entries will be generated using poissonians")
00019     parser.add_option("--default-morphing",  dest="defMorph", type="string", default="shape2N", help="Default template morphing algorithm (to be used when the datacard has just 'shape')")
00020     parser.add_option("--X-exclude-nuisance", dest="nuisancesToExclude", type="string", action="append", default=[], help="Exclude nuisances that match these regular expressions.")
00021     parser.add_option("--X-force-simpdf",  dest="forceSimPdf", default=False, action="store_true", help="FOR DEBUG ONLY: Always produce a RooSimultaneous, even for single channels.")
00022     parser.add_option("--X-no-check-norm",  dest="noCheckNorm", default=False, action="store_true", help="FOR DEBUG ONLY: Turn off the consistency check between datacard norms and shape norms. Will give you nonsensical results if you have shape uncertainties.")
00023     parser.add_option("--X-no-jmax",  dest="noJMax", default=False, action="store_true", help="FOR DEBUG ONLY: Turn off the consistency check between jmax and number of processes.")
00024 
    
def DatacardParser::isVetoed (   name,
  vetoList 
)

Definition at line 38 of file DatacardParser.py.

00039 def isVetoed(name,vetoList):
00040     for pattern in vetoList:
00041         if not pattern: continue 
00042         if re.match(pattern,name): return True
00043     return False

def DatacardParser::parseCard (   file,
  options 
)

Definition at line 44 of file DatacardParser.py.

00045 def parseCard(file, options):
00046     if type(file) == type("str"):
00047         raise RuntimeError, "You should pass as argument to parseCards a file object, stream or a list of lines, not a string"
00048     ret = Datacard()
00049     #
00050     nbins      = -1; 
00051     nprocesses = -1; 
00052     nuisances  = -1;
00053     binline = []; processline = []; sigline = []
00054     for l in file:
00055         f = l.split();
00056         if len(f) < 1: continue
00057         if f[0] == "imax": 
00058             nbins = int(f[1]) if f[1] != "*" else -1
00059         if f[0] == "jmax": 
00060             nprocesses = int(f[1])+1 if f[1] != "*" else -1
00061         if f[0] == "kmax": 
00062             nuisances = int(f[1]) if f[1] != "*" else -1
00063         if f[0] == "shapes":
00064             if not options.bin: raise RuntimeError, "Can use shapes only with binary output mode"
00065             if len(f) < 4: raise RuntimeError, "Malformed shapes line"
00066             if not ret.shapeMap.has_key(f[2]): ret.shapeMap[f[2]] = {}
00067             if ret.shapeMap[f[2]].has_key(f[1]): raise RuntimeError, "Duplicate definition for process '%s', channel '%s'" % (f[1], f[2])
00068             ret.shapeMap[f[2]][f[1]] = f[3:]
00069         if f[0] == "Observation" or f[0] == "observation": 
00070             ret.obs = [ float(x) for x in f[1:] ]
00071             if nbins == -1: nbins = len(ret.obs)
00072             if len(ret.obs) != nbins: raise RuntimeError, "Found %d observations but %d bins have been declared" % (len(ret.obs), nbins)
00073             if binline != []:
00074                 if len(binline) != len(ret.obs): raise RuntimeError, "Found %d bins (%s) but %d bins have been declared" % (len(ret.bins), ret.bins, nbins)
00075                 ret.bins = binline
00076                 ret.obs = dict([(b,ret.obs[i]) for i,b in enumerate(ret.bins)])
00077                 binline = []
00078         if f[0] == "bin": 
00079             binline = []
00080             for b in f[1:]:
00081                 if re.match("[0-9]+", b): b = "bin"+b
00082                 binline.append(b)
00083         if f[0] == "process": 
00084             if processline == []: # first line contains names
00085                 processline = f[1:]
00086                 if len(binline) != len(processline): raise RuntimeError, "'bin' line has a different length than 'process' line."
00087                 continue
00088             sigline = f[1:] # second line contains ids
00089             if re.match("-?[0-9]+", processline[0]) and not re.match("-?[0-9]+", sigline[0]):
00090                 (processline,sigline) = (sigline,processline)
00091             if len(sigline) != len(processline): raise RuntimeError, "'bin' line has a different length than 'process' line."
00092             hadBins = (len(ret.bins) > 0)
00093             for i,b in enumerate(binline):
00094                 p = processline[i];
00095                 s = (int(sigline[i]) <= 0) # <=0 for signals, >0 for backgrounds
00096                 ret.keyline.append((b, processline[i], s))
00097                 if hadBins:
00098                     if b not in ret.bins: raise RuntimeError, "Bin %s not among the declared bins %s" % (b, ret.bins)
00099                 else:
00100                     if b not in ret.bins: ret.bins.append(b)
00101                 if p not in ret.processes: ret.processes.append(p)
00102             if nprocesses == -1: nprocesses = len(ret.processes)
00103             if nbins      == -1: nbins      = len(ret.bins)
00104             if not options.noJMax:
00105                 if nprocesses != len(ret.processes): raise RuntimeError, "Found %d processes (%s), declared jmax = %d" % (len(ret.processes),ret.processes,nprocesses)
00106             if nbins      != len(ret.bins):      raise RuntimeError, "Found %d bins (%s), declared imax = %d" % (len(ret.bins),ret.bins,nbins)
00107             ret.exp = dict([(b,{}) for b in ret.bins])
00108             ret.isSignal = dict([(p,None) for p in ret.processes])
00109             if ret.obs != [] and type(ret.obs) == list: # still as list, must change into map with bin names
00110                 ret.obs = dict([(b,ret.obs[i]) for i,b in enumerate(ret.bins)])
00111             for (b,p,s) in ret.keyline:
00112                 if ret.isSignal[p] == None: 
00113                     ret.isSignal[p] = s
00114                 elif ret.isSignal[p] != s:
00115                     raise RuntimeError, "Process %s is declared as signal in some bin and as background in some other bin" % p
00116             ret.signals = [p for p,s in ret.isSignal.items() if s == True]
00117             if len(ret.signals) == 0: raise RuntimeError, "You must have at least one signal process (id <= 0)"
00118         if f[0] == "rate":
00119             if processline == []: raise RuntimeError, "Missing line with process names before rate line" 
00120             if sigline == []:     raise RuntimeError, "Missing line with process id before rate line" 
00121             if len(f[1:]) != len(ret.keyline): raise RuntimeError, "Malformed rate line: length %d, while bins and process lines have length %d" % (len(f[1:]), len(ret.keyline))
00122             for (b,p,s),r in zip(ret.keyline,f[1:]):
00123                 ret.exp[b][p] = float(r)
00124             break # rate is the last line before nuisances
00125     # parse nuisances   
00126     for l in file:
00127         if l.startswith("--"): continue
00128         l  = re.sub("\\s*#.*","",l)
00129         l = re.sub("(?<=\\s)-+(\\s|$)"," 0\\1",l);
00130         f = l.split();
00131         if len(f) <= 1: continue
00132         nofloat = False
00133         lsyst = f[0]; pdf = f[1]; args = []; numbers = f[2:];
00134         if lsyst.endswith("[nofloat]"):
00135           lsyst = lsyst.replace("[nofloat]","")
00136           nofloat = True
00137         if options.nuisancesToExclude and isVetoed(lsyst, options.nuisancesToExclude):
00138             if options.verbose > 0: stderr.write("Excluding nuisance %s selected by a veto pattern among %s\n" % (lsyst, options.nuisancesToExclude))
00139             if nuisances != -1: nuisances -= 1
00140             continue
00141         if re.match("[0-9]+",lsyst): lsyst = "theta"+lsyst
00142         if pdf == "lnN" or pdf == "lnU" or pdf == "gmM" or pdf == "trG" or pdf.startswith("shape"):
00143             pass # nothing special to do
00144         elif pdf == "gmN":
00145             args = [int(f[2])]; numbers = f[3:];
00146         elif pdf == "unif":
00147             args = [float(f[2]), float(f[3])]; numbers = f[4:];
00148         elif pdf == "param":
00149             # for parametric uncertainties, there's no line to account per bin/process effects
00150             # just assume everything else is an argument and move on
00151             args = f[2:]
00152             if len(args) <= 1: raise RuntimeError, "Uncertainties of type 'param' must have at least two arguments (mean and sigma)"
00153             ret.systs.append([lsyst,nofloat,pdf,args,[]])
00154             continue
00155         elif pdf == "flatParam":
00156             ret.flatParamNuisances[lsyst] = True
00157             #for flat parametric uncertainties, code already does the right thing as long as they are non-constant RooRealVars linked to the model
00158             continue
00159         else:
00160             raise RuntimeError, "Unsupported pdf %s" % pdf
00161         if len(numbers) < len(ret.keyline): raise RuntimeError, "Malformed systematics line %s of length %d: while bins and process lines have length %d" % (lsyst, len(numbers), len(ret.keyline))
00162         errline = dict([(b,{}) for b in ret.bins])
00163         nonNullEntries = 0 
00164         for (b,p,s),r in zip(ret.keyline,numbers):
00165             if "/" in r: # "number/number"
00166                 if (pdf not in ["lnN","lnU"]) and ("?" not in pdf): raise RuntimeError, "Asymmetric errors are allowed only for Log-normals"
00167                 errline[b][p] = [ float(x) for x in r.split("/") ]
00168             else:
00169                 errline[b][p] = float(r) 
00170             # set the rate to epsilon for backgrounds with zero observed sideband events.
00171             if pdf == "gmN" and ret.exp[b][p] == 0 and float(r) != 0: ret.exp[b][p] = 1e-6
00172         ret.systs.append([lsyst,nofloat,pdf,args,errline])
00173     # check if there are bins with no rate
00174     for b in ret.bins:
00175         np_bin = sum([(ret.exp[b][p] != 0) for (b1,p,s) in ret.keyline if b1 == b])
00176         ns_bin = sum([(ret.exp[b][p] != 0) for (b1,p,s) in ret.keyline if b1 == b and s == True])
00177         nb_bin = sum([(ret.exp[b][p] != 0) for (b1,p,s) in ret.keyline if b1 == b and s != True])
00178         if np_bin == 0: raise RuntimeError, "Bin %s has no processes contributing to it" % b
00179         if ns_bin == 0: raise RuntimeError, "Bin %s has no signal processes contributing to it" % b
00180         if nb_bin == 0: raise RuntimeError, "Bin %s has no background processes contributing to it" % b
00181     # cleanup systematics that have no effect to avoid zero derivatives
00182     syst2 = []
00183     for lsyst,nofloat,pdf,args,errline in ret.systs:
00184         nonNullEntries = 0 
00185         if pdf == "param": # this doesn't have an errline
00186             syst2.append((lsyst,nofloat,pdf,args,errline))
00187             continue
00188         for (b,p,s) in ret.keyline:
00189             r = errline[b][p]
00190             nullEffect = (r == 0.0 or (pdf == "lnN" and r == 1.0))
00191             if not nullEffect and ret.exp[b][p] != 0: nonNullEntries += 1 # is this a zero background?
00192         if nonNullEntries != 0: syst2.append((lsyst,nofloat,pdf,args,errline))
00193         elif nuisances != -1: nuisances -= 1 # remove from count of nuisances, since we skipped it
00194     ret.systs = syst2
00195     # remove them if options.stat asks so
00196     if options.stat: 
00197         nuisances = 0
00198         ret.systs = []
00199     # check number of nuisances
00200     if nuisances == -1: 
00201         nuisances = len(ret.systs)
00202     elif len(ret.systs) != nuisances: 
00203         raise RuntimeError, "Found %d systematics, expected %d" % (len(ret.systs), nuisances)
00204     # set boolean to know about shape
00205     ret.hasShapes = (len(ret.shapeMap) > 0)
00206     # return result
00207     return ret

Variable Documentation

tuple DatacardParser::globalNuisances = re.compile('(lumi|pdf_(qqbar|gg|qg)|QCDscale_(ggH|qqH|VH|ggH1in|ggH2in|VV)|UEPS|FakeRate|CMS_(eff|fake|trigger|scale|res)_([gemtjb]|met))')

Definition at line 4 of file DatacardParser.py.