CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
ProcNormalize.cc
Go to the documentation of this file.
1 #include <algorithm>
2 #include <iterator>
3 #include <iostream>
4 #include <iomanip>
5 #include <cstring>
6 #include <vector>
7 #include <string>
8 #include <memory>
9 #include <map>
10 
11 #include <xercesc/dom/DOM.hpp>
12 
13 #include <TH1.h>
14 
16 
18 
24 
25 XERCES_CPP_NAMESPACE_USE
26 
27 using namespace PhysicsTools;
28 
29 namespace { // anonymous
30 
// Trainer-side processor that accumulates one binned distribution ("PDF")
// per input variable and per category. The value range of each PDF is either
// taken from the configuration or determined from the training data in an
// extra pass (see trainData() / trainEnd()).
//
// NOTE(review): this listing references `Registry` (used by the file-scope
// `registry` object) and a `PDF::range` member (used throughout), but neither
// declaration is present in the visible text -- the corresponding listing
// lines appear to have been dropped by the extraction. Confirm against the
// upstream file.
31 class ProcNormalize : public TrainProcessor {
32  public:
34 
35  ProcNormalize(const char *name, const AtomicId *id,
36  MVATrainer *trainer);
37  virtual ~ProcNormalize();
38 
// Parse the XML configuration (optional <category>, one <pdf> per input).
39  virtual void configure(DOMElement *elem) override;
// Build the calibration object from the accumulated PDFs.
40  virtual Calibration::VarProcessor *getCalibration() const override;
41 
// Training callbacks: trainData() is invoked per event, trainEnd() advances
// each PDF to its next Iteration state.
42  virtual void trainBegin() override;
43  virtual void trainData(const std::vector<double> *values,
44  bool target, double weight) override;
45  virtual void trainEnd() override;
46 
// Persist / restore the PDFs as an XML training file.
47  virtual bool load() override;
48  virtual void save() override;
49 
50  private:
// Per-PDF training state:
//   ITER_EMPTY - no value seen yet, range not initialised
//   ITER_RANGE - range is being widened from the training values
//   ITER_FILL  - range is fixed, weights are being added to the bins
//   ITER_DONE  - bins are final (smoothed in trainEnd() or read by load())
51  enum Iteration {
52  ITER_EMPTY,
53  ITER_RANGE,
54  ITER_FILL,
55  ITER_DONE
56  };
57 
58  struct PDF {
// Conversion to a calibration histogram: one bin per entry of `distr`,
// written with 1-based bin numbers.
59  operator Calibration::HistogramF() const
60  {
61  Calibration::HistogramF histo(distr.size(), range);
62  for(unsigned int i = 0; i < distr.size(); i++)
63  histo.setBinContent(i + 1, distr[i]);
64  return histo;
65  }
66 
// Number of smoothing passes applied to `distr` in trainEnd().
67  unsigned int smooth;
// Bin contents (sum of event weights per bin).
68  std::vector<double> distr;
// Current training state of this PDF.
70  Iteration iteration;
// Whether signal (target == true) / background events are filled.
71  bool fillSignal;
72  bool fillBackground;
73  };
74 
// One PDF per (input variable, category); the index is
// variable * nCategories + category (see the idx % / idx / nCategories
// arithmetic in trainEnd(), load() and save()).
75  std::vector<PDF> pdfs;
// Position of the category variable among the inputs, or -1 if there is none.
76  int categoryIdx;
// Number of categories; 1 unless a <category count="..."> is configured.
77  unsigned int nCategories;
78 };
79 
// File-scope Registry object constructed with the name "ProcNormalize".
// NOTE(review): presumably this is what makes the processor known to the
// trainer framework under that name -- confirm against the Registry template.
80 static ProcNormalize::Registry registry("ProcNormalize");
81 
82 ProcNormalize::ProcNormalize(const char *name, const AtomicId *id,
83  MVATrainer *trainer) :
84  TrainProcessor(name, id, trainer),
85  categoryIdx(-1),
86  nCategories(1)
87 {
88 }
89 
90 ProcNormalize::~ProcNormalize()
91 {
92 }
93 
94 void ProcNormalize::configure(DOMElement *elem)
95 {
96  int i = 0;
97  for(DOMNode *node = elem->getFirstChild();
98  node; node = node->getNextSibling()) {
99  if (node->getNodeType() != DOMNode::ELEMENT_NODE)
100  continue;
101 
102  DOMElement *elem = static_cast<DOMElement*>(node);
103 
104  XMLSimpleStr nodeName(node->getNodeName());
105 
106  if (std::strcmp(nodeName, "category") != 0) {
107  i++;
108  continue;
109  }
110 
111  if (categoryIdx >= 0)
112  throw cms::Exception("ProcNormalize")
113  << "More than one category variable given."
114  << std::endl;
115 
116 
117  unsigned int count = XMLDocument::readAttribute<unsigned int>(
118  elem, "count");
119 
120  categoryIdx = i;
121  nCategories = count;
122  }
123 
124  for(DOMNode *node = elem->getFirstChild();
125  node; node = node->getNextSibling()) {
126  if (node->getNodeType() != DOMNode::ELEMENT_NODE)
127  continue;
128 
129  XMLSimpleStr nodeName(node->getNodeName());
130  if (std::strcmp(nodeName, "category") == 0)
131  continue;
132 
133  if (std::strcmp(nodeName, "pdf") != 0)
134  throw cms::Exception("ProcNormalize")
135  << "Expected pdf tag in config section."
136  << std::endl;
137  elem = static_cast<DOMElement*>(node);
138 
139  PDF pdf;
140 
141  pdf.distr.resize(XMLDocument::readAttribute<unsigned int>(
142  elem, "size", 100));
143 
144  pdf.smooth = XMLDocument::readAttribute<unsigned int>(
145  elem, "smooth", 40);
146 
147  pdf.fillSignal =
148  XMLDocument::readAttribute<bool>(elem, "signal", true);
149  pdf.fillBackground =
150  XMLDocument::readAttribute<bool>(elem, "background", true);
151 
152  if (!pdf.fillSignal && !pdf.fillBackground)
153  throw cms::Exception("ProcNormalize")
154  << "Filling neither background nor signal "
155  "in config." << std::endl;
156 
157  if (XMLDocument::hasAttribute(elem, "lower") &&
158  XMLDocument::hasAttribute(elem, "upper")) {
159  pdf.range.min = XMLDocument::readAttribute<double>(
160  elem, "lower");
161  pdf.range.max = XMLDocument::readAttribute<double>(
162  elem, "upper");
163  pdf.iteration = ITER_FILL;
164  } else
165  pdf.iteration = ITER_EMPTY;
166 
167  for(unsigned int i = 0; i < nCategories; i++)
168  pdfs.push_back(pdf);
169  }
170 
171  unsigned int nInputs = getInputs().size();
172  if (categoryIdx >= 0)
173  nInputs--;
174 
175  if (pdfs.size() != nInputs * nCategories)
176  throw cms::Exception("ProcNormalize")
177  << "Got " << pdfs.size()
178  << " pdf configs in total for " << nCategories
179  << " categories and " << nInputs
180  << " input varibles (" << (nInputs * nCategories) << " in total)." << std::endl;
181 }
182 
// Build the calibration object from the trained PDFs and return it.
//
// NOTE(review): `calib` is used below but its declaration is not present in
// the visible text (the listing line between the opening brace and the first
// statement is missing). Its type and ownership cannot be determined from
// here -- confirm against the upstream file.
183 Calibration::VarProcessor *ProcNormalize::getCalibration() const
184 {
186 
// `pdfs` is stored as variable * nCategories + category. pdfMap lists the
// same indices grouped by category instead: for category i it collects
// i, i + nCategories, i + 2 * nCategories, ...
187  std::vector<unsigned int> pdfMap;
188  for(unsigned int i = 0; i < nCategories; i++)
189  for(unsigned int j = i; j < pdfs.size(); j += nCategories)
190  pdfMap.push_back(j);
191 
// Append the PDFs in that category-grouped order.
// NOTE(review): presumably each PDF is converted through
// PDF::operator Calibration::HistogramF() here -- confirm the element type
// of calib->distr.
192  for(unsigned int i = 0; i < pdfs.size(); i++)
193  calib->distr.push_back(pdfs[pdfMap[i]]);
194 
195  calib->categoryIdx = categoryIdx;
196 
197  return calib;
198 }
199 
200 void ProcNormalize::trainBegin()
201 {
202 }
203 
204 void ProcNormalize::trainData(const std::vector<double> *values,
205  bool target, double weight)
206 {
207  int category = 0;
208  if (categoryIdx >= 0)
209  category = (int)values[categoryIdx].front();
210  if (category < 0 || category >= (int)nCategories)
211  return;
212 
213  int i = 0;
214  for(std::vector<PDF>::iterator iter = pdfs.begin() + category;
215  iter < pdfs.end(); iter += nCategories, values++) {
216  if (i++ == categoryIdx)
217  values++;
218 
219  switch(iter->iteration) {
220  case ITER_EMPTY:
221  for(std::vector<double>::const_iterator value =
222  values->begin();
223  value != values->end(); value++) {
224  iter->range.min = iter->range.max = *value;
225  iter->iteration = ITER_RANGE;
226  break;
227  }
228  case ITER_RANGE:
229  for(std::vector<double>::const_iterator value =
230  values->begin();
231  value != values->end(); value++) {
232  iter->range.min = std::min(iter->range.min,
233  *value);
234  iter->range.max = std::max(iter->range.max,
235  *value);
236  }
237  continue;
238  case ITER_FILL:
239  break;
240  default:
241  continue;
242  }
243 
244  if (!(target ? iter->fillSignal : iter->fillBackground))
245  continue;
246 
247  unsigned int n = iter->distr.size() - 1;
248  double mult = 1.0 / iter->range.width();
249 
250  for(std::vector<double>::const_iterator value =
251  values->begin(); value != values->end(); value++) {
252  double x = (*value - iter->range.min) * mult;
253  if (x < 0.0)
254  x = 0.0;
255  else if (x >= 1.0)
256  x = 1.0;
257 
258  iter->distr[(unsigned int)(x * n + 0.5)] += weight;
259  }
260  }
261 }
262 
// Smooth an array in place by repeated nearest-neighbour diffusion.
//
// n       number of entries in `values`
// values  array to smooth (modified in place); may be null only if n == 0
// nTimes  number of smoothing passes
//
// In each pass every entry hands 10% of the value it had at the start of the
// pass to each existing neighbour and loses the same amount itself, so the
// sum of the array is conserved. Edge entries have a single neighbour and
// therefore give away only 10% in total.
static void smoothArray(unsigned int n, double *values, unsigned int nTimes)
{
    if (n == 0 || values == nullptr)
        return;

    const unsigned int last = n - 1;
    for(unsigned int pass = 0; pass < nTimes; ++pass) {
        // `original` always holds the pre-pass value of entry i: it is read
        // from values[i + 1] just before that entry receives its share.
        double original = values[0];
        for(unsigned int i = 0; i < n; ++i) {
            const double share = original * 0.1;
            double removed = 0.0;
            if (i > 0) {
                values[i - 1] += share;
                removed -= share;
            }
            if (i < last) {
                original = values[i + 1];
                values[i + 1] += share;
                removed -= share;
            }
            values[i] += removed;
        }
    }
}
283 
284 void ProcNormalize::trainEnd()
285 {
286  bool done = true;
287  for(std::vector<PDF>::iterator iter = pdfs.begin();
288  iter != pdfs.end(); iter++) {
289  switch(iter->iteration) {
290  case ITER_EMPTY:
291  case ITER_RANGE:
292  iter->iteration = ITER_FILL;
293  done = false;
294  break;
295  case ITER_FILL:
296  iter->distr.front() *= 2;
297  iter->distr.back() *= 2;
298  smoothArray(iter->distr.size(),
299  &iter->distr.front(),
300  iter->smooth);
301 
302  iter->iteration = ITER_DONE;
303  break;
304  default:
305  /* shut up */;
306  }
307  }
308 
309  if (done)
310  trained = true;
311 
312  if (done && monitoring) {
313  std::vector<SourceVariable*> inputs = getInputs().get();
314  if (categoryIdx >= 0)
315  inputs.erase(inputs.begin() + categoryIdx);
316 
317  for(std::vector<PDF>::iterator iter = pdfs.begin();
318  iter != pdfs.end(); iter++) {
319  unsigned int idx = iter - pdfs.begin();
320  unsigned int catIdx = idx % nCategories;
321  unsigned int varIdx = idx / nCategories;
322  SourceVariable *var = inputs[varIdx];
323  std::string name =
324  (const char*)var->getSource()->getName()
325  + std::string("_")
326  + (const char*)var->getName();
328  if (categoryIdx >= 0) {
329  name += Form("_CAT%d", catIdx);
330  title += Form(" (cat. %d)", catIdx);
331  }
332 
333  unsigned int n = iter->distr.size() - 1;
334  double min = iter->range.min -
335  0.5 * iter->range.width() / n;
336  double max = iter->range.max +
337  0.5 * iter->range.width() / n;
338  TH1F *histo = monitoring->book<TH1F>(name + "_pdf",
339  name.c_str(), title.c_str(), n + 1, min, max);
340  for(unsigned int i = 0; i < n; i++)
341  histo->SetBinContent(i + 1, iter->distr[i]);
342  }
343  }
344 }
345 
// Key identifying one PDF by (source name, variable name, category); used as
// the std::map key in ProcNormalize::load().
//
// NOTE(review): a `source` member is initialised and compared below, but its
// declaration is not present in the visible text (the listing line before
// `name` is missing). It is presumably an AtomicId like `name` -- confirm
// against the upstream file.
346 namespace {
347  struct Id {
349  AtomicId name;
350  unsigned int category;
351 
352  inline Id(AtomicId source, AtomicId name,
353  unsigned int category) :
354  source(source), name(name), category(category) {}
355 
// Equal only if all three components match.
356  inline bool operator == (const Id &other) const
357  {
358  return source == other.source &&
359  name == other.name &&
360  category == other.category;
361  }
362 
// Lexicographic ordering on (source, name, category), written using only
// operator< and operator== of the components.
363  inline bool operator < (const Id &other) const
364  {
365  if (source < other.source)
366  return true;
367  if (!(source == other.source))
368  return false;
369  if (name < other.name)
370  return true;
371  if (!(name == other.name))
372  return false;
373  return category < other.category;
374  }
375  };
376 }
377 
378 bool ProcNormalize::load()
379 {
380  std::string filename = trainer->trainFileName(this, "xml");
381  if (!exists(filename))
382  return false;
383 
384  XMLDocument xml(filename);
385  DOMElement *elem = xml.getRootNode();
386  if (std::strcmp(XMLSimpleStr(elem->getNodeName()),
387  "ProcNormalize") != 0)
388  throw cms::Exception("ProcNormalize")
389  << "XML training data file has bad root node."
390  << std::endl;
391 
392  unsigned int version = XMLDocument::readAttribute<unsigned int>(
393  elem, "version", 1);
394 
395  if (version < 1 || version > 2)
396  throw cms::Exception("ProcNormalize")
397  << "Unsupported version " << version
398  << "in train file." << std::endl;
399 
400  std::map<Id, PDF*> pdfMap;
401 
402  for(std::vector<PDF>::iterator iter = pdfs.begin();
403  iter != pdfs.end(); ++iter) {
404  PDF *ptr = &*iter;
405  unsigned int idx = iter - pdfs.begin();
406  unsigned int catIdx = idx % nCategories;
407  unsigned int varIdx = idx / nCategories;
408  if (categoryIdx >= 0 && (int)varIdx >= categoryIdx)
409  varIdx++;
410  const SourceVariable *var = getInputs().get()[varIdx];
411  Id id(var->getSource()->getName(), var->getName(), catIdx);
412 
413  pdfMap[id] = ptr;
414  }
415 
416  std::vector<PDF>::iterator cur = pdfs.begin();
417 
418  for(DOMNode *node = elem->getFirstChild();
419  node; node = node->getNextSibling()) {
420  if (node->getNodeType() != DOMNode::ELEMENT_NODE)
421  continue;
422 
423  if (std::strcmp(XMLSimpleStr(node->getNodeName()), "pdf") != 0)
424  throw cms::Exception("ProcNormalize")
425  << "Expected pdf tag in train file."
426  << std::endl;
427  elem = static_cast<DOMElement*>(node);
428 
429  PDF *pdf = 0;
430  switch(version) {
431  case 1:
432  if (cur == pdfs.end())
433  throw cms::Exception("ProcNormalize")
434  << "Superfluous PDF in train data."
435  << std::endl;
436  pdf = &*cur++;
437  break;
438  case 2: {
439  Id id(XMLDocument::readAttribute<std::string>(
440  elem, "source"),
441  XMLDocument::readAttribute<std::string>(
442  elem, "name"),
443  XMLDocument::readAttribute<unsigned int>(
444  elem, "category", 0));
445  std::map<Id, PDF*>::const_iterator pos =
446  pdfMap.find(id);
447  if (pos == pdfMap.end())
448  continue;
449  else
450  pdf = pos->second;
451  } break;
452  }
453 
454  pdf->range.min =
455  XMLDocument::readAttribute<double>(elem, "lower");
456  pdf->range.max =
457  XMLDocument::readAttribute<double>(elem, "upper");
458  pdf->iteration = ITER_DONE;
459  pdf->distr.clear();
460 
461  for(DOMNode *subNode = elem->getFirstChild();
462  subNode; subNode = subNode->getNextSibling()) {
463  if (subNode->getNodeType() != DOMNode::ELEMENT_NODE)
464  continue;
465 
466  if (std::strcmp(XMLSimpleStr(subNode->getNodeName()),
467  "value") != 0)
468  throw cms::Exception("ProcNormalize")
469  << "Expected value tag in train file."
470  << std::endl;
471 
472  elem = static_cast<DOMElement*>(node);
473 
474  pdf->distr.push_back(
475  XMLDocument::readContent<double>(subNode));
476  }
477  }
478 
479  if (version == 1 && cur != pdfs.end())
480  throw cms::Exception("ProcNormalize")
481  << "Missing PDF in train data." << std::endl;
482 
483  trained = true;
484  for(std::vector<PDF>::const_iterator iter = pdfs.begin();
485  iter != pdfs.end(); ++iter) {
486  if (iter->iteration != ITER_DONE) {
487  trained = false;
488  break;
489  }
490  }
491 
492  return true;
493 }
494 
495 void ProcNormalize::save()
496 {
497  XMLDocument xml(trainer->trainFileName(this, "xml"), true);
498  DOMDocument *doc = xml.createDocument("ProcNormalize");
499  XMLDocument::writeAttribute(doc->getDocumentElement(), "version", 2);
500 
501  for(std::vector<PDF>::const_iterator iter = pdfs.begin();
502  iter != pdfs.end(); iter++) {
503  DOMElement *elem = doc->createElement(XMLUniStr("pdf"));
504  xml.getRootNode()->appendChild(elem);
505 
506  unsigned int idx = iter - pdfs.begin();
507  unsigned int catIdx = idx % nCategories;
508  unsigned int varIdx = idx / nCategories;
509  if (categoryIdx >= 0 && (int)varIdx >= categoryIdx)
510  varIdx++;
511  const SourceVariable *var = getInputs().get()[varIdx];
512  XMLDocument::writeAttribute(elem, "source",
513  (const char*)var->getSource()->getName());
514  XMLDocument::writeAttribute(elem, "name",
515  (const char*)var->getName());
516  if (categoryIdx >= 0)
517  XMLDocument::writeAttribute(elem, "category", catIdx);
518 
519  XMLDocument::writeAttribute(elem, "lower", iter->range.min);
520  XMLDocument::writeAttribute(elem, "upper", iter->range.max);
521 
522  for(std::vector<double>::const_iterator iter2 =
523  iter->distr.begin(); iter2 != iter->distr.end(); iter2++) {
524  DOMElement *value =
525  doc->createElement(XMLUniStr("value"));
526  elem->appendChild(value);
527 
528  XMLDocument::writeContent<double>(value, doc, *iter2);
529  }
530  }
531 }
532 
533 } // anonymous namespace
dbl * delta
Definition: mlp_gen.cc:36
int i
Definition: DBlmapReader.cc:9
Histogram< float > HistogramF
Definition: Histogram.h:120
Source * getSource() const
template to generate a registry singleton for a type.
tuple node
Definition: Node.py:50
Cheap generic unique keyword identifier class.
Definition: AtomicId.h:31
T x() const
Cartesian x coordinate.
MVATrainerComputer * calib
Definition: MVATrainer.cc:64
bool operator<(const FedChannelConnection &, const FedChannelConnection &)
tuple iteration
Definition: align_cfg.py:5
static bool hasAttribute(XERCES_CPP_NAMESPACE_QUALIFIER DOMElement *elem, const char *name)
Definition: XMLDocument.cc:303
const AtomicId getName() const
Definition: Variable.h:143
bool operator==(const QGLikelihoodParameters &lhs, const QGLikelihoodCategory &rhs)
Test if parameters are compatible with category.
def load
Definition: svgfig.py:546
int j
Definition: DBlmapReader.cc:9
T min(T a, T b)
Definition: MathUtil.h:58
static void writeAttribute(XERCES_CPP_NAMESPACE_QUALIFIER DOMElement *elem, const char *name, const T &value)
tuple idx
DEBUGGING if hasattr(process,"trackMonIterativeTracking2012"): print "trackMonIterativeTracking2012 D...
XERCES_CPP_NAMESPACE_QUALIFIER DOMDocument * createDocument(const std::string &root)
Definition: XMLDocument.cc:266
list save
Definition: cuy.py:1163
AtomicId getName() const
Definition: Source.h:19
tuple filename
Definition: lut2db_cfg.py:20
static Interceptor::Registry registry("Interceptor")
static std::string const source
Definition: EdmProvDump.cc:42
std::vector< HistogramF > distr
Definition: MVAComputer.h:137