CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
ProcNormalize.cc
Go to the documentation of this file.
1 #include <algorithm>
2 #include <iterator>
3 #include <iostream>
4 #include <iomanip>
5 #include <cstring>
6 #include <vector>
7 #include <string>
8 #include <memory>
9 #include <map>
10 
11 #include <xercesc/dom/DOM.hpp>
12 
13 #include <TH1.h>
14 
16 
18 
24 
25 XERCES_CPP_NAMESPACE_USE
26 
27 using namespace PhysicsTools;
28 
29 namespace { // anonymous
30 
31 class ProcNormalize : public TrainProcessor {
32  public:
34 
35  ProcNormalize(const char *name, const AtomicId *id,
36  MVATrainer *trainer);
37  virtual ~ProcNormalize();
38 
39  virtual void configure(DOMElement *elem);
40  virtual Calibration::VarProcessor *getCalibration() const;
41 
42  virtual void trainBegin();
43  virtual void trainData(const std::vector<double> *values,
44  bool target, double weight);
45  virtual void trainEnd();
46 
47  virtual bool load();
48  virtual void save();
49 
50  private:
51  enum Iteration {
52  ITER_EMPTY,
53  ITER_RANGE,
54  ITER_FILL,
55  ITER_DONE
56  };
57 
58  struct PDF {
59  operator Calibration::HistogramF() const
60  {
61  Calibration::HistogramF histo(distr.size(), range);
62  for(unsigned int i = 0; i < distr.size(); i++)
63  histo.setBinContent(i + 1, distr[i]);
64  return histo;
65  }
66 
67  unsigned int smooth;
68  std::vector<double> distr;
70  Iteration iteration;
71  bool fillSignal;
72  bool fillBackground;
73  };
74 
75  std::vector<PDF> pdfs;
76  int categoryIdx;
77  unsigned int nCategories;
78 };
79 
80 static ProcNormalize::Registry registry("ProcNormalize");
81 
82 ProcNormalize::ProcNormalize(const char *name, const AtomicId *id,
83  MVATrainer *trainer) :
84  TrainProcessor(name, id, trainer),
85  categoryIdx(-1),
86  nCategories(1)
87 {
88 }
89 
90 ProcNormalize::~ProcNormalize()
91 {
92 }
93 
94 void ProcNormalize::configure(DOMElement *elem)
95 {
96  int i = 0;
97  for(DOMNode *node = elem->getFirstChild();
98  node; node = node->getNextSibling()) {
99  if (node->getNodeType() != DOMNode::ELEMENT_NODE)
100  continue;
101 
102  DOMElement *elem = static_cast<DOMElement*>(node);
103 
104  XMLSimpleStr nodeName(node->getNodeName());
105 
106  if (std::strcmp(nodeName, "category") != 0) {
107  i++;
108  continue;
109  }
110 
111  if (categoryIdx >= 0)
112  throw cms::Exception("ProcNormalize")
113  << "More than one category variable given."
114  << std::endl;
115 
116 
117  unsigned int count = XMLDocument::readAttribute<unsigned int>(
118  elem, "count");
119 
120  categoryIdx = i;
121  nCategories = count;
122  }
123 
124  for(DOMNode *node = elem->getFirstChild();
125  node; node = node->getNextSibling()) {
126  if (node->getNodeType() != DOMNode::ELEMENT_NODE)
127  continue;
128 
129  XMLSimpleStr nodeName(node->getNodeName());
130  if (std::strcmp(nodeName, "category") == 0)
131  continue;
132 
133  if (std::strcmp(nodeName, "pdf") != 0)
134  throw cms::Exception("ProcNormalize")
135  << "Expected pdf tag in config section."
136  << std::endl;
137  elem = static_cast<DOMElement*>(node);
138 
139  PDF pdf;
140 
141  pdf.distr.resize(XMLDocument::readAttribute<unsigned int>(
142  elem, "size", 100));
143 
144  pdf.smooth = XMLDocument::readAttribute<unsigned int>(
145  elem, "smooth", 40);
146 
147  pdf.fillSignal =
148  XMLDocument::readAttribute<bool>(elem, "signal", true);
149  pdf.fillBackground =
150  XMLDocument::readAttribute<bool>(elem, "background", true);
151 
152  if (!pdf.fillSignal && !pdf.fillBackground)
153  throw cms::Exception("ProcNormalize")
154  << "Filling neither background nor signal "
155  "in config." << std::endl;
156 
157  if (XMLDocument::hasAttribute(elem, "lower") &&
158  XMLDocument::hasAttribute(elem, "upper")) {
159  pdf.range.min = XMLDocument::readAttribute<double>(
160  elem, "lower");
161  pdf.range.max = XMLDocument::readAttribute<double>(
162  elem, "upper");
163  pdf.iteration = ITER_FILL;
164  } else
165  pdf.iteration = ITER_EMPTY;
166 
167  for(unsigned int i = 0; i < nCategories; i++)
168  pdfs.push_back(pdf);
169  }
170 
171  unsigned int nInputs = getInputs().size();
172  if (categoryIdx >= 0)
173  nInputs--;
174 
175  if (pdfs.size() != nInputs * nCategories)
176  throw cms::Exception("ProcNormalize")
177  << "Got " << (pdfs.size() / nCategories)
178  << " pdf configs for " << nInputs
179  << " input varibles." << std::endl;
180 }
181 
182 Calibration::VarProcessor *ProcNormalize::getCalibration() const
183 {
185 
186  std::vector<unsigned int> pdfMap;
187  for(unsigned int i = 0; i < nCategories; i++)
188  for(unsigned int j = i; j < pdfs.size(); j += nCategories)
189  pdfMap.push_back(j);
190 
191  for(unsigned int i = 0; i < pdfs.size(); i++)
192  calib->distr.push_back(pdfs[pdfMap[i]]);
193 
194  calib->categoryIdx = categoryIdx;
195 
196  return calib;
197 }
198 
199 void ProcNormalize::trainBegin()
200 {
201 }
202 
203 void ProcNormalize::trainData(const std::vector<double> *values,
204  bool target, double weight)
205 {
206  int category = 0;
207  if (categoryIdx >= 0)
208  category = (int)values[categoryIdx].front();
209  if (category < 0 || category >= (int)nCategories)
210  return;
211 
212  int i = 0;
213  for(std::vector<PDF>::iterator iter = pdfs.begin() + category;
214  iter < pdfs.end(); iter += nCategories, values++) {
215  if (i++ == categoryIdx)
216  values++;
217 
218  switch(iter->iteration) {
219  case ITER_EMPTY:
220  for(std::vector<double>::const_iterator value =
221  values->begin();
222  value != values->end(); value++) {
223  iter->range.min = iter->range.max = *value;
224  iter->iteration = ITER_RANGE;
225  break;
226  }
227  case ITER_RANGE:
228  for(std::vector<double>::const_iterator value =
229  values->begin();
230  value != values->end(); value++) {
231  iter->range.min = std::min(iter->range.min,
232  *value);
233  iter->range.max = std::max(iter->range.max,
234  *value);
235  }
236  continue;
237  case ITER_FILL:
238  break;
239  default:
240  continue;
241  }
242 
243  if (!(target ? iter->fillSignal : iter->fillBackground))
244  continue;
245 
246  unsigned int n = iter->distr.size() - 1;
247  double mult = 1.0 / iter->range.width();
248 
249  for(std::vector<double>::const_iterator value =
250  values->begin(); value != values->end(); value++) {
251  double x = (*value - iter->range.min) * mult;
252  if (x < 0.0)
253  x = 0.0;
254  else if (x >= 1.0)
255  x = 1.0;
256 
257  iter->distr[(unsigned int)(x * n + 0.5)] += weight;
258  }
259  }
260 }
261 
262 static void smoothArray(unsigned int n, double *values, unsigned int nTimes)
263 {
264  for(unsigned int iter = 0; iter < nTimes; iter++) {
265  double hold = n > 0 ? values[0] : 0.0;
266  for(unsigned int i = 0; i < n; i++) {
267  double delta = hold * 0.1;
268  double rem = 0.0;
269  if (i > 0) {
270  values[i - 1] += delta;
271  rem -= delta;
272  }
273  if (i < n - 1) {
274  hold = values[i + 1];
275  values[i + 1] += delta;
276  rem -= delta;
277  }
278  values[i] += rem;
279  }
280  }
281 }
282 
283 void ProcNormalize::trainEnd()
284 {
285  bool done = true;
286  for(std::vector<PDF>::iterator iter = pdfs.begin();
287  iter != pdfs.end(); iter++) {
288  switch(iter->iteration) {
289  case ITER_EMPTY:
290  case ITER_RANGE:
291  iter->iteration = ITER_FILL;
292  done = false;
293  break;
294  case ITER_FILL:
295  iter->distr.front() *= 2;
296  iter->distr.back() *= 2;
297  smoothArray(iter->distr.size(),
298  &iter->distr.front(),
299  iter->smooth);
300 
301  iter->iteration = ITER_DONE;
302  break;
303  default:
304  /* shut up */;
305  }
306  }
307 
308  if (done)
309  trained = true;
310 
311  if (done && monitoring) {
312  std::vector<SourceVariable*> inputs = getInputs().get();
313  if (categoryIdx >= 0)
314  inputs.erase(inputs.begin() + categoryIdx);
315 
316  for(std::vector<PDF>::iterator iter = pdfs.begin();
317  iter != pdfs.end(); iter++) {
318  unsigned int idx = iter - pdfs.begin();
319  unsigned int catIdx = idx % nCategories;
320  unsigned int varIdx = idx / nCategories;
321  SourceVariable *var = inputs[varIdx];
322  std::string name =
323  (const char*)var->getSource()->getName()
324  + std::string("_")
325  + (const char*)var->getName();
326  std::string title = name;
327  if (categoryIdx >= 0) {
328  name += Form("_CAT%d", catIdx);
329  title += Form(" (cat. %d)", catIdx);
330  }
331 
332  unsigned int n = iter->distr.size() - 1;
333  double min = iter->range.min -
334  0.5 * iter->range.width() / n;
335  double max = iter->range.max +
336  0.5 * iter->range.width() / n;
337  TH1F *histo = monitoring->book<TH1F>(name + "_pdf",
338  name.c_str(), title.c_str(), n + 1, min, max);
339  for(unsigned int i = 0; i < n; i++)
340  histo->SetBinContent(i + 1, iter->distr[i]);
341  }
342  }
343 }
344 
345 namespace {
346  struct Id {
348  AtomicId name;
349  unsigned int category;
350 
351  inline Id(AtomicId source, AtomicId name,
352  unsigned int category) :
353  source(source), name(name), category(category) {}
354 
355  inline bool operator == (const Id &other) const
356  {
357  return source == other.source &&
358  name == other.name &&
359  category == other.category;
360  }
361 
362  inline bool operator < (const Id &other) const
363  {
364  if (source < other.source)
365  return true;
366  if (!(source == other.source))
367  return false;
368  if (name < other.name)
369  return true;
370  if (!(name == other.name))
371  return false;
372  return category < other.category;
373  }
374  };
375 }
376 
377 bool ProcNormalize::load()
378 {
379  std::string filename = trainer->trainFileName(this, "xml");
380  if (!exists(filename))
381  return false;
382 
383  XMLDocument xml(filename);
384  DOMElement *elem = xml.getRootNode();
385  if (std::strcmp(XMLSimpleStr(elem->getNodeName()),
386  "ProcNormalize") != 0)
387  throw cms::Exception("ProcNormalize")
388  << "XML training data file has bad root node."
389  << std::endl;
390 
391  unsigned int version = XMLDocument::readAttribute<unsigned int>(
392  elem, "version", 1);
393 
394  if (version < 1 || version > 2)
395  throw cms::Exception("ProcNormalize")
396  << "Unsupported version " << version
397  << "in train file." << std::endl;
398 
399  std::map<Id, PDF*> pdfMap;
400 
401  for(std::vector<PDF>::iterator iter = pdfs.begin();
402  iter != pdfs.end(); ++iter) {
403  PDF *ptr = &*iter;
404  unsigned int idx = iter - pdfs.begin();
405  unsigned int catIdx = idx % nCategories;
406  unsigned int varIdx = idx / nCategories;
407  if (categoryIdx >= 0 && (int)varIdx >= categoryIdx)
408  varIdx++;
409  const SourceVariable *var = getInputs().get()[varIdx];
410  Id id(var->getSource()->getName(), var->getName(), catIdx);
411 
412  pdfMap[id] = ptr;
413  }
414 
415  std::vector<PDF>::iterator cur = pdfs.begin();
416 
417  for(DOMNode *node = elem->getFirstChild();
418  node; node = node->getNextSibling()) {
419  if (node->getNodeType() != DOMNode::ELEMENT_NODE)
420  continue;
421 
422  if (std::strcmp(XMLSimpleStr(node->getNodeName()), "pdf") != 0)
423  throw cms::Exception("ProcNormalize")
424  << "Expected pdf tag in train file."
425  << std::endl;
426  elem = static_cast<DOMElement*>(node);
427 
428  PDF *pdf = 0;
429  switch(version) {
430  case 1:
431  if (cur == pdfs.end())
432  throw cms::Exception("ProcNormalize")
433  << "Superfluous PDF in train data."
434  << std::endl;
435  pdf = &*cur++;
436  break;
437  case 2: {
438  Id id(XMLDocument::readAttribute<std::string>(
439  elem, "source"),
440  XMLDocument::readAttribute<std::string>(
441  elem, "name"),
442  XMLDocument::readAttribute<unsigned int>(
443  elem, "category", 0));
444  std::map<Id, PDF*>::const_iterator pos =
445  pdfMap.find(id);
446  if (pos == pdfMap.end())
447  continue;
448  else
449  pdf = pos->second;
450  } break;
451  }
452 
453  pdf->range.min =
454  XMLDocument::readAttribute<double>(elem, "lower");
455  pdf->range.max =
456  XMLDocument::readAttribute<double>(elem, "upper");
457  pdf->iteration = ITER_DONE;
458  pdf->distr.clear();
459 
460  for(DOMNode *subNode = elem->getFirstChild();
461  subNode; subNode = subNode->getNextSibling()) {
462  if (subNode->getNodeType() != DOMNode::ELEMENT_NODE)
463  continue;
464 
465  if (std::strcmp(XMLSimpleStr(subNode->getNodeName()),
466  "value") != 0)
467  throw cms::Exception("ProcNormalize")
468  << "Expected value tag in train file."
469  << std::endl;
470 
471  elem = static_cast<DOMElement*>(node);
472 
473  pdf->distr.push_back(
474  XMLDocument::readContent<double>(subNode));
475  }
476  }
477 
478  if (version == 1 && cur != pdfs.end())
479  throw cms::Exception("ProcNormalize")
480  << "Missing PDF in train data." << std::endl;
481 
482  trained = true;
483  for(std::vector<PDF>::const_iterator iter = pdfs.begin();
484  iter != pdfs.end(); ++iter) {
485  if (iter->iteration != ITER_DONE) {
486  trained = false;
487  break;
488  }
489  }
490 
491  return true;
492 }
493 
494 void ProcNormalize::save()
495 {
496  XMLDocument xml(trainer->trainFileName(this, "xml"), true);
497  DOMDocument *doc = xml.createDocument("ProcNormalize");
498  XMLDocument::writeAttribute(doc->getDocumentElement(), "version", 2);
499 
500  for(std::vector<PDF>::const_iterator iter = pdfs.begin();
501  iter != pdfs.end(); iter++) {
502  DOMElement *elem = doc->createElement(XMLUniStr("pdf"));
503  xml.getRootNode()->appendChild(elem);
504 
505  unsigned int idx = iter - pdfs.begin();
506  unsigned int catIdx = idx % nCategories;
507  unsigned int varIdx = idx / nCategories;
508  if (categoryIdx >= 0 && (int)varIdx >= categoryIdx)
509  varIdx++;
510  const SourceVariable *var = getInputs().get()[varIdx];
511  XMLDocument::writeAttribute(elem, "source",
512  (const char*)var->getSource()->getName());
513  XMLDocument::writeAttribute(elem, "name",
514  (const char*)var->getName());
515  if (categoryIdx >= 0)
516  XMLDocument::writeAttribute(elem, "category", catIdx);
517 
518  XMLDocument::writeAttribute(elem, "lower", iter->range.min);
519  XMLDocument::writeAttribute(elem, "upper", iter->range.max);
520 
521  for(std::vector<double>::const_iterator iter2 =
522  iter->distr.begin(); iter2 != iter->distr.end(); iter2++) {
523  DOMElement *value =
524  doc->createElement(XMLUniStr("value"));
525  elem->appendChild(value);
526 
527  XMLDocument::writeContent<double>(value, doc, *iter2);
528  }
529  }
530 }
531 
532 } // anonymous namespace
dbl * delta
Definition: mlp_gen.cc:36
int i
Definition: DBlmapReader.cc:9
Histogram< float > HistogramF
Definition: Histogram.h:106
Source * getSource() const
#define min(a, b)
Definition: mlp_lapack.h:161
bool operator==(const CaloTower &t1, const CaloTower &t2)
Definition: CaloTower.h:211
detail::ThreadSafeRegistry< ParameterSetID, ParameterSet, ProcessParameterSetIDCache > Registry
Definition: Registry.h:37
tuple node
Definition: Node.py:50
Cheap generic unique keyword identifier class.
Definition: AtomicId.h:32
MVATrainerComputer * calib
Definition: MVATrainer.cc:64
bool operator<(const FedChannelConnection &, const FedChannelConnection &)
tuple iteration
Definition: align_cfg.py:5
static bool hasAttribute(XERCES_CPP_NAMESPACE_QUALIFIER DOMElement *elem, const char *name)
Definition: XMLDocument.cc:303
const AtomicId getName() const
Definition: Variable.h:144
const T & max(const T &a, const T &b)
def load
Definition: svgfig.py:546
int j
Definition: DBlmapReader.cc:9
tuple doc
Definition: asciidump.py:381
static void writeAttribute(XERCES_CPP_NAMESPACE_QUALIFIER DOMElement *elem, const char *name, const T &value)
AtomicId getName() const
Definition: Source.h:19
tuple filename
Definition: lut2db_cfg.py:20
template to generate a registry singleton for a type.
static Interceptor::Registry registry("Interceptor")
x
Definition: VDTMath.h:216
std::vector< HistogramF > distr
Definition: MVAComputer.h:102