3 #include <ext/functional> 15 #include <xercesc/dom/DOM.hpp> 43 class MVATrainerComputer;
45 class BaseInterceptor :
public Calibration::Interceptor {
48 ~BaseInterceptor()
override {}
50 inline void setCalibration(MVATrainerComputer *
calib)
51 { this->calib =
calib; }
53 std::vector<Variable::Flags>
54 configure(
const MVAComputer *
computer,
unsigned int n,
55 const std::vector<Variable::Flags> &
flags)
override = 0;
58 intercept(
const std::vector<double> *
values)
const override = 0;
60 virtual void init() {}
61 virtual void finish(
bool save) {}
67 class InitInterceptor :
public BaseInterceptor {
70 ~InitInterceptor()
override {}
72 std::vector<Variable::Flags>
73 configure(
const MVAComputer *
computer,
unsigned int n,
74 const std::vector<Variable::Flags> &
flags)
override;
77 intercept(
const std::vector<double> *
values)
const override;
80 class TrainInterceptor :
public BaseInterceptor {
82 TrainInterceptor(TrainProcessor *
proc) :
proc(proc) {}
83 ~TrainInterceptor()
override {}
85 inline TrainProcessor *getProcessor()
const {
return proc; }
87 std::vector<Variable::Flags>
88 configure(
const MVAComputer *
computer,
unsigned int n,
89 const std::vector<Variable::Flags> &
flags)
override;
92 intercept(
const std::vector<double> *
values)
const override;
95 void finish(
bool save)
override;
100 mutable std::vector<std::vector<double> >
tmp;
104 class MVATrainerComputer :
public TrainMVAComputerCalibration {
106 typedef std::pair<unsigned int, BaseInterceptor*> Interceptor;
108 MVATrainerComputer(
const std::vector<Interceptor>
110 bool autoSave, UInt_t
seed,
double split);
112 ~MVATrainerComputer()
override;
114 std::vector<Calibration::VarProcessor*>
115 getProcessors()
const override;
116 void initFlags(std::vector<Variable::Flags>
117 &
flags)
const override;
119 void configured(BaseInterceptor *interceptor)
const;
124 { flags.push_back(flag); }
126 inline bool useForTraining()
const {
return splitResult; }
127 inline bool useForTesting()
const 130 inline bool isConfigured()
const 146 static inline void deleter(
T *ptr) {
delete ptr; }
149 struct auto_cleaner {
150 inline ~auto_cleaner()
153 inline void add(
T *ptr) {
clean.push_back(ptr); }
160 unsigned int size = std::min<unsigned int>(128, std::strlen(format));
163 int n = std::vsnprintf(buffer, size, format, va);
164 if (n >= 0 && (
unsigned int)n < size)
173 buffer =
new char[
size];
184 va_start(va, format);
192 std::vector<Variable::Flags>
194 const std::vector<Variable::Flags> &
flags)
196 calib->configured(
this);
201 InitInterceptor::intercept(
const std::vector<double> *
values)
const 209 std::vector<Variable::Flags>
210 TrainInterceptor::configure(
const MVAComputer *computer,
unsigned int n,
211 const std::vector<Variable::Flags> &flags)
218 std::vector<SourceVariable*>
inputs = inputSet.
get(
true);
220 std::vector<SourceVariable*>::const_iterator
pos;
222 assert(pos != inputs.end());
225 assert(pos != inputs.end());
228 calib->configured(
this);
231 if (targetIdx < weightIdx) {
232 result.erase(result.begin() +
weightIdx);
233 result.erase(result.begin() +
targetIdx);
235 result.erase(result.begin() +
targetIdx);
236 result.erase(result.begin() +
weightIdx);
239 proc->passFlags(result);
242 result.resize(n,
proc->getDefaultFlags());
246 if (targetIdx >= 2 || weightIdx >= 2)
255 <<
"TrainProcessor \"" << (
const char*)
proc->getName()
256 <<
"\" training iteration starting...";
258 proc->doTrainBegin();
262 TrainInterceptor::intercept(
const std::vector<double> *values)
const 267 <<
"Trainer input lacks target variable." 271 <<
"Multiple targets supplied in input." 279 <<
"Multiple weights supplied in input." 285 proc->doTrainData(values + 2, target > 0.5, weight,
286 calib->useForTraining(),
287 calib->useForTesting());
289 std::vector<std::vector<double> >::iterator
pos =
tmp.begin();
290 for(
unsigned int i = 0; pos !=
tmp.end();
i++)
295 calib->useForTraining(),
296 calib->useForTesting());
302 void TrainInterceptor::finish(
bool save)
307 <<
"... processor \"" << (
const char*)
proc->getName()
308 <<
"\" training iteration done.";
310 if (
proc->isTrained()) {
312 <<
"* Completed training of \"" 313 << (
const char*)
proc->getName() <<
"\".";
322 MVATrainerComputer::MVATrainerComputer(
const std::vector<Interceptor>
328 for(std::vector<Interceptor>::const_iterator iter =
329 interceptors.begin(); iter != interceptors.end(); ++iter)
330 iter->second->setCalibration(
this);
333 MVATrainerComputer::~MVATrainerComputer()
337 for(std::vector<Interceptor>::const_iterator iter =
338 interceptors.begin(); iter != interceptors.end(); ++iter)
342 std::vector<Calibration::VarProcessor*>
343 MVATrainerComputer::getProcessors()
const 345 std::vector<Calibration::VarProcessor*> processors =
348 for(std::vector<Interceptor>::const_iterator iter =
349 interceptors.begin(); iter != interceptors.end(); ++iter)
351 processors.insert(processors.begin() + iter->first,
357 void MVATrainerComputer::initFlags(std::vector<Variable::Flags> &flags)
const 359 assert(flags.size() == this->flags.size());
363 void MVATrainerComputer::configured(BaseInterceptor *interceptor)
const 367 for(std::vector<Interceptor>::const_iterator iter =
368 interceptors.begin();
369 iter != interceptors.end(); ++iter)
370 iter->second->init();
378 void MVATrainerComputer::done()
380 if (isConfigured()) {
381 for(std::vector<Interceptor>::const_iterator iter =
382 interceptors.begin();
383 iter != interceptors.end(); ++iter)
398 return id == MVATrainer::kTargetId ||
399 id == MVATrainer::kWeightId ||
406 for(std::string::const_iterator iter = in.begin();
407 iter != in.end(); ++iter) {
421 const char *styleSheet) :
424 doMonitoring(
false), randomSeed(65539), crossValidation(0.0)
430 "PhysicsTools/MVATrainer/data/MVATrainer.xsl")
441 DOMNode *node =
xml->getRootNode();
443 if (std::strcmp(
XMLSimpleStr(node->getNodeName()),
"MVATrainer") != 0)
445 <<
"Invalid XML root node." << std::endl;
452 } state = STATE_GENERAL;
454 for(node = node->getFirstChild();
455 node; node = node->getNextSibling()) {
456 if (node->getNodeType() != DOMNode::ELEMENT_NODE)
460 DOMElement *
elem =
static_cast<DOMElement*
>(node);
463 case STATE_GENERAL: {
464 if (name !=
"general")
466 <<
"Expected general config as first " 469 for(DOMNode *subNode = elem->getFirstChild();
470 subNode; subNode = subNode->getNextSibling()) {
471 if (subNode->getNodeType() !=
472 DOMNode::ELEMENT_NODE)
476 subNode->getNodeName()),
"option") != 0)
478 <<
"Expected option tag." 481 elem =
static_cast<DOMElement*
>(subNode);
482 name = XMLDocument::readAttribute<std::string>(
485 elem->getTextContent());
489 else if (name ==
"trainfiles")
493 <<
"Unknown option \"" 494 << name <<
"\"." << std::endl;
502 <<
"Expected input config as second " 505 AtomicId id = XMLDocument::readAttribute<std::string>(
508 input->getOutputs().append(
512 input->getOutputs().append(
516 sources.insert(std::make_pair(
id, input));
519 state = STATE_MIDDLE;
522 if (name ==
"output") {
529 }
else if (name !=
"processor")
531 <<
"Unexpected tag after input " 532 "config." << std::endl;
534 AtomicId id = XMLDocument::readAttribute<std::string>(
537 XMLDocument::readAttribute<std::string>(
544 <<
"Unexpected tag found after output." 550 if (state == STATE_FIRST)
552 <<
"Expected input variable config." << std::endl;
553 else if (state == STATE_MIDDLE)
555 <<
"Expected output variable config." << std::endl;
566 for(std::map<AtomicId, Source*>::const_iterator iter =
sources.begin();
567 iter !=
sources.end(); iter++) {
583 for(std::vector<AtomicId>::const_iterator iter =
586 std::map<AtomicId, Source*>::const_iterator
pos =
595 << source->getId() <<
" configuration for \"" 596 << (
const char*)source->getName()
597 <<
"\" loaded from file.";
605 for(std::vector<AtomicId>::const_iterator iter =
608 std::map<AtomicId, Source*>::const_iterator
pos =
615 if (source->isTrained())
622 DOMElement *xmlInput =
nullptr;
623 DOMElement *xmlConfig =
nullptr;
624 DOMElement *xmlOutput =
nullptr;
625 DOMElement *xmlData =
nullptr;
627 static struct NameExpect {
632 {
"input",
true, &xmlInput },
633 {
"config",
true, &xmlConfig },
634 {
"output",
true, &xmlOutput },
635 {
"data",
false, &xmlData },
639 const NameExpect *cur = expect;
640 for(DOMNode *node = elem->getFirstChild();
641 node; node = node->getNextSibling()) {
642 if (node->getNodeType() != DOMNode::ELEMENT_NODE)
646 DOMElement *elem =
static_cast<DOMElement*
>(node);
650 <<
"Superfluous tag " << tag
651 <<
"encountered in processor." << std::endl;
652 else if (tag != cur->tag && cur->mandatory)
654 <<
"Expected tag " << cur->tag <<
", got " 655 << tag <<
" instead in processor." 657 else if (tag != cur->tag) {
661 *(cur++)->elem = elem;
664 while(cur->tag && !cur->mandatory)
668 <<
"Unexpected end of processor configuration, " 669 <<
"expected tag " << cur->tag <<
"." << std::endl;
671 std::unique_ptr<TrainProcessor>
proc(
675 <<
"Variable processor trainer " << name
676 <<
" could not be instantiated. Most likely because" 677 " the trainer plugin for \"" << name <<
"\"" 678 " does not exist." << std::endl;
682 <<
"Duplicate variable processor id " 683 << (
const char*)
id <<
"." 690 <<
"Configuring " << (
const char*)proc->getId()
691 <<
" \"" << (
const char*)proc->getName() <<
"\".";
692 proc->configure(xmlConfig);
694 sources.insert(std::make_pair(
id, proc.release()));
705 arg_.c_str(), ext.c_str());
716 "monitoring",
"",
"root");
725 std::map<AtomicId, Source*>::const_iterator
pos =
sources.find(source);
729 return pos->second->getOutput(name);
747 std::vector<SourceVariable*>
tmp;
751 for(DOMNode *node = xml->getFirstChild(); node;
752 node = node->getNextSibling()) {
753 if (node->getNodeType() != DOMNode::ELEMENT_NODE)
756 if (std::strcmp(
XMLSimpleStr(node->getNodeName()),
"var") != 0)
758 <<
"Invalid input variable node." << std::endl;
760 DOMElement *
elem =
static_cast<DOMElement*
>(node);
764 AtomicId name = XMLDocument::readAttribute<std::string>(
770 <<
"Input variable " << (
const char*)source
771 <<
":" << (
const char*)name
772 <<
" not found." << std::endl;
774 if (XMLDocument::readAttribute<bool>(elem,
"target",
false)) {
777 <<
"Target variable defined twice" 781 if (XMLDocument::readAttribute<bool>(elem,
"weight",
false)) {
784 <<
"Weight variable defined twice" 795 tmp.insert(tmp.begin() +
802 tmp.insert(tmp.begin(), 1,
target);
806 for(std::vector<SourceVariable*>::const_iterator iter =
variables.begin();
808 std::vector<SourceVariable*>::const_iterator
pos =
809 std::find(tmp.begin(), tmp.end(), *iter);
810 if (pos == tmp.end())
816 else if (*iter == weight)
821 if (vars.
append(*iter, magic, pos - tmp.begin())) {
825 <<
"Input variable " << (
const char*)source
826 <<
":" << (
const char*)name
827 <<
" defined twice." << std::endl;
833 assert(tmp.size() ==
n);
839 for(DOMNode *node = xml->getFirstChild(); node;
840 node = node->getNextSibling()) {
841 if (node->getNodeType() != DOMNode::ELEMENT_NODE)
844 if (std::strcmp(
XMLSimpleStr(node->getNodeName()),
"var") != 0)
846 <<
"Invalid output variable node." 849 DOMElement *
elem =
static_cast<DOMElement*
>(node);
851 AtomicId name = XMLDocument::readAttribute<std::string>(
855 <<
"Output variable tag missing name." 859 <<
"Cannot use magic variable names in output." 864 if (XMLDocument::readAttribute<bool>(elem,
"optional",
true))
868 if (XMLDocument::readAttribute<bool>(elem,
"multiple",
true))
873 if (!var || vars.
append(var))
875 <<
"Output variable " 876 << (
const char*)source->
getName()
877 <<
":" << (
const char*)name
878 <<
" defined twice." << std::endl;
884 const std::vector<CalibratedProcessor> &procs,
885 bool withTarget)
const 887 std::map<SourceVariable*, unsigned int>
vars;
888 unsigned int size = 0;
890 MVATrainerComputer *trainCalib =
891 dynamic_cast<MVATrainerComputer*
>(
calib);
893 for(
unsigned int i = 0;
895 if (
i < 2 && !withTarget)
903 calib->
inputSet.push_back(calibVar);
905 trainCalib->addFlag(var->
getFlags());
908 for(std::vector<CalibratedProcessor>::const_iterator iter =
909 procs.begin(); iter != procs.end(); iter++) {
910 bool isInterceptor =
dynamic_cast<BaseInterceptor*
>(
911 iter->calib) !=
nullptr;
915 unsigned int last = 0;
916 std::vector<SourceVariable*> inoutVars;
918 inoutVars = iter->processor->getInputs().get(
920 for(std::vector<SourceVariable*>::const_iterator iter2 =
921 inoutVars.begin(); iter2 != inoutVars.end(); iter2++) {
923 unsigned int>::const_iterator
pos =
926 assert(pos != vars.end());
928 if (pos->second < last)
930 <<
"Input variables not declared " 931 "in order of appearance in \"" 932 << (
const char*)iter->processor->getName()
933 <<
"\"." << std::endl;
935 inputSet[last = pos->second] =
true;
938 assert(!isInterceptor || withTarget);
951 inoutVars = iter->processor->getOutputs().get();
952 for(std::vector<SourceVariable*>::const_iterator iter =
953 inoutVars.begin(); iter != inoutVars.end(); iter++) {
955 vars[*iter] = size++;
961 <<
"Exactly one output variable has to be specified." 965 std::map<SourceVariable*, unsigned int>::const_iterator
pos =
967 if (pos != vars.end())
968 calib->
output = pos->second;
976 std::vector<MVATrainerComputer::Interceptor> baseInterceptors;
979 BaseInterceptor *interceptor =
new InitInterceptor;
980 baseInterceptors.push_back(std::make_pair(0, interceptor));
983 for(
const AtomicId *iter = train; *iter; iter++) {
985 if (*iter == kOutputId)
988 std::map<AtomicId, Source*>::const_iterator
pos =
995 interceptors[*iter] =
new TrainInterceptor(source);
998 auto_cleaner<Calibration::VarProcessor> autoClean;
1000 std::set<AtomicId> done;
1001 for(
const AtomicId *iter = compute; *iter; iter++) {
1002 if (done.erase(*iter))
1005 std::map<AtomicId, Source*>::const_iterator
pos =
1011 assert(source->isTrained());
1017 autoClean.add(proc);
1023 std::vector<AtomicId>::const_iterator pos2 =
1025 this->processors.end(), *iter);
1026 assert(pos2 != this->processors.end());
1030 assert(pos2 != this->processors.end());
1034 if (*iter2 == *pos2)
1041 done.insert(*iter2);
1048 assert(source->isTrained());
1050 proc = source->getCalibration();
1052 autoClean.add(proc);
1053 processors.push_back(
1059 std::map<AtomicId, TrainInterceptor*>::iterator
1060 pos3 = interceptors.find(*pos2);
1061 if (pos3 != interceptors.end()) {
1063 baseInterceptors.push_back(
1064 std::make_pair(processors.size(),
1066 processors.push_back(
1068 pos3->second->getProcessor(),
1070 interceptors.erase(pos3);
1076 baseInterceptors.pop_back();
1077 processors.pop_back();
1082 for(std::map<AtomicId, TrainInterceptor*>::const_iterator iter =
1083 interceptors.begin(); iter != interceptors.end(); ++iter) {
1086 baseInterceptors.push_back(std::make_pair(processors.size(),
1091 std::unique_ptr<Calibration::MVAComputer>
calib(
1092 new MVATrainerComputer(baseInterceptors,
doAutoSave,
1097 return calib.release();
1102 MVATrainerComputer *
calib =
1103 dynamic_cast<MVATrainerComputer*
>(trainCalibration);
1107 <<
"Invalid training calibration passed to " 1108 "doneTraining()" << std::endl;
1115 std::set<Source*> toCheck;
1118 std::set<Source*> done;
1119 while(!toCheck.empty()) {
1121 toCheck.erase(toCheck.begin());
1124 for(std::vector<SourceVariable*>::const_iterator iter =
1125 inputs.begin(); iter != inputs.end(); ++iter) {
1126 source = (*iter)->getSource();
1127 if (done.insert(source).second)
1128 toCheck.insert(source);
1132 std::vector<AtomicId>
result;
1133 for(std::vector<AtomicId>::const_iterator iter =
processors.begin();
1135 std::map<AtomicId, Source*>::const_iterator
pos =
1137 if (pos !=
sources.end() && done.count(pos->second))
1138 result.push_back(*iter);
1148 std::unique_ptr<Calibration::MVAComputer>
calib(
1152 for(std::vector<AtomicId>::const_iterator iter = used.begin();
1153 iter != used.end(); iter++) {
1154 std::map<AtomicId, Source*>::const_iterator
pos =
1160 if (!source->isTrained())
1170 std::vector<AtomicId>::const_iterator
begin =
1172 this->processors.end(), *iter);
1173 assert(this->processors.end() - begin >
1174 (
int)(foreach->nProcs + 1));
1176 std::vector<AtomicId>::const_iterator
end =
1177 begin +
foreach->nProcs;
1178 foreach->nProcs = 0;
1179 for(std::vector<AtomicId>::const_iterator iter2 =
1180 iter; iter2 != used.end(); ++iter2)
1190 return calib.release();
1194 std::vector<AtomicId> &train)
const 1199 std::set<Source*> trainedSources;
1200 trainedSources.insert(
input);
1202 for(std::vector<AtomicId>::const_iterator iter =
1204 std::map<AtomicId, Source*>::const_iterator
pos =
1211 bool trainedDeps =
true;
1212 std::vector<SourceVariable*> inputVars =
1213 proc->getInputs().get();
1214 for(std::vector<SourceVariable*>::const_iterator iter2 =
1215 inputVars.begin(); iter2 != inputVars.end(); iter2++) {
1216 if (trainedSources.find((*iter2)->getSource())
1217 == trainedSources.end()) {
1218 trainedDeps =
false;
1226 if (proc->isTrained()) {
1227 trainedSources.insert(proc);
1228 compute.push_back(proc->getName());
1230 train.push_back(proc->getName());
1235 != trainedSources.end())
1236 train.push_back(kOutputId);
1241 std::vector<AtomicId>
compute, train;
1247 compute.push_back(
nullptr);
1248 train.push_back(
nullptr);
TrainProcessor *const proc
std::vector< Variable::Flags > flags
#define XERCES_CPP_NAMESPACE_QUALIFIER
void find(edm::Handle< EcalRecHitCollection > &hits, DetId thisDet, std::vector< EcalRecHitCollection::const_iterator > &hit, bool debug=false)
static std::string const input
MVATrainerComputer * calib
def elem(elemtype, innerHTML='', html_class='', kwargs)
void add(std::map< std::string, TH1 * > &h, TH1 *hist)
std::vector< Interceptor > interceptors
std::vector< std::vector< double > > tmp
std::string fullPath() const
static std::string const source