test
CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
Forest.cc
Go to the documentation of this file.
1 // Forest.cxx //
3 // =====================================================================//
4 // This is the object implementation of a forest of decision trees. //
5 // We need this to implement gradient boosting. //
6 // References include //
7 // *Elements of Statistical Learning by Hastie, //
8 // Tibshirani, and Friedman. //
9 // *Greedy Function Approximation: A Gradient Boosting Machine. //
10 // Friedman. The Annals of Statistics, Vol. 29, No. 5. Oct 2001. //
11 // *Inductive Learning of Tree-based Regression Models. Luis Torgo. //
12 // //
14 
16 // _______________________Includes_______________________________________//
18 
21 
22 #include "TStopwatch.h"
23 
24 #include <iostream>
25 #include <sstream>
26 #include <algorithm>
27 #include <fstream>
28 #include <utility>
29 
31 // _______________________Constructor(s)________________________________//
33 
35 {
36  events = std::vector< std::vector<Event*> >(1);
37 }
38 
40 // ----------------------------------------------------------------------
42 
43 L1TForest::L1TForest(std::vector<Event*>& trainingEvents)
44 {
45  setTrainingEvents(trainingEvents);
46 }
47 
49 // _______________________Destructor____________________________________//
51 
53 {
54  // When the forest is destroyed it will delete the trees as well as the
55  // events from the training and testing sets.
56  // The user may want the events to remain after they destroy the forest
57  // this should be changed in future upgrades.
58 
59  for(unsigned int i=0; i < trees.size(); i++)
60  {
61  delete trees[i];
62  }
63 }
65 // ______________________Get/Set_Functions______________________________//
67 
68 void L1TForest::setTrainingEvents(std::vector<Event*>& trainingEvents)
69 {
70  // tell the forest which events to use for training
71 
72  Event* e = trainingEvents[0];
73  // Unused variable
74  // unsigned int numrows = e->data.size();
75 
76  // Reset the events matrix.
77  events = std::vector< std::vector<Event*> >();
78 
79  events.reserve(e->data.size());
80 
81  for(unsigned int i=0; i<e->data.size(); i++)
82  {
83  events.push_back(trainingEvents);
84  }
85 }
86 
88 // ----------------------------------------------------------------------
90 
91 // return a copy of the training events
92 std::vector<Event*> L1TForest::getTrainingEvents(){ return events[0]; }
93 
95 // ----------------------------------------------------------------------
97 
98 // return the ith tree
99 Tree* L1TForest::getTree(unsigned int i)
100 {
101  if(/*i>=0 && */i<trees.size()) return trees[i];
102  else
103  {
104  //std::cout << i << "is an invalid input for getTree. Out of range." << std::endl;
105  return 0;
106  }
107 }
108 
110 // ______________________Various_Helpful_Functions______________________//
112 
113 unsigned int L1TForest::size()
114 {
115  // Return the number of trees in the forest.
116  return trees.size();
117 }
118 
120 //*** Need to make a data structure that includes the next few functions ***
121 //*** pertaining to events. These don't really have much to do with the ***
122 //*** forest class. ***
124 
126 // ----------------------------------------------------------------------
128 
129 void L1TForest::listEvents(std::vector< std::vector<Event*> >& e)
130 {
131  // Simply list the events in each event vector. We have multiple copies
132  // of the events vector. Each copy is sorted according to a different
133  // determining variable.
134  std::cout << std::endl << "Listing Events... " << std::endl;
135 
136  for(unsigned int i=0; i < e.size(); i++)
137  {
138  std::cout << std::endl << "Variable " << i << " vector contents: " << std::endl;
139  for(unsigned int j=0; j<e[i].size(); j++)
140  {
141  e[i][j]->outputEvent();
142  }
143  std::cout << std::endl;
144  }
145 }
146 
148 // ----------------------------------------------------------------------
150 
151 // We have to initialize Event::sortingIndex outside of a function since
152 // it is a static member.
153 Int_t Event::sortingIndex = 1;
154 
156 {
157  // Sort the events according to the variable given by the sortingIndex.
159 }
161 // ----------------------------------------------------------------------
163 
165 {
166  // Sort the events by ID. We need this to produce rate plots.
167  return e1->id < e2->id;
168 }
170 // ----------------------------------------------------------------------
172 
173 void L1TForest::sortEventVectors(std::vector< std::vector<Event*> >& e)
174 {
175  // When a node chooses the optimum split point and split variable it needs
176  // the events to be sorted according to the variable it is considering.
177 
178  for(unsigned int i=0; i<e.size(); i++)
179  {
181  std::sort(e[i].begin(), e[i].end(), compareEvents);
182  }
183 }
184 
186 // ----------------------------------------------------------------------
188 
189 void L1TForest::rankVariables(std::vector<int>& rank)
190 {
191  // This function ranks the determining variables according to their importance
192  // in determining the fit. Use a low learning rate for better results.
193  // Separates completely useless variables from useful ones well,
194  // but isn't the best at separating variables of similar importance.
195  // This is calculated using the error reduction on the training set. The function
196  // should be changed to use the testing set, but this works fine for now.
197  // I will try to change this in the future.
198 
199  // Initialize the vector v, which will store the total error reduction
200  // for each variable i in v[i].
201  std::vector<double> v(events.size(), 0);
202 
203  //std::cout << std::endl << "Ranking Variables by Net Error Reduction... " << std::endl;
204 
205  for(unsigned int j=0; j < trees.size(); j++)
206  {
207  trees[j]->rankVariables(v);
208  }
209 
210  double max = *std::max_element(v.begin(), v.end());
211 
212  // Scale the importance. Maximum importance = 100.
213  for(unsigned int i=0; i < v.size(); i++)
214  {
215  v[i] = 100*v[i]/max;
216  }
217 
218  // Change the storage format so that we can keep the index
219  // and the value associated after sorting.
220  std::vector< std::pair<double, Int_t> > w(events.size());
221 
222  for(unsigned int i=0; i<v.size(); i++)
223  {
224  w[i] = std::pair<double, Int_t>(v[i],i);
225  }
226 
227  // Sort so that we can output in order of importance.
228  std::sort(w.begin(),w.end());
229 
230  // Output the results.
231  for(int i=(v.size()-1); i>=0; i--)
232  {
233  rank.push_back(w[i].second);
234  // std::cout << "x" << w[i].second << ": " << w[i].first << std::endl;
235  }
236 
237  //std::cout << std::endl << "Done." << std::endl << std::endl;
238 }
239 
241 // ----------------------------------------------------------------------
243 
244 void L1TForest::saveSplitValues(const char* savefilename)
245 {
246  // This function gathers all of the split values from the forest and puts them into lists.
247 
248  std::ofstream splitvaluefile;
249  splitvaluefile.open(savefilename);
250 
251  // Initialize the matrix v, which will store the list of split values
252  // for each variable i in v[i].
253  std::vector<std::vector<double>> v(events.size(), std::vector<double>());
254 
255  //std::cout << std::endl << "Gathering split values... " << std::endl;
256 
257  // Gather the split values from each tree in the forest.
258  for(unsigned int j=0; j<trees.size(); j++)
259  {
260  trees[j]->getSplitValues(v);
261  }
262 
263  // Sort the lists of split values and remove the duplicates.
264  for(unsigned int i=0; i<v.size(); i++)
265  {
266  std::sort(v[i].begin(),v[i].end());
267  v[i].erase( unique( v[i].begin(), v[i].end() ), v[i].end() );
268  }
269 
270  // Output the results after removing duplicates.
271  // The 0th variable is special and is not used for splitting, so we start at 1.
272  for(unsigned int i=1; i<v.size(); i++)
273  {
274  TString splitValues;
275  for(unsigned int j=0; j<v[i].size(); j++)
276  {
277  std::stringstream ss;
278  ss.precision(14);
279  ss << std::scientific << v[i][j];
280  splitValues+=",";
281  splitValues+=ss.str().c_str();
282  }
283 
284  splitValues=splitValues(1,splitValues.Length());
285  splitvaluefile << splitValues << std::endl << std::endl;;
286  }
287 }
289 // ______________________Update_Events_After_Fitting____________________//
291 
293 {
294  // Prepare the global vector of events for the next tree.
295  // Update the fit for each event and set the new target value
296  // for the next tree.
297 
298  // Get the list of terminal nodes for this tree.
299  std::list<Node*>& tn = tree->getTerminalNodes();
300 
301  // Loop through the terminal nodes.
302  for(std::list<Node*>::iterator it=tn.begin(); it!=tn.end(); it++)
303  {
304  // Get the events in the current terminal region.
305  std::vector<Event*>& v = (*it)->getEvents()[0];
306 
307  // Fit the events depending on the loss function criteria.
308  double fit = l->fit(v);
309 
310  // Scale the rate at which the algorithm converges.
311  fit = learningRate*fit;
312 
313  // Store the official fit value in the terminal node.
314  (*it)->setFitValue(fit);
315 
316  // Loop through each event in the terminal region and update the
317  // the target for the next tree.
318  for(unsigned int j=0; j<v.size(); j++)
319  {
320  Event* e = v[j];
321  e->predictedValue += fit;
322  e->data[0] = l->target(e);
323  }
324 
325  // Release memory.
326  (*it)->getEvents() = std::vector< std::vector<Event*> >();
327  }
328 }
329 
331 // ----------------------------------------------------------------------
333 
335 {
336  // Prepare the test events for the next tree.
337 
338  // Get the list of terminal nodes for this tree.
339  std::list<Node*>& tn = tree->getTerminalNodes();
340 
341  // Loop through the terminal nodes.
342  for(std::list<Node*>::iterator it=tn.begin(); it!=tn.end(); it++)
343  {
344  std::vector<Event*>& v = (*it)->getEvents()[0];
345  double fit = (*it)->getFitValue();
346 
347  // Loop through each event in the terminal region and update the
348  // the global event it maps to.
349  for(unsigned int j=0; j<v.size(); j++)
350  {
351  Event* e = v[j];
352  e->predictedValue += fit;
353  }
354 
355  // Release memory.
356  (*it)->getEvents() = std::vector< std::vector<Event*> >();
357  }
358 }
359 
361 // ____________________Do/Test_the Regression___________________________//
363 
364 void L1TForest::doRegression(Int_t nodeLimit, Int_t treeLimit, double learningRate, L1TLossFunction* l, const char* savetreesdirectory, bool saveTrees)
365 {
366  // Build the forest using the training sample.
367 
368  //std::cout << std::endl << "--Building L1TForest..." << std::endl << std::endl;
369 
370  // The trees work with a matrix of events where the rows have the same set of events. Each row however
371  // is sorted according to the feature variable given by event->data[row].
372  // If we only had one set of events we would have to sort it according to the
373  // feature variable every time we want to calculate the best split point for that feature.
374  // By keeping sorted copies we avoid the sorting operation during splint point calculation
375  // and save computation time. If we do not sort each of the rows the regression will fail.
376  //std::cout << "Sorting event vectors..." << std::endl;
378 
379  // See how long the regression takes.
380  TStopwatch timer;
381  timer.Start(kTRUE);
382 
383  for(unsigned int i=0; i< (unsigned) treeLimit; i++)
384  {
385  // std::cout << "++Building Tree " << i << "... " << std::endl;
386  Tree* tree = new Tree(events);
387  trees.push_back(tree);
388  tree->buildTree(nodeLimit);
389 
390  // Update the targets for the next tree to fit.
391  updateRegTargets(tree, learningRate, l);
392 
393  // Save trees to xml in some directory.
394  std::ostringstream ss;
395  ss << savetreesdirectory << "/" << i << ".xml";
396  std::string s = ss.str();
397  const char* c = s.c_str();
398 
399  if(saveTrees) tree->saveToXML(c);
400  }
401  //std::cout << std::endl;
402  //std::cout << std::endl << "Done." << std::endl << std::endl;
403 
404  // std::cout << std::endl << "Total calculation time: " << timer.RealTime() << std::endl;
405 }
406 
408 // ----------------------------------------------------------------------
410 
411 void L1TForest::predictEvents(std::vector<Event*>& eventsp, unsigned int numtrees)
412 {
413  // Predict values for eventsp by running them through the forest up to numtrees.
414 
415  //std::cout << "Using " << numtrees << " trees from the forest to predict events ... " << std::endl;
416  if(numtrees > trees.size())
417  {
418  //std::cout << std::endl << "!! Input greater than the forest size. Using forest.size() = " << trees.size() << " to predict instead." << std::endl;
419  numtrees = trees.size();
420  }
421 
422  // i iterates through the trees in the forest. Each tree corrects the last prediction.
423  for(unsigned int i=0; i < numtrees; i++)
424  {
425  //std::cout << "++Tree " << i << "..." << std::endl;
426  appendCorrection(eventsp, i);
427  }
428 }
429 
431 // ----------------------------------------------------------------------
433 
434 void L1TForest::appendCorrection(std::vector<Event*>& eventsp, Int_t treenum)
435 {
436  // Update the prediction by appending the next correction.
437 
438  Tree* tree = trees[treenum];
439  tree->filterEvents(eventsp);
440 
441  // Update the events with their new prediction.
442  updateEvents(tree);
443 }
444 
446 // ----------------------------------------------------------------------
448 
449 void L1TForest::predictEvent(Event* e, unsigned int numtrees)
450 {
451  // Predict values for eventsp by running them through the forest up to numtrees.
452 
453  //std::cout << "Using " << numtrees << " trees from the forest to predict events ... " << std::endl;
454  if(numtrees > trees.size())
455  {
456  //std::cout << std::endl << "!! Input greater than the forest size. Using forest.size() = " << trees.size() << " to predict instead." << std::endl;
457  numtrees = trees.size();
458  }
459 
460  // i iterates through the trees in the forest. Each tree corrects the last prediction.
461  for(unsigned int i=0; i < numtrees; i++)
462  {
463  //std::cout << "++Tree " << i << "..." << std::endl;
464  appendCorrection(e, i);
465  }
466 }
467 
469 // ----------------------------------------------------------------------
471 
472 void L1TForest::appendCorrection(Event* e, Int_t treenum)
473 {
474  // Update the prediction by appending the next correction.
475 
476  Tree* tree = trees[treenum];
477  Node* terminalNode = tree->filterEvent(e);
478 
479  // Update the event with its new prediction.
480  double fit = terminalNode->getFitValue();
481  e->predictedValue += fit;
482 }
484 // ----------------------------------------------------------------------------------
486 
487 void L1TForest::loadL1TForestFromXML(const char* directory, unsigned int numTrees)
488 {
489  // Load a forest that has already been created and stored into XML somewhere.
490 
491  // Initialize the vector of trees.
492  trees = std::vector<Tree*>(numTrees);
493 
494  // Load the L1TForest.
495  //std::cout << std::endl << "Loading L1TForest from XML ... " << std::endl;
496  for(unsigned int i=0; i < numTrees; i++)
497  {
498  trees[i] = new Tree();
499 
500  std::stringstream ss;
501  ss << directory << "/" << i << ".xml";
502 
503  //trees[i]->loadFromXML(ss.str().c_str());
504  trees[i]->loadFromXML(edm::FileInPath(ss.str().c_str()).fullPath().c_str());
505  }
506 
507  // std::cout << "Done." << std::endl << std::endl;
508 }
509 
511 // ___________________Stochastic_Sampling_&_Regression__________________//
513 
515 {
516  // We use this for Stochastic Gradient Boosting. Basically you
517  // take a subsample of the training events and build a tree using
518  // those. Then use the tree built from the subsample to update
519  // the predictions for all the events.
520 
521  subSample = std::vector< std::vector<Event*> >(events.size()) ;
522  size_t subSampleSize = fraction*events[0].size();
523 
524  // Randomize the first subSampleSize events in events[0].
525  shuffle(events[0].begin(), events[0].end(), subSampleSize);
526 
527  // Get a copy of the random subset we just made.
528  std::vector<Event*> v(events[0].begin(), events[0].begin()+subSampleSize);
529 
530  // Initialize and sort the subSample collection.
531  for(unsigned int i=0; i<subSample.size(); i++)
532  {
533  subSample[i] = v;
534  }
535 
537 }
538 
540 // ----------------------------------------------------------------------
542 
543 void L1TForest::doStochasticRegression(Int_t nodeLimit, Int_t treeLimit, double learningRate, double fraction, L1TLossFunction* l)
544 {
545  // If the fraction of events to use is one then this algorithm is slower than doRegression due to the fact
546  // that we have to sort the events every time we extract a subsample. Without random sampling we simply
547  // use all of the events and keep them sorted.
548 
549  // Anyways, this algorithm uses a portion of the events to train each tree. All of the events are updated
550  // afterwards with the results from the subsample built tree.
551 
552  // Prepare some things.
554  trees = std::vector<Tree*>(treeLimit);
555 
556  // See how long the regression takes.
557  TStopwatch timer;
558  timer.Start(kTRUE);
559 
560  // Output the current settings.
561  // std::cout << std::endl << "Running stochastic regression ... " << std::endl;
562  //std::cout << "# Nodes: " << nodeLimit << std::endl;
563  //std::cout << "Learning Rate: " << learningRate << std::endl;
564  //std::cout << "Bagging Fraction: " << fraction << std::endl;
565  //std::cout << std::endl;
566 
567  for(unsigned int i=0; i< (unsigned) treeLimit; i++)
568  {
569  // Build the tree using a random subsample.
570  prepareRandomSubsample(fraction);
571  trees[i] = new Tree(subSample);
572  trees[i]->buildTree(nodeLimit);
573 
574  // Fit all of the events based upon the tree we built using
575  // the subsample of events.
576  trees[i]->filterEvents(events[0]);
577 
578  // Update the targets for the next tree to fit.
579  updateRegTargets(trees[i], learningRate, l);
580 
581  // Save trees to xml in some directory.
582  std::ostringstream ss;
583  ss << "trees/" << i << ".xml";
584  std::string s = ss.str();
585  const char* c = s.c_str();
586 
587  trees[i]->saveToXML(c);
588  }
589 
590  //std::cout << std::endl << "Done." << std::endl << std::endl;
591 
592  //std::cout << std::endl << "Total calculation time: " << timer.RealTime() << std::endl;
593 }
Node * filterEvent(Event *e)
Definition: Tree.cc:201
Double_t predictedValue
Definition: Event.h:20
Double_t getFitValue()
Definition: Node.cc:155
int i
Definition: DBlmapReader.cc:9
std::vector< Event * > getTrainingEvents()
Definition: Forest.cc:92
Int_t id
Definition: Event.h:29
const double w
Definition: UKUtility.cc:23
Definition: Node.h:10
void prepareRandomSubsample(double fraction)
Definition: Forest.cc:514
void rankVariables(std::vector< int > &rank)
Definition: Forest.cc:189
std::vector< Tree * > trees
Definition: Forest.h:60
unsigned int size()
Definition: Forest.cc:113
void doRegression(Int_t nodeLimit, Int_t treeLimit, double learningRate, L1TLossFunction *l, const char *savetreesdirectory, bool saveTrees)
Definition: Forest.cc:364
void loadL1TForestFromXML(const char *directory, unsigned int numTrees)
Definition: Forest.cc:487
void updateEvents(Tree *tree)
Definition: Forest.cc:334
virtual Double_t fit(std::vector< Event * > &v)=0
void doStochasticRegression(Int_t nodeLimit, Int_t treeLimit, double learningRate, double fraction, L1TLossFunction *l)
Definition: Forest.cc:543
bidiiter shuffle(bidiiter begin, bidiiter end, size_t num_random)
Definition: Utilities.h:26
Definition: Event.h:16
void buildTree(Int_t nodeLimit)
Definition: Tree.cc:106
U second(std::pair< T, U > const &p)
static Int_t sortingIndex
Definition: Event.h:28
void listEvents(std::vector< std::vector< Event * > > &e)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Definition: Forest.cc:129
void setTrainingEvents(std::vector< Event * > &trainingEvents)
Definition: Forest.cc:68
void appendCorrection(std::vector< Event * > &eventsp, Int_t treenum)
Definition: Forest.cc:434
~L1TForest()
Definition: Forest.cc:52
int j
Definition: DBlmapReader.cc:9
std::vector< std::vector< Event * > > events
Definition: Forest.h:58
#define end
Definition: vmac.h:37
void sortEventVectors(std::vector< std::vector< Event * > > &e)
Definition: Forest.cc:173
void updateRegTargets(Tree *tree, double learningRate, L1TLossFunction *l)
Definition: Forest.cc:292
Definition: Tree.h:17
bool compareEvents(Event *e1, Event *e2)
Definition: Forest.cc:155
Float e1
Definition: deltaR.h:20
std::vector< std::vector< Event * > > subSample
Definition: Forest.h:59
Tree * getTree(unsigned int i)
Definition: Forest.cc:99
L1TForest()
Definition: Forest.cc:34
std::list< Node * > & getTerminalNodes()
Definition: Tree.cc:74
void predictEvents(std::vector< Event * > &eventsp, unsigned int trees)
Definition: Forest.cc:411
Float e2
Definition: deltaR.h:21
void predictEvent(Event *e, unsigned int trees)
Definition: Forest.cc:449
void filterEvents(std::vector< Event * > &tEvents)
Definition: Tree.cc:167
#define begin
Definition: vmac.h:30
void saveSplitValues(const char *savefilename)
Definition: Forest.cc:244
virtual Double_t target(Event *e)=0
tuple cout
Definition: gather_cfg.py:145
void saveToXML(const char *filename)
Definition: Tree.cc:331
bool compareEventsById(Event *e1, Event *e2)
Definition: Forest.cc:164
std::vector< Double_t > data
Definition: Event.h:30