CMS 3D CMS Logo

Forest.cc
Go to the documentation of this file.
1 // Forest.cxx //
3 // =====================================================================//
4 // This is the object implementation of a forest of decision trees. //
5 // We need this to implement gradient boosting. //
6 // References include //
7 // *Elements of Statistical Learning by Hastie, //
8 // Tibshirani, and Friedman. //
9 // *Greedy Function Approximation: A Gradient Boosting Machine. //
10 // Friedman. The Annals of Statistics, Vol. 29, No. 5. Oct 2001. //
11 // *Inductive Learning of Tree-based Regression Models. Luis Torgo. //
12 // //
14 
16 // _______________________Includes_______________________________________//
18 
21 
22 #include "TStopwatch.h"
23 
24 #include <iostream>
25 #include <sstream>
26 #include <algorithm>
27 #include <fstream>
28 #include <utility>
29 
30 using namespace emtf;
31 
33 // _______________________Constructor(s)________________________________//
35 
// Body of the constructor that takes no training events: the events matrix
// starts out with exactly one (empty) row, so events[0] exists right away.
37 {
38  events = std::vector< std::vector<Event*> >(1);
39 }
40 
42 // ----------------------------------------------------------------------
44 
45 L1TForest::L1TForest(std::vector<Event*>& trainingEvents)
46 {
47  setTrainingEvents(trainingEvents);
48 }
49 
51 // _______________________Destructor____________________________________//
53 
// Destructor body: releases every Tree reachable through the trees vector.
55 {
56  // Only the trees are deleted in this body. Nothing here deletes the
57  // Event objects referenced by the events matrix.
58  // NOTE(review): this comment used to say the training and testing events
59  // were deleted as well -- confirm who is meant to own the events.
60 
61  for(unsigned int i=0; i < trees.size(); i++)
62  {
63  delete trees[i];
64  }
65 }
67 // ______________________Get/Set_Functions______________________________//
69 
70 void L1TForest::setTrainingEvents(std::vector<Event*>& trainingEvents)
71 {
72  // tell the forest which events to use for training
73 
74  Event* e = trainingEvents[0];
75  // Unused variable
76  // unsigned int numrows = e->data.size();
77 
78  // Reset the events matrix.
79  events = std::vector< std::vector<Event*> >();
80 
81  events.reserve(e->data.size());
82 
83  for(unsigned int i=0; i<e->data.size(); i++)
84  {
85  events.push_back(trainingEvents);
86  }
87 }
88 
90 // ----------------------------------------------------------------------
92 
93 // return a copy of the training events
94 std::vector<Event*> L1TForest::getTrainingEvents(){ return events[0]; }
95 
97 // ----------------------------------------------------------------------
99 
100 // return the ith tree
// Bounds-checked lookup: gives back trees[i] when i is a valid index and
// 0 (a null pointer) otherwise, so callers must check the result.
102 {
103  if(/*i>=0 && */i<trees.size()) return trees[i];
104  else
105  {
106  // Out-of-range request: stay silent (the diagnostic below is disabled).
106  //std::cout << i << "is an invalid input for getTree. Out of range." << std::endl;
107  return 0;
108  }
109 }
110 
112 // ______________________Various_Helpful_Functions______________________//
114 
115 unsigned int L1TForest::size()
116 {
117  // Return the number of trees in the forest.
118  return trees.size();
119 }
120 
122 //*** Need to make a data structure that includes the next few functions ***
123 //*** pertaining to events. These don't really have much to do with the ***
124 //*** forest class. ***
126 
128 // ----------------------------------------------------------------------
130 
131 void L1TForest::listEvents(std::vector< std::vector<Event*> >& e)
132 {
133  // Simply list the events in each event vector. We have multiple copies
134  // of the events vector. Each copy is sorted according to a different
135  // determining variable.
136  std::cout << std::endl << "Listing Events... " << std::endl;
137 
138  for(unsigned int i=0; i < e.size(); i++)
139  {
140  std::cout << std::endl << "Variable " << i << " vector contents: " << std::endl;
141  for(unsigned int j=0; j<e[i].size(); j++)
142  {
143  e[i][j]->outputEvent();
144  }
145  std::cout << std::endl;
146  }
147 }
148 
150 // ----------------------------------------------------------------------
152 
153 // Out-of-class definition of the static data member Event::sortingIndex.
154 // A static member has to be defined (and initialized) at namespace scope,
154 // not inside a function. It starts at 1.
155 Int_t Event::sortingIndex = 1;
156 
// Comparison predicate handed to std::sort by sortEventVectors().
// NOTE(review): no statement is visible in this body -- the comparison that
// the comment below describes appears to be missing and must be restored
// from the upstream file before this can compile.
158 {
159  // Sort the events according to the variable given by the sortingIndex.
161 }
163 // ----------------------------------------------------------------------
165 
// Ordering predicate on the event ID: true when e1's id is strictly smaller
// than e2's id. Both pointers are dereferenced without a null check.
167 {
168  // Sort the events by ID. We need this to produce rate plots.
169  return e1->id < e2->id;
170 }
172 // ----------------------------------------------------------------------
174 
175 void L1TForest::sortEventVectors(std::vector< std::vector<Event*> >& e)
176 {
177  // When a node chooses the optimum split point and split variable it needs
178  // the events to be sorted according to the variable it is considering.
179 
180  for(unsigned int i=0; i<e.size(); i++)
181  {
183  std::sort(e[i].begin(), e[i].end(), compareEvents);
184  }
185 }
186 
188 // ----------------------------------------------------------------------
190 
191 void L1TForest::rankVariables(std::vector<int>& rank)
192 {
193  // This function ranks the determining variables according to their importance
194  // in determining the fit. Use a low learning rate for better results.
195  // Separates completely useless variables from useful ones well,
196  // but isn't the best at separating variables of similar importance.
197  // This is calculated using the error reduction on the training set. The function
198  // should be changed to use the testing set, but this works fine for now.
199  // I will try to change this in the future.
200 
201  // Initialize the vector v, which will store the total error reduction
202  // for each variable i in v[i].
203  std::vector<double> v(events.size(), 0);
204 
205  //std::cout << std::endl << "Ranking Variables by Net Error Reduction... " << std::endl;
206 
207  for(unsigned int j=0; j < trees.size(); j++)
208  {
209  trees[j]->rankVariables(v);
210  }
211 
212  double max = *std::max_element(v.begin(), v.end());
213 
214  // Scale the importance. Maximum importance = 100.
215  for(unsigned int i=0; i < v.size(); i++)
216  {
217  v[i] = 100*v[i]/max;
218  }
219 
220  // Change the storage format so that we can keep the index
221  // and the value associated after sorting.
222  std::vector< std::pair<double, Int_t> > w(events.size());
223 
224  for(unsigned int i=0; i<v.size(); i++)
225  {
226  w[i] = std::pair<double, Int_t>(v[i],i);
227  }
228 
229  // Sort so that we can output in order of importance.
230  std::sort(w.begin(),w.end());
231 
232  // Output the results.
233  for(int i=(v.size()-1); i>=0; i--)
234  {
235  rank.push_back(w[i].second);
236  // std::cout << "x" << w[i].second << ": " << w[i].first << std::endl;
237  }
238 
239  //std::cout << std::endl << "Done." << std::endl << std::endl;
240 }
241 
243 // ----------------------------------------------------------------------
245 
246 void L1TForest::saveSplitValues(const char* savefilename)
247 {
248  // This function gathers all of the split values from the forest and puts them into lists.
249 
250  std::ofstream splitvaluefile;
251  splitvaluefile.open(savefilename);
252 
253  // Initialize the matrix v, which will store the list of split values
254  // for each variable i in v[i].
255  std::vector<std::vector<double>> v(events.size(), std::vector<double>());
256 
257  //std::cout << std::endl << "Gathering split values... " << std::endl;
258 
259  // Gather the split values from each tree in the forest.
260  for(unsigned int j=0; j<trees.size(); j++)
261  {
262  trees[j]->getSplitValues(v);
263  }
264 
265  // Sort the lists of split values and remove the duplicates.
266  for(unsigned int i=0; i<v.size(); i++)
267  {
268  std::sort(v[i].begin(),v[i].end());
269  v[i].erase( unique( v[i].begin(), v[i].end() ), v[i].end() );
270  }
271 
272  // Output the results after removing duplicates.
273  // The 0th variable is special and is not used for splitting, so we start at 1.
274  for(unsigned int i=1; i<v.size(); i++)
275  {
276  TString splitValues;
277  for(unsigned int j=0; j<v[i].size(); j++)
278  {
279  std::stringstream ss;
280  ss.precision(14);
281  ss << std::scientific << v[i][j];
282  splitValues+=",";
283  splitValues+=ss.str().c_str();
284  }
285 
286  splitValues=splitValues(1,splitValues.Length());
287  splitvaluefile << splitValues << std::endl << std::endl;;
288  }
289 }
291 // ______________________Update_Events_After_Fitting____________________//
293 
// For every terminal node of tree: compute the node's fit from the loss
// function l, scale it by learningRate, store it in the node, add it to the
// prediction of each event in that node and recompute each event's target.
295 {
296  // Prepare the global vector of events for the next tree.
297  // Update the fit for each event and set the new target value
298  // for the next tree.
299 
300  // Get the list of terminal nodes for this tree.
301  std::list<Node*>& tn = tree->getTerminalNodes();
302 
303  // Loop through the terminal nodes.
304  for(std::list<Node*>::iterator it=tn.begin(); it!=tn.end(); it++)
305  {
306  // Get the events in the current terminal region (row 0 of the node's matrix).
307  std::vector<Event*>& v = (*it)->getEvents()[0];
308 
309  // Fit the events depending on the loss function criteria.
310  double fit = l->fit(v);
311 
312  // Scale the rate at which the algorithm converges.
313  fit = learningRate*fit;
314 
315  // Store the official fit value in the terminal node.
316  (*it)->setFitValue(fit);
317 
318  // Loop through each event in the terminal region, add this tree's
319  // correction to its prediction and store the new target in data[0].
320  for(unsigned int j=0; j<v.size(); j++)
321  {
322  Event* e = v[j];
323  e->predictedValue += fit;
324  e->data[0] = l->target(e);
325  }
326 
327  // Release memory: the node's event matrix is replaced by an empty one, so v must not be used after this line.
328  (*it)->getEvents() = std::vector< std::vector<Event*> >();
329  }
330 }
331 
333 // ----------------------------------------------------------------------
335 
// For every terminal node of tree: add the fit value already stored in the
// node to the prediction of each event the node holds. Unlike the training
// update, no fit is computed and no target is changed here.
337 {
338  // Prepare the test events for the next tree.
339 
340  // Get the list of terminal nodes for this tree.
341  std::list<Node*>& tn = tree->getTerminalNodes();
342 
343  // Loop through the terminal nodes.
344  for(std::list<Node*>::iterator it=tn.begin(); it!=tn.end(); it++)
345  {
345  // Events held by this node (row 0) and the fit value stored in it.
346  std::vector<Event*>& v = (*it)->getEvents()[0];
347  double fit = (*it)->getFitValue();
348 
349  // Loop through each event in the terminal region and add the node's
350  // fit value to its prediction.
351  for(unsigned int j=0; j<v.size(); j++)
352  {
353  Event* e = v[j];
354  e->predictedValue += fit;
355  }
356 
357  // Release memory: the node's event matrix is replaced by an empty one, so v must not be used after this line.
358  (*it)->getEvents() = std::vector< std::vector<Event*> >();
359  }
360 }
361 
363 // ____________________Do/Test_the Regression___________________________//
365 
366 void L1TForest::doRegression(Int_t nodeLimit, Int_t treeLimit, double learningRate, L1TLossFunction* l, const char* savetreesdirectory, bool saveTrees)
367 {
368  // Build the forest using the training sample.
369 
370  //std::cout << std::endl << "--Building L1TForest..." << std::endl << std::endl;
371 
372  // The trees work with a matrix of events where the rows have the same set of events. Each row however
373  // is sorted according to the feature variable given by event->data[row].
374  // If we only had one set of events we would have to sort it according to the
375  // feature variable every time we want to calculate the best split point for that feature.
376  // By keeping sorted copies we avoid the sorting operation during splint point calculation
377  // and save computation time. If we do not sort each of the rows the regression will fail.
378  //std::cout << "Sorting event vectors..." << std::endl;
379  sortEventVectors(events);
380 
381  // See how long the regression takes.
382  TStopwatch timer;
383  timer.Start(kTRUE);
384 
385  for(unsigned int i=0; i< (unsigned) treeLimit; i++)
386  {
387  // std::cout << "++Building Tree " << i << "... " << std::endl;
389  trees.push_back(tree);
390  tree->buildTree(nodeLimit);
391 
392  // Update the targets for the next tree to fit.
393  updateRegTargets(tree, learningRate, l);
394 
395  // Save trees to xml in some directory.
396  std::ostringstream ss;
397  ss << savetreesdirectory << "/" << i << ".xml";
398  std::string s = ss.str();
399  const char* c = s.c_str();
400 
401  if(saveTrees) tree->saveToXML(c);
402  }
403  //std::cout << std::endl;
404  //std::cout << std::endl << "Done." << std::endl << std::endl;
405 
406  // std::cout << std::endl << "Total calculation time: " << timer.RealTime() << std::endl;
407 }
408 
410 // ----------------------------------------------------------------------
412 
413 void L1TForest::predictEvents(std::vector<Event*>& eventsp, unsigned int numtrees)
414 {
415  // Predict values for eventsp by running them through the forest up to numtrees.
416 
417  //std::cout << "Using " << numtrees << " trees from the forest to predict events ... " << std::endl;
418  if(numtrees > trees.size())
419  {
420  //std::cout << std::endl << "!! Input greater than the forest size. Using forest.size() = " << trees.size() << " to predict instead." << std::endl;
421  numtrees = trees.size();
422  }
423 
424  // i iterates through the trees in the forest. Each tree corrects the last prediction.
425  for(unsigned int i=0; i < numtrees; i++)
426  {
427  //std::cout << "++Tree " << i << "..." << std::endl;
428  appendCorrection(eventsp, i);
429  }
430 }
431 
433 // ----------------------------------------------------------------------
435 
436 void L1TForest::appendCorrection(std::vector<Event*>& eventsp, Int_t treenum)
437 {
438  // Update the prediction by appending the next correction.
439 
440  emtf::Tree* tree = trees[treenum];
441  tree->filterEvents(eventsp);
442 
443  // Update the events with their new prediction.
444  updateEvents(tree);
445 }
446 
448 // ----------------------------------------------------------------------
450 
451 void L1TForest::predictEvent(Event* e, unsigned int numtrees)
452 {
453  // Predict values for eventsp by running them through the forest up to numtrees.
454 
455  //std::cout << "Using " << numtrees << " trees from the forest to predict events ... " << std::endl;
456  if(numtrees > trees.size())
457  {
458  //std::cout << std::endl << "!! Input greater than the forest size. Using forest.size() = " << trees.size() << " to predict instead." << std::endl;
459  numtrees = trees.size();
460  }
461 
462  // i iterates through the trees in the forest. Each tree corrects the last prediction.
463  for(unsigned int i=0; i < numtrees; i++)
464  {
465  //std::cout << "++Tree " << i << "..." << std::endl;
466  appendCorrection(e, i);
467  }
468 }
469 
471 // ----------------------------------------------------------------------
473 
474 void L1TForest::appendCorrection(Event* e, Int_t treenum)
475 {
476  // Update the prediction by appending the next correction.
477 
478  emtf::Tree* tree = trees[treenum];
479  Node* terminalNode = tree->filterEvent(e);
480 
481  // Update the event with its new prediction.
482  double fit = terminalNode->getFitValue();
483  e->predictedValue += fit;
484 }
486 // ----------------------------------------------------------------------------------
488 
489 void L1TForest::loadL1TForestFromXML(const char* directory, unsigned int numTrees)
490 {
491  // Load a forest that has already been created and stored into XML somewhere.
492 
493  // Initialize the vector of trees.
494  trees = std::vector<emtf::Tree*>(numTrees);
495 
496  // Load the L1TForest.
497  //std::cout << std::endl << "Loading L1TForest from XML ... " << std::endl;
498  for(unsigned int i=0; i < numTrees; i++)
499  {
500  trees[i] = new emtf::Tree();
501 
502  std::stringstream ss;
503  ss << directory << "/" << i << ".xml";
504 
505  //trees[i]->loadFromXML(ss.str().c_str());
506  trees[i]->loadFromXML(edm::FileInPath(ss.str().c_str()).fullPath().c_str());
507  }
508 
509  // std::cout << "Done." << std::endl << std::endl;
510 }
511 
513 // ___________________Stochastic_Sampling_&_Regression__________________//
515 
// Fill the subSample matrix with a random subset of the training events:
// one row per row of the events matrix, each row holding the same subset,
// sorted afterwards by sortEventVectors().
// Side effect: the order of events[0] is changed by the shuffle.
517 {
518  // We use this for Stochastic Gradient Boosting. Basically you
519  // take a subsample of the training events and build a tree using
520  // those. Then use the tree built from the subsample to update
521  // the predictions for all the events.
522 
522  // The product is converted to size_t, which drops any fractional part.
523  subSample = std::vector< std::vector<Event*> >(events.size()) ;
524  size_t subSampleSize = fraction*events[0].size();
525 
526  // Randomize the first subSampleSize events in events[0].
527  shuffle(events[0].begin(), events[0].end(), subSampleSize);
528 
529  // Get a copy of the random subset we just made.
529  // NOTE(review): for fraction > 1 the end iterator below lies past events[0].end() -- confirm callers keep fraction in [0,1].
530  std::vector<Event*> v(events[0].begin(), events[0].begin()+subSampleSize);
531 
532  // Initialize and sort the subSample collection.
533  for(unsigned int i=0; i<subSample.size(); i++)
534  {
535  subSample[i] = v;
536  }
537 
538  sortEventVectors(subSample);
539 }
540 
542 // ----------------------------------------------------------------------
544 
545 void L1TForest::doStochasticRegression(Int_t nodeLimit, Int_t treeLimit, double learningRate, double fraction, L1TLossFunction* l)
546 {
547  // If the fraction of events to use is one then this algorithm is slower than doRegression due to the fact
548  // that we have to sort the events every time we extract a subsample. Without random sampling we simply
549  // use all of the events and keep them sorted.
550 
551  // Anyways, this algorithm uses a portion of the events to train each tree. All of the events are updated
552  // afterwards with the results from the subsample built tree.
553 
554  // Prepare some things.
555  sortEventVectors(events);
556  trees = std::vector<emtf::Tree*>(treeLimit);
557 
558  // See how long the regression takes.
559  TStopwatch timer;
560  timer.Start(kTRUE);
561 
562  // Output the current settings.
563  // std::cout << std::endl << "Running stochastic regression ... " << std::endl;
564  //std::cout << "# Nodes: " << nodeLimit << std::endl;
565  //std::cout << "Learning Rate: " << learningRate << std::endl;
566  //std::cout << "Bagging Fraction: " << fraction << std::endl;
567  //std::cout << std::endl;
568 
569  for(unsigned int i=0; i< (unsigned) treeLimit; i++)
570  {
571  // Build the tree using a random subsample.
572  prepareRandomSubsample(fraction);
573  trees[i] = new emtf::Tree(subSample);
574  trees[i]->buildTree(nodeLimit);
575 
576  // Fit all of the events based upon the tree we built using
577  // the subsample of events.
578  trees[i]->filterEvents(events[0]);
579 
580  // Update the targets for the next tree to fit.
581  updateRegTargets(trees[i], learningRate, l);
582 
583  // Save trees to xml in some directory.
584  std::ostringstream ss;
585  ss << "trees/" << i << ".xml";
586  std::string s = ss.str();
587  const char* c = s.c_str();
588 
589  trees[i]->saveToXML(c);
590  }
591 
592  //std::cout << std::endl << "Done." << std::endl << std::endl;
593 
594  //std::cout << std::endl << "Total calculation time: " << timer.RealTime() << std::endl;
595 }
static Int_t sortingIndex
Definition: Event.h:28
std::vector< emtf::Event * > getTrainingEvents()
Definition: Forest.cc:94
void setTrainingEvents(std::vector< emtf::Event * > &trainingEvents)
Definition: Forest.cc:70
void predictEvents(std::vector< emtf::Event * > &eventsp, unsigned int trees)
Definition: Forest.cc:413
const double w
Definition: UKUtility.cc:23
void updateRegTargets(emtf::Tree *tree, double learningRate, L1TLossFunction *l)
Definition: Forest.cc:294
void prepareRandomSubsample(double fraction)
Definition: Forest.cc:516
void rankVariables(std::vector< int > &rank)
Definition: Forest.cc:191
unsigned int size()
Definition: Forest.cc:115
Definition: Event.h:15
void doRegression(Int_t nodeLimit, Int_t treeLimit, double learningRate, L1TLossFunction *l, const char *savetreesdirectory, bool saveTrees)
Definition: Forest.cc:366
void loadL1TForestFromXML(const char *directory, unsigned int numTrees)
Definition: Forest.cc:489
void doStochasticRegression(Int_t nodeLimit, Int_t treeLimit, double learningRate, double fraction, L1TLossFunction *l)
Definition: Forest.cc:545
emtf::Tree * getTree(unsigned int i)
Definition: Forest.cc:101
bidiiter shuffle(bidiiter begin, bidiiter end, size_t num_random)
Definition: Utilities.h:26
U second(std::pair< T, U > const &p)
void updateEvents(emtf::Tree *tree)
Definition: Forest.cc:336
std::vector< Double_t > data
Definition: Event.h:30
virtual Double_t target(emtf::Event *e)=0
~L1TForest()
Definition: Forest.cc:54
def unique(seq, keepstr=True)
Definition: tier0.py:24
Int_t id
Definition: Event.h:29
#define end
Definition: vmac.h:37
virtual Double_t fit(std::vector< emtf::Event * > &v)=0
std::list< Node * > & getTerminalNodes()
Definition: Tree.cc:76
Double_t getFitValue()
Definition: Node.cc:157
void buildTree(Int_t nodeLimit)
Definition: Tree.cc:108
bool compareEvents(Event *e1, Event *e2)
Definition: Forest.cc:157
Float e1
Definition: deltaR.h:20
Double_t predictedValue
Definition: Event.h:20
L1TForest()
Definition: Forest.cc:36
void sortEventVectors(std::vector< std::vector< emtf::Event * > > &e)
Definition: Forest.cc:175
Node * filterEvent(Event *e)
Definition: Tree.cc:203
void listEvents(std::vector< std::vector< emtf::Event * > > &e)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Definition: Forest.cc:131
Float e2
Definition: deltaR.h:21
void predictEvent(emtf::Event *e, unsigned int trees)
Definition: Forest.cc:451
void saveToXML(const char *filename)
Definition: Tree.cc:333
void appendCorrection(std::vector< emtf::Event * > &eventsp, Int_t treenum)
Definition: Forest.cc:436
#define begin
Definition: vmac.h:30
void saveSplitValues(const char *savefilename)
Definition: Forest.cc:246
Definition: tree.py:1
void filterEvents(std::vector< Event * > &tEvents)
Definition: Tree.cc:169
bool compareEventsById(Event *e1, Event *e2)
Definition: Forest.cc:166