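//
// Interactive application for side-by-side comparison of SPR classifiers.
// The user is prompted to choose classifiers (LDA, QDA, logistic regression,
// decision trees, neural nets, boosted nets/splits/trees, random forest,
// arc-x4) and their parameters; each classifier is trained on the input data
// and its responses on the validation set are cached on disk, so that
// subsequent runs only retrain what has changed. For a user-supplied list of
// signal efficiencies the surviving background fraction is tabulated for
// every classifier, and the per-event responses can be saved to an ntuple.
//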
#include "PhysicsTools/StatPatternRecognition/interface/SprExperiment.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprAbsFilter.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprAbsClassifier.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprAbsTrainedClassifier.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprEmptyFilter.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprUtils.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprAbsReader.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprAbsWriter.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprClass.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprBinarySplit.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprAdaBoost.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprTrainedAdaBoost.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprFisher.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprTrainedFisher.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprLogitR.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprTrainedLogitR.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprStdBackprop.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprTrainedStdBackprop.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprBagger.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprArcE4.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprTrainedBagger.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprTopdownTree.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprTrainedTopdownTree.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassGiniIndex.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassIDFraction.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprIntegerBootstrap.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprStringParser.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprDataFeeder.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprRWFactory.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprAbsVarTransformer.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprVarTransformerReader.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprTransformerFilter.hh"

#include <unistd.h>
#include <stdio.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>

#include <cassert>
#include <iostream>
#include <fstream>
#include <sstream>
#include <map>
#include <vector>
#include <utility>
#include <set>
#include <string>
#include <memory>
#include <iomanip>
#include <algorithm>
#include <functional>

using namespace std;

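// Response of one classifier for one validation event:
// true class (0=background, 1=signal), event weight, and classifier output.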
struct SIAResponse {
  int cls;
  double weight;
  double response;

  ~SIAResponse() {}

  SIAResponse(int c, double w, double r)
    : cls(c), weight(w), response(r) {}

  SIAResponse(const SIAResponse& other)
    : cls(other.cls), weight(other.weight), response(other.response) {}
};

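// Comparison functor: orders (response,weight) pairs by decreasing response.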
struct SIACmpPairDDFirst
  : public binary_function<pair<double,double>,pair<double,double>,bool> {
  bool operator()(const pair<double,double>& l,
                  const pair<double,double>& r) const {
    return (l.first > r.first);
  }
};


void help(const char* prog)
{
  cout << "Usage: " << prog << " training_data_file " << endl;
  cout << "\t Options: " << endl;
  cout << "\t-h --- help " << endl;
  cout << "\t-p path to temporary cache files (default=\"./\") " << endl;
  cout << "\t-o output Tuple file " << endl;
  cout << "\t-a input ascii file mode (see SprSimpleReader.hh) " << endl;
  cout << "\t-A save output data in ascii instead of Root " << endl;
  cout << "\t-y list of input classes (see SprAbsFilter.hh) " << endl;
  cout << "\t-Q apply variable transformation saved in file " << endl;
  cout << "\t-K keep this fraction in training set and " << endl;
  cout << "\t\t put the rest into validation set " << endl;
  cout << "\t-D randomize training set split-up " << endl;
  cout << "\t-t read validation/test data from a file " << endl;
  cout << "\t\t (must be in same format as input data!!!) " << endl;
  cout << "\t-v verbose level (0=silent default,1,2) " << endl;
  cout << "\t-V include only these input variables " << endl;
  cout << "\t-z exclude input variables from the list " << endl;
  cout << "\t\t Variables must be listed in quotes and separated by commas."
       << endl;
}

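// Delete all classifiers, their owned sub-classifiers, and bootstrap objects,
// then return the supplied exit status.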
int prepareExit(const map<string,SprAbsClassifier*>& classifiers,
                const vector<const SprAbsClassifier*>& cToClean,
                const vector<const SprIntegerBootstrap*>& bootstraps,
                int status)
{
  for( map<string,SprAbsClassifier*>::const_iterator
         i=classifiers.begin();i!=classifiers.end();i++ )
    delete i->second;
  for( int i=0;i<cToClean.size();i++ )
    delete cToClean[i];
  for( int i=0;i<bootstraps.size();i++ )
    delete bootstraps[i];
  return status;
}

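// Interactive yes/no prompt; an empty answer defaults to "yes".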
bool answerYN(const char* question)
{
  cout << question << " y/n [y] ";
  // Read the whole line so that answers longer than one character
  // do not leave cin in a failed state for subsequent prompts.
  string line;
  getline(cin,line,'\n');
  return ( line.empty() || line[0]=='y' );
}


unsigned answerHowMany(unsigned defaultN, const char* question)
{
  cout << question << " [" << defaultN << "] ";
  string line;
  getline(cin,line,'\n');
  unsigned n = defaultN;
  if( !line.empty() )
    n = atoi(line.c_str());
  return n;
}


string answerName(const char* question)
{
  cout << question << " ";
  string line;
  getline(cin,line,'\n');
  if( !line.empty() ) {
    line.erase( line.find_last_not_of(' ') + 1 );
    line.erase( 0, line.find_first_not_of(' ') );
  }
  return line;
}

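// Open an existing cache file <prefix><cacheName> for reading.
// Returns false if the file does not exist.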
bool findCache(const string& prefix, const char* cacheName, ifstream& file)
{
  string fname = prefix;
  fname += cacheName;
  file.open(fname.c_str());
  if( !file ) return false;
  return true;
}


bool checkOldCache(const string& prefix)
{
  string cmd = "ls -a ";
  cmd += prefix;
  cmd += "*";
  if( system(cmd.c_str()) == 0 )
    return true;
  return false;
}


bool eraseOldCache(const string& prefix)
{
  string cmd = "rm -f ";
  cmd += prefix;
  cmd += "*";
  if( system(cmd.c_str()) == 0 )
    return true;
  return false;
}

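// If a freshly written <prefix><name>.new cache exists, move it on top of
// the old cache file.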
bool moveNewOntoOld(const string& prefix, const char* name)
{
  string oldfile = prefix;
  oldfile += name;
  string newfile = oldfile;
  newfile += ".new";

  struct stat buf;
  if( stat(newfile.c_str(),&buf) == 0 ) {
    string cmd = "mv -f";
    cmd += " "; cmd += newfile;
    cmd += " "; cmd += oldfile;
    if( system(cmd.c_str()) != 0 ) {
      cerr << "Attempt to move new cache " << newfile.c_str()
           << " onto old cache " << oldfile.c_str()
           << " terminated with error " << errno << endl;
      return false;
    }
    cout << "Moved " << newfile.c_str() << " =====> "
         << oldfile.c_str() << endl;
  }

  return true;
}

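// Open a new cache file <prefix><cacheName>.new for writing,
// deleting any stale file of the same name first.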
bool makeNewCache(const string& prefix, const char* cacheName, ofstream& file)
{
  string fname = prefix;
  fname += cacheName;
  fname += ".new";

  struct stat buf;
  if( stat(fname.c_str(),&buf) == 0 ) {
    cerr << "Warning: file " << fname.c_str() << " will be deleted." << endl;
    string cmd = "rm -f ";
    cmd += fname.c_str();
    if( system(cmd.c_str()) != 0 ) {
      cerr << "Attempt to delete file " << fname.c_str()
           << " terminated with error " << errno << endl;
      return false;
    }
  }

  file.open(fname.c_str());
  if( !file ) {
    cerr << "Unable to open file " << fname.c_str() << endl;
    return false;
  }

  return true;
}

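// Append validated responses to <prefix><cacheName>.new,
// ten "class weight response #" records per line.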
bool storeNewCache(const string& prefix, const char* cacheName,
                   const vector<SIAResponse>& v)
{
  string fname = prefix;
  fname += cacheName;
  fname += ".new";

  ofstream file(fname.c_str(),ios_base::app);
  if( !file ) {
    cerr << "Unable to open file " << fname.c_str() << endl;
    return false;
  }

  int j = 0;
  for( int i=0;i<v.size();i++ ) {
    file << v[i].cls << " " << v[i].weight << " " << v[i].response << " # ";
    if( ++j == 10 ) {
      j = 0;
      file << endl;
    }
  }

  return true;
}

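// Store the list of signal-efficiency values in <prefix><cacheName>,
// overwriting any previous file.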
bool storeEffCache(const string& prefix, const char* cacheName,
                   const vector<double>& v)
{
  string fname = prefix;
  fname += cacheName;

  struct stat buf;
  if( stat(fname.c_str(),&buf) == 0 ) {
    cerr << "Warning: file " << fname.c_str() << " will be deleted." << endl;
    string cmd = "rm -f ";
    cmd += fname.c_str();
    if( system(cmd.c_str()) != 0 ) {
      cerr << "Attempt to delete file " << fname.c_str()
           << " terminated with error " << errno << endl;
      return false;
    }
  }

  ofstream file(fname.c_str());
  if( !file ) {
    cerr << "Unable to open file " << fname.c_str() << endl;
    return false;
  }

  for( int i=0;i<v.size();i++ )
    file << v[i] << " ";
  file << endl;

  return true;
}

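// Read cached responses back; every line holds several '#'-terminated
// "class weight response" records. Returns the number of records read.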
unsigned readCache(ifstream& file, vector<SIAResponse>& v)
{
  v.clear();
  int cls(0); double weight(0), resp(0);
  string line;
  char dummy;
  while( getline(file,line) ) {
    istringstream str(line);
    unsigned n = 0;
    for( int i=0;i<line.size();i++ )
      if( line[i]=='#' ) n++;
    for( int i=0;i<n;i++ ) {
      str >> cls >> weight >> resp >> dummy;
      v.push_back(SIAResponse(cls,weight,resp));
    }
  }
  return v.size();
}

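// Return true if the cached response vector for this classifier is missing or
// does not match the current validation set and therefore must be recomputed.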
bool resetValidation(const char* cacheName,
                     map<string,vector<SIAResponse> >& validated,
                     const SprAbsFilter* filter)
{
  map<string,vector<SIAResponse> >::iterator iter = validated.find(cacheName);

  if( iter == validated.end() ) {
    pair<map<string,vector<SIAResponse> >::iterator,bool> inserted =
      validated.insert(pair<const string,vector<SIAResponse> >(cacheName,
                                                  vector<SIAResponse>()));
    assert( inserted.second );
    return true;
  }

  if( filter->size() != iter->second.size() ) {
    iter->second.clear();
    return true;
  }

  return false;
}


int main(int argc, char ** argv)
{
  if( argc < 2 ) {
    help(argv[0]);
    return 1;
  }

  string tupleFile;
  string pathToCache;
  int readMode = 0;
  SprRWFactory::DataType writeMode = SprRWFactory::Root;
  int verbose = 0;
  string includeList, excludeList;
  string inputClassesString;
  string valFile;
  bool split = false;
  double splitFactor = 0;
  bool splitRandomize = false;
  string transformerFile;

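  // decode command-line options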
  int c;
  extern char* optarg;

  while( (c = getopt(argc,argv,"hp:o:a:Ay:Q:K:Dt:v:V:z:")) != EOF ) {
    switch( c )
      {
      case 'h' :
        help(argv[0]);
        return 1;
      case 'p' :
        pathToCache = optarg;
        break;
      case 'o' :
        tupleFile = optarg;
        break;
      case 'a' :
        readMode = (optarg==0 ? 0 : atoi(optarg));
        break;
      case 'A' :
        writeMode = SprRWFactory::Ascii;
        break;
      case 'y' :
        inputClassesString = optarg;
        break;
      case 'Q' :
        transformerFile = optarg;
        break;
      case 'K' :
        split = true;
        splitFactor = (optarg==0 ? 0 : atof(optarg));
        break;
      case 'D' :
        splitRandomize = true;
        break;
      case 't' :
        valFile = optarg;
        break;
      case 'v' :
        verbose = (optarg==0 ? 0 : atoi(optarg));
        break;
      case 'V' :
        includeList = optarg;
        break;
      case 'z' :
        excludeList = optarg;
        break;
      }
  }

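  // the training data file is the last command-line argument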
  string trFile = argv[argc-1];
  if( trFile.empty() ) {
    cerr << "No training file is specified." << endl;
    return 1;
  }

  SprRWFactory::DataType inputType
    = ( readMode==0 ? SprRWFactory::Root : SprRWFactory::Ascii );
  auto_ptr<SprAbsReader> reader(SprRWFactory::makeReader(inputType,readMode));

  set<string> includeSet;
  if( !includeList.empty() ) {
    vector<vector<string> > includeVars;
    SprStringParser::parseToStrings(includeList.c_str(),includeVars);
    assert( !includeVars.empty() );
    for( int i=0;i<includeVars[0].size();i++ )
      includeSet.insert(includeVars[0][i]);
    if( !reader->chooseVars(includeSet) ) {
      cerr << "Unable to include variables in training set." << endl;
      return 2;
    }
    else {
      cout << "Following variables have been included in optimization: ";
      for( set<string>::const_iterator
             i=includeSet.begin();i!=includeSet.end();i++ )
        cout << "\"" << *i << "\"" << " ";
      cout << endl;
    }
  }

  set<string> excludeSet;
  if( !excludeList.empty() ) {
    vector<vector<string> > excludeVars;
    SprStringParser::parseToStrings(excludeList.c_str(),excludeVars);
    assert( !excludeVars.empty() );
    for( int i=0;i<excludeVars[0].size();i++ )
      excludeSet.insert(excludeVars[0][i]);
    if( !reader->chooseAllBut(excludeSet) ) {
      cerr << "Unable to exclude variables from training set." << endl;
      return 2;
    }
    else {
      cout << "Following variables have been excluded from optimization: ";
      for( set<string>::const_iterator
             i=excludeSet.begin();i!=excludeSet.end();i++ )
        cout << "\"" << *i << "\"" << " ";
      cout << endl;
    }
  }

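  // read training data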
  auto_ptr<SprAbsFilter> filter(reader->read(trFile.c_str()));
  if( filter.get() == 0 ) {
    cerr << "Unable to read data from file " << trFile.c_str() << endl;
    return 2;
  }
  vector<string> vars;
  filter->vars(vars);
  cout << "Read data from file " << trFile.c_str()
       << " for variables";
  for( int i=0;i<vars.size();i++ )
    cout << " \"" << vars[i].c_str() << "\"";
  cout << endl;
  cout << "Total number of points read: " << filter->size() << endl;

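  // filter training data by class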
  vector<SprClass> inputClasses;
  if( !filter->filterByClass(inputClassesString.c_str()) ) {
    cerr << "Cannot choose input classes for string "
         << inputClassesString << endl;
    return 2;
  }
  filter->classes(inputClasses);
  assert( inputClasses.size() > 1 );
  cout << "Training data filtered by class." << endl;
  for( int i=0;i<inputClasses.size();i++ ) {
    cout << "Points in class " << inputClasses[i] << ": "
         << filter->ptsInClass(inputClasses[i]) << endl;
  }

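  // prepare the validation sample: reuse the training data, split it off,
  // or read it from a separate file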
  auto_ptr<SprAbsFilter> valFilter;
  if( !split && valFile.empty() ) {
    cout << "No test data specified. Will use training data." << endl;
    vector<double> weights;
    filter->weights(weights);
    bool ownData = false;
    valFilter.reset(new SprEmptyFilter(filter->data(),weights,ownData));
  }
  if( split && !valFile.empty() ) {
    cerr << "Unable to split training data and use validation data "
         << "from a separate file." << endl;
    return 2;
  }
  if( split ) {
    cout << "Splitting training data with factor " << splitFactor << endl;
    if( splitRandomize )
      cout << "Will use randomized splitting." << endl;
    vector<double> weights;
    SprData* splitted = filter->split(splitFactor,weights,splitRandomize);
    if( splitted == 0 ) {
      cerr << "Unable to split training data." << endl;
      return 2;
    }
    bool ownData = true;
    valFilter.reset(new SprEmptyFilter(splitted,weights,ownData));
    cout << "Training data re-filtered:" << endl;
    for( int i=0;i<inputClasses.size();i++ ) {
      cout << "Points in class " << inputClasses[i] << ": "
           << filter->ptsInClass(inputClasses[i]) << endl;
    }
  }
  if( !valFile.empty() ) {
    auto_ptr<SprAbsReader>
      valReader(SprRWFactory::makeReader(inputType,readMode));
    if( !includeSet.empty() ) {
      if( !valReader->chooseVars(includeSet) ) {
        cerr << "Unable to include variables in validation set." << endl;
        return 2;
      }
    }
    if( !excludeSet.empty() ) {
      if( !valReader->chooseAllBut(excludeSet) ) {
        cerr << "Unable to exclude variables from validation set." << endl;
        return 2;
      }
    }
    valFilter.reset(valReader->read(valFile.c_str()));
    if( valFilter.get() == 0 ) {
      cerr << "Unable to read data from file " << valFile.c_str() << endl;
      return 2;
    }
    vector<string> valVars;
    valFilter->vars(valVars);
    cout << "Read validation data from file " << valFile.c_str()
         << " for variables";
    for( int i=0;i<valVars.size();i++ )
      cout << " \"" << valVars[i].c_str() << "\"";
    cout << endl;
    cout << "Total number of points read: " << valFilter->size() << endl;
  }

  if( !valFilter->filterByClass(inputClassesString.c_str()) ) {
    cerr << "Cannot choose input classes for string "
         << inputClassesString << endl;
    return 2;
  }
  valFilter->classes(inputClasses);
  cout << "Validation data filtered by class." << endl;
  for( int i=0;i<inputClasses.size();i++ ) {
    cout << "Points in class " << inputClasses[i] << ": "
         << valFilter->ptsInClass(inputClasses[i]) << endl;
  }

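  // apply an optional variable transformation (-Q) to training
  // and validation data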
  auto_ptr<SprAbsFilter> garbage_train, garbage_valid;
  if( !transformerFile.empty() ) {
    SprVarTransformerReader transReader;
    const SprAbsVarTransformer* t = transReader.read(transformerFile.c_str());
    if( t == 0 ) {
      cerr << "Unable to read VarTransformer from file "
           << transformerFile.c_str() << endl;
      return 2;
    }
    SprTransformerFilter* t_train = new SprTransformerFilter(filter.get());
    SprTransformerFilter* t_valid = 0;
    if( valFilter.get() != 0 )
      t_valid = new SprTransformerFilter(valFilter.get());
    bool replaceOriginalData = true;
    if( !t_train->transform(t,replaceOriginalData) ) {
      cerr << "Unable to apply VarTransformer to training data." << endl;
      return 2;
    }
    if( t_valid!=0 && !t_valid->transform(t,replaceOriginalData) ) {
      cerr << "Unable to apply VarTransformer to validation data." << endl;
      return 2;
    }
    cout << "Variable transformation from file "
         << transformerFile.c_str() << " has been applied to "
         << "training and validation data." << endl;
    garbage_train.reset(filter.release());
    garbage_valid.reset(valFilter.release());
    filter.reset(t_train);
    valFilter.reset(t_valid);
  }

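  // normalize the cache directory path and build the cache file prefix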
  if( !pathToCache.empty() ) {
    pathToCache.erase( pathToCache.find_last_not_of(' ') + 1 );
    pathToCache.erase( 0, pathToCache.find_first_not_of(' ') );
  }
  if( !pathToCache.empty() && *(pathToCache.rbegin())!='/' )
    pathToCache += "/";
  cout << "Will use directory \"" << pathToCache.c_str()
       << "\" for cache." << endl;
  string prefix = pathToCache;
  prefix += ".cache_";

  if( checkOldCache(prefix) ) {
    cout << "Warning!!! Some old cache files .cache* are found." << endl;
    if( answerYN("Erase old cache?") ) {
      if( !eraseOldCache(prefix) ) return 2;
    }
  }

  SprTwoClassIDFraction idfrac;
  SprTwoClassGiniIndex gini;

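  //
  // main interactive loop: choose classifiers, train them, compare their
  // performance on the validation set, and repeat if requested
  //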
  bool go = true;
  while( go ) {

    map<string,SprAbsClassifier*> classifiers;
    map<string,vector<SIAResponse> > validated;
    map<string,string> message;
    vector<const SprAbsClassifier*> cToClean;
    vector<const SprIntegerBootstrap*> bootstraps;

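    // Fisher linear discriminant (LDA)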
    if( answerYN("Use Linear Discriminant Analysis?") ) {
      classifiers.insert(pair<const string,SprAbsClassifier*>(
                           "LDA",new SprFisher(filter.get(),1)));
      ifstream input;
      if( findCache(prefix,"LDA",input) ) {
        if( resetValidation("LDA",validated,valFilter.get()) ) {
          if( readCache(input,validated.find("LDA")->second)
              != valFilter->size() ) {
            cerr << "Cannot read cached LDA values." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }
        }
      }
      ostringstream ost;
      ost << "LDA = Linear Discriminant Analysis";
      message.insert(pair<const string,string>("LDA",ost.str()));
    }

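    // quadratic discriminant analysis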
    if( answerYN("Use Quadratic Discriminant Analysis?") ) {
      classifiers.insert(pair<const string,SprAbsClassifier*>(
                           "QDA",new SprFisher(filter.get(),2)));
      ifstream input;
      if( findCache(prefix,"QDA",input) ) {
        if( resetValidation("QDA",validated,valFilter.get()) ) {
          if( readCache(input,validated.find("QDA")->second)
              != valFilter->size() ) {
            cerr << "Cannot read cached QDA values." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }
        }
      }
      ostringstream ost;
      ost << "QDA = Quadratic Discriminant Analysis";
      message.insert(pair<const string,string>("QDA",ost.str()));
    }

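    // logistic regression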
    if( answerYN("Use Logistic Regression?") ) {
      double eps = 0.001;
      double updateFactor = 1;
      int initToZero = 1;
      bool repeat = false;
      ifstream input;
      string line;
      if( findCache(prefix,"LR",input) ) {
        repeat = true;
        if( !getline(input,line,'\n') ) {
          cerr << "Cannot read top line from LR cache." << endl;
          return prepareExit(classifiers,cToClean,bootstraps,3);
        }
        istringstream str(line);
        str >> eps >> updateFactor >> initToZero;
      }
      cout << "Input accuracy, update factor, and init flag "
           << "(0 if initialize from 0; 1 if initialize from LDA) ["
           << eps << " " << updateFactor << " " << initToZero << "] ";
      getline(cin,line,'\n');
      if( !line.empty() ) {
        repeat = false;
        istringstream str(line);
        str >> eps >> updateFactor >> initToZero;
      }
      if( repeat ) {
        if( resetValidation("LR",validated,valFilter.get()) ) {
          if( readCache(input,validated.find("LR")->second)
              != valFilter->size() ) {
            cerr << "Cannot read cached LR values." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }
        }
      }
      else {
        ofstream output;
        if( makeNewCache(prefix,"LR",output) ) {
          output << eps << " " << updateFactor << " " << initToZero << endl;
        }
        else {
          cerr << "Unable to make output file for LR." << endl;
          return prepareExit(classifiers,cToClean,bootstraps,3);
        }

        double beta0 = 0;
        SprVector beta;
        if( initToZero == 0 ) {
          SprVector dummy(filter->dim());
          beta = dummy;
          for( int i=0;i<filter->dim();i++ ) beta[i] = 0;
        }
        SprLogitR* logit = new SprLogitR(filter.get(),beta0,beta,
                                         eps,updateFactor);
        classifiers.insert(pair<const string,SprAbsClassifier*>("LR",logit));
      }
      ostringstream ost;
      ost << "LR = Logistic Regression with:"
          << " Eps=" << eps
          << " updateFactor=" << updateFactor
          << " initFlag=" << initToZero;
      message.insert(pair<const string,string>("LR",ost.str()));
    }

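    // single decision tree(s)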
    if( answerYN("Use single decision tree?") ) {

      unsigned ninstance = 0;
      ifstream iinst;
      if( findCache(prefix,"DT_instances",iinst) ) {
        string line;
        if( !getline(iinst,line,'\n') ) {
          cerr << "Cannot read top line from DT cache." << endl;
          return prepareExit(classifiers,cToClean,bootstraps,3);
        }
        istringstream str(line);
        str >> ninstance;
      }
      ninstance =
        answerHowMany(ninstance,
                      "How many instances of the classifier would you like to run?");
      ofstream oinst;
      if( makeNewCache(prefix,"DT_instances",oinst) ) {
        oinst << ninstance << endl;
      }
      else {
        cerr << "Unable to make output file for DT." << endl;
        return prepareExit(classifiers,cToClean,bootstraps,3);
      }
      if( !moveNewOntoOld(prefix,"DT_instances") )
        return prepareExit(classifiers,cToClean,bootstraps,3);

      for( int instance=0;instance<ninstance;instance++ ) {
        string name = "DT_";
        char s [200];
        sprintf(s,"%i",instance);
        name += s;
        unsigned nleaf = 0;
        bool repeat = false;
        ifstream input;
        string line;
        if( findCache(prefix,name.c_str(),input) ) {
          repeat = true;
          if( !getline(input,line,'\n') ) {
            cerr << "Cannot read top line from DT cache." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }
          istringstream str(line);
          str >> nleaf;
        }
        cout << "Input minimal tree leaf size "
             << "for DT instance " << instance << " [" << nleaf << "] ";
        getline(cin,line,'\n');
        if( !line.empty() ) {
          repeat = false;
          istringstream str(line);
          str >> nleaf;
        }
        if( repeat ) {
          if( resetValidation(name.c_str(),validated,valFilter.get()) ) {
            if( readCache(input,validated.find(name)->second)
                != valFilter->size() ) {
              cerr << "Cannot read cached DT values." << endl;
              return prepareExit(classifiers,cToClean,bootstraps,3);
            }
          }
        }
        else {
          ofstream output;
          if( makeNewCache(prefix,name.c_str(),output) ) {
            output << nleaf << endl;
          }
          else {
            cerr << "Unable to make output file for DT." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }

          bool discrete = false;
          SprTopdownTree* tree = new SprTopdownTree(filter.get(),&gini,
                                                    nleaf,discrete);
          classifiers.insert(pair<const string,SprAbsClassifier*>(name,tree));
        }
        ostringstream ost;
        ost << name.c_str()
            << " = Decision Tree with:"
            << " FOM=Gini"
            << " nEventsPerLeaf=" << nleaf;
        message.insert(pair<const string,string>(name,ost.str()));
      }
    }

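    // standalone feed-forward neural net(s)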
    if( answerYN("Use neural net?") ) {

      unsigned ninstance = 0;
      ifstream iinst;
      if( findCache(prefix,"STDNN_instances",iinst) ) {
        string line;
        if( !getline(iinst,line,'\n') ) {
          cerr << "Cannot read top line from STDNN cache." << endl;
          return prepareExit(classifiers,cToClean,bootstraps,3);
        }
        istringstream str(line);
        str >> ninstance;
      }
      ninstance =
        answerHowMany(ninstance,
                      "How many instances of the classifier would you like to run?");
      ofstream oinst;
      if( makeNewCache(prefix,"STDNN_instances",oinst) ) {
        oinst << ninstance << endl;
      }
      else {
        cerr << "Unable to make output file for STDNN." << endl;
        return prepareExit(classifiers,cToClean,bootstraps,3);
      }
      if( !moveNewOntoOld(prefix,"STDNN_instances") )
        return prepareExit(classifiers,cToClean,bootstraps,3);

      for( int instance=0;instance<ninstance;instance++ ) {
        string name = "STDNN_";
        char s [200];
        sprintf(s,"%i",instance);
        name += s;
        string structure = "I:H:H:O";
        unsigned ncycles(0), initPoints(0);
        double eta(0.1), initEta(0.1);
        bool repeat = false;
        ifstream input;
        string line;
        if( findCache(prefix,name.c_str(),input) ) {
          repeat = true;
          if( !getline(input,line,'\n') ) {
            cerr << "Cannot read top line from STDNN cache." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }
          istringstream str(line);
          str >> structure >> ncycles >> eta >> initPoints >> initEta;
        }
        cout << "Input StdBackprop NN structure (see README), "
             << "number of training cycles, "
             << "learning rate, "
             << "number of points for initialization (0 if use all), "
             << "and learning rate for initialization "
             << "for STDNN instance " << instance
             << " [" << structure.c_str() << " " << ncycles
             << " " << eta << " " << initPoints << " " << initEta << "] ";
        getline(cin,line,'\n');
        if( !line.empty() ) {
          repeat = false;
          istringstream str(line);
          str >> structure >> ncycles >> eta >> initPoints >> initEta;
        }
        if( repeat ) {
          if( resetValidation(name.c_str(),validated,valFilter.get()) ) {
            if( readCache(input,validated.find(name)->second)
                != valFilter->size() ) {
              cerr << "Cannot read cached STDNN values." << endl;
              return prepareExit(classifiers,cToClean,bootstraps,3);
            }
          }
        }
        else {
          ofstream output;
          if( makeNewCache(prefix,name.c_str(),output) ) {
            output << structure.c_str() << " " << ncycles << " "
                   << eta << " " << initPoints << " " << initEta << endl;
          }
          else {
            cerr << "Unable to make output file for STDNN." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }

          SprStdBackprop* stdnn =
            new SprStdBackprop(filter.get(),structure.c_str(),ncycles,eta);
          classifiers.insert(pair<const string,SprAbsClassifier*>(name,stdnn));
          // check the initialization status, as is done for the boosted nets
          if( !stdnn->init(initEta,initPoints) ) {
            cerr << "Unable to initialize neural net." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }
        }

        ostringstream ost;
        ost << name.c_str()
            << " = StdBackprop Neural Net with:"
            << " Structure=" << structure.c_str()
            << " nCycles=" << ncycles
            << " LearnRate=" << eta
            << " nInitPoints=" << initPoints
            << " InitLearnRate=" << initEta;
        message.insert(pair<const string,string>(name,ost.str()));
      }
    }

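    // AdaBoost over neural nets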
    if( answerYN("Use boosted neural nets?") ) {

      unsigned ninstance = 0;
      ifstream iinst;
      if( findCache(prefix,"BNN_instances",iinst) ) {
        string line;
        if( !getline(iinst,line,'\n') ) {
          cerr << "Cannot read top line from BNN cache." << endl;
          return prepareExit(classifiers,cToClean,bootstraps,3);
        }
        istringstream str(line);
        str >> ninstance;
      }
      ninstance =
        answerHowMany(ninstance,
                      "How many instances of the classifier would you like to run?");
      ofstream oinst;
      if( makeNewCache(prefix,"BNN_instances",oinst) ) {
        oinst << ninstance << endl;
      }
      else {
        cerr << "Unable to make output file for BNN." << endl;
        return prepareExit(classifiers,cToClean,bootstraps,3);
      }
      if( !moveNewOntoOld(prefix,"BNN_instances") )
        return prepareExit(classifiers,cToClean,bootstraps,3);

      for( int instance=0;instance<ninstance;instance++ ) {
        string name = "BNN_";
        char s [200];
        sprintf(s,"%i",instance);
        name += s;
        int abMode = 0;
        double epsilon = 0;
        string structure = "I:H:H:O";
        unsigned adaCycles(0), nnCycles(0), initPoints(0);
        double eta(0.1), initEta(0.1);
        bool repeat = false;
        ifstream input;
        string line;
        if( findCache(prefix,name.c_str(),input) ) {
          repeat = true;
          if( !getline(input,line,'\n') ) {
            cerr << "Cannot read top line from BNN cache." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }
          istringstream str(line);
          str >> abMode >> epsilon >> adaCycles
              >> structure >> nnCycles >> eta >> initPoints >> initEta;
        }
        cout << "Input AdaBoost mode (Discrete=1 Real=2 Epsilon=3), "
             << "epsilon (only has effect for Epsilon and Real boost), "
             << "number of neural nets, "
             << "neural net structure, number of NN training cycles, "
             << "learning rate, "
             << "number of points for NN initialization (0 if use all), "
             << "and learning rate for NN initialization "
             << "for BNN instance " << instance
             << " [" << abMode << " " << epsilon << " "
             << adaCycles << " "
             << structure.c_str() << " " << nnCycles << " "
             << eta << " " << initPoints << " " << initEta << "] ";
        getline(cin,line,'\n');
        if( !line.empty() ) {
          repeat = false;
          istringstream str(line);
          str >> abMode >> epsilon >> adaCycles
              >> structure >> nnCycles >> eta >> initPoints >> initEta;
        }
        if( repeat ) {
          if( resetValidation(name.c_str(),validated,valFilter.get()) ) {
            if( readCache(input,validated.find(name)->second)
                != valFilter->size() ) {
              cerr << "Cannot read cached BNN values." << endl;
              return prepareExit(classifiers,cToClean,bootstraps,3);
            }
          }
        }
        else {
          ofstream output;
          if( makeNewCache(prefix,name.c_str(),output) ) {
            output << abMode << " " << epsilon << " " << adaCycles
                   << " " << structure << " " << nnCycles << " " << eta
                   << " " << initPoints << " " << initEta << endl;
          }
          else {
            cerr << "Unable to make output file for BNN." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }

          SprTrainedAdaBoost::AdaBoostMode mode = SprTrainedAdaBoost::Discrete;
          switch( abMode )
            {
            case 1 :
              mode = SprTrainedAdaBoost::Discrete;
              break;
            case 2 :
              mode = SprTrainedAdaBoost::Real;
              break;
            case 3 :
              mode = SprTrainedAdaBoost::Epsilon;
              break;
            default :
              cerr << "Unknown mode for AdaBoost." << endl;
              return prepareExit(classifiers,cToClean,bootstraps,3);
            }

          SprStdBackprop* stdnn =
            new SprStdBackprop(filter.get(),structure.c_str(),nnCycles,eta);
          cToClean.push_back(stdnn);
          if( !stdnn->init(initEta,initPoints) ) {
            cerr << "Unable to initialize neural net." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }

          bool useStandardAB = true;
          bool bagInput = false;
          SprAdaBoost* ab = new SprAdaBoost(filter.get(),adaCycles,
                                            useStandardAB,mode,bagInput);
          ab->setEpsilon(epsilon);
          classifiers.insert(pair<const string,SprAbsClassifier*>(name,ab));
          if( !ab->addTrainable(stdnn) ) {
            cerr << "Unable to add a neural net to AdaBoost." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }
        }

        ostringstream ost;
        string sMode;
        switch( abMode )
          {
          case 1 :
            sMode = "Discrete";
            break;
          case 2 :
            sMode = "Real";
            break;
          case 3 :
            sMode = "Epsilon";
            break;
          default :
            cerr << "Unknown mode for AdaBoost." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }
        ost << name.c_str()
            << " = Boosted Neural Nets with:"
            << " Mode=" << sMode.c_str()
            << " Epsilon=" << epsilon
            << " nNets=" << adaCycles
            << " structure=" << structure.c_str()
            << " nCyclesPerNet=" << nnCycles
            << " LearnRate=" << eta
            << " nInitPoints=" << initPoints
            << " InitLearnRate=" << initEta;
        message.insert(pair<const string,string>(name,ost.str()));
      }
    }

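    // AdaBoost over binary splits, one split per input dimension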
    if( answerYN("Use boosted splits?") ) {
      unsigned ncycles = 0;
      bool repeat = false;
      ifstream input;
      string line;
      if( findCache(prefix,"BDS",input) ) {
        repeat = true;
        if( !getline(input,line,'\n') ) {
          cerr << "Cannot read top line from BDS cache." << endl;
          return prepareExit(classifiers,cToClean,bootstraps,3);
        }
        istringstream str(line);
        str >> ncycles;
      }
      cout << "Input number of AdaBoost splits per dimension ["
           << ncycles << "] ";
      getline(cin,line,'\n');
      if( !line.empty() ) {
        repeat = false;
        istringstream str(line);
        str >> ncycles;
      }
      if( repeat ) {
        if( resetValidation("BDS",validated,valFilter.get()) ) {
          if( readCache(input,validated.find("BDS")->second)
              != valFilter->size() ) {
            cerr << "Cannot read cached BDS values." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }
        }
      }
      else {
        ofstream output;
        if( makeNewCache(prefix,"BDS",output) ) {
          output << ncycles << endl;
        }
        else {
          cerr << "Unable to make output file for BDS." << endl;
          return prepareExit(classifiers,cToClean,bootstraps,3);
        }

        bool useStandardAB = true;
        ncycles *= filter->dim();
        SprAdaBoost* ab = new SprAdaBoost(filter.get(),
                                          ncycles,
                                          useStandardAB,
                                          SprTrainedAdaBoost::Discrete);
        classifiers.insert(pair<const string,SprAbsClassifier*>("BDS",ab));
        for( int i=0;i<filter->dim();i++ ) {
          SprBinarySplit* s = new SprBinarySplit(filter.get(),&idfrac,i);
          cToClean.push_back(s);
          if( !ab->addTrainable(s,SprUtils::lowerBound(0.5)) ) {
            cerr << "Unable to add binary split to AdaBoost." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }
        }
      }
      ostringstream ost;
      ost << "BDS = Boosted Decision Splits with: nSplits="
          << ncycles;
      message.insert(pair<const string,string>("BDS",ost.str()));
    }

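    // AdaBoost over decision trees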
    if( answerYN("Use boosted trees?") ) {

      unsigned ninstance = 0;
      ifstream iinst;
      if( findCache(prefix,"BDT_instances",iinst) ) {
        string line;
        if( !getline(iinst,line,'\n') ) {
          cerr << "Cannot read top line from BDT cache." << endl;
          return prepareExit(classifiers,cToClean,bootstraps,3);
        }
        istringstream str(line);
        str >> ninstance;
      }
      ninstance =
        answerHowMany(ninstance,
                      "How many instances of the classifier would you like to run?");
      ofstream oinst;
      if( makeNewCache(prefix,"BDT_instances",oinst) ) {
        oinst << ninstance << endl;
      }
      else {
        cerr << "Unable to make output file for BDT." << endl;
        return prepareExit(classifiers,cToClean,bootstraps,3);
      }
      if( !moveNewOntoOld(prefix,"BDT_instances") )
        return prepareExit(classifiers,cToClean,bootstraps,3);

      for( int instance=0;instance<ninstance;instance++ ) {
        string name = "BDT_";
        char s [200];
        sprintf(s,"%i",instance);
        name += s;
        int abMode = 0;
        double epsilon = 0;
        int iBagInput = 0;
        unsigned ncycles(0), nleaf(0);
        bool repeat = false;
        ifstream input;
        string line;
        if( findCache(prefix,name.c_str(),input) ) {
          repeat = true;
          if( !getline(input,line,'\n') ) {
            cerr << "Cannot read top line from BDT cache." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }
          istringstream str(line);
          str >> abMode >> epsilon >> iBagInput >> ncycles >> nleaf;
        }
        cout << "Input AdaBoost mode (Discrete=1 Real=2 Epsilon=3), "
             << "epsilon (only has effect for Epsilon and Real boost), "
             << "bag training events flag (No=0 Yes=1), "
             << "number of trees, and minimal tree leaf size "
             << "for BDT instance " << instance
             << " [" << abMode << " " << epsilon << " " << iBagInput << " "
             << ncycles << " " << nleaf << "] ";
        getline(cin,line,'\n');
        if( !line.empty() ) {
          repeat = false;
          istringstream str(line);
          str >> abMode >> epsilon >> iBagInput >> ncycles >> nleaf;
        }
        if( repeat ) {
          if( resetValidation(name.c_str(),validated,valFilter.get()) ) {
            if( readCache(input,validated.find(name)->second)
                != valFilter->size() ) {
              cerr << "Cannot read cached BDT values." << endl;
              return prepareExit(classifiers,cToClean,bootstraps,3);
            }
          }
        }
        else {
          ofstream output;
          if( makeNewCache(prefix,name.c_str(),output) ) {
            output << abMode << " " << epsilon << " " << iBagInput << " "
                   << ncycles << " " << nleaf << endl;
          }
          else {
            cerr << "Unable to make output file for BDT." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }

          SprTrainedAdaBoost::AdaBoostMode mode = SprTrainedAdaBoost::Discrete;
          switch( abMode )
            {
            case 1 :
              mode = SprTrainedAdaBoost::Discrete;
              break;
            case 2 :
              mode = SprTrainedAdaBoost::Real;
              break;
            case 3 :
              mode = SprTrainedAdaBoost::Epsilon;
              break;
            default :
              cerr << "Unknown mode for AdaBoost." << endl;
              return prepareExit(classifiers,cToClean,bootstraps,3);
            }

          bool discrete = (mode!=SprTrainedAdaBoost::Real);
          SprTopdownTree* tree = new SprTopdownTree(filter.get(),&gini,
                                                    nleaf,discrete,0);
          if( mode == SprTrainedAdaBoost::Real ) tree->forceMixedNodes();
          tree->useFastSort();
          cToClean.push_back(tree);

          bool useStandardAB = true;
          bool bagInput = (iBagInput==1);
          SprAdaBoost* ab = new SprAdaBoost(filter.get(),ncycles,
                                            useStandardAB,mode,bagInput);
          ab->setEpsilon(epsilon);
          classifiers.insert(pair<const string,SprAbsClassifier*>(name,ab));
          if( !ab->addTrainable(tree,SprUtils::lowerBound(0.5)) ) {
            cerr << "Unable to add a decision tree to AdaBoost." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }
        }

        ostringstream ost;
        string sMode;
        switch( abMode )
          {
          case 1 :
            sMode = "Discrete";
            break;
          case 2 :
            sMode = "Real";
            break;
          case 3 :
            sMode = "Epsilon";
            break;
          default :
            cerr << "Unknown mode for AdaBoost." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }
        string sBag;
        if( iBagInput == 1 )
          sBag = "Yes";
        else
          sBag = "No";
        ost << name.c_str()
            << " = Boosted Decision Trees with:"
            << " Mode=" << sMode.c_str()
            << " Epsilon=" << epsilon
            << " BagTrainingEvents=" << sBag.c_str()
            << " nTrees=" << ncycles
            << " nEventsPerLeaf=" << nleaf;
        message.insert(pair<const string,string>(name,ost.str()));
      }
    }

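    // random forest: bagged trees with feature sampling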
    if( answerYN("Use random forest?") ) {

      unsigned ninstance = 0;
      ifstream iinst;
      if( findCache(prefix,"RF_instances",iinst) ) {
        string line;
        if( !getline(iinst,line,'\n') ) {
          cerr << "Cannot read top line from RF cache." << endl;
          return prepareExit(classifiers,cToClean,bootstraps,3);
        }
        istringstream str(line);
        str >> ninstance;
      }
      ninstance =
        answerHowMany(ninstance,
                      "How many instances of the classifier would you like to run?");
      ofstream oinst;
      if( makeNewCache(prefix,"RF_instances",oinst) ) {
        oinst << ninstance << endl;
      }
      else {
        cerr << "Unable to make output file for RF." << endl;
        return prepareExit(classifiers,cToClean,bootstraps,3);
      }
      if( !moveNewOntoOld(prefix,"RF_instances") )
        return prepareExit(classifiers,cToClean,bootstraps,3);

      for( int instance=0;instance<ninstance;instance++ ) {
        string name = "RF_";
        char s [200];
        sprintf(s,"%i",instance);
        name += s;
        unsigned ncycles(0), nleaf(0), nfeatures(filter->dim());
        bool repeat = false;
        ifstream input;
        string line;
        if( findCache(prefix,name.c_str(),input) ) {
          repeat = true;
          if( !getline(input,line,'\n') ) {
            cerr << "Cannot read top line from RF cache." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }
          istringstream str(line);
          str >> ncycles >> nleaf >> nfeatures;
        }
        cout << "Input number of trees, minimal tree leaf size, "
             << "and number of input features for sampling "
             << "for RF instance " << instance << " ["
             << ncycles << " " << nleaf << " " << nfeatures << "] ";
        getline(cin,line,'\n');
        if( !line.empty() ) {
          repeat = false;
          istringstream str(line);
          str >> ncycles >> nleaf >> nfeatures;
        }
        if( repeat ) {
          if( resetValidation(name.c_str(),validated,valFilter.get()) ) {
            if( readCache(input,validated.find(name)->second)
                != valFilter->size() ) {
              cerr << "Cannot read cached RF values." << endl;
              return prepareExit(classifiers,cToClean,bootstraps,3);
            }
          }
        }
        else {
          ofstream output;
          if( makeNewCache(prefix,name.c_str(),output) ) {
            output << ncycles << " " << nleaf << " " << nfeatures << endl;
          }
          else {
            cerr << "Unable to make output file for RF." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }

          SprIntegerBootstrap* bootstrap
            = new SprIntegerBootstrap(filter->dim(),nfeatures);
          bootstraps.push_back(bootstrap);

          bool discrete = false;
          SprTopdownTree* tree = new SprTopdownTree(filter.get(),&gini,
                                                    nleaf,discrete,bootstrap);
          tree->useFastSort();
          cToClean.push_back(tree);

          SprBagger* bagger = new SprBagger(filter.get(),ncycles,discrete);
          classifiers.insert(pair<const string,SprAbsClassifier*>(name,bagger));
          if( !bagger->addTrainable(tree) ) {
            cerr << "Unable to add a decision tree to Random Forest." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }
        }
        ostringstream ost;
        ost << name.c_str()
            << " = Random Forest with: nTrees=" << ncycles
            << " nEventsPerLeaf=" << nleaf
            << " nFeaturesToSample=" << nfeatures;
        message.insert(pair<const string,string>(name,ost.str()));
      }
    }

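    // arc-x4: reweighting ensemble of trees with feature sampling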
    if( answerYN("Use arc-x4?") ) {

      unsigned ninstance = 0;
      ifstream iinst;
      if( findCache(prefix,"AX4instances_",iinst) ) {
        string line;
        if( !getline(iinst,line,'\n') ) {
          cerr << "Cannot read top line from AX4 cache." << endl;
          return prepareExit(classifiers,cToClean,bootstraps,3);
        }
        istringstream str(line);
        str >> ninstance;
      }
      ninstance =
        answerHowMany(ninstance,
                      "How many instances of the classifier would you like to run?");
      ofstream oinst;
      if( makeNewCache(prefix,"AX4instances_",oinst) ) {
        oinst << ninstance << endl;
      }
      else {
        cerr << "Unable to make output file for AX4." << endl;
        return prepareExit(classifiers,cToClean,bootstraps,3);
      }
      if( !moveNewOntoOld(prefix,"AX4instances_") )
        return prepareExit(classifiers,cToClean,bootstraps,3);

      for( int instance=0;instance<ninstance;instance++ ) {
        string name = "AX4_";
        char s [200];
        sprintf(s,"%i",instance);
        name += s;
        unsigned ncycles(0), nleaf(0), nfeatures(filter->dim());
        bool repeat = false;
        ifstream input;
        string line;
        if( findCache(prefix,name.c_str(),input) ) {
          repeat = true;
          if( !getline(input,line,'\n') ) {
            cerr << "Cannot read top line from AX4 cache." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }
          istringstream str(line);
          str >> ncycles >> nleaf >> nfeatures;
        }
        cout << "Input number of trees, minimal tree leaf size, "
             << "and number of input features for sampling "
             << "for AX4 instance " << instance << " ["
             << ncycles << " " << nleaf << " " << nfeatures << "] ";
        getline(cin,line,'\n');
        if( !line.empty() ) {
          repeat = false;
          istringstream str(line);
          str >> ncycles >> nleaf >> nfeatures;
        }
        if( repeat ) {
          if( resetValidation(name.c_str(),validated,valFilter.get()) ) {
            if( readCache(input,validated.find(name)->second)
                != valFilter->size() ) {
              cerr << "Cannot read cached AX4 values." << endl;
              return prepareExit(classifiers,cToClean,bootstraps,3);
            }
          }
        }
        else {
          ofstream output;
          if( makeNewCache(prefix,name.c_str(),output) ) {
            output << ncycles << " " << nleaf << " " << nfeatures << endl;
          }
          else {
            cerr << "Unable to make output file for AX4." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }

          SprIntegerBootstrap* bootstrap
            = new SprIntegerBootstrap(filter->dim(),nfeatures);
          bootstraps.push_back(bootstrap);

          bool discrete = false;
          SprTopdownTree* tree = new SprTopdownTree(filter.get(),&gini,
                                                    nleaf,discrete,bootstrap);
          tree->useFastSort();
          cToClean.push_back(tree);

          SprArcE4* arcer = new SprArcE4(filter.get(),ncycles,discrete);
          classifiers.insert(pair<const string,SprAbsClassifier*>(name,arcer));
          if( !arcer->addTrainable(tree) ) {
            cerr << "Unable to add a decision tree to arc-x4." << endl;
            return prepareExit(classifiers,cToClean,bootstraps,3);
          }
        }
        ostringstream ost;
        ost << name.c_str()
            << " = arc-x4 with: nTrees=" << ncycles
            << " nEventsPerLeaf=" << nleaf
            << " nFeaturesToSample=" << nfeatures;
        message.insert(pair<const string,string>(name,ost.str()));
      }
    }

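    //
    // train all selected classifiers and cache their responses
    // on the validation set
    //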
    for( map<string,SprAbsClassifier*>::const_iterator
           iter=classifiers.begin();iter!=classifiers.end();iter++ ) {
      cout << "Training classifier " << iter->first.c_str() << endl;

      if( !iter->second->train(verbose) ) {
        cerr << "Classifier " << iter->first.c_str() << " cannot be trained. "
             << "Skipping..." << endl;
        continue;
      }

      if( resetValidation(iter->first.c_str(),validated,valFilter.get()) ) {
        cout << "Saving responses for validation data for classifier "
             << iter->first.c_str() << endl;

        SprAbsTrainedClassifier* t = iter->second->makeTrained();
        if( t == 0 ) {
          cerr << "Unable to make trained classifier for "
               << iter->first.c_str() << endl;
          continue;
        }

        map<string,vector<SIAResponse> >::iterator found =
          validated.find(iter->first);
        assert( found != validated.end() );
        for( int i=0;i<valFilter->size();i++ ) {
          const SprPoint* p = (*(valFilter.get()))[i];
          found->second.push_back(SIAResponse(int(inputClasses[1]==p->class_),
                                              valFilter->w(i),
                                              t->response(p)));
        }

        delete t;

        if( !storeNewCache(prefix,iter->first.c_str(),found->second) ) {
          cerr << "Unable to save new cache for classifier "
               << iter->first.c_str() << endl;
          return prepareExit(classifiers,cToClean,bootstraps,4);
        }
      }
      else {
        cerr << "Vector of responses for classifier " << iter->first.c_str()
             << " already exists. Skipping..." << endl;
      }
    }

    for( map<string,SprAbsClassifier*>::const_iterator
           iter=classifiers.begin();iter!=classifiers.end();iter++ ) {
      if( !moveNewOntoOld(prefix,iter->first.c_str()) )
        return prepareExit(classifiers,cToClean,bootstraps,5);
    }

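    //
    // ask for the signal-efficiency points at which background
    // contamination should be estimated
    //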
    vector<double> effS;
    ifstream effInput;
    if( findCache(prefix,"eff",effInput) ) {
      string line;
      if( getline(effInput,line) ) {
        istringstream str(line);
        double dummy = 0;
        while( str >> dummy ) effS.push_back(dummy);
      }
    }
    else {
      effS.push_back(0.1);
      effS.push_back(0.2);
      effS.push_back(0.3);
      effS.push_back(0.4);
      effS.push_back(0.5);
      effS.push_back(0.6);
      effS.push_back(0.7);
      effS.push_back(0.8);
      effS.push_back(0.9);
    }
    effInput.close();
    cout << "Input signal efficiency values for which background "
         << "will be estimated [ ";
    for( int i=0;i<effS.size();i++ )
      cout << effS[i] << " ";
    cout << "] ";
    string line;
    getline(cin,line,'\n');
    if( !line.empty() ) {
      effS.clear();
      istringstream str(line);
      double dummy = 0;
      while( str >> dummy ) effS.push_back(dummy);
    }
    if( effS.empty() ) {
      cerr << "No signal efficiency values were entered. "
           << "Please supply at least one value." << endl;
      return prepareExit(classifiers,cToClean,bootstraps,7);
    }
    stable_sort(effS.begin(),effS.end());
    if( !storeEffCache(prefix,"eff",effS) ) {
      cerr << "Unable to store efficiency values." << endl;
      return prepareExit(classifiers,cToClean,bootstraps,6);
    }

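    //
    // for each classifier, find response cuts that give the requested signal
    // efficiencies and compute the corresponding background fractions
    //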
    map<string,vector<double> > effB;
    const double wsig = valFilter->weightInClass(inputClasses[1]);
    const double wbgr = valFilter->weightInClass(inputClasses[0]);
    assert( wsig > 0 );
    assert( wbgr > 0 );

    for( map<string,vector<SIAResponse> >::const_iterator
           iter=validated.begin();iter!=validated.end();iter++ ) {

      vector<pair<double,double> > signal;
      vector<pair<double,double> > bgrnd;

      for( int i=0;i<iter->second.size();i++ ) {
        if( iter->second[i].cls == 0 ) {
          bgrnd.push_back(pair<double,double>(iter->second[i].response,
                                              iter->second[i].weight));
        }
        else if( iter->second[i].cls == 1 ) {
          signal.push_back(pair<double,double>(iter->second[i].response,
                                               iter->second[i].weight));
        }
      }

      stable_sort(bgrnd.begin(),bgrnd.end(),SIACmpPairDDFirst());
      stable_sort(signal.begin(),signal.end(),SIACmpPairDDFirst());

      vector<double> cuts(effS.size());
      double w = 0;
      int divider = 0;
      int i = 0;
      while( i<signal.size() && divider<effS.size() ) {
        w += signal[i].second;
        if( (w/wsig) > effS[divider] ) {
          if( i == 0 )
            cuts[divider] = signal[0].first;
          else
            cuts[divider] = 0.5 * (signal[i].first + signal[i-1].first);
          divider++;
        }
        i++;
      }

      pair<map<string,vector<double> >::iterator,bool> inserted =
        effB.insert(pair<const string,vector<double> >(iter->first,
                                          vector<double>(effS.size(),1)));
      assert( inserted.second );
      w = 0;
      divider = 0;
      i = 0;
      while( i<bgrnd.size() && divider<effS.size() ) {
        if( bgrnd[i].first < cuts[divider] )
          inserted.first->second[divider++] = w/wbgr;
        w += bgrnd[i].second;
        i++;
      }
    }

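    //
    // print the classifier descriptions and the table of surviving
    // background fractions
    //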
    cout << "===========================================" << endl;
    for( map<string,string>::const_iterator
           iter=message.begin();iter!=message.end();iter++ )
      cout << iter->second.c_str() << endl;
    cout << "===========================================" << endl;
    cout << "Table of surviving background fractions"
         << " (* shows minimal value in a row)" << endl;
    cout << "===========================================" << endl;
    char s[200];
    sprintf(s,"Signal Eff \\ Classifiers |");
    cout << s;
    string temp = "--------------------------";
    for( map<string,vector<double> >::const_iterator
           iter=effB.begin();iter!=effB.end();iter++ ) {
      sprintf(s," %8s |",iter->first.c_str());
      cout << s;
      temp += "-----------";
    }
    cout << endl;
    cout << temp.c_str() << endl;
    for( int i=0;i<effS.size();i++ ) {
      sprintf(s,"          %6.4f         |",effS[i]);
      cout << s;
      vector<string> names;
      vector<double> values;
      for( map<string,vector<double> >::const_iterator
             iter=effB.begin();iter!=effB.end();iter++ ) {
        names.push_back(iter->first);
        values.push_back(iter->second[i]);
      }
      int foundMin = min_element(values.begin(),values.end()) - values.begin();
      for( int j=0;j<names.size();j++ ) {
        if( j == foundMin )
          sprintf(s," *%7.5f |",values[j]);
        else
          sprintf(s,"  %7.5f |",values[j]);
        cout << s;
      }
      cout << endl;
    }
    cout << "===========================================" << endl;

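    //
    // optionally save per-event classifier responses to an ntuple
    //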
    if( answerYN("Save classifier output to an ntuple?") ) {

      string ntupleFile
        = answerName("Give name of the output ntuple file -->");
      auto_ptr<SprAbsWriter>
        writer(SprRWFactory::makeWriter(writeMode,"ClassifierComparison"));
      if( !writer->init(ntupleFile.c_str()) ) {
        cerr << "Unable to open ntuple file " << ntupleFile.c_str() << endl;
        return prepareExit(classifiers,cToClean,bootstraps,7);
      }

      vector<string> axes;
      valFilter->vars(axes);
      if( !writer->setAxes(axes) ) {
        cerr << "Unable to set axes for the ntuple writer." << endl;
        return prepareExit(classifiers,cToClean,bootstraps,8);
      }
      for( map<string,vector<SIAResponse> >::const_iterator
             iter=validated.begin();iter!=validated.end();iter++ ) {
        if( !writer->addAxis(iter->first.c_str()) ) {
          cerr << "Unable to add a classifier axis for "
               << iter->first.c_str() << endl;
          return prepareExit(classifiers,cToClean,bootstraps,8);
        }
      }

      for( int i=0;i<valFilter->size();i++ ) {
        const SprPoint* p = (*(valFilter.get()))[i];
        double w = valFilter->w(i);
        vector<double> f;
        for( map<string,vector<SIAResponse> >::const_iterator
               iter=validated.begin();iter!=validated.end();iter++ ) {
          f.push_back((iter->second)[i].response);
        }
        if( !writer->write(p->class_,p->index_,w,p->x_,f) ) {
          cerr << "Unable to write event " << p->index_ << endl;
          return prepareExit(classifiers,cToClean,bootstraps,8);
        }
      }
      cout << "Writing to ntuple done." << endl;

      writer->close();
    }

    prepareExit(classifiers,cToClean,bootstraps,0);

    if( !answerYN("Continue?") ) break;
  }

  return 0;
}