SprInteractiveAnalysisApp.cc

00001 //$Id: SprInteractiveAnalysisApp.cc,v 1.4 2007/11/12 06:19:11 narsky Exp $
00002 /*
00003   This executable is intended for interactive analysis of small samples.
00004   The user can interactively add and remove classifiers with various
00005   configurations. The prompts should be self-explanatory.
00006 
00007   Warning: for now the code does not automatically delete the cache of
00008   failed classifiers. For example, if one BDT instance fails, you will
00009   need to re-create all BDT instances. This will be fixed in a future
00010   release.
00011 */
00012 
00013 #include "PhysicsTools/StatPatternRecognition/interface/SprExperiment.hh"
00014 #include "PhysicsTools/StatPatternRecognition/interface/SprAbsFilter.hh"
00015 #include "PhysicsTools/StatPatternRecognition/interface/SprAbsClassifier.hh"
00016 #include "PhysicsTools/StatPatternRecognition/interface/SprAbsTrainedClassifier.hh"
00017 #include "PhysicsTools/StatPatternRecognition/interface/SprEmptyFilter.hh"
00018 #include "PhysicsTools/StatPatternRecognition/interface/SprUtils.hh"
00019 #include "PhysicsTools/StatPatternRecognition/interface/SprAbsReader.hh"
00020 #include "PhysicsTools/StatPatternRecognition/interface/SprAbsWriter.hh"
00021 #include "PhysicsTools/StatPatternRecognition/interface/SprClass.hh"
00022 #include "PhysicsTools/StatPatternRecognition/interface/SprBinarySplit.hh"
00023 #include "PhysicsTools/StatPatternRecognition/interface/SprAdaBoost.hh"
00024 #include "PhysicsTools/StatPatternRecognition/interface/SprTrainedAdaBoost.hh"
00025 #include "PhysicsTools/StatPatternRecognition/interface/SprFisher.hh"
00026 #include "PhysicsTools/StatPatternRecognition/interface/SprTrainedFisher.hh"
00027 #include "PhysicsTools/StatPatternRecognition/interface/SprLogitR.hh"
00028 #include "PhysicsTools/StatPatternRecognition/interface/SprTrainedLogitR.hh"
00029 #include "PhysicsTools/StatPatternRecognition/interface/SprStdBackprop.hh"
00030 #include "PhysicsTools/StatPatternRecognition/interface/SprTrainedStdBackprop.hh"
00031 #include "PhysicsTools/StatPatternRecognition/interface/SprBagger.hh"
00032 #include "PhysicsTools/StatPatternRecognition/interface/SprArcE4.hh"
00033 #include "PhysicsTools/StatPatternRecognition/interface/SprTrainedBagger.hh"
00034 #include "PhysicsTools/StatPatternRecognition/interface/SprTopdownTree.hh"
00035 #include "PhysicsTools/StatPatternRecognition/interface/SprTrainedTopdownTree.hh"
00036 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassGiniIndex.hh"
00037 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassIDFraction.hh"
00038 #include "PhysicsTools/StatPatternRecognition/interface/SprIntegerBootstrap.hh"
00039 #include "PhysicsTools/StatPatternRecognition/interface/SprStringParser.hh"
00040 #include "PhysicsTools/StatPatternRecognition/interface/SprDataFeeder.hh"
00041 #include "PhysicsTools/StatPatternRecognition/interface/SprRWFactory.hh"
00042 #include "PhysicsTools/StatPatternRecognition/interface/SprAbsVarTransformer.hh"
00043 #include "PhysicsTools/StatPatternRecognition/interface/SprVarTransformerReader.hh"
00044 #include "PhysicsTools/StatPatternRecognition/interface/SprTransformerFilter.hh"
00045 
00046 #include <unistd.h>
00047 #include <stdio.h>
00048 #include <errno.h>
00049 #include <sys/types.h>
00050 #include <sys/stat.h>
00051 
00052 #include <cassert>
00053 #include <iostream>
00054 #include <fstream>
00055 #include <sstream>
00056 #include <map>
00057 #include <vector>
00058 #include <utility>
00059 #include <set>
00060 #include <string>
00061 #include <memory>
00062 #include <iomanip>
00063 #include <algorithm>
00064 #include <functional>
00065 
00066 using namespace std;
00067 
00068 
00069 struct SIAResponse {
00070   int cls;
00071   double weight;
00072   double response;
00073 
00074   ~SIAResponse() {}
00075 
00076   SIAResponse(int c, double w, double r) 
00077     : cls(c), weight(w), response(r) {}
00078 
00079   SIAResponse(const SIAResponse& other)
00080     : cls(other.cls), weight(other.weight), response(other.response) {}
00081 };
00082 
00083 // sorts by greater, not less!!!
00084 struct SIACmpPairDDFirst
00085   : public binary_function<pair<double,double>,pair<double,double>,bool> {
00086   bool operator()(const pair<double,double>& l, const pair<double,double>& r)
00087     const {
00088     return (l.first > r.first);
00089   }
00090 };
00091 
00092 
00093 void help(const char* prog) 
00094 {
00095   cout << "Usage:  " << prog << " training_data_file " << endl;
00096   cout << "\t Options: " << endl;
00097   cout << "\t-h --- help                                        " << endl;
00098   cout << "\t-p path to temporary cache files (default=\"./\")  " << endl;
00099   cout << "\t-o output Tuple file                               " << endl;
00100   cout << "\t-a input ascii file mode (see SprSimpleReader.hh)  " << endl;
00101   cout << "\t-A save output data in ascii instead of Root       " << endl;
00102   cout << "\t-y list of input classes (see SprAbsFilter.hh)     " << endl;
00103   cout << "\t-Q apply variable transformation saved in file     " << endl;
00104   cout << "\t-K keep this fraction in training set and          " << endl;
00105   cout << "\t\t put the rest into validation set                " << endl;
00106   cout << "\t-D randomize training set split-up                 " << endl;
00107   cout << "\t-t read validation/test data from a file           " << endl;
00108   cout << "\t\t (must be in the same format as input data!)   " << endl;
00109   cout << "\t-v verbose level (0=silent default,1,2)            " << endl;
00110   cout << "\t-V include only these input variables              " << endl;
00111   cout << "\t-z exclude input variables from the list           " << endl;
00112   cout << "\t\t Variables must be listed in quotes and separated by commas." 
00113        << endl;
00114 }
00115 
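// Example invocation (illustrative only; the file names, cache directory and
// split fraction are hypothetical, not taken from this file). It reads the
// training sample in ascii mode 1, keeps 70% of it for training with a
// randomized split, keeps cache files under /tmp/sia/, and saves the
// classifier responses to an ascii tuple:
//
//   SprInteractiveAnalysisApp -a 1 -A -K 0.7 -D -p /tmp/sia/ -o out.pat train.pat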
00116 
00117 int prepareExit(const map<string,SprAbsClassifier*>& classifiers,
00118                 const vector<const SprAbsClassifier*>& cToClean,
00119                 const vector<const SprIntegerBootstrap*>& bootstraps,
00120                 int status)
00121 {
00122   for( map<string,SprAbsClassifier*>::const_iterator 
00123          i=classifiers.begin();i!=classifiers.end();i++ )
00124     delete i->second;
00125   for( int i=0;i<cToClean.size();i++ )
00126     delete cToClean[i];
00127   for( int i=0;i<bootstraps.size();i++ )
00128     delete bootstraps[i];
00129   return status;
00130 }
00131 
00132 
00133 bool answerYN(const char* question)
00134 {
00135   cout << question << " y/n [y] ";
00136   string line;
00137   getline(cin,line,'\n');  // read the full line so long answers cannot leave cin in a failed state
00138   if( line.empty() || line[0]=='y' ) return true;
00139   return false;
00140 }
00141 
00142 
00143 unsigned answerHowMany(unsigned defaultN, const char* question)
00144 {
00145   cout << question << " [" << defaultN << "] ";
00146   string line;
00147   getline(cin,line,'\n');
00148   unsigned n = defaultN;
00149   if( !line.empty() )
00150     n = atoi(line.c_str());
00151   return n;
00152 }
00153 
00154 
00155 string answerName(const char* question)
00156 {
00157   cout << question << " ";
00158   string line;
00159   getline(cin,line,'\n');
00160   if( !line.empty() ) {
00161     line.erase( line.find_last_not_of(' ') + 1 );
00162     line.erase( 0, line.find_first_not_of(' ') );
00163   }
00164   return line;
00165 }
00166 
00167 
00168 bool findCache(const string& prefix, const char* cacheName, ifstream& file)
00169 {
00170   string fname = prefix;
00171   fname += cacheName;
00172   file.open(fname.c_str());
00173   if( !file ) return false;
00174   return true;
00175 }
00176 
00177 
00178 bool checkOldCache(const string& prefix)
00179 {
00180   string cmd = "ls -a ";
00181   cmd += prefix;
00182   cmd += "*";
00183   if( system(cmd.c_str()) == 0 )
00184     return true;
00185   return false;
00186 }
00187 
00188 
00189 bool eraseOldCache(const string& prefix)
00190 {
00191   string cmd = "rm -f ";
00192   cmd += prefix;
00193   cmd += "*";
00194   if( system(cmd.c_str()) == 0 )
00195     return true;
00196   return false;
00197 }
00198 
00199 
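// Cache update protocol used throughout this program: new parameters and
// responses are first written to "<prefix><name>.new" (makeNewCache /
// storeNewCache below) and only then renamed over the existing
// "<prefix><name>" cache by moveNewOntoOld(), so the previous cache is
// replaced only after the new one has been written out.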
00200 bool moveNewOntoOld(const string& prefix, const char* name)
00201 {
00202   // get file name
00203   string oldfile = prefix;
00204   oldfile += name;
00205   string newfile = oldfile;
00206   newfile += ".new";
00207   
00208   // move new cache to old cache
00209   struct stat buf;
00210   if( stat(newfile.c_str(),&buf) == 0 ) {
00211     string cmd = "mv -f";
00212     cmd += " "; cmd += newfile;
00213     cmd += " "; cmd += oldfile;
00214     if( system(cmd.c_str()) != 0 ) {
00215       cerr << "Attempt to move new cache " << newfile.c_str() 
00216            << " onto old cache " << oldfile.c_str()
00217            << " terminated with error " << errno << endl;
00218       return false;
00219     }
00220     cout << "Moved " << newfile.c_str() << " =====> " 
00221          << oldfile.c_str() << endl;
00222   }
00223 
00224   // exit
00225   return true;
00226 }
00227 
00228 
00229 bool makeNewCache(const string& prefix, const char* cacheName, ofstream& file)
00230 {
00231   // get file name
00232   string fname = prefix;
00233   fname += cacheName;
00234   fname += ".new";
00235 
00236   // check file existence
00237   struct stat buf;
00238   if( stat(fname.c_str(),&buf) == 0 ) {
00239     cerr << "Warning: file " << fname.c_str() << " will be deleted." << endl;
00240     string cmd = "rm -f ";
00241     cmd += fname.c_str();
00242     if( system(cmd.c_str()) != 0 ) {
00243       cerr << "Attempt to delete file " << fname.c_str() 
00244            << " terminated with error " << errno << endl;
00245       return false;
00246     }
00247   }
00248  
00249   // open a new file
00250   file.open(fname.c_str());
00251   if( !file ) {
00252     cerr << "Unable to open file " << fname.c_str() << endl;
00253     return false;
00254   }
00255 
00256   // exit
00257   return true;
00258 }
00259 
00260 
00261 bool storeNewCache(const string& prefix, const char* cacheName,
00262                    const vector<SIAResponse>& v)
00263 {
00264   // get file name
00265   string fname = prefix;
00266   fname += cacheName;
00267   fname += ".new";
00268 
00269   // open the file
00270   ofstream file(fname.c_str(),ios_base::app);
00271   if( !file ) {
00272     cerr << "Unable to open file " << fname.c_str() << endl;
00273     return false;
00274   }
00275 
00276   // store values
00277   int j = 0;
00278   for( int i=0;i<v.size();i++ ) {
00279     file << v[i].cls << " " << v[i].weight << " " << v[i].response << " #   ";
00280     if( ++j == 10 ) {
00281       j = 0;
00282       file << endl;
00283     }
00284   }
00285 
00286   // exit
00287   return true;
00288 }
00289 
00290 
00291 bool storeEffCache(const string& prefix, const char* cacheName,
00292                    const vector<double>& v)
00293 {
00294   // get file name
00295   string fname = prefix;
00296   fname += cacheName;
00297 
00298   // check file existence
00299   struct stat buf;
00300   if( stat(fname.c_str(),&buf) == 0 ) {
00301     cerr << "Warning: file " << fname.c_str() << " will be deleted." << endl;
00302     string cmd = "rm -f ";
00303     cmd += fname.c_str();
00304     if( system(cmd.c_str()) != 0 ) {
00305       cerr << "Attempt to delete file " << fname.c_str() 
00306            << " terminated with error " << errno << endl;
00307       return false;
00308     }
00309   }
00310  
00311   // open a new file
00312   ofstream file(fname.c_str());
00313   if( !file ) {
00314     cerr << "Unable to open file " << fname.c_str() << endl;
00315     return false;
00316   }
00317 
00318   // store values
00319   for( int i=0;i<v.size();i++ )
00320     file << v[i] << " ";
00321   file << endl;
00322 
00323   // exit
00324   return true;
00325 }
00326 
00327 
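// Cache record format used by storeNewCache() above and readCache() below:
// each record is "class weight response #", with at most 10 records per line;
// readCache() counts the '#' markers on a line to know how many records to
// parse from it.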
00328 unsigned readCache(ifstream& file, vector<SIAResponse>& v)
00329 {
00330   v.clear();
00331   int cls(0); double weight(0), resp(0);
00332   string line;
00333   char dummy;
00334   while( getline(file,line) ) {
00335     istringstream str(line);
00336     unsigned n = 0;
00337     for( int i=0;i<line.size();i++ )
00338       if( line[i]=='#' ) n++;
00339     for( int i=0;i<n;i++ ) {
00340       str >> cls >> weight >> resp >> dummy;
00341       v.push_back(SIAResponse(cls,weight,resp));
00342     }
00343   }
00344   return v.size();
00345 }
00346 
00347 
00348 bool resetValidation(const char* cacheName, 
00349                      map<string,vector<SIAResponse> >& validated, 
00350                      const SprAbsFilter* filter)
00351 {
00352   // find element
00353   map<string,vector<SIAResponse> >::iterator iter = validated.find(cacheName);
00354 
00355   // if it does not exist, create an empty entry
00356   if( iter == validated.end() ) {
00357     pair<map<string,vector<SIAResponse> >::iterator,bool> inserted =
00358       validated.insert(pair<const string,vector<SIAResponse> >(cacheName,
00359                                                       vector<SIAResponse>()));
00360     assert( inserted.second );
00361     return true;
00362   }
00363 
00364   // if it exists with the wrong size
00365   if( filter->size() != iter->second.size() ) {
00366     iter->second.clear();
00367     return true;
00368   }
00369 
00370   // exit
00371   return false;
00372 }
00373 
00374 
00375 int main(int argc, char ** argv)
00376 {
00377   // check command line
00378   if( argc < 2 ) {
00379     help(argv[0]);
00380     return 1;
00381   }
00382 
00383   // init
00384   string tupleFile;
00385   string pathToCache;
00386   int readMode = 0;
00387   SprRWFactory::DataType writeMode = SprRWFactory::Root;
00388   int verbose = 0;
00389   string includeList, excludeList;
00390   string inputClassesString;
00391   string valFile;
00392   bool split = false;
00393   double splitFactor = 0;
00394   bool splitRandomize = false; 
00395   string transformerFile;
00396 
00397   // decode command line
00398   int c;
00399   extern char* optarg;
00400   //  extern int optind;
00401   while( (c = getopt(argc,argv,"hp:o:a:Ay:Q:K:Dt:v:V:z:")) 
00402          != EOF ) {
00403     switch( c )
00404       {
00405       case 'h' :
00406         help(argv[0]);
00407         return 1;
00408       case 'p' :
00409         pathToCache = optarg;
00410         break;
00411       case 'o' :
00412         tupleFile = optarg;
00413         break;
00414       case 'a' :
00415         readMode = (optarg==0 ? 0 : atoi(optarg));
00416         break;
00417       case 'A' :
00418         writeMode = SprRWFactory::Ascii;
00419         break;
00420       case 'y' :
00421         inputClassesString = optarg;
00422         break;
00423       case 'Q' :
00424         transformerFile = optarg;
00425         break;
00426       case 'K' :
00427         split = true;
00428         splitFactor = (optarg==0 ? 0 : atof(optarg));
00429         break;
00430       case 'D' :
00431         splitRandomize = true;
00432         break;
00433       case 't' :
00434         valFile = optarg;
00435         break;
00436       case 'v' :
00437         verbose = (optarg==0 ? 0 : atoi(optarg));
00438         break;
00439       case 'V' :
00440         includeList = optarg;
00441         break;
00442       case 'z' :
00443         excludeList = optarg;
00444         break;
00445       }
00446   }
00447 
00448   // There has to be exactly one argument left after all options: the training file.
00449   string trFile = argv[argc-1];
00450   if( trFile.empty() ) {
00451     cerr << "No training file is specified." << endl;
00452     return 1;
00453   }
00454 
00455   // make reader
00456   SprRWFactory::DataType inputType 
00457     = ( readMode==0 ? SprRWFactory::Root : SprRWFactory::Ascii );
00458   auto_ptr<SprAbsReader> reader(SprRWFactory::makeReader(inputType,readMode));
00459 
00460   // include variables
00461   set<string> includeSet;
00462   if( !includeList.empty() ) {
00463     vector<vector<string> > includeVars;
00464     SprStringParser::parseToStrings(includeList.c_str(),includeVars);
00465     assert( !includeVars.empty() );
00466     for( int i=0;i<includeVars[0].size();i++ ) 
00467       includeSet.insert(includeVars[0][i]);
00468     if( !reader->chooseVars(includeSet) ) {
00469       cerr << "Unable to include variables in training set." << endl;
00470       return 2;
00471     }
00472     else {
00473       cout << "The following variables have been included in optimization: ";
00474       for( set<string>::const_iterator 
00475              i=includeSet.begin();i!=includeSet.end();i++ )
00476         cout << "\"" << *i << "\"" << " ";
00477       cout << endl;
00478     }
00479   }
00480 
00481   // exclude variables
00482   set<string> excludeSet;
00483   if( !excludeList.empty() ) {
00484     vector<vector<string> > excludeVars;
00485     SprStringParser::parseToStrings(excludeList.c_str(),excludeVars);
00486     assert( !excludeVars.empty() );
00487     for( int i=0;i<excludeVars[0].size();i++ ) 
00488       excludeSet.insert(excludeVars[0][i]);
00489     if( !reader->chooseAllBut(excludeSet) ) {
00490       cerr << "Unable to exclude variables from training set." << endl;
00491       return 2;
00492     }
00493     else {
00494       cout << "The following variables have been excluded from optimization: ";
00495       for( set<string>::const_iterator 
00496              i=excludeSet.begin();i!=excludeSet.end();i++ )
00497         cout << "\"" << *i << "\"" << " ";
00498       cout << endl;
00499     }
00500   }
00501 
00502   // read training data from file
00503   auto_ptr<SprAbsFilter> filter(reader->read(trFile.c_str()));
00504   if( filter.get() == 0 ) {
00505     cerr << "Unable to read data from file " << trFile.c_str() << endl;
00506     return 2;
00507   }
00508   vector<string> vars;
00509   filter->vars(vars);
00510   cout << "Read data from file " << trFile.c_str() 
00511        << " for variables";
00512   for( int i=0;i<vars.size();i++ ) 
00513     cout << " \"" << vars[i].c_str() << "\"";
00514   cout << endl;
00515   cout << "Total number of points read: " << filter->size() << endl;
00516 
00517   // filter training data by class
00518   vector<SprClass> inputClasses;
00519   if( !filter->filterByClass(inputClassesString.c_str()) ) {
00520     cerr << "Cannot choose input classes for string " 
00521          << inputClassesString << endl;
00522     return 2;
00523   }
00524   filter->classes(inputClasses);
00525   assert( inputClasses.size() > 1 );
00526   cout << "Training data filtered by class." << endl;
00527   for( int i=0;i<inputClasses.size();i++ ) {
00528     cout << "Points in class " << inputClasses[i] << ":   " 
00529          << filter->ptsInClass(inputClasses[i]) << endl;
00530   }
00531 
00532   // read validation data from file
00533   auto_ptr<SprAbsFilter> valFilter;
00534   if( !split && valFile.empty() ) {
00535     cout << "No test data specified. Will use training data." << endl;
00536     vector<double> weights;
00537     filter->weights(weights);
00538     bool ownData = false;
00539     valFilter.reset(new SprEmptyFilter(filter->data(),weights,ownData));
00540   }
00541   if( split && !valFile.empty() ) {
00542     cerr << "Unable to split training data and use validation data " 
00543          << "from a separate file." << endl;
00544     return 2;
00545   }
00546   if( split ) {
00547     cout << "Splitting training data with factor " << splitFactor << endl;
00548     if( splitRandomize )
00549       cout << "Will use randomized splitting." << endl;
00550     vector<double> weights;
00551     SprData* splitted = filter->split(splitFactor,weights,splitRandomize);
00552     if( splitted == 0 ) {
00553       cerr << "Unable to split training data." << endl;
00554       return 2;
00555     }
00556     bool ownData = true;
00557     valFilter.reset(new SprEmptyFilter(splitted,weights,ownData));
00558     cout << "Training data re-filtered:" << endl;
00559     for( int i=0;i<inputClasses.size();i++ ) {
00560       cout << "Points in class " << inputClasses[i] << ":   " 
00561            << filter->ptsInClass(inputClasses[i]) << endl;
00562     }
00563   }
00564   if( !valFile.empty() ) {
00565     auto_ptr<SprAbsReader> 
00566       valReader(SprRWFactory::makeReader(inputType,readMode));
00567     if( !includeSet.empty() ) {
00568       if( !valReader->chooseVars(includeSet) ) {
00569         cerr << "Unable to include variables in validation set." << endl;
00570         return 2;
00571       }
00572     }
00573     if( !excludeSet.empty() ) {
00574       if( !valReader->chooseAllBut(excludeSet) ) {
00575         cerr << "Unable to exclude variables from validation set." << endl;
00576         return 2;
00577       }
00578     }
00579     valFilter.reset(valReader->read(valFile.c_str()));
00580     if( valFilter.get() == 0 ) {
00581       cerr << "Unable to read data from file " << valFile.c_str() << endl;
00582       return 2;
00583     }
00584     vector<string> valVars;
00585     valFilter->vars(valVars);
00586     cout << "Read validation data from file " << valFile.c_str() 
00587          << " for variables";
00588     for( int i=0;i<valVars.size();i++ ) 
00589       cout << " \"" << valVars[i].c_str() << "\"";
00590     cout << endl;
00591     cout << "Total number of points read: " << valFilter->size() << endl;
00592   }
00593 
00594   // filter validation data by class
00595   if( !valFilter->filterByClass(inputClassesString.c_str()) ) {
00596     cerr << "Cannot choose input classes for string " 
00597          << inputClassesString << endl;
00598     return 2;
00599   }
00600   valFilter->classes(inputClasses);
00601   cout << "Validation data filtered by class." << endl;
00602   for( int i=0;i<inputClasses.size();i++ ) {
00603     cout << "Points in class " << inputClasses[i] << ":   " 
00604          << valFilter->ptsInClass(inputClasses[i]) << endl;
00605   }
00606 
00607   // apply transformation of variables to training and test data
00608   auto_ptr<SprAbsFilter> garbage_train, garbage_valid;
00609   if( !transformerFile.empty() ) {
00610     SprVarTransformerReader transReader;
00611     const SprAbsVarTransformer* t = transReader.read(transformerFile.c_str());
00612     if( t == 0 ) {
00613       cerr << "Unable to read VarTransformer from file "
00614            << transformerFile.c_str() << endl;
00615       return 2;
00616     }
00617     SprTransformerFilter* t_train = new SprTransformerFilter(filter.get());
00618     SprTransformerFilter* t_valid = 0;
00619     if( valFilter.get() != 0 )
00620       t_valid = new SprTransformerFilter(valFilter.get());
00621     bool replaceOriginalData = true;
00622     if( !t_train->transform(t,replaceOriginalData) ) {
00623       cerr << "Unable to apply VarTransformer to training data." << endl;
00624       return 2;
00625     }
00626     if( t_valid!=0 && !t_valid->transform(t,replaceOriginalData) ) {
00627       cerr << "Unable to apply VarTransformer to validation data." << endl;
00628       return 2;
00629     }
00630     cout << "Variable transformation from file "
00631          << transformerFile.c_str() << " has been applied to "
00632          << "training and validation data." << endl;
00633     garbage_train.reset(filter.release());
00634     garbage_valid.reset(valFilter.release());
00635     filter.reset(t_train);
00636     valFilter.reset(t_valid);
00637   }
00638 
00639   // determine path to cache
00640   if( !pathToCache.empty() ) {
00641     pathToCache.erase( pathToCache.find_last_not_of(' ') + 1 );
00642     pathToCache.erase( 0, pathToCache.find_first_not_of(' ') );
00643   }
00644   if( !pathToCache.empty() && *(pathToCache.rbegin())!='/' )
00645     pathToCache += "/";
00646   cout << "Will use directory \"" << pathToCache.c_str() 
00647        << "\" for cache." << endl;
00648   string prefix = pathToCache;
00649   prefix += ".cache_";
00650   
00651   // check for presence of old cache
00652   if( checkOldCache(prefix) ) {
00653     cout << "Warning!!! Old cache files .cache_* have been found." << endl;
00654     if( answerYN("Erase old cache?") ) {
00655       if( !eraseOldCache(prefix) ) return 2;
00656     }
00657   }
00658 
00659   // make criteria
00660   SprTwoClassIDFraction idfrac;
00661   SprTwoClassGiniIndex gini;
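  // idfrac is used as the optimization criterion for the binary splits in the
  // boosted-splits section below; gini is used for all decision trees.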
00662 
00663   //
00664   // put everything in a big loop
00665   //
00666   bool go = true;
00667   while( go ) {
00668 
00669     //
00670     // read classifier configurations
00671     //
00672     map<string,SprAbsClassifier*> classifiers;
00673     map<string,vector<SIAResponse> > validated;
00674     map<string,string> message;
00675     vector<const SprAbsClassifier*> cToClean;
00676     vector<const SprIntegerBootstrap*> bootstraps;
00677     
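    // Every classifier section below follows the same pattern: if a cache with
    // the matching name (and, where applicable, an unchanged configuration) is
    // found, previously validated responses are re-read from it via readCache();
    // otherwise a new classifier instance is created for training.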
00678     //
00679     // LDA
00680     //
00681     if( answerYN("Use Linear Discriminant Analysis?") ) {
00682       classifiers.insert(pair<const string,
00683                          SprAbsClassifier*>("LDA",
00684                                             new SprFisher(filter.get(),1)));
00685       ifstream input;
00686       if( findCache(prefix,"LDA",input) ) {
00687         if( resetValidation("LDA",validated,valFilter.get()) ) {
00688           if( readCache(input,validated.find("LDA")->second) 
00689               != valFilter->size() ) {
00690             cerr << "Cannot read cached LDA values." << endl;
00691             return prepareExit(classifiers,cToClean,bootstraps,3);
00692           }
00693         }
00694       }
00695       ostringstream ost;
00696       ost << "LDA = Linear Discriminant Analysis";
00697       message.insert(pair<const string,string>("LDA",ost.str()));
00698     }
00699     
00700     //
00701     // QDA
00702     //
00703     if( answerYN("Use Quadratic Discriminant Analysis?") ) {
00704       classifiers.insert(pair<const string,
00705                          SprAbsClassifier*>("QDA",
00706                                             new SprFisher(filter.get(),2)));
00707       ifstream input;
00708       if( findCache(prefix,"QDA",input) ) {
00709         if( resetValidation("QDA",validated,valFilter.get()) ) {
00710           if( readCache(input,validated.find("QDA")->second) 
00711               != valFilter->size() ) {
00712             cerr << "Cannot read cached QDA values." << endl;
00713             return prepareExit(classifiers,cToClean,bootstraps,3);
00714           }
00715         }
00716       }
00717       ostringstream ost;
00718       ost << "QDA = Quadratic Discriminant Analysis";
00719       message.insert(pair<const string,string>("QDA",ost.str()));
00720     }
00721 
00722     //
00723     // LogitR
00724     //
00725     if( answerYN("Use Logistic Regression?") ) {
00726       double eps = 0.001;
00727       double updateFactor = 1;
00728       int initToZero = 1;
00729       bool repeat = false;
00730       ifstream input;
00731       string line;
00732       if( findCache(prefix,"LR",input) ) {
00733         repeat = true;
00734         if( !getline(input,line,'\n') ) {
00735           cerr << "Cannot read top line from LR cache." << endl;
00736           return prepareExit(classifiers,cToClean,bootstraps,3);
00737         }
00738         istringstream str(line);
00739         str >> eps >> updateFactor >> initToZero;
00740       }
00741       cout << "Input accuracy, update factor, and init flag "
00742            << "(0 if initialize from 0; 1 if initialize from LDA) ["
00743            << eps << " " << updateFactor << " " << initToZero << "] ";
00744       getline(cin,line,'\n');
00745       if( !line.empty() ) {
00746         repeat = false;
00747         istringstream str(line);
00748         str >> eps >> updateFactor >> initToZero;
00749       }
00750       if( repeat ) {
00751         if( resetValidation("LR",validated,valFilter.get()) ) {
00752           if( readCache(input,validated.find("LR")->second) 
00753               != valFilter->size() ) {
00754             cerr << "Cannot read cached LR values." << endl;
00755             return prepareExit(classifiers,cToClean,bootstraps,3);
00756           }
00757         }
00758       }
00759       else {
00760         // save LR parameters to file
00761         ofstream output;
00762         if( makeNewCache(prefix,"LR",output) ) {
00763           output << eps << " " << updateFactor << " " << initToZero << endl;
00764         }
00765         else {
00766           cerr << "Unable to make output file for LR." << endl;
00767           return prepareExit(classifiers,cToClean,bootstraps,3);
00768         }
00769         
00770         // make LR
00771         double beta0 = 0;
00772         SprVector beta;
00773         if( initToZero == 0 ) {
00774           SprVector dummy(filter->dim());
00775           beta = dummy;
00776           for( int i=0;i<filter->dim();i++ ) beta[i] = 0;
00777         }
00778         SprLogitR* logit = new SprLogitR(filter.get(),beta0,beta,
00779                                          eps,updateFactor);
00780         classifiers.insert(pair<const string,
00781                            SprAbsClassifier*>("LR",logit));
00782       }
00783       ostringstream ost;
00784       ost << "LR = Logistic Regression with:"
00785           << " Eps=" << eps
00786           << " updateFactor=" << updateFactor 
00787           << " initFlag=" << initToZero;
00788       message.insert(pair<const string,string>("LR",ost.str()));
00789     }
00790 
00791     //
00792     // single decision tree
00793     //
00794     if( answerYN("Use single decision tree?") ) {
00795       
00796       // get instances
00797       unsigned ninstance = 0;
00798       ifstream iinst;
00799       if( findCache(prefix,"DT_instances",iinst) ) {
00800         string line;
00801         if( !getline(iinst,line,'\n') ) {
00802           cerr << "Cannot read top line from DT cache." << endl;
00803           return prepareExit(classifiers,cToClean,bootstraps,3);
00804         }
00805         istringstream str(line);
00806         str >> ninstance;
00807       }
00808       ninstance = 
00809         answerHowMany(ninstance,
00810               "How many instances of the classifier would you like to run?");
00811       ofstream oinst;
00812       if( makeNewCache(prefix,"DT_instances",oinst) ) {
00813         oinst << ninstance << endl;
00814       }
00815       else {
00816         cerr << "Unable to make output file for DT." << endl;
00817         return prepareExit(classifiers,cToClean,bootstraps,3);
00818       }
00819       if( !moveNewOntoOld(prefix,"DT_instances") )
00820         return prepareExit(classifiers,cToClean,bootstraps,3);
00821       
00822       // loop over instances
00823       for( int instance=0;instance<ninstance;instance++ ) {
00824         string name = "DT_";
00825         char s [200];
00826         sprintf(s,"%i",instance);
00827         name += s;
00828         unsigned nleaf = 0;
00829         bool repeat = false;
00830         ifstream input;
00831         string line;
00832         if( findCache(prefix,name.c_str(),input) ) {
00833           repeat = true;
00834           if( !getline(input,line,'\n') ) {
00835             cerr << "Cannot read top line from DT cache." << endl;
00836             return prepareExit(classifiers,cToClean,bootstraps,3);
00837           }
00838           istringstream str(line);
00839           str >> nleaf;
00840         }
00841         cout << "Input minimal tree leaf size "
00842              << "for DT instance " << instance << " [" << nleaf << "] ";
00843         getline(cin,line,'\n');
00844         if( !line.empty() ) {
00845           repeat = false;
00846           istringstream str(line);
00847           str >> nleaf;
00848         }
00849         if( repeat ) {
00850           if( resetValidation(name.c_str(),validated,valFilter.get()) ) {
00851             if( readCache(input,validated.find(name)->second) 
00852                 != valFilter->size() ) {
00853               cerr << "Cannot read cached DT values." << endl;
00854               return prepareExit(classifiers,cToClean,bootstraps,3);
00855             }
00856           }
00857         }
00858         else {
00859           // save params to file
00860           ofstream output;
00861           if( makeNewCache(prefix,name.c_str(),output) ) {
00862             output << nleaf << endl;
00863           }
00864           else {
00865             cerr << "Unable to make output file for DT." << endl;
00866             return prepareExit(classifiers,cToClean,bootstraps,3);
00867           }
00868           
00869           // make decision tree
00870           bool discrete = false;
00871           SprTopdownTree* tree = new SprTopdownTree(filter.get(),&gini,
00872                                                     nleaf,discrete);
00873           classifiers.insert(pair<const string,SprAbsClassifier*>(name,
00874                                                                   tree));
00875         }
00876         ostringstream ost;
00877         ost << name.c_str() 
00878             << " = Decision Tree with:"
00879             << " FOM=Gini"
00880             << " nEventsPerLeaf=" << nleaf;
00881         message.insert(pair<const string,string>(name,ost.str()));
00882       }
00883     }
00884 
00885     //
00886     // single neural net
00887     //
00888     if( answerYN("Use neural net?") ) {
00889 
00890       // get instances
00891       unsigned ninstance = 0;
00892       ifstream iinst;
00893       if( findCache(prefix,"STDNN_instances",iinst) ) {
00894         string line;
00895         if( !getline(iinst,line,'\n') ) {
00896           cerr << "Cannot read top line from STDNN cache." << endl;
00897           return prepareExit(classifiers,cToClean,bootstraps,3);
00898         }
00899         istringstream str(line);
00900         str >> ninstance;
00901       }
00902       ninstance = 
00903         answerHowMany(ninstance,
00904              "How many instances of the classifier would you like to run?");
00905       ofstream oinst;
00906       if( makeNewCache(prefix,"STDNN_instances",oinst) ) {
00907         oinst << ninstance << endl;
00908       }
00909       else {
00910         cerr << "Unable to make output file for STDNN." << endl;
00911         return prepareExit(classifiers,cToClean,bootstraps,3);
00912       }
00913       if( !moveNewOntoOld(prefix,"STDNN_instances") )
00914         return prepareExit(classifiers,cToClean,bootstraps,3);
00915 
00916       // loop over instances
00917       for( int instance=0;instance<ninstance;instance++ ) {
00918         string name = "STDNN_";
00919         char s [200];
00920         sprintf(s,"%i",instance);
00921         name += s;
00922         string structure = "I:H:H:O";
00923         unsigned ncycles(0), initPoints(0);
00924         double eta(0.1), initEta(0.1);
00925         bool repeat = false;
00926         ifstream input;
00927         string line;
00928         if( findCache(prefix,name.c_str(),input) ) {
00929           repeat = true;
00930           if( !getline(input,line,'\n') ) {
00931             cerr << "Cannot read top line from STDNN cache." << endl;
00932             return prepareExit(classifiers,cToClean,bootstraps,3);
00933           }
00934           istringstream str(line);
00935           str >> structure >> ncycles >> eta >> initPoints >> initEta;
00936         }
00937         cout << "Input StdBackprop NN structure (see README), "
00938              << "number of training cycles, "
00939              << "learning rate, "
00940              << "number of points for initialization (0 if use all), "
00941              << "and learning rate for initialization "
00942              << "for STDNN instance " << instance 
00943              << " [" << structure.c_str() << " " << ncycles
00944              << " " << eta << " " << initPoints << " " << initEta << "] ";
00945         getline(cin,line,'\n');
00946         if( !line.empty() ) {
00947           repeat = false;
00948           istringstream str(line);
00949           str >> structure >> ncycles >> eta >> initPoints >> initEta;
00950         }
00951         if( repeat ) {
00952           if( resetValidation(name.c_str(),validated,valFilter.get()) ) {
00953             if( readCache(input,validated.find(name)->second) 
00954                 != valFilter->size() ) {
00955               cerr << "Cannot read cached STDNN values." << endl;
00956               return prepareExit(classifiers,cToClean,bootstraps,3);
00957             }
00958           }
00959         }
00960         else {
00961           // save params to file
00962           ofstream output;
00963           if( makeNewCache(prefix,name.c_str(),output) ) {
00964             output << structure.c_str() << " " << ncycles << " " 
00965                    << eta << " " << initPoints << " " << initEta << endl;
00966           }
00967           else {
00968             cerr << "Unable to make output file for STDNN." << endl;
00969             return prepareExit(classifiers,cToClean,bootstraps,3);
00970           }
00971 
00972           // make NN
00973           SprStdBackprop* stdnn = 
00974             new SprStdBackprop(filter.get(),structure.c_str(),ncycles,eta);
00975           stdnn->init(initEta,initPoints);
00976           classifiers.insert(pair<const string,SprAbsClassifier*>(name,stdnn));
00977         }
00978         
00979         // prepare message
00980         ostringstream ost;
00981         ost << name.c_str() 
00982             << " = StdBackprop Neural Net with:"
00983             << " Structure=" << structure.c_str()
00984             << " nCycles=" << ncycles
00985             << " LearnRate=" << eta
00986             << " nInitPoints=" << initPoints
00987             << " InitLearnRate=" << initEta;
00988         message.insert(pair<const string,string>(name,ost.str()));
00989       }
00990     }
00991 
00992     //
00993     // boosted neural nets
00994     //
00995     if( answerYN("Use boosted neural nets?") ) {
00996       
00997       // get instances
00998       unsigned ninstance = 0;
00999       ifstream iinst;
01000       if( findCache(prefix,"BNN_instances",iinst) ) {
01001         string line;
01002         if( !getline(iinst,line,'\n') ) {
01003           cerr << "Cannot read top line from BNN cache." << endl;
01004           return prepareExit(classifiers,cToClean,bootstraps,3);
01005         }
01006         istringstream str(line);
01007         str >> ninstance;
01008       }
01009       ninstance = 
01010         answerHowMany(ninstance,
01011             "How many instances of the classifier would you like to run?");
01012       ofstream oinst;
01013       if( makeNewCache(prefix,"BNN_instances",oinst) ) {
01014         oinst << ninstance << endl;
01015       }
01016       else {
01017         cerr << "Unable to make output file for BNN." << endl;
01018         return prepareExit(classifiers,cToClean,bootstraps,3);
01019       }
01020       if( !moveNewOntoOld(prefix,"BNN_instances") )
01021         return prepareExit(classifiers,cToClean,bootstraps,3);
01022 
01023       // loop over instances
01024       for( int instance=0;instance<ninstance;instance++ ) {
01025         string name = "BNN_";
01026         char s [200];
01027         sprintf(s,"%i",instance);
01028         name += s;
01029         int abMode = 0;
01030         double epsilon = 0;
01031         string structure = "I:H:H:O";
01032         unsigned adaCycles(0), nnCycles(0), initPoints(0);
01033         double eta(0.1), initEta(0.1);
01034         bool repeat = false;
01035         ifstream input;
01036         string line;
01037         if( findCache(prefix,name.c_str(),input) ) {
01038           repeat = true;
01039           if( !getline(input,line,'\n') ) {
01040             cerr << "Cannot read top line from BNN cache." << endl;
01041             return prepareExit(classifiers,cToClean,bootstraps,3);
01042           }
01043           istringstream str(line);
01044           str >> abMode >> epsilon >> adaCycles 
01045               >> structure >> nnCycles >> eta >> initPoints >> initEta;
01046         }
01047         cout << "Input AdaBoost mode (Discrete=1 Real=2 Epsilon=3), "
01048              << "epsilon (only has effect for Epsilon and Real boost), "
01049              << "number of neural nets, "
01050              << "neural net structure, number of NN training cycles, "
01051              << "learning rate, " 
01052              << "number of points for NN initialization (0 if use all), "
01053              << "and learning rate for NN initialization "
01054              << "for BNN instance " << instance 
01055              << " [" << abMode << " " << epsilon << " " 
01056              << adaCycles << " " 
01057              << structure.c_str() << " " << nnCycles << " "
01058              << eta << " " << initPoints << " " << initEta << "] ";
01059         getline(cin,line,'\n');
01060         if( !line.empty() ) {
01061           repeat = false;
01062           istringstream str(line);
01063           str >> abMode >> epsilon >> adaCycles 
01064               >> structure >> nnCycles >> eta >> initPoints >> initEta;
01065         }
01066         if( repeat ) {
01067           if( resetValidation(name.c_str(),validated,valFilter.get()) ) {
01068             if( readCache(input,validated.find(name)->second) 
01069                 != valFilter->size() ) {
01070               cerr << "Cannot read cached BNN values." << endl;
01071               return prepareExit(classifiers,cToClean,bootstraps,3);
01072             }
01073           }
01074         }
01075         else {
01076           // save params to file
01077           ofstream output;
01078           if( makeNewCache(prefix,name.c_str(),output) ) {
01079             output << " " << abMode << " " << epsilon << " " << adaCycles 
01080                    << " " << structure << " " << nnCycles << " " << eta 
01081                    << " " << initPoints << " " << initEta << endl;
01082           }
01083           else {
01084             cerr << "Unable to make output file for BNN." << endl;
01085             return prepareExit(classifiers,cToClean,bootstraps,3);
01086           }
01087 
01088           // set AdaBoost mode
01089           SprTrainedAdaBoost::AdaBoostMode mode = SprTrainedAdaBoost::Discrete;
01090           switch( abMode )
01091             {
01092             case 1 :
01093               mode = SprTrainedAdaBoost::Discrete;
01094               break;
01095             case 2 :
01096               mode = SprTrainedAdaBoost::Real;
01097               break;
01098             case 3 :
01099               mode = SprTrainedAdaBoost::Epsilon;
01100               break;
01101             default :
01102               cerr << "Unknown mode for AdaBoost." << endl;
01103               return prepareExit(classifiers,cToClean,bootstraps,3);
01104             }
01105           
01106           // make neural net
01107           SprStdBackprop* stdnn = 
01108             new SprStdBackprop(filter.get(),structure.c_str(),nnCycles,eta);
01109           cToClean.push_back(stdnn);
01110           if( !stdnn->init(initEta,initPoints) ) {
01111             cerr << "Unable to initialize neural net." << endl;
01112             return prepareExit(classifiers,cToClean,bootstraps,3);
01113           }
01114           
01115           // make AdaBoost
01116           bool useStandardAB = true;
01117           bool bagInput = false;
01118           SprAdaBoost* ab = new SprAdaBoost(filter.get(),adaCycles,
01119                                             useStandardAB,mode,bagInput);
01120           ab->setEpsilon(epsilon);
01121           classifiers.insert(pair<const string,SprAbsClassifier*>(name,ab));
01122           if( !ab->addTrainable(stdnn) ) {
01123             cerr << "Unable to add a neural net to AdaBoost." << endl;
01124             return prepareExit(classifiers,cToClean,bootstraps,3);
01125           }
01126         }
01127         
01128         // prepare message
01129         ostringstream ost;
01130         string sMode;
01131         switch( abMode )
01132           {
01133           case 1 :
01134             sMode = "Discrete";
01135             break;
01136           case 2 :
01137             sMode = "Real";
01138             break;
01139           case 3 :
01140             sMode = "Epsilon";
01141             break;
01142           default :
01143             cerr << "Unknown mode for AdaBoost." << endl;
01144             return prepareExit(classifiers,cToClean,bootstraps,3);
01145           }
01146         ost << name.c_str() 
01147             << " = Boosted Neural Nets with:"
01148             << " Mode=" << sMode.c_str()
01149             << " Epsilon=" << epsilon
01150             << " nNets=" << adaCycles 
01151             << " structure=" << structure.c_str()
01152             << " nCyclesPerNet=" << nnCycles
01153             << " LearnRate=" << eta
01154             << " nInitPoints=" << initPoints
01155             << " InitLearnRate=" << initEta;
01156         message.insert(pair<const string,string>(name,ost.str()));
01157       }
01158     }
01159 
01160     //    
01161     // boosted splits
01162     //
01163     if( answerYN("Use boosted splits?") ) {
01164       unsigned ncycles = 0;
01165       bool repeat = false;
01166       ifstream input;
01167       string line;
01168       if( findCache(prefix,"BDS",input) ) {
01169         repeat = true;
01170         if( !getline(input,line,'\n') ) {
01171           cerr << "Cannot read top line from BDS cache." << endl;
01172           return prepareExit(classifiers,cToClean,bootstraps,3);
01173         }
01174         istringstream str(line);
01175         str >> ncycles;
01176       }
01177       cout << "Input number of AdaBoost splits per dimension [" 
01178            << ncycles << "] ";
01179       getline(cin,line,'\n');
01180       if( !line.empty() ) {
01181         repeat = false;
01182         istringstream str(line);
01183         str >> ncycles;
01184       }
01185       if( repeat ) {
01186         if( resetValidation("BDS",validated,valFilter.get()) ) {
01187           if( readCache(input,validated.find("BDS")->second) 
01188               != valFilter->size() ) {
01189             cerr << "Cannot read cached BDS values." << endl;
01190             return prepareExit(classifiers,cToClean,bootstraps,3);
01191           }
01192         }
01193       }
01194       else {
01195         // save ncycles to file
01196         ofstream output;
01197         if( makeNewCache(prefix,"BDS",output) ) {
01198           output << ncycles << endl;
01199         }
01200         else {
01201           cerr << "Unable to make output file for BDS." << endl;
01202           return prepareExit(classifiers,cToClean,bootstraps,3);
01203         }
01204         
01205         // make AdaBoost
01206         bool useStandardAB = true;
01207         ncycles *= filter->dim();
01208         SprAdaBoost* ab = new SprAdaBoost(filter.get(),
01209                                           ncycles,
01210                                           useStandardAB,
01211                                           SprTrainedAdaBoost::Discrete);
01212         classifiers.insert(pair<const string,SprAbsClassifier*>("BDS",ab));
01213         for( int i=0;i<filter->dim();i++ ) {
01214           SprBinarySplit* s = new SprBinarySplit(filter.get(),&idfrac,i);
01215           cToClean.push_back(s);
01216           if( !ab->addTrainable(s,SprUtils::lowerBound(0.5)) ) {
01217             cerr << "Unable to add binary split to AdaBoost." << endl;
01218             return prepareExit(classifiers,cToClean,bootstraps,3);
01219           }
01220         }
01221       }
01222       ostringstream ost;
01223       ost << "BDS = Boosted Decision Splits with: nSplits=" 
01224           << ncycles;
01225       message.insert(pair<const string,string>("BDS",ost.str()));
01226     }
01227     
01228     //
01229     // boosted trees
01230     //
01231     if( answerYN("Use boosted trees?") ) {
01232       
01233       // get instances
01234       unsigned ninstance = 0;
01235       ifstream iinst;
01236       if( findCache(prefix,"BDT_instances",iinst) ) {
01237         string line;
01238         if( !getline(iinst,line,'\n') ) {
01239           cerr << "Cannot read top line from BDT cache." << endl;
01240           return prepareExit(classifiers,cToClean,bootstraps,3);
01241         }
01242         istringstream str(line);
01243         str >> ninstance;
01244       }
01245       ninstance = 
01246         answerHowMany(ninstance,
01247             "How many instances of the classifier would you like to run?");
01248       ofstream oinst;
01249       if( makeNewCache(prefix,"BDT_instances",oinst) ) {
01250         oinst << ninstance << endl;
01251       }
01252       else {
01253         cerr << "Unable to make output file for BDT." << endl;
01254         return prepareExit(classifiers,cToClean,bootstraps,3);
01255       }
01256       if( !moveNewOntoOld(prefix,"BDT_instances") )
01257         return prepareExit(classifiers,cToClean,bootstraps,3);
01258 
01259       // loop over instances
01260       for( int instance=0;instance<ninstance;instance++ ) {
01261         string name = "BDT_";
01262         char s [200];
01263         sprintf(s,"%i",instance);
01264         name += s;
01265         int abMode = 0;
01266         double epsilon = 0;
01267         int iBagInput = 0;
01268         unsigned ncycles(0), nleaf(0);
01269         bool repeat = false;
01270         ifstream input;
01271         string line;
01272         if( findCache(prefix,name.c_str(),input) ) {
01273           repeat = true;
01274           if( !getline(input,line,'\n') ) {
01275             cerr << "Cannot read top line from BDT cache." << endl;
01276             return prepareExit(classifiers,cToClean,bootstraps,3);
01277           }
01278           istringstream str(line);
01279           str >> abMode >> epsilon >> iBagInput >> ncycles >> nleaf;
01280         }
01281         cout << "Input AdaBoost mode (Discrete=1 Real=2 Epsilon=3), "
01282              << "epsilon (only has effect for Epsilon and Real boost), "
01283              << "bag training events flag (No=0 Yes=1), "
01284              << "number of trees, and minimal tree leaf size "
01285              << "for BDT instance " << instance 
01286              << " [" << abMode << " " << epsilon << " " << iBagInput << " " 
01287              << ncycles << " " << nleaf << "] ";
01288         getline(cin,line,'\n');
01289         if( !line.empty() ) {
01290           repeat = false;
01291           istringstream str(line);
01292           str >> abMode >> epsilon >> iBagInput >> ncycles >> nleaf;
01293         }
01294         if( repeat ) {
01295           if( resetValidation(name.c_str(),validated,valFilter.get()) ) {
01296             if( readCache(input,validated.find(name)->second) 
01297                 != valFilter->size() ) {
01298               cerr << "Cannot read cached BDT values." << endl;
01299               return prepareExit(classifiers,cToClean,bootstraps,3);
01300             }
01301           }
01302         }
01303         else {
01304           // save params to file
01305           ofstream output;
01306           if( makeNewCache(prefix,name.c_str(),output) ) {
01307             output << abMode << " " << epsilon << " " << iBagInput << " "
01308                    << ncycles << " " << nleaf << endl;
01309           }
01310           else {
01311             cerr << "Unable to make output file for BDT." << endl;
01312             return prepareExit(classifiers,cToClean,bootstraps,3);
01313           }
01314 
01315           // set AdaBoost mode
01316           SprTrainedAdaBoost::AdaBoostMode mode = SprTrainedAdaBoost::Discrete;
01317           switch( abMode )
01318             {
01319             case 1 :
01320               mode = SprTrainedAdaBoost::Discrete;
01321               break;
01322             case 2 :
01323               mode = SprTrainedAdaBoost::Real;
01324               break;
01325             case 3 :
01326               mode = SprTrainedAdaBoost::Epsilon;
01327               break;
01328             default :
01329               cerr << "Unknown mode for AdaBoost." << endl;
01330               return prepareExit(classifiers,cToClean,bootstraps,3);
01331             }
01332           
01333           // make decision tree
01334           bool discrete = (mode!=SprTrainedAdaBoost::Real);
01335           SprTopdownTree* tree = new SprTopdownTree(filter.get(),&gini,
01336                                                     nleaf,discrete,0);
01337           if( mode == SprTrainedAdaBoost::Real ) tree->forceMixedNodes();
01338           tree->useFastSort();
01339           cToClean.push_back(tree);
01340           
01341           // make AdaBoost
01342           bool useStandardAB = true;
01343           bool bagInput = (iBagInput==1);
01344           SprAdaBoost* ab = new SprAdaBoost(filter.get(),ncycles,
01345                                             useStandardAB,mode,bagInput);
01346           ab->setEpsilon(epsilon);
01347           classifiers.insert(pair<const string,SprAbsClassifier*>(name,ab));
01348           if( !ab->addTrainable(tree,SprUtils::lowerBound(0.5)) ) {
01349             cerr << "Unable to add a decision tree to AdaBoost." << endl;
01350             return prepareExit(classifiers,cToClean,bootstraps,3);
01351           }
01352         }
01353         
01354         // prepare message
01355         ostringstream ost;
01356         string sMode;
01357         switch( abMode )
01358           {
01359           case 1 :
01360             sMode = "Discrete";
01361             break;
01362           case 2 :
01363             sMode = "Real";
01364             break;
01365           case 3 :
01366             sMode = "Epsilon";
01367             break;
01368           default :
01369             cerr << "Unknown mode for AdaBoost." << endl;
01370             return prepareExit(classifiers,cToClean,bootstraps,3);
01371           }
01372         string sBag;
01373         if( iBagInput == 1 )
01374           sBag = "Yes";
01375         else
01376           sBag = "No";
01377         ost << name.c_str() 
01378             << " = Boosted Decision Trees with:"
01379             << " Mode=" << sMode.c_str()
01380             << " Epsilon=" << epsilon
01381             << " BagTrainingEvents=" << sBag.c_str()
01382             << " nTrees=" << ncycles 
01383             << " nEventsPerLeaf=" << nleaf;
01384         message.insert(pair<const string,string>(name,ost.str()));
01385       }
01386     }
01387     
01388     //
01389     // random forest
01390     //
01391     if( answerYN("Use random forest?") ) {
01392       
01393       // get instances
01394       unsigned ninstance = 0;
01395       ifstream iinst;
01396       if( findCache(prefix,"RF_instances",iinst) ) {
01397         string line;
01398         if( !getline(iinst,line,'\n') ) {
01399           cerr << "Cannot read top line from RF cache." << endl;
01400           return prepareExit(classifiers,cToClean,bootstraps,3);
01401         }
01402         istringstream str(line);
01403         str >> ninstance;
01404       }
01405       ninstance = 
01406         answerHowMany(ninstance,
01407              "How many instances of the classifier would you like to run?");
01408       ofstream oinst;
01409       if( makeNewCache(prefix,"RF_instances",oinst) ) {
01410         oinst << ninstance << endl;
01411       }
01412       else {
01413         cerr << "Unable to make output file for RF." << endl;
01414         return prepareExit(classifiers,cToClean,bootstraps,3);
01415       }
01416       if( !moveNewOntoOld(prefix,"RF_instances") )
01417         return prepareExit(classifiers,cToClean,bootstraps,3);
01418       
01419       // loop over instances
01420       for( int instance=0;instance<ninstance;instance++ ) {
01421         string name = "RF_";
01422         char s [200];
01423         sprintf(s,"%i",instance);
01424         name += s;
01425         unsigned ncycles(0), nleaf(0), nfeatures(filter->dim());
01426         bool repeat = false;
01427         ifstream input;
01428         string line;
01429         if( findCache(prefix,name.c_str(),input) ) {
01430           repeat = true;
01431           if( !getline(input,line,'\n') ) {
01432             cerr << "Cannot read top line from RF cache." << endl;
01433             return prepareExit(classifiers,cToClean,bootstraps,3);
01434           }
01435           istringstream str(line);
01436           str >> ncycles >> nleaf >> nfeatures;
01437         }
01438         cout << "Input number of trees, minimal tree leaf size, "
01439              << "and number of input features for sampling "
01440              << " for RF instance " << instance << " [" 
01441              << ncycles << " " << nleaf << " " << nfeatures << "] ";
01442         getline(cin,line,'\n');
01443         if( !line.empty() ) {
01444           repeat = false;
01445           istringstream str(line);
01446           str >> ncycles >> nleaf >> nfeatures;
01447         }
01448         if( repeat ) {
01449           if( resetValidation(name.c_str(),validated,valFilter.get()) ) {
01450             if( readCache(input,validated.find(name)->second) 
01451                 != valFilter->size() ) {
01452               cerr << "Cannot read cached RF values." << endl;
01453               return prepareExit(classifiers,cToClean,bootstraps,3);
01454             }
01455           }
01456         }
01457         else {
01458           // save params to file
01459           ofstream output;
01460           if( makeNewCache(prefix,name.c_str(),output) ) {
01461             output << ncycles << " " << nleaf << " " << nfeatures << endl;
01462           }
01463           else {
01464             cerr << "Unable to make output file for RF." << endl;
01465             return prepareExit(classifiers,cToClean,bootstraps,3);
01466           }
01467           
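                // Note on the ensemble assembled below (a sketch of the intent,
                // inferred from the class names rather than their implementations):
                // SprIntegerBootstrap presumably draws nfeatures of the filter->dim()
                // input variables for each tree split, SprTopdownTree grows a tree
                // on the "gini" criterion (presumably the Gini index configured
                // earlier) down to nleaf events per leaf, and SprBagger aggregates
                // ncycles such trees trained on bootstrap replicas of the training
                // events -- i.e. a random forest.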
01468           // make bootstrap object
01469           SprIntegerBootstrap* bootstrap 
01470             = new SprIntegerBootstrap(filter->dim(),nfeatures);
01471           bootstraps.push_back(bootstrap);
01472         
01473           // make decision tree
01474           bool discrete = false;
01475           SprTopdownTree* tree = new SprTopdownTree(filter.get(),&gini,
01476                                                     nleaf,discrete,bootstrap);
01477           tree->useFastSort();
01478           cToClean.push_back(tree);
01479           
01480           // make bagger
01481           SprBagger* bagger = new SprBagger(filter.get(),ncycles,discrete);
01482           classifiers.insert(pair<const string,SprAbsClassifier*>(name,
01483                                                                   bagger));
01484           if( !bagger->addTrainable(tree) ) {
01485             cerr << "Unable to add a decision tree to Random Forest." << endl;
01486             return prepareExit(classifiers,cToClean,bootstraps,3);
01487           }
01488         }
01489         ostringstream ost;
01490         ost << name.c_str() 
01491             << " = Random Forest with: nTrees=" << ncycles 
01492             << " nEventsPerLeaf=" << nleaf 
01493             << " nFeaturesToSample=" << nfeatures;
01494         message.insert(pair<const string,string>(name,ost.str()));
01495       }
01496     }
01497     
01498     //
01499     // arc-x4
01500     //
01501     if( answerYN("Use arc-x4?") ) {
01502       
01503       // get instances
01504       unsigned ninstance = 0;
01505       ifstream iinst;
01506       if( findCache(prefix,"AX4instances_",iinst) ) {
01507         string line;
01508         if( !getline(iinst,line,'\n') ) {
01509           cerr << "Cannot read top line from AX4 cache." << endl;
01510           return prepareExit(classifiers,cToClean,bootstraps,3);
01511         }
01512         istringstream str(line);
01513         str >> ninstance;
01514       }
01515       ninstance = 
01516         answerHowMany(ninstance,
01517             "How many instances of the classifier would you like to run?");
01518       ofstream oinst;
01519       if( makeNewCache(prefix,"AX4instances_",oinst) ) {
01520         oinst << ninstance << endl;
01521       }
01522       else {
01523         cerr << "Unable to make output file for AX4." << endl;
01524         return prepareExit(classifiers,cToClean,bootstraps,3);
01525       }
01526       if( !moveNewOntoOld(prefix,"AX4instances_") )
01527         return prepareExit(classifiers,cToClean,bootstraps,3);
01528       
01529       // loop over instances
01530       for( int instance=0;instance<ninstance;instance++ ) {
01531         string name = "AX4_";
01532         char s [200];
01533         sprintf(s,"%i",instance);
01534         name += s;
01535         unsigned ncycles(0), nleaf(0), nfeatures(filter->dim());
01536         bool repeat = false;
01537         ifstream input;
01538         string line;
01539         if( findCache(prefix,name.c_str(),input) ) {
01540           repeat = true;
01541           if( !getline(input,line,'\n') ) {
01542             cerr << "Cannot read top line from AX4 cache." << endl;
01543             return prepareExit(classifiers,cToClean,bootstraps,3);
01544           }
01545           istringstream str(line);
01546           str >> ncycles >> nleaf >> nfeatures;
01547         }
01548         cout << "Input number of trees, minimal tree leaf size, "
01549              << "and number of input features for sampling"
01550              << " for AX4 instance " << instance << " [" 
01551              << ncycles << " " << nleaf << " " << nfeatures << "] ";
01552         getline(cin,line,'\n');
01553         if( !line.empty() ) {
01554           repeat = false;
01555           istringstream str(line);
01556           str >> ncycles >> nleaf >> nfeatures;
01557         }
01558         if( repeat ) {
01559           if( resetValidation(name.c_str(),validated,valFilter.get()) ) {
01560             if( readCache(input,validated.find(name)->second) 
01561                 != valFilter->size() ) {
01562               cerr << "Cannot read cached AX4 values." << endl;
01563               return prepareExit(classifiers,cToClean,bootstraps,3);
01564             }
01565           }
01566         }
01567         else {
01568           // save params to file
01569           ofstream output;
01570           if( makeNewCache(prefix,name.c_str(),output) ) {
01571             output << ncycles << " " << nleaf << " " << nfeatures << endl;
01572           }
01573           else {
01574             cerr << "Unable to make output file for AX4." << endl;
01575             return prepareExit(classifiers,cToClean,bootstraps,3);
01576           }
01577           
01578           // make bootstrap object
01579           SprIntegerBootstrap* bootstrap 
01580             = new SprIntegerBootstrap(filter->dim(),nfeatures);
01581           bootstraps.push_back(bootstrap);
01582           
01583           // make decision tree
01584           bool discrete = false;
01585           SprTopdownTree* tree = new SprTopdownTree(filter.get(),&gini,
01586                                                     nleaf,discrete,bootstrap);
01587           tree->useFastSort();
01588           cToClean.push_back(tree);
01589           
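                // Note: SprArcE4 presumably implements Breiman's arc-x4 rule, in
                // which an event's resampling weight grows with the number of times
                // m it has been misclassified (proportional to 1 + m^4); this is
                // inferred from the class name and is not verified against the
                // implementation.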
01590           // make arc-x4 ensemble
01591           SprArcE4* arcer = new SprArcE4(filter.get(),ncycles,discrete);
01592           classifiers.insert(pair<const string,SprAbsClassifier*>(name,arcer));
01593           if( !arcer->addTrainable(tree) ) {
01594             cerr << "Unable to add a decision tree to arc-x4." << endl;
01595             return prepareExit(classifiers,cToClean,bootstraps,3);
01596           }
01597         }
01598         ostringstream ost;
01599         ost << name.c_str() 
01600             << " = arc-x4 with: nTrees=" << ncycles 
01601             << " nEventsPerLeaf=" << nleaf 
01602             << " nFeaturesToSample=" << nfeatures;
01603         message.insert(pair<const string,string>(name,ost.str()));
01604       }
01605     }
01606     
01607     //
01608     // Train all classifiers.
01609     //
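          // Each configured classifier is trained on the training filter; a
          // classifier that fails to train is skipped rather than aborting the
          // session. Responses on the validation sample are then computed and
          // written to the per-classifier cache, so that on a later pass an
          // unchanged configuration can be reloaded from cache instead of
          // retrained.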
01610     for( map<string,SprAbsClassifier*>::const_iterator 
01611            iter=classifiers.begin();iter!=classifiers.end();iter++ ) {
01612       cout << "Training classifier " << iter->first.c_str() << endl;
01613       
01614       // train
01615       if( !iter->second->train(verbose) ) {
01616         cerr << "Classifier " << iter->first.c_str() << " cannot be trained. " 
01617              << "Skipping..." << endl;
01618         continue;
01619       }
01620       
01621       // fill out vector of responses for validation data
01622       if( resetValidation(iter->first.c_str(),validated,valFilter.get()) ) {
01623         cout << "Saving responses for validation data for classifier "
01624              << iter->first.c_str() << endl;
01625         
01626         // make a trained classifier
01627         SprAbsTrainedClassifier* t = iter->second->makeTrained();
01628         if( t == 0 ) {
01629           cerr << "Unable to make trained classifier for " 
01630                << iter->first.c_str() << endl;
01631           continue;
01632         }
01633 
01634         // compute responses
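              // Each validation event is recorded as an SIAResponse triple:
              // a signal flag (1 if the event's class matches inputClasses[1]),
              // the event weight, and the trained classifier's response.
              // These triples drive both the efficiency table and the cache below.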
01635         map<string,vector<SIAResponse> >::iterator found =
01636           validated.find(iter->first);
01637         assert( found != validated.end() );
01638         for( int i=0;i<valFilter->size();i++ ) {
01639           const SprPoint* p = (*(valFilter.get()))[i];
01640           found->second.push_back(SIAResponse(int(inputClasses[1]==p->class_),
01641                                            valFilter->w(i),
01642                                            t->response(p)));
01643         }
01644         
01645         // clean up
01646         delete t;
01647 
01648         // store the vector into cache
01649         if( !storeNewCache(prefix,iter->first.c_str(),found->second) ) {
01650           cerr << "Unable to save new cache for classifier " 
01651                << iter->first.c_str() << endl;
01652           return prepareExit(classifiers,cToClean,bootstraps,4);
01653         }
01654       }
01655       else {
01656         cerr << "Vector of responses for classifier " << iter->first.c_str()
01657              << " already exists. Skipping..."<< endl;
01658       }
01659     }
01660     
01661     //
01662     // replace old cache with new cache
01663     //
01664     for( map<string,SprAbsClassifier*>::const_iterator 
01665            iter=classifiers.begin();iter!=classifiers.end();iter++ ) {
01666       if( !moveNewOntoOld(prefix,iter->first.c_str()) )
01667         return prepareExit(classifiers,cToClean,bootstraps,5);
01668     }
01669     
01670     //
01671     // ask the user to specify signal efficiency points
01672     //
01673     vector<double> effS;
01674     ifstream effInput;
01675     if( findCache(prefix,"eff",effInput) ) {
01676       string line;
01677       if( getline(effInput,line) ) {
01678         istringstream str(line);
01679         double dummy = 0;
01680         while( str >> dummy ) effS.push_back(dummy);
01681       }
01682     }
01683     else {
01684       effS.push_back(0.1);
01685       effS.push_back(0.2);
01686       effS.push_back(0.3);
01687       effS.push_back(0.4);
01688       effS.push_back(0.5);
01689       effS.push_back(0.6);
01690       effS.push_back(0.7);
01691       effS.push_back(0.8);
01692       effS.push_back(0.9);
01693     }
01694     effInput.close();
01695     cout << "Input signal efficiency values for which background "
01696          << "will be estimated [ ";
01697     for( int i=0;i<effS.size();i++ ) 
01698       cout << effS[i] << " ";
01699     cout << "] ";
01700     string line;
01701     getline(cin,line,'\n');
01702     if( !line.empty() ) {
01703       effS.clear();
01704       istringstream str(line);
01705       double dummy = 0;
01706       while( str >> dummy ) effS.push_back(dummy);
01707     }
01708     if( effS.empty() ) {
01709       cerr << "What, are you trying to be cute? Enter values." << endl;
01710       return prepareExit(classifiers,cToClean,bootstraps,7);
01711     }
01712     stable_sort(effS.begin(),effS.end());
01713     if( !storeEffCache(prefix,"eff",effS) ) {
01714       cerr << "Unable to store efficiency values." << endl;
01715       return prepareExit(classifiers,cToClean,bootstraps,6);
01716     }
01717     
01718     //
01719     // Estimate background fractions for these signal efficiencies.
01720     //
01721     map<string,vector<double> > effB;
01722     const double wsig = valFilter->weightInClass(inputClasses[1]);
01723     const double wbgr = valFilter->weightInClass(inputClasses[0]);
01724     assert( wsig > 0 );
01725     assert( wbgr > 0 );
01726     
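          // For every classifier, the cached validation responses are split into
          // signal and background lists, each sorted by response. Response
          // thresholds ("cuts") are located at the requested signal-efficiency
          // points, and the background weight fraction associated with each
          // threshold is stored in effB for the table printed below.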
01727     for( map<string,vector<SIAResponse> >::const_iterator 
01728            iter=validated.begin();iter!=validated.end();iter++ ) {
01729       
01730       // prepare vectors
01731       vector<pair<double,double> > signal;
01732       vector<pair<double,double> > bgrnd;
01733       
01734       // fill them
01735       for( int i=0;i<iter->second.size();i++ ) {
01736         if(      iter->second[i].cls == 0 ) {
01737           bgrnd.push_back(pair<double,double>(iter->second[i].response,
01738                                               iter->second[i].weight));
01739         }
01740         else if( iter->second[i].cls == 1 ) {
01741           signal.push_back(pair<double,double>(iter->second[i].response,
01742                                                iter->second[i].weight));
01743         }
01744       }
01745       
01746       // sort
01747       stable_sort(bgrnd.begin(),bgrnd.end(),SIACmpPairDDFirst());
01748       stable_sort(signal.begin(),signal.end(),SIACmpPairDDFirst());
01749       
01750       // find dividing point in classifier response
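            // Each threshold is placed halfway between the two neighboring sorted
            // signal responses at the point where the cumulative signal weight
            // fraction w/wsig first exceeds the requested efficiency (or at the
            // lowest response if that already happens for the first event).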
01751       vector<double> cuts(effS.size());
01752       double w = 0;
01753       int divider = 0;
01754       int i = 0;
01755       while( i<signal.size() && divider<effS.size() ) {
01756         w += signal[i].second;
01757         if( (w/wsig) > effS[divider] ) {
01758           if( i == 0 )
01759             cuts[divider] = signal[0].first;
01760           else
01761             cuts[divider] = 0.5 * (signal[i].first + signal[i-1].first); 
01762           divider++;
01763         }
01764         i++;
01765       }
01766       
01767       // find background fractions
01768       pair<map<string,vector<double> >::iterator,bool> inserted = 
01769         effB.insert(pair<const string,vector<double> >(iter->first,
01770                                             vector<double>(effS.size(),1)));
01771       assert( inserted.second );
01772       w = 0;
01773       divider = 0;
01774       i = 0;
01775       while( i<bgrnd.size() && divider<effS.size() ) {
01776         if( bgrnd[i].first < cuts[divider] )
01777           inserted.first->second[divider++] = w/wbgr;
01778         w += bgrnd[i].second;
01779         i++;
01780       }
01781     }
01782     
01783     //
01784     // make a table of signal and background efficiencies
01785     //
01786     cout << "===========================================" << endl;
01787     for( map<string,string>::const_iterator 
01788            iter=message.begin();iter!=message.end();iter++ )
01789       cout << iter->second.c_str() << endl;
01790     cout << "===========================================" << endl;
01791     cout << "Table of surviving background fractions" 
01792          << " (* shows minimal value in a row)" << endl;
01793     cout << "===========================================" << endl;
01794     char s[200];
01795     sprintf(s,"Signal Eff \\ Classifiers |");
01796     cout << s;
01797     string temp = "--------------------------";
01798     for( map<string,vector<double> >::const_iterator 
01799            iter=effB.begin();iter!=effB.end();iter++ ) {
01800       sprintf(s," %8s |",iter->first.c_str());
01801       cout << s;
01802       temp += "-----------";
01803     }
01804     cout << endl;
01805     cout << temp.c_str() << endl;
01806     for( int i=0;i<effS.size();i++ ) {
01807       sprintf(s,"          %6.4f         |",effS[i]);
01808       cout << s;
01809       vector<string> names;
01810       vector<double> values;
01811       for( map<string,vector<double> >::const_iterator 
01812              iter=effB.begin();iter!=effB.end();iter++ ) {
01813         names.push_back(iter->first);
01814         values.push_back(iter->second[i]);
01815       }
01816       int foundMin = min_element(values.begin(),values.end()) - values.begin();
01817       for( int j=0;j<names.size();j++ ) {
01818         if( j == foundMin )
01819           sprintf(s," *%7.5f |",values[j]);
01820         else
01821           sprintf(s,"  %7.5f |",values[j]);
01822         cout << s;
01823       }
01824       cout << endl;
01825     }
01826     cout << "===========================================" << endl;
01827 
01828     // save output to n-tuple
01829     if( answerYN("Save classifier output to an ntuple?") ) {
01830 
01831       // open file
01832       string ntupleFile 
01833         = answerName("Give name of the output ntuple file -->");
01834       auto_ptr<SprAbsWriter> 
01835         writer(SprRWFactory::makeWriter(writeMode,"ClassifierComparison"));
01836       if( !writer->init(ntupleFile.c_str()) ) {
01837         cerr << "Unable to open ntuple file " << ntupleFile.c_str() << endl;
01838         return prepareExit(classifiers,cToClean,bootstraps,7);
01839       }
01840 
01841       // make axes
01842       vector<string> axes;
01843       valFilter->vars(axes);
01844       if( !writer->setAxes(axes) ) {
01845         cerr << "Unable to set axes for the ntuple writer." << endl;
01846         return prepareExit(classifiers,cToClean,bootstraps,8);
01847       }
01848       for( map<string,vector<SIAResponse> >::const_iterator
01849              iter=validated.begin();iter!=validated.end();iter++ ) {
01850         if( !writer->addAxis(iter->first.c_str()) ) {
01851           cerr << "Unable to add a classifier axis for " 
01852                << iter->first.c_str() << endl;
01853           return prepareExit(classifiers,cToClean,bootstraps,8);
01854         }
01855       }
01856 
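            // One ntuple row per validation event: class, event index, weight,
            // the input variable values, and one column per classifier response,
            // in the order the classifier axes were registered above.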
01857       // feed
01858       for( int i=0;i<valFilter->size();i++ ) {
01859         const SprPoint* p = (*(valFilter.get()))[i];
01860         double w = valFilter->w(i);
01861         vector<double> f;
01862         for( map<string,vector<SIAResponse> >::const_iterator
01863              iter=validated.begin();iter!=validated.end();iter++ ) {
01864           f.push_back((iter->second)[i].response);
01865         }
01866         if( !writer->write(p->class_,p->index_,w,p->x_,f) ) {
01867           cerr << "Unable to write event " << p->index_ << endl;
01868           return prepareExit(classifiers,cToClean,bootstraps,8);
01869         }
01870       }
01871       cout << "Writing to ntuple done." << endl;
01872 
01873       // clean up
01874       writer->close();
01875     }// end saving to file
01876 
01877     // clean up
01878     prepareExit(classifiers,cToClean,bootstraps,0);
01879 
01880     // exit?
01881     if( !answerYN("Continue?") ) break;
01882   }// end of the big loop
01883   
01884   // exit
01885   return 0;
01886 }
01887   
