CMS 3D CMS Logo

SprCombinerApp.cc File Reference

#include "PhysicsTools/StatPatternRecognition/interface/SprExperiment.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprCombiner.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprData.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprEmptyFilter.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprAbsReader.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprAbsWriter.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprRWFactory.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprStringParser.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprClass.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprDefs.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprClassifierReader.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprAbsTrainedClassifier.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprAbsTwoClassCriterion.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprAbsClassifier.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprIntegerBootstrap.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprLoss.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprAverageLoss.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprTransformation.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprAdaBoost.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprBagger.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprStdBackprop.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprAbsVarTransformer.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprVarTransformerReader.hh"
#include "PhysicsTools/StatPatternRecognition/interface/SprTransformerFilter.hh"
#include <stdlib.h>
#include <unistd.h>
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <memory>
#include <string>
#include <cassert>
#include <map>
#include <utility>

Go to the source code of this file.

Functions

void help (const char *prog)
int main (int argc, char **argv)
void prepareExit (vector< SprAbsTwoClassCriterion * > &criteria, vector< SprIntegerBootstrap * > &bstraps, vector< SprAbsClassifier * > &classifiers)


Function Documentation

void help ( const char *  prog  ) 

Definition at line 45 of file SprCombinerApp.cc.

References GenMuonPlsPt100GeV_cfg::cout, and lat::endl().

00046 {
00047   cout << "Usage:  " << prog
00048        << " list_of_input_config_subclassifier_files"
00049        << " input_config_file_for_global_classifier" 
00050        << " input_data_file" << endl;
00051   cout << "\t Options: " << endl;
00052   cout << "\t-h --- help                                        " << endl;
00053   cout << "\t-y list of input classes (see SprAbsFilter.hh)     " << endl;
00054   cout << "\t-Q apply variable transformation saved in file     " << endl;
00055   cout << "\t-a input ascii file mode (see SprSimpleReader.hh)  " << endl;
00056   cout << "\t-v verbose level (0=silent default,1,2)            " << endl;
00057   cout << "\t-w scale all signal weights by this factor         " << endl;
00058   cout << "\t-f save trained classifier configuration to file   " << endl;
00059   cout << "\t-K keep this fraction in training set and          " << endl;
00060   cout << "\t\t put the rest into validation set                " << endl;
00061   cout << "\t-D randomize training set split-up                 " << endl;
00062   cout << "\t-t read validation/test data from a file           " << endl;
00063   cout << "\t\t (must be in same format as input data!!!        " << endl;
00064   cout << "\t-d frequency of print-outs for validation data     " << endl;
00065 }

int main ( int  argc,
char **  argv 
)

Definition at line 78 of file SprCombinerApp.cc.

References c, TestMuL1L2Filter_cff::cerr, GenMuonPlsPt100GeV_cfg::cout, EgammaValidation_Zee_cff::cut, dummy, lat::endl(), file, filter, first, mergeAndRegister_online::fname, help(), i, j, k, parsecf::pyparsing::line(), prepareExit(), edm::second(), split, t, vars, and weights.

00079 {
00080   // check command line
00081   if( argc < 4 ) {
00082     help(argv[0]);
00083     return 1;
00084   }
00085 
00086   // init
00087   int readMode = 0;
00088   int verbose = 0;
00089   bool scaleWeights = false;
00090   double sW = 1.;
00091   bool useStandard = false;
00092   string inputClassesString;
00093   string valFile;
00094   unsigned valPrint = 0;
00095   string outFile;
00096   bool split = false;
00097   double splitFactor = 0;
00098   bool splitRandomize = false;
00099   string transformerFile;
00100 
00101   // decode command line
00102   int c;
00103   extern char* optarg;
00104   extern int optind;
00105   while( (c = getopt(argc,argv,"hy:a:v:w:f:K:Dt:d:")) != EOF ) {
00106     switch( c )
00107       {
00108       case 'h' :
00109         help(argv[0]);
00110         return 1;
00111       case 'y' :
00112         inputClassesString = optarg;
00113         break;
00114       case 'Q' :
00115         transformerFile = optarg;
00116         break;
00117       case 'a' :
00118         readMode = (optarg==0 ? 0 : atoi(optarg));
00119         break;
00120       case 'v' :
00121         verbose = (optarg==0 ? 0 : atoi(optarg));
00122         break;
00123       case 'w' :
00124         if( optarg != 0 ) {
00125           scaleWeights = true;
00126           sW = atof(optarg);
00127         }
00128         break;
00129       case 'f' :
00130         outFile = optarg;
00131         break;
00132       case 'K' :
00133         split = true;
00134         splitFactor = (optarg==0 ? 0 : atof(optarg));
00135         break;
00136       case 'D' :
00137         splitRandomize = true;
00138         break;
00139       case 't' :
00140         valFile = optarg;
00141         break;
00142       case 'd' :
00143         valPrint = (optarg==0 ? 0 : atoi(optarg));
00144         break;
00145       }
00146   }
00147 
00148   // Must have 3 arguments on the command line
00149   string trainFile = argv[argc-1];
00150   if( trainFile.empty() ) {
00151     cerr << "No input data file is specified." << endl;
00152     return 1;
00153   }
00154   cout << "Will read input data from file " << trainFile.c_str() << endl;
00155   string configFile = argv[argc-2];
00156   if( configFile.empty() ) {
00157     cerr << "No config file for the global classifier specified." << endl;
00158     return 1;
00159   }
00160   cout << "Will read global classifier config from file "
00161        << configFile.c_str() << endl;
00162   string subConfigList = argv[argc-3];
00163   if( subConfigList.empty() ) {
00164     cerr << "No config file list found for sub-classifiers." << endl;
00165     return 1;
00166   }
00167   cout << "Will read sub-classifier configs from files " 
00168        << subConfigList.c_str() << endl;
00169 
00170   // check options
00171   if( subConfigList.empty() || configFile.empty() ) {
00172     cerr << "User must specify combiner configuration." << endl;
00173     return 1;
00174   }
00175 
00176   // get classifier names and config files
00177   vector<vector<string> > subConfigFiles;
00178   SprStringParser::parseToStrings(subConfigList.c_str(),subConfigFiles);
00179   bool useSubConfig 
00180     = ( !subConfigFiles.empty() && !subConfigFiles[0].empty() );
00181   if( !useSubConfig ) {
00182     cerr << "Unable to process list of sub-classifier config files." << endl;
00183     return 1;
00184   }
00185   int nTrained = subConfigFiles[0].size();
00186 
00187   // make reader
00188   SprRWFactory::DataType inputType 
00189     = ( readMode==0 ? SprRWFactory::Root : SprRWFactory::Ascii );
00190   auto_ptr<SprAbsReader> reader(SprRWFactory::makeReader(inputType,readMode));
00191 
00192   // read input data from file
00193   auto_ptr<SprAbsFilter> filter(reader->read(trainFile.c_str()));
00194   if( filter.get() == 0 ) {
00195     cerr << "Unable to read data from file " << trainFile.c_str() << endl;
00196     return 2;
00197   }
00198   vector<string> vars;
00199   filter->vars(vars);
00200   cout << "Read data from file " << trainFile.c_str() << " for variables";
00201   for( int i=0;i<vars.size();i++ ) 
00202     cout << " \"" << vars[i].c_str() << "\"";
00203   cout << endl;
00204   cout << "Total number of points read: " << filter->size() << endl;
00205 
00206   // filter training data by class
00207   vector<SprClass> inputClasses;
00208   if( !filter->filterByClass(inputClassesString.c_str()) ) {
00209     cerr << "Cannot choose input classes for string " 
00210          << inputClassesString << endl;
00211     return 2;
00212   }
00213   filter->classes(inputClasses);
00214   assert( inputClasses.size() > 1 );
00215   cout << "Training data filtered by class." << endl;
00216   for( int i=0;i<inputClasses.size();i++ ) {
00217     cout << "Points in class " << inputClasses[i] << ":   " 
00218          << filter->ptsInClass(inputClasses[i]) << endl;
00219   }
00220 
00221   // scale weights
00222   if( scaleWeights ) {
00223     cout << "Signal weights are multiplied by " << sW << endl;
00224     filter->scaleWeights(inputClasses[1],sW);
00225   }
00226 
00227   // read test data
00228   auto_ptr<SprAbsFilter> valFilter;
00229   if( split && !valFile.empty() ) {
00230     cerr << "Unable to split training data and use validation data " 
00231          << "from a separate file." << endl;
00232     return 2;
00233   }
00234   if( split ) {
00235     cout << "Splitting training data with factor " << splitFactor << endl;
00236     if( splitRandomize )
00237       cout << "Will use randomized splitting." << endl;
00238     vector<double> weights;
00239     SprData* splitted = filter->split(splitFactor,weights,splitRandomize);
00240     if( splitted == 0 ) {
00241       cerr << "Unable to split training data." << endl;
00242       return 2;
00243     }
00244     bool ownData = true;
00245     valFilter.reset(new SprEmptyFilter(splitted,weights,ownData));
00246     cout << "Training data re-filtered:" << endl;
00247     for( int i=0;i<inputClasses.size();i++ ) {
00248       cout << "Points in class " << inputClasses[i] << ":   " 
00249            << filter->ptsInClass(inputClasses[i]) << endl;
00250     }
00251   }
00252   if( !valFile.empty() ) {
00253     // make test reader
00254     auto_ptr<SprAbsReader> 
00255       valReader(SprRWFactory::makeReader(inputType,readMode));
00256     
00257     // read test data from file
00258     valFilter.reset(valReader->read(valFile.c_str()));
00259     if( valFilter.get() == 0 ) {
00260       cerr << "Unable to read data from file " << valFile.c_str() << endl;
00261       return 2;
00262     }
00263     vector<string> valVars;
00264     valFilter->vars(valVars);
00265     cout << "Read data from file " << valFile.c_str() << " for variables";
00266     for( int i=0;i<valVars.size();i++ ) 
00267       cout << " \"" << valVars[i].c_str() << "\"";
00268     cout << endl;
00269     cout << "Total number of points read: " << valFilter->size() << endl;
00270     
00271     // filter training data by class
00272     if( !valFilter->filterByClass(inputClassesString.c_str()) ) {
00273       cerr << "Cannot choose input classes for string " 
00274            << inputClassesString << endl;
00275       return 2;
00276     }
00277     valFilter->classes(inputClasses);
00278     assert( inputClasses.size() > 1 );
00279     cout << "Validation data filtered by class." << endl;
00280     for( int i=0;i<inputClasses.size();i++ ) {
00281       cout << "Points in class " << inputClasses[i] << ":   " 
00282            << valFilter->ptsInClass(inputClasses[i]) << endl;
00283     }
00284     
00285     // scale weights
00286     if( scaleWeights ) {
00287       cout << "Signal weights are multiplied by " << sW << endl;
00288       valFilter->scaleWeights(inputClasses[1],sW);
00289     }
00290   }
00291 
00292   // apply transformation of variables to training and test data
00293   auto_ptr<SprAbsFilter> garbage_train, garbage_valid;
00294   if( !transformerFile.empty() ) {
00295     SprVarTransformerReader transReader;
00296     const SprAbsVarTransformer* t = transReader.read(transformerFile.c_str());
00297     if( t == 0 ) {
00298       cerr << "Unable to read VarTransformer from file "
00299            << transformerFile.c_str() << endl;
00300       return 2;
00301     }
00302     SprTransformerFilter* t_train = new SprTransformerFilter(filter.get());
00303     SprTransformerFilter* t_valid = 0;
00304     if( valFilter.get() != 0 )
00305       t_valid = new SprTransformerFilter(valFilter.get());
00306     bool replaceOriginalData = true;
00307     if( !t_train->transform(t,replaceOriginalData) ) {
00308       cerr << "Unable to apply VarTransformer to training data." << endl;
00309       return 2;
00310     }
00311     if( t_valid!=0 && !t_valid->transform(t,replaceOriginalData) ) {
00312       cerr << "Unable to apply VarTransformer to validation data." << endl;
00313       return 2;
00314     }
00315     cout << "Variable transformation from file "
00316          << transformerFile.c_str() << " has been applied to "
00317          << "training and validation data." << endl;
00318     garbage_train.reset(filter.release());
00319     garbage_valid.reset(valFilter.release());
00320     filter.reset(t_train);
00321     valFilter.reset(t_valid);
00322   }
00323 
00324   //
00325   // make combiner
00326   //
00327   SprCombiner combiner(filter.get());
00328 
00329   //
00330   // read classifier configuration
00331   //
00332   for( int ic=0;ic<nTrained;ic++ ) {
00333 
00334     // open file
00335     string fname = subConfigFiles[0][ic];
00336     ifstream file(fname.c_str());
00337     if( !file ) {
00338       cerr << "Unable to open file " << fname.c_str() << endl;
00339       return 3;
00340     }
00341     cout << "Reading classifier configuration from file " 
00342          << fname.c_str() << endl;
00343 
00344     // get path to sub-classifier file
00345     string line;
00346     unsigned nLine = 1;
00347     if( !getline(file,line) ) {
00348       cerr << "Cannot read line " << nLine 
00349            << " from file " << fname.c_str() << endl;
00350       return 3;
00351     }
00352     string pathToConfig, dummy;
00353     istringstream istpath(line);
00354     istpath >> dummy >> pathToConfig;
00355     if( pathToConfig.empty() ) {
00356       cerr << "Path to classifier not specified in file "
00357            << fname.c_str() << endl;
00358     }
00359 
00360     // read designated classifier name
00361     nLine++;
00362     if( !getline(file,line) ) {
00363       cerr << "Cannot read line " << nLine 
00364            << " from file " << fname.c_str() << endl;
00365       return 3;
00366     }
00367     string subName;
00368     istringstream istname(line);
00369     istname >> dummy >> subName;
00370     if( subName.empty() ) {
00371       cout << "Name for classifier " << ic << " not specified." 
00372            << " Will use the default." << endl;
00373     }
00374 
00375     // read default value
00376     nLine++;
00377     if( !getline(file,line) ) {
00378       cerr << "Cannot read line " << nLine 
00379            << " from file " << fname.c_str() << endl;
00380       return 3;
00381     }
00382     double defaultValue = 0;
00383     istringstream istdefault(line);
00384     istdefault >> dummy >> defaultValue;
00385     cout << "Will use default response " << defaultValue 
00386          << " for classifier " << ic << endl;
00387 
00388     // read number of constraints
00389     nLine++;
00390     if( !getline(file,line) ) {
00391       cerr << "Cannot read line " << nLine 
00392            << " from file " << fname.c_str() << endl;
00393       return 3;
00394     }
00395     unsigned nConstraints = 0;
00396     istringstream istconst(line);
00397     istconst >> dummy >> nConstraints;
00398     cout << "Will use " << nConstraints << " constraints "
00399          << "for classifier " << ic << endl;
00400 
00401     // read constraints
00402     map<string,SprCut> constraints;
00403     for( int j=0;j<nConstraints;j++ ) {
00404       nLine++;
00405       if( !getline(file,line) ) {
00406         cerr << "Cannot read line " << nLine 
00407              << " from file " << fname.c_str() << endl;
00408         return 3;
00409       }
00410       istringstream ist(line);
00411       string varName;
00412       unsigned nCut = 0;
00413       ist >> varName >> nCut;
00414       if( varName.empty() ) {
00415         cerr << "Unable to read variable name on line " << nLine
00416              << " in file " << fname.c_str() << endl;
00417       }
00418       SprCut cut;
00419       double xa(0), xb(0);
00420       for( unsigned k=0;k<nCut;k++ ) {
00421         ist >> xa >> xb;
00422         cut.push_back(SprInterval(xa,xb));
00423       }
00424       cout << "Applying constraint on variable " << varName.c_str()
00425            << " for classifier " << ic << " : ";
00426       for( int k=0;k<cut.size();k++ ) 
00427         cout << cut[k].first << " " << cut[k].second << "   | ";
00428       cout << endl;
00429       constraints.insert(pair<const string,SprCut>(varName,cut));
00430     }
00431 
00432     // read classifier
00433     SprAbsTrainedClassifier* trained
00434       = SprClassifierReader::readTrained(pathToConfig.c_str(),verbose);
00435     if( trained == 0 ) {
00436       cerr << "Unable to read classifier configuration from file "
00437            << pathToConfig.c_str() << endl;
00438       return 3;
00439     }
00440     cout << "Read classifier " << trained->name().c_str()
00441          << " with dimensionality " << trained->dim() << endl;
00442 
00443     // get a list of trained variables
00444     vector<string> trainedVars;
00445     trained->vars(trainedVars);
00446     cout << "Variables:      " << endl;
00447     for( int j=0;j<trainedVars.size();j++ ) 
00448       cout << trainedVars[j].c_str() << " ";
00449     cout << endl;
00450 
00451     // add classifier to the combiner
00452     bool ownTrained = true;
00453     if( !combiner.addTrained(trained,subName.c_str(),constraints,
00454                              defaultValue,ownTrained) ) {
00455       cerr << "Unable to add trained classifier " << ic 
00456            << " to combiner." << endl;
00457       return 3;
00458     }
00459   }
00460 
00461   // close trained classifier list
00462   if( !combiner.closeClassifierList() ) {
00463     cerr << "Unable to close the trained classifier list for the combiner." 
00464          << endl;
00465     return 4;
00466   }
00467   SprEmptyFilter* features = combiner.features();
00468 
00469   //
00470   // read trainable classifier config
00471   //
00472   ifstream file(configFile.c_str());
00473   if( !file ) {
00474     cerr << "Unable to open file " << configFile.c_str() << endl;
00475     return 5;
00476   }
00477   cout << "Reading classifier configuration from file " 
00478        << configFile.c_str() << endl;
00479   unsigned nLine = 0;
00480   bool discreteTree = false;
00481   bool mixedNodesTree = false;
00482   bool fastSort = false;
00483   bool readOneEntry = true;
00484   vector<SprAbsTwoClassCriterion*> crits;
00485   vector<SprIntegerBootstrap*> bstraps;
00486   vector<SprAbsClassifier*> destroyC;
00487   vector<SprCCPair> useC;
00488   if( !SprClassifierReader::readTrainableConfig(file,nLine,features,
00489                                                 discreteTree,mixedNodesTree,
00490                                                 fastSort,crits,
00491                                                 bstraps,destroyC,
00492                                                 useC,readOneEntry) ) {
00493     cerr << "Unable to read trainable classifier config from file " 
00494          << configFile.c_str() << endl;
00495     prepareExit(crits,bstraps,destroyC);
00496     return 5;
00497   }
00498   SprAbsClassifier* trainable = useC[0].first;
00499   cout << "Setting trainable classifier for combiner to " 
00500        << trainable->name() << endl;
00501   combiner.setTrainable(trainable);
00502 
00503   // make per-event loss
00504   auto_ptr<SprAverageLoss> loss;
00505   if( valFilter.get()!=0 && valPrint>0 ) {
00506     string trainableName = trainable->name();
00507     if( trainableName=="AdaBoost" || trainableName=="Bagger"
00508         || trainableName=="ArcE4" || trainableName=="StdBackprop" ) {
00509       cout << "For simplicity only quadratic loss can be displayed." << endl;
00510       if( trainableName=="AdaBoost" ) {
00511         loss.reset(new SprAverageLoss(&SprLoss::quadratic,
00512                                       &SprTransformation::logit));
00513       }
00514       else {
00515         loss.reset(new SprAverageLoss(&SprLoss::quadratic));
00516       }
00517       if( trainableName=="AdaBoost" ) {
00518         if( !static_cast<SprAdaBoost*>(trainable)
00519             ->setValidation(features,valPrint,loss.get()) ) {
00520           cerr << "Unable to set validation loss." << endl;
00521           return 6;
00522         }
00523       }
00524       else if( trainableName=="Bagger" || trainableName=="ArcE4" ) {
00525         if( !static_cast<SprBagger*>(trainable)
00526             ->setValidation(features,valPrint,0,loss.get()) ) {
00527           cerr << "Unable to set validation loss." << endl;
00528           return 6;
00529         }
00530       }
00531       else if( trainableName=="StdBackprop" ) {
00532         if( !static_cast<SprStdBackprop*>(trainable)
00533             ->setValidation(features,valPrint,loss.get()) ) {
00534           cerr << "Unable to set validation loss." << endl;
00535           return 6;
00536         }
00537       }
00538     }
00539   }
00540 
00541   // train
00542   if( !combiner.train(verbose) ) {
00543     cerr << "Combiner finished with error." << endl;
00544     prepareExit(crits,bstraps,destroyC);
00545     return 7;
00546   }
00547 
00548   // save trained combiner
00549   if( !outFile.empty() ) {
00550     if( !combiner.store(outFile.c_str()) ) {
00551       cerr << "Cannot store Combiner to file " << outFile.c_str() << endl;
00552       prepareExit(crits,bstraps,destroyC);
00553       return 8;
00554     }
00555   }
00556 
00557   // exit
00558   prepareExit(crits,bstraps,destroyC);
00559   return 0;
00560 }

void prepareExit ( vector< SprAbsTwoClassCriterion * > &  criteria,
vector< SprIntegerBootstrap * > &  bstraps,
vector< SprAbsClassifier * > &  classifiers 
)

Definition at line 68 of file SprCombinerApp.cc.

References i.

00071 {
00072   for( int i=0;i<criteria.size();i++ ) delete criteria[i];
00073   for( int i=0;i<classifiers.size();i++ ) delete classifiers[i];
00074   for( int i=0;i<bstraps.size();i++ ) delete bstraps[i];
00075 }


Generated on Tue Jun 9 17:55:00 2009 for CMSSW by  doxygen 1.5.4