CMS 3D CMS Logo

SprOutputAnalyzerApp.cc

Go to the documentation of this file.
00001 //$Id: SprOutputAnalyzerApp.cc,v 1.4 2007/12/01 01:29:41 narsky Exp $
00002 /*
00003   This executable for analysis of output ascii files produced by 
00004   a classifier. It lets the user quickly estimate fractions of 
00005   surviving background given the signal efficiency.
00006 
00007   The executable lets you choose between looking at fractional
00008   efficiency (default) and absolute event weights (-W option). If -W
00009   option is chosen, you can adjust the weights of signal and
00010   background events on the fly using -s and -b command-line
00011   options. These simply multiply the weights by the factor you
00012   provide.
00013 
00014   -c lets you look at a specific FOM as a function of the signal and
00015   background efficiencies. This option will only work if -W is
00016   specified.
00017 
00018   Here is, for example, what I have been doing for the Knunu analysis:
00019 
00020   SprOutputAnalyzerApp -y '.:1' -C bag -W -c 9 -s 0.00032 -b 2 save.out
00021 
00022   That is, I trained random forest and then used -o option of
00023   SprBaggerDecisionTreeApp to compute classifier output for the
00024   validation set. SprBaggerDecisionTreeApp executable names random
00025   forest 'bag' and saves the values in the column with a corresponding
00026   name. 0.00032 is the factor by which signal events in the validation
00027   sample are multiplied to scale them to the actual number expected in
00028   the Runs 1-4 data and 2 is the corresponding factor for the
00029   background. I want to monitor Punzi FOM. In the end, I get something
00030   like this:
00031 
00032 Input signal weights for which background will be estimated [ 1.26 ] 0.9 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3 3.1 3.2 3.3 3.4
00033 ===========================================
00034 Table of surviving background fractions (* shows minimal value in a row; FOM shown in parentheses)
00035 ===========================================
00036 Signal Eff   \  Classifiers  |                                          bag |
00037 -----------------------------------------------------------------------------
00038               0.9000         | *     7.84885 +-      1.90363 (     0.20923) |
00039               1.0000         | *     9.15750 +-      2.15844 (     0.22094) |
00040               1.1000         | *     9.97107 +-      2.22960 (     0.23617) |
00041               1.2000         | *    12.98021 +-      2.54563 (     0.23516) |
00042               1.3000         | *    14.69828 +-      2.77771 (     0.24373) |
00043               1.4000         | *    17.63934 +-      3.07061 (     0.24562) |
00044               1.5000         | *    18.86761 +-      3.14460 (     0.25669) |
00045               1.6000         | *    24.41848 +-      3.68122 (     0.24839) |
00046               1.7000         | *    26.54597 +-      3.87213 (     0.25555) |
00047               1.8000         | *    29.81607 +-      4.02040 (     0.25861) |
00048               1.9000         | *    32.26733 +-      4.13141 (     0.26461) |
00049               2.0000         | *    33.90502 +-      4.20540 (     0.27312) |
00050               2.1000         | *    38.56942 +-      4.54545 (     0.27236) |
00051               2.2000         | *    44.02982 +-      4.80405 (     0.27042) |
00052               2.3000         | *    49.83156 +-      5.08591 (     0.26872) |
00053               2.4000         | *    56.12838 +-      5.42614 (     0.26691) |
00054               2.5000         | *    63.30685 +-      5.75517 (     0.26437) |
00055               2.6000         | *    77.99720 +-      6.41133 (     0.25166) |
00056               2.7000         | *    87.12832 +-      6.76247 (     0.24921) |
00057               2.8000         | *    98.63033 +-      7.11803 (     0.24494) |
00058               2.9000         | *   115.13059 +-      7.65837 (     0.23712) |
00059               3.0000         | *   133.51367 +-      8.23281 (     0.22980) |
00060 
00061   The 1st column shows the expected signal contribution (normalized to
00062   the luminosity in the data using the scale factor I provided), the
00063   2nd column shows expected background contribution with errors. The
00064   number in parentheses are the cut on the classifier output and
00065   monitored FOM.
00066 */
00067 
00068 #include "PhysicsTools/StatPatternRecognition/interface/SprExperiment.hh"
00069 #include "PhysicsTools/StatPatternRecognition/interface/SprClass.hh"
00070 #include "PhysicsTools/StatPatternRecognition/interface/SprStringParser.hh"
00071 #include "PhysicsTools/StatPatternRecognition/interface/SprAbsFilter.hh"
00072 #include "PhysicsTools/StatPatternRecognition/interface/SprPlotter.hh"
00073 #include "PhysicsTools/StatPatternRecognition/interface/SprAbsTwoClassCriterion.hh"
00074 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassSignalSignif.hh"
00075 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassIDFraction.hh"
00076 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassTaggerEff.hh"
00077 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassPurity.hh"
00078 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassGiniIndex.hh"
00079 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassCrossEntropy.hh"
00080 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassUniformPriorUL90.hh"
00081 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassBKDiscovery.hh"
00082 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassPunzi.hh"
00083 
00084 #include <stdlib.h>
00085 #include <unistd.h>
00086 #include <errno.h>
00087 #include <sys/types.h>
00088 #include <sys/stat.h>
00089 #include <cassert>
00090 #include <iostream>
00091 #include <fstream>
00092 #include <sstream>
00093 #include <map>
00094 #include <vector>
00095 #include <utility>
00096 #include <set>
00097 #include <string>
00098 #include <memory>
00099 #include <iomanip>
00100 #include <algorithm>
00101 #include <cmath>
00102 
00103 using namespace std;
00104 
00105 
00106 void help(const char* prog)
00107 {
00108   cout << "Usage:  " << prog
00109        << " ascii_file_to_analyze" << endl;
00110   cout << "\t Options: " << endl;
00111   cout << "\t-h --- help                                        " << endl;
00112   cout << "\t-y list of input classes (see SprAbsFilter.hh)     " << endl;
00113   cout << "\t-C list of classifier names (in quotes with commas)" << endl;
00114   cout << "\t-v verbose level (0=silent default,1,2)            " << endl;
00115   cout << "\t-W display absolute weights (def=relative effic-cy)" << endl;
00116   cout << "\t-s scale all signal weights by this factor         " << endl;
00117   cout << "\t-b scale all background weights by this factor     " << endl;
00118   cout << "\t-c criterion for optimization                      " << endl;
00119   cout << "\t\t 1 = correctly classified fraction               " << endl;
00120   cout << "\t\t 2 = signal significance s/sqrt(s+b)             " << endl;
00121   cout << "\t\t 3 = purity s/(s+b)                              " << endl;
00122   cout << "\t\t 4 = tagger efficiency Q                         " << endl;
00123   cout << "\t\t 5 = Gini index (default)                        " << endl;
00124   cout << "\t\t 6 = cross-entropy                               " << endl;
00125   cout << "\t\t 7 = 90% Bayesian upper limit with uniform prior " << endl;
00126   cout << "\t\t 8 = discovery potential 2*(sqrt(s+b)-sqrt(b))   " << endl;
00127   cout << "\t\t 9 = Punzi's sensitivity s/(0.5*nSigma+sqrt(b))  " << endl;
00128   cout << "\t-n do not show computed cut and FOM for more compact output" 
00129        << endl;
00130 }
00131 
00132 
00133 bool answerYN(const char* question)
00134 {
00135   cout << question << " y/n [y] ";
00136   char yn [2];
00137   cin.getline(yn,2,'\n');
00138   if( yn[0]=='\0' || yn[0]=='y' ) return true;
00139   return false;
00140 }
00141 
00142 
00143 int main(int argc, char ** argv)
00144 {
00145   // check command line
00146   if( argc < 2 ) {
00147     help(argv[0]);
00148     return 1;
00149   }
00150 
00151   // init
00152   bool useAbsolute = false;
00153   int verbose = 0;
00154   string inputClassesString;
00155   string inputClassifiersString;
00156   int iCrit = 5;
00157   double sW = 1.;
00158   double bW = 1.;
00159   bool showCutAndFOM = true;
00160 
00161   // decode command line
00162   int c;
00163   extern char* optarg;
00164   //  extern int optind;
00165   while( (c = getopt(argc,argv,"hy:C:v:Ws:b:c:n")) != EOF ) {
00166     switch( c )
00167       {
00168       case 'h' :
00169         help(argv[0]);
00170         return 1;
00171       case 'y' :
00172         inputClassesString = optarg;
00173         break;
00174       case 'C' :
00175         inputClassifiersString = optarg;
00176         break;
00177       case 'v' :
00178         verbose = (optarg==0 ? 0 : atoi(optarg));
00179         break;
00180       case 'W' :
00181         useAbsolute = true;
00182         break;
00183       case 's' :
00184         sW = (optarg==0 ? 1 : atof(optarg));
00185         break;
00186       case 'b' :
00187         bW = (optarg==0 ? 1 : atof(optarg));
00188         break;
00189       case 'c' :
00190         iCrit = (optarg==0 ? 5 : atoi(optarg));
00191         break;
00192       case 'n' :
00193         showCutAndFOM = false;
00194         break;
00195       }
00196   }
00197 
00198   // There has to be 1 argument after all options.
00199   string analyzeFile = argv[argc-1];
00200   if( analyzeFile.empty() ) {
00201     cerr << "No input file is specified." << endl;
00202     return 1;
00203   }
00204 
00205   // Prepare classifier list.
00206   vector<vector<string> > getClassifiers;
00207   SprStringParser::parseToStrings(inputClassifiersString.c_str(),
00208                                   getClassifiers);
00209   if( getClassifiers.empty() ) {
00210     cerr << "Unable to parse input classifier names." << endl;
00211     return 1;
00212   }
00213   vector<string> classifiers = getClassifiers[0];
00214 
00215   // Prepare class list.
00216   vector<SprClass> classes;
00217   if( !SprAbsFilter::decodeClassString(inputClassesString.c_str(),classes) ) {
00218     cerr << "Unable to decode classes from string " 
00219          << inputClassesString << endl;
00220     return 1;
00221   }
00222  
00223   // make optimization criterion
00224   auto_ptr<SprAbsTwoClassCriterion> crit;
00225   switch( iCrit )
00226     {
00227     case 1 :
00228       crit.reset(new SprTwoClassIDFraction);
00229       cout << "Optimization criterion set to "
00230            << "Fraction of correctly classified events " << endl;
00231       break;
00232     case 2 :
00233       crit.reset(new SprTwoClassSignalSignif);
00234       cout << "Optimization criterion set to "
00235            << "Signal significance S/sqrt(S+B) " << endl;
00236       break;
00237     case 3 :
00238       crit.reset(new SprTwoClassPurity);
00239       cout << "Optimization criterion set to "
00240            << "Purity S/(S+B) " << endl;
00241       break;
00242     case 4 :
00243       crit.reset(new SprTwoClassTaggerEff);
00244       cout << "Optimization criterion set to "
00245            << "Tagging efficiency Q = e*(1-2w)^2 " << endl;
00246       break;
00247     case 5 :
00248       crit.reset(new SprTwoClassGiniIndex);
00249       cout << "Optimization criterion set to "
00250            << "Gini index  -1+p^2+q^2 " << endl;
00251       break;
00252     case 6 :
00253       crit.reset(new SprTwoClassCrossEntropy);
00254       cout << "Optimization criterion set to "
00255            << "Cross-entropy p*log(p)+q*log(q) " << endl;
00256       break;
00257     case 7 :
00258       crit.reset(new SprTwoClassUniformPriorUL90);
00259       cout << "Optimization criterion set to "
00260            << "Inverse of 90% Bayesian upper limit with uniform prior" << endl;
00261       break;
00262     case 8 :
00263       crit.reset(new SprTwoClassBKDiscovery);
00264       cout << "Optimization criterion set to "
00265            << "Discovery potential 2*(sqrt(S+B)-sqrt(B))" << endl;
00266       break;
00267     case 9 :
00268       crit.reset(new SprTwoClassPunzi(1.));
00269       cout << "Optimization criterion set to "
00270            << "Punzi's sensitivity S/(0.5*nSigma+sqrt(B))" << endl;
00271       break;
00272     default :
00273       cerr << "Unable to make initialization criterion." << endl;
00274       return 1;
00275     }
00276 
00277   // open the analyzed file and read it
00278   ifstream file(analyzeFile.c_str());
00279   if( !file ) {
00280     cerr << "Unable to open file " << analyzeFile.c_str() << endl;
00281     return 2;
00282   }
00283 
00284   // read the header
00285   string line;
00286   getline(file,line,'\n');
00287   if( line.empty() ) {
00288     cerr << "Unable to read top line from file " 
00289          << analyzeFile.c_str() << endl;
00290     return 3;
00291   }
00292   istringstream str(line);
00293   string dummy;
00294   unsigned nRead = 1;
00295   set<string> foundClassifiers;
00296   vector<pair<unsigned,string> > position;
00297   while( str >> dummy ) {
00298     // check correct format
00299     if( nRead==1 && dummy!="index" ) {
00300       cerr << "Incorrect top line format in position " << nRead << endl;
00301       return 3;
00302     }
00303     if( nRead==2 && dummy!="i" ) {
00304       cerr << "Incorrect top line format in position " << nRead << endl;
00305       return 3;
00306     }
00307     if( nRead==3 && dummy!="w" ) {
00308       cerr << "Incorrect top line format in position " << nRead << endl;
00309       return 3;
00310     }
00311 
00312     // find the input name among classifier list
00313     if( find(classifiers.begin(),classifiers.end(),dummy) 
00314         != classifiers.end() ) {
00315       cout << "Output of classifier \"" << dummy.c_str() 
00316            << "\" found in position " << nRead << endl;
00317       set<string>::const_iterator found = foundClassifiers.find(dummy);
00318       if( found != foundClassifiers.end() ) {
00319         cerr << "Classifier " << dummy.c_str() 
00320              << " is already in the list. Skipping..." << endl;
00321       }
00322       else {
00323         position.push_back(pair<unsigned,string>(nRead,dummy));
00324       }
00325     }
00326 
00327     // increment record number
00328     ++nRead;
00329   }
00330 
00331   // sanity check
00332   if( position.empty() ) {
00333     cerr << "Specified classifiers not found in the input file." << endl;
00334     return 4;
00335   }
00336 
00337   // read lines and fill out responses
00338   vector<SprPlotter::Response> responses;
00339   while( getline(file,line) ) {
00340     int index(0), icls(0);
00341     double weight(0);
00342     istringstream event(line);
00343     event >> index >> icls >> weight;
00344 
00345     // decode input class
00346     int cls = -1;
00347     if(      icls == classes[0] )
00348       cls = 0;
00349     else if( icls == classes[1] )
00350       cls = 1;
00351     else
00352       continue;
00353 
00354     // fill out response
00355     SprPlotter::Response response(cls,weight);
00356     unsigned istart = 4;
00357     double fread = 0;
00358     for( int i=0;i<position.size();i++ ) {
00359       unsigned nRead = position[i].first;
00360       string classifier = position[i].second;
00361       for( int ipos=istart;ipos<nRead;ipos++ ) event >> fread;// rewind forward
00362       event >> fread;
00363       response.set(classifier.c_str(),fread);
00364       istart = nRead + 1;
00365     }
00366     
00367     // record response
00368     responses.push_back(response);
00369   }
00370 
00371   // sanity check
00372   if( responses.empty() ) {
00373     cerr << "Did not find any stored classifier responses." << endl;
00374     return 5;
00375   }
00376 
00377   // supply input vectors to the plotter
00378   SprPlotter plotter(responses);
00379   if( useAbsolute ) {
00380     plotter.useAbsolute();
00381     if( !plotter.setScaleFactors(sW,bW) ) {
00382       cerr << "Unable to set the scaling factors for the plotter." << endl;
00383       return 7;
00384     }
00385   }
00386   plotter.setCrit(crit.get());
00387 
00388   // print-out
00389   cout << "Read " << responses.size() 
00390        << " points from input file "<< analyzeFile.c_str() 
00391        << "   Bgrnd Nevents=" << plotter.bgrndNevts()
00392        << "   Bgrnd Weight=" << plotter.bgrndWeight()
00393        << "   Signal Nevents=" << plotter.signalNevts() 
00394        << "   Signal Weight=" << plotter.signalWeight() 
00395        << endl;
00396 
00397   // create vector of signal efficiencies
00398   vector<double> effS;
00399   if( useAbsolute ) {
00400     effS.push_back(1);
00401     effS.push_back(10);
00402     effS.push_back(100);
00403   }
00404   else {
00405     effS.push_back(0.1);
00406     effS.push_back(0.2);
00407     effS.push_back(0.3);
00408     effS.push_back(0.4);
00409     effS.push_back(0.5);
00410     effS.push_back(0.6);
00411     effS.push_back(0.7);
00412     effS.push_back(0.8);
00413     effS.push_back(0.9);
00414   }
00415 
00416   //
00417   // start loop over processing response values
00418   //
00419   while( true ) {
00420 
00421     // read efficiency values
00422     if( useAbsolute ) {
00423       cout << "Input signal weights for which background "
00424            << "will be estimated [ ";
00425     }
00426     else {
00427       cout << "Input signal efficiency values for which background "
00428            << "will be estimated [ ";
00429     }
00430     for( int i=0;i<effS.size();i++ )
00431       cout << effS[i] << " ";
00432     cout << "] ";
00433     string line;
00434     getline(cin,line,'\n');
00435     if( !line.empty() ) {
00436       effS.clear();
00437       istringstream str(line);
00438       double dummy = 0;
00439       while( str >> dummy ) effS.push_back(dummy);
00440     }
00441     if( effS.empty() ) {
00442       cerr << "What, are you trying to be cute? Enter values." << endl;
00443       return 6;
00444     }
00445     stable_sort(effS.begin(),effS.end());
00446     
00447     //
00448     // Estimate background fractions for these signal efficiencies.
00449     //
00450     map<string,vector<SprPlotter::FigureOfMerit> > effB;
00451     
00452     for( int iclassifier=0;iclassifier<position.size();iclassifier++ ) {
00453       const string classifier = position[iclassifier].second;
00454       
00455       // insert into map
00456       pair<map<string,vector<SprPlotter::FigureOfMerit> >::iterator,bool> 
00457         inserted = effB.insert(pair<const string,
00458                                vector<SprPlotter::FigureOfMerit> >(classifier,
00459                                       vector<SprPlotter::FigureOfMerit>()));
00460       assert( inserted.second );
00461     
00462       // process
00463       if( !plotter.backgroundCurve(effS,
00464                                    classifier.c_str(),
00465                                    inserted.first->second) ) {
00466         cerr << "Unable to compute the efficiency curve " 
00467              << "for classifier " << classifier.c_str() << endl;
00468         continue;
00469       }
00470     }// end of loop over classifiers
00471 
00472   
00473     //
00474     // make a table of signal and background efficiencies
00475     //
00476     cout << "===========================================" << endl;
00477     cout << "Table of surviving background fractions"
00478          << " (* shows minimal value in a row; " 
00479          << "Cut on classifier output and FOM are shown in parentheses)" 
00480          << endl;
00481     cout << "===========================================" << endl;
00482     char s[200];
00483     sprintf(s,"Signal Eff   \\  Classifiers  |");
00484     cout << s;
00485     string temp = "------------------------------";
00486     for( map<string,vector<SprPlotter::FigureOfMerit> >::const_iterator
00487            iter=effB.begin();iter!=effB.end();iter++ ) {
00488       if( showCutAndFOM )
00489         sprintf(s," %65s |",iter->first.c_str());
00490       else
00491         sprintf(s," %29s |",iter->first.c_str());
00492       cout << s;
00493       if( showCutAndFOM )
00494         temp += "--------------------------------------------------------------------";
00495       else
00496         temp += "--------------------------------";
00497     }
00498     cout << endl;
00499     cout << temp.c_str() << endl;
00500     for( int i=0;i<effS.size();i++ ) {
00501       sprintf(s,"          %10.4f         |",effS[i]);
00502       cout << s;
00503       vector<string> names;
00504       vector<double> cuts;
00505       vector<double> values;
00506       vector<double> errors;
00507       vector<double> fom;
00508       for( map<string,vector<SprPlotter::FigureOfMerit> >::const_iterator
00509              iter=effB.begin();iter!=effB.end();iter++ ) {
00510         names.push_back(iter->first);
00511         cuts.push_back(iter->second[i].lowerBound);
00512         double value = iter->second[i].bgrWeight;
00513         values.push_back(value);
00514         unsigned nevts = iter->second[i].bgrNevts;
00515         errors.push_back(( nevts>0 ? value/sqrt(double(nevts)) : 0 ));
00516         fom.push_back(iter->second[i].fom);
00517       }
00518       int foundMin 
00519         = min_element(values.begin(),values.end()) - values.begin();
00520       for( int j=0;j<names.size();j++ ) {
00521         if( showCutAndFOM ) {
00522           if( j == foundMin )
00523             sprintf(s," *%12.5f +- %12.5f (Cut=%12.5f FOM=%12.5f) |",
00524                     values[j],errors[j],cuts[j],fom[j]);
00525           else
00526             sprintf(s,"  %12.5f +- %12.5f (Cut=%12.5f FOM=%12.5f) |",
00527                     values[j],errors[j],cuts[j],fom[j]);
00528         }
00529         else {
00530           if( j == foundMin )
00531             sprintf(s," *%12.5f +- %12.5f |",values[j],errors[j]);
00532           else
00533             sprintf(s,"  %12.5f +- %12.5f |",values[j],errors[j]);
00534         }
00535         cout << s;
00536       }
00537       cout << endl;
00538     }
00539     cout << "===========================================" << endl;
00540   
00541     //
00542     // make a histogram for the requested classifier
00543     //
00544     if( answerYN("Histogram classifier output?") ) {
00545       
00546       // user input
00547       cout << "Input classifier name, low and upper limits, and step: "
00548            << "(Example: bag 0 1 0.1) ----> ";
00549       string line;
00550       getline(cin,line,'\n');
00551       if( line.empty() ) {
00552         cerr << "No values given. Exit to main loop." << endl;
00553         continue;
00554       }
00555       istringstream str(line);
00556       string classifier;
00557       double xlo(0), xhi(0), dx(0);
00558       str >> classifier >> xlo >> xhi >> dx;
00559       if( classifier.empty() || dx<=0. || xlo>=xhi ) {
00560         cerr << "Incorrect parameters given. Exit to main loop." << endl;
00561         continue;
00562       }
00563       
00564       // make histograms
00565       vector<pair<double,double> > shist, bhist;
00566       int nbin = plotter.histogram(classifier.c_str(),xlo,xhi,dx,shist,bhist);
00567       if( nbin < 1 ) {
00568         cerr << "Unable to make histogram for classifier " 
00569              << classifier.c_str() << endl;
00570         continue;
00571       }
00572       
00573       // print out
00574       cout << "===========================================" << endl;
00575       cout << "Histogram for output of classifier " 
00576            << classifier.c_str() << endl;
00577       char s[200];
00578       cout << "Xlo=" << xlo << " Xhi=" << xhi 
00579            << " dX=" << dx << " Nbin=" << nbin << endl;
00580       sprintf(s," %14s | %30s | %30s |","Bin center","Signal","Background");
00581       cout << s << endl;
00582       for( int i=0;i<nbin;i++ ) {
00583         double x = xlo + (i+0.5)*dx;
00584         sprintf(s," %12.5f   |   %12.5f +- %12.5f |   %12.5f +- %12.5f |",x,
00585                 shist[i].first,shist[i].second,
00586                 bhist[i].first,bhist[i].second);
00587         cout << s << endl;
00588       }
00589       cout << "===========================================" << endl;
00590     }// end histogram
00591     
00592     // exit?
00593     if( !answerYN("Continue?") ) break;
00594   }
00595     
00596   // exit
00597   return 0;
00598 }

Generated on Tue Jun 9 17:41:59 2009 for CMSSW by  doxygen 1.5.4