00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068 #include "PhysicsTools/StatPatternRecognition/interface/SprExperiment.hh"
00069 #include "PhysicsTools/StatPatternRecognition/interface/SprClass.hh"
00070 #include "PhysicsTools/StatPatternRecognition/interface/SprStringParser.hh"
00071 #include "PhysicsTools/StatPatternRecognition/interface/SprAbsFilter.hh"
00072 #include "PhysicsTools/StatPatternRecognition/interface/SprPlotter.hh"
00073 #include "PhysicsTools/StatPatternRecognition/interface/SprAbsTwoClassCriterion.hh"
00074 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassSignalSignif.hh"
00075 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassIDFraction.hh"
00076 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassTaggerEff.hh"
00077 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassPurity.hh"
00078 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassGiniIndex.hh"
00079 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassCrossEntropy.hh"
00080 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassUniformPriorUL90.hh"
00081 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassBKDiscovery.hh"
00082 #include "PhysicsTools/StatPatternRecognition/interface/SprTwoClassPunzi.hh"
00083
00084 #include <stdlib.h>
00085 #include <unistd.h>
00086 #include <errno.h>
00087 #include <sys/types.h>
00088 #include <sys/stat.h>
00089 #include <cassert>
00090 #include <iostream>
00091 #include <fstream>
00092 #include <sstream>
00093 #include <map>
00094 #include <vector>
00095 #include <utility>
00096 #include <set>
00097 #include <string>
00098 #include <memory>
00099 #include <iomanip>
00100 #include <algorithm>
00101 #include <cmath>
00102
00103 using namespace std;
00104
00105
00106 void help(const char* prog)
00107 {
00108 cout << "Usage: " << prog
00109 << " ascii_file_to_analyze" << endl;
00110 cout << "\t Options: " << endl;
00111 cout << "\t-h --- help " << endl;
00112 cout << "\t-y list of input classes (see SprAbsFilter.hh) " << endl;
00113 cout << "\t-C list of classifier names (in quotes with commas)" << endl;
00114 cout << "\t-v verbose level (0=silent default,1,2) " << endl;
00115 cout << "\t-W display absolute weights (def=relative effic-cy)" << endl;
00116 cout << "\t-s scale all signal weights by this factor " << endl;
00117 cout << "\t-b scale all background weights by this factor " << endl;
00118 cout << "\t-c criterion for optimization " << endl;
00119 cout << "\t\t 1 = correctly classified fraction " << endl;
00120 cout << "\t\t 2 = signal significance s/sqrt(s+b) " << endl;
00121 cout << "\t\t 3 = purity s/(s+b) " << endl;
00122 cout << "\t\t 4 = tagger efficiency Q " << endl;
00123 cout << "\t\t 5 = Gini index (default) " << endl;
00124 cout << "\t\t 6 = cross-entropy " << endl;
00125 cout << "\t\t 7 = 90% Bayesian upper limit with uniform prior " << endl;
00126 cout << "\t\t 8 = discovery potential 2*(sqrt(s+b)-sqrt(b)) " << endl;
00127 cout << "\t\t 9 = Punzi's sensitivity s/(0.5*nSigma+sqrt(b)) " << endl;
00128 cout << "\t-n do not show computed cut and FOM for more compact output"
00129 << endl;
00130 }
00131
00132
00133 bool answerYN(const char* question)
00134 {
00135 cout << question << " y/n [y] ";
00136 char yn [2];
00137 cin.getline(yn,2,'\n');
00138 if( yn[0]=='\0' || yn[0]=='y' ) return true;
00139 return false;
00140 }
00141
00142
00143 int main(int argc, char ** argv)
00144 {
00145
00146 if( argc < 2 ) {
00147 help(argv[0]);
00148 return 1;
00149 }
00150
00151
00152 bool useAbsolute = false;
00153 int verbose = 0;
00154 string inputClassesString;
00155 string inputClassifiersString;
00156 int iCrit = 5;
00157 double sW = 1.;
00158 double bW = 1.;
00159 bool showCutAndFOM = true;
00160
00161
00162 int c;
00163 extern char* optarg;
00164
00165 while( (c = getopt(argc,argv,"hy:C:v:Ws:b:c:n")) != EOF ) {
00166 switch( c )
00167 {
00168 case 'h' :
00169 help(argv[0]);
00170 return 1;
00171 case 'y' :
00172 inputClassesString = optarg;
00173 break;
00174 case 'C' :
00175 inputClassifiersString = optarg;
00176 break;
00177 case 'v' :
00178 verbose = (optarg==0 ? 0 : atoi(optarg));
00179 break;
00180 case 'W' :
00181 useAbsolute = true;
00182 break;
00183 case 's' :
00184 sW = (optarg==0 ? 1 : atof(optarg));
00185 break;
00186 case 'b' :
00187 bW = (optarg==0 ? 1 : atof(optarg));
00188 break;
00189 case 'c' :
00190 iCrit = (optarg==0 ? 5 : atoi(optarg));
00191 break;
00192 case 'n' :
00193 showCutAndFOM = false;
00194 break;
00195 }
00196 }
00197
00198
00199 string analyzeFile = argv[argc-1];
00200 if( analyzeFile.empty() ) {
00201 cerr << "No input file is specified." << endl;
00202 return 1;
00203 }
00204
00205
00206 vector<vector<string> > getClassifiers;
00207 SprStringParser::parseToStrings(inputClassifiersString.c_str(),
00208 getClassifiers);
00209 if( getClassifiers.empty() ) {
00210 cerr << "Unable to parse input classifier names." << endl;
00211 return 1;
00212 }
00213 vector<string> classifiers = getClassifiers[0];
00214
00215
00216 vector<SprClass> classes;
00217 if( !SprAbsFilter::decodeClassString(inputClassesString.c_str(),classes) ) {
00218 cerr << "Unable to decode classes from string "
00219 << inputClassesString << endl;
00220 return 1;
00221 }
00222
00223
00224 auto_ptr<SprAbsTwoClassCriterion> crit;
00225 switch( iCrit )
00226 {
00227 case 1 :
00228 crit.reset(new SprTwoClassIDFraction);
00229 cout << "Optimization criterion set to "
00230 << "Fraction of correctly classified events " << endl;
00231 break;
00232 case 2 :
00233 crit.reset(new SprTwoClassSignalSignif);
00234 cout << "Optimization criterion set to "
00235 << "Signal significance S/sqrt(S+B) " << endl;
00236 break;
00237 case 3 :
00238 crit.reset(new SprTwoClassPurity);
00239 cout << "Optimization criterion set to "
00240 << "Purity S/(S+B) " << endl;
00241 break;
00242 case 4 :
00243 crit.reset(new SprTwoClassTaggerEff);
00244 cout << "Optimization criterion set to "
00245 << "Tagging efficiency Q = e*(1-2w)^2 " << endl;
00246 break;
00247 case 5 :
00248 crit.reset(new SprTwoClassGiniIndex);
00249 cout << "Optimization criterion set to "
00250 << "Gini index -1+p^2+q^2 " << endl;
00251 break;
00252 case 6 :
00253 crit.reset(new SprTwoClassCrossEntropy);
00254 cout << "Optimization criterion set to "
00255 << "Cross-entropy p*log(p)+q*log(q) " << endl;
00256 break;
00257 case 7 :
00258 crit.reset(new SprTwoClassUniformPriorUL90);
00259 cout << "Optimization criterion set to "
00260 << "Inverse of 90% Bayesian upper limit with uniform prior" << endl;
00261 break;
00262 case 8 :
00263 crit.reset(new SprTwoClassBKDiscovery);
00264 cout << "Optimization criterion set to "
00265 << "Discovery potential 2*(sqrt(S+B)-sqrt(B))" << endl;
00266 break;
00267 case 9 :
00268 crit.reset(new SprTwoClassPunzi(1.));
00269 cout << "Optimization criterion set to "
00270 << "Punzi's sensitivity S/(0.5*nSigma+sqrt(B))" << endl;
00271 break;
00272 default :
00273 cerr << "Unable to make initialization criterion." << endl;
00274 return 1;
00275 }
00276
00277
00278 ifstream file(analyzeFile.c_str());
00279 if( !file ) {
00280 cerr << "Unable to open file " << analyzeFile.c_str() << endl;
00281 return 2;
00282 }
00283
00284
00285 string line;
00286 getline(file,line,'\n');
00287 if( line.empty() ) {
00288 cerr << "Unable to read top line from file "
00289 << analyzeFile.c_str() << endl;
00290 return 3;
00291 }
00292 istringstream str(line);
00293 string dummy;
00294 unsigned nRead = 1;
00295 set<string> foundClassifiers;
00296 vector<pair<unsigned,string> > position;
00297 while( str >> dummy ) {
00298
00299 if( nRead==1 && dummy!="index" ) {
00300 cerr << "Incorrect top line format in position " << nRead << endl;
00301 return 3;
00302 }
00303 if( nRead==2 && dummy!="i" ) {
00304 cerr << "Incorrect top line format in position " << nRead << endl;
00305 return 3;
00306 }
00307 if( nRead==3 && dummy!="w" ) {
00308 cerr << "Incorrect top line format in position " << nRead << endl;
00309 return 3;
00310 }
00311
00312
00313 if( find(classifiers.begin(),classifiers.end(),dummy)
00314 != classifiers.end() ) {
00315 cout << "Output of classifier \"" << dummy.c_str()
00316 << "\" found in position " << nRead << endl;
00317 set<string>::const_iterator found = foundClassifiers.find(dummy);
00318 if( found != foundClassifiers.end() ) {
00319 cerr << "Classifier " << dummy.c_str()
00320 << " is already in the list. Skipping..." << endl;
00321 }
00322 else {
00323 position.push_back(pair<unsigned,string>(nRead,dummy));
00324 }
00325 }
00326
00327
00328 ++nRead;
00329 }
00330
00331
00332 if( position.empty() ) {
00333 cerr << "Specified classifiers not found in the input file." << endl;
00334 return 4;
00335 }
00336
00337
00338 vector<SprPlotter::Response> responses;
00339 while( getline(file,line) ) {
00340 int index(0), icls(0);
00341 double weight(0);
00342 istringstream event(line);
00343 event >> index >> icls >> weight;
00344
00345
00346 int cls = -1;
00347 if( icls == classes[0] )
00348 cls = 0;
00349 else if( icls == classes[1] )
00350 cls = 1;
00351 else
00352 continue;
00353
00354
00355 SprPlotter::Response response(cls,weight);
00356 unsigned istart = 4;
00357 double fread = 0;
00358 for( int i=0;i<position.size();i++ ) {
00359 unsigned nRead = position[i].first;
00360 string classifier = position[i].second;
00361 for( int ipos=istart;ipos<nRead;ipos++ ) event >> fread;
00362 event >> fread;
00363 response.set(classifier.c_str(),fread);
00364 istart = nRead + 1;
00365 }
00366
00367
00368 responses.push_back(response);
00369 }
00370
00371
00372 if( responses.empty() ) {
00373 cerr << "Did not find any stored classifier responses." << endl;
00374 return 5;
00375 }
00376
00377
00378 SprPlotter plotter(responses);
00379 if( useAbsolute ) {
00380 plotter.useAbsolute();
00381 if( !plotter.setScaleFactors(sW,bW) ) {
00382 cerr << "Unable to set the scaling factors for the plotter." << endl;
00383 return 7;
00384 }
00385 }
00386 plotter.setCrit(crit.get());
00387
00388
00389 cout << "Read " << responses.size()
00390 << " points from input file "<< analyzeFile.c_str()
00391 << " Bgrnd Nevents=" << plotter.bgrndNevts()
00392 << " Bgrnd Weight=" << plotter.bgrndWeight()
00393 << " Signal Nevents=" << plotter.signalNevts()
00394 << " Signal Weight=" << plotter.signalWeight()
00395 << endl;
00396
00397
00398 vector<double> effS;
00399 if( useAbsolute ) {
00400 effS.push_back(1);
00401 effS.push_back(10);
00402 effS.push_back(100);
00403 }
00404 else {
00405 effS.push_back(0.1);
00406 effS.push_back(0.2);
00407 effS.push_back(0.3);
00408 effS.push_back(0.4);
00409 effS.push_back(0.5);
00410 effS.push_back(0.6);
00411 effS.push_back(0.7);
00412 effS.push_back(0.8);
00413 effS.push_back(0.9);
00414 }
00415
00416
00417
00418
00419 while( true ) {
00420
00421
00422 if( useAbsolute ) {
00423 cout << "Input signal weights for which background "
00424 << "will be estimated [ ";
00425 }
00426 else {
00427 cout << "Input signal efficiency values for which background "
00428 << "will be estimated [ ";
00429 }
00430 for( int i=0;i<effS.size();i++ )
00431 cout << effS[i] << " ";
00432 cout << "] ";
00433 string line;
00434 getline(cin,line,'\n');
00435 if( !line.empty() ) {
00436 effS.clear();
00437 istringstream str(line);
00438 double dummy = 0;
00439 while( str >> dummy ) effS.push_back(dummy);
00440 }
00441 if( effS.empty() ) {
00442 cerr << "What, are you trying to be cute? Enter values." << endl;
00443 return 6;
00444 }
00445 stable_sort(effS.begin(),effS.end());
00446
00447
00448
00449
00450 map<string,vector<SprPlotter::FigureOfMerit> > effB;
00451
00452 for( int iclassifier=0;iclassifier<position.size();iclassifier++ ) {
00453 const string classifier = position[iclassifier].second;
00454
00455
00456 pair<map<string,vector<SprPlotter::FigureOfMerit> >::iterator,bool>
00457 inserted = effB.insert(pair<const string,
00458 vector<SprPlotter::FigureOfMerit> >(classifier,
00459 vector<SprPlotter::FigureOfMerit>()));
00460 assert( inserted.second );
00461
00462
00463 if( !plotter.backgroundCurve(effS,
00464 classifier.c_str(),
00465 inserted.first->second) ) {
00466 cerr << "Unable to compute the efficiency curve "
00467 << "for classifier " << classifier.c_str() << endl;
00468 continue;
00469 }
00470 }
00471
00472
00473
00474
00475
00476 cout << "===========================================" << endl;
00477 cout << "Table of surviving background fractions"
00478 << " (* shows minimal value in a row; "
00479 << "Cut on classifier output and FOM are shown in parentheses)"
00480 << endl;
00481 cout << "===========================================" << endl;
00482 char s[200];
00483 sprintf(s,"Signal Eff \\ Classifiers |");
00484 cout << s;
00485 string temp = "------------------------------";
00486 for( map<string,vector<SprPlotter::FigureOfMerit> >::const_iterator
00487 iter=effB.begin();iter!=effB.end();iter++ ) {
00488 if( showCutAndFOM )
00489 sprintf(s," %65s |",iter->first.c_str());
00490 else
00491 sprintf(s," %29s |",iter->first.c_str());
00492 cout << s;
00493 if( showCutAndFOM )
00494 temp += "--------------------------------------------------------------------";
00495 else
00496 temp += "--------------------------------";
00497 }
00498 cout << endl;
00499 cout << temp.c_str() << endl;
00500 for( int i=0;i<effS.size();i++ ) {
00501 sprintf(s," %10.4f |",effS[i]);
00502 cout << s;
00503 vector<string> names;
00504 vector<double> cuts;
00505 vector<double> values;
00506 vector<double> errors;
00507 vector<double> fom;
00508 for( map<string,vector<SprPlotter::FigureOfMerit> >::const_iterator
00509 iter=effB.begin();iter!=effB.end();iter++ ) {
00510 names.push_back(iter->first);
00511 cuts.push_back(iter->second[i].lowerBound);
00512 double value = iter->second[i].bgrWeight;
00513 values.push_back(value);
00514 unsigned nevts = iter->second[i].bgrNevts;
00515 errors.push_back(( nevts>0 ? value/sqrt(double(nevts)) : 0 ));
00516 fom.push_back(iter->second[i].fom);
00517 }
00518 int foundMin
00519 = min_element(values.begin(),values.end()) - values.begin();
00520 for( int j=0;j<names.size();j++ ) {
00521 if( showCutAndFOM ) {
00522 if( j == foundMin )
00523 sprintf(s," *%12.5f +- %12.5f (Cut=%12.5f FOM=%12.5f) |",
00524 values[j],errors[j],cuts[j],fom[j]);
00525 else
00526 sprintf(s," %12.5f +- %12.5f (Cut=%12.5f FOM=%12.5f) |",
00527 values[j],errors[j],cuts[j],fom[j]);
00528 }
00529 else {
00530 if( j == foundMin )
00531 sprintf(s," *%12.5f +- %12.5f |",values[j],errors[j]);
00532 else
00533 sprintf(s," %12.5f +- %12.5f |",values[j],errors[j]);
00534 }
00535 cout << s;
00536 }
00537 cout << endl;
00538 }
00539 cout << "===========================================" << endl;
00540
00541
00542
00543
00544 if( answerYN("Histogram classifier output?") ) {
00545
00546
00547 cout << "Input classifier name, low and upper limits, and step: "
00548 << "(Example: bag 0 1 0.1) ----> ";
00549 string line;
00550 getline(cin,line,'\n');
00551 if( line.empty() ) {
00552 cerr << "No values given. Exit to main loop." << endl;
00553 continue;
00554 }
00555 istringstream str(line);
00556 string classifier;
00557 double xlo(0), xhi(0), dx(0);
00558 str >> classifier >> xlo >> xhi >> dx;
00559 if( classifier.empty() || dx<=0. || xlo>=xhi ) {
00560 cerr << "Incorrect parameters given. Exit to main loop." << endl;
00561 continue;
00562 }
00563
00564
00565 vector<pair<double,double> > shist, bhist;
00566 int nbin = plotter.histogram(classifier.c_str(),xlo,xhi,dx,shist,bhist);
00567 if( nbin < 1 ) {
00568 cerr << "Unable to make histogram for classifier "
00569 << classifier.c_str() << endl;
00570 continue;
00571 }
00572
00573
00574 cout << "===========================================" << endl;
00575 cout << "Histogram for output of classifier "
00576 << classifier.c_str() << endl;
00577 char s[200];
00578 cout << "Xlo=" << xlo << " Xhi=" << xhi
00579 << " dX=" << dx << " Nbin=" << nbin << endl;
00580 sprintf(s," %14s | %30s | %30s |","Bin center","Signal","Background");
00581 cout << s << endl;
00582 for( int i=0;i<nbin;i++ ) {
00583 double x = xlo + (i+0.5)*dx;
00584 sprintf(s," %12.5f | %12.5f +- %12.5f | %12.5f +- %12.5f |",x,
00585 shist[i].first,shist[i].second,
00586 bhist[i].first,bhist[i].second);
00587 cout << s << endl;
00588 }
00589 cout << "===========================================" << endl;
00590 }
00591
00592
00593 if( !answerYN("Continue?") ) break;
00594 }
00595
00596
00597 return 0;
00598 }