00001
00002
00003 #include "PhysicsTools/StatPatternRecognition/interface/SprExperiment.hh"
00004 #include "PhysicsTools/StatPatternRecognition/interface/SprClassifierEvaluator.hh"
00005 #include "PhysicsTools/StatPatternRecognition/interface/SprAbsFilter.hh"
00006 #include "PhysicsTools/StatPatternRecognition/interface/SprAbsTrainedClassifier.hh"
00007 #include "PhysicsTools/StatPatternRecognition/interface/SprTrainedMultiClassLearner.hh"
00008 #include "PhysicsTools/StatPatternRecognition/interface/SprCoordinateMapper.hh"
00009 #include "PhysicsTools/StatPatternRecognition/interface/SprClass.hh"
00010 #include "PhysicsTools/StatPatternRecognition/interface/SprLoss.hh"
00011 #include "PhysicsTools/StatPatternRecognition/interface/SprAverageLoss.hh"
00012 #include "PhysicsTools/StatPatternRecognition/interface/SprTrainedAdaBoost.hh"
00013 #include "PhysicsTools/StatPatternRecognition/interface/SprTrainedFisher.hh"
00014 #include "PhysicsTools/StatPatternRecognition/interface/SprTrainedLogitR.hh"
00015 #include "PhysicsTools/StatPatternRecognition/interface/SprIntegerPermutator.hh"
00016 #include "PhysicsTools/StatPatternRecognition/interface/SprPoint.hh"
00017 #include "PhysicsTools/StatPatternRecognition/interface/SprStringParser.hh"
00018 #include "PhysicsTools/StatPatternRecognition/interface/SprUtils.hh"
00019
00020 #include <iostream>
00021 #include <memory>
00022 #include <algorithm>
00023 #include <cassert>
00024 #include <cmath>
00025
00026
00027 using namespace std;
00028
00029
00030 bool SprClassifierEvaluator::variableImportance(
00031 const SprAbsFilter* data,
00032 SprAbsTrainedClassifier* trained,
00033 SprTrainedMultiClassLearner* mcTrained,
00034 SprCoordinateMapper* mapper,
00035 unsigned nPerm,
00036 std::vector<NameAndValue>& lossIncrease)
00037 {
00038
00039 if( data == 0 ) {
00040 cerr << "No data supplied for variableImportance." << endl;
00041 return false;
00042 }
00043 if( trained==0 && mcTrained==0 ) {
00044 cerr << "No classifiers provided for variableImportance." << endl;
00045 return false;
00046 }
00047 if( trained!=0 && mcTrained!=0 ) {
00048 cerr << "variableImportance cannot process both two-class "
00049 << "and multi-class learners." << endl;
00050 return false;
00051 }
00052 if( nPerm == 0 ) {
00053 cout << "No permutations requested. Will use one by default." << endl;
00054 nPerm = 1;
00055 }
00056
00057
00058 vector<SprClass> classes;
00059 data->classes(classes);
00060 if( classes.size() < 2 ) {
00061 cerr << "Classes have not been set." << endl;
00062 return false;
00063 }
00064 vector<int> mcClasses;
00065 if( mcTrained != 0 )
00066 mcTrained->classes(mcClasses);
00067
00068
00069 auto_ptr<SprAverageLoss> loss;
00070 if( mcTrained != 0 )
00071 loss.reset(new SprAverageLoss(&SprLoss::correct_id));
00072 else
00073 loss.reset(new SprAverageLoss(&SprLoss::quadratic));
00074 if( trained != 0 ) {
00075 if( trained->name() == "AdaBoost" ) {
00076 SprTrainedAdaBoost* specific
00077 = static_cast<SprTrainedAdaBoost*>(trained);
00078 specific->useNormalized();
00079 }
00080 else if( trained->name() == "Fisher" ) {
00081 SprTrainedFisher* specific
00082 = static_cast<SprTrainedFisher*>(trained);
00083 specific->useNormalized();
00084 }
00085 else if( trained->name() == "LogitR" ) {
00086 SprTrainedLogitR* specific
00087 = static_cast<SprTrainedLogitR*>(trained);
00088 specific->useNormalized();
00089 }
00090 }
00091
00092
00093
00094
00095 vector<string> testVars;
00096 if( mcTrained != 0 )
00097 mcTrained->vars(testVars);
00098 else
00099 trained->vars(testVars);
00100 int N = data->size();
00101 SprIntegerPermutator permu(N);
00102
00103
00104 for( int n=0;n<N;n++ ) {
00105 const SprPoint* p = (*data)[n];
00106 const SprPoint* mappedP = p;
00107 int icls = p->class_;
00108 if( mcTrained != 0 ) {
00109 if( find(mcClasses.begin(),mcClasses.end(),icls) == mcClasses.end() )
00110 continue;
00111 }
00112 else {
00113 if( icls == classes[0] )
00114 icls = 0;
00115 else if( icls == classes[1] )
00116 icls = 1;
00117 else
00118 continue;
00119 }
00120 if( mapper != 0 ) mappedP = mapper->output(p);
00121 if( mcTrained != 0 )
00122 loss->update(icls,mcTrained->response(mappedP),data->w(n));
00123 else
00124 loss->update(icls,trained->response(mappedP),data->w(n));
00125 if( mapper != 0 ) mapper->clear();
00126 }
00127 double nominalLoss = loss->value();
00128
00129
00130
00131
00132 cout << "Will perform " << nPerm << " permutations per variable." << endl;
00133 int nVars = testVars.size();
00134 lossIncrease.clear();
00135 lossIncrease.resize(nVars);
00136 for( int d=0;d<nVars;d++ ) {
00137 cout << "Permuting variable " << testVars[d].c_str() << endl;
00138
00139
00140 int mappedD = d;
00141 if( mapper != 0 )
00142 mappedD = mapper->mappedIndex(d);
00143 assert( mappedD>=0 && mappedD<data->dim() );
00144
00145
00146 vector<double> losses(nPerm);
00147 double aveLoss = 0;
00148 for( int i=0;i<nPerm;i++ ) {
00149
00150
00151 vector<unsigned> seq;
00152 if( !permu.sequence(seq) ) {
00153 cerr << "variableImportance is unable to permute points." << endl;
00154 return false;
00155 }
00156
00157
00158 loss->reset();
00159 for( int n=0;n<N;n++ ) {
00160 SprPoint p(*(*data)[n]);
00161 p.x_[mappedD] = (*data)[seq[n]]->x_[mappedD];
00162 const SprPoint* mappedP = &p;
00163 int icls = p.class_;
00164 if( mcTrained != 0 ) {
00165 if( find(mcClasses.begin(),mcClasses.end(),icls) == mcClasses.end() )
00166 continue;
00167 }
00168 else {
00169 if( icls == classes[0] )
00170 icls = 0;
00171 else if( icls == classes[1] )
00172 icls = 1;
00173 else
00174 continue;
00175 }
00176 if( mapper != 0 ) mappedP = mapper->output(&p);
00177 if( mcTrained != 0 )
00178 loss->update(icls,mcTrained->response(mappedP),data->w(n));
00179 else
00180 loss->update(icls,trained->response(mappedP),data->w(n));
00181 if( mapper != 0 ) mapper->clear();
00182 }
00183
00184
00185 losses[i] = loss->value();
00186 aveLoss += losses[i];
00187 }
00188
00189
00190 aveLoss /= nPerm;
00191 double err = 0;
00192 for( int i=0;i<nPerm;i++ )
00193 err += (losses[i]-aveLoss)*(losses[i]-aveLoss);
00194 if( nPerm > 1 )
00195 err /= (nPerm-1);
00196 err = sqrt(err);
00197
00198
00199 lossIncrease[d] = NameAndValue(testVars[d],
00200 ValueWithError(aveLoss-nominalLoss,err));
00201 }
00202
00203
00204 return true;
00205 }
00206
00207
00208 bool SprClassifierEvaluator::variableInteraction(
00209 const SprAbsFilter* data,
00210 SprAbsTrainedClassifier* trained,
00211 SprTrainedMultiClassLearner* mcTrained,
00212 SprCoordinateMapper* mapper,
00213 const char* vars,
00214 unsigned nPoints,
00215 std::vector<NameAndValue>& interaction,
00216 int verbose)
00217 {
00218
00219 if( data == 0 ) {
00220 cerr << "No data supplied for variableInteraction." << endl;
00221 return false;
00222 }
00223 if( trained==0 && mcTrained==0 ) {
00224 cerr << "No classifiers provided for variableInteraction." << endl;
00225 return false;
00226 }
00227 if( trained!=0 && mcTrained!=0 ) {
00228 cerr << "variableInteraction cannot process both two-class "
00229 << "and multi-class learners." << endl;
00230 return false;
00231 }
00232 if( nPoints > data->size() ) {
00233 cerr << "Number of points for integration " << nPoints
00234 << " cannot exceed the data size " << data->size() << endl;
00235 return false;
00236 }
00237
00238
00239 int nPass = 2;
00240 if( nPoints == 0 ) {
00241 nPoints = data->size();
00242 nPass = 1;
00243 }
00244
00245
00246 vector<vector<string> > v_of_vars;
00247 SprStringParser::parseToStrings(vars,v_of_vars);
00248
00249
00250 vector<string> svars;
00251 if( !v_of_vars.empty() && !v_of_vars[0].empty() )
00252 svars = v_of_vars[0];
00253
00254
00255 vector<string> testVars;
00256 if( trained != 0 )
00257 trained->vars(testVars);
00258 else if( mcTrained != 0 )
00259 mcTrained->vars(testVars);
00260 assert( !testVars.empty() );
00261 unsigned dim = testVars.size();
00262 unsigned dim_subset = svars.size();
00263 if( dim <= dim_subset ) {
00264 cerr << "Too many variables requested in variableInteraction." << endl;
00265 return false;
00266 }
00267
00268
00269 vector<int> subsetIndex(dim_subset,-1);
00270 for( int d=0;d<dim_subset;d++ ) {
00271 vector<string>::const_iterator found
00272 = find(testVars.begin(),testVars.end(),svars[d]);
00273 if( found == testVars.end() ) {
00274 cerr << "Variable " << svars[d].c_str()
00275 << " not found among trained variables in "
00276 << "variableInteraction." << endl;
00277 return false;
00278 }
00279 subsetIndex[d] = found - testVars.begin();
00280 }
00281
00282
00283 map<string,int> analyzeVarIndex;
00284 for( int d=0;d<dim;d++ ) {
00285 if( find(svars.begin(),svars.end(),testVars[d]) == svars.end() ) {
00286 if( !analyzeVarIndex.insert(pair<string,int>(testVars[d],d)).second ) {
00287 cerr << "Unable to insert into analyzeVarIndex." << endl;
00288 return false;
00289 }
00290 }
00291 }
00292
00293
00294 if( trained != 0 ) {
00295 if( trained->name() == "AdaBoost" ) {
00296 SprTrainedAdaBoost* specific
00297 = static_cast<SprTrainedAdaBoost*>(trained);
00298 specific->useNormalized();
00299 }
00300 else if( trained->name() == "Fisher" ) {
00301 SprTrainedFisher* specific
00302 = static_cast<SprTrainedFisher*>(trained);
00303 specific->useNormalized();
00304 }
00305 else if( trained->name() == "LogitR" ) {
00306 SprTrainedLogitR* specific
00307 = static_cast<SprTrainedLogitR*>(trained);
00308 specific->useNormalized();
00309 }
00310 }
00311
00312
00313 interaction.clear();
00314 interaction.resize(dim,NameAndValue("UserSubset",ValueWithError(1,0)));
00315
00316
00317
00318 unsigned N = data->size();
00319 SprIntegerPermutator permu(N);
00320
00321
00322
00323
00324 vector<vector<double> > pass(nPass,vector<double>(dim));
00325 for( int ipass=0;ipass<nPass;ipass++ ) {
00326
00327
00328 vector<unsigned> indices;
00329 if( !permu.sequence(indices) ) {
00330 cerr << "Unable to permute input indices." << endl;
00331 return false;
00332 }
00333 double wtot = 0;
00334 for( int i=0;i<nPoints;i++ )
00335 wtot += data->w(indices[i]);
00336
00337
00338
00339
00340 for( map<string,int>::const_iterator iter=analyzeVarIndex.begin();
00341 iter!=analyzeVarIndex.end();iter++ ) {
00342
00343
00344 int d = iter->second;
00345 assert( d < dim );
00346 if( verbose > 0 ) {
00347 cout << "Computing interaction for variable "
00348 << testVars[d].c_str() << " at pass " << ipass+1 << endl;
00349 }
00350
00351
00352 if( svars.empty() ) {
00353 dim_subset = dim-1;
00354 subsetIndex.clear();
00355 for( int k=0;k<dim;k++ ) {
00356 if( k == d ) continue;
00357 subsetIndex.push_back(k);
00358 }
00359 }
00360
00361
00362 vector<double> FS(nPoints,0), Fd(nPoints,0);
00363 for( int i=0;i<nPoints;i++ ) {
00364 if( verbose>1 && (i+1)%1000==0 )
00365 cout << "Processing point " << i+1 << endl;
00366 int ii = indices[i];
00367 double wi = data->w(ii);
00368 const SprPoint* pi = (*data)[ii];
00369 const SprPoint* mappedPi = pi;
00370 if( mapper != 0 ) mappedPi = mapper->output(pi);
00371 const vector<double>& xi = mappedPi->x_;
00372 vector<double> xi_subset(dim_subset);
00373 for( int k=0;k<dim_subset;k++ )
00374 xi_subset[k] = xi[subsetIndex[k]];
00375
00376 for( int j=0;j<nPoints;j++ ) {
00377 int jj = indices[j];
00378 const SprPoint* pj = (*data)[jj];
00379 vector<double> x_S(pj->x_), x_d(pj->x_);
00380 double wj = data->w(jj);
00381 if( mapper != 0 ) mapper->map(pj->x_,x_S);
00382 if( mapper != 0 ) mapper->map(pj->x_,x_d);
00383 x_d[d] = xi[d];
00384 for( int k=0;k<dim_subset;k++ )
00385 x_S[subsetIndex[k]] = xi_subset[k];
00386 if( trained != 0 ) {
00387 Fd[i] += wj * trained->response(x_d);
00388 FS[i] += wj * trained->response(x_S);
00389 }
00390 else if( mcTrained != 0 ) {
00391 Fd[i] += wj * mcTrained->response(x_d);
00392 FS[i] += wj * mcTrained->response(x_S);
00393 }
00394 }
00395 Fd[i] /= wtot;
00396 FS[i] /= wtot;
00397
00398
00399 if( mapper != 0 ) mapper->clear();
00400 }
00401
00402
00403 double FS_mean(0), Fd_mean(0);
00404 for( int i=0;i<nPoints;i++ ) {
00405 int ii = indices[i];
00406 double wi = data->w(ii);
00407 Fd_mean += wi*Fd[i];
00408 FS_mean += wi*FS[i];
00409 }
00410 Fd_mean /= wtot;
00411 FS_mean /= wtot;
00412 double var_S(0), var_d(0), cov(0);
00413 for( int i=0;i<nPoints;i++ ) {
00414 int ii = indices[i];
00415 double wi = data->w(ii);
00416 var_d += wi*pow(Fd[i]-Fd_mean,2);
00417 var_S += wi*pow(FS[i]-FS_mean,2);
00418 cov += wi*(Fd[i]-Fd_mean)*(FS[i]-FS_mean);
00419 }
00420 var_d /= wtot;
00421 var_S /= wtot;
00422 cov /= wtot;
00423
00424
00425 if( var_d<SprUtils::eps() || var_S<SprUtils::eps() ) {
00426 cerr << "Variance too small: " << var_d << " " << var_S
00427 << " for variable " << testVars[d].c_str()
00428 << ". Unable to compute variable interaction." << endl;
00429 pass[ipass][d] = 0;
00430 }
00431 else
00432 pass[ipass][d] = cov/(sqrt(var_d)*sqrt(var_S));
00433 }
00434 }
00435
00436
00437
00438
00439 for( map<string,int>::const_iterator iter=analyzeVarIndex.begin();
00440 iter!=analyzeVarIndex.end();iter++ ) {
00441 int d = iter->second;
00442 double mean = 0;
00443 for( int ipass=0;ipass<nPass;ipass++ )
00444 mean += pass[ipass][d];
00445 mean /= nPass;
00446 double var = 0;
00447 for( int ipass=0;ipass<nPass;ipass++ )
00448 var += pow(pass[ipass][d]-mean,2);
00449 if( nPass > 1 )
00450 var /= (nPass-1);
00451 double sigma = ( var>0 ? sqrt(var) : 0 );
00452 interaction[d] = NameAndValue(testVars[d],ValueWithError(mean,sigma));
00453 }
00454
00455
00456 return true;
00457 }