CMS 3D CMS Logo

SprClassifierEvaluator.cc

Go to the documentation of this file.
00001 // $Id: SprClassifierEvaluator.cc,v 1.2 2007/12/01 01:29:46 narsky Exp $
00002 
00003 #include "PhysicsTools/StatPatternRecognition/interface/SprExperiment.hh"
00004 #include "PhysicsTools/StatPatternRecognition/interface/SprClassifierEvaluator.hh"
00005 #include "PhysicsTools/StatPatternRecognition/interface/SprAbsFilter.hh"
00006 #include "PhysicsTools/StatPatternRecognition/interface/SprAbsTrainedClassifier.hh"
00007 #include "PhysicsTools/StatPatternRecognition/interface/SprTrainedMultiClassLearner.hh"
00008 #include "PhysicsTools/StatPatternRecognition/interface/SprCoordinateMapper.hh"
00009 #include "PhysicsTools/StatPatternRecognition/interface/SprClass.hh"
00010 #include "PhysicsTools/StatPatternRecognition/interface/SprLoss.hh"
00011 #include "PhysicsTools/StatPatternRecognition/interface/SprAverageLoss.hh"
00012 #include "PhysicsTools/StatPatternRecognition/interface/SprTrainedAdaBoost.hh"
00013 #include "PhysicsTools/StatPatternRecognition/interface/SprTrainedFisher.hh"
00014 #include "PhysicsTools/StatPatternRecognition/interface/SprTrainedLogitR.hh"
00015 #include "PhysicsTools/StatPatternRecognition/interface/SprIntegerPermutator.hh"
00016 #include "PhysicsTools/StatPatternRecognition/interface/SprPoint.hh"
00017 #include "PhysicsTools/StatPatternRecognition/interface/SprStringParser.hh"
00018 #include "PhysicsTools/StatPatternRecognition/interface/SprUtils.hh"
00019 
00020 #include <iostream>
00021 #include <memory>
00022 #include <algorithm>
00023 #include <cassert>
00024 #include <cmath>
00025 
00026 
00027 using namespace std;
00028 
00029 
00030 bool SprClassifierEvaluator::variableImportance(
00031                                const SprAbsFilter* data,
00032                                SprAbsTrainedClassifier* trained,
00033                                SprTrainedMultiClassLearner* mcTrained,
00034                                SprCoordinateMapper* mapper,
00035                                unsigned nPerm,
00036                                std::vector<NameAndValue>& lossIncrease)
00037 {
00038   // sanity check
00039   if( data == 0 ) {
00040     cerr << "No data supplied for variableImportance." << endl;
00041     return false;
00042   }
00043   if( trained==0 && mcTrained==0 ) {
00044     cerr << "No classifiers provided for variableImportance." << endl;
00045     return false;
00046   }
00047   if( trained!=0 && mcTrained!=0 ) {
00048     cerr << "variableImportance cannot process both two-class " 
00049          << "and multi-class learners." << endl;
00050     return false;
00051   }
00052   if( nPerm == 0 ) {
00053     cout << "No permutations requested. Will use one by default." << endl;
00054     nPerm = 1;
00055   }
00056 
00057   // check classes
00058   vector<SprClass> classes; 
00059   data->classes(classes); 
00060   if( classes.size() < 2 ) {
00061     cerr << "Classes have not been set." << endl;
00062     return false; 
00063   }
00064   vector<int> mcClasses;
00065   if( mcTrained != 0 ) 
00066     mcTrained->classes(mcClasses);
00067 
00068   // make loss
00069   auto_ptr<SprAverageLoss> loss;
00070   if( mcTrained != 0 )
00071     loss.reset(new SprAverageLoss(&SprLoss::correct_id));
00072   else
00073     loss.reset(new SprAverageLoss(&SprLoss::quadratic));
00074   if( trained != 0 ) {
00075     if(      trained->name() == "AdaBoost" ) {
00076       SprTrainedAdaBoost* specific
00077         = static_cast<SprTrainedAdaBoost*>(trained);
00078       specific->useNormalized();
00079     }
00080     else if( trained->name() == "Fisher" ) {
00081       SprTrainedFisher* specific
00082         = static_cast<SprTrainedFisher*>(trained);
00083       specific->useNormalized();
00084     }
00085     else if( trained->name() == "LogitR" ) {
00086       SprTrainedLogitR* specific
00087         = static_cast<SprTrainedLogitR*>(trained);
00088       specific->useNormalized();
00089     }
00090   }
00091 
00092   //
00093   // pass through all variables
00094   //
00095   vector<string> testVars;
00096   if( mcTrained != 0 )
00097     mcTrained->vars(testVars);
00098   else
00099     trained->vars(testVars);
00100   int N = data->size();
00101   SprIntegerPermutator permu(N);
00102 
00103   // make first pass without permutations
00104   for( int n=0;n<N;n++ ) {
00105     const SprPoint* p = (*data)[n];
00106     const SprPoint* mappedP = p;
00107     int icls = p->class_;
00108     if( mcTrained != 0 ) {
00109       if( find(mcClasses.begin(),mcClasses.end(),icls) == mcClasses.end() )
00110         continue;
00111     }
00112     else {
00113       if(      icls == classes[0] )
00114         icls = 0;
00115       else if( icls == classes[1] )
00116         icls = 1;
00117       else
00118         continue;
00119     }
00120     if( mapper != 0 ) mappedP = mapper->output(p);
00121     if( mcTrained != 0 )
00122       loss->update(icls,mcTrained->response(mappedP),data->w(n));
00123     else
00124       loss->update(icls,trained->response(mappedP),data->w(n));
00125     if(  mapper != 0 ) mapper->clear();
00126   }
00127   double nominalLoss = loss->value();
00128 
00129   //
00130   // loop over permutations
00131   //
00132   cout << "Will perform " << nPerm << " permutations per variable." << endl;
00133   int nVars = testVars.size();
00134   lossIncrease.clear();
00135   lossIncrease.resize(nVars);
00136   for( int d=0;d<nVars;d++ ) {
00137     cout << "Permuting variable " << testVars[d].c_str() << endl;
00138 
00139     // map this var
00140     int mappedD = d;
00141     if( mapper != 0 )
00142       mappedD = mapper->mappedIndex(d);
00143     assert( mappedD>=0 && mappedD<data->dim() );
00144 
00145     // pass through all points permuting them
00146     vector<double> losses(nPerm);
00147     double aveLoss = 0;
00148     for( int i=0;i<nPerm;i++ ) {
00149 
00150       // permute this variable
00151       vector<unsigned> seq;
00152       if( !permu.sequence(seq) ) {
00153         cerr << "variableImportance is unable to permute points." << endl;
00154         return false;
00155       }
00156 
00157       // pass through points
00158       loss->reset();
00159       for( int n=0;n<N;n++ ) {
00160         SprPoint p(*(*data)[n]);
00161         p.x_[mappedD] = (*data)[seq[n]]->x_[mappedD];
00162         const SprPoint* mappedP = &p;
00163         int icls = p.class_;
00164         if( mcTrained != 0 ) {
00165           if( find(mcClasses.begin(),mcClasses.end(),icls) == mcClasses.end() )
00166             continue;
00167         }
00168         else {
00169           if(      icls == classes[0] )
00170             icls = 0;
00171           else if( icls == classes[1] )
00172             icls = 1;
00173           else
00174             continue;
00175         }
00176         if( mapper != 0 ) mappedP = mapper->output(&p);
00177         if( mcTrained != 0 )
00178           loss->update(icls,mcTrained->response(mappedP),data->w(n));
00179         else
00180           loss->update(icls,trained->response(mappedP),data->w(n));
00181         if( mapper != 0 ) mapper->clear();
00182       }
00183 
00184       // store loss
00185       losses[i] = loss->value();
00186       aveLoss += losses[i];
00187     }// end loop over permutations
00188 
00189     // compute error
00190     aveLoss /= nPerm;
00191     double err = 0;
00192     for( int i=0;i<nPerm;i++ )
00193       err += (losses[i]-aveLoss)*(losses[i]-aveLoss);
00194     if( nPerm > 1 )
00195       err /= (nPerm-1);
00196     err = sqrt(err);
00197 
00198     // store values
00199     lossIncrease[d] = NameAndValue(testVars[d],
00200                                    ValueWithError(aveLoss-nominalLoss,err));
00201   }// end loop over variables
00202 
00203   // exit
00204   return true;
00205 }
00206 
00207 
00208 bool SprClassifierEvaluator::variableInteraction(
00209                                        const SprAbsFilter* data,
00210                                        SprAbsTrainedClassifier* trained,
00211                                        SprTrainedMultiClassLearner* mcTrained,
00212                                        SprCoordinateMapper* mapper,
00213                                        const char* vars,
00214                                        unsigned nPoints,
00215                                        std::vector<NameAndValue>& interaction,
00216                                        int verbose)
00217 {
00218   // sanity check
00219   if( data == 0 ) {
00220     cerr << "No data supplied for variableInteraction." << endl;
00221     return false;
00222   }
00223   if( trained==0 && mcTrained==0 ) {
00224     cerr << "No classifiers provided for variableInteraction." << endl;
00225     return false;
00226   }
00227   if( trained!=0 && mcTrained!=0 ) {
00228     cerr << "variableInteraction cannot process both two-class " 
00229          << "and multi-class learners." << endl;
00230     return false;
00231   }
00232   if( nPoints > data->size() ) {
00233     cerr << "Number of points for integration " << nPoints 
00234          << " cannot exceed the data size " << data->size() << endl;
00235     return false;
00236   }
00237 
00238   // set number of points and passes
00239   int nPass = 2;
00240   if( nPoints == 0 ) {
00241     nPoints = data->size();
00242     nPass = 1;
00243   }
00244 
00245   // get input vars
00246   vector<vector<string> > v_of_vars;
00247   SprStringParser::parseToStrings(vars,v_of_vars);
00248 
00249   // can only handle 1st and 2nd order interactions for now
00250   vector<string> svars;
00251   if( !v_of_vars.empty() && !v_of_vars[0].empty() )
00252     svars = v_of_vars[0];
00253 
00254   // get trained vars
00255   vector<string> testVars;
00256   if(      trained != 0 )
00257     trained->vars(testVars);
00258   else if( mcTrained != 0 )
00259     mcTrained->vars(testVars);
00260   assert( !testVars.empty() );
00261   unsigned dim        = testVars.size();
00262   unsigned dim_subset = svars.size();
00263   if( dim <= dim_subset ) {
00264     cerr << "Too many variables requested in variableInteraction." << endl;
00265     return false;
00266   }
00267 
00268   // map subset vars onto classifier vars
00269   vector<int> subsetIndex(dim_subset,-1);
00270   for( int d=0;d<dim_subset;d++ ) {
00271     vector<string>::const_iterator found 
00272       = find(testVars.begin(),testVars.end(),svars[d]);
00273     if( found == testVars.end() ) {
00274       cerr << "Variable " << svars[d].c_str() 
00275            << " not found among trained variables in " 
00276            << "variableInteraction." << endl;
00277       return false;
00278     }
00279     subsetIndex[d] = found - testVars.begin();
00280   }
00281 
00282   // find classifier vars not included in subset vars
00283   map<string,int> analyzeVarIndex;
00284   for( int d=0;d<dim;d++ ) {
00285     if( find(svars.begin(),svars.end(),testVars[d]) == svars.end() ) {
00286       if( !analyzeVarIndex.insert(pair<string,int>(testVars[d],d)).second ) {
00287         cerr << "Unable to insert into analyzeVarIndex." << endl;
00288         return false;
00289       }
00290     }
00291   }
00292 
00293   // use normalized output
00294   if( trained != 0 ) {
00295     if(      trained->name() == "AdaBoost" ) {
00296       SprTrainedAdaBoost* specific
00297         = static_cast<SprTrainedAdaBoost*>(trained);
00298       specific->useNormalized();
00299     }
00300     else if( trained->name() == "Fisher" ) {
00301       SprTrainedFisher* specific
00302         = static_cast<SprTrainedFisher*>(trained);
00303       specific->useNormalized();
00304     }
00305     else if( trained->name() == "LogitR" ) {
00306       SprTrainedLogitR* specific
00307         = static_cast<SprTrainedLogitR*>(trained);
00308       specific->useNormalized();
00309     }
00310   }
00311 
00312   // init interaction
00313   interaction.clear();
00314   interaction.resize(dim,NameAndValue("UserSubset",ValueWithError(1,0)));
00315 
00316   // reduce data size for integration by randomly choosing
00317   //   the number of points defined by the user
00318   unsigned N = data->size();
00319   SprIntegerPermutator permu(N);
00320 
00321   //
00322   // two passes to get a rough error estimate
00323   //
00324   vector<vector<double> > pass(nPass,vector<double>(dim));
00325   for( int ipass=0;ipass<nPass;ipass++ ) {
00326 
00327     // permute indices
00328     vector<unsigned> indices;
00329     if( !permu.sequence(indices) ) {
00330     cerr << "Unable to permute input indices." << endl;
00331     return false;
00332     }
00333     double wtot = 0;
00334     for( int i=0;i<nPoints;i++ )
00335       wtot += data->w(indices[i]);
00336 
00337     //
00338     // compute correlation between F(S) and each variable in \S
00339     //
00340     for( map<string,int>::const_iterator iter=analyzeVarIndex.begin();
00341          iter!=analyzeVarIndex.end();iter++ ) {
00342 
00343       // get var index in the classifier list
00344       int d = iter->second;
00345       assert( d < dim );
00346       if( verbose > 0 ) {
00347         cout << "Computing interaction for variable "
00348              << testVars[d].c_str() << " at pass " << ipass+1 << endl;
00349       }
00350       
00351       // if no vars specified, use all vars but this one
00352       if( svars.empty() ) {
00353         dim_subset = dim-1;
00354         subsetIndex.clear();
00355         for( int k=0;k<dim;k++ ) {
00356           if( k == d ) continue;
00357           subsetIndex.push_back(k);
00358         }
00359       }
00360       
00361       // compute mean Fd and FS at each point
00362       vector<double> FS(nPoints,0), Fd(nPoints,0);
00363       for( int i=0;i<nPoints;i++ ) {
00364         if( verbose>1 && (i+1)%1000==0 )
00365           cout << "Processing point " << i+1 << endl;
00366         int ii = indices[i];
00367         double wi = data->w(ii);
00368         const SprPoint* pi = (*data)[ii];
00369         const SprPoint* mappedPi = pi;
00370         if( mapper != 0 ) mappedPi = mapper->output(pi);
00371         const vector<double>& xi = mappedPi->x_;
00372         vector<double> xi_subset(dim_subset);
00373         for( int k=0;k<dim_subset;k++ )
00374           xi_subset[k] = xi[subsetIndex[k]];
00375         
00376         for( int j=0;j<nPoints;j++ ) {
00377           int jj = indices[j];
00378           const SprPoint* pj = (*data)[jj];
00379           vector<double> x_S(pj->x_), x_d(pj->x_);
00380           double wj = data->w(jj);
00381           if( mapper != 0 ) mapper->map(pj->x_,x_S);
00382           if( mapper != 0 ) mapper->map(pj->x_,x_d);
00383           x_d[d] = xi[d];
00384           for( int k=0;k<dim_subset;k++ )
00385             x_S[subsetIndex[k]] = xi_subset[k];
00386           if(      trained != 0 ) {
00387             Fd[i] += wj * trained->response(x_d);
00388             FS[i] += wj * trained->response(x_S);
00389           }
00390           else if( mcTrained != 0 ) {
00391             Fd[i] += wj * mcTrained->response(x_d);
00392             FS[i] += wj * mcTrained->response(x_S);
00393           }
00394         }// end loop over j
00395         Fd[i] /= wtot;
00396         FS[i] /= wtot;
00397         
00398         // cleanup
00399         if( mapper != 0 ) mapper->clear();
00400       }// end loop over i
00401       
00402       // compute correlation between FS and Fd
00403       double FS_mean(0), Fd_mean(0);
00404       for( int i=0;i<nPoints;i++ ) {
00405         int ii = indices[i];
00406         double wi = data->w(ii);
00407         Fd_mean += wi*Fd[i];
00408         FS_mean += wi*FS[i];
00409       }
00410       Fd_mean /= wtot;
00411       FS_mean /= wtot;
00412       double var_S(0), var_d(0), cov(0);
00413       for( int i=0;i<nPoints;i++ ) {
00414         int ii = indices[i];
00415         double wi = data->w(ii);
00416         var_d += wi*pow(Fd[i]-Fd_mean,2);
00417         var_S += wi*pow(FS[i]-FS_mean,2);
00418         cov   += wi*(Fd[i]-Fd_mean)*(FS[i]-FS_mean);
00419       }
00420       var_d /= wtot;
00421       var_S /= wtot;
00422       cov   /= wtot;
00423       
00424       // set correlation
00425       if( var_d<SprUtils::eps() || var_S<SprUtils::eps() ) {
00426         cerr << "Variance too small: " << var_d << " " << var_S 
00427              << " for variable " << testVars[d].c_str()
00428              << ". Unable to compute variable interaction." << endl;
00429         pass[ipass][d] = 0;
00430       }
00431       else      
00432         pass[ipass][d] = cov/(sqrt(var_d)*sqrt(var_S));
00433     }// end loop over d
00434   }// end loop over ipass
00435 
00436   //
00437   // compute average over passes
00438   //
00439   for( map<string,int>::const_iterator iter=analyzeVarIndex.begin();
00440        iter!=analyzeVarIndex.end();iter++ ) {
00441     int d = iter->second;
00442     double mean = 0;
00443     for( int ipass=0;ipass<nPass;ipass++ )
00444       mean += pass[ipass][d];
00445     mean /= nPass;
00446     double var = 0;
00447     for( int ipass=0;ipass<nPass;ipass++ )
00448       var += pow(pass[ipass][d]-mean,2);
00449     if( nPass > 1 )
00450       var /= (nPass-1);
00451     double sigma = ( var>0 ? sqrt(var) : 0 );
00452     interaction[d] = NameAndValue(testVars[d],ValueWithError(mean,sigma));
00453   }
00454 
00455   // exit
00456   return true;
00457 }

Generated on Tue Jun 9 17:42:00 2009 for CMSSW by  doxygen 1.5.4