CMS 3D CMS Logo

/data/refman/pasoursint/CMSSW_5_3_3/src/EventFilter/Goodies/src/iDie.h

Go to the documentation of this file.
00001 #ifndef EVENTFILTER_GOODIES_IDIE_H
00002 #define EVENTFILTER_GOODIES_IDIE_H
00003 
00004 #include "EventFilter/Utilities/interface/Exception.h"
00005 #include "EventFilter/Utilities/interface/TriggerReportDef.h"
00006 
00007 #include "xdata/String.h"
00008 #include "xdata/UnsignedInteger32.h"
00009 #include "xdata/Boolean.h"
00010 #include "xdata/ActionListener.h"
00011 
00012 #include "xoap/MessageReference.h"
00013 #include "xoap/MessageFactory.h"
00014 #include "xoap/Method.h"
00015 
00016 #include "xgi/Utils.h"
00017 #include "xgi/Input.h"
00018 #include "xgi/Output.h"
00019 #include "xgi/Method.h"
00020 
00021 #include "xdaq/Application.h"
00022 
00023 #include "toolbox/net/URN.h"
00024 #include "toolbox/fsm/exception/Exception.h"
00025 
00026 
00027 #include <vector>
00028 #include <deque>
00029 
00030 #include <sys/time.h>
00031 
00032 #include "TFile.h"
00033 #include "TTree.h"
00034 
00035 #include "FWCore/Framework/interface/EventProcessor.h"
00036 #include "DQMServices/Core/src/DQMService.h"
00037 #include "DQMServices/Core/interface/DQMStore.h"
00038 #include "DQMServices/Core/interface/MonitorElement.h"
00039 
00040 #define MODLZSIZE 25
00041 #define MODLZSIZELUMI 20
00042 #define MOD_OCC_THRESHOLD 5
00043 
00044 namespace evf {
00045 
00046   int modlistSortFunction( const void *a, const void *b);
00047 
00048   namespace internal{
00049    struct fu{ // Per-key record stored in evf::fmap (std::map<std::string, internal::fu>); iDie keeps one such map in fus_.
00050       time_t tstamp; // a timestamp; NOTE(review): presumably the time of the last entry posted for this key - confirm in iDie.cc
00051       unsigned int ccount; // a counter; NOTE(review): presumably the number of reported process deaths - confirm in iDie.cc
00052       std::vector<pid_t> cpids; // process ids; NOTE(review): presumably the pids of the reported child processes - confirm
00053       std::vector<std::string> signals; // one string per entry; NOTE(review): presumably signal descriptions parallel to cpids - confirm
00054       std::vector<std::string> stacktraces; // one string per entry; NOTE(review): presumably stack traces parallel to cpids - confirm
00055     };
00056    struct rate{ // Flat block of int counters; iDie holds one instance (r_) among its ROOT-related members. NOTE(review): presumably a TTree branch buffer - confirm in iDie.cc
00057      int nproc; // NOTE(review): presumably number of processed events - confirm
00058      int nsub; // NOTE(review): presumably number of sub-processes - confirm
00059      int nrep; // NOTE(review): presumably number of reports - confirm
00060      int npath; // NOTE(review): presumably number of valid entries in the p* arrays below - confirm
00061      int nendpath; // NOTE(review): presumably number of valid entries in the e* arrays below - confirm
00062      int ptimesRun[evf::max_paths]; // p* arrays: one int per path slot, evf::max_paths slots (constant declared outside this file)
00063      int ptimesPassedPs[evf::max_paths];
00064      int ptimesPassedL1[evf::max_paths];
00065      int ptimesPassed[evf::max_paths];
00066      int ptimesFailed[evf::max_paths];
00067      int ptimesExcept[evf::max_paths];
00068      int etimesRun[evf::max_endpaths]; // e* arrays: same set of counters, one int per end-path slot, evf::max_endpaths slots
00069      int etimesPassedPs[evf::max_endpaths];
00070      int etimesPassedL1[evf::max_endpaths];
00071      int etimesPassed[evf::max_endpaths];
00072      int etimesFailed[evf::max_endpaths];
00073      int etimesExcept[evf::max_endpaths];
00074    };
00075 
00076   }
00077   typedef std::map<std::string,internal::fu> fmap;
00078   typedef fmap::iterator ifmap;
00079   
00080   class iDie : public xdaq::Application,
00081     public xdata::ActionListener
00082   {
00083   public:
00084     //
00085     // xdaq instantiator macro
00086     //
00087     XDAQ_INSTANTIATOR();
00088   
00089     
00090     //
00091     // construction/destruction
00092     //
00093     iDie(xdaq::ApplicationStub *s);
00094     virtual ~iDie();
00095     //UI
00096     void defaultWeb(xgi::Input *in,xgi::Output *out)
00097       throw (xgi::exception::Exception);
00098     void summaryTable(xgi::Input *in,xgi::Output *out)
00099       throw (xgi::exception::Exception);
00100     void detailsTable(xgi::Input *in,xgi::Output *out)
00101       throw (xgi::exception::Exception);
00102     void dumpTable(xgi::Input *in,xgi::Output *out)
00103       throw (xgi::exception::Exception);
00104     void updater(xgi::Input *in,xgi::Output *out)
00105       throw (xgi::exception::Exception);
00106     void iChoke(xgi::Input *in,xgi::Output *out)
00107       throw (xgi::exception::Exception);
00108     void iChokeMiniInterface(xgi::Input *in,xgi::Output *out)
00109       throw (xgi::exception::Exception);
00110     void spotlight(xgi::Input *in,xgi::Output *out)
00111       throw (xgi::exception::Exception);
00112     //AI
00113     void postEntry(xgi::Input*in,xgi::Output*out)
00114       throw (xgi::exception::Exception);
00115     void postEntryiChoke(xgi::Input*in,xgi::Output*out)
00116       throw (xgi::exception::Exception);
00117     
00118     // *fake* fsm soap command callback
00119     xoap::MessageReference fsmCallback(xoap::MessageReference msg)
00120       throw (xoap::exception::Exception);
00121 
00122     // xdata:ActionListener interface
00123     void actionPerformed(xdata::Event& e);
00124 
00125 
00126   private:
00127 
00128     struct sorted_indices{
00129       sorted_indices(const std::vector<int> &arr) : arr_(arr)
00130       {
00131         ind_.resize(arr_.size(),0);
00132         unsigned int i = 0;
00133         while(i<ind_.size()) {ind_[i] = i; i++;}
00134         std::sort(ind_.rbegin(),ind_.rend(),*this);
00135       }
00136       int operator[](size_t ind) const {return arr_[ind_[ind]];}
00137       
00138       bool operator()(const size_t a, const size_t b) const
00139       {
00140         return arr_[a]<arr_[b];
00141       }
00142       int ii(size_t ind){return ind_[ind];}
00143       std::vector<int> ind_;
00144       const std::vector<int> &arr_;
00145     };
00146     //
00147     // private member functions
00148     //
00149     class lsStat;
00150     class commonLsStat;
00151     
00152     void reset();
00153     void parseModuleLegenda(std::string);
00154     void parseModuleHisto(const char *, unsigned int);
00155     void parsePathLegenda(std::string);
00156     void parsePathHisto(const unsigned char *, unsigned int);
00157     void initFramework();
00158     void deleteFramework();
00159     void initMonitorElements();
00160     void fillDQMStatHist(unsigned int nbsIdx, unsigned int lsid);
00161     void fillDQMModFractionHist(unsigned int nbsIdx, unsigned int lsid, unsigned int nonIdle,
00162                                  std::vector<std::pair<unsigned int, unsigned int>> offenders);
00163  
00164     void updateRollingHistos(unsigned int nbsIdx, unsigned int lsid, lsStat & lst, commonLsStat & clst, bool roll);
00165     void doFlush();
00166     void perLumiFileSaver(unsigned int lsid);
00167     //
00168     // member data
00169     //
00170 
00171     // message logger
00172     Logger                          log_;
00173     std::string                     dqmState_;          
00174     // monitored parameters
00175     xdata::String                   url_;
00176     xdata::String                   class_;
00177     xdata::UnsignedInteger32        instance_;
00178     xdata::String                   hostname_;
00179     xdata::UnsignedInteger32        runNumber_;
00180     xdata::String                   dqmCollectorHost_;
00181     xdata::String                   dqmCollectorPort_;
00182     fmap                            fus_;
00183     
00184     unsigned int                    totalCores_;
00185     unsigned int                    nstates_;   
00186     std::vector<int>                cpuentries_;
00187     std::vector<std::vector<int> >  cpustat_;
00188     std::vector<std::string>        mapmod_;
00189     unsigned int                    last_ls_;
00190     std::vector<TriggerReportStatic>trp_;
00191     std::vector<int>                trpentries_;
00192     std::vector<std::string>        mappath_;
00193     //root stuff
00194     TFile                          *f_;
00195     TTree                          *t_;
00196     TBranch                        *b_;
00197     TBranch                        *b1_;
00198     TBranch                        *b2_;
00199     TBranch                        *b3_;
00200     TBranch                        *b4_;
00201     int                            *datap_;
00202     TriggerReportStatic            *trppriv_;
00203     internal::rate                  r_;
00204 
00205     //message statistics 
00206     int                             nModuleLegendaMessageReceived_;
00207     int                             nPathLegendaMessageReceived_;
00208     int                             nModuleLegendaMessageWithDataReceived_;
00209     int                             nPathLegendaMessageWithDataReceived_;
00210     int                             nModuleHistoMessageReceived_;
00211     int                             nPathHistoMessageReceived_;
00212     timeval                         runStartDetectedTimeStamp_;
00213     timeval                         lastModuleLegendaMessageTimeStamp_;
00214     timeval                         lastPathLegendaMessageTimeStamp_;
00215 
00216     //DQM histogram statistics
00217     std::vector<unsigned int> epInstances;
00218     std::vector<unsigned int> epMax;
00219     std::vector<float> HTscaling;
00220     std::vector<unsigned int> nbMachines;
00221     std::vector<float> machineWeight;
00222     std::vector<float> machineWeightInst;
00223 
00224     class commonLsStat {
00225       
00226       public:
00227       unsigned int ls_;
00228       std::vector<unsigned int> rateVec_;
00229       std::vector<float> busyVec_;
00230       std::vector<float> busyCPUVec_;
00231       std::vector<float> busyVecTheor_;
00232       std::vector<float> busyCPUVecTheor_;
00233       std::vector<unsigned int> nbMachines;
00234       commonLsStat(unsigned int lsid,unsigned int classes) {
00235         for (size_t i=0;i<classes;i++) {
00236           rateVec_.push_back(0.);
00237           busyVec_.push_back(0.);
00238           busyCPUVec_.push_back(0.);
00239           busyVecTheor_.push_back(0.);
00240           busyCPUVecTheor_.push_back(0.);
00241           nbMachines.push_back(0);
00242         }
00243         ls_=lsid;
00244       }
00245       void setBusyForClass(unsigned int classIdx,unsigned int rate,float busy,float busyTheor, float busyCPU, float busyCPUTheor, unsigned int nMachineReports) {
00246         rateVec_[classIdx]=rate;
00247         busyVec_[classIdx]=busy;
00248         busyCPUVec_[classIdx]=busyCPU;
00249         busyVecTheor_[classIdx]=busyTheor;
00250         busyCPUVecTheor_[classIdx]=busyCPUTheor;
00251         nbMachines[classIdx]=nMachineReports;
00252       }
00253 
00254       unsigned int getTotalRate() {
00255         unsigned int totRate=0;
00256         for (size_t i=0;i<rateVec_.size();i++) totRate+=rateVec_[i];
00257         return totRate;
00258       } 
00259 
00260       float getBusyTotalFrac(bool procstat,std::vector<float> & machineWeightInst) {
00261         double sum=0;
00262         double sumMachines=0;
00263         for (size_t i=0;i<busyVec_.size();i++) {
00264           if (!procstat)
00265             sum+=machineWeightInst[i]*nbMachines.at(i)*busyVec_[i];
00266           else
00267             sum+=machineWeightInst[i]*nbMachines.at(i)*busyCPUVec_[i];
00268           sumMachines+=machineWeightInst[i]*nbMachines.at(i);
00269         }
00270         if (sumMachines>0)
00271           return float(sum/sumMachines);
00272         else return 0.;
00273       }
00274 
00275       float getBusyTotalFracTheor(bool procstat,std::vector<float> & machineWeight) {
00276         float sum=0;
00277         float sumMachines=0;
00278         for (size_t i=0;i<busyVecTheor_.size() && i<nbMachines.size();i++) {
00279           if (!procstat)
00280             sum+=machineWeight[i]*nbMachines[i]*busyVecTheor_[i];
00281           else
00282             sum+=machineWeight[i]*nbMachines[i]*busyCPUVecTheor_[i];
00283           sumMachines+=machineWeight[i]*nbMachines[i];
00284         }
00285         if (sumMachines>0)
00286           return sum/sumMachines;
00287         else return 0.;
00288       }
00289 
00290       unsigned int getNReports() {
00291         unsigned int sum=0;
00292         for (size_t i=0;i<nbMachines.size();i++) sum+=nbMachines[i];
00293         return sum;
00294       }
00295 
00296       std::string printInfo() {
00297         std::ostringstream info;
00298         for (size_t i=0;i<rateVec_.size();i++) {
00299           info << i << "/r:" << rateVec_[i] <<"/b:"<<busyVec_[i]<<"/n:"<<nbMachines[i]<<"; ";
00300         }
00301         return info.str();
00302       }
00303     };
00304 
00305     class lsStat {
00306       public:
00307       unsigned int ls_;
00308       bool updated_;
00309       unsigned int nbSubs_;
00310       unsigned int nSampledNonIdle_;
00311       unsigned int nSampledNonIdle2_;
00312       unsigned int nSampledIdle_;
00313       unsigned int nSampledIdle2_;
00314       unsigned int nProc_;
00315       unsigned int nProc2_;
00316       unsigned int nCPUBusy_;
00317       unsigned int nReports_;
00318       unsigned int nMaxReports_;
00319       double rateAvg;
00320       double rateErr;
00321       double evtTimeAvg;
00322       double evtTimeErr;
00323       double fracWaitingAvg;
00324       double fracCPUBusy_;
00325       unsigned int nmodulenames_;
00326       std::pair<unsigned int,unsigned int> *moduleSamplingSums;
00327 
00328       lsStat(unsigned int ls, unsigned int nbSubs,unsigned int maxreps,unsigned int nmodulenames):
00329         ls_(ls),updated_(false),nbSubs_(nbSubs),
00330         nSampledNonIdle_(0),nSampledNonIdle2_(0),nSampledIdle_(0),nSampledIdle2_(0),
00331         nProc_(0),nProc2_(0),nCPUBusy_(0),nReports_(0),nMaxReports_(maxreps),nmodulenames_(nmodulenames)
00332       {
00333         moduleSamplingSums = new std::pair<unsigned int,unsigned int>[nmodulenames_];
00334         for (unsigned int i=0;i<nmodulenames_;i++) {
00335           moduleSamplingSums[i].first=i;
00336           moduleSamplingSums[i].second=0;
00337         }
00338       }
00339 
00340       void update(unsigned int nSampledNonIdle,unsigned int nSampledIdle, unsigned int nProc,unsigned int ncpubusy) {
00341         nReports_++;
00342         nSampledNonIdle_+=nSampledNonIdle;
00343         nSampledNonIdle2_+=pow(nSampledNonIdle,2);
00344         nSampledIdle_+=nSampledIdle;
00345         nSampledIdle2_+=pow(nSampledIdle,2);
00346         nProc_+=nProc;
00347         nProc2_+=pow(nProc,2);
00348         nCPUBusy_+=ncpubusy;
00349         updated_=true;
00350       }
00351 
00352       std::pair<unsigned int,unsigned int> * getModuleSamplingPtr() {
00353         return moduleSamplingSums;
00354       }
00355 
00356       void deleteModuleSamplingPtr() {
00357         delete moduleSamplingSums;
00358         moduleSamplingSums=nullptr;
00359         nmodulenames_=0;
00360       }
00361 
00362       void calcStat()
00363       {
00364         if (!updated_) return;
00365         rateAvg=nProc_ / 23.;
00366         rateErr=sqrt(fabs(nProc2_ - pow(nProc_,2)))/23.;
00367         if (rateAvg==0.) {rateErr=0.;evtTimeAvg=0.;evtTimeErr=0.;fracWaitingAvg=0;}
00368         else {
00369           if (nSampledNonIdle_+nSampledIdle_!=0) {
00370             float nAllInv = 1./(nSampledNonIdle_+nSampledIdle_);
00371             fracWaitingAvg= nSampledIdle_*nAllInv;
00372             double nSampledIdleErr2=fabs(nSampledIdle2_ - pow(nSampledIdle_,2));
00373             double nSampledNonIdleErr2=fabs(nSampledNonIdle2_ - pow(nSampledNonIdle_,2));
00374             double fracWaitingAvgErr= sqrt(
00375                                     (pow(nSampledIdle_,2)*nSampledNonIdleErr2
00376                                      + pow(nSampledNonIdle_,2)*nSampledIdleErr2))*pow(nAllInv,2);
00377             if (rateAvg) {
00378               float rateAvgInv=1./rateAvg;
00379               evtTimeAvg=nbSubs_ * nReports_ * (1.-fracWaitingAvg)*rateAvgInv;
00380               evtTimeErr = nbSubs_ * nReports_ * sqrt(pow(fracWaitingAvg*rateErr*pow(rateAvgInv,2),2) + pow(fracWaitingAvgErr*rateAvgInv,2));
00381             }
00382             else {
00383               evtTimeAvg=0;
00384               evtTimeErr=0;
00385             }
00386           }
00387         }
00388         if (nReports_) fracCPUBusy_=nCPUBusy_/(nReports_*1000.);
00389         else fracCPUBusy_=0.;
00390         updated_=false;
00391       }
00392 
00393       float getRate() {
00394         if (updated_) calcStat();
00395         return rateAvg;
00396       }
00397 
00398       float getRateErr() {
00399         if (updated_) calcStat();
00400         return rateErr;
00401       }
00402 
00403       float getRatePerMachine() {
00404         if (updated_) calcStat();
00405         if (nReports_)
00406         return rateAvg/(1.*nReports_);
00407         return 0.;
00408       }
00409 
00410       float getRateErrPerMachine() {
00411         if (updated_) calcStat();
00412         if (nReports_)
00413         return rateErr/(1.*nReports_);
00414         return 0.;
00415       }
00416 
00417       float getEvtTime() {
00418         if (updated_) calcStat();
00419         return evtTimeAvg;
00420       }
00421 
00422       float getEvtTimeErr() {
00423         if (updated_) calcStat();
00424         return evtTimeErr;
00425       }
00426 
00427       unsigned int getNSampledNonIdle() {
00428         if (updated_) calcStat();
00429         return nSampledNonIdle_;
00430       }
00431 
00432       float getFracBusy() {
00433         if (updated_) calcStat();
00434         return 1.-fracWaitingAvg;
00435       }
00436 
00437       float getFracCPUBusy() {
00438         if (updated_) calcStat();
00439         return fracCPUBusy_;
00440       }
00441 
00442       unsigned int getReports() {
00443         return nReports_;
00444       }
00445 
00446       std::vector<std::pair<unsigned int, unsigned int>> getOffendersVector() {
00447         std::vector<std::pair<unsigned int, unsigned int>> ret;
00448         if (updated_) calcStat();
00449         if (moduleSamplingSums) {
00450           std::qsort((void *)moduleSamplingSums, nmodulenames_,
00451                      sizeof(std::pair<unsigned int,unsigned int>), modlistSortFunction);
00452           unsigned int count=0;
00453           unsigned int saveidx=0;
00454           while (saveidx < MODLZSIZE && count<nmodulenames_ && saveidx<MODLZSIZE)
00455           {
00456             if (moduleSamplingSums[count].first==2) {count++;continue;}
00457             ret.push_back(moduleSamplingSums[count]);
00458             saveidx++;
00459             count++;
00460           }
00461         }
00462         return ret;
00463       }
00464     };
00465 
00466 
00467     //DQM
00468     boost::shared_ptr<std::vector<edm::ParameterSet> > pServiceSets_;
00469     edm::ServiceToken               serviceToken_;
00470     edm::EventProcessor             *evtProcessor_;
00471     bool                            meInitialized_;
00472     DQMService                      *dqmService_;
00473     DQMStore                        *dqmStore_;
00474     std::string                     configString_;
00475     xdata::Boolean                  dqmEnabled_;
00476 
00477     std::map<unsigned int,int> nbSubsList;
00478     std::map<int,unsigned int> nbSubsListInv;
00479     unsigned int nbSubsClasses;
00480     std::vector<MonitorElement*> meVecRate_;
00481     std::vector<MonitorElement*> meVecTime_;
00482     std::vector<MonitorElement*> meVecOffenders_;
00483     MonitorElement * rateSummary_;
00484     MonitorElement * timingSummary_;
00485     MonitorElement * busySummary_;
00486     MonitorElement * busySummary2_;
00487     MonitorElement * fuReportsSummary_;
00488     MonitorElement * daqBusySummary_;
00489     unsigned int summaryLastLs_;
00490     std::vector<std::map<unsigned int, unsigned int> > occupancyNameMap;
00491     //1 queue per number of subProcesses (and one common)
00492     std::deque<commonLsStat> commonLsHistory;
00493     std::deque<lsStat> *lsHistory;
00494 
00495     std::vector<unsigned int> currentLs_;
00496 
00497     xdata::UnsignedInteger32 saveLsInterval_;
00498     unsigned int ilumiprev_;
00499     std::list<std::string> pastSavedFiles_;
00500     xdata::String dqmSaveDir_;
00501     xdata::Boolean dqmFilesWritable_;
00502     xdata::String topLevelFolder_;
00503     unsigned int savedForLs_;
00504     std::string fileBaseName_;
00505     bool writeDirectoryPresent_;
00506   }; // class iDie
00507 
00508   int modlistSortFunction( const void *a, const void *b)
00509   {
00510     std::pair<unsigned int,unsigned int> intOne = *((std::pair<unsigned int,unsigned int>*)a);
00511     std::pair<unsigned int,unsigned int> intTwo = *((std::pair<unsigned int,unsigned int>*)b);
00512     if (intOne.second > intTwo.second)
00513       return -1;
00514     if (intOne.second == intTwo.second)
00515       return 0;
00516     return 1;
00517   }
00518 
00519 
00520 } // namespace evf
00521 
00522 
00523 #endif