CMS 3D CMS Logo

/data/refman/pasoursint/CMSSW_5_3_0/src/EventFilter/ResourceBroker/src/Stopping.cc

Go to the documentation of this file.
00001 
00006 #include "EventFilter/ResourceBroker/interface/RBStateMachine.h"
00007 //#include "EventFilter/ResourceBroker/interface/IPCMethod.h"
00008 #include "EventFilter/ResourceBroker/interface/SharedResources.h"
00009 
00010 #include <iostream>
00011 #include <vector>
00012 
00013 using std::cout;
00014 using std::endl;
00015 using std::vector;
00016 using std::string;
00017 using namespace evf::rb_statemachine;
00018 
00019 // entry action, state notification, state action
00020 //______________________________________________________________________________
00021 void Stopping::do_entryActionWork() {
00022 }
00023 
00024 void Stopping::do_stateNotify() {
00025         SharedResourcesPtr_t res = outermost_context().getSharedResources();
00026         LOG4CPLUS_INFO(res->log_, "--> ResourceBroker: NEW STATE: " << stateName());
00027         outermost_context().setExternallyVisibleState(stateName());
00028         outermost_context().setInternalStateName(stateName());
00029         // RCMS notification no longer required here
00030         // this is done in FUResourceBroker in SOAP reply
00031         //outermost_context().rcmsStateChangeNotify();
00032 }
00033 
00034 void Stopping::do_stateAction() const {
00035         SharedResourcesPtr_t res = outermost_context().getSharedResources();
00036 
00037         try {
00038                 LOG4CPLUS_INFO(res->log_, "Start stopping :) ...");
00039                 res->resourceStructure_->shutDownClients();
00040                 timeval now;
00041                 timeval then;
00042                 gettimeofday(&then, 0);
00043                 while (!res->resourceStructure_->isReadyToShutDown()) {
00044                         ::usleep(res->resourceStructureTimeout_.value_ * 10);
00045                         gettimeofday(&now, 0);
00046                         if ((unsigned int) (now.tv_sec - then.tv_sec)
00047                                         > res->resourceStructureTimeout_.value_ / 10000) {
00048                                 cout << "times: " << now.tv_sec << " " << then.tv_sec << " "
00049                                                 << res->resourceStructureTimeout_.value_ / 10000
00050                                                 << endl;
00051                                 LOG4CPLUS_WARN(res->log_,
00052                                                 "Some Process did not detach - going to Emergency stop!");
00053 
00057                                 res->lockRSAccess();
00058                                 emergencyStop();
00059                                 res->unlockRSAccess();
00060 
00061                                 break;
00062                         }
00063                 }
00064 
00065                 if (res->resourceStructure_->isReadyToShutDown()) {
00066 
00067                         // reset only if there was no emergency stop
00068                         if (res->allowAccessToResourceStructure_) {
00069                                 // UPDATED: release resources
00070                                 res->resourceStructure_->releaseResources();
00071                                 // UPDATED: forget pending allocates to BU
00072                                 res->resourceStructure_->resetPendingAllocates();
00073                                 // UPDATE: reset the underlying IPC method
00074                                 res->resourceStructure_->resetIPC();
00075                         }
00076 
00077                         LOG4CPLUS_INFO(res->log_, "Finished stopping!");
00078                         EventPtr stopDone(new StopDone());
00079                         res->commands_.enqEvent(stopDone);
00080                 }
00081         } catch (xcept::Exception &e) {
00082                 moveToFailedState(e);
00083         }
00084 }
00085 
00086 /*
00087  * I2O capability
00088  */
00089 bool Stopping::discardDataEvent(MemRef_t* bufRef) const {
00090         SharedResourcesPtr_t res = outermost_context().getSharedResources();
00091         bool returnValue = false;
00092         try {
00093                 returnValue = res->resourceStructure_->discardDataEvent(bufRef);
00094         } catch (evf::Exception& e) {
00095                 moveToFailedState(e);
00096         }
00097         return returnValue;
00098 }
00099 bool Stopping::discardDqmEvent(MemRef_t* bufRef) const {
00100         SharedResourcesPtr_t res = outermost_context().getSharedResources();
00101         bool returnValue = false;
00102         try {
00103                 returnValue = res->resourceStructure_->discardDqmEvent(bufRef);
00104         } catch (evf::Exception& e) {
00105                 moveToFailedState(e);
00106         }
00107         return returnValue;
00108 }
00109 
00110 // construction / destruction
00111 //______________________________________________________________________________
00112 Stopping::Stopping(my_context c) :
00113         my_base(c) {
00114         safeEntryAction();
00115 }
00116 
00117 Stopping::~Stopping() {
00118         safeExitAction();
00119 }
00120 
00121 void Stopping::emergencyStop() const {
00122         SharedResourcesPtr_t res = outermost_context().getSharedResources();
00123         IPCMethod* resourceStructure = res->resourceStructure_;
00124 
00125         LOG4CPLUS_WARN(res->log_, "in Emergency stop - handle non-clean stops");
00126 
00127         // UPDATE: while in emergency stop, access is no longer allowed to ResourceStructure
00128         // I2O messages from SM will be rejected
00129         res->allowAccessToResourceStructure_ = false;
00130 
00131         vector < pid_t > client_prc_ids = resourceStructure->clientPrcIds();
00132         for (UInt_t i = 0; i < client_prc_ids.size(); i++) {
00133                 pid_t pid = client_prc_ids[i];
00134                 cout << "B: killing process " << i << " pid= " << pid << endl;
00135                 if (pid != 0) {
00136                         //assume processes are dead by now
00137                         if (!resourceStructure->handleCrashedEP(res->runNumber_, pid))
00138                                 res->nbTimeoutsWithoutEvent_++;
00139                         else
00140                                 res->nbTimeoutsWithEvent_++;
00141                 }
00142         }
00143         resourceStructure->lastResort();
00144 	::sleep(1);
00145         if (!resourceStructure->isReadyToShutDown()) {
00146                 res->reasonForFailed_
00147                                 = "EmergencyStop: failed to shut down ResourceTable";
00148                 XCEPT_RAISE(evf::Exception, res->reasonForFailed_);
00149         }
00150 
00151         res->printWorkLoopStatus();
00152         res->lock();
00153 
00154         LOG4CPLUS_WARN(res->log_, "Deleting the resource structure!");
00155         delete res->resourceStructure_;
00156         res->resourceStructure_ = 0;
00157 
00158         cout << "cycle through resourcetable config " << endl;
00159         res->configureResources(outermost_context().getApp());
00160         res->unlock();
00161         if (res->shmInconsistent_)
00162                 XCEPT_RAISE(evf::Exception, "Inconsistent shm state");
00163         cout << "done with emergency stop" << endl;
00164 }
00165 
00166 // exit action, state name, move to failed state
00167 //______________________________________________________________________________
00168 void Stopping::do_exitActionWork() {
00169 }
00170 
00171 string Stopping::do_stateName() const {
00172         return std::string("Stopping");
00173 }
00174 
00175 void Stopping::do_moveToFailedState(xcept::Exception& exception) const {
00176         SharedResourcesPtr_t res = outermost_context().getSharedResources();
00177         res->reasonForFailed_ = exception.what();
00178         LOG4CPLUS_ERROR(res->log_,
00179                         "Moving to FAILED state! Reason: " << exception.what());
00180         EventPtr fail(new Fail());
00181         res->commands_.enqEvent(fail);
00182 }