CMS 3D CMS Logo

List of all members | Public Member Functions | Static Public Member Functions | Private Member Functions | Private Attributes
edm::ZombieKillerService Class Reference

Public Member Functions

 ZombieKillerService (edm::ParameterSet const &, edm::ActivityRegistry &)
 

Static Public Member Functions

static void fillDescriptions (ConfigurationDescriptions &descriptions)
 

Private Member Functions

void checkForZombie ()
 
void notAZombieYet ()
 
void startThread ()
 
void stopThread ()
 

Private Attributes

const unsigned int m_checkThreshold
 
bool m_jobDone
 
std::condition_variable m_jobDoneCondition
 
std::mutex m_jobDoneMutex
 
std::atomic< unsigned int > m_numberChecksWhenNotAlive
 
const unsigned int m_secsBetweenChecks
 
std::atomic< bool > m_stillAlive
 
std::thread m_watchingThread
 

Detailed Description

Definition at line 28 of file ZombieKillerService.cc.

Constructor & Destructor Documentation

ZombieKillerService::ZombieKillerService ( edm::ParameterSet const &  iPSet,
edm::ActivityRegistry &  iRegistry 
)

Definition at line 71 of file ZombieKillerService.cc.

References notAZombieYet(), startThread(), stopThread(), edm::ActivityRegistry::watchPostBeginJob(), edm::ActivityRegistry::watchPostEndJob(), edm::ActivityRegistry::watchPostModuleBeginStream(), edm::ActivityRegistry::watchPostModuleEndJob(), edm::ActivityRegistry::watchPostModuleEndStream(), edm::ActivityRegistry::watchPostModuleEvent(), edm::ActivityRegistry::watchPostModuleGlobalBeginLumi(), edm::ActivityRegistry::watchPostModuleGlobalBeginRun(), edm::ActivityRegistry::watchPostModuleGlobalEndLumi(), edm::ActivityRegistry::watchPostModuleGlobalEndRun(), edm::ActivityRegistry::watchPostModuleStreamBeginLumi(), edm::ActivityRegistry::watchPostModuleStreamBeginRun(), edm::ActivityRegistry::watchPostModuleStreamEndLumi(), edm::ActivityRegistry::watchPostModuleStreamEndRun(), edm::ActivityRegistry::watchPostSourceEvent(), edm::ActivityRegistry::watchPostSourceLumi(), edm::ActivityRegistry::watchPostSourceRun(), edm::ActivityRegistry::watchPreModuleBeginStream(), edm::ActivityRegistry::watchPreModuleEndJob(), edm::ActivityRegistry::watchPreModuleEndStream(), edm::ActivityRegistry::watchPreModuleEvent(), edm::ActivityRegistry::watchPreModuleGlobalBeginLumi(), edm::ActivityRegistry::watchPreModuleGlobalBeginRun(), edm::ActivityRegistry::watchPreModuleGlobalEndLumi(), edm::ActivityRegistry::watchPreModuleGlobalEndRun(), edm::ActivityRegistry::watchPreModuleStreamBeginLumi(), edm::ActivityRegistry::watchPreModuleStreamBeginRun(), edm::ActivityRegistry::watchPreModuleStreamEndLumi(), edm::ActivityRegistry::watchPreModuleStreamEndRun(), edm::ActivityRegistry::watchPreSourceEvent(), edm::ActivityRegistry::watchPreSourceLumi(), and edm::ActivityRegistry::watchPreSourceRun().

71  :
72 m_checkThreshold(iPSet.getUntrackedParameter<unsigned int>("numberOfAllowedFailedChecksInARow")),
73 m_secsBetweenChecks(iPSet.getUntrackedParameter<unsigned int>("secondsBetweenChecks")),
74 m_jobDone(false),
75 m_stillAlive(true),
77 {
78  iRegistry.watchPostBeginJob([this](){ startThread(); } );
79  iRegistry.watchPostEndJob([this]() {stopThread(); } );
80 
81  iRegistry.watchPreSourceRun([this](RunIndex){notAZombieYet();});
82  iRegistry.watchPostSourceRun([this](RunIndex){notAZombieYet();});
83 
86 
87  iRegistry.watchPreSourceEvent([this](StreamID){notAZombieYet();});
88  iRegistry.watchPostSourceEvent([this](StreamID){notAZombieYet();});
89 
90  iRegistry.watchPreModuleBeginStream([this](StreamContext const&, ModuleCallingContext const&){notAZombieYet();});
91  iRegistry.watchPostModuleBeginStream([this](StreamContext const&, ModuleCallingContext const&){notAZombieYet();});
92 
93  iRegistry.watchPreModuleEndStream([this](StreamContext const&, ModuleCallingContext const&){notAZombieYet();});
94  iRegistry.watchPostModuleEndStream([this](StreamContext const&, ModuleCallingContext const&){notAZombieYet();});
95 
96  iRegistry.watchPreModuleEndJob([this](ModuleDescription const&) {notAZombieYet();});
97  iRegistry.watchPostModuleEndJob([this](ModuleDescription const&) {notAZombieYet();});
98  iRegistry.watchPreModuleEvent([this](StreamContext const&, ModuleCallingContext const&){notAZombieYet();});
99  iRegistry.watchPostModuleEvent([this](StreamContext const&, ModuleCallingContext const&){notAZombieYet();});
100 
103 
104  iRegistry.watchPreModuleStreamEndRun([this](StreamContext const&, ModuleCallingContext const&){notAZombieYet();});
105  iRegistry.watchPostModuleStreamEndRun([this](StreamContext const&, ModuleCallingContext const&){notAZombieYet();});
106 
109 
110  iRegistry.watchPreModuleStreamEndLumi([this](StreamContext const&, ModuleCallingContext const&){notAZombieYet();});
112 
115 
116  iRegistry.watchPreModuleGlobalEndRun([this](GlobalContext const&, ModuleCallingContext const&){notAZombieYet();});
117  iRegistry.watchPostModuleGlobalEndRun([this](GlobalContext const&, ModuleCallingContext const&){notAZombieYet();});
118 
121 
122  iRegistry.watchPreModuleGlobalEndLumi([this](GlobalContext const&, ModuleCallingContext const&){notAZombieYet();});
124 
125 
126 }
void watchPostModuleGlobalEndLumi(PostModuleGlobalEndLumi::slot_type const &iSlot)
void watchPreModuleGlobalBeginRun(PreModuleGlobalBeginRun::slot_type const &iSlot)
void watchPostEndJob(PostEndJob::slot_type const &iSlot)
void watchPostModuleEndStream(PostModuleEndStream::slot_type const &iSlot)
void watchPreModuleEvent(PreModuleEvent::slot_type const &iSlot)
void watchPostModuleEvent(PostModuleEvent::slot_type const &iSlot)
const unsigned int m_checkThreshold
void watchPostModuleGlobalBeginLumi(PostModuleGlobalBeginLumi::slot_type const &iSlot)
void watchPostModuleStreamEndLumi(PostModuleStreamEndLumi::slot_type const &iSlot)
void watchPostModuleStreamBeginRun(PostModuleStreamBeginRun::slot_type const &iSlot)
void watchPostSourceEvent(PostSourceEvent::slot_type const &iSlot)
void watchPreModuleBeginStream(PreModuleBeginStream::slot_type const &iSlot)
void watchPreModuleGlobalEndRun(PreModuleGlobalEndRun::slot_type const &iSlot)
void watchPostSourceRun(PostSourceRun::slot_type const &iSlot)
void watchPreSourceLumi(PreSourceLumi::slot_type const &iSlot)
void watchPreModuleEndJob(PreModuleEndJob::slot_type const &iSlot)
void watchPreSourceRun(PreSourceRun::slot_type const &iSlot)
void watchPreModuleGlobalBeginLumi(PreModuleGlobalBeginLumi::slot_type const &iSlot)
void watchPostModuleStreamEndRun(PostModuleStreamEndRun::slot_type const &iSlot)
void watchPreModuleStreamBeginLumi(PreModuleStreamBeginLumi::slot_type const &iSlot)
void watchPostModuleBeginStream(PostModuleBeginStream::slot_type const &iSlot)
std::atomic< unsigned int > m_numberChecksWhenNotAlive
void watchPostSourceLumi(PostSourceLumi::slot_type const &iSlot)
void watchPostModuleGlobalEndRun(PostModuleGlobalEndRun::slot_type const &iSlot)
void watchPostModuleStreamBeginLumi(PostModuleStreamBeginLumi::slot_type const &iSlot)
void watchPreModuleStreamEndLumi(PreModuleStreamEndLumi::slot_type const &iSlot)
void watchPreModuleStreamBeginRun(PreModuleStreamBeginRun::slot_type const &iSlot)
void watchPreModuleEndStream(PreModuleEndStream::slot_type const &iSlot)
const unsigned int m_secsBetweenChecks
std::atomic< bool > m_stillAlive
void watchPreModuleStreamEndRun(PreModuleStreamEndRun::slot_type const &iSlot)
void watchPostModuleGlobalBeginRun(PostModuleGlobalBeginRun::slot_type const &iSlot)
void watchPreSourceEvent(PreSourceEvent::slot_type const &iSlot)
void watchPostModuleEndJob(PostModuleEndJob::slot_type const &iSlot)
void watchPreModuleGlobalEndLumi(PreModuleGlobalEndLumi::slot_type const &iSlot)
void watchPostBeginJob(PostBeginJob::slot_type const &iSlot)
convenience function for attaching to signal

Member Function Documentation

void ZombieKillerService::checkForZombie ( )
private

Definition at line 159 of file ZombieKillerService.cc.

References m_checkThreshold, m_numberChecksWhenNotAlive, m_secsBetweenChecks, and m_stillAlive.

Referenced by startThread().

159  {
160  if (not m_stillAlive) {
163  edm::LogError("JobStuck")<<"Too long since the job has last made progress.";
164  std::terminate();
165  } else {
166  edm::LogWarning("JobProgressing")<<"It has been "<<m_numberChecksWhenNotAlive*m_secsBetweenChecks<<" seconds since job seen progressing";
167  }
168  }
169  m_stillAlive = false;
170 }
const unsigned int m_checkThreshold
std::atomic< unsigned int > m_numberChecksWhenNotAlive
const unsigned int m_secsBetweenChecks
std::atomic< bool > m_stillAlive
void ZombieKillerService::fillDescriptions ( ConfigurationDescriptions &  descriptions)
static

Definition at line 201 of file ZombieKillerService.cc.

References edm::ConfigurationDescriptions::add(), edm::ParameterSetDescription::addUntracked(), and DEFINE_FWK_SERVICE.

201  {
203  desc.addUntracked<unsigned int>("secondsBetweenChecks", 60)->setComment("Number of seconds to wait between checking if progress has been made.");
204  desc.addUntracked<unsigned int>("numberOfAllowedFailedChecksInARow", 3)->setComment("Number of allowed checks in a row with no progress.");
205  descriptions.add("ZombieKillerService", desc);
206 }
ParameterDescriptionBase * addUntracked(U const &iLabel, T const &value)
void add(std::string const &label, ParameterSetDescription const &psetDescription)
void ZombieKillerService::notAZombieYet ( )
private

Definition at line 153 of file ZombieKillerService.cc.

References m_numberChecksWhenNotAlive, and m_stillAlive.

Referenced by ZombieKillerService().

153  {
155  m_stillAlive = true;
156 }
std::atomic< unsigned int > m_numberChecksWhenNotAlive
std::atomic< bool > m_stillAlive
void ZombieKillerService::startThread ( )
private

Definition at line 173 of file ZombieKillerService.cc.

References checkForZombie(), CommonMethods::lock(), m_jobDoneCondition, m_jobDoneMutex, m_secsBetweenChecks, m_watchingThread, and seconds().

Referenced by ZombieKillerService().

173  {
174  m_watchingThread = std::thread([this]() {
175 
176  std::unique_lock<std::mutex> lock(m_jobDoneMutex);
177  while(not m_jobDoneCondition.wait_for(lock,
179  [this]()->bool
180  {
181  return m_jobDone;
182  }))
183  {
184  //we timed out
185  checkForZombie();
186  }
187  });
188 }
double seconds()
const unsigned int m_secsBetweenChecks
std::condition_variable m_jobDoneCondition
void ZombieKillerService::stopThread ( )
private

Definition at line 191 of file ZombieKillerService.cc.

References m_jobDone, m_jobDoneCondition, m_jobDoneMutex, and m_watchingThread.

Referenced by ZombieKillerService().

191  {
192  {
193  std::lock_guard<std::mutex> guard(m_jobDoneMutex);
194  m_jobDone=true;
195  }
196  m_jobDoneCondition.notify_all();
197  m_watchingThread.join();
198 }
std::condition_variable m_jobDoneCondition

Member Data Documentation

const unsigned int edm::ZombieKillerService::m_checkThreshold
private

Definition at line 35 of file ZombieKillerService.cc.

Referenced by checkForZombie().

bool edm::ZombieKillerService::m_jobDone
private

Definition at line 40 of file ZombieKillerService.cc.

Referenced by stopThread().

std::condition_variable edm::ZombieKillerService::m_jobDoneCondition
private

Definition at line 38 of file ZombieKillerService.cc.

Referenced by startThread(), and stopThread().

std::mutex edm::ZombieKillerService::m_jobDoneMutex
private

Definition at line 39 of file ZombieKillerService.cc.

Referenced by startThread(), and stopThread().

std::atomic<unsigned int> edm::ZombieKillerService::m_numberChecksWhenNotAlive
private

Definition at line 42 of file ZombieKillerService.cc.

Referenced by checkForZombie(), and notAZombieYet().

const unsigned int edm::ZombieKillerService::m_secsBetweenChecks
private

Definition at line 36 of file ZombieKillerService.cc.

Referenced by checkForZombie(), and startThread().

std::atomic<bool> edm::ZombieKillerService::m_stillAlive
private

Definition at line 41 of file ZombieKillerService.cc.

Referenced by checkForZombie(), and notAZombieYet().

std::thread edm::ZombieKillerService::m_watchingThread
private

Definition at line 37 of file ZombieKillerService.cc.

Referenced by startThread(), and stopThread().