CMS 3D CMS Logo

ZombieKillerService.cc
Go to the documentation of this file.
1 // -*- C++ -*-
2 //
3 // Package: FWCore/Services
4 // Class : ZombieKillerService
5 //
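6 // Implementation:
7 // A watcher thread, started after beginJob, wakes every `secondsBetweenChecks` seconds. Framework
  // signals set a liveness flag in between; each check clears it again. When too many checks in a row
  // find the flag still cleared (limit: `numberOfAllowedFailedChecksInARow`), an error is logged and
  // std::terminate() is called. The thread is woken and joined after endJob.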
8 //
9 // Original Author: Chris Jones
10 // Created: Sat, 22 Mar 2014 16:25:47 GMT
11 //
12 
13 // system include files
14 #include <atomic>
15 #include <thread>
16 #include <mutex>
17 #include <condition_variable>
18 #include <exception>
19 
20 // user include files
26 
// Declaration of the ZombieKillerService watchdog service.
// NOTE(review): listing lines 28, 30 and 39 are absent from this extract. Presumably they hold the
// class header, the constructor declaration and the m_jobDoneMutex member that the thread
// start/stop code locks -- confirm against the repository copy of this file.
27 namespace edm {
29  public:
31 
    // Describes the two untracked configuration parameters and their default values.
32  static void fillDescriptions(ConfigurationDescriptions& descriptions);
33 
34  private:
    // Read from the "numberOfAllowedFailedChecksInARow" parameter in the constructor.
35  const unsigned int m_checkThreshold;
    // Read from the "secondsBetweenChecks" parameter; used as the wait_for timeout of the watcher.
36  const unsigned int m_secsBetweenChecks;
    // Background watcher; assigned when the thread is started and joined when it is stopped.
37  std::thread m_watchingThread;
    // The watcher waits on this with a timeout; it is notified once m_jobDone has been set.
38  std::condition_variable m_jobDoneCondition;
    // Wait predicate for m_jobDoneCondition; written and read while m_jobDoneMutex is held.
40  bool m_jobDone;
    // Set to true by notAZombieYet() and reset to false at the end of every watchdog check.
41  std::atomic<bool> m_stillAlive;
    // Multiplied by m_secsBetweenChecks to report how long the job has shown no progress.
    // Presumably the number of consecutive checks without progress (its increment is on a
    // listing line missing from this extract).
42  std::atomic<unsigned int> m_numberChecksWhenNotAlive;
43 
    // Called from every ActivityRegistry callback registered in the constructor.
44  void notAZombieYet();
    // Called by the watcher thread each time its timed wait expires.
45  void checkForZombie();
    // Registered on PostBeginJob.
46  void startThread();
    // Registered on PostEndJob.
47  void stopThread();
48  };
49 } // namespace edm
50 
51 using namespace edm;
52 
// Overload selected for ZombieKillerService pointers; it unconditionally returns true.
// NOTE(review): presumably the service framework uses this to keep a single instance for the
// whole process instead of one per sub-process -- confirm against the ServiceMaker machinery.
53 namespace edm {
54  namespace service {
55  inline bool isProcessWideService(ZombieKillerService const*) { return true; }
56  } // namespace service
57 } // namespace edm
58 
// Constructor initializer list and body. The signature (listing line 59) is absent from this
// extract; the body uses a parameter set named iPSet and an activity registry named iRegistry.
// Both thresholds are read as untracked unsigned parameters; the job starts in the "alive" state.
60  : m_checkThreshold(iPSet.getUntrackedParameter<unsigned int>("numberOfAllowedFailedChecksInARow")),
61  m_secsBetweenChecks(iPSet.getUntrackedParameter<unsigned int>("secondsBetweenChecks")),
62  m_jobDone(false),
63  m_stillAlive(true),
64  m_numberChecksWhenNotAlive(0) {
    // The watcher thread lives from just after beginJob until just after endJob.
65  iRegistry.watchPostBeginJob([this]() { startThread(); });
66  iRegistry.watchPostEndJob([this]() { stopThread(); });
67 
    // Every callback below only calls notAZombieYet(), i.e. any of these signals counts as progress.
    // Source activity: runs, luminosity blocks and events.
68  iRegistry.watchPreSourceRun([this](RunIndex) { notAZombieYet(); });
69  iRegistry.watchPostSourceRun([this](RunIndex) { notAZombieYet(); });
70 
71  iRegistry.watchPreSourceLumi([this](LuminosityBlockIndex) { notAZombieYet(); });
72  iRegistry.watchPostSourceLumi([this](LuminosityBlockIndex) { notAZombieYet(); });
73 
74  iRegistry.watchPreSourceEvent([this](StreamID) { notAZombieYet(); });
75  iRegistry.watchPostSourceEvent([this](StreamID) { notAZombieYet(); });
76 
    // Module activity: stream begin/end, end of job and per-event processing.
77  iRegistry.watchPreModuleBeginStream([this](StreamContext const&, ModuleCallingContext const&) { notAZombieYet(); });
78  iRegistry.watchPostModuleBeginStream([this](StreamContext const&, ModuleCallingContext const&) { notAZombieYet(); });
79 
80  iRegistry.watchPreModuleEndStream([this](StreamContext const&, ModuleCallingContext const&) { notAZombieYet(); });
81  iRegistry.watchPostModuleEndStream([this](StreamContext const&, ModuleCallingContext const&) { notAZombieYet(); });
82 
83  iRegistry.watchPreModuleEndJob([this](ModuleDescription const&) { notAZombieYet(); });
84  iRegistry.watchPostModuleEndJob([this](ModuleDescription const&) { notAZombieYet(); });
85  iRegistry.watchPreModuleEvent([this](StreamContext const&, ModuleCallingContext const&) { notAZombieYet(); });
86  iRegistry.watchPostModuleEvent([this](StreamContext const&, ModuleCallingContext const&) { notAZombieYet(); });
87 
    // Module stream-level run/lumi transitions.
    // NOTE(review): wherever a line starts directly with `[this](...)`, the `iRegistry.watch...(`
    // line that opens that call (listing lines 88, 90, 96, 98, 102, 105, 107, 113, 115, 119) is
    // missing from this extract.
89  [this](StreamContext const&, ModuleCallingContext const&) { notAZombieYet(); });
91  [this](StreamContext const&, ModuleCallingContext const&) { notAZombieYet(); });
92 
93  iRegistry.watchPreModuleStreamEndRun([this](StreamContext const&, ModuleCallingContext const&) { notAZombieYet(); });
94  iRegistry.watchPostModuleStreamEndRun([this](StreamContext const&, ModuleCallingContext const&) { notAZombieYet(); });
95 
97  [this](StreamContext const&, ModuleCallingContext const&) { notAZombieYet(); });
99  [this](StreamContext const&, ModuleCallingContext const&) { notAZombieYet(); });
100 
101  iRegistry.watchPreModuleStreamEndLumi([this](StreamContext const&, ModuleCallingContext const&) { notAZombieYet(); });
103  [this](StreamContext const&, ModuleCallingContext const&) { notAZombieYet(); });
104 
    // Module global-level run/lumi transitions (callbacks take a GlobalContext).
106  [this](GlobalContext const&, ModuleCallingContext const&) { notAZombieYet(); });
108  [this](GlobalContext const&, ModuleCallingContext const&) { notAZombieYet(); });
109 
110  iRegistry.watchPreModuleGlobalEndRun([this](GlobalContext const&, ModuleCallingContext const&) { notAZombieYet(); });
111  iRegistry.watchPostModuleGlobalEndRun([this](GlobalContext const&, ModuleCallingContext const&) { notAZombieYet(); });
112 
114  [this](GlobalContext const&, ModuleCallingContext const&) { notAZombieYet(); });
116  [this](GlobalContext const&, ModuleCallingContext const&) { notAZombieYet(); });
117 
118  iRegistry.watchPreModuleGlobalEndLumi([this](GlobalContext const&, ModuleCallingContext const&) { notAZombieYet(); });
120  [this](GlobalContext const&, ModuleCallingContext const&) { notAZombieYet(); });
121 }
122 
123 // ZombieKillerService::ZombieKillerService(const ZombieKillerService& rhs)
124 // {
125 // // do actual copying here;
126 // }
127 
128 //ZombieKillerService::~ZombieKillerService()
129 //{
130 //}
131 
132 //
133 // assignment operators
134 //
135 // const ZombieKillerService& ZombieKillerService::operator=(const ZombieKillerService& rhs)
136 // {
137 // //An exception safe implementation is
138 // ZombieKillerService temp(rhs);
139 // swap(temp);
140 //
141 // return *this;
142 // }
143 
144 //
145 // member functions
146 //
// Tail of a member function whose signature and first statement (listing lines 147-148) are
// absent from this extract; presumably ZombieKillerService::notAZombieYet().
// Raises the liveness flag so that the next watchdog check sees that progress was made.
149  m_stillAlive = true;
150 }
151 
// Body of a member function whose signature (listing line 152) is absent from this extract;
// presumably ZombieKillerService::checkForZombie(), which the watcher thread calls on every timeout.
// If no callback raised m_stillAlive since the previous check, the job is either killed or a
// warning is emitted; in all cases the flag is cleared again for the next interval.
153  if (not m_stillAlive) {
    // NOTE(review): listing lines 154-155 are missing here. Presumably they update
    // m_numberChecksWhenNotAlive and open the `if` that compares it with m_checkThreshold,
    // which the `} else {` below belongs to -- confirm against the repository copy.
    // Failure branch: report the stuck job, then abort the whole process.
156  edm::LogError("JobStuck") << "Too long since the job has last made progress.";
157  std::terminate();
158  } else {
    // Not fatal yet: report the stall length as (checks without progress) * (seconds per check).
159  edm::LogWarning("JobProgressing") << "It has been " << m_numberChecksWhenNotAlive * m_secsBetweenChecks
160  << " seconds since job seen progressing";
161  }
162  }
    // Re-arm the check: a callback must set the flag again before the next timeout.
163  m_stillAlive = false;
164 }
165 
// Body of a member function whose signature (listing line 166) is absent from this extract;
// presumably ZombieKillerService::startThread(), which the constructor registers on PostBeginJob.
// Launches the watcher thread. The thread holds m_jobDoneMutex except while blocked in wait_for.
167  m_watchingThread = std::thread([this]() {
168  std::unique_lock<std::mutex> lock(m_jobDoneMutex);
    // wait_for with a predicate returns the predicate's value: true once m_jobDone is set (the
    // loop ends), false when m_secsBetweenChecks seconds elapsed with m_jobDone still false.
169  while (not m_jobDoneCondition.wait_for(
170  lock, std::chrono::seconds(m_secsBetweenChecks), [this]() -> bool { return m_jobDone; })) {
171  //we timed out
    // wait_for has re-acquired the lock at this point, so the check runs with the mutex held.
172  checkForZombie();
173  }
174  });
175 }
176 
// Body of a member function whose signature (listing line 177) is absent from this extract;
// presumably ZombieKillerService::stopThread(), which the constructor registers on PostEndJob.
// Tells the watcher thread to finish and waits for it to exit.
178  {
    // Set the flag under the mutex so the watcher's predicate cannot miss the update.
179  std::lock_guard<std::mutex> guard(m_jobDoneMutex);
180  m_jobDone = true;
181  }
    // Notify after releasing the lock so the woken thread can acquire it immediately.
182  m_jobDoneCondition.notify_all();
    // NOTE(review): join() is unconditional and throws std::system_error if the thread is not
    // joinable -- confirm this can never run unless the watcher thread was started first.
183  m_watchingThread.join();
184 }
185 
// Body of fillDescriptions(). Its signature and the declaration of `desc` (listing lines
// 186-187) are absent from this extract.
// Declares both untracked parameters with their defaults and registers the description.
    // Interval between watchdog checks; defaults to 60.
188  desc.addUntracked<unsigned int>("secondsBetweenChecks", 60)
189  ->setComment("Number of seconds to wait between checking if progress has been made.");
    // Allowed number of consecutive checks without progress; defaults to 3.
190  desc.addUntracked<unsigned int>("numberOfAllowedFailedChecksInARow", 3)
191  ->setComment("Number of allowed checks in a row with no progress.");
    // Register the description under the service's configuration label.
192  descriptions.add("ZombieKillerService", desc);
193 }
194 
195 //
196 // const member functions
197 //
198 
199 //
200 // static member functions
201 //
203 
void watchPostModuleGlobalEndLumi(PostModuleGlobalEndLumi::slot_type const &iSlot)
void watchPreModuleGlobalBeginRun(PreModuleGlobalBeginRun::slot_type const &iSlot)
static void fillDescriptions(ConfigurationDescriptions &descriptions)
double seconds()
void watchPostEndJob(PostEndJob::slot_type const &iSlot)
void watchPostModuleEndStream(PostModuleEndStream::slot_type const &iSlot)
void watchPreModuleEvent(PreModuleEvent::slot_type const &iSlot)
static std::mutex mutex
Definition: Proxy.cc:8
bool isProcessWideService(TFileService const *)
Definition: TFileService.h:98
void watchPostModuleEvent(PostModuleEvent::slot_type const &iSlot)
const unsigned int m_checkThreshold
void watchPostModuleGlobalBeginLumi(PostModuleGlobalBeginLumi::slot_type const &iSlot)
void watchPostModuleStreamEndLumi(PostModuleStreamEndLumi::slot_type const &iSlot)
void watchPostModuleStreamBeginRun(PostModuleStreamBeginRun::slot_type const &iSlot)
void watchPostSourceEvent(PostSourceEvent::slot_type const &iSlot)
void watchPreModuleBeginStream(PreModuleBeginStream::slot_type const &iSlot)
Log< level::Error, false > LogError
void watchPreModuleGlobalEndRun(PreModuleGlobalEndRun::slot_type const &iSlot)
void watchPostSourceRun(PostSourceRun::slot_type const &iSlot)
void watchPreSourceLumi(PreSourceLumi::slot_type const &iSlot)
void watchPreModuleEndJob(PreModuleEndJob::slot_type const &iSlot)
void watchPreSourceRun(PreSourceRun::slot_type const &iSlot)
void watchPreModuleGlobalBeginLumi(PreModuleGlobalBeginLumi::slot_type const &iSlot)
void watchPostModuleStreamEndRun(PostModuleStreamEndRun::slot_type const &iSlot)
void watchPreModuleStreamBeginLumi(PreModuleStreamBeginLumi::slot_type const &iSlot)
#define DEFINE_FWK_SERVICE(type)
Definition: ServiceMaker.h:97
void watchPostModuleBeginStream(PostModuleBeginStream::slot_type const &iSlot)
std::atomic< unsigned int > m_numberChecksWhenNotAlive
void watchPostSourceLumi(PostSourceLumi::slot_type const &iSlot)
void watchPostModuleGlobalEndRun(PostModuleGlobalEndRun::slot_type const &iSlot)
void watchPostModuleStreamBeginLumi(PostModuleStreamBeginLumi::slot_type const &iSlot)
void watchPreModuleStreamEndLumi(PreModuleStreamEndLumi::slot_type const &iSlot)
void watchPreModuleStreamBeginRun(PreModuleStreamBeginRun::slot_type const &iSlot)
void add(std::string const &label, ParameterSetDescription const &psetDescription)
void watchPreModuleEndStream(PreModuleEndStream::slot_type const &iSlot)
const unsigned int m_secsBetweenChecks
std::atomic< bool > m_stillAlive
HLT enums.
void watchPreModuleStreamEndRun(PreModuleStreamEndRun::slot_type const &iSlot)
ZombieKillerService(edm::ParameterSet const &, edm::ActivityRegistry &)
void watchPostModuleGlobalBeginRun(PostModuleGlobalBeginRun::slot_type const &iSlot)
void watchPreSourceEvent(PreSourceEvent::slot_type const &iSlot)
Log< level::Warning, false > LogWarning
void watchPostModuleEndJob(PostModuleEndJob::slot_type const &iSlot)
void watchPreModuleGlobalEndLumi(PreModuleGlobalEndLumi::slot_type const &iSlot)
std::condition_variable m_jobDoneCondition
void watchPostBeginJob(PostBeginJob::slot_type const &iSlot)
convenience function for attaching to signal