CMS 3D CMS Logo

InitRootHandlers.cc
Go to the documentation of this file.
2 
18 
19 #include "oneapi/tbb/concurrent_unordered_set.h"
20 #include "oneapi/tbb/task.h"
21 #include "oneapi/tbb/task_scheduler_observer.h"
22 #include "oneapi/tbb/global_control.h"
23 #include <memory>
24 
25 #include <thread>
26 #include <sys/wait.h>
27 #include <sstream>
28 #include <cstring>
29 #include <poll.h>
30 #include <atomic>
31 #include <algorithm>
32 #include <vector>
33 #include <string>
34 #include <array>
35 
36 // WORKAROUND: At CERN, execv is replaced with a non-async-signal safe
37 // version. This can break our stack trace printer. Avoid this by
38 // invoking the syscall directly.
39 #ifdef __linux__
40 #include <syscall.h>
41 #endif
42 
43 #include "TROOT.h"
44 #include "TError.h"
45 #include "TFile.h"
46 #include "TInterpreter.h"
47 #include "TH1.h"
48 #include "TSystem.h"
49 #include "TUnixSystem.h"
50 #include "TTree.h"
51 #include "TVirtualStreamerInfo.h"
52 
53 #include "TClassTable.h"
54 
55 #include <memory>
56 
namespace {
  /// Number of bytes in each fixed-size buffer that holds one "Module: ..."
  /// line for the module listing printed after a stack-trace abort.
  constexpr std::size_t moduleBufferSize{128};
}  // namespace
62 
63 namespace edm {
65  class ParameterSet;
66  class ActivityRegistry;
67 
68  namespace service {
69  class InitRootHandlers : public RootHandlers {
70  friend int cmssw_stacktrace(void*);
71 
72  public:
73  class ThreadTracker : public oneapi::tbb::task_scheduler_observer {
74  public:
75  typedef oneapi::tbb::concurrent_unordered_set<pthread_t> Container_type;
76 
77  ThreadTracker() : oneapi::tbb::task_scheduler_observer() { observe(); }
78  ~ThreadTracker() override = default;
79 
80  void on_scheduler_entry(bool) override {
81  // ensure thread local has been allocated; not necessary on Linux with
82  // the current cmsRun linkage, but could be an issue if the platform
83  // or linkage leads to "lazy" allocation of the thread local. By
84  // referencing it here we make sure it has been allocated and can be
85  // accessed safely from our signal handler.
87  threadIDs_.insert(pthread_self());
88  }
89  void on_scheduler_exit(bool) override {}
90  const Container_type& IDs() { return threadIDs_; }
91 
92  private:
94  };
95 
96  explicit InitRootHandlers(ParameterSet const& pset, ActivityRegistry& iReg);
97  ~InitRootHandlers() override;
98 
99  static void fillDescriptions(ConfigurationDescriptions& descriptions);
100  static void stacktraceFromThread();
103  if (threadTracker_) {
104  return threadTracker_->IDs();
105  }
106  return empty;
107  }
108  static int stackTracePause() { return stackTracePause_; }
109 
110  static std::vector<std::array<char, moduleBufferSize>> moduleListBuffers_;
111  static std::atomic<std::size_t> nextModule_, doneModules_;
112 
113  private:
114  static char const* const* getPstackArgv();
115  void enableWarnings_() override;
117  void willBeUsingThreads() override;
118 
119  void cachePidInfo();
120  static void stacktraceHelperThread();
121 
122  static constexpr int pidStringLength_ = 200;
124  static char const* const pstackArgv_[];
125  static int parentToChild_[2];
126  static int childToParent_[2];
127  static std::unique_ptr<std::thread> helperThread_;
128  static std::unique_ptr<ThreadTracker> threadTracker_;
129  static int stackTracePause_;
130 
136  std::shared_ptr<const void> sigBusHandler_;
137  std::shared_ptr<const void> sigSegvHandler_;
138  std::shared_ptr<const void> sigIllHandler_;
139  std::shared_ptr<const void> sigTermHandler_;
140  std::shared_ptr<const void> sigAbrtHandler_;
141  std::shared_ptr<const void> sigFpeHandler_;
142  };
143 
144  inline bool isProcessWideService(InitRootHandlers const*) { return true; }
145 
146  } // end of namespace service
147 } // end of namespace edm
148 
// Namespace-scope declaration of the stack-trace entry point, so that it can be
// referred to by its qualified name ahead of its definition later in the file.
namespace edm { namespace service {
  int cmssw_stacktrace(void*);
} }  // namespace edm::service
154 
155 namespace {
157 
// Compile-time switch tested at the top of RootErrorHandlerImpl together with
// the s_ignoreWarnings threshold; left disabled.
constexpr bool s_ignoreEverything{false};
159 
/// Reports whether `search` contains at least one of the C-string fragments
/// stored in `substrs`.
///
/// @param search   text to inspect
/// @param substrs  fixed-size table of NUL-terminated fragments
/// @return true as soon as one fragment occurs anywhere in `search`,
///         false when none of them does
template <std::size_t SIZE>
bool find_if_string(const std::string& search, const std::array<const char* const, SIZE>& substrs) {
  for (const char* const fragment : substrs) {
    if (search.find(fragment) != std::string::npos) {
      return true;
    }
  }
  return false;
}
166 
// Message fragments: a ROOT message containing any of these is reported as an
// INFO rather than as an ERROR.
constexpr std::array<const char* const, 9> in_message{
    "no dictionary for class",
    "already in TClassTable",
    "matrix not positive definite",
    "not a TStreamerInfo object",
    "Problems declaring payload",
    // Always printed if gDebug>0 - regardless of whether warning message is real.
    "Announced number of args different from the real number of argument passed",
    "nbins is <=0 - set to nbins = 1",
    "nbinsy is <=0 - set to nbinsy = 1",
    "oneapi::tbb::global_control is limiting"};

// Location fragments: a ROOT message originating from any of these locations
// is reported as an INFO rather than as an ERROR.
constexpr std::array<const char* const, 7> in_location{
    "Fit",
    "TDecompChol::Solve",
    "THistPainter::PaintInit",
    "TUnixSystem::SetDisplay",
    "TGClient::GetFontByName",
    "Inverter::Dinv",
    "RTaskArenaWrapper"};

// Message fragments for the special case handled separately by the error
// handler: such messages must not be fatal, yet an error is still printed.
constexpr std::array<const char* const, 3> in_message_print_error{
    "number of iterations was insufficient",
    "bad integrand behavior",
    "integral is divergent, or slowly convergent"};
191 
192  void RootErrorHandlerImpl(int level, char const* location, char const* message) {
193  bool die = false;
194 
195  // Translate ROOT severity level to MessageLogger severity level
196 
198 
199  if (level >= kFatal) {
201  } else if (level >= kSysError) {
203  } else if (level >= kError) {
205  } else if (level >= kWarning) {
207  }
208 
209  if (s_ignoreEverything || el_severity <= s_ignoreWarnings) {
211  }
212 
213  // Adapt C-strings to std::strings
214  // Arrange to report the error location as furnished by Root
215 
216  std::string el_location = "@SUB=?";
217  if (location != nullptr)
218  el_location = std::string("@SUB=") + std::string(location);
219 
220  std::string el_message = "?";
221  if (message != nullptr)
222  el_message = message;
223 
224  // Try to create a meaningful id string using knowledge of ROOT error messages
225  //
226  // id == "ROOT-ClassName" where ClassName is the affected class
227  // else "ROOT/ClassName" where ClassName is the error-declaring class
228  // else "ROOT"
229 
230  std::string el_identifier = "ROOT";
231 
232  std::string precursor("class ");
233  size_t index1 = el_message.find(precursor);
234  if (index1 != std::string::npos) {
235  size_t index2 = index1 + precursor.length();
236  size_t index3 = el_message.find_first_of(" :", index2);
237  if (index3 != std::string::npos) {
238  size_t substrlen = index3 - index2;
239  el_identifier += "-";
240  el_identifier += el_message.substr(index2, substrlen);
241  }
242  } else {
243  index1 = el_location.find("::");
244  if (index1 != std::string::npos) {
245  el_identifier += "/";
246  el_identifier += el_location.substr(0, index1);
247  }
248  }
249 
250  // Intercept some messages and upgrade the severity
251 
252  if ((el_location.find("TBranchElement::Fill") != std::string::npos) &&
253  (el_message.find("fill branch") != std::string::npos) && (el_message.find("address") != std::string::npos) &&
254  (el_message.find("not set") != std::string::npos)) {
256  }
257 
258  if ((el_message.find("Tree branches") != std::string::npos) &&
259  (el_message.find("different numbers of entries") != std::string::npos)) {
261  }
262 
263  // Intercept some messages and downgrade the severity
264 
265  if (find_if_string(el_message, in_message) || find_if_string(el_location, in_location) ||
266  (level < kError and (el_location.find("CINTTypedefBuilder::Setup") != std::string::npos) and
267  (el_message.find("possible entries are in use!") != std::string::npos))) {
269  }
270 
271  // These are a special case because we do not want them to
272  // be fatal, but we do want an error to print.
273  bool alreadyPrinted = false;
274  if (find_if_string(el_message, in_message_print_error)) {
276  edm::LogError("Root_Error") << el_location << el_message;
277  alreadyPrinted = true;
278  }
279 
280  if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
281  // Don't throw if the message is just informational.
282  die = false;
283  } else {
284  die = true;
285  }
286 
287  // Feed the message to the MessageLogger and let it choose to suppress or not.
288 
289  // Root has declared a fatal error. Throw an EDMException unless the
290  // message corresponds to a pending signal. In that case, do not throw
291  // but let the OS deal with the signal in the usual way.
292  if (die && (el_location != std::string("@SUB=TUnixSystem::DispatchSignals"))) {
293  std::ostringstream sstr;
294  sstr << "Fatal Root Error: " << el_location << "\n" << el_message << '\n';
295  edm::Exception except(edm::errors::FatalRootError, sstr.str());
296  except.addAdditionalInfo(except.message());
297  except.clearMessage();
298  throw except;
299  }
300 
301  // Typically, we get here only for informational messages,
302  // but we leave the other code in just in case we change
303  // the criteria for throwing.
304  if (!alreadyPrinted) {
305  if (el_severity == edm::RootHandlers::SeverityLevel::kFatal) {
306  edm::LogError("Root_Fatal") << el_location << el_message;
307  } else if (el_severity == edm::RootHandlers::SeverityLevel::kSysError) {
308  edm::LogError("Root_Severe") << el_location << el_message;
309  } else if (el_severity == edm::RootHandlers::SeverityLevel::kError) {
310  edm::LogError("Root_Error") << el_location << el_message;
311  } else if (el_severity == edm::RootHandlers::SeverityLevel::kWarning) {
312  edm::LogWarning("Root_Warning") << el_location << el_message;
313  } else if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
314  edm::LogInfo("Root_Information") << el_location << el_message;
315  }
316  }
317  }
318 
319  void RootErrorHandler(int level, bool, char const* location, char const* message) {
320  RootErrorHandlerImpl(level, location, message);
321  }
322 
323  extern "C" {
/// Resets SIGILL, SIGSEGV, SIGBUS, SIGTERM, SIGFPE and SIGABRT (in that order)
/// to their default dispositions.
void set_default_signals() {
  static constexpr int kSignalsToReset[] = {SIGILL, SIGSEGV, SIGBUS, SIGTERM, SIGFPE, SIGABRT};
  for (const int signum : kSignalsToReset) {
    signal(signum, SIG_DFL);
  }
}
332 
/// Writes the whole NUL-terminated string `text` to `fd`, looping over short
/// writes and retrying when write() is interrupted (EINTR).
///
/// @param fd    destination file descriptor
/// @param text  NUL-terminated string; the terminator itself is not written
/// @return 0 once every byte has been written, otherwise the negated errno of
///         the failing write()
static int full_write(int fd, const char* text) {
  const char* cursor = text;
  for (size_t remaining = strlen(text); remaining > 0;) {
    const ssize_t n = write(fd, cursor, remaining);
    if (n == -1) {
      if (errno != EINTR) {
        return -errno;
      }
      // Interrupted before anything was written: try the same span again.
      continue;
    }
    remaining -= n;
    cursor += n;
  }
  return 0;
}
351 
352  static int full_read(int fd, char* inbuf, size_t len, int timeout_s = -1) {
353  char* buf = inbuf;
354  size_t count = len;
355  ssize_t complete = 0;
356  std::chrono::time_point<std::chrono::steady_clock> end_time =
358  int flags;
359  if (timeout_s < 0) {
360  flags = O_NONBLOCK; // Prevents us from trying to set / restore flags later.
361  } else if ((-1 == (flags = fcntl(fd, F_GETFL)))) {
362  return -errno;
363  }
364  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
365  if (-1 == fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
366  return -errno;
367  }
368  }
369  while (count) {
370  if (timeout_s >= 0) {
371  struct pollfd poll_info {
372  fd, POLLIN, 0
373  };
374  int ms_remaining =
375  std::chrono::duration_cast<std::chrono::milliseconds>(end_time - std::chrono::steady_clock::now()).count();
376  if (ms_remaining > 0) {
377  int rc = poll(&poll_info, 1, ms_remaining);
378  if (rc <= 0) {
379  if (rc < 0) {
380  if (errno == EINTR || errno == EAGAIN) {
381  continue;
382  }
383  rc = -errno;
384  } else {
385  rc = -ETIMEDOUT;
386  }
387  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
388  fcntl(fd, F_SETFL, flags);
389  }
390  return rc;
391  }
392  } else if (ms_remaining < 0) {
393  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
394  fcntl(fd, F_SETFL, flags);
395  }
396  return -ETIMEDOUT;
397  }
398  }
399  complete = read(fd, buf, count);
400  if (complete == -1) {
401  if (errno == EINTR) {
402  continue;
403  } else if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) {
404  continue;
405  } else {
406  int orig_errno = errno;
407  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
408  fcntl(fd, F_SETFL, flags);
409  }
410  return -orig_errno;
411  }
412  }
413  count -= complete;
414  buf += complete;
415  }
416  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
417  fcntl(fd, F_SETFL, flags);
418  }
419  return 0;
420  }
421 
422  static int full_cerr_write(const char* text) { return full_write(2, text); }
423 
// These signals are only used inside the stacktrace signal handler, so common
// signals can be used. They do have to be different, since we do not set
// SA_NODEFER, and RESUME must be a signal that will cause sleep() to return
// early.
// The RESUME_SIGNAL replacement list is parenthesized so that the macro always
// expands to a single operand, whatever expression it is used in.
#if defined(SIGRTMAX)
#define PAUSE_SIGNAL SIGRTMAX
#define RESUME_SIGNAL (SIGRTMAX - 1)
#elif defined(SIGINFO)  // macOS/BSD
#define PAUSE_SIGNAL SIGINFO
#define RESUME_SIGNAL SIGALRM
#endif
435 
// Intentionally empty: this handler exists only so that delivery of the resume
// signal interrupts the sleep() in the pause handler.
void sig_resume_handler(int /*sig*/, siginfo_t* /*info*/, void* /*context*/) {
}
438 
439  // pause a thread so that a (slow) stacktrace will capture the current state
440  void sig_pause_for_stacktrace(int sig, siginfo_t*, void*) {
441  using namespace edm::service;
442 
443 #ifdef RESUME_SIGNAL
444  sigset_t sigset;
445  sigemptyset(&sigset);
446  sigaddset(&sigset, RESUME_SIGNAL);
447  pthread_sigmask(SIG_UNBLOCK, &sigset, nullptr);
448 #endif
449  // sleep is interrupted by a handled delivery of the resume signal
451 
452  if (InitRootHandlers::doneModules_.is_lock_free() && InitRootHandlers::nextModule_.is_lock_free()) {
455  char* buff = InitRootHandlers::moduleListBuffers_[i].data();
456 
457  strlcpy(buff, "\nModule: ", moduleBufferSize);
459  strlcat(buff,
460  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
461  moduleBufferSize);
462  strlcat(buff, ":", moduleBufferSize);
463  strlcat(buff,
464  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
465  moduleBufferSize);
466  } else {
467  strlcat(buff, "none", moduleBufferSize);
468  }
470  }
471  }
472  }
473 
474  void sig_dostack_then_abort(int sig, siginfo_t*, void*) {
475  using namespace edm::service;
476 
477  const auto& tids = InitRootHandlers::threadIDs();
478 
479  const auto self = pthread_self();
480 #ifdef PAUSE_SIGNAL
481  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
482  // install the "pause" handler
483  struct sigaction act;
484  act.sa_sigaction = sig_pause_for_stacktrace;
485  act.sa_flags = 0;
486  sigemptyset(&act.sa_mask);
487  sigaction(PAUSE_SIGNAL, &act, nullptr);
488 
489  // unblock pause signal globally, resume is unblocked in the pause handler
490  sigset_t pausesigset;
491  sigemptyset(&pausesigset);
492  sigaddset(&pausesigset, PAUSE_SIGNAL);
493  sigprocmask(SIG_UNBLOCK, &pausesigset, nullptr);
494 
495  // send a pause signal to all CMSSW/TBB threads other than self
496  for (auto id : tids) {
497  if (self != id) {
498  pthread_kill(id, PAUSE_SIGNAL);
499  }
500  }
501 
502 #ifdef RESUME_SIGNAL
503  // install the "resume" handler
504  act.sa_sigaction = sig_resume_handler;
505  sigaction(RESUME_SIGNAL, &act, nullptr);
506 #endif
507  }
508 #endif
509 
510  const char* signalname = "unknown";
511  switch (sig) {
512  case SIGBUS: {
513  signalname = "bus error";
514  break;
515  }
516  case SIGSEGV: {
517  signalname = "segmentation violation";
518  break;
519  }
520  case SIGILL: {
521  signalname = "illegal instruction";
522  break;
523  }
524  case SIGFPE: {
525  signalname = "floating point exception";
526  break;
527  }
528  case SIGTERM: {
529  signalname = "external termination request";
530  break;
531  }
532  case SIGABRT: {
533  signalname = "abort signal";
534  break;
535  }
536  default:
537  break;
538  }
539  full_cerr_write("\n\nA fatal system signal has occurred: ");
540  full_cerr_write(signalname);
541  full_cerr_write("\nThe following is the call stack containing the origin of the signal.\n\n");
542 
544 
545  // resume the signal handlers to store the current module; we are not guaranteed they
546  // will have time to store their modules, so there is a race condition; this could be
547  // avoided by storing the module information before sleeping, a change that may be
548  // made when we're convinced accessing the thread-local current module is safe.
549 #ifdef RESUME_SIGNAL
550  std::size_t notified = 0;
551  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
552  for (auto id : tids) {
553  if (self != id) {
554  if (pthread_kill(id, RESUME_SIGNAL) == 0)
555  ++notified;
556  }
557  }
558  }
559 #endif
560 
561  full_cerr_write("\nCurrent Modules:\n");
562 
563  // Checking tids.count(self) ensures that we only try to access the current module in
564  // CMSSW/TBB threads. Those threads access the thread-local current module at the same
565  // time the thread is registered, so any lazy allocation will have been done at that
566  // point. Not necessary on Linux with the current cmsRun linkage, as the thread-local
567  // is allocated at exec time, not lazily.
568  if (tids.count(self) > 0) {
569  char buff[moduleBufferSize] = "\nModule: ";
571  strlcat(buff,
572  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
573  moduleBufferSize);
574  strlcat(buff, ":", moduleBufferSize);
575  strlcat(buff,
576  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
577  moduleBufferSize);
578  } else {
579  strlcat(buff, "none", moduleBufferSize);
580  }
581  strlcat(buff, " (crashed)", moduleBufferSize);
582  full_cerr_write(buff);
583  } else {
584  full_cerr_write("\nModule: non-CMSSW (crashed)");
585  }
586 
587 #ifdef PAUSE_SIGNAL
588  // wait a short interval for the paused threads to resume and fill in their module
589  // information, then print
590  if (InitRootHandlers::doneModules_.is_lock_free()) {
591  int spincount = 0;
592  timespec t = {0, 1000};
593  while (++spincount < 1000 && InitRootHandlers::doneModules_ < notified) {
594  nanosleep(&t, nullptr);
595  }
596  for (std::size_t i = 0; i < InitRootHandlers::doneModules_; ++i) {
597  full_cerr_write(InitRootHandlers::moduleListBuffers_[i].data());
598  }
599  }
600 #endif
601 
602  full_cerr_write("\n\nA fatal system signal has occurred: ");
603  full_cerr_write(signalname);
604  full_cerr_write("\n");
605 
606  // For these known cases, re-raise the signal to get the correct
607  // exit code.
608  if ((sig == SIGILL) || (sig == SIGSEGV) || (sig == SIGBUS) || (sig == SIGTERM) || (sig == SIGFPE) ||
609  (sig == SIGABRT)) {
610  signal(sig, SIG_DFL);
611  raise(sig);
612  } else {
613  set_default_signals();
614  ::abort();
615  }
616  }
617 
618  void sig_abort(int sig, siginfo_t*, void*) {
619  full_cerr_write("\n\nFatal system signal has occurred during exit\n");
620 
621  // re-raise the signal to get the correct exit code
622  signal(sig, SIG_DFL);
623  raise(sig);
624 
625  // shouldn't get here
626  set_default_signals();
627  ::sleep(10);
628  ::abort();
629  }
630  }
631 } // end of unnamed namespace
632 
633 namespace edm {
634  namespace service {
635 
636  /*
637  * We've run into issues where GDB fails to print the thread which calls clone().
638  * To avoid this problem, we have an alternate approach below where the signal handler
639  * only reads/writes to a dedicated thread via pipes. The helper thread does the clone()
640  * invocation; we don't care if that thread is missing from the traceback in this case.
641  */
642  static void cmssw_stacktrace_fork();
643 
645  int toParent = childToParent_[1];
646  int fromParent = parentToChild_[0];
647  char buf[2];
648  buf[1] = '\0';
649 
650  while (true) {
651  int result = full_read(fromParent, buf, 1);
652  if (result < 0) {
653  // To avoid a deadlock (this function is NOT re-entrant), reset signals
654  // We never set them back to the CMSSW handler because we assume the parent
655  // thread will abort for us.
656  set_default_signals();
657  close(toParent);
658  full_cerr_write("\n\nTraceback helper thread failed to read from parent: ");
659  full_cerr_write(strerror(-result));
660  full_cerr_write("\n");
661  ::abort();
662  }
663  if (buf[0] == '1') {
664  set_default_signals();
666  full_write(toParent, buf);
667  } else if (buf[0] == '2') {
668  // We have just finished forking. Reload the file descriptors for thread
669  // communication.
670  close(toParent);
671  close(fromParent);
672  toParent = childToParent_[1];
673  fromParent = parentToChild_[0];
674  } else if (buf[0] == '3') {
675  break;
676  } else {
677  set_default_signals();
678  close(toParent);
679  full_cerr_write("\n\nTraceback helper thread got unknown command from parent: ");
680  full_cerr_write(buf);
681  full_cerr_write("\n");
682  ::abort();
683  }
684  }
685  }
686 
688  int result = full_write(parentToChild_[1], "1");
689  if (result < 0) {
690  full_cerr_write("\n\nAttempt to request stacktrace failed: ");
691  full_cerr_write(strerror(-result));
692  full_cerr_write("\n");
693  return;
694  }
695  char buf[2];
696  buf[1] = '\0';
697  if ((result = full_read(childToParent_[0], buf, 1, 5 * 60)) < 0) {
698  full_cerr_write("\n\nWaiting for stacktrace completion failed: ");
699  if (result == -ETIMEDOUT) {
700  full_cerr_write("timed out waiting for GDB to complete.");
701  } else {
702  full_cerr_write(strerror(-result));
703  }
704  full_cerr_write("\n");
705  return;
706  }
707  }
708 
710  char child_stack[4 * 1024];
711  char* child_stack_ptr = child_stack + 4 * 1024;
712  // On Linux, we currently use jemalloc. This registers pthread_atfork handlers; these
713  // handlers are *not* async-signal safe. Hence, a deadlock is possible if we invoke
714  // fork() from our signal handlers. Accordingly, we use clone (not POSIX, but AS-safe)
715  // as that is closer to the 'raw metal' syscall and avoids pthread_atfork handlers.
716  int pid =
717 #ifdef __linux__
718  clone(edm::service::cmssw_stacktrace, child_stack_ptr, CLONE_VM | CLONE_FS | SIGCHLD, nullptr);
719 #else
720  fork();
721  if (child_stack_ptr) {
722  } // Suppress 'unused variable' warning on non-Linux
723  if (pid == 0) {
725  }
726 #endif
727  if (pid == -1) {
728  full_cerr_write("(Attempt to perform stack dump failed.)\n");
729  } else {
730  int status;
731  if (waitpid(pid, &status, 0) == -1) {
732  full_cerr_write("(Failed to wait on stack dump output.)\n");
733  }
734  if (status) {
735  full_cerr_write("(GDB stack trace failed unexpectedly)\n");
736  }
737  }
738  }
739 
740  int cmssw_stacktrace(void* /*arg*/) {
741  set_default_signals();
742 
744  // NOTE: this is NOT async-signal-safe at CERN's lxplus service.
745  // CERN uses LD_PRELOAD to replace execv with a function from libsnoopy which
746  // calls dlsym.
747 #ifdef __linux__
748  syscall(SYS_execve, "/bin/sh", argv, __environ);
749 #else
750  execv("/bin/sh", argv);
751 #endif
752  ::abort();
753  return 1;
754  }
755 
756  static constexpr char pstackName[] = "(CMSSW stack trace helper)";
757  static constexpr char dashC[] = "-c";
760  int InitRootHandlers::parentToChild_[2] = {-1, -1};
761  int InitRootHandlers::childToParent_[2] = {-1, -1};
762  std::unique_ptr<std::thread> InitRootHandlers::helperThread_;
763  std::unique_ptr<InitRootHandlers::ThreadTracker> InitRootHandlers::threadTracker_;
765  std::vector<std::array<char, moduleBufferSize>> InitRootHandlers::moduleListBuffers_;
766  std::atomic<std::size_t> InitRootHandlers::nextModule_(0), InitRootHandlers::doneModules_(0);
767 
769  : RootHandlers(),
770  unloadSigHandler_(pset.getUntrackedParameter<bool>("UnloadRootSigHandler")),
771  resetErrHandler_(pset.getUntrackedParameter<bool>("ResetRootErrHandler")),
772  loadAllDictionaries_(pset.getUntrackedParameter<bool>("LoadAllDictionaries")),
773  autoLibraryLoader_(loadAllDictionaries_ or pset.getUntrackedParameter<bool>("AutoLibraryLoader")),
774  interactiveDebug_(pset.getUntrackedParameter<bool>("InteractiveDebug")) {
775  stackTracePause_ = pset.getUntrackedParameter<int>("StackTracePauseTime");
776 
777  if (not threadTracker_) {
778  threadTracker_ = std::make_unique<ThreadTracker>();
779  iReg.watchPostEndJob([]() {
780  if (threadTracker_) {
781  threadTracker_->observe(false);
782  }
783  });
784  }
785 
786  if (unloadSigHandler_) {
787  // Deactivate all the Root signal handlers and restore the system defaults
788  gSystem->ResetSignal(kSigChild);
789  gSystem->ResetSignal(kSigBus);
790  gSystem->ResetSignal(kSigSegmentationViolation);
791  gSystem->ResetSignal(kSigIllegalInstruction);
792  gSystem->ResetSignal(kSigSystem);
793  gSystem->ResetSignal(kSigPipe);
794  gSystem->ResetSignal(kSigAlarm);
795  gSystem->ResetSignal(kSigUrgent);
796  gSystem->ResetSignal(kSigFloatingException);
797  gSystem->ResetSignal(kSigWindowChanged);
798  } else if (pset.getUntrackedParameter<bool>("AbortOnSignal")) {
799  cachePidInfo();
800 
801  //NOTE: ROOT can also be told to abort on these kinds of problems BUT
802  // it requires an TApplication to be instantiated which causes problems
803  gSystem->ResetSignal(kSigBus);
804  gSystem->ResetSignal(kSigSegmentationViolation);
805  gSystem->ResetSignal(kSigIllegalInstruction);
806  gSystem->ResetSignal(kSigFloatingException);
807  installCustomHandler(SIGBUS, sig_dostack_then_abort);
808  sigBusHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGBUS, sig_abort); });
809  installCustomHandler(SIGSEGV, sig_dostack_then_abort);
810  sigSegvHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGSEGV, sig_abort); });
811  installCustomHandler(SIGILL, sig_dostack_then_abort);
812  sigIllHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGILL, sig_abort); });
813  installCustomHandler(SIGTERM, sig_dostack_then_abort);
814  sigTermHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGTERM, sig_abort); });
815  installCustomHandler(SIGFPE, sig_dostack_then_abort);
816  sigFpeHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGFPE, sig_abort); });
817  installCustomHandler(SIGABRT, sig_dostack_then_abort);
818  sigAbrtHandler_ = std::shared_ptr<const void>(nullptr, [](void*) {
819  signal(SIGABRT, SIG_DFL); // release SIGABRT to default
820  });
821  }
822 
823  iReg.watchPreallocate([](edm::service::SystemBounds const& iBounds) {
824  if (iBounds.maxNumberOfThreads() > moduleListBuffers_.size()) {
825  moduleListBuffers_.resize(iBounds.maxNumberOfThreads());
826  }
827  });
828 
829  if (resetErrHandler_) {
830  // Replace the Root error handler with one that uses the MessageLogger
831  SetErrorHandler(RootErrorHandler);
832  }
833 
834  // Enable automatic Root library loading.
835  if (autoLibraryLoader_) {
836  gInterpreter->SetClassAutoloading(1);
837  }
838 
839  // Set ROOT parameters.
840  TTree::SetMaxTreeSize(kMaxLong64);
841  TH1::AddDirectory(kFALSE);
842  //G__SetCatchException(0);
843 
844  // Set custom streamers
846 
847  // Load the library containing dictionaries for std:: classes, if not already loaded.
848  if (!hasDictionary(typeid(std::vector<std::vector<unsigned int>>))) {
849  TypeWithDict::byName("std::vector<std::vector<unsigned int> >");
850  }
851 
852  int debugLevel = pset.getUntrackedParameter<int>("DebugLevel");
853  if (debugLevel > 0) {
854  gDebug = debugLevel;
855  }
856 
857  // Enable Root implicit multi-threading
858  bool imt = pset.getUntrackedParameter<bool>("EnableIMT");
859  if (imt && not ROOT::IsImplicitMTEnabled()) {
860  //cmsRun uses global_control to set the number of allowed threads to use
861  // we need to tell ROOT the same value in order to avoid unnecessary warnings
862  ROOT::EnableImplicitMT(
863  oneapi::tbb::global_control::active_value(oneapi::tbb::global_control::max_allowed_parallelism));
864  }
865  }
866 
868  // close all open ROOT files
869  TIter iter(gROOT->GetListOfFiles());
870  TObject* obj = nullptr;
871  while (nullptr != (obj = iter.Next())) {
872  TFile* f = dynamic_cast<TFile*>(obj);
873  if (f) {
874  // We get a new iterator each time,
875  // because closing a file can invalidate the iterator
876  f->Close();
877  iter = TIter(gROOT->GetListOfFiles());
878  }
879  }
880  //disengage from TBB to avoid possible at exit problems
881  threadTracker_.reset();
882  }
883 
885  //Tell Root we want to be multi-threaded
886  ROOT::EnableThreadSafety();
887 
888  //When threading, also have to keep ROOT from logging all TObjects into a list
889  TObject::SetObjectStat(false);
890 
891  //Have to avoid having Streamers modify themselves after they have been used
892  TVirtualStreamerInfo::Optimize(false);
893  }
894 
897  desc.setComment("Centralized interface to ROOT.");
898  desc.addUntracked<bool>("UnloadRootSigHandler", false)
899  ->setComment("If True, signals are handled by this service, rather than by ROOT.");
900  desc.addUntracked<bool>("ResetRootErrHandler", true)
901  ->setComment(
902  "If True, ROOT messages (e.g. errors, warnings) are handled by this service, rather than by ROOT.");
903  desc.addUntracked<bool>("AutoLibraryLoader", true)
904  ->setComment("If True, enables automatic loading of data dictionaries.");
905  desc.addUntracked<bool>("LoadAllDictionaries", false)->setComment("If True, loads all ROOT dictionaries.");
906  desc.addUntracked<bool>("EnableIMT", true)->setComment("If True, calls ROOT::EnableImplicitMT().");
907  desc.addUntracked<bool>("AbortOnSignal", true)
908  ->setComment(
909  "If True, do an abort when a signal occurs that causes a crash. If False, ROOT will do an exit which "
910  "attempts to do a clean shutdown.");
911  desc.addUntracked<bool>("InteractiveDebug", false)
912  ->setComment(
913  "If True, leave gdb attached to cmsRun after a crash; "
914  "if False, attach gdb, print a stack trace, and quit gdb");
915  desc.addUntracked<int>("DebugLevel", 0)->setComment("Sets ROOT's gDebug value.");
916  desc.addUntracked<int>("StackTracePauseTime", 300)
917  ->setComment("Seconds to pause other threads during stack trace.");
918  descriptions.add("InitRootHandlers", desc);
919  }
920 
921  char const* const* InitRootHandlers::getPstackArgv() { return pstackArgv_; }
922 
924 
926 
928  if (helperThread_) {
929  //Another InitRootHandlers was initialized in this job, possibly
930  // because multiple EventProcessors are being used.
931  //In that case, we are already all setup
932  return;
933  }
934  std::string gdbcmd{"date; gdb -quiet -p %d"};
935  if (!interactiveDebug_) {
936  gdbcmd +=
937  " 2>&1 <<EOF |\n"
938  "set width 0\n"
939  "set height 0\n"
940  "set pagination no\n"
941  "thread apply all bt\n"
942  "EOF\n"
943  "/bin/sed -n -e 's/^\\((gdb) \\)*//' -e '/^#/p' -e '/^Thread/p'";
944  }
945  if (snprintf(pidString_, pidStringLength_ - 1, gdbcmd.c_str(), getpid()) >= pidStringLength_) {
946  std::ostringstream sstr;
947  sstr << "Unable to pre-allocate stacktrace handler information";
948  edm::Exception except(edm::errors::OtherCMS, sstr.str());
949  throw except;
950  }
951 
952  // These are initialized to -1; harmless to close an invalid FD.
953  // If this is called post-fork, we don't want to be communicating on
954  // these FDs as they are used internally by the parent.
955  close(childToParent_[0]);
956  close(childToParent_[1]);
957  childToParent_[0] = -1;
958  childToParent_[1] = -1;
959  close(parentToChild_[0]);
960  close(parentToChild_[1]);
961  parentToChild_[0] = -1;
962  parentToChild_[1] = -1;
963 
964  if (-1 == pipe2(childToParent_, O_CLOEXEC)) {
965  std::ostringstream sstr;
966  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
967  edm::Exception except(edm::errors::OtherCMS, sstr.str());
968  throw except;
969  }
970 
971  if (-1 == pipe2(parentToChild_, O_CLOEXEC)) {
972  close(childToParent_[0]);
973  close(childToParent_[1]);
974  childToParent_[0] = -1;
975  childToParent_[1] = -1;
976  std::ostringstream sstr;
977  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
978  edm::Exception except(edm::errors::OtherCMS, sstr.str());
979  throw except;
980  }
981 
982  helperThread_ = std::make_unique<std::thread>(stacktraceHelperThread);
983  helperThread_->detach();
984  }
985 
986  } // end of namespace service
987 } // end of namespace edm
988 
990 
size
Write out results.
static constexpr char dashC[]
edm::serviceregistry::AllArgsMaker< edm::RootHandlers, InitRootHandlers > RootHandlersMaker
static void cmssw_stacktrace_fork()
#define DEFINE_FWK_SERVICE_MAKER(concrete, maker)
Definition: ServiceMaker.h:102
double seconds()
void watchPreallocate(Preallocate::slot_type const &iSlot)
void setRefCoreStreamerInTClass()
void watchPostEndJob(PostEndJob::slot_type const &iSlot)
std::vector< T >::const_iterator search(const cond::Time_t &val, const std::vector< T > &container)
Definition: IOVProxy.cc:21
oneapi::tbb::concurrent_unordered_set< pthread_t > Container_type
bool isProcessWideService(TFileService const *)
Definition: TFileService.h:98
static ModuleCallingContext const * getCurrentModuleOnThread()
void installCustomHandler(int signum, CFUNC func)
std::shared_ptr< const void > sigSegvHandler_
Log< level::Error, false > LogError
std::shared_ptr< const void > sigFpeHandler_
void ignoreWarnings_(edm::RootHandlers::SeverityLevel level) override
std::shared_ptr< const void > sigBusHandler_
static TypeWithDict byName(std::string const &name)
Definition: TypeWithDict.cc:74
static std::atomic< std::size_t > doneModules_
static const ThreadTracker::Container_type & threadIDs()
std::shared_ptr< const void > sigAbrtHandler_
static std::atomic< std::size_t > nextModule_
static char pidString_[pidStringLength_]
static char const *const * getPstackArgv()
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventID const &, edm::Timestamp const & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
Definition: Activities.doc:12
std::shared_ptr< const void > sigIllHandler_
std::shared_ptr< const void > sigTermHandler_
void addAdditionalInfo(std::string const &info)
Definition: Exception.cc:169
double f[11][100]
int cmssw_stacktrace(void *)
static std::unique_ptr< std::thread > helperThread_
static std::vector< std::array< char, moduleBufferSize > > moduleListBuffers_
static std::unique_ptr< ThreadTracker > threadTracker_
Log< level::Info, false > LogInfo
static constexpr int pidStringLength_
InitRootHandlers(ParameterSet const &pset, ActivityRegistry &iReg)
static char const *const pstackArgv_[]
void add(std::string const &label, ParameterSetDescription const &psetDescription)
TEveGeoShape * clone(const TEveElement *element, TEveElement *parent)
Definition: eve_macros.cc:135
unsigned int maxNumberOfThreads() const
Definition: SystemBounds.h:38
std::string moduleName(StableProvenance const &provenance, ProcessHistory const &history)
Definition: Provenance.cc:27
HLT enums.
char data[epos_bytes_allocation]
Definition: EPOS_Wrapper.h:79
static void fillDescriptions(ConfigurationDescriptions &descriptions)
bool hasDictionary(std::type_info const &)
#define O_NONBLOCK
Definition: SysFile.h:23
Log< level::Warning, false > LogWarning
static constexpr char pstackName[]
fd
Definition: ztee.py:136