CMS 3D CMS Logo

InitRootHandlers.cc
Go to the documentation of this file.
2 
18 
19 #include "oneapi/tbb/concurrent_unordered_set.h"
20 #include "oneapi/tbb/task.h"
21 #include "oneapi/tbb/task_scheduler_observer.h"
22 #include "oneapi/tbb/global_control.h"
23 #include <memory>
24 
25 #include <thread>
26 #include <sys/wait.h>
27 #include <sstream>
28 #include <cstring>
29 #include <poll.h>
30 #include <atomic>
31 #include <algorithm>
32 #include <vector>
33 #include <string>
34 #include <array>
35 
36 // WORKAROUND: At CERN, execv is replaced with a non-async-signal safe
37 // version. This can break our stack trace printer. Avoid this by
38 // invoking the syscall directly.
39 #ifdef __linux__
40 #include <syscall.h>
41 #endif
42 
43 #include "TROOT.h"
44 #include "TError.h"
45 #include "TFile.h"
46 #include "TInterpreter.h"
47 #include "TH1.h"
48 #include "TSystem.h"
49 #include "TUnixSystem.h"
50 #include "TTree.h"
51 #include "TVirtualStreamerInfo.h"
52 
53 #include "TClassTable.h"
54 
55 #include <memory>
56 
57 namespace {
58  // size of static buffer allocated for listing module names following a
59  // stacktrace abort
60  constexpr std::size_t moduleBufferSize = 128;
61 } // namespace
62 
63 namespace edm {
65  class ParameterSet;
66  class ActivityRegistry;
67 
68  namespace service {
69  class InitRootHandlers : public RootHandlers {
70  friend int cmssw_stacktrace(void*);
71 
72  public:
73  class ThreadTracker : public oneapi::tbb::task_scheduler_observer {
74  public:
75  typedef oneapi::tbb::concurrent_unordered_set<pthread_t> Container_type;
76 
77  ThreadTracker() : oneapi::tbb::task_scheduler_observer() { observe(); }
78  ~ThreadTracker() override = default;
79 
80  void on_scheduler_entry(bool) override {
81  // ensure thread local has been allocated; not necessary on Linux with
82  // the current cmsRun linkage, but could be an issue if the platform
83  // or linkage leads to "lazy" allocation of the thread local. By
84  // referencing it here we make sure it has been allocated and can be
85  // accessed safely from our signal handler.
87  threadIDs_.insert(pthread_self());
88  }
89  void on_scheduler_exit(bool) override {}
90  const Container_type& IDs() { return threadIDs_; }
91 
92  private:
94  };
95 
96  explicit InitRootHandlers(ParameterSet const& pset, ActivityRegistry& iReg);
97  ~InitRootHandlers() override;
98 
99  static void fillDescriptions(ConfigurationDescriptions& descriptions);
100  static void stacktraceFromThread();
103  if (threadTracker_) {
104  return threadTracker_->IDs();
105  }
106  return empty;
107  }
108  static int stackTracePause() { return stackTracePause_; }
109 
110  static std::vector<std::array<char, moduleBufferSize>> moduleListBuffers_;
111  static std::atomic<std::size_t> nextModule_, doneModules_;
112 
113  private:
114  static char const* const* getPstackArgv();
115  void enableWarnings_() override;
117  void willBeUsingThreads() override;
118 
119  void cachePidInfo();
120  static void stacktraceHelperThread();
121 
122  static constexpr int pidStringLength_ = 200;
124  static char const* const pstackArgv_[];
125  static int parentToChild_[2];
126  static int childToParent_[2];
127  static std::unique_ptr<std::thread> helperThread_;
128  static std::unique_ptr<ThreadTracker> threadTracker_;
129  static int stackTracePause_;
130 
137  std::shared_ptr<const void> sigBusHandler_;
138  std::shared_ptr<const void> sigSegvHandler_;
139  std::shared_ptr<const void> sigIllHandler_;
140  std::shared_ptr<const void> sigTermHandler_;
141  std::shared_ptr<const void> sigAbrtHandler_;
142  std::shared_ptr<const void> sigFpeHandler_;
143  };
144 
145  inline bool isProcessWideService(InitRootHandlers const*) { return true; }
146 
147  } // end of namespace service
148 } // end of namespace edm
149 
150 namespace edm {
151  namespace service {
152  int cmssw_stacktrace(void*);
153  }
154 } // namespace edm
155 
156 namespace {
158 
159  constexpr bool s_ignoreEverything = false;
160 
// Returns true when at least one entry of `substrs` occurs as a substring of
// `search`; returns false when none does.
template <std::size_t SIZE>
bool find_if_string(const std::string& search, const std::array<const char* const, SIZE>& substrs) {
  for (const char* const candidate : substrs) {
    if (search.find(candidate) != std::string::npos) {
      return true;
    }
  }
  return false;
}
167 
168  // Contents of a message which should be reported as an INFO, not an ERROR
169  constexpr std::array<const char* const, 9> in_message{
170  {"no dictionary for class",
171  "already in TClassTable",
172  "matrix not positive definite",
173  "not a TStreamerInfo object",
174  "Problems declaring payload",
175  "Announced number of args different from the real number of argument passed", // Always printed if gDebug>0 - regardless of whether warning message is real.
176  "nbins is <=0 - set to nbins = 1",
177  "nbinsy is <=0 - set to nbinsy = 1",
178  "oneapi::tbb::global_control is limiting"}};
179 
180  // Locations generating messages which should be reported as an INFO, not an ERROR
181  constexpr std::array<const char* const, 7> in_location{{"Fit",
182  "TDecompChol::Solve",
183  "THistPainter::PaintInit",
184  "TUnixSystem::SetDisplay",
185  "TGClient::GetFontByName",
186  "Inverter::Dinv",
187  "RTaskArenaWrapper"}};
188 
189  constexpr std::array<const char* const, 4> in_message_print_error{
190  {"number of iterations was insufficient",
191  "bad integrand behavior",
192  "integral is divergent, or slowly convergent",
193  "VariableMetricBuilder Initial matrix not pos.def."}};
194 
195  void RootErrorHandlerImpl(int level, char const* location, char const* message) {
196  bool die = false;
197 
198  // Translate ROOT severity level to MessageLogger severity level
199 
201 
202  if (level >= kFatal) {
204  } else if (level >= kSysError) {
206  } else if (level >= kError) {
208  } else if (level >= kWarning) {
210  }
211 
212  if (s_ignoreEverything || el_severity <= s_ignoreWarnings) {
214  }
215 
216  // Adapt C-strings to std::strings
217  // Arrange to report the error location as furnished by Root
218 
219  std::string el_location = "@SUB=?";
220  if (location != nullptr)
221  el_location = std::string("@SUB=") + std::string(location);
222 
223  std::string el_message = "?";
224  if (message != nullptr)
225  el_message = message;
226 
227  // Try to create a meaningful id string using knowledge of ROOT error messages
228  //
229  // id == "ROOT-ClassName" where ClassName is the affected class
230  // else "ROOT/ClassName" where ClassName is the error-declaring class
231  // else "ROOT"
232 
233  std::string el_identifier = "ROOT";
234 
235  std::string precursor("class ");
236  size_t index1 = el_message.find(precursor);
237  if (index1 != std::string::npos) {
238  size_t index2 = index1 + precursor.length();
239  size_t index3 = el_message.find_first_of(" :", index2);
240  if (index3 != std::string::npos) {
241  size_t substrlen = index3 - index2;
242  el_identifier += "-";
243  el_identifier += el_message.substr(index2, substrlen);
244  }
245  } else {
246  index1 = el_location.find("::");
247  if (index1 != std::string::npos) {
248  el_identifier += "/";
249  el_identifier += el_location.substr(0, index1);
250  }
251  }
252 
253  // Intercept some messages and upgrade the severity
254 
255  if ((el_location.find("TBranchElement::Fill") != std::string::npos) &&
256  (el_message.find("fill branch") != std::string::npos) && (el_message.find("address") != std::string::npos) &&
257  (el_message.find("not set") != std::string::npos)) {
259  }
260 
261  if ((el_message.find("Tree branches") != std::string::npos) &&
262  (el_message.find("different numbers of entries") != std::string::npos)) {
264  }
265 
266  // Intercept some messages and downgrade the severity
267 
268  if (find_if_string(el_message, in_message) || find_if_string(el_location, in_location) ||
269  (level < kError and (el_location.find("CINTTypedefBuilder::Setup") != std::string::npos) and
270  (el_message.find("possible entries are in use!") != std::string::npos))) {
272  }
273 
274  // These are a special case because we do not want them to
275  // be fatal, but we do want an error to print.
276  bool alreadyPrinted = false;
277  if (find_if_string(el_message, in_message_print_error)) {
279  edm::LogError("Root_Error") << el_location << el_message;
280  alreadyPrinted = true;
281  }
282 
283  if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
284  // Don't throw if the message is just informational.
285  die = false;
286  } else {
287  die = true;
288  }
289 
290  // Feed the message to the MessageLogger and let it choose to suppress or not.
291 
292  // Root has declared a fatal error. Throw an EDMException unless the
293  // message corresponds to a pending signal. In that case, do not throw
294  // but let the OS deal with the signal in the usual way.
295  if (die && (el_location != std::string("@SUB=TUnixSystem::DispatchSignals"))) {
296  std::ostringstream sstr;
297  sstr << "Fatal Root Error: " << el_location << "\n" << el_message << '\n';
298  edm::Exception except(edm::errors::FatalRootError, sstr.str());
299  except.addAdditionalInfo(except.message());
300  except.clearMessage();
301  throw except;
302  }
303 
304  // Typically, we get here only for informational messages,
305  // but we leave the other code in just in case we change
306  // the criteria for throwing.
307  if (!alreadyPrinted) {
308  if (el_severity == edm::RootHandlers::SeverityLevel::kFatal) {
309  edm::LogError("Root_Fatal") << el_location << el_message;
310  } else if (el_severity == edm::RootHandlers::SeverityLevel::kSysError) {
311  edm::LogError("Root_Severe") << el_location << el_message;
312  } else if (el_severity == edm::RootHandlers::SeverityLevel::kError) {
313  edm::LogError("Root_Error") << el_location << el_message;
314  } else if (el_severity == edm::RootHandlers::SeverityLevel::kWarning) {
315  edm::LogWarning("Root_Warning") << el_location << el_message;
316  } else if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
317  edm::LogInfo("Root_Information") << el_location << el_message;
318  }
319  }
320  }
321 
322  void RootErrorHandler(int level, bool, char const* location, char const* message) {
323  RootErrorHandlerImpl(level, location, message);
324  }
325 
326  extern "C" {
// Restore the default disposition (SIG_DFL) for the six signals this file
// deals with, in the order ILL, SEGV, BUS, TERM, FPE, ABRT.
void set_default_signals() {
  constexpr int handledSignals[] = {SIGILL, SIGSEGV, SIGBUS, SIGTERM, SIGFPE, SIGABRT};
  for (const int signum : handledSignals) {
    signal(signum, SIG_DFL);
  }
}
335 
// Write the whole NUL-terminated string `text` to `fd`, resuming after short
// writes and retrying when write() is interrupted (EINTR).
// Returns 0 once everything is written, or -errno for any other write() failure.
static int full_write(int fd, const char* text) {
  const char* cursor = text;
  for (size_t remaining = strlen(text); remaining != 0;) {
    const ssize_t sent = write(fd, cursor, remaining);
    if (sent == -1) {
      if (errno != EINTR) {
        return -errno;
      }
      continue;  // interrupted before anything was written: try again
    }
    remaining -= sent;
    cursor += sent;
  }
  return 0;
}
354 
// Read exactly `len` bytes from `fd` into `inbuf`.
//
// The function only calls poll(), read() and fcntl() and performs no allocation.
//
//  fd         descriptor to read from
//  inbuf      destination buffer; must hold at least `len` bytes
//  len        number of bytes that must be read before returning success
//  timeout_s  overall deadline in seconds. A negative value means "no deadline":
//             plain read() calls are used and the descriptor flags are not touched.
//             Otherwise the descriptor is temporarily switched to O_NONBLOCK and
//             its original flags are restored before returning.
//
// Returns 0 on success, -ETIMEDOUT when the deadline passes, -EPIPE when the
// peer closes the descriptor before `len` bytes have arrived, and -errno for
// any other poll()/read()/fcntl() failure.
static int full_read(int fd, char* inbuf, size_t len, int timeout_s = -1) {
  char* buf = inbuf;
  size_t count = len;
  const std::chrono::time_point<std::chrono::steady_clock> end_time =
      std::chrono::steady_clock::now() + std::chrono::seconds(timeout_s);

  int flags = O_NONBLOCK;  // Sentinel for the no-deadline case: nothing to set or restore later.
  if (timeout_s >= 0) {
    flags = fcntl(fd, F_GETFL);
    if (flags == -1) {
      return -errno;
    }
  }
  const bool restoreFlags = (flags & O_NONBLOCK) != O_NONBLOCK;
  if (restoreFlags && fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) {
    return -errno;
  }
  // Single exit path: put the descriptor back into its original mode, then
  // hand back the result code (already computed, so fcntl cannot clobber it).
  auto finish = [&](int rc) {
    if (restoreFlags) {
      fcntl(fd, F_SETFL, flags);
    }
    return rc;
  };

  while (count) {
    if (timeout_s >= 0) {
      struct pollfd poll_info {
        fd, POLLIN, 0
      };
      const int ms_remaining = static_cast<int>(
          std::chrono::duration_cast<std::chrono::milliseconds>(end_time - std::chrono::steady_clock::now()).count());
      if (ms_remaining > 0) {
        const int rc = poll(&poll_info, 1, ms_remaining);
        if (rc < 0) {
          if (errno == EINTR || errno == EAGAIN) {
            continue;
          }
          return finish(-errno);
        }
        if (rc == 0) {
          return finish(-ETIMEDOUT);
        }
      } else if (ms_remaining < 0) {
        return finish(-ETIMEDOUT);
      }
      // ms_remaining == 0: fall through for one last non-blocking read attempt.
    }
    const ssize_t complete = read(fd, buf, count);
    if (complete == -1) {
      if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) {
        continue;
      }
      return finish(-errno);
    }
    if (complete == 0) {
      // End of file: the writer is gone and no more data will ever arrive.
      // Without this check `count` never shrinks and the loop spins forever.
      return finish(-EPIPE);
    }
    count -= complete;
    buf += complete;
  }
  return finish(0);
}
424 
425  static int full_cerr_write(const char* text) { return full_write(2, text); }
426 
427 // these signals are only used inside the stacktrace signal handler,
428 // so common signals can be used. They do have to be different, since
429 // we do not set SA_NODEFER, and RESUME must be a signal that will
430 // cause sleep() to return early.
431 #if defined(SIGRTMAX)
432 #define PAUSE_SIGNAL SIGRTMAX
433 #define RESUME_SIGNAL SIGRTMAX - 1
434 #elif defined(SIGINFO) // macOS/BSD
435 #define PAUSE_SIGNAL SIGINFO
436 #define RESUME_SIGNAL SIGALRM
437 #endif
438 
439  // does nothing, here only to interrupt the sleep() in the pause handler
// Deliberately empty handler: all arguments are ignored and nothing is done.
void sig_resume_handler(int /*sig*/, siginfo_t* /*info*/, void* /*context*/) {}
441 
442  // pause a thread so that a (slow) stacktrace will capture the current state
443  void sig_pause_for_stacktrace(int sig, siginfo_t*, void*) {
444  using namespace edm::service;
445 
446 #ifdef RESUME_SIGNAL
447  sigset_t sigset;
448  sigemptyset(&sigset);
449  sigaddset(&sigset, RESUME_SIGNAL);
450  pthread_sigmask(SIG_UNBLOCK, &sigset, nullptr);
451 #endif
452  // sleep() is interrupted by a handled delivery of the resume signal
454 
455  if (InitRootHandlers::doneModules_.is_lock_free() && InitRootHandlers::nextModule_.is_lock_free()) {
458  char* buff = InitRootHandlers::moduleListBuffers_[i].data();
459 
460  strlcpy(buff, "\nModule: ", moduleBufferSize);
462  strlcat(buff,
463  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
464  moduleBufferSize);
465  strlcat(buff, ":", moduleBufferSize);
466  strlcat(buff,
467  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
468  moduleBufferSize);
469  } else {
470  strlcat(buff, "none", moduleBufferSize);
471  }
473  }
474  }
475  }
476 
477  void sig_dostack_then_abort(int sig, siginfo_t*, void*) {
478  using namespace edm::service;
479 
480  const auto& tids = InitRootHandlers::threadIDs();
481 
482  const auto self = pthread_self();
483 #ifdef PAUSE_SIGNAL
484  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
485  // install the "pause" handler
486  struct sigaction act;
487  act.sa_sigaction = sig_pause_for_stacktrace;
488  act.sa_flags = 0;
489  sigemptyset(&act.sa_mask);
490  sigaction(PAUSE_SIGNAL, &act, nullptr);
491 
492  // unblock pause signal globally, resume is unblocked in the pause handler
493  sigset_t pausesigset;
494  sigemptyset(&pausesigset);
495  sigaddset(&pausesigset, PAUSE_SIGNAL);
496  sigprocmask(SIG_UNBLOCK, &pausesigset, nullptr);
497 
498  // send a pause signal to all CMSSW/TBB threads other than self
499  for (auto id : tids) {
500  if (self != id) {
501  pthread_kill(id, PAUSE_SIGNAL);
502  }
503  }
504 
505 #ifdef RESUME_SIGNAL
506  // install the "resume" handler
507  act.sa_sigaction = sig_resume_handler;
508  sigaction(RESUME_SIGNAL, &act, nullptr);
509 #endif
510  }
511 #endif
512 
513  const char* signalname = "unknown";
514  switch (sig) {
515  case SIGBUS: {
516  signalname = "bus error";
517  break;
518  }
519  case SIGSEGV: {
520  signalname = "segmentation violation";
521  break;
522  }
523  case SIGILL: {
524  signalname = "illegal instruction";
525  break;
526  }
527  case SIGFPE: {
528  signalname = "floating point exception";
529  break;
530  }
531  case SIGTERM: {
532  signalname = "external termination request";
533  break;
534  }
535  case SIGABRT: {
536  signalname = "abort signal";
537  break;
538  }
539  default:
540  break;
541  }
542  full_cerr_write("\n\nA fatal system signal has occurred: ");
543  full_cerr_write(signalname);
544  full_cerr_write("\nThe following is the call stack containing the origin of the signal.\n\n");
545 
547 
548  // resume the signal handlers to store the current module; we are not guaranteed they
549  // will have time to store their modules, so there is a race condition; this could be
550  // avoided by storing the module information before sleeping, a change that may be
551  // made when we're convinced accessing the thread-local current module is safe.
552 #ifdef RESUME_SIGNAL
553  std::size_t notified = 0;
554  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
555  for (auto id : tids) {
556  if (self != id) {
557  if (pthread_kill(id, RESUME_SIGNAL) == 0)
558  ++notified;
559  }
560  }
561  }
562 #endif
563 
564  full_cerr_write("\nCurrent Modules:\n");
565 
566  // Checking tids.count(self) ensures that we only try to access the current module in
567  // CMSSW/TBB threads. Those threads access the thread-local current module at the same
568  // time the thread is registered, so any lazy allocation will have been done at that
569  // point. Not necessary on Linux with the current cmsRun linkage, as the thread-local
570  // is allocated at exec time, not lazily.
571  if (tids.count(self) > 0) {
572  char buff[moduleBufferSize] = "\nModule: ";
574  strlcat(buff,
575  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
576  moduleBufferSize);
577  strlcat(buff, ":", moduleBufferSize);
578  strlcat(buff,
579  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
580  moduleBufferSize);
581  } else {
582  strlcat(buff, "none", moduleBufferSize);
583  }
584  strlcat(buff, " (crashed)", moduleBufferSize);
585  full_cerr_write(buff);
586  } else {
587  full_cerr_write("\nModule: non-CMSSW (crashed)");
588  }
589 
590 #ifdef PAUSE_SIGNAL
591  // wait a short interval for the paused threads to resume and fill in their module
592  // information, then print
593  if (InitRootHandlers::doneModules_.is_lock_free()) {
594  int spincount = 0;
595  timespec t = {0, 1000};
596  while (++spincount < 1000 && InitRootHandlers::doneModules_ < notified) {
597  nanosleep(&t, nullptr);
598  }
599  for (std::size_t i = 0; i < InitRootHandlers::doneModules_; ++i) {
600  full_cerr_write(InitRootHandlers::moduleListBuffers_[i].data());
601  }
602  }
603 #endif
604 
605  full_cerr_write("\n\nA fatal system signal has occurred: ");
606  full_cerr_write(signalname);
607  full_cerr_write("\n");
608 
609  // For these known cases, re-raise the signal to get the correct
610  // exit code.
611  if ((sig == SIGILL) || (sig == SIGSEGV) || (sig == SIGBUS) || (sig == SIGTERM) || (sig == SIGFPE) ||
612  (sig == SIGABRT)) {
613  signal(sig, SIG_DFL);
614  raise(sig);
615  } else {
616  set_default_signals();
617  ::abort();
618  }
619  }
620 
621  void sig_abort(int sig, siginfo_t*, void*) {
622  full_cerr_write("\n\nFatal system signal has occurred during exit\n");
623 
624  // re-raise the signal to get the correct exit code
625  signal(sig, SIG_DFL);
626  raise(sig);
627 
628  // shouldn't get here
629  set_default_signals();
630  ::sleep(10);
631  ::abort();
632  }
633  }
634 } // end of unnamed namespace
635 
636 namespace edm {
637  namespace service {
638 
639  /*
640  * We've run into issues where GDB fails to print the thread which calls clone().
641  * To avoid this problem, we have an alternate approach below where the signal handler
642  * only reads/writes to a dedicated thread via pipes. The helper thread does the clone()
643  * invocation; we don't care if that thread is missing from the traceback in this case.
644  */
645  static void cmssw_stacktrace_fork();
646 
648  int toParent = childToParent_[1];
649  int fromParent = parentToChild_[0];
650  char buf[2];
651  buf[1] = '\0';
652 
653  while (true) {
654  int result = full_read(fromParent, buf, 1);
655  if (result < 0) {
656  // To avoid a deadlock (this function is NOT re-entrant), reset signals
657  // We never set them back to the CMSSW handler because we assume the parent
658  // thread will abort for us.
659  set_default_signals();
660  close(toParent);
661  full_cerr_write("\n\nTraceback helper thread failed to read from parent: ");
662  full_cerr_write(strerror(-result));
663  full_cerr_write("\n");
664  ::abort();
665  }
666  if (buf[0] == '1') {
667  set_default_signals();
669  full_write(toParent, buf);
670  } else if (buf[0] == '2') {
671  // We have just finished forking. Reload the file descriptors for thread
672  // communication.
673  close(toParent);
674  close(fromParent);
675  toParent = childToParent_[1];
676  fromParent = parentToChild_[0];
677  } else if (buf[0] == '3') {
678  break;
679  } else {
680  set_default_signals();
681  close(toParent);
682  full_cerr_write("\n\nTraceback helper thread got unknown command from parent: ");
683  full_cerr_write(buf);
684  full_cerr_write("\n");
685  ::abort();
686  }
687  }
688  }
689 
691  int result = full_write(parentToChild_[1], "1");
692  if (result < 0) {
693  full_cerr_write("\n\nAttempt to request stacktrace failed: ");
694  full_cerr_write(strerror(-result));
695  full_cerr_write("\n");
696  return;
697  }
698  char buf[2];
699  buf[1] = '\0';
700  if ((result = full_read(childToParent_[0], buf, 1, 5 * 60)) < 0) {
701  full_cerr_write("\n\nWaiting for stacktrace completion failed: ");
702  if (result == -ETIMEDOUT) {
703  full_cerr_write("timed out waiting for GDB to complete.");
704  } else {
705  full_cerr_write(strerror(-result));
706  }
707  full_cerr_write("\n");
708  return;
709  }
710  }
711 
713  char child_stack[4 * 1024];
714  char* child_stack_ptr = child_stack + 4 * 1024;
715  // On Linux, we currently use jemalloc. This registers pthread_atfork handlers; these
716  // handlers are *not* async-signal safe. Hence, a deadlock is possible if we invoke
717  // fork() from our signal handlers. Accordingly, we use clone (not POSIX, but AS-safe)
718  // as that is closer to the 'raw metal' syscall and avoids pthread_atfork handlers.
719  int pid =
720 #ifdef __linux__
721  clone(edm::service::cmssw_stacktrace, child_stack_ptr, CLONE_VM | CLONE_FS | SIGCHLD, nullptr);
722 #else
723  fork();
724  if (child_stack_ptr) {
725  } // Suppress 'unused variable' warning on non-Linux
726  if (pid == 0) {
728  }
729 #endif
730  if (pid == -1) {
731  full_cerr_write("(Attempt to perform stack dump failed.)\n");
732  } else {
733  int status;
734  if (waitpid(pid, &status, 0) == -1) {
735  full_cerr_write("(Failed to wait on stack dump output.)\n");
736  }
737  if (status) {
738  full_cerr_write("(GDB stack trace failed unexpectedly)\n");
739  }
740  }
741  }
742 
743  int cmssw_stacktrace(void* /*arg*/) {
744  set_default_signals();
745 
747  // NOTE: this is NOT async-signal-safe at CERN's lxplus service.
748  // CERN uses LD_PRELOAD to replace execv with a function from libsnoopy which
749  // calls dlsym.
750 #ifdef __linux__
751  syscall(SYS_execve, "/bin/sh", argv, __environ);
752 #else
753  execv("/bin/sh", argv);
754 #endif
755  ::abort();
756  return 1;
757  }
758 
759  static constexpr char pstackName[] = "(CMSSW stack trace helper)";
760  static constexpr char dashC[] = "-c";
763  int InitRootHandlers::parentToChild_[2] = {-1, -1};
764  int InitRootHandlers::childToParent_[2] = {-1, -1};
765  std::unique_ptr<std::thread> InitRootHandlers::helperThread_;
766  std::unique_ptr<InitRootHandlers::ThreadTracker> InitRootHandlers::threadTracker_;
768  std::vector<std::array<char, moduleBufferSize>> InitRootHandlers::moduleListBuffers_;
769  std::atomic<std::size_t> InitRootHandlers::nextModule_(0), InitRootHandlers::doneModules_(0);
770 
772  : RootHandlers(),
773  unloadSigHandler_(pset.getUntrackedParameter<bool>("UnloadRootSigHandler")),
774  resetErrHandler_(pset.getUntrackedParameter<bool>("ResetRootErrHandler")),
775  loadAllDictionaries_(pset.getUntrackedParameter<bool>("LoadAllDictionaries")),
776  autoLibraryLoader_(loadAllDictionaries_ or pset.getUntrackedParameter<bool>("AutoLibraryLoader")),
777  autoClassParser_(pset.getUntrackedParameter<bool>("AutoClassParser")),
778  interactiveDebug_(pset.getUntrackedParameter<bool>("InteractiveDebug")) {
779  stackTracePause_ = pset.getUntrackedParameter<int>("StackTracePauseTime");
780 
781  if (not threadTracker_) {
782  threadTracker_ = std::make_unique<ThreadTracker>();
783  iReg.watchPostEndJob([]() {
784  if (threadTracker_) {
785  threadTracker_->observe(false);
786  }
787  });
788  }
789 
790  if (unloadSigHandler_) {
791  // Deactivate all the Root signal handlers and restore the system defaults
792  gSystem->ResetSignal(kSigChild);
793  gSystem->ResetSignal(kSigBus);
794  gSystem->ResetSignal(kSigSegmentationViolation);
795  gSystem->ResetSignal(kSigIllegalInstruction);
796  gSystem->ResetSignal(kSigSystem);
797  gSystem->ResetSignal(kSigPipe);
798  gSystem->ResetSignal(kSigAlarm);
799  gSystem->ResetSignal(kSigUrgent);
800  gSystem->ResetSignal(kSigFloatingException);
801  gSystem->ResetSignal(kSigWindowChanged);
802  } else if (pset.getUntrackedParameter<bool>("AbortOnSignal")) {
803  cachePidInfo();
804 
805  //NOTE: ROOT can also be told to abort on these kinds of problems, BUT
806  // it requires a TApplication to be instantiated, which causes problems
807  gSystem->ResetSignal(kSigBus);
808  gSystem->ResetSignal(kSigSegmentationViolation);
809  gSystem->ResetSignal(kSigIllegalInstruction);
810  gSystem->ResetSignal(kSigFloatingException);
811  installCustomHandler(SIGBUS, sig_dostack_then_abort);
812  sigBusHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGBUS, sig_abort); });
813  installCustomHandler(SIGSEGV, sig_dostack_then_abort);
814  sigSegvHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGSEGV, sig_abort); });
815  installCustomHandler(SIGILL, sig_dostack_then_abort);
816  sigIllHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGILL, sig_abort); });
817  installCustomHandler(SIGTERM, sig_dostack_then_abort);
818  sigTermHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGTERM, sig_abort); });
819  installCustomHandler(SIGFPE, sig_dostack_then_abort);
820  sigFpeHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGFPE, sig_abort); });
821  installCustomHandler(SIGABRT, sig_dostack_then_abort);
822  sigAbrtHandler_ = std::shared_ptr<const void>(nullptr, [](void*) {
823  signal(SIGABRT, SIG_DFL); // release SIGABRT to default
824  });
825  }
826 
827  iReg.watchPreallocate([](edm::service::SystemBounds const& iBounds) {
828  if (iBounds.maxNumberOfThreads() > moduleListBuffers_.size()) {
829  moduleListBuffers_.resize(iBounds.maxNumberOfThreads());
830  }
831  });
832 
833  if (resetErrHandler_) {
834  // Replace the Root error handler with one that uses the MessageLogger
835  SetErrorHandler(RootErrorHandler);
836  }
837 
838  // Enable automatic Root library loading.
839  if (autoLibraryLoader_) {
840  gInterpreter->SetClassAutoloading(1);
841  }
842 
843  // Enable/disable automatic parsing of headers
844  if (not autoClassParser_) {
845  // Disable automatic parsing of headers during module construction
847  [](edm::ModuleDescription const&) { gInterpreter->SetClassAutoparsing(false); });
849  [](edm::ModuleDescription const&) { gInterpreter->SetClassAutoparsing(true); });
850  }
851 
852  // Set ROOT parameters.
853  TTree::SetMaxTreeSize(kMaxLong64);
854  TH1::AddDirectory(kFALSE);
855  //G__SetCatchException(0);
856 
857  // Set custom streamers
859 
860  // Load the library containing dictionaries for std:: classes, if not already loaded.
861  if (!hasDictionary(typeid(std::vector<std::vector<unsigned int>>))) {
862  TypeWithDict::byName("std::vector<std::vector<unsigned int> >");
863  }
864 
865  int debugLevel = pset.getUntrackedParameter<int>("DebugLevel");
866  if (debugLevel > 0) {
867  gDebug = debugLevel;
868  }
869 
870  // Enable Root implicit multi-threading
871  bool imt = pset.getUntrackedParameter<bool>("EnableIMT");
872  if (imt && not ROOT::IsImplicitMTEnabled()) {
873  //cmsRun uses global_control to set the number of allowed threads to use
874  // we need to tell ROOT the same value in order to avoid unnecessary warnings
875  ROOT::EnableImplicitMT(
876  oneapi::tbb::global_control::active_value(oneapi::tbb::global_control::max_allowed_parallelism));
877  }
878  }
879 
881  // close all open ROOT files
882  TIter iter(gROOT->GetListOfFiles());
883  TObject* obj = nullptr;
884  while (nullptr != (obj = iter.Next())) {
885  TFile* f = dynamic_cast<TFile*>(obj);
886  if (f) {
887  // We get a new iterator each time,
888  // because closing a file can invalidate the iterator
889  f->Close();
890  iter = TIter(gROOT->GetListOfFiles());
891  }
892  }
893  //disengage from TBB to avoid possible at exit problems
894  threadTracker_.reset();
895  }
896 
898  //Tell Root we want to be multi-threaded
899  ROOT::EnableThreadSafety();
900 
901  //When threading, also have to keep ROOT from logging all TObjects into a list
902  TObject::SetObjectStat(false);
903 
904  //Have to avoid having Streamers modify themselves after they have been used
905  TVirtualStreamerInfo::Optimize(false);
906  }
907 
910  desc.setComment("Centralized interface to ROOT.");
911  desc.addUntracked<bool>("UnloadRootSigHandler", false)
912  ->setComment("If True, signals are handled by this service, rather than by ROOT.");
913  desc.addUntracked<bool>("ResetRootErrHandler", true)
914  ->setComment(
915  "If True, ROOT messages (e.g. errors, warnings) are handled by this service, rather than by ROOT.");
916  desc.addUntracked<bool>("AutoLibraryLoader", true)
917  ->setComment("If True, enables automatic loading of data dictionaries.");
918  desc.addUntracked<bool>("AutoClassParser", true)
919  ->setComment(
920  "If False, the automatic parsing of class headers for dictionaries when pre-built dictionaries are "
921  "missing is disable during module construction. The current implementation of disabling the parsing is "
922  "fragile, and may work only in a single-thread job that does not use reco::parser::cutParser() or "
923  "reco::parser::expressionParser() (and it certainly does not work on multiple threads).");
924  desc.addUntracked<bool>("LoadAllDictionaries", false)->setComment("If True, loads all ROOT dictionaries.");
925  desc.addUntracked<bool>("EnableIMT", true)->setComment("If True, calls ROOT::EnableImplicitMT().");
926  desc.addUntracked<bool>("AbortOnSignal", true)
927  ->setComment(
928  "If True, do an abort when a signal occurs that causes a crash. If False, ROOT will do an exit which "
929  "attempts to do a clean shutdown.");
930  desc.addUntracked<bool>("InteractiveDebug", false)
931  ->setComment(
932  "If True, leave gdb attached to cmsRun after a crash; "
933  "if False, attach gdb, print a stack trace, and quit gdb");
934  desc.addUntracked<int>("DebugLevel", 0)->setComment("Sets ROOT's gDebug value.");
935  desc.addUntracked<int>("StackTracePauseTime", 300)
936  ->setComment("Seconds to pause other threads during stack trace.");
937  descriptions.add("InitRootHandlers", desc);
938  }
939 
940  char const* const* InitRootHandlers::getPstackArgv() { return pstackArgv_; }
941 
943 
945 
947  if (helperThread_) {
948  //Another InitRootHandlers was initialized in this job, possibly
949  // because multiple EventProcessors are being used.
950  //In that case, we are already all setup
951  return;
952  }
953  std::string gdbcmd{"date; gdb -quiet -p %d"};
954  if (!interactiveDebug_) {
955  gdbcmd +=
956  " 2>&1 <<EOF |\n"
957  "set width 0\n"
958  "set height 0\n"
959  "set pagination no\n"
960  "thread apply all bt\n"
961  "EOF\n"
962  "/bin/sed -n -e 's/^\\((gdb) \\)*//' -e '/^#/p' -e '/^Thread/p'";
963  }
964  if (snprintf(pidString_, pidStringLength_ - 1, gdbcmd.c_str(), getpid()) >= pidStringLength_) {
965  std::ostringstream sstr;
966  sstr << "Unable to pre-allocate stacktrace handler information";
967  edm::Exception except(edm::errors::OtherCMS, sstr.str());
968  throw except;
969  }
970 
971  // These are initialized to -1; harmless to close an invalid FD.
972  // If this is called post-fork, we don't want to be communicating on
973  // these FDs as they are used internally by the parent.
974  close(childToParent_[0]);
975  close(childToParent_[1]);
976  childToParent_[0] = -1;
977  childToParent_[1] = -1;
978  close(parentToChild_[0]);
979  close(parentToChild_[1]);
980  parentToChild_[0] = -1;
981  parentToChild_[1] = -1;
982 
983  if (-1 == pipe2(childToParent_, O_CLOEXEC)) {
984  std::ostringstream sstr;
985  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
986  edm::Exception except(edm::errors::OtherCMS, sstr.str());
987  throw except;
988  }
989 
990  if (-1 == pipe2(parentToChild_, O_CLOEXEC)) {
991  close(childToParent_[0]);
992  close(childToParent_[1]);
993  childToParent_[0] = -1;
994  childToParent_[1] = -1;
995  std::ostringstream sstr;
996  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
997  edm::Exception except(edm::errors::OtherCMS, sstr.str());
998  throw except;
999  }
1000 
1001  helperThread_ = std::make_unique<std::thread>(stacktraceHelperThread);
1002  helperThread_->detach();
1003  }
1004 
1005  } // end of namespace service
1006 } // end of namespace edm
1007 
1009 
size
Write out results.
void watchPostModuleConstruction(PostModuleConstruction::slot_type const &iSlot)
static constexpr char dashC[]
edm::serviceregistry::AllArgsMaker< edm::RootHandlers, InitRootHandlers > RootHandlersMaker
static void cmssw_stacktrace_fork()
#define DEFINE_FWK_SERVICE_MAKER(concrete, maker)
Definition: ServiceMaker.h:102
double seconds()
void watchPreallocate(Preallocate::slot_type const &iSlot)
void setRefCoreStreamerInTClass()
void watchPostEndJob(PostEndJob::slot_type const &iSlot)
void watchPreModuleConstruction(PreModuleConstruction::slot_type const &iSlot)
std::vector< T >::const_iterator search(const cond::Time_t &val, const std::vector< T > &container)
Definition: IOVProxy.cc:21
oneapi::tbb::concurrent_unordered_set< pthread_t > Container_type
bool isProcessWideService(TFileService const *)
Definition: TFileService.h:98
static ModuleCallingContext const * getCurrentModuleOnThread()
void installCustomHandler(int signum, CFUNC func)
std::shared_ptr< const void > sigSegvHandler_
Log< level::Error, false > LogError
std::shared_ptr< const void > sigFpeHandler_
void ignoreWarnings_(edm::RootHandlers::SeverityLevel level) override
std::shared_ptr< const void > sigBusHandler_
static TypeWithDict byName(std::string const &name)
Definition: TypeWithDict.cc:74
static std::atomic< std::size_t > doneModules_
static const ThreadTracker::Container_type & threadIDs()
std::shared_ptr< const void > sigAbrtHandler_
static std::atomic< std::size_t > nextModule_
static char pidString_[pidStringLength_]
static char const *const * getPstackArgv()
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventID const &, edm::Timestamp const & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
Definition: Activities.doc:12
std::shared_ptr< const void > sigIllHandler_
std::shared_ptr< const void > sigTermHandler_
void addAdditionalInfo(std::string const &info)
Definition: Exception.cc:173
double f[11][100]
int cmssw_stacktrace(void *)
static std::unique_ptr< std::thread > helperThread_
static std::vector< std::array< char, moduleBufferSize > > moduleListBuffers_
static std::unique_ptr< ThreadTracker > threadTracker_
Log< level::Info, false > LogInfo
static constexpr int pidStringLength_
InitRootHandlers(ParameterSet const &pset, ActivityRegistry &iReg)
static char const *const pstackArgv_[]
void add(std::string const &label, ParameterSetDescription const &psetDescription)
TEveGeoShape * clone(const TEveElement *element, TEveElement *parent)
Definition: eve_macros.cc:135
unsigned int maxNumberOfThreads() const
Definition: SystemBounds.h:38
std::string moduleName(StableProvenance const &provenance, ProcessHistory const &history)
Definition: Provenance.cc:27
HLT enums.
char data[epos_bytes_allocation]
Definition: EPOS_Wrapper.h:80
static void fillDescriptions(ConfigurationDescriptions &descriptions)
bool hasDictionary(std::type_info const &)
#define O_NONBLOCK
Definition: SysFile.h:23
Log< level::Warning, false > LogWarning
static constexpr char pstackName[]
fd
Definition: ztee.py:136