CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
InitRootHandlers.cc
Go to the documentation of this file.
2 
4 
20 
21 #include "oneapi/tbb/concurrent_unordered_set.h"
22 #include "oneapi/tbb/task.h"
23 #include "oneapi/tbb/task_scheduler_observer.h"
24 #include "oneapi/tbb/global_control.h"
25 #include <memory>
26 
27 #include <thread>
28 #include <sys/wait.h>
29 #include <sstream>
30 #include <cstring>
31 #include <poll.h>
32 #include <atomic>
33 #include <algorithm>
34 #include <vector>
35 #include <string>
36 #include <array>
37 
38 // WORKAROUND: At CERN, execv is replaced with a non-async-signal safe
39 // version. This can break our stack trace printer. Avoid this by
40 // invoking the syscall directly.
41 #ifdef __linux__
42 #include <syscall.h>
43 #endif
44 
45 #include "TROOT.h"
46 #include "TError.h"
47 #include "TFile.h"
48 #include "TInterpreter.h"
49 #include "TH1.h"
50 #include "TSystem.h"
51 #include "TUnixSystem.h"
52 #include "TTree.h"
53 #include "TVirtualStreamerInfo.h"
54 
55 #include "TClassTable.h"
56 
57 #include <memory>
58 
59 namespace {
60  // size of static buffer allocated for listing module names following a
61  // stacktrace abort
62  constexpr std::size_t moduleBufferSize = 128;
63 } // namespace
64 
65 namespace edm {
67  class ParameterSet;
68  class ActivityRegistry;
69 
70  namespace service {
71  class InitRootHandlers : public RootHandlers {
72  friend int cmssw_stacktrace(void*);
73 
74  public:
75  class ThreadTracker : public oneapi::tbb::task_scheduler_observer {
76  public:
77  typedef oneapi::tbb::concurrent_unordered_set<pthread_t> Container_type;
78 
79  ThreadTracker() : oneapi::tbb::task_scheduler_observer() { observe(); }
80  ~ThreadTracker() override = default;
81 
82  void on_scheduler_entry(bool) override {
83  // ensure thread local has been allocated; not necessary on Linux with
84  // the current cmsRun linkage, but could be an issue if the platform
85  // or linkage leads to "lazy" allocation of the thread local. By
86  // referencing it here we make sure it has been allocated and can be
87  // accessed safely from our signal handler.
89  threadIDs_.insert(pthread_self());
90  }
91  void on_scheduler_exit(bool) override {}
92  const Container_type& IDs() { return threadIDs_; }
93 
94  private:
96  };
97 
98  explicit InitRootHandlers(ParameterSet const& pset, ActivityRegistry& iReg);
99  ~InitRootHandlers() override;
100 
101  static void fillDescriptions(ConfigurationDescriptions& descriptions);
102  static void stacktraceFromThread();
105  if (threadTracker_) {
106  return threadTracker_->IDs();
107  }
108  return empty;
109  }
110  static int stackTracePause() { return stackTracePause_; }
111 
112  static std::vector<std::array<char, moduleBufferSize>> moduleListBuffers_;
113  static std::atomic<std::size_t> nextModule_, doneModules_;
114 
115  private:
116  static char const* const* getPstackArgv();
117  void enableWarnings_() override;
119  void willBeUsingThreads() override;
120 
121  void cachePidInfo();
122  static void stacktraceHelperThread();
123 
124  static constexpr int pidStringLength_ = 200;
126  static char const* const pstackArgv_[];
127  static int parentToChild_[2];
128  static int childToParent_[2];
129  static std::unique_ptr<std::thread> helperThread_;
130  static std::unique_ptr<ThreadTracker> threadTracker_;
131  static int stackTracePause_;
132 
138  std::shared_ptr<const void> sigBusHandler_;
139  std::shared_ptr<const void> sigSegvHandler_;
140  std::shared_ptr<const void> sigIllHandler_;
141  std::shared_ptr<const void> sigTermHandler_;
142  std::shared_ptr<const void> sigAbrtHandler_;
143  };
144 
145  inline bool isProcessWideService(InitRootHandlers const*) { return true; }
146 
147  } // end of namespace service
148 } // end of namespace edm
149 
150 namespace edm {
151  namespace service {
152  int cmssw_stacktrace(void*);
153  }
154 } // namespace edm
155 
156 namespace {
158 
159  constexpr bool s_ignoreEverything = false;
160 
// Reports whether at least one of the C-string fragments in `substrs`
// occurs anywhere inside `search`. Fragments are tried in array order and
// the scan stops at the first hit.
template <std::size_t SIZE>
bool find_if_string(const std::string& search, const std::array<const char* const, SIZE>& substrs) {
  for (const char* fragment : substrs) {
    if (search.find(fragment) != std::string::npos) {
      return true;
    }
  }
  return false;
}
167 
168  // Contents of a message which should be reported as INFO rather than as an ERROR
169  constexpr std::array<const char* const, 9> in_message{
170  {"no dictionary for class",
171  "already in TClassTable",
172  "matrix not positive definite",
173  "not a TStreamerInfo object",
174  "Problems declaring payload",
175  "Announced number of args different from the real number of argument passed", // Always printed if gDebug>0 - regardless of whether warning message is real.
176  "nbins is <=0 - set to nbins = 1",
177  "nbinsy is <=0 - set to nbinsy = 1",
178  "oneapi::tbb::global_control is limiting"}};
179 
180  // Locations generating messages which should be reported as INFO rather than as an ERROR
181  constexpr std::array<const char* const, 7> in_location{{"Fit",
182  "TDecompChol::Solve",
183  "THistPainter::PaintInit",
184  "TUnixSystem::SetDisplay",
185  "TGClient::GetFontByName",
186  "Inverter::Dinv",
187  "RTaskArenaWrapper"}};
188 
189  constexpr std::array<const char* const, 3> in_message_print_error{{"number of iterations was insufficient",
190  "bad integrand behavior",
191  "integral is divergent, or slowly convergent"}};
192 
193  void RootErrorHandlerImpl(int level, char const* location, char const* message) {
194  bool die = false;
195 
196  // Translate ROOT severity level to MessageLogger severity level
197 
199 
200  if (level >= kFatal) {
202  } else if (level >= kSysError) {
204  } else if (level >= kError) {
206  } else if (level >= kWarning) {
208  }
209 
210  if (s_ignoreEverything || el_severity <= s_ignoreWarnings) {
212  }
213 
214  // Adapt C-strings to std::strings
215  // Arrange to report the error location as furnished by Root
216 
217  std::string el_location = "@SUB=?";
218  if (location != nullptr)
219  el_location = std::string("@SUB=") + std::string(location);
220 
221  std::string el_message = "?";
222  if (message != nullptr)
223  el_message = message;
224 
225  // Try to create a meaningful id string using knowledge of ROOT error messages
226  //
227  // id == "ROOT-ClassName" where ClassName is the affected class
228  // else "ROOT/ClassName" where ClassName is the error-declaring class
229  // else "ROOT"
230 
231  std::string el_identifier = "ROOT";
232 
233  std::string precursor("class ");
234  size_t index1 = el_message.find(precursor);
235  if (index1 != std::string::npos) {
236  size_t index2 = index1 + precursor.length();
237  size_t index3 = el_message.find_first_of(" :", index2);
238  if (index3 != std::string::npos) {
239  size_t substrlen = index3 - index2;
240  el_identifier += "-";
241  el_identifier += el_message.substr(index2, substrlen);
242  }
243  } else {
244  index1 = el_location.find("::");
245  if (index1 != std::string::npos) {
246  el_identifier += "/";
247  el_identifier += el_location.substr(0, index1);
248  }
249  }
250 
251  // Intercept some messages and upgrade the severity
252 
253  if ((el_location.find("TBranchElement::Fill") != std::string::npos) &&
254  (el_message.find("fill branch") != std::string::npos) && (el_message.find("address") != std::string::npos) &&
255  (el_message.find("not set") != std::string::npos)) {
257  }
258 
259  if ((el_message.find("Tree branches") != std::string::npos) &&
260  (el_message.find("different numbers of entries") != std::string::npos)) {
262  }
263 
264  // Intercept some messages and downgrade the severity
265 
266  if (find_if_string(el_message, in_message) || find_if_string(el_location, in_location) ||
267  (level < kError and (el_location.find("CINTTypedefBuilder::Setup") != std::string::npos) and
268  (el_message.find("possible entries are in use!") != std::string::npos))) {
270  }
271 
272  // These are a special case because we do not want them to
273  // be fatal, but we do want an error to print.
274  bool alreadyPrinted = false;
275  if (find_if_string(el_message, in_message_print_error)) {
277  edm::LogError("Root_Error") << el_location << el_message;
278  alreadyPrinted = true;
279  }
280 
281  if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
282  // Don't throw if the message is just informational.
283  die = false;
284  } else {
285  die = true;
286  }
287 
288  // Feed the message to the MessageLogger and let it choose to suppress or not.
289 
290  // Root has declared a fatal error. Throw an EDMException unless the
291  // message corresponds to a pending signal. In that case, do not throw
292  // but let the OS deal with the signal in the usual way.
293  if (die && (el_location != std::string("@SUB=TUnixSystem::DispatchSignals"))) {
294  std::ostringstream sstr;
295  sstr << "Fatal Root Error: " << el_location << "\n" << el_message << '\n';
296  edm::Exception except(edm::errors::FatalRootError, sstr.str());
297  except.addAdditionalInfo(except.message());
298  except.clearMessage();
299  throw except;
300  }
301 
302  // Typically, we get here only for informational messages,
303  // but we leave the other code in just in case we change
304  // the criteria for throwing.
305  if (!alreadyPrinted) {
306  if (el_severity == edm::RootHandlers::SeverityLevel::kFatal) {
307  edm::LogError("Root_Fatal") << el_location << el_message;
308  } else if (el_severity == edm::RootHandlers::SeverityLevel::kSysError) {
309  edm::LogError("Root_Severe") << el_location << el_message;
310  } else if (el_severity == edm::RootHandlers::SeverityLevel::kError) {
311  edm::LogError("Root_Error") << el_location << el_message;
312  } else if (el_severity == edm::RootHandlers::SeverityLevel::kWarning) {
313  edm::LogWarning("Root_Warning") << el_location << el_message;
314  } else if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
315  edm::LogInfo("Root_Information") << el_location << el_message;
316  }
317  }
318  }
319 
320  void RootErrorHandler(int level, bool, char const* location, char const* message) {
321  RootErrorHandlerImpl(level, location, message);
322  }
323 
324  extern "C" {
// Put SIGILL, SIGSEGV, SIGBUS, SIGTERM and SIGABRT back to their default
// dispositions (SIG_DFL), in that order.
void set_default_signals() {
  const int handled[] = {SIGILL, SIGSEGV, SIGBUS, SIGTERM, SIGABRT};
  for (const int signum : handled) {
    signal(signum, SIG_DFL);
  }
}
332 
// Write the complete NUL-terminated string `text` to descriptor `fd`.
// Short writes are continued from where they stopped and a write()
// interrupted by a signal (EINTR) is retried.
// Returns 0 once every byte has been written, otherwise the negated errno
// of the first failure that is not EINTR.
// Relies only on strlen() and write(); it is reached from the signal
// handlers in this file through full_cerr_write().
static int full_write(int fd, const char* text) {
  const char* cursor = text;
  size_t remaining = strlen(text);
  while (remaining != 0) {
    const ssize_t rc = write(fd, cursor, remaining);
    if (rc >= 0) {
      // Partial or full success: advance past what was accepted.
      remaining -= rc;
      cursor += rc;
      continue;
    }
    if (errno != EINTR) {
      return -errno;
    }
    // EINTR: loop around and try the same chunk again.
  }
  return 0;
}
351 
352  static int full_read(int fd, char* inbuf, size_t len, int timeout_s = -1) {
353  char* buf = inbuf;
354  size_t count = len;
355  ssize_t complete = 0;
356  std::chrono::time_point<std::chrono::steady_clock> end_time =
358  int flags;
359  if (timeout_s < 0) {
360  flags = O_NONBLOCK; // Prevents us from trying to set / restore flags later.
361  } else if ((-1 == (flags = fcntl(fd, F_GETFL)))) {
362  return -errno;
363  }
364  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
365  if (-1 == fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
366  return -errno;
367  }
368  }
369  while (count) {
370  if (timeout_s >= 0) {
371  struct pollfd poll_info {
372  fd, POLLIN, 0
373  };
374  int ms_remaining =
375  std::chrono::duration_cast<std::chrono::milliseconds>(end_time - std::chrono::steady_clock::now()).count();
376  if (ms_remaining > 0) {
377  int rc = poll(&poll_info, 1, ms_remaining);
378  if (rc <= 0) {
379  if (rc < 0) {
380  if (errno == EINTR || errno == EAGAIN) {
381  continue;
382  }
383  rc = -errno;
384  } else {
385  rc = -ETIMEDOUT;
386  }
387  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
388  fcntl(fd, F_SETFL, flags);
389  }
390  return rc;
391  }
392  } else if (ms_remaining < 0) {
393  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
394  fcntl(fd, F_SETFL, flags);
395  }
396  return -ETIMEDOUT;
397  }
398  }
399  complete = read(fd, buf, count);
400  if (complete == -1) {
401  if (errno == EINTR) {
402  continue;
403  } else if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) {
404  continue;
405  } else {
406  int orig_errno = errno;
407  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
408  fcntl(fd, F_SETFL, flags);
409  }
410  return -orig_errno;
411  }
412  }
413  count -= complete;
414  buf += complete;
415  }
416  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
417  fcntl(fd, F_SETFL, flags);
418  }
419  return 0;
420  }
421 
422  static int full_cerr_write(const char* text) { return full_write(2, text); }
423 
// These signals are only used inside the stacktrace signal handler,
// so common signals can be used. They do have to be different, since
// we do not set SA_NODEFER, and RESUME must be a signal that will
// cause sleep() to return early.
// RESUME_SIGNAL is parenthesized so that it expands to a single value no
// matter which operators surround it at the point of use.
#if defined(SIGRTMAX)
#define PAUSE_SIGNAL SIGRTMAX
#define RESUME_SIGNAL (SIGRTMAX - 1)
#elif defined(SIGINFO)  // macOS/BSD
#define PAUSE_SIGNAL SIGINFO
#define RESUME_SIGNAL SIGALRM
#endif
435 
// Deliberately empty. It exists only so the resume signal has a handler:
// a handled delivery is what interrupts the sleep() in the pause handler.
void sig_resume_handler(int /*sig*/, siginfo_t* /*info*/, void* /*context*/) {}
438 
439  // pause a thread so that a (slow) stacktrace will capture the current state
440  void sig_pause_for_stacktrace(int sig, siginfo_t*, void*) {
441  using namespace edm::service;
442 
443 #ifdef RESUME_SIGNAL
444  sigset_t sigset;
445  sigemptyset(&sigset);
446  sigaddset(&sigset, RESUME_SIGNAL);
447  pthread_sigmask(SIG_UNBLOCK, &sigset, nullptr);
448 #endif
449  // sleep is interrupted by a handled delivery of the resume signal
451 
452  if (InitRootHandlers::doneModules_.is_lock_free() && InitRootHandlers::nextModule_.is_lock_free()) {
455  char* buff = InitRootHandlers::moduleListBuffers_[i].data();
456 
457  strlcpy(buff, "\nModule: ", moduleBufferSize);
459  strlcat(buff,
460  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
461  moduleBufferSize);
462  strlcat(buff, ":", moduleBufferSize);
463  strlcat(buff,
464  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
465  moduleBufferSize);
466  } else {
467  strlcat(buff, "none", moduleBufferSize);
468  }
470  }
471  }
472  }
473 
474  void sig_dostack_then_abort(int sig, siginfo_t*, void*) {
475  using namespace edm::service;
476 
477  const auto& tids = InitRootHandlers::threadIDs();
478 
479  const auto self = pthread_self();
480 #ifdef PAUSE_SIGNAL
481  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
482  // install the "pause" handler
483  struct sigaction act;
484  act.sa_sigaction = sig_pause_for_stacktrace;
485  act.sa_flags = 0;
486  sigemptyset(&act.sa_mask);
487  sigaction(PAUSE_SIGNAL, &act, nullptr);
488 
489  // unblock pause signal globally, resume is unblocked in the pause handler
490  sigset_t pausesigset;
491  sigemptyset(&pausesigset);
492  sigaddset(&pausesigset, PAUSE_SIGNAL);
493  sigprocmask(SIG_UNBLOCK, &pausesigset, nullptr);
494 
495  // send a pause signal to all CMSSW/TBB threads other than self
496  for (auto id : tids) {
497  if (self != id) {
498  pthread_kill(id, PAUSE_SIGNAL);
499  }
500  }
501 
502 #ifdef RESUME_SIGNAL
503  // install the "resume" handler
504  act.sa_sigaction = sig_resume_handler;
505  sigaction(RESUME_SIGNAL, &act, nullptr);
506 #endif
507  }
508 #endif
509 
510  const char* signalname = "unknown";
511  switch (sig) {
512  case SIGBUS: {
513  signalname = "bus error";
514  break;
515  }
516  case SIGSEGV: {
517  signalname = "segmentation violation";
518  break;
519  }
520  case SIGILL: {
521  signalname = "illegal instruction";
522  break;
523  }
524  case SIGTERM: {
525  signalname = "external termination request";
526  break;
527  }
528  case SIGABRT: {
529  signalname = "abort signal";
530  break;
531  }
532  default:
533  break;
534  }
535  full_cerr_write("\n\nA fatal system signal has occurred: ");
536  full_cerr_write(signalname);
537  full_cerr_write("\nThe following is the call stack containing the origin of the signal.\n\n");
538 
540 
541  // resume the signal handlers to store the current module; we are not guaranteed they
542  // will have time to store their modules, so there is a race condition; this could be
543  // avoided by storing the module information before sleeping, a change that may be
544  // made when we're convinced accessing the thread-local current module is safe.
545 #ifdef RESUME_SIGNAL
546  std::size_t notified = 0;
547  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
548  for (auto id : tids) {
549  if (self != id) {
550  if (pthread_kill(id, RESUME_SIGNAL) == 0)
551  ++notified;
552  }
553  }
554  }
555 #endif
556 
557  full_cerr_write("\nCurrent Modules:\n");
558 
559  // Checking tids.count(self) ensures that we only try to access the current module in
560  // CMSSW/TBB threads. Those threads access the thread-local current module at the same
561  // time the thread is registered, so any lazy allocation will have been done at that
562  // point. Not necessary on Linux with the current cmsRun linkage, as the thread-local
563  // is allocated at exec time, not lazily.
564  if (tids.count(self) > 0) {
565  char buff[moduleBufferSize] = "\nModule: ";
567  strlcat(buff,
568  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
569  moduleBufferSize);
570  strlcat(buff, ":", moduleBufferSize);
571  strlcat(buff,
572  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
573  moduleBufferSize);
574  } else {
575  strlcat(buff, "none", moduleBufferSize);
576  }
577  strlcat(buff, " (crashed)", moduleBufferSize);
578  full_cerr_write(buff);
579  } else {
580  full_cerr_write("\nModule: non-CMSSW (crashed)");
581  }
582 
583 #ifdef PAUSE_SIGNAL
584  // wait a short interval for the paused threads to resume and fill in their module
585  // information, then print
586  if (InitRootHandlers::doneModules_.is_lock_free()) {
587  int spincount = 0;
588  timespec t = {0, 1000};
589  while (++spincount < 1000 && InitRootHandlers::doneModules_ < notified) {
590  nanosleep(&t, nullptr);
591  }
592  for (std::size_t i = 0; i < InitRootHandlers::doneModules_; ++i) {
593  full_cerr_write(InitRootHandlers::moduleListBuffers_[i].data());
594  }
595  }
596 #endif
597 
598  full_cerr_write("\n\nA fatal system signal has occurred: ");
599  full_cerr_write(signalname);
600  full_cerr_write("\n");
601 
602  // For these five known cases, re-raise the signal to get the correct
603  // exit code.
604  if ((sig == SIGILL) || (sig == SIGSEGV) || (sig == SIGBUS) || (sig == SIGTERM) || (sig == SIGABRT)) {
605  signal(sig, SIG_DFL);
606  raise(sig);
607  } else {
608  set_default_signals();
609  ::abort();
610  }
611  }
612 
613  void sig_abort(int sig, siginfo_t*, void*) {
614  full_cerr_write("\n\nFatal system signal has occurred during exit\n");
615 
616  // re-raise the signal to get the correct exit code
617  signal(sig, SIG_DFL);
618  raise(sig);
619 
620  // shouldn't get here
621  set_default_signals();
622  ::sleep(10);
623  ::abort();
624  }
625  }
626 } // end of unnamed namespace
627 
628 namespace edm {
629  namespace service {
630 
631  /*
632  * We've run into issues where GDB fails to print the thread which calls clone().
633  * To avoid this problem, we have an alternate approach below where the signal handler
634  * only reads/writes to a dedicated thread via pipes. The helper thread does the clone()
635  * invocation; we don't care if that thread is missing from the traceback in this case.
636  */
637  static void cmssw_stacktrace_fork();
638 
640  int toParent = childToParent_[1];
641  int fromParent = parentToChild_[0];
642  char buf[2];
643  buf[1] = '\0';
644 
645  while (true) {
646  int result = full_read(fromParent, buf, 1);
647  if (result < 0) {
648  // To avoid a deadlock (this function is NOT re-entrant), reset signals
649  // We never set them back to the CMSSW handler because we assume the parent
650  // thread will abort for us.
651  set_default_signals();
652  close(toParent);
653  full_cerr_write("\n\nTraceback helper thread failed to read from parent: ");
654  full_cerr_write(strerror(-result));
655  full_cerr_write("\n");
656  ::abort();
657  }
658  if (buf[0] == '1') {
659  set_default_signals();
661  full_write(toParent, buf);
662  } else if (buf[0] == '2') {
663  // We have just finished forking. Reload the file descriptors for thread
664  // communication.
665  close(toParent);
666  close(fromParent);
667  toParent = childToParent_[1];
668  fromParent = parentToChild_[0];
669  } else if (buf[0] == '3') {
670  break;
671  } else {
672  set_default_signals();
673  close(toParent);
674  full_cerr_write("\n\nTraceback helper thread got unknown command from parent: ");
675  full_cerr_write(buf);
676  full_cerr_write("\n");
677  ::abort();
678  }
679  }
680  }
681 
683  int result = full_write(parentToChild_[1], "1");
684  if (result < 0) {
685  full_cerr_write("\n\nAttempt to request stacktrace failed: ");
686  full_cerr_write(strerror(-result));
687  full_cerr_write("\n");
688  return;
689  }
690  char buf[2];
691  buf[1] = '\0';
692  if ((result = full_read(childToParent_[0], buf, 1, 5 * 60)) < 0) {
693  full_cerr_write("\n\nWaiting for stacktrace completion failed: ");
694  if (result == -ETIMEDOUT) {
695  full_cerr_write("timed out waiting for GDB to complete.");
696  } else {
697  full_cerr_write(strerror(-result));
698  }
699  full_cerr_write("\n");
700  return;
701  }
702  }
703 
705  char child_stack[4 * 1024];
706  char* child_stack_ptr = child_stack + 4 * 1024;
707  // On Linux, we currently use jemalloc. This registers pthread_atfork handlers; these
708  // handlers are *not* async-signal safe. Hence, a deadlock is possible if we invoke
709  // fork() from our signal handlers. Accordingly, we use clone (not POSIX, but AS-safe)
710  // as that is closer to the 'raw metal' syscall and avoids pthread_atfork handlers.
711  int pid =
712 #ifdef __linux__
713  clone(edm::service::cmssw_stacktrace, child_stack_ptr, CLONE_VM | CLONE_FS | SIGCHLD, nullptr);
714 #else
715  fork();
716  if (child_stack_ptr) {
717  } // Suppress 'unused variable' warning on non-Linux
718  if (pid == 0) {
720  }
721 #endif
722  if (pid == -1) {
723  full_cerr_write("(Attempt to perform stack dump failed.)\n");
724  } else {
725  int status;
726  if (waitpid(pid, &status, 0) == -1) {
727  full_cerr_write("(Failed to wait on stack dump output.)\n");
728  }
729  if (status) {
730  full_cerr_write("(GDB stack trace failed unexpectedly)\n");
731  }
732  }
733  }
734 
735  int cmssw_stacktrace(void* /*arg*/) {
736  set_default_signals();
737 
739  // NOTE: this is NOT async-signal-safe at CERN's lxplus service.
740  // CERN uses LD_PRELOAD to replace execv with a function from libsnoopy which
741  // calls dlsym.
742 #ifdef __linux__
743  syscall(SYS_execve, "/bin/sh", argv, __environ);
744 #else
745  execv("/bin/sh", argv);
746 #endif
747  ::abort();
748  return 1;
749  }
750 
751  static constexpr char pstackName[] = "(CMSSW stack trace helper)";
752  static constexpr char dashC[] = "-c";
755  int InitRootHandlers::parentToChild_[2] = {-1, -1};
756  int InitRootHandlers::childToParent_[2] = {-1, -1};
757  std::unique_ptr<std::thread> InitRootHandlers::helperThread_;
758  std::unique_ptr<InitRootHandlers::ThreadTracker> InitRootHandlers::threadTracker_;
760  std::vector<std::array<char, moduleBufferSize>> InitRootHandlers::moduleListBuffers_;
761  std::atomic<std::size_t> InitRootHandlers::nextModule_(0), InitRootHandlers::doneModules_(0);
762 
764  : RootHandlers(),
765  unloadSigHandler_(pset.getUntrackedParameter<bool>("UnloadRootSigHandler")),
766  resetErrHandler_(pset.getUntrackedParameter<bool>("ResetRootErrHandler")),
767  loadAllDictionaries_(pset.getUntrackedParameter<bool>("LoadAllDictionaries")),
768  autoLibraryLoader_(loadAllDictionaries_ or pset.getUntrackedParameter<bool>("AutoLibraryLoader")),
769  interactiveDebug_(pset.getUntrackedParameter<bool>("InteractiveDebug")) {
770  stackTracePause_ = pset.getUntrackedParameter<int>("StackTracePauseTime");
771 
772  if (not threadTracker_) {
773  threadTracker_ = std::make_unique<ThreadTracker>();
774  iReg.watchPostEndJob([]() {
775  if (threadTracker_) {
776  threadTracker_->observe(false);
777  }
778  });
779  }
780 
781  if (unloadSigHandler_) {
782  // Deactivate all the Root signal handlers and restore the system defaults
783  gSystem->ResetSignal(kSigChild);
784  gSystem->ResetSignal(kSigBus);
785  gSystem->ResetSignal(kSigSegmentationViolation);
786  gSystem->ResetSignal(kSigIllegalInstruction);
787  gSystem->ResetSignal(kSigSystem);
788  gSystem->ResetSignal(kSigPipe);
789  gSystem->ResetSignal(kSigAlarm);
790  gSystem->ResetSignal(kSigUrgent);
791  gSystem->ResetSignal(kSigFloatingException);
792  gSystem->ResetSignal(kSigWindowChanged);
793  } else if (pset.getUntrackedParameter<bool>("AbortOnSignal")) {
794  cachePidInfo();
795 
796  //NOTE: ROOT can also be told to abort on these kinds of problems BUT
797  // it requires an TApplication to be instantiated which causes problems
798  gSystem->ResetSignal(kSigBus);
799  gSystem->ResetSignal(kSigSegmentationViolation);
800  gSystem->ResetSignal(kSigIllegalInstruction);
801  installCustomHandler(SIGBUS, sig_dostack_then_abort);
802  sigBusHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGBUS, sig_abort); });
803  installCustomHandler(SIGSEGV, sig_dostack_then_abort);
804  sigSegvHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGSEGV, sig_abort); });
805  installCustomHandler(SIGILL, sig_dostack_then_abort);
806  sigIllHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGILL, sig_abort); });
807  installCustomHandler(SIGTERM, sig_dostack_then_abort);
808  sigTermHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGTERM, sig_abort); });
809  installCustomHandler(SIGABRT, sig_dostack_then_abort);
810  sigAbrtHandler_ = std::shared_ptr<const void>(nullptr, [](void*) {
811  signal(SIGABRT, SIG_DFL); // release SIGABRT to default
812  });
813  }
814 
815  iReg.watchPreallocate([](edm::service::SystemBounds const& iBounds) {
816  if (iBounds.maxNumberOfThreads() > moduleListBuffers_.size()) {
817  moduleListBuffers_.resize(iBounds.maxNumberOfThreads());
818  }
819  });
820 
821  if (resetErrHandler_) {
822  // Replace the Root error handler with one that uses the MessageLogger
823  SetErrorHandler(RootErrorHandler);
824  }
825 
826  // Enable automatic Root library loading.
827  if (autoLibraryLoader_) {
828  gInterpreter->SetClassAutoloading(1);
829  }
830 
831  // Set ROOT parameters.
832  TTree::SetMaxTreeSize(kMaxLong64);
833  TH1::AddDirectory(kFALSE);
834  //G__SetCatchException(0);
835 
836  // Set custom streamers
838 
839  // Load the library containing dictionaries for std:: classes, if not already loaded.
840  if (!hasDictionary(typeid(std::vector<std::vector<unsigned int>>))) {
841  TypeWithDict::byName("std::vector<std::vector<unsigned int> >");
842  }
843 
844  int debugLevel = pset.getUntrackedParameter<int>("DebugLevel");
845  if (debugLevel > 0) {
846  gDebug = debugLevel;
847  }
848 
849  // Enable Root implicit multi-threading
850  bool imt = pset.getUntrackedParameter<bool>("EnableIMT");
851  if (imt && not ROOT::IsImplicitMTEnabled()) {
852  //cmsRun uses global_control to set the number of allowed threads to use
853  // we need to tell ROOT the same value in order to avoid unnecessary warnings
854  ROOT::EnableImplicitMT(
855  oneapi::tbb::global_control::active_value(oneapi::tbb::global_control::max_allowed_parallelism));
856  }
857  }
858 
860  // close all open ROOT files
861  TIter iter(gROOT->GetListOfFiles());
862  TObject* obj = nullptr;
863  while (nullptr != (obj = iter.Next())) {
864  TFile* f = dynamic_cast<TFile*>(obj);
865  if (f) {
866  // We get a new iterator each time,
867  // because closing a file can invalidate the iterator
868  f->Close();
869  iter = TIter(gROOT->GetListOfFiles());
870  }
871  }
872  //disengage from TBB to avoid possible at exit problems
873  threadTracker_.reset();
874  }
875 
877  //Tell Root we want to be multi-threaded
878  ROOT::EnableThreadSafety();
879 
880  //When threading, also have to keep ROOT from logging all TObjects into a list
881  TObject::SetObjectStat(false);
882 
883  //Have to avoid having Streamers modify themselves after they have been used
884  TVirtualStreamerInfo::Optimize(false);
885  }
886 
889  desc.setComment("Centralized interface to ROOT.");
890  desc.addUntracked<bool>("UnloadRootSigHandler", false)
891  ->setComment("If True, signals are handled by this service, rather than by ROOT.");
892  desc.addUntracked<bool>("ResetRootErrHandler", true)
893  ->setComment(
894  "If True, ROOT messages (e.g. errors, warnings) are handled by this service, rather than by ROOT.");
895  desc.addUntracked<bool>("AutoLibraryLoader", true)
896  ->setComment("If True, enables automatic loading of data dictionaries.");
897  desc.addUntracked<bool>("LoadAllDictionaries", false)->setComment("If True, loads all ROOT dictionaries.");
898  desc.addUntracked<bool>("EnableIMT", true)->setComment("If True, calls ROOT::EnableImplicitMT().");
899  desc.addUntracked<bool>("AbortOnSignal", true)
900  ->setComment(
901  "If True, do an abort when a signal occurs that causes a crash. If False, ROOT will do an exit which "
902  "attempts to do a clean shutdown.");
903  desc.addUntracked<bool>("InteractiveDebug", false)
904  ->setComment(
905  "If True, leave gdb attached to cmsRun after a crash; "
906  "if False, attach gdb, print a stack trace, and quit gdb");
907  desc.addUntracked<int>("DebugLevel", 0)->setComment("Sets ROOT's gDebug value.");
908  desc.addUntracked<int>("StackTracePauseTime", 300)
909  ->setComment("Seconds to pause other threads during stack trace.");
910  descriptions.add("InitRootHandlers", desc);
911  }
912 
913  char const* const* InitRootHandlers::getPstackArgv() { return pstackArgv_; }
914 
916 
918 
920  if (helperThread_) {
921  //Another InitRootHandlers was initialized in this job, possibly
922  // because multiple EventProcessors are being used.
923  //In that case, we are already all setup
924  return;
925  }
926  std::string gdbcmd{"date; gdb -quiet -p %d"};
927  if (!interactiveDebug_) {
928  gdbcmd +=
929  " 2>&1 <<EOF |\n"
930  "set width 0\n"
931  "set height 0\n"
932  "set pagination no\n"
933  "thread apply all bt\n"
934  "EOF\n"
935  "/bin/sed -n -e 's/^\\((gdb) \\)*//' -e '/^#/p' -e '/^Thread/p'";
936  }
937  if (snprintf(pidString_, pidStringLength_ - 1, gdbcmd.c_str(), getpid()) >= pidStringLength_) {
938  std::ostringstream sstr;
939  sstr << "Unable to pre-allocate stacktrace handler information";
940  edm::Exception except(edm::errors::OtherCMS, sstr.str());
941  throw except;
942  }
943 
944  // These are initialized to -1; harmless to close an invalid FD.
945  // If this is called post-fork, we don't want to be communicating on
946  // these FDs as they are used internally by the parent.
947  close(childToParent_[0]);
948  close(childToParent_[1]);
949  childToParent_[0] = -1;
950  childToParent_[1] = -1;
951  close(parentToChild_[0]);
952  close(parentToChild_[1]);
953  parentToChild_[0] = -1;
954  parentToChild_[1] = -1;
955 
956  if (-1 == pipe2(childToParent_, O_CLOEXEC)) {
957  std::ostringstream sstr;
958  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
959  edm::Exception except(edm::errors::OtherCMS, sstr.str());
960  throw except;
961  }
962 
963  if (-1 == pipe2(parentToChild_, O_CLOEXEC)) {
964  close(childToParent_[0]);
965  close(childToParent_[1]);
966  childToParent_[0] = -1;
967  childToParent_[1] = -1;
968  std::ostringstream sstr;
969  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
970  edm::Exception except(edm::errors::OtherCMS, sstr.str());
971  throw except;
972  }
973 
974  helperThread_ = std::make_unique<std::thread>(stacktraceHelperThread);
975  helperThread_->detach();
976  }
977 
978  } // end of namespace service
979 } // end of namespace edm
980 
unsigned int maxNumberOfThreads() const
Definition: SystemBounds.h:38
T getUntrackedParameter(std::string const &, T const &) const
static constexpr char dashC[]
edm::serviceregistry::AllArgsMaker< edm::RootHandlers, InitRootHandlers > RootHandlersMaker
static void cmssw_stacktrace_fork()
#define DEFINE_FWK_SERVICE_MAKER(concrete, maker)
Definition: ServiceMaker.h:100
double seconds()
void watchPreallocate(Preallocate::slot_type const &iSlot)
void setRefCoreStreamerInTClass()
ParameterDescriptionBase * addUntracked(U const &iLabel, T const &value)
void watchPostEndJob(PostEndJob::slot_type const &iSlot)
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventIDconst &, edm::Timestampconst & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
Definition: Activities.doc:12
std::vector< T >::const_iterator search(const cond::Time_t &val, const std::vector< T > &container)
Definition: IOVProxy.cc:21
oneapi::tbb::concurrent_unordered_set< pthread_t > Container_type
bool isProcessWideService(TFileService const *)
Definition: TFileService.h:98
list status
Definition: mps_update.py:107
static ModuleCallingContext const * getCurrentModuleOnThread()
void installCustomHandler(int signum, CFUNC func)
std::shared_ptr< const void > sigSegvHandler_
Log< level::Error, false > LogError
friend int cmssw_stacktrace(void *)
void ignoreWarnings_(edm::RootHandlers::SeverityLevel level) override
std::shared_ptr< const void > sigBusHandler_
static TypeWithDict byName(std::string const &name)
Definition: TypeWithDict.cc:74
tuple result
Definition: mps_fire.py:311
static std::atomic< std::size_t > doneModules_
static const ThreadTracker::Container_type & threadIDs()
std::shared_ptr< const void > sigAbrtHandler_
void setComment(std::string const &value)
static std::atomic< std::size_t > nextModule_
static char pidString_[pidStringLength_]
std::shared_ptr< const void > sigIllHandler_
std::shared_ptr< const void > sigTermHandler_
tuple fd
Definition: ztee.py:136
void addAdditionalInfo(std::string const &info)
Definition: Exception.cc:169
tuple text
Definition: runonSM.py:43
int cmssw_stacktrace(void *)
static std::unique_ptr< std::thread > helperThread_
static std::vector< std::array< char, moduleBufferSize > > moduleListBuffers_
static std::unique_ptr< ThreadTracker > threadTracker_
Log< level::Info, false > LogInfo
static constexpr int pidStringLength_
InitRootHandlers(ParameterSet const &pset, ActivityRegistry &iReg)
static char const *const * getPstackArgv()
void add(std::string const &label, ParameterSetDescription const &psetDescription)
TEveGeoShape * clone(const TEveElement *element, TEveElement *parent)
Definition: eve_macros.cc:135
static char const *const pstackArgv_[]
std::string moduleName(StableProvenance const &provenance, ProcessHistory const &history)
Definition: Provenance.cc:27
char data[epos_bytes_allocation]
Definition: EPOS_Wrapper.h:79
static void fillDescriptions(ConfigurationDescriptions &descriptions)
bool hasDictionary(std::type_info const &)
#define O_NONBLOCK
Definition: SysFile.h:23
tuple level
Definition: testEve_cfg.py:47
Log< level::Warning, false > LogWarning
static constexpr char pstackName[]
tuple size
Write out results.