CMS 3D CMS Logo

InitRootHandlers.cc
Go to the documentation of this file.
2 
18 
19 #include "oneapi/tbb/concurrent_unordered_set.h"
20 #include "oneapi/tbb/task.h"
21 #include "oneapi/tbb/task_scheduler_observer.h"
22 #include "oneapi/tbb/global_control.h"
23 #include <memory>
24 
25 #include <thread>
26 #include <sys/wait.h>
27 #include <sstream>
28 #include <cstring>
29 #include <poll.h>
30 #include <atomic>
31 #include <algorithm>
32 #include <vector>
33 #include <string>
34 #include <array>
35 
36 // WORKAROUND: At CERN, execv is replaced with a non-async-signal safe
37 // version. This can break our stack trace printer. Avoid this by
38 // invoking the syscall directly.
39 #ifdef __linux__
40 #include <syscall.h>
41 #endif
42 
43 #include "TROOT.h"
44 #include "TError.h"
45 #include "TFile.h"
46 #include "TInterpreter.h"
47 #include "TH1.h"
48 #include "TSystem.h"
49 #include "TUnixSystem.h"
50 #include "TTree.h"
51 #include "TVirtualStreamerInfo.h"
52 
53 #include "TClassTable.h"
54 
55 #include <memory>
56 
57 namespace {
58  // size of static buffer allocated for listing module names following a
59  // stacktrace abort
60  constexpr std::size_t moduleBufferSize = 128;
61 } // namespace
62 
63 namespace edm {
65  class ParameterSet;
66  class ActivityRegistry;
67 
68  namespace service {
69  class InitRootHandlers : public RootHandlers {
70  friend int cmssw_stacktrace(void*);
71 
72  public:
73  class ThreadTracker : public oneapi::tbb::task_scheduler_observer {
74  public:
75  typedef oneapi::tbb::concurrent_unordered_set<pthread_t> Container_type;
76 
77  ThreadTracker() : oneapi::tbb::task_scheduler_observer() { observe(); }
78  ~ThreadTracker() override = default;
79 
80  void on_scheduler_entry(bool) override {
81  // ensure thread local has been allocated; not necessary on Linux with
82  // the current cmsRun linkage, but could be an issue if the platform
83  // or linkage leads to "lazy" allocation of the thread local. By
84  // referencing it here we make sure it has been allocated and can be
85  // accessed safely from our signal handler.
87  threadIDs_.insert(pthread_self());
88  }
89  void on_scheduler_exit(bool) override {}
90  const Container_type& IDs() { return threadIDs_; }
91 
92  private:
94  };
95 
96  explicit InitRootHandlers(ParameterSet const& pset, ActivityRegistry& iReg);
97  ~InitRootHandlers() override;
98 
99  static void fillDescriptions(ConfigurationDescriptions& descriptions);
100  static void stacktraceFromThread();
103  if (threadTracker_) {
104  return threadTracker_->IDs();
105  }
106  return empty;
107  }
108  static int stackTracePause() { return stackTracePause_; }
109 
110  static std::vector<std::array<char, moduleBufferSize>> moduleListBuffers_;
111  static std::atomic<std::size_t> nextModule_, doneModules_;
112 
113  private:
114  static char const* const* getPstackArgv();
115  void enableWarnings_() override;
117  void willBeUsingThreads() override;
118 
119  void cachePidInfo();
120  static void stacktraceHelperThread();
121 
122  static constexpr int pidStringLength_ = 200;
124  static char const* const pstackArgv_[];
125  static int parentToChild_[2];
126  static int childToParent_[2];
127  static std::unique_ptr<std::thread> helperThread_;
128  static std::unique_ptr<ThreadTracker> threadTracker_;
129  static int stackTracePause_;
130 
137  std::shared_ptr<const void> sigBusHandler_;
138  std::shared_ptr<const void> sigSegvHandler_;
139  std::shared_ptr<const void> sigIllHandler_;
140  std::shared_ptr<const void> sigTermHandler_;
141  std::shared_ptr<const void> sigAbrtHandler_;
142  std::shared_ptr<const void> sigFpeHandler_;
143  };
144 
145  inline bool isProcessWideService(InitRootHandlers const*) { return true; }
146 
147  } // end of namespace service
148 } // end of namespace edm
149 
150 namespace edm {
151  namespace service {
152  int cmssw_stacktrace(void*);
153  }
154 } // namespace edm
155 
156 namespace {
158 
159  constexpr bool s_ignoreEverything = false;
160 
// Returns true when `search` contains at least one of the C-string fragments listed in
// `substrs`; returns false when none of them occurs in `search`.
template <std::size_t SIZE>
bool find_if_string(const std::string& search, const std::array<const char* const, SIZE>& substrs) {
  for (const char* const fragment : substrs) {
    if (search.find(fragment) != std::string::npos) {
      return true;
    }
  }
  return false;
}
167 
168  //Contents of a message which should be reported as an INFO, not an ERROR
169  constexpr std::array<const char* const, 9> in_message{
170  {"no dictionary for class",
171  "already in TClassTable",
172  "matrix not positive definite",
173  "not a TStreamerInfo object",
174  "Problems declaring payload",
175  "Announced number of args different from the real number of argument passed", // Always printed if gDebug>0 - regardless of whether warning message is real.
176  "nbins is <=0 - set to nbins = 1",
177  "nbinsy is <=0 - set to nbinsy = 1",
178  "oneapi::tbb::global_control is limiting"}};
179 
180  //Location generating messages which should be reported as an INFO, not an ERROR
181  constexpr std::array<const char* const, 7> in_location{{"Fit",
182  "TDecompChol::Solve",
183  "THistPainter::PaintInit",
184  "TUnixSystem::SetDisplay",
185  "TGClient::GetFontByName",
186  "Inverter::Dinv",
187  "RTaskArenaWrapper"}};
188 
189  constexpr std::array<const char* const, 3> in_message_print_error{{"number of iterations was insufficient",
190  "bad integrand behavior",
191  "integral is divergent, or slowly convergent"}};
192 
193  void RootErrorHandlerImpl(int level, char const* location, char const* message) {
194  bool die = false;
195 
196  // Translate ROOT severity level to MessageLogger severity level
197 
199 
200  if (level >= kFatal) {
202  } else if (level >= kSysError) {
204  } else if (level >= kError) {
206  } else if (level >= kWarning) {
208  }
209 
210  if (s_ignoreEverything || el_severity <= s_ignoreWarnings) {
212  }
213 
214  // Adapt C-strings to std::strings
215  // Arrange to report the error location as furnished by Root
216 
217  std::string el_location = "@SUB=?";
218  if (location != nullptr)
219  el_location = std::string("@SUB=") + std::string(location);
220 
221  std::string el_message = "?";
222  if (message != nullptr)
223  el_message = message;
224 
225  // Try to create a meaningful id string using knowledge of ROOT error messages
226  //
227  // id == "ROOT-ClassName" where ClassName is the affected class
228  // else "ROOT/ClassName" where ClassName is the error-declaring class
229  // else "ROOT"
230 
231  std::string el_identifier = "ROOT";
232 
233  std::string precursor("class ");
234  size_t index1 = el_message.find(precursor);
235  if (index1 != std::string::npos) {
236  size_t index2 = index1 + precursor.length();
237  size_t index3 = el_message.find_first_of(" :", index2);
238  if (index3 != std::string::npos) {
239  size_t substrlen = index3 - index2;
240  el_identifier += "-";
241  el_identifier += el_message.substr(index2, substrlen);
242  }
243  } else {
244  index1 = el_location.find("::");
245  if (index1 != std::string::npos) {
246  el_identifier += "/";
247  el_identifier += el_location.substr(0, index1);
248  }
249  }
250 
251  // Intercept some messages and upgrade the severity
252 
253  if ((el_location.find("TBranchElement::Fill") != std::string::npos) &&
254  (el_message.find("fill branch") != std::string::npos) && (el_message.find("address") != std::string::npos) &&
255  (el_message.find("not set") != std::string::npos)) {
257  }
258 
259  if ((el_message.find("Tree branches") != std::string::npos) &&
260  (el_message.find("different numbers of entries") != std::string::npos)) {
262  }
263 
264  // Intercept some messages and downgrade the severity
265 
266  if (find_if_string(el_message, in_message) || find_if_string(el_location, in_location) ||
267  (level < kError and (el_location.find("CINTTypedefBuilder::Setup") != std::string::npos) and
268  (el_message.find("possible entries are in use!") != std::string::npos))) {
270  }
271 
272  // These are a special case because we do not want them to
273  // be fatal, but we do want an error to print.
274  bool alreadyPrinted = false;
275  if (find_if_string(el_message, in_message_print_error)) {
277  edm::LogError("Root_Error") << el_location << el_message;
278  alreadyPrinted = true;
279  }
280 
281  if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
282  // Don't throw if the message is just informational.
283  die = false;
284  } else {
285  die = true;
286  }
287 
288  // Feed the message to the MessageLogger and let it choose to suppress or not.
289 
290  // Root has declared a fatal error. Throw an EDMException unless the
291  // message corresponds to a pending signal. In that case, do not throw
292  // but let the OS deal with the signal in the usual way.
293  if (die && (el_location != std::string("@SUB=TUnixSystem::DispatchSignals"))) {
294  std::ostringstream sstr;
295  sstr << "Fatal Root Error: " << el_location << "\n" << el_message << '\n';
296  edm::Exception except(edm::errors::FatalRootError, sstr.str());
297  except.addAdditionalInfo(except.message());
298  except.clearMessage();
299  throw except;
300  }
301 
302  // Typically, we get here only for informational messages,
303  // but we leave the other code in just in case we change
304  // the criteria for throwing.
305  if (!alreadyPrinted) {
306  if (el_severity == edm::RootHandlers::SeverityLevel::kFatal) {
307  edm::LogError("Root_Fatal") << el_location << el_message;
308  } else if (el_severity == edm::RootHandlers::SeverityLevel::kSysError) {
309  edm::LogError("Root_Severe") << el_location << el_message;
310  } else if (el_severity == edm::RootHandlers::SeverityLevel::kError) {
311  edm::LogError("Root_Error") << el_location << el_message;
312  } else if (el_severity == edm::RootHandlers::SeverityLevel::kWarning) {
313  edm::LogWarning("Root_Warning") << el_location << el_message;
314  } else if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
315  edm::LogInfo("Root_Information") << el_location << el_message;
316  }
317  }
318  }
319 
// Adapter with the four-argument signature that is handed to ROOT's SetErrorHandler():
// the bool argument is ignored and everything else is forwarded to RootErrorHandlerImpl.
320  void RootErrorHandler(int level, bool, char const* location, char const* message) {
321  RootErrorHandlerImpl(level, location, message);
322  }
323 
324  extern "C" {
// Put SIGILL, SIGSEGV, SIGBUS, SIGTERM, SIGFPE and SIGABRT back to their default
// disposition (SIG_DFL), in that order.
void set_default_signals() {
  static constexpr int handledSignals[] = {SIGILL, SIGSEGV, SIGBUS, SIGTERM, SIGFPE, SIGABRT};
  for (int const signum : handledSignals) {
    signal(signum, SIG_DFL);
  }
}
333 
// Write the whole NUL-terminated string `text` to `fd`. Short writes are continued from
// where they stopped and writes interrupted by a signal (EINTR) are retried.
// Returns 0 once everything has been written, or -errno of the failing write().
static int full_write(int fd, const char* text) {
  const char* cursor = text;
  for (size_t remaining = strlen(text); remaining != 0;) {
    const ssize_t rc = write(fd, cursor, remaining);
    if (rc == -1) {
      if (errno != EINTR) {
        return -errno;
      }
      continue;  // interrupted before any byte was written: try again
    }
    remaining -= rc;
    cursor += rc;
  }
  return 0;
}
352 
// Read exactly `len` bytes from `fd` into `inbuf`.
//
//   timeout_s <  0 : no deadline; the descriptor's flags are left untouched.
//   timeout_s >= 0 : the descriptor is put in non-blocking mode for the duration of the call
//                    (its original flags are restored before returning) and poll() is used to
//                    bound the total wait to `timeout_s` seconds.
//
// Returns 0 on success, -ETIMEDOUT when the deadline passes, -EPIPE when end-of-file is
// reached before `len` bytes were read, or -errno of the failing fcntl()/poll()/read().
static int full_read(int fd, char* inbuf, size_t len, int timeout_s = -1) {
  char* buf = inbuf;
  size_t count = len;
  const std::chrono::time_point<std::chrono::steady_clock> end_time =
      std::chrono::steady_clock::now() + std::chrono::seconds(timeout_s);
  int flags;
  if (timeout_s < 0) {
    flags = O_NONBLOCK;  // Prevents us from trying to set / restore flags later.
  } else if ((-1 == (flags = fcntl(fd, F_GETFL)))) {
    return -errno;
  }
  const bool mustRestoreFlags = (flags & O_NONBLOCK) != O_NONBLOCK;
  if (mustRestoreFlags && -1 == fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
    return -errno;
  }
  // Every exit below goes through here so the original flags are always put back. The
  // result code is evaluated by the caller before fcntl() gets a chance to change errno.
  auto restoreAndReturn = [&](int rc) {
    if (mustRestoreFlags) {
      fcntl(fd, F_SETFL, flags);
    }
    return rc;
  };

  while (count) {
    if (timeout_s >= 0) {
      struct pollfd poll_info {
        fd, POLLIN, 0
      };
      const int ms_remaining = static_cast<int>(
          std::chrono::duration_cast<std::chrono::milliseconds>(end_time - std::chrono::steady_clock::now()).count());
      if (ms_remaining > 0) {
        const int rc = poll(&poll_info, 1, ms_remaining);
        if (rc < 0) {
          if (errno == EINTR || errno == EAGAIN) {
            continue;
          }
          return restoreAndReturn(-errno);
        }
        if (rc == 0) {
          return restoreAndReturn(-ETIMEDOUT);
        }
      } else if (ms_remaining < 0) {
        return restoreAndReturn(-ETIMEDOUT);
      }
    }
    const ssize_t complete = read(fd, buf, count);
    if (complete == -1) {
      if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) {
        continue;
      }
      return restoreAndReturn(-errno);
    }
    if (complete == 0) {
      // End-of-file: the writing side is gone, so the missing bytes can never arrive.
      // Without this check `count` would never decrease and the loop would spin forever.
      return restoreAndReturn(-EPIPE);
    }
    count -= complete;
    buf += complete;
  }
  return restoreAndReturn(0);
}
422 
// Convenience wrapper: full_write() to file descriptor 2 (standard error).
// Returns whatever full_write() returns.
423  static int full_cerr_write(const char* text) { return full_write(2, text); }
424 
// these signals are only used inside the stacktrace signal handler,
// so common signals can be used. They do have to be different, since
// we do not set SA_NODEFER, and RESUME must be a signal that will
// cause sleep() to return early.
// RESUME_SIGNAL is parenthesized so that it expands as a single value wherever it is used.
#if defined(SIGRTMAX)
#define PAUSE_SIGNAL SIGRTMAX
#define RESUME_SIGNAL (SIGRTMAX - 1)
#elif defined(SIGINFO)  // macOS/BSD
#define PAUSE_SIGNAL SIGINFO
#define RESUME_SIGNAL SIGALRM
#endif
436 
// Deliberately empty. It exists only so that the resume signal has a handler, which is
// what interrupts the sleep() in the pause handler.
void sig_resume_handler(int /*sig*/, siginfo_t* /*info*/, void* /*context*/) {}
439 
440  // pause a thread so that a (slow) stacktrace will capture the current state
441  void sig_pause_for_stacktrace(int sig, siginfo_t*, void*) {
442  using namespace edm::service;
443 
444 #ifdef RESUME_SIGNAL
445  sigset_t sigset;
446  sigemptyset(&sigset);
447  sigaddset(&sigset, RESUME_SIGNAL);
448  pthread_sigmask(SIG_UNBLOCK, &sigset, nullptr);
449 #endif
450  // sleep is interrupted by a handled delivery of the resume signal
452 
453  if (InitRootHandlers::doneModules_.is_lock_free() && InitRootHandlers::nextModule_.is_lock_free()) {
456  char* buff = InitRootHandlers::moduleListBuffers_[i].data();
457 
458  strlcpy(buff, "\nModule: ", moduleBufferSize);
460  strlcat(buff,
461  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
462  moduleBufferSize);
463  strlcat(buff, ":", moduleBufferSize);
464  strlcat(buff,
465  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
466  moduleBufferSize);
467  } else {
468  strlcat(buff, "none", moduleBufferSize);
469  }
471  }
472  }
473  }
474 
475  void sig_dostack_then_abort(int sig, siginfo_t*, void*) {
476  using namespace edm::service;
477 
478  const auto& tids = InitRootHandlers::threadIDs();
479 
480  const auto self = pthread_self();
481 #ifdef PAUSE_SIGNAL
482  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
483  // install the "pause" handler
484  struct sigaction act;
485  act.sa_sigaction = sig_pause_for_stacktrace;
486  act.sa_flags = 0;
487  sigemptyset(&act.sa_mask);
488  sigaction(PAUSE_SIGNAL, &act, nullptr);
489 
490  // unblock pause signal globally, resume is unblocked in the pause handler
491  sigset_t pausesigset;
492  sigemptyset(&pausesigset);
493  sigaddset(&pausesigset, PAUSE_SIGNAL);
494  sigprocmask(SIG_UNBLOCK, &pausesigset, nullptr);
495 
496  // send a pause signal to all CMSSW/TBB threads other than self
497  for (auto id : tids) {
498  if (self != id) {
499  pthread_kill(id, PAUSE_SIGNAL);
500  }
501  }
502 
503 #ifdef RESUME_SIGNAL
504  // install the "resume" handler
505  act.sa_sigaction = sig_resume_handler;
506  sigaction(RESUME_SIGNAL, &act, nullptr);
507 #endif
508  }
509 #endif
510 
511  const char* signalname = "unknown";
512  switch (sig) {
513  case SIGBUS: {
514  signalname = "bus error";
515  break;
516  }
517  case SIGSEGV: {
518  signalname = "segmentation violation";
519  break;
520  }
521  case SIGILL: {
522  signalname = "illegal instruction";
523  break;
524  }
525  case SIGFPE: {
526  signalname = "floating point exception";
527  break;
528  }
529  case SIGTERM: {
530  signalname = "external termination request";
531  break;
532  }
533  case SIGABRT: {
534  signalname = "abort signal";
535  break;
536  }
537  default:
538  break;
539  }
540  full_cerr_write("\n\nA fatal system signal has occurred: ");
541  full_cerr_write(signalname);
542  full_cerr_write("\nThe following is the call stack containing the origin of the signal.\n\n");
543 
545 
546  // resume the signal handlers to store the current module; we are not guaranteed they
547  // will have time to store their modules, so there is a race condition; this could be
548  // avoided by storing the module information before sleeping, a change that may be
549  // made when we're convinced accessing the thread-local current module is safe.
550 #ifdef RESUME_SIGNAL
551  std::size_t notified = 0;
552  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
553  for (auto id : tids) {
554  if (self != id) {
555  if (pthread_kill(id, RESUME_SIGNAL) == 0)
556  ++notified;
557  }
558  }
559  }
560 #endif
561 
562  full_cerr_write("\nCurrent Modules:\n");
563 
564  // Checking tids.count(self) ensures that we only try to access the current module in
565  // CMSSW/TBB threads. Those threads access the thread-local current module at the same
566  // time the thread is registered, so any lazy allocation will have been done at that
567  // point. Not necessary on Linux with the current cmsRun linkage, as the thread-local
568  // is allocated at exec time, not lazily.
569  if (tids.count(self) > 0) {
570  char buff[moduleBufferSize] = "\nModule: ";
572  strlcat(buff,
573  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
574  moduleBufferSize);
575  strlcat(buff, ":", moduleBufferSize);
576  strlcat(buff,
577  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
578  moduleBufferSize);
579  } else {
580  strlcat(buff, "none", moduleBufferSize);
581  }
582  strlcat(buff, " (crashed)", moduleBufferSize);
583  full_cerr_write(buff);
584  } else {
585  full_cerr_write("\nModule: non-CMSSW (crashed)");
586  }
587 
588 #ifdef PAUSE_SIGNAL
589  // wait a short interval for the paused threads to resume and fill in their module
590  // information, then print
591  if (InitRootHandlers::doneModules_.is_lock_free()) {
592  int spincount = 0;
593  timespec t = {0, 1000};
594  while (++spincount < 1000 && InitRootHandlers::doneModules_ < notified) {
595  nanosleep(&t, nullptr);
596  }
597  for (std::size_t i = 0; i < InitRootHandlers::doneModules_; ++i) {
598  full_cerr_write(InitRootHandlers::moduleListBuffers_[i].data());
599  }
600  }
601 #endif
602 
603  full_cerr_write("\n\nA fatal system signal has occurred: ");
604  full_cerr_write(signalname);
605  full_cerr_write("\n");
606 
607  // For these known cases, re-raise the signal to get the correct
608  // exit code.
609  if ((sig == SIGILL) || (sig == SIGSEGV) || (sig == SIGBUS) || (sig == SIGTERM) || (sig == SIGFPE) ||
610  (sig == SIGABRT)) {
611  signal(sig, SIG_DFL);
612  raise(sig);
613  } else {
614  set_default_signals();
615  ::abort();
616  }
617  }
618 
// Minimal fatal-signal handler: the deleters of the sigBus/Segv/Ill/Term/FpeHandler_ guards
// install it in place of sig_dostack_then_abort. It prints a short notice to stderr (no
// stack trace is attempted) and re-raises `sig` with the default disposition.
619  void sig_abort(int sig, siginfo_t*, void*) {
620  full_cerr_write("\n\nFatal system signal has occurred during exit\n");
621 
622  // re-raise the signal to get the correct exit code
623  signal(sig, SIG_DFL);
624  raise(sig);
625 
626  // shouldn't get here; if raise() returned anyway, fall back to a plain abort
627  set_default_signals();
628  ::sleep(10);
629  ::abort();
630  }
631  }
632 } // end of unnamed namespace
633 
634 namespace edm {
635  namespace service {
636 
637  /*
638  * We've run into issues where GDB fails to print the thread which calls clone().
639  * To avoid this problem, we have an alternate approach below where the signal handler
640  * only reads/writes to a dedicated thread via pipes. The helper thread does the clone()
641  * invocation; we don't care if that thread is missing from the traceback in this case.
642  */
643  static void cmssw_stacktrace_fork();
644 
646  int toParent = childToParent_[1];
647  int fromParent = parentToChild_[0];
648  char buf[2];
649  buf[1] = '\0';
650 
651  while (true) {
652  int result = full_read(fromParent, buf, 1);
653  if (result < 0) {
654  // To avoid a deadlock (this function is NOT re-entrant), reset signals
655  // We never set them back to the CMSSW handler because we assume the parent
656  // thread will abort for us.
657  set_default_signals();
658  close(toParent);
659  full_cerr_write("\n\nTraceback helper thread failed to read from parent: ");
660  full_cerr_write(strerror(-result));
661  full_cerr_write("\n");
662  ::abort();
663  }
664  if (buf[0] == '1') {
665  set_default_signals();
667  full_write(toParent, buf);
668  } else if (buf[0] == '2') {
669  // We have just finished forking. Reload the file descriptors for thread
670  // communication.
671  close(toParent);
672  close(fromParent);
673  toParent = childToParent_[1];
674  fromParent = parentToChild_[0];
675  } else if (buf[0] == '3') {
676  break;
677  } else {
678  set_default_signals();
679  close(toParent);
680  full_cerr_write("\n\nTraceback helper thread got unknown command from parent: ");
681  full_cerr_write(buf);
682  full_cerr_write("\n");
683  ::abort();
684  }
685  }
686  }
687 
689  int result = full_write(parentToChild_[1], "1");
690  if (result < 0) {
691  full_cerr_write("\n\nAttempt to request stacktrace failed: ");
692  full_cerr_write(strerror(-result));
693  full_cerr_write("\n");
694  return;
695  }
696  char buf[2];
697  buf[1] = '\0';
698  if ((result = full_read(childToParent_[0], buf, 1, 5 * 60)) < 0) {
699  full_cerr_write("\n\nWaiting for stacktrace completion failed: ");
700  if (result == -ETIMEDOUT) {
701  full_cerr_write("timed out waiting for GDB to complete.");
702  } else {
703  full_cerr_write(strerror(-result));
704  }
705  full_cerr_write("\n");
706  return;
707  }
708  }
709 
711  char child_stack[4 * 1024];
712  char* child_stack_ptr = child_stack + 4 * 1024;
713  // On Linux, we currently use jemalloc. This registers pthread_atfork handlers; these
714  // handlers are *not* async-signal safe. Hence, a deadlock is possible if we invoke
715  // fork() from our signal handlers. Accordingly, we use clone (not POSIX, but AS-safe)
716  // as that is closer to the 'raw metal' syscall and avoids pthread_atfork handlers.
717  int pid =
718 #ifdef __linux__
719  clone(edm::service::cmssw_stacktrace, child_stack_ptr, CLONE_VM | CLONE_FS | SIGCHLD, nullptr);
720 #else
721  fork();
722  if (child_stack_ptr) {
723  } // Suppress 'unused variable' warning on non-Linux
724  if (pid == 0) {
726  }
727 #endif
728  if (pid == -1) {
729  full_cerr_write("(Attempt to perform stack dump failed.)\n");
730  } else {
731  int status;
732  if (waitpid(pid, &status, 0) == -1) {
733  full_cerr_write("(Failed to wait on stack dump output.)\n");
734  }
735  if (status) {
736  full_cerr_write("(GDB stack trace failed unexpectedly)\n");
737  }
738  }
739  }
740 
741  int cmssw_stacktrace(void* /*arg*/) {
742  set_default_signals();
743 
745  // NOTE: this is NOT async-signal-safe at CERN's lxplus service.
746  // CERN uses LD_PRELOAD to replace execv with a function from libsnoopy which
747  // calls dlsym.
748 #ifdef __linux__
749  syscall(SYS_execve, "/bin/sh", argv, __environ);
750 #else
751  execv("/bin/sh", argv);
752 #endif
753  ::abort();
754  return 1;
755  }
756 
757  static constexpr char pstackName[] = "(CMSSW stack trace helper)";
758  static constexpr char dashC[] = "-c";
761  int InitRootHandlers::parentToChild_[2] = {-1, -1};
762  int InitRootHandlers::childToParent_[2] = {-1, -1};
763  std::unique_ptr<std::thread> InitRootHandlers::helperThread_;
764  std::unique_ptr<InitRootHandlers::ThreadTracker> InitRootHandlers::threadTracker_;
766  std::vector<std::array<char, moduleBufferSize>> InitRootHandlers::moduleListBuffers_;
767  std::atomic<std::size_t> InitRootHandlers::nextModule_(0), InitRootHandlers::doneModules_(0);
768 
770  : RootHandlers(),
771  unloadSigHandler_(pset.getUntrackedParameter<bool>("UnloadRootSigHandler")),
772  resetErrHandler_(pset.getUntrackedParameter<bool>("ResetRootErrHandler")),
773  loadAllDictionaries_(pset.getUntrackedParameter<bool>("LoadAllDictionaries")),
774  autoLibraryLoader_(loadAllDictionaries_ or pset.getUntrackedParameter<bool>("AutoLibraryLoader")),
775  autoClassParser_(pset.getUntrackedParameter<bool>("AutoClassParser")),
776  interactiveDebug_(pset.getUntrackedParameter<bool>("InteractiveDebug")) {
777  stackTracePause_ = pset.getUntrackedParameter<int>("StackTracePauseTime");
778 
779  if (not threadTracker_) {
780  threadTracker_ = std::make_unique<ThreadTracker>();
781  iReg.watchPostEndJob([]() {
782  if (threadTracker_) {
783  threadTracker_->observe(false);
784  }
785  });
786  }
787 
788  if (unloadSigHandler_) {
789  // Deactivate all the Root signal handlers and restore the system defaults
790  gSystem->ResetSignal(kSigChild);
791  gSystem->ResetSignal(kSigBus);
792  gSystem->ResetSignal(kSigSegmentationViolation);
793  gSystem->ResetSignal(kSigIllegalInstruction);
794  gSystem->ResetSignal(kSigSystem);
795  gSystem->ResetSignal(kSigPipe);
796  gSystem->ResetSignal(kSigAlarm);
797  gSystem->ResetSignal(kSigUrgent);
798  gSystem->ResetSignal(kSigFloatingException);
799  gSystem->ResetSignal(kSigWindowChanged);
800  } else if (pset.getUntrackedParameter<bool>("AbortOnSignal")) {
801  cachePidInfo();
802 
803  //NOTE: ROOT can also be told to abort on these kinds of problems BUT
804  // it requires an TApplication to be instantiated which causes problems
805  gSystem->ResetSignal(kSigBus);
806  gSystem->ResetSignal(kSigSegmentationViolation);
807  gSystem->ResetSignal(kSigIllegalInstruction);
808  gSystem->ResetSignal(kSigFloatingException);
809  installCustomHandler(SIGBUS, sig_dostack_then_abort);
810  sigBusHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGBUS, sig_abort); });
811  installCustomHandler(SIGSEGV, sig_dostack_then_abort);
812  sigSegvHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGSEGV, sig_abort); });
813  installCustomHandler(SIGILL, sig_dostack_then_abort);
814  sigIllHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGILL, sig_abort); });
815  installCustomHandler(SIGTERM, sig_dostack_then_abort);
816  sigTermHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGTERM, sig_abort); });
817  installCustomHandler(SIGFPE, sig_dostack_then_abort);
818  sigFpeHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGFPE, sig_abort); });
819  installCustomHandler(SIGABRT, sig_dostack_then_abort);
820  sigAbrtHandler_ = std::shared_ptr<const void>(nullptr, [](void*) {
821  signal(SIGABRT, SIG_DFL); // release SIGABRT to default
822  });
823  }
824 
825  iReg.watchPreallocate([](edm::service::SystemBounds const& iBounds) {
826  if (iBounds.maxNumberOfThreads() > moduleListBuffers_.size()) {
827  moduleListBuffers_.resize(iBounds.maxNumberOfThreads());
828  }
829  });
830 
831  if (resetErrHandler_) {
832  // Replace the Root error handler with one that uses the MessageLogger
833  SetErrorHandler(RootErrorHandler);
834  }
835 
836  // Enable automatic Root library loading.
837  if (autoLibraryLoader_) {
838  gInterpreter->SetClassAutoloading(1);
839  }
840 
841  // Enable/disable automatic parsing of headers
842  if (not autoClassParser_) {
843  // Disable automatic parsing of headers during module construction
845  [](edm::ModuleDescription const&) { gInterpreter->SetClassAutoparsing(false); });
847  [](edm::ModuleDescription const&) { gInterpreter->SetClassAutoparsing(true); });
848  }
849 
850  // Set ROOT parameters.
851  TTree::SetMaxTreeSize(kMaxLong64);
852  TH1::AddDirectory(kFALSE);
853  //G__SetCatchException(0);
854 
855  // Set custom streamers
857 
858  // Load the library containing dictionaries for std:: classes, if not already loaded.
859  if (!hasDictionary(typeid(std::vector<std::vector<unsigned int>>))) {
860  TypeWithDict::byName("std::vector<std::vector<unsigned int> >");
861  }
862 
863  int debugLevel = pset.getUntrackedParameter<int>("DebugLevel");
864  if (debugLevel > 0) {
865  gDebug = debugLevel;
866  }
867 
868  // Enable Root implicit multi-threading
869  bool imt = pset.getUntrackedParameter<bool>("EnableIMT");
870  if (imt && not ROOT::IsImplicitMTEnabled()) {
871  //cmsRun uses global_control to set the number of allowed threads to use
872  // we need to tell ROOT the same value in order to avoid unnecessary warnings
873  ROOT::EnableImplicitMT(
874  oneapi::tbb::global_control::active_value(oneapi::tbb::global_control::max_allowed_parallelism));
875  }
876  }
877 
879  // close all open ROOT files
880  TIter iter(gROOT->GetListOfFiles());
881  TObject* obj = nullptr;
882  while (nullptr != (obj = iter.Next())) {
883  TFile* f = dynamic_cast<TFile*>(obj);
884  if (f) {
885  // We get a new iterator each time,
886  // because closing a file can invalidate the iterator
887  f->Close();
888  iter = TIter(gROOT->GetListOfFiles());
889  }
890  }
891  //disengage from TBB to avoid possible at exit problems
892  threadTracker_.reset();
893  }
894 
896  //Tell Root we want to be multi-threaded
897  ROOT::EnableThreadSafety();
898 
899  //When threading, also have to keep ROOT from logging all TObjects into a list
900  TObject::SetObjectStat(false);
901 
902  //Have to avoid having Streamers modify themselves after they have been used
903  TVirtualStreamerInfo::Optimize(false);
904  }
905 
908  desc.setComment("Centralized interface to ROOT.");
909  desc.addUntracked<bool>("UnloadRootSigHandler", false)
910  ->setComment("If True, signals are handled by this service, rather than by ROOT.");
911  desc.addUntracked<bool>("ResetRootErrHandler", true)
912  ->setComment(
913  "If True, ROOT messages (e.g. errors, warnings) are handled by this service, rather than by ROOT.");
914  desc.addUntracked<bool>("AutoLibraryLoader", true)
915  ->setComment("If True, enables automatic loading of data dictionaries.");
916  desc.addUntracked<bool>("AutoClassParser", true)
917  ->setComment(
918  "If False, the automatic parsing of class headers for dictionaries when pre-built dictionaries are "
919  "missing is disable during module construction. The current implementation of disabling the parsing is "
920  "fragile, and may work only in a single-thread job that does not use reco::parser::cutParser() or "
921  "reco::parser::expressionParser() (and it certainly does not work on multiple threads).");
922  desc.addUntracked<bool>("LoadAllDictionaries", false)->setComment("If True, loads all ROOT dictionaries.");
923  desc.addUntracked<bool>("EnableIMT", true)->setComment("If True, calls ROOT::EnableImplicitMT().");
924  desc.addUntracked<bool>("AbortOnSignal", true)
925  ->setComment(
926  "If True, do an abort when a signal occurs that causes a crash. If False, ROOT will do an exit which "
927  "attempts to do a clean shutdown.");
928  desc.addUntracked<bool>("InteractiveDebug", false)
929  ->setComment(
930  "If True, leave gdb attached to cmsRun after a crash; "
931  "if False, attach gdb, print a stack trace, and quit gdb");
932  desc.addUntracked<int>("DebugLevel", 0)->setComment("Sets ROOT's gDebug value.");
933  desc.addUntracked<int>("StackTracePauseTime", 300)
934  ->setComment("Seconds to pause other threads during stack trace.");
935  descriptions.add("InitRootHandlers", desc);
936  }
937 
938  char const* const* InitRootHandlers::getPstackArgv() { return pstackArgv_; }
939 
941 
943 
945  if (helperThread_) {
946  //Another InitRootHandlers was initialized in this job, possibly
947  // because multiple EventProcessors are being used.
948  //In that case, we are already all setup
949  return;
950  }
951  std::string gdbcmd{"date; gdb -quiet -p %d"};
952  if (!interactiveDebug_) {
953  gdbcmd +=
954  " 2>&1 <<EOF |\n"
955  "set width 0\n"
956  "set height 0\n"
957  "set pagination no\n"
958  "thread apply all bt\n"
959  "EOF\n"
960  "/bin/sed -n -e 's/^\\((gdb) \\)*//' -e '/^#/p' -e '/^Thread/p'";
961  }
962  if (snprintf(pidString_, pidStringLength_ - 1, gdbcmd.c_str(), getpid()) >= pidStringLength_) {
963  std::ostringstream sstr;
964  sstr << "Unable to pre-allocate stacktrace handler information";
965  edm::Exception except(edm::errors::OtherCMS, sstr.str());
966  throw except;
967  }
968 
969  // These are initialized to -1; harmless to close an invalid FD.
970  // If this is called post-fork, we don't want to be communicating on
971  // these FDs as they are used internally by the parent.
972  close(childToParent_[0]);
973  close(childToParent_[1]);
974  childToParent_[0] = -1;
975  childToParent_[1] = -1;
976  close(parentToChild_[0]);
977  close(parentToChild_[1]);
978  parentToChild_[0] = -1;
979  parentToChild_[1] = -1;
980 
981  if (-1 == pipe2(childToParent_, O_CLOEXEC)) {
982  std::ostringstream sstr;
983  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
984  edm::Exception except(edm::errors::OtherCMS, sstr.str());
985  throw except;
986  }
987 
988  if (-1 == pipe2(parentToChild_, O_CLOEXEC)) {
989  close(childToParent_[0]);
990  close(childToParent_[1]);
991  childToParent_[0] = -1;
992  childToParent_[1] = -1;
993  std::ostringstream sstr;
994  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
995  edm::Exception except(edm::errors::OtherCMS, sstr.str());
996  throw except;
997  }
998 
999  helperThread_ = std::make_unique<std::thread>(stacktraceHelperThread);
1000  helperThread_->detach();
1001  }
1002 
1003  } // end of namespace service
1004 } // end of namespace edm
1005 
1007 
size
Write out results.
void watchPostModuleConstruction(PostModuleConstruction::slot_type const &iSlot)
static constexpr char dashC[]
edm::serviceregistry::AllArgsMaker< edm::RootHandlers, InitRootHandlers > RootHandlersMaker
static void cmssw_stacktrace_fork()
#define DEFINE_FWK_SERVICE_MAKER(concrete, maker)
Definition: ServiceMaker.h:102
double seconds()
void watchPreallocate(Preallocate::slot_type const &iSlot)
void setRefCoreStreamerInTClass()
void watchPostEndJob(PostEndJob::slot_type const &iSlot)
void watchPreModuleConstruction(PreModuleConstruction::slot_type const &iSlot)
std::vector< T >::const_iterator search(const cond::Time_t &val, const std::vector< T > &container)
Definition: IOVProxy.cc:21
oneapi::tbb::concurrent_unordered_set< pthread_t > Container_type
bool isProcessWideService(TFileService const *)
Definition: TFileService.h:98
static ModuleCallingContext const * getCurrentModuleOnThread()
void installCustomHandler(int signum, CFUNC func)
std::shared_ptr< const void > sigSegvHandler_
Log< level::Error, false > LogError
std::shared_ptr< const void > sigFpeHandler_
void ignoreWarnings_(edm::RootHandlers::SeverityLevel level) override
std::shared_ptr< const void > sigBusHandler_
static TypeWithDict byName(std::string const &name)
Definition: TypeWithDict.cc:74
static std::atomic< std::size_t > doneModules_
static const ThreadTracker::Container_type & threadIDs()
std::shared_ptr< const void > sigAbrtHandler_
static std::atomic< std::size_t > nextModule_
static char pidString_[pidStringLength_]
static char const *const * getPstackArgv()
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventID const &, edm::Timestamp const & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
Definition: Activities.doc:12
std::shared_ptr< const void > sigIllHandler_
std::shared_ptr< const void > sigTermHandler_
void addAdditionalInfo(std::string const &info)
Definition: Exception.cc:173
double f[11][100]
int cmssw_stacktrace(void *)
static std::unique_ptr< std::thread > helperThread_
static std::vector< std::array< char, moduleBufferSize > > moduleListBuffers_
static std::unique_ptr< ThreadTracker > threadTracker_
Log< level::Info, false > LogInfo
static constexpr int pidStringLength_
InitRootHandlers(ParameterSet const &pset, ActivityRegistry &iReg)
static char const *const pstackArgv_[]
void add(std::string const &label, ParameterSetDescription const &psetDescription)
TEveGeoShape * clone(const TEveElement *element, TEveElement *parent)
Definition: eve_macros.cc:135
unsigned int maxNumberOfThreads() const
Definition: SystemBounds.h:38
std::string moduleName(StableProvenance const &provenance, ProcessHistory const &history)
Definition: Provenance.cc:27
HLT enums.
char data[epos_bytes_allocation]
Definition: EPOS_Wrapper.h:80
static void fillDescriptions(ConfigurationDescriptions &descriptions)
bool hasDictionary(std::type_info const &)
#define O_NONBLOCK
Definition: SysFile.h:23
Log< level::Warning, false > LogWarning
static constexpr char pstackName[]
fd
Definition: ztee.py:136