CMS 3D CMS Logo

InitRootHandlers.cc
Go to the documentation of this file.
2 
18 
19 #include "oneapi/tbb/concurrent_unordered_set.h"
20 #include "oneapi/tbb/task.h"
21 #include "oneapi/tbb/task_scheduler_observer.h"
22 #include "oneapi/tbb/global_control.h"
23 #include <memory>
24 
25 #include <thread>
26 #include <sys/wait.h>
27 #include <sstream>
28 #include <cstring>
29 #include <poll.h>
30 #include <atomic>
31 #include <algorithm>
32 #include <vector>
33 #include <string>
34 #include <array>
35 
36 // WORKAROUND: At CERN, execv is replaced with a non-async-signal safe
37 // version. This can break our stack trace printer. Avoid this by
38 // invoking the syscall directly.
39 #ifdef __linux__
40 #include <syscall.h>
41 #endif
42 
43 #include "TROOT.h"
44 #include "TError.h"
45 #include "TFile.h"
46 #include "TInterpreter.h"
47 #include "TH1.h"
48 #include "TSystem.h"
49 #include "TUnixSystem.h"
50 #include "TTree.h"
51 #include "TVirtualStreamerInfo.h"
52 
53 #include "TClassTable.h"
54 
55 #include <memory>
56 
57 namespace {
58  // size of static buffer allocated for listing module names following a
59  // stacktrace abort
60  constexpr std::size_t moduleBufferSize = 128;
61 } // namespace
62 
63 namespace edm {
65  class ParameterSet;
66  class ActivityRegistry;
67 
68  namespace service {
69  class InitRootHandlers : public RootHandlers {
70  friend int cmssw_stacktrace(void*);
71 
72  public:
73  class ThreadTracker : public oneapi::tbb::task_scheduler_observer {
74  public:
75  typedef oneapi::tbb::concurrent_unordered_set<pthread_t> Container_type;
76 
77  ThreadTracker() : oneapi::tbb::task_scheduler_observer() { observe(); }
78  ~ThreadTracker() override = default;
79 
80  void on_scheduler_entry(bool) override {
81  // ensure thread local has been allocated; not necessary on Linux with
82  // the current cmsRun linkage, but could be an issue if the platform
83  // or linkage leads to "lazy" allocation of the thread local. By
84  // referencing it here we make sure it has been allocated and can be
85  // accessed safely from our signal handler.
87  threadIDs_.insert(pthread_self());
88  }
89  void on_scheduler_exit(bool) override {}
90  const Container_type& IDs() { return threadIDs_; }
91 
92  private:
94  };
95 
96  explicit InitRootHandlers(ParameterSet const& pset, ActivityRegistry& iReg);
97  ~InitRootHandlers() override;
98 
99  static void fillDescriptions(ConfigurationDescriptions& descriptions);
100  static void stacktraceFromThread();
103  if (threadTracker_) {
104  return threadTracker_->IDs();
105  }
106  return empty;
107  }
108  static int stackTracePause() { return stackTracePause_; }
109 
110  static std::vector<std::array<char, moduleBufferSize>> moduleListBuffers_;
111  static std::atomic<std::size_t> nextModule_, doneModules_;
112 
113  private:
114  static char const* const* getPstackArgv();
115  void enableWarnings_() override;
117  void willBeUsingThreads() override;
118 
119  void cachePidInfo();
120  static void stacktraceHelperThread();
121 
122  static constexpr int pidStringLength_ = 200;
124  static char const* const pstackArgv_[];
125  static int parentToChild_[2];
126  static int childToParent_[2];
127  static std::unique_ptr<std::thread> helperThread_;
128  static std::unique_ptr<ThreadTracker> threadTracker_;
129  static int stackTracePause_;
130 
137  std::shared_ptr<const void> sigBusHandler_;
138  std::shared_ptr<const void> sigSegvHandler_;
139  std::shared_ptr<const void> sigIllHandler_;
140  std::shared_ptr<const void> sigTermHandler_;
141  std::shared_ptr<const void> sigAbrtHandler_;
142  std::shared_ptr<const void> sigFpeHandler_;
143  };
144 
145  inline bool isProcessWideService(InitRootHandlers const*) { return true; }
146 
147  } // end of namespace service
148 } // end of namespace edm
149 
150 namespace edm {
151  namespace service {
152  int cmssw_stacktrace(void*);
153  }
154 } // namespace edm
155 
156 namespace {
158 
159  constexpr bool s_ignoreEverything = false;
160 
// Returns true when `search` contains at least one of the C-string fragments
// listed in `substrs`, false otherwise (including when nothing matches an
// empty `search`).
template <std::size_t SIZE>
bool find_if_string(const std::string& search, const std::array<const char* const, SIZE>& substrs) {
  const auto containsFragment = [&search](const char* const fragment) -> bool {
    return search.find(fragment) != std::string::npos;
  };
  return std::any_of(substrs.begin(), substrs.end(), containsFragment);
}
167 
168  //Contents of a message which should be reported as an INFO, not an ERROR
169  constexpr std::array<const char* const, 11> in_message{
170  {"no dictionary for class",
171  "already in TClassTable",
172  "matrix not positive definite",
173  "not a TStreamerInfo object",
174  "Problems declaring payload",
175  "Announced number of args different from the real number of argument passed", // Always printed if gDebug>0 - regardless of whether warning message is real.
176  "nbins is <=0 - set to nbins = 1",
177  "nbinsy is <=0 - set to nbinsy = 1",
178  "oneapi::tbb::global_control is limiting",
179  "ufirst < fXmin, fXmin is used",
180  "ulast > fXmax, fXmax is used"}};
181 
182  //Location generating messages which should be reported as an INFO, not an ERROR
183  constexpr std::array<const char* const, 7> in_location{{"Fit",
184  "TDecompChol::Solve",
185  "THistPainter::PaintInit",
186  "TUnixSystem::SetDisplay",
187  "TGClient::GetFontByName",
188  "Inverter::Dinv",
189  "RTaskArenaWrapper"}};
190 
191  constexpr std::array<const char* const, 4> in_message_print_error{
192  {"number of iterations was insufficient",
193  "bad integrand behavior",
194  "integral is divergent, or slowly convergent",
195  "VariableMetricBuilder Initial matrix not pos.def."}};
196 
197  void RootErrorHandlerImpl(int level, char const* location, char const* message) {
198  bool die = false;
199 
200  // Translate ROOT severity level to MessageLogger severity level
201 
203 
204  if (level >= kFatal) {
206  } else if (level >= kSysError) {
208  } else if (level >= kError) {
210  } else if (level >= kWarning) {
212  }
213 
214  if (s_ignoreEverything || el_severity <= s_ignoreWarnings) {
216  }
217 
218  // Adapt C-strings to std::strings
219  // Arrange to report the error location as furnished by Root
220 
221  std::string el_location = "@SUB=?";
222  if (location != nullptr)
223  el_location = std::string("@SUB=") + std::string(location);
224 
225  std::string el_message = "?";
226  if (message != nullptr)
227  el_message = message;
228 
229  // Try to create a meaningful id string using knowledge of ROOT error messages
230  //
231  // id == "ROOT-ClassName" where ClassName is the affected class
232  // else "ROOT/ClassName" where ClassName is the error-declaring class
233  // else "ROOT"
234 
235  std::string el_identifier = "ROOT";
236 
237  std::string precursor("class ");
238  size_t index1 = el_message.find(precursor);
239  if (index1 != std::string::npos) {
240  size_t index2 = index1 + precursor.length();
241  size_t index3 = el_message.find_first_of(" :", index2);
242  if (index3 != std::string::npos) {
243  size_t substrlen = index3 - index2;
244  el_identifier += "-";
245  el_identifier += el_message.substr(index2, substrlen);
246  }
247  } else {
248  index1 = el_location.find("::");
249  if (index1 != std::string::npos) {
250  el_identifier += "/";
251  el_identifier += el_location.substr(0, index1);
252  }
253  }
254 
255  // Intercept some messages and upgrade the severity
256 
257  if ((el_location.find("TBranchElement::Fill") != std::string::npos) &&
258  (el_message.find("fill branch") != std::string::npos) && (el_message.find("address") != std::string::npos) &&
259  (el_message.find("not set") != std::string::npos)) {
261  }
262 
263  if ((el_message.find("Tree branches") != std::string::npos) &&
264  (el_message.find("different numbers of entries") != std::string::npos)) {
266  }
267 
268  // Intercept some messages and downgrade the severity
269 
270  if (find_if_string(el_message, in_message) || find_if_string(el_location, in_location) ||
271  (level < kError and (el_location.find("CINTTypedefBuilder::Setup") != std::string::npos) and
272  (el_message.find("possible entries are in use!") != std::string::npos))) {
274  }
275 
276  // These are a special case because we do not want them to
277  // be fatal, but we do want an error to print.
278  bool alreadyPrinted = false;
279  if (find_if_string(el_message, in_message_print_error)) {
281  edm::LogError("Root_Error") << el_location << el_message;
282  alreadyPrinted = true;
283  }
284 
285  if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
286  // Don't throw if the message is just informational.
287  die = false;
288  } else {
289  die = true;
290  }
291 
292  // Feed the message to the MessageLogger and let it choose to suppress or not.
293 
294  // Root has declared a fatal error. Throw an EDMException unless the
295  // message corresponds to a pending signal. In that case, do not throw
296  // but let the OS deal with the signal in the usual way.
297  if (die && (el_location != std::string("@SUB=TUnixSystem::DispatchSignals"))) {
298  std::ostringstream sstr;
299  sstr << "Fatal Root Error: " << el_location << "\n" << el_message << '\n';
300  edm::Exception except(edm::errors::FatalRootError, sstr.str());
301  except.addAdditionalInfo(except.message());
302  except.clearMessage();
303  throw except;
304  }
305 
306  // Typically, we get here only for informational messages,
307  // but we leave the other code in just in case we change
308  // the criteria for throwing.
309  if (!alreadyPrinted) {
310  if (el_severity == edm::RootHandlers::SeverityLevel::kFatal) {
311  edm::LogError("Root_Fatal") << el_location << el_message;
312  } else if (el_severity == edm::RootHandlers::SeverityLevel::kSysError) {
313  edm::LogError("Root_Severe") << el_location << el_message;
314  } else if (el_severity == edm::RootHandlers::SeverityLevel::kError) {
315  edm::LogError("Root_Error") << el_location << el_message;
316  } else if (el_severity == edm::RootHandlers::SeverityLevel::kWarning) {
317  edm::LogWarning("Root_Warning") << el_location << el_message;
318  } else if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
319  edm::LogInfo("Root_Information") << el_location << el_message;
320  }
321  }
322  }
323 
324  void RootErrorHandler(int level, bool, char const* location, char const* message) {
325  RootErrorHandlerImpl(level, location, message);
326  }
327 
328  extern "C" {
// Reset SIGILL, SIGSEGV, SIGBUS, SIGTERM, SIGFPE and SIGABRT (in that order)
// to their default disposition.
void set_default_signals() {
  constexpr int resetOrder[] = {SIGILL, SIGSEGV, SIGBUS, SIGTERM, SIGFPE, SIGABRT};
  for (const int signum : resetOrder) {
    signal(signum, SIG_DFL);
  }
}
337 
// Write the whole NUL-terminated string `text` to file descriptor `fd`.
// Short writes are continued from where they stopped and writes interrupted
// by a signal (EINTR) are retried.
// Returns 0 once everything has been written, or the negated errno of the
// write() call that failed.
static int full_write(int fd, const char* text) {
  const char* cursor = text;
  size_t remaining = strlen(text);
  while (remaining != 0) {
    const ssize_t n = write(fd, cursor, remaining);
    if (n == -1) {
      if (errno != EINTR) {
        return -errno;
      }
      // interrupted before any byte was written: try again
      continue;
    }
    cursor += n;
    remaining -= n;
  }
  return 0;
}
356 
357  static int full_read(int fd, char* inbuf, size_t len, int timeout_s = -1) {
358  char* buf = inbuf;
359  size_t count = len;
360  ssize_t complete = 0;
361  std::chrono::time_point<std::chrono::steady_clock> end_time =
363  int flags;
364  if (timeout_s < 0) {
365  flags = O_NONBLOCK; // Prevents us from trying to set / restore flags later.
366  } else if ((-1 == (flags = fcntl(fd, F_GETFL)))) {
367  return -errno;
368  }
369  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
370  if (-1 == fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
371  return -errno;
372  }
373  }
374  while (count) {
375  if (timeout_s >= 0) {
376  struct pollfd poll_info {
377  fd, POLLIN, 0
378  };
379  int ms_remaining =
380  std::chrono::duration_cast<std::chrono::milliseconds>(end_time - std::chrono::steady_clock::now()).count();
381  if (ms_remaining > 0) {
382  int rc = poll(&poll_info, 1, ms_remaining);
383  if (rc <= 0) {
384  if (rc < 0) {
385  if (errno == EINTR || errno == EAGAIN) {
386  continue;
387  }
388  rc = -errno;
389  } else {
390  rc = -ETIMEDOUT;
391  }
392  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
393  fcntl(fd, F_SETFL, flags);
394  }
395  return rc;
396  }
397  } else if (ms_remaining < 0) {
398  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
399  fcntl(fd, F_SETFL, flags);
400  }
401  return -ETIMEDOUT;
402  }
403  }
404  complete = read(fd, buf, count);
405  if (complete == -1) {
406  if (errno == EINTR) {
407  continue;
408  } else if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) {
409  continue;
410  } else {
411  int orig_errno = errno;
412  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
413  fcntl(fd, F_SETFL, flags);
414  }
415  return -orig_errno;
416  }
417  }
418  count -= complete;
419  buf += complete;
420  }
421  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
422  fcntl(fd, F_SETFL, flags);
423  }
424  return 0;
425  }
426 
427  static int full_cerr_write(const char* text) { return full_write(2, text); }
428 
// These signals are only used inside the stacktrace signal handler,
// so common signals can be used. They do have to be different, since
// we do not set SA_NODEFER, and RESUME must be a signal that will
// cause sleep() to return early.
// The expansions are parenthesized so that the macros keep their value when
// used inside a larger expression (e.g. `2 * RESUME_SIGNAL`).
#if defined(SIGRTMAX)
#define PAUSE_SIGNAL (SIGRTMAX)
#define RESUME_SIGNAL (SIGRTMAX - 1)
#elif defined(SIGINFO)  // macOS/BSD
#define PAUSE_SIGNAL (SIGINFO)
#define RESUME_SIGNAL (SIGALRM)
#endif
440 
// Intentionally empty: this handler exists only so that delivery of the
// resume signal interrupts the sleep() in the pause handler.
void sig_resume_handler(int /*sig*/, siginfo_t* /*info*/, void* /*context*/) {}
443 
444  // pause a thread so that a (slow) stacktrace will capture the current state
445  void sig_pause_for_stacktrace(int sig, siginfo_t*, void*) {
446  using namespace edm::service;
447 
448 #ifdef RESUME_SIGNAL
449  sigset_t sigset;
450  sigemptyset(&sigset);
451  sigaddset(&sigset, RESUME_SIGNAL);
452  pthread_sigmask(SIG_UNBLOCK, &sigset, nullptr);
453 #endif
454  // sleep interrupts on a handled delivery of the resume signal
456 
457  if (InitRootHandlers::doneModules_.is_lock_free() && InitRootHandlers::nextModule_.is_lock_free()) {
461 
462  strlcpy(buff, "\nModule: ", moduleBufferSize);
464  strlcat(buff,
465  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
466  moduleBufferSize);
467  strlcat(buff, ":", moduleBufferSize);
468  strlcat(buff,
469  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
470  moduleBufferSize);
471  } else {
472  strlcat(buff, "none", moduleBufferSize);
473  }
475  }
476  }
477  }
478 
479  void sig_dostack_then_abort(int sig, siginfo_t*, void*) {
480  using namespace edm::service;
481 
482  const auto& tids = InitRootHandlers::threadIDs();
483 
484  const auto self = pthread_self();
485 #ifdef PAUSE_SIGNAL
486  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
487  // install the "pause" handler
488  struct sigaction act;
489  act.sa_sigaction = sig_pause_for_stacktrace;
490  act.sa_flags = 0;
491  sigemptyset(&act.sa_mask);
492  sigaction(PAUSE_SIGNAL, &act, nullptr);
493 
494  // unblock pause signal globally, resume is unblocked in the pause handler
495  sigset_t pausesigset;
496  sigemptyset(&pausesigset);
497  sigaddset(&pausesigset, PAUSE_SIGNAL);
498  sigprocmask(SIG_UNBLOCK, &pausesigset, nullptr);
499 
500  // send a pause signal to all CMSSW/TBB threads other than self
501  for (auto id : tids) {
502  if (self != id) {
503  pthread_kill(id, PAUSE_SIGNAL);
504  }
505  }
506 
507 #ifdef RESUME_SIGNAL
508  // install the "resume" handler
509  act.sa_sigaction = sig_resume_handler;
510  sigaction(RESUME_SIGNAL, &act, nullptr);
511 #endif
512  }
513 #endif
514 
515  const char* signalname = "unknown";
516  switch (sig) {
517  case SIGBUS: {
518  signalname = "bus error";
519  break;
520  }
521  case SIGSEGV: {
522  signalname = "segmentation violation";
523  break;
524  }
525  case SIGILL: {
526  signalname = "illegal instruction";
527  break;
528  }
529  case SIGFPE: {
530  signalname = "floating point exception";
531  break;
532  }
533  case SIGTERM: {
534  signalname = "external termination request";
535  break;
536  }
537  case SIGABRT: {
538  signalname = "abort signal";
539  break;
540  }
541  default:
542  break;
543  }
544  full_cerr_write("\n\nA fatal system signal has occurred: ");
545  full_cerr_write(signalname);
546  full_cerr_write("\nThe following is the call stack containing the origin of the signal.\n\n");
547 
549 
550  // resume the signal handlers to store the current module; we are not guaranteed they
551  // will have time to store their modules, so there is a race condition; this could be
552  // avoided by storing the module information before sleeping, a change that may be
553  // made when we're convinced accessing the thread-local current module is safe.
554 #ifdef RESUME_SIGNAL
555  std::size_t notified = 0;
556  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
557  for (auto id : tids) {
558  if (self != id) {
559  if (pthread_kill(id, RESUME_SIGNAL) == 0)
560  ++notified;
561  }
562  }
563  }
564 #endif
565 
566  full_cerr_write("\nCurrent Modules:\n");
567 
568  // Checking tids.count(self) ensures that we only try to access the current module in
569  // CMSSW/TBB threads. Those threads access the thread-local current module at the same
570  // time the thread is registered, so any lazy allocation will have been done at that
571  // point. Not necessary on Linux with the current cmsRun linkage, as the thread-local
572  // is allocated at exec time, not lazily.
573  if (tids.count(self) > 0) {
574  char buff[moduleBufferSize] = "\nModule: ";
576  strlcat(buff,
577  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
578  moduleBufferSize);
579  strlcat(buff, ":", moduleBufferSize);
580  strlcat(buff,
581  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
582  moduleBufferSize);
583  } else {
584  strlcat(buff, "none", moduleBufferSize);
585  }
586  strlcat(buff, " (crashed)", moduleBufferSize);
587  full_cerr_write(buff);
588  } else {
589  full_cerr_write("\nModule: non-CMSSW (crashed)");
590  }
591 
592 #ifdef PAUSE_SIGNAL
593  // wait a short interval for the paused threads to resume and fill in their module
594  // information, then print
595  if (InitRootHandlers::doneModules_.is_lock_free()) {
596  int spincount = 0;
597  timespec t = {0, 1000};
598  while (++spincount < 1000 && InitRootHandlers::doneModules_ < notified) {
599  nanosleep(&t, nullptr);
600  }
601  for (std::size_t i = 0; i < InitRootHandlers::doneModules_; ++i) {
602  full_cerr_write(InitRootHandlers::moduleListBuffers_[i].data());
603  }
604  }
605 #endif
606 
607  full_cerr_write("\n\nA fatal system signal has occurred: ");
608  full_cerr_write(signalname);
609  full_cerr_write("\n");
610 
611  // For these known cases, re-raise the signal to get the correct
612  // exit code.
613  if ((sig == SIGILL) || (sig == SIGSEGV) || (sig == SIGBUS) || (sig == SIGTERM) || (sig == SIGFPE) ||
614  (sig == SIGABRT)) {
615  signal(sig, SIG_DFL);
616  raise(sig);
617  } else {
618  set_default_signals();
619  ::abort();
620  }
621  }
622 
623  void sig_abort(int sig, siginfo_t*, void*) {
624  full_cerr_write("\n\nFatal system signal has occurred during exit\n");
625 
626  // re-raise the signal to get the correct exit code
627  signal(sig, SIG_DFL);
628  raise(sig);
629 
630  // shouldn't get here
631  set_default_signals();
632  ::sleep(10);
633  ::abort();
634  }
635  }
636 } // end of unnamed namespace
637 
638 namespace edm {
639  namespace service {
640 
641  /*
642  * We've run into issues where GDB fails to print the thread which calls clone().
643  * To avoid this problem, we have an alternate approach below where the signal handler
644  * only reads/writes to a dedicated thread via pipes. The helper thread does the clone()
645  * invocation; we don't care if that thread is missing from the traceback in this case.
646  */
647  static void cmssw_stacktrace_fork();
648 
650  int toParent = childToParent_[1];
651  int fromParent = parentToChild_[0];
652  char buf[2];
653  buf[1] = '\0';
654 
655  while (true) {
656  int result = full_read(fromParent, buf, 1);
657  if (result < 0) {
658  // To avoid a deadlock (this function is NOT re-entrant), reset signals
659  // We never set them back to the CMSSW handler because we assume the parent
660  // thread will abort for us.
661  set_default_signals();
662  close(toParent);
663  full_cerr_write("\n\nTraceback helper thread failed to read from parent: ");
664  full_cerr_write(strerror(-result));
665  full_cerr_write("\n");
666  ::abort();
667  }
668  if (buf[0] == '1') {
669  set_default_signals();
671  full_write(toParent, buf);
672  } else if (buf[0] == '2') {
673  // We have just finished forking. Reload the file descriptors for thread
674  // communication.
675  close(toParent);
676  close(fromParent);
677  toParent = childToParent_[1];
678  fromParent = parentToChild_[0];
679  } else if (buf[0] == '3') {
680  break;
681  } else {
682  set_default_signals();
683  close(toParent);
684  full_cerr_write("\n\nTraceback helper thread got unknown command from parent: ");
685  full_cerr_write(buf);
686  full_cerr_write("\n");
687  ::abort();
688  }
689  }
690  }
691 
693  int result = full_write(parentToChild_[1], "1");
694  if (result < 0) {
695  full_cerr_write("\n\nAttempt to request stacktrace failed: ");
696  full_cerr_write(strerror(-result));
697  full_cerr_write("\n");
698  return;
699  }
700  char buf[2];
701  buf[1] = '\0';
702  if ((result = full_read(childToParent_[0], buf, 1, 5 * 60)) < 0) {
703  full_cerr_write("\n\nWaiting for stacktrace completion failed: ");
704  if (result == -ETIMEDOUT) {
705  full_cerr_write("timed out waiting for GDB to complete.");
706  } else {
707  full_cerr_write(strerror(-result));
708  }
709  full_cerr_write("\n");
710  return;
711  }
712  }
713 
715  char child_stack[4 * 1024];
716  char* child_stack_ptr = child_stack + 4 * 1024;
717  // On Linux, we currently use jemalloc. This registers pthread_atfork handlers; these
718  // handlers are *not* async-signal safe. Hence, a deadlock is possible if we invoke
719  // fork() from our signal handlers. Accordingly, we use clone (not POSIX, but AS-safe)
720  // as that is closer to the 'raw metal' syscall and avoids pthread_atfork handlers.
721  int pid =
722 #ifdef __linux__
723  clone(edm::service::cmssw_stacktrace, child_stack_ptr, CLONE_VM | CLONE_FS | SIGCHLD, nullptr);
724 #else
725  fork();
726  if (child_stack_ptr) {
727  } // Suppress 'unused variable' warning on non-Linux
728  if (pid == 0) {
730  }
731 #endif
732  if (pid == -1) {
733  full_cerr_write("(Attempt to perform stack dump failed.)\n");
734  } else {
735  int status;
736  if (waitpid(pid, &status, 0) == -1) {
737  full_cerr_write("(Failed to wait on stack dump output.)\n");
738  }
739  if (status) {
740  full_cerr_write("(GDB stack trace failed unexpectedly)\n");
741  }
742  }
743  }
744 
745  int cmssw_stacktrace(void* /*arg*/) {
746  set_default_signals();
747 
749  // NOTE: this is NOT async-signal-safe at CERN's lxplus service.
750  // CERN uses LD_PRELOAD to replace execv with a function from libsnoopy which
751  // calls dlsym.
752 #ifdef __linux__
753  syscall(SYS_execve, "/bin/sh", argv, __environ);
754 #else
755  execv("/bin/sh", argv);
756 #endif
757  ::abort();
758  return 1;
759  }
760 
761  static constexpr char pstackName[] = "(CMSSW stack trace helper)";
762  static constexpr char dashC[] = "-c";
765  int InitRootHandlers::parentToChild_[2] = {-1, -1};
766  int InitRootHandlers::childToParent_[2] = {-1, -1};
767  std::unique_ptr<std::thread> InitRootHandlers::helperThread_;
768  std::unique_ptr<InitRootHandlers::ThreadTracker> InitRootHandlers::threadTracker_;
770  std::vector<std::array<char, moduleBufferSize>> InitRootHandlers::moduleListBuffers_;
771  std::atomic<std::size_t> InitRootHandlers::nextModule_(0), InitRootHandlers::doneModules_(0);
772 
774  : RootHandlers(),
775  unloadSigHandler_(pset.getUntrackedParameter<bool>("UnloadRootSigHandler")),
776  resetErrHandler_(pset.getUntrackedParameter<bool>("ResetRootErrHandler")),
777  loadAllDictionaries_(pset.getUntrackedParameter<bool>("LoadAllDictionaries")),
778  autoLibraryLoader_(loadAllDictionaries_ or pset.getUntrackedParameter<bool>("AutoLibraryLoader")),
779  autoClassParser_(pset.getUntrackedParameter<bool>("AutoClassParser")),
780  interactiveDebug_(pset.getUntrackedParameter<bool>("InteractiveDebug")) {
781  stackTracePause_ = pset.getUntrackedParameter<int>("StackTracePauseTime");
782 
783  if (not threadTracker_) {
784  threadTracker_ = std::make_unique<ThreadTracker>();
785  iReg.watchPostEndJob([]() {
786  if (threadTracker_) {
787  threadTracker_->observe(false);
788  }
789  });
790  }
791 
792  if (unloadSigHandler_) {
793  // Deactivate all the Root signal handlers and restore the system defaults
794  gSystem->ResetSignal(kSigChild);
795  gSystem->ResetSignal(kSigBus);
796  gSystem->ResetSignal(kSigSegmentationViolation);
797  gSystem->ResetSignal(kSigIllegalInstruction);
798  gSystem->ResetSignal(kSigSystem);
799  gSystem->ResetSignal(kSigPipe);
800  gSystem->ResetSignal(kSigAlarm);
801  gSystem->ResetSignal(kSigUrgent);
802  gSystem->ResetSignal(kSigFloatingException);
803  gSystem->ResetSignal(kSigWindowChanged);
804  } else if (pset.getUntrackedParameter<bool>("AbortOnSignal")) {
805  cachePidInfo();
806 
807  //NOTE: ROOT can also be told to abort on these kinds of problems BUT
808  // it requires an TApplication to be instantiated which causes problems
809  gSystem->ResetSignal(kSigBus);
810  gSystem->ResetSignal(kSigSegmentationViolation);
811  gSystem->ResetSignal(kSigIllegalInstruction);
812  gSystem->ResetSignal(kSigFloatingException);
813  installCustomHandler(SIGBUS, sig_dostack_then_abort);
814  sigBusHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGBUS, sig_abort); });
815  installCustomHandler(SIGSEGV, sig_dostack_then_abort);
816  sigSegvHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGSEGV, sig_abort); });
817  installCustomHandler(SIGILL, sig_dostack_then_abort);
818  sigIllHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGILL, sig_abort); });
819  installCustomHandler(SIGTERM, sig_dostack_then_abort);
820  sigTermHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGTERM, sig_abort); });
821  installCustomHandler(SIGFPE, sig_dostack_then_abort);
822  sigFpeHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGFPE, sig_abort); });
823  installCustomHandler(SIGABRT, sig_dostack_then_abort);
824  sigAbrtHandler_ = std::shared_ptr<const void>(nullptr, [](void*) {
825  signal(SIGABRT, SIG_DFL); // release SIGABRT to default
826  });
827  }
828 
829  iReg.watchPreallocate([](edm::service::SystemBounds const& iBounds) {
830  if (iBounds.maxNumberOfThreads() > moduleListBuffers_.size()) {
831  moduleListBuffers_.resize(iBounds.maxNumberOfThreads());
832  }
833  });
834 
835  if (resetErrHandler_) {
836  // Replace the Root error handler with one that uses the MessageLogger
837  SetErrorHandler(RootErrorHandler);
838  }
839 
840  // Enable automatic Root library loading.
841  if (autoLibraryLoader_) {
842  gInterpreter->SetClassAutoloading(1);
843  }
844 
845  // Enable/disable automatic parsing of headers
846  if (not autoClassParser_) {
847  // Disable automatic parsing of headers during module construction
849  [](edm::ModuleDescription const&) { gInterpreter->SetClassAutoparsing(false); });
851  [](edm::ModuleDescription const&) { gInterpreter->SetClassAutoparsing(true); });
852  }
853 
854  // Set ROOT parameters.
855  TTree::SetMaxTreeSize(kMaxLong64);
856  TH1::AddDirectory(kFALSE);
857  //G__SetCatchException(0);
858 
859  // Set custom streamers
861 
862  // Load the library containing dictionaries for std:: classes, if not already loaded.
863  if (!hasDictionary(typeid(std::vector<std::vector<unsigned int>>))) {
864  TypeWithDict::byName("std::vector<std::vector<unsigned int> >");
865  }
866 
867  int debugLevel = pset.getUntrackedParameter<int>("DebugLevel");
868  if (debugLevel > 0) {
869  gDebug = debugLevel;
870  }
871 
872  // Enable Root implicit multi-threading
873  bool imt = pset.getUntrackedParameter<bool>("EnableIMT");
874  if (imt && not ROOT::IsImplicitMTEnabled()) {
875  //cmsRun uses global_control to set the number of allowed threads to use
876  // we need to tell ROOT the same value in order to avoid unnecessary warnings
877  ROOT::EnableImplicitMT(
878  oneapi::tbb::global_control::active_value(oneapi::tbb::global_control::max_allowed_parallelism));
879  }
880  }
881 
883  // close all open ROOT files
884  TIter iter(gROOT->GetListOfFiles());
885  TObject* obj = nullptr;
886  while (nullptr != (obj = iter.Next())) {
887  TFile* f = dynamic_cast<TFile*>(obj);
888  if (f) {
889  // We get a new iterator each time,
890  // because closing a file can invalidate the iterator
891  f->Close();
892  iter = TIter(gROOT->GetListOfFiles());
893  }
894  }
895  //disengage from TBB to avoid possible at exit problems
896  threadTracker_.reset();
897  }
898 
900  //Tell Root we want to be multi-threaded
901  ROOT::EnableThreadSafety();
902 
903  //When threading, also have to keep ROOT from logging all TObjects into a list
904  TObject::SetObjectStat(false);
905 
906  //Have to avoid having Streamers modify themselves after they have been used
907  TVirtualStreamerInfo::Optimize(false);
908  }
909 
912  desc.setComment("Centralized interface to ROOT.");
913  desc.addUntracked<bool>("UnloadRootSigHandler", false)
914  ->setComment("If True, signals are handled by this service, rather than by ROOT.");
915  desc.addUntracked<bool>("ResetRootErrHandler", true)
916  ->setComment(
917  "If True, ROOT messages (e.g. errors, warnings) are handled by this service, rather than by ROOT.");
918  desc.addUntracked<bool>("AutoLibraryLoader", true)
919  ->setComment("If True, enables automatic loading of data dictionaries.");
920  desc.addUntracked<bool>("AutoClassParser", true)
921  ->setComment(
922  "If False, the automatic parsing of class headers for dictionaries when pre-built dictionaries are "
923  "missing is disable during module construction. The current implementation of disabling the parsing is "
924  "fragile, and may work only in a single-thread job that does not use reco::parser::cutParser() or "
925  "reco::parser::expressionParser() (and it certainly does not work on multiple threads).");
926  desc.addUntracked<bool>("LoadAllDictionaries", false)->setComment("If True, loads all ROOT dictionaries.");
927  desc.addUntracked<bool>("EnableIMT", true)->setComment("If True, calls ROOT::EnableImplicitMT().");
928  desc.addUntracked<bool>("AbortOnSignal", true)
929  ->setComment(
930  "If True, do an abort when a signal occurs that causes a crash. If False, ROOT will do an exit which "
931  "attempts to do a clean shutdown.");
932  desc.addUntracked<bool>("InteractiveDebug", false)
933  ->setComment(
934  "If True, leave gdb attached to cmsRun after a crash; "
935  "if False, attach gdb, print a stack trace, and quit gdb");
936  desc.addUntracked<int>("DebugLevel", 0)->setComment("Sets ROOT's gDebug value.");
937  desc.addUntracked<int>("StackTracePauseTime", 300)
938  ->setComment("Seconds to pause other threads during stack trace.");
939  descriptions.add("InitRootHandlers", desc);
940  }
941 
942  char const* const* InitRootHandlers::getPstackArgv() { return pstackArgv_; }
943 
945 
947 
949  if (helperThread_) {
950  //Another InitRootHandlers was initialized in this job, possibly
951  // because multiple EventProcessors are being used.
952  //In that case, we are already all setup
953  return;
954  }
955  std::string gdbcmd{"date; gdb -quiet -p %d"};
956  if (!interactiveDebug_) {
957  gdbcmd +=
958  " 2>&1 <<EOF |\n"
959  "set width 0\n"
960  "set height 0\n"
961  "set pagination no\n"
962  "thread apply all bt\n"
963  "EOF\n"
964  "/bin/sed -n -e 's/^\\((gdb) \\)*//' -e '/^#/p' -e '/^Thread/p'";
965  }
966  if (snprintf(pidString_, pidStringLength_ - 1, gdbcmd.c_str(), getpid()) >= pidStringLength_) {
967  std::ostringstream sstr;
968  sstr << "Unable to pre-allocate stacktrace handler information";
969  edm::Exception except(edm::errors::OtherCMS, sstr.str());
970  throw except;
971  }
972 
973  // These are initialized to -1; harmless to close an invalid FD.
974  // If this is called post-fork, we don't want to be communicating on
975  // these FDs as they are used internally by the parent.
976  close(childToParent_[0]);
977  close(childToParent_[1]);
978  childToParent_[0] = -1;
979  childToParent_[1] = -1;
980  close(parentToChild_[0]);
981  close(parentToChild_[1]);
982  parentToChild_[0] = -1;
983  parentToChild_[1] = -1;
984 
985  if (-1 == pipe2(childToParent_, O_CLOEXEC)) {
986  std::ostringstream sstr;
987  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
988  edm::Exception except(edm::errors::OtherCMS, sstr.str());
989  throw except;
990  }
991 
992  if (-1 == pipe2(parentToChild_, O_CLOEXEC)) {
993  close(childToParent_[0]);
994  close(childToParent_[1]);
995  childToParent_[0] = -1;
996  childToParent_[1] = -1;
997  std::ostringstream sstr;
998  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
999  edm::Exception except(edm::errors::OtherCMS, sstr.str());
1000  throw except;
1001  }
1002 
1003  helperThread_ = std::make_unique<std::thread>(stacktraceHelperThread);
1004  helperThread_->detach();
1005  }
1006 
1007  } // end of namespace service
1008 } // end of namespace edm
1009 
1011 
size
Write out results.
void watchPostModuleConstruction(PostModuleConstruction::slot_type const &iSlot)
static constexpr char dashC[]
edm::serviceregistry::AllArgsMaker< edm::RootHandlers, InitRootHandlers > RootHandlersMaker
static void cmssw_stacktrace_fork()
#define DEFINE_FWK_SERVICE_MAKER(concrete, maker)
Definition: ServiceMaker.h:102
void watchPreallocate(Preallocate::slot_type const &iSlot)
void setRefCoreStreamerInTClass()
void watchPostEndJob(PostEndJob::slot_type const &iSlot)
void watchPreModuleConstruction(PreModuleConstruction::slot_type const &iSlot)
std::vector< T >::const_iterator search(const cond::Time_t &val, const std::vector< T > &container)
Definition: IOVProxy.cc:22
oneapi::tbb::concurrent_unordered_set< pthread_t > Container_type
bool isProcessWideService(TFileService const *)
Definition: TFileService.h:98
static ModuleCallingContext const * getCurrentModuleOnThread()
void installCustomHandler(int signum, CFUNC func)
std::shared_ptr< const void > sigSegvHandler_
Log< level::Error, false > LogError
location
10.6 INPUT and workflows
Definition: relval_nano.py:81
std::shared_ptr< const void > sigFpeHandler_
void ignoreWarnings_(edm::RootHandlers::SeverityLevel level) override
std::shared_ptr< const void > sigBusHandler_
static TypeWithDict byName(std::string const &name)
Definition: TypeWithDict.cc:74
static std::atomic< std::size_t > doneModules_
static const ThreadTracker::Container_type & threadIDs()
std::shared_ptr< const void > sigAbrtHandler_
static std::atomic< std::size_t > nextModule_
static char pidString_[pidStringLength_]
static char const *const * getPstackArgv()
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventID const &, edm::Timestamp const & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
Definition: Activities.doc:12
std::shared_ptr< const void > sigIllHandler_
std::shared_ptr< const void > sigTermHandler_
void addAdditionalInfo(std::string const &info)
Definition: Exception.cc:173
double f[11][100]
int cmssw_stacktrace(void *)
static std::unique_ptr< std::thread > helperThread_
static std::vector< std::array< char, moduleBufferSize > > moduleListBuffers_
static std::unique_ptr< ThreadTracker > threadTracker_
Log< level::Info, false > LogInfo
static constexpr int pidStringLength_
InitRootHandlers(ParameterSet const &pset, ActivityRegistry &iReg)
static char const *const pstackArgv_[]
void add(std::string const &label, ParameterSetDescription const &psetDescription)
TEveGeoShape * clone(const TEveElement *element, TEveElement *parent)
Definition: eve_macros.cc:135
unsigned int maxNumberOfThreads() const
Definition: SystemBounds.h:38
std::string moduleName(StableProvenance const &provenance, ProcessHistory const &history)
Definition: Provenance.cc:27
HLT enums.
char data[epos_bytes_allocation]
Definition: EPOS_Wrapper.h:80
static void fillDescriptions(ConfigurationDescriptions &descriptions)
bool hasDictionary(std::type_info const &)
#define O_NONBLOCK
Definition: SysFile.h:23
Log< level::Warning, false > LogWarning
static constexpr char pstackName[]
fd
Definition: ztee.py:136
buff
***.cc ################