CMS 3D CMS Logo

InitRootHandlers.cc
Go to the documentation of this file.
2 
4 
20 
21 #include "tbb/task.h"
22 #include "tbb/task_scheduler_observer.h"
23 #include "tbb/concurrent_unordered_set.h"
24 #include <thread>
25 #include <sys/wait.h>
26 #include <sstream>
27 #include <cstring>
28 #include <poll.h>
29 #include <atomic>
30 #include <algorithm>
31 #include <vector>
32 #include <string>
33 #include <array>
34 
35 // WORKAROUND: At CERN, execv is replaced with a non-async-signal safe
36 // version. This can break our stack trace printer. Avoid this by
37 // invoking the syscall directly.
38 #ifdef __linux__
39 #include <syscall.h>
40 #endif
41 
42 #include "TROOT.h"
43 #include "TError.h"
44 #include "TFile.h"
45 #include "TInterpreter.h"
46 #include "TH1.h"
47 #include "TSystem.h"
48 #include "TUnixSystem.h"
49 #include "TTree.h"
50 #include "TVirtualStreamerInfo.h"
51 
52 #include "TClassTable.h"
53 
54 #include <memory>
55 
namespace {
  // Size of each static buffer allocated for listing module names
  // following a stacktrace abort. Used as the extent of the std::array
  // elements of moduleListBuffers_ and as the bound passed to
  // strlcpy/strlcat when those buffers are filled.
  constexpr std::size_t moduleBufferSize = 128;
}  // namespace
61 
62 namespace edm {
64  class ParameterSet;
65  class ActivityRegistry;
66 
67  namespace service {
68  class InitRootHandlers : public RootHandlers {
69  friend int cmssw_stacktrace(void*);
70 
71  public:
72  class ThreadTracker : public tbb::task_scheduler_observer {
73  public:
74  typedef tbb::concurrent_unordered_set<pthread_t> Container_type;
75 
76  ThreadTracker() : tbb::task_scheduler_observer() { observe(true); }
77  void on_scheduler_entry(bool) override {
78  // ensure thread local has been allocated; not necessary on Linux with
79  // the current cmsRun linkage, but could be an issue if the platform
80  // or linkage leads to "lazy" allocation of the thread local. By
81  // referencing it here we make sure it has been allocated and can be
82  // accessed safely from our signal handler.
84  threadIDs_.insert(pthread_self());
85  }
86  const Container_type& IDs() { return threadIDs_; }
87 
88  private:
89  Container_type threadIDs_;
90  };
91 
92  explicit InitRootHandlers(ParameterSet const& pset, ActivityRegistry& iReg);
93  ~InitRootHandlers() override;
94 
95  static void fillDescriptions(ConfigurationDescriptions& descriptions);
96  static void stacktraceFromThread();
98  static int stackTracePause() { return stackTracePause_; }
99 
100  static std::vector<std::array<char, moduleBufferSize>> moduleListBuffers_;
101  static std::atomic<std::size_t> nextModule_, doneModules_;
102 
103  private:
104  static char* const* getPstackArgv();
105  void enableWarnings_() override;
107  void willBeUsingThreads() override;
108 
109  void cachePidInfo();
110  static void stacktraceHelperThread();
111 
112  static const int pidStringLength_ = 200;
114  static char* const pstackArgv_[];
115  static int parentToChild_[2];
116  static int childToParent_[2];
117  static std::unique_ptr<std::thread> helperThread_;
119  static int stackTracePause_;
120 
125  std::shared_ptr<const void> sigBusHandler_;
126  std::shared_ptr<const void> sigSegvHandler_;
127  std::shared_ptr<const void> sigIllHandler_;
128  std::shared_ptr<const void> sigTermHandler_;
129  std::shared_ptr<const void> sigAbrtHandler_;
130  };
131 
132  inline bool isProcessWideService(InitRootHandlers const*) { return true; }
133 
134  } // end of namespace service
135 } // end of namespace edm
136 
137 namespace edm {
138  namespace service {
139  int cmssw_stacktrace(void*);
140  }
141 } // namespace edm
142 
143 namespace {
145 
146  bool s_ignoreEverything = false;
147 
// Returns true when `search` contains at least one of the C-string
// fragments listed in `substrs`, and false otherwise.
//
// SIZE is deduced from the array argument, so the fixed-size tables in
// this file can be passed directly.
template <std::size_t SIZE>
bool find_if_string(const std::string& search, const std::array<const char* const, SIZE>& substrs) {
  return std::any_of(substrs.begin(), substrs.end(), [&search](const char* const fragment) -> bool {
    return search.find(fragment) != std::string::npos;
  });
}
154 
// Fragments matched against the ROOT message text by the "downgrade the
// severity" check in RootErrorHandlerImpl.
constexpr std::array<const char* const, 8> in_message{
    {"no dictionary for class",
     "already in TClassTable",
     "matrix not positive definite",
     "not a TStreamerInfo object",
     "Problems declaring payload",
     // Always printed if gDebug>0 - regardless of whether warning message is real.
     "Announced number of args different from the real number of argument passed",
     "nbins is <=0 - set to nbins = 1",
     "nbinsy is <=0 - set to nbinsy = 1"}};

// Fragments matched against the ROOT error location by the same
// "downgrade the severity" check.
constexpr std::array<const char* const, 6> in_location{{"Fit",
                                                        "TDecompChol::Solve",
                                                        "THistPainter::PaintInit",
                                                        "TUnixSystem::SetDisplay",
                                                        "TGClient::GetFontByName",
                                                        "Inverter::Dinv"}};

// Message fragments for the special case that should not be fatal but
// should still be printed as an error.
constexpr std::array<const char* const, 4> in_message_print{{"number of iterations was insufficient",
                                                             "bad integrand behavior",
                                                             "integral is divergent, or slowly convergent",
                                                             "but fEntryCurrent should not be in between the two"}};
176 
177  void RootErrorHandlerImpl(int level, char const* location, char const* message) {
178  bool die = false;
179 
180  // Translate ROOT severity level to MessageLogger severity level
181 
183 
184  if (level >= kFatal) {
186  } else if (level >= kSysError) {
188  } else if (level >= kError) {
190  } else if (level >= kWarning) {
192  }
193 
194  if (s_ignoreEverything || el_severity <= s_ignoreWarnings) {
196  }
197 
198  // Adapt C-strings to std::strings
199  // Arrange to report the error location as furnished by Root
200 
201  std::string el_location = "@SUB=?";
202  if (location != nullptr)
203  el_location = std::string("@SUB=") + std::string(location);
204 
205  std::string el_message = "?";
206  if (message != nullptr)
207  el_message = message;
208 
209  // Try to create a meaningful id string using knowledge of ROOT error messages
210  //
211  // id == "ROOT-ClassName" where ClassName is the affected class
212  // else "ROOT/ClassName" where ClassName is the error-declaring class
213  // else "ROOT"
214 
215  std::string el_identifier = "ROOT";
216 
217  std::string precursor("class ");
218  size_t index1 = el_message.find(precursor);
219  if (index1 != std::string::npos) {
220  size_t index2 = index1 + precursor.length();
221  size_t index3 = el_message.find_first_of(" :", index2);
222  if (index3 != std::string::npos) {
223  size_t substrlen = index3 - index2;
224  el_identifier += "-";
225  el_identifier += el_message.substr(index2, substrlen);
226  }
227  } else {
228  index1 = el_location.find("::");
229  if (index1 != std::string::npos) {
230  el_identifier += "/";
231  el_identifier += el_location.substr(0, index1);
232  }
233  }
234 
235  // Intercept some messages and upgrade the severity
236 
237  if ((el_location.find("TBranchElement::Fill") != std::string::npos) &&
238  (el_message.find("fill branch") != std::string::npos) && (el_message.find("address") != std::string::npos) &&
239  (el_message.find("not set") != std::string::npos)) {
241  }
242 
243  if ((el_message.find("Tree branches") != std::string::npos) &&
244  (el_message.find("different numbers of entries") != std::string::npos)) {
246  }
247 
248  // Intercept some messages and downgrade the severity
249 
250  if (find_if_string(el_message, in_message) || find_if_string(el_location, in_location) ||
251  (level < kError and (el_location.find("CINTTypedefBuilder::Setup") != std::string::npos) and
252  (el_message.find("possible entries are in use!") != std::string::npos))) {
254  }
255 
256  // These are a special case because we do not want them to
257  // be fatal, but we do want an error to print.
258  bool alreadyPrinted = false;
259  if (find_if_string(el_message, in_message_print)) {
261  edm::LogError("Root_Error") << el_location << el_message;
262  alreadyPrinted = true;
263  }
264 
265  if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
266  // Don't throw if the message is just informational.
267  die = false;
268  } else {
269  die = true;
270  }
271 
272  // Feed the message to the MessageLogger and let it choose to suppress or not.
273 
274  // Root has declared a fatal error. Throw an EDMException unless the
275  // message corresponds to a pending signal. In that case, do not throw
276  // but let the OS deal with the signal in the usual way.
277  if (die && (el_location != std::string("@SUB=TUnixSystem::DispatchSignals"))) {
278  std::ostringstream sstr;
279  sstr << "Fatal Root Error: " << el_location << "\n" << el_message << '\n';
280  edm::Exception except(edm::errors::FatalRootError, sstr.str());
281  except.addAdditionalInfo(except.message());
282  except.clearMessage();
283  throw except;
284  }
285 
286  // Typically, we get here only for informational messages,
287  // but we leave the other code in just in case we change
288  // the criteria for throwing.
289  if (!alreadyPrinted) {
290  if (el_severity == edm::RootHandlers::SeverityLevel::kFatal) {
291  edm::LogError("Root_Fatal") << el_location << el_message;
292  } else if (el_severity == edm::RootHandlers::SeverityLevel::kSysError) {
293  edm::LogError("Root_Severe") << el_location << el_message;
294  } else if (el_severity == edm::RootHandlers::SeverityLevel::kError) {
295  edm::LogError("Root_Error") << el_location << el_message;
296  } else if (el_severity == edm::RootHandlers::SeverityLevel::kWarning) {
297  edm::LogWarning("Root_Warning") << el_location << el_message;
298  } else if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
299  edm::LogInfo("Root_Information") << el_location << el_message;
300  }
301  }
302  }
303 
304  void RootErrorHandler(int level, bool, char const* location, char const* message) {
305  RootErrorHandlerImpl(level, location, message);
306  }
307 
308  extern "C" {
// Restores the default disposition (SIG_DFL) for the five signals this
// file installs custom handlers for: SIGILL, SIGSEGV, SIGBUS, SIGTERM
// and SIGABRT.
void set_default_signals() {
  constexpr int handledSignals[] = {SIGILL, SIGSEGV, SIGBUS, SIGTERM, SIGABRT};
  for (const int signum : handledSignals) {
    signal(signum, SIG_DFL);
  }
}
316 
// Writes the whole NUL-terminated string `text` to file descriptor `fd`,
// retrying after short writes and after EINTR.
//
// Returns 0 once every byte has been written, or a negated errno value
// on failure (-EINVAL when `text` is null, otherwise the errno reported
// by write()). Uses only strlen() and write(), with no allocation.
static int full_write(int fd, const char* text) {
  if (text == nullptr) {
    // strlen(nullptr) would be undefined behaviour; report it instead.
    return -EINVAL;
  }
  const char* cursor = text;
  size_t remaining = strlen(text);
  while (remaining > 0) {
    const ssize_t written = write(fd, cursor, remaining);
    if (written == -1) {
      if (errno == EINTR) {
        continue;  // interrupted before any byte was written: retry
      }
      return -errno;
    }
    remaining -= static_cast<size_t>(written);
    cursor += written;
  }
  return 0;
}
335 
336  static int full_read(int fd, char* inbuf, size_t len, int timeout_s = -1) {
337  char* buf = inbuf;
338  size_t count = len;
339  ssize_t complete = 0;
340  std::chrono::time_point<std::chrono::steady_clock> end_time =
342  int flags;
343  if (timeout_s < 0) {
344  flags = O_NONBLOCK; // Prevents us from trying to set / restore flags later.
345  } else if ((-1 == (flags = fcntl(fd, F_GETFL)))) {
346  return -errno;
347  }
348  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
349  if (-1 == fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
350  return -errno;
351  }
352  }
353  while (count) {
354  if (timeout_s >= 0) {
355  struct pollfd poll_info {
356  fd, POLLIN, 0
357  };
358  int ms_remaining =
359  std::chrono::duration_cast<std::chrono::milliseconds>(end_time - std::chrono::steady_clock::now()).count();
360  if (ms_remaining > 0) {
361  int rc = poll(&poll_info, 1, ms_remaining);
362  if (rc <= 0) {
363  if (rc < 0) {
364  if (errno == EINTR || errno == EAGAIN) {
365  continue;
366  }
367  rc = -errno;
368  } else {
369  rc = -ETIMEDOUT;
370  }
371  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
372  fcntl(fd, F_SETFL, flags);
373  }
374  return rc;
375  }
376  } else if (ms_remaining < 0) {
377  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
378  fcntl(fd, F_SETFL, flags);
379  }
380  return -ETIMEDOUT;
381  }
382  }
383  complete = read(fd, buf, count);
384  if (complete == -1) {
385  if (errno == EINTR) {
386  continue;
387  } else if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) {
388  continue;
389  } else {
390  int orig_errno = errno;
391  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
392  fcntl(fd, F_SETFL, flags);
393  }
394  return -orig_errno;
395  }
396  }
397  count -= complete;
398  buf += complete;
399  }
400  if ((flags & O_NONBLOCK) != O_NONBLOCK) {
401  fcntl(fd, F_SETFL, flags);
402  }
403  return 0;
404  }
405 
406  static int full_cerr_write(const char* text) { return full_write(2, text); }
407 
408 // these signals are only used inside the stacktrace signal handler,
409 // so common signals can be used. They do have to be different, since
410 // we do not set SA_NODEFER, and RESUME must be a signal that will
411 // cause sleep() to return early.
#if defined(SIGRTMAX)
#define PAUSE_SIGNAL SIGRTMAX
// Parenthesized so the expansion remains a single operand wherever the
// macro is used inside a larger expression.
#define RESUME_SIGNAL (SIGRTMAX - 1)
#elif defined(SIGINFO)  // macOS/BSD
#define PAUSE_SIGNAL SIGINFO
#define RESUME_SIGNAL SIGALRM
#endif
419 
// Does nothing; it is installed only so that delivery of the resume
// signal interrupts the sleep() in the pause handler. All three
// sigaction-style arguments are ignored.
void sig_resume_handler(int /*sig*/, siginfo_t*, void*) {}
422 
423  // pause a thread so that a (slow) stacktrace will capture the current state
424  void sig_pause_for_stacktrace(int sig, siginfo_t*, void*) {
425  using namespace edm::service;
426 
427 #ifdef RESUME_SIGNAL
428  sigset_t sigset;
429  sigemptyset(&sigset);
430  sigaddset(&sigset, RESUME_SIGNAL);
431  pthread_sigmask(SIG_UNBLOCK, &sigset, nullptr);
432 #endif
433  // sleep is interrupted by a handled delivery of the resume signal
435 
436  if (InitRootHandlers::doneModules_.is_lock_free() && InitRootHandlers::nextModule_.is_lock_free()) {
439  char* buff = InitRootHandlers::moduleListBuffers_[i].data();
440 
441  strlcpy(buff, "\nModule: ", moduleBufferSize);
443  strlcat(buff,
444  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
445  moduleBufferSize);
446  strlcat(buff, ":", moduleBufferSize);
447  strlcat(buff,
448  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
449  moduleBufferSize);
450  } else {
451  strlcat(buff, "none", moduleBufferSize);
452  }
454  }
455  }
456  }
457 
458  void sig_dostack_then_abort(int sig, siginfo_t*, void*) {
459  using namespace edm::service;
460 
461  const auto& tids = InitRootHandlers::threadIDs();
462 
463  const auto self = pthread_self();
464 #ifdef PAUSE_SIGNAL
465  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
466  // install the "pause" handler
467  struct sigaction act;
468  act.sa_sigaction = sig_pause_for_stacktrace;
469  act.sa_flags = 0;
470  sigemptyset(&act.sa_mask);
471  sigaction(PAUSE_SIGNAL, &act, nullptr);
472 
473  // unblock pause signal globally, resume is unblocked in the pause handler
474  sigset_t pausesigset;
475  sigemptyset(&pausesigset);
476  sigaddset(&pausesigset, PAUSE_SIGNAL);
477  sigprocmask(SIG_UNBLOCK, &pausesigset, nullptr);
478 
479  // send a pause signal to all CMSSW/TBB threads other than self
480  for (auto id : tids) {
481  if (self != id) {
482  pthread_kill(id, PAUSE_SIGNAL);
483  }
484  }
485 
486 #ifdef RESUME_SIGNAL
487  // install the "resume" handler
488  act.sa_sigaction = sig_resume_handler;
489  sigaction(RESUME_SIGNAL, &act, nullptr);
490 #endif
491  }
492 #endif
493 
494  const char* signalname = "unknown";
495  switch (sig) {
496  case SIGBUS: {
497  signalname = "bus error";
498  break;
499  }
500  case SIGSEGV: {
501  signalname = "segmentation violation";
502  break;
503  }
504  case SIGILL: {
505  signalname = "illegal instruction";
506  break;
507  }
508  case SIGTERM: {
509  signalname = "external termination request";
510  break;
511  }
512  case SIGABRT: {
513  signalname = "abort signal";
514  break;
515  }
516  default:
517  break;
518  }
519  full_cerr_write("\n\nA fatal system signal has occurred: ");
520  full_cerr_write(signalname);
521  full_cerr_write("\nThe following is the call stack containing the origin of the signal.\n\n");
522 
524 
525  // resume the signal handlers to store the current module; we are not guaranteed they
526  // will have time to store their modules, so there is a race condition; this could be
527  // avoided by storing the module information before sleeping, a change that may be
528  // made when we're convinced accessing the thread-local current module is safe.
529 #ifdef RESUME_SIGNAL
530  std::size_t notified = 0;
531  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
532  for (auto id : tids) {
533  if (self != id) {
534  if (pthread_kill(id, RESUME_SIGNAL) == 0)
535  ++notified;
536  }
537  }
538  }
539 #endif
540 
541  full_cerr_write("\nCurrent Modules:\n");
542 
543  // Checking tids.count(self) ensures that we only try to access the current module in
544  // CMSSW/TBB threads. Those threads access the thread-local current module at the same
545  // time the thread is registered, so any lazy allocation will have been done at that
546  // point. Not necessary on Linux with the current cmsRun linkage, as the thread-local
547  // is allocated at exec time, not lazily.
548  if (tids.count(self) > 0) {
549  char buff[moduleBufferSize] = "\nModule: ";
551  strlcat(buff,
552  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
553  moduleBufferSize);
554  strlcat(buff, ":", moduleBufferSize);
555  strlcat(buff,
556  edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
557  moduleBufferSize);
558  } else {
559  strlcat(buff, "none", moduleBufferSize);
560  }
561  strlcat(buff, " (crashed)", moduleBufferSize);
562  full_cerr_write(buff);
563  } else {
564  full_cerr_write("\nModule: non-CMSSW (crashed)");
565  }
566 
567 #ifdef PAUSE_SIGNAL
568  // wait a short interval for the paused threads to resume and fill in their module
569  // information, then print
570  if (InitRootHandlers::doneModules_.is_lock_free()) {
571  int spincount = 0;
572  timespec t = {0, 1000};
573  while (++spincount < 1000 && InitRootHandlers::doneModules_ < notified) {
574  nanosleep(&t, nullptr);
575  }
576  for (std::size_t i = 0; i < InitRootHandlers::doneModules_; ++i) {
577  full_cerr_write(InitRootHandlers::moduleListBuffers_[i].data());
578  }
579  }
580 #endif
581 
582  full_cerr_write("\n\nA fatal system signal has occurred: ");
583  full_cerr_write(signalname);
584  full_cerr_write("\n");
585 
586  // For these five known cases, re-raise the signal to get the correct
587  // exit code.
588  if ((sig == SIGILL) || (sig == SIGSEGV) || (sig == SIGBUS) || (sig == SIGTERM) || (sig == SIGABRT)) {
589  signal(sig, SIG_DFL);
590  raise(sig);
591  } else {
592  set_default_signals();
593  ::abort();
594  }
595  }
596 
597  void sig_abort(int sig, siginfo_t*, void*) {
598  full_cerr_write("\n\nFatal system signal has occurred during exit\n");
599 
600  // re-raise the signal to get the correct exit code
601  signal(sig, SIG_DFL);
602  raise(sig);
603 
604  // shouldn't get here
605  set_default_signals();
606  ::sleep(10);
607  ::abort();
608  }
609  }
610 } // end of unnamed namespace
611 
612 namespace edm {
613  namespace service {
614 
615  /*
616  * We've run into issues where GDB fails to print the thread which calls clone().
617  * To avoid this problem, we have an alternate approach below where the signal handler
618  * only reads/writes to a dedicated thread via pipes. The helper thread does the clone()
619  * invocation; we don't care if that thread is missing from the traceback in this case.
620  */
621  static void cmssw_stacktrace_fork();
622 
624  int toParent = childToParent_[1];
625  int fromParent = parentToChild_[0];
626  char buf[2];
627  buf[1] = '\0';
628 
629  while (true) {
630  int result = full_read(fromParent, buf, 1);
631  if (result < 0) {
632  // To avoid a deadlock (this function is NOT re-entrant), reset signals
633  // We never set them back to the CMSSW handler because we assume the parent
634  // thread will abort for us.
635  set_default_signals();
636  close(toParent);
637  full_cerr_write("\n\nTraceback helper thread failed to read from parent: ");
638  full_cerr_write(strerror(-result));
639  full_cerr_write("\n");
640  ::abort();
641  }
642  if (buf[0] == '1') {
643  set_default_signals();
645  full_write(toParent, buf);
646  } else if (buf[0] == '2') {
647  // We have just finished forking. Reload the file descriptors for thread
648  // communication.
649  close(toParent);
650  close(fromParent);
651  toParent = childToParent_[1];
652  fromParent = parentToChild_[0];
653  } else if (buf[0] == '3') {
654  break;
655  } else {
656  set_default_signals();
657  close(toParent);
658  full_cerr_write("\n\nTraceback helper thread got unknown command from parent: ");
659  full_cerr_write(buf);
660  full_cerr_write("\n");
661  ::abort();
662  }
663  }
664  }
665 
667  int result = full_write(parentToChild_[1], "1");
668  if (result < 0) {
669  full_cerr_write("\n\nAttempt to request stacktrace failed: ");
670  full_cerr_write(strerror(-result));
671  full_cerr_write("\n");
672  return;
673  }
674  char buf[2];
675  buf[1] = '\0';
676  if ((result = full_read(childToParent_[0], buf, 1, 5 * 60)) < 0) {
677  full_cerr_write("\n\nWaiting for stacktrace completion failed: ");
678  if (result == -ETIMEDOUT) {
679  full_cerr_write("timed out waiting for GDB to complete.");
680  } else {
681  full_cerr_write(strerror(-result));
682  }
683  full_cerr_write("\n");
684  return;
685  }
686  }
687 
689  char child_stack[4 * 1024];
690  char* child_stack_ptr = child_stack + 4 * 1024;
691  // On Linux, we currently use jemalloc. This registers pthread_atfork handlers; these
692  // handlers are *not* async-signal safe. Hence, a deadlock is possible if we invoke
693  // fork() from our signal handlers. Accordingly, we use clone (not POSIX, but AS-safe)
694  // as that is closer to the 'raw metal' syscall and avoids pthread_atfork handlers.
695  int pid =
696 #ifdef __linux__
697  clone(edm::service::cmssw_stacktrace, child_stack_ptr, CLONE_VM | CLONE_FS | SIGCHLD, nullptr);
698 #else
699  fork();
700  if (child_stack_ptr) {
701  } // Suppress 'unused variable' warning on non-Linux
702  if (pid == 0) {
704  }
705 #endif
706  if (pid == -1) {
707  full_cerr_write("(Attempt to perform stack dump failed.)\n");
708  } else {
709  int status;
710  if (waitpid(pid, &status, 0) == -1) {
711  full_cerr_write("(Failed to wait on stack dump output.)\n");
712  }
713  if (status) {
714  full_cerr_write("(GDB stack trace failed unexpectedly)\n");
715  }
716  }
717  }
718 
719  int cmssw_stacktrace(void* /*arg*/) {
720  set_default_signals();
721 
723  // NOTE: this is NOT async-signal-safe at CERN's lxplus service.
724  // CERN uses LD_PRELOAD to replace execv with a function from libsnoopy which
725  // calls dlsym.
726 #ifdef __linux__
727  syscall(SYS_execve, "/bin/sh", argv, __environ);
728 #else
729  execv("/bin/sh", argv);
730 #endif
731  ::abort();
732  return 1;
733  }
734 
735  static char pstackName[] = "(CMSSW stack trace helper)";
736  static char dashC[] = "-c";
739  int InitRootHandlers::parentToChild_[2] = {-1, -1};
740  int InitRootHandlers::childToParent_[2] = {-1, -1};
741  std::unique_ptr<std::thread> InitRootHandlers::helperThread_;
743  std::vector<std::array<char, moduleBufferSize>> InitRootHandlers::moduleListBuffers_;
744  std::atomic<std::size_t> InitRootHandlers::nextModule_(0), InitRootHandlers::doneModules_(0);
746 
748  : RootHandlers(),
749  unloadSigHandler_(pset.getUntrackedParameter<bool>("UnloadRootSigHandler")),
750  resetErrHandler_(pset.getUntrackedParameter<bool>("ResetRootErrHandler")),
751  loadAllDictionaries_(pset.getUntrackedParameter<bool>("LoadAllDictionaries")),
752  autoLibraryLoader_(loadAllDictionaries_ or pset.getUntrackedParameter<bool>("AutoLibraryLoader")) {
753  stackTracePause_ = pset.getUntrackedParameter<int>("StackTracePauseTime");
754 
755  if (unloadSigHandler_) {
756  // Deactivate all the Root signal handlers and restore the system defaults
757  gSystem->ResetSignal(kSigChild);
758  gSystem->ResetSignal(kSigBus);
759  gSystem->ResetSignal(kSigSegmentationViolation);
760  gSystem->ResetSignal(kSigIllegalInstruction);
761  gSystem->ResetSignal(kSigSystem);
762  gSystem->ResetSignal(kSigPipe);
763  gSystem->ResetSignal(kSigAlarm);
764  gSystem->ResetSignal(kSigUrgent);
765  gSystem->ResetSignal(kSigFloatingException);
766  gSystem->ResetSignal(kSigWindowChanged);
767  } else if (pset.getUntrackedParameter<bool>("AbortOnSignal")) {
768  cachePidInfo();
769 
770  //NOTE: ROOT can also be told to abort on these kinds of problems BUT
771  // it requires an TApplication to be instantiated which causes problems
772  gSystem->ResetSignal(kSigBus);
773  gSystem->ResetSignal(kSigSegmentationViolation);
774  gSystem->ResetSignal(kSigIllegalInstruction);
775  installCustomHandler(SIGBUS, sig_dostack_then_abort);
776  sigBusHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGBUS, sig_abort); });
777  installCustomHandler(SIGSEGV, sig_dostack_then_abort);
778  sigSegvHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGSEGV, sig_abort); });
779  installCustomHandler(SIGILL, sig_dostack_then_abort);
780  sigIllHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGILL, sig_abort); });
781  installCustomHandler(SIGTERM, sig_dostack_then_abort);
782  sigTermHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGTERM, sig_abort); });
783  installCustomHandler(SIGABRT, sig_dostack_then_abort);
784  sigAbrtHandler_ = std::shared_ptr<const void>(nullptr, [](void*) {
785  signal(SIGABRT, SIG_DFL); // release SIGABRT to default
786  });
787  }
788 
789  iReg.watchPreallocate([](edm::service::SystemBounds const& iBounds) {
790  if (iBounds.maxNumberOfThreads() > moduleListBuffers_.size()) {
791  moduleListBuffers_.resize(iBounds.maxNumberOfThreads());
792  }
793  });
794 
795  if (resetErrHandler_) {
796  // Replace the Root error handler with one that uses the MessageLogger
797  SetErrorHandler(RootErrorHandler);
798  }
799 
800  // Enable automatic Root library loading.
801  if (autoLibraryLoader_) {
802  gInterpreter->SetClassAutoloading(1);
803  }
804 
805  // Set ROOT parameters.
806  TTree::SetMaxTreeSize(kMaxLong64);
807  TH1::AddDirectory(kFALSE);
808  //G__SetCatchException(0);
809 
810  // Set custom streamers
812 
813  // Load the library containing dictionaries for std:: classes, if not already loaded.
814  if (!hasDictionary(typeid(std::vector<std::vector<unsigned int>>))) {
815  TypeWithDict::byName("std::vector<std::vector<unsigned int> >");
816  }
817 
818  int debugLevel = pset.getUntrackedParameter<int>("DebugLevel");
819  if (debugLevel > 0) {
820  gDebug = debugLevel;
821  }
822 
823  // Enable Root implicit multi-threading
824  bool imt = pset.getUntrackedParameter<bool>("EnableIMT");
825  if (imt && not ROOT::IsImplicitMTEnabled()) {
826  ROOT::EnableImplicitMT();
827  }
828  }
829 
831  // close all open ROOT files
832  TIter iter(gROOT->GetListOfFiles());
833  TObject* obj = nullptr;
834  while (nullptr != (obj = iter.Next())) {
835  TFile* f = dynamic_cast<TFile*>(obj);
836  if (f) {
837  // We get a new iterator each time,
838  // because closing a file can invalidate the iterator
839  f->Close();
840  iter = TIter(gROOT->GetListOfFiles());
841  }
842  }
843  }
844 
846  //Tell Root we want to be multi-threaded
847  ROOT::EnableThreadSafety();
848 
849  //When threading, also have to keep ROOT from logging all TObjects into a list
850  TObject::SetObjectStat(false);
851 
852  //Have to avoid having Streamers modify themselves after they have been used
853  TVirtualStreamerInfo::Optimize(false);
854  }
855 
858  desc.setComment("Centralized interface to ROOT.");
859  desc.addUntracked<bool>("UnloadRootSigHandler", false)
860  ->setComment("If True, signals are handled by this service, rather than by ROOT.");
861  desc.addUntracked<bool>("ResetRootErrHandler", true)
862  ->setComment(
863  "If True, ROOT messages (e.g. errors, warnings) are handled by this service, rather than by ROOT.");
864  desc.addUntracked<bool>("AutoLibraryLoader", true)
865  ->setComment("If True, enables automatic loading of data dictionaries.");
866  desc.addUntracked<bool>("LoadAllDictionaries", false)->setComment("If True, loads all ROOT dictionaries.");
867  desc.addUntracked<bool>("EnableIMT", true)->setComment("If True, calls ROOT::EnableImplicitMT().");
868  desc.addUntracked<bool>("AbortOnSignal", true)
869  ->setComment(
870  "If True, do an abort when a signal occurs that causes a crash. If False, ROOT will do an exit which "
871  "attempts to do a clean shutdown.");
872  desc.addUntracked<int>("DebugLevel", 0)->setComment("Sets ROOT's gDebug value.");
873  desc.addUntracked<int>("StackTracePauseTime", 300)
874  ->setComment("Seconds to pause other threads during stack trace.");
875  descriptions.add("InitRootHandlers", desc);
876  }
877 
878  char* const* InitRootHandlers::getPstackArgv() { return pstackArgv_; }
879 
881 
883 
885  if (helperThread_) {
886  //Another InitRootHandlers was initialized in this job, possibly
887  // because multiple EventProcessors are being used.
888  //In that case, we are already all setup
889  return;
890  }
891  if (snprintf(pidString_,
892  pidStringLength_ - 1,
893  "date; gdb -quiet -p %d 2>&1 <<EOF |\n"
894  "set width 0\n"
895  "set height 0\n"
896  "set pagination no\n"
897  "thread apply all bt\n"
898  "EOF\n"
899  "/bin/sed -n -e 's/^\\((gdb) \\)*//' -e '/^#/p' -e '/^Thread/p'",
900  getpid()) >= pidStringLength_) {
901  std::ostringstream sstr;
902  sstr << "Unable to pre-allocate stacktrace handler information";
903  edm::Exception except(edm::errors::OtherCMS, sstr.str());
904  throw except;
905  }
906 
907  // These are initialized to -1; harmless to close an invalid FD.
908  // If this is called post-fork, we don't want to be communicating on
909  // these FDs as they are used internally by the parent.
910  close(childToParent_[0]);
911  close(childToParent_[1]);
912  childToParent_[0] = -1;
913  childToParent_[1] = -1;
914  close(parentToChild_[0]);
915  close(parentToChild_[1]);
916  parentToChild_[0] = -1;
917  parentToChild_[1] = -1;
918 
919  if (-1 == pipe2(childToParent_, O_CLOEXEC)) {
920  std::ostringstream sstr;
921  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
922  edm::Exception except(edm::errors::OtherCMS, sstr.str());
923  throw except;
924  }
925 
926  if (-1 == pipe2(parentToChild_, O_CLOEXEC)) {
927  close(childToParent_[0]);
928  close(childToParent_[1]);
929  childToParent_[0] = -1;
930  childToParent_[1] = -1;
931  std::ostringstream sstr;
932  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
933  edm::Exception except(edm::errors::OtherCMS, sstr.str());
934  throw except;
935  }
936 
937  helperThread_.reset(new std::thread(stacktraceHelperThread));
938  helperThread_->detach();
939  }
940 
941  } // end of namespace service
942 } // end of namespace edm
943 
size
Write out results.
unsigned int maxNumberOfThreads() const
Definition: SystemBounds.h:38
T getUntrackedParameter(std::string const &, T const &) const
edm::serviceregistry::AllArgsMaker< edm::RootHandlers, InitRootHandlers > RootHandlersMaker
static void cmssw_stacktrace_fork()
double seconds()
void watchPreallocate(Preallocate::slot_type const &iSlot)
void setRefCoreStreamerInTClass()
static char *const pstackArgv_[]
ParameterDescriptionBase * addUntracked(U const &iLabel, T const &value)
std::vector< T >::const_iterator search(const cond::Time_t &val, const std::vector< T > &container)
Definition: IOVProxy.cc:314
bool isProcessWideService(TFileService const *)
Definition: TFileService.h:99
#define DEFINE_FWK_SERVICE_MAKER(concrete, maker)
Definition: ServiceMaker.h:109
static ModuleCallingContext const * getCurrentModuleOnThread()
std::vector< Variable::Flags > flags
Definition: MVATrainer.cc:135
void installCustomHandler(int signum, CFUNC func)
std::shared_ptr< const void > sigSegvHandler_
void ignoreWarnings_(edm::RootHandlers::SeverityLevel level) override
std::shared_ptr< const void > sigBusHandler_
static TypeWithDict byName(std::string const &name)
Definition: TypeWithDict.cc:74
static std::atomic< std::size_t > doneModules_
static const ThreadTracker::Container_type & threadIDs()
std::shared_ptr< const void > sigAbrtHandler_
void setComment(std::string const &value)
std::string moduleName(Provenance const &provenance)
Definition: Provenance.cc:27
static std::atomic< std::size_t > nextModule_
static char pidString_[pidStringLength_]
static ThreadTracker threadTracker_
static char *const * getPstackArgv()
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventID const &, edm::Timestamp const & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
Definition: Activities.doc:12
std::shared_ptr< const void > sigIllHandler_
std::shared_ptr< const void > sigTermHandler_
void addAdditionalInfo(std::string const &info)
Definition: Exception.cc:169
double f[11][100]
int cmssw_stacktrace(void *)
static std::unique_ptr< std::thread > helperThread_
static std::vector< std::array< char, moduleBufferSize > > moduleListBuffers_
static char pstackName[]
InitRootHandlers(ParameterSet const &pset, ActivityRegistry &iReg)
tbb::concurrent_unordered_set< pthread_t > Container_type
void add(std::string const &label, ParameterSetDescription const &psetDescription)
TEveGeoShape * clone(const TEveElement *element, TEveElement *parent)
Definition: eve_macros.cc:135
Definition: TBBSession.h:68
static char dashC[]
HLT enums.
char data[epos_bytes_allocation]
Definition: EPOS_Wrapper.h:82
static void fillDescriptions(ConfigurationDescriptions &descriptions)
bool hasDictionary(std::type_info const &)
#define O_NONBLOCK
Definition: SysFile.h:21
def write(self, setup)
#define constexpr