CMS 3D CMS Logo

InitRootHandlers.cc
Go to the documentation of this file.
2 
4 
20 
21 #include "tbb/task.h"
22 #include "tbb/task_scheduler_observer.h"
23 #include "tbb/concurrent_unordered_set.h"
24 #include <thread>
25 #include <sys/wait.h>
26 #include <sstream>
27 #include <cstring>
28 #include <poll.h>
29 #include <atomic>
30 
31 // WORKAROUND: At CERN, execv is replaced with a non-async-signal safe
32 // version. This can break our stack trace printer. Avoid this by
33 // invoking the syscall directly.
34 #ifdef __linux__
35 #include <syscall.h>
36 #endif
37 
38 #include "TROOT.h"
39 #include "TError.h"
40 #include "TFile.h"
41 #include "TInterpreter.h"
42 #include "TH1.h"
43 #include "TSystem.h"
44 #include "TUnixSystem.h"
45 #include "TTree.h"
46 #include "TVirtualStreamerInfo.h"
47 
48 #include "TClassTable.h"
49 
50 #include <memory>
51 
namespace {
  // Capacity, in chars, of each statically allocated buffer that holds one
  // "Module: ..." line printed after a stacktrace abort.
  constexpr std::size_t moduleBufferSize{128};
}
57 
58 namespace edm {
60  class ParameterSet;
61  class ActivityRegistry;
62 
63  namespace service {
64  class InitRootHandlers : public RootHandlers {
65 
66  friend int cmssw_stacktrace(void *);
67 
68  public:
69  class ThreadTracker : public tbb::task_scheduler_observer {
70  public:
71  typedef tbb::concurrent_unordered_set<pthread_t> Container_type;
72 
73  ThreadTracker() : tbb::task_scheduler_observer() {
74  observe(true);
75  }
76  void on_scheduler_entry(bool) override {
77  // ensure thread local has been allocated; not necessary on Linux with
78  // the current cmsRun linkage, but could be an issue if the platform
79  // or linkage leads to "lazy" allocation of the thread local. By
80  // referencing it here we make sure it has been allocated and can be
81  // accessed safely from our signal handler.
83  threadIDs_.insert(pthread_self());
84  }
85  const Container_type& IDs() { return threadIDs_; }
86 
87  private:
88  Container_type threadIDs_;
89  };
90 
91  explicit InitRootHandlers(ParameterSet const& pset, ActivityRegistry& iReg);
92  ~InitRootHandlers() override;
93 
94  static void fillDescriptions(ConfigurationDescriptions& descriptions);
95  static void stacktraceFromThread();
97  static int stackTracePause() { return stackTracePause_; }
98 
99  static std::vector<std::array<char,moduleBufferSize>> moduleListBuffers_;
100  static std::atomic<std::size_t> nextModule_, doneModules_;
101  private:
102  static char *const *getPstackArgv();
103  void enableWarnings_() override;
104  void ignoreWarnings_() override;
105  void willBeUsingThreads() override;
106 
107  void cachePidInfo();
108  static void stacktraceHelperThread();
109 
110  static const int pidStringLength_ = 200;
112  static char * const pstackArgv_[];
113  static int parentToChild_[2];
114  static int childToParent_[2];
115  static std::unique_ptr<std::thread> helperThread_;
117  static int stackTracePause_;
118 
123  std::shared_ptr<const void> sigBusHandler_;
124  std::shared_ptr<const void> sigSegvHandler_;
125  std::shared_ptr<const void> sigIllHandler_;
126  std::shared_ptr<const void> sigTermHandler_;
127  std::shared_ptr<const void> sigAbrtHandler_;
128  };
129 
130  inline
132  return true;
133  }
134 
135  } // end of namespace service
136 } // end of namespace edm
137 
138 namespace edm {
139  namespace service {
140  int cmssw_stacktrace(void *);
141  }
142 }
143 
144 namespace {
// Severity ranks used when forwarding ROOT diagnostics to the MessageLogger,
// listed from least (kInfo) to most (kFatal) severe.
enum class SeverityLevel { kInfo, kWarning, kError, kSysError, kFatal };
152 
153  thread_local bool s_ignoreWarnings = false;
154 
155  bool s_ignoreEverything = false;
156 
157  void RootErrorHandlerImpl(int level, char const* location, char const* message) {
158 
159  bool die = false;
160 
161  // Translate ROOT severity level to MessageLogger severity level
162 
163  SeverityLevel el_severity = SeverityLevel::kInfo;
164 
165  if (level >= kFatal) {
166  el_severity = SeverityLevel::kFatal;
167  } else if (level >= kSysError) {
168  el_severity = SeverityLevel::kSysError;
169  } else if (level >= kError) {
170  el_severity = SeverityLevel::kError;
171  } else if (level >= kWarning) {
172  el_severity = s_ignoreWarnings ? SeverityLevel::kInfo : SeverityLevel::kWarning;
173  }
174 
175  if(s_ignoreEverything) {
176  el_severity = SeverityLevel::kInfo;
177  }
178 
179  // Adapt C-strings to std::strings
180  // Arrange to report the error location as furnished by Root
181 
182  std::string el_location = "@SUB=?";
183  if (location != nullptr) el_location = std::string("@SUB=")+std::string(location);
184 
185  std::string el_message = "?";
186  if (message != nullptr) el_message = message;
187 
188  // Try to create a meaningful id string using knowledge of ROOT error messages
189  //
190  // id == "ROOT-ClassName" where ClassName is the affected class
191  // else "ROOT/ClassName" where ClassName is the error-declaring class
192  // else "ROOT"
193 
194  std::string el_identifier = "ROOT";
195 
196  std::string precursor("class ");
197  size_t index1 = el_message.find(precursor);
198  if (index1 != std::string::npos) {
199  size_t index2 = index1 + precursor.length();
200  size_t index3 = el_message.find_first_of(" :", index2);
201  if (index3 != std::string::npos) {
202  size_t substrlen = index3-index2;
203  el_identifier += "-";
204  el_identifier += el_message.substr(index2,substrlen);
205  }
206  } else {
207  index1 = el_location.find("::");
208  if (index1 != std::string::npos) {
209  el_identifier += "/";
210  el_identifier += el_location.substr(0, index1);
211  }
212  }
213 
214  // Intercept some messages and upgrade the severity
215 
216  if ((el_location.find("TBranchElement::Fill") != std::string::npos)
217  && (el_message.find("fill branch") != std::string::npos)
218  && (el_message.find("address") != std::string::npos)
219  && (el_message.find("not set") != std::string::npos)) {
220  el_severity = SeverityLevel::kFatal;
221  }
222 
223  if ((el_message.find("Tree branches") != std::string::npos)
224  && (el_message.find("different numbers of entries") != std::string::npos)) {
225  el_severity = SeverityLevel::kFatal;
226  }
227 
228 
229  // Intercept some messages and downgrade the severity
230 
231  if ((el_message.find("no dictionary for class") != std::string::npos) ||
232  (el_message.find("already in TClassTable") != std::string::npos) ||
233  (el_message.find("matrix not positive definite") != std::string::npos) ||
234  (el_message.find("not a TStreamerInfo object") != std::string::npos) ||
235  (el_message.find("Problems declaring payload") != std::string::npos) ||
236  (el_message.find("Announced number of args different from the real number of argument passed") != std::string::npos) || // Always printed if gDebug>0 - regardless of whether warning message is real.
237  (el_location.find("Fit") != std::string::npos) ||
238  (el_location.find("TDecompChol::Solve") != std::string::npos) ||
239  (el_location.find("THistPainter::PaintInit") != std::string::npos) ||
240  (el_location.find("TUnixSystem::SetDisplay") != std::string::npos) ||
241  (el_location.find("TGClient::GetFontByName") != std::string::npos) ||
242  (el_location.find("Inverter::Dinv") != std::string::npos) ||
243  (el_message.find("nbins is <=0 - set to nbins = 1") != std::string::npos) ||
244  (el_message.find("nbinsy is <=0 - set to nbinsy = 1") != std::string::npos) ||
245  (level < kError and
246  (el_location.find("CINTTypedefBuilder::Setup")!= std::string::npos) and
247  (el_message.find("possible entries are in use!") != std::string::npos))) {
248  el_severity = SeverityLevel::kInfo;
249  }
250 
251  // These are a special case because we do not want them to
252  // be fatal, but we do want an error to print.
253  bool alreadyPrinted = false;
254  if ((el_message.find("number of iterations was insufficient") != std::string::npos) ||
255  (el_message.find("bad integrand behavior") != std::string::npos) ||
256  (el_message.find("integral is divergent, or slowly convergent") != std::string::npos)) {
257  el_severity = SeverityLevel::kInfo;
258  edm::LogError("Root_Error") << el_location << el_message;
259  alreadyPrinted = true;
260  }
261 
262  if (el_severity == SeverityLevel::kInfo) {
263  // Don't throw if the message is just informational.
264  die = false;
265  } else {
266  die = true;
267  }
268 
269  // Feed the message to the MessageLogger and let it choose to suppress or not.
270 
271  // Root has declared a fatal error. Throw an EDMException unless the
272  // message corresponds to a pending signal. In that case, do not throw
273  // but let the OS deal with the signal in the usual way.
274  if (die && (el_location != std::string("@SUB=TUnixSystem::DispatchSignals"))) {
275  std::ostringstream sstr;
276  sstr << "Fatal Root Error: " << el_location << "\n" << el_message << '\n';
277  edm::Exception except(edm::errors::FatalRootError, sstr.str());
278  except.addAdditionalInfo(except.message());
279  except.clearMessage();
280  throw except;
281 
282  }
283 
284  // Typically, we get here only for informational messages,
285  // but we leave the other code in just in case we change
286  // the criteria for throwing.
287  if (!alreadyPrinted) {
288  if (el_severity == SeverityLevel::kFatal) {
289  edm::LogError("Root_Fatal") << el_location << el_message;
290  } else if (el_severity == SeverityLevel::kSysError) {
291  edm::LogError("Root_Severe") << el_location << el_message;
292  } else if (el_severity == SeverityLevel::kError) {
293  edm::LogError("Root_Error") << el_location << el_message;
294  } else if (el_severity == SeverityLevel::kWarning) {
295  edm::LogWarning("Root_Warning") << el_location << el_message ;
296  } else if (el_severity == SeverityLevel::kInfo) {
297  edm::LogInfo("Root_Information") << el_location << el_message ;
298  }
299  }
300  }
301 
302  void RootErrorHandler(int level, bool, char const* location, char const* message) {
303  RootErrorHandlerImpl(level, location, message);
304  }
305 
306  extern "C" {
// Put the five signals this file installs handlers for back to SIG_DFL.
void set_default_signals() {
  const int handled[] = {SIGILL, SIGSEGV, SIGBUS, SIGTERM, SIGABRT};
  for (const int signum : handled) {
    signal(signum, SIG_DFL);
  }
}
314 
// Write the whole NUL-terminated string `text` to `fd`, retrying after
// partial writes and EINTR. Only uses calls that are safe in a signal handler.
//
// Returns 0 on success, or a negated errno value on failure:
//   -EINVAL  text is null
//   -EIO     write() reported zero bytes written (no progress possible)
//   -errno   any other write() failure
static int full_write(int fd, const char *text)
{
  if (text == nullptr) { return -EINVAL; }

  const char *cursor = text;
  size_t remaining = strlen(text);
  while (remaining > 0)
  {
    const ssize_t written = write(fd, cursor, remaining);
    if (written == -1)
    {
      if (errno == EINTR) { continue; }
      return -errno;
    }
    if (written == 0)
    {
      // No progress was made; bail out instead of spinning forever.
      return -EIO;
    }
    remaining -= static_cast<size_t>(written);
    cursor += written;
  }
  return 0;
}
333 
// Read exactly `len` bytes from `fd` into `inbuf`.
//
//   timeout_s < 0  : no deadline; the descriptor's flags are left untouched.
//   timeout_s >= 0 : overall deadline in seconds. The descriptor is switched
//                    to non-blocking mode for the duration of the call and its
//                    original flags are restored before returning.
//
// Returns 0 on success, or a negated errno value on failure:
//   -ETIMEDOUT  the deadline expired before `len` bytes arrived
//   -EPIPE      end-of-file was reached before `len` bytes arrived
//   -errno      fcntl()/poll()/read() failure
static int full_read(int fd, char *inbuf, size_t len, int timeout_s=-1)
{
  const bool timed = (timeout_s >= 0);
  const std::chrono::time_point<std::chrono::steady_clock> deadline =
      std::chrono::steady_clock::now() + std::chrono::seconds(timed ? timeout_s : 0);

  // Switch to non-blocking mode only when a deadline is in force.
  int saved_flags = 0;
  bool restore_flags = false;
  if (timed)
  {
    saved_flags = fcntl(fd, F_GETFL);
    if (saved_flags == -1) { return -errno; }
    if ((saved_flags & O_NONBLOCK) != O_NONBLOCK)
    {
      if (-1 == fcntl(fd, F_SETFL, saved_flags | O_NONBLOCK)) { return -errno; }
      restore_flags = true;
    }
  }

  // Single exit path: restore the descriptor flags (if we changed them) and
  // hand back the result code. The argument is evaluated before fcntl() runs,
  // so passing -errno here is not affected by the restore call.
  auto finish = [&](int rc) {
    if (restore_flags) { fcntl(fd, F_SETFL, saved_flags); }
    return rc;
  };

  // poll() takes an int timeout; clamp so that huge deadlines cannot wrap.
  constexpr long long kMaxPollMs = 0x7fffffff;

  char *cursor = inbuf;
  size_t remaining = len;
  while (remaining > 0)
  {
    if (timed)
    {
      const long long ms_left = std::chrono::duration_cast<std::chrono::milliseconds>(
          deadline - std::chrono::steady_clock::now()).count();
      if (ms_left < 0) { return finish(-ETIMEDOUT); }

      struct pollfd poll_info{fd, POLLIN, 0};
      const bool clamped = (ms_left > kMaxPollMs);
      const int wait_ms = static_cast<int>(clamped ? kMaxPollMs : ms_left);
      const int rc = poll(&poll_info, 1, wait_ms);
      if (rc < 0)
      {
        if (errno == EINTR || errno == EAGAIN) { continue; }
        return finish(-errno);
      }
      if (rc == 0)
      {
        // Only a clamped wait can expire before the real deadline.
        if (clamped) { continue; }
        return finish(-ETIMEDOUT);
      }
    }

    const ssize_t got = read(fd, cursor, remaining);
    if (got == -1)
    {
      if ((errno == EINTR) || (errno == EAGAIN) || (errno == EWOULDBLOCK)) { continue; }
      return finish(-errno);
    }
    if (got == 0)
    {
      // End-of-file: the peer closed its end, so the request can never be
      // satisfied. Without this check the loop would spin forever.
      return finish(-EPIPE);
    }
    remaining -= static_cast<size_t>(got);
    cursor += got;
  }
  return finish(0);
}
412 
413  static int full_cerr_write(const char *text)
414  {
415  return full_write(2, text);
416  }
417 
418 // these signals are only used inside the stacktrace signal handler,
419 // so common signals can be used. They do have to be different, since
420 // we do not set SA_NODEFER, and RESUME must be a signal that will
421 // cause sleep() to return early.
422 #if defined(SIGRTMAX)
423 #define PAUSE_SIGNAL SIGRTMAX
424 #define RESUME_SIGNAL SIGRTMAX-1
425 #elif defined(SIGINFO) // macOS/BSD
426 #define PAUSE_SIGNAL SIGINFO
427 #define RESUME_SIGNAL SIGALRM
428 #endif
429 
// Intentionally empty: this handler exists only so that delivery of the
// resume signal interrupts the sleep() in the pause handler.
void sig_resume_handler(int /*sig*/, siginfo_t* /*info*/, void* /*context*/) {
}
432 
433  // pause a thread so that a (slow) stacktrace will capture the current state
434  void sig_pause_for_stacktrace(int sig, siginfo_t*, void*) {
435  using namespace edm::service;
436 
437 #ifdef RESUME_SIGNAL
438  sigset_t sigset;
439  sigemptyset(&sigset);
440  sigaddset(&sigset, RESUME_SIGNAL);
441  pthread_sigmask(SIG_UNBLOCK, &sigset, nullptr);
442 #endif
443  // sleep is interrupted by a handled delivery of the resume signal
445 
446  if (InitRootHandlers::doneModules_.is_lock_free() && InitRootHandlers::nextModule_.is_lock_free()) {
449  char* buff = InitRootHandlers::moduleListBuffers_[i].data();
450 
451  strlcpy(buff, "\nModule: ", moduleBufferSize);
453  strlcat(buff, edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(), moduleBufferSize);
454  strlcat(buff, ":", moduleBufferSize);
455  strlcat(buff, edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(), moduleBufferSize);
456  } else {
457  strlcat(buff, "none", moduleBufferSize);
458  }
460  }
461  }
462  }
463 
464  void sig_dostack_then_abort(int sig, siginfo_t*, void*) {
465  using namespace edm::service;
466 
467  const auto& tids = InitRootHandlers::threadIDs();
468 
469  const auto self = pthread_self();
470 #ifdef PAUSE_SIGNAL
471  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
472  // install the "pause" handler
473  struct sigaction act;
474  act.sa_sigaction = sig_pause_for_stacktrace;
475  act.sa_flags = 0;
476  sigemptyset(&act.sa_mask);
477  sigaction(PAUSE_SIGNAL, &act, nullptr);
478 
479  // unblock pause signal globally, resume is unblocked in the pause handler
480  sigset_t pausesigset;
481  sigemptyset(&pausesigset);
482  sigaddset(&pausesigset, PAUSE_SIGNAL);
483  sigprocmask(SIG_UNBLOCK, &pausesigset, nullptr);
484 
485  // send a pause signal to all CMSSW/TBB threads other than self
486  for (auto id : tids) {
487  if (self != id) {
488  pthread_kill(id, PAUSE_SIGNAL);
489  }
490  }
491 
492 #ifdef RESUME_SIGNAL
493  // install the "resume" handler
494  act.sa_sigaction = sig_resume_handler;
495  sigaction(RESUME_SIGNAL, &act, nullptr);
496 #endif
497  }
498 #endif
499 
500  const char* signalname = "unknown";
501  switch (sig) {
502  case SIGBUS:
503  {
504  signalname = "bus error";
505  break;
506  }
507  case SIGSEGV:
508  {
509  signalname = "segmentation violation";
510  break;
511  }
512  case SIGILL:
513  {
514  signalname = "illegal instruction";
515  break;
516  }
517  case SIGTERM:
518  {
519  signalname = "external termination request";
520  break;
521  }
522  case SIGABRT:
523  {
524  signalname = "abort signal";
525  break;
526  }
527  default:
528  break;
529  }
530  full_cerr_write("\n\nA fatal system signal has occurred: ");
531  full_cerr_write(signalname);
532  full_cerr_write("\nThe following is the call stack containing the origin of the signal.\n\n");
533 
535 
536  // resume the signal handlers to store the current module; we are not guaranteed they
537  // will have time to store their modules, so there is a race condition; this could be
538  // avoided by storing the module information before sleeping, a change that may be
539  // made when we're convinced accessing the thread-local current module is safe.
540 #ifdef RESUME_SIGNAL
541  std::size_t notified = 0;
542  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
543  for (auto id : tids) {
544  if (self != id) {
545  if (pthread_kill(id, RESUME_SIGNAL) == 0) ++notified;
546  }
547  }
548  }
549 #endif
550 
551  full_cerr_write("\nCurrent Modules:\n");
552 
553  // Checking tids.count(self) ensures that we only try to access the current module in
554  // CMSSW/TBB threads. Those threads access the thread-local current module at the same
555  // time the thread is registered, so any lazy allocation will have been done at that
556  // point. Not necessary on Linux with the current cmsRun linkage, as the thread-local
557  // is allocated at exec time, not lazily.
558  if (tids.count(self) > 0) {
559  char buff[moduleBufferSize] = "\nModule: ";
561  strlcat(buff, edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(), moduleBufferSize);
562  strlcat(buff, ":", moduleBufferSize);
563  strlcat(buff, edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(), moduleBufferSize);
564  } else {
565  strlcat(buff, "none", moduleBufferSize);
566  }
567  strlcat(buff, " (crashed)", moduleBufferSize);
568  full_cerr_write(buff);
569  } else {
570  full_cerr_write("\nModule: non-CMSSW (crashed)");
571  }
572 
573 #ifdef PAUSE_SIGNAL
574  // wait a short interval for the paused threads to resume and fill in their module
575  // information, then print
576  if (InitRootHandlers::doneModules_.is_lock_free()) {
577  int spincount = 0;
578  timespec t = { 0, 1000 };
579  while (++spincount < 1000 && InitRootHandlers::doneModules_ < notified) { nanosleep(&t, nullptr); }
580  for (std::size_t i = 0; i < InitRootHandlers::doneModules_; ++i) {
581  full_cerr_write(InitRootHandlers::moduleListBuffers_[i].data());
582  }
583  }
584 #endif
585 
586  full_cerr_write("\n\nA fatal system signal has occurred: ");
587  full_cerr_write(signalname);
588  full_cerr_write("\n");
589 
590  // For these five known cases, re-raise the signal to get the correct
591  // exit code.
592  if ((sig == SIGILL) || (sig == SIGSEGV) || (sig == SIGBUS) || (sig == SIGTERM) || (sig == SIGABRT))
593  {
594  signal(sig, SIG_DFL);
595  raise(sig);
596  }
597  else
598  {
599  set_default_signals();
600  ::abort();
601  }
602  }
603 
604  void sig_abort(int sig, siginfo_t*, void*) {
605  full_cerr_write("\n\nFatal system signal has occurred during exit\n");
606 
607  // re-raise the signal to get the correct exit code
608  signal(sig, SIG_DFL);
609  raise(sig);
610 
611  // shouldn't get here
612  set_default_signals();
613  ::sleep(10);
614  ::abort();
615  }
616  }
617 } // end of unnamed namespace
618 
619 namespace edm {
620  namespace service {
621 
622  /*
623  * We've run into issues where GDB fails to print the thread which calls clone().
624  * To avoid this problem, we have an alternate approach below where the signal handler
625  * only reads/writes to a dedicated thread via pipes. The helper thread does the clone()
626  * invocation; we don't care if that thread is missing from the traceback in this case.
627  */
628  static void cmssw_stacktrace_fork();
629 
631  {
632  int toParent = childToParent_[1];
633  int fromParent = parentToChild_[0];
634  char buf[2]; buf[1] = '\0';
635 
636  while(true)
637  {
638  int result = full_read(fromParent, buf, 1);
639  if (result < 0)
640  {
641  // To avoid a deadlock (this function is NOT re-entrant), reset signals
642  // We never set them back to the CMSSW handler because we assume the parent
643  // thread will abort for us.
644  set_default_signals();
645  close(toParent);
646  full_cerr_write("\n\nTraceback helper thread failed to read from parent: ");
647  full_cerr_write(strerror(-result));
648  full_cerr_write("\n");
649  ::abort();
650  }
651  if (buf[0] == '1')
652  {
653  set_default_signals();
655  full_write(toParent, buf);
656  }
657  else if (buf[0] == '2')
658  {
659  // We have just finished forking. Reload the file descriptors for thread
660  // communication.
661  close(toParent);
662  close(fromParent);
663  toParent = childToParent_[1];
664  fromParent = parentToChild_[0];
665  }
666  else if (buf[0] == '3')
667  {
668  break;
669  }
670  else
671  {
672  set_default_signals();
673  close(toParent);
674  full_cerr_write("\n\nTraceback helper thread got unknown command from parent: ");
675  full_cerr_write(buf);
676  full_cerr_write("\n");
677  ::abort();
678  }
679  }
680  }
681 
683  {
684  int result = full_write(parentToChild_[1], "1");
685  if (result < 0)
686  {
687  full_cerr_write("\n\nAttempt to request stacktrace failed: ");
688  full_cerr_write(strerror(-result));
689  full_cerr_write("\n");
690  return;
691  }
692  char buf[2]; buf[1] = '\0';
693  if ((result = full_read(childToParent_[0], buf, 1, 5*60)) < 0)
694  {
695  full_cerr_write("\n\nWaiting for stacktrace completion failed: ");
696  if (result == -ETIMEDOUT)
697  {
698  full_cerr_write("timed out waiting for GDB to complete.");
699  }
700  else
701  {
702  full_cerr_write(strerror(-result));
703  }
704  full_cerr_write("\n");
705  return;
706  }
707  }
708 
710  {
711  char child_stack[4*1024];
712  char *child_stack_ptr = child_stack + 4*1024;
713  // On Linux, we currently use jemalloc. This registers pthread_atfork handlers; these
714  // handlers are *not* async-signal safe. Hence, a deadlock is possible if we invoke
715  // fork() from our signal handlers. Accordingly, we use clone (not POSIX, but AS-safe)
716  // as that is closer to the 'raw metal' syscall and avoids pthread_atfork handlers.
717  int pid =
718 #ifdef __linux__
719  clone(edm::service::cmssw_stacktrace, child_stack_ptr, CLONE_VM|CLONE_FS|SIGCHLD, nullptr);
720 #else
721  fork();
722  if (child_stack_ptr) {} // Suppress 'unused variable' warning on non-Linux
723  if (pid == 0) { edm::service::cmssw_stacktrace(nullptr); }
724 #endif
725  if (pid == -1)
726  {
727  full_cerr_write("(Attempt to perform stack dump failed.)\n");
728  }
729  else
730  {
731  int status;
732  if (waitpid(pid, &status, 0) == -1)
733  {
734  full_cerr_write("(Failed to wait on stack dump output.)\n");
735  }
736  if (status)
737  {
738  full_cerr_write("(GDB stack trace failed unexpectedly)\n");
739  }
740  }
741  }
742 
743  int cmssw_stacktrace(void * /*arg*/)
744  {
745  set_default_signals();
746 
748  // NOTE: this is NOT async-signal-safe at CERN's lxplus service.
749  // CERN uses LD_PRELOAD to replace execv with a function from libsnoopy which
750  // calls dlsym.
751 #ifdef __linux__
752  syscall(SYS_execve, "/bin/sh", argv, __environ);
753 #else
754  execv("/bin/sh", argv);
755 #endif
756  ::abort();
757  return 1;
758  }
759 
760  static char pstackName[] = "(CMSSW stack trace helper)";
761  static char dashC[] = "-c";
764  int InitRootHandlers::parentToChild_[2] = {-1, -1};
765  int InitRootHandlers::childToParent_[2] = {-1, -1};
766  std::unique_ptr<std::thread> InitRootHandlers::helperThread_;
768  std::vector<std::array<char,moduleBufferSize>> InitRootHandlers::moduleListBuffers_;
769  std::atomic<std::size_t> InitRootHandlers::nextModule_(0), InitRootHandlers::doneModules_(0);
771 
772 
774  : RootHandlers(),
775  unloadSigHandler_(pset.getUntrackedParameter<bool> ("UnloadRootSigHandler")),
776  resetErrHandler_(pset.getUntrackedParameter<bool> ("ResetRootErrHandler")),
777  loadAllDictionaries_(pset.getUntrackedParameter<bool>("LoadAllDictionaries")),
778  autoLibraryLoader_(loadAllDictionaries_ or pset.getUntrackedParameter<bool> ("AutoLibraryLoader"))
779  {
780  stackTracePause_ = pset.getUntrackedParameter<int> ("StackTracePauseTime");
781 
782  if(unloadSigHandler_) {
783  // Deactivate all the Root signal handlers and restore the system defaults
784  gSystem->ResetSignal(kSigChild);
785  gSystem->ResetSignal(kSigBus);
786  gSystem->ResetSignal(kSigSegmentationViolation);
787  gSystem->ResetSignal(kSigIllegalInstruction);
788  gSystem->ResetSignal(kSigSystem);
789  gSystem->ResetSignal(kSigPipe);
790  gSystem->ResetSignal(kSigAlarm);
791  gSystem->ResetSignal(kSigUrgent);
792  gSystem->ResetSignal(kSigFloatingException);
793  gSystem->ResetSignal(kSigWindowChanged);
794  } else if(pset.getUntrackedParameter<bool>("AbortOnSignal")){
795  cachePidInfo();
796 
797  //NOTE: ROOT can also be told to abort on these kinds of problems BUT
798  // it requires an TApplication to be instantiated which causes problems
799  gSystem->ResetSignal(kSigBus);
800  gSystem->ResetSignal(kSigSegmentationViolation);
801  gSystem->ResetSignal(kSigIllegalInstruction);
802  installCustomHandler(SIGBUS,sig_dostack_then_abort);
803  sigBusHandler_ = std::shared_ptr<const void>(nullptr,[](void*) {
804  installCustomHandler(SIGBUS,sig_abort);
805  });
806  installCustomHandler(SIGSEGV,sig_dostack_then_abort);
807  sigSegvHandler_ = std::shared_ptr<const void>(nullptr,[](void*) {
808  installCustomHandler(SIGSEGV,sig_abort);
809  });
810  installCustomHandler(SIGILL,sig_dostack_then_abort);
811  sigIllHandler_ = std::shared_ptr<const void>(nullptr,[](void*) {
812  installCustomHandler(SIGILL,sig_abort);
813  });
814  installCustomHandler(SIGTERM,sig_dostack_then_abort);
815  sigTermHandler_ = std::shared_ptr<const void>(nullptr,[](void*) {
816  installCustomHandler(SIGTERM,sig_abort);
817  });
818  installCustomHandler(SIGABRT,sig_dostack_then_abort);
819  sigAbrtHandler_ = std::shared_ptr<const void>(nullptr,[](void*) {
820  signal(SIGABRT,SIG_DFL); // release SIGABRT to default
821  });
822  }
823 
824  iReg.watchPreallocate([](edm::service::SystemBounds const& iBounds){
825  if (iBounds.maxNumberOfThreads() > moduleListBuffers_.size()) {
826  moduleListBuffers_.resize(iBounds.maxNumberOfThreads());
827  }
828  });
829 
830  if(resetErrHandler_) {
831 
832  // Replace the Root error handler with one that uses the MessageLogger
833  SetErrorHandler(RootErrorHandler);
834  }
835 
836  // Enable automatic Root library loading.
837  if(autoLibraryLoader_) {
838  gInterpreter->SetClassAutoloading(1);
839  }
840 
841  // Set ROOT parameters.
842  TTree::SetMaxTreeSize(kMaxLong64);
843  TH1::AddDirectory(kFALSE);
844  //G__SetCatchException(0);
845 
846  // Set custom streamers
848 
849  // Load the library containing dictionaries for std:: classes, if not already loaded.
850  if (!hasDictionary(typeid(std::vector<std::vector<unsigned int> >))) {
851  TypeWithDict::byName("std::vector<std::vector<unsigned int> >");
852  }
853 
854  int debugLevel = pset.getUntrackedParameter<int>("DebugLevel");
855  if(debugLevel >0) {
856  gDebug = debugLevel;
857  }
858 
859  // Enable Root implicit multi-threading
860  bool imt = pset.getUntrackedParameter<bool>("EnableIMT");
861  if (imt && not ROOT::IsImplicitMTEnabled()) {
862  ROOT::EnableImplicitMT();
863  }
864  }
865 
867  // close all open ROOT files
868  TIter iter(gROOT->GetListOfFiles());
869  TObject *obj = nullptr;
870  while(nullptr != (obj = iter.Next())) {
871  TFile* f = dynamic_cast<TFile*>(obj);
872  if(f) {
873  // We get a new iterator each time,
874  // because closing a file can invalidate the iterator
875  f->Close();
876  iter = TIter(gROOT->GetListOfFiles());
877  }
878  }
879  }
880 
882  //Tell Root we want to be multi-threaded
883  ROOT::EnableThreadSafety();
884 
885  //When threading, also have to keep ROOT from logging all TObjects into a list
886  TObject::SetObjectStat(false);
887 
888  //Have to avoid having Streamers modify themselves after they have been used
889  TVirtualStreamerInfo::Optimize(false);
890  }
891 
894  desc.setComment("Centralized interface to ROOT.");
895  desc.addUntracked<bool>("UnloadRootSigHandler", false)
896  ->setComment("If True, signals are handled by this service, rather than by ROOT.");
897  desc.addUntracked<bool>("ResetRootErrHandler", true)
898  ->setComment("If True, ROOT messages (e.g. errors, warnings) are handled by this service, rather than by ROOT.");
899  desc.addUntracked<bool>("AutoLibraryLoader", true)
900  ->setComment("If True, enables automatic loading of data dictionaries.");
901  desc.addUntracked<bool>("LoadAllDictionaries",false)
902  ->setComment("If True, loads all ROOT dictionaries.");
903  desc.addUntracked<bool>("EnableIMT",true)
904  ->setComment("If True, calls ROOT::EnableImplicitMT().");
905  desc.addUntracked<bool>("AbortOnSignal",true)
906  ->setComment("If True, do an abort when a signal occurs that causes a crash. If False, ROOT will do an exit which attempts to do a clean shutdown.");
907  desc.addUntracked<int>("DebugLevel",0)
908  ->setComment("Sets ROOT's gDebug value.");
909  desc.addUntracked<int>("StackTracePauseTime", 300)
910  ->setComment("Seconds to pause other threads during stack trace.");
911  descriptions.add("InitRootHandlers", desc);
912  }
913 
914  char *const *
916  return pstackArgv_;
917  }
918 
919  void
921  s_ignoreWarnings =false;
922  }
923 
924  void
926  s_ignoreWarnings = true;
927  }
928 
929  void
931  {
932  if(helperThread_) {
933  //Another InitRootHandlers was initialized in this job, possibly
934  // because multiple EventProcessors are being used.
935  //In that case, we are already all setup
936  return;
937  }
938  if (snprintf(pidString_, pidStringLength_-1, "gdb -quiet -p %d 2>&1 <<EOF |\n"
939  "set width 0\n"
940  "set height 0\n"
941  "set pagination no\n"
942  "thread apply all bt\n"
943  "EOF\n"
944  "/bin/sed -n -e 's/^\\((gdb) \\)*//' -e '/^#/p' -e '/^Thread/p'", getpid()) >= pidStringLength_)
945  {
946  std::ostringstream sstr;
947  sstr << "Unable to pre-allocate stacktrace handler information";
948  edm::Exception except(edm::errors::OtherCMS, sstr.str());
949  throw except;
950  }
951 
952  // These are initialized to -1; harmless to close an invalid FD.
953  // If this is called post-fork, we don't want to be communicating on
954  // these FDs as they are used internally by the parent.
955  close(childToParent_[0]);
956  close(childToParent_[1]);
957  childToParent_[0] = -1; childToParent_[1] = -1;
958  close(parentToChild_[0]);
959  close(parentToChild_[1]);
960  parentToChild_[0] = -1; parentToChild_[1] = -1;
961 
962  if (-1 == pipe2(childToParent_, O_CLOEXEC))
963  {
964  std::ostringstream sstr;
965  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
966  edm::Exception except(edm::errors::OtherCMS, sstr.str());
967  throw except;
968  }
969 
970  if (-1 == pipe2(parentToChild_, O_CLOEXEC))
971  {
972  close(childToParent_[0]); close(childToParent_[1]);
973  childToParent_[0] = -1; childToParent_[1] = -1;
974  std::ostringstream sstr;
975  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
976  edm::Exception except(edm::errors::OtherCMS, sstr.str());
977  throw except;
978  }
979 
980  helperThread_.reset(new std::thread(stacktraceHelperThread));
981  helperThread_->detach();
982  }
983 
984  } // end of namespace service
985 } // end of namespace edm
986 
990 
size
Write out results.
unsigned int maxNumberOfThreads() const
Definition: SystemBounds.h:46
T getUntrackedParameter(std::string const &, T const &) const
static void cmssw_stacktrace_fork()
#define DEFINE_FWK_SERVICE_MAKER(concrete, maker)
Definition: ServiceMaker.h:117
double seconds()
void watchPreallocate(Preallocate::slot_type const &iSlot)
void setRefCoreStreamerInTClass()
static char *const pstackArgv_[]
ParameterDescriptionBase * addUntracked(U const &iLabel, T const &value)
bool isProcessWideService(TFileService const *)
Definition: TFileService.h:99
static ModuleCallingContext const * getCurrentModuleOnThread()
std::vector< Variable::Flags > flags
Definition: MVATrainer.cc:135
void installCustomHandler(int signum, CFUNC func)
std::shared_ptr< const void > sigSegvHandler_
#define constexpr
std::shared_ptr< const void > sigBusHandler_
static TypeWithDict byName(std::string const &name)
Definition: TypeWithDict.cc:59
static std::atomic< std::size_t > doneModules_
static const ThreadTracker::Container_type & threadIDs()
std::shared_ptr< const void > sigAbrtHandler_
void setComment(std::string const &value)
std::string moduleName(Provenance const &provenance)
Definition: Provenance.cc:27
static std::atomic< std::size_t > nextModule_
static char pidString_[pidStringLength_]
static ThreadTracker threadTracker_
edm::serviceregistry::AllArgsMaker< edm::RootHandlers, InitRootHandlers > RootHandlersMaker
static char *const * getPstackArgv()
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventID const &, edm::Timestamp const & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
Definition: Activities.doc:12
std::shared_ptr< const void > sigIllHandler_
std::shared_ptr< const void > sigTermHandler_
void addAdditionalInfo(std::string const &info)
Definition: Exception.cc:235
double f[11][100]
int cmssw_stacktrace(void *)
static std::unique_ptr< std::thread > helperThread_
static std::vector< std::array< char, moduleBufferSize > > moduleListBuffers_
static char pstackName[]
InitRootHandlers(ParameterSet const &pset, ActivityRegistry &iReg)
tbb::concurrent_unordered_set< pthread_t > Container_type
void add(std::string const &label, ParameterSetDescription const &psetDescription)
TEveGeoShape * clone(const TEveElement *element, TEveElement *parent)
Definition: eve_macros.cc:135
Definition: TBBSession.h:68
static char dashC[]
HLT enums.
char data[epos_bytes_allocation]
Definition: EPOS_Wrapper.h:82
static void fillDescriptions(ConfigurationDescriptions &descriptions)
SeverityLevel
bool hasDictionary(std::type_info const &)
#define O_NONBLOCK
Definition: SysFile.h:21
def write(self, setup)