test
CMS 3D CMS Logo

All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
InitRootHandlers.cc
Go to the documentation of this file.
2 
4 
19 
20 #include "tbb/task.h"
21 #include "tbb/task_scheduler_observer.h"
22 #include "tbb/concurrent_unordered_set.h"
23 #include <thread>
24 #include <sys/wait.h>
25 #include <sstream>
26 #include <string.h>
27 #include <poll.h>
28 #include <atomic>
29 
30 // WORKAROUND: At CERN, execv is replaced with a non-async-signal safe
31 // version. This can break our stack trace printer. Avoid this by
32 // invoking the syscall directly.
33 #ifdef __linux__
34 #include <syscall.h>
35 #endif
36 
37 #include "TROOT.h"
38 #include "TError.h"
39 #include "TFile.h"
40 #include "TInterpreter.h"
41 #include "TH1.h"
42 #include "TSystem.h"
43 #include "TUnixSystem.h"
44 #include "TTree.h"
45 #include "TVirtualStreamerInfo.h"
46 
47 #include "TThread.h"
48 #include "TClassTable.h"
49 
50 #include <memory>
51 
52 namespace {
53  // Size, in bytes, of each fixed-length character buffer that holds one
54  // "Module: <name>" line printed after a stacktrace abort.
55  constexpr std::size_t moduleBufferSize = 128;
56 }
57 
58 namespace edm {
60  class ParameterSet;
61  class ActivityRegistry;
62 
63  namespace service {
// Service class that configures ROOT (error handler, signal handling,
// threading) and owns the state used to print a stack trace when a fatal
// signal arrives.  All stack-trace state is static because it is reached
// from signal handlers and from a helper thread.
64  class InitRootHandlers : public RootHandlers {
65 
66  friend int cmssw_stacktrace(void *);
67 
68  public:
    // TBB scheduler observer that records the pthread id of every thread that
    // enters the TBB scheduler, so the fatal-signal handler can later send
    // pause/resume signals to those threads.
69  class ThreadTracker : public tbb::task_scheduler_observer {
70  public:
71  typedef tbb::concurrent_unordered_set<pthread_t> Container_type;
72 
    // Start observing scheduler entry immediately on construction.
73  ThreadTracker() : tbb::task_scheduler_observer() {
74  observe(true);
75  }
76  void on_scheduler_entry(bool) {
77  // ensure thread local has been allocated; not necessary on Linux with
78  // the current cmsRun linkage, but could be an issue if the platform
79  // or linkage leads to "lazy" allocation of the thread local. By
80  // referencing it here we make sure it has been allocated and can be
81  // accessed safely from our signal handler.
83  threadIDs_.insert(pthread_self());
84  }
    // Read-only view of the thread ids collected so far.
85  const Container_type& IDs() { return threadIDs_; }
86 
87  private:
89  };
90 
91  explicit InitRootHandlers(ParameterSet const& pset, ActivityRegistry& iReg);
92  virtual ~InitRootHandlers();
93 
94  static void fillDescriptions(ConfigurationDescriptions& descriptions);
95  static void stacktraceFromThread();
    // Value of the "StackTracePauseTime" configuration parameter.
97  static int stackTracePause() { return stackTracePause_; }
98 
    // One fixed-size text buffer per thread; resized to the maximum number of
    // threads in a preallocate callback registered by the constructor.
99  static std::vector<std::array<char,moduleBufferSize>> moduleListBuffers_;
    // doneModules_ is the number of buffers that are ready to be printed by the
    // fatal-signal handler.  NOTE(review): nextModule_ is presumably the next
    // free buffer index claimed by a paused thread - confirm against the
    // pause handler.
100  static std::atomic<std::size_t> nextModule_, doneModules_;
101  private:
102  static char *const *getPstackArgv();
103  virtual void enableWarnings_() override;
104  virtual void ignoreWarnings_() override;
105  virtual void willBeUsingThreads() override;
106  virtual void initializeThisThreadForUse() override;
107 
    // Adapter with a two-argument signature that simply forwards to
    // cachePidInfo(); both arguments are ignored.
108  void cachePidInfoHandler(unsigned int, unsigned int) {cachePidInfo();}
109  void cachePidInfo();
110  static void stacktraceHelperThread();
111 
    // Capacity of the buffer that receives the pre-formatted gdb command line.
112  static const int pidStringLength_ = 200;
114  static char * const pstackArgv_[];
    // Pipe pairs (created with pipe2) used to talk to the stack-trace helper
    // thread; initialised to {-1, -1}.
115  static int parentToChild_[2];
116  static int childToParent_[2];
    // Detached thread running stacktraceHelperThread().
117  static std::unique_ptr<std::thread> helperThread_;
119  static int stackTracePause_;
120 
    // Null-valued shared pointers used purely for their deleters: when each one
    // is destroyed, the corresponding signal's handler is replaced by sig_abort.
125  std::shared_ptr<const void> sigBusHandler_;
126  std::shared_ptr<const void> sigSegvHandler_;
127  std::shared_ptr<const void> sigIllHandler_;
128  std::shared_ptr<const void> sigTermHandler_;
129  };
130 
131  inline
133  return true;
134  }
135 
136  } // end of namespace service
137 } // end of namespace edm
138 
139 namespace edm {
140  namespace service {
141  int cmssw_stacktrace(void *);
142  }
143 }
144 
145 namespace {
// Severity ladder onto which ROOT message levels are mapped by
// RootErrorHandlerImpl, listed from least to most severe.
146  enum class SeverityLevel {
147  kInfo,
148  kWarning,
149  kError,
150  kSysError,
151  kFatal
152  };
153 
// Per-thread switch: while true, ROOT messages at warning level are treated
// as informational by RootErrorHandlerImpl.
154  static thread_local bool s_ignoreWarnings = false;
155 
// Process-wide switch: while true, every ROOT message starts out as
// informational before the message-specific upgrades and downgrades are applied.
// NOTE(review): nothing in the visible code sets this to true - confirm it is still needed.
156  static bool s_ignoreEverything = false;
157 
// Core of the ROOT error handler.
//
// Maps the ROOT `level` onto a SeverityLevel, adjusts the severity for a set
// of known messages/locations, and then either throws an edm::Exception
// (category FatalRootError) or forwards the text to the MessageLogger.
//
//   level    - ROOT severity level, compared against kWarning/kError/kSysError/kFatal
//   location - where ROOT says the message originated; may be null
//   message  - message text; may be null
//
// Throws edm::Exception for every message whose final severity is not kInfo,
// except when the location is TUnixSystem::DispatchSignals.
158  void RootErrorHandlerImpl(int level, char const* location, char const* message) {
159 
160  bool die = false;
161 
162  // Translate ROOT severity level to MessageLogger severity level
163 
164  SeverityLevel el_severity = SeverityLevel::kInfo;
165 
166  if (level >= kFatal) {
167  el_severity = SeverityLevel::kFatal;
168  } else if (level >= kSysError) {
169  el_severity = SeverityLevel::kSysError;
170  } else if (level >= kError) {
171  el_severity = SeverityLevel::kError;
172  } else if (level >= kWarning) {
173  el_severity = s_ignoreWarnings ? SeverityLevel::kInfo : SeverityLevel::kWarning;
174  }
175 
176  if(s_ignoreEverything) {
177  el_severity = SeverityLevel::kInfo;
178  }
179 
180  // Adapt C-strings to std::strings
181  // Arrange to report the error location as furnished by Root
182 
183  std::string el_location = "@SUB=?";
184  if (location != 0) el_location = std::string("@SUB=")+std::string(location);
185 
186  std::string el_message = "?";
187  if (message != 0) el_message = message;
188 
189  // Try to create a meaningful id string using knowledge of ROOT error messages
190  //
191  // id == "ROOT-ClassName" where ClassName is the affected class
192  // else "ROOT/ClassName" where ClassName is the error-declaring class
193  // else "ROOT"
194  //
     // NOTE(review): el_identifier is built here but not referenced again in
     // this function.
195  std::string el_identifier = "ROOT";
196 
197  std::string precursor("class ");
198  size_t index1 = el_message.find(precursor);
199  if (index1 != std::string::npos) {
200  size_t index2 = index1 + precursor.length();
201  size_t index3 = el_message.find_first_of(" :", index2);
202  if (index3 != std::string::npos) {
203  size_t substrlen = index3-index2;
204  el_identifier += "-";
205  el_identifier += el_message.substr(index2,substrlen);
206  }
207  } else {
     // el_location always starts with "@SUB=", so the extracted prefix
     // includes that marker.
208  index1 = el_location.find("::");
209  if (index1 != std::string::npos) {
210  el_identifier += "/";
211  el_identifier += el_location.substr(0, index1);
212  }
213  }
214 
215  // Intercept some messages and upgrade the severity
216 
217  if ((el_location.find("TBranchElement::Fill") != std::string::npos)
218  && (el_message.find("fill branch") != std::string::npos)
219  && (el_message.find("address") != std::string::npos)
220  && (el_message.find("not set") != std::string::npos)) {
221  el_severity = SeverityLevel::kFatal;
222  }
223 
224  if ((el_message.find("Tree branches") != std::string::npos)
225  && (el_message.find("different numbers of entries") != std::string::npos)) {
226  el_severity = SeverityLevel::kFatal;
227  }
228 
229 
230  // Intercept some messages and downgrade the severity
     // (this runs after the upgrades above, so a downgrade wins if both match)
231 
232  if ((el_message.find("no dictionary for class") != std::string::npos) ||
233  (el_message.find("already in TClassTable") != std::string::npos) ||
234  (el_message.find("matrix not positive definite") != std::string::npos) ||
235  (el_message.find("not a TStreamerInfo object") != std::string::npos) ||
236  (el_message.find("Problems declaring payload") != std::string::npos) ||
237  (el_message.find("Announced number of args different from the real number of argument passed") != std::string::npos) || // Always printed if gDebug>0 - regardless of whether warning message is real.
238  (el_location.find("Fit") != std::string::npos) ||
239  (el_location.find("TDecompChol::Solve") != std::string::npos) ||
240  (el_location.find("THistPainter::PaintInit") != std::string::npos) ||
241  (el_location.find("TUnixSystem::SetDisplay") != std::string::npos) ||
242  (el_location.find("TGClient::GetFontByName") != std::string::npos) ||
243  (el_location.find("Inverter::Dinv") != std::string::npos) ||
244  (el_message.find("nbins is <=0 - set to nbins = 1") != std::string::npos) ||
245  (el_message.find("nbinsy is <=0 - set to nbinsy = 1") != std::string::npos) ||
246  (level < kError and
247  (el_location.find("CINTTypedefBuilder::Setup")!= std::string::npos) and
248  (el_message.find("possible entries are in use!") != std::string::npos))) {
249  el_severity = SeverityLevel::kInfo;
250  }
251 
     // Anything that is still above informational severity is treated as fatal.
252  if (el_severity == SeverityLevel::kInfo) {
253  // Don't throw if the message is just informational.
254  die = false;
255  } else {
256  die = true;
257  }
258 
259  // Feed the message to the MessageLogger and let it choose to suppress or not.
260 
261  // Root has declared a fatal error. Throw an EDMException unless the
262  // message corresponds to a pending signal. In that case, do not throw
263  // but let the OS deal with the signal in the usual way.
264  if (die && (el_location != std::string("@SUB=TUnixSystem::DispatchSignals"))) {
265  std::ostringstream sstr;
266  sstr << "Fatal Root Error: " << el_location << "\n" << el_message << '\n';
267  edm::Exception except(edm::errors::FatalRootError, sstr.str());
     // Move the text from the exception's message into its additional-info
     // list, leaving the message itself empty.
268  except.addAdditionalInfo(except.message());
269  except.clearMessage();
270  throw except;
271 
272  }
273 
274  // Typically, we get here only for informational messages,
275  // but we leave the other code in just in case we change
276  // the criteria for throwing.
277  if (el_severity == SeverityLevel::kFatal) {
278  edm::LogError("Root_Fatal") << el_location << el_message;
279  } else if (el_severity == SeverityLevel::kSysError) {
280  edm::LogError("Root_Severe") << el_location << el_message;
281  } else if (el_severity == SeverityLevel::kError) {
282  edm::LogError("Root_Error") << el_location << el_message;
283  } else if (el_severity == SeverityLevel::kWarning) {
284  edm::LogWarning("Root_Warning") << el_location << el_message ;
285  } else if (el_severity == SeverityLevel::kInfo) {
286  edm::LogInfo("Root_Information") << el_location << el_message ;
287  }
288  }
289 
// Entry point handed to ROOT's SetErrorHandler().  The bool argument is
// ignored; everything else is forwarded to RootErrorHandlerImpl.
290  void RootErrorHandler(int level, bool, char const* location, char const* message) {
291  RootErrorHandlerImpl(level, location, message);
292  }
293 
294  extern "C" {
295 
// Write the whole NUL-terminated string `text` to descriptor `fd`, looping
// over partial writes and retrying when write() is interrupted (EINTR).
// Returns 0 once everything has been written, or -errno of the failing
// write() call.  `text` must not be null (it is passed to strlen).
296  static int full_write(int fd, const char *text)
297  {
298  const char *buffer = text;
299  size_t count = strlen(text);
300  ssize_t written = 0;
301  while (count)
302  {
303  written = write(fd, buffer, count);
304  if (written == -1)
305  {
306  if (errno == EINTR) {continue;}
307  else {return -errno;}
308  }
     // Partial write: advance past what was accepted and try again.
309  count -= written;
310  buffer += written;
311  }
312  return 0;
313  }
314 
/**
 * Read exactly `len` bytes from `fd` into `inbuf`.
 *
 * Short reads are continued, and EINTR / EAGAIN / EWOULDBLOCK are retried,
 * until the requested amount has arrived.
 *
 * When a timeout is requested the descriptor is temporarily switched to
 * non-blocking mode (if it is not already) and poll() is used to wait; the
 * original flags are put back on every exit path after that switch.  Without a timeout the
 * descriptor's flags are never inspected or changed.
 *
 * @param fd        descriptor to read from
 * @param inbuf     destination buffer; must hold at least `len` bytes
 * @param len       number of bytes that must be read
 * @param timeout_s overall deadline in seconds; a negative value (the
 *                  default) means "no timeout"
 * @return 0 on success;
 *         -ETIMEDOUT if the deadline passed first;
 *         -EPIPE if end-of-file was reached before `len` bytes arrived
 *         (previously this case looped forever, because read() returning 0
 *         never reduced the outstanding byte count);
 *         otherwise -errno from the failing fcntl() or read() call.
 */
static int full_read(int fd, char *inbuf, size_t len, int timeout_s=-1)
{
  char *buf = inbuf;
  size_t count = len;
  const bool use_timeout = (timeout_s >= 0);
  const std::chrono::time_point<std::chrono::steady_clock> end_time =
      std::chrono::steady_clock::now() + std::chrono::seconds(use_timeout ? timeout_s : 0);

  // Without a timeout we pretend O_NONBLOCK is already set; that way the
  // code below never tries to set or restore the descriptor flags.
  int flags = O_NONBLOCK;
  if (use_timeout && (-1 == (flags = fcntl(fd, F_GETFL))))
  {
    return -errno;
  }
  const bool restore_flags = ((flags & O_NONBLOCK) != O_NONBLOCK);
  if (restore_flags && (-1 == fcntl(fd, F_SETFL, flags | O_NONBLOCK)))
  {
    return -errno;
  }

  int result = 0;
  while (count)
  {
    if (use_timeout)
    {
      struct pollfd poll_info{fd, POLLIN, 0};
      const int ms_remaining = static_cast<int>(
          std::chrono::duration_cast<std::chrono::milliseconds>(end_time - std::chrono::steady_clock::now()).count());
      if (ms_remaining < 0)
      {
        result = -ETIMEDOUT;
        break;
      }
      // With exactly 0 ms left we skip poll() and make one last non-blocking
      // read attempt below.
      if ((ms_remaining > 0) && (poll(&poll_info, 1, ms_remaining) == 0))
      {
        result = -ETIMEDOUT;
        break;
      }
    }
    const ssize_t complete = read(fd, buf, count);
    if (complete == -1)
    {
      if ((errno == EINTR) || (errno == EAGAIN) || (errno == EWOULDBLOCK)) {continue;}
      // Capture errno before fcntl() below has a chance to overwrite it.
      result = -errno;
      break;
    }
    if (complete == 0)
    {
      // End-of-file: the writer has gone away and no more data can ever
      // arrive, so retrying would only spin.
      result = -EPIPE;
      break;
    }
    count -= complete;
    buf += complete;
  }

  if (restore_flags)
  {
    fcntl(fd, F_SETFL, flags);
  }
  return result;
}
386 
// Convenience wrapper: write `text` to file descriptor 2 (standard error)
// via full_write and return its result.
387  static int full_cerr_write(const char *text)
388  {
389  return full_write(2, text);
390  }
391 
// These signals are only used inside the stacktrace signal handler,
// so common signals can be used. They do have to be different, since
// we do not set SA_NODEFER, and RESUME must be a signal that will
// cause sleep() to return early.
//
// RESUME_SIGNAL is parenthesized so that it expands as a single value in
// any expression context (e.g. `2 * RESUME_SIGNAL`), not just when used as
// a plain function argument.
#if defined(SIGRTMAX)
#define PAUSE_SIGNAL SIGRTMAX
#define RESUME_SIGNAL (SIGRTMAX-1)
#elif defined(SIGINFO) // macOS/BSD
#define PAUSE_SIGNAL SIGINFO
#define RESUME_SIGNAL SIGALRM
#endif
403 
404  // Does nothing; it exists only so that delivery of the resume signal is
     // "handled", which interrupts the sleep() in the pause handler.
405  void sig_resume_handler(int sig, siginfo_t*, void*) {}
406 
407  // pause a thread so that a (slow) stacktrace will capture the current state
     //
     // Installed for PAUSE_SIGNAL by sig_dostack_then_abort.  After being woken,
     // the thread writes a "\nModule: <name>" (or "\nModule: none") line into one
     // of the shared moduleListBuffers_ entries for the crashing thread to print.
408  void sig_pause_for_stacktrace(int sig, siginfo_t*, void*) {
409  using namespace edm::service;
410 
411 #ifdef RESUME_SIGNAL
     // The resume signal must be deliverable while we are inside this handler,
     // so unblock it for the current thread.
412  sigset_t sigset;
413  sigemptyset(&sigset);
414  sigaddset(&sigset, RESUME_SIGNAL);
415  pthread_sigmask(SIG_UNBLOCK, &sigset, 0);
416 #endif
417  // sleep is interrupted by a handled delivery of the resume signal
419 
     // Only touch the shared atomics from a signal handler when they are
     // lock-free.
420  if (InitRootHandlers::doneModules_.is_lock_free() && InitRootHandlers::nextModule_.is_lock_free()) {
423  char* buff = InitRootHandlers::moduleListBuffers_[i].data();
424 
     // strlcpy/strlcat bound every write by moduleBufferSize.
425  strlcpy(buff, "\nModule: ", moduleBufferSize);
427  strlcat(buff, edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(), moduleBufferSize);
428  } else {
429  strlcat(buff, "none", moduleBufferSize);
430  }
432  }
433  }
434  }
435 
// Fatal-signal handler (installed by the InitRootHandlers constructor for
// SIGBUS, SIGSEGV, SIGILL and SIGTERM when "AbortOnSignal" is set).
//
// Sequence, as far as visible here:
//   1. if a pause time is configured and more than one thread is tracked,
//      install the pause handler and send PAUSE_SIGNAL to every other
//      tracked thread;
//   2. print a banner naming the signal;
//   3. send RESUME_SIGNAL to the other tracked threads so they can record
//      their current module;
//   4. print the module of the crashing thread, then the modules recorded by
//      the other threads;
//   5. restore the default disposition and re-raise the signal (or abort()
//      for any other signal number).
// All output goes through full_cerr_write, i.e. plain write(2) on stderr.
436  void sig_dostack_then_abort(int sig, siginfo_t*, void*) {
437  using namespace edm::service;
438 
439  const auto& tids = InitRootHandlers::threadIDs();
440 
441  const auto self = pthread_self();
442 #ifdef PAUSE_SIGNAL
443  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
444  // install the "pause" handler
445  struct sigaction act;
446  act.sa_sigaction = sig_pause_for_stacktrace;
447  act.sa_flags = 0;
448  sigemptyset(&act.sa_mask);
449  sigaction(PAUSE_SIGNAL, &act, NULL);
450 
451  // unblock pause signal globally, resume is unblocked in the pause handler
452  sigset_t pausesigset;
453  sigemptyset(&pausesigset);
454  sigaddset(&pausesigset, PAUSE_SIGNAL);
455  sigprocmask(SIG_UNBLOCK, &pausesigset, 0);
456 
457  // send a pause signal to all CMSSW/TBB threads other than self
458  for (auto id : tids) {
459  if (self != id) {
460  pthread_kill(id, PAUSE_SIGNAL);
461  }
462  }
463 
464 #ifdef RESUME_SIGNAL
465  // install the "resume" handler
     // (reuses `act`, so sa_flags and sa_mask are the same as for the pause handler)
466  act.sa_sigaction = sig_resume_handler;
467  sigaction(RESUME_SIGNAL, &act, NULL);
468 #endif
469  }
470 #endif
471 
     // Human-readable name for the banner; stays "unknown" for any signal
     // other than the four handled below.
472  const char* signalname = "unknown";
473  switch (sig) {
474  case SIGBUS:
475  {
476  signalname = "bus error";
477  break;
478  }
479  case SIGSEGV:
480  {
481  signalname = "segmentation violation";
482  break;
483  }
484  case SIGILL:
485  {
486  signalname = "illegal instruction";
487  break;
488  }
489  case SIGTERM:
490  {
491  signalname = "external termination request";
492  break;
493  }
494  default:
495  break;
496  }
497  full_cerr_write("\n\nA fatal system signal has occurred: ");
498  full_cerr_write(signalname);
499  full_cerr_write("\nThe following is the call stack containing the origin of the signal.\n\n");
500 
502 
503  // resume the signal handlers to store the current module; we are not guaranteed they
504  // will have time to store their modules, so there is a race condition; this could be
505  // avoided by storing the module information before sleeping, a change that may be
506  // made when we're convinced accessing the thread-local current module is safe.
507 #ifdef RESUME_SIGNAL
     // `notified` counts the threads that were successfully signalled; it is the
     // number of module entries we wait for further down.
508  std::size_t notified = 0;
509  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
510  for (auto id : tids) {
511  if (self != id) {
512  if (pthread_kill(id, RESUME_SIGNAL) == 0) ++notified;
513  }
514  }
515  }
516 #endif
517 
518  full_cerr_write("\nCurrent Modules:\n");
519 
520  // Checking tids.count(self) ensures that we only try to access the current module in
521  // CMSSW/TBB threads. Those threads access the thread-local current module at the same
522  // time the thread is registered, so any lazy allocation will have been done at that
523  // point. Not necessary on Linux with the current cmsRun linkage, as the thread-local
524  // is allocated at exec time, not lazily.
525  if (tids.count(self) > 0) {
526  char buff[moduleBufferSize] = "\nModule: ";
528  strlcat(buff, edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(), moduleBufferSize);
529  } else {
530  strlcat(buff, "none", moduleBufferSize);
531  }
532  strlcat(buff, " (crashed)", moduleBufferSize);
533  full_cerr_write(buff);
534  } else {
535  full_cerr_write("\nModule: non-CMSSW (crashed)");
536  }
537 
538 #ifdef PAUSE_SIGNAL
539  // wait a short interval for the paused threads to resume and fill in their module
540  // information, then print
541  if (InitRootHandlers::doneModules_.is_lock_free()) {
     // At most 999 sleeps of 1000 ns each are requested, so the wait is
     // bounded even if some threads never report.
542  int spincount = 0;
543  timespec t = { 0, 1000 };
544  while (++spincount < 1000 && InitRootHandlers::doneModules_ < notified) { nanosleep(&t, nullptr); }
545  for (std::size_t i = 0; i < InitRootHandlers::doneModules_; ++i) {
546  full_cerr_write(InitRootHandlers::moduleListBuffers_[i].data());
547  }
548  }
549 #endif
550 
551  full_cerr_write("\n\nA fatal system signal has occurred: ");
552  full_cerr_write(signalname);
553  full_cerr_write("\n");
554 
555  // For these four known cases, re-raise the signal to get the correct
556  // exit code.
557  if ((sig == SIGILL) || (sig == SIGSEGV) || (sig == SIGBUS) || (sig == SIGTERM))
558  {
559  signal(sig, SIG_DFL);
560  raise(sig);
561  }
562  else
563  {
564  ::abort();
565  }
566  }
567 
// Minimal signal handler that aborts immediately without printing anything.
// The InitRootHandlers constructor arranges for it to replace
// sig_dostack_then_abort when the per-signal guard objects are destroyed.
568  void sig_abort(int sig, siginfo_t*, void*) {
569  ::abort();
570  }
571  }
572 
// Restore the default disposition for the four signals this file installs
// custom handlers for (SIGILL, SIGSEGV, SIGBUS, SIGTERM).
573  void set_default_signals() {
574  signal(SIGILL, SIG_DFL);
575  signal(SIGSEGV, SIG_DFL);
576  signal(SIGBUS, SIG_DFL);
577  signal(SIGTERM, SIG_DFL);
578  }
579 
580 } // end of unnamed namespace
581 
582 namespace edm {
583  namespace service {
584 
585  /*
586  * We've run into issues where GDB fails to print the thread which calls clone().
587  * To avoid this problem, we have an alternate approach below where the signal handler
588  * only reads/writes to a dedicated thread via pipes. The helper thread does the clone()
589  * invocation; we don't care if that thread is missing from the traceback in this case.
590  */
591  static void cmssw_stacktrace_fork();
592 
594  {
595  int toParent = childToParent_[1];
596  int fromParent = parentToChild_[0];
597  char buf[2]; buf[1] = '\0';
598 
599  while(true)
600  {
601  int result = full_read(fromParent, buf, 1);
602  if (result < 0)
603  {
604  // To avoid a deadlock (this function is NOT re-entrant), reset signals
605  // We never set them back to the CMSSW handler because we assume the parent
606  // thread will abort for us.
607  set_default_signals();
608  close(toParent);
609  full_cerr_write("\n\nTraceback helper thread failed to read from parent: ");
610  full_cerr_write(strerror(-result));
611  full_cerr_write("\n");
612  ::abort();
613  }
614  if (buf[0] == '1')
615  {
616  set_default_signals();
618  full_write(toParent, buf);
619  }
620  else if (buf[0] == '2')
621  {
622  // We have just finished forking. Reload the file descriptors for thread
623  // communication.
624  close(toParent);
625  close(fromParent);
626  toParent = childToParent_[1];
627  fromParent = parentToChild_[0];
628  }
629  else if (buf[0] == '3')
630  {
631  break;
632  }
633  else
634  {
635  set_default_signals();
636  close(toParent);
637  full_cerr_write("\n\nTraceback helper thread got unknown command from parent: ");
638  full_cerr_write(buf);
639  full_cerr_write("\n");
640  ::abort();
641  }
642  }
643  }
644 
646  {
647  int result = full_write(parentToChild_[1], "1");
648  if (result < 0)
649  {
650  full_cerr_write("\n\nAttempt to request stacktrace failed: ");
651  full_cerr_write(strerror(-result));
652  full_cerr_write("\n");
653  return;
654  }
655  char buf[2]; buf[1] = '\0';
656  if ((result = full_read(childToParent_[0], buf, 1, 5*60)) < 0)
657  {
658  full_cerr_write("\n\nWaiting for stacktrace completion failed: ");
659  if (result == -ETIMEDOUT)
660  {
661  full_cerr_write("timed out waiting for GDB to complete.");
662  }
663  else
664  {
665  full_cerr_write(strerror(-result));
666  }
667  full_cerr_write("\n");
668  return;
669  }
670  }
671 
673  {
674  char child_stack[4*1024];
675  char *child_stack_ptr = child_stack + 4*1024;
676  // On Linux, we currently use jemalloc. This registers pthread_atfork handlers; these
677  // handlers are *not* async-signal safe. Hence, a deadlock is possible if we invoke
678  // fork() from our signal handlers. Accordingly, we use clone (not POSIX, but AS-safe)
679  // as that is closer to the 'raw metal' syscall and avoids pthread_atfork handlers.
680  int pid =
681 #ifdef __linux__
682  clone(edm::service::cmssw_stacktrace, child_stack_ptr, CLONE_VM|CLONE_FS|SIGCHLD, nullptr);
683 #else
684  fork();
685  if (child_stack_ptr) {} // Suppress 'unused variable' warning on non-Linux
686  if (pid == 0) {edm::service::cmssw_stacktrace(nullptr); ::abort();}
687 #endif
688  if (pid == -1)
689  {
690  full_cerr_write("(Attempt to perform stack dump failed.)\n");
691  }
692  else
693  {
694  int status;
695  if (waitpid(pid, &status, 0) == -1)
696  {
697  full_cerr_write("(Failed to wait on stack dump output.)\n");
698  }
699  if (status)
700  {
701  full_cerr_write("(GDB stack trace failed unexpectedly)\n");
702  }
703  }
704  }
705 
// Child-side entry point for producing the stack trace: replaces the current
// process image with /bin/sh.  The void* parameter is unused; cmssw_stacktrace_fork
// passes this function to clone() on Linux and calls it directly after fork() elsewhere.
// Does not return on success; if the exec fails the process is aborted.
// NOTE(review): `argv` is presumably the pre-built shell argument vector from
// getPstackArgv() - its definition is not visible here, confirm.
706  int cmssw_stacktrace(void * /*arg*/)
707  {
709  // NOTE: this is NOT async-signal-safe at CERN's lxplus service.
710  // CERN uses LD_PRELOAD to replace execv with a function from libsnoopy which
711  // calls dlsym.
712 #ifdef __linux__
     // Invoke the execve system call directly rather than through the libc wrapper.
713  syscall(SYS_execve, "/bin/sh", argv, __environ);
714 #else
715  execv("/bin/sh", argv);
716 #endif
     // Only reached if the exec call failed.
717  ::abort();
718  return 1;
719  }
720 
721  namespace {
722 
     // Per-thread initialisation: constructs a thread-local TThread object the
     // first time it is called on a given thread; later calls on that thread
     // do nothing.
723  void localInitializeThisThreadForUse() {
724  static thread_local TThread guard;
725  }
726 
     // TBB task that runs the per-thread initialisation and then waits until
     // every sibling task has done the same.
     //   counter     - shared count of tasks that have not yet initialised
     //   waitingTask - task whose reference count is decremented when this one finishes
727  class InitializeThreadTask : public tbb::task {
728  public:
729  InitializeThreadTask(std::atomic<unsigned int>* counter,
730  tbb::task* waitingTask):
731  threadsLeft_(counter),
732  waitTask_(waitingTask) {}
733 
734  tbb::task* execute() override {
735  // For each tbb thread, set up the initialization
736  // required by ROOT and then wait until all
737  // threads have done so in order to guarantee they all get set up
738 
739  localInitializeThisThreadForUse();
740  (*threadsLeft_)--;
     // Busy-wait until every task has decremented the counter.  Because each
     // task occupies its thread until then, no thread can run two of them.
741  while(0 != threadsLeft_->load());
742  waitTask_->decrement_ref_count();
743  return nullptr;
744  }
745  private:
746  std::atomic<unsigned int>* threadsLeft_;
747  tbb::task* waitTask_;
748  };
749  }
750 
751  static char pstackName[] = "(CMSSW stack trace helper)";
752  static char dashC[] = "-c";
755  int InitRootHandlers::parentToChild_[2] = {-1, -1};
756  int InitRootHandlers::childToParent_[2] = {-1, -1};
757  std::unique_ptr<std::thread> InitRootHandlers::helperThread_;
759  std::vector<std::array<char,moduleBufferSize>> InitRootHandlers::moduleListBuffers_;
760  std::atomic<std::size_t> InitRootHandlers::nextModule_(0), InitRootHandlers::doneModules_(0);
762 
763 
765  : RootHandlers(),
766  unloadSigHandler_(pset.getUntrackedParameter<bool> ("UnloadRootSigHandler")),
767  resetErrHandler_(pset.getUntrackedParameter<bool> ("ResetRootErrHandler")),
768  loadAllDictionaries_(pset.getUntrackedParameter<bool>("LoadAllDictionaries")),
769  autoLibraryLoader_(loadAllDictionaries_ or pset.getUntrackedParameter<bool> ("AutoLibraryLoader"))
770  {
771  stackTracePause_ = pset.getUntrackedParameter<int> ("StackTracePauseTime");
772 
773  if(unloadSigHandler_) {
774  // Deactivate all the Root signal handlers and restore the system defaults
775  gSystem->ResetSignal(kSigChild);
776  gSystem->ResetSignal(kSigBus);
777  gSystem->ResetSignal(kSigSegmentationViolation);
778  gSystem->ResetSignal(kSigIllegalInstruction);
779  gSystem->ResetSignal(kSigSystem);
780  gSystem->ResetSignal(kSigPipe);
781  gSystem->ResetSignal(kSigAlarm);
782  gSystem->ResetSignal(kSigUrgent);
783  gSystem->ResetSignal(kSigFloatingException);
784  gSystem->ResetSignal(kSigWindowChanged);
785  } else if(pset.getUntrackedParameter<bool>("AbortOnSignal")){
786  cachePidInfo();
787 
788  //NOTE: ROOT can also be told to abort on these kinds of problems BUT
789  // it requires an TApplication to be instantiated which causes problems
790  gSystem->ResetSignal(kSigBus);
791  gSystem->ResetSignal(kSigSegmentationViolation);
792  gSystem->ResetSignal(kSigIllegalInstruction);
793  installCustomHandler(SIGBUS,sig_dostack_then_abort);
794  sigBusHandler_ = std::shared_ptr<const void>(nullptr,[](void*) {
795  installCustomHandler(SIGBUS,sig_abort);
796  });
797  installCustomHandler(SIGSEGV,sig_dostack_then_abort);
798  sigSegvHandler_ = std::shared_ptr<const void>(nullptr,[](void*) {
799  installCustomHandler(SIGSEGV,sig_abort);
800  });
801  installCustomHandler(SIGILL,sig_dostack_then_abort);
802  sigIllHandler_ = std::shared_ptr<const void>(nullptr,[](void*) {
803  installCustomHandler(SIGILL,sig_abort);
804  });
805  installCustomHandler(SIGTERM,sig_dostack_then_abort);
806  sigTermHandler_ = std::shared_ptr<const void>(nullptr,[](void*) {
807  installCustomHandler(SIGTERM,sig_abort);
808  });
810  }
811 
812  //Initialize each TBB thread so ROOT knows about them
813  iReg.watchPreallocate( [](service::SystemBounds const& iBounds) {
814  auto const nThreads =iBounds.maxNumberOfThreads();
815  if(nThreads > 1) {
816  std::atomic<unsigned int> threadsLeft{nThreads};
817 
818  std::shared_ptr<tbb::empty_task> waitTask{new (tbb::task::allocate_root()) tbb::empty_task{},
819  [](tbb::empty_task* iTask){tbb::task::destroy(*iTask);} };
820 
821  waitTask->set_ref_count(1+nThreads);
822  for(unsigned int i=0; i<nThreads;++i) {
823  tbb::task::spawn( *( new(tbb::task::allocate_root()) InitializeThreadTask(&threadsLeft, waitTask.get())));
824  }
825 
826  waitTask->wait_for_all();
827 
828  }
829  }
830  );
831 
832  iReg.watchPreallocate([this](edm::service::SystemBounds const& iBounds){
833  if (iBounds.maxNumberOfThreads() > moduleListBuffers_.size()) {
834  moduleListBuffers_.resize(iBounds.maxNumberOfThreads());
835  }
836  });
837 
838  if(resetErrHandler_) {
839 
840  // Replace the Root error handler with one that uses the MessageLogger
841  SetErrorHandler(RootErrorHandler);
842  }
843 
844  // Enable automatic Root library loading.
845  if(autoLibraryLoader_) {
846  gInterpreter->SetClassAutoloading(1);
847  }
848 
849  // Set ROOT parameters.
850  TTree::SetMaxTreeSize(kMaxLong64);
851  TH1::AddDirectory(kFALSE);
852  //G__SetCatchException(0);
853 
854  // Set custom streamers
856 
857  // Load the library containing dictionaries for std:: classes, if not already loaded.
858  if (!hasDictionary(typeid(std::vector<std::vector<unsigned int> >))) {
859  TypeWithDict::byName("std::vector<std::vector<unsigned int> >");
860  }
861 
862  int debugLevel = pset.getUntrackedParameter<int>("DebugLevel");
863  if(debugLevel >0) {
864  gDebug = debugLevel;
865  }
866  }
867 
869  // close all open ROOT files
870  TIter iter(gROOT->GetListOfFiles());
871  TObject *obj = nullptr;
872  while(nullptr != (obj = iter.Next())) {
873  TFile* f = dynamic_cast<TFile*>(obj);
874  if(f) {
875  // We get a new iterator each time,
876  // because closing a file can invalidate the iterator
877  f->Close();
878  iter = TIter(gROOT->GetListOfFiles());
879  }
880  }
881  }
882 
884  //Tell Root we want to be multi-threaded
885  TThread::Initialize();
886  //When threading, also have to keep ROOT from logging all TObjects into a list
887  TObject::SetObjectStat(false);
888 
889  //Have to avoid having Streamers modify themselves after they have been used
890  TVirtualStreamerInfo::Optimize(false);
891  }
892 
894  localInitializeThisThreadForUse();
895  }
896 
899  desc.setComment("Centralized interface to ROOT.");
900  desc.addUntracked<bool>("UnloadRootSigHandler", false)
901  ->setComment("If True, signals are handled by this service, rather than by ROOT.");
902  desc.addUntracked<bool>("ResetRootErrHandler", true)
903  ->setComment("If True, ROOT messages (e.g. errors, warnings) are handled by this service, rather than by ROOT.");
904  desc.addUntracked<bool>("AutoLibraryLoader", true)
905  ->setComment("If True, enables automatic loading of data dictionaries.");
906  desc.addUntracked<bool>("LoadAllDictionaries",false)
907  ->setComment("If True, loads all ROOT dictionaries.");
908  desc.addUntracked<bool>("AbortOnSignal",true)
909  ->setComment("If True, do an abort when a signal occurs that causes a crash. If False, ROOT will do an exit which attempts to do a clean shutdown.");
910  desc.addUntracked<int>("DebugLevel",0)
911  ->setComment("Sets ROOT's gDebug value.");
912  desc.addUntracked<int>("StackTracePauseTime", 300)
913  ->setComment("Seconds to pause other threads during stack trace.");
914  descriptions.add("InitRootHandlers", desc);
915  }
916 
917  char *const *
919  return pstackArgv_;
920  }
921 
922  void
924  s_ignoreWarnings =false;
925  }
926 
927  void
929  s_ignoreWarnings = true;
930  }
931 
932  void
934  {
935  if (snprintf(pidString_, pidStringLength_-1, "gdb -quiet -p %d 2>&1 <<EOF |\n"
936  "set width 0\n"
937  "set height 0\n"
938  "set pagination no\n"
939  "thread apply all bt\n"
940  "EOF\n"
941  "/bin/sed -n -e 's/^\\((gdb) \\)*//' -e '/^#/p' -e '/^Thread/p'", getpid()) >= pidStringLength_)
942  {
943  std::ostringstream sstr;
944  sstr << "Unable to pre-allocate stacktrace handler information";
945  edm::Exception except(edm::errors::OtherCMS, sstr.str());
946  throw except;
947  }
948 
949  // These are initialized to -1; harmless to close an invalid FD.
950  // If this is called post-fork, we don't want to be communicating on
951  // these FDs as they are used internally by the parent.
952  close(childToParent_[0]);
953  close(childToParent_[1]);
954  childToParent_[0] = -1; childToParent_[1] = -1;
955  close(parentToChild_[0]);
956  close(parentToChild_[1]);
957  parentToChild_[0] = -1; parentToChild_[1] = -1;
958 
959  if (-1 == pipe2(childToParent_, O_CLOEXEC))
960  {
961  std::ostringstream sstr;
962  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
963  edm::Exception except(edm::errors::OtherCMS, sstr.str());
964  throw except;
965  }
966 
967  if (-1 == pipe2(parentToChild_, O_CLOEXEC))
968  {
969  close(childToParent_[0]); close(childToParent_[1]);
970  childToParent_[0] = -1; childToParent_[1] = -1;
971  std::ostringstream sstr;
972  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
973  edm::Exception except(edm::errors::OtherCMS, sstr.str());
974  throw except;
975  }
976 
977  helperThread_.reset(new std::thread(stacktraceHelperThread));
978  helperThread_->detach();
979  }
980 
981  } // end of namespace service
982 } // end of namespace edm
983 
987 
unsigned int maxNumberOfThreads() const
Definition: SystemBounds.h:46
T getUntrackedParameter(std::string const &, T const &) const
int i
Definition: DBlmapReader.cc:9
virtual void enableWarnings_() override
static void cmssw_stacktrace_fork()
#define DEFINE_FWK_SERVICE_MAKER(concrete, maker)
Definition: ServiceMaker.h:117
double seconds()
void watchPreallocate(Preallocate::slot_type const &iSlot)
static char *const pstackArgv_[]
ParameterDescriptionBase * addUntracked(U const &iLabel, T const &value)
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventIDconst &, edm::Timestampconst & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
Definition: Activities.doc:12
bool isProcessWideService(TFileService const *)
Definition: TFileService.h:99
static ModuleCallingContext const * getCurrentModuleOnThread()
void setRefCoreStreamer(bool resetAll=false)
std::vector< Variable::Flags > flags
Definition: MVATrainer.cc:135
#define NULL
Definition: scimark2.h:8
void installCustomHandler(int signum, CFUNC func)
std::shared_ptr< const void > sigSegvHandler_
friend int cmssw_stacktrace(void *)
#define constexpr
std::shared_ptr< const void > sigBusHandler_
static TypeWithDict byName(std::string const &name)
Definition: TypeWithDict.cc:60
tuple result
Definition: mps_fire.py:84
static std::atomic< std::size_t > doneModules_
static const ThreadTracker::Container_type & threadIDs()
void cachePidInfoHandler(unsigned int, unsigned int)
std::atomic< unsigned int > * threadsLeft_
void setComment(std::string const &value)
std::string moduleName(Provenance const &provenance)
Definition: Provenance.cc:27
static std::atomic< std::size_t > nextModule_
static char pidString_[pidStringLength_]
static ThreadTracker threadTracker_
edm::serviceregistry::AllArgsMaker< edm::RootHandlers, InitRootHandlers > RootHandlersMaker
static char *const * getPstackArgv()
std::shared_ptr< const void > sigIllHandler_
virtual void initializeThisThreadForUse() override
virtual void ignoreWarnings_() override
std::shared_ptr< const void > sigTermHandler_
tuple fd
Definition: ztee.py:136
void addAdditionalInfo(std::string const &info)
Definition: Exception.cc:235
double f[11][100]
tuple text
Definition: runonSM.py:42
int cmssw_stacktrace(void *)
tbb::task * waitTask_
static std::unique_ptr< std::thread > helperThread_
static std::vector< std::array< char, moduleBufferSize > > moduleListBuffers_
static char pstackName[]
InitRootHandlers(ParameterSet const &pset, ActivityRegistry &iReg)
tuple pid
Definition: sysUtil.py:22
tbb::concurrent_unordered_set< pthread_t > Container_type
void add(std::string const &label, ParameterSetDescription const &psetDescription)
TEveGeoShape * clone(const TEveElement *element, TEveElement *parent)
Definition: eve_macros.cc:135
void watchPostForkReacquireResources(PostForkReacquireResources::slot_type const &iSlot)
virtual void willBeUsingThreads() override
static char dashC[]
char data[epos_bytes_allocation]
Definition: EPOS_Wrapper.h:82
static void fillDescriptions(ConfigurationDescriptions &descriptions)
static std::atomic< unsigned int > counter
SeverityLevel
bool hasDictionary(std::type_info const &)
#define O_NONBLOCK
Definition: SysFile.h:21
tuple level
Definition: testEve_cfg.py:34
tuple size
Write out results.
tuple status
Definition: mps_update.py:57