CMS 3D CMS Logo

InitRootHandlers.cc
Go to the documentation of this file.
2 
4 
20 
21 #include "tbb/task.h"
22 #include "tbb/task_scheduler_observer.h"
23 #include "tbb/concurrent_unordered_set.h"
24 #include <thread>
25 #include <sys/wait.h>
26 #include <sstream>
27 #include <string.h>
28 #include <poll.h>
29 #include <atomic>
30 
31 // WORKAROUND: At CERN, execv is replaced with a non-async-signal safe
32 // version. This can break our stack trace printer. Avoid this by
33 // invoking the syscall directly.
34 #ifdef __linux__
35 #include <syscall.h>
36 #endif
37 
38 #include "TROOT.h"
39 #include "TError.h"
40 #include "TFile.h"
41 #include "TInterpreter.h"
42 #include "TH1.h"
43 #include "TSystem.h"
44 #include "TUnixSystem.h"
45 #include "TTree.h"
46 #include "TVirtualStreamerInfo.h"
47 
48 #include "TThread.h"
49 #include "TClassTable.h"
50 
51 #include <memory>
52 
namespace {
  // Capacity, in bytes, of each fixed-size character buffer that holds a
  // "Module: <name>" line written out after a stacktrace abort.
  constexpr std::size_t moduleBufferSize{128};
}
58 
59 namespace edm {
61  class ParameterSet;
62  class ActivityRegistry;
63 
64  namespace service {
65  class InitRootHandlers : public RootHandlers {
66 
67  friend int cmssw_stacktrace(void *);
68 
69  public:
70  class ThreadTracker : public tbb::task_scheduler_observer {
71  public:
72  typedef tbb::concurrent_unordered_set<pthread_t> Container_type;
73 
74  ThreadTracker() : tbb::task_scheduler_observer() {
75  observe(true);
76  }
77  void on_scheduler_entry(bool) {
78  // ensure thread local has been allocated; not necessary on Linux with
79  // the current cmsRun linkage, but could be an issue if the platform
80  // or linkage leads to "lazy" allocation of the thread local. By
81  // referencing it here we make sure it has been allocated and can be
82  // accessed safely from our signal handler.
84  threadIDs_.insert(pthread_self());
85  }
86  const Container_type& IDs() { return threadIDs_; }
87 
88  private:
89  Container_type threadIDs_;
90  };
91 
92  explicit InitRootHandlers(ParameterSet const& pset, ActivityRegistry& iReg);
93  virtual ~InitRootHandlers();
94 
95  static void fillDescriptions(ConfigurationDescriptions& descriptions);
96  static void stacktraceFromThread();
98  static int stackTracePause() { return stackTracePause_; }
99 
100  static std::vector<std::array<char,moduleBufferSize>> moduleListBuffers_;
101  static std::atomic<std::size_t> nextModule_, doneModules_;
102  private:
103  static char *const *getPstackArgv();
104  virtual void enableWarnings_() override;
105  virtual void ignoreWarnings_() override;
106  virtual void willBeUsingThreads() override;
107  virtual void initializeThisThreadForUse() override;
108 
109  void cachePidInfoHandler(unsigned int, unsigned int) {
110  //this is called only on a fork, so the thread doesn't
111  // actually exist anymore
112  helperThread_.reset();
113  cachePidInfo();}
114  void cachePidInfo();
115  static void stacktraceHelperThread();
116 
117  static const int pidStringLength_ = 200;
119  static char * const pstackArgv_[];
120  static int parentToChild_[2];
121  static int childToParent_[2];
122  static std::unique_ptr<std::thread> helperThread_;
124  static int stackTracePause_;
125 
130  std::shared_ptr<const void> sigBusHandler_;
131  std::shared_ptr<const void> sigSegvHandler_;
132  std::shared_ptr<const void> sigIllHandler_;
133  std::shared_ptr<const void> sigTermHandler_;
134  };
135 
136  inline
138  return true;
139  }
140 
141  } // end of namespace service
142 } // end of namespace edm
143 
// Forward declaration of the stack-trace entry point; it takes an opaque
// pointer argument and returns an int status.
namespace edm { namespace service {
  int cmssw_stacktrace(void* /*arg*/);
} }
149 
150 namespace {
// Severity categories used locally when classifying ROOT diagnostics,
// listed from least (kInfo) to most (kFatal) severe.
enum class SeverityLevel { kInfo, kWarning, kError, kSysError, kFatal };
158 
// Per-thread switch: while true, ROOT messages at warning level are
// classified as informational by the error handler below.
static thread_local bool s_ignoreWarnings{false};

// Switch shared by all threads: while true, every ROOT message is classified
// as informational regardless of its level.
static bool s_ignoreEverything{false};
162 
163  void RootErrorHandlerImpl(int level, char const* location, char const* message) {
164 
165  bool die = false;
166 
167  // Translate ROOT severity level to MessageLogger severity level
168 
169  SeverityLevel el_severity = SeverityLevel::kInfo;
170 
171  if (level >= kFatal) {
172  el_severity = SeverityLevel::kFatal;
173  } else if (level >= kSysError) {
174  el_severity = SeverityLevel::kSysError;
175  } else if (level >= kError) {
176  el_severity = SeverityLevel::kError;
177  } else if (level >= kWarning) {
178  el_severity = s_ignoreWarnings ? SeverityLevel::kInfo : SeverityLevel::kWarning;
179  }
180 
181  if(s_ignoreEverything) {
182  el_severity = SeverityLevel::kInfo;
183  }
184 
185  // Adapt C-strings to std::strings
186  // Arrange to report the error location as furnished by Root
187 
188  std::string el_location = "@SUB=?";
189  if (location != 0) el_location = std::string("@SUB=")+std::string(location);
190 
191  std::string el_message = "?";
192  if (message != 0) el_message = message;
193 
194  // Try to create a meaningful id string using knowledge of ROOT error messages
195  //
196  // id == "ROOT-ClassName" where ClassName is the affected class
197  // else "ROOT/ClassName" where ClassName is the error-declaring class
198  // else "ROOT"
199 
200  std::string el_identifier = "ROOT";
201 
202  std::string precursor("class ");
203  size_t index1 = el_message.find(precursor);
204  if (index1 != std::string::npos) {
205  size_t index2 = index1 + precursor.length();
206  size_t index3 = el_message.find_first_of(" :", index2);
207  if (index3 != std::string::npos) {
208  size_t substrlen = index3-index2;
209  el_identifier += "-";
210  el_identifier += el_message.substr(index2,substrlen);
211  }
212  } else {
213  index1 = el_location.find("::");
214  if (index1 != std::string::npos) {
215  el_identifier += "/";
216  el_identifier += el_location.substr(0, index1);
217  }
218  }
219 
220  // Intercept some messages and upgrade the severity
221 
222  if ((el_location.find("TBranchElement::Fill") != std::string::npos)
223  && (el_message.find("fill branch") != std::string::npos)
224  && (el_message.find("address") != std::string::npos)
225  && (el_message.find("not set") != std::string::npos)) {
226  el_severity = SeverityLevel::kFatal;
227  }
228 
229  if ((el_message.find("Tree branches") != std::string::npos)
230  && (el_message.find("different numbers of entries") != std::string::npos)) {
231  el_severity = SeverityLevel::kFatal;
232  }
233 
234 
235  // Intercept some messages and downgrade the severity
236 
237  if ((el_message.find("no dictionary for class") != std::string::npos) ||
238  (el_message.find("already in TClassTable") != std::string::npos) ||
239  (el_message.find("matrix not positive definite") != std::string::npos) ||
240  (el_message.find("not a TStreamerInfo object") != std::string::npos) ||
241  (el_message.find("Problems declaring payload") != std::string::npos) ||
242  (el_message.find("Announced number of args different from the real number of argument passed") != std::string::npos) || // Always printed if gDebug>0 - regardless of whether warning message is real.
243  (el_location.find("Fit") != std::string::npos) ||
244  (el_location.find("TDecompChol::Solve") != std::string::npos) ||
245  (el_location.find("THistPainter::PaintInit") != std::string::npos) ||
246  (el_location.find("TUnixSystem::SetDisplay") != std::string::npos) ||
247  (el_location.find("TGClient::GetFontByName") != std::string::npos) ||
248  (el_location.find("Inverter::Dinv") != std::string::npos) ||
249  (el_message.find("nbins is <=0 - set to nbins = 1") != std::string::npos) ||
250  (el_message.find("nbinsy is <=0 - set to nbinsy = 1") != std::string::npos) ||
251  (level < kError and
252  (el_location.find("CINTTypedefBuilder::Setup")!= std::string::npos) and
253  (el_message.find("possible entries are in use!") != std::string::npos))) {
254  el_severity = SeverityLevel::kInfo;
255  }
256 
257  if (el_severity == SeverityLevel::kInfo) {
258  // Don't throw if the message is just informational.
259  die = false;
260  } else {
261  die = true;
262  }
263 
264  // Feed the message to the MessageLogger and let it choose to suppress or not.
265 
266  // Root has declared a fatal error. Throw an EDMException unless the
267  // message corresponds to a pending signal. In that case, do not throw
268  // but let the OS deal with the signal in the usual way.
269  if (die && (el_location != std::string("@SUB=TUnixSystem::DispatchSignals"))) {
270  std::ostringstream sstr;
271  sstr << "Fatal Root Error: " << el_location << "\n" << el_message << '\n';
272  edm::Exception except(edm::errors::FatalRootError, sstr.str());
273  except.addAdditionalInfo(except.message());
274  except.clearMessage();
275  throw except;
276 
277  }
278 
279  // Typically, we get here only for informational messages,
280  // but we leave the other code in just in case we change
281  // the criteria for throwing.
282  if (el_severity == SeverityLevel::kFatal) {
283  edm::LogError("Root_Fatal") << el_location << el_message;
284  } else if (el_severity == SeverityLevel::kSysError) {
285  edm::LogError("Root_Severe") << el_location << el_message;
286  } else if (el_severity == SeverityLevel::kError) {
287  edm::LogError("Root_Error") << el_location << el_message;
288  } else if (el_severity == SeverityLevel::kWarning) {
289  edm::LogWarning("Root_Warning") << el_location << el_message ;
290  } else if (el_severity == SeverityLevel::kInfo) {
291  edm::LogInfo("Root_Information") << el_location << el_message ;
292  }
293  }
294 
295  void RootErrorHandler(int level, bool, char const* location, char const* message) {
296  RootErrorHandlerImpl(level, location, message);
297  }
298 
299  extern "C" {
300 
// Writes the entire NUL-terminated string `text` to descriptor `fd`,
// retrying after short writes and after EINTR.
// Returns 0 on success, or the negated errno of the first write() failure
// other than EINTR.
static int full_write(int fd, const char *text)
{
  const char *cursor = text;
  for (size_t remaining = strlen(text); remaining != 0; )
  {
    const ssize_t n = write(fd, cursor, remaining);
    if (n == -1)
    {
      if (errno != EINTR) { return -errno; }
      continue;  // interrupted before anything was written: retry
    }
    remaining -= n;
    cursor += n;
  }
  return 0;
}
319 
// Reads exactly `len` bytes from `fd` into `inbuf`.
//
//   timeout_s < 0  : no deadline; the descriptor's blocking mode is left alone.
//   timeout_s >= 0 : overall deadline in seconds. The descriptor is switched
//                    to non-blocking mode for the duration of the call (if it
//                    was not already) and its original flags are restored
//                    before returning.
//
// Returns 0 once `len` bytes have been read, -ETIMEDOUT when the deadline
// passes, -EPIPE when end-of-file is reached before `len` bytes arrived, or
// the negated errno of a failing fcntl()/read().
static int full_read(int fd, char *inbuf, size_t len, int timeout_s=-1)
{
  char *buf = inbuf;
  size_t count = len;
  const bool use_timeout = (timeout_s >= 0);
  const std::chrono::time_point<std::chrono::steady_clock> end_time =
      std::chrono::steady_clock::now() + std::chrono::seconds(timeout_s);

  // Without a timeout, pretend the descriptor is already non-blocking so
  // that its flags are never modified or restored.
  int flags = O_NONBLOCK;
  if (use_timeout)
  {
    flags = fcntl(fd, F_GETFL);
    if (flags == -1)
    {
      return -errno;
    }
  }
  const bool restore_flags = ((flags & O_NONBLOCK) != O_NONBLOCK);
  if (restore_flags && (-1 == fcntl(fd, F_SETFL, flags | O_NONBLOCK)))
  {
    return -errno;
  }

  // Single exit path: put the original flags back (when we changed them) and
  // hand the result code through. The argument is evaluated before fcntl()
  // runs, so an errno-derived code is not clobbered by the restore.
  const auto finish = [fd, flags, restore_flags](int rc) {
    if (restore_flags)
    {
      fcntl(fd, F_SETFL, flags);
    }
    return rc;
  };

  while (count)
  {
    if (use_timeout)
    {
      struct pollfd poll_info{fd, POLLIN, 0};
      const int ms_remaining = static_cast<int>(
          std::chrono::duration_cast<std::chrono::milliseconds>(end_time - std::chrono::steady_clock::now()).count());
      if (ms_remaining > 0)
      {
        // A poll() failure (-1) deliberately falls through to read().
        if (poll(&poll_info, 1, ms_remaining) == 0)
        {
          return finish(-ETIMEDOUT);
        }
      }
      else if (ms_remaining < 0)
      {
        return finish(-ETIMEDOUT);
      }
    }
    const ssize_t complete = read(fd, buf, count);
    if (complete == -1)
    {
      if ((errno == EINTR) || (errno == EAGAIN) || (errno == EWOULDBLOCK)) {continue;}
      return finish(-errno);
    }
    if (complete == 0)
    {
      // End-of-file: the writer is gone, so the remaining bytes can never
      // arrive. Report it instead of looping on zero-length reads.
      return finish(-EPIPE);
    }
    count -= complete;
    buf += complete;
  }
  return finish(0);
}
391 
392  static int full_cerr_write(const char *text)
393  {
394  return full_write(2, text);
395  }
396 
// these signals are only used inside the stacktrace signal handler,
// so common signals can be used. They do have to be different, since
// we do not set SA_NODEFER, and RESUME must be a signal that will
// cause sleep() to return early.
// The RESUME_SIGNAL expansion is parenthesized so that it stays a single
// value wherever the macro is used inside a larger expression.
#if defined(SIGRTMAX)
#define PAUSE_SIGNAL SIGRTMAX
#define RESUME_SIGNAL (SIGRTMAX-1)
#elif defined(SIGINFO) // macOS/BSD
#define PAUSE_SIGNAL SIGINFO
#define RESUME_SIGNAL SIGALRM
#endif
408 
// Intentionally empty. Having a handler installed is enough for delivery of
// the resume signal to interrupt the sleep() in the pause handler.
void sig_resume_handler(int /*sig*/, siginfo_t* /*info*/, void* /*context*/)
{
}
411 
412  // pause a thread so that a (slow) stacktrace will capture the current state
413  void sig_pause_for_stacktrace(int sig, siginfo_t*, void*) {
414  using namespace edm::service;
415 
416 #ifdef RESUME_SIGNAL
417  sigset_t sigset;
418  sigemptyset(&sigset);
419  sigaddset(&sigset, RESUME_SIGNAL);
420  pthread_sigmask(SIG_UNBLOCK, &sigset, 0);
421 #endif
422  // sleep interrupts on a handled delivery of the resume signal
424 
425  if (InitRootHandlers::doneModules_.is_lock_free() && InitRootHandlers::nextModule_.is_lock_free()) {
428  char* buff = InitRootHandlers::moduleListBuffers_[i].data();
429 
430  strlcpy(buff, "\nModule: ", moduleBufferSize);
432  strlcat(buff, edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(), moduleBufferSize);
433  } else {
434  strlcat(buff, "none", moduleBufferSize);
435  }
437  }
438  }
439  }
440 
441  void sig_dostack_then_abort(int sig, siginfo_t*, void*) {
442  using namespace edm::service;
443 
444  const auto& tids = InitRootHandlers::threadIDs();
445 
446  const auto self = pthread_self();
447 #ifdef PAUSE_SIGNAL
448  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
449  // install the "pause" handler
450  struct sigaction act;
451  act.sa_sigaction = sig_pause_for_stacktrace;
452  act.sa_flags = 0;
453  sigemptyset(&act.sa_mask);
454  sigaction(PAUSE_SIGNAL, &act, NULL);
455 
456  // unblock pause signal globally, resume is unblocked in the pause handler
457  sigset_t pausesigset;
458  sigemptyset(&pausesigset);
459  sigaddset(&pausesigset, PAUSE_SIGNAL);
460  sigprocmask(SIG_UNBLOCK, &pausesigset, 0);
461 
462  // send a pause signal to all CMSSW/TBB threads other than self
463  for (auto id : tids) {
464  if (self != id) {
465  pthread_kill(id, PAUSE_SIGNAL);
466  }
467  }
468 
469 #ifdef RESUME_SIGNAL
470  // install the "resume" handler
471  act.sa_sigaction = sig_resume_handler;
472  sigaction(RESUME_SIGNAL, &act, NULL);
473 #endif
474  }
475 #endif
476 
477  const char* signalname = "unknown";
478  switch (sig) {
479  case SIGBUS:
480  {
481  signalname = "bus error";
482  break;
483  }
484  case SIGSEGV:
485  {
486  signalname = "segmentation violation";
487  break;
488  }
489  case SIGILL:
490  {
491  signalname = "illegal instruction";
492  break;
493  }
494  case SIGTERM:
495  {
496  signalname = "external termination request";
497  break;
498  }
499  default:
500  break;
501  }
502  full_cerr_write("\n\nA fatal system signal has occurred: ");
503  full_cerr_write(signalname);
504  full_cerr_write("\nThe following is the call stack containing the origin of the signal.\n\n");
505 
507 
508  // resume the signal handlers to store the current module; we are not guaranteed they
509  // will have time to store their modules, so there is a race condition; this could be
510  // avoided by storing the module information before sleeping, a change that may be
511  // made when we're convinced accessing the thread-local current module is safe.
512 #ifdef RESUME_SIGNAL
513  std::size_t notified = 0;
514  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
515  for (auto id : tids) {
516  if (self != id) {
517  if (pthread_kill(id, RESUME_SIGNAL) == 0) ++notified;
518  }
519  }
520  }
521 #endif
522 
523  full_cerr_write("\nCurrent Modules:\n");
524 
525  // Checking tids.count(self) ensures that we only try to access the current module in
526  // CMSSW/TBB threads. Those threads access the thread-local current module at the same
527  // time the thread is registered, so any lazy allocation will have been done at that
528  // point. Not necessary on Linux with the current cmsRun linkage, as the thread-local
529  // is allocated at exec time, not lazily.
530  if (tids.count(self) > 0) {
531  char buff[moduleBufferSize] = "\nModule: ";
533  strlcat(buff, edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(), moduleBufferSize);
534  } else {
535  strlcat(buff, "none", moduleBufferSize);
536  }
537  strlcat(buff, " (crashed)", moduleBufferSize);
538  full_cerr_write(buff);
539  } else {
540  full_cerr_write("\nModule: non-CMSSW (crashed)");
541  }
542 
543 #ifdef PAUSE_SIGNAL
544  // wait a short interval for the paused threads to resume and fill in their module
545  // information, then print
546  if (InitRootHandlers::doneModules_.is_lock_free()) {
547  int spincount = 0;
548  timespec t = { 0, 1000 };
549  while (++spincount < 1000 && InitRootHandlers::doneModules_ < notified) { nanosleep(&t, nullptr); }
550  for (std::size_t i = 0; i < InitRootHandlers::doneModules_; ++i) {
551  full_cerr_write(InitRootHandlers::moduleListBuffers_[i].data());
552  }
553  }
554 #endif
555 
556  full_cerr_write("\n\nA fatal system signal has occurred: ");
557  full_cerr_write(signalname);
558  full_cerr_write("\n");
559 
560  // For these four known cases, re-raise the signal so that we get the correct
561  // exit code.
562  if ((sig == SIGILL) || (sig == SIGSEGV) || (sig == SIGBUS) || (sig == SIGTERM))
563  {
564  signal(sig, SIG_DFL);
565  raise(sig);
566  }
567  else
568  {
569  ::abort();
570  }
571  }
572 
573  void sig_abort(int sig, siginfo_t*, void*) {
574  full_cerr_write("\n\nFatal system signal has occurred during exit\n");
575 
576  // re-raise the signal to get the correct exit code
577  signal(sig, SIG_DFL);
578  raise(sig);
579 
580  // shouldn't get here
581  ::sleep(10);
582  ::abort();
583  }
584  }
585 
// Restores the default disposition for SIGILL, SIGSEGV, SIGBUS and SIGTERM,
// in that order.
void set_default_signals() {
  const int fatalSignals[] = {SIGILL, SIGSEGV, SIGBUS, SIGTERM};
  for (const int signum : fatalSignals) {
    signal(signum, SIG_DFL);
  }
}
592 
593 } // end of unnamed namespace
594 
595 namespace edm {
596  namespace service {
597 
598  /*
599  * We've run into issues where GDB fails to print the thread which calls clone().
600  * To avoid this problem, we have an alternate approach below where the signal handler
601  * only reads/writes to a dedicated thread via pipes. The helper thread does the clone()
602  * invocation; we don't care if that thread is missing from the traceback in this case.
603  */
604  static void cmssw_stacktrace_fork();
605 
607  {
608  int toParent = childToParent_[1];
609  int fromParent = parentToChild_[0];
610  char buf[2]; buf[1] = '\0';
611 
612  while(true)
613  {
614  int result = full_read(fromParent, buf, 1);
615  if (result < 0)
616  {
617  // To avoid a deadlock (this function is NOT re-entrant), reset signals
618  // We never set them back to the CMSSW handler because we assume the parent
619  // thread will abort for us.
620  set_default_signals();
621  close(toParent);
622  full_cerr_write("\n\nTraceback helper thread failed to read from parent: ");
623  full_cerr_write(strerror(-result));
624  full_cerr_write("\n");
625  ::abort();
626  }
627  if (buf[0] == '1')
628  {
629  set_default_signals();
631  full_write(toParent, buf);
632  }
633  else if (buf[0] == '2')
634  {
635  // We have just finished forking. Reload the file descriptors for thread
636  // communication.
637  close(toParent);
638  close(fromParent);
639  toParent = childToParent_[1];
640  fromParent = parentToChild_[0];
641  }
642  else if (buf[0] == '3')
643  {
644  break;
645  }
646  else
647  {
648  set_default_signals();
649  close(toParent);
650  full_cerr_write("\n\nTraceback helper thread got unknown command from parent: ");
651  full_cerr_write(buf);
652  full_cerr_write("\n");
653  ::abort();
654  }
655  }
656  }
657 
659  {
660  int result = full_write(parentToChild_[1], "1");
661  if (result < 0)
662  {
663  full_cerr_write("\n\nAttempt to request stacktrace failed: ");
664  full_cerr_write(strerror(-result));
665  full_cerr_write("\n");
666  return;
667  }
668  char buf[2]; buf[1] = '\0';
669  if ((result = full_read(childToParent_[0], buf, 1, 5*60)) < 0)
670  {
671  full_cerr_write("\n\nWaiting for stacktrace completion failed: ");
672  if (result == -ETIMEDOUT)
673  {
674  full_cerr_write("timed out waiting for GDB to complete.");
675  }
676  else
677  {
678  full_cerr_write(strerror(-result));
679  }
680  full_cerr_write("\n");
681  return;
682  }
683  }
684 
686  {
687  char child_stack[4*1024];
688  char *child_stack_ptr = child_stack + 4*1024;
689  // On Linux, we currently use jemalloc. This registers pthread_atfork handlers; these
690  // handlers are *not* async-signal safe. Hence, a deadlock is possible if we invoke
691  // fork() from our signal handlers. Accordingly, we use clone (not POSIX, but AS-safe)
692  // as that is closer to the 'raw metal' syscall and avoids pthread_atfork handlers.
693  int pid =
694 #ifdef __linux__
695  clone(edm::service::cmssw_stacktrace, child_stack_ptr, CLONE_VM|CLONE_FS|SIGCHLD, nullptr);
696 #else
697  fork();
698  if (child_stack_ptr) {} // Suppress 'unused variable' warning on non-Linux
699  if (pid == 0) {edm::service::cmssw_stacktrace(nullptr); ::abort();}
700 #endif
701  if (pid == -1)
702  {
703  full_cerr_write("(Attempt to perform stack dump failed.)\n");
704  }
705  else
706  {
707  int status;
708  if (waitpid(pid, &status, 0) == -1)
709  {
710  full_cerr_write("(Failed to wait on stack dump output.)\n");
711  }
712  if (status)
713  {
714  full_cerr_write("(GDB stack trace failed unexpectedly)\n");
715  }
716  }
717  }
718 
719  int cmssw_stacktrace(void * /*arg*/)
720  {
722  // NOTE: this is NOT async-signal-safe at CERN's lxplus service.
723  // CERN uses LD_PRELOAD to replace execv with a function from libsnoopy which
724  // calls dlsym.
725 #ifdef __linux__
726  syscall(SYS_execve, "/bin/sh", argv, __environ);
727 #else
728  execv("/bin/sh", argv);
729 #endif
730  ::abort();
731  return 1;
732  }
733 
734  namespace {
735 
736  void localInitializeThisThreadForUse() {
737  static thread_local TThread guard;
738  }
739 
740  class InitializeThreadTask : public tbb::task {
741  public:
742  InitializeThreadTask(std::atomic<unsigned int>* counter,
743  tbb::task* waitingTask):
744  threadsLeft_(counter),
745  waitTask_(waitingTask) {}
746 
747  tbb::task* execute() override {
748  //For each tbb thread, setup the initialization
749  // required by ROOT and then wait until all
750  // threads have done so in order to guarantee the all get setup
751 
752  localInitializeThisThreadForUse();
753  (*threadsLeft_)--;
754  while(0 != threadsLeft_->load());
755  waitTask_->decrement_ref_count();
756  return nullptr;
757  }
758  private:
759  std::atomic<unsigned int>* threadsLeft_;
760  tbb::task* waitTask_;
761  };
762  }
763 
764  static char pstackName[] = "(CMSSW stack trace helper)";
765  static char dashC[] = "-c";
768  int InitRootHandlers::parentToChild_[2] = {-1, -1};
769  int InitRootHandlers::childToParent_[2] = {-1, -1};
770  std::unique_ptr<std::thread> InitRootHandlers::helperThread_;
772  std::vector<std::array<char,moduleBufferSize>> InitRootHandlers::moduleListBuffers_;
773  std::atomic<std::size_t> InitRootHandlers::nextModule_(0), InitRootHandlers::doneModules_(0);
775 
776 
778  : RootHandlers(),
779  unloadSigHandler_(pset.getUntrackedParameter<bool> ("UnloadRootSigHandler")),
780  resetErrHandler_(pset.getUntrackedParameter<bool> ("ResetRootErrHandler")),
781  loadAllDictionaries_(pset.getUntrackedParameter<bool>("LoadAllDictionaries")),
782  autoLibraryLoader_(loadAllDictionaries_ or pset.getUntrackedParameter<bool> ("AutoLibraryLoader"))
783  {
784  stackTracePause_ = pset.getUntrackedParameter<int> ("StackTracePauseTime");
785 
786  if(unloadSigHandler_) {
787  // Deactivate all the Root signal handlers and restore the system defaults
788  gSystem->ResetSignal(kSigChild);
789  gSystem->ResetSignal(kSigBus);
790  gSystem->ResetSignal(kSigSegmentationViolation);
791  gSystem->ResetSignal(kSigIllegalInstruction);
792  gSystem->ResetSignal(kSigSystem);
793  gSystem->ResetSignal(kSigPipe);
794  gSystem->ResetSignal(kSigAlarm);
795  gSystem->ResetSignal(kSigUrgent);
796  gSystem->ResetSignal(kSigFloatingException);
797  gSystem->ResetSignal(kSigWindowChanged);
798  } else if(pset.getUntrackedParameter<bool>("AbortOnSignal")){
799  cachePidInfo();
800 
801  //NOTE: ROOT can also be told to abort on these kinds of problems BUT
802  // it requires an TApplication to be instantiated which causes problems
803  gSystem->ResetSignal(kSigBus);
804  gSystem->ResetSignal(kSigSegmentationViolation);
805  gSystem->ResetSignal(kSigIllegalInstruction);
806  installCustomHandler(SIGBUS,sig_dostack_then_abort);
807  sigBusHandler_ = std::shared_ptr<const void>(nullptr,[](void*) {
808  installCustomHandler(SIGBUS,sig_abort);
809  });
810  installCustomHandler(SIGSEGV,sig_dostack_then_abort);
811  sigSegvHandler_ = std::shared_ptr<const void>(nullptr,[](void*) {
812  installCustomHandler(SIGSEGV,sig_abort);
813  });
814  installCustomHandler(SIGILL,sig_dostack_then_abort);
815  sigIllHandler_ = std::shared_ptr<const void>(nullptr,[](void*) {
816  installCustomHandler(SIGILL,sig_abort);
817  });
818  installCustomHandler(SIGTERM,sig_dostack_then_abort);
819  sigTermHandler_ = std::shared_ptr<const void>(nullptr,[](void*) {
820  installCustomHandler(SIGTERM,sig_abort);
821  });
823  }
824 
825  //Initialize each TBB thread so ROOT knows about them
826  iReg.watchPreallocate( [](service::SystemBounds const& iBounds) {
827  auto const nThreads =iBounds.maxNumberOfThreads();
828  if(nThreads > 1) {
829  std::atomic<unsigned int> threadsLeft{nThreads};
830 
831  std::shared_ptr<tbb::empty_task> waitTask{new (tbb::task::allocate_root()) tbb::empty_task{},
832  [](tbb::empty_task* iTask){tbb::task::destroy(*iTask);} };
833 
834  waitTask->set_ref_count(1+nThreads);
835  for(unsigned int i=0; i<nThreads;++i) {
836  tbb::task::spawn( *( new(tbb::task::allocate_root()) InitializeThreadTask(&threadsLeft, waitTask.get())));
837  }
838 
839  waitTask->wait_for_all();
840 
841  }
842  }
843  );
844 
845  iReg.watchPreallocate([this](edm::service::SystemBounds const& iBounds){
846  if (iBounds.maxNumberOfThreads() > moduleListBuffers_.size()) {
847  moduleListBuffers_.resize(iBounds.maxNumberOfThreads());
848  }
849  });
850 
851  if(resetErrHandler_) {
852 
853  // Replace the Root error handler with one that uses the MessageLogger
854  SetErrorHandler(RootErrorHandler);
855  }
856 
857  // Enable automatic Root library loading.
858  if(autoLibraryLoader_) {
859  gInterpreter->SetClassAutoloading(1);
860  }
861 
862  // Set ROOT parameters.
863  TTree::SetMaxTreeSize(kMaxLong64);
864  TH1::AddDirectory(kFALSE);
865  //G__SetCatchException(0);
866 
867  // Set custom streamers
869 
870  // Load the library containing dictionaries for std:: classes, if not already loaded.
871  if (!hasDictionary(typeid(std::vector<std::vector<unsigned int> >))) {
872  TypeWithDict::byName("std::vector<std::vector<unsigned int> >");
873  }
874 
875  int debugLevel = pset.getUntrackedParameter<int>("DebugLevel");
876  if(debugLevel >0) {
877  gDebug = debugLevel;
878  }
879 
880  // Enable Root implicit multi-threading
881  bool imt = pset.getUntrackedParameter<bool>("EnableIMT");
882  if (imt) ROOT::EnableImplicitMT();
883  }
884 
886  // close all open ROOT files
887  TIter iter(gROOT->GetListOfFiles());
888  TObject *obj = nullptr;
889  while(nullptr != (obj = iter.Next())) {
890  TFile* f = dynamic_cast<TFile*>(obj);
891  if(f) {
892  // We get a new iterator each time,
893  // because closing a file can invalidate the iterator
894  f->Close();
895  iter = TIter(gROOT->GetListOfFiles());
896  }
897  }
898  }
899 
901  //Tell Root we want to be multi-threaded
902  TThread::Initialize();
903  //When threading, also have to keep ROOT from logging all TObjects into a list
904  TObject::SetObjectStat(false);
905 
906  //Have to avoid having Streamers modify themselves after they have been used
907  TVirtualStreamerInfo::Optimize(false);
908  }
909 
911  localInitializeThisThreadForUse();
912  }
913 
916  desc.setComment("Centralized interface to ROOT.");
917  desc.addUntracked<bool>("UnloadRootSigHandler", false)
918  ->setComment("If True, signals are handled by this service, rather than by ROOT.");
919  desc.addUntracked<bool>("ResetRootErrHandler", true)
920  ->setComment("If True, ROOT messages (e.g. errors, warnings) are handled by this service, rather than by ROOT.");
921  desc.addUntracked<bool>("AutoLibraryLoader", true)
922  ->setComment("If True, enables automatic loading of data dictionaries.");
923  desc.addUntracked<bool>("LoadAllDictionaries",false)
924  ->setComment("If True, loads all ROOT dictionaries.");
925  desc.addUntracked<bool>("EnableIMT",false)
926  ->setComment("If True, calls ROOT::EnableImplicitMT().");
927  desc.addUntracked<bool>("AbortOnSignal",true)
928  ->setComment("If True, do an abort when a signal occurs that causes a crash. If False, ROOT will do an exit which attempts to do a clean shutdown.");
929  desc.addUntracked<int>("DebugLevel",0)
930  ->setComment("Sets ROOT's gDebug value.");
931  desc.addUntracked<int>("StackTracePauseTime", 300)
932  ->setComment("Seconds to pause other threads during stack trace.");
933  descriptions.add("InitRootHandlers", desc);
934  }
935 
936  char *const *
938  return pstackArgv_;
939  }
940 
941  void
943  s_ignoreWarnings =false;
944  }
945 
946  void
948  s_ignoreWarnings = true;
949  }
950 
951  void
953  {
954  if(helperThread_) {
955  //Another InitRootHandlers was initialized in this job, possibly
956  // because multiple EventProcessors are being used.
957  //In that case, we are already all setup
958  return;
959  }
960  if (snprintf(pidString_, pidStringLength_-1, "gdb -quiet -p %d 2>&1 <<EOF |\n"
961  "set width 0\n"
962  "set height 0\n"
963  "set pagination no\n"
964  "thread apply all bt\n"
965  "EOF\n"
966  "/bin/sed -n -e 's/^\\((gdb) \\)*//' -e '/^#/p' -e '/^Thread/p'", getpid()) >= pidStringLength_)
967  {
968  std::ostringstream sstr;
969  sstr << "Unable to pre-allocate stacktrace handler information";
970  edm::Exception except(edm::errors::OtherCMS, sstr.str());
971  throw except;
972  }
973 
974  // These are initialized to -1; harmless to close an invalid FD.
975  // If this is called post-fork, we don't want to be communicating on
976  // these FDs as they are used internally by the parent.
977  close(childToParent_[0]);
978  close(childToParent_[1]);
979  childToParent_[0] = -1; childToParent_[1] = -1;
980  close(parentToChild_[0]);
981  close(parentToChild_[1]);
982  parentToChild_[0] = -1; parentToChild_[1] = -1;
983 
984  if (-1 == pipe2(childToParent_, O_CLOEXEC))
985  {
986  std::ostringstream sstr;
987  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
988  edm::Exception except(edm::errors::OtherCMS, sstr.str());
989  throw except;
990  }
991 
992  if (-1 == pipe2(parentToChild_, O_CLOEXEC))
993  {
994  close(childToParent_[0]); close(childToParent_[1]);
995  childToParent_[0] = -1; childToParent_[1] = -1;
996  std::ostringstream sstr;
997  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
998  edm::Exception except(edm::errors::OtherCMS, sstr.str());
999  throw except;
1000  }
1001 
1002  helperThread_.reset(new std::thread(stacktraceHelperThread));
1003  helperThread_->detach();
1004  }
1005 
1006  } // end of namespace service
1007 } // end of namespace edm
1008 
1012 
size
Write out results.
unsigned int maxNumberOfThreads() const
Definition: SystemBounds.h:46
T getUntrackedParameter(std::string const &, T const &) const
virtual void enableWarnings_() override
static void cmssw_stacktrace_fork()
#define DEFINE_FWK_SERVICE_MAKER(concrete, maker)
Definition: ServiceMaker.h:117
double seconds()
void watchPreallocate(Preallocate::slot_type const &iSlot)
static char *const pstackArgv_[]
ParameterDescriptionBase * addUntracked(U const &iLabel, T const &value)
def destroy(e)
Definition: pyrootRender.py:13
bool isProcessWideService(TFileService const *)
Definition: TFileService.h:99
static ModuleCallingContext const * getCurrentModuleOnThread()
void setRefCoreStreamer(bool resetAll=false)
std::vector< Variable::Flags > flags
Definition: MVATrainer.cc:135
#define NULL
Definition: scimark2.h:8
void installCustomHandler(int signum, CFUNC func)
std::shared_ptr< const void > sigSegvHandler_
#define constexpr
std::shared_ptr< const void > sigBusHandler_
static TypeWithDict byName(std::string const &name)
Definition: TypeWithDict.cc:60
static std::atomic< std::size_t > doneModules_
static const ThreadTracker::Container_type & threadIDs()
void cachePidInfoHandler(unsigned int, unsigned int)
std::atomic< unsigned int > * threadsLeft_
void setComment(std::string const &value)
std::string moduleName(Provenance const &provenance)
Definition: Provenance.cc:27
static std::atomic< std::size_t > nextModule_
static char pidString_[pidStringLength_]
static ThreadTracker threadTracker_
edm::serviceregistry::AllArgsMaker< edm::RootHandlers, InitRootHandlers > RootHandlersMaker
static char *const * getPstackArgv()
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventID const &, edm::Timestamp const & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
Definition: Activities.doc:12
std::shared_ptr< const void > sigIllHandler_
virtual void initializeThisThreadForUse() override
virtual void ignoreWarnings_() override
std::shared_ptr< const void > sigTermHandler_
void addAdditionalInfo(std::string const &info)
Definition: Exception.cc:235
double f[11][100]
int cmssw_stacktrace(void *)
tbb::task * waitTask_
static std::unique_ptr< std::thread > helperThread_
static std::vector< std::array< char, moduleBufferSize > > moduleListBuffers_
static char pstackName[]
InitRootHandlers(ParameterSet const &pset, ActivityRegistry &iReg)
tbb::concurrent_unordered_set< pthread_t > Container_type
void add(std::string const &label, ParameterSetDescription const &psetDescription)
TEveGeoShape * clone(const TEveElement *element, TEveElement *parent)
Definition: eve_macros.cc:135
void watchPostForkReacquireResources(PostForkReacquireResources::slot_type const &iSlot)
virtual void willBeUsingThreads() override
static char dashC[]
HLT enums.
char data[epos_bytes_allocation]
Definition: EPOS_Wrapper.h:82
static void fillDescriptions(ConfigurationDescriptions &descriptions)
SeverityLevel
bool hasDictionary(std::type_info const &)
#define O_NONBLOCK
Definition: SysFile.h:21
def write(self, setup)