CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
InitRootHandlers.cc
Go to the documentation of this file.
2 
4 
19 
20 #include "tbb/task.h"
21 #include "tbb/task_scheduler_observer.h"
22 #include "tbb/concurrent_unordered_set.h"
23 #include <thread>
24 #include <sys/wait.h>
25 #include <sstream>
26 #include <string.h>
27 #include <poll.h>
28 #include <atomic>
29 
30 // WORKAROUND: At CERN, execv is replaced with a non-async-signal safe
31 // version. This can break our stack trace printer. Avoid this by
32 // invoking the syscall directly.
33 #ifdef __linux__
34 #include <syscall.h>
35 #endif
36 
37 #include "TROOT.h"
38 #include "TError.h"
39 #include "TFile.h"
40 #include "TInterpreter.h"
41 #include "TH1.h"
42 #include "TSystem.h"
43 #include "TUnixSystem.h"
44 #include "TTree.h"
45 #include "TVirtualStreamerInfo.h"
46 
47 #include "TThread.h"
48 #include "TClassTable.h"
49 
50 #include <memory>
51 
52 namespace {
53  // size of static buffer allocated for listing module names following a
54  // stacktrace abort
55  constexpr std::size_t moduleBufferSize = 128;
56 }
57 
58 namespace edm {
60  class ParameterSet;
61  class ActivityRegistry;
62 
63  namespace service {
// Service class derived from RootHandlers. Its constructor (defined later in
// this file) resets ROOT's signal handling, optionally installs the fatal-signal
// handlers that print a stack trace, and optionally replaces ROOT's error handler.
// NOTE(review): several member declarations referenced elsewhere in the file
// (e.g. threadIDs(), pidString_, threadTracker_) are not visible in this listing.
64  class InitRootHandlers : public RootHandlers {
65 
66  friend int cmssw_stacktrace(void *);
67 
68  public:
 // TBB scheduler observer that records the pthread id of every thread that
 // enters the scheduler, so the signal handlers know which threads to signal.
69  class ThreadTracker : public tbb::task_scheduler_observer {
70  public:
71  typedef tbb::concurrent_unordered_set<pthread_t> Container_type;
72 
 // Start observing as soon as the tracker is constructed.
73  ThreadTracker() : tbb::task_scheduler_observer() {
74  observe(true);
75  }
 // Called when a thread enters the scheduler; remembers that thread's id.
76  void on_scheduler_entry(bool) {
77  // ensure thread local has been allocated; not necessary on Linux with
78  // the current cmsRun linkage, but could be an issue if the platform
79  // or linkage leads to "lazy" allocation of the thread local. By
80  // referencing it here we make sure it has been allocated and can be
81  // accessed safely from our signal handler.
83  threadIDs_.insert(pthread_self());
84  }
 // Read-only access to the set of recorded thread ids.
85  const Container_type& IDs() { return threadIDs_; }
86 
87  private:
89  };
90 
91  explicit InitRootHandlers(ParameterSet const& pset, ActivityRegistry& iReg);
92  virtual ~InitRootHandlers();
93 
94  static void fillDescriptions(ConfigurationDescriptions& descriptions);
 // Asks the helper thread (over the pipes below) to produce a stack trace.
95  static void stacktraceFromThread();
 // Value of the "StackTracePauseTime" configuration parameter.
97  static int stackTracePause() { return stackTracePause_; }
98 
 // Fixed-size text buffers filled by the pause signal handler with
 // "\nModule: <name>"; resized in a preallocate callback to the maximum
 // number of threads.
99  static std::vector<std::array<char,moduleBufferSize>> moduleListBuffers_;
 // Counters used by the signal handlers while filling / printing the buffers above.
100  static std::atomic<std::size_t> nextModule_, doneModules_;
101  private:
102  static char *const *getPstackArgv();
103  virtual void enableWarnings_() override;
104  virtual void ignoreWarnings_() override;
105  virtual void willBeUsingThreads() override;
106  virtual void initializeThisThreadForUse() override;
107 
 // Adapter that ignores its two arguments and simply calls cachePidInfo().
108  void cachePidInfoHandler(unsigned int, unsigned int) {cachePidInfo();}
 // Pre-formats the gdb command line, (re)creates the pipes and starts the helper thread.
109  void cachePidInfo();
110  static void stacktraceHelperThread();
111 
 // Capacity used when formatting the gdb command line in cachePidInfo().
112  static const int pidStringLength_ = 200;
114  static char * const pstackArgv_[];
 // Pipe pairs connecting the signal-handling thread and the helper thread;
 // both are initialized to {-1, -1} until cachePidInfo() creates them.
115  static int parentToChild_[2];
116  static int childToParent_[2];
117  static std::unique_ptr<std::thread> helperThread_;
119  static int stackTracePause_;
120 
 // Sentinels holding a null pointer and a custom deleter; when each is
 // released the deleter installs sig_abort for the corresponding signal.
125  std::shared_ptr<const void> sigBusHandler_;
126  std::shared_ptr<const void> sigSegvHandler_;
127  std::shared_ptr<const void> sigIllHandler_;
128  std::shared_ptr<const void> sigTermHandler_;
129  };
130 
131  inline
133  return true;
134  }
135 
136  } // end of namespace service
137 } // end of namespace edm
138 
139 namespace edm {
140  namespace service {
141  int cmssw_stacktrace(void *);
142  }
143 }
144 
145 namespace {
// Severity classification used by RootErrorHandlerImpl after translating
// ROOT's integer error level; listed from least to most severe.
146  enum class SeverityLevel {
147  kInfo,
148  kWarning,
149  kError,
150  kSysError,
151  kFatal
152  };
153 
 // Per-thread switch: when true, ROOT warnings are reported as kInfo
 // instead of kWarning.
154  static thread_local bool s_ignoreWarnings = false;
155 
 // When true, every ROOT message is reported as kInfo before the message-specific
 // overrides are applied. NOTE(review): nothing in the visible part of this
 // file ever sets it to true.
156  static bool s_ignoreEverything = false;
157 
158  void RootErrorHandlerImpl(int level, char const* location, char const* message) {
159 
160  bool die = false;
161 
162  // Translate ROOT severity level to MessageLogger severity level
163 
164  SeverityLevel el_severity = SeverityLevel::kInfo;
165 
166  if (level >= kFatal) {
167  el_severity = SeverityLevel::kFatal;
168  } else if (level >= kSysError) {
169  el_severity = SeverityLevel::kSysError;
170  } else if (level >= kError) {
171  el_severity = SeverityLevel::kError;
172  } else if (level >= kWarning) {
173  el_severity = s_ignoreWarnings ? SeverityLevel::kInfo : SeverityLevel::kWarning;
174  }
175 
176  if(s_ignoreEverything) {
177  el_severity = SeverityLevel::kInfo;
178  }
179 
180  // Adapt C-strings to std::strings
181  // Arrange to report the error location as furnished by Root
182 
183  std::string el_location = "@SUB=?";
184  if (location != 0) el_location = std::string("@SUB=")+std::string(location);
185 
186  std::string el_message = "?";
187  if (message != 0) el_message = message;
188 
189  // Try to create a meaningful id string using knowledge of ROOT error messages
190  //
191  // id == "ROOT-ClassName" where ClassName is the affected class
192  // else "ROOT/ClassName" where ClassName is the error-declaring class
193  // else "ROOT"
194 
195  std::string el_identifier = "ROOT";
196 
197  std::string precursor("class ");
198  size_t index1 = el_message.find(precursor);
199  if (index1 != std::string::npos) {
200  size_t index2 = index1 + precursor.length();
201  size_t index3 = el_message.find_first_of(" :", index2);
202  if (index3 != std::string::npos) {
203  size_t substrlen = index3-index2;
204  el_identifier += "-";
205  el_identifier += el_message.substr(index2,substrlen);
206  }
207  } else {
208  index1 = el_location.find("::");
209  if (index1 != std::string::npos) {
210  el_identifier += "/";
211  el_identifier += el_location.substr(0, index1);
212  }
213  }
214 
215  // Intercept some messages and upgrade the severity
216 
217  if ((el_location.find("TBranchElement::Fill") != std::string::npos)
218  && (el_message.find("fill branch") != std::string::npos)
219  && (el_message.find("address") != std::string::npos)
220  && (el_message.find("not set") != std::string::npos)) {
221  el_severity = SeverityLevel::kFatal;
222  }
223 
224  if ((el_message.find("Tree branches") != std::string::npos)
225  && (el_message.find("different numbers of entries") != std::string::npos)) {
226  el_severity = SeverityLevel::kFatal;
227  }
228 
229 
230  // Intercept some messages and downgrade the severity
231 
232  if ((el_message.find("no dictionary for class") != std::string::npos) ||
233  (el_message.find("already in TClassTable") != std::string::npos) ||
234  (el_message.find("matrix not positive definite") != std::string::npos) ||
235  (el_message.find("not a TStreamerInfo object") != std::string::npos) ||
236  (el_message.find("Problems declaring payload") != std::string::npos) ||
237  (el_message.find("Announced number of args different from the real number of argument passed") != std::string::npos) || // Always printed if gDebug>0 - regardless of whether warning message is real.
238  (el_location.find("Fit") != std::string::npos) ||
239  (el_location.find("TDecompChol::Solve") != std::string::npos) ||
240  (el_location.find("THistPainter::PaintInit") != std::string::npos) ||
241  (el_location.find("TUnixSystem::SetDisplay") != std::string::npos) ||
242  (el_location.find("TGClient::GetFontByName") != std::string::npos) ||
243  (el_location.find("Inverter::Dinv") != std::string::npos) ||
244  (el_message.find("nbins is <=0 - set to nbins = 1") != std::string::npos) ||
245  (el_message.find("nbinsy is <=0 - set to nbinsy = 1") != std::string::npos) ||
246  (level < kError and
247  (el_location.find("CINTTypedefBuilder::Setup")!= std::string::npos) and
248  (el_message.find("possible entries are in use!") != std::string::npos))) {
249  el_severity = SeverityLevel::kInfo;
250  }
251 
252  if (el_severity == SeverityLevel::kInfo) {
253  // Don't throw if the message is just informational.
254  die = false;
255  } else {
256  die = true;
257  }
258 
259  // Feed the message to the MessageLogger and let it choose to suppress or not.
260 
261  // Root has declared a fatal error. Throw an EDMException unless the
262  // message corresponds to a pending signal. In that case, do not throw
263  // but let the OS deal with the signal in the usual way.
264  if (die && (el_location != std::string("@SUB=TUnixSystem::DispatchSignals"))) {
265  std::ostringstream sstr;
266  sstr << "Fatal Root Error: " << el_location << "\n" << el_message << '\n';
267  edm::Exception except(edm::errors::FatalRootError, sstr.str());
268  except.addAdditionalInfo(except.message());
269  except.clearMessage();
270  throw except;
271 
272  }
273 
274  // Typically, we get here only for informational messages,
275  // but we leave the other code in just in case we change
276  // the criteria for throwing.
277  if (el_severity == SeverityLevel::kFatal) {
278  edm::LogError("Root_Fatal") << el_location << el_message;
279  } else if (el_severity == SeverityLevel::kSysError) {
280  edm::LogError("Root_Severe") << el_location << el_message;
281  } else if (el_severity == SeverityLevel::kError) {
282  edm::LogError("Root_Error") << el_location << el_message;
283  } else if (el_severity == SeverityLevel::kWarning) {
284  edm::LogWarning("Root_Warning") << el_location << el_message ;
285  } else if (el_severity == SeverityLevel::kInfo) {
286  edm::LogInfo("Root_Information") << el_location << el_message ;
287  }
288  }
289 
290  void RootErrorHandler(int level, bool, char const* location, char const* message) {
291  RootErrorHandlerImpl(level, location, message);
292  }
293 
294  extern "C" {
295 
// Write the complete NUL-terminated string `text` to `fd`, retrying after
// EINTR and after short writes.
// Returns 0 once everything has been written, or -errno on any other failure.
static int full_write(int fd, const char *text)
{
  const char *cursor = text;
  for (size_t remaining = strlen(text); remaining != 0; )
  {
    const ssize_t rc = write(fd, cursor, remaining);
    if (rc == -1)
    {
      if (errno != EINTR) {return -errno;}
      continue;  // interrupted before anything was written: try again
    }
    remaining -= rc;
    cursor += rc;
  }
  return 0;
}
314 
// Read exactly `len` bytes from `fd` into `inbuf`.
//
//   timeout_s < 0  : no deadline; the descriptor's flags are left untouched.
//   timeout_s >= 0 : the descriptor is temporarily switched to non-blocking
//                    mode (restored before returning) and poll() is used to
//                    wait until the deadline.
//
// Returns 0 on success, -ETIMEDOUT when the deadline expires, -EPIPE when
// end-of-file is reached before `len` bytes arrived (the writer closed its
// end; without this check the loop would spin forever because read() keeps
// returning 0), or -errno for any other failure.
static int full_read(int fd, char *inbuf, size_t len, int timeout_s=-1)
{
  char *buf = inbuf;
  size_t count = len;
  const std::chrono::time_point<std::chrono::steady_clock> end_time =
      std::chrono::steady_clock::now() + std::chrono::seconds(timeout_s);
  int flags;
  if (timeout_s < 0)
  {
    flags = O_NONBLOCK; // Prevents us from trying to set / restore flags later.
  }
  else if ((-1 == (flags = fcntl(fd, F_GETFL))))
  {
    return -errno;
  }
  const bool must_restore = (flags & O_NONBLOCK) != O_NONBLOCK;
  if (must_restore && (-1 == fcntl(fd, F_SETFL, flags | O_NONBLOCK)))
  {
    return -errno;
  }

  // Put the original descriptor flags back (if we changed them) and pass the
  // status through. The status is evaluated by the caller before fcntl() can
  // disturb errno.
  const auto finish = [fd, flags, must_restore](int status) {
    if (must_restore)
    {
      fcntl(fd, F_SETFL, flags);
    }
    return status;
  };

  while (count)
  {
    if (timeout_s >= 0)
    {
      struct pollfd poll_info{fd, POLLIN, 0};
      int ms_remaining = std::chrono::duration_cast<std::chrono::milliseconds>(end_time-std::chrono::steady_clock::now()).count();
      if (ms_remaining > 0)
      {
        if (poll(&poll_info, 1, ms_remaining) == 0)
        {
          return finish(-ETIMEDOUT);
        }
      }
      else if (ms_remaining < 0)
      {
        return finish(-ETIMEDOUT);
      }
    }
    const ssize_t complete = read(fd, buf, count);
    if (complete == -1)
    {
      if ((errno == EINTR) || (errno == EAGAIN) || (errno == EWOULDBLOCK)) {continue;}
      return finish(-errno);
    }
    if (complete == 0)
    {
      // End of file: no more data will ever arrive on this descriptor.
      return finish(-EPIPE);
    }
    count -= complete;
    buf += complete;
  }
  return finish(0);
}
386 
387  static int full_cerr_write(const char *text)
388  {
389  return full_write(2, text);
390  }
391 
// These signals are only used inside the stacktrace signal handler,
// so common signals can be used. They do have to be different, since
// we do not set SA_NODEFER, and RESUME must be a signal that will
// cause sleep() to return early.
#if defined(SIGRTMAX)
#define PAUSE_SIGNAL SIGRTMAX
// Parenthesized so the macro stays a single value inside larger expressions.
#define RESUME_SIGNAL (SIGRTMAX-1)
#elif defined(SIGINFO) // macOS/BSD
#define PAUSE_SIGNAL SIGINFO
#define RESUME_SIGNAL SIGALRM
#endif
403 
// Intentionally empty. Its only purpose is to be a handled signal, so that
// delivery of the resume signal interrupts the sleep() in the pause handler.
void sig_resume_handler(int /*sig*/, siginfo_t* /*info*/, void* /*context*/) {}
406 
407  // pause a thread so that a (slow) stacktrace will capture the current state
// Installed for PAUSE_SIGNAL by sig_dostack_then_abort. After the pause it
// records "\nModule: <name>" (or "\nModule: none") in one of the
// InitRootHandlers::moduleListBuffers_ entries.
// NOTE(review): several lines of this function (the sleep call, the buffer
// index computation and the null check on the current module) are missing
// from this listing, so the visible braces do not balance.
408  void sig_pause_for_stacktrace(int sig, siginfo_t*, void*) {
409  using namespace edm::service;
410 
411 #ifdef RESUME_SIGNAL
 // Make sure this thread can receive the resume signal while it is paused.
412  sigset_t sigset;
413  sigemptyset(&sigset);
414  sigaddset(&sigset, RESUME_SIGNAL);
415  pthread_sigmask(SIG_UNBLOCK, &sigset, 0);
416 #endif
417  // sleep is interrupted by a handled delivery of the resume signal
419 
 // Only touch the shared counters when the atomics are lock-free.
420  if (InitRootHandlers::doneModules_.is_lock_free() && InitRootHandlers::nextModule_.is_lock_free()) {
423  char* buff = InitRootHandlers::moduleListBuffers_[i].data();
424 
 // strlcpy/strlcat bound every write to the fixed buffer size.
425  strlcpy(buff, "\nModule: ", moduleBufferSize);
427  strlcat(buff, edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(), moduleBufferSize);
428  } else {
429  strlcat(buff, "none", moduleBufferSize);
430  }
432  }
433  }
434  }
435 
// Fatal-signal handler installed by the InitRootHandlers constructor for
// SIGBUS, SIGSEGV, SIGILL and SIGTERM. In order it:
//   1. pauses the other tracked threads (when a pause time is configured),
//   2. prints which signal occurred,
//   3. resumes the paused threads so they can record their current module,
//   4. prints the module of the crashing thread and the recorded modules,
//   5. re-raises the signal with the default disposition (or aborts).
// NOTE(review): two lines are missing from this listing (after the first
// banner, and the null check on the current module before the first strlcat).
436  void sig_dostack_then_abort(int sig, siginfo_t*, void*) {
437  using namespace edm::service;
438 
439  const auto& tids = InitRootHandlers::threadIDs();
440 
441  const auto self = pthread_self();
442 #ifdef PAUSE_SIGNAL
443  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
444  // install the "pause" handler
 // NOTE(review): sa_sigaction is assigned but sa_flags does not include
 // SA_SIGINFO; confirm this is intended (the handlers ignore the extra
 // arguments).
445  struct sigaction act;
446  act.sa_sigaction = sig_pause_for_stacktrace;
447  act.sa_flags = 0;
448  sigemptyset(&act.sa_mask);
449  sigaction(PAUSE_SIGNAL, &act, NULL);
450 
451  // unblock pause signal globally, resume is unblocked in the pause handler
452  sigset_t pausesigset;
453  sigemptyset(&pausesigset);
454  sigaddset(&pausesigset, PAUSE_SIGNAL);
455  sigprocmask(SIG_UNBLOCK, &pausesigset, 0);
456 
457  // send a pause signal to all CMSSW/TBB threads other than self
458  for (auto id : tids) {
459  if (self != id) {
460  pthread_kill(id, PAUSE_SIGNAL);
461  }
462  }
463 
464 #ifdef RESUME_SIGNAL
465  // install the "resume" handler
466  act.sa_sigaction = sig_resume_handler;
467  sigaction(RESUME_SIGNAL, &act, NULL);
468 #endif
469  }
470 #endif
471 
 // Human-readable name of the signal for the banner messages.
472  const char* signalname = "unknown";
473  switch (sig) {
474  case SIGBUS:
475  {
476  signalname = "bus error";
477  break;
478  }
479  case SIGSEGV:
480  {
481  signalname = "segmentation violation";
482  break;
483  }
484  case SIGILL:
485  {
486  signalname = "illegal instruction";
487  break;
488  }
489  case SIGTERM:
490  {
491  signalname = "external termination request";
492  break;
493  }
494  default:
495  break;
496  }
497  full_cerr_write("\n\nA fatal system signal has occurred: ");
498  full_cerr_write(signalname);
499  full_cerr_write("\nThe following is the call stack containing the origin of the signal.\n\n");
500 
502 
503  // resume the signal handlers to store the current module; we are not guaranteed they
504  // will have time to store their modules, so there is a race condition; this could be
505  // avoided by storing the module information before sleeping, a change that may be
506  // made when we're convinced accessing the thread-local current module is safe.
507 #ifdef RESUME_SIGNAL
 // Count the threads that were successfully signalled; used below as the
 // number of module records to wait for.
508  std::size_t notified = 0;
509  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
510  for (auto id : tids) {
511  if (self != id) {
512  if (pthread_kill(id, RESUME_SIGNAL) == 0) ++notified;
513  }
514  }
515  }
516 #endif
517 
518  full_cerr_write("\nCurrent Modules:\n");
519 
520  // Checking tids.count(self) ensures that we only try to access the current module in
521  // CMSSW/TBB threads. Those threads access the thread-local current module at the same
522  // time the thread is registered, so any lazy allocation will have been done at that
523  // point. Not necessary on Linux with the current cmsRun linkage, as the thread-local
524  // is allocated at exec time, not lazily.
525  if (tids.count(self) > 0) {
526  char buff[moduleBufferSize] = "\nModule: ";
528  strlcat(buff, edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(), moduleBufferSize);
529  } else {
530  strlcat(buff, "none", moduleBufferSize);
531  }
532  strlcat(buff, " (crashed)", moduleBufferSize);
533  full_cerr_write(buff);
534  } else {
535  full_cerr_write("\nModule: non-CMSSW (crashed)");
536  }
537 
538 #ifdef PAUSE_SIGNAL
539  // wait a short interval for the paused threads to resume and fill in their module
540  // information, then print
541  if (InitRootHandlers::doneModules_.is_lock_free()) {
 // At most 999 sleeps of 1000 ns each before giving up on late threads.
542  int spincount = 0;
543  timespec t = { 0, 1000 };
544  while (++spincount < 1000 && InitRootHandlers::doneModules_ < notified) { nanosleep(&t, nullptr); }
545  for (std::size_t i = 0; i < InitRootHandlers::doneModules_; ++i) {
546  full_cerr_write(InitRootHandlers::moduleListBuffers_[i].data());
547  }
548  }
549 #endif
550 
551  full_cerr_write("\n\nA fatal system signal has occurred: ");
552  full_cerr_write(signalname);
553  full_cerr_write("\n");
554 
555  // For these four known cases, re-raise the signal so get the correct
556  // exit code.
557  if ((sig == SIGILL) || (sig == SIGSEGV) || (sig == SIGBUS) || (sig == SIGTERM))
558  {
559  signal(sig, SIG_DFL);
560  raise(sig);
561  }
562  else
563  {
564  ::abort();
565  }
566  }
567 
568  void sig_abort(int sig, siginfo_t*, void*) {
569  full_cerr_write("\n\nFatal system signal has occurred during exit\n");
570 
571  // re-raise the signal to get the correct exit code
572  signal(sig, SIG_DFL);
573  raise(sig);
574 
575  // shouldn't get here
576  ::sleep(10);
577  ::abort();
578  }
579  }
580 
// Restore the default disposition of the four signals for which this file
// installs custom handlers (SIGILL, SIGSEGV, SIGBUS, SIGTERM).
void set_default_signals() {
  const int handled[] = {SIGILL, SIGSEGV, SIGBUS, SIGTERM};
  for (const int signum : handled) {
    signal(signum, SIG_DFL);
  }
}
587 
588 } // end of unnamed namespace
589 
590 namespace edm {
591  namespace service {
592 
593  /*
594  * We've run into issues where GDB fails to print the thread which calls clone().
595  * To avoid this problem, we have an alternate approach below where the signal handler
596  * only reads/writes to a dedicated thread via pipes. The helper thread does the clone()
597  * invocation; we don't care if that thread is missing from the traceback in this case.
598  */
599  static void cmssw_stacktrace_fork();
600 
602  {
603  int toParent = childToParent_[1];
604  int fromParent = parentToChild_[0];
605  char buf[2]; buf[1] = '\0';
606 
607  while(true)
608  {
609  int result = full_read(fromParent, buf, 1);
610  if (result < 0)
611  {
612  // To avoid a deadlock (this function is NOT re-entrant), reset signals
613  // We never set them back to the CMSSW handler because we assume the parent
614  // thread will abort for us.
615  set_default_signals();
616  close(toParent);
617  full_cerr_write("\n\nTraceback helper thread failed to read from parent: ");
618  full_cerr_write(strerror(-result));
619  full_cerr_write("\n");
620  ::abort();
621  }
622  if (buf[0] == '1')
623  {
624  set_default_signals();
626  full_write(toParent, buf);
627  }
628  else if (buf[0] == '2')
629  {
630  // We have just finished forking. Reload the file descriptors for thread
631  // communication.
632  close(toParent);
633  close(fromParent);
634  toParent = childToParent_[1];
635  fromParent = parentToChild_[0];
636  }
637  else if (buf[0] == '3')
638  {
639  break;
640  }
641  else
642  {
643  set_default_signals();
644  close(toParent);
645  full_cerr_write("\n\nTraceback helper thread got unknown command from parent: ");
646  full_cerr_write(buf);
647  full_cerr_write("\n");
648  ::abort();
649  }
650  }
651  }
652 
654  {
655  int result = full_write(parentToChild_[1], "1");
656  if (result < 0)
657  {
658  full_cerr_write("\n\nAttempt to request stacktrace failed: ");
659  full_cerr_write(strerror(-result));
660  full_cerr_write("\n");
661  return;
662  }
663  char buf[2]; buf[1] = '\0';
664  if ((result = full_read(childToParent_[0], buf, 1, 5*60)) < 0)
665  {
666  full_cerr_write("\n\nWaiting for stacktrace completion failed: ");
667  if (result == -ETIMEDOUT)
668  {
669  full_cerr_write("timed out waiting for GDB to complete.");
670  }
671  else
672  {
673  full_cerr_write(strerror(-result));
674  }
675  full_cerr_write("\n");
676  return;
677  }
678  }
679 
681  {
682  char child_stack[4*1024];
683  char *child_stack_ptr = child_stack + 4*1024;
684  // On Linux, we currently use jemalloc. This registers pthread_atfork handlers; these
685  // handlers are *not* async-signal safe. Hence, a deadlock is possible if we invoke
686  // fork() from our signal handlers. Accordingly, we use clone (not POSIX, but AS-safe)
687  // as that is closer to the 'raw metal' syscall and avoids pthread_atfork handlers.
688  int pid =
689 #ifdef __linux__
690  clone(edm::service::cmssw_stacktrace, child_stack_ptr, CLONE_VM|CLONE_FS|SIGCHLD, nullptr);
691 #else
692  fork();
693  if (child_stack_ptr) {} // Suppress 'unused variable' warning on non-Linux
694  if (pid == 0) {edm::service::cmssw_stacktrace(nullptr); ::abort();}
695 #endif
696  if (pid == -1)
697  {
698  full_cerr_write("(Attempt to perform stack dump failed.)\n");
699  }
700  else
701  {
702  int status;
703  if (waitpid(pid, &status, 0) == -1)
704  {
705  full_cerr_write("(Failed to wait on stack dump output.)\n");
706  }
707  if (status)
708  {
709  full_cerr_write("(GDB stack trace failed unexpectedly)\n");
710  }
711  }
712  }
713 
// Entry point executed in the child created by cmssw_stacktrace_fork():
// replaces the process image with /bin/sh using `argv`, and aborts if the
// exec call returns. The argument is unused.
// NOTE(review): the declaration of `argv` is missing from this listing;
// presumably it comes from InitRootHandlers::getPstackArgv() — confirm.
714  int cmssw_stacktrace(void * /*arg*/)
715  {
717  // NOTE: this is NOT async-signal-safe at CERN's lxplus service.
718  // CERN uses LD_PRELOAD to replace execv with a function from libsnoopy which
719  // calls dlsym.
720 #ifdef __linux__
 // Invoke the execve system call directly instead of going through execv.
721  syscall(SYS_execve, "/bin/sh", argv, __environ);
722 #else
723  execv("/bin/sh", argv);
724 #endif
 // Only reached when the exec call failed.
725  ::abort();
726  return 1;
727  }
728 
729  namespace {
730 
731  void localInitializeThisThreadForUse() {
732  static thread_local TThread guard;
733  }
734 
735  class InitializeThreadTask : public tbb::task {
736  public:
737  InitializeThreadTask(std::atomic<unsigned int>* counter,
738  tbb::task* waitingTask):
739  threadsLeft_(counter),
740  waitTask_(waitingTask) {}
741 
742  tbb::task* execute() override {
743  //For each tbb thread, setup the initialization
744  // required by ROOT and then wait until all
745  // threads have done so in order to guarantee the all get setup
746 
747  localInitializeThisThreadForUse();
748  (*threadsLeft_)--;
749  while(0 != threadsLeft_->load());
750  waitTask_->decrement_ref_count();
751  return nullptr;
752  }
753  private:
754  std::atomic<unsigned int>* threadsLeft_;
755  tbb::task* waitTask_;
756  };
757  }
758 
759  static char pstackName[] = "(CMSSW stack trace helper)";
760  static char dashC[] = "-c";
763  int InitRootHandlers::parentToChild_[2] = {-1, -1};
764  int InitRootHandlers::childToParent_[2] = {-1, -1};
765  std::unique_ptr<std::thread> InitRootHandlers::helperThread_;
767  std::vector<std::array<char,moduleBufferSize>> InitRootHandlers::moduleListBuffers_;
768  std::atomic<std::size_t> InitRootHandlers::nextModule_(0), InitRootHandlers::doneModules_(0);
770 
771 
773  : RootHandlers(),
774  unloadSigHandler_(pset.getUntrackedParameter<bool> ("UnloadRootSigHandler")),
775  resetErrHandler_(pset.getUntrackedParameter<bool> ("ResetRootErrHandler")),
776  loadAllDictionaries_(pset.getUntrackedParameter<bool>("LoadAllDictionaries")),
777  autoLibraryLoader_(loadAllDictionaries_ or pset.getUntrackedParameter<bool> ("AutoLibraryLoader"))
778  {
779  stackTracePause_ = pset.getUntrackedParameter<int> ("StackTracePauseTime");
780 
781  if(unloadSigHandler_) {
782  // Deactivate all the Root signal handlers and restore the system defaults
783  gSystem->ResetSignal(kSigChild);
784  gSystem->ResetSignal(kSigBus);
785  gSystem->ResetSignal(kSigSegmentationViolation);
786  gSystem->ResetSignal(kSigIllegalInstruction);
787  gSystem->ResetSignal(kSigSystem);
788  gSystem->ResetSignal(kSigPipe);
789  gSystem->ResetSignal(kSigAlarm);
790  gSystem->ResetSignal(kSigUrgent);
791  gSystem->ResetSignal(kSigFloatingException);
792  gSystem->ResetSignal(kSigWindowChanged);
793  } else if(pset.getUntrackedParameter<bool>("AbortOnSignal")){
794  cachePidInfo();
795 
796  //NOTE: ROOT can also be told to abort on these kinds of problems BUT
797  // it requires an TApplication to be instantiated which causes problems
798  gSystem->ResetSignal(kSigBus);
799  gSystem->ResetSignal(kSigSegmentationViolation);
800  gSystem->ResetSignal(kSigIllegalInstruction);
801  installCustomHandler(SIGBUS,sig_dostack_then_abort);
802  sigBusHandler_ = std::shared_ptr<const void>(nullptr,[](void*) {
803  installCustomHandler(SIGBUS,sig_abort);
804  });
805  installCustomHandler(SIGSEGV,sig_dostack_then_abort);
806  sigSegvHandler_ = std::shared_ptr<const void>(nullptr,[](void*) {
807  installCustomHandler(SIGSEGV,sig_abort);
808  });
809  installCustomHandler(SIGILL,sig_dostack_then_abort);
810  sigIllHandler_ = std::shared_ptr<const void>(nullptr,[](void*) {
811  installCustomHandler(SIGILL,sig_abort);
812  });
813  installCustomHandler(SIGTERM,sig_dostack_then_abort);
814  sigTermHandler_ = std::shared_ptr<const void>(nullptr,[](void*) {
815  installCustomHandler(SIGTERM,sig_abort);
816  });
818  }
819 
820  //Initialize each TBB thread so ROOT knows about them
821  iReg.watchPreallocate( [](service::SystemBounds const& iBounds) {
822  auto const nThreads =iBounds.maxNumberOfThreads();
823  if(nThreads > 1) {
824  std::atomic<unsigned int> threadsLeft{nThreads};
825 
826  std::shared_ptr<tbb::empty_task> waitTask{new (tbb::task::allocate_root()) tbb::empty_task{},
827  [](tbb::empty_task* iTask){tbb::task::destroy(*iTask);} };
828 
829  waitTask->set_ref_count(1+nThreads);
830  for(unsigned int i=0; i<nThreads;++i) {
831  tbb::task::spawn( *( new(tbb::task::allocate_root()) InitializeThreadTask(&threadsLeft, waitTask.get())));
832  }
833 
834  waitTask->wait_for_all();
835 
836  }
837  }
838  );
839 
840  iReg.watchPreallocate([this](edm::service::SystemBounds const& iBounds){
841  if (iBounds.maxNumberOfThreads() > moduleListBuffers_.size()) {
842  moduleListBuffers_.resize(iBounds.maxNumberOfThreads());
843  }
844  });
845 
846  if(resetErrHandler_) {
847 
848  // Replace the Root error handler with one that uses the MessageLogger
849  SetErrorHandler(RootErrorHandler);
850  }
851 
852  // Enable automatic Root library loading.
853  if(autoLibraryLoader_) {
854  gInterpreter->SetClassAutoloading(1);
855  }
856 
857  // Set ROOT parameters.
858  TTree::SetMaxTreeSize(kMaxLong64);
859  TH1::AddDirectory(kFALSE);
860  //G__SetCatchException(0);
861 
862  // Set custom streamers
864 
865  // Load the library containing dictionaries for std:: classes, if not already loaded.
866  if (!hasDictionary(typeid(std::vector<std::vector<unsigned int> >))) {
867  TypeWithDict::byName("std::vector<std::vector<unsigned int> >");
868  }
869 
870  int debugLevel = pset.getUntrackedParameter<int>("DebugLevel");
871  if(debugLevel >0) {
872  gDebug = debugLevel;
873  }
874  }
875 
877  // close all open ROOT files
878  TIter iter(gROOT->GetListOfFiles());
879  TObject *obj = nullptr;
880  while(nullptr != (obj = iter.Next())) {
881  TFile* f = dynamic_cast<TFile*>(obj);
882  if(f) {
883  // We get a new iterator each time,
884  // because closing a file can invalidate the iterator
885  f->Close();
886  iter = TIter(gROOT->GetListOfFiles());
887  }
888  }
889  }
890 
892  //Tell Root we want to be multi-threaded
893  TThread::Initialize();
894  //When threading, also have to keep ROOT from logging all TObjects into a list
895  TObject::SetObjectStat(false);
896 
897  //Have to avoid having Streamers modify themselves after they have been used
898  TVirtualStreamerInfo::Optimize(false);
899  }
900 
902  localInitializeThisThreadForUse();
903  }
904 
907  desc.setComment("Centralized interface to ROOT.");
908  desc.addUntracked<bool>("UnloadRootSigHandler", false)
909  ->setComment("If True, signals are handled by this service, rather than by ROOT.");
910  desc.addUntracked<bool>("ResetRootErrHandler", true)
911  ->setComment("If True, ROOT messages (e.g. errors, warnings) are handled by this service, rather than by ROOT.");
912  desc.addUntracked<bool>("AutoLibraryLoader", true)
913  ->setComment("If True, enables automatic loading of data dictionaries.");
914  desc.addUntracked<bool>("LoadAllDictionaries",false)
915  ->setComment("If True, loads all ROOT dictionaries.");
916  desc.addUntracked<bool>("AbortOnSignal",true)
917  ->setComment("If True, do an abort when a signal occurs that causes a crash. If False, ROOT will do an exit which attempts to do a clean shutdown.");
918  desc.addUntracked<int>("DebugLevel",0)
919  ->setComment("Sets ROOT's gDebug value.");
920  desc.addUntracked<int>("StackTracePauseTime", 300)
921  ->setComment("Seconds to pause other threads during stack trace.");
922  descriptions.add("InitRootHandlers", desc);
923  }
924 
925  char *const *
927  return pstackArgv_;
928  }
929 
930  void
932  s_ignoreWarnings =false;
933  }
934 
935  void
937  s_ignoreWarnings = true;
938  }
939 
940  void
942  {
943  if (snprintf(pidString_, pidStringLength_-1, "gdb -quiet -p %d 2>&1 <<EOF |\n"
944  "set width 0\n"
945  "set height 0\n"
946  "set pagination no\n"
947  "thread apply all bt\n"
948  "EOF\n"
949  "/bin/sed -n -e 's/^\\((gdb) \\)*//' -e '/^#/p' -e '/^Thread/p'", getpid()) >= pidStringLength_)
950  {
951  std::ostringstream sstr;
952  sstr << "Unable to pre-allocate stacktrace handler information";
953  edm::Exception except(edm::errors::OtherCMS, sstr.str());
954  throw except;
955  }
956 
957  // These are initialized to -1; harmless to close an invalid FD.
958  // If this is called post-fork, we don't want to be communicating on
959  // these FDs as they are used internally by the parent.
960  close(childToParent_[0]);
961  close(childToParent_[1]);
962  childToParent_[0] = -1; childToParent_[1] = -1;
963  close(parentToChild_[0]);
964  close(parentToChild_[1]);
965  parentToChild_[0] = -1; parentToChild_[1] = -1;
966 
967  if (-1 == pipe2(childToParent_, O_CLOEXEC))
968  {
969  std::ostringstream sstr;
970  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
971  edm::Exception except(edm::errors::OtherCMS, sstr.str());
972  throw except;
973  }
974 
975  if (-1 == pipe2(parentToChild_, O_CLOEXEC))
976  {
977  close(childToParent_[0]); close(childToParent_[1]);
978  childToParent_[0] = -1; childToParent_[1] = -1;
979  std::ostringstream sstr;
980  sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
981  edm::Exception except(edm::errors::OtherCMS, sstr.str());
982  throw except;
983  }
984 
985  helperThread_.reset(new std::thread(stacktraceHelperThread));
986  helperThread_->detach();
987  }
988 
989  } // end of namespace service
990 } // end of namespace edm
991 
995 
unsigned int maxNumberOfThreads() const
Definition: SystemBounds.h:46
T getUntrackedParameter(std::string const &, T const &) const
int i
Definition: DBlmapReader.cc:9
virtual void enableWarnings_() override
static void cmssw_stacktrace_fork()
#define DEFINE_FWK_SERVICE_MAKER(concrete, maker)
Definition: ServiceMaker.h:117
double seconds()
void watchPreallocate(Preallocate::slot_type const &iSlot)
static char *const pstackArgv_[]
ParameterDescriptionBase * addUntracked(U const &iLabel, T const &value)
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventIDconst &, edm::Timestampconst & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
Definition: Activities.doc:12
bool isProcessWideService(TFileService const *)
Definition: TFileService.h:99
static ModuleCallingContext const * getCurrentModuleOnThread()
void setRefCoreStreamer(bool resetAll=false)
std::vector< Variable::Flags > flags
Definition: MVATrainer.cc:135
#define NULL
Definition: scimark2.h:8
void installCustomHandler(int signum, CFUNC func)
std::shared_ptr< const void > sigSegvHandler_
friend int cmssw_stacktrace(void *)
#define constexpr
std::shared_ptr< const void > sigBusHandler_
static TypeWithDict byName(std::string const &name)
Definition: TypeWithDict.cc:60
tuple result
Definition: mps_fire.py:84
static std::atomic< std::size_t > doneModules_
static const ThreadTracker::Container_type & threadIDs()
void cachePidInfoHandler(unsigned int, unsigned int)
std::atomic< unsigned int > * threadsLeft_
void setComment(std::string const &value)
std::string moduleName(Provenance const &provenance)
Definition: Provenance.cc:27
static std::atomic< std::size_t > nextModule_
static char pidString_[pidStringLength_]
static ThreadTracker threadTracker_
edm::serviceregistry::AllArgsMaker< edm::RootHandlers, InitRootHandlers > RootHandlersMaker
static char *const * getPstackArgv()
std::shared_ptr< const void > sigIllHandler_
virtual void initializeThisThreadForUse() override
virtual void ignoreWarnings_() override
std::shared_ptr< const void > sigTermHandler_
tuple fd
Definition: ztee.py:136
void addAdditionalInfo(std::string const &info)
Definition: Exception.cc:235
double f[11][100]
tuple text
Definition: runonSM.py:42
int cmssw_stacktrace(void *)
tbb::task * waitTask_
static std::unique_ptr< std::thread > helperThread_
static std::vector< std::array< char, moduleBufferSize > > moduleListBuffers_
static char pstackName[]
InitRootHandlers(ParameterSet const &pset, ActivityRegistry &iReg)
tuple pid
Definition: sysUtil.py:22
tbb::concurrent_unordered_set< pthread_t > Container_type
void add(std::string const &label, ParameterSetDescription const &psetDescription)
TEveGeoShape * clone(const TEveElement *element, TEveElement *parent)
Definition: eve_macros.cc:135
void watchPostForkReacquireResources(PostForkReacquireResources::slot_type const &iSlot)
virtual void willBeUsingThreads() override
static char dashC[]
char data[epos_bytes_allocation]
Definition: EPOS_Wrapper.h:82
static void fillDescriptions(ConfigurationDescriptions &descriptions)
static std::atomic< unsigned int > counter
SeverityLevel
bool hasDictionary(std::type_info const &)
#define O_NONBLOCK
Definition: SysFile.h:21
tuple level
Definition: testEve_cfg.py:34
tuple size
Write out results.
tuple status
Definition: mps_update.py:57