21 #include "oneapi/tbb/concurrent_unordered_set.h"
22 #include "oneapi/tbb/task.h"
23 #include "oneapi/tbb/task_scheduler_observer.h"
24 #include "oneapi/tbb/global_control.h"
48 #include "TInterpreter.h"
51 #include "TUnixSystem.h"
53 #include "TVirtualStreamerInfo.h"
55 #include "TClassTable.h"
// Length, in bytes, of the fixed-size character buffers that receive the
// "\nModule: ..." text (used as the bound for strlcpy/strlcat and as the array size).
constexpr std::size_t moduleBufferSize{128};
// First operand of the `s_ignoreEverything || el_severity <= s_ignoreWarnings` test in
// RootErrorHandlerImpl. It is fixed to false, so that test is decided by the severity
// comparison alone.
constexpr bool s_ignoreEverything{false};
/// Reports whether `search` contains at least one of the candidate substrings.
///
/// @tparam SIZE    number of entries in `substrs`
/// @param search   text to scan
/// @param substrs  candidate C-string substrings; null entries are ignored
/// @return true if any non-null candidate occurs anywhere in `search`, false otherwise
///         (including when `substrs` is empty)
template <std::size_t SIZE>
bool find_if_string(const std::string& search, const std::array<const char* const, SIZE>& substrs) {
  return std::any_of(substrs.begin(), substrs.end(), [&search](const char* const candidate) -> bool {
    // A std::array declared larger than its initializer list is padded with nullptr,
    // and std::string::find(nullptr) is undefined behaviour, so such entries are skipped.
    return candidate != nullptr && search.find(candidate) != std::string::npos;
  });
}
// Substrings looked for in the text of a ROOT message: RootErrorHandlerImpl passes this
// table to find_if_string together with the message.
// NOTE(review): the action taken on a match is outside the visible code — confirm there.
constexpr std::array<const char* const, 9> in_message = {{
    "no dictionary for class",
    "already in TClassTable",
    "matrix not positive definite",
    "not a TStreamerInfo object",
    "Problems declaring payload",
    "Announced number of args different from the real number of argument passed",
    "nbins is <=0 - set to nbins = 1",
    "nbinsy is <=0 - set to nbinsy = 1",
    "oneapi::tbb::global_control is limiting",
}};
181 constexpr std::array<const char* const, 7> in_location{{
"Fit",
182 "TDecompChol::Solve",
183 "THistPainter::PaintInit",
184 "TUnixSystem::SetDisplay",
185 "TGClient::GetFontByName",
187 "RTaskArenaWrapper"}};
// Message substrings that RootErrorHandlerImpl checks with find_if_string before its
// `alreadyPrinted` flag is set.
// NOTE(review): the statements between that check and the flag are outside the visible
// code — confirm exactly what is printed there.
constexpr std::array<const char* const, 3> in_message_print_error = {{
    "number of iterations was insufficient",
    "bad integrand behavior",
    "integral is divergent, or slowly convergent",
}};
193 void RootErrorHandlerImpl(
int level,
char const* location,
char const* message) {
200 if (level >= kFatal) {
202 }
else if (level >= kSysError) {
204 }
else if (level >=
kError) {
210 if (s_ignoreEverything || el_severity <= s_ignoreWarnings) {
218 if (location !=
nullptr)
222 if (message !=
nullptr)
223 el_message = message;
234 size_t index1 = el_message.find(precursor);
235 if (index1 != std::string::npos) {
236 size_t index2 = index1 + precursor.length();
237 size_t index3 = el_message.find_first_of(
" :", index2);
238 if (index3 != std::string::npos) {
239 size_t substrlen = index3 - index2;
240 el_identifier +=
"-";
241 el_identifier += el_message.substr(index2, substrlen);
244 index1 = el_location.find(
"::");
245 if (index1 != std::string::npos) {
246 el_identifier +=
"/";
247 el_identifier += el_location.substr(0, index1);
253 if ((el_location.find(
"TBranchElement::Fill") != std::string::npos) &&
254 (el_message.find(
"fill branch") != std::string::npos) && (el_message.find(
"address") != std::string::npos) &&
255 (el_message.find(
"not set") != std::string::npos)) {
259 if ((el_message.find(
"Tree branches") != std::string::npos) &&
260 (el_message.find(
"different numbers of entries") != std::string::npos)) {
266 if (find_if_string(el_message, in_message) || find_if_string(el_location, in_location) ||
267 (level <
kError and (el_location.find(
"CINTTypedefBuilder::Setup") != std::string::npos) and
268 (el_message.find(
"possible entries are in use!") != std::string::npos))) {
274 bool alreadyPrinted =
false;
275 if (find_if_string(el_message, in_message_print_error)) {
278 alreadyPrinted =
true;
293 if (die && (el_location !=
std::string(
"@SUB=TUnixSystem::DispatchSignals"))) {
294 std::ostringstream sstr;
295 sstr <<
"Fatal Root Error: " << el_location <<
"\n" << el_message <<
'\n';
298 except.clearMessage();
305 if (!alreadyPrinted) {
315 edm::LogInfo(
"Root_Information") << el_location << el_message;
320 void RootErrorHandler(
int level,
bool,
char const* location,
char const* message) {
321 RootErrorHandlerImpl(level, location, message);
// Puts SIGILL, SIGSEGV, SIGBUS, SIGTERM and SIGABRT back to their default
// disposition (SIG_DFL), in that order.
void set_default_signals() {
  constexpr int signalsToReset[] = {SIGILL, SIGSEGV, SIGBUS, SIGTERM, SIGABRT};
  for (const int signum : signalsToReset) {
    signal(signum, SIG_DFL);
  }
}
333 static int full_write(
int fd,
const char*
text) {
335 size_t count = strlen(text);
338 written =
write(fd, buffer, count);
340 if (errno == EINTR) {
352 static int full_read(
int fd,
char* inbuf,
size_t len,
int timeout_s = -1) {
355 ssize_t complete = 0;
356 std::chrono::time_point<std::chrono::steady_clock> end_time =
361 }
else if ((-1 == (flags = fcntl(fd, F_GETFL)))) {
365 if (-1 == fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
370 if (timeout_s >= 0) {
371 struct pollfd poll_info {
376 if (ms_remaining > 0) {
377 int rc = poll(&poll_info, 1, ms_remaining);
380 if (errno == EINTR || errno == EAGAIN) {
387 if ((flags & O_NONBLOCK) != O_NONBLOCK) {
388 fcntl(fd, F_SETFL, flags);
392 }
else if (ms_remaining < 0) {
393 if ((flags & O_NONBLOCK) != O_NONBLOCK) {
394 fcntl(fd, F_SETFL, flags);
399 complete =
read(fd, buf, count);
400 if (complete == -1) {
401 if (errno == EINTR) {
403 }
else if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) {
406 int orig_errno = errno;
407 if ((flags & O_NONBLOCK) != O_NONBLOCK) {
408 fcntl(fd, F_SETFL, flags);
416 if ((flags & O_NONBLOCK) != O_NONBLOCK) {
417 fcntl(fd, F_SETFL, flags);
422 static int full_cerr_write(
const char* text) {
return full_write(2, text); }
428 #if defined(SIGRTMAX)
// Signal that sig_dostack_then_abort sends with pthread_kill to each thread id in its
// list; it is handled by sig_pause_for_stacktrace.
#define PAUSE_SIGNAL SIGRTMAX
// Signal that sig_dostack_then_abort sends afterwards; it is handled by
// sig_resume_handler. Parenthesised so the expression keeps its value wherever the
// macro is expanded (e.g. next to a multiplication or comparison).
#define RESUME_SIGNAL (SIGRTMAX - 1)
431 #elif defined(SIGINFO) // macOS/BSD
432 #define PAUSE_SIGNAL SIGINFO
433 #define RESUME_SIGNAL SIGALRM
// Handler that sig_dostack_then_abort installs for RESUME_SIGNAL. The body is empty
// and all three arguments are ignored.
// NOTE(review): presumably only the delivery of the signal matters — confirm.
void sig_resume_handler(int /*sig*/, siginfo_t* /*info*/, void* /*context*/) {}
440 void sig_pause_for_stacktrace(
int sig, siginfo_t*,
void*) {
441 using namespace edm::service;
445 sigemptyset(&sigset);
446 sigaddset(&sigset, RESUME_SIGNAL);
447 pthread_sigmask(SIG_UNBLOCK, &sigset,
nullptr);
457 strlcpy(buff,
"\nModule: ", moduleBufferSize);
462 strlcat(buff,
":", moduleBufferSize);
467 strlcat(buff,
"none", moduleBufferSize);
474 void sig_dostack_then_abort(
int sig, siginfo_t*,
void*) {
475 using namespace edm::service;
479 const auto self = pthread_self();
483 struct sigaction act;
484 act.sa_sigaction = sig_pause_for_stacktrace;
486 sigemptyset(&act.sa_mask);
487 sigaction(PAUSE_SIGNAL, &act,
nullptr);
490 sigset_t pausesigset;
491 sigemptyset(&pausesigset);
492 sigaddset(&pausesigset, PAUSE_SIGNAL);
493 sigprocmask(SIG_UNBLOCK, &pausesigset,
nullptr);
496 for (
auto id : tids) {
498 pthread_kill(
id, PAUSE_SIGNAL);
504 act.sa_sigaction = sig_resume_handler;
505 sigaction(RESUME_SIGNAL, &act,
nullptr);
510 const char* signalname =
"unknown";
513 signalname =
"bus error";
517 signalname =
"segmentation violation";
521 signalname =
"illegal instruction";
525 signalname =
"external termination request";
529 signalname =
"abort signal";
535 full_cerr_write(
"\n\nA fatal system signal has occurred: ");
536 full_cerr_write(signalname);
537 full_cerr_write(
"\nThe following is the call stack containing the origin of the signal.\n\n");
546 std::size_t notified = 0;
548 for (
auto id : tids) {
550 if (pthread_kill(
id, RESUME_SIGNAL) == 0)
557 full_cerr_write(
"\nCurrent Modules:\n");
564 if (tids.count(
self) > 0) {
565 char buff[moduleBufferSize] =
"\nModule: ";
570 strlcat(buff,
":", moduleBufferSize);
575 strlcat(buff,
"none", moduleBufferSize);
577 strlcat(buff,
" (crashed)", moduleBufferSize);
578 full_cerr_write(buff);
580 full_cerr_write(
"\nModule: non-CMSSW (crashed)");
588 timespec
t = {0, 1000};
590 nanosleep(&t,
nullptr);
598 full_cerr_write(
"\n\nA fatal system signal has occurred: ");
599 full_cerr_write(signalname);
600 full_cerr_write(
"\n");
604 if ((sig == SIGILL) || (sig == SIGSEGV) || (sig == SIGBUS) || (sig == SIGTERM) || (sig == SIGABRT)) {
605 signal(sig, SIG_DFL);
608 set_default_signals();
613 void sig_abort(
int sig, siginfo_t*,
void*) {
614 full_cerr_write(
"\n\nFatal system signal has occurred during exit\n");
617 signal(sig, SIG_DFL);
621 set_default_signals();
646 int result = full_read(fromParent, buf, 1);
651 set_default_signals();
653 full_cerr_write(
"\n\nTraceback helper thread failed to read from parent: ");
654 full_cerr_write(strerror(-result));
655 full_cerr_write(
"\n");
659 set_default_signals();
661 full_write(toParent, buf);
662 }
else if (buf[0] ==
'2') {
669 }
else if (buf[0] ==
'3') {
672 set_default_signals();
674 full_cerr_write(
"\n\nTraceback helper thread got unknown command from parent: ");
675 full_cerr_write(buf);
676 full_cerr_write(
"\n");
685 full_cerr_write(
"\n\nAttempt to request stacktrace failed: ");
686 full_cerr_write(strerror(-result));
687 full_cerr_write(
"\n");
692 if ((result = full_read(
childToParent_[0], buf, 1, 5 * 60)) < 0) {
693 full_cerr_write(
"\n\nWaiting for stacktrace completion failed: ");
694 if (result == -ETIMEDOUT) {
695 full_cerr_write(
"timed out waiting for GDB to complete.");
697 full_cerr_write(strerror(-result));
699 full_cerr_write(
"\n");
705 char child_stack[4 * 1024];
706 char* child_stack_ptr = child_stack + 4 * 1024;
716 if (child_stack_ptr) {
723 full_cerr_write(
"(Attempt to perform stack dump failed.)\n");
726 if (waitpid(pid, &status, 0) == -1) {
727 full_cerr_write(
"(Failed to wait on stack dump output.)\n");
730 full_cerr_write(
"(GDB stack trace failed unexpectedly)\n");
736 set_default_signals();
743 syscall(SYS_execve,
"/bin/sh", argv, __environ);
745 execv(
"/bin/sh", argv);
// Fixed strings for the stack-trace helper.
// NOTE(review): presumably the argv[0] label and the "-c" flag of the /bin/sh
// invocation — confirm against pstackArgv_.
static constexpr char pstackName[]{"(CMSSW stack trace helper)"};
static constexpr char dashC[]{"-c"};
765 unloadSigHandler_(pset.getUntrackedParameter<bool>(
"UnloadRootSigHandler")),
766 resetErrHandler_(pset.getUntrackedParameter<bool>(
"ResetRootErrHandler")),
767 loadAllDictionaries_(pset.getUntrackedParameter<bool>(
"LoadAllDictionaries")),
768 autoLibraryLoader_(loadAllDictionaries_
or pset.getUntrackedParameter<bool>(
"AutoLibraryLoader")),
769 interactiveDebug_(pset.getUntrackedParameter<bool>(
"InteractiveDebug")) {
773 threadTracker_ = std::make_unique<ThreadTracker>();
775 if (threadTracker_) {
776 threadTracker_->observe(
false);
783 gSystem->ResetSignal(kSigChild);
784 gSystem->ResetSignal(kSigBus);
785 gSystem->ResetSignal(kSigSegmentationViolation);
786 gSystem->ResetSignal(kSigIllegalInstruction);
787 gSystem->ResetSignal(kSigSystem);
788 gSystem->ResetSignal(kSigPipe);
789 gSystem->ResetSignal(kSigAlarm);
790 gSystem->ResetSignal(kSigUrgent);
791 gSystem->ResetSignal(kSigFloatingException);
792 gSystem->ResetSignal(kSigWindowChanged);
798 gSystem->ResetSignal(kSigBus);
799 gSystem->ResetSignal(kSigSegmentationViolation);
800 gSystem->ResetSignal(kSigIllegalInstruction);
811 signal(SIGABRT, SIG_DFL);
823 SetErrorHandler(RootErrorHandler);
828 gInterpreter->SetClassAutoloading(1);
832 TTree::SetMaxTreeSize(kMaxLong64);
833 TH1::AddDirectory(kFALSE);
845 if (debugLevel > 0) {
851 if (imt && not ROOT::IsImplicitMTEnabled()) {
854 ROOT::EnableImplicitMT(
855 oneapi::tbb::global_control::active_value(oneapi::tbb::global_control::max_allowed_parallelism));
861 TIter iter(gROOT->GetListOfFiles());
862 TObject*
obj =
nullptr;
863 while (
nullptr != (obj = iter.Next())) {
864 TFile*
f =
dynamic_cast<TFile*
>(
obj);
869 iter = TIter(gROOT->GetListOfFiles());
878 ROOT::EnableThreadSafety();
881 TObject::SetObjectStat(
false);
884 TVirtualStreamerInfo::Optimize(
false);
889 desc.
setComment(
"Centralized interface to ROOT.");
891 ->setComment(
"If True, signals are handled by this service, rather than by ROOT.");
894 "If True, ROOT messages (e.g. errors, warnings) are handled by this service, rather than by ROOT.");
896 ->setComment(
"If True, enables automatic loading of data dictionaries.");
897 desc.
addUntracked<
bool>(
"LoadAllDictionaries",
false)->setComment(
"If True, loads all ROOT dictionaries.");
898 desc.
addUntracked<
bool>(
"EnableIMT",
true)->setComment(
"If True, calls ROOT::EnableImplicitMT().");
901 "If True, do an abort when a signal occurs that causes a crash. If False, ROOT will do an exit which "
902 "attempts to do a clean shutdown.");
905 "If True, leave gdb attached to cmsRun after a crash; "
906 "if False, attach gdb, print a stack trace, and quit gdb");
907 desc.
addUntracked<
int>(
"DebugLevel", 0)->setComment(
"Sets ROOT's gDebug value.");
909 ->setComment(
"Seconds to pause other threads during stack trace.");
910 descriptions.
add(
"InitRootHandlers", desc);
932 "set pagination no\n"
933 "thread apply all bt\n"
935 "/bin/sed -n -e 's/^\\((gdb) \\)*//' -e '/^#/p' -e '/^Thread/p'";
938 std::ostringstream sstr;
939 sstr <<
"Unable to pre-allocate stacktrace handler information";
957 std::ostringstream sstr;
958 sstr <<
"Failed to create child-to-parent pipes (errno=" << errno <<
"): " << strerror(errno);
968 std::ostringstream sstr;
969 sstr <<
"Failed to create child-to-parent pipes (errno=" << errno <<
"): " << strerror(errno);
void on_scheduler_exit(bool) override
unsigned int maxNumberOfThreads() const
T getUntrackedParameter(std::string const &, T const &) const
static constexpr char dashC[]
edm::serviceregistry::AllArgsMaker< edm::RootHandlers, InitRootHandlers > RootHandlersMaker
bool loadAllDictionaries_
void enableWarnings_() override
static void cmssw_stacktrace_fork()
#define DEFINE_FWK_SERVICE_MAKER(concrete, maker)
void watchPreallocate(Preallocate::slot_type const &iSlot)
void setRefCoreStreamerInTClass()
static void stacktraceFromThread()
ParameterDescriptionBase * addUntracked(U const &iLabel, T const &value)
void watchPostEndJob(PostEndJob::slot_type const &iSlot)
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventIDconst &, edm::Timestampconst & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
Container_type threadIDs_
std::vector< T >::const_iterator search(const cond::Time_t &val, const std::vector< T > &container)
oneapi::tbb::concurrent_unordered_set< pthread_t > Container_type
bool isProcessWideService(TFileService const *)
static ModuleCallingContext const * getCurrentModuleOnThread()
static int stackTracePause_
void installCustomHandler(int signum, CFUNC func)
std::shared_ptr< const void > sigSegvHandler_
Log< level::Error, false > LogError
friend int cmssw_stacktrace(void *)
void ignoreWarnings_(edm::RootHandlers::SeverityLevel level) override
std::shared_ptr< const void > sigBusHandler_
~ThreadTracker() override=default
static TypeWithDict byName(std::string const &name)
static std::atomic< std::size_t > doneModules_
static const ThreadTracker::Container_type & threadIDs()
std::shared_ptr< const void > sigAbrtHandler_
void setComment(std::string const &value)
static std::atomic< std::size_t > nextModule_
static char pidString_[pidStringLength_]
std::shared_ptr< const void > sigIllHandler_
static int childToParent_[2]
std::shared_ptr< const void > sigTermHandler_
void addAdditionalInfo(std::string const &info)
int cmssw_stacktrace(void *)
static std::unique_ptr< std::thread > helperThread_
static std::vector< std::array< char, moduleBufferSize > > moduleListBuffers_
static std::unique_ptr< ThreadTracker > threadTracker_
Log< level::Info, false > LogInfo
static constexpr int pidStringLength_
InitRootHandlers(ParameterSet const &pset, ActivityRegistry &iReg)
static char const *const * getPstackArgv()
void add(std::string const &label, ParameterSetDescription const &psetDescription)
TEveGeoShape * clone(const TEveElement *element, TEveElement *parent)
static int parentToChild_[2]
static char const *const pstackArgv_[]
std::string moduleName(StableProvenance const &provenance, ProcessHistory const &history)
void willBeUsingThreads() override
char data[epos_bytes_allocation]
static void fillDescriptions(ConfigurationDescriptions &descriptions)
~InitRootHandlers() override
const Container_type & IDs()
static int stackTracePause()
static void stacktraceHelperThread()
bool hasDictionary(std::type_info const &)
Log< level::Warning, false > LogWarning
static constexpr char pstackName[]
void on_scheduler_entry(bool) override
tuple size
Write out results.