19 #include "oneapi/tbb/concurrent_unordered_set.h" 20 #include "oneapi/tbb/task.h" 21 #include "oneapi/tbb/task_scheduler_observer.h" 22 #include "oneapi/tbb/global_control.h" 46 #include "TInterpreter.h" 49 #include "TUnixSystem.h" 51 #include "TVirtualStreamerInfo.h" 53 #include "TClassTable.h" 60 constexpr std::size_t moduleBufferSize = 128;
// Compile-time switch consulted by RootErrorHandlerImpl together with
// s_ignoreWarnings; it is hard-wired to false.
constexpr bool s_ignoreEverything{false};
// Reports whether `search` contains at least one of the C strings stored in
// `substrs`.
//
//   SIZE    - number of slots in the pattern table.
//   search  - text to scan.
//   substrs - table of NUL-terminated patterns; null slots are ignored.
//
// Returns true when any non-null pattern occurs somewhere in `search`, and
// false otherwise (including for an empty table).
template <std::size_t SIZE>
bool find_if_string(const std::string& search, const std::array<const char* const, SIZE>& substrs) {
  return std::any_of(substrs.begin(), substrs.end(), [&search](const char* const s) -> bool {
    // A table declared with more slots than initializers is padded with null
    // pointers, and std::string::find(const char*) requires a valid C string,
    // so such slots must be skipped rather than searched for.
    return s != nullptr && search.find(s) != std::string::npos;
  });
}
// Message fragments that RootErrorHandlerImpl looks for (through
// find_if_string) inside the text of a ROOT diagnostic.
constexpr std::array<const char* const, 9> in_message = {{
    "no dictionary for class",
    "already in TClassTable",
    "matrix not positive definite",
    "not a TStreamerInfo object",
    "Problems declaring payload",
    "Announced number of args different from the real number of argument passed",
    "nbins is <=0 - set to nbins = 1",
    "nbinsy is <=0 - set to nbinsy = 1",
    "oneapi::tbb::global_control is limiting",
}};
// Location fragments that RootErrorHandlerImpl looks for (through
// find_if_string) inside the location string of a ROOT diagnostic.
// NOTE(review): seven slots are declared but only six initializers are
// visible in this excerpt; one entry appears to have been dropped from the
// listing — confirm against the complete file before editing this table.
181 constexpr std::array<const char* const, 7> in_location{{
"Fit",
182 "TDecompChol::Solve",
183 "THistPainter::PaintInit",
184 "TUnixSystem::SetDisplay",
185 "TGClient::GetFontByName",
187 "RTaskArenaWrapper"}};
// Message fragments checked by RootErrorHandlerImpl (through find_if_string);
// a match there leads to its alreadyPrinted flag being set.
constexpr std::array<const char* const, 3> in_message_print_error = {{
    "number of iterations was insufficient",
    "bad integrand behavior",
    "integral is divergent, or slowly convergent",
}};
193 void RootErrorHandlerImpl(
int level,
char const*
location,
char const* message) {
200 if (
level >= kFatal) {
202 }
else if (
level >= kSysError) {
210 if (s_ignoreEverything || el_severity <= s_ignoreWarnings) {
222 if (message !=
nullptr)
223 el_message = message;
234 size_t index1 = el_message.find(precursor);
235 if (index1 != std::string::npos) {
236 size_t index2 = index1 + precursor.length();
237 size_t index3 = el_message.find_first_of(
" :", index2);
238 if (index3 != std::string::npos) {
239 size_t substrlen = index3 - index2;
240 el_identifier +=
"-";
241 el_identifier += el_message.substr(index2, substrlen);
244 index1 = el_location.find(
"::");
245 if (index1 != std::string::npos) {
246 el_identifier +=
"/";
247 el_identifier += el_location.substr(0, index1);
253 if ((el_location.find(
"TBranchElement::Fill") != std::string::npos) &&
254 (el_message.find(
"fill branch") != std::string::npos) && (el_message.find(
"address") != std::string::npos) &&
255 (el_message.find(
"not set") != std::string::npos)) {
259 if ((el_message.find(
"Tree branches") != std::string::npos) &&
260 (el_message.find(
"different numbers of entries") != std::string::npos)) {
266 if (find_if_string(el_message, in_message) || find_if_string(el_location, in_location) ||
267 (
level <
kError and (el_location.find(
"CINTTypedefBuilder::Setup") != std::string::npos) and
268 (el_message.find(
"possible entries are in use!") != std::string::npos))) {
274 bool alreadyPrinted =
false;
275 if (find_if_string(el_message, in_message_print_error)) {
278 alreadyPrinted =
true;
293 if (die && (el_location !=
std::string(
"@SUB=TUnixSystem::DispatchSignals"))) {
294 std::ostringstream sstr;
295 sstr <<
"Fatal Root Error: " << el_location <<
"\n" << el_message <<
'\n';
298 except.clearMessage();
305 if (!alreadyPrinted) {
315 edm::LogInfo(
"Root_Information") << el_location << el_message;
320 void RootErrorHandler(
int level,
bool,
char const*
location,
char const* message) {
// Puts the signals listed below back to their default disposition (SIG_DFL),
// replacing whatever handler was installed for them.
// NOTE(review): the closing brace and at least one interior line of this
// function are not visible in this excerpt — confirm the full signal list
// against the complete file.
325 void set_default_signals() {
326 signal(SIGILL, SIG_DFL);
327 signal(SIGSEGV, SIG_DFL);
328 signal(SIGBUS, SIG_DFL);
330 signal(SIGFPE, SIG_DFL);
331 signal(SIGABRT, SIG_DFL);
334 static int full_write(
int fd,
const char*
text) {
341 if (errno == EINTR) {
353 static int full_read(
int fd,
char* inbuf,
size_t len,
int timeout_s = -1) {
356 ssize_t complete = 0;
357 std::chrono::time_point<std::chrono::steady_clock> end_time =
362 }
else if ((-1 == (
flags = fcntl(
fd, F_GETFL)))) {
371 if (timeout_s >= 0) {
372 struct pollfd poll_info {
377 if (ms_remaining > 0) {
378 int rc = poll(&poll_info, 1, ms_remaining);
381 if (errno == EINTR || errno == EAGAIN) {
393 }
else if (ms_remaining < 0) {
401 if (complete == -1) {
402 if (errno == EINTR) {
404 }
else if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) {
407 int orig_errno = errno;
423 static int full_cerr_write(
const char*
text) {
return full_write(2,
text); }
// PAUSE_SIGNAL / RESUME_SIGNAL: SIGRTMAX and SIGRTMAX - 1 when SIGRTMAX is
// defined, otherwise SIGINFO and SIGALRM when SIGINFO is defined.
// NOTE(review): the remaining branch and the closing #endif are not visible
// in this excerpt.
//
// sig_resume_handler: a handler with an empty body; sig_dostack_then_abort
// installs it for RESUME_SIGNAL.
429 #if defined(SIGRTMAX) 430 #define PAUSE_SIGNAL SIGRTMAX 431 #define RESUME_SIGNAL SIGRTMAX - 1 432 #elif defined(SIGINFO) // macOS/BSD 433 #define PAUSE_SIGNAL SIGINFO 434 #define RESUME_SIGNAL SIGALRM 438 void sig_resume_handler(
int sig, siginfo_t*,
void*) {}
441 void sig_pause_for_stacktrace(
int sig, siginfo_t*,
void*) {
446 sigemptyset(&sigset);
447 sigaddset(&sigset, RESUME_SIGNAL);
448 pthread_sigmask(SIG_UNBLOCK, &sigset,
nullptr);
458 strlcpy(buff,
"\nModule: ", moduleBufferSize);
463 strlcat(buff,
":", moduleBufferSize);
468 strlcat(buff,
"none", moduleBufferSize);
475 void sig_dostack_then_abort(
int sig, siginfo_t*,
void*) {
480 const auto self = pthread_self();
484 struct sigaction act;
485 act.sa_sigaction = sig_pause_for_stacktrace;
487 sigemptyset(&act.sa_mask);
488 sigaction(PAUSE_SIGNAL, &act,
nullptr);
491 sigset_t pausesigset;
492 sigemptyset(&pausesigset);
493 sigaddset(&pausesigset, PAUSE_SIGNAL);
494 sigprocmask(SIG_UNBLOCK, &pausesigset,
nullptr);
497 for (
auto id : tids) {
499 pthread_kill(
id, PAUSE_SIGNAL);
505 act.sa_sigaction = sig_resume_handler;
506 sigaction(RESUME_SIGNAL, &act,
nullptr);
511 const char* signalname =
"unknown";
514 signalname =
"bus error";
518 signalname =
"segmentation violation";
522 signalname =
"illegal instruction";
526 signalname =
"floating point exception";
530 signalname =
"external termination request";
534 signalname =
"abort signal";
540 full_cerr_write(
"\n\nA fatal system signal has occurred: ");
541 full_cerr_write(signalname);
542 full_cerr_write(
"\nThe following is the call stack containing the origin of the signal.\n\n");
551 std::size_t notified = 0;
553 for (
auto id : tids) {
555 if (pthread_kill(
id, RESUME_SIGNAL) == 0)
562 full_cerr_write(
"\nCurrent Modules:\n");
569 if (tids.count(
self) > 0) {
570 char buff[moduleBufferSize] =
"\nModule: ";
575 strlcat(buff,
":", moduleBufferSize);
580 strlcat(buff,
"none", moduleBufferSize);
582 strlcat(buff,
" (crashed)", moduleBufferSize);
583 full_cerr_write(buff);
585 full_cerr_write(
"\nModule: non-CMSSW (crashed)");
593 timespec
t = {0, 1000};
595 nanosleep(&
t,
nullptr);
603 full_cerr_write(
"\n\nA fatal system signal has occurred: ");
604 full_cerr_write(signalname);
605 full_cerr_write(
"\n");
609 if ((sig == SIGILL) || (sig == SIGSEGV) || (sig == SIGBUS) || (sig ==
SIGTERM) || (sig == SIGFPE) ||
611 signal(sig, SIG_DFL);
614 set_default_signals();
619 void sig_abort(
int sig, siginfo_t*,
void*) {
620 full_cerr_write(
"\n\nFatal system signal has occurred during exit\n");
623 signal(sig, SIG_DFL);
627 set_default_signals();
652 int result = full_read(fromParent,
buf, 1);
657 set_default_signals();
659 full_cerr_write(
"\n\nTraceback helper thread failed to read from parent: ");
660 full_cerr_write(strerror(-
result));
661 full_cerr_write(
"\n");
665 set_default_signals();
667 full_write(toParent,
buf);
668 }
else if (
buf[0] ==
'2') {
675 }
else if (
buf[0] ==
'3') {
678 set_default_signals();
680 full_cerr_write(
"\n\nTraceback helper thread got unknown command from parent: ");
681 full_cerr_write(
buf);
682 full_cerr_write(
"\n");
691 full_cerr_write(
"\n\nAttempt to request stacktrace failed: ");
692 full_cerr_write(strerror(-
result));
693 full_cerr_write(
"\n");
699 full_cerr_write(
"\n\nWaiting for stacktrace completion failed: ");
700 if (
result == -ETIMEDOUT) {
701 full_cerr_write(
"timed out waiting for GDB to complete.");
703 full_cerr_write(strerror(-
result));
705 full_cerr_write(
"\n");
711 char child_stack[4 * 1024];
712 char* child_stack_ptr = child_stack + 4 * 1024;
722 if (child_stack_ptr) {
729 full_cerr_write(
"(Attempt to perform stack dump failed.)\n");
732 if (waitpid(pid, &
status, 0) == -1) {
733 full_cerr_write(
"(Failed to wait on stack dump output.)\n");
736 full_cerr_write(
"(GDB stack trace failed unexpectedly)\n");
742 set_default_signals();
749 syscall(SYS_execve,
"/bin/sh",
argv, __environ);
751 execv(
"/bin/sh",
argv);
// Fixed C strings for the stack-trace helper; their consumers are outside
// this excerpt.
// NOTE(review): presumably the helper's argv[0] label and the "-c" flag for
// the /bin/sh invocation — confirm against the argv table.
static constexpr char pstackName[]{"(CMSSW stack trace helper)"};
static constexpr char dashC[]{"-c"};
771 unloadSigHandler_(
pset.getUntrackedParameter<
bool>(
"UnloadRootSigHandler")),
772 resetErrHandler_(
pset.getUntrackedParameter<
bool>(
"ResetRootErrHandler")),
773 loadAllDictionaries_(
pset.getUntrackedParameter<
bool>(
"LoadAllDictionaries")),
774 autoLibraryLoader_(loadAllDictionaries_
or pset.getUntrackedParameter<
bool>(
"AutoLibraryLoader")),
775 autoClassParser_(
pset.getUntrackedParameter<
bool>(
"AutoClassParser")),
776 interactiveDebug_(
pset.getUntrackedParameter<
bool>(
"InteractiveDebug")) {
790 gSystem->ResetSignal(kSigChild);
791 gSystem->ResetSignal(kSigBus);
792 gSystem->ResetSignal(kSigSegmentationViolation);
793 gSystem->ResetSignal(kSigIllegalInstruction);
794 gSystem->ResetSignal(kSigSystem);
795 gSystem->ResetSignal(kSigPipe);
796 gSystem->ResetSignal(kSigAlarm);
797 gSystem->ResetSignal(kSigUrgent);
798 gSystem->ResetSignal(kSigFloatingException);
799 gSystem->ResetSignal(kSigWindowChanged);
800 }
else if (
pset.getUntrackedParameter<
bool>(
"AbortOnSignal")) {
805 gSystem->ResetSignal(kSigBus);
806 gSystem->ResetSignal(kSigSegmentationViolation);
807 gSystem->ResetSignal(kSigIllegalInstruction);
808 gSystem->ResetSignal(kSigFloatingException);
821 signal(SIGABRT, SIG_DFL);
833 SetErrorHandler(RootErrorHandler);
838 gInterpreter->SetClassAutoloading(1);
851 TTree::SetMaxTreeSize(kMaxLong64);
852 TH1::AddDirectory(kFALSE);
869 bool imt =
pset.getUntrackedParameter<
bool>(
"EnableIMT");
870 if (imt && not ROOT::IsImplicitMTEnabled()) {
873 ROOT::EnableImplicitMT(
874 oneapi::tbb::global_control::active_value(oneapi::tbb::global_control::max_allowed_parallelism));
880 TIter iter(gROOT->GetListOfFiles());
881 TObject*
obj =
nullptr;
882 while (
nullptr != (
obj = iter.Next())) {
883 TFile*
f =
dynamic_cast<TFile*
>(
obj);
888 iter = TIter(gROOT->GetListOfFiles());
897 ROOT::EnableThreadSafety();
900 TObject::SetObjectStat(
false);
903 TVirtualStreamerInfo::Optimize(
false);
908 desc.setComment(
"Centralized interface to ROOT.");
909 desc.addUntracked<
bool>(
"UnloadRootSigHandler",
false)
910 ->setComment(
"If True, signals are handled by this service, rather than by ROOT.");
911 desc.addUntracked<
bool>(
"ResetRootErrHandler",
true)
913 "If True, ROOT messages (e.g. errors, warnings) are handled by this service, rather than by ROOT.");
914 desc.addUntracked<
bool>(
"AutoLibraryLoader",
true)
915 ->setComment(
"If True, enables automatic loading of data dictionaries.");
916 desc.addUntracked<
bool>(
"AutoClassParser",
true)
918 "If False, the automatic parsing of class headers for dictionaries when pre-built dictionaries are " 919 "missing is disable during module construction. The current implementation of disabling the parsing is " 920 "fragile, and may work only in a single-thread job that does not use reco::parser::cutParser() or " 921 "reco::parser::expressionParser() (and it certainly does not work on multiple threads).");
922 desc.addUntracked<
bool>(
"LoadAllDictionaries",
false)->setComment(
"If True, loads all ROOT dictionaries.");
923 desc.addUntracked<
bool>(
"EnableIMT",
true)->setComment(
"If True, calls ROOT::EnableImplicitMT().");
924 desc.addUntracked<
bool>(
"AbortOnSignal",
true)
926 "If True, do an abort when a signal occurs that causes a crash. If False, ROOT will do an exit which " 927 "attempts to do a clean shutdown.");
928 desc.addUntracked<
bool>(
"InteractiveDebug",
false)
930 "If True, leave gdb attached to cmsRun after a crash; " 931 "if False, attach gdb, print a stack trace, and quit gdb");
932 desc.addUntracked<
int>(
"DebugLevel", 0)->setComment(
"Sets ROOT's gDebug value.");
933 desc.addUntracked<
int>(
"StackTracePauseTime", 300)
934 ->setComment(
"Seconds to pause other threads during stack trace.");
935 descriptions.
add(
"InitRootHandlers",
desc);
957 "set pagination no\n" 958 "thread apply all bt\n" 960 "/bin/sed -n -e 's/^\\((gdb) \\)*//' -e '/^#/p' -e '/^Thread/p'";
963 std::ostringstream sstr;
964 sstr <<
"Unable to pre-allocate stacktrace handler information";
982 std::ostringstream sstr;
983 sstr <<
"Failed to create child-to-parent pipes (errno=" << errno <<
"): " << strerror(errno);
993 std::ostringstream sstr;
994 sstr <<
"Failed to create child-to-parent pipes (errno=" << errno <<
"): " << strerror(errno);
void on_scheduler_exit(bool) override
void watchPostModuleConstruction(PostModuleConstruction::slot_type const &iSlot)
static constexpr char dashC[]
edm::serviceregistry::AllArgsMaker< edm::RootHandlers, InitRootHandlers > RootHandlersMaker
bool loadAllDictionaries_
void enableWarnings_() override
static void cmssw_stacktrace_fork()
#define DEFINE_FWK_SERVICE_MAKER(concrete, maker)
void watchPreallocate(Preallocate::slot_type const &iSlot)
void setRefCoreStreamerInTClass()
static void stacktraceFromThread()
void watchPostEndJob(PostEndJob::slot_type const &iSlot)
Container_type threadIDs_
void watchPreModuleConstruction(PreModuleConstruction::slot_type const &iSlot)
std::vector< T >::const_iterator search(const cond::Time_t &val, const std::vector< T > &container)
oneapi::tbb::concurrent_unordered_set< pthread_t > Container_type
bool isProcessWideService(TFileService const *)
static ModuleCallingContext const * getCurrentModuleOnThread()
static int stackTracePause_
void installCustomHandler(int signum, CFUNC func)
std::shared_ptr< const void > sigSegvHandler_
Log< level::Error, false > LogError
std::shared_ptr< const void > sigFpeHandler_
friend int cmssw_stacktrace(void *)
void ignoreWarnings_(edm::RootHandlers::SeverityLevel level) override
std::shared_ptr< const void > sigBusHandler_
~ThreadTracker() override=default
static TypeWithDict byName(std::string const &name)
static std::atomic< std::size_t > doneModules_
static const ThreadTracker::Container_type & threadIDs()
std::shared_ptr< const void > sigAbrtHandler_
static std::atomic< std::size_t > nextModule_
static char pidString_[pidStringLength_]
static char const *const * getPstackArgv()
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventID const &, edm::Timestamp const & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
std::shared_ptr< const void > sigIllHandler_
static int childToParent_[2]
std::shared_ptr< const void > sigTermHandler_
void addAdditionalInfo(std::string const &info)
int cmssw_stacktrace(void *)
static std::unique_ptr< std::thread > helperThread_
static std::vector< std::array< char, moduleBufferSize > > moduleListBuffers_
static std::unique_ptr< ThreadTracker > threadTracker_
Log< level::Info, false > LogInfo
static constexpr int pidStringLength_
InitRootHandlers(ParameterSet const &pset, ActivityRegistry &iReg)
static char const *const pstackArgv_[]
void add(std::string const &label, ParameterSetDescription const &psetDescription)
TEveGeoShape * clone(const TEveElement *element, TEveElement *parent)
static int parentToChild_[2]
unsigned int maxNumberOfThreads() const
std::string moduleName(StableProvenance const &provenance, ProcessHistory const &history)
void willBeUsingThreads() override
char data[epos_bytes_allocation]
static void fillDescriptions(ConfigurationDescriptions &descriptions)
~InitRootHandlers() override
const Container_type & IDs()
static int stackTracePause()
static void stacktraceHelperThread()
bool hasDictionary(std::type_info const &)
Log< level::Warning, false > LogWarning
static constexpr char pstackName[]
void on_scheduler_entry(bool) override