19 #include "oneapi/tbb/concurrent_unordered_set.h" 20 #include "oneapi/tbb/task.h" 21 #include "oneapi/tbb/task_scheduler_observer.h" 22 #include "oneapi/tbb/global_control.h" 46 #include "TInterpreter.h" 49 #include "TUnixSystem.h" 51 #include "TVirtualStreamerInfo.h" 53 #include "TClassTable.h" 60 constexpr std::size_t moduleBufferSize = 128;
66 class ActivityRegistry;
159 constexpr bool s_ignoreEverything =
false;
/// Returns true when `search` contains at least one of the C strings in `substrs`.
///
/// @tparam SIZE   number of entries in `substrs`
/// @param search  text to scan
/// @param substrs candidate substrings; null entries (e.g. the value-initialized
///                tail slots of a partially initialized array) never match
/// @return true if any non-null entry occurs somewhere inside `search`
template <std::size_t SIZE>
bool find_if_string(const std::string& search, const std::array<const char* const, SIZE>& substrs) {
  return std::any_of(substrs.begin(), substrs.end(), [&search](const char* const s) -> bool {
    // std::string::find(nullptr) is undefined behaviour, so guard before searching.
    return s != nullptr && search.find(s) != std::string::npos;
  });
}
// Substrings that RootErrorHandlerImpl looks for in the ROOT message text
// (el_message) via find_if_string.
// NOTE(review): a match appears to select the less severe reporting path of the
// handler - confirm against the full handler body.
constexpr std::array<const char* const, 11> in_message{
    {"no dictionary for class",
     "already in TClassTable",
     "matrix not positive definite",
     "not a TStreamerInfo object",
     "Problems declaring payload",
     "Announced number of args different from the real number of argument passed",
     "nbins is <=0 - set to nbins = 1",
     "nbinsy is <=0 - set to nbinsy = 1",
     "oneapi::tbb::global_control is limiting",
     "ufirst < fXmin, fXmin is used",
     "ulast > fXmax, fXmax is used"}};
183 constexpr std::array<const char* const, 7> in_location{{
"Fit",
184 "TDecompChol::Solve",
185 "THistPainter::PaintInit",
186 "TUnixSystem::SetDisplay",
187 "TGClient::GetFontByName",
189 "RTaskArenaWrapper"}};
// Substrings that RootErrorHandlerImpl looks for in the ROOT message text
// (el_message) via find_if_string; the handler sets its `alreadyPrinted` flag
// in connection with a match.
// NOTE(review): the exact reporting done on a match is not visible here -
// confirm against the full handler body.
constexpr std::array<const char* const, 4> in_message_print_error{
    {"number of iterations was insufficient",
     "bad integrand behavior",
     "integral is divergent, or slowly convergent",
     "VariableMetricBuilder Initial matrix not pos.def."}};
197 void RootErrorHandlerImpl(
int level,
char const*
location,
char const* message) {
204 if (
level >= kFatal) {
206 }
else if (
level >= kSysError) {
214 if (s_ignoreEverything || el_severity <= s_ignoreWarnings) {
226 if (message !=
nullptr)
227 el_message = message;
238 size_t index1 = el_message.find(precursor);
239 if (index1 != std::string::npos) {
240 size_t index2 = index1 + precursor.length();
241 size_t index3 = el_message.find_first_of(
" :", index2);
242 if (index3 != std::string::npos) {
243 size_t substrlen = index3 - index2;
244 el_identifier +=
"-";
245 el_identifier += el_message.substr(index2, substrlen);
248 index1 = el_location.find(
"::");
249 if (index1 != std::string::npos) {
250 el_identifier +=
"/";
251 el_identifier += el_location.substr(0, index1);
257 if ((el_location.find(
"TBranchElement::Fill") != std::string::npos) &&
258 (el_message.find(
"fill branch") != std::string::npos) && (el_message.find(
"address") != std::string::npos) &&
259 (el_message.find(
"not set") != std::string::npos)) {
263 if ((el_message.find(
"Tree branches") != std::string::npos) &&
264 (el_message.find(
"different numbers of entries") != std::string::npos)) {
270 if (find_if_string(el_message, in_message) || find_if_string(el_location, in_location) ||
271 (
level <
kError and (el_location.find(
"CINTTypedefBuilder::Setup") != std::string::npos) and
272 (el_message.find(
"possible entries are in use!") != std::string::npos))) {
278 bool alreadyPrinted =
false;
279 if (find_if_string(el_message, in_message_print_error)) {
282 alreadyPrinted =
true;
297 if (die && (el_location !=
std::string(
"@SUB=TUnixSystem::DispatchSignals"))) {
298 std::ostringstream sstr;
299 sstr <<
"Fatal Root Error: " << el_location <<
"\n" << el_message <<
'\n';
302 except.clearMessage();
309 if (!alreadyPrinted) {
319 edm::LogInfo(
"Root_Information") << el_location << el_message;
324 void RootErrorHandler(
int level,
bool,
char const*
location,
char const* message) {
329 void set_default_signals() {
330 signal(SIGILL, SIG_DFL);
331 signal(SIGSEGV, SIG_DFL);
332 signal(SIGBUS, SIG_DFL);
334 signal(SIGFPE, SIG_DFL);
335 signal(SIGABRT, SIG_DFL);
338 static int full_write(
int fd,
const char*
text) {
345 if (errno == EINTR) {
357 static int full_read(
int fd,
char* inbuf,
size_t len,
int timeout_s = -1) {
360 ssize_t complete = 0;
361 std::chrono::time_point<std::chrono::steady_clock> end_time =
366 }
else if ((-1 == (
flags = fcntl(
fd, F_GETFL)))) {
375 if (timeout_s >= 0) {
376 struct pollfd poll_info {
381 if (ms_remaining > 0) {
382 int rc = poll(&poll_info, 1, ms_remaining);
385 if (errno == EINTR || errno == EAGAIN) {
397 }
else if (ms_remaining < 0) {
405 if (complete == -1) {
406 if (errno == EINTR) {
408 }
else if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) {
411 int orig_errno = errno;
427 static int full_cerr_write(
const char*
text) {
return full_write(2,
text); }
433 #if defined(SIGRTMAX) 434 #define PAUSE_SIGNAL SIGRTMAX 435 #define RESUME_SIGNAL SIGRTMAX - 1 436 #elif defined(SIGINFO) // macOS/BSD 437 #define PAUSE_SIGNAL SIGINFO 438 #define RESUME_SIGNAL SIGALRM 442 void sig_resume_handler(
int sig, siginfo_t*,
void*) {}
445 void sig_pause_for_stacktrace(
int sig, siginfo_t*,
void*) {
450 sigemptyset(&sigset);
451 sigaddset(&sigset, RESUME_SIGNAL);
452 pthread_sigmask(SIG_UNBLOCK, &sigset,
nullptr);
462 strlcpy(buff,
"\nModule: ", moduleBufferSize);
467 strlcat(buff,
":", moduleBufferSize);
472 strlcat(buff,
"none", moduleBufferSize);
479 void sig_dostack_then_abort(
int sig, siginfo_t*,
void*) {
484 const auto self = pthread_self();
488 struct sigaction act;
489 act.sa_sigaction = sig_pause_for_stacktrace;
491 sigemptyset(&act.sa_mask);
492 sigaction(PAUSE_SIGNAL, &act,
nullptr);
495 sigset_t pausesigset;
496 sigemptyset(&pausesigset);
497 sigaddset(&pausesigset, PAUSE_SIGNAL);
498 sigprocmask(SIG_UNBLOCK, &pausesigset,
nullptr);
501 for (
auto id : tids) {
503 pthread_kill(
id, PAUSE_SIGNAL);
509 act.sa_sigaction = sig_resume_handler;
510 sigaction(RESUME_SIGNAL, &act,
nullptr);
515 const char* signalname =
"unknown";
518 signalname =
"bus error";
522 signalname =
"segmentation violation";
526 signalname =
"illegal instruction";
530 signalname =
"floating point exception";
534 signalname =
"external termination request";
538 signalname =
"abort signal";
544 full_cerr_write(
"\n\nA fatal system signal has occurred: ");
545 full_cerr_write(signalname);
546 full_cerr_write(
"\nThe following is the call stack containing the origin of the signal.\n\n");
555 std::size_t notified = 0;
557 for (
auto id : tids) {
559 if (pthread_kill(
id, RESUME_SIGNAL) == 0)
566 full_cerr_write(
"\nCurrent Modules:\n");
573 if (tids.count(
self) > 0) {
574 char buff[moduleBufferSize] =
"\nModule: ";
579 strlcat(buff,
":", moduleBufferSize);
584 strlcat(buff,
"none", moduleBufferSize);
586 strlcat(buff,
" (crashed)", moduleBufferSize);
587 full_cerr_write(buff);
589 full_cerr_write(
"\nModule: non-CMSSW (crashed)");
597 timespec
t = {0, 1000};
599 nanosleep(&
t,
nullptr);
607 full_cerr_write(
"\n\nA fatal system signal has occurred: ");
608 full_cerr_write(signalname);
609 full_cerr_write(
"\n");
613 if ((sig == SIGILL) || (sig == SIGSEGV) || (sig == SIGBUS) || (sig ==
SIGTERM) || (sig == SIGFPE) ||
615 signal(sig, SIG_DFL);
618 set_default_signals();
623 void sig_abort(
int sig, siginfo_t*,
void*) {
624 full_cerr_write(
"\n\nFatal system signal has occurred during exit\n");
627 signal(sig, SIG_DFL);
631 set_default_signals();
656 int result = full_read(fromParent,
buf, 1);
661 set_default_signals();
663 full_cerr_write(
"\n\nTraceback helper thread failed to read from parent: ");
664 full_cerr_write(strerror(-
result));
665 full_cerr_write(
"\n");
669 set_default_signals();
671 full_write(toParent,
buf);
672 }
else if (
buf[0] ==
'2') {
679 }
else if (
buf[0] ==
'3') {
682 set_default_signals();
684 full_cerr_write(
"\n\nTraceback helper thread got unknown command from parent: ");
685 full_cerr_write(
buf);
686 full_cerr_write(
"\n");
695 full_cerr_write(
"\n\nAttempt to request stacktrace failed: ");
696 full_cerr_write(strerror(-
result));
697 full_cerr_write(
"\n");
703 full_cerr_write(
"\n\nWaiting for stacktrace completion failed: ");
704 if (
result == -ETIMEDOUT) {
705 full_cerr_write(
"timed out waiting for GDB to complete.");
707 full_cerr_write(strerror(-
result));
709 full_cerr_write(
"\n");
715 char child_stack[4 * 1024];
716 char* child_stack_ptr = child_stack + 4 * 1024;
726 if (child_stack_ptr) {
733 full_cerr_write(
"(Attempt to perform stack dump failed.)\n");
736 if (waitpid(pid, &
status, 0) == -1) {
737 full_cerr_write(
"(Failed to wait on stack dump output.)\n");
740 full_cerr_write(
"(GDB stack trace failed unexpectedly)\n");
746 set_default_signals();
753 syscall(SYS_execve,
"/bin/sh",
argv, __environ);
755 execv(
"/bin/sh",
argv);
775 unloadSigHandler_(
pset.getUntrackedParameter<
bool>(
"UnloadRootSigHandler")),
776 resetErrHandler_(
pset.getUntrackedParameter<
bool>(
"ResetRootErrHandler")),
777 loadAllDictionaries_(
pset.getUntrackedParameter<
bool>(
"LoadAllDictionaries")),
778 autoLibraryLoader_(loadAllDictionaries_
or pset.getUntrackedParameter<
bool>(
"AutoLibraryLoader")),
779 autoClassParser_(
pset.getUntrackedParameter<
bool>(
"AutoClassParser")),
780 interactiveDebug_(
pset.getUntrackedParameter<
bool>(
"InteractiveDebug")) {
794 gSystem->ResetSignal(kSigChild);
795 gSystem->ResetSignal(kSigBus);
796 gSystem->ResetSignal(kSigSegmentationViolation);
797 gSystem->ResetSignal(kSigIllegalInstruction);
798 gSystem->ResetSignal(kSigSystem);
799 gSystem->ResetSignal(kSigPipe);
800 gSystem->ResetSignal(kSigAlarm);
801 gSystem->ResetSignal(kSigUrgent);
802 gSystem->ResetSignal(kSigFloatingException);
803 gSystem->ResetSignal(kSigWindowChanged);
804 }
else if (
pset.getUntrackedParameter<
bool>(
"AbortOnSignal")) {
809 gSystem->ResetSignal(kSigBus);
810 gSystem->ResetSignal(kSigSegmentationViolation);
811 gSystem->ResetSignal(kSigIllegalInstruction);
812 gSystem->ResetSignal(kSigFloatingException);
825 signal(SIGABRT, SIG_DFL);
837 SetErrorHandler(RootErrorHandler);
842 gInterpreter->SetClassAutoloading(1);
855 TTree::SetMaxTreeSize(kMaxLong64);
856 TH1::AddDirectory(kFALSE);
873 bool imt =
pset.getUntrackedParameter<
bool>(
"EnableIMT");
874 if (imt && not ROOT::IsImplicitMTEnabled()) {
877 ROOT::EnableImplicitMT(
878 oneapi::tbb::global_control::active_value(oneapi::tbb::global_control::max_allowed_parallelism));
884 TIter iter(gROOT->GetListOfFiles());
885 TObject*
obj =
nullptr;
886 while (
nullptr != (
obj = iter.Next())) {
887 TFile*
f =
dynamic_cast<TFile*
>(
obj);
892 iter = TIter(gROOT->GetListOfFiles());
901 ROOT::EnableThreadSafety();
904 TObject::SetObjectStat(
false);
907 TVirtualStreamerInfo::Optimize(
false);
912 desc.setComment(
"Centralized interface to ROOT.");
913 desc.addUntracked<
bool>(
"UnloadRootSigHandler",
false)
914 ->setComment(
"If True, signals are handled by this service, rather than by ROOT.");
915 desc.addUntracked<
bool>(
"ResetRootErrHandler",
true)
917 "If True, ROOT messages (e.g. errors, warnings) are handled by this service, rather than by ROOT.");
918 desc.addUntracked<
bool>(
"AutoLibraryLoader",
true)
919 ->setComment(
"If True, enables automatic loading of data dictionaries.");
920 desc.addUntracked<
bool>(
"AutoClassParser",
true)
922 "If False, the automatic parsing of class headers for dictionaries when pre-built dictionaries are " 923 "missing is disable during module construction. The current implementation of disabling the parsing is " 924 "fragile, and may work only in a single-thread job that does not use reco::parser::cutParser() or " 925 "reco::parser::expressionParser() (and it certainly does not work on multiple threads).");
926 desc.addUntracked<
bool>(
"LoadAllDictionaries",
false)->setComment(
"If True, loads all ROOT dictionaries.");
927 desc.addUntracked<
bool>(
"EnableIMT",
true)->setComment(
"If True, calls ROOT::EnableImplicitMT().");
928 desc.addUntracked<
bool>(
"AbortOnSignal",
true)
930 "If True, do an abort when a signal occurs that causes a crash. If False, ROOT will do an exit which " 931 "attempts to do a clean shutdown.");
932 desc.addUntracked<
bool>(
"InteractiveDebug",
false)
934 "If True, leave gdb attached to cmsRun after a crash; " 935 "if False, attach gdb, print a stack trace, and quit gdb");
936 desc.addUntracked<
int>(
"DebugLevel", 0)->setComment(
"Sets ROOT's gDebug value.");
937 desc.addUntracked<
int>(
"StackTracePauseTime", 300)
938 ->setComment(
"Seconds to pause other threads during stack trace.");
939 descriptions.
add(
"InitRootHandlers",
desc);
961 "set pagination no\n" 962 "thread apply all bt\n" 964 "/bin/sed -n -e 's/^\\((gdb) \\)*//' -e '/^#/p' -e '/^Thread/p'";
967 std::ostringstream sstr;
968 sstr <<
"Unable to pre-allocate stacktrace handler information";
986 std::ostringstream sstr;
987 sstr <<
"Failed to create child-to-parent pipes (errno=" << errno <<
"): " << strerror(errno);
997 std::ostringstream sstr;
998 sstr <<
"Failed to create child-to-parent pipes (errno=" << errno <<
"): " << strerror(errno);
void on_scheduler_exit(bool) override
void watchPostModuleConstruction(PostModuleConstruction::slot_type const &iSlot)
static constexpr char dashC[]
edm::serviceregistry::AllArgsMaker< edm::RootHandlers, InitRootHandlers > RootHandlersMaker
bool loadAllDictionaries_
void enableWarnings_() override
static void cmssw_stacktrace_fork()
#define DEFINE_FWK_SERVICE_MAKER(concrete, maker)
void watchPreallocate(Preallocate::slot_type const &iSlot)
void setRefCoreStreamerInTClass()
static void stacktraceFromThread()
void watchPostEndJob(PostEndJob::slot_type const &iSlot)
Container_type threadIDs_
void watchPreModuleConstruction(PreModuleConstruction::slot_type const &iSlot)
std::vector< T >::const_iterator search(const cond::Time_t &val, const std::vector< T > &container)
oneapi::tbb::concurrent_unordered_set< pthread_t > Container_type
bool isProcessWideService(TFileService const *)
static ModuleCallingContext const * getCurrentModuleOnThread()
static int stackTracePause_
void installCustomHandler(int signum, CFUNC func)
std::shared_ptr< const void > sigSegvHandler_
Log< level::Error, false > LogError
std::shared_ptr< const void > sigFpeHandler_
friend int cmssw_stacktrace(void *)
void ignoreWarnings_(edm::RootHandlers::SeverityLevel level) override
std::shared_ptr< const void > sigBusHandler_
~ThreadTracker() override=default
static TypeWithDict byName(std::string const &name)
static std::atomic< std::size_t > doneModules_
static const ThreadTracker::Container_type & threadIDs()
std::shared_ptr< const void > sigAbrtHandler_
static std::atomic< std::size_t > nextModule_
static char pidString_[pidStringLength_]
static char const *const * getPstackArgv()
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventID const &, edm::Timestamp const & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
std::shared_ptr< const void > sigIllHandler_
static int childToParent_[2]
std::shared_ptr< const void > sigTermHandler_
void addAdditionalInfo(std::string const &info)
int cmssw_stacktrace(void *)
static std::unique_ptr< std::thread > helperThread_
static std::vector< std::array< char, moduleBufferSize > > moduleListBuffers_
static std::unique_ptr< ThreadTracker > threadTracker_
Log< level::Info, false > LogInfo
static constexpr int pidStringLength_
InitRootHandlers(ParameterSet const &pset, ActivityRegistry &iReg)
static char const *const pstackArgv_[]
void add(std::string const &label, ParameterSetDescription const &psetDescription)
TEveGeoShape * clone(const TEveElement *element, TEveElement *parent)
static int parentToChild_[2]
unsigned int maxNumberOfThreads() const
std::string moduleName(StableProvenance const &provenance, ProcessHistory const &history)
void willBeUsingThreads() override
char data[epos_bytes_allocation]
static void fillDescriptions(ConfigurationDescriptions &descriptions)
~InitRootHandlers() override
const Container_type & IDs()
static int stackTracePause()
static void stacktraceHelperThread()
bool hasDictionary(std::type_info const &)
Log< level::Warning, false > LogWarning
static constexpr char pstackName[]
void on_scheduler_entry(bool) override