CMS 3D CMS Logo

FastMonitoringService.cc
Go to the documentation of this file.
3 
15 
19 
22 
23 #include <iostream>
24 #include <iomanip>
25 #include <sys/time.h>
26 
27 using namespace jsoncollector;
28 
// Scale factor applied to (accumulated size / elapsed microseconds) when the
// per-lumisection throughput is computed: 10^6 divided by 2^20.
constexpr double throughputFactor() {
  constexpr double microsecondsPerSecond = 1000000;
  constexpr double sizeDivisor = 1024 * 1024;
  return microsecondsPerSecond / sizeDivisor;
}
30 
31 namespace evf {
32 
// Placeholder module descriptions for the reserved (framework-level) microstates.
// Each entry is an edm::ModuleDescription built from the module name "Dummy" and
// the state's label. The array is declared with FastMonState::mCOUNT slots and is
// indexed with FastMonState::m* values elsewhere in this file (mInvalid, mIdle,
// mBoL, mEoL, ...). The *addresses* of these entries are stored as the current
// microstate of a stream and are registered with the module encoder as reserved
// entries, so the order here presumably has to match the FastMonState microstate
// enum -- confirm against its definition.
33  const edm::ModuleDescription FastMonitoringService::reservedMicroStateNames[FastMonState::mCOUNT] = {
34  edm::ModuleDescription("Dummy", "Invalid"),
35  edm::ModuleDescription("Dummy", "Idle"),
36  edm::ModuleDescription("Dummy", "FwkOvhSrc"),
37  edm::ModuleDescription("Dummy", "FwkOvhMod"), //set post produce, analyze or filter
38  edm::ModuleDescription("Dummy", "FwkEoL"),
39  edm::ModuleDescription("Dummy", "Input"),
40  edm::ModuleDescription("Dummy", "DQM"),
41  edm::ModuleDescription("Dummy", "BoL"),
42  edm::ModuleDescription("Dummy", "EoL"),
43  edm::ModuleDescription("Dummy", "GlobalEoL")};
44 
// Human-readable names of the macrostates; the array is declared with
// FastMonState::MCOUNT slots. The labels line up with the FastMonState::s*
// values assigned to macrostate_ in this file (sInit, sJobReady, sRunning,
// sJobEnded); presumably the order follows the macrostate enum -- confirm
// against the FastMonState definition before reordering or inserting entries.
45  const std::string FastMonitoringService::macroStateNames[FastMonState::MCOUNT] = {"Init",
46  "JobReady",
47  "RunGiven",
48  "Running",
49  "Stopping",
50  "Done",
51  "JobEnded",
52  "Error",
53  "ErrorEnded",
54  "End",
55  "Invalid"};
56 
// Human-readable names of the input-source states; the array is declared with
// FastMonState::inCOUNT slots. The input legend JSON written by this service
// appends all inCOUNT entries, in this order, under the "names" key, so the
// position of each string is what downstream readers of that legend see.
// The "WaitInput_*" and "WaitChunk_*" groups mirror the "Sup*" supervisor
// states: doSnapshot() reports one of them when the input state is
// inWaitInput / inWaitChunk, selected by the current supervisor state.
// Presumably the order must match the FastMonState input-state enum --
// confirm against its definition before editing.
57  const std::string FastMonitoringService::inputStateNames[FastMonState::inCOUNT] = {
58  "Ignore",
59  "Init",
60  "WaitInput",
61  "NewLumi",
62  "NewLumiBusyEndingLS",
63  "NewLumiIdleEndingLS",
64  "RunEnd",
65  "ProcessingFile",
66  "WaitChunk",
67  "ChunkReceived",
68  "ChecksumEvent",
69  "CachedEvent",
70  "ReadEvent",
71  "ReadCleanup",
72  "NoRequest",
73  "NoRequestWithIdleThreads",
74  "NoRequestWithGlobalEoL",
75  "NoRequestWithEoLThreads",
76  "SupFileLimit",
77  "SupWaitFreeChunk",
78  "SupWaitFreeChunkCopying",
79  "SupWaitFreeThread",
80  "SupWaitFreeThreadCopying",
81  "SupBusy",
82  "SupLockPolling",
83  "SupLockPollingCopying",
84  "SupNoFile",
85  "SupNewFile",
86  "SupNewFileWaitThreadCopying",
87  "SupNewFileWaitThread",
88  "SupNewFileWaitChunkCopying",
89  "SupNewFileWaitChunk",
90  "WaitInput_fileLimit",
91  "WaitInput_waitFreeChunk",
92  "WaitInput_waitFreeChunkCopying",
93  "WaitInput_waitFreeThread",
94  "WaitInput_waitFreeThreadCopying",
95  "WaitInput_busy",
96  "WaitInput_lockPolling",
97  "WaitInput_lockPollingCopying",
98  "WaitInput_runEnd",
99  "WaitInput_noFile",
100  "WaitInput_newFile",
101  "WaitInput_newFileWaitThreadCopying",
102  "WaitInput_newFileWaitThread",
103  "WaitInput_newFileWaitChunkCopying",
104  "WaitInput_newFileWaitChunk",
105  "WaitChunk_fileLimit",
106  "WaitChunk_waitFreeChunk",
107  "WaitChunk_waitFreeChunkCopying",
108  "WaitChunk_waitFreeThread",
109  "WaitChunk_waitFreeThreadCopying",
110  "WaitChunk_busy",
111  "WaitChunk_lockPolling",
112  "WaitChunk_lockPollingCopying",
113  "WaitChunk_runEnd",
114  "WaitChunk_noFile",
115  "WaitChunk_newFile",
116  "WaitChunk_newFileWaitThreadCopying",
117  "WaitChunk_newFileWaitThread",
118  "WaitChunk_newFileWaitChunkCopying",
119  "WaitChunk_newFileWaitChunk",
120  "inSupThrottled",
121  "inThrottled"};
122 
// Placeholder path name. Its address is stored as a stream's ministate (current
// path) at initialization, at stream begin/end lumi and after each event, and it
// is the first entry registered with every per-stream path encoder.
123  const std::string FastMonitoringService::nopath_ = "NoPath";
124 
126  : MicroStateService(iPS, reg),
127  fmt_(new FastMonitoringThread()),
128  nStreams_(0) //until initialized
129  ,
130  sleepTime_(iPS.getUntrackedParameter<int>("sleepTime", 1)),
131  fastMonIntervals_(iPS.getUntrackedParameter<unsigned int>("fastMonIntervals", 2)),
132  fastName_("fastmoni"),
133  slowName_("slowmoni"),
134  filePerFwkStream_(iPS.getUntrackedParameter<bool>("filePerFwkStream", false)),
135  totalEventsProcessed_(0) {
136  reg.watchPreallocate(this, &FastMonitoringService::preallocate); //receiving information on number of threads
138 
143 
147 
152 
154 
155  reg.watchPreEvent(this, &FastMonitoringService::preEvent); //stream
157 
158  reg.watchPreSourceEvent(this, &FastMonitoringService::preSourceEvent); //source (with streamID of requestor)
160 
163 
166 
170 
171  //find microstate definition path (required by the module)
172  struct stat statbuf;
173  std::string microstateBaseSuffix = "src/EventFilter/Utilities/plugins/microstatedef.jsd";
174  std::string microstatePath = std::string(std::getenv("CMSSW_BASE")) + "/" + microstateBaseSuffix;
175  if (stat(microstatePath.c_str(), &statbuf)) {
176  microstatePath = std::string(std::getenv("CMSSW_RELEASE_BASE")) + "/" + microstateBaseSuffix;
177  if (stat(microstatePath.c_str(), &statbuf)) {
178  microstatePath = microstateBaseSuffix;
179  if (stat(microstatePath.c_str(), &statbuf))
180  throw cms::Exception("FastMonitoringService") << "microstate definition file not found";
181  }
182  }
183  fastMicrostateDefPath_ = microstateDefPath_ = microstatePath;
184  }
185 
187 
190  desc.setComment("Service for File-based DAQ monitoring and event accounting");
191  desc.addUntracked<int>("sleepTime", 1)->setComment("Sleep time of the monitoring thread");
192  desc.addUntracked<unsigned int>("fastMonIntervals", 2)
193  ->setComment("Modulo of sleepTime intervals on which fastmon file is written out");
194  desc.addUntracked<bool>("filePerFwkStream", false)
195  ->setComment("Switches on monitoring output per framework stream");
196  desc.setAllowAnything();
197  descriptions.add("FastMonitoringService", desc);
198  }
199 
201  Json::Value legendaVector(Json::arrayValue);
202  for (int i = 0; i < fmt_->m_data.encPath_[0].current_; i++)
203  legendaVector.append(Json::Value(*(static_cast<const std::string*>(fmt_->m_data.encPath_[0].decode(i)))));
204  Json::Value valReserved(nReservedPaths);
205  Json::Value pathLegend;
206  pathLegend["names"] = legendaVector;
207  pathLegend["reserved"] = valReserved;
209  return writer.write(pathLegend);
210  }
211 
213  Json::Value legendaVector(Json::arrayValue);
214  for (int i = 0; i < fmt_->m_data.encModule_.current_; i++)
215  legendaVector.append(
216  Json::Value((static_cast<const edm::ModuleDescription*>(fmt_->m_data.encModule_.decode(i)))->moduleLabel()));
217  //duplicate modules adding a list for acquire states (not all modules actually have it)
218  for (int i = 0; i < fmt_->m_data.encModule_.current_; i++)
219  legendaVector.append(Json::Value(
220  (static_cast<const edm::ModuleDescription*>(fmt_->m_data.encModule_.decode(i)))->moduleLabel() + "__ACQ"));
221  Json::Value valReserved(nReservedModules);
222  Json::Value valSpecial(nSpecialModules);
223  Json::Value valOutputModules(nOutputModules_);
224  Json::Value moduleLegend;
225  moduleLegend["names"] = legendaVector;
226  moduleLegend["reserved"] = valReserved;
227  moduleLegend["special"] = valSpecial;
228  moduleLegend["output"] = valOutputModules;
230  return writer.write(moduleLegend);
231  }
232 
234  Json::Value legendaVector(Json::arrayValue);
235  for (int i = 0; i < FastMonState::inCOUNT; i++)
236  legendaVector.append(Json::Value(inputStateNames[i]));
237  Json::Value moduleLegend;
238  moduleLegend["names"] = legendaVector;
240  return writer.write(moduleLegend);
241  }
242 
244  nStreams_ = bounds.maxNumberOfStreams();
245  nThreads_ = bounds.maxNumberOfThreads();
246  //this should already be >=1
247  if (nStreams_ == 0)
248  nStreams_ = 1;
249  if (nThreads_ == 0)
250  nThreads_ = 1;
251  }
252 
254  edm::ProcessContext const& pc) {
255  // FIND RUN DIRECTORY
256  // The run dir should be set via the configuration of EvFDaqDirector
257 
258  if (edm::Service<evf::EvFDaqDirector>().operator->() == nullptr) {
259  throw cms::Exception("FastMonitoringService") << "EvFDaqDirector is not present";
260  }
261  std::filesystem::path runDirectory{edm::Service<evf::EvFDaqDirector>()->baseRunDir()};
262  workingDirectory_ = runDirectory_ = runDirectory;
263  workingDirectory_ /= "mon";
264 
265  if (!std::filesystem::is_directory(workingDirectory_)) {
266  LogDebug("FastMonitoringService") << "<MON> DIR NOT FOUND! Trying to create -: " << workingDirectory_.string();
267  std::filesystem::create_directories(workingDirectory_);
268  if (!std::filesystem::is_directory(workingDirectory_))
269  edm::LogWarning("FastMonitoringService") << "Unable to create <MON> DIR -: " << workingDirectory_.string()
270  << ". No monitoring data will be written.";
271  }
272 
273  std::ostringstream fastFileName;
274 
275  fastFileName << fastName_ << "_pid" << std::setfill('0') << std::setw(5) << getpid() << ".fast";
277  fast /= fastFileName.str();
278  fastPath_ = fast.string();
279  if (filePerFwkStream_)
280  for (unsigned int i = 0; i < nStreams_; i++) {
281  std::ostringstream fastFileNameTid;
282  fastFileNameTid << fastName_ << "_pid" << std::setfill('0') << std::setw(5) << getpid() << "_tid" << i
283  << ".fast";
285  fastTid /= fastFileNameTid.str();
286  fastPathList_.push_back(fastTid.string());
287  }
288 
289  std::ostringstream moduleLegFile;
290  std::ostringstream moduleLegFileJson;
291  moduleLegFile << "microstatelegend_pid" << std::setfill('0') << std::setw(5) << getpid() << ".leg";
292  moduleLegFileJson << "microstatelegend_pid" << std::setfill('0') << std::setw(5) << getpid() << ".jsn";
293  moduleLegendFile_ = (workingDirectory_ / moduleLegFile.str()).string();
294  moduleLegendFileJson_ = (workingDirectory_ / moduleLegFileJson.str()).string();
295 
296  std::ostringstream pathLegFile;
297  std::ostringstream pathLegFileJson;
298  pathLegFile << "pathlegend_pid" << std::setfill('0') << std::setw(5) << getpid() << ".leg";
299  pathLegendFile_ = (workingDirectory_ / pathLegFile.str()).string();
300  pathLegFileJson << "pathlegend_pid" << std::setfill('0') << std::setw(5) << getpid() << ".jsn";
301  pathLegendFileJson_ = (workingDirectory_ / pathLegFileJson.str()).string();
302 
303  std::ostringstream inputLegFileJson;
304  inputLegFileJson << "inputlegend_pid" << std::setfill('0') << std::setw(5) << getpid() << ".jsn";
305  inputLegendFileJson_ = (workingDirectory_ / inputLegFileJson.str()).string();
306 
307  LogDebug("FastMonitoringService") << "Initializing FastMonitor with microstate def path -: " << microstateDefPath_;
308  //<< encPath_.current_ + 1 << " " << encModule_.current_ + 1
309 
310  /*
311  * initialize the fast monitor with:
312  * vector of pointers to monitorable parameters
313  * path to definition
314  *
315  */
316 
317  fmt_->m_data.macrostate_ = FastMonState::sInit;
318 
319  for (unsigned int i = 0; i < (FastMonState::mCOUNT); i++)
320  fmt_->m_data.encModule_.updateReserved(static_cast<const void*>(reservedMicroStateNames + i));
321  fmt_->m_data.encModule_.completeReservedWithDummies();
322 
323  for (unsigned int i = 0; i < nStreams_; i++) {
324  fmt_->m_data.ministate_.emplace_back(&nopath_);
325  fmt_->m_data.microstate_.emplace_back(&reservedMicroStateNames[FastMonState::mInvalid]);
326  fmt_->m_data.microstateAcqFlag_.push_back(0);
327 
328  //for synchronization
329  streamCounterUpdating_.push_back(new std::atomic<bool>(false));
330 
331  //path (mini) state
332  fmt_->m_data.encPath_.emplace_back(0);
333  fmt_->m_data.encPath_[i].update(static_cast<const void*>(&nopath_));
334 
335  for (auto& path : pathsInfo.paths()) {
336  fmt_->m_data.encPath_[i].updatePreinit(path);
337  }
338  for (auto& endPath : pathsInfo.endPaths()) {
339  fmt_->m_data.encPath_[i].updatePreinit(endPath);
340  }
341  }
342  //for (unsigned int i=0;i<nThreads_;i++)
343  // threadMicrostate_.push_back(&reservedMicroStateNames[mInvalid]);
344 
345  //initial size until we detect number of bins
346  fmt_->m_data.macrostateBins_ = FastMonState::MCOUNT;
347  fmt_->m_data.microstateBins_ = 0;
348  fmt_->m_data.inputstateBins_ = FastMonState::inCOUNT;
349  fmt_->m_data.ministateBins_ = fmt_->m_data.encPath_[0].vecsize();
350 
351  lastGlobalLumi_ = 0;
352  isInitTransition_ = true;
353  lumiFromSource_ = 0;
354 
355  //startup monitoring
356  fmt_->resetFastMonitor(microstateDefPath_, fastMicrostateDefPath_);
357  fmt_->jsonMonitor_->setNStreams(nStreams_);
358  fmt_->m_data.registerVariables(fmt_->jsonMonitor_.get(), nStreams_, threadIDAvailable_ ? nThreads_ : 0);
359  monInit_.store(false, std::memory_order_release);
360  if (sleepTime_ > 0)
362 
363  //this definition needs: #include "tbb/compat/thread"
364  //however this would result in the TBB implementation replacing std::thread
365  //(both supposedly call pthread_self())
366  //number of threads created in process could be obtained from /proc,
367  //assuming that all posix threads are true kernel threads capable of running in parallel
368 
369  //#if TBB_IMPLEMENT_CPP0X
371  //threadIDAvailable_=true;
372  //#endif
373  }
374 
378  context = " FromThisContext ";
380  context = " FromAnotherContext";
382  context = " FromExternalSignal";
383  edm::LogWarning("FastMonitoringService")
384  << " STREAM " << sc.streamID().value() << " earlyTermination -: ID:" << sc.eventID()
385  << " LS:" << sc.eventID().luminosityBlock() << " " << context;
386  std::lock_guard<std::mutex> lock(fmt_->monlock_);
387  exceptionInLS_.push_back(sc.eventID().luminosityBlock());
388  has_data_exception_.store(true);
389  }
390 
394  context = " FromThisContext ";
396  context = " FromAnotherContext";
398  context = " FromExternalSignal";
399  edm::LogWarning("FastMonitoringService")
400  << " GLOBAL "
401  << "earlyTermination -: LS:" << gc.luminosityBlockID().luminosityBlock() << " " << context;
402  std::lock_guard<std::mutex> lock(fmt_->monlock_);
404  has_data_exception_.store(true);
405  }
406 
410  context = " FromThisContext ";
412  context = " FromAnotherContext";
414  context = " FromExternalSignal";
415  edm::LogWarning("FastMonitoringService") << " SOURCE "
416  << "earlyTermination -: " << context;
417  std::lock_guard<std::mutex> lock(fmt_->monlock_);
418  exception_detected_ = true;
419  has_source_exception_.store(true);
420  has_data_exception_.store(true);
421  }
422 
424  std::lock_guard<std::mutex> lock(fmt_->monlock_);
425  if (!ls)
426  exception_detected_ = true;
427  else
428  exceptionInLS_.push_back(ls);
429  }
430 
432  return has_source_exception_.load() || has_data_exception_.load();
433  }
434 
436  if (!has_data_exception_.load())
437  return false;
438  if (has_source_exception_.load())
439  return true;
440  std::lock_guard<std::mutex> lock(fmt_->monlock_);
441  for (auto ex : exceptionInLS_) {
442  if (ls == ex)
443  return true;
444  }
445  return false;
446  }
447 
449 
450  //new output module name is stream
452  std::lock_guard<std::mutex> lock(fmt_->monlock_);
453  //std::cout << " Pre module Begin Job module: " << desc.moduleName() << std::endl;
454 
455  //build a map of modules keyed by their module description address
456  //here we need to treat output modules in a special way so they can be easily singled out
457  if (desc.moduleName() == "Stream" || desc.moduleName() == "GlobalEvFOutputModule" ||
458  desc.moduleName() == "EvFOutputModule" || desc.moduleName() == "EventStreamFileWriter" ||
459  desc.moduleName() == "PoolOutputModule") {
460  fmt_->m_data.encModule_.updateReserved((void*)&desc);
461  nOutputModules_++;
462  } else
463  fmt_->m_data.encModule_.update((void*)&desc);
464  }
465 
467  std::string&& moduleLegStrJson = makeModuleLegendaJson();
468  FileIO::writeStringToFile(moduleLegendFileJson_, moduleLegStrJson);
469 
470  std::string inputLegendStrJson = makeInputLegendaJson();
471  FileIO::writeStringToFile(inputLegendFileJson_, inputLegendStrJson);
472 
473  std::string pathLegendStrJson = makePathLegendaJson();
474  FileIO::writeStringToFile(pathLegendFileJson_, pathLegendStrJson);
475 
476  fmt_->m_data.macrostate_ = FastMonState::sJobReady;
477 
478  //update number of entries in module histogram
479  std::lock_guard<std::mutex> lock(fmt_->monlock_);
480  //double the size to add post-acquire states
481  fmt_->m_data.microstateBins_ = fmt_->m_data.encModule_.vecsize() * 2;
482  }
483 
485  fmt_->m_data.macrostate_ = FastMonState::sJobEnded;
486  fmt_->stop();
487  }
488 
490  fmt_->m_data.macrostate_ = FastMonState::sRunning;
491  isInitTransition_ = false;
492  }
493 
495  timeval lumiStartTime;
496  gettimeofday(&lumiStartTime, nullptr);
497  unsigned int newLumi = gc.luminosityBlockID().luminosityBlock();
498  lastGlobalLumi_ = newLumi;
499 
500  std::lock_guard<std::mutex> lock(fmt_->monlock_);
501  lumiStartTime_[newLumi] = lumiStartTime;
502  }
503 
505  unsigned int lumi = gc.luminosityBlockID().luminosityBlock();
506  LogDebug("FastMonitoringService") << "Lumi ended. Writing JSON information. LUMI -: " << lumi;
507  timeval lumiStopTime;
508  gettimeofday(&lumiStopTime, nullptr);
509 
510  std::lock_guard<std::mutex> lock(fmt_->monlock_);
511 
512  // Compute throughput
513  timeval stt = lumiStartTime_[lumi];
514  lumiStartTime_.erase(lumi);
515  unsigned long usecondsForLumi = (lumiStopTime.tv_sec - stt.tv_sec) * 1000000 + (lumiStopTime.tv_usec - stt.tv_usec);
516  unsigned long accuSize = accuSize_.find(lumi) == accuSize_.end() ? 0 : accuSize_[lumi];
517  accuSize_.erase(lumi);
518  double throughput = throughputFactor() * double(accuSize) / double(usecondsForLumi);
519  //store to registered variable
520  fmt_->m_data.fastThroughputJ_.value() = throughput;
521 
522  //update
523  doSnapshot(lumi, true);
524 
525  //retrieve one result we need (todo: sanity check if it's found)
526  IntJ* lumiProcessedJptr = dynamic_cast<IntJ*>(fmt_->jsonMonitor_->getMergedIntJForLumi("Processed", lumi));
527  if (!lumiProcessedJptr)
528  throw cms::Exception("FastMonitoringService") << "Internal error: got null pointer from FastMonitor";
529  processedEventsPerLumi_[lumi] = std::pair<unsigned int, bool>(lumiProcessedJptr->value(), false);
530 
531  //checking if exception has been thrown (in case of Global/Stream early termination, for this LS)
532  bool exception_detected = exception_detected_;
533  for (auto ex : exceptionInLS_)
534  if (lumi == ex)
535  exception_detected = true;
536 
537  if (edm::shutdown_flag || exception_detected) {
538  edm::LogInfo("FastMonitoringService")
539  << "Run interrupted. Skip writing EoL information -: " << processedEventsPerLumi_[lumi].first
540  << " events were processed in LUMI " << lumi;
541  //this will prevent output modules from producing json file for possibly incomplete lumi
542  processedEventsPerLumi_[lumi].first = 0;
543  processedEventsPerLumi_[lumi].second = true;
544  //disable this exception, so service can be used standalone (will be thrown if output module asks for this information)
545  //throw cms::Exception("FastMonitoringService") << "SOURCE did not send update for lumi block. LUMI -:" << lumi;
546  return;
547  }
548 
549  if (inputSource_) {
550  auto sourceReport = inputSource_->getEventReport(lumi, true);
551  if (sourceReport.first) {
552  if (sourceReport.second != processedEventsPerLumi_[lumi].first) {
553  throw cms::Exception("FastMonitoringService") << "MISMATCH with SOURCE update. LUMI -: " << lumi
554  << ", events(processed):" << processedEventsPerLumi_[lumi].first
555  << " events(source):" << sourceReport.second;
556  }
557  }
558  }
559  edm::LogInfo("FastMonitoringService")
560  << "Statistics for lumisection -: lumi = " << lumi << " events = " << lumiProcessedJptr->value()
561  << " time = " << usecondsForLumi / 1000000 << " size = " << accuSize << " thr = " << throughput;
562  delete lumiProcessedJptr;
563 
564  //full global and stream merge&output for this lumi
565 
566  // create file name for slow monitoring file
567  bool output = sleepTime_ > 0;
568  if (filePerFwkStream_) {
569  std::stringstream slowFileNameStem;
570  slowFileNameStem << slowName_ << "_ls" << std::setfill('0') << std::setw(4) << lumi << "_pid" << std::setfill('0')
571  << std::setw(5) << getpid();
573  slow /= slowFileNameStem.str();
574  fmt_->jsonMonitor_->outputFullJSONs(slow.string(), ".jsn", lumi, output);
575  } else {
576  std::stringstream slowFileName;
577  slowFileName << slowName_ << "_ls" << std::setfill('0') << std::setw(4) << lumi << "_pid" << std::setfill('0')
578  << std::setw(5) << getpid() << ".jsn";
580  slow /= slowFileName.str();
581  //full global and stream merge and JSON write for this lumi
582  fmt_->jsonMonitor_->outputFullJSON(slow.string(), lumi, output);
583  }
584  fmt_->jsonMonitor_->discardCollected(lumi); //we don't do further updates for this lumi
585  }
586 
588  std::lock_guard<std::mutex> lock(fmt_->monlock_);
589  unsigned int lumi = gc.luminosityBlockID().luminosityBlock();
590  //LS monitoring snapshot with input source data has been taken in previous callback
591  avgLeadTime_.erase(lumi);
593  lockStatsDuringLumi_.erase(lumi);
594 
595  //output module already used this in end lumi (this could be migrated to EvFDaqDirector as it is essential for FFF bookkeeping)
597  }
598 
600  unsigned int sid = sc.streamID().value();
601 
602  std::lock_guard<std::mutex> lock(fmt_->monlock_);
603  fmt_->m_data.streamLumi_[sid] = sc.eventID().luminosityBlock();
604 
605  //reset collected values for this stream
606  *(fmt_->m_data.processed_[sid]) = 0;
607 
608  fmt_->m_data.ministate_[sid] = &nopath_;
609  fmt_->m_data.microstate_[sid] = &reservedMicroStateNames[FastMonState::mBoL];
610  }
611 
613  fmt_->m_data.microstate_[sc.streamID().value()] = &reservedMicroStateNames[FastMonState::mIdle];
614  }
615 
617  unsigned int sid = sc.streamID().value();
618  std::lock_guard<std::mutex> lock(fmt_->monlock_);
619 
620  //update processed count to be complete at this time
621  //doStreamEOLSnapshot(sc.eventID().luminosityBlock(), sid);
622  fmt_->jsonMonitor_->snapStreamAtomic(sc.eventID().luminosityBlock(), sid);
623  //reset this in case stream does not get notified of next lumi (we keep processed events only)
624  fmt_->m_data.ministate_[sid] = &nopath_;
625  fmt_->m_data.microstate_[sid] = &reservedMicroStateNames[FastMonState::mEoL];
626  }
628  fmt_->m_data.microstate_[sc.streamID().value()] = &reservedMicroStateNames[FastMonState::mFwkEoL];
629  }
630 
632  fmt_->m_data.ministate_[sc.streamID()] = &(pc.pathName());
633  }
634 
636 
638  fmt_->m_data.microstate_[sc.streamID()] = &reservedMicroStateNames[FastMonState::mIdle];
639 
640  fmt_->m_data.ministate_[sc.streamID()] = &nopath_;
641 
642  (*(fmt_->m_data.processed_[sc.streamID()]))++;
643 
644  //fast path counter (events accumulated in a run)
645  unsigned long res = totalEventsProcessed_.fetch_add(1, std::memory_order_relaxed);
646  fmt_->m_data.fastPathProcessedJ_ = res + 1;
647  }
648 
650  fmt_->m_data.microstate_[sid.value()] = &reservedMicroStateNames[FastMonState::mInput];
651  }
652 
654  fmt_->m_data.microstate_[sid.value()] = &reservedMicroStateNames[FastMonState::mFwkOvhSrc];
655  }
656 
658  edm::ModuleCallingContext const& mcc) {
659  fmt_->m_data.microstate_[sc.streamID().value()] = (void*)(mcc.moduleDescription());
660  }
661 
663  edm::ModuleCallingContext const& mcc) {
664  //fmt_->m_data.microstate_[sc.streamID().value()] = (void*)(mcc.moduleDescription());
665  fmt_->m_data.microstateAcqFlag_[sc.streamID().value()] = 1;
666  }
667 
669  fmt_->m_data.microstate_[sc.streamID().value()] = (void*)(mcc.moduleDescription());
670  fmt_->m_data.microstateAcqFlag_[sc.streamID().value()] = 0;
671  }
672 
674  fmt_->m_data.microstate_[sc.streamID().value()] = &reservedMicroStateNames[FastMonState::mFwkOvhMod];
675  }
676 
677  //FUNCTIONS CALLED FROM OUTSIDE
678 
679  //this is for old-fashioned service that is not thread safe and can block other streams
680  //(we assume the worst case - everything is blocked)
682  for (unsigned int i = 0; i < nStreams_; i++)
683  fmt_->m_data.microstate_[i] = &reservedMicroStateNames[m];
684  }
685 
686  //this is for services that are multithreading-enabled or rarely block other streams
688  fmt_->m_data.microstate_[sid] = &reservedMicroStateNames[m];
689  }
690 
691  //from source
692  void FastMonitoringService::accumulateFileSize(unsigned int lumi, unsigned long fileSize) {
693  std::lock_guard<std::mutex> lock(fmt_->monlock_);
694 
695  if (accuSize_.find(lumi) == accuSize_.end())
696  accuSize_[lumi] = fileSize;
697  else
698  accuSize_[lumi] += fileSize;
699 
702  else
704  }
705 
707  gettimeofday(&fileLookStart_, nullptr);
708  /*
709  std::cout << "Started looking for .raw file at: s=" << fileLookStart_.tv_sec << ": ms = "
710  << fileLookStart_.tv_usec / 1000.0 << std::endl;
711  */
712  }
713 
715  gettimeofday(&fileLookStop_, nullptr);
716  /*
717  std::cout << "Stopped looking for .raw file at: s=" << fileLookStop_.tv_sec << ": ms = "
718  << fileLookStop_.tv_usec / 1000.0 << std::endl;
719  */
720  std::lock_guard<std::mutex> lock(fmt_->monlock_);
721 
722  if (lumi > lumiFromSource_) {
724  leadTimes_.clear();
725  }
726  unsigned long elapsedTime = (fileLookStop_.tv_sec - fileLookStart_.tv_sec) * 1000000 // sec to us
727  + (fileLookStop_.tv_usec - fileLookStart_.tv_usec); // us
728  // add this to lead times for this lumi
729  leadTimes_.push_back((double)elapsedTime);
730 
731  // recompute average lead time for this lumi
732  if (leadTimes_.size() == 1)
734  else {
735  double totTime = 0;
736  for (unsigned int i = 0; i < leadTimes_.size(); i++)
737  totTime += leadTimes_[i];
738  avgLeadTime_[lumi] = 0.001 * (totTime / leadTimes_.size());
739  }
740  }
741 
742  void FastMonitoringService::reportLockWait(unsigned int ls, double waitTime, unsigned int lockCount) {
743  std::lock_guard<std::mutex> lock(fmt_->monlock_);
744  lockStatsDuringLumi_[ls] = std::pair<double, unsigned int>(waitTime, lockCount);
745  }
746 
747  //for the output module
748  unsigned int FastMonitoringService::getEventsProcessedForLumi(unsigned int lumi, bool* abortFlag) {
749  std::lock_guard<std::mutex> lock(fmt_->monlock_);
750 
751  auto it = processedEventsPerLumi_.find(lumi);
752  if (it != processedEventsPerLumi_.end()) {
753  unsigned int proc = it->second.first;
754  if (abortFlag)
755  *abortFlag = it->second.second;
756  return proc;
757  } else {
758  throw cms::Exception("FastMonitoringService")
759  << "output module wants already deleted (or never reported by SOURCE) lumisection event count for LUMI -: "
760  << lumi;
761  return 0;
762  }
763  }
764 
765  //for the output module
767  std::lock_guard<std::mutex> lock(fmt_->monlock_);
768 
769  auto it = processedEventsPerLumi_.find(lumi);
770  if (it != processedEventsPerLumi_.end()) {
771  unsigned int abortFlag = it->second.second;
772  return abortFlag;
773  } else {
774  throw cms::Exception("FastMonitoringService")
775  << "output module wants already deleted (or never reported by SOURCE) lumisection status for LUMI -: "
776  << lumi;
777  return false;
778  }
779  }
780 
781  // the function to be called in the thread. Thread completes when function returns.
783  monInit_.exchange(true, std::memory_order_acquire);
784  while (!fmt_->m_stoprequest) {
785  std::vector<std::vector<unsigned int>> lastEnc;
786  {
787  std::unique_lock<std::mutex> lock(fmt_->monlock_);
788 
789  doSnapshot(lastGlobalLumi_, false);
790 
791  lastEnc.emplace_back(fmt_->m_data.ministateEncoded_);
792  lastEnc.emplace_back(fmt_->m_data.microstateEncoded_);
793 
795  if (filePerFwkStream_) {
796  std::vector<std::string> CSVv;
797  for (unsigned int i = 0; i < nStreams_; i++) {
798  CSVv.push_back(fmt_->jsonMonitor_->getCSVString((int)i));
799  }
800  // release mutex before writing out fast path file
801  lock.release()->unlock();
802  for (unsigned int i = 0; i < nStreams_; i++) {
803  if (!CSVv[i].empty())
804  fmt_->jsonMonitor_->outputCSV(fastPathList_[i], CSVv[i]);
805  }
806  } else {
807  std::string CSV = fmt_->jsonMonitor_->getCSVString();
808  // release mutex before writing out fast path file
809  lock.release()->unlock();
810  if (!CSV.empty())
811  fmt_->jsonMonitor_->outputCSV(fastPath_, CSV);
812  }
813  }
814  snapCounter_++;
815  }
816 
817  {
818  edm::LogInfo msg("FastMonitoringService");
819  auto f = [&](std::vector<unsigned int> const& p) {
820  for (unsigned int i = 0; i < nStreams_; i++) {
821  if (i == 0)
822  msg << "[" << p[i] << ",";
823  else if (i <= nStreams_ - 1)
824  msg << p[i] << ",";
825  else
826  msg << p[i] << "]";
827  }
828  };
829 
830  msg << "Current states: Ms=" << fmt_->m_data.fastMacrostateJ_.value() << " ms=";
831  f(lastEnc[0]);
832  msg << " us=";
833  f(lastEnc[1]);
835  }
836 
837  ::sleep(sleepTime_);
838  }
839  }
840 
841  void FastMonitoringService::doSnapshot(const unsigned int ls, const bool isGlobalEOL) {
842  // update macrostate
843  fmt_->m_data.fastMacrostateJ_ = fmt_->m_data.macrostate_;
844 
845  std::vector<const void*> microstateCopy(fmt_->m_data.microstate_.begin(), fmt_->m_data.microstate_.end());
846  std::vector<unsigned char> microstateAcqCopy(fmt_->m_data.microstateAcqFlag_.begin(),
847  fmt_->m_data.microstateAcqFlag_.end());
848 
849  if (!isInitTransition_) {
850  auto itd = avgLeadTime_.find(ls);
851  if (itd != avgLeadTime_.end())
852  fmt_->m_data.fastAvgLeadTimeJ_ = itd->second;
853  else
854  fmt_->m_data.fastAvgLeadTimeJ_ = 0.;
855 
856  auto iti = filesProcessedDuringLumi_.find(ls);
857  if (iti != filesProcessedDuringLumi_.end())
858  fmt_->m_data.fastFilesProcessedJ_ = iti->second;
859  else
860  fmt_->m_data.fastFilesProcessedJ_ = 0;
861 
862  auto itrd = lockStatsDuringLumi_.find(ls);
863  if (itrd != lockStatsDuringLumi_.end()) {
864  fmt_->m_data.fastLockWaitJ_ = itrd->second.first;
865  fmt_->m_data.fastLockCountJ_ = itrd->second.second;
866  } else {
867  fmt_->m_data.fastLockWaitJ_ = 0.;
868  fmt_->m_data.fastLockCountJ_ = 0.;
869  }
870  }
871 
872  for (unsigned int i = 0; i < nStreams_; i++) {
873  fmt_->m_data.ministateEncoded_[i] = fmt_->m_data.encPath_[i].encodeString(fmt_->m_data.ministate_[i]);
874  if (microstateAcqCopy[i])
875  fmt_->m_data.microstateEncoded_[i] =
876  fmt_->m_data.microstateBins_ + fmt_->m_data.encModule_.encode(microstateCopy[i]);
877  else
878  fmt_->m_data.microstateEncoded_[i] = fmt_->m_data.encModule_.encode(microstateCopy[i]);
879  }
880 
881  bool inputStatePerThread = false;
882 
884  switch (inputSupervisorState_) {
886  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_fileLimit;
887  break;
889  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_waitFreeChunk;
890  break;
892  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_waitFreeChunkCopying;
893  break;
895  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_waitFreeThread;
896  break;
898  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_waitFreeThreadCopying;
899  break;
901  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_busy;
902  break;
904  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_lockPolling;
905  break;
907  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_lockPollingCopying;
908  break;
910  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_runEnd;
911  break;
913  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_noFile;
914  break;
916  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_newFile;
917  break;
920  break;
922  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_newFileWaitThread;
923  break;
926  break;
928  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_newFileWaitChunk;
929  break;
930  default:
931  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput;
932  }
933  } else if (inputState_ == FastMonState::inWaitChunk) {
934  switch (inputSupervisorState_) {
936  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_fileLimit;
937  break;
939  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_waitFreeChunk;
940  break;
942  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_waitFreeChunkCopying;
943  break;
945  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_waitFreeThread;
946  break;
948  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_waitFreeThreadCopying;
949  break;
951  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_busy;
952  break;
954  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_lockPolling;
955  break;
957  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_lockPollingCopying;
958  break;
960  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_runEnd;
961  break;
963  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_noFile;
964  break;
966  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_newFile;
967  break;
970  break;
972  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_newFileWaitThread;
973  break;
976  break;
978  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_newFileWaitChunk;
979  break;
980  default:
981  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk;
982  }
983  } else if (inputState_ == FastMonState::inNoRequest) {
984  inputStatePerThread = true;
985  for (unsigned int i = 0; i < nStreams_; i++) {
986  if (microstateCopy[i] == &reservedMicroStateNames[FastMonState::mIdle])
987  fmt_->m_data.inputState_[i] = FastMonState::inNoRequestWithIdleThreads;
988  else if (microstateCopy[i] == &reservedMicroStateNames[FastMonState::mEoL] ||
989  microstateCopy[i] == &reservedMicroStateNames[FastMonState::mFwkEoL])
990  fmt_->m_data.inputState_[i] = FastMonState::inNoRequestWithEoLThreads;
991  else
992  fmt_->m_data.inputState_[i] = FastMonState::inNoRequest;
993  }
994  } else if (inputState_ == FastMonState::inNewLumi) {
995  inputStatePerThread = true;
996  for (unsigned int i = 0; i < nStreams_; i++) {
997  if (microstateCopy[i] == &reservedMicroStateNames[FastMonState::mEoL] ||
998  microstateCopy[i] == &reservedMicroStateNames[FastMonState::mFwkEoL])
999  fmt_->m_data.inputState_[i] = FastMonState::inNewLumi;
1000  }
1002  //apply directly throttled state from supervisor
1003  fmt_->m_data.inputState_[0] = inputSupervisorState_;
1004  } else
1005  fmt_->m_data.inputState_[0] = inputState_;
1006 
1007  //this is same for all streams
1008  if (!inputStatePerThread)
1009  for (unsigned int i = 1; i < nStreams_; i++)
1010  fmt_->m_data.inputState_[i] = fmt_->m_data.inputState_[0];
1011 
1012  if (isGlobalEOL) { //only update global variables
1013  fmt_->jsonMonitor_->snapGlobal(ls);
1014  } else
1015  fmt_->jsonMonitor_->snap(ls);
1016  }
1017 
1018  //compatibility
1020 
1022 
1023 } //end namespace evf
std::atomic< bool > has_data_exception_
void prePathEvent(edm::StreamContext const &, edm::PathContext const &)
std::atomic< FastMonState::InputState > inputState_
void watchPreStreamEarlyTermination(PreStreamEarlyTermination::slot_type const &iSlot)
Definition: fillJson.h:27
static const std::string inputStateNames[FastMonState::inCOUNT]
void watchPreEvent(PreEvent::slot_type const &iSlot)
ModuleDescription const * moduleDescription() const
void postModuleEventAcquire(edm::StreamContext const &, edm::ModuleCallingContext const &)
LuminosityBlockNumber_t luminosityBlock() const
std::atomic< bool > isInitTransition_
void watchPrePathEvent(PrePathEvent::slot_type const &iSlot)
void watchPreallocate(Preallocate::slot_type const &iSlot)
void setExceptionDetected(unsigned int ls)
void watchPreModuleEventAcquire(PreModuleEventAcquire::slot_type const &iSlot)
void watchPostEndJob(PostEndJob::slot_type const &iSlot)
void preallocate(edm::service::SystemBounds const &)
std::map< unsigned int, timeval > lumiStartTime_
void preGlobalBeginLumi(edm::GlobalContext const &)
std::pair< bool, unsigned int > getEventReport(unsigned int lumi, bool erase)
void watchPreModuleEvent(PreModuleEvent::slot_type const &iSlot)
void postGlobalEndLumi(edm::GlobalContext const &)
void postEvent(edm::StreamContext const &)
void accumulateFileSize(unsigned int lumi, unsigned long fileSize)
void watchPreGlobalEndLumi(PreGlobalEndLumi::slot_type const &iSlot)
std::map< unsigned int, unsigned long > accuSize_
std::filesystem::path workingDirectory_
std::vector< std::atomic< bool > * > streamCounterUpdating_
void watchPostEvent(PostEvent::slot_type const &iSlot)
volatile std::atomic< bool > shutdown_flag
void watchPostStreamEndLumi(PostStreamEndLumi::slot_type const &iSlot)
void watchPreGlobalBeginLumi(PreGlobalBeginLumi::slot_type const &iSlot)
void watchPostModuleEvent(PostModuleEvent::slot_type const &iSlot)
Value & append(const Value &value)
Append value to array at the end.
void watchPostSourceEvent(PostSourceEvent::slot_type const &iSlot)
std::map< unsigned int, unsigned int > filesProcessedDuringLumi_
void preGlobalEndLumi(edm::GlobalContext const &)
void setMicroState(FastMonState::Microstate)
Represents a JSON value.
Definition: value.h:99
bool isExceptionOnData(unsigned int ls)
void watchPreStreamEndLumi(PreStreamEndLumi::slot_type const &iSlot)
void preGlobalEarlyTermination(edm::GlobalContext const &, edm::TerminationOrigin)
void watchPreSourceEarlyTermination(PreSourceEarlyTermination::slot_type const &iSlot)
static const edm::ModuleDescription reservedMicroStateNames[FastMonState::mCOUNT]
Definition: Electron.h:6
constexpr int nSpecialModules
std::vector< std::string > const & endPaths() const
LuminosityBlockNumber_t luminosityBlock() const
Definition: EventID.h:39
void watchJobFailure(JobFailure::slot_type const &iSlot)
convenience function for attaching to signal
void preModuleBeginJob(edm::ModuleDescription const &)
std::map< unsigned int, std::pair< unsigned int, bool > > processedEventsPerLumi_
MicroStateService(const edm::ParameterSet &, edm::ActivityRegistry &)
void preStreamEndLumi(edm::StreamContext const &)
std::map< unsigned int, double > avgLeadTime_
void watchPostStreamBeginLumi(PostStreamBeginLumi::slot_type const &iSlot)
void doSnapshot(const unsigned int ls, const bool isGlobalEOL)
void preStreamEarlyTermination(edm::StreamContext const &, edm::TerminationOrigin)
void watchPreGlobalEarlyTermination(PreGlobalEarlyTermination::slot_type const &iSlot)
std::atomic< FastMonState::InputState > inputSupervisorState_
StreamID const & streamID() const
Definition: StreamContext.h:55
static const std::string nopath_
constexpr double throughputFactor()
void watchPostModuleEventAcquire(PostModuleEventAcquire::slot_type const &iSlot)
std::filesystem::path runDirectory_
double f[11][100]
void watchPostGlobalEndLumi(PostGlobalEndLumi::slot_type const &iSlot)
void watchPreModuleBeginJob(PreModuleBeginJob::slot_type const &iSlot)
void preModuleEventAcquire(edm::StreamContext const &, edm::ModuleCallingContext const &)
void postModuleEvent(edm::StreamContext const &, edm::ModuleCallingContext const &)
LuminosityBlockID const & luminosityBlockID() const
Definition: GlobalContext.h:62
std::vector< std::string > fastPathList_
constexpr int nReservedPaths
void postStreamBeginLumi(edm::StreamContext const &)
Log< level::Info, false > LogInfo
def ls(path, rec=False)
Definition: eostools.py:349
void postStreamEndLumi(edm::StreamContext const &)
static void fillDescriptions(edm::ConfigurationDescriptions &descriptions)
void preStreamBeginLumi(edm::StreamContext const &)
std::atomic< unsigned long > totalEventsProcessed_
FedRawDataInputSource * inputSource_
std::map< unsigned int, std::pair< double, unsigned int > > lockStatsDuringLumi_
constexpr int nReservedModules
void watchPreStreamBeginLumi(PreStreamBeginLumi::slot_type const &iSlot)
void add(std::string const &label, ParameterSetDescription const &psetDescription)
tuple msg
Definition: mps_check.py:286
std::vector< double > leadTimes_
void preBeginJob(edm::PathsAndConsumesOfModulesBase const &, edm::ProcessContext const &pc)
void watchPreBeginJob(PreBeginJob::slot_type const &iSlot)
convenience function for attaching to signal
void stoppedLookingForFile(unsigned int lumi)
std::atomic< bool > has_source_exception_
EventID const & eventID() const
Definition: StreamContext.h:60
void postGlobalBeginRun(edm::GlobalContext const &)
void preEvent(edm::StreamContext const &)
std::string const & pathName() const
Definition: PathContext.h:30
unsigned int getEventsProcessedForLumi(unsigned int lumi, bool *abortFlag=nullptr)
void preSourceEarlyTermination(edm::TerminationOrigin)
void watchPreSourceEvent(PreSourceEvent::slot_type const &iSlot)
unsigned int value() const
Definition: StreamID.h:43
bool getAbortFlagForLumi(unsigned int lumi)
std::shared_ptr< FastMonitoringThread > fmt_
Log< level::Warning, false > LogWarning
void preModuleEvent(edm::StreamContext const &, edm::ModuleCallingContext const &)
Writes a Value in JSON format in a human friendly way.
Definition: writer.h:63
void reportLockWait(unsigned int ls, double waitTime, unsigned int lockCount)
std::vector< unsigned int > exceptionInLS_
std::vector< std::string > const & paths() const
#define LogDebug(id)
void watchPostBeginJob(PostBeginJob::slot_type const &iSlot)
convenience function for attaching to signal
array value (ordered list)
Definition: value.h:30