CMS 3D CMS Logo

FastMonitoringService.cc
Go to the documentation of this file.
3 
16 
20 
23 
24 #include <iostream>
25 #include <iomanip>
26 #include <sys/time.h>
27 
28 using namespace jsoncollector;
29 
// Scale factor used by the per-lumisection throughput computation in this file:
// 1e6 divided by 1024*1024 (presumably microseconds-per-second over bytes-per-MiB,
// i.e. it turns bytes/microsecond into MiB/s — confirm units with the caller).
constexpr double throughputFactor() {
  constexpr int scaleNumerator = 1000000;
  constexpr int scaleDenominator = 1024 * 1024;
  return scaleNumerator / static_cast<double>(scaleDenominator);
}
31 
32 namespace evf {
33 
// Table of placeholder module descriptions for the reserved microstates (array size FastMonState::mCOUNT).
// Every entry is built as edm::ModuleDescription("Dummy", <state name>). Elsewhere in this file the
// address of an element, e.g. &reservedMicroStateNames[FastMonState::mIdle], is stored as a stream's
// current microstate, so the position of each entry is significant.
// NOTE(review): the FastMonState enum is not visible here — confirm the order below matches it.
34  const edm::ModuleDescription FastMonitoringService::reservedMicroStateNames[FastMonState::mCOUNT] = {
35  edm::ModuleDescription("Dummy", "Invalid"),
36  edm::ModuleDescription("Dummy", "Idle"),
37  edm::ModuleDescription("Dummy", "FwkOvhSrc"),
38  edm::ModuleDescription("Dummy", "FwkOvhMod"), //set post produce, analyze or filter
39  edm::ModuleDescription("Dummy", "FwkEoL"),
40  edm::ModuleDescription("Dummy", "Input"),
41  edm::ModuleDescription("Dummy", "DQM"),
42  edm::ModuleDescription("Dummy", "BoL"),
43  edm::ModuleDescription("Dummy", "EoL"),
44  edm::ModuleDescription("Dummy", "GlobalEoL")};
45 
// Human-readable names for the macrostates (array size FastMonState::MCOUNT).
// Presumably indexed by the FastMonState macrostate values that this file assigns to
// m_data.macrostate_ (sInit, sJobReady, sRunning, sJobEnded, ...).
// NOTE(review): the enum declaration is not visible here — confirm the order matches it.
46  const std::string FastMonitoringService::macroStateNames[FastMonState::MCOUNT] = {"Init",
47  "JobReady",
48  "RunGiven",
49  "Running",
50  "Stopping",
51  "Done",
52  "JobEnded",
53  "Error",
54  "ErrorEnded",
55  "End",
56  "Invalid"};
57 
// Human-readable names for the input states (array size FastMonState::inCOUNT).
// The input-legend JSON builder in this file walks indices 0..inCOUNT-1 and appends
// inputStateNames[i] to the "names" array, so entry i is the published name of state i.
// NOTE(review): the FastMonState enum is not visible here — confirm the order matches it
// (the WaitInput_* / WaitChunk_* groups mirror the Sup* supervisor states listed before them).
58  const std::string FastMonitoringService::inputStateNames[FastMonState::inCOUNT] = {
59  "Ignore",
60  "Init",
61  "WaitInput",
62  "NewLumi",
63  "NewLumiBusyEndingLS",
64  "NewLumiIdleEndingLS",
65  "RunEnd",
66  "ProcessingFile",
67  "WaitChunk",
68  "ChunkReceived",
69  "ChecksumEvent",
70  "CachedEvent",
71  "ReadEvent",
72  "ReadCleanup",
73  "NoRequest",
74  "NoRequestWithIdleThreads",
75  "NoRequestWithGlobalEoL",
76  "NoRequestWithEoLThreads",
77  "SupFileLimit",
78  "SupWaitFreeChunk",
79  "SupWaitFreeChunkCopying",
80  "SupWaitFreeThread",
81  "SupWaitFreeThreadCopying",
82  "SupBusy",
83  "SupLockPolling",
84  "SupLockPollingCopying",
85  "SupNoFile",
86  "SupNewFile",
87  "SupNewFileWaitThreadCopying",
88  "SupNewFileWaitThread",
89  "SupNewFileWaitChunkCopying",
90  "SupNewFileWaitChunk",
91  "WaitInput_fileLimit",
92  "WaitInput_waitFreeChunk",
93  "WaitInput_waitFreeChunkCopying",
94  "WaitInput_waitFreeThread",
95  "WaitInput_waitFreeThreadCopying",
96  "WaitInput_busy",
97  "WaitInput_lockPolling",
98  "WaitInput_lockPollingCopying",
99  "WaitInput_runEnd",
100  "WaitInput_noFile",
101  "WaitInput_newFile",
102  "WaitInput_newFileWaitThreadCopying",
103  "WaitInput_newFileWaitThread",
104  "WaitInput_newFileWaitChunkCopying",
105  "WaitInput_newFileWaitChunk",
106  "WaitChunk_fileLimit",
107  "WaitChunk_waitFreeChunk",
108  "WaitChunk_waitFreeChunkCopying",
109  "WaitChunk_waitFreeThread",
110  "WaitChunk_waitFreeThreadCopying",
111  "WaitChunk_busy",
112  "WaitChunk_lockPolling",
113  "WaitChunk_lockPollingCopying",
114  "WaitChunk_runEnd",
115  "WaitChunk_noFile",
116  "WaitChunk_newFile",
117  "WaitChunk_newFileWaitThreadCopying",
118  "WaitChunk_newFileWaitThread",
119  "WaitChunk_newFileWaitChunkCopying",
120  "WaitChunk_newFileWaitChunk",
121  "inSupThrottled",
122  "inThrottled"};
123 
// Placeholder path name. Its address (&nopath_) is what this file stores as a stream's
// ministate whenever no path is being processed (initialisation, lumi begin/end, after an event).
124  const std::string FastMonitoringService::nopath_ = "NoPath";
125 
// Constructor body of FastMonitoringService (the signature line and several signal registrations
// are missing from this listing). It reads the untracked parameters "sleepTime", "fastMonIntervals",
// "filePerFwkStream" and "verbose", registers callbacks on the activity registry `reg`, and locates
// the microstate definition file, throwing cms::Exception if it cannot be found.
127  : MicroStateService(iPS, reg),
128  fmt_(new FastMonitoringThread()),
129  nStreams_(0) //until initialized
130  ,
131  sleepTime_(iPS.getUntrackedParameter<int>("sleepTime", 1)),
132  fastMonIntervals_(iPS.getUntrackedParameter<unsigned int>("fastMonIntervals", 2)),
133  fastName_("fastmoni"),
134  slowName_("slowmoni"),
135  filePerFwkStream_(iPS.getUntrackedParameter<bool>("filePerFwkStream", false)),
136  totalEventsProcessed_(0),
137  verbose_(iPS.getUntrackedParameter<bool>("verbose")) {
138  reg.watchPreallocate(this, &FastMonitoringService::preallocate); //receiving information on number of threads
140 
145 
149 
154 
156 
157  reg.watchPreEvent(this, &FastMonitoringService::preEvent); //stream
159 
160  reg.watchPreSourceEvent(this, &FastMonitoringService::preSourceEvent); //source (with streamID of requestor)
162 
165 
168 
172 
173  //find microstate definition path (required by the module)
// Lookup order: $CMSSW_BASE/<suffix>, then $CMSSW_RELEASE_BASE/<suffix>, then <suffix> relative to
// the current directory. stat() returns non-zero on failure, so each `if` means "not found yet".
174  struct stat statbuf;
175  std::string microstateBaseSuffix = "src/EventFilter/Utilities/plugins/microstatedef.jsd";
// NOTE(review): std::getenv returns a null pointer when the variable is unset, and constructing a
// std::string from a null pointer is undefined behaviour. Both getenv calls below are unchecked —
// confirm these variables are guaranteed to be set, or guard against nullptr.
176  std::string microstatePath = std::string(std::getenv("CMSSW_BASE")) + "/" + microstateBaseSuffix;
177  if (stat(microstatePath.c_str(), &statbuf)) {
178  microstatePath = std::string(std::getenv("CMSSW_RELEASE_BASE")) + "/" + microstateBaseSuffix;
179  if (stat(microstatePath.c_str(), &statbuf)) {
180  microstatePath = microstateBaseSuffix;
181  if (stat(microstatePath.c_str(), &statbuf))
182  throw cms::Exception("FastMonitoringService") << "microstate definition file not found";
183  }
184  }
// The same definition file is used for both the slow and the fast monitor.
185  fastMicrostateDefPath_ = microstateDefPath_ = microstatePath;
186  }
187 
189 
192  desc.setComment("Service for File-based DAQ monitoring and event accounting");
193  desc.addUntracked<int>("sleepTime", 1)->setComment("Sleep time of the monitoring thread");
194  desc.addUntracked<unsigned int>("fastMonIntervals", 2)
195  ->setComment("Modulo of sleepTime intervals on which fastmon file is written out");
196  desc.addUntracked<bool>("filePerFwkStream", false)
197  ->setComment("Switches on monitoring output per framework stream");
198  desc.addUntracked<bool>("verbose", false)->setComment("Set to use LogInfo messages from the monitoring thread");
199  desc.setAllowAnything();
200  descriptions.add("FastMonitoringService", desc);
201  }
202 
204  Json::Value legendaVector(Json::arrayValue);
205  for (int i = 0; i < fmt_->m_data.encPath_[0].current_; i++)
206  legendaVector.append(Json::Value(*(static_cast<const std::string*>(fmt_->m_data.encPath_[0].decode(i)))));
207  Json::Value valReserved(nReservedPaths);
208  Json::Value pathLegend;
209  pathLegend["names"] = legendaVector;
210  pathLegend["reserved"] = valReserved;
212  return writer.write(pathLegend);
213  }
214 
216  Json::Value legendaVector(Json::arrayValue);
217  for (int i = 0; i < fmt_->m_data.encModule_.current_; i++)
218  legendaVector.append(
219  Json::Value((static_cast<const edm::ModuleDescription*>(fmt_->m_data.encModule_.decode(i)))->moduleLabel()));
220  //duplicate modules adding a list for acquire states (not all modules actually have it)
221  for (int i = 0; i < fmt_->m_data.encModule_.current_; i++)
222  legendaVector.append(Json::Value(
223  (static_cast<const edm::ModuleDescription*>(fmt_->m_data.encModule_.decode(i)))->moduleLabel() + "__ACQ"));
224  Json::Value valReserved(nReservedModules);
225  Json::Value valSpecial(nSpecialModules);
226  Json::Value valOutputModules(nOutputModules_);
227  Json::Value moduleLegend;
228  moduleLegend["names"] = legendaVector;
229  moduleLegend["reserved"] = valReserved;
230  moduleLegend["special"] = valSpecial;
231  moduleLegend["output"] = valOutputModules;
233  return writer.write(moduleLegend);
234  }
235 
237  Json::Value legendaVector(Json::arrayValue);
238  for (int i = 0; i < FastMonState::inCOUNT; i++)
239  legendaVector.append(Json::Value(inputStateNames[i]));
240  Json::Value moduleLegend;
241  moduleLegend["names"] = legendaVector;
243  return writer.write(moduleLegend);
244  }
245 
247  nStreams_ = bounds.maxNumberOfStreams();
248  nThreads_ = bounds.maxNumberOfThreads();
249  //this should already be >=1
250  if (nStreams_ == 0)
251  nStreams_ = 1;
252  if (nThreads_ == 0)
253  nThreads_ = 1;
254  }
255 
257  edm::ProcessContext const& pc) {
258  // FIND RUN DIRECTORY
259  // The run dir should be set via the configuration of EvFDaqDirector
260 
261  if (edm::Service<evf::EvFDaqDirector>().operator->() == nullptr) {
262  throw cms::Exception("FastMonitoringService") << "EvFDaqDirector is not present";
263  }
264  std::filesystem::path runDirectory{edm::Service<evf::EvFDaqDirector>()->baseRunDir()};
265  workingDirectory_ = runDirectory_ = runDirectory;
266  workingDirectory_ /= "mon";
267 
268  if (!std::filesystem::is_directory(workingDirectory_)) {
269  LogDebug("FastMonitoringService") << "<MON> DIR NOT FOUND! Trying to create -: " << workingDirectory_.string();
270  std::filesystem::create_directories(workingDirectory_);
271  if (!std::filesystem::is_directory(workingDirectory_))
272  edm::LogWarning("FastMonitoringService") << "Unable to create <MON> DIR -: " << workingDirectory_.string()
273  << ". No monitoring data will be written.";
274  }
275 
276  std::ostringstream fastFileName;
277 
278  fastFileName << fastName_ << "_pid" << std::setfill('0') << std::setw(5) << getpid() << ".fast";
280  fast /= fastFileName.str();
281  fastPath_ = fast.string();
282  if (filePerFwkStream_)
283  for (unsigned int i = 0; i < nStreams_; i++) {
284  std::ostringstream fastFileNameTid;
285  fastFileNameTid << fastName_ << "_pid" << std::setfill('0') << std::setw(5) << getpid() << "_tid" << i
286  << ".fast";
288  fastTid /= fastFileNameTid.str();
289  fastPathList_.push_back(fastTid.string());
290  }
291 
292  std::ostringstream moduleLegFile;
293  std::ostringstream moduleLegFileJson;
294  moduleLegFile << "microstatelegend_pid" << std::setfill('0') << std::setw(5) << getpid() << ".leg";
295  moduleLegFileJson << "microstatelegend_pid" << std::setfill('0') << std::setw(5) << getpid() << ".jsn";
296  moduleLegendFile_ = (workingDirectory_ / moduleLegFile.str()).string();
297  moduleLegendFileJson_ = (workingDirectory_ / moduleLegFileJson.str()).string();
298 
299  std::ostringstream pathLegFile;
300  std::ostringstream pathLegFileJson;
301  pathLegFile << "pathlegend_pid" << std::setfill('0') << std::setw(5) << getpid() << ".leg";
302  pathLegendFile_ = (workingDirectory_ / pathLegFile.str()).string();
303  pathLegFileJson << "pathlegend_pid" << std::setfill('0') << std::setw(5) << getpid() << ".jsn";
304  pathLegendFileJson_ = (workingDirectory_ / pathLegFileJson.str()).string();
305 
306  std::ostringstream inputLegFileJson;
307  inputLegFileJson << "inputlegend_pid" << std::setfill('0') << std::setw(5) << getpid() << ".jsn";
308  inputLegendFileJson_ = (workingDirectory_ / inputLegFileJson.str()).string();
309 
310  LogDebug("FastMonitoringService") << "Initializing FastMonitor with microstate def path -: " << microstateDefPath_;
311  //<< encPath_.current_ + 1 << " " << encModule_.current_ + 1
312 
313  /*
314  * initialize the fast monitor with:
315  * vector of pointers to monitorable parameters
316  * path to definition
317  *
318  */
319 
320  fmt_->m_data.macrostate_ = FastMonState::sInit;
321 
322  for (unsigned int i = 0; i < (FastMonState::mCOUNT); i++)
323  fmt_->m_data.encModule_.updateReserved(static_cast<const void*>(reservedMicroStateNames + i));
324  fmt_->m_data.encModule_.completeReservedWithDummies();
325 
326  for (unsigned int i = 0; i < nStreams_; i++) {
327  fmt_->m_data.ministate_.emplace_back(&nopath_);
328  fmt_->m_data.microstate_.emplace_back(&reservedMicroStateNames[FastMonState::mInvalid]);
329  fmt_->m_data.microstateAcqFlag_.push_back(0);
330 
331  //for synchronization
332  streamCounterUpdating_.push_back(new std::atomic<bool>(false));
333 
334  //path (mini) state
335  fmt_->m_data.encPath_.emplace_back(0);
336  fmt_->m_data.encPath_[i].update(static_cast<const void*>(&nopath_));
337 
338  for (auto& path : pathsInfo.paths()) {
339  fmt_->m_data.encPath_[i].updatePreinit(path);
340  }
341  for (auto& endPath : pathsInfo.endPaths()) {
342  fmt_->m_data.encPath_[i].updatePreinit(endPath);
343  }
344  }
345  //for (unsigned int i=0;i<nThreads_;i++)
346  // threadMicrostate_.push_back(&reservedMicroStateNames[mInvalid]);
347 
348  //initial size until we detect number of bins
349  fmt_->m_data.macrostateBins_ = FastMonState::MCOUNT;
350  fmt_->m_data.microstateBins_ = 0;
351  fmt_->m_data.inputstateBins_ = FastMonState::inCOUNT;
352  fmt_->m_data.ministateBins_ = fmt_->m_data.encPath_[0].vecsize();
353 
354  lastGlobalLumi_ = 0;
355  isInitTransition_ = true;
356  lumiFromSource_ = 0;
357 
358  //startup monitoring
359  fmt_->resetFastMonitor(microstateDefPath_, fastMicrostateDefPath_);
360  fmt_->jsonMonitor_->setNStreams(nStreams_);
361  fmt_->m_data.registerVariables(fmt_->jsonMonitor_.get(), nStreams_, threadIDAvailable_ ? nThreads_ : 0);
362  monInit_.store(false, std::memory_order_release);
363  if (sleepTime_ > 0)
365 
366  //this definition needs: #include "tbb/compat/thread"
367  //however this would result in the TBB implementation replacing std::thread
368  //(both supposedly call pthread_self())
369  //number of threads created in process could be obtained from /proc,
370  //assuming that all posix threads are true kernel threads capable of running in parallel
371 
372  //#if TBB_IMPLEMENT_CPP0X
374  //threadIDAvailable_=true;
375  //#endif
376  }
377 
381  context = " FromThisContext ";
383  context = " FromAnotherContext";
385  context = " FromExternalSignal";
386  edm::LogWarning("FastMonitoringService")
387  << " STREAM " << sc.streamID().value() << " earlyTermination -: ID:" << sc.eventID()
388  << " LS:" << sc.eventID().luminosityBlock() << " " << context;
389  std::lock_guard<std::mutex> lock(fmt_->monlock_);
390  exceptionInLS_.push_back(sc.eventID().luminosityBlock());
391  has_data_exception_.store(true);
392  }
393 
397  context = " FromThisContext ";
399  context = " FromAnotherContext";
401  context = " FromExternalSignal";
402  edm::LogWarning("FastMonitoringService")
403  << " GLOBAL "
404  << "earlyTermination -: LS:" << gc.luminosityBlockID().luminosityBlock() << " " << context;
405  std::lock_guard<std::mutex> lock(fmt_->monlock_);
407  has_data_exception_.store(true);
408  }
409 
413  context = " FromThisContext ";
415  context = " FromAnotherContext";
417  context = " FromExternalSignal";
418  edm::LogWarning("FastMonitoringService") << " SOURCE "
419  << "earlyTermination -: " << context;
420  std::lock_guard<std::mutex> lock(fmt_->monlock_);
421  exception_detected_ = true;
422  has_source_exception_.store(true);
423  has_data_exception_.store(true);
424  }
425 
427  std::lock_guard<std::mutex> lock(fmt_->monlock_);
428  if (!ls)
429  exception_detected_ = true;
430  else
431  exceptionInLS_.push_back(ls);
432  }
433 
435  return has_source_exception_.load() || has_data_exception_.load();
436  }
437 
439  if (!has_data_exception_.load())
440  return false;
441  if (has_source_exception_.load())
442  return true;
443  std::lock_guard<std::mutex> lock(fmt_->monlock_);
444  for (auto ex : exceptionInLS_) {
445  if (ls == ex)
446  return true;
447  }
448  return false;
449  }
450 
452 
453  //new output module name is stream
455  std::lock_guard<std::mutex> lock(fmt_->monlock_);
456  //std::cout << " Pre module Begin Job module: " << desc.moduleName() << std::endl;
457 
458  //build a map of modules keyed by their module description address
459  //here we need to treat output modules in a special way so they can be easily singled out
460  if (desc.moduleName() == "Stream" || desc.moduleName() == "GlobalEvFOutputModule" ||
461  desc.moduleName() == "EvFOutputModule" || desc.moduleName() == "EventStreamFileWriter" ||
462  desc.moduleName() == "PoolOutputModule") {
463  fmt_->m_data.encModule_.updateReserved((void*)&desc);
464  nOutputModules_++;
465  } else
466  fmt_->m_data.encModule_.update((void*)&desc);
467  }
468 
470  std::string&& moduleLegStrJson = makeModuleLegendaJson();
471  FileIO::writeStringToFile(moduleLegendFileJson_, moduleLegStrJson);
472 
473  std::string inputLegendStrJson = makeInputLegendaJson();
474  FileIO::writeStringToFile(inputLegendFileJson_, inputLegendStrJson);
475 
476  std::string pathLegendStrJson = makePathLegendaJson();
477  FileIO::writeStringToFile(pathLegendFileJson_, pathLegendStrJson);
478 
479  fmt_->m_data.macrostate_ = FastMonState::sJobReady;
480 
481  //update number of entries in module histogram
482  std::lock_guard<std::mutex> lock(fmt_->monlock_);
483  //double the size to add post-acquire states
484  fmt_->m_data.microstateBins_ = fmt_->m_data.encModule_.vecsize() * 2;
485  }
486 
488  fmt_->m_data.macrostate_ = FastMonState::sJobEnded;
489  fmt_->stop();
490  }
491 
493  fmt_->m_data.macrostate_ = FastMonState::sRunning;
494  isInitTransition_ = false;
495  }
496 
498  timeval lumiStartTime;
499  gettimeofday(&lumiStartTime, nullptr);
500  unsigned int newLumi = gc.luminosityBlockID().luminosityBlock();
501  lastGlobalLumi_ = newLumi;
502 
503  std::lock_guard<std::mutex> lock(fmt_->monlock_);
504  lumiStartTime_[newLumi] = lumiStartTime;
505  }
506 
// Global end-of-lumisection handler (the signature line is missing from this listing; presumably
// FastMonitoringService::preGlobalEndLumi(edm::GlobalContext const& gc) — confirm).
// Under monlock_ it: computes the throughput for the lumisection, takes a global snapshot,
// records the processed-event count in processedEventsPerLumi_, cross-checks it against the
// source report, and writes the slow-monitoring JSON output for this lumisection.
// Throws cms::Exception on a null merged counter or on a processed/source event-count mismatch.
508  unsigned int lumi = gc.luminosityBlockID().luminosityBlock();
509  LogDebug("FastMonitoringService") << "Lumi ended. Writing JSON information. LUMI -: " << lumi;
510  timeval lumiStopTime;
511  gettimeofday(&lumiStopTime, nullptr);
512 
513  std::lock_guard<std::mutex> lock(fmt_->monlock_);
514 
515  // Compute throughput
// NOTE(review): operator[] on the map inserts a value-initialised timeval when no start time was
// recorded for this lumi; the elapsed time below would then be measured from zero.
516  timeval stt = lumiStartTime_[lumi];
517  lumiStartTime_.erase(lumi);
518  unsigned long usecondsForLumi = (lumiStopTime.tv_sec - stt.tv_sec) * 1000000 + (lumiStopTime.tv_usec - stt.tv_usec);
// Accumulated size defaults to 0 when nothing was reported for this lumi; the entry is dropped afterwards.
519  unsigned long accuSize = accuSize_.find(lumi) == accuSize_.end() ? 0 : accuSize_[lumi];
520  accuSize_.erase(lumi);
// NOTE(review): if usecondsForLumi is 0 this floating-point division yields inf or NaN.
521  double throughput = throughputFactor() * double(accuSize) / double(usecondsForLumi);
522  //store to registered variable
523  fmt_->m_data.fastThroughputJ_.value() = throughput;
524 
525  //update
// second argument true = global end-of-lumi snapshot (doSnapshot then only updates global variables)
526  doSnapshot(lumi, true);
527 
528  //retrieve one result we need (todo: sanity check if it's found)
529  IntJ* lumiProcessedJptr = dynamic_cast<IntJ*>(fmt_->jsonMonitor_->getMergedIntJForLumi("Processed", lumi));
530  if (!lumiProcessedJptr)
531  throw cms::Exception("FastMonitoringService") << "Internal error: got null pointer from FastMonitor";
// pair = (processed event count, abort flag); the flag starts out false
532  processedEventsPerLumi_[lumi] = std::pair<unsigned int, bool>(lumiProcessedJptr->value(), false);
533 
534  //checking if exception has been thrown (in case of Global/Stream early termination, for this LS)
535  bool exception_detected = exception_detected_;
536  for (auto ex : exceptionInLS_)
537  if (lumi == ex)
538  exception_detected = true;
539 
540  if (edm::shutdown_flag || exception_detected) {
541  edm::LogInfo("FastMonitoringService")
542  << "Run interrupted. Skip writing EoL information -: " << processedEventsPerLumi_[lumi].first
543  << " events were processed in LUMI " << lumi;
544  //this will prevent output modules from producing json file for possibly incomplete lumi
545  processedEventsPerLumi_[lumi].first = 0;
546  processedEventsPerLumi_[lumi].second = true;
547  //disable this exception, so service can be used standalone (will be thrown if output module asks for this information)
548  //throw cms::Exception("FastMonitoringService") << "SOURCE did not send update for lumi block. LUMI -:" << lumi;
// NOTE(review): lumiProcessedJptr is deleted further down on the normal path only; this early
// return skips that delete. If this function owns the pointer (as the later delete suggests),
// the object is leaked here — confirm ownership and release it before returning.
549  return;
550  }
551 
552  if (inputSource_ || daqInputSource_) {
// NOTE(review): the right-hand side of this initialisation is missing from this listing.
553  auto sourceReport =
555  if (sourceReport.first) {
556  if (sourceReport.second != processedEventsPerLumi_[lumi].first) {
// NOTE(review): this throw also leaves lumiProcessedJptr undeleted.
557  throw cms::Exception("FastMonitoringService") << "MISMATCH with SOURCE update. LUMI -: " << lumi
558  << ", events(processed):" << processedEventsPerLumi_[lumi].first
559  << " events(source):" << sourceReport.second;
560  }
561  }
562  }
563 
// "time" is logged in whole seconds (integer division of the microsecond count).
564  edm::LogInfo("FastMonitoringService")
565  << "Statistics for lumisection -: lumi = " << lumi << " events = " << lumiProcessedJptr->value()
566  << " time = " << usecondsForLumi / 1000000 << " size = " << accuSize << " thr = " << throughput;
567  delete lumiProcessedJptr;
568 
569  //full global and stream merge&output for this lumi
570 
571  // create file name for slow monitoring file
// `output` is passed to the JSON writers below; it is false when sleepTime_ <= 0.
572  bool output = sleepTime_ > 0;
573  if (filePerFwkStream_) {
// per-stream mode: pass a file-name stem plus the ".jsn" extension to outputFullJSONs
574  std::stringstream slowFileNameStem;
575  slowFileNameStem << slowName_ << "_ls" << std::setfill('0') << std::setw(4) << lumi << "_pid" << std::setfill('0')
576  << std::setw(5) << getpid();
578  slow /= slowFileNameStem.str();
579  fmt_->jsonMonitor_->outputFullJSONs(slow.string(), ".jsn", lumi, output);
580  } else {
581  std::stringstream slowFileName;
582  slowFileName << slowName_ << "_ls" << std::setfill('0') << std::setw(4) << lumi << "_pid" << std::setfill('0')
583  << std::setw(5) << getpid() << ".jsn";
585  slow /= slowFileName.str();
586  //full global and stream merge and JSON write for this lumi
587  fmt_->jsonMonitor_->outputFullJSON(slow.string(), lumi, output);
588  }
589  fmt_->jsonMonitor_->discardCollected(lumi); //we don't do further updates for this lumi
590  }
591 
593  std::lock_guard<std::mutex> lock(fmt_->monlock_);
594  unsigned int lumi = gc.luminosityBlockID().luminosityBlock();
595  //LS monitoring snapshot with input source data has been taken in previous callback
596  avgLeadTime_.erase(lumi);
598  lockStatsDuringLumi_.erase(lumi);
599 
600  //output module already used this in end lumi (this could be migrated to EvFDaqDirector as it is essential for FFF bookkeeping)
602  }
603 
605  unsigned int sid = sc.streamID().value();
606 
607  std::lock_guard<std::mutex> lock(fmt_->monlock_);
608  fmt_->m_data.streamLumi_[sid] = sc.eventID().luminosityBlock();
609 
610  //reset collected values for this stream
611  *(fmt_->m_data.processed_[sid]) = 0;
612 
613  fmt_->m_data.ministate_[sid] = &nopath_;
614  fmt_->m_data.microstate_[sid] = &reservedMicroStateNames[FastMonState::mBoL];
615  }
616 
618  fmt_->m_data.microstate_[sc.streamID().value()] = &reservedMicroStateNames[FastMonState::mIdle];
619  }
620 
622  unsigned int sid = sc.streamID().value();
623  std::lock_guard<std::mutex> lock(fmt_->monlock_);
624 
625  //update processed count to be complete at this time
626  //doStreamEOLSnapshot(sc.eventID().luminosityBlock(), sid);
627  fmt_->jsonMonitor_->snapStreamAtomic(sc.eventID().luminosityBlock(), sid);
628  //reset this in case stream does not get notified of next lumi (we keep processed events only)
629  fmt_->m_data.ministate_[sid] = &nopath_;
630  fmt_->m_data.microstate_[sid] = &reservedMicroStateNames[FastMonState::mEoL];
631  }
633  fmt_->m_data.microstate_[sc.streamID().value()] = &reservedMicroStateNames[FastMonState::mFwkEoL];
634  }
635 
637  fmt_->m_data.ministate_[sc.streamID()] = &(pc.pathName());
638  }
639 
641 
643  fmt_->m_data.microstate_[sc.streamID()] = &reservedMicroStateNames[FastMonState::mIdle];
644 
645  fmt_->m_data.ministate_[sc.streamID()] = &nopath_;
646 
647  (*(fmt_->m_data.processed_[sc.streamID()]))++;
648 
649  //fast path counter (events accumulated in a run)
650  unsigned long res = totalEventsProcessed_.fetch_add(1, std::memory_order_relaxed);
651  fmt_->m_data.fastPathProcessedJ_ = res + 1;
652  }
653 
655  fmt_->m_data.microstate_[sid.value()] = &reservedMicroStateNames[FastMonState::mInput];
656  }
657 
659  fmt_->m_data.microstate_[sid.value()] = &reservedMicroStateNames[FastMonState::mFwkOvhSrc];
660  }
661 
663  edm::ModuleCallingContext const& mcc) {
664  fmt_->m_data.microstate_[sc.streamID().value()] = (void*)(mcc.moduleDescription());
665  }
666 
668  edm::ModuleCallingContext const& mcc) {
669  //fmt_->m_data.microstate_[sc.streamID().value()] = (void*)(mcc.moduleDescription());
670  fmt_->m_data.microstateAcqFlag_[sc.streamID().value()] = 1;
671  }
672 
674  fmt_->m_data.microstate_[sc.streamID().value()] = (void*)(mcc.moduleDescription());
675  fmt_->m_data.microstateAcqFlag_[sc.streamID().value()] = 0;
676  }
677 
679  fmt_->m_data.microstate_[sc.streamID().value()] = &reservedMicroStateNames[FastMonState::mFwkOvhMod];
680  }
681 
682  //FUNCTIONS CALLED FROM OUTSIDE
683 
684  //this is for old-fashioned service that is not thread safe and can block other streams
685  //(we assume the worst case - everything is blocked)
687  for (unsigned int i = 0; i < nStreams_; i++)
688  fmt_->m_data.microstate_[i] = &reservedMicroStateNames[m];
689  }
690 
691  //this is for services that are multithreading-enabled or rarely block other streams
693  fmt_->m_data.microstate_[sid] = &reservedMicroStateNames[m];
694  }
695 
696  //from source
697  void FastMonitoringService::accumulateFileSize(unsigned int lumi, unsigned long fileSize) {
698  std::lock_guard<std::mutex> lock(fmt_->monlock_);
699 
700  if (accuSize_.find(lumi) == accuSize_.end())
701  accuSize_[lumi] = fileSize;
702  else
703  accuSize_[lumi] += fileSize;
704 
707  else
709  }
710 
712  gettimeofday(&fileLookStart_, nullptr);
713  /*
714  std::cout << "Started looking for .raw file at: s=" << fileLookStart_.tv_sec << ": ms = "
715  << fileLookStart_.tv_usec / 1000.0 << std::endl;
716  */
717  }
718 
720  gettimeofday(&fileLookStop_, nullptr);
721  /*
722  std::cout << "Stopped looking for .raw file at: s=" << fileLookStop_.tv_sec << ": ms = "
723  << fileLookStop_.tv_usec / 1000.0 << std::endl;
724  */
725  std::lock_guard<std::mutex> lock(fmt_->monlock_);
726 
727  if (lumi > lumiFromSource_) {
729  leadTimes_.clear();
730  }
731  unsigned long elapsedTime = (fileLookStop_.tv_sec - fileLookStart_.tv_sec) * 1000000 // sec to us
732  + (fileLookStop_.tv_usec - fileLookStart_.tv_usec); // us
733  // add this to lead times for this lumi
734  leadTimes_.push_back((double)elapsedTime);
735 
736  // recompute average lead time for this lumi
737  if (leadTimes_.size() == 1)
739  else {
740  double totTime = 0;
741  for (unsigned int i = 0; i < leadTimes_.size(); i++)
742  totTime += leadTimes_[i];
743  avgLeadTime_[lumi] = 0.001 * (totTime / leadTimes_.size());
744  }
745  }
746 
747  void FastMonitoringService::reportLockWait(unsigned int ls, double waitTime, unsigned int lockCount) {
748  std::lock_guard<std::mutex> lock(fmt_->monlock_);
749  lockStatsDuringLumi_[ls] = std::pair<double, unsigned int>(waitTime, lockCount);
750  }
751 
752  //for the output module
753  unsigned int FastMonitoringService::getEventsProcessedForLumi(unsigned int lumi, bool* abortFlag) {
754  std::lock_guard<std::mutex> lock(fmt_->monlock_);
755 
756  auto it = processedEventsPerLumi_.find(lumi);
757  if (it != processedEventsPerLumi_.end()) {
758  unsigned int proc = it->second.first;
759  if (abortFlag)
760  *abortFlag = it->second.second;
761  return proc;
762  } else {
763  throw cms::Exception("FastMonitoringService")
764  << "output module wants already deleted (or never reported by SOURCE) lumisection event count for LUMI -: "
765  << lumi;
766  return 0;
767  }
768  }
769 
770  //for the output module
772  std::lock_guard<std::mutex> lock(fmt_->monlock_);
773 
774  auto it = processedEventsPerLumi_.find(lumi);
775  if (it != processedEventsPerLumi_.end()) {
776  unsigned int abortFlag = it->second.second;
777  return abortFlag;
778  } else {
779  throw cms::Exception("FastMonitoringService")
780  << "output module wants already deleted (or never reported by SOURCE) lumisection status for LUMI -: "
781  << lumi;
782  return false;
783  }
784  }
785 
786  // the function to be called in the thread. Thread completes when function returns.
788  monInit_.exchange(true, std::memory_order_acquire);
789  while (!fmt_->m_stoprequest) {
790  std::vector<std::vector<unsigned int>> lastEnc;
791  {
792  std::unique_lock<std::mutex> lock(fmt_->monlock_);
793 
794  doSnapshot(lastGlobalLumi_, false);
795 
796  lastEnc.emplace_back(fmt_->m_data.ministateEncoded_);
797  lastEnc.emplace_back(fmt_->m_data.microstateEncoded_);
798 
800  if (filePerFwkStream_) {
801  std::vector<std::string> CSVv;
802  for (unsigned int i = 0; i < nStreams_; i++) {
803  CSVv.push_back(fmt_->jsonMonitor_->getCSVString((int)i));
804  }
805  // release mutex before writing out fast path file
806  lock.release()->unlock();
807  for (unsigned int i = 0; i < nStreams_; i++) {
808  if (!CSVv[i].empty())
809  fmt_->jsonMonitor_->outputCSV(fastPathList_[i], CSVv[i]);
810  }
811  } else {
812  std::string CSV = fmt_->jsonMonitor_->getCSVString();
813  // release mutex before writing out fast path file
814  lock.release()->unlock();
815  if (!CSV.empty())
816  fmt_->jsonMonitor_->outputCSV(fastPath_, CSV);
817  }
818  }
819  snapCounter_++;
820  }
821 
822  if (verbose_) {
823  edm::LogInfo msg("FastMonitoringService");
824  auto f = [&](std::vector<unsigned int> const& p) {
825  for (unsigned int i = 0; i < nStreams_; i++) {
826  if (i == 0)
827  msg << "[" << p[i] << ",";
828  else if (i <= nStreams_ - 1)
829  msg << p[i] << ",";
830  else
831  msg << p[i] << "]";
832  }
833  };
834 
835  msg << "Current states: Ms=" << fmt_->m_data.fastMacrostateJ_.value() << " ms=";
836  f(lastEnc[0]);
837  msg << " us=";
838  f(lastEnc[1]);
840  }
841 
842  ::sleep(sleepTime_);
843  }
844  }
845 
846  void FastMonitoringService::doSnapshot(const unsigned int ls, const bool isGlobalEOL) {
847  // update macrostate
848  fmt_->m_data.fastMacrostateJ_ = fmt_->m_data.macrostate_;
849 
850  std::vector<const void*> microstateCopy(fmt_->m_data.microstate_.begin(), fmt_->m_data.microstate_.end());
851  std::vector<unsigned char> microstateAcqCopy(fmt_->m_data.microstateAcqFlag_.begin(),
852  fmt_->m_data.microstateAcqFlag_.end());
853 
854  if (!isInitTransition_) {
855  auto itd = avgLeadTime_.find(ls);
856  if (itd != avgLeadTime_.end())
857  fmt_->m_data.fastAvgLeadTimeJ_ = itd->second;
858  else
859  fmt_->m_data.fastAvgLeadTimeJ_ = 0.;
860 
861  auto iti = filesProcessedDuringLumi_.find(ls);
862  if (iti != filesProcessedDuringLumi_.end())
863  fmt_->m_data.fastFilesProcessedJ_ = iti->second;
864  else
865  fmt_->m_data.fastFilesProcessedJ_ = 0;
866 
867  auto itrd = lockStatsDuringLumi_.find(ls);
868  if (itrd != lockStatsDuringLumi_.end()) {
869  fmt_->m_data.fastLockWaitJ_ = itrd->second.first;
870  fmt_->m_data.fastLockCountJ_ = itrd->second.second;
871  } else {
872  fmt_->m_data.fastLockWaitJ_ = 0.;
873  fmt_->m_data.fastLockCountJ_ = 0.;
874  }
875  }
876 
877  for (unsigned int i = 0; i < nStreams_; i++) {
878  fmt_->m_data.ministateEncoded_[i] = fmt_->m_data.encPath_[i].encodeString(fmt_->m_data.ministate_[i]);
879  if (microstateAcqCopy[i])
880  fmt_->m_data.microstateEncoded_[i] =
881  fmt_->m_data.microstateBins_ + fmt_->m_data.encModule_.encode(microstateCopy[i]);
882  else
883  fmt_->m_data.microstateEncoded_[i] = fmt_->m_data.encModule_.encode(microstateCopy[i]);
884  }
885 
886  bool inputStatePerThread = false;
887 
889  switch (inputSupervisorState_) {
891  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_fileLimit;
892  break;
894  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_waitFreeChunk;
895  break;
897  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_waitFreeChunkCopying;
898  break;
900  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_waitFreeThread;
901  break;
903  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_waitFreeThreadCopying;
904  break;
906  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_busy;
907  break;
909  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_lockPolling;
910  break;
912  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_lockPollingCopying;
913  break;
915  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_runEnd;
916  break;
918  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_noFile;
919  break;
921  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_newFile;
922  break;
925  break;
927  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_newFileWaitThread;
928  break;
931  break;
933  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput_newFileWaitChunk;
934  break;
935  default:
936  fmt_->m_data.inputState_[0] = FastMonState::inWaitInput;
937  }
938  } else if (inputState_ == FastMonState::inWaitChunk) {
939  switch (inputSupervisorState_) {
941  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_fileLimit;
942  break;
944  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_waitFreeChunk;
945  break;
947  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_waitFreeChunkCopying;
948  break;
950  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_waitFreeThread;
951  break;
953  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_waitFreeThreadCopying;
954  break;
956  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_busy;
957  break;
959  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_lockPolling;
960  break;
962  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_lockPollingCopying;
963  break;
965  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_runEnd;
966  break;
968  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_noFile;
969  break;
971  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_newFile;
972  break;
975  break;
977  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_newFileWaitThread;
978  break;
981  break;
983  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk_newFileWaitChunk;
984  break;
985  default:
986  fmt_->m_data.inputState_[0] = FastMonState::inWaitChunk;
987  }
988  } else if (inputState_ == FastMonState::inNoRequest) {
989  inputStatePerThread = true;
990  for (unsigned int i = 0; i < nStreams_; i++) {
991  if (microstateCopy[i] == &reservedMicroStateNames[FastMonState::mIdle])
992  fmt_->m_data.inputState_[i] = FastMonState::inNoRequestWithIdleThreads;
993  else if (microstateCopy[i] == &reservedMicroStateNames[FastMonState::mEoL] ||
994  microstateCopy[i] == &reservedMicroStateNames[FastMonState::mFwkEoL])
995  fmt_->m_data.inputState_[i] = FastMonState::inNoRequestWithEoLThreads;
996  else
997  fmt_->m_data.inputState_[i] = FastMonState::inNoRequest;
998  }
999  } else if (inputState_ == FastMonState::inNewLumi) {
1000  inputStatePerThread = true;
1001  for (unsigned int i = 0; i < nStreams_; i++) {
1002  if (microstateCopy[i] == &reservedMicroStateNames[FastMonState::mEoL] ||
1003  microstateCopy[i] == &reservedMicroStateNames[FastMonState::mFwkEoL])
1004  fmt_->m_data.inputState_[i] = FastMonState::inNewLumi;
1005  }
1007  //apply directly throttled state from supervisor
1008  fmt_->m_data.inputState_[0] = inputSupervisorState_;
1009  } else
1010  fmt_->m_data.inputState_[0] = inputState_;
1011 
1012  //this is same for all streams
1013  if (!inputStatePerThread)
1014  for (unsigned int i = 1; i < nStreams_; i++)
1015  fmt_->m_data.inputState_[i] = fmt_->m_data.inputState_[0];
1016 
1017  if (isGlobalEOL) { //only update global variables
1018  fmt_->jsonMonitor_->snapGlobal(ls);
1019  } else
1020  fmt_->jsonMonitor_->snap(ls);
1021  }
1022 
1023  //compatibility
1025 
1027 
1028 } //end namespace evf
std::atomic< bool > has_data_exception_
void prePathEvent(edm::StreamContext const &, edm::PathContext const &)
std::atomic< FastMonState::InputState > inputState_
void watchPreStreamEarlyTermination(PreStreamEarlyTermination::slot_type const &iSlot)
Definition: fillJson.h:27
static const std::string inputStateNames[FastMonState::inCOUNT]
void watchPreEvent(PreEvent::slot_type const &iSlot)
ModuleDescription const * moduleDescription() const
void postModuleEventAcquire(edm::StreamContext const &, edm::ModuleCallingContext const &)
LuminosityBlockNumber_t luminosityBlock() const
std::atomic< bool > isInitTransition_
void watchPrePathEvent(PrePathEvent::slot_type const &iSlot)
void watchPreallocate(Preallocate::slot_type const &iSlot)
void setExceptionDetected(unsigned int ls)
void watchPreModuleEventAcquire(PreModuleEventAcquire::slot_type const &iSlot)
void watchPostEndJob(PostEndJob::slot_type const &iSlot)
void preallocate(edm::service::SystemBounds const &)
std::map< unsigned int, timeval > lumiStartTime_
void preGlobalBeginLumi(edm::GlobalContext const &)
std::pair< bool, unsigned int > getEventReport(unsigned int lumi, bool erase)
void watchPreModuleEvent(PreModuleEvent::slot_type const &iSlot)
void postGlobalEndLumi(edm::GlobalContext const &)
void postEvent(edm::StreamContext const &)
void accumulateFileSize(unsigned int lumi, unsigned long fileSize)
void watchPreGlobalEndLumi(PreGlobalEndLumi::slot_type const &iSlot)
std::map< unsigned int, unsigned long > accuSize_
std::filesystem::path workingDirectory_
std::vector< std::atomic< bool > * > streamCounterUpdating_
void watchPostEvent(PostEvent::slot_type const &iSlot)
volatile std::atomic< bool > shutdown_flag
void watchPostStreamEndLumi(PostStreamEndLumi::slot_type const &iSlot)
void watchPreGlobalBeginLumi(PreGlobalBeginLumi::slot_type const &iSlot)
void watchPostModuleEvent(PostModuleEvent::slot_type const &iSlot)
Value & append(const Value &value)
Append value to array at the end.
void watchPostSourceEvent(PostSourceEvent::slot_type const &iSlot)
std::map< unsigned int, unsigned int > filesProcessedDuringLumi_
std::pair< bool, unsigned int > getEventReport(unsigned int lumi, bool erase)
Definition: DAQSource.cc:1378
void preGlobalEndLumi(edm::GlobalContext const &)
void setMicroState(FastMonState::Microstate)
Represents a JSON value.
Definition: value.h:99
bool isExceptionOnData(unsigned int ls)
void watchPreStreamEndLumi(PreStreamEndLumi::slot_type const &iSlot)
void preGlobalEarlyTermination(edm::GlobalContext const &, edm::TerminationOrigin)
void watchPreSourceEarlyTermination(PreSourceEarlyTermination::slot_type const &iSlot)
static const edm::ModuleDescription reservedMicroStateNames[FastMonState::mCOUNT]
Definition: Electron.h:6
constexpr int nSpecialModules
std::vector< std::string > const & endPaths() const
LuminosityBlockNumber_t luminosityBlock() const
Definition: EventID.h:39
void watchJobFailure(JobFailure::slot_type const &iSlot)
convenience function for attaching to signal
void preModuleBeginJob(edm::ModuleDescription const &)
std::map< unsigned int, std::pair< unsigned int, bool > > processedEventsPerLumi_
MicroStateService(const edm::ParameterSet &, edm::ActivityRegistry &)
void preStreamEndLumi(edm::StreamContext const &)
std::map< unsigned int, double > avgLeadTime_
void watchPostStreamBeginLumi(PostStreamBeginLumi::slot_type const &iSlot)
void doSnapshot(const unsigned int ls, const bool isGlobalEOL)
void preStreamEarlyTermination(edm::StreamContext const &, edm::TerminationOrigin)
void watchPreGlobalEarlyTermination(PreGlobalEarlyTermination::slot_type const &iSlot)
std::atomic< FastMonState::InputState > inputSupervisorState_
StreamID const & streamID() const
Definition: StreamContext.h:55
static const std::string nopath_
constexpr double throughputFactor()
void watchPostModuleEventAcquire(PostModuleEventAcquire::slot_type const &iSlot)
std::filesystem::path runDirectory_
double f[11][100]
void watchPostGlobalEndLumi(PostGlobalEndLumi::slot_type const &iSlot)
void watchPreModuleBeginJob(PreModuleBeginJob::slot_type const &iSlot)
void preModuleEventAcquire(edm::StreamContext const &, edm::ModuleCallingContext const &)
void postModuleEvent(edm::StreamContext const &, edm::ModuleCallingContext const &)
LuminosityBlockID const & luminosityBlockID() const
Definition: GlobalContext.h:62
std::vector< std::string > fastPathList_
constexpr int nReservedPaths
void postStreamBeginLumi(edm::StreamContext const &)
Log< level::Info, false > LogInfo
def ls(path, rec=False)
Definition: eostools.py:349
void postStreamEndLumi(edm::StreamContext const &)
static void fillDescriptions(edm::ConfigurationDescriptions &descriptions)
void preStreamBeginLumi(edm::StreamContext const &)
std::atomic< unsigned long > totalEventsProcessed_
FedRawDataInputSource * inputSource_
std::map< unsigned int, std::pair< double, unsigned int > > lockStatsDuringLumi_
constexpr int nReservedModules
void watchPreStreamBeginLumi(PreStreamBeginLumi::slot_type const &iSlot)
void add(std::string const &label, ParameterSetDescription const &psetDescription)
tuple msg
Definition: mps_check.py:286
std::vector< double > leadTimes_
void preBeginJob(edm::PathsAndConsumesOfModulesBase const &, edm::ProcessContext const &pc)
void watchPreBeginJob(PreBeginJob::slot_type const &iSlot)
convenience function for attaching to signal
void stoppedLookingForFile(unsigned int lumi)
std::atomic< bool > has_source_exception_
EventID const & eventID() const
Definition: StreamContext.h:60
void postGlobalBeginRun(edm::GlobalContext const &)
void preEvent(edm::StreamContext const &)
std::string const & pathName() const
Definition: PathContext.h:30
Definition: output.py:1
unsigned int getEventsProcessedForLumi(unsigned int lumi, bool *abortFlag=nullptr)
void preSourceEarlyTermination(edm::TerminationOrigin)
void watchPreSourceEvent(PreSourceEvent::slot_type const &iSlot)
unsigned int value() const
Definition: StreamID.h:43
bool getAbortFlagForLumi(unsigned int lumi)
std::shared_ptr< FastMonitoringThread > fmt_
Log< level::Warning, false > LogWarning
void preModuleEvent(edm::StreamContext const &, edm::ModuleCallingContext const &)
Writes a Value in JSON format in a human friendly way.
Definition: writer.h:63
void reportLockWait(unsigned int ls, double waitTime, unsigned int lockCount)
std::vector< unsigned int > exceptionInLS_
std::vector< std::string > const & paths() const
#define LogDebug(id)
void watchPostBeginJob(PostBeginJob::slot_type const &iSlot)
convenience function for attaching to signal
array value (ordered list)
Definition: value.h:30