
CUDAMonitoringService.cc

#include <iostream>

#include <cuda.h>

// (CMSSW framework and CUDA utility #includes omitted from this listing: ParameterSet,
//  ConfigurationDescriptions, ActivityRegistry, Service, MessageLogger, CUDAService,
//  cudaCheck, deviceAllocatorStatus)

namespace edm {
  class StreamContext;
}

class CUDAMonitoringService {
public:
  CUDAMonitoringService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry);
  ~CUDAMonitoringService() = default;

  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);

  void postModuleConstruction(edm::ModuleDescription const& desc);
  void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc);
  void postModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc);
  void postEvent(edm::StreamContext const& sc);

private:
  int numberOfDevices_ = 0;
};

CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
  // make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor
  edm::Service<CUDAService> cuda;
  if (not cuda or not cuda->enabled())
    return;

  numberOfDevices_ = cuda->numberOfDevices();

  if (config.getUntrackedParameter<bool>("memoryConstruction")) {
    registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction);
  }
  if (config.getUntrackedParameter<bool>("memoryBeginStream")) {
    registry.watchPostModuleBeginStream(this, &CUDAMonitoringService::postModuleBeginStream);
  }
  if (config.getUntrackedParameter<bool>("memoryPerModule")) {
    registry.watchPostModuleEvent(this, &CUDAMonitoringService::postModuleEvent);
  }
  if (config.getUntrackedParameter<bool>("memoryPerEvent")) {
    registry.watchPostEvent(this, &CUDAMonitoringService::postEvent);
  }
}

void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
  edm::ParameterSetDescription desc;

  desc.addUntracked<bool>("memoryConstruction", false)
      ->setComment("Print memory information for each device after the construction of each module");
  desc.addUntracked<bool>("memoryBeginStream", true)
      ->setComment("Print memory information for each device after the beginStream() of each module");
  desc.addUntracked<bool>("memoryPerModule", true)
      ->setComment("Print memory information for each device after the event of each module");
  desc.addUntracked<bool>("memoryPerEvent", true)
      ->setComment("Print memory information for each device after each event");

  descriptions.add("CUDAMonitoringService", desc);
  descriptions.setComment(
      "The memory information is the global state of the device. This gets confusing if there are multiple processes "
      "running on the same device. Probably the information retrieval should be re-thought?");
}

// activity handlers
namespace {
  template <typename T>
  void dumpUsedMemory(T& log, int num) {
    auto const cachingDeviceAllocatorStatus = cms::cuda::deviceAllocatorStatus();
    int old = 0;
    cudaCheck(cudaGetDevice(&old));
    constexpr auto mbytes = 1 << 20;
    for (int i = 0; i < num; ++i) {
      size_t freeMemory, totalMemory;
      cudaCheck(cudaSetDevice(i));
      cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
      log << "\n"
          << i << ": " << (totalMemory - freeMemory) / mbytes << " MB used / " << totalMemory / mbytes << " MB total";
      auto found = cachingDeviceAllocatorStatus.find(i);
      if (found != cachingDeviceAllocatorStatus.end()) {
        auto const& cached = found->second;
        log << "; CachingDeviceAllocator " << cached.live / mbytes << " MB live "
            << "(" << cached.liveRequested / mbytes << " MB requested) " << cached.free / mbytes << " MB free "
            << (cached.live + cached.free) / mbytes << " MB total cached";
      }
    }
    cudaCheck(cudaSetDevice(old));
  }
}  // namespace

void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
  auto log = edm::LogPrint("CUDAMonitoringService");
  log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
  dumpUsedMemory(log, numberOfDevices_);
}

void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
  auto log = edm::LogPrint("CUDAMonitoringService");
  log << "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " ("
      << mcc.moduleDescription()->moduleName() << ")";
  dumpUsedMemory(log, numberOfDevices_);
}

void CUDAMonitoringService::postModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc) {
  auto log = edm::LogPrint("CUDAMonitoringService");
  log << "CUDA device memory after processing an event by " << mcc.moduleDescription()->moduleLabel() << " ("
      << mcc.moduleDescription()->moduleName() << ")";
  dumpUsedMemory(log, numberOfDevices_);
}

void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
  auto log = edm::LogPrint("CUDAMonitoringService");
  log << "CUDA device memory after event";
  dumpUsedMemory(log, numberOfDevices_);
}

DEFINE_FWK_SERVICE(CUDAMonitoringService);
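
The untracked parameters declared in fillDescriptions() above map directly onto a cmsRun Python configuration. Below is a minimal sketch of enabling the service; the process name and the parameter values are illustrative only, and the service produces output only when the CUDAService is present and enabled, as the constructor above enforces.

# Illustrative cmsRun configuration fragment; process name and values are examples.
import FWCore.ParameterSet.Config as cms

process = cms.Process("GPUMonitor")
process.CUDAMonitoringService = cms.Service("CUDAMonitoringService",
    memoryConstruction = cms.untracked.bool(False),
    memoryBeginStream = cms.untracked.bool(True),
    memoryPerModule = cms.untracked.bool(True),
    memoryPerEvent = cms.untracked.bool(True)
)

Each flag corresponds to one of the ActivityRegistry signals watched in the constructor, so turning a flag off simply skips the matching watchPost... registration.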