CMS 3D CMS Logo

CUDAMonitoringService.cc
Go to the documentation of this file.
1 #include <iostream>
2 
3 #include <cuda.h>
4 
17 
18 namespace edm {
19  class StreamContext;
20 }
21 
23 public:
25  ~CUDAMonitoringService() = default;
26 
27  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
28 
32  void postEvent(edm::StreamContext const& sc);
33 
34 private:
36 };
37 
39  // make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor
40  edm::Service<CUDAService> cudaService;
41  if (!cudaService->enabled())
42  return;
43  numberOfDevices_ = cudaService->numberOfDevices();
44 
45  if (config.getUntrackedParameter<bool>("memoryConstruction")) {
47  }
48  if (config.getUntrackedParameter<bool>("memoryBeginStream")) {
50  }
51  if (config.getUntrackedParameter<bool>("memoryPerModule")) {
53  }
54  if (config.getUntrackedParameter<bool>("memoryPerEvent")) {
56  }
57 }
58 
61 
62  desc.addUntracked<bool>("memoryConstruction", false)
63  ->setComment("Print memory information for each device after the construction of each module");
64  desc.addUntracked<bool>("memoryBeginStream", true)
65  ->setComment("Print memory information for each device after the beginStream() of each module");
66  desc.addUntracked<bool>("memoryPerModule", true)
67  ->setComment("Print memory information for each device after the event of each module");
68  desc.addUntracked<bool>("memoryPerEvent", true)
69  ->setComment("Print memory information for each device after each event");
70 
71  descriptions.add("CUDAMonitoringService", desc);
72  descriptions.setComment(
73  "The memory information is the global state of the device. This gets confusing if there are multiple processes "
74  "running on the same device. Probably the information retrieval should be re-thought?");
75 }
76 
77 // activity handlers
78 namespace {
79  template <typename T>
80  void dumpUsedMemory(T& log, int num) {
81  auto const cachingDeviceAllocatorStatus = cms::cuda::deviceAllocatorStatus();
82  int old = 0;
83  cudaCheck(cudaGetDevice(&old));
84  constexpr auto mbytes = 1 << 20;
85  for (int i = 0; i < num; ++i) {
86  size_t freeMemory, totalMemory;
87  cudaCheck(cudaSetDevice(i));
88  cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
89  log << "\n"
90  << i << ": " << (totalMemory - freeMemory) / mbytes << " MB used / " << totalMemory / mbytes << " MB total";
91  auto found = cachingDeviceAllocatorStatus.find(i);
92  if (found != cachingDeviceAllocatorStatus.end()) {
93  auto const& cached = found->second;
94  log << "; CachingDeviceAllocator " << cached.live / mbytes << " MB live "
95  << "(" << cached.liveRequested / mbytes << " MB requested) " << cached.free / mbytes << " MB free "
96  << (cached.live + cached.free) / mbytes << " MB total cached";
97  }
98  }
99  cudaCheck(cudaSetDevice(old));
100  }
101 } // namespace
102 
104  auto log = edm::LogPrint("CUDAMonitoringService");
105  log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
106  dumpUsedMemory(log, numberOfDevices_);
107 }
108 
110  auto log = edm::LogPrint("CUDAMonitoringService");
111  log << "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " ("
112  << mcc.moduleDescription()->moduleName() << ")";
113  dumpUsedMemory(log, numberOfDevices_);
114 }
115 
117  auto log = edm::LogPrint("CUDAMonitoringService");
118  log << "CUDA device memory after processing an event by " << mcc.moduleDescription()->moduleLabel() << " ("
119  << mcc.moduleDescription()->moduleName() << ")";
120  dumpUsedMemory(log, numberOfDevices_);
121 }
122 
124  auto log = edm::LogPrint("CUDAMonitoringService");
125  log << "CUDA device memory after event";
126  dumpUsedMemory(log, numberOfDevices_);
127 }
128 
void watchPostModuleConstruction(PostModuleConstruction::slot_type const &iSlot)
CUDAMonitoringService(edm::ParameterSet const &iConfig, edm::ActivityRegistry &iRegistry)
ModuleDescription const * moduleDescription() const
void watchPostEvent(PostEvent::slot_type const &iSlot)
static void fillDescriptions(edm::ConfigurationDescriptions &descriptions)
void watchPostModuleEvent(PostModuleEvent::slot_type const &iSlot)
std::string const & moduleName() const
Definition: config.py:1
int numberOfDevices() const
Definition: CUDAService.h:24
void postEvent(edm::StreamContext const &sc)
allocator::GpuCachedBytes deviceAllocatorStatus()
Log< level::Warning, true > LogPrint
#define DEFINE_FWK_SERVICE(type)
Definition: ServiceMaker.h:97
void setComment(std::string const &value)
void watchPostModuleBeginStream(PostModuleBeginStream::slot_type const &iSlot)
void add(std::string const &label, ParameterSetDescription const &psetDescription)
bool enabled() const
Definition: CUDAService.h:22
HLT enums.
void postModuleConstruction(edm::ModuleDescription const &desc)
void postModuleBeginStream(edm::StreamContext const &, edm::ModuleCallingContext const &mcc)
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:69
~CUDAMonitoringService()=default
long double T
std::string const & moduleLabel() const
void postModuleEvent(edm::StreamContext const &sc, edm::ModuleCallingContext const &mcc)