CMS 3D CMS Logo

CUDAMonitoringService.cc
Go to the documentation of this file.
1 #include <iostream>
2 
3 #include <cuda.h>
4 
17 
18 namespace edm {
19  class StreamContext;  // forward declaration only; this file refers to edm::StreamContext by const reference in handler signatures
20 }
21 
23 public:
25  ~CUDAMonitoringService() = default;
26 
27  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
28 
32  void postEvent(edm::StreamContext const& sc);
33 
34 private:
36 };
37 
39  // make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor
40  edm::Service<CUDAService> cudaService;
41  if (!cudaService->enabled())
42  return;
43  numberOfDevices_ = cudaService->numberOfDevices();
44 
45  if (config.getUntrackedParameter<bool>("memoryConstruction")) {
47  }
48  if (config.getUntrackedParameter<bool>("memoryBeginStream")) {
50  }
51  if (config.getUntrackedParameter<bool>("memoryPerModule")) {
53  }
54  if (config.getUntrackedParameter<bool>("memoryPerEvent")) {
56  }
57 }
58 
61 
62  desc.addUntracked<bool>("memoryConstruction", false)
63  ->setComment("Print memory information for each device after the construction of each module");
64  desc.addUntracked<bool>("memoryBeginStream", true)
65  ->setComment("Print memory information for each device after the beginStream() of each module");
66  desc.addUntracked<bool>("memoryPerModule", true)
67  ->setComment("Print memory information for each device after the event of each module");
68  desc.addUntracked<bool>("memoryPerEvent", true)
69  ->setComment("Print memory information for each device after each event");
70 
71  descriptions.add("CUDAMonitoringService", desc);
72  descriptions.setComment(
73  "The memory information is the global state of the device. This gets confusing if there are multiple processes "
74  "running on the same device. Probably the information retrieval should be re-thought?");
75 }
76 
77 // activity handlers
78 namespace {
    // Append a per-device memory report to `log`.
    //
    // For every device index in [0, num) this writes one line with the used and
    // total memory reported by cudaMemGetInfo(), followed (when an entry for that
    // device exists in the caching allocator status) by the allocator's live,
    // requested, free and total cached amounts. All figures are divided by 2^20
    // and labelled "MB".
    //
    // Parameters:
    //   log - any object supporting operator<< for strings and integers; the
    //         callers in this file pass an edm::LogPrint.
    //   num - number of devices to report on; the callers in this file pass
    //         numberOfDevices_.
    //
    // The current device is saved before the loop and restored afterwards.
    // NOTE(review): the restore is a plain statement after the loop, so it is
    // skipped if cudaCheck leaves the function early on failure (e.g. by
    // throwing) - confirm cudaCheck's failure behaviour.
79  template <typename T>
80  void dumpUsedMemory(T& log, int num) {
81  auto const cachingDeviceAllocatorStatus = cms::cuda::deviceAllocatorStatus();  // single snapshot, taken before the loop; looked up by device index below
82  int old = 0;
83  cudaCheck(cudaGetDevice(&old));  // remember the current device so it can be restored at the end
84  constexpr auto mbytes = 1 << 20;  // divisor converting byte counts to the "MB" figures printed below
85  for (int i = 0; i < num; ++i) {
86  size_t freeMemory, totalMemory;
87  cudaCheck(cudaSetDevice(i));  // cudaMemGetInfo reports on the current device, so switch to device i first
88  cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
89  log << "\n"
90  << i << ": " << (totalMemory - freeMemory) / mbytes << " MB used / " << totalMemory / mbytes << " MB total";
91  auto found = cachingDeviceAllocatorStatus.find(i);
92  if (found != cachingDeviceAllocatorStatus.end()) {  // allocator figures are printed only when the snapshot has an entry for device i
93  auto const& cached = found->second;
94  log << "; CachingDeviceAllocator " << cached.live / mbytes << " MB live "
95  << "(" << cached.liveRequested / mbytes << " MB requested) " << cached.free / mbytes << " MB free "
96  << (cached.live + cached.free) / mbytes << " MB total cached";
97  }
98  }
99  cudaCheck(cudaSetDevice(old));  // restore the device that was current on entry
100  }
101 } // namespace
102 
104  auto log = edm::LogPrint("CUDAMonitoringService");
105  log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
106  dumpUsedMemory(log, numberOfDevices_);
107 }
108 
110  auto log = edm::LogPrint("CUDAMonitoringService");
111  log << "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " ("
112  << mcc.moduleDescription()->moduleName() << ")";
113  dumpUsedMemory(log, numberOfDevices_);
114 }
115 
117  auto log = edm::LogPrint("CUDAMonitoringService");
118  log << "CUDA device memory after processing an event by " << mcc.moduleDescription()->moduleLabel() << " ("
119  << mcc.moduleDescription()->moduleName() << ")";
120  dumpUsedMemory(log, numberOfDevices_);
121 }
122 
124  auto log = edm::LogPrint("CUDAMonitoringService");
125  log << "CUDA device memory after event";
126  dumpUsedMemory(log, numberOfDevices_);
127 }
128 
ConfigurationDescriptions.h
CUDAMonitoringService::~CUDAMonitoringService
~CUDAMonitoringService()=default
edm::ModuleDescription::moduleLabel
std::string const & moduleLabel() const
Definition: ModuleDescription.h:43
ModuleCallingContext.h
mps_fire.i
i
Definition: mps_fire.py:428
MessageLogger.h
edm
HLT enums.
Definition: AlignableModifier.h:19
edm::LogPrint
Log< level::Warning, true > LogPrint
Definition: MessageLogger.h:130
edm::ModuleDescription::moduleName
std::string const & moduleName() const
Definition: ModuleDescription.h:42
edm::ParameterSetDescription
Definition: ParameterSetDescription.h:52
DEFINE_FWK_SERVICE
#define DEFINE_FWK_SERVICE(type)
Definition: ServiceMaker.h:96
CUDAMonitoringService
Definition: CUDAMonitoringService.cc:22
edm::ModuleCallingContext::moduleDescription
ModuleDescription const * moduleDescription() const
Definition: ModuleCallingContext.h:50
newFWLiteAna.found
found
Definition: newFWLiteAna.py:118
CUDAService::numberOfDevices
int numberOfDevices() const
Definition: CUDAService.h:24
CUDAMonitoringService::postModuleEvent
void postModuleEvent(edm::StreamContext const &sc, edm::ModuleCallingContext const &mcc)
Definition: CUDAMonitoringService.cc:116
edm::ModuleDescription
Definition: ModuleDescription.h:21
edm::ActivityRegistry::watchPostModuleBeginStream
void watchPostModuleBeginStream(PostModuleBeginStream::slot_type const &iSlot)
Definition: ActivityRegistry.h:253
ModuleDescription.h
ActivityRegistry.h
config
Definition: config.py:1
CUDAMonitoringService::CUDAMonitoringService
CUDAMonitoringService(edm::ParameterSet const &iConfig, edm::ActivityRegistry &iRegistry)
Definition: CUDAMonitoringService.cc:38
edm::ConfigurationDescriptions::add
void add(std::string const &label, ParameterSetDescription const &psetDescription)
Definition: ConfigurationDescriptions.cc:57
edm::StreamContext
Definition: StreamContext.h:31
Service.h
edm::ActivityRegistry
Definition: ActivityRegistry.h:134
cms::cuda::deviceAllocatorStatus
allocator::GpuCachedBytes deviceAllocatorStatus()
Definition: deviceAllocatorStatus.cc:6
ParameterSetDescription.h
edm::ActivityRegistry::watchPostModuleConstruction
void watchPostModuleConstruction(PostModuleConstruction::slot_type const &iSlot)
Definition: ActivityRegistry.h:706
CUDAMonitoringService::numberOfDevices_
int numberOfDevices_
Definition: CUDAMonitoringService.cc:35
ServiceMaker.h
edm::ConfigurationDescriptions
Definition: ConfigurationDescriptions.h:28
edm::ActivityRegistry::watchPostEvent
void watchPostEvent(PostEvent::slot_type const &iSlot)
Definition: ActivityRegistry.h:464
edm::ParameterSet
Definition: ParameterSet.h:47
edm::ConfigurationDescriptions::setComment
void setComment(std::string const &value)
Definition: ConfigurationDescriptions.cc:48
edm::Service
Definition: Service.h:30
cudaCheck.h
CUDAMonitoringService::postEvent
void postEvent(edm::StreamContext const &sc)
Definition: CUDAMonitoringService.cc:123
deviceAllocatorStatus.h
CUDAMonitoringService::postModuleConstruction
void postModuleConstruction(edm::ModuleDescription const &desc)
Definition: CUDAMonitoringService.cc:103
EgammaValidation_cff.num
num
Definition: EgammaValidation_cff.py:34
CUDAService.h
CUDAService::enabled
bool enabled() const
Definition: CUDAService.h:22
CUDAMonitoringService::postModuleBeginStream
void postModuleBeginStream(edm::StreamContext const &, edm::ModuleCallingContext const &mcc)
Definition: CUDAMonitoringService.cc:109
submitPVResolutionJobs.desc
string desc
Definition: submitPVResolutionJobs.py:251
cudaCheck
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:62
edm::ActivityRegistry::watchPostModuleEvent
void watchPostModuleEvent(PostModuleEvent::slot_type const &iSlot)
Definition: ActivityRegistry.h:783
T
long double T
Definition: Basic3DVectorLD.h:48
dqm-mbProfile.log
log
Definition: dqm-mbProfile.py:17
CUDAMonitoringService::fillDescriptions
static void fillDescriptions(edm::ConfigurationDescriptions &descriptions)
Definition: CUDAMonitoringService.cc:59
ParameterSet.h
edm::ModuleCallingContext
Definition: ModuleCallingContext.h:29