CMS 3D CMS Logo

List of all members | Public Member Functions | Static Public Member Functions | Private Attributes
CUDAService Class Reference

#include <CUDAService.h>

Public Member Functions

std::pair< int, int > computeCapability (int device) const
 
 CUDAService (edm::ParameterSet const &iConfig)
 Constructor. More...
 
int deviceWithMostFreeMemory () const
 
bool enabled () const
 
int numberOfDevices () const
 
 ~CUDAService ()
 

Static Public Member Functions

static void fillDescriptions (edm::ConfigurationDescriptions &descriptions)
 

Private Attributes

std::vector< std::pair< int, int > > computeCapabilities_
 
bool enabled_ = false
 
int numberOfDevices_ = 0
 
bool verbose_ = false
 

Detailed Description

Definition at line 15 of file CUDAService.h.

Constructor & Destructor Documentation

◆ CUDAService()

CUDAService::CUDAService ( edm::ParameterSet const &  config)

Constructor.

Definition at line 134 of file CUDAService.cc.

References HLT_2022v14_cff::allocator, cms::cuda::allocator::cachingAllocatorsConstruct(), cms::cuda::StreamCache::clear(), cms::cuda::EventCache::clear(), computeCapabilities_, cudaCheck, HLT_2022v14_cff::cudaLimitDevRuntimePendingLaunchCount, HLT_2022v14_cff::cudaLimitDevRuntimeSyncDepth, HLT_2022v14_cff::cudaLimitMallocHeapSize, HLT_2022v14_cff::cudaLimitPrintfFifoSize, HLT_2022v14_cff::cudaLimitStackSize, decodeVersion(), HLT_2022v14_cff::devicePreallocate, enabled_, HLT_2022v14_cff::flags, getCudaCoresPerSM(), cms::cuda::getEventCache(), cms::cuda::getStreamCache(), HLT_2022v14_cff::hostPreallocate, mps_fire::i, TH2PolyOfflineMaps::limits, dqm-mbProfile::log, SiStripPI::min, numberOfDevices_, nvmlCheck, or, setCudaLimit(), findQualityFiles::size, mps_update::status, cms::cuda::allocator::useCaching, relativeConstraints::value, and verbose_.

134  : verbose_(config.getUntrackedParameter<bool>("verbose")) {
135  bool configEnabled = config.getUntrackedParameter<bool>("enabled");
136  if (not configEnabled) {
137  edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
138  return;
139  }
140 
141  auto status = cudaGetDeviceCount(&numberOfDevices_);
142  if (cudaSuccess != status) {
143  edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n"
144  << "Disabling the CUDAService.";
145  return;
146  }
148 
149  // NVIDIA system driver version, e.g. 470.57.02
150  char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
151  nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH));
152  nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion, sizeof(systemDriverVersion)));
153  nvmlCheck(nvmlShutdown());
154 
155  // CUDA driver version, e.g. 11.4
156  // the full version, like 11.4.1 or 11.4.100, is not reported
157  int driverVersion = 0;
158  cudaCheck(cudaDriverGetVersion(&driverVersion));
159 
160  // CUDA runtime version, e.g. 11.4
161  // the full version, like 11.4.1 or 11.4.108, is not reported
162  int runtimeVersion = 0;
163  cudaCheck(cudaRuntimeGetVersion(&runtimeVersion));
164 
165  edm::LogInfo log("CUDAService");
166  if (verbose_) {
167  log << "NVIDIA driver: " << systemDriverVersion << '\n';
168  log << "CUDA driver API: " << decodeVersion(driverVersion) << " (compiled with " << decodeVersion(CUDA_VERSION)
169  << ")\n";
170  log << "CUDA runtime API: " << decodeVersion(runtimeVersion) << " (compiled with " << decodeVersion(CUDART_VERSION)
171  << ")\n";
172  log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n";
173  } else {
174  log << "CUDA runtime version " << decodeVersion(runtimeVersion) << ", driver version "
175  << decodeVersion(driverVersion) << ", NVIDIA driver version " << systemDriverVersion;
176  }
177 
178  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
179  auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
180  auto stackSize = limits.getUntrackedParameter<int>("cudaLimitStackSize");
181  auto mallocHeapSize = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
182  auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
183  auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");
184 
185  for (int i = 0; i < numberOfDevices_; ++i) {
186  // read information about the compute device.
187  // see the documentation of cudaGetDeviceProperties() for more information.
188  cudaDeviceProp properties;
189  cudaCheck(cudaGetDeviceProperties(&properties, i));
190  log << '\n' << "CUDA device " << i << ": " << properties.name;
191  if (verbose_) {
192  log << '\n';
193  }
194 
195  // compute capabilities
196  computeCapabilities_.emplace_back(properties.major, properties.minor);
197  if (verbose_) {
198  log << " compute capability: " << properties.major << "." << properties.minor;
199  }
200  log << " (sm_" << properties.major << properties.minor << ")";
201  if (verbose_) {
202  log << '\n';
203  log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
204  log << " CUDA cores: " << std::setw(28)
205  << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
206  log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio
207  << ":1\n";
208  }
209 
210  // compute mode
211  static constexpr const char* computeModeDescription[] = {
212  "default (shared)", // cudaComputeModeDefault
213  "exclusive (single thread)", // cudaComputeModeExclusive
214  "prohibited", // cudaComputeModeProhibited
215  "exclusive (single process)", // cudaComputeModeExclusiveProcess
216  "unknown"};
217  if (verbose_) {
218  log << " compute mode:" << std::right << std::setw(27)
219  << computeModeDescription[std::min(properties.computeMode,
220  static_cast<int>(std::size(computeModeDescription)) - 1)]
221  << '\n';
222  }
223 
224  // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with abort()
225  cudaCheck(cudaSetDevice(i));
226  cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));
227 
228  // read the free and total amount of memory available for allocation by the device, in bytes.
229  // see the documentation of cudaMemGetInfo() for more information.
230  if (verbose_) {
231  size_t freeMemory, totalMemory;
232  cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
233  log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
234  << totalMemory / (1 << 20) << " MB total\n";
235  log << " constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
236  log << " L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";
237  }
238 
239  // L1 cache behaviour
240  if (verbose_) {
241  static constexpr const char* l1CacheModeDescription[] = {
242  "unknown", "local memory", "global memory", "local and global memory"};
243  int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
244  log << " L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
245  log << '\n';
246 
247  log << "Other capabilities\n";
248  log << " " << (properties.canMapHostMemory ? "can" : "cannot")
249  << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
250  log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support")
251  << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
252  log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
253  << " access pageable memory via the host's page tables\n";
254  log << " " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
255  << " access host registered memory at the same virtual address as the host\n";
256  log << " " << (properties.unifiedAddressing ? "shares" : "does not share")
257  << " a unified address space with the host\n";
258  log << " " << (properties.managedMemory ? "supports" : "does not support")
259  << " allocating managed memory on this system\n";
260  log << " " << (properties.concurrentManagedAccess ? "can" : "cannot")
261  << " coherently access managed memory concurrently with the host\n";
262  log << " "
263  << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
264  << " directly access managed memory on the device without migration\n";
265  log << " " << (properties.cooperativeLaunch ? "supports" : "does not support")
266  << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
267  log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
268  << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
269  log << '\n';
270  }
271 
272  // set and read the CUDA device flags.
273  // see the documentation of cudaSetDeviceFlags and cudaGetDeviceFlags for more information.
274  if (verbose_) {
275  log << "CUDA flags\n";
276  unsigned int flags;
277  cudaCheck(cudaGetDeviceFlags(&flags));
278  switch (flags & cudaDeviceScheduleMask) {
279  case cudaDeviceScheduleAuto:
280  log << " thread policy: default\n";
281  break;
282  case cudaDeviceScheduleSpin:
283  log << " thread policy: spin\n";
284  break;
285  case cudaDeviceScheduleYield:
286  log << " thread policy: yield\n";
287  break;
288  case cudaDeviceScheduleBlockingSync:
289  log << " thread policy: blocking sync\n";
290  break;
291  default:
292  log << " thread policy: undefined\n";
293  }
294  if (flags & cudaDeviceMapHost) {
295  log << " pinned host memory allocations: enabled\n";
296  } else {
297  log << " pinned host memory allocations: disabled\n";
298  }
299  if (flags & cudaDeviceLmemResizeToMax) {
300  log << " kernel host memory reuse: enabled\n";
301  } else {
302  log << " kernel host memory reuse: disabled\n";
303  }
304  log << '\n';
305  }
306 
307  // set and read the CUDA resource limits.
308  // see the documentation of cudaDeviceSetLimit() for more information.
309 
310  // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
311  // printf() device system call.
312  if (printfFifoSize >= 0) {
313  setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
314  }
315  // cudaLimitStackSize controls the stack size in bytes of each GPU thread.
316  if (stackSize >= 0) {
317  setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
318  }
319  // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
320  // and free() device system calls.
321  if (mallocHeapSize >= 0) {
322  setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
323  }
324  if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
325  // cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at which
326  // a thread can safely call cudaDeviceSynchronize().
327  if (devRuntimeSyncDepth >= 0) {
328  setCudaLimit(cudaLimitDevRuntimeSyncDepth, "cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
329  }
330  // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
331  // device runtime launches that can be made from the current device.
332  if (devRuntimePendingLaunchCount >= 0) {
333  setCudaLimit(cudaLimitDevRuntimePendingLaunchCount,
334  "cudaLimitDevRuntimePendingLaunchCount",
335  devRuntimePendingLaunchCount);
336  }
337  }
338 
339  if (verbose_) {
340  size_t value;
341  log << "CUDA limits\n";
342  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
343  log << " printf buffer size: " << std::setw(10) << value / (1 << 20) << " MB\n";
344  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
345  log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n";
346  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
347  log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n";
348  if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
349  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth));
350  log << " runtime sync depth: " << std::setw(10) << value << '\n';
351  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimePendingLaunchCount));
352  log << "  runtime pending launch count: " << std::setw(10) << value << '\n';
353  }
354  }
355  }
356 
357  // Make sure the caching allocators and stream/event caches are constructed before declaring successful construction
358  if constexpr (cms::cuda::allocator::useCaching) {
360  }
363 
364  if (verbose_) {
365  log << '\n' << "CUDAService fully initialized";
366  }
367  enabled_ = true;
368 
369  // Preallocate buffers if asked to
370  auto const& allocator = config.getUntrackedParameter<edm::ParameterSet>("allocator");
371  devicePreallocate(numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
372  hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
373 }
size
Write out results.
bool verbose_
Definition: CUDAService.h:36
Definition: config.py:1
#define nvmlCheck(ARG,...)
Definition: nvmlCheck.h:52
void setCudaLimit(cudaLimit limit, const char *name, size_t request)
Definition: CUDAService.cc:24
int numberOfDevices_
Definition: CUDAService.h:33
The signals that services can subscribe to. This is based on ActivityRegistry. Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application. Each possible callback has a defined signature, which we list here in angle brackets, e.g. < void, edm::EventID const &, edm::Timestamp const & >. We also list in braces which AR_WATCH_USING_METHOD_ is used for those signals.
Definition: Activities.doc:12
Definition: value.py:1
bool enabled_
Definition: CUDAService.h:35
Log< level::Info, false > LogInfo
std::vector< std::pair< int, int > > computeCapabilities_
Definition: CUDAService.h:34
constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor)
Definition: CUDAService.cc:46
StreamCache & getStreamCache()
Definition: StreamCache.cc:39
constexpr bool useCaching
EventCache & getEventCache()
Definition: EventCache.cc:64
std::string decodeVersion(int version)
Definition: CUDAService.cc:95
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:69
Log< level::Warning, false > LogWarning

◆ ~CUDAService()

CUDAService::~CUDAService ( )

Definition at line 375 of file CUDAService.cc.

References cms::cuda::allocator::cachingAllocatorsFreeCached(), cms::cuda::StreamCache::clear(), cms::cuda::EventCache::clear(), cudaCheck, enabled_, cms::cuda::getEventCache(), cms::cuda::getStreamCache(), mps_fire::i, numberOfDevices_, and cms::cuda::allocator::useCaching.

375  {
376  if (enabled_) {
377  // Explicitly destruct the allocator before the device resets below
378  if constexpr (cms::cuda::allocator::useCaching) {
379  cms::cuda::allocator::cachingAllocatorsFreeCached();
380  }
381  cms::cuda::getEventCache().clear();
382  cms::cuda::getStreamCache().clear();
383 
384  for (int i = 0; i < numberOfDevices_; ++i) {
385  cudaCheck(cudaSetDevice(i));
386  cudaCheck(cudaDeviceSynchronize());
387  // Explicitly destroys and cleans up all resources associated with the current device in the
388  // current process. Any subsequent API call to this device will reinitialize the device.
389  // Useful to check for memory leaks with `cuda-memcheck --tool memcheck --leak-check full`.
390  cudaDeviceReset();
391  }
392  }
393 }
int numberOfDevices_
Definition: CUDAService.h:33
bool enabled_
Definition: CUDAService.h:35
StreamCache & getStreamCache()
Definition: StreamCache.cc:39
constexpr bool useCaching
EventCache & getEventCache()
Definition: EventCache.cc:64
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:69

Member Function Documentation

◆ computeCapability()

std::pair<int, int> CUDAService::computeCapability ( int  device) const
inline

Definition at line 27 of file CUDAService.h.

References computeCapabilities_.

27 { return computeCapabilities_.at(device); }
std::vector< std::pair< int, int > > computeCapabilities_
Definition: CUDAService.h:34

◆ deviceWithMostFreeMemory()

int CUDAService::deviceWithMostFreeMemory ( ) const

Definition at line 425 of file CUDAService.cc.

References cudaCheck, cms::cuda::currentDevice(), mps_fire::i, and numberOfDevices_.

425  {
426  // save the current device
427  int currentDevice;
428  cudaCheck(cudaGetDevice(&currentDevice));
429 
430  size_t maxFreeMemory = 0;
431  int device = -1;
432  for (int i = 0; i < numberOfDevices_; ++i) {
433  size_t freeMemory, totalMemory;
434  cudaSetDevice(i);
435  cudaMemGetInfo(&freeMemory, &totalMemory);
436  edm::LogPrint("CUDAService") << "CUDA device " << i << ": " << freeMemory / (1 << 20) << " MB free / "
437  << totalMemory / (1 << 20) << " MB total memory";
438  if (freeMemory > maxFreeMemory) {
439  maxFreeMemory = freeMemory;
440  device = i;
441  }
442  }
443  // restore the current device
444  cudaCheck(cudaSetDevice(currentDevice));
445  return device;
446 }
int numberOfDevices_
Definition: CUDAService.h:33
Log< level::Warning, true > LogPrint
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:69
int currentDevice()
Definition: currentDevice.h:10

◆ enabled()

bool CUDAService::enabled ( ) const
inline

◆ fillDescriptions()

void CUDAService::fillDescriptions ( edm::ConfigurationDescriptions &  descriptions)
static

Definition at line 395 of file CUDAService.cc.

References edm::ConfigurationDescriptions::add(), HLT_2022v14_cff::allocator, submitPVResolutionJobs::desc, and TH2PolyOfflineMaps::limits.

395  {
396  edm::ParameterSetDescription desc;
397  desc.addUntracked<bool>("enabled", true);
398  desc.addUntracked<bool>("verbose", false);
399 
400  edm::ParameterSetDescription limits;
401  limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)
402  ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
403  limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
404  limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)
405  ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
406  limits.addUntracked<int>("cudaLimitDevRuntimeSyncDepth", -1)
407  ->setComment("Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
408  limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)
409  ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
410  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
411  ->setComment(
412  "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
413  "the default value.");
414 
415  edm::ParameterSetDescription allocator;
416  allocator.addUntracked<std::vector<unsigned int> >("devicePreallocate", std::vector<unsigned int>{})
417  ->setComment("Preallocates buffers of given bytes on all devices");
418  allocator.addUntracked<std::vector<unsigned int> >("hostPreallocate", std::vector<unsigned int>{})
419  ->setComment("Preallocates buffers of given bytes on the host");
420  desc.addUntracked<edm::ParameterSetDescription>("allocator", allocator);
421 
422  descriptions.add("CUDAService", desc);
423 }
void add(std::string const &label, ParameterSetDescription const &psetDescription)

◆ numberOfDevices()

int CUDAService::numberOfDevices ( ) const
inline

Definition at line 24 of file CUDAService.h.

References numberOfDevices_.

Referenced by cms::cuda::chooseDevice(), and CUDAMonitoringService::CUDAMonitoringService().

24 { return numberOfDevices_; }
int numberOfDevices_
Definition: CUDAService.h:33

Member Data Documentation

◆ computeCapabilities_

std::vector<std::pair<int, int> > CUDAService::computeCapabilities_
private

Definition at line 34 of file CUDAService.h.

Referenced by computeCapability(), and CUDAService().

◆ enabled_

bool CUDAService::enabled_ = false
private

Definition at line 35 of file CUDAService.h.

Referenced by CUDAService(), enabled(), and ~CUDAService().

◆ numberOfDevices_

int CUDAService::numberOfDevices_ = 0
private

Definition at line 33 of file CUDAService.h.

Referenced by CUDAService(), deviceWithMostFreeMemory(), numberOfDevices(), and ~CUDAService().

◆ verbose_

bool CUDAService::verbose_ = false
private

Definition at line 36 of file CUDAService.h.

Referenced by CUDAService().