CMS 3D CMS Logo

List of all members | Public Member Functions | Static Public Member Functions | Private Attributes
CUDAService Class Reference
Inheritance diagram for CUDAService:
CUDAInterface

Public Member Functions

std::pair< int, int > computeCapability (int device) const final
 
 CUDAService (edm::ParameterSet const &config)
 Constructor. More...
 
bool enabled () const final
 
int numberOfDevices () const final
 
 ~CUDAService () override
 
- Public Member Functions inherited from CUDAInterface
 CUDAInterface ()=default
 
virtual ~CUDAInterface ()=default
 

Static Public Member Functions

static void fillDescriptions (edm::ConfigurationDescriptions &descriptions)
 

Private Attributes

std::vector< std::pair< int, int > > computeCapabilities_
 
bool enabled_ = false
 
int numberOfDevices_ = 0
 
bool verbose_ = false
 

Detailed Description

Definition at line 31 of file CUDAService.cc.

Constructor & Destructor Documentation

◆ CUDAService()

CUDAService::CUDAService ( edm::ParameterSet const &  config)

Constructor.

Definition at line 177 of file CUDAService.cc.

References HLT_2022v15_cff::allocator, cms::cuda::allocator::cachingAllocatorsConstruct(), cms::cuda::StreamCache::clear(), cms::cuda::EventCache::clear(), computeCapabilities_, cudaCheck, HLT_2022v15_cff::cudaLimitDevRuntimePendingLaunchCount, HLT_2022v15_cff::cudaLimitDevRuntimeSyncDepth, HLT_2022v15_cff::cudaLimitMallocHeapSize, HLT_2022v15_cff::cudaLimitPrintfFifoSize, HLT_2022v15_cff::cudaLimitStackSize, decodeVersion(), HLT_2022v15_cff::devicePreallocate, enabled_, HLT_2022v15_cff::flags, getCudaCoresPerSM(), cms::cuda::getEventCache(), cms::cuda::getStreamCache(), HLT_2022v15_cff::hostPreallocate, mps_fire::i, edm::Service< T >::isAvailable(), TH2PolyOfflineMaps::limits, dqm-mbProfile::log, SiStripPI::min, numberOfDevices_, nvmlCheck, or, edm::ResourceInformation::setCudaDriverVersion(), setCudaLimit(), edm::ResourceInformation::setCudaRuntimeVersion(), edm::ResourceInformation::setGPUModels(), edm::ResourceInformation::setNvidiaDriverVersion(), findQualityFiles::size, mps_update::status, AlCaHLTBitMon_QueryRunRegistry::string, cms::cuda::allocator::useCaching, relativeConstraints::value, and verbose_.

177  : verbose_(config.getUntrackedParameter<bool>("verbose")) {
178  if (not config.getUntrackedParameter<bool>("enabled")) {
179  edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
180  return;
181  }
182 
183  auto status = cudaGetDeviceCount(&numberOfDevices_);
184  if (cudaSuccess != status) {
185  edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n"
186  << "Disabling the CUDAService.";
187  return;
188  }
190 
191  // NVIDIA system driver version, e.g. 470.57.02
192  char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
193  nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH));
194  nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion, sizeof(systemDriverVersion)));
195  nvmlCheck(nvmlShutdown());
196 
197  // CUDA driver version, e.g. 11.4
198  // the full version, like 11.4.1 or 11.4.100, is not reported
199  int driverVersion = 0;
200  cudaCheck(cudaDriverGetVersion(&driverVersion));
201 
202  // CUDA runtime version, e.g. 11.4
203  // the full version, like 11.4.1 or 11.4.108, is not reported
204  int runtimeVersion = 0;
205  cudaCheck(cudaRuntimeGetVersion(&runtimeVersion));
206 
207  edm::LogInfo log("CUDAService");
208  if (verbose_) {
209  log << "NVIDIA driver: " << systemDriverVersion << '\n';
210  log << "CUDA driver API: " << decodeVersion(driverVersion) << " (compiled with " << decodeVersion(CUDA_VERSION)
211  << ")\n";
212  log << "CUDA runtime API: " << decodeVersion(runtimeVersion) << " (compiled with " << decodeVersion(CUDART_VERSION)
213  << ")\n";
214  log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n";
215  } else {
216  log << "CUDA runtime version " << decodeVersion(runtimeVersion) << ", driver version "
217  << decodeVersion(driverVersion) << ", NVIDIA driver version " << systemDriverVersion;
218  }
219 
220  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
221  auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
222  auto stackSize = limits.getUntrackedParameter<int>("cudaLimitStackSize");
223  auto mallocHeapSize = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
224  auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
225  auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");
226 
227  std::set<std::string> models;
228 
229  for (int i = 0; i < numberOfDevices_; ++i) {
230  // read information about the compute device.
231  // see the documentation of cudaGetDeviceProperties() for more information.
232  cudaDeviceProp properties;
233  cudaCheck(cudaGetDeviceProperties(&properties, i));
234  log << '\n' << "CUDA device " << i << ": " << properties.name;
235  if (verbose_) {
236  log << '\n';
237  }
238  models.insert(std::string(properties.name));
239 
240  // compute capabilities
241  computeCapabilities_.emplace_back(properties.major, properties.minor);
242  if (verbose_) {
243  log << " compute capability: " << properties.major << "." << properties.minor;
244  }
245  log << " (sm_" << properties.major << properties.minor << ")";
246  if (verbose_) {
247  log << '\n';
248  log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
249  log << " CUDA cores: " << std::setw(28)
250  << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
251  log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio
252  << ":1\n";
253  }
254 
255  // compute mode
256  static constexpr const char* computeModeDescription[] = {
257  "default (shared)", // cudaComputeModeDefault
258  "exclusive (single thread)", // cudaComputeModeExclusive
259  "prohibited", // cudaComputeModeProhibited
260  "exclusive (single process)", // cudaComputeModeExclusiveProcess
261  "unknown"};
262  if (verbose_) {
263  log << " compute mode:" << std::right << std::setw(27)
264  << computeModeDescription[std::min(properties.computeMode,
265  static_cast<int>(std::size(computeModeDescription)) - 1)]
266  << '\n';
267  }
268 
269  // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with abort()
270  cudaCheck(cudaSetDevice(i));
271  cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));
272 
273  // read the free and total amount of memory available for allocation by the device, in bytes.
274  // see the documentation of cudaMemGetInfo() for more information.
275  if (verbose_) {
276  size_t freeMemory, totalMemory;
277  cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
278  log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
279  << totalMemory / (1 << 20) << " MB total\n";
280  log << " constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
281  log << " L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";
282  }
283 
284  // L1 cache behaviour
285  if (verbose_) {
286  static constexpr const char* l1CacheModeDescription[] = {
287  "unknown", "local memory", "global memory", "local and global memory"};
288  int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
289  log << " L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
290  log << '\n';
291 
292  log << "Other capabilities\n";
293  log << " " << (properties.canMapHostMemory ? "can" : "cannot")
294  << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
295  log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support")
296  << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
297  log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
298  << " access pageable memory via the host's page tables\n";
299  log << " " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
300  << " access host registered memory at the same virtual address as the host\n";
301  log << " " << (properties.unifiedAddressing ? "shares" : "does not share")
302  << " a unified address space with the host\n";
303  log << " " << (properties.managedMemory ? "supports" : "does not support")
304  << " allocating managed memory on this system\n";
305  log << " " << (properties.concurrentManagedAccess ? "can" : "cannot")
306  << " coherently access managed memory concurrently with the host\n";
307  log << " "
308  << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
309  << " directly access managed memory on the device without migration\n";
310  log << " " << (properties.cooperativeLaunch ? "supports" : "does not support")
311  << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
312  log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
313  << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
314  log << '\n';
315  }
316 
317  // set and read the CUDA device flags.
318  // see the documentation of cudaSetDeviceFlags and cudaGetDeviceFlags for more information.
319  if (verbose_) {
320  log << "CUDA flags\n";
321  unsigned int flags;
322  cudaCheck(cudaGetDeviceFlags(&flags));
323  switch (flags & cudaDeviceScheduleMask) {
324  case cudaDeviceScheduleAuto:
325  log << " thread policy: default\n";
326  break;
327  case cudaDeviceScheduleSpin:
328  log << " thread policy: spin\n";
329  break;
330  case cudaDeviceScheduleYield:
331  log << " thread policy: yield\n";
332  break;
333  case cudaDeviceScheduleBlockingSync:
334  log << " thread policy: blocking sync\n";
335  break;
336  default:
337  log << " thread policy: undefined\n";
338  }
339  if (flags & cudaDeviceMapHost) {
340  log << " pinned host memory allocations: enabled\n";
341  } else {
342  log << " pinned host memory allocations: disabled\n";
343  }
344  if (flags & cudaDeviceLmemResizeToMax) {
345  log << " kernel host memory reuse: enabled\n";
346  } else {
347  log << " kernel host memory reuse: disabled\n";
348  }
349  log << '\n';
350  }
351 
352  // set and read the CUDA resource limits.
353  // see the documentation of cudaDeviceSetLimit() for more information.
354 
355  // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
356  // printf() device system call.
357  if (printfFifoSize >= 0) {
358  setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
359  }
360  // cudaLimitStackSize controls the stack size in bytes of each GPU thread.
361  if (stackSize >= 0) {
362  setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
363  }
364  // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
365  // and free() device system calls.
366  if (mallocHeapSize >= 0) {
367  setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
368  }
369  if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
370  // cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at which
371  // a thread can safely call cudaDeviceSynchronize().
372  if (devRuntimeSyncDepth >= 0) {
373  setCudaLimit(cudaLimitDevRuntimeSyncDepth, "cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
374  }
375  // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
376  // device runtime launches that can be made from the current device.
377  if (devRuntimePendingLaunchCount >= 0) {
379  "cudaLimitDevRuntimePendingLaunchCount",
380  devRuntimePendingLaunchCount);
381  }
382  }
383 
384  if (verbose_) {
385  size_t value;
386  log << "CUDA limits\n";
387  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
388  log << " printf buffer size: " << std::setw(10) << value / (1 << 20) << " MB\n";
389  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
390  log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n";
391  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
392  log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n";
393  if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
394  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth));
395  log << " runtime sync depth: " << std::setw(10) << value << '\n';
397  log << " runtime pending launch count: " << std::setw(10) << value << '\n';
398  }
399  }
400  }
401 
402  edm::Service<edm::ResourceInformation> resourceInformationService;
403  if (resourceInformationService.isAvailable()) {
404  std::vector<std::string> modelsV(models.begin(), models.end());
405  resourceInformationService->setGPUModels(modelsV);
406  std::string nvidiaDriverVersion{systemDriverVersion};
407  resourceInformationService->setNvidiaDriverVersion(nvidiaDriverVersion);
408  resourceInformationService->setCudaDriverVersion(driverVersion);
409  resourceInformationService->setCudaRuntimeVersion(runtimeVersion);
410  }
411 
412  // Make sure the caching allocators and stream/event caches are constructed before declaring successful construction
413  if constexpr (cms::cuda::allocator::useCaching) {
415  }
418 
419  if (verbose_) {
420  log << '\n' << "CUDAService fully initialized";
421  }
422  enabled_ = true;
423 
424  // Preallocate buffers if asked to
425  auto const& allocator = config.getUntrackedParameter<edm::ParameterSet>("allocator");
426  devicePreallocate(numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int>>("devicePreallocate"));
427  hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int>>("hostPreallocate"));
428 }
size
Write out results.
Definition: config.py:1
Definition: models.py:1
#define nvmlCheck(ARG,...)
Definition: nvmlCheck.h:52
void setCudaLimit(cudaLimit limit, const char *name, size_t request)
Definition: CUDAService.cc:59
virtual void setCudaRuntimeVersion(int)=0
int numberOfDevices_
Definition: CUDAService.cc:53
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventID const &, edm::Timestamp const & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
Definition: Activities.doc:12
Definition: value.py:1
Log< level::Info, false > LogInfo
std::vector< std::pair< int, int > > computeCapabilities_
Definition: CUDAService.cc:54
virtual void setNvidiaDriverVersion(std::string const &)=0
constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor)
Definition: CUDAService.cc:81
StreamCache & getStreamCache()
Definition: StreamCache.cc:39
constexpr bool useCaching
bool isAvailable() const
Definition: Service.h:40
EventCache & getEventCache()
Definition: EventCache.cc:64
std::string decodeVersion(int version)
Definition: CUDAService.cc:138
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:69
Log< level::Warning, false > LogWarning
virtual void setGPUModels(std::vector< std::string > const &)=0
virtual void setCudaDriverVersion(int)=0

◆ ~CUDAService()

CUDAService::~CUDAService ( )
override

Definition at line 430 of file CUDAService.cc.

References cms::cuda::allocator::cachingAllocatorsFreeCached(), cms::cuda::StreamCache::clear(), cms::cuda::EventCache::clear(), cudaCheck, enabled_, cms::cuda::getEventCache(), cms::cuda::getStreamCache(), mps_fire::i, numberOfDevices_, and cms::cuda::allocator::useCaching.

430  {
431  if (enabled_) {
432  // Explicitly destruct the allocator before the device resets below
433  if constexpr (cms::cuda::allocator::useCaching) {
435  }
438 
439  for (int i = 0; i < numberOfDevices_; ++i) {
440  cudaCheck(cudaSetDevice(i));
441  cudaCheck(cudaDeviceSynchronize());
442  // Explicitly destroys and cleans up all resources associated with the current device in the
443  // current process. Any subsequent API call to this device will reinitialize the device.
444  // Useful to check for memory leaks with `cuda-memcheck --tool memcheck --leak-check full`.
445  cudaDeviceReset();
446  }
447  }
448 }
int numberOfDevices_
Definition: CUDAService.cc:53
StreamCache & getStreamCache()
Definition: StreamCache.cc:39
constexpr bool useCaching
EventCache & getEventCache()
Definition: EventCache.cc:64
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:69

Member Function Documentation

◆ computeCapability()

std::pair<int, int> CUDAService::computeCapability ( int  device) const
inlinefinalvirtual

Implements CUDAInterface.

Definition at line 43 of file CUDAService.cc.

References computeCapabilities_, findQualityFiles::size, and cond::impl::to_string().

43  {
44  int size = computeCapabilities_.size();
45  if (device < 0 or device >= size) {
46  throw std::out_of_range("Invalid device index" + std::to_string(device) + ": the valid range is from 0 to " +
47  std::to_string(size - 1));
48  }
49  return computeCapabilities_[device];
50  }
size
Write out results.
std::string to_string(const V &value)
Definition: OMSAccess.h:71
std::vector< std::pair< int, int > > computeCapabilities_
Definition: CUDAService.cc:54

◆ enabled()

bool CUDAService::enabled ( ) const
inlinefinalvirtual

Implements CUDAInterface.

Definition at line 38 of file CUDAService.cc.

References enabled_.

38 { return enabled_; }

◆ fillDescriptions()

void CUDAService::fillDescriptions ( edm::ConfigurationDescriptions descriptions)
static

Definition at line 450 of file CUDAService.cc.

References edm::ConfigurationDescriptions::add(), HLT_2022v15_cff::allocator, submitPVResolutionJobs::desc, and TH2PolyOfflineMaps::limits.

450  {
452  desc.addUntracked<bool>("enabled", true);
453  desc.addUntracked<bool>("verbose", false);
454 
456  limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)
457  ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
458  limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
459  limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)
460  ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
461  limits.addUntracked<int>("cudaLimitDevRuntimeSyncDepth", -1)
462  ->setComment("Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
463  limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)
464  ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
465  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
466  ->setComment(
467  "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
468  "the default value.");
469 
471  allocator.addUntracked<std::vector<unsigned int>>("devicePreallocate", std::vector<unsigned int>{})
472  ->setComment("Preallocates buffers of given bytes on all devices");
473  allocator.addUntracked<std::vector<unsigned int>>("hostPreallocate", std::vector<unsigned int>{})
474  ->setComment("Preallocates buffers of given bytes on the host");
475  desc.addUntracked<edm::ParameterSetDescription>("allocator", allocator);
476 
477  descriptions.add("CUDAService", desc);
478 }
void add(std::string const &label, ParameterSetDescription const &psetDescription)

◆ numberOfDevices()

int CUDAService::numberOfDevices ( ) const
inlinefinalvirtual

Implements CUDAInterface.

Definition at line 40 of file CUDAService.cc.

References numberOfDevices_.

40 { return numberOfDevices_; }
int numberOfDevices_
Definition: CUDAService.cc:53

Member Data Documentation

◆ computeCapabilities_

std::vector<std::pair<int, int> > CUDAService::computeCapabilities_
private

Definition at line 54 of file CUDAService.cc.

Referenced by computeCapability(), and CUDAService().

◆ enabled_

bool CUDAService::enabled_ = false
private

Definition at line 55 of file CUDAService.cc.

Referenced by CUDAService(), enabled(), and ~CUDAService().

◆ numberOfDevices_

int CUDAService::numberOfDevices_ = 0
private

Definition at line 53 of file CUDAService.cc.

Referenced by CUDAService(), numberOfDevices(), and ~CUDAService().

◆ verbose_

bool CUDAService::verbose_ = false
private

Definition at line 56 of file CUDAService.cc.

Referenced by CUDAService().