#include <CUDAService.h>

Public Member Functions
std::pair< int, int >	computeCapability (int device) const

	CUDAService (edm::ParameterSet const &iConfig)
	Constructor. More...

int	deviceWithMostFreeMemory () const

bool	enabled () const

int	numberOfDevices () const

	~CUDAService ()

Static Public Member Functions
static void	fillDescriptions (edm::ConfigurationDescriptions &descriptions)

Private Attributes
std::vector< std::pair< int, int > >	computeCapabilities_

bool	enabled_ = false

int	numberOfDevices_ = 0

bool	verbose_ = false

Detailed Description

Definition at line 15 of file CUDAService.h.

Constructor & Destructor Documentation

◆ CUDAService()

CUDAService::CUDAService ( edm::ParameterSet const & iConfig )

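Constructor. Reads the "enabled" and "verbose" configuration flags, queries the CUDA runtime for the number of devices, logs the driver/runtime versions and the properties of each device, applies the configured per-device CUDA limits, sets up the caching allocators and the stream/event caches, and finally preallocates the requested device and host buffers.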

Definition at line 134 of file CUDAService.cc.

References HLT_2022v12_cff::allocator, cms::cuda::allocator::cachingAllocatorsConstruct(), cms::cuda::StreamCache::clear(), cms::cuda::EventCache::clear(), computeCapabilities_, cudaCheck, HLT_2022v12_cff::cudaLimitDevRuntimePendingLaunchCount, HLT_2022v12_cff::cudaLimitDevRuntimeSyncDepth, HLT_2022v12_cff::cudaLimitMallocHeapSize, HLT_2022v12_cff::cudaLimitPrintfFifoSize, HLT_2022v12_cff::cudaLimitStackSize, decodeVersion(), HLT_2022v12_cff::devicePreallocate, enabled_, HLT_2022v12_cff::flags, getCudaCoresPerSM(), cms::cuda::getEventCache(), cms::cuda::getStreamCache(), HLT_2022v12_cff::hostPreallocate, mps_fire::i, TH2PolyOfflineMaps::limits, dqm-mbProfile::log, SiStripPI::min, numberOfDevices_, nvmlCheck, or, setCudaLimit(), findQualityFiles::size, mps_update::status, cms::cuda::allocator::useCaching, relativeConstraints::value, and verbose_.

                                                          : verbose_(config.getUntrackedParameter<bool>("verbose")) {
   // Initialise the service: detect the CUDA devices, report their properties, apply the configured limits
   // and set up the allocator and stream/event caches. enabled_ only becomes true if every step below is
   // reached; both early returns leave it at its default value.
   bool configEnabled = config.getUntrackedParameter<bool>("enabled");
   if (not configEnabled) {
     edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
     return;
   }
 
   // A failure to query the device count is not fatal: it is logged as a warning and the service stays disabled.
   auto status = cudaGetDeviceCount(&numberOfDevices_);
   if (cudaSuccess != status) {
     edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n"
                                    << "Disabling the CUDAService.";
     return;
   }
   // one (major, minor) entry per device is appended in the device loop below
   computeCapabilities_.reserve(numberOfDevices_);
 
   // NVIDIA system driver version, e.g. 470.57.02
   // NVML is initialised only for the duration of this query and shut down right after it.
   char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
   nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH));
   nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion, sizeof(systemDriverVersion)));
   nvmlCheck(nvmlShutdown());
 
   // CUDA driver version, e.g. 11.4
   // the full version, like 11.4.1 or 11.4.100, is not reported
   int driverVersion = 0;
   cudaCheck(cudaDriverGetVersion(&driverVersion));
 
   // CUDA runtime version, e.g. 11.4
   // the full version, like 11.4.1 or 11.4.108, is not reported
   int runtimeVersion = 0;
   cudaCheck(cudaRuntimeGetVersion(&runtimeVersion));
 
   // All the output below is streamed into this single LogInfo object.
   // The verbose mode prints a multi-line report, the default mode a compact summary.
   edm::LogInfo log("CUDAService");
   if (verbose_) {
     log << "NVIDIA driver:    " << systemDriverVersion << '\n';
     log << "CUDA driver API:  " << decodeVersion(driverVersion) << " (compiled with " << decodeVersion(CUDA_VERSION)
         << ")\n";
     log << "CUDA runtime API: " << decodeVersion(runtimeVersion) << " (compiled with " << decodeVersion(CUDART_VERSION)
         << ")\n";
     log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n";
   } else {
     log << "CUDA runtime version " << decodeVersion(runtimeVersion) << ", driver version "
         << decodeVersion(driverVersion) << ", NVIDIA driver version " << systemDriverVersion;
   }
 
   // Requested resource limits, read once and applied to every device in the loop below.
   // A negative value means that the corresponding limit is not set (see the ">= 0" checks further down).
   auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
   auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
   auto stackSize = limits.getUntrackedParameter<int>("cudaLimitStackSize");
   auto mallocHeapSize = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
   auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
   auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");
 
   for (int i = 0; i < numberOfDevices_; ++i) {
     // read information about the compute device.
     // see the documentation of cudaGetDeviceProperties() for more information.
     cudaDeviceProp properties;
     cudaCheck(cudaGetDeviceProperties(&properties, i));
     log << '\n' << "CUDA device " << i << ": " << properties.name;
     if (verbose_) {
       log << '\n';
     }
 
     // compute capabilities
     // the entry is stored at index i, which is what computeCapability(device) looks up
     computeCapabilities_.emplace_back(properties.major, properties.minor);
     if (verbose_) {
       log << "  compute capability:          " << properties.major << "." << properties.minor;
     }
     // the "(sm_XY)" suffix is printed in both the verbose and the compact output
     log << " (sm_" << properties.major << properties.minor << ")";
     if (verbose_) {
       log << '\n';
       log << "  streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
       log << "  CUDA cores: " << std::setw(28)
           << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
       log << "  single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio
           << ":1\n";
     }
 
     // compute mode
     // the table is indexed by properties.computeMode; the last entry is the fallback for larger values
     static constexpr const char* computeModeDescription[] = {
         "default (shared)",            // cudaComputeModeDefault
         "exclusive (single thread)",   // cudaComputeModeExclusive
         "prohibited",                  // cudaComputeModeProhibited
         "exclusive (single process)",  // cudaComputeModeExclusiveProcess
         "unknown"};
     if (verbose_) {
       // std::min clamps the index to the last ("unknown") entry
       log << "  compute mode:" << std::right << std::setw(27)
           << computeModeDescription[std::min(properties.computeMode,
                                              static_cast<int>(std::size(computeModeDescription)) - 1)]
           << '\n';
     }
 
     // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with abort()
     // make device i the current one: the memory, flag and limit calls below do not take a device argument
     cudaCheck(cudaSetDevice(i));
     cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));
 
     // read the free and total amount of memory available for allocation by the device, in bytes.
     // see the documentation of cudaMemGetInfo() for more information.
     if (verbose_) {
       size_t freeMemory, totalMemory;
       cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
       // (1 << 20) and (1 << 10) convert bytes to the MB and kB units printed in the messages
       log << "  memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
           << totalMemory / (1 << 20) << " MB total\n";
       log << "  constant memory:               " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
       log << "  L2 cache size:                 " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";
     }
 
     // L1 cache behaviour
     if (verbose_) {
       static constexpr const char* l1CacheModeDescription[] = {
           "unknown", "local memory", "global memory", "local and global memory"};
       // table index: +1 if local memory is cached in L1, +2 if global memory is cached in L1
       int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
       log << "  L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
       log << '\n';
 
       log << "Other capabilities\n";
       log << "  " << (properties.canMapHostMemory ? "can" : "cannot")
           << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
       log << "  " << (properties.pageableMemoryAccess ? "supports" : "does not support")
           << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
       log << "  " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
           << " access pageable memory via the host's page tables\n";
       log << "  " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
           << " access host registered memory at the same virtual address as the host\n";
       log << "  " << (properties.unifiedAddressing ? "shares" : "does not share")
           << " a unified address space with the host\n";
       log << "  " << (properties.managedMemory ? "supports" : "does not support")
           << " allocating managed memory on this system\n";
       log << "  " << (properties.concurrentManagedAccess ? "can" : "cannot")
           << " coherently access managed memory concurrently with the host\n";
       log << "  "
           << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
           << " directly access managed memory on the device without migration\n";
       log << "  " << (properties.cooperativeLaunch ? "supports" : "does not support")
           << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
       log << "  " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
           << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
       log << '\n';
     }
 
     // set and read the CUDA device flags.
     // see the documentation of cudaSetDeviceFlags and cudaGetDeviceFlags for more information.
     // (the flags were set right after cudaSetDevice() above; here they are only read back and reported)
     if (verbose_) {
       log << "CUDA flags\n";
       unsigned int flags;
       cudaCheck(cudaGetDeviceFlags(&flags));
       switch (flags & cudaDeviceScheduleMask) {
         case cudaDeviceScheduleAuto:
           log << "  thread policy:                   default\n";
           break;
         case cudaDeviceScheduleSpin:
           log << "  thread policy:                      spin\n";
           break;
         case cudaDeviceScheduleYield:
           log << "  thread policy:                     yield\n";
           break;
         case cudaDeviceScheduleBlockingSync:
           log << "  thread policy:             blocking sync\n";
           break;
         default:
           log << "  thread policy:                 undefined\n";
       }
       if (flags & cudaDeviceMapHost) {
         log << "  pinned host memory allocations:  enabled\n";
       } else {
         log << "  pinned host memory allocations: disabled\n";
       }
       if (flags & cudaDeviceLmemResizeToMax) {
         log << "  kernel host memory reuse:        enabled\n";
       } else {
         log << "  kernel host memory reuse:       disabled\n";
       }
       log << '\n';
     }
 
     // set and read the CUDA resource limits.
     // see the documentation of cudaDeviceSetLimit() for more information.
 
     // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
     // printf() device system call.
     if (printfFifoSize >= 0) {
       setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
     }
     // cudaLimitStackSize controls the stack size in bytes of each GPU thread.
     if (stackSize >= 0) {
       setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
     }
     // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
     // and free() device system calls.
     if (mallocHeapSize >= 0) {
       setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
     }
     // the two device runtime limits are only set on devices with compute capability 3.5 or higher
     if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
       // cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at which
       // a thread can safely call cudaDeviceSynchronize().
       if (devRuntimeSyncDepth >= 0) {
         setCudaLimit(cudaLimitDevRuntimeSyncDepth, "cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
       }
       // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
       // device runtime launches that can be made from the current device.
       if (devRuntimePendingLaunchCount >= 0) {
         setCudaLimit(cudaLimitDevRuntimePendingLaunchCount,
                      "cudaLimitDevRuntimePendingLaunchCount",
                      devRuntimePendingLaunchCount);
       }
     }
 
     // report the limits as read back from the device, whether or not they were changed above
     if (verbose_) {
       size_t value;
       log << "CUDA limits\n";
       cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
       log << "  printf buffer size:        " << std::setw(10) << value / (1 << 20) << " MB\n";
       cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
       log << "  stack size:                " << std::setw(10) << value / (1 << 10) << " kB\n";
       cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
       log << "  malloc heap size:          " << std::setw(10) << value / (1 << 20) << " MB\n";
       // same compute capability requirement as for setting the device runtime limits
       if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
         cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth));
         log << "  runtime sync depth:           " << std::setw(10) << value << '\n';
         cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimePendingLaunchCount));
         log << "  runtime pending launch count: " << std::setw(10) << value << '\n';
       }
     }
   }
 
   // Make sure the caching allocators and stream/event caches are constructed before declaring successful construction
   if constexpr (cms::cuda::allocator::useCaching) {
     cms::cuda::allocator::cachingAllocatorsConstruct();
   }
   cms::cuda::getEventCache().clear();
   cms::cuda::getStreamCache().clear();
 
   if (verbose_) {
     log << '\n' << "CUDAService fully initialized";
   }
   // from here on enabled() returns true; note that this happens before the preallocation below
   enabled_ = true;
 
   // Preallocate buffers if asked to
   auto const& allocator = config.getUntrackedParameter<edm::ParameterSet>("allocator");
   devicePreallocate(numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
   hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
 }

◆ ~CUDAService()

CUDAService::~CUDAService ( )

Definition at line 375 of file CUDAService.cc.

References cms::cuda::allocator::cachingAllocatorsFreeCached(), cms::cuda::StreamCache::clear(), cms::cuda::EventCache::clear(), cudaCheck, enabled_, cms::cuda::getEventCache(), cms::cuda::getStreamCache(), mps_fire::i, numberOfDevices_, and cms::cuda::allocator::useCaching.

                           {
   if (enabled_) {
     // Explicitly destruct the allocator before the device resets below
     if constexpr (cms::cuda::allocator::useCaching) {
       cms::cuda::allocator::cachingAllocatorsFreeCached();
     }
     cms::cuda::getEventCache().clear();
     cms::cuda::getStreamCache().clear();
 
     for (int i = 0; i < numberOfDevices_; ++i) {
       cudaCheck(cudaSetDevice(i));
       cudaCheck(cudaDeviceSynchronize());
       // Explicitly destroys and cleans up all resources associated with the current device in the
       // current process. Any subsequent API call to this device will reinitialize the device.
       // Useful to check for memory leaks with `cuda-memcheck --tool memcheck --leak-check full`.
       cudaDeviceReset();
     }
   }
 }

Member Function Documentation

◆ computeCapability()

std::pair<int, int> CUDAService::computeCapability ( int device ) const

inline

Definition at line 27 of file CUDAService.h.

References computeCapabilities_.

27 { return computeCapabilities_.at(device); }  // (major, minor) pair of the given device; std::vector::at() throws std::out_of_range for an invalid index

CUDAService::computeCapabilities_

std::vector< std::pair< int, int > > computeCapabilities_

Definition: CUDAService.h:34

◆ deviceWithMostFreeMemory()

int CUDAService::deviceWithMostFreeMemory ( ) const

Definition at line 425 of file CUDAService.cc.

References cudaCheck, cms::cuda::currentDevice(), mps_fire::i, and numberOfDevices_.

                                                 {
   // save the current device
   int currentDevice;
   cudaCheck(cudaGetDevice(&currentDevice));
 
   size_t maxFreeMemory = 0;
   int device = -1;
   for (int i = 0; i < numberOfDevices_; ++i) {
     size_t freeMemory, totalMemory;
     cudaSetDevice(i);
     cudaMemGetInfo(&freeMemory, &totalMemory);
     edm::LogPrint("CUDAService") << "CUDA device " << i << ": " << freeMemory / (1 << 20) << " MB free / "
                                  << totalMemory / (1 << 20) << " MB total memory";
     if (freeMemory > maxFreeMemory) {
       maxFreeMemory = freeMemory;
       device = i;
     }
   }
   // restore the current device
   cudaCheck(cudaSetDevice(currentDevice));
   return device;
 }

◆ enabled()

bool CUDAService::enabled ( ) const

inline

Definition at line 22 of file CUDAService.h.

References enabled_.

Referenced by cms::cuda::chooseDevice(), and CUDAMonitoringService::CUDAMonitoringService().

22 { return enabled_; }  // enabled_ defaults to false and is set to true only at the end of the constructor

CUDAService::enabled_

bool enabled_

Definition: CUDAService.h:35

◆ fillDescriptions()

void CUDAService::fillDescriptions ( edm::ConfigurationDescriptions & descriptions )

static

Definition at line 395 of file CUDAService.cc.

References edm::ConfigurationDescriptions::add(), HLT_2022v12_cff::allocator, submitPVResolutionJobs::desc, and TH2PolyOfflineMaps::limits.

                                                                              {
   edm::ParameterSetDescription desc;
   desc.addUntracked<bool>("enabled", true);
   desc.addUntracked<bool>("verbose", false);
 
   edm::ParameterSetDescription limits;
   limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)
       ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
   limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
   limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)
       ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
   limits.addUntracked<int>("cudaLimitDevRuntimeSyncDepth", -1)
       ->setComment("Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
   limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)
       ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
   desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
       ->setComment(
           "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
           "the default value.");
 
   edm::ParameterSetDescription allocator;
   allocator.addUntracked<std::vector<unsigned int> >("devicePreallocate", std::vector<unsigned int>{})
       ->setComment("Preallocates buffers of given bytes on all devices");
   allocator.addUntracked<std::vector<unsigned int> >("hostPreallocate", std::vector<unsigned int>{})
       ->setComment("Preallocates buffers of given bytes on the host");
   desc.addUntracked<edm::ParameterSetDescription>("allocator", allocator);
 
   descriptions.add("CUDAService", desc);
 }