
CUDAService Class Reference

#include <CUDAService.h>

Public Member Functions

std::pair< int, int > computeCapability (int device) const
 
 CUDAService (edm::ParameterSet const &iConfig)
 Constructor.
 
int deviceWithMostFreeMemory () const
 
bool enabled () const
 
int numberOfDevices () const
 
 ~CUDAService ()
 

Static Public Member Functions

static void fillDescriptions (edm::ConfigurationDescriptions &descriptions)
 

Private Attributes

std::vector< std::pair< int, int > > computeCapabilities_
 
bool enabled_ = false
 
int numberOfDevices_ = 0
 

Detailed Description

Definition at line 15 of file CUDAService.h.

Constructor & Destructor Documentation

◆ CUDAService()

CUDAService::CUDAService ( edm::ParameterSet const &  iConfig)

Constructor.

Definition at line 122 of file CUDAService.cc.

122  {
123  bool configEnabled = config.getUntrackedParameter<bool>("enabled");
124  if (not configEnabled) {
125  edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
126  return;
127  }
128 
129  auto status = cudaGetDeviceCount(&numberOfDevices_);
130  if (cudaSuccess != status) {
131  edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n"
132  << "Disabling the CUDAService.";
133  return;
134  }
135  edm::LogInfo log("CUDAService");
136  computeCapabilities_.reserve(numberOfDevices_);
137  log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n";
138 
139  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
140  auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
141  auto stackSize = limits.getUntrackedParameter<int>("cudaLimitStackSize");
142  auto mallocHeapSize = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
143  auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
144  auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");
145 
146  for (int i = 0; i < numberOfDevices_; ++i) {
147  // read information about the compute device.
148  // see the documentation of cudaGetDeviceProperties() for more information.
149  cudaDeviceProp properties;
150  cudaCheck(cudaGetDeviceProperties(&properties, i));
151  log << "CUDA device " << i << ": " << properties.name << '\n';
152 
153  // compute capabilities
154  log << " compute capability: " << properties.major << "." << properties.minor << " (sm_"
155  << properties.major << properties.minor << ")\n";
156  computeCapabilities_.emplace_back(properties.major, properties.minor);
157  log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
158  log << " CUDA cores: " << std::setw(28)
159  << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
160  log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n";
161 
162  // compute mode
163  static constexpr const char* computeModeDescription[] = {
164  "default (shared)", // cudaComputeModeDefault
165  "exclusive (single thread)", // cudaComputeModeExclusive
166  "prohibited", // cudaComputeModeProhibited
167  "exclusive (single process)", // cudaComputeModeExclusiveProcess
168  "unknown"};
169  log << " compute mode:" << std::right << std::setw(27)
170  << computeModeDescription[std::min(properties.computeMode,
171  static_cast<int>(std::size(computeModeDescription)) - 1)]
172  << '\n';
173 
174  // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with abort()
175  cudaCheck(cudaSetDevice(i));
176  cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));
177 
178  // read the free and total amount of memory available for allocation by the device, in bytes.
179  // see the documentation of cudaMemGetInfo() for more information.
180  size_t freeMemory, totalMemory;
181  cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
182  log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
183  << totalMemory / (1 << 20) << " MB total\n";
184  log << " constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
185  log << " L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";
186 
187  // L1 cache behaviour
188  static constexpr const char* l1CacheModeDescription[] = {
189  "unknown", "local memory", "global memory", "local and global memory"};
190  int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
191  log << " L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
192  log << '\n';
193 
194  log << "Other capabilities\n";
195  log << " " << (properties.canMapHostMemory ? "can" : "cannot")
196  << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
197  log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support")
198  << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
199  log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
200  << " access pageable memory via the host's page tables\n";
201  log << " " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
202  << " access host registered memory at the same virtual address as the host\n";
203  log << " " << (properties.unifiedAddressing ? "shares" : "does not share")
204  << " a unified address space with the host\n";
205  log << " " << (properties.managedMemory ? "supports" : "does not support")
206  << " allocating managed memory on this system\n";
207  log << " " << (properties.concurrentManagedAccess ? "can" : "cannot")
208  << " coherently access managed memory concurrently with the host\n";
209  log << " "
210  << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
211  << " directly access managed memory on the device without migration\n";
212  log << " " << (properties.cooperativeLaunch ? "supports" : "does not support")
213  << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
214  log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
215  << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
216  log << '\n';
217 
218  // set and read the CUDA device flags.
219  // see the documentation of cudaSetDeviceFlags and cudaGetDeviceFlags for more information.
220  log << "CUDA flags\n";
221  unsigned int flags;
222  cudaCheck(cudaGetDeviceFlags(&flags));
223  switch (flags & cudaDeviceScheduleMask) {
224  case cudaDeviceScheduleAuto:
225  log << " thread policy: default\n";
226  break;
227  case cudaDeviceScheduleSpin:
228  log << " thread policy: spin\n";
229  break;
230  case cudaDeviceScheduleYield:
231  log << " thread policy: yield\n";
232  break;
233  case cudaDeviceScheduleBlockingSync:
234  log << " thread policy: blocking sync\n";
235  break;
236  default:
237  log << " thread policy: undefined\n";
238  }
239  if (flags & cudaDeviceMapHost) {
240  log << " pinned host memory allocations: enabled\n";
241  } else {
242  log << " pinned host memory allocations: disabled\n";
243  }
244  if (flags & cudaDeviceLmemResizeToMax) {
245  log << " kernel host memory reuse: enabled\n";
246  } else {
247  log << " kernel host memory reuse: disabled\n";
248  }
249  log << '\n';
250 
251  // set and read the CUDA resource limits.
252  // see the documentation of cudaDeviceSetLimit() for more information.
253 
254  // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
255  // printf() device system call.
256  if (printfFifoSize >= 0) {
257  setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
258  }
259  // cudaLimitStackSize controls the stack size in bytes of each GPU thread.
260  if (stackSize >= 0) {
261  setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
262  }
263  // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
264  // and free() device system calls.
265  if (mallocHeapSize >= 0) {
266  setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
267  }
268  if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
269  // cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at which
270  // a thread can safely call cudaDeviceSynchronize().
271  if (devRuntimeSyncDepth >= 0) {
272  setCudaLimit(cudaLimitDevRuntimeSyncDepth, "cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
273  }
274  // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
275  // device runtime launches that can be made from the current device.
276  if (devRuntimePendingLaunchCount >= 0) {
277  setCudaLimit(cudaLimitDevRuntimePendingLaunchCount,
278  "cudaLimitDevRuntimePendingLaunchCount",
279  devRuntimePendingLaunchCount);
280  }
281  }
282 
283  size_t value;
284  log << "CUDA limits\n";
285  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
286  log << " printf buffer size: " << std::setw(10) << value / (1 << 20) << " MB\n";
287  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
288  log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n";
289  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
290  log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n";
291  if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
292  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth));
293  log << " runtime sync depth: " << std::setw(10) << value << '\n';
294  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimePendingLaunchCount));
295  log << " runtime pending launch count: " << std::setw(10) << value << '\n';
296  }
297  log << '\n';
298  }
299  log << "\n";
300 
301  // Make sure the caching allocators and stream/event caches are constructed before declaring successful construction
302  if constexpr (cms::cuda::allocator::useCaching) {
303  cms::cuda::allocator::getCachingDeviceAllocator();
304  cms::cuda::allocator::getCachingHostAllocator();
305  }
306  cms::cuda::getEventCache().clear();
307  cms::cuda::getStreamCache().clear();
308 
309  log << "CUDAService fully initialized";
310  enabled_ = true;
311 
312  // Preallocate buffers if asked to
313  auto const& allocator = config.getUntrackedParameter<edm::ParameterSet>("allocator");
314  devicePreallocate(numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
315  hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
316 }

References cms::cuda::StreamCache::clear(), cms::cuda::EventCache::clear(), computeCapabilities_, cudaCheck, enabled_, cms::cuda::allocator::getCachingDeviceAllocator(), cms::cuda::allocator::getCachingHostAllocator(), getCudaCoresPerSM(), cms::cuda::getEventCache(), cms::cuda::getStreamCache(), numberOfDevices_, setCudaLimit(), and cms::cuda::allocator::useCaching.
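
The helpers used throughout the constructor are defined elsewhere: cudaCheck (cudaCheck.h) wraps a CUDA runtime call and aborts with a diagnostic on failure, and setCudaLimit (CUDAService.cc:22) applies a cudaDeviceSetLimit() request. As a rough illustration only, a minimal stand-alone sketch of what such wrappers do (checkCuda and setLimit are hypothetical names, not the actual CMSSW implementations):

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    // Stand-in for cudaCheck: abort with a readable message on any CUDA
    // runtime error. The real macro also records the file, line, and the
    // stringified expression.
    inline void checkCuda(cudaError_t result, const char* what) {
      if (result != cudaSuccess) {
        std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(result));
        std::abort();
      }
    }

    // Stand-in for setCudaLimit: request a device limit, then read it back
    // and warn if the runtime rejected or clamped the requested value.
    inline void setLimit(cudaLimit limit, const char* name, size_t request) {
      if (cudaDeviceSetLimit(limit, request) != cudaSuccess) {
        std::fprintf(stderr, "CUDA device does not accept %s = %zu\n", name, request);
        cudaGetLastError();  // clear the sticky error state
        return;
      }
      size_t value;
      checkCuda(cudaDeviceGetLimit(&value, limit), "cudaDeviceGetLimit");
      if (value != request)
        std::fprintf(stderr, "%s clamped to %zu (requested %zu)\n", name, value, request);
    }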

◆ ~CUDAService()

CUDAService::~CUDAService ( )

Definition at line 318 of file CUDAService.cc.

318  {
319  if (enabled_) {
320  // Explicitly destruct the allocator before the device resets below
321  if constexpr (cms::cuda::allocator::useCaching) {
322  cms::cuda::allocator::getCachingDeviceAllocator().FreeAllCached();
323  cms::cuda::allocator::getCachingHostAllocator().FreeAllCached();
324  }
325  cms::cuda::getEventCache().clear();
326  cms::cuda::getStreamCache().clear();
327 
328  for (int i = 0; i < numberOfDevices_; ++i) {
329  cudaCheck(cudaSetDevice(i));
330  cudaCheck(cudaDeviceSynchronize());
331  // Explicitly destroys and cleans up all resources associated with the current device in the
332  // current process. Any subsequent API call to this device will reinitialize the device.
333  // Useful to check for memory leaks with `cuda-memcheck --tool memcheck --leak-check full`.
334  cudaDeviceReset();
335  }
336  }
337 }

References cms::cuda::StreamCache::clear(), cms::cuda::EventCache::clear(), cudaCheck, enabled_, notcub::CachingDeviceAllocator::FreeAllCached(), notcub::CachingHostAllocator::FreeAllCached(), cms::cuda::allocator::getCachingDeviceAllocator(), cms::cuda::allocator::getCachingHostAllocator(), cms::cuda::getEventCache(), cms::cuda::getStreamCache(), numberOfDevices_, and cms::cuda::allocator::useCaching.
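
The explicit cudaDeviceReset() is what makes the leak check meaningful: cuda-memcheck reports as leaks only those device allocations still live when the context is destroyed. A minimal stand-alone illustration (hypothetical example, not CMSSW code):

    #include <cuda_runtime.h>

    int main() {
      void* buf = nullptr;
      cudaMalloc(&buf, 1 << 20);  // 1 MB device allocation
      cudaFree(buf);              // remove this line and the leak is reported below
      cudaDeviceSynchronize();    // let all outstanding work finish
      cudaDeviceReset();          // destroys the context, enabling the leak report
      return 0;
    }

    // run as: cuda-memcheck --tool memcheck --leak-check full ./a.out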

Member Function Documentation

◆ computeCapability()

std::pair<int, int> CUDAService::computeCapability ( int  device) const
inline

Definition at line 27 of file CUDAService.h.

27 { return computeCapabilities_.at(device); }

References computeCapabilities_.
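
A typical caller obtains the service through the ServiceRegistry and guards on enabled() before querying it. A brief sketch (the include paths are assumptions based on the usual CMSSW layout):

    #include "FWCore/ServiceRegistry/interface/Service.h"
    #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"

    void inspectDevices() {
      edm::Service<CUDAService> cs;
      if (not cs->enabled())
        return;
      for (int d = 0; d < cs->numberOfDevices(); ++d) {
        auto cc = cs->computeCapability(d);  // {major, minor}
        if (cc.first < 6) {
          // e.g. skip devices older than sm_60
        }
      }
    }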

◆ deviceWithMostFreeMemory()

int CUDAService::deviceWithMostFreeMemory ( ) const

Definition at line 368 of file CUDAService.cc.

368  {
369  // save the current device
370  int currentDevice;
371  cudaCheck(cudaGetDevice(&currentDevice));
372 
373  size_t maxFreeMemory = 0;
374  int device = -1;
375  for (int i = 0; i < numberOfDevices_; ++i) {
376  size_t freeMemory, totalMemory;
377  cudaSetDevice(i);
378  cudaMemGetInfo(&freeMemory, &totalMemory);
379  edm::LogPrint("CUDAService") << "CUDA device " << i << ": " << freeMemory / (1 << 20) << " MB free / "
380  << totalMemory / (1 << 20) << " MB total memory";
381  if (freeMemory > maxFreeMemory) {
382  maxFreeMemory = freeMemory;
383  device = i;
384  }
385  }
386  // restore the current device
387  cudaCheck(cudaSetDevice(currentDevice));
388  return device;
389 }

References cudaCheck and numberOfDevices_.
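
The returned index can be passed directly to cudaSetDevice(); -1 means no device reported any free memory. A short sketch of a hypothetical caller (same includes as the sketch above):

    edm::Service<CUDAService> cs;
    if (int device = cs->deviceWithMostFreeMemory(); device >= 0) {
      cudaCheck(cudaSetDevice(device));
      // ... place large allocations on the least-loaded device ...
    }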

◆ enabled()

bool CUDAService::enabled ( ) const
inline

Definition at line 22 of file CUDAService.h.

22 { return enabled_; }

References enabled_.

Referenced by cms::cuda::chooseDevice(), and CUDAMonitoringService::CUDAMonitoringService().

◆ fillDescriptions()

void CUDAService::fillDescriptions ( edm::ConfigurationDescriptions &  descriptions)
static

Definition at line 339 of file CUDAService.cc.

339  {
340  edm::ParameterSetDescription desc;
341  desc.addUntracked<bool>("enabled", true);
342 
343  edm::ParameterSetDescription limits;
344  limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)
345  ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
346  limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
347  limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)
348  ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
349  limits.addUntracked<int>("cudaLimitDevRuntimeSyncDepth", -1)
350  ->setComment("Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
351  limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)
352  ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
353  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
354  ->setComment(
355  "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
356  "the default value.");
357 
358  edm::ParameterSetDescription allocator;
359  allocator.addUntracked<std::vector<unsigned int> >("devicePreallocate", std::vector<unsigned int>{})
360  ->setComment("Preallocates buffers of given bytes on all devices");
361  allocator.addUntracked<std::vector<unsigned int> >("hostPreallocate", std::vector<unsigned int>{})
362  ->setComment("Preallocates buffers of given bytes on the host");
363  desc.addUntracked<edm::ParameterSetDescription>("allocator", allocator);
364 
365  descriptions.add("CUDAService", desc);
366 }

References edm::ConfigurationDescriptions::add() and edm::ParameterSetDescription::addUntracked().
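
Each limits parameter maps onto one cudaDeviceSetLimit() enumerator, and -1 leaves the CUDA runtime default in place. Those defaults can be inspected with cudaDeviceGetLimit(); a small stand-alone sketch (hypothetical example):

    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
      // Print the built-in defaults that a configuration value of -1 preserves.
      const struct { cudaLimit limit; const char* name; } limits[] = {
          {cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize"},
          {cudaLimitStackSize, "cudaLimitStackSize"},
          {cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize"},
      };
      for (auto const& l : limits) {
        size_t value;
        if (cudaDeviceGetLimit(&value, l.limit) == cudaSuccess)
          std::printf("%s = %zu bytes\n", l.name, value);
      }
      return 0;
    }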

◆ numberOfDevices()

int CUDAService::numberOfDevices ( ) const
inline

Definition at line 24 of file CUDAService.h.

24 { return numberOfDevices_; }

References numberOfDevices_.

Referenced by cms::cuda::chooseDevice(), and CUDAMonitoringService::CUDAMonitoringService().

Member Data Documentation

◆ computeCapabilities_

std::vector<std::pair<int, int> > CUDAService::computeCapabilities_
private

Definition at line 34 of file CUDAService.h.

Referenced by computeCapability(), and CUDAService().

◆ enabled_

bool CUDAService::enabled_ = false
private

Definition at line 35 of file CUDAService.h.

Referenced by CUDAService(), enabled(), and ~CUDAService().

◆ numberOfDevices_

int CUDAService::numberOfDevices_ = 0
private

Definition at line 33 of file CUDAService.h.

Referenced by CUDAService(), deviceWithMostFreeMemory(), numberOfDevices(), and ~CUDAService().
