
CUDAService Class Reference

#include <CUDAService.h>

Public Member Functions

std::pair< int, int > computeCapability (int device) const
 
 CUDAService (edm::ParameterSet const &iConfig)
 Constructor.
 
int deviceWithMostFreeMemory () const
 
bool enabled () const
 
int numberOfDevices () const
 
 ~CUDAService ()
 

Static Public Member Functions

static void fillDescriptions (edm::ConfigurationDescriptions &descriptions)
 

Private Attributes

std::vector< std::pair< int, int > > computeCapabilities_
 
bool enabled_ = false
 
int numberOfDevices_ = 0
 

Detailed Description

Definition at line 15 of file CUDAService.h.

Constructor & Destructor Documentation

◆ CUDAService()

CUDAService::CUDAService ( edm::ParameterSet const &  iConfig)

Constructor.

Definition at line 122 of file CUDAService.cc.

122  {
123  bool configEnabled = config.getUntrackedParameter<bool>("enabled");
124  if (not configEnabled) {
125  edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
126  return;
127  }
128 
129  auto status = cudaGetDeviceCount(&numberOfDevices_);
130  if (cudaSuccess != status) {
131  edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n"
132  << "Disabling the CUDAService.";
133  return;
134  }
135  edm::LogInfo log("CUDAService");
136  computeCapabilities_.reserve(numberOfDevices_);
137  log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n";
138 
139  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
140  auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
141  auto stackSize = limits.getUntrackedParameter<int>("cudaLimitStackSize");
142  auto mallocHeapSize = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
143  auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
144  auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");
145 
146  for (int i = 0; i < numberOfDevices_; ++i) {
147  // read information about the compute device.
148  // see the documentation of cudaGetDeviceProperties() for more information.
149  cudaDeviceProp properties;
150  cudaCheck(cudaGetDeviceProperties(&properties, i));
151  log << "CUDA device " << i << ": " << properties.name << '\n';
152 
153  // compute capabilities
154  log << " compute capability: " << properties.major << "." << properties.minor << " (sm_"
155  << properties.major << properties.minor << ")\n";
156  computeCapabilities_.emplace_back(properties.major, properties.minor);
157  log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
158  log << " CUDA cores: " << std::setw(28)
159  << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
160  log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n";
161 
162  // compute mode
163  static constexpr const char* computeModeDescription[] = {
164  "default (shared)", // cudaComputeModeDefault
165  "exclusive (single thread)", // cudaComputeModeExclusive
166  "prohibited", // cudaComputeModeProhibited
167  "exclusive (single process)", // cudaComputeModeExclusiveProcess
168  "unknown"};
169  log << " compute mode:" << std::right << std::setw(27)
170  << computeModeDescription[std::min(properties.computeMode,
171  static_cast<int>(std::size(computeModeDescription)) - 1)]
172  << '\n';
173 
174  // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with abort()
175  cudaCheck(cudaSetDevice(i));
176  cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));
177 
178  // read the free and total amount of memory available for allocation by the device, in bytes.
179  // see the documentation of cudaMemGetInfo() for more information.
180  size_t freeMemory, totalMemory;
181  cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
182  log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
183  << totalMemory / (1 << 20) << " MB total\n";
184  log << " constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
185  log << " L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";
186 
187  // L1 cache behaviour
188  static constexpr const char* l1CacheModeDescription[] = {
189  "unknown", "local memory", "global memory", "local and global memory"};
190  int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
191  log << " L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
192  log << '\n';
193 
194  log << "Other capabilities\n";
195  log << " " << (properties.canMapHostMemory ? "can" : "cannot")
196  << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
197  log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support")
198  << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
199  log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
200  << " access pageable memory via the host's page tables\n";
201  log << " " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
202  << " access host registered memory at the same virtual address as the host\n";
203  log << " " << (properties.unifiedAddressing ? "shares" : "does not share")
204  << " a unified address space with the host\n";
205  log << " " << (properties.managedMemory ? "supports" : "does not support")
206  << " allocating managed memory on this system\n";
207  log << " " << (properties.concurrentManagedAccess ? "can" : "cannot")
208  << " coherently access managed memory concurrently with the host\n";
209  log << " "
210  << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
211  << " directly access managed memory on the device without migration\n";
212  log << " " << (properties.cooperativeLaunch ? "supports" : "does not support")
213  << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
214  log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
215  << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
216  log << '\n';
217 
218  // set and read the CUDA device flags.
219  // see the documentation of cudaSetDeviceFlags and cudaGetDeviceFlags for more information.
220  log << "CUDA flags\n";
221  unsigned int flags;
222  cudaCheck(cudaGetDeviceFlags(&flags));
223  switch (flags & cudaDeviceScheduleMask) {
224  case cudaDeviceScheduleAuto:
225  log << " thread policy: default\n";
226  break;
227  case cudaDeviceScheduleSpin:
228  log << " thread policy: spin\n";
229  break;
230  case cudaDeviceScheduleYield:
231  log << " thread policy: yield\n";
232  break;
233  case cudaDeviceScheduleBlockingSync:
234  log << " thread policy: blocking sync\n";
235  break;
236  default:
237  log << " thread policy: undefined\n";
238  }
239  if (flags & cudaDeviceMapHost) {
240  log << " pinned host memory allocations: enabled\n";
241  } else {
242  log << " pinned host memory allocations: disabled\n";
243  }
244  if (flags & cudaDeviceLmemResizeToMax) {
245  log << " kernel host memory reuse: enabled\n";
246  } else {
247  log << " kernel host memory reuse: disabled\n";
248  }
249  log << '\n';
250 
251  // set and read the CUDA resource limits.
252  // see the documentation of cudaDeviceSetLimit() for more information.
253 
254  // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
255  // printf() device system call.
256  if (printfFifoSize >= 0) {
257  setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
258  }
259  // cudaLimitStackSize controls the stack size in bytes of each GPU thread.
260  if (stackSize >= 0) {
261  setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
262  }
263  // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
264  // and free() device system calls.
265  if (mallocHeapSize >= 0) {
266  setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
267  }
268  if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
269  // cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at which
270  // a thread can safely call cudaDeviceSynchronize().
271  if (devRuntimeSyncDepth >= 0) {
272  setCudaLimit(cudaLimitDevRuntimeSyncDepth, "cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
273  }
274  // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
275  // device runtime launches that can be made from the current device.
276  if (devRuntimePendingLaunchCount >= 0) {
277  setCudaLimit(cudaLimitDevRuntimePendingLaunchCount,
278  "cudaLimitDevRuntimePendingLaunchCount",
279  devRuntimePendingLaunchCount);
280  }
281  }
282 
283  size_t value;
284  log << "CUDA limits\n";
285  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
286  log << " printf buffer size: " << std::setw(10) << value / (1 << 20) << " MB\n";
287  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
288  log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n";
289  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
290  log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n";
291  if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
292  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth));
293  log << " runtime sync depth: " << std::setw(10) << value << '\n';
294  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimePendingLaunchCount));
295  log << " runtime pending launch count: " << std::setw(10) << value << '\n';
296  }
297  log << '\n';
298  }
299  log << "\n";
300 
301  // Make sure the caching allocators and stream/event caches are constructed before declaring successful construction
302  if constexpr (cms::cuda::allocator::useCaching) {
303  cms::cuda::allocator::getCachingDeviceAllocator();
304  cms::cuda::allocator::getCachingHostAllocator();
305  }
306  cms::cuda::getEventCache().clear();
307  cms::cuda::getStreamCache().clear();
308 
309  log << "CUDAService fully initialized";
310  enabled_ = true;
311 
312  // Preallocate buffers if asked to
313  auto const& allocator = config.getUntrackedParameter<edm::ParameterSet>("allocator");
314  devicePreallocate(numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
315  hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
316 }

References cms::cuda::StreamCache::clear(), cms::cuda::EventCache::clear(), computeCapabilities_, cudaCheck, enabled_, cms::cuda::allocator::getCachingDeviceAllocator(), cms::cuda::allocator::getCachingHostAllocator(), getCudaCoresPerSM(), cms::cuda::getEventCache(), cms::cuda::getStreamCache(), numberOfDevices_, setCudaLimit(), and cms::cuda::allocator::useCaching.
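
The helpers used throughout the constructor are defined elsewhere: cudaCheck (cudaCheck.h) wraps a CUDA runtime call and aborts with a diagnostic on failure, and setCudaLimit (CUDAService.cc:22) applies a cudaDeviceSetLimit() request. As a rough illustration only, a minimal stand-alone sketch of what such wrappers do (checkCuda and setLimit are hypothetical names, not the actual CMSSW implementations):

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    // Stand-in for cudaCheck: abort with a readable message on any CUDA
    // runtime error. The real macro also records the file, line, and the
    // stringified expression.
    inline void checkCuda(cudaError_t result, const char* what) {
      if (result != cudaSuccess) {
        std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(result));
        std::abort();
      }
    }

    // Stand-in for setCudaLimit: request a device limit, then read it back
    // and warn if the runtime rejected or clamped the requested value.
    inline void setLimit(cudaLimit limit, const char* name, size_t request) {
      if (cudaDeviceSetLimit(limit, request) != cudaSuccess) {
        std::fprintf(stderr, "CUDA device does not accept %s = %zu\n", name, request);
        cudaGetLastError();  // clear the sticky error state
        return;
      }
      size_t value;
      checkCuda(cudaDeviceGetLimit(&value, limit), "cudaDeviceGetLimit");
      if (value != request)
        std::fprintf(stderr, "%s clamped to %zu (requested %zu)\n", name, value, request);
    }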

◆ ~CUDAService()

CUDAService::~CUDAService ( )

Definition at line 318 of file CUDAService.cc.

318  {
319  if (enabled_) {
320  // Explicitly destruct the allocator before the device resets below
321  if constexpr (cms::cuda::allocator::useCaching) {
322  cms::cuda::allocator::getCachingDeviceAllocator().FreeAllCached();
323  cms::cuda::allocator::getCachingHostAllocator().FreeAllCached();
324  }
325  cms::cuda::getEventCache().clear();
326  cms::cuda::getStreamCache().clear();
327 
328  for (int i = 0; i < numberOfDevices_; ++i) {
329  cudaCheck(cudaSetDevice(i));
330  cudaCheck(cudaDeviceSynchronize());
331  // Explicitly destroys and cleans up all resources associated with the current device in the
332  // current process. Any subsequent API call to this device will reinitialize the device.
333  // Useful to check for memory leaks with `cuda-memcheck --tool memcheck --leak-check full`.
334  cudaDeviceReset();
335  }
336  }
337 }

References cms::cuda::StreamCache::clear(), cms::cuda::EventCache::clear(), cudaCheck, enabled_, notcub::CachingDeviceAllocator::FreeAllCached(), notcub::CachingHostAllocator::FreeAllCached(), cms::cuda::allocator::getCachingDeviceAllocator(), cms::cuda::allocator::getCachingHostAllocator(), cms::cuda::getEventCache(), cms::cuda::getStreamCache(), numberOfDevices_, and cms::cuda::allocator::useCaching.
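
The explicit cudaDeviceReset() is what makes the leak check meaningful: cuda-memcheck reports as leaks only those device allocations still live when the context is destroyed. A minimal stand-alone illustration (hypothetical example, not CMSSW code):

    #include <cuda_runtime.h>

    int main() {
      void* buf = nullptr;
      cudaMalloc(&buf, 1 << 20);  // 1 MB device allocation
      cudaFree(buf);              // remove this line and the leak is reported below
      cudaDeviceSynchronize();    // let all outstanding work finish
      cudaDeviceReset();          // destroys the context, enabling the leak report
      return 0;
    }

    // run as: cuda-memcheck --tool memcheck --leak-check full ./a.out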

Member Function Documentation

◆ computeCapability()

std::pair<int, int> CUDAService::computeCapability ( int  device) const
inline

Definition at line 27 of file CUDAService.h.

27 { return computeCapabilities_.at(device); }

References computeCapabilities_.
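
A typical caller obtains the service through the ServiceRegistry and guards on enabled() before querying it. A brief sketch (the include paths are assumptions based on the usual CMSSW layout):

    #include "FWCore/ServiceRegistry/interface/Service.h"
    #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"

    void inspectDevices() {
      edm::Service<CUDAService> cs;
      if (not cs->enabled())
        return;
      for (int d = 0; d < cs->numberOfDevices(); ++d) {
        auto cc = cs->computeCapability(d);  // {major, minor}
        if (cc.first < 6) {
          // e.g. skip devices older than sm_60
        }
      }
    }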

◆ deviceWithMostFreeMemory()

int CUDAService::deviceWithMostFreeMemory ( ) const

Definition at line 368 of file CUDAService.cc.

368  {
369  // save the current device
370  int currentDevice;
371  cudaCheck(cudaGetDevice(&currentDevice));
372 
373  size_t maxFreeMemory = 0;
374  int device = -1;
375  for (int i = 0; i < numberOfDevices_; ++i) {
376  size_t freeMemory, totalMemory;
377  cudaSetDevice(i);
378  cudaMemGetInfo(&freeMemory, &totalMemory);
379  edm::LogPrint("CUDAService") << "CUDA device " << i << ": " << freeMemory / (1 << 20) << " MB free / "
380  << totalMemory / (1 << 20) << " MB total memory";
381  if (freeMemory > maxFreeMemory) {
382  maxFreeMemory = freeMemory;
383  device = i;
384  }
385  }
386  // restore the current device
387  cudaCheck(cudaSetDevice(currentDevice));
388  return device;
389 }

References cudaCheck and numberOfDevices_.
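
The returned index can be passed directly to cudaSetDevice(); -1 means no device reported any free memory. A short sketch of a hypothetical caller (same includes as the sketch above):

    edm::Service<CUDAService> cs;
    if (int device = cs->deviceWithMostFreeMemory(); device >= 0) {
      cudaCheck(cudaSetDevice(device));
      // ... place large allocations on the least-loaded device ...
    }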

◆ enabled()

bool CUDAService::enabled ( ) const
inline

Definition at line 22 of file CUDAService.h.

22 { return enabled_; }

References enabled_.

Referenced by cms::cuda::chooseDevice(), and CUDAMonitoringService::CUDAMonitoringService().

◆ fillDescriptions()

void CUDAService::fillDescriptions ( edm::ConfigurationDescriptions &  descriptions)
static

Definition at line 339 of file CUDAService.cc.

339  {
340  edm::ParameterSetDescription desc;
341  desc.addUntracked<bool>("enabled", true);
342 
343  edm::ParameterSetDescription limits;
344  limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)
345  ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
346  limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
347  limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)
348  ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
349  limits.addUntracked<int>("cudaLimitDevRuntimeSyncDepth", -1)
350  ->setComment("Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
351  limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)
352  ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
353  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
354  ->setComment(
355  "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
356  "the default value.");
357 
358  edm::ParameterSetDescription allocator;
359  allocator.addUntracked<std::vector<unsigned int> >("devicePreallocate", std::vector<unsigned int>{})
360  ->setComment("Preallocates buffers of given bytes on all devices");
361  allocator.addUntracked<std::vector<unsigned int> >("hostPreallocate", std::vector<unsigned int>{})
362  ->setComment("Preallocates buffers of given bytes on the host");
363  desc.addUntracked<edm::ParameterSetDescription>("allocator", allocator);
364 
365  descriptions.add("CUDAService", desc);
366 }

References edm::ConfigurationDescriptions::add() and edm::ParameterSetDescription::addUntracked().
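
Each limits parameter maps onto one cudaDeviceSetLimit() enumerator, and -1 leaves the CUDA runtime default in place. Those defaults can be inspected with cudaDeviceGetLimit(); a small stand-alone sketch (hypothetical example):

    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
      // Print the built-in defaults that a configuration value of -1 preserves.
      const struct { cudaLimit limit; const char* name; } limits[] = {
          {cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize"},
          {cudaLimitStackSize, "cudaLimitStackSize"},
          {cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize"},
      };
      for (auto const& l : limits) {
        size_t value;
        if (cudaDeviceGetLimit(&value, l.limit) == cudaSuccess)
          std::printf("%s = %zu bytes\n", l.name, value);
      }
      return 0;
    }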

◆ numberOfDevices()

int CUDAService::numberOfDevices ( ) const
inline

Definition at line 24 of file CUDAService.h.

24 { return numberOfDevices_; }

References numberOfDevices_.

Referenced by cms::cuda::chooseDevice(), and CUDAMonitoringService::CUDAMonitoringService().

Member Data Documentation

◆ computeCapabilities_

std::vector<std::pair<int, int> > CUDAService::computeCapabilities_
private

Definition at line 34 of file CUDAService.h.

Referenced by computeCapability(), and CUDAService().

◆ enabled_

bool CUDAService::enabled_ = false
private

Definition at line 35 of file CUDAService.h.

Referenced by CUDAService(), enabled(), and ~CUDAService().

◆ numberOfDevices_

int CUDAService::numberOfDevices_ = 0
private

Definition at line 33 of file CUDAService.h.

Referenced by CUDAService(), deviceWithMostFreeMemory(), numberOfDevices(), and ~CUDAService().
