CMS 3D CMS Logo

CUDAService.cc
Go to the documentation of this file.
1 #include <iomanip>
2 #include <iostream>
3 #include <limits>
4 
5 #include <cuda.h>
6 #include <cuda_runtime.h>
7 #include <nvml.h>
8 
23 
24 void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
25  // read the current device
26  int device;
27  cudaCheck(cudaGetDevice(&device));
28  // try to set the requested limit
29  auto result = cudaDeviceSetLimit(limit, request);
30  if (cudaErrorUnsupportedLimit == result) {
31  edm::LogWarning("CUDAService") << "CUDA device " << device << ": unsupported limit \"" << name << "\"";
32  return;
33  }
34  // read back the limit value
35  size_t value;
36  result = cudaDeviceGetLimit(&value, limit);
37  if (cudaSuccess != result) {
38  edm::LogWarning("CUDAService") << "CUDA device " << device << ": failed to set limit \"" << name << "\" to "
39  << request << ", current value is " << value;
40  } else if (value != request) {
41  edm::LogWarning("CUDAService") << "CUDA device " << device << ": limit \"" << name << "\" set to " << value
42  << " instead of requested " << request;
43  }
44 }
45 
// Number of FP32 CUDA cores per streaming multiprocessor for a compute
// capability given as (major, minor).  The table follows NVIDIA's
// _ConvertSMVer2Cores() helper from the CUDA samples (helper_cuda.h).
// Unknown architectures fall back to a default of 64.
constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) {
  switch (major * 10 + minor) {
    // Fermi architecture
    case 20:  // SM 2.0: GF100 class
      return 32;
    case 21:  // SM 2.1: GF10x class
      return 48;

    // Kepler architecture
    case 30:  // SM 3.0: GK10x class
    case 32:  // SM 3.2: GK10x class
    case 35:  // SM 3.5: GK11x class
    case 37:  // SM 3.7: GK21x class
      return 192;

    // Maxwell architecture
    case 50:  // SM 5.0: GM10x class
    case 52:  // SM 5.2: GM20x class
    case 53:  // SM 5.3: GM20x class
      return 128;

    // Pascal architecture
    case 60:  // SM 6.0: GP100 class
      return 64;
    case 61:  // SM 6.1: GP10x class
    case 62:  // SM 6.2: GP10x class
      return 128;

    // Volta architecture
    case 70:  // SM 7.0: GV100 class
    case 72:  // SM 7.2: GV11b class
      return 64;

    // Turing architecture
    case 75:  // SM 7.5: TU10x class
      return 64;

    // Ampere architecture
    case 80:  // SM 8.0: GA100 class
      return 64;
    case 86:  // SM 8.6: GA10x class
    case 87:  // SM 8.7: GA10b class (Jetson Orin)
      return 128;

    // Ada Lovelace architecture
    case 89:  // SM 8.9: AD10x class
      return 128;

    // Hopper architecture
    case 90:  // SM 9.0: GH100 class
      return 128;

    // unknown architecture, return a default value
    default:
      return 64;
  }
}
94 
96  return std::to_string(version / 1000) + '.' + std::to_string(version % 1000 / 10);
97 }
98 
99 namespace {
100  template <template <typename> typename UniquePtr, typename Allocate>
101  void preallocate(Allocate allocate, const std::vector<unsigned int>& bufferSizes) {
102  if (bufferSizes.empty())
103  return;
104 
105  auto streamPtr = cms::cuda::getStreamCache().get();
106 
107  std::vector<UniquePtr<char[]> > buffers;
108  buffers.reserve(bufferSizes.size());
109  for (auto size : bufferSizes) {
110  buffers.push_back(allocate(size, streamPtr.get()));
111  }
112  }
113 
114  void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
115  int device;
116  cudaCheck(cudaGetDevice(&device));
117  for (int i = 0; i < numberOfDevices; ++i) {
118  cudaCheck(cudaSetDevice(i));
119  preallocate<cms::cuda::device::unique_ptr>(
120  [&](size_t size, cudaStream_t stream) { return cms::cuda::make_device_unique<char[]>(size, stream); },
121  bufferSizes);
122  }
123  cudaCheck(cudaSetDevice(device));
124  }
125 
126  void hostPreallocate(const std::vector<unsigned int>& bufferSizes) {
127  preallocate<cms::cuda::host::unique_ptr>(
128  [&](size_t size, cudaStream_t stream) { return cms::cuda::make_host_unique<char[]>(size, stream); },
129  bufferSizes);
130  }
131 } // namespace
132 
134 CUDAService::CUDAService(edm::ParameterSet const& config) : verbose_(config.getUntrackedParameter<bool>("verbose")) {
135  bool configEnabled = config.getUntrackedParameter<bool>("enabled");
136  if (not configEnabled) {
137  edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
138  return;
139  }
140 
141  auto status = cudaGetDeviceCount(&numberOfDevices_);
142  if (cudaSuccess != status) {
143  edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n"
144  << "Disabling the CUDAService.";
145  return;
146  }
148 
149  // NVIDIA system driver version, e.g. 470.57.02
150  char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
151  nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH));
152  nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion, sizeof(systemDriverVersion)));
153  nvmlCheck(nvmlShutdown());
154 
155  // CUDA driver version, e.g. 11.4
156  // the full version, like 11.4.1 or 11.4.100, is not reported
157  int driverVersion = 0;
158  cudaCheck(cudaDriverGetVersion(&driverVersion));
159 
160  // CUDA runtime version, e.g. 11.4
161  // the full version, like 11.4.1 or 11.4.108, is not reported
162  int runtimeVersion = 0;
163  cudaCheck(cudaRuntimeGetVersion(&runtimeVersion));
164 
165  edm::LogInfo log("CUDAService");
166  if (verbose_) {
167  log << "NVIDIA driver: " << systemDriverVersion << '\n';
168  log << "CUDA driver API: " << decodeVersion(driverVersion) << " (compiled with " << decodeVersion(CUDA_VERSION)
169  << ")\n";
170  log << "CUDA runtime API: " << decodeVersion(runtimeVersion) << " (compiled with " << decodeVersion(CUDART_VERSION)
171  << ")\n";
172  log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n";
173  } else {
174  log << "CUDA runtime version " << decodeVersion(runtimeVersion) << ", driver version "
175  << decodeVersion(driverVersion) << ", NVIDIA driver version " << systemDriverVersion;
176  }
177 
178  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
179  auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
180  auto stackSize = limits.getUntrackedParameter<int>("cudaLimitStackSize");
181  auto mallocHeapSize = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
182  auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
183  auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");
184 
185  for (int i = 0; i < numberOfDevices_; ++i) {
186  // read information about the compute device.
187  // see the documentation of cudaGetDeviceProperties() for more information.
188  cudaDeviceProp properties;
189  cudaCheck(cudaGetDeviceProperties(&properties, i));
190  log << '\n' << "CUDA device " << i << ": " << properties.name;
191  if (verbose_) {
192  log << '\n';
193  }
194 
195  // compute capabilities
196  computeCapabilities_.emplace_back(properties.major, properties.minor);
197  if (verbose_) {
198  log << " compute capability: " << properties.major << "." << properties.minor;
199  }
200  log << " (sm_" << properties.major << properties.minor << ")";
201  if (verbose_) {
202  log << '\n';
203  log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
204  log << " CUDA cores: " << std::setw(28)
205  << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
206  log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio
207  << ":1\n";
208  }
209 
210  // compute mode
211  static constexpr const char* computeModeDescription[] = {
212  "default (shared)", // cudaComputeModeDefault
213  "exclusive (single thread)", // cudaComputeModeExclusive
214  "prohibited", // cudaComputeModeProhibited
215  "exclusive (single process)", // cudaComputeModeExclusiveProcess
216  "unknown"};
217  if (verbose_) {
218  log << " compute mode:" << std::right << std::setw(27)
219  << computeModeDescription[std::min(properties.computeMode,
220  static_cast<int>(std::size(computeModeDescription)) - 1)]
221  << '\n';
222  }
223 
224  // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with abort()
225  cudaCheck(cudaSetDevice(i));
226  cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));
227 
228  // read the free and total amount of memory available for allocation by the device, in bytes.
229  // see the documentation of cudaMemGetInfo() for more information.
230  if (verbose_) {
231  size_t freeMemory, totalMemory;
232  cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
233  log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
234  << totalMemory / (1 << 20) << " MB total\n";
235  log << " constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
236  log << " L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";
237  }
238 
239  // L1 cache behaviour
240  if (verbose_) {
241  static constexpr const char* l1CacheModeDescription[] = {
242  "unknown", "local memory", "global memory", "local and global memory"};
243  int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
244  log << " L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
245  log << '\n';
246 
247  log << "Other capabilities\n";
248  log << " " << (properties.canMapHostMemory ? "can" : "cannot")
249  << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
250  log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support")
251  << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
252  log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
253  << " access pageable memory via the host's page tables\n";
254  log << " " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
255  << " access host registered memory at the same virtual address as the host\n";
256  log << " " << (properties.unifiedAddressing ? "shares" : "does not share")
257  << " a unified address space with the host\n";
258  log << " " << (properties.managedMemory ? "supports" : "does not support")
259  << " allocating managed memory on this system\n";
260  log << " " << (properties.concurrentManagedAccess ? "can" : "cannot")
261  << " coherently access managed memory concurrently with the host\n";
262  log << " "
263  << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
264  << " directly access managed memory on the device without migration\n";
265  log << " " << (properties.cooperativeLaunch ? "supports" : "does not support")
266  << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
267  log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
268  << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
269  log << '\n';
270  }
271 
272  // set and read the CUDA device flags.
273  // see the documentation of cudaSetDeviceFlags and cudaGetDeviceFlags for more information.
274  if (verbose_) {
275  log << "CUDA flags\n";
276  unsigned int flags;
277  cudaCheck(cudaGetDeviceFlags(&flags));
278  switch (flags & cudaDeviceScheduleMask) {
279  case cudaDeviceScheduleAuto:
280  log << " thread policy: default\n";
281  break;
282  case cudaDeviceScheduleSpin:
283  log << " thread policy: spin\n";
284  break;
285  case cudaDeviceScheduleYield:
286  log << " thread policy: yield\n";
287  break;
288  case cudaDeviceScheduleBlockingSync:
289  log << " thread policy: blocking sync\n";
290  break;
291  default:
292  log << " thread policy: undefined\n";
293  }
294  if (flags & cudaDeviceMapHost) {
295  log << " pinned host memory allocations: enabled\n";
296  } else {
297  log << " pinned host memory allocations: disabled\n";
298  }
299  if (flags & cudaDeviceLmemResizeToMax) {
300  log << " kernel host memory reuse: enabled\n";
301  } else {
302  log << " kernel host memory reuse: disabled\n";
303  }
304  log << '\n';
305  }
306 
307  // set and read the CUDA resource limits.
308  // see the documentation of cudaDeviceSetLimit() for more information.
309 
310  // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
311  // printf() device system call.
312  if (printfFifoSize >= 0) {
313  setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
314  }
315  // cudaLimitStackSize controls the stack size in bytes of each GPU thread.
316  if (stackSize >= 0) {
317  setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
318  }
319  // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
320  // and free() device system calls.
321  if (mallocHeapSize >= 0) {
322  setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
323  }
324  if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
325  // cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at which
326  // a thread can safely call cudaDeviceSynchronize().
327  if (devRuntimeSyncDepth >= 0) {
328  setCudaLimit(cudaLimitDevRuntimeSyncDepth, "cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
329  }
330  // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
331  // device runtime launches that can be made from the current device.
332  if (devRuntimePendingLaunchCount >= 0) {
334  "cudaLimitDevRuntimePendingLaunchCount",
335  devRuntimePendingLaunchCount);
336  }
337  }
338 
339  if (verbose_) {
340  size_t value;
341  log << "CUDA limits\n";
342  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
343  log << " printf buffer size: " << std::setw(10) << value / (1 << 20) << " MB\n";
344  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
345  log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n";
346  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
347  log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n";
348  if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
349  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth));
350  log << " runtime sync depth: " << std::setw(10) << value << '\n';
352  log << " runtime pending launch count: " << std::setw(10) << value << '\n';
353  }
354  }
355  }
356 
357  // Make sure the caching allocators and stream/event caches are constructed before declaring successful construction
358  if constexpr (cms::cuda::allocator::useCaching) {
360  }
363 
364  if (verbose_) {
365  log << '\n' << "CUDAService fully initialized";
366  }
367  enabled_ = true;
368 
369  // Preallocate buffers if asked to
370  auto const& allocator = config.getUntrackedParameter<edm::ParameterSet>("allocator");
371  devicePreallocate(numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
372  hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
373 }
374 
376  if (enabled_) {
377  // Explicitly destruct the allocator before the device resets below
378  if constexpr (cms::cuda::allocator::useCaching) {
380  }
383 
384  for (int i = 0; i < numberOfDevices_; ++i) {
385  cudaCheck(cudaSetDevice(i));
386  cudaCheck(cudaDeviceSynchronize());
387  // Explicitly destroys and cleans up all resources associated with the current device in the
388  // current process. Any subsequent API call to this device will reinitialize the device.
389  // Useful to check for memory leaks with `cuda-memcheck --tool memcheck --leak-check full`.
390  cudaDeviceReset();
391  }
392  }
393 }
394 
397  desc.addUntracked<bool>("enabled", true);
398  desc.addUntracked<bool>("verbose", false);
399 
401  limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)
402  ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
403  limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
404  limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)
405  ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
406  limits.addUntracked<int>("cudaLimitDevRuntimeSyncDepth", -1)
407  ->setComment("Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
408  limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)
409  ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
410  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
411  ->setComment(
412  "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
413  "the default value.");
414 
416  allocator.addUntracked<std::vector<unsigned int> >("devicePreallocate", std::vector<unsigned int>{})
417  ->setComment("Preallocates buffers of given bytes on all devices");
418  allocator.addUntracked<std::vector<unsigned int> >("hostPreallocate", std::vector<unsigned int>{})
419  ->setComment("Preallocates buffers of given bytes on the host");
420  desc.addUntracked<edm::ParameterSetDescription>("allocator", allocator);
421 
422  descriptions.add("CUDAService", desc);
423 }
424 
426  // save the current device
427  int currentDevice;
428  cudaCheck(cudaGetDevice(&currentDevice));
429 
430  size_t maxFreeMemory = 0;
431  int device = -1;
432  for (int i = 0; i < numberOfDevices_; ++i) {
433  size_t freeMemory, totalMemory;
434  cudaSetDevice(i);
435  cudaMemGetInfo(&freeMemory, &totalMemory);
436  edm::LogPrint("CUDAService") << "CUDA device " << i << ": " << freeMemory / (1 << 20) << " MB free / "
437  << totalMemory / (1 << 20) << " MB total memory";
438  if (freeMemory > maxFreeMemory) {
439  maxFreeMemory = freeMemory;
440  device = i;
441  }
442  }
443  // restore the current device
444  cudaCheck(cudaSetDevice(currentDevice));
445  return device;
446 }
size
Write out results.
bool verbose_
Definition: CUDAService.h:36
std::string to_string(const V &value)
Definition: OMSAccess.h:71
Definition: config.py:1
#define nvmlCheck(ARG,...)
Definition: nvmlCheck.h:52
uint32_t T const *__restrict__ uint32_t const *__restrict__ int32_t int Histo::index_type cudaStream_t stream
void setCudaLimit(cudaLimit limit, const char *name, size_t request)
Definition: CUDAService.cc:24
static void fillDescriptions(edm::ConfigurationDescriptions &descriptions)
Definition: CUDAService.cc:395
int numberOfDevices_
Definition: CUDAService.h:33
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventID const &, edm::Timestamp const & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
Definition: Activities.doc:12
SharedStreamPtr get()
Definition: StreamCache.cc:20
Definition: value.py:1
Log< level::Warning, true > LogPrint
bool enabled_
Definition: CUDAService.h:35
Log< level::Info, false > LogInfo
int deviceWithMostFreeMemory() const
Definition: CUDAService.cc:425
std::vector< std::pair< int, int > > computeCapabilities_
Definition: CUDAService.h:34
void add(std::string const &label, ParameterSetDescription const &psetDescription)
constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor)
Definition: CUDAService.cc:46
StreamCache & getStreamCache()
Definition: StreamCache.cc:39
constexpr bool useCaching
int numberOfDevices()
EventCache & getEventCache()
Definition: EventCache.cc:64
std::string decodeVersion(int version)
Definition: CUDAService.cc:95
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:69
Log< level::Warning, false > LogWarning
CUDAService(edm::ParameterSet const &iConfig)
Constructor.
Definition: CUDAService.cc:134
int currentDevice()
Definition: currentDevice.h:10