CMS 3D CMS Logo

CUDAService.cc
Go to the documentation of this file.
1 #include <iomanip>
2 #include <iostream>
3 #include <limits>
4 #include <set>
5 #include <stdexcept>
6 #include <string>
7 #include <utility>
8 #include <vector>
9 
10 #include <cuda.h>
11 #include <cuda_runtime.h>
12 #include <nvml.h>
13 
30 
31 class CUDAService : public CUDAInterface {
32 public:
34  ~CUDAService() override;
35 
36  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
37 
38  bool enabled() const final { return enabled_; }
39 
40  int numberOfDevices() const final { return numberOfDevices_; }
41 
42  // Return the (major, minor) CUDA compute capability of the given device.
43  std::pair<int, int> computeCapability(int device) const final {
44  int size = computeCapabilities_.size();
45  if (device < 0 or device >= size) {
46  throw std::out_of_range("Invalid device index" + std::to_string(device) + ": the valid range is from 0 to " +
47  std::to_string(size - 1));
48  }
49  return computeCapabilities_[device];
50  }
51 
52 private:
54  std::vector<std::pair<int, int>> computeCapabilities_;
55  bool enabled_ = false;
56  bool verbose_ = false;
57 };
58 
59 void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
60  // read the current device
61  int device;
62  cudaCheck(cudaGetDevice(&device));
63  // try to set the requested limit
64  auto result = cudaDeviceSetLimit(limit, request);
65  if (cudaErrorUnsupportedLimit == result) {
66  edm::LogWarning("CUDAService") << "CUDA device " << device << ": unsupported limit \"" << name << "\"";
67  return;
68  }
69  // read back the limit value
70  size_t value;
71  result = cudaDeviceGetLimit(&value, limit);
72  if (cudaSuccess != result) {
73  edm::LogWarning("CUDAService") << "CUDA device " << device << ": failed to set limit \"" << name << "\" to "
74  << request << ", current value is " << value;
75  } else if (value != request) {
76  edm::LogWarning("CUDAService") << "CUDA device " << device << ": limit \"" << name << "\" set to " << value
77  << " instead of requested " << request;
78  }
79 }
80 
// Map a CUDA compute capability (major, minor) to the number of single-precision
// CUDA cores per streaming multiprocessor, following the table used by the CUDA
// samples' helper_cuda.h (_ConvertSMVer2Cores).
// Unknown architectures fall back to a default of 64 cores per SM.
constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) {
  switch (major * 10 + minor) {
    // Fermi architecture
    case 20:  // SM 2.0: GF100 class
      return 32;
    case 21:  // SM 2.1: GF10x class
      return 48;

    // Kepler architecture
    case 30:  // SM 3.0: GK10x class
    case 32:  // SM 3.2: GK10x class
    case 35:  // SM 3.5: GK11x class
    case 37:  // SM 3.7: GK21x class
      return 192;

    // Maxwell architecture
    case 50:  // SM 5.0: GM10x class
    case 52:  // SM 5.2: GM20x class
    case 53:  // SM 5.3: GM20x class
      return 128;

    // Pascal architecture
    case 60:  // SM 6.0: GP100 class
      return 64;
    case 61:  // SM 6.1: GP10x class
    case 62:  // SM 6.2: GP10x class
      return 128;

    // Volta architecture
    case 70:  // SM 7.0: GV100 class
    case 72:  // SM 7.2: GV11b class
      return 64;

    // Turing architecture
    case 75:  // SM 7.5: TU10x class
      return 64;

    // Ampere architecture
    case 80:  // SM 8.0: GA100 class
      return 64;
    case 86:  // SM 8.6: GA10x class
    case 87:  // SM 8.7: GA10b class (Jetson Orin) - was missing, fell through to the 64 default
      return 128;

    // Ada Lovelace architectures
    case 89:  // SM 8.9: AD10x class
      return 128;

    // Hopper architecture
    case 90:  // SM 9.0: GH100 class
      return 128;

    // unknown architecture, return a default value
    default:
      return 64;
  }
}
137 
// Decode a CUDA driver/runtime version integer (encoded as 1000*major + 10*minor,
// e.g. 11040) into a human-readable "major.minor" string (e.g. "11.4").
// NOTE(review): the signature line was elided by the documentation extract and is
// restored here verbatim from the cross-reference index (CUDAService.cc:138).
std::string decodeVersion(int version) {
  return std::to_string(version / 1000) + '.' + std::to_string(version % 1000 / 10);
}
141 
142 namespace {
143  template <template <typename> typename UniquePtr, typename Allocate>
144  void preallocate(Allocate allocate, const std::vector<unsigned int>& bufferSizes) {
145  if (bufferSizes.empty())
146  return;
147 
148  auto streamPtr = cms::cuda::getStreamCache().get();
149 
150  std::vector<UniquePtr<char[]>> buffers;
151  buffers.reserve(bufferSizes.size());
152  for (auto size : bufferSizes) {
153  buffers.push_back(allocate(size, streamPtr.get()));
154  }
155  }
156 
157  void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
158  int device;
159  cudaCheck(cudaGetDevice(&device));
160  for (int i = 0; i < numberOfDevices; ++i) {
161  cudaCheck(cudaSetDevice(i));
162  preallocate<cms::cuda::device::unique_ptr>(
163  [&](size_t size, cudaStream_t stream) { return cms::cuda::make_device_unique<char[]>(size, stream); },
164  bufferSizes);
165  }
166  cudaCheck(cudaSetDevice(device));
167  }
168 
169  void hostPreallocate(const std::vector<unsigned int>& bufferSizes) {
170  preallocate<cms::cuda::host::unique_ptr>(
171  [&](size_t size, cudaStream_t stream) { return cms::cuda::make_host_unique<char[]>(size, stream); },
172  bufferSizes);
173  }
174 } // namespace
175 
// Constructor: initialise the CUDA runtime, log the properties of every device,
// apply the configured device limits, register GPU information with the
// ResourceInformation service, and optionally preallocate buffers.
177 CUDAService::CUDAService(edm::ParameterSet const& config) : verbose_(config.getUntrackedParameter<bool>("verbose")) {
178  if (not config.getUntrackedParameter<bool>("enabled")) {
179  edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
180  return;
181  }
182 
// If the runtime cannot be initialised the service stays disabled (enabled_
// keeps its default of false) rather than aborting the job.
183  auto status = cudaGetDeviceCount(&numberOfDevices_);
184  if (cudaSuccess != status) {
185  edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n"
186  << "Disabling the CUDAService.";
187  return;
188  }
// NOTE(review): original line 189 is elided in this extract.
190 
191  // NVIDIA system driver version, e.g. 470.57.02
192  char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
193  nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH));
194  nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion, sizeof(systemDriverVersion)));
195  nvmlCheck(nvmlShutdown());
196 
197  // CUDA driver version, e.g. 11.4
198  // the full version, like 11.4.1 or 11.4.100, is not reported
199  int driverVersion = 0;
200  cudaCheck(cudaDriverGetVersion(&driverVersion));
201 
202  // CUDA runtime version, e.g. 11.4
203  // the full version, like 11.4.1 or 11.4.108, is not reported
204  int runtimeVersion = 0;
205  cudaCheck(cudaRuntimeGetVersion(&runtimeVersion));
206 
// A single LogInfo object accumulates the whole report and emits it on destruction.
207  edm::LogInfo log("CUDAService");
208  if (verbose_) {
209  log << "NVIDIA driver: " << systemDriverVersion << '\n';
210  log << "CUDA driver API: " << decodeVersion(driverVersion) << " (compiled with " << decodeVersion(CUDA_VERSION)
211  << ")\n";
212  log << "CUDA runtime API: " << decodeVersion(runtimeVersion) << " (compiled with " << decodeVersion(CUDART_VERSION)
213  << ")\n";
214  log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n";
215  } else {
216  log << "CUDA runtime version " << decodeVersion(runtimeVersion) << ", driver version "
217  << decodeVersion(driverVersion) << ", NVIDIA driver version " << systemDriverVersion;
218  }
219 
// Configured device limits; a value of -1 means "keep the CUDA default" (see fillDescriptions).
220  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
221  auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
222  auto stackSize = limits.getUntrackedParameter<int>("cudaLimitStackSize");
223  auto mallocHeapSize = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
224  auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
225  auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");
226 
// Distinct GPU model names, reported to the ResourceInformation service below.
227  std::set<std::string> models;
228 
229  for (int i = 0; i < numberOfDevices_; ++i) {
230  // read information about the compute device.
231  // see the documentation of cudaGetDeviceProperties() for more information.
232  cudaDeviceProp properties;
233  cudaCheck(cudaGetDeviceProperties(&properties, i));
234  log << '\n' << "CUDA device " << i << ": " << properties.name;
235  if (verbose_) {
236  log << '\n';
237  }
238  models.insert(std::string(properties.name));
239 
240  // compute capabilities
241  computeCapabilities_.emplace_back(properties.major, properties.minor);
242  if (verbose_) {
243  log << " compute capability: " << properties.major << "." << properties.minor;
244  }
245  log << " (sm_" << properties.major << properties.minor << ")";
246  if (verbose_) {
247  log << '\n';
248  log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
249  log << " CUDA cores: " << std::setw(28)
250  << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
251  log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio
252  << ":1\n";
253  }
254 
255  // compute mode
256  static constexpr const char* computeModeDescription[] = {
257  "default (shared)", // cudaComputeModeDefault
258  "exclusive (single thread)", // cudaComputeModeExclusive
259  "prohibited", // cudaComputeModeProhibited
260  "exclusive (single process)", // cudaComputeModeExclusiveProcess
261  "unknown"};
// std::min clamps unexpected computeMode values onto the trailing "unknown" entry.
262  if (verbose_) {
263  log << " compute mode:" << std::right << std::setw(27)
264  << computeModeDescription[std::min(properties.computeMode,
265  static_cast<int>(std::size(computeModeDescription)) - 1)]
266  << '\n';
267  }
268 
269  // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with abort()
270  cudaCheck(cudaSetDevice(i));
271  cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));
272 
273  // read the free and total amount of memory available for allocation by the device, in bytes.
274  // see the documentation of cudaMemGetInfo() for more information.
275  if (verbose_) {
276  size_t freeMemory, totalMemory;
277  cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
278  log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
279  << totalMemory / (1 << 20) << " MB total\n";
280  log << " constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
281  log << " L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";
282  }
283 
284  // L1 cache behaviour
285  if (verbose_) {
286  static constexpr const char* l1CacheModeDescription[] = {
287  "unknown", "local memory", "global memory", "local and global memory"};
// Pack the two boolean capabilities into a 2-bit index into the table above.
288  int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
289  log << " L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
290  log << '\n';
291 
292  log << "Other capabilities\n";
293  log << " " << (properties.canMapHostMemory ? "can" : "cannot")
294  << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
295  log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support")
296  << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
297  log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
298  << " access pageable memory via the host's page tables\n";
299  log << " " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
300  << " access host registered memory at the same virtual address as the host\n";
301  log << " " << (properties.unifiedAddressing ? "shares" : "does not share")
302  << " a unified address space with the host\n";
303  log << " " << (properties.managedMemory ? "supports" : "does not support")
304  << " allocating managed memory on this system\n";
305  log << " " << (properties.concurrentManagedAccess ? "can" : "cannot")
306  << " coherently access managed memory concurrently with the host\n";
307  log << " "
308  << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
309  << " directly access managed memory on the device without migration\n";
310  log << " " << (properties.cooperativeLaunch ? "supports" : "does not support")
311  << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
312  log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
313  << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
314  log << '\n';
315  }
316 
317  // set and read the CUDA device flags.
318  // see the documentation of cudaSetDeviceFlags and cudaGetDeviceFlags for more information.
319  if (verbose_) {
320  log << "CUDA flags\n";
321  unsigned int flags;
322  cudaCheck(cudaGetDeviceFlags(&flags));
323  switch (flags & cudaDeviceScheduleMask) {
324  case cudaDeviceScheduleAuto:
325  log << " thread policy: default\n";
326  break;
327  case cudaDeviceScheduleSpin:
328  log << " thread policy: spin\n";
329  break;
330  case cudaDeviceScheduleYield:
331  log << " thread policy: yield\n";
332  break;
333  case cudaDeviceScheduleBlockingSync:
334  log << " thread policy: blocking sync\n";
335  break;
336  default:
337  log << " thread policy: undefined\n";
338  }
339  if (flags & cudaDeviceMapHost) {
340  log << " pinned host memory allocations: enabled\n";
341  } else {
342  log << " pinned host memory allocations: disabled\n";
343  }
344  if (flags & cudaDeviceLmemResizeToMax) {
345  log << " kernel host memory reuse: enabled\n";
346  } else {
347  log << " kernel host memory reuse: disabled\n";
348  }
349  log << '\n';
350  }
351 
352  // set and read the CUDA resource limits.
353  // see the documentation of cudaDeviceSetLimit() for more information.
354 
355  // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
356  // printf() device system call.
357  if (printfFifoSize >= 0) {
358  setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
359  }
360  // cudaLimitStackSize controls the stack size in bytes of each GPU thread.
361  if (stackSize >= 0) {
362  setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
363  }
364  // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
365  // and free() device system calls.
366  if (mallocHeapSize >= 0) {
367  setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
368  }
// The device-runtime limits only exist on devices with compute capability 3.5 or newer.
369  if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
370  // cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at which
371  // a thread can safely call cudaDeviceSynchronize().
372  if (devRuntimeSyncDepth >= 0) {
373  setCudaLimit(cudaLimitDevRuntimeSyncDepth, "cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
374  }
375  // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
376  // device runtime launches that can be made from the current device.
377  if (devRuntimePendingLaunchCount >= 0) {
378  setCudaLimit(cudaLimitDevRuntimePendingLaunchCount,
379  "cudaLimitDevRuntimePendingLaunchCount",
380  devRuntimePendingLaunchCount);
381  }
382  }
383 
// Report the effective limit values after the set attempts above.
384  if (verbose_) {
385  size_t value;
386  log << "CUDA limits\n";
387  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
388  log << " printf buffer size: " << std::setw(10) << value / (1 << 20) << " MB\n";
389  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
390  log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n";
391  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
392  log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n";
393  if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
394  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth));
395  log << " runtime sync depth: " << std::setw(10) << value << '\n';
396  cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimePendingLaunchCount));
397  log << " runtime pending launch count: " << std::setw(10) << value << '\n';
398  }
399  }
400  }
401 
// Publish GPU models and driver/runtime versions to the framework's resource accounting.
402  edm::Service<edm::ResourceInformation> resourceInformationService;
403  if (resourceInformationService.isAvailable()) {
404  std::vector<std::string> modelsV(models.begin(), models.end());
405  resourceInformationService->setGPUModels(modelsV);
406  std::string nvidiaDriverVersion{systemDriverVersion};
407  resourceInformationService->setNvidiaDriverVersion(nvidiaDriverVersion);
408  resourceInformationService->setCudaDriverVersion(driverVersion);
409  resourceInformationService->setCudaRuntimeVersion(runtimeVersion);
410  }
411 
412  // Make sure the caching allocators and stream/event caches are constructed before declaring successful construction
413  if constexpr (cms::cuda::allocator::useCaching) {
// NOTE(review): original lines 414 and 416-417 (the allocator/cache
// initialisation calls) are elided in this extract.
415  }
418 
419  if (verbose_) {
420  log << '\n' << "CUDAService fully initialized";
421  }
422  enabled_ = true;
423 
424  // Preallocate buffers if asked to
425  auto const& allocator = config.getUntrackedParameter<edm::ParameterSet>("allocator");
426  devicePreallocate(numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int>>("devicePreallocate"));
427  hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int>>("hostPreallocate"));
428 }
429 
// Destructor body: tear down per-device state only if construction succeeded.
// NOTE(review): the signature line (CUDAService::~CUDAService(), original
// line 430) is elided in this extract.
431  if (enabled_) {
432  // Explicitly destruct the allocator before the device resets below
433  if constexpr (cms::cuda::allocator::useCaching) {
// NOTE(review): original lines 434 and 436-437 (the allocator teardown
// calls) are elided in this extract.
435  }
438 
439  for (int i = 0; i < numberOfDevices_; ++i) {
440  cudaCheck(cudaSetDevice(i));
441  cudaCheck(cudaDeviceSynchronize());
442  // Explicitly destroys and cleans up all resources associated with the current device in the
443  // current process. Any subsequent API call to this device will reinitialize the device.
444  // Useful to check for memory leaks with `cuda-memcheck --tool memcheck --leak-check full`.
445  cudaDeviceReset();
446  }
447  }
448 }
449 
// fillDescriptions body: declare the service's configuration parameters
// (enabled/verbose flags, the "limits" PSet and the "allocator" PSet).
// NOTE(review): the signature line and the declarations of `desc`, `limits`
// and `allocator` (original lines 449-451, 455 and 470, presumably
// edm::ParameterSetDescription objects) are elided in this extract.
452  desc.addUntracked<bool>("enabled", true);
453  desc.addUntracked<bool>("verbose", false);
454 
// Per-device CUDA limits; -1 keeps the CUDA default (see the PSet comment below).
456  limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)
457  ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
458  limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
459  limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)
460  ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
461  limits.addUntracked<int>("cudaLimitDevRuntimeSyncDepth", -1)
462  ->setComment("Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
463  limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)
464  ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
465  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
466  ->setComment(
467  "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
468  "the default value.");
469 
// Buffer sizes used to warm up the device and pinned-host allocators at startup.
471  allocator.addUntracked<std::vector<unsigned int>>("devicePreallocate", std::vector<unsigned int>{})
472  ->setComment("Preallocates buffers of given bytes on all devices");
473  allocator.addUntracked<std::vector<unsigned int>>("hostPreallocate", std::vector<unsigned int>{})
474  ->setComment("Preallocates buffers of given bytes on the host");
475  desc.addUntracked<edm::ParameterSetDescription>("allocator", allocator);
476 
477  descriptions.add("CUDAService", desc);
478 }
479 
480 namespace edm {
481  namespace service {
482  inline bool isProcessWideService(CUDAService const*) { return true; }
483  } // namespace service
484 } // namespace edm
485 
size
Write out results.
#define DEFINE_FWK_SERVICE_MAKER(concrete, maker)
Definition: ServiceMaker.h:102
ParameterDescriptionBase * addUntracked(U const &iLabel, T const &value)
std::pair< int, int > computeCapability(int device) const final
Definition: CUDAService.cc:43
bool isProcessWideService(TFileService const *)
Definition: TFileService.h:98
Definition: config.py:1
Definition: models.py:1
#define nvmlCheck(ARG,...)
Definition: nvmlCheck.h:52
uint32_t T const *__restrict__ uint32_t const *__restrict__ int32_t int Histo::index_type cudaStream_t stream
void setCudaLimit(cudaLimit limit, const char *name, size_t request)
Definition: CUDAService.cc:59
static std::string to_string(const XMLCh *ch)
virtual void setCudaRuntimeVersion(int)=0
static void fillDescriptions(edm::ConfigurationDescriptions &descriptions)
Definition: CUDAService.cc:450
int numberOfDevices_
Definition: CUDAService.cc:53
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventID const &, edm::Timestamp const & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
Definition: Activities.doc:12
CUDAService(edm::ParameterSet const &config)
Constructor.
Definition: CUDAService.cc:177
SharedStreamPtr get()
Definition: StreamCache.cc:20
Definition: value.py:1
Log< level::Info, false > LogInfo
std::vector< std::pair< int, int > > computeCapabilities_
Definition: CUDAService.cc:54
void add(std::string const &label, ParameterSetDescription const &psetDescription)
virtual void setNvidiaDriverVersion(std::string const &)=0
constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor)
Definition: CUDAService.cc:81
~CUDAService() override
Definition: CUDAService.cc:430
HLT enums.
StreamCache & getStreamCache()
Definition: StreamCache.cc:39
constexpr bool useCaching
int numberOfDevices()
bool isAvailable() const
Definition: Service.h:40
EventCache & getEventCache()
Definition: EventCache.cc:64
std::string decodeVersion(int version)
Definition: CUDAService.cc:138
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:69
int numberOfDevices() const final
Definition: CUDAService.cc:40
Log< level::Warning, false > LogWarning
virtual void setGPUModels(std::vector< std::string > const &)=0
bool enabled() const final
Definition: CUDAService.cc:38
virtual void setCudaDriverVersion(int)=0