
CUDAService.cc
#include <iomanip>
#include <iostream>
#include <limits>

#include <cuda.h>

// CMSSW framework headers (MessageLogger, ParameterSet, ParameterSetDescription,
// ConfigurationDescriptions) and CUDA helper headers (CUDAService.h, cudaCheck.h,
// StreamCache.h, EventCache.h, currentDevice.h, the device/host unique_ptr makers and
// the caching allocators) are also included at this point.
void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
  // read the current device
  int device;
  cudaCheck(cudaGetDevice(&device));
  // try to set the requested limit
  auto result = cudaDeviceSetLimit(limit, request);
  if (cudaErrorUnsupportedLimit == result) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": unsupported limit \"" << name << "\"";
    return;
  }
  // read back the limit value
  size_t value;
  cudaCheck(cudaDeviceGetLimit(&value, limit));
  if (cudaSuccess != result) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": failed to set limit \"" << name << "\" to "
                                   << request << ", current value is " << value;
  } else if (value != request) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": limit \"" << name << "\" set to " << value
                                   << " instead of requested " << request;
  }
}

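// Number of CUDA cores per streaming multiprocessor for the given compute capability,
// used below to report the total number of CUDA cores of each device.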
constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) {
  switch (major * 10 + minor) {
    // Fermi architecture
    case 20:  // SM 2.0: GF100 class
      return 32;
    case 21:  // SM 2.1: GF10x class
      return 48;

    // Kepler architecture
    case 30:  // SM 3.0: GK10x class
    case 32:  // SM 3.2: GK10x class
    case 35:  // SM 3.5: GK11x class
    case 37:  // SM 3.7: GK21x class
      return 192;

    // Maxwell architecture
    case 50:  // SM 5.0: GM10x class
    case 52:  // SM 5.2: GM20x class
    case 53:  // SM 5.3: GM20x class
      return 128;

    // Pascal architecture
    case 60:  // SM 6.0: GP100 class
      return 64;
    case 61:  // SM 6.1: GP10x class
    case 62:  // SM 6.2: GP10x class
      return 128;

    // Volta architecture
    case 70:  // SM 7.0: GV100 class
    case 72:  // SM 7.2: GV11b class
      return 64;

    // Turing architecture
    case 75:  // SM 7.5: TU10x class
      return 64;

    // unknown architecture, return a default value
    default:
      return 64;
  }
}

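// Helpers used to warm up the memory allocators: allocate device or pinned host buffers
// of the configured sizes on a stream taken from the stream cache, then let them be freed.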
namespace {
  template <template <typename> typename UniquePtr, typename Allocate>
  void preallocate(Allocate allocate, const std::vector<unsigned int>& bufferSizes) {
    if (bufferSizes.empty())
      return;

    auto streamPtr = cms::cuda::getStreamCache().get();

    std::vector<UniquePtr<char[]> > buffers;
    buffers.reserve(bufferSizes.size());
    for (auto size : bufferSizes) {
      buffers.push_back(allocate(size, streamPtr.get()));
    }
  }

  void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
    int device;
    cudaCheck(cudaGetDevice(&device));
    for (int i = 0; i < numberOfDevices; ++i) {
      cudaCheck(cudaSetDevice(i));
      preallocate<cms::cuda::device::unique_ptr>(
          [&](size_t size, cudaStream_t stream) { return cms::cuda::make_device_unique<char[]>(size, stream); },
          bufferSizes);
    }
    cudaCheck(cudaSetDevice(device));
  }

  void hostPreallocate(const std::vector<unsigned int>& bufferSizes) {
    preallocate<cms::cuda::host::unique_ptr>(
        [&](size_t size, cudaStream_t stream) { return cms::cuda::make_host_unique<char[]>(size, stream); },
        bufferSizes);
  }
}  // namespace

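// Constructor: queries all visible CUDA devices, logs their properties, flags and resource
// limits, applies the limits requested in the configuration, makes sure the caching allocators
// and stream/event caches are constructed, and optionally preallocates buffers before
// declaring the service enabled.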
CUDAService::CUDAService(edm::ParameterSet const& config) {
  bool configEnabled = config.getUntrackedParameter<bool>("enabled");
  if (not configEnabled) {
    edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
    return;
  }

  auto status = cudaGetDeviceCount(&numberOfDevices_);
  if (cudaSuccess != status) {
    edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n"
                                   << "Disabling the CUDAService.";
    return;
  }
  edm::LogInfo log("CUDAService");
  log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n";

  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
  auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
  auto stackSize = limits.getUntrackedParameter<int>("cudaLimitStackSize");
  auto mallocHeapSize = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
  auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
  auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");

  for (int i = 0; i < numberOfDevices_; ++i) {
    // read information about the compute device.
    // see the documentation of cudaGetDeviceProperties() for more information.
    cudaDeviceProp properties;
    cudaCheck(cudaGetDeviceProperties(&properties, i));
    log << "CUDA device " << i << ": " << properties.name << '\n';

    // compute capabilities
    log << " compute capability: " << properties.major << "." << properties.minor << " (sm_"
        << properties.major << properties.minor << ")\n";
    computeCapabilities_.emplace_back(properties.major, properties.minor);
    log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
    log << " CUDA cores: " << std::setw(28)
        << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
    log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n";

    // compute mode
    static constexpr const char* computeModeDescription[] = {
        "default (shared)",            // cudaComputeModeDefault
        "exclusive (single thread)",   // cudaComputeModeExclusive
        "prohibited",                  // cudaComputeModeProhibited
        "exclusive (single process)",  // cudaComputeModeExclusiveProcess
        "unknown"};
    log << " compute mode:" << std::right << std::setw(27)
        << computeModeDescription[std::min(properties.computeMode,
                                           static_cast<int>(std::size(computeModeDescription)) - 1)]
        << '\n';

    // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with abort()
    cudaCheck(cudaSetDevice(i));
    cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));

    // read the free and total amount of memory available for allocation by the device, in bytes.
    // see the documentation of cudaMemGetInfo() for more information.
    size_t freeMemory, totalMemory;
    cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
    log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
        << totalMemory / (1 << 20) << " MB total\n";
    log << " constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
    log << " L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";

    // L1 cache behaviour
    static constexpr const char* l1CacheModeDescription[] = {
        "unknown", "local memory", "global memory", "local and global memory"};
    int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
    log << " L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
    log << '\n';

    log << "Other capabilities\n";
    log << " " << (properties.canMapHostMemory ? "can" : "cannot")
        << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
    log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support")
        << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
    log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
        << " access pageable memory via the host's page tables\n";
    log << " " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
        << " access host registered memory at the same virtual address as the host\n";
    log << " " << (properties.unifiedAddressing ? "shares" : "does not share")
        << " a unified address space with the host\n";
    log << " " << (properties.managedMemory ? "supports" : "does not support")
        << " allocating managed memory on this system\n";
    log << " " << (properties.concurrentManagedAccess ? "can" : "cannot")
        << " coherently access managed memory concurrently with the host\n";
    log << " "
        << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
        << " directly access managed memory on the device without migration\n";
    log << " " << (properties.cooperativeLaunch ? "supports" : "does not support")
        << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
    log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
        << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
    log << '\n';

    // set and read the CUDA device flags.
    // see the documentation of cudaSetDeviceFlags and cudaGetDeviceFlags for more information.
    log << "CUDA flags\n";
    unsigned int flags;
    cudaCheck(cudaGetDeviceFlags(&flags));
    switch (flags & cudaDeviceScheduleMask) {
      case cudaDeviceScheduleAuto:
        log << " thread policy: default\n";
        break;
      case cudaDeviceScheduleSpin:
        log << " thread policy: spin\n";
        break;
      case cudaDeviceScheduleYield:
        log << " thread policy: yield\n";
        break;
      case cudaDeviceScheduleBlockingSync:
        log << " thread policy: blocking sync\n";
        break;
      default:
        log << " thread policy: undefined\n";
    }
    if (flags & cudaDeviceMapHost) {
      log << " pinned host memory allocations: enabled\n";
    } else {
      log << " pinned host memory allocations: disabled\n";
    }
    if (flags & cudaDeviceLmemResizeToMax) {
      log << " kernel host memory reuse: enabled\n";
    } else {
      log << " kernel host memory reuse: disabled\n";
    }
    log << '\n';

    // set and read the CUDA resource limits.
    // see the documentation of cudaDeviceSetLimit() for more information.

    // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
    // printf() device system call.
    if (printfFifoSize >= 0) {
      setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
    }
    // cudaLimitStackSize controls the stack size in bytes of each GPU thread.
    if (stackSize >= 0) {
      setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
    }
    // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
    // and free() device system calls.
    if (mallocHeapSize >= 0) {
      setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
    }
    if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
      // cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at which
      // a thread can safely call cudaDeviceSynchronize().
      if (devRuntimeSyncDepth >= 0) {
        setCudaLimit(cudaLimitDevRuntimeSyncDepth, "cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
      }
      // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
      // device runtime launches that can be made from the current device.
      if (devRuntimePendingLaunchCount >= 0) {
        setCudaLimit(cudaLimitDevRuntimePendingLaunchCount,
                     "cudaLimitDevRuntimePendingLaunchCount",
                     devRuntimePendingLaunchCount);
      }
    }

    size_t value;
    log << "CUDA limits\n";
    cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
    log << " printf buffer size: " << std::setw(10) << value / (1 << 20) << " MB\n";
    cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
    log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n";
    cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
    log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n";
    if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth));
      log << " runtime sync depth: " << std::setw(10) << value << '\n';
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimePendingLaunchCount));
      log << " runtime pending launch count: " << std::setw(10) << value << '\n';
    }
    log << '\n';
  }
  log << "\n";

  // Make sure the caching allocators and stream/event caches are constructed before declaring successful construction
  if constexpr (cms::cuda::allocator::useCaching) {
    cms::cuda::allocator::getCachingDeviceAllocator();
    cms::cuda::allocator::getCachingHostAllocator();
  }
  cms::cuda::getEventCache().clear();
  cms::cuda::getStreamCache().clear();

  log << "CUDAService fully initialized";
  enabled_ = true;

  // Preallocate buffers if asked to
  auto const& allocator = config.getUntrackedParameter<edm::ParameterSet>("allocator");
  devicePreallocate(numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
  hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
}

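// Destructor: frees all cached allocations, clears the stream and event caches, and resets
// every device so that leak checkers see a clean shutdown.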
CUDAService::~CUDAService() {
  if (enabled_) {
    // Explicitly destruct the allocator before the device resets below
    if constexpr (cms::cuda::allocator::useCaching) {
      cms::cuda::allocator::getCachingDeviceAllocator().FreeAllCached();
      cms::cuda::allocator::getCachingHostAllocator().FreeAllCached();
    }
    cms::cuda::getEventCache().clear();
    cms::cuda::getStreamCache().clear();

    for (int i = 0; i < numberOfDevices_; ++i) {
      cudaCheck(cudaSetDevice(i));
      cudaCheck(cudaDeviceSynchronize());
      // Explicitly destroys and cleans up all resources associated with the current device in the
      // current process. Any subsequent API call to this device will reinitialize the device.
      // Useful to check for memory leaks with `cuda-memcheck --tool memcheck --leak-check full`.
      cudaDeviceReset();
    }
  }
}

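// Describe the configuration parameters accepted by the service, together with their defaults
// (-1 keeps the corresponding CUDA default) and inline documentation.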
void CUDAService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
  edm::ParameterSetDescription desc;
  desc.addUntracked<bool>("enabled", true);

  edm::ParameterSetDescription limits;
  limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)
      ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
  limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
  limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)
      ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
  limits.addUntracked<int>("cudaLimitDevRuntimeSyncDepth", -1)
      ->setComment("Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
  limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)
      ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
      ->setComment(
          "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
          "the default value.");

  edm::ParameterSetDescription allocator;
  allocator.addUntracked<std::vector<unsigned int> >("devicePreallocate", std::vector<unsigned int>{})
      ->setComment("Preallocates buffers of given bytes on all devices");
  allocator.addUntracked<std::vector<unsigned int> >("hostPreallocate", std::vector<unsigned int>{})
      ->setComment("Preallocates buffers of given bytes on the host");
  desc.addUntracked<edm::ParameterSetDescription>("allocator", allocator);

  descriptions.add("CUDAService", desc);
}

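// For illustration only, a configuration fragment exercising these parameters could look roughly
// like the following (a hypothetical sketch, not taken from this package's cfi files):
//
//   process.CUDAService = cms.Service("CUDAService",
//       enabled = cms.untracked.bool(True),
//       limits = cms.untracked.PSet(
//           cudaLimitStackSize = cms.untracked.int32(-1),
//           cudaLimitMallocHeapSize = cms.untracked.int32(-1),
//       ),
//       allocator = cms.untracked.PSet(
//           devicePreallocate = cms.untracked.vuint32(),
//           hostPreallocate = cms.untracked.vuint32(),
//       ),
//   )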
// Return the index of the device with the largest amount of free memory, or -1 if no device is
// available, logging the free and total memory of each device along the way.
int CUDAService::deviceWithMostFreeMemory() const {
  // save the current device
  int currentDevice;
  cudaCheck(cudaGetDevice(&currentDevice));

  size_t maxFreeMemory = 0;
  int device = -1;
  for (int i = 0; i < numberOfDevices_; ++i) {
    size_t freeMemory, totalMemory;
    cudaSetDevice(i);
    cudaMemGetInfo(&freeMemory, &totalMemory);
    edm::LogPrint("CUDAService") << "CUDA device " << i << ": " << freeMemory / (1 << 20) << " MB free / "
                                 << totalMemory / (1 << 20) << " MB total memory";
    if (freeMemory > maxFreeMemory) {
      maxFreeMemory = freeMemory;
      device = i;
    }
  }
  // restore the current device
  cudaCheck(cudaSetDevice(currentDevice));
  return device;
}