// NOTE(review): interior fragment of a setCudaLimit-style helper (the signature
// is above this view; embedded numbers are original source line numbers and
// several original lines are elided). It appears to apply a cudaDeviceSetLimit
// request and report the outcome via the message logger — TODO confirm against
// the full source.
// Warn when the device does not support this cudaLimit at all.
28 if (cudaErrorUnsupportedLimit ==
result) {
29 edm::LogWarning(
"CUDAService") <<
"CUDA device " << device <<
": unsupported limit \"" <<
name <<
"\"";
// Warn when setting the limit failed for any other reason; `value` presumably
// holds the limit read back after the attempt — confirm in the elided lines.
35 if (cudaSuccess !=
result) {
36 edm::LogWarning(
"CUDAService") <<
"CUDA device " << device <<
": failed to set limit \"" <<
name <<
"\" to "
37 << request <<
", current value is " <<
value;
38 }
// The call "succeeded" but the driver clamped the value to something other
// than what was requested.
else if (
value != request) {
// NOTE(review): continuation of a log statement whose opening (original
// line 39) is missing from this extract.
40 <<
" instead of requested " << request;
45 switch (major * 10 + minor) {
// Generic preallocation helper: invokes `allocate(size, stream)` once per
// requested buffer size, collecting the resulting smart pointers so all
// allocations coexist before being released together (warming up a caching
// allocator, presumably — confirm against the full source).
// UniquePtr is a template-template parameter naming the smart-pointer family
// (device or host unique_ptr, per the two callers below).
88 template <
template <
typename>
typename UniquePtr,
typename Allocate>
89 void preallocate(Allocate allocate,
const std::vector<unsigned int>& bufferSizes) {
// Nothing to do when no sizes were configured. NOTE(review): the statement
// controlled by this `if`, and the creation of `streamPtr` (original lines
// 91-94), are elided from this extract.
90 if (bufferSizes.empty())
95 std::vector<UniquePtr<char[]> > buffers;
96 buffers.reserve(bufferSizes.size());
97 for (
auto size : bufferSizes) {
// `streamPtr` is declared in the elided lines above — presumably a
// shared_ptr-like stream wrapper; TODO confirm.
98 buffers.push_back(allocate(
size, streamPtr.get()));
// Preallocates device-memory buffers of the configured sizes. Original lines
// 103-106 (presumably a per-device loop with cudaSetDevice) and the tail of
// the call (lines 109-113) are elided from this extract — confirm against the
// full source.
102 void devicePreallocate(
int numberOfDevices,
const std::vector<unsigned int>& bufferSizes) {
// Delegate to the generic helper with a device-allocator lambda.
107 preallocate<cms::cuda::device::unique_ptr>(
108 [&](
size_t size, cudaStream_t
stream) {
return cms::cuda::make_device_unique<char[]>(
size,
stream); },
// Host-side counterpart of devicePreallocate: preallocates pinned host buffers
// of the configured sizes. The tail of this call (original lines 117+) is
// elided from this extract.
114 void hostPreallocate(
const std::vector<unsigned int>& bufferSizes) {
115 preallocate<cms::cuda::host::unique_ptr>(
116 [&](
size_t size, cudaStream_t
stream) {
return cms::cuda::make_host_unique<char[]>(
size,
stream); },
// Constructor fragment: honor the "enabled" configuration flag, then probe the
// CUDA runtime. Many original lines (126-129, 133-136, ...) are elided from
// this extract; `config`, `status`, `log` and `numberOfDevices_` are
// declared/assigned in lines outside this view — confirm against the full source.
123 bool configEnabled =
config.getUntrackedParameter<
bool>(
"enabled");
// Service explicitly disabled by configuration: log and (presumably) return early.
124 if (not configEnabled) {
125 edm::LogInfo(
"CUDAService") <<
"CUDAService disabled by configuration";
// A failed runtime initialization is not fatal: warn and disable the service.
130 if (cudaSuccess !=
status) {
131 edm::LogWarning(
"CUDAService") <<
"Failed to initialize the CUDA runtime.\n"
132 <<
"Disabling the CUDAService.";
137 log <<
"CUDA runtime successfully initialised, found " <<
numberOfDevices_ <<
" compute devices.\n\n";
// Read the configurable cudaDeviceSetLimit values from the "limits" PSet.
// By the later `>= 0` guards, a negative value (the -1 default declared in
// fillDescriptions) means "leave the CUDA default untouched".
140 auto printfFifoSize =
limits.getUntrackedParameter<
int>(
"cudaLimitPrintfFifoSize");
141 auto stackSize =
limits.getUntrackedParameter<
int>(
"cudaLimitStackSize");
142 auto mallocHeapSize =
limits.getUntrackedParameter<
int>(
"cudaLimitMallocHeapSize");
143 auto devRuntimeSyncDepth =
limits.getUntrackedParameter<
int>(
"cudaLimitDevRuntimeSyncDepth");
144 auto devRuntimePendingLaunchCount =
limits.getUntrackedParameter<
int>(
"cudaLimitDevRuntimePendingLaunchCount");
// Per-device section (inside a loop over device index `i` whose header is
// elided): query the device properties and log a human-readable summary.
149 cudaDeviceProp properties;
150 cudaCheck(cudaGetDeviceProperties(&properties,
i));
151 log <<
"CUDA device " <<
i <<
": " << properties.name <<
'\n';
// Compute capability, e.g. "7.5 (sm_75)".
154 log <<
" compute capability: " << properties.major <<
"." << properties.minor <<
" (sm_"
155 << properties.major << properties.minor <<
")\n";
157 log <<
" streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount <<
'\n';
// Total CUDA cores = SMs times the per-SM core count for this architecture.
158 log <<
" CUDA cores: " << std::setw(28)
159 << properties.multiProcessorCount *
getCudaCoresPerSM(properties.major, properties.minor) <<
'\n';
160 log <<
" single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio <<
":1\n";
// Human-readable names for cudaComputeMode values; NOTE(review): entries for
// original lines 164/166/168 are elided from this extract.
163 static constexpr
const char* computeModeDescription[] = {
165 "exclusive (single thread)",
167 "exclusive (single process)",
// std::min clamps unknown/new compute modes to the last ("unknown"?) entry
// so the array access cannot go out of bounds.
169 log <<
" compute mode:" << std::right << std::setw(27)
170 << computeModeDescription[
std::min(properties.computeMode,
171 static_cast<int>(
std::size(computeModeDescription)) - 1)]
// Default scheduling, and map pinned host allocations into the device
// address space (needed for cudaHostAlloc/cudaHostGetDevicePointer use).
176 cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));
// Log the current free/total global memory, constant memory and L2 cache size.
180 size_t freeMemory, totalMemory;
181 cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
182 log <<
" memory: " << std::setw(6) << freeMemory / (1 << 20) <<
" MB free / " << std::setw(6)
183 << totalMemory / (1 << 20) <<
" MB total\n";
184 log <<
" constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) <<
" kB\n";
185 log <<
" L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) <<
" kB\n";
// Encode the two L1-cache capability flags as a 2-bit index into the
// description table: bit 0 = local, bit 1 = global.
188 static constexpr
const char* l1CacheModeDescription[] = {
189 "unknown",
"local memory",
"global memory",
"local and global memory"};
190 int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
191 log <<
" L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] <<
'\n';
// Log the boolean capability flags from cudaDeviceProp as readable sentences.
// Purely informational; no behavior depends on these here.
194 log <<
"Other capabilities\n";
195 log <<
" " << (properties.canMapHostMemory ?
"can" :
"cannot")
196 <<
" map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
197 log <<
" " << (properties.pageableMemoryAccess ?
"supports" :
"does not support")
198 <<
" coherently accessing pageable memory without calling cudaHostRegister() on it\n";
199 log <<
" " << (properties.pageableMemoryAccessUsesHostPageTables ?
"can" :
"cannot")
200 <<
" access pageable memory via the host's page tables\n";
201 log <<
" " << (properties.canUseHostPointerForRegisteredMem ?
"can" :
"cannot")
202 <<
" access host registered memory at the same virtual address as the host\n";
203 log <<
" " << (properties.unifiedAddressing ?
"shares" :
"does not share")
204 <<
" a unified address space with the host\n";
205 log <<
" " << (properties.managedMemory ?
"supports" :
"does not support")
206 <<
" allocating managed memory on this system\n";
207 log <<
" " << (properties.concurrentManagedAccess ?
"can" :
"cannot")
208 <<
" coherently access managed memory concurrently with the host\n";
// NOTE(review): the opening of this statement (original line 209) is elided
// from this extract.
210 <<
"the host " << (properties.directManagedMemAccessFromHost ?
"can" :
"cannot")
211 <<
" directly access managed memory on the device without migration\n";
212 log <<
" " << (properties.cooperativeLaunch ?
"supports" :
"does not support")
213 <<
" launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
214 log <<
" " << (properties.cooperativeMultiDeviceLaunch ?
"supports" :
"does not support")
215 <<
" launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
// Log the device flags (read into `flags` in elided lines), then apply the
// configured cudaDeviceSetLimit values. NOTE(review): `break`s and closing
// braces between the `case` labels are elided from this extract — presumably
// each case breaks; confirm against the full source.
220 log <<
"CUDA flags\n";
// Decode the scheduling policy bits of the device flags.
221 switch (
flags & cudaDeviceScheduleMask) {
224 case cudaDeviceScheduleAuto:
225 log <<
" thread policy: default\n";
227 case cudaDeviceScheduleSpin:
228 log <<
" thread policy: spin\n";
230 case cudaDeviceScheduleYield:
231 log <<
" thread policy: yield\n";
233 case cudaDeviceScheduleBlockingSync:
234 log <<
" thread policy: blocking sync\n";
237 log <<
" thread policy: undefined\n";
239 if (
flags & cudaDeviceMapHost) {
240 log <<
" pinned host memory allocations: enabled\n";
242 log <<
" pinned host memory allocations: disabled\n";
244 if (
flags & cudaDeviceLmemResizeToMax) {
245 log <<
" kernel host memory reuse: enabled\n";
247 log <<
" kernel host memory reuse: disabled\n";
// Apply each configured limit; negative values (the defaults) mean "keep the
// CUDA default" and are skipped.
256 if (printfFifoSize >= 0) {
257 setCudaLimit(cudaLimitPrintfFifoSize,
"cudaLimitPrintfFifoSize", printfFifoSize);
260 if (stackSize >= 0) {
261 setCudaLimit(cudaLimitStackSize,
"cudaLimitStackSize", stackSize);
265 if (mallocHeapSize >= 0) {
266 setCudaLimit(cudaLimitMallocHeapSize,
"cudaLimitMallocHeapSize", mallocHeapSize);
// The device-runtime limits only exist on compute capability 3.5 and above.
268 if ((properties.major > 3)
or (properties.major == 3 and properties.minor >= 5)) {
271 if (devRuntimeSyncDepth >= 0) {
272 setCudaLimit(cudaLimitDevRuntimeSyncDepth,
"cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
276 if (devRuntimePendingLaunchCount >= 0) {
// NOTE(review): the setCudaLimit(...) call opening (original line 277) is
// elided from this extract.
278 "cudaLimitDevRuntimePendingLaunchCount",
279 devRuntimePendingLaunchCount);
// Read back and log the effective limits (the cudaDeviceGetLimit calls for the
// first three, original lines 285/287/289, are elided; `value` is a size_t
// declared outside this view — confirm against the full source).
284 log <<
"CUDA limits\n";
286 log <<
" printf buffer size: " << std::setw(10) <<
value / (1 << 20) <<
" MB\n";
288 log <<
" stack size: " << std::setw(10) <<
value / (1 << 10) <<
" kB\n";
290 log <<
" malloc heap size: " << std::setw(10) <<
value / (1 << 20) <<
" MB\n";
// Device-runtime limits are only queryable on compute capability >= 3.5,
// mirroring the guard used when setting them.
291 if ((properties.major > 3)
or (properties.major == 3 and properties.minor >= 5)) {
292 cudaCheck(cudaDeviceGetLimit(&
value, cudaLimitDevRuntimeSyncDepth));
293 log <<
" runtime sync depth: " << std::setw(10) <<
value <<
'\n';
294 cudaCheck(cudaDeviceGetLimit(&
value, cudaLimitDevRuntimePendingLaunchCount));
295 log <<
" runtime pending launch count: " << std::setw(10) <<
value <<
'\n';
309 log <<
"CUDAService fully initialized";
// Warm up the allocators with the configured buffer sizes (the "allocator"
// PSet is retrieved in elided lines).
314 devicePreallocate(
numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >(
"devicePreallocate"));
315 hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int> >(
"hostPreallocate"));
// Fragment of fillDescriptions: declares the configuration parameters and
// their defaults. The declarations of `desc`, `limits` and `allocator`
// (elided original lines) are outside this view.
// Master on/off switch for the service.
341 desc.addUntracked<
bool>(
"enabled",
true);
// Each limit defaults to -1, which the constructor treats as "keep the CUDA
// default" (see the `>= 0` guards there).
344 limits.addUntracked<
int>(
"cudaLimitPrintfFifoSize", -1)
345 ->setComment(
"Size in bytes of the shared FIFO used by the printf() device system call.");
346 limits.addUntracked<
int>(
"cudaLimitStackSize", -1)->setComment(
"Stack size in bytes of each GPU thread.");
347 limits.addUntracked<
int>(
"cudaLimitMallocHeapSize", -1)
348 ->setComment(
"Size in bytes of the heap used by the malloc() and free() device system calls.");
349 limits.addUntracked<
int>(
"cudaLimitDevRuntimeSyncDepth", -1)
350 ->setComment(
"Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
351 limits.addUntracked<
int>(
"cudaLimitDevRuntimePendingLaunchCount", -1)
352 ->setComment(
"Maximum number of outstanding device runtime launches that can be made from the current device.");
// NOTE(review): continuation of a setComment(...) call whose opening
// (original lines 353-354) is elided from this extract.
355 "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
356 "the default value.");
// Preallocation lists default to empty (no warm-up allocations).
359 allocator.
addUntracked<std::vector<unsigned int> >(
"devicePreallocate", std::vector<unsigned int>{})
360 ->setComment(
"Preallocates buffers of given bytes on all devices");
361 allocator.
addUntracked<std::vector<unsigned int> >(
"hostPreallocate", std::vector<unsigned int>{})
362 ->setComment(
"Preallocates buffers of given bytes on the host");
// Register this description under the service's configuration label.
365 descriptions.
add(
"CUDAService",
desc);
// Fragment of a "device with most free memory" scan (the function signature,
// the loop header over device index `i`, and the loop/function tails are
// elided from this extract). Tracks the maximum free memory seen so far.
373 size_t maxFreeMemory = 0;
376 size_t freeMemory, totalMemory;
// NOTE(review): unlike the constructor, this call's status is not checked via
// cudaCheck here — possibly deliberate best-effort; confirm in the full source.
378 cudaMemGetInfo(&freeMemory, &totalMemory);
379 edm::LogPrint(
"CUDAService") <<
"CUDA device " <<
i <<
": " << freeMemory / (1 << 20) <<
" MB free / "
380 << totalMemory / (1 << 20) <<
" MB total memory";
// Keep the running maximum; the winning device index is presumably recorded
// in an elided line alongside this assignment.
381 if (freeMemory > maxFreeMemory) {
382 maxFreeMemory = freeMemory;