#include <cuda_runtime.h>
void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
  // read the current device
  int device;
  cudaCheck(cudaGetDevice(&device));
  // try to set the requested limit
  auto result = cudaDeviceSetLimit(limit, request);
  if (cudaErrorUnsupportedLimit == result) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": unsupported limit \"" << name << "\"";
    return;
  }
  // read back the limit value
  size_t value;
  result = cudaDeviceGetLimit(&value, limit);
  if (cudaSuccess != result) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": failed to set limit \"" << name << "\" to "
                                   << request << ", current value is " << value;
  } else if (value != request) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": limit \"" << name << "\" set to " << value
                                   << " instead of requested " << request;
  }
}
  // getCudaCoresPerSM(major, minor): look up the number of CUDA cores per streaming multiprocessor
  // from the packed compute capability
  switch (major * 10 + minor) {
  return std::to_string(version / 1000) + '.' + std::to_string(version % 1000 / 10);
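// Example: CUDA encodes versions as 1000 * major + 10 * minor, so decodeVersion(11020) yields "11.2".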
template <template <typename> typename UniquePtr, typename Allocate>
void preallocate(Allocate allocate, const std::vector<unsigned int>& bufferSizes) {
  if (bufferSizes.empty())
    return;

  auto streamPtr = cms::cuda::getStreamCache().get();

  std::vector<UniquePtr<char[]> > buffers;
  buffers.reserve(bufferSizes.size());
  for (auto size : bufferSizes) {
    buffers.push_back(allocate(size, streamPtr.get()));
  }
}
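// Note: the buffers vector is destroyed on return, so the allocations are freed immediately;
// with the caching allocator enabled this pre-populates the allocator's cache with blocks of
// the requested sizes rather than keeping memory reserved.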
void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
  int device;
  cudaCheck(cudaGetDevice(&device));
  for (int i = 0; i < numberOfDevices; ++i) {
    cudaCheck(cudaSetDevice(i));
    preallocate<cms::cuda::device::unique_ptr>(
        [&](size_t size, cudaStream_t stream) { return cms::cuda::make_device_unique<char[]>(size, stream); },
        bufferSizes);
  }
  cudaCheck(cudaSetDevice(device));
}
void hostPreallocate(const std::vector<unsigned int>& bufferSizes) {
  preallocate<cms::cuda::host::unique_ptr>(
      [&](size_t size, cudaStream_t stream) { return cms::cuda::make_host_unique<char[]>(size, stream); },
      bufferSizes);
}
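// Both wrappers reuse preallocate<>() above; they differ only in the unique_ptr type and the
// allocation lambda (device memory via make_device_unique vs. pinned host memory via make_host_unique).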
  if (not configEnabled) {
    edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
    return;
  }

  auto status = cudaGetDeviceCount(&numberOfDevices_);
  if (cudaSuccess != status) {
    edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n"
                                   << "Disabling the CUDAService.";
    return;
  }
  char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
  nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH));
  nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion, sizeof(systemDriverVersion)));
  int driverVersion = 0;
  cudaCheck(cudaDriverGetVersion(&driverVersion));

  int runtimeVersion = 0;
  cudaCheck(cudaRuntimeGetVersion(&runtimeVersion));
  log << "NVIDIA driver: " << systemDriverVersion << '\n';
  log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n";
  log << "CUDA runtime version " << decodeVersion(runtimeVersion) << ", driver version "
      << decodeVersion(driverVersion) << ", NVIDIA driver version " << systemDriverVersion;
  auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
  auto stackSize = limits.getUntrackedParameter<int>("cudaLimitStackSize");
  auto mallocHeapSize = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
  auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
  auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");
    cudaDeviceProp properties;
    cudaCheck(cudaGetDeviceProperties(&properties, i));
    log << '\n' << "CUDA device " << i << ": " << properties.name;
    log << " compute capability: " << properties.major << "." << properties.minor;
    log << " (sm_" << properties.major << properties.minor << ")";
    log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
    log << " CUDA cores: " << std::setw(28)
        << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
    log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio
        << ":1\n";
    static constexpr const char* computeModeDescription[] = {
        "default (shared)",            // cudaComputeModeDefault
        "exclusive (single thread)",   // cudaComputeModeExclusive
        "prohibited",                  // cudaComputeModeProhibited
        "exclusive (single process)",  // cudaComputeModeExclusiveProcess
        "unknown"};
    log << " compute mode:" << std::right << std::setw(27)
        << computeModeDescription[std::min(properties.computeMode,
                                           static_cast<int>(std::size(computeModeDescription)) - 1)];
    cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));
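    // cudaDeviceScheduleAuto lets the runtime choose the host-thread synchronisation policy, and
    // cudaDeviceMapHost allows pinned host allocations to be mapped into the device address space
    // (reported as "pinned host memory allocations: enabled" in the CUDA flags printout below).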
    size_t freeMemory, totalMemory;
    cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
    log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
        << totalMemory / (1 << 20) << " MB total\n";
    log << " constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
    log << " L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";
    static constexpr const char* l1CacheModeDescription[] = {
        "unknown", "local memory", "global memory", "local and global memory"};
    int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
    log << " L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
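    // localL1CacheSupported and globalL1CacheSupported are packed into a 2-bit index (0-3) that
    // selects the matching entry of l1CacheModeDescription above.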
    log << "Other capabilities\n";
    log << " " << (properties.canMapHostMemory ? "can" : "cannot")
        << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
    log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support")
        << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
    log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
        << " access pageable memory via the host's page tables\n";
    log << " " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
        << " access host registered memory at the same virtual address as the host\n";
    log << " " << (properties.unifiedAddressing ? "shares" : "does not share")
        << " a unified address space with the host\n";
    log << " " << (properties.managedMemory ? "supports" : "does not support")
        << " allocating managed memory on this system\n";
    log << " " << (properties.concurrentManagedAccess ? "can" : "cannot")
        << " coherently access managed memory concurrently with the host\n";
    log << " "
        << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
        << " directly access managed memory on the device without migration\n";
    log << " " << (properties.cooperativeLaunch ? "supports" : "does not support")
        << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
    log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
        << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
    log << "CUDA flags\n";
    unsigned int flags;
    cudaCheck(cudaGetDeviceFlags(&flags));
    switch (flags & cudaDeviceScheduleMask) {
      case cudaDeviceScheduleAuto:
        log << " thread policy: default\n";
        break;
      case cudaDeviceScheduleSpin:
        log << " thread policy: spin\n";
        break;
      case cudaDeviceScheduleYield:
        log << " thread policy: yield\n";
        break;
      case cudaDeviceScheduleBlockingSync:
        log << " thread policy: blocking sync\n";
        break;
      default:
        log << " thread policy: undefined\n";
    }
    if (flags & cudaDeviceMapHost) {
      log << " pinned host memory allocations: enabled\n";
    } else {
      log << " pinned host memory allocations: disabled\n";
    }
    if (flags & cudaDeviceLmemResizeToMax) {
      log << " kernel host memory reuse: enabled\n";
    } else {
      log << " kernel host memory reuse: disabled\n";
    }
    if (printfFifoSize >= 0) {
      setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
    }
    if (stackSize >= 0) {
      setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
    }
    if (mallocHeapSize >= 0) {
      setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
    }
    if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
      if (devRuntimeSyncDepth >= 0) {
        setCudaLimit(cudaLimitDevRuntimeSyncDepth, "cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
      }
      if (devRuntimePendingLaunchCount >= 0) {
        setCudaLimit(cudaLimitDevRuntimePendingLaunchCount,
                     "cudaLimitDevRuntimePendingLaunchCount",
                     devRuntimePendingLaunchCount);
      }
    }
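    // The device-runtime limits (sync depth and pending launch count) only apply to devices that
    // support dynamic parallelism, hence the compute capability >= 3.5 guard above.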
    size_t value;
    log << "CUDA limits\n";
    cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
    log << " printf buffer size: " << std::setw(10) << value / (1 << 20) << " MB\n";
    cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
    log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n";
    cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
    log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n";
    if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth));
      log << " runtime sync depth: " << std::setw(10) << value << '\n';
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimePendingLaunchCount));
      log << " runtime pending launch count: " << std::setw(10) << value << '\n';
    }
  log << '\n' << "CUDAService fully initialized";
  devicePreallocate(numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
  hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
  limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)
      ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
  limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
  limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)
      ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
  limits.addUntracked<int>("cudaLimitDevRuntimeSyncDepth", -1)
      ->setComment("Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
  limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)
      ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
      ->setComment(
          "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
          "the default value.");
  allocator.addUntracked<std::vector<unsigned int> >("devicePreallocate", std::vector<unsigned int>{})
      ->setComment("Preallocates buffers of given bytes on all devices");
  allocator.addUntracked<std::vector<unsigned int> >("hostPreallocate", std::vector<unsigned int>{})
      ->setComment("Preallocates buffers of given bytes on the host");

  descriptions.add("CUDAService", desc);
int CUDAService::deviceWithMostFreeMemory() const {
  // save the currently active device
  int currentDevice;
  cudaCheck(cudaGetDevice(&currentDevice));

  size_t maxFreeMemory = 0;
  int device = -1;
  for (int i = 0; i < numberOfDevices_; ++i) {
    size_t freeMemory, totalMemory;
    cudaSetDevice(i);
    cudaMemGetInfo(&freeMemory, &totalMemory);
    edm::LogPrint("CUDAService") << "CUDA device " << i << ": " << freeMemory / (1 << 20) << " MB free / "
                                 << totalMemory / (1 << 20) << " MB total memory";
    if (freeMemory > maxFreeMemory) {
      maxFreeMemory = freeMemory;
      device = i;
    }
  }
  cudaCheck(cudaSetDevice(currentDevice));
  return device;
}
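// The loop above visits every device, logs its free/total memory, and returns the index of the
// device with the most free memory (restoring the originally active device before returning).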