Constructor.
178 if (not
config.getUntrackedParameter<
bool>(
"enabled")) {
179 edm::LogInfo(
"CUDAService") <<
"CUDAService disabled by configuration";
184 if (cudaSuccess !=
status) {
185 edm::LogWarning(
"CUDAService") <<
"Failed to initialize the CUDA runtime.\n" 186 <<
"Disabling the CUDAService.";
192 char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
193 nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH));
194 nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion,
sizeof(systemDriverVersion)));
199 int driverVersion = 0;
200 cudaCheck(cudaDriverGetVersion(&driverVersion));
204 int runtimeVersion = 0;
205 cudaCheck(cudaRuntimeGetVersion(&runtimeVersion));
209 log <<
"NVIDIA driver: " << systemDriverVersion <<
'\n';
214 log <<
"CUDA runtime successfully initialised, found " <<
numberOfDevices_ <<
" compute devices.\n";
216 log <<
"CUDA runtime version " <<
decodeVersion(runtimeVersion) <<
", driver version " 217 <<
decodeVersion(driverVersion) <<
", NVIDIA driver version " << systemDriverVersion;
221 auto printfFifoSize =
limits.getUntrackedParameter<
int>(
"cudaLimitPrintfFifoSize");
222 auto stackSize =
limits.getUntrackedParameter<
int>(
"cudaLimitStackSize");
223 auto mallocHeapSize =
limits.getUntrackedParameter<
int>(
"cudaLimitMallocHeapSize");
224 auto devRuntimeSyncDepth =
limits.getUntrackedParameter<
int>(
"cudaLimitDevRuntimeSyncDepth");
225 auto devRuntimePendingLaunchCount =
limits.getUntrackedParameter<
int>(
"cudaLimitDevRuntimePendingLaunchCount");
227 std::set<std::string>
models;
232 cudaDeviceProp properties;
233 cudaCheck(cudaGetDeviceProperties(&properties,
i));
234 log <<
'\n' <<
"CUDA device " <<
i <<
": " << properties.name;
243 log <<
" compute capability: " << properties.major <<
"." << properties.minor;
245 log <<
" (sm_" << properties.major << properties.minor <<
")";
248 log <<
" streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount <<
'\n';
249 log <<
" CUDA cores: " << std::setw(28)
250 << properties.multiProcessorCount *
getCudaCoresPerSM(properties.major, properties.minor) <<
'\n';
251 log <<
" single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio
256 static constexpr const char* computeModeDescription[] = {
258 "exclusive (single thread)",
260 "exclusive (single process)",
263 log <<
" compute mode:" << std::right << std::setw(27)
264 << computeModeDescription[
std::min(properties.computeMode,
265 static_cast<int>(std::size(computeModeDescription)) - 1)]
271 cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));
276 size_t freeMemory, totalMemory;
277 cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
278 log <<
" memory: " << std::setw(6) << freeMemory / (1 << 20) <<
" MB free / " << std::setw(6)
279 << totalMemory / (1 << 20) <<
" MB total\n";
280 log <<
" constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) <<
" kB\n";
281 log <<
" L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) <<
" kB\n";
286 static constexpr const char* l1CacheModeDescription[] = {
287 "unknown",
"local memory",
"global memory",
"local and global memory"};
288 int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
289 log <<
" L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] <<
'\n';
292 log <<
"Other capabilities\n";
293 log <<
" " << (properties.canMapHostMemory ?
"can" :
"cannot")
294 <<
" map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
295 log <<
" " << (properties.pageableMemoryAccess ?
"supports" :
"does not support")
296 <<
" coherently accessing pageable memory without calling cudaHostRegister() on it\n";
297 log <<
" " << (properties.pageableMemoryAccessUsesHostPageTables ?
"can" :
"cannot")
298 <<
" access pageable memory via the host's page tables\n";
299 log <<
" " << (properties.canUseHostPointerForRegisteredMem ?
"can" :
"cannot")
300 <<
" access host registered memory at the same virtual address as the host\n";
301 log <<
" " << (properties.unifiedAddressing ?
"shares" :
"does not share")
302 <<
" a unified address space with the host\n";
303 log <<
" " << (properties.managedMemory ?
"supports" :
"does not support")
304 <<
" allocating managed memory on this system\n";
305 log <<
" " << (properties.concurrentManagedAccess ?
"can" :
"cannot")
306 <<
" coherently access managed memory concurrently with the host\n";
308 <<
"the host " << (properties.directManagedMemAccessFromHost ?
"can" :
"cannot")
309 <<
" directly access managed memory on the device without migration\n";
310 log <<
" " << (properties.cooperativeLaunch ?
"supports" :
"does not support")
311 <<
" launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
312 log <<
" " << (properties.cooperativeMultiDeviceLaunch ?
"supports" :
"does not support")
313 <<
" launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
320 log <<
"CUDA flags\n";
323 switch (
flags & cudaDeviceScheduleMask) {
324 case cudaDeviceScheduleAuto:
325 log <<
" thread policy: default\n";
327 case cudaDeviceScheduleSpin:
328 log <<
" thread policy: spin\n";
330 case cudaDeviceScheduleYield:
331 log <<
" thread policy: yield\n";
333 case cudaDeviceScheduleBlockingSync:
334 log <<
" thread policy: blocking sync\n";
337 log <<
" thread policy: undefined\n";
339 if (
flags & cudaDeviceMapHost) {
340 log <<
" pinned host memory allocations: enabled\n";
342 log <<
" pinned host memory allocations: disabled\n";
344 if (
flags & cudaDeviceLmemResizeToMax) {
345 log <<
" kernel host memory reuse: enabled\n";
347 log <<
" kernel host memory reuse: disabled\n";
357 if (printfFifoSize >= 0) {
358 setCudaLimit(cudaLimitPrintfFifoSize,
"cudaLimitPrintfFifoSize", printfFifoSize);
361 if (stackSize >= 0) {
362 setCudaLimit(cudaLimitStackSize,
"cudaLimitStackSize", stackSize);
366 if (mallocHeapSize >= 0) {
367 setCudaLimit(cudaLimitMallocHeapSize,
"cudaLimitMallocHeapSize", mallocHeapSize);
369 if ((properties.major > 3)
or (properties.major == 3 and properties.minor >= 5)) {
372 if (devRuntimeSyncDepth >= 0) {
373 setCudaLimit(cudaLimitDevRuntimeSyncDepth,
"cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
377 if (devRuntimePendingLaunchCount >= 0) {
379 "cudaLimitDevRuntimePendingLaunchCount",
380 devRuntimePendingLaunchCount);
386 log <<
"CUDA limits\n";
388 log <<
" printf buffer size: " << std::setw(10) <<
value / (1 << 20) <<
" MB\n";
390 log <<
" stack size: " << std::setw(10) <<
value / (1 << 10) <<
" kB\n";
392 log <<
" malloc heap size: " << std::setw(10) <<
value / (1 << 20) <<
" MB\n";
393 if ((properties.major > 3)
or (properties.major == 3 and properties.minor >= 5)) {
394 cudaCheck(cudaDeviceGetLimit(&
value, cudaLimitDevRuntimeSyncDepth));
395 log <<
" runtime sync depth: " << std::setw(10) <<
value <<
'\n';
396 cudaCheck(cudaDeviceGetLimit(&
value, cudaLimitDevRuntimePendingLaunchCount));
397 log <<
" runtime pending launch count: " << std::setw(10) <<
value <<
'\n';
404 std::vector<std::string> modelsV(
models.begin(),
models.end());
406 std::string nvidiaDriverVersion{systemDriverVersion};
420 log <<
'\n' <<
"CUDAService fully initialized";
426 devicePreallocate(
numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int>>(
"devicePreallocate"));
427 hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int>>(
"hostPreallocate"));
#define nvmlCheck(ARG,...)
void setCudaLimit(cudaLimit limit, const char *name, size_t request)
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventID const &, edm::Timestamp const & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
Log< level::Info, false > LogInfo
std::vector< std::pair< int, int > > computeCapabilities_
constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor)
void cachingAllocatorsConstruct()
StreamCache & getStreamCache()
constexpr bool useCaching
EventCache & getEventCache()
std::string decodeVersion(int version)
#define cudaCheck(ARG,...)
Log< level::Warning, false > LogWarning