Constructor.
135 bool configEnabled =
config.getUntrackedParameter<
bool>(
"enabled");
136 if (not configEnabled) {
137 edm::LogInfo(
"CUDAService") <<
"CUDAService disabled by configuration";
142 if (cudaSuccess !=
status) {
143 edm::LogWarning(
"CUDAService") <<
"Failed to initialize the CUDA runtime.\n" 144 <<
"Disabling the CUDAService.";
150 char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
151 nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH));
152 nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion,
sizeof(systemDriverVersion)));
157 int driverVersion = 0;
158 cudaCheck(cudaDriverGetVersion(&driverVersion));
162 int runtimeVersion = 0;
163 cudaCheck(cudaRuntimeGetVersion(&runtimeVersion));
167 log <<
"NVIDIA driver: " << systemDriverVersion <<
'\n';
172 log <<
"CUDA runtime successfully initialised, found " <<
numberOfDevices_ <<
" compute devices.\n";
174 log <<
"CUDA runtime version " <<
decodeVersion(runtimeVersion) <<
", driver version " 175 <<
decodeVersion(driverVersion) <<
", NVIDIA driver version " << systemDriverVersion;
179 auto printfFifoSize =
limits.getUntrackedParameter<
int>(
"cudaLimitPrintfFifoSize");
180 auto stackSize =
limits.getUntrackedParameter<
int>(
"cudaLimitStackSize");
181 auto mallocHeapSize =
limits.getUntrackedParameter<
int>(
"cudaLimitMallocHeapSize");
182 auto devRuntimeSyncDepth =
limits.getUntrackedParameter<
int>(
"cudaLimitDevRuntimeSyncDepth");
183 auto devRuntimePendingLaunchCount =
limits.getUntrackedParameter<
int>(
"cudaLimitDevRuntimePendingLaunchCount");
188 cudaDeviceProp properties;
189 cudaCheck(cudaGetDeviceProperties(&properties,
i));
190 log <<
'\n' <<
"CUDA device " <<
i <<
": " << properties.name;
198 log <<
" compute capability: " << properties.major <<
"." << properties.minor;
200 log <<
" (sm_" << properties.major << properties.minor <<
")";
203 log <<
" streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount <<
'\n';
204 log <<
" CUDA cores: " << std::setw(28)
205 << properties.multiProcessorCount *
getCudaCoresPerSM(properties.major, properties.minor) <<
'\n';
206 log <<
" single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio
211 static constexpr
const char* computeModeDescription[] = {
213 "exclusive (single thread)",
215 "exclusive (single process)",
218 log <<
" compute mode:" << std::right << std::setw(27)
219 << computeModeDescription[
std::min(properties.computeMode,
220 static_cast<int>(
std::size(computeModeDescription)) - 1)]
226 cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));
231 size_t freeMemory, totalMemory;
232 cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
233 log <<
" memory: " << std::setw(6) << freeMemory / (1 << 20) <<
" MB free / " << std::setw(6)
234 << totalMemory / (1 << 20) <<
" MB total\n";
235 log <<
" constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) <<
" kB\n";
236 log <<
" L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) <<
" kB\n";
241 static constexpr
const char* l1CacheModeDescription[] = {
242 "unknown",
"local memory",
"global memory",
"local and global memory"};
243 int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
244 log <<
" L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] <<
'\n';
247 log <<
"Other capabilities\n";
248 log <<
" " << (properties.canMapHostMemory ?
"can" :
"cannot")
249 <<
" map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
250 log <<
" " << (properties.pageableMemoryAccess ?
"supports" :
"does not support")
251 <<
" coherently accessing pageable memory without calling cudaHostRegister() on it\n";
252 log <<
" " << (properties.pageableMemoryAccessUsesHostPageTables ?
"can" :
"cannot")
253 <<
" access pageable memory via the host's page tables\n";
254 log <<
" " << (properties.canUseHostPointerForRegisteredMem ?
"can" :
"cannot")
255 <<
" access host registered memory at the same virtual address as the host\n";
256 log <<
" " << (properties.unifiedAddressing ?
"shares" :
"does not share")
257 <<
" a unified address space with the host\n";
258 log <<
" " << (properties.managedMemory ?
"supports" :
"does not support")
259 <<
" allocating managed memory on this system\n";
260 log <<
" " << (properties.concurrentManagedAccess ?
"can" :
"cannot")
261 <<
" coherently access managed memory concurrently with the host\n";
263 <<
"the host " << (properties.directManagedMemAccessFromHost ?
"can" :
"cannot")
264 <<
" directly access managed memory on the device without migration\n";
265 log <<
" " << (properties.cooperativeLaunch ?
"supports" :
"does not support")
266 <<
" launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
267 log <<
" " << (properties.cooperativeMultiDeviceLaunch ?
"supports" :
"does not support")
268 <<
" launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
275 log <<
"CUDA flags\n";
278 switch (
flags & cudaDeviceScheduleMask) {
279 case cudaDeviceScheduleAuto:
280 log <<
" thread policy: default\n";
282 case cudaDeviceScheduleSpin:
283 log <<
" thread policy: spin\n";
285 case cudaDeviceScheduleYield:
286 log <<
" thread policy: yield\n";
288 case cudaDeviceScheduleBlockingSync:
289 log <<
" thread policy: blocking sync\n";
292 log <<
" thread policy: undefined\n";
294 if (
flags & cudaDeviceMapHost) {
295 log <<
" pinned host memory allocations: enabled\n";
297 log <<
" pinned host memory allocations: disabled\n";
299 if (
flags & cudaDeviceLmemResizeToMax) {
300 log <<
" kernel host memory reuse: enabled\n";
302 log <<
" kernel host memory reuse: disabled\n";
312 if (printfFifoSize >= 0) {
316 if (stackSize >= 0) {
321 if (mallocHeapSize >= 0) {
324 if ((properties.major > 3)
or (properties.major == 3 and properties.minor >= 5)) {
327 if (devRuntimeSyncDepth >= 0) {
332 if (devRuntimePendingLaunchCount >= 0) {
334 "cudaLimitDevRuntimePendingLaunchCount",
335 devRuntimePendingLaunchCount);
341 log <<
"CUDA limits\n";
343 log <<
" printf buffer size: " << std::setw(10) <<
value / (1 << 20) <<
" MB\n";
345 log <<
" stack size: " << std::setw(10) <<
value / (1 << 10) <<
" kB\n";
347 log <<
" malloc heap size: " << std::setw(10) <<
value / (1 << 20) <<
" MB\n";
348 if ((properties.major > 3)
or (properties.major == 3 and properties.minor >= 5)) {
350 log <<
" runtime sync depth: " << std::setw(10) <<
value <<
'\n';
352 log <<
" runtime pending launch count: " << std::setw(10) <<
value <<
'\n';
365 log <<
'\n' <<
"CUDAService fully initialized";
#define nvmlCheck(ARG,...)
void setCudaLimit(cudaLimit limit, const char *name, size_t request)
The Signals That Services Can Subscribe To. This is based on ActivityRegistry and is current per … Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application. Each possible callback has some defined arguments, which we here list in angle brackets, e.g. < void, edm::EventID const &, edm::Timestamp const & >. We also list in braces which AR_WATCH_USING_METHOD_ is used for those or …
cudaLimitDevRuntimePendingLaunchCount
Log< level::Info, false > LogInfo
cudaLimitDevRuntimeSyncDepth
std::vector< std::pair< int, int > > computeCapabilities_
constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor)
void cachingAllocatorsConstruct()
StreamCache & getStreamCache()
constexpr bool useCaching
EventCache & getEventCache()
std::string decodeVersion(int version)
#define cudaCheck(ARG,...)
Log< level::Warning, false > LogWarning