Constructor.
123 bool configEnabled =
config.getUntrackedParameter<
bool>(
"enabled");
124 if (not configEnabled) {
125 edm::LogInfo(
"CUDAService") <<
"CUDAService disabled by configuration";
130 if (cudaSuccess !=
status) {
131 edm::LogWarning(
"CUDAService") <<
"Failed to initialize the CUDA runtime.\n"
132 <<
"Disabling the CUDAService.";
137 log <<
"CUDA runtime successfully initialised, found " <<
numberOfDevices_ <<
" compute devices.\n\n";
140 auto printfFifoSize =
limits.getUntrackedParameter<
int>(
"cudaLimitPrintfFifoSize");
141 auto stackSize =
limits.getUntrackedParameter<
int>(
"cudaLimitStackSize");
142 auto mallocHeapSize =
limits.getUntrackedParameter<
int>(
"cudaLimitMallocHeapSize");
143 auto devRuntimeSyncDepth =
limits.getUntrackedParameter<
int>(
"cudaLimitDevRuntimeSyncDepth");
144 auto devRuntimePendingLaunchCount =
limits.getUntrackedParameter<
int>(
"cudaLimitDevRuntimePendingLaunchCount");
149 cudaDeviceProp properties;
150 cudaCheck(cudaGetDeviceProperties(&properties,
i));
151 log <<
"CUDA device " <<
i <<
": " << properties.name <<
'\n';
154 log <<
" compute capability: " << properties.major <<
"." << properties.minor <<
" (sm_"
155 << properties.major << properties.minor <<
")\n";
157 log <<
" streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount <<
'\n';
158 log <<
" CUDA cores: " << std::setw(28)
159 << properties.multiProcessorCount *
getCudaCoresPerSM(properties.major, properties.minor) <<
'\n';
160 log <<
" single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio <<
":1\n";
163 static constexpr
const char* computeModeDescription[] = {
165 "exclusive (single thread)",
167 "exclusive (single process)",
169 log <<
" compute mode:" << std::right << std::setw(27)
170 << computeModeDescription[
std::min(properties.computeMode,
171 static_cast<int>(
std::size(computeModeDescription)) - 1)]
176 cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));
180 size_t freeMemory, totalMemory;
181 cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
182 log <<
" memory: " << std::setw(6) << freeMemory / (1 << 20) <<
" MB free / " << std::setw(6)
183 << totalMemory / (1 << 20) <<
" MB total\n";
184 log <<
" constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) <<
" kB\n";
185 log <<
" L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) <<
" kB\n";
188 static constexpr
const char* l1CacheModeDescription[] = {
189 "unknown",
"local memory",
"global memory",
"local and global memory"};
190 int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
191 log <<
" L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] <<
'\n';
194 log <<
"Other capabilities\n";
195 log <<
" " << (properties.canMapHostMemory ?
"can" :
"cannot")
196 <<
" map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
197 log <<
" " << (properties.pageableMemoryAccess ?
"supports" :
"does not support")
198 <<
" coherently accessing pageable memory without calling cudaHostRegister() on it\n";
199 log <<
" " << (properties.pageableMemoryAccessUsesHostPageTables ?
"can" :
"cannot")
200 <<
" access pageable memory via the host's page tables\n";
201 log <<
" " << (properties.canUseHostPointerForRegisteredMem ?
"can" :
"cannot")
202 <<
" access host registered memory at the same virtual address as the host\n";
203 log <<
" " << (properties.unifiedAddressing ?
"shares" :
"does not share")
204 <<
" a unified address space with the host\n";
205 log <<
" " << (properties.managedMemory ?
"supports" :
"does not support")
206 <<
" allocating managed memory on this system\n";
207 log <<
" " << (properties.concurrentManagedAccess ?
"can" :
"cannot")
208 <<
" coherently access managed memory concurrently with the host\n";
210 <<
"the host " << (properties.directManagedMemAccessFromHost ?
"can" :
"cannot")
211 <<
" directly access managed memory on the device without migration\n";
212 log <<
" " << (properties.cooperativeLaunch ?
"supports" :
"does not support")
213 <<
" launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
214 log <<
" " << (properties.cooperativeMultiDeviceLaunch ?
"supports" :
"does not support")
215 <<
" launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
220 log <<
"CUDA flags\n";
223 switch (
flags & cudaDeviceScheduleMask) {
224 case cudaDeviceScheduleAuto:
225 log <<
" thread policy: default\n";
227 case cudaDeviceScheduleSpin:
228 log <<
" thread policy: spin\n";
230 case cudaDeviceScheduleYield:
231 log <<
" thread policy: yield\n";
233 case cudaDeviceScheduleBlockingSync:
234 log <<
" thread policy: blocking sync\n";
237 log <<
" thread policy: undefined\n";
239 if (
flags & cudaDeviceMapHost) {
240 log <<
" pinned host memory allocations: enabled\n";
242 log <<
" pinned host memory allocations: disabled\n";
244 if (
flags & cudaDeviceLmemResizeToMax) {
245 log <<
" kernel host memory reuse: enabled\n";
247 log <<
" kernel host memory reuse: disabled\n";
256 if (printfFifoSize >= 0) {
257 setCudaLimit(cudaLimitPrintfFifoSize,
"cudaLimitPrintfFifoSize", printfFifoSize);
260 if (stackSize >= 0) {
261 setCudaLimit(cudaLimitStackSize,
"cudaLimitStackSize", stackSize);
265 if (mallocHeapSize >= 0) {
266 setCudaLimit(cudaLimitMallocHeapSize,
"cudaLimitMallocHeapSize", mallocHeapSize);
268 if ((properties.major > 3)
or (properties.major == 3 and properties.minor >= 5)) {
271 if (devRuntimeSyncDepth >= 0) {
272 setCudaLimit(cudaLimitDevRuntimeSyncDepth,
"cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
276 if (devRuntimePendingLaunchCount >= 0) {
278 "cudaLimitDevRuntimePendingLaunchCount",
279 devRuntimePendingLaunchCount);
284 log <<
"CUDA limits\n";
286 log <<
" printf buffer size: " << std::setw(10) <<
value / (1 << 20) <<
" MB\n";
288 log <<
" stack size: " << std::setw(10) <<
value / (1 << 10) <<
" kB\n";
290 log <<
" malloc heap size: " << std::setw(10) <<
value / (1 << 20) <<
" MB\n";
291 if ((properties.major > 3)
or (properties.major == 3 and properties.minor >= 5)) {
292 cudaCheck(cudaDeviceGetLimit(&
value, cudaLimitDevRuntimeSyncDepth));
293 log <<
" runtime sync depth: " << std::setw(10) <<
value <<
'\n';
294 cudaCheck(cudaDeviceGetLimit(&
value, cudaLimitDevRuntimePendingLaunchCount));
295 log <<
" runtime pending launch count: " << std::setw(10) <<
value <<
'\n';
309 log <<
"CUDAService fully initialized";
314 devicePreallocate(
numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >(
"devicePreallocate"));
315 hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int> >(
"hostPreallocate"));