11 #include <cuda_runtime.h> 45 if (device < 0 or device >=
size) {
46 throw std::out_of_range(
"Invalid device index" +
std::to_string(device) +
": the valid range is from 0 to " +
65 if (cudaErrorUnsupportedLimit ==
result) {
66 edm::LogWarning(
"CUDAService") <<
"CUDA device " << device <<
": unsupported limit \"" <<
name <<
"\"";
72 if (cudaSuccess !=
result) {
73 edm::LogWarning(
"CUDAService") <<
"CUDA device " << device <<
": failed to set limit \"" <<
name <<
"\" to " 74 << request <<
", current value is " <<
value;
75 }
else if (
value != request) {
77 <<
" instead of requested " << request;
82 switch (major * 10 + minor) {
143 template <
template <
typename>
typename UniquePtr,
typename Allocate>
144 void preallocate(Allocate allocate,
const std::vector<unsigned int>& bufferSizes) {
145 if (bufferSizes.empty())
150 std::vector<UniquePtr<char[]>> buffers;
151 buffers.reserve(bufferSizes.size());
152 for (
auto size : bufferSizes) {
153 buffers.push_back(allocate(
size, streamPtr.get()));
157 void devicePreallocate(
int numberOfDevices,
const std::vector<unsigned int>& bufferSizes) {
162 preallocate<cms::cuda::device::unique_ptr>(
163 [&](
size_t size, cudaStream_t
stream) {
return cms::cuda::make_device_unique<char[]>(
size,
stream); },
169 void hostPreallocate(
const std::vector<unsigned int>& bufferSizes) {
170 preallocate<cms::cuda::host::unique_ptr>(
171 [&](
size_t size, cudaStream_t
stream) {
return cms::cuda::make_host_unique<char[]>(
size,
stream); },
178 if (not
config.getUntrackedParameter<
bool>(
"enabled")) {
179 edm::LogInfo(
"CUDAService") <<
"CUDAService disabled by configuration";
184 if (cudaSuccess !=
status) {
185 edm::LogWarning(
"CUDAService") <<
"Failed to initialize the CUDA runtime.\n" 186 <<
"Disabling the CUDAService.";
192 char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
193 nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH));
194 nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion,
sizeof(systemDriverVersion)));
199 int driverVersion = 0;
200 cudaCheck(cudaDriverGetVersion(&driverVersion));
204 int runtimeVersion = 0;
205 cudaCheck(cudaRuntimeGetVersion(&runtimeVersion));
209 log <<
"NVIDIA driver: " << systemDriverVersion <<
'\n';
214 log <<
"CUDA runtime successfully initialised, found " <<
numberOfDevices_ <<
" compute devices.\n";
216 log <<
"CUDA runtime version " <<
decodeVersion(runtimeVersion) <<
", driver version " 217 <<
decodeVersion(driverVersion) <<
", NVIDIA driver version " << systemDriverVersion;
221 auto printfFifoSize =
limits.getUntrackedParameter<
int>(
"cudaLimitPrintfFifoSize");
222 auto stackSize =
limits.getUntrackedParameter<
int>(
"cudaLimitStackSize");
223 auto mallocHeapSize =
limits.getUntrackedParameter<
int>(
"cudaLimitMallocHeapSize");
224 auto devRuntimeSyncDepth =
limits.getUntrackedParameter<
int>(
"cudaLimitDevRuntimeSyncDepth");
225 auto devRuntimePendingLaunchCount =
limits.getUntrackedParameter<
int>(
"cudaLimitDevRuntimePendingLaunchCount");
227 std::set<std::string>
models;
232 cudaDeviceProp properties;
233 cudaCheck(cudaGetDeviceProperties(&properties,
i));
234 log <<
'\n' <<
"CUDA device " <<
i <<
": " << properties.name;
243 log <<
" compute capability: " << properties.major <<
"." << properties.minor;
245 log <<
" (sm_" << properties.major << properties.minor <<
")";
248 log <<
" streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount <<
'\n';
249 log <<
" CUDA cores: " << std::setw(28)
250 << properties.multiProcessorCount *
getCudaCoresPerSM(properties.major, properties.minor) <<
'\n';
251 log <<
" single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio
256 static constexpr const char* computeModeDescription[] = {
258 "exclusive (single thread)",
260 "exclusive (single process)",
263 log <<
" compute mode:" << std::right << std::setw(27)
264 << computeModeDescription[
std::min(properties.computeMode,
265 static_cast<int>(std::size(computeModeDescription)) - 1)]
271 cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));
276 size_t freeMemory, totalMemory;
277 cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
278 log <<
" memory: " << std::setw(6) << freeMemory / (1 << 20) <<
" MB free / " << std::setw(6)
279 << totalMemory / (1 << 20) <<
" MB total\n";
280 log <<
" constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) <<
" kB\n";
281 log <<
" L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) <<
" kB\n";
286 static constexpr const char* l1CacheModeDescription[] = {
287 "unknown",
"local memory",
"global memory",
"local and global memory"};
288 int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
289 log <<
" L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] <<
'\n';
292 log <<
"Other capabilities\n";
293 log <<
" " << (properties.canMapHostMemory ?
"can" :
"cannot")
294 <<
" map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
295 log <<
" " << (properties.pageableMemoryAccess ?
"supports" :
"does not support")
296 <<
" coherently accessing pageable memory without calling cudaHostRegister() on it\n";
297 log <<
" " << (properties.pageableMemoryAccessUsesHostPageTables ?
"can" :
"cannot")
298 <<
" access pageable memory via the host's page tables\n";
299 log <<
" " << (properties.canUseHostPointerForRegisteredMem ?
"can" :
"cannot")
300 <<
" access host registered memory at the same virtual address as the host\n";
301 log <<
" " << (properties.unifiedAddressing ?
"shares" :
"does not share")
302 <<
" a unified address space with the host\n";
303 log <<
" " << (properties.managedMemory ?
"supports" :
"does not support")
304 <<
" allocating managed memory on this system\n";
305 log <<
" " << (properties.concurrentManagedAccess ?
"can" :
"cannot")
306 <<
" coherently access managed memory concurrently with the host\n";
308 <<
"the host " << (properties.directManagedMemAccessFromHost ?
"can" :
"cannot")
309 <<
" directly access managed memory on the device without migration\n";
310 log <<
" " << (properties.cooperativeLaunch ?
"supports" :
"does not support")
311 <<
" launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
312 log <<
" " << (properties.cooperativeMultiDeviceLaunch ?
"supports" :
"does not support")
313 <<
" launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
320 log <<
"CUDA flags\n";
323 switch (
flags & cudaDeviceScheduleMask) {
324 case cudaDeviceScheduleAuto:
325 log <<
" thread policy: default\n";
327 case cudaDeviceScheduleSpin:
328 log <<
" thread policy: spin\n";
330 case cudaDeviceScheduleYield:
331 log <<
" thread policy: yield\n";
333 case cudaDeviceScheduleBlockingSync:
334 log <<
" thread policy: blocking sync\n";
337 log <<
" thread policy: undefined\n";
339 if (
flags & cudaDeviceMapHost) {
340 log <<
" pinned host memory allocations: enabled\n";
342 log <<
" pinned host memory allocations: disabled\n";
344 if (
flags & cudaDeviceLmemResizeToMax) {
345 log <<
" kernel host memory reuse: enabled\n";
347 log <<
" kernel host memory reuse: disabled\n";
357 if (printfFifoSize >= 0) {
358 setCudaLimit(cudaLimitPrintfFifoSize,
"cudaLimitPrintfFifoSize", printfFifoSize);
361 if (stackSize >= 0) {
362 setCudaLimit(cudaLimitStackSize,
"cudaLimitStackSize", stackSize);
366 if (mallocHeapSize >= 0) {
367 setCudaLimit(cudaLimitMallocHeapSize,
"cudaLimitMallocHeapSize", mallocHeapSize);
369 if ((properties.major > 3)
or (properties.major == 3 and properties.minor >= 5)) {
372 if (devRuntimeSyncDepth >= 0) {
373 setCudaLimit(cudaLimitDevRuntimeSyncDepth,
"cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
377 if (devRuntimePendingLaunchCount >= 0) {
379 "cudaLimitDevRuntimePendingLaunchCount",
380 devRuntimePendingLaunchCount);
386 log <<
"CUDA limits\n";
388 log <<
" printf buffer size: " << std::setw(10) <<
value / (1 << 20) <<
" MB\n";
390 log <<
" stack size: " << std::setw(10) <<
value / (1 << 10) <<
" kB\n";
392 log <<
" malloc heap size: " << std::setw(10) <<
value / (1 << 20) <<
" MB\n";
393 if ((properties.major > 3)
or (properties.major == 3 and properties.minor >= 5)) {
394 cudaCheck(cudaDeviceGetLimit(&
value, cudaLimitDevRuntimeSyncDepth));
395 log <<
" runtime sync depth: " << std::setw(10) <<
value <<
'\n';
396 cudaCheck(cudaDeviceGetLimit(&
value, cudaLimitDevRuntimePendingLaunchCount));
397 log <<
" runtime pending launch count: " << std::setw(10) <<
value <<
'\n';
404 std::vector<std::string> modelsV(
models.begin(),
models.end());
406 std::string nvidiaDriverVersion{systemDriverVersion};
420 log <<
'\n' <<
"CUDAService fully initialized";
426 devicePreallocate(
numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int>>(
"devicePreallocate"));
427 hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int>>(
"hostPreallocate"));
452 desc.addUntracked<
bool>(
"enabled",
true);
453 desc.addUntracked<
bool>(
"verbose",
false);
456 limits.addUntracked<
int>(
"cudaLimitPrintfFifoSize", -1)
457 ->setComment(
"Size in bytes of the shared FIFO used by the printf() device system call.");
458 limits.addUntracked<
int>(
"cudaLimitStackSize", -1)->setComment(
"Stack size in bytes of each GPU thread.");
459 limits.addUntracked<
int>(
"cudaLimitMallocHeapSize", -1)
460 ->setComment(
"Size in bytes of the heap used by the malloc() and free() device system calls.");
461 limits.addUntracked<
int>(
"cudaLimitDevRuntimeSyncDepth", -1)
462 ->setComment(
"Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
463 limits.addUntracked<
int>(
"cudaLimitDevRuntimePendingLaunchCount", -1)
464 ->setComment(
"Maximum number of outstanding device runtime launches that can be made from the current device.");
467 "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps " 468 "the default value.");
471 allocator.
addUntracked<std::vector<unsigned int>>(
"devicePreallocate", std::vector<unsigned int>{})
472 ->setComment(
"Preallocates buffers of given bytes on all devices");
473 allocator.
addUntracked<std::vector<unsigned int>>(
"hostPreallocate", std::vector<unsigned int>{})
474 ->setComment(
"Preallocates buffers of given bytes on the host");
477 descriptions.
add(
"CUDAService",
desc);
#define DEFINE_FWK_SERVICE_MAKER(concrete, maker)
ParameterDescriptionBase * addUntracked(U const &iLabel, T const &value)
std::pair< int, int > computeCapability(int device) const final
bool isProcessWideService(TFileService const *)
#define nvmlCheck(ARG,...)
uint32_t T const *__restrict__ uint32_t const *__restrict__ int32_t int Histo::index_type cudaStream_t stream
void setCudaLimit(cudaLimit limit, const char *name, size_t request)
static std::string to_string(const XMLCh *ch)
static void fillDescriptions(edm::ConfigurationDescriptions &descriptions)
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventID const &, edm::Timestamp const & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
CUDAService(edm::ParameterSet const &config)
Constructor.
Log< level::Info, false > LogInfo
std::vector< std::pair< int, int > > computeCapabilities_
void add(std::string const &label, ParameterSetDescription const &psetDescription)
constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor)
void cachingAllocatorsConstruct()
StreamCache & getStreamCache()
constexpr bool useCaching
void cachingAllocatorsFreeCached()
EventCache & getEventCache()
std::string decodeVersion(int version)
#define cudaCheck(ARG,...)
int numberOfDevices() const final
Log< level::Warning, false > LogWarning
bool enabled() const final