CMS 3D CMS Logo

List of all members | Public Member Functions | Static Public Member Functions | Private Attributes
ROCmService Class Reference
Inheritance diagram for ROCmService:
ROCmInterface

Public Member Functions

std::pair< int, int > computeCapability (int device) const final
 
bool enabled () const final
 
int numberOfDevices () const final
 
 ROCmService (edm::ParameterSet const &config)
 Constructor.
 
 ~ROCmService () override
 
- Public Member Functions inherited from ROCmInterface
 ROCmInterface ()=default
 
virtual ~ROCmInterface ()=default
 

Static Public Member Functions

static void fillDescriptions (edm::ConfigurationDescriptions &descriptions)
 

Private Attributes

std::vector< std::pair< int, int > > computeCapabilities_
 
bool enabled_ = false
 
int numberOfDevices_ = 0
 
bool verbose_ = false
 

Detailed Description

Definition at line 27 of file ROCmService.cc.

Constructor & Destructor Documentation

◆ ROCmService()

ROCmService::ROCmService ( edm::ParameterSet const &  config)

Constructor.

Definition at line 88 of file ROCmService.cc.

References computeCapabilities_, ALPAKA_ACCELERATOR_NAMESPACE::brokenline::constexpr(), decodeVersion(), enabled_, HLT_2024v14_cff::flags, hipCheck, mps_fire::i, edm::Service< T >::isAvailable(), TH2PolyOfflineMaps::limits, CrabHelper::log, SiStripPI::min, numberOfDevices_, rsmiCheck, edm::ResourceInformation::setGPUModels(), setHipLimit(), mps_update::status, AlCaHLTBitMon_QueryRunRegistry::string, relativeConstraints::value, and verbose_.

88  : verbose_(config.getUntrackedParameter<bool>("verbose")) {
89  if (not config.getUntrackedParameter<bool>("enabled")) {
90  edm::LogInfo("ROCmService") << "ROCmService disabled by configuration";
91  return;
92  }
93 
94  auto status = hipGetDeviceCount(&numberOfDevices_);
95  if (hipSuccess != status) {
96  edm::LogWarning("ROCmService") << "Failed to initialize the ROCm runtime.\n"
97  << "Disabling the ROCmService.";
98  return;
99  }
101 
102  // AMD system driver version, e.g. 5.16.9.22.20 or 6.1.5
103  char systemDriverVersion[256];
104  rsmiCheck(rsmi_init(0x00));
105  rsmiCheck(rsmi_version_str_get(RSMI_SW_COMP_DRIVER, systemDriverVersion, sizeof(systemDriverVersion) - 1));
106  rsmiCheck(rsmi_shut_down());
107 
108  // ROCm driver version, e.g. 11.4
109  // the full version, like 11.4.1 or 11.4.100, is not reported
110  int driverVersion = 0;
111  hipCheck(hipDriverGetVersion(&driverVersion));
112 
113  // ROCm runtime version, e.g. 11.4
114  // the full version, like 11.4.1 or 11.4.108, is not reported
115  int runtimeVersion = 0;
116  hipCheck(hipRuntimeGetVersion(&runtimeVersion));
117 
118  edm::LogInfo log("ROCmService");
119  if (verbose_) {
120  log << "AMD kernel driver: " << systemDriverVersion << '\n';
121  log << "ROCm driver API: " << decodeVersion(driverVersion) << " (compiled with ROCm " <<
122 #ifdef ROCM_BUILD_INFO
123  // ROCM_BUILD_INFO has been introduced in ROCm 5.5.0
124  ROCM_BUILD_INFO
125 #else
126  ROCM_VERSION_MAJOR << '.' << ROCM_VERSION_MINOR << '.' << ROCM_VERSION_PATCH
127 #endif
128  << ")\n";
129  log << "ROCm runtime API: " << decodeVersion(runtimeVersion) << " (compiled with HIP " << HIP_VERSION_MAJOR << '.'
130  << HIP_VERSION_MINOR << '.' << HIP_VERSION_PATCH << ")\n";
131  log << "ROCm runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n";
132  } else {
133  log << "ROCm runtime version " << decodeVersion(runtimeVersion) << ", driver version "
134  << decodeVersion(driverVersion) << ", AMD driver version " << systemDriverVersion;
135  }
136 
137 #if HIP_VERSION >= 50400000
138  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
139  auto stackSize = limits.getUntrackedParameter<int>("hipLimitStackSize");
140  auto mallocHeapSize = limits.getUntrackedParameter<int>("hipLimitMallocHeapSize");
141 #endif
142 
143  std::set<std::string> models;
144 
145  for (int i = 0; i < numberOfDevices_; ++i) {
146  // read information about the compute device.
147  // see the documentation of hipGetDeviceProperties() for more information.
148  hipDeviceProp_t properties;
149  hipCheck(hipGetDeviceProperties(&properties, i));
150  log << '\n' << "ROCm device " << i << ": " << properties.name;
151  if (verbose_) {
152  log << '\n';
153  }
154  models.insert(std::string(properties.name));
155 
156  // compute capabilities
157  computeCapabilities_.emplace_back(properties.major, properties.minor);
158  if (verbose_) {
159  log << " compute capability: " << properties.gcnArchName;
160  } else {
161  log << " (" << properties.gcnArchName << ")";
162  }
163  if (verbose_) {
164  log << '\n';
165  log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
166  log << " ROCm cores: " << std::setw(28) << "not yet implemented" << '\n';
167  // ROCm does not provide single to double performance ratio
168  }
169 
170  // compute mode
171  static constexpr const char* computeModeDescription[] = {
172  "default (shared)", // hipComputeModeDefault
173  "exclusive (single thread)", // hipComputeModeExclusive
174  "prohibited", // hipComputeModeProhibited
175  "exclusive (single process)", // hipComputeModeExclusiveProcess
176  "unknown"};
177  if (verbose_) {
178  log << " compute mode:" << std::right << std::setw(27)
179  << computeModeDescription[std::min(properties.computeMode,
180  static_cast<int>(std::size(computeModeDescription)) - 1)]
181  << '\n';
182  }
183 
184  // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with an exception
185  hipCheck(hipSetDevice(i));
186  hipCheck(hipSetDeviceFlags(hipDeviceScheduleAuto | hipDeviceMapHost));
187 
188  if (verbose_) {
189  // read the free and total amount of memory available for allocation by the device, in bytes.
190  // see the documentation of hipMemGetInfo() for more information.
191  size_t freeMemory = 0;
192  size_t totalMemory = 0;
193  hipCheck(hipMemGetInfo(&freeMemory, &totalMemory));
194  log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
195  << totalMemory / (1 << 20) << " MB total\n";
196  log << " constant memory: " << std::setw(8) << properties.totalConstMem / (1 << 10) << " kB\n";
197  log << " L2 cache size: " << std::setw(8) << properties.l2CacheSize / (1 << 10) << " kB\n";
198 
199  log << '\n';
200 
201  // other capabilities
202  log << "Other capabilities\n";
203  log << " " << (properties.canMapHostMemory ? "can" : "cannot")
204  << " map host memory into the ROCm address space for use with hipHostAlloc()/hipHostGetDevicePointer()\n";
205  log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support")
206  << " coherently accessing pageable memory without calling hipHostRegister() on it\n";
207  log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
208  << " access pageable memory via the host's page tables\n";
209  log << " " << (properties.managedMemory ? "supports" : "does not support")
210  << " allocating managed memory on this system\n";
211  log << " " << (properties.concurrentManagedAccess ? "can" : "cannot")
212  << " coherently access managed memory concurrently with the host\n";
213  log << " "
214  << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
215  << " directly access managed memory on the device without migration\n";
216  log << " " << (properties.cooperativeLaunch ? "supports" : "does not support")
217  << " launching cooperative kernels via hipLaunchCooperativeKernel()\n";
218  log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
219  << " launching cooperative kernels via hipLaunchCooperativeKernelMultiDevice()\n";
220  log << '\n';
221  }
222 
223  // set and read the ROCm device flags.
224  // see the documentation of hipSetDeviceFlags and hipGetDeviceFlags for more information.
225  if (verbose_) {
226  log << "ROCm flags\n";
227  unsigned int flags;
228  hipCheck(hipGetDeviceFlags(&flags));
229  switch (flags & hipDeviceScheduleMask) {
230  case hipDeviceScheduleAuto:
231  log << " thread policy: default\n";
232  break;
233  case hipDeviceScheduleSpin:
234  log << " thread policy: spin\n";
235  break;
236  case hipDeviceScheduleYield:
237  log << " thread policy: yield\n";
238  break;
239  case hipDeviceScheduleBlockingSync:
240  log << " thread policy: blocking sync\n";
241  break;
242  default:
243  log << " thread policy: undefined\n";
244  }
245  if (flags & hipDeviceMapHost) {
246  log << " pinned host memory allocations: enabled\n";
247  } else {
248  log << " pinned host memory allocations: disabled\n";
249  }
250  if (flags & hipDeviceLmemResizeToMax) {
251  log << " kernel host memory reuse: enabled\n";
252  } else {
253  log << " kernel host memory reuse: disabled\n";
254  }
255  log << '\n';
256  }
257 
258  // set and read the ROCm resource limits.
259  // see the documentation of hipDeviceSetLimit() for more information.
260 
261 #if HIP_VERSION >= 50400000
262  // hipLimitStackSize controls the stack size in bytes of each GPU thread.
263  if (stackSize >= 0) {
264  setHipLimit(hipLimitStackSize, "hipLimitStackSize", stackSize);
265  }
266  // hipLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
267  // and free() device system calls.
268  if (mallocHeapSize >= 0) {
269  setHipLimit(hipLimitMallocHeapSize, "hipLimitMallocHeapSize", mallocHeapSize);
270  }
271 #endif
272 
273  if (verbose_) {
274  size_t value;
275  log << "ROCm limits\n";
276 #if HIP_VERSION >= 50400000
277  hipCheck(hipDeviceGetLimit(&value, hipLimitStackSize));
278  log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n";
279 #endif
280  hipCheck(hipDeviceGetLimit(&value, hipLimitMallocHeapSize));
281  log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n";
282  }
283  }
284 
285  edm::Service<edm::ResourceInformation> resourceInformationService;
286  if (resourceInformationService.isAvailable()) {
287  std::vector<std::string> modelsV(models.begin(), models.end());
288  resourceInformationService->setGPUModels(modelsV);
289  /*
290  std::string nvidiaDriverVersion{systemDriverVersion};
291  resourceInformationService->setNvidiaDriverVersion(nvidiaDriverVersion);
292  resourceInformationService->setCudaDriverVersion(driverVersion);
293  resourceInformationService->setCudaRuntimeVersion(runtimeVersion);
294  */
295  }
296 
297  if (verbose_) {
298  log << '\n' << "ROCmService fully initialized";
299  }
300  enabled_ = true;
301 }
Definition: config.py:1
Definition: models.py:1
#define rsmiCheck(ARG,...)
Definition: rsmiCheck.h:53
std::vector< std::pair< int, int > > computeCapabilities_
Definition: ROCmService.cc:50
Definition: value.py:1
void setHipLimit(hipLimit_t limit, const char *name, size_t request)
Definition: ROCmService.cc:55
Log< level::Info, false > LogInfo
int numberOfDevices_
Definition: ROCmService.cc:49
#define hipCheck(ARG,...)
Definition: hipCheck.h:52
std::string decodeVersion(int version)
Definition: ROCmService.cc:81
bool isAvailable() const
Definition: Service.h:40
Log< level::Warning, false > LogWarning
virtual void setGPUModels(std::vector< std::string > const &)=0

◆ ~ROCmService()

ROCmService::~ROCmService ( )
override

Definition at line 303 of file ROCmService.cc.

References enabled_, hipCheck, mps_fire::i, and numberOfDevices_.

303  {
304  if (enabled_) {
305  for (int i = 0; i < numberOfDevices_; ++i) {
306  hipCheck(hipSetDevice(i));
307  hipCheck(hipDeviceSynchronize());
308  // Explicitly destroys and cleans up all resources associated with the current device in the
309  // current process. Any subsequent API call to this device will reinitialize the device.
310  // Useful to check for memory leaks.
311  hipCheck(hipDeviceReset());
312  }
313  }
314 }
int numberOfDevices_
Definition: ROCmService.cc:49
#define hipCheck(ARG,...)
Definition: hipCheck.h:52

Member Function Documentation

◆ computeCapability()

std::pair<int, int> ROCmService::computeCapability ( int  device) const
inline final virtual

Implements ROCmInterface.

Definition at line 39 of file ROCmService.cc.

References computeCapabilities_, findQualityFiles::size, and to_string().

39  {
40  int size = computeCapabilities_.size();
41  if (device < 0 or device >= size) {
42  throw std::out_of_range("Invalid device index" + std::to_string(device) + ": the valid range is from 0 to " +
43  std::to_string(size - 1));
44  }
45  return computeCapabilities_[device];
46  }
size
Write out results.
static std::string to_string(const XMLCh *ch)
std::vector< std::pair< int, int > > computeCapabilities_
Definition: ROCmService.cc:50

◆ enabled()

bool ROCmService::enabled ( ) const
inline final virtual

Implements ROCmInterface.

Definition at line 34 of file ROCmService.cc.

References enabled_.

34 { return enabled_; }

◆ fillDescriptions()

void ROCmService::fillDescriptions ( edm::ConfigurationDescriptions descriptions)
static

Definition at line 316 of file ROCmService.cc.

References edm::ConfigurationDescriptions::add(), submitPVResolutionJobs::desc, and TH2PolyOfflineMaps::limits.

316  {
317  edm::ParameterSetDescription desc;
318  desc.addUntracked<bool>("enabled", true);
319  desc.addUntracked<bool>("verbose", false);
320 
321 #if HIP_VERSION >= 50400000
322  edm::ParameterSetDescription limits;
323  limits.addUntracked<int>("hipLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
324  limits.addUntracked<int>("hipLimitMallocHeapSize", -1)
325  ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
326  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
327  ->setComment(
328  "See the documentation of hipDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
329  "the default value.");
330 #endif
331 
332  descriptions.add("ROCmService", desc);
333 }
void add(std::string const &label, ParameterSetDescription const &psetDescription)

◆ numberOfDevices()

int ROCmService::numberOfDevices ( ) const
inline final virtual

Implements ROCmInterface.

Definition at line 36 of file ROCmService.cc.

References numberOfDevices_.

36 { return numberOfDevices_; }
int numberOfDevices_
Definition: ROCmService.cc:49

Member Data Documentation

◆ computeCapabilities_

std::vector<std::pair<int, int> > ROCmService::computeCapabilities_
private

Definition at line 50 of file ROCmService.cc.

Referenced by computeCapability(), and ROCmService().

◆ enabled_

bool ROCmService::enabled_ = false
private

Definition at line 51 of file ROCmService.cc.

Referenced by enabled(), ROCmService(), and ~ROCmService().

◆ numberOfDevices_

int ROCmService::numberOfDevices_ = 0
private

Definition at line 49 of file ROCmService.cc.

Referenced by numberOfDevices(), ROCmService(), and ~ROCmService().

◆ verbose_

bool ROCmService::verbose_ = false
private

Definition at line 52 of file ROCmService.cc.

Referenced by ROCmService().