CMS 3D CMS Logo

List of all members | Public Member Functions | Static Public Member Functions | Private Attributes
ROCmService Class Reference
Inheritance diagram for ROCmService:
ROCmInterface

Public Member Functions

std::pair< int, int > computeCapability (int device) const final
 
bool enabled () const final
 
int numberOfDevices () const final
 
 ROCmService (edm::ParameterSet const &config)
 Constructor. More...
 
 ~ROCmService () override
 
- Public Member Functions inherited from ROCmInterface
 ROCmInterface ()=default
 
virtual ~ROCmInterface ()=default
 

Static Public Member Functions

static void fillDescriptions (edm::ConfigurationDescriptions &descriptions)
 

Private Attributes

std::vector< std::pair< int, int > > computeCapabilities_
 
bool enabled_ = false
 
int numberOfDevices_ = 0
 
bool verbose_ = false
 

Detailed Description

Definition at line 22 of file ROCmService.cc.

Constructor & Destructor Documentation

◆ ROCmService()

ROCmService::ROCmService ( edm::ParameterSet const &  config)

Constructor.

Definition at line 83 of file ROCmService.cc.

References computeCapabilities_, ALPAKA_ACCELERATOR_NAMESPACE::brokenline::constexpr(), decodeVersion(), enabled_, HLT_2024v11_cff::flags, hipCheck, mps_fire::i, edm::Service< T >::isAvailable(), TH2PolyOfflineMaps::limits, dqm-mbProfile::log, SiStripPI::min, numberOfDevices_, rsmiCheck, edm::ResourceInformation::setGPUModels(), setHipLimit(), mps_update::status, AlCaHLTBitMon_QueryRunRegistry::string, relativeConstraints::value, and verbose_.

83  : verbose_(config.getUntrackedParameter<bool>("verbose")) {
84  if (not config.getUntrackedParameter<bool>("enabled")) {
85  edm::LogInfo("ROCmService") << "ROCmService disabled by configuration";
86  return;
87  }
88 
89  auto status = hipGetDeviceCount(&numberOfDevices_);
90  if (hipSuccess != status) {
91  edm::LogWarning("ROCmService") << "Failed to initialize the ROCm runtime.\n"
92  << "Disabling the ROCmService.";
93  return;
94  }
96 
97  // AMD system driver version, e.g. 5.16.9.22.20 or 6.1.5
98  char systemDriverVersion[256];
99  rsmiCheck(rsmi_init(0x00));
100  rsmiCheck(rsmi_version_str_get(RSMI_SW_COMP_DRIVER, systemDriverVersion, sizeof(systemDriverVersion) - 1));
101  rsmiCheck(rsmi_shut_down());
102 
103  // ROCm driver version, e.g. 11.4
104  // the full version, like 11.4.1 or 11.4.100, is not reported
105  int driverVersion = 0;
106  hipCheck(hipDriverGetVersion(&driverVersion));
107 
108  // ROCm runtime version, e.g. 11.4
109  // the full version, like 11.4.1 or 11.4.108, is not reported
110  int runtimeVersion = 0;
111  hipCheck(hipRuntimeGetVersion(&runtimeVersion));
112 
113  edm::LogInfo log("ROCmService");
114  if (verbose_) {
115  log << "AMD kernel driver: " << systemDriverVersion << '\n';
116  log << "ROCm driver API: " << decodeVersion(driverVersion) << " (compiled with ROCm " <<
117 #ifdef ROCM_BUILD_INFO
118  // ROCM_BUILD_INFO has been introduced in ROCm 5.5.0
119  ROCM_BUILD_INFO
120 #else
121  ROCM_VERSION_MAJOR << '.' << ROCM_VERSION_MINOR << '.' << ROCM_VERSION_PATCH
122 #endif
123  << ")\n";
124  log << "ROCm runtime API: " << decodeVersion(runtimeVersion) << " (compiled with HIP " << HIP_VERSION_MAJOR << '.'
125  << HIP_VERSION_MINOR << '.' << HIP_VERSION_PATCH << ")\n";
126  log << "ROCm runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n";
127  } else {
128  log << "ROCm runtime version " << decodeVersion(runtimeVersion) << ", driver version "
129  << decodeVersion(driverVersion) << ", AMD driver version " << systemDriverVersion;
130  }
131 
132 #if HIP_VERSION >= 50400000
133  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
134  auto stackSize = limits.getUntrackedParameter<int>("hipLimitStackSize");
135  auto mallocHeapSize = limits.getUntrackedParameter<int>("hipLimitMallocHeapSize");
136 #endif
137 
138  std::set<std::string> models;
139 
140  for (int i = 0; i < numberOfDevices_; ++i) {
141  // read information about the compute device.
142  // see the documentation of hipGetDeviceProperties() for more information.
143  hipDeviceProp_t properties;
144  hipCheck(hipGetDeviceProperties(&properties, i));
145  log << '\n' << "ROCm device " << i << ": " << properties.name;
146  if (verbose_) {
147  log << '\n';
148  }
149  models.insert(std::string(properties.name));
150 
151  // compute capabilities
152  computeCapabilities_.emplace_back(properties.major, properties.minor);
153  if (verbose_) {
154  log << " compute capability: " << properties.gcnArchName;
155  } else {
156  log << " (" << properties.gcnArchName << ")";
157  }
158  if (verbose_) {
159  log << '\n';
160  log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
161  log << " ROCm cores: " << std::setw(28) << "not yet implemented" << '\n';
162  // ROCm does not provide single to double performance ratio
163  }
164 
165  // compute mode
166  static constexpr const char* computeModeDescription[] = {
167  "default (shared)", // hipComputeModeDefault
168  "exclusive (single thread)", // hipComputeModeExclusive
169  "prohibited", // hipComputeModeProhibited
170  "exclusive (single process)", // hipComputeModeExclusiveProcess
171  "unknown"};
172  if (verbose_) {
173  log << " compute mode:" << std::right << std::setw(27)
174  << computeModeDescription[std::min(properties.computeMode,
175  static_cast<int>(std::size(computeModeDescription)) - 1)]
176  << '\n';
177  }
178 
179  // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with an exception
180  hipCheck(hipSetDevice(i));
181  hipCheck(hipSetDeviceFlags(hipDeviceScheduleAuto | hipDeviceMapHost));
182 
183  if (verbose_) {
184  // read the free and total amount of memory available for allocation by the device, in bytes.
185  // see the documentation of hipMemGetInfo() for more information.
186  size_t freeMemory = 0;
187  size_t totalMemory = 0;
188  hipCheck(hipMemGetInfo(&freeMemory, &totalMemory));
189  log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
190  << totalMemory / (1 << 20) << " MB total\n";
191  log << " constant memory: " << std::setw(8) << properties.totalConstMem / (1 << 10) << " kB\n";
192  log << " L2 cache size: " << std::setw(8) << properties.l2CacheSize / (1 << 10) << " kB\n";
193 
194  log << '\n';
195 
196  // other capabilities
197  log << "Other capabilities\n";
198  log << " " << (properties.canMapHostMemory ? "can" : "cannot")
199  << " map host memory into the ROCm address space for use with hipHostAlloc()/hipHostGetDevicePointer()\n";
200  log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support")
201  << " coherently accessing pageable memory without calling hipHostRegister() on it\n";
202  log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
203  << " access pageable memory via the host's page tables\n";
204  log << " " << (properties.managedMemory ? "supports" : "does not support")
205  << " allocating managed memory on this system\n";
206  log << " " << (properties.concurrentManagedAccess ? "can" : "cannot")
207  << " coherently access managed memory concurrently with the host\n";
208  log << " "
209  << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
210  << " directly access managed memory on the device without migration\n";
211  log << " " << (properties.cooperativeLaunch ? "supports" : "does not support")
212  << " launching cooperative kernels via hipLaunchCooperativeKernel()\n";
213  log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
214  << " launching cooperative kernels via hipLaunchCooperativeKernelMultiDevice()\n";
215  log << '\n';
216  }
217 
218  // set and read the ROCm device flags.
219  // see the documentation of hipSetDeviceFlags and hipGetDeviceFlags for more information.
220  if (verbose_) {
221  log << "ROCm flags\n";
222  unsigned int flags;
223  hipCheck(hipGetDeviceFlags(&flags));
224  switch (flags & hipDeviceScheduleMask) {
225  case hipDeviceScheduleAuto:
226  log << " thread policy: default\n";
227  break;
228  case hipDeviceScheduleSpin:
229  log << " thread policy: spin\n";
230  break;
231  case hipDeviceScheduleYield:
232  log << " thread policy: yield\n";
233  break;
234  case hipDeviceScheduleBlockingSync:
235  log << " thread policy: blocking sync\n";
236  break;
237  default:
238  log << " thread policy: undefined\n";
239  }
240  if (flags & hipDeviceMapHost) {
241  log << " pinned host memory allocations: enabled\n";
242  } else {
243  log << " pinned host memory allocations: disabled\n";
244  }
245  if (flags & hipDeviceLmemResizeToMax) {
246  log << " kernel host memory reuse: enabled\n";
247  } else {
248  log << " kernel host memory reuse: disabled\n";
249  }
250  log << '\n';
251  }
252 
253  // set and read the ROCm resource limits.
254  // see the documentation of hipDeviceSetLimit() for more information.
255 
256 #if HIP_VERSION >= 50400000
257  // hipLimitStackSize controls the stack size in bytes of each GPU thread.
258  if (stackSize >= 0) {
259  setHipLimit(hipLimitStackSize, "hipLimitStackSize", stackSize);
260  }
261  // hipLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
262  // and free() device system calls.
263  if (mallocHeapSize >= 0) {
264  setHipLimit(hipLimitMallocHeapSize, "hipLimitMallocHeapSize", mallocHeapSize);
265  }
266 #endif
267 
268  if (verbose_) {
269  size_t value;
270  log << "ROCm limits\n";
271 #if HIP_VERSION >= 50400000
272  hipCheck(hipDeviceGetLimit(&value, hipLimitStackSize));
273  log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n";
274 #endif
275  hipCheck(hipDeviceGetLimit(&value, hipLimitMallocHeapSize));
276  log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n";
277  }
278  }
279 
280  edm::Service<edm::ResourceInformation> resourceInformationService;
281  if (resourceInformationService.isAvailable()) {
282  std::vector<std::string> modelsV(models.begin(), models.end());
283  resourceInformationService->setGPUModels(modelsV);
284  /*
285  std::string nvidiaDriverVersion{systemDriverVersion};
286  resourceInformationService->setNvidiaDriverVersion(nvidiaDriverVersion);
287  resourceInformationService->setCudaDriverVersion(driverVersion);
288  resourceInformationService->setCudaRuntimeVersion(runtimeVersion);
289  */
290  }
291 
292  if (verbose_) {
293  log << '\n' << "ROCmService fully initialized";
294  }
295  enabled_ = true;
296 }
Definition: config.py:1
Definition: models.py:1
#define rsmiCheck(ARG,...)
Definition: rsmiCheck.h:53
std::vector< std::pair< int, int > > computeCapabilities_
Definition: ROCmService.cc:45
Definition: value.py:1
void setHipLimit(hipLimit_t limit, const char *name, size_t request)
Definition: ROCmService.cc:50
Log< level::Info, false > LogInfo
int numberOfDevices_
Definition: ROCmService.cc:44
#define hipCheck(ARG,...)
Definition: hipCheck.h:52
std::string decodeVersion(int version)
Definition: ROCmService.cc:76
bool isAvailable() const
Definition: Service.h:40
Log< level::Warning, false > LogWarning
virtual void setGPUModels(std::vector< std::string > const &)=0

◆ ~ROCmService()

ROCmService::~ROCmService ( )
override

Definition at line 298 of file ROCmService.cc.

References enabled_, hipCheck, mps_fire::i, and numberOfDevices_.

298  {
299  if (enabled_) {
300  for (int i = 0; i < numberOfDevices_; ++i) {
301  hipCheck(hipSetDevice(i));
302  hipCheck(hipDeviceSynchronize());
303  // Explicitly destroys and cleans up all resources associated with the current device in the
304  // current process. Any subsequent API call to this device will reinitialize the device.
305  // Useful to check for memory leaks.
306  hipCheck(hipDeviceReset());
307  }
308  }
309 }
int numberOfDevices_
Definition: ROCmService.cc:44
#define hipCheck(ARG,...)
Definition: hipCheck.h:52

Member Function Documentation

◆ computeCapability()

std::pair<int, int> ROCmService::computeCapability ( int  device) const
inline final virtual

Implements ROCmInterface.

Definition at line 34 of file ROCmService.cc.

References computeCapabilities_, findQualityFiles::size, and to_string().

34  {
35  int size = computeCapabilities_.size();
36  if (device < 0 or device >= size) {
37  throw std::out_of_range("Invalid device index" + std::to_string(device) + ": the valid range is from 0 to " +
38  std::to_string(size - 1));
39  }
40  return computeCapabilities_[device];
41  }
size
Write out results.
static std::string to_string(const XMLCh *ch)
std::vector< std::pair< int, int > > computeCapabilities_
Definition: ROCmService.cc:45

◆ enabled()

bool ROCmService::enabled ( ) const
inline final virtual

Implements ROCmInterface.

Definition at line 29 of file ROCmService.cc.

References enabled_.

29 { return enabled_; }

◆ fillDescriptions()

void ROCmService::fillDescriptions ( edm::ConfigurationDescriptions &  descriptions)
static

Definition at line 311 of file ROCmService.cc.

References edm::ConfigurationDescriptions::add(), submitPVResolutionJobs::desc, and TH2PolyOfflineMaps::limits.

311  {
313  desc.addUntracked<bool>("enabled", true);
314  desc.addUntracked<bool>("verbose", false);
315 
316 #if HIP_VERSION >= 50400000
318  limits.addUntracked<int>("hipLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
319  limits.addUntracked<int>("hipLimitMallocHeapSize", -1)
320  ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
321  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
322  ->setComment(
323  "See the documentation of hipDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
324  "the default value.");
325 #endif
326 
327  descriptions.add("ROCmService", desc);
328 }
void add(std::string const &label, ParameterSetDescription const &psetDescription)

◆ numberOfDevices()

int ROCmService::numberOfDevices ( ) const
inline final virtual

Implements ROCmInterface.

Definition at line 31 of file ROCmService.cc.

References numberOfDevices_.

31 { return numberOfDevices_; }
int numberOfDevices_
Definition: ROCmService.cc:44

Member Data Documentation

◆ computeCapabilities_

std::vector<std::pair<int, int> > ROCmService::computeCapabilities_
private

Definition at line 45 of file ROCmService.cc.

Referenced by computeCapability(), and ROCmService().

◆ enabled_

bool ROCmService::enabled_ = false
private

Definition at line 46 of file ROCmService.cc.

Referenced by enabled(), ROCmService(), and ~ROCmService().

◆ numberOfDevices_

int ROCmService::numberOfDevices_ = 0
private

Definition at line 44 of file ROCmService.cc.

Referenced by numberOfDevices(), ROCmService(), and ~ROCmService().

◆ verbose_

bool ROCmService::verbose_ = false
private

Definition at line 47 of file ROCmService.cc.

Referenced by ROCmService().