CMS 3D CMS Logo

List of all members | Classes | Public Member Functions | Static Public Member Functions | Private Member Functions | Private Attributes
TritonService Class Reference

#include <TritonService.h>

Classes

struct  FallbackOpts
 
struct  Model
 
struct  Module
 
struct  Server
 

Public Member Functions

void addModel (const std::string &modelName, const std::string &path)
 
const std::string & pid () const
 
Server serverInfo (const std::string &model, const std::string &preferred="") const
 
 TritonService (const edm::ParameterSet &pset, edm::ActivityRegistry &areg)
 
 ~TritonService ()=default
 

Static Public Member Functions

static void fillDescriptions (edm::ConfigurationDescriptions &descriptions)
 

Private Member Functions

void postEndJob ()
 
void postModuleConstruction (edm::ModuleDescription const &)
 
void preBeginJob (edm::PathsAndConsumesOfModulesBase const &, edm::ProcessContext const &)
 
void preModuleConstruction (edm::ModuleDescription const &)
 
void preModuleDestruction (edm::ModuleDescription const &)
 
template<typename LOG >
void printFallbackServerLog () const
 

Private Attributes

bool allowAddModel_
 
unsigned currentModuleId_
 
FallbackOpts fallbackOpts_
 
std::unordered_map< std::string, Modelmodels_
 
std::unordered_map< unsigned, Modulemodules_
 
std::string pid_
 
std::unordered_map< std::string, Serverservers_
 
bool startedFallback_
 
std::unordered_map< std::string, ModelunservedModels_
 
bool verbose_
 

Detailed Description

Definition at line 27 of file TritonService.h.

Constructor & Destructor Documentation

◆ TritonService()

TritonService::TritonService ( const edm::ParameterSet pset,
edm::ActivityRegistry areg 
)

Definition at line 57 of file TritonService.cc.

References HLTObjectMonitor_Client_cff::client, TritonService::FallbackOpts::enable, Exception, TritonService::Server::fallbackAddress, TritonService::Server::fallbackName, fallbackOpts_, LocalCPU, LocalGPU, ecalBarrelClusterFastTimer_cfi::modelName, models_, mps_check::msg, postEndJob(), postModuleConstruction(), preBeginJob(), preModuleConstruction(), preModuleDestruction(), muonDTDigis_cfi::pset, Remote, contentValuesFiles::server, servers_, AlCaHLTBitMon_QueryRunRegistry::string, TRITON_THROW_IF_ERROR, tier0::unique(), TritonService::FallbackOpts::useGPU, verbose_, edm::ActivityRegistry::watchPostEndJob(), edm::ActivityRegistry::watchPostModuleConstruction(), edm::ActivityRegistry::watchPreBeginJob(), edm::ActivityRegistry::watchPreModuleConstruction(), and edm::ActivityRegistry::watchPreModuleDestruction().

58  : verbose_(pset.getUntrackedParameter<bool>("verbose")),
59  fallbackOpts_(pset.getParameterSet("fallback")),
61  allowAddModel_(false),
62  startedFallback_(false),
63  pid_(std::to_string(::getpid())) {
64  //module construction is assumed to be serial (correct at the time this code was written)
68  //fallback server will be launched (if needed) before beginJob
71 
72  //include fallback server in set if enabled
73  if (fallbackOpts_.enable) {
74  auto serverType = TritonServerType::Remote;
75  if (!fallbackOpts_.useGPU)
76  serverType = TritonServerType::LocalCPU;
77 #ifdef TRITON_ENABLE_GPU
78  else
79  serverType = TritonServerType::LocalGPU;
80 #endif
81 
82  servers_.emplace(std::piecewise_construct,
83  std::forward_as_tuple(Server::fallbackName),
84  std::forward_as_tuple(Server::fallbackName, Server::fallbackAddress, serverType));
85  }
86 
87  //loop over input servers: check which models they have
89  if (verbose_)
90  msg = "List of models for each server:\n";
91  for (const auto& serverPset : pset.getUntrackedParameterSetVector("servers")) {
92  const std::string& serverName(serverPset.getUntrackedParameter<std::string>("name"));
93  //ensure uniqueness
94  auto [sit, unique] = servers_.emplace(serverName, serverPset);
95  if (!unique)
96  throw cms::Exception("DuplicateServer")
97  << "TritonService: Not allowed to specify more than one server with same name (" << serverName << ")";
98  auto& server(sit->second);
99 
100  std::unique_ptr<tc::InferenceServerGrpcClient> client;
102  tc::InferenceServerGrpcClient::Create(&client, server.url, false, server.useSsl, server.sslOptions),
103  "TritonService(): unable to create inference context for " + serverName + " (" + server.url + ")");
104 
105  if (verbose_) {
106  inference::ServerMetadataResponse serverMetaResponse;
107  TRITON_THROW_IF_ERROR(client->ServerMetadata(&serverMetaResponse),
108  "TritonService(): unable to get metadata for " + serverName + " (" + server.url + ")");
109  edm::LogInfo("TritonService") << "Server " << serverName << ": url = " << server.url
110  << ", version = " << serverMetaResponse.version();
111  }
112 
113  inference::RepositoryIndexResponse repoIndexResponse;
115  client->ModelRepositoryIndex(&repoIndexResponse),
116  "TritonService(): unable to get repository index for " + serverName + " (" + server.url + ")");
117 
118  //servers keep track of models and vice versa
119  if (verbose_)
120  msg += serverName + ": ";
121  for (const auto& modelIndex : repoIndexResponse.models()) {
122  const auto& modelName = modelIndex.name();
123  auto mit = models_.find(modelName);
124  if (mit == models_.end())
125  mit = models_.emplace(modelName, "").first;
126  auto& modelInfo(mit->second);
127  modelInfo.servers.insert(serverName);
128  server.models.insert(modelName);
129  if (verbose_)
130  msg += modelName + ", ";
131  }
132  if (verbose_)
133  msg += "\n";
134  }
135  if (verbose_)
136  edm::LogInfo("TritonService") << msg;
137 }
void watchPostModuleConstruction(PostModuleConstruction::slot_type const &iSlot)
std::unordered_map< std::string, Model > models_
void watchPostEndJob(PostEndJob::slot_type const &iSlot)
#define TRITON_THROW_IF_ERROR(X, MSG)
Definition: triton_utils.h:75
void watchPreModuleConstruction(PreModuleConstruction::slot_type const &iSlot)
void postModuleConstruction(edm::ModuleDescription const &)
void watchPreModuleDestruction(PreModuleDestruction::slot_type const &iSlot)
std::string to_string(const V &value)
Definition: OMSAccess.h:71
unsigned currentModuleId_
static const std::string fallbackAddress
Definition: TritonService.h:87
T getUntrackedParameter(std::string const &, T const &) const
FallbackOpts fallbackOpts_
def unique(seq, keepstr=True)
Definition: tier0.py:24
std::string pid_
void preBeginJob(edm::PathsAndConsumesOfModulesBase const &, edm::ProcessContext const &)
void preModuleConstruction(edm::ModuleDescription const &)
Log< level::Info, false > LogInfo
void preModuleDestruction(edm::ModuleDescription const &)
tuple msg
Definition: mps_check.py:286
void watchPreBeginJob(PreBeginJob::slot_type const &iSlot)
convenience function for attaching to signal
std::unordered_map< std::string, Server > servers_
static const std::string fallbackName
Definition: TritonService.h:86

◆ ~TritonService()

TritonService::~TritonService ( )
default

Member Function Documentation

◆ addModel()

void TritonService::addModel ( const std::string &  modelName,
const std::string &  path 
)

Definition at line 144 of file TritonService.cc.

References allowAddModel_, currentModuleId_, Exception, ecalBarrelClusterFastTimer_cfi::modelName, models_, modules_, castor_dqm_sourceclient_file_cfg::path, and unservedModels_.

Referenced by TritonEDFilterT< G, Capabilities >::initializeGlobalCache(), TritonEDProducerT< G, Capabilities >::initializeGlobalCache(), and TritonOneEDAnalyzer< Capabilities >::TritonOneEDAnalyzer().

144  {
145  //should only be called in module constructors
146  if (!allowAddModel_)
147  throw cms::Exception("DisallowedAddModel")
148  << "TritonService: Attempt to call addModel() outside of module constructors";
149  //if model is not in the list, then no specified server provides it
150  auto mit = models_.find(modelName);
151  if (mit == models_.end()) {
152  auto& modelInfo(unservedModels_.emplace(modelName, path).first->second);
153  modelInfo.modules.insert(currentModuleId_);
154  //only keep track of modules that need unserved models
156  }
157 }
std::unordered_map< std::string, Model > models_
std::unordered_map< std::string, Model > unservedModels_
unsigned currentModuleId_
std::unordered_map< unsigned, Module > modules_

◆ fillDescriptions()

void TritonService::fillDescriptions ( edm::ConfigurationDescriptions descriptions)
static

Definition at line 330 of file TritonService.cc.

References edm::ParameterSetDescription::addUntracked(), edm::ConfigurationDescriptions::addWithDefaultLabel(), submitPVResolutionJobs::desc, and AlCaHLTBitMon_QueryRunRegistry::string.

330  {
332  desc.addUntracked<bool>("verbose", false);
333 
335  validator.addUntracked<std::string>("name");
336  validator.addUntracked<std::string>("address");
337  validator.addUntracked<unsigned>("port");
338  validator.addUntracked<bool>("useSsl", false);
339  validator.addUntracked<std::string>("rootCertificates", "");
340  validator.addUntracked<std::string>("privateKey", "");
341  validator.addUntracked<std::string>("certificateChain", "");
342 
343  desc.addVPSetUntracked("servers", validator, {});
344 
345  edm::ParameterSetDescription fallbackDesc;
346  fallbackDesc.addUntracked<bool>("enable", false);
347  fallbackDesc.addUntracked<bool>("debug", false);
348  fallbackDesc.addUntracked<bool>("verbose", false);
349  fallbackDesc.addUntracked<bool>("useDocker", false);
350  fallbackDesc.addUntracked<bool>("useGPU", false);
351  fallbackDesc.addUntracked<int>("retries", -1);
352  fallbackDesc.addUntracked<int>("wait", -1);
353  fallbackDesc.addUntracked<std::string>("instanceBaseName", "triton_server_instance");
354  fallbackDesc.addUntracked<std::string>("instanceName", "");
355  fallbackDesc.addUntracked<std::string>("tempDir", "");
356  fallbackDesc.addUntracked<std::string>("imageName", "");
357  fallbackDesc.addUntracked<std::string>("sandboxName", "");
358  desc.add<edm::ParameterSetDescription>("fallback", fallbackDesc);
359 
360  descriptions.addWithDefaultLabel(desc);
361 }
void addWithDefaultLabel(ParameterSetDescription const &psetDescription)
ParameterDescriptionBase * addUntracked(U const &iLabel, T const &value)

◆ pid()

const std::string& TritonService::pid ( ) const
inline

Definition at line 111 of file TritonService.h.

References pid_.

Referenced by TritonClient::TritonClient().

111 { return pid_; }
std::string pid_

◆ postEndJob()

void TritonService::postEndJob ( )
private

Definition at line 290 of file TritonService.cc.

References mps_check::command, TritonService::FallbackOpts::command, Exception, fallbackOpts_, convertSQLitetoXML_cfg::output, startedFallback_, AlCaHLTBitMon_QueryRunRegistry::string, and verbose_.

Referenced by TritonService().

290  {
291  if (!startedFallback_)
292  return;
293 
295  if (verbose_)
296  edm::LogInfo("TritonService") << command;
297 
298  const auto& [output, rv] = execSys(command);
299  if (rv != 0) {
300  edm::LogError("TritonService") << output;
301  printFallbackServerLog<edm::LogError>();
302  throw cms::Exception("FallbackFailed")
303  << "TritonService: Stopping the fallback server failed with exit code " << rv;
304  } else if (verbose_) {
305  edm::LogInfo("TritonService") << output;
306  printFallbackServerLog<edm::LogInfo>();
307  }
308 }
Log< level::Error, false > LogError
FallbackOpts fallbackOpts_
Log< level::Info, false > LogInfo
list command
Definition: mps_check.py:25

◆ postModuleConstruction()

void TritonService::postModuleConstruction ( edm::ModuleDescription const &  desc)
private

Definition at line 159 of file TritonService.cc.

References allowAddModel_.

Referenced by TritonService().

159 { allowAddModel_ = false; }

◆ preBeginJob()

void TritonService::preBeginJob ( edm::PathsAndConsumesOfModulesBase const &  ,
edm::ProcessContext const &   
)
private

Definition at line 204 of file TritonService.cc.

References mps_check::command, TritonService::FallbackOpts::command, edm::createGlobalIdentifier(), TritonService::FallbackOpts::debug, TritonService::FallbackOpts::enable, Exception, TritonService::Server::fallbackName, fallbackOpts_, caHitNtupletGeneratorKernels::if(), TritonService::FallbackOpts::imageName, TritonService::FallbackOpts::instanceName, ReggeGribovPartonMC_EposLHC_2760GeV_PbPb_cfi::model, ecalBarrelClusterFastTimer_cfi::modelName, models_, mps_check::msg, or, convertSQLitetoXML_cfg::output, pid_, TritonService::FallbackOpts::retries, TritonService::FallbackOpts::sandboxName, contentValuesFiles::server, servers_, startedFallback_, AlCaHLTBitMon_QueryRunRegistry::string, TritonService::FallbackOpts::tempDir, cond::impl::to_string(), unservedModels_, TritonService::FallbackOpts::useDocker, TritonService::FallbackOpts::useGPU, TritonService::FallbackOpts::verbose, verbose_, and TritonService::FallbackOpts::wait.

Referenced by TritonService().

204  {
205  //only need fallback if there are unserved models
206  if (!fallbackOpts_.enable or unservedModels_.empty())
207  return;
208 
210  if (verbose_)
211  msg = "List of models for fallback server: ";
212  //all unserved models are provided by fallback server
213  auto& server(servers_.find(Server::fallbackName)->second);
214  for (const auto& [modelName, model] : unservedModels_) {
215  auto& modelInfo(models_.emplace(modelName, model).first->second);
216  modelInfo.servers.insert(Server::fallbackName);
217  server.models.insert(modelName);
218  if (verbose_)
219  msg += modelName + ", ";
220  }
221  if (verbose_)
222  edm::LogInfo("TritonService") << msg;
223 
224  //assemble server start command
225  fallbackOpts_.command = "cmsTriton -P -1 -p " + pid_;
226  if (fallbackOpts_.debug)
227  fallbackOpts_.command += " -c";
229  fallbackOpts_.command += " -v";
231  fallbackOpts_.command += " -d";
232  if (fallbackOpts_.useGPU)
233  fallbackOpts_.command += " -g";
234  if (!fallbackOpts_.instanceName.empty())
236  if (fallbackOpts_.retries >= 0)
238  if (fallbackOpts_.wait >= 0)
240  for (const auto& [modelName, model] : unservedModels_) {
241  fallbackOpts_.command += " -m " + model.path;
242  }
243  if (!fallbackOpts_.imageName.empty())
245  if (!fallbackOpts_.sandboxName.empty())
247  //don't need this anymore
248  unservedModels_.clear();
249 
250  //get a random temporary directory if none specified
251  if (fallbackOpts_.tempDir.empty()) {
252  auto tmp_dir_path{std::filesystem::temp_directory_path() /= edm::createGlobalIdentifier()};
253  fallbackOpts_.tempDir = tmp_dir_path.string();
254  }
255  //special case ".": use script default (temp dir = .$instanceName)
256  if (fallbackOpts_.tempDir != ".")
258 
260 
261  if (fallbackOpts_.debug)
262  edm::LogInfo("TritonService") << "Fallback server temporary directory: " << fallbackOpts_.tempDir;
263  if (verbose_)
264  edm::LogInfo("TritonService") << command;
265 
266  //mark as started before executing in case of ctrl+c while command is running
267  startedFallback_ = true;
268  const auto& [output, rv] = execSys(command);
269  if (rv != 0) {
270  edm::LogError("TritonService") << output;
271  printFallbackServerLog<edm::LogError>();
272  throw cms::Exception("FallbackFailed")
273  << "TritonService: Starting the fallback server failed with exit code " << rv;
274  } else if (verbose_)
275  edm::LogInfo("TritonService") << output;
276  //get the port
277  const std::string& portIndicator("CMS_TRITON_GRPC_PORT: ");
278  //find last instance in log in case multiple ports were tried
279  auto pos = output.rfind(portIndicator);
280  if (pos != std::string::npos) {
281  auto pos2 = pos + portIndicator.size();
282  auto pos3 = output.find('\n', pos2);
283  const auto& portNum = output.substr(pos2, pos3 - pos2);
284  server.url += ":" + portNum;
285  } else
286  throw cms::Exception("FallbackFailed") << "TritonService: Unknown port for fallback server, log follows:\n"
287  << output;
288 }
std::unordered_map< std::string, Model > models_
std::unordered_map< std::string, Model > unservedModels_
std::string to_string(const V &value)
Definition: OMSAccess.h:71
Log< level::Error, false > LogError
FallbackOpts fallbackOpts_
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventID const &, edm::Timestamp const & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
Definition: Activities.doc:12
std::string pid_
std::string createGlobalIdentifier(bool binary=false)
Log< level::Info, false > LogInfo
tuple msg
Definition: mps_check.py:286
list command
Definition: mps_check.py:25
std::unordered_map< std::string, Server > servers_
static const std::string fallbackName
Definition: TritonService.h:86

◆ preModuleConstruction()

void TritonService::preModuleConstruction ( edm::ModuleDescription const &  desc)
private

Definition at line 139 of file TritonService.cc.

References allowAddModel_, currentModuleId_, and submitPVResolutionJobs::desc.

Referenced by TritonService().

139  {
140  currentModuleId_ = desc.id();
141  allowAddModel_ = true;
142 }
unsigned currentModuleId_

◆ preModuleDestruction()

void TritonService::preModuleDestruction ( edm::ModuleDescription const &  desc)
private

Definition at line 161 of file TritonService.cc.

References submitPVResolutionJobs::desc, modules_, and unservedModels_.

Referenced by TritonService().

161  {
162  //remove destructed modules from unserved list
163  if (unservedModels_.empty())
164  return;
165  auto id = desc.id();
166  auto oit = modules_.find(id);
167  if (oit != modules_.end()) {
168  const auto& moduleInfo(oit->second);
169  auto mit = unservedModels_.find(moduleInfo.model);
170  if (mit != unservedModels_.end()) {
171  auto& modelInfo(mit->second);
172  modelInfo.modules.erase(id);
173  //remove a model if it is no longer needed by any modules
174  if (modelInfo.modules.empty())
175  unservedModels_.erase(mit);
176  }
177  modules_.erase(oit);
178  }
179 }
std::unordered_map< std::string, Model > unservedModels_
std::unordered_map< unsigned, Module > modules_

◆ printFallbackServerLog()

template<typename LOG >
void TritonService::printFallbackServerLog ( ) const
private

Definition at line 311 of file TritonService.cc.

References fallbackOpts_, timingPdfMaker::infile, TritonService::FallbackOpts::instanceName, LOG, heppy_check::logName, and TritonService::FallbackOpts::tempDir.

311  {
312  std::vector<std::string> logNames{"log_" + fallbackOpts_.instanceName + ".log"};
313  //cmsTriton script moves log from temp to current dir in verbose mode or in some cases when auto_stop is called
314  // -> check both places
315  logNames.push_back(fallbackOpts_.tempDir + "/" + logNames[0]);
316  bool foundLog = false;
317  for (const auto& logName : logNames) {
318  std::ifstream infile(logName);
319  if (infile.is_open()) {
320  LOG("TritonService") << "TritonService: server log " << logName << "\n" << infile.rdbuf();
321  foundLog = true;
322  break;
323  }
324  }
325  if (!foundLog)
326  LOG("TritonService") << "TritonService: could not find server log " << logNames[0] << " in current directory or "
328 }
#define LOG(A)
FallbackOpts fallbackOpts_

◆ serverInfo()

TritonService::Server TritonService::serverInfo ( const std::string &  model,
const std::string &  preferred = "" 
) const

Definition at line 182 of file TritonService.cc.

References ReggeGribovPartonMC_EposLHC_2760GeV_PbPb_cfi::model, models_, contentValuesFiles::server, and servers_.

Referenced by TritonClient::TritonClient().

182  {
183  auto mit = models_.find(model);
184  if (mit == models_.end())
185  throw cms::Exception("MissingModel") << "TritonService: There are no servers that provide model " << model;
186  const auto& modelInfo(mit->second);
187  const auto& modelServers = modelInfo.servers;
188 
189  auto msit = modelServers.end();
190  if (!preferred.empty()) {
191  msit = modelServers.find(preferred);
192  //todo: add a "strict" parameter to stop execution if preferred server isn't found?
193  if (msit == modelServers.end())
194  edm::LogWarning("PreferredServer") << "Preferred server " << preferred << " for model " << model
195  << " not available, will choose another server";
196  }
197  const auto& serverName(msit == modelServers.end() ? *modelServers.begin() : preferred);
198 
199  //todo: use some algorithm to select server rather than just picking arbitrarily
200  const auto& server(servers_.find(serverName)->second);
201  return server;
202 }
std::unordered_map< std::string, Model > models_
std::unordered_map< std::string, Server > servers_

Member Data Documentation

◆ allowAddModel_

bool TritonService::allowAddModel_
private

Definition at line 129 of file TritonService.h.

Referenced by addModel(), postModuleConstruction(), and preModuleConstruction().

◆ currentModuleId_

unsigned TritonService::currentModuleId_
private

Definition at line 128 of file TritonService.h.

Referenced by addModel(), and preModuleConstruction().

◆ fallbackOpts_

FallbackOpts TritonService::fallbackOpts_
private

Definition at line 127 of file TritonService.h.

Referenced by postEndJob(), preBeginJob(), printFallbackServerLog(), and TritonService().

◆ models_

std::unordered_map<std::string, Model> TritonService::models_
private

Definition at line 135 of file TritonService.h.

Referenced by addModel(), preBeginJob(), serverInfo(), and TritonService().

◆ modules_

std::unordered_map<unsigned, Module> TritonService::modules_
private

Definition at line 136 of file TritonService.h.

Referenced by addModel(), and preModuleDestruction().

◆ pid_

std::string TritonService::pid_
private

Definition at line 131 of file TritonService.h.

Referenced by pid(), and preBeginJob().

◆ servers_

std::unordered_map<std::string, Server> TritonService::servers_
private

Definition at line 134 of file TritonService.h.

Referenced by preBeginJob(), serverInfo(), and TritonService().

◆ startedFallback_

bool TritonService::startedFallback_
private

Definition at line 130 of file TritonService.h.

Referenced by postEndJob(), and preBeginJob().

◆ unservedModels_

std::unordered_map<std::string, Model> TritonService::unservedModels_
private

Definition at line 132 of file TritonService.h.

Referenced by addModel(), preBeginJob(), and preModuleDestruction().

◆ verbose_

bool TritonService::verbose_
private

Definition at line 126 of file TritonService.h.

Referenced by postEndJob(), preBeginJob(), and TritonService().