#include "grpc_client.h"
#include "grpc_service.pb.h"
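// TritonClient constructor (excerpted below): reads the client configuration, resolves the
// server address, creates the gRPC inference client, and queries the model configuration and
// metadata; the helper calls on omitted lines are presumably error-checked via triton_utils,
// whose messages appear as the string fragments below.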
      verbose_(params.getUntrackedParameter<bool>("verbose")),
  const auto& [url, isFallbackCPU] =

      "TritonClient(): unable to create inference context");
  options_.client_timeout_ = params.getUntrackedParameter<unsigned>("timeout") * 1e6;
  inference::ModelConfigResponse modelConfigResponse;
      "TritonClient(): unable to get model config");
  inference::ModelConfig modelConfig(modelConfigResponse.config());
  inference::ModelMetadataResponse modelMetadata;
      "TritonClient(): unable to get model metadata");
  const auto& nicInputs = modelMetadata.inputs();
  const auto& nicOutputs = modelMetadata.outputs();

  std::stringstream msg;
  if (nicInputs.empty())
    msg << "Model on server appears malformed (zero inputs)\n";

  if (nicOutputs.empty())
    msg << "Model on server appears malformed (zero outputs)\n";
  std::stringstream io_msg;
  io_msg << "Model inputs: "
  for (const auto& nicInput : nicInputs) {
    const auto& iname = nicInput.name();
        std::piecewise_construct, std::forward_as_tuple(iname), std::forward_as_tuple(iname, nicInput, noBatch_));
    auto& curr_input = curr_itr->second;
      io_msg << " " << iname << " (" << curr_input.dname() << ", " << curr_input.byteSize()
  const auto& v_outputs = params.getUntrackedParameter<std::vector<std::string>>("outputs");
  std::unordered_set<std::string> s_outputs(v_outputs.begin(), v_outputs.end());
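  // the optional "outputs" parameter restricts which model outputs are kept; an empty list keeps all of them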
  io_msg << "Model outputs: "
  for (const auto& nicOutput : nicOutputs) {
    const auto& oname = nicOutput.name();
    if (!s_outputs.empty() and s_outputs.find(oname) == s_outputs.end())
        std::piecewise_construct, std::forward_as_tuple(oname), std::forward_as_tuple(oname, nicOutput, noBatch_));
    auto& curr_output = curr_itr->second;
      io_msg << " " << oname << " (" << curr_output.dname() << ", " << curr_output.byteSize()
    if (!s_outputs.empty())
      s_outputs.erase(oname);

  if (!s_outputs.empty())
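  // any names left in s_outputs were requested but not found in the model metadata; presumably this raises an exception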
  std::stringstream model_msg;
  model_msg << "Model name: " << options_.model_name_ << "\n"
            << "Model version: " << options_.model_version_ << "\n"
  for (auto& element : input_) {
    element.second.setBatchSize(bsize);
  }
  for (auto& element : output_) {
    element.second.setBatchSize(bsize);
  }
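// reset: clear per-call state in every input and output between requests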
  for (auto& element : input_) {
    element.second.reset();
  }
  for (auto& element : output_) {
    element.second.reset();
  }
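// getResults(): for outputs with variable dimensions, the concrete shape returned by the server must be queried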
    if (output.variableDims()) {
      std::vector<int64_t> tmp_shape;
          "getResults(): unable to get output shape for " + oname);
      output.setShape(tmp_shape, false);
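// evaluate(), asynchronous path: the callback below takes ownership of the InferResult,
// checks its status, and reports the round-trip ("remote") time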
        [t1, start_status, this](nic::InferResult* results) {
          std::shared_ptr<nic::InferResult> results_ptr(results);
          bool status = triton_utils::warnIfError(results_ptr->RequestStatus(), "evaluate(): unable to get result");
              << "Remote time: " << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
242 "evaluate(): unable to launch async run");
252 "evaluate(): unable to run and/or get result");
    if (!debugName_.empty())
          << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
    const auto& end_status = getServerSideStatus();

      const auto& stats = summarizeServerStats(start_status, end_status);
      reportServerSideStats(stats);

    std::shared_ptr<nic::InferResult> results_ptr(results);
    status = getResults(results_ptr);
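// reportServerSideStats(): print the server-side counters accumulated for this model during the request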
  std::stringstream msg;

  msg << " Inference count: " << stats.inference_count_ << "\n";
  msg << " Execution count: " << stats.execution_count_ << "\n";
  msg << " Successful request count: " << count << "\n";
      return tval / us_to_ns / count;
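    // get_avg_us converts a cumulative time in nanoseconds into an average in microseconds per successful request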
    const uint64_t cumm_avg_us = get_avg_us(stats.cumm_time_ns_);
    const uint64_t queue_avg_us = get_avg_us(stats.queue_time_ns_);
    const uint64_t compute_input_avg_us = get_avg_us(stats.compute_input_time_ns_);
    const uint64_t compute_infer_avg_us = get_avg_us(stats.compute_infer_time_ns_);
    const uint64_t compute_output_avg_us = get_avg_us(stats.compute_output_time_ns_);
    const uint64_t compute_avg_us = compute_input_avg_us + compute_infer_avg_us + compute_output_avg_us;
        (cumm_avg_us > queue_avg_us + compute_avg_us) ? (cumm_avg_us - queue_avg_us - compute_avg_us) : 0;
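    // overhead is whatever part of the average request latency is not accounted for by queueing or compute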
    msg << " Avg request latency: " << cumm_avg_us << " usec"
        << " (overhead " << overhead << " usec + "
        << "queue " << queue_avg_us << " usec + "
        << "compute input " << compute_input_avg_us << " usec + "
        << "compute infer " << compute_infer_avg_us << " usec + "
        << "compute output " << compute_output_avg_us << " usec)" << std::endl;
                                                const inference::ModelStatistics& end_status) const {
  server_stats.inference_count_ = end_status.inference_count() - start_status.inference_count();
  server_stats.execution_count_ = end_status.execution_count() - start_status.execution_count();
      end_status.inference_stats().success().count() - start_status.inference_stats().success().count();
      end_status.inference_stats().success().ns() - start_status.inference_stats().success().ns();
  server_stats.queue_time_ns_ = end_status.inference_stats().queue().ns() - start_status.inference_stats().queue().ns();
      end_status.inference_stats().compute_input().ns() - start_status.inference_stats().compute_input().ns();
      end_status.inference_stats().compute_infer().ns() - start_status.inference_stats().compute_infer().ns();
      end_status.inference_stats().compute_output().ns() - start_status.inference_stats().compute_output().ns();
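// getServerSideStatus(): ask the server for the current per-model statistics, falling back to an
// empty ModelStatistics object if no statistics are retrieved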
    inference::ModelStatisticsResponse resp;
        "getServerSideStatus(): unable to get model statistics");
    return *(resp.model_stats().begin());

  return inference::ModelStatistics{};
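// configuration description: the untracked "outputs" parameter defaults to an empty list,
// i.e. all model outputs are kept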
  descClient.addUntracked<std::vector<std::string>>("outputs", {});