CMS 3D CMS Logo

MLClient.cc
Go to the documentation of this file.
2 
4 
6 
8 
10 
12 
14 
15 #include <string>
16 
17 using namespace cms::Ort;
18 
19 namespace ecaldqm {
20  MLClient::MLClient() : DQWorkerClient() { qualitySummaries_.insert("MLQualitySummary"); }
21 
23  MLThreshold_ = _params.getUntrackedParameter<double>("MLThreshold");
24  PUcorr_slope_ = _params.getUntrackedParameter<double>("PUcorr_slope");
25  PUcorr_intercept_ = _params.getUntrackedParameter<double>("PUcorr_intercept");
26  avgOcc_ = _params.getUntrackedParameter<std::vector<double>>("avgOcc");
27  if (!onlineMode_) {
28  MEs_.erase(std::string("MLQualitySummary"));
29  MEs_.erase(std::string("EventsperMLImage"));
30  sources_.erase(std::string("PU"));
31  sources_.erase(std::string("NumEvents"));
32  sources_.erase(std::string("DigiAllByLumi"));
33  sources_.erase(std::string("AELoss"));
34  }
35  }
36 
38  if (!onlineMode_)
39  return;
40  using namespace std;
41  MESet& meMLQualitySummary(MEs_.at("MLQualitySummary"));
42  MESet& meEventsperMLImage(MEs_.at("EventsperMLImage"));
43 
44  MESetNonObject const& sPU(static_cast<MESetNonObject&>(sources_.at("PU")));
45  MESetNonObject const& sNumEvents(static_cast<MESetNonObject&>(sources_.at("NumEvents")));
46 
47  //Get the no.of events and the PU per LS calculated in OccupancyTask
48  int nEv = sNumEvents.getFloatValue();
49  double pu = sPU.getFloatValue();
50  //Do not compute ML quality if PU is non existent.
51  if (pu < 0.) {
52  return;
53  }
57 
59  //Inorder to feed the data into the ML model we apply some preprocessing.
60  //We use the Digi Occupancy per Lumisection as the input source.
61  //The model was trained on each occupancy plot having 500 events.
62  //In apprehension of the low luminosity in the beginning of Run3, where in online DQM
63  //the no.of events per LS could be lower than 500, we sum the occupancies over a fixed no.of lumisections as a running sum,
64  //and require that the total no.of events on this summed occupancy to be atleast 200.
65  //(This no.of LS and the no.of events are parameters which would require tuning later)
66  //This summed occupancy is now the input image, which is then corrected for PileUp(PU) dependence and
67  //change in no.of events, which are derived from training.
68  //The input image is also padded by replicating the top and bottom rows so as to prevent the "edge effect"
69  //wherein the ML model's learning degrades near the edge of the data set it sees.
70  //This padding is then removed during inference on the model output.
71 
72  //Get the histogram of the input digi occupancy per lumisection.
73  TH2F* hEbDigiMap((sources_.at("DigiAllByLumi")).getME(1)->getTH2F());
74 
75  size_t nTowers = nEtaTowers * nPhiTowers; //Each occupancy map is of size 34x72 towers
76  std::vector<float> ebOccMap1dCumulPad; //Vector to feed into the ML network
77  std::valarray<float> ebOccMap1d(nTowers); //Array to store occupancy map of size 34x72
78  //Store the values from the input histogram into the array
79  //to do preprocessing
80  for (int i = 0; i < hEbDigiMap->GetNbinsY(); i++) { //NbinsY = 34, NbinsX = 72
81  for (int j = 0; j < hEbDigiMap->GetNbinsX(); j++) {
82  int bin = hEbDigiMap->GetBin(j + 1, i + 1);
83  int k = (i * nPhiTowers) + j;
84  ebOccMap1d[k] = hEbDigiMap->GetBinContent(bin);
85  }
86  }
87  ebOccMap1dQ.push_back(ebOccMap1d); //Queue which stores input occupancy maps for nLS lumis
88  NEventQ.push_back(nEv); //Queue which stores the no.of events per LS for nLS lumis
89 
90  if (NEventQ.size() < nLS) {
91  return; //Should have nLS lumis to add the occupancy over.
92  }
93  if (NEventQ.size() > nLS) {
94  NEventQ.pop_front(); //Keep only nLS consecutive LS. Pop the first one if size greater than nLS
95  }
96  if (ebOccMap1dQ.size() > nLS) {
97  ebOccMap1dQ.pop_front(); //Same conditon for the input occupancy maps.
98  }
99 
100  int TNum = 0;
101  for (size_t i = 0; i < nLS; i++) {
102  TNum += NEventQ[i]; //Total no.of events over nLS lumis
103  }
104  if (TNum < 200) {
105  return; //The total no.of events should be atleast 200 over nLS for meaningful statistics
106  }
107  //Fill the ME to monitor the trend of the total no.of events in each input image to the ML model
108  meEventsperMLImage.fill(getEcalDQMSetupObjects(), EcalBarrel, double(timestamp_.iLumi), double(TNum));
109 
110  //Array to hold the sum of inputs, which make atleast 200 events.
111  std::valarray<float> ebOccMap1dCumul(0., nTowers);
112 
113  for (size_t i = 0; i < ebOccMap1dQ.size(); i++) {
114  ebOccMap1dCumul += ebOccMap1dQ[i]; //Sum the input arrays of N LS.
115  }
116  //Applying PU correction derived from training
117  ebOccMap1dCumul = ebOccMap1dCumul / (PUcorr_slope_ * pu + PUcorr_intercept_);
118 
119  //Scaling up to match input dimensions. 36*72 used instead of 34*72 to accommodate the additional padding
120  //of 2 rows to prevent the "edge effect" which is done below
121  ebOccMap1dCumul = ebOccMap1dCumul * (nEtaTowersPad * nPhiTowers);
122 
123  //Correction for no.of events in each input image as originally model trained with 500 events per image
124  ebOccMap1dCumul = ebOccMap1dCumul * (500. / TNum);
125 
126  //The pre-processed input is now fed into the input tensor vector which will go into the ML model
127  ebOccMap1dCumulPad.assign(std::begin(ebOccMap1dCumul), std::end(ebOccMap1dCumul));
128 
129  //Replicate and pad with the first and last row to prevent the edge effect
130  for (int k = 0; k < nPhiTowers; k++) {
131  float val = ebOccMap1dCumulPad[nPhiTowers - 1];
132  ebOccMap1dCumulPad.insert(ebOccMap1dCumulPad.begin(),
133  val); //padding in the beginning with the first row elements
134  }
135 
136  int size = ebOccMap1dCumulPad.size();
137  for (int k = (size - nPhiTowers); k < size; k++) {
138  float val = ebOccMap1dCumulPad[k];
139  ebOccMap1dCumulPad.push_back(val); //padding at the end with the last row elements
140  }
141 
143  //An Autoencoder (AE) network with resnet architecture is used here which is trained on
144  //certified good data (EB digi occupancy) from Run 2018 data.
145  //On giving an input occupancy map, the encoder part of the AE compresses and reduces the input data, learning its features,
146  //and the decoder reconstructs the data from the encoded form into a representation as close to the original input as possible.
147  //We then compute the Mean squared error (MSE) between the input and output image, also called the Reconstruction Loss,
148  //calculated at a tower by tower basis.
149  //Thus, given an anomalous tower the loss should be significantly higher than the loss with respect to good towers, which the model
150  //has already seen --> anomaly detection.
151  //When calculating the loss we also apply a response correction by dividing each input and output image with the average occupancy from
152  //all 2018 data (also to be tuned),to accommodate the difference in response of crystals in different regions of the Ecal Barrel
153  //Further each loss map from each input image is then multiplied by the last N loss maps,
155  //A quality threshold is then applied on this time multiplied loss map, to mark them as GOOD or BAD,
156  //after which it is stored as a quality summary ME.
157 
159  std::string instanceName{"AE-DQM-inference"};
160  std::string modelFilepath = edm::FileInPath("DQM/EcalMonitorClient/data/onnxModels/resnet.onnx").fullPath();
161 
162  Ort::SessionOptions sessionOptions;
163  sessionOptions.SetIntraOpNumThreads(1);
164  Ort::Env env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, instanceName.c_str());
165  Ort::Session session(env, modelFilepath.c_str(), sessionOptions);
166 
167  Ort::AllocatorWithDefaultOptions allocator;
168 
169  // Strings returned by session.GetInputNameAllocated are temporary, need to copy them before they are deallocated
170  std::string inputName{session.GetInputNameAllocated(0, allocator).get()};
171 
172  Ort::TypeInfo inputTypeInfo = session.GetInputTypeInfo(0);
173  auto inputTensorInfo = inputTypeInfo.GetTensorTypeAndShapeInfo();
174 
175  std::vector<int64_t> inputDims = inputTensorInfo.GetShape();
176 
177  std::string outputName{session.GetOutputNameAllocated(0, allocator).get()};
178 
179  Ort::TypeInfo outputTypeInfo = session.GetOutputTypeInfo(0);
180  auto outputTensorInfo = outputTypeInfo.GetTensorTypeAndShapeInfo();
181 
182  std::vector<int64_t> outputDims = outputTensorInfo.GetShape();
183 
184  size_t TensorSize = nEtaTowersPad * nPhiTowers;
185  std::vector<float> ebRecoOccMap1dPad(TensorSize); //To store the output reconstructed occupancy
186 
187  std::vector<const char*> inputNames{inputName.c_str()};
188  std::vector<const char*> outputNames{outputName.c_str()};
189  std::vector<Ort::Value> inputTensors;
190  std::vector<Ort::Value> outputTensors;
191 
192  Ort::MemoryInfo memoryInfo =
193  Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
194  inputTensors.push_back(Ort::Value::CreateTensor<float>(
195  memoryInfo, ebOccMap1dCumulPad.data(), TensorSize, inputDims.data(), inputDims.size()));
196 
197  outputTensors.push_back(Ort::Value::CreateTensor<float>(
198  memoryInfo, ebRecoOccMap1dPad.data(), TensorSize, outputDims.data(), outputDims.size()));
199 
200  session.Run(Ort::RunOptions{nullptr},
201  inputNames.data(),
202  inputTensors.data(),
203  1,
204  outputNames.data(),
205  outputTensors.data(),
206  1);
207 
209  //2D Loss map to store tower by tower loss between the output (reconstructed) and input occupancies,
210  //Have same dimensions as the occupancy plot
211  std::valarray<std::valarray<float>> lossMap2d(std::valarray<float>(nPhiTowers), nEtaTowers);
212 
213  //1D val arrays to store row wise information corresponding to the reconstructed, input and average occupancies, and loss.
214  //and to do element wise (tower wise) operations on them to calculate the MSE loss between the reco and input occupancy.
215  std::valarray<float> recoOcc1d(0., nPhiTowers);
216  std::valarray<float> inputOcc1d(0., nPhiTowers);
217  std::valarray<float> avgOcc1d(0., nPhiTowers);
218  std::valarray<float> loss_;
219 
220  //Loss calculation
221  //Ignore the top and bottom replicated padded rows when doing inference
222  //by making index i run over (1,35) instead of (0,36)
223  for (int i = 1; i < 35; i++) {
224  for (int j = 0; j < nPhiTowers; j++) {
225  int k = (i * nPhiTowers) + j;
226  recoOcc1d[j] = ebRecoOccMap1dPad[k];
227  inputOcc1d[j] = ebOccMap1dCumulPad[k];
228  avgOcc1d[j] = avgOcc_[k];
229  }
230  //Calculate the MSE loss = (output-input)^2, with avg response correction
231  loss_ = std::pow((recoOcc1d / avgOcc1d - inputOcc1d / avgOcc1d), 2);
232  lossMap2d[i - 1] = (loss_);
233  }
234 
235  lossMap2dQ.push_back(lossMap2d); //Store each loss map from the output in the queue
236  if (lossMap2dQ.size() > nLSloss) {
237  lossMap2dQ.pop_front(); //Keep exactly nLSloss loss maps to multiply
238  }
239  if (lossMap2dQ.size() < nLSloss) { //Exit if there are not nLSloss loss maps
240  return;
241  }
242  //To hold the final multiplied loss
243  std::valarray<std::valarray<float>> lossMap2dMult(std::valarray<float>(1., nPhiTowers), nEtaTowers);
244 
245  //Multiply together the last nLSloss loss maps
246  //So that real anomalies which persist with time are enhanced and fluctuations are suppressed.
247  for (size_t i = 0; i < lossMap2dQ.size(); i++) {
248  lossMap2dMult *= lossMap2dQ[i];
249  }
250 
251  //Fill the AELoss ME with the values of this time multiplied loss map
252  MESet const& sAELoss(sources_.at("AELoss"));
253  TH2F* hLossMap2dMult(sAELoss.getME(1)->getTH2F());
254  for (int i = 0; i < hLossMap2dMult->GetNbinsY(); i++) {
255  for (int j = 0; j < hLossMap2dMult->GetNbinsX(); j++) {
256  int bin_ = hLossMap2dMult->GetBin(j + 1, i + 1);
257  double content = lossMap2dMult[i][j];
258  hLossMap2dMult->SetBinContent(bin_, content);
259  }
260  }
262  //Apply the quality threshold on the time multiplied loss map stored in the ME AELoss
263  //If anomalous, the tower entry will have a large loss value. If good, the value will be close to zero.
264 
265  MESet::const_iterator dAEnd(sAELoss.end(GetElectronicsMap()));
266  for (MESet::const_iterator dItr(sAELoss.beginChannel(GetElectronicsMap())); dItr != dAEnd;
268  DetId id(dItr->getId());
269 
270  bool doMaskML(meMLQualitySummary.maskMatches(id, mask, statusManager_, GetTrigTowerMap()));
271 
272  float entries(dItr->getBinContent());
273  int quality(doMaskML ? kMGood : kGood);
274  //If a trigger tower entry is greater than the ML threshold, set it to Bad quality, otherwise Good.
275  if (entries > MLThreshold_) {
276  quality = doMaskML ? kMBad : kBad;
277  }
278  //Fill the quality summary with the quality of the given tower id.
279  meMLQualitySummary.setBinContent(getEcalDQMSetupObjects(), id, double(quality));
280  } // ML Quality Summary
281  } // producePlots()
282 
284 } // namespace ecaldqm
size
Write out results.
static const int nEtaTowersPad
Definition: MLClient.h:26
void setParams(edm::ParameterSet const &) override
Definition: MLClient.cc:22
#define DEFINE_ECALDQM_WORKER(TYPE)
Definition: DQWorker.h:168
float MLThreshold_
Definition: MLClient.h:27
edm::LuminosityBlockNumber_t iLumi
Definition: DQWorker.h:48
static const int PHYSICS_BAD_CHANNEL_WARNING
std::string fullPath() const
Definition: FileInPath.cc:161
MESet & at(const std::string &key)
Definition: MESet.h:399
static const int PEDESTAL_ONLINE_HIGH_GAIN_RMS_ERROR
static const int PHYSICS_BAD_CHANNEL_ERROR
void producePlots(ProcessType) override
Definition: MLClient.cc:37
constexpr uint32_t mask
Definition: gpuClustering.h:26
string quality
double getFloatValue() const
std::set< std::string > qualitySummaries_
StatusManager const * statusManager_
std::deque< std::valarray< std::valarray< float > > > lossMap2dQ
Definition: MLClient.h:36
static const int nEtaTowers
Definition: MLClient.h:23
size_t nLSloss
Definition: MLClient.h:31
MESetCollection sources_
EcalElectronicsMapping const * GetElectronicsMap()
Definition: DQWorker.cc:150
EcalDQMSetupObjects const getEcalDQMSetupObjects()
Definition: DQWorker.cc:170
Definition: DetId.h:17
Timestamp timestamp_
Definition: DQWorker.h:134
MESetCollection MEs_
Definition: DQWorker.h:131
std::deque< int > NEventQ
Definition: MLClient.h:33
float PUcorr_intercept_
Definition: MLClient.h:29
const_iterator & toNextChannel(EcalElectronicsMapping const *)
Definition: MESet.cc:413
float PUcorr_slope_
Definition: MLClient.h:28
EcalTrigTowerConstituentsMap const * GetTrigTowerMap()
Definition: DQWorker.cc:155
static const int nPhiTowers
Definition: MLClient.h:24
void erase(const std::string &key)
Definition: MESet.h:390
std::deque< std::valarray< float > > ebOccMap1dQ
Definition: MLClient.h:34
Power< A, B >::type pow(const A &a, const B &b)
Definition: Power.h:29
std::vector< double > avgOcc_
Definition: MLClient.h:35