CMS 3D CMS Logo

MLClient.cc
Go to the documentation of this file.
2 
4 
6 
8 
10 
12 
14 
15 using namespace cms::Ort;
16 
17 namespace ecaldqm {
18  MLClient::MLClient() : DQWorkerClient() { qualitySummaries_.insert("MLQualitySummary"); }
19 
21  MLThreshold_ = _params.getUntrackedParameter<double>("MLThreshold");
22  PUcorr_slope_ = _params.getUntrackedParameter<double>("PUcorr_slope");
23  PUcorr_intercept_ = _params.getUntrackedParameter<double>("PUcorr_intercept");
24  avgOcc_ = _params.getUntrackedParameter<std::vector<double>>("avgOcc");
25  if (!onlineMode_) {
26  MEs_.erase(std::string("MLQualitySummary"));
27  MEs_.erase(std::string("EventsperMLImage"));
28  sources_.erase(std::string("PU"));
29  sources_.erase(std::string("NumEvents"));
30  sources_.erase(std::string("DigiAllByLumi"));
31  sources_.erase(std::string("AELoss"));
32  }
33  }
34 
36  if (!onlineMode_)
37  return;
38  using namespace std;
39  MESet& meMLQualitySummary(MEs_.at("MLQualitySummary"));
40  MESet& meEventsperMLImage(MEs_.at("EventsperMLImage"));
41 
42  MESetNonObject const& sPU(static_cast<MESetNonObject&>(sources_.at("PU")));
43  MESetNonObject const& sNumEvents(static_cast<MESetNonObject&>(sources_.at("NumEvents")));
44 
45  //Get the no.of events and the PU per LS calculated in OccupancyTask
46  int nEv = sNumEvents.getFloatValue();
47  double pu = sPU.getFloatValue();
48  //Do not compute ML quality if PU is non existent.
49  if (pu < 0.) {
50  return;
51  }
55 
57  //Inorder to feed the data into the ML model we apply some preprocessing.
58  //We use the Digi Occupancy per Lumisection as the input source.
59  //The model was trained on each occupancy plot having 500 events.
60  //In apprehension of the low luminosity in the beginning of Run3, where in online DQM
61  //the no.of events per LS could be lower than 500, we sum the occupancies over a fixed no.of lumisections as a running sum,
62  //and require that the total no.of events on this summed occupancy to be atleast 200.
63  //(This no.of LS and the no.of events are parameters which would require tuning later)
64  //This summed occupancy is now the input image, which is then corrected for PileUp(PU) dependence and
65  //change in no.of events, which are derived from training.
66  //The input image is also padded by replicating the top and bottom rows so as to prevent the "edge effect"
67  //wherein the ML model's learning degrades near the edge of the data set it sees.
68  //This padding is then removed during inference on the model output.
69 
70  //Get the histogram of the input digi occupancy per lumisection.
71  TH2F* hEbDigiMap((sources_.at("DigiAllByLumi")).getME(1)->getTH2F());
72 
73  size_t nTowers = nEtaTowers * nPhiTowers; //Each occupancy map is of size 34x72 towers
74  std::vector<float> ebOccMap1dCumulPad; //Vector to feed into the ML network
75  std::valarray<float> ebOccMap1d(nTowers); //Array to store occupancy map of size 34x72
76  //Store the values from the input histogram into the array
77  //to do preprocessing
78  for (int i = 0; i < hEbDigiMap->GetNbinsY(); i++) { //NbinsY = 34, NbinsX = 72
79  for (int j = 0; j < hEbDigiMap->GetNbinsX(); j++) {
80  int bin = hEbDigiMap->GetBin(j + 1, i + 1);
81  int k = (i * nPhiTowers) + j;
82  ebOccMap1d[k] = hEbDigiMap->GetBinContent(bin);
83  }
84  }
85  ebOccMap1dQ.push_back(ebOccMap1d); //Queue which stores input occupancy maps for nLS lumis
86  NEventQ.push_back(nEv); //Queue which stores the no.of events per LS for nLS lumis
87 
88  if (NEventQ.size() < nLS) {
89  return; //Should have nLS lumis to add the occupancy over.
90  }
91  if (NEventQ.size() > nLS) {
92  NEventQ.pop_front(); //Keep only nLS consecutive LS. Pop the first one if size greater than nLS
93  }
94  if (ebOccMap1dQ.size() > nLS) {
95  ebOccMap1dQ.pop_front(); //Same conditon for the input occupancy maps.
96  }
97 
98  int TNum = 0;
99  for (size_t i = 0; i < nLS; i++) {
100  TNum += NEventQ[i]; //Total no.of events over nLS lumis
101  }
102  if (TNum < 200) {
103  return; //The total no.of events should be atleast 200 over nLS for meaningful statistics
104  }
105  //Fill the ME to monitor the trend of the total no.of events in each input image to the ML model
106  meEventsperMLImage.fill(getEcalDQMSetupObjects(), EcalBarrel, double(timestamp_.iLumi), double(TNum));
107 
108  //Array to hold the sum of inputs, which make atleast 200 events.
109  std::valarray<float> ebOccMap1dCumul(0., nTowers);
110 
111  for (size_t i = 0; i < ebOccMap1dQ.size(); i++) {
112  ebOccMap1dCumul += ebOccMap1dQ[i]; //Sum the input arrays of N LS.
113  }
114  //Applying PU correction derived from training
115  ebOccMap1dCumul = ebOccMap1dCumul / (PUcorr_slope_ * pu + PUcorr_intercept_);
116 
117  //Scaling up to match input dimensions. 36*72 used instead of 34*72 to accommodate the additional padding
118  //of 2 rows to prevent the "edge effect" which is done below
119  ebOccMap1dCumul = ebOccMap1dCumul * (nEtaTowersPad * nPhiTowers);
120 
121  //Correction for no.of events in each input image as originally model trained with 500 events per image
122  ebOccMap1dCumul = ebOccMap1dCumul * (500. / TNum);
123 
124  //The pre-processed input is now fed into the input tensor vector which will go into the ML model
125  ebOccMap1dCumulPad.assign(std::begin(ebOccMap1dCumul), std::end(ebOccMap1dCumul));
126 
127  //Replicate and pad with the first and last row to prevent the edge effect
128  for (int k = 0; k < nPhiTowers; k++) {
129  float val = ebOccMap1dCumulPad[nPhiTowers - 1];
130  ebOccMap1dCumulPad.insert(ebOccMap1dCumulPad.begin(),
131  val); //padding in the beginning with the first row elements
132  }
133 
134  int size = ebOccMap1dCumulPad.size();
135  for (int k = (size - nPhiTowers); k < size; k++) {
136  float val = ebOccMap1dCumulPad[k];
137  ebOccMap1dCumulPad.push_back(val); //padding at the end with the last row elements
138  }
139 
141  //An Autoencoder (AE) network with resnet architecture is used here which is trained on
142  //certified good data (EB digi occupancy) from Run 2018 data.
143  //On giving an input occupancy map, the encoder part of the AE compresses and reduces the input data, learning its features,
144  //and the decoder reconstructs the data from the encoded form into a representation as close to the original input as possible.
145  //We then compute the Mean squared error (MSE) between the input and output image, also called the Reconstruction Loss,
146  //calculated at a tower by tower basis.
147  //Thus, given an anomalous tower the loss should be significantly higher than the loss with respect to good towers, which the model
148  //has already seen --> anomaly detection.
149  //When calculating the loss we also apply a response correction by dividing each input and output image with the average occupancy from
150  //all 2018 data (also to be tuned),to accommodate the difference in response of crystals in different regions of the Ecal Barrel
151  //Further each loss map from each input image is then multiplied by the last N loss maps,
153  //A quality threshold is then applied on this time multiplied loss map, to mark them as GOOD or BAD,
154  //after which it is stored as a quality summary ME.
155 
157  std::string instanceName{"AE-DQM-inference"};
158  std::string modelFilepath = edm::FileInPath("DQM/EcalMonitorClient/data/onnxModels/resnet.onnx").fullPath();
159 
160  Ort::SessionOptions sessionOptions;
161  sessionOptions.SetIntraOpNumThreads(1);
162  Ort::Env env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, instanceName.c_str());
163  Ort::Session session(env, modelFilepath.c_str(), sessionOptions);
164 
165  Ort::AllocatorWithDefaultOptions allocator;
166 
167  const char* inputName = session.GetInputName(0, allocator);
168 
169  Ort::TypeInfo inputTypeInfo = session.GetInputTypeInfo(0);
170  auto inputTensorInfo = inputTypeInfo.GetTensorTypeAndShapeInfo();
171 
172  std::vector<int64_t> inputDims = inputTensorInfo.GetShape();
173 
174  const char* outputName = session.GetOutputName(0, allocator);
175 
176  Ort::TypeInfo outputTypeInfo = session.GetOutputTypeInfo(0);
177  auto outputTensorInfo = outputTypeInfo.GetTensorTypeAndShapeInfo();
178 
179  std::vector<int64_t> outputDims = outputTensorInfo.GetShape();
180 
181  size_t TensorSize = nEtaTowersPad * nPhiTowers;
182  std::vector<float> ebRecoOccMap1dPad(TensorSize); //To store the output reconstructed occupancy
183 
184  std::vector<const char*> inputNames{inputName};
185  std::vector<const char*> outputNames{outputName};
186  std::vector<Ort::Value> inputTensors;
187  std::vector<Ort::Value> outputTensors;
188 
189  Ort::MemoryInfo memoryInfo =
190  Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
191  inputTensors.push_back(Ort::Value::CreateTensor<float>(
192  memoryInfo, ebOccMap1dCumulPad.data(), TensorSize, inputDims.data(), inputDims.size()));
193 
194  outputTensors.push_back(Ort::Value::CreateTensor<float>(
195  memoryInfo, ebRecoOccMap1dPad.data(), TensorSize, outputDims.data(), outputDims.size()));
196 
197  session.Run(Ort::RunOptions{nullptr},
198  inputNames.data(),
199  inputTensors.data(),
200  1,
201  outputNames.data(),
202  outputTensors.data(),
203  1);
204 
206  //2D Loss map to store tower by tower loss between the output (reconstructed) and input occupancies,
207  //Have same dimensions as the occupancy plot
208  std::valarray<std::valarray<float>> lossMap2d(std::valarray<float>(nPhiTowers), nEtaTowers);
209 
210  //1D val arrays to store row wise information corresponding to the reconstructed, input and average occupancies, and loss.
211  //and to do element wise (tower wise) operations on them to calculate the MSE loss between the reco and input occupancy.
212  std::valarray<float> recoOcc1d(0., nPhiTowers);
213  std::valarray<float> inputOcc1d(0., nPhiTowers);
214  std::valarray<float> avgOcc1d(0., nPhiTowers);
215  std::valarray<float> loss_;
216 
217  //Loss calculation
218  //Ignore the top and bottom replicated padded rows when doing inference
219  //by making index i run over (1,35) instead of (0,36)
220  for (int i = 1; i < 35; i++) {
221  for (int j = 0; j < nPhiTowers; j++) {
222  int k = (i * nPhiTowers) + j;
223  recoOcc1d[j] = ebRecoOccMap1dPad[k];
224  inputOcc1d[j] = ebOccMap1dCumulPad[k];
225  avgOcc1d[j] = avgOcc_[k];
226  }
227  //Calculate the MSE loss = (output-input)^2, with avg response correction
228  loss_ = std::pow((recoOcc1d / avgOcc1d - inputOcc1d / avgOcc1d), 2);
229  lossMap2d[i - 1] = (loss_);
230  }
231 
232  lossMap2dQ.push_back(lossMap2d); //Store each loss map from the output in the queue
233  if (lossMap2dQ.size() > nLSloss) {
234  lossMap2dQ.pop_front(); //Keep exactly nLSloss loss maps to multiply
235  }
236  if (lossMap2dQ.size() < nLSloss) { //Exit if there are not nLSloss loss maps
237  return;
238  }
239  //To hold the final multiplied loss
240  std::valarray<std::valarray<float>> lossMap2dMult(std::valarray<float>(1., nPhiTowers), nEtaTowers);
241 
242  //Multiply together the last nLSloss loss maps
243  //So that real anomalies which persist with time are enhanced and fluctuations are suppressed.
244  for (size_t i = 0; i < lossMap2dQ.size(); i++) {
245  lossMap2dMult *= lossMap2dQ[i];
246  }
247 
248  //Fill the AELoss ME with the values of this time multiplied loss map
249  MESet const& sAELoss(sources_.at("AELoss"));
250  TH2F* hLossMap2dMult(sAELoss.getME(1)->getTH2F());
251  for (int i = 0; i < hLossMap2dMult->GetNbinsY(); i++) {
252  for (int j = 0; j < hLossMap2dMult->GetNbinsX(); j++) {
253  int bin_ = hLossMap2dMult->GetBin(j + 1, i + 1);
254  double content = lossMap2dMult[i][j];
255  hLossMap2dMult->SetBinContent(bin_, content);
256  }
257  }
259  //Apply the quality threshold on the time multiplied loss map stored in the ME AELoss
260  //If anomalous, the tower entry will have a large loss value. If good, the value will be close to zero.
261 
262  MESet::const_iterator dAEnd(sAELoss.end(GetElectronicsMap()));
263  for (MESet::const_iterator dItr(sAELoss.beginChannel(GetElectronicsMap())); dItr != dAEnd;
265  DetId id(dItr->getId());
266 
267  bool doMaskML(meMLQualitySummary.maskMatches(id, mask, statusManager_, GetTrigTowerMap()));
268 
269  float entries(dItr->getBinContent());
270  int quality(doMaskML ? kMGood : kGood);
271  //If a trigger tower entry is greater than the ML threshold, set it to Bad quality, otherwise Good.
272  if (entries > MLThreshold_) {
273  quality = doMaskML ? kMBad : kBad;
274  }
275  //Fill the quality summary with the quality of the given tower id.
276  meMLQualitySummary.setBinContent(getEcalDQMSetupObjects(), id, double(quality));
277  } // ML Quality Summary
278  } // producePlots()
279 
281 } // namespace ecaldqm
size
Write out results.
static const int nEtaTowersPad
Definition: MLClient.h:26
void setParams(edm::ParameterSet const &) override
Definition: MLClient.cc:20
#define DEFINE_ECALDQM_WORKER(TYPE)
Definition: DQWorker.h:168
float MLThreshold_
Definition: MLClient.h:27
edm::LuminosityBlockNumber_t iLumi
Definition: DQWorker.h:48
static const int PHYSICS_BAD_CHANNEL_WARNING
std::string fullPath() const
Definition: FileInPath.cc:161
MESet & at(const std::string &key)
Definition: MESet.h:399
static const int PEDESTAL_ONLINE_HIGH_GAIN_RMS_ERROR
static const int PHYSICS_BAD_CHANNEL_ERROR
void producePlots(ProcessType) override
Definition: MLClient.cc:35
constexpr uint32_t mask
Definition: gpuClustering.h:24
double getFloatValue() const
std::set< std::string > qualitySummaries_
StatusManager const * statusManager_
std::deque< std::valarray< std::valarray< float > > > lossMap2dQ
Definition: MLClient.h:36
static const int nEtaTowers
Definition: MLClient.h:23
size_t nLSloss
Definition: MLClient.h:31
MESetCollection sources_
EcalElectronicsMapping const * GetElectronicsMap()
Definition: DQWorker.cc:150
EcalDQMSetupObjects const getEcalDQMSetupObjects()
Definition: DQWorker.cc:170
Definition: DetId.h:17
Timestamp timestamp_
Definition: DQWorker.h:134
MESetCollection MEs_
Definition: DQWorker.h:131
std::deque< int > NEventQ
Definition: MLClient.h:33
float PUcorr_intercept_
Definition: MLClient.h:29
const_iterator & toNextChannel(EcalElectronicsMapping const *)
Definition: MESet.cc:413
float PUcorr_slope_
Definition: MLClient.h:28
EcalTrigTowerConstituentsMap const * GetTrigTowerMap()
Definition: DQWorker.cc:155
static const int nPhiTowers
Definition: MLClient.h:24
string quality
void erase(const std::string &key)
Definition: MESet.h:390
std::deque< std::valarray< float > > ebOccMap1dQ
Definition: MLClient.h:34
Power< A, B >::type pow(const A &a, const B &b)
Definition: Power.h:29
std::vector< double > avgOcc_
Definition: MLClient.h:35