CMS 3D CMS Logo

Mahi.dev.cc
Go to the documentation of this file.
1 #include <iostream>
2 #include <alpaka/alpaka.hpp>
3 #include <Eigen/Dense>
4 
7 
9 // needed to compile with USER_CXXFLAGS="-DCOMPUTE_TDC_TIME"
12 // TODO reuse some of the HCAL constats from
13 //#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConstants.h"
14 
15 #include "Mahi.h"
16 
17 #ifdef HCAL_MAHI_GPUDEBUG
18 #define DETID_TO_DEBUG 1125647428
19 #endif
20 
22 
23  using namespace cms::alpakatools;
25  namespace mahi {
26 
27  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE float uint_as_float(uint32_t val) {
28 #if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
29  return __uint_as_float(val);
30 #else
31  return edm::bit_cast<float>(val);
32 #endif
33  }
34 
35  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE uint32_t float_as_uint(float val) {
36 #if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
37  return __float_as_uint(val);
38 #else
39  return edm::bit_cast<unsigned int>(val);
40 #endif
41  }
42 
43  ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_time_slew_delay(float const fC,
44  float const tzero,
45  float const slope,
46  float const tmax) {
47  auto const rawDelay = tzero + slope * std::log(fC);
48  return rawDelay < 0 ? 0 : (rawDelay > tmax ? tmax : rawDelay);
49  }
50 
51  // HcalQIEShapes are hardcoded in HcalQIEData.cc basically
52  // + some logic to generate 128 and 256 value arrays...
53  ALPAKA_STATIC_ACC_MEM_CONSTANT float const qie8shape[129] = {
54  -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16,
55  18, 20, 22, 24, 26, 28, 31, 34, 37, 40, 44, 48, 52, 57, 62, 57, 62,
56  67, 72, 77, 82, 87, 92, 97, 102, 107, 112, 117, 122, 127, 132, 142, 152, 162,
57  172, 182, 192, 202, 217, 232, 247, 262, 282, 302, 322, 347, 372, 347, 372, 397, 422,
58  447, 472, 497, 522, 547, 572, 597, 622, 647, 672, 697, 722, 772, 822, 872, 922, 972,
59  1022, 1072, 1147, 1222, 1297, 1372, 1472, 1572, 1672, 1797, 1922, 1797, 1922, 2047, 2172, 2297, 2422,
60  2547, 2672, 2797, 2922, 3047, 3172, 3297, 3422, 3547, 3672, 3922, 4172, 4422, 4672, 4922, 5172, 5422,
61  5797, 6172, 6547, 6922, 7422, 7922, 8422, 9047, 9672, 10297};
62 
63  ALPAKA_STATIC_ACC_MEM_CONSTANT float const qie11shape[257] = {
64  -0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5,
65  11.5, 12.5, 13.5, 14.5, 15.5, 17.5, 19.5, 21.5, 23.5, 25.5, 27.5, 29.5,
66  31.5, 33.5, 35.5, 37.5, 39.5, 41.5, 43.5, 45.5, 47.5, 49.5, 51.5, 53.5,
67  55.5, 59.5, 63.5, 67.5, 71.5, 75.5, 79.5, 83.5, 87.5, 91.5, 95.5, 99.5,
68  103.5, 107.5, 111.5, 115.5, 119.5, 123.5, 127.5, 131.5, 135.5, 139.5, 147.5, 155.5,
69  163.5, 171.5, 179.5, 187.5, 171.5, 179.5, 187.5, 195.5, 203.5, 211.5, 219.5, 227.5,
70  235.5, 243.5, 251.5, 259.5, 267.5, 275.5, 283.5, 291.5, 299.5, 315.5, 331.5, 347.5,
71  363.5, 379.5, 395.5, 411.5, 427.5, 443.5, 459.5, 475.5, 491.5, 507.5, 523.5, 539.5,
72  555.5, 571.5, 587.5, 603.5, 619.5, 651.5, 683.5, 715.5, 747.5, 779.5, 811.5, 843.5,
73  875.5, 907.5, 939.5, 971.5, 1003.5, 1035.5, 1067.5, 1099.5, 1131.5, 1163.5, 1195.5, 1227.5,
74  1259.5, 1291.5, 1355.5, 1419.5, 1483.5, 1547.5, 1611.5, 1675.5, 1547.5, 1611.5, 1675.5, 1739.5,
75  1803.5, 1867.5, 1931.5, 1995.5, 2059.5, 2123.5, 2187.5, 2251.5, 2315.5, 2379.5, 2443.5, 2507.5,
76  2571.5, 2699.5, 2827.5, 2955.5, 3083.5, 3211.5, 3339.5, 3467.5, 3595.5, 3723.5, 3851.5, 3979.5,
77  4107.5, 4235.5, 4363.5, 4491.5, 4619.5, 4747.5, 4875.5, 5003.5, 5131.5, 5387.5, 5643.5, 5899.5,
78  6155.5, 6411.5, 6667.5, 6923.5, 7179.5, 7435.5, 7691.5, 7947.5, 8203.5, 8459.5, 8715.5, 8971.5,
79  9227.5, 9483.5, 9739.5, 9995.5, 10251.5, 10507.5, 11019.5, 11531.5, 12043.5, 12555.5, 13067.5, 13579.5,
80  12555.5, 13067.5, 13579.5, 14091.5, 14603.5, 15115.5, 15627.5, 16139.5, 16651.5, 17163.5, 17675.5, 18187.5,
81  18699.5, 19211.5, 19723.5, 20235.5, 20747.5, 21771.5, 22795.5, 23819.5, 24843.5, 25867.5, 26891.5, 27915.5,
82  28939.5, 29963.5, 30987.5, 32011.5, 33035.5, 34059.5, 35083.5, 36107.5, 37131.5, 38155.5, 39179.5, 40203.5,
83  41227.5, 43275.5, 45323.5, 47371.5, 49419.5, 51467.5, 53515.5, 55563.5, 57611.5, 59659.5, 61707.5, 63755.5,
84  65803.5, 67851.5, 69899.5, 71947.5, 73995.5, 76043.5, 78091.5, 80139.5, 82187.5, 84235.5, 88331.5, 92427.5,
85  96523.5, 100620, 104716, 108812, 112908};
86 
87  constexpr int32_t IPHI_MAX = 72;
88 
89  ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t did2linearIndexHB(
90  uint32_t const didraw, int const maxDepthHB, int const firstHBRing, int const lastHBRing, int const nEtaHB) {
91  HcalDetId did{didraw};
92  uint32_t const value = (did.depth() - 1) + maxDepthHB * (did.iphi() - 1);
93  return did.ieta() > 0
94  ? value + maxDepthHB * hcal::reconstruction::mahi::IPHI_MAX * (did.ieta() - firstHBRing)
95  : value + maxDepthHB * hcal::reconstruction::mahi::IPHI_MAX * (did.ieta() + lastHBRing + nEtaHB);
96  }
97  ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t did2linearIndexHE(uint32_t const didraw,
98  int const maxDepthHE,
99  int const maxPhiHE,
100  int const firstHERing,
101  int const lastHERing,
102  int const nEtaHE) {
103  HcalDetId did{didraw};
104  uint32_t const value = (did.depth() - 1) + maxDepthHE * (did.iphi() - 1);
105  return did.ieta() > 0 ? value + maxDepthHE * maxPhiHE * (did.ieta() - firstHERing)
106  : value + maxDepthHE * maxPhiHE * (did.ieta() + lastHERing + nEtaHE);
107  }
108  ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t get_qiecoder_index(uint32_t const capid, uint32_t const range) {
109  return capid * 4 + range;
110  }
111 
112  ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_reco_correction_factor(float const par1,
113  float const par2,
114  float const par3,
115  float const x) {
116  return par3 * x * x + par2 * x + par1;
117  }
118 
119  // compute the charge using the adc, qie type and the appropriate qie shape array
120  ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_coder_charge(
121  int const qieType, uint8_t const adc, uint8_t const capid, float const* qieOffsets, float const* qieSlopes) {
122  auto const range = qieType == 0 ? (adc >> 5) & 0x3 : (adc >> 6) & 0x3;
123  auto const* qieShapeToUse = qieType == 0 ? qie8shape : qie11shape;
124  auto const nbins = qieType == 0 ? 32 : 64;
125  auto const center = adc % nbins == nbins - 1 ? 0.5 * (3 * qieShapeToUse[adc] - qieShapeToUse[adc - 1])
126  : 0.5 * (qieShapeToUse[adc] + qieShapeToUse[adc + 1]);
127  auto const index = get_qiecoder_index(capid, range);
128  return (center - qieOffsets[index]) / qieSlopes[index];
129  }
130 
131  // this is from
132  // https://github.com/cms-sw/cmssw/blob/master/RecoLocalCalo/HcalRecProducers/src/HBHEPhase1Reconstructor.cc#L140
133 
134  ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_diff_charge_gain(int const qieType,
135  uint8_t adc,
136  uint8_t const capid,
137  float const* qieOffsets,
138  float const* qieSlopes,
139  bool const isqie11) {
140  constexpr uint32_t mantissaMaskQIE8 = 0x1fu;
141  constexpr uint32_t mantissaMaskQIE11 = 0x3f;
142  auto const mantissaMask = isqie11 ? mantissaMaskQIE11 : mantissaMaskQIE8;
143  auto const q = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
144  auto const mantissa = adc & mantissaMask;
145 
146  if (mantissa == 0u || mantissa == mantissaMask - 1u)
147  return compute_coder_charge(qieType, adc + 1u, capid, qieOffsets, qieSlopes) - q;
148  else if (mantissa == 1u || mantissa == mantissaMask)
149  return q - compute_coder_charge(qieType, adc - 1u, capid, qieOffsets, qieSlopes);
150  else {
151  auto const qup = compute_coder_charge(qieType, adc + 1u, capid, qieOffsets, qieSlopes);
152  auto const qdown = compute_coder_charge(qieType, adc - 1u, capid, qieOffsets, qieSlopes);
153  auto const upgain = qup - q;
154  auto const downgain = q - qdown;
155  auto const averagegain = (qup - qdown) / 2.f;
156  if (std::abs(upgain - downgain) < 0.01f * averagegain)
157  return averagegain;
158  else {
159  auto const q2up = compute_coder_charge(qieType, adc + 2u, capid, qieOffsets, qieSlopes);
160  auto const q2down = compute_coder_charge(qieType, adc - 2u, capid, qieOffsets, qieSlopes);
161  auto const upgain2 = q2up - qup;
162  auto const downgain2 = qdown - q2down;
163  if (std::abs(upgain2 - upgain) < std::abs(downgain2 - downgain))
164  return upgain;
165  else
166  return downgain;
167  }
168  }
169  }
170 
171  using PulseShapeConstElement = typename HcalPulseShapeSoA::ConstView::const_element;
172  // TODO: remove what's not needed
173  // originally from from RecoLocalCalo/HcalRecAlgos/src/PulseShapeFunctor.cc
174  ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_pulse_shape_value(PulseShapeConstElement const& pulseShape,
175  float const pulse_time,
176  int const sample,
177  int const shift) {
178  auto const& acc25nsVec = pulseShape.acc25nsVec();
179  auto const& diff25nsItvlVec = pulseShape.diff25nsItvlVec();
180  auto const& accVarLenIdxMinusOneVec = pulseShape.accVarLenIdxMinusOneVec();
181  auto const& diffVarItvlIdxMinusOneVec = pulseShape.diffVarItvlIdxMinusOneVec();
182  auto const& accVarLenIdxZeroVec = pulseShape.accVarLenIdxZEROVec();
183  auto const& diffVarItvlIdxZeroVec = pulseShape.diffVarItvlIdxZEROVec();
184 
185  // constants
186  constexpr float slew = 0.f;
187  constexpr auto ns_per_bx = ::hcal::constants::nsPerBX;
188 
189  // FIXME: clean up all the rounding... this is coming from original cpu version
190  float const i_start_float = -::hcal::constants::iniTimeShift - pulse_time - slew > 0.f
191  ? 0.f
192  : std::abs(-::hcal::constants::iniTimeShift - pulse_time - slew) + 1.f;
193  int i_start = static_cast<int>(i_start_float);
194  float offset_start = static_cast<float>(i_start) - ::hcal::constants::iniTimeShift - pulse_time - slew;
195 
196  // boundary
197  if (offset_start == 1.0f) {
198  offset_start = 0.f;
199  i_start -= 1;
200  }
201 
202  int const bin_start = static_cast<int>(offset_start);
203  float const bin_start_up = static_cast<float>(bin_start) + 0.5f;
204  int const bin_0_start = offset_start < bin_start_up ? bin_start - 1 : bin_start;
205  int const its_start = i_start / ns_per_bx;
206  int const distTo25ns_start = ns_per_bx - 1 - i_start % ns_per_bx;
207  auto const factor = offset_start - static_cast<float>(bin_0_start) - 0.5;
208 
209  auto const sample_over10ts = sample + shift;
210  float value = 0.0f;
211  if (sample_over10ts == its_start) {
212  value = bin_0_start == -1
213  ? accVarLenIdxMinusOneVec[distTo25ns_start] + factor * diffVarItvlIdxMinusOneVec[distTo25ns_start]
214  : accVarLenIdxZeroVec[distTo25ns_start] + factor * diffVarItvlIdxZeroVec[distTo25ns_start];
215  } else if (sample_over10ts > its_start) {
216  int const bin_idx = distTo25ns_start + 1 + (sample_over10ts - its_start - 1) * ns_per_bx + bin_0_start;
217  value = acc25nsVec[bin_idx] + factor * diff25nsItvlVec[bin_idx];
218  }
219  return value;
220  }
221 
222  // TODO: provide constants from configuration
223  // from RecoLocalCalo/HcalRecProducers/python/HBHEMahiParameters_cfi.py
226  constexpr double nnlsThresh = 1e-11;
228 
229  // from RecoLocalCalo/HcalRecProducers/src/HBHEPhase1Reconstructor.cc
230  ALPAKA_FN_ACC float get_raw_charge(double const charge,
231  double const pedestal,
232  float const* shrChargeMinusPedestal,
233  float const parLin1,
234  float const parLin2,
235  float const parLin3,
236  int32_t const nsamplesForCompute,
237  int32_t const soi,
238  int const sipmQTSShift,
239  int const sipmQNTStoSum,
240  float const fcByPE,
241  int32_t const lch,
242  bool const isqie11) {
243  float rawCharge;
244 
245  if (!isqie11)
246  rawCharge = charge;
247  else {
248  int const first = std::max(soi + sipmQTSShift, 0);
249  int const last = std::min(soi + sipmQNTStoSum, nsamplesForCompute);
250  float sipmq = 0.0f;
251  for (auto ts = first; ts < last; ts++)
252  sipmq += shrChargeMinusPedestal[lch * nsamplesForCompute + ts];
253  auto const effectivePixelsFired = sipmq / fcByPE;
254  auto const factor = compute_reco_correction_factor(parLin1, parLin2, parLin3, effectivePixelsFired);
255  rawCharge = (charge - pedestal) * factor + pedestal;
256 
257 #ifdef HCAL_MAHI_GPUDEBUG
258  printf("first = %d last = %d sipmQ = %f factor = %f rawCharge = %f\n", first, last, sipmq, factor, rawCharge);
259 #endif
260  }
261  return rawCharge;
262  }
263 
264  // Tried using lambda, but strange error from with nvcc
265  inline constexpr bool TSenergyCompare(std::pair<unsigned int, float> a, std::pair<unsigned int, float> b) {
266  return a.second > b.second;
267  };
268 
270  public:
271  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
272  ALPAKA_FN_ACC void operator()(TAcc const& acc,
273  OProductType::View outputGPU,
274  IProductTypef01::ConstView f01HEDigis,
275  IProductTypef5::ConstView f5HBDigis,
276  IProductTypef3::ConstView f3HBDigis,
277  HcalMahiConditionsPortableDevice::ConstView mahi,
278  HcalSiPMCharacteristicsPortableDevice::ConstView sipmCharacteristics,
280  bool const useEffectivePedestals,
281  int const sipmQTSShift,
282  int const sipmQNTStoSum,
283  int const firstSampleShift,
284  float const ts4Thresh,
285  float* amplitudes,
286  float* noiseTerms,
287  float* electronicNoiseTerms,
288  int8_t* soiSamples,
289  int const windowSize) const {
290  auto const nchannelsf015 = f01HEDigis.size() + f5HBDigis.size();
291  auto const startingSample = compute_nsamples<Flavor1>(f01HEDigis.stride()) - windowSize;
292  auto const nchannels = f01HEDigis.size() + f5HBDigis.size() + f3HBDigis.size();
293 
294  //first index = groups of channel
295  auto const nchannels_per_block(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
296  //2nd index = groups of sample
297  auto const nsamplesForCompute(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[1u]);
298 
299  // configure shared mem
300  float* shrEnergyM0PerTS = alpaka::getDynSharedMem<float>(acc);
301  float* shrChargeMinusPedestal = shrEnergyM0PerTS + nsamplesForCompute * nchannels_per_block;
302  float* shrMethod0EnergyAccum = shrChargeMinusPedestal + nsamplesForCompute * nchannels_per_block;
303  float* shrEnergyM0TotalAccum = shrMethod0EnergyAccum + nchannels_per_block;
304  unsigned long long int* shrMethod0EnergySamplePair =
305  reinterpret_cast<unsigned long long int*>(shrEnergyM0TotalAccum + nchannels_per_block);
306 
307  //Loop over all groups of channels
308  for (auto group : uniform_groups_y(acc, nchannels)) {
309  //Loop over each channel, first compute soiSamples and shrMem for all channels
310  for (auto channel : uniform_group_elements_y(acc, group, nchannels)) {
311  auto const gch = channel.global;
312  auto const lch = channel.local;
313 
314  for (auto i_sample : independent_group_elements_x(acc, nsamplesForCompute)) {
315  auto const sampleWithinWindow = i_sample;
316  auto const sample = i_sample + startingSample;
317  auto const linearThPerBlock = i_sample + channel.local * nsamplesForCompute;
318  // initialize
319  if (sampleWithinWindow == 0) {
320  soiSamples[gch] = -1;
321  shrMethod0EnergyAccum[lch] = 0;
322  shrMethod0EnergySamplePair[lch] = 0;
323  shrEnergyM0TotalAccum[lch] = 0;
324  }
325 
326  // compute soiSamples
327  if (gch < f01HEDigis.size()) {
328  // NOTE: assume that soi is high only for a single guy!
329  // which must be the case. cpu version does not check for that
330  // if that is not the case, we will see that with cuda mmecheck
331  auto const soibit = soibit_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample);
332  if (soibit == 1)
333  soiSamples[gch] = sampleWithinWindow;
334  } else if (gch >= nchannelsf015) {
335  auto const soibit = soibit_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample);
336  if (soibit == 1)
337  soiSamples[gch] = sampleWithinWindow;
338  }
339 
340  // compute shrMem
341  auto const id = gch < f01HEDigis.size()
342  ? f01HEDigis.ids()[gch]
343  : (gch < nchannelsf015 ? f5HBDigis.ids()[gch - f01HEDigis.size()]
344  : f3HBDigis.ids()[gch - nchannelsf015]);
345  auto const did = HcalDetId{id};
346 
347  auto const adc =
348  gch < f01HEDigis.size()
349  ? adc_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample)
350  : (gch < nchannelsf015
351  ? adc_for_sample<Flavor5>(&(f5HBDigis.data()[gch - f01HEDigis.size()][0]), sample)
352  : adc_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
353  auto const capid =
354  gch < f01HEDigis.size()
355  ? capid_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample)
356  : (gch < nchannelsf015
357  ? capid_for_sample<Flavor5>(&(f5HBDigis.data()[gch - f01HEDigis.size()][0]), sample)
358  : capid_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
359 
360  // compute hash for this did
361  // hash needed to convert condition arrays order (based on Topo) into digi arrays order(based on FED)
362  auto const hashedId =
363  did.subdetId() == HcalBarrel
364  ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
365  : did2linearIndexHE(id,
366  mahi.maxDepthHE(),
367  mahi.maxPhiHE(),
368  mahi.firstHERing(),
369  mahi.lastHERing(),
370  mahi.nEtaHE()) +
371  mahi.offsetForHashes();
372 
373  // conditions based on the hash
374  auto const qieType = mahi.qieTypes_values()[hashedId] > 0 ? 1 : 0; // 2 types at this point
375  auto const* qieOffsets = mahi.qieCoders_offsets()[hashedId].data();
376  auto const* qieSlopes = mahi.qieCoders_slopes()[hashedId].data();
377  auto const pedestal = mahi.pedestals_value()[hashedId][capid];
378 
379  // compute charge
380  auto const charge = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
381 
382  shrChargeMinusPedestal[linearThPerBlock] = charge - pedestal;
383  }
384  }
385  alpaka::syncBlockThreads(acc);
386 
387  //Loop over each channel, compute input for multifit using shrChargeMinusPedestal
388  for (auto channel : uniform_group_elements_y(acc, group, nchannels)) {
389  auto const gch = channel.global;
390  auto const lch = channel.local;
391  for (auto i_sample : independent_group_elements_x(acc, nsamplesForCompute)) {
392  auto const sampleWithinWindow = i_sample;
393  auto const sample = i_sample + startingSample;
394 
395  // initialize all output buffers
396  if (sampleWithinWindow == 0) {
397  outputGPU.detId()[gch] = 0;
398  outputGPU.energyM0()[gch] = 0;
399  outputGPU.timeM0()[gch] = 0;
400  outputGPU.energy()[gch] = 0;
401  outputGPU.chi2()[gch] = 0;
402  }
403 
404  // offset output
405  auto* amplitudesForChannel = amplitudes + nsamplesForCompute * gch;
406  auto* noiseTermsForChannel = noiseTerms + nsamplesForCompute * gch;
407  auto* electronicNoiseTermsForChannel = electronicNoiseTerms + nsamplesForCompute * gch;
408 
409  // get event input quantities
410  auto const stride = gch < f01HEDigis.size()
411  ? f01HEDigis.stride()
412  : (gch < f5HBDigis.size() ? f5HBDigis.stride() : f3HBDigis.stride());
413  auto const nsamples =
414  gch < f01HEDigis.size()
415  ? compute_nsamples<Flavor1>(stride)
416  : (gch < nchannelsf015 ? compute_nsamples<Flavor5>(stride) : compute_nsamples<Flavor3>(stride));
417 
418  ALPAKA_ASSERT_ACC(nsamples == nsamplesForCompute || nsamples - startingSample == nsamplesForCompute);
419 
420  auto const id = gch < f01HEDigis.size()
421  ? f01HEDigis.ids()[gch]
422  : (gch < nchannelsf015 ? f5HBDigis.ids()[gch - f01HEDigis.size()]
423  : f3HBDigis.ids()[gch - nchannelsf015]);
424  auto const did = HcalDetId{id};
425 
426  auto const adc =
427  gch < f01HEDigis.size()
428  ? adc_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample)
429  : (gch < nchannelsf015
430  ? adc_for_sample<Flavor5>(&(f5HBDigis.data()[gch - f01HEDigis.size()][0]), sample)
431  : adc_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
432  auto const capid =
433  gch < f01HEDigis.size()
434  ? capid_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample)
435  : (gch < nchannelsf015
436  ? capid_for_sample<Flavor5>(&(f5HBDigis.data()[gch - f01HEDigis.size()][0]), sample)
437  : capid_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
438 
439  // compute hash for this did
440  // hash needed to convert condition arrays order (based on Topo) into digi arrays order(based on FED)
441  auto const hashedId =
442  did.subdetId() == HcalBarrel
443  ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
444  : did2linearIndexHE(id,
445  mahi.maxDepthHE(),
446  mahi.maxPhiHE(),
447  mahi.firstHERing(),
448  mahi.lastHERing(),
449  mahi.nEtaHE()) +
450  mahi.offsetForHashes();
451 
452  // conditions based on the hash
453  auto const qieType = mahi.qieTypes_values()[hashedId] > 0 ? 1 : 0; // 2 types at this point
454  auto const* qieOffsets = mahi.qieCoders_offsets()[hashedId].data();
455  auto const* qieSlopes = mahi.qieCoders_slopes()[hashedId].data();
456  auto const* pedestalWidthsForChannel =
457  useEffectivePedestals && (gch < f01HEDigis.size() || gch >= nchannelsf015)
458  ? mahi.effectivePedestalWidths()[hashedId].data()
459  : mahi.pedestals_width()[hashedId].data();
460 
461  auto const gain = mahi.gains_value()[hashedId][capid];
462  auto const gain0 = mahi.gains_value()[hashedId][0];
463  auto const respCorrection = mahi.respCorrs_values()[hashedId];
464  auto const pedestal = mahi.pedestals_value()[hashedId][capid];
465  auto const pedestalWidth = pedestalWidthsForChannel[capid];
466  // if needed, only use effective pedestals for f01
467  auto const pedestalToUseForMethod0 =
468  useEffectivePedestals && (gch < f01HEDigis.size() || gch >= nchannelsf015)
469  ? mahi.effectivePedestals()[hashedId][capid]
470  : pedestal;
471  auto const sipmType = mahi.sipmPar_type()[hashedId];
472  auto const fcByPE = mahi.sipmPar_fcByPE()[hashedId];
473  auto const recoParam1 = recoParamsWithPS.recoParamView().param1()[hashedId];
474  auto const recoParam2 = recoParamsWithPS.recoParamView().param2()[hashedId];
475 
476 #ifdef HCAL_MAHI_GPUDEBUG
477  printf("qieType = %d qieOffset0 = %f qieOffset1 = %f qieSlope0 = %f qieSlope1 = %f\n",
478  qieType,
479  qieOffsets[0],
480  qieOffsets[1],
481  qieSlopes[0],
482  qieSlopes[1]);
483 #endif
484 
485  // compute charge
486  auto const charge = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
487 
488  int32_t const soi =
489  gch < f01HEDigis.size()
490  ? soiSamples[gch]
491  : (gch < nchannelsf015 ? f5HBDigis.npresamples()[gch - f01HEDigis.size()] : soiSamples[gch]);
492 
493  bool badSOI = (soi < 0 or static_cast<unsigned>(soi) >= nsamplesForCompute);
494  if (badSOI and sampleWithinWindow == 0) {
495 #ifdef GPU_DEBUG
496  printf("Found HBHE channel %d with invalid SOI %d\n", gch, soi);
497 #endif
498  // mark the channel as bad
499  outputGPU.chi2()[gch] = -9999.f;
500  }
501 
502  // type index starts from 1 .. 6
503  // precicionItem index starts from 0 .. 5
504  auto const precisionItem = sipmCharacteristics.precisionItem()[sipmType - 1];
505  auto const parLin1 = precisionItem.parLin1_;
506  auto const parLin2 = precisionItem.parLin2_;
507  auto const parLin3 = precisionItem.parLin3_;
508 
509  //int32_t const soi = gch >= nchannelsf01HE
510  // ? npresamplesf5HB[gch - nchannelsf01HE]
511  // : soiSamples[gch];
512  // this is here just to make things uniform...
513  if (gch >= f01HEDigis.size() && gch < nchannelsf015 && sampleWithinWindow == 0)
514  soiSamples[gch] = f5HBDigis.npresamples()[gch - f01HEDigis.size()];
515 
516  //
517  // compute various quantities (raw charge and tdc stuff)
518  // NOTE: this branch will be divergent only for a single warp that
519  // sits on the boundary when flavor 01 channels end and flavor 5 start
520  //
521  float const rawCharge = get_raw_charge(charge,
522  pedestal,
523  shrChargeMinusPedestal,
524  parLin1,
525  parLin2,
526  parLin3,
527  nsamplesForCompute,
528  soi,
529  sipmQTSShift,
531  fcByPE,
532  lch,
533  gch < f01HEDigis.size() || gch >= nchannelsf015);
534 
535  auto const dfc = compute_diff_charge_gain(
536  qieType, adc, capid, qieOffsets, qieSlopes, gch < f01HEDigis.size() || gch >= nchannelsf015);
537 
538 #ifdef COMPUTE_TDC_TIME
539  float tdcTime;
540  if (gch >= f01HEDigis.size() && gch < nchannelsf015) {
542  } else {
543  if (gch < f01HEDigis.size())
544  tdcTime =
545  HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample));
546  else if (gch >= nchannelsf015)
548  tdc_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
549  }
550 #endif // COMPUTE_TDC_TIME
551 
552  // compute method 0 quantities
553  // TODO: need to apply containment
554  // TODO: need to apply time slew
555  // TODO: for < run 3, apply HBM legacy energy correction
556  auto const nsamplesToAdd = recoParam1 < 10 ? recoParam2 : (recoParam1 >> 14) & 0xF;
557  auto const startSampleTmp = soi + firstSampleShift;
558  auto const startSample = startSampleTmp < 0 ? 0 : startSampleTmp;
559  auto const endSample =
560  startSample + nsamplesToAdd < nsamplesForCompute ? startSample + nsamplesToAdd : nsamplesForCompute;
561  // NOTE: gain is a small number < 10^-3, multiply it last
562  auto const energym0_per_ts = gain * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
563  auto const energym0_per_ts_gain0 = gain0 * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
564  // store to shared mem
565  shrEnergyM0PerTS[lch * nsamplesForCompute + sampleWithinWindow] = energym0_per_ts;
566 
568  acc, &shrEnergyM0TotalAccum[lch], energym0_per_ts_gain0, alpaka::hierarchy::Threads{});
569 
570 #ifdef HCAL_MAHI_GPUDEBUG
571  printf(
572  "id = %u sample = %d gch = %d hashedId = %u adc = %u capid = %u\n"
573  " charge = %f rawCharge = %f dfc = %f pedestal = %f\n"
574  " gain = %f respCorrection = %f energym0_per_ts = %f\n",
575  id,
576  sample,
577  gch,
578  hashedId,
579  adc,
580  capid,
581  charge,
582  rawCharge,
583  dfc,
584  pedestalToUseForMethod0,
585  gain,
586  respCorrection,
587  energym0_per_ts);
588  printf("startSample = %d endSample = %d param1 = %u param2 = %u\n",
589  startSample,
590  endSample,
591  recoParam1,
592  recoParam2);
593 #endif
594  //Find the max energy of lch channel and the corresponding TS
595  if (sampleWithinWindow >= static_cast<unsigned>(startSample) &&
596  sampleWithinWindow < static_cast<unsigned>(endSample)) {
597  //sum the energys of all TS
598  alpaka::atomicAdd(acc, &shrMethod0EnergyAccum[lch], energym0_per_ts, alpaka::hierarchy::Threads{});
599  // pair TS and E for all TSs, find max pair according to E
600  // TODO: Non-deterministic behavior for TS with equal energy
602  acc, &shrMethod0EnergySamplePair[lch], {sampleWithinWindow, energym0_per_ts}, TSenergyCompare);
603  }
604 
605  // NOTE: must take soi, as values for that thread are used...
606  // NOTE: does not run if soi is bad, because it does not match any sampleWithinWindow
607  if (sampleWithinWindow == static_cast<unsigned>(soi)) {
608  // Channel quality check
609  // https://github.com/cms-sw/cmssw/blob/master/RecoLocalCalo/HcalRecAlgos/plugins/HcalChannelPropertiesEP.cc#L107-L109
610  // https://github.com/cms-sw/cmssw/blob/6d2f66057131baacc2fcbdd203588c41c885b42c/CondCore/HcalPlugins/plugins/HcalChannelQuality_PayloadInspector.cc#L30
611  // const bool taggedBadByDb = severity.dropChannel(digistatus->getValue());
612  // do not run MAHI if taggedBadByDb = true
613  auto const digiStatus_ = mahi.channelQuality_status()[hashedId];
614  const bool taggedBadByDb = (digiStatus_ / 32770);
615 
616  if (taggedBadByDb)
617  outputGPU.chi2()[gch] = -9999.f;
618 
619  // check as in cpu version if mahi is not needed
620  // (use "not" and ">", instead of "<=", to ensure that a NaN value will pass the check, and the hit be flagged as invalid)
621  if (not(energym0_per_ts_gain0 > ts4Thresh)) {
622  outputGPU.chi2()[gch] = -9999.f;
623  }
624  }
625  //
626  // preparations for mahi fit
627  //
628  auto const amplitude = rawCharge - pedestalToUseForMethod0;
629  auto const noiseADC = (1. / std::sqrt(12)) * dfc;
630  auto const noisePhotoSq = amplitude > pedestalWidth ? (amplitude * fcByPE) : 0.f;
631  auto const noiseTerm = noiseADC * noiseADC + noisePhotoSq + pedestalWidth * pedestalWidth;
632 
633  // store to global memory
634  amplitudesForChannel[sampleWithinWindow] = amplitude;
635  noiseTermsForChannel[sampleWithinWindow] = noiseTerm;
636  electronicNoiseTermsForChannel[sampleWithinWindow] = pedestalWidth;
637 
638  } //end sample loop
639  } // end channel loop
640  alpaka::syncBlockThreads(acc);
641 
642  // compute energy using shrMethod0EnergySamplePair
643  for (auto channel : uniform_group_elements_y(acc, group, nchannels)) {
644  auto const gch = channel.global;
645  auto const lch = channel.local;
646  for (auto i_sample : independent_group_elements_x(acc, nsamplesForCompute)) {
647  auto const sampleWithinWindow = i_sample;
648 
649  int32_t const soi =
650  gch < f01HEDigis.size()
651  ? soiSamples[gch]
652  : (gch < nchannelsf015 ? f5HBDigis.npresamples()[gch - f01HEDigis.size()] : soiSamples[gch]);
653 
654  auto const id = gch < f01HEDigis.size()
655  ? f01HEDigis.ids()[gch]
656  : (gch < nchannelsf015 ? f5HBDigis.ids()[gch - f01HEDigis.size()]
657  : f3HBDigis.ids()[gch - nchannelsf015]);
658  auto const did = HcalDetId{id};
659  auto const hashedId =
660  did.subdetId() == HcalBarrel
661  ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
662  : did2linearIndexHE(id,
663  mahi.maxDepthHE(),
664  mahi.maxPhiHE(),
665  mahi.firstHERing(),
666  mahi.lastHERing(),
667  mahi.nEtaHE()) +
668  mahi.offsetForHashes();
669 
670  auto const recoParam1 = recoParamsWithPS.recoParamView().param1()[hashedId];
671  auto const recoParam2 = recoParamsWithPS.recoParamView().param2()[hashedId];
672 
673  auto const nsamplesToAdd = recoParam1 < 10 ? recoParam2 : (recoParam1 >> 14) & 0xF;
674  // NOTE: must take soi, as values for that thread are used...
675  // NOTE: does not run if soi is bad, because it does not match any sampleWithinWindow
676  if (sampleWithinWindow == static_cast<unsigned>(soi)) {
677  auto const method0_energy = shrMethod0EnergyAccum[lch];
678  auto const val = shrMethod0EnergySamplePair[lch];
679  int const max_sample = (val >> 32) & 0xffffffff;
680 
681  float const max_energy = uint_as_float(static_cast<uint32_t>(val & 0xffffffff));
682 
683  float const max_energy_1 = static_cast<unsigned>(max_sample) < nsamplesForCompute - 1
684  ? shrEnergyM0PerTS[lch * nsamplesForCompute + max_sample + 1]
685  : 0.f;
686  float const position = nsamplesToAdd < nsamplesForCompute ? max_sample - soi : max_sample;
687  auto const sum = max_energy + max_energy_1;
688  // FIXME: for full comparison with cpu method 0 timing,
689  // need to correct by slew
690  // requires an accumulator -> more shared mem -> omit here unless
691  // really needed
692  float const time =
693  max_energy > 0.f && max_energy_1 > 0.f ? 25.f * (position + max_energy_1 / sum) : 25.f * position;
694 
695  // store method0 quantities to global mem
696  outputGPU.detId()[gch] = id;
697  outputGPU.energyM0()[gch] = method0_energy;
698  outputGPU.timeM0()[gch] = time;
699 
700  // check as in cpu version if mahi is not needed
701  // FIXME: KNOWN ISSUE: observed a problem when rawCharge and pedestal
702  // are basically equal and generate -0.00000...
703  // needs to be treated properly
704  // (use "not" and ">", instead of "<=", to ensure that a NaN value will pass the check, and the hit be flagged as invalid)
705  if (not(shrEnergyM0TotalAccum[lch] > 0)) {
706  outputGPU.chi2()[gch] = -9999.f;
707  }
708 
709 #ifdef HCAL_MAHI_GPUDEBUG
710  printf("tsTOT = %f tstrig = %f ts4Thresh = %f\n",
711  shrEnergyM0TotalAccum[lch],
712  energym0_per_ts_gain0,
713  ts4Thresh);
714 #endif
715 
716 #ifdef HCAL_MAHI_GPUDEBUG
717  printf(" method0_energy = %f max_sample = %d max_energy = %f time = %f\n",
718  method0_energy,
719  max_sample,
720  max_energy,
721  time);
722 #endif
723  }
724 #ifdef HCAL_MAHI_GPUDEBUG
725  printf(
726  "charge(%d) = %f pedestal(%d) = %f dfc(%d) = %f pedestalWidth(%d) = %f noiseADC(%d) = %f "
727  "noisPhoto(%d) =%f outputGPU chi2[gch] = %f \n",
728  sample,
729  rawCharge,
730  sample,
731  pedestalToUseForMethod0,
732  sample,
733  dfc,
734  sample,
735  pedestalWidth,
736  sample,
737  noiseADC,
738  sample,
739  noisePhotoSq,
740  outputGPU.chi2()[gch]);
741 #endif
742 
743  } // loop for sample
744  } // loop for channels
745  } // loop for channgel groups
746  }
747  }; //Kernel_prep1d_sameNumberOfSamples
748 
750  public:
751  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
752  ALPAKA_FN_ACC void operator()(TAcc const& acc,
753  float* pulseMatrices,
754  float* pulseMatricesM,
755  float* pulseMatricesP,
756  HcalMahiPulseOffsetsPortableDevice::ConstView pulseOffsets,
757  float const* amplitudes,
758  IProductTypef01::ConstView f01HEDigis,
759  IProductTypef5::ConstView f5HBDigis,
760  IProductTypef3::ConstView f3HBDigis,
761  int8_t* soiSamples,
762  HcalMahiConditionsPortableDevice::ConstView mahi,
764  float const meanTime,
765  float const timeSigmaSiPM,
766  float const timeSigmaHPD,
767  bool const applyTimeSlew,
768  float const tzeroTimeSlew,
769  float const slopeTimeSlew,
770  float const tmaxTimeSlew) const {
771  //2nd index = pulse
772  auto const npulses(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[1u]);
773  //3rd index = sample
774  auto const nsamples(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[2u]);
775 
776  auto const nchannels = f01HEDigis.size() + f5HBDigis.size() + f3HBDigis.size();
777  auto const nchannelsf015 = f01HEDigis.size() + f5HBDigis.size();
778 
779  //Loop over each channel
780  for (auto channel : uniform_elements_z(acc, nchannels)) {
781  //Loop over pulses
782  for (auto ipulse : independent_group_elements_y(acc, npulses)) {
783  //Loop over sample
784  for (auto sample : independent_group_elements_x(acc, nsamples)) {
785  // conditions
786  auto const id = channel < f01HEDigis.size()
787  ? f01HEDigis.ids()[channel]
788  : (channel < nchannelsf015 ? f5HBDigis.ids()[channel - f01HEDigis.size()]
789  : f3HBDigis.ids()[channel - nchannelsf015]);
790  auto const deltaT =
791  channel >= f01HEDigis.size() && channel < nchannelsf015 ? timeSigmaHPD : timeSigmaSiPM;
792 
793  // compute hash for this did
794  // hash needed to convert condition arrays order (based on Topo) into digi arrays order(based on FED)
795  auto const did = DetId{id};
796  auto const hashedId =
797  did.subdetId() == HcalBarrel
798  ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
799  : did2linearIndexHE(id,
800  mahi.maxDepthHE(),
801  mahi.maxPhiHE(),
802  mahi.firstHERing(),
803  mahi.lastHERing(),
804  mahi.nEtaHE()) +
805  mahi.offsetForHashes();
806  auto const pulseShape = recoParamsWithPS.getPulseShape(hashedId);
807 
808  // offset output arrays
809  auto* pulseMatrix = pulseMatrices + nsamples * npulses * channel;
810  auto* pulseMatrixM = pulseMatricesM + nsamples * npulses * channel;
811  auto* pulseMatrixP = pulseMatricesP + nsamples * npulses * channel;
812 
813  // amplitude per ipulse
814  int const soi = soiSamples[channel];
815  int const pulseOffset = pulseOffsets.offsets()[ipulse];
816  auto const amplitude = amplitudes[channel * nsamples + pulseOffset + soi];
817 
818  if (amplitude <= 0.f) {
819  pulseMatrix[ipulse * nsamples + sample] = 0.f;
820  pulseMatrixM[ipulse * nsamples + sample] = 0.f;
821  pulseMatrixP[ipulse * nsamples + sample] = 0.f;
822  continue;
823  }
824 #ifdef HCAL_MAHI_GPUDEBUG
825 #ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
826  if (id != DETID_TO_DEBUG)
827  return;
828 #endif
829  if (sample == 0 && ipulse == 0) {
830  for (int i = 0; i < 8; i++)
831  printf("amplitude(%d) = %f\n", i, amplitudes[channel * nsamples + i]);
832  printf("acc25nsVec and diff25nsItvlVec for recoPulseShapeId = %u\n", recoPulseShapeId);
833  for (int i = 0; i < 256; i++) {
834  printf("acc25nsVec(%d) = %f diff25nsItvlVec(%d) = %f\n", i, acc25nsVec[i], i, diff25nsItvlVec[i]);
835  }
836  printf("accVarLenIdxZEROVec and accVarLenIdxMinusOneVec\n");
837  for (int i = 0; i < 25; i++) {
838  printf("accVarLenIdxZEROVec(%d) = %f accVarLenIdxMinusOneVec(%d) = %f\n",
839  i,
840  accVarLenIdxZeroVec[i],
841  i,
842  accVarLenIdxMinusOneVec[i]);
843  }
844  printf("diffVarItvlIdxZEROVec and diffVarItvlIdxMinusOneVec\n");
845  for (int i = 0; i < 25; i++) {
846  printf("diffVarItvlIdxZEROVec(%d) = %f diffVarItvlIdxMinusOneVec(%d) = %f\n",
847  i,
848  diffVarItvlIdxZeroVec[i],
849  i,
850  diffVarItvlIdxMinusOneVec[i]);
851  }
852  }
853 #endif
854 
855  auto t0 = meanTime;
856  if (applyTimeSlew) {
857  if (amplitude <= 1.0f)
858  t0 += compute_time_slew_delay(1.0, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
859  else
860  t0 += compute_time_slew_delay(amplitude, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
861  }
862  auto const t0m = -deltaT + t0;
863  auto const t0p = deltaT + t0;
864 
865 #ifdef HCAL_MAHI_GPUDEBUG
866  if (sample == 0 && ipulse == 0) {
867  printf("time values: %f %f %f\n", t0, t0m, t0p);
868  }
869 
870  if (sample == 0 && ipulse == 0) {
871  for (int i = 0; i < hcal::constants::maxSamples; i++) {
872  auto const value = compute_pulse_shape_value(pulseShape, t0, i, 0);
873  }
874  printf("\n");
875  for (int i = 0; i < hcal::constants::maxSamples; i++) {
876  auto const value = compute_pulse_shape_value(pulseShape, t0p, i, 0);
877  }
878  printf("\n");
879  for (int i = 0; i < hcal::constants::maxSamples; i++) {
880  auto const value = compute_pulse_shape_value(pulseShape, t0m, i, 0);
881  }
882  }
883 #endif
884 
885  // FIXME: shift should be treated properly,
886  // here assume 8 time slices and 8 samples
887  auto const shift = 4 - soi; // as in cpu version!
888 
889  int32_t const idx = sample - pulseOffset;
890  auto const value = idx >= 0 && static_cast<unsigned>(idx) < nsamples
891  ? compute_pulse_shape_value(pulseShape, t0, idx, shift)
892  : 0;
893  auto const value_t0m = idx >= 0 && static_cast<unsigned>(idx) < nsamples
894  ? compute_pulse_shape_value(pulseShape, t0m, idx, shift)
895  : 0;
896  auto const value_t0p = idx >= 0 && static_cast<unsigned>(idx) < nsamples
897  ? compute_pulse_shape_value(pulseShape, t0p, idx, shift)
898  : 0;
899 
900  // store to global
901  pulseMatrix[ipulse * nsamples + sample] = value;
902  pulseMatrixM[ipulse * nsamples + sample] = value_t0m;
903  pulseMatrixP[ipulse * nsamples + sample] = value_t0p;
904 
905  } // loop over sample
906  } // loop over pulse
907  } // loop over channels
908  }
909  }; // Kernel_prep_pulseMatrices_sameNumberOfSamples
910 
911  template <int NSAMPLES, int NPULSES>
912  ALPAKA_FN_ACC ALPAKA_FN_INLINE void update_covariance(
913  calo::multifit::ColumnVector<NPULSES> const& resultAmplitudesVector,
915  Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrix,
916  Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixM,
917  Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixP) {
919  for (int ipulse = 0; ipulse < NPULSES; ipulse++) {
920  auto const resultAmplitude = resultAmplitudesVector(ipulse);
921  if (resultAmplitude == 0)
922  continue;
923 
924 #ifdef HCAL_MAHI_GPUDEBUG
925  printf("pulse cov array for ibx = %d\n", ipulse);
926 #endif
927 
928  // preload a column
929  float pmcol[NSAMPLES], pmpcol[NSAMPLES], pmmcol[NSAMPLES];
931  for (int counter = 0; counter < NSAMPLES; counter++) {
932  pmcol[counter] = pulseMatrix.coeffRef(counter, ipulse);
933  pmpcol[counter] = pulseMatrixP.coeffRef(counter, ipulse);
934  pmmcol[counter] = pulseMatrixM.coeffRef(counter, ipulse);
935  }
936 
937  auto const ampl2 = resultAmplitude * resultAmplitude;
939  for (int col = 0; col < NSAMPLES; col++) {
940  auto const valueP_col = pmpcol[col];
941  auto const valueM_col = pmmcol[col];
942  auto const value_col = pmcol[col];
943  auto const tmppcol = valueP_col - value_col;
944  auto const tmpmcol = valueM_col - value_col;
945 
946  // diagonal
947  auto tmp_value = 0.5 * (tmppcol * tmppcol + tmpmcol * tmpmcol);
948  covarianceMatrix(col, col) += ampl2 * tmp_value;
949 
950  // FIXME: understand if this actually gets unrolled
952  for (int row = col + 1; row < NSAMPLES; row++) {
953  float const valueP_row = pmpcol[row]; //pulseMatrixP(j, ipulseReal);
954  float const value_row = pmcol[row]; //pulseMatrix(j, ipulseReal);
955  float const valueM_row = pmmcol[row]; //pulseMatrixM(j, ipulseReal);
956 
957  float tmpprow = valueP_row - value_row;
958  float tmpmrow = valueM_row - value_row;
959 
960  auto const covValue = 0.5 * (tmppcol * tmpprow + tmpmcol * tmpmrow);
961 
962  covarianceMatrix(row, col) += ampl2 * covValue;
963  }
964  }
965  }
966  }
967 
968  template <int NSAMPLES, int NPULSES>
970  public:
971  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
972  ALPAKA_FN_ACC void operator()(TAcc const& acc,
973  OProductType::View outputGPU,
974  float const* amplitudes,
975  float* pulseMatrices,
976  float* pulseMatricesM,
977  float* pulseMatricesP,
978  HcalMahiPulseOffsetsPortableDevice::ConstView pulseOffsetsView,
979  float* noiseTerms,
980  float* electronicNoiseTerms,
981  int8_t* soiSamples,
982  HcalMahiConditionsPortableDevice::ConstView mahi,
983  bool const useEffectivePedestals,
984  IProductTypef01::ConstView f01HEDigis,
985  IProductTypef5::ConstView f5HBDigis,
986  IProductTypef3::ConstView f3HBDigis) const {
987  // can be relaxed if needed - minor updates are needed in that case!
988  static_assert(NPULSES == NSAMPLES);
989 
990  auto const nchannels = f01HEDigis.size() + f5HBDigis.size() + f3HBDigis.size();
991 
992  auto const nchannelsf015 = f01HEDigis.size() + f5HBDigis.size();
993 
994  auto const threadsPerBlock(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
995 
996  //Loop over all groups of channels
997  for (auto group : uniform_groups_x(acc, nchannels)) {
998  //Loop over each channel
999  for (auto channel : uniform_group_elements_x(acc, group, nchannels)) {
1000  auto const gch = channel.global;
1001  auto const lch = channel.local;
1002 
1003  // if chi2 is set to -9999 do not run minimization
1004  if (outputGPU.chi2()[gch] == -9999.f)
1005  continue;
1006 
1007  // configure shared mem
1008  float* shrmem = alpaka::getDynSharedMem<float>(acc);
1009  float* shrMatrixLFnnlsStorage = shrmem + calo::multifit::MapSymM<float, NPULSES>::total * lch;
1010  float* shrAtAStorage = shrmem + calo::multifit::MapSymM<float, NPULSES>::total * (lch + threadsPerBlock);
1011 
1012  // conditions for pedestal widths
1013  auto const id = gch < f01HEDigis.size() ? f01HEDigis.ids()[gch]
1014  : (gch < nchannelsf015 ? f5HBDigis.ids()[gch - f01HEDigis.size()]
1015  : f3HBDigis.ids()[gch - nchannelsf015]);
1016  auto const did = DetId{id};
1017  auto const hashedId =
1018  did.subdetId() == HcalBarrel
1019  ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
1020  : did2linearIndexHE(id,
1021  mahi.maxDepthHE(),
1022  mahi.maxPhiHE(),
1023  mahi.firstHERing(),
1024  mahi.lastHERing(),
1025  mahi.nEtaHE()) +
1026  mahi.offsetForHashes();
1027 
1028  // conditions based on the hash
1029  auto const* pedestalWidthsForChannel =
1030  useEffectivePedestals && (gch < f01HEDigis.size() || gch >= nchannelsf015)
1031  ? mahi.effectivePedestalWidths()[hashedId].data()
1032  : mahi.pedestals_width()[hashedId].data();
1033  auto const averagePedestalWidth2 = 0.25 * (pedestalWidthsForChannel[0] * pedestalWidthsForChannel[0] +
1034  pedestalWidthsForChannel[1] * pedestalWidthsForChannel[1] +
1035  pedestalWidthsForChannel[2] * pedestalWidthsForChannel[2] +
1036  pedestalWidthsForChannel[3] * pedestalWidthsForChannel[3]);
1037 
1038  // FIXME on cpu ts 0 capid was used - does it make any difference
1039  auto const gain = mahi.gains_value()[hashedId][0];
1040 
1041  auto const respCorrection = mahi.respCorrs_values()[hashedId];
1042  auto const noisecorr = mahi.sipmPar_auxi2()[hashedId];
1043 
1044 #ifdef HCAL_MAHI_GPUDEBUG
1045 #ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
1046  if (id != DETID_TO_DEBUG)
1047  return;
1048 #endif
1049 #endif
1050 
1053  for (int i = 0; i < NPULSES; ++i)
1054  pulseOffsets(i) = i;
1055 
1056  // output amplitudes/weights
1057  calo::multifit::ColumnVector<NPULSES> resultAmplitudesVector =
1059 
1060  // map views
1061  Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> inputAmplitudesView{amplitudes + gch * NSAMPLES};
1062  Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> noiseTermsView{noiseTerms + gch * NSAMPLES};
1063  Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> noiseElectronicView{electronicNoiseTerms +
1064  gch * NSAMPLES};
1065  Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixMView{
1066  pulseMatricesM + gch * NSAMPLES * NPULSES};
1067  Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixPView{
1068  pulseMatricesP + gch * NSAMPLES * NPULSES};
1069  Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixView{
1070  pulseMatrices + gch * NSAMPLES * NPULSES};
1071 #ifdef HCAL_MAHI_GPUDEBUG
1072  for (int i = 0; i < NSAMPLES; i++)
1073  printf("inputValues(%d) = %f noiseTerms(%d) = %f\n", i, inputAmplitudesView(i), i, noiseTermsView(i));
1074  for (int i = 0; i < NSAMPLES; i++) {
1075  for (int j = 0; j < NPULSES; j++)
1076  printf("%f ", glbPulseMatrixView(i, j));
1077  printf("\n");
1078  }
1079  printf("\n");
1080  for (int i = 0; i < NSAMPLES; i++) {
1081  for (int j = 0; j < NPULSES; j++)
1082  printf("%f ", glbPulseMatrixMView(i, j));
1083  printf("\n");
1084  }
1085  printf("\n");
1086  for (int i = 0; i < NSAMPLES; i++) {
1087  for (int j = 0; j < NPULSES; j++)
1088  printf("%f ", glbPulseMatrixPView(i, j));
1089  printf("\n");
1090  }
1091 #endif
1092 
1093  int npassive = 0;
1094  float chi2 = 0, previous_chi2 = 0.f, chi2_2itersback = 0.f;
1095  for (int iter = 1; iter < nMaxItersMin; iter++) {
1096  //float covarianceMatrixStorage[MapSymM<float, NSAMPLES>::total];
1097  // NOTE: only works when NSAMPLES == NPULSES
1098  // if does not hold -> slightly rearrange shared mem to still reuse
1099  // shared memory
1100  float* covarianceMatrixStorage = shrMatrixLFnnlsStorage;
1101  calo::multifit::MapSymM<float, NSAMPLES> covarianceMatrix{covarianceMatrixStorage};
1104  covarianceMatrixStorage[counter] = (noisecorr != 0.f) ? 0.f : averagePedestalWidth2;
1107  covarianceMatrix(counter, counter) += noiseTermsView.coeffRef(counter);
1108  if (counter != 0)
1109  covarianceMatrix(counter, counter - 1) +=
1110  noisecorr * noiseElectronicView.coeffRef(counter - 1) * noiseElectronicView.coeffRef(counter);
1111  }
1112 
1113  // update covariance matrix
1114  update_covariance(resultAmplitudesVector,
1115  covarianceMatrix,
1116  glbPulseMatrixView,
1117  glbPulseMatrixMView,
1118  glbPulseMatrixPView);
1119 
1120 #ifdef HCAL_MAHI_GPUDEBUG
1121  printf("covariance matrix\n");
1122  for (int i = 0; i < 8; i++) {
1123  for (int j = 0; j < 8; j++)
1124  printf("%f ", covarianceMatrix(i, j));
1125  printf("\n");
1126  }
1127 #endif
1128 
1129  // compute Cholesky Decomposition L matrix
1130  //matrixDecomposition.compute(covarianceMatrix);
1131  //auto const& matrixL = matrixDecomposition.matrixL();
1133  calo::multifit::MapSymM<float, NSAMPLES> matrixL{matrixLStorage};
1134  calo::multifit::compute_decomposition_unrolled(matrixL, covarianceMatrix);
1135 
1136  //
1137  // replace eigen
1138  //
1139  //auto const& A = matrixDecomposition
1140  // .matrixL()
1141  // .solve(pulseMatrixView);
1143  calo::multifit::solve_forward_subst_matrix(A, glbPulseMatrixView, matrixL);
1144 
1145  //
1146  // remove eigen
1147  //
1148  //auto const& b = matrixL
1149  // .solve(inputAmplitudesView);
1150  //
1151  float reg_b[NSAMPLES];
1152  calo::multifit::solve_forward_subst_vector(reg_b, inputAmplitudesView, matrixL);
1153 
1154  // TODO: we do not really need to change these matrcies
1155  // will be fixed in the optimized version
1156  //ColMajorMatrix<NPULSES, NPULSES> AtA = A.transpose() * A;
1157  //ColumnVector<NPULSES> Atb = A.transpose() * b;
1158  //ColMajorMatrix<NPULSES, NPULSES> AtA;
1159  //float AtAStorage[MapSymM<float, NPULSES>::total];
1160  calo::multifit::MapSymM<float, NPULSES> AtA{shrAtAStorage};
1163  for (int icol = 0; icol < NPULSES; icol++) {
1164  float reg_ai[NSAMPLES];
1165 
1166  // load column icol
1168  for (int counter = 0; counter < NSAMPLES; counter++)
1169  reg_ai[counter] = A(counter, icol);
1170 
1171  // compute diagonal
1172  float sum = 0.f;
1174  for (int counter = 0; counter < NSAMPLES; counter++)
1175  sum += reg_ai[counter] * reg_ai[counter];
1176 
1177  // store
1178  AtA(icol, icol) = sum;
1179 
1180  // go thru the other columns
1182  for (int j = icol + 1; j < NPULSES; j++) {
1183  // load column j
1184  float reg_aj[NSAMPLES];
1186  for (int counter = 0; counter < NSAMPLES; counter++)
1187  reg_aj[counter] = A(counter, j);
1188 
1189  // accum
1190  float sum = 0.f;
1192  for (int counter = 0; counter < NSAMPLES; counter++)
1193  sum += reg_aj[counter] * reg_ai[counter];
1194 
1195  // store
1196  //AtA(icol, j) = sum;
1197  AtA(j, icol) = sum;
1198  }
1199 
1200  // Atb accum
1201  float sum_atb = 0;
1203  for (int counter = 0; counter < NSAMPLES; counter++)
1204  sum_atb += reg_ai[counter] * reg_b[counter];
1205 
1206  // store atb
1207  Atb(icol) = sum_atb;
1208  }
1209 
1210 #ifdef HCAL_MAHI_GPUDEBUG
1211  printf("AtA\n");
1212  for (int i = 0; i < 8; i++) {
1213  for (int j = 0; j < 8; j++)
1214  printf("%f ", AtA(i, j));
1215  printf("\n");
1216  }
1217  printf("Atb\n");
1218  for (int i = 0; i < 8; i++)
1219  printf("%f ", Atb(i));
1220  printf("\n");
1221  printf("result Amplitudes before nnls\n");
1222  for (int i = 0; i < 8; i++)
1223  printf("%f ", resultAmplitudesVector(i));
1224  printf("\n");
1225 #endif
1226 
1227  // for fnnls
1228  calo::multifit::MapSymM<float, NPULSES> matrixLForFnnls{shrMatrixLFnnlsStorage};
1229 
1230  // run fast nnls
1232  Atb,
1233  resultAmplitudesVector,
1234  npassive,
1235  pulseOffsets,
1236  matrixLForFnnls,
1237  nnlsThresh,
1238  nMaxItersNNLS,
1239  10,
1240  10);
1241 
1242 #ifdef HCAL_MAHI_GPUDEBUG
1243  printf("result Amplitudes\n");
1244  for (int i = 0; i < 8; i++)
1245  printf("resultAmplitudes(%d) = %f\n", i, resultAmplitudesVector(i));
1246 #endif
1247 
1249  matrixL, glbPulseMatrixView, resultAmplitudesVector, inputAmplitudesView, chi2);
1250 
1251  auto const deltaChi2 = std::abs(chi2 - previous_chi2);
1252  if (chi2 == chi2_2itersback && chi2 < previous_chi2)
1253  break;
1254 
1255  // update
1256  chi2_2itersback = previous_chi2;
1257  previous_chi2 = chi2;
1258 
1259  // exit condition
1260  if (deltaChi2 < deltaChi2Threashold)
1261  break;
1262  }
1263 
1264 #ifdef HCAL_MAHI_GPUDEBUG
1265  for (int i = 0; i < NPULSES; i++)
1266  printf("pulseOffsets(%d) = %d outputAmplitudes(%d) = %f\n",
1267  i,
1268  pulseOffsets(i),
1269  i,
1270  resultAmplitudesVector(i));
1271  printf("chi2 = %f\n", chi2);
1272 #endif
1273 
1274  outputGPU.chi2()[gch] = chi2;
1275  auto const idx_for_energy = std::abs(pulseOffsetsView.offsets()[0]);
1276  outputGPU.energy()[gch] = (gain * resultAmplitudesVector(idx_for_energy)) * respCorrection;
1277 
1278  } // loop over channels
1279  } //loop over group of channels
1280  }
1281  }; // Kernel_minimize
1282 
1283  } // namespace mahi
1284 
1285  void runMahiAsync(Queue& queue,
1286  IProductTypef01::ConstView const& f01HEDigis,
1287  IProductTypef5::ConstView const& f5HBDigis,
1288  IProductTypef3::ConstView const& f3HBDigis,
1289  OProductType::View outputGPU,
1290  HcalMahiConditionsPortableDevice::ConstView const& mahi,
1291  HcalSiPMCharacteristicsPortableDevice::ConstView const& sipmCharacteristics,
1292  HcalRecoParamWithPulseShapeDevice::ConstView const& recoParamsWithPS,
1293  HcalMahiPulseOffsetsPortableDevice::ConstView const& mahiPulseOffsets,
1294  ConfigParameters const& configParameters) {
1295  auto const totalChannels =
1296  f01HEDigis.metadata().size() + f5HBDigis.metadata().size() + f3HBDigis.metadata().size();
1297  // FIXME: the number of channels in output might change given that some channesl might be filtered out
1298 
1299  // TODO: this can be lifted by implementing a separate kernel
1300  // similar to the default one, but properly handling the diff in #sample
1301  // or modifying existing one
1302  // TODO: assert startingSample = f01nsamples - windowSize to be 0 or 2
1303  // assert f01nsamples == f5nsamples
1304  // assert f01nsamples == f3nsamples
1305  int constexpr windowSize = 8;
1306 
1307  //compute work division
1308  uint32_t nchannels_per_block = configParameters.kprep1dChannelsPerBlock;
1309  auto const blocks_y = cms::alpakatools::divide_up_by(totalChannels, nchannels_per_block);
1310 
1311  Vec2D const blocks_2d{blocks_y, 1u}; // {y, x} coordiantes
1312  Vec2D const threads_2d{nchannels_per_block, windowSize};
1313  auto workDivPrep2D = cms::alpakatools::make_workdiv<Acc2D>(blocks_2d, threads_2d);
1314 
1315  //Device buffer for output
1316  auto amplitudes = cms::alpakatools::make_device_buffer<float[]>(queue, totalChannels * windowSize);
1317  auto noiseTerms = cms::alpakatools::make_device_buffer<float[]>(queue, totalChannels * windowSize);
1318  auto electronicNoiseTerms = cms::alpakatools::make_device_buffer<float[]>(queue, totalChannels * windowSize);
1319  auto soiSamples = cms::alpakatools::make_device_buffer<int8_t[]>(queue, totalChannels * windowSize);
1320 
1321  alpaka::exec<Acc2D>(queue,
1322  workDivPrep2D,
1324  outputGPU,
1325  f01HEDigis,
1326  f5HBDigis,
1327  f3HBDigis,
1328  mahi,
1329  sipmCharacteristics,
1330  recoParamsWithPS,
1331  configParameters.useEffectivePedestals,
1332  configParameters.sipmQTSShift,
1333  configParameters.sipmQNTStoSum,
1334  configParameters.firstSampleShift,
1335  configParameters.ts4Thresh,
1336  amplitudes.data(),
1337  noiseTerms.data(),
1338  electronicNoiseTerms.data(),
1339  soiSamples.data(),
1340  windowSize);
1341 
1344  uint32_t const channelsPerBlock = 1024 / (windowSize * mahiPulseOffsets.metadata().size());
1345 
1346  //launch 1D blocks of 3D threads
1347  auto const blocks_z = cms::alpakatools::divide_up_by(totalChannels, channelsPerBlock);
1348  Vec3D const blocks_3d{blocks_z, 1u, 1u}; // 1D block in z {z,y,x} coordinates
1349  Vec3D const threads_3d{channelsPerBlock, mahiPulseOffsets.metadata().size(), windowSize};
1350 
1351  auto workDivPrep3D = cms::alpakatools::make_workdiv<Acc3D>(blocks_3d, threads_3d);
1352 
1353  //Device buffer for output
1354  auto pulseMatrices = cms::alpakatools::make_device_buffer<float[]>(
1355  queue, totalChannels * windowSize * mahiPulseOffsets.metadata().size());
1356  auto pulseMatricesM = cms::alpakatools::make_device_buffer<float[]>(
1357  queue, totalChannels * windowSize * mahiPulseOffsets.metadata().size());
1358  auto pulseMatricesP = cms::alpakatools::make_device_buffer<float[]>(
1359  queue, totalChannels * windowSize * mahiPulseOffsets.metadata().size());
1360 
1361  alpaka::exec<Acc3D>(queue,
1362  workDivPrep3D,
1364  pulseMatrices.data(),
1365  pulseMatricesM.data(),
1366  pulseMatricesP.data(),
1367  mahiPulseOffsets,
1368  amplitudes.data(),
1369  f01HEDigis,
1370  f5HBDigis,
1371  f3HBDigis,
1372  soiSamples.data(),
1373  mahi,
1374  recoParamsWithPS,
1375  configParameters.meanTime,
1376  configParameters.timeSigmaSiPM,
1377  configParameters.timeSigmaHPD,
1378  configParameters.applyTimeSlew,
1379  configParameters.tzeroTimeSlew,
1380  configParameters.slopeTimeSlew,
1381  configParameters.tmaxTimeSlew);
1382 
1383  uint32_t threadsPerBlock = configParameters.kernelMinimizeThreads[0];
1384  auto blocks_1d = cms::alpakatools::divide_up_by(totalChannels, threadsPerBlock);
1385 
1386  auto workDivPrep1D = cms::alpakatools::make_workdiv<Acc1D>(blocks_1d, threadsPerBlock);
1387 
1388  alpaka::exec<Acc1D>(queue,
1389  workDivPrep1D,
1391  outputGPU,
1392  amplitudes.data(),
1393  pulseMatrices.data(),
1394  pulseMatricesM.data(),
1395  pulseMatricesP.data(),
1396  mahiPulseOffsets,
1397  noiseTerms.data(),
1398  electronicNoiseTerms.data(),
1399  soiSamples.data(),
1400  mahi,
1401  configParameters.useEffectivePedestals,
1402  f01HEDigis,
1403  f5HBDigis,
1404  f3HBDigis);
1405  }
1406 
1407  } // namespace hcal::reconstruction
1408 } // namespace ALPAKA_ACCELERATOR_NAMESPACE
1409 
1410 namespace alpaka::trait {
1412 
1414  template <typename TAcc>
1415  struct BlockSharedMemDynSizeBytes<Kernel_prep1d_sameNumberOfSamples, TAcc> {
1417  template <typename TVec, typename... TArgs>
1419  TVec const& threadsPerBlock,
1420  TVec const& elemsPerThread,
1421  TArgs const&...) -> std::size_t {
1422  // return the amount of dynamic shared memory needed
1423 
1424  // threadsPerBlock[1] = threads2d.x = windowSize = 8
1425  // threadsPerBlock[0] = threads2d.y = nchannels_per_block = 32
1426  // elemsPerThread = 1
1427  std::size_t bytes = threadsPerBlock[0u] * elemsPerThread[0u] *
1428  ((2 * threadsPerBlock[1u] * elemsPerThread[1u] + 2) * sizeof(float) + sizeof(uint64_t));
1429  return bytes;
1430  }
1431  };
1432 
1434  template <int NSAMPLES, int NPULSES, typename TAcc>
1435  struct BlockSharedMemDynSizeBytes<Kernel_minimize<NSAMPLES, NPULSES>, TAcc> {
1437  template <typename TVec, typename... TArgs>
1439  TVec const& threadsPerBlock,
1440  TVec const& elemsPerThread,
1441  TArgs const&...) -> std::size_t {
1442  // return the amount of dynamic shared memory needed
1443 
1444  std::size_t bytes =
1445  2 * threadsPerBlock[0u] * elemsPerThread[0u] * (calo::multifit::MapSymM<float, 8>::total * sizeof(float));
1446  return bytes;
1447  }
1448  };
1449 
1450 } // namespace alpaka::trait
ALPAKA_FN_ACC auto uniform_elements_z(TAcc const &acc, TArgs... args)
Definition: workdivision.h:351
ALPAKA_FN_ACC float get_raw_charge(double const charge, double const pedestal, float const *shrChargeMinusPedestal, float const parLin1, float const parLin2, float const parLin3, int32_t const nsamplesForCompute, int32_t const soi, int const sipmQTSShift, int const sipmQNTStoSum, float const fcByPE, int32_t const lch, bool const isqie11)
Definition: Mahi.dev.cc:230
ALPAKA_FN_ACC void operator()(TAcc const &acc, OProductType::View outputGPU, IProductTypef01::ConstView f01HEDigis, IProductTypef5::ConstView f5HBDigis, IProductTypef3::ConstView f3HBDigis, HcalMahiConditionsPortableDevice::ConstView mahi, HcalSiPMCharacteristicsPortableDevice::ConstView sipmCharacteristics, HcalRecoParamWithPulseShapeDevice::ConstView recoParamsWithPS, bool const useEffectivePedestals, int const sipmQTSShift, int const sipmQNTStoSum, int const firstSampleShift, float const ts4Thresh, float *amplitudes, float *noiseTerms, float *electronicNoiseTerms, int8_t *soiSamples, int const windowSize) const
Definition: Mahi.dev.cc:272
ALPAKA_STATIC_ACC_MEM_CONSTANT float const qie8shape[129]
Definition: Mahi.dev.cc:53
constexpr Idx divide_up_by(Idx value, Idx divisor)
Definition: workdivision.h:20
ALPAKA_FN_ACC auto uniform_groups_x(TAcc const &acc, TArgs... args)
Definition: workdivision.h:785
ALPAKA_FN_ACC void operator()(TAcc const &acc, OProductType::View outputGPU, float const *amplitudes, float *pulseMatrices, float *pulseMatricesM, float *pulseMatricesP, HcalMahiPulseOffsetsPortableDevice::ConstView pulseOffsetsView, float *noiseTerms, float *electronicNoiseTerms, int8_t *soiSamples, HcalMahiConditionsPortableDevice::ConstView mahi, bool const useEffectivePedestals, IProductTypef01::ConstView f01HEDigis, IProductTypef5::ConstView f5HBDigis, IProductTypef3::ConstView f3HBDigis) const
Definition: Mahi.dev.cc:972
static ALPAKA_FN_HOST_ACC auto getBlockSharedMemDynSizeBytes(Kernel_prep1d_sameNumberOfSamples const &, TVec const &threadsPerBlock, TVec const &elemsPerThread, TArgs const &...) -> std::size_t
Definition: Mahi.dev.cc:1418
Vec< Dim2D > Vec2D
Definition: config.h:26
static const double slope[3]
constexpr bool TSenergyCompare(std::pair< unsigned int, float > a, std::pair< unsigned int, float > b)
Definition: Mahi.dev.cc:265
EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void compute_decomposition_unrolled(MatrixType1 &L, MatrixType2 const &M)
ALPAKA_FN_ACC auto uniform_group_elements_y(TAcc const &acc, TArgs... args)
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void atomicMaxPair(const TAcc &acc, unsigned long long int *address, std::pair< unsigned int, float > value, F comparator)
Definition: atomicMaxPair.h:10
ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_time_slew_delay(float const fC, float const tzero, float const slope, float const tmax)
Definition: Mahi.dev.cc:43
ALPAKA_STATIC_ACC_MEM_CONSTANT float const qie11shape[257]
Definition: Mahi.dev.cc:63
ALPAKA_FN_ACC PulseShapeConstElement getPulseShape(uint32_t const hashedId) const
#define CMS_UNROLL_LOOP
Definition: CMSUnrollLoop.h:47
ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_coder_charge(int const qieType, uint8_t const adc, uint8_t const capid, float const *qieOffsets, float const *qieSlopes)
Definition: Mahi.dev.cc:120
ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_diff_charge_gain(int const qieType, uint8_t adc, uint8_t const capid, float const *qieOffsets, float const *qieSlopes, bool const isqie11)
Definition: Mahi.dev.cc:134
ALPAKA_FN_ACC auto independent_group_elements_y(TAcc const &acc, TArgs... args)
Eigen::Matrix< T, SIZE, 1 > ColumnVector
void runMahiAsync(Queue &queue, IProductTypef01::ConstView const &f01HEDigis, IProductTypef5::ConstView const &f5HBDigis, IProductTypef3::ConstView const &f3HBDigis, OProductType::View outputGPU, HcalMahiConditionsPortableDevice::ConstView const &mahi, HcalSiPMCharacteristicsPortableDevice::ConstView const &sipmCharacteristics, HcalRecoParamWithPulseShapeDevice::ConstView const &recoParamsWithPS, HcalMahiPulseOffsetsPortableDevice::ConstView const &mahiPulseOffsets, ConfigParameters const &configParameters)
Definition: Mahi.dev.cc:1285
constexpr uint32_t stride
Definition: HelixFit.h:22
constexpr float getTDCTime(const int tdc)
ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_pulse_shape_value(PulseShapeConstElement const &pulseShape, float const pulse_time, int const sample, int const shift)
Definition: Mahi.dev.cc:174
EIGEN_DEVICE_FUNC void fnnls(MatrixType const &AtA, VectorType const &Atb, VectorType &solution, int &npassive, ColumnVector< VectorType::RowsAtCompileTime, int > &pulseOffsets, MapSymM< float, VectorType::RowsAtCompileTime > &matrixL, double eps, const int maxIterations, const int relaxationPeriod, const int relaxationFactor)
EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void calculateChiSq(MatrixType1 const &matrixL, MatrixType2 const &pulseMatrixView, MatrixType3 const &resultAmplitudesVector, MatrixType4 const &inputAmplitudesView, float &chi2)
EIGEN_DEVICE_FUNC void solve_forward_subst_matrix(MatrixType1 &A, MatrixType2 const &pulseMatrixView, MatrixType3 const &matrixL)
ALPAKA_FN_ACC void operator()(TAcc const &acc, float *pulseMatrices, float *pulseMatricesM, float *pulseMatricesP, HcalMahiPulseOffsetsPortableDevice::ConstView pulseOffsets, float const *amplitudes, IProductTypef01::ConstView f01HEDigis, IProductTypef5::ConstView f5HBDigis, IProductTypef3::ConstView f3HBDigis, int8_t *soiSamples, HcalMahiConditionsPortableDevice::ConstView mahi, HcalRecoParamWithPulseShapeDevice::ConstView recoParamsWithPS, float const meanTime, float const timeSigmaSiPM, float const timeSigmaHPD, bool const applyTimeSlew, float const tzeroTimeSlew, float const slopeTimeSlew, float const tmaxTimeSlew) const
Definition: Mahi.dev.cc:752
T sqrt(T t)
Definition: SSEVec.h:23
Abs< T >::type abs(const T &t)
Definition: Abs.h:22
double f[11][100]
ALPAKA_FN_ACC auto independent_group_elements_x(TAcc const &acc, TArgs... args)
constexpr int subdetId() const
get the contents of the subdetector field (not cast into any detector&#39;s numbering enum) ...
Definition: DetId.h:48
Definition: value.py:1
constexpr int nsPerBX
Definition: HcalConstants.h:8
constexpr uint8_t adc_for_sample< Flavor5 >(uint16_t const *const dfstart, uint32_t const sample)
ALPAKA_FN_ACC auto uniform_groups_y(TAcc const &acc, TArgs... args)
Definition: workdivision.h:792
static const double tmax[3]
Definition: DetId.h:17
typename HcalPulseShapeSoA::ConstView::const_element PulseShapeConstElement
Definition: Mahi.dev.cc:171
EIGEN_DEVICE_FUNC void solve_forward_subst_vector(float reg_b[MatrixType1::RowsAtCompileTime], MatrixType1 inputAmplitudesView, MatrixType2 matrixL)
static ALPAKA_FN_HOST_ACC auto getBlockSharedMemDynSizeBytes(Kernel_minimize< NSAMPLES, NPULSES > const &, TVec const &threadsPerBlock, TVec const &elemsPerThread, TArgs const &...) -> std::size_t
Definition: Mahi.dev.cc:1438
unsigned long long uint64_t
Definition: Time.h:13
constexpr int maxSamples
Definition: HcalConstants.h:6
double b
Definition: hdecay.h:120
ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_reco_correction_factor(float const par1, float const par2, float const par3, float const x)
Definition: Mahi.dev.cc:112
Vec< Dim3D > Vec3D
Definition: config.h:27
ALPAKA_FN_ACC auto uniform_group_elements_x(TAcc const &acc, TArgs... args)
constexpr uint8_t capid_for_sample< Flavor3 >(uint16_t const *const dfstart, uint32_t const sample)
constexpr float UNKNOWN_T_NOTDC
double a
Definition: hdecay.h:121
static std::atomic< unsigned int > counter
static int position[264][3]
Definition: ReadPGInfo.cc:289
ALPAKA_FN_ACC ALPAKA_FN_INLINE void update_covariance(calo::multifit::ColumnVector< NPULSES > const &resultAmplitudesVector, calo::multifit::MapSymM< float, NSAMPLES > &covarianceMatrix, Eigen::Map< const calo::multifit::ColMajorMatrix< NSAMPLES, NPULSES >> const &pulseMatrix, Eigen::Map< const calo::multifit::ColMajorMatrix< NSAMPLES, NPULSES >> const &pulseMatrixM, Eigen::Map< const calo::multifit::ColMajorMatrix< NSAMPLES, NPULSES >> const &pulseMatrixP)
Definition: Mahi.dev.cc:912
col
Definition: cuy.py:1009
static unsigned int const shift
static const double tzero[3]
float x
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE uint32_t float_as_uint(float val)
Definition: Mahi.dev.cc:35
Definition: APVGainStruct.h:7
constexpr float iniTimeShift
Definition: HcalConstants.h:9
ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t did2linearIndexHE(uint32_t const didraw, int const maxDepthHE, int const maxPhiHE, int const firstHERing, int const lastHERing, int const nEtaHE)
Definition: Mahi.dev.cc:97
Eigen::Matrix< float, NROWS, NCOLS, Eigen::ColMajor > ColMajorMatrix
T1 atomicAdd(T1 *a, T2 b)
Definition: cudaCompat.h:61
constexpr RecoParamCollection::ConstView recoParamView()
static const int IPHI_MAX
Definition: HcalTopology.cc:13
uint16_t *__restrict__ uint16_t const *__restrict__ adc
ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t did2linearIndexHB(uint32_t const didraw, int const maxDepthHB, int const firstHBRing, int const lastHBRing, int const nEtaHB)
Definition: Mahi.dev.cc:89
ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t get_qiecoder_index(uint32_t const capid, uint32_t const range)
Definition: Mahi.dev.cc:108
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE float uint_as_float(uint32_t val)
Definition: Mahi.dev.cc:27