CMS 3D CMS Logo

All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Modules Pages
Mahi.dev.cc
Go to the documentation of this file.
1 #include <iostream>
2 #include <alpaka/alpaka.hpp>
3 #include <Eigen/Dense>
4 
7 
9 // needed to compile with USER_CXXFLAGS="-DCOMPUTE_TDC_TIME"
12 // TODO reuse some of the HCAL constats from
13 //#include "RecoLocalCalo/HcalRecAlgos/interface/HcalConstants.h"
14 
15 #include "Mahi.h"
16 
17 #ifdef HCAL_MAHI_GPUDEBUG
18 #define DETID_TO_DEBUG 1125647428
19 #endif
20 
22 
23  using namespace cms::alpakatools;
25  namespace mahi {
26 
27  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE float uint_as_float(uint32_t val) {
28 #if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
29  return __uint_as_float(val);
30 #else
31  return edm::bit_cast<float>(val);
32 #endif
33  }
34 
35  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE uint32_t float_as_uint(float val) {
36 #if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
37  return __float_as_uint(val);
38 #else
39  return edm::bit_cast<unsigned int>(val);
40 #endif
41  }
42 
43  ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_time_slew_delay(float const fC,
44  float const tzero,
45  float const slope,
46  float const tmax) {
47  auto const rawDelay = tzero + slope * std::log(fC);
48  return rawDelay < 0 ? 0 : (rawDelay > tmax ? tmax : rawDelay);
49  }
50 
51  // HcalQIEShapes are hardcoded in HcalQIEData.cc basically
52  // + some logic to generate 128 and 256 value arrays...
53  ALPAKA_STATIC_ACC_MEM_CONSTANT float const qie8shape[129] = {
54  -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16,
55  18, 20, 22, 24, 26, 28, 31, 34, 37, 40, 44, 48, 52, 57, 62, 57, 62,
56  67, 72, 77, 82, 87, 92, 97, 102, 107, 112, 117, 122, 127, 132, 142, 152, 162,
57  172, 182, 192, 202, 217, 232, 247, 262, 282, 302, 322, 347, 372, 347, 372, 397, 422,
58  447, 472, 497, 522, 547, 572, 597, 622, 647, 672, 697, 722, 772, 822, 872, 922, 972,
59  1022, 1072, 1147, 1222, 1297, 1372, 1472, 1572, 1672, 1797, 1922, 1797, 1922, 2047, 2172, 2297, 2422,
60  2547, 2672, 2797, 2922, 3047, 3172, 3297, 3422, 3547, 3672, 3922, 4172, 4422, 4672, 4922, 5172, 5422,
61  5797, 6172, 6547, 6922, 7422, 7922, 8422, 9047, 9672, 10297};
62 
63  ALPAKA_STATIC_ACC_MEM_CONSTANT float const qie11shape[257] = {
64  -0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5,
65  11.5, 12.5, 13.5, 14.5, 15.5, 17.5, 19.5, 21.5, 23.5, 25.5, 27.5, 29.5,
66  31.5, 33.5, 35.5, 37.5, 39.5, 41.5, 43.5, 45.5, 47.5, 49.5, 51.5, 53.5,
67  55.5, 59.5, 63.5, 67.5, 71.5, 75.5, 79.5, 83.5, 87.5, 91.5, 95.5, 99.5,
68  103.5, 107.5, 111.5, 115.5, 119.5, 123.5, 127.5, 131.5, 135.5, 139.5, 147.5, 155.5,
69  163.5, 171.5, 179.5, 187.5, 171.5, 179.5, 187.5, 195.5, 203.5, 211.5, 219.5, 227.5,
70  235.5, 243.5, 251.5, 259.5, 267.5, 275.5, 283.5, 291.5, 299.5, 315.5, 331.5, 347.5,
71  363.5, 379.5, 395.5, 411.5, 427.5, 443.5, 459.5, 475.5, 491.5, 507.5, 523.5, 539.5,
72  555.5, 571.5, 587.5, 603.5, 619.5, 651.5, 683.5, 715.5, 747.5, 779.5, 811.5, 843.5,
73  875.5, 907.5, 939.5, 971.5, 1003.5, 1035.5, 1067.5, 1099.5, 1131.5, 1163.5, 1195.5, 1227.5,
74  1259.5, 1291.5, 1355.5, 1419.5, 1483.5, 1547.5, 1611.5, 1675.5, 1547.5, 1611.5, 1675.5, 1739.5,
75  1803.5, 1867.5, 1931.5, 1995.5, 2059.5, 2123.5, 2187.5, 2251.5, 2315.5, 2379.5, 2443.5, 2507.5,
76  2571.5, 2699.5, 2827.5, 2955.5, 3083.5, 3211.5, 3339.5, 3467.5, 3595.5, 3723.5, 3851.5, 3979.5,
77  4107.5, 4235.5, 4363.5, 4491.5, 4619.5, 4747.5, 4875.5, 5003.5, 5131.5, 5387.5, 5643.5, 5899.5,
78  6155.5, 6411.5, 6667.5, 6923.5, 7179.5, 7435.5, 7691.5, 7947.5, 8203.5, 8459.5, 8715.5, 8971.5,
79  9227.5, 9483.5, 9739.5, 9995.5, 10251.5, 10507.5, 11019.5, 11531.5, 12043.5, 12555.5, 13067.5, 13579.5,
80  12555.5, 13067.5, 13579.5, 14091.5, 14603.5, 15115.5, 15627.5, 16139.5, 16651.5, 17163.5, 17675.5, 18187.5,
81  18699.5, 19211.5, 19723.5, 20235.5, 20747.5, 21771.5, 22795.5, 23819.5, 24843.5, 25867.5, 26891.5, 27915.5,
82  28939.5, 29963.5, 30987.5, 32011.5, 33035.5, 34059.5, 35083.5, 36107.5, 37131.5, 38155.5, 39179.5, 40203.5,
83  41227.5, 43275.5, 45323.5, 47371.5, 49419.5, 51467.5, 53515.5, 55563.5, 57611.5, 59659.5, 61707.5, 63755.5,
84  65803.5, 67851.5, 69899.5, 71947.5, 73995.5, 76043.5, 78091.5, 80139.5, 82187.5, 84235.5, 88331.5, 92427.5,
85  96523.5, 100620, 104716, 108812, 112908};
86 
87  constexpr int32_t IPHI_MAX = 72;
88 
89  ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t did2linearIndexHB(
90  uint32_t const didraw, int const maxDepthHB, int const firstHBRing, int const lastHBRing, int const nEtaHB) {
91  HcalDetId did{didraw};
92  uint32_t const value = (did.depth() - 1) + maxDepthHB * (did.iphi() - 1);
93  return did.ieta() > 0
94  ? value + maxDepthHB * hcal::reconstruction::mahi::IPHI_MAX * (did.ieta() - firstHBRing)
95  : value + maxDepthHB * hcal::reconstruction::mahi::IPHI_MAX * (did.ieta() + lastHBRing + nEtaHB);
96  }
97  ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t did2linearIndexHE(uint32_t const didraw,
98  int const maxDepthHE,
99  int const maxPhiHE,
100  int const firstHERing,
101  int const lastHERing,
102  int const nEtaHE) {
103  HcalDetId did{didraw};
104  uint32_t const value = (did.depth() - 1) + maxDepthHE * (did.iphi() - 1);
105  return did.ieta() > 0 ? value + maxDepthHE * maxPhiHE * (did.ieta() - firstHERing)
106  : value + maxDepthHE * maxPhiHE * (did.ieta() + lastHERing + nEtaHE);
107  }
108  ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t get_qiecoder_index(uint32_t const capid, uint32_t const range) {
109  return capid * 4 + range;
110  }
111 
112  ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_reco_correction_factor(float const par1,
113  float const par2,
114  float const par3,
115  float const x) {
116  return par3 * x * x + par2 * x + par1;
117  }
118 
119  // compute the charge using the adc, qie type and the appropriate qie shape array
120  ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_coder_charge(
121  int const qieType, uint8_t const adc, uint8_t const capid, float const* qieOffsets, float const* qieSlopes) {
122  auto const range = qieType == 0 ? (adc >> 5) & 0x3 : (adc >> 6) & 0x3;
123  auto const* qieShapeToUse = qieType == 0 ? qie8shape : qie11shape;
124  auto const nbins = qieType == 0 ? 32 : 64;
125  auto const center = adc % nbins == nbins - 1 ? 0.5 * (3 * qieShapeToUse[adc] - qieShapeToUse[adc - 1])
126  : 0.5 * (qieShapeToUse[adc] + qieShapeToUse[adc + 1]);
127  auto const index = get_qiecoder_index(capid, range);
128  return (center - qieOffsets[index]) / qieSlopes[index];
129  }
130 
131  // this is from
132  // https://github.com/cms-sw/cmssw/blob/master/RecoLocalCalo/HcalRecProducers/src/HBHEPhase1Reconstructor.cc#L140
133 
134  ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_diff_charge_gain(int const qieType,
135  uint8_t adc,
136  uint8_t const capid,
137  float const* qieOffsets,
138  float const* qieSlopes,
139  bool const isqie11) {
140  constexpr uint32_t mantissaMaskQIE8 = 0x1fu;
141  constexpr uint32_t mantissaMaskQIE11 = 0x3f;
142  auto const mantissaMask = isqie11 ? mantissaMaskQIE11 : mantissaMaskQIE8;
143  auto const q = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
144  auto const mantissa = adc & mantissaMask;
145 
146  if (mantissa == 0u || mantissa == mantissaMask - 1u)
147  return compute_coder_charge(qieType, adc + 1u, capid, qieOffsets, qieSlopes) - q;
148  else if (mantissa == 1u || mantissa == mantissaMask)
149  return q - compute_coder_charge(qieType, adc - 1u, capid, qieOffsets, qieSlopes);
150  else {
151  auto const qup = compute_coder_charge(qieType, adc + 1u, capid, qieOffsets, qieSlopes);
152  auto const qdown = compute_coder_charge(qieType, adc - 1u, capid, qieOffsets, qieSlopes);
153  auto const upgain = qup - q;
154  auto const downgain = q - qdown;
155  auto const averagegain = (qup - qdown) / 2.f;
156  if (std::abs(upgain - downgain) < 0.01f * averagegain)
157  return averagegain;
158  else {
159  auto const q2up = compute_coder_charge(qieType, adc + 2u, capid, qieOffsets, qieSlopes);
160  auto const q2down = compute_coder_charge(qieType, adc - 2u, capid, qieOffsets, qieSlopes);
161  auto const upgain2 = q2up - qup;
162  auto const downgain2 = qdown - q2down;
163  if (std::abs(upgain2 - upgain) < std::abs(downgain2 - downgain))
164  return upgain;
165  else
166  return downgain;
167  }
168  }
169  }
170 
171  using PulseShapeConstElement = typename HcalPulseShapeSoA::ConstView::const_element;
172  // TODO: remove what's not needed
173  // originally from from RecoLocalCalo/HcalRecAlgos/src/PulseShapeFunctor.cc
174  ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_pulse_shape_value(PulseShapeConstElement const& pulseShape,
175  float const pulse_time,
176  int const sample,
177  int const shift) {
178  auto const& acc25nsVec = pulseShape.acc25nsVec();
179  auto const& diff25nsItvlVec = pulseShape.diff25nsItvlVec();
180  auto const& accVarLenIdxMinusOneVec = pulseShape.accVarLenIdxMinusOneVec();
181  auto const& diffVarItvlIdxMinusOneVec = pulseShape.diffVarItvlIdxMinusOneVec();
182  auto const& accVarLenIdxZeroVec = pulseShape.accVarLenIdxZEROVec();
183  auto const& diffVarItvlIdxZeroVec = pulseShape.diffVarItvlIdxZEROVec();
184 
185  // constants
186  constexpr float slew = 0.f;
187  constexpr auto ns_per_bx = ::hcal::constants::nsPerBX;
188 
189  // FIXME: clean up all the rounding... this is coming from original cpu version
190  float const i_start_float = -::hcal::constants::iniTimeShift - pulse_time - slew > 0.f
191  ? 0.f
192  : std::abs(-::hcal::constants::iniTimeShift - pulse_time - slew) + 1.f;
193  int i_start = static_cast<int>(i_start_float);
194  float offset_start = static_cast<float>(i_start) - ::hcal::constants::iniTimeShift - pulse_time - slew;
195 
196  // boundary
197  if (offset_start == 1.0f) {
198  offset_start = 0.f;
199  i_start -= 1;
200  }
201 
202  int const bin_start = static_cast<int>(offset_start);
203  float const bin_start_up = static_cast<float>(bin_start) + 0.5f;
204  int const bin_0_start = offset_start < bin_start_up ? bin_start - 1 : bin_start;
205  int const its_start = i_start / ns_per_bx;
206  int const distTo25ns_start = ns_per_bx - 1 - i_start % ns_per_bx;
207  auto const factor = offset_start - static_cast<float>(bin_0_start) - 0.5;
208 
209  auto const sample_over10ts = sample + shift;
210  float value = 0.0f;
211  if (sample_over10ts == its_start) {
212  value = bin_0_start == -1
213  ? accVarLenIdxMinusOneVec[distTo25ns_start] + factor * diffVarItvlIdxMinusOneVec[distTo25ns_start]
214  : accVarLenIdxZeroVec[distTo25ns_start] + factor * diffVarItvlIdxZeroVec[distTo25ns_start];
215  } else if (sample_over10ts > its_start) {
216  int const bin_idx = distTo25ns_start + 1 + (sample_over10ts - its_start - 1) * ns_per_bx + bin_0_start;
217  value = acc25nsVec[bin_idx] + factor * diff25nsItvlVec[bin_idx];
218  }
219  return value;
220  }
221 
222  // TODO: provide constants from configuration
223  // from RecoLocalCalo/HcalRecProducers/python/HBHEMahiParameters_cfi.py
226  constexpr double nnlsThresh = 1e-11;
228 
229  // from RecoLocalCalo/HcalRecProducers/src/HBHEPhase1Reconstructor.cc
230  ALPAKA_FN_ACC float get_raw_charge(double const charge,
231  double const pedestal,
232  float const* shrChargeMinusPedestal,
233  float const parLin1,
234  float const parLin2,
235  float const parLin3,
236  int32_t const nsamplesForCompute,
237  int32_t const soi,
238  int const sipmQTSShift,
239  int const sipmQNTStoSum,
240  float const fcByPE,
241  int32_t const lch,
242  bool const isqie11) {
243  float rawCharge;
244 
245  if (!isqie11)
246  rawCharge = charge;
247  else {
248  int const first = std::max(soi + sipmQTSShift, 0);
249  int const last = std::min(soi + sipmQNTStoSum, nsamplesForCompute);
250  float sipmq = 0.0f;
251  for (auto ts = first; ts < last; ts++)
252  sipmq += shrChargeMinusPedestal[lch * nsamplesForCompute + ts];
253  auto const effectivePixelsFired = sipmq / fcByPE;
254  auto const factor = compute_reco_correction_factor(parLin1, parLin2, parLin3, effectivePixelsFired);
255  rawCharge = (charge - pedestal) * factor + pedestal;
256 
257 #ifdef HCAL_MAHI_GPUDEBUG
258  printf("first = %d last = %d sipmQ = %f factor = %f rawCharge = %f\n", first, last, sipmq, factor, rawCharge);
259 #endif
260  }
261  return rawCharge;
262  }
263 
264  // Tried using lambda, but strange error from with nvcc
265  inline constexpr bool TSenergyCompare(std::pair<unsigned int, float> a, std::pair<unsigned int, float> b) {
266  return a.second > b.second;
267  };
268 
270  public:
271  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
272  ALPAKA_FN_ACC void operator()(TAcc const& acc,
273  OProductType::View outputGPU,
274  IProductTypef01::ConstView f01HEDigis,
275  IProductTypef5::ConstView f5HBDigis,
276  IProductTypef3::ConstView f3HBDigis,
277  HcalMahiConditionsPortableDevice::ConstView mahi,
278  HcalSiPMCharacteristicsPortableDevice::ConstView sipmCharacteristics,
280  bool const useEffectivePedestals,
281  int const sipmQTSShift,
282  int const sipmQNTStoSum,
283  int const firstSampleShift,
284  float const ts4Thresh,
285  float* amplitudes,
286  float* noiseTerms,
287  float* electronicNoiseTerms,
288  int8_t* soiSamples,
289  int const windowSize) const {
290  auto const nchannelsf015 = f01HEDigis.size() + f5HBDigis.size();
291  auto const startingSample = compute_nsamples<Flavor1>(f01HEDigis.stride()) - windowSize;
292  auto const nchannels = f01HEDigis.size() + f5HBDigis.size() + f3HBDigis.size();
293 
294  //first index = groups of channel
295  auto const nchannels_per_block(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
296  //2nd index = groups of sample
297  auto const nsamplesForCompute(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[1u]);
298 
299  // configure shared mem
300  float* shrEnergyM0PerTS = alpaka::getDynSharedMem<float>(acc);
301  float* shrChargeMinusPedestal = shrEnergyM0PerTS + nsamplesForCompute * nchannels_per_block;
302  float* shrMethod0EnergyAccum = shrChargeMinusPedestal + nsamplesForCompute * nchannels_per_block;
303  float* shrEnergyM0TotalAccum = shrMethod0EnergyAccum + nchannels_per_block;
304  unsigned long long int* shrMethod0EnergySamplePair =
305  reinterpret_cast<unsigned long long int*>(shrEnergyM0TotalAccum + nchannels_per_block);
306 
307  //Loop over all groups of channels
308  for (auto group : uniform_groups_y(acc, nchannels)) {
309  //Loop over each channel, first compute soiSamples and shrMem for all channels
310  for (auto channel : uniform_group_elements_y(acc, group, nchannels)) {
311  auto const gch = channel.global;
312  auto const lch = channel.local;
313 
314  for (auto i_sample : independent_group_elements_x(acc, nsamplesForCompute)) {
315  auto const sampleWithinWindow = i_sample;
316  auto const sample = i_sample + startingSample;
317  auto const linearThPerBlock = i_sample + channel.local * nsamplesForCompute;
318  // initialize
319  if (sampleWithinWindow == 0) {
320  soiSamples[gch] = -1;
321  shrMethod0EnergyAccum[lch] = 0;
322  shrMethod0EnergySamplePair[lch] = 0;
323  shrEnergyM0TotalAccum[lch] = 0;
324  }
325 
326  // compute soiSamples
327  if (gch < f01HEDigis.size()) {
328  // NOTE: assume that soi is high only for a single guy!
329  // which must be the case. cpu version does not check for that
330  // if that is not the case, we will see that with cuda mmecheck
331  auto const soibit = soibit_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample);
332  if (soibit == 1)
333  soiSamples[gch] = sampleWithinWindow;
334  } else if (gch >= nchannelsf015) {
335  auto const soibit = soibit_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample);
336  if (soibit == 1)
337  soiSamples[gch] = sampleWithinWindow;
338  }
339 
340  // compute shrMem
341  auto const id = gch < f01HEDigis.size()
342  ? f01HEDigis.ids()[gch]
343  : (gch < nchannelsf015 ? f5HBDigis.ids()[gch - f01HEDigis.size()]
344  : f3HBDigis.ids()[gch - nchannelsf015]);
345  auto const did = HcalDetId{id};
346 
347  auto const adc =
348  gch < f01HEDigis.size()
349  ? adc_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample)
350  : (gch < nchannelsf015
351  ? adc_for_sample<Flavor5>(&(f5HBDigis.data()[gch - f01HEDigis.size()][0]), sample)
352  : adc_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
353  auto const capid =
354  gch < f01HEDigis.size()
355  ? capid_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample)
356  : (gch < nchannelsf015
357  ? capid_for_sample<Flavor5>(&(f5HBDigis.data()[gch - f01HEDigis.size()][0]), sample)
358  : capid_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
359 
360  // compute hash for this did
361  // hash needed to convert condition arrays order (based on Topo) into digi arrays order(based on FED)
362  auto const hashedId =
363  did.subdetId() == HcalBarrel
364  ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
365  : did2linearIndexHE(id,
366  mahi.maxDepthHE(),
367  mahi.maxPhiHE(),
368  mahi.firstHERing(),
369  mahi.lastHERing(),
370  mahi.nEtaHE()) +
371  mahi.offsetForHashes();
372 
373  // conditions based on the hash
374  auto const qieType = mahi.qieTypes_values()[hashedId] > 0 ? 1 : 0; // 2 types at this point
375  auto const* qieOffsets = mahi.qieCoders_offsets()[hashedId].data();
376  auto const* qieSlopes = mahi.qieCoders_slopes()[hashedId].data();
377  auto const pedestal = mahi.pedestals_value()[hashedId][capid];
378 
379  // compute charge
380  auto const charge = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
381 
382  shrChargeMinusPedestal[linearThPerBlock] = charge - pedestal;
383  }
384  }
385  alpaka::syncBlockThreads(acc);
386 
387  //Loop over each channel, compute input for multifit using shrChargeMinusPedestal
388  for (auto channel : uniform_group_elements_y(acc, group, nchannels)) {
389  auto const gch = channel.global;
390  auto const lch = channel.local;
391  for (auto i_sample : independent_group_elements_x(acc, nsamplesForCompute)) {
392  auto const sampleWithinWindow = i_sample;
393  auto const sample = i_sample + startingSample;
394 
395  // initialize all output buffers
396  if (sampleWithinWindow == 0) {
397  outputGPU.detId()[gch] = 0;
398  outputGPU.energyM0()[gch] = 0;
399  outputGPU.timeM0()[gch] = 0;
400  outputGPU.energy()[gch] = 0;
401  outputGPU.chi2()[gch] = 0;
402  }
403 
404  // offset output
405  auto* amplitudesForChannel = amplitudes + nsamplesForCompute * gch;
406  auto* noiseTermsForChannel = noiseTerms + nsamplesForCompute * gch;
407  auto* electronicNoiseTermsForChannel = electronicNoiseTerms + nsamplesForCompute * gch;
408 
409  // get event input quantities
410  auto const stride = gch < f01HEDigis.size()
411  ? f01HEDigis.stride()
412  : (gch < f5HBDigis.size() ? f5HBDigis.stride() : f3HBDigis.stride());
413  auto const nsamples =
414  gch < f01HEDigis.size()
415  ? compute_nsamples<Flavor1>(stride)
416  : (gch < nchannelsf015 ? compute_nsamples<Flavor5>(stride) : compute_nsamples<Flavor3>(stride));
417 
418  ALPAKA_ASSERT_ACC(nsamples == nsamplesForCompute || nsamples - startingSample == nsamplesForCompute);
419 
420  auto const id = gch < f01HEDigis.size()
421  ? f01HEDigis.ids()[gch]
422  : (gch < nchannelsf015 ? f5HBDigis.ids()[gch - f01HEDigis.size()]
423  : f3HBDigis.ids()[gch - nchannelsf015]);
424  auto const did = HcalDetId{id};
425 
426  auto const adc =
427  gch < f01HEDigis.size()
428  ? adc_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample)
429  : (gch < nchannelsf015
430  ? adc_for_sample<Flavor5>(&(f5HBDigis.data()[gch - f01HEDigis.size()][0]), sample)
431  : adc_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
432  auto const capid =
433  gch < f01HEDigis.size()
434  ? capid_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample)
435  : (gch < nchannelsf015
436  ? capid_for_sample<Flavor5>(&(f5HBDigis.data()[gch - f01HEDigis.size()][0]), sample)
437  : capid_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
438 
439  // compute hash for this did
440  // hash needed to convert condition arrays order (based on Topo) into digi arrays order(based on FED)
441  auto const hashedId =
442  did.subdetId() == HcalBarrel
443  ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
444  : did2linearIndexHE(id,
445  mahi.maxDepthHE(),
446  mahi.maxPhiHE(),
447  mahi.firstHERing(),
448  mahi.lastHERing(),
449  mahi.nEtaHE()) +
450  mahi.offsetForHashes();
451 
452  // conditions based on the hash
453  auto const qieType = mahi.qieTypes_values()[hashedId] > 0 ? 1 : 0; // 2 types at this point
454  auto const* qieOffsets = mahi.qieCoders_offsets()[hashedId].data();
455  auto const* qieSlopes = mahi.qieCoders_slopes()[hashedId].data();
456  auto const* pedestalWidthsForChannel =
457  useEffectivePedestals && (gch < f01HEDigis.size() || gch >= nchannelsf015)
458  ? mahi.effectivePedestalWidths()[hashedId].data()
459  : mahi.pedestals_width()[hashedId].data();
460 
461  auto const gain = mahi.gains_value()[hashedId][capid];
462  auto const gain0 = mahi.gains_value()[hashedId][0];
463  auto const respCorrection = mahi.respCorrs_values()[hashedId];
464  auto const pedestal = mahi.pedestals_value()[hashedId][capid];
465  auto const pedestalWidth = pedestalWidthsForChannel[capid];
466  // if needed, only use effective pedestals for f01
467  auto const pedestalToUseForMethod0 =
468  useEffectivePedestals && (gch < f01HEDigis.size() || gch >= nchannelsf015)
469  ? mahi.effectivePedestals()[hashedId][capid]
470  : pedestal;
471  auto const sipmType = mahi.sipmPar_type()[hashedId];
472  auto const fcByPE = mahi.sipmPar_fcByPE()[hashedId];
473  auto const recoParam1 = recoParamsWithPS.recoParamView().param1()[hashedId];
474  auto const recoParam2 = recoParamsWithPS.recoParamView().param2()[hashedId];
475 
476 #ifdef HCAL_MAHI_GPUDEBUG
477  printf("qieType = %d qieOffset0 = %f qieOffset1 = %f qieSlope0 = %f qieSlope1 = %f\n",
478  qieType,
479  qieOffsets[0],
480  qieOffsets[1],
481  qieSlopes[0],
482  qieSlopes[1]);
483 #endif
484 
485  // compute charge
486  auto const charge = compute_coder_charge(qieType, adc, capid, qieOffsets, qieSlopes);
487 
488  int32_t const soi =
489  gch < f01HEDigis.size()
490  ? soiSamples[gch]
491  : (gch < nchannelsf015 ? f5HBDigis.npresamples()[gch - f01HEDigis.size()] : soiSamples[gch]);
492 
493  bool badSOI = (soi < 0 or static_cast<unsigned>(soi) >= nsamplesForCompute);
494  if (badSOI and sampleWithinWindow == 0) {
495 #ifdef GPU_DEBUG
496  printf("Found HBHE channel %d with invalid SOI %d\n", gch, soi);
497 #endif
498  // mark the channel as bad
499  outputGPU.chi2()[gch] = -9999.f;
500  }
501 
502  // type index starts from 1 .. 6
503  // precicionItem index starts from 0 .. 5
504  auto const precisionItem = sipmCharacteristics.precisionItem()[sipmType - 1];
505  auto const parLin1 = precisionItem.parLin1_;
506  auto const parLin2 = precisionItem.parLin2_;
507  auto const parLin3 = precisionItem.parLin3_;
508 
509  //int32_t const soi = gch >= nchannelsf01HE
510  // ? npresamplesf5HB[gch - nchannelsf01HE]
511  // : soiSamples[gch];
512  // this is here just to make things uniform...
513  if (gch >= f01HEDigis.size() && gch < nchannelsf015 && sampleWithinWindow == 0)
514  soiSamples[gch] = f5HBDigis.npresamples()[gch - f01HEDigis.size()];
515 
516  //
517  // compute various quantities (raw charge and tdc stuff)
518  // NOTE: this branch will be divergent only for a single warp that
519  // sits on the boundary when flavor 01 channels end and flavor 5 start
520  //
521  float const rawCharge = get_raw_charge(charge,
522  pedestal,
523  shrChargeMinusPedestal,
524  parLin1,
525  parLin2,
526  parLin3,
527  nsamplesForCompute,
528  soi,
529  sipmQTSShift,
531  fcByPE,
532  lch,
533  gch < f01HEDigis.size() || gch >= nchannelsf015);
534 
535  auto const dfc = compute_diff_charge_gain(
536  qieType, adc, capid, qieOffsets, qieSlopes, gch < f01HEDigis.size() || gch >= nchannelsf015);
537 
538 #ifdef COMPUTE_TDC_TIME
539  float tdcTime;
540  if (gch >= f01HEDigis.size() && gch < nchannelsf015) {
542  } else {
543  if (gch < f01HEDigis.size())
544  tdcTime =
545  HcalSpecialTimes::getTDCTime(tdc_for_sample<Flavor1>(&(f01HEDigis.data()[gch][0]), sample));
546  else if (gch >= nchannelsf015)
548  tdc_for_sample<Flavor3>(&(f3HBDigis.data()[gch - nchannelsf015][0]), sample));
549  }
550 #endif // COMPUTE_TDC_TIME
551 
552  // compute method 0 quantities
553  // TODO: need to apply containment
554  // TODO: need to apply time slew
555  // TODO: for < run 3, apply HBM legacy energy correction
556  auto const nsamplesToAdd = recoParam1 < 10 ? recoParam2 : (recoParam1 >> 14) & 0xF;
557  auto const startSampleTmp = soi + firstSampleShift;
558  auto const startSample = startSampleTmp < 0 ? 0 : startSampleTmp;
559  auto const endSample =
560  startSample + nsamplesToAdd < nsamplesForCompute ? startSample + nsamplesToAdd : nsamplesForCompute;
561  // NOTE: gain is a small number < 10^-3, multiply it last
562  auto const energym0_per_ts = gain * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
563  auto const energym0_per_ts_gain0 = gain0 * ((rawCharge - pedestalToUseForMethod0) * respCorrection);
564  // store to shared mem
565  shrEnergyM0PerTS[lch * nsamplesForCompute + sampleWithinWindow] = energym0_per_ts;
566 
568  acc, &shrEnergyM0TotalAccum[lch], energym0_per_ts_gain0, alpaka::hierarchy::Threads{});
569 
570 #ifdef HCAL_MAHI_GPUDEBUG
571  printf(
572  "id = %u sample = %d gch = %d hashedId = %u adc = %u capid = %u\n"
573  " charge = %f rawCharge = %f dfc = %f pedestal = %f\n"
574  " gain = %f respCorrection = %f energym0_per_ts = %f\n",
575  id,
576  sample,
577  gch,
578  hashedId,
579  adc,
580  capid,
581  charge,
582  rawCharge,
583  dfc,
584  pedestalToUseForMethod0,
585  gain,
586  respCorrection,
587  energym0_per_ts);
588  printf("startSample = %d endSample = %d param1 = %u param2 = %u\n",
589  startSample,
590  endSample,
591  recoParam1,
592  recoParam2);
593 #endif
594  //Find the max energy of lch channel and the corresponding TS
595  if (sampleWithinWindow >= static_cast<unsigned>(startSample) &&
596  sampleWithinWindow < static_cast<unsigned>(endSample)) {
597  //sum the energys of all TS
598  alpaka::atomicAdd(acc, &shrMethod0EnergyAccum[lch], energym0_per_ts, alpaka::hierarchy::Threads{});
599  // pair TS and E for all TSs, find max pair according to E
600  // TODO: Non-deterministic behavior for TS with equal energy
602  acc, &shrMethod0EnergySamplePair[lch], {sampleWithinWindow, energym0_per_ts}, TSenergyCompare);
603  }
604 
605  // NOTE: must take soi, as values for that thread are used...
606  // NOTE: does not run if soi is bad, because it does not match any sampleWithinWindow
607  if (sampleWithinWindow == static_cast<unsigned>(soi)) {
608  // Channel quality check
609  // https://github.com/cms-sw/cmssw/blob/master/RecoLocalCalo/HcalRecAlgos/plugins/HcalChannelPropertiesEP.cc#L107-L109
610  // https://github.com/cms-sw/cmssw/blob/6d2f66057131baacc2fcbdd203588c41c885b42c/CondCore/HcalPlugins/plugins/HcalChannelQuality_PayloadInspector.cc#L30
611  // const bool taggedBadByDb = severity.dropChannel(digistatus->getValue());
612  // do not run MAHI if taggedBadByDb = true
613  auto const digiStatus_ = mahi.channelQuality_status()[hashedId];
614  const bool taggedBadByDb = (digiStatus_ / 32770);
615 
616  if (taggedBadByDb)
617  outputGPU.chi2()[gch] = -9999.f;
618 
619  // check as in cpu version if mahi is not needed
620  // FIXME: KNOWN ISSUE: observed a problem when rawCharge and pedestal
621  // are basically equal and generate -0.00000...
622  // needs to be treated properly
623  if (!(shrEnergyM0TotalAccum[lch] > 0 && energym0_per_ts_gain0 > ts4Thresh)) {
624  // do not need to run mahi minimization
625  //outputEnergy[gch] = 0; energy already inited to 0
626  outputGPU.chi2()[gch] = -9999.f;
627  }
628  }
629  //
630  // preparations for mahi fit
631  //
632  auto const amplitude = rawCharge - pedestalToUseForMethod0;
633  auto const noiseADC = (1. / std::sqrt(12)) * dfc;
634  auto const noisePhotoSq = amplitude > pedestalWidth ? (amplitude * fcByPE) : 0.f;
635  auto const noiseTerm = noiseADC * noiseADC + noisePhotoSq + pedestalWidth * pedestalWidth;
636 
637  // store to global memory
638  amplitudesForChannel[sampleWithinWindow] = amplitude;
639  noiseTermsForChannel[sampleWithinWindow] = noiseTerm;
640  electronicNoiseTermsForChannel[sampleWithinWindow] = pedestalWidth;
641 
642  } //end sample loop
643  } // end channel loop
644  alpaka::syncBlockThreads(acc);
645 
646  // compute energy using shrMethod0EnergySamplePair
647  for (auto channel : uniform_group_elements_y(acc, group, nchannels)) {
648  auto const gch = channel.global;
649  auto const lch = channel.local;
650  for (auto i_sample : independent_group_elements_x(acc, nsamplesForCompute)) {
651  auto const sampleWithinWindow = i_sample;
652 
653  int32_t const soi =
654  gch < f01HEDigis.size()
655  ? soiSamples[gch]
656  : (gch < nchannelsf015 ? f5HBDigis.npresamples()[gch - f01HEDigis.size()] : soiSamples[gch]);
657 
658  auto const id = gch < f01HEDigis.size()
659  ? f01HEDigis.ids()[gch]
660  : (gch < nchannelsf015 ? f5HBDigis.ids()[gch - f01HEDigis.size()]
661  : f3HBDigis.ids()[gch - nchannelsf015]);
662  auto const did = HcalDetId{id};
663  auto const hashedId =
664  did.subdetId() == HcalBarrel
665  ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
666  : did2linearIndexHE(id,
667  mahi.maxDepthHE(),
668  mahi.maxPhiHE(),
669  mahi.firstHERing(),
670  mahi.lastHERing(),
671  mahi.nEtaHE()) +
672  mahi.offsetForHashes();
673 
674  auto const recoParam1 = recoParamsWithPS.recoParamView().param1()[hashedId];
675  auto const recoParam2 = recoParamsWithPS.recoParamView().param2()[hashedId];
676 
677  auto const nsamplesToAdd = recoParam1 < 10 ? recoParam2 : (recoParam1 >> 14) & 0xF;
678  // NOTE: must take soi, as values for that thread are used...
679  // NOTE: does not run if soi is bad, because it does not match any sampleWithinWindow
680  if (sampleWithinWindow == static_cast<unsigned>(soi)) {
681  auto const method0_energy = shrMethod0EnergyAccum[lch];
682  auto const val = shrMethod0EnergySamplePair[lch];
683  int const max_sample = (val >> 32) & 0xffffffff;
684 
685  float const max_energy = uint_as_float(static_cast<uint32_t>(val & 0xffffffff));
686 
687  float const max_energy_1 = static_cast<unsigned>(max_sample) < nsamplesForCompute - 1
688  ? shrEnergyM0PerTS[lch * nsamplesForCompute + max_sample + 1]
689  : 0.f;
690  float const position = nsamplesToAdd < nsamplesForCompute ? max_sample - soi : max_sample;
691  auto const sum = max_energy + max_energy_1;
692  // FIXME: for full comparison with cpu method 0 timing,
693  // need to correct by slew
694  // requires an accumulator -> more shared mem -> omit here unless
695  // really needed
696  float const time =
697  max_energy > 0.f && max_energy_1 > 0.f ? 25.f * (position + max_energy_1 / sum) : 25.f * position;
698 
699  // store method0 quantities to global mem
700  outputGPU.detId()[gch] = id;
701  outputGPU.energyM0()[gch] = method0_energy;
702  outputGPU.timeM0()[gch] = time;
703 
704 #ifdef HCAL_MAHI_GPUDEBUG
705  printf("tsTOT = %f tstrig = %f ts4Thresh = %f\n",
706  shrEnergyM0TotalAccum[lch],
707  energym0_per_ts_gain0,
708  ts4Thresh);
709 #endif
710 
711 #ifdef HCAL_MAHI_GPUDEBUG
712  printf(" method0_energy = %f max_sample = %d max_energy = %f time = %f\n",
713  method0_energy,
714  max_sample,
715  max_energy,
716  time);
717 #endif
718  }
719 #ifdef HCAL_MAHI_GPUDEBUG
720  printf(
721  "charge(%d) = %f pedestal(%d) = %f dfc(%d) = %f pedestalWidth(%d) = %f noiseADC(%d) = %f "
722  "noisPhoto(%d) =%f outputGPU chi2[gch] = %f \n",
723  sample,
724  rawCharge,
725  sample,
726  pedestalToUseForMethod0,
727  sample,
728  dfc,
729  sample,
730  pedestalWidth,
731  sample,
732  noiseADC,
733  sample,
734  noisePhotoSq,
735  outputGPU.chi2()[gch]);
736 #endif
737 
738  } // loop for sample
739  } // loop for channels
740  } // loop for channgel groups
741  }
742  }; //Kernel_prep1d_sameNumberOfSamples
743 
745  public:
746  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
747  ALPAKA_FN_ACC void operator()(TAcc const& acc,
748  float* pulseMatrices,
749  float* pulseMatricesM,
750  float* pulseMatricesP,
751  HcalMahiPulseOffsetsPortableDevice::ConstView pulseOffsets,
752  float const* amplitudes,
753  IProductTypef01::ConstView f01HEDigis,
754  IProductTypef5::ConstView f5HBDigis,
755  IProductTypef3::ConstView f3HBDigis,
756  int8_t* soiSamples,
757  HcalMahiConditionsPortableDevice::ConstView mahi,
759  float const meanTime,
760  float const timeSigmaSiPM,
761  float const timeSigmaHPD,
762  bool const applyTimeSlew,
763  float const tzeroTimeSlew,
764  float const slopeTimeSlew,
765  float const tmaxTimeSlew) const {
766  //2nd index = pulse
767  auto const npulses(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[1u]);
768  //3rd index = sample
769  auto const nsamples(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[2u]);
770 
771  auto const nchannels = f01HEDigis.size() + f5HBDigis.size() + f3HBDigis.size();
772  auto const nchannelsf015 = f01HEDigis.size() + f5HBDigis.size();
773 
774  //Loop over each channel
775  for (auto channel : uniform_elements_z(acc, nchannels)) {
776  //Loop over pulses
777  for (auto ipulse : independent_group_elements_y(acc, npulses)) {
778  //Loop over sample
779  for (auto sample : independent_group_elements_x(acc, nsamples)) {
780  // conditions
781  auto const id = channel < f01HEDigis.size()
782  ? f01HEDigis.ids()[channel]
783  : (channel < nchannelsf015 ? f5HBDigis.ids()[channel - f01HEDigis.size()]
784  : f3HBDigis.ids()[channel - nchannelsf015]);
785  auto const deltaT =
786  channel >= f01HEDigis.size() && channel < nchannelsf015 ? timeSigmaHPD : timeSigmaSiPM;
787 
788  // compute hash for this did
789  // hash needed to convert condition arrays order (based on Topo) into digi arrays order(based on FED)
790  auto const did = DetId{id};
791  auto const hashedId =
792  did.subdetId() == HcalBarrel
793  ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
794  : did2linearIndexHE(id,
795  mahi.maxDepthHE(),
796  mahi.maxPhiHE(),
797  mahi.firstHERing(),
798  mahi.lastHERing(),
799  mahi.nEtaHE()) +
800  mahi.offsetForHashes();
801  auto const pulseShape = recoParamsWithPS.getPulseShape(hashedId);
802 
803  // offset output arrays
804  auto* pulseMatrix = pulseMatrices + nsamples * npulses * channel;
805  auto* pulseMatrixM = pulseMatricesM + nsamples * npulses * channel;
806  auto* pulseMatrixP = pulseMatricesP + nsamples * npulses * channel;
807 
808  // amplitude per ipulse
809  int const soi = soiSamples[channel];
810  int const pulseOffset = pulseOffsets.offsets()[ipulse];
811  auto const amplitude = amplitudes[channel * nsamples + pulseOffset + soi];
812 
813  if (amplitude <= 0.f) {
814  pulseMatrix[ipulse * nsamples + sample] = 0.f;
815  pulseMatrixM[ipulse * nsamples + sample] = 0.f;
816  pulseMatrixP[ipulse * nsamples + sample] = 0.f;
817  continue;
818  }
819 #ifdef HCAL_MAHI_GPUDEBUG
820 #ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
821  if (id != DETID_TO_DEBUG)
822  return;
823 #endif
824  if (sample == 0 && ipulse == 0) {
825  for (int i = 0; i < 8; i++)
826  printf("amplitude(%d) = %f\n", i, amplitudes[channel * nsamples + i]);
827  printf("acc25nsVec and diff25nsItvlVec for recoPulseShapeId = %u\n", recoPulseShapeId);
828  for (int i = 0; i < 256; i++) {
829  printf("acc25nsVec(%d) = %f diff25nsItvlVec(%d) = %f\n", i, acc25nsVec[i], i, diff25nsItvlVec[i]);
830  }
831  printf("accVarLenIdxZEROVec and accVarLenIdxMinusOneVec\n");
832  for (int i = 0; i < 25; i++) {
833  printf("accVarLenIdxZEROVec(%d) = %f accVarLenIdxMinusOneVec(%d) = %f\n",
834  i,
835  accVarLenIdxZeroVec[i],
836  i,
837  accVarLenIdxMinusOneVec[i]);
838  }
839  printf("diffVarItvlIdxZEROVec and diffVarItvlIdxMinusOneVec\n");
840  for (int i = 0; i < 25; i++) {
841  printf("diffVarItvlIdxZEROVec(%d) = %f diffVarItvlIdxMinusOneVec(%d) = %f\n",
842  i,
843  diffVarItvlIdxZeroVec[i],
844  i,
845  diffVarItvlIdxMinusOneVec[i]);
846  }
847  }
848 #endif
849 
850  auto t0 = meanTime;
851  if (applyTimeSlew) {
852  if (amplitude <= 1.0f)
853  t0 += compute_time_slew_delay(1.0, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
854  else
855  t0 += compute_time_slew_delay(amplitude, tzeroTimeSlew, slopeTimeSlew, tmaxTimeSlew);
856  }
857  auto const t0m = -deltaT + t0;
858  auto const t0p = deltaT + t0;
859 
860 #ifdef HCAL_MAHI_GPUDEBUG
861  if (sample == 0 && ipulse == 0) {
862  printf("time values: %f %f %f\n", t0, t0m, t0p);
863  }
864 
865  if (sample == 0 && ipulse == 0) {
866  for (int i = 0; i < hcal::constants::maxSamples; i++) {
867  auto const value = compute_pulse_shape_value(pulseShape, t0, i, 0);
868  }
869  printf("\n");
870  for (int i = 0; i < hcal::constants::maxSamples; i++) {
871  auto const value = compute_pulse_shape_value(pulseShape, t0p, i, 0);
872  }
873  printf("\n");
874  for (int i = 0; i < hcal::constants::maxSamples; i++) {
875  auto const value = compute_pulse_shape_value(pulseShape, t0m, i, 0);
876  }
877  }
878 #endif
879 
880  // FIXME: shift should be treated properly,
881  // here assume 8 time slices and 8 samples
882  auto const shift = 4 - soi; // as in cpu version!
883 
884  int32_t const idx = sample - pulseOffset;
885  auto const value = idx >= 0 && static_cast<unsigned>(idx) < nsamples
886  ? compute_pulse_shape_value(pulseShape, t0, idx, shift)
887  : 0;
888  auto const value_t0m = idx >= 0 && static_cast<unsigned>(idx) < nsamples
889  ? compute_pulse_shape_value(pulseShape, t0m, idx, shift)
890  : 0;
891  auto const value_t0p = idx >= 0 && static_cast<unsigned>(idx) < nsamples
892  ? compute_pulse_shape_value(pulseShape, t0p, idx, shift)
893  : 0;
894 
895  // store to global
896  pulseMatrix[ipulse * nsamples + sample] = value;
897  pulseMatrixM[ipulse * nsamples + sample] = value_t0m;
898  pulseMatrixP[ipulse * nsamples + sample] = value_t0p;
899 
900  } // loop over sample
901  } // loop over pulse
902  } // loop over channels
903  }
904  }; // Kernel_prep_pulseMatrices_sameNumberOfSamples
905 
906  template <int NSAMPLES, int NPULSES>
907  ALPAKA_FN_ACC ALPAKA_FN_INLINE void update_covariance(
908  calo::multifit::ColumnVector<NPULSES> const& resultAmplitudesVector,
910  Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrix,
911  Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixM,
912  Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> const& pulseMatrixP) {
914  for (int ipulse = 0; ipulse < NPULSES; ipulse++) {
915  auto const resultAmplitude = resultAmplitudesVector(ipulse);
916  if (resultAmplitude == 0)
917  continue;
918 
919 #ifdef HCAL_MAHI_GPUDEBUG
920  printf("pulse cov array for ibx = %d\n", ipulse);
921 #endif
922 
923  // preload a column
924  float pmcol[NSAMPLES], pmpcol[NSAMPLES], pmmcol[NSAMPLES];
926  for (int counter = 0; counter < NSAMPLES; counter++) {
927  pmcol[counter] = pulseMatrix.coeffRef(counter, ipulse);
928  pmpcol[counter] = pulseMatrixP.coeffRef(counter, ipulse);
929  pmmcol[counter] = pulseMatrixM.coeffRef(counter, ipulse);
930  }
931 
932  auto const ampl2 = resultAmplitude * resultAmplitude;
934  for (int col = 0; col < NSAMPLES; col++) {
935  auto const valueP_col = pmpcol[col];
936  auto const valueM_col = pmmcol[col];
937  auto const value_col = pmcol[col];
938  auto const tmppcol = valueP_col - value_col;
939  auto const tmpmcol = valueM_col - value_col;
940 
941  // diagonal
942  auto tmp_value = 0.5 * (tmppcol * tmppcol + tmpmcol * tmpmcol);
943  covarianceMatrix(col, col) += ampl2 * tmp_value;
944 
945  // FIXME: understand if this actually gets unrolled
947  for (int row = col + 1; row < NSAMPLES; row++) {
948  float const valueP_row = pmpcol[row]; //pulseMatrixP(j, ipulseReal);
949  float const value_row = pmcol[row]; //pulseMatrix(j, ipulseReal);
950  float const valueM_row = pmmcol[row]; //pulseMatrixM(j, ipulseReal);
951 
952  float tmpprow = valueP_row - value_row;
953  float tmpmrow = valueM_row - value_row;
954 
955  auto const covValue = 0.5 * (tmppcol * tmpprow + tmpmcol * tmpmrow);
956 
957  covarianceMatrix(row, col) += ampl2 * covValue;
958  }
959  }
960  }
961  }
962 
963  template <int NSAMPLES, int NPULSES>
965  public:
966  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
967  ALPAKA_FN_ACC void operator()(TAcc const& acc,
968  OProductType::View outputGPU,
969  float const* amplitudes,
970  float* pulseMatrices,
971  float* pulseMatricesM,
972  float* pulseMatricesP,
973  HcalMahiPulseOffsetsPortableDevice::ConstView pulseOffsetsView,
974  float* noiseTerms,
975  float* electronicNoiseTerms,
976  int8_t* soiSamples,
977  HcalMahiConditionsPortableDevice::ConstView mahi,
978  bool const useEffectivePedestals,
979  IProductTypef01::ConstView f01HEDigis,
980  IProductTypef5::ConstView f5HBDigis,
981  IProductTypef3::ConstView f3HBDigis) const {
982  // can be relaxed if needed - minor updates are needed in that case!
983  static_assert(NPULSES == NSAMPLES);
984 
985  auto const nchannels = f01HEDigis.size() + f5HBDigis.size() + f3HBDigis.size();
986 
987  auto const nchannelsf015 = f01HEDigis.size() + f5HBDigis.size();
988 
989  auto const threadsPerBlock(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]);
990 
991  //Loop over all groups of channels
992  for (auto group : uniform_groups_x(acc, nchannels)) {
993  //Loop over each channel
994  for (auto channel : uniform_group_elements_x(acc, group, nchannels)) {
995  auto const gch = channel.global;
996  auto const lch = channel.local;
997 
998  // if chi2 is set to -9999 do not run minimization
999  if (outputGPU.chi2()[gch] == -9999.f)
1000  continue;
1001 
1002  // configure shared mem
1003  float* shrmem = alpaka::getDynSharedMem<float>(acc);
1004  float* shrMatrixLFnnlsStorage = shrmem + calo::multifit::MapSymM<float, NPULSES>::total * lch;
1005  float* shrAtAStorage = shrmem + calo::multifit::MapSymM<float, NPULSES>::total * (lch + threadsPerBlock);
1006 
1007  // conditions for pedestal widths
1008  auto const id = gch < f01HEDigis.size() ? f01HEDigis.ids()[gch]
1009  : (gch < nchannelsf015 ? f5HBDigis.ids()[gch - f01HEDigis.size()]
1010  : f3HBDigis.ids()[gch - nchannelsf015]);
1011  auto const did = DetId{id};
1012  auto const hashedId =
1013  did.subdetId() == HcalBarrel
1014  ? did2linearIndexHB(id, mahi.maxDepthHB(), mahi.firstHBRing(), mahi.lastHBRing(), mahi.nEtaHB())
1015  : did2linearIndexHE(id,
1016  mahi.maxDepthHE(),
1017  mahi.maxPhiHE(),
1018  mahi.firstHERing(),
1019  mahi.lastHERing(),
1020  mahi.nEtaHE()) +
1021  mahi.offsetForHashes();
1022 
1023  // conditions based on the hash
1024  auto const* pedestalWidthsForChannel =
1025  useEffectivePedestals && (gch < f01HEDigis.size() || gch >= nchannelsf015)
1026  ? mahi.effectivePedestalWidths()[hashedId].data()
1027  : mahi.pedestals_width()[hashedId].data();
1028  auto const averagePedestalWidth2 = 0.25 * (pedestalWidthsForChannel[0] * pedestalWidthsForChannel[0] +
1029  pedestalWidthsForChannel[1] * pedestalWidthsForChannel[1] +
1030  pedestalWidthsForChannel[2] * pedestalWidthsForChannel[2] +
1031  pedestalWidthsForChannel[3] * pedestalWidthsForChannel[3]);
1032 
1033  // FIXME on cpu ts 0 capid was used - does it make any difference
1034  auto const gain = mahi.gains_value()[hashedId][0];
1035 
1036  auto const respCorrection = mahi.respCorrs_values()[hashedId];
1037  auto const noisecorr = mahi.sipmPar_auxi2()[hashedId];
1038 
1039 #ifdef HCAL_MAHI_GPUDEBUG
1040 #ifdef HCAL_MAHI_GPUDEBUG_FILTERDETID
1041  if (id != DETID_TO_DEBUG)
1042  return;
1043 #endif
1044 #endif
1045 
1048  for (int i = 0; i < NPULSES; ++i)
1049  pulseOffsets(i) = i;
1050 
1051  // output amplitudes/weights
1052  calo::multifit::ColumnVector<NPULSES> resultAmplitudesVector =
1054 
1055  // map views
1056  Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> inputAmplitudesView{amplitudes + gch * NSAMPLES};
1057  Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> noiseTermsView{noiseTerms + gch * NSAMPLES};
1058  Eigen::Map<const calo::multifit::ColumnVector<NSAMPLES>> noiseElectronicView{electronicNoiseTerms +
1059  gch * NSAMPLES};
1060  Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixMView{
1061  pulseMatricesM + gch * NSAMPLES * NPULSES};
1062  Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixPView{
1063  pulseMatricesP + gch * NSAMPLES * NPULSES};
1064  Eigen::Map<const calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES>> glbPulseMatrixView{
1065  pulseMatrices + gch * NSAMPLES * NPULSES};
1066 #ifdef HCAL_MAHI_GPUDEBUG
1067  for (int i = 0; i < NSAMPLES; i++)
1068  printf("inputValues(%d) = %f noiseTerms(%d) = %f\n", i, inputAmplitudesView(i), i, noiseTermsView(i));
1069  for (int i = 0; i < NSAMPLES; i++) {
1070  for (int j = 0; j < NPULSES; j++)
1071  printf("%f ", glbPulseMatrixView(i, j));
1072  printf("\n");
1073  }
1074  printf("\n");
1075  for (int i = 0; i < NSAMPLES; i++) {
1076  for (int j = 0; j < NPULSES; j++)
1077  printf("%f ", glbPulseMatrixMView(i, j));
1078  printf("\n");
1079  }
1080  printf("\n");
1081  for (int i = 0; i < NSAMPLES; i++) {
1082  for (int j = 0; j < NPULSES; j++)
1083  printf("%f ", glbPulseMatrixPView(i, j));
1084  printf("\n");
1085  }
1086 #endif
1087 
1088  int npassive = 0;
1089  float chi2 = 0, previous_chi2 = 0.f, chi2_2itersback = 0.f;
1090  for (int iter = 1; iter < nMaxItersMin; iter++) {
1091  //float covarianceMatrixStorage[MapSymM<float, NSAMPLES>::total];
1092  // NOTE: only works when NSAMPLES == NPULSES
1093  // if does not hold -> slightly rearrange shared mem to still reuse
1094  // shared memory
1095  float* covarianceMatrixStorage = shrMatrixLFnnlsStorage;
1096  calo::multifit::MapSymM<float, NSAMPLES> covarianceMatrix{covarianceMatrixStorage};
1099  covarianceMatrixStorage[counter] = (noisecorr != 0.f) ? 0.f : averagePedestalWidth2;
1102  covarianceMatrix(counter, counter) += noiseTermsView.coeffRef(counter);
1103  if (counter != 0)
1104  covarianceMatrix(counter, counter - 1) +=
1105  noisecorr * noiseElectronicView.coeffRef(counter - 1) * noiseElectronicView.coeffRef(counter);
1106  }
1107 
1108  // update covariance matrix
1109  update_covariance(resultAmplitudesVector,
1110  covarianceMatrix,
1111  glbPulseMatrixView,
1112  glbPulseMatrixMView,
1113  glbPulseMatrixPView);
1114 
1115 #ifdef HCAL_MAHI_GPUDEBUG
1116  printf("covariance matrix\n");
1117  for (int i = 0; i < 8; i++) {
1118  for (int j = 0; j < 8; j++)
1119  printf("%f ", covarianceMatrix(i, j));
1120  printf("\n");
1121  }
1122 #endif
1123 
1124  // compute Cholesky Decomposition L matrix
1125  //matrixDecomposition.compute(covarianceMatrix);
1126  //auto const& matrixL = matrixDecomposition.matrixL();
1128  calo::multifit::MapSymM<float, NSAMPLES> matrixL{matrixLStorage};
1129  calo::multifit::compute_decomposition_unrolled(matrixL, covarianceMatrix);
1130 
1131  //
1132  // replace eigen
1133  //
1134  //auto const& A = matrixDecomposition
1135  // .matrixL()
1136  // .solve(pulseMatrixView);
1138  calo::multifit::solve_forward_subst_matrix(A, glbPulseMatrixView, matrixL);
1139 
1140  //
1141  // remove eigen
1142  //
1143  //auto const& b = matrixL
1144  // .solve(inputAmplitudesView);
1145  //
1146  float reg_b[NSAMPLES];
1147  calo::multifit::solve_forward_subst_vector(reg_b, inputAmplitudesView, matrixL);
1148 
1149  // TODO: we do not really need to change these matrcies
1150  // will be fixed in the optimized version
1151  //ColMajorMatrix<NPULSES, NPULSES> AtA = A.transpose() * A;
1152  //ColumnVector<NPULSES> Atb = A.transpose() * b;
1153  //ColMajorMatrix<NPULSES, NPULSES> AtA;
1154  //float AtAStorage[MapSymM<float, NPULSES>::total];
1155  calo::multifit::MapSymM<float, NPULSES> AtA{shrAtAStorage};
1158  for (int icol = 0; icol < NPULSES; icol++) {
1159  float reg_ai[NSAMPLES];
1160 
1161  // load column icol
1163  for (int counter = 0; counter < NSAMPLES; counter++)
1164  reg_ai[counter] = A(counter, icol);
1165 
1166  // compute diagonal
1167  float sum = 0.f;
1169  for (int counter = 0; counter < NSAMPLES; counter++)
1170  sum += reg_ai[counter] * reg_ai[counter];
1171 
1172  // store
1173  AtA(icol, icol) = sum;
1174 
1175  // go thru the other columns
1177  for (int j = icol + 1; j < NPULSES; j++) {
1178  // load column j
1179  float reg_aj[NSAMPLES];
1181  for (int counter = 0; counter < NSAMPLES; counter++)
1182  reg_aj[counter] = A(counter, j);
1183 
1184  // accum
1185  float sum = 0.f;
1187  for (int counter = 0; counter < NSAMPLES; counter++)
1188  sum += reg_aj[counter] * reg_ai[counter];
1189 
1190  // store
1191  //AtA(icol, j) = sum;
1192  AtA(j, icol) = sum;
1193  }
1194 
1195  // Atb accum
1196  float sum_atb = 0;
1198  for (int counter = 0; counter < NSAMPLES; counter++)
1199  sum_atb += reg_ai[counter] * reg_b[counter];
1200 
1201  // store atb
1202  Atb(icol) = sum_atb;
1203  }
1204 
1205 #ifdef HCAL_MAHI_GPUDEBUG
1206  printf("AtA\n");
1207  for (int i = 0; i < 8; i++) {
1208  for (int j = 0; j < 8; j++)
1209  printf("%f ", AtA(i, j));
1210  printf("\n");
1211  }
1212  printf("Atb\n");
1213  for (int i = 0; i < 8; i++)
1214  printf("%f ", Atb(i));
1215  printf("\n");
1216  printf("result Amplitudes before nnls\n");
1217  for (int i = 0; i < 8; i++)
1218  printf("%f ", resultAmplitudesVector(i));
1219  printf("\n");
1220 #endif
1221 
1222  // for fnnls
1223  calo::multifit::MapSymM<float, NPULSES> matrixLForFnnls{shrMatrixLFnnlsStorage};
1224 
1225  // run fast nnls
1227  Atb,
1228  resultAmplitudesVector,
1229  npassive,
1230  pulseOffsets,
1231  matrixLForFnnls,
1232  nnlsThresh,
1233  nMaxItersNNLS,
1234  10,
1235  10);
1236 
1237 #ifdef HCAL_MAHI_GPUDEBUG
1238  printf("result Amplitudes\n");
1239  for (int i = 0; i < 8; i++)
1240  printf("resultAmplitudes(%d) = %f\n", i, resultAmplitudesVector(i));
1241 #endif
1242 
1244  matrixL, glbPulseMatrixView, resultAmplitudesVector, inputAmplitudesView, chi2);
1245 
1246  auto const deltaChi2 = std::abs(chi2 - previous_chi2);
1247  if (chi2 == chi2_2itersback && chi2 < previous_chi2)
1248  break;
1249 
1250  // update
1251  chi2_2itersback = previous_chi2;
1252  previous_chi2 = chi2;
1253 
1254  // exit condition
1255  if (deltaChi2 < deltaChi2Threashold)
1256  break;
1257  }
1258 
1259 #ifdef HCAL_MAHI_GPUDEBUG
1260  for (int i = 0; i < NPULSES; i++)
1261  printf("pulseOffsets(%d) = %d outputAmplitudes(%d) = %f\n",
1262  i,
1263  pulseOffsets(i),
1264  i,
1265  resultAmplitudesVector(i));
1266  printf("chi2 = %f\n", chi2);
1267 #endif
1268 
1269  outputGPU.chi2()[gch] = chi2;
1270  auto const idx_for_energy = std::abs(pulseOffsetsView.offsets()[0]);
1271  outputGPU.energy()[gch] = (gain * resultAmplitudesVector(idx_for_energy)) * respCorrection;
1272 
1273  } // loop over channels
1274  } //loop over group of channels
1275  }
1276  }; // Kernel_minimize
1277 
1278  } // namespace mahi
1279 
1280  void runMahiAsync(Queue& queue,
1281  IProductTypef01::ConstView const& f01HEDigis,
1282  IProductTypef5::ConstView const& f5HBDigis,
1283  IProductTypef3::ConstView const& f3HBDigis,
1284  OProductType::View outputGPU,
1285  HcalMahiConditionsPortableDevice::ConstView const& mahi,
1286  HcalSiPMCharacteristicsPortableDevice::ConstView const& sipmCharacteristics,
1287  HcalRecoParamWithPulseShapeDevice::ConstView const& recoParamsWithPS,
1288  HcalMahiPulseOffsetsPortableDevice::ConstView const& mahiPulseOffsets,
1289  ConfigParameters const& configParameters) {
1290  auto const totalChannels =
1291  f01HEDigis.metadata().size() + f5HBDigis.metadata().size() + f3HBDigis.metadata().size();
1292  // FIXME: the number of channels in output might change given that some channesl might be filtered out
1293 
1294  // TODO: this can be lifted by implementing a separate kernel
1295  // similar to the default one, but properly handling the diff in #sample
1296  // or modifying existing one
1297  // TODO: assert startingSample = f01nsamples - windowSize to be 0 or 2
1298  // assert f01nsamples == f5nsamples
1299  // assert f01nsamples == f3nsamples
1300  int constexpr windowSize = 8;
1301 
1302  //compute work division
1303  uint32_t nchannels_per_block = configParameters.kprep1dChannelsPerBlock;
1304  auto const blocks_y = cms::alpakatools::divide_up_by(totalChannels, nchannels_per_block);
1305 
1306  Vec2D const blocks_2d{blocks_y, 1u}; // {y, x} coordiantes
1307  Vec2D const threads_2d{nchannels_per_block, windowSize};
1308  auto workDivPrep2D = cms::alpakatools::make_workdiv<Acc2D>(blocks_2d, threads_2d);
1309 
1310  //Device buffer for output
1311  auto amplitudes = cms::alpakatools::make_device_buffer<float[]>(queue, totalChannels * windowSize);
1312  auto noiseTerms = cms::alpakatools::make_device_buffer<float[]>(queue, totalChannels * windowSize);
1313  auto electronicNoiseTerms = cms::alpakatools::make_device_buffer<float[]>(queue, totalChannels * windowSize);
1314  auto soiSamples = cms::alpakatools::make_device_buffer<int8_t[]>(queue, totalChannels * windowSize);
1315 
1316  alpaka::exec<Acc2D>(queue,
1317  workDivPrep2D,
1319  outputGPU,
1320  f01HEDigis,
1321  f5HBDigis,
1322  f3HBDigis,
1323  mahi,
1324  sipmCharacteristics,
1325  recoParamsWithPS,
1326  configParameters.useEffectivePedestals,
1327  configParameters.sipmQTSShift,
1328  configParameters.sipmQNTStoSum,
1329  configParameters.firstSampleShift,
1330  configParameters.ts4Thresh,
1331  amplitudes.data(),
1332  noiseTerms.data(),
1333  electronicNoiseTerms.data(),
1334  soiSamples.data(),
1335  windowSize);
1336 
1339  uint32_t const channelsPerBlock = 1024 / (windowSize * mahiPulseOffsets.metadata().size());
1340 
1341  //launch 1D blocks of 3D threads
1342  auto const blocks_z = cms::alpakatools::divide_up_by(totalChannels, channelsPerBlock);
1343  Vec3D const blocks_3d{blocks_z, 1u, 1u}; // 1D block in z {z,y,x} coordinates
1344  Vec3D const threads_3d{channelsPerBlock, mahiPulseOffsets.metadata().size(), windowSize};
1345 
1346  auto workDivPrep3D = cms::alpakatools::make_workdiv<Acc3D>(blocks_3d, threads_3d);
1347 
1348  //Device buffer for output
1349  auto pulseMatrices = cms::alpakatools::make_device_buffer<float[]>(
1350  queue, totalChannels * windowSize * mahiPulseOffsets.metadata().size());
1351  auto pulseMatricesM = cms::alpakatools::make_device_buffer<float[]>(
1352  queue, totalChannels * windowSize * mahiPulseOffsets.metadata().size());
1353  auto pulseMatricesP = cms::alpakatools::make_device_buffer<float[]>(
1354  queue, totalChannels * windowSize * mahiPulseOffsets.metadata().size());
1355 
1356  alpaka::exec<Acc3D>(queue,
1357  workDivPrep3D,
1359  pulseMatrices.data(),
1360  pulseMatricesM.data(),
1361  pulseMatricesP.data(),
1362  mahiPulseOffsets,
1363  amplitudes.data(),
1364  f01HEDigis,
1365  f5HBDigis,
1366  f3HBDigis,
1367  soiSamples.data(),
1368  mahi,
1369  recoParamsWithPS,
1370  configParameters.meanTime,
1371  configParameters.timeSigmaSiPM,
1372  configParameters.timeSigmaHPD,
1373  configParameters.applyTimeSlew,
1374  configParameters.tzeroTimeSlew,
1375  configParameters.slopeTimeSlew,
1376  configParameters.tmaxTimeSlew);
1377 
1378  uint32_t threadsPerBlock = configParameters.kernelMinimizeThreads[0];
1379  auto blocks_1d = cms::alpakatools::divide_up_by(totalChannels, threadsPerBlock);
1380 
1381  auto workDivPrep1D = cms::alpakatools::make_workdiv<Acc1D>(blocks_1d, threadsPerBlock);
1382 
1383  alpaka::exec<Acc1D>(queue,
1384  workDivPrep1D,
1386  outputGPU,
1387  amplitudes.data(),
1388  pulseMatrices.data(),
1389  pulseMatricesM.data(),
1390  pulseMatricesP.data(),
1391  mahiPulseOffsets,
1392  noiseTerms.data(),
1393  electronicNoiseTerms.data(),
1394  soiSamples.data(),
1395  mahi,
1396  configParameters.useEffectivePedestals,
1397  f01HEDigis,
1398  f5HBDigis,
1399  f3HBDigis);
1400  }
1401 
1402  } // namespace hcal::reconstruction
1403 } // namespace ALPAKA_ACCELERATOR_NAMESPACE
1404 
1405 namespace alpaka::trait {
1407 
1409  template <typename TAcc>
1410  struct BlockSharedMemDynSizeBytes<Kernel_prep1d_sameNumberOfSamples, TAcc> {
1412  template <typename TVec, typename... TArgs>
1414  TVec const& threadsPerBlock,
1415  TVec const& elemsPerThread,
1416  TArgs const&...) -> std::size_t {
1417  // return the amount of dynamic shared memory needed
1418 
1419  // threadsPerBlock[1] = threads2d.x = windowSize = 8
1420  // threadsPerBlock[0] = threads2d.y = nchannels_per_block = 32
1421  // elemsPerThread = 1
1422  std::size_t bytes = threadsPerBlock[0u] * elemsPerThread[0u] *
1423  ((2 * threadsPerBlock[1u] * elemsPerThread[1u] + 2) * sizeof(float) + sizeof(uint64_t));
1424  return bytes;
1425  }
1426  };
1427 
1429  template <int NSAMPLES, int NPULSES, typename TAcc>
1430  struct BlockSharedMemDynSizeBytes<Kernel_minimize<NSAMPLES, NPULSES>, TAcc> {
1432  template <typename TVec, typename... TArgs>
1434  TVec const& threadsPerBlock,
1435  TVec const& elemsPerThread,
1436  TArgs const&...) -> std::size_t {
1437  // return the amount of dynamic shared memory needed
1438 
1439  std::size_t bytes =
1440  2 * threadsPerBlock[0u] * elemsPerThread[0u] * (calo::multifit::MapSymM<float, 8>::total * sizeof(float));
1441  return bytes;
1442  }
1443  };
1444 
1445 } // namespace alpaka::trait
ALPAKA_FN_ACC auto uniform_elements_z(TAcc const &acc, TArgs... args)
Definition: workdivision.h:351
ALPAKA_FN_ACC float get_raw_charge(double const charge, double const pedestal, float const *shrChargeMinusPedestal, float const parLin1, float const parLin2, float const parLin3, int32_t const nsamplesForCompute, int32_t const soi, int const sipmQTSShift, int const sipmQNTStoSum, float const fcByPE, int32_t const lch, bool const isqie11)
Definition: Mahi.dev.cc:230
ALPAKA_FN_ACC void operator()(TAcc const &acc, OProductType::View outputGPU, IProductTypef01::ConstView f01HEDigis, IProductTypef5::ConstView f5HBDigis, IProductTypef3::ConstView f3HBDigis, HcalMahiConditionsPortableDevice::ConstView mahi, HcalSiPMCharacteristicsPortableDevice::ConstView sipmCharacteristics, HcalRecoParamWithPulseShapeDevice::ConstView recoParamsWithPS, bool const useEffectivePedestals, int const sipmQTSShift, int const sipmQNTStoSum, int const firstSampleShift, float const ts4Thresh, float *amplitudes, float *noiseTerms, float *electronicNoiseTerms, int8_t *soiSamples, int const windowSize) const
Definition: Mahi.dev.cc:272
ALPAKA_STATIC_ACC_MEM_CONSTANT float const qie8shape[129]
Definition: Mahi.dev.cc:53
constexpr Idx divide_up_by(Idx value, Idx divisor)
Definition: workdivision.h:20
ALPAKA_FN_ACC auto uniform_groups_x(TAcc const &acc, TArgs... args)
Definition: workdivision.h:785
ALPAKA_FN_ACC void operator()(TAcc const &acc, OProductType::View outputGPU, float const *amplitudes, float *pulseMatrices, float *pulseMatricesM, float *pulseMatricesP, HcalMahiPulseOffsetsPortableDevice::ConstView pulseOffsetsView, float *noiseTerms, float *electronicNoiseTerms, int8_t *soiSamples, HcalMahiConditionsPortableDevice::ConstView mahi, bool const useEffectivePedestals, IProductTypef01::ConstView f01HEDigis, IProductTypef5::ConstView f5HBDigis, IProductTypef3::ConstView f3HBDigis) const
Definition: Mahi.dev.cc:967
static ALPAKA_FN_HOST_ACC auto getBlockSharedMemDynSizeBytes(Kernel_prep1d_sameNumberOfSamples const &, TVec const &threadsPerBlock, TVec const &elemsPerThread, TArgs const &...) -> std::size_t
Definition: Mahi.dev.cc:1413
Vec< Dim2D > Vec2D
Definition: config.h:26
static const double slope[3]
constexpr bool TSenergyCompare(std::pair< unsigned int, float > a, std::pair< unsigned int, float > b)
Definition: Mahi.dev.cc:265
EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void compute_decomposition_unrolled(MatrixType1 &L, MatrixType2 const &M)
ALPAKA_FN_ACC auto uniform_group_elements_y(TAcc const &acc, TArgs... args)
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void atomicMaxPair(const TAcc &acc, unsigned long long int *address, std::pair< unsigned int, float > value, F comparator)
Definition: atomicMaxPair.h:10
ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_time_slew_delay(float const fC, float const tzero, float const slope, float const tmax)
Definition: Mahi.dev.cc:43
ALPAKA_STATIC_ACC_MEM_CONSTANT float const qie11shape[257]
Definition: Mahi.dev.cc:63
ALPAKA_FN_ACC PulseShapeConstElement getPulseShape(uint32_t const hashedId) const
#define CMS_UNROLL_LOOP
Definition: CMSUnrollLoop.h:47
ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_coder_charge(int const qieType, uint8_t const adc, uint8_t const capid, float const *qieOffsets, float const *qieSlopes)
Definition: Mahi.dev.cc:120
ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_diff_charge_gain(int const qieType, uint8_t adc, uint8_t const capid, float const *qieOffsets, float const *qieSlopes, bool const isqie11)
Definition: Mahi.dev.cc:134
ALPAKA_FN_ACC auto independent_group_elements_y(TAcc const &acc, TArgs... args)
Eigen::Matrix< T, SIZE, 1 > ColumnVector
void runMahiAsync(Queue &queue, IProductTypef01::ConstView const &f01HEDigis, IProductTypef5::ConstView const &f5HBDigis, IProductTypef3::ConstView const &f3HBDigis, OProductType::View outputGPU, HcalMahiConditionsPortableDevice::ConstView const &mahi, HcalSiPMCharacteristicsPortableDevice::ConstView const &sipmCharacteristics, HcalRecoParamWithPulseShapeDevice::ConstView const &recoParamsWithPS, HcalMahiPulseOffsetsPortableDevice::ConstView const &mahiPulseOffsets, ConfigParameters const &configParameters)
Definition: Mahi.dev.cc:1280
constexpr uint32_t stride
Definition: HelixFit.h:22
constexpr float getTDCTime(const int tdc)
ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_pulse_shape_value(PulseShapeConstElement const &pulseShape, float const pulse_time, int const sample, int const shift)
Definition: Mahi.dev.cc:174
EIGEN_DEVICE_FUNC void fnnls(MatrixType const &AtA, VectorType const &Atb, VectorType &solution, int &npassive, ColumnVector< VectorType::RowsAtCompileTime, int > &pulseOffsets, MapSymM< float, VectorType::RowsAtCompileTime > &matrixL, double eps, const int maxIterations, const int relaxationPeriod, const int relaxationFactor)
EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void calculateChiSq(MatrixType1 const &matrixL, MatrixType2 const &pulseMatrixView, MatrixType3 const &resultAmplitudesVector, MatrixType4 const &inputAmplitudesView, float &chi2)
EIGEN_DEVICE_FUNC void solve_forward_subst_matrix(MatrixType1 &A, MatrixType2 const &pulseMatrixView, MatrixType3 const &matrixL)
ALPAKA_FN_ACC void operator()(TAcc const &acc, float *pulseMatrices, float *pulseMatricesM, float *pulseMatricesP, HcalMahiPulseOffsetsPortableDevice::ConstView pulseOffsets, float const *amplitudes, IProductTypef01::ConstView f01HEDigis, IProductTypef5::ConstView f5HBDigis, IProductTypef3::ConstView f3HBDigis, int8_t *soiSamples, HcalMahiConditionsPortableDevice::ConstView mahi, HcalRecoParamWithPulseShapeDevice::ConstView recoParamsWithPS, float const meanTime, float const timeSigmaSiPM, float const timeSigmaHPD, bool const applyTimeSlew, float const tzeroTimeSlew, float const slopeTimeSlew, float const tmaxTimeSlew) const
Definition: Mahi.dev.cc:747
T sqrt(T t)
Definition: SSEVec.h:19
Abs< T >::type abs(const T &t)
Definition: Abs.h:22
double f[11][100]
ALPAKA_FN_ACC auto independent_group_elements_x(TAcc const &acc, TArgs... args)
constexpr int subdetId() const
get the contents of the subdetector field (not cast into any detector&#39;s numbering enum) ...
Definition: DetId.h:48
Definition: value.py:1
constexpr int nsPerBX
Definition: HcalConstants.h:8
constexpr uint8_t adc_for_sample< Flavor5 >(uint16_t const *const dfstart, uint32_t const sample)
ALPAKA_FN_ACC auto uniform_groups_y(TAcc const &acc, TArgs... args)
Definition: workdivision.h:792
static const double tmax[3]
Definition: DetId.h:17
typename HcalPulseShapeSoA::ConstView::const_element PulseShapeConstElement
Definition: Mahi.dev.cc:171
EIGEN_DEVICE_FUNC void solve_forward_subst_vector(float reg_b[MatrixType1::RowsAtCompileTime], MatrixType1 inputAmplitudesView, MatrixType2 matrixL)
static ALPAKA_FN_HOST_ACC auto getBlockSharedMemDynSizeBytes(Kernel_minimize< NSAMPLES, NPULSES > const &, TVec const &threadsPerBlock, TVec const &elemsPerThread, TArgs const &...) -> std::size_t
Definition: Mahi.dev.cc:1433
unsigned long long uint64_t
Definition: Time.h:13
constexpr int maxSamples
Definition: HcalConstants.h:6
double b
Definition: hdecay.h:120
ALPAKA_FN_ACC ALPAKA_FN_INLINE float compute_reco_correction_factor(float const par1, float const par2, float const par3, float const x)
Definition: Mahi.dev.cc:112
Vec< Dim3D > Vec3D
Definition: config.h:27
ALPAKA_FN_ACC auto uniform_group_elements_x(TAcc const &acc, TArgs... args)
constexpr uint8_t capid_for_sample< Flavor3 >(uint16_t const *const dfstart, uint32_t const sample)
constexpr float UNKNOWN_T_NOTDC
double a
Definition: hdecay.h:121
static std::atomic< unsigned int > counter
static int position[264][3]
Definition: ReadPGInfo.cc:289
ALPAKA_FN_ACC ALPAKA_FN_INLINE void update_covariance(calo::multifit::ColumnVector< NPULSES > const &resultAmplitudesVector, calo::multifit::MapSymM< float, NSAMPLES > &covarianceMatrix, Eigen::Map< const calo::multifit::ColMajorMatrix< NSAMPLES, NPULSES >> const &pulseMatrix, Eigen::Map< const calo::multifit::ColMajorMatrix< NSAMPLES, NPULSES >> const &pulseMatrixM, Eigen::Map< const calo::multifit::ColMajorMatrix< NSAMPLES, NPULSES >> const &pulseMatrixP)
Definition: Mahi.dev.cc:907
col
Definition: cuy.py:1009
static unsigned int const shift
static const double tzero[3]
float x
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE uint32_t float_as_uint(float val)
Definition: Mahi.dev.cc:35
Definition: APVGainStruct.h:7
constexpr float iniTimeShift
Definition: HcalConstants.h:9
ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t did2linearIndexHE(uint32_t const didraw, int const maxDepthHE, int const maxPhiHE, int const firstHERing, int const lastHERing, int const nEtaHE)
Definition: Mahi.dev.cc:97
Eigen::Matrix< float, NROWS, NCOLS, Eigen::ColMajor > ColMajorMatrix
T1 atomicAdd(T1 *a, T2 b)
Definition: cudaCompat.h:61
constexpr RecoParamCollection::ConstView recoParamView()
static const int IPHI_MAX
Definition: HcalTopology.cc:13
uint16_t *__restrict__ uint16_t const *__restrict__ adc
ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t did2linearIndexHB(uint32_t const didraw, int const maxDepthHB, int const firstHBRing, int const lastHBRing, int const nEtaHB)
Definition: Mahi.dev.cc:89
ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t get_qiecoder_index(uint32_t const capid, uint32_t const range)
Definition: Mahi.dev.cc:108
ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE float uint_as_float(uint32_t val)
Definition: Mahi.dev.cc:27