
TimeComputationKernels.h
1 #ifndef RecoLocalCalo_EcalRecProducers_plugins_alpaka_TimeComputationKernels_h
2 #define RecoLocalCalo_EcalRecProducers_plugins_alpaka_TimeComputationKernels_h
3 
4 #include <cassert>
5 #include <cmath>
6 #include <limits>
7 
7 
8 #include <alpaka/alpaka.hpp>
9 
10 #include "DataFormats/EcalDigi/interface/EcalDataFrame.h"
11 #include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
12 #include "DataFormats/Math/interface/approx_exp.h"
13 #include "DataFormats/Math/interface/approx_log.h"
14 #include "FWCore/Utilities/interface/CMSUnrollLoop.h"
15 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
16 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
17 
18 #include "DeclsForKernels.h"
19 #include "KernelHelpers.h"
20 
21 //#define ECAL_RECO_ALPAKA_DEBUG
22 
23 namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit {
24 
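  // returns true if the given sample is enabled in the sample mask: sample 0 corresponds to the most significant of the EcalDataFrame::MAXSAMPLES mask bits, sample 9 to the least significant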
25  ALPAKA_FN_ACC ALPAKA_FN_INLINE bool use_sample(unsigned int sample_mask, unsigned int sample) {
26  return sample_mask & (0x1 << (EcalDataFrame::MAXSAMPLES - (sample + 1)));
27  }
28 
29  ALPAKA_FN_ACC constexpr float fast_expf(float x) { return unsafe_expf<6>(x); }
30  ALPAKA_FN_ACC constexpr float fast_logf(float x) { return unsafe_logf<7>(x); }
31 
32  class Kernel_time_compute_nullhypot {
34 
35  public:
36  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
37  ALPAKA_FN_ACC void operator()(TAcc const& acc,
38  ScalarType* const sample_values,
39  ScalarType* const sample_value_errors,
40  bool* const useless_sample_values,
41  ScalarType* chi2s,
42  ScalarType* sum0s,
43  ScalarType* sumAAs,
44  uint32_t const nchannels) const {
45  constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
46 
47  // indices
48  auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
49 
50  // shared mem inits
51  auto* s_sum0 = alpaka::getDynSharedMem<char>(acc);
52  auto* s_sum1 = reinterpret_cast<ScalarType*>(s_sum0 + elemsPerBlock);
53  auto* s_sumA = s_sum1 + elemsPerBlock;
54  auto* s_sumAA = s_sumA + elemsPerBlock;
55 
56  for (auto txforward : cms::alpakatools::uniform_elements(acc, nchannels * nsamples)) {
57  // go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution
58  auto tx = nchannels * nsamples - 1 - txforward;
59  auto const ch = tx / nsamples;
60 
61  auto const sample = tx % nsamples;
62  auto const ltx = tx % elemsPerBlock;
63 
64  // TODO make sure no div by 0
65  auto const inv_error =
66  useless_sample_values[tx] ? 0. : 1. / (sample_value_errors[tx] * sample_value_errors[tx]);
67  auto const sample_value = sample_values[tx];
68  s_sum0[ltx] = useless_sample_values[tx] ? 0 : 1;
69  s_sum1[ltx] = inv_error;
70  s_sumA[ltx] = sample_value * inv_error;
71  s_sumAA[ltx] = sample_value * sample_value * inv_error;
72  alpaka::syncBlockThreads(acc);
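  // tree reduction over the 10 samples of each channel: 10 -> 5 partial sums, then samples 0 and 1 gather entries 2..4 (counting entry 3 twice), and finally sample 0 combines with sample 1 and subtracts the duplicated entry 3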
73 
74  // 5 threads for [0, 4] samples
75  if (sample < 5) {
76  s_sum0[ltx] += s_sum0[ltx + 5];
77  s_sum1[ltx] += s_sum1[ltx + 5];
78  s_sumA[ltx] += s_sumA[ltx + 5];
79  s_sumAA[ltx] += s_sumAA[ltx + 5];
80  }
81  alpaka::syncBlockThreads(acc);
82 
83  if (sample < 2) {
84  // note double counting of sample 3
85  s_sum0[ltx] += s_sum0[ltx + 2] + s_sum0[ltx + 3];
86  s_sum1[ltx] += s_sum1[ltx + 2] + s_sum1[ltx + 3];
87  s_sumA[ltx] += s_sumA[ltx + 2] + s_sumA[ltx + 3];
88  s_sumAA[ltx] += s_sumAA[ltx + 2] + s_sumAA[ltx + 3];
89  }
90  alpaka::syncBlockThreads(acc);
91 
92  if (sample == 0) {
93  // note, subtract to remove the double counting of sample == 3
94  auto const sum0 = s_sum0[ltx] + s_sum0[ltx + 1] - s_sum0[ltx + 3];
95  auto const sum1 = s_sum1[ltx] + s_sum1[ltx + 1] - s_sum1[ltx + 3];
96  auto const sumA = s_sumA[ltx] + s_sumA[ltx + 1] - s_sumA[ltx + 3];
97  auto const sumAA = s_sumAA[ltx] + s_sumAA[ltx + 1] - s_sumAA[ltx + 3];
98  auto const chi2 = sum0 > 0 ? (sumAA - sumA * sumA / sum1) / sum0 : static_cast<ScalarType>(0);
99  chi2s[ch] = chi2;
100  sum0s[ch] = sum0;
101  sumAAs[ch] = sumAA;
102 
103 #ifdef DEBUG_TC_NULLHYPOT
104  if (ch == 0) {
105  printf("chi2 = %f sum0 = %d sumAA = %f\n", chi2, static_cast<int>(sum0), sumAA);
106  }
107 #endif
108  }
109  }
110  }
111  };
112 
113  //
114  // launch ctx parameters are
115  // 45 threads per channel, X channels per block, Y blocks
116  // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9
117  // TODO: it might be much better to use 32 threads per channel instead of 45
118  // to simplify the synchronization
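  // (the kernel asserts elemsPerBlock == nthreads_per_channel * nchannels_per_block, i.e. each block processes exactly nchannels_per_block channels)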
119  class Kernel_time_compute_makeratio {
121 
122  public:
123  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
124  ALPAKA_FN_ACC void operator()(TAcc const& acc,
125  EcalDigiDeviceCollection::ConstView digisDevEB,
126  EcalDigiDeviceCollection::ConstView digisDevEE,
127  ScalarType* const sample_values,
128  ScalarType* const sample_value_errors,
129  bool* const useless_sample_values,
130  char* const pedestal_nums,
131  ScalarType* const sumAAsNullHypot,
132  ScalarType* const sum0sNullHypot,
133  ScalarType* tMaxAlphaBetas,
134  ScalarType* tMaxErrorAlphaBetas,
135  ScalarType* g_accTimeMax,
136  ScalarType* g_accTimeWgt,
137  TimeComputationState* g_state,
138  EcalMultifitParametersDevice::ConstView paramsDev,
139  ConfigurationParameters::type const timeFitLimits_firstEB,
140  ConfigurationParameters::type const timeFitLimits_firstEE,
141  ConfigurationParameters::type const timeFitLimits_secondEB,
142  ConfigurationParameters::type const timeFitLimits_secondEE) const {
143  // constants
144  constexpr uint32_t nchannels_per_block = 10;
145  constexpr auto nthreads_per_channel = nchannels_per_block * (nchannels_per_block - 1) / 2;
146  constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
147  auto const nchannels = digisDevEB.size() + digisDevEE.size();
148  auto const offsetForInputs = digisDevEB.size();
149  auto const totalElements = nthreads_per_channel * nchannels;
150 
151  auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
152  ALPAKA_ASSERT_ACC(nthreads_per_channel * nchannels_per_block == elemsPerBlock);
153 
154  auto* shr_chi2s = alpaka::getDynSharedMem<ScalarType>(acc);
155  auto* shr_time_wgt = shr_chi2s + elemsPerBlock;
156  auto* shr_time_max = shr_time_wgt + elemsPerBlock;
157  auto* shrTimeMax = shr_time_max + elemsPerBlock;
158  auto* shrTimeWgt = shrTimeMax + elemsPerBlock;
159  auto* shr_chi2 = shrTimeWgt + elemsPerBlock;
160  auto* shr_tmax = shr_chi2 + elemsPerBlock;
161  auto* shr_tmaxerr = shr_tmax + elemsPerBlock;
162  auto* shr_condForUselessSamples = reinterpret_cast<bool*>(shr_tmaxerr + elemsPerBlock);
163  auto* shr_internalCondForSkipping1 = shr_condForUselessSamples + elemsPerBlock;
164  auto* shr_internalCondForSkipping2 = shr_internalCondForSkipping1 + elemsPerBlock;
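  // in total 8 ScalarType arrays and 3 bool arrays of elemsPerBlock entries each, matching the dynamic shared memory size declared in the BlockSharedMemDynSizeBytes trait at the end of this file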
165 
166  for (auto block : cms::alpakatools::uniform_groups(acc, totalElements)) {
167  for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
168  auto const ch = idx.global / nthreads_per_channel;
169  auto const ltx = idx.global % nthreads_per_channel;
170 
171  auto const ch_start = ch * nsamples;
172  auto const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
173  auto const* dids = ch >= offsetForInputs ? digisDevEE.id() : digisDevEB.id();
174 
175  auto const did = DetId{dids[inputCh]};
176  auto const isBarrel = did.subdetId() == EcalBarrel;
177  auto* const amplitudeFitParameters =
178  isBarrel ? paramsDev.amplitudeFitParamsEB().data() : paramsDev.amplitudeFitParamsEE().data();
179  auto* const timeFitParameters =
180  isBarrel ? paramsDev.timeFitParamsEB().data() : paramsDev.timeFitParamsEE().data();
181  auto const timeFitParameters_size =
182  isBarrel ? paramsDev.timeFitParamsEB().size() : paramsDev.timeFitParamsEE().size();
183  auto const timeFitLimits_first = isBarrel ? timeFitLimits_firstEB : timeFitLimits_firstEE;
184  auto const timeFitLimits_second = isBarrel ? timeFitLimits_secondEB : timeFitLimits_secondEE;
185 
186  // map tx -> (sample_i, sample_j)
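  // ltx enumerates the 45 sample pairs (sample_i, sample_j) with sample_i < sample_j: ltx = 0..8 -> (0,1)..(0,9), ltx = 9..16 -> (1,2)..(1,9), ..., ltx = 44 -> (8,9)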
187  int sample_i = 0;
188  int sample_j = 0;
189  if (ltx <= 8) {
190  sample_i = 0;
191  sample_j = 1 + ltx;
192  } else if (ltx <= 16) {
193  sample_i = 1;
194  sample_j = 2 + ltx - 9;
195  } else if (ltx <= 23) {
196  sample_i = 2;
197  sample_j = 3 + ltx - 17;
198  } else if (ltx <= 29) {
199  sample_i = 3;
200  sample_j = 4 + ltx - 24;
201  } else if (ltx <= 34) {
202  sample_i = 4;
203  sample_j = 5 + ltx - 30;
204  } else if (ltx <= 38) {
205  sample_i = 5;
206  sample_j = 6 + ltx - 35;
207  } else if (ltx <= 41) {
208  sample_i = 6;
209  sample_j = 7 + ltx - 39;
210  } else if (ltx <= 43) {
211  sample_i = 7;
212  sample_j = 8 + ltx - 42;
213  } else if (ltx <= 44) {
214  sample_i = 8;
215  sample_j = 9;
216  } else {
217  // FIXME this needs a more portable solution that wraps abort() / __trap() / throw, depending on the back-end
218  ALPAKA_ASSERT_ACC(false);
219  }
220 
221  auto const tx_i = ch_start + sample_i;
222  auto const tx_j = ch_start + sample_j;
223 
224  //
225  // note, given the way we partition the block, with 45 threads per channel
226  // we will end up with inactive threads which need to be dragged along
227  // through the synching point
228  //
229  bool const condForUselessSamples = useless_sample_values[tx_i] || useless_sample_values[tx_j] ||
230  sample_values[tx_i] <= 1 || sample_values[tx_j] <= 1;
231 
232  //
233  // see cpu implementation for explanation
234  //
235  ScalarType chi2 = std::numeric_limits<ScalarType>::max();
236  ScalarType tmax = 0;
237  ScalarType tmaxerr = 0;
238  shrTimeMax[idx.local] = 0;
239  shrTimeWgt[idx.local] = 0;
240 
241  bool internalCondForSkipping1 = true;
242  bool internalCondForSkipping2 = true;
243  if (!condForUselessSamples) {
244  auto const rtmp = sample_values[tx_i] / sample_values[tx_j];
245  auto const invampl_i = 1. / sample_values[tx_i];
246  auto const relErr2_i = sample_value_errors[tx_i] * sample_value_errors[tx_i] * invampl_i * invampl_i;
247  auto const invampl_j = 1. / sample_values[tx_j];
248  auto const relErr2_j = sample_value_errors[tx_j] * sample_value_errors[tx_j] * invampl_j * invampl_j;
249  auto const err1 = rtmp * rtmp * (relErr2_i + relErr2_j);
250  auto err2 =
251  sample_value_errors[tx_j] * (sample_values[tx_i] - sample_values[tx_j]) * (invampl_j * invampl_j);
252  // TODO non-divergent branch for a block if each block has 1 channel
253  // otherwise non-divergent for groups of 45 threads
254  // at this point, pedestal_nums[ch] can be either 0, 1 or 2
255  if (pedestal_nums[ch] == 2)
256  err2 *= err2 * 0.5;
257  auto const err3 = (0.289 * 0.289) * (invampl_j * invampl_j);
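  // 0.289 is presumably 1/sqrt(12), the RMS of a quantisation error uniformly distributed over one ADC count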
258  auto const total_error = std::sqrt(err1 + err2 + err3);
259 
260  auto const alpha = amplitudeFitParameters[0];
261  auto const beta = amplitudeFitParameters[1];
262  auto const alphabeta = alpha * beta;
263  auto const invalphabeta = 1. / alphabeta;
264 
265  // variables instead of a struct
266  auto const ratio_index = sample_i;
267  auto const ratio_step = sample_j - sample_i;
268  auto const ratio_value = rtmp;
269  auto const ratio_error = total_error;
270 
271  auto const rlim_i_j = fast_expf(static_cast<ScalarType>(sample_j - sample_i) / beta) - 0.001;
272  internalCondForSkipping1 = !(total_error < 1. && rtmp > 0.001 && rtmp < rlim_i_j);
273  if (!internalCondForSkipping1) {
274  //
275  // precompute.
276  // in cpu version this was done conditionally
277  // however easier to do it here (precompute) and then just filter out
278  // if not needed
279  //
280  auto const l_timeFitLimits_first = timeFitLimits_first;
281  auto const l_timeFitLimits_second = timeFitLimits_second;
282  if (ratio_step == 1 && ratio_value >= l_timeFitLimits_first && ratio_value <= l_timeFitLimits_second) {
283  auto const time_max_i = static_cast<ScalarType>(ratio_index);
284  auto u = timeFitParameters[timeFitParameters_size - 1];
286  for (int k = timeFitParameters_size - 2; k >= 0; --k)
287  u = u * ratio_value + timeFitParameters[k];
288 
289  auto du = (timeFitParameters_size - 1) * (timeFitParameters[timeFitParameters_size - 1]);
290  for (int k = timeFitParameters_size - 2; k >= 1; --k)
291  du = du * ratio_value + k * timeFitParameters[k];
292 
293  auto const error2 = ratio_error * ratio_error * du * du;
294  auto const time_max = error2 > 0 ? (time_max_i - u) / error2 : static_cast<ScalarType>(0);
295  auto const time_wgt = error2 > 0 ? 1. / error2 : static_cast<ScalarType>(0);
296 
297  // store into shared mem
298  // note, this name is essentially identical to the one used
299  // below.
300  shrTimeMax[idx.local] = error2 > 0 ? time_max : 0;
301  shrTimeWgt[idx.local] = error2 > 0 ? time_wgt : 0;
302  } else {
303  shrTimeMax[idx.local] = 0;
304  shrTimeWgt[idx.local] = 0;
305  }
306 
307  // continue with ratios
308  auto const stepOverBeta = static_cast<ScalarType>(ratio_step) / beta;
309  auto const offset = static_cast<ScalarType>(ratio_index) + alphabeta;
310  auto const rmin = std::max(ratio_value - ratio_error, 0.001);
311  auto const rmax =
312  std::min(ratio_value + ratio_error, fast_expf(static_cast<ScalarType>(ratio_step) / beta) - 0.001);
313  auto const time1 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmin)) / alpha) - 1.);
314  auto const time2 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmax)) / alpha) - 1.);
315 
316  // set these guys
317  tmax = 0.5 * (time1 + time2);
318  tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2));
319 #ifdef DEBUG_TC_MAKERATIO
320  if (ch == 1 || ch == 0)
321  printf(
322  "ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = "
323  "%f\n",
324  ch,
325  ltx,
326  tmax,
327  tmaxerr,
328  time1,
329  time2,
330  offset,
331  rmin,
332  rmax);
333 #endif
334 
335  ScalarType sumAf = 0;
336  ScalarType sumff = 0;
337  const int itmin = std::max(-1, static_cast<int>(std::floor(tmax - alphabeta)));
338  auto loffset = (static_cast<ScalarType>(itmin) - tmax) * invalphabeta;
339  // TODO: data dependence
340  for (int it = itmin + 1; it < nsamples; ++it) {
341  loffset += invalphabeta;
342  if (useless_sample_values[ch_start + it])
343  continue;
344  auto const inverr2 = 1. / (sample_value_errors[ch_start + it] * sample_value_errors[ch_start + it]);
345  auto const term1 = 1. + loffset;
346  auto const f = (term1 > 1e-6) ? fast_expf(alpha * (fast_logf(term1) - loffset)) : 0;
347  sumAf += sample_values[ch_start + it] * (f * inverr2);
348  sumff += f * (f * inverr2);
349  }
350 
351  auto const sumAA = sumAAsNullHypot[ch];
352  auto const sum0 = sum0sNullHypot[ch];
353  chi2 = sumAA;
354  // TODO: sum0 can not be 0 below, need to introduce the check upfront
355  if (sumff > 0) {
356  chi2 = sumAA - sumAf * (sumAf / sumff);
357  }
358  chi2 /= sum0;
359 
360 #ifdef DEBUG_TC_MAKERATIO
361  if (ch == 1 || ch == 0)
362  printf(
363  "ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = "
364  "%f\n",
365  ch,
366  ltx,
367  sumAf,
368  sumff,
369  sumAA,
370  static_cast<int>(sum0),
371  tmax,
372  tmaxerr,
373  chi2);
374 #endif
375 
376  if (chi2 > 0 && tmax > 0 && tmaxerr > 0)
377  internalCondForSkipping2 = false;
378  else
379  chi2 = std::numeric_limits<ScalarType>::max();
380  }
381  }
382 
383  // store into smem
384  shr_chi2s[idx.local] = chi2;
385  shr_chi2[idx.local] = chi2;
386  shr_tmax[idx.local] = tmax;
387  shr_tmaxerr[idx.local] = tmaxerr;
388  shr_condForUselessSamples[idx.local] = condForUselessSamples;
389  shr_internalCondForSkipping1[idx.local] = internalCondForSkipping1;
390  shr_internalCondForSkipping2[idx.local] = internalCondForSkipping2;
391  }
392 
393  alpaka::syncBlockThreads(acc);
394 
395  // find min chi2 - quite crude for now
396  // TODO validate/check
397  auto iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
398  bool oddElements = nthreads_per_channel % 2;
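  // pairwise tree reduction of the 45 per-pair chi2 values of each channel: iter is halved (rounded up) at every pass, with special treatment of the last element for odd counts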
400  while (iter >= 1) {
401  for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
402  auto const ltx = idx.global % nthreads_per_channel;
403 
404  if (ltx < iter && !(oddElements && (ltx == iter - 1 && ltx > 0))) {
405  // for odd ns, the last guy will just store itself
406  // exception is for ltx == 0 and iter==1
407  shr_chi2s[idx.local] = std::min(shr_chi2s[idx.local], shr_chi2s[idx.local + iter]);
408  }
409  }
410  alpaka::syncBlockThreads(acc);
411 
412  oddElements = iter % 2;
413  iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
414  }
415 
416  for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
417  auto const ltx = idx.global % nthreads_per_channel;
418 
419  // get precomputed flags for this element from shared memory
420  auto const condForUselessSamples = shr_condForUselessSamples[idx.local];
421  auto const internalCondForSkipping1 = shr_internalCondForSkipping1[idx.local];
422  auto const internalCondForSkipping2 = shr_internalCondForSkipping2[idx.local];
423  // filter out inactive threads and threads with useless samples
424  if (!condForUselessSamples && !internalCondForSkipping1 && !internalCondForSkipping2) {
425  // min chi2, now compute weighted average of tmax measurements
426  // see cpu version for more explanation
427  auto const chi2 = shr_chi2[idx.local];
428  auto const chi2min = shr_chi2s[idx.local - ltx];
429  auto const chi2Limit = chi2min + 1.;
430  auto const tmaxerr = shr_tmaxerr[idx.local];
431  auto const inverseSigmaSquared = chi2 < chi2Limit ? 1. / (tmaxerr * tmaxerr) : 0.;
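  // only pairs within one chi2 unit of the per-channel minimum contribute, each weighted by the inverse variance of its tmax estimate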
432 
433 #ifdef DEBUG_TC_MAKERATIO
434  auto const ch = idx.global / nthreads_per_channel;
435  if (ch == 1 || ch == 0) {
436  printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n",
437  ch,
438  ltx,
439  chi2min,
440  chi2Limit,
441  inverseSigmaSquared);
442  }
443 #endif
444 
445  // store into shared mem and run reduction
446  // TODO: check if cooperative groups would be better
447  // TODO: check if shuffling intrinsics are better
448  auto const tmax = shr_tmax[idx.local];
449  shr_time_wgt[idx.local] = inverseSigmaSquared;
450  shr_time_max[idx.local] = tmax * inverseSigmaSquared;
451  } else {
452  shr_time_wgt[idx.local] = 0;
453  shr_time_max[idx.local] = 0;
454  }
455  }
456 
457  alpaka::syncBlockThreads(acc);
458 
459  // reduce to compute time_max and time_wgt
460  iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
461  oddElements = nthreads_per_channel % 2;
463  while (iter >= 1) {
464  for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
465  auto const ltx = idx.global % nthreads_per_channel;
466 
467  if (ltx < iter && !(oddElements && (ltx == iter - 1 && ltx > 0))) {
468  shr_time_wgt[idx.local] += shr_time_wgt[idx.local + iter];
469  shr_time_max[idx.local] += shr_time_max[idx.local + iter];
470  shrTimeMax[idx.local] += shrTimeMax[idx.local + iter];
471  shrTimeWgt[idx.local] += shrTimeWgt[idx.local + iter];
472  }
473  }
474 
475  alpaka::syncBlockThreads(acc);
476  oddElements = iter % 2;
477  iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2;
478  }
479 
480  for (auto idx : cms::alpakatools::uniform_group_elements(acc, block, totalElements)) {
481  auto const ltx = idx.global % nthreads_per_channel;
482 
483  // load from shared memory the 0th guy (will contain accumulated values)
484  // compute
485  // store into global mem
486  if (ltx == 0) {
487  auto const ch = idx.global / nthreads_per_channel;
488  auto const tmp_time_max = shr_time_max[idx.local];
489  auto const tmp_time_wgt = shr_time_wgt[idx.local];
490 
491  // we are done if the number of time ratios is 0
492  if (tmp_time_wgt == 0 && tmp_time_max == 0) {
493  g_state[ch] = TimeComputationState::Finished;
494  continue;
495  }
496 
497  // no div by 0
498  auto const tMaxAlphaBeta = tmp_time_max / tmp_time_wgt;
499  auto const tMaxErrorAlphaBeta = 1. / std::sqrt(tmp_time_wgt);
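  // weighted mean of the contributing pair estimates: tMaxAlphaBeta = sum(w_i * t_i) / sum(w_i), with uncertainty 1 / sqrt(sum(w_i))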
500 
501  tMaxAlphaBetas[ch] = tMaxAlphaBeta;
502  tMaxErrorAlphaBetas[ch] = tMaxErrorAlphaBeta;
503  g_accTimeMax[ch] = shrTimeMax[idx.local];
504  g_accTimeWgt[ch] = shrTimeWgt[idx.local];
505  g_state[ch] = TimeComputationState::NotFinished;
506 
507 #ifdef DEBUG_TC_MAKERATIO
508  printf("ch = %d time_max = %f time_wgt = %f\n", ch, tmp_time_max, tmp_time_wgt);
509  printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n",
510  ch,
511  tMaxAlphaBeta,
512  tMaxErrorAlphaBeta,
513  shrTimeMax[idx.local],
514  shrTimeWgt[idx.local]);
515 #endif
516  }
517  }
518  }
519  }
520  };
521 
522  class Kernel_time_compute_findamplchi2_and_finish {
524 
525  public:
526  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
527  ALPAKA_FN_ACC void operator()(TAcc const& acc,
528  EcalDigiDeviceCollection::ConstView digisDevEB,
529  EcalDigiDeviceCollection::ConstView digisDevEE,
530  ScalarType* const sample_values,
531  ScalarType* const sample_value_errors,
532  bool* const useless_samples,
533  ScalarType* const g_tMaxAlphaBeta,
534  ScalarType* const g_tMaxErrorAlphaBeta,
535  ScalarType* const g_accTimeMax,
536  ScalarType* const g_accTimeWgt,
537  ScalarType* const sumAAsNullHypot,
538  ScalarType* const sum0sNullHypot,
539  ScalarType* const chi2sNullHypot,
540  TimeComputationState* g_state,
541  ScalarType* g_ampMaxAlphaBeta,
542  ScalarType* g_ampMaxError,
543  ScalarType* g_timeMax,
544  ScalarType* g_timeError,
545  EcalMultifitParametersDevice::ConstView paramsDev) const {
549  //#define DEBUG_FINDAMPLCHI2_AND_FINISH
550 
551  // constants
552  constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
553  auto const nchannels = digisDevEB.size() + digisDevEE.size();
554  auto const offsetForInputs = digisDevEB.size();
555 
556  auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
557 
558  // configure shared mem
559  // per block, we need #threads per block * 2 * sizeof(ScalarType)
560  // we run with N channels per block
561  auto* shr_sumAf = alpaka::getDynSharedMem<ScalarType>(acc);
562  auto* shr_sumff = shr_sumAf + elemsPerBlock;
563 
564  for (auto gtxforward : cms::alpakatools::uniform_elements(acc, nchannels * nsamples)) {
565  // go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution
566  auto gtx = nchannels * nsamples - 1 - gtxforward;
567  auto const ch = gtx / nsamples;
568  auto const elemIdx = gtx % elemsPerBlock;
569  auto const sample = elemIdx % nsamples;
570 
571  auto const* dids = ch >= offsetForInputs ? digisDevEE.id() : digisDevEB.id();
572  auto const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
573 
574  auto state = g_state[ch];
575  auto const did = DetId{dids[inputCh]};
576  auto* const amplitudeFitParameters = did.subdetId() == EcalBarrel ? paramsDev.amplitudeFitParamsEB().data()
577  : paramsDev.amplitudeFitParamsEE().data();
578 
579  // TODO is that better than storing into global and launching another kernel
580  // for the first 10 threads
581  if (state == TimeComputationState::NotFinished) {
582  auto const alpha = amplitudeFitParameters[0];
583  auto const beta = amplitudeFitParameters[1];
584  auto const alphabeta = alpha * beta;
585  auto const invalphabeta = 1. / alphabeta;
586  auto const tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
587  auto const sample_value = sample_values[gtx];
588  auto const sample_value_error = sample_value_errors[gtx];
589  auto const inverr2 =
590  useless_samples[gtx] ? static_cast<ScalarType>(0) : 1. / (sample_value_error * sample_value_error);
591  auto const offset = (static_cast<ScalarType>(sample) - tMaxAlphaBeta) * invalphabeta;
592  auto const term1 = 1. + offset;
593  auto const f = term1 > 1e-6 ? fast_expf(alpha * (fast_logf(term1) - offset)) : static_cast<ScalarType>(0.);
594  auto const sumAf = sample_value * (f * inverr2);
595  auto const sumff = f * (f * inverr2);
596 
597  // store into shared mem
598  shr_sumAf[elemIdx] = sumAf;
599  shr_sumff[elemIdx] = sumff;
600  } else {
601  shr_sumAf[elemIdx] = 0;
602  shr_sumff[elemIdx] = 0;
603  }
604 
605  alpaka::syncBlockThreads(acc);
606 
607  // reduce
608  // unroll completely here (but hardcoded)
609  if (sample < 5) {
610  shr_sumAf[elemIdx] += shr_sumAf[elemIdx + 5];
611  shr_sumff[elemIdx] += shr_sumff[elemIdx + 5];
612  }
613 
614  alpaka::syncBlockThreads(acc);
615 
616  if (sample < 2) {
617  // will need to subtract for ltx = 3, we double count here
618  shr_sumAf[elemIdx] += shr_sumAf[elemIdx + 2] + shr_sumAf[elemIdx + 3];
619  shr_sumff[elemIdx] += shr_sumff[elemIdx + 2] + shr_sumff[elemIdx + 3];
620  }
621 
622  alpaka::syncBlockThreads(acc);
623 
624  if (sample == 0) {
625  // exit if the state is done
626  // note, we do not exit before all __syncthreads are finished
627  if (state == TimeComputationState::Finished) {
628  g_timeMax[ch] = 5;
629  g_timeError[ch] = -999;
630  continue;
631  }
632 
633  // subtract to avoid double counting
634  auto const sumff = shr_sumff[elemIdx] + shr_sumff[elemIdx + 1] - shr_sumff[elemIdx + 3];
635  auto const sumAf = shr_sumAf[elemIdx] + shr_sumAf[elemIdx + 1] - shr_sumAf[elemIdx + 3];
636 
637  auto const ampMaxAlphaBeta = sumff > 0 ? sumAf / sumff : 0;
638  auto const sumAA = sumAAsNullHypot[ch];
639  auto const sum0 = sum0sNullHypot[ch];
640  auto const nullChi2 = chi2sNullHypot[ch];
641  if (sumff > 0) {
642  auto const chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0;
643  if (chi2AlphaBeta > nullChi2) {
644  // null hypothesis is better
645  state = TimeComputationState::Finished;
646 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
647  printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n",
648  ch,
649  chi2AlphaBeta,
650  nullChi2,
651  sumAA,
652  sumAf,
653  sumff,
654  sum0);
655 #endif
656  }
657 
658  // store to global
659  g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta;
660  } else {
661 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
662  printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n", ch, sum0, sumAA, sumff, sumAf);
663 #endif
664  state = TimeComputationState::Finished;
665  }
666 
667  // store the state to global and finish calcs
668  g_state[ch] = state;
669  if (state == TimeComputationState::Finished) {
670  // store default values into global
671  g_timeMax[ch] = 5;
672  g_timeError[ch] = -999;
673 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
674  printf("ch = %d finished state\n", ch);
675 #endif
676  continue;
677  }
678 
679  auto const ampMaxError = g_ampMaxError[ch];
680  auto const test_ratio = ampMaxAlphaBeta / ampMaxError;
681  auto const accTimeMax = g_accTimeMax[ch];
682  auto const accTimeWgt = g_accTimeWgt[ch];
683  auto const tMaxAlphaBeta = g_tMaxAlphaBeta[ch];
684  auto const tMaxErrorAlphaBeta = g_tMaxErrorAlphaBeta[ch];
685  // branch to separate large vs small pulses
686  // see cpu version for more info
687  if (test_ratio > 5. && accTimeWgt > 0) {
688  auto const tMaxRatio = accTimeWgt > 0 ? accTimeMax / accTimeWgt : static_cast<ScalarType>(0);
689  auto const tMaxErrorRatio = accTimeWgt > 0 ? 1. / std::sqrt(accTimeWgt) : static_cast<ScalarType>(0);
690 
691  if (test_ratio > 10.) {
692  g_timeMax[ch] = tMaxRatio;
693  g_timeError[ch] = tMaxErrorRatio;
694 
695 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
696  printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n", ch, tMaxRatio, tMaxErrorRatio);
697 #endif
698  } else {
699  auto const timeMax = (tMaxAlphaBeta * (10. - ampMaxAlphaBeta / ampMaxError) +
700  tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.)) /
701  5.;
702  auto const timeError = (tMaxErrorAlphaBeta * (10. - ampMaxAlphaBeta / ampMaxError) +
703  tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.)) /
704  5.;
705  state = TimeComputationState::Finished;
706  g_state[ch] = state;
707  g_timeMax[ch] = timeMax;
708  g_timeError[ch] = timeError;
709 
710 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
711  printf("ch = %d timeMax = %f timeError = %f\n", ch, timeMax, timeError);
712 #endif
713  }
714  } else {
715  state = TimeComputationState::Finished;
716  g_state[ch] = state;
717  g_timeMax[ch] = tMaxAlphaBeta;
718  g_timeError[ch] = tMaxErrorAlphaBeta;
719 
720 #ifdef DEBUG_FINDAMPLCHI2_AND_FINISH
721  printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n", ch, tMaxAlphaBeta, tMaxErrorAlphaBeta);
722 #endif
723  }
724  }
725  }
726  }
727  };
728 
729  class Kernel_time_compute_fixMGPAslew {
731 
732  public:
733  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
734  ALPAKA_FN_ACC void operator()(TAcc const& acc,
735  EcalDigiDeviceCollection::ConstView digisDevEB,
736  EcalDigiDeviceCollection::ConstView digisDevEE,
737  EcalMultifitConditionsDevice::ConstView conditionsDev,
738  ScalarType* sample_values,
739  ScalarType* sample_value_errors,
740  bool* useless_sample_values) const {
741  // constants
742  constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
743 
744  auto const nchannelsEB = digisDevEB.size();
745  auto const offsetForInputs = nchannelsEB;
746 
747  auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
748 
749  for (auto gtx : cms::alpakatools::uniform_elements(acc, nchannelsEB * nsamples)) {
750  auto const elemIdx = gtx % elemsPerBlock;
751  auto const sample = elemIdx % nsamples;
752  auto const ch = gtx / nsamples;
753 
754  // remove thread for sample 0, oversubscribing is easier than ....
755  if (sample == 0)
756  continue;
757 
758  if (!use_sample(conditionsDev.sampleMask_EB(), sample))
759  continue;
760 
761  int const inputGtx = ch >= offsetForInputs ? gtx - offsetForInputs * nsamples : gtx;
762  auto const* digis = ch >= offsetForInputs ? digisDevEE.data()->data() : digisDevEB.data()->data();
763 
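  // if the MGPA switched to a lower gain (larger gainId) between the previous and the current sample, the previous sample is presumably distorted by the gain switch, so it is flagged as useless below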
764  auto const gainIdPrev = ecalMGPA::gainId(digis[inputGtx - 1]);
765  auto const gainIdNext = ecalMGPA::gainId(digis[inputGtx]);
766  if (gainIdPrev >= 1 && gainIdPrev <= 3 && gainIdNext >= 1 && gainIdNext <= 3 && gainIdPrev < gainIdNext) {
767  sample_values[gtx - 1] = 0;
768  sample_value_errors[gtx - 1] = 1e+9;
769  useless_sample_values[gtx - 1] = true;
770  }
771  }
772  }
773  };
774 
775  //#define ECAL_RECO_ALPAKA_TC_INIT_DEBUG
776  class Kernel_time_computation_init {
778 
779  public:
780  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
781  ALPAKA_FN_ACC void operator()(TAcc const& acc,
782  EcalDigiDeviceCollection::ConstView digisDevEB,
783  EcalDigiDeviceCollection::ConstView digisDevEE,
784  EcalMultifitConditionsDevice::ConstView conditionsDev,
785  ScalarType* sample_values,
786  ScalarType* sample_value_errors,
787  ScalarType* ampMaxError,
788  bool* useless_sample_values,
789  char* pedestal_nums) const {
790  // constants
791  constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
792 
793  // indices
794  auto const nchannelsEB = digisDevEB.size();
795  auto const nchannels = nchannelsEB + digisDevEE.size();
796  auto const offsetForInputs = nchannelsEB;
797  auto const offsetForHashes = conditionsDev.offsetEE();
798 
799  auto const elemsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
800 
801  // configure shared mem
802  auto* shrSampleValues = alpaka::getDynSharedMem<ScalarType>(acc);
803  auto* shrSampleValueErrors = shrSampleValues + elemsPerBlock;
804 
805  for (auto txforward : cms::alpakatools::uniform_elements(acc, nchannels * nsamples)) {
806  // go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution
807  auto tx = nchannels * nsamples - 1 - txforward;
808  auto const ch = tx / nsamples;
809  auto const elemIdx = tx % elemsPerBlock;
810 
811  int const inputTx = ch >= offsetForInputs ? tx - offsetForInputs * nsamples : tx;
812  int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch;
813  auto const* digis = ch >= offsetForInputs ? digisDevEE.data()->data() : digisDevEB.data()->data();
814  auto const* dids = ch >= offsetForInputs ? digisDevEE.id() : digisDevEB.id();
815 
816  // indices/inits
817  auto const sample = tx % nsamples;
818  auto const input_ch_start = inputCh * nsamples;
819  ScalarType pedestal = 0.;
820  int num = 0;
821 
822  // 0 and 1 sample values
823  auto const adc0 = ecalMGPA::adc(digis[input_ch_start]);
824  auto const gainId0 = ecalMGPA::gainId(digis[input_ch_start]);
825  auto const adc1 = ecalMGPA::adc(digis[input_ch_start + 1]);
826  auto const gainId1 = ecalMGPA::gainId(digis[input_ch_start + 1]);
827  auto const did = DetId{dids[inputCh]};
828  auto const isBarrel = did.subdetId() == EcalBarrel;
829  auto const sample_mask = isBarrel ? conditionsDev.sampleMask_EB() : conditionsDev.sampleMask_EE();
830  auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
831  : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
832 
833  // set pedestal
834  // TODO this branch is non-divergent for a group of 10 threads
835  if (gainId0 == 1 && use_sample(sample_mask, 0)) {
836  pedestal = static_cast<ScalarType>(adc0);
837  num = 1;
838 
839  auto const diff = adc1 - adc0;
840  if (gainId1 == 1 && use_sample(sample_mask, 1) &&
841  std::abs(diff) < 3 * conditionsDev.pedestals_rms_x12()[hashedId]) {
842  pedestal = (pedestal + static_cast<ScalarType>(adc1)) / 2.;
843  num = 2;
844  }
845  } else {
846  pedestal = conditionsDev.pedestals_mean_x12()[ch];
847  }
848 
849  // ped subtracted and gain-renormalized samples.
850  auto const gainId = ecalMGPA::gainId(digis[inputTx]);
851  auto const adc = ecalMGPA::adc(digis[inputTx]);
852 
853  bool bad = false;
854  ScalarType sample_value, sample_value_error;
855  // TODO divergent branch
856  // TODO: piece below is general both for amplitudes and timing
857  // potentially there is a way to reduce the amount of code...
858  if (!use_sample(sample_mask, sample)) {
859  bad = true;
860  sample_value = 0;
861  sample_value_error = 0;
862  } else if (gainId == 1) {
863  sample_value = static_cast<ScalarType>(adc) - pedestal;
864  sample_value_error = conditionsDev.pedestals_rms_x12()[hashedId];
865  } else if (gainId == 2) {
866  auto const mean_x6 = conditionsDev.pedestals_mean_x6()[hashedId];
867  auto const rms_x6 = conditionsDev.pedestals_rms_x6()[hashedId];
868  auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId];
869  sample_value = (static_cast<ScalarType>(adc) - mean_x6) * gain12Over6;
870  sample_value_error = rms_x6 * gain12Over6;
871  } else if (gainId == 3) {
872  auto const mean_x1 = conditionsDev.pedestals_mean_x1()[hashedId];
873  auto const rms_x1 = conditionsDev.pedestals_rms_x1()[hashedId];
874  auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId];
875  auto const gain6Over1 = conditionsDev.gain6Over1()[hashedId];
876  sample_value = (static_cast<ScalarType>(adc) - mean_x1) * gain6Over1 * gain12Over6;
877  sample_value_error = rms_x1 * gain6Over1 * gain12Over6;
878  } else {
879  sample_value = 0;
880  sample_value_error = 0;
881  bad = true;
882  }
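  // at this point sample_value is the pedestal-subtracted amplitude rescaled to the gain-12 scale (via gain12Over6 and gain6Over1 for the lower gains), and sample_value_error is the corresponding pedestal RMS on that scale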
883 
884  // TODO: make sure we save things correctly when sample is useless
885  auto const useless_sample = (sample_value_error <= 0) | bad;
886  useless_sample_values[tx] = useless_sample;
887  sample_values[tx] = sample_value;
888  sample_value_errors[tx] = useless_sample ? 1e+9 : sample_value_error;
889 
890  // DEBUG
891 #ifdef ECAL_RECO_ALPAKA_TC_INIT_DEBUG
892  if (ch == 0) {
893  printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n",
894  sample,
895  sample_value,
896  sample_value_error,
897  useless_sample ? '1' : '0');
898  }
899 #endif
900 
901  // store into the shared mem
902  shrSampleValues[elemIdx] = sample_value_error > 0 ? sample_value : std::numeric_limits<ScalarType>::min();
903  shrSampleValueErrors[elemIdx] = sample_value_error;
904  alpaka::syncBlockThreads(acc);
905 
906  // perform the reduction with min
907  if (sample < 5) {
908  // note, if equal -> we keep the value with lower sample as for cpu
909  shrSampleValueErrors[elemIdx] = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 5]
910  ? shrSampleValueErrors[elemIdx + 5]
911  : shrSampleValueErrors[elemIdx];
912  shrSampleValues[elemIdx] = std::max(shrSampleValues[elemIdx], shrSampleValues[elemIdx + 5]);
913  }
914  alpaka::syncBlockThreads(acc);
915 
916  // a bit of an overkill, but easier than to compare across 3 values
917  if (sample < 3) {
918  shrSampleValueErrors[elemIdx] = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 3]
919  ? shrSampleValueErrors[elemIdx + 3]
920  : shrSampleValueErrors[elemIdx];
921  shrSampleValues[elemIdx] = std::max(shrSampleValues[elemIdx], shrSampleValues[elemIdx + 3]);
922  }
923  alpaka::syncBlockThreads(acc);
924 
925  if (sample < 2) {
926  shrSampleValueErrors[elemIdx] = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 2]
927  ? shrSampleValueErrors[elemIdx + 2]
928  : shrSampleValueErrors[elemIdx];
929  shrSampleValues[elemIdx] = std::max(shrSampleValues[elemIdx], shrSampleValues[elemIdx + 2]);
930  }
931  alpaka::syncBlockThreads(acc);
932 
933  if (sample == 0) {
934  // we only need the max error
935  auto const maxSampleValueError = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 1]
936  ? shrSampleValueErrors[elemIdx + 1]
937  : shrSampleValueErrors[elemIdx];
938 
939  // # pedestal samples used
940  pedestal_nums[ch] = num;
941  // this is used downstream
942  ampMaxError[ch] = maxSampleValueError;
943 
944  // DEBUG
945 #ifdef ECAL_RECO_ALPAKA_TC_INIT_DEBUG
946  if (ch == 0) {
947  printf("pedestal_nums = %d ampMaxError = %f\n", num, maxSampleValueError);
948  }
949 #endif
950  }
951  }
952  }
953  };
954 
958  //#define DEBUG_TIME_CORRECTION
959  class Kernel_time_correction_and_finalize {
961 
962  public:
963  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
964  ALPAKA_FN_ACC void operator()(TAcc const& acc,
965  EcalDigiDeviceCollection::ConstView digisDevEB,
966  EcalDigiDeviceCollection::ConstView digisDevEE,
967  EcalUncalibratedRecHitDeviceCollection::View uncalibRecHitsEB,
968  EcalUncalibratedRecHitDeviceCollection::View uncalibRecHitsEE,
969  EcalMultifitConditionsDevice::ConstView conditionsDev,
970  ScalarType* const g_timeMax,
971  ScalarType* const g_timeError,
972  ConfigurationParameters::type const timeConstantTermEB,
973  ConfigurationParameters::type const timeConstantTermEE,
974  ConfigurationParameters::type const timeNconstEB,
975  ConfigurationParameters::type const timeNconstEE,
976  ConfigurationParameters::type const amplitudeThresholdEB,
977  ConfigurationParameters::type const amplitudeThresholdEE,
978  ConfigurationParameters::type const outOfTimeThreshG12pEB,
979  ConfigurationParameters::type const outOfTimeThreshG12pEE,
980  ConfigurationParameters::type const outOfTimeThreshG12mEB,
981  ConfigurationParameters::type const outOfTimeThreshG12mEE,
982  ConfigurationParameters::type const outOfTimeThreshG61pEB,
983  ConfigurationParameters::type const outOfTimeThreshG61pEE,
984  ConfigurationParameters::type const outOfTimeThreshG61mEB,
985  ConfigurationParameters::type const outOfTimeThreshG61mEE) const {
986  // constants
987  constexpr auto nsamples = EcalDataFrame::MAXSAMPLES;
988  auto const nchannelsEB = digisDevEB.size();
989  auto const nchannels = nchannelsEB + digisDevEE.size();
990  auto const offsetForInputs = nchannelsEB;
991  auto const offsetForHashes = conditionsDev.offsetEE();
992 
993  for (auto gtx : cms::alpakatools::uniform_elements(acc, nchannels)) {
994  const int inputGtx = gtx >= offsetForInputs ? gtx - offsetForInputs : gtx;
995  auto const* dids = gtx >= offsetForInputs ? digisDevEE.id() : digisDevEB.id();
996  auto const* digis = gtx >= offsetForInputs ? digisDevEE.data()->data() : digisDevEB.data()->data();
997 
998  auto* g_amplitude = gtx >= nchannelsEB ? uncalibRecHitsEE.amplitude() : uncalibRecHitsEB.amplitude();
999  auto* g_jitter = gtx >= nchannelsEB ? uncalibRecHitsEE.jitter() : uncalibRecHitsEB.jitter();
1000  auto* g_jitterError = gtx >= nchannelsEB ? uncalibRecHitsEE.jitterError() : uncalibRecHitsEB.jitterError();
1001  auto* flags = gtx >= nchannelsEB ? uncalibRecHitsEE.flags() : uncalibRecHitsEB.flags();
1002 
1003  auto const did = DetId{dids[inputGtx]};
1004  auto const isBarrel = did.subdetId() == EcalBarrel;
1005  auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId())
1006  : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId());
1007  // need to access the underlying data directly here because the std::arrays have different size for EB and EE, which is not compatible with the ? operator
1008  auto* const amplitudeBins = isBarrel ? conditionsDev.timeBiasCorrections_amplitude_EB().data()
1009  : conditionsDev.timeBiasCorrections_amplitude_EE().data();
1010  auto* const shiftBins = isBarrel ? conditionsDev.timeBiasCorrections_shift_EB().data()
1011  : conditionsDev.timeBiasCorrections_shift_EE().data();
1012  auto const amplitudeBinsSize =
1013  isBarrel ? conditionsDev.timeBiasCorrectionSizeEB() : conditionsDev.timeBiasCorrectionSizeEE();
1014  auto const timeConstantTerm = isBarrel ? timeConstantTermEB : timeConstantTermEE;
1015  auto const timeNconst = isBarrel ? timeNconstEB : timeNconstEE;
1016  auto const offsetTimeValue = isBarrel ? conditionsDev.timeOffset_EB() : conditionsDev.timeOffset_EE();
1017  auto const amplitudeThreshold = isBarrel ? amplitudeThresholdEB : amplitudeThresholdEE;
1018  auto const outOfTimeThreshG12p = isBarrel ? outOfTimeThreshG12pEB : outOfTimeThreshG12pEE;
1019  auto const outOfTimeThreshG12m = isBarrel ? outOfTimeThreshG12mEB : outOfTimeThreshG12mEE;
1020  auto const outOfTimeThreshG61p = isBarrel ? outOfTimeThreshG61pEB : outOfTimeThreshG61pEE;
1021  auto const outOfTimeThreshG61m = isBarrel ? outOfTimeThreshG61mEB : outOfTimeThreshG61mEE;
1022 
1023  // load some
1024  auto const amplitude = g_amplitude[inputGtx];
1025  auto const rms_x12 = conditionsDev.pedestals_rms_x12()[hashedId];
1026  auto const timeCalibConst = conditionsDev.timeCalibConstants()[hashedId];
1027 
1028  int myBin = -1;
1029  for (size_t bin = 0; bin < amplitudeBinsSize; ++bin) {
1030  if (amplitude > amplitudeBins[bin])
1031  myBin = bin;
1032  else
1033  break;
1034  }
1035 
1036  ScalarType correction = 0;
1037  if (myBin == -1) {
1038  correction = shiftBins[0];
1039  } else if (myBin == static_cast<int>(amplitudeBinsSize) - 1) {
1040  correction = shiftBins[myBin];
1041  } else {
1042  correction = shiftBins[myBin + 1] - shiftBins[myBin];
1043  correction *= (amplitude - amplitudeBins[myBin]) / (amplitudeBins[myBin + 1] - amplitudeBins[myBin]);
1044  correction += shiftBins[myBin];
1045  }
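  // correction is the tabulated time bias (presumably in ns, hence the 1/25 conversion to clock units below) interpolated linearly in amplitude between the surrounding bins and clamped to the first/last bin outside the tabulated range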
1046 
1047  // correction * 1./25.
1048  correction = correction * 0.04;
1049  auto const timeMax = g_timeMax[gtx];
1050  auto const timeError = g_timeError[gtx];
1051  auto const jitter = timeMax - 5 + correction;
1052  auto const jitterError =
1053  std::sqrt(timeError * timeError + timeConstantTerm * timeConstantTerm * 0.04 * 0.04); // 0.04 = 1./25.
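  // jitter is expressed in units of the 25 ns clock, relative to sample 5 (the nominal pulse maximum), consistent with the default g_timeMax value of 5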
1054 
1055 #ifdef DEBUG_TIME_CORRECTION
1056  printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n",
1057  gtx,
1058  timeMax,
1059  timeError,
1060  jitter,
1061  correction);
1062 #endif
1063 
1064  // store back to global
1065  g_jitter[inputGtx] = jitter;
1066  g_jitterError[inputGtx] = jitterError;
1067 
1068  // set the flag
1069  // TODO: replace with something more efficient (if required),
1070  // for now just to make it work
1071  if (amplitude > amplitudeThreshold * rms_x12) {
1072  auto threshP = outOfTimeThreshG12p;
1073  auto threshM = outOfTimeThreshG12m;
1074  if (amplitude > 3000.) {
1075  for (int isample = 0; isample < nsamples; isample++) {
1076  auto const gainid = ecalMGPA::gainId(digis[nsamples * inputGtx + isample]);
1077  if (gainid != 1) {
1078  threshP = outOfTimeThreshG61p;
1079  threshM = outOfTimeThreshG61m;
1080  break;
1081  }
1082  }
1083  }
1084 
1085  auto const correctedTime = (timeMax - 5) * 25 + timeCalibConst + offsetTimeValue;
1086  auto const nterm = timeNconst * rms_x12 / amplitude;
1087  auto const sigmat = std::sqrt(nterm * nterm + timeConstantTerm * timeConstantTerm);
1088  if (correctedTime > sigmat * threshP || correctedTime < -sigmat * threshM)
1089  flags[inputGtx] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime;
1090  }
1091  }
1092  }
1093  };
1094 
1095 } // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit
1096 
1097 namespace alpaka::trait {
1098  using namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit;
1099 
1101  template <typename TAcc>
1102  struct BlockSharedMemDynSizeBytes<Kernel_time_compute_nullhypot, TAcc> {
1104  template <typename TVec, typename... TArgs>
1105  ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_compute_nullhypot const&,
1106  TVec const& threadsPerBlock,
1107  TVec const& elemsPerThread,
1108  TArgs const&...) -> std::size_t {
1110 
1111  // return the amount of dynamic shared memory needed
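  // 4 per-element ScalarType arrays: s_sum0, s_sum1, s_sumA, s_sumAA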
1112  std::size_t bytes = threadsPerBlock[0u] * elemsPerThread[0u] * 4 * sizeof(ScalarType);
1113  return bytes;
1114  }
1115  };
1116 
1118  template <typename TAcc>
1119  struct BlockSharedMemDynSizeBytes<Kernel_time_compute_makeratio, TAcc> {
1120  template <typename TVec, typename... TArgs>
1121  ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_compute_makeratio const&,
1122  TVec const& threadsPerBlock,
1123  TVec const& elemsPerThread,
1124  TArgs const&...) -> std::size_t {
1126 
1127  std::size_t bytes = (8 * sizeof(ScalarType) + 3 * sizeof(bool)) * threadsPerBlock[0u] * elemsPerThread[0u];
1128  return bytes;
1129  }
1130  };
1131 
1133  template <typename TAcc>
1134  struct BlockSharedMemDynSizeBytes<Kernel_time_compute_findamplchi2_and_finish, TAcc> {
1135  template <typename TVec, typename... TArgs>
1136  ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_compute_findamplchi2_and_finish const&,
1137  TVec const& threadsPerBlock,
1138  TVec const& elemsPerThread,
1139  TArgs const&...) -> std::size_t {
1141 
1142  std::size_t bytes = 2 * threadsPerBlock[0u] * elemsPerThread[0u] * sizeof(ScalarType);
1143  return bytes;
1144  }
1145  };
1146 
1148  template <typename TAcc>
1149  struct BlockSharedMemDynSizeBytes<Kernel_time_computation_init, TAcc> {
1150  template <typename TVec, typename... TArgs>
1151  ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_computation_init const&,
1152  TVec const& threadsPerBlock,
1153  TVec const& elemsPerThread,
1154  TArgs const&...) -> std::size_t {
1156 
1157  std::size_t bytes = 2 * threadsPerBlock[0u] * elemsPerThread[0u] * sizeof(ScalarType);
1158  return bytes;
1159  }
1160  };
1161 
1162 } // namespace alpaka::trait
1163 
1164 #endif // RecoLocalCalo_EcalRecProducers_plugins_alpaka_TimeComputationKernels_h