CMS 3D CMS Logo

PFClusterSoAProducerKernel.dev.cc
Go to the documentation of this file.
1 #include <alpaka/alpaka.hpp>
2 
7 
11 
13 
14  using namespace cms::alpakatools;
15 
16  using namespace reco::pfClustering;
17 
19  static constexpr uint32_t blocksForExoticClusters = 4;
20 
21  // cutoffFraction -> Is a rechit almost entirely attributed to one cluster
22  // cutoffDistance -> Is a rechit close enough to a cluster to be associated
23  // Values are from RecoParticleFlow/PFClusterProducer/plugins/Basic2DGenericPFlowClusterizer.cc
24  static constexpr float cutoffDistance = 100.;
25  static constexpr float cutoffFraction = 0.9999;
26 
27  static constexpr uint32_t kHBHalf = 1296;
28  static constexpr uint32_t maxTopoInput = 2 * kHBHalf;
29 
30  // Calculation of dR2 for Clustering
31  ALPAKA_FN_ACC ALPAKA_FN_INLINE static float dR2(Position4 pos1, Position4 pos2) {
32  float mag1 = sqrtf(pos1.x * pos1.x + pos1.y * pos1.y + pos1.z * pos1.z);
33  float cosTheta1 = mag1 > 0.0 ? pos1.z / mag1 : 1.0;
34  float eta1 = 0.5f * logf((1.0f + cosTheta1) / (1.0f - cosTheta1));
35  float phi1 = atan2f(pos1.y, pos1.x);
36 
37  float mag2 = sqrtf(pos2.x * pos2.x + pos2.y * pos2.y + pos2.z * pos2.z);
38  float cosTheta2 = mag2 > 0.0 ? pos2.z / mag2 : 1.0;
39  float eta2 = 0.5f * logf((1.0f + cosTheta2) / (1.0f - cosTheta2));
40  float phi2 = atan2f(pos2.y, pos2.x);
41 
42  float deta = eta2 - eta1;
43  constexpr const float fPI = M_PI;
44  float dphi = std::abs(std::abs(phi2 - phi1) - fPI) - fPI;
45  return (deta * deta + dphi * dphi);
46  }
47 
48  // Get index of seed
49  ALPAKA_FN_ACC static auto getSeedRhIdx(int* seeds, int seedNum) { return seeds[seedNum]; }
50 
51  // Get rechit fraction of a given rechit for a given seed
52  ALPAKA_FN_ACC static auto getRhFrac(reco::PFClusteringVarsDeviceCollection::View pfClusteringVars,
53  int topoSeedBegin,
55  int seedNum,
56  int rhNum) {
57  int seedIdx = pfClusteringVars[topoSeedBegin + seedNum].topoSeedList();
58  return fracView[pfClusteringVars[seedIdx].seedFracOffsets() + rhNum].frac();
59  }
60 
61  // Cluster position calculation
62  template <bool debug = false>
63  ALPAKA_FN_ACC static void updateClusterPos(reco::PFClusterParamsDeviceCollection::ConstView pfClusParams,
64  Position4& pos4,
65  float frac,
66  int rhInd,
67  reco::PFRecHitDeviceCollection::ConstView pfRecHits,
68  float rhENormInv) {
69  Position4 rechitPos = Position4{pfRecHits[rhInd].x(), pfRecHits[rhInd].y(), pfRecHits[rhInd].z(), 1.0};
70  const auto rh_energy = pfRecHits[rhInd].energy() * frac;
71  const auto norm = (frac < pfClusParams.minFracInCalc() ? 0.0f : std::max(0.0f, logf(rh_energy * rhENormInv)));
72  if constexpr (debug)
73  printf("\t\t\trechit %d: norm = %f\tfrac = %f\trh_energy = %f\tpos = (%f, %f, %f)\n",
74  rhInd,
75  norm,
76  frac,
77  rh_energy,
78  rechitPos.x,
79  rechitPos.y,
80  rechitPos.z);
81 
82  pos4.x += rechitPos.x * norm;
83  pos4.y += rechitPos.y * norm;
84  pos4.z += rechitPos.z * norm;
85  pos4.w += norm; // position_norm
86  }
87 
88  // Processing single seed clusters
89  // Device function designed to be called by all threads of a given block
90  template <bool debug = false, typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
91  ALPAKA_FN_ACC static void hcalFastCluster_singleSeed(
92  const TAcc& acc,
93  reco::PFClusterParamsDeviceCollection::ConstView pfClusParams,
94  const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
95  int topoId, // from selection
96  int nRHTopo, // from selection
97  reco::PFRecHitDeviceCollection::ConstView pfRecHits,
101  int tid = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; // thread index is rechit number
102  // Declaration of shared variables
103  int& i = alpaka::declareSharedVar<int, __COUNTER__>(acc);
104  int& nRHOther = alpaka::declareSharedVar<int, __COUNTER__>(acc);
105  unsigned int& iter = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
106  float& tol = alpaka::declareSharedVar<float, __COUNTER__>(acc);
107  float& clusterEnergy = alpaka::declareSharedVar<float, __COUNTER__>(acc);
108  float& rhENormInv = alpaka::declareSharedVar<float, __COUNTER__>(acc);
109  float& seedEnergy = alpaka::declareSharedVar<float, __COUNTER__>(acc);
110  Position4& clusterPos = alpaka::declareSharedVar<Position4, __COUNTER__>(acc);
111  Position4& prevClusterPos = alpaka::declareSharedVar<Position4, __COUNTER__>(acc);
112  Position4& seedPos = alpaka::declareSharedVar<Position4, __COUNTER__>(acc);
113  bool& notDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
114  if (once_per_block(acc)) {
115  i = pfClusteringVars[pfClusteringVars[topoId].topoSeedOffsets()].topoSeedList(); // i is the seed rechit index
116  nRHOther = nRHTopo - 1; // number of non-seed rechits
117  seedPos = Position4{pfRecHits[i].x(), pfRecHits[i].y(), pfRecHits[i].z(), 1.};
118  clusterPos = seedPos; // Initial cluster position is just the seed
119  prevClusterPos = seedPos;
120  seedEnergy = pfRecHits[i].energy();
121  clusterEnergy = seedEnergy;
122  tol = pfClusParams.stoppingTolerance(); // stopping tolerance * tolerance scaling
123 
124  if (topology.cutsFromDB()) {
125  rhENormInv = (1.f / topology[pfRecHits[i].denseId()].noiseThreshold());
126  } else {
127  if (pfRecHits[i].layer() == PFLayer::HCAL_BARREL1)
128  rhENormInv = pfClusParams.recHitEnergyNormInvHB_vec()[pfRecHits[i].depth() - 1];
129  else if (pfRecHits[i].layer() == PFLayer::HCAL_ENDCAP)
130  rhENormInv = pfClusParams.recHitEnergyNormInvHE_vec()[pfRecHits[i].depth() - 1];
131  else {
132  rhENormInv = 0.;
133  printf("Rechit %d has invalid layer %d!\n", i, pfRecHits[i].layer());
134  }
135  }
136 
137  iter = 0;
138  notDone = true;
139  }
140  alpaka::syncBlockThreads(acc); // all threads call sync
141 
142  int j = -1; // j is the rechit index for this thread
143  int rhFracOffset = -1;
144  Position4 rhPos;
145  float rhEnergy = -1., rhPosNorm = -1.;
146 
147  if (tid < nRHOther) {
148  rhFracOffset =
149  pfClusteringVars[i].seedFracOffsets() + tid + 1; // Offset for this rechit in pcrhfrac, pcrhfracidx arrays
150  j = fracView[rhFracOffset].pfrhIdx(); // rechit index for this thread
151  rhPos = Position4{pfRecHits[j].x(), pfRecHits[j].y(), pfRecHits[j].z(), 1.};
152  rhEnergy = pfRecHits[j].energy();
153  rhPosNorm = fmaxf(0., logf(rhEnergy * rhENormInv));
154  }
155  alpaka::syncBlockThreads(acc); // all threads call sync
156 
157  do {
158  if constexpr (debug) {
159  if (once_per_block(acc))
160  printf("\n--- Now on iter %d for topoId %d ---\n", iter, topoId);
161  }
162  float dist2 = -1., d2 = -1., fraction = -1.;
163  if (tid < nRHOther) {
164  // Rechit distance calculation
165  dist2 = (clusterPos.x - rhPos.x) * (clusterPos.x - rhPos.x) +
166  (clusterPos.y - rhPos.y) * (clusterPos.y - rhPos.y) +
167  (clusterPos.z - rhPos.z) * (clusterPos.z - rhPos.z);
168 
169  d2 = dist2 / pfClusParams.showerSigma2();
170  fraction = clusterEnergy * rhENormInv * expf(-0.5f * d2);
171 
172  // For single seed clusters, rechit fraction is either 1 (100%) or -1 (not included)
173  if (fraction > pfClusParams.minFracTot() && d2 < cutoffDistance)
174  fraction = 1.;
175  else
176  fraction = -1.;
177  fracView[rhFracOffset].frac() = fraction;
178  }
179  alpaka::syncBlockThreads(acc); // all threads call sync
180 
181  if constexpr (debug) {
182  if (once_per_block(acc))
183  printf("Computing cluster position for topoId %d\n", topoId);
184  }
185 
186  if (once_per_block(acc)) {
187  // Reset cluster position and energy
188  clusterPos = seedPos;
189  clusterEnergy = seedEnergy;
190  }
191  alpaka::syncBlockThreads(acc); // all threads call sync
192 
193  // Recalculate cluster position and energy
194  if (fraction > -0.5) {
195  alpaka::atomicAdd(acc, &clusterEnergy, rhEnergy, alpaka::hierarchy::Threads{});
196  alpaka::atomicAdd(acc, &clusterPos.x, rhPos.x * rhPosNorm, alpaka::hierarchy::Threads{});
197  alpaka::atomicAdd(acc, &clusterPos.y, rhPos.y * rhPosNorm, alpaka::hierarchy::Threads{});
198  alpaka::atomicAdd(acc, &clusterPos.z, rhPos.z * rhPosNorm, alpaka::hierarchy::Threads{});
199  alpaka::atomicAdd(acc, &clusterPos.w, rhPosNorm, alpaka::hierarchy::Threads{}); // position_norm
200  }
201  alpaka::syncBlockThreads(acc); // all threads call sync
202 
203  if (once_per_block(acc)) {
204  // Normalize the seed postiion
205  if (clusterPos.w >= pfClusParams.minAllowedNormalization()) {
206  // Divide by position norm
207  clusterPos.x /= clusterPos.w;
208  clusterPos.y /= clusterPos.w;
209  clusterPos.z /= clusterPos.w;
210 
211  if constexpr (debug)
212  printf("\tPF cluster (seed %d) energy = %f\tposition = (%f, %f, %f)\n",
213  i,
214  clusterEnergy,
215  clusterPos.x,
216  clusterPos.y,
217  clusterPos.z);
218  } else {
219  if constexpr (debug)
220  printf("\tPF cluster (seed %d) position norm (%f) less than minimum (%f)\n",
221  i,
222  clusterPos.w,
223  pfClusParams.minAllowedNormalization());
224  clusterPos.x = 0.;
225  clusterPos.y = 0.;
226  clusterPos.z = 0.;
227  }
228  float diff2 = dR2(prevClusterPos, clusterPos);
229  if constexpr (debug)
230  printf("\tPF cluster (seed %d) has diff2 = %f\n", i, diff2);
231  prevClusterPos = clusterPos; // Save clusterPos
232 
233  float tol2 = tol * tol;
234  iter++;
235  notDone = (diff2 > tol2) && (iter < pfClusParams.maxIterations());
236  if constexpr (debug) {
237  if (diff2 > tol2)
238  printf("\tTopoId %d has diff2 = %f greater than squared tolerance %f (continuing)\n", topoId, diff2, tol2);
239  else if constexpr (debug)
240  printf("\tTopoId %d has diff2 = %f LESS than squared tolerance %f (terminating!)\n", topoId, diff2, tol2);
241  }
242  }
243  alpaka::syncBlockThreads(acc); // all threads call sync
244  } while (notDone); // shared variable condition ensures synchronization is well defined
245  if (once_per_block(acc)) { // Cluster is finalized, assign cluster information to te SoA
246  int rhIdx =
247  pfClusteringVars[pfClusteringVars[topoId].topoSeedOffsets()].topoSeedList(); // i is the seed rechit index
248  int seedIdx = pfClusteringVars[rhIdx].rhIdxToSeedIdx();
249  clusterView[seedIdx].energy() = clusterEnergy;
250  clusterView[seedIdx].x() = clusterPos.x;
251  clusterView[seedIdx].y() = clusterPos.y;
252  clusterView[seedIdx].z() = clusterPos.z;
253  }
254  }
255 
256  // Processing clusters up to 100 seeds and 512 non-seed rechits using shared memory accesses
257  // Device function designed to be called by all threads of a given block
258  template <bool debug = false, typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
259  ALPAKA_FN_ACC static void hcalFastCluster_multiSeedParallel(
260  const TAcc& acc,
261  reco::PFClusterParamsDeviceCollection::ConstView pfClusParams,
262  const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
263  int topoId, // from selection
264  int nSeeds, // from selection
265  int nRHTopo, // from selection
266  reco::PFRecHitDeviceCollection::ConstView pfRecHits,
270  int tid = alpaka::getIdx<alpaka::Block,
271  alpaka::Threads>( // Thread index corresponds to a single rechit of the topo cluster
272  acc)[0u];
273 
274  int& nRHNotSeed = alpaka::declareSharedVar<int, __COUNTER__>(acc);
275  int& topoSeedBegin = alpaka::declareSharedVar<int, __COUNTER__>(acc);
276  int& stride = alpaka::declareSharedVar<int, __COUNTER__>(acc);
277  int& iter = alpaka::declareSharedVar<int, __COUNTER__>(acc);
278  float& tol = alpaka::declareSharedVar<float, __COUNTER__>(acc);
279  float& diff2 = alpaka::declareSharedVar<float, __COUNTER__>(acc);
280  float& rhENormInv = alpaka::declareSharedVar<float, __COUNTER__>(acc);
281  bool& notDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
282  auto& clusterPos = alpaka::declareSharedVar<Position4[100], __COUNTER__>(acc);
283  auto& prevClusterPos = alpaka::declareSharedVar<Position4[100], __COUNTER__>(acc);
284  auto& clusterEnergy = alpaka::declareSharedVar<float[100], __COUNTER__>(acc);
285  auto& rhFracSum = alpaka::declareSharedVar<float[threadsPerBlockForClustering], __COUNTER__>(acc);
286  auto& seeds = alpaka::declareSharedVar<int[100], __COUNTER__>(acc);
287  auto& rechits = alpaka::declareSharedVar<int[threadsPerBlockForClustering], __COUNTER__>(acc);
288 
289  if (once_per_block(acc)) {
290  nRHNotSeed = nRHTopo - nSeeds + 1; // 1 + (# rechits per topoId that are NOT seeds)
291  topoSeedBegin = pfClusteringVars[topoId].topoSeedOffsets();
292  tol = pfClusParams.stoppingTolerance() *
293  powf(fmaxf(1.0, nSeeds - 1), 2.0); // stopping tolerance * tolerance scaling
294  stride = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
295  iter = 0;
296  notDone = true;
297 
298  int i = pfClusteringVars[topoSeedBegin].topoSeedList();
299 
300  if (topology.cutsFromDB()) {
301  rhENormInv = (1.f / topology[pfRecHits[i].denseId()].noiseThreshold());
302  } else {
303  if (pfRecHits[i].layer() == PFLayer::HCAL_BARREL1)
304  rhENormInv = pfClusParams.recHitEnergyNormInvHB_vec()[pfRecHits[i].depth() - 1];
305  else if (pfRecHits[i].layer() == PFLayer::HCAL_ENDCAP)
306  rhENormInv = pfClusParams.recHitEnergyNormInvHE_vec()[pfRecHits[i].depth() - 1];
307  else {
308  rhENormInv = 0.;
309  printf("Rechit %d has invalid layer %d!\n", i, pfRecHits[i].layer());
310  }
311  }
312  }
313  alpaka::syncBlockThreads(acc); // all threads call sync
314 
315  if (tid < nSeeds)
316  seeds[tid] = pfClusteringVars[topoSeedBegin + tid].topoSeedList();
317  if (tid < nRHNotSeed - 1)
318  rechits[tid] =
319  fracView[pfClusteringVars[pfClusteringVars[topoSeedBegin].topoSeedList()].seedFracOffsets() + tid + 1]
320  .pfrhIdx();
321 
322  alpaka::syncBlockThreads(acc); // all threads call sync
323 
324  if constexpr (debug) {
325  if (once_per_block(acc)) {
326  printf("\n===========================================================================================\n");
327  printf("Processing topo cluster %d with nSeeds = %d nRHTopo = %d and seeds (", topoId, nSeeds, nRHTopo);
328  for (int s = 0; s < nSeeds; s++) {
329  if (s != 0)
330  printf(", ");
331  printf("%d", getSeedRhIdx(seeds, s));
332  }
333  if (nRHTopo == nSeeds) {
334  printf(")\n\n");
335  } else {
336  printf(") and other rechits (");
337  for (int r = 1; r < nRHNotSeed; r++) {
338  if (r != 1)
339  printf(", ");
340  if (r <= 0) {
341  printf("Invalid rhNum (%d) for get RhFracIdx!\n", r);
342  }
343  printf("%d", rechits[r - 1]);
344  }
345  printf(")\n\n");
346  }
347  }
348  alpaka::syncBlockThreads(acc); // all (or none) threads call sync
349  }
350 
351  // Set initial cluster position (energy) to seed rechit position (energy)
352  if (tid < nSeeds) {
353  int i = getSeedRhIdx(seeds, tid);
354  clusterPos[tid] = Position4{pfRecHits[i].x(), pfRecHits[i].y(), pfRecHits[i].z(), 1.0};
355  prevClusterPos[tid] = clusterPos[tid];
356  clusterEnergy[tid] = pfRecHits[i].energy();
357  for (int r = 0; r < (nRHNotSeed - 1); r++) {
358  fracView[pfClusteringVars[i].seedFracOffsets() + r + 1].pfrhIdx() = rechits[r];
359  fracView[pfClusteringVars[i].seedFracOffsets() + r + 1].frac() = -1.;
360  }
361  }
362  alpaka::syncBlockThreads(acc); // all threads call sync
363 
364  int rhThreadIdx = -1;
365  Position4 rhThreadPos;
366  if (tid < (nRHNotSeed - 1)) {
367  rhThreadIdx = rechits[tid]; // Index when thread represents rechit
368  rhThreadPos = Position4{pfRecHits[rhThreadIdx].x(), pfRecHits[rhThreadIdx].y(), pfRecHits[rhThreadIdx].z(), 1.};
369  }
370 
371  // Neighbors when threadIdx represents seed
372  int seedThreadIdx = -1;
373  Neighbours4 seedNeighbors = Neighbours4{-9, -9, -9, -9};
374  float seedEnergy = -1.;
375  Position4 seedInitClusterPos = Position4{0., 0., 0., 0.};
376  if (tid < nSeeds) {
377  if constexpr (debug)
378  printf("tid: %d\n", tid);
379  seedThreadIdx = getSeedRhIdx(seeds, tid);
380  seedNeighbors = Neighbours4{pfRecHits[seedThreadIdx].neighbours()(0),
381  pfRecHits[seedThreadIdx].neighbours()(1),
382  pfRecHits[seedThreadIdx].neighbours()(2),
383  pfRecHits[seedThreadIdx].neighbours()(3)};
384  seedEnergy = pfRecHits[seedThreadIdx].energy();
385 
386  // Compute initial cluster position shift for seed
387  updateClusterPos(pfClusParams, seedInitClusterPos, 1., seedThreadIdx, pfRecHits, rhENormInv);
388  }
389 
390  do {
391  if constexpr (debug) {
392  if (once_per_block(acc))
393  printf("\n--- Now on iter %d for topoId %d ---\n", iter, topoId);
394  }
395 
396  // Reset rhFracSum
397  rhFracSum[tid] = 0.;
398  if (once_per_block(acc))
399  diff2 = -1;
400 
401  if (tid < (nRHNotSeed - 1)) {
402  for (int s = 0; s < nSeeds; s++) {
403  float dist2 = (clusterPos[s].x - rhThreadPos.x) * (clusterPos[s].x - rhThreadPos.x) +
404  (clusterPos[s].y - rhThreadPos.y) * (clusterPos[s].y - rhThreadPos.y) +
405  (clusterPos[s].z - rhThreadPos.z) * (clusterPos[s].z - rhThreadPos.z);
406 
407  float d2 = dist2 / pfClusParams.showerSigma2();
408  float fraction = clusterEnergy[s] * rhENormInv * expf(-0.5f * d2);
409 
410  rhFracSum[tid] += fraction;
411  }
412  }
413  alpaka::syncBlockThreads(acc); // all threads call sync
414 
415  if (tid < (nRHNotSeed - 1)) {
416  for (int s = 0; s < nSeeds; s++) {
417  int i = seeds[s];
418  float dist2 = (clusterPos[s].x - rhThreadPos.x) * (clusterPos[s].x - rhThreadPos.x) +
419  (clusterPos[s].y - rhThreadPos.y) * (clusterPos[s].y - rhThreadPos.y) +
420  (clusterPos[s].z - rhThreadPos.z) * (clusterPos[s].z - rhThreadPos.z);
421 
422  float d2 = dist2 / pfClusParams.showerSigma2();
423  float fraction = clusterEnergy[s] * rhENormInv * expf(-0.5f * d2);
424 
425  if (rhFracSum[tid] > pfClusParams.minFracTot()) {
426  float fracpct = fraction / rhFracSum[tid];
427  if (fracpct > cutoffFraction || (d2 < cutoffDistance && fracpct > pfClusParams.minFracToKeep())) {
428  fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = fracpct;
429  } else {
430  fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = -1;
431  }
432  } else {
433  fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = -1;
434  }
435  }
436  }
437  alpaka::syncBlockThreads(acc); // all threads call sync
438 
439  if constexpr (debug) {
440  if (once_per_block(acc))
441  printf("Computing cluster position for topoId %d\n", topoId);
442  }
443 
444  // Reset cluster position and energy
445  if (tid < nSeeds) {
446  clusterPos[tid] = seedInitClusterPos;
447  clusterEnergy[tid] = seedEnergy;
448  if constexpr (debug) {
449  printf("Cluster %d (seed %d) has energy %f\tpos = (%f, %f, %f, %f)\n",
450  tid,
451  seeds[tid],
452  clusterEnergy[tid],
453  clusterPos[tid].x,
454  clusterPos[tid].y,
455  clusterPos[tid].z,
456  clusterPos[tid].w);
457  }
458  }
459  alpaka::syncBlockThreads(acc); // all threads call sync
460 
461  // Recalculate position
462  if (tid < nSeeds) {
463  for (int r = 0; r < nRHNotSeed - 1; r++) {
464  int j = rechits[r];
465  float frac = getRhFrac(pfClusteringVars, topoSeedBegin, fracView, tid, r + 1);
466 
467  if (frac > -0.5) {
468  clusterEnergy[tid] += frac * pfRecHits[j].energy();
469 
470  if (nSeeds == 1 || j == seedNeighbors.x || j == seedNeighbors.y || j == seedNeighbors.z ||
471  j == seedNeighbors.w)
472  updateClusterPos(pfClusParams, clusterPos[tid], frac, j, pfRecHits, rhENormInv);
473  }
474  }
475  }
476  alpaka::syncBlockThreads(acc); // all threads call sync
477 
478  // Position normalization
479  if (tid < nSeeds) {
480  if (clusterPos[tid].w >= pfClusParams.minAllowedNormalization()) {
481  // Divide by position norm
482  clusterPos[tid].x /= clusterPos[tid].w;
483  clusterPos[tid].y /= clusterPos[tid].w;
484  clusterPos[tid].z /= clusterPos[tid].w;
485 
486  if constexpr (debug)
487  printf("\tCluster %d (seed %d) energy = %f\tposition = (%f, %f, %f)\n",
488  tid,
489  seedThreadIdx,
490  clusterEnergy[tid],
491  clusterPos[tid].x,
492  clusterPos[tid].y,
493  clusterPos[tid].z);
494  } else {
495  if constexpr (debug)
496  printf("\tCluster %d (seed %d) position norm (%f) less than minimum (%f)\n",
497  tid,
498  seedThreadIdx,
499  clusterPos[tid].w,
500  pfClusParams.minAllowedNormalization());
501  clusterPos[tid].x = 0.0;
502  clusterPos[tid].y = 0.0;
503  clusterPos[tid].z = 0.0;
504  }
505  }
506  alpaka::syncBlockThreads(acc); // all threads call sync
507 
508  if (tid < nSeeds) {
509  float delta2 = dR2(prevClusterPos[tid], clusterPos[tid]);
510  if constexpr (debug)
511  printf("\tCluster %d (seed %d) has delta2 = %f\n", tid, seeds[tid], delta2);
512  atomicMaxF(acc, &diff2, delta2);
513  prevClusterPos[tid] = clusterPos[tid]; // Save clusterPos
514  }
515  alpaka::syncBlockThreads(acc); // all threads call sync
516 
517  if (once_per_block(acc)) {
518  float tol2 = tol * tol;
519  iter++;
520  notDone = (diff2 > tol2) && ((unsigned int)iter < pfClusParams.maxIterations());
521  if constexpr (debug) {
522  if (diff2 > tol2)
523  printf("\tTopoId %d has diff2 = %f greater than squared tolerance %f (continuing)\n", topoId, diff2, tol2);
524  else if constexpr (debug)
525  printf("\tTopoId %d has diff2 = %f LESS than squared tolerance %f (terminating!)\n", topoId, diff2, tol2);
526  }
527  }
528  alpaka::syncBlockThreads(acc); // all threads call sync
529  } while (notDone); // shared variable condition ensures synchronization is well defined
530  if (once_per_block(acc))
531  // Fill PFCluster-level info
532  if (tid < nSeeds) {
533  int rhIdx = pfClusteringVars[tid + pfClusteringVars[topoId].topoSeedOffsets()].topoSeedList();
534  int seedIdx = pfClusteringVars[rhIdx].rhIdxToSeedIdx();
535  clusterView[seedIdx].energy() = clusterEnergy[tid];
536  clusterView[seedIdx].x() = clusterPos[tid].x;
537  clusterView[seedIdx].y() = clusterPos[tid].y;
538  clusterView[seedIdx].z() = clusterPos[tid].z;
539  }
540  }
541 
542  // Process very large exotic clusters, from nSeeds > 400 and non-seeds > 1500
543  // Uses global memory access
544  // Device function designed to be called by all threads of a given block
545  template <bool debug = false, typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
546  ALPAKA_FN_ACC static void hcalFastCluster_exotic(const TAcc& acc,
547  reco::PFClusterParamsDeviceCollection::ConstView pfClusParams,
548  const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
549  int topoId,
550  int nSeeds,
551  int nRHTopo,
552  reco::PFRecHitDeviceCollection::ConstView pfRecHits,
556  Position4* __restrict__ globalClusterPos,
557  Position4* __restrict__ globalPrevClusterPos,
558  float* __restrict__ globalClusterEnergy,
559  float* __restrict__ globalRhFracSum,
560  int* __restrict__ globalSeeds,
561  int* __restrict__ globalRechits) {
562  int& nRHNotSeed = alpaka::declareSharedVar<int, __COUNTER__>(acc);
563  int& blockIdx = alpaka::declareSharedVar<int, __COUNTER__>(acc);
564  int& topoSeedBegin = alpaka::declareSharedVar<int, __COUNTER__>(acc);
565  int& stride = alpaka::declareSharedVar<int, __COUNTER__>(acc);
566  int& iter = alpaka::declareSharedVar<int, __COUNTER__>(acc);
567  float& tol = alpaka::declareSharedVar<float, __COUNTER__>(acc);
568  float& diff2 = alpaka::declareSharedVar<float, __COUNTER__>(acc);
569  float& rhENormInv = alpaka::declareSharedVar<float, __COUNTER__>(acc);
570  bool& notDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
571 
572  blockIdx = maxTopoInput * alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
573  Position4* clusterPos = globalClusterPos + blockIdx;
574  Position4* prevClusterPos = globalPrevClusterPos + blockIdx;
575  float* clusterEnergy = globalClusterEnergy + blockIdx;
576  float* rhFracSum = globalRhFracSum + blockIdx;
577  int* seeds = globalSeeds + blockIdx;
578  int* rechits = globalRechits + blockIdx;
579 
580  if (once_per_block(acc)) {
581  nRHNotSeed = nRHTopo - nSeeds + 1; // 1 + (# rechits per topoId that are NOT seeds)
582  topoSeedBegin = pfClusteringVars[topoId].topoSeedOffsets();
583  tol = pfClusParams.stoppingTolerance() *
584  powf(fmaxf(1.0, nSeeds - 1), 2.0); // stopping tolerance * tolerance scaling
585  stride = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
586  iter = 0;
587  notDone = true;
588 
589  int i = pfClusteringVars[topoSeedBegin].topoSeedList();
590 
591  if (topology.cutsFromDB()) {
592  rhENormInv = (1.f / topology[pfRecHits[i].denseId()].noiseThreshold());
593  } else {
594  if (pfRecHits[i].layer() == PFLayer::HCAL_BARREL1)
595  rhENormInv = pfClusParams.recHitEnergyNormInvHB_vec()[pfRecHits[i].depth() - 1];
596  else if (pfRecHits[i].layer() == PFLayer::HCAL_ENDCAP)
597  rhENormInv = pfClusParams.recHitEnergyNormInvHE_vec()[pfRecHits[i].depth() - 1];
598  else {
599  rhENormInv = 0.;
600  printf("Rechit %d has invalid layer %d!\n", i, pfRecHits[i].layer());
601  }
602  }
603  }
604  alpaka::syncBlockThreads(acc); // all threads call sync
605 
606  for (int n = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; n < nRHTopo; n += stride) {
607  if (n < nSeeds)
608  seeds[n] = pfClusteringVars[topoSeedBegin + n].topoSeedList();
609  if (n < nRHNotSeed - 1)
610  rechits[n] =
611  fracView[pfClusteringVars[pfClusteringVars[topoSeedBegin].topoSeedList()].seedFracOffsets() + n + 1]
612  .pfrhIdx();
613  }
614  alpaka::syncBlockThreads(acc); // all threads call sync
615 
616  if constexpr (debug) {
617  if (once_per_block(acc)) {
618  printf("\n===========================================================================================\n");
619  printf("Processing topo cluster %d with nSeeds = %d nRHTopo = %d and seeds (", topoId, nSeeds, nRHTopo);
620  for (int s = 0; s < nSeeds; s++) {
621  if (s != 0)
622  printf(", ");
623  printf("%d", getSeedRhIdx(seeds, s));
624  }
625  if (nRHTopo == nSeeds) {
626  printf(")\n\n");
627  } else {
628  printf(") and other rechits (");
629  for (int r = 1; r < nRHNotSeed; r++) {
630  if (r != 1)
631  printf(", ");
632  if (r <= 0) {
633  printf("Invalid rhNum (%d) for get RhFracIdx!\n", r);
634  }
635  printf("%d", rechits[r - 1]);
636  }
637  printf(")\n\n");
638  }
639  }
640  alpaka::syncBlockThreads(acc); // all (or none) threads call sync
641  }
642 
643  // Set initial cluster position (energy) to seed rechit position (energy)
644  for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
645  int i = seeds[s];
646  clusterPos[s] = Position4{pfRecHits[i].x(), pfRecHits[i].y(), pfRecHits[i].z(), 1.0};
647  prevClusterPos[s] = clusterPos[s];
648  clusterEnergy[s] = pfRecHits[i].energy();
649  for (int r = 0; r < (nRHNotSeed - 1); r++) {
650  fracView[pfClusteringVars[i].seedFracOffsets() + r + 1].pfrhIdx() = rechits[r];
651  fracView[pfClusteringVars[i].seedFracOffsets() + r + 1].frac() = -1.;
652  }
653  }
654  alpaka::syncBlockThreads(acc); // all threads call sync
655 
656  do {
657  if constexpr (debug) {
658  if (once_per_block(acc))
659  printf("\n--- Now on iter %d for topoId %d ---\n", iter, topoId);
660  }
661 
662  if (once_per_block(acc))
663  diff2 = -1;
664  // Reset rhFracSum
665  for (int tid = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; tid < nRHNotSeed - 1; tid += stride) {
666  rhFracSum[tid] = 0.;
667  int rhThreadIdx = rechits[tid];
668  Position4 rhThreadPos =
669  Position4{pfRecHits[rhThreadIdx].x(), pfRecHits[rhThreadIdx].y(), pfRecHits[rhThreadIdx].z(), 1.};
670  for (int s = 0; s < nSeeds; s++) {
671  float dist2 = (clusterPos[s].x - rhThreadPos.x) * (clusterPos[s].x - rhThreadPos.x) +
672  (clusterPos[s].y - rhThreadPos.y) * (clusterPos[s].y - rhThreadPos.y) +
673  (clusterPos[s].z - rhThreadPos.z) * (clusterPos[s].z - rhThreadPos.z);
674 
675  float d2 = dist2 / pfClusParams.showerSigma2();
676  float fraction = clusterEnergy[s] * rhENormInv * expf(-0.5f * d2);
677 
678  rhFracSum[tid] += fraction;
679  }
680  }
681  alpaka::syncBlockThreads(acc); // all threads call sync
682 
683  for (int tid = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; tid < nRHNotSeed - 1; tid += stride) {
684  int rhThreadIdx = rechits[tid];
685  Position4 rhThreadPos =
686  Position4{pfRecHits[rhThreadIdx].x(), pfRecHits[rhThreadIdx].y(), pfRecHits[rhThreadIdx].z(), 1.};
687  for (int s = 0; s < nSeeds; s++) {
688  int i = seeds[s];
689  float dist2 = (clusterPos[s].x - rhThreadPos.x) * (clusterPos[s].x - rhThreadPos.x) +
690  (clusterPos[s].y - rhThreadPos.y) * (clusterPos[s].y - rhThreadPos.y) +
691  (clusterPos[s].z - rhThreadPos.z) * (clusterPos[s].z - rhThreadPos.z);
692 
693  float d2 = dist2 / pfClusParams.showerSigma2();
694  float fraction = clusterEnergy[s] * rhENormInv * expf(-0.5f * d2);
695 
696  if (rhFracSum[tid] > pfClusParams.minFracTot()) {
697  float fracpct = fraction / rhFracSum[tid];
698  if (fracpct > cutoffFraction || (d2 < cutoffDistance && fracpct > pfClusParams.minFracToKeep())) {
699  fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = fracpct;
700  } else {
701  fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = -1;
702  }
703  } else {
704  fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = -1;
705  }
706  }
707  }
708  alpaka::syncBlockThreads(acc); // all threads call sync
709 
710  if constexpr (debug) {
711  if (once_per_block(acc))
712  printf("Computing cluster position for topoId %d\n", topoId);
713  }
714 
715  // Reset cluster position and energy
716  for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
717  int seedRhIdx = getSeedRhIdx(seeds, s);
718  float norm = logf(pfRecHits[seedRhIdx].energy() * rhENormInv);
719  clusterPos[s] = Position4{
720  pfRecHits[seedRhIdx].x() * norm, pfRecHits[seedRhIdx].y() * norm, pfRecHits[seedRhIdx].z() * norm, norm};
721  clusterEnergy[s] = pfRecHits[seedRhIdx].energy();
722  if constexpr (debug) {
723  printf("Cluster %d (seed %d) has energy %f\tpos = (%f, %f, %f, %f)\n",
724  s,
725  seeds[s],
726  clusterEnergy[s],
727  clusterPos[s].x,
728  clusterPos[s].y,
729  clusterPos[s].z,
730  clusterPos[s].w);
731  }
732  }
733  alpaka::syncBlockThreads(acc); // all threads call sync
734 
735  // Recalculate position
736  for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
737  int seedRhIdx = getSeedRhIdx(seeds, s);
738  for (int r = 0; r < nRHNotSeed - 1; r++) {
739  int j = rechits[r];
740  float frac = getRhFrac(pfClusteringVars, topoSeedBegin, fracView, s, r + 1);
741 
742  if (frac > -0.5) {
743  clusterEnergy[s] += frac * pfRecHits[j].energy();
744 
745  if (nSeeds == 1 || j == pfRecHits[seedRhIdx].neighbours()(0) || j == pfRecHits[seedRhIdx].neighbours()(1) ||
746  j == pfRecHits[seedRhIdx].neighbours()(2) || j == pfRecHits[seedRhIdx].neighbours()(3))
747  updateClusterPos(pfClusParams, clusterPos[s], frac, j, pfRecHits, rhENormInv);
748  }
749  }
750  }
751  alpaka::syncBlockThreads(acc); // all threads call sync
752 
753  // Position normalization
754  for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
755  if (clusterPos[s].w >= pfClusParams.minAllowedNormalization()) {
756  // Divide by position norm
757  clusterPos[s].x /= clusterPos[s].w;
758  clusterPos[s].y /= clusterPos[s].w;
759  clusterPos[s].z /= clusterPos[s].w;
760 
761  if constexpr (debug)
762  printf("\tCluster %d (seed %d) energy = %f\tposition = (%f, %f, %f)\n",
763  s,
764  seeds[s],
765  clusterEnergy[s],
766  clusterPos[s].x,
767  clusterPos[s].y,
768  clusterPos[s].z);
769  } else {
770  if constexpr (debug)
771  printf("\tCluster %d (seed %d) position norm (%f) less than minimum (%f)\n",
772  s,
773  seeds[s],
774  clusterPos[s].w,
775  pfClusParams.minAllowedNormalization());
776  clusterPos[s].x = 0.0;
777  clusterPos[s].y = 0.0;
778  clusterPos[s].z = 0.0;
779  }
780  }
781  alpaka::syncBlockThreads(acc); // all threads call sync
782 
783  for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
784  float delta2 = dR2(prevClusterPos[s], clusterPos[s]);
785  if constexpr (debug)
786  printf("\tCluster %d (seed %d) has delta2 = %f\n", s, seeds[s], delta2);
787  atomicMaxF(acc, &diff2, delta2);
788  prevClusterPos[s] = clusterPos[s]; // Save clusterPos
789  }
790  alpaka::syncBlockThreads(acc); // all threads call sync
791 
792  if (once_per_block(acc)) {
793  float tol2 = tol * tol;
794  iter++;
795  notDone = (diff2 > tol2) && ((unsigned int)iter < pfClusParams.maxIterations());
796  if constexpr (debug) {
797  if (diff2 > tol2)
798  printf("\tTopoId %d has diff2 = %f greater than squared tolerance %f (continuing)\n", topoId, diff2, tol2);
799  else if constexpr (debug)
800  printf("\tTopoId %d has diff2 = %f LESS than squared tolerance %f (terminating!)\n", topoId, diff2, tol2);
801  }
802  }
803  alpaka::syncBlockThreads(acc); // all threads call sync
804  } while (notDone); // shared variable ensures synchronization is well defined
805  if (once_per_block(acc))
806  for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
807  int rhIdx = pfClusteringVars[s + pfClusteringVars[topoId].topoSeedOffsets()].topoSeedList();
808  int seedIdx = pfClusteringVars[rhIdx].rhIdxToSeedIdx();
809  clusterView[seedIdx].energy() = pfRecHits[s].energy();
810  clusterView[seedIdx].x() = pfRecHits[s].x();
811  clusterView[seedIdx].y() = pfRecHits[s].y();
812  clusterView[seedIdx].z() = pfRecHits[s].z();
813  }
814  alpaka::syncBlockThreads(acc); // all threads call sync
815  }
816 
817  // Process clusters with up to 400 seeds and 1500 non seeds using shared memory
818  // Device function designed to be called by all threads of a given block
819  template <bool debug = false, typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
820  ALPAKA_FN_ACC static void hcalFastCluster_multiSeedIterative(
821  const TAcc& acc,
822  reco::PFClusterParamsDeviceCollection::ConstView pfClusParams,
823  const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
824  int topoId,
825  int nSeeds,
826  int nRHTopo,
827  reco::PFRecHitDeviceCollection::ConstView pfRecHits,
831  int& nRHNotSeed = alpaka::declareSharedVar<int, __COUNTER__>(acc);
832  int& topoSeedBegin = alpaka::declareSharedVar<int, __COUNTER__>(acc);
833  int& stride = alpaka::declareSharedVar<int, __COUNTER__>(acc);
834  int& iter = alpaka::declareSharedVar<int, __COUNTER__>(acc);
835  float& tol = alpaka::declareSharedVar<float, __COUNTER__>(acc);
836  float& diff2 = alpaka::declareSharedVar<float, __COUNTER__>(acc);
837  float& rhENormInv = alpaka::declareSharedVar<float, __COUNTER__>(acc);
838  bool& notDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
839 
840  auto& clusterPos = alpaka::declareSharedVar<Position4[400], __COUNTER__>(acc);
841  auto& prevClusterPos = alpaka::declareSharedVar<Position4[400], __COUNTER__>(acc);
842  auto& clusterEnergy = alpaka::declareSharedVar<float[400], __COUNTER__>(acc);
843  auto& rhFracSum = alpaka::declareSharedVar<float[1500], __COUNTER__>(acc);
844  auto& seeds = alpaka::declareSharedVar<int[400], __COUNTER__>(acc);
845  auto& rechits = alpaka::declareSharedVar<int[1500], __COUNTER__>(acc);
846 
847  if (once_per_block(acc)) {
848  nRHNotSeed = nRHTopo - nSeeds + 1; // 1 + (# rechits per topoId that are NOT seeds)
849  topoSeedBegin = pfClusteringVars[topoId].topoSeedOffsets();
850  tol = pfClusParams.stoppingTolerance() * // stopping tolerance * tolerance scaling
851  powf(fmaxf(1.0, nSeeds - 1), 2.0);
852  stride = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
853  iter = 0;
854  notDone = true;
855 
856  int i = pfClusteringVars[topoSeedBegin].topoSeedList();
857 
858  if (topology.cutsFromDB()) {
859  rhENormInv = (1.f / topology[pfRecHits[i].denseId()].noiseThreshold());
860  } else {
861  if (pfRecHits[i].layer() == PFLayer::HCAL_BARREL1)
862  rhENormInv = pfClusParams.recHitEnergyNormInvHB_vec()[pfRecHits[i].depth() - 1];
863  else if (pfRecHits[i].layer() == PFLayer::HCAL_ENDCAP)
864  rhENormInv = pfClusParams.recHitEnergyNormInvHE_vec()[pfRecHits[i].depth() - 1];
865  else {
866  rhENormInv = 0.;
867  printf("Rechit %d has invalid layer %d!\n", i, pfRecHits[i].layer());
868  }
869  }
870  }
871  alpaka::syncBlockThreads(acc); // all threads call sync
872 
873  for (int n = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; n < nRHTopo; n += stride) {
874  if (n < nSeeds)
875  seeds[n] = pfClusteringVars[topoSeedBegin + n].topoSeedList();
876  if (n < nRHNotSeed - 1)
877  rechits[n] =
878  fracView[pfClusteringVars[pfClusteringVars[topoSeedBegin].topoSeedList()].seedFracOffsets() + n + 1]
879  .pfrhIdx();
880  }
881  alpaka::syncBlockThreads(acc); // all threads call sync
882 
883  if constexpr (debug) {
884  if (once_per_block(acc)) {
885  printf("\n===========================================================================================\n");
886  printf("Processing topo cluster %d with nSeeds = %d nRHTopo = %d and seeds (", topoId, nSeeds, nRHTopo);
887  for (int s = 0; s < nSeeds; s++) {
888  if (s != 0)
889  printf(", ");
890  printf("%d", getSeedRhIdx(seeds, s));
891  }
892  if (nRHTopo == nSeeds) {
893  printf(")\n\n");
894  } else {
895  printf(") and other rechits (");
896  for (int r = 1; r < nRHNotSeed; r++) {
897  if (r != 1)
898  printf(", ");
899  if (r <= 0) {
900  printf("Invalid rhNum (%d) for get RhFracIdx!\n", r);
901  }
902  printf("%d", rechits[r - 1]);
903  }
904  printf(")\n\n");
905  }
906  }
907  alpaka::syncBlockThreads(acc); // all (or none) threads call sync
908  }
909 
910  // Set initial cluster position (energy) to seed rechit position (energy)
911  for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
912  int i = seeds[s];
913  clusterPos[s] = Position4{pfRecHits[i].x(), pfRecHits[i].y(), pfRecHits[i].z(), 1.0};
914  prevClusterPos[s] = clusterPos[s];
915  clusterEnergy[s] = pfRecHits[i].energy();
916  for (int r = 0; r < (nRHNotSeed - 1); r++) {
917  fracView[pfClusteringVars[i].seedFracOffsets() + r + 1].pfrhIdx() = rechits[r];
918  fracView[pfClusteringVars[i].seedFracOffsets() + r + 1].frac() = -1.;
919  }
920  }
921  alpaka::syncBlockThreads(acc); // all threads call sync
922 
923  do {
924  if constexpr (debug) {
925  if (once_per_block(acc))
926  printf("\n--- Now on iter %d for topoId %d ---\n", iter, topoId);
927  }
928 
929  if (once_per_block(acc))
930  diff2 = -1;
931  // Reset rhFracSum
932  for (int tid = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; tid < nRHNotSeed - 1; tid += stride) {
933  rhFracSum[tid] = 0.;
934  int rhThreadIdx = rechits[tid];
935  Position4 rhThreadPos =
936  Position4{pfRecHits[rhThreadIdx].x(), pfRecHits[rhThreadIdx].y(), pfRecHits[rhThreadIdx].z(), 1.};
937  for (int s = 0; s < nSeeds; s++) {
938  float dist2 = (clusterPos[s].x - rhThreadPos.x) * (clusterPos[s].x - rhThreadPos.x) +
939  (clusterPos[s].y - rhThreadPos.y) * (clusterPos[s].y - rhThreadPos.y) +
940  (clusterPos[s].z - rhThreadPos.z) * (clusterPos[s].z - rhThreadPos.z);
941 
942  float d2 = dist2 / pfClusParams.showerSigma2();
943  float fraction = clusterEnergy[s] * rhENormInv * expf(-0.5f * d2);
944 
945  rhFracSum[tid] += fraction;
946  }
947  }
948  alpaka::syncBlockThreads(acc); // all threads call sync
949 
950  for (int tid = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; tid < nRHNotSeed - 1; tid += stride) {
951  int rhThreadIdx = rechits[tid];
952  Position4 rhThreadPos =
953  Position4{pfRecHits[rhThreadIdx].x(), pfRecHits[rhThreadIdx].y(), pfRecHits[rhThreadIdx].z(), 1.};
954  for (int s = 0; s < nSeeds; s++) {
955  int i = seeds[s];
956  float dist2 = (clusterPos[s].x - rhThreadPos.x) * (clusterPos[s].x - rhThreadPos.x) +
957  (clusterPos[s].y - rhThreadPos.y) * (clusterPos[s].y - rhThreadPos.y) +
958  (clusterPos[s].z - rhThreadPos.z) * (clusterPos[s].z - rhThreadPos.z);
959 
960  float d2 = dist2 / pfClusParams.showerSigma2();
961  float fraction = clusterEnergy[s] * rhENormInv * expf(-0.5f * d2);
962 
963  if (rhFracSum[tid] > pfClusParams.minFracTot()) {
964  float fracpct = fraction / rhFracSum[tid];
965  if (fracpct > cutoffFraction || (d2 < cutoffDistance && fracpct > pfClusParams.minFracToKeep())) {
966  fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = fracpct;
967  } else {
968  fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = -1;
969  }
970  } else {
971  fracView[pfClusteringVars[i].seedFracOffsets() + tid + 1].frac() = -1;
972  }
973  }
974  }
975  alpaka::syncBlockThreads(acc); // all threads call sync
976 
977  if constexpr (debug) {
978  if (once_per_block(acc))
979  printf("Computing cluster position for topoId %d\n", topoId);
980  }
981 
982  // Reset cluster position and energy
983  for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
984  int seedRhIdx = getSeedRhIdx(seeds, s);
985  float norm = logf(pfRecHits[seedRhIdx].energy() * rhENormInv);
986  clusterPos[s] = Position4{
987  pfRecHits[seedRhIdx].x() * norm, pfRecHits[seedRhIdx].y() * norm, pfRecHits[seedRhIdx].z() * norm, norm};
988  clusterEnergy[s] = pfRecHits[seedRhIdx].energy();
989  if constexpr (debug) {
990  printf("Cluster %d (seed %d) has energy %f\tpos = (%f, %f, %f, %f)\n",
991  s,
992  seeds[s],
993  clusterEnergy[s],
994  clusterPos[s].x,
995  clusterPos[s].y,
996  clusterPos[s].z,
997  clusterPos[s].w);
998  }
999  }
1000  alpaka::syncBlockThreads(acc); // all threads call sync
1001 
1002  // Recalculate position
1003  for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
1004  int seedRhIdx = getSeedRhIdx(seeds, s);
1005  for (int r = 0; r < nRHNotSeed - 1; r++) {
1006  int j = rechits[r];
1007  float frac = getRhFrac(pfClusteringVars, topoSeedBegin, fracView, s, r + 1);
1008 
1009  if (frac > -0.5) {
1010  clusterEnergy[s] += frac * pfRecHits[j].energy();
1011 
1012  if (nSeeds == 1 || j == pfRecHits[seedRhIdx].neighbours()(0) || j == pfRecHits[seedRhIdx].neighbours()(1) ||
1013  j == pfRecHits[seedRhIdx].neighbours()(2) || j == pfRecHits[seedRhIdx].neighbours()(3))
1014  updateClusterPos(pfClusParams, clusterPos[s], frac, j, pfRecHits, rhENormInv);
1015  }
1016  }
1017  }
1018  alpaka::syncBlockThreads(acc); // all threads call sync
1019 
1020  // Position normalization
1021  for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
1022  if (clusterPos[s].w >= pfClusParams.minAllowedNormalization()) {
1023  // Divide by position norm
1024  clusterPos[s].x /= clusterPos[s].w;
1025  clusterPos[s].y /= clusterPos[s].w;
1026  clusterPos[s].z /= clusterPos[s].w;
1027 
1028  if constexpr (debug)
1029  printf("\tCluster %d (seed %d) energy = %f\tposition = (%f, %f, %f)\n",
1030  s,
1031  seeds[s],
1032  clusterEnergy[s],
1033  clusterPos[s].x,
1034  clusterPos[s].y,
1035  clusterPos[s].z);
1036  } else {
1037  if constexpr (debug)
1038  printf("\tCluster %d (seed %d) position norm (%f) less than minimum (%f)\n",
1039  s,
1040  seeds[s],
1041  clusterPos[s].w,
1042  pfClusParams.minAllowedNormalization());
1043  clusterPos[s].x = 0.0;
1044  clusterPos[s].y = 0.0;
1045  clusterPos[s].z = 0.0;
1046  }
1047  }
1048  alpaka::syncBlockThreads(acc); // all threads call sync
1049 
1050  for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
1051  float delta2 = dR2(prevClusterPos[s], clusterPos[s]);
1052  if constexpr (debug)
1053  printf("\tCluster %d (seed %d) has delta2 = %f\n", s, seeds[s], delta2);
1054  atomicMaxF(acc, &diff2, delta2);
1055  prevClusterPos[s] = clusterPos[s]; // Save clusterPos
1056  }
1057  alpaka::syncBlockThreads(acc); // all threads call sync
1058 
1059  if (once_per_block(acc)) {
1060  float tol2 = tol * tol;
1061  iter++;
1062  notDone = (diff2 > tol2) && ((unsigned int)iter < pfClusParams.maxIterations());
1063  if constexpr (debug) {
1064  if (diff2 > tol2)
1065  printf("\tTopoId %d has diff2 = %f greater than tolerance %f (continuing)\n", topoId, diff2, tol2);
1066  else if constexpr (debug)
1067  printf("\tTopoId %d has diff2 = %f LESS than tolerance %f (terminating!)\n", topoId, diff2, tol2);
1068  }
1069  }
1070  alpaka::syncBlockThreads(acc); // all threads call sync
1071  } while (notDone); // shared variable ensures synchronization is well defined
1072  if (once_per_block(acc))
1073  for (int s = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; s < nSeeds; s += stride) {
1074  int rhIdx = pfClusteringVars[s + pfClusteringVars[topoId].topoSeedOffsets()].topoSeedList();
1075  int seedIdx = pfClusteringVars[rhIdx].rhIdxToSeedIdx();
1076  clusterView[seedIdx].energy() = pfRecHits[s].energy();
1077  clusterView[seedIdx].x() = pfRecHits[s].x();
1078  clusterView[seedIdx].y() = pfRecHits[s].y();
1079  clusterView[seedIdx].z() = pfRecHits[s].z();
1080  }
1081  }
1082 
1083  // Seeding using local energy maxima
1085  public:
1086  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
1087  ALPAKA_FN_ACC void operator()(const TAcc& acc,
1089  const reco::PFClusterParamsDeviceCollection::ConstView pfClusParams,
1090  const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
1094  uint32_t* __restrict__ nSeeds) const {
1095  const int nRH = pfRecHits.size();
1096 
1097  if (once_per_grid(acc)) {
1098  clusterView.size() = nRH;
1099  }
1100 
1101  for (auto i : uniform_elements(acc, nRH)) {
1102  // Initialize arrays
1103  pfClusteringVars[i].pfrh_isSeed() = 0;
1104  pfClusteringVars[i].rhCount() = 0;
1105  pfClusteringVars[i].topoSeedCount() = 0;
1106  pfClusteringVars[i].topoRHCount() = 0;
1107  pfClusteringVars[i].seedFracOffsets() = -1;
1108  pfClusteringVars[i].topoSeedOffsets() = -1;
1109  pfClusteringVars[i].topoSeedList() = -1;
1110  clusterView[i].seedRHIdx() = -1;
1111 
1112  int layer = pfRecHits[i].layer();
1113  int depthOffset = pfRecHits[i].depth() - 1;
1114  float energy = pfRecHits[i].energy();
1115  Position3 pos = Position3{pfRecHits[i].x(), pfRecHits[i].y(), pfRecHits[i].z()};
1116  float seedThreshold = 9999.;
1117  float topoThreshold = 9999.;
1118 
1119  if (topology.cutsFromDB()) {
1120  seedThreshold = topology[pfRecHits[i].denseId()].seedThreshold();
1121  topoThreshold = topology[pfRecHits[i].denseId()].noiseThreshold();
1122  } else {
1123  if (layer == PFLayer::HCAL_BARREL1) {
1124  seedThreshold = pfClusParams.seedEThresholdHB_vec()[depthOffset];
1125  topoThreshold = pfClusParams.topoEThresholdHB_vec()[depthOffset];
1126  } else if (layer == PFLayer::HCAL_ENDCAP) {
1127  seedThreshold = pfClusParams.seedEThresholdHE_vec()[depthOffset];
1128  topoThreshold = pfClusParams.topoEThresholdHE_vec()[depthOffset];
1129  }
1130  }
1131 
1132  // cmssdt.cern.ch/lxr/source/DataFormats/ParticleFlowReco/interface/PFRecHit.h#0108
1133  float pt2 = energy * energy * (pos.x * pos.x + pos.y * pos.y) / (pos.x * pos.x + pos.y * pos.y + pos.z * pos.z);
1134 
1135  // Seeding threshold test
1136  if ((layer == PFLayer::HCAL_BARREL1 && energy > seedThreshold && pt2 > pfClusParams.seedPt2ThresholdHB()) ||
1137  (layer == PFLayer::HCAL_ENDCAP && energy > seedThreshold && pt2 > pfClusParams.seedPt2ThresholdHE())) {
1138  pfClusteringVars[i].pfrh_isSeed() = 1;
1139  for (int k = 0; k < 4; k++) { // Does this seed candidate have a higher energy than four neighbours
1140  if (pfRecHits[i].neighbours()(k) < 0)
1141  continue;
1142  if (energy < pfRecHits[pfRecHits[i].neighbours()(k)].energy()) {
1143  pfClusteringVars[i].pfrh_isSeed() = 0;
1144  break;
1145  }
1146  }
1147  if (pfClusteringVars[i].pfrh_isSeed())
1148  alpaka::atomicAdd(acc, nSeeds, 1u);
1149  }
1150  // Topo clustering threshold test
1151 
1152  if ((layer == PFLayer::HCAL_ENDCAP && energy > topoThreshold) ||
1153  (layer == PFLayer::HCAL_BARREL1 && energy > topoThreshold)) {
1154  pfClusteringVars[i].pfrh_passTopoThresh() = true;
1155  pfClusteringVars[i].pfrh_topoId() = i;
1156  } else {
1157  pfClusteringVars[i].pfrh_passTopoThresh() = false;
1158  pfClusteringVars[i].pfrh_topoId() = -1;
1159  }
1160  }
1161  }
1162  };
1163 
1164  // Preparation of topo inputs. Initializing topoId, egdeIdx, nEdges, edgeList
1166  public:
1167  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
1168  ALPAKA_FN_ACC void operator()(const TAcc& acc,
1172  uint32_t* __restrict__ nSeeds) const {
1173  const int nRH = pfRecHits.size();
1174 
1175  if (once_per_grid(acc)) {
1176  pfClusteringVars.nEdges() = nRH * 8;
1177  pfClusteringEdgeVars[nRH].pfrh_edgeIdx() = nRH * 8;
1178  }
1179  for (uint32_t i : cms::alpakatools::uniform_elements(acc, nRH)) {
1180  pfClusteringEdgeVars[i].pfrh_edgeIdx() = i * 8;
1181  pfClusteringVars[i].pfrh_topoId() = 0;
1182  for (int j = 0; j < 8; j++) { // checking if neighbours exist and assigning neighbours as edges
1183  if (pfRecHits[i].neighbours()(j) == -1)
1184  pfClusteringEdgeVars[i * 8 + j].pfrh_edgeList() = i;
1185  else
1186  pfClusteringEdgeVars[i * 8 + j].pfrh_edgeList() = pfRecHits[i].neighbours()(j);
1187  }
1188  }
1189 
1190  return;
1191  }
1192  };
1193 
1194  // Contraction in a single block
1196  public:
1197  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
1198  ALPAKA_FN_ACC void operator()(const TAcc& acc,
1202  uint32_t* __restrict__ nSeeds) const {
1203  const int nRH = pfRecHits.size();
1204  int& totalSeedOffset = alpaka::declareSharedVar<int, __COUNTER__>(acc);
1205  int& totalSeedFracOffset = alpaka::declareSharedVar<int, __COUNTER__>(acc);
1206 
1207  // rhCount, topoRHCount, topoSeedCount initialized earlier
1208  if (once_per_block(acc)) {
1209  pfClusteringVars.nTopos() = 0;
1210  pfClusteringVars.nRHFracs() = 0;
1211  totalSeedOffset = 0;
1212  totalSeedFracOffset = 0;
1213  pfClusteringVars.pcrhFracSize() = 0;
1214  }
1215 
1216  alpaka::syncBlockThreads(acc); // all threads call sync
1217 
1218  // Now determine the number of seeds and rechits in each topo cluster [topoRHCount, topoSeedCount]
1219  // Also get the list of topoIds (smallest rhIdx of each topo cluser)
1220  for (int rhIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; rhIdx < nRH;
1221  rhIdx += alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]) {
1222  pfClusteringVars[rhIdx].rhIdxToSeedIdx() = -1;
1223  int topoId = pfClusteringVars[rhIdx].pfrh_topoId();
1224  if (topoId > -1) {
1225  // Valid topo cluster
1226  alpaka::atomicAdd(acc, &pfClusteringVars[topoId].topoRHCount(), 1);
1227  // Valid topoId not counted yet
1228  if (topoId == rhIdx) { // For every topo cluster, there is one rechit that meets this condition.
1229  int topoIdx = alpaka::atomicAdd(acc, &pfClusteringVars.nTopos(), 1);
1230  pfClusteringVars[topoIdx].topoIds() =
1231  topoId; // topoId: the smallest index of rechits that belong to a topo cluster.
1232  }
1233  // This is a cluster seed
1234  if (pfClusteringVars[rhIdx].pfrh_isSeed()) { // # of seeds in this topo cluster
1235  alpaka::atomicAdd(acc, &pfClusteringVars[topoId].topoSeedCount(), 1);
1236  }
1237  }
1238  }
1239 
1240  alpaka::syncBlockThreads(acc); // all threads call sync
1241 
1242  // Determine offsets for topo ID seed array [topoSeedOffsets]
1243  for (int topoId = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; topoId < nRH;
1244  topoId += alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]) {
1245  if (pfClusteringVars[topoId].topoSeedCount() > 0) {
1246  // This is a valid topo ID
1247  int offset = alpaka::atomicAdd(acc, &totalSeedOffset, pfClusteringVars[topoId].topoSeedCount());
1248  pfClusteringVars[topoId].topoSeedOffsets() = offset;
1249  }
1250  }
1251  alpaka::syncBlockThreads(acc); // all threads call sync
1252 
1253  // Fill arrays of rechit indicies for each seed [topoSeedList] and rhIdx->seedIdx conversion for each seed [rhIdxToSeedIdx]
1254  // Also fill seedRHIdx, topoId, depth
1255  for (int rhIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; rhIdx < nRH;
1256  rhIdx += alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]) {
1257  int topoId = pfClusteringVars[rhIdx].pfrh_topoId();
1258  if (pfClusteringVars[rhIdx].pfrh_isSeed()) {
1259  // Valid topo cluster and this rhIdx corresponds to a seed
1260  int k = alpaka::atomicAdd(acc, &pfClusteringVars[topoId].rhCount(), 1);
1261  int seedIdx = pfClusteringVars[topoId].topoSeedOffsets() + k;
1262  if ((unsigned int)seedIdx >= *nSeeds)
1263  printf("Warning(contraction) %8d > %8d should not happen, check topoId: %d has %d rh\n",
1264  seedIdx,
1265  *nSeeds,
1266  topoId,
1267  k);
1268  pfClusteringVars[seedIdx].topoSeedList() = rhIdx;
1269  pfClusteringVars[rhIdx].rhIdxToSeedIdx() = seedIdx;
1270  clusterView[seedIdx].topoId() = topoId;
1271  clusterView[seedIdx].seedRHIdx() = rhIdx;
1272  clusterView[seedIdx].depth() = pfRecHits[rhIdx].depth();
1273  }
1274  }
1275 
1276  alpaka::syncBlockThreads(acc); // all threads call sync
1277 
1278  // Determine seed offsets for rechit fraction array
1279  for (int rhIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]; rhIdx < nRH;
1280  rhIdx += alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]) {
1281  pfClusteringVars[rhIdx].rhCount() = 1; // Reset this counter array
1282 
1283  int topoId = pfClusteringVars[rhIdx].pfrh_topoId();
1284  if (pfClusteringVars[rhIdx].pfrh_isSeed() && topoId > -1) {
1285  // Allot the total number of rechits for this topo cluster for rh fractions
1286  int offset = alpaka::atomicAdd(acc, &totalSeedFracOffset, pfClusteringVars[topoId].topoRHCount());
1287 
1288  // Add offset for this PF cluster seed
1289  pfClusteringVars[rhIdx].seedFracOffsets() = offset;
1290 
1291  // Store recHitFraction offset & size information for each seed
1292  clusterView[pfClusteringVars[rhIdx].rhIdxToSeedIdx()].rhfracOffset() =
1293  pfClusteringVars[rhIdx].seedFracOffsets();
1294  clusterView[pfClusteringVars[rhIdx].rhIdxToSeedIdx()].rhfracSize() =
1295  pfClusteringVars[topoId].topoRHCount() - pfClusteringVars[topoId].topoSeedCount() + 1;
1296  }
1297  }
1298 
1299  alpaka::syncBlockThreads(acc); // all threads call sync
1300 
1301  if (once_per_block(acc)) {
1302  pfClusteringVars.pcrhFracSize() = totalSeedFracOffset;
1303  pfClusteringVars.nRHFracs() = totalSeedFracOffset;
1304  clusterView.nRHFracs() = totalSeedFracOffset;
1305  clusterView.nSeeds() = *nSeeds;
1306  clusterView.nTopos() = pfClusteringVars.nTopos();
1307 
1308  if (pfClusteringVars.pcrhFracSize() > 200000) // Warning in case the fraction is too large
1309  printf("At the end of topoClusterContraction, found large *pcrhFracSize = %d\n",
1310  pfClusteringVars.pcrhFracSize());
1311  }
1312  }
1313  };
1314 
1315  // Prefill the rechit index for all PFCluster fractions
1316  // Optimized for GPU parallel, but works on any backend
1318  public:
1319  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
1320  ALPAKA_FN_ACC void operator()(const TAcc& acc,
1324  const int nRH = pfRecHits.size();
1325 
1326  for (auto index : uniform_elements_nd(acc, {nRH, nRH})) {
1327  const int i = index[0u]; // i is a seed index
1328  const int j = index[1u]; // j is NOT a seed
1329  int topoId = pfClusteringVars[i].pfrh_topoId();
1330  if (topoId > -1 && pfClusteringVars[i].pfrh_isSeed() && topoId == pfClusteringVars[j].pfrh_topoId()) {
1331  if (!pfClusteringVars[j].pfrh_isSeed()) { // NOT a seed
1332  int k = alpaka::atomicAdd(
1333  acc, &pfClusteringVars[i].rhCount(), 1); // Increment the number of rechit fractions for this seed
1334  auto fraction = fracView[pfClusteringVars[i].seedFracOffsets() + k];
1335  fraction.pfrhIdx() = j;
1336  fraction.pfcIdx() = pfClusteringVars[i].rhIdxToSeedIdx();
1337  } else if (i == j) { // i==j is a seed rechit index
1338  auto seed = fracView[pfClusteringVars[i].seedFracOffsets()];
1339  seed.pfrhIdx() = j;
1340  seed.frac() = 1;
1341  seed.pfcIdx() = pfClusteringVars[i].rhIdxToSeedIdx();
1342  }
1343  }
1344  }
1345  }
1346  };
1347 
1348  class FastCluster {
1349  public:
1350  template <bool debug = false, typename TAcc, typename = std::enable_if<!std::is_same_v<Device, alpaka::DevCpu>>>
1351  ALPAKA_FN_ACC void operator()(const TAcc& acc,
1353  const reco::PFClusterParamsDeviceCollection::ConstView pfClusParams,
1354  const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
1358  const int nRH = pfRecHits.size();
1359  int& topoId = alpaka::declareSharedVar<int, __COUNTER__>(acc);
1360  int& nRHTopo = alpaka::declareSharedVar<int, __COUNTER__>(acc);
1361  int& nSeeds = alpaka::declareSharedVar<int, __COUNTER__>(acc);
1362 
1363  if (once_per_block(acc)) {
1364  topoId = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
1365  nRHTopo = pfClusteringVars[topoId].topoRHCount();
1366  nSeeds = pfClusteringVars[topoId].topoSeedCount();
1367  }
1368 
1369  alpaka::syncBlockThreads(acc); // all threads call sync
1370 
1371  if (topoId < nRH && nRHTopo > 0 && nSeeds > 0) {
1372  if (nRHTopo == nSeeds) {
1373  // PF cluster is isolated seed. No iterations needed
1374  if (once_per_block(acc)) {
1375  // Fill PFCluster-level information
1376  int rhIdx = pfClusteringVars[pfClusteringVars[topoId].topoSeedOffsets()].topoSeedList();
1377  int seedIdx = pfClusteringVars[rhIdx].rhIdxToSeedIdx();
1378  clusterView[seedIdx].energy() = pfRecHits[rhIdx].energy();
1379  clusterView[seedIdx].x() = pfRecHits[rhIdx].x();
1380  clusterView[seedIdx].y() = pfRecHits[rhIdx].y();
1381  clusterView[seedIdx].z() = pfRecHits[rhIdx].z();
1382  }
1383  // singleSeed and multiSeedParallel functions work only for GPU backend
1384  } else if ((not std::is_same_v<Device, alpaka::DevCpu>)&&nSeeds == 1) {
1385  // Single seed cluster
1387  acc, pfClusParams, topology, topoId, nRHTopo, pfRecHits, pfClusteringVars, clusterView, fracView);
1388  } else if ((not std::is_same_v<Device, alpaka::DevCpu>)&&nSeeds <= 100 &&
1389  nRHTopo - nSeeds < threadsPerBlockForClustering) {
1391  acc, pfClusParams, topology, topoId, nSeeds, nRHTopo, pfRecHits, pfClusteringVars, clusterView, fracView);
1392  } else if (nSeeds <= 400 && nRHTopo - nSeeds <= 1500) {
1393  // nSeeds value must match exotic in FastClusterExotic
1395  acc, pfClusParams, topology, topoId, nSeeds, nRHTopo, pfRecHits, pfClusteringVars, clusterView, fracView);
1396  } else {
1397  if constexpr (debug) {
1398  if (once_per_block(acc))
1399  printf("Topo cluster %d has %d seeds and %d rechits. Will be processed in next kernel.\n",
1400  topoId,
1401  nSeeds,
1402  nRHTopo);
1403  }
1404  }
1405  }
1406  }
1407  };
1408 
1409  // Process very large, exotic topo clusters
1411  public:
1412  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
1413  ALPAKA_FN_ACC void operator()(const TAcc& acc,
1415  const reco::PFClusterParamsDeviceCollection::ConstView pfClusParams,
1416  const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology,
1420  Position4* __restrict__ globalClusterPos,
1421  Position4* __restrict__ globalPrevClusterPos,
1422  float* __restrict__ globalClusterEnergy,
1423  float* __restrict__ globalRhFracSum,
1424  int* __restrict__ globalSeeds,
1425  int* __restrict__ globalRechits) const {
1426  const int nRH = pfRecHits.size();
1427  for (int topoId = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]; topoId < nRH;
1428  topoId += blocksForExoticClusters) {
1429  int nRHTopo = pfClusteringVars[topoId].topoRHCount();
1430  int nSeeds = pfClusteringVars[topoId].topoSeedCount();
1431 
1432  // nSeeds value must match multiSeedIterative in FastCluster
1433  if (nRHTopo > 0 && (nSeeds > 400 || nRHTopo - nSeeds > 1500)) {
1435  pfClusParams,
1436  topology,
1437  topoId,
1438  nSeeds,
1439  nRHTopo,
1440  pfRecHits,
1441  pfClusteringVars,
1442  clusterView,
1443  fracView,
1444  globalClusterPos,
1445  globalPrevClusterPos,
1446  globalClusterEnergy,
1447  globalRhFracSum,
1448  globalSeeds,
1449  globalRechits);
1450  }
1451  alpaka::syncBlockThreads(acc); // all threads call sync
1452  }
1453  }
1454  };
1455 
1457  : nSeeds(cms::alpakatools::make_device_buffer<uint32_t>(queue)),
1458  globalClusterPos(
1460  globalPrevClusterPos(
1462  globalClusterEnergy(
1464  globalRhFracSum(cms::alpakatools::make_device_buffer<float[]>(queue, blocksForExoticClusters * maxTopoInput)),
1465  globalSeeds(cms::alpakatools::make_device_buffer<int[]>(queue, blocksForExoticClusters * maxTopoInput)),
1466  globalRechits(cms::alpakatools::make_device_buffer<int[]>(queue, blocksForExoticClusters * maxTopoInput)) {
1467  alpaka::memset(queue, nSeeds, 0x00); // Reset nSeeds
1468  }
1469 
1473  reco::PFClusteringVarsDeviceCollection& pfClusteringVars,
1474  reco::PFClusteringEdgeVarsDeviceCollection& pfClusteringEdgeVars,
1476  reco::PFClusterDeviceCollection& pfClusters,
1477  reco::PFRecHitFractionDeviceCollection& pfrhFractions) {
1478  const int nRH = pfRecHits->size();
1479  const int threadsPerBlock = 256;
1480  const int blocks = divide_up_by(nRH, threadsPerBlock);
1481 
1482  // seedingTopoThresh
1483  alpaka::exec<Acc1D>(queue,
1484  make_workdiv<Acc1D>(blocks, threadsPerBlock),
1486  pfClusteringVars.view(),
1487  params.view(),
1488  topology.view(),
1489  pfRecHits.view(),
1490  pfClusters.view(),
1491  pfrhFractions.view(),
1492  nSeeds.data());
1493  // prepareTopoInputs
1494  alpaka::exec<Acc1D>(queue,
1495  make_workdiv<Acc1D>(blocks, threadsPerBlock),
1497  pfRecHits.view(),
1498  pfClusteringVars.view(),
1499  pfClusteringEdgeVars.view(),
1500  nSeeds.data());
1501  // ECLCC
1502  alpaka::exec<Acc1D>(queue,
1503  make_workdiv<Acc1D>(blocks, threadsPerBlock),
1504  ECLCCInit{},
1505  pfRecHits.view(),
1506  pfClusteringVars.view(),
1507  pfClusteringEdgeVars.view());
1508  alpaka::exec<Acc1D>(queue,
1509  make_workdiv<Acc1D>(blocks, threadsPerBlock),
1510  ECLCCCompute1{},
1511  pfRecHits.view(),
1512  pfClusteringVars.view(),
1513  pfClusteringEdgeVars.view());
1514  alpaka::exec<Acc1D>(queue,
1515  make_workdiv<Acc1D>(blocks, threadsPerBlock),
1516  ECLCCFlatten{},
1517  pfRecHits.view(),
1518  pfClusteringVars.view(),
1519  pfClusteringEdgeVars.view());
1520  // topoClusterContraction
1521  alpaka::exec<Acc1D>(queue,
1522  make_workdiv<Acc1D>(1, threadsPerBlockForClustering),
1524  pfRecHits.view(),
1525  pfClusteringVars.view(),
1526  pfClusters.view(),
1527  nSeeds.data());
1528 
1529  // fillRhfIndex
1530  alpaka::exec<Acc2D>(queue,
1531  make_workdiv<Acc2D>({divide_up_by(nRH, 32), divide_up_by(nRH, 32)}, {32, 32}),
1532  FillRhfIndex{},
1533  pfRecHits.view(),
1534  pfClusteringVars.view(),
1535  pfrhFractions.view());
1536 
1537  // Run fastCluster
1538  alpaka::exec<Acc1D>(queue,
1539  make_workdiv<Acc1D>(nRH, threadsPerBlockForClustering),
1540  FastCluster{},
1541  pfRecHits.view(),
1542  params.view(),
1543  topology.view(),
1544  pfClusteringVars.view(),
1545  pfClusters.view(),
1546  pfrhFractions.view());
1547  // exotic clustering kernel
1548  alpaka::exec<Acc1D>(queue,
1549  make_workdiv<Acc1D>(blocksForExoticClusters,
1550  threadsPerBlockForClustering), // uses 4 blocks to minimize memory usage
1552  pfRecHits.view(),
1553  params.view(),
1554  topology.view(),
1555  pfClusteringVars.view(),
1556  pfClusters.view(),
1557  pfrhFractions.view(),
1558  globalClusterPos.data(),
1559  globalPrevClusterPos.data(),
1560  globalClusterEnergy.data(),
1561  globalRhFracSum.data(),
1562  globalSeeds.data(),
1563  globalRechits.data());
1564  }
1565 
1566 } // namespace ALPAKA_ACCELERATOR_NAMESPACE
static constexpr uint32_t blocksForExoticClusters
ALPAKA_FN_ACC constexpr bool once_per_block(TAcc const &acc)
static ALPAKA_FN_ACC auto getSeedRhIdx(int *seeds, int seedNum)
ALPAKA_FN_ACC auto uniform_elements(TAcc const &acc, TArgs... args)
Definition: workdivision.h:311
constexpr Idx divide_up_by(Idx value, Idx divisor)
Definition: workdivision.h:20
cms::alpakatools::device_buffer< Device, uint32_t > nSeeds
ALPAKA_FN_ACC void operator()(const TAcc &acc, const reco::PFRecHitHostCollection::ConstView pfRecHits, const reco::PFClusterParamsDeviceCollection::ConstView pfClusParams, const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology, reco::PFClusteringVarsDeviceCollection::View pfClusteringVars, reco::PFClusterDeviceCollection::View clusterView, reco::PFRecHitFractionDeviceCollection::View fracView) const
T w() const
ALPAKA_FN_ACC auto uniform_elements_nd(TAcc const &acc)
Definition: workdivision.h:575
static ALPAKA_FN_ACC void hcalFastCluster_multiSeedIterative(const TAcc &acc, reco::PFClusterParamsDeviceCollection::ConstView pfClusParams, const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology, int topoId, int nSeeds, int nRHTopo, reco::PFRecHitDeviceCollection::ConstView pfRecHits, reco::PFClusteringVarsDeviceCollection::View pfClusteringVars, reco::PFClusterDeviceCollection::View clusterView, reco::PFRecHitFractionDeviceCollection::View fracView)
ALPAKA_FN_ACC void operator()(const TAcc &acc, const reco::PFRecHitHostCollection::ConstView pfRecHits, reco::PFClusteringVarsDeviceCollection::View pfClusteringVars, reco::PFRecHitFractionDeviceCollection::View fracView) const
std::enable_if_t< alpaka::isDevice< TDev > and not std::is_array_v< T >, device_buffer< TDev, T > > make_device_buffer(TDev const &device)
Definition: memory.h:185
static ALPAKA_FN_ACC void hcalFastCluster_multiSeedParallel(const TAcc &acc, reco::PFClusterParamsDeviceCollection::ConstView pfClusParams, const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology, int topoId, int nSeeds, int nRHTopo, reco::PFRecHitDeviceCollection::ConstView pfRecHits, reco::PFClusteringVarsDeviceCollection::View pfClusteringVars, reco::PFClusterDeviceCollection::View clusterView, reco::PFRecHitFractionDeviceCollection::View fracView)
cms::alpakatools::device_buffer< Device, reco::pfClustering::Position4[]> globalPrevClusterPos
cms::alpakatools::device_buffer< Device, int[]> globalSeeds
static ALPAKA_FN_ACC void hcalFastCluster_exotic(const TAcc &acc, reco::PFClusterParamsDeviceCollection::ConstView pfClusParams, const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology, int topoId, int nSeeds, int nRHTopo, reco::PFRecHitDeviceCollection::ConstView pfRecHits, reco::PFClusteringVarsDeviceCollection::View pfClusteringVars, reco::PFClusterDeviceCollection::View clusterView, reco::PFRecHitFractionDeviceCollection::View fracView, Position4 *__restrict__ globalClusterPos, Position4 *__restrict__ globalPrevClusterPos, float *__restrict__ globalClusterEnergy, float *__restrict__ globalRhFracSum, int *__restrict__ globalSeeds, int *__restrict__ globalRechits)
constexpr uint32_t stride
Definition: HelixFit.h:22
ALPAKA_FN_ACC void operator()(const TAcc &acc, reco::PFClusteringVarsDeviceCollection::View pfClusteringVars, const reco::PFClusterParamsDeviceCollection::ConstView pfClusParams, const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology, const reco::PFRecHitHostCollection::ConstView pfRecHits, reco::PFClusterDeviceCollection::View clusterView, reco::PFRecHitFractionDeviceCollection::View fracView, uint32_t *__restrict__ nSeeds) const
ALPAKA_FN_ACC void operator()(const TAcc &acc, const reco::PFRecHitHostCollection::ConstView pfRecHits, reco::PFClusteringVarsDeviceCollection::View pfClusteringVars, reco::PFClusterDeviceCollection::View clusterView, uint32_t *__restrict__ nSeeds) const
ALPAKA_FN_ACC static ALPAKA_FN_INLINE float dR2(Position4 pos1, Position4 pos2)
ALPAKA_FN_ACC void operator()(const TAcc &acc, const reco::PFRecHitHostCollection::ConstView pfRecHits, const reco::PFClusterParamsDeviceCollection::ConstView pfClusParams, const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology, reco::PFClusteringVarsDeviceCollection::View pfClusteringVars, reco::PFClusterDeviceCollection::View clusterView, reco::PFRecHitFractionDeviceCollection::View fracView, Position4 *__restrict__ globalClusterPos, Position4 *__restrict__ globalPrevClusterPos, float *__restrict__ globalClusterEnergy, float *__restrict__ globalRhFracSum, int *__restrict__ globalSeeds, int *__restrict__ globalRechits) const
void execute(Queue &queue, const reco::PFClusterParamsDeviceCollection &params, const reco::PFRecHitHCALTopologyDeviceCollection &topology, reco::PFClusteringVarsDeviceCollection &pfClusteringVars, reco::PFClusteringEdgeVarsDeviceCollection &pfClusteringEdgeVars, const reco::PFRecHitHostCollection &pfRecHits, reco::PFClusterDeviceCollection &pfClusters, reco::PFRecHitFractionDeviceCollection &pfrhFractions)
cms::alpakatools::device_buffer< Device, float[]> globalRhFracSum
Abs< T >::type abs(const T &t)
Definition: Abs.h:22
PortableCollection<::reco::PFClusterSoA > PFClusterDeviceCollection
ALPAKA_FN_HOST_ACC static ALPAKA_FN_INLINE float atomicMaxF(const TAcc &acc, float *address, float val)
Definition: atomicMaxF.h:11
double f[11][100]
cms::alpakatools::device_buffer< Device, int[]> globalRechits
ALPAKA_FN_ACC void operator()(const TAcc &acc, const reco::PFRecHitHostCollection::ConstView pfRecHits, reco::PFClusteringVarsDeviceCollection::View pfClusteringVars, reco::PFClusteringEdgeVarsDeviceCollection::View pfClusteringEdgeVars, uint32_t *__restrict__ nSeeds) const
const dim3 blockIdx
Definition: cudaCompat.h:32
PortableCollection<::reco::PFClusteringVarsSoA > PFClusteringVarsDeviceCollection
#define M_PI
PortableCollection<::reco::PFRecHitHCALTopologySoA > PFRecHitHCALTopologyDeviceCollection
static ALPAKA_FN_ACC auto getRhFrac(reco::PFClusteringVarsDeviceCollection::View pfClusteringVars, int topoSeedBegin, reco::PFRecHitFractionDeviceCollection::View fracView, int seedNum, int rhNum)
Namespace of DDCMS conversion namespace.
PortableCollection<::reco::PFRecHitFractionSoA > PFRecHitFractionDeviceCollection
typename Layout::ConstView ConstView
PortableCollection<::reco::PFClusterParamsSoA > PFClusterParamsDeviceCollection
#define debug
Definition: HDRShower.cc:19
cms::alpakatools::device_buffer< Device, float[]> globalClusterEnergy
static ALPAKA_FN_ACC void updateClusterPos(reco::PFClusterParamsDeviceCollection::ConstView pfClusParams, Position4 &pos4, float frac, int rhInd, reco::PFRecHitDeviceCollection::ConstView pfRecHits, float rhENormInv)
T mag2() const
The vector magnitude squared. Equivalent to vec.dot(vec)
PFClusterProducerKernel(Queue &queue, const reco::PFRecHitHostCollection &pfRecHits)
PortableCollection<::reco::PFClusteringEdgeVarsSoA > PFClusteringEdgeVarsDeviceCollection
ALPAKA_FN_ACC constexpr bool once_per_grid(TAcc const &acc)
static ALPAKA_FN_ACC void hcalFastCluster_singleSeed(const TAcc &acc, reco::PFClusterParamsDeviceCollection::ConstView pfClusParams, const reco::PFRecHitHCALTopologyDeviceCollection::ConstView topology, int topoId, int nRHTopo, reco::PFRecHitDeviceCollection::ConstView pfRecHits, reco::PFClusteringVarsDeviceCollection::View pfClusteringVars, reco::PFClusterDeviceCollection::View clusterView, reco::PFRecHitFractionDeviceCollection::View fracView)
float x
cms::alpakatools::device_buffer< Device, reco::pfClustering::Position4[]> globalClusterPos
T1 atomicAdd(T1 *a, T2 b)
Definition: cudaCompat.h:61