PixelClustering.h
#ifndef RecoLocalTracker_SiPixelClusterizer_alpaka_PixelClustering_h
#define RecoLocalTracker_SiPixelClusterizer_alpaka_PixelClustering_h

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <type_traits>

#include <alpaka/alpaka.hpp>

#include "DataFormats/SiPixelClusterSoA/interface/ClusteringConstants.h"
#include "DataFormats/SiPixelClusterSoA/interface/SiPixelClustersSoA.h"
#include "DataFormats/SiPixelDigiSoA/interface/SiPixelDigisSoA.h"
#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"

//#define GPU_DEBUG

namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {

#ifdef GPU_DEBUG
  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
  ALPAKA_STATIC_ACC_MEM_GLOBAL uint32_t gMaxHit = 0;
#endif

  namespace pixelStatus {
    // Phase-1 pixel modules
    constexpr uint32_t pixelSizeX = pixelTopology::Phase1::numRowsInModule;
    constexpr uint32_t pixelSizeY = pixelTopology::Phase1::numColsInModule;

    // Use 0x00, 0x01, 0x03 so each can be OR'ed on top of the previous ones
    enum Status : uint32_t { kEmpty = 0x00, kFound = 0x01, kDuplicate = 0x03 };

    constexpr uint32_t bits = 2;
    constexpr uint32_t mask = (0x01 << bits) - 1;
    constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits;
    constexpr uint32_t size = pixelSizeX * pixelSizeY / valuesPerWord;

    ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getIndex(uint16_t x, uint16_t y) {
      return (pixelSizeX * y + x) / valuesPerWord;
    }

    ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getShift(uint16_t x, uint16_t y) {
      return (x % valuesPerWord) * 2;
    }

    ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr Status getStatus(uint32_t const* __restrict__ status,
                                                              uint16_t x,
                                                              uint16_t y) {
      uint32_t index = getIndex(x, y);
      uint32_t shift = getShift(x, y);
      return Status{(status[index] >> shift) & mask};
    }

    ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr bool isDuplicate(uint32_t const* __restrict__ status,
                                                              uint16_t x,
                                                              uint16_t y) {
      return getStatus(status, x, y) == kDuplicate;
    }
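
    // Worked example (added comment, not in the original file): with Phase-1 modules of
    // 160 x 416 pixels and 2 bits per pixel, each 32-bit word holds valuesPerWord = 16
    // statuses, so a whole module fits in 160 * 416 / 16 = 4160 words. For pixel (x=5, y=1):
    //   getIndex(5, 1) = (160 * 1 + 5) / 16 = 10   -> word 10
    //   getShift(5, 1) = (5 % 16) * 2      = 10    -> bits [10:11] of that word
    // and getStatus() masks those two bits back into a Status value.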

    /* FIXME
     * In the more general case (e.g. a multithreaded CPU backend) there is a potential race condition
     * between the plain read of status[index] at the beginning of promote() and the atomicCas in its
     * compare-and-swap loop. We should investigate:
     * - if `status` should be read through a `volatile` pointer (CUDA/ROCm)
     * - if `status` should be read with an atomic load (CPU)
     */
    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
    ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr void promote(TAcc const& acc,
                                                          uint32_t* __restrict__ status,
                                                          const uint16_t x,
                                                          const uint16_t y) {
      uint32_t index = getIndex(x, y);
      uint32_t shift = getShift(x, y);
      uint32_t old_word = status[index];
      uint32_t expected = old_word;
      do {
        expected = old_word;
        Status old_status{(old_word >> shift) & mask};
        if (kDuplicate == old_status) {
          // nothing to do
          return;
        }
        Status new_status = (kEmpty == old_status) ? kFound : kDuplicate;
        uint32_t new_word = old_word | (static_cast<uint32_t>(new_status) << shift);
        old_word = alpaka::atomicCas(acc, &status[index], expected, new_word, alpaka::hierarchy::Blocks{});
      } while (expected != old_word);
    }
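
    // Usage note (added comment, not in the original file): promote() implements a small
    // lock-free state machine per pixel; repeated calls on the same (x, y) move its status
    // along
    //   kEmpty -> kFound -> kDuplicate -> kDuplicate (no-op)
    // Because kFound (0x01) and kDuplicate (0x03) only ever OR bits into the word, a status
    // can only gain bits, and the compare-and-swap loop always terminates.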

  }  // namespace pixelStatus

  template <typename TrackerTraits>
  struct CountModules {
    template <typename TAcc>
    ALPAKA_FN_ACC void operator()(const TAcc& acc,
                                  SiPixelDigisSoAView digi_view,
                                  SiPixelClustersSoAView clus_view,
                                  const unsigned int numElements) const {
      // Make sure the atomicInc below does not overflow
      static_assert(TrackerTraits::numberOfModules < ::pixelClustering::maxNumModules);

#ifdef GPU_DEBUG
      if (cms::alpakatools::once_per_grid(acc)) {
        printf("Starting to count modules to set module starts:");
      }
#endif
      for (int32_t i : cms::alpakatools::uniform_elements(acc, numElements)) {
        digi_view[i].clus() = i;
        if (::pixelClustering::invalidModuleId == digi_view[i].moduleId())
          continue;

        int32_t j = i - 1;
        while (j >= 0 and digi_view[j].moduleId() == ::pixelClustering::invalidModuleId)
          --j;
        if (j < 0 or digi_view[j].moduleId() != digi_view[i].moduleId()) {
          // Found a module boundary: count the number of modules in clus_view[0].moduleStart()
          auto loc = alpaka::atomicInc(acc,
                                       &clus_view[0].moduleStart(),
                                       static_cast<uint32_t>(::pixelClustering::maxNumModules),
                                       alpaka::hierarchy::Blocks{});
#ifdef GPU_DEBUG
          printf("> New module (no. %d) found at digi %d \n", loc, i);
#endif
          clus_view[loc + 1].moduleStart() = i;
        }
      }
    }
  };
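
  // Example (added comment, not in the original file): for six digis with moduleIds
  // {5, 5, 7, 7, 7, 9}, the kernel finds module boundaries at digis 0, 2 and 5, so
  // after it runs
  //   clus_view[0].moduleStart() == 3                (number of modules found)
  //   {clus_view[1..3].moduleStart()} == {0, 2, 5}   (filled atomically, in any order)
  // and every digi starts out as its own cluster candidate, clus() == digi index.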

  template <typename TrackerTraits>
  struct FindClus {
    // assume that we can cover the whole module with up to 16 blockDimension-wide iterations
    static constexpr uint32_t maxIterGPU = 16;

    // this must be larger than maxPixInModule / maxIterGPU, and should be a multiple of the warp size
    static constexpr uint32_t maxElementsPerBlock =
        cms::alpakatools::round_up_by(TrackerTraits::maxPixInModule / maxIterGPU, 128);

    template <typename TAcc>
    ALPAKA_FN_ACC void operator()(const TAcc& acc,
                                  SiPixelDigisSoAView digi_view,
                                  SiPixelClustersSoAView clus_view,
                                  const unsigned int numElements) const {
      auto& lastPixel = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);

      const uint32_t lastModule = clus_view[0].moduleStart();
      for (uint32_t module : cms::alpakatools::independent_groups(acc, lastModule)) {
        auto firstPixel = clus_view[1 + module].moduleStart();
        uint32_t thisModuleId = digi_view[firstPixel].moduleId();
        ALPAKA_ASSERT_ACC(thisModuleId < TrackerTraits::numberOfModules);

#ifdef GPU_DEBUG
        if (thisModuleId % 100 == 1)
          if (cms::alpakatools::once_per_block(acc))
            printf("start clusterizer for module %4d in block %4d\n",
                   thisModuleId,
                   alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
#endif

        // find the index of the first pixel not belonging to this module (or invalid)
        lastPixel = numElements;
        alpaka::syncBlockThreads(acc);

        // skip threads not associated to an existing pixel
        for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, numElements)) {
          auto id = digi_view[i].moduleId();
          // skip invalid pixels
          if (id == ::pixelClustering::invalidModuleId)
            continue;
          // find the first pixel in a different module
          if (id != thisModuleId) {
            alpaka::atomicMin(acc, &lastPixel, i, alpaka::hierarchy::Threads{});
            break;
          }
        }

        using Hist = cms::alpakatools::HistoContainer<uint16_t,
                                                      TrackerTraits::clusterBinning,
                                                      TrackerTraits::maxPixInModule,
                                                      TrackerTraits::clusterBits,
                                                      uint16_t>;
#if defined(__HIP_DEVICE_COMPILE__)
        constexpr auto warpSize = __AMDGCN_WAVEFRONT_SIZE;
#else
        constexpr auto warpSize = 32;
#endif
        auto& hist = alpaka::declareSharedVar<Hist, __COUNTER__>(acc);
        auto& ws = alpaka::declareSharedVar<typename Hist::Counter[warpSize], __COUNTER__>(acc);
        for (uint32_t j : cms::alpakatools::independent_group_elements(acc, Hist::totbins())) {
          hist.off[j] = 0;
        }
        alpaka::syncBlockThreads(acc);

        ALPAKA_ASSERT_ACC((lastPixel == numElements) or
                          ((lastPixel < numElements) and (digi_view[lastPixel].moduleId() != thisModuleId)));
        // limit the number of pixels to maxPixInModule (FIXME: if this turns out to be recurrent,
        // and not limited to simulation with low thresholds, a smarter approach will be needed)
        if (cms::alpakatools::once_per_block(acc)) {
          if (lastPixel - firstPixel > TrackerTraits::maxPixInModule) {
            printf("too many pixels in module %u: %u > %u\n",
                   thisModuleId,
                   lastPixel - firstPixel,
                   TrackerTraits::maxPixInModule);
            lastPixel = TrackerTraits::maxPixInModule + firstPixel;
          }
        }
        alpaka::syncBlockThreads(acc);
        ALPAKA_ASSERT_ACC(lastPixel - firstPixel <= TrackerTraits::maxPixInModule);

#ifdef GPU_DEBUG
        auto& totGood = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
        totGood = 0;
        alpaka::syncBlockThreads(acc);
#endif

        // remove duplicate pixels
        constexpr bool isPhase2 = std::is_base_of<pixelTopology::Phase2, TrackerTraits>::value;
        if constexpr (not isPhase2) {
          // packed words array used to store the pixelStatus of each pixel
          auto& status = alpaka::declareSharedVar<uint32_t[pixelStatus::size], __COUNTER__>(acc);

          if (lastPixel > 1) {
            for (uint32_t i : cms::alpakatools::independent_group_elements(acc, pixelStatus::size)) {
              status[i] = 0;
            }
            alpaka::syncBlockThreads(acc);

            for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel - 1)) {
              // skip invalid pixels
              if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
                continue;
              pixelStatus::promote(acc, status, digi_view[i].xx(), digi_view[i].yy());
            }
            alpaka::syncBlockThreads(acc);

            for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel - 1)) {
              // skip invalid pixels
              if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
                continue;
              if (pixelStatus::isDuplicate(status, digi_view[i].xx(), digi_view[i].yy())) {
                digi_view[i].moduleId() = ::pixelClustering::invalidModuleId;
                digi_view[i].rawIdArr() = 0;
              }
            }
            alpaka::syncBlockThreads(acc);
          }
        }
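
        // Note (added comment, not in the original file): after this step every (x, y)
        // position that fired more than once in this module has had all of its digis
        // invalidated, so each surviving digi corresponds to a unique pixel.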

        // fill histo
        for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
          // skip invalid pixels
          if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) {
            hist.count(acc, digi_view[i].yy());
#ifdef GPU_DEBUG
            alpaka::atomicAdd(acc, &totGood, 1u, alpaka::hierarchy::Blocks{});
#endif
          }
        }
        alpaka::syncBlockThreads(acc);  // FIXME this can be removed
        for (uint32_t i : cms::alpakatools::independent_group_elements(acc, warpSize)) {
          ws[i] = 0;  // used by the prefix scan in hist.finalize()
        }
        alpaka::syncBlockThreads(acc);
        hist.finalize(acc, ws);
        alpaka::syncBlockThreads(acc);
#ifdef GPU_DEBUG
        ALPAKA_ASSERT_ACC(hist.size() == totGood);
        if (thisModuleId % 100 == 1)
          if (cms::alpakatools::once_per_block(acc))
            printf("histo size %d\n", hist.size());
#endif
        for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
          // skip invalid pixels
          if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) {
            hist.fill(acc, digi_view[i].yy(), i - firstPixel);
          }
        }
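
        // Note (added comment, not in the original file): at this point hist maps each
        // pixel column (the y coordinate) to the indices of the pixels in that column,
        // so the neighbour search below only scans columns y and y+1 instead of the
        // whole module.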

#ifdef GPU_DEBUG
        // look for anomalous high occupancy
        auto& n40 = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
        auto& n60 = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
        if (cms::alpakatools::once_per_block(acc)) {
          n40 = 0;
          n60 = 0;
        }
        alpaka::syncBlockThreads(acc);
        for (uint32_t j : cms::alpakatools::independent_group_elements(acc, Hist::nbins())) {
          if (hist.size(j) > 60)
            alpaka::atomicAdd(acc, &n60, 1u, alpaka::hierarchy::Blocks{});
          if (hist.size(j) > 40)
            alpaka::atomicAdd(acc, &n40, 1u, alpaka::hierarchy::Blocks{});
        }
        alpaka::syncBlockThreads(acc);
        if (cms::alpakatools::once_per_block(acc)) {
          if (n60 > 0)
            printf("columns with more than 60 px %d in %d\n", n60, thisModuleId);
          else if (n40 > 0)
            printf("columns with more than 40 px %d in %d\n", n40, thisModuleId);
        }
        alpaka::syncBlockThreads(acc);
#endif

        [[maybe_unused]] const uint32_t blockDimension = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
        // assume that we can cover the whole module with up to maxIterGPU blockDimension-wide iterations
        ALPAKA_ASSERT_ACC((hist.size() / blockDimension) < maxIterGPU);

        // number of elements per thread
        constexpr uint32_t maxElements =
            cms::alpakatools::requires_single_thread_per_block_v<TAcc> ? maxElementsPerBlock : 1;
        ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u] <= maxElements));

        constexpr unsigned int maxIter = maxIterGPU * maxElements;

        // nearest neighbours (nn)
        // allocate space for duplicate pixels: a pixel can appear more than once
        // with different charge in the same event
        constexpr int maxNeighbours = 10;
        uint16_t nn[maxIter][maxNeighbours];
        uint8_t nnn[maxIter];  // number of nn
        for (uint32_t k = 0; k < maxIter; ++k) {
          nnn[k] = 0;
        }

        alpaka::syncBlockThreads(acc);  // for hit filling!

        // fill the nearest neighbours
        uint32_t k = 0;
        for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) {
          ALPAKA_ASSERT_ACC(k < maxIter);
          auto p = hist.begin() + j;
          auto i = *p + firstPixel;
          ALPAKA_ASSERT_ACC(digi_view[i].moduleId() != ::pixelClustering::invalidModuleId);
          ALPAKA_ASSERT_ACC(digi_view[i].moduleId() == thisModuleId);  // same module
          auto bin = Hist::bin(digi_view[i].yy() + 1);
          auto end = hist.end(bin);
          ++p;
          ALPAKA_ASSERT_ACC(0 == nnn[k]);
          for (; p < end; ++p) {
            auto m = *p + firstPixel;
            ALPAKA_ASSERT_ACC(m != i);
            ALPAKA_ASSERT_ACC(int(digi_view[m].yy()) - int(digi_view[i].yy()) >= 0);
            ALPAKA_ASSERT_ACC(int(digi_view[m].yy()) - int(digi_view[i].yy()) <= 1);
            if (std::abs(int(digi_view[m].xx()) - int(digi_view[i].xx())) <= 1) {
              auto l = nnn[k]++;
              ALPAKA_ASSERT_ACC(l < maxNeighbours);
              nn[k][l] = *p;
            }
          }
          ++k;
        }
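
        // Note (added comment, not in the original file): each pair of adjacent pixels
        // is recorded only once, from the pixel with the lower column (or the lower
        // index within the same column); this is sufficient because the atomicMin
        // exchange below updates both ends of every recorded pair.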

        // for each pixel, look at all the pixels until the end of the module;
        // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum;
        // after the loop, all the pixels in each cluster should have an id equal to that of
        // the lowest pixel in the cluster (clus[i] == i).
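        // Illustration (added, not in the original file): if three mutually adjacent
        // pixels start with clus = {4, 7, 9}, one pass of the atomicMin exchange lowers
        // them towards {4, 4, 4}, possibly leaving chains such as 9 -> 7 -> 4; the
        // pointer-jumping pass below (m = clus[m] until m == clus[m]) collapses the
        // chains, and the outer loop repeats until a full pass applies no change.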
        bool more = true;
        /*
        int nloops = 0;
        */
        while (alpaka::syncBlockThreadsPredicate<alpaka::BlockOr>(acc, more)) {
          /*
          if (nloops % 2 == 0) {
            // even iterations of the outer loop
          */
          more = false;
          uint32_t k = 0;
          for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) {
            ALPAKA_ASSERT_ACC(k < maxIter);
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            for (int kk = 0; kk < nnn[k]; ++kk) {
              auto l = nn[k][kk];
              auto m = l + firstPixel;
              ALPAKA_ASSERT_ACC(m != i);
              // FIXME ::Threads ?
              auto old = alpaka::atomicMin(acc, &digi_view[m].clus(), digi_view[i].clus(), alpaka::hierarchy::Blocks{});
              if (old != digi_view[i].clus()) {
                // end the loop only if no changes were applied
                more = true;
              }
              // FIXME ::Threads ?
              alpaka::atomicMin(acc, &digi_view[i].clus(), old, alpaka::hierarchy::Blocks{});
            }  // neighbours loop
            ++k;
          }  // pixel loop
          /*
          // use the outer loop to force a synchronisation
          } else {
            // odd iterations of the outer loop
          */
          alpaka::syncBlockThreads(acc);
          for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            auto m = digi_view[i].clus();
            while (m != digi_view[m].clus())
              m = digi_view[m].clus();
            digi_view[i].clus() = m;
          }
          /*
          }
          ++nloops;
          */
        }  // end while

        /*
        // check that all threads in the block have executed the same number of iterations
        auto& n0 = alpaka::declareSharedVar<int, __COUNTER__>(acc);
        if (cms::alpakatools::once_per_block(acc))
          n0 = nloops;
        alpaka::syncBlockThreads(acc);
        ALPAKA_ASSERT_ACC(alpaka::syncBlockThreadsPredicate<alpaka::BlockAnd>(acc, nloops == n0));
        if (thisModuleId % 100 == 1)
          if (cms::alpakatools::once_per_block(acc))
            printf("# loops %d\n", nloops);
        */

        auto& foundClusters = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
        foundClusters = 0;
        alpaka::syncBlockThreads(acc);

        // find the number of different clusters, identified by pixels with clus[i] == i;
        // mark these pixels with a negative id.
        for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
          // skip invalid pixels
          if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
            continue;
          if (digi_view[i].clus() == static_cast<int>(i)) {
            auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{});
            digi_view[i].clus() = -(old + 1);
          }
        }
        alpaka::syncBlockThreads(acc);

        // propagate the negative id to all the pixels in the cluster.
        for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
          // skip invalid pixels
          if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
            continue;
          if (digi_view[i].clus() >= 0) {
            // mark each pixel in a cluster with the same id as the first one
            digi_view[i].clus() = digi_view[digi_view[i].clus()].clus();
          }
        }
        alpaka::syncBlockThreads(acc);

        // adjust the cluster id to be a positive value starting from 0
        for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
          if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId) {
            // mark invalid pixels with an invalid cluster index
            digi_view[i].clus() = ::pixelClustering::invalidClusterId;
          } else {
            digi_view[i].clus() = -digi_view[i].clus() - 1;
          }
        }
        alpaka::syncBlockThreads(acc);
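
        // Example (added, not in the original file): suppose pixels 10, 11, 12 form one
        // cluster and pixel 13 its own, with clus = {10, 10, 10, 13} after the scan.
        // The counting pass turns this into {-1, 10, 10, -2} (the atomic order may differ),
        // the propagation pass into {-1, -1, -1, -2}, and the final adjustment into
        // {0, 0, 0, 1}: consecutive non-negative cluster ids within the module.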

        if (cms::alpakatools::once_per_block(acc)) {
          clus_view[thisModuleId].clusInModule() = foundClusters;
          clus_view[module].moduleId() = thisModuleId;
#ifdef GPU_DEBUG
          if (foundClusters > gMaxHit<TAcc>) {
            gMaxHit<TAcc> = foundClusters;
            if (foundClusters > 8)
              printf("max hit %d in %d\n", foundClusters, thisModuleId);
          }
          if (thisModuleId % 100 == 1)
            printf("%d clusters in module %d\n", foundClusters, thisModuleId);
#endif
        }
      }  // module loop
    }
  };

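  // Usage sketch (illustrative only; the launch values and view names here are
  // hypothetical, and the real launch code lives in the clusterizer plugin):
  //
  //   const auto workDiv = cms::alpakatools::make_workdiv<Acc1D>(blocks, threadsPerBlock);
  //   alpaka::exec<Acc1D>(queue, workDiv, CountModules<pixelTopology::Phase1>{}, digis.view(), clusters.view(), nDigis);
  //   alpaka::exec<Acc1D>(queue, workDiv, FindClus<pixelTopology::Phase1>{}, digis.view(), clusters.view(), nDigis);
  //
  // CountModules must run to completion before FindClus starts; enqueueing both on the
  // same queue provides that ordering.
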
}  // namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering

#endif  // RecoLocalTracker_SiPixelClusterizer_alpaka_PixelClustering_h