CMS 3D CMS Logo

PixelClustering.h
Go to the documentation of this file.
1 #ifndef RecoLocalTracker_SiPixelClusterizer_alpaka_PixelClustering_h
2 #define RecoLocalTracker_SiPixelClusterizer_alpaka_PixelClustering_h
3 
4 #include <cmath>
5 #include <cstdint>
6 #include <cstdio>
7 #include <type_traits>
8 
9 #include <alpaka/alpaka.hpp>
10 
16 
17 //#define GPU_DEBUG
18 
20 
21 #ifdef GPU_DEBUG
// Debug-only per-accelerator global: tracks the largest number of clusters
// found in any single module (updated at the end of FindClus::operator()).
 22  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
 23  ALPAKA_STATIC_ACC_MEM_GLOBAL uint32_t gMaxHit = 0;
 24 #endif
 25 
// Packed 2-bit per-pixel status bookkeeping, used by FindClus to detect and
// drop duplicate digis (same pixel appearing more than once in an event).
 26  namespace pixelStatus {
 27  // Phase-1 pixel modules
// NOTE(review): the extraction dropped doxygen lines 28-29 here, which
// presumably defined pixelSizeX / pixelSizeY (used by getIndex below) from the
// Phase-1 module geometry -- verify against the upstream header.
 30 
 31  // Use 0x00, 0x01, 0x03 so each can be OR'ed on top of the previous ones
 32  enum Status : uint32_t { kEmpty = 0x00, kFound = 0x01, kDuplicate = 0x03 };
 33 
// 2 status bits per pixel, so mask == 0x03 and 16 statuses fit in one uint32_t.
 34  constexpr uint32_t bits = 2;
 35  constexpr uint32_t mask = (0x01 << bits) - 1;
 36  constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits;
// NOTE(review): doxygen line 37 is missing here; it presumably defined
// `size`, the number of packed words (referenced by FindClus) -- confirm.
 38 
39  ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getIndex(uint16_t x, uint16_t y) {
40  return (pixelSizeX * y + x) / valuesPerWord;
41  }
42 
43  ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getShift(uint16_t x, uint16_t y) {
44  return (x % valuesPerWord) * 2;
45  }
46 
47  ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr Status getStatus(uint32_t const* __restrict__ status,
48  uint16_t x,
49  uint16_t y) {
50  uint32_t index = getIndex(x, y);
51  uint32_t shift = getShift(x, y);
52  return Status{(status[index] >> shift) & mask};
53  }
54 
55  ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr bool isDuplicate(uint32_t const* __restrict__ status,
56  uint16_t x,
57  uint16_t y) {
58  return getStatus(status, x, y) == kDuplicate;
59  }
60 
 61  /* FIXME
 62  * In the more general case (e.g. a multithreaded CPU backend) there is a potential race condition
 63  * between the initial non-atomic read of status[index] and the atomicCas in promote() below.
 64  * We should investigate:
 65  * - if `status` should be read through a `volatile` pointer (CUDA/ROCm)
 66  * - if `status` should be read with an atomic load (CPU)
 67  */
// Atomically advance the status of pixel (x, y): kEmpty -> kFound on the first
// sighting, kFound -> kDuplicate on the second; kDuplicate is terminal.
// Uses a compare-and-swap retry loop so concurrent threads promoting pixels
// packed into the same 32-bit word do not lose updates.
 68  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
 69  ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr void promote(TAcc const& acc,
 70  uint32_t* __restrict__ status,
 71  const uint16_t x,
 72  const uint16_t y) {
 73  uint32_t index = getIndex(x, y);
 74  uint32_t shift = getShift(x, y);
// Non-atomic initial read; see the FIXME above about a possible race with the
// atomicCas on some backends.
 75  uint32_t old_word = status[index];
 76  uint32_t expected = old_word;
 77  do {
 78  expected = old_word;
 79  Status old_status{(old_word >> shift) & mask};
 80  if (kDuplicate == old_status) {
 81  // nothing to do
 82  return;
 83  }
// OR-ing the new status on top of the old one works because of the chosen
// encoding: 0x00 | 0x01 == kFound, 0x01 | 0x03 == kDuplicate.
 84  Status new_status = (kEmpty == old_status) ? kFound : kDuplicate;
 85  uint32_t new_word = old_word | (static_cast<uint32_t>(new_status) << shift);
// CAS returns the value actually found in memory; retry if another thread
// changed the word between our read and the swap.
 86  old_word = alpaka::atomicCas(acc, &status[index], expected, new_word, alpaka::hierarchy::Blocks{});
 87  } while (expected != old_word);
 88  }
89 
90  } // namespace pixelStatus
91 
// Kernel: assigns each digi a provisional cluster id equal to its own index,
// counts module boundaries in the digi stream, and records the index of the
// first digi of each module in clus_view[1..].moduleStart(); the running count
// of modules is kept in clus_view[0].moduleStart().
// NOTE(review): the extraction dropped several source lines in this struct
// (doxygen lines 100, 103, 120-121 are missing), so some statements below are
// visibly incomplete -- verify against the upstream header before editing.
 92  template <typename TrackerTraits>
 93  struct CountModules {
 94  template <typename TAcc>
 95  ALPAKA_FN_ACC void operator()(const TAcc& acc,
 96  SiPixelDigisSoAView digi_view,
 97  SiPixelClustersSoAView clus_view,
 98  const unsigned int numElements) const {
 99  // Make sure the atomicInc below does not overflow
// NOTE(review): the assertion referenced by the comment above (doxygen line
// 100) is missing from this dump.
 101 
 102 #ifdef GPU_DEBUG
// NOTE(review): the opening of this conditional (doxygen line 103, presumably
// a once-per-grid guard around the printf) is missing; the `}` below closes it.
 104  printf("Starting to count modules to set module starts:");
 105  }
 106 #endif
 107  for (int32_t i : cms::alpakatools::uniform_elements(acc, numElements)) {
// Provisional cluster id: every digi starts as its own cluster.
 108  digi_view[i].clus() = i;
 109  if (::pixelClustering::invalidModuleId == digi_view[i].moduleId())
 110  continue;
 111 
// Walk backwards over invalid digis to find the previous valid one.
 112  int32_t j = i - 1;
 113  while (j >= 0 and digi_view[j].moduleId() == ::pixelClustering::invalidModuleId)
 114  --j;
 115  if (j < 0 or digi_view[j].moduleId() != digi_view[i].moduleId()) {
 116  // Found a module boundary: count the number of modules in clus_view[0].moduleStart()
 117  auto loc = alpaka::atomicInc(acc,
 118  &clus_view[0].moduleStart(),
 119  static_cast<uint32_t>(::pixelClustering::maxNumModules),
// NOTE(review): the tail of this atomicInc call (doxygen lines 120-121,
// presumably the hierarchy tag and closing parenthesis) is missing.
 122 #ifdef GPU_DEBUG
 123  printf("> New module (no. %d) found at digi %d \n", loc, i);
 124 #endif
 125  clus_view[loc + 1].moduleStart() = i;
 126  }
 127  }
 128  }
 129  };
130 
// Kernel: one block per module. Builds a per-column histogram of the module's
// digis, links neighbouring pixels (|dx| <= 1, dy in {0, 1}), iteratively
// propagates the minimum cluster id through each connected component, then
// renumbers clusters 0..N-1 and stores the per-module cluster count.
// NOTE(review): the extraction dropped many source lines in this struct
// (gaps in the embedded doxygen numbering, e.g. 145, 153, 157, 171, 195, 214,
// 220, 280-292, 329, 455), leaving some conditions and braces incomplete --
// verify every flagged spot against the upstream header before editing.
 131  template <typename TrackerTraits>
 132  struct FindClus {
 133  // assume that we can cover the whole module with up to 16 blockDimension-wide iterations
 134  static constexpr uint32_t maxIterGPU = 16;
 135 
 136  // this must be larger than maxPixInModule / maxIterGPU, and should be a multiple of the warp size
 137  static constexpr uint32_t maxElementsPerBlock =
 138  cms::alpakatools::round_up_by(TrackerTraits::maxPixInModule / maxIterGPU, 128);
 139 
 140  template <typename TAcc>
 141  ALPAKA_FN_ACC void operator()(const TAcc& acc,
 142  SiPixelDigisSoAView digi_view,
 143  SiPixelClustersSoAView clus_view,
 144  const unsigned int numElements) const {
// NOTE(review): doxygen line 145 is missing here (presumably a Phase-2 trait
// constant used by the `if constexpr (not isPhase2)` below) -- confirm.
 146 
// Shared across the block: index of the first digi NOT belonging to the
// current module.
 147  auto& lastPixel = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
 148 
 149  const uint32_t lastModule = clus_view[0].moduleStart();
 150  for (uint32_t module : cms::alpakatools::independent_groups(acc, lastModule)) {
 151  auto firstPixel = clus_view[1 + module].moduleStart();
 152  uint32_t thisModuleId = digi_view[firstPixel].moduleId();
// NOTE(review): doxygen line 153 is missing (presumably an assertion on
// thisModuleId) -- confirm.
 154 
 155 #ifdef GPU_DEBUG
 156  if (thisModuleId % 100 == 1)
// NOTE(review): doxygen line 157 is missing (presumably a once-per-block
// guard for this printf) -- confirm.
 158  printf("start clusterizer for module %4d in block %4d\n",
 159  thisModuleId,
 160  alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
 161 #endif
 162 
 163  // find the index of the first pixel not belonging to this module (or invalid)
 164  lastPixel = numElements;
 165  alpaka::syncBlockThreads(acc);
 166 
 167  // skip threads not associated to an existing pixel
 168  for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, numElements)) {
 169  auto id = digi_view[i].moduleId();
 170  // skip invalid pixels
// NOTE(review): doxygen line 171 is missing (presumably the invalid-id check
// guarding this `continue`) -- as dumped, the continue is unconditional.
 172  continue;
 173  // find the first pixel in a different module
 174  if (id != thisModuleId) {
 175  alpaka::atomicMin(acc, &lastPixel, i, alpaka::hierarchy::Threads{});
 176  break;
 177  }
 178  }
 179 
// Shared histogram of digis binned by column (yy), plus a 32-counter
// workspace for its prefix scan.
 180  using Hist = cms::alpakatools::HistoContainer<uint16_t,
 181  TrackerTraits::clusterBinning,
 182  TrackerTraits::maxPixInModule,
 183  TrackerTraits::clusterBits,
 184  uint16_t>;
 185  auto& hist = alpaka::declareSharedVar<Hist, __COUNTER__>(acc);
 186  auto& ws = alpaka::declareSharedVar<typename Hist::Counter[32], __COUNTER__>(acc);
 187  for (uint32_t j : cms::alpakatools::independent_group_elements(acc, Hist::totbins())) {
 188  hist.off[j] = 0;
 189  }
 190  alpaka::syncBlockThreads(acc);
 191 
 192  ALPAKA_ASSERT_ACC((lastPixel == numElements) or
 193  ((lastPixel < numElements) and (digi_view[lastPixel].moduleId() != thisModuleId)));
 194  // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer)
// NOTE(review): doxygen line 195 is missing (presumably a once-per-block
// guard opening a scope; the extra `}` after this `if` closes it) -- confirm.
 196  if (lastPixel - firstPixel > TrackerTraits::maxPixInModule) {
 197  printf("too many pixels in module %u: %u > %u\n",
 198  thisModuleId,
 199  lastPixel - firstPixel,
 200  TrackerTraits::maxPixInModule);
 201  lastPixel = TrackerTraits::maxPixInModule + firstPixel;
 202  }
 203  }
 204  alpaka::syncBlockThreads(acc);
 205  ALPAKA_ASSERT_ACC(lastPixel - firstPixel <= TrackerTraits::maxPixInModule);
 206 
 207 #ifdef GPU_DEBUG
 208  auto& totGood = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
 209  totGood = 0;
 210  alpaka::syncBlockThreads(acc);
 211 #endif
 212 
 213  // remove duplicate pixels
// NOTE(review): doxygen line 214 is missing (presumably the definition of
// `isPhase2` tested just below) -- confirm.
 215  if constexpr (not isPhase2) {
 216  // packed words array used to store the pixelStatus of each pixel
 217  auto& status = alpaka::declareSharedVar<uint32_t[pixelStatus::size], __COUNTER__>(acc);
 218 
 219  if (lastPixel > 1) {
// NOTE(review): doxygen line 220 is missing (presumably the loop header over
// pixelStatus::size that zeroes `status[i]`) -- confirm.
 221  status[i] = 0;
 222  }
 223  alpaka::syncBlockThreads(acc);
 224 
// First pass: mark every valid pixel in the packed status array; a second
// sighting of the same (xx, yy) promotes it to duplicate.
 225  for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel - 1)) {
 226  // skip invalid pixels
 227  if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
 228  continue;
 229  pixelStatus::promote(acc, status, digi_view[i].xx(), digi_view[i].yy());
 230  }
 231  alpaka::syncBlockThreads(acc);
 232 
// Second pass: invalidate every digi whose pixel was flagged as duplicate.
 233  for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel - 1)) {
 234  // skip invalid pixels
 235  if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
 236  continue;
 237  if (pixelStatus::isDuplicate(status, digi_view[i].xx(), digi_view[i].yy())) {
 238  digi_view[i].moduleId() = ::pixelClustering::invalidModuleId;
 239  digi_view[i].rawIdArr() = 0;
 240  }
 241  }
 242  alpaka::syncBlockThreads(acc);
 243  }
 244  }
 245 
 246  // fill histo
 247  for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
 248  // skip invalid pixels
 249  if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) {
 250  hist.count(acc, digi_view[i].yy());
 251 #ifdef GPU_DEBUG
 252  alpaka::atomicAdd(acc, &totGood, 1u, alpaka::hierarchy::Blocks{});
 253 #endif
 254  }
 255  }
 256  alpaka::syncBlockThreads(acc); // FIXME this can be removed
 257  for (uint32_t i : cms::alpakatools::independent_group_elements(acc, 32u)) {
 258  ws[i] = 0; // used by prefix scan...
 259  }
 260  alpaka::syncBlockThreads(acc);
 261  hist.finalize(acc, ws);
 262  alpaka::syncBlockThreads(acc);
 263 #ifdef GPU_DEBUG
 264  ALPAKA_ASSERT_ACC(hist.size() == totGood);
 265  if (thisModuleId % 100 == 1)
// NOTE(review): doxygen line 266 is missing (presumably a once-per-block
// guard for this printf) -- confirm.
 267  printf("histo size %d\n", hist.size());
 268 #endif
 269  for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
 270  // skip invalid pixels
 271  if (digi_view[i].moduleId() != ::pixelClustering::invalidModuleId) {
 272  hist.fill(acc, digi_view[i].yy(), i - firstPixel);
 273  }
 274  }
 275 
 276 #ifdef GPU_DEBUG
 277  // look for anomalous high occupancy
 278  auto& n40 = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
 279  auto& n60 = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
// NOTE(review): doxygen lines 280, 285, 287, 289 and 292 are missing in this
// debug section (guards, the loop header over histogram bins, and the
// atomicAdd bodies of the two occupancy counters) -- confirm upstream.
 281  n40 = 0;
 282  n60 = 0;
 283  }
 284  alpaka::syncBlockThreads(acc);
 286  if (hist.size(j) > 60)
 288  if (hist.size(j) > 40)
 290  }
 291  alpaka::syncBlockThreads(acc);
 293  if (n60 > 0)
 294  printf("columns with more than 60 px %d in %d\n", n60, thisModuleId);
 295  else if (n40 > 0)
 296  printf("columns with more than 40 px %d in %d\n", n40, thisModuleId);
 297  }
 298  alpaka::syncBlockThreads(acc);
 299 #endif
 300 
 301  [[maybe_unused]] const uint32_t blockDimension = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
 302  // assume that we can cover the whole module with up to maxIterGPU blockDimension-wide iterations
 303  ALPAKA_ASSERT_ACC((hist.size() / blockDimension) < maxIterGPU);
 304 
 305  // number of elements per thread
 306  constexpr uint32_t maxElements =
 307  cms::alpakatools::requires_single_thread_per_block_v<TAcc> ? maxElementsPerBlock : 1;
 308  ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u] <= maxElements));
 309 
 310  constexpr unsigned int maxIter = maxIterGPU * maxElements;
 311 
 312  // nearest neighbours (nn)
 313  // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event
 314  constexpr int maxNeighbours = 10;
 315  uint16_t nn[maxIter][maxNeighbours];
 316  uint8_t nnn[maxIter]; // number of nn
 317  for (uint32_t k = 0; k < maxIter; ++k) {
 318  nnn[k] = 0;
 319  }
 320 
 321  alpaka::syncBlockThreads(acc); // for hit filling!
 322 
 323  // fill the nearest neighbours
// For each digi handled by this thread (indexed by k across iterations),
// scan the histogram bin of the next column for pixels within +/- 1 in x.
 324  uint32_t k = 0;
 325  for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) {
 326  ALPAKA_ASSERT_ACC(k < maxIter);
 327  auto p = hist.begin() + j;
 328  auto i = *p + firstPixel;
// NOTE(review): doxygen line 329 is missing (presumably an assertion that i
// is below numElements) -- confirm.
 330  ALPAKA_ASSERT_ACC(digi_view[i].moduleId() == thisModuleId); // same module
 331  auto bin = Hist::bin(digi_view[i].yy() + 1);
 332  auto end = hist.end(bin);
 333  ++p;
 334  ALPAKA_ASSERT_ACC(0 == nnn[k]);
 335  for (; p < end; ++p) {
 336  auto m = *p + firstPixel;
 337  ALPAKA_ASSERT_ACC(m != i);
 338  ALPAKA_ASSERT_ACC(int(digi_view[m].yy()) - int(digi_view[i].yy()) >= 0);
 339  ALPAKA_ASSERT_ACC(int(digi_view[m].yy()) - int(digi_view[i].yy()) <= 1);
 340  if (std::abs(int(digi_view[m].xx()) - int(digi_view[i].xx())) <= 1) {
 341  auto l = nnn[k]++;
 342  ALPAKA_ASSERT_ACC(l < maxNeighbours);
 343  nn[k][l] = *p;
 344  }
 345  }
 346  ++k;
 347  }
 348 
 349  // for each pixel, look at all the pixels until the end of the module;
 350  // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum;
 351  // after the loop, all the pixel in each cluster should have the id equeal to the lowest
 352  // pixel in the cluster ( clus[i] == i ).
 353  bool more = true;
 354  /*
 355  int nloops = 0;
 356  */
// Iterate until no thread in the block changed any cluster id.
 357  while (alpaka::syncBlockThreadsPredicate<alpaka::BlockOr>(acc, more)) {
 358  /*
 359  if (nloops % 2 == 0) {
 360  // even iterations of the outer loop
 361  */
 362  more = false;
 363  uint32_t k = 0;
 364  for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) {
 365  ALPAKA_ASSERT_ACC(k < maxIter);
 366  auto p = hist.begin() + j;
 367  auto i = *p + firstPixel;
 368  for (int kk = 0; kk < nnn[k]; ++kk) {
 369  auto l = nn[k][kk];
 370  auto m = l + firstPixel;
 371  ALPAKA_ASSERT_ACC(m != i);
 372  // FIXME ::Threads ?
 373  auto old = alpaka::atomicMin(acc, &digi_view[m].clus(), digi_view[i].clus(), alpaka::hierarchy::Blocks{});
 374  if (old != digi_view[i].clus()) {
 375  // end the loop only if no changes were applied
 376  more = true;
 377  }
 378  // FIXME ::Threads ?
 379  alpaka::atomicMin(acc, &digi_view[i].clus(), old, alpaka::hierarchy::Blocks{});
 380  } // neighbours loop
 381  ++k;
 382  } // pixel loop
 383  /*
 384  // use the outer loop to force a synchronisation
 385  } else {
 386  // odd iterations of the outer loop
 387  */
 388  alpaka::syncBlockThreads(acc);
// Path compression: point each digi directly at its cluster representative.
 389  for (uint32_t j : cms::alpakatools::independent_group_elements(acc, hist.size())) {
 390  auto p = hist.begin() + j;
 391  auto i = *p + firstPixel;
 392  auto m = digi_view[i].clus();
 393  while (m != digi_view[m].clus())
 394  m = digi_view[m].clus();
 395  digi_view[i].clus() = m;
 396  }
 397  /*
 398  }
 399  ++nloops;
 400  */
 401  } // end while
 402 
 403  /*
 404  // check that all threads in the block have executed the same number of iterations
 405  auto& n0 = alpaka::declareSharedVar<int, __COUNTER__>(acc);
 406  if (cms::alpakatools::once_per_block(acc))
 407  n0 = nloops;
 408  alpaka::syncBlockThreads(acc);
 409  ALPAKA_ASSERT_ACC(alpaka::syncBlockThreadsPredicate<alpaka::BlockAnd>(acc, nloops == n0));
 410  if (thisModuleId % 100 == 1)
 411  if (cms::alpakatools::once_per_block(acc))
 412  printf("# loops %d\n", nloops);
 413  */
 414 
 415  auto& foundClusters = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
 416  foundClusters = 0;
 417  alpaka::syncBlockThreads(acc);
 418 
 419  // find the number of different clusters, identified by a pixels with clus[i] == i;
 420  // mark these pixels with a negative id.
 421  for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
 422  // skip invalid pixels
 423  if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
 424  continue;
 425  if (digi_view[i].clus() == static_cast<int>(i)) {
 426  auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{});
 427  digi_view[i].clus() = -(old + 1);
 428  }
 429  }
 430  alpaka::syncBlockThreads(acc);
 431 
 432  // propagate the negative id to all the pixels in the cluster.
 433  for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
 434  // skip invalid pixels
 435  if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId)
 436  continue;
 437  if (digi_view[i].clus() >= 0) {
 438  // mark each pixel in a cluster with the same id as the first one
 439  digi_view[i].clus() = digi_view[digi_view[i].clus()].clus();
 440  }
 441  }
 442  alpaka::syncBlockThreads(acc);
 443 
 444  // adjust the cluster id to be a positive value starting from 0
 445  for (uint32_t i : cms::alpakatools::independent_group_elements(acc, firstPixel, lastPixel)) {
 446  if (digi_view[i].moduleId() == ::pixelClustering::invalidModuleId) {
 447  // mark invalid pixels with an invalid cluster index
 448  digi_view[i].clus() = ::pixelClustering::invalidClusterId;
 449  } else {
 450  digi_view[i].clus() = -digi_view[i].clus() - 1;
 451  }
 452  }
 453  alpaka::syncBlockThreads(acc);
 454 
// NOTE(review): doxygen line 455 is missing (presumably a once-per-block
// guard opening the scope closed by the `}` before the module-loop end).
 456  clus_view[thisModuleId].clusInModule() = foundClusters;
 457  clus_view[module].moduleId() = thisModuleId;
 458 #ifdef GPU_DEBUG
 459  if (foundClusters > gMaxHit<TAcc>) {
 460  gMaxHit<TAcc> = foundClusters;
 461  if (foundClusters > 8)
 462  printf("max hit %d in %d\n", foundClusters, thisModuleId);
 463  }
 464  if (thisModuleId % 100 == 1)
 465  printf("%d clusters in module %d\n", foundClusters, thisModuleId);
 466 #endif
 467  }
 468  } // module loop
 469  }
 470  };
471 
472 } // namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering
473 
474 #endif // plugin_SiPixelClusterizer_alpaka_PixelClustering.h
cms::alpakatools::HistoContainer< uint8_t, 256, 16000, 8, uint16_t > Hist
ALPAKA_FN_ACC constexpr bool once_per_block(TAcc const &acc)
ALPAKA_FN_ACC auto uniform_elements(TAcc const &acc, TArgs... args)
Definition: workdivision.h:311
static constexpr uint16_t numColsInModule
ALPAKA_FN_ACC auto independent_group_elements(TAcc const &acc, TArgs... args)
static constexpr uint16_t numRowsInModule
ALPAKA_FN_ACC auto independent_groups(TAcc const &acc, TArgs... args)
constexpr uint16_t numberOfModules
ALPAKA_FN_ACC void operator()(const TAcc &acc, SiPixelDigisSoAView digi_view, SiPixelClustersSoAView clus_view, const unsigned int numElements) const
ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getIndex(uint16_t x, uint16_t y)
ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr void promote(TAcc const &acc, uint32_t *__restrict__ status, const uint16_t x, const uint16_t y)
ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr uint32_t getShift(uint16_t x, uint16_t y)
T1 atomicInc(T1 *a, T2 b)
Definition: cudaCompat.h:48
The Signals That Services Can Subscribe To This is based on ActivityRegistry and is current per Services can connect to the signals distributed by the ActivityRegistry in order to monitor the activity of the application Each possible callback has some defined which we here list in angle e< void, edm::EventID const &, edm::Timestamp const & > We also list in braces which AR_WATCH_USING_METHOD_ is used for those or
Definition: Activities.doc:12
constexpr Idx round_up_by(Idx value, Idx divisor)
Definition: workdivision.h:17
std::vector< Block > Blocks
Definition: Block.h:99
Abs< T >::type abs(const T &t)
Definition: Abs.h:22
constexpr uint16_t invalidModuleId
uint16_t *__restrict__ uint16_t const *__restrict__ uint32_t const *__restrict__ uint32_t *__restrict__ uint32_t const *__restrict__ int32_t *__restrict__ uint32_t numElements
uint16_t *__restrict__ uint16_t const *__restrict__ uint32_t const *__restrict__ moduleStart
ALPAKA_FN_ACC void operator()(const TAcc &acc, SiPixelDigisSoAView digi_view, SiPixelClustersSoAView clus_view, const unsigned int numElements) const
ALPAKA_FN_ACC constexpr bool once_per_grid(TAcc const &acc)
static unsigned int const shift
uint16_t *__restrict__ uint16_t const *__restrict__ uint32_t const *__restrict__ uint32_t *__restrict__ uint32_t const *__restrict__ moduleId
float x
constexpr int invalidClusterId
constexpr uint16_t maxNumModules
ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr bool isDuplicate(uint32_t const *__restrict__ status, uint16_t x, uint16_t y)
ALPAKA_FN_ACC ALPAKA_FN_INLINE constexpr Status getStatus(uint32_t const *__restrict__ status, uint16_t x, uint16_t y)
T1 atomicMin(T1 *a, T2 b)
Definition: cudaCompat.h:85
T1 atomicAdd(T1 *a, T2 b)
Definition: cudaCompat.h:61
constexpr uint16_t invalidModuleId