gpuClustering.h

#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h

#include <cstdint>
#include <cstdio>

namespace gpuClustering {

  // Phase-1 pixel modules
  constexpr uint32_t pixelSizeX = 160;
  constexpr uint32_t pixelSizeY = 416;

  namespace pixelStatus {
    // Use 0x00, 0x01, 0x03 so each can be OR'ed on top of the previous ones
    enum Status : uint32_t { kEmpty = 0x00, kFound = 0x01, kDuplicate = 0x03 };

    constexpr uint32_t bits = 2;
    constexpr uint32_t mask = (0x01 << bits) - 1;
    constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits;
    constexpr uint32_t size = pixelSizeX * pixelSizeY / valuesPerWord;
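    // 2 bits per pixel and 16 pixel statuses per 32-bit word: a Phase-1 module
    // (160 x 416 pixels) needs 160 * 416 / 16 = 4160 words, i.e. about 16 kB of shared memory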

    __device__ static constexpr inline uint32_t getIndex(uint16_t x, uint16_t y) {
      return (pixelSizeX * y + x) / valuesPerWord;
    }

    __device__ constexpr inline uint32_t getShift(uint16_t x, uint16_t y) { return (x % valuesPerWord) * 2; }

    __device__ constexpr inline Status getStatus(uint32_t const* __restrict__ status, uint16_t x, uint16_t y) {
      uint32_t index = getIndex(x, y);
      uint32_t shift = getShift(x, y);
      return Status{(status[index] >> shift) & mask};
    }

    __device__ constexpr inline bool isDuplicate(uint32_t const* __restrict__ status, uint16_t x, uint16_t y) {
      return getStatus(status, x, y) == kDuplicate;
    }

    __device__ constexpr inline void promote(uint32_t* __restrict__ status, const uint16_t x, const uint16_t y) {
      uint32_t index = getIndex(x, y);
      uint32_t shift = getShift(x, y);
      uint32_t old_word = status[index];
      uint32_t expected = old_word;
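      // compare-and-swap retry loop: re-read the word returned by atomicCAS and retry until this
      // pixel's 2-bit field has been published; the kEmpty -> kFound -> kDuplicate transitions only
      // set bits, so concurrent writers cannot undo each other's promotion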
      do {
        expected = old_word;
        Status old_status{(old_word >> shift) & mask};
        if (kDuplicate == old_status) {
          // nothing to do
          return;
        }
        Status new_status = (kEmpty == old_status) ? kFound : kDuplicate;
        uint32_t new_word = old_word | (static_cast<uint32_t>(new_status) << shift);
        old_word = atomicCAS(&status[index], expected, new_word);
      } while (expected != old_word);
    }

  }  // namespace pixelStatus

#ifdef GPU_DEBUG
  __device__ uint32_t gMaxHit = 0;
#endif

  template <bool isPhase2>
  __global__ void countModules(uint16_t const* __restrict__ id,
                               uint32_t* __restrict__ moduleStart,
                               int32_t* __restrict__ clusterId,
                               int numElements) {
    int first = blockDim.x * blockIdx.x + threadIdx.x;
    [[maybe_unused]] constexpr int nMaxModules =
        isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
    assert(nMaxModules < maxNumModules);
    for (int i = first; i < numElements; i += gridDim.x * blockDim.x) {
      clusterId[i] = i;
      if (invalidModuleId == id[i])
        continue;
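      // walk back over any invalid pixels; if the previous valid pixel belongs to a different
      // module (or there is none), pixel i is the first pixel of a new module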
      auto j = i - 1;
      while (j >= 0 and id[j] == invalidModuleId)
        --j;
      if (j < 0 or id[j] != id[i]) {
        // boundary...
        auto loc = atomicInc(moduleStart, nMaxModules);
        moduleStart[loc + 1] = i;
      }
    }
  }

  template <bool isPhase2>
  __global__ void findClus(uint32_t* __restrict__ rawIdArr,
                           uint16_t* __restrict__ id,                 // module id of each pixel
                           uint16_t const* __restrict__ x,            // local coordinates of each pixel
                           uint16_t const* __restrict__ y,            //
                           uint32_t const* __restrict__ moduleStart,  // index of the first pixel of each module
                           uint32_t* __restrict__ nClustersInModule,  // output: number of clusters found in each module
                           uint32_t* __restrict__ moduleId,           // output: module id of each module
                           int32_t* __restrict__ clusterId,           // output: cluster id of each pixel
                           int numElements) {
    // status is only used for Phase-1, but it cannot be declared conditionally, i.e. only when isPhase2 is false;
    // to minimize the impact on the Phase-2 reconstruction it is declared with a very small size.
    constexpr const uint32_t pixelStatusSize = isPhase2 ? 1 : pixelStatus::size;
    __shared__ uint32_t status[pixelStatusSize];  // packed words array used to store the PixelStatus of each pixel
    __shared__ int msize;

    auto firstModule = blockIdx.x;
    auto endModule = moduleStart[0];

    [[maybe_unused]] constexpr int nMaxModules =
        isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
    assert(nMaxModules < maxNumModules);

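    // grid-stride loop over the modules: each block processes one module at a time;
    // moduleStart[0] holds the number of modules found by countModules, and
    // moduleStart[1 + module] the index of the first pixel of each module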
    for (auto module = firstModule; module < endModule; module += gridDim.x) {
      auto firstPixel = moduleStart[1 + module];
      auto thisModuleId = id[firstPixel];
      assert(thisModuleId < nMaxModules);

#ifdef GPU_DEBUG
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x);
#endif

      auto first = firstPixel + threadIdx.x;

      // find the index of the first pixel not belonging to this module (or invalid)
      msize = numElements;
      __syncthreads();

      // skip threads not associated to an existing pixel
      for (int i = first; i < numElements; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (id[i] != thisModuleId) {  // find the first pixel in a different module
          atomicMin(&msize, i);
          break;
        }
      }

      // init hist (ymax=416 < 512 : 9 bits)
      // 6000 max pixels required for HI operations with no measurable impact on pp performance
      constexpr uint32_t maxPixInModule = 6000;
      constexpr auto nbins = isPhase2 ? 1024 : phase1PixelTopology::numColsInModule + 2;  //2+2;
      constexpr auto nbits = isPhase2 ? 10 : 9;                                           //2+2;
      using Hist = cms::cuda::HistoContainer<uint16_t, nbins, maxPixInModule, nbits, uint16_t>;
      __shared__ Hist hist;
      __shared__ typename Hist::Counter ws[32];
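      // hist bins the pixels of this module by column (y), so that the nearest-neighbour
      // search below only has to scan a pixel's own column and the next one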
      for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
        hist.off[j] = 0;
      }
      __syncthreads();

      assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId)));

      // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer)
      if (0 == threadIdx.x) {
        if (msize - firstPixel > maxPixInModule) {
          printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule);
          msize = maxPixInModule + firstPixel;
        }
      }

      __syncthreads();
      assert(msize - firstPixel <= maxPixInModule);

#ifdef GPU_DEBUG
      __shared__ uint32_t totGood;
      totGood = 0;
      __syncthreads();
#endif

      // remove duplicate pixels: a first pass marks each (x, y) as found or duplicate in the
      // packed status words, a second pass invalidates the pixels flagged as duplicates
      if constexpr (not isPhase2) {
        if (msize > 1) {
          for (uint32_t i = threadIdx.x; i < pixelStatus::size; i += blockDim.x) {
            status[i] = 0;
          }
          __syncthreads();
          for (int i = first; i < msize - 1; i += blockDim.x) {
            // skip invalid pixels
            if (id[i] == invalidModuleId)
              continue;
            pixelStatus::promote(status, x[i], y[i]);
          }
          __syncthreads();
          for (int i = first; i < msize - 1; i += blockDim.x) {
            // skip invalid pixels
            if (id[i] == invalidModuleId)
              continue;
            if (pixelStatus::isDuplicate(status, x[i], y[i])) {
              // printf("found dup %d %d %d %d\n", i, id[i], x[i], y[i]);
              id[i] = invalidModuleId;
              rawIdArr[i] = 0;
            }
          }
          __syncthreads();
        }
      }

      // fill histo: first pass counts the number of pixels in each column (y)
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.count(y[i]);
#ifdef GPU_DEBUG
        atomicAdd(&totGood, 1);
#endif
      }
      __syncthreads();
      if (threadIdx.x < 32)
        ws[threadIdx.x] = 0;  // used by prefix scan...
      __syncthreads();
      hist.finalize(ws);
      __syncthreads();
#ifdef GPU_DEBUG
      assert(hist.size() == totGood);
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("histo size %d\n", hist.size());
#endif
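      // second pass: store each pixel's offset within the module (i - firstPixel) in its column's bin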
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.fill(y[i], i - firstPixel);
      }

#ifdef __CUDA_ARCH__
      // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations
      constexpr int maxiter = 16;
      if (threadIdx.x == 0 && (hist.size() / blockDim.x) >= maxiter)
        printf("THIS IS NOT SUPPOSED TO HAPPEN too many hits in module %d: %d for block size %d\n",
               thisModuleId,
               hist.size(),
               blockDim.x);
#else
      auto maxiter = hist.size();
#endif
      // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event
      constexpr int maxNeighbours = 10;
      assert((hist.size() / blockDim.x) <= maxiter);
      // nearest neighbours: nn[k][...] holds, for the k-th pixel handled by this thread,
      // the offsets within the module of its neighbour pixels
      uint16_t nn[maxiter][maxNeighbours];
      uint8_t nnn[maxiter];  // number of nn
      for (uint32_t k = 0; k < maxiter; ++k)
        nnn[k] = 0;

      __syncthreads();  // for hit filling!

#ifdef GPU_DEBUG
      // look for anomalous high occupancy
      __shared__ uint32_t n40, n60;
      n40 = n60 = 0;
      __syncthreads();
      for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) {
        if (hist.size(j) > 60)
          atomicAdd(&n60, 1);
        if (hist.size(j) > 40)
          atomicAdd(&n40, 1);
      }
      __syncthreads();
      if (0 == threadIdx.x) {
        if (n60 > 0)
          printf("columns with more than 60 px %d in %d\n", n60, thisModuleId);
        else if (n40 > 0)
          printf("columns with more than 40 px %d in %d\n", n40, thisModuleId);
      }
      __syncthreads();
#endif

      // fill NN: for each pixel, scan the rest of its column and the next one, and record the
      // pixels within one unit in both x and y as its nearest neighbours
      for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
        assert(k < maxiter);
        auto p = hist.begin() + j;
        auto i = *p + firstPixel;
        assert(id[i] != invalidModuleId);
        assert(id[i] == thisModuleId);  // same module
        int be = Hist::bin(y[i] + 1);
        auto e = hist.end(be);
        ++p;
        assert(0 == nnn[k]);
        for (; p < e; ++p) {
          auto m = (*p) + firstPixel;
          assert(m != i);
          assert(int(y[m]) - int(y[i]) >= 0);
          assert(int(y[m]) - int(y[i]) <= 1);
          if (std::abs(int(x[m]) - int(x[i])) > 1)
            continue;
          auto l = nnn[k]++;
          assert(l < maxNeighbours);
          nn[k][l] = *p;
        }
      }

      // for each pixel, look at all the pixels until the end of the module;
      // when two valid pixels within +/- 1 in both x and y are found, set their id to the minimum;
      // after the loop, all the pixels in each cluster should have the id equal to the lowest
      // pixel in the cluster ( clus[i] == i ).
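      // e.g. three neighbouring pixels entering the loop with clusterId {5, 6, 7} converge to
      // {5, 5, 5}: the even passes pull every pixel's id down to the smallest id among its
      // neighbours, and the odd passes compress the chains (clusterId[i] = root), so only a few
      // iterations are needed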
      bool more = true;
      int nloops = 0;
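      // iterate until no thread changes any clusterId: __syncthreads_or keeps the whole block
      // in the loop as long as at least one thread still sets more = true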
      while (__syncthreads_or(more)) {
        if (1 == nloops % 2) {
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            auto m = clusterId[i];
            while (m != clusterId[m])
              m = clusterId[m];
            clusterId[i] = m;
          }
        } else {
          more = false;
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            for (int kk = 0; kk < nnn[k]; ++kk) {
              auto l = nn[k][kk];
              auto m = l + firstPixel;
              assert(m != i);
              auto old = atomicMin_block(&clusterId[m], clusterId[i]);
              // do we need memory fence?
              if (old != clusterId[i]) {
                // end the loop only if no changes were applied
                more = true;
              }
              atomicMin_block(&clusterId[i], old);
            }  // nnloop
          }    // pixel loop
        }
        ++nloops;
      }  // end while

#ifdef GPU_DEBUG
      {
        __shared__ int n0;
        if (threadIdx.x == 0)
          n0 = nloops;
        __syncthreads();
        auto ok = n0 == nloops;
        assert(__syncthreads_and(ok));
        if (thisModuleId % 100 == 1)
          if (threadIdx.x == 0)
            printf("# loops %d\n", nloops);
      }
#endif

      __shared__ unsigned int foundClusters;
      foundClusters = 0;
      __syncthreads();

      // find the number of different clusters, identified by the pixels with clus[i] == i;
      // mark these pixels with a negative id.
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] == i) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          clusterId[i] = -(old + 1);
        }
      }
      __syncthreads();

      // propagate the negative id to all the pixels in the cluster.
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] >= 0) {
          // mark each pixel in a cluster with the same id as the first one
          clusterId[i] = clusterId[clusterId[i]];
        }
      }
      __syncthreads();

      // adjust the cluster id to be a positive value starting from 0
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId) {  // skip invalid pixels
          clusterId[i] = invalidClusterId;
          continue;
        }
        clusterId[i] = -clusterId[i] - 1;
      }
      __syncthreads();

      if (threadIdx.x == 0) {
        nClustersInModule[thisModuleId] = foundClusters;
        moduleId[module] = thisModuleId;
#ifdef GPU_DEBUG
        if (foundClusters > gMaxHit) {
          gMaxHit = foundClusters;
          if (foundClusters > 8)
            printf("max hit %d in %d\n", foundClusters, thisModuleId);
        }
#endif
#ifdef GPU_DEBUG
        if (thisModuleId % 100 == 1)
          printf("%d clusters in module %d\n", foundClusters, thisModuleId);
#endif
      }
    }  // module loop
  }
}  // namespace gpuClustering

#endif  // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
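
The two kernels are meant to be chained: countModules uses moduleStart[0] as an atomic module counter (so it must start from zero) and initializes clusterId[i] = i, and findClus then clusters one module per block. The sketch below shows one plausible host-side launch sequence for the Phase-1 case; the buffer names, launch parameters and memory management are illustrative placeholders, not the actual CMSSW SiPixelRawToCluster workflow code.

#include <cuda_runtime.h>

#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"

// Illustrative only: the digi buffers (id, x, y, rawId) are assumed to be device pointers,
// already filled and ordered by module, as the kernels above expect; maxNumModules is the
// constant from gpuClusteringConstants.h used throughout this file.
void runClusterizerSketch(
    uint16_t* id, uint16_t* x, uint16_t* y, uint32_t* rawId, int nDigis, cudaStream_t stream) {
  using namespace gpuClustering;

  uint32_t* moduleStart;   // [0]: module counter; [1 + m]: index of the first digi of module m
  uint32_t* clusInModule;  // number of clusters found in each module
  uint32_t* moduleId;      // id of each module
  int32_t* clusterId;      // cluster id of each digi
  cudaMalloc(&moduleStart, (maxNumModules + 1) * sizeof(uint32_t));
  cudaMalloc(&clusInModule, maxNumModules * sizeof(uint32_t));
  cudaMalloc(&moduleId, maxNumModules * sizeof(uint32_t));
  cudaMalloc(&clusterId, nDigis * sizeof(int32_t));
  cudaMemsetAsync(moduleStart, 0, sizeof(uint32_t), stream);  // zero the atomic module counter
  cudaMemsetAsync(clusInModule, 0, maxNumModules * sizeof(uint32_t), stream);

  constexpr int threadsPerBlock = 256;
  int blocks = (nDigis + threadsPerBlock - 1) / threadsPerBlock;
  countModules<false><<<blocks, threadsPerBlock, 0, stream>>>(id, moduleStart, clusterId, nDigis);

  // one block per (potential) module; findClus grid-strides over the modules actually found
  findClus<false><<<maxNumModules, threadsPerBlock, 0, stream>>>(
      rawId, id, x, y, moduleStart, clusInModule, moduleId, clusterId, nDigis);
}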