
gpuClustering.h
1 #ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
2 #define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
3 
4 #include <cstdint>
5 #include <cstdio>
6 
 7 #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
 8 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 9 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 10 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
 11 
 12 
13 //#define GPU_DEBUG
14 
15 namespace gpuClustering {
16 
17  // Phase-1 pixel modules
18  constexpr uint32_t pixelSizeX = 160;
19  constexpr uint32_t pixelSizeY = 416;
20 
21  namespace pixelStatus {
22  // Use 0x00, 0x01, 0x03 so each can be OR'ed on top of the previous ones
23  enum Status : uint32_t { kEmpty = 0x00, kFound = 0x01, kDuplicate = 0x03 };
24 
25  constexpr uint32_t bits = 2;
26  constexpr uint32_t mask = (0x01 << bits) - 1;
27  constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits;
28  constexpr uint32_t size = pixelSizeX * pixelSizeY / valuesPerWord;
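  // i.e. 2 bits per pixel, 16 pixels per 32-bit word: the 160 x 416 pixels of a module fit in 4160 words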
29 
30  __device__ static constexpr inline uint32_t getIndex(uint16_t x, uint16_t y) {
31  return (pixelSizeX * y + x) / valuesPerWord;
32  }
33 
34  __device__ constexpr inline uint32_t getShift(uint16_t x, uint16_t y) { return (x % valuesPerWord) * 2; }
35 
36  __device__ constexpr inline Status getStatus(uint32_t const* __restrict__ status, uint16_t x, uint16_t y) {
37  uint32_t index = getIndex(x, y);
38  uint32_t shift = getShift(x, y);
39  return Status{(status[index] >> shift) & mask};
40  }
41 
42  __device__ constexpr inline bool isDuplicate(uint32_t const* __restrict__ status, uint16_t x, uint16_t y) {
43  return getStatus(status, x, y) == kDuplicate;
44  }
45 
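  // promote() atomically raises the 2-bit status of pixel (x, y): kEmpty -> kFound on the first
  // occurrence, kFound -> kDuplicate on a second one; the CAS loop retries until the packed word
  // is updated, or returns early if the pixel is already flagged as a duplicate.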
46  __device__ constexpr inline void promote(uint32_t* __restrict__ status, const uint16_t x, const uint16_t y) {
47  uint32_t index = getIndex(x, y);
48  uint32_t shift = getShift(x, y);
49  uint32_t old_word = status[index];
50  uint32_t expected = old_word;
51  do {
52  expected = old_word;
53  Status old_status{(old_word >> shift) & mask};
54  if (kDuplicate == old_status) {
55  // nothing to do
56  return;
57  }
58  Status new_status = (kEmpty == old_status) ? kFound : kDuplicate;
59  uint32_t new_word = old_word | (static_cast<uint32_t>(new_status) << shift);
60  old_word = atomicCAS(&status[index], expected, new_word);
61  } while (expected != old_word);
62  }
63 
64  } // namespace pixelStatus
65 
66 #ifdef GPU_DEBUG
67  __device__ uint32_t gMaxHit = 0;
68 #endif
69 
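  // countModules: initialise clusterId[i] = i and detect module boundaries: a pixel whose module id
  // differs from that of the previous valid pixel is the first pixel of a module. moduleStart[0]
  // counts the modules found; the index of each such first pixel is stored in moduleStart[1 + loc].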
70  template <typename TrackerTraits>
71  __global__ void countModules(uint16_t const* __restrict__ id,
72  uint32_t* __restrict__ moduleStart,
73  int32_t* __restrict__ clusterId,
74  int numElements) {
75  int first = blockDim.x * blockIdx.x + threadIdx.x;
76 
77  [[maybe_unused]] constexpr int nMaxModules = TrackerTraits::numberOfModules;
78 
 79  assert(nMaxModules < maxNumModules);
 80  for (int i = first; i < numElements; i += gridDim.x * blockDim.x) {
81  clusterId[i] = i;
82  if (invalidModuleId == id[i])
83  continue;
84  auto j = i - 1;
85  while (j >= 0 and id[j] == invalidModuleId)
86  --j;
87  if (j < 0 or id[j] != id[i]) {
88  // boundary...
89  auto loc = atomicInc(moduleStart, nMaxModules);
90  moduleStart[loc + 1] = i;
91  }
92  }
93  }
94 
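  // findClus: one block works on one module at a time. For each module it (1) finds the extent of the
  // module in the input arrays, (2) removes duplicate pixels (Phase-1 only) via the packed pixelStatus
  // words, (3) histograms the pixels by their y coordinate, (4) records for each pixel its neighbours
  // within +/- 1 in x and y, (5) iteratively propagates the minimum cluster id among neighbours until
  // stable, and (6) renumbers the clusters consecutively and stores the per-module cluster count.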
95  template <typename TrackerTraits>
96  __global__ void findClus(uint32_t* __restrict__ rawIdArr,
97  uint16_t* __restrict__ id, // module id of each pixel
98  uint16_t const* __restrict__ x, // local coordinates of each pixel
99  uint16_t const* __restrict__ y, //
100  uint32_t const* __restrict__ moduleStart, // index of the first pixel of each module
101  uint32_t* __restrict__ nClustersInModule, // output: number of clusters found in each module
102  uint32_t* __restrict__ moduleId, // output: module id of each module
103  int32_t* __restrict__ clusterId, // output: cluster id of each pixel
104  int numElements) {
 105  // status is only used for Phase-1, but a __shared__ array cannot be declared only when isPhase2 is false;
 106  // to minimize the impact on Phase-2 reconstruction it is declared with a very small size.
 107  constexpr bool isPhase2 = std::is_base_of<pixelTopology::Phase2, TrackerTraits>::value;
 108  constexpr const uint32_t pixelStatusSize = isPhase2 ? 1 : pixelStatus::size;
109  __shared__ uint32_t status[pixelStatusSize]; // packed words array used to store the PixelStatus of each pixel
110  __shared__ int msize;
111 
112  auto firstModule = blockIdx.x;
113  auto endModule = moduleStart[0];
114 
116 
117  for (auto module = firstModule; module < endModule; module += gridDim.x) {
118  auto firstPixel = moduleStart[1 + module];
119  auto thisModuleId = id[firstPixel];
120  assert(thisModuleId < TrackerTraits::numberOfModules);
121 
122 #ifdef GPU_DEBUG
123  if (thisModuleId % 100 == 1)
124  if (threadIdx.x == 0)
125  printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x);
126 #endif
127 
128  auto first = firstPixel + threadIdx.x;
129 
130  // find the index of the first pixel not belonging to this module (or invalid)
131  msize = numElements;
132  __syncthreads();
133 
134  // skip threads not associated to an existing pixel
135  for (int i = first; i < numElements; i += blockDim.x) {
136  if (id[i] == invalidModuleId) // skip invalid pixels
137  continue;
138  if (id[i] != thisModuleId) { // find the first pixel in a different module
139  atomicMin(&msize, i);
140  break;
141  }
142  }
143 
144  //init hist (ymax=416 < 512 : 9bits)
145  //6000 max pixels required for HI operations with no measurable impact on pp performance
146  constexpr uint32_t maxPixInModule = TrackerTraits::maxPixInModule;
147  constexpr auto nbins = TrackerTraits::clusterBinning;
148  constexpr auto nbits = TrackerTraits::clusterBits;
149 
 150  using Hist = cms::cuda::HistoContainer<uint16_t, nbins, maxPixInModule, nbits, uint16_t>;
 151  __shared__ Hist hist;
152  __shared__ typename Hist::Counter ws[32];
153  for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
154  hist.off[j] = 0;
155  }
156  __syncthreads();
157 
158  assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId)));
159 
160  // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer)
161  if (0 == threadIdx.x) {
162  if (msize - firstPixel > maxPixInModule) {
163  printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule);
164  msize = maxPixInModule + firstPixel;
165  }
166 #ifdef GPU_DEBUG
167  printf("pixelInModule > %d\n", msize - firstPixel);
168 #endif
169  }
170 
171  __syncthreads();
172  assert(msize - firstPixel <= maxPixInModule);
173 
174 #ifdef GPU_DEBUG
175  __shared__ uint32_t totGood;
176  totGood = 0;
177  __syncthreads();
178 #endif
179 
180  // remove duplicate pixels
181  if constexpr (not isPhase2) {
182  if (msize > 1) {
183  for (uint32_t i = threadIdx.x; i < pixelStatus::size; i += blockDim.x) {
184  status[i] = 0;
185  }
186  __syncthreads();
187  for (int i = first; i < msize - 1; i += blockDim.x) {
188  // skip invalid pixels
189  if (id[i] == invalidModuleId)
190  continue;
 191  pixelStatus::promote(status, x[i], y[i]);
 192  }
193  __syncthreads();
194  for (int i = first; i < msize - 1; i += blockDim.x) {
195  // skip invalid pixels
196  if (id[i] == invalidModuleId)
197  continue;
198  if (pixelStatus::isDuplicate(status, x[i], y[i])) {
199  // printf("found dup %d %d %d %d\n", i, id[i], x[i], y[i]);
200  id[i] = invalidModuleId;
201  rawIdArr[i] = 0;
202  }
203  }
204  __syncthreads();
205  }
206  }
207 
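  // two-pass histogram fill: count() accumulates the number of pixels per y bin, finalize() turns the
  // counts into offsets with a prefix scan (using ws as scratch), then fill() stores each pixel's
  // index relative to firstPixel into its y bin.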
208  // fill histo
209  for (int i = first; i < msize; i += blockDim.x) {
210  if (id[i] == invalidModuleId) // skip invalid pixels
211  continue;
212  hist.count(y[i]);
213 #ifdef GPU_DEBUG
214  atomicAdd(&totGood, 1);
215 #endif
216  }
217  __syncthreads();
218  if (threadIdx.x < 32)
219  ws[threadIdx.x] = 0; // used by prefix scan...
220  __syncthreads();
221  hist.finalize(ws);
222  __syncthreads();
223 #ifdef GPU_DEBUG
224  assert(hist.size() == totGood);
225  if (thisModuleId % 100 == 1)
226  if (threadIdx.x == 0)
227  printf("histo size %d\n", hist.size());
228 #endif
229  for (int i = first; i < msize; i += blockDim.x) {
230  if (id[i] == invalidModuleId) // skip invalid pixels
231  continue;
232  hist.fill(y[i], i - firstPixel);
233  }
234 
235 #ifdef __CUDA_ARCH__
236  // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations
237  constexpr int maxiter = 16;
238  if (threadIdx.x == 0 && (hist.size() / blockDim.x) >= maxiter)
239  printf("THIS IS NOT SUPPOSED TO HAPPEN too many hits in module %d: %d for block size %d\n",
240  thisModuleId,
241  hist.size(),
242  blockDim.x);
243 #else
244  auto maxiter = hist.size();
245 #endif
246  // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event
247  constexpr int maxNeighbours = 10;
248  assert((hist.size() / blockDim.x) <= maxiter);
249  // nearest neighbour
250  uint16_t nn[maxiter][maxNeighbours];
251  uint8_t nnn[maxiter]; // number of nn
252  for (uint32_t k = 0; k < maxiter; ++k)
253  nnn[k] = 0;
254 
255  __syncthreads(); // for hit filling!
256 
257 #ifdef GPU_DEBUG
258  // look for anomalous high occupancy
259  __shared__ uint32_t n40, n60;
260  n40 = n60 = 0;
261  __syncthreads();
262  for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) {
263  if (hist.size(j) > 60)
264  atomicAdd(&n60, 1);
265  if (hist.size(j) > 40)
266  atomicAdd(&n40, 1);
267  }
268  __syncthreads();
269  if (0 == threadIdx.x) {
270  if (n60 > 0)
271  printf("columns with more than 60 px %d in %d\n", n60, thisModuleId);
272  else if (n40 > 0)
273  printf("columns with more than 40 px %d in %d\n", n40, thisModuleId);
274  }
275  __syncthreads();
276 #endif
277 
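  // for each pixel, scan the remainder of its own y bin and the bin of column y+1; pixels also within
  // +/- 1 in x are recorded as neighbours in nn[k]; each pair is found once, from its earlier entry.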
278  // fill NN
279  for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
280  assert(k < maxiter);
281  auto p = hist.begin() + j;
282  auto i = *p + firstPixel;
283  assert(id[i] != invalidModuleId);
284  assert(id[i] == thisModuleId); // same module
285  int be = Hist::bin(y[i] + 1);
286  auto e = hist.end(be);
287  ++p;
288  assert(0 == nnn[k]);
289  for (; p < e; ++p) {
290  auto m = (*p) + firstPixel;
291  assert(m != i);
292  assert(int(y[m]) - int(y[i]) >= 0);
293  assert(int(y[m]) - int(y[i]) <= 1);
294  if (std::abs(int(x[m]) - int(x[i])) > 1)
295  continue;
296  auto l = nnn[k]++;
297  assert(l < maxNeighbours);
298  nn[k][l] = *p;
299  }
300  }
301 
302  // for each pixel, look at all the pixels until the end of the module;
 303  // when two valid pixels within +/- 1 in both x and y are found, set their id to the minimum;
 304  // after the loop, all the pixels in each cluster should have the id of the lowest
 305  // pixel in the cluster ( clus[i] == i ).
306  bool more = true;
307  int nloops = 0;
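  // alternate between two passes: even iterations push the minimum cluster id to all recorded
  // neighbours with atomicMin_block, odd iterations compress each pixel's id chain to its current
  // root; the loop ends when a full propagation pass applies no change (__syncthreads_or(more)).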
308  while (__syncthreads_or(more)) {
309  if (1 == nloops % 2) {
310  for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
311  auto p = hist.begin() + j;
312  auto i = *p + firstPixel;
313  auto m = clusterId[i];
314  while (m != clusterId[m])
315  m = clusterId[m];
316  clusterId[i] = m;
317  }
318  } else {
319  more = false;
320  for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
321  auto p = hist.begin() + j;
322  auto i = *p + firstPixel;
323  for (int kk = 0; kk < nnn[k]; ++kk) {
324  auto l = nn[k][kk];
325  auto m = l + firstPixel;
326  assert(m != i);
327  auto old = atomicMin_block(&clusterId[m], clusterId[i]);
328  // do we need memory fence?
329  if (old != clusterId[i]) {
330  // end the loop only if no changes were applied
331  more = true;
332  }
333  atomicMin_block(&clusterId[i], old);
334  } // nnloop
335  } // pixel loop
336  }
337  ++nloops;
338  } // end while
339 
340 #ifdef GPU_DEBUG
341  {
342  __shared__ int n0;
343  if (threadIdx.x == 0)
344  n0 = nloops;
345  __syncthreads();
346  auto ok = n0 == nloops;
 347  assert(__syncthreads_and(ok));
 348  if (thisModuleId % 100 == 1)
349  if (threadIdx.x == 0)
350  printf("# loops %d\n", nloops);
351  }
352 #endif
353 
354  __shared__ unsigned int foundClusters;
355  foundClusters = 0;
356  __syncthreads();
357 
 358  // find the number of different clusters, identified by a pixel with clus[i] == i;
359  // mark these pixels with a negative id.
360  for (int i = first; i < msize; i += blockDim.x) {
361  if (id[i] == invalidModuleId) // skip invalid pixels
362  continue;
363  if (clusterId[i] == i) {
364  auto old = atomicInc(&foundClusters, 0xffffffff);
365  clusterId[i] = -(old + 1);
366  }
367  }
368  __syncthreads();
369 
370  // propagate the negative id to all the pixels in the cluster.
371  for (int i = first; i < msize; i += blockDim.x) {
372  if (id[i] == invalidModuleId) // skip invalid pixels
373  continue;
374  if (clusterId[i] >= 0) {
375  // mark each pixel in a cluster with the same id as the first one
 376  clusterId[i] = clusterId[clusterId[i]];
 377  }
378  }
379  __syncthreads();
380 
381  // adjust the cluster id to be a positive value starting from 0
382  for (int i = first; i < msize; i += blockDim.x) {
383  if (id[i] == invalidModuleId) { // skip invalid pixels
 384  clusterId[i] = invalidClusterId;
 385  continue;
386  }
387  clusterId[i] = -clusterId[i] - 1;
388  }
389  __syncthreads();
390 
391  if (threadIdx.x == 0) {
392  nClustersInModule[thisModuleId] = foundClusters;
393  moduleId[module] = thisModuleId;
394 #ifdef GPU_DEBUG
395  if (foundClusters > gMaxHit) {
396  gMaxHit = foundClusters;
397  if (foundClusters > 8)
398  printf("max hit %d in %d\n", foundClusters, thisModuleId);
399  }
400 #endif
401 #ifdef GPU_DEBUG
402  if (thisModuleId % 100 == 1)
403  printf("%d clusters in module %d\n", foundClusters, thisModuleId);
404 #endif
405  }
406  } // module loop
407  }
408 } // namespace gpuClustering
409 
410 #endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h