
gpuClustering.h
#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h

#include <cstdint>
#include <cstdio>

#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

//#define GPU_DEBUG

namespace gpuClustering {

  // Phase-1 pixel modules
  constexpr uint32_t pixelSizeX = 160;
  constexpr uint32_t pixelSizeY = 416;

  namespace pixelStatus {
    // Use 0x00, 0x01, 0x03 so each can be OR'ed on top of the previous ones
    enum Status : uint32_t { kEmpty = 0x00, kFound = 0x01, kDuplicate = 0x03 };

    constexpr uint32_t bits = 2;
    constexpr uint32_t mask = (0x01 << bits) - 1;
    constexpr uint32_t valuesPerWord = sizeof(uint32_t) * 8 / bits;
    constexpr uint32_t size = pixelSizeX * pixelSizeY / valuesPerWord;

    __device__ static constexpr inline uint32_t getIndex(uint16_t x, uint16_t y) {
      return (pixelSizeX * y + x) / valuesPerWord;
    }

    __device__ constexpr inline uint32_t getShift(uint16_t x, uint16_t y) { return (x % valuesPerWord) * 2; }

    __device__ constexpr inline Status getStatus(uint32_t const* __restrict__ status, uint16_t x, uint16_t y) {
      uint32_t index = getIndex(x, y);
      uint32_t shift = getShift(x, y);
      return Status{(status[index] >> shift) & mask};
    }

    __device__ constexpr inline bool isDuplicate(uint32_t const* __restrict__ status, uint16_t x, uint16_t y) {
      return getStatus(status, x, y) == kDuplicate;
    }

    __device__ constexpr inline void promote(uint32_t* __restrict__ status, const uint16_t x, const uint16_t y) {
      uint32_t index = getIndex(x, y);
      uint32_t shift = getShift(x, y);
      uint32_t old_word = status[index];
      uint32_t expected = old_word;
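      // compare-and-swap retry loop: re-read the packed word, OR in the promoted 2-bit status,
      // and retry if another thread modified the same word between the read and the atomicCAS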
      do {
        expected = old_word;
        Status old_status{(old_word >> shift) & mask};
        if (kDuplicate == old_status) {
          // nothing to do
          return;
        }
        Status new_status = (kEmpty == old_status) ? kFound : kDuplicate;
        uint32_t new_word = old_word | (static_cast<uint32_t>(new_status) << shift);
        old_word = atomicCAS(&status[index], expected, new_word);
      } while (expected != old_word);
    }

  }  // namespace pixelStatus
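
  // Illustrative compile-time checks (not part of the original header): with 2 bits per pixel,
  // each 32-bit word packs the status of 16 pixels, so covering a full Phase-1 module
  // (160 x 416 pixels) takes 160 * 416 / 16 = 4160 words of the shared-memory status array.
  static_assert(pixelStatus::valuesPerWord == 16, "one 32-bit word packs the status of 16 pixels");
  static_assert(pixelStatus::mask == 0x03, "2-bit status mask");
  static_assert(pixelStatus::size == 4160, "160 x 416 pixels / 16 pixels per word");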

#ifdef GPU_DEBUG
  __device__ uint32_t gMaxHit = 0;
#endif

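  // countModules: one thread per pixel. Each pixel is initialised with clusterId[i] = i, and a pixel
  // marks a module boundary when the closest preceding valid pixel belongs to a different module (or
  // does not exist); moduleStart[0] counts the modules found via atomicInc, and moduleStart[1 + n]
  // records the index of the first pixel of the n-th module.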
  template <typename TrackerTraits>
  __global__ void countModules(uint16_t const* __restrict__ id,
                               uint32_t* __restrict__ moduleStart,
                               int32_t* __restrict__ clusterId,
                               int numElements) {
    int first = blockDim.x * blockIdx.x + threadIdx.x;

    [[maybe_unused]] constexpr int nMaxModules = TrackerTraits::numberOfModules;

    assert(nMaxModules < maxNumModules);
    for (int i = first; i < numElements; i += gridDim.x * blockDim.x) {
      clusterId[i] = i;
      if (invalidModuleId == id[i])
        continue;
      auto j = i - 1;
      while (j >= 0 and id[j] == invalidModuleId)
        --j;
      if (j < 0 or id[j] != id[i]) {
        // boundary...
        auto loc = atomicInc(moduleStart, nMaxModules);
        moduleStart[loc + 1] = i;
      }
    }
  }

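  // findClus: one block per module. For each module the kernel (1) finds the range of its pixels,
  // (2) for Phase-1 removes duplicate pixels using the packed pixelStatus words, (3) histograms the
  // pixels by their y coordinate, (4) builds per-pixel nearest-neighbour lists within +/- 1 in x and y,
  // and (5) iteratively propagates the minimum clusterId through the neighbour graph until it converges.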
  template <typename TrackerTraits>
  __global__ void findClus(uint32_t* __restrict__ rawIdArr,
                           uint16_t* __restrict__ id,                 // module id of each pixel
                           uint16_t const* __restrict__ x,            // local coordinates of each pixel
                           uint16_t const* __restrict__ y,            //
                           uint32_t const* __restrict__ moduleStart,  // index of the first pixel of each module
                           uint32_t* __restrict__ nClustersInModule,  // output: number of clusters found in each module
                           uint32_t* __restrict__ moduleId,           // output: module id of each module
                           int32_t* __restrict__ clusterId,           // output: cluster id of each pixel
                           int numElements) {
    // status is only used for Phase-1, but it cannot be declared only when isPhase2 is false;
    // to minimize the impact on Phase-2 reconstruction it is declared with a very small size.
    constexpr bool isPhase2 = std::is_base_of<pixelTopology::Phase2, TrackerTraits>::value;
    constexpr const uint32_t pixelStatusSize = isPhase2 ? 1 : pixelStatus::size;
    __shared__ uint32_t status[pixelStatusSize];  // packed words array used to store the PixelStatus of each pixel
    __shared__ int msize;

    auto firstModule = blockIdx.x;
    auto endModule = moduleStart[0];

    for (auto module = firstModule; module < endModule; module += gridDim.x) {
      auto firstPixel = moduleStart[1 + module];
      auto thisModuleId = id[firstPixel];
      assert(thisModuleId < TrackerTraits::numberOfModules);

#ifdef GPU_DEBUG
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x);
#endif

      auto first = firstPixel + threadIdx.x;

      // find the index of the first pixel not belonging to this module (or invalid)
      msize = numElements;
      __syncthreads();

      // skip threads not associated to an existing pixel
      for (int i = first; i < numElements; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (id[i] != thisModuleId) {  // find the first pixel in a different module
          atomicMin(&msize, i);
          break;
        }
      }

      // init hist (ymax=416 < 512 : 9 bits)
      // 6000 max pixels required for HI operations with no measurable impact on pp performance
      constexpr uint32_t maxPixInModule = TrackerTraits::maxPixInModule;
      constexpr auto nbins = TrackerTraits::clusterBinning;
      constexpr auto nbits = TrackerTraits::clusterBits;

      using Hist = cms::cuda::HistoContainer<uint16_t, nbins, maxPixInModule, nbits, uint16_t>;
      __shared__ Hist hist;
      __shared__ typename Hist::Counter ws[32];
      for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
        hist.off[j] = 0;
      }
      __syncthreads();

      assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId)));

      // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer)
      if (0 == threadIdx.x) {
        if (msize - firstPixel > maxPixInModule) {
          printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule);
          msize = maxPixInModule + firstPixel;
        }
      }

      __syncthreads();
      assert(msize - firstPixel <= maxPixInModule);

#ifdef GPU_DEBUG
      __shared__ uint32_t totGood;
      totGood = 0;
      __syncthreads();
#endif

      // remove duplicate pixels
      if constexpr (not isPhase2) {
        if (msize > 1) {
          for (uint32_t i = threadIdx.x; i < pixelStatus::size; i += blockDim.x) {
            status[i] = 0;
          }
          __syncthreads();
          for (int i = first; i < msize - 1; i += blockDim.x) {
            // skip invalid pixels
            if (id[i] == invalidModuleId)
              continue;
            pixelStatus::promote(status, x[i], y[i]);
          }
          __syncthreads();
          for (int i = first; i < msize - 1; i += blockDim.x) {
            // skip invalid pixels
            if (id[i] == invalidModuleId)
              continue;
            if (pixelStatus::isDuplicate(status, x[i], y[i])) {
              // printf("found dup %d %d %d %d\n", i, id[i], x[i], y[i]);
              id[i] = invalidModuleId;
              rawIdArr[i] = 0;
            }
          }
          __syncthreads();
        }
      }

      // fill histo
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.count(y[i]);
#ifdef GPU_DEBUG
        atomicAdd(&totGood, 1);
#endif
      }
      __syncthreads();
      if (threadIdx.x < 32)
        ws[threadIdx.x] = 0;  // used by prefix scan...
      __syncthreads();
      hist.finalize(ws);
      __syncthreads();
#ifdef GPU_DEBUG
      assert(hist.size() == totGood);
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("histo size %d\n", hist.size());
#endif
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.fill(y[i], i - firstPixel);
      }

#ifdef __CUDA_ARCH__
      // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations
      constexpr int maxiter = 16;
      if (threadIdx.x == 0 && (hist.size() / blockDim.x) >= maxiter)
        printf("THIS IS NOT SUPPOSED TO HAPPEN too many hits in module %d: %d for block size %d\n",
               thisModuleId,
               hist.size(),
               blockDim.x);
#else
      auto maxiter = hist.size();
#endif
      // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event
      constexpr int maxNeighbours = 10;
      assert((hist.size() / blockDim.x) <= maxiter);
      // nearest neighbour
      uint16_t nn[maxiter][maxNeighbours];
      uint8_t nnn[maxiter];  // number of nn
      for (uint32_t k = 0; k < maxiter; ++k)
        nnn[k] = 0;

      __syncthreads();  // for hit filling!

#ifdef GPU_DEBUG
      // look for anomalous high occupancy
      __shared__ uint32_t n40, n60;
      n40 = n60 = 0;
      __syncthreads();
      for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) {
        if (hist.size(j) > 60)
          atomicAdd(&n60, 1);
        if (hist.size(j) > 40)
          atomicAdd(&n40, 1);
      }
      __syncthreads();
      if (0 == threadIdx.x) {
        if (n60 > 0)
          printf("columns with more than 60 px %d in %d\n", n60, thisModuleId);
        else if (n40 > 0)
          printf("columns with more than 40 px %d in %d\n", n40, thisModuleId);
      }
      __syncthreads();
#endif

      // fill NN
      for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
        assert(k < maxiter);
        auto p = hist.begin() + j;
        auto i = *p + firstPixel;
        assert(id[i] != invalidModuleId);
        assert(id[i] == thisModuleId);  // same module
        int be = Hist::bin(y[i] + 1);
        auto e = hist.end(be);
        ++p;
        assert(0 == nnn[k]);
        for (; p < e; ++p) {
          auto m = (*p) + firstPixel;
          assert(m != i);
          assert(int(y[m]) - int(y[i]) >= 0);
          assert(int(y[m]) - int(y[i]) <= 1);
          if (std::abs(int(x[m]) - int(x[i])) > 1)
            continue;
          auto l = nnn[k]++;
          assert(l < maxNeighbours);
          nn[k][l] = *p;
        }
      }

      // for each pixel, look at all the pixels until the end of the module;
      // when two valid pixels within +/- 1 in x and y are found, set their id to the minimum;
      // after the loop, all the pixels in each cluster should have the id equal to the lowest
      // pixel in the cluster ( clus[i] == i ).
      bool more = true;
      int nloops = 0;
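      // label propagation: even iterations merge neighbouring pixels by taking the minimum clusterId
      // with atomicMin_block, odd iterations compress the resulting chains by following clusterId
      // links to their root; the loop stops once a merge pass applies no changes on any thread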
      while (__syncthreads_or(more)) {
        if (1 == nloops % 2) {
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            auto m = clusterId[i];
            while (m != clusterId[m])
              m = clusterId[m];
            clusterId[i] = m;
          }
        } else {
          more = false;
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            for (int kk = 0; kk < nnn[k]; ++kk) {
              auto l = nn[k][kk];
              auto m = l + firstPixel;
              assert(m != i);
              auto old = atomicMin_block(&clusterId[m], clusterId[i]);
              // do we need memory fence?
              if (old != clusterId[i]) {
                // end the loop only if no changes were applied
                more = true;
              }
              atomicMin_block(&clusterId[i], old);
            }  // nnloop
          }  // pixel loop
        }
        ++nloops;
      }  // end while

#ifdef GPU_DEBUG
      {
        __shared__ int n0;
        if (threadIdx.x == 0)
          n0 = nloops;
        __syncthreads();
        auto ok = n0 == nloops;
        assert(__syncthreads_and(ok));
        if (thisModuleId % 100 == 1)
          if (threadIdx.x == 0)
            printf("# loops %d\n", nloops);
      }
#endif

      __shared__ unsigned int foundClusters;
      foundClusters = 0;
      __syncthreads();

      // find the number of different clusters, identified by the pixels with clus[i] == i;
      // mark these pixels with a negative id.
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] == i) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          clusterId[i] = -(old + 1);
        }
      }
      __syncthreads();

      // propagate the negative id to all the pixels in the cluster.
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] >= 0) {
          // mark each pixel in a cluster with the same id as the first one
          clusterId[i] = clusterId[clusterId[i]];
        }
      }
      __syncthreads();

      // adjust the cluster id to be a positive value starting from 0
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId) {  // skip invalid pixels
          clusterId[i] = invalidClusterId;
          continue;
        }
        clusterId[i] = -clusterId[i] - 1;
      }
      __syncthreads();

      if (threadIdx.x == 0) {
        nClustersInModule[thisModuleId] = foundClusters;
        moduleId[module] = thisModuleId;
#ifdef GPU_DEBUG
        if (foundClusters > gMaxHit) {
          gMaxHit = foundClusters;
          if (foundClusters > 8)
            printf("max hit %d in %d\n", foundClusters, thisModuleId);
        }
#endif
#ifdef GPU_DEBUG
        if (thisModuleId % 100 == 1)
          printf("%d clusters in module %d\n", foundClusters, thisModuleId);
#endif
      }
    }  // module loop
  }
}  // namespace gpuClustering

#endif  // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
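
For reference, a minimal host-side launch sketch for the two kernels above, assuming the Phase-1 topology traits. The helper name runClustering, the device buffers (d_rawIdArr, d_id, d_x, d_y, d_moduleStart, d_nClustersInModule, d_moduleId, d_clusterId), the stream and the launch geometry are illustrative placeholders, not the actual CMSSW call site; in CMSSW these kernels are invoked from the SiPixelRawToCluster GPU workflow.

#include <cuda_runtime.h>

#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"

// Hypothetical helper: cluster the pixel digis already resident in device memory.
// d_moduleStart[0] is assumed to have been zeroed before the call.
void runClustering(uint32_t* d_rawIdArr,
                   uint16_t* d_id,
                   uint16_t const* d_x,
                   uint16_t const* d_y,
                   uint32_t* d_moduleStart,
                   uint32_t* d_nClustersInModule,
                   uint32_t* d_moduleId,
                   int32_t* d_clusterId,
                   int numElements,
                   cudaStream_t stream) {
  int threads = 256;
  int blocks = (numElements + threads - 1) / threads;
  // one thread per pixel: initialise clusterId and record the module boundaries in moduleStart
  gpuClustering::countModules<pixelTopology::Phase1>
      <<<blocks, threads, 0, stream>>>(d_id, d_moduleStart, d_clusterId, numElements);
  // one block per module: moduleStart[0], filled by countModules, holds the number of modules found
  gpuClustering::findClus<pixelTopology::Phase1>
      <<<pixelTopology::Phase1::numberOfModules, threads, 0, stream>>>(
          d_rawIdArr, d_id, d_x, d_y, d_moduleStart, d_nClustersInModule, d_moduleId, d_clusterId, numElements);
}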