
gpuClustering.h
#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h

#include <cstdint>
#include <cstdio>

#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

namespace gpuClustering {

#ifdef GPU_DEBUG
  __device__ uint32_t gMaxHit = 0;
#endif

  template <bool isPhase2>
  __global__ void countModules(uint16_t const* __restrict__ id,
                               uint32_t* __restrict__ moduleStart,
                               int32_t* __restrict__ clusterId,
                               int numElements) {
    int first = blockDim.x * blockIdx.x + threadIdx.x;
    constexpr int nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
    assert(nMaxModules < maxNumModules);
    for (int i = first; i < numElements; i += gridDim.x * blockDim.x) {
      clusterId[i] = i;
      if (invalidModuleId == id[i])
        continue;
      auto j = i - 1;
      while (j >= 0 and id[j] == invalidModuleId)
        --j;
      if (j < 0 or id[j] != id[i]) {
        // boundary...
        auto loc = atomicInc(moduleStart, nMaxModules);
        moduleStart[loc + 1] = i;
      }
    }
  }

  template <bool isPhase2>
  __global__ void findClus(uint16_t const* __restrict__ id,           // module id of each pixel
                           uint16_t const* __restrict__ x,            // local coordinates of each pixel
                           uint16_t const* __restrict__ y,            //
                           uint32_t const* __restrict__ moduleStart,  // index of the first pixel of each module
                           uint32_t* __restrict__ nClustersInModule,  // output: number of clusters found in each module
                           uint32_t* __restrict__ moduleId,           // output: module id of each module
                           int32_t* __restrict__ clusterId,           // output: cluster id of each pixel
                           int numElements) {
    __shared__ int msize;

    auto firstModule = blockIdx.x;
    auto endModule = moduleStart[0];

    constexpr int nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
    assert(nMaxModules < maxNumModules);

    for (auto module = firstModule; module < endModule; module += gridDim.x) {
      auto firstPixel = moduleStart[1 + module];
      auto thisModuleId = id[firstPixel];
      assert(thisModuleId < nMaxModules);

#ifdef GPU_DEBUG
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x);
#endif

      auto first = firstPixel + threadIdx.x;

      // find the index of the first pixel not belonging to this module (or invalid)
      msize = numElements;
      __syncthreads();

      // skip threads not associated to an existing pixel
      for (int i = first; i < numElements; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (id[i] != thisModuleId) {  // find the first pixel in a different module
          atomicMin(&msize, i);
          break;
        }
      }

      //init hist (ymax=416 < 512 : 9bits)
      //6000 max pixels required for HI operations with no measurable impact on pp performance
      constexpr uint32_t maxPixInModule = 6000;
      constexpr auto nbins = isPhase2 ? 1024 : phase1PixelTopology::numColsInModule + 2;  //2+2;
      constexpr auto nbits = isPhase2 ? 10 : 9;                                           //2+2;
      using Hist = cms::cuda::HistoContainer<uint16_t, nbins, maxPixInModule, nbits, uint16_t>;
      __shared__ Hist hist;
      __shared__ typename Hist::Counter ws[32];
      for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
        hist.off[j] = 0;
      }
      __syncthreads();

      assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId)));

      // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer)
      if (0 == threadIdx.x) {
        if (msize - firstPixel > maxPixInModule) {
          printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule);
          msize = maxPixInModule + firstPixel;
        }
      }

      __syncthreads();
      assert(msize - firstPixel <= maxPixInModule);

#ifdef GPU_DEBUG
      __shared__ uint32_t totGood;
      totGood = 0;
      __syncthreads();
#endif

      // fill histo
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.count(y[i]);
#ifdef GPU_DEBUG
        atomicAdd(&totGood, 1);
#endif
      }
      __syncthreads();
      if (threadIdx.x < 32)
        ws[threadIdx.x] = 0;  // used by prefix scan...
      __syncthreads();
      hist.finalize(ws);
      __syncthreads();
#ifdef GPU_DEBUG
      assert(hist.size() == totGood);
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("histo size %d\n", hist.size());
#endif
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.fill(y[i], i - firstPixel);
      }

#ifdef __CUDA_ARCH__
      // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations
      constexpr int maxiter = 16;
      if (threadIdx.x == 0 && (hist.size() / blockDim.x) >= maxiter)
        printf("THIS IS NOT SUPPOSED TO HAPPEN too many hits in module %d: %d for block size %d\n",
               thisModuleId,
               hist.size(),
               blockDim.x);
#else
      auto maxiter = hist.size();
#endif
      // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event
      constexpr int maxNeighbours = 10;
      assert((hist.size() / blockDim.x) <= maxiter);
      // nearest neighbour
      uint16_t nn[maxiter][maxNeighbours];
      uint8_t nnn[maxiter];  // number of nn
      for (uint32_t k = 0; k < maxiter; ++k)
        nnn[k] = 0;

      __syncthreads();  // for hit filling!

#ifdef GPU_DEBUG
      // look for anomalous high occupancy
      __shared__ uint32_t n40, n60;
      n40 = n60 = 0;
      __syncthreads();
      for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) {
        if (hist.size(j) > 60)
          atomicAdd(&n60, 1);
        if (hist.size(j) > 40)
          atomicAdd(&n40, 1);
      }
      __syncthreads();
      if (0 == threadIdx.x) {
        if (n60 > 0)
          printf("columns with more than 60 px %d in %d\n", n60, thisModuleId);
        else if (n40 > 0)
          printf("columns with more than 40 px %d in %d\n", n40, thisModuleId);
      }
      __syncthreads();
#endif

      // fill NN
      for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
        assert(k < maxiter);
        auto p = hist.begin() + j;
        auto i = *p + firstPixel;
        assert(id[i] != invalidModuleId);
        assert(id[i] == thisModuleId);  // same module
        int be = Hist::bin(y[i] + 1);
        auto e = hist.end(be);
        ++p;
        assert(0 == nnn[k]);
        for (; p < e; ++p) {
          auto m = (*p) + firstPixel;
          assert(m != i);
          assert(int(y[m]) - int(y[i]) >= 0);
          assert(int(y[m]) - int(y[i]) <= 1);
          if (std::abs(int(x[m]) - int(x[i])) > 1)
            continue;
          auto l = nnn[k]++;
          assert(l < maxNeighbours);
          nn[k][l] = *p;
        }
      }

      // for each pixel, look at all the pixels until the end of the module;
      // when two valid pixels within +/- 1 in both x and y are found, set their cluster id to the minimum;
      // after the loop, all the pixels in each cluster should have an id equal to that of the lowest
      // pixel in the cluster (for which clusterId[i] == i).
      bool more = true;
      int nloops = 0;
      while (__syncthreads_or(more)) {
        if (1 == nloops % 2) {
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            auto m = clusterId[i];
            while (m != clusterId[m])
              m = clusterId[m];
            clusterId[i] = m;
          }
        } else {
          more = false;
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            for (int kk = 0; kk < nnn[k]; ++kk) {
              auto l = nn[k][kk];
              auto m = l + firstPixel;
              assert(m != i);
              auto old = atomicMin_block(&clusterId[m], clusterId[i]);
              // do we need memory fence?
              if (old != clusterId[i]) {
                // end the loop only if no changes were applied
                more = true;
              }
              atomicMin_block(&clusterId[i], old);
            }  // nnloop
          }    // pixel loop
        }
        ++nloops;
      }  // end while

#ifdef GPU_DEBUG
      {
        __shared__ int n0;
        if (threadIdx.x == 0)
          n0 = nloops;
        __syncthreads();
        auto ok = n0 == nloops;
        assert(__syncthreads_and(ok));
        if (thisModuleId % 100 == 1)
          if (threadIdx.x == 0)
            printf("# loops %d\n", nloops);
      }
#endif

      __shared__ unsigned int foundClusters;
      foundClusters = 0;
      __syncthreads();

      // find the number of different clusters, identified by the pixels with clusterId[i] == i;
      // mark these pixels with a negative id.
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] == i) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          clusterId[i] = -(old + 1);
        }
      }
      __syncthreads();

      // propagate the negative id to all the pixels in the cluster.
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] >= 0) {
          // mark each pixel in a cluster with the same id as the first one
          clusterId[i] = clusterId[clusterId[i]];
        }
      }
      __syncthreads();

      // adjust the cluster id to be a positive value starting from 0
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId) {  // skip invalid pixels
          clusterId[i] = invalidClusterId;
          continue;
        }
        clusterId[i] = -clusterId[i] - 1;
      }
      __syncthreads();

      if (threadIdx.x == 0) {
        nClustersInModule[thisModuleId] = foundClusters;
        moduleId[module] = thisModuleId;
#ifdef GPU_DEBUG
        if (foundClusters > gMaxHit) {
          gMaxHit = foundClusters;
          if (foundClusters > 8)
            printf("max hit %d in %d\n", foundClusters, thisModuleId);
        }
#endif
#ifdef GPU_DEBUG
        if (thisModuleId % 100 == 1)
          printf("%d clusters in module %d\n", foundClusters, thisModuleId);
#endif
      }
    }  // module loop
  }
}  // namespace gpuClustering

#endif  // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
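
For orientation, the sketch below shows how these kernels might be driven from the host. It is an illustration only, not the actual CMSSW caller: the helper function and the buffer names (d_id, d_x, d_y, d_moduleStart, d_clusInModule, d_moduleId, d_clusterId) as well as the nPixels count are hypothetical. countModules is launched with a 1D grid over all digis and fills moduleStart (its first word is used as an atomic module counter, so it must be zeroed beforehand), while findClus strides over modules by gridDim.x, so it can be launched with up to one block per module.

// Hypothetical host-side driver, for illustration only (names are not from CMSSW).
#include <cstdint>
#include <cuda_runtime.h>
#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"

void clusterizeOnGPU(uint16_t const* d_id,      // module id of each pixel (device memory)
                     uint16_t const* d_x,       // local x of each pixel
                     uint16_t const* d_y,       // local y of each pixel
                     uint32_t* d_moduleStart,   // [0] = module counter, [1..] = first pixel of each module
                     uint32_t* d_clusInModule,  // output: number of clusters per module
                     uint32_t* d_moduleId,      // output: module id of each module
                     int32_t* d_clusterId,      // output: cluster id of each pixel
                     int nPixels,
                     cudaStream_t stream) {
  using namespace gpuClustering;

  // moduleStart[0] is incremented atomically by countModules, so reset it first
  cudaMemsetAsync(d_moduleStart, 0, sizeof(uint32_t), stream);

  // one thread per digi for the module-boundary search
  int threadsPerBlock = 256;
  int blocks = (nPixels + threadsPerBlock - 1) / threadsPerBlock;
  countModules<false><<<blocks, threadsPerBlock, 0, stream>>>(d_id, d_moduleStart, d_clusterId, nPixels);

  // roughly one block per module; each block clusterizes the modules it is assigned
  findClus<false><<<phase1PixelTopology::numberOfModules, threadsPerBlock, 0, stream>>>(
      d_id, d_x, d_y, d_moduleStart, d_clusInModule, d_moduleId, d_clusterId, nPixels);
}

The grid size chosen for findClus only determines how many modules each block handles in turn, since the kernel loops over modules in steps of gridDim.x.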