gpuClustering.h
#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h

#include <cstdint>
#include <cstdio>

#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

namespace gpuClustering {

#ifdef GPU_DEBUG
  __device__ uint32_t gMaxHit = 0;
#endif

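  // countModules: pixels belonging to the same module are expected to be contiguous in the
  // input arrays. Each thread initialises clusterId[i] = i and, whenever pixel i is the first
  // valid pixel of a new module, atomically increments the module counter in moduleStart[0]
  // and records the index of that pixel in moduleStart[1 + n].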
  __global__ void countModules(uint16_t const* __restrict__ id,
                               uint32_t* __restrict__ moduleStart,
                               int32_t* __restrict__ clusterId,
                               int numElements) {
    int first = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = first; i < numElements; i += gridDim.x * blockDim.x) {
      clusterId[i] = i;
      if (invalidModuleId == id[i])
        continue;
      auto j = i - 1;
      while (j >= 0 and id[j] == invalidModuleId)
        --j;
      if (j < 0 or id[j] != id[i]) {
        // boundary...
        auto loc = atomicInc(moduleStart, maxNumModules);
        moduleStart[loc + 1] = i;
      }
    }
  }

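  // findClus: one CUDA block works on one module at a time (blocks stride over the modules
  // found by countModules). Within a module the pixels are binned by column (y) in a shared
  // memory histogram, nearest neighbours are collected from adjacent columns, and cluster ids
  // are then propagated iteratively so that every pixel ends up with the id of the smallest
  // pixel index in its cluster; finally the clusters are renumbered 0..nClusters-1.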
  __global__ void findClus(uint16_t const* __restrict__ id,           // module id of each pixel
                           uint16_t const* __restrict__ x,            // local coordinates of each pixel
                           uint16_t const* __restrict__ y,            //
                           uint32_t const* __restrict__ moduleStart,  // index of the first pixel of each module
                           uint32_t* __restrict__ nClustersInModule,  // output: number of clusters found in each module
                           uint32_t* __restrict__ moduleId,           // output: module id of each module
                           int32_t* __restrict__ clusterId,           // output: cluster id of each pixel
                           int numElements) {
    __shared__ int msize;

    auto firstModule = blockIdx.x;
    auto endModule = moduleStart[0];
    for (auto module = firstModule; module < endModule; module += gridDim.x) {
      auto firstPixel = moduleStart[1 + module];
      auto thisModuleId = id[firstPixel];
      assert(thisModuleId < maxNumModules);

#ifdef GPU_DEBUG
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x);
#endif

      auto first = firstPixel + threadIdx.x;

      // find the index of the first pixel not belonging to this module (or invalid)
      msize = numElements;
      __syncthreads();

      // skip threads not associated to an existing pixel
      for (int i = first; i < numElements; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (id[i] != thisModuleId) {  // find the first pixel in a different module
          atomicMin(&msize, i);
          break;
        }
      }

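      // The histogram below is keyed by the pixel column (y): after filling, all pixels of a
      // given column can be visited contiguously, so the neighbour search only needs to scan
      // the remainder of a pixel's own column plus the next one.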
      // init hist (ymax=416 < 512 : 9 bits)
      // 6000 max pixels required for HI operations with no measurable impact on pp performance
      constexpr uint32_t maxPixInModule = 6000;
      constexpr auto nbins = phase1PixelTopology::numColsInModule + 2;  //2+2;
      using Hist = cms::cuda::HistoContainer<uint16_t, nbins, maxPixInModule, 9, uint16_t>;
      __shared__ Hist hist;
      __shared__ typename Hist::Counter ws[32];
      for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
        hist.off[j] = 0;
      }
      __syncthreads();

      assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId)));

      // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer)
      if (0 == threadIdx.x) {
        if (msize - firstPixel > maxPixInModule) {
          printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule);
          msize = maxPixInModule + firstPixel;
        }
      }

      __syncthreads();
      assert(msize - firstPixel <= maxPixInModule);

#ifdef GPU_DEBUG
      __shared__ uint32_t totGood;
      totGood = 0;
      __syncthreads();
#endif

      // fill histo
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.count(y[i]);
#ifdef GPU_DEBUG
        atomicAdd(&totGood, 1);
#endif
      }
      __syncthreads();
      if (threadIdx.x < 32)
        ws[threadIdx.x] = 0;  // used by prefix scan...
      __syncthreads();
      hist.finalize(ws);
      __syncthreads();
#ifdef GPU_DEBUG
      assert(hist.size() == totGood);
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("histo size %d\n", hist.size());
#endif
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.fill(y[i], i - firstPixel);
      }

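      // After count/finalize/fill the histogram stores, for each column, the offsets
      // (i - firstPixel) of the pixels in that column; hist.begin() + j addresses the j-th
      // stored pixel and hist.end(b) marks the end of the content of column bin b.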
#ifdef __CUDA_ARCH__
      // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations
      constexpr int maxiter = 16;
      if (threadIdx.x == 0 && (hist.size() / blockDim.x) >= maxiter)
        printf("THIS IS NOT SUPPOSED TO HAPPEN too many hits in module %d: %d for block size %d\n",
               thisModuleId,
               hist.size(),
               blockDim.x);
#else
      auto maxiter = hist.size();
#endif
      // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event
      constexpr int maxNeighbours = 10;
      assert((hist.size() / blockDim.x) <= maxiter);
      // nearest neighbour
      uint16_t nn[maxiter][maxNeighbours];
      uint8_t nnn[maxiter];  // number of nn
      for (uint32_t k = 0; k < maxiter; ++k)
        nnn[k] = 0;

      __syncthreads();  // for hit filling!

#ifdef GPU_DEBUG
      // look for anomalous high occupancy
      __shared__ uint32_t n40, n60;
      n40 = n60 = 0;
      __syncthreads();
      for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) {
        if (hist.size(j) > 60)
          atomicAdd(&n60, 1);
        if (hist.size(j) > 40)
          atomicAdd(&n40, 1);
      }
      __syncthreads();
      if (0 == threadIdx.x) {
        if (n60 > 0)
          printf("columns with more than 60 px %d in %d\n", n60, thisModuleId);
        else if (n40 > 0)
          printf("columns with more than 40 px %d in %d\n", n40, thisModuleId);
      }
      __syncthreads();
#endif

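      // Each thread builds, for every pixel it owns (at most maxiter of them), a list of
      // neighbouring pixels: candidates are taken from the rest of the pixel's own column and
      // from the next column (y and y+1), keeping only those within +/- 1 in x. Symmetric pairs
      // are recovered later because cluster ids are propagated in both directions.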
      // fill NN
      for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
        assert(k < maxiter);
        auto p = hist.begin() + j;
        auto i = *p + firstPixel;
        assert(id[i] != invalidModuleId);
        assert(id[i] == thisModuleId);  // same module
        int be = Hist::bin(y[i] + 1);
        auto e = hist.end(be);
        ++p;
        assert(0 == nnn[k]);
        for (; p < e; ++p) {
          auto m = (*p) + firstPixel;
          assert(m != i);
          assert(int(y[m]) - int(y[i]) >= 0);
          assert(int(y[m]) - int(y[i]) <= 1);
          if (std::abs(int(x[m]) - int(x[i])) > 1)
            continue;
          auto l = nnn[k]++;
          assert(l < maxNeighbours);
          nn[k][l] = *p;
        }
      }

      // for each pixel, look at all the pixels until the end of the module;
      // when two valid pixels within +/- 1 in x or y are found, set their cluster id to the minimum;
      // after the loop, all the pixels in each cluster should have an id equal to that of the lowest
      // pixel in the cluster ( clus[i] == i ).
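      // The even iterations of the loop below merge neighbouring pixels with atomicMin; the odd
      // iterations compress the resulting chains (each pixel jumps directly to the current root
      // of its cluster), much like the union and find steps of a union-find structure. The loop
      // ends when a full pass applies no further change.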
      bool more = true;
      int nloops = 0;
      while (__syncthreads_or(more)) {
        if (1 == nloops % 2) {
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            auto m = clusterId[i];
            while (m != clusterId[m])
              m = clusterId[m];
            clusterId[i] = m;
          }
        } else {
          more = false;
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            for (int kk = 0; kk < nnn[k]; ++kk) {
              auto l = nn[k][kk];
              auto m = l + firstPixel;
              assert(m != i);
              auto old = atomicMin_block(&clusterId[m], clusterId[i]);
              // do we need memory fence?
              if (old != clusterId[i]) {
                // end the loop only if no changes were applied
                more = true;
              }
              atomicMin_block(&clusterId[i], old);
            }  // nn loop
          }  // pixel loop
        }
        ++nloops;
      }  // end while

#ifdef GPU_DEBUG
      {
        __shared__ int n0;
        if (threadIdx.x == 0)
          n0 = nloops;
        __syncthreads();
        auto ok = n0 == nloops;
        assert(__syncthreads_and(ok));
        if (thisModuleId % 100 == 1)
          if (threadIdx.x == 0)
            printf("# loops %d\n", nloops);
      }
#endif

      __shared__ unsigned int foundClusters;
      foundClusters = 0;
      __syncthreads();

      // find the number of different clusters, identified by pixels with clus[i] == i;
      // mark these pixels with a negative id.
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] == i) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          clusterId[i] = -(old + 1);
        }
      }
      __syncthreads();

      // propagate the negative id to all the pixels in the cluster.
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] >= 0) {
          // mark each pixel in a cluster with the same id as the first one
          clusterId[i] = clusterId[clusterId[i]];
        }
      }
      __syncthreads();

      // adjust the cluster id to be a positive value starting from 0
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId) {  // skip invalid pixels
          clusterId[i] = invalidClusterId;
          continue;
        }
        clusterId[i] = -clusterId[i] - 1;
      }
      __syncthreads();

      if (threadIdx.x == 0) {
        nClustersInModule[thisModuleId] = foundClusters;
        moduleId[module] = thisModuleId;
#ifdef GPU_DEBUG
        if (foundClusters > gMaxHit) {
          gMaxHit = foundClusters;
          if (foundClusters > 8)
            printf("max hit %d in %d\n", foundClusters, thisModuleId);
        }
#endif
#ifdef GPU_DEBUG
        if (thisModuleId % 100 == 1)
          printf("%d clusters in module %d\n", foundClusters, thisModuleId);
#endif
      }
    }  // module loop
  }
}  // namespace gpuClustering

#endif  // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
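
For reference, a minimal sketch of how these kernels could be launched from host code; the stream, device buffer names and launch configuration below are illustrative assumptions, not part of this header:

    // countModules: grid-stride loop over all digis, fills moduleStart and seeds clusterId
    gpuClustering::countModules<<<(numElements + 255) / 256, 256, 0, stream>>>(
        id_d, moduleStart_d, clusterId_d, numElements);

    // findClus: one block per module; moduleStart_d[0] now holds the number of modules found,
    // and extra blocks simply find no module to process
    gpuClustering::findClus<<<gpuClustering::maxNumModules, 256, 0, stream>>>(
        id_d, x_d, y_d, moduleStart_d, nClustersInModule_d, moduleId_d, clusterId_d, numElements);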