
gpuClustering.h
#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h

#include <cstdint>
#include <cstdio>

#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

namespace gpuClustering {

#ifdef GPU_DEBUG
  __device__ uint32_t gMaxHit = 0;
#endif

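  // Mark module boundaries in the module-ordered digi arrays: every pixel gets clusterId[i] = i,
  // and the index of the first pixel of each module is stored in moduleStart[1...], with
  // moduleStart[0] used as an atomic counter of the number of modules found.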
  __global__ void countModules(uint16_t const* __restrict__ id,
                               uint32_t* __restrict__ moduleStart,
                               int32_t* __restrict__ clusterId,
                               int numElements) {
    int first = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = first; i < numElements; i += gridDim.x * blockDim.x) {
      clusterId[i] = i;
      if (invalidModuleId == id[i])
        continue;
      auto j = i - 1;
      while (j >= 0 and id[j] == invalidModuleId)
        --j;
      if (j < 0 or id[j] != id[i]) {
        // boundary...
        auto loc = atomicInc(moduleStart, maxNumModules);
        moduleStart[loc + 1] = i;
      }
    }
  }

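  // Clusterize the pixels of one module per block: build a histogram of the pixel y (column)
  // coordinates, collect the nearest neighbours of each pixel, and iteratively merge cluster ids
  // until every pixel points to the lowest pixel index of its cluster; finally renumber the
  // clusters 0..N-1 within the module and fill nClustersInModule and moduleId.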
  __global__ void findClus(uint16_t const* __restrict__ id,           // module id of each pixel
                           uint16_t const* __restrict__ x,            // local coordinates of each pixel
                           uint16_t const* __restrict__ y,            //
                           uint32_t const* __restrict__ moduleStart,  // index of the first pixel of each module
                           uint32_t* __restrict__ nClustersInModule,  // output: number of clusters found in each module
                           uint32_t* __restrict__ moduleId,           // output: module id of each module
                           int32_t* __restrict__ clusterId,           // output: cluster id of each pixel
                           int numElements) {
    __shared__ int msize;

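    // grid-stride loop over modules: moduleStart[0] holds the number of modules found by
    // countModules, moduleStart[1 + module] the index of the first pixel of each module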
    auto firstModule = blockIdx.x;
    auto endModule = moduleStart[0];
    for (auto module = firstModule; module < endModule; module += gridDim.x) {
      auto firstPixel = moduleStart[1 + module];
      auto thisModuleId = id[firstPixel];
      assert(thisModuleId < maxNumModules);

#ifdef GPU_DEBUG
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x);
#endif

      auto first = firstPixel + threadIdx.x;

      // find the index of the first pixel not belonging to this module (or invalid)
      msize = numElements;
      __syncthreads();

      // skip threads not associated with an existing pixel
      for (int i = first; i < numElements; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (id[i] != thisModuleId) {  // find the first pixel in a different module
          atomicMin(&msize, i);
          break;
        }
      }

      // init hist (ymax=416 < 512 : 9 bits)
      constexpr uint32_t maxPixInModule = 4000;
      constexpr auto nbins = phase1PixelTopology::numColsInModule + 2;  //2+2;
      using Hist = cms::cuda::HistoContainer<uint16_t, nbins, maxPixInModule, 9, uint16_t>;
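      // one histogram bin per pixel column (y), so the possible neighbours of a pixel are all
      // found in its own bin or in the next one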
      __shared__ Hist hist;
      __shared__ typename Hist::Counter ws[32];
      for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
        hist.off[j] = 0;
      }
      __syncthreads();

      assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId)));

      // limit to maxPixInModule (FIXME: if this happens often, and not only in low-threshold
      // simulation, something cleverer will be needed)
      if (0 == threadIdx.x) {
        if (msize - firstPixel > maxPixInModule) {
          printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule);
          msize = maxPixInModule + firstPixel;
        }
      }

      __syncthreads();
      assert(msize - firstPixel <= maxPixInModule);

#ifdef GPU_DEBUG
      __shared__ uint32_t totGood;
      totGood = 0;
      __syncthreads();
#endif

      // fill histo
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.count(y[i]);
#ifdef GPU_DEBUG
        atomicAdd(&totGood, 1);
#endif
      }
      __syncthreads();
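      // prefix-scan the per-column counts into cumulative offsets (ws is the scan workspace),
      // then fill the histogram with the pixel indices relative to firstPixel in a second pass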
      if (threadIdx.x < 32)
        ws[threadIdx.x] = 0;  // used by prefix scan...
      __syncthreads();
      hist.finalize(ws);
      __syncthreads();
#ifdef GPU_DEBUG
      assert(hist.size() == totGood);
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("histo size %d\n", hist.size());
#endif
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.fill(y[i], i - firstPixel);
      }

#ifdef __CUDA_ARCH__
      // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations
      constexpr int maxiter = 16;
#else
      auto maxiter = hist.size();
#endif
      // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event
      constexpr int maxNeighbours = 10;
      assert((hist.size() / blockDim.x) <= maxiter);
      // nearest neighbour
      uint16_t nn[maxiter][maxNeighbours];
      uint8_t nnn[maxiter];  // number of nn
      for (uint32_t k = 0; k < maxiter; ++k)
        nnn[k] = 0;

      __syncthreads();  // for hit filling!

#ifdef GPU_DEBUG
      // look for anomalous high occupancy
      __shared__ uint32_t n40, n60;
      n40 = n60 = 0;
      __syncthreads();
      for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) {
        if (hist.size(j) > 60)
          atomicAdd(&n60, 1);
        if (hist.size(j) > 40)
          atomicAdd(&n40, 1);
      }
      __syncthreads();
      if (0 == threadIdx.x) {
        if (n60 > 0)
          printf("columns with more than 60 px %d in %d\n", n60, thisModuleId);
        else if (n40 > 0)
          printf("columns with more than 40 px %d in %d\n", n40, thisModuleId);
      }
      __syncthreads();
#endif

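      // fill the nearest-neighbour lists: for each pixel i (taken in histogram order) record the
      // later pixels in the same or next y column that are within +/- 1 in x, so every
      // neighbouring pair is stored exactly once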
      // fill NN
      for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
        assert(k < maxiter);
        auto p = hist.begin() + j;
        auto i = *p + firstPixel;
        assert(id[i] != invalidModuleId);
        assert(id[i] == thisModuleId);  // same module
        int be = Hist::bin(y[i] + 1);
        auto e = hist.end(be);
        ++p;
        assert(0 == nnn[k]);
        for (; p < e; ++p) {
          auto m = (*p) + firstPixel;
          assert(m != i);
          assert(int(y[m]) - int(y[i]) >= 0);
          assert(int(y[m]) - int(y[i]) <= 1);
          if (std::abs(int(x[m]) - int(x[i])) > 1)
            continue;
          auto l = nnn[k]++;
          assert(l < maxNeighbours);
          nn[k][l] = *p;
        }
      }

      // for each pixel, look at all the pixels until the end of the module;
      // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum;
      // after the loop, all the pixels in each cluster should have an id equal to the lowest
      // pixel in the cluster ( clus[i] == i ).
      bool more = true;
      int nloops = 0;
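      // even iterations propagate the minimum cluster id to all recorded neighbours with
      // atomicMin; odd iterations compress the paths (follow the clusterId chains to their root);
      // the loop ends when a full pass applies no change in any thread of the block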
      while (__syncthreads_or(more)) {
        if (1 == nloops % 2) {
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            auto m = clusterId[i];
            while (m != clusterId[m])
              m = clusterId[m];
            clusterId[i] = m;
          }
        } else {
          more = false;
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            for (int kk = 0; kk < nnn[k]; ++kk) {
              auto l = nn[k][kk];
              auto m = l + firstPixel;
              assert(m != i);
              auto old = atomicMin(&clusterId[m], clusterId[i]);
              if (old != clusterId[i]) {
                // end the loop only if no changes were applied
                more = true;
              }
              atomicMin(&clusterId[i], old);
            }  // nnloop
          }  // pixel loop
        }
        ++nloops;
      }  // end while

#ifdef GPU_DEBUG
      {
        __shared__ int n0;
        if (threadIdx.x == 0)
          n0 = nloops;
        __syncthreads();
        auto ok = n0 == nloops;
        assert(__syncthreads_and(ok));
        if (thisModuleId % 100 == 1)
          if (threadIdx.x == 0)
            printf("# loops %d\n", nloops);
      }
#endif

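      // renumber the clusters: give each cluster representative (clusterId[i] == i) a unique
      // negative id, propagate it to the other pixels of the cluster, then flip the sign to get
      // ids 0..foundClusters-1; invalid pixels are marked with -9999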
      __shared__ unsigned int foundClusters;
      foundClusters = 0;
      __syncthreads();

      // find the number of different clusters, identified by a pixel with clusterId[i] == i;
      // mark these pixels with a negative id.
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] == i) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          clusterId[i] = -(old + 1);
        }
      }
      __syncthreads();

      // propagate the negative id to all the pixels in the cluster.
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] >= 0) {
          // mark each pixel in a cluster with the same id as the first one
          clusterId[i] = clusterId[clusterId[i]];
        }
      }
      __syncthreads();

      // adjust the cluster id to be a positive value starting from 0
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId) {  // skip invalid pixels
          clusterId[i] = -9999;
          continue;
        }
        clusterId[i] = -clusterId[i] - 1;
      }
      __syncthreads();

      if (threadIdx.x == 0) {
        nClustersInModule[thisModuleId] = foundClusters;
        moduleId[module] = thisModuleId;
#ifdef GPU_DEBUG
        if (foundClusters > gMaxHit) {
          gMaxHit = foundClusters;
          if (foundClusters > 8)
            printf("max hit %d in %d\n", foundClusters, thisModuleId);
        }
#endif
#ifdef GPU_DEBUG
        if (thisModuleId % 100 == 1)
          printf("%d clusters in module %d\n", foundClusters, thisModuleId);
#endif
      }
    }  // module loop
  }
}  // namespace gpuClustering

#endif  // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
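
A minimal host-side sketch of how the two kernels above might be chained, assuming the digi buffers are already on the device and grouped by module. The function name, buffer names and launch parameters below are illustrative assumptions, not the actual CMSSW launch code.

// Sketch only: assumes this header and <cuda_runtime.h> are included.
void runPixelClusterizer(uint16_t const* d_id,
                         uint16_t const* d_x,
                         uint16_t const* d_y,
                         uint32_t* d_moduleStart,        // at least maxNumModules + 1 words
                         uint32_t* d_nClustersInModule,  // at least maxNumModules words
                         uint32_t* d_moduleId,
                         int32_t* d_clusterId,
                         int numElements,
                         cudaStream_t stream) {
  using namespace gpuClustering;
  // countModules uses moduleStart[0] as an atomic module counter, so it must start at zero
  cudaMemsetAsync(d_moduleStart, 0, sizeof(uint32_t), stream);
  int threads = 256;  // keeps hist.size() / blockDim.x within the 16-iteration assumption of findClus
  int blocks = (numElements + threads - 1) / threads;
  countModules<<<blocks, threads, 0, stream>>>(d_id, d_moduleStart, d_clusterId, numElements);
  // findClus processes one module per block; maxNumModules blocks are enough for any event
  findClus<<<maxNumModules, threads, 0, stream>>>(
      d_id, d_x, d_y, d_moduleStart, d_nClustersInModule, d_moduleId, d_clusterId, numElements);
}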