gpuClustering.h
#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h

#include <cstdint>
#include <cstdio>

// Headers providing the symbols used below (gpuClusteringConstants,
// phase1PixelTopology, HistoContainer, cuda_assert); the include paths
// assume the standard CMSSW source layout.
#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
namespace gpuClustering {

#ifdef GPU_DEBUG
  __device__ uint32_t gMaxHit = 0;
#endif

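  // countModules: seed each pixel's clusterId with its own index, and record in
  // moduleStart the boundaries of the (contiguous, module-sorted) pixel ranges:
  // moduleStart[0] is an atomic counter of the modules found, and each module's
  // first-pixel index is appended after it (in no guaranteed order).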
  __global__ void countModules(uint16_t const* __restrict__ id,
                               uint32_t* __restrict__ moduleStart,
                               int32_t* __restrict__ clusterId,
                               int numElements) {
    int first = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = first; i < numElements; i += gridDim.x * blockDim.x) {
      clusterId[i] = i;
      if (invalidModuleId == id[i])
        continue;
      auto j = i - 1;
      while (j >= 0 and id[j] == invalidModuleId)
        --j;
      if (j < 0 or id[j] != id[i]) {
        // boundary...
        auto loc = atomicInc(moduleStart, maxNumModules);
        moduleStart[loc + 1] = i;
      }
    }
  }

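  // findClus: cluster the pixels of each module. One block works on one module
  // at a time: it histograms the module's pixels by column (y), builds per-pixel
  // neighbour lists, iteratively propagates the minimum pixel index through each
  // connected component, and finally renumbers the clusters 0..nClusters-1.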
  __global__ void findClus(uint16_t const* __restrict__ id,           // module id of each pixel
                           uint16_t const* __restrict__ x,            // local coordinates of each pixel
                           uint16_t const* __restrict__ y,            //
                           uint32_t const* __restrict__ moduleStart,  // index of the first pixel of each module
                           uint32_t* __restrict__ nClustersInModule,  // output: number of clusters found in each module
                           uint32_t* __restrict__ moduleId,           // output: module id of each module
                           int32_t* __restrict__ clusterId,           // output: cluster id of each pixel
                           int numElements) {
    __shared__ int msize;

    auto firstModule = blockIdx.x;
    auto endModule = moduleStart[0];
    for (auto module = firstModule; module < endModule; module += gridDim.x) {
      auto firstPixel = moduleStart[1 + module];
      auto thisModuleId = id[firstPixel];
      assert(thisModuleId < maxNumModules);

#ifdef GPU_DEBUG
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x);
#endif

      auto first = firstPixel + threadIdx.x;

      // find the index of the first pixel not belonging to this module (or invalid)
      msize = numElements;
      __syncthreads();

      // skip threads not associated to an existing pixel
      for (int i = first; i < numElements; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (id[i] != thisModuleId) {  // find the first pixel in a different module
          atomicMin(&msize, i);
          break;
        }
      }

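      // Histogram the pixels of this module by column: afterwards the indices
      // (relative to firstPixel) of all pixels in a given y column can be
      // scanned contiguously, which makes the neighbour search below local.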
      //init hist (ymax=416 < 512 : 9bits)
      constexpr uint32_t maxPixInModule = 4000;
      constexpr auto nbins = phase1PixelTopology::numColsInModule + 2;  //2+2;
      // histogram type: one bin per column, up to maxPixInModule entries, 9 bits
      // of y used for binning (this alias is implied by the parameters above)
      using Hist = cms::cuda::HistoContainer<uint16_t, nbins, maxPixInModule, 9, uint16_t>;
      __shared__ Hist hist;
      __shared__ typename Hist::Counter ws[32];
      for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
        hist.off[j] = 0;
      }
      __syncthreads();

      assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId)));

      // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer)
      if (0 == threadIdx.x) {
        if (msize - firstPixel > maxPixInModule) {
          printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule);
          msize = maxPixInModule + firstPixel;
        }
      }

      __syncthreads();
      assert(msize - firstPixel <= maxPixInModule);

#ifdef GPU_DEBUG
      __shared__ uint32_t totGood;
      totGood = 0;
      __syncthreads();
#endif

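      // Fill the histogram in the usual HistoContainer three steps: count()
      // accumulates per-bin occupancies, finalize() turns them into cumulative
      // offsets (a block-wide prefix scan using the ws[] scratch space), and
      // fill() stores each pixel's offset within the module into its bin.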
      // fill histo
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.count(y[i]);
#ifdef GPU_DEBUG
        atomicAdd(&totGood, 1);
#endif
      }
      __syncthreads();
      if (threadIdx.x < 32)
        ws[threadIdx.x] = 0;  // used by prefix scan...
      __syncthreads();
      hist.finalize(ws);
      __syncthreads();
#ifdef GPU_DEBUG
      assert(hist.size() == totGood);
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("histo size %d\n", hist.size());
#endif
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.fill(y[i], i - firstPixel);
      }

#ifdef __CUDA_ARCH__
      // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations
      constexpr int maxiter = 16;
#else
      auto maxiter = hist.size();
#endif
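      // each thread owns the pixels j = threadIdx.x + k * blockDim.x, hence at
      // most maxiter of them; nn[k][*] and nnn[k] below hold the neighbour list
      // of the k-th pixel handled by this thread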
      // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event
      constexpr int maxNeighbours = 10;
      assert((hist.size() / blockDim.x) <= maxiter);
      // nearest neighbour
      uint16_t nn[maxiter][maxNeighbours];
      uint8_t nnn[maxiter];  // number of nn
      for (uint32_t k = 0; k < maxiter; ++k)
        nnn[k] = 0;

      __syncthreads();  // for hit filling!

#ifdef GPU_DEBUG
      // look for anomalous high occupancy
      __shared__ uint32_t n40, n60;
      n40 = n60 = 0;
      __syncthreads();
      for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) {
        if (hist.size(j) > 60)
          atomicAdd(&n60, 1);
        if (hist.size(j) > 40)
          atomicAdd(&n40, 1);
      }
      __syncthreads();
      if (0 == threadIdx.x) {
        if (n60 > 0)
          printf("columns with more than 60 px %d in %d\n", n60, thisModuleId);
        else if (n40 > 0)
          printf("columns with more than 40 px %d in %d\n", n40, thisModuleId);
      }
      __syncthreads();
#endif

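      // For each pixel, scan the remaining histogram entries of its own column
      // and those of the next one (bins y and y+1): candidates are already
      // within +/- 1 in y by construction, so only the |dx| <= 1 cut is applied.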
      // fill NN
      for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
        assert(k < maxiter);
        auto p = hist.begin() + j;
        auto i = *p + firstPixel;
        assert(id[i] != invalidModuleId);
        assert(id[i] == thisModuleId);  // same module
        int be = Hist::bin(y[i] + 1);
        auto e = hist.end(be);
        ++p;
        assert(0 == nnn[k]);
        for (; p < e; ++p) {
          auto m = (*p) + firstPixel;
          assert(m != i);
          assert(int(y[m]) - int(y[i]) >= 0);
          assert(int(y[m]) - int(y[i]) <= 1);
          if (std::abs(int(x[m]) - int(x[i])) > 1)
            continue;
          auto l = nnn[k]++;
          assert(l < maxNeighbours);
          nn[k][l] = *p;
        }
      }

      // for each pixel, look at all the pixels until the end of the module;
      // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum;
      // after the loop, all the pixels in each cluster should have the id equal to the lowest
      // pixel in the cluster ( clus[i] == i ).
      bool more = true;
      int nloops = 0;
      while (__syncthreads_or(more)) {
        if (1 == nloops % 2) {
          // odd iterations: compress the paths, pointing every pixel directly
          // at the current root of its cluster
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            auto m = clusterId[i];
            while (m != clusterId[m])
              m = clusterId[m];
            clusterId[i] = m;
          }
        } else {
          // even iterations: propagate the minimum cluster id across each
          // neighbour pair
          more = false;
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            for (int kk = 0; kk < nnn[k]; ++kk) {
              auto l = nn[k][kk];
              auto m = l + firstPixel;
              assert(m != i);
              auto old = atomicMin_block(&clusterId[m], clusterId[i]);
              // do we need memory fence?
              if (old != clusterId[i]) {
                // end the loop only if no changes were applied
                more = true;
              }
              atomicMin_block(&clusterId[i], old);
            }  // nnloop
          }    // pixel loop
        }
        ++nloops;
      }  // end while

#ifdef GPU_DEBUG
      {
        __shared__ int n0;
        if (threadIdx.x == 0)
          n0 = nloops;
        __syncthreads();
        auto ok = n0 == nloops;
        assert(__syncthreads_and(ok));  // all threads must agree on the iteration count
        if (thisModuleId % 100 == 1)
          if (threadIdx.x == 0)
            printf("# loops %d\n", nloops);
      }
#endif

      __shared__ unsigned int foundClusters;
      foundClusters = 0;
      __syncthreads();

      // find the number of different clusters, identified by pixels with clus[i] == i;
      // mark these pixels with a negative id.
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] == i) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          clusterId[i] = -(old + 1);
        }
      }
      __syncthreads();

      // propagate the negative id to all the pixels in the cluster.
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] >= 0) {
          // mark each pixel in a cluster with the same id as the first one
          clusterId[i] = clusterId[clusterId[i]];
        }
      }
      __syncthreads();

      // adjust the cluster id to be a positive value starting from 0
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId) {  // skip invalid pixels
          clusterId[i] = invalidClusterId;
          continue;
        }
        clusterId[i] = -clusterId[i] - 1;
      }
      __syncthreads();

      if (threadIdx.x == 0) {
        nClustersInModule[thisModuleId] = foundClusters;
        moduleId[module] = thisModuleId;
#ifdef GPU_DEBUG
        if (foundClusters > gMaxHit) {
          gMaxHit = foundClusters;
          if (foundClusters > 8)
            printf("max hit %d in %d\n", foundClusters, thisModuleId);
        }
#endif
#ifdef GPU_DEBUG
        if (thisModuleId % 100 == 1)
          printf("%d clusters in module %d\n", foundClusters, thisModuleId);
#endif
      }
    }  // module loop
  }
}  // namespace gpuClustering

#endif  // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
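
A minimal host-side launch sketch for the two kernels above, assuming the pixel arrays are already module-sorted and resident on the device. The wrapper function, buffer names, and launch configuration are illustrative only; in CMSSW these kernels are driven by the SiPixel clusterizer plugin with framework-managed buffers and streams.

// Hypothetical standalone launcher (not part of this header): sketch only.
#include <cuda_runtime.h>

void runClusterizer(uint16_t const* d_id,     // device: module id per pixel
                    uint16_t const* d_x,      // device: local x per pixel
                    uint16_t const* d_y,      // device: local y per pixel
                    uint32_t* d_moduleStart,  // device: [0] = module counter, [1..] = first-pixel indices
                    uint32_t* d_nClustersInModule,
                    uint32_t* d_moduleId,
                    int32_t* d_clusterId,
                    int numElements,
                    cudaStream_t stream) {
  // moduleStart[0] is used as an atomic counter by countModules: zero it first
  cudaMemsetAsync(d_moduleStart, 0, sizeof(uint32_t), stream);

  int threads = 256;  // any multiple of 32 (findClus assumes at least a full warp)
  int blocks = (numElements + threads - 1) / threads;
  gpuClustering::countModules<<<blocks, threads, 0, stream>>>(d_id, d_moduleStart, d_clusterId, numElements);

  // findClus strides over modules with gridDim.x blocks, so a grid sized to
  // the detector is enough regardless of how many modules actually fired
  gpuClustering::findClus<<<gpuClustering::maxNumModules, threads, 0, stream>>>(
      d_id, d_x, d_y, d_moduleStart, d_nClustersInModule, d_moduleId, d_clusterId, numElements);
}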