// CMSSW: RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
// (reconstructed from a doxygen-rendered page; original page header removed)
#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h

#include <algorithm>
#include <cmath>
#include <cstdint>

#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

#include "gpuVertexFinder.h"
12 
namespace gpuVertexFinder {

  // this algo does not really scale as it works in a single block...
  // enough for <10K tracks we have
  //
  // based on Rodrighez&Laio algo
  //
20  __device__ __forceinline__ void clusterTracksByDensity(VtxSoAView& pdata,
22  int minT, // min number of neighbours to be "seed"
23  float eps, // max absolute distance to cluster
24  float errmax, // max error to be "seed"
25  float chi2max // max normalized distance to cluster
26  ) {
27  using namespace gpuVertexFinder;
28  constexpr bool verbose = false; // in principle the compiler should optmize out if false
29 
30  if (verbose && 0 == threadIdx.x)
31  printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
32 
33  auto er2mx = errmax * errmax;
34 
35  auto& __restrict__ data = pdata;
36  auto& __restrict__ ws = pws;
37  auto nt = ws.ntrks();
38  float const* __restrict__ zt = ws.zt();
39  float const* __restrict__ ezt2 = ws.ezt2();
40 
41  uint32_t& nvFinal = data.nvFinal();
42  uint32_t& nvIntermediate = ws.nvIntermediate();
43 
44  uint8_t* __restrict__ izt = ws.izt();
45  int32_t* __restrict__ nn = data.ndof();
46  int32_t* __restrict__ iv = ws.iv();
47 
48  assert(zt);
49  assert(ezt2);
50  assert(izt);
51  assert(nn);
52  assert(iv);
53 
55  __shared__ Hist hist;
56  __shared__ typename Hist::Counter hws[32];
57  for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
58  hist.off[j] = 0;
59  }
60  __syncthreads();
61 
62  if (verbose && 0 == threadIdx.x)
63  printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
64 
65  assert((int)nt <= hist.capacity());
66 
67  // fill hist (bin shall be wider than "eps")
68  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
70  int iz = int(zt[i] * 10.); // valid if eps<=0.1
71  // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only
72  iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
73  izt[i] = iz - INT8_MIN;
74  assert(iz - INT8_MIN >= 0);
75  assert(iz - INT8_MIN < 256);
76  hist.count(izt[i]);
77  iv[i] = i;
78  nn[i] = 0;
79  }
80  __syncthreads();
81  if (threadIdx.x < 32)
82  hws[threadIdx.x] = 0; // used by prefix scan...
83  __syncthreads();
84  hist.finalize(hws);
85  __syncthreads();
86  assert(hist.size() == nt);
87  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
88  hist.fill(izt[i], uint16_t(i));
89  }
90  __syncthreads();
91 
92  // count neighbours
93  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
94  if (ezt2[i] > er2mx)
95  continue;
96  auto loop = [&](uint32_t j) {
97  if (i == j)
98  return;
99  auto dist = std::abs(zt[i] - zt[j]);
100  if (dist > eps)
101  return;
102  if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
103  return;
104  nn[i]++;
105  };
106 
107  cms::cuda::forEachInBins(hist, izt[i], 1, loop);
108  }
109 
110  __syncthreads();
111 
112  // find closest above me .... (we ignore the possibility of two j at same distance from i)
113  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
114  float mdist = eps;
115  auto loop = [&](uint32_t j) {
116  if (nn[j] < nn[i])
117  return;
118  if (nn[j] == nn[i] && zt[j] >= zt[i])
119  return; // if equal use natural order...
120  auto dist = std::abs(zt[i] - zt[j]);
121  if (dist > mdist)
122  return;
123  if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
124  return; // (break natural order???)
125  mdist = dist;
126  iv[i] = j; // assign to cluster (better be unique??)
127  };
128  cms::cuda::forEachInBins(hist, izt[i], 1, loop);
129  }
130 
131  __syncthreads();
132 
133 #ifdef GPU_DEBUG
134  // mini verification
135  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
136  if (iv[i] != int(i))
137  assert(iv[iv[i]] != int(i));
138  }
139  __syncthreads();
140 #endif
141 
142  // consolidate graph (percolate index of seed)
143  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
144  auto m = iv[i];
145  while (m != iv[m])
146  m = iv[m];
147  iv[i] = m;
148  }
149 
150 #ifdef GPU_DEBUG
151  __syncthreads();
152  // mini verification
153  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
154  if (iv[i] != int(i))
155  assert(iv[iv[i]] != int(i));
156  }
157 #endif
158 
159 #ifdef GPU_DEBUG
160  // and verify that we did not spit any cluster...
161  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
162  auto minJ = i;
163  auto mdist = eps;
164  auto loop = [&](uint32_t j) {
165  if (nn[j] < nn[i])
166  return;
167  if (nn[j] == nn[i] && zt[j] >= zt[i])
168  return; // if equal use natural order...
169  auto dist = std::abs(zt[i] - zt[j]);
170  if (dist > mdist)
171  return;
172  if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
173  return;
174  mdist = dist;
175  minJ = j;
176  };
177  cms::cuda::forEachInBins(hist, izt[i], 1, loop);
178  // should belong to the same cluster...
179  assert(iv[i] == iv[minJ]);
180  assert(nn[i] <= nn[iv[i]]);
181  }
182  __syncthreads();
183 #endif
184 
185  __shared__ unsigned int foundClusters;
186  foundClusters = 0;
187  __syncthreads();
188 
189  // find the number of different clusters, identified by a tracks with clus[i] == i and density larger than threshold;
190  // mark these tracks with a negative id.
191  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
192  if (iv[i] == int(i)) {
193  if (nn[i] >= minT) {
194  auto old = atomicInc(&foundClusters, 0xffffffff);
195  iv[i] = -(old + 1);
196  } else { // noise
197  iv[i] = -9998;
198  }
199  }
200  }
201  __syncthreads();
202 
204 
205  // propagate the negative id to all the tracks in the cluster.
206  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
207  if (iv[i] >= 0) {
208  // mark each track in a cluster with the same id as the first one
209  iv[i] = iv[iv[i]];
210  }
211  }
212  __syncthreads();
213 
214  // adjust the cluster id to be a positive value starting from 0
215  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
216  iv[i] = -iv[i] - 1;
217  }
218 
220 
221  if (verbose && 0 == threadIdx.x)
222  printf("found %d proto vertices\n", foundClusters);
223  }
224 
225  __global__ void clusterTracksByDensityKernel(VtxSoAView pdata,
226  WsSoAView pws,
227  int minT, // min number of neighbours to be "seed"
228  float eps, // max absolute distance to cluster
229  float errmax, // max error to be "seed"
230  float chi2max // max normalized distance to cluster
231  ) {
232  clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
233  }

}  // namespace gpuVertexFinder

#endif  // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
/* Doxygen cross-reference residue from the rendered page (not part of the
   source file proper); kept for reference:
const dim3 threadIdx                        — cudaCompat.h:29
#define __forceinline__                     — cudaCompat.h:22
int32_t *__restrict__ iv
__device__ WsSoAView int float float float chi2max
static constexpr uint32_t MAXVTX
auto &__restrict__ data
#define __global__                          — cudaCompat.h:19
const dim3 blockDim                         — cudaCompat.h:30
__device__ WsSoAView int float eps
float const *__restrict__ zt
auto &__restrict__ ws
__device__ WsSoAView & pws
T1 atomicInc(T1 *a, T2 b)                   — cudaCompat.h:48
__device__ WsSoAView int minT
Abs< T >::type abs(const T &t)              — Abs.h:22
typename Base::Counter Counter
float const *__restrict__ ezt2
__shared__ Hist::Counter hws[32]
uint8_t *__restrict__ izt
static constexpr uint32_t nbins()
__device__ WsSoAView int float float errmax
gpuVertexFinder::workSpace::PixelVertexWorkSpaceSoAView WsSoAView
static constexpr uint32_t totbins()
int32_t *__restrict__ nn
constexpr auto capacity() const
zVertex::ZVertexSoAView VtxSoAView
#define __device__
__shared__ unsigned int foundClusters
static constexpr uint32_t MAXTRACKS
*/