
gpuClusterTracksByDensity.h
#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h

#include <algorithm>
#include <cmath>
#include <cstdint>

#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

#include "gpuVertexFinder.h"

namespace gpuVertexFinder {

  // this algorithm does not really scale, as it works in a single block...
  // good enough for the <10K tracks we have
  //
  // based on the clustering-by-density algorithm of Rodriguez & Laio
  //
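  // The procedure, all within one block:
  //  1) bin the tracks in z in a shared-memory histogram (bins wider than eps);
  //  2) for each track, count the neighbours compatible within eps and chi2max: its local density nn[i];
  //  3) link each track to its nearest neighbour of higher density;
  //  4) follow the links so that every track points to its density peak (the cluster seed);
  //  5) promote peaks with at least minT neighbours to proto-vertices and flag the rest as noise.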
  __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata,
                                                          gpuVertexFinder::WorkSpace* pws,
                                                          int minT,      // min number of neighbours to be "seed"
                                                          float eps,     // max absolute distance to cluster
                                                          float errmax,  // max error to be "seed"
                                                          float chi2max  // max normalized distance to cluster
  ) {
    using namespace gpuVertexFinder;
    constexpr bool verbose = false;  // in principle the compiler should optimize this out when false

    if (verbose && 0 == threadIdx.x)
      printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);

    auto er2mx = errmax * errmax;

    auto& __restrict__ data = *pdata;
    auto& __restrict__ ws = *pws;
    auto nt = ws.ntrks;
    float const* __restrict__ zt = ws.zt;
    float const* __restrict__ ezt2 = ws.ezt2;

    uint32_t& nvFinal = data.nvFinal;
    uint32_t& nvIntermediate = ws.nvIntermediate;

    uint8_t* __restrict__ izt = ws.izt;
    int32_t* __restrict__ nn = data.ndof;
    int32_t* __restrict__ iv = ws.iv;

    assert(pdata);
    assert(zt);

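    // Book a shared-memory histogram of the binned track z positions; it provides the fast
    // neighbour lookups (via cms::cuda::forEachInBins) used by all the loops below.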
    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;  // z-bin histogram; exact template parameters assumed
    __shared__ Hist hist;
    __shared__ typename Hist::Counter hws[32];
    for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
      hist.off[j] = 0;
    }
    __syncthreads();

    if (verbose && 0 == threadIdx.x)
      printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);

    assert((int)nt <= hist.capacity());

    // fill hist (bin shall be wider than "eps")
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      assert(i < ZVertices::MAXTRACKS);
      int iz = int(zt[i] * 10.);  // valid if eps <= 0.1
      // iz = std::clamp(iz, INT8_MIN, INT8_MAX);  // sorry, c++17 only
      iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
      izt[i] = iz - INT8_MIN;
      assert(iz - INT8_MIN >= 0);
      assert(iz - INT8_MIN < 256);
      hist.count(izt[i]);
      iv[i] = i;
      nn[i] = 0;
    }
    __syncthreads();
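    // hist.finalize() runs a block-wide prefix scan over the per-bin counts (hws is its scratch
    // space), turning them into bin offsets; the fill loop below then scatters the track indices
    // into their bins.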
    if (threadIdx.x < 32)
      hws[threadIdx.x] = 0;  // used by prefix scan...
    __syncthreads();
    hist.finalize(hws);
    __syncthreads();
    assert(hist.size() == nt);
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      hist.fill(izt[i], uint16_t(i));
    }
    __syncthreads();

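    // forEachInBins(hist, izt[i], 1, loop) visits the tracks in track i's own z bin and in the
    // two adjacent bins; since a bin is wider than eps, this covers every track within eps of i.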
    // count neighbours
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (ezt2[i] > er2mx)
        continue;
      auto loop = [&](uint32_t j) {
        if (i == j)
          return;
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > eps)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;
        nn[i]++;
      };

      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

    __syncthreads();

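    // Rodriguez-Laio step: attach each track to its nearest neighbour of higher local density
    // (ties in nn broken by the z ordering), so that following these links always leads uphill
    // towards a local density maximum.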
    // find closest above me .... (we ignore the possibility of two j at same distance from i)
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      float mdist = eps;
      auto loop = [&](uint32_t j) {
        if (nn[j] < nn[i])
          return;
        if (nn[j] == nn[i] && zt[j] >= zt[i])
          return;  // if equal use natural order...
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > mdist)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;  // (break natural order???)
        mdist = dist;
        iv[i] = j;  // assign to cluster (better be unique??)
      };
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

    __syncthreads();

#ifdef GPU_DEBUG
    // mini verification
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] != int(i))
        assert(iv[iv[i]] != int(i));
    }
    __syncthreads();
#endif

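    // Each non-seed track currently points to a denser neighbour; chase the links until a fixed
    // point (iv[m] == m) is reached, so that iv[i] ends up holding the index of its density peak.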
    // consolidate graph (percolate index of seed)
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      auto m = iv[i];
      while (m != iv[m])
        m = iv[m];
      iv[i] = m;
    }

#ifdef GPU_DEBUG
    __syncthreads();
    // mini verification
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] != int(i))
        assert(iv[iv[i]] != int(i));
    }
#endif

#ifdef GPU_DEBUG
    // and verify that we did not split any cluster...
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      auto minJ = i;
      auto mdist = eps;
      auto loop = [&](uint32_t j) {
        if (nn[j] < nn[i])
          return;
        if (nn[j] == nn[i] && zt[j] >= zt[i])
          return;  // if equal use natural order...
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > mdist)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;
        mdist = dist;
        minJ = j;
      };
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
      // should belong to the same cluster...
      assert(iv[i] == iv[minJ]);
      assert(nn[i] <= nn[iv[i]]);
    }
    __syncthreads();
#endif

    __shared__ unsigned int foundClusters;
    foundClusters = 0;
    __syncthreads();

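    // A negative id distinguishes an already-assigned cluster id from a seed index: in the
    // propagation loop below, a non-negative iv[i] is still read as "index of track i's seed".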
    // find the number of different clusters, identified by a track with iv[i] == i and a density
    // larger than the threshold; mark these tracks with a negative id.
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] == int(i)) {
        if (nn[i] >= minT) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          iv[i] = -(old + 1);
        } else {  // noise
          iv[i] = -9998;
        }
      }
    }
    __syncthreads();

    assert(foundClusters < ZVertices::MAXVTX);

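    // At this point each seed holds a negative cluster id and every other track holds the index
    // of its seed, so a single iv[i] = iv[iv[i]] pass copies the seed's id to the whole cluster.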
    // propagate the negative id to all the tracks in the cluster.
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] >= 0) {
        // mark each track in a cluster with the same id as the first one
        iv[i] = iv[iv[i]];
      }
    }
    __syncthreads();

    // adjust the cluster id to be a positive value starting from 0
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      iv[i] = -iv[i] - 1;
    }

    nvIntermediate = nvFinal = foundClusters;

    if (verbose && 0 == threadIdx.x)
      printf("found %d proto vertices\n", foundClusters);
  }

  __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata,
                                               gpuVertexFinder::WorkSpace* pws,
                                               int minT,      // min number of neighbours to be "seed"
                                               float eps,     // max absolute distance to cluster
                                               float errmax,  // max error to be "seed"
                                               float chi2max  // max normalized distance to cluster
  ) {
    clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
  }

}  // namespace gpuVertexFinder

#endif  // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h