CMS 3D CMS Logo

gpuClusterTracksDBSCAN.h
Go to the documentation of this file.
1 #ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h
2 #define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h
3 
4 #include <algorithm>
5 #include <cmath>
6 #include <cstdint>
7 
10 
11 #include "gpuVertexFinder.h"
12 
13 namespace gpuVertexFinder {
14 
15  // this algo does not really scale as it works in a single block...
16  // enough for <10K tracks we have
17  __global__ void clusterTracksDBSCAN(VtxSoAView pdata,
18  WsSoAView pws,
19  int minT, // min number of neighbours to be "core"
20  float eps, // max absolute distance to cluster
21  float errmax, // max error to be "seed"
22  float chi2max // max normalized distance to cluster
23  ) {
24  constexpr bool verbose = false; // in principle the compiler should optmize out if false
25 
26  if (verbose && 0 == threadIdx.x)
27  printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
28 
29  auto er2mx = errmax * errmax;
30 
31  auto& __restrict__ data = pdata;
32  auto& __restrict__ ws = pws;
33  auto nt = ws.ntrks();
34  float const* __restrict__ zt = ws.zt();
35  float const* __restrict__ ezt2 = ws.ezt2();
36 
37  uint32_t& nvFinal = data.nvFinal();
38  uint32_t& nvIntermediate = ws.nvIntermediate();
39 
40  uint8_t* __restrict__ izt = ws.izt();
41  int32_t* __restrict__ nn = data.ndof();
42  int32_t* __restrict__ iv = ws.iv();
43 
44  assert(zt);
45  assert(iv);
46  assert(nn);
47  assert(ezt2);
48 
50  __shared__ Hist hist;
51  __shared__ typename Hist::Counter hws[32];
52  for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
53  hist.off[j] = 0;
54  }
55  __syncthreads();
56 
57  if (verbose && 0 == threadIdx.x)
58  printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
59 
60  assert((int)nt <= hist.capacity());
61 
62  // fill hist (bin shall be wider than "eps")
63  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
65  int iz = int(zt[i] * 10.); // valid if eps<=0.1
66  iz = std::clamp(iz, INT8_MIN, INT8_MAX);
67  izt[i] = iz - INT8_MIN;
68  assert(iz - INT8_MIN >= 0);
69  assert(iz - INT8_MIN < 256);
70  hist.count(izt[i]);
71  iv[i] = i;
72  nn[i] = 0;
73  }
74  __syncthreads();
75  if (threadIdx.x < 32)
76  hws[threadIdx.x] = 0; // used by prefix scan...
77  __syncthreads();
78  hist.finalize(hws);
79  __syncthreads();
80  assert(hist.size() == nt);
81  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
82  hist.fill(izt[i], uint16_t(i));
83  }
84  __syncthreads();
85 
86  // count neighbours
87  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
88  if (ezt2[i] > er2mx)
89  continue;
90  auto loop = [&](uint32_t j) {
91  if (i == j)
92  return;
93  auto dist = std::abs(zt[i] - zt[j]);
94  if (dist > eps)
95  return;
96  // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
97  nn[i]++;
98  };
99 
100  cms::cuda::forEachInBins(hist, izt[i], 1, loop);
101  }
102 
103  __syncthreads();
104 
105  // find NN with smaller z...
106  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
107  if (nn[i] < minT)
108  continue; // DBSCAN core rule
109  float mz = zt[i];
110  auto loop = [&](uint32_t j) {
111  if (zt[j] >= mz)
112  return;
113  if (nn[j] < minT)
114  return; // DBSCAN core rule
115  auto dist = std::abs(zt[i] - zt[j]);
116  if (dist > eps)
117  return;
118  // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
119  mz = zt[j];
120  iv[i] = j; // assign to cluster (better be unique??)
121  };
122  cms::cuda::forEachInBins(hist, izt[i], 1, loop);
123  }
124 
125  __syncthreads();
126 
127 #ifdef GPU_DEBUG
128  // mini verification
129  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
130  if (iv[i] != int(i))
131  assert(iv[iv[i]] != int(i));
132  }
133  __syncthreads();
134 #endif
135 
136  // consolidate graph (percolate index of seed)
137  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
138  auto m = iv[i];
139  while (m != iv[m])
140  m = iv[m];
141  iv[i] = m;
142  }
143 
144  __syncthreads();
145 
146 #ifdef GPU_DEBUG
147  // mini verification
148  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
149  if (iv[i] != int(i))
150  assert(iv[iv[i]] != int(i));
151  }
152  __syncthreads();
153 #endif
154 
155 #ifdef GPU_DEBUG
156  // and verify that we did not spit any cluster...
157  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
158  if (nn[i] < minT)
159  continue; // DBSCAN core rule
160  assert(zt[iv[i]] <= zt[i]);
161  auto loop = [&](uint32_t j) {
162  if (nn[j] < minT)
163  return; // DBSCAN core rule
164  auto dist = std::abs(zt[i] - zt[j]);
165  if (dist > eps)
166  return;
167  // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
168  // they should belong to the same cluster, isn't it?
169  if (iv[i] != iv[j]) {
170  printf("ERROR %d %d %f %f %d\n", i, iv[i], zt[i], zt[iv[i]], iv[iv[i]]);
171  printf(" %d %d %f %f %d\n", j, iv[j], zt[j], zt[iv[j]], iv[iv[j]]);
172  ;
173  }
174  assert(iv[i] == iv[j]);
175  };
176  cms::cuda::forEachInBins(hist, izt[i], 1, loop);
177  }
178  __syncthreads();
179 #endif
180 
181  // collect edges (assign to closest cluster of closest point??? here to closest point)
182  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
183  // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule
184  if (nn[i] >= minT)
185  continue; // DBSCAN edge rule
186  float mdist = eps;
187  auto loop = [&](uint32_t j) {
188  if (nn[j] < minT)
189  return; // DBSCAN core rule
190  auto dist = std::abs(zt[i] - zt[j]);
191  if (dist > mdist)
192  return;
193  if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
194  return; // needed?
195  mdist = dist;
196  iv[i] = iv[j]; // assign to cluster (better be unique??)
197  };
198  cms::cuda::forEachInBins(hist, izt[i], 1, loop);
199  }
200 
201  __shared__ unsigned int foundClusters;
202  foundClusters = 0;
203  __syncthreads();
204 
205  // find the number of different clusters, identified by a tracks with clus[i] == i;
206  // mark these tracks with a negative id.
207  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
208  if (iv[i] == int(i)) {
209  if (nn[i] >= minT) {
210  auto old = atomicInc(&foundClusters, 0xffffffff);
211  iv[i] = -(old + 1);
212  } else { // noise
213  iv[i] = -9998;
214  }
215  }
216  }
217  __syncthreads();
218 
220 
221  // propagate the negative id to all the tracks in the cluster.
222  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
223  if (iv[i] >= 0) {
224  // mark each track in a cluster with the same id as the first one
225  iv[i] = iv[iv[i]];
226  }
227  }
228  __syncthreads();
229 
230  // adjust the cluster id to be a positive value starting from 0
231  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
232  iv[i] = -iv[i] - 1;
233  }
234 
236 
237  if (verbose && 0 == threadIdx.x)
238  printf("found %d proto vertices\n", foundClusters);
239  }
240 
241 } // namespace gpuVertexFinder
242 
243 #endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h
const dim3 threadIdx
Definition: cudaCompat.h:29
int32_t *__restrict__ iv
__device__ WsSoAView int float float float chi2max
static constexpr uint32_t MAXVTX
auto &__restrict__ data
#define __global__
Definition: cudaCompat.h:19
const dim3 blockDim
Definition: cudaCompat.h:30
__device__ WsSoAView int float eps
float const *__restrict__ zt
auto &__restrict__ ws
__device__ WsSoAView & pws
cms::cuda::HistoContainer< uint8_t, 256, 16000, 8, uint16_t > Hist
T1 atomicInc(T1 *a, T2 b)
Definition: cudaCompat.h:48
__device__ WsSoAView int minT
Abs< T >::type abs(const T &t)
Definition: Abs.h:22
typename Base::Counter Counter
float const *__restrict__ ezt2
__shared__ Hist::Counter hws[32]
uint8_t *__restrict__ izt
static constexpr uint32_t nbins()
__device__ WsSoAView int float float errmax
gpuVertexFinder::workSpace::PixelVertexWorkSpaceSoAView WsSoAView
static constexpr uint32_t totbins()
int32_t *__restrict__ nn
constexpr auto capacity() const
zVertex::ZVertexSoAView VtxSoAView
__shared__ unsigned int foundClusters
static constexpr uint32_t MAXTRACKS