// RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h

#include <algorithm>
#include <cmath>
#include <cstdint>

#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

#include "gpuVertexFinder.h"

namespace gpuVertexFinder {

  // Cluster tracks into proto-vertices along z with an iterative, DBSCAN-like
  // union-find scheme.
  //
  // This algo does not really scale as it works in a single block...
  // enough for <10K tracks we have.
  //
  // Launch expectations (from the code below): a single cooperative block —
  // every loop strides by blockDim.x only and block-wide barriers
  // (__syncthreads / __syncthreads_or) order the phases. Uses a static
  // __shared__ HistoContainer (256 bins, 16000-track capacity) plus a 32-entry
  // scratch array for its prefix scan.
  //
  // Inputs (via WorkSpace):  ws.ntrks, ws.zt (track z), ws.ezt2 (z error^2).
  // Outputs: ws.iv[i] = cluster id (0-based) per track, data.ndof reused as a
  //          neighbour counter, data.nvFinal / ws.nvIntermediate = number of
  //          clusters found.
  __global__ void clusterTracksIterative(ZVertices* pdata,
                                         WorkSpace* pws,
                                         int minT,      // min number of neighbours to be "core"
                                         float eps,     // max absolute distance to cluster
                                         float errmax,  // max error to be "seed"
                                         float chi2max  // max normalized distance to cluster
  ) {
    constexpr bool verbose = false;  // in principle the compiler should optimize out if false

    if (verbose && 0 == threadIdx.x)
      printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);

    auto er2mx = errmax * errmax;

    auto& __restrict__ data = *pdata;
    auto& __restrict__ ws = *pws;
    auto nt = ws.ntrks;
    float const* __restrict__ zt = ws.zt;
    float const* __restrict__ ezt2 = ws.ezt2;

    uint32_t& nvFinal = data.nvFinal;
    uint32_t& nvIntermediate = ws.nvIntermediate;

    uint8_t* __restrict__ izt = ws.izt;
    int32_t* __restrict__ nn = data.ndof;  // reused as neighbour count during clustering
    int32_t* __restrict__ iv = ws.iv;

    assert(pdata);
    assert(zt);

    // Histogram over the (clamped) integer z bin of each track; used to limit
    // the neighbour search to nearby bins.
    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
    __shared__ Hist hist;
    __shared__ typename Hist::Counter hws[32];
    for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
      hist.off[j] = 0;
    }
    __syncthreads();

    if (verbose && 0 == threadIdx.x)
      printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);

    assert((int)nt <= hist.capacity());

    // fill hist (bin shall be wider than "eps")
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      assert(i < ZVertices::MAXTRACKS);
      int iz = int(zt[i] * 10.);  // valid if eps<=0.1 (bin width 0.1 in z)
      iz = std::clamp(iz, INT8_MIN, INT8_MAX);
      izt[i] = iz - INT8_MIN;  // shift to [0,255] for the uint8_t bin key
      assert(iz - INT8_MIN >= 0);
      assert(iz - INT8_MIN < 256);
      hist.count(izt[i]);
      iv[i] = i;  // union-find: each track starts as its own cluster root
      nn[i] = 0;
    }
    __syncthreads();
    if (threadIdx.x < 32)
      hws[threadIdx.x] = 0;  // used by prefix scan...
    __syncthreads();
    hist.finalize(hws);
    __syncthreads();
    assert(hist.size() == nt);
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      hist.fill(izt[i], uint16_t(i));
    }
    __syncthreads();

    // count neighbours: tracks closer than eps with compatible errors
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (ezt2[i] > er2mx)
        continue;  // too large an error to be a "seed"
      auto loop = [&](uint32_t j) {
        if (i == j)
          return;
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > eps)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;
        nn[i]++;
      };

      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

    __shared__ int nloops;
    nloops = 0;

    __syncthreads();

    // cluster seeds only: iterate until a full pass applies no change.
    // Even passes merge cores (atomicMin toward the smaller root id);
    // odd passes compress the union-find paths.
    bool more = true;
    while (__syncthreads_or(more)) {
      if (1 == nloops % 2) {
        // path compression: point every track directly at its root
        for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
          auto m = iv[i];
          while (m != iv[m])
            m = iv[m];
          iv[i] = m;
        }
      } else {
        more = false;
        for (auto k = threadIdx.x; k < hist.size(); k += blockDim.x) {
          auto p = hist.begin() + k;
          auto i = (*p);
          auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1));
          if (nn[i] < minT)
            continue;  // DBSCAN core rule
          auto loop = [&](uint32_t j) {
            assert(i != j);
            if (nn[j] < minT)
              return;  // DBSCAN core rule
            auto dist = std::abs(zt[i] - zt[j]);
            if (dist > eps)
              return;
            if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
              return;
            auto old = atomicMin(&iv[j], iv[i]);
            if (old != iv[i]) {
              // end the loop only if no changes were applied
              more = true;
            }
            atomicMin(&iv[i], old);
          };
          ++p;
          for (; p < hist.end(be); ++p)
            loop(*p);
        }  // for i
      }
      if (threadIdx.x == 0)
        ++nloops;
    }  // while

    // collect edges (assign to closest cluster of closest point??? here to closest point)
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      // if (nn[i]==0 || nn[i]>=minT) continue;    // DBSCAN edge rule
      if (nn[i] >= minT)
        continue;  // DBSCAN edge rule
      float mdist = eps;
      auto loop = [&](int j) {
        if (nn[j] < minT)
          return;  // DBSCAN core rule
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > mdist)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;  // needed?
        mdist = dist;
        iv[i] = iv[j];  // assign to cluster (better be unique??)
      };
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

    __shared__ unsigned int foundClusters;
    foundClusters = 0;
    __syncthreads();

    // find the number of different clusters, identified by a tracks with clus[i] == i;
    // mark these tracks with a negative id.
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] == int(i)) {
        if (nn[i] >= minT) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          iv[i] = -(old + 1);
        } else {  // noise
          iv[i] = -9998;
        }
      }
    }
    __syncthreads();

    assert(foundClusters < ZVertices::MAXVTX);

    // propagate the negative id to all the tracks in the cluster.
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] >= 0) {
        // mark each track in a cluster with the same id as the first one
        iv[i] = iv[iv[i]];
      }
    }
    __syncthreads();

    // adjust the cluster id to be a positive value starting from 0
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      iv[i] = -iv[i] - 1;
    }

    // publish the cluster count for the downstream fit/sort kernels
    nvIntermediate = nvFinal = foundClusters;

    if (verbose && 0 == threadIdx.x)
      printf("found %d proto vertices\n", foundClusters);
  }

}  // namespace gpuVertexFinder

#endif  // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
const dim3 threadIdx
Definition: cudaCompat.h:29
bool __syncthreads_or(bool x)
Definition: cudaCompat.h:134
int32_t *__restrict__ iv
WorkSpace int float eps
bool verbose
auto &__restrict__ data
#define __global__
Definition: cudaCompat.h:19
const dim3 blockDim
Definition: cudaCompat.h:30
static constexpr uint32_t MAXVTX
Definition: ZVertexSoA.h:12
float const *__restrict__ zt
auto &__restrict__ ws
WorkSpace int float float float chi2max
cms::cuda::HistoContainer< uint8_t, 256, 16000, 8, uint16_t > Hist
T1 atomicInc(T1 *a, T2 b)
Definition: cudaCompat.h:48
Abs< T >::type abs(const T &t)
Definition: Abs.h:22
typename Base::Counter Counter
float const *__restrict__ ezt2
__shared__ Hist::Counter hws[32]
uint8_t *__restrict__ izt
WorkSpace int float float errmax
__shared__ Hist hist
static constexpr uint32_t nbins()
ZVertexSoA ZVertices
static constexpr UT bin(T t)
static constexpr uint32_t totbins()
int32_t *__restrict__ nn
static constexpr uint32_t MAXTRACKS
Definition: ZVertexSoA.h:11
constexpr auto capacity() const
T1 atomicMin(T1 *a, T2 b)
Definition: cudaCompat.h:85
__shared__ unsigned int foundClusters