CMS 3D CMS Logo

gpuClusterTracksIterative.h
Go to the documentation of this file.
1 #ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
2 #define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
3 
4 #include <algorithm>
5 #include <cmath>
6 #include <cstdint>
7 
10 
11 #include "gpuVertexFinder.h"
12 
13 namespace gpuVertexFinder {
14 
15  // this algo does not really scale as it works in a single block...
16  // enough for <10K tracks we have
17  __global__ void clusterTracksIterative(VtxSoAView pdata,
18  WsSoAView pws,
19  int minT, // min number of neighbours to be "core"
20  float eps, // max absolute distance to cluster
21  float errmax, // max error to be "seed"
22  float chi2max // max normalized distance to cluster
23  ) {
24  constexpr bool verbose = false; // in principle the compiler should optmize out if false
25 
26  if (verbose && 0 == threadIdx.x)
27  printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
28 
29  auto er2mx = errmax * errmax;
30 
31  auto& __restrict__ data = pdata;
32  auto& __restrict__ ws = pws;
33  auto nt = ws.ntrks();
34  float const* __restrict__ zt = ws.zt();
35  float const* __restrict__ ezt2 = ws.ezt2();
36 
37  uint32_t& nvFinal = data.nvFinal();
38  uint32_t& nvIntermediate = ws.nvIntermediate();
39 
40  uint8_t* __restrict__ izt = ws.izt();
41  int32_t* __restrict__ nn = data.ndof();
42  int32_t* __restrict__ iv = ws.iv();
43 
44  assert(zt);
45  assert(nn);
46  assert(iv);
47  assert(ezt2);
48 
50  __shared__ Hist hist;
51  __shared__ typename Hist::Counter hws[32];
52  for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
53  hist.off[j] = 0;
54  }
55  __syncthreads();
56 
57  if (verbose && 0 == threadIdx.x)
58  printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
59 
60  assert((int)nt <= hist.capacity());
61 
62  // fill hist (bin shall be wider than "eps")
63  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
65  int iz = int(zt[i] * 10.); // valid if eps<=0.1
66  iz = std::clamp(iz, INT8_MIN, INT8_MAX);
67  izt[i] = iz - INT8_MIN;
68  assert(iz - INT8_MIN >= 0);
69  assert(iz - INT8_MIN < 256);
70  hist.count(izt[i]);
71  iv[i] = i;
72  nn[i] = 0;
73  }
74  __syncthreads();
75  if (threadIdx.x < 32)
76  hws[threadIdx.x] = 0; // used by prefix scan...
77  __syncthreads();
78  hist.finalize(hws);
79  __syncthreads();
80  assert(hist.size() == nt);
81  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
82  hist.fill(izt[i], uint16_t(i));
83  }
84  __syncthreads();
85 
86  // count neighbours
87  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
88  if (ezt2[i] > er2mx)
89  continue;
90  auto loop = [&](uint32_t j) {
91  if (i == j)
92  return;
93  auto dist = std::abs(zt[i] - zt[j]);
94  if (dist > eps)
95  return;
96  if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
97  return;
98  nn[i]++;
99  };
100 
101  cms::cuda::forEachInBins(hist, izt[i], 1, loop);
102  }
103 
104  __shared__ int nloops;
105  nloops = 0;
106 
107  __syncthreads();
108 
109  // cluster seeds only
110  bool more = true;
112  if (1 == nloops % 2) {
113  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
114  auto m = iv[i];
115  while (m != iv[m])
116  m = iv[m];
117  iv[i] = m;
118  }
119  } else {
120  more = false;
121  for (auto k = threadIdx.x; k < hist.size(); k += blockDim.x) {
122  auto p = hist.begin() + k;
123  auto i = (*p);
124  auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1));
125  if (nn[i] < minT)
126  continue; // DBSCAN core rule
127  auto loop = [&](uint32_t j) {
128  assert(i != j);
129  if (nn[j] < minT)
130  return; // DBSCAN core rule
131  auto dist = std::abs(zt[i] - zt[j]);
132  if (dist > eps)
133  return;
134  if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
135  return;
136  auto old = atomicMin(&iv[j], iv[i]);
137  if (old != iv[i]) {
138  // end the loop only if no changes were applied
139  more = true;
140  }
141  atomicMin(&iv[i], old);
142  };
143  ++p;
144  for (; p < hist.end(be); ++p)
145  loop(*p);
146  } // for i
147  }
148  if (threadIdx.x == 0)
149  ++nloops;
150  } // while
151 
152  // collect edges (assign to closest cluster of closest point??? here to closest point)
153  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
154  // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule
155  if (nn[i] >= minT)
156  continue; // DBSCAN edge rule
157  float mdist = eps;
158  auto loop = [&](int j) {
159  if (nn[j] < minT)
160  return; // DBSCAN core rule
161  auto dist = std::abs(zt[i] - zt[j]);
162  if (dist > mdist)
163  return;
164  if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
165  return; // needed?
166  mdist = dist;
167  iv[i] = iv[j]; // assign to cluster (better be unique??)
168  };
169  cms::cuda::forEachInBins(hist, izt[i], 1, loop);
170  }
171 
172  __shared__ unsigned int foundClusters;
173  foundClusters = 0;
174  __syncthreads();
175 
176  // find the number of different clusters, identified by a tracks with clus[i] == i;
177  // mark these tracks with a negative id.
178  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
179  if (iv[i] == int(i)) {
180  if (nn[i] >= minT) {
181  auto old = atomicInc(&foundClusters, 0xffffffff);
182  iv[i] = -(old + 1);
183  } else { // noise
184  iv[i] = -9998;
185  }
186  }
187  }
188  __syncthreads();
189 
191 
192  // propagate the negative id to all the tracks in the cluster.
193  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
194  if (iv[i] >= 0) {
195  // mark each track in a cluster with the same id as the first one
196  iv[i] = iv[iv[i]];
197  }
198  }
199  __syncthreads();
200 
201  // adjust the cluster id to be a positive value starting from 0
202  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
203  iv[i] = -iv[i] - 1;
204  }
205 
207 
208  if (verbose && 0 == threadIdx.x)
209  printf("found %d proto vertices\n", foundClusters);
210  }
211 
212 } // namespace gpuVertexFinder
213 
214 #endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
const dim3 threadIdx
Definition: cudaCompat.h:29
bool __syncthreads_or(bool x)
Definition: cudaCompat.h:134
int32_t *__restrict__ iv
__device__ WsSoAView int float float float chi2max
static constexpr uint32_t MAXVTX
auto &__restrict__ data
#define __global__
Definition: cudaCompat.h:19
const dim3 blockDim
Definition: cudaCompat.h:30
__device__ WsSoAView int float eps
float const *__restrict__ zt
auto &__restrict__ ws
__device__ WsSoAView & pws
cms::cuda::HistoContainer< uint8_t, 256, 16000, 8, uint16_t > Hist
T1 atomicInc(T1 *a, T2 b)
Definition: cudaCompat.h:48
__device__ WsSoAView int minT
Abs< T >::type abs(const T &t)
Definition: Abs.h:22
typename Base::Counter Counter
float const *__restrict__ ezt2
__shared__ Hist::Counter hws[32]
uint8_t *__restrict__ izt
static constexpr uint32_t nbins()
static constexpr UT bin(T t)
__device__ WsSoAView int float float errmax
gpuVertexFinder::workSpace::PixelVertexWorkSpaceSoAView WsSoAView
static constexpr uint32_t totbins()
int32_t *__restrict__ nn
constexpr auto capacity() const
T1 atomicMin(T1 *a, T2 b)
Definition: cudaCompat.h:85
zVertex::ZVertexSoAView VtxSoAView
__shared__ unsigned int foundClusters
static constexpr uint32_t MAXTRACKS