CMS 3D CMS Logo

gpuSplitVertices.h
Go to the documentation of this file.
1 #ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h
2 #define RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h
3 
4 #include <algorithm>
5 #include <cmath>
6 #include <cstdint>
7 
10 
11 #include "gpuVertexFinder.h"
12 
13 namespace gpuVertexFinder {
14 
16  constexpr bool verbose = false; // in principle the compiler should optmize out if false
17 
18  auto& __restrict__ data = pdata;
19  auto& __restrict__ ws = pws;
20  auto nt = ws.ntrks();
21  float const* __restrict__ zt = ws.zt();
22  float const* __restrict__ ezt2 = ws.ezt2();
23  float* __restrict__ zv = data.zv();
24  float* __restrict__ wv = data.wv();
25  float const* __restrict__ chi2 = data.chi2();
26  uint32_t& nvFinal = data.nvFinal();
27 
28  int32_t const* __restrict__ nn = data.ndof();
29  int32_t* __restrict__ iv = ws.iv();
30 
31  assert(zt);
32  assert(wv);
33  assert(chi2);
34  assert(nn);
35 
36  // one vertex per block
37  for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) {
38  if (nn[kv] < 4)
39  continue;
40  if (chi2[kv] < maxChi2 * float(nn[kv]))
41  continue;
42 
43  constexpr int MAXTK = 512;
44  assert(nn[kv] < MAXTK);
45  if (nn[kv] >= MAXTK)
46  continue; // too bad FIXME
47  __shared__ uint32_t it[MAXTK]; // track index
48  __shared__ float zz[MAXTK]; // z pos
49  __shared__ uint8_t newV[MAXTK]; // 0 or 1
50  __shared__ float ww[MAXTK]; // z weight
51 
52  __shared__ uint32_t nq; // number of track for this vertex
53  nq = 0;
54  __syncthreads();
55 
56  // copy to local
57  for (auto k = threadIdx.x; k < nt; k += blockDim.x) {
58  if (iv[k] == int(kv)) {
59  auto old = atomicInc(&nq, MAXTK);
60  zz[old] = zt[k] - zv[kv];
61  newV[old] = zz[old] < 0 ? 0 : 1;
62  ww[old] = 1.f / ezt2[k];
63  it[old] = k;
64  }
65  }
66 
67  __shared__ float znew[2], wnew[2]; // the new vertices
68 
69  __syncthreads();
70  assert(int(nq) == nn[kv] + 1);
71 
72  int maxiter = 20;
73  // kt-min....
74  bool more = true;
75  while (__syncthreads_or(more)) {
76  more = false;
77  if (0 == threadIdx.x) {
78  znew[0] = 0;
79  znew[1] = 0;
80  wnew[0] = 0;
81  wnew[1] = 0;
82  }
83  __syncthreads();
84  for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
85  auto i = newV[k];
86  atomicAdd(&znew[i], zz[k] * ww[k]);
87  atomicAdd(&wnew[i], ww[k]);
88  }
89  __syncthreads();
90  if (0 == threadIdx.x) {
91  znew[0] /= wnew[0];
92  znew[1] /= wnew[1];
93  }
94  __syncthreads();
95  for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
96  auto d0 = fabs(zz[k] - znew[0]);
97  auto d1 = fabs(zz[k] - znew[1]);
98  auto newer = d0 < d1 ? 0 : 1;
99  more |= newer != newV[k];
100  newV[k] = newer;
101  }
102  --maxiter;
103  if (maxiter <= 0)
104  more = false;
105  }
106 
107  // avoid empty vertices
108  if (0 == wnew[0] || 0 == wnew[1])
109  continue;
110 
111  // quality cut
112  auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]);
113 
114  auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]);
115 
116  if (verbose && 0 == threadIdx.x)
117  printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]);
118 
119  if (chi2Dist < 4)
120  continue;
121 
122  // get a new global vertex
123  __shared__ uint32_t igv;
124  if (0 == threadIdx.x)
125  igv = atomicAdd(&ws.nvIntermediate(), 1);
126  __syncthreads();
127  for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
128  if (1 == newV[k])
129  iv[it[k]] = igv;
130  }
131 
132  } // loop on vertices
133  }
134 
135  __global__ void splitVerticesKernel(VtxSoAView pdata, WsSoAView pws, float maxChi2) {
136  splitVertices(pdata, pws, maxChi2);
137  }
138 
139 } // namespace gpuVertexFinder
140 
141 #endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h
const dim3 threadIdx
Definition: cudaCompat.h:29
__device__ WsSoAView float maxChi2
bool __syncthreads_or(bool x)
Definition: cudaCompat.h:134
float *__restrict__ chi2
#define __forceinline__
Definition: cudaCompat.h:22
int32_t *__restrict__ iv
const dim3 gridDim
Definition: cudaCompat.h:33
auto &__restrict__ data
#define __global__
Definition: cudaCompat.h:19
float *__restrict__ wv
float *__restrict__ zv
const dim3 blockDim
Definition: cudaCompat.h:30
float const *__restrict__ zt
auto &__restrict__ ws
__device__ WsSoAView & pws
T1 atomicInc(T1 *a, T2 b)
Definition: cudaCompat.h:48
static const MaxIter maxiter
float const *__restrict__ ezt2
splitVertices(pdata, pws, maxChi2ForSplit)
const dim3 blockIdx
Definition: cudaCompat.h:32
static constexpr float d0
gpuVertexFinder::workSpace::PixelVertexWorkSpaceSoAView WsSoAView
int32_t *__restrict__ nn
static constexpr float d1
zVertex::ZVertexSoAView VtxSoAView
#define __device__
T1 atomicAdd(T1 *a, T2 b)
Definition: cudaCompat.h:61