CMS 3D CMS Logo

gpuFitVertices.h
Go to the documentation of this file.
1 #ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h
2 #define RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h
3 
4 #include <algorithm>
5 #include <cmath>
6 #include <cstdint>
7 
10 
11 #include "gpuVertexFinder.h"
12 
13 namespace gpuVertexFinder {
14 
15  __device__ __forceinline__ void fitVertices(ZVertices* pdata,
16  WorkSpace* pws,
17  float chi2Max // for outlier rejection
18  ) {
19  constexpr bool verbose = false; // in principle the compiler should optmize out if false
20 
21  auto& __restrict__ data = *pdata;
22  auto& __restrict__ ws = *pws;
23  auto nt = ws.ntrks;
24  float const* __restrict__ zt = ws.zt;
25  float const* __restrict__ ezt2 = ws.ezt2;
26  float* __restrict__ zv = data.zv;
27  float* __restrict__ wv = data.wv;
28  float* __restrict__ chi2 = data.chi2;
29  uint32_t& nvFinal = data.nvFinal;
30  uint32_t& nvIntermediate = ws.nvIntermediate;
31 
32  int32_t* __restrict__ nn = data.ndof;
33  int32_t* __restrict__ iv = ws.iv;
34 
35  assert(pdata);
36  assert(zt);
37 
40  auto foundClusters = nvFinal;
41 
42  // zero
43  for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) {
44  zv[i] = 0;
45  wv[i] = 0;
46  chi2[i] = 0;
47  }
48 
49  // only for test
50  __shared__ int noise;
51  if (verbose && 0 == threadIdx.x)
52  noise = 0;
53 
54  __syncthreads();
55 
56  // compute cluster location
57  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
58  if (iv[i] > 9990) {
59  if (verbose)
60  atomicAdd(&noise, 1);
61  continue;
62  }
63  assert(iv[i] >= 0);
64  assert(iv[i] < int(foundClusters));
65  auto w = 1.f / ezt2[i];
66  atomicAdd_block(&zv[iv[i]], zt[i] * w);
67  atomicAdd_block(&wv[iv[i]], w);
68  }
69 
70  __syncthreads();
71  // reuse nn
72  for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) {
73  assert(wv[i] > 0.f);
74  zv[i] /= wv[i];
75  nn[i] = -1; // ndof
76  }
77  __syncthreads();
78 
79  // compute chi2
80  for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
81  if (iv[i] > 9990)
82  continue;
83 
84  auto c2 = zv[iv[i]] - zt[i];
85  c2 *= c2 / ezt2[i];
86  if (c2 > chi2Max) {
87  iv[i] = 9999;
88  continue;
89  }
90  atomicAdd_block(&chi2[iv[i]], c2);
91  atomicAdd_block(&nn[iv[i]], 1);
92  }
93  __syncthreads();
94  for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x)
95  if (nn[i] > 0)
96  wv[i] *= float(nn[i]) / chi2[i];
97 
98  if (verbose && 0 == threadIdx.x)
99  printf("found %d proto clusters ", foundClusters);
100  if (verbose && 0 == threadIdx.x)
101  printf("and %d noise\n", noise);
102  }
103 
104  __global__ void fitVerticesKernel(ZVertices* pdata,
105  WorkSpace* pws,
106  float chi2Max // for outlier rejection
107  ) {
108  fitVertices(pdata, pws, chi2Max);
109  }
110 
111 } // namespace gpuVertexFinder
112 
113 #endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h
const dim3 threadIdx
Definition: cudaCompat.h:29
float *__restrict__ chi2
#define __forceinline__
Definition: cudaCompat.h:22
__device__ WorkSpace float chi2Max
int32_t *__restrict__ iv
T w() const
bool verbose
auto &__restrict__ data
#define __global__
Definition: cudaCompat.h:19
float *__restrict__ wv
float *__restrict__ zv
const dim3 blockDim
Definition: cudaCompat.h:30
float const *__restrict__ zt
auto &__restrict__ ws
double f[11][100]
float const *__restrict__ ezt2
__shared__ int noise
ZVertexSoA ZVertices
T1 atomicAdd_block(T1 *a, T2 b)
Definition: cudaCompat.h:68
int32_t *__restrict__ nn
#define __device__
__shared__ unsigned int foundClusters
T1 atomicAdd(T1 *a, T2 b)
Definition: cudaCompat.h:61