gpuVertexFinder.cc
#include "gpuVertexFinder.h"

#include "gpuClusterTracksByDensity.h"
#include "gpuClusterTracksDBSCAN.h"
#include "gpuClusterTracksIterative.h"
#include "gpuFitVertices.h"
#include "gpuSortByPt2.h"
#include "gpuSplitVertices.h"

#undef PIXVERTEX_DEBUG_PRODUCE

namespace gpuVertexFinder {

  // reject outlier tracks that contribute more than this to the chi2 of the vertex fit
  constexpr float maxChi2ForFirstFit = 50.f;
  constexpr float maxChi2ForFinalFit = 5000.f;

  // split vertices with a chi2/NDoF greater than this
  constexpr float maxChi2ForSplit = 9.f;

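  // Copy into the WorkSpace the tracks that survive the pT and quality cuts.
  // Each thread walks the track SoA with a grid-stride loop; accepted tracks are
  // compacted into the workspace arrays through an atomicAdd on the track counter.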
  __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin) {
    assert(ptracks);
    assert(soa);
    auto const& tracks = *ptracks;
    auto const& fit = tracks.stateAtBS;
    auto const* quality = tracks.qualityData();

    auto first = blockIdx.x * blockDim.x + threadIdx.x;
    for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) {
      auto nHits = tracks.nHits(idx);
      if (nHits == 0)
        break;  // this is a guard: maybe we need to move to nTracks...

      // initialize soa...
      soa->idv[idx] = -1;

      if (tracks.isTriplet(idx))
        continue;  // no triplets
      if (quality[idx] != pixelTrack::Quality::loose)
        continue;  // only loose tracks

      auto pt = tracks.pt(idx);

      if (pt < ptMin)
        continue;

      auto& data = *pws;
      auto it = atomicAdd(&data.ntrks, 1);
      data.itrk[it] = idx;
      data.zt[it] = tracks.zip(idx);
      data.ezt2[it] = fit.covariance(idx)(14);
      data.ptt2[it] = pt * pt;
    }
  }

// #define THREE_KERNELS
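// When THREE_KERNELS is defined, clustering + first fit, vertex splitting, and final
// fit + sort run as three separate kernel launches, so splitVertices can use a full
// grid (one block per vertex). In the default single-kernel version all phases run in
// a single block and are separated only by __syncthreads().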
#ifndef THREE_KERNELS
  __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
                                        gpuVertexFinder::WorkSpace* pws,
                                        int minT,      // min number of neighbours to be "seed"
                                        float eps,     // max absolute distance to cluster
                                        float errmax,  // max error to be "seed"
                                        float chi2max  // max normalized distance to cluster
  ) {
    clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
    __syncthreads();
    fitVertices(pdata, pws, maxChi2ForFirstFit);
    __syncthreads();
    splitVertices(pdata, pws, maxChi2ForSplit);
    __syncthreads();
    fitVertices(pdata, pws, maxChi2ForFinalFit);
    __syncthreads();
    sortByPt2(pdata, pws);
  }
#else
  __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata,
                                      gpuVertexFinder::WorkSpace* pws,
                                      int minT,      // min number of neighbours to be "seed"
                                      float eps,     // max absolute distance to cluster
                                      float errmax,  // max error to be "seed"
                                      float chi2max  // max normalized distance to cluster
  ) {
    clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
    __syncthreads();
    fitVertices(pdata, pws, maxChi2ForFirstFit);
  }

  __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) {
    fitVertices(pdata, pws, maxChi2ForFinalFit);
    __syncthreads();
    sortByPt2(pdata, pws);
  }
#endif

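// The producer below compiles twice: with nvcc (__CUDACC__) it builds makeAsync(), which
// launches the CUDA kernels on a stream; on the host-only path it builds make(), where the
// same __global__ functions are invoked as plain functions through the cudaCompat layer.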
#ifdef __CUDACC__
  ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const {
#ifdef PIXVERTEX_DEBUG_PRODUCE
    std::cout << "producing Vertices on GPU" << std::endl;
#endif  // PIXVERTEX_DEBUG_PRODUCE
    ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream));
#else
  ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const {
#ifdef PIXVERTEX_DEBUG_PRODUCE
    std::cout << "producing Vertices on CPU" << std::endl;
#endif  // PIXVERTEX_DEBUG_PRODUCE
    ZVertexHeterogeneous vertices(std::make_unique<ZVertexSoA>());
#endif
    assert(tksoa);
    auto* soa = vertices.get();
    assert(soa);

#ifdef __CUDACC__
    auto ws_d = cms::cuda::make_device_unique<WorkSpace>(stream);
#else
    auto ws_d = std::make_unique<WorkSpace>();
#endif

#ifdef __CUDACC__
    init<<<1, 1, 0, stream>>>(soa, ws_d.get());
    auto blockSize = 128;
    auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize;
    loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin);
    cudaCheck(cudaGetLastError());
#else
    init(soa, ws_d.get());
    loadTracks(tksoa, soa, ws_d.get(), ptMin);
#endif

#ifdef __CUDACC__
    // Running too many threads leads to problems when printf is enabled.
    constexpr int maxThreadsForPrint = 1024 - 128;
    constexpr int numBlocks = 1024;
    constexpr int threadsPerBlock = 128;

    if (oneKernel_) {
      // implemented only for density clustering
#ifndef THREE_KERNELS
      vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
#else
      vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
      cudaCheck(cudaGetLastError());
      // one block per vertex...
      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
      cudaCheck(cudaGetLastError());
      vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
#endif
    } else {  // five kernels
      if (useDensity_) {
        clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
      } else if (useDBSCAN_) {
        clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
      } else if (useIterative_) {
        clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
      }
      cudaCheck(cudaGetLastError());
      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit);
      cudaCheck(cudaGetLastError());
      // one block per vertex...
      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
      cudaCheck(cudaGetLastError());
      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit);
      cudaCheck(cudaGetLastError());
      sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
    }
    cudaCheck(cudaGetLastError());
#else  // __CUDACC__
    if (useDensity_) {
      clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max);
    } else if (useDBSCAN_) {
      clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max);
    } else if (useIterative_) {
      clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max);
    }
#ifdef PIXVERTEX_DEBUG_PRODUCE
    std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
#endif  // PIXVERTEX_DEBUG_PRODUCE
    fitVertices(soa, ws_d.get(), maxChi2ForFirstFit);
    // one block per vertex!
    splitVertices(soa, ws_d.get(), maxChi2ForSplit);
    fitVertices(soa, ws_d.get(), maxChi2ForFinalFit);
    sortByPt2(soa, ws_d.get());
#endif

    return vertices;
  }

}  // namespace gpuVertexFinder
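
As an illustration of how this producer is driven, here is a minimal, hypothetical host-side sketch. The Producer constructor arguments and the cut values are assumptions inferred from the data members used above (oneKernel_, useDensity_, useDBSCAN_, useIterative_, minT, eps, errmax, chi2max); consult gpuVertexFinder.h for the actual signature.

// Hypothetical usage sketch -- the Producer constructor signature below is an
// assumption, not taken from this file; see gpuVertexFinder.h for the real one.
#include "gpuVertexFinder.h"

ZVertexHeterogeneous runVertexing(cudaStream_t stream, TkSoA const* tracksOnDevice) {
  // Density-based clustering in a single kernel, with illustrative cut values.
  gpuVertexFinder::Producer producer(true,   // oneKernel
                                     true,   // useDensity
                                     false,  // useDBSCAN
                                     false,  // useIterative
                                     2,      // minT: min number of neighbours to be "seed"
                                     0.07f,  // eps: max absolute distance to cluster
                                     0.01f,  // errmax: max error to be "seed"
                                     9.f);   // chi2max: max normalized distance to cluster
  // Asynchronous on the given stream; synchronize before reading the result.
  return producer.makeAsync(stream, tracksOnDevice, 0.5f /* ptMin, GeV */);
}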