gpuVertexFinder.cc
#include "gpuVertexFinder.h"

#include "gpuClusterTracksByDensity.h"
#include "gpuClusterTracksDBSCAN.h"
#include "gpuClusterTracksIterative.h"
#include "gpuFitVertices.h"
#include "gpuSortByPt2.h"
#include "gpuSplitVertices.h"

#undef PIXVERTEX_DEBUG_PRODUCE

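// Pixel-vertex reconstruction pipeline: selected tracks are loaded into a
// workspace, clustered along z, fitted, split when the fit is poor, refitted,
// and finally sorted by the sum of squared pt of their tracks.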
namespace gpuVertexFinder {

  // reject outlier tracks that contribute more than this to the chi2 of the vertex fit
  constexpr float maxChi2ForFirstFit = 50.f;
  constexpr float maxChi2ForFinalFit = 5000.f;

  // split vertices with a chi2/NDoF greater than this
  constexpr float maxChi2ForSplit = 9.f;

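  // Load into the workspace the tracks that pass the vertexing selection:
  // quadruplets only (no triplets) and pt above ptMin, with pt clamped at ptMax.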
  __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) {
    assert(ptracks);
    assert(soa);
    auto const& tracks = *ptracks;
    auto const& fit = tracks.stateAtBS;
    auto const* quality = tracks.qualityData();

    auto first = blockIdx.x * blockDim.x + threadIdx.x;
    for (int idx = first, nt = tracks.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) {
      auto nHits = tracks.nHits(idx);
      assert(nHits >= 3);

      // initialize soa...
      soa->idv[idx] = -1;

      if (tracks.isTriplet(idx))
        continue;  // no triplets
      // quality cut; the exact threshold here is reconstructed (the original condition was lost)
      if (quality[idx] < pixelTrack::Quality::highPurity)
        continue;

      auto pt = tracks.pt(idx);

      if (pt < ptMin)
        continue;

      // clamp pt
      pt = std::min(pt, ptMax);

      auto& data = *pws;
      auto it = atomicAdd(&data.ntrks, 1);
      data.itrk[it] = idx;
      data.zt[it] = tracks.zip(idx);
      data.ezt2[it] = fit.covariance(idx)(14);
      data.ptt2[it] = pt * pt;
    }
  }

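  // By default the whole chain (cluster -> fit -> split -> fit -> sort) runs in a
  // single kernel launch; defining THREE_KERNELS splits it into three launches so
  // that the vertex-splitting step can run with one block per vertex.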
// #define THREE_KERNELS
#ifndef THREE_KERNELS
  __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
                                        gpuVertexFinder::WorkSpace* pws,
                                        int minT,      // min number of neighbours to be "seed"
                                        float eps,     // max absolute distance to cluster
                                        float errmax,  // max error to be "seed"
                                        float chi2max  // max normalized distance to cluster
  ) {
    clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
    __syncthreads();
    fitVertices(pdata, pws, maxChi2ForFirstFit);
    __syncthreads();
    splitVertices(pdata, pws, maxChi2ForSplit);
    __syncthreads();
    fitVertices(pdata, pws, maxChi2ForFinalFit);
    __syncthreads();
    sortByPt2(pdata, pws);
  }
#else
  __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata,
                                      gpuVertexFinder::WorkSpace* pws,
                                      int minT,      // min number of neighbours to be "seed"
                                      float eps,     // max absolute distance to cluster
                                      float errmax,  // max error to be "seed"
                                      float chi2max  // max normalized distance to cluster
  ) {
    clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
    __syncthreads();
    fitVertices(pdata, pws, maxChi2ForFirstFit);
  }

  __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) {
    fitVertices(pdata, pws, maxChi2ForFinalFit);
    __syncthreads();
    sortByPt2(pdata, pws);
  }
#endif

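  // Entry points: makeAsync() runs the chain on the GPU using the given CUDA stream,
  // make() runs the same code path serially on the CPU; the two share this source
  // through the __CUDACC__ preprocessor switch.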
#ifdef __CUDACC__
  ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin, float ptMax) const {
#ifdef PIXVERTEX_DEBUG_PRODUCE
    std::cout << "producing Vertices on GPU" << std::endl;
#endif  // PIXVERTEX_DEBUG_PRODUCE
    ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream));
#else
  ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin, float ptMax) const {
#ifdef PIXVERTEX_DEBUG_PRODUCE
    std::cout << "producing Vertices on CPU" << std::endl;
#endif  // PIXVERTEX_DEBUG_PRODUCE
    ZVertexHeterogeneous vertices(std::make_unique<ZVertexSoA>());
#endif
    assert(tksoa);
    auto* soa = vertices.get();
    assert(soa);

#ifdef __CUDACC__
    auto ws_d = cms::cuda::make_device_unique<WorkSpace>(stream);
#else
    auto ws_d = std::make_unique<WorkSpace>();
#endif

#ifdef __CUDACC__
    init<<<1, 1, 0, stream>>>(soa, ws_d.get());
    auto blockSize = 128;
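    // standard round-up so that every one of the TkSoA::stride() track slots is
    // covered: e.g. a stride of 32 * 1024 with blockSize 128 would give 256 blocks
    // (the stride value is illustrative, not taken from this file)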
    auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize;
    loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin, ptMax);
    cudaCheck(cudaGetLastError());
#else
    init(soa, ws_d.get());
    loadTracks(tksoa, soa, ws_d.get(), ptMin, ptMax);
#endif

#ifdef __CUDACC__
    // Running too many threads leads to problems when printf is enabled.
    constexpr int maxThreadsForPrint = 1024 - 128;
    constexpr int numBlocks = 1024;
    constexpr int threadsPerBlock = 128;

    if (oneKernel_) {
      // implemented only for density clusters
#ifndef THREE_KERNELS
      vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
#else
      vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
      cudaCheck(cudaGetLastError());
      // one block per vertex...
      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
      cudaCheck(cudaGetLastError());
      vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
#endif
    } else {  // five kernels
      if (useDensity_) {
        clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
      } else if (useDBSCAN_) {
        clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
      } else if (useIterative_) {
        clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
      }
      cudaCheck(cudaGetLastError());
      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit);
      cudaCheck(cudaGetLastError());
      // one block per vertex...
      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
      cudaCheck(cudaGetLastError());
      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit);
      cudaCheck(cudaGetLastError());
      sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
    }
    cudaCheck(cudaGetLastError());
#else  // __CUDACC__
    if (useDensity_) {
      clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max);
    } else if (useDBSCAN_) {
      clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max);
    } else if (useIterative_) {
      clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max);
    }
#ifdef PIXVERTEX_DEBUG_PRODUCE
    std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
#endif  // PIXVERTEX_DEBUG_PRODUCE
    fitVertices(soa, ws_d.get(), maxChi2ForFirstFit);
    // one block per vertex!
    splitVertices(soa, ws_d.get(), maxChi2ForSplit);
    fitVertices(soa, ws_d.get(), maxChi2ForFinalFit);
    sortByPt2(soa, ws_d.get());
#endif

    return vertices;
  }

}  // namespace gpuVertexFinder
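
// Usage sketch (illustrative only; the producer instance and the track-SoA
// pointer names below are assumptions, not part of this file):
//
//   gpuVertexFinder::Producer producer{/* clustering configuration */};
//   // GPU path:
//   auto verticesGPU = producer.makeAsync(stream, tracksSoA_d, ptMin, ptMax);
//   // CPU path (when compiled without CUDA):
//   auto verticesCPU = producer.make(tracksSoA_h, ptMin, ptMax);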