CMS 3D CMS Logo

gpuVertexFinderImpl.h
Go to the documentation of this file.
1 #ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h
2 #define RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h
3 
5 
9 #include "gpuFitVertices.h"
10 #include "gpuSortByPt2.h"
11 #include "gpuSplitVertices.h"
12 
13 #undef PIXVERTEX_DEBUG_PRODUCE
14 
15 namespace gpuVertexFinder {
16 
17  // reject outlier tracks that contribute more than this to the chi2 of the vertex fit
18  constexpr float maxChi2ForFirstFit = 50.f;
19  constexpr float maxChi2ForFinalFit = 5000.f;
20 
21  // split vertices with a chi2/NDoF greater than this
22  constexpr float maxChi2ForSplit = 9.f;
23 
24  __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin) {
25  assert(ptracks);
26  assert(soa);
27  auto const& tracks = *ptracks;
28  auto const& fit = tracks.stateAtBS;
29  auto const* quality = tracks.qualityData();
30 
31  auto first = blockIdx.x * blockDim.x + threadIdx.x;
32  for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) {
33  auto nHits = tracks.nHits(idx);
34  if (nHits == 0)
35  break; // this is a guard: maybe we need to move to nTracks...
36 
37  // initialize soa...
38  soa->idv[idx] = -1;
39 
40  if (nHits < 4)
41  continue; // no triplets
43  continue;
44 
45  auto pt = tracks.pt(idx);
46 
47  if (pt < ptMin)
48  continue;
49 
50  auto& data = *pws;
51  auto it = atomicAdd(&data.ntrks, 1);
52  data.itrk[it] = idx;
53  data.zt[it] = tracks.zip(idx);
54  data.ezt2[it] = fit.covariance(idx)(14);
55  data.ptt2[it] = pt * pt;
56  }
57  }
58 
59 // #define THREE_KERNELS
60 #ifndef THREE_KERNELS
61  __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
63  int minT, // min number of neighbours to be "seed"
64  float eps, // max absolute distance to cluster
65  float errmax, // max error to be "seed"
66  float chi2max // max normalized distance to cluster,
67  ) {
68  clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
69  __syncthreads();
70  fitVertices(pdata, pws, maxChi2ForFirstFit);
71  __syncthreads();
72  splitVertices(pdata, pws, maxChi2ForSplit);
73  __syncthreads();
74  fitVertices(pdata, pws, maxChi2ForFinalFit);
75  __syncthreads();
76  sortByPt2(pdata, pws);
77  }
78 #else
79  __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata,
81  int minT, // min number of neighbours to be "seed"
82  float eps, // max absolute distance to cluster
83  float errmax, // max error to be "seed"
84  float chi2max // max normalized distance to cluster,
85  ) {
86  clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
87  __syncthreads();
88  fitVertices(pdata, pws, maxChi2ForFirstFit);
89  }
90 
91  __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) {
92  fitVertices(pdata, pws, maxChi2ForFinalFit);
93  __syncthreads();
94  sortByPt2(pdata, pws);
95  }
96 #endif
97 
98 #ifdef __CUDACC__
99  ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const {
100 #ifdef PIXVERTEX_DEBUG_PRODUCE
101  std::cout << "producing Vertices on GPU" << std::endl;
102 #endif // PIXVERTEX_DEBUG_PRODUCE
103  ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream));
104 #else
105  ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const {
106 #ifdef PIXVERTEX_DEBUG_PRODUCE
107  std::cout << "producing Vertices on CPU" << std::endl;
108 #endif // PIXVERTEX_DEBUG_PRODUCE
109  ZVertexHeterogeneous vertices(std::make_unique<ZVertexSoA>());
110 #endif
111  assert(tksoa);
112  auto* soa = vertices.get();
113  assert(soa);
114 
115 #ifdef __CUDACC__
116  auto ws_d = cms::cuda::make_device_unique<WorkSpace>(stream);
117 #else
118  auto ws_d = std::make_unique<WorkSpace>();
119 #endif
120 
121 #ifdef __CUDACC__
122  init<<<1, 1, 0, stream>>>(soa, ws_d.get());
123  auto blockSize = 128;
124  auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize;
125  loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin);
126  cudaCheck(cudaGetLastError());
127 #else
128  init(soa, ws_d.get());
129  loadTracks(tksoa, soa, ws_d.get(), ptMin);
130 #endif
131 
132 #ifdef __CUDACC__
133  // Running too many thread lead to problems when printf is enabled.
134  constexpr int maxThreadsForPrint = 1024 - 256;
135  constexpr int numBlocks = 1024;
136  constexpr int threadsPerBlock = 128;
137 
138  if (oneKernel_) {
139  // implemented only for density clustesrs
140 #ifndef THREE_KERNELS
141  vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
142 #else
143  vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
144  cudaCheck(cudaGetLastError());
145  // one block per vertex...
146  splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
147  cudaCheck(cudaGetLastError());
148  vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
149 #endif
150  } else { // five kernels
151  if (useDensity_) {
152  clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
153  } else if (useDBSCAN_) {
154  clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
155  } else if (useIterative_) {
156  clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
157  }
158  cudaCheck(cudaGetLastError());
159  fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit);
160  cudaCheck(cudaGetLastError());
161  // one block per vertex...
162  splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
163  cudaCheck(cudaGetLastError());
164  fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit);
165  cudaCheck(cudaGetLastError());
166  sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
167  }
168  cudaCheck(cudaGetLastError());
169 #else // __CUDACC__
170  if (useDensity_) {
171  clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max);
172  } else if (useDBSCAN_) {
173  clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max);
174  } else if (useIterative_) {
175  clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max);
176  }
177 #ifdef PIXVERTEX_DEBUG_PRODUCE
178  std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
179 #endif // PIXVERTEX_DEBUG_PRODUCE
180  fitVertices(soa, ws_d.get(), maxChi2ForFirstFit);
181  // one block per vertex!
182  splitVertices(soa, ws_d.get(), maxChi2ForSplit);
183  fitVertices(soa, ws_d.get(), maxChi2ForFinalFit);
184  sortByPt2(soa, ws_d.get());
185 #endif
186 
187  return vertices;
188  }
189 
190 } // namespace gpuVertexFinder
191 
192 #endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h
gpuVertexFinder::maxChi2ForSplit
constexpr float maxChi2ForSplit
Definition: gpuVertexFinderImpl.h:22
gpuVertexFinder::WorkSpace
Definition: gpuVertexFinder.h:15
gpuVertexFinder::Producer::useDBSCAN_
const bool useDBSCAN_
Definition: gpuVertexFinder.h:72
gpuVertexFinder::init
pws init()
gpuVertexFinder::Producer::oneKernel_
const bool oneKernel_
Definition: gpuVertexFinder.h:70
gpuClusterTracksDBSCAN.h
gpuVertexFinder::Producer::makeAsync
ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const *tksoa, float ptMin) const
gpuVertexFinder::assert
assert(pdata)
DiDispStaMuonMonitor_cfi.pt
pt
Definition: DiDispStaMuonMonitor_cfi.py:39
gpuVertexFinder::Producer::useIterative_
const bool useIterative_
Definition: gpuVertexFinder.h:73
cms::cuda::stream
cudaStream_t stream
Definition: HistoContainer.h:57
gpuVertexFinder::eps
WorkSpace int float eps
Definition: gpuClusterTracksDBSCAN.h:18
gather_cfg.cout
cout
Definition: gather_cfg.py:144
gpuVertexFinder
Definition: gpuClusterTracksByDensity.h:13
gpuVertexFinder::soa
ZVertexSoA * soa
Definition: gpuVertexFinderImpl.h:24
HeterogeneousSoA
Definition: HeterogeneousSoA.h:13
ptracks
const HitContainer *__restrict__ const TkSoA *__restrict__ ptracks
Definition: CAHitNtupletGeneratorKernelsImpl.h:477
gpuVertexFinder::first
auto first
Definition: gpuVertexFinderImpl.h:31
TrackSoAHeterogeneousT::stride
static constexpr int32_t stride()
Definition: TrackSoAHeterogeneousT.h:16
__global__
#define __global__
Definition: cudaCompat.h:19
gpuVertexFinder::data
auto &__restrict__ data
Definition: gpuClusterTracksDBSCAN.h:31
heavyIonCSV_trainingSettings.idx
idx
Definition: heavyIonCSV_trainingSettings.py:5
gpuVertexFinder::Producer::minT
int minT
Definition: gpuVertexFinder.h:75
gpuVertexFinder::Producer::eps
float eps
Definition: gpuVertexFinder.h:76
gpuVertexFinder::Producer::make
ZVertexHeterogeneous make(TkSoA const *tksoa, float ptMin) const
Definition: gpuVertexFinderImpl.h:105
gpuVertexFinder::maxChi2ForFirstFit
constexpr float maxChi2ForFirstFit
Definition: gpuVertexFinderImpl.h:18
nHits
const caConstants::TupleMultiplicity *__restrict__ const HitsOnGPU *__restrict__ double *__restrict__ float *__restrict__ double *__restrict__ uint32_t nHits
Definition: BrokenLineFitOnGPU.h:27
gpuVertexFinder::quality
auto const * quality
Definition: gpuVertexFinderImpl.h:29
cms::cudacompat::atomicAdd
T1 atomicAdd(T1 *a, T2 b)
Definition: cudaCompat.h:51
gpuVertexFinder::nt
auto nt
Definition: gpuClusterTracksDBSCAN.h:33
cms::cudacompat::gridDim
const dim3 gridDim
Definition: cudaCompat.h:33
TrackSoAHeterogeneousT
Definition: TrackSoAHeterogeneousT.h:14
gpuVertexFinder::Producer::errmax
float errmax
Definition: gpuVertexFinder.h:77
gpuVertexFinder::errmax
WorkSpace int float float errmax
Definition: gpuClusterTracksDBSCAN.h:18
gpuSortByPt2.h
gpuVertexFinder::pws
WorkSpace * pws
Definition: gpuClusterTracksDBSCAN.h:18
cms::cudacompat::blockDim
const dim3 blockDim
Definition: cudaCompat.h:30
gpuClusterTracksIterative.h
gpuSplitVertices.h
gpuVertexFinder::tracks
auto const & tracks
Definition: gpuVertexFinderImpl.h:27
gpuVertexFinder::Producer::chi2max
float chi2max
Definition: gpuVertexFinder.h:78
cudaCheck.h
gpuVertexFinder::maxChi2ForFinalFit
constexpr float maxChi2ForFinalFit
Definition: gpuVertexFinderImpl.h:19
gpuVertexFinder::chi2max
WorkSpace int float float float chi2max
Definition: gpuClusterTracksDBSCAN.h:23
ZVertexSoA::idv
int16_t idv[MAXTRACKS]
Definition: ZVertexSoA.h:14
gpuVertexFinder::minT
WorkSpace int minT
Definition: gpuClusterTracksDBSCAN.h:18
cms::cudacompat::threadIdx
const dim3 threadIdx
Definition: cudaCompat.h:29
gpuClusterTracksByDensity.h
gpuFitVertices.h
cudaCheck
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:62
ZVertexSoA
Definition: ZVertexSoA.h:10
gpuVertexFinder::Producer::useDensity_
const bool useDensity_
Definition: gpuVertexFinder.h:71
gpuVertexFinder::__syncthreads
__syncthreads()
Definition: cudaCompat.h:77
pixelTrack::Quality::loose
gpuVertexFinder::ptMin
ZVertexSoA WorkSpace float ptMin
Definition: gpuVertexFinderImpl.h:24
fit
Definition: CombinedChiSquaredLikelihood.h:6
pwdgSkimBPark_cfi.vertices
vertices
Definition: pwdgSkimBPark_cfi.py:7
cms::cudacompat::blockIdx
const dim3 blockIdx
Definition: cudaCompat.h:32