d5/d77/gpuVertexFinder_8cc_source.html

 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

 #include "gpuClusterTracksByDensity.h"
 #include "gpuClusterTracksDBSCAN.h"
 #include "gpuClusterTracksIterative.h"
 #include "gpuFitVertices.h"
 #include "gpuSortByPt2.h"
 #include "gpuSplitVertices.h"

 #undef PIXVERTEX_DEBUG_PRODUCE

 namespace gpuVertexFinder {

   // reject outlier tracks that contribute more than this to the chi2 of the vertex fit
   // (both values are passed to fitVertices: maxChi2ForFirstFit for the fit that follows
   // clustering, maxChi2ForFinalFit for the fit that follows splitVertices)
   constexpr float maxChi2ForFirstFit = 50.f;
   constexpr float maxChi2ForFinalFit = 5000.f;

   // split vertices with a chi2/NDoF greater than this (threshold passed to splitVertices)
   constexpr float maxChi2ForSplit = 9.f;

   // Selects the tracks used for vertex finding and copies them into the workspace.
   //
   // For every track index below tracks.nTracks(), soa->idv[index] is reset to -1. Tracks that
   // are triplets, have a quality below highPurity, or have pt below ptMin are then skipped.
   // Each remaining track reserves a workspace slot through atomicAdd on pws->ntrks and stores
   // there: the track index, tracks.zip(index), element 14 of the stateAtBS covariance, and
   // pt^2 with pt limited to at most ptMax.
   __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) {
     assert(ptracks);
     assert(soa);
     auto const& tracks = *ptracks;
     auto const& fit = tracks.stateAtBS;
     auto const* quality = tracks.qualityData();

     // each thread starts at its global index and advances by the total number of launched threads
     auto const stride = gridDim.x * blockDim.x;
     int const nTracks = tracks.nTracks();
     for (int trk = blockIdx.x * blockDim.x + threadIdx.x; trk < nTracks; trk += stride) {
       auto const hits = tracks.nHits(trk);
       assert(hits >= 3);

       // every track starts out unassociated to any vertex
       soa->idv[trk] = -1;

       // triplets and tracks below highPurity do not take part
       if (tracks.isTriplet(trk) || quality[trk] < pixelTrack::Quality::highPurity)
         continue;

       auto const rawPt = tracks.pt(trk);
       if (rawPt < ptMin)
         continue;

       // limit pt from above before squaring it
       auto const clampedPt = std::min(rawPt, ptMax);

       // reserve the next free workspace slot and fill it
       auto& ws = *pws;
       auto const slot = atomicAdd(&ws.ntrks, 1);
       ws.itrk[slot] = trk;
       ws.zt[slot] = tracks.zip(trk);
       ws.ezt2[slot] = fit.covariance(trk)(14);
       ws.ptt2[slot] = clampedPt * clampedPt;
     }
   }

 // #define THREE_KERNELS
 #ifndef THREE_KERNELS
   // Single-kernel vertex finding: runs all stages back to back inside one kernel, with a
   // __syncthreads() barrier between consecutive stages. The stages are, in order:
   // clusterTracksByDensity, fitVertices (maxChi2ForFirstFit), splitVertices (maxChi2ForSplit),
   // fitVertices (maxChi2ForFinalFit) and sortByPt2.
   // __syncthreads() only synchronizes the threads of one block; Producer::makeAsync launches
   // this kernel with a single block.
   __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
                                         gpuVertexFinder::WorkSpace* pws,
                                         int minT,      // min number of neighbours to be "seed"
                                         float eps,     // max absolute distance to cluster
                                         float errmax,  // max error to be "seed"
                                         float chi2max  // max normalized distance to cluster
   ) {
     clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
     __syncthreads();
     fitVertices(pdata, pws, maxChi2ForFirstFit);
     __syncthreads();
     splitVertices(pdata, pws, maxChi2ForSplit);
     __syncthreads();
     fitVertices(pdata, pws, maxChi2ForFinalFit);
     __syncthreads();
     sortByPt2(pdata, pws);
   }
 #else
   // First kernel of the THREE_KERNELS variant: clusterTracksByDensity followed, after a
   // __syncthreads() barrier, by fitVertices with maxChi2ForFirstFit. In this variant the
   // splitting step is not part of the kernel; Producer::makeAsync launches
   // splitVerticesKernel separately between vertexFinderKernel1 and vertexFinderKernel2.
   __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata,
                                       gpuVertexFinder::WorkSpace* pws,
                                       int minT,      // min number of neighbours to be "seed"
                                       float eps,     // max absolute distance to cluster
                                       float errmax,  // max error to be "seed"
                                       float chi2max  // max normalized distance to cluster
   ) {
     clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
     __syncthreads();
     fitVertices(pdata, pws, maxChi2ForFirstFit);
   }

   // Last kernel of the THREE_KERNELS variant: fitVertices with maxChi2ForFinalFit followed,
   // after a __syncthreads() barrier, by sortByPt2.
   __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) {
     fitVertices(pdata, pws, maxChi2ForFinalFit);
     __syncthreads();
     sortByPt2(pdata, pws);
   }
 #endif

 #ifdef __CUDACC__
   // Runs the full vertex-finding chain on the tracks in tksoa and returns the resulting vertices.
   //
   // GPU variant: the output ZVertexSoA and a temporary WorkSpace are obtained through
   // cms::cuda::make_device_unique, and every kernel is launched on `stream`. Each launch is
   // followed by cudaCheck(cudaGetLastError()) before the next launch is queued (the
   // single-kernel path is covered by the check after the branch), so that launch errors are
   // reported close to the kernel that caused them. No stream synchronization is performed
   // here; callers presumably consume the result in stream order.
   //
   // The chain is: init, loadTracks (which applies ptMin / ptMax), then clustering, a first fit,
   // splitting, a final fit and sorting, run either as one kernel (oneKernel_) or as separate
   // kernels.
   ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin, float ptMax) const {
 #ifdef PIXVERTEX_DEBUG_PRODUCE
     std::cout << "producing Vertices on GPU" << std::endl;
 #endif  // PIXVERTEX_DEBUG_PRODUCE
     ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream));
 #else
   // CPU variant of the vertex-finding chain: same sequence of steps as the GPU variant, executed
   // as plain function calls on std::make_unique-allocated storage.
   ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin, float ptMax) const {
 #ifdef PIXVERTEX_DEBUG_PRODUCE
     std::cout << "producing Vertices on  CPU" << std::endl;
 #endif  // PIXVERTEX_DEBUG_PRODUCE
     ZVertexHeterogeneous vertices(std::make_unique<ZVertexSoA>());
 #endif
     assert(tksoa);
     auto* soa = vertices.get();
     assert(soa);

     // temporary workspace shared by all the steps below
 #ifdef __CUDACC__
     auto ws_d = cms::cuda::make_device_unique<WorkSpace>(stream);
 #else
     auto ws_d = std::make_unique<WorkSpace>();
 #endif

 #ifdef __CUDACC__
     init<<<1, 1, 0, stream>>>(soa, ws_d.get());
     // check the init launch on its own, so that a failure is not attributed to loadTracks
     cudaCheck(cudaGetLastError());
     auto blockSize = 128;
     // enough blocks to cover TkSoA::stride() entries, rounding up
     auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize;
     loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin, ptMax);
     cudaCheck(cudaGetLastError());
 #else
     init(soa, ws_d.get());
     loadTracks(tksoa, soa, ws_d.get(), ptMin, ptMax);
 #endif

 #ifdef __CUDACC__
     // Running too many threads leads to problems when printf is enabled.
     constexpr int maxThreadsForPrint = 1024 - 128;
     constexpr int numBlocks = 1024;
     constexpr int threadsPerBlock = 128;

     if (oneKernel_) {
       // implemented only for density clusters
 #ifndef THREE_KERNELS
       vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
 #else
       vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
       cudaCheck(cudaGetLastError());
       // one block per vertex...
       splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
       cudaCheck(cudaGetLastError());
       vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
 #endif
     } else {  // five kernels
       // NOTE(review): if none of the three flags is set, no clustering kernel is launched and the
       // remaining steps still run - confirm that the configuration always sets exactly one.
       if (useDensity_) {
         clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
       } else if (useDBSCAN_) {
         clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
       } else if (useIterative_) {
         clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
       }
       cudaCheck(cudaGetLastError());
       fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit);
       cudaCheck(cudaGetLastError());
       // one block per vertex...
       splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
       cudaCheck(cudaGetLastError());
       fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit);
       cudaCheck(cudaGetLastError());
       sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
     }
     // covers the last launch of whichever branch was taken
     cudaCheck(cudaGetLastError());
 #else  // __CUDACC__
     if (useDensity_) {
       clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max);
     } else if (useDBSCAN_) {
       clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max);
     } else if (useIterative_) {
       clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max);
     }
 #ifdef PIXVERTEX_DEBUG_PRODUCE
     std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
 #endif  // PIXVERTEX_DEBUG_PRODUCE
     fitVertices(soa, ws_d.get(), maxChi2ForFirstFit);
     // one block per vertex!
     splitVertices(soa, ws_d.get(), maxChi2ForSplit);
     fitVertices(soa, ws_d.get(), maxChi2ForFinalFit);
     sortByPt2(soa, ws_d.get());
 #endif

     return vertices;
   }

 }  // namespace gpuVertexFinder
cms::cudacompat::threadIdx
const dim3 threadIdx
Definition: cudaCompat.h:29

gpuVertexFinder::__syncthreads
__syncthreads()
Definition: cudaCompat.h:132

gpuSplitVertices.h

gpuVertexFinder::Producer::makeAsync
ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const *tksoa, float ptMin, float ptMax) const

gpuSortByPt2.h

gpuClusterTracksIterative.h

cms::cudacompat::gridDim
const dim3 gridDim
Definition: cudaCompat.h:33

gpuVertexFinder::eps
WorkSpace int float eps
Definition: gpuClusterTracksDBSCAN.h:18

heavyIonCSV_trainingSettings.idx
idx
Definition: heavyIonCSV_trainingSettings.py:5

gpuVertexFinder::assert
assert(pdata)

gpuVertexFinder::data
auto &__restrict__ data
Definition: gpuClusterTracksDBSCAN.h:31

__global__
#define __global__
Definition: cudaCompat.h:19

cms::cudacompat::blockDim
const dim3 blockDim
Definition: cudaCompat.h:30

gpuVertexFinder::Producer::useIterative_
const bool useIterative_
Definition: gpuVertexFinder.h:74

gpuVertexFinder::first
auto first
Definition: gpuVertexFinder.cc:28

TrackSoAHeterogeneousT::stride
static constexpr int32_t stride()
Definition: TrackSoAHeterogeneousT.h:26

gpuClusterTracksByDensity.h

gpuVertexFinder::Producer::minT
int minT
Definition: gpuVertexFinder.h:76

gpuVertexFinder::Producer::eps
float eps
Definition: gpuVertexFinder.h:77

gpuVertexFinder::Producer::errmax
float errmax
Definition: gpuVertexFinder.h:78

gpuVertexFinder::Producer::chi2max
float chi2max
Definition: gpuVertexFinder.h:79

cms::cuda::stream
uint32_t T const  *__restrict__ uint32_t const  *__restrict__ int32_t int Histo::index_type cudaStream_t stream
Definition: HistoContainer.h:51

DiDispStaMuonMonitor_cfi.pt
pt
Definition: DiDispStaMuonMonitor_cfi.py:39

gpuVertexFinder::maxChi2ForFirstFit
constexpr float maxChi2ForFirstFit
Definition: gpuVertexFinder.cc:15

gpuVertexFinder::quality
auto const  * quality
Definition: gpuVertexFinder.cc:26

AlignmentTracksFromVertexSelector_cfi.vertices
vertices
Definition: AlignmentTracksFromVertexSelector_cfi.py:5

ZVertexSoA
Definition: ZVertexSoA.h:10

gpuVertexFinder::nt
auto nt
Definition: gpuClusterTracksDBSCAN.h:33

gpuVertexFinder::chi2max
WorkSpace int float float float chi2max
Definition: gpuClusterTracksDBSCAN.h:23

gpuVertexFinder::pws
WorkSpace * pws
Definition: gpuClusterTracksDBSCAN.h:18

gpuVertexFinder::soa
ZVertexSoA * soa
Definition: gpuVertexFinder.cc:21

SiStripPI::min
Definition: SiStripPayloadInspectorHelper.h:178

gpuVertexFinder::Producer::useDensity_
const bool useDensity_
Definition: gpuVertexFinder.h:72

gpuVertexFinder
Definition: gpuClusterTracksByDensity.h:13

ptracks
uint32_t const  *__restrict__ TkSoA const  *__restrict__ ptracks
Definition: CAHitNtupletGeneratorKernelsImpl.h:141

gpuVertexFinder::errmax
WorkSpace int float float errmax
Definition: gpuClusterTracksDBSCAN.h:18

cms::cudacompat::blockIdx
const dim3 blockIdx
Definition: cudaCompat.h:32

gpuVertexFinder::minT
WorkSpace int minT
Definition: gpuClusterTracksDBSCAN.h:18

fit
Definition: CombinedChiSquaredLikelihood.h:6

gpuClusterTracksDBSCAN.h

cudaCheck.h

gpuFitVertices.h

nHits
caConstants::TupleMultiplicity const CAHitNtupletGeneratorKernelsGPU::HitToTuple const cms::cuda::AtomicPairCounter GPUCACell const  *__restrict__ uint32_t const  *__restrict__ gpuPixelDoublets::CellNeighborsVector const gpuPixelDoublets::CellTracksVector const GPUCACell::OuterHitOfCell const int32_t nHits
Definition: CAHitNtupletGeneratorKernelsImpl.h:43

pixelTrack::Quality::highPurity

TrackSoAHeterogeneousT
Definition: TrackSoAHeterogeneousT.h:24

gpuVertexFinder::maxChi2ForFinalFit
constexpr float maxChi2ForFinalFit
Definition: gpuVertexFinder.cc:16

gpuVertexFinder::Producer::make
ZVertexHeterogeneous make(TkSoA const *tksoa, float ptMin, float ptMax) const
Definition: gpuVertexFinder.cc:104

HeterogeneousSoA
Definition: HeterogeneousSoA.h:13

ZVertexSoA::idv
int16_t idv[MAXTRACKS]
Definition: ZVertexSoA.h:14

gpuVertexFinder::init
pws init()

gpuVertexFinder::tracks
auto const  & tracks
Definition: gpuVertexFinder.cc:24

gather_cfg.cout
cout
Definition: gather_cfg.py:144

cudaCheck
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:69

gpuVertexFinder::maxChi2ForSplit
constexpr float maxChi2ForSplit
Definition: gpuVertexFinder.cc:19

gpuVertexFinder::ptMax
ZVertexSoA WorkSpace float float ptMax
Definition: gpuVertexFinder.cc:21

gpuVertexFinder::Producer::oneKernel_
const bool oneKernel_
Definition: gpuVertexFinder.h:71

gpuVertexFinder::Producer::useDBSCAN_
const bool useDBSCAN_
Definition: gpuVertexFinder.h:73

cms::cudacompat::atomicAdd
T1 atomicAdd(T1 *a, T2 b)
Definition: cudaCompat.h:61

gpuVertexFinder::ptMin
ZVertexSoA WorkSpace float ptMin
Definition: gpuVertexFinder.cc:21

gpuVertexFinder::WorkSpace
Definition: gpuVertexFinder.h:16