gpuVertexFinder.cc
#include "gpuVertexFinder.h"

#include "gpuClusterTracksByDensity.h"
#include "gpuClusterTracksDBSCAN.h"
#include "gpuClusterTracksIterative.h"
#include "gpuFitVertices.h"
#include "gpuSortByPt2.h"
#include "gpuSplitVertices.h"

#undef PIXVERTEX_DEBUG_PRODUCE

namespace gpuVertexFinder {

  // reject outlier tracks that contribute more than this to the chi2 of the vertex fit
  constexpr float maxChi2ForFirstFit = 50.f;
  constexpr float maxChi2ForFinalFit = 5000.f;

  // split vertices with a chi2/NDoF greater than this
  constexpr float maxChi2ForSplit = 9.f;

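  // Copy into the WorkSpace the tracks that survive the pT and quality cuts.
  // Each thread walks the track SoA with a grid-stride loop; accepted tracks are
  // compacted into the workspace arrays through an atomicAdd on the track counter.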
  __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin) {
    assert(ptracks);
    assert(soa);
    auto const& tracks = *ptracks;
    auto const& fit = tracks.stateAtBS;
    auto const* quality = tracks.qualityData();

    auto first = blockIdx.x * blockDim.x + threadIdx.x;
    for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) {
      auto nHits = tracks.nHits(idx);
      if (nHits == 0)
        break;  // this is a guard: maybe we need to move to nTracks...

      // initialize soa...
      soa->idv[idx] = -1;

      if (tracks.isTriplet(idx))
        continue;  // no triplets
      if (quality[idx] != pixelTrack::Quality::loose)
        continue;  // only loose tracks

      auto pt = tracks.pt(idx);

      if (pt < ptMin)
        continue;

      auto& data = *pws;
      auto it = atomicAdd(&data.ntrks, 1);
      data.itrk[it] = idx;
      data.zt[it] = tracks.zip(idx);
      data.ezt2[it] = fit.covariance(idx)(14);
      data.ptt2[it] = pt * pt;
    }
  }

// #define THREE_KERNELS
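// When THREE_KERNELS is defined, clustering + first fit, vertex splitting, and final
// fit + sort run as three separate kernel launches, so splitVertices can use a full
// grid (one block per vertex). In the default single-kernel version all phases run in
// a single block and are separated only by __syncthreads().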
#ifndef THREE_KERNELS
  __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
                                        gpuVertexFinder::WorkSpace* pws,
                                        int minT,      // min number of neighbours to be "seed"
                                        float eps,     // max absolute distance to cluster
                                        float errmax,  // max error to be "seed"
                                        float chi2max  // max normalized distance to cluster
  ) {
    clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
    __syncthreads();
    fitVertices(pdata, pws, maxChi2ForFirstFit);
    __syncthreads();
    splitVertices(pdata, pws, maxChi2ForSplit);
    __syncthreads();
    fitVertices(pdata, pws, maxChi2ForFinalFit);
    __syncthreads();
    sortByPt2(pdata, pws);
  }
#else
  __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata,
                                      gpuVertexFinder::WorkSpace* pws,
                                      int minT,      // min number of neighbours to be "seed"
                                      float eps,     // max absolute distance to cluster
                                      float errmax,  // max error to be "seed"
                                      float chi2max  // max normalized distance to cluster
  ) {
    clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
    __syncthreads();
    fitVertices(pdata, pws, maxChi2ForFirstFit);
  }

  __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) {
    fitVertices(pdata, pws, maxChi2ForFinalFit);
    __syncthreads();
    sortByPt2(pdata, pws);
  }
#endif

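// The producer below compiles twice: with nvcc (__CUDACC__) it builds makeAsync(), which
// launches the CUDA kernels on a stream; on the host-only path it builds make(), where the
// same __global__ functions are invoked as plain functions through the cudaCompat layer.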
#ifdef __CUDACC__
  ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const {
#ifdef PIXVERTEX_DEBUG_PRODUCE
    std::cout << "producing Vertices on GPU" << std::endl;
#endif  // PIXVERTEX_DEBUG_PRODUCE
    ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream));
#else
  ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const {
#ifdef PIXVERTEX_DEBUG_PRODUCE
    std::cout << "producing Vertices on CPU" << std::endl;
#endif  // PIXVERTEX_DEBUG_PRODUCE
    ZVertexHeterogeneous vertices(std::make_unique<ZVertexSoA>());
#endif
    assert(tksoa);
    auto* soa = vertices.get();
    assert(soa);

#ifdef __CUDACC__
    auto ws_d = cms::cuda::make_device_unique<WorkSpace>(stream);
#else
    auto ws_d = std::make_unique<WorkSpace>();
#endif

#ifdef __CUDACC__
    init<<<1, 1, 0, stream>>>(soa, ws_d.get());
    auto blockSize = 128;
    auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize;
    loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin);
    cudaCheck(cudaGetLastError());
#else
    init(soa, ws_d.get());
    loadTracks(tksoa, soa, ws_d.get(), ptMin);
#endif

#ifdef __CUDACC__
    // Running too many threads leads to problems when printf is enabled.
    constexpr int maxThreadsForPrint = 1024 - 128;
    constexpr int numBlocks = 1024;
    constexpr int threadsPerBlock = 128;

    if (oneKernel_) {
      // implemented only for density clustering
#ifndef THREE_KERNELS
      vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
#else
      vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
      cudaCheck(cudaGetLastError());
      // one block per vertex...
      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
      cudaCheck(cudaGetLastError());
      vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
#endif
    } else {  // five kernels
      if (useDensity_) {
        clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
      } else if (useDBSCAN_) {
        clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
      } else if (useIterative_) {
        clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
      }
      cudaCheck(cudaGetLastError());
      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit);
      cudaCheck(cudaGetLastError());
      // one block per vertex...
      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
      cudaCheck(cudaGetLastError());
      fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit);
      cudaCheck(cudaGetLastError());
      sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
    }
    cudaCheck(cudaGetLastError());
#else  // __CUDACC__
    if (useDensity_) {
      clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max);
    } else if (useDBSCAN_) {
      clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max);
    } else if (useIterative_) {
      clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max);
    }
#ifdef PIXVERTEX_DEBUG_PRODUCE
    std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
#endif  // PIXVERTEX_DEBUG_PRODUCE
    fitVertices(soa, ws_d.get(), maxChi2ForFirstFit);
    // one block per vertex!
    splitVertices(soa, ws_d.get(), maxChi2ForSplit);
    fitVertices(soa, ws_d.get(), maxChi2ForFinalFit);
    sortByPt2(soa, ws_d.get());
#endif

    return vertices;
  }

}  // namespace gpuVertexFinder
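
As an illustration of how this producer is driven, here is a minimal, hypothetical host-side sketch. The Producer constructor arguments and the cut values are assumptions inferred from the data members used above (oneKernel_, useDensity_, useDBSCAN_, useIterative_, minT, eps, errmax, chi2max); consult gpuVertexFinder.h for the actual signature.

// Hypothetical usage sketch -- the Producer constructor signature below is an
// assumption, not taken from this file; see gpuVertexFinder.h for the real one.
#include "gpuVertexFinder.h"

ZVertexHeterogeneous runVertexing(cudaStream_t stream, TkSoA const* tracksOnDevice) {
  // Density-based clustering in a single kernel, with illustrative cut values.
  gpuVertexFinder::Producer producer(true,   // oneKernel
                                     true,   // useDensity
                                     false,  // useDBSCAN
                                     false,  // useIterative
                                     2,      // minT: min number of neighbours to be "seed"
                                     0.07f,  // eps: max absolute distance to cluster
                                     0.01f,  // errmax: max error to be "seed"
                                     9.f);   // chi2max: max normalized distance to cluster
  // Asynchronous on the given stream; synchronize before reading the result.
  return producer.makeAsync(stream, tracksOnDevice, 0.5f /* ptMin, GeV */);
}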