de/d68/gpuClusterTracksByDensity_8h_source.html

 #ifndef RecoTracker_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
 #define RecoTracker_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h

 #include <algorithm>
 #include <cmath>
 #include <cstdint>

 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

 #include "gpuVertexFinder.h"

 namespace gpuVertexFinder {

   // This algorithm does not really scale, as it works in a single block...
   // which is enough for the <10K tracks we have.
   //
   // Based on the Rodriguez & Laio algorithm.
   //
   // Clusters tracks along z by local density and assigns every track a cluster index in ws.iv().
   //
   // Every loop indexes with threadIdx.x and strides by blockDim.x (blockIdx is never used), and both the
   // histogram and the cluster counter are __shared__, so the work is shared by the threads of one block.
   //
   // Phases (separated by __syncthreads()):
   //   1. quantise each track z into a uint8_t bin (izt) and fill a 256-bin shared histogram of track indices;
   //   2. for every track with ezt2 <= errmax^2, count in nn (the data.ndof() array) the other tracks visited
   //      by forEachInBins that pass both the eps and the chi2max cut; the remaining tracks keep nn == 0;
   //   3. link each track, through iv, to the closest track with higher nn (equal nn: lower z) passing the cuts;
   //   4. follow the links so that iv[i] holds the root of its chain (a track m with iv[m] == m);
   //   5. number the roots with nn >= minT as clusters 0..foundClusters-1, flag the other roots as noise;
   //   6. copy the root's id to every track of its chain and store the cluster count in nvFinal/nvIntermediate.
   //
   // On exit iv[i] is the cluster id of track i, or 9997 (= -(-9998) - 1) for tracks whose root was noise.
   __device__ __forceinline__ void clusterTracksByDensity(VtxSoAView& pdata,
                                                          WsSoAView& pws,
                                                          int minT,      // min number of neighbours to be "seed"
                                                          float eps,     // max absolute distance to cluster
                                                          float errmax,  // max error to be "seed"
                                                          float chi2max  // max normalized distance to cluster
   ) {
     using namespace gpuVertexFinder;
     constexpr bool verbose = false;  // in principle the compiler should optimize out if false

     if (verbose && 0 == threadIdx.x)
       printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);

     // the error cut is applied to the squared error (ezt2), so square the threshold once
     auto er2mx = errmax * errmax;

     auto& __restrict__ data = pdata;
     auto& __restrict__ ws = pws;
     auto nt = ws.ntrks();
     float const* __restrict__ zt = ws.zt();
     float const* __restrict__ ezt2 = ws.ezt2();

     // outputs: number of clusters found, written at the very end of this function
     uint32_t& nvFinal = data.nvFinal();
     uint32_t& nvIntermediate = ws.nvIntermediate();

     uint8_t* __restrict__ izt = ws.izt();     // per-track histogram bin
     int32_t* __restrict__ nn = data.ndof();   // the ndof array is used here to hold the neighbour count
     int32_t* __restrict__ iv = ws.iv();       // per-track link, later per-track cluster id

     assert(zt);
     assert(ezt2);
     assert(izt);
     assert(nn);
     assert(iv);

     using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
     __shared__ Hist hist;
     __shared__ typename Hist::Counter hws[32];
     // clear the histogram offsets cooperatively before counting
     for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
       hist.off[j] = 0;
     }
     __syncthreads();

     if (verbose && 0 == threadIdx.x)
       printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);

     assert((int)nt <= hist.capacity());

     // fill hist  (bin shall be wider than "eps")
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       assert(i < zVertex::utilities::MAXTRACKS);
       // bins are 0.1 wide in the units of zt (the literal 10. is a double, so the product is a double)
       int iz = int(zt[i] * 10.);  // valid if eps<=0.1
       // iz = std::clamp(iz, INT8_MIN, INT8_MAX);  // sorry c++17 only
       iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
       // shift the clamped value from [INT8_MIN, INT8_MAX] to [0, 255] so it fits the uint8_t bin index
       izt[i] = iz - INT8_MIN;
       assert(iz - INT8_MIN >= 0);
       assert(iz - INT8_MIN < 256);
       hist.count(izt[i]);
       // every track starts as its own root with no neighbours
       iv[i] = i;
       nn[i] = 0;
     }
     __syncthreads();
     if (threadIdx.x < 32)
       hws[threadIdx.x] = 0;  // used by prefix scan...
     __syncthreads();
     // NOTE(review): presumably turns the per-bin counts into offsets using hws as scratch - confirm in HistoContainer.h
     hist.finalize(hws);
     __syncthreads();
     assert(hist.size() == nt);
     // second pass: store the track indices (as uint16_t) in their bins
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       hist.fill(izt[i], uint16_t(i));
     }
     __syncthreads();

     // count neighbours
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       // tracks with too large an error are not counted for: they keep nn[i] == 0
       if (ezt2[i] > er2mx)
         continue;
       auto loop = [&](uint32_t j) {
         if (i == j)
           return;
         auto dist = std::abs(zt[i] - zt[j]);
         if (dist > eps)
           return;
         // normalized-distance cut: dist^2 compared with chi2max times the sum of the two squared errors
         if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
           return;
         nn[i]++;
       };

       // NOTE(review): presumably visits the bin of izt[i] and 1 bin on each side - confirm in HistoContainer.h
       cms::cuda::forEachInBins(hist, izt[i], 1, loop);
     }

     __syncthreads();

     // find closest above me .... (we ignore the possibility of two j at same distance from i)
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       // start from eps so that no candidate farther than eps is ever accepted
       float mdist = eps;
       auto loop = [&](uint32_t j) {
         if (nn[j] < nn[i])
           return;
         // this also rejects j == i, so a track never links to itself here
         if (nn[j] == nn[i] && zt[j] >= zt[i])
           return;  // if equal use natural order...
         auto dist = std::abs(zt[i] - zt[j]);
         if (dist > mdist)
           return;
         if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
           return;  // (break natural order???)
         mdist = dist;
         iv[i] = j;  // assign to cluster (better be unique??)
       };
       cms::cuda::forEachInBins(hist, izt[i], 1, loop);
     }

     __syncthreads();

 #ifdef GPU_DEBUG
     //  mini verification: no two tracks may point at each other
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       if (iv[i] != int(i))
         assert(iv[iv[i]] != int(i));
     }
     __syncthreads();
 #endif

     // consolidate graph (percolate index of seed)
     // NOTE(review): there is no barrier inside this loop, so other threads may be rewriting iv[] while a chain
     // is followed; a root (iv[m] == m) is only ever rewritten with its own index by this loop.
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       auto m = iv[i];
       while (m != iv[m])
         m = iv[m];
       iv[i] = m;
     }

 #ifdef GPU_DEBUG
     __syncthreads();
     //  mini verification: same check as above, after the consolidation
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       if (iv[i] != int(i))
         assert(iv[iv[i]] != int(i));
     }
 #endif

 #ifdef GPU_DEBUG
     // and verify that we did not split any cluster...
     // (repeats the closest-above-me search and checks that the track found shares the root of track i)
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       auto minJ = i;
       auto mdist = eps;
       auto loop = [&](uint32_t j) {
         if (nn[j] < nn[i])
           return;
         if (nn[j] == nn[i] && zt[j] >= zt[i])
           return;  // if equal use natural order...
         auto dist = std::abs(zt[i] - zt[j]);
         if (dist > mdist)
           return;
         if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
           return;
         mdist = dist;
         minJ = j;
       };
       cms::cuda::forEachInBins(hist, izt[i], 1, loop);
       // should belong to the same cluster...
       assert(iv[i] == iv[minJ]);
       assert(nn[i] <= nn[iv[i]]);
     }
     __syncthreads();
 #endif

     // block-shared cluster counter; every thread stores the same 0 before the barrier
     __shared__ unsigned int foundClusters;
     foundClusters = 0;
     __syncthreads();

     // find the number of different clusters, identified by a tracks with clus[i] == i and density larger than threshold;
     // mark these tracks with a negative id.
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       if (iv[i] == int(i)) {
         if (nn[i] >= minT) {
           // atomicInc returns the previous counter value: cluster k is encoded as -(k + 1)
           auto old = atomicInc(&foundClusters, 0xffffffff);
           iv[i] = -(old + 1);
         } else {  // noise
           iv[i] = -9998;
         }
       }
     }
     __syncthreads();

     assert(foundClusters < zVertex::utilities::MAXVTX);

     // propagate the negative id to all the tracks in the cluster.
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       // a non-negative value is still the index of the root, whose own entry is now negative
       if (iv[i] >= 0) {
         // mark each track in a cluster with the same id as the first one
         iv[i] = iv[iv[i]];
       }
     }
     __syncthreads();

     // adjust the cluster id to be a positive value starting from 0
     // (the noise marker -9998 becomes 9997)
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       iv[i] = -iv[i] - 1;
     }

     // every thread stores the same cluster count
     nvIntermediate = nvFinal = foundClusters;

     if (verbose && 0 == threadIdx.x)
       printf("found %d proto vertices\n", foundClusters);
   }

   // Kernel entry point that forwards its arguments, unchanged and in the same order,
   // to the device function clusterTracksByDensity().
   //
   //   pdata   : vertex SoA view, received by value and handed on by reference
   //   pws     : workspace SoA view, received by value and handed on by reference
   //   minT    : min number of neighbours to be "seed"
   //   eps     : max absolute distance to cluster
   //   errmax  : max error to be "seed"
   //   chi2max : max normalized distance to cluster
   __global__ void clusterTracksByDensityKernel(
       VtxSoAView pdata, WsSoAView pws, int minT, float eps, float errmax, float chi2max) {
     clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
   }

 }  // namespace gpuVertexFinder

 #endif  // RecoTracker_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
cms::cudacompat::threadIdx
const dim3 threadIdx
Definition: cudaCompat.h:29

gpuVertexFinder::__syncthreads
__syncthreads()
Definition: cudaCompat.h:132

__forceinline__
#define __forceinline__
Definition: cudaCompat.h:22

mps_fire.i
i
Definition: mps_fire.py:429

heppy_loop.loop
loop
Definition: heppy_loop.py:28

gpuVertexFinder::iv
int32_t *__restrict__ iv
Definition: gpuClusterTracksByDensity.h:46

ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder::pdata
ALPAKA_FN_ACC ALPAKA_FN_INLINE void VtxSoAView & pdata
Definition: clusterTracksByDensity.h:30

dqmiolumiharvest.j
j
Definition: dqmiolumiharvest.py:66

gpuVertexFinder::chi2max
__device__ WsSoAView int float float float chi2max
Definition: gpuClusterTracksByDensity.h:26

zVertex::utilities::MAXVTX
static constexpr uint32_t MAXVTX
Definition: ZVertexUtilities.h:26

gpuVertexFinder::data
auto &__restrict__ data
Definition: gpuClusterTracksByDensity.h:35

__global__
#define __global__
Definition: cudaCompat.h:19

cms::cudacompat::blockDim
const dim3 blockDim
Definition: cudaCompat.h:30

gpuVertexFinder::eps
__device__ WsSoAView int float eps
Definition: gpuClusterTracksByDensity.h:21

ALPAKA_ACCELERATOR_NAMESPACE::brokenline::constexpr
if constexpr(n > 3)
Definition: BrokenLine.h:164

gpuVertexFinder::zt
float const  *__restrict__ zt
Definition: gpuClusterTracksByDensity.h:38

cms::cuda::HistoContainer
Definition: HistoContainer.h:101

gpuVertexFinder::ws
auto &__restrict__ ws
Definition: gpuClusterTracksByDensity.h:36

HistoContainer.h

gpuVertexFinder::nt
auto nt
Definition: gpuClusterTracksByDensity.h:37

createfilelist.int
int
Definition: createfilelist.py:10

gpuVertexFinder::pws
__device__ WsSoAView & pws
Definition: gpuClusterTracksByDensity.h:21

cms::cudacompat::atomicInc
T1 atomicInc(T1 *a, T2 b)
Definition: cudaCompat.h:48

visualization-live-secondInstance_cfg.m
m
Definition: visualization-live-secondInstance_cfg.py:84

SiStripPI::min
Definition: SiStripPayloadInspectorHelper.h:178

gpuVertexFinder::minT
__device__ WsSoAView int minT
Definition: gpuClusterTracksByDensity.h:21

funct::abs
Abs< T >::type abs(const T &t)
Definition: Abs.h:22

cms::alpakatools::forEachInBins
ALPAKA_FN_ACC ALPAKA_FN_INLINE void forEachInBins(Hist const &hist, V value, int n, Func func)
Definition: HistoContainer.h:101

gpuVertexFinder.h

gpuVertexFinder
Definition: gpuClusterTracksByDensity.h:13

cms::cuda::HistoContainer::Counter
typename Base::Counter Counter
Definition: HistoContainer.h:105

gpuVertexFinder::ezt2
float const  *__restrict__ ezt2
Definition: gpuClusterTracksByDensity.h:39

gpuVertexFinder::hws
__shared__ Hist::Counter hws[32]
Definition: gpuClusterTracksByDensity.h:56

gpuVertexFinder::izt
uint8_t *__restrict__ izt
Definition: gpuClusterTracksByDensity.h:44

gpuVertexFinder::nvFinal
uint32_t & nvFinal
Definition: gpuClusterTracksByDensity.h:41

SiStripPI::max
Definition: SiStripPayloadInspectorHelper.h:178

gpuVertexFinder::hist
__shared__ Hist hist
Definition: gpuClusterTracksByDensity.h:55

cms::cuda::HistoContainer::nbins
static constexpr uint32_t nbins()
Definition: HistoContainer.h:123

gpuVertexFinder::errmax
__device__ WsSoAView int float float errmax
Definition: gpuClusterTracksByDensity.h:21

gpuVertexFinder::WsSoAView
gpuVertexFinder::workSpace::PixelVertexWorkSpaceSoAView WsSoAView
Definition: gpuVertexFinder.h:18

cms::cuda::HistoContainer::totbins
static constexpr uint32_t totbins()
Definition: HistoContainer.h:125

gpuVertexFinder::nn
int32_t *__restrict__ nn
Definition: gpuClusterTracksByDensity.h:45

gpuVertexFinder::er2mx
auto er2mx
Definition: gpuClusterTracksByDensity.h:33

cms::cuda::OneToManyAssoc::capacity
constexpr auto capacity() const
Definition: OneToManyAssoc.h:169

gpuVertexFinder::VtxSoAView
zVertex::ZVertexSoAView VtxSoAView
Definition: gpuVertexFinder.h:17

__device__
#define __device__
Definition: SiPixelGainForHLTonGPU.h:15

gpuVertexFinder::foundClusters
__shared__ unsigned int foundClusters
Definition: gpuClusterTracksByDensity.h:185

cms::cuda::OneToManyAssoc::off
off[c.m]
Definition: OneToManyAssoc.h:236

gpuVertexFinder::nvIntermediate
uint32_t & nvIntermediate
Definition: gpuClusterTracksByDensity.h:42

gpuVertexFinder::assert
assert(zt)

cuda_assert.h

gpuVertexFinder::verbose
constexpr bool verbose
Definition: gpuClusterTracksByDensity.h:28

zVertex::utilities::MAXTRACKS
static constexpr uint32_t MAXTRACKS
Definition: ZVertexUtilities.h:25