
gpuClusterTracksByDensity.h
#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h

#include <algorithm>
#include <cmath>
#include <cstdint>

#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

#include "gpuVertexFinder.h"

namespace gpuVertexFinder {

  // this algorithm does not really scale, as it works in a single block...
  // good enough for the <10K tracks we have
  //
  // based on the clustering-by-density algorithm of Rodriguez & Laio
  //
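  // The procedure, all within one block:
  //  1) bin the tracks in z in a shared-memory histogram (bins wider than eps);
  //  2) for each track, count the neighbours compatible within eps and chi2max: its local density nn[i];
  //  3) link each track to its nearest neighbour of higher density;
  //  4) follow the links so that every track points to its density peak (the cluster seed);
  //  5) promote peaks with at least minT neighbours to proto-vertices and flag the rest as noise.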
  __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata,
                                                          gpuVertexFinder::WorkSpace* pws,
                                                          int minT,      // min number of neighbours to be "seed"
                                                          float eps,     // max absolute distance to cluster
                                                          float errmax,  // max error to be "seed"
                                                          float chi2max  // max normalized distance to cluster
  ) {
    using namespace gpuVertexFinder;
    constexpr bool verbose = false;  // in principle the compiler should optimize this out when false

    if (verbose && 0 == threadIdx.x)
      printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);

    auto er2mx = errmax * errmax;

    auto& __restrict__ data = *pdata;
    auto& __restrict__ ws = *pws;
    auto nt = ws.ntrks;
    float const* __restrict__ zt = ws.zt;
    float const* __restrict__ ezt2 = ws.ezt2;

    uint32_t& nvFinal = data.nvFinal;
    uint32_t& nvIntermediate = ws.nvIntermediate;

    uint8_t* __restrict__ izt = ws.izt;
    int32_t* __restrict__ nn = data.ndof;
    int32_t* __restrict__ iv = ws.iv;

    assert(pdata);
    assert(zt);

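    // Book a shared-memory histogram of the binned track z positions; it provides the fast
    // neighbour lookups (via cms::cuda::forEachInBins) used by all the loops below.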
    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;  // z-bin histogram; exact template parameters assumed
    __shared__ Hist hist;
    __shared__ typename Hist::Counter hws[32];
    for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
      hist.off[j] = 0;
    }
    __syncthreads();

    if (verbose && 0 == threadIdx.x)
      printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);

    assert((int)nt <= hist.capacity());

    // fill hist (bin shall be wider than "eps")
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      assert(i < ZVertices::MAXTRACKS);
      int iz = int(zt[i] * 10.);  // valid if eps <= 0.1
      // iz = std::clamp(iz, INT8_MIN, INT8_MAX);  // sorry, c++17 only
      iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
      izt[i] = iz - INT8_MIN;
      assert(iz - INT8_MIN >= 0);
      assert(iz - INT8_MIN < 256);
      hist.count(izt[i]);
      iv[i] = i;
      nn[i] = 0;
    }
    __syncthreads();
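    // hist.finalize() runs a block-wide prefix scan over the per-bin counts (hws is its scratch
    // space), turning them into bin offsets; the fill loop below then scatters the track indices
    // into their bins.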
    if (threadIdx.x < 32)
      hws[threadIdx.x] = 0;  // used by prefix scan...
    __syncthreads();
    hist.finalize(hws);
    __syncthreads();
    assert(hist.size() == nt);
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      hist.fill(izt[i], uint16_t(i));
    }
    __syncthreads();

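    // forEachInBins(hist, izt[i], 1, loop) visits the tracks in track i's own z bin and in the
    // two adjacent bins; since a bin is wider than eps, this covers every track within eps of i.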
    // count neighbours
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (ezt2[i] > er2mx)
        continue;
      auto loop = [&](uint32_t j) {
        if (i == j)
          return;
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > eps)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;
        nn[i]++;
      };

      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

    __syncthreads();

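    // Rodriguez-Laio step: attach each track to its nearest neighbour of higher local density
    // (ties in nn broken by the z ordering), so that following these links always leads uphill
    // towards a local density maximum.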
    // find closest above me .... (we ignore the possibility of two j at same distance from i)
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      float mdist = eps;
      auto loop = [&](uint32_t j) {
        if (nn[j] < nn[i])
          return;
        if (nn[j] == nn[i] && zt[j] >= zt[i])
          return;  // if equal use natural order...
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > mdist)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;  // (break natural order???)
        mdist = dist;
        iv[i] = j;  // assign to cluster (better be unique??)
      };
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

    __syncthreads();

#ifdef GPU_DEBUG
    // mini verification
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] != int(i))
        assert(iv[iv[i]] != int(i));
    }
    __syncthreads();
#endif

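    // Each non-seed track currently points to a denser neighbour; chase the links until a fixed
    // point (iv[m] == m) is reached, so that iv[i] ends up holding the index of its density peak.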
    // consolidate graph (percolate index of seed)
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      auto m = iv[i];
      while (m != iv[m])
        m = iv[m];
      iv[i] = m;
    }

#ifdef GPU_DEBUG
    __syncthreads();
    // mini verification
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] != int(i))
        assert(iv[iv[i]] != int(i));
    }
#endif

#ifdef GPU_DEBUG
    // and verify that we did not split any cluster...
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      auto minJ = i;
      auto mdist = eps;
      auto loop = [&](uint32_t j) {
        if (nn[j] < nn[i])
          return;
        if (nn[j] == nn[i] && zt[j] >= zt[i])
          return;  // if equal use natural order...
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > mdist)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;
        mdist = dist;
        minJ = j;
      };
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
      // should belong to the same cluster...
      assert(iv[i] == iv[minJ]);
      assert(nn[i] <= nn[iv[i]]);
    }
    __syncthreads();
#endif

    __shared__ unsigned int foundClusters;
    foundClusters = 0;
    __syncthreads();

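    // A negative id distinguishes an already-assigned cluster id from a seed index: in the
    // propagation loop below, a non-negative iv[i] is still read as "index of track i's seed".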
    // find the number of different clusters, identified by a track with iv[i] == i and a density
    // larger than the threshold; mark these tracks with a negative id.
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] == int(i)) {
        if (nn[i] >= minT) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          iv[i] = -(old + 1);
        } else {  // noise
          iv[i] = -9998;
        }
      }
    }
    __syncthreads();

    assert(foundClusters < ZVertices::MAXVTX);

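    // At this point each seed holds a negative cluster id and every other track holds the index
    // of its seed, so a single iv[i] = iv[iv[i]] pass copies the seed's id to the whole cluster.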
    // propagate the negative id to all the tracks in the cluster.
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] >= 0) {
        // mark each track in a cluster with the same id as the first one
        iv[i] = iv[iv[i]];
      }
    }
    __syncthreads();

    // adjust the cluster id to be a positive value starting from 0
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      iv[i] = -iv[i] - 1;
    }

    nvIntermediate = nvFinal = foundClusters;

    if (verbose && 0 == threadIdx.x)
      printf("found %d proto vertices\n", foundClusters);
  }

  __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata,
                                               gpuVertexFinder::WorkSpace* pws,
                                               int minT,      // min number of neighbours to be "seed"
                                               float eps,     // max absolute distance to cluster
                                               float errmax,  // max error to be "seed"
                                               float chi2max  // max normalized distance to cluster
  ) {
    clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
  }

}  // namespace gpuVertexFinder

#endif  // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h