CMS 3D CMS Logo

gpuClusterTracksIterative.h
Go to the documentation of this file.
1 #ifndef RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
2 #define RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
3 
#include <algorithm>
#include <cmath>
#include <cstdint>

// NOTE(review): original lines 8-9 were lost in extraction (gap between line 7
// and line 10). They must bring in the cms::cuda histogram utilities
// (HistoContainer, forEachInBins) and the device-side assert used below.
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

#include "gpuVertexFinder.h"
12 
namespace gpuVertexFinder {

  // Iterative DBSCAN-like clustering of tracks along z into proto-vertices.
  //
  // This algo does not really scale as it works in a single block...
  // enough for the <10K tracks we have.
  //
  // Inputs (via the workspace view pws): zt (track z), ezt2 (z error squared).
  // Outputs: ws.iv() holds the cluster index per track (0-based),
  //          data.ndof() is reused as the neighbour count per track,
  //          data.nvFinal() / ws.nvIntermediate() receive the cluster count.
  //
  // NOTE(review): this listing was recovered from a Doxygen page that dropped
  // several source lines (49, 101, 111, 169, 190, 206); the lines marked
  // "reconstructed" below were restored from the page's cross-reference index
  // — verify against the original file.
  __global__ void clusterTracksIterative(VtxSoAView pdata,
                                         WsSoAView pws,
                                         int minT,      // min number of neighbours to be "core"
                                         float eps,     // max absolute distance to cluster
                                         float errmax,  // max error to be "seed"
                                         float chi2max  // max normalized distance to cluster
  ) {
    constexpr bool verbose = false;  // in principle the compiler should optimize this out if false

    if (verbose && 0 == threadIdx.x)
      printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);

    auto er2mx = errmax * errmax;

    auto& __restrict__ data = pdata;
    auto& __restrict__ ws = pws;
    auto nt = ws.ntrks();
    float const* __restrict__ zt = ws.zt();
    float const* __restrict__ ezt2 = ws.ezt2();

    uint32_t& nvFinal = data.nvFinal();
    uint32_t& nvIntermediate = ws.nvIntermediate();

    uint8_t* __restrict__ izt = ws.izt();
    int32_t* __restrict__ nn = data.ndof();  // reused as neighbour counter here
    int32_t* __restrict__ iv = ws.iv();

    assert(zt);
    assert(nn);
    assert(iv);
    assert(ezt2);

    // reconstructed (original line 49, per cross-reference index):
    using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
    __shared__ Hist hist;
    __shared__ typename Hist::Counter hws[32];
    for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
      hist.off[j] = 0;
    }
    __syncthreads();

    if (verbose && 0 == threadIdx.x)
      printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);

    assert((int)nt <= hist.capacity());

    // fill hist (bin shall be wider than "eps")
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      int iz = int(zt[i] * 10.);  // valid if eps<=0.1
      iz = std::clamp(iz, INT8_MIN, INT8_MAX);
      izt[i] = iz - INT8_MIN;  // shift to [0, 255] so it fits the uint8_t bin key
      assert(iz - INT8_MIN >= 0);
      assert(iz - INT8_MIN < 256);
      hist.count(izt[i]);
      iv[i] = i;  // each track initially points at itself (union-find style)
      nn[i] = 0;
    }
    __syncthreads();
    if (threadIdx.x < 32)
      hws[threadIdx.x] = 0;  // used by prefix scan...
    __syncthreads();
    hist.finalize(hws);
    __syncthreads();
    assert(hist.size() == nt);
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      hist.fill(izt[i], uint16_t(i));
    }
    __syncthreads();

    // count neighbours
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (ezt2[i] > er2mx)
        continue;  // too large an error to be a "seed"
      auto loop = [&](uint32_t j) {
        if (i == j)
          return;
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > eps)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;
        nn[i]++;
      };
      // reconstructed (original line 101): scan the +-1 neighbouring bins
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

    __shared__ int nloops;
    nloops = 0;  // benign: every thread writes the same value

    __syncthreads();

    // cluster seeds only
    bool more = true;
    // reconstructed (original line 111, per cross-reference index):
    // iterate until no thread in the block changed any assignment
    while (__syncthreads_or(more)) {
      if (1 == nloops % 2) {
        // odd passes: path compression — point each track at its chain root
        for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
          auto m = iv[i];
          while (m != iv[m])
            m = iv[m];
          iv[i] = m;
        }
      } else {
        // even passes: merge mutually-compatible core tracks
        more = false;
        for (auto k = threadIdx.x; k < hist.size(); k += blockDim.x) {
          auto p = hist.begin() + k;
          auto i = (*p);
          auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1));
          if (nn[i] < minT)
            continue;  // DBSCAN core rule
          auto loop = [&](uint32_t j) {
            assert(i != j);
            if (nn[j] < minT)
              return;  // DBSCAN core rule
            auto dist = std::abs(zt[i] - zt[j]);
            if (dist > eps)
              return;
            if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
              return;
            auto old = atomicMin(&iv[j], iv[i]);
            if (old != iv[i]) {
              // end the loop only if no changes were applied
              more = true;
            }
            atomicMin(&iv[i], old);
          };
          // scan only forward within this bin and the next one
          ++p;
          for (; p < hist.end(be); ++p)
            loop(*p);
        }  // for i
      }
      if (threadIdx.x == 0)
        ++nloops;
    }  // while

    // collect edges (assign to closest cluster of closest point??? here to closest point)
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      //    if (nn[i]==0 || nn[i]>=minT) continue;    // DBSCAN edge rule
      if (nn[i] >= minT)
        continue;  // DBSCAN edge rule
      float mdist = eps;
      auto loop = [&](int j) {
        if (nn[j] < minT)
          return;  // DBSCAN core rule
        auto dist = std::abs(zt[i] - zt[j]);
        if (dist > mdist)
          return;
        if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
          return;  // needed?
        mdist = dist;
        iv[i] = iv[j];  // assign to cluster (better be unique??)
      };
      // reconstructed (original line 169): scan the +-1 neighbouring bins
      cms::cuda::forEachInBins(hist, izt[i], 1, loop);
    }

    __shared__ unsigned int foundClusters;
    foundClusters = 0;  // benign: every thread writes the same value
    __syncthreads();

    // find the number of different clusters, identified by a tracks with clus[i] == i;
    // mark these tracks with a negative id.
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] == int(i)) {
        if (nn[i] >= minT) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          iv[i] = -(old + 1);
        } else {  // noise
          iv[i] = -9998;
        }
      }
    }
    __syncthreads();

    // reconstructed (original line 190) — the index lists zVertex::MAXVTX;
    // verify the exact qualified name against the original file:
    assert(foundClusters < zVertex::MAXVTX);

    // propagate the negative id to all the tracks in the cluster.
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      if (iv[i] >= 0) {
        // mark each track in a cluster with the same id as the first one
        iv[i] = iv[iv[i]];
      }
    }
    __syncthreads();

    // adjust the cluster id to be a positive value starting from 0
    for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
      iv[i] = -iv[i] - 1;
    }

    // reconstructed (original line 206): publish the cluster count,
    // otherwise the kernel's result never leaves the block
    nvIntermediate = nvFinal = foundClusters;

    if (verbose && 0 == threadIdx.x)
      printf("found %d proto vertices\n", foundClusters);
  }

}  // namespace gpuVertexFinder
213 
214 #endif // RecoVertex_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
const dim3 threadIdx
Definition: cudaCompat.h:29
ALPAKA_FN_ACC ALPAKA_FN_INLINE VtxSoAView & pdata
Definition: fitVertices.h:19
bool __syncthreads_or(bool x)
Definition: cudaCompat.h:134
int32_t *__restrict__ iv
__device__ WsSoAView int float float float chi2max
static constexpr uint32_t MAXVTX
auto &__restrict__ data
#define __global__
Definition: cudaCompat.h:19
const dim3 blockDim
Definition: cudaCompat.h:30
__device__ WsSoAView int float eps
float const *__restrict__ zt
auto &__restrict__ ws
__device__ WsSoAView & pws
cms::cuda::HistoContainer< uint8_t, 256, 16000, 8, uint16_t > Hist
T1 atomicInc(T1 *a, T2 b)
Definition: cudaCompat.h:48
__device__ WsSoAView int minT
Abs< T >::type abs(const T &t)
Definition: Abs.h:22
ALPAKA_FN_ACC ALPAKA_FN_INLINE void forEachInBins(Hist const &hist, V value, int n, Func func)
typename Base::Counter Counter
float const *__restrict__ ezt2
__shared__ Hist::Counter hws[32]
uint8_t *__restrict__ izt
static constexpr uint32_t nbins()
static constexpr UT bin(T t)
__device__ WsSoAView int float float errmax
gpuVertexFinder::workSpace::PixelVertexWorkSpaceSoAView WsSoAView
static constexpr uint32_t totbins()
int32_t *__restrict__ nn
constexpr auto capacity() const
T1 atomicMin(T1 *a, T2 b)
Definition: cudaCompat.h:85
zVertex::ZVertexSoAView VtxSoAView
__shared__ unsigned int foundClusters
static constexpr uint32_t MAXTRACKS