d8/d0c/gpuClusterTracksIterative_8h_source.html

 #ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h

 #define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h


 #include <algorithm>

 #include <cmath>

 #include <cstdint>


 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"

 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"


 #include "gpuVertexFinder.h"


 namespace gpuVertexFinder {


   // This algorithm does not really scale, because all the work is done by a single block...

   // ...but that is enough for the <10K tracks we have.

   // Iterative clustering of tracks along z into proto-vertices.
   //
   // Only threadIdx.x / blockDim.x are used for indexing, so the whole job is done by one
   // block of threads; the histogram and the cluster counter live in static __shared__ memory.
   //
   // Inputs (read through pws): ntrks, zt (track z), ezt2 (compared against errmax^2 and used
   // in the chi2 cut).
   // Outputs:
   //   ws.izt            histogram bin of each track
   //   ws.iv             cluster index of each track (0-based; noise tracks end up at 9997)
   //   data.ndof         used here as the per-track neighbour counter
   //   data.nvFinal, ws.nvIntermediate   number of clusters found
   __global__ void clusterTracksIterative(ZVertices* pdata,
                                          WorkSpace* pws,
                                          int minT,      // min number of neighbours to be "core"
                                          float eps,     // max absolute distance to cluster
                                          float errmax,  // max error to be "seed"
                                          float chi2max  // max normalized distance to cluster
   ) {
     constexpr bool verbose = false;  // in principle the compiler should optimize the printouts out if false

     // check the inputs before anything is read through them
     assert(pdata);
     assert(pws);

     if (verbose && 0 == threadIdx.x)
       printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);

     auto er2mx = errmax * errmax;

     auto& __restrict__ data = *pdata;
     auto& __restrict__ ws = *pws;
     auto nt = ws.ntrks;
     float const* __restrict__ zt = ws.zt;
     float const* __restrict__ ezt2 = ws.ezt2;

     uint32_t& nvFinal = data.nvFinal;
     uint32_t& nvIntermediate = ws.nvIntermediate;

     uint8_t* __restrict__ izt = ws.izt;
     int32_t* __restrict__ nn = data.ndof;  // neighbour counter, stored in the ndof array
     int32_t* __restrict__ iv = ws.iv;

     assert(zt);

     using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
     __shared__ Hist hist;
     __shared__ typename Hist::Counter hws[32];
     // reset the bin offsets before counting
     for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
       hist.off[j] = 0;
     }
     __syncthreads();

     if (verbose && 0 == threadIdx.x)
       printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);

     assert((int)nt <= hist.capacity());

     // fill hist  (bin shall be wider than "eps")
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       assert(i < ZVertices::MAXTRACKS);
       int iz = int(zt[i] * 10.);  // valid if eps<=0.1
       iz = std::clamp(iz, INT8_MIN, INT8_MAX);
       izt[i] = iz - INT8_MIN;  // shift the clamped value into the 0..255 bin range
       assert(iz - INT8_MIN >= 0);
       assert(iz - INT8_MIN < 256);
       hist.count(izt[i]);
       iv[i] = i;  // every track starts as its own cluster
       nn[i] = 0;
     }
     __syncthreads();
     if (threadIdx.x < 32)
       hws[threadIdx.x] = 0;  // used by prefix scan...
     __syncthreads();
     hist.finalize(hws);
     __syncthreads();
     assert(hist.size() == nt);
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       hist.fill(izt[i], uint16_t(i));
     }
     __syncthreads();

     // count neighbours
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       if (ezt2[i] > er2mx)
         continue;  // error too large: nn[i] stays 0
       auto loop = [&](uint32_t j) {
         if (i == j)
           return;
         auto dist = std::abs(zt[i] - zt[j]);
         if (dist > eps)
           return;
         if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
           return;
         nn[i]++;
       };

       cms::cuda::forEachInBins(hist, izt[i], 1, loop);
     }

     // Phase counter of the loop below: even = link neighbouring core tracks, odd = follow
     // the iv[] chains down to their end. It is kept per thread on purpose:
     // __syncthreads_or() returns the same value to every thread, so all threads execute the
     // same number of iterations and always agree on the phase. A block-shared counter
     // incremented by thread 0 could be read by a slower warp only after the increment,
     // with no barrier in between, and that warp would then run the wrong phase.
     int nloops = 0;

     // make the neighbour counts visible to all threads
     __syncthreads();

     // cluster seeds only
     bool more = true;
     while (__syncthreads_or(more)) {
       if (1 == nloops % 2) {
         // point every track directly at the end of its iv[] chain
         for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
           auto m = iv[i];
           while (m != iv[m])
             m = iv[m];
           iv[i] = m;
         }
       } else {
         more = false;
         for (auto k = threadIdx.x; k < hist.size(); k += blockDim.x) {
           auto p = hist.begin() + k;
           auto i = (*p);
           if (nn[i] < minT)
             continue;  // DBSCAN core rule
           // scan from the entry after this one up to the end of the next bin
           auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1));
           auto loop = [&](uint32_t j) {
             assert(i != j);
             if (nn[j] < minT)
               return;  // DBSCAN core rule
             auto dist = std::abs(zt[i] - zt[j]);
             if (dist > eps)
               return;
             if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
               return;
             auto old = atomicMin(&iv[j], iv[i]);
             if (old != iv[i]) {
               // end the loop only if no changes were applied
               more = true;
             }
             atomicMin(&iv[i], old);
           };
           ++p;
           for (; p < hist.end(be); ++p)
             loop(*p);
         }  // for i
       }
       ++nloops;
     }  // while

     // collect edges (assign to closest cluster of closest point??? here to closest point)
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       //    if (nn[i]==0 || nn[i]>=minT) continue;    // DBSCAN edge rule
       if (nn[i] >= minT)
         continue;  // DBSCAN edge rule
       float mdist = eps;
       auto loop = [&](int j) {
         if (nn[j] < minT)
           return;  // DBSCAN core rule
         auto dist = std::abs(zt[i] - zt[j]);
         if (dist > mdist)
           return;
         if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
           return;  // needed?
         mdist = dist;
         iv[i] = iv[j];  // assign to cluster (better be unique??)
       };
       cms::cuda::forEachInBins(hist, izt[i], 1, loop);
     }

     __shared__ unsigned int foundClusters;
     foundClusters = 0;
     __syncthreads();

     // find the number of different clusters, identified by a track with iv[i] == i;
     // mark these tracks with a negative id.
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       if (iv[i] == int(i)) {
         if (nn[i] >= minT) {
           auto old = atomicInc(&foundClusters, 0xffffffff);
           iv[i] = -(old + 1);
         } else {  // noise
           iv[i] = -9998;
         }
       }
     }
     __syncthreads();

     assert(foundClusters < ZVertices::MAXVTX);

     // propagate the negative id to all the tracks in the cluster.
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       if (iv[i] >= 0) {
         // mark each track in a cluster with the same id as the first one
         iv[i] = iv[iv[i]];
       }
     }
     __syncthreads();

     // adjust the cluster id to be a positive value starting from 0
     for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
       iv[i] = -iv[i] - 1;
     }

     nvIntermediate = nvFinal = foundClusters;

     if (verbose && 0 == threadIdx.x)
       printf("found %d proto vertices\n", foundClusters);
   }


 }  // namespace gpuVertexFinder


 #endif  // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h

cms::cudacompat::threadIdx
const dim3 threadIdx
Definition: cudaCompat.h:29

gpuVertexFinder::__syncthreads
__syncthreads()
Definition: cudaCompat.h:108

cms::cudacompat::__syncthreads_or
bool __syncthreads_or(bool x)
Definition: cudaCompat.h:110

dqmiolumiharvest.j
tuple j
Definition: dqmiolumiharvest.py:66

mps_fire.i
i
Definition: mps_fire.py:428

gpuVertexFinder::iv
int32_t *__restrict__ iv
Definition: gpuClusterTracksDBSCAN.h:42

gpuVertexFinder::eps
WorkSpace int float eps
Definition: gpuClusterTracksDBSCAN.h:18

isotrackApplyRegressor.k
int k
Definition: isotrackApplyRegressor.py:91

verbose
bool verbose
Definition: beamSpotDipStandalone.cc:87

gpuVertexFinder::assert
assert(pdata)

gpuVertexFinder::nloops
__shared__ int nloops
Definition: gpuClusterTracksIterative.h:102

gpuVertexFinder::ezt2
float const *__restrict__ ezt2
Definition: gpuClusterTracksDBSCAN.h:35

gpuVertexFinder::data
auto &__restrict__ data
Definition: gpuClusterTracksDBSCAN.h:31

gpuVertexFinder::zt
float const *__restrict__ zt
Definition: gpuClusterTracksDBSCAN.h:34

__global__
#define __global__
Definition: cudaCompat.h:19

cms::cudacompat::blockDim
const dim3 blockDim
Definition: cudaCompat.h:30

ZVertexSoA::MAXVTX
static constexpr uint32_t MAXVTX
Definition: ZVertexSoA.h:12

cms::cuda::HistoContainer
Definition: HistoContainer.h:101

gpuVertexFinder::ws
auto &__restrict__ ws
Definition: gpuClusterTracksDBSCAN.h:32

visualization-live-secondInstance_cfg.m
tuple m
Definition: visualization-live-secondInstance_cfg.py:79

gpuVertexFinder::more
bool more
Definition: gpuClusterTracksIterative.h:108

gpuVertexFinder::nt
auto nt
Definition: gpuClusterTracksDBSCAN.h:33

gpuVertexFinder::chi2max
WorkSpace int float float float chi2max
Definition: gpuClusterTracksDBSCAN.h:23

gpuVertexFinder::pws
WorkSpace * pws
Definition: gpuClusterTracksDBSCAN.h:18

gpuVertexFinder::Hist
cms::cuda::HistoContainer< uint8_t, 256, 16000, 8, uint16_t > Hist
Definition: gpuClusterTracksDBSCAN.h:47

cms::cudacompat::atomicInc
T1 atomicInc(T1 *a, T2 b)
Definition: cudaCompat.h:48

gpuVertexFinder::printf
printf("params %d %f %f %f\n", minT, eps, errmax, chi2max)

cms::cuda::be
int be
Definition: HistoContainer.h:75

SiStripPI::min
Definition: SiStripPayloadInspectorHelper.h:169

funct::abs
Abs< T >::type abs(const T &t)
Definition: Abs.h:22

gpuVertexFinder.h

cms::cuda::HistoContainer::Counter
typename Base::Counter Counter
Definition: HistoContainer.h:105

gpuVertexFinder::hws
__shared__ Hist::Counter hws[32]
Definition: gpuClusterTracksDBSCAN.h:49

gpuVertexFinder::izt
uint8_t *__restrict__ izt
Definition: gpuClusterTracksDBSCAN.h:40

gpuVertexFinder::nvFinal
uint32_t & nvFinal
Definition: gpuClusterTracksDBSCAN.h:37

gpuVertexFinder::errmax
WorkSpace int float float errmax
Definition: gpuClusterTracksDBSCAN.h:18

heppy_loop.loop
loop
Definition: heppy_loop.py:28

gpuVertexFinder::minT
WorkSpace int minT
Definition: gpuClusterTracksDBSCAN.h:18

gpuVertexFinder::hist
__shared__ Hist hist
Definition: gpuClusterTracksDBSCAN.h:48

HistoContainer.h

gpuVertexFinder::ZVertices
ZVertexSoA ZVertices
Definition: gpuVertexFinder.h:12

cms::cuda::HistoContainer::bin
static constexpr UT bin(T t)
Definition: HistoContainer.h:132

AlCaHLTBitMon_ParallelJobs.p
tuple p
Definition: AlCaHLTBitMon_ParallelJobs.py:153

cms::cuda::HistoContainer::totbins
static constexpr uint32_t totbins()
Definition: HistoContainer.h:125

gpuVertexFinder::er2mx
auto er2mx
Definition: gpuClusterTracksDBSCAN.h:29

gpuVertexFinder::nn
int32_t *__restrict__ nn
Definition: gpuClusterTracksDBSCAN.h:41

ZVertexSoA::MAXTRACKS
static constexpr uint32_t MAXTRACKS
Definition: ZVertexSoA.h:11

cms::cudacompat::atomicMin
T1 atomicMin(T1 *a, T2 b)
Definition: cudaCompat.h:85

gpuVertexFinder::foundClusters
__shared__ unsigned int foundClusters
Definition: gpuClusterTracksDBSCAN.h:199

gpuVertexFinder::nvIntermediate
uint32_t & nvIntermediate
Definition: gpuClusterTracksDBSCAN.h:38

cuda_assert.h