de/d94/gpuSplitVertices_8h_source.html

 #ifndef RecoTracker_PixelVertexFinding_plugins_gpuSplitVertices_h
 #define RecoTracker_PixelVertexFinding_plugins_gpuSplitVertices_h

 #include <algorithm>
 #include <cmath>
 #include <cstdint>

 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

 #include "gpuVertexFinder.h"

 namespace gpuVertexFinder {

   // Attempts to split poorly fitted vertices in two.
   //
   // Work distribution: the outer loop strides over vertices with blockIdx.x / gridDim.x
   // (one vertex per block at a time); the inner loops stride over tracks with
   // threadIdx.x / blockDim.x.
   //
   // For every vertex kv that has nn[kv] >= 4 and chi2[kv] >= maxChi2 * nn[kv]:
   //   1. the tracks with iv[k] == kv are copied to shared memory as a z offset from the
   //      vertex (zt[k] - zv[kv]) and a weight (1 / ezt2[k]), and are initially assigned
   //      to group 0 (negative offset) or group 1 (otherwise);
   //   2. up to 20 iterations recompute the weighted mean z of the two groups and move
   //      every track to the group with the closest mean, stopping when nothing changes;
   //   3. if both groups are non-empty and the two means are separated by a chi2 distance
   //      of at least 4, a new vertex index is taken from ws.nvIntermediate() and written
   //      to iv[] for all the tracks of group 1.
   //
   // Vertices with nn[kv] >= MAXTK (512) are skipped: the shared buffers cannot hold them.
   //
   // Parameters:
   //   pdata    vertex view: reads zv, wv, chi2, ndof and nvFinal
   //   pws      workspace view: reads ntrks, zt, ezt2; updates iv and nvIntermediate
   //   maxChi2  chi2-per-ndof threshold above which a vertex is considered for splitting
   __device__ __forceinline__ void splitVertices(VtxSoAView& pdata, WsSoAView& pws, float maxChi2) {
     constexpr bool verbose = false;  // in principle the compiler should optimize out if false

     auto& __restrict__ data = pdata;
     auto& __restrict__ ws = pws;
     auto nt = ws.ntrks();
     float const* __restrict__ zt = ws.zt();
     float const* __restrict__ ezt2 = ws.ezt2();
     float* __restrict__ zv = data.zv();
     float* __restrict__ wv = data.wv();
     float const* __restrict__ chi2 = data.chi2();
     uint32_t& nvFinal = data.nvFinal();

     int32_t const* __restrict__ nn = data.ndof();
     int32_t* __restrict__ iv = ws.iv();

     assert(zt);
     assert(wv);
     assert(chi2);
     assert(nn);

     // one vertex per block
     // NOTE: every "continue" below depends only on per-vertex or shared values, so all the
     // threads of a block take the same path and the barriers are reached uniformly.
     for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) {
       if (nn[kv] < 4)
         continue;
       if (chi2[kv] < maxChi2 * float(nn[kv]))
         continue;

       constexpr int MAXTK = 512;
       assert(nn[kv] < MAXTK);
       if (nn[kv] >= MAXTK)
         continue;                      // too bad FIXME
       __shared__ uint32_t it[MAXTK];   // track index
       __shared__ float zz[MAXTK];      // z pos
       __shared__ uint8_t newV[MAXTK];  // 0 or 1
       __shared__ float ww[MAXTK];      // z weight

       __shared__ uint32_t nq;  // number of track for this vertex
       nq = 0;
       __syncthreads();

       // copy to local
       for (auto k = threadIdx.x; k < nt; k += blockDim.x) {
         if (iv[k] == int(kv)) {
           // reserve a slot in the shared buffers for this track
           auto old = atomicInc(&nq, MAXTK);
           zz[old] = zt[k] - zv[kv];
           newV[old] = zz[old] < 0 ? 0 : 1;
           ww[old] = 1.f / ezt2[k];
           it[old] = k;
         }
       }

       __shared__ float znew[2], wnew[2];  // the new vertices

       __syncthreads();
       // the number of tracks collected is expected to be ndof + 1
       assert(int(nq) == nn[kv] + 1);

       int maxiter = 20;
       // kt-min....
       bool more = true;
       // __syncthreads_or acts both as the barrier closing the previous iteration and as
       // the block-wide "did any thread reassign a track" test
       while (__syncthreads_or(more)) {
         more = false;
         if (0 == threadIdx.x) {
           znew[0] = 0;
           znew[1] = 0;
           wnew[0] = 0;
           wnew[1] = 0;
         }
         __syncthreads();
         // accumulate the weighted sum of z and the sum of weights of each group
         for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
           auto i = newV[k];
           atomicAdd(&znew[i], zz[k] * ww[k]);
           atomicAdd(&wnew[i], ww[k]);
         }
         __syncthreads();
         if (0 == threadIdx.x) {
           znew[0] /= wnew[0];
           znew[1] /= wnew[1];
         }
         __syncthreads();
         // move each track to the group with the closest mean
         for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
           auto d0 = fabs(zz[k] - znew[0]);
           auto d1 = fabs(zz[k] - znew[1]);
           auto newer = d0 < d1 ? 0 : 1;
           more |= newer != newV[k];
           newV[k] = newer;
         }
         --maxiter;
         if (maxiter <= 0)
           more = false;
       }

       // avoid empty vertices
       if (0 == wnew[0] || 0 == wnew[1])
         continue;

       // quality cut
       auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]);

       auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]);

       if (verbose && 0 == threadIdx.x)
         printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]);

       if (chi2Dist < 4)
         continue;

       // get a new global vertex
       __shared__ uint32_t igv;
       if (0 == threadIdx.x)
         igv = atomicAdd(&ws.nvIntermediate(), 1);
       __syncthreads();
       for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
         if (1 == newV[k])
           iv[it[k]] = igv;
       }

       // make sure every thread is done reading nq (the loop bound above) before any thread
       // starts the next vertex and resets it to 0; without this barrier a slower thread
       // could see nq == 0 and skip its track reassignments
       __syncthreads();

     }  // loop on vertices
   }

   // Kernel entry point for the vertex splitting: forwards its arguments unchanged to
   // splitVertices(). The views are taken by value and passed on by reference.
   // splitVertices() indexes vertices with blockIdx.x / gridDim.x and tracks with
   // threadIdx.x / blockDim.x, so only the x dimension of the launch configuration is used.
   __global__ void splitVerticesKernel(VtxSoAView pdata, WsSoAView pws, float maxChi2) {
     splitVertices(pdata, pws, maxChi2);
   }

 }  // namespace gpuVertexFinder

 #endif  // RecoTracker_PixelVertexFinding_plugins_gpuSplitVertices_h
cms::cudacompat::threadIdx
const dim3 threadIdx
Definition: cudaCompat.h:29

gpuVertexFinder::maxChi2
__device__ WsSoAView float maxChi2
Definition: gpuSplitVertices.h:15

gpuVertexFinder::__syncthreads
__syncthreads()
Definition: cudaCompat.h:132

cms::cudacompat::__syncthreads_or
bool __syncthreads_or(bool x)
Definition: cudaCompat.h:134

gpuVertexFinder::chi2
float *__restrict__ chi2
Definition: gpuFitVertices.h:28

__forceinline__
#define __forceinline__
Definition: cudaCompat.h:22

mps_fire.i
i
Definition: mps_fire.py:429

gpuVertexFinder::iv
int32_t *__restrict__ iv
Definition: gpuClusterTracksByDensity.h:46

ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder::newV
auto & newV
Definition: splitVertices.h:50

ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder::pdata
ALPAKA_FN_ACC ALPAKA_FN_INLINE void VtxSoAView & pdata
Definition: clusterTracksByDensity.h:30

cms::cudacompat::gridDim
const dim3 gridDim
Definition: cudaCompat.h:33

dqmPostProcessing_online.newer
newer
Definition: dqmPostProcessing_online.py:130

geometryCSVtoXML.zz
zz
Definition: geometryCSVtoXML.py:19

gpuVertexFinder::data
auto &__restrict__ data
Definition: gpuClusterTracksByDensity.h:35

__global__
#define __global__
Definition: cudaCompat.h:19

gpuVertexFinder::wv
float *__restrict__ wv
Definition: gpuFitVertices.h:27

gpuVertexFinder::zv
float *__restrict__ zv
Definition: gpuFitVertices.h:26

cms::cudacompat::blockDim
const dim3 blockDim
Definition: cudaCompat.h:30

ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder::ww
auto & ww
Definition: splitVertices.h:51

ALPAKA_ACCELERATOR_NAMESPACE::brokenline::constexpr
if constexpr(n > 3)
Definition: BrokenLine.h:164

gpuVertexFinder::zt
float const  *__restrict__ zt
Definition: gpuClusterTracksByDensity.h:38

gpuVertexFinder::ws
auto &__restrict__ ws
Definition: gpuClusterTracksByDensity.h:36

gpuVertexFinder::more
bool more
Definition: gpuClusterTracksIterative.h:110

HistoContainer.h

gpuVertexFinder::nt
auto nt
Definition: gpuClusterTracksByDensity.h:37

gpuVertexFinder::pws
__device__ WsSoAView & pws
Definition: gpuClusterTracksByDensity.h:21

cms::cudacompat::atomicInc
T1 atomicInc(T1 *a, T2 b)
Definition: cudaCompat.h:48

gpuVertexFinder.h

ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder::MAXTK
constexpr uint32_t MAXTK
Definition: splitVertices.h:46

gpuVertexFinder
Definition: gpuClusterTracksByDensity.h:13

maxiter
static const MaxIter maxiter
Definition: HelixArbitraryPlaneCrossing.cc:30

gpuVertexFinder::ezt2
float const  *__restrict__ ezt2
Definition: gpuClusterTracksByDensity.h:39

gpuVertexFinder::nvFinal
uint32_t & nvFinal
Definition: gpuClusterTracksByDensity.h:41

gpuVertexFinder::splitVertices
splitVertices(pdata, pws, maxChi2ForSplit)

cms::cudacompat::blockIdx
const dim3 blockIdx
Definition: cudaCompat.h:32

d0
static constexpr float d0
Definition: L1EGammaCrystalsEmulatorProducer.cc:83

gpuVertexFinder::WsSoAView
gpuVertexFinder::workSpace::PixelVertexWorkSpaceSoAView WsSoAView
Definition: gpuVertexFinder.h:18

ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder::it
auto & it
Definition: splitVertices.h:48

ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder::nq
auto & nq
Definition: splitVertices.h:52

gpuVertexFinder::nn
int32_t *__restrict__ nn
Definition: gpuClusterTracksByDensity.h:45

d1
static constexpr float d1
Definition: L1EGammaCrystalsEmulatorProducer.cc:83

gpuVertexFinder::VtxSoAView
zVertex::ZVertexSoAView VtxSoAView
Definition: gpuVertexFinder.h:17

__device__
#define __device__
Definition: SiPixelGainForHLTonGPU.h:15

gpuVertexFinder::assert
assert(zt)

cms::cudacompat::atomicAdd
T1 atomicAdd(T1 *a, T2 b)
Definition: cudaCompat.h:61

cuda_assert.h

gpuVertexFinder::verbose
constexpr bool verbose
Definition: gpuClusterTracksByDensity.h:28

dqmdumpme.k
k
Definition: dqmdumpme.py:60