de/d94/gpuSplitVertices_8h_source.html

 #ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h

 #define RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h


 #include <algorithm>

 #include <cmath>

 #include <cstdint>


 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"

 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"


 #include "gpuVertexFinder.h"


 namespace gpuVertexFinder {


   /**
    * Try to split each poorly fitted vertex in two along z.
    *
    * Vertices are distributed over blocks (one vertex per block at a time, strided by gridDim.x); the
    * threads of a block cooperate on the tracks of that vertex. A vertex is considered only if
    * ndof >= 4, chi2 >= maxChi2 * ndof and ndof < MAXTK (512). Its tracks are copied to shared memory
    * and partitioned into two groups by an iterative weighted two-means procedure (at most 20
    * iterations), seeded by the sign of the track z relative to the vertex z. If both groups are
    * non-empty and the separation chi2 of the two means is at least 4, a new vertex index is taken from
    * ws.nvIntermediate and the tracks of the second group are re-assigned to it in ws.iv.
    * Only ws.iv and ws.nvIntermediate are modified here.
    *
    * Launch requirements: only the x dimension of the grid and block indices is used; every thread of
    * a block must call this function, since it contains block-wide barriers. It uses about 6.5 KiB of
    * static shared memory per block.
    *
    * @param pdata    vertex data (zv, wv, chi2, ndof, nvFinal are read); must not be null
    * @param pws      workspace (ntrks, zt, ezt2 are read; iv and nvIntermediate are updated); must not be null
    * @param maxChi2  chi2 per degree of freedom above which a vertex becomes a candidate for splitting
    */
   __device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
     constexpr bool verbose = false;  // in principle the compiler should optimize out if false

     // check the pointers before they are dereferenced below
     assert(pdata);
     assert(pws);

     auto& __restrict__ data = *pdata;
     auto& __restrict__ ws = *pws;
     auto nt = ws.ntrks;
     float const* __restrict__ zt = ws.zt;
     float const* __restrict__ ezt2 = ws.ezt2;
     float* __restrict__ zv = data.zv;
     float* __restrict__ wv = data.wv;
     float const* __restrict__ chi2 = data.chi2;
     uint32_t& nvFinal = data.nvFinal;

     int32_t const* __restrict__ nn = data.ndof;
     int32_t* __restrict__ iv = ws.iv;

     assert(zt);

     // one vertex per block
     for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) {
       // all the early exits below depend only on per-vertex (block-uniform) data,
       // so every thread of the block takes the same path and the barriers stay safe
       if (nn[kv] < 4)
         continue;
       if (chi2[kv] < maxChi2 * float(nn[kv]))
         continue;

       constexpr int MAXTK = 512;
       assert(nn[kv] < MAXTK);
       if (nn[kv] >= MAXTK)
         continue;                      // too bad FIXME
       __shared__ uint32_t it[MAXTK];   // track index
       __shared__ float zz[MAXTK];      // z pos
       __shared__ uint8_t newV[MAXTK];  // 0 or 1
       __shared__ float ww[MAXTK];      // z weight

       __shared__ uint32_t nq;  // number of track for this vertex
       nq = 0;
       __syncthreads();

       // copy to local
       for (auto k = threadIdx.x; k < nt; k += blockDim.x) {
         if (iv[k] == int(kv)) {
           auto old = atomicInc(&nq, MAXTK);
           // the slot must fit in the shared arrays (relies on ndof being consistent with iv)
           assert(old < uint32_t(MAXTK));
           zz[old] = zt[k] - zv[kv];         // z relative to the current vertex position
           newV[old] = zz[old] < 0 ? 0 : 1;  // initial guess: split by the sign of the offset
           ww[old] = 1.f / ezt2[k];
           it[old] = k;
         }
       }

       __shared__ float znew[2], wnew[2];  // the new vertices

       __syncthreads();
       assert(int(nq) == nn[kv] + 1);

       int maxiter = 20;
       // kt-min....
       bool more = true;  // per-thread flag; the loop continues while any thread of the block sets it
       while (__syncthreads_or(more)) {
         more = false;
         if (0 == threadIdx.x) {
           znew[0] = 0;
           znew[1] = 0;
           wnew[0] = 0;
           wnew[1] = 0;
         }
         __syncthreads();
         // accumulate the weighted sum of z and the sum of weights of each group
         for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
           auto i = newV[k];
           atomicAdd(&znew[i], zz[k] * ww[k]);
           atomicAdd(&wnew[i], ww[k]);
         }
         __syncthreads();
         if (0 == threadIdx.x) {
           // an empty group gives 0/0 here; such a split is rejected after the loop
           znew[0] /= wnew[0];
           znew[1] /= wnew[1];
         }
         __syncthreads();
         // re-assign each track to the closest of the two new positions
         for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
           auto d0 = fabs(zz[k] - znew[0]);
           auto d1 = fabs(zz[k] - znew[1]);
           auto newer = d0 < d1 ? 0 : 1;
           more |= newer != newV[k];
           newV[k] = newer;
         }
         --maxiter;
         if (maxiter <= 0)
           more = false;
       }

       // avoid empty vertices
       if (0 == wnew[0] || 0 == wnew[1])
         continue;

       // quality cut
       auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]);

       auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]);

       if (verbose && 0 == threadIdx.x)
         printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]);

       if (chi2Dist < 4)
         continue;

       // get a new global vertex
       __shared__ uint32_t igv;
       if (0 == threadIdx.x)
         igv = atomicAdd(&ws.nvIntermediate, 1);
       __syncthreads();
       for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
         if (1 == newV[k])
           iv[it[k]] = igv;
       }

       // every thread must be done reading nq before the next iteration of the vertex loop
       // resets it; without this barrier a fast thread could cut the loop above short for the others
       __syncthreads();

     }  // loop on vertices
   }


   // Kernel entry point: forwards its arguments unchanged to the splitVertices() device function.
   // splitVertices() assigns vertices to blocks through blockIdx.x / gridDim.x and uses block-wide
   // barriers, so all threads of every launched block take part in the call.
   __global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) {

     splitVertices(pdata, pws, maxChi2);

   }


 }  // namespace gpuVertexFinder


 #endif  // RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h

cms::cudacompat::threadIdx
const dim3 threadIdx
Definition: cudaCompat.h:29

gpuVertexFinder::__syncthreads
__syncthreads()
Definition: cudaCompat.h:108

cms::cudacompat::__syncthreads_or
bool __syncthreads_or(bool x)
Definition: cudaCompat.h:110

__forceinline__
#define __forceinline__
Definition: cudaCompat.h:22

mps_fire.i
i
Definition: mps_fire.py:428

dqmPostProcessing_online.newer
tuple newer
Definition: dqmPostProcessing_online.py:130

gpuVertexFinder::iv
int32_t *__restrict__ iv
Definition: gpuClusterTracksDBSCAN.h:42

cms::cudacompat::gridDim
const dim3 gridDim
Definition: cudaCompat.h:33

isotrackApplyRegressor.k
int k
Definition: isotrackApplyRegressor.py:91

gpuVertexFinder::assert
assert(pdata)

gpuVertexFinder::ezt2
float const *__restrict__ ezt2
Definition: gpuClusterTracksDBSCAN.h:35

gpuVertexFinder::data
auto &__restrict__ data
Definition: gpuClusterTracksDBSCAN.h:31

gpuVertexFinder::zt
float const *__restrict__ zt
Definition: gpuClusterTracksDBSCAN.h:34

__global__
#define __global__
Definition: cudaCompat.h:19

gpuVertexFinder::wv
float *__restrict__ wv
Definition: gpuFitVertices.h:27

gpuVertexFinder::zv
float *__restrict__ zv
Definition: gpuFitVertices.h:26

cms::cudacompat::blockDim
const dim3 blockDim
Definition: cudaCompat.h:30

gpuVertexFinder::chi2
float *__restrict__ chi2
Definition: gpuFitVertices.h:28

gpuVertexFinder::ws
auto &__restrict__ ws
Definition: gpuClusterTracksDBSCAN.h:32

ZVertexSoA
Definition: ZVertexSoA.h:10

gpuVertexFinder::more
bool more
Definition: gpuClusterTracksIterative.h:108

gpuVertexFinder::nt
auto nt
Definition: gpuClusterTracksDBSCAN.h:33

gpuVertexFinder::pws
WorkSpace * pws
Definition: gpuClusterTracksDBSCAN.h:18

verbose
static constexpr int verbose
Definition: HLTExoticaSubAnalysis.cc:25

cms::cudacompat::atomicInc
T1 atomicInc(T1 *a, T2 b)
Definition: cudaCompat.h:48

gpuVertexFinder::printf
printf("params %d %f %f %f\n", minT, eps, errmax, chi2max)

gpuVertexFinder.h

maxiter
static const MaxIter maxiter
Definition: HelixArbitraryPlaneCrossing.cc:30

gpuVertexFinder::nvFinal
uint32_t & nvFinal
Definition: gpuClusterTracksDBSCAN.h:37

cms::cudacompat::blockIdx
const dim3 blockIdx
Definition: cudaCompat.h:32

HistoContainer.h

d0
static constexpr float d0
Definition: L1EGammaCrystalsEmulatorProducer.cc:85

gpuVertexFinder::ZVertices
ZVertexSoA ZVertices
Definition: gpuVertexFinder.h:11

gpuVertexFinder::maxChi2
__device__ WorkSpace float maxChi2
Definition: gpuSplitVertices.h:15

gpuVertexFinder::nn
int32_t *__restrict__ nn
Definition: gpuClusterTracksDBSCAN.h:41

d1
static constexpr float d1
Definition: L1EGammaCrystalsEmulatorProducer.cc:85

__device__
#define __device__
Definition: SiPixelGainForHLTonGPU.h:15

cms::cudacompat::atomicAdd
T1 atomicAdd(T1 *a, T2 b)
Definition: cudaCompat.h:61

cuda_assert.h

gpuVertexFinder::WorkSpace
Definition: gpuVertexFinder.h:15