
gpuClustering.h
#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h

#include <cstdint>
#include <cstdio>

#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

namespace gpuClustering {

#ifdef GPU_DEBUG
  __device__ uint32_t gMaxHit = 0;
#endif

  template <bool isPhase2>
  __global__ void countModules(uint16_t const* __restrict__ id,
                               uint32_t* __restrict__ moduleStart,
                               int32_t* __restrict__ clusterId,
                               int numElements) {
    int first = blockDim.x * blockIdx.x + threadIdx.x;
    constexpr int nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
    assert(nMaxModules < maxNumModules);
    for (int i = first; i < numElements; i += gridDim.x * blockDim.x) {
      clusterId[i] = i;
      if (invalidModuleId == id[i])
        continue;
      auto j = i - 1;
      while (j >= 0 and id[j] == invalidModuleId)
        --j;
      if (j < 0 or id[j] != id[i]) {
        // boundary...
        auto loc = atomicInc(moduleStart, nMaxModules);
        moduleStart[loc + 1] = i;
      }
    }
  }

  template <bool isPhase2>
  __global__ void findClus(uint16_t const* __restrict__ id,           // module id of each pixel
                           uint16_t const* __restrict__ x,            // local coordinates of each pixel
                           uint16_t const* __restrict__ y,            //
                           uint32_t const* __restrict__ moduleStart,  // index of the first pixel of each module
                           uint32_t* __restrict__ nClustersInModule,  // output: number of clusters found in each module
                           uint32_t* __restrict__ moduleId,           // output: module id of each module
                           int32_t* __restrict__ clusterId,           // output: cluster id of each pixel
                           int numElements) {
    __shared__ int msize;

    auto firstModule = blockIdx.x;
    auto endModule = moduleStart[0];

    constexpr int nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
    assert(nMaxModules < maxNumModules);

    for (auto module = firstModule; module < endModule; module += gridDim.x) {
      auto firstPixel = moduleStart[1 + module];
      auto thisModuleId = id[firstPixel];
      assert(thisModuleId < nMaxModules);

#ifdef GPU_DEBUG
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x);
#endif

      auto first = firstPixel + threadIdx.x;

      // find the index of the first pixel not belonging to this module (or invalid)
      msize = numElements;
      __syncthreads();

      // skip threads not associated to an existing pixel
      for (int i = first; i < numElements; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (id[i] != thisModuleId) {  // find the first pixel in a different module
          atomicMin(&msize, i);
          break;
        }
      }

      //init hist (ymax=416 < 512 : 9bits)
      //6000 max pixels required for HI operations with no measurable impact on pp performance
      constexpr uint32_t maxPixInModule = 6000;
      constexpr auto nbins = isPhase2 ? 1024 : phase1PixelTopology::numColsInModule + 2;  //2+2;
      constexpr auto nbits = isPhase2 ? 10 : 9;                                           //2+2;
      using Hist = cms::cuda::HistoContainer<uint16_t, nbins, maxPixInModule, nbits, uint16_t>;
      __shared__ Hist hist;
      __shared__ typename Hist::Counter ws[32];
      for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
        hist.off[j] = 0;
      }
      __syncthreads();

      assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId)));

      // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer)
      if (0 == threadIdx.x) {
        if (msize - firstPixel > maxPixInModule) {
          printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule);
          msize = maxPixInModule + firstPixel;
        }
      }

      __syncthreads();
      assert(msize - firstPixel <= maxPixInModule);

#ifdef GPU_DEBUG
      __shared__ uint32_t totGood;
      totGood = 0;
      __syncthreads();
#endif

      // fill histo
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.count(y[i]);
#ifdef GPU_DEBUG
        atomicAdd(&totGood, 1);
#endif
      }
      __syncthreads();
      if (threadIdx.x < 32)
        ws[threadIdx.x] = 0;  // used by prefix scan...
      __syncthreads();
      hist.finalize(ws);
      __syncthreads();
#ifdef GPU_DEBUG
      assert(hist.size() == totGood);
      if (thisModuleId % 100 == 1)
        if (threadIdx.x == 0)
          printf("histo size %d\n", hist.size());
#endif
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        hist.fill(y[i], i - firstPixel);
      }

#ifdef __CUDA_ARCH__
      // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations
      constexpr int maxiter = 16;
      if (threadIdx.x == 0 && (hist.size() / blockDim.x) >= maxiter)
        printf("THIS IS NOT SUPPOSED TO HAPPEN too many hits in module %d: %d for block size %d\n",
               thisModuleId,
               hist.size(),
               blockDim.x);
#else
      auto maxiter = hist.size();
#endif
      // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event
      constexpr int maxNeighbours = 10;
      assert((hist.size() / blockDim.x) <= maxiter);
      // nearest neighbour
      uint16_t nn[maxiter][maxNeighbours];
      uint8_t nnn[maxiter];  // number of nn
      for (uint32_t k = 0; k < maxiter; ++k)
        nnn[k] = 0;

      __syncthreads();  // for hit filling!

#ifdef GPU_DEBUG
      // look for anomalous high occupancy
      __shared__ uint32_t n40, n60;
      n40 = n60 = 0;
      __syncthreads();
      for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) {
        if (hist.size(j) > 60)
          atomicAdd(&n60, 1);
        if (hist.size(j) > 40)
          atomicAdd(&n40, 1);
      }
      __syncthreads();
      if (0 == threadIdx.x) {
        if (n60 > 0)
          printf("columns with more than 60 px %d in %d\n", n60, thisModuleId);
        else if (n40 > 0)
          printf("columns with more than 40 px %d in %d\n", n40, thisModuleId);
      }
      __syncthreads();
#endif

      // fill NN
      for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
        assert(k < maxiter);
        auto p = hist.begin() + j;
        auto i = *p + firstPixel;
        assert(id[i] != invalidModuleId);
        assert(id[i] == thisModuleId);  // same module
        int be = Hist::bin(y[i] + 1);
        auto e = hist.end(be);
        ++p;
        assert(0 == nnn[k]);
        for (; p < e; ++p) {
          auto m = (*p) + firstPixel;
          assert(m != i);
          assert(int(y[m]) - int(y[i]) >= 0);
          assert(int(y[m]) - int(y[i]) <= 1);
          if (std::abs(int(x[m]) - int(x[i])) > 1)
            continue;
          auto l = nnn[k]++;
          assert(l < maxNeighbours);
          nn[k][l] = *p;
        }
      }

      // for each pixel, look at all the pixels until the end of the module;
      // when two valid pixels within +/- 1 in both x and y are found, set their cluster id to the minimum;
      // after the loop, all the pixels in each cluster should have an id equal to that of the lowest
      // pixel in the cluster (for which clusterId[i] == i).
      bool more = true;
      int nloops = 0;
      while (__syncthreads_or(more)) {
        if (1 == nloops % 2) {
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            auto m = clusterId[i];
            while (m != clusterId[m])
              m = clusterId[m];
            clusterId[i] = m;
          }
        } else {
          more = false;
          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
            auto p = hist.begin() + j;
            auto i = *p + firstPixel;
            for (int kk = 0; kk < nnn[k]; ++kk) {
              auto l = nn[k][kk];
              auto m = l + firstPixel;
              assert(m != i);
              auto old = atomicMin_block(&clusterId[m], clusterId[i]);
              // do we need memory fence?
              if (old != clusterId[i]) {
                // end the loop only if no changes were applied
                more = true;
              }
              atomicMin_block(&clusterId[i], old);
            }  // nnloop
          }    // pixel loop
        }
        ++nloops;
      }  // end while

#ifdef GPU_DEBUG
      {
        __shared__ int n0;
        if (threadIdx.x == 0)
          n0 = nloops;
        __syncthreads();
        auto ok = n0 == nloops;
        assert(__syncthreads_and(ok));
        if (thisModuleId % 100 == 1)
          if (threadIdx.x == 0)
            printf("# loops %d\n", nloops);
      }
#endif

      __shared__ unsigned int foundClusters;
      foundClusters = 0;
      __syncthreads();

      // find the number of different clusters, identified by the pixels with clusterId[i] == i;
      // mark these pixels with a negative id.
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] == i) {
          auto old = atomicInc(&foundClusters, 0xffffffff);
          clusterId[i] = -(old + 1);
        }
      }
      __syncthreads();

      // propagate the negative id to all the pixels in the cluster.
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId)  // skip invalid pixels
          continue;
        if (clusterId[i] >= 0) {
          // mark each pixel in a cluster with the same id as the first one
          clusterId[i] = clusterId[clusterId[i]];
        }
      }
      __syncthreads();

      // adjust the cluster id to be a positive value starting from 0
      for (int i = first; i < msize; i += blockDim.x) {
        if (id[i] == invalidModuleId) {  // skip invalid pixels
          clusterId[i] = invalidClusterId;
          continue;
        }
        clusterId[i] = -clusterId[i] - 1;
      }
      __syncthreads();

      if (threadIdx.x == 0) {
        nClustersInModule[thisModuleId] = foundClusters;
        moduleId[module] = thisModuleId;
#ifdef GPU_DEBUG
        if (foundClusters > gMaxHit) {
          gMaxHit = foundClusters;
          if (foundClusters > 8)
            printf("max hit %d in %d\n", foundClusters, thisModuleId);
        }
#endif
#ifdef GPU_DEBUG
        if (thisModuleId % 100 == 1)
          printf("%d clusters in module %d\n", foundClusters, thisModuleId);
#endif
      }
    }  // module loop
  }
}  // namespace gpuClustering

#endif  // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
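
For orientation, the sketch below shows how these kernels might be driven from the host. It is an illustration only, not the actual CMSSW caller: the helper function and the buffer names (d_id, d_x, d_y, d_moduleStart, d_clusInModule, d_moduleId, d_clusterId) as well as the nPixels count are hypothetical. countModules is launched with a 1D grid over all digis and fills moduleStart (its first word is used as an atomic module counter, so it must be zeroed beforehand), while findClus strides over modules by gridDim.x, so it can be launched with up to one block per module.

// Hypothetical host-side driver, for illustration only (names are not from CMSSW).
#include <cstdint>
#include <cuda_runtime.h>
#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"

void clusterizeOnGPU(uint16_t const* d_id,      // module id of each pixel (device memory)
                     uint16_t const* d_x,       // local x of each pixel
                     uint16_t const* d_y,       // local y of each pixel
                     uint32_t* d_moduleStart,   // [0] = module counter, [1..] = first pixel of each module
                     uint32_t* d_clusInModule,  // output: number of clusters per module
                     uint32_t* d_moduleId,      // output: module id of each module
                     int32_t* d_clusterId,      // output: cluster id of each pixel
                     int nPixels,
                     cudaStream_t stream) {
  using namespace gpuClustering;

  // moduleStart[0] is incremented atomically by countModules, so reset it first
  cudaMemsetAsync(d_moduleStart, 0, sizeof(uint32_t), stream);

  // one thread per digi for the module-boundary search
  int threadsPerBlock = 256;
  int blocks = (nPixels + threadsPerBlock - 1) / threadsPerBlock;
  countModules<false><<<blocks, threadsPerBlock, 0, stream>>>(d_id, d_moduleStart, d_clusterId, nPixels);

  // roughly one block per module; each block clusterizes the modules it is assigned
  findClus<false><<<phase1PixelTopology::numberOfModules, threadsPerBlock, 0, stream>>>(
      d_id, d_x, d_y, d_moduleStart, d_clusInModule, d_moduleId, d_clusterId, nPixels);
}

The grid size chosen for findClus only determines how many modules each block handles in turn, since the kernel loops over modules in steps of gridDim.x.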