OneToManyAssoc.h

#ifndef HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h
#define HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h

#include <algorithm>
#ifndef __CUDA_ARCH__
#include <atomic>
#endif  // __CUDA_ARCH__
#include <cstddef>
#include <cstdint>
#include <type_traits>

#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"
#include "HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"

namespace cms {
  namespace cuda {

    template <typename Assoc>
    struct OneToManyAssocView {
      using Counter = typename Assoc::Counter;
      using index_type = typename Assoc::index_type;

      Assoc *assoc = nullptr;
      Counter *offStorage = nullptr;
      index_type *contentStorage = nullptr;
      int32_t offSize = -1;
      int32_t contentSize = -1;
    };

    // this MUST BE DONE in a single block (or in two kernels!)
    template <typename Assoc>
    __global__ void zeroAndInit(OneToManyAssocView<Assoc> view) {
      auto h = view.assoc;
      assert(1 == gridDim.x);
      assert(0 == blockIdx.x);

      int first = threadIdx.x;

      if (0 == first) {
        h->psws = 0;
        h->initStorage(view);
      }
      __syncthreads();
      for (int i = first, nt = h->totOnes(); i < nt; i += blockDim.x) {
        h->off[i] = 0;
      }
    }

    template <typename Assoc>
    inline __attribute__((always_inline)) void launchZero(Assoc *h,
                                                          cudaStream_t stream
#ifndef __CUDACC__
                                                          = cudaStreamDefault
#endif
    ) {
      typename Assoc::View view = {h, nullptr, nullptr, -1, -1};
      launchZero(view, stream);
    }

    template <typename Assoc>
    inline __attribute__((always_inline)) void launchZero(OneToManyAssocView<Assoc> view,
                                                          cudaStream_t stream
#ifndef __CUDACC__
                                                          = cudaStreamDefault
#endif
    ) {
      if constexpr (Assoc::ctCapacity() < 0) {
        assert(view.contentStorage);
        assert(view.contentSize > 0);
      }
      if constexpr (Assoc::ctNOnes() < 0) {
        assert(view.offStorage);
        assert(view.offSize > 0);
      }
#ifdef __CUDACC__
      auto nthreads = 1024;
      auto nblocks = 1;  // MUST BE ONE as memory is initialized in thread 0 (the alternative is two kernels)
      zeroAndInit<<<nblocks, nthreads, 0, stream>>>(view);
      cudaCheck(cudaGetLastError());
#else
      auto h = view.assoc;
      assert(h);
      h->initStorage(view);
      h->zero();
      h->psws = 0;
#endif
    }

    template <typename Assoc>
    inline __attribute__((always_inline)) void launchFinalize(Assoc *h,
                                                              cudaStream_t stream
#ifndef __CUDACC__
                                                              = cudaStreamDefault
#endif
    ) {
      typename Assoc::View view = {h, nullptr, nullptr, -1, -1};
      launchFinalize(view, stream);
    }

    template <typename Assoc>
    inline __attribute__((always_inline)) void launchFinalize(OneToManyAssocView<Assoc> view,
                                                              cudaStream_t stream
#ifndef __CUDACC__
                                                              = cudaStreamDefault
#endif
    ) {
      auto h = view.assoc;
      assert(h);
#ifdef __CUDACC__
      using Counter = typename Assoc::Counter;
      Counter *poff = (Counter *)((char *)(h) + offsetof(Assoc, off));
      auto nOnes = Assoc::ctNOnes();
      if constexpr (Assoc::ctNOnes() < 0) {
        assert(view.offStorage);
        assert(view.offSize > 0);
        nOnes = view.offSize;
        poff = view.offStorage;
      }
      assert(nOnes > 0);
      int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Assoc, psws));
      auto nthreads = 1024;
      auto nblocks = (nOnes + nthreads - 1) / nthreads;
      multiBlockPrefixScan<<<nblocks, nthreads, sizeof(int32_t) * nblocks, stream>>>(poff, poff, nOnes, ppsws);
      cudaCheck(cudaGetLastError());
#else
      h->finalize();
#endif
    }

    template <typename Assoc>
    __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) {
      assoc->bulkFinalizeFill(*apc);
    }

    template <typename I,    // type stored in the container (usually an index in a vector of the input values)
              int32_t ONES,  // number of "Ones" + 1. If -1, the offsets are sized at runtime using external storage
              int32_t SIZE   // max number of elements. If -1, the content is sized at runtime using external storage
              >
    class OneToManyAssoc {
    public:
      using View = OneToManyAssocView<OneToManyAssoc<I, ONES, SIZE>>;
      using Counter = uint32_t;

      using CountersOnly = OneToManyAssoc<I, ONES, 0>;

      using index_type = I;

      static constexpr uint32_t ilog2(uint32_t v) {
        constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
        constexpr uint32_t s[] = {1, 2, 4, 8, 16};

        uint32_t r = 0;  // result of log2(v) will go here
        for (auto i = 4; i >= 0; i--)
          if (v & b[i]) {
            v >>= s[i];
            r |= s[i];
          }
        return r;
      }

      static constexpr int32_t ctNOnes() { return ONES; }
      constexpr auto totOnes() const { return off.capacity(); }
      constexpr auto nOnes() const { return totOnes() - 1; }
      static constexpr int32_t ctCapacity() { return SIZE; }
      constexpr auto capacity() const { return content.capacity(); }

      __host__ __device__ void initStorage(View view) {
        assert(view.assoc == this);
        if constexpr (ctCapacity() < 0) {
          assert(view.contentStorage);
          assert(view.contentSize > 0);
          content.init(view.contentStorage, view.contentSize);
        }
        if constexpr (ctNOnes() < 0) {
          assert(view.offStorage);
          assert(view.offSize > 0);
          off.init(view.offStorage, view.offSize);
        }
      }

      __host__ __device__ void zero() {
        for (int32_t i = 0; i < totOnes(); ++i) {
          off[i] = 0;
        }
      }

      __host__ __device__ __forceinline__ void add(CountersOnly const &co) {
        for (int32_t i = 0; i < totOnes(); ++i) {
#ifdef __CUDA_ARCH__
          atomicAdd(off.data() + i, co.off[i]);
#else
          auto &a = (std::atomic<Counter> &)(off[i]);
          a += co.off[i];
#endif
        }
      }

      static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) {
#ifdef __CUDA_ARCH__
        return atomicAdd(&x, 1);
#else
        auto &a = (std::atomic<Counter> &)(x);
        return a++;
#endif
      }

      static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) {
#ifdef __CUDA_ARCH__
        return atomicSub(&x, 1);
#else
        auto &a = (std::atomic<Counter> &)(x);
        return a--;
#endif
      }

      __host__ __device__ __forceinline__ void count(int32_t b) {
        assert(b < nOnes());
        atomicIncrement(off[b]);
      }

      __host__ __device__ __forceinline__ void fill(int32_t b, index_type j) {
        assert(b < nOnes());
        auto w = atomicDecrement(off[b]);
        assert(w > 0);
        content[w - 1] = j;
      }

      __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) {
        auto c = apc.add(n);
        if (int(c.m) >= nOnes())
          return -int32_t(c.m);
        off[c.m] = c.n;
        for (uint32_t j = 0; j < n; ++j)
          content[c.n + j] = v[j];
        return c.m;
      }

      __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) {
        off[apc.get().m] = apc.get().n;
      }

      __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) {
        int m = apc.get().m;
        auto n = apc.get().n;
        if (m >= nOnes()) {  // overflow!
          off[nOnes()] = uint32_t(off[nOnes() - 1]);
          return;
        }
        auto first = m + blockDim.x * blockIdx.x + threadIdx.x;
        for (int i = first; i < totOnes(); i += gridDim.x * blockDim.x) {
          off[i] = n;
        }
      }

      __host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) {
        assert(off[totOnes() - 1] == 0);
        blockPrefixScan(off.data(), totOnes(), ws);
        assert(off[totOnes() - 1] == off[totOnes() - 2]);
      }

      constexpr auto size() const { return uint32_t(off[totOnes() - 1]); }
      constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; }

      constexpr index_type const *begin() const { return content.data(); }
      constexpr index_type const *end() const { return begin() + size(); }

      constexpr index_type const *begin(uint32_t b) const { return content.data() + off[b]; }
      constexpr index_type const *end(uint32_t b) const { return content.data() + off[b + 1]; }

      FlexiStorage<Counter, ONES> off;
      int32_t psws;  // prefix-scan working space
      FlexiStorage<index_type, SIZE> content;
    };

  }  // namespace cuda
}  // namespace cms

#endif  // HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h
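
Below are three hedged usage sketches, not part of the original file, illustrating how the container is meant to be driven. The first shows the basic two-pass pattern (count, prefix scan, fill) with compile-time sizes on the host-only cudaCompat path; the function and variable names (buildAssoc, bucketOf) are hypothetical, and the CMSSW build environment is assumed.

#include <cstdint>
#include <vector>

#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h"

// 100 usable "ones" (ONES counts one extra slot holding the total), up to 500 stored indices
using Assoc = cms::cuda::OneToManyAssoc<uint16_t, 101, 500>;

void buildAssoc(std::vector<uint16_t> const &bucketOf) {  // every bucketOf[j] must be < 100
  Assoc assoc;
  cms::cuda::launchZero(&assoc, cudaStreamDefault);  // host path: initStorage + zero when not compiled by nvcc

  // pass 1: count how many elements fall into each "one"
  for (uint32_t j = 0; j < bucketOf.size(); ++j)
    assoc.count(bucketOf[j]);

  // turn the per-"one" counts into cumulative offsets (prefix scan over off[])
  cms::cuda::launchFinalize(&assoc, cudaStreamDefault);

  // pass 2: store every element index into its "one"
  for (uint32_t j = 0; j < bucketOf.size(); ++j)
    assoc.fill(bucketOf[j], Assoc::index_type(j));

  // assoc.size(b), assoc.begin(b) and assoc.end(b) now describe the indices associated to b
}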
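
The second sketch shows the runtime-sized variant (ONES = -1, SIZE = -1): the offsets and the content live in externally provided buffers that are bound through a OneToManyAssocView, which launchZero and launchFinalize then initialize and finalize. Buffer and function names are hypothetical.

#include <cstdint>
#include <vector>

#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h"

using DynAssoc = cms::cuda::OneToManyAssoc<uint32_t, -1, -1>;  // offsets and content sized at runtime

void buildDynAssoc(uint32_t numOnes, uint32_t maxElements) {
  DynAssoc assoc;
  std::vector<DynAssoc::Counter> offBuffer(numOnes + 1);        // "ones" + 1 offsets
  std::vector<DynAssoc::index_type> contentBuffer(maxElements);

  DynAssoc::View view{&assoc,
                      offBuffer.data(),
                      contentBuffer.data(),
                      int32_t(offBuffer.size()),
                      int32_t(contentBuffer.size())};
  cms::cuda::launchZero(view, cudaStreamDefault);  // binds the external buffers and zeroes the offsets

  // ... count() / fill() exactly as in the fixed-size case ...

  cms::cuda::launchFinalize(view, cudaStreamDefault);
}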
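
The third sketch shows the bulk mode: each bulkFill call reserves the next "one" and a contiguous block of content through an AtomicPairCounter, and bulkFinalizeFill (or the finalizeBulk kernel on the device) writes the closing offsets. It assumes AtomicPairCounter can be constructed from 0, as in the CMSSW unit tests; input names are hypothetical.

#include <cstdint>
#include <vector>

#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h"

using BulkAssoc = cms::cuda::OneToManyAssoc<uint16_t, 101, 500>;

void buildBulk(std::vector<std::vector<uint16_t>> const &groups) {
  BulkAssoc assoc;
  cms::cuda::launchZero(&assoc, cudaStreamDefault);

  cms::cuda::AtomicPairCounter apc(0);
  for (auto const &g : groups) {
    // appends all indices of g as a new "one"; a negative return value
    // signals that the number of "ones" overflowed
    auto b = assoc.bulkFill(apc, g.data(), uint32_t(g.size()));
    if (b < 0)
      break;
  }
  // write the end offset of the last filled "one" (and of everything beyond it)
  assoc.bulkFinalizeFill(apc);
}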