d2/dd4/OneToManyAssoc_8h_source.html

 #ifndef HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h

 #define HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h


 #include <algorithm>

 #ifndef __CUDA_ARCH__

 #include <atomic>

 #endif  // __CUDA_ARCH__

 #include <cstddef>

 #include <cstdint>

 #include <type_traits>


 #include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"

 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

 #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"

 #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"

 #include "HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h"


 namespace cms {

   namespace cuda {


     // Plain aggregate handed to the launch helpers and to Assoc::initStorage().
     // It names the association object together with optional externally owned
     // buffers for its offsets and its content.
     //
     // The member order is part of the interface: launchZero() and launchFinalize()
     // brace-initialise a view positionally as {assoc, offStorage, contentStorage,
     // offSize, contentSize}.
     template <typename Assoc>

     struct OneToManyAssocView {

       using Counter = typename Assoc::Counter;

       using index_type = typename Assoc::index_type;


       // The association this view refers to; initStorage() asserts it equals `this`.
       Assoc *assoc = nullptr;

       // External buffer for the offsets; only read when Assoc::ctNOnes() < 0.
       Counter *offStorage = nullptr;

       // External buffer for the content; only read when Assoc::ctCapacity() < 0.
       index_type *contentStorage = nullptr;

       // Number of entries in offStorage; -1 means "not provided".
       int32_t offSize = -1;

       // Number of entries in contentStorage; -1 means "not provided".
       int32_t contentSize = -1;

     };


     // Kernel that resets an association in place.
     //
     // Thread 0 clears the prefix-scan workspace (psws) and binds whatever external
     // storage `view` carries; after a block-wide barrier every thread helps to zero
     // the offset counters, striding by blockDim.x.
     //
     // Launch requirement: this MUST run in a single block (asserted below), since
     // the only barrier between the initialisation and the zeroing loop is
     // __syncthreads(). The alternative would be to split the work in two kernels.
     template <typename Assoc>
     __global__ void zeroAndInit(OneToManyAssocView<Assoc> view) {
       assert(1 == gridDim.x);
       assert(0 == blockIdx.x);

       auto assoc = view.assoc;
       int const tid = threadIdx.x;

       // One-off initialisation, done by a single thread.
       if (tid == 0) {
         assoc->psws = 0;
         assoc->initStorage(view);
       }
       // Everyone waits until the storage has been bound before touching `off`.
       __syncthreads();

       int const nCounters = assoc->totOnes();
       int idx = tid;
       while (idx < nCounters) {
         assoc->off[idx] = 0;
         idx += blockDim.x;
       }
     }


     // Convenience overload for associations that need no external storage: wraps
     // the bare pointer in a view with null buffers and sizes of -1, then forwards
     // to the view-based launchZero overload.
     //
     // The default stream argument is only available when not compiling with a
     // CUDA compiler (__CUDACC__ undefined).
     template <typename Assoc>
     inline __attribute__((always_inline)) void launchZero(Assoc *h,
                                                           cudaStream_t stream
 #ifndef __CUDACC__
                                                           = cudaStreamDefault
 #endif
     ) {
       using View = typename Assoc::View;
       View noExternalStorage{h, nullptr, nullptr, -1, -1};
       launchZero(noExternalStorage, stream);
     }

     // Resets the association referenced by `view`: binds any external storage the
     // view carries, zeroes all offset counters and clears the prefix-scan
     // workspace (psws).
     //
     // view   - association pointer plus optional external buffers. The content
     //          buffer is required when Assoc::ctCapacity() < 0 and the offsets
     //          buffer when Assoc::ctNOnes() < 0 (both asserted).
     // stream - CUDA stream the zeroAndInit kernel is enqueued on; ignored by the
     //          host-only path. It has a default only when __CUDACC__ is undefined.
     //
     // When compiled with a CUDA compiler the work is done by the zeroAndInit
     // kernel and any launch error is reported through cudaCheck; otherwise the
     // same steps run synchronously on the host.
     template <typename Assoc>
     inline __attribute__((always_inline)) void launchZero(OneToManyAssocView<Assoc> view,
                                                           cudaStream_t stream
 #ifndef __CUDACC__
                                                           = cudaStreamDefault
 #endif
     ) {
       // A null association is a caller error on both paths: the kernel would
       // dereference it on the device just as the host branch does here, so check
       // it up front (launchFinalize does the same).
       assert(view.assoc);

       if constexpr (Assoc::ctCapacity() < 0) {
         assert(view.contentStorage);
         assert(view.contentSize > 0);
       }
       if constexpr (Assoc::ctNOnes() < 0) {
         assert(view.offStorage);
         assert(view.offSize > 0);
       }
 #ifdef __CUDACC__
       auto nthreads = 1024;
       auto nblocks = 1;  // MUST BE ONE as memory is initialized in thread 0 (alternative is two kernels);
       zeroAndInit<<<nblocks, nthreads, 0, stream>>>(view);
       cudaCheck(cudaGetLastError());
 #else
       auto h = view.assoc;
       h->initStorage(view);
       h->zero();
       h->psws = 0;
 #endif
     }


     // Convenience overload for associations that need no external storage: wraps
     // the bare pointer in a view with null buffers and sizes of -1, then forwards
     // to the view-based launchFinalize overload.
     //
     // The default stream argument is only available when not compiling with a
     // CUDA compiler (__CUDACC__ undefined).
     template <typename Assoc>
     inline __attribute__((always_inline)) void launchFinalize(Assoc *h,
                                                               cudaStream_t stream
 #ifndef __CUDACC__
                                                               = cudaStreamDefault
 #endif
     ) {
       using View = typename Assoc::View;
       View noExternalStorage{h, nullptr, nullptr, -1, -1};
       launchFinalize(noExternalStorage, stream);
     }


     // Runs the prefix scan over the offset counters of the association in `view`.
     //
     // With a CUDA compiler: launches multiBlockPrefixScan on `stream`, in place
     // over the offsets, using 1024 threads per block, enough blocks to cover all
     // counters and one int32_t of dynamic shared memory per block; launch errors
     // are reported through cudaCheck. The counters come from the view's external
     // buffer when Assoc::ctNOnes() < 0, otherwise from the `off` member.
     //
     // Without a CUDA compiler: simply calls h->finalize() on the host.
     template <typename Assoc>
     inline __attribute__((always_inline)) void launchFinalize(OneToManyAssocView<Assoc> view,
                                                               cudaStream_t stream
 #ifndef __CUDACC__
                                                               = cudaStreamDefault
 #endif
     ) {
       auto h = view.assoc;
       assert(h);
 #ifdef __CUDACC__
       using Counter = typename Assoc::Counter;

       // Pure address arithmetic: the member addresses are derived from the object
       // address and offsetof, without reading through `h`.
       char *const objectBase = (char *)(h);
       Counter *counters = (Counter *)(objectBase + offsetof(Assoc, off));
       int32_t *scanWorkspace = (int32_t *)(objectBase + offsetof(Assoc, psws));

       auto nOnes = Assoc::ctNOnes();
       if constexpr (Assoc::ctNOnes() < 0) {
         // Run-time sized association: scan the externally supplied buffer instead.
         assert(view.offStorage);
         assert(view.offSize > 0);
         counters = view.offStorage;
         nOnes = view.offSize;
       }
       assert(nOnes > 0);

       auto threadsPerBlock = 1024;
       auto blockCount = (nOnes + threadsPerBlock - 1) / threadsPerBlock;  // ceiling division
       multiBlockPrefixScan<<<blockCount, threadsPerBlock, sizeof(int32_t) * blockCount, stream>>>(
           counters, counters, nOnes, scanWorkspace);
       cudaCheck(cudaGetLastError());
 #else
       h->finalize();
 #endif
     }


     // Kernel entry point for the bulk-fill workflow: every launched thread calls
     // assoc->bulkFinalizeFill() with the counter that `apc` points to.
     template <typename Assoc>
     __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) {
       AtomicPairCounter const &counter = *apc;
       assoc->bulkFinalizeFill(counter);
     }


     // One-to-many association stored as a table of offsets (`off`) into a flat
     // array of values (`content`): the values associated with "one" `b` are
     // content[off[b]] .. content[off[b + 1]] (see begin(b) / end(b)).
     //
     // When ONES or SIZE is negative the matching storage is bound at run time
     // through initStorage(); otherwise initStorage() leaves it untouched.
     template <typename I,    // type stored in the container (usually an index in a vector of the input values)

               int32_t ONES,  // number of "Ones" + 1. If negative (-1), storage is bound at runtime from an external buffer

               int32_t SIZE   // max number of elements. If negative (-1), storage is bound at runtime from an external buffer

               >

     class OneToManyAssoc {

     public:

       // View type accepted by initStorage() and by the launch helpers.
       using View = OneToManyAssocView<OneToManyAssoc<I, ONES, SIZE>>;

       // Type of the entries of the offsets table.
       using Counter = uint32_t;


       // Same association with a SIZE of 0; accepted by add().
       using CountersOnly = OneToManyAssoc<I, ONES, 0>;


       using index_type = I;


       // Integer base-2 logarithm computed with five mask-and-shift steps.
       // For v > 0 the result is the position of the highest set bit; for v == 0
       // no mask matches and the result is 0.
       static constexpr uint32_t ilog2(uint32_t v) {

         constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};

         constexpr uint32_t s[] = {1, 2, 4, 8, 16};


         uint32_t r = 0;  // result of log2(v) will go here

         // From the widest mask to the narrowest: if any bit in the upper part is
         // set, shift it down and record the shift amount in the result.
         for (auto i = 4; i >= 0; i--)

           if (v & b[i]) {

             v >>= s[i];

             r |= s[i];

           }

         return r;

       }


       // Compile-time value of ONES (negative when the offsets are external).
       static constexpr int32_t ctNOnes() { return ONES; }

       // Number of entries in the offsets table, as reported by the storage.
       constexpr auto totOnes() const { return off.capacity(); }

       // Number of "ones" (bins): one less than the number of offsets.
       constexpr auto nOnes() const { return totOnes() - 1; }

       // Compile-time value of SIZE (negative when the content is external).
       static constexpr int32_t ctCapacity() { return SIZE; }

       // Number of entries the content storage can hold, as reported by the storage.
       constexpr auto capacity() const { return content.capacity(); }


       // Binds the external buffers carried by `view` to this object.
       // Only the storages whose compile-time size is negative are initialised;
       // for those, a non-null buffer and a positive size are asserted.
       __host__ __device__ void initStorage(View view) {

         assert(view.assoc == this);

         if constexpr (ctCapacity() < 0) {

           assert(view.contentStorage);

           assert(view.contentSize > 0);

           content.init(view.contentStorage, view.contentSize);

         }

         if constexpr (ctNOnes() < 0) {

           assert(view.offStorage);

           assert(view.offSize > 0);

           off.init(view.offStorage, view.offSize);

         }

       }


       // Sets every entry of the offsets table to 0 (sequential loop).
       __host__ __device__ void zero() {

         for (int32_t i = 0; i < totOnes(); ++i) {

           off[i] = 0;

         }

       }


       // Atomically adds each counter of `co` to the counter at the same index here.
       __host__ __device__ __forceinline__ void add(CountersOnly const &co) {

         for (int32_t i = 0; i < totOnes(); ++i) {

 #ifdef __CUDA_ARCH__

           atomicAdd(off.data() + i, co.off[i]);

 #else

           // Host path: treat the plain counter as a std::atomic to get an atomic add.
           auto &a = (std::atomic<Counter> &)(off[i]);

           a += co.off[i];

 #endif

         }

       }


       // Atomically adds 1 to `x` and returns the value it held before the update
       // (host path: post-increment on a std::atomic view of `x`).
       static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) {

 #ifdef __CUDA_ARCH__

         return atomicAdd(&x, 1);

 #else

         auto &a = (std::atomic<Counter> &)(x);

         return a++;

 #endif

       }


       // Atomically subtracts 1 from `x` and returns the value it held before the
       // update (host path: post-decrement on a std::atomic view of `x`).
       static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) {

 #ifdef __CUDA_ARCH__

         return atomicSub(&x, 1);

 #else

         auto &a = (std::atomic<Counter> &)(x);

         return a--;

 #endif

       }


       // Atomically increments the counter of bin `b`; `b` must be below nOnes().
       __host__ __device__ __forceinline__ void count(int32_t b) {

         assert(b < nOnes());

         atomicIncrement(off[b]);

       }


       // Stores `j` in bin `b`: atomically decrements off[b] and writes `j` at
       // content[w - 1], where w is the value of off[b] before the decrement.
       // NOTE(review): this looks like the second pass of a count -> finalize -> fill
       // sequence, with off[b] holding the end offset of the bin after the scan -
       // confirm against callers.
       __host__ __device__ __forceinline__ void fill(int32_t b, index_type j) {

         assert(b < nOnes());

         auto w = atomicDecrement(off[b]);

         assert(w > 0);

         content[w - 1] = j;

       }


       // Appends the `n` values at `v` as one new bin, using `apc` to reserve the
       // bin index (c.m) and the content range starting at c.n.
       // Returns the bin index on success. If the reserved bin index is not below
       // nOnes() nothing is written and the negated index is returned.
       // NOTE(review): the content range [c.n, c.n + n) is not checked against
       // capacity() here - confirm that callers guarantee it fits.
       __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) {

         auto c = apc.add(n);

         if (int(c.m) >= nOnes())

           return -int32_t(c.m);

         off[c.m] = c.n;

         for (uint32_t j = 0; j < n; ++j)

           content[c.n + j] = v[j];

         return c.m;

       }


       // Writes the current `n` of `apc` into the offsets entry at the current `m`
       // of `apc`.
       // NOTE(review): `m` is not bounds-checked here - confirm callers only use
       // this when no overflow occurred.
       __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) {

         off[apc.get().m] = apc.get().n;

       }


       // Completes the offsets table after bulk filling.
       // If `m` from `apc` is not below nOnes() (overflow), the last offset is set
       // equal to the one before it and the function returns. Otherwise every
       // offsets entry from index m up to totOnes() - 1 is set to `n`; the entries
       // are shared among the launched threads using blockIdx/threadIdx, with a
       // step of gridDim.x * blockDim.x.
       __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) {

         int m = apc.get().m;

         auto n = apc.get().n;

         if (m >= nOnes()) {  // overflow!

           off[nOnes()] = uint32_t(off[nOnes() - 1]);

           return;

         }

         auto first = m + blockDim.x * blockIdx.x + threadIdx.x;

         for (int i = first; i < totOnes(); i += gridDim.x * blockDim.x) {

           off[i] = n;

         }

       }


       // Runs blockPrefixScan in place over the whole offsets table, with `ws` as
       // its workspace argument. The last entry must be 0 on entry; on exit the
       // last two entries are asserted to be equal.
       __host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) {

         assert(off[totOnes() - 1] == 0);

         blockPrefixScan(off.data(), totOnes(), ws);

         assert(off[totOnes() - 1] == off[totOnes() - 2]);

       }


       // Total number of stored values: the last entry of the offsets table.
       constexpr auto size() const { return uint32_t(off[totOnes() - 1]); }

       // Number of values in bin `b`: difference between consecutive offsets.
       constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; }


       // Range over all stored values.
       constexpr index_type const *begin() const { return content.data(); }

       constexpr index_type const *end() const { return begin() + size(); }


       // Range over the values of bin `b`.
       constexpr index_type const *begin(uint32_t b) const { return content.data() + off[b]; }

       constexpr index_type const *end(uint32_t b) const { return content.data() + off[b + 1]; }


       // Offsets table; launchFinalize() locates this member with offsetof.
       FlexiStorage<Counter, ONES> off;

       int32_t psws;  // prefix-scan working space

       // Flat array holding the associated values.
       FlexiStorage<index_type, SIZE> content;

     };


   }  // namespace cuda

 }  // namespace cms


 #endif  // HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h

cms::cudacompat::threadIdx
const dim3 threadIdx
Definition: cudaCompat.h:29

cms::cuda::ONES
cudaStream_t int32_t ONES
Definition: OneToManyAssoc.h:57

first
auto first
Definition: CAHitNtupletGeneratorKernelsImpl.h:127

cms::cuda::OneToManyAssoc::content
FlexiStorage< index_type, SIZE > content
Definition: OneToManyAssoc.h:276

c
const edm::EventSetup & c
Definition: SiStripLAProfileBooker.cc:66

__forceinline__
#define __forceinline__
Definition: cudaCompat.h:22

mps_fire.i
i
Definition: mps_fire.py:428

ecalDigis_cff.cuda
tuple cuda
Definition: ecalDigis_cff.py:35

cms::cuda::OneToManyAssocView::assoc
Assoc * assoc
Definition: OneToManyAssoc.h:27

gpuVertexFinder::finalize
hist finalize(hws)

cms::cudacompat::gridDim
const dim3 gridDim
Definition: cudaCompat.h:33

cms::cuda::OneToManyAssocView::offSize
int32_t offSize
Definition: OneToManyAssoc.h:30

__host__
#define __host__
Definition: SiPixelGainForHLTonGPU.h:12

cms::cudacompat::atomicSub
T1 atomicSub(T1 *a, T2 b)
Definition: cudaCompat.h:73

SplitLinear.begin
list begin
Definition: SplitLinear.py:25

__global__
#define __global__
Definition: cudaCompat.h:19

cms::cudacompat::blockDim
const dim3 blockDim
Definition: cudaCompat.h:30

cms::cuda::OneToManyAssocView::offStorage
Counter * offStorage
Definition: OneToManyAssoc.h:28

cms::cuda::OneToManyAssocView::contentStorage
index_type * contentStorage
Definition: OneToManyAssoc.h:29

FlexiStorage.h

cms::cuda::stream
uint32_t T const *__restrict__ uint32_t const *__restrict__ int32_t int Histo::index_type cudaStream_t stream
Definition: HistoContainer.h:51

cms::cuda::OneToManyAssoc::psws
int32_t psws
Definition: OneToManyAssoc.h:275

cms::cuda::assert
assert(be >=bs)

cms::cuda::OneToManyAssoc::nOnes
constexpr auto nOnes() const
Definition: OneToManyAssoc.h:167

cms::cuda::co
__host__ __device__ VT * co
Definition: prefixScan.h:47

gpuVertexFinder::ws
auto &__restrict__ ws
Definition: gpuClusterTracksDBSCAN.h:32

cms::cuda::OneToManyAssoc::w
auto w
Definition: OneToManyAssoc.h:227

cms::cuda::OneToManyAssoc::content
content[w-1]
Definition: OneToManyAssoc.h:229

cudastdAlgorithm.h

align::Counter
std::function< unsigned int(align::ID)> Counter
Definition: AlignableIndexer.h:31

cms::cuda::OneToManyAssoc::v
__host__ __device__ index_type const * v
Definition: OneToManyAssoc.h:232

cms::cuda::OneToManyAssoc::off
__host__ __device__ ONES off
Definition: OneToManyAssoc.h:274

cms::cuda::OneToManyAssoc::ilog2
static constexpr uint32_t ilog2(uint32_t v)
Definition: OneToManyAssoc.h:152

prefixScan.h

cms::cuda::OneToManyAssoc< hindex_type, S+1, 5 *S >::Counter
uint32_t Counter
Definition: OneToManyAssoc.h:146

cms::cuda::__attribute__
__attribute__((always_inline)) void countFromVector(Histo *__restrict__ h

cms::cuda::OneToManyAssoc::m
return c m
Definition: OneToManyAssoc.h:239

cms::cuda::OneToManyAssoc::j
__host__ __device__ index_type j
Definition: OneToManyAssoc.h:225

Exhume::I
const std::complex< double > I
Definition: I.h:8

cms::cuda::OneToManyAssoc::capacity
constexpr auto capacity() const
Definition: OneToManyAssoc.h:169

cms::cuda::OneToManyAssocView
Definition: OneToManyAssoc.h:23

cms::cuda::nthreads
uint32_t T const *__restrict__ uint32_t const *__restrict__ int32_t int nthreads
Definition: HistoContainer.h:43

cms::cudacompat::blockIdx
const dim3 blockIdx
Definition: cudaCompat.h:32

cms::cuda::FlexiStorage
Definition: FlexiStorage.h:10

cms::cuda::OneToManyAssoc::initStorage
__host__ __device__ void initStorage(View view)
Definition: OneToManyAssoc.h:171

submitPVResolutionJobs.count
tuple count
Definition: submitPVResolutionJobs.py:352

nt
int nt
Definition: AMPTWrapper.h:42

cms::cuda::AtomicPairCounter::Counters::m
uint32_t m
Definition: AtomicPairCounter.h:25

StopReason::SIZE

cms::cuda::OneToManyAssocView::Counter
typename Assoc::Counter Counter
Definition: OneToManyAssoc.h:24

cms::cuda::OneToManyAssoc::ctCapacity
static constexpr int32_t ctCapacity()
Definition: OneToManyAssoc.h:168

PVValHelper::add
void add(std::map< std::string, TH1 * > &h, TH1 *hist)
Definition: PVValidationHelpers.cc:12

cms::cuda::OneToManyAssocView::contentSize
int32_t contentSize
Definition: OneToManyAssoc.h:31

apc
caConstants::TupleMultiplicity const CAHitNtupletGeneratorKernelsGPU::HitToTuple const cms::cuda::AtomicPairCounter * apc
Definition: CAHitNtupletGeneratorKernelsImpl.h:43

PVValHelper::fill
void fill(std::map< std::string, TH1 * > &h, const std::string &s, double x)
Definition: PVValidationHelpers.cc:20

cms::cuda::size
__host__ __device__ VT uint32_t size
Definition: prefixScan.h:47

cms::cuda::OneToManyAssoc::ctNOnes
static constexpr int32_t ctNOnes()
Definition: OneToManyAssoc.h:165

cudaCheck.h

cms::cuda::OneToManyAssoc::zero
__host__ __device__ void zero()
Definition: OneToManyAssoc.h:185

alignCSCRings.s
list s
Definition: alignCSCRings.py:92

cms::cuda::AtomicPairCounter
Definition: AtomicPairCounter.h:11

b
double b
Definition: hdecay.h:118

cms::cuda::AtomicPairCounter::Counters::n
uint32_t n
Definition: AtomicPairCounter.h:24

cms::cudacompat::__syncthreads
void __syncthreads()
Definition: cudaCompat.h:108

cms::cuda::OneToManyAssoc::assert
assert(w > 0)

cms::cuda::AtomicPairCounter::get
__device__ __host__ Counters get() const
Definition: AtomicPairCounter.h:35

a
double a
Definition: hdecay.h:119

AtomicPairCounter.h

cms::cuda::OneToManyAssoc::totOnes
constexpr auto totOnes() const
Definition: OneToManyAssoc.h:166

dataset.end
string end
Definition: dataset.py:937

alignCSCRings.r
list r
Definition: alignCSCRings.py:93

cms::cuda::OneToManyAssocView::index_type
typename Assoc::index_type index_type
Definition: OneToManyAssoc.h:25

cms::cuda::OneToManyAssoc
Definition: OneToManyAssoc.h:143

cudaCheck
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:69

cms::cuda::OneToManyAssoc::n
__host__ __device__ index_type const uint32_t n
Definition: OneToManyAssoc.h:232

h
The Signals That Services Can Subscribe To This is based on ActivityRegistry h
Helper function to determine trigger accepts.
Definition: Activities.doc:4

__device__
#define __device__
Definition: SiPixelGainForHLTonGPU.h:15

cms::cuda::OneToManyAssoc::off
off[c.m]
Definition: OneToManyAssoc.h:236

sistrip::View
View
Definition: ConstantsForView.h:26

cms::cudacompat::atomicAdd
T1 atomicAdd(T1 *a, T2 b)
Definition: cudaCompat.h:61

cuda_assert.h

cms::cuda::OneToManyAssoc< hindex_type, S+1, 5 *S >::index_type
I index_type
Definition: OneToManyAssoc.h:150