1 #ifndef HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h
2 #define HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h
7 #endif // __CUDA_ARCH__
10 #include <type_traits>
22 template <
typename Assoc>
35 template <
typename Assoc>
53 template <
typename Assoc>
60 typename Assoc::View view = {
h,
nullptr,
nullptr, -1, -1};
63 template <
typename Assoc>
64 inline __attribute__((always_inline)) void launchZero(OneToManyAssocView<Assoc> view,
71 if constexpr (Assoc::ctCapacity() < 0) {
72 assert(view.contentStorage);
73 assert(view.contentSize > 0);
75 if constexpr (Assoc::ctNOnes() < 0) {
82 zeroAndInit<<<nblocks, nthreads, 0, stream>>>(view);
93 template <
typename Assoc>
94 inline __attribute__((always_inline)) void launchFinalize(Assoc *h,
100 typename Assoc::View view = {
h,
nullptr,
nullptr, -1, -1};
101 launchFinalize(view,
stream);
104 template <
typename Assoc>
105 inline __attribute__((always_inline)) void launchFinalize(OneToManyAssocView<Assoc> view,
116 auto nOnes = Assoc::ctNOnes();
117 if constexpr (Assoc::ctNOnes() < 0) {
120 nOnes = view.offSize;
121 poff = view.offStorage;
124 int32_t *ppsws = (int32_t *)((
char *)(
h) + offsetof(Assoc, psws));
127 multiBlockPrefixScan<<<nblocks, nthreads, sizeof(int32_t) * nblocks, stream>>>(poff, poff, nOnes, ppsws);
134 template <
typename Assoc>
135 __global__ void finalizeBulk(AtomicPairCounter
const *
apc, Assoc *__restrict__ assoc) {
136 assoc->bulkFinalizeFill(*apc);
139 template <
typename I,
152 static constexpr uint32_t
ilog2(uint32_t
v) {
153 constexpr uint32_t
b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
154 constexpr uint32_t
s[] = {1, 2, 4, 8, 16};
157 for (
auto i = 4;
i >= 0;
i--)
196 auto &
a = (std::atomic<Counter> &)(
off[
i]);
206 auto &
a = (std::atomic<Counter> &)(x);
215 auto &
a = (std::atomic<Counter> &)(x);
222 atomicIncrement(
off[b]);
227 auto w = atomicDecrement(
off[b]);
234 if (
int(
c.m) >= nOnes())
235 return -int32_t(
c.m);
237 for (uint32_t
j = 0;
j <
n; ++
j)
248 auto n = apc.
get().
n;
265 constexpr
auto size()
const {
return uint32_t(
off[
totOnes() - 1]); }
266 constexpr
auto size(uint32_t
b)
const {
return off[b + 1] -
off[
b]; }
282 #endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h
cudaStream_t int32_t ONES
FlexiStorage< index_type, SIZE > content
const edm::EventSetup & c
T1 atomicSub(T1 *a, T2 b)
index_type * contentStorage
uint32_t T const *__restrict__ uint32_t const *__restrict__ int32_t int Histo::index_type cudaStream_t stream
constexpr auto nOnes() const
__host__ __device__ VT * co
std::function< unsigned int(align::ID)> Counter
__host__ __device__ index_type const * v
__host__ __device__ ONES off
static constexpr uint32_t ilog2(uint32_t v)
__attribute__((always_inline)) void countFromVector(Histo *__restrict__ h
__host__ __device__ index_type j
const std::complex< double > I
constexpr auto capacity() const
uint32_t T const *__restrict__ uint32_t const *__restrict__ int32_t int nthreads
__host__ __device__ void initStorage(View view)
typename Assoc::Counter Counter
static constexpr int32_t ctCapacity()
void add(std::map< std::string, TH1 * > &h, TH1 *hist)
caConstants::TupleMultiplicity const CAHitNtupletGeneratorKernelsGPU::HitToTuple const cms::cuda::AtomicPairCounter * apc
void fill(std::map< std::string, TH1 * > &h, const std::string &s, double x)
__host__ __device__ VT uint32_t size
static constexpr int32_t ctNOnes()
__host__ __device__ void zero()
__device__ __host__ Counters get() const
constexpr auto totOnes() const
typename Assoc::index_type index_type
#define cudaCheck(ARG,...)
__host__ __device__ index_type const uint32_t n
The Signals That Services Can Subscribe To This is based on ActivityRegistry h
Helper function to determine trigger accepts.
T1 atomicAdd(T1 *a, T2 b)