1 #ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h
2 #define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h
7 #endif // __CUDA_ARCH__
10 #include <type_traits>
21 template <
typename Histo,
typename T>
22 __global__ void countFromVector(Histo *__restrict__
h,
24 T const *__restrict__
v,
37 template <
typename Histo,
typename T>
38 __global__ void fillFromVector(Histo *__restrict__
h,
40 T const *__restrict__
v,
41 uint32_t
const *__restrict__
offsets) {
49 (*h).fill(
v[
i],
i, ih);
53 template <
typename Histo>
54 inline __attribute__((always_inline))
void launchZero(Histo *__restrict__
h,
60 uint32_t *poff = (uint32_t *)((
char *)(
h) + offsetof(Histo, off));
61 int32_t
size = offsetof(Histo,
bins) - offsetof(Histo, off);
62 assert(
size >=
int(
sizeof(uint32_t) * Histo::totbins()));
66 ::memset(poff, 0,
size);
70 template <
typename Histo>
71 inline __attribute__((always_inline))
void launchFinalize(Histo *__restrict__
h,
78 uint32_t *poff = (uint32_t *)((
char *)(
h) + offsetof(Histo, off));
79 int32_t *ppsws = (int32_t *)((
char *)(
h) + offsetof(Histo, psws));
82 multiBlockPrefixScan<<<nblocks,
nthreads,
sizeof(int32_t) * nblocks,
stream>>>(
83 poff, poff, Histo::totbins(), ppsws);
90 template <
typename Histo,
typename T>
91 inline __attribute__((always_inline))
void fillManyFromVector(Histo *__restrict__
h,
93 T const *__restrict__
v,
94 uint32_t
const *__restrict__
offsets,
105 countFromVector<<<nblocks, nthreads, 0, stream>>>(
h,
nh,
v,
offsets);
108 fillFromVector<<<nblocks, nthreads, 0, stream>>>(
h,
nh,
v,
offsets);
117 template <
typename Assoc>
118 __global__ void finalizeBulk(AtomicPairCounter
const *
apc, Assoc *__restrict__
assoc) {
123 template <
typename Hist,
typename V,
typename Func>
135 template <
typename Hist,
typename V,
typename Func>
140 for (
auto pj =
hist.begin(
bs); pj <
hist.end(
be); ++pj) {
145 template <
typename T,
148 uint32_t
S =
sizeof(
T) * 8,
149 typename I = uint32_t,
161 static constexpr uint32_t
ilog2(uint32_t
v) {
162 constexpr uint32_t
b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
163 constexpr uint32_t
s[] = {1, 2, 4, 8, 16};
166 for (
auto i = 4;
i >= 0;
i--)
174 static constexpr uint32_t
sizeT() {
return S; }
176 static constexpr uint32_t
nhists() {
return NHISTS; }
178 static constexpr uint32_t
nbits() {
return ilog2(
NBINS - 1) + 1; }
179 static constexpr uint32_t
capacity() {
return SIZE; }
184 constexpr uint32_t
shift = sizeT() - nbits();
185 constexpr uint32_t mask = (1 << nbits()) - 1;
186 return (
t >>
shift) & mask;
195 for (uint32_t
i = 0;
i < totbins(); ++
i) {
199 auto &
a = (std::atomic<Counter> &)(off[
i]);
209 auto &
a = (std::atomic<Counter> &)(x);
218 auto &
a = (std::atomic<Counter> &)(x);
225 atomicIncrement(off[
b]);
230 auto w = atomicDecrement(off[
b]);
238 return -int32_t(
c.m);
240 for (uint32_t
j = 0;
j <
n; ++
j)
250 auto m =
apc.get().
m;
251 auto n =
apc.get().
n;
265 atomicIncrement(off[
b]);
271 auto w = atomicDecrement(off[
b]);
281 atomicIncrement(off[
b]);
289 auto w = atomicDecrement(off[
b]);
295 assert(off[totbins() - 1] == 0);
296 blockPrefixScan(off, totbins(),
ws);
297 assert(off[totbins() - 1] == off[totbins() - 2]);
300 constexpr
auto size()
const {
return uint32_t(off[totbins() - 1]); }
301 constexpr
auto size(uint32_t
b)
const {
return off[
b + 1] - off[
b]; }
303 constexpr index_type
const *begin()
const {
return bins; }
304 constexpr index_type
const *
end()
const {
return begin() +
size(); }
306 constexpr index_type
const *begin(uint32_t
b)
const {
return bins + off[
b]; }
307 constexpr index_type
const *
end(uint32_t
b)
const {
return bins + off[
b + 1]; }
314 template <
typename I,
323 #endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h