1 #ifndef HeterogeneousCore_CUDAUtilities_interface_prefixScan_h
2 #define HeterogeneousCore_CUDAUtilities_interface_prefixScan_h
18 auto y = __shfl_up_sync(mask,
x,
offset);
31 auto y = __shfl_up_sync(mask,
x,
offset);
44 template <
typename VT,
typename T>
58 auto mask = __ballot_sync(0xffffffff,
first <
size);
61 warpPrefixScan(ci,
co,
i, mask);
77 co[
i] += ws[warpId - 1];
82 for (uint32_t
i = 1;
i <
size; ++
i)
102 auto mask = __ballot_sync(0xffffffff,
first <
size);
105 warpPrefixScan(
c,
i, mask);
107 auto warpId =
i / 32;
120 auto warpId =
i / 32;
121 c[
i] += ws[warpId - 1];
125 for (uint32_t
i = 1;
i <
size; ++
i)
134 asm volatile(
"mov.u32 %0, %dynamic_smem_size;" :
"=r"(
ret));
140 template <
typename T>
141 __global__ void multiBlockPrefixScan(
T const* ici,
T* ico, int32_t
size, int32_t* pc) {
142 volatile T const* ci = ici;
143 volatile T*
co = ico;
164 if (!isLastBlockDone)
178 blockPrefixScan(psum, psum,
gridDim.
x, ws);
188 #endif // HeterogeneousCore_CUDAUtilities_interface_prefixScan_h