1 #ifndef HeterogeneousCore_CUDAUtilities_interface_prefixScan_h
2 #define HeterogeneousCore_CUDAUtilities_interface_prefixScan_h
// Warp shuffle: y receives the value of x held by the lane `offset` positions
// lower in the warp; `mask` names the lanes taking part in the exchange.
// NOTE(review): the enclosing loop and function are not visible in this excerpt.
19 auto y = __shfl_up_sync(mask,
x,
offset);
// Identical shuffle step at a second location in the file
// (its enclosing definition is likewise not visible here).
32 auto y = __shfl_up_sync(mask,
x,
offset);
// Template header with two type parameters (VT, T). The declaration it
// introduces is not visible in this excerpt.
// NOTE(review): presumably the separate-input/output block scan — confirm.
45 template <
typename VT,
typename T>
// Full-warp vote: bit k of `mask` is set when lane k evaluates `first < size`
// as true. `first` is defined outside the visible lines.
59 auto mask = __ballot_sync(0xffffffff,
first <
size);
// Hands input `ci`, output `co`, element index `i` and the lane mask to
// warpPrefixScan (helper body not visible here).
62 warpPrefixScan(ci,
co,
i, mask);
// Adds the `ws` entry of the preceding warp index to output element i.
// NOTE(review): presumably `ws` holds per-warp running totals — confirm.
78 co[
i] +=
ws[warpId - 1];
// Loop over indices 1 .. size-1; the loop body is not visible in this excerpt.
// NOTE(review): looks like a sequential fallback path — confirm.
83 for (uint32_t
i = 1;
i <
size; ++
i)
// Full-warp vote: bit k of `mask` is set when lane k evaluates `first < size`
// as true. `first` is defined outside the visible lines.
103 auto mask = __ballot_sync(0xffffffff,
first <
size);
// Three-argument warpPrefixScan call on the single array `c`
// (helper body not visible; presumably the in-place variant — confirm).
106 warpPrefixScan(
c,
i, mask);
// Index of the 32-wide group that element i falls into.
108 auto warpId =
i / 32;
// Same 32-wide group index, recomputed at a later point in the file.
121 auto warpId =
i / 32;
// Adds the `ws` entry of the preceding group to c[i].
// NOTE(review): the guard that keeps warpId - 1 non-negative is not visible here.
122 c[
i] +=
ws[warpId - 1];
// Loop over indices 1 .. size-1; the loop body is not visible in this excerpt.
126 for (uint32_t
i = 1;
i <
size; ++
i)
// Inline PTX: copies the %dynamic_smem_size special register into `ret`
// (bound as a 32-bit register output operand). `ret` is declared outside the
// visible lines.
135 asm volatile(
"mov.u32 %0, %dynamic_smem_size;" :
"=r"(
ret));
// Kernel templated on the element type T. Only fragments of its body are
// visible in this excerpt.
//   ici  - read-only pointer to T (accessed through the volatile alias `ci`)
//   ico  - writable pointer to T (accessed through the volatile alias `co`)
//   size - 32-bit signed count (presumably the number of elements — confirm)
//   pc   - pointer to int32_t; its use is not visible here
//          (NOTE(review): presumably a counter shared between blocks — confirm)
141 template <
typename T>
142 __global__ void multiBlockPrefixScan(
T const* ici,
T* ico, int32_t
size, int32_t* pc) {
// Volatile aliases of the two pointer arguments: every access through them
// is a real memory access rather than a value the compiler may keep cached.
143 volatile T const* ci = ici;
144 volatile T*
co = ico;
// Flag in shared memory, i.e. one copy visible to all threads of the block.
156 __shared__
bool isLastBlockDone;
// Branch taken when the shared flag is false; its body is not visible here.
165 if (!isLastBlockDone)
// Unsized shared-memory array of T (extern __shared__): its size is not fixed
// in this declaration but comes from the kernel launch.
173 extern __shared__
T psum[];
// Block-level scan with `psum` passed as both the first and second argument,
// over gridDim.x entries. `ws` is declared outside the visible lines.
179 blockPrefixScan(psum, psum,
gridDim.x,
ws);
189 #endif // HeterogeneousCore_CUDAUtilities_interface_prefixScan_h