// NOTE(review): this file is a garbled extraction of an Alpaka prefix-scan
// header -- several original source lines are fused onto single lines here and
// many interior lines (per the embedded original line numbers: 1, 2, 4, 10, ...)
// are missing entirely.  Recover the pristine file before making code changes.
//
// Visible below: the include guard, the <alpaka/alpaka.hpp> include, and the
// head of a template constrained (via SFINAE enable_if) to integral types T.
// The definition that this template head belonged to was lost in extraction.
1 #ifndef HeterogeneousCore_AlpakaInterface_interface_prefixScan_h 2 #define HeterogeneousCore_AlpakaInterface_interface_prefixScan_h 4 #include <alpaka/alpaka.hpp> 10 template <
typename T,
// SFINAE: participate in overload resolution only for integral T.
typename = std::enable_if_t<std::is_
integral_v<T>>>
// Fragment of a warp-level prefix scan, out-of-place variant (ci -> co).
// NOTE(review): the extraction dropped the function's return type, qualifier
// macros, and its name (original lines between 24 and 26, 26 and 30, and after
// 31 are missing), so only the signature tail and part of the body survive.
// From the visible code: each lane reads its element ci[i] (or the identity 0
// when 'active' is false) and combines in values shuffled from other lanes via
// alpaka::warp::shfl at decreasing lane offsets -- the classic shuffle-based
// warp scan.  The loop header introducing 'offset' was elided; presumably it
// doubled 'offset' each iteration -- TODO confirm against the original file.
22 template <
typename TAcc,
typename T,
// SFINAE: only instantiate for Alpaka accelerator types.
typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
24 const TAcc& acc, int32_t laneId,
T const* ci,
T*
co, uint32_t
i,
// 'active' lets trailing lanes of a partially-filled warp participate in the
// shuffles (required for warp collectives) while contributing the identity.
bool active =
true) {
26 T x = active ? ci[
i] : 0;
// Integer types are shuffled as int32_t; otherwise the shfl template
// resolution is ambiguous for non-float T.  Floating-point T passes through.
30 using dataType = std::conditional_t<std::is_floating_point_v<T>,
T, std::int32_t>;
31 T y = alpaka::warp::shfl(acc, static_cast<dataType>(
x), laneId -
offset);
// Fragment of a second warp-level scan overload: in-place on a single buffer
// 'c' (compare the previous overload's separate ci/co pointers).
// NOTE(review): the function name, return type and the whole body were elided
// by the extraction (original lines after 41 up to 46 are missing).  Presumably
// it forwards to the out-of-place variant with ci == co == c -- TODO confirm.
39 template <
typename TAcc,
typename T,
// SFINAE: only instantiate for Alpaka accelerator types.
typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
41 const TAcc& acc, int32_t laneId,
T*
c, uint32_t
i,
// See the out-of-place overload: inactive lanes still join warp collectives.
bool active =
true) {
// Fragment of a block-wide prefix scan, out-of-place (ci -> co), with 'ws' as
// a caller-provided workspace holding one partial sum per warp.
// NOTE(review): garbled extraction -- the function name, several statements
// (bodies of the 'if' branches, the per-warp scan calls, the closing braces,
// and the single-threaded else-branch body) are missing.  Do not edit without
// the pristine original.
46 template <
typename TAcc,
typename T>
48 const TAcc& acc,
T const* ci,
T*
co, int32_t size,
T*
ws =
nullptr) {
// Parallel path: accelerators with more than one thread per block.
49 if constexpr (!requires_single_thread_per_block_v<TAcc>) {
50 const auto warpSize = alpaka::warp::getSize(acc);
51 int32_t
const blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
52 int32_t
const blockThreadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
56 auto first = blockThreadIdx;
// Lane index within the warp (warpSize is a power of two).
58 auto laneId = blockThreadIdx & (warpSize - 1);
// Round 'size' up to a multiple of warpSize so every warp iterates the same
// number of times (needed because warp collectives must be warp-uniform).
59 auto warpUpRoundedSize = (size + warpSize - 1) / warpSize * warpSize;
// Block-stride loop over all (rounded-up) elements.
61 for (
auto i =
first;
i < warpUpRoundedSize;
i += blockDimension) {
66 auto warpId =
i / warpSize;
// Last lane of each warp: the elided statement presumably stores the warp's
// total into ws[warpId] -- TODO confirm against the original file.
68 if ((warpSize - 1) == laneId)
72 alpaka::syncBlockThreads(acc);
// First warp only: the elided body presumably scans the per-warp totals in
// 'ws' -- TODO confirm.
75 if (blockThreadIdx < warpSize) {
78 alpaka::syncBlockThreads(acc);
// Add the scanned preceding-warp totals to every element beyond warp 0.
79 for (
auto i =
first + warpSize;
i < size;
i += blockDimension) {
80 int32_t warpId =
i / warpSize;
81 co[
i] +=
ws[warpId - 1];
83 alpaka::syncBlockThreads(acc);
// Serial fallback (single-thread-per-block accelerators): the elided loop
// body presumably accumulates co[i] from the previous element.
86 for (int32_t
i = 1;
i < size; ++
i)
// Fragment of the in-place block-wide prefix scan (single buffer 'c', visible
// at original line 127).  Same structure as the out-of-place variant above,
// plus an extra early case when the whole problem fits in one warp
// (original line 119).
// NOTE(review): garbled extraction -- the signature head (name, 'c' and 'size'
// parameters), the if-branch bodies, and closing braces were elided.
91 template <
typename TAcc,
typename T>
// __restrict__: the caller promises 'ws' does not alias the data buffer,
// enabling better code generation.
95 T* __restrict__
ws =
nullptr) {
// Parallel path: accelerators with more than one thread per block.
96 if constexpr (!requires_single_thread_per_block_v<TAcc>) {
97 const auto warpSize = alpaka::warp::getSize(acc);
98 int32_t
const blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
99 int32_t
const blockThreadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
103 auto first = blockThreadIdx;
// Lane index within the warp (warpSize is a power of two).
104 auto laneId = blockThreadIdx & (warpSize - 1);
// Round up so every warp executes the same number of loop iterations.
105 auto warpUpRoundedSize = (size + warpSize - 1) / warpSize * warpSize;
// Block-stride loop over all (rounded-up) elements.
107 for (
auto i =
first;
i < warpUpRoundedSize;
i += blockDimension) {
112 auto warpId =
i / warpSize;
// Last lane of each warp: elided statement presumably publishes the warp
// total into ws[warpId] -- TODO confirm against the original file.
114 if ((warpSize - 1) == laneId)
118 alpaka::syncBlockThreads(acc);
// Single-warp problem: elided statement presumably returns early, since no
// cross-warp combination step is needed -- TODO confirm.
119 if (size <= warpSize)
// First warp only: elided body presumably scans the per-warp totals in 'ws'.
121 if (blockThreadIdx < warpSize) {
124 alpaka::syncBlockThreads(acc);
// Add scanned preceding-warp totals to elements beyond warp 0, in place.
125 for (
auto i =
first + warpSize;
i < size;
i += blockDimension) {
126 auto warpId =
i / warpSize;
127 c[
i] +=
ws[warpId - 1];
129 alpaka::syncBlockThreads(acc);
// Serial fallback for single-thread-per-block accelerators (body elided).
131 for (int32_t
i = 1;
i < size; ++
i)
// Fragment of the multi-block (device-wide) prefix scan kernel; the trait
// specialization at the end of the file names it
// cms::alpakatools::multiBlockPrefixScan<T>.  Visible structure:
//   1. each block scans its own chunk (that call was elided),
//   2. an atomic counter 'pc' plus a device-scope memory fence elect the LAST
//      block to finish ('isLastBlockDone'),
//   3. that last block scans the per-block partial sums into 'psum' and adds
//      them to the output 'co'.
// NOTE(review): garbled extraction -- the struct/operator() head, the atomic
// increment producing 'value', several loop bodies, and closing braces were
// elided.  Do not edit without the pristine original.
137 template <
typename T>
139 template <
typename TAcc>
141 const TAcc& acc,
T const* ci,
T*
co, uint32_t
size, int32_t numBlocks, int32_t* pc, std::size_t warpSize)
const {
// Parallel accelerators take the warp workspace from dynamic shared memory.
144 if constexpr (!requires_single_thread_per_block_v<TAcc>) {
145 ws = alpaka::getDynSharedMem<T>(acc);
// The caller-supplied warpSize must match the device's actual warp size,
// since it was used to size the dynamic shared memory (see the
// BlockSharedMemDynSizeBytes specialization below).
147 ALPAKA_ASSERT_ACC(warpSize == static_cast<std::size_t>(alpaka::warp::getSize(acc)));
148 [[maybe_unused]]
const auto elementsPerGrid = alpaka::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u];
149 const auto elementsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
150 const auto threadsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
151 const auto blocksPerGrid = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u];
152 const auto blockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
153 const auto threadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];
// Start of this block's chunk within the input.
156 [[maybe_unused]]
int off = elementsPerBlock *
blockIdx;
// Only blocks whose chunk is non-empty do the local scan (body elided).
157 if (
size - off > 0) {
// Block-shared flag: true only in the block that finished last.
162 auto& isLastBlockDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
// Device-scope fence: make this block's partial results visible to the
// last block before it reads them.
165 alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
// 'value' comes from an elided atomic update of the counter 'pc';
// presumably an atomic add returning the previous count -- TODO confirm.
167 isLastBlockDone = (
value == (
int(blocksPerGrid) - 1));
170 alpaka::syncBlockThreads(acc);
// All blocks except the one that finished last are done (return elided).
172 if (!isLastBlockDone)
// Last block only from here on: gather and scan the per-block sums.
// Parallel accelerators place 'psum' after the warp workspace 'ws' in the
// same dynamic shared memory allocation; serial ones use it all for psum.
181 if constexpr (!requires_single_thread_per_block_v<TAcc>) {
182 psum =
ws + warpSize;
184 psum = alpaka::getDynSharedMem<T>(acc);
// Collect each block's last element (index j) as its partial sum
// (the loop body storing into psum was elided).
186 for (int32_t
i =
threadIdx, ni = blocksPerGrid;
i < ni;
i += threadsPerBlock) {
187 auto j = elementsPerBlock *
i + elementsPerBlock - 1;
190 alpaka::syncBlockThreads(acc);
// Apply the scanned per-block sums to the output beyond the first chunk:
// thread-strided on parallel accelerators, plain loop on serial ones.
196 if constexpr (!requires_single_thread_per_block_v<TAcc>) {
198 for (uint32_t
i =
threadIdx + threadsPerBlock,
k = 0;
i <
size;
i += threadsPerBlock, ++
k) {
203 for (uint32_t
i = elementsPerBlock;
i <
size;
i++) {
204 co[
i] += psum[
i / elementsPerBlock - 1];
// Fragment of the Alpaka trait telling the framework how much dynamic shared
// memory the multiBlockPrefixScan<T> kernel needs (see its use of
// getDynSharedMem above): numBlocks elements of T for the per-block partial
// sums, plus warpSize more for the warp workspace on accelerators that run
// multiple threads per block.
// NOTE(review): garbled extraction -- the member function signature (which
// presumably receives 'numBlocks' among its arguments) and closing braces
// were elided.
214 template <
typename TAcc,
typename T>
215 struct BlockSharedMemDynSizeBytes<
cms::alpakatools::multiBlockPrefixScan<T>, TAcc> {
216 template <
typename TVec>
227 std::size_t warpSize) {
// Serial accelerators need no warp workspace, only the psum buffer.
229 if constexpr (cms::alpakatools::requires_single_thread_per_block_v<TAcc>) {
230 return sizeof(
T) * numBlocks;
// Parallel accelerators: warp workspace (warpSize) + psum (numBlocks).
232 return sizeof(
T) * (warpSize + numBlocks);
239 #endif // HeterogeneousCore_AlpakaInterface_interface_prefixScan_h
__host__ __device__ VT * co
std::vector< Block > Blocks
Namespace of DDCMS conversion namespace.
ALPAKA_ASSERT_ACC(offsets)
T1 atomicAdd(T1 *a, T2 b)