if constexpr (!requires_single_thread_per_block_v<TAcc>) {
  // A workspace in dynamic shared memory is needed only on multi-threaded accelerators.
  ws = alpaka::getDynSharedMem<T>(acc);
}
ALPAKA_ASSERT_ACC(warpSize == static_cast<std::size_t>(alpaka::warp::getSize(acc)));
[[maybe_unused]] const auto elementsPerGrid = alpaka::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u];
const auto elementsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
const auto threadsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
const auto blocksPerGrid = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u];
const auto blockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
const auto threadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];
// Phase 1: each block scans its own slice of the input.
[[maybe_unused]] int off = elementsPerBlock * blockIdx;
if (size - off > 0) {
  // Reconstructed: per-block prefix scan of this slice ('ci' is assumed to be the
  // kernel's input array); the last block may cover fewer than elementsPerBlock elements.
  blockPrefixScan(acc, ci + off, co + off, std::min(int(elementsPerBlock), size - off), ws);
}
// Count the blocks that have finished their scan.
auto& isLastBlockDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
if (0 == threadIdx) {
  // Make this block's results visible device-wide before signalling completion.
  alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
  // Reconstructed: 'pc' is assumed to be the kernel's device-wide block counter.
  auto value = alpaka::atomicAdd(acc, pc, 1, alpaka::hierarchy::Blocks{});
  isLastBlockDone = (value == (int(blocksPerGrid) - 1));
}

alpaka::syncBlockThreads(acc);
// Every block except the last one is done; only the last block combines the partial sums.
if (!isLastBlockDone)
  return;
// The per-block partial sums are staged after the warpSize-element workspace on
// multi-threaded accelerators, or at the start of dynamic shared memory otherwise.
if constexpr (!requires_single_thread_per_block_v<TAcc>) {
  psum = ws + warpSize;
} else {
  psum = alpaka::getDynSharedMem<T>(acc);
}
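// Sizing note (inferred from the code above, not stated in the original file):
// the launch must provide dynamic shared memory for at least warpSize + blocksPerGrid
// elements of T on multi-threaded accelerators ('ws' plus one partial sum per block),
// and blocksPerGrid elements of T on single-threaded ones.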
for (int32_t i = threadIdx, ni = blocksPerGrid; i < ni; i += threadsPerBlock) {
  // j indexes the last element of block i's slice, i.e. that block's total after phase 1.
  auto j = elementsPerBlock * i + elementsPerBlock - 1;
  // Reconstructed: blocks that extend past the end of the data contribute zero.
  psum[i] = (j < size) ? co[j] : T(0);
}
alpaka::syncBlockThreads(acc);
// Phase 2 (reconstructed call): scan the per-block totals in place, so that
// psum[b] becomes the cumulative sum of blocks 0..b.
blockPrefixScan(acc, psum, psum, blocksPerGrid, ws);
// Phase 3: add the preceding blocks' cumulative total to every later element.
if constexpr (!requires_single_thread_per_block_v<TAcc>) {
  // Thread-parallel version; element i belongs to block k + 1, whose offset is psum[k]
  // (assuming threadsPerBlock == elementsPerBlock on these accelerators).
  for (uint32_t i = threadIdx + threadsPerBlock, k = 0; i < size; i += threadsPerBlock, ++k) {
    co[i] += psum[k];  // reconstructed body, matching the single-threaded branch below
  }
} else {
  // Single-threaded version: add the partial sums sequentially, starting with the second block.
  for (uint32_t i = elementsPerBlock; i < size; i++) {
    co[i] += psum[i / elementsPerBlock - 1];
  }
}
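// ---------------------------------------------------------------------------
// Illustration (not part of the original file): a minimal, single-threaded
// host-side sketch of the same three-phase algorithm, assuming an inclusive
// scan. The name multiBlockScanSketch and its slice-width parameter are
// hypothetical; the sketch mirrors phase 1 (scan each slice), phase 2 (scan
// the slice totals) and phase 3 (offset the later slices).
#include <algorithm>
#include <cstdint>
#include <vector>

template <typename T>
std::vector<T> multiBlockScanSketch(std::vector<T> const& in, int32_t elementsPerBlock) {
  auto const size = static_cast<int32_t>(in.size());
  auto const numBlocks = (size + elementsPerBlock - 1) / elementsPerBlock;
  std::vector<T> out(in.size());
  // Phase 1: inclusive scan of each slice, as each block does independently.
  for (int32_t b = 0; b < numBlocks; ++b) {
    T sum{};
    for (int32_t i = b * elementsPerBlock; i < std::min((b + 1) * elementsPerBlock, size); ++i) {
      sum += in[i];
      out[i] = sum;
    }
  }
  // Phase 2: inclusive scan of the per-slice totals (the role of psum above).
  std::vector<T> psum(numBlocks);
  for (int32_t b = 0; b < numBlocks; ++b) {
    auto const last = std::min((b + 1) * elementsPerBlock, size) - 1;
    psum[b] = out[last] + (b > 0 ? psum[b - 1] : T{});
  }
  // Phase 3: offset every element after the first slice by the preceding totals.
  for (int32_t i = elementsPerBlock; i < size; ++i) {
    out[i] += psum[i / elementsPerBlock - 1];
  }
  return out;
}
// e.g. multiBlockScanSketch<int>({1, 2, 3, 4, 5}, 2) yields {1, 3, 6, 10, 15}.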