
prefixScan.h
#ifndef HeterogeneousCore_AlpakaInterface_interface_prefixScan_h
#define HeterogeneousCore_AlpakaInterface_interface_prefixScan_h

#include <alpaka/alpaka.hpp>

#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"

namespace cms::alpakatools {
  template <typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
  constexpr bool isPowerOf2(T v) {
    // returns true iff v has only one bit set.
    while (v) {
      if (v & 1)
        return !(v >> 1);
      else
        v >>= 1;
    }
    return false;
  }

  template <typename TAcc, typename T, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void warpPrefixScan(
      const TAcc& acc, int32_t laneId, T const* ci, T* co, uint32_t i, bool active = true) {
    // ci and co may be the same
    T x = active ? ci[i] : 0;
    CMS_UNROLL_LOOP
    for (int32_t offset = 1; offset < alpaka::warp::getSize(acc); offset <<= 1) {
      // Force the exact type for integer types, otherwise the compiler will find the template resolution ambiguous.
      using dataType = std::conditional_t<std::is_floating_point_v<T>, T, std::int32_t>;
      T y = alpaka::warp::shfl(acc, static_cast<dataType>(x), laneId - offset);
      if (laneId >= offset)
        x += y;
    }
    if (active)
      co[i] = x;
  }

  template <typename TAcc, typename T, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void warpPrefixScan(
      const TAcc& acc, int32_t laneId, T* c, uint32_t i, bool active = true) {
    warpPrefixScan(acc, laneId, c, c, i, active);
  }

  // limited to warpSize² elements
  template <typename TAcc, typename T>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void blockPrefixScan(
      const TAcc& acc, T const* ci, T* co, int32_t size, T* ws = nullptr) {
    if constexpr (!requires_single_thread_per_block_v<TAcc>) {
      const auto warpSize = alpaka::warp::getSize(acc);
      int32_t const blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
      int32_t const blockThreadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
      ALPAKA_ASSERT_ACC(ws);
      ALPAKA_ASSERT_ACC(size <= warpSize * warpSize);
      ALPAKA_ASSERT_ACC(0 == blockDimension % warpSize);
      auto first = blockThreadIdx;
      ALPAKA_ASSERT_ACC(isPowerOf2(warpSize));
      auto laneId = blockThreadIdx & (warpSize - 1);
      auto warpUpRoundedSize = (size + warpSize - 1) / warpSize * warpSize;

      for (auto i = first; i < warpUpRoundedSize; i += blockDimension) {
        // When padding the warp, warpPrefixScan is a no-op
        warpPrefixScan(acc, laneId, ci, co, i, i < size);
        if (i < size) {
          // Skipped in warp padding threads.
          auto warpId = i / warpSize;
          ALPAKA_ASSERT_ACC(warpId < warpSize);
          if ((warpSize - 1) == laneId)
            ws[warpId] = co[i];
        }
      }
      alpaka::syncBlockThreads(acc);
      if (size <= warpSize)
        return;
      if (blockThreadIdx < warpSize) {
        warpPrefixScan(acc, laneId, ws, blockThreadIdx);
      }
      alpaka::syncBlockThreads(acc);
      for (auto i = first + warpSize; i < size; i += blockDimension) {
        int32_t warpId = i / warpSize;
        co[i] += ws[warpId - 1];
      }
      alpaka::syncBlockThreads(acc);
    } else {
      co[0] = ci[0];
      for (int32_t i = 1; i < size; ++i)
        co[i] = ci[i] + co[i - 1];
    }
  }

  template <typename TAcc, typename T>
  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void blockPrefixScan(const TAcc& acc,
                                                           T* __restrict__ c,
                                                           int32_t size,
                                                           T* __restrict__ ws = nullptr) {
    if constexpr (!requires_single_thread_per_block_v<TAcc>) {
      const auto warpSize = alpaka::warp::getSize(acc);
      int32_t const blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
      int32_t const blockThreadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
      ALPAKA_ASSERT_ACC(ws);
      ALPAKA_ASSERT_ACC(size <= warpSize * warpSize);
      ALPAKA_ASSERT_ACC(0 == blockDimension % warpSize);
      auto first = blockThreadIdx;
      auto laneId = blockThreadIdx & (warpSize - 1);
      auto warpUpRoundedSize = (size + warpSize - 1) / warpSize * warpSize;

      for (auto i = first; i < warpUpRoundedSize; i += blockDimension) {
        // When padding the warp, warpPrefixScan is a no-op
        warpPrefixScan(acc, laneId, c, i, i < size);
        if (i < size) {
          // Skipped in warp padding threads.
          auto warpId = i / warpSize;
          ALPAKA_ASSERT_ACC(warpId < warpSize);
          if ((warpSize - 1) == laneId)
            ws[warpId] = c[i];
        }
      }
      alpaka::syncBlockThreads(acc);
      if (size <= warpSize)
        return;
      if (blockThreadIdx < warpSize) {
        warpPrefixScan(acc, laneId, ws, blockThreadIdx);
      }
      alpaka::syncBlockThreads(acc);
      for (auto i = first + warpSize; i < size; i += blockDimension) {
        auto warpId = i / warpSize;
        c[i] += ws[warpId - 1];
      }
      alpaka::syncBlockThreads(acc);
    } else {
      for (int32_t i = 1; i < size; ++i)
        c[i] += c[i - 1];
    }
  }

  // in principle not limited....
  template <typename T>
  struct multiBlockPrefixScan {
    template <typename TAcc>
    ALPAKA_FN_ACC void operator()(
        const TAcc& acc, T const* ci, T* co, uint32_t size, int32_t numBlocks, int32_t* pc, std::size_t warpSize) const {
      // Get the shared memory workspace; it is needed only for multi-threaded accelerators.
      T* ws = nullptr;
      if constexpr (!requires_single_thread_per_block_v<TAcc>) {
        ws = alpaka::getDynSharedMem<T>(acc);
      }
      ALPAKA_ASSERT_ACC(warpSize == static_cast<std::size_t>(alpaka::warp::getSize(acc)));
      [[maybe_unused]] const auto elementsPerGrid = alpaka::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u];
      const auto elementsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
      const auto threadsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
      const auto blocksPerGrid = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u];
      const auto blockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
      const auto threadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];
      ALPAKA_ASSERT_ACC(elementsPerGrid >= size);
      // first each block does a scan
      [[maybe_unused]] int off = elementsPerBlock * blockIdx;
      if (size - off > 0) {
        blockPrefixScan(acc, ci + off, co + off, std::min(elementsPerBlock, size - off), ws);
      }

      // count the blocks that have finished
      auto& isLastBlockDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
      //__shared__ bool isLastBlockDone;
      if (0 == threadIdx) {
        alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
        auto value = alpaka::atomicAdd(acc, pc, 1, alpaka::hierarchy::Blocks{});  // block counter
        isLastBlockDone = (value == (int(blocksPerGrid) - 1));
      }

      alpaka::syncBlockThreads(acc);

      if (!isLastBlockDone)
        return;

      ALPAKA_ASSERT_ACC(int(blocksPerGrid) == *pc);

      // good: every block has done its work, and we are left in the last block

      // let's get the partial sums from each block except the last, which receives 0.
      T* psum = nullptr;
      if constexpr (!requires_single_thread_per_block_v<TAcc>) {
        psum = ws + warpSize;
      } else {
        psum = alpaka::getDynSharedMem<T>(acc);
      }
      for (int32_t i = threadIdx, ni = blocksPerGrid; i < ni; i += threadsPerBlock) {
        auto j = elementsPerBlock * i + elementsPerBlock - 1;
        psum[i] = (j < size) ? co[j] : T(0);
      }
      alpaka::syncBlockThreads(acc);
      blockPrefixScan(acc, psum, psum, blocksPerGrid, ws);

      // now it would have been handy to have the other blocks around...
      // Simplify the computation by having one version where the number of threads per block
      // equals the number of elements per block, and a second for the one-thread-per-block accelerator.
      if constexpr (!requires_single_thread_per_block_v<TAcc>) {
        // Here threadsPerBlock == elementsPerBlock
        for (uint32_t i = threadIdx + threadsPerBlock, k = 0; i < size; i += threadsPerBlock, ++k) {
          co[i] += psum[k];
        }
      } else {
        // We are single threaded here, adding the partial sums starting with the 2nd block.
        for (uint32_t i = elementsPerBlock; i < size; i++) {
          co[i] += psum[i / elementsPerBlock - 1];
        }
      }
    }
  };

}  // namespace cms::alpakatools

// declare the amount of block shared memory used by the multiBlockPrefixScan kernel
namespace alpaka::trait {
  // Variable size shared mem
  template <typename TAcc, typename T>
  struct BlockSharedMemDynSizeBytes<cms::alpakatools::multiBlockPrefixScan<T>, TAcc> {
    template <typename TVec>
    ALPAKA_FN_HOST_ACC static std::size_t getBlockSharedMemDynSizeBytes(
        cms::alpakatools::multiBlockPrefixScan<T> const& /* kernel */,
        TVec const& /* blockThreadExtent */,
        TVec const& /* threadElemExtent */,
        T const* /* ci */,
        T const* /* co */,
        int32_t /* size */,
        int32_t numBlocks,
        int32_t const* /* pc */,
        // This trait function does not receive the accelerator object, so it cannot look up the warp size itself
        std::size_t warpSize) {
      // We need the workspace (T[warpSize]) plus the partial sums (T[numBlocks]).
      if constexpr (cms::alpakatools::requires_single_thread_per_block_v<TAcc>) {
        return sizeof(T) * numBlocks;
      } else {
        return sizeof(T) * (warpSize + numBlocks);
      }
    }
  };

}  // namespace alpaka::trait

#endif  // HeterogeneousCore_AlpakaInterface_interface_prefixScan_h
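
For orientation, the composition that multiBlockPrefixScan performs across blocks can be written as a short, single-threaded C++ sketch: scan each chunk independently, scan the per-chunk totals, then add the total of all preceding chunks to every element of a chunk. The function multiBlockScanReference and its signature are illustrative assumptions for this sketch, not part of the header above.

// Minimal host-side sketch of the three-step multi-block scan composition (illustration only).
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

template <typename T>
std::vector<T> multiBlockScanReference(std::vector<T> const& ci, int32_t elementsPerBlock) {
  assert(elementsPerBlock > 0);
  int32_t size = static_cast<int32_t>(ci.size());
  int32_t numBlocks = (size + elementsPerBlock - 1) / elementsPerBlock;
  std::vector<T> co(size);

  // Step 1: an independent inclusive scan inside each "block" (blockPrefixScan per block).
  for (int32_t b = 0; b < numBlocks; ++b) {
    int32_t off = b * elementsPerBlock;
    int32_t end = std::min(off + elementsPerBlock, size);
    co[off] = ci[off];
    for (int32_t i = off + 1; i < end; ++i)
      co[i] = ci[i] + co[i - 1];
  }

  // Step 2: inclusive scan of the per-block totals (the psum[] buffer filled by the last block).
  std::vector<T> psum(numBlocks);
  for (int32_t b = 0; b < numBlocks; ++b) {
    int32_t j = b * elementsPerBlock + elementsPerBlock - 1;
    psum[b] = (j < size) ? co[j] : T(0);
  }
  for (int32_t b = 1; b < numBlocks; ++b)
    psum[b] += psum[b - 1];

  // Step 3: offset every block except the first by the summed totals of the preceding blocks.
  for (int32_t i = elementsPerBlock; i < size; ++i)
    co[i] += psum[i / elementsPerBlock - 1];

  return co;
}

For example, scanning {1, 2, 3, 4, 5} with elementsPerBlock = 2 gives {1, 3, 6, 10, 15}, identical to a single inclusive scan of the whole array.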