
workdivision.h
#ifndef HeterogeneousCore_AlpakaInterface_interface_workdivision_h
#define HeterogeneousCore_AlpakaInterface_interface_workdivision_h

#include <algorithm>
#include <cstddef>
#include <type_traits>

#include <alpaka/alpaka.hpp>

#include "HeterogeneousCore/AlpakaInterface/interface/config.h"

namespace cms::alpakatools {

  using namespace alpaka_common;

  // If the first argument is not a multiple of the second argument, round it up to the next multiple
  inline constexpr Idx round_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor * divisor; }

  // Return the integer division of the first argument by the second argument, rounded up to the next integer
  inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor; }
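
  // For example, with a block size of 64 elements and a problem size of 400 elements:
  //
  //   divide_up_by(400, 64)  // == 7, the number of 64-element blocks needed to cover 400 elements
  //   round_up_by(400, 64)   // == 448, the problem size rounded up to a whole number of blocks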

  // Trait describing whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped
  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
  struct requires_single_thread_per_block : public std::true_type {};

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
  template <typename TDim>
  struct requires_single_thread_per_block<alpaka::AccGpuCudaRt<TDim, Idx>> : public std::false_type {};
#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED

#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
  template <typename TDim>
  struct requires_single_thread_per_block<alpaka::AccGpuHipRt<TDim, Idx>> : public std::false_type {};
#endif  // ALPAKA_ACC_GPU_HIP_ENABLED

#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
  template <typename TDim>
  struct requires_single_thread_per_block<alpaka::AccCpuThreads<TDim, Idx>> : public std::false_type {};
#endif  // ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED

  // Whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped
  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
  inline constexpr bool requires_single_thread_per_block_v = requires_single_thread_per_block<TAcc>::value;

  // Create an accelerator-dependent work division for 1-dimensional kernels
  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  inline WorkDiv<Dim1D> make_workdiv(Idx blocks, Idx elements) {
    if constexpr (not requires_single_thread_per_block_v<TAcc>) {
      // On GPU backends, each thread is looking at a single element:
      //   - the number of threads per block is "elements";
      //   - the number of elements per thread is always 1.
      return WorkDiv<Dim1D>(blocks, elements, Idx{1});
    } else {
      // On CPU backends, run serially with a single thread per block:
      //   - the number of threads per block is always 1;
      //   - the number of elements per thread is "elements".
      return WorkDiv<Dim1D>(blocks, Idx{1}, elements);
    }
  }
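
  // A minimal usage sketch: launch a 1-dimensional kernel over `size` elements with 64 threads
  // (or elements) per block. `Acc1D`, `queue` and `ExampleKernel` are hypothetical names, assumed
  // to be defined elsewhere by the framework or user code.
  //
  //   Idx size = 400;
  //   auto workDiv = make_workdiv<Acc1D>(divide_up_by(size, 64), 64);  // 7 blocks of 64
  //   alpaka::exec<Acc1D>(queue, workDiv, ExampleKernel{}, data, size);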

  // Create the accelerator-dependent workdiv for N-dimensional kernels
  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
  inline WorkDiv<alpaka::Dim<TAcc>> make_workdiv(const Vec<alpaka::Dim<TAcc>>& blocks,
                                                 const Vec<alpaka::Dim<TAcc>>& elements) {
    using Dim = alpaka::Dim<TAcc>;
    if constexpr (not requires_single_thread_per_block_v<TAcc>) {
      // On GPU backends, each thread is looking at a single element:
      //   - the number of threads per block is "elements";
      //   - the number of elements per thread is always 1.
      return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
    } else {
      // On CPU backends, run serially with a single thread per block:
      //   - the number of threads per block is always 1;
      //   - the number of elements per thread is "elements".
      return WorkDiv<Dim>(blocks, Vec<Dim>::ones(), elements);
    }
  }
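
  // A minimal usage sketch for the N-dimensional overload: cover a 400 x 300 problem with
  // 16 x 16 groups. `Acc2D` is a hypothetical 2-dimensional accelerator type and `Dim2D` its
  // dimension, both assumed to be defined elsewhere.
  //
  //   Vec<Dim2D> blocks{divide_up_by(400, 16), divide_up_by(300, 16)};  // 25 x 19 blocks
  //   Vec<Dim2D> elements{16, 16};
  //   auto workDiv = make_workdiv<Acc2D>(blocks, elements);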

  /* ElementIndex
   *
   * an aggregate that contains the `.global` and `.local` indices of an element; returned by iterating over the
   * objects returned by `uniform_group_elements` and similar functions.
   */

  struct ElementIndex {
    Idx global;
    Idx local;
  };

  namespace detail {

    /* UniformElementsAlong
     *
     * `UniformElementsAlong<TAcc, Dim>(acc [, first], extent)` returns a one-dimensional iterable range that spans the
     * element indices from `first` (inclusive) to `extent` (exclusive) along the `Dim` dimension.
     * If `first` is not specified, it defaults to 0.
     * If `extent` is not specified, it defaults to the kernel grid size along the `Dim` dimension.
     *
     * `uniform_elements_along<Dim>(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc, Dim>(acc, ...)` that can
     * infer the accelerator type from the argument.
     *
     * In a 1-dimensional kernel, `uniform_elements(acc, ...)` is a shorthand for
     * `UniformElementsAlong<TAcc, 0>(acc, ...)`.
     *
     * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed
     * by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
     * For convenience when converting CUDA or HIP code, `uniform_elements_x(acc, ...)`, `_y` and `_z` are shorthands
     * for `UniformElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
     *
     * To cover the problem space, different threads may execute a different number of iterations. As a result, it is
     * not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop.
     * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
     * loop over each group's elements, and synchronise only in the outer loop:
     *
     *   for (auto group : uniform_groups_along<Dim>(acc, extent)) {
     *     for (auto element : uniform_group_elements_along<Dim>(acc, group, extent)) {
     *       // first part of the computation
     *       // no synchronisations here
     *       ...
     *     }
     *     // wait for all threads to complete the first part
     *     alpaka::syncBlockThreads();
     *     for (auto element : uniform_group_elements_along<Dim>(acc, group, extent)) {
     *       // second part of the computation
     *       // no synchronisations here
     *       ...
     *     }
     *     // wait for all threads to complete the second part
     *     alpaka::syncBlockThreads();
     *     ...
     *   }
     *
     * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a
     * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example,
     * the kernel may hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the
     * element index explicitly inside the loop:
     *
     *   for (auto element : uniform_elements_along<N-1>(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
     *     bool flag = false;
     *     if (element < extent) {
     *       // do some work and compute a result flag only for the valid elements
     *       flag = do_some_work();
     *     }
     *     // check if any valid element had a positive result
     *     if (alpaka::warp::any(acc, flag)) {
     *       // ...
     *     }
     *   }
     *
     * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension, `N-1`.
     */

    template <typename TAcc,
              std::size_t Dim,
              typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
    class UniformElementsAlong {
    public:
      ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc)
          : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
            first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
            extent_{stride_} {}

      ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx extent)
          : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
            first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
            extent_{extent} {}

      ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx first, Idx extent)
          : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
            first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_ + first},
            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
            extent_{extent} {}

      class const_iterator;
      using iterator = const_iterator;

      ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, first_); }

      ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); }

      class const_iterator {
        friend class UniformElementsAlong;

        ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
            : elements_{elements},
              stride_{stride},
              extent_{extent},
              first_{std::min(first, extent)},
              index_{first_},
              range_{std::min(first + elements, extent)} {}

      public:
        ALPAKA_FN_ACC inline Idx operator*() const { return index_; }

        // pre-increment the iterator
        ALPAKA_FN_ACC inline const_iterator& operator++() {
          if constexpr (requires_single_thread_per_block_v<TAcc>) {
            // increment the index along the elements processed by the current thread
            ++index_;
            if (index_ < range_)
              return *this;
          }

          // increment the thread index with the grid stride
          first_ += stride_;
          index_ = first_;
          range_ = std::min(first_ + elements_, extent_);
          if (index_ < extent_)
            return *this;

          // the iterator has reached or passed the end of the extent, clamp it to the extent
          first_ = extent_;
          index_ = extent_;
          range_ = extent_;
          return *this;
        }

        // post-increment the iterator
        ALPAKA_FN_ACC inline const_iterator operator++(int) {
          const_iterator old = *this;
          ++(*this);
          return old;
        }

        ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const {
          return (index_ == other.index_) and (first_ == other.first_);
        }

        ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

      private:
        // non-const to support iterator copy and assignment
        Idx elements_;
        Idx stride_;
        Idx extent_;
        // modified by the pre/post-increment operator
        Idx first_;
        Idx index_;
        Idx range_;
      };

    private:
      const Idx elements_;
      const Idx first_;
      const Idx stride_;
      const Idx extent_;
    };

  }  // namespace detail

  /* uniform_elements
   *
   * `uniform_elements(acc [, first], extent)` returns a one-dimensional iterable range that spans the element indices
   * from `first` (inclusive) to `extent` (exclusive).
   * If `first` is not specified, it defaults to 0.
   * If `extent` is not specified, it defaults to the kernel grid size.
   *
   * `uniform_elements(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, 0>(acc, ...)`.
   *
   * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not
   * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop.
   * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
   * loop over each group's elements, and synchronise only in the outer loop:
   *
   *   for (auto group : uniform_groups(acc, extent)) {
   *     for (auto element : uniform_group_elements(acc, group, extent)) {
   *       // first part of the computation
   *       // no synchronisations here
   *       ...
   *     }
   *     // wait for all threads to complete the first part
   *     alpaka::syncBlockThreads();
   *     for (auto element : uniform_group_elements(acc, group, extent)) {
   *       // second part of the computation
   *       // no synchronisations here
   *       ...
   *     }
   *     // wait for all threads to complete the second part
   *     alpaka::syncBlockThreads();
   *     ...
   *   }
   *
   * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a
   * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the
   * kernel may hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the element
   * index explicitly inside the loop:
   *
   *   for (auto element : uniform_elements(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
   *     bool flag = false;
   *     if (element < extent) {
   *       // do some work and compute a result flag only for elements up to extent
   *       flag = do_some_work();
   *     }
   *     // check if any valid element had a positive result
   *     if (alpaka::warp::any(acc, flag)) {
   *       // ...
   *     }
   *   }
   *
   * Note that `uniform_elements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
   * use
   *   - `uniform_elements_nd(acc, ...)` to cover an N-dimensional problem space with a single loop;
   *   - `uniform_elements_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
   *   - `uniform_elements_x(acc, ...)`, `uniform_elements_y(acc, ...)`, or `uniform_elements_z(acc, ...)` to loop
   *     along the fastest, second-fastest, or third-fastest dimension.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto uniform_elements(TAcc const& acc, TArgs... args) {
    return detail::UniformElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }
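
  // A minimal kernel sketch using `uniform_elements`: a grid-strided element loop that scales a
  // buffer, valid on both CPU and GPU backends. The kernel name and arguments are illustrative,
  // not part of this header.
  //
  //   struct ScaleKernel {
  //     template <typename TAcc>
  //     ALPAKA_FN_ACC void operator()(TAcc const& acc, float* data, float factor, Idx size) const {
  //       for (Idx i : uniform_elements(acc, size)) {
  //         data[i] *= factor;
  //       }
  //     }
  //   };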

  /* uniform_elements_along<Dim>
   *
   * `uniform_elements_along<Dim>(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, Dim>(acc, ...)`
   * that can infer the accelerator type from the argument.
   */

  template <typename TAcc,
            std::size_t Dim,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
  ALPAKA_FN_ACC inline auto uniform_elements_along(TAcc const& acc, TArgs... args) {
    return detail::UniformElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
  }

  /* uniform_elements_x, _y, _z
   *
   * Like `uniform_elements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
   * dimensions.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto uniform_elements_x(TAcc const& acc, TArgs... args) {
    return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
  ALPAKA_FN_ACC inline auto uniform_elements_y(TAcc const& acc, TArgs... args) {
    return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
  ALPAKA_FN_ACC inline auto uniform_elements_z(TAcc const& acc, TArgs... args) {
    return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
  }

  namespace detail {

    /* UniformElementsND
     *
     * `UniformElementsND(acc, extent)` returns an N-dimensional iterable range that spans the element indices
     * required to cover the given problem size, indicated by `extent`.
     *
     * `uniform_elements_nd(acc, ...)` is an alias for `UniformElementsND<TAcc>(acc, ...)`.
     *
     * To cover the problem space, different threads may execute a different number of iterations. As a result, it is
     * not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop.
     * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
     * loop over each group's elements, and synchronise only in the outer loop:
     *
     *   for (auto group0 : uniform_groups_along<0>(acc, extent[0])) {
     *     for (auto group1 : uniform_groups_along<1>(acc, extent[1])) {
     *       for (auto element0 : uniform_group_elements_along<0>(acc, group0, extent[0])) {
     *         for (auto element1 : uniform_group_elements_along<1>(acc, group1, extent[1])) {
     *           // first part of the computation
     *           // no synchronisations here
     *           ...
     *         }
     *       }
     *       // wait for all threads to complete the first part
     *       alpaka::syncBlockThreads();
     *       for (auto element0 : uniform_group_elements_along<0>(acc, group0, extent[0])) {
     *         for (auto element1 : uniform_group_elements_along<1>(acc, group1, extent[1])) {
     *           // second part of the computation
     *           // no synchronisations here
     *           ...
     *         }
     *       }
     *       // wait for all threads to complete the second part
     *       alpaka::syncBlockThreads();
     *       ...
     *     }
     *   }
     *
     * For more details, see `UniformElementsAlong<TAcc, Dim>(acc, ...)`.
     */

    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
    class UniformElementsND {
    public:
      using Dim = alpaka::Dim<TAcc>;
      using Vec = alpaka::Vec<Dim, Idx>;

      ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc)
          : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
            thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
            extent_{stride_} {}

      ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc, Vec extent)
          : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
            thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
            extent_{extent} {}

      // tag used to construct an end iterator
      struct at_end_t {};

      class const_iterator;
      using iterator = const_iterator;

      ALPAKA_FN_ACC inline const_iterator begin() const {
        // check that all dimensions of the current thread index are within the extent
        if ((thread_ < extent_).all()) {
          // construct an iterator pointing to the first element to be processed by the current thread
          return const_iterator{this, thread_};
        } else {
          // construct an end iterator, pointing past the end of the extent
          return const_iterator{this, at_end_t{}};
        }
      }

      ALPAKA_FN_ACC inline const_iterator end() const {
        // construct an end iterator, pointing past the end of the extent
        return const_iterator{this, at_end_t{}};
      }

      class const_iterator {
        friend class UniformElementsND;

      public:
        ALPAKA_FN_ACC inline Vec operator*() const { return index_; }

        // pre-increment the iterator
        ALPAKA_FN_ACC constexpr inline const_iterator operator++() {
          increment();
          return *this;
        }

        // post-increment the iterator
        ALPAKA_FN_ACC constexpr inline const_iterator operator++(int) {
          const_iterator old = *this;
          increment();
          return old;
        }

        ALPAKA_FN_ACC constexpr inline bool operator==(const_iterator const& other) const {
          return (index_ == other.index_);
        }

        ALPAKA_FN_ACC constexpr inline bool operator!=(const_iterator const& other) const {
          return not(*this == other);
        }

      private:
        // construct an iterator pointing to the first element to be processed by the current thread
        ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, Vec first)
            : loop_{loop},
              first_{alpaka::elementwise_min(first, loop->extent_)},
              range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)},
              index_{first_} {}

        // construct an end iterator, pointing past the end of the extent
        ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, at_end_t const&)
            : loop_{loop}, first_{loop_->extent_}, range_{loop_->extent_}, index_{loop_->extent_} {}

        template <size_t I>
        ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() {
          bool overflow = false;
          ++index_[I];
          if (index_[I] >= range_[I]) {
            index_[I] = first_[I];
            overflow = true;
          }
          return overflow;
        }

        template <size_t N>
        ALPAKA_FN_ACC inline constexpr bool do_elements_loops() {
          if constexpr (N == 0) {
            // overflow
            return true;
          } else {
            if (not nth_elements_loop<N - 1>()) {
              return false;
            } else {
              return do_elements_loops<N - 1>();
            }
          }
        }

        template <size_t I>
        ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() {
          bool overflow = false;
          first_[I] += loop_->stride_[I];
          if (first_[I] >= loop_->extent_[I]) {
            first_[I] = loop_->thread_[I];
            overflow = true;
          }
          index_[I] = first_[I];
          range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]);
          return overflow;
        }

        template <size_t N>
        ALPAKA_FN_ACC inline constexpr bool do_strided_loops() {
          if constexpr (N == 0) {
            // overflow
            return true;
          } else {
            if (not nth_strided_loop<N - 1>()) {
              return false;
            } else {
              return do_strided_loops<N - 1>();
            }
          }
        }

        // increment the iterator
        ALPAKA_FN_ACC inline constexpr void increment() {
          if constexpr (requires_single_thread_per_block_v<TAcc>) {
            // linear N-dimensional loops over the elements associated to the thread;
            // do_elements_loops<>() returns true if any of those loops overflows
            if (not do_elements_loops<Dim::value>()) {
              // the elements loops did not overflow, return the next index
              return;
            }
          }

          // strided N-dimensional loop over the threads in the kernel launch grid;
          // do_strided_loops<>() returns true if any of those loops overflows
          if (not do_strided_loops<Dim::value>()) {
            // the strided loops did not overflow, return the next index
            return;
          }

          // the iterator has reached or passed the end of the extent, clamp it to the extent
          first_ = loop_->extent_;
          range_ = loop_->extent_;
          index_ = loop_->extent_;
        }

        // const pointer to the UniformElementsND that the iterator refers to
        UniformElementsND const* loop_;

        // modified by the pre/post-increment operator
        Vec first_;  // first element processed by this thread
        Vec range_;  // last element processed by this thread
        Vec index_;  // current element processed by this thread
      };

    private:
      const Vec elements_;
      const Vec thread_;
      const Vec stride_;
      const Vec extent_;
    };

  }  // namespace detail

  /* uniform_elements_nd
   *
   * `uniform_elements_nd(acc, ...)` is a shorthand for `detail::UniformElementsND<TAcc>(acc, ...)`.
   */

  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto uniform_elements_nd(TAcc const& acc) {
    return detail::UniformElementsND<TAcc>(acc);
  }

  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto uniform_elements_nd(TAcc const& acc, alpaka::Vec<alpaka::Dim<TAcc>, Idx> extent) {
    return detail::UniformElementsND<TAcc>(acc, extent);
  }
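
  // A minimal kernel sketch using `uniform_elements_nd` on a 2-dimensional accelerator: each
  // iteration yields a `Vec` of indices, here used to address a row-major matrix. The kernel
  // name, arguments and the `Dim2D` alias are illustrative, not part of this header.
  //
  //   struct FillKernel {
  //     template <typename TAcc>
  //     ALPAKA_FN_ACC void operator()(TAcc const& acc, float* matrix, Vec<Dim2D> extent) const {
  //       for (auto ndindex : uniform_elements_nd(acc, extent)) {
  //         matrix[ndindex[0] * extent[1] + ndindex[1]] = 0.f;
  //       }
  //     }
  //   };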

  namespace detail {

    /* UniformGroupsAlong
     *
     * `UniformGroupsAlong<Dim>(acc, elements)` returns a one-dimensional iterable range that spans the group indices
     * required to cover the given problem size along the `Dim` dimension, in units of the block size. `elements`
     * indicates the total number of elements, across all groups; if not specified, it defaults to the kernel grid
     * size along the `Dim` dimension.
     *
     * `uniform_groups_along<Dim>(acc, ...)` is a shorthand for `UniformGroupsAlong<TAcc, Dim>(acc, ...)` that can
     * infer the accelerator type from the argument.
     *
     * In a 1-dimensional kernel, `uniform_groups(acc, ...)` is a shorthand for `UniformGroupsAlong<TAcc, 0>(acc, ...)`.
     *
     * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed
     * by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
     * For convenience when converting CUDA or HIP code, `uniform_groups_x(acc, ...)`, `_y` and `_z` are shorthands
     * for `UniformGroupsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
     *
     * `uniform_groups_along<Dim>(acc, ...)` should be called consistently by all the threads in a block. All threads
     * in a block see the same loop iterations, while threads in different blocks may see a different number of
     * iterations.
     * If the work division has more blocks than the required number of groups, the first blocks will perform one
     * iteration of the loop, while the other blocks will exit the loop immediately.
     * If the work division has fewer blocks than the required number of groups, some of the blocks will perform more
     * than one iteration, in order to cover the whole problem space.
     *
     * If the problem size is not a multiple of the block size, the last group will process a number of elements
     * smaller than the block size. However, also in this case all threads in the block will execute the same number
     * of iterations of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left
     * to the inner loop (or the user) to ensure that only the correct number of threads process any data; this logic
     * is implemented by `uniform_group_elements_along<Dim>(acc, group, elements)`.
     *
     * For example, if the block size is 64 and there are 400 elements
     *
     *   for (auto group : uniform_groups_along<Dim>(acc, 400))
     *
     * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should
     * cover the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group,
     * group 6, should cover the elements from 384 to 399. All the threads of the block will process this last group;
     * it is up to the inner loop to not process the non-existing elements after 399.
     *
     * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the
     * other blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0
     * to 6 will process one group while block 7 will not process any.
     *
     * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the
     * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will
     * process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and
     * block 3 will process group 3.
     *
     * See `UniformElementsAlong<TAcc, Dim>(acc, ...)` for a concrete example using `uniform_groups_along<Dim>` and
     * `uniform_group_elements_along<Dim>`.
     */

    template <typename TAcc,
              std::size_t Dim,
              typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
    class UniformGroupsAlong {
    public:
      ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc)
          : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
            extent_{stride_} {}

      // extent is the total number of elements (not blocks)
      ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc, Idx extent)
          : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
            extent_{divide_up_by(extent, alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim])} {}

      class const_iterator;
      using iterator = const_iterator;

      ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); }

      ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); }

      class const_iterator {
        friend class UniformGroupsAlong;

        ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
            : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}

      public:
        ALPAKA_FN_ACC inline Idx operator*() const { return first_; }

        // pre-increment the iterator
        ALPAKA_FN_ACC inline const_iterator& operator++() {
          // increment the first-element-in-block index by the grid stride
          first_ += stride_;
          if (first_ < extent_)
            return *this;

          // the iterator has reached or passed the end of the extent, clamp it to the extent
          first_ = extent_;
          return *this;
        }

        // post-increment the iterator
        ALPAKA_FN_ACC inline const_iterator operator++(int) {
          const_iterator old = *this;
          ++(*this);
          return old;
        }

        ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); }

        ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

      private:
        // non-const to support iterator copy and assignment
        Idx stride_;
        Idx extent_;
        // modified by the pre/post-increment operator
        Idx first_;
      };

    private:
      const Idx first_;
      const Idx stride_;
      const Idx extent_;
    };

  }  // namespace detail

  /* uniform_groups
   *
   * `uniform_groups(acc, elements)` returns a one-dimensional iterable range that spans the group indices required to
   * cover the given problem size, in units of the block size. `elements` indicates the total number of elements,
   * across all groups; if not specified, it defaults to the kernel grid size.
   *
   * `uniform_groups(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, 0>(acc, ...)`.
   *
   * `uniform_groups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block
   * see the same loop iterations, while threads in different blocks may see a different number of iterations.
   * If the work division has more blocks than the required number of groups, the first blocks will perform one
   * iteration of the loop, while the other blocks will exit the loop immediately.
   * If the work division has fewer blocks than the required number of groups, some of the blocks will perform more
   * than one iteration, in order to cover the whole problem space.
   *
   * If the problem size is not a multiple of the block size, the last group will process a number of elements smaller
   * than the block size. However, also in this case all threads in the block will execute the same number of
   * iterations of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to
   * the inner loop (or the user) to ensure that only the correct number of threads process any data; this logic is
   * implemented by `uniform_group_elements(acc, group, elements)`.
   *
   * For example, if the block size is 64 and there are 400 elements
   *
   *   for (auto group : uniform_groups(acc, 400))
   *
   * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should cover
   * the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group, group 6,
   * should cover the elements from 384 to 399. All the threads of the block will process this last group; it is up to
   * the inner loop to not process the non-existing elements after 399.
   *
   * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
   * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will
   * process one group while block 7 will not process any.
   *
   * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the
   * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will
   * process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block 3
   * will process group 3.
   *
   * See `uniform_elements(acc, ...)` for a concrete example using `uniform_groups` and `uniform_group_elements`.
   *
   * Note that `uniform_groups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels, use
   *   - `uniform_groups_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
   *   - `uniform_groups_x(acc, ...)`, `uniform_groups_y(acc, ...)`, or `uniform_groups_z(acc, ...)` to loop
   *     along the fastest, second-fastest, or third-fastest dimension.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto uniform_groups(TAcc const& acc, TArgs... args) {
    return detail::UniformGroupsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }
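
  // A minimal kernel sketch combining `uniform_groups` and `uniform_group_elements` with a
  // block-level synchronisation between the two phases, following the pattern documented above.
  // The kernel and buffer names are illustrative, not part of this header.
  //
  //   struct TwoPhaseKernel {
  //     template <typename TAcc>
  //     ALPAKA_FN_ACC void operator()(TAcc const& acc, float* data, Idx size) const {
  //       for (auto group : uniform_groups(acc, size)) {
  //         for (auto element : uniform_group_elements(acc, group, size)) {
  //           data[element.global] += 1.f;  // first phase
  //         }
  //         alpaka::syncBlockThreads(acc);  // all threads in the block reach this point
  //         for (auto element : uniform_group_elements(acc, group, size)) {
  //           data[element.global] *= 2.f;  // second phase
  //         }
  //       }
  //     }
  //   };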

  /* uniform_groups_along<Dim>
   *
   * `uniform_groups_along<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, Dim>(acc, ...)` that
   * can infer the accelerator type from the argument.
   */

  template <typename TAcc,
            std::size_t Dim,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
  ALPAKA_FN_ACC inline auto uniform_groups_along(TAcc const& acc, TArgs... args) {
    return detail::UniformGroupsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
  }

  /* uniform_groups_x, _y, _z
   *
   * Like `uniform_groups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest dimensions.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto uniform_groups_x(TAcc const& acc, TArgs... args) {
    return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
  ALPAKA_FN_ACC inline auto uniform_groups_y(TAcc const& acc, TArgs... args) {
    return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
  ALPAKA_FN_ACC inline auto uniform_groups_z(TAcc const& acc, TArgs... args) {
    return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
  }

  namespace detail {

    /* UniformGroupElementsAlong
     *
     * `UniformGroupElementsAlong<TAcc, Dim>(acc, group, elements)` returns a one-dimensional iterable range that
     * spans all the elements within the given `group` along dimension `Dim`, as obtained from
     * `UniformGroupsAlong<Dim>`, up to `elements` (exclusive). `elements` indicates the total number of elements
     * across all groups; if not specified, it defaults to the kernel grid size.
     *
     * `uniform_group_elements_along<Dim>(acc, ...)` is a shorthand for `UniformGroupElementsAlong<TAcc, Dim>(acc, ...)`
     * that can infer the accelerator type from the argument.
     *
     * In a 1-dimensional kernel, `uniform_group_elements(acc, ...)` is a shorthand for
     * `UniformGroupElementsAlong<TAcc, 0>(acc, ...)`.
     *
     * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed
     * by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
     * For convenience when converting CUDA or HIP code, `uniform_group_elements_x(acc, ...)`, `_y` and `_z` are
     * shorthands for `UniformGroupElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
     *
     * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices
     * of the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded),
     * while the local index spans the range from 0 to the block size (excluded).
     *
     * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
     * global element index reaches `elements`.
     *
     * If the problem size is not a multiple of the block size, different threads may execute a different number of
     * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
     * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
     * See `UniformElementsAlong<TAcc, Dim>(acc, ...)` for a concrete example using `uniform_groups_along<Dim>` and
     * `uniform_group_elements_along<Dim>`.
     *
     * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
     * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example,
     * the kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the
     * element index explicitly inside the loop:
     *
     *   for (auto element : uniform_group_elements_along<N-1>(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
     *     bool flag = false;
     *     if (element.global < elements) {
     *       // do some work and compute a result flag only for the valid elements
     *       flag = do_some_work();
     *     }
     *     // check if any valid element had a positive result
     *     if (alpaka::warp::any(acc, flag)) {
     *       // ...
     *     }
     *   }
     *
     * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension, `N-1`.
     */

    template <typename TAcc,
              std::size_t Dim,
              typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
    class UniformGroupElementsAlong {
    public:
      ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block)
          : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]},
            local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] *
                   alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
            range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]} {}

      ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block, Idx extent)
          : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]},
            local_{std::min(extent - first_,
                            alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] *
                                alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])},
            range_{std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])} {}

      class const_iterator;
      using iterator = const_iterator;

      ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(local_, first_, range_); }

      ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(range_, first_, range_); }

      class const_iterator {
        friend class UniformGroupElementsAlong;

        ALPAKA_FN_ACC inline const_iterator(Idx local, Idx first, Idx range)
            : index_{local}, first_{first}, range_{range} {}

      public:
        ALPAKA_FN_ACC inline ElementIndex operator*() const { return ElementIndex{index_ + first_, index_}; }

        // pre-increment the iterator
        ALPAKA_FN_ACC inline const_iterator& operator++() {
          if constexpr (requires_single_thread_per_block_v<TAcc>) {
            // increment the index along the elements processed by the current thread
            ++index_;
            if (index_ < range_)
              return *this;
          }

          // the iterator has reached or passed the end of the extent, clamp it to the extent
          index_ = range_;
          return *this;
        }

        // post-increment the iterator
        ALPAKA_FN_ACC inline const_iterator operator++(int) {
          const_iterator old = *this;
          ++(*this);
          return old;
        }

        ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (index_ == other.index_); }

        ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

      private:
        // modified by the pre/post-increment operator
        Idx index_;
        // non-const to support iterator copy and assignment
        Idx first_;
        Idx range_;
      };

    private:
      const Idx first_;
      const Idx local_;
      const Idx range_;
    };

  }  // namespace detail

  /* uniform_group_elements
   *
   * `uniform_group_elements(acc, group, elements)` returns a one-dimensional iterable range that spans all the
   * elements within the given `group`, as obtained from `uniform_groups`, up to `elements` (exclusive). `elements`
   * indicates the total number of elements across all groups; if not specified, it defaults to the kernel grid size.
   *
   * `uniform_group_elements(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<TAcc, 0>(acc, ...)`.
   *
   * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices of
   * the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded), while
   * the local index spans the range from 0 to the block size (excluded).
   *
   * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
   * global element index reaches `elements`.
   *
   * If the problem size is not a multiple of the block size, different threads may execute a different number of
   * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
   * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
   * See `uniform_elements(acc, ...)` for a concrete example using `uniform_groups` and `uniform_group_elements`.
   *
   * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
   * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the
   * kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the element
   * index explicitly inside the loop:
   *
   *   for (auto element : uniform_group_elements(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
   *     bool flag = false;
   *     if (element.global < elements) {
   *       // do some work and compute a result flag only for the valid elements
   *       flag = do_some_work();
   *     }
   *     // check if any valid element had a positive result
   *     if (alpaka::warp::any(acc, flag)) {
   *       // ...
   *     }
   *   }
   *
   * Note that `uniform_group_elements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional
   * kernels, use
   *   - `uniform_group_elements_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
   *   - `uniform_group_elements_x(acc, ...)`, `uniform_group_elements_y(acc, ...)`, or
   *     `uniform_group_elements_z(acc, ...)` to loop along the fastest, second-fastest, or third-fastest dimension.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto uniform_group_elements(TAcc const& acc, TArgs... args) {
    return detail::UniformGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }
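
  // A minimal sketch of the `ElementIndex` values produced by `uniform_group_elements`:
  // `.global` indexes the full problem, `.local` indexes within the block, e.g. into a
  // hypothetical block-shared buffer declared by the kernel.
  //
  //   for (auto element : uniform_group_elements(acc, group, size)) {
  //     shared[element.local] = data[element.global];
  //   }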

  /* uniform_group_elements_along<Dim>
   *
   * `uniform_group_elements_along<Dim>(acc, ...)` is a shorthand for
   * `detail::UniformGroupElementsAlong<TAcc, Dim>(acc, ...)` that can infer the accelerator type from the argument.
   */

  template <typename TAcc,
            std::size_t Dim,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
  ALPAKA_FN_ACC inline auto uniform_group_elements_along(TAcc const& acc, TArgs... args) {
    return detail::UniformGroupElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
  }

  /* uniform_group_elements_x, _y, _z
   *
   * Like `uniform_group_elements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
   * dimensions.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto uniform_group_elements_x(TAcc const& acc, TArgs... args) {
    return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
  ALPAKA_FN_ACC inline auto uniform_group_elements_y(TAcc const& acc, TArgs... args) {
    return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
  ALPAKA_FN_ACC inline auto uniform_group_elements_z(TAcc const& acc, TArgs... args) {
    return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
  }

  namespace detail {

    /* IndependentGroupsAlong
     *
     * `IndependentGroupsAlong<TAcc, Dim>(acc, groups)` returns a one-dimensional iterable range that spans the group
     * indices from 0 to `groups`; the groups are assigned to the blocks along the `Dim` dimension. If `groups` is not
     * specified, it defaults to the number of blocks along the `Dim` dimension.
     *
     * `independent_groups_along<Dim>(acc, ...)` is a shorthand for `IndependentGroupsAlong<TAcc, Dim>(acc, ...)` that
     * can infer the accelerator type from the argument.
     *
     * In a 1-dimensional kernel, `independent_groups(acc, ...)` is a shorthand for
     * `IndependentGroupsAlong<TAcc, 0>(acc, ...)`.
     *
     * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed
     * by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
     * For convenience when converting CUDA or HIP code, `independent_groups_x(acc, ...)`, `_y` and `_z` are
     * shorthands for `IndependentGroupsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
     *
     * `independent_groups_along<Dim>(acc, ...)` should be called consistently by all the threads in a block. All
     * threads in a block see the same loop iterations, while threads in different blocks may see a different number
     * of iterations.
     * If the work division has more blocks than the required number of groups, the first blocks will perform one
     * iteration of the loop, while the other blocks will exit the loop immediately.
     * If the work division has fewer blocks than the required number of groups, some of the blocks will perform more
     * than one iteration, in order to cover the whole problem space.
     *
     * For example,
     *
     *   for (auto group : independent_groups_along<Dim>(acc, 7))
     *
     * will return the group range from 0 to 6, distributed across all blocks in the work division.
     * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the
     * other blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0
     * to 6 will process one group while block 7 will not process any.
     * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the
     * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will
     * process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and
     * block 3 will process group 3.
     */

    template <typename TAcc,
              std::size_t Dim,
              typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
    class IndependentGroupsAlong {
    public:
      ALPAKA_FN_ACC inline IndependentGroupsAlong(TAcc const& acc)
          : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
            extent_{stride_} {}

      ALPAKA_FN_ACC inline IndependentGroupsAlong(TAcc const& acc, Idx groups)
          : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
            stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
            extent_{groups} {}

      class const_iterator;
      using iterator = const_iterator;

      ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); }

      ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); }

      class const_iterator {
        friend class IndependentGroupsAlong;

        ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
            : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}

      public:
        ALPAKA_FN_ACC inline Idx operator*() const { return first_; }

        // pre-increment the iterator
        ALPAKA_FN_ACC inline const_iterator& operator++() {
          // increment the first-element-in-block index by the grid stride
          first_ += stride_;
          if (first_ < extent_)
            return *this;

          // the iterator has reached or passed the end of the extent, clamp it to the extent
          first_ = extent_;
          return *this;
        }

        // post-increment the iterator
        ALPAKA_FN_ACC inline const_iterator operator++(int) {
          const_iterator old = *this;
          ++(*this);
          return old;
        }

        ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); }

        ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

      private:
        // non-const to support iterator copy and assignment
        Idx stride_;
        Idx extent_;
        // modified by the pre/post-increment operator
        Idx first_;
      };

    private:
      const Idx first_;
      const Idx stride_;
      const Idx extent_;
    };

  }  // namespace detail

  /* independent_groups
   *
   * `independent_groups(acc, groups)` returns a one-dimensional iterable range that spans the group indices from 0
   * to `groups`. If `groups` is not specified, it defaults to the number of blocks.
   *
   * `independent_groups(acc, ...)` is a shorthand for `detail::IndependentGroupsAlong<TAcc, 0>(acc, ...)`.
   *
   * `independent_groups(acc, ...)` should be called consistently by all the threads in a block. All threads in a
   * block see the same loop iterations, while threads in different blocks may see a different number of iterations.
   * If the work division has more blocks than the required number of groups, the first blocks will perform one
   * iteration of the loop, while the other blocks will exit the loop immediately.
   * If the work division has fewer blocks than the required number of groups, some of the blocks will perform more
   * than one iteration, in order to cover the whole problem space.
   *
   * For example,
   *
   *   for (auto group : independent_groups(acc, 7))
   *
   * will return the group range from 0 to 6, distributed across all blocks in the work division.
   * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
   * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6
   * will process one group while block 7 will not process any.
   * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the
   * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will
   * process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and
   * block 3 will process group 3.
   *
   * Note that `independent_groups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional
   * kernels, use
   *   - `independent_groups_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
   *   - `independent_groups_x(acc, ...)`, `independent_groups_y(acc, ...)`, or `independent_groups_z(acc, ...)` to
   *     loop along the fastest, second-fastest, or third-fastest dimension.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto independent_groups(TAcc const& acc, TArgs... args) {
    return detail::IndependentGroupsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }
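
  // A minimal kernel sketch using `independent_groups`: assign one block to each of `numJobs`
  // independent work units, with the threads of the block looping over the elements of each unit
  // via `independent_group_elements` (defined below). The names are illustrative, not part of
  // this header.
  //
  //   struct PerJobKernel {
  //     template <typename TAcc>
  //     ALPAKA_FN_ACC void operator()(TAcc const& acc, float const* in, float* out, Idx numJobs, Idx jobSize) const {
  //       for (auto job : independent_groups(acc, numJobs)) {
  //         for (auto element : independent_group_elements(acc, jobSize)) {
  //           out[job * jobSize + element] += in[job * jobSize + element];
  //         }
  //       }
  //     }
  //   };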

  /* independent_groups_along<Dim>
   *
   * `independent_groups_along<Dim>(acc, ...)` is a shorthand for `detail::IndependentGroupsAlong<TAcc, Dim>(acc, ...)`
   * that can infer the accelerator type from the argument.
   */

  template <typename TAcc,
            std::size_t Dim,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
  ALPAKA_FN_ACC inline auto independent_groups_along(TAcc const& acc, TArgs... args) {
    return detail::IndependentGroupsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
  }

  /* independent_groups_x, _y, _z
   *
   * Like `independent_groups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
   * dimensions.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto independent_groups_x(TAcc const& acc, TArgs... args) {
    return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
  ALPAKA_FN_ACC inline auto independent_groups_y(TAcc const& acc, TArgs... args) {
    return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
  ALPAKA_FN_ACC inline auto independent_groups_z(TAcc const& acc, TArgs... args) {
    return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
  }

  namespace detail {

    /* IndependentGroupElementsAlong
     *
     * `independent_group_elements_along<Dim>(acc, ...)` is a shorthand for
     * `IndependentGroupElementsAlong<TAcc, Dim>(acc, ...)` that can infer the accelerator type from the argument.
     */
1222 
1223  template <typename TAcc,
1224  std::size_t Dim,
1225  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
1227  public:
1228  ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc)
1229  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
1230  thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
1231  stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
1232  extent_{stride_} {}
1233 
1234  ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc, Idx extent)
1235  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
1236  thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
1237  stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
1238  extent_{extent} {}
1239 
1240  ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc, Idx first, Idx extent)
1241  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
1242  thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_ + first},
1243  stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
1244  extent_{extent} {}
1245 
1246  class const_iterator;
 1247  using iterator = const_iterator;
 1248 
1249  ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, thread_); }
1250 
1251  ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); }
1252 
 1253  class const_iterator {
 1254  friend class IndependentGroupElementsAlong;
 1255 
1256  ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
1257  : elements_{elements},
1258  stride_{stride},
1259  extent_{extent},
1260  first_{std::min(first, extent)},
1261  index_{first_},
1262  range_{std::min(first + elements, extent)} {}
1263 
1264  public:
1265  ALPAKA_FN_ACC inline Idx operator*() const { return index_; }
1266 
1267  // pre-increment the iterator
1268  ALPAKA_FN_ACC inline const_iterator& operator++() {
1269  if constexpr (requires_single_thread_per_block_v<TAcc>) {
1270  // increment the index along the elements processed by the current thread
1271  ++index_;
1272  if (index_ < range_)
1273  return *this;
1274  }
1275 
1276  // increment the thread index with the block stride
1277  first_ += stride_;
1278  index_ = first_;
1279  range_ = std::min(first_ + elements_, extent_);
1280  if (index_ < extent_)
1281  return *this;
1282 
1283  // the iterator has reached or passed the end of the extent, clamp it to the extent
1284  first_ = extent_;
1285  index_ = extent_;
1286  range_ = extent_;
1287  return *this;
1288  }
1289 
1290  // post-increment the iterator
1291  ALPAKA_FN_ACC inline const_iterator operator++(int) {
1292  const_iterator old = *this;
1293  ++(*this);
1294  return old;
1295  }
1296 
1297  ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const {
1298  return (index_ == other.index_) and (first_ == other.first_);
1299  }
1300 
1301  ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }
1302 
 1303  private:
 1304  // non-const to support iterator copy and assignment
 1305  Idx elements_;
 1306  Idx stride_;
 1307  Idx extent_;
 1308  // modified by the pre/post-increment operator
 1309  Idx first_;
 1310  Idx index_;
 1311  Idx range_;
 1312  };
1313 
 1314  private:
 1315  const Idx elements_;
 1316  const Idx thread_;
1317  const Idx stride_;
1318  const Idx extent_;
1319  };
1320 
1321  } // namespace detail
1322 
 1323  /* `independent_group_elements(acc, ...)` is a shorthand for `detail::IndependentGroupElementsAlong<TAcc, 0>(acc, ...)` for 1-dimensional kernels.
 1324  */
1325 
1326  template <typename TAcc,
1327  typename... TArgs,
1328  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
1329  ALPAKA_FN_ACC inline auto independent_group_elements(TAcc const& acc, TArgs... args) {
1330  return detail::IndependentGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
1331  }
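
  // Example: a sketch of the typical pairing of `independent_groups` and
  // `independent_group_elements`: each block works on one row, and the threads of the
  // block loop cooperatively over the columns of that row. The kernel name
  // `ExampleScaleRows` is hypothetical.
  struct ExampleScaleRows {
    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, float* data, Idx rows, Idx cols) const {
      for (Idx row : independent_groups(acc, rows)) {
        for (Idx col : independent_group_elements(acc, cols)) {
          data[row * cols + col] *= 2.f;
        }
      }
    }
  };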
1332 
1333  /* independent_group_elements_along<Dim>
1334  *
1335  * `independent_group_elements_along<Dim>(acc, ...)` is a shorthand for
1336  * `detail::IndependentGroupElementsAlong<TAcc, Dim>(acc, ...)` that can infer the accelerator type from the argument.
1337  */
1338 
 1339  template <std::size_t Dim,
 1340  typename TAcc,
 1341  typename... TArgs,
 1342  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
1343  ALPAKA_FN_ACC inline auto independent_group_elements_along(TAcc const& acc, TArgs... args) {
1344  return detail::IndependentGroupElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
1345  }
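
  // Example: a sketch of a 2-dimensional kernel that pairs `independent_groups_along<0>`
  // with `independent_group_elements_along<1>`: each block along the slowest dimension
  // owns one buffer, and its threads along the fastest dimension fill it. The kernel
  // name `ExampleFillBuffers` is hypothetical.
  struct ExampleFillBuffers {
    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 2>>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, float* buffers, Idx nBuffers, Idx size) const {
      for (Idx buffer : independent_groups_along<0>(acc, nBuffers)) {
        for (Idx i : independent_group_elements_along<1>(acc, size)) {
          buffers[buffer * size + i] = 1.f;
        }
      }
    }
  };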
1346 
1347  /* independent_group_elements_x, _y, _z
1348  *
1349  * Like `independent_group_elements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
1350  * dimensions.
1351  */
1352 
1353  template <typename TAcc,
1354  typename... TArgs,
1355  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
1356  ALPAKA_FN_ACC inline auto independent_group_elements_x(TAcc const& acc, TArgs... args) {
 1357  return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
 1358  }
1359 
1360  template <typename TAcc,
1361  typename... TArgs,
1362  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
1363  ALPAKA_FN_ACC inline auto independent_group_elements_y(TAcc const& acc, TArgs... args) {
 1364  return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
 1365  }
1366 
1367  template <typename TAcc,
1368  typename... TArgs,
1369  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
1370  ALPAKA_FN_ACC inline auto independent_group_elements_z(TAcc const& acc, TArgs... args) {
 1371  return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
 1372  }
1373 
1374  /* once_per_grid
1375  *
1376  * `once_per_grid(acc)` returns true for a single thread within the kernel execution grid.
1377  *
1378  * Usually the condition is true for block 0 and thread 0, but these indices should not be relied upon.
1379  */
1380 
1381  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
1382  ALPAKA_FN_ACC inline constexpr bool once_per_grid(TAcc const& acc) {
1383  return alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) == Vec<alpaka::Dim<TAcc>>::zeros();
1384  }
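
  // Example: a sketch that uses `once_per_grid` to initialise a single device-global
  // value exactly once per kernel launch; the kernel name `ExampleInitCounter` is
  // hypothetical.
  struct ExampleInitCounter {
    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, unsigned int* counter) const {
      if (once_per_grid(acc)) {
        *counter = 0;
      }
    }
  };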
1385 
1386  /* once_per_block
1387  *
1388  * `once_per_block(acc)` returns true for a single thread within the block.
1389  *
1390  * Usually the condition is true for thread 0, but this index should not be relied upon.
1391  */
1392 
1393  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
1394  ALPAKA_FN_ACC inline constexpr bool once_per_block(TAcc const& acc) {
1395  return alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc) == Vec<alpaka::Dim<TAcc>>::zeros();
1396  }
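
  // Example: a sketch that combines `once_per_block` with block-shared memory: a single
  // thread resets the shared accumulator of each group, the threads of the block fill
  // it atomically, and a single thread writes the result out. The kernel name
  // `ExampleBlockSums` is hypothetical.
  struct ExampleBlockSums {
    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, int const* values, int* sums, Idx groups, Idx size) const {
      // one shared accumulator per block
      int& localSum = alpaka::declareSharedVar<int, __COUNTER__>(acc);
      for (Idx group : independent_groups(acc, groups)) {
        if (once_per_block(acc)) {
          localSum = 0;
        }
        alpaka::syncBlockThreads(acc);
        for (Idx i : independent_group_elements(acc, size)) {
          alpaka::atomicAdd(acc, &localSum, values[group * size + i], alpaka::hierarchy::Threads{});
        }
        alpaka::syncBlockThreads(acc);
        if (once_per_block(acc)) {
          sums[group] = localSum;
        }
      }
    }
  };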
1397 
1398 } // namespace cms::alpakatools
1399 
1400 #endif // HeterogeneousCore_AlpakaInterface_interface_workdivision_h