workdivision.h
#ifndef HeterogeneousCore_AlpakaInterface_interface_workdivision_h
#define HeterogeneousCore_AlpakaInterface_interface_workdivision_h

#include <algorithm>
#include <cstddef>
#include <type_traits>

#include <alpaka/alpaka.hpp>

#include "HeterogeneousCore/AlpakaInterface/interface/config.h"

namespace cms::alpakatools {

  using namespace alpaka_common;

  // If the first argument is not a multiple of the second argument, round it up to the next multiple
  inline constexpr Idx round_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor * divisor; }

  // Return the integer division of the first argument by the second argument, rounded up to the next integer
  inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor; }
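
  // For example, covering 400 elements in groups of 64 requires 7 groups, or 448 elements after rounding up:
  static_assert(round_up_by(400, 64) == 448);
  static_assert(divide_up_by(400, 64) == 7);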

  // Trait describing whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped
  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
  struct requires_single_thread_per_block : public std::true_type {};

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
  template <typename TDim>
  struct requires_single_thread_per_block<alpaka::AccGpuCudaRt<TDim, Idx>> : public std::false_type {};
#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED

#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
  template <typename TDim>
  struct requires_single_thread_per_block<alpaka::AccGpuHipRt<TDim, Idx>> : public std::false_type {};
#endif  // ALPAKA_ACC_GPU_HIP_ENABLED

#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
  template <typename TDim>
  struct requires_single_thread_per_block<alpaka::AccCpuThreads<TDim, Idx>> : public std::false_type {};
#endif  // ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED

  // Whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped
  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
  inline constexpr bool requires_single_thread_per_block_v = requires_single_thread_per_block<TAcc>::value;

  // Create an accelerator-dependent work division for 1-dimensional kernels
  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  inline WorkDiv<Dim1D> make_workdiv(Idx blocks, Idx elements) {
    if constexpr (not requires_single_thread_per_block_v<TAcc>) {
      // On GPU backends, each thread is looking at a single element:
      //   - the number of threads per block is "elements";
      //   - the number of elements per thread is always 1.
      return WorkDiv<Dim1D>(blocks, elements, Idx{1});
    } else {
      // On CPU backends, run serially with a single thread per block:
      //   - the number of threads per block is always 1;
      //   - the number of elements per thread is "elements".
      return WorkDiv<Dim1D>(blocks, Idx{1}, elements);
    }
  }
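
  // A minimal usage sketch (illustrative; assumes the 1-dimensional accelerator alias `Acc1D` from the alpaka
  // configuration headers):
  //
  //   Idx size = 400;
  //   Idx elementsPerBlock = 64;
  //   auto workDiv = make_workdiv<Acc1D>(divide_up_by(size, elementsPerBlock), elementsPerBlock);
  //
  // On a GPU backend this yields 7 blocks of 64 threads with 1 element each; on a serial CPU backend,
  // 7 blocks of 1 thread with 64 elements each.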

  // Create the accelerator-dependent workdiv for N-dimensional kernels
  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
  inline WorkDiv<alpaka::Dim<TAcc>> make_workdiv(const Vec<alpaka::Dim<TAcc>>& blocks,
                                                 const Vec<alpaka::Dim<TAcc>>& elements) {
    using Dim = alpaka::Dim<TAcc>;
    if constexpr (not requires_single_thread_per_block_v<TAcc>) {
      // On GPU backends, each thread is looking at a single element:
      //   - the number of threads per block is "elements";
      //   - the number of elements per thread is always 1.
      return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
    } else {
      // On CPU backends, run serially with a single thread per block:
      //   - the number of threads per block is always 1;
      //   - the number of elements per thread is "elements".
      return WorkDiv<Dim>(blocks, Vec<Dim>::ones(), elements);
    }
  }
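
  // A minimal N-dimensional usage sketch (illustrative; assumes the 2-dimensional accelerator alias `Acc2D` and the
  // vector alias `Vec2D` from the alpaka configuration headers):
  //
  //   auto workDiv = make_workdiv<Acc2D>(Vec2D{8, 8}, Vec2D{16, 16});
  //
  // As in the 1-dimensional case, the threads-per-block and elements-per-thread are swapped automatically on
  // backends that run a single thread per block.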

  /* ElementIndex
   *
   * An aggregate that contains the `.global` and `.local` indices of an element; returned by iterating over the
   * objects returned by `elements_in_block` and similar functions.
   */

  struct ElementIndex {
    Idx global;
    Idx local;
  };
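
  // Illustrative usage sketch (the names `output` and `shared` are hypothetical buffers): the `.global` index
  // addresses the full problem space, while the `.local` index addresses the elements within the current block.
  //
  //   for (auto element : uniform_group_elements(acc, group, size)) {
  //     output[element.global] = shared[element.local];
  //   }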

  /* uniform_elements_along
   *
   * `uniform_elements_along<Dim>(acc [, first], extent)` returns a one-dimensional iterable range that spans the
   * element indices from `first` (inclusive) to `extent` (exclusive) along the `Dim` dimension.
   * If `first` is not specified, it defaults to 0.
   * If `extent` is not specified, it defaults to the kernel grid size along the `Dim` dimension.
   *
   * In a 1-dimensional kernel, `uniform_elements(acc, ...)` is a shorthand for `uniform_elements_along<0>(acc, ...)`.
   *
   * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed
   * by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
   * For convenience when converting CUDA or HIP code, `uniform_elements_x(acc, ...)`, `_y` and `_z` are shorthands for
   * `uniform_elements_along<N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
   *
   * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not
   * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop.
   * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
   * loop over each group's elements, and synchronise only in the outer loop:
   *
   *   for (auto group : uniform_groups_along<Dim>(acc, extent)) {
   *     for (auto element : uniform_group_elements_along<Dim>(acc, group, extent)) {
   *       // first part of the computation
   *       // no synchronisations here
   *       ...
   *     }
   *     // wait for all threads to complete the first part
   *     alpaka::syncBlockThreads();
   *     for (auto element : uniform_group_elements_along<Dim>(acc, group, extent)) {
   *       // second part of the computation
   *       // no synchronisations here
   *       ...
   *     }
   *     // wait for all threads to complete the second part
   *     alpaka::syncBlockThreads();
   *     ...
   *   }
   *
   * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a multiple
   * of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the kernel may
   * hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the element index
   * explicitly inside the loop:
   *
   *   for (auto element : uniform_elements_along<N-1>(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
   *     bool flag = false;
   *     if (element < extent) {
   *       // do some work and compute a result flag only for the valid elements
   *       flag = do_some_work();
   *     }
   *     // check if any valid element had a positive result
   *     if (alpaka::warp::any(acc, flag)) {
   *       // ...
   *     }
   *   }
   *
   * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension, `N-1`.
   */

  template <typename TAcc,
            std::size_t Dim,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
  class uniform_elements_along {
  public:
    ALPAKA_FN_ACC inline uniform_elements_along(TAcc const& acc)
        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
          extent_{stride_} {}

    ALPAKA_FN_ACC inline uniform_elements_along(TAcc const& acc, Idx extent)
        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
          extent_{extent} {}

    ALPAKA_FN_ACC inline uniform_elements_along(TAcc const& acc, Idx first, Idx extent)
        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
          first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_ + first},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
          extent_{extent} {}

    class const_iterator;
    using iterator = const_iterator;

    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, first_); }

    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); }

    class const_iterator {
      friend class uniform_elements_along;

      ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
          : elements_{elements},
            stride_{stride},
            extent_{extent},
            first_{std::min(first, extent)},
            index_{first_},
            range_{std::min(first + elements, extent)} {}

    public:
      ALPAKA_FN_ACC inline Idx operator*() const { return index_; }

      // pre-increment the iterator
      ALPAKA_FN_ACC inline const_iterator& operator++() {
        if constexpr (requires_single_thread_per_block_v<TAcc>) {
          // increment the index along the elements processed by the current thread
          ++index_;
          if (index_ < range_)
            return *this;
        }

        // increment the thread index with the grid stride
        first_ += stride_;
        index_ = first_;
        range_ = std::min(first_ + elements_, extent_);
        if (index_ < extent_)
          return *this;

        // the iterator has reached or passed the end of the extent, clamp it to the extent
        first_ = extent_;
        index_ = extent_;
        range_ = extent_;
        return *this;
      }

      // post-increment the iterator
      ALPAKA_FN_ACC inline const_iterator operator++(int) {
        const_iterator old = *this;
        ++(*this);
        return old;
      }

      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const {
        return (index_ == other.index_) and (first_ == other.first_);
      }

      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

    private:
      // non-const to support iterator copy and assignment
      Idx elements_;
      Idx stride_;
      Idx extent_;
      // modified by the pre/post-increment operator
      Idx first_;
      Idx index_;
      Idx range_;
    };

  private:
    const Idx elements_;
    const Idx first_;
    const Idx stride_;
    const Idx extent_;
  };

  /* uniform_elements
   *
   * `uniform_elements(acc [, first], extent)` returns a one-dimensional iterable range that spans the element indices
   * from `first` (inclusive) to `extent` (exclusive).
   * If `first` is not specified, it defaults to 0.
   * If `extent` is not specified, it defaults to the kernel grid size.
   *
   * `uniform_elements(acc, ...)` is a shorthand for `uniform_elements_along<0>(acc, ...)`.
   *
   * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not
   * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop.
   * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
   * loop over each group's elements, and synchronise only in the outer loop:
   *
   *   for (auto group : uniform_groups(acc, extent)) {
   *     for (auto element : uniform_group_elements(acc, group, extent)) {
   *       // first part of the computation
   *       // no synchronisations here
   *       ...
   *     }
   *     // wait for all threads to complete the first part
   *     alpaka::syncBlockThreads();
   *     for (auto element : uniform_group_elements(acc, group, extent)) {
   *       // second part of the computation
   *       // no synchronisations here
   *       ...
   *     }
   *     // wait for all threads to complete the second part
   *     alpaka::syncBlockThreads();
   *     ...
   *   }
   *
   * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a multiple
   * of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the kernel may
   * hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the element index
   * explicitly inside the loop:
   *
   *   for (auto element : uniform_elements(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
   *     bool flag = false;
   *     if (element < extent) {
   *       // do some work and compute a result flag only for elements up to extent
   *       flag = do_some_work();
   *     }
   *     // check if any valid element had a positive result
   *     if (alpaka::warp::any(acc, flag)) {
   *       // ...
   *     }
   *   }
   *
   * Note that `uniform_elements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels, use
   *   - `uniform_elements_nd(acc, ...)` to cover an N-dimensional problem space with a single loop;
   *   - `uniform_elements_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
   *   - `uniform_elements_x(acc, ...)`, `uniform_elements_y(acc, ...)`, or `uniform_elements_z(acc, ...)` to loop
   *     along the fastest, second-fastest, or third-fastest dimension.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto uniform_elements(TAcc const& acc, TArgs... args) {
    return uniform_elements_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }
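
  // A minimal kernel sketch (illustrative) using `uniform_elements` for a grid-strided loop over the elements:
  //
  //   struct SquareKernel {
  //     template <typename TAcc>
  //     ALPAKA_FN_ACC void operator()(TAcc const& acc, float const* in, float* out, Idx size) const {
  //       for (Idx i : uniform_elements(acc, size)) {
  //         out[i] = in[i] * in[i];
  //       }
  //     }
  //   };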

  /* uniform_elements_x, _y, _z
   *
   * Like `uniform_elements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest dimensions.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto uniform_elements_x(TAcc const& acc, TArgs... args) {
    return uniform_elements_along<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
  ALPAKA_FN_ACC inline auto uniform_elements_y(TAcc const& acc, TArgs... args) {
    return uniform_elements_along<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
  ALPAKA_FN_ACC inline auto uniform_elements_z(TAcc const& acc, TArgs... args) {
    return uniform_elements_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
  }

  /* elements_with_stride
   *
   * `elements_with_stride(acc [, first], extent)` returns a one-dimensional iterable range that spans the element
   * indices from `first` (inclusive) to `extent` (exclusive).
   * If `first` is not specified, it defaults to 0.
   * If `extent` is not specified, it defaults to the kernel grid size.
   *
   * `elements_with_stride(acc, ...)` is a legacy name for `uniform_elements(acc, ...)`.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto elements_with_stride(TAcc const& acc, TArgs... args) {
    return uniform_elements_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }
  /* uniform_elements_nd
   *
   * `uniform_elements_nd(acc, extent)` returns an N-dimensional iterable range that spans the element indices
   * required to cover the given problem size, indicated by `extent`.
   *
   * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not
   * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop.
   * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
   * loop over each group's elements, and synchronise only in the outer loop:
   *
   *   for (auto group0 : uniform_groups_along<0>(acc, extent[0])) {
   *     for (auto group1 : uniform_groups_along<1>(acc, extent[1])) {
   *       for (auto element0 : uniform_group_elements_along<0>(acc, group0, extent[0])) {
   *         for (auto element1 : uniform_group_elements_along<1>(acc, group1, extent[1])) {
   *           // first part of the computation
   *           // no synchronisations here
   *           ...
   *         }
   *       }
   *       // wait for all threads to complete the first part
   *       alpaka::syncBlockThreads();
   *       for (auto element0 : uniform_group_elements_along<0>(acc, group0, extent[0])) {
   *         for (auto element1 : uniform_group_elements_along<1>(acc, group1, extent[1])) {
   *           // second part of the computation
   *           // no synchronisations here
   *           ...
   *         }
   *       }
   *       // wait for all threads to complete the second part
   *       alpaka::syncBlockThreads();
   *       ...
   *     }
   *   }
   *
   * For more details, see `uniform_elements_along<Dim>(acc, ...)`.
   */

  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  class uniform_elements_nd {
  public:
    using Dim = alpaka::Dim<TAcc>;
    using Vec = alpaka::Vec<Dim, Idx>;

    ALPAKA_FN_ACC inline uniform_elements_nd(TAcc const& acc)
        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
          thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
          extent_{stride_} {}

    ALPAKA_FN_ACC inline uniform_elements_nd(TAcc const& acc, Vec extent)
        : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
          thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
          extent_{extent} {}

    // tag used to construct an end iterator
    struct at_end_t {};

    class const_iterator;
    using iterator = const_iterator;

    ALPAKA_FN_ACC inline const_iterator begin() const {
      // check that all dimensions of the current thread index are within the extent
      if ((thread_ < extent_).all()) {
        // construct an iterator pointing to the first element to be processed by the current thread
        return const_iterator{this, thread_};
      } else {
        // construct an end iterator, pointing past the end of the extent
        return const_iterator{this, at_end_t{}};
      }
    }

    ALPAKA_FN_ACC inline const_iterator end() const {
      // construct an end iterator, pointing past the end of the extent
      return const_iterator{this, at_end_t{}};
    }

    class const_iterator {
      friend class uniform_elements_nd;

    public:
      ALPAKA_FN_ACC inline Vec operator*() const { return index_; }

      // pre-increment the iterator
      ALPAKA_FN_ACC constexpr inline const_iterator operator++() {
        increment();
        return *this;
      }

      // post-increment the iterator
      ALPAKA_FN_ACC constexpr inline const_iterator operator++(int) {
        const_iterator old = *this;
        increment();
        return old;
      }

      ALPAKA_FN_ACC constexpr inline bool operator==(const_iterator const& other) const {
        return (index_ == other.index_);
      }

      ALPAKA_FN_ACC constexpr inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

    private:
      // construct an iterator pointing to the first element to be processed by the current thread
      ALPAKA_FN_ACC inline const_iterator(uniform_elements_nd const* loop, Vec first)
          : loop_{loop},
            first_{alpaka::elementwise_min(first, loop->extent_)},
            range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)},
            index_{first_} {}

      // construct an end iterator, pointing past the end of the extent
      ALPAKA_FN_ACC inline const_iterator(uniform_elements_nd const* loop, at_end_t const&)
          : loop_{loop}, first_{loop_->extent_}, range_{loop_->extent_}, index_{loop_->extent_} {}

      template <size_t I>
      ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() {
        bool overflow = false;
        ++index_[I];
        if (index_[I] >= range_[I]) {
          index_[I] = first_[I];
          overflow = true;
        }
        return overflow;
      }

      template <size_t N>
      ALPAKA_FN_ACC inline constexpr bool do_elements_loops() {
        if constexpr (N == 0) {
          // overflow
          return true;
        } else {
          if (not nth_elements_loop<N - 1>()) {
            return false;
          } else {
            return do_elements_loops<N - 1>();
          }
        }
      }

      template <size_t I>
      ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() {
        bool overflow = false;
        first_[I] += loop_->stride_[I];
        if (first_[I] >= loop_->extent_[I]) {
          first_[I] = loop_->thread_[I];
          overflow = true;
        }
        index_[I] = first_[I];
        range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]);
        return overflow;
      }

      template <size_t N>
      ALPAKA_FN_ACC inline constexpr bool do_strided_loops() {
        if constexpr (N == 0) {
          // overflow
          return true;
        } else {
          if (not nth_strided_loop<N - 1>()) {
            return false;
          } else {
            return do_strided_loops<N - 1>();
          }
        }
      }

      // increment the iterator
      ALPAKA_FN_ACC inline constexpr void increment() {
        if constexpr (requires_single_thread_per_block_v<TAcc>) {
          // linear N-dimensional loops over the elements associated to the thread;
          // do_elements_loops<>() returns true if any of those loops overflows
          if (not do_elements_loops<Dim::value>()) {
            // the elements loops did not overflow, return the next index
            return;
          }
        }

        // strided N-dimensional loop over the threads in the kernel launch grid;
        // do_strided_loops<>() returns true if any of those loops overflows
        if (not do_strided_loops<Dim::value>()) {
          // the strided loops did not overflow, return the next index
          return;
        }

        // the iterator has reached or passed the end of the extent, clamp it to the extent
        first_ = loop_->extent_;
        range_ = loop_->extent_;
        index_ = loop_->extent_;
      }

      // const pointer to the uniform_elements_nd that the iterator refers to
      uniform_elements_nd const* loop_;

      // modified by the pre/post-increment operator
      Vec first_;  // first element processed by this thread
      Vec range_;  // end of the range of elements processed by this thread (exclusive)
      Vec index_;  // current element processed by this thread
    };

  private:
    const Vec elements_;
    const Vec thread_;
    const Vec stride_;
    const Vec extent_;
  };
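
  // A minimal 2-dimensional sketch (illustrative; assumes the `Vec2D` alias from the alpaka configuration headers,
  // with hypothetical `image`, `width` and `height`):
  //
  //   for (auto index : uniform_elements_nd(acc, Vec2D{height, width})) {
  //     image[index[0] * width + index[1]] = 0.f;
  //   }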

  /* elements_with_stride_nd
   *
   * `elements_with_stride_nd(acc, extent)` returns an N-dimensional iterable range that spans the element indices
   * required to cover the given problem size, indicated by `extent`.
   *
   * `elements_with_stride_nd(acc, ...)` is a legacy name for `uniform_elements_nd(acc, ...)`.
   */

  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto elements_with_stride_nd(TAcc const& acc) {
    return uniform_elements_nd<TAcc>(acc);
  }

  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto elements_with_stride_nd(TAcc const& acc, alpaka::Vec<alpaka::Dim<TAcc>, Idx> extent) {
    return uniform_elements_nd<TAcc>(acc, extent);
  }

  /* uniform_groups_along
   *
   * `uniform_groups_along<Dim>(acc, elements)` returns a one-dimensional iterable range that spans the group indices
   * required to cover the given problem size along the `Dim` dimension, in units of the block size. `elements`
   * indicates the total number of elements, across all groups; if not specified, it defaults to the kernel grid size
   * along the `Dim` dimension.
   *
   * In a 1-dimensional kernel, `uniform_groups(acc, ...)` is a shorthand for `uniform_groups_along<0>(acc, ...)`.
   *
   * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by
   * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
   * For convenience when converting CUDA or HIP code, `uniform_groups_x(acc, ...)`, `_y` and `_z` are shorthands for
   * `uniform_groups_along<N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
   *
   * `uniform_groups_along<Dim>` should be called consistently by all the threads in a block. All threads in a block
   * see the same loop iterations, while threads in different blocks may see a different number of iterations.
   * If the work division has more blocks than the required number of groups, the first blocks will perform one
   * iteration of the loop, while the other blocks will exit the loop immediately.
   * If the work division has fewer blocks than the required number of groups, some of the blocks will perform more
   * than one iteration, in order to cover the whole problem space.
   *
   * If the problem size is not a multiple of the block size, the last group will process a number of elements smaller
   * than the block size. However, also in this case all threads in the block will execute the same number of
   * iterations of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to
   * the inner loop (or the user) to ensure that only the correct number of threads process any data; this logic is
   * implemented by `uniform_group_elements_along<Dim>(acc, group, elements)`.
   *
   * For example, if the block size is 64 and there are 400 elements,
   *
   *   for (auto group : uniform_groups_along<Dim>(acc, 400))
   *
   * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should cover
   * the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group, group 6,
   * should cover the elements from 384 to 399. All the threads of the block will process this last group; it is up to
   * the inner loop to not process the non-existing elements after 399.
   *
   * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
   * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will
   * process one group while block 7 will not process any.
   *
   * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the
   * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will
   * process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block 3
   * will process group 3.
   *
   * See `uniform_elements_along<Dim>(acc, ...)` for a concrete example using `uniform_groups_along<Dim>` and
   * `uniform_group_elements_along<Dim>`.
   */

  template <typename TAcc,
            std::size_t Dim,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
  class uniform_groups_along {
  public:
    ALPAKA_FN_ACC inline uniform_groups_along(TAcc const& acc)
        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
          extent_{stride_} {}

    // extent is the total number of elements (not blocks)
    ALPAKA_FN_ACC inline uniform_groups_along(TAcc const& acc, Idx extent)
        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
          extent_{divide_up_by(extent, alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim])} {}

    class const_iterator;
    using iterator = const_iterator;

    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); }

    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); }

    class const_iterator {
      friend class uniform_groups_along;

      ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
          : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}

    public:
      ALPAKA_FN_ACC inline Idx operator*() const { return first_; }

      // pre-increment the iterator
      ALPAKA_FN_ACC inline const_iterator& operator++() {
        // increment the first-element-in-block index by the grid stride
        first_ += stride_;
        if (first_ < extent_)
          return *this;

        // the iterator has reached or passed the end of the extent, clamp it to the extent
        first_ = extent_;
        return *this;
      }

      // post-increment the iterator
      ALPAKA_FN_ACC inline const_iterator operator++(int) {
        const_iterator old = *this;
        ++(*this);
        return old;
      }

      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); }

      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

    private:
      // non-const to support iterator copy and assignment
      Idx stride_;
      Idx extent_;
      // modified by the pre/post-increment operator
      Idx first_;
    };

  private:
    const Idx first_;
    const Idx stride_;
    const Idx extent_;
  };

  /* uniform_groups
   *
   * `uniform_groups(acc, elements)` returns a one-dimensional iterable range that spans the group indices required to
   * cover the given problem size, in units of the block size. `elements` indicates the total number of elements,
   * across all groups; if not specified, it defaults to the kernel grid size.
   *
   * `uniform_groups(acc, ...)` is a shorthand for `uniform_groups_along<0>(acc, ...)`.
   *
   * `uniform_groups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block
   * see the same loop iterations, while threads in different blocks may see a different number of iterations.
   * If the work division has more blocks than the required number of groups, the first blocks will perform one
   * iteration of the loop, while the other blocks will exit the loop immediately.
   * If the work division has fewer blocks than the required number of groups, some of the blocks will perform more
   * than one iteration, in order to cover the whole problem space.
   *
   * If the problem size is not a multiple of the block size, the last group will process a number of elements smaller
   * than the block size. However, also in this case all threads in the block will execute the same number of
   * iterations of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to
   * the inner loop (or the user) to ensure that only the correct number of threads process any data; this logic is
   * implemented by `uniform_group_elements(acc, group, elements)`.
   *
   * For example, if the block size is 64 and there are 400 elements,
   *
   *   for (auto group : uniform_groups(acc, 400))
   *
   * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should cover
   * the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group, group 6,
   * should cover the elements from 384 to 399. All the threads of the block will process this last group; it is up to
   * the inner loop to not process the non-existing elements after 399.
   *
   * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
   * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will
   * process one group while block 7 will not process any.
   *
   * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the
   * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will
   * process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block 3
   * will process group 3.
   *
   * See `uniform_elements(acc, ...)` for a concrete example using `uniform_groups` and `uniform_group_elements`.
   *
   * Note that `uniform_groups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels, use
   *   - `uniform_groups_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
   *   - `uniform_groups_x(acc, ...)`, `uniform_groups_y(acc, ...)`, or `uniform_groups_z(acc, ...)` to loop
   *     along the fastest, second-fastest, or third-fastest dimension.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto uniform_groups(TAcc const& acc, TArgs... args) {
    return uniform_groups_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }

  /* uniform_groups_x, _y, _z
   *
   * Like `uniform_groups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest dimensions.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto uniform_groups_x(TAcc const& acc, TArgs... args) {
    return uniform_groups_along<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
  ALPAKA_FN_ACC inline auto uniform_groups_y(TAcc const& acc, TArgs... args) {
    return uniform_groups_along<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
  ALPAKA_FN_ACC inline auto uniform_groups_z(TAcc const& acc, TArgs... args) {
    return uniform_groups_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
  }

  /* blocks_with_stride
   *
   * `blocks_with_stride(acc, elements)` returns a one-dimensional iterable range that spans the group indices
   * required to cover the given problem size, in units of the block size. `elements` indicates the total number of
   * elements, across all groups; if not specified, it defaults to the kernel grid size.
   *
   * `blocks_with_stride(acc, ...)` is a legacy name for `uniform_groups(acc, ...)`.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto blocks_with_stride(TAcc const& acc, TArgs... args) {
    return uniform_groups_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }

  /* uniform_group_elements_along
   *
   * `uniform_group_elements_along<Dim>(acc, group, elements)` returns a one-dimensional iterable range that spans all
   * the elements within the given `group` along dimension `Dim`, as obtained from `uniform_groups_along<Dim>`, up to
   * `elements` (exclusive). `elements` indicates the total number of elements across all groups; if not specified, it
   * defaults to the kernel grid size.
   *
   * In a 1-dimensional kernel, `uniform_group_elements(acc, ...)` is a shorthand for
   * `uniform_group_elements_along<0>(acc, ...)`.
   *
   * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by
   * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
   * For convenience when converting CUDA or HIP code, `uniform_group_elements_x(acc, ...)`, `_y` and `_z` are
   * shorthands for `uniform_group_elements_along<N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
   *
   * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices of
   * the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded), while the
   * local index spans the range from 0 to the block size (excluded).
   *
   * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
   * global element index reaches `elements`.
   *
   * If the problem size is not a multiple of the block size, different threads may execute a different number of
   * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
   * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
   * See `uniform_elements_along<Dim>(acc, ...)` for a concrete example using `uniform_groups_along<Dim>` and
   * `uniform_group_elements_along<Dim>`.
   *
   * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
   * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the
   * kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the element
   * index explicitly inside the loop:
   *
   *   for (auto element : uniform_group_elements_along<N-1>(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
   *     bool flag = false;
   *     if (element < elements) {
   *       // do some work and compute a result flag only for the valid elements
   *       flag = do_some_work();
   *     }
   *     // check if any valid element had a positive result
   *     if (alpaka::warp::any(acc, flag)) {
   *       // ...
   *     }
   *   }
   *
   * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension, `N-1`.
   */

  template <typename TAcc,
            std::size_t Dim,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
  class uniform_group_elements_along {
  public:
    ALPAKA_FN_ACC inline uniform_group_elements_along(TAcc const& acc, Idx block)
        : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]},
          local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] *
                 alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
          range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]} {}

    ALPAKA_FN_ACC inline uniform_group_elements_along(TAcc const& acc, Idx block, Idx extent)
        : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]},
          local_{std::min(extent - first_,
                          alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] *
                              alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])},
          range_{std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])} {}

    class const_iterator;
    using iterator = const_iterator;

    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(local_, first_, range_); }

    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(range_, first_, range_); }

    class const_iterator {
      friend class uniform_group_elements_along;

      ALPAKA_FN_ACC inline const_iterator(Idx local, Idx first, Idx range)
          : index_{local}, first_{first}, range_{range} {}

    public:
      ALPAKA_FN_ACC inline ElementIndex operator*() const { return ElementIndex{index_ + first_, index_}; }

      // pre-increment the iterator
      ALPAKA_FN_ACC inline const_iterator& operator++() {
        if constexpr (requires_single_thread_per_block_v<TAcc>) {
          // increment the index along the elements processed by the current thread
          ++index_;
          if (index_ < range_)
            return *this;
        }

        // the iterator has reached or passed the end of the extent, clamp it to the extent
        index_ = range_;
        return *this;
      }

      // post-increment the iterator
      ALPAKA_FN_ACC inline const_iterator operator++(int) {
        const_iterator old = *this;
        ++(*this);
        return old;
      }

      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (index_ == other.index_); }

      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

    private:
      // modified by the pre/post-increment operator
      Idx index_;
      // non-const to support iterator copy and assignment
      Idx first_;
      Idx range_;
    };

  private:
    const Idx first_;
    const Idx local_;
    const Idx range_;
  };

  /* uniform_group_elements
   *
   * `uniform_group_elements(acc, group, elements)` returns a one-dimensional iterable range that spans all the
   * elements within the given `group`, as obtained from `uniform_groups`, up to `elements` (exclusive). `elements`
   * indicates the total number of elements across all groups; if not specified, it defaults to the kernel grid size.
   *
   * `uniform_group_elements(acc, ...)` is a shorthand for `uniform_group_elements_along<0>(acc, ...)`.
   *
   * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices of
   * the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded), while the
   * local index spans the range from 0 to the block size (excluded).
   *
   * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
   * global element index reaches `elements`.
   *
   * If the problem size is not a multiple of the block size, different threads may execute a different number of
   * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
   * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
   * See `uniform_elements(acc, ...)` for a concrete example using `uniform_groups` and `uniform_group_elements`.
   *
   * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
   * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the
   * kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the element
   * index explicitly inside the loop:
   *
   *   for (auto element : uniform_group_elements(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
   *     bool flag = false;
   *     if (element < elements) {
   *       // do some work and compute a result flag only for the valid elements
   *       flag = do_some_work();
   *     }
   *     // check if any valid element had a positive result
   *     if (alpaka::warp::any(acc, flag)) {
   *       // ...
   *     }
   *   }
   *
   * Note that `uniform_group_elements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional
   * kernels, use
   *   - `uniform_group_elements_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
   *   - `uniform_group_elements_x(acc, ...)`, `uniform_group_elements_y(acc, ...)`, or
   *     `uniform_group_elements_z(acc, ...)` to loop along the fastest, second-fastest, or third-fastest dimension.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto uniform_group_elements(TAcc const& acc, TArgs... args) {
    return uniform_group_elements_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }
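
  // A two-phase sketch (illustrative; `input`, `output`, `buffer` and `f` are hypothetical) combining
  // `uniform_groups` and `uniform_group_elements`, with a block-level synchronisation between the phases:
  //
  //   for (auto group : uniform_groups(acc, size)) {
  //     for (auto element : uniform_group_elements(acc, group, size)) {
  //       buffer[element.local] = input[element.global];
  //     }
  //     alpaka::syncBlockThreads();
  //     for (auto element : uniform_group_elements(acc, group, size)) {
  //       output[element.global] = f(buffer[element.local]);
  //     }
  //   }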

  /* uniform_group_elements_x, _y, _z
   *
   * Like `uniform_group_elements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
   * dimensions.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto uniform_group_elements_x(TAcc const& acc, TArgs... args) {
    return uniform_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
  ALPAKA_FN_ACC inline auto uniform_group_elements_y(TAcc const& acc, TArgs... args) {
    return uniform_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
  ALPAKA_FN_ACC inline auto uniform_group_elements_z(TAcc const& acc, TArgs... args) {
    return uniform_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
  }

  /* elements_in_block
   *
   * `elements_in_block(acc, group, elements)` returns a one-dimensional iterable range that spans all the elements
   * within the given `group`, as obtained from `uniform_groups`, up to `elements` (exclusive). `elements` indicates
   * the total number of elements across all groups; if not specified, it defaults to the kernel grid size.
   *
   * `elements_in_block(acc, ...)` is a legacy name for `uniform_group_elements(acc, ...)`.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto elements_in_block(TAcc const& acc, TArgs... args) {
    return uniform_group_elements_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }

  /* independent_groups_along
   *
   * `independent_groups_along<Dim>(acc, groups)` returns a one-dimensional iterable range that spans the group
   * indices from 0 to `groups`; the groups are assigned to the blocks along the `Dim` dimension. If `groups` is not
   * specified, it defaults to the number of blocks along the `Dim` dimension.
   *
   * In a 1-dimensional kernel, `independent_groups(acc, ...)` is a shorthand for
   * `independent_groups_along<0>(acc, ...)`.
   *
   * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by
   * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
   * For convenience when converting CUDA or HIP code, `independent_groups_x(acc, ...)`, `_y` and `_z` are shorthands
   * for `independent_groups_along<N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
   *
   * `independent_groups_along<Dim>` should be called consistently by all the threads in a block. All threads in a
   * block see the same loop iterations, while threads in different blocks may see a different number of iterations.
   * If the work division has more blocks than the required number of groups, the first blocks will perform one
   * iteration of the loop, while the other blocks will exit the loop immediately.
   * If the work division has fewer blocks than the required number of groups, some of the blocks will perform more
   * than one iteration, in order to cover the whole problem space.
   *
   * For example,
   *
   *   for (auto group : independent_groups_along<Dim>(acc, 7))
   *
   * will return the group range from 0 to 6, distributed across all blocks in the work division.
   * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
   * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will
   * process one group while block 7 will not process any.
   * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the
   * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will
   * process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block 3
   * will process group 3.
   */

  template <typename TAcc,
            std::size_t Dim,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
  class independent_groups_along {
  public:
    ALPAKA_FN_ACC inline independent_groups_along(TAcc const& acc)
        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
          extent_{stride_} {}

    ALPAKA_FN_ACC inline independent_groups_along(TAcc const& acc, Idx groups)
        : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
          stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
          extent_{groups} {}

    class const_iterator;
    using iterator = const_iterator;

    ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); }

    ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); }

    class const_iterator {
      friend class independent_groups_along;

      ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
          : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}

    public:
      ALPAKA_FN_ACC inline Idx operator*() const { return first_; }

      // pre-increment the iterator
      ALPAKA_FN_ACC inline const_iterator& operator++() {
        // increment the first-element-in-block index by the grid stride
        first_ += stride_;
        if (first_ < extent_)
          return *this;

        // the iterator has reached or passed the end of the extent, clamp it to the extent
        first_ = extent_;
        return *this;
      }

      // post-increment the iterator
      ALPAKA_FN_ACC inline const_iterator operator++(int) {
        const_iterator old = *this;
        ++(*this);
        return old;
      }

      ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); }

      ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

    private:
      // non-const to support iterator copy and assignment
      Idx stride_;
      Idx extent_;
      // modified by the pre/post-increment operator
      Idx first_;
    };

  private:
    const Idx first_;
    const Idx stride_;
    const Idx extent_;
  };

  /* independent_groups
   *
   * `independent_groups(acc, groups)` returns a one-dimensional iterable range that spans the group indices from 0 to
   * `groups`. If `groups` is not specified, it defaults to the number of blocks.
   *
   * `independent_groups(acc, ...)` is a shorthand for `independent_groups_along<0>(acc, ...)`.
   *
   * `independent_groups(acc, ...)` should be called consistently by all the threads in a block. All threads in a
   * block see the same loop iterations, while threads in different blocks may see a different number of iterations.
   * If the work division has more blocks than the required number of groups, the first blocks will perform one
   * iteration of the loop, while the other blocks will exit the loop immediately.
   * If the work division has fewer blocks than the required number of groups, some of the blocks will perform more
   * than one iteration, in order to cover the whole problem space.
   *
   * For example,
   *
   *   for (auto group : independent_groups(acc, 7))
   *
   * will return the group range from 0 to 6, distributed across all blocks in the work division.
   * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
   * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will
   * process one group while block 7 will not process any.
   * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the
   * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will
   * process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block 3
   * will process group 3.
   *
   * Note that `independent_groups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
   * use
   *   - `independent_groups_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
   *   - `independent_groups_x(acc, ...)`, `independent_groups_y(acc, ...)`, or `independent_groups_z(acc, ...)` to
   *     loop along the fastest, second-fastest, or third-fastest dimension.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
  ALPAKA_FN_ACC inline auto independent_groups(TAcc const& acc, TArgs... args) {
    return independent_groups_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
  }
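
  // A minimal sketch (illustrative; `nJets` is a hypothetical work-item count): assign each group to one unit of
  // work, independent of the number of elements per block:
  //
  //   for (Idx jet : independent_groups(acc, nJets)) {
  //     // all the threads in this block cooperate on the same jet
  //   }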

  /* independent_groups_x, _y, _z
   *
   * Like `independent_groups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
   * dimensions.
   */

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
  ALPAKA_FN_ACC inline auto independent_groups_x(TAcc const& acc, TArgs... args) {
    return independent_groups_along<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
  ALPAKA_FN_ACC inline auto independent_groups_y(TAcc const& acc, TArgs... args) {
    return independent_groups_along<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
  }

  template <typename TAcc,
            typename... TArgs,
            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
  ALPAKA_FN_ACC inline auto independent_groups_z(TAcc const& acc, TArgs... args) {
    return independent_groups_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
  }

  /* independent_group_elements_along
   *
   * `independent_group_elements_along<Dim>(acc [, first], extent)` returns a one-dimensional iterable range that
   * spans the element indices within the current block, from `first` (inclusive) to `extent` (exclusive), along the
   * `Dim` dimension.
   */

1177  template <typename TAcc,
1178  std::size_t Dim,
1179  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
1181  public:
1182  ALPAKA_FN_ACC inline independent_group_elements_along(TAcc const& acc)
1183  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
1184  thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
1185  stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
1186  extent_{stride_} {}
1187 
1188  ALPAKA_FN_ACC inline independent_group_elements_along(TAcc const& acc, Idx extent)
1189  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
1190  thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
1191  stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
1192  extent_{extent} {}
1193 
1194  ALPAKA_FN_ACC inline independent_group_elements_along(TAcc const& acc, Idx first, Idx extent)
1195  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
1196  thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_ + first},
1197  stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
1198  extent_{extent} {}
1199 
1200  class const_iterator;
1202 
1203  ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, thread_); }
1204 
1205  ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); }
1206 
1209 
1210  ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
1211  : elements_{elements},
1212  stride_{stride},
1213  extent_{extent},
1214  first_{std::min(first, extent)},
1215  index_{first_},
1216  range_{std::min(first + elements, extent)} {}
1217 
1218  public:
1219  ALPAKA_FN_ACC inline Idx operator*() const { return index_; }
1220 
1221  // pre-increment the iterator
1222  ALPAKA_FN_ACC inline const_iterator& operator++() {
1223  if constexpr (requires_single_thread_per_block_v<TAcc>) {
1224  // increment the index along the elements processed by the current thread
1225  ++index_;
1226  if (index_ < range_)
1227  return *this;
1228  }
1229 
1230  // increment the thread index with the block stride
1231  first_ += stride_;
1232  index_ = first_;
1233  range_ = std::min(first_ + elements_, extent_);
1234  if (index_ < extent_)
1235  return *this;
1236 
1237  // the iterator has reached or passed the end of the extent, clamp it to the extent
1238  first_ = extent_;
1239  index_ = extent_;
1240  range_ = extent_;
1241  return *this;
1242  }
1243 
1244  // post-increment the iterator
1245  ALPAKA_FN_ACC inline const_iterator operator++(int) {
1246  const_iterator old = *this;
1247  ++(*this);
1248  return old;
1249  }
1250 
1251  ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const {
1252  return (index_ == other.index_) and (first_ == other.first_);
1253  }
1254 
1255  ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }
1256 
1257  private:
1258  // non-const to support iterator copy and assignment
1259  Idx elements_;
1260  Idx stride_;
1261  Idx extent_;
1262  // modified by the pre/post-increment operator
1263  Idx first_;
1264  Idx index_;
1265  Idx range_;
1266  };
1267 
1268  private:
1269  const Idx elements_;
1270  const Idx thread_;
1271  const Idx stride_;
1272  const Idx extent_;
1273  };
1274 
1275  /* independent_group_elements
1276  */
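  /* `independent_group_elements(acc, ...)` is a shorthand for `independent_group_elements_along<TAcc, 0>(acc, ...)`
   * for 1-dimensional kernels.
   */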
1277 
1278  template <typename TAcc,
1279  typename... TArgs,
1280  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
1281  ALPAKA_FN_ACC inline auto independent_group_elements(TAcc const& acc, TArgs... args) {
1282  return independent_group_elements_along<TAcc, 0>(acc, static_cast<Idx>(args)...);
1283  }
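  /* For example, the two loop families can be combined so that each group works on an independent task while the
   * threads of the group loop over the elements of that task; a minimal sketch, assuming hypothetical `tasks`,
   * `task_size(task)` and `process(task, element)` provided by the caller:
   *
   *   for (auto task : independent_groups(acc, tasks)) {
   *     for (auto element : independent_group_elements(acc, task_size(task))) {
   *       process(task, element);
   *     }
   *   }
   */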
1284 
1285  /* independent_group_elements_x, _y, _z
1286  *
1287  * Like `independent_group_elements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
1288  * dimensions.
1289  */
1290 
1291  template <typename TAcc,
1292  typename... TArgs,
1293  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
1294  ALPAKA_FN_ACC inline auto independent_group_elements_x(TAcc const& acc, TArgs... args) {
1295  return independent_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
1296  }
1297 
1298  template <typename TAcc,
1299  typename... TArgs,
1300  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
1301  ALPAKA_FN_ACC inline auto independent_group_elements_y(TAcc const& acc, TArgs... args) {
1302  return independent_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
1303  }
1304 
1305  template <typename TAcc,
1306  typename... TArgs,
1307  typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
1308  ALPAKA_FN_ACC inline auto independent_group_elements_z(TAcc const& acc, TArgs... args) {
1309  return independent_group_elements_along<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
1310  }
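  /* For example, in a 2-dimensional kernel each group along the second-fastest dimension could handle one row, while
   * the threads of the group loop over the columns of that row along the fastest dimension; a minimal sketch,
   * assuming hypothetical `rows`, `columns` and `process(row, column)` provided by the caller:
   *
   *   for (auto row : independent_groups_y(acc, rows)) {
   *     for (auto column : independent_group_elements_x(acc, columns)) {
   *       process(row, column);
   *     }
   *   }
   */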
1311 
1312  /* once_per_grid
1313  *
1314  * `once_per_grid(acc)` returns true for a single thread within the kernel execution grid.
1315  *
1316  * Usually the condition is true for block 0 and thread 0, but these indices should not be relied upon.
1317  */
1318 
1319  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
1320  ALPAKA_FN_ACC inline constexpr bool once_per_grid(TAcc const& acc) {
1321  return alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) == Vec<alpaka::Dim<TAcc>>::zeros();
1322  }
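  /* For example, `once_per_grid` can be used to initialise a device-side output exactly once per kernel launch;
   * a minimal sketch, assuming a hypothetical `result` pointer passed as a kernel argument:
   *
   *   if (once_per_grid(acc)) {
   *     *result = 0.f;
   *   }
   */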
1323 
1324  /* once_per_block
1325  *
1326  * `once_per_block(acc)` returns true for a single thread within the block.
1327  *
1328  * Usually the condition is true for thread 0, but this index should not be relied upon.
1329  */
1330 
1331  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
1332  ALPAKA_FN_ACC inline constexpr bool once_per_block(TAcc const& acc) {
1333  return alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc) == Vec<alpaka::Dim<TAcc>>::zeros();
1334  }
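  /* For example, `once_per_block` can be used to initialise a block-shared variable before the threads of the block
   * use it; a minimal sketch using the alpaka block-shared memory and synchronisation primitives:
   *
   *   auto& sum = alpaka::declareSharedVar<float, __COUNTER__>(acc);
   *   if (once_per_block(acc)) {
   *     sum = 0.f;
   *   }
   *   alpaka::syncBlockThreads(acc);
   */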
1335 
1336 } // namespace cms::alpakatools
1337 
1338 #endif // HeterogeneousCore_AlpakaInterface_interface_workdivision_h