CMS 3D CMS Logo

workdivision.h
Go to the documentation of this file.
1 #ifndef HeterogeneousCore_AlpakaInterface_interface_workdivision_h
2 #define HeterogeneousCore_AlpakaInterface_interface_workdivision_h
3 
4 #include <type_traits>
5 
6 #include <alpaka/alpaka.hpp>
7 
10 
11 namespace cms::alpakatools {
12 
13  using namespace alpaka_common;
14 
15  // If the first argument is not a multiple of the second argument, round it up to the next multiple
16  inline constexpr Idx round_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor * divisor; }
17 
18  // Return the integer division of the first argument by the second argument, rounded up to the next integer
19  inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor; }
20 
21  // Trait describing whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped
22  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
23  struct requires_single_thread_per_block : public std::true_type {};
24 
#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
// CUDA accelerator: the trait evaluates to false
template <typename TDim>
struct requires_single_thread_per_block<alpaka::AccGpuCudaRt<TDim, Idx>> : std::integral_constant<bool, false> {};
#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED

#if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
// HIP accelerator: the trait evaluates to false
template <typename TDim>
struct requires_single_thread_per_block<alpaka::AccGpuHipRt<TDim, Idx>> : std::integral_constant<bool, false> {};
#endif  // ALPAKA_ACC_GPU_HIP_ENABLED

#if defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
// CPU threads accelerator: the trait evaluates to false
template <typename TDim>
struct requires_single_thread_per_block<alpaka::AccCpuThreads<TDim, Idx>> : std::integral_constant<bool, false> {};
#endif  // ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
39 
40  // Whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped
41  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
43 
44  // Create an accelerator-dependent work division for 1-dimensional kernels
45  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
47  if constexpr (not requires_single_thread_per_block_v<TAcc>) {
48  // On GPU backends, each thread is looking at a single element:
49  // - the number of threads per block is "elements";
50  // - the number of elements per thread is always 1.
51  return WorkDiv<Dim1D>(blocks, elements, Idx{1});
52  } else {
53  // On CPU backends, run serially with a single thread per block:
54  // - the number of threads per block is always 1;
55  // - the number of elements per thread is "elements".
56  return WorkDiv<Dim1D>(blocks, Idx{1}, elements);
57  }
58  }
59 
60  // Create the accelerator-dependent workdiv for N-dimensional kernels
61  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
62  inline WorkDiv<alpaka::Dim<TAcc>> make_workdiv(const Vec<alpaka::Dim<TAcc>>& blocks,
63  const Vec<alpaka::Dim<TAcc>>& elements) {
64  using Dim = alpaka::Dim<TAcc>;
65  if constexpr (not requires_single_thread_per_block_v<TAcc>) {
66  // On GPU backends, each thread is looking at a single element:
67  // - the number of threads per block is "elements";
68  // - the number of elements per thread is always 1.
70  } else {
71  // On CPU backends, run serially with a single thread per block:
72  // - the number of threads per block is always 1;
73  // - the number of elements per thread is "elements".
75  }
76  }
77 
78  /* ElementIndex
79  *
80  * an aggregate that contains the .global and .local indices of an element; returned by iterating over elements_in_block.
81  */
82 
83  struct ElementIndex {
86  };
87 
88  /* elements_with_stride
89  *
90  * `elements_with_stride(acc, [first, ]extent)` returns an iterable range that spans the element indices required to
91  * cover the given problem size:
92  * - `first` (optional) is the index of the first element; if not specified, the loop starts from 0;
93  * - `extent` is the total size of the problem, including any elements that may come before `first`.
94  */
95 
96  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
98  public:
99  ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc)
100  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
101  first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
102  stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
103  extent_{stride_} {}
104 
105  ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx extent)
106  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
107  first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
108  stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
109  extent_{extent} {}
110 
111  ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx first, Idx extent)
112  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
113  first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_ + first},
114  stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
115  extent_{extent} {}
116 
117  class iterator {
118  friend class elements_with_stride;
119 
120  ALPAKA_FN_ACC inline iterator(Idx elements, Idx stride, Idx extent, Idx first)
121  : elements_{elements},
122  stride_{stride},
123  extent_{extent},
124  first_{std::min(first, extent)},
125  index_{first_},
126  range_{std::min(first + elements, extent)} {}
127 
128  public:
129  ALPAKA_FN_ACC inline Idx operator*() const { return index_; }
130 
131  // pre-increment the iterator
132  ALPAKA_FN_ACC inline iterator& operator++() {
133  if constexpr (requires_single_thread_per_block_v<TAcc>) {
134  // increment the index along the elements processed by the current thread
135  ++index_;
136  if (index_ < range_)
137  return *this;
138  }
139 
140  // increment the thread index with the grid stride
141  first_ += stride_;
142  index_ = first_;
143  range_ = std::min(first_ + elements_, extent_);
144  if (index_ < extent_)
145  return *this;
146 
147  // the iterator has reached or passed the end of the extent, clamp it to the extent
148  first_ = extent_;
149  index_ = extent_;
150  range_ = extent_;
151  return *this;
152  }
153 
154  // post-increment the iterator
155  ALPAKA_FN_ACC inline iterator operator++(int) {
156  iterator old = *this;
157  ++(*this);
158  return old;
159  }
160 
161  ALPAKA_FN_ACC inline bool operator==(iterator const& other) const {
162  return (index_ == other.index_) and (first_ == other.first_);
163  }
164 
165  ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
166 
167  private:
168  // non-const to support iterator copy and assignment
172  // modified by the pre/post-increment operator
176  };
177 
178  ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); }
179 
180  ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }
181 
182  private:
183  const Idx elements_;
184  const Idx first_;
185  const Idx stride_;
186  const Idx extent_;
187  };
188 
189  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
191  public:
192  using Dim = alpaka::Dim<TAcc>;
193  using Vec = alpaka::Vec<Dim, Idx>;
194 
195  ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc)
196  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
197  thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
198  stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
199  extent_{stride_} {}
200 
201  ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc, Vec extent)
202  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
203  thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
204  stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
205  extent_{extent} {}
206 
207  // tag used to construct an end iterator
208  struct at_end_t {};
209 
210  class iterator {
212 
213  public:
214  ALPAKA_FN_ACC inline Vec operator*() const { return index_; }
215 
216  // pre-increment the iterator
217  ALPAKA_FN_ACC constexpr inline iterator operator++() {
218  increment();
219  return *this;
220  }
221 
222  // post-increment the iterator
223  ALPAKA_FN_ACC constexpr inline iterator operator++(int) {
224  iterator old = *this;
225  increment();
226  return old;
227  }
228 
229  ALPAKA_FN_ACC constexpr inline bool operator==(iterator const& other) const { return (index_ == other.index_); }
230 
231  ALPAKA_FN_ACC constexpr inline bool operator!=(iterator const& other) const { return not(*this == other); }
232 
233  private:
234  // construct an iterator pointing to the first element to be processed by the current thread
235  ALPAKA_FN_ACC inline iterator(elements_with_stride_nd const* loop, Vec first)
236  : loop_{loop},
237  first_{alpaka::elementwise_min(first, loop->extent_)},
238  range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)},
239  index_{first_} {}
240 
241  // construct an end iterator, pointing post the end of the extent
242  ALPAKA_FN_ACC inline iterator(elements_with_stride_nd const* loop, at_end_t const&)
243  : loop_{loop}, first_{loop_->extent_}, range_{loop_->extent_}, index_{loop_->extent_} {}
244 
245  template <size_t I>
246  ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() {
247  bool overflow = false;
248  ++index_[I];
249  if (index_[I] >= range_[I]) {
250  index_[I] = first_[I];
251  overflow = true;
252  }
253  return overflow;
254  }
255 
256  template <size_t N>
257  ALPAKA_FN_ACC inline constexpr bool do_elements_loops() {
258  if constexpr (N == 0) {
259  // overflow
260  return true;
261  } else {
262  if (not nth_elements_loop<N - 1>()) {
263  return false;
264  } else {
265  return do_elements_loops<N - 1>();
266  }
267  }
268  }
269 
270  template <size_t I>
271  ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() {
272  bool overflow = false;
273  first_[I] += loop_->stride_[I];
274  if (first_[I] >= loop_->extent_[I]) {
275  first_[I] = loop_->thread_[I];
276  overflow = true;
277  }
278  index_[I] = first_[I];
279  range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]);
280  return overflow;
281  }
282 
283  template <size_t N>
284  ALPAKA_FN_ACC inline constexpr bool do_strided_loops() {
285  if constexpr (N == 0) {
286  // overflow
287  return true;
288  } else {
289  if (not nth_strided_loop<N - 1>()) {
290  return false;
291  } else {
292  return do_strided_loops<N - 1>();
293  }
294  }
295  }
296 
297  // increment the iterator
298  ALPAKA_FN_ACC inline constexpr void increment() {
299  if constexpr (requires_single_thread_per_block_v<TAcc>) {
300  // linear N-dimensional loops over the elements associated to the thread;
301  // do_elements_loops<>() returns true if any of those loops overflows
302  if (not do_elements_loops<Dim::value>()) {
303  // the elements loops did not overflow, return the next index
304  return;
305  }
306  }
307 
308  // strided N-dimensional loop over the threads in the kernel launch grid;
309  // do_strided_loops<>() returns true if any of those loops overflows
310  if (not do_strided_loops<Dim::value>()) {
311  // the strided loops did not overflow, return the next index
312  return;
313  }
314 
315  // the iterator has reached or passed the end of the extent, clamp it to the extent
316  first_ = loop_->extent_;
317  range_ = loop_->extent_;
318  index_ = loop_->extent_;
319  }
320 
321  // const pointer to the elements_with_stride_nd that the iterator refers to
323 
324  // modified by the pre/post-increment operator
325  Vec first_; // first element processed by this thread
326  Vec range_; // one past the last element processed by this thread, clamped to the extent
327  Vec index_; // current element processed by this thread
328  };
329 
330  ALPAKA_FN_ACC inline iterator begin() const {
331  // check that all dimensions of the current thread index are within the extent
332  if ((thread_ < extent_).all()) {
333  // construct an iterator pointing to the first element to be processed by the current thread
334  return iterator{this, thread_};
335  } else {
336  // construct an end iterator, pointing post the end of the extent
337  return iterator{this, at_end_t{}};
338  }
339  }
340 
341  ALPAKA_FN_ACC inline iterator end() const {
342  // construct an end iterator, pointing post the end of the extent
343  return iterator{this, at_end_t{}};
344  }
345 
346  private:
347  const Vec elements_;
348  const Vec thread_;
349  const Vec stride_;
350  const Vec extent_;
351  };
352 
353  /* blocks_with_stride
354  *
355  * `blocks_with_stride(acc, size)` returns a range that spans the (virtual) block indices required to cover the given
356  * problem size.
357  *
358  * For example, if size is 1000 and the block size is 16, it will return the range from 0 to 62.
359  * If the work division has more than 63 blocks, only the first 63 will perform one iteration of the loop, and the
360  * others will exit immediately.
361  * If the work division has fewer than 63 blocks, some of the blocks will perform more than one iteration, in order to
362  * cover the whole problem space.
363  *
364  * All threads in a block see the same loop iterations, while threads in different blocks may see a different number
365  * of iterations.
366  */
367 
368  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
370  public:
371  ALPAKA_FN_ACC inline blocks_with_stride(TAcc const& acc)
372  : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
373  stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
374  extent_{stride_} {}
375 
376  // extent is the total number of elements (not blocks)
377  ALPAKA_FN_ACC inline blocks_with_stride(TAcc const& acc, Idx extent)
378  : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
379  stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
380  extent_{divide_up_by(extent, alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u])} {}
381 
382  class iterator {
383  friend class blocks_with_stride;
384 
385  ALPAKA_FN_ACC inline iterator(Idx stride, Idx extent, Idx first)
386  : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}
387 
388  public:
389  ALPAKA_FN_ACC inline Idx operator*() const { return first_; }
390 
391  // pre-increment the iterator
392  ALPAKA_FN_ACC inline iterator& operator++() {
393  // increment the first-element-in-block index by the grid stride
394  first_ += stride_;
395  if (first_ < extent_)
396  return *this;
397 
398  // the iterator has reached or passed the end of the extent, clamp it to the extent
399  first_ = extent_;
400  return *this;
401  }
402 
403  // post-increment the iterator
404  ALPAKA_FN_ACC inline iterator operator++(int) {
405  iterator old = *this;
406  ++(*this);
407  return old;
408  }
409 
410  ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { return (first_ == other.first_); }
411 
412  ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
413 
414  private:
415  // non-const to support iterator copy and assignment
418  // modified by the pre/post-increment operator
420  };
421 
422  ALPAKA_FN_ACC inline iterator begin() const { return iterator(stride_, extent_, first_); }
423 
424  ALPAKA_FN_ACC inline iterator end() const { return iterator(stride_, extent_, extent_); }
425 
426  private:
427  const Idx first_;
428  const Idx stride_;
429  const Idx extent_;
430  };
431 
432  /* elements_in_block
433  *
434  * `elements_in_block(acc, block, size)` returns a range that spans all the elements within the given block.
435  * Iterating over the range yields values of type ElementIndex, that contain both .global and .local indices
436  * of the corresponding element.
437  *
438  * If the work division has only one element per thread, the loop will perform at most one iteration.
439  * If the work division has more than one element per thread, the loop will perform that number of iterations,
440  * or fewer if it reaches size.
441  */
442 
443  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
445  public:
446  ALPAKA_FN_ACC inline elements_in_block(TAcc const& acc, Idx block)
447  : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]},
448  local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] *
449  alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
450  range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]} {}
451 
452  ALPAKA_FN_ACC inline elements_in_block(TAcc const& acc, Idx block, Idx extent)
453  : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]},
454  local_{std::min(extent - first_,
455  alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] *
456  alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u])},
457  range_{std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u])} {}
458 
459  class iterator {
460  friend class elements_in_block;
461 
462  ALPAKA_FN_ACC inline iterator(Idx local, Idx first, Idx range) : index_{local}, first_{first}, range_{range} {}
463 
464  public:
465  ALPAKA_FN_ACC inline ElementIndex operator*() const { return ElementIndex{index_ + first_, index_}; }
466 
467  // pre-increment the iterator
468  ALPAKA_FN_ACC inline iterator& operator++() {
469  if constexpr (requires_single_thread_per_block_v<TAcc>) {
470  // increment the index along the elements processed by the current thread
471  ++index_;
472  if (index_ < range_)
473  return *this;
474  }
475 
476  // the iterator has reached or passed the end of the extent, clamp it to the extent
477  index_ = range_;
478  return *this;
479  }
480 
481  // post-increment the iterator
482  ALPAKA_FN_ACC inline iterator operator++(int) {
483  iterator old = *this;
484  ++(*this);
485  return old;
486  }
487 
488  ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { return (index_ == other.index_); }
489 
490  ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
491 
492  private:
493  // modified by the pre/post-increment operator
495  // non-const to support iterator copy and assignment
498  };
499 
500  ALPAKA_FN_ACC inline iterator begin() const { return iterator(local_, first_, range_); }
501 
502  ALPAKA_FN_ACC inline iterator end() const { return iterator(range_, first_, range_); }
503 
504  private:
505  const Idx first_;
506  const Idx local_;
507  const Idx range_;
508  };
509 
510  /* uniform_groups
511  *
512  * `uniform_groups(acc, elements)` returns a range that spans the group indices required to cover the given problem
513  * size, in units of the block size:
514  * - the `elements` argument indicates the total number of elements, across all groups.
515  *
516  * `uniform_groups` should be called consistently by all the threads in a block. All threads in a block see the same
517  * loop iterations, while threads in different blocks may see a different number of iterations.
518  *
519  * For example, if `elements` is 1000 and the block size is 16,
520  *
521  * for (auto group : uniform_groups(acc, 1000))
522  *
523  * will return the range from 0 to 62, split across all blocks in the work division.
524  *
525  * If the work division has more than 63 blocks, the first 63 will perform one iteration of the loop, while the other
526  * blocks will exit immediately.
527  * If the work division has fewer than 63 blocks, some of the blocks will perform more than one iteration, in order to
528  * cover the whole problem space.
529  */
530 
531  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
533 
534  /* uniform_group_elements
535  *
536  * `uniform_group_elements(acc, group, elements)` returns a range that spans all the elements within the given group:
537  * - the `group` argument indicates the id of the current group, for example as obtained from `uniform_groups`;
538  * - the `elements` argument indicates the total number of elements, across all groups.
539  *
540  * Iterating over the range yields values of type `ElementIndex`, that contain the `.global` and `.local` indices of
541  * the corresponding element.
542  *
543  * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier when the
544  * element index reaches `elements`.
545  */
546 
547  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
549 
550  /* independent_groups
551  *
552  * `independent_groups(acc, groups)` returns a range that spans the group indices from 0 to `groups - 1`, with one group
553  * per block:
554  * - the `groups` argument indicates the total number of groups.
555  *
556  * If the work division has more blocks than `groups`, only the first `groups` blocks will perform one iteration of
557  * the loop, while the other blocks will exit immediately.
558  * If the work division has fewer blocks than `groups`, some of the blocks will perform more than one iteration, in
559  * order to cover the whole problem space.
560  */
561 
562  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
564  public:
565  ALPAKA_FN_ACC inline independent_groups(TAcc const& acc)
566  : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
567  stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
568  extent_{stride_} {}
569 
570  // groups is the total number of groups to iterate over (not the number of elements)
571  ALPAKA_FN_ACC inline independent_groups(TAcc const& acc, Idx groups)
572  : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
573  stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
574  extent_{groups} {}
575 
576  class iterator {
577  friend class independent_groups;
578 
579  ALPAKA_FN_ACC inline iterator(Idx stride, Idx extent, Idx first)
580  : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}
581 
582  public:
583  ALPAKA_FN_ACC inline Idx operator*() const { return first_; }
584 
585  // pre-increment the iterator
586  ALPAKA_FN_ACC inline iterator& operator++() {
587  // increment the first-element-in-block index by the grid stride
588  first_ += stride_;
589  if (first_ < extent_)
590  return *this;
591 
592  // the iterator has reached or passed the end of the extent, clamp it to the extent
593  first_ = extent_;
594  return *this;
595  }
596 
597  // post-increment the iterator
598  ALPAKA_FN_ACC inline iterator operator++(int) {
599  iterator old = *this;
600  ++(*this);
601  return old;
602  }
603 
604  ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { return (first_ == other.first_); }
605 
606  ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
607 
608  private:
609  // non-const to support iterator copy and assignment
612  // modified by the pre/post-increment operator
614  };
615 
616  ALPAKA_FN_ACC inline iterator begin() const { return iterator(stride_, extent_, first_); }
617 
618  ALPAKA_FN_ACC inline iterator end() const { return iterator(stride_, extent_, extent_); }
619 
620  private:
621  const Idx first_;
622  const Idx stride_;
623  const Idx extent_;
624  };
625 
626  /* independent_group_elements
627  *
628  * `independent_group_elements(acc, elements)` returns a range that spans all the elements within the given group:
629  * - the `elements` argument indicates the number of elements in the current group.
630  *
631  * Iterating over the range yields the local element index, between `0` and `elements - 1`. The threads in the block
632  * will perform one or more iterations, depending on the number of elements per thread, and on the number of threads
633  * per block, compared with the total number of elements.
634  */
635 
636  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
638  public:
639  ALPAKA_FN_ACC inline independent_group_elements(TAcc const& acc)
640  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
641  thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_},
642  stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_},
643  extent_{stride_} {}
644 
645  ALPAKA_FN_ACC inline independent_group_elements(TAcc const& acc, Idx extent)
646  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
647  thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_},
648  stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_},
649  extent_{extent} {}
650 
651  class iterator {
653 
654  ALPAKA_FN_ACC inline iterator(Idx elements, Idx stride, Idx extent, Idx first)
655  : elements_{elements},
656  stride_{stride},
657  extent_{extent},
658  first_{std::min(first, extent)},
659  index_{first_},
660  range_{std::min(first + elements, extent)} {}
661 
662  public:
663  ALPAKA_FN_ACC inline Idx operator*() const { return index_; }
664 
665  // pre-increment the iterator
666  ALPAKA_FN_ACC inline iterator& operator++() {
667  if constexpr (requires_single_thread_per_block_v<TAcc>) {
668  // increment the index along the elements processed by the current thread
669  ++index_;
670  if (index_ < range_)
671  return *this;
672  }
673 
674  // increment the thread index with the block stride
675  first_ += stride_;
676  index_ = first_;
677  range_ = std::min(first_ + elements_, extent_);
678  if (index_ < extent_)
679  return *this;
680 
681  // the iterator has reached or passed the end of the extent, clamp it to the extent
682  first_ = extent_;
683  index_ = extent_;
684  range_ = extent_;
685  return *this;
686  }
687 
688  // post-increment the iterator
689  ALPAKA_FN_ACC inline iterator operator++(int) {
690  iterator old = *this;
691  ++(*this);
692  return old;
693  }
694 
695  ALPAKA_FN_ACC inline bool operator==(iterator const& other) const {
696  return (index_ == other.index_) and (first_ == other.first_);
697  }
698 
699  ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
700 
701  private:
702  // non-const to support iterator copy and assignment
706  // modified by the pre/post-increment operator
710  };
711 
712  ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, thread_); }
713 
714  ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }
715 
716  private:
717  const Idx elements_;
718  const Idx thread_;
719  const Idx stride_;
720  const Idx extent_;
721  };
722 
723  /* once_per_grid
724  *
725  * `once_per_grid(acc)` returns true for a single thread within the kernel execution grid.
726  *
727  * Usually the condition is true for block 0 and thread 0, but these indices should not be relied upon.
728  */
729 
730  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
731  ALPAKA_FN_ACC inline constexpr bool once_per_grid(TAcc const& acc) {
732  return alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) == Vec<alpaka::Dim<TAcc>>::zeros();
733  }
734 
735  /* once_per_block
736  *
737  * `once_per_block(acc)` returns true for a single thread within the block.
738  *
739  * Usually the condition is true for thread 0, but this index should not be relied upon.
740  */
741 
742  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
743  ALPAKA_FN_ACC inline constexpr bool once_per_block(TAcc const& acc) {
744  return alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc) == Vec<alpaka::Dim<TAcc>>::zeros();
745  }
746 
747 } // namespace cms::alpakatools
748 
749 #endif // HeterogeneousCore_AlpakaInterface_interface_workdivision_h
ALPAKA_FN_ACC constexpr bool nth_strided_loop()
Definition: workdivision.h:271
ALPAKA_FN_ACC bool operator==(iterator const &other) const
Definition: workdivision.h:604
ALPAKA_FN_ACC independent_groups(TAcc const &acc)
Definition: workdivision.h:565
ALPAKA_FN_ACC elements_with_stride_nd(TAcc const &acc, Vec extent)
Definition: workdivision.h:201
ALPAKA_FN_ACC iterator(Idx elements, Idx stride, Idx extent, Idx first)
Definition: workdivision.h:654
ALPAKA_FN_ACC constexpr bool once_per_block(TAcc const &acc)
Definition: workdivision.h:743
ALPAKA_FN_ACC constexpr bool operator==(iterator const &other) const
Definition: workdivision.h:229
ALPAKA_FN_ACC iterator(Idx stride, Idx extent, Idx first)
Definition: workdivision.h:579
ALPAKA_FN_ACC elements_with_stride(TAcc const &acc)
Definition: workdivision.h:99
ALPAKA_FN_ACC elements_in_block(TAcc const &acc, Idx block, Idx extent)
Definition: workdivision.h:452
WorkDiv< Dim1D > make_workdiv(Idx blocks, Idx elements)
Definition: workdivision.h:46
def all(container)
workaround iterator generators for ROOT classes
Definition: cmstools.py:25
constexpr Idx divide_up_by(Idx value, Idx divisor)
Definition: workdivision.h:19
ALPAKA_FN_ACC elements_with_stride(TAcc const &acc, Idx extent)
Definition: workdivision.h:105
ALPAKA_FN_ACC bool operator==(iterator const &other) const
Definition: workdivision.h:410
ALPAKA_FN_ACC iterator begin() const
Definition: workdivision.h:500
uint32_t Idx
Definition: config.h:14
ALPAKA_FN_ACC bool operator!=(iterator const &other) const
Definition: workdivision.h:606
ALPAKA_FN_ACC iterator begin() const
Definition: workdivision.h:178
ALPAKA_FN_ACC ElementIndex operator*() const
Definition: workdivision.h:465
ALPAKA_FN_ACC iterator(Idx local, Idx first, Idx range)
Definition: workdivision.h:462
ALPAKA_FN_ACC independent_group_elements(TAcc const &acc)
Definition: workdivision.h:639
ALPAKA_FN_ACC iterator begin() const
Definition: workdivision.h:616
ALPAKA_FN_ACC iterator end() const
Definition: workdivision.h:618
ALPAKA_FN_ACC iterator end() const
Definition: workdivision.h:424
ALPAKA_FN_ACC elements_with_stride_nd(TAcc const &acc)
Definition: workdivision.h:195
ALPAKA_FN_ACC bool operator!=(iterator const &other) const
Definition: workdivision.h:490
alpaka::WorkDivMembers< TDim, Idx > WorkDiv
Definition: config.h:31
ALPAKA_FN_ACC iterator(elements_with_stride_nd const *loop, at_end_t const &)
Definition: workdivision.h:242
ALPAKA_FN_ACC constexpr bool nth_elements_loop()
Definition: workdivision.h:246
ALPAKA_FN_ACC iterator end() const
Definition: workdivision.h:502
ALPAKA_FN_ACC iterator end() const
Definition: workdivision.h:180
constexpr Idx round_up_by(Idx value, Idx divisor)
Definition: workdivision.h:16
ALPAKA_FN_ACC iterator(Idx elements, Idx stride, Idx extent, Idx first)
Definition: workdivision.h:120
ALPAKA_FN_ACC bool operator!=(iterator const &other) const
Definition: workdivision.h:699
const std::complex< double > I
Definition: I.h:8
ALPAKA_FN_ACC iterator begin() const
Definition: workdivision.h:712
Definition: value.py:1
ALPAKA_FN_ACC bool operator!=(iterator const &other) const
Definition: workdivision.h:412
ALPAKA_FN_ACC iterator end() const
Definition: workdivision.h:341
ALPAKA_FN_ACC bool operator==(iterator const &other) const
Definition: workdivision.h:695
ALPAKA_FN_ACC constexpr iterator operator++()
Definition: workdivision.h:217
ALPAKA_FN_ACC independent_groups(TAcc const &acc, Idx groups)
Definition: workdivision.h:571
ALPAKA_FN_ACC iterator(Idx stride, Idx extent, Idx first)
Definition: workdivision.h:385
ALPAKA_FN_ACC iterator begin() const
Definition: workdivision.h:422
#define N
Definition: blowfish.cc:9
ALPAKA_FN_ACC elements_with_stride(TAcc const &acc, Idx first, Idx extent)
Definition: workdivision.h:111
ALPAKA_FN_ACC bool operator==(iterator const &other) const
Definition: workdivision.h:488
alpaka::Vec< TDim, Idx > Vec
Definition: config.h:24
ALPAKA_FN_ACC elements_in_block(TAcc const &acc, Idx block)
Definition: workdivision.h:446
ALPAKA_FN_ACC constexpr bool once_per_grid(TAcc const &acc)
Definition: workdivision.h:731
ALPAKA_FN_ACC blocks_with_stride(TAcc const &acc, Idx extent)
Definition: workdivision.h:377
ALPAKA_FN_ACC constexpr bool do_elements_loops()
Definition: workdivision.h:257
ALPAKA_FN_ACC bool operator==(iterator const &other) const
Definition: workdivision.h:161
ALPAKA_FN_ACC iterator operator++(int)
Definition: workdivision.h:482
ALPAKA_FN_ACC independent_group_elements(TAcc const &acc, Idx extent)
Definition: workdivision.h:645
ALPAKA_FN_ACC iterator begin() const
Definition: workdivision.h:330
ALPAKA_FN_ACC iterator end() const
Definition: workdivision.h:714
ALPAKA_FN_ACC constexpr iterator operator++(int)
Definition: workdivision.h:223
constexpr bool requires_single_thread_per_block_v
Definition: workdivision.h:42
ALPAKA_FN_ACC iterator(elements_with_stride_nd const *loop, Vec first)
Definition: workdivision.h:235
ALPAKA_FN_ACC blocks_with_stride(TAcc const &acc)
Definition: workdivision.h:371
ALPAKA_FN_ACC constexpr bool do_strided_loops()
Definition: workdivision.h:284
ALPAKA_FN_ACC constexpr bool operator!=(iterator const &other) const
Definition: workdivision.h:231
ALPAKA_FN_ACC bool operator!=(iterator const &other) const
Definition: workdivision.h:165