CMS 3D CMS Logo

workdivision.h
Go to the documentation of this file.
1 #ifndef HeterogeneousCore_AlpakaInterface_interface_workdivision_h
2 #define HeterogeneousCore_AlpakaInterface_interface_workdivision_h
3 
4 #include <type_traits>
5 
6 #include <alpaka/alpaka.hpp>
7 
10 
11 namespace cms::alpakatools {
12 
13  using namespace alpaka_common;
14 
// Round "value" up to the nearest multiple of "divisor".
// A value that is already a multiple of "divisor" is returned unchanged.
//
// The result is derived from the remainder instead of evaluating "value + divisor - 1":
// with an unsigned Idx that sum wraps around when "value" is within "divisor" of the
// largest representable Idx, which would silently yield a result smaller than "value".
//
// Precondition: divisor != 0.
inline constexpr Idx round_up_by(Idx value, Idx divisor) {
  const Idx remainder = value % divisor;
  return remainder == 0 ? value : value + (divisor - remainder);
}
17 
// Integer division of "value" by "divisor", rounded up to the next integer.
//
// The quotient and the remainder are evaluated separately instead of computing
// "(value + divisor - 1) / divisor": with an unsigned Idx the intermediate sum wraps
// around for values close to the largest representable Idx and gives a wrong, far
// too small result.
//
// Precondition: divisor != 0.
inline constexpr Idx divide_up_by(Idx value, Idx divisor) {
  return value / divisor + (value % divisor != 0 ? Idx{1} : Idx{0});
}
20 
21  // Trait describing whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped
22  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
23  struct requires_single_thread_per_block : public std::true_type {};
24 
25 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
26  template <typename TDim>
27  struct requires_single_thread_per_block<alpaka::AccGpuCudaRt<TDim, Idx>> : public std::false_type {};
28 #endif // ALPAKA_ACC_GPU_CUDA_ENABLED
29 
30 #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
31  template <typename TDim>
32  struct requires_single_thread_per_block<alpaka::AccGpuHipRt<TDim, Idx>> : public std::false_type {};
33 #endif // ALPAKA_ACC_GPU_HIP_ENABLED
34 
35  // Whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped
36  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
38 
39  // Create an accelerator-dependent work division for 1-dimensional kernels
40  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
42  if constexpr (not requires_single_thread_per_block_v<TAcc>) {
43  // On GPU backends, each thread is looking at a single element:
44  // - the number of threads per block is "elements";
45  // - the number of elements per thread is always 1.
46  return WorkDiv<Dim1D>(blocks, elements, Idx{1});
47  } else {
48  // On CPU backends, run serially with a single thread per block:
49  // - the number of threads per block is always 1;
50  // - the number of elements per thread is "elements".
51  return WorkDiv<Dim1D>(blocks, Idx{1}, elements);
52  }
53  }
54 
55  // Create the accelerator-dependent workdiv for N-dimensional kernels
56  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
57  inline WorkDiv<alpaka::Dim<TAcc>> make_workdiv(const Vec<alpaka::Dim<TAcc>>& blocks,
58  const Vec<alpaka::Dim<TAcc>>& elements) {
59  using Dim = alpaka::Dim<TAcc>;
60  if constexpr (not requires_single_thread_per_block_v<TAcc>) {
61  // On GPU backends, each thread is looking at a single element:
62  // - the number of threads per block is "elements";
63  // - the number of elements per thread is always 1.
65  } else {
66  // On CPU backends, run serially with a single thread per block:
67  // - the number of threads per block is always 1;
68  // - the number of elements per thread is "elements".
70  }
71  }
72 
73  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
75  public:
76  ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc)
77  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
78  first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
79  stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
80  extent_{stride_} {}
81 
82  ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx extent)
83  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
84  first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
85  stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
86  extent_{extent} {}
87 
88  class iterator {
89  friend class elements_with_stride;
90 
91  ALPAKA_FN_ACC inline iterator(Idx elements, Idx stride, Idx extent, Idx first)
92  : elements_{elements},
93  stride_{stride},
94  extent_{extent},
95  first_{std::min(first, extent)},
96  index_{first_},
97  last_{std::min(first + elements, extent)} {}
98 
99  public:
100  ALPAKA_FN_ACC inline Idx operator*() const { return index_; }
101 
102  // pre-increment the iterator
103  ALPAKA_FN_ACC inline iterator& operator++() {
104  if constexpr (requires_single_thread_per_block_v<TAcc>) {
105  // increment the index along the elements processed by the current thread
106  ++index_;
107  if (index_ < last_)
108  return *this;
109  }
110 
111  // increment the thread index with the grid stride
112  first_ += stride_;
113  index_ = first_;
114  last_ = std::min(first_ + elements_, extent_);
115  if (index_ < extent_)
116  return *this;
117 
118  // the iterator has reached or passed the end of the extent, clamp it to the extent
119  first_ = extent_;
120  index_ = extent_;
121  last_ = extent_;
122  return *this;
123  }
124 
125  // post-increment the iterator
126  ALPAKA_FN_ACC inline iterator operator++(int) {
127  iterator old = *this;
128  ++(*this);
129  return old;
130  }
131 
132  ALPAKA_FN_ACC inline bool operator==(iterator const& other) const {
133  return (index_ == other.index_) and (first_ == other.first_);
134  }
135 
136  ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
137 
138  private:
139  // non-const to support iterator copy and assignment
143  // modified by the pre/post-increment operator
147  };
148 
149  ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); }
150 
151  ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }
152 
153  private:
154  const Idx elements_;
155  const Idx first_;
156  const Idx stride_;
157  const Idx extent_;
158  };
159 
160  template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
162  public:
163  using Dim = alpaka::Dim<TAcc>;
164  using Vec = alpaka::Vec<Dim, Idx>;
165 
166  ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc)
167  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
168  first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
169  stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
170  extent_{stride_} {}
171 
172  ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc, Vec extent)
173  : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
174  first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
175  stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
176  extent_{extent} {}
177 
178  class iterator {
180 
181  public:
182  ALPAKA_FN_ACC inline Vec operator*() const { return index_; }
183 
184  // pre-increment the iterator
185  ALPAKA_FN_ACC constexpr inline iterator operator++() {
186  increment();
187  return *this;
188  }
189 
190  // post-increment the iterator
191  ALPAKA_FN_ACC constexpr inline iterator operator++(int) {
192  iterator old = *this;
193  increment();
194  return old;
195  }
196 
197  ALPAKA_FN_ACC constexpr inline bool operator==(iterator const& other) const { return (index_ == other.index_); }
198 
199  ALPAKA_FN_ACC constexpr inline bool operator!=(iterator const& other) const { return not(*this == other); }
200 
201  private:
202  // private, explicit constructor
203  ALPAKA_FN_ACC inline iterator(elements_with_stride_nd const* loop, Vec first)
204  : loop_{loop},
205  thread_{alpaka::elementwise_min(first, loop->extent_)},
206  range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)},
207  index_{thread_} {}
208 
209  template <size_t I>
210  ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() {
211  bool overflow = false;
212  ++index_[I];
213  if (index_[I] >= range_[I]) {
214  index_[I] = thread_[I];
215  overflow = true;
216  }
217  return overflow;
218  }
219 
220  template <size_t N>
221  ALPAKA_FN_ACC inline constexpr bool do_elements_loops() {
222  if constexpr (N == 0) {
223  // overflow
224  return true;
225  } else {
226  if (not nth_elements_loop<N - 1>()) {
227  return false;
228  } else {
229  return do_elements_loops<N - 1>();
230  }
231  }
232  }
233 
234  template <size_t I>
235  ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() {
236  bool overflow = false;
237  thread_[I] += loop_->stride_[I];
238  if (thread_[I] >= loop_->extent_[I]) {
239  thread_[I] = loop_->first_[I];
240  overflow = true;
241  }
242  index_[I] = thread_[I];
243  range_[I] = std::min(thread_[I] + loop_->elements_[I], loop_->extent_[I]);
244  return overflow;
245  }
246 
247  template <size_t N>
248  ALPAKA_FN_ACC inline constexpr bool do_strided_loops() {
249  if constexpr (N == 0) {
250  // overflow
251  return true;
252  } else {
253  if (not nth_strided_loop<N - 1>()) {
254  return false;
255  } else {
256  return do_strided_loops<N - 1>();
257  }
258  }
259  }
260 
261  // increment the iterator
262  ALPAKA_FN_ACC inline constexpr void increment() {
263  if constexpr (requires_single_thread_per_block_v<TAcc>) {
264  // linear N-dimensional loops over the elements associated to the thread;
265  // do_elements_loops<>() returns true if any of those loops overflows
266  if (not do_elements_loops<Dim::value>()) {
267  // the elements loops did not overflow, return the next index
268  return;
269  }
270  }
271 
272  // strided N-dimensional loop over the threads in the kernel launch grid;
273  // do_strided_loops<>() returns true if any of those loops overflows
274  if (not do_strided_loops<Dim::value>()) {
275  // the strided loops did not overflow, return the next index
276  return;
277  }
278 
279  // the iterator has reached or passed the end of the extent, clamp it to the extent
280  thread_ = loop_->extent_;
281  range_ = loop_->extent_;
282  index_ = loop_->extent_;
283  }
284 
285  // const pointer to the elements_with_stride_nd that the iterator refers to
287 
288  // modified by the pre/post-increment operator
289  Vec thread_; // first element processed by this thread
290  Vec range_; // last element processed by this thread
291  Vec index_; // current element processed by this thread
292  };
293 
294  ALPAKA_FN_ACC inline iterator begin() const { return iterator{this, first_}; }
295 
296  ALPAKA_FN_ACC inline iterator end() const { return iterator{this, extent_}; }
297 
298  private:
299  const Vec elements_;
300  const Vec first_;
301  const Vec stride_;
302  const Vec extent_;
303  };
304 
305 } // namespace cms::alpakatools
306 
307 #endif // HeterogeneousCore_AlpakaInterface_interface_workdivision_h
ALPAKA_FN_ACC constexpr bool nth_strided_loop()
Definition: workdivision.h:235
ALPAKA_FN_ACC elements_with_stride_nd(TAcc const &acc, Vec extent)
Definition: workdivision.h:172
ALPAKA_FN_ACC constexpr bool operator==(iterator const &other) const
Definition: workdivision.h:197
ALPAKA_FN_ACC elements_with_stride(TAcc const &acc)
Definition: workdivision.h:76
WorkDiv< Dim1D > make_workdiv(Idx blocks, Idx elements)
Definition: workdivision.h:41
constexpr Idx divide_up_by(Idx value, Idx divisor)
Definition: workdivision.h:19
ALPAKA_FN_ACC elements_with_stride(TAcc const &acc, Idx extent)
Definition: workdivision.h:82
uint32_t Idx
Definition: config.h:13
ALPAKA_FN_ACC iterator begin() const
Definition: workdivision.h:149
ALPAKA_FN_ACC elements_with_stride_nd(TAcc const &acc)
Definition: workdivision.h:166
alpaka::WorkDivMembers< TDim, Idx > WorkDiv
Definition: config.h:30
ALPAKA_FN_ACC constexpr bool nth_elements_loop()
Definition: workdivision.h:210
ALPAKA_FN_ACC iterator end() const
Definition: workdivision.h:151
constexpr Idx round_up_by(Idx value, Idx divisor)
Definition: workdivision.h:16
ALPAKA_FN_ACC iterator(Idx elements, Idx stride, Idx extent, Idx first)
Definition: workdivision.h:91
const std::complex< double > I
Definition: I.h:8
Definition: value.py:1
ALPAKA_FN_ACC iterator end() const
Definition: workdivision.h:296
ALPAKA_FN_ACC constexpr iterator operator++()
Definition: workdivision.h:185
#define N
Definition: blowfish.cc:9
alpaka::Vec< TDim, Idx > Vec
Definition: config.h:23
ALPAKA_FN_ACC constexpr bool do_elements_loops()
Definition: workdivision.h:221
ALPAKA_FN_ACC bool operator==(iterator const &other) const
Definition: workdivision.h:132
ALPAKA_FN_ACC iterator begin() const
Definition: workdivision.h:294
ALPAKA_FN_ACC constexpr iterator operator++(int)
Definition: workdivision.h:191
constexpr bool requires_single_thread_per_block_v
Definition: workdivision.h:37
ALPAKA_FN_ACC iterator(elements_with_stride_nd const *loop, Vec first)
Definition: workdivision.h:203
ALPAKA_FN_ACC constexpr bool do_strided_loops()
Definition: workdivision.h:248
ALPAKA_FN_ACC constexpr bool operator!=(iterator const &other) const
Definition: workdivision.h:199
ALPAKA_FN_ACC bool operator!=(iterator const &other) const
Definition: workdivision.h:136