d9/d97/workdivision_8h_source.html

 #ifndef HeterogeneousCore_AlpakaInterface_interface_workdivision_h
 #define HeterogeneousCore_AlpakaInterface_interface_workdivision_h

 #include <algorithm>
 #include <cstddef>
 #include <type_traits>

 #include <alpaka/alpaka.hpp>

 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"

 namespace cms::alpakatools {

   using namespace alpaka_common;

   // If the first argument is not a multiple of the second argument, round it up to the next multiple
   inline constexpr Idx round_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor * divisor; }

   // Return the integer division of the first argument by the second argument, rounded up to the next integer
   inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor; }

   // Trait describing whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped
   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
   struct requires_single_thread_per_block : public std::true_type {};

 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
   template <typename TDim>
   struct requires_single_thread_per_block<alpaka::AccGpuCudaRt<TDim, Idx>> : public std::false_type {};
 #endif  // ALPAKA_ACC_GPU_CUDA_ENABLED

 #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
   template <typename TDim>
   struct requires_single_thread_per_block<alpaka::AccGpuHipRt<TDim, Idx>> : public std::false_type {};
 #endif  // ALPAKA_ACC_GPU_HIP_ENABLED

 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
   template <typename TDim>
   struct requires_single_thread_per_block<alpaka::AccCpuThreads<TDim, Idx>> : public std::false_type {};
 #endif  // ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED

   // Whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped
   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
   inline constexpr bool requires_single_thread_per_block_v = requires_single_thread_per_block<TAcc>::value;

   // Create an accelerator-dependent work division for 1-dimensional kernels
   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   inline WorkDiv<Dim1D> make_workdiv(Idx blocks, Idx elements) {
     if constexpr (not requires_single_thread_per_block_v<TAcc>) {
       // On GPU backends, each thread is looking at a single element:
       //   - the number of threads per block is "elements";
       //   - the number of elements per thread is always 1.
       return WorkDiv<Dim1D>(blocks, elements, Idx{1});
     } else {
       // On CPU backends, run serially with a single thread per block:
       //   - the number of threads per block is always 1;
       //   - the number of elements per thread is "elements".
       return WorkDiv<Dim1D>(blocks, Idx{1}, elements);
     }
   }

   // Create the accelerator-dependent workdiv for N-dimensional kernels
   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
   inline WorkDiv<alpaka::Dim<TAcc>> make_workdiv(const Vec<alpaka::Dim<TAcc>>& blocks,
                                                  const Vec<alpaka::Dim<TAcc>>& elements) {
     using Dim = alpaka::Dim<TAcc>;
     if constexpr (not requires_single_thread_per_block_v<TAcc>) {
       // On GPU backends, each thread is looking at a single element:
       //   - the number of threads per block is "elements";
       //   - the number of elements per thread is always 1.
       return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
     } else {
       // On CPU backends, run serially with a single thread per block:
       //   - the number of threads per block is always 1;
       //   - the number of elements per thread is "elements".
       return WorkDiv<Dim>(blocks, Vec<Dim>::ones(), elements);
     }
   }

   /* ElementIndex
    *
    * an aggregate that containes the `.global` and `.local` indices of an element; returned by iterating over the objecs
    * returned by `uniform_group_elements` and similar functions.
    */

   struct ElementIndex {
     Idx global;
     Idx local;
   };

   namespace detail {

     /* UniformElementsAlong
    *
    * `UniformElementsAlong<TAcc, Dim>(acc [, first], extent)` returns a one-dimensional iteratable range that spans the
    * element indices from `first` (inclusive) to `extent` (exlusive) along the `Dim` dimension.
    * If `first` is not specified, it defaults to 0.
    * If `extent` is not specified, it defaults to the kernel grid size along the `Dim` dimension.
    *
    * `uniform_elements_along<Dim>(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc, Dim>(acc, ...)` that can
    * infer the accelerator type from the argument.
    *
    * In a 1-dimensional kernel, `uniform_elements(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc, 0>(acc, ...)`.
    *
    * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed
    * by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
    * For convenience when converting CUDA or HIP code, `uniform_elements_x(acc, ...)`, `_y` and `_z` are shorthands for
    * `UniformElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
    *
    * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not
    * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop.
    * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
    * loop over each group's elements, and synchronise only in the outer loop:
    *
    *  for (auto group : uniform_groups_along<Dim>(acc, extent)) {
    *    for (auto element : uniform_group_elements_along<Dim>(acc, group, extent)) {
    *       // first part of the computation
    *       // no synchronisations here
    *       ...
    *    }
    *    // wait for all threads to complete the first part
    *    alpaka::syncBlockThreads();
    *    for (auto element : uniform_group_elements_along<Dim>(acc, group, extent)) {
    *       // second part of the computation
    *       // no synchronisations here
    *       ...
    *    }
    *    // wait for all threads to complete the second part
    *    alpaka::syncBlockThreads();
    *    ...
    *  }
    *
    * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a multiple
    * of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the kernel may
    * hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the element index
    * explicitly inside the loop:
    *
    *  for (auto element : uniform_elements_along<N-1>(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
    *    bool flag = false;
    *    if (element < extent) {
    *      // do some work and compute a result flag only for the valid elements
    *      flag = do_some_work();
    *    }
    *    // check if any valid element had a positive result
    *    if (alpaka::warp::any(acc, flag)) {
    *      // ...
    *    }
    *  }
    *
    * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension, `N-1`.
    */

     template <typename TAcc,
               std::size_t Dim,
               typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
     class UniformElementsAlong {
     public:
       ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc)
           : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
             first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
             stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
             extent_{stride_} {}

       ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx extent)
           : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
             first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
             stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
             extent_{extent} {}

       ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx first, Idx extent)
           : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
             first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_ + first},
             stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_},
             extent_{extent} {}

       class const_iterator;
       using iterator = const_iterator;

       ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, first_); }

       ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); }

       class const_iterator {
         friend class UniformElementsAlong;

         ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
             : elements_{elements},
               stride_{stride},
               extent_{extent},
               first_{std::min(first, extent)},
               index_{first_},
               range_{std::min(first + elements, extent)} {}

       public:
         ALPAKA_FN_ACC inline Idx operator*() const { return index_; }

         // pre-increment the iterator
         ALPAKA_FN_ACC inline const_iterator& operator++() {
           if constexpr (requires_single_thread_per_block_v<TAcc>) {
             // increment the index along the elements processed by the current thread
             ++index_;
             if (index_ < range_)
               return *this;
           }

           // increment the thread index with the grid stride
           first_ += stride_;
           index_ = first_;
           range_ = std::min(first_ + elements_, extent_);
           if (index_ < extent_)
             return *this;

           // the iterator has reached or passed the end of the extent, clamp it to the extent
           first_ = extent_;
           index_ = extent_;
           range_ = extent_;
           return *this;
         }

         // post-increment the iterator
         ALPAKA_FN_ACC inline const_iterator operator++(int) {
           const_iterator old = *this;
           ++(*this);
           return old;
         }

         ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const {
           return (index_ == other.index_) and (first_ == other.first_);
         }

         ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

       private:
         // non-const to support iterator copy and assignment
         Idx elements_;
         Idx stride_;
         Idx extent_;
         // modified by the pre/post-increment operator
         Idx first_;
         Idx index_;
         Idx range_;
       };

     private:
       const Idx elements_;
       const Idx first_;
       const Idx stride_;
       const Idx extent_;
     };

   }  // namespace detail

   /* uniform_elements
    *
    * `uniform_elements(acc [, first], extent)` returns a one-dimensional iteratable range that spans the element indices
    * from `first` (inclusive) to `extent` (exlusive).
    * If `first` is not specified, it defaults to 0.
    * If `extent` is not specified, it defaults to the kernel grid size.
    *
    * `uniform_elements(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, 0>(acc, ...)`.
    *
    * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not
    * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop.
    * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
    * loop over each group's elements, and synchronise only in the outer loop:
    *
    *  for (auto group : uniform_groups(acc, extent)) {
    *    for (auto element : uniform_group_elements(acc, group, extent)) {
    *       // first part of the computation
    *       // no synchronisations here
    *       ...
    *    }
    *    // wait for all threads to complete the first part
    *    alpaka::syncBlockThreads();
    *    for (auto element : uniform_group_elements(acc, group, extent)) {
    *       // second part of the computation
    *       // no synchronisations here
    *       ...
    *    }
    *    // wait for all threads to complete the second part
    *    alpaka::syncBlockThreads();
    *    ...
    *  }
    *
    * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a multiple
    * of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the kernel may
    * hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the element index
    * explicitly inside the loop:
    *
    *  for (auto element : uniform_elements(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
    *    bool flag = false;
    *    if (element < extent) {
    *      // do some work and compute a result flag only for elements up to extent
    *      flag = do_some_work();
    *    }
    *    // check if any valid element had a positive result
    *    if (alpaka::warp::any(acc, flag)) {
    *      // ...
    *    }
    *  }
    *
    * Note that `uniform_elements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels, use
    *   - `uniform_elements_nd(acc, ...)` to cover an N-dimensional problem space with a single loop;
    *   - `uniform_elements_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
    *   - `uniform_elements_x(acc, ...)`, `uniform_elements_y(acc, ...)`, or `uniform_elements_z(acc, ...)` to loop
    *     along the fastest, second-fastest, or third-fastest dimension.
    */

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   ALPAKA_FN_ACC inline auto uniform_elements(TAcc const& acc, TArgs... args) {
     return detail::UniformElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
   }

   /* uniform_elements_along<Dim>
    *
    * `uniform_elements_along<Dim>(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, Dim>(acc, ...)` that can
    * infer the accelerator type from the argument.
    */

   template <typename TAcc,
             std::size_t Dim,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
   ALPAKA_FN_ACC inline auto uniform_elements_along(TAcc const& acc, TArgs... args) {
     return detail::UniformElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
   }

   /* uniform_elements_x, _y, _z
    *
    * Like `uniform_elements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest dimensions.
    */

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
   ALPAKA_FN_ACC inline auto uniform_elements_x(TAcc const& acc, TArgs... args) {
     return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
   }

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
   ALPAKA_FN_ACC inline auto uniform_elements_y(TAcc const& acc, TArgs... args) {
     return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
   }

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
   ALPAKA_FN_ACC inline auto uniform_elements_z(TAcc const& acc, TArgs... args) {
     return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
   }

   namespace detail {

     /* UniformElementsND
    *
    * `UniformElementsND(acc, extent)` returns an N-dimensional iteratable range that spans the element indices
    * required to cover the given problem size, indicated by `extent`.
    *
    * `uniform_elements_nd(acc, ...)` is an alias for `UniformElementsND<TAcc>(acc, ...)`.
    *
    * To cover the problem space, different threads may execute a different number of iterations. As a result, it is not
    * safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop.
    * If a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
    * loop over each group's elements, and synchronise only in the outer loop:
    *
    *  for (auto group0 : uniform_groups_along<0>(acc, extent[0])) {
    *    for (auto group1 : uniform_groups_along<1>(acc, extent[1])) {
    *      for (auto element0 : uniform_group_elements_along<0>(acc, group0, extent[0])) {
    *        for (auto element1 : uniform_group_elements_along<1>(acc, group1, extent[1])) {
    *           // first part of the computation
    *           // no synchronisations here
    *           ...
    *        }
    *      }
    *      // wait for all threads to complete the first part
    *      alpaka::syncBlockThreads();
    *      for (auto element0 : uniform_group_elements_along<0>(acc, group0, extent[0])) {
    *        for (auto element1 : uniform_group_elements_along<1>(acc, group1, extent[1])) {
    *           // second part of the computation
    *           // no synchronisations here
    *           ...
    *        }
    *      }
    *      // wait for all threads to complete the second part
    *      alpaka::syncBlockThreads();
    *      ...
    *    }
    *  }
    *
    * For more details, see `UniformElementsAlong<TAcc, Dim>(acc, ...)`.
    */

     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
     class UniformElementsND {
     public:
       using Dim = alpaka::Dim<TAcc>;
       using Vec = alpaka::Vec<Dim, Idx>;

       ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc)
           : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
             thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
             stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
             extent_{stride_} {}

       ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc, Vec extent)
           : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
             thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
             stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
             extent_{extent} {}

       // tag used to construct an end iterator
       struct at_end_t {};

       class const_iterator;
       using iterator = const_iterator;

       ALPAKA_FN_ACC inline const_iterator begin() const {
         // check that all dimensions of the current thread index are within the extent
         if ((thread_ < extent_).all()) {
           // construct an iterator pointing to the first element to be processed by the current thread
           return const_iterator{this, thread_};
         } else {
           // construct an end iterator, pointing post the end of the extent
           return const_iterator{this, at_end_t{}};
         }
       }

       ALPAKA_FN_ACC inline const_iterator end() const {
         // construct an end iterator, pointing post the end of the extent
         return const_iterator{this, at_end_t{}};
       }

       class const_iterator {
         friend class UniformElementsND;

       public:
         ALPAKA_FN_ACC inline Vec operator*() const { return index_; }

         // pre-increment the iterator
         ALPAKA_FN_ACC constexpr inline const_iterator operator++() {
           increment();
           return *this;
         }

         // post-increment the iterator
         ALPAKA_FN_ACC constexpr inline const_iterator operator++(int) {
           const_iterator old = *this;
           increment();
           return old;
         }

         ALPAKA_FN_ACC constexpr inline bool operator==(const_iterator const& other) const {
           return (index_ == other.index_);
         }

         ALPAKA_FN_ACC constexpr inline bool operator!=(const_iterator const& other) const {
           return not(*this == other);
         }

       private:
         // construct an iterator pointing to the first element to be processed by the current thread
         ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, Vec first)
             : loop_{loop},
               first_{alpaka::elementwise_min(first, loop->extent_)},
               range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)},
               index_{first_} {}

         // construct an end iterator, pointing post the end of the extent
         ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, at_end_t const&)
             : loop_{loop}, first_{loop_->extent_}, range_{loop_->extent_}, index_{loop_->extent_} {}

         template <size_t I>
         ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() {
           bool overflow = false;
           ++index_[I];
           if (index_[I] >= range_[I]) {
             index_[I] = first_[I];
             overflow = true;
           }
           return overflow;
         }

         template <size_t N>
         ALPAKA_FN_ACC inline constexpr bool do_elements_loops() {
           if constexpr (N == 0) {
             // overflow
             return true;
           } else {
             if (not nth_elements_loop<N - 1>()) {
               return false;
             } else {
               return do_elements_loops<N - 1>();
             }
           }
         }

         template <size_t I>
         ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() {
           bool overflow = false;
           first_[I] += loop_->stride_[I];
           if (first_[I] >= loop_->extent_[I]) {
             first_[I] = loop_->thread_[I];
             overflow = true;
           }
           index_[I] = first_[I];
           range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]);
           return overflow;
         }

         template <size_t N>
         ALPAKA_FN_ACC inline constexpr bool do_strided_loops() {
           if constexpr (N == 0) {
             // overflow
             return true;
           } else {
             if (not nth_strided_loop<N - 1>()) {
               return false;
             } else {
               return do_strided_loops<N - 1>();
             }
           }
         }

         // increment the iterator
         ALPAKA_FN_ACC inline constexpr void increment() {
           if constexpr (requires_single_thread_per_block_v<TAcc>) {
             // linear N-dimensional loops over the elements associated to the thread;
             // do_elements_loops<>() returns true if any of those loops overflows
             if (not do_elements_loops<Dim::value>()) {
               // the elements loops did not overflow, return the next index
               return;
             }
           }

           // strided N-dimensional loop over the threads in the kernel launch grid;
           // do_strided_loops<>() returns true if any of those loops overflows
           if (not do_strided_loops<Dim::value>()) {
             // the strided loops did not overflow, return the next index
             return;
           }

           // the iterator has reached or passed the end of the extent, clamp it to the extent
           first_ = loop_->extent_;
           range_ = loop_->extent_;
           index_ = loop_->extent_;
         }

         // const pointer to the UniformElementsND that the iterator refers to
         const UniformElementsND* loop_;

         // modified by the pre/post-increment operator
         Vec first_;  // first element processed by this thread
         Vec range_;  // last element processed by this thread
         Vec index_;  // current element processed by this thread
       };

     private:
       const Vec elements_;
       const Vec thread_;
       const Vec stride_;
       const Vec extent_;
     };

   }  // namespace detail

   /* uniform_elements_nd
    *
    * `uniform_elements_nd(acc, ...)` is a shorthand for `detail::UniformElementsND<TAcc>(acc, ...)`.
    */

   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
   ALPAKA_FN_ACC inline auto uniform_elements_nd(TAcc const& acc) {
     return detail::UniformElementsND<TAcc>(acc);
   }

   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
   ALPAKA_FN_ACC inline auto uniform_elements_nd(TAcc const& acc, alpaka::Vec<alpaka::Dim<TAcc>, Idx> extent) {
     return detail::UniformElementsND<TAcc>(acc, extent);
   }

   namespace detail {

     /* UniformGroupsAlong
    *
    * `UniformGroupsAlong<Dim>(acc, elements)` returns a one-dimensional iteratable range than spans the group indices
    * required to cover the given problem size along the `Dim` dimension, in units of the block size. `elements`
    * indicates the total number of elements, across all groups; if not specified, it defaults to the kernel grid size
    * along the `Dim` dimension.
    *
    * `uniform_groups_along<Dim>(acc, ...)` is a shorthand for `UniformGroupsAlong<TAcc, Dim>(acc, ...)` that can infer
    * the accelerator type from the argument.
    *
    * In a 1-dimensional kernel, `uniform_groups(acc, ...)` is a shorthand for `UniformGroupsAlong<Tacc, 0>(acc, ...)`.
    *
    * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by
    * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
    * For convenience when converting CUDA or HIP code, `uniform_groups_x(acc, ...)`, `_y` and `_z` are shorthands for
    * `UniformGroupsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
    *
    * `uniform_groups_along<Dim>(acc, ...)` should be called consistently by all the threads in a block. All threads in a
    * block see the same loop iterations, while threads in different blocks may see a different number of iterations.
    * If the work division has more blocks than the required number of groups, the first blocks will perform one
    * iteration of the loop, while the other blocks will exit the loop immediately.
    * If the work division has less blocks than the required number of groups, some of the blocks will perform more than
    * one iteration, in order to cover then whole problem space.
    *
    * If the problem size is not a multiple of the block size, the last group will process a number of elements smaller
    * than the block size. However, also in this case all threads in the block will execute the same number of iterations
    * of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to the inner loop
    * (or the user) to ensure that only the correct number of threads process any data; this logic is implemented by
    * `uniform_group_elements_along<Dim>(acc, group, elements)`.
    *
    * For example, if the block size is 64 and there are 400 elements
    *
    *   for (auto group: uniform_groups_along<Dim>(acc, 400)
    *
    * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should cover
    * the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group, group 6,
    * should cover the elements from 384 to 399. All the threads of the block will process this last group; it is up to
    * the inner loop to not process the non-existing elements after 399.
    *
    * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
    * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will
    * process one group while block 7 will no process any.
    *
    * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the loop,
    * in order to cover then whole problem space. For example if the work division has 4 blocks, block 0 will process the
    * groups 0 and 4, block 1 will process groups 1 and 5, group 2 will process groups 2 and 6, and block 3 will process
    * group 3.
    *
    * See `UniformElementsAlong<TAcc, Dim>(acc, ...)` for a concrete example using `uniform_groups_along<Dim>` and
    * `uniform_group_elements_along<Dim>`.
    */

     template <typename TAcc,
               std::size_t Dim,
               typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
     class UniformGroupsAlong {
     public:
       ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc)
           : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
             stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
             extent_{stride_} {}

       // extent is the total number of elements (not blocks)
       ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc, Idx extent)
           : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
             stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
             extent_{divide_up_by(extent, alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim])} {}

       class const_iterator;
       using iterator = const_iterator;

       ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); }

       ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); }

       class const_iterator {
         friend class UniformGroupsAlong;

         ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
             : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}

       public:
         ALPAKA_FN_ACC inline Idx operator*() const { return first_; }

         // pre-increment the iterator
         ALPAKA_FN_ACC inline const_iterator& operator++() {
           // increment the first-element-in-block index by the grid stride
           first_ += stride_;
           if (first_ < extent_)
             return *this;

           // the iterator has reached or passed the end of the extent, clamp it to the extent
           first_ = extent_;
           return *this;
         }

         // post-increment the iterator
         ALPAKA_FN_ACC inline const_iterator operator++(int) {
           const_iterator old = *this;
           ++(*this);
           return old;
         }

         ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); }

         ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

       private:
         // non-const to support iterator copy and assignment
         Idx stride_;
         Idx extent_;
         // modified by the pre/post-increment operator
         Idx first_;
       };

     private:
       const Idx first_;
       const Idx stride_;
       const Idx extent_;
     };

   }  // namespace detail

   /* uniform_groups
    *
    * `uniform_groups(acc, elements)` returns a one-dimensional iteratable range than spans the group indices required to
    * cover the given problem size, in units of the block size. `elements` indicates the total number of elements, across
    * all groups; if not specified, it defaults to the kernel grid size.
    *
    * `uniform_groups(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, 0>(acc, ...)`.
    *
    * `uniform_groups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block see
    * the same loop iterations, while threads in different blocks may see a different number of iterations.
    * If the work division has more blocks than the required number of groups, the first blocks will perform one
    * iteration of the loop, while the other blocks will exit the loop immediately.
    * If the work division has less blocks than the required number of groups, some of the blocks will perform more than
    * one iteration, in order to cover then whole problem space.
    *
    * If the problem size is not a multiple of the block size, the last group will process a number of elements smaller
    * than the block size. However, also in this case all threads in the block will execute the same number of iterations
    * of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to the inner loop
    * (or the user) to ensure that only the correct number of threads process any data; this logic is implemented by
    * `uniform_group_elements(acc, group, elements)`.
    *
    * For example, if the block size is 64 and there are 400 elements
    *
    *   for (auto group: uniform_groups(acc, 400)
    *
    * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should cover
    * the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group, group 6,
    * should cover the elements from 384 to 399. All the threads of the block will process this last group; it is up to
    * the inner loop to not process the non-existing elements after 399.
    *
    * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
    * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will
    * process one group while block 7 will no process any.
    *
    * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the loop,
    * in order to cover then whole problem space. For example if the work division has 4 blocks, block 0 will process the
    * groups 0 and 4, block 1 will process groups 1 and 5, group 2 will process groups 2 and 6, and block 3 will process
    * group 3.
    *
    * See `uniform_elements(acc, ...)` for a concrete example using `uniform_groups` and `uniform_group_elements`.
    *
    * Note that `uniform_groups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels, use
    *   - `uniform_groups_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
    *   - `uniform_groups_x(acc, ...)`, `uniform_groups_y(acc, ...)`, or `uniform_groups_z(acc, ...)` to loop
    *     along the fastest, second-fastest, or third-fastest dimension.
    */

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   ALPAKA_FN_ACC inline auto uniform_groups(TAcc const& acc, TArgs... args) {
     return detail::UniformGroupsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
   }

   /* uniform_groups_along<Dim>
    *
    * `uniform_groups_along<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, Dim>(acc, ...)` that can infer
    * the accelerator type from the argument.
    */

   template <typename TAcc,
             std::size_t Dim,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
   ALPAKA_FN_ACC inline auto uniform_groups_along(TAcc const& acc, TArgs... args) {
     return detail::UniformGroupsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
   }

   /* uniform_groups_x, _y, _z
    *
    * Like `uniform_groups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest dimensions.
    */

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
   ALPAKA_FN_ACC inline auto uniform_groups_x(TAcc const& acc, TArgs... args) {
     return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
   }

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
   ALPAKA_FN_ACC inline auto uniform_groups_y(TAcc const& acc, TArgs... args) {
     return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
   }

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
   ALPAKA_FN_ACC inline auto uniform_groups_z(TAcc const& acc, TArgs... args) {
     return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
   }

   namespace detail {

     /* UniformGroupElementsAlong
    *
    * `UniformGroupElementsAlong<TAcc, Dim>(acc, group, elements)` returns a one-dimensional iteratable range that spans
    * all the elements within the given `group` along dimension `Dim`, as obtained from `UniformGroupsAlong<Dim>`, up to
    * `elements` (exclusive). `elements` indicates the total number of elements across all groups; if not specified, it
    * defaults to the kernel grid size.
    *
    * `uniform_group_elements_along<Dim>(acc, ...)` is a shorthand for `UniformGroupElementsAlong<TAcc, Dim>(acc, ...)`
    * that can infer the accelerator type from the argument.
    *
    * In a 1-dimensional kernel, `uniform_group_elements(acc, ...)` is a shorthand for
    * `UniformGroupElementsAlong<0>(acc, ...)`.
    *
    * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by
    * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
    * For convenience when converting CUDA or HIP code, `uniform_group_elements_x(acc, ...)`, `_y` and `_z` are
    * shorthands for `UniformGroupElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
    *
    * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices of
    * the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded), while the
    * local index spans the range from 0 to the block size (excluded).
    *
    * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
    * global element index reaches `elements`.
    *
    * If the problem size is not a multiple of the block size, different threads may execute a different number of
    * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
    * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
    * See `UniformElementsAlong<Dim>(acc, ...)` for a concrete example using `uniform_groups_along<Dim>` and
    * `uniform_group_elements_along<Dim>`.
    *
    * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
    * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the
    * kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the element
    * index explicitly inside the loop:
    *
    *  for (auto element : uniform_group_elements_along<N-1>(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
    *    bool flag = false;
    *    if (element < elements) {
    *      // do some work and compute a result flag only for the valid elements
    *      flag = do_some_work();
    *    }
    *    // check if any valid element had a positive result
    *    if (alpaka::warp::any(acc, flag)) {
    *      // ...
    *    }
    *  }
    *
    * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension, `N-1`.
    */

     template <typename TAcc,
               std::size_t Dim,
               typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
     class UniformGroupElementsAlong {
     public:
       ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block)
           : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]},
             local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] *
                    alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
             range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]} {}

       ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block, Idx extent)
           : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]},
             local_{std::min(extent - first_,
                             alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] *
                                 alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])},
             range_{std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])} {}

       class const_iterator;
       using iterator = const_iterator;

       ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(local_, first_, range_); }

       ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(range_, first_, range_); }

       class const_iterator {
         friend class UniformGroupElementsAlong;

         ALPAKA_FN_ACC inline const_iterator(Idx local, Idx first, Idx range)
             : index_{local}, first_{first}, range_{range} {}

       public:
         ALPAKA_FN_ACC inline ElementIndex operator*() const { return ElementIndex{index_ + first_, index_}; }

         // pre-increment the iterator
         ALPAKA_FN_ACC inline const_iterator& operator++() {
           if constexpr (requires_single_thread_per_block_v<TAcc>) {
             // increment the index along the elements processed by the current thread
             ++index_;
             if (index_ < range_)
               return *this;
           }

           // the iterator has reached or passed the end of the extent, clamp it to the extent
           index_ = range_;
           return *this;
         }

         // post-increment the iterator
         ALPAKA_FN_ACC inline const_iterator operator++(int) {
           const_iterator old = *this;
           ++(*this);
           return old;
         }

         ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (index_ == other.index_); }

         ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

       private:
         // modified by the pre/post-increment operator
         Idx index_;
         // non-const to support iterator copy and assignment
         Idx first_;
         Idx range_;
       };

     private:
       const Idx first_;
       const Idx local_;
       const Idx range_;
     };

   }  // namespace detail

   /* uniform_group_elements
    *
    * `uniform_group_elements(acc, group, elements)` returns a one-dimensional iteratable range that spans all the
    * elements within the given `group`, as obtained from `uniform_groups`, up to `elements` (exclusive). `elements`
    * indicates the total number of elements across all groups; if not specified, it defaults to the kernel grid size.
    *
    * `uniform_group_elements(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<0>(acc, ...)`.
    *
    * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices of
    * the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded), while the
    * local index spans the range from 0 to the block size (excluded).
    *
    * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
    * global element index reaches `elements`.
    *
    * If the problem size is not a multiple of the block size, different threads may execute a different number of
    * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
    * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
    * See `uniform_elements(acc, ...)` for a concrete example using `uniform_groups` and `uniform_group_elements`.
    *
    * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
    * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example, the
    * kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the element
    * index explicitly inside the loop:
    *
    *  for (auto element : uniform_group_elements(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
    *    bool flag = false;
    *    if (element < elements) {
    *      // do some work and compute a result flag only for the valid elements
    *      flag = do_some_work();
    *    }
    *    // check if any valid element had a positive result
    *    if (alpaka::warp::any(acc, flag)) {
    *      // ...
    *    }
    *  }
    *
    * Note that `uniform_group_elements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional
    * kernels, use
    *   - `detail::UniformGroupElementsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
    *   - `uniform_group_elements_x(acc, ...)`, `uniform_group_elements_y(acc, ...)`, or
    *     `uniform_group_elements_z(acc, ...)` to loop along the fastest, second-fastest, or third-fastest dimension.
    */

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   ALPAKA_FN_ACC inline auto uniform_group_elements(TAcc const& acc, TArgs... args) {
     return detail::UniformGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
   }

   /* uniform_group_elements_along<Dim>
    *
    * `uniform_group_elements_along<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<TAcc, Dim>(acc, ...)`
    * that can infer the accelerator type from the argument.
    */

   template <typename TAcc,
             std::size_t Dim,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
   ALPAKA_FN_ACC inline auto uniform_group_elements_along(TAcc const& acc, TArgs... args) {
     return detail::UniformGroupElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
   }

   /* uniform_group_elements_x, _y, _z
    *
    * Like `uniform_group_elements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
    * dimensions.
    */

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
   ALPAKA_FN_ACC inline auto uniform_group_elements_x(TAcc const& acc, TArgs... args) {
     return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
   }

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
   ALPAKA_FN_ACC inline auto uniform_group_elements_y(TAcc const& acc, TArgs... args) {
     return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
   }

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
   ALPAKA_FN_ACC inline auto uniform_group_elements_z(TAcc const& acc, TArgs... args) {
     return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
   }

   namespace detail {

     /* IndependentGroupsAlong
    *
    * `IndependentGroupsAlong<TAcc, Dim>(acc, groups)` returns a one-dimensional iteratable range than spans the group
    * indices from 0 to `groups`; the groups are assigned to the blocks along the `Dim` dimension. If `groups` is not
    * specified, it defaults to the number of blocks along the `Dim` dimension.
    *
    * `independent_groups_along<Dim>(acc, ...)` is a shorthand for `IndependentGroupsAlong<TAcc, Dim>(acc, ...)` that can
    * infer the accelerator type from the argument.
    *
    * In a 1-dimensional kernel, `independent_groups(acc, ...)` is a shorthand for
    * `IndependentGroupsAlong<TAcc, 0>(acc, ...)`.
    *
    * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop), followed by
    * dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop).
    * For convenience when converting CUDA or HIP code, `independent_groups_x(acc, ...)`, `_y` and `_z` are shorthands
    * for `IndependentGroupsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
    *
    * `independent_groups_along<Dim>(acc, ...)` should be called consistently by all the threads in a block. All threads
    * in a block see the same loop iterations, while threads in different blocks may see a different number of iterations.
    * If the work division has more blocks than the required number of groups, the first blocks will perform one
    * iteration of the loop, while the other blocks will exit the loop immediately.
    * If the work division has less blocks than the required number of groups, some of the blocks will perform more than
    * one iteration, in order to cover then whole problem space.
    *
    * For example,
    *
    *   for (auto group: independent_groups_along<Dim>(acc, 7))
    *
    * will return the group range from 0 to 6, distributed across all blocks in the work division.
    * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
    * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will
    * process one group while block 7 will no process any.
    * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the loop,
    * in order to cover then whole problem space. For example if the work division has 4 blocks, block 0 will process the
    * groups 0 and 4, block 1 will process groups 1 and 5, group 2 will process groups 2 and 6, and block 3 will process
    * group 3.
    */

     template <typename TAcc,
               std::size_t Dim,
               typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
     class IndependentGroupsAlong {
     public:
       ALPAKA_FN_ACC inline IndependentGroupsAlong(TAcc const& acc)
           : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
             stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
             extent_{stride_} {}

       ALPAKA_FN_ACC inline IndependentGroupsAlong(TAcc const& acc, Idx groups)
           : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
             stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]},
             extent_{groups} {}

       class const_iterator;
       using iterator = const_iterator;

       ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(stride_, extent_, first_); }

       ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(stride_, extent_, extent_); }

       class const_iterator {
         friend class IndependentGroupsAlong;

         ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
             : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}

       public:
         ALPAKA_FN_ACC inline Idx operator*() const { return first_; }

         // pre-increment the iterator
         ALPAKA_FN_ACC inline const_iterator& operator++() {
           // increment the first-element-in-block index by the grid stride
           first_ += stride_;
           if (first_ < extent_)
             return *this;

           // the iterator has reached or passed the end of the extent, clamp it to the extent
           first_ = extent_;
           return *this;
         }

         // post-increment the iterator
         ALPAKA_FN_ACC inline const_iterator operator++(int) {
           const_iterator old = *this;
           ++(*this);
           return old;
         }

         ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const { return (first_ == other.first_); }

         ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

       private:
         // non-const to support iterator copy and assignment
         Idx stride_;
         Idx extent_;
         // modified by the pre/post-increment operator
         Idx first_;
       };

     private:
       const Idx first_;
       const Idx stride_;
       const Idx extent_;
     };

   }  // namespace detail

   /* independent_groups
    *
    * `independent_groups(acc, groups)` returns a one-dimensional iteratable range than spans the group indices from 0 to
    * `groups`. If `groups` is not specified, it defaults to the number of blocks.
    *
    * `independent_groups(acc, ...)` is a shorthand for `detail::IndependentGroupsAlong<TAcc, 0>(acc, ...)`.
    *
    * `independent_groups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block
    * see the same loop iterations, while threads in different blocks may see a different number of iterations.
    * If the work division has more blocks than the required number of groups, the first blocks will perform one
    * iteration of the loop, while the other blocks will exit the loop immediately.
    * If the work division has less blocks than the required number of groups, some of the blocks will perform more than
    * one iteration, in order to cover then whole problem space.
    *
    * For example,
    *
    *   for (auto group: independent_groups(acc, 7))
    *
    * will return the group range from 0 to 6, distributed across all blocks in the work division.
    * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
    * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6 will
    * process one group while block 7 will no process any.
    * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the loop,
    * in order to cover then whole problem space. For example if the work division has 4 blocks, block 0 will process the
    * groups 0 and 4, block 1 will process groups 1 and 5, group 2 will process groups 2 and 6, and block 3 will process
    * group 3.
    *
    * Note that `independent_groups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
    * use
    *   - `independent_groups_along<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
    *   - `independent_groups_x(acc, ...)`, `independent_groups_y(acc, ...)`, or `independent_groups_z(acc, ...)` to loop
    *     along the fastest, second-fastest, or third-fastest dimension.
    */

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   ALPAKA_FN_ACC inline auto independent_groups(TAcc const& acc, TArgs... args) {
     return detail::IndependentGroupsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
   }

   /* independent_groups_along<Dim>
    *
    * `independent_groups_along<Dim>(acc, ...)` is a shorthand for `detail::IndependentGroupsAlong<TAcc, Dim>(acc, ...)` that can
    * infer the accelerator type from the argument.
    */

   template <typename TAcc,
             std::size_t Dim,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
   ALPAKA_FN_ACC inline auto independent_groups_along(TAcc const& acc, TArgs... args) {
     return detail::IndependentGroupsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
   }

   /* independent_groups_x, _y, _z
    *
    * Like `independent_groups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
    * dimensions.
    */

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
   ALPAKA_FN_ACC inline auto independent_groups_x(TAcc const& acc, TArgs... args) {
     return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
   }

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
   ALPAKA_FN_ACC inline auto independent_groups_y(TAcc const& acc, TArgs... args) {
     return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
   }

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
   ALPAKA_FN_ACC inline auto independent_groups_z(TAcc const& acc, TArgs... args) {
     return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
   }

   namespace detail {

     /* IndependentGroupElementsAlong
    *
    * `independent_group_elements_along<Dim>(acc, ...)` is a shorthand for
    * `IndependentGroupElementsAlong<TAcc, Dim>(acc, ...)` that can infer the accelerator type from the argument.
    */

     template <typename TAcc,
               std::size_t Dim,
               typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
     class IndependentGroupElementsAlong {
     public:
       ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc)
           : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
             thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
             stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
             extent_{stride_} {}

       ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc, Idx extent)
           : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
             thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
             stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
             extent_{extent} {}

       ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc, Idx first, Idx extent)
           : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]},
             thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_ + first},
             stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_},
             extent_{extent} {}

       class const_iterator;
       using iterator = const_iterator;

       ALPAKA_FN_ACC inline const_iterator begin() const { return const_iterator(elements_, stride_, extent_, thread_); }

       ALPAKA_FN_ACC inline const_iterator end() const { return const_iterator(elements_, stride_, extent_, extent_); }

       class const_iterator {
         friend class IndependentGroupElementsAlong;

         ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
             : elements_{elements},
               stride_{stride},
               extent_{extent},
               first_{std::min(first, extent)},
               index_{first_},
               range_{std::min(first + elements, extent)} {}

       public:
         ALPAKA_FN_ACC inline Idx operator*() const { return index_; }

         // pre-increment the iterator
         ALPAKA_FN_ACC inline const_iterator& operator++() {
           if constexpr (requires_single_thread_per_block_v<TAcc>) {
             // increment the index along the elements processed by the current thread
             ++index_;
             if (index_ < range_)
               return *this;
           }

           // increment the thread index with the block stride
           first_ += stride_;
           index_ = first_;
           range_ = std::min(first_ + elements_, extent_);
           if (index_ < extent_)
             return *this;

           // the iterator has reached or passed the end of the extent, clamp it to the extent
           first_ = extent_;
           index_ = extent_;
           range_ = extent_;
           return *this;
         }

         // post-increment the iterator
         ALPAKA_FN_ACC inline const_iterator operator++(int) {
           const_iterator old = *this;
           ++(*this);
           return old;
         }

         ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const {
           return (index_ == other.index_) and (first_ == other.first_);
         }

         ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const { return not(*this == other); }

       private:
         // non-const to support iterator copy and assignment
         Idx elements_;
         Idx stride_;
         Idx extent_;
         // modified by the pre/post-increment operator
         Idx first_;
         Idx index_;
         Idx range_;
       };

     private:
       const Idx elements_;
       const Idx thread_;
       const Idx stride_;
       const Idx extent_;
     };

   }  // namespace detail

   /* independent_group_elements
    */

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   ALPAKA_FN_ACC inline auto independent_group_elements(TAcc const& acc, TArgs... args) {
     return detail::IndependentGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
   }

   /* independent_group_elements_along<Dim>
    *
    * `independent_group_elements_along<Dim>(acc, ...)` is a shorthand for
    * `detail::IndependentGroupElementsAlong<TAcc, Dim>(acc, ...)` that can infer the accelerator type from the argument.
    */

   template <typename TAcc,
             std::size_t Dim,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
   ALPAKA_FN_ACC inline auto independent_group_elements_along(TAcc const& acc, TArgs... args) {
     return detail::IndependentGroupElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
   }

   /* independent_group_elements_x, _y, _z
    *
    * Like `independent_group_elements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
    * dimensions.
    */

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
   ALPAKA_FN_ACC inline auto independent_group_elements_x(TAcc const& acc, TArgs... args) {
     return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
   }

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
   ALPAKA_FN_ACC inline auto independent_group_elements_y(TAcc const& acc, TArgs... args) {
     return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
   }

   template <typename TAcc,
             typename... TArgs,
             typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
   ALPAKA_FN_ACC inline auto independent_group_elements_z(TAcc const& acc, TArgs... args) {
     return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
   }

   /* once_per_grid
    *
    * `once_per_grid(acc)` returns true for a single thread within the kernel execution grid.
    *
    * Usually the condition is true for block 0 and thread 0, but these indices should not be relied upon.
    */

   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
   ALPAKA_FN_ACC inline constexpr bool once_per_grid(TAcc const& acc) {
     return alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) == Vec<alpaka::Dim<TAcc>>::zeros();
   }

   /* once_per_block
    *
    * `once_per_block(acc)` returns true for a single thread within the block.
    *
    * Usually the condition is true for thread 0, but this index should not be relied upon.
    */

   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
   ALPAKA_FN_ACC inline constexpr bool once_per_block(TAcc const& acc) {
     return alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc) == Vec<alpaka::Dim<TAcc>>::zeros();
   }

 }  // namespace cms::alpakatools

 #endif  // HeterogeneousCore_AlpakaInterface_interface_workdivision_h
cms::alpakatools::detail::UniformGroupsAlong::stride_
const Idx stride_
Definition: workdivision.h:703

cms::alpakatools::uniform_elements_z
ALPAKA_FN_ACC auto uniform_elements_z(TAcc const &acc, TArgs... args)
Definition: workdivision.h:351

cms::alpakatools::detail::UniformGroupElementsAlong::end
ALPAKA_FN_ACC const_iterator end() const
Definition: workdivision.h:879

cms::alpakatools::detail::IndependentGroupElementsAlong::const_iterator::elements_
Idx elements_
Definition: workdivision.h:1305

cms::alpakatools::uniform_elements_along
ALPAKA_FN_ACC auto uniform_elements_along(TAcc const &acc, TArgs... args)
Definition: workdivision.h:325

cms::alpakatools::detail::IndependentGroupsAlong::const_iterator
Definition: workdivision.h:1085

cms::alpakatools::once_per_block
ALPAKA_FN_ACC constexpr bool once_per_block(TAcc const &acc)
Definition: workdivision.h:1394

cms::alpakatools::detail::UniformElementsND::const_iterator::operator++
ALPAKA_FN_ACC constexpr const_iterator operator++()
Definition: workdivision.h:443

cms::alpakatools::detail::UniformElementsND::const_iterator::nth_elements_loop
ALPAKA_FN_ACC constexpr bool nth_elements_loop()
Definition: workdivision.h:476

cms::alpakatools::detail::IndependentGroupElementsAlong
Definition: workdivision.h:1226

cms::alpakatools::detail::UniformGroupElementsAlong
Definition: workdivision.h:859

cms::alpakatools::detail::UniformGroupElementsAlong::const_iterator::operator*
ALPAKA_FN_ACC ElementIndex operator*() const
Definition: workdivision.h:888

cms::alpakatools::uniform_elements
ALPAKA_FN_ACC auto uniform_elements(TAcc const &acc, TArgs... args)
Definition: workdivision.h:311

cms::alpakatools::detail::UniformElementsAlong::UniformElementsAlong
ALPAKA_FN_ACC UniformElementsAlong(TAcc const &acc)
Definition: workdivision.h:157

cms::alpakatools::detail::IndependentGroupElementsAlong::const_iterator::const_iterator
ALPAKA_FN_ACC const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
Definition: workdivision.h:1256

cms::alpakatools::detail::IndependentGroupElementsAlong::end
ALPAKA_FN_ACC const_iterator end() const
Definition: workdivision.h:1251

heppy_loop.loop
loop
Definition: heppy_loop.py:28

cms::alpakatools::detail::UniformElementsND::const_iterator::operator!=
ALPAKA_FN_ACC constexpr bool operator!=(const_iterator const &other) const
Definition: workdivision.h:459

cms::alpakatools::detail::IndependentGroupsAlong::const_iterator::const_iterator
ALPAKA_FN_ACC const_iterator(Idx stride, Idx extent, Idx first)
Definition: workdivision.h:1088

cms::alpakatools::detail::UniformGroupsAlong::begin
ALPAKA_FN_ACC const_iterator begin() const
Definition: workdivision.h:657

cms::alpakatools::detail::UniformGroupElementsAlong::const_iterator::operator++
ALPAKA_FN_ACC const_iterator operator++(int)
Definition: workdivision.h:905

cms::alpakatools::detail::IndependentGroupElementsAlong::const_iterator::operator++
ALPAKA_FN_ACC const_iterator operator++(int)
Definition: workdivision.h:1291

cms::alpakatools::detail::UniformGroupsAlong::end
ALPAKA_FN_ACC const_iterator end() const
Definition: workdivision.h:659

cms::alpakatools::make_workdiv
WorkDiv< Dim1D > make_workdiv(Idx blocks, Idx elements)
Definition: workdivision.h:47

python.cmstools.all
def all(container)
workaround iterator generators for ROOT classes
Definition: cmstools.py:25

cms::alpakatools::divide_up_by
constexpr Idx divide_up_by(Idx value, Idx divisor)
Definition: workdivision.h:20

cms::alpakatools::uniform_groups_x
ALPAKA_FN_ACC auto uniform_groups_x(TAcc const &acc, TArgs... args)
Definition: workdivision.h:785

cms::alpakatools::detail::UniformGroupElementsAlong::const_iterator::first_
Idx first_
Definition: workdivision.h:919

cms::alpakatools::detail::IndependentGroupElementsAlong::IndependentGroupElementsAlong
ALPAKA_FN_ACC IndependentGroupElementsAlong(TAcc const &acc, Idx first, Idx extent)
Definition: workdivision.h:1240

cms::alpakatools::detail::IndependentGroupsAlong::IndependentGroupsAlong
ALPAKA_FN_ACC IndependentGroupsAlong(TAcc const &acc, Idx groups)
Definition: workdivision.h:1073

cms::alpakatools::detail::IndependentGroupElementsAlong::IndependentGroupElementsAlong
ALPAKA_FN_ACC IndependentGroupElementsAlong(TAcc const &acc)
Definition: workdivision.h:1228

cms::alpakatools::detail::UniformGroupElementsAlong::const_iterator::index_
Idx index_
Definition: workdivision.h:917

cms::alpakatools::detail::IndependentGroupElementsAlong::const_iterator::operator++
ALPAKA_FN_ACC const_iterator & operator++()
Definition: workdivision.h:1268

cms::alpakatools::uniform_elements_nd
ALPAKA_FN_ACC auto uniform_elements_nd(TAcc const &acc)
Definition: workdivision.h:575

cms::alpakatools::detail::UniformGroupElementsAlong::range_
const Idx range_
Definition: workdivision.h:926

alpaka_common::Idx
uint32_t Idx
Definition: config.h:14

cms::alpakatools::detail::IndependentGroupElementsAlong::const_iterator::operator*
ALPAKA_FN_ACC Idx operator*() const
Definition: workdivision.h:1265

cms::alpakatools::detail::UniformElementsND::const_iterator::do_strided_loops
ALPAKA_FN_ACC constexpr bool do_strided_loops()
Definition: workdivision.h:514

cms::alpakatools::detail::UniformElementsAlong::UniformElementsAlong
ALPAKA_FN_ACC UniformElementsAlong(TAcc const &acc, Idx extent)
Definition: workdivision.h:163

cms::alpakatools::independent_group_elements
ALPAKA_FN_ACC auto independent_group_elements(TAcc const &acc, TArgs... args)
Definition: workdivision.h:1329

cms::alpakatools::detail::UniformElementsAlong::end
ALPAKA_FN_ACC const_iterator end() const
Definition: workdivision.h:180

cms::alpakatools::detail::IndependentGroupsAlong::const_iterator::operator*
ALPAKA_FN_ACC Idx operator*() const
Definition: workdivision.h:1092

cms::alpakatools::ElementIndex::global
Idx global
Definition: workdivision.h:86

cms::alpakatools::ElementIndex::local
Idx local
Definition: workdivision.h:87

cms::alpakatools::detail::IndependentGroupsAlong::const_iterator::extent_
Idx extent_
Definition: workdivision.h:1120

cms::alpakatools::detail::UniformElementsND::const_iterator::operator==
ALPAKA_FN_ACC constexpr bool operator==(const_iterator const &other) const
Definition: workdivision.h:455

cms::alpakatools::uniform_group_elements_y
ALPAKA_FN_ACC auto uniform_group_elements_y(TAcc const &acc, TArgs... args)
Definition: workdivision.h:1012

cms::alpakatools::detail::IndependentGroupsAlong::begin
ALPAKA_FN_ACC const_iterator begin() const
Definition: workdivision.h:1081

cms::alpakatools::detail::UniformElementsAlong::extent_
const Idx extent_
Definition: workdivision.h:247

cms::alpakatools::detail::IndependentGroupElementsAlong::const_iterator::index_
Idx index_
Definition: workdivision.h:1310

cms::alpakatools::detail::UniformElementsND::const_iterator::increment
ALPAKA_FN_ACC constexpr void increment()
Definition: workdivision.h:528

cms::alpakatools::detail::UniformElementsAlong::const_iterator::operator==
ALPAKA_FN_ACC bool operator==(const_iterator const &other) const
Definition: workdivision.h:226

cms::alpakatools::detail::UniformGroupsAlong::UniformGroupsAlong
ALPAKA_FN_ACC UniformGroupsAlong(TAcc const &acc, Idx extent)
Definition: workdivision.h:649

cms::alpakatools::detail::UniformElementsND::const_iterator::index_
Vec index_
Definition: workdivision.h:557

cms::alpakatools::detail::IndependentGroupElementsAlong::IndependentGroupElementsAlong
ALPAKA_FN_ACC IndependentGroupElementsAlong(TAcc const &acc, Idx extent)
Definition: workdivision.h:1234

cms::alpakatools::uniform_group_elements_along
ALPAKA_FN_ACC auto uniform_group_elements_along(TAcc const &acc, TArgs... args)
Definition: workdivision.h:992

cms::alpakatools::uniform_group_elements
ALPAKA_FN_ACC auto uniform_group_elements(TAcc const &acc, TArgs... args)
Definition: workdivision.h:978

cms::alpakatools::independent_groups
ALPAKA_FN_ACC auto independent_groups(TAcc const &acc, TArgs... args)
Definition: workdivision.h:1170

cms::alpakatools::detail::UniformElementsND::const_iterator::loop_
const UniformElementsND * loop_
Definition: workdivision.h:552

ALPAKA_ACCELERATOR_NAMESPACE::brokenline::constexpr
if constexpr(n > 3)
Definition: BrokenLine.h:164

cms::alpakatools::uniform_elements_x
ALPAKA_FN_ACC auto uniform_elements_x(TAcc const &acc, TArgs... args)
Definition: workdivision.h:337

cms::alpakatools::detail::UniformGroupElementsAlong::const_iterator::operator++
ALPAKA_FN_ACC const_iterator & operator++()
Definition: workdivision.h:891

cms::alpakatools::detail::UniformGroupsAlong::const_iterator::operator!=
ALPAKA_FN_ACC bool operator!=(const_iterator const &other) const
Definition: workdivision.h:691

alpaka_common
Definition: config.h:11

cms::alpakatools::detail::UniformElementsAlong::const_iterator::extent_
Idx extent_
Definition: workdivision.h:236

cms::alpakatools::detail::UniformGroupsAlong::const_iterator::stride_
Idx stride_
Definition: workdivision.h:695

cms::alpakatools::independent_groups_y
ALPAKA_FN_ACC auto independent_groups_y(TAcc const &acc, TArgs... args)
Definition: workdivision.h:1204

cms::alpakatools
Definition: HcalRecoParamWithPulseShapeHost.h:11

cms::alpakatools::detail::UniformGroupElementsAlong::const_iterator::operator==
ALPAKA_FN_ACC bool operator==(const_iterator const &other) const
Definition: workdivision.h:911

cms::alpakatools::independent_group_elements_y
ALPAKA_FN_ACC auto independent_group_elements_y(TAcc const &acc, TArgs... args)
Definition: workdivision.h:1363

cms::alpakatools::detail::IndependentGroupsAlong::stride_
const Idx stride_
Definition: workdivision.h:1127

riemannFit::stride
constexpr uint32_t stride
Definition: HelixFit.h:22

cms::alpakatools::detail::UniformElementsND::extent_
const Vec extent_
Definition: workdivision.h:564

cms::alpakatools::independent_groups_x
ALPAKA_FN_ACC auto independent_groups_x(TAcc const &acc, TArgs... args)
Definition: workdivision.h:1197

cms::alpakatools::independent_group_elements_z
ALPAKA_FN_ACC auto independent_group_elements_z(TAcc const &acc, TArgs... args)
Definition: workdivision.h:1370

gather_cfg.blocks
blocks
Definition: gather_cfg.py:90

cms::alpakatools::detail::UniformElementsND::const_iterator::const_iterator
ALPAKA_FN_ACC const_iterator(UniformElementsND const *loop, Vec first)
Definition: workdivision.h:465

alpaka_common::WorkDiv
alpaka::WorkDivMembers< TDim, Idx > WorkDiv
Definition: config.h:31

cms::alpakatools::detail::UniformGroupsAlong::const_iterator::operator++
ALPAKA_FN_ACC const_iterator & operator++()
Definition: workdivision.h:671

cms::alpakatools::detail::UniformElementsAlong::const_iterator::first_
Idx first_
Definition: workdivision.h:238

cms::alpakatools::detail::UniformElementsND::const_iterator::const_iterator
ALPAKA_FN_ACC const_iterator(UniformElementsND const *loop, at_end_t const &)
Definition: workdivision.h:472

cms::alpakatools::detail::UniformGroupsAlong::first_
const Idx first_
Definition: workdivision.h:702

cms::alpakatools::detail::UniformGroupElementsAlong::first_
const Idx first_
Definition: workdivision.h:924

cms::alpakatools::detail::IndependentGroupElementsAlong::thread_
const Idx thread_
Definition: workdivision.h:1316

cms::alpakatools::independent_groups_along
ALPAKA_FN_ACC auto independent_groups_along(TAcc const &acc, TArgs... args)
Definition: workdivision.h:1184

cms::alpakatools::detail::IndependentGroupElementsAlong::const_iterator::range_
Idx range_
Definition: workdivision.h:1311

cms::alpakatools::detail::IndependentGroupsAlong
Definition: workdivision.h:1066

cms::alpakatools::detail::IndependentGroupElementsAlong::stride_
const Idx stride_
Definition: workdivision.h:1317

detail
Definition: SiStripClustersSOA.h:8

cms::alpakatools::detail::IndependentGroupsAlong::const_iterator::operator++
ALPAKA_FN_ACC const_iterator operator++(int)
Definition: workdivision.h:1107

cms::alpakatools::uniform_elements_y
ALPAKA_FN_ACC auto uniform_elements_y(TAcc const &acc, TArgs... args)
Definition: workdivision.h:344

cms::alpakatools::detail::IndependentGroupElementsAlong::extent_
const Idx extent_
Definition: workdivision.h:1318

cms::alpakatools::detail::UniformElementsAlong::const_iterator::operator*
ALPAKA_FN_ACC Idx operator*() const
Definition: workdivision.h:194

cms::alpakatools::detail::UniformElementsAlong::first_
const Idx first_
Definition: workdivision.h:245

cms::alpakatools::detail::UniformElementsAlong::elements_
const Idx elements_
Definition: workdivision.h:244

cms::alpakatools::detail::UniformElementsAlong::stride_
const Idx stride_
Definition: workdivision.h:246

cms::alpakatools::detail::UniformElementsND::const_iterator::range_
Vec range_
Definition: workdivision.h:556

cms::alpakatools::detail::UniformElementsND::const_iterator::operator*
ALPAKA_FN_ACC Vec operator*() const
Definition: workdivision.h:440

cms::alpakatools::detail::IndependentGroupElementsAlong::elements_
const Idx elements_
Definition: workdivision.h:1315

trackingPlots.other
other
Definition: trackingPlots.py:1467

cms::alpakatools::detail::UniformElementsAlong
Definition: workdivision.h:155

cms::alpakatools::detail::UniformGroupsAlong::const_iterator::operator==
ALPAKA_FN_ACC bool operator==(const_iterator const &other) const
Definition: workdivision.h:689

cms::alpakatools::detail::UniformGroupsAlong::const_iterator::extent_
Idx extent_
Definition: workdivision.h:696

cms::alpakatools::round_up_by
constexpr Idx round_up_by(Idx value, Idx divisor)
Definition: workdivision.h:17

cms::alpakatools::detail::UniformElementsAlong::const_iterator::stride_
Idx stride_
Definition: workdivision.h:235

SiStripPI::min
Definition: SiStripPayloadInspectorHelper.h:178

cms::alpakatools::detail::UniformElementsND::Vec
alpaka::Vec< Dim, Idx > Vec
Definition: workdivision.h:400

cms::alpakatools::independent_group_elements_along
ALPAKA_FN_ACC auto independent_group_elements_along(TAcc const &acc, TArgs... args)
Definition: workdivision.h:1343

cms::alpakatools::detail::UniformGroupsAlong
Definition: workdivision.h:641

cms::alpakatools::detail::UniformElementsAlong::UniformElementsAlong
ALPAKA_FN_ACC UniformElementsAlong(TAcc const &acc, Idx first, Idx extent)
Definition: workdivision.h:169

cms::alpakatools::detail::IndependentGroupsAlong::first_
const Idx first_
Definition: workdivision.h:1126

Exhume::I
const std::complex< double > I
Definition: I.h:8

alpaka
Definition: HostOnlyTask.h:11

cms::alpakatools::detail::UniformElementsAlong::const_iterator::operator!=
ALPAKA_FN_ACC bool operator!=(const_iterator const &other) const
Definition: workdivision.h:230

cms::alpakatools::detail::IndependentGroupsAlong::const_iterator::stride_
Idx stride_
Definition: workdivision.h:1119

cms::alpakatools::detail::UniformGroupElementsAlong::const_iterator::const_iterator
ALPAKA_FN_ACC const_iterator(Idx local, Idx first, Idx range)
Definition: workdivision.h:884

cms::alpakatools::independent_group_elements_x
ALPAKA_FN_ACC auto independent_group_elements_x(TAcc const &acc, TArgs... args)
Definition: workdivision.h:1356

cms::alpakatools::detail::IndependentGroupsAlong::const_iterator::first_
Idx first_
Definition: workdivision.h:1122

value
Definition: value.py:1

cms::alpakatools::independent_groups_z
ALPAKA_FN_ACC auto independent_groups_z(TAcc const &acc, TArgs... args)
Definition: workdivision.h:1211

cms::alpakatools::detail::IndependentGroupElementsAlong::const_iterator::stride_
Idx stride_
Definition: workdivision.h:1306

cms::alpakatools::detail::UniformElementsND::at_end_t
Definition: workdivision.h:415

groupFilesInBlocks.block
block
Definition: groupFilesInBlocks.py:150

cms::alpakatools::detail::UniformGroupElementsAlong::begin
ALPAKA_FN_ACC const_iterator begin() const
Definition: workdivision.h:877

cms::alpakatools::detail::UniformElementsND
Definition: workdivision.h:397

cms::alpakatools::detail::UniformElementsAlong::begin
ALPAKA_FN_ACC const_iterator begin() const
Definition: workdivision.h:178

cms::alpakatools::uniform_groups_y
ALPAKA_FN_ACC auto uniform_groups_y(TAcc const &acc, TArgs... args)
Definition: workdivision.h:792

relativeConstraints.value
value
Definition: relativeConstraints.py:53

cms::alpakatools::detail::UniformElementsND::const_iterator::first_
Vec first_
Definition: workdivision.h:555

DTRecHitClients_cfi.local
local
Definition: DTRecHitClients_cfi.py:10

cms::alpakatools::detail::IndependentGroupElementsAlong::const_iterator::operator!=
ALPAKA_FN_ACC bool operator!=(const_iterator const &other) const
Definition: workdivision.h:1301

cms::alpakatools::detail::UniformElementsAlong::const_iterator::range_
Idx range_
Definition: workdivision.h:240

cms::alpakatools::detail::IndependentGroupsAlong::IndependentGroupsAlong
ALPAKA_FN_ACC IndependentGroupsAlong(TAcc const &acc)
Definition: workdivision.h:1068

cms::alpakatools::uniform_group_elements_z
ALPAKA_FN_ACC auto uniform_group_elements_z(TAcc const &acc, TArgs... args)
Definition: workdivision.h:1019

cms::alpakatools::detail::UniformGroupElementsAlong::UniformGroupElementsAlong
ALPAKA_FN_ACC UniformGroupElementsAlong(TAcc const &acc, Idx block)
Definition: workdivision.h:861

cms::alpakatools::detail::UniformElementsAlong::const_iterator::const_iterator
ALPAKA_FN_ACC const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
Definition: workdivision.h:185

cms::alpakatools::detail::UniformElementsND::stride_
const Vec stride_
Definition: workdivision.h:563

N
#define N
Definition: blowfish.cc:9

cms::alpakatools::detail::IndependentGroupElementsAlong::const_iterator
Definition: workdivision.h:1253

cms::alpakatools::detail::IndependentGroupsAlong::const_iterator::operator==
ALPAKA_FN_ACC bool operator==(const_iterator const &other) const
Definition: workdivision.h:1113

cms::alpakatools::detail::UniformGroupsAlong::extent_
const Idx extent_
Definition: workdivision.h:704

cms::alpakatools::requires_single_thread_per_block
Definition: workdivision.h:24

cms::alpakatools::detail::UniformElementsAlong::const_iterator
Definition: workdivision.h:182

cms::alpakatools::uniform_groups_along
ALPAKA_FN_ACC auto uniform_groups_along(TAcc const &acc, TArgs... args)
Definition: workdivision.h:773

cms::alpakatools::detail::UniformElementsAlong::const_iterator::elements_
Idx elements_
Definition: workdivision.h:234

cms::alpakatools::detail::UniformElementsND::end
ALPAKA_FN_ACC const_iterator end() const
Definition: workdivision.h:431

cms::alpakatools::detail::UniformElementsND::const_iterator::nth_strided_loop
ALPAKA_FN_ACC constexpr bool nth_strided_loop()
Definition: workdivision.h:501

alpaka_common::Vec
alpaka::Vec< TDim, Idx > Vec
Definition: config.h:24

cms::alpakatools::detail::UniformElementsND::const_iterator::do_elements_loops
ALPAKA_FN_ACC constexpr bool do_elements_loops()
Definition: workdivision.h:487

cms::alpakatools::uniform_group_elements_x
ALPAKA_FN_ACC auto uniform_group_elements_x(TAcc const &acc, TArgs... args)
Definition: workdivision.h:1005

cms::alpakatools::once_per_grid
ALPAKA_FN_ACC constexpr bool once_per_grid(TAcc const &acc)
Definition: workdivision.h:1382

cms::alpakatools::uniform_groups_z
ALPAKA_FN_ACC auto uniform_groups_z(TAcc const &acc, TArgs... args)
Definition: workdivision.h:799

writedatasetfile.args
args
Definition: writedatasetfile.py:18

cms::alpakatools::detail::IndependentGroupsAlong::const_iterator::operator++
ALPAKA_FN_ACC const_iterator & operator++()
Definition: workdivision.h:1095

cms::alpakatools::detail::UniformElementsND::Dim
alpaka::Dim< TAcc > Dim
Definition: workdivision.h:399

cms::alpakatools::detail::IndependentGroupsAlong::extent_
const Idx extent_
Definition: workdivision.h:1128

dqmdumpme.first
first
Definition: dqmdumpme.py:55

cms::alpakatools::detail::UniformGroupsAlong::const_iterator::first_
Idx first_
Definition: workdivision.h:698

cms::alpakatools::detail::UniformGroupElementsAlong::const_iterator
Definition: workdivision.h:881

cms::alpakatools::detail::UniformElementsND::UniformElementsND
ALPAKA_FN_ACC UniformElementsND(TAcc const &acc, Vec extent)
Definition: workdivision.h:408

cms::alpakatools::detail::UniformGroupsAlong::UniformGroupsAlong
ALPAKA_FN_ACC UniformGroupsAlong(TAcc const &acc)
Definition: workdivision.h:643

cms::alpakatools::detail::UniformGroupElementsAlong::const_iterator::range_
Idx range_
Definition: workdivision.h:920

cms::alpakatools::detail::IndependentGroupElementsAlong::const_iterator::operator==
ALPAKA_FN_ACC bool operator==(const_iterator const &other) const
Definition: workdivision.h:1297

cms::alpakatools::uniform_groups
ALPAKA_FN_ACC auto uniform_groups(TAcc const &acc, TArgs... args)
Definition: workdivision.h:759

cms::alpakatools::detail::UniformGroupElementsAlong::local_
const Idx local_
Definition: workdivision.h:925

cms::alpakatools::detail::UniformGroupsAlong::const_iterator
Definition: workdivision.h:661

cms::alpakatools::detail::UniformGroupElementsAlong::UniformGroupElementsAlong
ALPAKA_FN_ACC UniformGroupElementsAlong(TAcc const &acc, Idx block, Idx extent)
Definition: workdivision.h:867

cms::alpakatools::detail::UniformElementsND::const_iterator::operator++
ALPAKA_FN_ACC constexpr const_iterator operator++(int)
Definition: workdivision.h:449

cms::alpakatools::detail::UniformElementsND::elements_
const Vec elements_
Definition: workdivision.h:561

cms::alpakatools::detail::UniformGroupsAlong::const_iterator::operator*
ALPAKA_FN_ACC Idx operator*() const
Definition: workdivision.h:668

config.h

cms::alpakatools::detail::UniformElementsAlong::const_iterator::operator++
ALPAKA_FN_ACC const_iterator & operator++()
Definition: workdivision.h:197

cms::alpakatools::detail::UniformElementsND::UniformElementsND
ALPAKA_FN_ACC UniformElementsND(TAcc const &acc)
Definition: workdivision.h:402

cms::alpakatools::detail::UniformElementsAlong::const_iterator::operator++
ALPAKA_FN_ACC const_iterator operator++(int)
Definition: workdivision.h:220

cms::alpakatools::detail::UniformGroupElementsAlong::const_iterator::operator!=
ALPAKA_FN_ACC bool operator!=(const_iterator const &other) const
Definition: workdivision.h:913

isotrackApplyRegressor.range
range
Definition: isotrackApplyRegressor.py:124

cms::alpakatools::requires_single_thread_per_block_v
constexpr bool requires_single_thread_per_block_v
Definition: workdivision.h:43

cms::alpakatools::detail::UniformElementsND::begin
ALPAKA_FN_ACC const_iterator begin() const
Definition: workdivision.h:420

cms::alpakatools::detail::UniformGroupsAlong::const_iterator::const_iterator
ALPAKA_FN_ACC const_iterator(Idx stride, Idx extent, Idx first)
Definition: workdivision.h:664

bookConverter.elements
elements
Definition: bookConverter.py:147

cms::alpakatools::detail::UniformElementsAlong::const_iterator::index_
Idx index_
Definition: workdivision.h:239

cms::alpakatools::detail::UniformGroupsAlong::const_iterator::operator++
ALPAKA_FN_ACC const_iterator operator++(int)
Definition: workdivision.h:683

cms::alpakatools::detail::UniformElementsND::thread_
const Vec thread_
Definition: workdivision.h:562

cms::alpakatools::detail::IndependentGroupElementsAlong::const_iterator::first_
Idx first_
Definition: workdivision.h:1309

cms::alpakatools::detail::IndependentGroupElementsAlong::const_iterator::extent_
Idx extent_
Definition: workdivision.h:1307

cms::alpakatools::detail::IndependentGroupsAlong::const_iterator::operator!=
ALPAKA_FN_ACC bool operator!=(const_iterator const &other) const
Definition: workdivision.h:1115

cms::alpakatools::detail::UniformElementsND::const_iterator
Definition: workdivision.h:436

cms::alpakatools::detail::IndependentGroupsAlong::end
ALPAKA_FN_ACC const_iterator end() const
Definition: workdivision.h:1083

cms::alpakatools::detail::IndependentGroupElementsAlong::begin
ALPAKA_FN_ACC const_iterator begin() const
Definition: workdivision.h:1249

cms::alpakatools::ElementIndex
Definition: workdivision.h:85