d9/d97/workdivision_8h_source.html

 #ifndef HeterogeneousCore_AlpakaInterface_interface_workdivision_h
 #define HeterogeneousCore_AlpakaInterface_interface_workdivision_h

 #include <algorithm>
 #include <cstddef>
 #include <type_traits>

 #include <alpaka/alpaka.hpp>

 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
 #include "HeterogeneousCore/AlpakaInterface/interface/traits.h"

 namespace cms::alpakatools {

   using namespace alpaka_common;

   // Round `value` up to the nearest multiple of `divisor`; a value that is already a multiple is returned unchanged.
   // The quotient and the remainder are used instead of `value + divisor - 1`, so no intermediate sum can wrap around
   // for values close to the maximum of Idx. The result itself can still overflow if the rounded-up value is not
   // representable.
   // `divisor` must not be zero.
   // NOTE(review): assumes Idx is an unsigned (or at least never negative) index type - confirm against config.h.
   inline constexpr Idx round_up_by(Idx value, Idx divisor) {
     return (value / divisor + (value % divisor != 0 ? Idx{1} : Idx{0})) * divisor;
   }

   // Return the integer division of `value` by `divisor`, rounded up to the next integer (ceiling division).
   // The quotient and the remainder are used instead of `value + divisor - 1`, so no intermediate sum can wrap around
   // for values close to the maximum of Idx.
   // `divisor` must not be zero.
   // NOTE(review): assumes Idx is an unsigned (or at least never negative) index type - confirm against config.h.
   inline constexpr Idx divide_up_by(Idx value, Idx divisor) {
     return value / divisor + (value % divisor != 0 ? Idx{1} : Idx{0});
   }

   // Trait telling whether an accelerator is expected to run with a single thread per block, i.e. whether the
   // threads-per-block and elements-per-thread values have to be swapped when building its work division.
   // The primary template answers "true"; the specialisations below opt individual back-ends out.
   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
   struct requires_single_thread_per_block : std::true_type {};

 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
   // CUDA back-end: no swap
   template <typename TDimension>
   struct requires_single_thread_per_block<alpaka::AccGpuCudaRt<TDimension, Idx>> : std::false_type {};
 #endif  // defined(ALPAKA_ACC_GPU_CUDA_ENABLED)

 #if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
   // HIP back-end: no swap
   template <typename TDimension>
   struct requires_single_thread_per_block<alpaka::AccGpuHipRt<TDimension, Idx>> : std::false_type {};
 #endif  // defined(ALPAKA_ACC_GPU_HIP_ENABLED)

 #if defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
   // CPU back-end based on AccCpuThreads: no swap
   template <typename TDimension>
   struct requires_single_thread_per_block<alpaka::AccCpuThreads<TDimension, Idx>> : std::false_type {};
 #endif  // defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)

   // Whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped.
   // Convenience variable template holding requires_single_thread_per_block<TAcc>::value, so that it can be used
   // directly in `if constexpr` conditions; it is only enabled when alpaka::isAccelerator<TAcc> is true.
   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
   inline constexpr bool requires_single_thread_per_block_v = requires_single_thread_per_block<TAcc>::value;

   // Build the 1-dimensional work division matching the accelerator type TAcc:
   //   - `blocks` is the number of blocks in the grid;
   //   - `elements` is the number of elements per block, mapped either to threads or to elements per thread
   //     depending on requires_single_thread_per_block_v<TAcc>.
   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   inline WorkDiv<Dim1D> make_workdiv(Idx blocks, Idx elements) {
     if constexpr (requires_single_thread_per_block_v<TAcc>) {
       // Back-ends that run a single thread per block process the elements serially:
       //   - threads per block:   always 1;
       //   - elements per thread: "elements".
       return WorkDiv<Dim1D>(blocks, Idx{1}, elements);
     } else {
       // Back-ends that run many threads per block give one element to each thread:
       //   - threads per block:   "elements";
       //   - elements per thread: always 1.
       return WorkDiv<Dim1D>(blocks, elements, Idx{1});
     }
   }

   // Build the N-dimensional work division matching the accelerator type TAcc:
   //   - `blocks` is the number of blocks in the grid, in each dimension;
   //   - `elements` is the number of elements per block, in each dimension, mapped either to threads or to elements
   //     per thread depending on requires_single_thread_per_block_v<TAcc>.
   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
   inline WorkDiv<alpaka::Dim<TAcc>> make_workdiv(const Vec<alpaka::Dim<TAcc>>& blocks,
                                                  const Vec<alpaka::Dim<TAcc>>& elements) {
     using AccDim = alpaka::Dim<TAcc>;
     if constexpr (requires_single_thread_per_block_v<TAcc>) {
       // Back-ends that run a single thread per block process the elements serially:
       //   - threads per block:   always 1 in every dimension;
       //   - elements per thread: "elements".
       return WorkDiv<AccDim>(blocks, Vec<AccDim>::ones(), elements);
     } else {
       // Back-ends that run many threads per block give one element to each thread:
       //   - threads per block:   "elements";
       //   - elements per thread: always 1 in every dimension.
       return WorkDiv<AccDim>(blocks, elements, Vec<AccDim>::ones());
     }
   }

   /* ElementIndex
    *
    * an aggregate that contains the .global and .local indices of an element; returned by iterating over
    * elements_in_block.
    */

   struct ElementIndex {
     // index of the element within the whole problem: the index of the first element of the block plus .local
     Idx global;
     // index of the element within its block
     Idx local;
   };

   /* elements_with_stride
    *
    * `elements_with_stride(acc, [first, ]extent)` returns an iterable range that spans the element indices required to
    * cover the given problem size:
    *   - `first` (optional) is the index of the first element; if not specified, the loop starts from 0;
    *   - `extent` is the total size of the problem, including any elements that may come before `first`.
    *
    * `elements_with_stride(acc)`, without an extent, uses the grid stride (number of threads in the grid times the
    * number of elements per thread) as the extent.
    */

   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   class elements_with_stride {
   public:
     // range whose extent is equal to the grid stride
     ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc)
         : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
           first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
           stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
           extent_{stride_} {}

     // range over [0, extent): identical to the three-argument form with `first` equal to 0
     ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx extent)
         : elements_with_stride(acc, Idx{0}, extent) {}

     // range over [first, extent)
     ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx first, Idx extent)
         : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
           first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_ + first},
           stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
           extent_{extent} {}

     class iterator {
       friend class elements_with_stride;

       // only the enclosing range can create iterators; `first` is clamped to `extent`
       ALPAKA_FN_ACC inline iterator(Idx elements, Idx stride, Idx extent, Idx first)
           : elements_{elements},
             stride_{stride},
             extent_{extent},
             first_{std::min(first, extent)},
             index_{first_},
             range_{std::min(first + elements, extent)} {}

     public:
       // index of the current element
       ALPAKA_FN_ACC inline Idx operator*() const { return index_; }

       // pre-increment the iterator
       ALPAKA_FN_ACC inline iterator& operator++() {
         if constexpr (requires_single_thread_per_block_v<TAcc>) {
           // first try the next element in the chunk owned by the current thread
           if (++index_ < range_) {
             return *this;
           }
         }

         // move the start of the chunk forward by the grid stride
         first_ += stride_;
         if (first_ < extent_) {
           index_ = first_;
           range_ = std::min(first_ + elements_, extent_);
         } else {
           // reached or passed the end of the extent: clamp everything to the extent
           first_ = extent_;
           index_ = extent_;
           range_ = extent_;
         }
         return *this;
       }

       // post-increment the iterator
       ALPAKA_FN_ACC inline iterator operator++(int) {
         iterator previous = *this;
         ++(*this);
         return previous;
       }

       ALPAKA_FN_ACC inline bool operator==(iterator const& other) const {
         return (first_ == other.first_) and (index_ == other.index_);
       }

       ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const {
         return (first_ != other.first_) or (index_ != other.index_);
       }

     private:
       // fixed after construction; non-const to support iterator copy and assignment
       Idx elements_;
       Idx stride_;
       Idx extent_;
       // updated by the increment operators
       Idx first_;  // first index of the chunk being processed
       Idx index_;  // current index
       Idx range_;  // one past the last index of the chunk being processed
     };

     ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); }

     ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }

   private:
     const Idx elements_;  // elements per thread
     const Idx first_;     // first index processed by the current thread
     const Idx stride_;    // threads in the grid times elements per thread
     const Idx extent_;    // end of the range
   };

   /* elements_with_stride_nd
    *
    * `elements_with_stride_nd(acc[, extent])` returns an iterable range that spans the N-dimensional element indices
    * processed by the current thread:
    *   - `extent` (optional) is the N-dimensional size of the problem; if not specified, the grid stride (number of
    *     threads in the grid times the number of elements per thread, in each dimension) is used.
    *
    * Iterating over the range yields the index of each element as an `alpaka::Vec<Dim, Idx>`. The last dimension is
    * advanced first; when it overflows it is reset and the previous dimension is advanced, and so on.
    *
    * The iterators keep a pointer to the range they were created from, so the range must outlive them.
    */
   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
   class elements_with_stride_nd {
   public:
     using Dim = alpaka::Dim<TAcc>;
     using Vec = alpaka::Vec<Dim, Idx>;

     // range whose extent is equal to the grid stride
     ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc)
         : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
           thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
           stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
           extent_{stride_} {}

     // range covering the N-dimensional problem size `extent`
     ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc, Vec extent)
         : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
           thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
           stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
           extent_{extent} {}

     // tag used to construct an end iterator
     struct at_end_t {};

     class iterator {
       friend class elements_with_stride_nd;

     public:
       // N-dimensional index of the current element
       ALPAKA_FN_ACC inline Vec operator*() const { return index_; }

       // pre-increment the iterator; returns a reference to the incremented iterator, not a copy
       ALPAKA_FN_ACC constexpr inline iterator& operator++() {
         increment();
         return *this;
       }

       // post-increment the iterator; returns a copy of the iterator as it was before the increment
       ALPAKA_FN_ACC constexpr inline iterator operator++(int) {
         iterator old = *this;
         increment();
         return old;
       }

       ALPAKA_FN_ACC constexpr inline bool operator==(iterator const& other) const { return (index_ == other.index_); }

       ALPAKA_FN_ACC constexpr inline bool operator!=(iterator const& other) const { return not(*this == other); }

     private:
       // construct an iterator pointing to the first element to be processed by the current thread
       ALPAKA_FN_ACC inline iterator(elements_with_stride_nd const* loop, Vec first)
           : loop_{loop},
             first_{alpaka::elementwise_min(first, loop->extent_)},
             range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)},
             index_{first_} {}

       // construct an end iterator, pointing past the end of the extent
       ALPAKA_FN_ACC inline iterator(elements_with_stride_nd const* loop, at_end_t const&)
           : loop_{loop}, first_{loop_->extent_}, range_{loop_->extent_}, index_{loop_->extent_} {}

       // advance the element index along dimension I; on overflow reset it to the first element of the chunk and
       // return true
       template <size_t I>
       ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() {
         bool overflow = false;
         ++index_[I];
         if (index_[I] >= range_[I]) {
           index_[I] = first_[I];
           overflow = true;
         }
         return overflow;
       }

       // advance the element index starting from dimension N-1 and carrying into the lower dimensions;
       // return true if all N dimensions have overflowed
       template <size_t N>
       ALPAKA_FN_ACC inline constexpr bool do_elements_loops() {
         if constexpr (N == 0) {
           // overflow
           return true;
         } else {
           if (not nth_elements_loop<N - 1>()) {
             return false;
           } else {
             return do_elements_loops<N - 1>();
           }
         }
       }

       // advance the chunk along dimension I by the grid stride; on overflow reset it to the thread's initial position
       // and return true; in both cases update the index and the end of the chunk for this dimension
       template <size_t I>
       ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() {
         bool overflow = false;
         first_[I] += loop_->stride_[I];
         if (first_[I] >= loop_->extent_[I]) {
           first_[I] = loop_->thread_[I];
           overflow = true;
         }
         index_[I] = first_[I];
         range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]);
         return overflow;
       }

       // advance the chunk starting from dimension N-1 and carrying into the lower dimensions;
       // return true if all N dimensions have overflowed
       template <size_t N>
       ALPAKA_FN_ACC inline constexpr bool do_strided_loops() {
         if constexpr (N == 0) {
           // overflow
           return true;
         } else {
           if (not nth_strided_loop<N - 1>()) {
             return false;
           } else {
             return do_strided_loops<N - 1>();
           }
         }
       }

       // increment the iterator
       ALPAKA_FN_ACC inline constexpr void increment() {
         if constexpr (requires_single_thread_per_block_v<TAcc>) {
           // linear N-dimensional loops over the elements associated to the thread;
           // do_elements_loops<>() returns true if all of those loops overflow
           if (not do_elements_loops<Dim::value>()) {
             // the elements loops did not overflow, return the next index
             return;
           }
         }

         // strided N-dimensional loop over the threads in the kernel launch grid;
         // do_strided_loops<>() returns true if all of those loops overflow
         if (not do_strided_loops<Dim::value>()) {
           // the strided loops did not overflow, return the next index
           return;
         }

         // the iterator has reached or passed the end of the extent, clamp it to the extent
         first_ = loop_->extent_;
         range_ = loop_->extent_;
         index_ = loop_->extent_;
       }

       // const pointer to the elements_with_stride_nd that the iterator refers to
       const elements_with_stride_nd* loop_;

       // modified by the pre/post-increment operator
       Vec first_;  // first element of the chunk processed by this thread
       Vec range_;  // one past the last element of the chunk processed by this thread
       Vec index_;  // current element processed by this thread
     };

     ALPAKA_FN_ACC inline iterator begin() const {
       // check that all dimensions of the current thread index are within the extent
       if ((thread_ < extent_).all()) {
         // construct an iterator pointing to the first element to be processed by the current thread
         return iterator{this, thread_};
       } else {
         // construct an end iterator, pointing past the end of the extent
         return iterator{this, at_end_t{}};
       }
     }

     ALPAKA_FN_ACC inline iterator end() const {
       // construct an end iterator, pointing past the end of the extent
       return iterator{this, at_end_t{}};
     }

   private:
     const Vec elements_;  // elements per thread, in each dimension
     const Vec thread_;    // first element processed by the current thread
     const Vec stride_;    // threads in the grid times elements per thread, in each dimension
     const Vec extent_;    // end of the range, in each dimension
   };

   /* blocks_with_stride
    *
    * `blocks_with_stride(acc, size)` returns a range that spans the (virtual) block indices required to cover the given
    * problem size.
    *
    * For example, if size is 1000 and the block size is 16, it will return the range from 0 to 62.
    * If the work division has more than 63 blocks, only the first 63 will perform one iteration of the loop, and the
    * others will exit immediately.
    * If the work division has less than 63 blocks, some of the blocks will perform more than one iteration, in order to
    * cover the whole problem space.
    *
    * All threads in a block see the same loop iterations, while threads in different blocks may see a different number
    * of iterations.
    */

   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   class blocks_with_stride {
   public:
     // without a size, the range has as many entries as there are blocks in the grid
     ALPAKA_FN_ACC inline blocks_with_stride(TAcc const& acc)
         : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
           stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
           extent_{stride_} {}

     // `extent` is the total number of elements (not blocks); it is converted to a number of blocks, rounding up
     ALPAKA_FN_ACC inline blocks_with_stride(TAcc const& acc, Idx extent)
         : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
           stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
           extent_{divide_up_by(extent, alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u])} {}

     class iterator {
       friend class blocks_with_stride;

       // only the enclosing range can create iterators; `first` is clamped to `extent`
       ALPAKA_FN_ACC inline iterator(Idx stride, Idx extent, Idx first)
           : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}

     public:
       // index of the current (virtual) block
       ALPAKA_FN_ACC inline Idx operator*() const { return first_; }

       // pre-increment the iterator
       ALPAKA_FN_ACC inline iterator& operator++() {
         // move to the next (virtual) block handled by this block, one grid further
         first_ += stride_;
         if (first_ >= extent_) {
           // reached or passed the end of the extent: clamp to the extent
           first_ = extent_;
         }
         return *this;
       }

       // post-increment the iterator
       ALPAKA_FN_ACC inline iterator operator++(int) {
         iterator previous = *this;
         ++(*this);
         return previous;
       }

       ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { return first_ == other.first_; }

       ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return first_ != other.first_; }

     private:
       // fixed after construction; non-const to support iterator copy and assignment
       Idx stride_;
       Idx extent_;
       // updated by the increment operators
       Idx first_;
     };

     ALPAKA_FN_ACC inline iterator begin() const { return iterator(stride_, extent_, first_); }

     ALPAKA_FN_ACC inline iterator end() const { return iterator(stride_, extent_, extent_); }

   private:
     const Idx first_;   // index of the current block within the grid
     const Idx stride_;  // number of blocks in the grid
     const Idx extent_;  // number of (virtual) blocks to cover
   };

   /* elements_in_block
    *
    * `elements_in_block(acc, block[, extent])` returns a range that spans all the elements within the given block:
    *   - `block` is the index of the (virtual) block, for example as obtained from `blocks_with_stride`;
    *   - `extent` (optional) is the total number of elements, across all blocks.
    * Iterating over the range yields values of type ElementIndex, that contain both .global and .local indices
    * of the corresponding element.
    *
    * If the work division has only one element per thread, the loop will perform at most one iteration.
    * If the work division has more than one element per thread, the loop will perform that number of iterations,
    * or less if it reaches `extent`.
    *
    * If the block starts at or past `extent`, the range is empty.
    */

   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   class elements_in_block {
   public:
     // range over the elements of the current thread within `block`, without any upper bound
     ALPAKA_FN_ACC inline elements_in_block(TAcc const& acc, Idx block)
         : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]},
           local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] *
                  alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
           range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]} {}

     // range over the elements of the current thread within `block`, limited to global indices below `extent`;
     // the number of elements left in the block is computed without underflow, so a block that starts past `extent`
     // yields an empty range instead of out-of-range indices
     ALPAKA_FN_ACC inline elements_in_block(TAcc const& acc, Idx block, Idx extent)
         : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u]},
           local_{std::min(remaining(extent, first_),
                           alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] *
                               alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u])},
           range_{std::min(remaining(extent, first_),
                           local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u])} {}

     class iterator {
       friend class elements_in_block;

       ALPAKA_FN_ACC inline iterator(Idx local, Idx first, Idx range) : index_{local}, first_{first}, range_{range} {}

     public:
       // global and local indices of the current element
       ALPAKA_FN_ACC inline ElementIndex operator*() const { return ElementIndex{index_ + first_, index_}; }

       // pre-increment the iterator
       ALPAKA_FN_ACC inline iterator& operator++() {
         if constexpr (requires_single_thread_per_block_v<TAcc>) {
           // increment the index along the elements processed by the current thread
           ++index_;
           if (index_ < range_)
             return *this;
         }

         // the iterator has reached or passed the end of the extent, clamp it to the extent
         index_ = range_;
         return *this;
       }

       // post-increment the iterator
       ALPAKA_FN_ACC inline iterator operator++(int) {
         iterator old = *this;
         ++(*this);
         return old;
       }

       ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { return (index_ == other.index_); }

       ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }

     private:
       // modified by the pre/post-increment operator
       Idx index_;
       // non-const to support iterator copy and assignment
       Idx first_;
       Idx range_;
     };

     ALPAKA_FN_ACC inline iterator begin() const { return iterator(local_, first_, range_); }

     ALPAKA_FN_ACC inline iterator end() const { return iterator(range_, first_, range_); }

   private:
     // number of elements between `first` and `extent`, or 0 if `first` is at or past `extent`
     ALPAKA_FN_ACC static inline constexpr Idx remaining(Idx extent, Idx first) {
       return first < extent ? extent - first : Idx{0};
     }

     const Idx first_;  // global index of the first element of the block
     const Idx local_;  // local index of the first element processed by the current thread
     const Idx range_;  // one past the local index of the last element processed by the current thread
   };

   /* uniform_groups
    *
    * `uniform_groups(acc, elements)` returns a range that spans the group indices required to cover the given problem
    * size, in units of the block size:
    *   - the `elements` argument indicates the total number of elements, across all groups.
    *
    * `uniform_groups` should be called consistently by all the threads in a block. All threads in a block see the same
    * loop iterations, while threads in different blocks may see a different number of iterations.
    *
    * For example, if `elements` is 1000 and the block size is 16,
    *
    *   for (auto group: uniform_groups(acc, 1000))
    *
    * will return the range from 0 to 62, split across all blocks in the work division.
    *
    * If the work division has more than 63 blocks, the first 63 will perform one iteration of the loop, while the other
    * blocks will exit immediately.
    * If the work division has less than 63 blocks, some of the blocks will perform more than one iteration, in order to
    * cover the whole problem space.
    *
    * This is an alias for `blocks_with_stride`.
    */

   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   using uniform_groups = blocks_with_stride<TAcc>;

   /* uniform_group_elements
    *
    * `uniform_group_elements(acc, group, elements)` returns a range that spans all the elements within the given group:
    *   - the `group` argument indicates the id of the current group, for example as obtained from `uniform_groups`;
    *   - the `elements` argument indicates the total number of elements, across all groups.
    *
    * Iterating over the range yields values of type `ElementIndex`, that contain the `.global` and `.local` indices of
    * the corresponding element.
    *
    * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier when the
    * element index reaches `elements`.
    *
    * This is an alias for `elements_in_block`.
    */

   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   using uniform_group_elements = elements_in_block<TAcc>;

   /* independent_groups
    *
    * `independent_groups(acc, groups)` returns a range that spans the group indices from 0 to `groups`, with one group
    * per block:
    *   - the `groups` argument indicates the total number of groups.
    *
    * If the work division has more blocks than `groups`, only the first `groups` blocks will perform one iteration of
    * the loop, while the other blocks will exit immediately.
    * If the work division has less blocks than `groups`, some of the blocks will perform more than one iteration, in
    * order to cover the whole problem space.
    */

   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   class independent_groups {
   public:
     // without an argument, the number of groups is the number of blocks in the grid
     ALPAKA_FN_ACC inline independent_groups(TAcc const& acc)
         : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
           stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
           extent_{stride_} {}

     // `groups` is the total number of groups, used directly as the end of the range
     ALPAKA_FN_ACC inline independent_groups(TAcc const& acc, Idx groups)
         : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
           stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]},
           extent_{groups} {}

     class iterator {
       friend class independent_groups;

       // only the enclosing range can create iterators; `first` is clamped to `extent`
       ALPAKA_FN_ACC inline iterator(Idx stride, Idx extent, Idx first)
           : stride_{stride}, extent_{extent}, first_{std::min(first, extent)} {}

     public:
       // index of the current group
       ALPAKA_FN_ACC inline Idx operator*() const { return first_; }

       // pre-increment the iterator
       ALPAKA_FN_ACC inline iterator& operator++() {
         // move to the next group handled by this block, one grid further
         first_ += stride_;
         if (first_ >= extent_) {
           // reached or passed the end of the extent: clamp to the extent
           first_ = extent_;
         }
         return *this;
       }

       // post-increment the iterator
       ALPAKA_FN_ACC inline iterator operator++(int) {
         iterator previous = *this;
         ++(*this);
         return previous;
       }

       ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { return first_ == other.first_; }

       ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return first_ != other.first_; }

     private:
       // fixed after construction; non-const to support iterator copy and assignment
       Idx stride_;
       Idx extent_;
       // updated by the increment operators
       Idx first_;
     };

     ALPAKA_FN_ACC inline iterator begin() const { return iterator(stride_, extent_, first_); }

     ALPAKA_FN_ACC inline iterator end() const { return iterator(stride_, extent_, extent_); }

   private:
     const Idx first_;   // index of the current block within the grid
     const Idx stride_;  // number of blocks in the grid
     const Idx extent_;  // number of groups to cover
   };

   /* independent_group_elements
    *
    * `independent_group_elements(acc, elements)` returns a range that spans all the elements within the given group:
    *   - the `elements` argument indicates the number of elements in the current group.
    *
    * Iterating over the range yields the local element index, between `0` and `elements - 1`. The threads in the block
    * will perform one or more iterations, depending on the number of elements per thread, and on the number of threads
    * per block, compared with the total number of elements.
    *
    * `independent_group_elements(acc)`, without an argument, uses the block stride (number of threads in the block
    * times the number of elements per thread) as the number of elements.
    */

   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
   class independent_group_elements {
   public:
     // range whose extent is equal to the block stride
     ALPAKA_FN_ACC inline independent_group_elements(TAcc const& acc)
         : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
           thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_},
           stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_},
           extent_{stride_} {}

     // range over [0, extent)
     ALPAKA_FN_ACC inline independent_group_elements(TAcc const& acc, Idx extent)
         : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
           thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_},
           stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u] * elements_},
           extent_{extent} {}

     class iterator {
       friend class independent_group_elements;

       // only the enclosing range can create iterators; `first` is clamped to `extent`
       ALPAKA_FN_ACC inline iterator(Idx elements, Idx stride, Idx extent, Idx first)
           : elements_{elements},
             stride_{stride},
             extent_{extent},
             first_{std::min(first, extent)},
             index_{first_},
             range_{std::min(first + elements, extent)} {}

     public:
       // local index of the current element
       ALPAKA_FN_ACC inline Idx operator*() const { return index_; }

       // pre-increment the iterator
       ALPAKA_FN_ACC inline iterator& operator++() {
         if constexpr (requires_single_thread_per_block_v<TAcc>) {
           // first try the next element in the chunk owned by the current thread
           if (++index_ < range_) {
             return *this;
           }
         }

         // move the start of the chunk forward by the block stride
         first_ += stride_;
         if (first_ < extent_) {
           index_ = first_;
           range_ = std::min(first_ + elements_, extent_);
         } else {
           // reached or passed the end of the extent: clamp everything to the extent
           first_ = extent_;
           index_ = extent_;
           range_ = extent_;
         }
         return *this;
       }

       // post-increment the iterator
       ALPAKA_FN_ACC inline iterator operator++(int) {
         iterator previous = *this;
         ++(*this);
         return previous;
       }

       ALPAKA_FN_ACC inline bool operator==(iterator const& other) const {
         return (first_ == other.first_) and (index_ == other.index_);
       }

       ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const {
         return (first_ != other.first_) or (index_ != other.index_);
       }

     private:
       // fixed after construction; non-const to support iterator copy and assignment
       Idx elements_;
       Idx stride_;
       Idx extent_;
       // updated by the increment operators
       Idx first_;  // first index of the chunk being processed
       Idx index_;  // current index
       Idx range_;  // one past the last index of the chunk being processed
     };

     ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, thread_); }

     ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }

   private:
     const Idx elements_;  // elements per thread
     const Idx thread_;    // first local index processed by the current thread
     const Idx stride_;    // threads in the block times elements per thread
     const Idx extent_;    // end of the range
   };

   /* once_per_grid
    *
    * `once_per_grid(acc)` returns true for a single thread within the kernel execution grid.
    *
    * The current implementation selects the thread whose index within the grid is zero in every dimension, but callers
    * should not rely on which thread is selected.
    */

   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
   ALPAKA_FN_ACC inline constexpr bool once_per_grid(TAcc const& acc) {
     using Index = Vec<alpaka::Dim<TAcc>>;
     return alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) == Index::zeros();
   }

   /* once_per_block
    *
    * `once_per_block(acc)` returns true for a single thread within the block.
    *
    * The current implementation selects the thread whose index within the block is zero in every dimension, but
    * callers should not rely on which thread is selected.
    */

   template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
   ALPAKA_FN_ACC inline constexpr bool once_per_block(TAcc const& acc) {
     using Index = Vec<alpaka::Dim<TAcc>>;
     return alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc) == Index::zeros();
   }

 }  // namespace cms::alpakatools

 #endif  // HeterogeneousCore_AlpakaInterface_interface_workdivision_h
cms::alpakatools::blocks_with_stride::iterator::stride_
Idx stride_
Definition: workdivision.h:416

cms::alpakatools::elements_in_block::iterator::range_
Idx range_
Definition: workdivision.h:497

cms::alpakatools::elements_with_stride_nd::iterator::nth_strided_loop
ALPAKA_FN_ACC constexpr bool nth_strided_loop()
Definition: workdivision.h:271

cms::alpakatools::independent_groups::iterator::operator==
ALPAKA_FN_ACC bool operator==(iterator const &other) const
Definition: workdivision.h:604

cms::alpakatools::independent_groups::independent_groups
ALPAKA_FN_ACC independent_groups(TAcc const &acc)
Definition: workdivision.h:565

cms::alpakatools::elements_with_stride_nd::elements_with_stride_nd
ALPAKA_FN_ACC elements_with_stride_nd(TAcc const &acc, Vec extent)
Definition: workdivision.h:201

cms::alpakatools::independent_group_elements::iterator::iterator
ALPAKA_FN_ACC iterator(Idx elements, Idx stride, Idx extent, Idx first)
Definition: workdivision.h:654

cms::alpakatools::once_per_block
ALPAKA_FN_ACC constexpr bool once_per_block(TAcc const &acc)
Definition: workdivision.h:743

cms::alpakatools::elements_with_stride_nd::iterator::operator==
ALPAKA_FN_ACC constexpr bool operator==(iterator const &other) const
Definition: workdivision.h:229

cms::alpakatools::independent_groups::iterator::iterator
ALPAKA_FN_ACC iterator(Idx stride, Idx extent, Idx first)
Definition: workdivision.h:579

cms::alpakatools::elements_with_stride::iterator::index_
Idx index_
Definition: workdivision.h:174

cms::alpakatools::elements_with_stride_nd::iterator::increment
ALPAKA_FN_ACC constexpr void increment()
Definition: workdivision.h:298

heppy_loop.loop
loop
Definition: heppy_loop.py:28

cms::alpakatools::elements_with_stride::first_
const Idx first_
Definition: workdivision.h:184

cms::alpakatools::elements_with_stride::elements_with_stride
ALPAKA_FN_ACC elements_with_stride(TAcc const &acc)
Definition: workdivision.h:99

cms::alpakatools::independent_groups::iterator::stride_
Idx stride_
Definition: workdivision.h:610

cms::alpakatools::independent_groups::iterator::extent_
Idx extent_
Definition: workdivision.h:611

cms::alpakatools::blocks_with_stride::extent_
const Idx extent_
Definition: workdivision.h:429

cms::alpakatools::elements_in_block::elements_in_block
ALPAKA_FN_ACC elements_in_block(TAcc const &acc, Idx block, Idx extent)
Definition: workdivision.h:452

cms::alpakatools::make_workdiv
WorkDiv< Dim1D > make_workdiv(Idx blocks, Idx elements)
Definition: workdivision.h:46

cms::alpakatools::independent_group_elements::iterator::first_
Idx first_
Definition: workdivision.h:707

python.cmstools.all
def all(container)
workaround iterator generators for ROOT classes
Definition: cmstools.py:25

cms::alpakatools::independent_groups
Definition: workdivision.h:563

cms::alpakatools::divide_up_by
constexpr Idx divide_up_by(Idx value, Idx divisor)
Definition: workdivision.h:19

cms::alpakatools::elements_with_stride::elements_with_stride
ALPAKA_FN_ACC elements_with_stride(TAcc const &acc, Idx extent)
Definition: workdivision.h:105

cms::alpakatools::elements_with_stride::iterator::elements_
Idx elements_
Definition: workdivision.h:169

cms::alpakatools::blocks_with_stride::first_
const Idx first_
Definition: workdivision.h:427

cms::alpakatools::blocks_with_stride::iterator::operator==
ALPAKA_FN_ACC bool operator==(iterator const &other) const
Definition: workdivision.h:410

cms::alpakatools::elements_with_stride_nd::iterator
Definition: workdivision.h:210

cms::alpakatools::elements_in_block::begin
ALPAKA_FN_ACC iterator begin() const
Definition: workdivision.h:500

cms::alpakatools::independent_groups::iterator::operator++
ALPAKA_FN_ACC iterator & operator++()
Definition: workdivision.h:586

cms::alpakatools::blocks_with_stride::iterator::operator*
ALPAKA_FN_ACC Idx operator*() const
Definition: workdivision.h:389

alpaka_common::Idx
uint32_t Idx
Definition: config.h:14

cms::alpakatools::independent_groups::iterator::operator!=
ALPAKA_FN_ACC bool operator!=(iterator const &other) const
Definition: workdivision.h:606

cms::alpakatools::elements_with_stride_nd
Definition: workdivision.h:190

cms::alpakatools::elements_with_stride_nd::elements_
const Vec elements_
Definition: workdivision.h:347

cms::alpakatools::elements_with_stride_nd::iterator::loop_
const elements_with_stride_nd * loop_
Definition: workdivision.h:322

cms::alpakatools::blocks_with_stride::iterator::first_
Idx first_
Definition: workdivision.h:419

cms::alpakatools::ElementIndex::global
Idx global
Definition: workdivision.h:84

cms::alpakatools::ElementIndex::local
Idx local
Definition: workdivision.h:85

cms::alpakatools::blocks_with_stride::iterator::operator++
ALPAKA_FN_ACC iterator & operator++()
Definition: workdivision.h:392

cms::alpakatools::elements_with_stride::begin
ALPAKA_FN_ACC iterator begin() const
Definition: workdivision.h:178

cms::alpakatools::elements_in_block::iterator::operator++
ALPAKA_FN_ACC iterator & operator++()
Definition: workdivision.h:468

cms::alpakatools::independent_group_elements::iterator::operator*
ALPAKA_FN_ACC Idx operator*() const
Definition: workdivision.h:663

cms::alpakatools::elements_with_stride::iterator::stride_
Idx stride_
Definition: workdivision.h:170

cms::alpakatools::elements_with_stride::iterator
Definition: workdivision.h:117

cms::alpakatools::elements_in_block::iterator::operator*
ALPAKA_FN_ACC ElementIndex operator*() const
Definition: workdivision.h:465

alpaka_common
Definition: config.h:11

cms::alpakatools::elements_in_block::iterator::iterator
ALPAKA_FN_ACC iterator(Idx local, Idx first, Idx range)
Definition: workdivision.h:462

cms::alpakatools::blocks_with_stride::iterator
Definition: workdivision.h:382

cms::alpakatools::independent_group_elements::independent_group_elements
ALPAKA_FN_ACC independent_group_elements(TAcc const &acc)
Definition: workdivision.h:639

cms::alpakatools::elements_with_stride_nd::iterator::operator*
ALPAKA_FN_ACC Vec operator*() const
Definition: workdivision.h:214

cms::alpakatools
Definition: PortableCollection.h:46

cms::alpakatools::independent_groups::begin
ALPAKA_FN_ACC iterator begin() const
Definition: workdivision.h:616

cms::alpakatools::independent_group_elements
Definition: workdivision.h:637

cms::alpakatools::elements_with_stride::iterator::first_
Idx first_
Definition: workdivision.h:173

cms::alpakatools::independent_groups::end
ALPAKA_FN_ACC iterator end() const
Definition: workdivision.h:618

cms::alpakatools::blocks_with_stride::end
ALPAKA_FN_ACC iterator end() const
Definition: workdivision.h:424

cms::alpakatools::elements_with_stride::iterator::extent_
Idx extent_
Definition: workdivision.h:171

cms::alpakatools::elements_with_stride_nd::elements_with_stride_nd
ALPAKA_FN_ACC elements_with_stride_nd(TAcc const &acc)
Definition: workdivision.h:195

cms::alpakatools::independent_group_elements::iterator
Definition: workdivision.h:651

cms::alpakatools::elements_in_block::iterator::operator!=
ALPAKA_FN_ACC bool operator!=(iterator const &other) const
Definition: workdivision.h:490

cms::alpakatools::elements_with_stride_nd::at_end_t
Definition: workdivision.h:208

cms::alpakatools::blocks_with_stride::iterator::extent_
Idx extent_
Definition: workdivision.h:417

gather_cfg.blocks
blocks
Definition: gather_cfg.py:90

alpaka_common::WorkDiv
alpaka::WorkDivMembers< TDim, Idx > WorkDiv
Definition: config.h:31

cms::alpakatools::elements_in_block::iterator::index_
Idx index_
Definition: workdivision.h:494

cms::alpakatools::elements_with_stride_nd::iterator::iterator
ALPAKA_FN_ACC iterator(elements_with_stride_nd const *loop, at_end_t const &)
Definition: workdivision.h:242

cms::alpakatools::elements_with_stride_nd::iterator::nth_elements_loop
ALPAKA_FN_ACC constexpr bool nth_elements_loop()
Definition: workdivision.h:246

cms::alpakatools::elements_with_stride::stride_
const Idx stride_
Definition: workdivision.h:185

cms::alpakatools::elements_with_stride::iterator::operator*
ALPAKA_FN_ACC Idx operator*() const
Definition: workdivision.h:129

cms::alpakatools::elements_in_block::iterator::first_
Idx first_
Definition: workdivision.h:496

cms::alpakatools::blocks_with_stride::stride_
const Idx stride_
Definition: workdivision.h:428

cms::alpakatools::elements_in_block::end
ALPAKA_FN_ACC iterator end() const
Definition: workdivision.h:502

cms::alpakatools::independent_groups::iterator::operator*
ALPAKA_FN_ACC Idx operator*() const
Definition: workdivision.h:583

cms::alpakatools::elements_in_block
Definition: workdivision.h:444

trackingPlots.other
other
Definition: trackingPlots.py:1467

cms::alpakatools::elements_with_stride::end
ALPAKA_FN_ACC iterator end() const
Definition: workdivision.h:180

cms::alpakatools::round_up_by
constexpr Idx round_up_by(Idx value, Idx divisor)
Definition: workdivision.h:16

cms::alpakatools::elements_in_block::iterator
Definition: workdivision.h:459

cms::alpakatools::elements_with_stride::iterator::iterator
ALPAKA_FN_ACC iterator(Idx elements, Idx stride, Idx extent, Idx first)
Definition: workdivision.h:120

SiStripPI::min
Definition: SiStripPayloadInspectorHelper.h:178

cms::alpakatools::independent_group_elements::iterator::operator!=
ALPAKA_FN_ACC bool operator!=(iterator const &other) const
Definition: workdivision.h:699

Exhume::I
const std::complex< double > I
Definition: I.h:8

alpaka
Definition: HostOnlyTask.h:9

cms::alpakatools::independent_group_elements::begin
ALPAKA_FN_ACC iterator begin() const
Definition: workdivision.h:712

cms::alpakatools::elements_with_stride::extent_
const Idx extent_
Definition: workdivision.h:186

value
Definition: value.py:1

cms::alpakatools::independent_group_elements::iterator::range_
Idx range_
Definition: workdivision.h:709

cms::alpakatools::blocks_with_stride::iterator::operator!=
ALPAKA_FN_ACC bool operator!=(iterator const &other) const
Definition: workdivision.h:412

cms::alpakatools::elements_with_stride_nd::end
ALPAKA_FN_ACC iterator end() const
Definition: workdivision.h:341

cms::alpakatools::independent_group_elements::iterator::operator==
ALPAKA_FN_ACC bool operator==(iterator const &other) const
Definition: workdivision.h:695

groupFilesInBlocks.block
block
Definition: groupFilesInBlocks.py:150

cms::alpakatools::independent_group_elements::iterator::elements_
Idx elements_
Definition: workdivision.h:703

cms::alpakatools::blocks_with_stride::iterator::operator++
ALPAKA_FN_ACC iterator operator++(int)
Definition: workdivision.h:404

cms::alpakatools::elements_with_stride_nd::iterator::operator++
ALPAKA_FN_ACC constexpr iterator operator++()
Definition: workdivision.h:217

cms::alpakatools::independent_groups::independent_groups
ALPAKA_FN_ACC independent_groups(TAcc const &acc, Idx groups)
Definition: workdivision.h:571

cms::alpakatools::independent_group_elements::iterator::operator++
ALPAKA_FN_ACC iterator & operator++()
Definition: workdivision.h:666

cms::alpakatools::independent_groups::iterator::first_
Idx first_
Definition: workdivision.h:613

cms::alpakatools::independent_groups::stride_
const Idx stride_
Definition: workdivision.h:622

traits.h

DTRecHitClients_cfi.local
local
Definition: DTRecHitClients_cfi.py:10

cms::alpakatools::elements_with_stride_nd::Vec
alpaka::Vec< Dim, Idx > Vec
Definition: workdivision.h:193

cms::alpakatools::blocks_with_stride::iterator::iterator
ALPAKA_FN_ACC iterator(Idx stride, Idx extent, Idx first)
Definition: workdivision.h:385

cms::alpakatools::elements_with_stride::iterator::operator++
ALPAKA_FN_ACC iterator operator++(int)
Definition: workdivision.h:155

cms::alpakatools::independent_groups::extent_
const Idx extent_
Definition: workdivision.h:623

cms::alpakatools::elements_in_block::first_
const Idx first_
Definition: workdivision.h:505

cms::alpakatools::blocks_with_stride::begin
ALPAKA_FN_ACC iterator begin() const
Definition: workdivision.h:422

cms::alpakatools::elements_with_stride
Definition: workdivision.h:97

N
#define N
Definition: blowfish.cc:9

cms::alpakatools::independent_group_elements::extent_
const Idx extent_
Definition: workdivision.h:720

cms::alpakatools::elements_with_stride::elements_with_stride
ALPAKA_FN_ACC elements_with_stride(TAcc const &acc, Idx first, Idx extent)
Definition: workdivision.h:111

cms::alpakatools::elements_in_block::range_
const Idx range_
Definition: workdivision.h:507

cms::alpakatools::elements_in_block::iterator::operator==
ALPAKA_FN_ACC bool operator==(iterator const &other) const
Definition: workdivision.h:488

cms::alpakatools::requires_single_thread_per_block
Definition: workdivision.h:23

cms::alpakatools::independent_group_elements::thread_
const Idx thread_
Definition: workdivision.h:718

alpaka_common::Vec
alpaka::Vec< TDim, Idx > Vec
Definition: config.h:24

cms::alpakatools::elements_in_block::elements_in_block
ALPAKA_FN_ACC elements_in_block(TAcc const &acc, Idx block)
Definition: workdivision.h:446

cms::alpakatools::elements_with_stride_nd::extent_
const Vec extent_
Definition: workdivision.h:350

cms::alpakatools::once_per_grid
ALPAKA_FN_ACC constexpr bool once_per_grid(TAcc const &acc)
Definition: workdivision.h:731

cms::alpakatools::independent_group_elements::elements_
const Idx elements_
Definition: workdivision.h:717

cms::alpakatools::independent_group_elements::iterator::operator++
ALPAKA_FN_ACC iterator operator++(int)
Definition: workdivision.h:689

cms::alpakatools::blocks_with_stride::blocks_with_stride
ALPAKA_FN_ACC blocks_with_stride(TAcc const &acc, Idx extent)
Definition: workdivision.h:377

cms::alpakatools::elements_with_stride_nd::iterator::do_elements_loops
ALPAKA_FN_ACC constexpr bool do_elements_loops()
Definition: workdivision.h:257

cms::alpakatools::elements_with_stride_nd::Dim
alpaka::Dim< TAcc > Dim
Definition: workdivision.h:192

cms::alpakatools::elements_with_stride_nd::iterator::range_
Vec range_
Definition: workdivision.h:326

dqmdumpme.first
first
Definition: dqmdumpme.py:55

cms::alpakatools::elements_with_stride::iterator::operator==
ALPAKA_FN_ACC bool operator==(iterator const &other) const
Definition: workdivision.h:161

cms::alpakatools::independent_group_elements::stride_
const Idx stride_
Definition: workdivision.h:719

cms::alpakatools::elements_in_block::iterator::operator++
ALPAKA_FN_ACC iterator operator++(int)
Definition: workdivision.h:482

FastTimerService_cff.range
range
Definition: FastTimerService_cff.py:34

cms::alpakatools::independent_groups::iterator
Definition: workdivision.h:576

cms::alpakatools::independent_group_elements::independent_group_elements
ALPAKA_FN_ACC independent_group_elements(TAcc const &acc, Idx extent)
Definition: workdivision.h:645

cms::alpakatools::independent_group_elements::iterator::stride_
Idx stride_
Definition: workdivision.h:704

cms::alpakatools::elements_with_stride_nd::begin
ALPAKA_FN_ACC iterator begin() const
Definition: workdivision.h:330

cms::alpakatools::independent_groups::first_
const Idx first_
Definition: workdivision.h:621

cms::alpakatools::elements_with_stride_nd::iterator::first_
Vec first_
Definition: workdivision.h:325

cms::alpakatools::elements_with_stride::iterator::range_
Idx range_
Definition: workdivision.h:175

cms::alpakatools::elements_with_stride_nd::stride_
const Vec stride_
Definition: workdivision.h:349

cms::alpakatools::elements_with_stride::iterator::operator++
ALPAKA_FN_ACC iterator & operator++()
Definition: workdivision.h:132

cms::alpakatools::independent_group_elements::end
ALPAKA_FN_ACC iterator end() const
Definition: workdivision.h:714

cms::alpakatools::independent_group_elements::iterator::index_
Idx index_
Definition: workdivision.h:708

cms::alpakatools::elements_with_stride_nd::thread_
const Vec thread_
Definition: workdivision.h:348

cms::alpakatools::elements_with_stride_nd::iterator::operator++
ALPAKA_FN_ACC constexpr iterator operator++(int)
Definition: workdivision.h:223

config.h

gpuPixelDoublets::stride
auto stride
Definition: gpuPixelDoubletsAlgos.h:170

cms::alpakatools::elements_in_block::local_
const Idx local_
Definition: workdivision.h:506

cms::alpakatools::independent_group_elements::iterator::extent_
Idx extent_
Definition: workdivision.h:705

cms::alpakatools::blocks_with_stride
Definition: workdivision.h:369

cms::alpakatools::requires_single_thread_per_block_v
constexpr bool requires_single_thread_per_block_v
Definition: workdivision.h:42

cms::alpakatools::elements_with_stride_nd::iterator::index_
Vec index_
Definition: workdivision.h:327

cms::alpakatools::elements_with_stride_nd::iterator::iterator
ALPAKA_FN_ACC iterator(elements_with_stride_nd const *loop, Vec first)
Definition: workdivision.h:235

bookConverter.elements
elements
Definition: bookConverter.py:147

cms::alpakatools::blocks_with_stride::blocks_with_stride
ALPAKA_FN_ACC blocks_with_stride(TAcc const &acc)
Definition: workdivision.h:371

cms::alpakatools::elements_with_stride_nd::iterator::do_strided_loops
ALPAKA_FN_ACC constexpr bool do_strided_loops()
Definition: workdivision.h:284

cms::alpakatools::elements_with_stride_nd::iterator::operator!=
ALPAKA_FN_ACC constexpr bool operator!=(iterator const &other) const
Definition: workdivision.h:231

cms::alpakatools::elements_with_stride::iterator::operator!=
ALPAKA_FN_ACC bool operator!=(iterator const &other) const
Definition: workdivision.h:165

cms::alpakatools::independent_groups::iterator::operator++
ALPAKA_FN_ACC iterator operator++(int)
Definition: workdivision.h:598

cms::alpakatools::elements_with_stride::elements_
const Idx elements_
Definition: workdivision.h:183

cms::alpakatools::ElementIndex
Definition: workdivision.h:83