cms::alpakatools::multiBlockPrefixScan< T > Struct Template Reference

#include <prefixScan.h>

Public Member Functions

template<typename TAcc >
ALPAKA_FN_ACC void operator() (const TAcc &acc, T const *ci, T *co, uint32_t size, int32_t numBlocks, int32_t *pc, std::size_t warpSize) const
 

Detailed Description

template<typename T>
struct cms::alpakatools::multiBlockPrefixScan< T >

Definition at line 138 of file prefixScan.h.
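
The functor implements a device-wide inclusive prefix scan of ci into co over size values: each block first scans its own slice, and the last block to finish (detected through the pc counter) scans the per-block totals and adds them to the output of the following blocks. The sketch below shows one plausible host-side launch. It is only an illustration, assuming the usual CMSSW alpaka helpers (cms::alpakatools::make_device_buffer, cms::alpakatools::make_workdiv, and the Queue and Acc1D aliases from config.h); launchMultiBlockScan and the block size of 256 are hypothetical, and the dynamic shared memory for the workspace is expected to be provided through the kernel's BlockSharedMemDynSizeBytes trait specialization in prefixScan.h.

#include <cstdint>
#include <alpaka/alpaka.hpp>
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
#include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h"

using namespace ALPAKA_ACCELERATOR_NAMESPACE;

// Scan `size` values from d_in into d_out asynchronously on `queue`.
void launchMultiBlockScan(Queue& queue, int32_t const* d_in, int32_t* d_out, uint32_t size) {
  constexpr uint32_t threadsPerBlock = 256;  // hypothetical block size
  const int32_t numBlocks = (size + threadsPerBlock - 1) / threadsPerBlock;

  // The block counter used to detect the last block must start from zero.
  auto pc = cms::alpakatools::make_device_buffer<int32_t>(queue);
  alpaka::memset(queue, pc, 0);

  const std::size_t warpSize = alpaka::getPreferredWarpSize(alpaka::getDev(queue));
  const auto workDiv = cms::alpakatools::make_workdiv<Acc1D>(numBlocks, threadsPerBlock);
  alpaka::exec<Acc1D>(queue, workDiv, cms::alpakatools::multiBlockPrefixScan<int32_t>{},
                      d_in, d_out, size, numBlocks, pc.data(), warpSize);
}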

Member Function Documentation

◆ operator()()

template<typename T >
template<typename TAcc >
ALPAKA_FN_ACC void cms::alpakatools::multiBlockPrefixScan< T >::operator() ( const TAcc &  acc,
                                                                             T const *     ci,
                                                                             T *           co,
                                                                             uint32_t      size,
                                                                             int32_t       numBlocks,
                                                                             int32_t *     pc,
                                                                             std::size_t   warpSize ) const
inline

Definition at line 140 of file prefixScan.h.

References ALPAKA_ASSERT_ACC(), alpaka::atomicAdd(), cms::alpakatools::blockPrefixScan(), and std::min().

{
  // Get the shared memory workspace. The workspace is needed only for multi-threaded accelerators.
  T* ws = nullptr;
  if constexpr (!requires_single_thread_per_block_v<TAcc>) {
    ws = alpaka::getDynSharedMem<T>(acc);
  }
  ALPAKA_ASSERT_ACC(warpSize == static_cast<std::size_t>(alpaka::warp::getSize(acc)));
  [[maybe_unused]] const auto elementsPerGrid = alpaka::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[0u];
  const auto elementsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
  const auto threadsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
  const auto blocksPerGrid = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u];
  const auto blockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
  const auto threadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];
  ALPAKA_ASSERT_ACC(elementsPerGrid >= size);
  // first, each block does a scan of its own slice
  [[maybe_unused]] int off = elementsPerBlock * blockIdx;
  if (size - off > 0) {
    blockPrefixScan(acc, ci + off, co + off, std::min(elementsPerBlock, size - off), ws);
  }
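  // At this point co holds, for each block, an inclusive scan of that block's slice
  // of ci only; the offsets contributed by the preceding blocks are still missing.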

  // count blocks that finished
  auto& isLastBlockDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
  //__shared__ bool isLastBlockDone;
  if (0 == threadIdx) {
    alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
    auto value = alpaka::atomicAdd(acc, pc, 1, alpaka::hierarchy::Blocks{});  // block counter
    isLastBlockDone = (value == (int(blocksPerGrid) - 1));
  }
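  // The device-scope fence guarantees that a block's writes to co are visible
  // before its increment of the counter, so the block that sees the final value
  // can safely read every other block's partial results. Only in that block is
  // isLastBlockDone set to true.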

  alpaka::syncBlockThreads(acc);

  if (!isLastBlockDone)
    return;

  ALPAKA_ASSERT_ACC(int(blocksPerGrid) == *pc);

  // good: each block has done its work, and now we are left in the last block

  // let's get the partial sums from each block except the last, which receives 0.
  T* psum = nullptr;
  if constexpr (!requires_single_thread_per_block_v<TAcc>) {
    psum = ws + warpSize;
  } else {
    psum = alpaka::getDynSharedMem<T>(acc);
  }
  for (int32_t i = threadIdx, ni = blocksPerGrid; i < ni; i += threadsPerBlock) {
    auto j = elementsPerBlock * i + elementsPerBlock - 1;
    psum[i] = (j < size) ? co[j] : T(0);
  }
  alpaka::syncBlockThreads(acc);
  blockPrefixScan(acc, psum, psum, blocksPerGrid, ws);
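  // After this scan, psum[k] holds the sum of the totals of blocks 0..k, i.e. the
  // offset that still has to be added to every element of block k + 1.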

  // now it would have been handy to have the other blocks around...
  // Simplify the computation by having one version where threads per block == block size,
  // and a second one for the one-thread-per-block accelerators.
  if constexpr (!requires_single_thread_per_block_v<TAcc>) {
    // Here threadsPerBlock == elementsPerBlock
    for (uint32_t i = threadIdx + threadsPerBlock, k = 0; i < size; i += threadsPerBlock, ++k) {
      co[i] += psum[k];
    }
  } else {
    // We are single-threaded here, adding the partial sums starting from the 2nd block.
    for (uint32_t i = elementsPerBlock; i < size; i++) {
      co[i] += psum[i / elementsPerBlock - 1];
    }
  }
}
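
For reference, the two-phase structure of the algorithm (independent per-block scans, then a scan of the block totals that is added back to the later blocks) can be modelled serially. The following plain C++ sketch is illustrative only and is not part of prefixScan.h; multiBlockScanModel and chunkSize are hypothetical names, with chunkSize playing the role of elementsPerBlock.

#include <algorithm>
#include <cstdint>
#include <vector>

// Serial model of the kernel above; co must hold at least ci.size() elements.
void multiBlockScanModel(std::vector<int32_t> const& ci, std::vector<int32_t>& co, uint32_t chunkSize) {
  const auto size = static_cast<uint32_t>(ci.size());
  const uint32_t numChunks = (size + chunkSize - 1) / chunkSize;

  // Phase 1: independent inclusive scan of each chunk (one "block" each).
  for (uint32_t b = 0; b < numChunks; ++b) {
    const uint32_t off = b * chunkSize;
    const uint32_t n = std::min(chunkSize, size - off);
    int32_t sum = 0;
    for (uint32_t i = 0; i < n; ++i) {
      sum += ci[off + i];
      co[off + i] = sum;
    }
  }

  // Phase 2a: inclusive scan of the chunk totals (the last element of each scanned chunk).
  std::vector<int32_t> psum(numChunks, 0);
  for (uint32_t b = 0; b + 1 < numChunks; ++b) {
    const uint32_t j = b * chunkSize + chunkSize - 1;
    psum[b] = co[j] + (b > 0 ? psum[b - 1] : 0);
  }

  // Phase 2b: add psum[b] to every element of chunk b + 1, mirroring the
  // single-threaded branch of the kernel (co[i] += psum[i / elementsPerBlock - 1]).
  for (uint32_t i = chunkSize; i < size; ++i) {
    co[i] += psum[i / chunkSize - 1];
  }
}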