prefixScan.h
#ifndef HeterogeneousCore_CUDAUtilities_interface_prefixScan_h
#define HeterogeneousCore_CUDAUtilities_interface_prefixScan_h

#include <cstdint>

#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

#ifdef __CUDA_ARCH__

template <typename T>
__device__ void __forceinline__ warpPrefixScan(T const* __restrict__ ci, T* __restrict__ co, uint32_t i, uint32_t mask) {
  // ci and co may be the same
  auto x = ci[i];
  auto laneId = threadIdx.x & 0x1f;
  CMS_UNROLL_LOOP
  for (int offset = 1; offset < 32; offset <<= 1) {
    auto y = __shfl_up_sync(mask, x, offset);
    if (laneId >= offset)
      x += y;
  }
  co[i] = x;
}
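
// Note added for clarity (not in the original header): warpPrefixScan is a
// Kogge-Stone inclusive scan over a single warp. Each of the five shuffle
// steps (offset = 1, 2, 4, 8, 16) pulls the running sum from `offset` lanes
// below, so e.g. an all-ones input ci[i] == 1 ends with co[i] == laneId + 1.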

template <typename T>
__device__ void __forceinline__ warpPrefixScan(T* c, uint32_t i, uint32_t mask) {
  auto x = c[i];
  auto laneId = threadIdx.x & 0x1f;
  CMS_UNROLL_LOOP
  for (int offset = 1; offset < 32; offset <<= 1) {
    auto y = __shfl_up_sync(mask, x, offset);
    if (laneId >= offset)
      x += y;
  }
  c[i] = x;
}

#endif

namespace cms {
  namespace cuda {

    // limited to 32*32 elements....
    template <typename VT, typename T>
    __host__ __device__ void __forceinline__ blockPrefixScan(VT const* ci,
                                                             VT* co,
                                                             uint32_t size,
                                                             T* ws
#ifndef __CUDA_ARCH__
                                                             = nullptr
#endif
    ) {
#ifdef __CUDA_ARCH__
      assert(ws);
      assert(size <= 1024);
      assert(0 == blockDim.x % 32);
      auto first = threadIdx.x;
      auto mask = __ballot_sync(0xffffffff, first < size);

      for (auto i = first; i < size; i += blockDim.x) {
        warpPrefixScan(ci, co, i, mask);
        auto laneId = threadIdx.x & 0x1f;
        auto warpId = i / 32;
        assert(warpId < 32);
        if (31 == laneId)
          ws[warpId] = co[i];
        mask = __ballot_sync(mask, i + blockDim.x < size);
      }
      __syncthreads();
      if (size <= 32)
        return;
      if (threadIdx.x < 32)
        warpPrefixScan(ws, threadIdx.x, 0xffffffff);
      __syncthreads();
      for (auto i = first + 32; i < size; i += blockDim.x) {
        auto warpId = i / 32;
        co[i] += ws[warpId - 1];
      }
      __syncthreads();
#else
      co[0] = ci[0];
      for (uint32_t i = 1; i < size; ++i)
        co[i] = ci[i] + co[i - 1];
#endif
    }
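
#ifdef __CUDACC__
    // Usage sketch, added for illustration (not part of the original header):
    // blockPrefixScan needs a 32-element shared-memory workspace, a block
    // size that is a multiple of 32, and at most 32 * 32 = 1024 elements.
    // The kernel name `singleBlockScanExample` is hypothetical.
    template <typename T>
    __global__ void singleBlockScanExample(T const* in, T* out, uint32_t n) {
      __shared__ T ws[32];
      blockPrefixScan(in, out, n, ws);
    }
#endif  // __CUDACC__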

    // same as above, may remove
    // limited to 32*32 elements....
    template <typename T>
    __host__ __device__ void __forceinline__ blockPrefixScan(T* c,
                                                             uint32_t size,
                                                             T* ws
#ifndef __CUDA_ARCH__
                                                             = nullptr
#endif
    ) {
#ifdef __CUDA_ARCH__
      assert(ws);
      assert(size <= 1024);
      assert(0 == blockDim.x % 32);
      auto first = threadIdx.x;
      auto mask = __ballot_sync(0xffffffff, first < size);

      for (auto i = first; i < size; i += blockDim.x) {
        warpPrefixScan(c, i, mask);
        auto laneId = threadIdx.x & 0x1f;
        auto warpId = i / 32;
        assert(warpId < 32);
        if (31 == laneId)
          ws[warpId] = c[i];
        mask = __ballot_sync(mask, i + blockDim.x < size);
      }
      __syncthreads();
      if (size <= 32)
        return;
      if (threadIdx.x < 32)
        warpPrefixScan(ws, threadIdx.x, 0xffffffff);
      __syncthreads();
      for (auto i = first + 32; i < size; i += blockDim.x) {
        auto warpId = i / 32;
        c[i] += ws[warpId - 1];
      }
      __syncthreads();
#else
      for (uint32_t i = 1; i < size; ++i)
        c[i] += c[i - 1];
#endif
    }

#ifdef __CUDA_ARCH__
    // see https://stackoverflow.com/questions/40021086/can-i-obtain-the-amount-of-allocated-dynamic-shared-memory-from-within-a-kernel/40021087#40021087
    __device__ __forceinline__ unsigned dynamic_smem_size() {
      unsigned ret;
      asm volatile("mov.u32 %0, %dynamic_smem_size;" : "=r"(ret));
      return ret;
    }
#endif
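
    // Note (added): %dynamic_smem_size is a PTX special register holding the
    // amount of dynamic shared memory, in bytes, requested at launch time
    // (the third <<<...>>> launch parameter); multiBlockPrefixScan below
    // checks it against the per-block partial-sum buffer it expects.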

    // in principle not limited....
    template <typename T>
    __global__ void multiBlockPrefixScan(T const* ici, T* ico, int32_t size, int32_t* pc) {
      volatile T const* ci = ici;
      volatile T* co = ico;
      __shared__ T ws[32];
#ifdef __CUDA_ARCH__
      assert(sizeof(T) * gridDim.x <= dynamic_smem_size());  // size of psum below
#endif
      assert(blockDim.x * gridDim.x >= size);
      // first each block does a scan
      int off = blockDim.x * blockIdx.x;
      if (size - off > 0)
        blockPrefixScan(ci + off, co + off, std::min(int(blockDim.x), size - off), ws);

      // count blocks that finished
      __shared__ bool isLastBlockDone;
      if (0 == threadIdx.x) {
        __threadfence();
        auto value = atomicAdd(pc, 1);  // block counter
        isLastBlockDone = (value == (int(gridDim.x) - 1));
      }

      __syncthreads();

      if (!isLastBlockDone)
        return;

      assert(int(gridDim.x) == *pc);

      // good, each block has done its work and we are now left in the last block

      // let's get the partial sums from each block
      extern __shared__ T psum[];
      for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) {
        auto j = blockDim.x * i + blockDim.x - 1;
        psum[i] = (j < size) ? co[j] : T(0);
      }
      __syncthreads();
      blockPrefixScan(psum, psum, gridDim.x, ws);

      // now it would have been handy to have the other blocks around...
      for (int i = threadIdx.x + blockDim.x, k = 0; i < size; i += blockDim.x, ++k) {
        co[i] += psum[k];
      }
    }
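
#ifdef __CUDACC__
    // Launch sketch, added for illustration (not part of the original header):
    // the kernel needs one partial sum per block in dynamic shared memory and
    // a zeroed block counter; since the final scan of per-block sums reuses
    // blockPrefixScan with ws[32], at most 1024 blocks may be launched. The
    // helper name `launchMultiBlockPrefixScan` is hypothetical.
    template <typename T>
    inline void launchMultiBlockPrefixScan(
        T const* d_in, T* d_out, int32_t size, int32_t* d_pc, cudaStream_t stream) {
      int32_t threads = 256;                               // multiple of 32, as blockPrefixScan requires
      int32_t blocks = (size + threads - 1) / threads;     // enough blocks to cover the input
      cudaMemsetAsync(d_pc, 0, sizeof(int32_t), stream);   // reset the block counter
      multiBlockPrefixScan<<<blocks, threads, sizeof(T) * blocks, stream>>>(d_in, d_out, size, d_pc);
    }
#endif  // __CUDACC__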
  }  // namespace cuda
}  // namespace cms

#endif  // HeterogeneousCore_CUDAUtilities_interface_prefixScan_h