prefixScan.h

#ifndef HeterogeneousCore_CUDAUtilities_interface_prefixScan_h
#define HeterogeneousCore_CUDAUtilities_interface_prefixScan_h

#include <cstdint>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

#ifdef __CUDA_ARCH__

// Warp-level inclusive prefix scan (Kogge-Stone over the 32 lanes of a warp),
// implemented with shuffle intrinsics; only the lanes set in mask participate.
// ci and co may point to the same buffer, so they must not be __restrict__.
template <typename T>
__device__ void __forceinline__ warpPrefixScan(T const* ci, T* co, uint32_t i, uint32_t mask) {
  auto x = ci[i];
  auto laneId = threadIdx.x & 0x1f;
#pragma unroll
  for (int offset = 1; offset < 32; offset <<= 1) {
    // fetch the value held `offset` lanes below; lanes below `offset` keep theirs
    auto y = __shfl_up_sync(mask, x, offset);
    if (laneId >= offset)
      x += y;
  }
  co[i] = x;
}

// in-place variant of the above
template <typename T>
__device__ void __forceinline__ warpPrefixScan(T* c, uint32_t i, uint32_t mask) {
  auto x = c[i];
  auto laneId = threadIdx.x & 0x1f;
#pragma unroll
  for (int offset = 1; offset < 32; offset <<= 1) {
    auto y = __shfl_up_sync(mask, x, offset);
    if (laneId >= offset)
      x += y;
  }
  c[i] = x;
}

#endif

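// Worked illustration of the warp scan above: with every lane holding 1, the
// Kogge-Stone loop converges in log2(32) = 5 shuffle steps. After the step
// with a given offset, each lane holds the sum of its last 2*offset inputs
// (or of all of them, for lanes below 2*offset), so lane L ends with the
// inclusive prefix sum L + 1:
//
//   start:       1 1 1 1 1 1 1 1 1 ...  1
//   offset = 1:  1 2 2 2 2 2 2 2 2 ...  2
//   offset = 2:  1 2 3 4 4 4 4 4 4 ...  4
//   offset = 4:  1 2 3 4 5 6 7 8 8 ...  8
//   offset = 8:  1 2 3 4 5 6 7 8 9 ... 16
//   offset = 16: 1 2 3 4 5 6 7 8 9 ... 32
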
namespace cms {
  namespace cuda {

    // Block-level inclusive prefix scan, limited to 32*32 == 1024 elements.
    // On the device, ws must point to a shared-memory buffer of at least 32 T
    // (one slot per warp); on the host it is unused and defaults to nullptr.
    template <typename VT, typename T>
    __host__ __device__ __forceinline__ void blockPrefixScan(VT const* ci,
                                                             VT* co,
                                                             uint32_t size,
                                                             T* ws
#ifndef __CUDA_ARCH__
                                                             = nullptr
#endif
    ) {
#ifdef __CUDA_ARCH__
      assert(ws);
      assert(size <= 1024);
      assert(0 == blockDim.x % 32);
      auto first = threadIdx.x;
      auto mask = __ballot_sync(0xffffffff, first < size);

      // each warp scans its own 32-element stretch and stores its total in ws
      for (auto i = first; i < size; i += blockDim.x) {
        warpPrefixScan(ci, co, i, mask);
        auto laneId = threadIdx.x & 0x1f;
        auto warpId = i / 32;
        assert(warpId < 32);
        if (31 == laneId)
          ws[warpId] = co[i];
        mask = __ballot_sync(mask, i + blockDim.x < size);
      }
      __syncthreads();
      if (size <= 32)
        return;
      // scan the per-warp totals, then add to each element the sum of all preceding warps
      if (threadIdx.x < 32)
        warpPrefixScan(ws, threadIdx.x, 0xffffffff);
      __syncthreads();
      for (auto i = first + 32; i < size; i += blockDim.x) {
        auto warpId = i / 32;
        co[i] += ws[warpId - 1];
      }
      __syncthreads();
#else
      // sequential scan on the host
      co[0] = ci[0];
      for (uint32_t i = 1; i < size; ++i)
        co[i] = ci[i] + co[i - 1];
#endif
    }

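    // A minimal device-side usage sketch (kernelScan and its parameters are
    // illustrative names, not part of this header): one block scans up to
    // 1024 elements, with a 32-slot shared-memory buffer for the warp totals.
    //
    //   __global__ void kernelScan(float const* in, float* out, uint32_t n) {
    //     __shared__ float ws[32];
    //     cms::cuda::blockPrefixScan(in, out, n, ws);
    //   }
    //
    //   kernelScan<<<1, 1024>>>(in, out, n);  // requires n <= 1024
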
    // in-place version of the above, may be removed in the future
    // limited to 32*32 == 1024 elements
    template <typename T>
    __host__ __device__ __forceinline__ void blockPrefixScan(T* c,
                                                             uint32_t size,
                                                             T* ws
#ifndef __CUDA_ARCH__
                                                             = nullptr
#endif
    ) {
#ifdef __CUDA_ARCH__
      assert(ws);
      assert(size <= 1024);
      assert(0 == blockDim.x % 32);
      auto first = threadIdx.x;
      auto mask = __ballot_sync(0xffffffff, first < size);

      for (auto i = first; i < size; i += blockDim.x) {
        warpPrefixScan(c, i, mask);
        auto laneId = threadIdx.x & 0x1f;
        auto warpId = i / 32;
        assert(warpId < 32);
        if (31 == laneId)
          ws[warpId] = c[i];
        mask = __ballot_sync(mask, i + blockDim.x < size);
      }
      __syncthreads();
      if (size <= 32)
        return;
      if (threadIdx.x < 32)
        warpPrefixScan(ws, threadIdx.x, 0xffffffff);
      __syncthreads();
      for (auto i = first + 32; i < size; i += blockDim.x) {
        auto warpId = i / 32;
        c[i] += ws[warpId - 1];
      }
      __syncthreads();
#else
      for (uint32_t i = 1; i < size; ++i)
        c[i] += c[i - 1];
#endif
    }

#ifdef __CUDA_ARCH__
    // return the size in bytes of the dynamic shared-memory allocation of the running kernel;
    // see https://stackoverflow.com/questions/40021086/can-i-obtain-the-amount-of-allocated-dynamic-shared-memory-from-within-a-kernel/40021087#40021087
    __device__ __forceinline__ unsigned dynamic_smem_size() {
      unsigned ret;
      asm volatile("mov.u32 %0, %%dynamic_smem_size;" : "=r"(ret));
      return ret;
    }
#endif

    // multi-block prefix scan; in principle not limited in size....
    // Requirements: *pc must be zeroed before the launch, the kernel must be
    // launched with at least sizeof(T) * gridDim.x bytes of dynamic shared
    // memory, and blockDim.x * gridDim.x must cover the input.
    template <typename T>
    __global__ void multiBlockPrefixScan(T const* ici, T* ico, int32_t size, int32_t* pc) {
      volatile T const* ci = ici;
      volatile T* co = ico;
      __shared__ T ws[32];
#ifdef __CUDA_ARCH__
      assert(sizeof(T) * gridDim.x <= dynamic_smem_size());  // size of psum below
#endif
      assert(blockDim.x * gridDim.x >= size);
      // first each block does a scan of its own stretch
      int off = blockDim.x * blockIdx.x;
      if (size - off > 0)
        blockPrefixScan(ci + off, co + off, std::min(int(blockDim.x), size - off), ws);

      // count the blocks that have finished
      __shared__ bool isLastBlockDone;
      if (0 == threadIdx.x) {
        __threadfence();
        auto value = atomicAdd(pc, 1);  // block counter
        isLastBlockDone = (value == (int(gridDim.x) - 1));
      }

      __syncthreads();

      // all blocks but the last one are done
      if (!isLastBlockDone)
        return;

      assert(int(gridDim.x) == *pc);

      // good: each block has done its work and we are left in the last block

      // collect the partial sum of each block...
      extern __shared__ T psum[];
      for (int i = threadIdx.x, ni = gridDim.x; i < ni; i += blockDim.x) {
        auto j = blockDim.x * i + blockDim.x - 1;
        psum[i] = (j < size) ? co[j] : T(0);
      }
      __syncthreads();
      // ...scan them...
      blockPrefixScan(psum, psum, gridDim.x, ws);

      // ...and add the cumulative sum of the preceding blocks to each element
      // (it would have been handy to have the other blocks around for this)
      for (int i = threadIdx.x + blockDim.x, k = 0; i < size; i += blockDim.x, ++k) {
        co[i] += psum[k];
      }
    }
  }  // namespace cuda
}  // namespace cms

#endif  // HeterogeneousCore_CUDAUtilities_interface_prefixScan_h
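
A minimal host-side sketch of driving multiBlockPrefixScan (the sizes, the plain CUDA runtime calls, and the checking code below are illustrative assumptions, not taken from this file): the block counter *pc must be zeroed before the launch, the third launch parameter must reserve one T of dynamic shared memory per block (the last block gathers one partial sum per block into psum), and since the kernel leaves *pc equal to gridDim.x, the counter must be re-zeroed before any reuse.

#include <cassert>
#include <cstdint>
#include <vector>

#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"

int main() {
  constexpr int32_t size = 10000;
  constexpr int nthreads = 256;
  const int nblocks = (size + nthreads - 1) / nthreads;

  std::vector<float> data(size, 1.f);
  float *d_in, *d_out;
  int32_t* d_pc;
  cudaMalloc(&d_in, size * sizeof(float));
  cudaMalloc(&d_out, size * sizeof(float));
  cudaMalloc(&d_pc, sizeof(int32_t));
  cudaMemcpy(d_in, data.data(), size * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemset(d_pc, 0, sizeof(int32_t));  // the block counter must start at zero

  // one float of dynamic shared memory per block, used for psum in the last block
  cms::cuda::multiBlockPrefixScan<<<nblocks, nthreads, nblocks * sizeof(float)>>>(d_in, d_out, size, d_pc);

  cudaMemcpy(data.data(), d_out, size * sizeof(float), cudaMemcpyDeviceToHost);
  assert(data[size - 1] == float(size));  // inclusive scan of 10000 ones

  cudaFree(d_in);
  cudaFree(d_out);
  cudaFree(d_pc);
  return 0;
}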