1 #ifndef HeterogeneousCore_CUDAUtilities_interface_prefixScan_h
2 #define HeterogeneousCore_CUDAUtilities_interface_prefixScan_h
19 auto y = __shfl_up_sync(mask,
x,
offset);
32 auto y = __shfl_up_sync(mask,
x,
offset);
45 template <
typename VT,
typename T>
59 auto mask = __ballot_sync(0xffffffff,
first < size);
62 warpPrefixScan(ci, co, i, mask);
68 mask = __ballot_sync(mask, i +
blockDim.x < size);
78 co[
i] +=
ws[warpId - 1];
83 for (uint32_t i = 1; i <
size; ++
i)
84 co[i] = ci[i] + co[i - 1];
103 auto mask = __ballot_sync(0xffffffff,
first < size);
106 warpPrefixScan(c, i, mask);
108 auto warpId = i / 32;
112 mask = __ballot_sync(mask, i +
blockDim.x < size);
121 auto warpId = i / 32;
122 c[
i] +=
ws[warpId - 1];
126 for (uint32_t i = 1; i <
size; ++
i)
135 asm volatile(
"mov.u32 %0, %dynamic_smem_size;" :
"=r"(
ret));
141 template <
typename T>
142 __global__ void multiBlockPrefixScan(
T const* ici,
T* ico, int32_t size, int32_t* pc) {
143 volatile T const* ci = ici;
144 volatile T* co = ico;
153 blockPrefixScan(ci + off, co + off,
std::min(
int(
blockDim.x), size - off), ws);
156 __shared__
bool isLastBlockDone;
165 if (!isLastBlockDone)
173 extern __shared__
T psum[];
176 psum[
i] = (
j <
size) ? co[
j] :
T(0);
179 blockPrefixScan(psum, psum,
gridDim.x, ws);
189 #endif // HeterogeneousCore_CUDAUtilities_interface_prefixScan_h
tuple ret
prodAgent to be discontinued
const edm::EventSetup & c
uint32_t T const *__restrict__ uint32_t const *__restrict__ int32_t int Histo::index_type cudaStream_t Func __host__ __device__ V value
__host__ __device__ VT * co
uint16_t const *__restrict__ x
__host__ __device__ VT uint32_t size
T1 atomicAdd(T1 *a, T2 b)