OneToManyAssoc.h
#ifndef HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h
#define HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h

#include <algorithm>
#ifndef __CUDA_ARCH__
#include <atomic>
#endif  // __CUDA_ARCH__
#include <cstddef>
#include <cstdint>
#include <type_traits>

#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
#include "HeterogeneousCore/CUDAUtilities/interface/FlexiStorage.h"

namespace cms {
  namespace cuda {

    template <typename Assoc>
    struct OneToManyAssocView {
      using Counter = typename Assoc::Counter;
      using index_type = typename Assoc::index_type;

      Assoc *assoc = nullptr;
      Counter *offStorage = nullptr;
      index_type *contentStorage = nullptr;
      int32_t offSize = -1;
      int32_t contentSize = -1;
    };
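    // OneToManyAssocView bundles a pointer to the association with the (optional)
    // external buffers and their sizes; these are used by initStorage() when the
    // compile-time capacity (SIZE) or number of "ones" (ONES) is negative, i.e.
    // when the storage is provided at runtime.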

    // this MUST BE DONE in a single block (or in two kernels!)
    template <typename Assoc>
    __global__ void zeroAndInit(OneToManyAssocView<Assoc> view) {
      auto h = view.assoc;
      assert(1 == gridDim.x);
      assert(0 == blockIdx.x);

      int first = threadIdx.x;

      if (0 == first) {
        h->psws = 0;
        h->initStorage(view);
      }
      __syncthreads();
      for (int i = first, nt = h->totOnes(); i < nt; i += blockDim.x) {
        h->off[i] = 0;
      }
    }

    template <typename Assoc>
    inline __attribute__((always_inline)) void launchZero(Assoc *h,
                                                          cudaStream_t stream
#ifndef __CUDACC__
                                                          = cudaStreamDefault
#endif
    ) {
      typename Assoc::View view = {h, nullptr, nullptr, -1, -1};
      launchZero(view, stream);
    }

    template <typename Assoc>
    inline __attribute__((always_inline)) void launchZero(OneToManyAssocView<Assoc> view,
                                                          cudaStream_t stream
#ifndef __CUDACC__
                                                          = cudaStreamDefault
#endif
    ) {
      if constexpr (Assoc::ctCapacity() < 0) {
        assert(view.contentStorage);
        assert(view.contentSize > 0);
      }
      if constexpr (Assoc::ctNOnes() < 0) {
        assert(view.offStorage);
        assert(view.offSize > 0);
      }
#ifdef __CUDACC__
      auto nthreads = 1024;
      auto nblocks = 1;  // MUST BE ONE as memory is initialized in thread 0 (the alternative is two kernels)
      zeroAndInit<<<nblocks, nthreads, 0, stream>>>(view);
      cudaCheck(cudaGetLastError());
#else
      auto h = view.assoc;
      assert(h);
      h->initStorage(view);
      h->zero();
      h->psws = 0;
#endif
    }
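    // With nvcc, launchZero enqueues the zeroAndInit kernel on the given stream;
    // in the host-only build the same initialization (initStorage, zero, psws = 0)
    // is performed synchronously.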

    template <typename Assoc>
    inline __attribute__((always_inline)) void launchFinalize(Assoc *h,
                                                              cudaStream_t stream
#ifndef __CUDACC__
                                                              = cudaStreamDefault
#endif
    ) {
      typename Assoc::View view = {h, nullptr, nullptr, -1, -1};
      launchFinalize(view, stream);
    }

    template <typename Assoc>
    inline __attribute__((always_inline)) void launchFinalize(OneToManyAssocView<Assoc> view,
                                                              cudaStream_t stream
#ifndef __CUDACC__
                                                              = cudaStreamDefault
#endif
    ) {
      auto h = view.assoc;
      assert(h);
#ifdef __CUDACC__
      using Counter = typename Assoc::Counter;
      Counter *poff = (Counter *)((char *)(h) + offsetof(Assoc, off));
      auto nOnes = Assoc::ctNOnes();
      if constexpr (Assoc::ctNOnes() < 0) {
        assert(view.offStorage);
        assert(view.offSize > 0);
        nOnes = view.offSize;
        poff = view.offStorage;
      }
      assert(nOnes > 0);
      int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Assoc, psws));
      auto nthreads = 1024;
      auto nblocks = (nOnes + nthreads - 1) / nthreads;
      multiBlockPrefixScan<<<nblocks, nthreads, sizeof(int32_t) * nblocks, stream>>>(poff, poff, nOnes, ppsws);
      cudaCheck(cudaGetLastError());
#else
      h->finalize();
#endif
    }
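    // launchFinalize turns the per-"one" counts accumulated in off[] into
    // cumulative offsets: on the GPU via multiBlockPrefixScan (with psws as
    // inter-block working space), on the host via Assoc::finalize().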

    template <typename Assoc>
    __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) {
      assoc->bulkFinalizeFill(*apc);
    }
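    // Trivial kernel wrapper allowing bulkFinalizeFill() to be run on the device
    // after a bulkFill() pass driven by an AtomicPairCounter.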

    template <typename I,    // type stored in the container (usually an index in a vector of the input values)
              int32_t ONES,  // number of "Ones" + 1. If -1, the size is set at runtime using external storage
              int32_t SIZE   // max number of elements. If -1, the size is set at runtime using external storage
              >
    class OneToManyAssoc {
    public:
      using View = OneToManyAssocView<OneToManyAssoc<I, ONES, SIZE>>;
      using Counter = uint32_t;

      using CountersOnly = OneToManyAssoc<I, ONES, 0>;

      using index_type = I;

      // branchless computation of floor(log2(v))
      static constexpr uint32_t ilog2(uint32_t v) {
        constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
        constexpr uint32_t s[] = {1, 2, 4, 8, 16};

        uint32_t r = 0;  // result of log2(v) will go here
        for (auto i = 4; i >= 0; i--)
          if (v & b[i]) {
            v >>= s[i];
            r |= s[i];
          }
        return r;
      }

      static constexpr int32_t ctNOnes() { return ONES; }
      constexpr auto totOnes() const { return off.capacity(); }
      constexpr auto nOnes() const { return totOnes() - 1; }
      static constexpr int32_t ctCapacity() { return SIZE; }
      constexpr auto capacity() const { return content.capacity(); }

      __host__ __device__ void initStorage(View view) {
        assert(view.assoc == this);
        if constexpr (ctCapacity() < 0) {
          assert(view.contentStorage);
          assert(view.contentSize > 0);
          content.init(view.contentStorage, view.contentSize);
        }
        if constexpr (ctNOnes() < 0) {
          assert(view.offStorage);
          assert(view.offSize > 0);
          off.init(view.offStorage, view.offSize);
        }
      }

      __host__ __device__ void zero() {
        for (int32_t i = 0; i < totOnes(); ++i) {
          off[i] = 0;
        }
      }

      __host__ __device__ __forceinline__ void add(CountersOnly const &co) {
        for (int32_t i = 0; i < totOnes(); ++i) {
#ifdef __CUDA_ARCH__
          atomicAdd(off.data() + i, co.off[i]);
#else
          auto &a = (std::atomic<Counter> &)(off[i]);
          a += co.off[i];
#endif
        }
      }

      static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) {
#ifdef __CUDA_ARCH__
        return atomicAdd(&x, 1);
#else
        auto &a = (std::atomic<Counter> &)(x);
        return a++;
#endif
      }

      static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) {
#ifdef __CUDA_ARCH__
        return atomicSub(&x, 1);
#else
        auto &a = (std::atomic<Counter> &)(x);
        return a--;
#endif
      }

      __host__ __device__ __forceinline__ void count(int32_t b) {
        assert(b < nOnes());
        atomicIncrement(off[b]);
      }

      __host__ __device__ __forceinline__ void fill(int32_t b, index_type j) {
        assert(b < nOnes());
        auto w = atomicDecrement(off[b]);
        assert(w > 0);
        content[w - 1] = j;
      }
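      // Typical two-pass usage (a minimal sketch; assoc, nItems, oneOf and indexOf
      // are hypothetical names, not part of this header):
      //
      //   cms::cuda::launchZero(&assoc, stream);
      //   for (uint32_t i = 0; i < nItems; ++i)  // first pass: count
      //     assoc.count(oneOf(i));
      //   cms::cuda::launchFinalize(&assoc, stream);
      //   for (uint32_t i = 0; i < nItems; ++i)  // second pass: fill
      //     assoc.fill(oneOf(i), indexOf(i));
      //
      // Afterwards size(b), begin(b) and end(b) expose the elements associated with
      // "one" b. fill() decrements the offsets produced by the prefix scan, so it
      // must only run after launchFinalize has completed.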

      __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) {
        auto c = apc.add(n);
        if (int(c.m) >= nOnes())
          return -int32_t(c.m);
        off[c.m] = c.n;
        for (uint32_t j = 0; j < n; ++j)
          content[c.n + j] = v[j];
        return c.m;
      }

      __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) {
        off[apc.get().m] = apc.get().n;
      }

      __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) {
        int m = apc.get().m;
        auto n = apc.get().n;
        if (m >= nOnes()) {  // overflow!
          off[nOnes()] = uint32_t(off[nOnes() - 1]);
          return;
        }
        auto first = m + blockDim.x * blockIdx.x + threadIdx.x;
        for (int i = first; i < totOnes(); i += gridDim.x * blockDim.x) {
          off[i] = n;
        }
      }
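      // Alternative bulk path: bulkFill() reserves room for a whole block of indices
      // with an AtomicPairCounter (returning a negative value on overflow) and writes
      // the offset on the fly, so no separate count/prefix-scan pass is needed;
      // bulkFinalize() or the finalizeBulk kernel then closes the offset table.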

      __device__ __forceinline__ void finalize(Counter *ws = nullptr) {
        assert(off[totOnes() - 1] == 0);
        blockPrefixScan(off.data(), totOnes(), ws);
        assert(off[totOnes() - 1] == off[totOnes() - 2]);
      }

      constexpr auto size() const { return uint32_t(off[totOnes() - 1]); }
      constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; }

      constexpr index_type const *begin() const { return content.data(); }
      constexpr index_type const *end() const { return begin() + size(); }

      constexpr index_type const *begin(uint32_t b) const { return content.data() + off[b]; }
      constexpr index_type const *end(uint32_t b) const { return content.data() + off[b + 1]; }

      FlexiStorage<Counter, ONES> off;
      int32_t psws;  // prefix-scan working space
      FlexiStorage<index_type, SIZE> content;
    };

  }  // namespace cuda
}  // namespace cms

#endif  // HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h