CMS 3D CMS Logo

OneToManyAssoc.h
Go to the documentation of this file.
1 #ifndef HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h
2 #define HeterogeneousCore_CUDAUtilities_interface_OneToManyAssoc_h
3 
4 #include <algorithm>
5 #ifndef __CUDA_ARCH__
6 #include <atomic>
7 #endif // __CUDA_ARCH__
8 #include <cstddef>
9 #include <cstdint>
10 #include <type_traits>
11 
18 
19 namespace cms {
20  namespace cuda {
21 
22  template <typename Assoc>
24  using Counter = typename Assoc::Counter;
25  using index_type = typename Assoc::index_type;
26 
27  Assoc *assoc = nullptr;
28  Counter *offStorage = nullptr;
30  int32_t offSize = -1;
31  int32_t contentSize = -1;
32  };
33 
34  // this MUST BE DONE in a single block (or in two kernels!)
35  template <typename Assoc>
36  __global__ void zeroAndInit(OneToManyAssocView<Assoc> view) {
37  auto h = view.assoc;
38  assert(1 == gridDim.x);
39  assert(0 == blockIdx.x);
40 
41  int first = threadIdx.x;
42 
43  if (0 == first) {
44  h->psws = 0;
45  h->initStorage(view);
46  }
47  __syncthreads();
48  for (int i = first, nt = h->totOnes(); i < nt; i += blockDim.x) {
49  h->off[i] = 0;
50  }
51  }
52 
53  template <typename Assoc>
54  inline __attribute__((always_inline)) void launchZero(Assoc *h,
55  cudaStream_t stream
56 #ifndef __CUDACC__
57  = cudaStreamDefault
58 #endif
59  ) {
60  typename Assoc::View view = {h, nullptr, nullptr, -1, -1};
61  launchZero(view, stream);
62  }
63  template <typename Assoc>
64  inline __attribute__((always_inline)) void launchZero(OneToManyAssocView<Assoc> view,
65  cudaStream_t stream
66 #ifndef __CUDACC__
67  = cudaStreamDefault
68 #endif
69  ) {
70 
71  if constexpr (Assoc::ctCapacity() < 0) {
72  assert(view.contentStorage);
73  assert(view.contentSize > 0);
74  }
75  if constexpr (Assoc::ctNOnes() < 0) {
76  assert(view.offStorage);
77  assert(view.offSize > 0);
78  }
79 #ifdef __CUDACC__
80  auto nthreads = 1024;
81  auto nblocks = 1; // MUST BE ONE as memory is initialize in thread 0 (alternative is two kernels);
82  zeroAndInit<<<nblocks, nthreads, 0, stream>>>(view);
83  cudaCheck(cudaGetLastError());
84 #else
85  auto h = view.assoc;
86  assert(h);
87  h->initStorage(view);
88  h->zero();
89  h->psws = 0;
90 #endif
91  }
92 
93  template <typename Assoc>
94  inline __attribute__((always_inline)) void launchFinalize(Assoc *h,
95  cudaStream_t stream
96 #ifndef __CUDACC__
97  = cudaStreamDefault
98 #endif
99  ) {
100  typename Assoc::View view = {h, nullptr, nullptr, -1, -1};
101  launchFinalize(view, stream);
102  }
103 
104  template <typename Assoc>
105  inline __attribute__((always_inline)) void launchFinalize(OneToManyAssocView<Assoc> view,
106  cudaStream_t stream
107 #ifndef __CUDACC__
108  = cudaStreamDefault
109 #endif
110  ) {
111  auto h = view.assoc;
112  assert(h);
113 #ifdef __CUDACC__
114  using Counter = typename Assoc::Counter;
115  Counter *poff = (Counter *)((char *)(h) + offsetof(Assoc, off));
116  auto nOnes = Assoc::ctNOnes();
117  if constexpr (Assoc::ctNOnes() < 0) {
118  assert(view.offStorage);
119  assert(view.offSize > 0);
120  nOnes = view.offSize;
121  poff = view.offStorage;
122  }
123  assert(nOnes > 0);
124  int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Assoc, psws));
125  auto nthreads = 1024;
126  auto nblocks = (nOnes + nthreads - 1) / nthreads;
127  multiBlockPrefixScan<<<nblocks, nthreads, sizeof(int32_t) * nblocks, stream>>>(poff, poff, nOnes, ppsws);
128  cudaCheck(cudaGetLastError());
129 #else
130  h->finalize();
131 #endif
132  }
133 
134  template <typename Assoc>
135  __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) {
136  assoc->bulkFinalizeFill(*apc);
137  }
138 
139  template <typename I, // type stored in the container (usually an index in a vector of the input values)
140  int32_t ONES, // number of "Ones" +1. If -1 is initialized at runtime using external storage
141  int32_t SIZE // max number of element. If -1 is initialized at runtime using external storage
142  >
144  public:
146  using Counter = uint32_t;
147 
149 
150  using index_type = I;
151 
152  static constexpr uint32_t ilog2(uint32_t v) {
153  constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
154  constexpr uint32_t s[] = {1, 2, 4, 8, 16};
155 
156  uint32_t r = 0; // result of log2(v) will go here
157  for (auto i = 4; i >= 0; i--)
158  if (v & b[i]) {
159  v >>= s[i];
160  r |= s[i];
161  }
162  return r;
163  }
164 
165  static constexpr int32_t ctNOnes() { return ONES; }
166  constexpr auto totOnes() const { return off.capacity(); }
167  constexpr auto nOnes() const { return totOnes() - 1; }
168  static constexpr int32_t ctCapacity() { return SIZE; }
169  constexpr auto capacity() const { return content.capacity(); }
170 
172  assert(view.assoc == this);
173  if constexpr (ctCapacity() < 0) {
174  assert(view.contentStorage);
175  assert(view.contentSize > 0);
176  content.init(view.contentStorage, view.contentSize);
177  }
178  if constexpr (ctNOnes() < 0) {
179  assert(view.offStorage);
180  assert(view.offSize > 0);
181  off.init(view.offStorage, view.offSize);
182  }
183  }
184 
186  for (int32_t i = 0; i < totOnes(); ++i) {
187  off[i] = 0;
188  }
189  }
190 
192  for (int32_t i = 0; i < totOnes(); ++i) {
193 #ifdef __CUDA_ARCH__
194  atomicAdd(off.data() + i, co.off[i]);
195 #else
196  auto &a = (std::atomic<Counter> &)(off[i]);
197  a += co.off[i];
198 #endif
199  }
200  }
201 
202  static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) {
203 #ifdef __CUDA_ARCH__
204  return atomicAdd(&x, 1);
205 #else
206  auto &a = (std::atomic<Counter> &)(x);
207  return a++;
208 #endif
209  }
210 
211  static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) {
212 #ifdef __CUDA_ARCH__
213  return atomicSub(&x, 1);
214 #else
215  auto &a = (std::atomic<Counter> &)(x);
216  return a--;
217 #endif
218  }
219 
221  assert(b < nOnes());
222  atomicIncrement(off[b]);
223  }
224 
226  assert(b < nOnes());
227  auto w = atomicDecrement(off[b]);
228  assert(w > 0);
229  content[w - 1] = j;
230  }
231 
232  __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) {
233  auto c = apc.add(n);
234  if (int(c.m) >= nOnes())
235  return -int32_t(c.m);
236  off[c.m] = c.n;
237  for (uint32_t j = 0; j < n; ++j)
238  content[c.n + j] = v[j];
239  return c.m;
240  }
241 
243  off[apc.get().m] = apc.get().n;
244  }
245 
246  __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) {
247  int m = apc.get().m;
248  auto n = apc.get().n;
249  if (m >= nOnes()) { // overflow!
250  off[nOnes()] = uint32_t(off[nOnes() - 1]);
251  return;
252  }
253  auto first = m + blockDim.x * blockIdx.x + threadIdx.x;
254  for (int i = first; i < totOnes(); i += gridDim.x * blockDim.x) {
255  off[i] = n;
256  }
257  }
258 
260  assert(off[totOnes() - 1] == 0);
261  blockPrefixScan(off.data(), totOnes(), ws);
262  assert(off[totOnes() - 1] == off[totOnes() - 2]);
263  }
264 
265  constexpr auto size() const { return uint32_t(off[totOnes() - 1]); }
266  constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; }
267 
268  constexpr index_type const *begin() const { return content.data(); }
269  constexpr index_type const *end() const { return begin() + size(); }
270 
271  constexpr index_type const *begin(uint32_t b) const { return content.data() + off[b]; }
272  constexpr index_type const *end(uint32_t b) const { return content.data() + off[b + 1]; }
273 
275  int32_t psws; // prefix-scan working space
277  };
278 
279  } // namespace cuda
280 } // namespace cms
281 
282 #endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h
cms::cuda::OneToManyAssocView::assoc
Assoc * assoc
Definition: OneToManyAssoc.h:27
cms::cudacompat::atomicSub
T1 atomicSub(T1 *a, T2 b)
Definition: cudaCompat.h:73
cms::cuda::OneToManyAssoc::n
__host__ __device__ const index_type uint32_t n
Definition: OneToManyAssoc.h:232
cms::cuda::OneToManyAssocView::offSize
int32_t offSize
Definition: OneToManyAssoc.h:30
mps_fire.i
i
Definition: mps_fire.py:428
align::Counter
std::function< unsigned int(align::ID)> Counter
Definition: AlignableIndexer.h:31
cms::cuda::OneToManyAssocView::offStorage
Counter * offStorage
Definition: OneToManyAssoc.h:28
sistrip::View
View
Definition: ConstantsForView.h:26
nt
int nt
Definition: AMPTWrapper.h:42
cms::cuda::OneToManyAssoc::content
FlexiStorage< index_type, SIZE > content
Definition: OneToManyAssoc.h:276
cms::cuda::stream
uint32_t const T *__restrict__ const uint32_t *__restrict__ int32_t int Histo::index_type cudaStream_t stream
Definition: HistoContainer.h:51
cms::cuda::assert
assert(be >=bs)
ntupleEnum.SIZE
SIZE
Definition: ntupleEnum.py:86
cms::cudacompat::__syncthreads
void __syncthreads()
Definition: cudaCompat.h:108
gpuVertexFinder::ws
auto &__restrict__ ws
Definition: gpuClusterTracksDBSCAN.h:32
cms::cuda::OneToManyAssoc::w
auto w
Definition: OneToManyAssoc.h:227
h
The Signals That Services Can Subscribe To This is based on ActivityRegistry h
Helper function to determine trigger accepts.
Definition: Activities.doc:4
cms::cuda::OneToManyAssoc
Definition: OneToManyAssoc.h:143
__global__
#define __global__
Definition: cudaCompat.h:19
cms::cuda::ONES
cudaStream_t int32_t ONES
Definition: OneToManyAssoc.h:57
cms::cuda::OneToManyAssocView::contentStorage
index_type * contentStorage
Definition: OneToManyAssoc.h:29
cms::cuda::OneToManyAssoc::v
__host__ __device__ const index_type * v
Definition: OneToManyAssoc.h:232
trackingPlots.assoc
assoc
Definition: trackingPlots.py:183
cms::cuda::__attribute__
__attribute__((always_inline)) void countFromVector(Histo *__restrict__ h
prefixScan.h
alignCSCRings.s
s
Definition: alignCSCRings.py:92
FlexiStorage.h
h
Exhume::I
const std::complex< double > I
Definition: I.h:8
cms::cuda::co
__host__ __device__ VT * co
Definition: prefixScan.h:47
cms::cuda::OneToManyAssoc::j
__host__ __device__ index_type j
Definition: OneToManyAssoc.h:225
cms::cuda::OneToManyAssoc::psws
int32_t psws
Definition: OneToManyAssoc.h:275
mps_fire.end
end
Definition: mps_fire.py:242
cms::cuda::OneToManyAssoc< hindex_type, S+1, 5 *S >::Counter
uint32_t Counter
Definition: OneToManyAssoc.h:146
submitPVResolutionJobs.count
count
Definition: submitPVResolutionJobs.py:352
cms::cudacompat::atomicAdd
T1 atomicAdd(T1 *a, T2 b)
Definition: cudaCompat.h:61
b
double b
Definition: hdecay.h:118
first
auto first
Definition: CAHitNtupletGeneratorKernelsImpl.h:125
cms::cudacompat::gridDim
const dim3 gridDim
Definition: cudaCompat.h:33
cms::cuda::OneToManyAssocView
Definition: OneToManyAssoc.h:23
cms::cuda::FlexiStorage
Definition: FlexiStorage.h:10
ntuplemaker.fill
fill
Definition: ntuplemaker.py:304
cms::cuda::AtomicPairCounter
Definition: AtomicPairCounter.h:11
a
double a
Definition: hdecay.h:119
cms::cuda::OneToManyAssoc::off
__host__ __device__ ONES off
Definition: OneToManyAssoc.h:274
cms::cudacompat::blockDim
const dim3 blockDim
Definition: cudaCompat.h:30
PVValHelper::add
void add(std::map< std::string, TH1 * > &h, TH1 *hist)
Definition: PVValidationHelpers.cc:12
cms::cuda::OneToManyAssoc::ilog2
static constexpr uint32_t ilog2(uint32_t v)
Definition: OneToManyAssoc.h:152
cms::cuda::OneToManyAssoc::zero
__host__ __device__ void zero()
Definition: OneToManyAssoc.h:185
cms::cuda::nthreads
uint32_t const T *__restrict__ const uint32_t *__restrict__ int32_t int nthreads
Definition: HistoContainer.h:43
cms::cuda::OneToManyAssoc::ctNOnes
static constexpr int32_t ctNOnes()
Definition: OneToManyAssoc.h:165
cudaCheck.h
cms::cuda::OneToManyAssoc::m
return c m
Definition: OneToManyAssoc.h:239
__device__
#define __device__
Definition: SiPixelGainForHLTonGPU.h:15
cms::cuda::OneToManyAssoc::assert
assert(w > 0)
cms::cudacompat::threadIdx
const dim3 threadIdx
Definition: cudaCompat.h:29
cms::cuda::OneToManyAssoc::initStorage
__host__ __device__ void initStorage(View view)
Definition: OneToManyAssoc.h:171
__forceinline__
#define __forceinline__
Definition: cudaCompat.h:22
cms::cuda::OneToManyAssoc::nOnes
constexpr auto nOnes() const
Definition: OneToManyAssoc.h:167
cms::cuda::OneToManyAssoc::totOnes
constexpr auto totOnes() const
Definition: OneToManyAssoc.h:166
alignCSCRings.r
r
Definition: alignCSCRings.py:93
cms::cuda::OneToManyAssocView::contentSize
int32_t contentSize
Definition: OneToManyAssoc.h:31
cms::cuda::OneToManyAssoc::content
content[w - 1]
Definition: OneToManyAssoc.h:229
cms::cuda::OneToManyAssoc::ctCapacity
static constexpr int32_t ctCapacity()
Definition: OneToManyAssoc.h:168
cudastdAlgorithm.h
cudaCheck
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:69
cms::cuda::size
__host__ __device__ VT uint32_t size
Definition: prefixScan.h:47
cms::cuda::OneToManyAssocView::Counter
typename Assoc::Counter Counter
Definition: OneToManyAssoc.h:24
apc
const caConstants::TupleMultiplicity const CAHitNtupletGeneratorKernelsGPU::HitToTuple cms::cuda::AtomicPairCounter * apc
Definition: CAHitNtupletGeneratorKernelsImpl.h:43
cms::cuda::OneToManyAssoc::capacity
constexpr auto capacity() const
Definition: OneToManyAssoc.h:169
ecalDigis_cff.cuda
cuda
Definition: ecalDigis_cff.py:35
cms::cuda::OneToManyAssoc< hindex_type, S+1, 5 *S >::index_type
I index_type
Definition: OneToManyAssoc.h:150
gpuVertexFinder::finalize
hist finalize(hws)
AtomicPairCounter.h
cuda_assert.h
c
auto & c
Definition: CAHitNtupletGeneratorKernelsImpl.h:56
cms::cuda::OneToManyAssocView::index_type
typename Assoc::index_type index_type
Definition: OneToManyAssoc.h:25
__host__
#define __host__
Definition: SiPixelGainForHLTonGPU.h:12
cms
Namespace of DDCMS conversion namespace.
Definition: ProducerAnalyzer.cc:21
cms::cuda::OneToManyAssoc::off
off[c.m]
Definition: OneToManyAssoc.h:236
cms::cudacompat::blockIdx
const dim3 blockIdx
Definition: cudaCompat.h:32