CMS 3D CMS Logo

HistoContainer.h
Go to the documentation of this file.
1 #ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h
2 #define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h
3 
4 #include <algorithm>
5 #ifndef __CUDA_ARCH__
6 #include <atomic>
7 #endif // __CUDA_ARCH__
8 #include <cstddef>
9 #include <cstdint>
10 #include <type_traits>
11 
17 
18 namespace cms {
19  namespace cuda {
20 
21  template <typename Histo, typename T>
22  __global__ void countFromVector(Histo *__restrict__ h,
23  uint32_t nh,
24  T const *__restrict__ v,
25  uint32_t const *__restrict__ offsets) {
26  int first = blockDim.x * blockIdx.x + threadIdx.x;
27  for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) {
28  auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
29  assert((*off) > 0);
30  int32_t ih = off - offsets - 1;
31  assert(ih >= 0);
32  assert(ih < int(nh));
33  (*h).count(v[i], ih);
34  }
35  }
36 
37  template <typename Histo, typename T>
38  __global__ void fillFromVector(Histo *__restrict__ h,
39  uint32_t nh,
40  T const *__restrict__ v,
41  uint32_t const *__restrict__ offsets) {
42  int first = blockDim.x * blockIdx.x + threadIdx.x;
43  for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) {
44  auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
45  assert((*off) > 0);
46  int32_t ih = off - offsets - 1;
47  assert(ih >= 0);
48  assert(ih < int(nh));
49  (*h).fill(v[i], i, ih);
50  }
51  }
52 
53  template <typename Histo>
54  inline __attribute__((always_inline)) void launchZero(Histo *__restrict__ h,
55  cudaStream_t stream
56 #ifndef __CUDACC__
57  = cudaStreamDefault
58 #endif
59  ) {
60  uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off));
61  int32_t size = offsetof(Histo, bins) - offsetof(Histo, off);
62  assert(size >= int(sizeof(uint32_t) * Histo::totbins()));
63 #ifdef __CUDACC__
64  cudaCheck(cudaMemsetAsync(poff, 0, size, stream));
65 #else
66  ::memset(poff, 0, size);
67 #endif
68  }
69 
70  template <typename Histo>
71  inline __attribute__((always_inline)) void launchFinalize(Histo *__restrict__ h,
72  cudaStream_t stream
73 #ifndef __CUDACC__
74  = cudaStreamDefault
75 #endif
76  ) {
77 #ifdef __CUDACC__
78  uint32_t *poff = (uint32_t *)((char *)(h) + offsetof(Histo, off));
79  int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Histo, psws));
80  auto nthreads = 1024;
81  auto nblocks = (Histo::totbins() + nthreads - 1) / nthreads;
82  multiBlockPrefixScan<<<nblocks, nthreads, sizeof(int32_t) * nblocks, stream>>>(
83  poff, poff, Histo::totbins(), ppsws);
84  cudaCheck(cudaGetLastError());
85 #else
86  h->finalize();
87 #endif
88  }
89 
90  template <typename Histo, typename T>
91  inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h,
92  uint32_t nh,
93  T const *__restrict__ v,
94  uint32_t const *__restrict__ offsets,
95  uint32_t totSize,
96  int nthreads,
97  cudaStream_t stream
98 #ifndef __CUDACC__
99  = cudaStreamDefault
100 #endif
101  ) {
102  launchZero(h, stream);
103 #ifdef __CUDACC__
104  auto nblocks = (totSize + nthreads - 1) / nthreads;
105  countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
106  cudaCheck(cudaGetLastError());
107  launchFinalize(h, stream);
108  fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
109  cudaCheck(cudaGetLastError());
110 #else
111  countFromVector(h, nh, v, offsets);
112  h->finalize();
113  fillFromVector(h, nh, v, offsets);
114 #endif
115  }
116 
117  template <typename Assoc>
118  __global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) {
119  assoc->bulkFinalizeFill(*apc);
120  }
121 
122  // iteratate over N bins left and right of the one containing "v"
123  template <typename Hist, typename V, typename Func>
124  __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) {
125  int bs = Hist::bin(value);
126  int be = std::min(int(Hist::nbins() - 1), bs + n);
127  bs = std::max(0, bs - n);
128  assert(be >= bs);
129  for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
130  func(*pj);
131  }
132  }
133 
134  // iteratate over bins containing all values in window wmin, wmax
135  template <typename Hist, typename V, typename Func>
136  __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) {
137  auto bs = Hist::bin(wmin);
138  auto be = Hist::bin(wmax);
139  assert(be >= bs);
140  for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
141  func(*pj);
142  }
143  }
144 
145  template <typename T, // the type of the discretized input values
146  uint32_t NBINS, // number of bins
147  uint32_t SIZE, // max number of element
148  uint32_t S = sizeof(T) * 8, // number of significant bits in T
149  typename I = uint32_t, // type stored in the container (usually an index in a vector of the input values)
150  uint32_t NHISTS = 1 // number of histos stored
151  >
153  public:
154  using Counter = uint32_t;
155 
157 
158  using index_type = I;
159  using UT = typename std::make_unsigned<T>::type;
160 
161  static constexpr uint32_t ilog2(uint32_t v) {
162  constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
163  constexpr uint32_t s[] = {1, 2, 4, 8, 16};
164 
165  uint32_t r = 0; // result of log2(v) will go here
166  for (auto i = 4; i >= 0; i--)
167  if (v & b[i]) {
168  v >>= s[i];
169  r |= s[i];
170  }
171  return r;
172  }
173 
174  static constexpr uint32_t sizeT() { return S; }
175  static constexpr uint32_t nbins() { return NBINS; }
176  static constexpr uint32_t nhists() { return NHISTS; }
177  static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; }
178  static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; }
179  static constexpr uint32_t capacity() { return SIZE; }
180 
181  static constexpr auto histOff(uint32_t nh) { return NBINS * nh; }
182 
183  static constexpr UT bin(T t) {
184  constexpr uint32_t shift = sizeT() - nbits();
185  constexpr uint32_t mask = (1 << nbits()) - 1;
186  return (t >> shift) & mask;
187  }
188 
190  for (auto &i : off)
191  i = 0;
192  }
193 
195  for (uint32_t i = 0; i < totbins(); ++i) {
196 #ifdef __CUDA_ARCH__
197  atomicAdd(off + i, co.off[i]);
198 #else
199  auto &a = (std::atomic<Counter> &)(off[i]);
200  a += co.off[i];
201 #endif
202  }
203  }
204 
205  static __host__ __device__ __forceinline__ uint32_t atomicIncrement(Counter &x) {
206 #ifdef __CUDA_ARCH__
207  return atomicAdd(&x, 1);
208 #else
209  auto &a = (std::atomic<Counter> &)(x);
210  return a++;
211 #endif
212  }
213 
214  static __host__ __device__ __forceinline__ uint32_t atomicDecrement(Counter &x) {
215 #ifdef __CUDA_ARCH__
216  return atomicSub(&x, 1);
217 #else
218  auto &a = (std::atomic<Counter> &)(x);
219  return a--;
220 #endif
221  }
222 
223  __host__ __device__ __forceinline__ void countDirect(T b) {
224  assert(b < nbins());
225  atomicIncrement(off[b]);
226  }
227 
229  assert(b < nbins());
230  auto w = atomicDecrement(off[b]);
231  assert(w > 0);
232  bins[w - 1] = j;
233  }
234 
235  __host__ __device__ __forceinline__ int32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) {
236  auto c = apc.add(n);
237  if (c.m >= nbins())
238  return -int32_t(c.m);
239  off[c.m] = c.n;
240  for (uint32_t j = 0; j < n; ++j)
241  bins[c.n + j] = v[j];
242  return c.m;
243  }
244 
245  __host__ __device__ __forceinline__ void bulkFinalize(AtomicPairCounter const &apc) {
246  off[apc.get().m] = apc.get().n;
247  }
248 
249  __host__ __device__ __forceinline__ void bulkFinalizeFill(AtomicPairCounter const &apc) {
250  auto m = apc.get().m;
251  auto n = apc.get().n;
252  if (m >= nbins()) { // overflow!
253  off[nbins()] = uint32_t(off[nbins() - 1]);
254  return;
255  }
256  auto first = m + blockDim.x * blockIdx.x + threadIdx.x;
257  for (auto i = first; i < totbins(); i += gridDim.x * blockDim.x) {
258  off[i] = n;
259  }
260  }
261 
263  uint32_t b = bin(t);
264  assert(b < nbins());
265  atomicIncrement(off[b]);
266  }
267 
268  __host__ __device__ __forceinline__ void fill(T t, index_type j) {
269  uint32_t b = bin(t);
270  assert(b < nbins());
271  auto w = atomicDecrement(off[b]);
272  assert(w > 0);
273  bins[w - 1] = j;
274  }
275 
277  uint32_t b = bin(t);
278  assert(b < nbins());
279  b += histOff(nh);
280  assert(b < totbins());
281  atomicIncrement(off[b]);
282  }
283 
285  uint32_t b = bin(t);
286  assert(b < nbins());
287  b += histOff(nh);
288  assert(b < totbins());
289  auto w = atomicDecrement(off[b]);
290  assert(w > 0);
291  bins[w - 1] = j;
292  }
293 
294  __host__ __device__ __forceinline__ void finalize(Counter *ws = nullptr) {
295  assert(off[totbins() - 1] == 0);
296  blockPrefixScan(off, totbins(), ws);
297  assert(off[totbins() - 1] == off[totbins() - 2]);
298  }
299 
300  constexpr auto size() const { return uint32_t(off[totbins() - 1]); }
301  constexpr auto size(uint32_t b) const { return off[b + 1] - off[b]; }
302 
303  constexpr index_type const *begin() const { return bins; }
304  constexpr index_type const *end() const { return begin() + size(); }
305 
306  constexpr index_type const *begin(uint32_t b) const { return bins + off[b]; }
307  constexpr index_type const *end(uint32_t b) const { return bins + off[b + 1]; }
308 
309  Counter off[totbins()];
310  int32_t psws; // prefix-scan working space
311  index_type bins[capacity()];
312  };
313 
314  template <typename I, // type stored in the container (usually an index in a vector of the input values)
315  uint32_t MAXONES, // max number of "ones"
316  uint32_t MAXMANYS // max number of "manys"
317  >
318  using OneToManyAssoc = HistoContainer<uint32_t, MAXONES, MAXMANYS, sizeof(uint32_t) * 8, I, 1>;
319 
320  } // namespace cuda
321 } // namespace cms
322 
323 #endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h
cms::cuda::V
cudaStream_t T uint32_t const T *__restrict__ const uint32_t *__restrict__ uint32_t int cudaStream_t V
Definition: HistoContainer.h:99
cms::cudacompat::atomicSub
T1 atomicSub(T1 *a, T2 b)
Definition: cudaCompat.h:53
mps_fire.i
i
Definition: mps_fire.py:428
cms::cuda::n
cudaStream_t T uint32_t const T *__restrict__ const uint32_t *__restrict__ uint32_t int cudaStream_t Func V int n
Definition: HistoContainer.h:124
align::Counter
std::function< unsigned int(align::ID)> Counter
Definition: AlignableIndexer.h:31
NBINS
const int NBINS
Definition: CaloCachedShapeIntegrator.cc:3
cms::cuda::HistoContainer::ilog2
static constexpr uint32_t ilog2(uint32_t v)
Definition: HistoContainer.h:161
nt
int nt
Definition: AMPTWrapper.h:42
min
T min(T a, T b)
Definition: MathUtil.h:58
cms::cuda::HistoContainer::capacity
static constexpr uint32_t capacity()
Definition: HistoContainer.h:179
cms::cuda::totSize
cudaStream_t T uint32_t const T *__restrict__ const uint32_t *__restrict__ uint32_t totSize
Definition: HistoContainer.h:92
__device__
#define __device__
Definition: cudaCompat.h:92
h
FWCore Framework interface EventSetupRecordImplementation h
Helper function to determine trigger accepts.
Definition: L1TUtmAlgorithmRcd.h:4
cms::cuda::size
VT uint32_t size
Definition: prefixScan.h:46
cms::cuda::stream
cudaStream_t stream
Definition: HistoContainer.h:57
cms::cuda::HistoContainer::zero
void zero()
Definition: HistoContainer.h:189
cms::cuda::assert
assert(be >=bs)
cms::cuda::AtomicPairCounter::get
Counters get() const
Definition: AtomicPairCounter.h:35
cms::cuda::HistoContainer::index_type
I index_type
Definition: HistoContainer.h:158
dqmdumpme.first
first
Definition: dqmdumpme.py:55
cms::cudacompat::gridDim
thread_local dim3 gridDim
Definition: cudaCompat.cc:6
__global__
#define __global__
Definition: cudaCompat.h:101
trackingPlots.assoc
assoc
Definition: trackingPlots.py:184
cms::cuda::__attribute__
__attribute__((always_inline)) void countFromVector(Histo *__restrict__ h
prefixScan.h
alignCSCRings.s
s
Definition: alignCSCRings.py:92
cms::cuda::bs
bs
Definition: HistoContainer.h:127
h
cms::cuda::HistoContainer::nhists
static constexpr uint32_t nhists()
Definition: HistoContainer.h:176
Exhume::I
const std::complex< double > I
Definition: I.h:8
compare.hist
hist
Definition: compare.py:376
cuda_std::upper_bound
__host__ constexpr __device__ RandomIt upper_bound(RandomIt first, RandomIt last, const T &value, Compare comp={})
Definition: cudastdAlgorithm.h:45
cms::cuda::func
cudaStream_t T uint32_t const T *__restrict__ const uint32_t *__restrict__ uint32_t int cudaStream_t Func V int Func func
Definition: HistoContainer.h:124
w
const double w
Definition: UKUtility.cc:23
visualization-live-secondInstance_cfg.m
m
Definition: visualization-live-secondInstance_cfg.py:72
cms::cuda::nh
uint32_t nh
Definition: HistoContainer.h:23
mps_fire.end
end
Definition: mps_fire.py:242
LaserClient_cfi.nbins
nbins
Definition: LaserClient_cfi.py:51
submitPVResolutionJobs.count
count
Definition: submitPVResolutionJobs.py:352
cms::cudacompat::atomicAdd
T1 atomicAdd(T1 *a, T2 b)
Definition: cudaCompat.h:46
prod1Switch_cff.cuda
cuda
Definition: prod1Switch_cff.py:11
cms::cuda::HistoContainer::Counter
uint32_t Counter
Definition: HistoContainer.h:154
cms::cuda::HistoContainer::nbits
static constexpr uint32_t nbits()
Definition: HistoContainer.h:178
b
double b
Definition: hdecay.h:118
cms::cuda::AtomicPairCounter::Counters::m
uint32_t m
Definition: AtomicPairCounter.h:25
S
double S(const TLorentzVector &, const TLorentzVector &)
Definition: Particle.cc:97
cms::cuda::HistoContainer::sizeT
static constexpr uint32_t sizeT()
Definition: HistoContainer.h:174
ntuplemaker.fill
fill
Definition: ntuplemaker.py:304
cms::cuda::wmin
V wmin
Definition: HistoContainer.h:136
cms::cuda::AtomicPairCounter
Definition: AtomicPairCounter.h:11
__host__
#define __host__
Definition: cudaCompat.h:91
a
double a
Definition: hdecay.h:119
cms::cudacompat::blockDim
const dim3 blockDim
Definition: cudaCompat.h:25
SiStripPI::max
Definition: SiStripPayloadInspectorHelper.h:169
PVValHelper::add
void add(std::map< std::string, TH1 * > &h, TH1 *hist)
Definition: PVValidationHelpers.cc:12
cms::cuda::wmax
V V wmax
Definition: HistoContainer.h:136
gainCalibHelper::gainCalibPI::type
type
Definition: SiPixelGainCalibHelper.h:39
cms::cudacompat::dim3::x
uint32_t x
Definition: cudaCompat.h:21
value
Definition: value.py:1
cms::cuda::AtomicPairCounter::Counters::n
uint32_t n
Definition: AtomicPairCounter.h:24
cudaCheck.h
cms::cuda::HistoContainer::UT
typename std::make_unsigned< T >::type UT
Definition: HistoContainer.h:159
HltBtagPostValidation_cff.c
c
Definition: HltBtagPostValidation_cff.py:31
cms::cudacompat::threadIdx
const dim3 threadIdx
Definition: cudaCompat.h:24
__forceinline__
#define __forceinline__
Definition: cudaCompat.h:103
alignCSCRings.r
r
Definition: alignCSCRings.py:93
newFWLiteAna.bin
bin
Definition: newFWLiteAna.py:161
cudastdAlgorithm.h
cudaCheck
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:62
cms::cuda::HistoContainer::psws
int32_t psws
Definition: HistoContainer.h:310
cms::cuda::HistoContainer::nbins
static constexpr uint32_t nbins()
Definition: HistoContainer.h:175
edm::shift
static unsigned const int shift
Definition: LuminosityBlockID.cc:7
cms::cuda::offsets
uint32_t const T *__restrict__ const uint32_t *__restrict__ offsets
Definition: HistoContainer.h:25
T
long double T
Definition: Basic3DVectorLD.h:48
cms::cuda::HistoContainer::bin
static constexpr UT bin(T t)
Definition: HistoContainer.h:183
cms::cuda::nthreads
cudaStream_t T uint32_t const T *__restrict__ const uint32_t *__restrict__ uint32_t int nthreads
Definition: HistoContainer.h:92
S
Definition: CSCDBL1TPParametersExtended.h:16
AtomicPairCounter.h
cms::cuda::co
VT * co
Definition: prefixScan.h:46
trigObjTnPSource_cfi.bins
bins
Definition: trigObjTnPSource_cfi.py:20
cms::cuda::be
int be
Definition: HistoContainer.h:126
cms::cuda::HistoContainer
Definition: HistoContainer.h:152
cuda_assert.h
dqmiolumiharvest.j
j
Definition: dqmiolumiharvest.py:66
cms::cuda::HistoContainer::totbins
static constexpr uint32_t totbins()
Definition: HistoContainer.h:177
submitPVValidationJobs.t
string t
Definition: submitPVValidationJobs.py:644
cms::cuda::v
uint32_t const T *__restrict__ v
Definition: HistoContainer.h:23
cms::cuda::HistoContainer::histOff
static constexpr auto histOff(uint32_t nh)
Definition: HistoContainer.h:181
cms
Namespace of DDCMS conversion namespace.
Definition: ProducerAnalyzer.cc:21
cms::cudacompat::blockIdx
thread_local dim3 blockIdx
Definition: cudaCompat.cc:5