CMS 3D CMS Logo

HistoContainer.h
Go to the documentation of this file.
1 #ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h
2 #define HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h
3 
5 
6 namespace cms {
7  namespace cuda {
8 
9  template <typename Histo, typename T>
10  __global__ void countFromVector(Histo *__restrict__ h,
11  uint32_t nh,
12  T const *__restrict__ v,
13  uint32_t const *__restrict__ offsets) {
14  int first = blockDim.x * blockIdx.x + threadIdx.x;
15  for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) {
16  auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
17  assert((*off) > 0);
18  int32_t ih = off - offsets - 1;
19  assert(ih >= 0);
20  assert(ih < int(nh));
21  (*h).count(v[i], ih);
22  }
23  }
24 
25  template <typename Histo, typename T>
26  __global__ void fillFromVector(Histo *__restrict__ h,
27  uint32_t nh,
28  T const *__restrict__ v,
29  uint32_t const *__restrict__ offsets) {
30  int first = blockDim.x * blockIdx.x + threadIdx.x;
31  for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) {
32  auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
33  assert((*off) > 0);
34  int32_t ih = off - offsets - 1;
35  assert(ih >= 0);
36  assert(ih < int(nh));
37  (*h).fill(v[i], i, ih);
38  }
39  }
40 
41  template <typename Histo, typename T>
42  inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h,
43  uint32_t nh,
44  T const *__restrict__ v,
45  uint32_t const *__restrict__ offsets,
46  int32_t totSize,
47  int nthreads,
48  typename Histo::index_type *mem,
49  cudaStream_t stream
50 #ifndef __CUDACC__
51  = cudaStreamDefault
52 #endif
53  ) {
54  typename Histo::View view = {h, nullptr, mem, -1, totSize};
55  launchZero(view, stream);
56 #ifdef __CUDACC__
57  auto nblocks = (totSize + nthreads - 1) / nthreads;
58  assert(nblocks > 0);
59  countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
60  cudaCheck(cudaGetLastError());
61  launchFinalize(view, stream);
62  fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
63  cudaCheck(cudaGetLastError());
64 #else
65  countFromVector(h, nh, v, offsets);
66  h->finalize();
67  fillFromVector(h, nh, v, offsets);
68 #endif
69  }
70 
71  // iteratate over N bins left and right of the one containing "v"
72  template <typename Hist, typename V, typename Func>
73  __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) {
74  int bs = Hist::bin(value);
75  int be = std::min(int(Hist::nbins() - 1), bs + n);
76  bs = std::max(0, bs - n);
77  assert(be >= bs);
78  for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
79  func(*pj);
80  }
81  }
82 
83  // iteratate over bins containing all values in window wmin, wmax
84  template <typename Hist, typename V, typename Func>
85  __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) {
86  auto bs = Hist::bin(wmin);
87  auto be = Hist::bin(wmax);
88  assert(be >= bs);
89  for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
90  func(*pj);
91  }
92  }
93 
94  template <typename T, // the type of the discretized input values
95  uint32_t NBINS, // number of bins
96  int32_t SIZE, // max number of element. If -1 is initialized at runtime using external storage
97  uint32_t S = sizeof(T) * 8, // number of significant bits in T
98  typename I = uint32_t, // type stored in the container (usually an index in a vector of the input values)
99  uint32_t NHISTS = 1 // number of histos stored
100  >
102  public:
104  using View = typename Base::View;
105  using Counter = typename Base::Counter;
106  using index_type = typename Base::index_type;
107  using UT = typename std::make_unsigned<T>::type;
108 
109  static constexpr uint32_t ilog2(uint32_t v) {
110  constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
111  constexpr uint32_t s[] = {1, 2, 4, 8, 16};
112 
113  uint32_t r = 0; // result of log2(v) will go here
114  for (auto i = 4; i >= 0; i--)
115  if (v & b[i]) {
116  v >>= s[i];
117  r |= s[i];
118  }
119  return r;
120  }
121 
122  static constexpr uint32_t sizeT() { return S; }
123  static constexpr uint32_t nbins() { return NBINS; }
124  static constexpr uint32_t nhists() { return NHISTS; }
125  static constexpr uint32_t totbins() { return NHISTS * NBINS + 1; }
126  static constexpr uint32_t nbits() { return ilog2(NBINS - 1) + 1; }
127 
128  // static_assert(int32_t(totbins())==Base::ctNOnes());
129 
130  static constexpr auto histOff(uint32_t nh) { return NBINS * nh; }
131 
132  static constexpr UT bin(T t) {
133  constexpr uint32_t shift = sizeT() - nbits();
134  constexpr uint32_t mask = (1 << nbits()) - 1;
135  return (t >> shift) & mask;
136  }
137 
139  uint32_t b = bin(t);
140  assert(b < nbins());
141  Base::atomicIncrement(this->off[b]);
142  }
143 
145  uint32_t b = bin(t);
146  assert(b < nbins());
147  auto w = Base::atomicDecrement(this->off[b]);
148  assert(w > 0);
149  this->content[w - 1] = j;
150  }
151 
153  uint32_t b = bin(t);
154  assert(b < nbins());
155  b += histOff(nh);
156  assert(b < totbins());
157  Base::atomicIncrement(this->off[b]);
158  }
159 
161  uint32_t b = bin(t);
162  assert(b < nbins());
163  b += histOff(nh);
164  assert(b < totbins());
165  auto w = Base::atomicDecrement(this->off[b]);
166  assert(w > 0);
167  this->content[w - 1] = j;
168  }
169  };
170 
171  } // namespace cuda
172 } // namespace cms
173 
174 #endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainer_h
const dim3 threadIdx
Definition: cudaCompat.h:29
static constexpr auto histOff(uint32_t nh)
#define __forceinline__
Definition: cudaCompat.h:22
const dim3 gridDim
Definition: cudaCompat.h:33
#define __host__
T w() const
#define __global__
Definition: cudaCompat.h:19
const dim3 blockDim
Definition: cudaCompat.h:30
__host__ __device__ constexpr RandomIt upper_bound(RandomIt first, RandomIt last, const T &value, Compare comp={})
static constexpr uint32_t ilog2(uint32_t v)
uint32_t T const *__restrict__ uint32_t const *__restrict__ int32_t int Histo::index_type * mem
uint32_t T const *__restrict__ uint32_t const *__restrict__ int32_t int Histo::index_type cudaStream_t stream
assert(be >=bs)
uint32_t T const *__restrict__ uint32_t const *__restrict__ offsets
typename Base::View View
constexpr uint32_t mask
Definition: gpuClustering.h:26
static constexpr uint32_t nbits()
__attribute__((always_inline)) void countFromVector(Histo *__restrict__ h
std::function< unsigned int(align::ID)> Counter
uint32_t T const *__restrict__ uint32_t const *__restrict__ int32_t int Histo::index_type cudaStream_t V
uint32_t T const *__restrict__ uint32_t const *__restrict__ int32_t int Histo::index_type cudaStream_t Func __host__ __device__ V int Func func
typename Base::index_type index_type
int ilog2(double factor)
Definition: Util.h:104
uint32_t T const *__restrict__ v
cms::cuda::HistoContainer< uint8_t, 256, 16000, 8, uint16_t > Hist
static constexpr uint32_t nhists()
const std::complex< double > I
Definition: I.h:8
typename Base::Counter Counter
Definition: value.py:1
uint32_t T const *__restrict__ uint32_t const *__restrict__ int32_t int nthreads
uint32_t nh
const dim3 blockIdx
Definition: cudaCompat.h:32
static constexpr uint32_t sizeT()
int nt
Definition: AMPTWrapper.h:42
Namespace of DDCMS conversion namespace.
static constexpr uint32_t nbins()
static constexpr UT bin(T t)
double b
Definition: hdecay.h:118
typename std::make_unsigned< T >::type UT
static constexpr uint32_t totbins()
uint32_t T const *__restrict__ uint32_t const *__restrict__ int32_t int Histo::index_type cudaStream_t Func __host__ __device__ V int n
static unsigned int const shift
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:69
__host__ __device__ V wmin
The Signals That Services Can Subscribe To This is based on ActivityRegistry h
Helper function to determine trigger accepts.
Definition: Activities.doc:4
long double T
#define __device__
const int NBINS
uint32_t T const *__restrict__ uint32_t const *__restrict__ int32_t totSize
__host__ __device__ V V wmax