CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
launch.h
Go to the documentation of this file.
1 #ifndef HeterogeneousCore_CUDAUtilities_launch_h
2 #define HeterogeneousCore_CUDAUtilities_launch_h
3 
4 #include <tuple>
5 
6 #include <cuda_runtime.h>
7 
9 
10 /*
11  * `cms::cuda::launch` and `cms::cuda::launch_cooperative` are wrappers around
12  * the CUDA Runtime API calls to setup and call a CUDA kernel from the host.
13  *
14  * `kernel` should be a pointer to a __global__ void(...) function.
15  * `config` describe the launch configuration: the grid size and block size, the
16  * dynamic shared memory size (default to 0) and the CUDA stream to use
17  * (default to 0, the default stream).
18  * `args` are the arguments passed (by value) to the kernel.
19  *
20  * Currently this is requires an extra copy to perform the necessary implicit
21  * conversions and ensure that the arguments match the kernel function signature;
22  * the extra copy could eventually be avoided for arguments that are already of
23  * the exact type.
24  *
25  * Unlike the `kernel<<<...>>>(...)` syntax and the `cuda::launch(...)`
26  * implementation from the CUDA API Wrappers, `cms::cuda::launch(...)` and
27  * `cms::cuda::launch_cooperative` can be called from standard C++ host code.
28  *
29  * Possible optimisations
30  *
31  * - once C++17 is available in CUDA, replace the `pointer_setter` functor
32  * with a simpler function using fold expressions:
33  *
34  * template<int N, class Tuple, std::size_t... Is>
35  * void pointer_setter(void* ptrs[N], Tuple const& t, std::index_sequence<Is...>)
36  * {
37  * ((ptrs[Is] = & std::get<Is>(t)), ...);
38  * }
39  *
40  * - add a template specialisation to `launch` and `launch_cooperative` to
41  * avoid making a temporary copy of the parameters when they match the
42  * kernel signature.
43  */
44 
45 namespace cms {
46  namespace cuda {
47 
49  dim3 gridDim;
50  dim3 blockDim;
51  size_t sharedMem;
52  cudaStream_t stream;
53 
54  LaunchParameters(dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, cudaStream_t stream = nullptr)
55  : gridDim(gridDim), blockDim(blockDim), sharedMem(sharedMem), stream(stream) {}
56 
57  LaunchParameters(int gridDim, int blockDim, size_t sharedMem = 0, cudaStream_t stream = nullptr)
58  : gridDim(gridDim), blockDim(blockDim), sharedMem(sharedMem), stream(stream) {}
59  };
60 
61  namespace detail {
62 
63  template <typename T>
64  struct kernel_traits;
65 
66  template <typename... Args>
67  struct kernel_traits<void(Args...)> {
68  static constexpr size_t arguments_size = sizeof...(Args);
69 
70  using argument_type_tuple = std::tuple<Args...>;
71 
72  template <size_t i>
74  };
75 
76  // fill an array with the pointers to the elements of a tuple
77  template <int I>
78  struct pointer_setter {
79  template <typename Tuple>
80  void operator()(void const* ptrs[], Tuple const& t) {
81  pointer_setter<I - 1>()(ptrs, t);
82  ptrs[I - 1] = &std::get<I - 1>(t);
83  }
84  };
85 
86  template <>
87  struct pointer_setter<0> {
88  template <typename Tuple>
89  void operator()(void const* ptrs[], Tuple const& t) {}
90  };
91 
92  } // namespace detail
93 
94  // wrappers for cudaLaunchKernel
95 
96  inline void launch(void (*kernel)(), LaunchParameters config) {
97  cudaCheck(cudaLaunchKernel(
98  (const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream));
99  }
100 
101  template <typename F, typename... Args>
102 #if __cplusplus >= 201703L
103  std::enable_if_t<std::is_invocable_r<void, F, Args&&...>::value>
104 #else
105  std::enable_if_t<std::is_void<std::result_of_t<F && (Args && ...)> >::value>
106 #endif
107  launch(F* kernel, LaunchParameters config, Args&&... args) {
108  using function_type = detail::kernel_traits<F>;
109  typename function_type::argument_type_tuple args_copy(args...);
110 
111  constexpr auto size = function_type::arguments_size;
112  void const* pointers[size];
113 
114  detail::pointer_setter<size>()(pointers, args_copy);
115  cudaCheck(cudaLaunchKernel(
116  (const void*)kernel, config.gridDim, config.blockDim, (void**)pointers, config.sharedMem, config.stream));
117  }
118 
119  // wrappers for cudaLaunchCooperativeKernel
120 
121  inline void launch_cooperative(void (*kernel)(), LaunchParameters config) {
122  cudaCheck(cudaLaunchCooperativeKernel(
123  (const void*)kernel, config.gridDim, config.blockDim, nullptr, config.sharedMem, config.stream));
124  }
125 
126  template <typename F, typename... Args>
127 #if __cplusplus >= 201703L
128  std::enable_if_t<std::is_invocable_r<void, F, Args&&...>::value>
129 #else
130  std::enable_if_t<std::is_void<std::result_of_t<F && (Args && ...)> >::value>
131 #endif
133  using function_type = detail::kernel_traits<F>;
134  typename function_type::argument_type_tuple args_copy(args...);
135 
136  constexpr auto size = function_type::arguments_size;
137  void const* pointers[size];
138 
139  detail::pointer_setter<size>()(pointers, args_copy);
140  cudaCheck(cudaLaunchCooperativeKernel(
141  (const void*)kernel, config.gridDim, config.blockDim, (void**)pointers, config.sharedMem, config.stream));
142  }
143 
144  } // namespace cuda
145 } // namespace cms
146 
147 #endif // HeterogeneousCore_CUDAUtilities_launch_h
void launch_cooperative(void(*kernel)(), LaunchParameters config)
Definition: launch.h:121
uint32_t T const *__restrict__ uint32_t const *__restrict__ int32_t int Histo::index_type cudaStream_t Func __host__ __device__ V value
LaunchParameters(int gridDim, int blockDim, size_t sharedMem=0, cudaStream_t stream=nullptr)
Definition: launch.h:57
cudaStream_t stream
Definition: launch.h:52
LaunchParameters(dim3 gridDim, dim3 blockDim, size_t sharedMem=0, cudaStream_t stream=nullptr)
Definition: launch.h:54
const std::complex< double > I
Definition: I.h:8
void operator()(void const *ptrs[], Tuple const &t)
Definition: launch.h:89
__host__ __device__ VT uint32_t size
Definition: prefixScan.h:47
void launch(void(*kernel)(), LaunchParameters config)
Definition: launch.h:96
void operator()(void const *ptrs[], Tuple const &t)
Definition: launch.h:80
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:69
#define get
static uInt32 F(BLOWFISH_CTX *ctx, uInt32 x)
Definition: blowfish.cc:163
def config
Definition: cms.py:19
typename std::tuple_element< i, argument_type_tuple >::type argument_type
Definition: launch.h:73