1 #ifndef HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h 2 #define HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h 111 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 154 if (
a.device ==
b.device)
155 return (
a.d_ptr <
b.d_ptr);
157 return (
a.device <
b.device);
162 if (
a.device ==
b.device)
163 return (
a.bytes <
b.bytes);
165 return (
a.device <
b.device);
192 unsigned int retval = 1;
195 retval = retval *
base;
212 power =
sizeof(size_t) * 8;
213 rounded_bytes = size_t(0) - 1;
217 while (rounded_bytes <
value) {
218 rounded_bytes *=
base;
246 #endif // DOXYGEN_SHOULD_SKIP_THIS 308 std::unique_lock mutex_locker(
mutex);
312 printf(
"Changing max_cached_bytes (%lld -> %lld)\n",
313 (
long long)this->max_cached_bytes,
319 mutex_locker.unlock();
335 cudaStream_t active_stream =
nullptr)
338 std::unique_lock<std::mutex> mutex_locker(
mutex, std::defer_lock);
341 cudaError_t
error = cudaSuccess;
346 device = entrypoint_device;
361 search_key.
bytes = bytes;
373 CachedBlocks::iterator block_itr =
cached_blocks.lower_bound(search_key);
374 while ((block_itr !=
cached_blocks.end()) && (block_itr->device == device) &&
375 (block_itr->bin == search_key.
bin)) {
379 if ((active_stream == block_itr->associated_stream) ||
380 (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
383 search_key = *block_itr;
396 "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously " 397 "associated with stream %lld, event %lld).\n",
400 (
long long)search_key.
bytes,
403 (
long long)block_itr->associated_stream,
404 (
long long)block_itr->ready_event);
414 mutex_locker.unlock();
420 if (device != entrypoint_device) {
428 if ((
error = cudaMalloc(&search_key.
d_ptr, search_key.
bytes)) == cudaErrorMemoryAllocation) {
433 "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
435 (
long long)search_key.
bytes,
446 CachedBlocks::iterator block_itr =
cached_blocks.lower_bound(free_key);
448 while ((block_itr !=
cached_blocks.end()) && (block_itr->device == device)) {
455 if ((
error = cudaFree(block_itr->d_ptr)))
457 if ((
error = cudaEventDestroy(block_itr->ready_event)))
466 "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks " 467 "(%lld bytes) outstanding.\n",
469 (
long long)block_itr->bytes,
481 mutex_locker.unlock();
501 mutex_locker.unlock();
506 printf(
"\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n",
509 (
long long)search_key.
bytes,
521 *d_ptr = search_key.
d_ptr;
525 printf(
"\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
544 cudaStream_t active_stream =
nullptr)
558 cudaError_t
error = cudaSuccess;
560 std::unique_lock<std::mutex> mutex_locker(
mutex, std::defer_lock);
565 device = entrypoint_device;
572 bool recached =
false;
574 BusyBlocks::iterator block_itr =
live_blocks.find(search_key);
577 search_key = *block_itr;
593 "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available " 594 "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
596 (
long long)search_key.
bytes,
608 if (device != entrypoint_device) {
621 mutex_locker.unlock();
632 "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available " 633 "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
635 (
long long)search_key.
bytes,
667 cudaError_t
error = cudaSuccess;
671 std::unique_lock<std::mutex> mutex_locker(
mutex);
680 if ((
error = cudaGetDevice(&entrypoint_device)))
685 if (begin->device != current_device) {
687 if ((
error = cudaSetDevice(begin->device)))
689 current_device = begin->device;
694 if ((
error = cudaFree(begin->d_ptr)))
696 if ((
error = cudaEventDestroy(begin->ready_event)))
704 "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " 705 "bytes) outstanding.\n",
707 (
long long)begin->bytes,
716 mutex_locker.unlock();
729 std::unique_lock mutex_locker(
mutex);
CachedBlocks cached_blocks
Map of device ordinal to aggregate cached bytes on that device.
static const unsigned int INVALID_BIN
Out-of-bounds bin.
cudaStream_t associated_stream
CachingDeviceAllocator(bool skip_cleanup=false, bool debug=false)
Default constructor.
A simple caching allocator for device memory allocations.
BusyBlocks live_blocks
Set of cached device allocations available for reuse.
BlockDescriptor(int device)
cudaError_t DeviceFree(void *d_ptr)
Frees a live allocation of device memory on the current device, returning it to the allocator...
GpuCachedBytes CacheStatus() const
cms::cuda::allocator::GpuCachedBytes GpuCachedBytes
Map type of device ordinals to the number of cached bytes cached by each device.
cudaError_t DeviceFree(int device, void *d_ptr)
Frees a live allocation of device memory on the specified device, returning it to the allocator...
void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value)
~CachingDeviceAllocator()
Destructor.
bool(* Compare)(const BlockDescriptor &, const BlockDescriptor &)
BlockDescriptor comparator function interface.
static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
std::multiset< BlockDescriptor, Compare > BusyBlocks
Set type for live blocks (ordered by ptr)
static unsigned int IntPow(unsigned int base, unsigned int exp)
BlockDescriptor(void *d_ptr, int device)
static const size_t INVALID_SIZE
Invalid size.
static const int INVALID_DEVICE_ORDINAL
Invalid device ordinal.
const bool skip_cleanup
Maximum aggregate cached bytes per device.
unsigned int bin_growth
Mutex for thread-safety.
cudaError_t DeviceAllocate(int device, void **d_ptr, size_t bytes, cudaStream_t active_stream=nullptr)
Provides a suitable allocation of device memory for the given size on the specified device...
static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
unsigned int max_bin
Minimum bin enumeration.
size_t max_bin_bytes
Minimum bin size.
CachingDeviceAllocator(unsigned int bin_growth, unsigned int min_bin=1, unsigned int max_bin=INVALID_BIN, size_t max_cached_bytes=INVALID_SIZE, bool skip_cleanup=false, bool debug=false)
Set of live device allocations currently in use.
std::multiset< BlockDescriptor, Compare > CachedBlocks
Set type for cached blocks (ordered by size)
cudaError_t DeviceAllocate(void **d_ptr, size_t bytes, cudaStream_t active_stream=nullptr)
Provides a suitable allocation of device memory for the given size on the current device...
cudaError_t FreeAllCached()
Frees all cached device allocations on all devices.
GpuCachedBytes cached_bytes
Whether or not to print (de)allocation events to stdout.
size_t max_cached_bytes
Maximum bin size.
cudaError_t SetMaxCachedBytes(size_t max_cached_bytes)
Sets the limit on the number bytes this allocator is allowed to cache per device. ...
unsigned int min_bin
Geometric growth factor for bin-sizes.
#define cudaCheck(ARG,...)
bool debug
Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may hav...
std::map< int, TotalBytes > GpuCachedBytes
Map type of device ordinals to the number of cached bytes cached by each device.
size_t min_bin_bytes
Maximum bin enumeration.