1 #ifndef HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h 2 #define HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h 111 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 180 unsigned int retval = 1;
183 retval = retval *
base;
200 power =
sizeof(size_t) * 8;
201 rounded_bytes = size_t(0) - 1;
205 while (rounded_bytes <
value) {
206 rounded_bytes *=
base;
233 #endif // DOXYGEN_SHOULD_SKIP_THIS 294 std::unique_lock mutex_locker(
mutex);
297 printf(
"Changing max_cached_bytes (%lld -> %lld)\n",
298 (
long long)this->max_cached_bytes,
304 mutex_locker.unlock();
315 cudaStream_t active_stream =
nullptr)
317 std::unique_lock<std::mutex> mutex_locker(
mutex, std::defer_lock);
320 cudaError_t
error = cudaSuccess;
327 search_key.
device = device;
336 search_key.
bytes = bytes;
348 CachedBlocks::iterator block_itr =
cached_blocks.lower_bound(search_key);
349 while ((block_itr !=
cached_blocks.end()) && (block_itr->bin == search_key.
bin)) {
352 if (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) {
355 search_key = *block_itr;
357 if (search_key.
device != device) {
363 search_key.
device = device;
374 "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld " 375 "(previously associated with stream %lld, event %lld).\n",
377 (
long long)search_key.
bytes,
380 (
long long)search_key.
device,
381 (
long long)block_itr->associated_stream,
382 (
long long)block_itr->ready_event);
392 mutex_locker.unlock();
399 if (cudaHostAlloc(&search_key.
d_ptr, search_key.
bytes, cudaHostAllocDefault) == cudaErrorMemoryAllocation) {
403 "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached " 405 (
long long)search_key.
bytes,
407 (
long long)search_key.
device);
424 if ((
error = cudaFreeHost(block_itr->d_ptr)))
426 if ((
error = cudaEventDestroy(block_itr->ready_event)))
434 "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " 435 "bytes) outstanding.\n",
436 (
long long)block_itr->bytes,
448 mutex_locker.unlock();
465 mutex_locker.unlock();
469 "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device " 472 (
long long)search_key.
bytes,
475 (
long long)search_key.
device);
479 *d_ptr = search_key.
d_ptr;
482 printf(
"\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
498 cudaError_t
error = cudaSuccess;
501 std::unique_lock<std::mutex> mutex_locker(
mutex);
504 bool recached =
false;
506 BusyBlocks::iterator block_itr =
live_blocks.find(search_key);
509 search_key = *block_itr;
522 "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld " 523 "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
524 (
long long)search_key.
bytes,
527 (
long long)search_key.
device,
536 if (entrypoint_device != search_key.
device) {
546 mutex_locker.unlock();
555 "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld available " 556 "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
557 (
long long)search_key.
bytes,
560 (
long long)search_key.
device,
579 cudaError_t
error = cudaSuccess;
583 std::unique_lock<std::mutex> mutex_locker(
mutex);
591 if ((
error = cudaGetDevice(&entrypoint_device)))
596 if (begin->device != current_device) {
597 if ((
error = cudaSetDevice(begin->device)))
599 current_device = begin->device;
603 if ((
error = cudaFreeHost(begin->d_ptr)))
605 if ((
error = cudaEventDestroy(begin->ready_event)))
613 "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld " 614 "bytes) outstanding.\n",
615 (
long long)begin->bytes,
624 mutex_locker.unlock();
bool(* Compare)(const BlockDescriptor &, const BlockDescriptor &)
BlockDescriptor comparator function interface.
CachingHostAllocator(bool skip_cleanup=false, bool debug=false)
Default constructor.
static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
static unsigned int IntPow(unsigned int base, unsigned int exp)
static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
void SetMaxCachedBytes(size_t max_cached_bytes)
Sets the limit on the number of bytes this allocator is allowed to cache.
void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value)
BlockDescriptor(void *d_ptr)
CachedBlocks cached_blocks
Aggregate cached bytes.
std::multiset< BlockDescriptor, Compare > CachedBlocks
Set type for cached blocks (ordered by size)
size_t min_bin_bytes
Maximum bin enumeration.
cudaStream_t associated_stream
static const size_t INVALID_SIZE
Invalid size.
CachingHostAllocator(unsigned int bin_growth, unsigned int min_bin=1, unsigned int max_bin=INVALID_BIN, size_t max_cached_bytes=INVALID_SIZE, bool skip_cleanup=false, bool debug=false)
Set of live pinned host allocations currently in use.
unsigned int max_bin
Minimum bin enumeration.
size_t max_cached_bytes
Maximum bin size.
TotalBytes cached_bytes
Whether or not to print (de)allocation events to stdout.
size_t max_bin_bytes
Minimum bin size.
static const unsigned int INVALID_BIN
Out-of-bounds bin.
unsigned int min_bin
Geometric growth factor for bin-sizes.
cudaError_t FreeAllCached()
Frees all cached pinned host allocations.
unsigned int bin_growth
Mutex for thread-safety.
static const int INVALID_DEVICE_ORDINAL
Invalid device ordinal.
~CachingHostAllocator()
Destructor.
bool debug
Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may hav...
BusyBlocks live_blocks
Set of cached pinned host allocations available for reuse.
cudaError_t HostFree(void *d_ptr)
Frees a live allocation of pinned host memory, returning it to the allocator.
const bool skip_cleanup
Maximum aggregate cached bytes.
cudaError_t HostAllocate(void **d_ptr, size_t bytes, cudaStream_t active_stream=nullptr)
Provides a suitable allocation of pinned host memory for the given size.
#define cudaCheck(ARG,...)
A simple caching allocator for pinned host memory allocations.
std::multiset< BlockDescriptor, Compare > BusyBlocks
Set type for live blocks (ordered by ptr)