1 #ifndef HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h
2 #define HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h
111 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
123 struct BlockDescriptor {
154 if (
a.device ==
b.device)
155 return (
a.d_ptr <
b.d_ptr);
157 return (
a.device <
b.device);
162 if (
a.device ==
b.device)
163 return (
a.bytes <
b.bytes);
165 return (
a.device <
b.device);
175 typedef std::multiset<BlockDescriptor, Compare>
CachedBlocks;
191 static unsigned int IntPow(
unsigned int base,
unsigned int exp) {
192 unsigned int retval = 1;
212 power =
sizeof(size_t) * 8;
213 rounded_bytes = size_t(0) - 1;
217 while (rounded_bytes <
value) {
218 rounded_bytes *=
base;
246 #endif // DOXYGEN_SHOULD_SKIP_THIS
308 std::unique_lock mutex_locker(
mutex);
312 printf(
"Changing max_cached_bytes (%lld -> %lld)\n",
313 (
long long)this->max_cached_bytes,
319 mutex_locker.unlock();
335 cudaStream_t active_stream =
nullptr)
338 std::unique_lock<std::mutex> mutex_locker(
mutex, std::defer_lock);
341 cudaError_t
error = cudaSuccess;
346 device = entrypoint_device;
351 BlockDescriptor search_key(device);
352 search_key.bytesRequested = bytes;
353 search_key.associated_stream = active_stream;
361 search_key.bytes = bytes;
366 if (search_key.bin <
min_bin) {
373 CachedBlocks::iterator block_itr =
cached_blocks.lower_bound(search_key);
374 while ((block_itr !=
cached_blocks.end()) && (block_itr->device == device) &&
375 (block_itr->bin == search_key.bin)) {
379 if ((active_stream == block_itr->associated_stream) ||
380 (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
383 search_key = *block_itr;
384 search_key.associated_stream = active_stream;
390 cached_bytes[device].liveRequested += search_key.bytesRequested;
396 "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously "
397 "associated with stream %lld, event %lld).\n",
400 (
long long)search_key.bytes,
401 (
long long)search_key.associated_stream,
402 (
long long)search_key.ready_event,
403 (
long long)block_itr->associated_stream,
404 (
long long)block_itr->ready_event);
414 mutex_locker.unlock();
420 if (device != entrypoint_device) {
428 if ((
error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) {
433 "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
435 (
long long)search_key.bytes,
436 (
long long)search_key.associated_stream);
445 BlockDescriptor free_key(device);
446 CachedBlocks::iterator block_itr =
cached_blocks.lower_bound(free_key);
448 while ((block_itr !=
cached_blocks.end()) && (block_itr->device == device)) {
455 if ((
error = cudaFree(block_itr->d_ptr)))
457 if ((
error = cudaEventDestroy(block_itr->ready_event)))
466 "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks "
467 "(%lld bytes) outstanding.\n",
469 (
long long)block_itr->bytes,
481 mutex_locker.unlock();
489 cudaCheck(
error = cudaMalloc(&search_key.d_ptr, search_key.bytes));
494 cudaCheck(
error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
500 cached_bytes[device].liveRequested += search_key.bytesRequested;
501 mutex_locker.unlock();
506 printf(
"\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n",
509 (
long long)search_key.bytes,
510 (
long long)search_key.associated_stream,
511 (
long long)search_key.ready_event);
521 *d_ptr = search_key.d_ptr;
525 printf(
"\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
544 cudaStream_t active_stream =
nullptr)
556 cudaError_t
DeviceFree(
int device,
void *d_ptr) {
558 cudaError_t
error = cudaSuccess;
560 std::unique_lock<std::mutex> mutex_locker(
mutex, std::defer_lock);
565 device = entrypoint_device;
572 bool recached =
false;
573 BlockDescriptor search_key(d_ptr, device);
574 BusyBlocks::iterator block_itr =
live_blocks.find(search_key);
577 search_key = *block_itr;
580 cached_bytes[device].liveRequested -= search_key.bytesRequested;
593 "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
594 "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
596 (
long long)search_key.bytes,
598 (
long long)search_key.associated_stream,
599 (
long long)search_key.ready_event,
608 if (device != entrypoint_device) {
617 cudaCheck(
error = cudaEventRecord(search_key.ready_event, search_key.associated_stream));
621 mutex_locker.unlock();
632 "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
633 "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
635 (
long long)search_key.bytes,
637 (
long long)search_key.associated_stream,
638 (
long long)search_key.ready_event,
667 cudaError_t
error = cudaSuccess;
671 std::unique_lock<std::mutex> mutex_locker(
mutex);
680 if ((
error = cudaGetDevice(&entrypoint_device)))
685 if (begin->device != current_device) {
687 if ((
error = cudaSetDevice(begin->device)))
689 current_device = begin->device;
694 if ((
error = cudaFree(begin->d_ptr)))
696 if ((
error = cudaEventDestroy(begin->ready_event)))
704 "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
705 "bytes) outstanding.\n",
707 (
long long)begin->bytes,
716 mutex_locker.unlock();
729 std::unique_lock mutex_locker(
mutex);