1 #ifndef HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h
2 #define HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h
110 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
122 struct BlockDescriptor {
145 if (
a.device ==
b.device)
146 return (
a.d_ptr <
b.d_ptr);
148 return (
a.device <
b.device);
153 if (
a.device ==
b.device)
154 return (
a.bytes <
b.bytes);
156 return (
a.device <
b.device);
161 typedef bool (*
Compare)(
const BlockDescriptor &,
const BlockDescriptor &);
171 typedef std::multiset<BlockDescriptor, Compare>
CachedBlocks;
174 typedef std::multiset<BlockDescriptor, Compare>
BusyBlocks;
187 unsigned int retval = 1;
207 power =
sizeof(size_t) * 8;
208 rounded_bytes = size_t(0) - 1;
212 while (rounded_bytes <
value) {
213 rounded_bytes *=
base;
241 #endif // DOXYGEN_SHOULD_SKIP_THIS
306 printf(
"Changing max_cached_bytes (%lld -> %lld)\n",
307 (
long long)this->max_cached_bytes,
329 cudaStream_t active_stream =
nullptr)
333 cudaError_t
error = cudaSuccess;
338 device = entrypoint_device;
344 search_key.associated_stream = active_stream;
347 if (search_key.bin >
max_bin) {
352 search_key.bytes = bytes;
357 if (search_key.bin <
min_bin) {
364 CachedBlocks::iterator block_itr =
cached_blocks.lower_bound(search_key);
365 while ((block_itr !=
cached_blocks.end()) && (block_itr->device == device) &&
366 (block_itr->bin == search_key.bin)) {
370 if ((active_stream == block_itr->associated_stream) ||
371 (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
374 search_key = *block_itr;
375 search_key.associated_stream = active_stream;
386 "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously "
387 "associated with stream %lld, event %lld).\n",
390 (
long long)search_key.bytes,
391 (
long long)search_key.associated_stream,
392 (
long long)search_key.ready_event,
393 (
long long)block_itr->associated_stream,
394 (
long long)block_itr->ready_event);
410 if (device != entrypoint_device) {
418 if ((
error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) {
423 "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
425 (
long long)search_key.bytes,
426 (
long long)search_key.associated_stream);
435 BlockDescriptor free_key(device);
436 CachedBlocks::iterator block_itr =
cached_blocks.lower_bound(free_key);
438 while ((block_itr !=
cached_blocks.end()) && (block_itr->device == device)) {
445 if ((
error = cudaFree(block_itr->d_ptr)))
447 if ((
error = cudaEventDestroy(block_itr->ready_event)))
456 "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks "
457 "(%lld bytes) outstanding.\n",
459 (
long long)block_itr->bytes,
479 cudaCheck(
error = cudaMalloc(&search_key.d_ptr, search_key.bytes));
484 cudaCheck(
error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
495 printf(
"\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n",
498 (
long long)search_key.bytes,
499 (
long long)search_key.associated_stream,
500 (
long long)search_key.ready_event);
510 *d_ptr = search_key.d_ptr;
514 printf(
"\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
533 cudaStream_t active_stream =
nullptr)
545 cudaError_t
DeviceFree(
int device,
void *d_ptr) {
547 cudaError_t
error = cudaSuccess;
552 device = entrypoint_device;
559 bool recached =
false;
561 BusyBlocks::iterator block_itr =
live_blocks.find(search_key);
564 search_key = *block_itr;
579 "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
580 "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
582 (
long long)search_key.bytes,
584 (
long long)search_key.associated_stream,
585 (
long long)search_key.ready_event,
594 if (device != entrypoint_device) {
603 cudaCheck(
error = cudaEventRecord(search_key.ready_event, search_key.associated_stream));
618 "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
619 "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
621 (
long long)search_key.bytes,
623 (
long long)search_key.associated_stream,
624 (
long long)search_key.ready_event,
653 cudaError_t
error = cudaSuccess;
666 if ((
error = cudaGetDevice(&entrypoint_device)))
671 if (
begin->device != current_device) {
675 current_device =
begin->device;
682 if ((
error = cudaEventDestroy(
begin->ready_event)))
690 "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
691 "bytes) outstanding.\n",
693 (
long long)
begin->bytes,