1 #ifndef HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h
2 #define HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h
100 struct CachingHostAllocator {
111 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
123 struct BlockDescriptor {
// Multiset of BlockDescriptor entries, ordered using Compare as the
// comparison type (std::multiset keeps equivalent keys side by side).
// CachedBlocks::iterator is the iterator type used when searching
// cached_blocks with lower_bound().
167 typedef std::multiset<BlockDescriptor, Compare>
CachedBlocks;
// Multiset of BlockDescriptor entries, ordered using Compare as the
// comparison type.
// BusyBlocks::iterator is the iterator type used when looking up a
// descriptor in live_blocks with find().
170 typedef std::multiset<BlockDescriptor, Compare>
BusyBlocks;
179 static unsigned int IntPow(
unsigned int base,
unsigned int exp) {
180 unsigned int retval = 1;
183 retval = retval *
base;
200 power =
sizeof(size_t) * 8;
201 rounded_bytes = size_t(0) - 1;
205 while (rounded_bytes <
value) {
206 rounded_bytes *=
base;
233 #endif // DOXYGEN_SHOULD_SKIP_THIS
294 std::unique_lock mutex_locker(
mutex);
297 printf(
"Changing max_cached_bytes (%lld -> %lld)\n",
298 (
long long)this->max_cached_bytes,
304 mutex_locker.unlock();
315 cudaStream_t active_stream =
nullptr)
317 std::unique_lock<std::mutex> mutex_locker(
mutex, std::defer_lock);
320 cudaError_t
error = cudaSuccess;
327 search_key.device = device;
328 search_key.associated_stream = active_stream;
331 if (search_key.bin >
max_bin) {
336 search_key.bytes = bytes;
341 if (search_key.bin <
min_bin) {
348 CachedBlocks::iterator block_itr =
cached_blocks.lower_bound(search_key);
349 while ((block_itr !=
cached_blocks.end()) && (block_itr->bin == search_key.bin)) {
352 if (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) {
355 search_key = *block_itr;
356 search_key.associated_stream = active_stream;
357 if (search_key.device != device) {
362 cudaCheck(
error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
363 search_key.device = device;
374 "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld "
375 "(previously associated with stream %lld, event %lld).\n",
377 (
long long)search_key.bytes,
378 (
long long)search_key.associated_stream,
379 (
long long)search_key.ready_event,
380 (
long long)search_key.device,
381 (
long long)block_itr->associated_stream,
382 (
long long)block_itr->ready_event);
392 mutex_locker.unlock();
399 if ((
error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) ==
400 cudaErrorMemoryAllocation) {
404 "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached "
406 (
long long)search_key.bytes,
407 (
long long)search_key.associated_stream,
408 (
long long)search_key.device);
425 if ((
error = cudaFreeHost(block_itr->d_ptr)))
427 if ((
error = cudaEventDestroy(block_itr->ready_event)))
435 "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
436 "bytes) outstanding.\n",
437 (
long long)block_itr->bytes,
449 mutex_locker.unlock();
456 cudaCheck(
error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault));
460 cudaCheck(
error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
466 mutex_locker.unlock();
470 "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device "
473 (
long long)search_key.bytes,
474 (
long long)search_key.associated_stream,
475 (
long long)search_key.ready_event,
476 (
long long)search_key.device);
480 *d_ptr = search_key.d_ptr;
483 printf(
"\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
499 cudaError_t
error = cudaSuccess;
502 std::unique_lock<std::mutex> mutex_locker(
mutex);
505 bool recached =
false;
506 BlockDescriptor search_key(d_ptr);
507 BusyBlocks::iterator block_itr =
live_blocks.find(search_key);
510 search_key = *block_itr;
523 "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld "
524 "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
525 (
long long)search_key.bytes,
526 (
long long)search_key.associated_stream,
527 (
long long)search_key.ready_event,
528 (
long long)search_key.device,
537 if (entrypoint_device != search_key.device) {
543 cudaCheck(
error = cudaEventRecord(search_key.ready_event, search_key.associated_stream));
547 mutex_locker.unlock();
556 "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld available "
557 "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
558 (
long long)search_key.bytes,
559 (
long long)search_key.associated_stream,
560 (
long long)search_key.ready_event,
561 (
long long)search_key.device,
580 cudaError_t
error = cudaSuccess;
584 std::unique_lock<std::mutex> mutex_locker(
mutex);
592 if ((
error = cudaGetDevice(&entrypoint_device)))
597 if (begin->device != current_device) {
598 if ((
error = cudaSetDevice(begin->device)))
600 current_device = begin->device;
604 if ((
error = cudaFreeHost(begin->d_ptr)))
606 if ((
error = cudaEventDestroy(begin->ready_event)))
614 "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
615 "bytes) outstanding.\n",
616 (
long long)begin->bytes,
625 mutex_locker.unlock();