CMS 3D CMS Logo

CachingHostAllocator.h
Go to the documentation of this file.
1 #ifndef HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h
2 #define HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h
3 
4 /******************************************************************************
5  * Copyright (c) 2011, Duane Merrill. All rights reserved.
6  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  * * Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  * * Redistributions in binary form must reproduce the above copyright
13  * notice, this list of conditions and the following disclaimer in the
14  * documentation and/or other materials provided with the distribution.
15  * * Neither the name of the NVIDIA CORPORATION nor the
16  * names of its contributors may be used to endorse or promote products
17  * derived from this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
23  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *
30  ******************************************************************************/
31 
36 /******************************************************************************
37  * Simple caching allocator for pinned host memory allocations. The allocator is
38  * thread-safe.
39  ******************************************************************************/
40 
41 #include <cmath>
42 #include <map>
43 #include <set>
44 #include <mutex>
45 
47 
49 namespace notcub {
50 
56  /******************************************************************************
57  * CachingHostAllocator (host use)
58  ******************************************************************************/
59 
101  //---------------------------------------------------------------------
102  // Constants
103  //---------------------------------------------------------------------
104 
106  static const unsigned int INVALID_BIN = (unsigned int)-1;
107 
109  static const size_t INVALID_SIZE = (size_t)-1;
110 
111 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
112 
114  static const int INVALID_DEVICE_ORDINAL = -1;
115 
116  //---------------------------------------------------------------------
117  // Type definitions and helper types
118  //---------------------------------------------------------------------
119 
124  void *d_ptr; // Host pointer
125  size_t bytes; // Size of allocation in bytes
126  unsigned int bin; // Bin enumeration
127  int device; // device ordinal
128  cudaStream_t associated_stream; // Associated associated_stream
129  cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed
130 
131  // Constructor (suitable for searching maps for a specific block, given its pointer)
133  : d_ptr(d_ptr),
134  bytes(0),
135  bin(INVALID_BIN),
137  associated_stream(nullptr),
138  ready_event(nullptr) {}
139 
140  // Constructor (suitable for searching maps for a range of suitable blocks)
142  : d_ptr(nullptr),
143  bytes(0),
144  bin(INVALID_BIN),
146  associated_stream(nullptr),
147  ready_event(nullptr) {}
148 
149  // Comparison functor for comparing host pointers
150  static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.d_ptr < b.d_ptr); }
151 
152  // Comparison functor for comparing allocation sizes
153  static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.bytes < b.bytes); }
154  };
155 
157  typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
158 
159  class TotalBytes {
160  public:
161  size_t free;
162  size_t live;
163  TotalBytes() { free = live = 0; }
164  };
165 
167  typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
168 
170  typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
171 
172  //---------------------------------------------------------------------
173  // Utility functions
174  //---------------------------------------------------------------------
175 
179  static unsigned int IntPow(unsigned int base, unsigned int exp) {
180  unsigned int retval = 1;
181  while (exp > 0) {
182  if (exp & 1) {
183  retval = retval * base; // multiply the result by the current base
184  }
185  base = base * base; // square the base
186  exp = exp >> 1; // divide the exponent in half
187  }
188  return retval;
189  }
190 
194  void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) {
195  power = 0;
196  rounded_bytes = 1;
197 
198  if (value * base < value) {
199  // Overflow
200  power = sizeof(size_t) * 8;
201  rounded_bytes = size_t(0) - 1;
202  return;
203  }
204 
205  while (rounded_bytes < value) {
206  rounded_bytes *= base;
207  power++;
208  }
209  }
210 
211  //---------------------------------------------------------------------
212  // Fields
213  //---------------------------------------------------------------------
214 
216 
217  unsigned int bin_growth;
218  unsigned int min_bin;
219  unsigned int max_bin;
220 
221  size_t min_bin_bytes;
222  size_t max_bin_bytes;
224 
225  const bool
227  bool debug;
228 
232 
233 #endif // DOXYGEN_SHOULD_SKIP_THIS
234 
235  //---------------------------------------------------------------------
236  // Methods
237  //---------------------------------------------------------------------
238 
243  unsigned int bin_growth,
244  unsigned int min_bin = 1,
245  unsigned int max_bin = INVALID_BIN,
246  size_t max_cached_bytes = INVALID_SIZE,
247  bool skip_cleanup =
248  false,
249  bool debug = false)
251  min_bin(min_bin),
252  max_bin(max_bin),
257  debug(debug),
258  cached_blocks(BlockDescriptor::SizeCompare),
259  live_blocks(BlockDescriptor::PtrCompare) {}
260 
274  CachingHostAllocator(bool skip_cleanup = false, bool debug = false)
275  : bin_growth(8),
276  min_bin(3),
277  max_bin(7),
280  max_cached_bytes((max_bin_bytes * 3) - 1),
282  debug(debug),
283  cached_blocks(BlockDescriptor::SizeCompare),
284  live_blocks(BlockDescriptor::PtrCompare) {}
285 
293  // Lock
294  std::unique_lock mutex_locker(mutex);
295 
296  if (debug)
297  printf("Changing max_cached_bytes (%lld -> %lld)\n",
298  (long long)this->max_cached_bytes,
299  (long long)max_cached_bytes);
300 
301  this->max_cached_bytes = max_cached_bytes;
302 
303  // Unlock (redundant, kept for style uniformity)
304  mutex_locker.unlock();
305  }
306 
312  cudaError_t HostAllocate(
313  void **d_ptr,
314  size_t bytes,
315  cudaStream_t active_stream = nullptr)
316  {
317  std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);
318  *d_ptr = nullptr;
319  int device = INVALID_DEVICE_ORDINAL;
320  cudaError_t error = cudaSuccess;
321 
322  cudaCheck(error = cudaGetDevice(&device));
323 
324  // Create a block descriptor for the requested allocation
325  bool found = false;
326  BlockDescriptor search_key;
327  search_key.device = device;
328  search_key.associated_stream = active_stream;
329  NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
330 
331  if (search_key.bin > max_bin) {
332  // Bin is greater than our maximum bin: allocate the request
333  // exactly and give out-of-bounds bin. It will not be cached
334  // for reuse when returned.
335  search_key.bin = INVALID_BIN;
336  search_key.bytes = bytes;
337  } else {
338  // Search for a suitable cached allocation: lock
339  mutex_locker.lock();
340 
341  if (search_key.bin < min_bin) {
342  // Bin is less than minimum bin: round up
343  search_key.bin = min_bin;
344  search_key.bytes = min_bin_bytes;
345  }
346 
347  // Iterate through the range of cached blocks in the same bin
348  CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
349  while ((block_itr != cached_blocks.end()) && (block_itr->bin == search_key.bin)) {
350  // To prevent races with reusing blocks returned by the host but still
351  // in use for transfers, only consider cached blocks that are from an idle stream
352  if (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) {
353  // Reuse existing cache block. Insert into live blocks.
354  found = true;
355  search_key = *block_itr;
356  search_key.associated_stream = active_stream;
357  if (search_key.device != device) {
358  // If "associated" device changes, need to re-create the event on the right device
359  cudaCheck(error = cudaSetDevice(search_key.device));
360  cudaCheck(error = cudaEventDestroy(search_key.ready_event));
361  cudaCheck(error = cudaSetDevice(device));
362  cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
363  search_key.device = device;
364  }
365 
366  live_blocks.insert(search_key);
367 
368  // Remove from free blocks
369  cached_bytes.free -= search_key.bytes;
370  cached_bytes.live += search_key.bytes;
371 
372  if (debug)
373  printf(
374  "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld "
375  "(previously associated with stream %lld, event %lld).\n",
376  search_key.d_ptr,
377  (long long)search_key.bytes,
378  (long long)search_key.associated_stream,
379  (long long)search_key.ready_event,
380  (long long)search_key.device,
381  (long long)block_itr->associated_stream,
382  (long long)block_itr->ready_event);
383 
384  cached_blocks.erase(block_itr);
385 
386  break;
387  }
388  block_itr++;
389  }
390 
391  // Done searching: unlock
392  mutex_locker.unlock();
393  }
394 
395  // Allocate the block if necessary
396  if (!found) {
397  // Attempt to allocate
398  // TODO: eventually support allocation flags
399  if ((error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) ==
400  cudaErrorMemoryAllocation) {
401  // The allocation attempt failed: free all cached blocks on device and retry
402  if (debug)
403  printf(
404  "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached "
405  "allocations",
406  (long long)search_key.bytes,
407  (long long)search_key.associated_stream,
408  (long long)search_key.device);
409 
410  error = cudaSuccess; // Reset the error we will return
411  cudaGetLastError(); // Reset CUDART's error
412 
413  // Lock
414  mutex_locker.lock();
415 
416  // Iterate the range of free blocks
417  CachedBlocks::iterator block_itr = cached_blocks.begin();
418 
419  while ((block_itr != cached_blocks.end())) {
420  // No need to worry about synchronization with the device: cudaFree is
421  // blocking and will synchronize across all kernels executing
422  // on the current device
423 
424  // Free pinned host memory.
425  if ((error = cudaFreeHost(block_itr->d_ptr)))
426  break;
427  if ((error = cudaEventDestroy(block_itr->ready_event)))
428  break;
429 
430  // Reduce balance and erase entry
431  cached_bytes.free -= block_itr->bytes;
432 
433  if (debug)
434  printf(
435  "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
436  "bytes) outstanding.\n",
437  (long long)block_itr->bytes,
438  (long long)cached_blocks.size(),
439  (long long)cached_bytes.free,
440  (long long)live_blocks.size(),
441  (long long)cached_bytes.live);
442 
443  cached_blocks.erase(block_itr);
444 
445  block_itr++;
446  }
447 
448  // Unlock
449  mutex_locker.unlock();
450 
451  // Return under error
452  if (error)
453  return error;
454 
455  // Try to allocate again
456  cudaCheck(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault));
457  }
458 
459  // Create ready event
460  cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
461 
462  // Insert into live blocks
463  mutex_locker.lock();
464  live_blocks.insert(search_key);
465  cached_bytes.live += search_key.bytes;
466  mutex_locker.unlock();
467 
468  if (debug)
469  printf(
470  "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device "
471  "%lld).\n",
472  search_key.d_ptr,
473  (long long)search_key.bytes,
474  (long long)search_key.associated_stream,
475  (long long)search_key.ready_event,
476  (long long)search_key.device);
477  }
478 
479  // Copy host pointer to output parameter
480  *d_ptr = search_key.d_ptr;
481 
482  if (debug)
483  printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
484  (long long)cached_blocks.size(),
485  (long long)cached_bytes.free,
486  (long long)live_blocks.size(),
487  (long long)cached_bytes.live);
488 
489  return error;
490  }
491 
497  cudaError_t HostFree(void *d_ptr) {
498  int entrypoint_device = INVALID_DEVICE_ORDINAL;
499  cudaError_t error = cudaSuccess;
500 
501  // Lock
502  std::unique_lock<std::mutex> mutex_locker(mutex);
503 
504  // Find corresponding block descriptor
505  bool recached = false;
506  BlockDescriptor search_key(d_ptr);
507  BusyBlocks::iterator block_itr = live_blocks.find(search_key);
508  if (block_itr != live_blocks.end()) {
509  // Remove from live blocks
510  search_key = *block_itr;
511  live_blocks.erase(block_itr);
512  cached_bytes.live -= search_key.bytes;
513 
514  // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
515  if ((search_key.bin != INVALID_BIN) && (cached_bytes.free + search_key.bytes <= max_cached_bytes)) {
516  // Insert returned allocation into free blocks
517  recached = true;
518  cached_blocks.insert(search_key);
519  cached_bytes.free += search_key.bytes;
520 
521  if (debug)
522  printf(
523  "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld "
524  "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
525  (long long)search_key.bytes,
526  (long long)search_key.associated_stream,
527  (long long)search_key.ready_event,
528  (long long)search_key.device,
529  (long long)cached_blocks.size(),
530  (long long)cached_bytes.free,
531  (long long)live_blocks.size(),
532  (long long)cached_bytes.live);
533  }
534  }
535 
536  cudaCheck(error = cudaGetDevice(&entrypoint_device));
537  if (entrypoint_device != search_key.device) {
538  cudaCheck(error = cudaSetDevice(search_key.device));
539  }
540 
541  if (recached) {
542  // Insert the ready event in the associated stream (must have current device set properly)
543  cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream));
544  }
545 
546  // Unlock
547  mutex_locker.unlock();
548 
549  if (!recached) {
550  // Free the allocation from the runtime and cleanup the event.
551  cudaCheck(error = cudaFreeHost(d_ptr));
552  cudaCheck(error = cudaEventDestroy(search_key.ready_event));
553 
554  if (debug)
555  printf(
556  "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld available "
557  "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
558  (long long)search_key.bytes,
559  (long long)search_key.associated_stream,
560  (long long)search_key.ready_event,
561  (long long)search_key.device,
562  (long long)cached_blocks.size(),
563  (long long)cached_bytes.free,
564  (long long)live_blocks.size(),
565  (long long)cached_bytes.live);
566  }
567 
568  // Reset device
569  if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) {
570  cudaCheck(error = cudaSetDevice(entrypoint_device));
571  }
572 
573  return error;
574  }
575 
579  cudaError_t FreeAllCached() {
580  cudaError_t error = cudaSuccess;
581  int entrypoint_device = INVALID_DEVICE_ORDINAL;
582  int current_device = INVALID_DEVICE_ORDINAL;
583 
584  std::unique_lock<std::mutex> mutex_locker(mutex);
585 
586  while (!cached_blocks.empty()) {
587  // Get first block
588  CachedBlocks::iterator begin = cached_blocks.begin();
589 
590  // Get entry-point device ordinal if necessary
591  if (entrypoint_device == INVALID_DEVICE_ORDINAL) {
592  if ((error = cudaGetDevice(&entrypoint_device)))
593  break;
594  }
595 
596  // Set current device ordinal if necessary
597  if (begin->device != current_device) {
598  if ((error = cudaSetDevice(begin->device)))
599  break;
600  current_device = begin->device;
601  }
602 
603  // Free host memory
604  if ((error = cudaFreeHost(begin->d_ptr)))
605  break;
606  if ((error = cudaEventDestroy(begin->ready_event)))
607  break;
608 
609  // Reduce balance and erase entry
610  cached_bytes.free -= begin->bytes;
611 
612  if (debug)
613  printf(
614  "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
615  "bytes) outstanding.\n",
616  (long long)begin->bytes,
617  (long long)cached_blocks.size(),
618  (long long)cached_bytes.free,
619  (long long)live_blocks.size(),
620  (long long)cached_bytes.live);
621 
622  cached_blocks.erase(begin);
623  }
624 
625  mutex_locker.unlock();
626 
627  // Attempt to revert back to entry-point device if necessary
628  if (entrypoint_device != INVALID_DEVICE_ORDINAL) {
629  cudaCheck(error = cudaSetDevice(entrypoint_device));
630  }
631 
632  return error;
633  }
634 
639  if (!skip_cleanup)
640  FreeAllCached();
641  }
642  };
643  // end group UtilMgmt
645 
646 } // namespace notcub
647 
648 #endif
bool(* Compare)(const BlockDescriptor &, const BlockDescriptor &)
BlockDescriptor comparator function interface.
CachingHostAllocator(bool skip_cleanup=false, bool debug=false)
Default constructor.
static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
static unsigned int IntPow(unsigned int base, unsigned int exp)
static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
CUB namespace.
void SetMaxCachedBytes(size_t max_cached_bytes)
Sets the limit on the number bytes this allocator is allowed to cache.
static std::mutex mutex
Definition: Proxy.cc:8
void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value)
CachedBlocks cached_blocks
Aggregate cached bytes.
std::multiset< BlockDescriptor, Compare > CachedBlocks
Set type for cached blocks (ordered by size)
size_t min_bin_bytes
Maximum bin enumeration.
static const size_t INVALID_SIZE
Invalid size.
CachingHostAllocator(unsigned int bin_growth, unsigned int min_bin=1, unsigned int max_bin=INVALID_BIN, size_t max_cached_bytes=INVALID_SIZE, bool skip_cleanup=false, bool debug=false)
Set of live pinned host allocations currently in use.
unsigned int max_bin
Minimum bin enumeration.
size_t max_cached_bytes
Maximum bin size.
TotalBytes cached_bytes
Whether or not to print (de)allocation events to stdout.
Definition: value.py:1
size_t max_bin_bytes
Minimum bin size.
static const unsigned int INVALID_BIN
Out-of-bounds bin.
constexpr unsigned int power(unsigned int base, unsigned int exponent)
unsigned int min_bin
Geometric growth factor for bin-sizes.
cudaError_t FreeAllCached()
Frees all cached pinned host allocations.
unsigned int bin_growth
Mutex for thread-safety.
static const int INVALID_DEVICE_ORDINAL
Invalid device ordinal.
double b
Definition: hdecay.h:120
bool debug
Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may hav...
BusyBlocks live_blocks
Set of cached pinned host allocations available for reuse.
double a
Definition: hdecay.h:121
cudaError_t HostFree(void *d_ptr)
Frees a live allocation of pinned host memory, returning it to the allocator.
const bool skip_cleanup
Maximum aggregate cached bytes.
cudaError_t HostAllocate(void **d_ptr, size_t bytes, cudaStream_t active_stream=nullptr)
Provides a suitable allocation of pinned host memory for the given size.
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:69
A simple caching allocator pinned host memory allocations.
std::multiset< BlockDescriptor, Compare > BusyBlocks
Set type for live blocks (ordered by ptr)