CMS 3D CMS Logo

CachingHostAllocator.h
Go to the documentation of this file.
1 #ifndef HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h
2 #define HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h
3 
4 /******************************************************************************
5  * Copyright (c) 2011, Duane Merrill. All rights reserved.
6  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  * * Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  * * Redistributions in binary form must reproduce the above copyright
13  * notice, this list of conditions and the following disclaimer in the
14  * documentation and/or other materials provided with the distribution.
15  * * Neither the name of the NVIDIA CORPORATION nor the
16  * names of its contributors may be used to endorse or promote products
17  * derived from this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
23  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *
30  ******************************************************************************/
31 
36 /******************************************************************************
37  * Simple caching allocator for pinned host memory allocations. The allocator is
38  * thread-safe.
39  ******************************************************************************/
40 
41 #include <cmath>
42 #include <map>
43 #include <set>
44 #include <mutex>
45 
47 
49 namespace notcub {
50 
56  /******************************************************************************
57  * CachingHostAllocator (host use)
58  ******************************************************************************/
59 
101  //---------------------------------------------------------------------
102  // Constants
103  //---------------------------------------------------------------------
104 
106  static const unsigned int INVALID_BIN = (unsigned int)-1;
107 
109  static const size_t INVALID_SIZE = (size_t)-1;
110 
111 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
112 
114  static const int INVALID_DEVICE_ORDINAL = -1;
115 
116  //---------------------------------------------------------------------
117  // Type definitions and helper types
118  //---------------------------------------------------------------------
119 
124  void *d_ptr; // Host pointer
125  size_t bytes; // Size of allocation in bytes
126  unsigned int bin; // Bin enumeration
127  int device; // device ordinal
128  cudaStream_t associated_stream; // Associated associated_stream
129  cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed
130 
131  // Constructor (suitable for searching maps for a specific block, given its pointer)
133  : d_ptr(d_ptr),
134  bytes(0),
135  bin(INVALID_BIN),
137  associated_stream(nullptr),
138  ready_event(nullptr) {}
139 
140  // Constructor (suitable for searching maps for a range of suitable blocks)
142  : d_ptr(nullptr),
143  bytes(0),
144  bin(INVALID_BIN),
146  associated_stream(nullptr),
147  ready_event(nullptr) {}
148 
149  // Comparison functor for comparing host pointers
150  static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.d_ptr < b.d_ptr); }
151 
152  // Comparison functor for comparing allocation sizes
153  static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.bytes < b.bytes); }
154  };
155 
157  typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
158 
159  class TotalBytes {
160  public:
161  size_t free;
162  size_t live;
163  TotalBytes() { free = live = 0; }
164  };
165 
167  typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
168 
170  typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
171 
172  //---------------------------------------------------------------------
173  // Utility functions
174  //---------------------------------------------------------------------
175 
179  static unsigned int IntPow(unsigned int base, unsigned int exp) {
180  unsigned int retval = 1;
181  while (exp > 0) {
182  if (exp & 1) {
183  retval = retval * base; // multiply the result by the current base
184  }
185  base = base * base; // square the base
186  exp = exp >> 1; // divide the exponent in half
187  }
188  return retval;
189  }
190 
194  void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) {
195  power = 0;
196  rounded_bytes = 1;
197 
198  if (value * base < value) {
199  // Overflow
200  power = sizeof(size_t) * 8;
201  rounded_bytes = size_t(0) - 1;
202  return;
203  }
204 
205  while (rounded_bytes < value) {
206  rounded_bytes *= base;
207  power++;
208  }
209  }
210 
211  //---------------------------------------------------------------------
212  // Fields
213  //---------------------------------------------------------------------
214 
216 
217  unsigned int bin_growth;
218  unsigned int min_bin;
219  unsigned int max_bin;
220 
221  size_t min_bin_bytes;
222  size_t max_bin_bytes;
224 
225  const bool
227  bool debug;
228 
232 
233 #endif // DOXYGEN_SHOULD_SKIP_THIS
234 
235  //---------------------------------------------------------------------
236  // Methods
237  //---------------------------------------------------------------------
238 
243  unsigned int bin_growth,
244  unsigned int min_bin = 1,
245  unsigned int max_bin = INVALID_BIN,
246  size_t max_cached_bytes = INVALID_SIZE,
247  bool skip_cleanup =
248  false,
249  bool debug = false)
251  min_bin(min_bin),
252  max_bin(max_bin),
257  debug(debug),
258  cached_blocks(BlockDescriptor::SizeCompare),
259  live_blocks(BlockDescriptor::PtrCompare) {}
260 
274  CachingHostAllocator(bool skip_cleanup = false, bool debug = false)
275  : bin_growth(8),
276  min_bin(3),
277  max_bin(7),
280  max_cached_bytes((max_bin_bytes * 3) - 1),
282  debug(debug),
283  cached_blocks(BlockDescriptor::SizeCompare),
284  live_blocks(BlockDescriptor::PtrCompare) {}
285 
293  // Lock
294  std::unique_lock mutex_locker(mutex);
295 
296  if (debug)
297  printf("Changing max_cached_bytes (%lld -> %lld)\n",
298  (long long)this->max_cached_bytes,
299  (long long)max_cached_bytes);
300 
301  this->max_cached_bytes = max_cached_bytes;
302 
303  // Unlock (redundant, kept for style uniformity)
304  mutex_locker.unlock();
305  }
306 
312  cudaError_t HostAllocate(
313  void **d_ptr,
314  size_t bytes,
315  cudaStream_t active_stream = nullptr)
316  {
317  std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);
318  *d_ptr = nullptr;
319  int device = INVALID_DEVICE_ORDINAL;
320  cudaError_t error = cudaSuccess;
321 
322  cudaCheck(error = cudaGetDevice(&device));
323 
324  // Create a block descriptor for the requested allocation
325  bool found = false;
326  BlockDescriptor search_key;
327  search_key.device = device;
328  search_key.associated_stream = active_stream;
329  NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
330 
331  if (search_key.bin > max_bin) {
332  // Bin is greater than our maximum bin: allocate the request
333  // exactly and give out-of-bounds bin. It will not be cached
334  // for reuse when returned.
335  search_key.bin = INVALID_BIN;
336  search_key.bytes = bytes;
337  } else {
338  // Search for a suitable cached allocation: lock
339  mutex_locker.lock();
340 
341  if (search_key.bin < min_bin) {
342  // Bin is less than minimum bin: round up
343  search_key.bin = min_bin;
344  search_key.bytes = min_bin_bytes;
345  }
346 
347  // Iterate through the range of cached blocks in the same bin
348  CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
349  while ((block_itr != cached_blocks.end()) && (block_itr->bin == search_key.bin)) {
350  // To prevent races with reusing blocks returned by the host but still
351  // in use for transfers, only consider cached blocks that are from an idle stream
352  if (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) {
353  // Reuse existing cache block. Insert into live blocks.
354  found = true;
355  search_key = *block_itr;
356  search_key.associated_stream = active_stream;
357  if (search_key.device != device) {
358  // If "associated" device changes, need to re-create the event on the right device
359  cudaCheck(error = cudaSetDevice(search_key.device));
360  cudaCheck(error = cudaEventDestroy(search_key.ready_event));
361  cudaCheck(error = cudaSetDevice(device));
362  cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
363  search_key.device = device;
364  }
365 
366  live_blocks.insert(search_key);
367 
368  // Remove from free blocks
369  cached_bytes.free -= search_key.bytes;
370  cached_bytes.live += search_key.bytes;
371 
372  if (debug)
373  printf(
374  "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld "
375  "(previously associated with stream %lld, event %lld).\n",
376  search_key.d_ptr,
377  (long long)search_key.bytes,
378  (long long)search_key.associated_stream,
379  (long long)search_key.ready_event,
380  (long long)search_key.device,
381  (long long)block_itr->associated_stream,
382  (long long)block_itr->ready_event);
383 
384  cached_blocks.erase(block_itr);
385 
386  break;
387  }
388  block_itr++;
389  }
390 
391  // Done searching: unlock
392  mutex_locker.unlock();
393  }
394 
395  // Allocate the block if necessary
396  if (!found) {
397  // Attempt to allocate
398  // TODO: eventually support allocation flags
399  if (cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault) == cudaErrorMemoryAllocation) {
400  // The allocation attempt failed: free all cached blocks on device and retry
401  if (debug)
402  printf(
403  "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached "
404  "allocations",
405  (long long)search_key.bytes,
406  (long long)search_key.associated_stream,
407  (long long)search_key.device);
408 
409  error = cudaSuccess; // Reset the error we will return
410  cudaGetLastError(); // Reset CUDART's error
411 
412  // Lock
413  mutex_locker.lock();
414 
415  // Iterate the range of free blocks
416  CachedBlocks::iterator block_itr = cached_blocks.begin();
417 
418  while ((block_itr != cached_blocks.end())) {
419  // No need to worry about synchronization with the device: cudaFree is
420  // blocking and will synchronize across all kernels executing
421  // on the current device
422 
423  // Free pinned host memory.
424  if ((error = cudaFreeHost(block_itr->d_ptr)))
425  break;
426  if ((error = cudaEventDestroy(block_itr->ready_event)))
427  break;
428 
429  // Reduce balance and erase entry
430  cached_bytes.free -= block_itr->bytes;
431 
432  if (debug)
433  printf(
434  "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
435  "bytes) outstanding.\n",
436  (long long)block_itr->bytes,
437  (long long)cached_blocks.size(),
438  (long long)cached_bytes.free,
439  (long long)live_blocks.size(),
440  (long long)cached_bytes.live);
441 
442  cached_blocks.erase(block_itr);
443 
444  block_itr++;
445  }
446 
447  // Unlock
448  mutex_locker.unlock();
449 
450  // Return under error
451  if (error)
452  return error;
453 
454  // Try to allocate again
455  cudaCheck(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault));
456  }
457 
458  // Create ready event
459  cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
460 
461  // Insert into live blocks
462  mutex_locker.lock();
463  live_blocks.insert(search_key);
464  cached_bytes.live += search_key.bytes;
465  mutex_locker.unlock();
466 
467  if (debug)
468  printf(
469  "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device "
470  "%lld).\n",
471  search_key.d_ptr,
472  (long long)search_key.bytes,
473  (long long)search_key.associated_stream,
474  (long long)search_key.ready_event,
475  (long long)search_key.device);
476  }
477 
478  // Copy host pointer to output parameter
479  *d_ptr = search_key.d_ptr;
480 
481  if (debug)
482  printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
483  (long long)cached_blocks.size(),
484  (long long)cached_bytes.free,
485  (long long)live_blocks.size(),
486  (long long)cached_bytes.live);
487 
488  return error;
489  }
490 
 /**
  * \brief Frees a live allocation of pinned host memory, returning it to the
  * allocator.
  *
  * If the block's bin is valid and re-caching it would not push cached_bytes.free
  * past max_cached_bytes, the block is moved back into the free cache and its
  * ready_event is recorded on the associated stream (so it is only reused once
  * that stream has drained). Otherwise the memory and event are released back
  * to the CUDA runtime immediately.
  *
  * \param d_ptr Host pointer previously obtained from HostAllocate.
  * \return cudaSuccess, or the last CUDA error encountered.
  */
 cudaError_t HostFree(void *d_ptr) {
   int entrypoint_device = INVALID_DEVICE_ORDINAL;
   cudaError_t error = cudaSuccess;

   // Lock
   std::unique_lock<std::mutex> mutex_locker(mutex);

   // Find corresponding block descriptor (live_blocks is ordered by pointer,
   // so a pointer-only search key is sufficient).
   // NOTE(review): if d_ptr is not found in live_blocks, the code below still
   // runs with the default-constructed search_key (null ready_event) and
   // attempts cudaFreeHost/cudaEventDestroy on it — confirm this path is
   // intended only for pointers owned by this allocator.
   bool recached = false;
   BlockDescriptor search_key(d_ptr);
   BusyBlocks::iterator block_itr = live_blocks.find(search_key);
   if (block_itr != live_blocks.end()) {
     // Remove from live blocks
     search_key = *block_itr;
     live_blocks.erase(block_itr);
     cached_bytes.live -= search_key.bytes;

     // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
     if ((search_key.bin != INVALID_BIN) && (cached_bytes.free + search_key.bytes <= max_cached_bytes)) {
       // Insert returned allocation into free blocks
       recached = true;
       cached_blocks.insert(search_key);
       cached_bytes.free += search_key.bytes;

       if (debug)
         printf(
             "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld "
             "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
             (long long)search_key.bytes,
             (long long)search_key.associated_stream,
             (long long)search_key.ready_event,
             (long long)search_key.device,
             (long long)cached_blocks.size(),
             (long long)cached_bytes.free,
             (long long)live_blocks.size(),
             (long long)cached_bytes.live);
     }
   }

   // Switch to the block's device if it differs from the caller's current
   // device: the event must be recorded/destroyed on the device it lives on.
   cudaCheck(error = cudaGetDevice(&entrypoint_device));
   if (entrypoint_device != search_key.device) {
     cudaCheck(error = cudaSetDevice(search_key.device));
   }

   if (recached) {
     // Insert the ready event in the associated stream (must have current device set properly)
     cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream));
   }

   // Unlock before the potentially blocking cudaFreeHost below.
   mutex_locker.unlock();

   if (!recached) {
     // Free the allocation from the runtime and cleanup the event.
     cudaCheck(error = cudaFreeHost(d_ptr));
     cudaCheck(error = cudaEventDestroy(search_key.ready_event));

     if (debug)
       printf(
           "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld available "
           "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
           (long long)search_key.bytes,
           (long long)search_key.associated_stream,
           (long long)search_key.ready_event,
           (long long)search_key.device,
           (long long)cached_blocks.size(),
           (long long)cached_bytes.free,
           (long long)live_blocks.size(),
           (long long)cached_bytes.live);
   }

   // Reset device to the one active on entry, if we changed it.
   if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) {
     cudaCheck(error = cudaSetDevice(entrypoint_device));
   }

   return error;
 }
574 
 /**
  * \brief Frees all cached pinned host allocations (live blocks are untouched).
  *
  * Iterates the free cache, switching the current device to each block's
  * device before releasing its memory and event, and restores the entry-point
  * device on exit.
  *
  * \return cudaSuccess, or a CUDA error from the cleanup calls.
  */
 cudaError_t FreeAllCached() {
   cudaError_t error = cudaSuccess;
   int entrypoint_device = INVALID_DEVICE_ORDINAL;
   int current_device = INVALID_DEVICE_ORDINAL;

   std::unique_lock<std::mutex> mutex_locker(mutex);

   while (!cached_blocks.empty()) {
     // Get first block
     CachedBlocks::iterator begin = cached_blocks.begin();

     // Get entry-point device ordinal if necessary (queried lazily so a
     // fully empty cache performs no CUDA calls).
     if (entrypoint_device == INVALID_DEVICE_ORDINAL) {
       if ((error = cudaGetDevice(&entrypoint_device)))
         break;
     }

     // Set current device ordinal if necessary
     if (begin->device != current_device) {
       if ((error = cudaSetDevice(begin->device)))
         break;
       current_device = begin->device;
     }

     // Free host memory
     if ((error = cudaFreeHost(begin->d_ptr)))
       break;
     if ((error = cudaEventDestroy(begin->ready_event)))
       break;

     // Reduce balance and erase entry
     cached_bytes.free -= begin->bytes;

     if (debug)
       printf(
           "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
           "bytes) outstanding.\n",
           (long long)begin->bytes,
           (long long)cached_blocks.size(),
           (long long)cached_bytes.free,
           (long long)live_blocks.size(),
           (long long)cached_bytes.live);

     cached_blocks.erase(begin);
   }

   mutex_locker.unlock();

   // Attempt to revert back to entry-point device if necessary.
   // NOTE(review): this assignment overwrites any error captured by a break
   // in the loop above, so a cleanup failure can be masked by a successful
   // cudaSetDevice — confirm whether callers rely on this (upstream CUB
   // behaves the same way).
   if (entrypoint_device != INVALID_DEVICE_ORDINAL) {
     cudaCheck(error = cudaSetDevice(entrypoint_device));
   }

   return error;
 }
633 
638  if (!skip_cleanup)
639  FreeAllCached();
640  }
641  };
642  // end group UtilMgmt
644 
645 } // namespace notcub
646 
647 #endif
bool(* Compare)(const BlockDescriptor &, const BlockDescriptor &)
BlockDescriptor comparator function interface.
CachingHostAllocator(bool skip_cleanup=false, bool debug=false)
Default constructor.
static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
static unsigned int IntPow(unsigned int base, unsigned int exp)
static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
CUB namespace.
void SetMaxCachedBytes(size_t max_cached_bytes)
Sets the limit on the number of bytes this allocator is allowed to cache.
static std::mutex mutex
Definition: Proxy.cc:8
void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value)
CachedBlocks cached_blocks
Aggregate cached bytes.
std::multiset< BlockDescriptor, Compare > CachedBlocks
Set type for cached blocks (ordered by size)
size_t min_bin_bytes
Maximum bin enumeration.
static const size_t INVALID_SIZE
Invalid size.
CachingHostAllocator(unsigned int bin_growth, unsigned int min_bin=1, unsigned int max_bin=INVALID_BIN, size_t max_cached_bytes=INVALID_SIZE, bool skip_cleanup=false, bool debug=false)
Set of live pinned host allocations currently in use.
unsigned int max_bin
Minimum bin enumeration.
size_t max_cached_bytes
Maximum bin size.
TotalBytes cached_bytes
Whether or not to print (de)allocation events to stdout.
Definition: value.py:1
size_t max_bin_bytes
Minimum bin size.
static const unsigned int INVALID_BIN
Out-of-bounds bin.
constexpr unsigned int power(unsigned int base, unsigned int exponent)
unsigned int min_bin
Geometric growth factor for bin-sizes.
cudaError_t FreeAllCached()
Frees all cached pinned host allocations.
unsigned int bin_growth
Mutex for thread-safety.
static const int INVALID_DEVICE_ORDINAL
Invalid device ordinal.
double b
Definition: hdecay.h:120
bool debug
Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may hav...
BusyBlocks live_blocks
Set of cached pinned host allocations available for reuse.
double a
Definition: hdecay.h:121
cudaError_t HostFree(void *d_ptr)
Frees a live allocation of pinned host memory, returning it to the allocator.
const bool skip_cleanup
Maximum aggregate cached bytes.
cudaError_t HostAllocate(void **d_ptr, size_t bytes, cudaStream_t active_stream=nullptr)
Provides a suitable allocation of pinned host memory for the given size.
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:69
A simple caching allocator for pinned host memory allocations.
std::multiset< BlockDescriptor, Compare > BusyBlocks
Set type for live blocks (ordered by ptr)