CMS 3D CMS Logo

CachingDeviceAllocator.h
Go to the documentation of this file.
1 #ifndef HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h
2 #define HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h
3 
4 /******************************************************************************
5  * Copyright (c) 2011, Duane Merrill. All rights reserved.
6  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  * * Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  * * Redistributions in binary form must reproduce the above copyright
13  * notice, this list of conditions and the following disclaimer in the
14  * documentation and/or other materials provided with the distribution.
15  * * Neither the name of the NVIDIA CORPORATION nor the
16  * names of its contributors may be used to endorse or promote products
17  * derived from this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
23  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *
30  ******************************************************************************/
31 
36 /******************************************************************************
37  * Simple caching allocator for device memory allocations. The allocator is
38  * thread-safe and capable of managing device allocations on multiple devices.
39  ******************************************************************************/
40 
41 #include <cmath>
42 #include <map>
43 #include <set>
44 #include <mutex>
45 
48 
50 namespace notcub {
51 
57  /******************************************************************************
58  * CachingDeviceAllocator (host use)
59  ******************************************************************************/
60 
101  //---------------------------------------------------------------------
102  // Constants
103  //---------------------------------------------------------------------
104 
106  static const unsigned int INVALID_BIN = (unsigned int)-1;
107 
109  static const size_t INVALID_SIZE = (size_t)-1;
110 
111 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
112 
114  static const int INVALID_DEVICE_ORDINAL = -1;
115 
116  //---------------------------------------------------------------------
117  // Type definitions and helper types
118  //---------------------------------------------------------------------
119 
124  void *d_ptr; // Device pointer
125  size_t bytes; // Size of allocation in bytes
126  size_t bytesRequested; // CMS: requested allocatoin size (for monitoring only)
127  unsigned int bin; // Bin enumeration
128  int device; // device ordinal
129  cudaStream_t associated_stream; // Associated associated_stream
130  cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed
131 
132  // Constructor (suitable for searching maps for a specific block, given its pointer and device)
134  : d_ptr(d_ptr),
135  bytes(0),
136  bytesRequested(0), // CMS
137  bin(INVALID_BIN),
138  device(device),
139  associated_stream(nullptr),
140  ready_event(nullptr) {}
141 
142  // Constructor (suitable for searching maps for a range of suitable blocks, given a device)
144  : d_ptr(nullptr),
145  bytes(0),
146  bytesRequested(0), // CMS
147  bin(INVALID_BIN),
148  device(device),
149  associated_stream(nullptr),
150  ready_event(nullptr) {}
151 
152  // Comparison functor for comparing device pointers
153  static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) {
154  if (a.device == b.device)
155  return (a.d_ptr < b.d_ptr);
156  else
157  return (a.device < b.device);
158  }
159 
160  // Comparison functor for comparing allocation sizes
161  static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) {
162  if (a.device == b.device)
163  return (a.bytes < b.bytes);
164  else
165  return (a.device < b.device);
166  }
167  };
168 
170  typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
171 
172  // CMS: Moved TotalBytes to deviceAllocatorStatus.h
173 
175  typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
176 
178  typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
179 
181  // CMS: Moved definition to deviceAllocatorStatus.h
183 
184  //---------------------------------------------------------------------
185  // Utility functions
186  //---------------------------------------------------------------------
187 
191  static unsigned int IntPow(unsigned int base, unsigned int exp) {
192  unsigned int retval = 1;
193  while (exp > 0) {
194  if (exp & 1) {
195  retval = retval * base; // multiply the result by the current base
196  }
197  base = base * base; // square the base
198  exp = exp >> 1; // divide the exponent in half
199  }
200  return retval;
201  }
202 
206  void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) {
207  power = 0;
208  rounded_bytes = 1;
209 
210  if (value * base < value) {
211  // Overflow
212  power = sizeof(size_t) * 8;
213  rounded_bytes = size_t(0) - 1;
214  return;
215  }
216 
217  while (rounded_bytes < value) {
218  rounded_bytes *= base;
219  power++;
220  }
221  }
222 
223  //---------------------------------------------------------------------
224  // Fields
225  //---------------------------------------------------------------------
226 
227  // CMS: use std::mutex instead of cub::Mutex, declare mutable
228  mutable std::mutex mutex;
229 
230  unsigned int bin_growth;
231  unsigned int min_bin;
232  unsigned int max_bin;
233 
234  size_t min_bin_bytes;
235  size_t max_bin_bytes;
237 
238  const bool
240  bool debug;
241 
245 
246 #endif // DOXYGEN_SHOULD_SKIP_THIS
247 
248  //---------------------------------------------------------------------
249  // Methods
250  //---------------------------------------------------------------------
251 
256  unsigned int bin_growth,
257  unsigned int min_bin = 1,
258  unsigned int max_bin = INVALID_BIN,
259  size_t max_cached_bytes = INVALID_SIZE,
260  bool skip_cleanup =
261  false,
262  bool debug = false)
264  min_bin(min_bin),
265  max_bin(max_bin),
270  debug(debug),
271  cached_blocks(BlockDescriptor::SizeCompare),
272  live_blocks(BlockDescriptor::PtrCompare) {}
273 
287  CachingDeviceAllocator(bool skip_cleanup = false, bool debug = false)
288  : bin_growth(8),
289  min_bin(3),
290  max_bin(7),
293  max_cached_bytes((max_bin_bytes * 3) - 1),
295  debug(debug),
296  cached_blocks(BlockDescriptor::SizeCompare),
297  live_blocks(BlockDescriptor::PtrCompare) {}
298 
305  cudaError_t SetMaxCachedBytes(size_t max_cached_bytes) {
306  // Lock
307  // CMS: use RAII instead of (un)locking explicitly
308  std::unique_lock mutex_locker(mutex);
309 
310  if (debug)
311  // CMS: use raw printf
312  printf("Changing max_cached_bytes (%lld -> %lld)\n",
313  (long long)this->max_cached_bytes,
314  (long long)max_cached_bytes);
315 
316  this->max_cached_bytes = max_cached_bytes;
317 
318  // Unlock (redundant, kept for style uniformity)
319  mutex_locker.unlock();
320 
321  return cudaSuccess;
322  }
323 
331  cudaError_t DeviceAllocate(
332  int device,
333  void **d_ptr,
334  size_t bytes,
335  cudaStream_t active_stream = nullptr)
336  {
337  // CMS: use RAII instead of (un)locking explicitly
338  std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);
339  *d_ptr = nullptr;
340  int entrypoint_device = INVALID_DEVICE_ORDINAL;
341  cudaError_t error = cudaSuccess;
342 
343  if (device == INVALID_DEVICE_ORDINAL) {
344  // CMS: throw exception on error
345  cudaCheck(error = cudaGetDevice(&entrypoint_device));
346  device = entrypoint_device;
347  }
348 
349  // Create a block descriptor for the requested allocation
350  bool found = false;
351  BlockDescriptor search_key(device);
352  search_key.bytesRequested = bytes; // CMS
353  search_key.associated_stream = active_stream;
354  NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
355 
356  if (search_key.bin > max_bin) {
357  // Bin is greater than our maximum bin: allocate the request
358  // exactly and give out-of-bounds bin. It will not be cached
359  // for reuse when returned.
360  search_key.bin = INVALID_BIN;
361  search_key.bytes = bytes;
362  } else {
363  // Search for a suitable cached allocation: lock
364  mutex_locker.lock();
365 
366  if (search_key.bin < min_bin) {
367  // Bin is less than minimum bin: round up
368  search_key.bin = min_bin;
369  search_key.bytes = min_bin_bytes;
370  }
371 
372  // Iterate through the range of cached blocks on the same device in the same bin
373  CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
374  while ((block_itr != cached_blocks.end()) && (block_itr->device == device) &&
375  (block_itr->bin == search_key.bin)) {
376  // To prevent races with reusing blocks returned by the host but still
377  // in use by the device, only consider cached blocks that are
378  // either (from the active stream) or (from an idle stream)
379  if ((active_stream == block_itr->associated_stream) ||
380  (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
381  // Reuse existing cache block. Insert into live blocks.
382  found = true;
383  search_key = *block_itr;
384  search_key.associated_stream = active_stream;
385  live_blocks.insert(search_key);
386 
387  // Remove from free blocks
388  cached_bytes[device].free -= search_key.bytes;
389  cached_bytes[device].live += search_key.bytes;
390  cached_bytes[device].liveRequested += search_key.bytesRequested; // CMS
391 
392  if (debug)
393  // CMS: improved debug message
394  // CMS: use raw printf
395  printf(
396  "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously "
397  "associated with stream %lld, event %lld).\n",
398  device,
399  search_key.d_ptr,
400  (long long)search_key.bytes,
401  (long long)search_key.associated_stream,
402  (long long)search_key.ready_event,
403  (long long)block_itr->associated_stream,
404  (long long)block_itr->ready_event);
405 
406  cached_blocks.erase(block_itr);
407 
408  break;
409  }
410  block_itr++;
411  }
412 
413  // Done searching: unlock
414  mutex_locker.unlock();
415  }
416 
417  // Allocate the block if necessary
418  if (!found) {
419  // Set runtime's current device to specified device (entrypoint may not be set)
420  if (device != entrypoint_device) {
421  // CMS: throw exception on error
422  cudaCheck(error = cudaGetDevice(&entrypoint_device));
423  cudaCheck(error = cudaSetDevice(device));
424  }
425 
426  // Attempt to allocate
427  // CMS: silently ignore errors and retry or pass them to the caller
428  if ((error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) {
429  // The allocation attempt failed: free all cached blocks on device and retry
430  if (debug)
431  // CMS: use raw printf
432  printf(
433  "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
434  device,
435  (long long)search_key.bytes,
436  (long long)search_key.associated_stream);
437 
438  error = cudaSuccess; // Reset the error we will return
439  cudaGetLastError(); // Reset CUDART's error
440 
441  // Lock
442  mutex_locker.lock();
443 
444  // Iterate the range of free blocks on the same device
445  BlockDescriptor free_key(device);
446  CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);
447 
448  while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) {
449  // No need to worry about synchronization with the device: cudaFree is
450  // blocking and will synchronize across all kernels executing
451  // on the current device
452 
453  // Free device memory and destroy stream event.
454  // CMS: silently ignore errors and pass them to the caller
455  if ((error = cudaFree(block_itr->d_ptr)))
456  break;
457  if ((error = cudaEventDestroy(block_itr->ready_event)))
458  break;
459 
460  // Reduce balance and erase entry
461  cached_bytes[device].free -= block_itr->bytes;
462 
463  if (debug)
464  // CMS: use raw printf
465  printf(
466  "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks "
467  "(%lld bytes) outstanding.\n",
468  device,
469  (long long)block_itr->bytes,
470  (long long)cached_blocks.size(),
471  (long long)cached_bytes[device].free,
472  (long long)live_blocks.size(),
473  (long long)cached_bytes[device].live);
474 
475  cached_blocks.erase(block_itr);
476 
477  block_itr++;
478  }
479 
480  // Unlock
481  mutex_locker.unlock();
482 
483  // Return under error
484  if (error)
485  return error;
486 
487  // Try to allocate again
488  // CMS: throw exception on error
489  cudaCheck(error = cudaMalloc(&search_key.d_ptr, search_key.bytes));
490  }
491 
492  // Create ready event
493  // CMS: throw exception on error
494  cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
495 
496  // Insert into live blocks
497  mutex_locker.lock();
498  live_blocks.insert(search_key);
499  cached_bytes[device].live += search_key.bytes;
500  cached_bytes[device].liveRequested += search_key.bytesRequested; // CMS
501  mutex_locker.unlock();
502 
503  if (debug)
504  // CMS: improved debug message
505  // CMS: use raw printf
506  printf("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n",
507  device,
508  search_key.d_ptr,
509  (long long)search_key.bytes,
510  (long long)search_key.associated_stream,
511  (long long)search_key.ready_event);
512 
513  // Attempt to revert back to previous device if necessary
514  if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) {
515  // CMS: throw exception on error
516  cudaCheck(error = cudaSetDevice(entrypoint_device));
517  }
518  }
519 
520  // Copy device pointer to output parameter
521  *d_ptr = search_key.d_ptr;
522 
523  if (debug)
524  // CMS: use raw printf
525  printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
526  (long long)cached_blocks.size(),
527  (long long)cached_bytes[device].free,
528  (long long)live_blocks.size(),
529  (long long)cached_bytes[device].live);
530 
531  return error;
532  }
533 
541  cudaError_t DeviceAllocate(
542  void **d_ptr,
543  size_t bytes,
544  cudaStream_t active_stream = nullptr)
545  {
546  return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
547  }
548 
556  cudaError_t DeviceFree(int device, void *d_ptr) {
557  int entrypoint_device = INVALID_DEVICE_ORDINAL;
558  cudaError_t error = cudaSuccess;
559  // CMS: use RAII instead of (un)locking explicitly
560  std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);
561 
562  if (device == INVALID_DEVICE_ORDINAL) {
563  // CMS: throw exception on error
564  cudaCheck(error = cudaGetDevice(&entrypoint_device));
565  device = entrypoint_device;
566  }
567 
568  // Lock
569  mutex_locker.lock();
570 
571  // Find corresponding block descriptor
572  bool recached = false;
573  BlockDescriptor search_key(d_ptr, device);
574  BusyBlocks::iterator block_itr = live_blocks.find(search_key);
575  if (block_itr != live_blocks.end()) {
576  // Remove from live blocks
577  search_key = *block_itr;
578  live_blocks.erase(block_itr);
579  cached_bytes[device].live -= search_key.bytes;
580  cached_bytes[device].liveRequested -= search_key.bytesRequested; // CMS
581 
582  // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
583  if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) {
584  // Insert returned allocation into free blocks
585  recached = true;
586  cached_blocks.insert(search_key);
587  cached_bytes[device].free += search_key.bytes;
588 
589  if (debug)
590  // CMS: improved debug message
591  // CMS: use raw printf
592  printf(
593  "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
594  "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
595  device,
596  (long long)search_key.bytes,
597  d_ptr,
598  (long long)search_key.associated_stream,
599  (long long)search_key.ready_event,
600  (long long)cached_blocks.size(),
601  (long long)cached_bytes[device].free,
602  (long long)live_blocks.size(),
603  (long long)cached_bytes[device].live);
604  }
605  }
606 
607  // First set to specified device (entrypoint may not be set)
608  if (device != entrypoint_device) {
609  // CMS: throw exception on error
610  cudaCheck(error = cudaGetDevice(&entrypoint_device));
611  cudaCheck(error = cudaSetDevice(device));
612  }
613 
614  if (recached) {
615  // Insert the ready event in the associated stream (must have current device set properly)
616  // CMS: throw exception on error
617  cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream));
618  }
619 
620  // Unlock
621  mutex_locker.unlock();
622 
623  if (!recached) {
624  // Free the allocation from the runtime and cleanup the event.
625  // CMS: throw exception on error
626  cudaCheck(error = cudaFree(d_ptr));
627  cudaCheck(error = cudaEventDestroy(search_key.ready_event));
628 
629  if (debug)
630  // CMS: improved debug message
631  printf(
632  "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
633  "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
634  device,
635  (long long)search_key.bytes,
636  d_ptr,
637  (long long)search_key.associated_stream,
638  (long long)search_key.ready_event,
639  (long long)cached_blocks.size(),
640  (long long)cached_bytes[device].free,
641  (long long)live_blocks.size(),
642  (long long)cached_bytes[device].live);
643  }
644 
645  // Reset device
646  if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) {
647  // CMS: throw exception on error
648  cudaCheck(error = cudaSetDevice(entrypoint_device));
649  }
650 
651  return error;
652  }
653 
661  cudaError_t DeviceFree(void *d_ptr) { return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); }
662 
666  cudaError_t FreeAllCached() {
667  cudaError_t error = cudaSuccess;
668  int entrypoint_device = INVALID_DEVICE_ORDINAL;
669  int current_device = INVALID_DEVICE_ORDINAL;
670  // CMS: use RAII instead of (un)locking explicitly
671  std::unique_lock<std::mutex> mutex_locker(mutex);
672 
673  while (!cached_blocks.empty()) {
674  // Get first block
675  CachedBlocks::iterator begin = cached_blocks.begin();
676 
677  // Get entry-point device ordinal if necessary
678  if (entrypoint_device == INVALID_DEVICE_ORDINAL) {
679  // CMS: silently ignore errors and pass them to the caller
680  if ((error = cudaGetDevice(&entrypoint_device)))
681  break;
682  }
683 
684  // Set current device ordinal if necessary
685  if (begin->device != current_device) {
686  // CMS: silently ignore errors and pass them to the caller
687  if ((error = cudaSetDevice(begin->device)))
688  break;
689  current_device = begin->device;
690  }
691 
692  // Free device memory
693  // CMS: silently ignore errors and pass them to the caller
694  if ((error = cudaFree(begin->d_ptr)))
695  break;
696  if ((error = cudaEventDestroy(begin->ready_event)))
697  break;
698 
699  // Reduce balance and erase entry
700  cached_bytes[current_device].free -= begin->bytes;
701 
702  if (debug)
703  printf(
704  "\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
705  "bytes) outstanding.\n",
706  current_device,
707  (long long)begin->bytes,
708  (long long)cached_blocks.size(),
709  (long long)cached_bytes[current_device].free,
710  (long long)live_blocks.size(),
711  (long long)cached_bytes[current_device].live);
712 
713  cached_blocks.erase(begin);
714  }
715 
716  mutex_locker.unlock();
717 
718  // Attempt to revert back to entry-point device if necessary
719  if (entrypoint_device != INVALID_DEVICE_ORDINAL) {
720  // CMS: throw exception on error
721  cudaCheck(error = cudaSetDevice(entrypoint_device));
722  }
723 
724  return error;
725  }
726 
727  // CMS: give access to cache allocation status
729  std::unique_lock mutex_locker(mutex);
730  return cached_bytes;
731  }
732 
736  // CMS: make the destructor not virtual
738  if (!skip_cleanup)
739  FreeAllCached();
740  }
741  };
742  // end group UtilMgmt
744 
745 } // namespace notcub
746 
747 #endif
CachedBlocks cached_blocks
Map of device ordinal to aggregate cached bytes on that device.
static const unsigned int INVALID_BIN
Out-of-bounds bin.
CachingDeviceAllocator(bool skip_cleanup=false, bool debug=false)
Default constructor.
CUB namespace.
A simple caching allocator for device memory allocations.
static std::mutex mutex
Definition: Proxy.cc:8
BusyBlocks live_blocks
Set of cached device allocations available for reuse.
cudaError_t DeviceFree(void *d_ptr)
Frees a live allocation of device memory on the current device, returning it to the allocator...
cms::cuda::allocator::GpuCachedBytes GpuCachedBytes
Map type of device ordinals to the number of cached bytes cached by each device.
cudaError_t DeviceFree(int device, void *d_ptr)
Frees a live allocation of device memory on the specified device, returning it to the allocator...
void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value)
bool(* Compare)(const BlockDescriptor &, const BlockDescriptor &)
BlockDescriptor comparator function interface.
static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
std::multiset< BlockDescriptor, Compare > BusyBlocks
Set type for live blocks (ordered by ptr)
void free(void *ptr) noexcept
static unsigned int IntPow(unsigned int base, unsigned int exp)
static const size_t INVALID_SIZE
Invalid size.
static const int INVALID_DEVICE_ORDINAL
Invalid device ordinal.
const bool skip_cleanup
Maximum aggregate cached bytes per device.
unsigned int bin_growth
Mutex for thread-safety.
cudaError_t DeviceAllocate(int device, void **d_ptr, size_t bytes, cudaStream_t active_stream=nullptr)
Provides a suitable allocation of device memory for the given size on the specified device...
Definition: value.py:1
static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
unsigned int max_bin
Minimum bin enumeration.
constexpr unsigned int power(unsigned int base, unsigned int exponent)
size_t max_bin_bytes
Minimum bin size.
CachingDeviceAllocator(unsigned int bin_growth, unsigned int min_bin=1, unsigned int max_bin=INVALID_BIN, size_t max_cached_bytes=INVALID_SIZE, bool skip_cleanup=false, bool debug=false)
Set of live device allocations currently in use.
double b
Definition: hdecay.h:120
std::multiset< BlockDescriptor, Compare > CachedBlocks
Set type for cached blocks (ordered by size)
cudaError_t DeviceAllocate(void **d_ptr, size_t bytes, cudaStream_t active_stream=nullptr)
Provides a suitable allocation of device memory for the given size on the current device...
cudaError_t FreeAllCached()
Frees all cached device allocations on all devices.
GpuCachedBytes cached_bytes
Whether or not to print (de)allocation events to stdout.
double a
Definition: hdecay.h:121
size_t max_cached_bytes
Maximum bin size.
cudaError_t SetMaxCachedBytes(size_t max_cached_bytes)
Sets the limit on the number bytes this allocator is allowed to cache per device. ...
unsigned int min_bin
Geometric growth factor for bin-sizes.
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:69
bool debug
Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may hav...
std::map< int, TotalBytes > GpuCachedBytes
Map type of device ordinals to the number of cached bytes cached by each device.
size_t min_bin_bytes
Maximum bin enumeration.