dd/dad/CachingDeviceAllocator_8h_source.html

 #ifndef HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h
 #define HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h

 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
  *       notice, this list of conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright
  *       notice, this list of conditions and the following disclaimer in the
  *       documentation and/or other materials provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  ******************************************************************************/

 /******************************************************************************
  * Simple caching allocator for device memory allocations. The allocator is
  * thread-safe and capable of managing device allocations on multiple devices.
  ******************************************************************************/

 #include <cmath>
 #include <map>
 #include <set>
 #include <mutex>

 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h"

 namespace notcub {

   /******************************************************************************
  * CachingDeviceAllocator (host use)
  ******************************************************************************/

   struct CachingDeviceAllocator {
     //---------------------------------------------------------------------
     // Constants
     //---------------------------------------------------------------------

     // Out-of-bounds bin. DeviceAllocate() assigns it to blocks whose bin exceeds max_bin,
     // and DeviceFree() never returns such blocks to the cache.
     static const unsigned int INVALID_BIN = (unsigned int)-1;

     // Invalid size. Used as the default max_cached_bytes of the configurable constructor.
     static const size_t INVALID_SIZE = (size_t)-1;

 #ifndef DOXYGEN_SHOULD_SKIP_THIS  // Do not document

     // Invalid device ordinal. Passing it as `device` to DeviceAllocate()/DeviceFree()
     // makes them query the current device with cudaGetDevice().
     static const int INVALID_DEVICE_ORDINAL = -1;

     //---------------------------------------------------------------------
     // Type definitions and helper types
     //---------------------------------------------------------------------

     // Bookkeeping record describing one device allocation tracked by the allocator.
     // The same type doubles as a search key for the cached/live block sets.
     struct BlockDescriptor {
       void *d_ptr;                     // Device pointer
       size_t bytes;                    // Size of the allocation in bytes
       size_t bytesRequested;           // CMS: requested allocation size (for monitoring only)
       unsigned int bin;                // Bin enumeration
       int device;                      // Device ordinal
       cudaStream_t associated_stream;  // Stream this block is associated with
       // Event used to signal when the associated stream has run to the point at which this block was freed
       cudaEvent_t ready_event;

       // Key for looking up one specific block, given its pointer and device.
       // Every other field starts out empty (zero size, INVALID_BIN, null stream and event).
       BlockDescriptor(void *d_ptr, int device)
           : d_ptr(d_ptr),
             bytes(0),
             bytesRequested(0),  // CMS
             bin(INVALID_BIN),
             device(device),
             associated_stream(nullptr),
             ready_event(nullptr) {}

       // Key for searching the range of blocks that belong to a device: same as the
       // constructor above with a null device pointer.
       BlockDescriptor(int device) : BlockDescriptor(nullptr, device) {}

       // Strict weak ordering by device ordinal first, then by device pointer.
       static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) {
         if (a.device != b.device)
           return a.device < b.device;
         return a.d_ptr < b.d_ptr;
       }

       // Strict weak ordering by device ordinal first, then by allocation size.
       static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) {
         if (a.device != b.device)
           return a.device < b.device;
         return a.bytes < b.bytes;
       }
     };

     // BlockDescriptor comparator function interface
     using Compare = bool (*)(const BlockDescriptor &, const BlockDescriptor &);

     // CMS: Moved TotalBytes to deviceAllocatorStatus.h

     // Set type for cached blocks (cached_blocks is constructed with SizeCompare)
     using CachedBlocks = std::multiset<BlockDescriptor, Compare>;

     // Set type for live blocks (live_blocks is constructed with PtrCompare)
     using BusyBlocks = std::multiset<BlockDescriptor, Compare>;

     // Map type of device ordinals to the per-device byte counters
     // CMS: Moved definition to deviceAllocatorStatus.h
     using GpuCachedBytes = cms::cuda::allocator::GpuCachedBytes;

     //---------------------------------------------------------------------
     // Utility functions
     //---------------------------------------------------------------------

     // Integer power by repeated squaring: returns base raised to exp.
     // All arithmetic is done in unsigned int, so results that do not fit wrap around.
     static unsigned int IntPow(unsigned int base, unsigned int exp) {
       unsigned int result = 1;
       for (; exp != 0; exp >>= 1) {
         // Fold the current power of the base into the result when the low bit is set
         if ((exp & 1u) != 0) {
           result *= base;
         }
         // Advance to the next power-of-two exponent of the base
         base *= base;
       }
       return result;
     }

     // Rounds `value` up to the nearest power of `base`.
     //   power          [out] exponent of the selected power
     //   rounded_bytes  [out] base raised to `power` (the smallest such power that is >= value)
     //   base           growth factor
     //   value          quantity to round up
     // When `value * base` wraps around, the outputs are instead set to the bit width of
     // size_t and to the largest size_t value.
     void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) {
       const size_t scaled = value * base;
       if (scaled < value) {
         // Overflow
         power = sizeof(size_t) * 8;
         rounded_bytes = size_t(0) - 1;
         return;
       }

       unsigned int exponent = 0;
       size_t candidate = 1;
       for (; candidate < value; ++exponent) {
         candidate *= base;
       }

       power = exponent;
       rounded_bytes = candidate;
     }

     //---------------------------------------------------------------------
     // Fields
     //---------------------------------------------------------------------

     // CMS: use std::mutex instead of cub::Mutex, declare mutable
     // Mutex guarding cached_bytes, cached_blocks, live_blocks and max_cached_bytes.
     // CMS: use std::mutex instead of cub::Mutex, declare mutable
     mutable std::mutex mutex;

     // Geometric growth factor for bin sizes
     unsigned int bin_growth;
     // Minimum bin enumeration (smaller requests are rounded up to this bin)
     unsigned int min_bin;
     // Maximum bin enumeration (larger requests are allocated exactly and never cached)
     unsigned int max_bin;

     // Minimum bin size, IntPow(bin_growth, min_bin)
     size_t min_bin_bytes;
     // Maximum bin size, IntPow(bin_growth, max_bin)
     size_t max_bin_bytes;
     // Maximum aggregate cached (free) bytes per device
     size_t max_cached_bytes;

     // Whether the destructor skips the call to FreeAllCached()
     const bool
         skip_cleanup;
     // Whether to print (de)allocation events to stdout
     bool debug;

     // Map of device ordinal to the byte counters (free, live, liveRequested) of that device
     GpuCachedBytes cached_bytes;
     // Set of cached device allocations available for reuse
     CachedBlocks cached_blocks;
     // Set of live device allocations currently in use
     BusyBlocks live_blocks;

 #endif  // DOXYGEN_SHOULD_SKIP_THIS

     //---------------------------------------------------------------------
     // Methods
     //---------------------------------------------------------------------

     // Constructor with an explicit bin configuration.
     //   bin_growth        geometric growth factor for bin sizes
     //   min_bin           minimum bin enumeration (default 1)
     //   max_bin           maximum bin enumeration (default INVALID_BIN)
     //   max_cached_bytes  maximum aggregate cached bytes per device (default INVALID_SIZE)
     //   skip_cleanup      when true, the destructor does not call FreeAllCached()
     //   debug             when true, (de)allocation events are printed with printf
     // The bin byte sizes are derived with IntPow(), which computes in unsigned int, so
     // a large exponent such as the default max_bin makes max_bin_bytes wrap around.
     CachingDeviceAllocator(
         unsigned int bin_growth,
         unsigned int min_bin = 1,
         unsigned int max_bin = INVALID_BIN,
         size_t max_cached_bytes = INVALID_SIZE,
         bool skip_cleanup =
             false,
         bool debug = false)
         : bin_growth(bin_growth),
           min_bin(min_bin),
           max_bin(max_bin),
           // Inside the initializer list these names refer to the constructor parameters
           min_bin_bytes(IntPow(bin_growth, min_bin)),
           max_bin_bytes(IntPow(bin_growth, max_bin)),
           max_cached_bytes(max_cached_bytes),
           skip_cleanup(skip_cleanup),
           debug(debug),
           // Cached blocks are ordered by (device, size), live blocks by (device, pointer)
           cached_blocks(BlockDescriptor::SizeCompare),
           live_blocks(BlockDescriptor::PtrCompare) {}

     // Default constructor.
     // Configures bin_growth = 8, min_bin = 3, max_bin = 7, i.e. bins from
     // 8^3 = 512 bytes to 8^7 = 2097152 bytes, and caches at most
     // (3 * max_bin_bytes) - 1 = 6291455 bytes per device.
     //   skip_cleanup  when true, the destructor does not call FreeAllCached()
     //   debug         when true, (de)allocation events are printed with printf
     CachingDeviceAllocator(bool skip_cleanup = false, bool debug = false)
         : bin_growth(8),
           min_bin(3),
           max_bin(7),
           // These read the members initialized just above, so this relies on the
           // members being declared in this same order.
           min_bin_bytes(IntPow(bin_growth, min_bin)),
           max_bin_bytes(IntPow(bin_growth, max_bin)),
           max_cached_bytes((max_bin_bytes * 3) - 1),
           skip_cleanup(skip_cleanup),
           debug(debug),
           // Cached blocks are ordered by (device, size), live blocks by (device, pointer)
           cached_blocks(BlockDescriptor::SizeCompare),
           live_blocks(BlockDescriptor::PtrCompare) {}

     // Sets the limit on the number of bytes this allocator is allowed to cache per device.
     // The new limit is stored under the allocator mutex; blocks that are already cached
     // are not touched here. Always returns cudaSuccess.
     cudaError_t SetMaxCachedBytes(size_t max_cached_bytes) {
       {
         // CMS: use RAII instead of (un)locking explicitly
         std::lock_guard<std::mutex> guard(mutex);

         if (debug) {
           // CMS: use raw printf
           printf("Changing max_cached_bytes (%lld -> %lld)\n",
                  static_cast<long long>(this->max_cached_bytes),
                  static_cast<long long>(max_cached_bytes));
         }

         this->max_cached_bytes = max_cached_bytes;
       }  // mutex released here

       return cudaSuccess;
     }

     // Provides a suitable allocation of device memory for the given size on the specified device.
     //
     // The request is rounded up to the nearest power of bin_growth (at least min_bin_bytes).
     // A cached block of the same device and bin is reused when it is associated with
     // active_stream, or when cudaEventQuery() on its ready event does not report
     // cudaErrorNotReady. Otherwise a new block is obtained with cudaMalloc(); if that fails
     // with cudaErrorMemoryAllocation, all cached blocks of the device are freed and the
     // allocation is retried once. Requests whose bin exceeds max_bin are allocated with
     // their exact size and marked with INVALID_BIN, so they are not cached when freed.
     //
     //   device         device ordinal; INVALID_DEVICE_ORDINAL selects the current device
     //   d_ptr          [out] receives the device pointer (reset to nullptr on entry)
     //   bytes          minimum number of bytes requested
     //   active_stream  stream the returned block is associated with
     //
     // Returns cudaSuccess, or the error reported by cudaFree()/cudaEventDestroy() while
     // releasing cached blocks for the retry. Calls wrapped in cudaCheck() report failures
     // through that macro instead (per the CMS comments, by throwing).
     cudaError_t DeviceAllocate(
         int device,
         void **d_ptr,
         size_t bytes,
         cudaStream_t active_stream = nullptr)
     {
       // CMS: use RAII instead of (un)locking explicitly
       std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);
       *d_ptr = nullptr;
       int entrypoint_device = INVALID_DEVICE_ORDINAL;
       cudaError_t error = cudaSuccess;

       if (device == INVALID_DEVICE_ORDINAL) {
         // CMS: throw exception on error
         cudaCheck(error = cudaGetDevice(&entrypoint_device));
         device = entrypoint_device;
       }

       // Create a block descriptor for the requested allocation
       bool found = false;
       BlockDescriptor search_key(device);
       search_key.bytesRequested = bytes;  // CMS
       search_key.associated_stream = active_stream;
       NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);

       if (search_key.bin > max_bin) {
         // Bin is greater than our maximum bin: allocate the request
         // exactly and give out-of-bounds bin.  It will not be cached
         // for reuse when returned.
         search_key.bin = INVALID_BIN;
         search_key.bytes = bytes;
       } else {
         // Search for a suitable cached allocation: lock
         mutex_locker.lock();

         if (search_key.bin < min_bin) {
           // Bin is less than minimum bin: round up
           search_key.bin = min_bin;
           search_key.bytes = min_bin_bytes;
         }

         // Iterate through the range of cached blocks on the same device in the same bin
         CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
         while ((block_itr != cached_blocks.end()) && (block_itr->device == device) &&
                (block_itr->bin == search_key.bin)) {
           // To prevent races with reusing blocks returned by the host but still
           // in use by the device, only consider cached blocks that are
           // either (from the active stream) or (from an idle stream)
           if ((active_stream == block_itr->associated_stream) ||
               (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
             // Reuse existing cache block.  Insert into live blocks.
             found = true;
             search_key = *block_itr;
             search_key.associated_stream = active_stream;
             live_blocks.insert(search_key);

             // Remove from free blocks
             cached_bytes[device].free -= search_key.bytes;
             cached_bytes[device].live += search_key.bytes;
             cached_bytes[device].liveRequested += search_key.bytesRequested;  // CMS

             if (debug)
               // CMS: improved debug message
               // CMS: use raw printf
               printf(
                   "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously "
                   "associated with stream %lld, event %lld).\n",
                   device,
                   search_key.d_ptr,
                   (long long)search_key.bytes,
                   (long long)search_key.associated_stream,
                   (long long)search_key.ready_event,
                   (long long)block_itr->associated_stream,
                   (long long)block_itr->ready_event);

             cached_blocks.erase(block_itr);

             break;
           }
           block_itr++;
         }

         // Done searching: unlock
         mutex_locker.unlock();
       }

       // Allocate the block if necessary
       if (!found) {
         // Set runtime's current device to specified device (entrypoint may not be set)
         if (device != entrypoint_device) {
           // CMS: throw exception on error
           cudaCheck(error = cudaGetDevice(&entrypoint_device));
           cudaCheck(error = cudaSetDevice(device));
         }

         // Attempt to allocate
         // CMS: silently ignore errors and retry or pass them to the caller
         // NOTE(review): a first-attempt failure other than cudaErrorMemoryAllocation is not
         // handled here; `error` is overwritten by the cudaEventCreateWithFlags assignment
         // below and the block is still recorded as live - confirm this is intended.
         if ((error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) {
           // The allocation attempt failed: free all cached blocks on device and retry
           if (debug)
             // CMS: use raw printf
             // The trailing newline keeps this message from running into the next debug line
             printf(
                 "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations\n",
                 device,
                 (long long)search_key.bytes,
                 (long long)search_key.associated_stream);

           error = cudaSuccess;  // Reset the error we will return
           cudaGetLastError();   // Reset CUDART's error

           // Lock
           mutex_locker.lock();

           // Iterate the range of free blocks on the same device
           BlockDescriptor free_key(device);
           CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);

           while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) {
             // No need to worry about synchronization with the device: cudaFree is
             // blocking and will synchronize across all kernels executing
             // on the current device

             // Free device memory and destroy stream event.
             // CMS: silently ignore errors and pass them to the caller
             if ((error = cudaFree(block_itr->d_ptr)))
               break;
             if ((error = cudaEventDestroy(block_itr->ready_event)))
               break;

             // Reduce balance and erase entry
             cached_bytes[device].free -= block_itr->bytes;

             if (debug)
               // CMS: use raw printf
               printf(
                   "\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks "
                   "(%lld bytes) outstanding.\n",
                   device,
                   (long long)block_itr->bytes,
                   (long long)cached_blocks.size(),
                   (long long)cached_bytes[device].free,
                   (long long)live_blocks.size(),
                   (long long)cached_bytes[device].live);

             // erase() invalidates the erased iterator and returns the one following it;
             // incrementing the erased iterator afterwards would be undefined behaviour.
             block_itr = cached_blocks.erase(block_itr);
           }

           // Unlock
           mutex_locker.unlock();

           // Return under error
           if (error)
             return error;

           // Try to allocate again
           // CMS: throw exception on error
           cudaCheck(error = cudaMalloc(&search_key.d_ptr, search_key.bytes));
         }

         // Create ready event
         // CMS: throw exception on error
         cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));

         // Insert into live blocks
         mutex_locker.lock();
         live_blocks.insert(search_key);
         cached_bytes[device].live += search_key.bytes;
         cached_bytes[device].liveRequested += search_key.bytesRequested;  // CMS
         mutex_locker.unlock();

         if (debug)
           // CMS: improved debug message
           // CMS: use raw printf
           printf("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n",
                  device,
                  search_key.d_ptr,
                  (long long)search_key.bytes,
                  (long long)search_key.associated_stream,
                  (long long)search_key.ready_event);

         // Attempt to revert back to previous device if necessary
         if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) {
           // CMS: throw exception on error
           cudaCheck(error = cudaSetDevice(entrypoint_device));
         }
       }

       // Copy device pointer to output parameter
       *d_ptr = search_key.d_ptr;

       if (debug)
         // CMS: use raw printf
         // NOTE(review): this summary reads the shared containers without holding the mutex
         printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
                (long long)cached_blocks.size(),
                (long long)cached_bytes[device].free,
                (long long)live_blocks.size(),
                (long long)cached_bytes[device].live);

       return error;
     }

     // Provides a suitable allocation of device memory for the given size on the current device.
     // Forwards to the device-aware overload with INVALID_DEVICE_ORDINAL, which makes that
     // overload look up the current device itself.
     //   d_ptr          [out] receives the device pointer
     //   bytes          minimum number of bytes requested
     //   active_stream  stream the returned block is associated with
     cudaError_t DeviceAllocate(void **d_ptr, size_t bytes, cudaStream_t active_stream = nullptr) {
       const int use_current_device = INVALID_DEVICE_ORDINAL;
       return DeviceAllocate(use_current_device, d_ptr, bytes, active_stream);
     }

     // Frees a live allocation of device memory on the specified device, returning it to
     // the allocator.
     //
     // The block is removed from live_blocks. It is kept in the cache when its bin is valid
     // and caching it does not push the device's free bytes above max_cached_bytes; in that
     // case its ready event is recorded on the associated stream. Otherwise the memory is
     // released with cudaFree() and the event is destroyed.
     //
     //   device  device ordinal; INVALID_DEVICE_ORDINAL selects the current device
     //   d_ptr   device pointer previously handed out by DeviceAllocate()
     //
     // Returns the status of the last CUDA call made; failures of calls wrapped in
     // cudaCheck() are reported through that macro (per the CMS comments, by throwing).
     cudaError_t DeviceFree(int device, void *d_ptr) {
       int entrypoint_device = INVALID_DEVICE_ORDINAL;
       cudaError_t error = cudaSuccess;
       // CMS: use RAII instead of (un)locking explicitly
       std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);

       if (device == INVALID_DEVICE_ORDINAL) {
         // CMS: throw exception on error
         cudaCheck(error = cudaGetDevice(&entrypoint_device));
         device = entrypoint_device;
       }

       // Lock
       mutex_locker.lock();

       // Find corresponding block descriptor
       bool recached = false;
       BlockDescriptor search_key(d_ptr, device);
       BusyBlocks::iterator block_itr = live_blocks.find(search_key);
       if (block_itr != live_blocks.end()) {
         // Remove from live blocks
         search_key = *block_itr;
         live_blocks.erase(block_itr);
         cached_bytes[device].live -= search_key.bytes;
         cached_bytes[device].liveRequested -= search_key.bytesRequested;  // CMS

         // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
         if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) {
           // Insert returned allocation into free blocks
           recached = true;
           cached_blocks.insert(search_key);
           cached_bytes[device].free += search_key.bytes;

           if (debug)
             // CMS: improved debug message
             // CMS: use raw printf
             printf(
                 "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
                 "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
                 device,
                 (long long)search_key.bytes,
                 d_ptr,
                 (long long)search_key.associated_stream,
                 (long long)search_key.ready_event,
                 (long long)cached_blocks.size(),
                 (long long)cached_bytes[device].free,
                 (long long)live_blocks.size(),
                 (long long)cached_bytes[device].live);
         }
       }
       // NOTE(review): when d_ptr is not found in live_blocks, search_key keeps its
       // constructor defaults (zero bytes, null event) and the !recached branch below
       // still calls cudaFree(d_ptr) and cudaEventDestroy() on that null event - confirm
       // this is the intended handling of unknown pointers.

       // First set to specified device (entrypoint may not be set)
       if (device != entrypoint_device) {
         // CMS: throw exception on error
         cudaCheck(error = cudaGetDevice(&entrypoint_device));
         cudaCheck(error = cudaSetDevice(device));
       }

       if (recached) {
         // Insert the ready event in the associated stream (must have current device set properly)
         // The event is recorded while the mutex is still held, i.e. before the unlock below.
         // CMS: throw exception on error
         cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream));
       }

       // Unlock
       mutex_locker.unlock();

       if (!recached) {
         // Free the allocation from the runtime and cleanup the event.
         // CMS: throw exception on error
         cudaCheck(error = cudaFree(d_ptr));
         cudaCheck(error = cudaEventDestroy(search_key.ready_event));

         if (debug)
           // CMS: improved debug message
           // NOTE(review): this message reads the shared containers after the mutex was released
           printf(
               "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t  %lld available "
               "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
               device,
               (long long)search_key.bytes,
               d_ptr,
               (long long)search_key.associated_stream,
               (long long)search_key.ready_event,
               (long long)cached_blocks.size(),
               (long long)cached_bytes[device].free,
               (long long)live_blocks.size(),
               (long long)cached_bytes[device].live);
       }

       // Reset device
       if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) {
         // CMS: throw exception on error
         cudaCheck(error = cudaSetDevice(entrypoint_device));
       }

       return error;
     }

     // Frees a live allocation of device memory on the current device, returning it to the
     // allocator. Forwards to the device-aware overload with INVALID_DEVICE_ORDINAL, which
     // makes that overload look up the current device itself.
     cudaError_t DeviceFree(void *d_ptr) {
       const int use_current_device = INVALID_DEVICE_ORDINAL;
       return DeviceFree(use_current_device, d_ptr);
     }

     // Frees all cached device allocations on all devices.
     //
     // Walks cached_blocks from the front, switching the current device as needed,
     // releasing each block with cudaFree() and destroying its ready event. The loop stops
     // at the first CUDA call that fails. The device that was current on entry is restored
     // before returning.
     //
     // Returns cudaSuccess, or the error of the call that stopped the loop. A failure to
     // restore the entry-point device is reported through cudaCheck().
     cudaError_t FreeAllCached() {
       cudaError_t error = cudaSuccess;
       int entrypoint_device = INVALID_DEVICE_ORDINAL;
       int current_device = INVALID_DEVICE_ORDINAL;
       // CMS: use RAII instead of (un)locking explicitly
       std::unique_lock<std::mutex> mutex_locker(mutex);

       while (!cached_blocks.empty()) {
         // Get first block
         CachedBlocks::iterator begin = cached_blocks.begin();

         // Get entry-point device ordinal if necessary
         if (entrypoint_device == INVALID_DEVICE_ORDINAL) {
           // CMS: silently ignore errors and pass them to the caller
           if ((error = cudaGetDevice(&entrypoint_device)))
             break;
         }

         // Set current device ordinal if necessary
         if (begin->device != current_device) {
           // CMS: silently ignore errors and pass them to the caller
           if ((error = cudaSetDevice(begin->device)))
             break;
           current_device = begin->device;
         }

         // Free device memory
         // CMS: silently ignore errors and pass them to the caller
         if ((error = cudaFree(begin->d_ptr)))
           break;
         if ((error = cudaEventDestroy(begin->ready_event)))
           break;

         // Reduce balance and erase entry
         cached_bytes[current_device].free -= begin->bytes;

         if (debug)
           printf(
               "\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
               "bytes) outstanding.\n",
               current_device,
               (long long)begin->bytes,
               (long long)cached_blocks.size(),
               (long long)cached_bytes[current_device].free,
               (long long)live_blocks.size(),
               (long long)cached_bytes[current_device].live);

         cached_blocks.erase(begin);
       }

       mutex_locker.unlock();

       // Attempt to revert back to entry-point device if necessary
       if (entrypoint_device != INVALID_DEVICE_ORDINAL) {
         // CMS: throw exception on error
         // The result is deliberately not stored in `error`: a successful restore must not
         // overwrite the failure that stopped the loop above, so the caller still sees it.
         cudaCheck(cudaSetDevice(entrypoint_device));
       }

       return error;
     }

     // CMS: give access to cache allocation status
     // CMS: give access to cache allocation status
     // Returns a copy of the per-device byte counters, taken while holding the mutex.
     GpuCachedBytes CacheStatus() const {
       std::lock_guard<std::mutex> guard(mutex);
       GpuCachedBytes snapshot = cached_bytes;
       return snapshot;
     }

     // CMS: make the destructor not virtual
     // Destructor. Releases every cached block through FreeAllCached() unless the
     // allocator was constructed with skip_cleanup set.
     // CMS: make the destructor not virtual
     ~CachingDeviceAllocator() {
       if (skip_cleanup) {
         return;
       }
       FreeAllCached();
     }
   };
   // end group UtilMgmt

 }  // namespace notcub

 #endif
notcub::CachingDeviceAllocator::cached_blocks
CachedBlocks cached_blocks
Set of cached device allocations available for reuse.
Definition: CachingDeviceAllocator.h:243

notcub::CachingDeviceAllocator::INVALID_BIN
static const unsigned int INVALID_BIN
Out-of-bounds bin.
Definition: CachingDeviceAllocator.h:106

notcub::CachingDeviceAllocator::BlockDescriptor::ready_event
cudaEvent_t ready_event
Definition: CachingDeviceAllocator.h:130

notcub::CachingDeviceAllocator::BlockDescriptor::associated_stream
cudaStream_t associated_stream
Definition: CachingDeviceAllocator.h:129

relativeConstraints.error
error
Definition: relativeConstraints.py:53

notcub::CachingDeviceAllocator::CachingDeviceAllocator
CachingDeviceAllocator(bool skip_cleanup=false, bool debug=false)
Default constructor.
Definition: CachingDeviceAllocator.h:287

notcub
CUB namespace.
Definition: CachingDeviceAllocator.h:50

notcub::CachingDeviceAllocator
A simple caching allocator for device memory allocations.
Definition: CachingDeviceAllocator.h:100

mutex
static std::mutex mutex
Definition: Proxy.cc:8

newFWLiteAna.base
base
Main Program
Definition: newFWLiteAna.py:92

notcub::CachingDeviceAllocator::live_blocks
BusyBlocks live_blocks
Set of live device allocations currently in use.
Definition: CachingDeviceAllocator.h:244

notcub::CachingDeviceAllocator::BlockDescriptor::BlockDescriptor
BlockDescriptor(int device)
Definition: CachingDeviceAllocator.h:143

notcub::CachingDeviceAllocator::DeviceFree
cudaError_t DeviceFree(void *d_ptr)
Frees a live allocation of device memory on the current device, returning it to the allocator...
Definition: CachingDeviceAllocator.h:661

notcub::CachingDeviceAllocator::CacheStatus
GpuCachedBytes CacheStatus() const
Definition: CachingDeviceAllocator.h:728

notcub::CachingDeviceAllocator::GpuCachedBytes
cms::cuda::allocator::GpuCachedBytes GpuCachedBytes
Map type of device ordinals to the number of cached bytes cached by each device.
Definition: CachingDeviceAllocator.h:182

notcub::CachingDeviceAllocator::DeviceFree
cudaError_t DeviceFree(int device, void *d_ptr)
Frees a live allocation of device memory on the specified device, returning it to the allocator...
Definition: CachingDeviceAllocator.h:556

notcub::CachingDeviceAllocator::NearestPowerOf
void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value)
Definition: CachingDeviceAllocator.h:206

notcub::CachingDeviceAllocator::~CachingDeviceAllocator
~CachingDeviceAllocator()
Destructor.
Definition: CachingDeviceAllocator.h:737

notcub::CachingDeviceAllocator::Compare
bool(* Compare)(const BlockDescriptor &, const BlockDescriptor &)
BlockDescriptor comparator function interface.
Definition: CachingDeviceAllocator.h:170

notcub::CachingDeviceAllocator::BlockDescriptor::PtrCompare
static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
Definition: CachingDeviceAllocator.h:153

beam_dqm_sourceclient-live_cfg.live
live
Definition: beam_dqm_sourceclient-live_cfg.py:29

notcub::CachingDeviceAllocator::BlockDescriptor::bytesRequested
size_t bytesRequested
Definition: CachingDeviceAllocator.h:126

notcub::CachingDeviceAllocator::BusyBlocks
std::multiset< BlockDescriptor, Compare > BusyBlocks
Set type for live blocks (ordered by ptr)
Definition: CachingDeviceAllocator.h:178

createfilelist.int
int
Definition: createfilelist.py:10

notcub::CachingDeviceAllocator::IntPow
static unsigned int IntPow(unsigned int base, unsigned int exp)
Definition: CachingDeviceAllocator.h:191

notcub::CachingDeviceAllocator::BlockDescriptor::BlockDescriptor
BlockDescriptor(void *d_ptr, int device)
Definition: CachingDeviceAllocator.h:133

notcub::CachingDeviceAllocator::INVALID_SIZE
static const size_t INVALID_SIZE
Invalid size.
Definition: CachingDeviceAllocator.h:109

notcub::CachingDeviceAllocator::INVALID_DEVICE_ORDINAL
static const int INVALID_DEVICE_ORDINAL
Invalid device ordinal.
Definition: CachingDeviceAllocator.h:114

notcub::CachingDeviceAllocator::skip_cleanup
const bool skip_cleanup
Whether or not to skip a call to FreeAllCached() when destructor is called.
Definition: CachingDeviceAllocator.h:239

notcub::CachingDeviceAllocator::BlockDescriptor::device
int device
Definition: CachingDeviceAllocator.h:128

notcub::CachingDeviceAllocator::bin_growth
unsigned int bin_growth
Geometric growth factor for bin-sizes.
Definition: CachingDeviceAllocator.h:230

notcub::CachingDeviceAllocator::DeviceAllocate
cudaError_t DeviceAllocate(int device, void **d_ptr, size_t bytes, cudaStream_t active_stream=nullptr)
Provides a suitable allocation of device memory for the given size on the specified device...
Definition: CachingDeviceAllocator.h:331

value
Definition: value.py:1

notcub::CachingDeviceAllocator::BlockDescriptor::bytes
size_t bytes
Definition: CachingDeviceAllocator.h:125

notcub::CachingDeviceAllocator::BlockDescriptor::d_ptr
void * d_ptr
Definition: CachingDeviceAllocator.h:124

notcub::CachingDeviceAllocator::BlockDescriptor::SizeCompare
static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
Definition: CachingDeviceAllocator.h:161

electrons_cff.bool
bool
Definition: electrons_cff.py:315

notcub::CachingDeviceAllocator::max_bin
unsigned int max_bin
Maximum bin enumeration.
Definition: CachingDeviceAllocator.h:232

notcub::CachingDeviceAllocator::mutex
std::mutex mutex
Definition: CachingDeviceAllocator.h:228

cms::alpakatools::detail::power
constexpr unsigned int power(unsigned int base, unsigned int exponent)
Definition: CachingAllocator.h:27

cudaCheck.h

notcub::CachingDeviceAllocator::max_bin_bytes
size_t max_bin_bytes
Maximum bin size.
Definition: CachingDeviceAllocator.h:235

notcub::CachingDeviceAllocator::CachingDeviceAllocator
CachingDeviceAllocator(unsigned int bin_growth, unsigned int min_bin=1, unsigned int max_bin=INVALID_BIN, size_t max_cached_bytes=INVALID_SIZE, bool skip_cleanup=false, bool debug=false)
Constructor.
Definition: CachingDeviceAllocator.h:255

b
double b
Definition: hdecay.h:118

notcub::CachingDeviceAllocator::CachedBlocks
std::multiset< BlockDescriptor, Compare > CachedBlocks
Set type for cached blocks (ordered by size)
Definition: CachingDeviceAllocator.h:175

notcub::CachingDeviceAllocator::DeviceAllocate
cudaError_t DeviceAllocate(void **d_ptr, size_t bytes, cudaStream_t active_stream=nullptr)
Provides a suitable allocation of device memory for the given size on the current device...
Definition: CachingDeviceAllocator.h:541

notcub::CachingDeviceAllocator::BlockDescriptor
Definition: CachingDeviceAllocator.h:123

notcub::CachingDeviceAllocator::FreeAllCached
cudaError_t FreeAllCached()
Frees all cached device allocations on all devices.
Definition: CachingDeviceAllocator.h:666

notcub::CachingDeviceAllocator::cached_bytes
GpuCachedBytes cached_bytes
Map of device ordinal to aggregate cached bytes on that device.
Definition: CachingDeviceAllocator.h:242

a
double a
Definition: hdecay.h:119

notcub::CachingDeviceAllocator::max_cached_bytes
size_t max_cached_bytes
Maximum aggregate cached bytes per device.
Definition: CachingDeviceAllocator.h:236

notcub::CachingDeviceAllocator::SetMaxCachedBytes
cudaError_t SetMaxCachedBytes(size_t max_cached_bytes)
Sets the limit on the number bytes this allocator is allowed to cache per device. ...
Definition: CachingDeviceAllocator.h:305

notcub::CachingDeviceAllocator::min_bin
unsigned int min_bin
Minimum bin enumeration.
Definition: CachingDeviceAllocator.h:231

cudaCheck
#define cudaCheck(ARG,...)
Definition: cudaCheck.h:69

notcub::CachingDeviceAllocator::debug
bool debug
Whether or not to print (de)allocation events to stdout.
Definition: CachingDeviceAllocator.h:240

JetChargeProducer_cfi.exp
exp
Definition: JetChargeProducer_cfi.py:6

newFWLiteAna.found
found
Definition: newFWLiteAna.py:118

notcub::CachingDeviceAllocator::BlockDescriptor::bin
unsigned int bin
Definition: CachingDeviceAllocator.h:127

cms::cuda::allocator::GpuCachedBytes
std::map< int, TotalBytes > GpuCachedBytes
Map type of device ordinals to the number of cached bytes cached by each device.
Definition: deviceAllocatorStatus.h:18

notcub::CachingDeviceAllocator::min_bin_bytes
size_t min_bin_bytes
Minimum bin size.
Definition: CachingDeviceAllocator.h:234

deviceAllocatorStatus.h