// dd/dad/CachingDeviceAllocator_8h_source.html

#ifndef HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h

#define HeterogenousCore_CUDAUtilities_src_CachingDeviceAllocator_h


/******************************************************************************

 * Copyright (c) 2011, Duane Merrill.  All rights reserved.

 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *     * Redistributions of source code must retain the above copyright

 *       notice, this list of conditions and the following disclaimer.

 *     * Redistributions in binary form must reproduce the above copyright

 *       notice, this list of conditions and the following disclaimer in the

 *       documentation and/or other materials provided with the distribution.

 *     * Neither the name of the NVIDIA CORPORATION nor the

 *       names of its contributors may be used to endorse or promote products

 *       derived from this software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY

 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 *

 ******************************************************************************/


/******************************************************************************

 * Simple caching allocator for device memory allocations. The allocator is

 * thread-safe and capable of managing device allocations on multiple devices.

 ******************************************************************************/


#include <cmath>

#include <map>

#include <set>

#include <mutex>


#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

#include "HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h"


namespace notcub {


  /******************************************************************************

 * CachingDeviceAllocator (host use)

 ******************************************************************************/


  struct CachingDeviceAllocator {

    //---------------------------------------------------------------------

    // Constants

    //---------------------------------------------------------------------


    // Sentinel bin index (largest unsigned int): DeviceAllocate assigns it to blocks that
    // exceed max_bin, and DeviceFree never returns such blocks to the cache.
    static const unsigned int INVALID_BIN = static_cast<unsigned int>(-1);

    // Sentinel byte count (largest size_t); default value of the max_cached_bytes
    // constructor argument.
    static const size_t INVALID_SIZE = static_cast<size_t>(-1);


#ifndef DOXYGEN_SHOULD_SKIP_THIS  // Do not document


    // Sentinel device ordinal. When passed as the `device` argument of DeviceAllocate or
    // DeviceFree, the device reported by cudaGetDevice() is used instead.
    static const int INVALID_DEVICE_ORDINAL = -1;


    //---------------------------------------------------------------------

    // Type definitions and helper types

    //---------------------------------------------------------------------


    // Bookkeeping record for one device allocation tracked by the allocator.
    // The same type doubles as a search key for the block containers.
    struct BlockDescriptor {
      void *d_ptr;                     // Device pointer
      size_t bytes;                    // Size of the allocation in bytes
      size_t bytesRequested;           // CMS: requested allocation size (for monitoring only)
      unsigned int bin;                // Bin enumeration
      int device;                      // Device ordinal
      cudaStream_t associated_stream;  // Stream associated with this block
      cudaEvent_t ready_event;  // Signal when associated stream has run to the point at which this block was freed

      // Key for one specific block, identified by its pointer and device
      // (suitable for searching a container for that block). All other
      // fields are zero / null and the bin is INVALID_BIN.
      BlockDescriptor(void *d_ptr, int device)
          : d_ptr(d_ptr),
            bytes(0),
            bytesRequested(0),  // CMS
            bin(INVALID_BIN),
            device(device),
            associated_stream(nullptr),
            ready_event(nullptr) {}

      // Key for a whole device (suitable for searching a container for the
      // range of blocks on that device): same as above with a null pointer.
      BlockDescriptor(int device) : BlockDescriptor(nullptr, device) {}

      // Ordering by device ordinal first, then by device pointer.
      static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) {
        if (a.device != b.device)
          return (a.device < b.device);
        return (a.d_ptr < b.d_ptr);
      }

      // Ordering by device ordinal first, then by allocation size in bytes.
      static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) {
        if (a.device != b.device)
          return (a.device < b.device);
        return (a.bytes < b.bytes);
      }
    };


    // Signature of the ordering predicates (BlockDescriptor::PtrCompare / SizeCompare)
    // that are handed to the block containers at construction time.
    using Compare = bool (*)(const BlockDescriptor &, const BlockDescriptor &);

    // CMS: Moved TotalBytes to deviceAllocatorStatus.h

    // Container type for blocks that are cached and available for reuse.
    using CachedBlocks = std::multiset<BlockDescriptor, Compare>;

    // Container type for blocks that are currently handed out.
    using BusyBlocks = std::multiset<BlockDescriptor, Compare>;

    // CMS: Moved definition to deviceAllocatorStatus.h
    using GpuCachedBytes = cms::cuda::allocator::GpuCachedBytes;


    //---------------------------------------------------------------------

    // Utility functions

    //---------------------------------------------------------------------


    // Integer power by repeated squaring: returns base raised to exp.
    // All arithmetic is done in unsigned int, so results that do not fit wrap around.
    static unsigned int IntPow(unsigned int base, unsigned int exp) {
      unsigned int result = 1;
      for (; exp != 0; exp >>= 1) {
        // The lowest remaining exponent bit selects whether the current square contributes
        if ((exp & 1u) != 0) {
          result *= base;
        }
        base *= base;  // next square
      }
      return result;
    }


    // Round `value` up to the nearest power of `base`.
    //
    // Outputs:
    //   power         - the exponent p of the result
    //   rounded_bytes - base^p, the smallest power of base that is >= value
    //                   (1 with power 0 when value is 0 or 1)
    //
    // Saturation: when no power of `base` that is >= value can be represented
    // in size_t, `power` is set to the number of bits in size_t and
    // `rounded_bytes` to the largest size_t value. This covers
    //   - value * base wrapping below value (the original early check),
    //   - the running product about to overflow inside the loop, and
    //   - base < 2 with value > 1, where the product can never grow.
    // Without the last two guards the loop could wrap rounded_bytes (to 0 for
    // power-of-two bases) and never terminate.
    void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) {
      const size_t max_size = size_t(0) - 1;

      power = 0;
      rounded_bytes = 1;

      if ((value * base < value) || ((base < 2) && (value > 1))) {
        // Overflow, or a base whose powers cannot reach value
        power = sizeof(size_t) * 8;
        rounded_bytes = max_size;
        return;
      }

      // base >= 2 whenever the loop body runs, so the division below is safe
      while (rounded_bytes < value) {
        if (rounded_bytes > max_size / base) {
          // The next multiplication would not fit in size_t
          power = sizeof(size_t) * 8;
          rounded_bytes = max_size;
          return;
        }
        rounded_bytes *= base;
        power++;
      }
    }


    //---------------------------------------------------------------------

    // Fields

    //---------------------------------------------------------------------


    // CMS: use std::mutex instead of cub::Mutex, declare mutable

    mutable std::mutex mutex;  // Locked by the methods below around accesses to the block containers and byte counters


    unsigned int bin_growth;  // Base of the bin sizes: bin b holds blocks of bin_growth^b bytes

    unsigned int min_bin;  // Smallest bin; DeviceAllocate rounds smaller requests up to it

    unsigned int max_bin;  // Largest bin; DeviceAllocate allocates larger requests exactly and they are not cached


    size_t min_bin_bytes;  // IntPow(bin_growth, min_bin)

    size_t max_bin_bytes;  // IntPow(bin_growth, max_bin)

    size_t max_cached_bytes;  // DeviceFree caches a block only if the device's free bytes stay within this limit


    const bool

        skip_cleanup;  // When true, the destructor does not call FreeAllCached()

    bool debug;  // When true, the methods print their activity with printf


    GpuCachedBytes cached_bytes;  // Byte counters (free, live, liveRequested) indexed by device ordinal

    CachedBlocks cached_blocks;  // Blocks available for reuse, ordered with BlockDescriptor::SizeCompare

    BusyBlocks live_blocks;  // Blocks currently handed out, ordered with BlockDescriptor::PtrCompare


#endif  // DOXYGEN_SHOULD_SKIP_THIS


    //---------------------------------------------------------------------

    // Methods

    //---------------------------------------------------------------------


    // Constructs an allocator with an explicit bin configuration.
    //
    //   bin_growth       - base of the geometric bin sizes
    //   min_bin          - smallest bin exponent (default 1)
    //   max_bin          - largest bin exponent (default INVALID_BIN)
    //   max_cached_bytes - limit applied by DeviceFree when deciding whether to cache a block
    //                      (default INVALID_SIZE)
    //   skip_cleanup     - when true, the destructor does not free the cached blocks
    //   debug            - when true, the methods print their activity
    //
    // The initializer list uses the constructor parameters, which shadow the
    // members of the same name.
    // NOTE(review): IntPow works in unsigned int, so with the default
    // max_bin = INVALID_BIN the value stored in max_bin_bytes is a wrapped
    // result rather than a meaningful size. Nothing visible in this
    // constructor's path reads it, but confirm before relying on it.
    CachingDeviceAllocator(

        unsigned int bin_growth,

        unsigned int min_bin = 1,

        unsigned int max_bin = INVALID_BIN,

        size_t max_cached_bytes = INVALID_SIZE,

        bool skip_cleanup =

            false,

        bool debug = false)

        : bin_growth(bin_growth),

          min_bin(min_bin),

          max_bin(max_bin),

          min_bin_bytes(IntPow(bin_growth, min_bin)),

          max_bin_bytes(IntPow(bin_growth, max_bin)),

          max_cached_bytes(max_cached_bytes),

          skip_cleanup(skip_cleanup),

          debug(debug),

          cached_blocks(BlockDescriptor::SizeCompare),  // cache ordered by device, then size

          live_blocks(BlockDescriptor::PtrCompare) {}  // live set ordered by device, then pointer


    // Constructs an allocator with the default bin configuration:
    //   bin_growth = 8, min_bin = 3, max_bin = 7,
    //   max_cached_bytes = 3 * 8^7 - 1 bytes.
    // Forwards to the fully-parameterised constructor with those values.
    //
    //   skip_cleanup - when true, the destructor does not free the cached blocks
    //   debug        - when true, the methods print their activity
    CachingDeviceAllocator(bool skip_cleanup = false, bool debug = false)
        : CachingDeviceAllocator(8,                                            // bin_growth
                                 3,                                            // min_bin
                                 7,                                            // max_bin
                                 (static_cast<size_t>(IntPow(8, 7)) * 3) - 1,  // (max_bin_bytes * 3) - 1
                                 skip_cleanup,
                                 debug) {}


    // Replaces the limit that DeviceFree applies when deciding whether a
    // returned block may stay in the cache. Blocks that are already cached
    // are not touched by this call.
    //
    // Takes the allocator mutex for the duration of the update.
    // Always returns cudaSuccess.
    cudaError_t SetMaxCachedBytes(size_t max_cached_bytes) {
      // CMS: use RAII instead of (un)locking explicitly
      std::lock_guard<std::mutex> guard(mutex);

      if (debug) {
        // CMS: use raw printf
        printf("Changing max_cached_bytes (%lld -> %lld)\n",
               (long long)this->max_cached_bytes,
               (long long)max_cached_bytes);
      }

      // The parameter shadows the member, hence the explicit this->
      this->max_cached_bytes = max_cached_bytes;

      return cudaSuccess;
    }


    // Provides a device allocation of at least `bytes` bytes on `device`,
    // reusing a cached block when a suitable one exists.
    //
    // Parameters:
    //   device        - device ordinal, or INVALID_DEVICE_ORDINAL to use the
    //                   device reported by cudaGetDevice()
    //   d_ptr         - [out] receives the device pointer; set to nullptr on
    //                   entry and left null when an error is returned
    //   bytes         - requested size; rounded up to a power of bin_growth
    //                   (at least min_bin_bytes) unless the bin exceeds max_bin,
    //                   in which case exactly `bytes` are allocated and the
    //                   block is not cached when it is freed
    //   active_stream - stream the allocation is associated with
    //
    // A cached block is reused only if it was last associated with
    // `active_stream` or if its ready_event no longer reports
    // cudaErrorNotReady.
    //
    // Returns cudaSuccess, or the CUDA error of a failed cudaMalloc /
    // cudaFree / cudaEventDestroy. Calls wrapped in cudaCheck throw on error
    // instead (per the CMS comments; cudaCheck is defined in cudaCheck.h).
    //
    // Fixes relative to the previous version:
    //   - the out-of-memory cleanup loop advanced an iterator after erasing
    //     it (undefined behaviour); it now uses the iterator returned by erase
    //   - a reused block records the size of the current request in
    //     bytesRequested instead of keeping the previous user's request
    //   - a first cudaMalloc failure other than cudaErrorMemoryAllocation is
    //     returned to the caller instead of being overwritten, which used to
    //     register and hand out an unallocated block
    //   - the early error returns switch back to the entry device first
    cudaError_t DeviceAllocate(
        int device,
        void **d_ptr,
        size_t bytes,
        cudaStream_t active_stream = nullptr)
    {
      // CMS: use RAII instead of (un)locking explicitly
      std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);
      *d_ptr = nullptr;
      int entrypoint_device = INVALID_DEVICE_ORDINAL;
      cudaError_t error = cudaSuccess;

      if (device == INVALID_DEVICE_ORDINAL) {
        // CMS: throw exception on error
        cudaCheck(error = cudaGetDevice(&entrypoint_device));
        device = entrypoint_device;
      }

      // Best-effort switch back to the device that was current on entry. Only used on the
      // early error returns, where the error already in hand is the one to report, so the
      // result of cudaSetDevice is deliberately not propagated.
      auto restore_entrypoint_device = [&]() {
        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
          (void)cudaSetDevice(entrypoint_device);
      };

      // Create a block descriptor for the requested allocation
      bool found = false;
      BlockDescriptor search_key(device);
      search_key.bytesRequested = bytes;  // CMS
      search_key.associated_stream = active_stream;
      NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);

      if (search_key.bin > max_bin) {
        // Bin is greater than our maximum bin: allocate the request
        // exactly and give out-of-bounds bin.  It will not be cached
        // for reuse when returned.
        search_key.bin = INVALID_BIN;
        search_key.bytes = bytes;
      } else {
        // Search for a suitable cached allocation: lock
        mutex_locker.lock();

        if (search_key.bin < min_bin) {
          // Bin is less than minimum bin: round up
          search_key.bin = min_bin;
          search_key.bytes = min_bin_bytes;
        }

        // Iterate through the range of cached blocks on the same device in the same bin
        CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
        while ((block_itr != cached_blocks.end()) && (block_itr->device == device) &&
               (block_itr->bin == search_key.bin)) {
          // To prevent races with reusing blocks returned by the host but still
          // in use by the device, only consider cached blocks that are
          // either (from the active stream) or (from an idle stream)
          if ((active_stream == block_itr->associated_stream) ||
              (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
            // Reuse existing cache block.  Insert into live blocks.
            found = true;
            search_key = *block_itr;
            search_key.associated_stream = active_stream;
            // The copy above carries the previous user's request size; record the current one
            // so that liveRequested reflects what is actually being asked for.
            search_key.bytesRequested = bytes;  // CMS
            live_blocks.insert(search_key);

            // Remove from free blocks
            cached_bytes[device].free -= search_key.bytes;
            cached_bytes[device].live += search_key.bytes;
            cached_bytes[device].liveRequested += search_key.bytesRequested;  // CMS

            if (debug)
              // CMS: improved debug message
              // CMS: use raw printf
              printf(
                  "\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously "
                  "associated with stream %lld, event %lld).\n",
                  device,
                  search_key.d_ptr,
                  (long long)search_key.bytes,
                  (long long)search_key.associated_stream,
                  (long long)search_key.ready_event,
                  (long long)block_itr->associated_stream,
                  (long long)block_itr->ready_event);

            cached_blocks.erase(block_itr);

            break;
          }
          block_itr++;
        }

        // Done searching: unlock
        mutex_locker.unlock();
      }

      // Allocate the block if necessary
      if (!found) {
        // Set runtime's current device to specified device (entrypoint may not be set)
        if (device != entrypoint_device) {
          // CMS: throw exception on error
          cudaCheck(error = cudaGetDevice(&entrypoint_device));
          cudaCheck(error = cudaSetDevice(device));
        }

        // Attempt to allocate
        // CMS: silently ignore errors and retry or pass them to the caller
        error = cudaMalloc(&search_key.d_ptr, search_key.bytes);
        if (error == cudaErrorMemoryAllocation) {
          // The allocation attempt failed: free all cached blocks on device and retry
          if (debug)
            // CMS: use raw printf
            printf(
                "\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
                device,
                (long long)search_key.bytes,
                (long long)search_key.associated_stream);

          error = cudaSuccess;  // Reset the error we will return
          cudaGetLastError();   // Reset CUDART's error

          // Lock
          mutex_locker.lock();

          // Iterate the range of free blocks on the same device
          BlockDescriptor free_key(device);
          CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);

          while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) {
            // No need to worry about synchronization with the device: cudaFree is
            // blocking and will synchronize across all kernels executing
            // on the current device

            // Free device memory and destroy stream event.
            // CMS: silently ignore errors and pass them to the caller
            // NOTE(review): if cudaEventDestroy fails after cudaFree succeeded, the entry stays
            // in the cache although its memory is gone - confirm whether that needs handling.
            if ((error = cudaFree(block_itr->d_ptr)))
              break;
            if ((error = cudaEventDestroy(block_itr->ready_event)))
              break;

            // Reduce balance and erase entry
            cached_bytes[device].free -= block_itr->bytes;

            if (debug)
              // CMS: use raw printf
              printf(
                  "\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks "
                  "(%lld bytes) outstanding.\n",
                  device,
                  (long long)block_itr->bytes,
                  (long long)cached_blocks.size(),
                  (long long)cached_bytes[device].free,
                  (long long)live_blocks.size(),
                  (long long)cached_bytes[device].live);

            // erase() invalidates block_itr; continue from the iterator it returns
            block_itr = cached_blocks.erase(block_itr);
          }

          // Unlock
          mutex_locker.unlock();

          // Return under error
          if (error) {
            restore_entrypoint_device();
            return error;
          }

          // Try to allocate again
          // CMS: throw exception on error
          cudaCheck(error = cudaMalloc(&search_key.d_ptr, search_key.bytes));
        } else if (error != cudaSuccess) {
          // Any other allocation failure is passed to the caller; nothing was allocated,
          // so no block is registered and *d_ptr stays null.
          restore_entrypoint_device();
          return error;
        }

        // Create ready event
        // CMS: throw exception on error
        cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));

        // Insert into live blocks
        mutex_locker.lock();
        live_blocks.insert(search_key);
        cached_bytes[device].live += search_key.bytes;
        cached_bytes[device].liveRequested += search_key.bytesRequested;  // CMS
        mutex_locker.unlock();

        if (debug)
          // CMS: improved debug message
          // CMS: use raw printf
          printf("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n",
                 device,
                 search_key.d_ptr,
                 (long long)search_key.bytes,
                 (long long)search_key.associated_stream,
                 (long long)search_key.ready_event);

        // Attempt to revert back to previous device if necessary
        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) {
          // CMS: throw exception on error
          cudaCheck(error = cudaSetDevice(entrypoint_device));
        }
      }

      // Copy device pointer to output parameter
      *d_ptr = search_key.d_ptr;

      // NOTE(review): this debug output reads the containers and counters without holding
      // the mutex.
      if (debug)
        // CMS: use raw printf
        printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
               (long long)cached_blocks.size(),
               (long long)cached_bytes[device].free,
               (long long)live_blocks.size(),
               (long long)cached_bytes[device].live);

      return error;
    }


    // Convenience overload of DeviceAllocate for callers that do not name a
    // device: forwards to the device-aware overload with
    // INVALID_DEVICE_ORDINAL, which makes it use the device reported by
    // cudaGetDevice(). Parameters and return value are otherwise the same.
    cudaError_t DeviceAllocate(void **d_ptr, size_t bytes, cudaStream_t active_stream = nullptr) {
      const int use_current_device = INVALID_DEVICE_ORDINAL;
      return DeviceAllocate(use_current_device, d_ptr, bytes, active_stream);
    }


    // Returns a device allocation to the allocator.
    //
    // Parameters:
    //   device - device ordinal the block lives on, or INVALID_DEVICE_ORDINAL
    //            to use the device reported by cudaGetDevice()
    //   d_ptr  - device pointer to release
    //
    // If the pointer is found among the live blocks, it is removed from them.
    // It is then kept in the cache when its bin is valid and the device's
    // cached bytes would not exceed max_cached_bytes; in that case the block's
    // ready_event is recorded on its associated stream while the mutex is
    // still held. Otherwise the memory is released with cudaFree and the
    // event is destroyed.
    //
    // NOTE(review): a pointer that is not a live block of this allocator still
    // goes down the cudaFree path, with cudaEventDestroy called on a null
    // event - confirm this is the intended way to report the misuse.
    //
    // Returns the status of the last CUDA call made; calls wrapped in
    // cudaCheck throw on error (per the CMS comments).
    cudaError_t DeviceFree(int device, void *d_ptr) {
      cudaError_t error = cudaSuccess;
      int entrypoint_device = INVALID_DEVICE_ORDINAL;
      // CMS: use RAII instead of (un)locking explicitly
      std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);

      // No device given: work on the one that is current for the caller
      if (device == INVALID_DEVICE_ORDINAL) {
        // CMS: throw exception on error
        cudaCheck(error = cudaGetDevice(&entrypoint_device));
        device = entrypoint_device;
      }

      // Everything up to the event record happens under the lock
      mutex_locker.lock();

      // Look the pointer up among the blocks currently handed out
      BlockDescriptor block(d_ptr, device);
      bool keep_cached = false;
      BusyBlocks::iterator live_it = live_blocks.find(block);
      if (live_it != live_blocks.end()) {
        // Take the full descriptor and drop it from the live set
        block = *live_it;
        live_blocks.erase(live_it);
        cached_bytes[device].live -= block.bytes;
        cached_bytes[device].liveRequested -= block.bytesRequested;  // CMS

        // Cache the block only if it belongs to a bin and the cache limit allows it
        if ((block.bin != INVALID_BIN) && (cached_bytes[device].free + block.bytes <= max_cached_bytes)) {
          keep_cached = true;
          cached_blocks.insert(block);
          cached_bytes[device].free += block.bytes;

          if (debug) {
            // CMS: improved debug message
            // CMS: use raw printf
            printf(
                "\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available "
                "blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
                device,
                (long long)block.bytes,
                d_ptr,
                (long long)block.associated_stream,
                (long long)block.ready_event,
                (long long)cached_blocks.size(),
                (long long)cached_bytes[device].free,
                (long long)live_blocks.size(),
                (long long)cached_bytes[device].live);
          }
        }
      }

      // Make the block's device current (the entry device may not be known yet)
      if (device != entrypoint_device) {
        // CMS: throw exception on error
        cudaCheck(error = cudaGetDevice(&entrypoint_device));
        cudaCheck(error = cudaSetDevice(device));
      }

      if (keep_cached) {
        // Mark the point in the associated stream at which the block was returned
        // (requires the block's device to be current)
        // CMS: throw exception on error
        cudaCheck(error = cudaEventRecord(block.ready_event, block.associated_stream));
      }

      mutex_locker.unlock();

      if (!keep_cached) {
        // Not cached: hand the memory back to the runtime and drop the event
        // CMS: throw exception on error
        cudaCheck(error = cudaFree(d_ptr));
        cudaCheck(error = cudaEventDestroy(block.ready_event));

        if (debug) {
          // CMS: improved debug message
          printf(
              "\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t  %lld available "
              "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
              device,
              (long long)block.bytes,
              d_ptr,
              (long long)block.associated_stream,
              (long long)block.ready_event,
              (long long)cached_blocks.size(),
              (long long)cached_bytes[device].free,
              (long long)live_blocks.size(),
              (long long)cached_bytes[device].live);
        }
      }

      // Switch back to the device that was current on entry, if it was changed
      if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) {
        // CMS: throw exception on error
        cudaCheck(error = cudaSetDevice(entrypoint_device));
      }

      return error;
    }


    // Convenience overload of DeviceFree for callers that do not name a
    // device: forwards with INVALID_DEVICE_ORDINAL, which makes the
    // device-aware overload use the device reported by cudaGetDevice().
    cudaError_t DeviceFree(void *d_ptr) {
      const int use_current_device = INVALID_DEVICE_ORDINAL;
      return DeviceFree(use_current_device, d_ptr);
    }


    // Frees every cached (not live) block on all devices.
    //
    // Holds the allocator mutex while walking the cache. For each block the
    // owning device is made current, the memory is released with cudaFree and
    // the block's ready_event is destroyed. The walk stops at the first CUDA
    // error. Afterwards the device that was current on entry is restored.
    //
    // Returns cudaSuccess, or the first error hit while freeing. Previously
    // that error was overwritten by the result of the final cudaSetDevice, so
    // a failed cleanup was reported as success. A failure of the final
    // cudaSetDevice goes through cudaCheck, which throws (per the CMS comment).
    cudaError_t FreeAllCached() {
      cudaError_t error = cudaSuccess;
      int entrypoint_device = INVALID_DEVICE_ORDINAL;
      int current_device = INVALID_DEVICE_ORDINAL;
      // CMS: use RAII instead of (un)locking explicitly
      std::unique_lock<std::mutex> mutex_locker(mutex);

      while (!cached_blocks.empty()) {
        // Get first block
        CachedBlocks::iterator begin = cached_blocks.begin();

        // Get entry-point device ordinal if necessary
        if (entrypoint_device == INVALID_DEVICE_ORDINAL) {
          // CMS: silently ignore errors and pass them to the caller
          if ((error = cudaGetDevice(&entrypoint_device)))
            break;
        }

        // Set current device ordinal if necessary
        if (begin->device != current_device) {
          // CMS: silently ignore errors and pass them to the caller
          if ((error = cudaSetDevice(begin->device)))
            break;
          current_device = begin->device;
        }

        // Free device memory
        // CMS: silently ignore errors and pass them to the caller
        if ((error = cudaFree(begin->d_ptr)))
          break;
        if ((error = cudaEventDestroy(begin->ready_event)))
          break;

        // Reduce balance and erase entry
        cached_bytes[current_device].free -= begin->bytes;

        if (debug)
          printf(
              "\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
              "bytes) outstanding.\n",
              current_device,
              (long long)begin->bytes,
              (long long)cached_blocks.size(),
              (long long)cached_bytes[current_device].free,
              (long long)live_blocks.size(),
              (long long)cached_bytes[current_device].live);

        cached_blocks.erase(begin);
      }

      mutex_locker.unlock();

      // Attempt to revert back to entry-point device if necessary
      if (entrypoint_device != INVALID_DEVICE_ORDINAL) {
        // Keep the restore status separate so that an error recorded by the loop above is
        // still the one returned to the caller.
        cudaError_t restore_error = cudaSuccess;
        // CMS: throw exception on error
        cudaCheck(restore_error = cudaSetDevice(entrypoint_device));
        if (error == cudaSuccess)
          error = restore_error;
      }

      return error;
    }


    // CMS: give access to cache allocation status

    // Returns a copy of the byte counters, taken while holding the allocator
    // mutex (the mutex is declared mutable so this can be a const method).
    GpuCachedBytes CacheStatus() const {
      std::lock_guard<std::mutex> guard(mutex);
      return cached_bytes;
    }


    // CMS: make the destructor not virtual

    // Releases all cached blocks through FreeAllCached() unless the allocator
    // was constructed with skip_cleanup. The status returned by
    // FreeAllCached() is not inspected here.
    ~CachingDeviceAllocator() {
      if (skip_cleanup) {
        return;
      }
      FreeAllCached();
    }

  };

  // end group UtilMgmt


}  // namespace notcub


#endif