// dd/d5a/CachingHostAllocator_8h_source.html

#ifndef HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h

#define HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h


/******************************************************************************

 * Copyright (c) 2011, Duane Merrill.  All rights reserved.

 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *     * Redistributions of source code must retain the above copyright

 *       notice, this list of conditions and the following disclaimer.

 *     * Redistributions in binary form must reproduce the above copyright

 *       notice, this list of conditions and the following disclaimer in the

 *       documentation and/or other materials provided with the distribution.

 *     * Neither the name of the NVIDIA CORPORATION nor the

 *       names of its contributors may be used to endorse or promote products

 *       derived from this software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY

 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 *

 ******************************************************************************/


/******************************************************************************

 * Simple caching allocator for pinned host memory allocations. The allocator is

 * thread-safe.

 ******************************************************************************/


#include <cmath>

#include <map>

#include <set>

#include <mutex>


#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"


namespace notcub {


  /******************************************************************************

 * CachingHostAllocator (host use)

 ******************************************************************************/


  struct CachingHostAllocator {

    //---------------------------------------------------------------------

    // Constants

    //---------------------------------------------------------------------


    // Sentinel bin index (largest unsigned int). BlockDescriptor starts with this bin, and
    // HostAllocate() assigns it to requests whose bin exceeds max_bin; HostFree() never caches
    // a block carrying it.
    static const unsigned int INVALID_BIN = (unsigned int)-1;


    // Sentinel size (largest size_t); default value of the constructor's max_cached_bytes parameter.
    static const size_t INVALID_SIZE = (size_t)-1;


#ifndef DOXYGEN_SHOULD_SKIP_THIS  // Do not document


    // Sentinel device ordinal meaning "no device known".
    static const int INVALID_DEVICE_ORDINAL = -1;


    //---------------------------------------------------------------------

    // Type definitions and helper types

    //---------------------------------------------------------------------


    // Bookkeeping record describing one pinned host allocation tracked by the allocator.
    struct BlockDescriptor {
      void *d_ptr;                     // Host pointer
      size_t bytes;                    // Size of the allocation in bytes
      unsigned int bin;                // Index of the size bin the block belongs to
      int device;                      // Device ordinal the block is associated with
      cudaStream_t associated_stream;  // Stream the block is associated with
      cudaEvent_t ready_event;         // Event recorded in associated_stream when the block is returned to the cache

      // Builds a key that carries only a pointer; every other field holds its "invalid"/empty value.
      // Suitable for looking a specific block up by address.
      BlockDescriptor(void *d_ptr)
          : d_ptr(d_ptr),
            bytes(0),
            bin(INVALID_BIN),
            device(INVALID_DEVICE_ORDINAL),
            associated_stream(nullptr),
            ready_event(nullptr) {}

      // Builds an empty key (null pointer, zero size); suitable as a starting point for range searches.
      BlockDescriptor() : BlockDescriptor(nullptr) {}

      // Strict ordering by host pointer value.
      static bool PtrCompare(const BlockDescriptor &lhs, const BlockDescriptor &rhs) {
        return lhs.d_ptr < rhs.d_ptr;
      }

      // Strict ordering by allocation size.
      static bool SizeCompare(const BlockDescriptor &lhs, const BlockDescriptor &rhs) {
        return lhs.bytes < rhs.bytes;
      }
    };


    // Pointer to a strict-weak-ordering predicate over block descriptors.
    using Compare = bool (*)(const BlockDescriptor &, const BlockDescriptor &);


    // Pair of running byte counters, both starting at zero.
    class TotalBytes {
    public:
      size_t free;  // bytes held by blocks sitting in the cache
      size_t live;  // bytes held by blocks currently handed out

      TotalBytes() : free(0), live(0) {}
    };


    // Set type for blocks waiting in the cache (the constructors order it with BlockDescriptor::SizeCompare).
    using CachedBlocks = std::multiset<BlockDescriptor, Compare>;

    // Set type for blocks currently handed out (the constructors order it with BlockDescriptor::PtrCompare).
    using BusyBlocks = std::multiset<BlockDescriptor, Compare>;


    //---------------------------------------------------------------------

    // Utility functions

    //---------------------------------------------------------------------


    // Integer power by repeated squaring: returns base raised to exp.
    // The arithmetic is done in unsigned int, so results that do not fit wrap around.
    static unsigned int IntPow(unsigned int base, unsigned int exp) {
      unsigned int result = 1;
      // Consume the exponent one bit at a time, squaring the base for each bit.
      for (; exp != 0; exp >>= 1, base *= base) {
        if ((exp & 1u) != 0) {
          result *= base;
        }
      }
      return result;
    }


    // Rounds `value` up to the nearest power of `base`.
    //
    //   power          [out] exponent such that rounded_bytes == base^power
    //   rounded_bytes  [out] smallest power of base that is >= value (1 when value is 0 or 1)
    //   base           growth factor
    //   value          size to round up
    //
    // If value * base wraps around, power is set to the bit width of size_t and rounded_bytes
    // to the largest size_t value.
    // NOTE(review): with base == 1 and value > 1 the rounding loop never terminates.
    void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) {
      const size_t scaled = value * base;
      if (scaled < value) {
        // Overflow
        power = sizeof(size_t) * 8;
        rounded_bytes = static_cast<size_t>(-1);
        return;
      }

      power = 0;
      rounded_bytes = 1;
      for (; rounded_bytes < value; ++power) {
        rounded_bytes *= base;
      }
    }


    //---------------------------------------------------------------------

    // Fields

    //---------------------------------------------------------------------


    // Locked by the methods below while they touch the block sets, the byte counters
    // and max_cached_bytes.
    std::mutex mutex;


    // Growth factor between bins: requested sizes are rounded up to a power of this value.
    unsigned int bin_growth;

    // Smallest bin index; smaller requests are rounded up to min_bin_bytes.
    unsigned int min_bin;

    // Largest bin index; requests above it are allocated with their exact size and never cached.
    unsigned int max_bin;


    // bin_growth raised to min_bin (computed with IntPow in the constructors).
    size_t min_bin_bytes;

    // bin_growth raised to max_bin (computed with IntPow in the constructors).
    size_t max_bin_bytes;

    // Upper limit on cached_bytes.free: HostFree() caches a returned block only if this is not exceeded.
    size_t max_cached_bytes;


    // When true, the destructor does not call FreeAllCached().
    const bool

        skip_cleanup;

    // When true, the methods report their activity with printf.
    bool debug;


    // Running totals of bytes in cached (free) and handed-out (live) blocks.
    TotalBytes cached_bytes;

    // Blocks available for reuse; constructed with BlockDescriptor::SizeCompare.
    CachedBlocks cached_blocks;

    // Blocks currently handed out; constructed with BlockDescriptor::PtrCompare.
    BusyBlocks live_blocks;


#endif  // DOXYGEN_SHOULD_SKIP_THIS


    //---------------------------------------------------------------------

    // Methods

    //---------------------------------------------------------------------


    // Builds an allocator with a caller-chosen bin geometry.
    //
    //   bin_growth        growth factor between consecutive bins
    //   min_bin           smallest bin index (default 1)
    //   max_bin           largest bin index (default INVALID_BIN)
    //   max_cached_bytes  limit on the bytes kept in the cache (default INVALID_SIZE)
    //   skip_cleanup      when true, the destructor does not call FreeAllCached()
    //   debug             when true, activity is reported with printf
    //
    // min_bin_bytes and max_bin_bytes are derived as IntPow(bin_growth, min_bin / max_bin).
    CachingHostAllocator(unsigned int bin_growth,
                         unsigned int min_bin = 1,
                         unsigned int max_bin = INVALID_BIN,
                         size_t max_cached_bytes = INVALID_SIZE,
                         bool skip_cleanup = false,
                         bool debug = false)
        : bin_growth{bin_growth},
          min_bin{min_bin},
          max_bin{max_bin},
          min_bin_bytes{IntPow(bin_growth, min_bin)},
          max_bin_bytes{IntPow(bin_growth, max_bin)},
          max_cached_bytes{max_cached_bytes},
          skip_cleanup{skip_cleanup},
          debug{debug},
          cached_blocks(BlockDescriptor::SizeCompare),
          live_blocks(BlockDescriptor::PtrCompare) {}


    // Builds an allocator with the built-in configuration: bin_growth 8, min_bin 3, max_bin 7
    // (so bins range from 8^3 = 512 bytes to 8^7 = 2097152 bytes) and a cache limit of
    // three times the largest bin size minus one byte.
    //
    //   skip_cleanup  when true, the destructor does not call FreeAllCached()
    //   debug         when true, activity is reported with printf
    CachingHostAllocator(bool skip_cleanup = false, bool debug = false)
        : bin_growth{8},
          min_bin{3},
          max_bin{7},
          // The members read below are already initialised: they are declared before the bin sizes.
          min_bin_bytes{IntPow(bin_growth, min_bin)},
          max_bin_bytes{IntPow(bin_growth, max_bin)},
          max_cached_bytes{(max_bin_bytes * 3) - 1},
          skip_cleanup{skip_cleanup},
          debug{debug},
          cached_blocks(BlockDescriptor::SizeCompare),
          live_blocks(BlockDescriptor::PtrCompare) {}


    // Replaces the limit on the number of bytes the cache may hold.
    // The update (and the optional debug report) happens while holding the allocator mutex.
    void SetMaxCachedBytes(size_t max_cached_bytes) {
      std::lock_guard<std::mutex> guard(mutex);

      if (debug) {
        const long long previous_limit = (long long)this->max_cached_bytes;
        const long long requested_limit = (long long)max_cached_bytes;
        printf("Changing max_cached_bytes (%lld -> %lld)\n", previous_limit, requested_limit);
      }

      this->max_cached_bytes = max_cached_bytes;
    }


    // Provides a pinned host allocation of at least `bytes` bytes, reusing a cached block when possible.
    //
    // The requested size is rounded up to the nearest power of bin_growth (never below
    // min_bin_bytes). Requests whose bin exceeds max_bin are allocated with their exact size and
    // tagged INVALID_BIN, so HostFree() releases them instead of caching them. A cached block of
    // the same bin is reused only if cudaEventQuery() on its ready_event does not report
    // cudaErrorNotReady. If cudaHostAlloc() reports cudaErrorMemoryAllocation, every cached block
    // is released and the allocation is attempted once more.
    //
    //   d_ptr          [out] receives the host pointer; set to nullptr on entry and left null on failure
    //   bytes          minimum number of bytes requested
    //   active_stream  stream to associate with the returned block (default: nullptr)
    //
    // Returns cudaSuccess, or the first CUDA runtime error that prevented the allocation.
    cudaError_t HostAllocate(void **d_ptr, size_t bytes, cudaStream_t active_stream = nullptr) {
      *d_ptr = nullptr;
      int device = INVALID_DEVICE_ORDINAL;
      cudaError_t error = cudaSuccess;

      cudaCheck(error = cudaGetDevice(&device));

      // Create a block descriptor for the requested allocation
      bool found = false;
      BlockDescriptor search_key;
      search_key.device = device;
      search_key.associated_stream = active_stream;
      NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);

      if (search_key.bin > max_bin) {
        // Bin is greater than our maximum bin: allocate the request
        // exactly and give out-of-bounds bin.  It will not be cached
        // for reuse when returned.
        search_key.bin = INVALID_BIN;
        search_key.bytes = bytes;
      } else {
        // Search for a suitable cached allocation. The scoped lock is released at the end of
        // this branch on every exit path, including if cudaCheck leaves the scope by exception.
        std::lock_guard<std::mutex> cache_lock(mutex);

        if (search_key.bin < min_bin) {
          // Bin is less than minimum bin: round up
          search_key.bin = min_bin;
          search_key.bytes = min_bin_bytes;
        }

        // Iterate through the range of cached blocks in the same bin
        CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
        while ((block_itr != cached_blocks.end()) && (block_itr->bin == search_key.bin)) {
          // To prevent races with reusing blocks returned by the host but still
          // in use for transfers, only consider cached blocks whose ready event has completed
          if (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) {
            // Reuse existing cache block.  Insert into live blocks.
            found = true;
            search_key = *block_itr;
            search_key.associated_stream = active_stream;
            if (search_key.device != device) {
              // If "associated" device changes, need to re-create the event on the right device
              cudaCheck(error = cudaSetDevice(search_key.device));
              cudaCheck(error = cudaEventDestroy(search_key.ready_event));
              cudaCheck(error = cudaSetDevice(device));
              cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
              search_key.device = device;
            }

            live_blocks.insert(search_key);

            // Move the bytes from the free balance to the live balance
            cached_bytes.free -= search_key.bytes;
            cached_bytes.live += search_key.bytes;

            if (debug)
              printf(
                  "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld "
                  "(previously associated with stream %lld, event %lld).\n",
                  search_key.d_ptr,
                  (long long)search_key.bytes,
                  (long long)search_key.associated_stream,
                  (long long)search_key.ready_event,
                  (long long)search_key.device,
                  (long long)block_itr->associated_stream,
                  (long long)block_itr->ready_event);

            // Remove from free blocks (block_itr is not used after this point)
            cached_blocks.erase(block_itr);
            break;
          }
          ++block_itr;
        }
      }

      // Allocate the block if necessary
      if (!found) {
        // Attempt to allocate
        // TODO: eventually support allocation flags
        error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault);
        if (error == cudaErrorMemoryAllocation) {
          // The allocation attempt failed: free all cached blocks and retry
          if (debug)
            printf(
                "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached "
                "allocations",
                (long long)search_key.bytes,
                (long long)search_key.associated_stream,
                (long long)search_key.device);

          error = cudaSuccess;  // Reset the error we will return
          cudaGetLastError();   // Reset CUDART's error

          {
            std::lock_guard<std::mutex> cache_lock(mutex);

            // Release every cached block, stopping at the first failure
            CachedBlocks::iterator block_itr = cached_blocks.begin();
            while (block_itr != cached_blocks.end()) {
              // Free pinned host memory and its event
              if ((error = cudaFreeHost(block_itr->d_ptr)) != cudaSuccess)
                break;
              if ((error = cudaEventDestroy(block_itr->ready_event)) != cudaSuccess)
                break;

              // Reduce balance
              cached_bytes.free -= block_itr->bytes;

              if (debug)
                printf(
                    "\tHost freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
                    "bytes) outstanding.\n",
                    (long long)block_itr->bytes,
                    (long long)cached_blocks.size(),
                    (long long)cached_bytes.free,
                    (long long)live_blocks.size(),
                    (long long)cached_bytes.live);

              // erase() invalidates the erased iterator and returns the next one; continue
              // from that instead of incrementing the erased iterator.
              block_itr = cached_blocks.erase(block_itr);
            }
          }

          // Return under error
          if (error != cudaSuccess)
            return error;

          // Try to allocate again
          cudaCheck(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault));
        }

        // Any allocation failure still pending here must be reported now: otherwise the status
        // would be overwritten below and a block without a valid pointer registered as live.
        if (error != cudaSuccess)
          return error;

        // Create ready event
        cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));

        // Insert into live blocks
        {
          std::lock_guard<std::mutex> cache_lock(mutex);
          live_blocks.insert(search_key);
          cached_bytes.live += search_key.bytes;
        }

        if (debug)
          printf(
              "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device "
              "%lld).\n",
              search_key.d_ptr,
              (long long)search_key.bytes,
              (long long)search_key.associated_stream,
              (long long)search_key.ready_event,
              (long long)search_key.device);
      }

      // Copy host pointer to output parameter
      *d_ptr = search_key.d_ptr;

      if (debug)
        printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
               (long long)cached_blocks.size(),
               (long long)cached_bytes.free,
               (long long)live_blocks.size(),
               (long long)cached_bytes.live);

      return error;
    }


    // Returns a block obtained from HostAllocate() to the allocator.
    //
    // If the block has a valid bin and caching it keeps cached_bytes.free within
    // max_cached_bytes, it is moved to the cache and its ready_event is recorded in its
    // associated stream (with the block's device made current). Otherwise the memory is released
    // with cudaFreeHost() and the event destroyed. The device that was current on entry is
    // restored before returning.
    //
    // A pointer that is not in live_blocks (including nullptr) has no device or event on record:
    // it is passed to cudaFreeHost() without switching devices or destroying an event.
    //
    //   d_ptr  host pointer to release
    //
    // Returns the status of the last CUDA runtime call made.
    cudaError_t HostFree(void *d_ptr) {
      int entrypoint_device = INVALID_DEVICE_ORDINAL;
      cudaError_t error = cudaSuccess;
      bool recached = false;
      bool device_switched = false;
      BlockDescriptor search_key(d_ptr);

      {
        // Scoped lock: released at the end of this block on every exit path, including if
        // cudaCheck leaves the scope by exception.
        std::lock_guard<std::mutex> cache_lock(mutex);

        // Find corresponding block descriptor
        BusyBlocks::iterator block_itr = live_blocks.find(search_key);
        if (block_itr != live_blocks.end()) {
          // Remove from live blocks
          search_key = *block_itr;
          live_blocks.erase(block_itr);
          cached_bytes.live -= search_key.bytes;

          // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
          if ((search_key.bin != INVALID_BIN) && (cached_bytes.free + search_key.bytes <= max_cached_bytes)) {
            // Insert returned allocation into free blocks
            recached = true;
            cached_blocks.insert(search_key);
            cached_bytes.free += search_key.bytes;

            if (debug)
              printf(
                  "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld "
                  "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
                  (long long)search_key.bytes,
                  (long long)search_key.associated_stream,
                  (long long)search_key.ready_event,
                  (long long)search_key.device,
                  (long long)cached_blocks.size(),
                  (long long)cached_bytes.free,
                  (long long)live_blocks.size(),
                  (long long)cached_bytes.live);
          }
        }

        cudaCheck(error = cudaGetDevice(&entrypoint_device));

        // Switch to the block's device only when one is on record; an untracked pointer leaves
        // search_key.device at INVALID_DEVICE_ORDINAL, which must not reach cudaSetDevice().
        if ((search_key.device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) {
          cudaCheck(error = cudaSetDevice(search_key.device));
          device_switched = true;
        }

        if (recached) {
          // Insert the ready event in the associated stream (must have current device set properly)
          cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream));
        }
      }

      if (!recached) {
        // Free the allocation from the runtime and cleanup the event.
        cudaCheck(error = cudaFreeHost(d_ptr));
        if (search_key.ready_event != nullptr) {
          // An untracked pointer has no event to destroy
          cudaCheck(error = cudaEventDestroy(search_key.ready_event));
        }

        if (debug)
          printf(
              "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t  %lld available "
              "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
              (long long)search_key.bytes,
              (long long)search_key.associated_stream,
              (long long)search_key.ready_event,
              (long long)search_key.device,
              (long long)cached_blocks.size(),
              (long long)cached_bytes.free,
              (long long)live_blocks.size(),
              (long long)cached_bytes.live);
      }

      // Reset device
      if (device_switched) {
        cudaCheck(error = cudaSetDevice(entrypoint_device));
      }

      return error;
    }


    // Releases every block currently held in the cache.
    //
    // For each cached block the block's device is made current, the memory is released with
    // cudaFreeHost() and the ready event destroyed. The loop stops at the first failing call.
    // The device that was current on entry is restored afterwards.
    //
    // Returns cudaSuccess, or the first error encountered; an error from the release loop is
    // not replaced by the status of the final device restore.
    cudaError_t FreeAllCached() {
      cudaError_t error = cudaSuccess;
      int entrypoint_device = INVALID_DEVICE_ORDINAL;
      int current_device = INVALID_DEVICE_ORDINAL;

      {
        std::lock_guard<std::mutex> cache_lock(mutex);

        while (!cached_blocks.empty()) {
          // Get first block
          CachedBlocks::iterator begin = cached_blocks.begin();

          // Get entry-point device ordinal if necessary
          if (entrypoint_device == INVALID_DEVICE_ORDINAL) {
            if ((error = cudaGetDevice(&entrypoint_device)) != cudaSuccess)
              break;
          }

          // Set current device ordinal if necessary
          if (begin->device != current_device) {
            if ((error = cudaSetDevice(begin->device)) != cudaSuccess)
              break;
            current_device = begin->device;
          }

          // Free host memory
          if ((error = cudaFreeHost(begin->d_ptr)) != cudaSuccess)
            break;
          if ((error = cudaEventDestroy(begin->ready_event)) != cudaSuccess)
            break;

          // Reduce balance and erase entry
          cached_bytes.free -= begin->bytes;

          if (debug)
            printf(
                "\tHost freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
                "bytes) outstanding.\n",
                (long long)begin->bytes,
                (long long)cached_blocks.size(),
                (long long)cached_bytes.free,
                (long long)live_blocks.size(),
                (long long)cached_bytes.live);

          cached_blocks.erase(begin);
        }
      }

      // Attempt to revert back to entry-point device if necessary
      if (entrypoint_device != INVALID_DEVICE_ORDINAL) {
        cudaError_t restore_error = cudaSuccess;
        cudaCheck(restore_error = cudaSetDevice(entrypoint_device));
        // Keep the first failure: a successful restore must not hide an error from the loop
        if (error == cudaSuccess)
          error = restore_error;
      }

      return error;
    }


    // Releases all cached blocks through FreeAllCached(), unless the allocator was
    // constructed with skip_cleanup set.
    ~CachingHostAllocator() {
      if (skip_cleanup) {
        return;
      }
      FreeAllCached();
    }

  };

  // end group UtilMgmt


}  // namespace notcub


#endif