
ESProduct.h
#ifndef HeterogeneousCore_CUDACore_ESProduct_h
#define HeterogeneousCore_CUDACore_ESProduct_h

#include <atomic>
#include <cassert>
#include <mutex>
#include <vector>

#include "FWCore/Utilities/interface/thread_safety_macros.h"
#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
#include "HeterogeneousCore/CUDAUtilities/interface/numberOfDevices.h"
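
// ESProduct<T> caches one copy of an EventSetup data product per CUDA
// device, filling it asynchronously on first use and sharing it across
// all CUDA streams of that device (see dataForCurrentDeviceAsync() below).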
namespace cms {
  namespace cuda {
    template <typename T>
    class ESProduct {
    public:
      // Allocate one slot per device and attach a CUDA event to each slot
      ESProduct() : gpuDataPerDevice_(numberOfDevices()) {
        if (not gpuDataPerDevice_.empty()) {
          cms::cuda::ScopedSetDevice scopedDevice;
          for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) {
            scopedDevice.set(i);
            gpuDataPerDevice_[i].m_event = getEventCache().get();
          }
        }
      }

      ~ESProduct() = default;

      // transferAsync should be a function of (T&, cudaStream_t)
      // which enqueues asynchronous transfers (possibly kernels as well)
      // to the CUDA stream
      template <typename F>
      const T& dataForCurrentDeviceAsync(cudaStream_t cudaStream, F transferAsync) const {
        int device = currentDevice();
        auto& data = gpuDataPerDevice_[device];

        // If the GPU data has already been filled, we can return it immediately
        if (not data.m_filled.load()) {
          // It wasn't, so we need to fill it
          std::scoped_lock<std::mutex> lk{data.m_mutex};

          if (data.m_filled.load()) {
            // Another thread marked it filled while we were locking the mutex, so we're free to return it
            return data.m_data;
          }

          if (data.m_fillingStream != nullptr) {
            // Someone else is filling

            // Check first if the recorded event has occurred
            if (eventWorkHasCompleted(data.m_event.get())) {
              // It has, so the data is accessible from all CUDA streams on
              // the device. Set 'filled' for all subsequent calls and
              // return the value
              auto should_be_false = data.m_filled.exchange(true);
              assert(not should_be_false);
              data.m_fillingStream = nullptr;
            } else if (data.m_fillingStream != cudaStream) {
              // Filling is still going on. For any other CUDA stream, add
              // a wait on that stream and return the value. Subsequent
              // work queued on the stream will wait for the event to
              // occur (i.e. for the transfer to finish).
              cudaCheck(cudaStreamWaitEvent(cudaStream, data.m_event.get(), 0),
                        "Failed to make a stream to wait for an event");
            }
            // else: filling is still going on, but in the same CUDA
            // stream (which would be a bit strange, but is fine); we can
            // just return, as all subsequent work should be enqueued to
            // the same CUDA stream (or that stream must be explicitly
            // synchronized by the caller)
          } else {
            // Now we can be sure that the data is not yet on the GPU, and
            // this thread is the first to try to fill it
            transferAsync(data.m_data, cudaStream);
            assert(data.m_fillingStream == nullptr);
            data.m_fillingStream = cudaStream;
            // Record in the cudaStream an event to mark the readiness of the
            // EventSetup data on the GPU, so other streams can check for it
            cudaCheck(cudaEventRecord(data.m_event.get(), cudaStream));
            // Now the filling has been enqueued to the cudaStream, so we
            // can return the GPU data immediately, since all subsequent
            // work must either be enqueued to the cudaStream or the
            // cudaStream must be synchronized by the caller
          }
        }

        return data.m_data;
      }

    private:
      struct Item {
        mutable std::mutex m_mutex;
        CMS_THREAD_GUARD(m_mutex) mutable SharedEventPtr m_event;
        // non-null if some thread is already filling (cudaStream_t is just a pointer)
        CMS_THREAD_GUARD(m_mutex) mutable cudaStream_t m_fillingStream = nullptr;
        mutable std::atomic<bool> m_filled = false;  // easy check whether the data has been filled already or not
        CMS_THREAD_GUARD(m_mutex) mutable T m_data;
      };

      std::vector<Item> gpuDataPerDevice_;
    };
  }  // namespace cuda
}  // namespace cms

#endif  // HeterogeneousCore_CUDACore_ESProduct_h
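
A minimal usage sketch, not part of this file: a conditions wrapper owns host-side data and exposes the per-device copy through an ESProduct. Only dataForCurrentDeviceAsync() and its transfer-callback contract come from the header above; the DeviceData payload, the MyConditionsGPU wrapper, and all member names are hypothetical, invented for illustration.

#include <utility>
#include <vector>

#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

struct DeviceData {           // hypothetical device-side payload
  float* values = nullptr;    // device pointer; never freed here, for brevity
  int size = 0;
};

class MyConditionsGPU {       // hypothetical wrapper, not part of CMSSW
public:
  explicit MyConditionsGPU(std::vector<float> values) : hostValues_(std::move(values)) {}

  // Returns the device copy for the current device, filling it on first
  // use. The lambda runs at most once per device; concurrent callers on
  // other streams get a cudaStreamWaitEvent() on the recorded event.
  const DeviceData& getGPUData(cudaStream_t stream) const {
    return product_.dataForCurrentDeviceAsync(stream, [this](DeviceData& data, cudaStream_t s) {
      data.size = static_cast<int>(hostValues_.size());
      cudaCheck(cudaMalloc(&data.values, data.size * sizeof(float)));
      // Asynchronous copy: guaranteed complete only for work queued on
      // 's', or once the event recorded by ESProduct has occurred
      cudaCheck(cudaMemcpyAsync(
          data.values, hostValues_.data(), data.size * sizeof(float), cudaMemcpyHostToDevice, s));
    });
  }

private:
  std::vector<float> hostValues_;             // host-side source of the transfer
  cms::cuda::ESProduct<DeviceData> product_;  // one copy per CUDA device
};

Work that consumes the returned DeviceData must be queued on the same stream, or synchronize that stream explicitly, exactly as the comments in dataForCurrentDeviceAsync() require.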