db/d55/CAHitNtupletGeneratorKernels_8cc_source.html

 #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h"

 #include <mutex>

 namespace {
   // CUDA atomics are NOT atomic when compiled for the CPU, so statistics updates are protected with a mutex
   // while waiting for a more general solution (including multiple devices) to be proposed and implemented.
   std::mutex lock_stat;
 }  // namespace

 // CPU specialisation: forwards the given counters to kernel_printCounters through a direct function call.
 template <>
 void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) {
   kernel_printCounters(counters);
 }

 /// CPU specialisation of the doublet-building step.
 ///
 /// Allocates the per-call workspace with plain host allocations (the outer-hit-of-cell containers,
 /// the byte buffer backing the cell-neighbours and cell-tracks containers, and the cell array),
 /// initialises it through gpuPixelDoublets::initDoublets and, unless there are no hits, fills the
 /// doublets through gpuPixelDoublets::getDoubletsFromHisto.
 ///
 /// @param hh      hits to build the doublets from
 /// @param stream  not used by this specialisation
 template <>
 void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) {
   auto numHits = hh.nHits();

 #ifdef NTUPLE_DEBUG
   std::cout << "building Doublets out of " << numHits << " Hits. BPIX2 offset is " << hh.offsetBPIX2() << std::endl;
 #endif

   // The workspace is dimensioned heuristically from the number of hits (never fewer than one element).
   // std::make_unique replaces the Traits allocations throughout this function, because this
   // specialisation is known to be compiled for the CPU.
   device_isOuterHitOfCell_ = std::make_unique<GPUCACell::OuterHitOfCellContainer[]>(std::max(1U, numHits));
   assert(device_isOuterHitOfCell_.get());
   isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()};

   // A single byte buffer holds the CellNeighbors block first, immediately followed by the CellTracks block.
   auto const neighborsBytes = caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors);
   auto const tracksBytes = caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks);
   cellStorage_ = std::make_unique<unsigned char[]>(neighborsBytes + tracksBytes);
   unsigned char *const storageBegin = cellStorage_.get();
   device_theCellNeighborsContainer_ = reinterpret_cast<GPUCACell::CellNeighbors *>(storageBegin);
   device_theCellTracksContainer_ = reinterpret_cast<GPUCACell::CellTracks *>(storageBegin + neighborsBytes);

   gpuPixelDoublets::initDoublets(isOuterHitOfCell_,
                                  numHits,
                                  device_theCellNeighbors_.get(),
                                  device_theCellNeighborsContainer_,
                                  device_theCellTracks_.get(),
                                  device_theCellTracksContainer_);

   device_theCells_ = std::make_unique<GPUCACell[]>(params_.maxNumberOfDoublets_);

   // protect against empty events: nothing to pair up
   if (numHits == 0) {
     return;
   }

   // Start from all layer pairs, then narrow the selection:
   //  - quadruplets (more than 3 hits per ntuplet) exclude all "jumping" layer pairs;
   //  - otherwise the forward "jumping" layer pairs are excluded unless explicitly requested.
   auto nActualPairs = gpuPixelDoublets::nPairs;
   if (params_.minHitsPerNtuplet_ > 3) {
     nActualPairs = gpuPixelDoublets::nPairsForQuadruplets;
   } else if (not params_.includeJumpingForwardDoublets_) {
     nActualPairs = gpuPixelDoublets::nPairsForTriplets;
   }
   assert(nActualPairs <= gpuPixelDoublets::nPairs);

   gpuPixelDoublets::getDoubletsFromHisto(device_theCells_.get(),
                                          device_nCells_,
                                          device_theCellNeighbors_.get(),
                                          device_theCellTracks_.get(),
                                          hh.view(),
                                          isOuterHitOfCell_,
                                          nActualPairs,
                                          params_.idealConditions_,
                                          params_.doClusterCut_,
                                          params_.doZ0Cut_,
                                          params_.doPtCut_,
                                          params_.maxNumberOfDoublets_);
 }

 /// CPU specialisation of the ntuplet-finding sequence.
 ///
 /// Runs, as ordinary function calls and in this order: zeroing of the hit-index container, cell
 /// connection, optional early fishbone, ntuplet finding, optional marking of used cells (statistics
 /// only), finalisation of the hit-index container, filling of detector indices and layer counts,
 /// early duplicate removal, the tuple-multiplicity histogram, and optional late fishbone.
 ///
 /// @param hh          hits of the current event
 /// @param tracks_d    track SoA whose hit indices, detector indices and quality are filled
 /// @param cudaStream  forwarded to the cms::cuda helpers
 template <>
 void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
   auto *hitTuples = &tracks_d->hitIndices;
   auto *hitDetIds = &tracks_d->detIndices;
   auto *trackQuality = tracks_d->qualityData();

   assert(hitTuples && trackQuality);

   // start from an empty tuple container
   cms::cuda::launchZero(hitTuples, cudaStream);

   auto numHits = hh.nHits();
   bool const hasSeveralHits = numHits > 1;  // fishbone is only run with at least two hits
   auto *cells = device_theCells_.get();

   // Applying combinatoric cleaning such as fishbone before connecting the cells is too expensive.
   // device_hitToTuple_apc_ is passed only so that it gets reset, ready for the next kernel.
   kernel_connect(device_hitTuple_apc_,
                  device_hitToTuple_apc_,
                  hh.view(),
                  cells,
                  device_nCells_,
                  device_theCellNeighbors_.get(),
                  isOuterHitOfCell_,
                  params_.hardCurvCut_,
                  params_.ptmin_,
                  params_.CAThetaCutBarrel_,
                  params_.CAThetaCutForward_,
                  params_.dcaCutInnerTriplet_,
                  params_.dcaCutOuterTriplet_);

   if (hasSeveralHits && params_.earlyFishbone_) {
     gpuPixelDoublets::fishbone(hh.view(), cells, device_nCells_, isOuterHitOfCell_, numHits, false);
   }

   kernel_find_ntuplets(hh.view(),
                        cells,
                        device_nCells_,
                        device_theCellTracks_.get(),
                        hitTuples,
                        device_hitTuple_apc_,
                        trackQuality,
                        params_.minHitsPerNtuplet_);

   // used-cell marking is only done when statistics are requested
   if (params_.doStats_) {
     kernel_mark_used(cells, device_nCells_);
   }

   cms::cuda::finalizeBulk(device_hitTuple_apc_, hitTuples);

   kernel_fillHitDetIndices(hitTuples, hh.view(), hitDetIds);
   kernel_fillNLayers(tracks_d, device_hitTuple_apc_);

   // remove duplicates (tracks that share a doublet)
   kernel_earlyDuplicateRemover(cells, device_nCells_, tracks_d, trackQuality, params_.dupPassThrough_);

   // tuple multiplicity: count, finalize, then fill
   auto *multiplicity = device_tupleMultiplicity_.get();
   kernel_countMultiplicity(hitTuples, trackQuality, multiplicity);
   cms::cuda::launchFinalize(multiplicity, cudaStream);
   kernel_fillMultiplicity(hitTuples, trackQuality, multiplicity);

   if (hasSeveralHits && params_.lateFishbone_) {
     gpuPixelDoublets::fishbone(hh.view(), cells, device_nCells_, isOuterHitOfCell_, numHits, true);
   }
 }

 /// CPU specialisation of the tuple classification and cleaning sequence.
 ///
 /// Classifies the tracks, optionally applies the fishbone cleaner, removes duplicates sharing a
 /// doublet, optionally builds the hit->track "map" and applies the shared-hit cleaners, and, when
 /// statistics are enabled, runs the overflow checks and the statistics kernels while holding the
 /// file-level statistics mutex.
 ///
 /// @param hh          hits of the current event
 /// @param tracks_d    track SoA to classify and clean
 /// @param cudaStream  forwarded to cms::cuda::launchFinalize
 template <>
 void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
   int32_t numHits = hh.nHits();

   auto const *hitTuples = &tracks_d->hitIndices;
   auto *trackQuality = tracks_d->qualityData();
   auto *cells = device_theCells_.get();
   auto *hitToTuple = device_hitToTuple_.get();

   // classify tracks based on kinematics
   kernel_classifyTracks(hitTuples, tracks_d, params_.cuts_, trackQuality);

   // apply fishbone cleaning to good tracks
   if (params_.lateFishbone_) {
     kernel_fishboneCleaner(cells, device_nCells_, trackQuality);
   }

   // remove duplicates (tracks that share a doublet)
   kernel_fastDuplicateRemover(cells, device_nCells_, tracks_d, params_.dupPassThrough_);

   // fill the hit->track "map"; it is needed by both the shared-hit cut and the statistics
   if (params_.doSharedHitCut_ || params_.doStats_) {
     kernel_countHitInTracks(hitTuples, trackQuality, hitToTuple);
     cms::cuda::launchFinalize(hitToTupleView_, cudaStream);
     kernel_fillHitInTracks(hitTuples, trackQuality, hitToTuple);
   }

   // remove duplicates (tracks that share at least one hit)
   if (params_.doSharedHitCut_) {
     kernel_rejectDuplicate(
         tracks_d, trackQuality, params_.minHitsForSharingCut_, params_.dupPassThrough_, hitToTuple);

     kernel_sharedHitCleaner(hh.view(),
                             tracks_d,
                             trackQuality,
                             params_.minHitsForSharingCut_,
                             params_.dupPassThrough_,
                             hitToTuple);

     // exactly one of the two triplet cleaners is run
     if (not params_.useSimpleTripletCleaner_) {
       kernel_tripletCleaner(
           tracks_d, trackQuality, params_.minHitsForSharingCut_, params_.dupPassThrough_, hitToTuple);
     } else {
       kernel_simpleTripletCleaner(
           tracks_d, trackQuality, params_.minHitsForSharingCut_, params_.dupPassThrough_, hitToTuple);
     }
   }

   // overflow checks update the shared counters, hence the statistics mutex
   if (params_.doStats_) {
     std::lock_guard<std::mutex> statGuard(lock_stat);
     kernel_checkOverflows(hitTuples,
                           device_tupleMultiplicity_.get(),
                           hitToTuple,
                           device_hitTuple_apc_,
                           cells,
                           device_nCells_,
                           device_theCellNeighbors_.get(),
                           device_theCellTracks_.get(),
                           isOuterHitOfCell_,
                           numHits,
                           params_.maxNumberOfDoublets_,
                           counters_);
   }

   // statistics counters, taken under a separate acquisition of the same mutex
   if (params_.doStats_) {
     std::lock_guard<std::mutex> statGuard(lock_stat);
     kernel_doStatsForHitInTracks(hitToTuple, counters_);
     kernel_doStatsForTracks(hitTuples, trackQuality, counters_);
   }

 #ifdef DUMP_GPU_TK_TUPLES
   // debugging dump: the counter increment and the print are serialised by a function-local mutex
   static std::atomic<int> eventCounter(0);
   static std::mutex dumpMutex;
   {
     std::lock_guard<std::mutex> dumpGuard(dumpMutex);
     ++eventCounter;
     kernel_print_found_ntuplets(hh.view(), hitTuples, tracks_d, trackQuality, hitToTuple, 0, 1000000, eventCounter);
   }
 #endif
 }
CommonMethods.lock
def lock()
Definition: CommonMethods.py:81

CAHitNtupletGeneratorKernels::launchKernels
void launchKernels(HitsOnCPU const &hh, TkSoA *tuples_d, cudaStream_t cudaStream)
Definition: CAHitNtupletGeneratorKernels.cc:81

cAHitNtupletGenerator::Params::includeJumpingForwardDoublets_
const bool includeJumpingForwardDoublets_
Definition: CAHitNtupletGeneratorKernels.h:114

caConstants::maxNumOfActiveDoublets
constexpr uint32_t maxNumOfActiveDoublets
Definition: CAConstants.h:41

CAHitNtupletGeneratorKernels::device_hitToTuple_
unique_ptr< HitToTuple > device_hitToTuple_
Definition: CAHitNtupletGeneratorKernels.h:211

cAHitNtupletGenerator::Params::doSharedHitCut_
const bool doSharedHitCut_
Definition: CAHitNtupletGeneratorKernels.h:122

CAHitNtupletGeneratorKernels::buildDoublets
void buildDoublets(HitsOnCPU const &hh, cudaStream_t stream)
Definition: CAHitNtupletGeneratorKernels.cc:17

CAHitNtupletGeneratorKernels::hitToTupleView_
HitToTuple::View hitToTupleView_
Definition: CAHitNtupletGeneratorKernels.h:213

cAHitNtupletGenerator::Params::hardCurvCut_
const float hardCurvCut_
Definition: CAHitNtupletGeneratorKernels.h:128

CAHitNtupletGeneratorKernels::classifyTuples
void classifyTuples(HitsOnCPU const &hh, TkSoA *tuples_d, cudaStream_t cudaStream)
Definition: CAHitNtupletGeneratorKernels.cc:147

mutex
static std::mutex mutex
Definition: Proxy.cc:8

hh
auto const  & hh
Definition: CAHitNtupletGeneratorKernelsImpl.h:550

cAHitNtupletGenerator::Params::doClusterCut_
const bool doClusterCut_
Definition: CAHitNtupletGeneratorKernels.h:119

counters
caConstants::TupleMultiplicity const CAHitNtupletGeneratorKernelsGPU::HitToTuple const cms::cuda::AtomicPairCounter GPUCACell const  *__restrict__ uint32_t const  *__restrict__ gpuPixelDoublets::CellNeighborsVector const gpuPixelDoublets::CellTracksVector const GPUCACell::OuterHitOfCell const int32_t uint32_t CAHitNtupletGeneratorKernelsGPU::Counters * counters
Definition: CAHitNtupletGeneratorKernelsImpl.h:53

cAHitNtupletGenerator::Params::doStats_
const bool doStats_
Definition: CAHitNtupletGeneratorKernels.h:118

cms::cuda::stream
uint32_t T const  *__restrict__ uint32_t const  *__restrict__ int32_t int Histo::index_type cudaStream_t stream
Definition: HistoContainer.h:51

TrackingRecHit2DHeterogeneous
Definition: TrackingRecHit2DHeterogeneous.h:9

gpuPixelDoublets::nPairs
constexpr int nPairs
Definition: gpuPixelDoublets.h:12

cms::cuda::assert
assert(be >=bs)

CAHitNtupletGeneratorKernels::counters_
Counters * counters_
Definition: CAHitNtupletGeneratorKernels.h:197

cAHitNtupletGenerator::Counters
Definition: CAHitNtupletGeneratorKernels.h:14

mitigatedMETSequence_cff.U
U
Definition: mitigatedMETSequence_cff.py:36

CAHitNtupletGeneratorKernels::printCounters
static void printCounters(Counters const *counters)
Definition: CAHitNtupletGeneratorKernels.cc:12

cAHitNtupletGenerator::Params::doZ0Cut_
const bool doZ0Cut_
Definition: CAHitNtupletGeneratorKernels.h:120

nhits
Definition: HIMultiTrackSelector.h:42

caConstants::OuterHitOfCell
Definition: CAConstants.h:82

cAHitNtupletGenerator::Params::minHitsPerNtuplet_
const uint32_t minHitsPerNtuplet_
Definition: CAHitNtupletGeneratorKernels.h:109

cAHitNtupletGenerator::Params::useSimpleTripletCleaner_
const bool useSimpleTripletCleaner_
Definition: CAHitNtupletGeneratorKernels.h:124

cAHitNtupletGenerator::Params::dcaCutInnerTriplet_
const float dcaCutInnerTriplet_
Definition: CAHitNtupletGeneratorKernels.h:129

TrackSoAHeterogeneousT::qualityData
constexpr Quality const  * qualityData() const
Definition: TrackSoAHeterogeneousT.h:40

CAHitNtupletGeneratorKernels::device_theCells_
unique_ptr< GPUCACell[]> device_theCells_
Definition: CAHitNtupletGeneratorKernels.h:206

cAHitNtupletGenerator::Params::earlyFishbone_
const bool earlyFishbone_
Definition: CAHitNtupletGeneratorKernels.h:115

cAHitNtupletGenerator::Params::minHitsForSharingCut_
const uint16_t minHitsForSharingCut_
Definition: CAHitNtupletGeneratorKernels.h:111

cAHitNtupletGenerator::Params::idealConditions_
const bool idealConditions_
Definition: CAHitNtupletGeneratorKernels.h:117

CAHitNtupletGeneratorKernels::device_theCellNeighbors_
unique_ptr< caConstants::CellNeighborsVector > device_theCellNeighbors_
Definition: CAHitNtupletGeneratorKernels.h:201

SiStripPI::max
Definition: SiStripPayloadInspectorHelper.h:178

CAHitNtupletGeneratorKernels::cellStorage_
unique_ptr< unsigned char[]> cellStorage_
Definition: CAHitNtupletGeneratorKernels.h:200

GPUCACell::CellNeighbors
caConstants::CellNeighbors CellNeighbors
Definition: GPUCACell.h:28

cAHitNtupletGenerator::Params::CAThetaCutForward_
const float CAThetaCutForward_
Definition: CAHitNtupletGeneratorKernels.h:127

gpuPixelDoublets::nPairsForTriplets
constexpr int nPairsForTriplets
Definition: gpuPixelDoublets.h:11

cAHitNtupletGenerator::Params::dcaCutOuterTriplet_
const float dcaCutOuterTriplet_
Definition: CAHitNtupletGeneratorKernels.h:130

CAHitNtupletGeneratorKernels::device_theCellNeighborsContainer_
caConstants::CellNeighbors * device_theCellNeighborsContainer_
Definition: CAHitNtupletGeneratorKernels.h:202

CAHitNtupletGeneratorKernels::device_nCells_
uint32_t * device_nCells_
Definition: CAHitNtupletGeneratorKernels.h:209

cms::cuda::VecArray
Definition: VecArray.h:14

gpuPixelDoublets::nActualPairs
uint32_t CellNeighborsVector CellTracksVector TrackingRecHit2DSOAView const  *__restrict__ GPUCACell::OuterHitOfCell int nActualPairs
Definition: gpuPixelDoublets.h:99

cAHitNtupletGenerator::Params::doPtCut_
const bool doPtCut_
Definition: CAHitNtupletGeneratorKernels.h:121

cAHitNtupletGenerator::Params::ptmin_
const float ptmin_
Definition: CAHitNtupletGeneratorKernels.h:125

CAHitNtupletGeneratorKernels::device_hitTuple_apc_
cms::cuda::AtomicPairCounter * device_hitTuple_apc_
Definition: CAHitNtupletGeneratorKernels.h:217

iev
HitContainer const  *__restrict__ TkSoA const  *__restrict__ Quality const  *__restrict__ CAHitNtupletGeneratorKernelsGPU::HitToTuple const  *__restrict__ int32_t int32_t int iev
Definition: CAHitNtupletGeneratorKernelsImpl.h:866

CAHitNtupletGeneratorKernels::device_isOuterHitOfCell_
unique_ptr< GPUCACell::OuterHitOfCellContainer[]> device_isOuterHitOfCell_
Definition: CAHitNtupletGeneratorKernels.h:207

CAHitNtupletGeneratorKernels::isOuterHitOfCell_
GPUCACell::OuterHitOfCell isOuterHitOfCell_
Definition: CAHitNtupletGeneratorKernels.h:208

CAHitNtupletGeneratorKernels::device_theCellTracksContainer_
caConstants::CellTracks * device_theCellTracksContainer_
Definition: CAHitNtupletGeneratorKernels.h:204

TrackSoAHeterogeneousT::detIndices
HitContainer detIndices
Definition: TrackSoAHeterogeneousT.h:85

TrackSoAHeterogeneousT
Definition: TrackSoAHeterogeneousT.h:24

cAHitNtupletGenerator::Params::CAThetaCutBarrel_
const float CAThetaCutBarrel_
Definition: CAHitNtupletGeneratorKernels.h:126

cAHitNtupletGenerator::Params::lateFishbone_
const bool lateFishbone_
Definition: CAHitNtupletGeneratorKernels.h:116

gpuPixelDoublets::nPairsForQuadruplets
constexpr int nPairsForQuadruplets
Definition: gpuPixelDoublets.h:10

cAHitNtupletGenerator::Params::dupPassThrough_
const bool dupPassThrough_
Definition: CAHitNtupletGeneratorKernels.h:123

cAHitNtupletGenerator::Params::maxNumberOfDoublets_
const uint32_t maxNumberOfDoublets_
Definition: CAHitNtupletGeneratorKernels.h:110

CAHitNtupletGeneratorKernelsImpl.h

gather_cfg.cout
cout
Definition: gather_cfg.py:144

cAHitNtupletGenerator::Params::cuts_
QualityCuts cuts_
Definition: CAHitNtupletGeneratorKernels.h:133

TrackSoAHeterogeneousT::hitIndices
HitContainer hitIndices
Definition: TrackSoAHeterogeneousT.h:84

CAHitNtupletGeneratorKernels::params_
Params const  & params_
Definition: CAHitNtupletGeneratorKernels.h:223

CAHitNtupletGeneratorKernels::device_theCellTracks_
unique_ptr< caConstants::CellTracksVector > device_theCellTracks_
Definition: CAHitNtupletGeneratorKernels.h:203

CAHitNtupletGeneratorKernels::device_tupleMultiplicity_
unique_ptr< TupleMultiplicity > device_tupleMultiplicity_
Definition: CAHitNtupletGeneratorKernels.h:219

CAHitNtupletGeneratorKernels::device_hitToTuple_apc_
cms::cuda::AtomicPairCounter * device_hitToTuple_apc_
Definition: CAHitNtupletGeneratorKernels.h:215