CMS 3D CMS Logo

gpuPixelDoubletsAlgos.h
Go to the documentation of this file.
1 #ifndef RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoubletsAlgos_h
2 #define RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoubletsAlgos_h
3 
4 #include <algorithm>
5 #include <cmath>
6 #include <cstdint>
7 #include <cstdio>
8 #include <limits>
9 
14 
15 #include "CAConstants.h"
16 #include "GPUCACell.h"
17 
18 namespace gpuPixelDoublets {
19 
24 
25  __device__ __forceinline__ void doubletsFromHisto(uint8_t const* __restrict__ layerPairs,
26  uint32_t nPairs,
28  uint32_t* nCells,
31  TrackingRecHit2DSOAView const& __restrict__ hh,
33  int16_t const* __restrict__ phicuts,
34  float const* __restrict__ minz,
35  float const* __restrict__ maxz,
36  float const* __restrict__ maxr,
37  bool ideal_cond,
38  bool doClusterCut,
39  bool doZ0Cut,
40  bool doPtCut,
41  uint32_t maxNumOfDoublets) {
42  // ysize cuts (z in the barrel) times 8
43  // these are used if doClusterCut is true
44  constexpr int minYsizeB1 = 36;
45  constexpr int minYsizeB2 = 28;
46  constexpr int maxDYsize12 = 28;
47  constexpr int maxDYsize = 20;
48  constexpr int maxDYPred = 20;
49  constexpr float dzdrFact = 8 * 0.0285 / 0.015; // from dz/dr to "DY"
50 
52 
54 
55  auto const& __restrict__ phiBinner = hh.phiBinner();
56  uint32_t const* __restrict__ offsets = hh.hitsLayerStart();
57  assert(offsets);
58 
59  auto layerSize = [=](uint8_t li) { return offsets[li + 1] - offsets[li]; };
60 
61  // nPairsMax to be optimized later (originally was 64).
62  // If it should be much bigger, consider using a block-wide parallel prefix scan,
63  // e.g. see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html
66  __shared__ uint32_t innerLayerCumulativeSize[nPairsMax];
67  __shared__ uint32_t ntot;
68  if (threadIdx.y == 0 && threadIdx.x == 0) {
70  for (uint32_t i = 1; i < nPairs; ++i) {
72  }
74  }
75  __syncthreads();
76 
77  // x runs faster
78  auto idy = blockIdx.y * blockDim.y + threadIdx.y;
79  auto first = threadIdx.x;
80  auto stride = blockDim.x;
81 
82  uint32_t pairLayerId = 0; // cannot go backward
83  for (auto j = idy; j < ntot; j += blockDim.y * gridDim.y) {
85  ;
86  --pairLayerId; // move to lower_bound ??
87 
91 
92  uint8_t inner = layerPairs[2 * pairLayerId];
93  uint8_t outer = layerPairs[2 * pairLayerId + 1];
94  assert(outer > inner);
95 
96  auto hoff = PhiBinner::histOff(outer);
97 
98  auto i = (0 == pairLayerId) ? j : j - innerLayerCumulativeSize[pairLayerId - 1];
99  i += offsets[inner];
100 
101  // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j);
102 
103  assert(i >= offsets[inner]);
104  assert(i < offsets[inner + 1]);
105 
106  // found hit corresponding to our cuda thread, now do the job
107  auto mi = hh.detectorIndex(i);
109  continue; // invalid
110 
111  /* maybe clever, not effective when zoCut is on
112  auto bpos = (mi%8)/4; // if barrel is 1 for z>0
113  auto fpos = (outer>3) & (outer<7);
114  if ( ((inner<3) & (outer>3)) && bpos!=fpos) continue;
115  */
116 
117  auto mez = hh.zGlobal(i);
118 
119  if (mez < minz[pairLayerId] || mez > maxz[pairLayerId])
120  continue;
121 
122  int16_t mes = -1; // make compiler happy
123  if (doClusterCut) {
124  // if ideal treat inner ladder as outer
125  if (inner == 0)
126  assert(mi < 96);
127  isOuterLadder = ideal_cond ? true : 0 == (mi / 8) % 2; // only for B1/B2/B3 B4 is opposite, FPIX:noclue...
128 
129  // in any case we always test mes>0 ...
130  mes = inner > 0 || isOuterLadder ? hh.clusterSizeY(i) : -1;
131 
132  if (inner == 0 && outer > 3) // B1 and F1
133  if (mes > 0 && mes < minYsizeB1)
134  continue; // only long cluster (5*8)
135  if (inner == 1 && outer > 3) // B2 and F1
136  if (mes > 0 && mes < minYsizeB2)
137  continue;
138  }
139  auto mep = hh.iphi(i);
140  auto mer = hh.rGlobal(i);
141 
142  // all cuts: true if fails
143  constexpr float z0cut = 12.f; // cm
144  constexpr float hardPtCut = 0.5f; // GeV
145  // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field)
146  constexpr float minRadius = hardPtCut * 87.78f;
147  constexpr float minRadius2T4 = 4.f * minRadius * minRadius;
148  auto ptcut = [&](int j, int16_t idphi) {
149  auto r2t4 = minRadius2T4;
150  auto ri = mer;
151  auto ro = hh.rGlobal(j);
152  auto dphi = short2phi(idphi);
153  return dphi * dphi * (r2t4 - ri * ro) > (ro - ri) * (ro - ri);
154  };
155  auto z0cutoff = [&](int j) {
156  auto zo = hh.zGlobal(j);
157  auto ro = hh.rGlobal(j);
158  auto dr = ro - mer;
159  return dr > maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr;
160  };
161 
162  auto zsizeCut = [&](int j) {
163  auto onlyBarrel = outer < 4;
164  auto so = hh.clusterSizeY(j);
165  auto dy = inner == 0 ? maxDYsize12 : maxDYsize;
166  // in the barrel cut on difference in size
167  // in the endcap on the prediction on the first layer (actually in the barrel only: happen to be safe for endcap as well)
168  // FIXME move pred cut to z0cutoff to optmize loading of and computaiton ...
169  auto zo = hh.zGlobal(j);
170  auto ro = hh.rGlobal(j);
171  return onlyBarrel ? mes > 0 && so > 0 && std::abs(so - mes) > dy
172  : (inner < 4) && mes > 0 &&
173  std::abs(mes - int(std::abs((mez - zo) / (mer - ro)) * dzdrFact + 0.5f)) > maxDYPred;
174  };
175 
176  auto iphicut = phicuts[pairLayerId];
177 
178  auto kl = PhiBinner::bin(int16_t(mep - iphicut));
179  auto kh = PhiBinner::bin(int16_t(mep + iphicut));
180  auto incr = [](auto& k) { return k = (k + 1) % PhiBinner::nbins(); };
181 
182 #ifdef GPU_DEBUG
183  int tot = 0;
184  int nmin = 0;
185  int tooMany = 0;
186 #endif
187 
188  auto khh = kh;
189  incr(khh);
190  for (auto kk = kl; kk != khh; incr(kk)) {
191 #ifdef GPU_DEBUG
192  if (kk != kl && kk != kh)
193  nmin += phiBinner.size(kk + hoff);
194 #endif
195  auto const* __restrict__ p = phiBinner.begin(kk + hoff);
196  auto const* __restrict__ e = phiBinner.end(kk + hoff);
197  p += first;
198  for (; p < e; p += stride) {
199  auto oi = __ldg(p);
200  assert(oi >= offsets[outer]);
201  assert(oi < offsets[outer + 1]);
202  auto mo = hh.detectorIndex(oi);
204  continue; // invalid
205 
206  if (doZ0Cut && z0cutoff(oi))
207  continue;
208 
209  auto mop = hh.iphi(oi);
210  uint16_t idphi = std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop)));
211  if (idphi > iphicut)
212  continue;
213 
214  if (doClusterCut && zsizeCut(oi))
215  continue;
216  if (doPtCut && ptcut(oi, idphi))
217  continue;
218 
219  auto ind = atomicAdd(nCells, 1);
220  if (ind >= maxNumOfDoublets) {
221  atomicSub(nCells, 1);
222  break;
223  } // move to SimpleVector??
224  // int layerPairId, int doubletId, int innerHitId, int outerHitId)
225  cells[ind].init(*cellNeighbors, *cellTracks, hh, pairLayerId, i, oi);
226  isOuterHitOfCell[oi].push_back(ind);
227 #ifdef GPU_DEBUG
228  if (isOuterHitOfCell[oi].full())
229  ++tooMany;
230  ++tot;
231 #endif
232  }
233  }
234 #ifdef GPU_DEBUG
235  if (tooMany > 0)
236  printf("OuterHitOfCell full for %d in layer %d/%d, %d,%d %d\n", i, inner, outer, nmin, tot, tooMany);
237 #endif
238  } // loop in block...
239  }
240 
241 } // namespace gpuPixelDoublets
242 
243 #endif // RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoubletsAlgos_h
const dim3 threadIdx
Definition: cudaCompat.h:29
uint32_t CellNeighborsVector CellTracksVector * cellTracks
cms::cuda::VecArray< tindex_type, maxCellTracks > CellTracks
Definition: CAConstants.h:72
static constexpr auto histOff(uint32_t nh)
__constant__ const uint8_t layerPairs[2 *nPairs]
cms::cuda::SimpleVector< CellNeighbors > CellNeighborsVector
Definition: CAConstants.h:74
#define __forceinline__
Definition: cudaCompat.h:22
__constant__ float const maxz[nPairs]
__constant__ float const minz[nPairs]
const dim3 gridDim
Definition: cudaCompat.h:33
__device__ uint32_t GPUCACell * cells
uint32_t CellNeighborsVector CellTracksVector TrackingRecHit2DSOAView const *__restrict__ GPUCACell::OuterHitOfCell int bool bool bool doZ0Cut
caConstants::CellNeighborsVector CellNeighborsVector
T1 atomicSub(T1 *a, T2 b)
Definition: cudaCompat.h:73
constexpr uint32_t maxNumberOfLayerPairs
Definition: CAConstants.h:44
const dim3 blockDim
Definition: cudaCompat.h:30
uint32_t const *__restrict__ offsets
__constant__ float const maxr[nPairs]
uint32_t CellNeighborsVector CellTracksVector TrackingRecHit2DSOAView const *__restrict__ GPUCACell::OuterHitOfCell int bool bool bool bool doPtCut
caConstants::CellNeighbors CellNeighbors
constexpr int nPairs
caConstants::CellTracks CellTracks
caConstants::CellTracksVector CellTracksVector
uint32_t CellNeighborsVector CellTracksVector TrackingRecHit2DSOAView const *__restrict__ GPUCACell::OuterHitOfCell isOuterHitOfCell
__device__ int push_back(const T &element)
Definition: SimpleVector.h:60
cms::cuda::SimpleVector< CellTracks > CellTracksVector
Definition: CAConstants.h:75
auto const &__restrict__ phiBinner
cms::cuda::HistoContainer< int16_t, 256, -1, 8 *sizeof(int16_t), hindex_type, pixelTopology::maxLayers > PhiBinner
Definition: GenABIO.cc:168
__shared__ uint32_t innerLayerCumulativeSize[nPairsMax]
uint32_t CellNeighborsVector CellTracksVector TrackingRecHit2DSOAView const *__restrict__ GPUCACell::OuterHitOfCell int bool bool doClusterCut
Abs< T >::type abs(const T &t)
Definition: Abs.h:22
double f[11][100]
constexpr uint16_t maxNumModules
constexpr int maxDYsize12
Quality *__restrict__ uint16_t nmin
uint32_t CellNeighborsVector CellTracksVector TrackingRecHit2DSOAView const *__restrict__ GPUCACell::OuterHitOfCell int bool ideal_cond
const dim3 blockIdx
Definition: cudaCompat.h:32
__constant__ const int16_t phicuts[nPairs]
constexpr float dzdrFact
static constexpr uint32_t nbins()
T __ldg(T const *x)
Definition: cudaCompat.h:137
static constexpr UT bin(T t)
constexpr float short2phi(short x)
Definition: approx_atan2.h:285
uint32_t CellNeighborsVector * cellNeighbors
uint32_t CellNeighborsVector CellTracksVector TrackingRecHit2DSOAView const *__restrict__ GPUCACell::OuterHitOfCell int bool bool bool bool uint32_t maxNumOfDoublets
constexpr int minYsizeB2
#define __device__
__shared__ uint32_t ntot
__device__ uint32_t GPUCACell uint32_t CellNeighborsVector CellTracksVector TrackingRecHit2DSOAView const &__restrict__ hh
T1 atomicAdd(T1 *a, T2 b)
Definition: cudaCompat.h:61
doubletsFromHisto(layerPairs, nActualPairs, cells, nCells, cellNeighbors, cellTracks, hh, isOuterHitOfCell, phicuts, minz, maxz, maxr, ideal_cond, doClusterCut, doZ0Cut, doPtCut, maxNumOfDoublets)
cms::cuda::VecArray< uint32_t, maxCellNeighbors > CellNeighbors
Definition: CAConstants.h:71