CMS 3D CMS Logo

Functions
gpuPixelRecHits Namespace Reference

Functions

 __attribute__ ((always_inline)) void getHits(pixelCPEforGPU
 

Function Documentation

◆ __attribute__()

gpuPixelRecHits::__attribute__ ( (always_inline)  )
inline

Definition at line 18 of file gpuPixelRecHits.h.

References cms::cudacompat::__syncthreads(), SiPixelDigisCUDASOAView::adc(), cms::cuda::assert(), cms::cudacompat::atomicAdd(), cms::cudacompat::atomicMax(), cms::cudacompat::atomicMin(), cms::cudacompat::blockDim, cms::cudacompat::blockIdx, cms::cuda::bs, haddnano::cl, SiPixelDigisCUDASOAView::clus(), bsc_activity_cfg::clusters, pixelCPEforGPU::errorFromDB(), pixelCPEforGPU::errorFromSize(), first, h, hfClusterShapes_cfi::hits, mps_fire::i, gpuClustering::invalidModuleId, HLT_2022v15_cff::isPhase2, dqmdumpme::k, SiStripPI::max, pixelCPEforGPU::MaxHitsInIter, hlt_dqm_clientPB-live_cfg::me, SiStripPI::min, SiPixelDigisCUDASOAView::moduleInd(), phase1PixelTopology::numberOfLaddersInBarrel, phase2PixelTopology::numberOfLaddersInBarrel, gpuClustering::numElements, phits, pixelCPEforGPU::position(), mathSSE::sqrt(), cms::cudacompat::threadIdx, x, SiPixelDigisCUDASOAView::xx(), and SiPixelDigisCUDASOAView::yy().

23  {
24  // FIXME
25  // the compiler seems NOT to optimize loads from views (even in a simple test case)
26  // The whole gymnastics here of copying or not is a pure heuristic exercise that seems to produce the fastest code with the above signature
27  // not using views (passing a gazillion of array pointers) seems to produce the fastest code (but it is harder to maintain)
28  assert(phits);
29  assert(cpeParams);
30  auto& hits = *phits;
31 
32  auto const& clusters = *pclusters;
33  auto isPhase2 = cpeParams->commonParams().isPhase2;
34  // copy average geometry corrected by beamspot. FIXME (move it somewhere else???)
35  if (0 == blockIdx.x) {
36  auto& agc = hits.averageGeometry();
37  auto const& ag = cpeParams->averageGeometry();
38  auto nLadders =
40 
41  for (int il = threadIdx.x, nl = nLadders; il < nl; il += blockDim.x) {
42  agc.ladderZ[il] = ag.ladderZ[il] - bs->z;
43  agc.ladderX[il] = ag.ladderX[il] - bs->x;
44  agc.ladderY[il] = ag.ladderY[il] - bs->y;
45  agc.ladderR[il] = sqrt(agc.ladderX[il] * agc.ladderX[il] + agc.ladderY[il] * agc.ladderY[il]);
46  agc.ladderMinZ[il] = ag.ladderMinZ[il] - bs->z;
47  agc.ladderMaxZ[il] = ag.ladderMaxZ[il] - bs->z;
48  }
49 
50  if (0 == threadIdx.x) {
51  agc.endCapZ[0] = ag.endCapZ[0] - bs->z;
52  agc.endCapZ[1] = ag.endCapZ[1] - bs->z;
53  // printf("endcapZ %f %f\n",agc.endCapZ[0],agc.endCapZ[1]);
54  }
55  }
56 
57  // to be moved in common namespace...
59  constexpr int32_t MaxHitsInIter = pixelCPEforGPU::MaxHitsInIter;
60 
62 
63  // as usual one block per module
64  __shared__ ClusParams clusParams;
65 
66  auto me = clusters.moduleId(blockIdx.x);
67  int nclus = clusters.clusInModule(me);
68 
69  if (0 == nclus)
70  return;
71 #ifdef GPU_DEBUG
72  if (threadIdx.x == 0) {
73  auto k = clusters.moduleStart(1 + blockIdx.x);
74  while (digis.moduleInd(k) == invalidModuleId)
75  ++k;
76  assert(digis.moduleInd(k) == me);
77  }
78 #endif
79 #ifdef GPU_DEBUG
80  if (me % 100 == 1)
81  if (threadIdx.x == 0)
82  printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, clusters.clusModuleStart(me));
83 #endif
84  for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += MaxHitsInIter) {
85  int nClusInIter = std::min(MaxHitsInIter, endClus - startClus);
86  int lastClus = startClus + nClusInIter;
87  assert(nClusInIter <= nclus);
88  assert(nClusInIter > 0);
89  assert(lastClus <= nclus);
90 
91  assert(nclus > MaxHitsInIter || (0 == startClus && nClusInIter == nclus && lastClus == nclus));
92 
93  // init
94  for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) {
95  clusParams.minRow[ic] = std::numeric_limits<uint32_t>::max();
96  clusParams.maxRow[ic] = 0;
97  clusParams.minCol[ic] = std::numeric_limits<uint32_t>::max();
98  clusParams.maxCol[ic] = 0;
99  clusParams.charge[ic] = 0;
100  clusParams.q_f_X[ic] = 0;
101  clusParams.q_l_X[ic] = 0;
102  clusParams.q_f_Y[ic] = 0;
103  clusParams.q_l_Y[ic] = 0;
104  }
105 
106  __syncthreads();
107 
108  // one thread per "digi"
109  auto first = clusters.moduleStart(1 + blockIdx.x) + threadIdx.x;
110  for (int i = first; i < numElements; i += blockDim.x) {
111  auto id = digis.moduleInd(i);
112  if (id == invalidModuleId)
113  continue; // not valid
114  if (id != me)
115  break; // end of module
116  auto cl = digis.clus(i);
117  if (cl < startClus || cl >= lastClus)
118  continue;
119  cl -= startClus;
120  assert(cl >= 0);
122  auto x = digis.xx(i);
123  auto y = digis.yy(i);
124  atomicMin(&clusParams.minRow[cl], x);
125  atomicMax(&clusParams.maxRow[cl], x);
126  atomicMin(&clusParams.minCol[cl], y);
127  atomicMax(&clusParams.maxCol[cl], y);
128  }
129 
130  __syncthreads();
131 
132  auto pixmx = cpeParams->detParams(me).pixmx;
133  for (int i = first; i < numElements; i += blockDim.x) {
134  auto id = digis.moduleInd(i);
135  if (id == invalidModuleId)
136  continue; // not valid
137  if (id != me)
138  break; // end of module
139  auto cl = digis.clus(i);
140  if (cl < startClus || cl >= lastClus)
141  continue;
142  cl -= startClus;
143  assert(cl >= 0);
145  auto x = digis.xx(i);
146  auto y = digis.yy(i);
147  auto ch = digis.adc(i);
148  atomicAdd(&clusParams.charge[cl], ch);
149  ch = std::min(ch, pixmx);
150  if (clusParams.minRow[cl] == x)
151  atomicAdd(&clusParams.q_f_X[cl], ch);
152  if (clusParams.maxRow[cl] == x)
153  atomicAdd(&clusParams.q_l_X[cl], ch);
154  if (clusParams.minCol[cl] == y)
155  atomicAdd(&clusParams.q_f_Y[cl], ch);
156  if (clusParams.maxCol[cl] == y)
157  atomicAdd(&clusParams.q_l_Y[cl], ch);
158  }
159 
160  __syncthreads();
161 
162  // next one cluster per thread...
163 
164  first = clusters.clusModuleStart(me) + startClus;
165  for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) {
166  auto h = first + ic; // output index in global memory
167 
168  assert(h < hits.nHits());
169  assert(h < clusters.clusModuleStart(me + 1));
170 
171  pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
172  if (!isPhase2)
173  pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
174  else
175  pixelCPEforGPU::errorFromSize(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
176 
177  // store it
178  hits.setChargeAndStatus(h, clusParams.charge[ic], clusParams.status[ic]);
179  hits.detectorIndex(h) = me;
180 
181  float xl, yl;
182  hits.xLocal(h) = xl = clusParams.xpos[ic];
183  hits.yLocal(h) = yl = clusParams.ypos[ic];
184 
185  hits.clusterSizeX(h) = clusParams.xsize[ic];
186  hits.clusterSizeY(h) = clusParams.ysize[ic];
187 
188  hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX;
189  hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY;
190 
191  // keep it local for computations
192  float xg, yg, zg;
193  // to global and compute phi...
194  cpeParams->detParams(me).frame.toGlobal(xl, yl, xg, yg, zg);
195  // here correct for the beamspot...
196  xg -= bs->x;
197  yg -= bs->y;
198  zg -= bs->z;
199 
200  hits.xGlobal(h) = xg;
201  hits.yGlobal(h) = yg;
202  hits.zGlobal(h) = zg;
203 
204  hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg);
205  hits.iphi(h) = unsafe_atan2s<7>(yg, xg);
206  }
207  __syncthreads();
208  } // end loop on batches
209  }
const dim3 threadIdx
Definition: cudaCompat.h:29
T1 atomicMax(T1 *a, T2 b)
Definition: cudaCompat.h:97
ClusParamsT< MaxHitsInIter > ClusParams
const dim3 blockDim
Definition: cudaCompat.h:30
constexpr void position(CommonParams const &__restrict__ comParams, DetParams const &__restrict__ detParams, ClusParams &cp, uint32_t ic)
assert(be >=bs)
constexpr uint16_t numberOfLaddersInBarrel
T sqrt(T t)
Definition: SSEVec.h:19
const dim3 blockIdx
Definition: cudaCompat.h:32
constexpr uint32_t numberOfLaddersInBarrel
constexpr uint16_t invalidModuleId
uint16_t *__restrict__ uint16_t const *__restrict__ uint32_t const *__restrict__ uint32_t *__restrict__ uint32_t const *__restrict__ int32_t *__restrict__ uint32_t numElements
void __syncthreads()
Definition: cudaCompat.h:132
constexpr void errorFromSize(CommonParams const &__restrict__ comParams, DetParams const &__restrict__ detParams, ClusParams &cp, uint32_t ic)
float x
constexpr void errorFromDB(CommonParams const &__restrict__ comParams, DetParams const &__restrict__ detParams, ClusParams &cp, uint32_t ic)
T1 atomicMin(T1 *a, T2 b)
Definition: cudaCompat.h:85
The Signals That Services Can Subscribe To This is based on ActivityRegistry h
Helper function to determine trigger accepts.
Definition: Activities.doc:4
T1 atomicAdd(T1 *a, T2 b)
Definition: cudaCompat.h:61
constexpr int32_t MaxHitsInIter
caConstants::TupleMultiplicity const *__restrict__ HitsOnGPU const *__restrict__ tindex_type *__restrict__ double *__restrict__ phits