
gpuPixelRecHits Namespace Reference

Functions

template<typename TrackerTraits >
 __attribute__ ((always_inline)) void getHits(pixelCPEforGPU
 

Function Documentation

◆ __attribute__()

template<typename TrackerTraits >
gpuPixelRecHits::__attribute__ ( (always_inline) )  [inline]

Definition at line 20 of file gpuPixelRecHits.h.

References cms::cudacompat::__syncthreads(), cms::cuda::assert(), cms::cudacompat::atomicAdd(), cms::cudacompat::atomicMax(), cms::cudacompat::atomicMin(), pixelCPEforGPU::ParamsOnGPUT< TrackerTopology >::averageGeometry(), cms::cudacompat::blockDim, cms::cudacompat::blockIdx, cms::cuda::bs, haddnano::cl, bsc_activity_cfg::clusters, pixelCPEforGPU::ParamsOnGPUT< TrackerTopology >::commonParams(), pixelCPEforGPU::ParamsOnGPUT< TrackerTopology >::detParams(), dqmdumpme::first, h, hfClusterShapes_cfi::hits, mps_fire::i, gpuClustering::invalidModuleId, dqmdumpme::k, pixelTopology::AverageGeometryT< TrackerTraits >::ladderZ, SiStripPI::max, pixelCPEforGPU::MaxHitsInIter, hlt_dqm_clientPB-live_cfg::me, SiStripPI::min, gpuClustering::moduleId, gpuClustering::numElements, mathSSE::sqrt(), cms::cudacompat::threadIdx, and x.
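getHits() is an inline device helper meant to be called from a thin __global__ kernel with one CUDA block per pixel module: the listing below picks its module from clusters[blockIdx.x] and only touches that module's digis and clusters. A minimal sketch of such a wrapper and its launch follows; the wrapper name, the generic parameter types and the argument order are assumptions based on the names used in the listing, not the real signature (which is truncated in the summary above).

// Illustrative sketch only: the wrapper name, the generic parameter types and the
// argument order are assumptions taken from the names used in the listing below.
#include "gpuPixelRecHits.h"  // the header documented on this page

template <typename TrackerTraits, typename CPEParams, typename BeamSpot,
          typename DigisView, typename ClustersView, typename HitsView>
__global__ void getHitsKernel(CPEParams const* cpeParams,
                              BeamSpot const* bs,
                              DigisView digis,
                              int numElements,
                              ClustersView clusters,
                              HitsView hits) {
  // one block per module: getHits() picks its module from clusters[blockIdx.x]
  gpuPixelRecHits::getHits<TrackerTraits>(cpeParams, bs, digis, numElements, clusters, hits);
}

// launched with one block per active pixel module, e.g.
//   getHitsKernel<TrackerTraits><<<nModules, threadsPerBlock, 0, stream>>>(...);

All the loops inside getHits() stride by blockDim.x, so the choice of threadsPerBlock only changes how the per-module work is divided among threads.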

  {
    // FIXME
    // the compiler seems NOT to optimize loads from views (even in a simple test case)
    // The whole gymnastics here of copying or not is a pure heuristic exercise that seems to produce the fastest code with the above signature
    // not using views (passing a gazillion of array pointers) seems to produce the fastest code (but it is harder to maintain)

    assert(cpeParams);

    // copy average geometry corrected by beamspot. FIXME (move it somewhere else???)
    if (0 == blockIdx.x) {
      auto& agc = hits.averageGeometry();
      auto const& ag = cpeParams->averageGeometry();
      auto nLadders = TrackerTraits::numberOfLaddersInBarrel;

      for (int il = threadIdx.x, nl = nLadders; il < nl; il += blockDim.x) {
        agc.ladderZ[il] = ag.ladderZ[il] - bs->z;
        agc.ladderX[il] = ag.ladderX[il] - bs->x;
        agc.ladderY[il] = ag.ladderY[il] - bs->y;
        agc.ladderR[il] = sqrt(agc.ladderX[il] * agc.ladderX[il] + agc.ladderY[il] * agc.ladderY[il]);
        agc.ladderMinZ[il] = ag.ladderMinZ[il] - bs->z;
        agc.ladderMaxZ[il] = ag.ladderMaxZ[il] - bs->z;
      }

      if (0 == threadIdx.x) {
        agc.endCapZ[0] = ag.endCapZ[0] - bs->z;
        agc.endCapZ[1] = ag.endCapZ[1] - bs->z;
      }
    }
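
    // note: the same beamspot subtraction is applied to the hit global coordinates
    // further down, so the corrected average geometry and the stored hits share a
    // consistent, beamspot-relative frame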

    // to be moved to a common namespace...
    using gpuClustering::invalidModuleId;
    constexpr int32_t MaxHitsInIter = pixelCPEforGPU::MaxHitsInIter;

    using ClusParams = pixelCPEforGPU::ClusParams;

    // as usual one block per module
    __shared__ ClusParams clusParams;

    auto me = clusters[blockIdx.x].moduleId();
    int nclus = clusters[me].clusInModule();

    if (0 == nclus)
      return;

#ifdef GPU_DEBUG
    if (threadIdx.x == 0) {
      auto k = clusters[1 + blockIdx.x].moduleStart();
      while (digis[k].moduleId() == invalidModuleId)
        ++k;
      assert(digis[k].moduleId() == me);
    }

    if (me % 100 == 1)
      if (threadIdx.x == 0)
        printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, clusters[me].clusModuleStart());
#endif

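    // ClusParams lives in (statically sized) shared memory, so the module's clusters
    // are processed in batches of at most MaxHitsInIter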
    for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += MaxHitsInIter) {
      int nClusInIter = std::min(MaxHitsInIter, endClus - startClus);
      int lastClus = startClus + nClusInIter;
      assert(nClusInIter <= nclus);
      assert(nClusInIter > 0);
      assert(lastClus <= nclus);

      assert(nclus > MaxHitsInIter || (0 == startClus && nClusInIter == nclus && lastClus == nclus));

      // init
      for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) {
        clusParams.minRow[ic] = std::numeric_limits<uint32_t>::max();
        clusParams.maxRow[ic] = 0;
        clusParams.minCol[ic] = std::numeric_limits<uint32_t>::max();
        clusParams.maxCol[ic] = 0;
        clusParams.charge[ic] = 0;
        clusParams.q_f_X[ic] = 0;
        clusParams.q_l_X[ic] = 0;
        clusParams.q_f_Y[ic] = 0;
        clusParams.q_l_Y[ic] = 0;
      }

      __syncthreads();
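
      // first pass over this module's digis: find each cluster's bounding box
      // (min/max row and column) with atomics on the shared ClusParams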
104 
105  // one thread per "digi"
106  auto first = clusters[1 + blockIdx.x].moduleStart() + threadIdx.x;
107  for (int i = first; i < numElements; i += blockDim.x) {
108  auto id = digis[i].moduleId();
109  if (id == invalidModuleId)
110  continue; // not valid
111  if (id != me)
112  break; // end of module
113  auto cl = digis[i].clus();
114  if (cl < startClus || cl >= lastClus)
115  continue;
116  cl -= startClus;
117  assert(cl >= 0);
119  auto x = digis[i].xx();
120  auto y = digis[i].yy();
121  atomicMin(&clusParams.minRow[cl], x);
122  atomicMax(&clusParams.maxRow[cl], x);
123  atomicMin(&clusParams.minCol[cl], y);
124  atomicMax(&clusParams.maxCol[cl], y);
125  }
126 
127  __syncthreads();
128 
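      // second pass: accumulate the total cluster charge and the (pixmx-clamped) charge
      // collected in the first/last row and column of each cluster; these sums feed
      // pixelCPEforGPU::position() below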
      auto pixmx = cpeParams->detParams(me).pixmx;
      for (int i = first; i < numElements; i += blockDim.x) {
        auto id = digis[i].moduleId();
        if (id == invalidModuleId)
          continue;  // not valid
        if (id != me)
          break;  // end of module
        auto cl = digis[i].clus();
        if (cl < startClus || cl >= lastClus)
          continue;
        cl -= startClus;
        assert(cl >= 0);
        assert(cl < MaxHitsInIter);
        auto x = digis[i].xx();
        auto y = digis[i].yy();
        auto ch = digis[i].adc();
        atomicAdd(&clusParams.charge[cl], ch);
        ch = std::min(ch, pixmx);
        if (clusParams.minRow[cl] == x)
          atomicAdd(&clusParams.q_f_X[cl], ch);
        if (clusParams.maxRow[cl] == x)
          atomicAdd(&clusParams.q_l_X[cl], ch);
        if (clusParams.minCol[cl] == y)
          atomicAdd(&clusParams.q_f_Y[cl], ch);
        if (clusParams.maxCol[cl] == y)
          atomicAdd(&clusParams.q_l_Y[cl], ch);
      }

      __syncthreads();

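      // last step: run the CPE for each cluster (position plus errors from conditions)
      // and write the finished hits into the output SoA in beamspot-corrected global
      // coordinates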
      // next one cluster per thread...

      first = clusters[me].clusModuleStart() + startClus;
      for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) {
        auto h = first + ic;  // output index in global memory

        assert(h < hits.nHits());
        assert(h < clusters[me + 1].clusModuleStart());

        pixelCPEforGPU::position<TrackerTraits>(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);

        pixelCPEforGPU::errorFromDB<TrackerTraits>(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);

        // store it
        hits[h].chargeAndStatus().charge = clusParams.charge[ic];
        hits[h].chargeAndStatus().status = clusParams.status[ic];
        hits[h].detectorIndex() = me;

        float xl, yl;
        hits[h].xLocal() = xl = clusParams.xpos[ic];
        hits[h].yLocal() = yl = clusParams.ypos[ic];

        hits[h].clusterSizeX() = clusParams.xsize[ic];
        hits[h].clusterSizeY() = clusParams.ysize[ic];

        hits[h].xerrLocal() = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX;
        hits[h].yerrLocal() = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY;

        // keep it local for computations
        float xg, yg, zg;
        // to global and compute phi...
        cpeParams->detParams(me).frame.toGlobal(xl, yl, xg, yg, zg);
        // here correct for the beamspot...
        xg -= bs->x;
        yg -= bs->y;
        zg -= bs->z;

        hits[h].xGlobal() = xg;
        hits[h].yGlobal() = yg;
        hits[h].zGlobal() = zg;

        hits[h].rGlobal() = std::sqrt(xg * xg + yg * yg);
        hits[h].iphi() = unsafe_atan2s<7>(yg, xg);
      }
      __syncthreads();
    }  // end loop on batches
  }
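
Two details of the stored output are easy to miss: xerrLocal() and yerrLocal() are filled with squared errors (the CPE error squared plus the APE term), and iphi() receives the 16-bit compressed azimuth returned by unsafe_atan2s<7>. A purely illustrative read-back sketch follows; HitsView and the short-to-radians mapping are assumptions, not taken from this file.

// Illustrative only: HitsView stands in for the actual hit SoA view type, and the
// iphi -> radians conversion assumes a linear mapping of the short range onto [-pi, pi).
#include <cmath>
#include <cstdio>

template <typename HitsView>
void dumpHit(HitsView const& hits, int h) {
  float sigmaX = std::sqrt(hits[h].xerrLocal());  // xerrLocal() stores a variance: xerr^2 + apeXX
  float sigmaY = std::sqrt(hits[h].yerrLocal());  // likewise yerr^2 + apeYY
  constexpr float pi = 3.14159265358979f;
  float phi = float(hits[h].iphi()) * pi / 32768.f;  // assumed decoding of the compressed phi
  std::printf("r=%f phi=%f sigma=(%f,%f)\n", hits[h].rGlobal(), phi, sigmaX, sigmaY);
}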