gpuPixelRecHits.h
#ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h
#define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h

#include <cstdint>
#include <cstdio>
#include <limits>

#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h"
#include "DataFormats/Math/interface/approx_atan2.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"

//#define GPU_DEBUG

namespace gpuPixelRecHits {

  template <typename TrackerTraits>
  __global__ void getHits(pixelCPEforGPU::ParamsOnGPUT<TrackerTraits> const* __restrict__ cpeParams,
                          BeamSpotPOD const* __restrict__ bs,
                          SiPixelDigisCUDASOAConstView digis,
                          int numElements,
                          SiPixelClustersCUDASOAConstView clusters,
                          TrackingRecHitSoAView<TrackerTraits> hits) {
    // FIXME
    // the compiler seems NOT to optimize loads from views (even in a simple test case).
    // The whole gymnastics of copying or not copying here is a pure heuristic exercise that seems to produce the fastest code with the above signature;
    // not using views at all (passing a gazillion array pointers) seems to produce the fastest code (but it is harder to maintain)

    assert(cpeParams);

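    // only the first block performs this beamspot shift of the average ladder/endcap
    // geometry; every block then proceeds to build the hits of its own module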
    // copy average geometry corrected by beamspot. FIXME (move it somewhere else?)
    if (0 == blockIdx.x) {
      auto& agc = hits.averageGeometry();
      auto const& ag = cpeParams->averageGeometry();
      auto nLadders = TrackerTraits::numberOfLaddersInBarrel;

      for (int il = threadIdx.x, nl = nLadders; il < nl; il += blockDim.x) {
        agc.ladderZ[il] = ag.ladderZ[il] - bs->z;
        agc.ladderX[il] = ag.ladderX[il] - bs->x;
        agc.ladderY[il] = ag.ladderY[il] - bs->y;
        agc.ladderR[il] = sqrt(agc.ladderX[il] * agc.ladderX[il] + agc.ladderY[il] * agc.ladderY[il]);
        agc.ladderMinZ[il] = ag.ladderMinZ[il] - bs->z;
        agc.ladderMaxZ[il] = ag.ladderMaxZ[il] - bs->z;
      }

      if (0 == threadIdx.x) {
        agc.endCapZ[0] = ag.endCapZ[0] - bs->z;
        agc.endCapZ[1] = ag.endCapZ[1] - bs->z;
      }
    }

    // to be moved in common namespace...
    using gpuClustering::invalidModuleId;
    constexpr int32_t MaxHitsInIter = pixelCPEforGPU::MaxHitsInIter;

    using ClusParams = pixelCPEforGPU::ClusParams;

    // as usual one block per module
    __shared__ ClusParams clusParams;
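    // clusParams holds the per-cluster shared-memory accumulators (row/column extrema
    // and charge sums) for up to MaxHitsInIter clusters at a time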

    auto me = clusters[blockIdx.x].moduleId();
    int nclus = clusters[me].clusInModule();

    if (0 == nclus)
      return;
#ifdef GPU_DEBUG
    if (threadIdx.x == 0) {
      auto k = clusters[1 + blockIdx.x].moduleStart();
      while (digis[k].moduleId() == invalidModuleId)
        ++k;
      assert(digis[k].moduleId() == me);
    }

    if (me % 100 == 1)
      if (threadIdx.x == 0)
        printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, clusters[me].clusModuleStart());
#endif

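    // process the clusters of this module in batches of at most MaxHitsInIter,
    // since the shared-memory accumulators can hold only that many clusters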
    for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += MaxHitsInIter) {
      int nClusInIter = std::min(MaxHitsInIter, endClus - startClus);
      int lastClus = startClus + nClusInIter;
      assert(nClusInIter <= nclus);
      assert(nClusInIter > 0);
      assert(lastClus <= nclus);

      assert(nclus > MaxHitsInIter || (0 == startClus && nClusInIter == nclus && lastClus == nclus));

      // init
      for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) {
        clusParams.minRow[ic] = std::numeric_limits<uint32_t>::max();
        clusParams.maxRow[ic] = 0;
        clusParams.minCol[ic] = std::numeric_limits<uint32_t>::max();
        clusParams.maxCol[ic] = 0;
        clusParams.charge[ic] = 0;
        clusParams.q_f_X[ic] = 0;
        clusParams.q_l_X[ic] = 0;
        clusParams.q_f_Y[ic] = 0;
        clusParams.q_l_Y[ic] = 0;
      }

      __syncthreads();

      // one thread per "digi"
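      // first pass: find each cluster's bounding box (min/max row and column)
      // with shared-memory atomics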
      auto first = clusters[1 + blockIdx.x].moduleStart() + threadIdx.x;
      for (int i = first; i < numElements; i += blockDim.x) {
        auto id = digis[i].moduleId();
        if (id == invalidModuleId)
          continue;  // not valid
        if (id != me)
          break;  // end of module
        auto cl = digis[i].clus();
        if (cl < startClus || cl >= lastClus)
          continue;
        cl -= startClus;
        assert(cl >= 0);
        assert(cl < MaxHitsInIter);
        auto x = digis[i].xx();
        auto y = digis[i].yy();
        atomicMin(&clusParams.minRow[cl], x);
        atomicMax(&clusParams.maxRow[cl], x);
        atomicMin(&clusParams.minCol[cl], y);
        atomicMax(&clusParams.maxCol[cl], y);
      }

      __syncthreads();

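      // second pass: accumulate the total cluster charge, plus the charge collected in the
      // first/last row and column (truncated at pixmx), which feeds the CPE position estimate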
      auto pixmx = cpeParams->detParams(me).pixmx;
      for (int i = first; i < numElements; i += blockDim.x) {
        auto id = digis[i].moduleId();
        if (id == invalidModuleId)
          continue;  // not valid
        if (id != me)
          break;  // end of module
        auto cl = digis[i].clus();
        if (cl < startClus || cl >= lastClus)
          continue;
        cl -= startClus;
        assert(cl >= 0);
        assert(cl < MaxHitsInIter);
        auto x = digis[i].xx();
        auto y = digis[i].yy();
        auto ch = digis[i].adc();
        atomicAdd(&clusParams.charge[cl], ch);
        ch = std::min(ch, pixmx);
        if (clusParams.minRow[cl] == x)
          atomicAdd(&clusParams.q_f_X[cl], ch);
        if (clusParams.maxRow[cl] == x)
          atomicAdd(&clusParams.q_l_X[cl], ch);
        if (clusParams.minCol[cl] == y)
          atomicAdd(&clusParams.q_f_Y[cl], ch);
        if (clusParams.maxCol[cl] == y)
          atomicAdd(&clusParams.q_l_Y[cl], ch);
      }

      __syncthreads();

      // next one cluster per thread...

      first = clusters[me].clusModuleStart() + startClus;
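      // each thread finalizes one cluster: run the CPE for the local position and
      // errors, then fill the output hit SoA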
      for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) {
        auto h = first + ic;  // output index in global memory

        assert(h < hits.nHits());
        assert(h < clusters[me + 1].clusModuleStart());

        pixelCPEforGPU::position<TrackerTraits>(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);

        pixelCPEforGPU::errorFromDB<TrackerTraits>(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);

        // store it
        hits[h].chargeAndStatus().charge = clusParams.charge[ic];
        hits[h].chargeAndStatus().status = clusParams.status[ic];
        hits[h].detectorIndex() = me;

        float xl, yl;
        hits[h].xLocal() = xl = clusParams.xpos[ic];
        hits[h].yLocal() = yl = clusParams.ypos[ic];

        hits[h].clusterSizeX() = clusParams.xsize[ic];
        hits[h].clusterSizeY() = clusParams.ysize[ic];

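        // the local errors are stored squared, with the squared alignment position
        // errors (apeXX, apeYY) added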
        hits[h].xerrLocal() = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX;
        hits[h].yerrLocal() = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY;

        // keep it local for computations
        float xg, yg, zg;
        // to global and compute phi...
        cpeParams->detParams(me).frame.toGlobal(xl, yl, xg, yg, zg);
        // here correct for the beamspot...
        xg -= bs->x;
        yg -= bs->y;
        zg -= bs->z;

        hits[h].xGlobal() = xg;
        hits[h].yGlobal() = yg;
        hits[h].zGlobal() = zg;

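        // iphi is a compressed integer-valued azimuthal angle, computed with a fast
        // approximate atan2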
        hits[h].rGlobal() = std::sqrt(xg * xg + yg * yg);
        hits[h].iphi() = unsafe_atan2s<7>(yg, xg);
      }
      __syncthreads();
    }  // end loop on batches
  }

}  // namespace gpuPixelRecHits

#endif  // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h
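
To see the shared-memory accumulation idiom of the first digi pass in isolation, here is a minimal, self-contained CUDA sketch: each thread owns a strided subset of digis and updates its cluster's bounding box with atomicMin/atomicMax, as getHits does for minRow/maxRow/minCol/maxCol. All names, sizes, and data below are illustrative assumptions, not CMSSW code.

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

constexpr int kMaxClusters = 4;  // stand-in for MaxHitsInIter

// One block; threads stride over the digis and update the min/max row of each
// digi's cluster with shared-memory atomics, then copy the result to global memory.
__global__ void boundingBox(uint16_t const* xx, int32_t const* clus, int n,
                            uint32_t* minRow, uint32_t* maxRow) {
  __shared__ uint32_t smin[kMaxClusters], smax[kMaxClusters];
  for (int c = threadIdx.x; c < kMaxClusters; c += blockDim.x) {
    smin[c] = 0xffffffffu;  // like std::numeric_limits<uint32_t>::max()
    smax[c] = 0;
  }
  __syncthreads();
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    atomicMin(&smin[clus[i]], uint32_t(xx[i]));
    atomicMax(&smax[clus[i]], uint32_t(xx[i]));
  }
  __syncthreads();
  for (int c = threadIdx.x; c < kMaxClusters; c += blockDim.x) {
    minRow[c] = smin[c];
    maxRow[c] = smax[c];
  }
}

int main() {
  constexpr int n = 8;
  uint16_t* xx;
  int32_t* clus;
  uint32_t *minRow, *maxRow;
  cudaMallocManaged(&xx, n * sizeof(uint16_t));
  cudaMallocManaged(&clus, n * sizeof(int32_t));
  cudaMallocManaged(&minRow, kMaxClusters * sizeof(uint32_t));
  cudaMallocManaged(&maxRow, kMaxClusters * sizeof(uint32_t));
  uint16_t xs[n] = {3, 5, 4, 9, 8, 1, 2, 7};  // two digis per cluster
  int32_t cs[n] = {0, 0, 1, 1, 2, 2, 3, 3};
  for (int i = 0; i < n; ++i) {
    xx[i] = xs[i];
    clus[i] = cs[i];
  }
  boundingBox<<<1, 32>>>(xx, clus, n, minRow, maxRow);
  cudaDeviceSynchronize();
  for (int c = 0; c < kMaxClusters; ++c)
    printf("cluster %d: rows [%u, %u]\n", c, minRow[c], maxRow[c]);
  return 0;
}

In the real kernel the same pattern runs once per module (one block per module), is repeated for columns, and is followed by a second strided pass over the same digis that accumulates charges conditioned on the bounding box just found.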