CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
gpuPixelRecHits.h
Go to the documentation of this file.
1 #ifndef RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h
2 #define RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h
3 
4 #include <cstdint>
5 #include <cstdio>
6 #include <limits>
7 
14 
15 namespace gpuPixelRecHits {
16 
17  __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const* __restrict__ cpeParams,
18  BeamSpotPOD const* __restrict__ bs,
19  SiPixelDigisCUDA::DeviceConstView const* __restrict__ pdigis,
20  int numElements,
21  SiPixelClustersCUDA::DeviceConstView const* __restrict__ pclusters,
23  // FIXME
24  // the compiler seems NOT to optimize loads from views (even in a simple test case)
25  // The whole gimnastic here of copying or not is a pure heuristic exercise that seems to produce the fastest code with the above signature
26  // not using views (passing a gazzilion of array pointers) seems to produce the fastest code (but it is harder to mantain)
27 
28  assert(phits);
29  assert(cpeParams);
30 
31  auto& hits = *phits;
32 
33  auto const digis = *pdigis; // the copy is intentional!
34  auto const& clusters = *pclusters;
35 
36  // copy average geometry corrected by beamspot . FIXME (move it somewhere else???)
37  if (0 == blockIdx.x) {
38  auto& agc = hits.averageGeometry();
39  auto const& ag = cpeParams->averageGeometry();
41  il += blockDim.x) {
42  agc.ladderZ[il] = ag.ladderZ[il] - bs->z;
43  agc.ladderX[il] = ag.ladderX[il] - bs->x;
44  agc.ladderY[il] = ag.ladderY[il] - bs->y;
45  agc.ladderR[il] = sqrt(agc.ladderX[il] * agc.ladderX[il] + agc.ladderY[il] * agc.ladderY[il]);
46  agc.ladderMinZ[il] = ag.ladderMinZ[il] - bs->z;
47  agc.ladderMaxZ[il] = ag.ladderMaxZ[il] - bs->z;
48  }
49  if (0 == threadIdx.x) {
50  agc.endCapZ[0] = ag.endCapZ[0] - bs->z;
51  agc.endCapZ[1] = ag.endCapZ[1] - bs->z;
52  // printf("endcapZ %f %f\n",agc.endCapZ[0],agc.endCapZ[1]);
53  }
54  }
55 
56  // to be moved in common namespace...
58  constexpr int32_t MaxHitsInIter = pixelCPEforGPU::MaxHitsInIter;
59 
61 
62  // as usual one block per module
63  __shared__ ClusParams clusParams;
64 
65  auto me = clusters.moduleId(blockIdx.x);
66  int nclus = clusters.clusInModule(me);
67 
68  if (0 == nclus)
69  return;
70 
71 #ifdef GPU_DEBUG
72  if (threadIdx.x == 0) {
73  auto k = clusters.moduleStart(1 + blockIdx.x);
74  while (digis.moduleInd(k) == invalidModuleId)
75  ++k;
76  assert(digis.moduleInd(k) == me);
77  }
78 #endif
79 
80 #ifdef GPU_DEBUG
81  if (me % 100 == 1)
82  if (threadIdx.x == 0)
83  printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, clusters.clusModuleStart(me));
84 #endif
85 
86  for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += MaxHitsInIter) {
87  int nClusInIter = std::min(MaxHitsInIter, endClus - startClus);
88  int lastClus = startClus + nClusInIter;
89  assert(nClusInIter <= nclus);
90  assert(nClusInIter > 0);
91  assert(lastClus <= nclus);
92 
93  assert(nclus > MaxHitsInIter || (0 == startClus && nClusInIter == nclus && lastClus == nclus));
94 
95  // init
96  for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) {
97  clusParams.minRow[ic] = std::numeric_limits<uint32_t>::max();
98  clusParams.maxRow[ic] = 0;
99  clusParams.minCol[ic] = std::numeric_limits<uint32_t>::max();
100  clusParams.maxCol[ic] = 0;
101  clusParams.charge[ic] = 0;
102  clusParams.q_f_X[ic] = 0;
103  clusParams.q_l_X[ic] = 0;
104  clusParams.q_f_Y[ic] = 0;
105  clusParams.q_l_Y[ic] = 0;
106  }
107 
108  __syncthreads();
109 
110  // one thread per "digi"
111  auto first = clusters.moduleStart(1 + blockIdx.x) + threadIdx.x;
112  for (int i = first; i < numElements; i += blockDim.x) {
113  auto id = digis.moduleInd(i);
114  if (id == invalidModuleId)
115  continue; // not valid
116  if (id != me)
117  break; // end of module
118  auto cl = digis.clus(i);
119  if (cl < startClus || cl >= lastClus)
120  continue;
121  cl -= startClus;
122  assert(cl >= 0);
123  assert(cl < MaxHitsInIter);
124  auto x = digis.xx(i);
125  auto y = digis.yy(i);
126  atomicMin(&clusParams.minRow[cl], x);
127  atomicMax(&clusParams.maxRow[cl], x);
128  atomicMin(&clusParams.minCol[cl], y);
129  atomicMax(&clusParams.maxCol[cl], y);
130  }
131 
132  __syncthreads();
133 
134  auto pixmx = cpeParams->detParams(me).pixmx;
135  for (int i = first; i < numElements; i += blockDim.x) {
136  auto id = digis.moduleInd(i);
137  if (id == invalidModuleId)
138  continue; // not valid
139  if (id != me)
140  break; // end of module
141  auto cl = digis.clus(i);
142  if (cl < startClus || cl >= lastClus)
143  continue;
144  cl -= startClus;
145  assert(cl >= 0);
146  assert(cl < MaxHitsInIter);
147  auto x = digis.xx(i);
148  auto y = digis.yy(i);
149  auto ch = digis.adc(i);
150  atomicAdd(&clusParams.charge[cl], ch);
151  ch = std::min(ch, pixmx);
152  if (clusParams.minRow[cl] == x)
153  atomicAdd(&clusParams.q_f_X[cl], ch);
154  if (clusParams.maxRow[cl] == x)
155  atomicAdd(&clusParams.q_l_X[cl], ch);
156  if (clusParams.minCol[cl] == y)
157  atomicAdd(&clusParams.q_f_Y[cl], ch);
158  if (clusParams.maxCol[cl] == y)
159  atomicAdd(&clusParams.q_l_Y[cl], ch);
160  }
161 
162  __syncthreads();
163 
164  // next one cluster per thread...
165 
166  first = clusters.clusModuleStart(me) + startClus;
167  for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) {
168  auto h = first + ic; // output index in global memory
169 
170  assert(h < hits.nHits());
171  assert(h < clusters.clusModuleStart(me + 1));
172 
173  pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
174  pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
175 
176  // store it
177  hits.setChargeAndStatus(h, clusParams.charge[ic], clusParams.status[ic]);
178  hits.detectorIndex(h) = me;
179 
180  float xl, yl;
181  hits.xLocal(h) = xl = clusParams.xpos[ic];
182  hits.yLocal(h) = yl = clusParams.ypos[ic];
183 
184  hits.clusterSizeX(h) = clusParams.xsize[ic];
185  hits.clusterSizeY(h) = clusParams.ysize[ic];
186 
187  hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX;
188  hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY;
189 
190  // keep it local for computations
191  float xg, yg, zg;
192  // to global and compute phi...
193  cpeParams->detParams(me).frame.toGlobal(xl, yl, xg, yg, zg);
194  // here correct for the beamspot...
195  xg -= bs->x;
196  yg -= bs->y;
197  zg -= bs->z;
198 
199  hits.xGlobal(h) = xg;
200  hits.yGlobal(h) = yg;
201  hits.zGlobal(h) = zg;
202 
203  hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg);
204  hits.iphi(h) = unsafe_atan2s<7>(yg, xg);
205  }
206  __syncthreads();
207  } // end loop on batches
208  }
209 
210 } // namespace gpuPixelRecHits
211 
212 #endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h
const dim3 threadIdx
Definition: cudaCompat.h:29
T1 atomicMax(T1 *a, T2 b)
Definition: cudaCompat.h:97
ClusParamsT< MaxHitsInIter > ClusParams
#define __global__
Definition: cudaCompat.h:19
const dim3 blockDim
Definition: cudaCompat.h:30
constexpr void position(CommonParams const &__restrict__ comParams, DetParams const &__restrict__ detParams, ClusParams &cp, uint32_t ic)
assert(be >=bs)
tuple cl
Definition: haddnano.py:49
T sqrt(T t)
Definition: SSEVec.h:19
printf("params %d %f %f %f\n", minT, eps, errmax, chi2max)
T min(T a, T b)
Definition: MathUtil.h:58
uint16_t *__restrict__ uint16_t const *__restrict__ uint32_t const *__restrict__ uint32_t *__restrict__ uint32_t const *__restrict__ int32_t *__restrict__ uint32_t numElements
const dim3 blockIdx
Definition: cudaCompat.h:32
constexpr uint16_t invalidModuleId
void __syncthreads()
Definition: cudaCompat.h:108
constexpr void errorFromDB(CommonParams const &__restrict__ comParams, DetParams const &__restrict__ detParams, ClusParams &cp, uint32_t ic)
T1 atomicMin(T1 *a, T2 b)
Definition: cudaCompat.h:85
The Signals That Services Can Subscribe To This is based on ActivityRegistry h
Helper function to determine trigger accepts.
Definition: Activities.doc:4
T1 atomicAdd(T1 *a, T2 b)
Definition: cudaCompat.h:61
constexpr int32_t MaxHitsInIter
caConstants::TupleMultiplicity const *__restrict__ HitsOnGPU const *__restrict__ tindex_type *__restrict__ double *__restrict__ phits