CAHitNtupletGeneratorKernelsImpl.h
//
// Original Author: Felice Pantaleo, CERN
//

// #define NTUPLE_DEBUG
// #define GPU_DEBUG

#include <cmath>
#include <cstdint>
#include <limits>

#include <cuda_runtime.h>

#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"

#include "CAConstants.h"
#include "CAHitNtupletGeneratorKernels.h"
#include "GPUCACell.h"
#include "gpuFishbone.h"
#include "gpuPixelDoublets.h"

using HitsOnGPU = TrackingRecHit2DSOAView;
using HitsOnCPU = TrackingRecHit2DGPU;

using HitToTuple = caConstants::HitToTuple;
using TupleMultiplicity = caConstants::TupleMultiplicity;

using Quality = pixelTrack::Quality;
using TkSoA = pixelTrack::TrackSoA;
using HitContainer = pixelTrack::HitContainer;
namespace {

  constexpr uint16_t tkNotFound = std::numeric_limits<uint16_t>::max();
  constexpr float maxScore = std::numeric_limits<float>::max();
  constexpr float nSigma2 = 25.f;
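  // tkNotFound and maxScore initialize the best-track searches in the cleaners
  // below; nSigma2 = 5^2 is the squared significance used when comparing two
  // fitted track parameters (compatible when the squared difference is below
  // nSigma2 times the sum of the two variances).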

}  // namespace

__global__ void kernel_checkOverflows(HitContainer const *foundNtuplets,
                                      caConstants::TupleMultiplicity const *tupleMultiplicity,
                                      CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple,
                                      cms::cuda::AtomicPairCounter *apc,
                                      GPUCACell const *__restrict__ cells,
                                      uint32_t const *__restrict__ nCells,
                                      gpuPixelDoublets::CellNeighborsVector const *cellNeighbors,
                                      gpuPixelDoublets::CellTracksVector const *cellTracks,
                                      GPUCACell::OuterHitOfCell const isOuterHitOfCell,
                                      int32_t nHits,
                                      uint32_t maxNumberOfDoublets,
                                      CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
  auto first = threadIdx.x + blockIdx.x * blockDim.x;

  auto &c = *counters;
  // counters once per event
  if (0 == first) {
    atomicAdd(&c.nEvents, 1);
    atomicAdd(&c.nHits, nHits);
    atomicAdd(&c.nCells, *nCells);
    atomicAdd(&c.nTuples, apc->get().m);
    atomicAdd(&c.nFitTracks, tupleMultiplicity->size());
  }

#ifdef NTUPLE_DEBUG
  if (0 == first) {
    printf("number of found cells %d, found tuples %d with total hits %d out of %d %d\n",
           *nCells,
           apc->get().m,
           apc->get().n,
           nHits,
           hitToTuple->totOnes());
    if (apc->get().m < caConstants::maxNumberOfQuadruplets) {
      assert(foundNtuplets->size(apc->get().m) == 0);
      assert(foundNtuplets->size() == apc->get().n);
    }
  }

  for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) {
    if (foundNtuplets->size(idx) > 7)  // current real limit
      printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx));
    assert(foundNtuplets->size(idx) <= caConstants::maxHitsOnTrack);
    for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih)
      assert(int(*ih) < nHits);
  }
#endif

  if (0 == first) {
    if (apc->get().m >= caConstants::maxNumberOfQuadruplets)
      printf("Tuples overflow\n");
    if (*nCells >= maxNumberOfDoublets)
      printf("Cells overflow\n");
    if (cellNeighbors && cellNeighbors->full())
      printf("cellNeighbors overflow\n");
    if (cellTracks && cellTracks->full())
      printf("cellTracks overflow\n");
    if (int(hitToTuple->nOnes()) < nHits)
      printf("ERROR hitToTuple overflow %d %d\n", hitToTuple->nOnes(), nHits);
  }

  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (thisCell.hasFishbone() && !thisCell.isKilled())
      atomicAdd(&c.nFishCells, 1);
    if (thisCell.outerNeighbors().full())  //++tooManyNeighbors[thisCell.theLayerPairId];
      printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId());
    if (thisCell.tracks().full())  //++tooManyTracks[thisCell.theLayerPairId];
      printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId());
    if (thisCell.isKilled())
      atomicAdd(&c.nKilledCells, 1);
    if (!thisCell.unused())
      atomicAdd(&c.nEmptyCells, 1);
    if ((0 == hitToTuple->size(thisCell.inner_hit_id())) && (0 == hitToTuple->size(thisCell.outer_hit_id())))
      atomicAdd(&c.nZeroTrackCells, 1);
  }

  for (int idx = first, nt = nHits - isOuterHitOfCell.offset; idx < nt; idx += gridDim.x * blockDim.x) {
    if (isOuterHitOfCell.container[idx].full())  // ++tooManyOuterHitOfCell;
      printf("OuterHitOfCell overflow %d\n", idx);
  }
}

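// kernel_fishboneCleaner propagates the fishbone decision from cells to tracks:
// any track built on top of a killed cell is demoted to duplicate quality.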
__global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *__restrict__ nCells, Quality *quality) {
  constexpr auto reject = pixelTrack::Quality::dup;

  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (!thisCell.isKilled())
      continue;

    for (auto it : thisCell.tracks())
      quality[it] = reject;
  }
}

// remove shorter tracks if sharing a cell
// It does not seem to affect efficiency in any way!
__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
                                             uint32_t const *__restrict__ nCells,
                                             TkSoA const *__restrict__ ptracks,
                                             Quality *quality,
                                             bool dupPassThrough) {
  // quality to mark rejected
  constexpr auto reject = pixelTrack::Quality::edup;

  auto const &tracks = *ptracks;

  assert(nCells);
  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];

    if (thisCell.tracks().size() < 2)
      continue;

    int8_t maxNl = 0;

    // find maxNl
    for (auto it : thisCell.tracks()) {
      auto nl = tracks.nLayers(it);
      maxNl = std::max(nl, maxNl);
    }

    // if (maxNl < 4) continue;
    // quad pass through (leave it here for tests)
    // maxNl = std::min(4, maxNl);

    for (auto it : thisCell.tracks()) {
      if (tracks.nLayers(it) < maxNl)
        quality[it] = reject;  // no race: simple assignment of the same constant
    }
  }
}

// assume the above (so, short tracks already removed)
__global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells,
                                            uint32_t const *__restrict__ nCells,
                                            TkSoA *__restrict__ tracks,
                                            bool dupPassThrough) {
  // quality to mark rejected
  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
  constexpr auto loose = pixelTrack::Quality::loose;

  assert(nCells);

  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (thisCell.tracks().size() < 2)
      continue;

    float mc = maxScore;
    uint16_t im = tkNotFound;

    /* chi2 penalizes higher-pt tracks (try to rescale it?)
    auto score = [&](auto it) {
      return tracks->nLayers(it) < 4 ?
             std::abs(tracks->tip(it)) :  // tip for triplets
             tracks->chi2(it);            // chi2 for quads
    };
    */

    auto score = [&](auto it) { return std::abs(tracks->tip(it)); };

    // full crazy combinatorics
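    // Each pair of still-alive tracks on this cell is compared in the fitted
    // parameters state(2) ("op", plausibly 1/p) and state(3) (cot(theta)), with
    // covariance(9) and covariance(12) as the corresponding variances: the two
    // are treated as the same track when both squared differences are within
    // nSigma2 times the summed variances, and the loser (worse quality, or
    // worse score at equal quality) is rejected.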
    int ntr = thisCell.tracks().size();
    for (int i = 0; i < ntr - 1; ++i) {
      auto it = thisCell.tracks()[i];
      auto qi = tracks->quality(it);
      if (qi <= reject)
        continue;
      auto opi = tracks->stateAtBS.state(it)(2);
      auto e2opi = tracks->stateAtBS.covariance(it)(9);
      auto cti = tracks->stateAtBS.state(it)(3);
      auto e2cti = tracks->stateAtBS.covariance(it)(12);
      for (auto j = i + 1; j < ntr; ++j) {
        auto jt = thisCell.tracks()[j];
        auto qj = tracks->quality(jt);
        if (qj <= reject)
          continue;
#ifdef GPU_DEBUG
        if (foundNtuplets->size(it) != foundNtuplets->size(jt))
          printf(" a mess\n");
#endif
        auto opj = tracks->stateAtBS.state(jt)(2);
        auto ctj = tracks->stateAtBS.state(jt)(3);
        auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti);
        if ((cti - ctj) * (cti - ctj) > dct)
          continue;
        auto dop = nSigma2 * (tracks->stateAtBS.covariance(jt)(9) + e2opi);
        if ((opi - opj) * (opi - opj) > dop)
          continue;
        if ((qj < qi) || (qj == qi && score(it) < score(jt)))
          tracks->quality(jt) = reject;
        else {
          tracks->quality(it) = reject;
          break;
        }
      }
    }

    // find maxQual
    auto maxQual = reject;  // no duplicate!
    for (auto it : thisCell.tracks()) {
      if (tracks->quality(it) > maxQual)
        maxQual = tracks->quality(it);
    }

    if (maxQual <= loose)
      continue;

    // find min score
    for (auto it : thisCell.tracks()) {
      if (tracks->quality(it) == maxQual && score(it) < mc) {
        mc = score(it);
        im = it;
      }
    }

    if (tkNotFound == im)
      continue;

    // mark all other duplicates (not yet, keep it loose)
    for (auto it : thisCell.tracks()) {
      if (tracks->quality(it) > loose && it != im)
        tracks->quality(it) = loose;  // no race: simple assignment of the same constant
    }
  }
}

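// kernel_connect builds the cellular-automaton graph: for each cell it scans the
// cells sharing its inner hit (as their outer hit) and, when the two doublets are
// aligned in r-z and pass the DCA/curvature cut, links them and flags both as used.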
__global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1,
                               cms::cuda::AtomicPairCounter *apc2,  // just to zero them
                               GPUCACell::Hits const *__restrict__ hhp,
                               GPUCACell *cells,
                               uint32_t const *__restrict__ nCells,
                               gpuPixelDoublets::CellNeighborsVector *cellNeighbors,
                               GPUCACell::OuterHitOfCell const isOuterHitOfCell,
                               float hardCurvCut,
                               float ptmin,
                               float CAThetaCutBarrel,
                               float CAThetaCutForward,
                               float dcaCutInnerTriplet,
                               float dcaCutOuterTriplet) {
  auto const &hh = *hhp;

  auto firstCellIndex = threadIdx.y + blockIdx.y * blockDim.y;
  auto first = threadIdx.x;
  auto stride = blockDim.x;

  if (0 == (firstCellIndex + first)) {
    (*apc1) = 0;
    (*apc2) = 0;
  }  // ready for next kernel

  for (int idx = firstCellIndex, nt = (*nCells); idx < nt; idx += gridDim.y * blockDim.y) {
    auto cellIndex = idx;
    auto &thisCell = cells[idx];
    auto innerHitId = thisCell.inner_hit_id();
    if (int(innerHitId) < isOuterHitOfCell.offset)
      continue;
    int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size();
    auto vi = isOuterHitOfCell[innerHitId].data();

    auto ri = thisCell.inner_r(hh);
    auto zi = thisCell.inner_z(hh);

    auto ro = thisCell.outer_r(hh);
    auto zo = thisCell.outer_z(hh);
    auto isBarrel = thisCell.inner_detIndex(hh) < caConstants::last_barrel_detIndex;

    for (int j = first; j < numberOfPossibleNeighbors; j += stride) {
      auto otherCell = __ldg(vi + j);
      auto &oc = cells[otherCell];
      auto r1 = oc.inner_r(hh);
      auto z1 = oc.inner_z(hh);
      bool aligned = GPUCACell::areAlignedRZ(
          r1,
          z1,
          ri,
          zi,
          ro,
          zo,
          ptmin,
          isBarrel ? CAThetaCutBarrel : CAThetaCutForward);  // 2.f*thetaCut); // FIXME tune cuts
      if (aligned && thisCell.dcaCut(hh,
                                     oc,
                                     oc.inner_detIndex(hh) < caConstants::last_bpix1_detIndex ? dcaCutInnerTriplet
                                                                                              : dcaCutOuterTriplet,
                                     hardCurvCut)) {  // FIXME tune cuts
        oc.addOuterNeighbor(cellIndex, *cellNeighbors);
        thisCell.setStatusBits(GPUCACell::StatusBit::kUsed);
        oc.setStatusBits(GPUCACell::StatusBit::kUsed);
      }
    }  // loop on inner cells
  }  // loop on outer cells
}

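// kernel_find_ntuplets walks the cellular-automaton graph: each surviving cell on
// a seeding layer pair starts a depth-first search along its outer neighbors
// (using a small explicit stack) and registers every chain that is long enough
// as a track candidate in foundNtuplets.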
__global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp,
                                     GPUCACell *__restrict__ cells,
                                     uint32_t const *nCells,
                                     gpuPixelDoublets::CellTracksVector *cellTracks,
                                     HitContainer *foundNtuplets,
                                     cms::cuda::AtomicPairCounter *apc,
                                     Quality *__restrict__ quality,
                                     unsigned int minHitsPerNtuplet) {
  // recursive: not obvious to widen
  auto const &hh = *hhp;

  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (thisCell.isKilled())
      continue;  // cut by earlyFishbone
    // we require at least three hits...
    if (thisCell.outerNeighbors().empty())
      continue;
    auto pid = thisCell.layerPairId();
    auto doit = minHitsPerNtuplet > 3 ? pid < 3 : pid < 8 || pid > 12;
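    // Only selected layer pairs seed the search: when at least quadruplets are
    // required, only cells on the three innermost pairs (pid < 3) may start a
    // tuple; when triplets are allowed, pairs 8..12 are skipped as starting points.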
    if (doit) {
      GPUCACell::TmpTuple stack;
      stack.reset();
      thisCell.find_ntuplets<6>(
          hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3);
      assert(stack.empty());
      // printf("in %d found quadruplets: %d\n", cellIndex, apc->get());
    }
  }
}

__global__ void kernel_mark_used(GPUCACell *__restrict__ cells, uint32_t const *nCells) {
  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto &thisCell = cells[idx];
    if (!thisCell.tracks().empty())
      thisCell.setStatusBits(GPUCACell::StatusBit::kInTrack);
  }
}

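// kernel_countMultiplicity / kernel_fillMultiplicity are the usual two-pass
// histogram fill of a OneToManyAssoc: the first pass only counts tuples per hit
// multiplicity; once the offsets are finalized (prefix sum, done elsewhere) the
// second pass stores the tuple indices.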
__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets,
                                         Quality const *__restrict__ quality,
                                         caConstants::TupleMultiplicity *tupleMultiplicity) {
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
    auto nhits = foundNtuplets->size(it);
    if (nhits < 3)
      continue;
    if (quality[it] == pixelTrack::Quality::edup)
      continue;
    assert(quality[it] == pixelTrack::Quality::bad);
    if (nhits > 7)  // current limit
      printf("wrong mult %d %d\n", it, nhits);
    assert(nhits <= caConstants::maxHitsOnTrack);
    tupleMultiplicity->count(nhits);
  }
}

__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets,
                                        Quality const *__restrict__ quality,
                                        caConstants::TupleMultiplicity *tupleMultiplicity) {
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
    auto nhits = foundNtuplets->size(it);
    if (nhits < 3)
      continue;
    if (quality[it] == pixelTrack::Quality::edup)
      continue;
    assert(quality[it] == pixelTrack::Quality::bad);
    if (nhits > 7)
      printf("wrong mult %d %d\n", it, nhits);
    assert(nhits <= caConstants::maxHitsOnTrack);
    tupleMultiplicity->fill(nhits, it);
  }
}

__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
                                      TkSoA const *__restrict__ tracks,
                                      CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts,
                                      Quality *__restrict__ quality) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
    auto nhits = tuples->size(it);
    if (nhits == 0)
      break;  // guard

    // if duplicate: not even fit
    if (quality[it] == pixelTrack::Quality::edup)
      continue;

    assert(quality[it] == pixelTrack::Quality::bad);

    // mark doublets as bad
    if (nhits < 3)
      continue;

    // if the fit has any invalid parameters, mark it as bad
    bool isNaN = false;
    for (int i = 0; i < 5; ++i) {
      isNaN |= std::isnan(tracks->stateAtBS.state(it)(i));
    }
    if (isNaN) {
#ifdef NTUPLE_DEBUG
      printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it));
#endif
      continue;
    }

    quality[it] = pixelTrack::Quality::strict;

    // compute a pT-dependent chi2 cut

    auto roughLog = [](float x) {
      // max diff [0.5,12] at 1.25 0.16143
      // average diff 0.0662998
      union IF {
        uint32_t i;
        float f;
      };
      IF z;
      z.f = x;
      uint32_t lsb = 1 << 21;
      z.i += lsb;
      z.i >>= 21;
      auto f = z.i & 3;
      int ex = int(z.i >> 2) - 127;

      // log2(1+0.25*f)
      // averaged over bins
      const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f};
      return float(ex) + frac[f];
    };
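    // roughLog approximates log2(x) directly from the float bit pattern: after
    // the shift, the (biased) exponent provides the integer part and the top two
    // mantissa bits select one of four precomputed fractional corrections.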

    // (see CAHitNtupletGeneratorGPU.cc)
    float pt = std::min<float>(tracks->pt(it), cuts.chi2MaxPt);
    float chi2Cut = cuts.chi2Scale * (cuts.chi2Coeff[0] + roughLog(pt) * cuts.chi2Coeff[1]);
    if (tracks->chi2(it) >= chi2Cut) {
#ifdef NTUPLE_FIT_DEBUG
      printf("Bad chi2 %d size %d pt %f eta %f chi2 %f\n",
             it,
             tuples->size(it),
             tracks->pt(it),
             tracks->eta(it),
             tracks->chi2(it));
#endif
      continue;
    }

    quality[it] = pixelTrack::Quality::tight;

    // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta), Zip)
    // default cuts:
    //   - for triplets:    |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm
    //   - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm
    // (see CAHitNtupletGeneratorGPU.cc)
    auto const &region = (nhits > 3) ? cuts.quadruplet : cuts.triplet;
    bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and
                (std::abs(tracks->zip(it)) < region.maxZip);

    if (isOk)
      quality[it] = pixelTrack::Quality::highPurity;
  }
}
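// Net effect of kernel_classifyTracks: duplicates are skipped, doublets and NaN
// fits stay `bad`, a sane fit is promoted to `strict`, passing the pT-dependent
// chi2 cut promotes it to `tight`, and passing the region cuts promotes it to
// `highPurity`.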

__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples,
                                        Quality const *__restrict__ quality,
                                        CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  // guard
    if (quality[idx] < pixelTrack::Quality::loose)
      continue;
    atomicAdd(&(counters->nLooseTracks), 1);
    if (quality[idx] < pixelTrack::Quality::strict)
      continue;
    atomicAdd(&(counters->nGoodTracks), 1);
  }
}

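// Same two-pass count/fill pattern as above, this time building the inverse
// association from each hit to all the tuples that use it (HitToTuple).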
__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples,
                                        Quality const *__restrict__ quality,
                                        CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  // guard
    for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h)
      hitToTuple->count(*h);
  }
}

__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples,
                                       Quality const *__restrict__ quality,
                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  // guard
    for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h)
      hitToTuple->fill(*h, idx);
  }
}
544 
545 __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples,
546  TrackingRecHit2DSOAView const *__restrict__ hhp,
547  HitContainer *__restrict__ hitDetIndices) {
548  int first = blockDim.x * blockIdx.x + threadIdx.x;
549  // copy offsets
550  for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
551  hitDetIndices->off[idx] = tuples->off[idx];
552  }
553  // fill hit indices
554  auto const &hh = *hhp;
555  auto nhits = hh.nHits();
556  for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) {
558  hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]);
559  }
560 }

__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::AtomicPairCounter *apc) {
  auto &tracks = *ptracks;
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  auto ntracks = apc->get().m;
  if (0 == first)
    tracks.setNTracks(ntracks);
  for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) {
    auto nHits = tracks.nHits(idx);
    assert(nHits >= 3);
    tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx);
  }
}

__global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ hitToTuple,
                                             CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
  auto &c = *counters;
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple->size(idx) == 0)
      continue;  // SHALL NOT BE break
    atomicAdd(&c.nUsedHits, 1);
    if (hitToTuple->size(idx) > 1)
      atomicAdd(&c.nDupHits, 1);
  }
}

__global__ void kernel_countSharedHit(int *__restrict__ nshared,
                                      HitContainer const *__restrict__ ptuples,
                                      Quality const *__restrict__ quality,
                                      CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  constexpr auto loose = pixelTrack::Quality::loose;

  auto &hitToTuple = *phitToTuple;
  auto const &foundNtuplets = *ptuples;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    int nt = 0;

    // count "good" tracks
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (quality[*it] < loose)
        continue;
      ++nt;
    }

    if (nt < 2)
      continue;

    // now mark each track triplet as sharing a hit
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (foundNtuplets.size(*it) > 3)
        continue;
      atomicAdd(&nshared[*it], 1);
    }

  }  // hit loop
}
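// Together with kernel_markSharedHit below: kernel_countSharedHit counts, per
// triplet, the hits also used by another loose-or-better track; tracks with more
// than two such shared hits are then demoted by kernel_markSharedHit.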

__global__ void kernel_markSharedHit(int const *__restrict__ nshared,
                                     HitContainer const *__restrict__ tuples,
                                     Quality *__restrict__ quality,
                                     bool dupPassThrough) {
  // constexpr auto bad = pixelTrack::Quality::bad;
  constexpr auto dup = pixelTrack::Quality::dup;
  constexpr auto loose = pixelTrack::Quality::loose;
  // constexpr auto strict = pixelTrack::Quality::strict;

  // quality to mark rejected
  auto const reject = dupPassThrough ? loose : dup;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  // guard
    if (quality[idx] <= reject)
      continue;
    if (nshared[idx] > 2)
      quality[idx] = reject;
  }
}

// mostly for very forward triplets.....
__global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks,
                                       Quality *__restrict__ quality,
                                       uint16_t nmin,
                                       bool dupPassThrough,
                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    /* chi2 is bad for large pt
    auto score = [&](auto it, auto nl) {
      return nl < 4 ? std::abs(tracks.tip(it)) :  // tip for triplets
                      tracks.chi2(it);            // chi2
    };
    */
    auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); };

    // full combinatorics
    for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) {
      auto const it = *ip;
      auto qi = quality[it];
      if (qi <= reject)
        continue;
      auto opi = tracks.stateAtBS.state(it)(2);
      auto e2opi = tracks.stateAtBS.covariance(it)(9);
      auto cti = tracks.stateAtBS.state(it)(3);
      auto e2cti = tracks.stateAtBS.covariance(it)(12);
      auto nli = tracks.nLayers(it);
      for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) {
        auto const jt = *jp;
        auto qj = quality[jt];
        if (qj <= reject)
          continue;
        auto opj = tracks.stateAtBS.state(jt)(2);
        auto ctj = tracks.stateAtBS.state(jt)(3);
        auto dct = nSigma2 * (tracks.stateAtBS.covariance(jt)(12) + e2cti);
        if ((cti - ctj) * (cti - ctj) > dct)
          continue;
        auto dop = nSigma2 * (tracks.stateAtBS.covariance(jt)(9) + e2opi);
        if ((opi - opj) * (opi - opj) > dop)
          continue;
        auto nlj = tracks.nLayers(jt);
        if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj)))))
          quality[jt] = reject;
        else {
          quality[it] = reject;
          break;
        }
      }
    }
  }
}

__global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp,
                                        TkSoA const *__restrict__ ptracks,
                                        Quality *__restrict__ quality,
                                        int nmin,
                                        bool dupPassThrough,
                                        CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
  // quality of longest track
  auto const longTqual = pixelTrack::Quality::highPurity;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  auto const &hh = *hhp;
  int l1end = hh.hitsLayerStart()[1];
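  // hitsLayerStart()[1] is the index of the first hit beyond the first barrel
  // layer, so below `idx < l1end` identifies shared hits sitting on BPIX1.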

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    int8_t maxNl = 0;

    // find maxNl
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (quality[*it] < longTqual)
        continue;
      // if (tracks.nHits(*it)==3) continue;
      auto nl = tracks.nLayers(*it);
      maxNl = std::max(nl, maxNl);
    }

    if (maxNl < 4)
      continue;

    // quad pass through (leave for tests)
    // maxNl = std::min(4, maxNl);

    // kill all tracks shorter than maxNl (only triplets???)
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      auto nl = tracks.nLayers(*it);

      // checking if the shared hit is on bpix1 and if the tuple is short enough
      if (idx < l1end and nl > nmin)
        continue;

      if (nl < maxNl && quality[*it] > reject)
        quality[*it] = reject;
    }
  }
}

__global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks,
                                      Quality *__restrict__ quality,
                                      uint16_t nmin,
                                      bool dupPassThrough,
                                      CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = pixelTrack::Quality::loose;
  /// min quality of good
  auto const good = pixelTrack::Quality::strict;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    float mc = maxScore;
    uint16_t im = tkNotFound;
    bool onlyTriplets = true;

    // check if only triplets
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (quality[*it] <= good)
        continue;
      onlyTriplets &= tracks.isTriplet(*it);
      if (!onlyTriplets)
        break;
    }

    // only triplets
    if (!onlyTriplets)
      continue;

    // for triplets choose best tip! (should we first find the best quality???)
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) {
        mc = std::abs(tracks.tip(it));
        im = it;
      }
    }

    if (tkNotFound == im)
      continue;

    // mark worse ambiguities
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] > reject && it != im)
        quality[it] = reject;  // no race: simple assignment of the same constant
    }

  }  // loop over hits
}
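// Unlike kernel_simpleTripletCleaner below, kernel_tripletCleaner only acts on
// hits whose above-`good` tracks are all triplets; the simple variant always
// keeps the best-tip track and demotes only competing triplets.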

__global__ void kernel_simpleTripletCleaner(
    TkSoA const *__restrict__ ptracks,
    Quality *__restrict__ quality,
    uint16_t nmin,
    bool dupPassThrough,
    CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = pixelTrack::Quality::loose;
  /// min quality of good
  auto const good = pixelTrack::Quality::loose;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    float mc = maxScore;
    uint16_t im = tkNotFound;

    // choose best tip! (should we first find the best quality???)
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) {
        mc = std::abs(tracks.tip(it));
        im = it;
      }
    }

    if (tkNotFound == im)
      continue;

    // mark worse ambiguities
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] > reject && tracks.isTriplet(it) && it != im)
        quality[it] = reject;  // no race: simple assignment of the same constant
    }

  }  // loop over hits
}

__global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp,
                                            HitContainer const *__restrict__ ptuples,
                                            TkSoA const *__restrict__ ptracks,
                                            Quality const *__restrict__ quality,
                                            CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple,
                                            int32_t firstPrint,
                                            int32_t lastPrint,
                                            int iev) {
  constexpr auto loose = pixelTrack::Quality::loose;
  auto const &hh = *hhp;
  auto const &foundNtuplets = *ptuples;
  auto const &tracks = *ptracks;
  int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x;
  for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) {
    auto nh = foundNtuplets.size(i);
    if (nh < 3)
      continue;
    if (quality[i] < loose)
      continue;
    printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n",
           10000 * iev + i,
           int(quality[i]),
           nh,
           tracks.nLayers(i),
           tracks.charge(i),
           tracks.pt(i),
           tracks.eta(i),
           tracks.phi(i),
           tracks.tip(i),
           tracks.zip(i),
           // asinhf(fit_results[i].par(3)),
           tracks.chi2(i),
           hh.zGlobal(*foundNtuplets.begin(i)),
           hh.zGlobal(*(foundNtuplets.begin(i) + 1)),
           hh.zGlobal(*(foundNtuplets.begin(i) + 2)),
           nh > 3 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 3))) : 0,
           nh > 4 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 4))) : 0,
           nh > 5 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 5))) : 0,
           nh > 6 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + nh - 1))) : 0);
  }
}

__global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *counters) {
  auto const &c = *counters;
  printf(
      "||Counters | nEvents | nHits | nCells | nTuples | nFitTracks | nLooseTracks | nGoodTracks | nUsedHits | "
      "nDupHits | "
      "nFishCells | "
      "nKilledCells | "
      "nUsedCells | nZeroTrackCells ||\n");
  printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",
         c.nEvents,
         c.nHits,
         c.nCells,
         c.nTuples,
         c.nFitTracks,
         c.nLooseTracks,
         c.nGoodTracks,
         c.nUsedHits,
         c.nDupHits,
         c.nFishCells,
         c.nKilledCells,
         c.nEmptyCells,
         c.nZeroTrackCells);
  printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| %.3f| %.3f||\n",
         c.nEvents,
         c.nHits / double(c.nEvents),
         c.nCells / double(c.nEvents),
         c.nTuples / double(c.nEvents),
         c.nFitTracks / double(c.nEvents),
         c.nLooseTracks / double(c.nEvents),
         c.nGoodTracks / double(c.nEvents),
         c.nUsedHits / double(c.nEvents),
         c.nDupHits / double(c.nEvents),
         c.nFishCells / double(c.nCells),
         c.nKilledCells / double(c.nCells),
         c.nEmptyCells / double(c.nCells),
         c.nZeroTrackCells / double(c.nCells));
}