CAHitNtupletGeneratorKernelsImpl.h

//
// Original Author: Felice Pantaleo, CERN
//

// #define NTUPLE_DEBUG
// #define GPU_DEBUG

#include <cmath>
#include <cstdint>
#include <limits>

#include <cuda_runtime.h>

#include "CAConstants.h"
#include "CAHitNtupletGeneratorKernels.h"
#include "GPUCACell.h"
#include "gpuFishbone.h"
#include "gpuPixelDoublets.h"

using HitsOnGPU = TrackingRecHit2DSOAView;
using HitsOnCPU = TrackingRecHit2DGPU;

using HitToTuple = caConstants::HitToTuple;
using TupleMultiplicity = caConstants::TupleMultiplicity;

using Quality = pixelTrack::Quality;
using TkSoA = pixelTrack::TrackSoA;
using HitContainer = pixelTrack::HitContainer;

namespace {

  constexpr uint16_t tkNotFound = std::numeric_limits<uint16_t>::max();
  constexpr float maxScore = std::numeric_limits<float>::max();
  constexpr float nSigma2 = 25.f;

}  // namespace
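
/* nSigma2 = 25.f is used by the duplicate removers below as a squared 5-sigma
 * window: two parameters a and b, with variances e2a and e2b, count as compatible
 * when (a - b)^2 < nSigma2 * (e2a + e2b). A minimal sketch of that test
 * (illustration only, not part of the original file):
 *
 *   __device__ bool compatible(float a, float e2a, float b, float e2b) {
 *     return (a - b) * (a - b) < nSigma2 * (e2a + e2b);
 *   }
 */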

__global__ void kernel_checkOverflows(HitContainer const *foundNtuplets,
                                      caConstants::TupleMultiplicity const *tupleMultiplicity,
                                      CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple,
                                      cms::cuda::AtomicPairCounter *apc,
                                      GPUCACell const *__restrict__ cells,
                                      uint32_t const *__restrict__ nCells,
                                      gpuPixelDoublets::CellNeighborsVector const *cellNeighbors,
                                      gpuPixelDoublets::CellTracksVector const *cellTracks,
                                      GPUCACell::OuterHitOfCell const isOuterHitOfCell,
                                      int32_t nHits,
                                      uint32_t maxNumberOfDoublets,
                                      CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
  auto first = threadIdx.x + blockIdx.x * blockDim.x;

  auto &c = *counters;
  // counters once per event
  if (0 == first) {
    atomicAdd(&c.nEvents, 1);
    atomicAdd(&c.nHits, nHits);
    atomicAdd(&c.nCells, *nCells);
    atomicAdd(&c.nTuples, apc->get().m);
    atomicAdd(&c.nFitTracks, tupleMultiplicity->size());
  }

#ifdef NTUPLE_DEBUG
  if (0 == first) {
    printf("number of found cells %d, found tuples %d with total hits %d out of %d %d\n",
           *nCells,
           apc->get().m,
           apc->get().n,
           nHits,
           hitToTuple->totOnes());
    if (apc->get().m < caConstants::maxNumberOfQuadruplets) {
      assert(foundNtuplets->size(apc->get().m) == 0);
      assert(foundNtuplets->size() == apc->get().n);
    }
  }

  for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) {
    if (foundNtuplets->size(idx) > 7)  // current real limit
      printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx));
    assert(foundNtuplets->size(idx) <= caConstants::maxHitsOnTrack);
    for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih)
      assert(int(*ih) < nHits);
  }
#endif

  if (0 == first) {
    if (apc->get().m >= caConstants::maxNumberOfQuadruplets)
      printf("Tuples overflow\n");
    if (*nCells >= maxNumberOfDoublets)
      printf("Cells overflow\n");
    if (cellNeighbors && cellNeighbors->full())
      printf("cellNeighbors overflow\n");
    if (cellTracks && cellTracks->full())
      printf("cellTracks overflow\n");
    if (int(hitToTuple->nOnes()) < nHits)
      printf("ERROR hitToTuple overflow %d %d\n", hitToTuple->nOnes(), nHits);
  }

  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (thisCell.hasFishbone() && !thisCell.isKilled())
      atomicAdd(&c.nFishCells, 1);
    if (thisCell.outerNeighbors().full())  // ++tooManyNeighbors[thisCell.theLayerPairId];
      printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId());
    if (thisCell.tracks().full())  // ++tooManyTracks[thisCell.theLayerPairId];
      printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId());
    if (thisCell.isKilled())
      atomicAdd(&c.nKilledCells, 1);
    if (!thisCell.unused())
      atomicAdd(&c.nEmptyCells, 1);  // despite the name, this counts cells that are in use (printed as nUsedCells)
    if ((0 == hitToTuple->size(thisCell.inner_hit_id())) && (0 == hitToTuple->size(thisCell.outer_hit_id())))
      atomicAdd(&c.nZeroTrackCells, 1);
  }

  for (int idx = first, nt = nHits - isOuterHitOfCell.offset; idx < nt; idx += gridDim.x * blockDim.x) {
    if (isOuterHitOfCell.container[idx].full())  // ++tooManyOuterHitOfCell;
      printf("OuterHitOfCell overflow %d\n", idx);
  }
}
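
/* All kernels in this file iterate with the same grid-stride idiom seen above, so
 * any launch configuration covers the whole input. A self-contained sketch of the
 * pattern (illustration only; saxpy is a made-up example, not part of this file):
 *
 *   __global__ void saxpy(float a, float const *x, float *y, int n) {
 *     auto first = threadIdx.x + blockIdx.x * blockDim.x;
 *     for (int i = first; i < n; i += gridDim.x * blockDim.x)
 *       y[i] = a * x[i] + y[i];
 *   }
 */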

__global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *__restrict__ nCells, Quality *quality) {
  constexpr auto reject = pixelTrack::Quality::dup;

  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (!thisCell.isKilled())
      continue;

    for (auto it : thisCell.tracks())
      quality[it] = reject;
  }
}
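
/* The cleaners below compare Quality values with <, <= and >, so they rely on
 * pixelTrack::Quality being an ordered ladder. The ordering assumed throughout
 * this file is: bad < edup < dup < loose < strict < tight < highPurity.
 */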

// remove shorter tracks if sharing a cell
// It does not seem to affect efficiency in any way!
__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
                                             uint32_t const *__restrict__ nCells,
                                             TkSoA const *__restrict__ ptracks,
                                             Quality *quality,
                                             bool dupPassThrough) {
  // quality to mark rejected
  constexpr auto reject = pixelTrack::Quality::edup;  /// cannot be loose

  auto const &tracks = *ptracks;

  assert(nCells);
  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];

    if (thisCell.tracks().size() < 2)
      continue;

    int8_t maxNl = 0;

    // find maxNl
    for (auto it : thisCell.tracks()) {
      auto nl = tracks.nLayers(it);
      maxNl = std::max(nl, maxNl);
    }

    // if (maxNl < 4) continue;
    // quad pass through (leave it here for tests)
    // maxNl = std::min(4, maxNl);

    for (auto it : thisCell.tracks()) {
      if (tracks.nLayers(it) < maxNl)
        quality[it] = reject;  // no race: simple assignment of the same constant
    }
  }
}
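
/* Note that rejected tracks are marked edup here, not dup or loose: the
 * multiplicity and classification kernels further down test quality[it] == edup
 * explicitly, so these tracks are dropped before any fit is attempted.
 */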

// assume the above (so, short tracks already removed)
__global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells,
                                            uint32_t const *__restrict__ nCells,
                                            TkSoA *__restrict__ tracks,
                                            bool dupPassThrough) {
  // quality to mark rejected
  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
  constexpr auto loose = pixelTrack::Quality::loose;

  assert(nCells);

  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (thisCell.tracks().size() < 2)
      continue;

    float mc = maxScore;
    uint16_t im = tkNotFound;

    /* chi2 penalizes higher-pt tracks (try to rescale it?)
    auto score = [&](auto it) {
      return tracks->nLayers(it) < 4 ?
          std::abs(tracks->tip(it)) :  // tip for triplets
          tracks->chi2(it);            // chi2 for quads
    };
    */

    auto score = [&](auto it) { return std::abs(tracks->tip(it)); };

    // full crazy combinatorics
    int ntr = thisCell.tracks().size();
    for (int i = 0; i < ntr - 1; ++i) {
      auto it = thisCell.tracks()[i];
      auto qi = tracks->quality(it);
      if (qi <= reject)
        continue;
      auto opi = tracks->stateAtBS.state(it)(2);
      auto e2opi = tracks->stateAtBS.covariance(it)(9);
      auto cti = tracks->stateAtBS.state(it)(3);
      auto e2cti = tracks->stateAtBS.covariance(it)(12);
      for (auto j = i + 1; j < ntr; ++j) {
        auto jt = thisCell.tracks()[j];
        auto qj = tracks->quality(jt);
        if (qj <= reject)
          continue;
        auto opj = tracks->stateAtBS.state(jt)(2);
        auto ctj = tracks->stateAtBS.state(jt)(3);
        auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti);
        if ((cti - ctj) * (cti - ctj) > dct)
          continue;
        auto dop = nSigma2 * (tracks->stateAtBS.covariance(jt)(9) + e2opi);
        if ((opi - opj) * (opi - opj) > dop)
          continue;
        if ((qj < qi) || (qj == qi && score(it) < score(jt)))
          tracks->quality(jt) = reject;
        else {
          tracks->quality(it) = reject;
          break;
        }
      }
    }

    // find maxQual
    auto maxQual = reject;  // no duplicate!
    for (auto it : thisCell.tracks()) {
      if (tracks->quality(it) > maxQual)
        maxQual = tracks->quality(it);
    }

    if (maxQual <= loose)
      continue;

    // find min score
    for (auto it : thisCell.tracks()) {
      if (tracks->quality(it) == maxQual && score(it) < mc) {
        mc = score(it);
        im = it;
      }
    }

    if (tkNotFound == im)
      continue;

    // mark all other duplicates (not yet, keep it loose)
    for (auto it : thisCell.tracks()) {
      if (tracks->quality(it) > loose && it != im)
        tracks->quality(it) = loose;  // no race: simple assignment of the same constant
    }
  }
}
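
/* On the indices used above: state(2)/covariance(9) and state(3)/covariance(12)
 * pair a fit parameter with its variance. This is consistent with a 5x5 symmetric
 * covariance stored as its upper triangle, row-major, where the diagonal element
 * (i,i) sits at flat index i*5 - i*(i-1)/2 (an assumption about the SoA layout,
 * recorded here for reference, not from the original file):
 *
 *   constexpr int diagIndex(int i) { return i * 5 - i * (i - 1) / 2; }
 *   static_assert(diagIndex(2) == 9 && diagIndex(3) == 12, "packing as assumed");
 */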

__global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1,
                               cms::cuda::AtomicPairCounter *apc2,  // just to zero them
                               GPUCACell::Hits const *__restrict__ hhp,
                               GPUCACell *cells,
                               uint32_t const *__restrict__ nCells,
                               gpuPixelDoublets::CellNeighborsVector *cellNeighbors,
                               GPUCACell::OuterHitOfCell const isOuterHitOfCell,
                               float hardCurvCut,
                               float ptmin,
                               float CAThetaCutBarrel,
                               float CAThetaCutForward,
                               float dcaCutInnerTriplet,
                               float dcaCutOuterTriplet) {
  auto const &hh = *hhp;

  auto firstCellIndex = threadIdx.y + blockIdx.y * blockDim.y;
  auto first = threadIdx.x;
  auto stride = blockDim.x;

  if (0 == (firstCellIndex + first)) {
    (*apc1) = 0;
    (*apc2) = 0;
  }  // ready for next kernel

  for (int idx = firstCellIndex, nt = (*nCells); idx < nt; idx += gridDim.y * blockDim.y) {
    auto cellIndex = idx;
    auto &thisCell = cells[idx];
    auto innerHitId = thisCell.inner_hit_id();
    if (int(innerHitId) < isOuterHitOfCell.offset)
      continue;
    int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size();
    auto vi = isOuterHitOfCell[innerHitId].data();

    auto ri = thisCell.inner_r(hh);
    auto zi = thisCell.inner_z(hh);

    auto ro = thisCell.outer_r(hh);
    auto zo = thisCell.outer_z(hh);
    auto isBarrel = thisCell.inner_detIndex(hh) < caConstants::last_barrel_detIndex;

    for (int j = first; j < numberOfPossibleNeighbors; j += stride) {
      auto otherCell = __ldg(vi + j);
      auto &oc = cells[otherCell];
      auto r1 = oc.inner_r(hh);
      auto z1 = oc.inner_z(hh);
      bool aligned = GPUCACell::areAlignedRZ(
          r1,
          z1,
          ri,
          zi,
          ro,
          zo,
          ptmin,
          isBarrel ? CAThetaCutBarrel : CAThetaCutForward);  // 2.f*thetaCut); // FIXME tune cuts
      if (aligned && thisCell.dcaCut(hh,
                                     oc,
                                     oc.inner_detIndex(hh) < caConstants::last_bpix1_detIndex ? dcaCutInnerTriplet
                                                                                              : dcaCutOuterTriplet,
                                     hardCurvCut)) {  // FIXME tune cuts
        oc.addOuterNeighbor(cellIndex, *cellNeighbors);
        thisCell.setStatusBits(GPUCACell::StatusBit::kUsed);
        oc.setStatusBits(GPUCACell::StatusBit::kUsed);
      }
    }  // loop on inner cells
  }  // loop on outer cells
}
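
/* kernel_connect uses a 2D thread layout: the y dimension strides over the cells
 * (doublets) and the x dimension strides over the candidate inner neighbors of
 * each cell. A plausible host-side launch (a sketch only; the actual stride and
 * block sizes are chosen in the host code and may differ):
 *
 *   dim3 thrs(stride, blockSize, 1);  // e.g. stride = 4, blockSize = 16
 *   dim3 blks(1, (nDoublets + blockSize - 1) / blockSize, 1);
 *   kernel_connect<<<blks, thrs, 0, stream>>>(...);
 */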

__global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp,
                                     GPUCACell *__restrict__ cells,
                                     uint32_t const *nCells,
                                     gpuPixelDoublets::CellTracksVector *cellTracks,
                                     HitContainer *foundNtuplets,
                                     cms::cuda::AtomicPairCounter *apc,
                                     Quality *__restrict__ quality,
                                     unsigned int minHitsPerNtuplet) {
  // recursive: not obvious to widen
  auto const &hh = *hhp;

  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (thisCell.isKilled())
      continue;  // cut by earlyFishbone
    // we require at least three hits...
    if (thisCell.outerNeighbors().empty())
      continue;
    auto pid = thisCell.layerPairId();
    auto doit = minHitsPerNtuplet > 3 ? pid < 3 : pid < 8 || pid > 12;
    if (doit) {
      GPUCACell::TmpTuple stack;
      stack.reset();
      thisCell.find_ntuplets<6>(
          hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3);
      assert(stack.empty());
      // printf("in %d found quadruplets: %d\n", cellIndex, apc->get());
    }
  }
}
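
/* find_ntuplets walks outerNeighbors() depth-first, keeping the current path in
 * the per-thread TmpTuple stack; the template argument bounds the depth at 6
 * layers, which is why the comment above notes it is "not obvious to widen".
 */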

__global__ void kernel_mark_used(GPUCACell *__restrict__ cells, uint32_t const *nCells) {
  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto &thisCell = cells[idx];
    if (!thisCell.tracks().empty())
      thisCell.setStatusBits(GPUCACell::StatusBit::kInTrack);
  }
}

__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets,
                                         Quality const *__restrict__ quality,
                                         caConstants::TupleMultiplicity *tupleMultiplicity) {
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
    auto nhits = foundNtuplets->size(it);
    if (nhits < 3)
      continue;
    if (quality[it] == pixelTrack::Quality::edup)
      continue;
    assert(quality[it] == pixelTrack::Quality::bad);
    if (nhits > 7)  // current limit
      printf("wrong mult %d %d\n", it, nhits);
    assert(nhits <= caConstants::maxHitsOnTrack);
    tupleMultiplicity->count(nhits);
  }
}

__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets,
                                        Quality const *__restrict__ quality,
                                        caConstants::TupleMultiplicity *tupleMultiplicity) {
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
    auto nhits = foundNtuplets->size(it);
    if (nhits < 3)
      continue;
    if (quality[it] == pixelTrack::Quality::edup)
      continue;
    assert(quality[it] == pixelTrack::Quality::bad);
    if (nhits > 7)
      printf("wrong mult %d %d\n", it, nhits);
    assert(nhits <= caConstants::maxHitsOnTrack);
    tupleMultiplicity->fill(nhits, it);
  }
}
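
/* kernel_countMultiplicity / kernel_fillMultiplicity (and, further below,
 * kernel_countHitInTracks / kernel_fillHitInTracks) are the standard two-pass fill
 * of a cms::cuda::OneToManyAssoc: the first pass only counts entries per key, the
 * host then runs the prefix sum over the offsets (a finalize step, assumed here to
 * be done host-side, e.g. via cms::cuda::launchFinalize), and a second, otherwise
 * identical pass writes the contents.
 */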

__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
                                      TkSoA const *__restrict__ tracks,
                                      CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts,
                                      Quality *__restrict__ quality) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
    auto nhits = tuples->size(it);
    if (nhits == 0)
      break;  // guard

    // if duplicate: not even fit
    if (quality[it] == pixelTrack::Quality::edup)
      continue;

    assert(quality[it] == pixelTrack::Quality::bad);

    // mark doublets as bad
    if (nhits < 3)
      continue;

    // if the fit has any invalid parameters, mark it as bad
    bool isNaN = false;
    for (int i = 0; i < 5; ++i) {
      isNaN |= std::isnan(tracks->stateAtBS.state(it)(i));
    }
    if (isNaN) {
#ifdef NTUPLE_DEBUG
      printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it));
#endif
      continue;
    }

    quality[it] = pixelTrack::Quality::strict;

    // compute a pT-dependent chi2 cut

    auto roughLog = [](float x) {
      // max diff [0.5,12] at 1.25 0.16143
      // average diff 0.0662998
      union IF {
        uint32_t i;
        float f;
      };
      IF z;
      z.f = x;

      uint32_t lsb = 1 < 21;  // NB: this evaluates to 1, not (1 << 21); with the bin-averaged
      z.i += lsb;             // frac[] below, plain truncation into bins is the intended behaviour
      z.i >>= 21;
      auto f = z.i & 3;
      int ex = int(z.i >> 2) - 127;

      // log2(1+0.25*f)
      // averaged over bins
      const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f};
      return float(ex) + frac[f];
    };
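
    // Worked example of the bit trick above: roughLog keeps the float's exponent
    // plus its top two mantissa bits, i.e. log2(x) quantized into quarter-octave
    // bins. For x = 6.0f (bits 0x40C00000): z.i >> 21 = 518, f = 518 & 3 = 2,
    // ex = (518 >> 2) - 127 = 2, so roughLog(6.0f) = 2 + frac[2] = 2.695,
    // against log2(6) = 2.585.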

    // (see CAHitNtupletGeneratorGPU.cc)
    float pt = std::min<float>(tracks->pt(it), cuts.chi2MaxPt);
    float chi2Cut = cuts.chi2Scale * (cuts.chi2Coeff[0] + roughLog(pt) * cuts.chi2Coeff[1]);
    if (tracks->chi2(it) >= chi2Cut) {
#ifdef NTUPLE_FIT_DEBUG
      printf("Bad chi2 %d size %d pt %f eta %f chi2 %f\n",
             it,
             tuples->size(it),
             tracks->pt(it),
             tracks->eta(it),
             tracks->chi2(it));
#endif
      continue;
    }

    quality[it] = pixelTrack::Quality::tight;

    // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta), Zip)
    // default cuts:
    //   - for triplets:    |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm
    //   - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm
    // (see CAHitNtupletGeneratorGPU.cc)
    auto const &region = (nhits > 3) ? cuts.quadruplet : cuts.triplet;
    bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and
                (std::abs(tracks->zip(it)) < region.maxZip);

    if (isOk)
      quality[it] = pixelTrack::Quality::highPurity;
  }
}

__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples,
                                        Quality const *__restrict__ quality,
                                        CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  // guard
    if (quality[idx] < pixelTrack::Quality::loose)
      continue;
    atomicAdd(&(counters->nLooseTracks), 1);
    if (quality[idx] < pixelTrack::Quality::strict)
      continue;
    atomicAdd(&(counters->nGoodTracks), 1);
  }
}

__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples,
                                        Quality const *__restrict__ quality,
                                        CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  // guard
    for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h)
      hitToTuple->count(*h);
  }
}

__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples,
                                       Quality const *__restrict__ quality,
                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  // guard
    for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h)
      hitToTuple->fill(*h, idx);
  }
}

__global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples,
                                         TrackingRecHit2DSOAView const *__restrict__ hhp,
                                         HitContainer *__restrict__ hitDetIndices) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  // copy offsets
  for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    hitDetIndices->off[idx] = tuples->off[idx];
  }
  // fill hit indices
  auto const &hh = *hhp;
  auto nhits = hh.nHits();
  for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) {
    assert(tuples->content[idx] < nhits);
    hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]);
  }
}

__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::AtomicPairCounter *apc) {
  auto &tracks = *ptracks;
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  // clamp the number of tracks to the capacity of the SoA
  auto ntracks = std::min<int>(apc->get().m, tracks.stride() - 1);
  if (0 == first)
    tracks.setNTracks(ntracks);
  for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) {
    auto nHits = tracks.nHits(idx);
    assert(nHits >= 3);
    tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx);
  }
}

__global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ hitToTuple,
                                             CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
  auto &c = *counters;
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple->size(idx) == 0)
      continue;  // SHALL NOT BE break
    atomicAdd(&c.nUsedHits, 1);
    if (hitToTuple->size(idx) > 1)
      atomicAdd(&c.nDupHits, 1);
  }
}

__global__ void kernel_countSharedHit(int *__restrict__ nshared,
                                      HitContainer const *__restrict__ ptuples,
                                      Quality const *__restrict__ quality,
                                      CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  constexpr auto loose = pixelTrack::Quality::loose;

  auto &hitToTuple = *phitToTuple;
  auto const &foundNtuplets = *ptuples;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    int nt = 0;

    // count "good" tracks
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (quality[*it] < loose)
        continue;
      ++nt;
    }

    if (nt < 2)
      continue;

    // now mark each track triplet as sharing a hit
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (foundNtuplets.size(*it) > 3)
        continue;
      atomicAdd(&nshared[*it], 1);
    }

  }  // hit loop
}

__global__ void kernel_markSharedHit(int const *__restrict__ nshared,
                                     HitContainer const *__restrict__ tuples,
                                     Quality *__restrict__ quality,
                                     bool dupPassThrough) {
  // constexpr auto bad = pixelTrack::Quality::bad;
  constexpr auto dup = pixelTrack::Quality::dup;
  constexpr auto loose = pixelTrack::Quality::loose;
  // constexpr auto strict = pixelTrack::Quality::strict;

  // quality to mark rejected
  auto const reject = dupPassThrough ? loose : dup;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  // guard
    if (quality[idx] <= reject)
      continue;
    if (nshared[idx] > 2)
      quality[idx] = reject;
  }
}

// mostly for very forward triplets.....
__global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks,
                                       Quality *__restrict__ quality,
                                       uint16_t nmin,
                                       bool dupPassThrough,
                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    /* chi2 is bad for large pt
    auto score = [&](auto it, auto nl) {
      return nl < 4 ? std::abs(tracks.tip(it)) :  // tip for triplets
                      tracks.chi2(it);            // chi2
    };
    */
    auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); };

    // full combinatorics
    for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) {
      auto const it = *ip;
      auto qi = quality[it];
      if (qi <= reject)
        continue;
      auto opi = tracks.stateAtBS.state(it)(2);
      auto e2opi = tracks.stateAtBS.covariance(it)(9);
      auto cti = tracks.stateAtBS.state(it)(3);
      auto e2cti = tracks.stateAtBS.covariance(it)(12);
      auto nli = tracks.nLayers(it);
      for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) {
        auto const jt = *jp;
        auto qj = quality[jt];
        if (qj <= reject)
          continue;
        auto opj = tracks.stateAtBS.state(jt)(2);
        auto ctj = tracks.stateAtBS.state(jt)(3);
        auto dct = nSigma2 * (tracks.stateAtBS.covariance(jt)(12) + e2cti);
        if ((cti - ctj) * (cti - ctj) > dct)
          continue;
        auto dop = nSigma2 * (tracks.stateAtBS.covariance(jt)(9) + e2opi);
        if ((opi - opj) * (opi - opj) > dop)
          continue;
        auto nlj = tracks.nLayers(jt);
        if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj)))))
          quality[jt] = reject;
        else {
          quality[it] = reject;
          break;
        }
      }
    }
  }
}

__global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp,
                                        TkSoA const *__restrict__ ptracks,
                                        Quality *__restrict__ quality,
                                        int nmin,
                                        bool dupPassThrough,
                                        CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
  // quality of longest track
  auto const longTqual = pixelTrack::Quality::highPurity;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  auto const &hh = *hhp;
  int l1end = hh.hitsLayerStart()[1];

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    int8_t maxNl = 0;

    // find maxNl
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (quality[*it] < longTqual)
        continue;
      // if (tracks.nHits(*it) == 3) continue;
      auto nl = tracks.nLayers(*it);
      maxNl = std::max(nl, maxNl);
    }

    if (maxNl < 4)
      continue;

    // quad pass through (leave for tests)
    // maxNl = std::min(4, maxNl);

    // kill all tracks shorter than maxNl (only triplets???)
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      auto nl = tracks.nLayers(*it);

      // check if the shared hit is on bpix1 and if the tuple is short enough
      if (idx < l1end and nl > nmin)
        continue;

      if (nl < maxNl && quality[*it] > reject)
        quality[*it] = reject;
    }
  }
}

__global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks,
                                      Quality *__restrict__ quality,
                                      uint16_t nmin,
                                      bool dupPassThrough,
                                      CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = pixelTrack::Quality::loose;
  /// min quality of good
  auto const good = pixelTrack::Quality::strict;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    float mc = maxScore;
    uint16_t im = tkNotFound;
    bool onlyTriplets = true;

    // check if only triplets
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (quality[*it] <= good)
        continue;
      onlyTriplets &= tracks.isTriplet(*it);
      if (!onlyTriplets)
        break;
    }

    // only triplets
    if (!onlyTriplets)
      continue;

    // for triplets choose best tip! (should we first find best quality???)
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) {
        mc = std::abs(tracks.tip(it));
        im = it;
      }
    }

    if (tkNotFound == im)
      continue;

    // mark worse ambiguities
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] > reject && it != im)
        quality[it] = reject;  // no race: simple assignment of the same constant
    }

  }  // loop over hits
}

__global__ void kernel_simpleTripletCleaner(TkSoA const *__restrict__ ptracks,
                                            Quality *__restrict__ quality,
                                            uint16_t nmin,
                                            bool dupPassThrough,
                                            CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = pixelTrack::Quality::loose;
  /// min quality of good
  auto const good = pixelTrack::Quality::loose;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    float mc = maxScore;
    uint16_t im = tkNotFound;

    // choose best tip! (should we first find best quality???)
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) {
        mc = std::abs(tracks.tip(it));
        im = it;
      }
    }

    if (tkNotFound == im)
      continue;

    // mark worse ambiguities
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] > reject && tracks.isTriplet(it) && it != im)
        quality[it] = reject;  // no race: simple assignment of the same constant
    }

  }  // loop over hits
}

__global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp,
                                            HitContainer const *__restrict__ ptuples,
                                            TkSoA const *__restrict__ ptracks,
                                            Quality const *__restrict__ quality,
                                            CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple,
                                            int32_t firstPrint,
                                            int32_t lastPrint,
                                            int iev) {
  constexpr auto loose = pixelTrack::Quality::loose;
  auto const &hh = *hhp;
  auto const &foundNtuplets = *ptuples;
  auto const &tracks = *ptracks;
  int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x;
  for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) {
    auto nh = foundNtuplets.size(i);
    if (nh < 3)
      continue;
    if (quality[i] < loose)
      continue;
    printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n",
           10000 * iev + i,
           int(quality[i]),
           nh,
           tracks.nLayers(i),
           tracks.charge(i),
           tracks.pt(i),
           tracks.eta(i),
           tracks.phi(i),
           tracks.tip(i),
           tracks.zip(i),
           // asinhf(fit_results[i].par(3)),
           tracks.chi2(i),
           hh.zGlobal(*foundNtuplets.begin(i)),
           hh.zGlobal(*(foundNtuplets.begin(i) + 1)),
           hh.zGlobal(*(foundNtuplets.begin(i) + 2)),
           nh > 3 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 3))) : 0,
           nh > 4 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 4))) : 0,
           nh > 5 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 5))) : 0,
           nh > 6 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + nh - 1))) : 0);
  }
}

__global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *counters) {
  auto const &c = *counters;
  printf(
      "||Counters | nEvents | nHits | nCells | nTuples | nFitTracks | nLooseTracks | nGoodTracks | nUsedHits | "
      "nDupHits | nFishCells | nKilledCells | nUsedCells | nZeroTrackCells ||\n");
  printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",
         c.nEvents,
         c.nHits,
         c.nCells,
         c.nTuples,
         c.nFitTracks,
         c.nLooseTracks,
         c.nGoodTracks,
         c.nUsedHits,
         c.nDupHits,
         c.nFishCells,
         c.nKilledCells,
         c.nEmptyCells,
         c.nZeroTrackCells);
  printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| %.3f| %.3f||\n",
         c.nEvents,
         c.nHits / double(c.nEvents),
         c.nCells / double(c.nEvents),
         c.nTuples / double(c.nEvents),
         c.nFitTracks / double(c.nEvents),
         c.nLooseTracks / double(c.nEvents),
         c.nGoodTracks / double(c.nEvents),
         c.nUsedHits / double(c.nEvents),
         c.nDupHits / double(c.nEvents),
         c.nFishCells / double(c.nCells),
         c.nKilledCells / double(c.nCells),
         c.nEmptyCells / double(c.nCells),
         c.nZeroTrackCells / double(c.nCells));
}
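
/* kernel_printCounters has no thread-index guard, so it is meant to be launched
 * with a single thread, e.g. (a sketch of the expected host-side call):
 *
 *   kernel_printCounters<<<1, 1>>>(counters);
 */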