CAHitNtupletGeneratorKernelsImpl.h
//
// Original Author: Felice Pantaleo, CERN
//

// #define NTUPLE_DEBUG
// #define GPU_DEBUG

#include <cmath>
#include <cstdint>
#include <limits>

#include <cuda_runtime.h>

#include "CAConstants.h"
#include "CAHitNtupletGeneratorKernels.h"
#include "GPUCACell.h"
#include "gpuFishbone.h"
#include "gpuPixelDoublets.h"

using HitsOnGPU = TrackingRecHit2DSOAView;
using HitsOnCPU = TrackingRecHit2DGPU;

using HitToTuple = caConstants::HitToTuple;
using TupleMultiplicity = caConstants::TupleMultiplicity;

using Quality = pixelTrack::Quality;
using TkSoA = pixelTrack::TrackSoA;
using HitContainer = pixelTrack::HitContainer;

namespace {

  constexpr uint16_t tkNotFound = std::numeric_limits<uint16_t>::max();
  constexpr float maxScore = std::numeric_limits<float>::max();
  constexpr float nSigma2 = 25.f;

}  // namespace

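// tkNotFound is a sentinel track index and maxScore seeds "best score"
// searches; nSigma2 = 25.f is a squared-significance window: two parameters a
// and b are treated as compatible when (a - b)^2 < nSigma2 * (var_a + var_b),
// i.e. a 5-sigma cut, as used by the duplicate removers below.
//
// All kernels in this file iterate with the CUDA grid-stride idiom, so any
// launch configuration covers the full index range. A minimal sketch of the
// pattern (illustrative only, not part of this file):
//
//   __global__ void forEachExample(int n, int *v) {
//     auto first = threadIdx.x + blockIdx.x * blockDim.x;
//     for (int i = first; i < n; i += gridDim.x * blockDim.x)
//       v[i] += 1;  // each thread handles i, i + gridSize, i + 2*gridSize, ...
//   }
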
__global__ void kernel_checkOverflows(HitContainer const *foundNtuplets,
                                      caConstants::TupleMultiplicity const *tupleMultiplicity,
                                      CAHitNtupletGeneratorKernelsGPU::HitToTuple const *hitToTuple,
                                      cms::cuda::AtomicPairCounter *apc,
                                      GPUCACell const *__restrict__ cells,
                                      uint32_t const *__restrict__ nCells,
                                      gpuPixelDoublets::CellNeighborsVector const *cellNeighbors,
                                      gpuPixelDoublets::CellTracksVector const *cellTracks,
                                      GPUCACell::OuterHitOfCell const isOuterHitOfCell,
                                      int32_t nHits,
                                      uint32_t maxNumberOfDoublets,
                                      CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
  auto first = threadIdx.x + blockIdx.x * blockDim.x;

  auto &c = *counters;
  // counters once per event
  if (0 == first) {
    atomicAdd(&c.nEvents, 1);
    atomicAdd(&c.nHits, nHits);
    atomicAdd(&c.nCells, *nCells);
    atomicAdd(&c.nTuples, apc->get().m);
    atomicAdd(&c.nFitTracks, tupleMultiplicity->size());
  }

#ifdef NTUPLE_DEBUG
  if (0 == first) {
    printf("number of found cells %d, found tuples %d with total hits %d out of %d %d\n",
           *nCells,
           apc->get().m,
           apc->get().n,
           nHits,
           hitToTuple->totOnes());
    if (apc->get().m < caConstants::maxNumberOfQuadruplets) {
      assert(foundNtuplets->size(apc->get().m) == 0);
      assert(foundNtuplets->size() == apc->get().n);
    }
  }

  for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) {
    if (foundNtuplets->size(idx) > 7)  // current real limit
      printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx));
    assert(foundNtuplets->size(idx) <= caConstants::maxHitsOnTrack);
    for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih)
      assert(int(*ih) < nHits);
  }
#endif

  if (0 == first) {
    if (apc->get().m >= caConstants::maxNumberOfQuadruplets)
      printf("Tuples overflow\n");
    if (*nCells >= maxNumberOfDoublets)
      printf("Cells overflow\n");
    if (cellNeighbors && cellNeighbors->full())
      printf("cellNeighbors overflow\n");
    if (cellTracks && cellTracks->full())
      printf("cellTracks overflow\n");
    if (int(hitToTuple->nOnes()) < nHits)
      printf("ERROR hitToTuple overflow %d %d\n", hitToTuple->nOnes(), nHits);
  }

  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (thisCell.outerNeighbors().full())  //++tooManyNeighbors[thisCell.theLayerPairId];
      printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId());
    if (thisCell.tracks().full())  //++tooManyTracks[thisCell.theLayerPairId];
      printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId());
    if (thisCell.isKilled())
      atomicAdd(&c.nKilledCells, 1);
    if (!thisCell.unused())
      atomicAdd(&c.nEmptyCells, 1);
    if ((0 == hitToTuple->size(thisCell.inner_hit_id())) && (0 == hitToTuple->size(thisCell.outer_hit_id())))
      atomicAdd(&c.nZeroTrackCells, 1);
  }

  for (int idx = first, nt = nHits - isOuterHitOfCell.offset; idx < nt; idx += gridDim.x * blockDim.x) {
    if (isOuterHitOfCell.container[idx].full())  // ++tooManyOuterHitOfCell;
      printf("OuterHitOfCell overflow %d\n", idx);
  }
}

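// Reject (mark as Quality::dup) every track that passes through a cell killed
// by the fishbone (see gpuFishbone.h).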
__global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *__restrict__ nCells, Quality *quality) {
  constexpr auto reject = pixelTrack::Quality::dup;

  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (!thisCell.isKilled())
      continue;

    for (auto it : thisCell.tracks())
      quality[it] = reject;
  }
}

// Remove shorter tracks that share a cell.
// It does not seem to affect efficiency in any way!
__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
                                             uint32_t const *__restrict__ nCells,
                                             TkSoA const *__restrict__ ptracks,
                                             Quality *quality,
                                             bool dupPassThrough) {
  // quality to mark rejected
  constexpr auto reject = pixelTrack::Quality::edup;  /// cannot be loose

  auto const &tracks = *ptracks;

  assert(nCells);
  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];

    if (thisCell.tracks().size() < 2)
      continue;

    int8_t maxNl = 0;

    // find maxNl
    for (auto it : thisCell.tracks()) {
      auto nl = tracks.nLayers(it);
      maxNl = std::max(nl, maxNl);
    }

    // if (maxNl < 4) continue;
    // quad pass through (leave it here for tests)
    // maxNl = std::min(4, maxNl);

    for (auto it : thisCell.tracks()) {
      if (tracks.nLayers(it) < maxNl)
        quality[it] = reject;  // no race: simple assignment of the same constant
    }
  }
}

// Assumes the above has already run (so short duplicates are removed).
__global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells,
                                            uint32_t const *__restrict__ nCells,
                                            TkSoA *__restrict__ tracks,
                                            bool dupPassThrough) {
  // quality to mark rejected
  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
  constexpr auto loose = pixelTrack::Quality::loose;

  assert(nCells);

  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (thisCell.tracks().size() < 2)
      continue;

    float mc = maxScore;
    uint16_t im = tkNotFound;

    /* chi2 penalizes higher-pt tracks (try rescaling it?)
    auto score = [&](auto it) {
      return tracks->nHits(it) < 4 ?
             std::abs(tracks->tip(it)) :  // tip for triplets
             tracks->chi2(it);            // chi2 for quads
    };
    */

    auto score = [&](auto it) { return std::abs(tracks->tip(it)); };

    // full crazy combinatorics
    int ntr = thisCell.tracks().size();
    for (int i = 0; i < ntr; ++i) {
      auto it = thisCell.tracks()[i];
      auto qi = tracks->quality(it);
      if (qi <= reject)
        continue;
      auto opi = tracks->stateAtBS.state(it)(2);
      auto e2opi = tracks->stateAtBS.covariance(it)(9);
      auto cti = tracks->stateAtBS.state(it)(3);
      auto e2cti = tracks->stateAtBS.covariance(it)(12);
      for (auto j = i + 1; j < ntr; ++j) {
        auto jt = thisCell.tracks()[j];
        auto qj = tracks->quality(jt);
        if (qj <= reject)
          continue;
#ifdef GPU_DEBUG
        if (foundNtuplets->size(it) != foundNtuplets->size(jt))
          printf(" a mess\n");
#endif
        auto opj = tracks->stateAtBS.state(jt)(2);
        auto ctj = tracks->stateAtBS.state(jt)(3);
        auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti);
        if ((cti - ctj) * (cti - ctj) > dct)
          continue;
        auto dop = nSigma2 * (tracks->stateAtBS.covariance(jt)(9) + e2opi);
        if ((opi - opj) * (opi - opj) > dop)
          continue;
        if ((qj < qi) || (qj == qi && score(it) < score(jt)))
          tracks->quality(jt) = reject;
        else {
          tracks->quality(it) = reject;
          break;
        }
      }
    }

    // find maxQual
    auto maxQual = reject;  // no duplicate!
    for (auto it : thisCell.tracks()) {
      if (tracks->quality(it) > maxQual)
        maxQual = tracks->quality(it);
    }

    if (maxQual <= loose)
      continue;

    // find min score
    for (auto it : thisCell.tracks()) {
      if (tracks->quality(it) == maxQual && score(it) < mc) {
        mc = score(it);
        im = it;
      }
    }

    if (tkNotFound == im)
      continue;

    // mark all other duplicates (not yet, keep it loose)
    for (auto it : thisCell.tracks()) {
      if (tracks->quality(it) > loose && it != im)
        tracks->quality(it) = loose;  // no race: simple assignment of the same constant
    }
  }
}

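// In the pairwise loop above, two tracks are considered the same candidate
// when both state(2) and state(3) of the fit at the beam spot agree within
// 5 sigma (nSigma2 multiplies the summed variances); the lower-quality, or
// at equal quality the worse-score, member of each compatible pair is
// rejected, and among the survivors only the best-score track keeps a
// quality above loose.
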
__global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1,
                               cms::cuda::AtomicPairCounter *apc2,  // just to zero them
                               GPUCACell::Hits const *__restrict__ hhp,
                               GPUCACell *cells,
                               uint32_t const *__restrict__ nCells,
                               gpuPixelDoublets::CellNeighborsVector *cellNeighbors,
                               GPUCACell::OuterHitOfCell const isOuterHitOfCell,
                               float hardCurvCut,
                               float ptmin,
                               float CAThetaCutBarrel,
                               float CAThetaCutForward,
                               float dcaCutInnerTriplet,
                               float dcaCutOuterTriplet) {
  auto const &hh = *hhp;

  auto firstCellIndex = threadIdx.y + blockIdx.y * blockDim.y;
  auto first = threadIdx.x;
  auto stride = blockDim.x;

  if (0 == (firstCellIndex + first)) {
    (*apc1) = 0;
    (*apc2) = 0;
  }  // ready for next kernel

  for (int idx = firstCellIndex, nt = (*nCells); idx < nt; idx += gridDim.y * blockDim.y) {
    auto cellIndex = idx;
    auto &thisCell = cells[idx];
    auto innerHitId = thisCell.inner_hit_id();
    if (int(innerHitId) < isOuterHitOfCell.offset)
      continue;
    int numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size();
    auto vi = isOuterHitOfCell[innerHitId].data();

    auto ri = thisCell.inner_r(hh);
    auto zi = thisCell.inner_z(hh);

    auto ro = thisCell.outer_r(hh);
    auto zo = thisCell.outer_z(hh);
    auto isBarrel = thisCell.inner_detIndex(hh) < caConstants::last_barrel_detIndex;

    for (int j = first; j < numberOfPossibleNeighbors; j += stride) {
      auto otherCell = __ldg(vi + j);
      auto &oc = cells[otherCell];
      auto r1 = oc.inner_r(hh);
      auto z1 = oc.inner_z(hh);
      bool aligned = GPUCACell::areAlignedRZ(
          r1,
          z1,
          ri,
          zi,
          ro,
          zo,
          ptmin,
          isBarrel ? CAThetaCutBarrel : CAThetaCutForward);  // 2.f*thetaCut); // FIXME tune cuts
      if (aligned && thisCell.dcaCut(hh,
                                     oc,
                                     oc.inner_detIndex(hh) < caConstants::last_bpix1_detIndex ? dcaCutInnerTriplet
                                                                                              : dcaCutOuterTriplet,
                                     hardCurvCut)) {  // FIXME tune cuts
        oc.addOuterNeighbor(cellIndex, *cellNeighbors);
        thisCell.setStatusBits(GPUCACell::StatusBit::kUsed);
        oc.setStatusBits(GPUCACell::StatusBit::kUsed);
      }
    }  // loop on inner cells
  }  // loop on outer cells
}

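// Two cells are connected when (a) the three hits involved (the neighbor's
// inner hit plus this cell's inner and outer hits) are aligned in the r-z
// plane within a pT-dependent theta cut (barrel or forward), and (b) the
// resulting triplet passes a transverse impact parameter (dca) cut, tighter
// when the innermost hit is on BPix1, together with a hard curvature cut.
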
__global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp,
                                     GPUCACell *__restrict__ cells,
                                     uint32_t const *nCells,
                                     gpuPixelDoublets::CellTracksVector *cellTracks,
                                     HitContainer *foundNtuplets,
                                     cms::cuda::AtomicPairCounter *apc,
                                     Quality *__restrict__ quality,
                                     unsigned int minHitsPerNtuplet) {
  // recursive: not obvious to widen
  auto const &hh = *hhp;

  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto const &thisCell = cells[idx];
    if (thisCell.isKilled())
      continue;  // cut by earlyFishbone
    // we require at least three hits...
    if (thisCell.outerNeighbors().empty())
      continue;
    auto pid = thisCell.layerPairId();
    auto doit = minHitsPerNtuplet > 3 ? pid < 3 : pid < 8 || pid > 12;
    if (doit) {
      GPUCACell::TmpTuple stack;
      stack.reset();
      thisCell.find_ntuplets<6>(
          hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, minHitsPerNtuplet, pid < 3);
      assert(stack.empty());
      // printf("in %d found quadruplets: %d\n", cellIndex, apc->get());
    }
  }
}

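// find_ntuplets performs a recursive depth-first walk from each seeding cell,
// following outerNeighbors with an explicit fixed-depth temporary stack
// (template depth 6). Only selected layer pairs seed the walk: pid < 3 when
// quadruplets are required (minHitsPerNtuplet > 3), otherwise
// pid < 8 || pid > 12.
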
__global__ void kernel_mark_used(GPUCACell *__restrict__ cells, uint32_t const *nCells) {
  auto first = threadIdx.x + blockIdx.x * blockDim.x;
  for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
    auto &thisCell = cells[idx];
    if (!thisCell.tracks().empty())
      thisCell.setStatusBits(GPUCACell::StatusBit::kInTrack);
  }
}

__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets,
                                         Quality const *__restrict__ quality,
                                         caConstants::TupleMultiplicity *tupleMultiplicity) {
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
    auto nhits = foundNtuplets->size(it);
    if (nhits < 3)
      continue;
    if (quality[it] == pixelTrack::Quality::edup)
      continue;
    assert(quality[it] == pixelTrack::Quality::bad);
    if (nhits > 7)  // current limit
      printf("wrong mult %d %d\n", it, nhits);
    else
      tupleMultiplicity->count(nhits);
  }
}

__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets,
                                        Quality const *__restrict__ quality,
                                        caConstants::TupleMultiplicity *tupleMultiplicity) {
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
    auto nhits = foundNtuplets->size(it);
    if (nhits < 3)
      continue;
    if (quality[it] == pixelTrack::Quality::edup)
      continue;
    assert(quality[it] == pixelTrack::Quality::bad);
    if (nhits > 7)
      printf("wrong mult %d %d\n", it, nhits);
    else
      tupleMultiplicity->fill(nhits, it);
  }
}

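// kernel_countMultiplicity / kernel_fillMultiplicity are the usual two-pass
// construction of a cms::cuda::OneToManyAssoc histogram: the first pass
// counts entries per multiplicity bin and, after the offsets are finalized
// elsewhere, the second pass stores the tuple indices. The same count/fill
// pattern is used below to build the hit-to-tuple association.
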
__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
                                      TkSoA const *__restrict__ tracks,
                                      CAHitNtupletGeneratorKernelsGPU::QualityCuts cuts,
                                      Quality *__restrict__ quality) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) {
    auto nhits = tuples->size(it);
    if (nhits == 0)
      break;  // guard

    // if duplicate: not even fit
    if (quality[it] == pixelTrack::Quality::edup)
      continue;

    assert(quality[it] == pixelTrack::Quality::bad);

    // mark doublets as bad
    if (nhits < 3)
      continue;

    // if the fit has any invalid parameters, mark it as bad
    bool isNaN = false;
    for (int i = 0; i < 5; ++i) {
      isNaN |= std::isnan(tracks->stateAtBS.state(it)(i));
    }
    if (isNaN) {
#ifdef NTUPLE_DEBUG
      printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it));
#endif
      continue;
    }

    quality[it] = pixelTrack::Quality::strict;

    // compute a pT-dependent chi2 cut

    auto roughLog = [](float x) {
      // max diff [0.5,12] at 1.25 0.16143
      // average diff 0.0662998
      union IF {
        uint32_t i;
        float f;
      };
      IF z;
      z.f = x;
      uint32_t lsb = 1 < 21;
      z.i += lsb;
      z.i >>= 21;
      auto f = z.i & 3;
      int ex = int(z.i >> 2) - 127;

      // log2(1+0.25*f)
      // averaged over bins
      const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f};
      return float(ex) + frac[f];
    };
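    // NB: "1 < 21" above is a comparison and evaluates to 1, so z.i is
    // effectively truncated rather than rounded ("1 << 21" may have been the
    // intent). The frac[] table and the quoted max diff of 0.16143 just below
    // x = 1.25 are consistent with the truncating behavior, so the constants
    // match the code as written.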

    // (see CAHitNtupletGeneratorGPU.cc)
    float pt = std::min<float>(tracks->pt(it), cuts.chi2MaxPt);
    float chi2Cut = cuts.chi2Scale * (cuts.chi2Coeff[0] + roughLog(pt) * cuts.chi2Coeff[1]);
    if (tracks->chi2(it) >= chi2Cut) {
#ifdef NTUPLE_FIT_DEBUG
      printf("Bad chi2 %d size %d pt %f eta %f chi2 %f\n",
             it,
             tuples->size(it),
             tracks->pt(it),
             tracks->eta(it),
             tracks->chi2(it));
#endif
      continue;
    }

    quality[it] = pixelTrack::Quality::tight;

    // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta), Zip)
    // default cuts:
    //   - for triplets:    |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm
    //   - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm
    // (see CAHitNtupletGeneratorGPU.cc)
    auto const &region = (nhits > 3) ? cuts.quadruplet : cuts.triplet;
    bool isOk = (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and
                (std::abs(tracks->zip(it)) < region.maxZip);

    if (isOk)
      quality[it] = pixelTrack::Quality::highPurity;
  }
}

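// The quality ladder, in the increasing order used by the comparisons in this
// file, is: bad < edup < dup < loose < strict < tight < highPurity.
// kernel_classifyTracks promotes a track step by step: strict once the fit
// parameters are finite, tight once it passes the pT-dependent chi2 cut,
// highPurity once it also passes the region cuts.
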
__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples,
                                        Quality const *__restrict__ quality,
                                        CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  // guard
    if (quality[idx] < pixelTrack::Quality::loose)
      continue;
    atomicAdd(&(counters->nLooseTracks), 1);
    if (quality[idx] < pixelTrack::Quality::strict)
      continue;
    atomicAdd(&(counters->nGoodTracks), 1);
  }
}

__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples,
                                        Quality const *__restrict__ quality,
                                        CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  // guard
    for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h)
      hitToTuple->count(*h);
  }
}

__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples,
                                       Quality const *__restrict__ quality,
                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple *hitToTuple) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  // guard
    for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h)
      hitToTuple->fill(*h, idx);
  }
}

__global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples,
                                         TrackingRecHit2DSOAView const *__restrict__ hhp,
                                         HitContainer *__restrict__ hitDetIndices) {
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  // copy offsets
  for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    hitDetIndices->off[idx] = tuples->off[idx];
  }
  // fill hit indices
  auto const &hh = *hhp;
  auto nhits = hh.nHits();
  for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) {
    assert(tuples->content[idx] < nhits);
    hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]);
  }
}

__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks) {
  auto &tracks = *ptracks;
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) {
    auto nHits = tracks.nHits(idx);
    if (nHits == 0)
      break;  // this is a guard: maybe we need to move to nTracks...
    tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx);
  }
}

__global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ hitToTuple,
                                             CAHitNtupletGeneratorKernelsGPU::Counters *counters) {
  auto &c = *counters;
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple->size(idx) == 0)
      continue;  // SHALL NOT BE break
    atomicAdd(&c.nUsedHits, 1);
    if (hitToTuple->size(idx) > 1)
      atomicAdd(&c.nDupHits, 1);
  }
}

__global__ void kernel_countSharedHit(int *__restrict__ nshared,
                                      HitContainer const *__restrict__ ptuples,
                                      Quality const *__restrict__ quality,
                                      CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  constexpr auto loose = pixelTrack::Quality::loose;

  auto &hitToTuple = *phitToTuple;
  auto const &foundNtuplets = *ptuples;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    int nt = 0;

    // count "good" tracks
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (quality[*it] < loose)
        continue;
      ++nt;
    }

    if (nt < 2)
      continue;

    // now mark each track triplet as sharing a hit
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (foundNtuplets.size(*it) > 3)
        continue;
      atomicAdd(&nshared[*it], 1);
    }

  }  // hit loop
}

__global__ void kernel_markSharedHit(int const *__restrict__ nshared,
                                     HitContainer const *__restrict__ tuples,
                                     Quality *__restrict__ quality,
                                     bool dupPassThrough) {
  // constexpr auto bad = pixelTrack::Quality::bad;
  constexpr auto dup = pixelTrack::Quality::dup;
  constexpr auto loose = pixelTrack::Quality::loose;
  // constexpr auto strict = pixelTrack::Quality::strict;

  // quality to mark rejected
  auto const reject = dupPassThrough ? loose : dup;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (tuples->size(idx) == 0)
      break;  // guard
    if (quality[idx] <= reject)
      continue;
    if (nshared[idx] > 2)
      quality[idx] = reject;
  }
}

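// kernel_countSharedHit / kernel_markSharedHit work as a pair: for every hit
// used by at least two loose-or-better tracks, the first kernel increments
// nshared for each triplet using that hit; the second then demotes any track
// with more than two such shared hits.
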
// mostly for very forward triplets.....
__global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks,
                                       Quality *__restrict__ quality,
                                       uint16_t nmin,
                                       bool dupPassThrough,
                                       CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    /* chi2 is bad for large pt
    auto score = [&](auto it, auto nl) {
      return nl < 4 ? std::abs(tracks.tip(it)) :  // tip for triplets
             tracks.chi2(it);                     // chi2
    };
    */
    auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); };

    // full combinatorics
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      auto qi = quality[it];
      if (qi <= reject)
        continue;
      auto opi = tracks.stateAtBS.state(it)(2);
      auto e2opi = tracks.stateAtBS.covariance(it)(9);
      auto cti = tracks.stateAtBS.state(it)(3);
      auto e2cti = tracks.stateAtBS.covariance(it)(12);
      auto nli = tracks.nLayers(it);
      for (auto jp = ip + 1; jp != hitToTuple.end(idx); ++jp) {
        auto const jt = *jp;
        auto qj = quality[jt];
        if (qj <= reject)
          continue;
        auto opj = tracks.stateAtBS.state(jt)(2);
        auto ctj = tracks.stateAtBS.state(jt)(3);
        auto dct = nSigma2 * (tracks.stateAtBS.covariance(jt)(12) + e2cti);
        if ((cti - ctj) * (cti - ctj) > dct)
          continue;
        auto dop = nSigma2 * (tracks.stateAtBS.covariance(jt)(9) + e2opi);
        if ((opi - opj) * (opi - opj) > dop)
          continue;
        auto nlj = tracks.nLayers(jt);
        if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj)))))
          quality[jt] = reject;
        else {
          quality[it] = reject;
          break;
        }
      }
    }
  }
}

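// Same 5-sigma parameter-compatibility test as in kernel_fastDuplicateRemover,
// but driven by tracks sharing a hit rather than a cell, and with the number
// of layers as the primary tie-breaker: among compatible tracks the one on
// fewer layers (or with lower quality, or worse score) is rejected.
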
__global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp,
                                        TkSoA const *__restrict__ ptracks,
                                        Quality *__restrict__ quality,
                                        int nmin,
                                        bool dupPassThrough,
                                        CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup;
  // quality of longest track
  auto const longTqual = pixelTrack::Quality::highPurity;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  auto const &hh = *hhp;
  int l1end = hh.hitsLayerStart()[1];

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    int8_t maxNl = 0;

    // find maxNl
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (quality[*it] < longTqual)
        continue;
      // if (tracks.nHits(*it) == 3) continue;
      auto nl = tracks.nLayers(*it);
      maxNl = std::max(nl, maxNl);
    }

    if (maxNl < 4)
      continue;

    // quad pass through (leave for tests)
    // maxNl = std::min(4, maxNl);

    // kill all tracks shorter than maxNl (only triplets???)
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      auto nl = tracks.nLayers(*it);

      // checking if the shared hit is on bpix1 and if the tuple is short enough
      if (idx < l1end and nl > nmin)
        continue;

      if (nl < maxNl && quality[*it] > reject)
        quality[*it] = reject;
    }
  }
}

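// If a hit is shared with a highPurity track spanning at least four layers,
// every shorter track using that hit is demoted; the only exemption is a
// track with more than nmin layers whose shared hit lies on BPix1 (hit index
// below hitsLayerStart()[1]).
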
__global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks,
                                      Quality *__restrict__ quality,
                                      uint16_t nmin,
                                      bool dupPassThrough,
                                      CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = pixelTrack::Quality::loose;
  /// min quality of good
  auto const good = pixelTrack::Quality::strict;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    float mc = maxScore;
    uint16_t im = tkNotFound;
    bool onlyTriplets = true;

    // check if only triplets
    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
      if (quality[*it] <= good)
        continue;
      onlyTriplets &= tracks.isTriplet(*it);
      if (!onlyTriplets)
        break;
    }

    // only triplets
    if (!onlyTriplets)
      continue;

    // for triplets choose best tip! (should we first find best quality???)
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) {
        mc = std::abs(tracks.tip(it));
        im = it;
      }
    }

    if (tkNotFound == im)
      continue;

    // mark worse ambiguities
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] > reject && it != im)
        quality[it] = reject;  // no race: simple assignment of the same constant
    }

  }  // loop over hits
}

__global__ void kernel_simpleTripletCleaner(
    TkSoA const *__restrict__ ptracks,
    Quality *__restrict__ quality,
    uint16_t nmin,
    bool dupPassThrough,
    CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
  // quality to mark rejected
  auto const reject = pixelTrack::Quality::loose;
  /// min quality of good
  auto const good = pixelTrack::Quality::loose;

  auto &hitToTuple = *phitToTuple;
  auto const &tracks = *ptracks;

  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) {
    if (hitToTuple.size(idx) < 2)
      continue;

    float mc = maxScore;
    uint16_t im = tkNotFound;

    // choose best tip! (should we first find best quality???)
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) {
        mc = std::abs(tracks.tip(it));
        im = it;
      }
    }

    if (tkNotFound == im)
      continue;

    // mark worse ambiguities
    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
      auto const it = *ip;
      if (quality[it] > reject && tracks.isTriplet(it) && it != im)
        quality[it] = reject;  // no race: simple assignment of the same constant
    }

  }  // loop over hits
}

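// Unlike kernel_tripletCleaner, which acts only when all good tracks on the
// hit are triplets, the simple variant always keeps the best-tip candidate
// and demotes only the other triplets sharing the hit (here good == loose,
// so any surviving track can win).
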
__global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp,
                                            HitContainer const *__restrict__ ptuples,
                                            TkSoA const *__restrict__ ptracks,
                                            Quality const *__restrict__ quality,
                                            CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple,
                                            int32_t maxPrint,
                                            int iev) {
  auto const &foundNtuplets = *ptuples;
  auto const &tracks = *ptracks;
  int first = blockDim.x * blockIdx.x + threadIdx.x;
  for (int i = first, np = std::min(maxPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) {
    auto nh = foundNtuplets.size(i);
    if (nh < 3)
      continue;
    printf("TK: %d %d %d %f %f %f %f %f %f %f %d %d %d %d %d\n",
           10000 * iev + i,
           int(quality[i]),
           nh,
           tracks.charge(i),
           tracks.pt(i),
           tracks.eta(i),
           tracks.phi(i),
           tracks.tip(i),
           tracks.zip(i),
           // asinhf(fit_results[i].par(3)),
           tracks.chi2(i),
           *foundNtuplets.begin(i),
           *(foundNtuplets.begin(i) + 1),
           *(foundNtuplets.begin(i) + 2),
           nh > 3 ? int(*(foundNtuplets.begin(i) + 3)) : -1,
           nh > 4 ? int(*(foundNtuplets.begin(i) + 4)) : -1);
  }
}

__global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *counters) {
  auto const &c = *counters;
  printf(
      "||Counters | nEvents | nHits | nCells | nTuples | nFitTracks | nLooseTracks | nGoodTracks | nUsedHits | "
      "nDupHits | "
      "nKilledCells | "
      "nUsedCells | nZeroTrackCells ||\n");
  printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",
         c.nEvents,
         c.nHits,
         c.nCells,
         c.nTuples,
         c.nFitTracks,
         c.nLooseTracks,
         c.nGoodTracks,
         c.nUsedHits,
         c.nDupHits,
         c.nKilledCells,
         c.nEmptyCells,
         c.nZeroTrackCells);
  printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| %.3f||\n",
         c.nEvents,
         c.nHits / double(c.nEvents),
         c.nCells / double(c.nEvents),
         c.nTuples / double(c.nEvents),
         c.nFitTracks / double(c.nEvents),
         c.nLooseTracks / double(c.nEvents),
         c.nGoodTracks / double(c.nEvents),
         c.nUsedHits / double(c.nEvents),
         c.nDupHits / double(c.nEvents),
         c.nKilledCells / double(c.nCells),
         c.nEmptyCells / double(c.nCells),
         c.nZeroTrackCells / double(c.nCells));
}