CMS 3D CMS Logo

BestCands.h
Go to the documentation of this file.
1 #ifndef RecoTracker_MkFitCore_standalone_attic_BestCands_h
2 #define RecoTracker_MkFitCore_standalone_attic_BestCands_h
3 
4 #include "Config.h"
5 
6 #include <cstdio>
7 #include <limits>
8 
9 namespace CandsGPU {
10 
11  constexpr int trkIdx_sentinel = -1;
12  constexpr int hitIdx_sentinel = -1;
13  constexpr int nhits_sentinel = -1;
15 
16  template <typename T>
17  __device__ void swap_values(T& a, T& b) {
18  T c(a);
19  a = b;
20  b = c;
21  }
22 
23  template <int MaxCandsPerSeed, int BlockSize>
24  struct BestCands {
25  // AoS would generate bank conflicts when used in SM
26  int trkIdx[MaxCandsPerSeed][BlockSize];
27  int hitIdx[MaxCandsPerSeed][BlockSize];
28  int nhits[MaxCandsPerSeed][BlockSize];
29  float chi2[MaxCandsPerSeed][BlockSize];
30 
31  __device__ void reset(int itrack);
32  __device__ void update(int itrack, int cand_trIdx, int cand_hitIdx, int cand_nhits, float cand_chi2);
33  __device__ void heapify(int itrack, int idx, int heap_size);
34  __device__ int left(int idx);
35  __device__ int right(int idx);
36 
37  __device__ bool better(int icand_fst, int fst, int icand_snd, int snd);
38 
39  __device__ void heap_sort(int icand, int heap_size);
40  __device__ void merge_cands_for_seed(int iseed, int icand);
41  __device__ void swap_nodes(int icand_fst, int fst, int icand_snd, int snd);
42  __device__ void copy_node(int icand_fst, int fst, int icand_snd, int snd);
43 
44  __device__ int count_valid_cands(int itrack);
45 
46  // TODO: Should really return a IdxChi2List
48  const int tid, const int cid, int& my_trkIdx, int& my_hitIdx, int& my_nhits, float& my_chi2);
49  __device__ int get_nhits(const int tid, const int cid) { return nhits[cid][tid]; }
50 
51  __device__ void print_heap(const int tid);
52  };
53 
54  template <int M, int B>
56  for (auto j = 0; j < M; ++j) {
57  trkIdx[j][itrack] = trkIdx_sentinel;
58  hitIdx[j][itrack] = hitIdx_sentinel;
59  nhits[j][itrack] = nhits_sentinel;
60  chi2[j][itrack] = chi2_sentinel;
61  }
62  }
63 
64  template <int M, int B>
65  __device__ void BestCands<M, B>::update(int itrack, int cand_trIdx, int cand_hitIdx, int cand_nhits, float cand_chi2) {
66  if (cand_nhits < nhits[0][itrack])
67  return;
68  if (cand_chi2 > chi2[0][itrack])
69  return;
70  trkIdx[0][itrack] = cand_trIdx;
71  hitIdx[0][itrack] = cand_hitIdx;
72  nhits[0][itrack] = cand_nhits;
73  chi2[0][itrack] = cand_chi2;
74 
75  heapify(itrack, 0, M);
76  }
77 
78  template <int M, int B>
80  for (int cid = 0; cid < M; cid++) {
81  printf(">>>>> tid %d rowIdx %d hitIdx %d nhits %d chi2 %f\n",
82  tid,
83  cid,
84  hitIdx[cid][tid],
85  nhits[cid][tid],
86  chi2[cid][tid]);
87  }
88  }
89 
90  template <int M, int B>
92  return (++idx << 1) - 1;
93  }
94 
95  template <int M, int B>
97  return ++idx << 1;
98  }
99 
100  template <int M, int B>
101  __device__ bool BestCands<M, B>::better(int icand_fst, int fst, int icand_snd, int snd) {
102  return (nhits[fst][icand_fst] > nhits[snd][icand_snd]) ||
103  ((nhits[fst][icand_fst] == nhits[snd][icand_snd]) && (chi2[fst][icand_fst] < chi2[snd][icand_snd]));
104  }
105 
106  template <int M, int B>
107  __device__ void BestCands<M, B>::swap_nodes(int icand_fst, int fst, int icand_snd, int snd) {
108  swap_values(trkIdx[fst][icand_fst], trkIdx[snd][icand_snd]);
109  swap_values(hitIdx[fst][icand_fst], hitIdx[snd][icand_snd]);
110  swap_values(nhits[fst][icand_fst], nhits[snd][icand_snd]);
111  swap_values(chi2[fst][icand_fst], chi2[snd][icand_snd]);
112  }
113 
114  template <int M, int B>
115  __device__ void BestCands<M, B>::copy_node(int icand_fst, int fst, int icand_snd, int snd) {
116  trkIdx[snd][icand_snd] = trkIdx[fst][icand_fst];
117  hitIdx[snd][icand_snd] = hitIdx[fst][icand_fst];
118  nhits[snd][icand_snd] = nhits[fst][icand_fst];
119  chi2[snd][icand_snd] = chi2[fst][icand_fst];
120  }
121 
122  template <int M, int B>
123  __device__ void BestCands<M, B>::heapify(int icand, int idx, int heap_size) {
124  // We want to move idx down so the smallest value is at the root
125  int smallest = -1;
126  while (idx != smallest) {
127  if (idx < 0 || idx >= heap_size / 2)
128  return;
129 
130  smallest = idx;
131  if (heap_size > left(idx) && better(icand, smallest, icand, left(idx)))
132  smallest = left(idx);
133  if (heap_size > right(idx) && better(icand, smallest, icand, right(idx)))
134  smallest = right(idx);
135 
136  if (smallest != idx) {
137  swap_nodes(icand, smallest, icand, idx);
138  idx = smallest;
139  smallest = -1;
140  }
141  }
142  }
143 
144  template <int M, int B>
146  int itrack = iseed * M + icand;
147 // TODO: Need a better way to reduce candidates.
148 // So far, binary tree reduction is a bit slower than the naive approach
149 #if 1
150  if (icand) {
151  heap_sort(itrack, M);
152  }
153  __syncthreads(); // cand 0 waits;
154  if (icand)
155  return; // reduction by the first cand of each seed
156 
157  for (int i = itrack + 1; i < itrack + M; ++i) { // over cands
158  for (int j = 0; j < M; ++j) { // inside heap
159  if (better(i, j, itrack, 0)) {
160  copy_node(i, j, itrack, 0);
161  heapify(itrack, 0, M);
162  } else {
163  break;
164  }
165  }
166  }
167  heap_sort(itrack, M);
168  __syncthreads(); // TODO: Volta: sync only on MaxCandsPerSeeds threads
169 #else
170 
171  for (int step = 2; step <= Config::maxCandsPerSeed; step <<= 1) {
172  if (icand % step == step / 2) {
173  heap_sort(itrack, M);
174  }
175  __syncthreads();
176 
177  if (icand % step == 0) {
178  int i = itrack + step / 2;
179  if ((i < iseed * M + M) && (i < B) && (icand + step / 2 < M)) {
180  for (int j = 0; j < M; ++j) { // inside heap
181  if (better(i, j, itrack, 0)) {
182  copy_node(i, j, itrack, 0);
183  heapify(itrack, 0, M);
184  } else {
185  break;
186  }
187  }
188  }
189  }
190  //__syncthreads();
191  }
192 
193  if (icand == 0) {
194  heap_sort(itrack, M);
195  }
196  __syncthreads();
197 #endif
198  }
199 
200  template <int M, int B>
201  __device__ void BestCands<M, B>::heap_sort(int icand, int heap_size) {
202  int num_unsorted_elts = heap_size;
203  // Assume that we have a heap with the worst one at the root.
204  for (int i = heap_size - 1; i > 0; --i) {
205  swap_nodes(icand, 0, icand, i); // worst at the end
206  heapify(icand, 0, --num_unsorted_elts);
207  }
208  }
209 
210  template <int MaxCandsPerSeed, int BlockSize>
212  const int tid, const int cid, int& my_trkIdx, int& my_hitIdx, int& my_nhits, float& my_chi2) {
213  if (cid < MaxCandsPerSeed && tid < BlockSize) {
214  my_trkIdx = trkIdx[cid][tid];
215  my_hitIdx = hitIdx[cid][tid];
216  my_nhits = nhits[cid][tid];
217  my_chi2 = chi2[cid][tid];
218  }
219  }
220 
221  template <int M, int B>
223  int count = 0;
224  for (int i = 0; i < M; ++i) {
225  if (trkIdx[i][itrack] != trkIdx_sentinel)
226  ++count;
227  }
228  return count;
229  }
230 
231 } // namespace CandsGPU
232 
233 #endif // _BEST_CANDS_H_
__device__ void heap_sort(int icand, int heap_size)
Definition: BestCands.h:201
Definition: APVGainStruct.h:7
constexpr int nhits_sentinel
Definition: BestCands.h:13
float chi2[MaxCandsPerSeed][BlockSize]
Definition: BestCands.h:29
__device__ void get_cand_info(const int tid, const int cid, int &my_trkIdx, int &my_hitIdx, int &my_nhits, float &my_chi2)
Definition: BestCands.h:211
__device__ int left(int idx)
Definition: BestCands.h:91
constexpr int hitIdx_sentinel
Definition: BestCands.h:12
__device__ void swap_nodes(int icand_fst, int fst, int icand_snd, int snd)
Definition: BestCands.h:107
__device__ void copy_node(int icand_fst, int fst, int icand_snd, int snd)
Definition: BestCands.h:115
__device__ int count_valid_cands(int itrack)
Definition: BestCands.h:222
int nhits[MaxCandsPerSeed][BlockSize]
Definition: BestCands.h:28
__device__ int right(int idx)
Definition: BestCands.h:96
__device__ bool better(int icand_fst, int fst, int icand_snd, int snd)
Definition: BestCands.h:101
__device__ void swap_values(T &a, T &b)
Definition: BestCands.h:17
__device__ void print_heap(const int tid)
Definition: BestCands.h:79
__device__ void reset(int itrack)
Definition: BestCands.h:55
int iseed
Definition: AMPTWrapper.h:134
double b
Definition: hdecay.h:120
void __syncthreads()
Definition: cudaCompat.h:132
__device__ void heapify(int itrack, int idx, int heap_size)
Definition: BestCands.h:123
__device__ void merge_cands_for_seed(int iseed, int icand)
Definition: BestCands.h:145
constexpr float chi2_sentinel
Definition: BestCands.h:14
double a
Definition: hdecay.h:121
__device__ int get_nhits(const int tid, const int cid)
Definition: BestCands.h:49
step
Definition: StallMonitor.cc:83
__device__ void update(int itrack, int cand_trIdx, int cand_hitIdx, int cand_nhits, float cand_chi2)
Definition: BestCands.h:65
long double T
#define __device__
constexpr int trkIdx_sentinel
Definition: BestCands.h:11
int trkIdx[MaxCandsPerSeed][BlockSize]
Definition: BestCands.h:26
int hitIdx[MaxCandsPerSeed][BlockSize]
Definition: BestCands.h:27