CMS 3D CMS Logo

MatriplexSym.h
Go to the documentation of this file.
1 #ifndef RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h
2 #define RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h
3 
4 #include "MatriplexCommon.h"
5 #include "Matriplex.h"
6 
7 //==============================================================================
8 // MatriplexSym
9 //==============================================================================
10 
11 namespace Matriplex {
12 
13  const idx_t gSymOffsets[7][36] = {{},
14  {},
15  {0, 1, 1, 2},
16  {0, 1, 3, 1, 2, 4, 3, 4, 5}, // 3
17  {},
18  {},
19  {0, 1, 3, 6, 10, 15, 1, 2, 4, 7, 11, 16, 3, 4, 5, 8, 12, 17,
20  6, 7, 8, 9, 13, 18, 10, 11, 12, 13, 14, 19, 15, 16, 17, 18, 19, 20}};
21 
22  //------------------------------------------------------------------------------
23 
24  template <typename T, idx_t D, idx_t N>
25  class __attribute__((aligned(MPLEX_ALIGN))) MatriplexSym {
26  public:
27  typedef T value_type;
28 
30  static constexpr int kRows = D;
32  static constexpr int kCols = D;
34  static constexpr int kSize = (D + 1) * D / 2;
36  static constexpr int kTotSize = N * kSize;
37 
38  T fArray[kTotSize];
39 
40  MatriplexSym() {}
41  MatriplexSym(T v) { setVal(v); }
42 
43  idx_t plexSize() const { return N; }
44 
45  void setVal(T v) {
46  for (idx_t i = 0; i < kTotSize; ++i) {
47  fArray[i] = v;
48  }
49  }
50 
51  void add(const MatriplexSym& v) {
52  for (idx_t i = 0; i < kTotSize; ++i) {
53  fArray[i] += v.fArray[i];
54  }
55  }
56 
57  void scale(T scale) {
58  for (idx_t i = 0; i < kTotSize; ++i) {
59  fArray[i] *= scale;
60  }
61  }
62 
63  T operator[](idx_t xx) const { return fArray[xx]; }
64  T& operator[](idx_t xx) { return fArray[xx]; }
65 
66  const idx_t* offsets() const { return gSymOffsets[D]; }
67  idx_t off(idx_t i) const { return gSymOffsets[D][i]; }
68 
69  const T& constAt(idx_t n, idx_t i, idx_t j) const { return fArray[off(i * D + j) * N + n]; }
70 
71  T& At(idx_t n, idx_t i, idx_t j) { return fArray[off(i * D + j) * N + n]; }
72 
73  T& operator()(idx_t n, idx_t i, idx_t j) { return At(n, i, j); }
74  const T& operator()(idx_t n, idx_t i, idx_t j) const { return constAt(n, i, j); }
75 
76  MatriplexSym& operator=(const MatriplexSym& m) {
77  memcpy(fArray, m.fArray, sizeof(T) * kTotSize);
78  return *this;
79  }
80 
81  MatriplexSym(const MatriplexSym& m) = default;
82 
83  void copySlot(idx_t n, const MatriplexSym& m) {
84  for (idx_t i = n; i < kTotSize; i += N) {
85  fArray[i] = m.fArray[i];
86  }
87  }
88 
89  void copyIn(idx_t n, const T* arr) {
90  for (idx_t i = n; i < kTotSize; i += N) {
91  fArray[i] = *(arr++);
92  }
93  }
94 
95  void copyIn(idx_t n, const MatriplexSym& m, idx_t in) {
96  for (idx_t i = n; i < kTotSize; i += N, in += N) {
97  fArray[i] = m[in];
98  }
99  }
100 
101  void copy(idx_t n, idx_t in) {
102  for (idx_t i = n; i < kTotSize; i += N, in += N) {
103  fArray[i] = fArray[in];
104  }
105  }
106 
107 #if defined(AVX512_INTRINSICS)
108 
109  template <typename U>
110  void slurpIn(const T* arr, __m512i& vi, const U&, const int N_proc = N) {
111  //_mm512_prefetch_i32gather_ps(vi, arr, 1, _MM_HINT_T0);
112 
113  const __m512 src = {0};
114  const __mmask16 k = N_proc == N ? -1 : (1 << N_proc) - 1;
115 
116  for (int i = 0; i < kSize; ++i, ++arr) {
117  //_mm512_prefetch_i32gather_ps(vi, arr+2, 1, _MM_HINT_NTA);
118 
119  __m512 reg = _mm512_mask_i32gather_ps(src, k, vi, arr, sizeof(U));
120  _mm512_mask_store_ps(&fArray[i * N], k, reg);
121  }
122  }
123 
124  // Experimental methods, slurpIn() seems to be at least as fast.
125  // See comments in mkFit/MkFitter.cc MkFitter::addBestHit().
126 
127  void ChewIn(const char* arr, int off, int vi[N], const char* tmp, __m512i& ui) {
128  // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.
129 
130  for (int i = 0; i < N; ++i) {
131  __m512 reg = _mm512_load_ps(arr + vi[i]);
132  _mm512_store_ps((void*)(tmp + 64 * i), reg);
133  }
134 
135  for (int i = 0; i < kSize; ++i) {
136  __m512 reg = _mm512_i32gather_ps(ui, tmp + off + i * sizeof(T), 1);
137  _mm512_store_ps(&fArray[i * N], reg);
138  }
139  }
140 
141  void Contaginate(const char* arr, int vi[N], const char* tmp) {
142  // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.
143 
144  for (int i = 0; i < N; ++i) {
145  __m512 reg = _mm512_load_ps(arr + vi[i]);
146  _mm512_store_ps((void*)(tmp + 64 * i), reg);
147  }
148  }
149 
150  void Plexify(const char* tmp, __m512i& ui) {
151  for (int i = 0; i < kSize; ++i) {
152  __m512 reg = _mm512_i32gather_ps(ui, tmp + i * sizeof(T), 1);
153  _mm512_store_ps(&fArray[i * N], reg);
154  }
155  }
156 
157 #elif defined(AVX2_INTRINSICS)
158 
159  template <typename U>
160  void slurpIn(const T* arr, __m256i& vi, const U&, const int N_proc = N) {
161  const __m256 src = {0};
162 
163  __m256i k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
164  __m256i k_sel = _mm256_set1_epi32(N_proc);
165  __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);
166 
167  k = k_master;
168  for (int i = 0; i < kSize; ++i, ++arr) {
169  __m256 reg = _mm256_mask_i32gather_ps(src, arr, vi, (__m256)k, sizeof(U));
170  // Restore mask (docs say gather clears it but it doesn't seem to).
171  k = k_master;
172  _mm256_maskstore_ps(&fArray[i * N], k, reg);
173  }
174  }
175 
176 #else
177 
178  void slurpIn(const T* arr, int vi[N], const int N_proc = N) {
179  // Separate N_proc == N case (gains about 7% in fit test).
180  if (N_proc == N) {
181  for (int i = 0; i < kSize; ++i) {
182  for (int j = 0; j < N; ++j) {
183  fArray[i * N + j] = *(arr + i + vi[j]);
184  }
185  }
186  } else {
187  for (int i = 0; i < kSize; ++i) {
188  for (int j = 0; j < N_proc; ++j) {
189  fArray[i * N + j] = *(arr + i + vi[j]);
190  }
191  }
192  }
193  }
194 
195 #endif
196 
197  void copyOut(idx_t n, T* arr) const {
198  for (idx_t i = n; i < kTotSize; i += N) {
199  *(arr++) = fArray[i];
200  }
201  }
202 
203  void setDiagonal3x3(idx_t n, T d) {
204  T* p = fArray + n;
205 
206  p[0 * N] = d;
207  p[1 * N] = 0;
208  p[2 * N] = d;
209  p[3 * N] = 0;
210  p[4 * N] = 0;
211  p[5 * N] = d;
212  }
213 
214  MatriplexSym& subtract(const MatriplexSym& a, const MatriplexSym& b) {
215  // Does *this = a - b;
216 
217 #pragma omp simd
218  for (idx_t i = 0; i < kTotSize; ++i) {
219  fArray[i] = a.fArray[i] - b.fArray[i];
220  }
221 
222  return *this;
223  }
224 
225  // ==================================================================
226  // Operations specific to Kalman fit in 6 parameter space
227  // ==================================================================
228 
229  void addNoiseIntoUpperLeft3x3(T noise) {
230  T* p = fArray;
231  ASSUME_ALIGNED(p, 64);
232 
233 #pragma omp simd
234  for (idx_t n = 0; n < N; ++n) {
235  p[0 * N + n] += noise;
236  p[2 * N + n] += noise;
237  p[5 * N + n] += noise;
238  }
239  }
240 
241  void invertUpperLeft3x3() {
242  typedef T TT;
243 
244  T* a = fArray;
245  ASSUME_ALIGNED(a, 64);
246 
247 #pragma omp simd
248  for (idx_t n = 0; n < N; ++n) {
249  const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
250  const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
251  const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
252  const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
253  const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
254  const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];
255 
256  // Force determinant calculation in double precision.
257  const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
258  const TT s = TT(1) / det;
259 
260  a[0 * N + n] = s * c00;
261  a[1 * N + n] = s * c01;
262  a[2 * N + n] = s * c11;
263  a[3 * N + n] = s * c02;
264  a[4 * N + n] = s * c12;
265  a[5 * N + n] = s * c22;
266  }
267  }
268 
269  Matriplex<T, 1, 1, N> ReduceFixedIJ(idx_t i, idx_t j) const {
270  Matriplex<T, 1, 1, N> t;
271  for (idx_t n = 0; n < N; ++n) {
272  t[n] = constAt(n, i, j);
273  }
274  return t;
275  }
276  };
277 
278  template <typename T, idx_t D, idx_t N>
279  using MPlexSym = MatriplexSym<T, D, N>;
280 
281  //==============================================================================
282  // Multiplications
283  //==============================================================================
284 
285  template <typename T, idx_t D, idx_t N>
286  struct SymMultiplyCls {
288  throw std::runtime_error("general symmetric multiplication not supported");
289  }
290  };
291 
292  template <typename T, idx_t N>
293  struct SymMultiplyCls<T, 3, N> {
295  const T* a = A.fArray;
296  ASSUME_ALIGNED(a, 64);
297  const T* b = B.fArray;
298  ASSUME_ALIGNED(b, 64);
299  T* c = C.fArray;
300  ASSUME_ALIGNED(c, 64);
301 
302 #ifdef MPLEX_INTRINSICS
303 
304  for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
305 #include "intr_sym_3x3.ah"
306  }
307 
308 #else
309 
310 #pragma omp simd
311  for (idx_t n = 0; n < N; ++n) {
312 #include "std_sym_3x3.ah"
313  }
314 
315 #endif
316  }
317  };
318 
319  template <typename T, idx_t N>
320  struct SymMultiplyCls<T, 6, N> {
322  const T* a = A.fArray;
323  ASSUME_ALIGNED(a, 64);
324  const T* b = B.fArray;
325  ASSUME_ALIGNED(b, 64);
326  T* c = C.fArray;
327  ASSUME_ALIGNED(c, 64);
328 
329 #ifdef MPLEX_INTRINSICS
330 
331  for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
332 #include "intr_sym_6x6.ah"
333  }
334 
335 #else
336 
337 #pragma omp simd
338  for (idx_t n = 0; n < N; ++n) {
339 #include "std_sym_6x6.ah"
340  }
341 
342 #endif
343  }
344  };
345 
346  template <typename T, idx_t D, idx_t N>
349  }
350 
351  //==============================================================================
352  // Cramer inversion
353  //==============================================================================
354 
355  template <typename T, idx_t D, idx_t N>
357  static void invert(MPlexSym<T, D, N>& A, double* determ = nullptr) {
358  throw std::runtime_error("general cramer inversion not supported");
359  }
360  };
361 
362  template <typename T, idx_t N>
363  struct CramerInverterSym<T, 2, N> {
364  static void invert(MPlexSym<T, 2, N>& A, double* determ = nullptr) {
365  typedef T TT;
366 
367  T* a = A.fArray;
368  ASSUME_ALIGNED(a, 64);
369 
370 #pragma omp simd
371  for (idx_t n = 0; n < N; ++n) {
372  // Force determinant calculation in double precision.
373  const double det = (double)a[0 * N + n] * a[2 * N + n] - (double)a[1 * N + n] * a[1 * N + n];
374  if (determ)
375  determ[n] = det;
376 
377  const TT s = TT(1) / det;
378  const TT tmp = s * a[2 * N + n];
379  a[1 * N + n] *= -s;
380  a[2 * N + n] = s * a[0 * N + n];
381  a[0 * N + n] = tmp;
382  }
383  }
384  };
385 
386  template <typename T, idx_t N>
387  struct CramerInverterSym<T, 3, N> {
388  static void invert(MPlexSym<T, 3, N>& A, double* determ = nullptr) {
389  typedef T TT;
390 
391  T* a = A.fArray;
392  ASSUME_ALIGNED(a, 64);
393 
394 #pragma omp simd
395  for (idx_t n = 0; n < N; ++n) {
396  const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
397  const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
398  const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
399  const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
400  const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
401  const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];
402 
403  // Force determinant calculation in double precision.
404  const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
405  if (determ)
406  determ[n] = det;
407 
408  const TT s = TT(1) / det;
409  a[0 * N + n] = s * c00;
410  a[1 * N + n] = s * c01;
411  a[2 * N + n] = s * c11;
412  a[3 * N + n] = s * c02;
413  a[4 * N + n] = s * c12;
414  a[5 * N + n] = s * c22;
415  }
416  }
417  };
418 
419  template <typename T, idx_t D, idx_t N>
420  void invertCramerSym(MPlexSym<T, D, N>& A, double* determ = nullptr) {
422  }
423 
424  //==============================================================================
425  // Cholesky inversion
426  //==============================================================================
427 
428  template <typename T, idx_t D, idx_t N>
430  static void invert(MPlexSym<T, D, N>& A) { throw std::runtime_error("general cholesky inversion not supported"); }
431  };
432 
433  template <typename T, idx_t N>
434  struct CholeskyInverterSym<T, 3, N> {
435  static void invert(MPlexSym<T, 3, N>& A) {
436  typedef T TT;
437 
438  T* a = A.fArray;
439 
440 #pragma omp simd
441  for (idx_t n = 0; n < N; ++n) {
442  TT l0 = std::sqrt(T(1) / a[0 * N + n]);
443  TT l1 = a[1 * N + n] * l0;
444  TT l2 = a[2 * N + n] - l1 * l1;
445  l2 = std::sqrt(T(1) / l2);
446  TT l3 = a[3 * N + n] * l0;
447  TT l4 = (a[4 * N + n] - l1 * l3) * l2;
448  TT l5 = a[5 * N + n] - (l3 * l3 + l4 * l4);
449  l5 = std::sqrt(T(1) / l5);
450 
451  // decomposition done
452 
453  l3 = (l1 * l4 * l2 - l3) * l0 * l5;
454  l1 = -l1 * l0 * l2;
455  l4 = -l4 * l2 * l5;
456 
457  a[0 * N + n] = l3 * l3 + l1 * l1 + l0 * l0;
458  a[1 * N + n] = l3 * l4 + l1 * l2;
459  a[2 * N + n] = l4 * l4 + l2 * l2;
460  a[3 * N + n] = l3 * l5;
461  a[4 * N + n] = l4 * l5;
462  a[5 * N + n] = l5 * l5;
463 
464  // m(2,x) are all zero if anything went wrong at l5.
465  // all zero, if anything went wrong already for l0 or l2.
466  }
467  }
468  };
469 
470  template <typename T, idx_t D, idx_t N>
473  }
474 
475 } // end namespace Matriplex
476 
477 #endif
Basic3DVector & operator=(const Basic3DVector &)=default
Assignment operator.
MatriplexSym< T, D, N > MPlexSym
Definition: MatriplexSym.h:279
Definition: APVGainStruct.h:7
Matriplex< T, D1, D2, N > MPlex
Definition: Matriplex.h:327
void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
Definition: Matriplex.h:646
const idx_t gSymOffsets[7][36]
Definition: MatriplexSym.h:13
void invertCholeskySym(MPlexSym< T, D, N > &A)
Definition: MatriplexSym.h:471
static void invert(MPlexSym< T, 2, N > &A, double *determ=nullptr)
Definition: MatriplexSym.h:364
static void multiply(const MPlexSym< T, 3, N > &A, const MPlexSym< T, 3, N > &B, MPlex< T, 3, 3, N > &C)
Definition: MatriplexSym.h:294
void invertCramerSym(MPlexSym< T, D, N > &A, double *determ=nullptr)
Definition: MatriplexSym.h:420
static void invert(MPlexSym< T, D, N > &A, double *determ=nullptr)
Definition: MatriplexSym.h:357
static void invert(MPlexSym< T, 3, N > &A)
Definition: MatriplexSym.h:435
T sqrt(T t)
Definition: SSEVec.h:23
static void multiply(const MPlexSym< float, 6, N > &A, const MPlexSym< float, 6, N > &B, MPlex< float, 6, 6, N > &C)
Definition: MatriplexSym.h:321
#define MPLEX_ALIGN
static void invert(MPlexSym< T, 3, N > &A, double *determ=nullptr)
Definition: MatriplexSym.h:388
d
Definition: ztail.py:151
#define N
Definition: blowfish.cc:9
DecomposeProduct< arg, typename Div::arg > D
Definition: Factorize.h:141
double b
Definition: hdecay.h:120
class __attribute__((aligned(32))) Matriplex
Definition: Matriplex.h:11
static void multiply(const MPlexSym< T, D, N > &A, const MPlexSym< T, D, N > &B, MPlex< T, D, D, N > &C)
Definition: MatriplexSym.h:287
static void invert(MPlexSym< T, D, N > &A)
Definition: MatriplexSym.h:430
void add(std::map< std::string, TH1 *> &h, TH1 *hist)
double a
Definition: hdecay.h:121
Definition: APVGainStruct.h:7
T operator[](int i) const
tmp
align.sh
Definition: createJobs.py:716
long double T
#define ASSUME_ALIGNED(a, b)