MatriplexSym.h
#ifndef RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h
#define RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h

#include "MatriplexCommon.h"
#include "Matriplex.h"

//==============================================================================
// MatriplexSym
//==============================================================================

namespace Matriplex {

  const idx_t gSymOffsets[7][36] = {{},
                                    {},
                                    {0, 1, 1, 2},
                                    {0, 1, 3, 1, 2, 4, 3, 4, 5},  // 3
                                    {},
                                    {},
                                    {0, 1, 3, 6, 10, 15, 1, 2, 4, 7, 11, 16, 3, 4, 5, 8, 12, 17,
                                     6, 7, 8, 9, 13, 18, 10, 11, 12, 13, 14, 19, 15, 16, 17, 18, 19, 20}};
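
  // Note: gSymOffsets[D] maps a full row-major (i, j) index onto the packed
  // lower-triangular storage, so symmetric pairs share one slot. For D = 3,
  // off(1 * 3 + 2) == off(2 * 3 + 1) == 4: elements (1,2) and (2,1) alias
  // packed entry 4.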

  //------------------------------------------------------------------------------

  template <typename T, idx_t D, idx_t N>
  class __attribute__((aligned(MPLEX_ALIGN))) MatriplexSym {
  public:
    typedef T value_type;

    /// no. of matrix rows
    static constexpr int kRows = D;
    /// no. of matrix columns
    static constexpr int kCols = D;
    /// no. of elements: lower triangle plus diagonal
    static constexpr int kSize = (D + 1) * D / 2;
    /// size of the whole matriplex
    static constexpr int kTotSize = N * kSize;

    T fArray[kTotSize];
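
    // Storage is structure-of-arrays: packed element p of matrix n lives at
    // fArray[p * N + n], so the matrix index n is the fast, SIMD-friendly
    // direction (see constAt() / At() below).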

    MatriplexSym() {}
    MatriplexSym(T v) { setVal(v); }

    idx_t plexSize() const { return N; }

    void setVal(T v) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] = v;
      }
    }

    void add(const MatriplexSym& v) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] += v.fArray[i];
      }
    }

    void scale(T scale) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] *= scale;
      }
    }

    T operator[](idx_t xx) const { return fArray[xx]; }
    T& operator[](idx_t xx) { return fArray[xx]; }

    const idx_t* offsets() const { return gSymOffsets[D]; }
    idx_t off(idx_t i) const { return gSymOffsets[D][i]; }

    const T& constAt(idx_t n, idx_t i, idx_t j) const { return fArray[off(i * D + j) * N + n]; }

    T& At(idx_t n, idx_t i, idx_t j) { return fArray[off(i * D + j) * N + n]; }

    T& operator()(idx_t n, idx_t i, idx_t j) { return At(n, i, j); }
    const T& operator()(idx_t n, idx_t i, idx_t j) const { return constAt(n, i, j); }

    MatriplexSym& operator=(const MatriplexSym& m) {
      memcpy(fArray, m.fArray, sizeof(T) * kTotSize);
      return *this;
    }

    void copySlot(idx_t n, const MatriplexSym& m) {
      for (idx_t i = n; i < kTotSize; i += N) {
        fArray[i] = m.fArray[i];
      }
    }

    void copyIn(idx_t n, const T* arr) {
      for (idx_t i = n; i < kTotSize; i += N) {
        fArray[i] = *(arr++);
      }
    }

    void copyIn(idx_t n, const MatriplexSym& m, idx_t in) {
      for (idx_t i = n; i < kTotSize; i += N, in += N) {
        fArray[i] = m[in];
      }
    }

    void copy(idx_t n, idx_t in) {
      for (idx_t i = n; i < kTotSize; i += N, in += N) {
        fArray[i] = fArray[in];
      }
    }
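
    // The copy helpers above walk slot n with stride N: consecutive packed
    // elements of a single matrix sit N entries apart in fArray, matching the
    // structure-of-arrays layout noted at fArray.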

#if defined(AVX512_INTRINSICS)

    template <typename U>
    void slurpIn(const T* arr, __m512i& vi, const U&, const int N_proc = N) {
      //_mm512_prefetch_i32gather_ps(vi, arr, 1, _MM_HINT_T0);

      const __m512 src = {0};
      const __mmask16 k = N_proc == N ? -1 : (1 << N_proc) - 1;

      for (int i = 0; i < kSize; ++i, ++arr) {
        //_mm512_prefetch_i32gather_ps(vi, arr+2, 1, _MM_HINT_NTA);

        __m512 reg = _mm512_mask_i32gather_ps(src, k, vi, arr, sizeof(U));
        _mm512_mask_store_ps(&fArray[i * N], k, reg);
      }
    }
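
    // Gathers the i-th packed element for up to N_proc matrices in one go:
    // lane j loads from arr offset by vi[j] (the gather scales vi by
    // sizeof(U)), with arr advancing one element per iteration; the masked
    // store leaves lanes beyond N_proc untouched.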

    // Experimental methods, slurpIn() seems to be at least as fast.
    // See comments in mkFit/MkFitter.cc MkFitter::addBestHit().

    void ChewIn(const char* arr, int off, int vi[N], const char* tmp, __m512i& ui) {
      // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.

      for (int i = 0; i < N; ++i) {
        __m512 reg = _mm512_load_ps(arr + vi[i]);
        _mm512_store_ps((void*)(tmp + 64 * i), reg);
      }

      for (int i = 0; i < kSize; ++i) {
        __m512 reg = _mm512_i32gather_ps(ui, tmp + off + i * sizeof(T), 1);
        _mm512_store_ps(&fArray[i * N], reg);
      }
    }

    void Contaginate(const char* arr, int vi[N], const char* tmp) {
      // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.

      for (int i = 0; i < N; ++i) {
        __m512 reg = _mm512_load_ps(arr + vi[i]);
        _mm512_store_ps((void*)(tmp + 64 * i), reg);
      }
    }

    void Plexify(const char* tmp, __m512i& ui) {
      for (int i = 0; i < kSize; ++i) {
        __m512 reg = _mm512_i32gather_ps(ui, tmp + i * sizeof(T), 1);
        _mm512_store_ps(&fArray[i * N], reg);
      }
    }

#elif defined(AVX2_INTRINSICS)

    template <typename U>
    void slurpIn(const T* arr, __m256i& vi, const U&, const int N_proc = N) {
      const __m256 src = {0};

      __m256i k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
      __m256i k_sel = _mm256_set1_epi32(N_proc);
      __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);

      k = k_master;
      for (int i = 0; i < kSize; ++i, ++arr) {
        __m256 reg = _mm256_mask_i32gather_ps(src, arr, vi, (__m256)k, sizeof(U));
        // Restore mask (docs say gather clears it but it doesn't seem to).
        k = k_master;
        _mm256_maskstore_ps(&fArray[i * N], k, reg);
      }
    }

#else

    void slurpIn(const T* arr, int vi[N], const int N_proc = N) {
      // Separate N_proc == N case (gains about 7% in fit test).
      if (N_proc == N) {
        for (int i = 0; i < kSize; ++i) {
          for (int j = 0; j < N; ++j) {
            fArray[i * N + j] = *(arr + i + vi[j]);
          }
        }
      } else {
        for (int i = 0; i < kSize; ++i) {
          for (int j = 0; j < N_proc; ++j) {
            fArray[i * N + j] = *(arr + i + vi[j]);
          }
        }
      }
    }

#endif
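
    // Illustrative use of slurpIn (a sketch, not code from this package): to
    // transpose N array-of-structures records into this matriplex, fill vi[j]
    // with the offset (in elements) of record j's first packed element
    // relative to arr, then call slurpIn(arr, vi); record j fills slot n = j.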

    void copyOut(idx_t n, T* arr) const {
      for (idx_t i = n; i < kTotSize; i += N) {
        *(arr++) = fArray[i];
      }
    }

    void setDiagonal3x3(idx_t n, T d) {
      T* p = fArray + n;

      p[0 * N] = d;
      p[1 * N] = 0;
      p[2 * N] = d;
      p[3 * N] = 0;
      p[4 * N] = 0;
      p[5 * N] = d;
    }
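
    // In the packed 3x3 layout the diagonal (0,0), (1,1), (2,2) sits at packed
    // indices 0, 2 and 5, which is why those three slots receive d above.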

    MatriplexSym& subtract(const MatriplexSym& a, const MatriplexSym& b) {
      // Does *this = a - b;

#pragma omp simd
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] = a.fArray[i] - b.fArray[i];
      }

      return *this;
    }

    // ==================================================================
    // Operations specific to Kalman fit in 6 parameter space
    // ==================================================================

    void addNoiseIntoUpperLeft3x3(T noise) {
      T* p = fArray;
      ASSUME_ALIGNED(p, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        p[0 * N + n] += noise;
        p[2 * N + n] += noise;
        p[5 * N + n] += noise;
      }
    }

    void invertUpperLeft3x3() {
      typedef T TT;

      T* a = fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
        const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
        const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
        const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
        const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
        const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];

        // Force determinant calculation in double precision.
        const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
        const TT s = TT(1) / det;

        a[0 * N + n] = s * c00;
        a[1 * N + n] = s * c01;
        a[2 * N + n] = s * c11;
        a[3 * N + n] = s * c02;
        a[4 * N + n] = s * c12;
        a[5 * N + n] = s * c22;
      }
    }
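
    // This is Cramer's rule: the cXY are cofactors of the symmetric 3x3
    // block, the determinant is expanded along the first column, and the
    // result is A^-1 = adj(A) / det. Symmetry means only six cofactors are
    // needed instead of nine.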

    Matriplex<T, 1, 1, N> ReduceFixedIJ(idx_t i, idx_t j) const {
      Matriplex<T, 1, 1, N> t;
      for (idx_t n = 0; n < N; ++n) {
        t[n] = constAt(n, i, j);
      }
      return t;
    }
  };

  template <typename T, idx_t D, idx_t N>
  using MPlexSym = MatriplexSym<T, D, N>;
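
  // Illustrative use (a sketch; sizes and values are made up):
  //   MPlexSym<float, 6, 16> S;          // 16 symmetric 6x6 covariance matrices
  //   S.setVal(0.0f);
  //   S.addNoiseIntoUpperLeft3x3(0.01f); // inflate the position block
  //   float e00 = S(3, 0, 0);            // element (0,0) of matrix 3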

  //==============================================================================
  // Multiplications
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  struct SymMultiplyCls {
    static void multiply(const MPlexSym<T, D, N>& A, const MPlexSym<T, D, N>& B, MPlex<T, D, D, N>& C) {
      throw std::runtime_error("general symmetric multiplication not supported");
    }
  };

  template <typename T, idx_t N>
  struct SymMultiplyCls<T, 3, N> {
    static void multiply(const MPlexSym<T, 3, N>& A, const MPlexSym<T, 3, N>& B, MPlex<T, 3, 3, N>& C) {
      const T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);
      const T* b = B.fArray;
      ASSUME_ALIGNED(b, 64);
      T* c = C.fArray;
      ASSUME_ALIGNED(c, 64);

#ifdef MPLEX_INTRINSICS

      for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
#include "intr_sym_3x3.ah"
      }

#else

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
#include "std_sym_3x3.ah"
      }

#endif
    }
  };
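
  // The included .ah files supply the unrolled per-element multiply kernels:
  // the MPLEX_INTRINSICS variant consumes 64 / sizeof(T) matrices per loop
  // iteration (one 64-byte vector's worth), while the std variant is a scalar
  // body whose vectorization is left to omp simd.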

  template <typename T, idx_t N>
  struct SymMultiplyCls<T, 6, N> {
    static void multiply(const MPlexSym<float, 6, N>& A, const MPlexSym<float, 6, N>& B, MPlex<float, 6, 6, N>& C) {
      const T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);
      const T* b = B.fArray;
      ASSUME_ALIGNED(b, 64);
      T* c = C.fArray;
      ASSUME_ALIGNED(c, 64);

#ifdef MPLEX_INTRINSICS

      for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
#include "intr_sym_6x6.ah"
      }

#else

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
#include "std_sym_6x6.ah"
      }

#endif
    }
  };
343 
  template <typename T, idx_t D, idx_t N>
  void multiply(const MPlexSym<T, D, N>& A, const MPlexSym<T, D, N>& B, MPlex<T, D, D, N>& C) {
    SymMultiplyCls<T, D, N>::multiply(A, B, C);
  }
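
  // Illustrative use (a sketch): multiply two symmetric plexes into a full
  // one; the product of two symmetric matrices is in general not symmetric,
  // hence the full MPlex output.
  //   MPlexSym<float, 3, 8> A, B;
  //   MPlex<float, 3, 3, 8> C;
  //   multiply(A, B, C);  // dispatches to SymMultiplyCls<float, 3, 8>
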
  //==============================================================================
  // Cramer inversion
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  struct CramerInverterSym {
    static void invert(MPlexSym<T, D, N>& A, double* determ = nullptr) {
      throw std::runtime_error("general cramer inversion not supported");
    }
  };

  template <typename T, idx_t N>
  struct CramerInverterSym<T, 2, N> {
    static void invert(MPlexSym<T, 2, N>& A, double* determ = nullptr) {
      typedef T TT;

      T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        // Force determinant calculation in double precision.
        const double det = (double)a[0 * N + n] * a[2 * N + n] - (double)a[1 * N + n] * a[1 * N + n];
        if (determ)
          determ[n] = det;

        const TT s = TT(1) / det;
        const TT tmp = s * a[2 * N + n];
        a[1 * N + n] *= -s;
        a[2 * N + n] = s * a[0 * N + n];
        a[0 * N + n] = tmp;
      }
    }
  };
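
  // For a symmetric 2x2 matrix [[p, q], [q, r]], stored packed as p, q, r,
  // the inverse is [[r, -q], [-q, p]] / (p * r - q * q), which is exactly the
  // in-place update above.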

  template <typename T, idx_t N>
  struct CramerInverterSym<T, 3, N> {
    static void invert(MPlexSym<T, 3, N>& A, double* determ = nullptr) {
      typedef T TT;

      T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
        const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
        const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
        const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
        const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
        const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];

        // Force determinant calculation in double precision.
        const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
        if (determ)
          determ[n] = det;

        const TT s = TT(1) / det;
        a[0 * N + n] = s * c00;
        a[1 * N + n] = s * c01;
        a[2 * N + n] = s * c11;
        a[3 * N + n] = s * c02;
        a[4 * N + n] = s * c12;
        a[5 * N + n] = s * c22;
      }
    }
  };
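
  // Same cofactor kernel as MatriplexSym::invertUpperLeft3x3(), with the
  // optional per-matrix determinant written out for singularity checks.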

  template <typename T, idx_t D, idx_t N>
  void invertCramerSym(MPlexSym<T, D, N>& A, double* determ = nullptr) {
    CramerInverterSym<T, D, N>::invert(A, determ);
  }

  //==============================================================================
  // Cholesky inversion
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  struct CholeskyInverterSym {
    static void invert(MPlexSym<T, D, N>& A) { throw std::runtime_error("general cholesky inversion not supported"); }
  };

  template <typename T, idx_t N>
  struct CholeskyInverterSym<T, 3, N> {
    static void invert(MPlexSym<T, 3, N>& A) {
      typedef T TT;

      T* a = A.fArray;

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        TT l0 = std::sqrt(T(1) / a[0 * N + n]);
        TT l1 = a[1 * N + n] * l0;
        TT l2 = a[2 * N + n] - l1 * l1;
        l2 = std::sqrt(T(1) / l2);
        TT l3 = a[3 * N + n] * l0;
        TT l4 = (a[4 * N + n] - l1 * l3) * l2;
        TT l5 = a[5 * N + n] - (l3 * l3 + l4 * l4);
        l5 = std::sqrt(T(1) / l5);

        // decomposition done

        l3 = (l1 * l4 * l2 - l3) * l0 * l5;
        l1 = -l1 * l0 * l2;
        l4 = -l4 * l2 * l5;

        a[0 * N + n] = l3 * l3 + l1 * l1 + l0 * l0;
        a[1 * N + n] = l3 * l4 + l1 * l2;
        a[2 * N + n] = l4 * l4 + l2 * l2;
        a[3 * N + n] = l3 * l5;
        a[4 * N + n] = l4 * l5;
        a[5 * N + n] = l5 * l5;

        // m(2,x) are all zero if anything went wrong at l5.
        // all zero, if anything went wrong already for l0 or l2.
      }
    }
  };
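
  // The first block computes the Cholesky factor A = L L^T, keeping the
  // reciprocals 1/L00, 1/L11, 1/L22 in l0, l2, l5. After "decomposition done"
  // l0..l5 are rewritten into the entries of L^-1, and the final writes form
  // A^-1 = L^-T L^-1. Only valid for positive-definite input.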

  template <typename T, idx_t D, idx_t N>
  void invertCholeskySym(MPlexSym<T, D, N>& A) {
    CholeskyInverterSym<T, D, N>::invert(A);
  }
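
  // Illustrative use (a sketch): invert a plex of 3x3 covariance blocks.
  //   MPlexSym<float, 3, 8> S;
  //   // ... fill S ...
  //   invertCholeskySym(S);      // in place; requires positive-definite S
  //   invertCramerSym(S, dets);  // alternative; dets is a double[8] or nullptr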

}  // end namespace Matriplex

#endif