CMS 3D CMS Logo

MatriplexSym.h
Go to the documentation of this file.
1 #ifndef RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h
2 #define RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h
3 
4 #include "MatriplexCommon.h"
5 #include "Matriplex.h"
6 
7 //==============================================================================
8 // MatriplexSym
9 //==============================================================================
10 
11 namespace Matriplex {
12 
  // Offset tables mapping a dense (i,j) index of a DxD symmetric matrix to
  // its slot in packed lower-triangular storage: use gSymOffsets[D][i*D + j].
  // (i,j) and (j,i) map to the same slot, so only D*(D+1)/2 values are kept.
  // Rows for unsupported dimensions (0, 1, 4) are left empty.
  const idx_t gSymOffsets[7][36] = {
      {},
      {},
      {0, 1, 1, 2},
      {0, 1, 3, 1, 2, 4, 3, 4, 5},  // 3
      {},
      {0, 1, 3, 6, 10, 1, 2, 4, 7, 11, 3, 4, 5, 8, 12, 6, 7, 8, 9, 13, 10, 11, 12, 13, 14},
      {0, 1, 3, 6, 10, 15, 1, 2, 4, 7, 11, 16, 3, 4, 5, 8, 12, 17,
       6, 7, 8, 9, 13, 18, 10, 11, 12, 13, 14, 19, 15, 16, 17, 18, 19, 20}};
22 
23  //------------------------------------------------------------------------------
24 
  // A "matriplex" of N symmetric DxD matrices of T in structure-of-arrays
  // layout: packed-triangle element e of matrix slot n lives at
  // fArray[e * N + n], so loops over n vectorize.  Alignment is forced so
  // SIMD loads/stores on fArray are legal.
  template <typename T, idx_t D, idx_t N>
  class __attribute__((aligned(MPLEX_ALIGN))) MatriplexSym {
  public:
    typedef T value_type;

    /// no. of matrix rows
    static constexpr int kRows = D;
    /// no. of matrix columns
    static constexpr int kCols = D;
    /// no. of stored elements per matrix: the lower triangle
    static constexpr int kSize = (D + 1) * D / 2;
    /// size of the whole matriplex
    static constexpr int kTotSize = N * kSize;

    T fArray[kTotSize];

    MatriplexSym() {}
    MatriplexSym(T v) { setVal(v); }

    idx_t plexSize() const { return N; }

    // Set every element of every matrix slot to v.
    void setVal(T v) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] = v;
      }
    }

    // Element-wise += of another matriplex.
    void add(const MatriplexSym& v) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] += v.fArray[i];
      }
    }

    // Multiply every element by scale.
    void scale(T scale) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] *= scale;
      }
    }

    // Raw linear access into the SoA array (no (n,i,j) translation).
    T operator[](idx_t xx) const { return fArray[xx]; }
    T& operator[](idx_t xx) { return fArray[xx]; }

    // (i,j) -> packed-triangle offset table for this dimension.
    const idx_t* offsets() const { return gSymOffsets[D]; }
    idx_t off(idx_t i) const { return gSymOffsets[D][i]; }

    // Element (i,j) of matrix slot n; (i,j) and (j,i) alias the same storage.
    const T& constAt(idx_t n, idx_t i, idx_t j) const { return fArray[off(i * D + j) * N + n]; }

    T& At(idx_t n, idx_t i, idx_t j) { return fArray[off(i * D + j) * N + n]; }

    T& operator()(idx_t n, idx_t i, idx_t j) { return At(n, i, j); }
    const T& operator()(idx_t n, idx_t i, idx_t j) const { return constAt(n, i, j); }

    // Assignment copies the raw array; assumes T is trivially copyable.
    MatriplexSym& operator=(const MatriplexSym& m) {
      memcpy(fArray, m.fArray, sizeof(T) * kTotSize);
      return *this;
    }

    MatriplexSym(const MatriplexSym& m) = default;

    // Copy matrix slot n of m into slot n of this (other slots untouched).
    void copySlot(idx_t n, const MatriplexSym& m) {
      for (idx_t i = n; i < kTotSize; i += N) {
        fArray[i] = m.fArray[i];
      }
    }

    // Scatter a packed kSize-element array into matrix slot n.
    void copyIn(idx_t n, const T* arr) {
      for (idx_t i = n; i < kTotSize; i += N) {
        fArray[i] = *(arr++);
      }
    }

    // Copy matrix slot 'in' of m into matrix slot n of this.
    void copyIn(idx_t n, const MatriplexSym& m, idx_t in) {
      for (idx_t i = n; i < kTotSize; i += N, in += N) {
        fArray[i] = m[in];
      }
    }

    // Copy matrix slot 'in' into matrix slot n within this matriplex.
    void copy(idx_t n, idx_t in) {
      for (idx_t i = n; i < kTotSize; i += N, in += N) {
        fArray[i] = fArray[in];
      }
    }

#if defined(AVX512_INTRINSICS)

    // Gather kSize consecutive T's for each of the first N_proc slots,
    // slot j sourced from arr scaled-indexed by vi lane j, into SoA layout.
    // Lanes >= N_proc are masked off and left untouched.
    template <typename U>
    void slurpIn(const T* arr, __m512i& vi, const U&, const int N_proc = N) {
      //_mm512_prefetch_i32gather_ps(vi, arr, 1, _MM_HINT_T0);

      const __m512 src = {0};
      const __mmask16 k = N_proc == N ? -1 : (1 << N_proc) - 1;

      for (int i = 0; i < kSize; ++i, ++arr) {
        //_mm512_prefetch_i32gather_ps(vi, arr+2, 1, _MM_HINT_NTA);

        __m512 reg = _mm512_mask_i32gather_ps(src, k, vi, arr, sizeof(U));
        _mm512_mask_store_ps(&fArray[i * N], k, reg);
      }
    }

    // Experimental methods, slurpIn() seems to be at least as fast.
    // See comments in mkFit/MkFitter.cc MkFitter::addBestHit().

    // Stage per-slot cache lines into tmp, then gather from tmp into fArray.
    void ChewIn(const char* arr, int off, int vi[N], const char* tmp, __m512i& ui) {
      // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.

      for (int i = 0; i < N; ++i) {
        __m512 reg = _mm512_load_ps(arr + vi[i]);
        _mm512_store_ps((void*)(tmp + 64 * i), reg);
      }

      for (int i = 0; i < kSize; ++i) {
        __m512 reg = _mm512_i32gather_ps(ui, tmp + off + i * sizeof(T), 1);
        _mm512_store_ps(&fArray[i * N], reg);
      }
    }

    // Staging half of ChewIn: copy one cache line per slot into tmp.
    void Contaginate(const char* arr, int vi[N], const char* tmp) {
      // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.

      for (int i = 0; i < N; ++i) {
        __m512 reg = _mm512_load_ps(arr + vi[i]);
        _mm512_store_ps((void*)(tmp + 64 * i), reg);
      }
    }

    // Gather half of ChewIn: transpose staged tmp data into SoA fArray.
    void Plexify(const char* tmp, __m512i& ui) {
      for (int i = 0; i < kSize; ++i) {
        __m512 reg = _mm512_i32gather_ps(ui, tmp + i * sizeof(T), 1);
        _mm512_store_ps(&fArray[i * N], reg);
      }
    }

#elif defined(AVX2_INTRINSICS)

    // AVX2 variant of slurpIn: lane mask built by comparing lane ids to N_proc.
    template <typename U>
    void slurpIn(const T* arr, __m256i& vi, const U&, const int N_proc = N) {
      const __m256 src = {0};

      __m256i k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
      __m256i k_sel = _mm256_set1_epi32(N_proc);
      __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);

      k = k_master;
      for (int i = 0; i < kSize; ++i, ++arr) {
        __m256 reg = _mm256_mask_i32gather_ps(src, arr, vi, (__m256)k, sizeof(U));
        // Restore mask (docs say gather clears it but it doesn't seem to).
        k = k_master;
        _mm256_maskstore_ps(&fArray[i * N], k, reg);
      }
    }

#else

    // Scalar fallback: element i of slot j comes from arr[i + vi[j]].
    void slurpIn(const T* arr, int vi[N], const int N_proc = N) {
      // Separate N_proc == N case (gains about 7% in fit test).
      if (N_proc == N) {
        for (int i = 0; i < kSize; ++i) {
          for (int j = 0; j < N; ++j) {
            fArray[i * N + j] = *(arr + i + vi[j]);
          }
        }
      } else {
        for (int i = 0; i < kSize; ++i) {
          for (int j = 0; j < N_proc; ++j) {
            fArray[i * N + j] = *(arr + i + vi[j]);
          }
        }
      }
    }

#endif

    // Gather matrix slot n into a packed kSize-element output array.
    void copyOut(idx_t n, T* arr) const {
      for (idx_t i = n; i < kTotSize; i += N) {
        *(arr++) = fArray[i];
      }
    }

    // Set the upper-left 3x3 of matrix slot n to d * Identity.
    // Packed offsets 0, 2, 5 are the diagonal; 1, 3, 4 the off-diagonals.
    void setDiagonal3x3(idx_t n, T d) {
      T* p = fArray + n;

      p[0 * N] = d;
      p[1 * N] = 0;
      p[2 * N] = d;
      p[3 * N] = 0;
      p[4 * N] = 0;
      p[5 * N] = d;
    }

    MatriplexSym& subtract(const MatriplexSym& a, const MatriplexSym& b) {
      // Does *this = a - b;

#pragma omp simd
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] = a.fArray[i] - b.fArray[i];
      }

      return *this;
    }

    // ==================================================================
    // Operations specific to Kalman fit in 6 parameter space
    // ==================================================================

    // Add 'noise' to the diagonal of the upper-left 3x3 of every slot
    // (packed diagonal offsets are 0, 2, 5).
    void addNoiseIntoUpperLeft3x3(T noise) {
      T* p = fArray;
      ASSUME_ALIGNED(p, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        p[0 * N + n] += noise;
        p[2 * N + n] += noise;
        p[5 * N + n] += noise;
      }
    }

    // In-place Cramer (cofactor) inversion of the upper-left 3x3 of every
    // slot.  No singularity check: det == 0 produces inf/nan elements.
    void invertUpperLeft3x3() {
      typedef T TT;

      T* a = fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        // Cofactors; packed layout: 0=m00 1=m10 2=m11 3=m20 4=m21 5=m22.
        const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
        const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
        const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
        const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
        const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
        const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];

        // Force determinant calculation in double precision.
        const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
        const TT s = TT(1) / det;

        a[0 * N + n] = s * c00;
        a[1 * N + n] = s * c01;
        a[2 * N + n] = s * c11;
        a[3 * N + n] = s * c02;
        a[4 * N + n] = s * c12;
        a[5 * N + n] = s * c22;
      }
    }

    // Extract element (i,j) of every slot into a 1x1 matriplex.
    Matriplex<T, 1, 1, N> ReduceFixedIJ(idx_t i, idx_t j) const {
      Matriplex<T, 1, 1, N> t;
      for (idx_t n = 0; n < N; ++n) {
        t[n] = constAt(n, i, j);
      }
      return t;
    }
  };
278 
  // Shorthand alias used by the multiplication / inversion helpers below.
  template <typename T, idx_t D, idx_t N>
  using MPlexSym = MatriplexSym<T, D, N>;
281 
282  //==============================================================================
283  // Multiplications
284  //==============================================================================
285 
286  template <typename T, idx_t D, idx_t N>
287  struct SymMultiplyCls {
289  throw std::runtime_error("general symmetric multiplication not supported");
290  }
291  };
292 
293  template <typename T, idx_t N>
294  struct SymMultiplyCls<T, 3, N> {
296  const T* a = A.fArray;
297  ASSUME_ALIGNED(a, 64);
298  const T* b = B.fArray;
299  ASSUME_ALIGNED(b, 64);
300  T* c = C.fArray;
301  ASSUME_ALIGNED(c, 64);
302 
303 #ifdef MPLEX_INTRINSICS
304 
305  for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
306 #include "intr_sym_3x3.ah"
307  }
308 
309 #else
310 
311 #pragma omp simd
312  for (idx_t n = 0; n < N; ++n) {
313 #include "std_sym_3x3.ah"
314  }
315 
316 #endif
317  }
318  };
319 
320  template <typename T, idx_t N>
321  struct SymMultiplyCls<T, 6, N> {
323  const T* a = A.fArray;
324  ASSUME_ALIGNED(a, 64);
325  const T* b = B.fArray;
326  ASSUME_ALIGNED(b, 64);
327  T* c = C.fArray;
328  ASSUME_ALIGNED(c, 64);
329 
330 #ifdef MPLEX_INTRINSICS
331 
332  for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
333 #include "intr_sym_6x6.ah"
334  }
335 
336 #else
337 
338 #pragma omp simd
339  for (idx_t n = 0; n < N; ++n) {
340 #include "std_sym_6x6.ah"
341  }
342 
343 #endif
344  }
345  };
346 
347  template <typename T, idx_t D, idx_t N>
350  }
351 
352  //==============================================================================
353  // Cramer inversion
354  //==============================================================================
355 
356  template <typename T, idx_t D, idx_t N>
358  static void invert(MPlexSym<T, D, N>& A, double* determ = nullptr) {
359  throw std::runtime_error("general cramer inversion not supported");
360  }
361  };
362 
363  template <typename T, idx_t N>
364  struct CramerInverterSym<T, 2, N> {
365  static void invert(MPlexSym<T, 2, N>& A, double* determ = nullptr) {
366  typedef T TT;
367 
368  T* a = A.fArray;
369  ASSUME_ALIGNED(a, 64);
370 
371 #pragma omp simd
372  for (idx_t n = 0; n < N; ++n) {
373  // Force determinant calculation in double precision.
374  const double det = (double)a[0 * N + n] * a[2 * N + n] - (double)a[1 * N + n] * a[1 * N + n];
375  if (determ)
376  determ[n] = det;
377 
378  const TT s = TT(1) / det;
379  const TT tmp = s * a[2 * N + n];
380  a[1 * N + n] *= -s;
381  a[2 * N + n] = s * a[0 * N + n];
382  a[0 * N + n] = tmp;
383  }
384  }
385  };
386 
387  template <typename T, idx_t N>
388  struct CramerInverterSym<T, 3, N> {
389  static void invert(MPlexSym<T, 3, N>& A, double* determ = nullptr) {
390  typedef T TT;
391 
392  T* a = A.fArray;
393  ASSUME_ALIGNED(a, 64);
394 
395 #pragma omp simd
396  for (idx_t n = 0; n < N; ++n) {
397  const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
398  const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
399  const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
400  const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
401  const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
402  const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];
403 
404  // Force determinant calculation in double precision.
405  const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
406  if (determ)
407  determ[n] = det;
408 
409  const TT s = TT(1) / det;
410  a[0 * N + n] = s * c00;
411  a[1 * N + n] = s * c01;
412  a[2 * N + n] = s * c11;
413  a[3 * N + n] = s * c02;
414  a[4 * N + n] = s * c12;
415  a[5 * N + n] = s * c22;
416  }
417  }
418  };
419 
420  template <typename T, idx_t D, idx_t N>
421  void invertCramerSym(MPlexSym<T, D, N>& A, double* determ = nullptr) {
423  }
424 
425  //==============================================================================
426  // Cholesky inversion
427  //==============================================================================
428 
429  template <typename T, idx_t D, idx_t N>
431  static void invert(MPlexSym<T, D, N>& A) { throw std::runtime_error("general cholesky inversion not supported"); }
432  };
433 
  // In-place inversion of N packed symmetric 3x3 matrices via Cholesky
  // decomposition (A = L L^T, then A^-1 = L^-T L^-1).  Valid only for
  // positive-definite input; see the trailing note on failure behavior.
  template <typename T, idx_t N>
  struct CholeskyInverterSym<T, 3, N> {
    static void invert(MPlexSym<T, 3, N>& A) {
      typedef T TT;

      T* a = A.fArray;

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        // Cholesky factors; l0, l2, l5 hold reciprocals of the diagonal.
        TT l0 = std::sqrt(T(1) / a[0 * N + n]);
        TT l1 = a[1 * N + n] * l0;
        TT l2 = a[2 * N + n] - l1 * l1;
        l2 = std::sqrt(T(1) / l2);
        TT l3 = a[3 * N + n] * l0;
        TT l4 = (a[4 * N + n] - l1 * l3) * l2;
        TT l5 = a[5 * N + n] - (l3 * l3 + l4 * l4);
        l5 = std::sqrt(T(1) / l5);

        // decomposition done

        // Elements of L^-1 (lower triangular); diagonal is l0, l2, l5.
        l3 = (l1 * l4 * l2 - l3) * l0 * l5;
        l1 = -l1 * l0 * l2;
        l4 = -l4 * l2 * l5;

        // A^-1 = L^-T L^-1, written back into packed storage.
        a[0 * N + n] = l3 * l3 + l1 * l1 + l0 * l0;
        a[1 * N + n] = l3 * l4 + l1 * l2;
        a[2 * N + n] = l4 * l4 + l2 * l2;
        a[3 * N + n] = l3 * l5;
        a[4 * N + n] = l4 * l5;
        a[5 * N + n] = l5 * l5;

        // m(2,x) are all zero if anything went wrong at l5.
        // all zero, if anything went wrong already for l0 or l2.
      }
    }
  };
470 
471  template <typename T, idx_t D, idx_t N>
474  }
475 
476 } // end namespace Matriplex
477 
478 #endif
Basic3DVector & operator=(const Basic3DVector &)=default
Assignment operator.
MatriplexSym< T, D, N > MPlexSym
Definition: MatriplexSym.h:280
Definition: APVGainStruct.h:7
Matriplex< T, D1, D2, N > MPlex
Definition: Matriplex.h:483
void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
Definition: Matriplex.h:881
const idx_t gSymOffsets[7][36]
Definition: MatriplexSym.h:13
void invertCholeskySym(MPlexSym< T, D, N > &A)
Definition: MatriplexSym.h:472
static void invert(MPlexSym< T, 2, N > &A, double *determ=nullptr)
Definition: MatriplexSym.h:365
static void multiply(const MPlexSym< T, 3, N > &A, const MPlexSym< T, 3, N > &B, MPlex< T, 3, 3, N > &C)
Definition: MatriplexSym.h:295
void invertCramerSym(MPlexSym< T, D, N > &A, double *determ=nullptr)
Definition: MatriplexSym.h:421
static void invert(MPlexSym< T, D, N > &A, double *determ=nullptr)
Definition: MatriplexSym.h:358
static void invert(MPlexSym< T, 3, N > &A)
Definition: MatriplexSym.h:436
T sqrt(T t)
Definition: SSEVec.h:23
static void multiply(const MPlexSym< float, 6, N > &A, const MPlexSym< float, 6, N > &B, MPlex< float, 6, 6, N > &C)
Definition: MatriplexSym.h:322
#define MPLEX_ALIGN
static void invert(MPlexSym< T, 3, N > &A, double *determ=nullptr)
Definition: MatriplexSym.h:389
d
Definition: ztail.py:151
#define N
Definition: blowfish.cc:9
DecomposeProduct< arg, typename Div::arg > D
Definition: Factorize.h:141
double b
Definition: hdecay.h:120
class __attribute__((aligned(32))) Matriplex
Definition: Matriplex.h:35
static void multiply(const MPlexSym< T, D, N > &A, const MPlexSym< T, D, N > &B, MPlex< T, D, D, N > &C)
Definition: MatriplexSym.h:288
static void invert(MPlexSym< T, D, N > &A)
Definition: MatriplexSym.h:431
void add(std::map< std::string, TH1 *> &h, TH1 *hist)
double a
Definition: hdecay.h:121
Definition: APVGainStruct.h:7
T operator[](int i) const
tmp
align.sh
Definition: createJobs.py:716
long double T
#define ASSUME_ALIGNED(a, b)