CMS 3D CMS Logo

MatriplexSym.h
Go to the documentation of this file.
1 #ifndef RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h
2 #define RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h
3 
4 #include "MatriplexCommon.h"
5 #include "Matriplex.h"
6 
7 //==============================================================================
8 // MatriplexSym
9 //==============================================================================
10 
11 namespace Matriplex {
12 
13  const idx_t gSymOffsets[7][36] = {{},
14  {},
15  {0, 1, 1, 2},
16  {0, 1, 3, 1, 2, 4, 3, 4, 5}, // 3
17  {},
18  {},
19  {0, 1, 3, 6, 10, 15, 1, 2, 4, 7, 11, 16, 3, 4, 5, 8, 12, 17,
20  6, 7, 8, 9, 13, 18, 10, 11, 12, 13, 14, 19, 15, 16, 17, 18, 19, 20}};
21 
22  //------------------------------------------------------------------------------
23 
24  template <typename T, idx_t D, idx_t N>
25  class MatriplexSym {
26  public:
27  typedef T value_type;
28 
30  static constexpr int kRows = D;
32  static constexpr int kCols = D;
34  static constexpr int kSize = (D + 1) * D / 2;
36  static constexpr int kTotSize = N * kSize;
37 
38  T fArray[kTotSize] __attribute__((aligned(64)));
39 
42 
43  idx_t plexSize() const { return N; }
44 
45  void setVal(T v) {
46  for (idx_t i = 0; i < kTotSize; ++i) {
47  fArray[i] = v;
48  }
49  }
50 
51  void add(const MatriplexSym& v) {
52  for (idx_t i = 0; i < kTotSize; ++i) {
53  fArray[i] += v.fArray[i];
54  }
55  }
56 
57  void scale(T scale) {
58  for (idx_t i = 0; i < kTotSize; ++i) {
59  fArray[i] *= scale;
60  }
61  }
62 
63  T operator[](idx_t xx) const { return fArray[xx]; }
64  T& operator[](idx_t xx) { return fArray[xx]; }
65 
66  const idx_t* offsets() const { return gSymOffsets[D]; }
67  idx_t off(idx_t i) const { return gSymOffsets[D][i]; }
68 
69  const T& constAt(idx_t n, idx_t i, idx_t j) const { return fArray[off(i * D + j) * N + n]; }
70 
71  T& At(idx_t n, idx_t i, idx_t j) { return fArray[off(i * D + j) * N + n]; }
72 
73  T& operator()(idx_t n, idx_t i, idx_t j) { return At(n, i, j); }
74  const T& operator()(idx_t n, idx_t i, idx_t j) const { return constAt(n, i, j); }
75 
77  memcpy(fArray, m.fArray, sizeof(T) * kTotSize);
78  return *this;
79  }
80 
81  void copySlot(idx_t n, const MatriplexSym& m) {
82  for (idx_t i = n; i < kTotSize; i += N) {
83  fArray[i] = m.fArray[i];
84  }
85  }
86 
87  void copyIn(idx_t n, const T* arr) {
88  for (idx_t i = n; i < kTotSize; i += N) {
89  fArray[i] = *(arr++);
90  }
91  }
92 
93  void copyIn(idx_t n, const MatriplexSym& m, idx_t in) {
94  for (idx_t i = n; i < kTotSize; i += N, in += N) {
95  fArray[i] = m[in];
96  }
97  }
98 
99  void copy(idx_t n, idx_t in) {
100  for (idx_t i = n; i < kTotSize; i += N, in += N) {
101  fArray[i] = fArray[in];
102  }
103  }
104 
105 #if defined(AVX512_INTRINSICS)
106 
107  template <typename U>
108  void slurpIn(const T* arr, __m512i& vi, const U&, const int N_proc = N) {
109  //_mm512_prefetch_i32gather_ps(vi, arr, 1, _MM_HINT_T0);
110 
111  const __m512 src = {0};
112  const __mmask16 k = N_proc == N ? -1 : (1 << N_proc) - 1;
113 
114  for (int i = 0; i < kSize; ++i, ++arr) {
115  //_mm512_prefetch_i32gather_ps(vi, arr+2, 1, _MM_HINT_NTA);
116 
117  __m512 reg = _mm512_mask_i32gather_ps(src, k, vi, arr, sizeof(U));
118  _mm512_mask_store_ps(&fArray[i * N], k, reg);
119  }
120  }
121 
122  // Experimental methods, slurpIn() seems to be at least as fast.
123  // See comments in mkFit/MkFitter.cc MkFitter::addBestHit().
124 
125  void ChewIn(const char* arr, int off, int vi[N], const char* tmp, __m512i& ui) {
126  // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.
127 
128  for (int i = 0; i < N; ++i) {
129  __m512 reg = _mm512_load_ps(arr + vi[i]);
130  _mm512_store_ps((void*)(tmp + 64 * i), reg);
131  }
132 
133  for (int i = 0; i < kSize; ++i) {
134  __m512 reg = _mm512_i32gather_ps(ui, tmp + off + i * sizeof(T), 1);
135  _mm512_store_ps(&fArray[i * N], reg);
136  }
137  }
138 
139  void Contaginate(const char* arr, int vi[N], const char* tmp) {
140  // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.
141 
142  for (int i = 0; i < N; ++i) {
143  __m512 reg = _mm512_load_ps(arr + vi[i]);
144  _mm512_store_ps((void*)(tmp + 64 * i), reg);
145  }
146  }
147 
148  void Plexify(const char* tmp, __m512i& ui) {
149  for (int i = 0; i < kSize; ++i) {
150  __m512 reg = _mm512_i32gather_ps(ui, tmp + i * sizeof(T), 1);
151  _mm512_store_ps(&fArray[i * N], reg);
152  }
153  }
154 
155 #elif defined(AVX2_INTRINSICS)
156 
157  template <typename U>
158  void slurpIn(const T* arr, __m256i& vi, const U&, const int N_proc = N) {
159  const __m256 src = {0};
160 
161  __m256i k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
162  __m256i k_sel = _mm256_set1_epi32(N_proc);
163  __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);
164 
165  k = k_master;
166  for (int i = 0; i < kSize; ++i, ++arr) {
167  __m256 reg = _mm256_mask_i32gather_ps(src, arr, vi, (__m256)k, sizeof(U));
168  // Restore mask (docs say gather clears it but it doesn't seem to).
169  k = k_master;
170  _mm256_maskstore_ps(&fArray[i * N], k, reg);
171  }
172  }
173 
174 #else
175 
176  void slurpIn(const T* arr, int vi[N], const int N_proc = N) {
177  // Separate N_proc == N case (gains about 7% in fit test).
178  if (N_proc == N) {
179  for (int i = 0; i < kSize; ++i) {
180  for (int j = 0; j < N; ++j) {
181  fArray[i * N + j] = *(arr + i + vi[j]);
182  }
183  }
184  } else {
185  for (int i = 0; i < kSize; ++i) {
186  for (int j = 0; j < N_proc; ++j) {
187  fArray[i * N + j] = *(arr + i + vi[j]);
188  }
189  }
190  }
191  }
192 
193 #endif
194 
195  void copyOut(idx_t n, T* arr) const {
196  for (idx_t i = n; i < kTotSize; i += N) {
197  *(arr++) = fArray[i];
198  }
199  }
200 
202  T* p = fArray + n;
203 
204  p[0 * N] = d;
205  p[1 * N] = 0;
206  p[2 * N] = d;
207  p[3 * N] = 0;
208  p[4 * N] = 0;
209  p[5 * N] = d;
210  }
211 
213  // Does *this = a - b;
214 
215 #pragma omp simd
216  for (idx_t i = 0; i < kTotSize; ++i) {
217  fArray[i] = a.fArray[i] - b.fArray[i];
218  }
219 
220  return *this;
221  }
222 
223  // ==================================================================
224  // Operations specific to Kalman fit in 6 parameter space
225  // ==================================================================
226 
228  T* p = fArray;
229  ASSUME_ALIGNED(p, 64);
230 
231 #pragma omp simd
232  for (idx_t n = 0; n < N; ++n) {
233  p[0 * N + n] += noise;
234  p[2 * N + n] += noise;
235  p[5 * N + n] += noise;
236  }
237  }
238 
240  typedef T TT;
241 
242  T* a = fArray;
243  ASSUME_ALIGNED(a, 64);
244 
245 #pragma omp simd
246  for (idx_t n = 0; n < N; ++n) {
247  const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
248  const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
249  const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
250  const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
251  const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
252  const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];
253 
254  // Force determinant calculation in double precision.
255  const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
256  const TT s = TT(1) / det;
257 
258  a[0 * N + n] = s * c00;
259  a[1 * N + n] = s * c01;
260  a[2 * N + n] = s * c11;
261  a[3 * N + n] = s * c02;
262  a[4 * N + n] = s * c12;
263  a[5 * N + n] = s * c22;
264  }
265  }
266  };
267 
268  template <typename T, idx_t D, idx_t N>
270 
271  //==============================================================================
272  // Multiplications
273  //==============================================================================
274 
275  template <typename T, idx_t D, idx_t N>
276  struct SymMultiplyCls {
278  throw std::runtime_error("general symmetric multiplication not supported");
279  }
280  };
281 
282  template <typename T, idx_t N>
283  struct SymMultiplyCls<T, 3, N> {
285  const T* a = A.fArray;
286  ASSUME_ALIGNED(a, 64);
287  const T* b = B.fArray;
288  ASSUME_ALIGNED(b, 64);
289  T* c = C.fArray;
290  ASSUME_ALIGNED(c, 64);
291 
292 #ifdef MPLEX_INTRINSICS
293 
294  for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
295 #include "intr_sym_3x3.ah"
296  }
297 
298 #else
299 
300 #pragma omp simd
301  for (idx_t n = 0; n < N; ++n) {
302 #include "std_sym_3x3.ah"
303  }
304 
305 #endif
306  }
307  };
308 
309  template <typename T, idx_t N>
310  struct SymMultiplyCls<T, 6, N> {
312  const T* a = A.fArray;
313  ASSUME_ALIGNED(a, 64);
314  const T* b = B.fArray;
315  ASSUME_ALIGNED(b, 64);
316  T* c = C.fArray;
317  ASSUME_ALIGNED(c, 64);
318 
319 #ifdef MPLEX_INTRINSICS
320 
321  for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
322 #include "intr_sym_6x6.ah"
323  }
324 
325 #else
326 
327 #pragma omp simd
328  for (idx_t n = 0; n < N; ++n) {
329 #include "std_sym_6x6.ah"
330  }
331 
332 #endif
333  }
334  };
335 
336  template <typename T, idx_t D, idx_t N>
339  }
340 
341  //==============================================================================
342  // Cramer inversion
343  //==============================================================================
344 
345  template <typename T, idx_t D, idx_t N>
347  static void invert(MPlexSym<T, D, N>& A, double* determ = nullptr) {
348  throw std::runtime_error("general cramer inversion not supported");
349  }
350  };
351 
352  template <typename T, idx_t N>
353  struct CramerInverterSym<T, 2, N> {
354  static void invert(MPlexSym<T, 2, N>& A, double* determ = nullptr) {
355  typedef T TT;
356 
357  T* a = A.fArray;
358  ASSUME_ALIGNED(a, 64);
359 
360 #pragma omp simd
361  for (idx_t n = 0; n < N; ++n) {
362  // Force determinant calculation in double precision.
363  const double det = (double)a[0 * N + n] * a[2 * N + n] - (double)a[1 * N + n] * a[1 * N + n];
364  if (determ)
365  determ[n] = det;
366 
367  const TT s = TT(1) / det;
368  const TT tmp = s * a[2 * N + n];
369  a[1 * N + n] *= -s;
370  a[2 * N + n] = s * a[0 * N + n];
371  a[0 * N + n] = tmp;
372  }
373  }
374  };
375 
376  template <typename T, idx_t N>
377  struct CramerInverterSym<T, 3, N> {
378  static void invert(MPlexSym<T, 3, N>& A, double* determ = nullptr) {
379  typedef T TT;
380 
381  T* a = A.fArray;
382  ASSUME_ALIGNED(a, 64);
383 
384 #pragma omp simd
385  for (idx_t n = 0; n < N; ++n) {
386  const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
387  const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
388  const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
389  const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
390  const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
391  const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];
392 
393  // Force determinant calculation in double precision.
394  const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
395  if (determ)
396  determ[n] = det;
397 
398  const TT s = TT(1) / det;
399  a[0 * N + n] = s * c00;
400  a[1 * N + n] = s * c01;
401  a[2 * N + n] = s * c11;
402  a[3 * N + n] = s * c02;
403  a[4 * N + n] = s * c12;
404  a[5 * N + n] = s * c22;
405  }
406  }
407  };
408 
409  template <typename T, idx_t D, idx_t N>
410  void invertCramerSym(MPlexSym<T, D, N>& A, double* determ = nullptr) {
412  }
413 
414  //==============================================================================
415  // Cholesky inversion
416  //==============================================================================
417 
418  template <typename T, idx_t D, idx_t N>
420  static void invert(MPlexSym<T, D, N>& A) { throw std::runtime_error("general cholesky inversion not supported"); }
421  };
422 
423  template <typename T, idx_t N>
424  struct CholeskyInverterSym<T, 3, N> {
425  static void invert(MPlexSym<T, 3, N>& A) {
426  typedef T TT;
427 
428  T* a = A.fArray;
429 
430 #pragma omp simd
431  for (idx_t n = 0; n < N; ++n) {
432  TT l0 = std::sqrt(T(1) / a[0 * N + n]);
433  TT l1 = a[1 * N + n] * l0;
434  TT l2 = a[2 * N + n] - l1 * l1;
435  l2 = std::sqrt(T(1) / l2);
436  TT l3 = a[3 * N + n] * l0;
437  TT l4 = (a[4 * N + n] - l1 * l3) * l2;
438  TT l5 = a[5 * N + n] - (l3 * l3 + l4 * l4);
439  l5 = std::sqrt(T(1) / l5);
440 
441  // decomposition done
442 
443  l3 = (l1 * l4 * l2 - l3) * l0 * l5;
444  l1 = -l1 * l0 * l2;
445  l4 = -l4 * l2 * l5;
446 
447  a[0 * N + n] = l3 * l3 + l1 * l1 + l0 * l0;
448  a[1 * N + n] = l3 * l4 + l1 * l2;
449  a[2 * N + n] = l4 * l4 + l2 * l2;
450  a[3 * N + n] = l3 * l5;
451  a[4 * N + n] = l4 * l5;
452  a[5 * N + n] = l5 * l5;
453 
454  // m(2,x) are all zero if anything went wrong at l5.
455  // all zero, if anything went wrong already for l0 or l2.
456  }
457  }
458  };
459 
460  template <typename T, idx_t D, idx_t N>
463  }
464 
465 } // end namespace Matriplex
466 
467 #endif
MatriplexSym & subtract(const MatriplexSym &a, const MatriplexSym &b)
Definition: MatriplexSym.h:212
static constexpr int kCols
no. of matrix columns
Definition: MatriplexSym.h:32
Definition: APVGainStruct.h:7
void setDiagonal3x3(idx_t n, T d)
Definition: MatriplexSym.h:201
void add(const MatriplexSym &v)
Definition: MatriplexSym.h:51
void copySlot(idx_t n, const MatriplexSym &m)
Definition: MatriplexSym.h:81
idx_t off(idx_t i) const
Definition: MatriplexSym.h:67
void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
Definition: Matriplex.h:340
const idx_t * offsets() const
Definition: MatriplexSym.h:66
static constexpr int kTotSize
size of the whole matriplex
Definition: MatriplexSym.h:36
T & At(idx_t n, idx_t i, idx_t j)
Definition: MatriplexSym.h:71
T operator[](idx_t xx) const
Definition: MatriplexSym.h:63
const idx_t gSymOffsets[7][36]
Definition: MatriplexSym.h:13
void invertCholeskySym(MPlexSym< T, D, N > &A)
Definition: MatriplexSym.h:461
static void invert(MPlexSym< T, 2, N > &A, double *determ=nullptr)
Definition: MatriplexSym.h:354
static void multiply(const MPlexSym< T, 3, N > &A, const MPlexSym< T, 3, N > &B, MPlex< T, 3, 3, N > &C)
Definition: MatriplexSym.h:284
T & operator()(idx_t n, idx_t i, idx_t j)
Definition: MatriplexSym.h:73
void invertCramerSym(MPlexSym< T, D, N > &A, double *determ=nullptr)
Definition: MatriplexSym.h:410
static void invert(MPlexSym< T, D, N > &A, double *determ=nullptr)
Definition: MatriplexSym.h:347
const T & operator()(idx_t n, idx_t i, idx_t j) const
Definition: MatriplexSym.h:74
const T & constAt(idx_t n, idx_t i, idx_t j) const
Definition: MatriplexSym.h:69
static void invert(MPlexSym< T, 3, N > &A)
Definition: MatriplexSym.h:425
T sqrt(T t)
Definition: SSEVec.h:19
static void multiply(const MPlexSym< float, 6, N > &A, const MPlexSym< float, 6, N > &B, MPlex< float, 6, 6, N > &C)
Definition: MatriplexSym.h:311
void copyOut(idx_t n, T *arr) const
Definition: MatriplexSym.h:195
static void invert(MPlexSym< T, 3, N > &A, double *determ=nullptr)
Definition: MatriplexSym.h:378
d
Definition: ztail.py:151
idx_t plexSize() const
Definition: MatriplexSym.h:43
void addNoiseIntoUpperLeft3x3(T noise)
Definition: MatriplexSym.h:227
void slurpIn(const T *arr, int vi[N], const int N_proc=N)
Definition: MatriplexSym.h:176
#define N
Definition: blowfish.cc:9
DecomposeProduct< arg, typename Div::arg > D
Definition: Factorize.h:141
void copyIn(idx_t n, const MatriplexSym &m, idx_t in)
Definition: MatriplexSym.h:93
void copy(idx_t n, idx_t in)
Definition: MatriplexSym.h:99
double b
Definition: hdecay.h:120
T fArray [kTotSize] __attribute__((aligned(64)))
static void multiply(const MPlexSym< T, D, N > &A, const MPlexSym< T, D, N > &B, MPlex< T, D, D, N > &C)
Definition: MatriplexSym.h:277
static void invert(MPlexSym< T, D, N > &A)
Definition: MatriplexSym.h:420
T & operator[](idx_t xx)
Definition: MatriplexSym.h:64
void copyIn(idx_t n, const T *arr)
Definition: MatriplexSym.h:87
double a
Definition: hdecay.h:121
Definition: APVGainStruct.h:7
static constexpr int kSize
no of elements: lower triangle
Definition: MatriplexSym.h:34
tmp
align.sh
Definition: createJobs.py:716
long double T
#define ASSUME_ALIGNED(a, b)
MatriplexSym & operator=(const MatriplexSym &m)
Definition: MatriplexSym.h:76
static constexpr int kRows
no. of matrix rows
Definition: MatriplexSym.h:30