CMS 3D CMS Logo

Matriplex.h
Go to the documentation of this file.
1 #ifndef RecoTracker_MkFitCore_src_Matriplex_Matriplex_h
2 #define RecoTracker_MkFitCore_src_Matriplex_Matriplex_h
3 
4 #include "MatriplexCommon.h"
5 
6 namespace Matriplex {
7 
8  //------------------------------------------------------------------------------
9 
10  template <typename T, idx_t D1, idx_t D2, idx_t N>
11  class __attribute__((aligned(MPLEX_ALIGN))) Matriplex {
12  public:
13  typedef T value_type;
14 
16  static constexpr int kRows = D1;
18  static constexpr int kCols = D2;
20  static constexpr int kSize = D1 * D2;
22  static constexpr int kTotSize = N * kSize;
23 
24  T fArray[kTotSize];
25 
26  Matriplex() {}
27  Matriplex(T v) { setVal(v); }
28 
29  idx_t plexSize() const { return N; }
30 
31  void setVal(T v) {
32  for (idx_t i = 0; i < kTotSize; ++i) {
33  fArray[i] = v;
34  }
35  }
36 
37  void add(const Matriplex& v) {
38  for (idx_t i = 0; i < kTotSize; ++i) {
39  fArray[i] += v.fArray[i];
40  }
41  }
42 
43  void scale(T scale) {
44  for (idx_t i = 0; i < kTotSize; ++i) {
45  fArray[i] *= scale;
46  }
47  }
48 
49  T operator[](idx_t xx) const { return fArray[xx]; }
50  T& operator[](idx_t xx) { return fArray[xx]; }
51 
52  const T& constAt(idx_t n, idx_t i, idx_t j) const { return fArray[(i * D2 + j) * N + n]; }
53 
54  T& At(idx_t n, idx_t i, idx_t j) { return fArray[(i * D2 + j) * N + n]; }
55 
56  T& operator()(idx_t n, idx_t i, idx_t j) { return fArray[(i * D2 + j) * N + n]; }
57  const T& operator()(idx_t n, idx_t i, idx_t j) const { return fArray[(i * D2 + j) * N + n]; }
58 
60  for (idx_t i = 0; i < kTotSize; ++i)
61  fArray[i] = t;
62  return *this;
63  }
64 
66  for (idx_t i = 0; i < kTotSize; ++i)
67  fArray[i] += t;
68  return *this;
69  }
70 
72  for (idx_t i = 0; i < kTotSize; ++i)
73  fArray[i] -= t;
74  return *this;
75  }
76 
78  for (idx_t i = 0; i < kTotSize; ++i)
79  fArray[i] *= t;
80  return *this;
81  }
82 
84  for (idx_t i = 0; i < kTotSize; ++i)
85  fArray[i] /= t;
86  return *this;
87  }
88 
89  Matriplex& operator+=(const Matriplex& a) {
90  for (idx_t i = 0; i < kTotSize; ++i)
91  fArray[i] += a.fArray[i];
92  return *this;
93  }
94 
95  Matriplex& operator-=(const Matriplex& a) {
96  for (idx_t i = 0; i < kTotSize; ++i)
97  fArray[i] -= a.fArray[i];
98  return *this;
99  }
100 
101  Matriplex& operator*=(const Matriplex& a) {
102  for (idx_t i = 0; i < kTotSize; ++i)
103  fArray[i] *= a.fArray[i];
104  return *this;
105  }
106 
107  Matriplex& operator/=(const Matriplex& a) {
108  for (idx_t i = 0; i < kTotSize; ++i)
109  fArray[i] /= a.fArray[i];
110  return *this;
111  }
112 
113  Matriplex operator-() {
114  Matriplex t;
115  for (idx_t i = 0; i < kTotSize; ++i)
116  t.fArray[i] = -fArray[i];
117  return t;
118  }
119 
120  Matriplex& abs(const Matriplex& a) {
121  for (idx_t i = 0; i < kTotSize; ++i)
122  fArray[i] = std::abs(a.fArray[i]);
123  return *this;
124  }
125  Matriplex& abs() {
126  for (idx_t i = 0; i < kTotSize; ++i)
127  fArray[i] = std::abs(fArray[i]);
128  return *this;
129  }
130 
131  Matriplex& sqrt(const Matriplex& a) {
132  for (idx_t i = 0; i < kTotSize; ++i)
133  fArray[i] = std::sqrt(a.fArray[i]);
134  return *this;
135  }
136  Matriplex& sqrt() {
137  for (idx_t i = 0; i < kTotSize; ++i)
138  fArray[i] = std::sqrt(fArray[i]);
139  return *this;
140  }
141 
142  Matriplex& sqr(const Matriplex& a) {
143  for (idx_t i = 0; i < kTotSize; ++i)
144  fArray[i] = a.fArray[i] * a.fArray[i];
145  return *this;
146  }
147  Matriplex& sqr() {
148  for (idx_t i = 0; i < kTotSize; ++i)
149  fArray[i] = fArray[i] * fArray[i];
150  return *this;
151  }
152 
153  Matriplex& hypot(const Matriplex& a, const Matriplex& b) {
154  for (idx_t i = 0; i < kTotSize; ++i) {
155  fArray[i] = a.fArray[i] * a.fArray[i] + b.fArray[i] * b.fArray[i];
156  }
157  return sqrt();
158  }
159 
160  Matriplex& sin(const Matriplex& a) {
161  for (idx_t i = 0; i < kTotSize; ++i)
162  fArray[i] = std::sin(a.fArray[i]);
163  return *this;
164  }
165  Matriplex& sin() {
166  for (idx_t i = 0; i < kTotSize; ++i)
167  fArray[i] = std::sin(fArray[i]);
168  return *this;
169  }
170 
171  Matriplex& cos(const Matriplex& a) {
172  for (idx_t i = 0; i < kTotSize; ++i)
173  fArray[i] = std::cos(a.fArray[i]);
174  return *this;
175  }
176  Matriplex& cos() {
177  for (idx_t i = 0; i < kTotSize; ++i)
178  fArray[i] = std::cos(fArray[i]);
179  return *this;
180  }
181 
182  Matriplex& tan(const Matriplex& a) {
183  for (idx_t i = 0; i < kTotSize; ++i)
184  fArray[i] = std::tan(a.fArray[i]);
185  return *this;
186  }
187  Matriplex& tan() {
188  for (idx_t i = 0; i < kTotSize; ++i)
189  fArray[i] = std::tan(fArray[i]);
190  return *this;
191  }
192 
193  //---------------------------------------------------------
194 
195  void copySlot(idx_t n, const Matriplex& m) {
196  for (idx_t i = n; i < kTotSize; i += N) {
197  fArray[i] = m.fArray[i];
198  }
199  }
200 
201  void copyIn(idx_t n, const T* arr) {
202  for (idx_t i = n; i < kTotSize; i += N) {
203  fArray[i] = *(arr++);
204  }
205  }
206 
207  void copyIn(idx_t n, const Matriplex& m, idx_t in) {
208  for (idx_t i = n; i < kTotSize; i += N, in += N) {
209  fArray[i] = m[in];
210  }
211  }
212 
213  void copy(idx_t n, idx_t in) {
214  for (idx_t i = n; i < kTotSize; i += N, in += N) {
215  fArray[i] = fArray[in];
216  }
217  }
218 
219 #if defined(AVX512_INTRINSICS)
220 
221  template <typename U>
222  void slurpIn(const T* arr, __m512i& vi, const U&, const int N_proc = N) {
223  //_mm512_prefetch_i32gather_ps(vi, arr, 1, _MM_HINT_T0);
224 
225  const __m512 src = {0};
226  const __mmask16 k = N_proc == N ? -1 : (1 << N_proc) - 1;
227 
228  for (int i = 0; i < kSize; ++i, ++arr) {
229  //_mm512_prefetch_i32gather_ps(vi, arr+2, 1, _MM_HINT_NTA);
230 
231  __m512 reg = _mm512_mask_i32gather_ps(src, k, vi, arr, sizeof(U));
232  _mm512_mask_store_ps(&fArray[i * N], k, reg);
233  }
234  }
235 
236  // Experimental methods, slurpIn() seems to be at least as fast.
237  // See comments in mkFit/MkFitter.cc MkFitter::addBestHit().
238  void ChewIn(const char* arr, int off, int vi[N], const char* tmp, __m512i& ui) {
239  // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.
240 
241  for (int i = 0; i < N; ++i) {
242  __m512 reg = _mm512_load_ps(arr + vi[i]);
243  _mm512_store_ps((void*)(tmp + 64 * i), reg);
244  }
245 
246  for (int i = 0; i < kSize; ++i) {
247  __m512 reg = _mm512_i32gather_ps(ui, tmp + off + i * sizeof(T), 1);
248  _mm512_store_ps(&fArray[i * N], reg);
249  }
250  }
251 
252  void Contaginate(const char* arr, int vi[N], const char* tmp) {
253  // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.
254 
255  for (int i = 0; i < N; ++i) {
256  __m512 reg = _mm512_load_ps(arr + vi[i]);
257  _mm512_store_ps((void*)(tmp + 64 * i), reg);
258  }
259  }
260 
261  void Plexify(const char* tmp, __m512i& ui) {
262  for (int i = 0; i < kSize; ++i) {
263  __m512 reg = _mm512_i32gather_ps(ui, tmp + i * sizeof(T), 1);
264  _mm512_store_ps(&fArray[i * N], reg);
265  }
266  }
267 
268 #elif defined(AVX2_INTRINSICS)
269 
270  template <typename U>
271  void slurpIn(const T* arr, __m256i& vi, const U&, const int N_proc = N) {
272  // Casts to float* needed to "support" also T=HitOnTrack.
273  // Note that sizeof(float) == sizeof(HitOnTrack) == 4.
274 
275  const __m256 src = {0};
276 
277  __m256i k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
278  __m256i k_sel = _mm256_set1_epi32(N_proc);
279  __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);
280 
281  k = k_master;
282  for (int i = 0; i < kSize; ++i, ++arr) {
283  __m256 reg = _mm256_mask_i32gather_ps(src, (float*)arr, vi, (__m256)k, sizeof(U));
284  // Restore mask (docs say gather clears it but it doesn't seem to).
285  k = k_master;
286  _mm256_maskstore_ps((float*)&fArray[i * N], k, reg);
287  }
288  }
289 
290 #else
291 
292  void slurpIn(const T* arr, int vi[N], const int N_proc = N) {
293  // Separate N_proc == N case (gains about 7% in fit test).
294  if (N_proc == N) {
295  for (int i = 0; i < kSize; ++i) {
296  for (int j = 0; j < N; ++j) {
297  fArray[i * N + j] = *(arr + i + vi[j]);
298  }
299  }
300  } else {
301  for (int i = 0; i < kSize; ++i) {
302  for (int j = 0; j < N_proc; ++j) {
303  fArray[i * N + j] = *(arr + i + vi[j]);
304  }
305  }
306  }
307  }
308 
309 #endif
310 
311  void copyOut(idx_t n, T* arr) const {
312  for (idx_t i = n; i < kTotSize; i += N) {
313  *(arr++) = fArray[i];
314  }
315  }
316 
317  Matriplex<T, 1, 1, N> ReduceFixedIJ(idx_t i, idx_t j) const {
318  Matriplex<T, 1, 1, N> t;
319  for (idx_t n = 0; n < N; ++n) {
320  t[n] = constAt(n, i, j);
321  }
322  return t;
323  }
324  };
325 
326  template <typename T, idx_t D1, idx_t D2, idx_t N>
327  using MPlex = Matriplex<T, D1, D2, N>;
328 
329  //==============================================================================
330  // Operators
331  //==============================================================================
332 
333  template <typename T, idx_t D1, idx_t D2, idx_t N>
336  t += b;
337  return t;
338  }
339 
340  template <typename T, idx_t D1, idx_t D2, idx_t N>
343  t -= b;
344  return t;
345  }
346 
347  template <typename T, idx_t D1, idx_t D2, idx_t N>
350  t *= b;
351  return t;
352  }
353 
354  template <typename T, idx_t D1, idx_t D2, idx_t N>
357  t /= b;
358  return t;
359  }
360 
361  template <typename T, idx_t D1, idx_t D2, idx_t N>
364  t += b;
365  return t;
366  }
367 
368  template <typename T, idx_t D1, idx_t D2, idx_t N>
371  t -= b;
372  return t;
373  }
374 
375  template <typename T, idx_t D1, idx_t D2, idx_t N>
378  t *= b;
379  return t;
380  }
381 
382  template <typename T, idx_t D1, idx_t D2, idx_t N>
385  t /= b;
386  return t;
387  }
388 
389  template <typename T, idx_t D1, idx_t D2, idx_t N>
392  t += b;
393  return t;
394  }
395 
396  template <typename T, idx_t D1, idx_t D2, idx_t N>
399  t -= b;
400  return t;
401  }
402 
403  template <typename T, idx_t D1, idx_t D2, idx_t N>
406  t *= b;
407  return t;
408  }
409 
410  template <typename T, idx_t D1, idx_t D2, idx_t N>
413  t /= b;
414  return t;
415  }
416 
417  template <typename T, idx_t D1, idx_t D2, idx_t N>
420  return t.abs(a);
421  }
422 
423  template <typename T, idx_t D1, idx_t D2, idx_t N>
426  return t.sqrt(a);
427  }
428 
429  template <typename T, idx_t D1, idx_t D2, idx_t N>
432  return t.sqr(a);
433  }
434 
435  template <typename T, idx_t D1, idx_t D2, idx_t N>
438  return t.hypot(a, b);
439  }
440 
441  template <typename T, idx_t D1, idx_t D2, idx_t N>
444  return t.sin(a);
445  }
446 
447  template <typename T, idx_t D1, idx_t D2, idx_t N>
450  return t.cos(a);
451  }
452 
453  template <typename T, idx_t D1, idx_t D2, idx_t N>
455  for (idx_t i = 0; i < a.kTotSize; ++i) {
456  s.fArray[i] = std::sin(a.fArray[i]);
457  c.fArray[i] = std::cos(a.fArray[i]);
458  }
459  }
460 
461  template <typename T, idx_t D1, idx_t D2, idx_t N>
464  return t.tan(a);
465  }
466 
467  template <typename T, idx_t D1, idx_t D2, idx_t N>
469  const MPlex<T, D1, D2, N>& b,
472  for (idx_t i = 0; i < a.kTotSize; ++i) {
473  min.fArray[i] = std::min(a.fArray[i], b.fArray[i]);
474  max.fArray[i] = std::max(a.fArray[i], b.fArray[i]);
475  }
476  }
477 
478  template <typename T, idx_t D1, idx_t D2, idx_t N>
481  for (idx_t i = 0; i < a.kTotSize; ++i) {
482  t.fArray[i] = std::min(a.fArray[i], b.fArray[i]);
483  }
484  return t;
485  }
486 
487  template <typename T, idx_t D1, idx_t D2, idx_t N>
490  for (idx_t i = 0; i < a.kTotSize; ++i) {
491  t.fArray[i] = std::max(a.fArray[i], b.fArray[i]);
492  }
493  return t;
494  }
495 
496  //==============================================================================
497  // Multiplications
498  //==============================================================================
499 
500  template <typename T, idx_t D1, idx_t D2, idx_t D3, idx_t N>
502  for (idx_t i = 0; i < D1; ++i) {
503  for (idx_t j = 0; j < D3; ++j) {
504  const idx_t ijo = N * (i * D3 + j);
505 
506 #pragma omp simd
507  for (idx_t n = 0; n < N; ++n) {
508  C.fArray[ijo + n] = 0;
509  }
510 
511  for (idx_t k = 0; k < D2; ++k) {
512  const idx_t iko = N * (i * D2 + k);
513  const idx_t kjo = N * (k * D3 + j);
514 
515 #pragma omp simd
516  for (idx_t n = 0; n < N; ++n) {
517  C.fArray[ijo + n] += A.fArray[iko + n] * B.fArray[kjo + n];
518  }
519  }
520  }
521  }
522  }
523 
524  //------------------------------------------------------------------------------
525 
526  template <typename T, idx_t D, idx_t N>
527  struct MultiplyCls {
529  throw std::runtime_error("general multiplication not supported, well, call multiplyGeneral()");
530  }
531  };
532 
533  template <typename T, idx_t N>
534  struct MultiplyCls<T, 3, N> {
536  const T* a = A.fArray;
537  ASSUME_ALIGNED(a, 64);
538  const T* b = B.fArray;
539  ASSUME_ALIGNED(b, 64);
540  T* c = C.fArray;
541  ASSUME_ALIGNED(c, 64);
542 
543 #pragma omp simd
544  for (idx_t n = 0; n < N; ++n) {
545  c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[3 * N + n] + a[2 * N + n] * b[6 * N + n];
546  c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[4 * N + n] + a[2 * N + n] * b[7 * N + n];
547  c[2 * N + n] = a[0 * N + n] * b[2 * N + n] + a[1 * N + n] * b[5 * N + n] + a[2 * N + n] * b[8 * N + n];
548  c[3 * N + n] = a[3 * N + n] * b[0 * N + n] + a[4 * N + n] * b[3 * N + n] + a[5 * N + n] * b[6 * N + n];
549  c[4 * N + n] = a[3 * N + n] * b[1 * N + n] + a[4 * N + n] * b[4 * N + n] + a[5 * N + n] * b[7 * N + n];
550  c[5 * N + n] = a[3 * N + n] * b[2 * N + n] + a[4 * N + n] * b[5 * N + n] + a[5 * N + n] * b[8 * N + n];
551  c[6 * N + n] = a[6 * N + n] * b[0 * N + n] + a[7 * N + n] * b[3 * N + n] + a[8 * N + n] * b[6 * N + n];
552  c[7 * N + n] = a[6 * N + n] * b[1 * N + n] + a[7 * N + n] * b[4 * N + n] + a[8 * N + n] * b[7 * N + n];
553  c[8 * N + n] = a[6 * N + n] * b[2 * N + n] + a[7 * N + n] * b[5 * N + n] + a[8 * N + n] * b[8 * N + n];
554  }
555  }
556  };
557 
558  template <typename T, idx_t N>
559  struct MultiplyCls<T, 6, N> {
561  const T* a = A.fArray;
562  ASSUME_ALIGNED(a, 64);
563  const T* b = B.fArray;
564  ASSUME_ALIGNED(b, 64);
565  T* c = C.fArray;
566  ASSUME_ALIGNED(c, 64);
567 #pragma omp simd
568  for (idx_t n = 0; n < N; ++n) {
569  c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[6 * N + n] + a[2 * N + n] * b[12 * N + n] +
570  a[3 * N + n] * b[18 * N + n] + a[4 * N + n] * b[24 * N + n] + a[5 * N + n] * b[30 * N + n];
571  c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[7 * N + n] + a[2 * N + n] * b[13 * N + n] +
572  a[3 * N + n] * b[19 * N + n] + a[4 * N + n] * b[25 * N + n] + a[5 * N + n] * b[31 * N + n];
573  c[2 * N + n] = a[0 * N + n] * b[2 * N + n] + a[1 * N + n] * b[8 * N + n] + a[2 * N + n] * b[14 * N + n] +
574  a[3 * N + n] * b[20 * N + n] + a[4 * N + n] * b[26 * N + n] + a[5 * N + n] * b[32 * N + n];
575  c[3 * N + n] = a[0 * N + n] * b[3 * N + n] + a[1 * N + n] * b[9 * N + n] + a[2 * N + n] * b[15 * N + n] +
576  a[3 * N + n] * b[21 * N + n] + a[4 * N + n] * b[27 * N + n] + a[5 * N + n] * b[33 * N + n];
577  c[4 * N + n] = a[0 * N + n] * b[4 * N + n] + a[1 * N + n] * b[10 * N + n] + a[2 * N + n] * b[16 * N + n] +
578  a[3 * N + n] * b[22 * N + n] + a[4 * N + n] * b[28 * N + n] + a[5 * N + n] * b[34 * N + n];
579  c[5 * N + n] = a[0 * N + n] * b[5 * N + n] + a[1 * N + n] * b[11 * N + n] + a[2 * N + n] * b[17 * N + n] +
580  a[3 * N + n] * b[23 * N + n] + a[4 * N + n] * b[29 * N + n] + a[5 * N + n] * b[35 * N + n];
581  c[6 * N + n] = a[6 * N + n] * b[0 * N + n] + a[7 * N + n] * b[6 * N + n] + a[8 * N + n] * b[12 * N + n] +
582  a[9 * N + n] * b[18 * N + n] + a[10 * N + n] * b[24 * N + n] + a[11 * N + n] * b[30 * N + n];
583  c[7 * N + n] = a[6 * N + n] * b[1 * N + n] + a[7 * N + n] * b[7 * N + n] + a[8 * N + n] * b[13 * N + n] +
584  a[9 * N + n] * b[19 * N + n] + a[10 * N + n] * b[25 * N + n] + a[11 * N + n] * b[31 * N + n];
585  c[8 * N + n] = a[6 * N + n] * b[2 * N + n] + a[7 * N + n] * b[8 * N + n] + a[8 * N + n] * b[14 * N + n] +
586  a[9 * N + n] * b[20 * N + n] + a[10 * N + n] * b[26 * N + n] + a[11 * N + n] * b[32 * N + n];
587  c[9 * N + n] = a[6 * N + n] * b[3 * N + n] + a[7 * N + n] * b[9 * N + n] + a[8 * N + n] * b[15 * N + n] +
588  a[9 * N + n] * b[21 * N + n] + a[10 * N + n] * b[27 * N + n] + a[11 * N + n] * b[33 * N + n];
589  c[10 * N + n] = a[6 * N + n] * b[4 * N + n] + a[7 * N + n] * b[10 * N + n] + a[8 * N + n] * b[16 * N + n] +
590  a[9 * N + n] * b[22 * N + n] + a[10 * N + n] * b[28 * N + n] + a[11 * N + n] * b[34 * N + n];
591  c[11 * N + n] = a[6 * N + n] * b[5 * N + n] + a[7 * N + n] * b[11 * N + n] + a[8 * N + n] * b[17 * N + n] +
592  a[9 * N + n] * b[23 * N + n] + a[10 * N + n] * b[29 * N + n] + a[11 * N + n] * b[35 * N + n];
593  c[12 * N + n] = a[12 * N + n] * b[0 * N + n] + a[13 * N + n] * b[6 * N + n] + a[14 * N + n] * b[12 * N + n] +
594  a[15 * N + n] * b[18 * N + n] + a[16 * N + n] * b[24 * N + n] + a[17 * N + n] * b[30 * N + n];
595  c[13 * N + n] = a[12 * N + n] * b[1 * N + n] + a[13 * N + n] * b[7 * N + n] + a[14 * N + n] * b[13 * N + n] +
596  a[15 * N + n] * b[19 * N + n] + a[16 * N + n] * b[25 * N + n] + a[17 * N + n] * b[31 * N + n];
597  c[14 * N + n] = a[12 * N + n] * b[2 * N + n] + a[13 * N + n] * b[8 * N + n] + a[14 * N + n] * b[14 * N + n] +
598  a[15 * N + n] * b[20 * N + n] + a[16 * N + n] * b[26 * N + n] + a[17 * N + n] * b[32 * N + n];
599  c[15 * N + n] = a[12 * N + n] * b[3 * N + n] + a[13 * N + n] * b[9 * N + n] + a[14 * N + n] * b[15 * N + n] +
600  a[15 * N + n] * b[21 * N + n] + a[16 * N + n] * b[27 * N + n] + a[17 * N + n] * b[33 * N + n];
601  c[16 * N + n] = a[12 * N + n] * b[4 * N + n] + a[13 * N + n] * b[10 * N + n] + a[14 * N + n] * b[16 * N + n] +
602  a[15 * N + n] * b[22 * N + n] + a[16 * N + n] * b[28 * N + n] + a[17 * N + n] * b[34 * N + n];
603  c[17 * N + n] = a[12 * N + n] * b[5 * N + n] + a[13 * N + n] * b[11 * N + n] + a[14 * N + n] * b[17 * N + n] +
604  a[15 * N + n] * b[23 * N + n] + a[16 * N + n] * b[29 * N + n] + a[17 * N + n] * b[35 * N + n];
605  c[18 * N + n] = a[18 * N + n] * b[0 * N + n] + a[19 * N + n] * b[6 * N + n] + a[20 * N + n] * b[12 * N + n] +
606  a[21 * N + n] * b[18 * N + n] + a[22 * N + n] * b[24 * N + n] + a[23 * N + n] * b[30 * N + n];
607  c[19 * N + n] = a[18 * N + n] * b[1 * N + n] + a[19 * N + n] * b[7 * N + n] + a[20 * N + n] * b[13 * N + n] +
608  a[21 * N + n] * b[19 * N + n] + a[22 * N + n] * b[25 * N + n] + a[23 * N + n] * b[31 * N + n];
609  c[20 * N + n] = a[18 * N + n] * b[2 * N + n] + a[19 * N + n] * b[8 * N + n] + a[20 * N + n] * b[14 * N + n] +
610  a[21 * N + n] * b[20 * N + n] + a[22 * N + n] * b[26 * N + n] + a[23 * N + n] * b[32 * N + n];
611  c[21 * N + n] = a[18 * N + n] * b[3 * N + n] + a[19 * N + n] * b[9 * N + n] + a[20 * N + n] * b[15 * N + n] +
612  a[21 * N + n] * b[21 * N + n] + a[22 * N + n] * b[27 * N + n] + a[23 * N + n] * b[33 * N + n];
613  c[22 * N + n] = a[18 * N + n] * b[4 * N + n] + a[19 * N + n] * b[10 * N + n] + a[20 * N + n] * b[16 * N + n] +
614  a[21 * N + n] * b[22 * N + n] + a[22 * N + n] * b[28 * N + n] + a[23 * N + n] * b[34 * N + n];
615  c[23 * N + n] = a[18 * N + n] * b[5 * N + n] + a[19 * N + n] * b[11 * N + n] + a[20 * N + n] * b[17 * N + n] +
616  a[21 * N + n] * b[23 * N + n] + a[22 * N + n] * b[29 * N + n] + a[23 * N + n] * b[35 * N + n];
617  c[24 * N + n] = a[24 * N + n] * b[0 * N + n] + a[25 * N + n] * b[6 * N + n] + a[26 * N + n] * b[12 * N + n] +
618  a[27 * N + n] * b[18 * N + n] + a[28 * N + n] * b[24 * N + n] + a[29 * N + n] * b[30 * N + n];
619  c[25 * N + n] = a[24 * N + n] * b[1 * N + n] + a[25 * N + n] * b[7 * N + n] + a[26 * N + n] * b[13 * N + n] +
620  a[27 * N + n] * b[19 * N + n] + a[28 * N + n] * b[25 * N + n] + a[29 * N + n] * b[31 * N + n];
621  c[26 * N + n] = a[24 * N + n] * b[2 * N + n] + a[25 * N + n] * b[8 * N + n] + a[26 * N + n] * b[14 * N + n] +
622  a[27 * N + n] * b[20 * N + n] + a[28 * N + n] * b[26 * N + n] + a[29 * N + n] * b[32 * N + n];
623  c[27 * N + n] = a[24 * N + n] * b[3 * N + n] + a[25 * N + n] * b[9 * N + n] + a[26 * N + n] * b[15 * N + n] +
624  a[27 * N + n] * b[21 * N + n] + a[28 * N + n] * b[27 * N + n] + a[29 * N + n] * b[33 * N + n];
625  c[28 * N + n] = a[24 * N + n] * b[4 * N + n] + a[25 * N + n] * b[10 * N + n] + a[26 * N + n] * b[16 * N + n] +
626  a[27 * N + n] * b[22 * N + n] + a[28 * N + n] * b[28 * N + n] + a[29 * N + n] * b[34 * N + n];
627  c[29 * N + n] = a[24 * N + n] * b[5 * N + n] + a[25 * N + n] * b[11 * N + n] + a[26 * N + n] * b[17 * N + n] +
628  a[27 * N + n] * b[23 * N + n] + a[28 * N + n] * b[29 * N + n] + a[29 * N + n] * b[35 * N + n];
629  c[30 * N + n] = a[30 * N + n] * b[0 * N + n] + a[31 * N + n] * b[6 * N + n] + a[32 * N + n] * b[12 * N + n] +
630  a[33 * N + n] * b[18 * N + n] + a[34 * N + n] * b[24 * N + n] + a[35 * N + n] * b[30 * N + n];
631  c[31 * N + n] = a[30 * N + n] * b[1 * N + n] + a[31 * N + n] * b[7 * N + n] + a[32 * N + n] * b[13 * N + n] +
632  a[33 * N + n] * b[19 * N + n] + a[34 * N + n] * b[25 * N + n] + a[35 * N + n] * b[31 * N + n];
633  c[32 * N + n] = a[30 * N + n] * b[2 * N + n] + a[31 * N + n] * b[8 * N + n] + a[32 * N + n] * b[14 * N + n] +
634  a[33 * N + n] * b[20 * N + n] + a[34 * N + n] * b[26 * N + n] + a[35 * N + n] * b[32 * N + n];
635  c[33 * N + n] = a[30 * N + n] * b[3 * N + n] + a[31 * N + n] * b[9 * N + n] + a[32 * N + n] * b[15 * N + n] +
636  a[33 * N + n] * b[21 * N + n] + a[34 * N + n] * b[27 * N + n] + a[35 * N + n] * b[33 * N + n];
637  c[34 * N + n] = a[30 * N + n] * b[4 * N + n] + a[31 * N + n] * b[10 * N + n] + a[32 * N + n] * b[16 * N + n] +
638  a[33 * N + n] * b[22 * N + n] + a[34 * N + n] * b[28 * N + n] + a[35 * N + n] * b[34 * N + n];
639  c[35 * N + n] = a[30 * N + n] * b[5 * N + n] + a[31 * N + n] * b[11 * N + n] + a[32 * N + n] * b[17 * N + n] +
640  a[33 * N + n] * b[23 * N + n] + a[34 * N + n] * b[29 * N + n] + a[35 * N + n] * b[35 * N + n];
641  }
642  }
643  };
644 
645  template <typename T, idx_t D, idx_t N>
647 #ifdef DEBUG
648  printf("Multipl %d %d\n", D, N);
649 #endif
650 
652  }
653 
654  //==============================================================================
655  // Cramer inversion
656  //==============================================================================
657 
658  template <typename T, idx_t D, idx_t N>
659  struct CramerInverter {
660  static void invert(MPlex<T, D, D, N>& A, double* determ = nullptr) {
661  throw std::runtime_error("general cramer inversion not supported");
662  }
663  };
664 
665  template <typename T, idx_t N>
666  struct CramerInverter<T, 2, N> {
667  static void invert(MPlex<T, 2, 2, N>& A, double* determ = nullptr) {
668  typedef T TT;
669 
670  T* a = A.fArray;
671  ASSUME_ALIGNED(a, 64);
672 
673 #pragma omp simd
674  for (idx_t n = 0; n < N; ++n) {
675  // Force determinant calculation in double precision.
676  const double det = (double)a[0 * N + n] * a[3 * N + n] - (double)a[2 * N + n] * a[1 * N + n];
677  if (determ)
678  determ[n] = det;
679 
680  const TT s = TT(1) / det;
681  const TT tmp = s * a[3 * N + n];
682  a[1 * N + n] *= -s;
683  a[2 * N + n] *= -s;
684  a[3 * N + n] = s * a[0 * N + n];
685  a[0 * N + n] = tmp;
686  }
687  }
688  };
689 
690  template <typename T, idx_t N>
691  struct CramerInverter<T, 3, N> {
692  static void invert(MPlex<T, 3, 3, N>& A, double* determ = nullptr) {
693  typedef T TT;
694 
695  T* a = A.fArray;
696  ASSUME_ALIGNED(a, 64);
697 
698 #pragma omp simd
699  for (idx_t n = 0; n < N; ++n) {
700  const TT c00 = a[4 * N + n] * a[8 * N + n] - a[5 * N + n] * a[7 * N + n];
701  const TT c01 = a[5 * N + n] * a[6 * N + n] - a[3 * N + n] * a[8 * N + n];
702  const TT c02 = a[3 * N + n] * a[7 * N + n] - a[4 * N + n] * a[6 * N + n];
703  const TT c10 = a[7 * N + n] * a[2 * N + n] - a[8 * N + n] * a[1 * N + n];
704  const TT c11 = a[8 * N + n] * a[0 * N + n] - a[6 * N + n] * a[2 * N + n];
705  const TT c12 = a[6 * N + n] * a[1 * N + n] - a[7 * N + n] * a[0 * N + n];
706  const TT c20 = a[1 * N + n] * a[5 * N + n] - a[2 * N + n] * a[4 * N + n];
707  const TT c21 = a[2 * N + n] * a[3 * N + n] - a[0 * N + n] * a[5 * N + n];
708  const TT c22 = a[0 * N + n] * a[4 * N + n] - a[1 * N + n] * a[3 * N + n];
709 
710  // Force determinant calculation in double precision.
711  const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[2 * N + n] * c02;
712  if (determ)
713  determ[n] = det;
714 
715  const TT s = TT(1) / det;
716  a[0 * N + n] = s * c00;
717  a[1 * N + n] = s * c10;
718  a[2 * N + n] = s * c20;
719  a[3 * N + n] = s * c01;
720  a[4 * N + n] = s * c11;
721  a[5 * N + n] = s * c21;
722  a[6 * N + n] = s * c02;
723  a[7 * N + n] = s * c12;
724  a[8 * N + n] = s * c22;
725  }
726  }
727  };
728 
729  template <typename T, idx_t D, idx_t N>
730  void invertCramer(MPlex<T, D, D, N>& A, double* determ = nullptr) {
732  }
733 
734  //==============================================================================
735  // Cholesky inversion
736  //==============================================================================
737 
738  template <typename T, idx_t D, idx_t N>
740  static void invert(MPlex<T, D, D, N>& A) { throw std::runtime_error("general cholesky inversion not supported"); }
741  };
742 
743  template <typename T, idx_t N>
744  struct CholeskyInverter<T, 3, N> {
745  // Note: this only works on symmetric matrices.
746  // Optimized version for positive definite matrices, no checks.
747  static void invert(MPlex<T, 3, 3, N>& A) {
748  typedef T TT;
749 
750  T* a = A.fArray;
751  ASSUME_ALIGNED(a, 64);
752 
753 #pragma omp simd
754  for (idx_t n = 0; n < N; ++n) {
755  TT l0 = std::sqrt(T(1) / a[0 * N + n]);
756  TT l1 = a[3 * N + n] * l0;
757  TT l2 = a[4 * N + n] - l1 * l1;
758  l2 = std::sqrt(T(1) / l2);
759  TT l3 = a[6 * N + n] * l0;
760  TT l4 = (a[7 * N + n] - l1 * l3) * l2;
761  TT l5 = a[8 * N + n] - (l3 * l3 + l4 * l4);
762  l5 = std::sqrt(T(1) / l5);
763 
764  // decomposition done
765 
766  l3 = (l1 * l4 * l2 - l3) * l0 * l5;
767  l1 = -l1 * l0 * l2;
768  l4 = -l4 * l2 * l5;
769 
770  a[0 * N + n] = l3 * l3 + l1 * l1 + l0 * l0;
771  a[1 * N + n] = a[3 * N + n] = l3 * l4 + l1 * l2;
772  a[4 * N + n] = l4 * l4 + l2 * l2;
773  a[2 * N + n] = a[6 * N + n] = l3 * l5;
774  a[5 * N + n] = a[7 * N + n] = l4 * l5;
775  a[8 * N + n] = l5 * l5;
776 
777  // m(2,x) are all zero if anything went wrong at l5.
778  // all zero, if anything went wrong already for l0 or l2.
779  }
780  }
781  };
782 
783  template <typename T, idx_t D, idx_t N>
786  }
787 
788 } // namespace Matriplex
789 
790 #endif
Basic3DVector & operator*=(T t)
Scaling by a scalar value (multiplication)
Divides< B, C > D2
Definition: Factorize.h:137
Basic3DVector & operator=(const Basic3DVector &)=default
Assignment operator.
MPlex< T, D1, D2, N > hypot(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Definition: Matriplex.h:436
Definition: APVGainStruct.h:7
Matriplex< T, D1, D2, N > MPlex
Definition: Matriplex.h:327
MPlex< T, D1, D2, N > operator+(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Definition: Matriplex.h:334
MPlex< T, D1, D2, N > min(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Definition: Matriplex.h:479
void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
Definition: Matriplex.h:646
MPlex< T, D1, D2, N > sin(const MPlex< T, D1, D2, N > &a)
Definition: Matriplex.h:442
static void invert(MPlex< T, D, D, N > &A, double *determ=nullptr)
Definition: Matriplex.h:660
MPlex< T, D1, D2, N > operator/(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Definition: Matriplex.h:355
Basic3DVector & operator-=(const Basic3DVector< U > &p)
MPlex< T, D1, D2, N > sqr(const MPlex< T, D1, D2, N > &a)
Definition: Matriplex.h:430
static void multiply(const MPlex< T, 3, 3, N > &A, const MPlex< T, 3, 3, N > &B, MPlex< T, 3, 3, N > &C)
Definition: Matriplex.h:535
MPlex< T, D1, D2, N > operator-(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Definition: Matriplex.h:341
static void multiply(const MPlex< T, 6, 6, N > &A, const MPlex< T, 6, 6, N > &B, MPlex< T, 6, 6, N > &C)
Definition: Matriplex.h:560
static void invert(MPlex< T, 2, 2, N > &A, double *determ=nullptr)
Definition: Matriplex.h:667
void sincos(const MPlex< T, D1, D2, N > &a, MPlex< T, D1, D2, N > &s, MPlex< T, D1, D2, N > &c)
Definition: Matriplex.h:454
Divides< A, C > D1
Definition: Factorize.h:136
#define MPLEX_ALIGN
MPlex< T, D1, D2, N > abs(const MPlex< T, D1, D2, N > &a)
Definition: Matriplex.h:418
static void invert(MPlex< T, 3, 3, N > &A, double *determ=nullptr)
Definition: Matriplex.h:692
MPlex< T, D1, D2, N > max(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Definition: Matriplex.h:488
void min_max(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b, MPlex< T, D1, D2, N > &min, MPlex< T, D1, D2, N > &max)
Definition: Matriplex.h:468
static void invert(MPlex< T, 3, 3, N > &A)
Definition: Matriplex.h:747
void multiplyGeneral(const MPlex< T, D1, D2, N > &A, const MPlex< T, D2, D3, N > &B, MPlex< T, D1, D3, N > &C)
Definition: Matriplex.h:501
MPlex< T, D1, D2, N > operator*(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Definition: Matriplex.h:348
#define N
Definition: blowfish.cc:9
DecomposeProduct< arg, typename Div::arg > D
Definition: Factorize.h:141
void invertCramer(MPlex< T, D, D, N > &A, double *determ=nullptr)
Definition: Matriplex.h:730
Basic3DVector & operator/=(T t)
Scaling by a scalar value (division)
MPlex< T, D1, D2, N > tan(const MPlex< T, D1, D2, N > &a)
Definition: Matriplex.h:462
double b
Definition: hdecay.h:120
class __attribute__((aligned(32))) Matriplex
Definition: Matriplex.h:11
void add(std::map< std::string, TH1 *> &h, TH1 *hist)
double a
Definition: hdecay.h:121
void invertCholesky(MPlex< T, D, D, N > &A)
Definition: Matriplex.h:784
Definition: APVGainStruct.h:7
static void invert(MPlex< T, D, D, N > &A)
Definition: Matriplex.h:740
static void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
Definition: Matriplex.h:528
T operator[](int i) const
tmp
align.sh
Definition: createJobs.py:716
long double T
#define ASSUME_ALIGNED(a, b)
Basic3DVector & operator+=(const Basic3DVector< U > &p)
MPlex< T, D1, D2, N > cos(const MPlex< T, D1, D2, N > &a)
Definition: Matriplex.h:448
MPlex< T, D1, D2, N > sqrt(const MPlex< T, D1, D2, N > &a)
Definition: Matriplex.h:424