CMS 3D CMS Logo

Matriplex.h
Go to the documentation of this file.
1 #ifndef RecoTracker_MkFitCore_src_Matriplex_Matriplex_h
2 #define RecoTracker_MkFitCore_src_Matriplex_Matriplex_h
3 
4 #include "MatriplexCommon.h"
5 
6 namespace Matriplex {
7 
8  //------------------------------------------------------------------------------
9 
10  template <typename T, idx_t D1, idx_t D2, idx_t N>
11  class __attribute__((aligned(MPLEX_ALIGN))) Matriplex {
12  public:
13  typedef T value_type;
14 
16  static constexpr int kRows = D1;
18  static constexpr int kCols = D2;
20  static constexpr int kSize = D1 * D2;
22  static constexpr int kTotSize = N * kSize;
23 
24  T fArray[kTotSize];
25 
26  Matriplex() {}
27  Matriplex(T v) { setVal(v); }
28 
29  idx_t plexSize() const { return N; }
30 
31  void setVal(T v) {
32  for (idx_t i = 0; i < kTotSize; ++i) {
33  fArray[i] = v;
34  }
35  }
36 
37  void add(const Matriplex& v) {
38  for (idx_t i = 0; i < kTotSize; ++i) {
39  fArray[i] += v.fArray[i];
40  }
41  }
42 
43  void scale(T scale) {
44  for (idx_t i = 0; i < kTotSize; ++i) {
45  fArray[i] *= scale;
46  }
47  }
48 
49  T operator[](idx_t xx) const { return fArray[xx]; }
50  T& operator[](idx_t xx) { return fArray[xx]; }
51 
52  const T& constAt(idx_t n, idx_t i, idx_t j) const { return fArray[(i * D2 + j) * N + n]; }
53 
54  T& At(idx_t n, idx_t i, idx_t j) { return fArray[(i * D2 + j) * N + n]; }
55 
56  T& operator()(idx_t n, idx_t i, idx_t j) { return fArray[(i * D2 + j) * N + n]; }
57  const T& operator()(idx_t n, idx_t i, idx_t j) const { return fArray[(i * D2 + j) * N + n]; }
58 
59  Matriplex& operator=(const Matriplex& m) {
60  memcpy(fArray, m.fArray, sizeof(T) * kTotSize);
61  return *this;
62  }
63 
64  Matriplex& operator=(T t) {
65  for (idx_t i = 0; i < kTotSize; ++i)
66  fArray[i] = t;
67  return *this;
68  }
69 
71  for (idx_t i = 0; i < kTotSize; ++i)
72  fArray[i] += t;
73  return *this;
74  }
75 
77  for (idx_t i = 0; i < kTotSize; ++i)
78  fArray[i] -= t;
79  return *this;
80  }
81 
83  for (idx_t i = 0; i < kTotSize; ++i)
84  fArray[i] *= t;
85  return *this;
86  }
87 
89  for (idx_t i = 0; i < kTotSize; ++i)
90  fArray[i] /= t;
91  return *this;
92  }
93 
94  Matriplex& operator+=(const Matriplex& a) {
95  for (idx_t i = 0; i < kTotSize; ++i)
96  fArray[i] += a.fArray[i];
97  return *this;
98  }
99 
100  Matriplex& operator-=(const Matriplex& a) {
101  for (idx_t i = 0; i < kTotSize; ++i)
102  fArray[i] -= a.fArray[i];
103  return *this;
104  }
105 
106  Matriplex& operator*=(const Matriplex& a) {
107  for (idx_t i = 0; i < kTotSize; ++i)
108  fArray[i] *= a.fArray[i];
109  return *this;
110  }
111 
112  Matriplex& operator/=(const Matriplex& a) {
113  for (idx_t i = 0; i < kTotSize; ++i)
114  fArray[i] /= a.fArray[i];
115  return *this;
116  }
117 
118  Matriplex& sqrt(const Matriplex& a) {
119  for (idx_t i = 0; i < kTotSize; ++i)
120  fArray[i] = std::sqrt(a.fArray[i]);
121  return *this;
122  }
123  Matriplex& sqrt() {
124  for (idx_t i = 0; i < kTotSize; ++i)
125  fArray[i] = std::sqrt(fArray[i]);
126  return *this;
127  }
128 
129  Matriplex& sqr(const Matriplex& a) {
130  for (idx_t i = 0; i < kTotSize; ++i)
131  fArray[i] = a.fArray[i] * a.fArray[i];
132  return *this;
133  }
134  Matriplex& sqr() {
135  for (idx_t i = 0; i < kTotSize; ++i)
136  fArray[i] = fArray[i] * fArray[i];
137  return *this;
138  }
139 
140  Matriplex& hypot(const Matriplex& a, const Matriplex& b) {
141  for (idx_t i = 0; i < kTotSize; ++i) {
142  fArray[i] = a.fArray[i] * a.fArray[i] + b.fArray[i] * b.fArray[i];
143  }
144  return sqrt();
145  }
146 
147  Matriplex& sin(const Matriplex& a) {
148  for (idx_t i = 0; i < kTotSize; ++i)
149  fArray[i] = std::sin(a.fArray[i]);
150  return *this;
151  }
152  Matriplex& sin() {
153  for (idx_t i = 0; i < kTotSize; ++i)
154  fArray[i] = std::sin(fArray[i]);
155  return *this;
156  }
157 
158  Matriplex& cos(const Matriplex& a) {
159  for (idx_t i = 0; i < kTotSize; ++i)
160  fArray[i] = std::cos(a.fArray[i]);
161  return *this;
162  }
163  Matriplex& cos() {
164  for (idx_t i = 0; i < kTotSize; ++i)
165  fArray[i] = std::cos(fArray[i]);
166  return *this;
167  }
168 
169  Matriplex& tan(const Matriplex& a) {
170  for (idx_t i = 0; i < kTotSize; ++i)
171  fArray[i] = std::tan(a.fArray[i]);
172  return *this;
173  }
174  Matriplex& tan() {
175  for (idx_t i = 0; i < kTotSize; ++i)
176  fArray[i] = std::tan(fArray[i]);
177  return *this;
178  }
179 
180  //---------------------------------------------------------
181 
182  void copySlot(idx_t n, const Matriplex& m) {
183  for (idx_t i = n; i < kTotSize; i += N) {
184  fArray[i] = m.fArray[i];
185  }
186  }
187 
188  void copyIn(idx_t n, const T* arr) {
189  for (idx_t i = n; i < kTotSize; i += N) {
190  fArray[i] = *(arr++);
191  }
192  }
193 
194  void copyIn(idx_t n, const Matriplex& m, idx_t in) {
195  for (idx_t i = n; i < kTotSize; i += N, in += N) {
196  fArray[i] = m[in];
197  }
198  }
199 
200  void copy(idx_t n, idx_t in) {
201  for (idx_t i = n; i < kTotSize; i += N, in += N) {
202  fArray[i] = fArray[in];
203  }
204  }
205 
206 #if defined(AVX512_INTRINSICS)
207 
208  template <typename U>
209  void slurpIn(const T* arr, __m512i& vi, const U&, const int N_proc = N) {
210  //_mm512_prefetch_i32gather_ps(vi, arr, 1, _MM_HINT_T0);
211 
212  const __m512 src = {0};
213  const __mmask16 k = N_proc == N ? -1 : (1 << N_proc) - 1;
214 
215  for (int i = 0; i < kSize; ++i, ++arr) {
216  //_mm512_prefetch_i32gather_ps(vi, arr+2, 1, _MM_HINT_NTA);
217 
218  __m512 reg = _mm512_mask_i32gather_ps(src, k, vi, arr, sizeof(U));
219  _mm512_mask_store_ps(&fArray[i * N], k, reg);
220  }
221  }
222 
223  // Experimental methods, slurpIn() seems to be at least as fast.
224  // See comments in mkFit/MkFitter.cc MkFitter::addBestHit().
225  void ChewIn(const char* arr, int off, int vi[N], const char* tmp, __m512i& ui) {
226  // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.
227 
228  for (int i = 0; i < N; ++i) {
229  __m512 reg = _mm512_load_ps(arr + vi[i]);
230  _mm512_store_ps((void*)(tmp + 64 * i), reg);
231  }
232 
233  for (int i = 0; i < kSize; ++i) {
234  __m512 reg = _mm512_i32gather_ps(ui, tmp + off + i * sizeof(T), 1);
235  _mm512_store_ps(&fArray[i * N], reg);
236  }
237  }
238 
239  void Contaginate(const char* arr, int vi[N], const char* tmp) {
240  // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.
241 
242  for (int i = 0; i < N; ++i) {
243  __m512 reg = _mm512_load_ps(arr + vi[i]);
244  _mm512_store_ps((void*)(tmp + 64 * i), reg);
245  }
246  }
247 
248  void Plexify(const char* tmp, __m512i& ui) {
249  for (int i = 0; i < kSize; ++i) {
250  __m512 reg = _mm512_i32gather_ps(ui, tmp + i * sizeof(T), 1);
251  _mm512_store_ps(&fArray[i * N], reg);
252  }
253  }
254 
255 #elif defined(AVX2_INTRINSICS)
256 
257  template <typename U>
258  void slurpIn(const T* arr, __m256i& vi, const U&, const int N_proc = N) {
259  // Casts to float* needed to "support" also T=HitOnTrack.
260  // Note that sizeof(float) == sizeof(HitOnTrack) == 4.
261 
262  const __m256 src = {0};
263 
264  __m256i k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
265  __m256i k_sel = _mm256_set1_epi32(N_proc);
266  __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);
267 
268  k = k_master;
269  for (int i = 0; i < kSize; ++i, ++arr) {
270  __m256 reg = _mm256_mask_i32gather_ps(src, (float*)arr, vi, (__m256)k, sizeof(U));
271  // Restore mask (docs say gather clears it but it doesn't seem to).
272  k = k_master;
273  _mm256_maskstore_ps((float*)&fArray[i * N], k, reg);
274  }
275  }
276 
277 #else
278 
279  void slurpIn(const T* arr, int vi[N], const int N_proc = N) {
280  // Separate N_proc == N case (gains about 7% in fit test).
281  if (N_proc == N) {
282  for (int i = 0; i < kSize; ++i) {
283  for (int j = 0; j < N; ++j) {
284  fArray[i * N + j] = *(arr + i + vi[j]);
285  }
286  }
287  } else {
288  for (int i = 0; i < kSize; ++i) {
289  for (int j = 0; j < N_proc; ++j) {
290  fArray[i * N + j] = *(arr + i + vi[j]);
291  }
292  }
293  }
294  }
295 
296 #endif
297 
298  void copyOut(idx_t n, T* arr) const {
299  for (idx_t i = n; i < kTotSize; i += N) {
300  *(arr++) = fArray[i];
301  }
302  }
303 
304  Matriplex<T, 1, 1, N> ReduceFixedIJ(idx_t i, idx_t j) const {
305  Matriplex<T, 1, 1, N> t;
306  for (idx_t n = 0; n < N; ++n) {
307  t[n] = constAt(n, i, j);
308  }
309  return t;
310  }
311  };
312 
313  template <typename T, idx_t D1, idx_t D2, idx_t N>
314  using MPlex = Matriplex<T, D1, D2, N>;
315 
316  //==============================================================================
317  // Operators
318  //==============================================================================
319 
320  template <typename T, idx_t D1, idx_t D2, idx_t N>
323  t += b;
324  return t;
325  }
326 
327  template <typename T, idx_t D1, idx_t D2, idx_t N>
330  t -= b;
331  return t;
332  }
333 
334  template <typename T, idx_t D1, idx_t D2, idx_t N>
337  t *= b;
338  return t;
339  }
340 
341  template <typename T, idx_t D1, idx_t D2, idx_t N>
344  t /= b;
345  return t;
346  }
347 
348  template <typename T, idx_t D1, idx_t D2, idx_t N>
351  t += b;
352  return t;
353  }
354 
355  template <typename T, idx_t D1, idx_t D2, idx_t N>
358  t -= b;
359  return t;
360  }
361 
362  template <typename T, idx_t D1, idx_t D2, idx_t N>
365  t *= b;
366  return t;
367  }
368 
369  template <typename T, idx_t D1, idx_t D2, idx_t N>
372  t /= b;
373  return t;
374  }
375 
376  template <typename T, idx_t D1, idx_t D2, idx_t N>
379  t += b;
380  return t;
381  }
382 
383  template <typename T, idx_t D1, idx_t D2, idx_t N>
386  t -= b;
387  return t;
388  }
389 
390  template <typename T, idx_t D1, idx_t D2, idx_t N>
393  t *= b;
394  return t;
395  }
396 
397  template <typename T, idx_t D1, idx_t D2, idx_t N>
400  t /= b;
401  return t;
402  }
403 
404  template <typename T, idx_t D1, idx_t D2, idx_t N>
407  return t.sqrt(a);
408  }
409 
410  template <typename T, idx_t D1, idx_t D2, idx_t N>
413  return t.sqrt(a);
414  }
415 
416  template <typename T, idx_t D1, idx_t D2, idx_t N>
419  return t.hypot(a, b);
420  }
421 
422  template <typename T, idx_t D1, idx_t D2, idx_t N>
425  return t.sin(a);
426  }
427 
428  template <typename T, idx_t D1, idx_t D2, idx_t N>
431  return t.cos(a);
432  }
433 
434  template <typename T, idx_t D1, idx_t D2, idx_t N>
436  for (idx_t i = 0; i < a.kTotSize; ++i) {
437  s.fArray[i] = std::sin(a.fArray[i]);
438  c.fArray[i] = std::cos(a.fArray[i]);
439  }
440  }
441 
442  template <typename T, idx_t D1, idx_t D2, idx_t N>
445  return t.tan(a);
446  }
447 
448  template <typename T, idx_t D1, idx_t D2, idx_t N>
450  const MPlex<T, D1, D2, N>& b,
453  for (idx_t i = 0; i < a.kTotSize; ++i) {
454  min.fArray[i] = std::min(a.fArray[i], b.fArray[i]);
455  max.fArray[i] = std::max(a.fArray[i], b.fArray[i]);
456  }
457  }
458 
459  template <typename T, idx_t D1, idx_t D2, idx_t N>
462  for (idx_t i = 0; i < a.kTotSize; ++i) {
463  t.fArray[i] = std::min(a.fArray[i], b.fArray[i]);
464  }
465  return t;
466  }
467 
468  template <typename T, idx_t D1, idx_t D2, idx_t N>
471  for (idx_t i = 0; i < a.kTotSize; ++i) {
472  t.fArray[i] = std::max(a.fArray[i], b.fArray[i]);
473  }
474  return t;
475  }
476 
477  //==============================================================================
478  // Multiplications
479  //==============================================================================
480 
481  template <typename T, idx_t D1, idx_t D2, idx_t D3, idx_t N>
483  for (idx_t i = 0; i < D1; ++i) {
484  for (idx_t j = 0; j < D3; ++j) {
485  const idx_t ijo = N * (i * D3 + j);
486 
487 #pragma omp simd
488  for (idx_t n = 0; n < N; ++n) {
489  C.fArray[ijo + n] = 0;
490  }
491 
492  for (idx_t k = 0; k < D2; ++k) {
493  const idx_t iko = N * (i * D2 + k);
494  const idx_t kjo = N * (k * D3 + j);
495 
496 #pragma omp simd
497  for (idx_t n = 0; n < N; ++n) {
498  C.fArray[ijo + n] += A.fArray[iko + n] * B.fArray[kjo + n];
499  }
500  }
501  }
502  }
503  }
504 
505  //------------------------------------------------------------------------------
506 
507  template <typename T, idx_t D, idx_t N>
508  struct MultiplyCls {
510  throw std::runtime_error("general multiplication not supported, well, call multiplyGeneral()");
511  }
512  };
513 
514  template <typename T, idx_t N>
515  struct MultiplyCls<T, 3, N> {
517  const T* a = A.fArray;
518  ASSUME_ALIGNED(a, 64);
519  const T* b = B.fArray;
520  ASSUME_ALIGNED(b, 64);
521  T* c = C.fArray;
522  ASSUME_ALIGNED(c, 64);
523 
524 #pragma omp simd
525  for (idx_t n = 0; n < N; ++n) {
526  c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[3 * N + n] + a[2 * N + n] * b[6 * N + n];
527  c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[4 * N + n] + a[2 * N + n] * b[7 * N + n];
528  c[2 * N + n] = a[0 * N + n] * b[2 * N + n] + a[1 * N + n] * b[5 * N + n] + a[2 * N + n] * b[8 * N + n];
529  c[3 * N + n] = a[3 * N + n] * b[0 * N + n] + a[4 * N + n] * b[3 * N + n] + a[5 * N + n] * b[6 * N + n];
530  c[4 * N + n] = a[3 * N + n] * b[1 * N + n] + a[4 * N + n] * b[4 * N + n] + a[5 * N + n] * b[7 * N + n];
531  c[5 * N + n] = a[3 * N + n] * b[2 * N + n] + a[4 * N + n] * b[5 * N + n] + a[5 * N + n] * b[8 * N + n];
532  c[6 * N + n] = a[6 * N + n] * b[0 * N + n] + a[7 * N + n] * b[3 * N + n] + a[8 * N + n] * b[6 * N + n];
533  c[7 * N + n] = a[6 * N + n] * b[1 * N + n] + a[7 * N + n] * b[4 * N + n] + a[8 * N + n] * b[7 * N + n];
534  c[8 * N + n] = a[6 * N + n] * b[2 * N + n] + a[7 * N + n] * b[5 * N + n] + a[8 * N + n] * b[8 * N + n];
535  }
536  }
537  };
538 
539  template <typename T, idx_t N>
540  struct MultiplyCls<T, 6, N> {
542  const T* a = A.fArray;
543  ASSUME_ALIGNED(a, 64);
544  const T* b = B.fArray;
545  ASSUME_ALIGNED(b, 64);
546  T* c = C.fArray;
547  ASSUME_ALIGNED(c, 64);
548 #pragma omp simd
549  for (idx_t n = 0; n < N; ++n) {
550  c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[6 * N + n] + a[2 * N + n] * b[12 * N + n] +
551  a[3 * N + n] * b[18 * N + n] + a[4 * N + n] * b[24 * N + n] + a[5 * N + n] * b[30 * N + n];
552  c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[7 * N + n] + a[2 * N + n] * b[13 * N + n] +
553  a[3 * N + n] * b[19 * N + n] + a[4 * N + n] * b[25 * N + n] + a[5 * N + n] * b[31 * N + n];
554  c[2 * N + n] = a[0 * N + n] * b[2 * N + n] + a[1 * N + n] * b[8 * N + n] + a[2 * N + n] * b[14 * N + n] +
555  a[3 * N + n] * b[20 * N + n] + a[4 * N + n] * b[26 * N + n] + a[5 * N + n] * b[32 * N + n];
556  c[3 * N + n] = a[0 * N + n] * b[3 * N + n] + a[1 * N + n] * b[9 * N + n] + a[2 * N + n] * b[15 * N + n] +
557  a[3 * N + n] * b[21 * N + n] + a[4 * N + n] * b[27 * N + n] + a[5 * N + n] * b[33 * N + n];
558  c[4 * N + n] = a[0 * N + n] * b[4 * N + n] + a[1 * N + n] * b[10 * N + n] + a[2 * N + n] * b[16 * N + n] +
559  a[3 * N + n] * b[22 * N + n] + a[4 * N + n] * b[28 * N + n] + a[5 * N + n] * b[34 * N + n];
560  c[5 * N + n] = a[0 * N + n] * b[5 * N + n] + a[1 * N + n] * b[11 * N + n] + a[2 * N + n] * b[17 * N + n] +
561  a[3 * N + n] * b[23 * N + n] + a[4 * N + n] * b[29 * N + n] + a[5 * N + n] * b[35 * N + n];
562  c[6 * N + n] = a[6 * N + n] * b[0 * N + n] + a[7 * N + n] * b[6 * N + n] + a[8 * N + n] * b[12 * N + n] +
563  a[9 * N + n] * b[18 * N + n] + a[10 * N + n] * b[24 * N + n] + a[11 * N + n] * b[30 * N + n];
564  c[7 * N + n] = a[6 * N + n] * b[1 * N + n] + a[7 * N + n] * b[7 * N + n] + a[8 * N + n] * b[13 * N + n] +
565  a[9 * N + n] * b[19 * N + n] + a[10 * N + n] * b[25 * N + n] + a[11 * N + n] * b[31 * N + n];
566  c[8 * N + n] = a[6 * N + n] * b[2 * N + n] + a[7 * N + n] * b[8 * N + n] + a[8 * N + n] * b[14 * N + n] +
567  a[9 * N + n] * b[20 * N + n] + a[10 * N + n] * b[26 * N + n] + a[11 * N + n] * b[32 * N + n];
568  c[9 * N + n] = a[6 * N + n] * b[3 * N + n] + a[7 * N + n] * b[9 * N + n] + a[8 * N + n] * b[15 * N + n] +
569  a[9 * N + n] * b[21 * N + n] + a[10 * N + n] * b[27 * N + n] + a[11 * N + n] * b[33 * N + n];
570  c[10 * N + n] = a[6 * N + n] * b[4 * N + n] + a[7 * N + n] * b[10 * N + n] + a[8 * N + n] * b[16 * N + n] +
571  a[9 * N + n] * b[22 * N + n] + a[10 * N + n] * b[28 * N + n] + a[11 * N + n] * b[34 * N + n];
572  c[11 * N + n] = a[6 * N + n] * b[5 * N + n] + a[7 * N + n] * b[11 * N + n] + a[8 * N + n] * b[17 * N + n] +
573  a[9 * N + n] * b[23 * N + n] + a[10 * N + n] * b[29 * N + n] + a[11 * N + n] * b[35 * N + n];
574  c[12 * N + n] = a[12 * N + n] * b[0 * N + n] + a[13 * N + n] * b[6 * N + n] + a[14 * N + n] * b[12 * N + n] +
575  a[15 * N + n] * b[18 * N + n] + a[16 * N + n] * b[24 * N + n] + a[17 * N + n] * b[30 * N + n];
576  c[13 * N + n] = a[12 * N + n] * b[1 * N + n] + a[13 * N + n] * b[7 * N + n] + a[14 * N + n] * b[13 * N + n] +
577  a[15 * N + n] * b[19 * N + n] + a[16 * N + n] * b[25 * N + n] + a[17 * N + n] * b[31 * N + n];
578  c[14 * N + n] = a[12 * N + n] * b[2 * N + n] + a[13 * N + n] * b[8 * N + n] + a[14 * N + n] * b[14 * N + n] +
579  a[15 * N + n] * b[20 * N + n] + a[16 * N + n] * b[26 * N + n] + a[17 * N + n] * b[32 * N + n];
580  c[15 * N + n] = a[12 * N + n] * b[3 * N + n] + a[13 * N + n] * b[9 * N + n] + a[14 * N + n] * b[15 * N + n] +
581  a[15 * N + n] * b[21 * N + n] + a[16 * N + n] * b[27 * N + n] + a[17 * N + n] * b[33 * N + n];
582  c[16 * N + n] = a[12 * N + n] * b[4 * N + n] + a[13 * N + n] * b[10 * N + n] + a[14 * N + n] * b[16 * N + n] +
583  a[15 * N + n] * b[22 * N + n] + a[16 * N + n] * b[28 * N + n] + a[17 * N + n] * b[34 * N + n];
584  c[17 * N + n] = a[12 * N + n] * b[5 * N + n] + a[13 * N + n] * b[11 * N + n] + a[14 * N + n] * b[17 * N + n] +
585  a[15 * N + n] * b[23 * N + n] + a[16 * N + n] * b[29 * N + n] + a[17 * N + n] * b[35 * N + n];
586  c[18 * N + n] = a[18 * N + n] * b[0 * N + n] + a[19 * N + n] * b[6 * N + n] + a[20 * N + n] * b[12 * N + n] +
587  a[21 * N + n] * b[18 * N + n] + a[22 * N + n] * b[24 * N + n] + a[23 * N + n] * b[30 * N + n];
588  c[19 * N + n] = a[18 * N + n] * b[1 * N + n] + a[19 * N + n] * b[7 * N + n] + a[20 * N + n] * b[13 * N + n] +
589  a[21 * N + n] * b[19 * N + n] + a[22 * N + n] * b[25 * N + n] + a[23 * N + n] * b[31 * N + n];
590  c[20 * N + n] = a[18 * N + n] * b[2 * N + n] + a[19 * N + n] * b[8 * N + n] + a[20 * N + n] * b[14 * N + n] +
591  a[21 * N + n] * b[20 * N + n] + a[22 * N + n] * b[26 * N + n] + a[23 * N + n] * b[32 * N + n];
592  c[21 * N + n] = a[18 * N + n] * b[3 * N + n] + a[19 * N + n] * b[9 * N + n] + a[20 * N + n] * b[15 * N + n] +
593  a[21 * N + n] * b[21 * N + n] + a[22 * N + n] * b[27 * N + n] + a[23 * N + n] * b[33 * N + n];
594  c[22 * N + n] = a[18 * N + n] * b[4 * N + n] + a[19 * N + n] * b[10 * N + n] + a[20 * N + n] * b[16 * N + n] +
595  a[21 * N + n] * b[22 * N + n] + a[22 * N + n] * b[28 * N + n] + a[23 * N + n] * b[34 * N + n];
596  c[23 * N + n] = a[18 * N + n] * b[5 * N + n] + a[19 * N + n] * b[11 * N + n] + a[20 * N + n] * b[17 * N + n] +
597  a[21 * N + n] * b[23 * N + n] + a[22 * N + n] * b[29 * N + n] + a[23 * N + n] * b[35 * N + n];
598  c[24 * N + n] = a[24 * N + n] * b[0 * N + n] + a[25 * N + n] * b[6 * N + n] + a[26 * N + n] * b[12 * N + n] +
599  a[27 * N + n] * b[18 * N + n] + a[28 * N + n] * b[24 * N + n] + a[29 * N + n] * b[30 * N + n];
600  c[25 * N + n] = a[24 * N + n] * b[1 * N + n] + a[25 * N + n] * b[7 * N + n] + a[26 * N + n] * b[13 * N + n] +
601  a[27 * N + n] * b[19 * N + n] + a[28 * N + n] * b[25 * N + n] + a[29 * N + n] * b[31 * N + n];
602  c[26 * N + n] = a[24 * N + n] * b[2 * N + n] + a[25 * N + n] * b[8 * N + n] + a[26 * N + n] * b[14 * N + n] +
603  a[27 * N + n] * b[20 * N + n] + a[28 * N + n] * b[26 * N + n] + a[29 * N + n] * b[32 * N + n];
604  c[27 * N + n] = a[24 * N + n] * b[3 * N + n] + a[25 * N + n] * b[9 * N + n] + a[26 * N + n] * b[15 * N + n] +
605  a[27 * N + n] * b[21 * N + n] + a[28 * N + n] * b[27 * N + n] + a[29 * N + n] * b[33 * N + n];
606  c[28 * N + n] = a[24 * N + n] * b[4 * N + n] + a[25 * N + n] * b[10 * N + n] + a[26 * N + n] * b[16 * N + n] +
607  a[27 * N + n] * b[22 * N + n] + a[28 * N + n] * b[28 * N + n] + a[29 * N + n] * b[34 * N + n];
608  c[29 * N + n] = a[24 * N + n] * b[5 * N + n] + a[25 * N + n] * b[11 * N + n] + a[26 * N + n] * b[17 * N + n] +
609  a[27 * N + n] * b[23 * N + n] + a[28 * N + n] * b[29 * N + n] + a[29 * N + n] * b[35 * N + n];
610  c[30 * N + n] = a[30 * N + n] * b[0 * N + n] + a[31 * N + n] * b[6 * N + n] + a[32 * N + n] * b[12 * N + n] +
611  a[33 * N + n] * b[18 * N + n] + a[34 * N + n] * b[24 * N + n] + a[35 * N + n] * b[30 * N + n];
612  c[31 * N + n] = a[30 * N + n] * b[1 * N + n] + a[31 * N + n] * b[7 * N + n] + a[32 * N + n] * b[13 * N + n] +
613  a[33 * N + n] * b[19 * N + n] + a[34 * N + n] * b[25 * N + n] + a[35 * N + n] * b[31 * N + n];
614  c[32 * N + n] = a[30 * N + n] * b[2 * N + n] + a[31 * N + n] * b[8 * N + n] + a[32 * N + n] * b[14 * N + n] +
615  a[33 * N + n] * b[20 * N + n] + a[34 * N + n] * b[26 * N + n] + a[35 * N + n] * b[32 * N + n];
616  c[33 * N + n] = a[30 * N + n] * b[3 * N + n] + a[31 * N + n] * b[9 * N + n] + a[32 * N + n] * b[15 * N + n] +
617  a[33 * N + n] * b[21 * N + n] + a[34 * N + n] * b[27 * N + n] + a[35 * N + n] * b[33 * N + n];
618  c[34 * N + n] = a[30 * N + n] * b[4 * N + n] + a[31 * N + n] * b[10 * N + n] + a[32 * N + n] * b[16 * N + n] +
619  a[33 * N + n] * b[22 * N + n] + a[34 * N + n] * b[28 * N + n] + a[35 * N + n] * b[34 * N + n];
620  c[35 * N + n] = a[30 * N + n] * b[5 * N + n] + a[31 * N + n] * b[11 * N + n] + a[32 * N + n] * b[17 * N + n] +
621  a[33 * N + n] * b[23 * N + n] + a[34 * N + n] * b[29 * N + n] + a[35 * N + n] * b[35 * N + n];
622  }
623  }
624  };
625 
626  template <typename T, idx_t D, idx_t N>
628 #ifdef DEBUG
629  printf("Multipl %d %d\n", D, N);
630 #endif
631 
633  }
634 
635  //==============================================================================
636  // Cramer inversion
637  //==============================================================================
638 
639  template <typename T, idx_t D, idx_t N>
640  struct CramerInverter {
641  static void invert(MPlex<T, D, D, N>& A, double* determ = nullptr) {
642  throw std::runtime_error("general cramer inversion not supported");
643  }
644  };
645 
646  template <typename T, idx_t N>
647  struct CramerInverter<T, 2, N> {
648  static void invert(MPlex<T, 2, 2, N>& A, double* determ = nullptr) {
649  typedef T TT;
650 
651  T* a = A.fArray;
652  ASSUME_ALIGNED(a, 64);
653 
654 #pragma omp simd
655  for (idx_t n = 0; n < N; ++n) {
656  // Force determinant calculation in double precision.
657  const double det = (double)a[0 * N + n] * a[3 * N + n] - (double)a[2 * N + n] * a[1 * N + n];
658  if (determ)
659  determ[n] = det;
660 
661  const TT s = TT(1) / det;
662  const TT tmp = s * a[3 * N + n];
663  a[1 * N + n] *= -s;
664  a[2 * N + n] *= -s;
665  a[3 * N + n] = s * a[0 * N + n];
666  a[0 * N + n] = tmp;
667  }
668  }
669  };
670 
671  template <typename T, idx_t N>
672  struct CramerInverter<T, 3, N> {
673  static void invert(MPlex<T, 3, 3, N>& A, double* determ = nullptr) {
674  typedef T TT;
675 
676  T* a = A.fArray;
677  ASSUME_ALIGNED(a, 64);
678 
679 #pragma omp simd
680  for (idx_t n = 0; n < N; ++n) {
681  const TT c00 = a[4 * N + n] * a[8 * N + n] - a[5 * N + n] * a[7 * N + n];
682  const TT c01 = a[5 * N + n] * a[6 * N + n] - a[3 * N + n] * a[8 * N + n];
683  const TT c02 = a[3 * N + n] * a[7 * N + n] - a[4 * N + n] * a[6 * N + n];
684  const TT c10 = a[7 * N + n] * a[2 * N + n] - a[8 * N + n] * a[1 * N + n];
685  const TT c11 = a[8 * N + n] * a[0 * N + n] - a[6 * N + n] * a[2 * N + n];
686  const TT c12 = a[6 * N + n] * a[1 * N + n] - a[7 * N + n] * a[0 * N + n];
687  const TT c20 = a[1 * N + n] * a[5 * N + n] - a[2 * N + n] * a[4 * N + n];
688  const TT c21 = a[2 * N + n] * a[3 * N + n] - a[0 * N + n] * a[5 * N + n];
689  const TT c22 = a[0 * N + n] * a[4 * N + n] - a[1 * N + n] * a[3 * N + n];
690 
691  // Force determinant calculation in double precision.
692  const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[2 * N + n] * c02;
693  if (determ)
694  determ[n] = det;
695 
696  const TT s = TT(1) / det;
697  a[0 * N + n] = s * c00;
698  a[1 * N + n] = s * c10;
699  a[2 * N + n] = s * c20;
700  a[3 * N + n] = s * c01;
701  a[4 * N + n] = s * c11;
702  a[5 * N + n] = s * c21;
703  a[6 * N + n] = s * c02;
704  a[7 * N + n] = s * c12;
705  a[8 * N + n] = s * c22;
706  }
707  }
708  };
709 
710  template <typename T, idx_t D, idx_t N>
711  void invertCramer(MPlex<T, D, D, N>& A, double* determ = nullptr) {
713  }
714 
715  //==============================================================================
716  // Cholesky inversion
717  //==============================================================================
718 
719  template <typename T, idx_t D, idx_t N>
721  static void invert(MPlex<T, D, D, N>& A) { throw std::runtime_error("general cholesky inversion not supported"); }
722  };
723 
724  template <typename T, idx_t N>
725  struct CholeskyInverter<T, 3, N> {
726  // Note: this only works on symmetric matrices.
727  // Optimized version for positive definite matrices, no checks.
728  static void invert(MPlex<T, 3, 3, N>& A) {
729  typedef T TT;
730 
731  T* a = A.fArray;
732  ASSUME_ALIGNED(a, 64);
733 
734 #pragma omp simd
735  for (idx_t n = 0; n < N; ++n) {
736  TT l0 = std::sqrt(T(1) / a[0 * N + n]);
737  TT l1 = a[3 * N + n] * l0;
738  TT l2 = a[4 * N + n] - l1 * l1;
739  l2 = std::sqrt(T(1) / l2);
740  TT l3 = a[6 * N + n] * l0;
741  TT l4 = (a[7 * N + n] - l1 * l3) * l2;
742  TT l5 = a[8 * N + n] - (l3 * l3 + l4 * l4);
743  l5 = std::sqrt(T(1) / l5);
744 
745  // decomposition done
746 
747  l3 = (l1 * l4 * l2 - l3) * l0 * l5;
748  l1 = -l1 * l0 * l2;
749  l4 = -l4 * l2 * l5;
750 
751  a[0 * N + n] = l3 * l3 + l1 * l1 + l0 * l0;
752  a[1 * N + n] = a[3 * N + n] = l3 * l4 + l1 * l2;
753  a[4 * N + n] = l4 * l4 + l2 * l2;
754  a[2 * N + n] = a[6 * N + n] = l3 * l5;
755  a[5 * N + n] = a[7 * N + n] = l4 * l5;
756  a[8 * N + n] = l5 * l5;
757 
758  // m(2,x) are all zero if anything went wrong at l5.
759  // all zero, if anything went wrong already for l0 or l2.
760  }
761  }
762  };
763 
764  template <typename T, idx_t D, idx_t N>
767  }
768 
769 } // namespace Matriplex
770 
771 #endif
Basic3DVector & operator*=(T t)
Scaling by a scalar value (multiplication)
Divides< B, C > D2
Definition: Factorize.h:137
MPlex< T, D1, D2, N > hypot(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Definition: Matriplex.h:417
Definition: APVGainStruct.h:7
Matriplex< T, D1, D2, N > MPlex
Definition: Matriplex.h:314
MPlex< T, D1, D2, N > operator+(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Definition: Matriplex.h:321
MPlex< T, D1, D2, N > min(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Definition: Matriplex.h:460
void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
Definition: Matriplex.h:627
MPlex< T, D1, D2, N > sin(const MPlex< T, D1, D2, N > &a)
Definition: Matriplex.h:423
static void invert(MPlex< T, D, D, N > &A, double *determ=nullptr)
Definition: Matriplex.h:641
MPlex< T, D1, D2, N > operator/(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Definition: Matriplex.h:342
Basic3DVector & operator-=(const Basic3DVector< U > &p)
MPlex< T, D1, D2, N > sqr(const MPlex< T, D1, D2, N > &a)
Definition: Matriplex.h:411
static void multiply(const MPlex< T, 3, 3, N > &A, const MPlex< T, 3, 3, N > &B, MPlex< T, 3, 3, N > &C)
Definition: Matriplex.h:516
MPlex< T, D1, D2, N > operator-(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Definition: Matriplex.h:328
static void multiply(const MPlex< T, 6, 6, N > &A, const MPlex< T, 6, 6, N > &B, MPlex< T, 6, 6, N > &C)
Definition: Matriplex.h:541
static void invert(MPlex< T, 2, 2, N > &A, double *determ=nullptr)
Definition: Matriplex.h:648
void sincos(const MPlex< T, D1, D2, N > &a, MPlex< T, D1, D2, N > &s, MPlex< T, D1, D2, N > &c)
Definition: Matriplex.h:435
Divides< A, C > D1
Definition: Factorize.h:136
#define MPLEX_ALIGN
static void invert(MPlex< T, 3, 3, N > &A, double *determ=nullptr)
Definition: Matriplex.h:673
MPlex< T, D1, D2, N > max(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Definition: Matriplex.h:469
void min_max(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b, MPlex< T, D1, D2, N > &min, MPlex< T, D1, D2, N > &max)
Definition: Matriplex.h:449
static void invert(MPlex< T, 3, 3, N > &A)
Definition: Matriplex.h:728
void multiplyGeneral(const MPlex< T, D1, D2, N > &A, const MPlex< T, D2, D3, N > &B, MPlex< T, D1, D3, N > &C)
Definition: Matriplex.h:482
MPlex< T, D1, D2, N > operator*(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Definition: Matriplex.h:335
#define N
Definition: blowfish.cc:9
DecomposeProduct< arg, typename Div::arg > D
Definition: Factorize.h:141
void invertCramer(MPlex< T, D, D, N > &A, double *determ=nullptr)
Definition: Matriplex.h:711
Basic3DVector & operator/=(T t)
Scaling by a scalar value (division)
MPlex< T, D1, D2, N > tan(const MPlex< T, D1, D2, N > &a)
Definition: Matriplex.h:443
double b
Definition: hdecay.h:120
class __attribute__((aligned(32))) Matriplex
Definition: Matriplex.h:11
void add(std::map< std::string, TH1 *> &h, TH1 *hist)
double a
Definition: hdecay.h:121
void invertCholesky(MPlex< T, D, D, N > &A)
Definition: Matriplex.h:765
Definition: APVGainStruct.h:7
static void invert(MPlex< T, D, D, N > &A)
Definition: Matriplex.h:721
static void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
Definition: Matriplex.h:509
T operator[](int i) const
tmp
align.sh
Definition: createJobs.py:716
long double T
#define ASSUME_ALIGNED(a, b)
Basic3DVector & operator+=(const Basic3DVector< U > &p)
MPlex< T, D1, D2, N > cos(const MPlex< T, D1, D2, N > &a)
Definition: Matriplex.h:429
MPlex< T, D1, D2, N > sqrt(const MPlex< T, D1, D2, N > &a)
Definition: Matriplex.h:405