CMS 3D CMS Logo

Matriplex.h
Go to the documentation of this file.
1 #ifndef RecoTracker_MkFitCore_src_Matriplex_Matriplex_h
2 #define RecoTracker_MkFitCore_src_Matriplex_Matriplex_h
3 
4 #include "MatriplexCommon.h"
5 
6 namespace Matriplex {
7 
8  //------------------------------------------------------------------------------
9 
10  template <typename T, idx_t D1, idx_t D2, idx_t N>
11  class Matriplex {
12  public:
13  typedef T value_type;
14 
16  static constexpr int kRows = D1;
18  static constexpr int kCols = D2;
20  static constexpr int kSize = D1 * D2;
22  static constexpr int kTotSize = N * kSize;
23 
24  T fArray[kTotSize] __attribute__((aligned(64)));
25 
26  Matriplex() {}
27  Matriplex(T v) { setVal(v); }
28 
29  idx_t plexSize() const { return N; }
30 
31  void setVal(T v) {
32  for (idx_t i = 0; i < kTotSize; ++i) {
33  fArray[i] = v;
34  }
35  }
36 
37  void add(const Matriplex& v) {
38  for (idx_t i = 0; i < kTotSize; ++i) {
39  fArray[i] += v.fArray[i];
40  }
41  }
42 
43  void scale(T scale) {
44  for (idx_t i = 0; i < kTotSize; ++i) {
45  fArray[i] *= scale;
46  }
47  }
48 
49  T operator[](idx_t xx) const { return fArray[xx]; }
50  T& operator[](idx_t xx) { return fArray[xx]; }
51 
52  const T& constAt(idx_t n, idx_t i, idx_t j) const { return fArray[(i * D2 + j) * N + n]; }
53 
54  T& At(idx_t n, idx_t i, idx_t j) { return fArray[(i * D2 + j) * N + n]; }
55 
56  T& operator()(idx_t n, idx_t i, idx_t j) { return fArray[(i * D2 + j) * N + n]; }
57  const T& operator()(idx_t n, idx_t i, idx_t j) const { return fArray[(i * D2 + j) * N + n]; }
58 
60  memcpy(fArray, m.fArray, sizeof(T) * kTotSize);
61  return *this;
62  }
63 
64  void copySlot(idx_t n, const Matriplex& m) {
65  for (idx_t i = n; i < kTotSize; i += N) {
66  fArray[i] = m.fArray[i];
67  }
68  }
69 
70  void copyIn(idx_t n, const T* arr) {
71  for (idx_t i = n; i < kTotSize; i += N) {
72  fArray[i] = *(arr++);
73  }
74  }
75 
76  void copyIn(idx_t n, const Matriplex& m, idx_t in) {
77  for (idx_t i = n; i < kTotSize; i += N, in += N) {
78  fArray[i] = m[in];
79  }
80  }
81 
82  void copy(idx_t n, idx_t in) {
83  for (idx_t i = n; i < kTotSize; i += N, in += N) {
84  fArray[i] = fArray[in];
85  }
86  }
87 
88 #if defined(AVX512_INTRINSICS)
89 
90  template <typename U>
91  void slurpIn(const T* arr, __m512i& vi, const U&, const int N_proc = N) {
92  //_mm512_prefetch_i32gather_ps(vi, arr, 1, _MM_HINT_T0);
93 
94  const __m512 src = {0};
95  const __mmask16 k = N_proc == N ? -1 : (1 << N_proc) - 1;
96 
97  for (int i = 0; i < kSize; ++i, ++arr) {
98  //_mm512_prefetch_i32gather_ps(vi, arr+2, 1, _MM_HINT_NTA);
99 
100  __m512 reg = _mm512_mask_i32gather_ps(src, k, vi, arr, sizeof(U));
101  _mm512_mask_store_ps(&fArray[i * N], k, reg);
102  }
103  }
104 
105  // Experimental methods, slurpIn() seems to be at least as fast.
106  // See comments in mkFit/MkFitter.cc MkFitter::addBestHit().
107  void ChewIn(const char* arr, int off, int vi[N], const char* tmp, __m512i& ui) {
108  // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.
109 
110  for (int i = 0; i < N; ++i) {
111  __m512 reg = _mm512_load_ps(arr + vi[i]);
112  _mm512_store_ps((void*)(tmp + 64 * i), reg);
113  }
114 
115  for (int i = 0; i < kSize; ++i) {
116  __m512 reg = _mm512_i32gather_ps(ui, tmp + off + i * sizeof(T), 1);
117  _mm512_store_ps(&fArray[i * N], reg);
118  }
119  }
120 
121  void Contaginate(const char* arr, int vi[N], const char* tmp) {
122  // This is a hack ... we know sizeof(Hit) = 64 = cache line = vector width.
123 
124  for (int i = 0; i < N; ++i) {
125  __m512 reg = _mm512_load_ps(arr + vi[i]);
126  _mm512_store_ps((void*)(tmp + 64 * i), reg);
127  }
128  }
129 
130  void Plexify(const char* tmp, __m512i& ui) {
131  for (int i = 0; i < kSize; ++i) {
132  __m512 reg = _mm512_i32gather_ps(ui, tmp + i * sizeof(T), 1);
133  _mm512_store_ps(&fArray[i * N], reg);
134  }
135  }
136 
137 #elif defined(AVX2_INTRINSICS)
138 
139  template <typename U>
140  void slurpIn(const T* arr, __m256i& vi, const U&, const int N_proc = N) {
141  // Casts to float* needed to "support" also T=HitOnTrack.
142  // Note that sizeof(float) == sizeof(HitOnTrack) == 4.
143 
144  const __m256 src = {0};
145 
146  __m256i k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
147  __m256i k_sel = _mm256_set1_epi32(N_proc);
148  __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);
149 
150  k = k_master;
151  for (int i = 0; i < kSize; ++i, ++arr) {
152  __m256 reg = _mm256_mask_i32gather_ps(src, (float*)arr, vi, (__m256)k, sizeof(U));
153  // Restore mask (docs say gather clears it but it doesn't seem to).
154  k = k_master;
155  _mm256_maskstore_ps((float*)&fArray[i * N], k, reg);
156  }
157  }
158 
159 #else
160 
161  void slurpIn(const T* arr, int vi[N], const int N_proc = N) {
162  // Separate N_proc == N case (gains about 7% in fit test).
163  if (N_proc == N) {
164  for (int i = 0; i < kSize; ++i) {
165  for (int j = 0; j < N; ++j) {
166  fArray[i * N + j] = *(arr + i + vi[j]);
167  }
168  }
169  } else {
170  for (int i = 0; i < kSize; ++i) {
171  for (int j = 0; j < N_proc; ++j) {
172  fArray[i * N + j] = *(arr + i + vi[j]);
173  }
174  }
175  }
176  }
177 
178 #endif
179 
180  void copyOut(idx_t n, T* arr) const {
181  for (idx_t i = n; i < kTotSize; i += N) {
182  *(arr++) = fArray[i];
183  }
184  }
185  };
186 
187  template <typename T, idx_t D1, idx_t D2, idx_t N>
189 
190  //==============================================================================
191  // Multiplications
192  //==============================================================================
193 
194  template <typename T, idx_t D1, idx_t D2, idx_t D3, idx_t N>
196  for (idx_t i = 0; i < D1; ++i) {
197  for (idx_t j = 0; j < D3; ++j) {
198  const idx_t ijo = N * (i * D3 + j);
199 
200 #pragma omp simd
201  for (idx_t n = 0; n < N; ++n) {
202  C.fArray[ijo + n] = 0;
203  }
204 
205  for (idx_t k = 0; k < D2; ++k) {
206  const idx_t iko = N * (i * D2 + k);
207  const idx_t kjo = N * (k * D3 + j);
208 
209 #pragma omp simd
210  for (idx_t n = 0; n < N; ++n) {
211  C.fArray[ijo + n] += A.fArray[iko + n] * B.fArray[kjo + n];
212  }
213  }
214  }
215  }
216  }
217 
218  //------------------------------------------------------------------------------
219 
220  template <typename T, idx_t D, idx_t N>
221  struct MultiplyCls {
223  throw std::runtime_error("general multiplication not supported, well, call multiplyGeneral()");
224  }
225  };
226 
227  template <typename T, idx_t N>
228  struct MultiplyCls<T, 3, N> {
230  const T* a = A.fArray;
231  ASSUME_ALIGNED(a, 64);
232  const T* b = B.fArray;
233  ASSUME_ALIGNED(b, 64);
234  T* c = C.fArray;
235  ASSUME_ALIGNED(c, 64);
236 
237 #pragma omp simd
238  for (idx_t n = 0; n < N; ++n) {
239  c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[3 * N + n] + a[2 * N + n] * b[6 * N + n];
240  c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[4 * N + n] + a[2 * N + n] * b[7 * N + n];
241  c[2 * N + n] = a[0 * N + n] * b[2 * N + n] + a[1 * N + n] * b[5 * N + n] + a[2 * N + n] * b[8 * N + n];
242  c[3 * N + n] = a[3 * N + n] * b[0 * N + n] + a[4 * N + n] * b[3 * N + n] + a[5 * N + n] * b[6 * N + n];
243  c[4 * N + n] = a[3 * N + n] * b[1 * N + n] + a[4 * N + n] * b[4 * N + n] + a[5 * N + n] * b[7 * N + n];
244  c[5 * N + n] = a[3 * N + n] * b[2 * N + n] + a[4 * N + n] * b[5 * N + n] + a[5 * N + n] * b[8 * N + n];
245  c[6 * N + n] = a[6 * N + n] * b[0 * N + n] + a[7 * N + n] * b[3 * N + n] + a[8 * N + n] * b[6 * N + n];
246  c[7 * N + n] = a[6 * N + n] * b[1 * N + n] + a[7 * N + n] * b[4 * N + n] + a[8 * N + n] * b[7 * N + n];
247  c[8 * N + n] = a[6 * N + n] * b[2 * N + n] + a[7 * N + n] * b[5 * N + n] + a[8 * N + n] * b[8 * N + n];
248  }
249  }
250  };
251 
252  template <typename T, idx_t N>
253  struct MultiplyCls<T, 6, N> {
255  const T* a = A.fArray;
256  ASSUME_ALIGNED(a, 64);
257  const T* b = B.fArray;
258  ASSUME_ALIGNED(b, 64);
259  T* c = C.fArray;
260  ASSUME_ALIGNED(c, 64);
261 #pragma omp simd
262  for (idx_t n = 0; n < N; ++n) {
263  c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[6 * N + n] + a[2 * N + n] * b[12 * N + n] +
264  a[3 * N + n] * b[18 * N + n] + a[4 * N + n] * b[24 * N + n] + a[5 * N + n] * b[30 * N + n];
265  c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[7 * N + n] + a[2 * N + n] * b[13 * N + n] +
266  a[3 * N + n] * b[19 * N + n] + a[4 * N + n] * b[25 * N + n] + a[5 * N + n] * b[31 * N + n];
267  c[2 * N + n] = a[0 * N + n] * b[2 * N + n] + a[1 * N + n] * b[8 * N + n] + a[2 * N + n] * b[14 * N + n] +
268  a[3 * N + n] * b[20 * N + n] + a[4 * N + n] * b[26 * N + n] + a[5 * N + n] * b[32 * N + n];
269  c[3 * N + n] = a[0 * N + n] * b[3 * N + n] + a[1 * N + n] * b[9 * N + n] + a[2 * N + n] * b[15 * N + n] +
270  a[3 * N + n] * b[21 * N + n] + a[4 * N + n] * b[27 * N + n] + a[5 * N + n] * b[33 * N + n];
271  c[4 * N + n] = a[0 * N + n] * b[4 * N + n] + a[1 * N + n] * b[10 * N + n] + a[2 * N + n] * b[16 * N + n] +
272  a[3 * N + n] * b[22 * N + n] + a[4 * N + n] * b[28 * N + n] + a[5 * N + n] * b[34 * N + n];
273  c[5 * N + n] = a[0 * N + n] * b[5 * N + n] + a[1 * N + n] * b[11 * N + n] + a[2 * N + n] * b[17 * N + n] +
274  a[3 * N + n] * b[23 * N + n] + a[4 * N + n] * b[29 * N + n] + a[5 * N + n] * b[35 * N + n];
275  c[6 * N + n] = a[6 * N + n] * b[0 * N + n] + a[7 * N + n] * b[6 * N + n] + a[8 * N + n] * b[12 * N + n] +
276  a[9 * N + n] * b[18 * N + n] + a[10 * N + n] * b[24 * N + n] + a[11 * N + n] * b[30 * N + n];
277  c[7 * N + n] = a[6 * N + n] * b[1 * N + n] + a[7 * N + n] * b[7 * N + n] + a[8 * N + n] * b[13 * N + n] +
278  a[9 * N + n] * b[19 * N + n] + a[10 * N + n] * b[25 * N + n] + a[11 * N + n] * b[31 * N + n];
279  c[8 * N + n] = a[6 * N + n] * b[2 * N + n] + a[7 * N + n] * b[8 * N + n] + a[8 * N + n] * b[14 * N + n] +
280  a[9 * N + n] * b[20 * N + n] + a[10 * N + n] * b[26 * N + n] + a[11 * N + n] * b[32 * N + n];
281  c[9 * N + n] = a[6 * N + n] * b[3 * N + n] + a[7 * N + n] * b[9 * N + n] + a[8 * N + n] * b[15 * N + n] +
282  a[9 * N + n] * b[21 * N + n] + a[10 * N + n] * b[27 * N + n] + a[11 * N + n] * b[33 * N + n];
283  c[10 * N + n] = a[6 * N + n] * b[4 * N + n] + a[7 * N + n] * b[10 * N + n] + a[8 * N + n] * b[16 * N + n] +
284  a[9 * N + n] * b[22 * N + n] + a[10 * N + n] * b[28 * N + n] + a[11 * N + n] * b[34 * N + n];
285  c[11 * N + n] = a[6 * N + n] * b[5 * N + n] + a[7 * N + n] * b[11 * N + n] + a[8 * N + n] * b[17 * N + n] +
286  a[9 * N + n] * b[23 * N + n] + a[10 * N + n] * b[29 * N + n] + a[11 * N + n] * b[35 * N + n];
287  c[12 * N + n] = a[12 * N + n] * b[0 * N + n] + a[13 * N + n] * b[6 * N + n] + a[14 * N + n] * b[12 * N + n] +
288  a[15 * N + n] * b[18 * N + n] + a[16 * N + n] * b[24 * N + n] + a[17 * N + n] * b[30 * N + n];
289  c[13 * N + n] = a[12 * N + n] * b[1 * N + n] + a[13 * N + n] * b[7 * N + n] + a[14 * N + n] * b[13 * N + n] +
290  a[15 * N + n] * b[19 * N + n] + a[16 * N + n] * b[25 * N + n] + a[17 * N + n] * b[31 * N + n];
291  c[14 * N + n] = a[12 * N + n] * b[2 * N + n] + a[13 * N + n] * b[8 * N + n] + a[14 * N + n] * b[14 * N + n] +
292  a[15 * N + n] * b[20 * N + n] + a[16 * N + n] * b[26 * N + n] + a[17 * N + n] * b[32 * N + n];
293  c[15 * N + n] = a[12 * N + n] * b[3 * N + n] + a[13 * N + n] * b[9 * N + n] + a[14 * N + n] * b[15 * N + n] +
294  a[15 * N + n] * b[21 * N + n] + a[16 * N + n] * b[27 * N + n] + a[17 * N + n] * b[33 * N + n];
295  c[16 * N + n] = a[12 * N + n] * b[4 * N + n] + a[13 * N + n] * b[10 * N + n] + a[14 * N + n] * b[16 * N + n] +
296  a[15 * N + n] * b[22 * N + n] + a[16 * N + n] * b[28 * N + n] + a[17 * N + n] * b[34 * N + n];
297  c[17 * N + n] = a[12 * N + n] * b[5 * N + n] + a[13 * N + n] * b[11 * N + n] + a[14 * N + n] * b[17 * N + n] +
298  a[15 * N + n] * b[23 * N + n] + a[16 * N + n] * b[29 * N + n] + a[17 * N + n] * b[35 * N + n];
299  c[18 * N + n] = a[18 * N + n] * b[0 * N + n] + a[19 * N + n] * b[6 * N + n] + a[20 * N + n] * b[12 * N + n] +
300  a[21 * N + n] * b[18 * N + n] + a[22 * N + n] * b[24 * N + n] + a[23 * N + n] * b[30 * N + n];
301  c[19 * N + n] = a[18 * N + n] * b[1 * N + n] + a[19 * N + n] * b[7 * N + n] + a[20 * N + n] * b[13 * N + n] +
302  a[21 * N + n] * b[19 * N + n] + a[22 * N + n] * b[25 * N + n] + a[23 * N + n] * b[31 * N + n];
303  c[20 * N + n] = a[18 * N + n] * b[2 * N + n] + a[19 * N + n] * b[8 * N + n] + a[20 * N + n] * b[14 * N + n] +
304  a[21 * N + n] * b[20 * N + n] + a[22 * N + n] * b[26 * N + n] + a[23 * N + n] * b[32 * N + n];
305  c[21 * N + n] = a[18 * N + n] * b[3 * N + n] + a[19 * N + n] * b[9 * N + n] + a[20 * N + n] * b[15 * N + n] +
306  a[21 * N + n] * b[21 * N + n] + a[22 * N + n] * b[27 * N + n] + a[23 * N + n] * b[33 * N + n];
307  c[22 * N + n] = a[18 * N + n] * b[4 * N + n] + a[19 * N + n] * b[10 * N + n] + a[20 * N + n] * b[16 * N + n] +
308  a[21 * N + n] * b[22 * N + n] + a[22 * N + n] * b[28 * N + n] + a[23 * N + n] * b[34 * N + n];
309  c[23 * N + n] = a[18 * N + n] * b[5 * N + n] + a[19 * N + n] * b[11 * N + n] + a[20 * N + n] * b[17 * N + n] +
310  a[21 * N + n] * b[23 * N + n] + a[22 * N + n] * b[29 * N + n] + a[23 * N + n] * b[35 * N + n];
311  c[24 * N + n] = a[24 * N + n] * b[0 * N + n] + a[25 * N + n] * b[6 * N + n] + a[26 * N + n] * b[12 * N + n] +
312  a[27 * N + n] * b[18 * N + n] + a[28 * N + n] * b[24 * N + n] + a[29 * N + n] * b[30 * N + n];
313  c[25 * N + n] = a[24 * N + n] * b[1 * N + n] + a[25 * N + n] * b[7 * N + n] + a[26 * N + n] * b[13 * N + n] +
314  a[27 * N + n] * b[19 * N + n] + a[28 * N + n] * b[25 * N + n] + a[29 * N + n] * b[31 * N + n];
315  c[26 * N + n] = a[24 * N + n] * b[2 * N + n] + a[25 * N + n] * b[8 * N + n] + a[26 * N + n] * b[14 * N + n] +
316  a[27 * N + n] * b[20 * N + n] + a[28 * N + n] * b[26 * N + n] + a[29 * N + n] * b[32 * N + n];
317  c[27 * N + n] = a[24 * N + n] * b[3 * N + n] + a[25 * N + n] * b[9 * N + n] + a[26 * N + n] * b[15 * N + n] +
318  a[27 * N + n] * b[21 * N + n] + a[28 * N + n] * b[27 * N + n] + a[29 * N + n] * b[33 * N + n];
319  c[28 * N + n] = a[24 * N + n] * b[4 * N + n] + a[25 * N + n] * b[10 * N + n] + a[26 * N + n] * b[16 * N + n] +
320  a[27 * N + n] * b[22 * N + n] + a[28 * N + n] * b[28 * N + n] + a[29 * N + n] * b[34 * N + n];
321  c[29 * N + n] = a[24 * N + n] * b[5 * N + n] + a[25 * N + n] * b[11 * N + n] + a[26 * N + n] * b[17 * N + n] +
322  a[27 * N + n] * b[23 * N + n] + a[28 * N + n] * b[29 * N + n] + a[29 * N + n] * b[35 * N + n];
323  c[30 * N + n] = a[30 * N + n] * b[0 * N + n] + a[31 * N + n] * b[6 * N + n] + a[32 * N + n] * b[12 * N + n] +
324  a[33 * N + n] * b[18 * N + n] + a[34 * N + n] * b[24 * N + n] + a[35 * N + n] * b[30 * N + n];
325  c[31 * N + n] = a[30 * N + n] * b[1 * N + n] + a[31 * N + n] * b[7 * N + n] + a[32 * N + n] * b[13 * N + n] +
326  a[33 * N + n] * b[19 * N + n] + a[34 * N + n] * b[25 * N + n] + a[35 * N + n] * b[31 * N + n];
327  c[32 * N + n] = a[30 * N + n] * b[2 * N + n] + a[31 * N + n] * b[8 * N + n] + a[32 * N + n] * b[14 * N + n] +
328  a[33 * N + n] * b[20 * N + n] + a[34 * N + n] * b[26 * N + n] + a[35 * N + n] * b[32 * N + n];
329  c[33 * N + n] = a[30 * N + n] * b[3 * N + n] + a[31 * N + n] * b[9 * N + n] + a[32 * N + n] * b[15 * N + n] +
330  a[33 * N + n] * b[21 * N + n] + a[34 * N + n] * b[27 * N + n] + a[35 * N + n] * b[33 * N + n];
331  c[34 * N + n] = a[30 * N + n] * b[4 * N + n] + a[31 * N + n] * b[10 * N + n] + a[32 * N + n] * b[16 * N + n] +
332  a[33 * N + n] * b[22 * N + n] + a[34 * N + n] * b[28 * N + n] + a[35 * N + n] * b[34 * N + n];
333  c[35 * N + n] = a[30 * N + n] * b[5 * N + n] + a[31 * N + n] * b[11 * N + n] + a[32 * N + n] * b[17 * N + n] +
334  a[33 * N + n] * b[23 * N + n] + a[34 * N + n] * b[29 * N + n] + a[35 * N + n] * b[35 * N + n];
335  }
336  }
337  };
338 
339  template <typename T, idx_t D, idx_t N>
341 #ifdef DEBUG
342  printf("Multipl %d %d\n", D, N);
343 #endif
344 
346  }
347 
348  //==============================================================================
349  // Cramer inversion
350  //==============================================================================
351 
352  template <typename T, idx_t D, idx_t N>
353  struct CramerInverter {
354  static void invert(MPlex<T, D, D, N>& A, double* determ = nullptr) {
355  throw std::runtime_error("general cramer inversion not supported");
356  }
357  };
358 
359  template <typename T, idx_t N>
360  struct CramerInverter<T, 2, N> {
361  static void invert(MPlex<T, 2, 2, N>& A, double* determ = nullptr) {
362  typedef T TT;
363 
364  T* a = A.fArray;
365  ASSUME_ALIGNED(a, 64);
366 
367 #pragma omp simd
368  for (idx_t n = 0; n < N; ++n) {
369  // Force determinant calculation in double precision.
370  const double det = (double)a[0 * N + n] * a[3 * N + n] - (double)a[2 * N + n] * a[1 * N + n];
371  if (determ)
372  determ[n] = det;
373 
374  const TT s = TT(1) / det;
375  const TT tmp = s * a[3 * N + n];
376  a[1 * N + n] *= -s;
377  a[2 * N + n] *= -s;
378  a[3 * N + n] = s * a[0 * N + n];
379  a[0 * N + n] = tmp;
380  }
381  }
382  };
383 
384  template <typename T, idx_t N>
385  struct CramerInverter<T, 3, N> {
386  static void invert(MPlex<T, 3, 3, N>& A, double* determ = nullptr) {
387  typedef T TT;
388 
389  T* a = A.fArray;
390  ASSUME_ALIGNED(a, 64);
391 
392 #pragma omp simd
393  for (idx_t n = 0; n < N; ++n) {
394  const TT c00 = a[4 * N + n] * a[8 * N + n] - a[5 * N + n] * a[7 * N + n];
395  const TT c01 = a[5 * N + n] * a[6 * N + n] - a[3 * N + n] * a[8 * N + n];
396  const TT c02 = a[3 * N + n] * a[7 * N + n] - a[4 * N + n] * a[6 * N + n];
397  const TT c10 = a[7 * N + n] * a[2 * N + n] - a[8 * N + n] * a[1 * N + n];
398  const TT c11 = a[8 * N + n] * a[0 * N + n] - a[6 * N + n] * a[2 * N + n];
399  const TT c12 = a[6 * N + n] * a[1 * N + n] - a[7 * N + n] * a[0 * N + n];
400  const TT c20 = a[1 * N + n] * a[5 * N + n] - a[2 * N + n] * a[4 * N + n];
401  const TT c21 = a[2 * N + n] * a[3 * N + n] - a[0 * N + n] * a[5 * N + n];
402  const TT c22 = a[0 * N + n] * a[4 * N + n] - a[1 * N + n] * a[3 * N + n];
403 
404  // Force determinant calculation in double precision.
405  const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[2 * N + n] * c02;
406  if (determ)
407  determ[n] = det;
408 
409  const TT s = TT(1) / det;
410  a[0 * N + n] = s * c00;
411  a[1 * N + n] = s * c10;
412  a[2 * N + n] = s * c20;
413  a[3 * N + n] = s * c01;
414  a[4 * N + n] = s * c11;
415  a[5 * N + n] = s * c21;
416  a[6 * N + n] = s * c02;
417  a[7 * N + n] = s * c12;
418  a[8 * N + n] = s * c22;
419  }
420  }
421  };
422 
423  template <typename T, idx_t D, idx_t N>
424  void invertCramer(MPlex<T, D, D, N>& A, double* determ = nullptr) {
426  }
427 
428  //==============================================================================
429  // Cholesky inversion
430  //==============================================================================
431 
432  template <typename T, idx_t D, idx_t N>
434  static void invert(MPlex<T, D, D, N>& A) { throw std::runtime_error("general cholesky inversion not supported"); }
435  };
436 
437  template <typename T, idx_t N>
438  struct CholeskyInverter<T, 3, N> {
439  // Note: this only works on symmetric matrices.
440  // Optimized version for positive definite matrices, no checks.
441  static void invert(MPlex<T, 3, 3, N>& A) {
442  typedef T TT;
443 
444  T* a = A.fArray;
445  ASSUME_ALIGNED(a, 64);
446 
447 #pragma omp simd
448  for (idx_t n = 0; n < N; ++n) {
449  TT l0 = std::sqrt(T(1) / a[0 * N + n]);
450  TT l1 = a[3 * N + n] * l0;
451  TT l2 = a[4 * N + n] - l1 * l1;
452  l2 = std::sqrt(T(1) / l2);
453  TT l3 = a[6 * N + n] * l0;
454  TT l4 = (a[7 * N + n] - l1 * l3) * l2;
455  TT l5 = a[8 * N + n] - (l3 * l3 + l4 * l4);
456  l5 = std::sqrt(T(1) / l5);
457 
458  // decomposition done
459 
460  l3 = (l1 * l4 * l2 - l3) * l0 * l5;
461  l1 = -l1 * l0 * l2;
462  l4 = -l4 * l2 * l5;
463 
464  a[0 * N + n] = l3 * l3 + l1 * l1 + l0 * l0;
465  a[1 * N + n] = a[3 * N + n] = l3 * l4 + l1 * l2;
466  a[4 * N + n] = l4 * l4 + l2 * l2;
467  a[2 * N + n] = a[6 * N + n] = l3 * l5;
468  a[5 * N + n] = a[7 * N + n] = l4 * l5;
469  a[8 * N + n] = l5 * l5;
470 
471  // m(2,x) are all zero if anything went wrong at l5.
472  // all zero, if anything went wrong already for l0 or l2.
473  }
474  }
475  };
476 
477  template <typename T, idx_t D, idx_t N>
480  }
481 
482 } // namespace Matriplex
483 
484 #endif
Divides< B, C > D2
Definition: Factorize.h:137
Definition: APVGainStruct.h:7
void copyIn(idx_t n, const T *arr)
Definition: Matriplex.h:70
void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
Definition: Matriplex.h:340
static void invert(MPlex< T, D, D, N > &A, double *determ=nullptr)
Definition: Matriplex.h:354
const T & constAt(idx_t n, idx_t i, idx_t j) const
Definition: Matriplex.h:52
T & operator[](idx_t xx)
Definition: Matriplex.h:50
static void multiply(const MPlex< T, 3, 3, N > &A, const MPlex< T, 3, 3, N > &B, MPlex< T, 3, 3, N > &C)
Definition: Matriplex.h:229
static void multiply(const MPlex< T, 6, 6, N > &A, const MPlex< T, 6, 6, N > &B, MPlex< T, 6, 6, N > &C)
Definition: Matriplex.h:254
Matriplex & operator=(const Matriplex &m)
Definition: Matriplex.h:59
void add(const Matriplex &v)
Definition: Matriplex.h:37
static void invert(MPlex< T, 2, 2, N > &A, double *determ=nullptr)
Definition: Matriplex.h:361
const T & operator()(idx_t n, idx_t i, idx_t j) const
Definition: Matriplex.h:57
T sqrt(T t)
Definition: SSEVec.h:19
float __attribute__((vector_size(8))) cms_float32x2_t
Definition: ExtVec.h:8
Divides< A, C > D1
Definition: Factorize.h:136
static void invert(MPlex< T, 3, 3, N > &A, double *determ=nullptr)
Definition: Matriplex.h:386
static void invert(MPlex< T, 3, 3, N > &A)
Definition: Matriplex.h:441
T & operator()(idx_t n, idx_t i, idx_t j)
Definition: Matriplex.h:56
void multiplyGeneral(const MPlex< T, D1, D2, N > &A, const MPlex< T, D2, D3, N > &B, MPlex< T, D1, D3, N > &C)
Definition: Matriplex.h:195
#define N
Definition: blowfish.cc:9
DecomposeProduct< arg, typename Div::arg > D
Definition: Factorize.h:141
void invertCramer(MPlex< T, D, D, N > &A, double *determ=nullptr)
Definition: Matriplex.h:424
void copyIn(idx_t n, const Matriplex &m, idx_t in)
Definition: Matriplex.h:76
void slurpIn(const T *arr, int vi[N], const int N_proc=N)
Definition: Matriplex.h:161
double b
Definition: hdecay.h:120
idx_t plexSize() const
Definition: Matriplex.h:29
void copy(idx_t n, idx_t in)
Definition: Matriplex.h:82
double a
Definition: hdecay.h:121
void invertCholesky(MPlex< T, D, D, N > &A)
Definition: Matriplex.h:478
void copyOut(idx_t n, T *arr) const
Definition: Matriplex.h:180
void copySlot(idx_t n, const Matriplex &m)
Definition: Matriplex.h:64
Definition: APVGainStruct.h:7
static void invert(MPlex< T, D, D, N > &A)
Definition: Matriplex.h:434
void setVal(T v)
Definition: Matriplex.h:31
T operator[](idx_t xx) const
Definition: Matriplex.h:49
static void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
Definition: Matriplex.h:222
void scale(T scale)
Definition: Matriplex.h:43
tmp
align.sh
Definition: createJobs.py:716
long double T
#define ASSUME_ALIGNED(a, b)
T & At(idx_t n, idx_t i, idx_t j)
Definition: Matriplex.h:54