1 #ifndef RecoTracker_MkFitCore_src_Matriplex_Matriplex_h
2 #define RecoTracker_MkFitCore_src_Matriplex_Matriplex_h
10 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
39 fArray[
i] += v.fArray[
i];
60 memcpy(fArray, m.fArray,
sizeof(
T) *
kTotSize);
66 fArray[
i] = m.fArray[
i];
84 fArray[
i] = fArray[
in];
88 #if defined(AVX512_INTRINSICS)
91 void slurpIn(
const T* arr, __m512i& vi,
const U&,
const int N_proc =
N) {
94 const __m512
src = {0};
95 const __mmask16
k = N_proc ==
N ? -1 : (1 << N_proc) - 1;
97 for (
int i = 0;
i <
kSize; ++
i, ++arr) {
100 __m512 reg = _mm512_mask_i32gather_ps(src, k, vi, arr,
sizeof(U));
101 _mm512_mask_store_ps(&fArray[
i *
N], k, reg);
107 void ChewIn(
const char* arr,
int off,
int vi[
N],
const char*
tmp, __m512i&
ui) {
110 for (
int i = 0;
i <
N; ++
i) {
111 __m512 reg = _mm512_load_ps(arr + vi[
i]);
112 _mm512_store_ps((
void*)(tmp + 64 *
i), reg);
116 __m512 reg = _mm512_i32gather_ps(ui, tmp + off +
i *
sizeof(
T), 1);
117 _mm512_store_ps(&fArray[
i * N], reg);
121 void Contaginate(
const char* arr,
int vi[N],
const char* tmp) {
124 for (
int i = 0;
i <
N; ++
i) {
125 __m512 reg = _mm512_load_ps(arr + vi[
i]);
126 _mm512_store_ps((
void*)(tmp + 64 *
i), reg);
130 void Plexify(
const char* tmp, __m512i& ui) {
132 __m512 reg = _mm512_i32gather_ps(ui, tmp +
i *
sizeof(
T), 1);
133 _mm512_store_ps(&fArray[
i * N], reg);
137 #elif defined(AVX2_INTRINSICS)
139 template <
typename U>
140 void slurpIn(
const T* arr, __m256i& vi,
const U&,
const int N_proc = N) {
144 const __m256 src = {0};
146 __m256i k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
147 __m256i k_sel = _mm256_set1_epi32(N_proc);
148 __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);
151 for (
int i = 0;
i <
kSize; ++
i, ++arr) {
152 __m256 reg = _mm256_mask_i32gather_ps(src, (
float*)arr, vi, (__m256)k,
sizeof(U));
155 _mm256_maskstore_ps((
float*)&fArray[
i * N], k, reg);
161 void slurpIn(
const T* arr,
int vi[N],
const int N_proc = N) {
165 for (
int j = 0;
j <
N; ++
j) {
166 fArray[
i * N +
j] = *(arr +
i + vi[
j]);
171 for (
int j = 0;
j < N_proc; ++
j) {
172 fArray[
i * N +
j] = *(arr +
i + vi[
j]);
182 *(arr++) = fArray[
i];
187 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
194 template <
typename T,
idx_t D1,
idx_t D2,
idx_t D3,
idx_t N>
195 void multiplyGeneral(
const MPlex<T, D1, D2, N>&
A,
const MPlex<T, D2, D3, N>&
B,
MPlex<T, D1, D3, N>&
C) {
198 const idx_t ijo = N * (
i * D3 +
j);
202 C.fArray[ijo +
n] = 0;
206 const idx_t iko = N * (
i * D2 +
k);
207 const idx_t kjo = N * (k * D3 +
j);
211 C.fArray[ijo +
n] += A.fArray[iko +
n] * B.fArray[kjo +
n];
220 template <
typename T,
idx_t D,
idx_t N>
222 static void multiply(
const MPlex<T, D, D, N>&
A,
const MPlex<T, D, D, N>&
B,
MPlex<T, D, D, N>&
C) {
223 throw std::runtime_error(
"general multiplication not supported, well, call multiplyGeneral()");
227 template <
typename T,
idx_t N>
229 static void multiply(
const MPlex<T, 3, 3, N>&
A,
const MPlex<T, 3, 3, N>&
B,
MPlex<T, 3, 3, N>&
C) {
230 const T*
a = A.fArray;
232 const T*
b = B.fArray;
239 c[0 * N +
n] = a[0 * N +
n] * b[0 * N +
n] + a[1 * N +
n] * b[3 * N +
n] + a[2 * N +
n] * b[6 * N +
n];
240 c[1 * N +
n] = a[0 * N +
n] * b[1 * N +
n] + a[1 * N +
n] * b[4 * N +
n] + a[2 * N +
n] * b[7 * N +
n];
241 c[2 * N +
n] = a[0 * N +
n] * b[2 * N +
n] + a[1 * N +
n] * b[5 * N +
n] + a[2 * N +
n] * b[8 * N +
n];
242 c[3 * N +
n] = a[3 * N +
n] * b[0 * N +
n] + a[4 * N +
n] * b[3 * N +
n] + a[5 * N +
n] * b[6 * N +
n];
243 c[4 * N +
n] = a[3 * N +
n] * b[1 * N +
n] + a[4 * N +
n] * b[4 * N +
n] + a[5 * N +
n] * b[7 * N +
n];
244 c[5 * N +
n] = a[3 * N +
n] * b[2 * N +
n] + a[4 * N +
n] * b[5 * N +
n] + a[5 * N +
n] * b[8 * N +
n];
245 c[6 * N +
n] = a[6 * N +
n] * b[0 * N +
n] + a[7 * N +
n] * b[3 * N +
n] + a[8 * N +
n] * b[6 * N +
n];
246 c[7 * N +
n] = a[6 * N +
n] * b[1 * N +
n] + a[7 * N +
n] * b[4 * N +
n] + a[8 * N +
n] * b[7 * N +
n];
247 c[8 * N +
n] = a[6 * N +
n] * b[2 * N +
n] + a[7 * N +
n] * b[5 * N +
n] + a[8 * N +
n] * b[8 * N +
n];
252 template <
typename T,
idx_t N>
254 static void multiply(
const MPlex<T, 6, 6, N>&
A,
const MPlex<T, 6, 6, N>&
B,
MPlex<T, 6, 6, N>&
C) {
255 const T*
a = A.fArray;
257 const T*
b = B.fArray;
263 c[0 * N +
n] = a[0 * N +
n] * b[0 * N +
n] + a[1 * N +
n] * b[6 * N +
n] + a[2 * N +
n] * b[12 * N +
n] +
264 a[3 * N +
n] * b[18 * N +
n] + a[4 * N +
n] * b[24 * N +
n] + a[5 * N +
n] * b[30 * N +
n];
265 c[1 * N +
n] = a[0 * N +
n] * b[1 * N +
n] + a[1 * N +
n] * b[7 * N +
n] + a[2 * N +
n] * b[13 * N +
n] +
266 a[3 * N +
n] * b[19 * N +
n] + a[4 * N +
n] * b[25 * N +
n] + a[5 * N +
n] * b[31 * N +
n];
267 c[2 * N +
n] = a[0 * N +
n] * b[2 * N +
n] + a[1 * N +
n] * b[8 * N +
n] + a[2 * N +
n] * b[14 * N +
n] +
268 a[3 * N +
n] * b[20 * N +
n] + a[4 * N +
n] * b[26 * N +
n] + a[5 * N +
n] * b[32 * N +
n];
269 c[3 * N +
n] = a[0 * N +
n] * b[3 * N +
n] + a[1 * N +
n] * b[9 * N +
n] + a[2 * N +
n] * b[15 * N +
n] +
270 a[3 * N +
n] * b[21 * N +
n] + a[4 * N +
n] * b[27 * N +
n] + a[5 * N +
n] * b[33 * N +
n];
271 c[4 * N +
n] = a[0 * N +
n] * b[4 * N +
n] + a[1 * N +
n] * b[10 * N +
n] + a[2 * N +
n] * b[16 * N +
n] +
272 a[3 * N +
n] * b[22 * N +
n] + a[4 * N +
n] * b[28 * N +
n] + a[5 * N +
n] * b[34 * N +
n];
273 c[5 * N +
n] = a[0 * N +
n] * b[5 * N +
n] + a[1 * N +
n] * b[11 * N +
n] + a[2 * N +
n] * b[17 * N +
n] +
274 a[3 * N +
n] * b[23 * N +
n] + a[4 * N +
n] * b[29 * N +
n] + a[5 * N +
n] * b[35 * N +
n];
275 c[6 * N +
n] = a[6 * N +
n] * b[0 * N +
n] + a[7 * N +
n] * b[6 * N +
n] + a[8 * N +
n] * b[12 * N +
n] +
276 a[9 * N +
n] * b[18 * N +
n] + a[10 * N +
n] * b[24 * N +
n] + a[11 * N +
n] * b[30 * N +
n];
277 c[7 * N +
n] = a[6 * N +
n] * b[1 * N +
n] + a[7 * N +
n] * b[7 * N +
n] + a[8 * N +
n] * b[13 * N +
n] +
278 a[9 * N +
n] * b[19 * N +
n] + a[10 * N +
n] * b[25 * N +
n] + a[11 * N +
n] * b[31 * N +
n];
279 c[8 * N +
n] = a[6 * N +
n] * b[2 * N +
n] + a[7 * N +
n] * b[8 * N +
n] + a[8 * N +
n] * b[14 * N +
n] +
280 a[9 * N +
n] * b[20 * N +
n] + a[10 * N +
n] * b[26 * N +
n] + a[11 * N +
n] * b[32 * N +
n];
281 c[9 * N +
n] = a[6 * N +
n] * b[3 * N +
n] + a[7 * N +
n] * b[9 * N +
n] + a[8 * N +
n] * b[15 * N +
n] +
282 a[9 * N +
n] * b[21 * N +
n] + a[10 * N +
n] * b[27 * N +
n] + a[11 * N +
n] * b[33 * N +
n];
283 c[10 * N +
n] = a[6 * N +
n] * b[4 * N +
n] + a[7 * N +
n] * b[10 * N +
n] + a[8 * N +
n] * b[16 * N +
n] +
284 a[9 * N +
n] * b[22 * N +
n] + a[10 * N +
n] * b[28 * N +
n] + a[11 * N +
n] * b[34 * N +
n];
285 c[11 * N +
n] = a[6 * N +
n] * b[5 * N +
n] + a[7 * N +
n] * b[11 * N +
n] + a[8 * N +
n] * b[17 * N +
n] +
286 a[9 * N +
n] * b[23 * N +
n] + a[10 * N +
n] * b[29 * N +
n] + a[11 * N +
n] * b[35 * N +
n];
287 c[12 * N +
n] = a[12 * N +
n] * b[0 * N +
n] + a[13 * N +
n] * b[6 * N +
n] + a[14 * N +
n] * b[12 * N +
n] +
288 a[15 * N +
n] * b[18 * N +
n] + a[16 * N +
n] * b[24 * N +
n] + a[17 * N +
n] * b[30 * N +
n];
289 c[13 * N +
n] = a[12 * N +
n] * b[1 * N +
n] + a[13 * N +
n] * b[7 * N +
n] + a[14 * N +
n] * b[13 * N +
n] +
290 a[15 * N +
n] * b[19 * N +
n] + a[16 * N +
n] * b[25 * N +
n] + a[17 * N +
n] * b[31 * N +
n];
291 c[14 * N +
n] = a[12 * N +
n] * b[2 * N +
n] + a[13 * N +
n] * b[8 * N +
n] + a[14 * N +
n] * b[14 * N +
n] +
292 a[15 * N +
n] * b[20 * N +
n] + a[16 * N +
n] * b[26 * N +
n] + a[17 * N +
n] * b[32 * N +
n];
293 c[15 * N +
n] = a[12 * N +
n] * b[3 * N +
n] + a[13 * N +
n] * b[9 * N +
n] + a[14 * N +
n] * b[15 * N +
n] +
294 a[15 * N +
n] * b[21 * N +
n] + a[16 * N +
n] * b[27 * N +
n] + a[17 * N +
n] * b[33 * N +
n];
295 c[16 * N +
n] = a[12 * N +
n] * b[4 * N +
n] + a[13 * N +
n] * b[10 * N +
n] + a[14 * N +
n] * b[16 * N +
n] +
296 a[15 * N +
n] * b[22 * N +
n] + a[16 * N +
n] * b[28 * N +
n] + a[17 * N +
n] * b[34 * N +
n];
297 c[17 * N +
n] = a[12 * N +
n] * b[5 * N +
n] + a[13 * N +
n] * b[11 * N +
n] + a[14 * N +
n] * b[17 * N +
n] +
298 a[15 * N +
n] * b[23 * N +
n] + a[16 * N +
n] * b[29 * N +
n] + a[17 * N +
n] * b[35 * N +
n];
299 c[18 * N +
n] = a[18 * N +
n] * b[0 * N +
n] + a[19 * N +
n] * b[6 * N +
n] + a[20 * N +
n] * b[12 * N +
n] +
300 a[21 * N +
n] * b[18 * N +
n] + a[22 * N +
n] * b[24 * N +
n] + a[23 * N +
n] * b[30 * N +
n];
301 c[19 * N +
n] = a[18 * N +
n] * b[1 * N +
n] + a[19 * N +
n] * b[7 * N +
n] + a[20 * N +
n] * b[13 * N +
n] +
302 a[21 * N +
n] * b[19 * N +
n] + a[22 * N +
n] * b[25 * N +
n] + a[23 * N +
n] * b[31 * N +
n];
303 c[20 * N +
n] = a[18 * N +
n] * b[2 * N +
n] + a[19 * N +
n] * b[8 * N +
n] + a[20 * N +
n] * b[14 * N +
n] +
304 a[21 * N +
n] * b[20 * N +
n] + a[22 * N +
n] * b[26 * N +
n] + a[23 * N +
n] * b[32 * N +
n];
305 c[21 * N +
n] = a[18 * N +
n] * b[3 * N +
n] + a[19 * N +
n] * b[9 * N +
n] + a[20 * N +
n] * b[15 * N +
n] +
306 a[21 * N +
n] * b[21 * N +
n] + a[22 * N +
n] * b[27 * N +
n] + a[23 * N +
n] * b[33 * N +
n];
307 c[22 * N +
n] = a[18 * N +
n] * b[4 * N +
n] + a[19 * N +
n] * b[10 * N +
n] + a[20 * N +
n] * b[16 * N +
n] +
308 a[21 * N +
n] * b[22 * N +
n] + a[22 * N +
n] * b[28 * N +
n] + a[23 * N +
n] * b[34 * N +
n];
309 c[23 * N +
n] = a[18 * N +
n] * b[5 * N +
n] + a[19 * N +
n] * b[11 * N +
n] + a[20 * N +
n] * b[17 * N +
n] +
310 a[21 * N +
n] * b[23 * N +
n] + a[22 * N +
n] * b[29 * N +
n] + a[23 * N +
n] * b[35 * N +
n];
311 c[24 * N +
n] = a[24 * N +
n] * b[0 * N +
n] + a[25 * N +
n] * b[6 * N +
n] + a[26 * N +
n] * b[12 * N +
n] +
312 a[27 * N +
n] * b[18 * N +
n] + a[28 * N +
n] * b[24 * N +
n] + a[29 * N +
n] * b[30 * N +
n];
313 c[25 * N +
n] = a[24 * N +
n] * b[1 * N +
n] + a[25 * N +
n] * b[7 * N +
n] + a[26 * N +
n] * b[13 * N +
n] +
314 a[27 * N +
n] * b[19 * N +
n] + a[28 * N +
n] * b[25 * N +
n] + a[29 * N +
n] * b[31 * N +
n];
315 c[26 * N +
n] = a[24 * N +
n] * b[2 * N +
n] + a[25 * N +
n] * b[8 * N +
n] + a[26 * N +
n] * b[14 * N +
n] +
316 a[27 * N +
n] * b[20 * N +
n] + a[28 * N +
n] * b[26 * N +
n] + a[29 * N +
n] * b[32 * N +
n];
317 c[27 * N +
n] = a[24 * N +
n] * b[3 * N +
n] + a[25 * N +
n] * b[9 * N +
n] + a[26 * N +
n] * b[15 * N +
n] +
318 a[27 * N +
n] * b[21 * N +
n] + a[28 * N +
n] * b[27 * N +
n] + a[29 * N +
n] * b[33 * N +
n];
319 c[28 * N +
n] = a[24 * N +
n] * b[4 * N +
n] + a[25 * N +
n] * b[10 * N +
n] + a[26 * N +
n] * b[16 * N +
n] +
320 a[27 * N +
n] * b[22 * N +
n] + a[28 * N +
n] * b[28 * N +
n] + a[29 * N +
n] * b[34 * N +
n];
321 c[29 * N +
n] = a[24 * N +
n] * b[5 * N +
n] + a[25 * N +
n] * b[11 * N +
n] + a[26 * N +
n] * b[17 * N +
n] +
322 a[27 * N +
n] * b[23 * N +
n] + a[28 * N +
n] * b[29 * N +
n] + a[29 * N +
n] * b[35 * N +
n];
323 c[30 * N +
n] = a[30 * N +
n] * b[0 * N +
n] + a[31 * N +
n] * b[6 * N +
n] + a[32 * N +
n] * b[12 * N +
n] +
324 a[33 * N +
n] * b[18 * N +
n] + a[34 * N +
n] * b[24 * N +
n] + a[35 * N +
n] * b[30 * N +
n];
325 c[31 * N +
n] = a[30 * N +
n] * b[1 * N +
n] + a[31 * N +
n] * b[7 * N +
n] + a[32 * N +
n] * b[13 * N +
n] +
326 a[33 * N +
n] * b[19 * N +
n] + a[34 * N +
n] * b[25 * N +
n] + a[35 * N +
n] * b[31 * N +
n];
327 c[32 * N +
n] = a[30 * N +
n] * b[2 * N +
n] + a[31 * N +
n] * b[8 * N +
n] + a[32 * N +
n] * b[14 * N +
n] +
328 a[33 * N +
n] * b[20 * N +
n] + a[34 * N +
n] * b[26 * N +
n] + a[35 * N +
n] * b[32 * N +
n];
329 c[33 * N +
n] = a[30 * N +
n] * b[3 * N +
n] + a[31 * N +
n] * b[9 * N +
n] + a[32 * N +
n] * b[15 * N +
n] +
330 a[33 * N +
n] * b[21 * N +
n] + a[34 * N +
n] * b[27 * N +
n] + a[35 * N +
n] * b[33 * N +
n];
331 c[34 * N +
n] = a[30 * N +
n] * b[4 * N +
n] + a[31 * N +
n] * b[10 * N +
n] + a[32 * N +
n] * b[16 * N +
n] +
332 a[33 * N +
n] * b[22 * N +
n] + a[34 * N +
n] * b[28 * N +
n] + a[35 * N +
n] * b[34 * N +
n];
333 c[35 * N +
n] = a[30 * N +
n] * b[5 * N +
n] + a[31 * N +
n] * b[11 * N +
n] + a[32 * N +
n] * b[17 * N +
n] +
334 a[33 * N +
n] * b[23 * N +
n] + a[34 * N +
n] * b[29 * N +
n] + a[35 * N +
n] * b[35 * N +
n];
339 template <
typename T,
idx_t D,
idx_t N>
340 void multiply(
const MPlex<T, D, D, N>&
A,
const MPlex<T, D, D, N>&
B,
MPlex<T, D, D, N>&
C) {
342 printf(
"Multipl %d %d\n",
D, N);
352 template <
typename T,
idx_t D,
idx_t N>
355 throw std::runtime_error(
"general cramer inversion not supported");
359 template <
typename T,
idx_t N>
370 const double det = (double)a[0 * N +
n] * a[3 * N +
n] - (
double)a[2 * N +
n] * a[1 * N +
n];
374 const TT
s = TT(1) / det;
375 const TT tmp = s * a[3 * N +
n];
378 a[3 * N +
n] = s * a[0 * N +
n];
384 template <
typename T,
idx_t N>
394 const TT c00 = a[4 * N +
n] * a[8 * N +
n] - a[5 * N +
n] * a[7 * N +
n];
395 const TT c01 = a[5 * N +
n] * a[6 * N +
n] - a[3 * N +
n] * a[8 * N +
n];
396 const TT c02 = a[3 * N +
n] * a[7 * N +
n] - a[4 * N +
n] * a[6 * N +
n];
397 const TT c10 = a[7 * N +
n] * a[2 * N +
n] - a[8 * N +
n] * a[1 * N +
n];
398 const TT c11 = a[8 * N +
n] * a[0 * N +
n] - a[6 * N +
n] * a[2 * N +
n];
399 const TT c12 = a[6 * N +
n] * a[1 * N +
n] - a[7 * N +
n] * a[0 * N +
n];
400 const TT c20 = a[1 * N +
n] * a[5 * N +
n] - a[2 * N +
n] * a[4 * N +
n];
401 const TT c21 = a[2 * N +
n] * a[3 * N +
n] - a[0 * N +
n] * a[5 * N +
n];
402 const TT c22 = a[0 * N +
n] * a[4 * N +
n] - a[1 * N +
n] * a[3 * N +
n];
405 const double det = (double)a[0 * N +
n] * c00 + (
double)a[1 * N +
n] * c01 + (double)a[2 * N +
n] * c02;
409 const TT
s = TT(1) / det;
410 a[0 * N +
n] = s * c00;
411 a[1 * N +
n] = s * c10;
412 a[2 * N +
n] = s * c20;
413 a[3 * N +
n] = s * c01;
414 a[4 * N +
n] = s * c11;
415 a[5 * N +
n] = s * c21;
416 a[6 * N +
n] = s * c02;
417 a[7 * N +
n] = s * c12;
418 a[8 * N +
n] = s * c22;
423 template <
typename T,
idx_t D,
idx_t N>
432 template <
typename T,
idx_t D,
idx_t N>
437 template <
typename T,
idx_t N>
450 TT l1 = a[3 * N +
n] * l0;
451 TT l2 = a[4 * N +
n] - l1 * l1;
453 TT l3 = a[6 * N +
n] * l0;
454 TT l4 = (a[7 * N +
n] - l1 * l3) * l2;
455 TT l5 = a[8 * N +
n] - (l3 * l3 + l4 * l4);
460 l3 = (l1 * l4 * l2 - l3) * l0 * l5;
464 a[0 * N +
n] = l3 * l3 + l1 * l1 + l0 * l0;
465 a[1 * N +
n] = a[3 * N +
n] = l3 * l4 + l1 * l2;
466 a[4 * N +
n] = l4 * l4 + l2 * l2;
467 a[2 * N +
n] = a[6 * N +
n] = l3 * l5;
468 a[5 * N +
n] = a[7 * N +
n] = l4 * l5;
469 a[8 * N +
n] = l5 * l5;
477 template <
typename T,
idx_t D,
idx_t N>
T operator[](idx_t xx) const
const edm::EventSetup & c
void copyIn(idx_t n, const T *arr)
void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
static void invert(MPlex< T, D, D, N > &A, double *determ=nullptr)
static constexpr int kTotSize
size of the whole matriplex
static void multiply(const MPlex< T, 3, 3, N > &A, const MPlex< T, 3, 3, N > &B, MPlex< T, 3, 3, N > &C)
static void multiply(const MPlex< T, 6, 6, N > &A, const MPlex< T, 6, 6, N > &B, MPlex< T, 6, 6, N > &C)
Matriplex & operator=(const Matriplex &m)
void add(const Matriplex &v)
const T & operator()(idx_t n, idx_t i, idx_t j) const
const T & constAt(idx_t n, idx_t i, idx_t j) const
static void invert(MPlex< T, 2, 2, N > &A, double *determ=nullptr)
printf("params %d %f %f %f\n", minT, eps, errmax, chi2max)
static void invert(MPlex< T, 3, 3, N > &A, double *determ=nullptr)
static constexpr int kRows
return no. of matrix rows
static void invert(MPlex< T, 3, 3, N > &A)
T & operator()(idx_t n, idx_t i, idx_t j)
void multiplyGeneral(const MPlex< T, D1, D2, N > &A, const MPlex< T, D2, D3, N > &B, MPlex< T, D1, D3, N > &C)
DecomposeProduct< arg, typename Div::arg > D
void invertCramer(MPlex< T, D, D, N > &A, double *determ=nullptr)
void copyIn(idx_t n, const Matriplex &m, idx_t in)
void slurpIn(const T *arr, int vi[N], const int N_proc=N)
void copy(idx_t n, idx_t in)
void invertCholesky(MPlex< T, D, D, N > &A)
static constexpr int kCols
return no. of matrix columns
void copySlot(idx_t n, const Matriplex &m)
static void invert(MPlex< T, D, D, N > &A)
static void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
#define ASSUME_ALIGNED(a, b)
T & At(idx_t n, idx_t i, idx_t j)
void copyOut(idx_t n, T *arr) const
static constexpr int kSize
return no of elements: rows*columns
T fArray[kTotSize] __attribute__((aligned(64)))