1 #ifndef RecoTracker_MkFitCore_src_Matriplex_Matriplex_h 2 #define RecoTracker_MkFitCore_src_Matriplex_Matriplex_h 10 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
16 static constexpr
int kRows =
D1;
18 static constexpr
int kCols =
D2;
20 static constexpr
int kSize =
D1 *
D2;
22 static constexpr
int kTotSize =
N * kSize;
32 for (
idx_t i = 0;
i < kTotSize; ++
i) {
38 for (
idx_t i = 0;
i < kTotSize; ++
i) {
39 fArray[
i] +=
v.fArray[
i];
44 for (
idx_t i = 0;
i < kTotSize; ++
i) {
60 memcpy(fArray,
m.fArray,
sizeof(
T) * kTotSize);
66 fArray[
i] =
m.fArray[
i];
84 fArray[
i] = fArray[
in];
88 #if defined(AVX512_INTRINSICS) 91 void slurpIn(
const T* arr, __m512i& vi,
const U&,
const int N_proc =
N) {
94 const __m512
src = {0};
95 const __mmask16
k = N_proc ==
N ? -1 : (1 << N_proc) - 1;
97 for (
int i = 0;
i < kSize; ++
i, ++arr) {
100 __m512 reg = _mm512_mask_i32gather_ps(
src,
k, vi, arr,
sizeof(
U));
101 _mm512_mask_store_ps(&fArray[
i *
N],
k, reg);
107 void ChewIn(
const char* arr,
int off,
int vi[
N],
const char*
tmp, __m512i&
ui) {
110 for (
int i = 0;
i <
N; ++
i) {
111 __m512 reg = _mm512_load_ps(arr + vi[
i]);
112 _mm512_store_ps((
void*)(
tmp + 64 *
i), reg);
115 for (
int i = 0;
i < kSize; ++
i) {
116 __m512 reg = _mm512_i32gather_ps(
ui,
tmp + off +
i *
sizeof(
T), 1);
117 _mm512_store_ps(&fArray[
i *
N], reg);
121 void Contaginate(
const char* arr,
int vi[
N],
const char*
tmp) {
124 for (
int i = 0;
i <
N; ++
i) {
125 __m512 reg = _mm512_load_ps(arr + vi[
i]);
126 _mm512_store_ps((
void*)(
tmp + 64 *
i), reg);
130 void Plexify(
const char*
tmp, __m512i&
ui) {
131 for (
int i = 0;
i < kSize; ++
i) {
132 __m512 reg = _mm512_i32gather_ps(
ui,
tmp +
i *
sizeof(
T), 1);
133 _mm512_store_ps(&fArray[
i *
N], reg);
137 #elif defined(AVX2_INTRINSICS) 139 template <
typename U>
140 void slurpIn(
const T* arr, __m256i& vi,
const U&,
const int N_proc =
N) {
144 const __m256
src = {0};
146 __m256i
k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
147 __m256i k_sel = _mm256_set1_epi32(N_proc);
148 __m256i k_master = _mm256_cmpgt_epi32(k_sel,
k);
151 for (
int i = 0;
i < kSize; ++
i, ++arr) {
152 __m256 reg = _mm256_mask_i32gather_ps(
src, (
float*)arr, vi, (__m256)
k,
sizeof(
U));
155 _mm256_maskstore_ps((
float*)&fArray[
i *
N],
k, reg);
161 void slurpIn(
const T* arr,
int vi[
N],
const int N_proc =
N) {
164 for (
int i = 0;
i < kSize; ++
i) {
165 for (
int j = 0;
j <
N; ++
j) {
166 fArray[
i *
N +
j] = *(arr +
i + vi[
j]);
170 for (
int i = 0;
i < kSize; ++
i) {
171 for (
int j = 0;
j < N_proc; ++
j) {
172 fArray[
i *
N +
j] = *(arr +
i + vi[
j]);
182 *(arr++) = fArray[
i];
187 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
194 template <
typename T,
idx_t D1,
idx_t D2,
idx_t D3,
idx_t N>
195 void multiplyGeneral(
const MPlex<T, D1, D2, N>&
A,
const MPlex<T, D2, D3, N>&
B,
MPlex<T, D1, D3, N>&
C) {
202 C.fArray[ijo +
n] = 0;
211 C.fArray[ijo +
n] +=
A.fArray[iko +
n] *
B.fArray[kjo +
n];
220 template <
typename T,
idx_t D,
idx_t N>
222 static void multiply(
const MPlex<T, D, D, N>&
A,
const MPlex<T, D, D, N>&
B,
MPlex<T, D, D, N>&
C) {
223 throw std::runtime_error(
"general multiplication not supported, well, call multiplyGeneral()");
227 template <
typename T,
idx_t N>
229 static void multiply(
const MPlex<T, 3, 3, N>&
A,
const MPlex<T, 3, 3, N>&
B,
MPlex<T, 3, 3, N>&
C) {
230 const T*
a =
A.fArray;
232 const T*
b =
B.fArray;
239 c[0 *
N +
n] =
a[0 *
N +
n] *
b[0 *
N +
n] +
a[1 *
N +
n] *
b[3 *
N +
n] +
a[2 *
N +
n] *
b[6 *
N +
n];
240 c[1 *
N +
n] =
a[0 *
N +
n] *
b[1 *
N +
n] +
a[1 *
N +
n] *
b[4 *
N +
n] +
a[2 *
N +
n] *
b[7 *
N +
n];
241 c[2 *
N +
n] =
a[0 *
N +
n] *
b[2 *
N +
n] +
a[1 *
N +
n] *
b[5 *
N +
n] +
a[2 *
N +
n] *
b[8 *
N +
n];
242 c[3 *
N +
n] =
a[3 *
N +
n] *
b[0 *
N +
n] +
a[4 *
N +
n] *
b[3 *
N +
n] +
a[5 *
N +
n] *
b[6 *
N +
n];
243 c[4 *
N +
n] =
a[3 *
N +
n] *
b[1 *
N +
n] +
a[4 *
N +
n] *
b[4 *
N +
n] +
a[5 *
N +
n] *
b[7 *
N +
n];
244 c[5 *
N +
n] =
a[3 *
N +
n] *
b[2 *
N +
n] +
a[4 *
N +
n] *
b[5 *
N +
n] +
a[5 *
N +
n] *
b[8 *
N +
n];
245 c[6 *
N +
n] =
a[6 *
N +
n] *
b[0 *
N +
n] +
a[7 *
N +
n] *
b[3 *
N +
n] +
a[8 *
N +
n] *
b[6 *
N +
n];
246 c[7 *
N +
n] =
a[6 *
N +
n] *
b[1 *
N +
n] +
a[7 *
N +
n] *
b[4 *
N +
n] +
a[8 *
N +
n] *
b[7 *
N +
n];
247 c[8 *
N +
n] =
a[6 *
N +
n] *
b[2 *
N +
n] +
a[7 *
N +
n] *
b[5 *
N +
n] +
a[8 *
N +
n] *
b[8 *
N +
n];
252 template <
typename T,
idx_t N>
254 static void multiply(
const MPlex<T, 6, 6, N>&
A,
const MPlex<T, 6, 6, N>&
B,
MPlex<T, 6, 6, N>&
C) {
255 const T*
a =
A.fArray;
257 const T*
b =
B.fArray;
263 c[0 *
N +
n] =
a[0 *
N +
n] *
b[0 *
N +
n] +
a[1 *
N +
n] *
b[6 *
N +
n] +
a[2 *
N +
n] *
b[12 *
N +
n] +
264 a[3 *
N +
n] *
b[18 *
N +
n] +
a[4 *
N +
n] *
b[24 *
N +
n] +
a[5 *
N +
n] *
b[30 *
N +
n];
265 c[1 *
N +
n] =
a[0 *
N +
n] *
b[1 *
N +
n] +
a[1 *
N +
n] *
b[7 *
N +
n] +
a[2 *
N +
n] *
b[13 *
N +
n] +
266 a[3 *
N +
n] *
b[19 *
N +
n] +
a[4 *
N +
n] *
b[25 *
N +
n] +
a[5 *
N +
n] *
b[31 *
N +
n];
267 c[2 *
N +
n] =
a[0 *
N +
n] *
b[2 *
N +
n] +
a[1 *
N +
n] *
b[8 *
N +
n] +
a[2 *
N +
n] *
b[14 *
N +
n] +
268 a[3 *
N +
n] *
b[20 *
N +
n] +
a[4 *
N +
n] *
b[26 *
N +
n] +
a[5 *
N +
n] *
b[32 *
N +
n];
269 c[3 *
N +
n] =
a[0 *
N +
n] *
b[3 *
N +
n] +
a[1 *
N +
n] *
b[9 *
N +
n] +
a[2 *
N +
n] *
b[15 *
N +
n] +
270 a[3 *
N +
n] *
b[21 *
N +
n] +
a[4 *
N +
n] *
b[27 *
N +
n] +
a[5 *
N +
n] *
b[33 *
N +
n];
271 c[4 *
N +
n] =
a[0 *
N +
n] *
b[4 *
N +
n] +
a[1 *
N +
n] *
b[10 *
N +
n] +
a[2 *
N +
n] *
b[16 *
N +
n] +
272 a[3 *
N +
n] *
b[22 *
N +
n] +
a[4 *
N +
n] *
b[28 *
N +
n] +
a[5 *
N +
n] *
b[34 *
N +
n];
273 c[5 *
N +
n] =
a[0 *
N +
n] *
b[5 *
N +
n] +
a[1 *
N +
n] *
b[11 *
N +
n] +
a[2 *
N +
n] *
b[17 *
N +
n] +
274 a[3 *
N +
n] *
b[23 *
N +
n] +
a[4 *
N +
n] *
b[29 *
N +
n] +
a[5 *
N +
n] *
b[35 *
N +
n];
275 c[6 *
N +
n] =
a[6 *
N +
n] *
b[0 *
N +
n] +
a[7 *
N +
n] *
b[6 *
N +
n] +
a[8 *
N +
n] *
b[12 *
N +
n] +
276 a[9 *
N +
n] *
b[18 *
N +
n] +
a[10 *
N +
n] *
b[24 *
N +
n] +
a[11 *
N +
n] *
b[30 *
N +
n];
277 c[7 *
N +
n] =
a[6 *
N +
n] *
b[1 *
N +
n] +
a[7 *
N +
n] *
b[7 *
N +
n] +
a[8 *
N +
n] *
b[13 *
N +
n] +
278 a[9 *
N +
n] *
b[19 *
N +
n] +
a[10 *
N +
n] *
b[25 *
N +
n] +
a[11 *
N +
n] *
b[31 *
N +
n];
279 c[8 *
N +
n] =
a[6 *
N +
n] *
b[2 *
N +
n] +
a[7 *
N +
n] *
b[8 *
N +
n] +
a[8 *
N +
n] *
b[14 *
N +
n] +
280 a[9 *
N +
n] *
b[20 *
N +
n] +
a[10 *
N +
n] *
b[26 *
N +
n] +
a[11 *
N +
n] *
b[32 *
N +
n];
281 c[9 *
N +
n] =
a[6 *
N +
n] *
b[3 *
N +
n] +
a[7 *
N +
n] *
b[9 *
N +
n] +
a[8 *
N +
n] *
b[15 *
N +
n] +
282 a[9 *
N +
n] *
b[21 *
N +
n] +
a[10 *
N +
n] *
b[27 *
N +
n] +
a[11 *
N +
n] *
b[33 *
N +
n];
283 c[10 *
N +
n] =
a[6 *
N +
n] *
b[4 *
N +
n] +
a[7 *
N +
n] *
b[10 *
N +
n] +
a[8 *
N +
n] *
b[16 *
N +
n] +
284 a[9 *
N +
n] *
b[22 *
N +
n] +
a[10 *
N +
n] *
b[28 *
N +
n] +
a[11 *
N +
n] *
b[34 *
N +
n];
285 c[11 *
N +
n] =
a[6 *
N +
n] *
b[5 *
N +
n] +
a[7 *
N +
n] *
b[11 *
N +
n] +
a[8 *
N +
n] *
b[17 *
N +
n] +
286 a[9 *
N +
n] *
b[23 *
N +
n] +
a[10 *
N +
n] *
b[29 *
N +
n] +
a[11 *
N +
n] *
b[35 *
N +
n];
287 c[12 *
N +
n] =
a[12 *
N +
n] *
b[0 *
N +
n] +
a[13 *
N +
n] *
b[6 *
N +
n] +
a[14 *
N +
n] *
b[12 *
N +
n] +
288 a[15 *
N +
n] *
b[18 *
N +
n] +
a[16 *
N +
n] *
b[24 *
N +
n] +
a[17 *
N +
n] *
b[30 *
N +
n];
289 c[13 *
N +
n] =
a[12 *
N +
n] *
b[1 *
N +
n] +
a[13 *
N +
n] *
b[7 *
N +
n] +
a[14 *
N +
n] *
b[13 *
N +
n] +
290 a[15 *
N +
n] *
b[19 *
N +
n] +
a[16 *
N +
n] *
b[25 *
N +
n] +
a[17 *
N +
n] *
b[31 *
N +
n];
291 c[14 *
N +
n] =
a[12 *
N +
n] *
b[2 *
N +
n] +
a[13 *
N +
n] *
b[8 *
N +
n] +
a[14 *
N +
n] *
b[14 *
N +
n] +
292 a[15 *
N +
n] *
b[20 *
N +
n] +
a[16 *
N +
n] *
b[26 *
N +
n] +
a[17 *
N +
n] *
b[32 *
N +
n];
293 c[15 *
N +
n] =
a[12 *
N +
n] *
b[3 *
N +
n] +
a[13 *
N +
n] *
b[9 *
N +
n] +
a[14 *
N +
n] *
b[15 *
N +
n] +
294 a[15 *
N +
n] *
b[21 *
N +
n] +
a[16 *
N +
n] *
b[27 *
N +
n] +
a[17 *
N +
n] *
b[33 *
N +
n];
295 c[16 *
N +
n] =
a[12 *
N +
n] *
b[4 *
N +
n] +
a[13 *
N +
n] *
b[10 *
N +
n] +
a[14 *
N +
n] *
b[16 *
N +
n] +
296 a[15 *
N +
n] *
b[22 *
N +
n] +
a[16 *
N +
n] *
b[28 *
N +
n] +
a[17 *
N +
n] *
b[34 *
N +
n];
297 c[17 *
N +
n] =
a[12 *
N +
n] *
b[5 *
N +
n] +
a[13 *
N +
n] *
b[11 *
N +
n] +
a[14 *
N +
n] *
b[17 *
N +
n] +
298 a[15 *
N +
n] *
b[23 *
N +
n] +
a[16 *
N +
n] *
b[29 *
N +
n] +
a[17 *
N +
n] *
b[35 *
N +
n];
299 c[18 *
N +
n] =
a[18 *
N +
n] *
b[0 *
N +
n] +
a[19 *
N +
n] *
b[6 *
N +
n] +
a[20 *
N +
n] *
b[12 *
N +
n] +
300 a[21 *
N +
n] *
b[18 *
N +
n] +
a[22 *
N +
n] *
b[24 *
N +
n] +
a[23 *
N +
n] *
b[30 *
N +
n];
301 c[19 *
N +
n] =
a[18 *
N +
n] *
b[1 *
N +
n] +
a[19 *
N +
n] *
b[7 *
N +
n] +
a[20 *
N +
n] *
b[13 *
N +
n] +
302 a[21 *
N +
n] *
b[19 *
N +
n] +
a[22 *
N +
n] *
b[25 *
N +
n] +
a[23 *
N +
n] *
b[31 *
N +
n];
303 c[20 *
N +
n] =
a[18 *
N +
n] *
b[2 *
N +
n] +
a[19 *
N +
n] *
b[8 *
N +
n] +
a[20 *
N +
n] *
b[14 *
N +
n] +
304 a[21 *
N +
n] *
b[20 *
N +
n] +
a[22 *
N +
n] *
b[26 *
N +
n] +
a[23 *
N +
n] *
b[32 *
N +
n];
305 c[21 *
N +
n] =
a[18 *
N +
n] *
b[3 *
N +
n] +
a[19 *
N +
n] *
b[9 *
N +
n] +
a[20 *
N +
n] *
b[15 *
N +
n] +
306 a[21 *
N +
n] *
b[21 *
N +
n] +
a[22 *
N +
n] *
b[27 *
N +
n] +
a[23 *
N +
n] *
b[33 *
N +
n];
307 c[22 *
N +
n] =
a[18 *
N +
n] *
b[4 *
N +
n] +
a[19 *
N +
n] *
b[10 *
N +
n] +
a[20 *
N +
n] *
b[16 *
N +
n] +
308 a[21 *
N +
n] *
b[22 *
N +
n] +
a[22 *
N +
n] *
b[28 *
N +
n] +
a[23 *
N +
n] *
b[34 *
N +
n];
309 c[23 *
N +
n] =
a[18 *
N +
n] *
b[5 *
N +
n] +
a[19 *
N +
n] *
b[11 *
N +
n] +
a[20 *
N +
n] *
b[17 *
N +
n] +
310 a[21 *
N +
n] *
b[23 *
N +
n] +
a[22 *
N +
n] *
b[29 *
N +
n] +
a[23 *
N +
n] *
b[35 *
N +
n];
311 c[24 *
N +
n] =
a[24 *
N +
n] *
b[0 *
N +
n] +
a[25 *
N +
n] *
b[6 *
N +
n] +
a[26 *
N +
n] *
b[12 *
N +
n] +
312 a[27 *
N +
n] *
b[18 *
N +
n] +
a[28 *
N +
n] *
b[24 *
N +
n] +
a[29 *
N +
n] *
b[30 *
N +
n];
313 c[25 *
N +
n] =
a[24 *
N +
n] *
b[1 *
N +
n] +
a[25 *
N +
n] *
b[7 *
N +
n] +
a[26 *
N +
n] *
b[13 *
N +
n] +
314 a[27 *
N +
n] *
b[19 *
N +
n] +
a[28 *
N +
n] *
b[25 *
N +
n] +
a[29 *
N +
n] *
b[31 *
N +
n];
315 c[26 *
N +
n] =
a[24 *
N +
n] *
b[2 *
N +
n] +
a[25 *
N +
n] *
b[8 *
N +
n] +
a[26 *
N +
n] *
b[14 *
N +
n] +
316 a[27 *
N +
n] *
b[20 *
N +
n] +
a[28 *
N +
n] *
b[26 *
N +
n] +
a[29 *
N +
n] *
b[32 *
N +
n];
317 c[27 *
N +
n] =
a[24 *
N +
n] *
b[3 *
N +
n] +
a[25 *
N +
n] *
b[9 *
N +
n] +
a[26 *
N +
n] *
b[15 *
N +
n] +
318 a[27 *
N +
n] *
b[21 *
N +
n] +
a[28 *
N +
n] *
b[27 *
N +
n] +
a[29 *
N +
n] *
b[33 *
N +
n];
319 c[28 *
N +
n] =
a[24 *
N +
n] *
b[4 *
N +
n] +
a[25 *
N +
n] *
b[10 *
N +
n] +
a[26 *
N +
n] *
b[16 *
N +
n] +
320 a[27 *
N +
n] *
b[22 *
N +
n] +
a[28 *
N +
n] *
b[28 *
N +
n] +
a[29 *
N +
n] *
b[34 *
N +
n];
321 c[29 *
N +
n] =
a[24 *
N +
n] *
b[5 *
N +
n] +
a[25 *
N +
n] *
b[11 *
N +
n] +
a[26 *
N +
n] *
b[17 *
N +
n] +
322 a[27 *
N +
n] *
b[23 *
N +
n] +
a[28 *
N +
n] *
b[29 *
N +
n] +
a[29 *
N +
n] *
b[35 *
N +
n];
323 c[30 *
N +
n] =
a[30 *
N +
n] *
b[0 *
N +
n] +
a[31 *
N +
n] *
b[6 *
N +
n] +
a[32 *
N +
n] *
b[12 *
N +
n] +
324 a[33 *
N +
n] *
b[18 *
N +
n] +
a[34 *
N +
n] *
b[24 *
N +
n] +
a[35 *
N +
n] *
b[30 *
N +
n];
325 c[31 *
N +
n] =
a[30 *
N +
n] *
b[1 *
N +
n] +
a[31 *
N +
n] *
b[7 *
N +
n] +
a[32 *
N +
n] *
b[13 *
N +
n] +
326 a[33 *
N +
n] *
b[19 *
N +
n] +
a[34 *
N +
n] *
b[25 *
N +
n] +
a[35 *
N +
n] *
b[31 *
N +
n];
327 c[32 *
N +
n] =
a[30 *
N +
n] *
b[2 *
N +
n] +
a[31 *
N +
n] *
b[8 *
N +
n] +
a[32 *
N +
n] *
b[14 *
N +
n] +
328 a[33 *
N +
n] *
b[20 *
N +
n] +
a[34 *
N +
n] *
b[26 *
N +
n] +
a[35 *
N +
n] *
b[32 *
N +
n];
329 c[33 *
N +
n] =
a[30 *
N +
n] *
b[3 *
N +
n] +
a[31 *
N +
n] *
b[9 *
N +
n] +
a[32 *
N +
n] *
b[15 *
N +
n] +
330 a[33 *
N +
n] *
b[21 *
N +
n] +
a[34 *
N +
n] *
b[27 *
N +
n] +
a[35 *
N +
n] *
b[33 *
N +
n];
331 c[34 *
N +
n] =
a[30 *
N +
n] *
b[4 *
N +
n] +
a[31 *
N +
n] *
b[10 *
N +
n] +
a[32 *
N +
n] *
b[16 *
N +
n] +
332 a[33 *
N +
n] *
b[22 *
N +
n] +
a[34 *
N +
n] *
b[28 *
N +
n] +
a[35 *
N +
n] *
b[34 *
N +
n];
333 c[35 *
N +
n] =
a[30 *
N +
n] *
b[5 *
N +
n] +
a[31 *
N +
n] *
b[11 *
N +
n] +
a[32 *
N +
n] *
b[17 *
N +
n] +
334 a[33 *
N +
n] *
b[23 *
N +
n] +
a[34 *
N +
n] *
b[29 *
N +
n] +
a[35 *
N +
n] *
b[35 *
N +
n];
339 template <
typename T,
idx_t D,
idx_t N>
340 void multiply(
const MPlex<T, D, D, N>&
A,
const MPlex<T, D, D, N>&
B,
MPlex<T, D, D, N>&
C) {
342 printf(
"Multipl %d %d\n",
D,
N);
352 template <
typename T,
idx_t D,
idx_t N>
355 throw std::runtime_error(
"general cramer inversion not supported");
359 template <
typename T,
idx_t N>
370 const double det = (double)
a[0 *
N +
n] *
a[3 *
N +
n] - (
double)
a[2 *
N +
n] *
a[1 *
N +
n];
374 const TT
s = TT(1) / det;
375 const TT
tmp =
s *
a[3 *
N +
n];
378 a[3 *
N +
n] =
s *
a[0 *
N +
n];
384 template <
typename T,
idx_t N>
394 const TT c00 =
a[4 *
N +
n] *
a[8 *
N +
n] -
a[5 *
N +
n] *
a[7 *
N +
n];
395 const TT c01 =
a[5 *
N +
n] *
a[6 *
N +
n] -
a[3 *
N +
n] *
a[8 *
N +
n];
396 const TT c02 =
a[3 *
N +
n] *
a[7 *
N +
n] -
a[4 *
N +
n] *
a[6 *
N +
n];
397 const TT c10 =
a[7 *
N +
n] *
a[2 *
N +
n] -
a[8 *
N +
n] *
a[1 *
N +
n];
398 const TT c11 =
a[8 *
N +
n] *
a[0 *
N +
n] -
a[6 *
N +
n] *
a[2 *
N +
n];
399 const TT c12 =
a[6 *
N +
n] *
a[1 *
N +
n] -
a[7 *
N +
n] *
a[0 *
N +
n];
400 const TT c20 =
a[1 *
N +
n] *
a[5 *
N +
n] -
a[2 *
N +
n] *
a[4 *
N +
n];
401 const TT c21 =
a[2 *
N +
n] *
a[3 *
N +
n] -
a[0 *
N +
n] *
a[5 *
N +
n];
402 const TT c22 =
a[0 *
N +
n] *
a[4 *
N +
n] -
a[1 *
N +
n] *
a[3 *
N +
n];
405 const double det = (double)
a[0 *
N +
n] * c00 + (
double)
a[1 *
N +
n] * c01 + (double)
a[2 *
N +
n] * c02;
409 const TT
s = TT(1) / det;
410 a[0 *
N +
n] =
s * c00;
411 a[1 *
N +
n] =
s * c10;
412 a[2 *
N +
n] =
s * c20;
413 a[3 *
N +
n] =
s * c01;
414 a[4 *
N +
n] =
s * c11;
415 a[5 *
N +
n] =
s * c21;
416 a[6 *
N +
n] =
s * c02;
417 a[7 *
N +
n] =
s * c12;
418 a[8 *
N +
n] =
s * c22;
423 template <
typename T,
idx_t D,
idx_t N>
432 template <
typename T,
idx_t D,
idx_t N>
437 template <
typename T,
idx_t N>
450 TT l1 =
a[3 *
N +
n] * l0;
451 TT l2 =
a[4 *
N +
n] - l1 * l1;
453 TT l3 =
a[6 *
N +
n] * l0;
454 TT l4 = (
a[7 *
N +
n] - l1 * l3) * l2;
455 TT l5 =
a[8 *
N +
n] - (l3 * l3 + l4 * l4);
460 l3 = (l1 * l4 * l2 - l3) * l0 * l5;
464 a[0 *
N +
n] = l3 * l3 + l1 * l1 + l0 * l0;
465 a[1 *
N +
n] =
a[3 *
N +
n] = l3 * l4 + l1 * l2;
466 a[4 *
N +
n] = l4 * l4 + l2 * l2;
467 a[2 *
N +
n] =
a[6 *
N +
n] = l3 * l5;
468 a[5 *
N +
n] =
a[7 *
N +
n] = l4 * l5;
469 a[8 *
N +
n] = l5 * l5;
477 template <
typename T,
idx_t D,
idx_t N>
void copyIn(idx_t n, const T *arr)
void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
static void invert(MPlex< T, D, D, N > &A, double *determ=nullptr)
const T & constAt(idx_t n, idx_t i, idx_t j) const
static void multiply(const MPlex< T, 3, 3, N > &A, const MPlex< T, 3, 3, N > &B, MPlex< T, 3, 3, N > &C)
static void multiply(const MPlex< T, 6, 6, N > &A, const MPlex< T, 6, 6, N > &B, MPlex< T, 6, 6, N > &C)
Matriplex & operator=(const Matriplex &m)
void add(const Matriplex &v)
static void invert(MPlex< T, 2, 2, N > &A, double *determ=nullptr)
const T & operator()(idx_t n, idx_t i, idx_t j) const
float __attribute__((vector_size(8))) cms_float32x2_t
static void invert(MPlex< T, 3, 3, N > &A, double *determ=nullptr)
static void invert(MPlex< T, 3, 3, N > &A)
T & operator()(idx_t n, idx_t i, idx_t j)
void multiplyGeneral(const MPlex< T, D1, D2, N > &A, const MPlex< T, D2, D3, N > &B, MPlex< T, D1, D3, N > &C)
DecomposeProduct< arg, typename Div::arg > D
void invertCramer(MPlex< T, D, D, N > &A, double *determ=nullptr)
void copyIn(idx_t n, const Matriplex &m, idx_t in)
void slurpIn(const T *arr, int vi[N], const int N_proc=N)
void copy(idx_t n, idx_t in)
void invertCholesky(MPlex< T, D, D, N > &A)
void copyOut(idx_t n, T *arr) const
void copySlot(idx_t n, const Matriplex &m)
static void invert(MPlex< T, D, D, N > &A)
T operator[](idx_t xx) const
static void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
#define ASSUME_ALIGNED(a, b)
T & At(idx_t n, idx_t i, idx_t j)