1 #ifndef RecoTracker_MkFitCore_src_Matriplex_Matriplex_h 2 #define RecoTracker_MkFitCore_src_Matriplex_Matriplex_h 10 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
16 static constexpr
int kRows =
D1;
18 static constexpr
int kCols =
D2;
20 static constexpr
int kSize =
D1 *
D2;
22 static constexpr
int kTotSize =
N * kSize;
29 idx_t plexSize()
const {
return N; }
32 for (
idx_t i = 0;
i < kTotSize; ++
i) {
38 for (
idx_t i = 0;
i < kTotSize; ++
i) {
39 fArray[
i] +=
v.fArray[
i];
44 for (
idx_t i = 0;
i < kTotSize; ++
i) {
60 memcpy(fArray,
m.fArray,
sizeof(
T) * kTotSize);
96 fArray[
i] +=
a.fArray[
i];
101 for (
idx_t i = 0;
i < kTotSize; ++
i)
102 fArray[
i] -=
a.fArray[
i];
107 for (
idx_t i = 0;
i < kTotSize; ++
i)
108 fArray[
i] *=
a.fArray[
i];
113 for (
idx_t i = 0;
i < kTotSize; ++
i)
114 fArray[
i] /=
a.fArray[
i];
119 for (
idx_t i = 0;
i < kTotSize; ++
i)
124 for (
idx_t i = 0;
i < kTotSize; ++
i)
130 for (
idx_t i = 0;
i < kTotSize; ++
i)
131 fArray[
i] =
a.fArray[
i] *
a.fArray[
i];
135 for (
idx_t i = 0;
i < kTotSize; ++
i)
136 fArray[
i] = fArray[
i] * fArray[
i];
141 for (
idx_t i = 0;
i < kTotSize; ++
i) {
142 fArray[
i] =
a.fArray[
i] *
a.fArray[
i] +
b.fArray[
i] *
b.fArray[
i];
148 for (
idx_t i = 0;
i < kTotSize; ++
i)
153 for (
idx_t i = 0;
i < kTotSize; ++
i)
159 for (
idx_t i = 0;
i < kTotSize; ++
i)
164 for (
idx_t i = 0;
i < kTotSize; ++
i)
170 for (
idx_t i = 0;
i < kTotSize; ++
i)
175 for (
idx_t i = 0;
i < kTotSize; ++
i)
184 fArray[
i] =
m.fArray[
i];
188 void copyIn(
idx_t n,
const T* arr) {
190 fArray[
i] = *(arr++);
202 fArray[
i] = fArray[
in];
206 #if defined(AVX512_INTRINSICS) 208 template <
typename U>
209 void slurpIn(
const T* arr, __m512i& vi,
const U&,
const int N_proc =
N) {
212 const __m512
src = {0};
213 const __mmask16
k = N_proc ==
N ? -1 : (1 << N_proc) - 1;
215 for (
int i = 0;
i < kSize; ++
i, ++arr) {
218 __m512 reg = _mm512_mask_i32gather_ps(
src,
k, vi, arr,
sizeof(
U));
219 _mm512_mask_store_ps(&fArray[
i *
N],
k, reg);
225 void ChewIn(
const char* arr,
int off,
int vi[
N],
const char*
tmp, __m512i&
ui) {
228 for (
int i = 0;
i <
N; ++
i) {
229 __m512 reg = _mm512_load_ps(arr + vi[
i]);
230 _mm512_store_ps((
void*)(
tmp + 64 *
i), reg);
233 for (
int i = 0;
i < kSize; ++
i) {
234 __m512 reg = _mm512_i32gather_ps(
ui,
tmp + off +
i *
sizeof(
T), 1);
235 _mm512_store_ps(&fArray[
i *
N], reg);
239 void Contaginate(
const char* arr,
int vi[
N],
const char*
tmp) {
242 for (
int i = 0;
i <
N; ++
i) {
243 __m512 reg = _mm512_load_ps(arr + vi[
i]);
244 _mm512_store_ps((
void*)(
tmp + 64 *
i), reg);
248 void Plexify(
const char*
tmp, __m512i&
ui) {
249 for (
int i = 0;
i < kSize; ++
i) {
250 __m512 reg = _mm512_i32gather_ps(
ui,
tmp +
i *
sizeof(
T), 1);
251 _mm512_store_ps(&fArray[
i *
N], reg);
255 #elif defined(AVX2_INTRINSICS) 257 template <
typename U>
258 void slurpIn(
const T* arr, __m256i& vi,
const U&,
const int N_proc =
N) {
262 const __m256
src = {0};
264 __m256i
k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
265 __m256i k_sel = _mm256_set1_epi32(N_proc);
266 __m256i k_master = _mm256_cmpgt_epi32(k_sel,
k);
269 for (
int i = 0;
i < kSize; ++
i, ++arr) {
270 __m256 reg = _mm256_mask_i32gather_ps(
src, (
float*)arr, vi, (__m256)
k,
sizeof(
U));
273 _mm256_maskstore_ps((
float*)&fArray[
i *
N],
k, reg);
279 void slurpIn(
const T* arr,
int vi[
N],
const int N_proc =
N) {
282 for (
int i = 0;
i < kSize; ++
i) {
283 for (
int j = 0;
j <
N; ++
j) {
284 fArray[
i *
N +
j] = *(arr +
i + vi[
j]);
288 for (
int i = 0;
i < kSize; ++
i) {
289 for (
int j = 0;
j < N_proc; ++
j) {
290 fArray[
i *
N +
j] = *(arr +
i + vi[
j]);
298 void copyOut(
idx_t n,
T* arr)
const {
300 *(arr++) = fArray[
i];
304 Matriplex<T, 1, 1, N> ReduceFixedIJ(
idx_t i,
idx_t j)
const {
305 Matriplex<T, 1, 1, N>
t;
307 t[
n] = constAt(
n,
i,
j);
313 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
314 using MPlex = Matriplex<T, D1, D2, N>;
320 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
321 MPlex<T, D1, D2, N> operator+(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
327 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
328 MPlex<T, D1, D2, N> operator-(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
334 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
335 MPlex<T, D1, D2, N> operator*(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
341 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
342 MPlex<T, D1, D2, N> operator/(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
348 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
355 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
362 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
369 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
376 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
383 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
390 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
397 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
404 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
410 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
416 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
417 MPlex<T, D1, D2, N> hypot(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
419 return t.hypot(
a,
b);
422 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
428 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
434 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
435 void sincos(
const MPlex<T, D1, D2, N>&
a,
MPlex<T, D1, D2, N>&
s,
MPlex<T, D1, D2, N>&
c) {
442 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
448 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
459 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
460 MPlex<T, D1, D2, N> min(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
468 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
469 MPlex<T, D1, D2, N> max(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
481 template <
typename T,
idx_t D1,
idx_t D2,
idx_t D3,
idx_t N>
482 void multiplyGeneral(
const MPlex<T, D1, D2, N>&
A,
const MPlex<T, D2, D3, N>&
B,
MPlex<T, D1, D3, N>&
C) {
489 C.fArray[ijo +
n] = 0;
498 C.fArray[ijo +
n] +=
A.fArray[iko +
n] *
B.fArray[kjo +
n];
507 template <
typename T,
idx_t D,
idx_t N>
509 static void multiply(
const MPlex<T, D, D, N>&
A,
const MPlex<T, D, D, N>&
B,
MPlex<T, D, D, N>&
C) {
510 throw std::runtime_error(
"general multiplication not supported, well, call multiplyGeneral()");
514 template <
typename T,
idx_t N>
516 static void multiply(
const MPlex<T, 3, 3, N>&
A,
const MPlex<T, 3, 3, N>&
B,
MPlex<T, 3, 3, N>&
C) {
517 const T*
a =
A.fArray;
519 const T*
b =
B.fArray;
526 c[0 *
N +
n] =
a[0 *
N +
n] *
b[0 *
N +
n] +
a[1 *
N +
n] *
b[3 *
N +
n] +
a[2 *
N +
n] *
b[6 *
N +
n];
527 c[1 *
N +
n] =
a[0 *
N +
n] *
b[1 *
N +
n] +
a[1 *
N +
n] *
b[4 *
N +
n] +
a[2 *
N +
n] *
b[7 *
N +
n];
528 c[2 *
N +
n] =
a[0 *
N +
n] *
b[2 *
N +
n] +
a[1 *
N +
n] *
b[5 *
N +
n] +
a[2 *
N +
n] *
b[8 *
N +
n];
529 c[3 *
N +
n] =
a[3 *
N +
n] *
b[0 *
N +
n] +
a[4 *
N +
n] *
b[3 *
N +
n] +
a[5 *
N +
n] *
b[6 *
N +
n];
530 c[4 *
N +
n] =
a[3 *
N +
n] *
b[1 *
N +
n] +
a[4 *
N +
n] *
b[4 *
N +
n] +
a[5 *
N +
n] *
b[7 *
N +
n];
531 c[5 *
N +
n] =
a[3 *
N +
n] *
b[2 *
N +
n] +
a[4 *
N +
n] *
b[5 *
N +
n] +
a[5 *
N +
n] *
b[8 *
N +
n];
532 c[6 *
N +
n] =
a[6 *
N +
n] *
b[0 *
N +
n] +
a[7 *
N +
n] *
b[3 *
N +
n] +
a[8 *
N +
n] *
b[6 *
N +
n];
533 c[7 *
N +
n] =
a[6 *
N +
n] *
b[1 *
N +
n] +
a[7 *
N +
n] *
b[4 *
N +
n] +
a[8 *
N +
n] *
b[7 *
N +
n];
534 c[8 *
N +
n] =
a[6 *
N +
n] *
b[2 *
N +
n] +
a[7 *
N +
n] *
b[5 *
N +
n] +
a[8 *
N +
n] *
b[8 *
N +
n];
539 template <
typename T,
idx_t N>
541 static void multiply(
const MPlex<T, 6, 6, N>&
A,
const MPlex<T, 6, 6, N>&
B,
MPlex<T, 6, 6, N>&
C) {
542 const T*
a =
A.fArray;
544 const T*
b =
B.fArray;
550 c[0 *
N +
n] =
a[0 *
N +
n] *
b[0 *
N +
n] +
a[1 *
N +
n] *
b[6 *
N +
n] +
a[2 *
N +
n] *
b[12 *
N +
n] +
551 a[3 *
N +
n] *
b[18 *
N +
n] +
a[4 *
N +
n] *
b[24 *
N +
n] +
a[5 *
N +
n] *
b[30 *
N +
n];
552 c[1 *
N +
n] =
a[0 *
N +
n] *
b[1 *
N +
n] +
a[1 *
N +
n] *
b[7 *
N +
n] +
a[2 *
N +
n] *
b[13 *
N +
n] +
553 a[3 *
N +
n] *
b[19 *
N +
n] +
a[4 *
N +
n] *
b[25 *
N +
n] +
a[5 *
N +
n] *
b[31 *
N +
n];
554 c[2 *
N +
n] =
a[0 *
N +
n] *
b[2 *
N +
n] +
a[1 *
N +
n] *
b[8 *
N +
n] +
a[2 *
N +
n] *
b[14 *
N +
n] +
555 a[3 *
N +
n] *
b[20 *
N +
n] +
a[4 *
N +
n] *
b[26 *
N +
n] +
a[5 *
N +
n] *
b[32 *
N +
n];
556 c[3 *
N +
n] =
a[0 *
N +
n] *
b[3 *
N +
n] +
a[1 *
N +
n] *
b[9 *
N +
n] +
a[2 *
N +
n] *
b[15 *
N +
n] +
557 a[3 *
N +
n] *
b[21 *
N +
n] +
a[4 *
N +
n] *
b[27 *
N +
n] +
a[5 *
N +
n] *
b[33 *
N +
n];
558 c[4 *
N +
n] =
a[0 *
N +
n] *
b[4 *
N +
n] +
a[1 *
N +
n] *
b[10 *
N +
n] +
a[2 *
N +
n] *
b[16 *
N +
n] +
559 a[3 *
N +
n] *
b[22 *
N +
n] +
a[4 *
N +
n] *
b[28 *
N +
n] +
a[5 *
N +
n] *
b[34 *
N +
n];
560 c[5 *
N +
n] =
a[0 *
N +
n] *
b[5 *
N +
n] +
a[1 *
N +
n] *
b[11 *
N +
n] +
a[2 *
N +
n] *
b[17 *
N +
n] +
561 a[3 *
N +
n] *
b[23 *
N +
n] +
a[4 *
N +
n] *
b[29 *
N +
n] +
a[5 *
N +
n] *
b[35 *
N +
n];
562 c[6 *
N +
n] =
a[6 *
N +
n] *
b[0 *
N +
n] +
a[7 *
N +
n] *
b[6 *
N +
n] +
a[8 *
N +
n] *
b[12 *
N +
n] +
563 a[9 *
N +
n] *
b[18 *
N +
n] +
a[10 *
N +
n] *
b[24 *
N +
n] +
a[11 *
N +
n] *
b[30 *
N +
n];
564 c[7 *
N +
n] =
a[6 *
N +
n] *
b[1 *
N +
n] +
a[7 *
N +
n] *
b[7 *
N +
n] +
a[8 *
N +
n] *
b[13 *
N +
n] +
565 a[9 *
N +
n] *
b[19 *
N +
n] +
a[10 *
N +
n] *
b[25 *
N +
n] +
a[11 *
N +
n] *
b[31 *
N +
n];
566 c[8 *
N +
n] =
a[6 *
N +
n] *
b[2 *
N +
n] +
a[7 *
N +
n] *
b[8 *
N +
n] +
a[8 *
N +
n] *
b[14 *
N +
n] +
567 a[9 *
N +
n] *
b[20 *
N +
n] +
a[10 *
N +
n] *
b[26 *
N +
n] +
a[11 *
N +
n] *
b[32 *
N +
n];
568 c[9 *
N +
n] =
a[6 *
N +
n] *
b[3 *
N +
n] +
a[7 *
N +
n] *
b[9 *
N +
n] +
a[8 *
N +
n] *
b[15 *
N +
n] +
569 a[9 *
N +
n] *
b[21 *
N +
n] +
a[10 *
N +
n] *
b[27 *
N +
n] +
a[11 *
N +
n] *
b[33 *
N +
n];
570 c[10 *
N +
n] =
a[6 *
N +
n] *
b[4 *
N +
n] +
a[7 *
N +
n] *
b[10 *
N +
n] +
a[8 *
N +
n] *
b[16 *
N +
n] +
571 a[9 *
N +
n] *
b[22 *
N +
n] +
a[10 *
N +
n] *
b[28 *
N +
n] +
a[11 *
N +
n] *
b[34 *
N +
n];
572 c[11 *
N +
n] =
a[6 *
N +
n] *
b[5 *
N +
n] +
a[7 *
N +
n] *
b[11 *
N +
n] +
a[8 *
N +
n] *
b[17 *
N +
n] +
573 a[9 *
N +
n] *
b[23 *
N +
n] +
a[10 *
N +
n] *
b[29 *
N +
n] +
a[11 *
N +
n] *
b[35 *
N +
n];
574 c[12 *
N +
n] =
a[12 *
N +
n] *
b[0 *
N +
n] +
a[13 *
N +
n] *
b[6 *
N +
n] +
a[14 *
N +
n] *
b[12 *
N +
n] +
575 a[15 *
N +
n] *
b[18 *
N +
n] +
a[16 *
N +
n] *
b[24 *
N +
n] +
a[17 *
N +
n] *
b[30 *
N +
n];
576 c[13 *
N +
n] =
a[12 *
N +
n] *
b[1 *
N +
n] +
a[13 *
N +
n] *
b[7 *
N +
n] +
a[14 *
N +
n] *
b[13 *
N +
n] +
577 a[15 *
N +
n] *
b[19 *
N +
n] +
a[16 *
N +
n] *
b[25 *
N +
n] +
a[17 *
N +
n] *
b[31 *
N +
n];
578 c[14 *
N +
n] =
a[12 *
N +
n] *
b[2 *
N +
n] +
a[13 *
N +
n] *
b[8 *
N +
n] +
a[14 *
N +
n] *
b[14 *
N +
n] +
579 a[15 *
N +
n] *
b[20 *
N +
n] +
a[16 *
N +
n] *
b[26 *
N +
n] +
a[17 *
N +
n] *
b[32 *
N +
n];
580 c[15 *
N +
n] =
a[12 *
N +
n] *
b[3 *
N +
n] +
a[13 *
N +
n] *
b[9 *
N +
n] +
a[14 *
N +
n] *
b[15 *
N +
n] +
581 a[15 *
N +
n] *
b[21 *
N +
n] +
a[16 *
N +
n] *
b[27 *
N +
n] +
a[17 *
N +
n] *
b[33 *
N +
n];
582 c[16 *
N +
n] =
a[12 *
N +
n] *
b[4 *
N +
n] +
a[13 *
N +
n] *
b[10 *
N +
n] +
a[14 *
N +
n] *
b[16 *
N +
n] +
583 a[15 *
N +
n] *
b[22 *
N +
n] +
a[16 *
N +
n] *
b[28 *
N +
n] +
a[17 *
N +
n] *
b[34 *
N +
n];
584 c[17 *
N +
n] =
a[12 *
N +
n] *
b[5 *
N +
n] +
a[13 *
N +
n] *
b[11 *
N +
n] +
a[14 *
N +
n] *
b[17 *
N +
n] +
585 a[15 *
N +
n] *
b[23 *
N +
n] +
a[16 *
N +
n] *
b[29 *
N +
n] +
a[17 *
N +
n] *
b[35 *
N +
n];
586 c[18 *
N +
n] =
a[18 *
N +
n] *
b[0 *
N +
n] +
a[19 *
N +
n] *
b[6 *
N +
n] +
a[20 *
N +
n] *
b[12 *
N +
n] +
587 a[21 *
N +
n] *
b[18 *
N +
n] +
a[22 *
N +
n] *
b[24 *
N +
n] +
a[23 *
N +
n] *
b[30 *
N +
n];
588 c[19 *
N +
n] =
a[18 *
N +
n] *
b[1 *
N +
n] +
a[19 *
N +
n] *
b[7 *
N +
n] +
a[20 *
N +
n] *
b[13 *
N +
n] +
589 a[21 *
N +
n] *
b[19 *
N +
n] +
a[22 *
N +
n] *
b[25 *
N +
n] +
a[23 *
N +
n] *
b[31 *
N +
n];
590 c[20 *
N +
n] =
a[18 *
N +
n] *
b[2 *
N +
n] +
a[19 *
N +
n] *
b[8 *
N +
n] +
a[20 *
N +
n] *
b[14 *
N +
n] +
591 a[21 *
N +
n] *
b[20 *
N +
n] +
a[22 *
N +
n] *
b[26 *
N +
n] +
a[23 *
N +
n] *
b[32 *
N +
n];
592 c[21 *
N +
n] =
a[18 *
N +
n] *
b[3 *
N +
n] +
a[19 *
N +
n] *
b[9 *
N +
n] +
a[20 *
N +
n] *
b[15 *
N +
n] +
593 a[21 *
N +
n] *
b[21 *
N +
n] +
a[22 *
N +
n] *
b[27 *
N +
n] +
a[23 *
N +
n] *
b[33 *
N +
n];
594 c[22 *
N +
n] =
a[18 *
N +
n] *
b[4 *
N +
n] +
a[19 *
N +
n] *
b[10 *
N +
n] +
a[20 *
N +
n] *
b[16 *
N +
n] +
595 a[21 *
N +
n] *
b[22 *
N +
n] +
a[22 *
N +
n] *
b[28 *
N +
n] +
a[23 *
N +
n] *
b[34 *
N +
n];
596 c[23 *
N +
n] =
a[18 *
N +
n] *
b[5 *
N +
n] +
a[19 *
N +
n] *
b[11 *
N +
n] +
a[20 *
N +
n] *
b[17 *
N +
n] +
597 a[21 *
N +
n] *
b[23 *
N +
n] +
a[22 *
N +
n] *
b[29 *
N +
n] +
a[23 *
N +
n] *
b[35 *
N +
n];
598 c[24 *
N +
n] =
a[24 *
N +
n] *
b[0 *
N +
n] +
a[25 *
N +
n] *
b[6 *
N +
n] +
a[26 *
N +
n] *
b[12 *
N +
n] +
599 a[27 *
N +
n] *
b[18 *
N +
n] +
a[28 *
N +
n] *
b[24 *
N +
n] +
a[29 *
N +
n] *
b[30 *
N +
n];
600 c[25 *
N +
n] =
a[24 *
N +
n] *
b[1 *
N +
n] +
a[25 *
N +
n] *
b[7 *
N +
n] +
a[26 *
N +
n] *
b[13 *
N +
n] +
601 a[27 *
N +
n] *
b[19 *
N +
n] +
a[28 *
N +
n] *
b[25 *
N +
n] +
a[29 *
N +
n] *
b[31 *
N +
n];
602 c[26 *
N +
n] =
a[24 *
N +
n] *
b[2 *
N +
n] +
a[25 *
N +
n] *
b[8 *
N +
n] +
a[26 *
N +
n] *
b[14 *
N +
n] +
603 a[27 *
N +
n] *
b[20 *
N +
n] +
a[28 *
N +
n] *
b[26 *
N +
n] +
a[29 *
N +
n] *
b[32 *
N +
n];
604 c[27 *
N +
n] =
a[24 *
N +
n] *
b[3 *
N +
n] +
a[25 *
N +
n] *
b[9 *
N +
n] +
a[26 *
N +
n] *
b[15 *
N +
n] +
605 a[27 *
N +
n] *
b[21 *
N +
n] +
a[28 *
N +
n] *
b[27 *
N +
n] +
a[29 *
N +
n] *
b[33 *
N +
n];
606 c[28 *
N +
n] =
a[24 *
N +
n] *
b[4 *
N +
n] +
a[25 *
N +
n] *
b[10 *
N +
n] +
a[26 *
N +
n] *
b[16 *
N +
n] +
607 a[27 *
N +
n] *
b[22 *
N +
n] +
a[28 *
N +
n] *
b[28 *
N +
n] +
a[29 *
N +
n] *
b[34 *
N +
n];
608 c[29 *
N +
n] =
a[24 *
N +
n] *
b[5 *
N +
n] +
a[25 *
N +
n] *
b[11 *
N +
n] +
a[26 *
N +
n] *
b[17 *
N +
n] +
609 a[27 *
N +
n] *
b[23 *
N +
n] +
a[28 *
N +
n] *
b[29 *
N +
n] +
a[29 *
N +
n] *
b[35 *
N +
n];
610 c[30 *
N +
n] =
a[30 *
N +
n] *
b[0 *
N +
n] +
a[31 *
N +
n] *
b[6 *
N +
n] +
a[32 *
N +
n] *
b[12 *
N +
n] +
611 a[33 *
N +
n] *
b[18 *
N +
n] +
a[34 *
N +
n] *
b[24 *
N +
n] +
a[35 *
N +
n] *
b[30 *
N +
n];
612 c[31 *
N +
n] =
a[30 *
N +
n] *
b[1 *
N +
n] +
a[31 *
N +
n] *
b[7 *
N +
n] +
a[32 *
N +
n] *
b[13 *
N +
n] +
613 a[33 *
N +
n] *
b[19 *
N +
n] +
a[34 *
N +
n] *
b[25 *
N +
n] +
a[35 *
N +
n] *
b[31 *
N +
n];
614 c[32 *
N +
n] =
a[30 *
N +
n] *
b[2 *
N +
n] +
a[31 *
N +
n] *
b[8 *
N +
n] +
a[32 *
N +
n] *
b[14 *
N +
n] +
615 a[33 *
N +
n] *
b[20 *
N +
n] +
a[34 *
N +
n] *
b[26 *
N +
n] +
a[35 *
N +
n] *
b[32 *
N +
n];
616 c[33 *
N +
n] =
a[30 *
N +
n] *
b[3 *
N +
n] +
a[31 *
N +
n] *
b[9 *
N +
n] +
a[32 *
N +
n] *
b[15 *
N +
n] +
617 a[33 *
N +
n] *
b[21 *
N +
n] +
a[34 *
N +
n] *
b[27 *
N +
n] +
a[35 *
N +
n] *
b[33 *
N +
n];
618 c[34 *
N +
n] =
a[30 *
N +
n] *
b[4 *
N +
n] +
a[31 *
N +
n] *
b[10 *
N +
n] +
a[32 *
N +
n] *
b[16 *
N +
n] +
619 a[33 *
N +
n] *
b[22 *
N +
n] +
a[34 *
N +
n] *
b[28 *
N +
n] +
a[35 *
N +
n] *
b[34 *
N +
n];
620 c[35 *
N +
n] =
a[30 *
N +
n] *
b[5 *
N +
n] +
a[31 *
N +
n] *
b[11 *
N +
n] +
a[32 *
N +
n] *
b[17 *
N +
n] +
621 a[33 *
N +
n] *
b[23 *
N +
n] +
a[34 *
N +
n] *
b[29 *
N +
n] +
a[35 *
N +
n] *
b[35 *
N +
n];
626 template <
typename T,
idx_t D,
idx_t N>
627 void multiply(
const MPlex<T, D, D, N>&
A,
const MPlex<T, D, D, N>&
B,
MPlex<T, D, D, N>&
C) {
629 printf(
"Multipl %d %d\n",
D,
N);
639 template <
typename T,
idx_t D,
idx_t N>
642 throw std::runtime_error(
"general cramer inversion not supported");
646 template <
typename T,
idx_t N>
657 const double det = (double)
a[0 *
N +
n] *
a[3 *
N +
n] - (
double)
a[2 *
N +
n] *
a[1 *
N +
n];
661 const TT
s = TT(1) / det;
662 const TT
tmp =
s *
a[3 *
N +
n];
665 a[3 *
N +
n] =
s *
a[0 *
N +
n];
671 template <
typename T,
idx_t N>
681 const TT c00 =
a[4 *
N +
n] *
a[8 *
N +
n] -
a[5 *
N +
n] *
a[7 *
N +
n];
682 const TT c01 =
a[5 *
N +
n] *
a[6 *
N +
n] -
a[3 *
N +
n] *
a[8 *
N +
n];
683 const TT c02 =
a[3 *
N +
n] *
a[7 *
N +
n] -
a[4 *
N +
n] *
a[6 *
N +
n];
684 const TT c10 =
a[7 *
N +
n] *
a[2 *
N +
n] -
a[8 *
N +
n] *
a[1 *
N +
n];
685 const TT c11 =
a[8 *
N +
n] *
a[0 *
N +
n] -
a[6 *
N +
n] *
a[2 *
N +
n];
686 const TT c12 =
a[6 *
N +
n] *
a[1 *
N +
n] -
a[7 *
N +
n] *
a[0 *
N +
n];
687 const TT c20 =
a[1 *
N +
n] *
a[5 *
N +
n] -
a[2 *
N +
n] *
a[4 *
N +
n];
688 const TT c21 =
a[2 *
N +
n] *
a[3 *
N +
n] -
a[0 *
N +
n] *
a[5 *
N +
n];
689 const TT c22 =
a[0 *
N +
n] *
a[4 *
N +
n] -
a[1 *
N +
n] *
a[3 *
N +
n];
692 const double det = (double)
a[0 *
N +
n] * c00 + (
double)
a[1 *
N +
n] * c01 + (double)
a[2 *
N +
n] * c02;
696 const TT
s = TT(1) / det;
697 a[0 *
N +
n] =
s * c00;
698 a[1 *
N +
n] =
s * c10;
699 a[2 *
N +
n] =
s * c20;
700 a[3 *
N +
n] =
s * c01;
701 a[4 *
N +
n] =
s * c11;
702 a[5 *
N +
n] =
s * c21;
703 a[6 *
N +
n] =
s * c02;
704 a[7 *
N +
n] =
s * c12;
705 a[8 *
N +
n] =
s * c22;
710 template <
typename T,
idx_t D,
idx_t N>
719 template <
typename T,
idx_t D,
idx_t N>
724 template <
typename T,
idx_t N>
737 TT l1 =
a[3 *
N +
n] * l0;
738 TT l2 =
a[4 *
N +
n] - l1 * l1;
740 TT l3 =
a[6 *
N +
n] * l0;
741 TT l4 = (
a[7 *
N +
n] - l1 * l3) * l2;
742 TT l5 =
a[8 *
N +
n] - (l3 * l3 + l4 * l4);
747 l3 = (l1 * l4 * l2 - l3) * l0 * l5;
751 a[0 *
N +
n] = l3 * l3 + l1 * l1 + l0 * l0;
752 a[1 *
N +
n] =
a[3 *
N +
n] = l3 * l4 + l1 * l2;
753 a[4 *
N +
n] = l4 * l4 + l2 * l2;
754 a[2 *
N +
n] =
a[6 *
N +
n] = l3 * l5;
755 a[5 *
N +
n] =
a[7 *
N +
n] = l4 * l5;
756 a[8 *
N +
n] = l5 * l5;
764 template <
typename T,
idx_t D,
idx_t N>
Basic3DVector & operator*=(T t)
Scaling by a scalar value (multiplication)
MPlex< T, D1, D2, N > hypot(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Matriplex< T, D1, D2, N > MPlex
MPlex< T, D1, D2, N > operator+(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
MPlex< T, D1, D2, N > min(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
MPlex< T, D1, D2, N > sin(const MPlex< T, D1, D2, N > &a)
static void invert(MPlex< T, D, D, N > &A, double *determ=nullptr)
MPlex< T, D1, D2, N > operator/(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Basic3DVector & operator-=(const Basic3DVector< U > &p)
MPlex< T, D1, D2, N > sqr(const MPlex< T, D1, D2, N > &a)
static void multiply(const MPlex< T, 3, 3, N > &A, const MPlex< T, 3, 3, N > &B, MPlex< T, 3, 3, N > &C)
MPlex< T, D1, D2, N > operator-(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
static void multiply(const MPlex< T, 6, 6, N > &A, const MPlex< T, 6, 6, N > &B, MPlex< T, 6, 6, N > &C)
Container::value_type value_type
static void invert(MPlex< T, 2, 2, N > &A, double *determ=nullptr)
void sincos(const MPlex< T, D1, D2, N > &a, MPlex< T, D1, D2, N > &s, MPlex< T, D1, D2, N > &c)
static void invert(MPlex< T, 3, 3, N > &A, double *determ=nullptr)
MPlex< T, D1, D2, N > max(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
void min_max(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b, MPlex< T, D1, D2, N > &min, MPlex< T, D1, D2, N > &max)
static void invert(MPlex< T, 3, 3, N > &A)
void multiplyGeneral(const MPlex< T, D1, D2, N > &A, const MPlex< T, D2, D3, N > &B, MPlex< T, D1, D3, N > &C)
MPlex< T, D1, D2, N > operator*(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
DecomposeProduct< arg, typename Div::arg > D
void invertCramer(MPlex< T, D, D, N > &A, double *determ=nullptr)
Basic3DVector & operator/=(T t)
Scaling by a scalar value (division)
MPlex< T, D1, D2, N > tan(const MPlex< T, D1, D2, N > &a)
class __attribute__((aligned(32))) Matriplex
void add(std::map< std::string, TH1 *> &h, TH1 *hist)
void invertCholesky(MPlex< T, D, D, N > &A)
static void invert(MPlex< T, D, D, N > &A)
static void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
T operator[](int i) const
#define ASSUME_ALIGNED(a, b)
Basic3DVector & operator+=(const Basic3DVector< U > &p)
MPlex< T, D1, D2, N > cos(const MPlex< T, D1, D2, N > &a)
MPlex< T, D1, D2, N > sqrt(const MPlex< T, D1, D2, N > &a)