1 #ifndef RecoTracker_MkFitCore_src_Matriplex_Matriplex_h 2 #define RecoTracker_MkFitCore_src_Matriplex_Matriplex_h 8 #ifdef MPLEX_VDT_USE_STD 26 #include "vdt/atan2.h" 34 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
53 idx_t plexSize()
const {
return N; }
56 for (
idx_t i = 0;
i < kTotSize; ++
i) {
62 for (
idx_t i = 0;
i < kTotSize; ++
i) {
63 fArray[
i] +=
v.fArray[
i];
68 for (
idx_t i = 0;
i < kTotSize; ++
i) {
74 for (
idx_t i = 0;
i < kTotSize; ++
i) {
75 fArray[
i] = -fArray[
i];
80 template <
typename TT>
82 for (
idx_t i = 0;
i < kTotSize; ++
i) {
83 if (
sign.fArray[
i] < 0)
84 fArray[
i] = -fArray[
i];
101 using QReduced = Matriplex<T, 1, 1, N>;
106 t[
n] = constAt(
n,
i,
j);
110 QReduced rij(
idx_t i,
idx_t j)
const {
return ReduceFixedIJ(
i,
j); }
111 QReduced operator()(
idx_t i,
idx_t j)
const {
return ReduceFixedIJ(
i,
j); }
117 QAssigner(
Matriplex&
m,
int i,
int j) : m_matriplex(
m), m_i(
i), m_j(
j) {}
120 m_matriplex(
n, m_i, m_j) = qvec[
n];
126 m_matriplex(
n, m_i, m_j) = qscalar;
132 QAssigner AssignFixedIJ(
idx_t i,
idx_t j) {
return QAssigner(*
this,
i,
j); }
138 for (
idx_t i = 0;
i < kTotSize; ++
i)
144 for (
idx_t i = 0;
i < kTotSize; ++
i)
150 for (
idx_t i = 0;
i < kTotSize; ++
i)
156 for (
idx_t i = 0;
i < kTotSize; ++
i)
162 for (
idx_t i = 0;
i < kTotSize; ++
i)
168 for (
idx_t i = 0;
i < kTotSize; ++
i)
169 fArray[
i] +=
a.fArray[
i];
174 for (
idx_t i = 0;
i < kTotSize; ++
i)
175 fArray[
i] -=
a.fArray[
i];
180 for (
idx_t i = 0;
i < kTotSize; ++
i)
181 fArray[
i] *=
a.fArray[
i];
186 for (
idx_t i = 0;
i < kTotSize; ++
i)
187 fArray[
i] /=
a.fArray[
i];
193 for (
idx_t i = 0;
i < kTotSize; ++
i)
194 t.fArray[
i] = -fArray[
i];
199 for (
idx_t i = 0;
i < kTotSize; ++
i)
204 for (
idx_t i = 0;
i < kTotSize; ++
i)
210 for (
idx_t i = 0;
i < kTotSize; ++
i)
211 fArray[
i] =
a.fArray[
i] *
a.fArray[
i];
215 for (
idx_t i = 0;
i < kTotSize; ++
i)
216 fArray[
i] = fArray[
i] * fArray[
i];
224 for (
idx_t i = 0;
i < kTotSize; ++
i)
229 for (
idx_t i = 0;
i < kTotSize; ++
i)
235 for (
idx_t i = 0;
i < kTotSize; ++
i) {
236 fArray[
i] =
a.fArray[
i] *
a.fArray[
i] +
b.fArray[
i] *
b.fArray[
i];
242 for (
idx_t i = 0;
i < kTotSize; ++
i)
247 for (
idx_t i = 0;
i < kTotSize; ++
i)
253 for (
idx_t i = 0;
i < kTotSize; ++
i)
258 for (
idx_t i = 0;
i < kTotSize; ++
i)
264 for (
idx_t i = 0;
i < kTotSize; ++
i)
269 for (
idx_t i = 0;
i < kTotSize; ++
i)
275 for (
idx_t i = 0;
i < kTotSize; ++
i)
285 #define ASS fArray[i] = 286 #define ARR fArray[i] 287 #define A_ARR a.fArray[i] 289 #ifdef MPLEX_VDT_USE_STD 290 #define VDT_INVOKE(_ass_, _func_, ...) \ 291 for (idx_t i = 0; i < kTotSize; ++i) \ 292 _ass_ std::_func_(__VA_ARGS__); 294 #define VDT_INVOKE(_ass_, _func_, ...) \ 295 for (idx_t i = 0; i < kTotSize; ++i) \ 296 if constexpr (std::is_same<T, float>()) \ 297 _ass_ vdt::fast_##_func_##f(__VA_ARGS__); \ 299 _ass_ vdt::fast_##_func_(__VA_ARGS__); 303 VDT_INVOKE(ASS, isqrt, A_ARR);
307 VDT_INVOKE(ASS, isqrt, ARR);
312 VDT_INVOKE(ASS,
sin, A_ARR);
316 VDT_INVOKE(ASS,
sin, ARR);
321 VDT_INVOKE(ASS,
cos, A_ARR);
325 VDT_INVOKE(ASS,
cos, ARR);
332 VDT_INVOKE(ASS,
tan, A_ARR);
336 VDT_INVOKE(ASS,
tan, ARR);
341 VDT_INVOKE(ASS,
atan2, y.fArray[
i],
x.fArray[
i]);
353 for (
idx_t i = 0;
i < kTotSize; ++
i)
361 fArray[
i] =
m.fArray[
i];
365 void copyIn(
idx_t n,
const T* arr) {
367 fArray[
i] = *(arr++);
379 fArray[
i] = fArray[
in];
383 #if defined(AVX512_INTRINSICS) 385 template <
typename U>
386 void slurpIn(
const T* arr, __m512i& vi,
const U&,
const int N_proc =
N) {
389 const __m512
src = {0};
390 const __mmask16
k = N_proc ==
N ? -1 : (1 << N_proc) - 1;
392 for (
int i = 0;
i < kSize; ++
i, ++arr) {
395 __m512 reg = _mm512_mask_i32gather_ps(
src,
k, vi, arr,
sizeof(
U));
396 _mm512_mask_store_ps(&fArray[
i *
N],
k, reg);
402 void ChewIn(
const char* arr,
int off,
int vi[
N],
const char*
tmp, __m512i&
ui) {
405 for (
int i = 0;
i <
N; ++
i) {
406 __m512 reg = _mm512_load_ps(arr + vi[
i]);
407 _mm512_store_ps((
void*)(
tmp + 64 *
i), reg);
410 for (
int i = 0;
i < kSize; ++
i) {
411 __m512 reg = _mm512_i32gather_ps(
ui,
tmp + off +
i *
sizeof(
T), 1);
412 _mm512_store_ps(&fArray[
i *
N], reg);
416 void Contaginate(
const char* arr,
int vi[
N],
const char*
tmp) {
419 for (
int i = 0;
i <
N; ++
i) {
420 __m512 reg = _mm512_load_ps(arr + vi[
i]);
421 _mm512_store_ps((
void*)(
tmp + 64 *
i), reg);
425 void Plexify(
const char*
tmp, __m512i&
ui) {
426 for (
int i = 0;
i < kSize; ++
i) {
427 __m512 reg = _mm512_i32gather_ps(
ui,
tmp +
i *
sizeof(
T), 1);
428 _mm512_store_ps(&fArray[
i *
N], reg);
432 #elif defined(AVX2_INTRINSICS) 434 template <
typename U>
435 void slurpIn(
const T* arr, __m256i& vi,
const U&,
const int N_proc =
N) {
439 const __m256
src = {0};
441 __m256i
k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
442 __m256i k_sel = _mm256_set1_epi32(N_proc);
443 __m256i k_master = _mm256_cmpgt_epi32(k_sel,
k);
446 for (
int i = 0;
i < kSize; ++
i, ++arr) {
447 __m256 reg = _mm256_mask_i32gather_ps(
src, (
float*)arr, vi, (__m256)
k,
sizeof(
U));
450 _mm256_maskstore_ps((
float*)&fArray[
i *
N],
k, reg);
456 void slurpIn(
const T* arr,
int vi[
N],
const int N_proc =
N) {
459 for (
int i = 0;
i < kSize; ++
i) {
460 for (
int j = 0;
j <
N; ++
j) {
461 fArray[
i *
N +
j] = *(arr +
i + vi[
j]);
465 for (
int i = 0;
i < kSize; ++
i) {
466 for (
int j = 0;
j < N_proc; ++
j) {
467 fArray[
i *
N +
j] = *(arr +
i + vi[
j]);
475 void copyOut(
idx_t n,
T* arr)
const {
477 *(arr++) = fArray[
i];
482 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
483 using MPlex = Matriplex<T, D1, D2, N>;
489 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
496 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
503 template <
typename T,
typename TT,
idx_t D1,
idx_t D2,
idx_t N>
504 MPlex<T, D1, D2, N> negate_if_ltz(
const MPlex<T, D1, D2, N>&
a,
const MPlex<TT, D1, D2, N>&
sign) {
506 t.negate_if_ltz(
sign);
510 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
511 MPlex<T, D1, D2, N> operator+(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
517 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
518 MPlex<T, D1, D2, N> operator-(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
524 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
525 MPlex<T, D1, D2, N> operator*(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
531 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
532 MPlex<T, D1, D2, N> operator/(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
538 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
545 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
552 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
559 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
566 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
573 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
580 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
587 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
594 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
600 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
609 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
615 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
616 MPlex<T, D1, D2, N> hypot(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
618 return t.hypot(
a,
b);
621 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
627 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
633 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
634 void sincos(
const MPlex<T, D1, D2, N>&
a,
MPlex<T, D1, D2, N>&
s,
MPlex<T, D1, D2, N>&
c) {
641 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
647 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
648 MPlex<T, D1, D2, N> atan2(
const MPlex<T, D1, D2, N>& y,
const MPlex<T, D1, D2, N>& x) {
650 return t.atan2(y,
x);
658 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
659 MPlex<T, D1, D2, N> fast_isqrt(
const MPlex<T, D1, D2, N>&
a) {
660 MPlex<T, D1, D2, N>
t;
661 return t.fast_isqrt(
a);
664 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
665 MPlex<T, D1, D2, N> fast_sin(
const MPlex<T, D1, D2, N>&
a) {
666 MPlex<T, D1, D2, N>
t;
667 return t.fast_sin(
a);
670 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
671 MPlex<T, D1, D2, N> fast_cos(
const MPlex<T, D1, D2, N>&
a) {
672 MPlex<T, D1, D2, N>
t;
673 return t.fast_cos(
a);
676 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
677 void fast_sincos(
const MPlex<T, D1, D2, N>&
a, MPlex<T, D1, D2, N>&
s, MPlex<T, D1, D2, N>&
c) {
681 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
682 MPlex<T, D1, D2, N>
fast_tan(
const MPlex<T, D1, D2, N>&
a) {
683 MPlex<T, D1, D2, N>
t;
684 return t.fast_tan(
a);
687 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
688 MPlex<T, D1, D2, N>
fast_atan2(
const MPlex<T, D1, D2, N>& y,
const MPlex<T, D1, D2, N>& x) {
689 MPlex<T, D1, D2, N>
t;
690 return t.fast_atan2(y, x);
695 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
696 void sincos4(
const MPlex<T, D1, D2, N>&
a,
MPlex<T, D1, D2, N>&
s,
MPlex<T, D1, D2, N>&
c) {
702 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
713 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
714 MPlex<T, D1, D2, N> min(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
722 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
723 MPlex<T, D1, D2, N> max(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
735 template <
typename T,
idx_t D1,
idx_t D2,
idx_t D3,
idx_t N>
736 void multiplyGeneral(
const MPlex<T, D1, D2, N>&
A,
const MPlex<T, D2, D3, N>&
B,
MPlex<T, D1, D3, N>&
C) {
743 C.fArray[ijo +
n] = 0;
752 C.fArray[ijo +
n] +=
A.fArray[iko +
n] *
B.fArray[kjo +
n];
761 template <
typename T,
idx_t D,
idx_t N>
763 static void multiply(
const MPlex<T, D, D, N>&
A,
const MPlex<T, D, D, N>&
B,
MPlex<T, D, D, N>&
C) {
764 throw std::runtime_error(
"general multiplication not supported, well, call multiplyGeneral()");
768 template <
typename T,
idx_t N>
770 static void multiply(
const MPlex<T, 3, 3, N>&
A,
const MPlex<T, 3, 3, N>&
B,
MPlex<T, 3, 3, N>&
C) {
771 const T*
a =
A.fArray;
773 const T*
b =
B.fArray;
780 c[0 *
N +
n] =
a[0 *
N +
n] *
b[0 *
N +
n] +
a[1 *
N +
n] *
b[3 *
N +
n] +
a[2 *
N +
n] *
b[6 *
N +
n];
781 c[1 *
N +
n] =
a[0 *
N +
n] *
b[1 *
N +
n] +
a[1 *
N +
n] *
b[4 *
N +
n] +
a[2 *
N +
n] *
b[7 *
N +
n];
782 c[2 *
N +
n] =
a[0 *
N +
n] *
b[2 *
N +
n] +
a[1 *
N +
n] *
b[5 *
N +
n] +
a[2 *
N +
n] *
b[8 *
N +
n];
783 c[3 *
N +
n] =
a[3 *
N +
n] *
b[0 *
N +
n] +
a[4 *
N +
n] *
b[3 *
N +
n] +
a[5 *
N +
n] *
b[6 *
N +
n];
784 c[4 *
N +
n] =
a[3 *
N +
n] *
b[1 *
N +
n] +
a[4 *
N +
n] *
b[4 *
N +
n] +
a[5 *
N +
n] *
b[7 *
N +
n];
785 c[5 *
N +
n] =
a[3 *
N +
n] *
b[2 *
N +
n] +
a[4 *
N +
n] *
b[5 *
N +
n] +
a[5 *
N +
n] *
b[8 *
N +
n];
786 c[6 *
N +
n] =
a[6 *
N +
n] *
b[0 *
N +
n] +
a[7 *
N +
n] *
b[3 *
N +
n] +
a[8 *
N +
n] *
b[6 *
N +
n];
787 c[7 *
N +
n] =
a[6 *
N +
n] *
b[1 *
N +
n] +
a[7 *
N +
n] *
b[4 *
N +
n] +
a[8 *
N +
n] *
b[7 *
N +
n];
788 c[8 *
N +
n] =
a[6 *
N +
n] *
b[2 *
N +
n] +
a[7 *
N +
n] *
b[5 *
N +
n] +
a[8 *
N +
n] *
b[8 *
N +
n];
793 template <
typename T,
idx_t N>
795 static void multiply(
const MPlex<T, 6, 6, N>&
A,
const MPlex<T, 6, 6, N>&
B,
MPlex<T, 6, 6, N>&
C) {
796 const T*
a =
A.fArray;
798 const T*
b =
B.fArray;
804 c[0 *
N +
n] =
a[0 *
N +
n] *
b[0 *
N +
n] +
a[1 *
N +
n] *
b[6 *
N +
n] +
a[2 *
N +
n] *
b[12 *
N +
n] +
805 a[3 *
N +
n] *
b[18 *
N +
n] +
a[4 *
N +
n] *
b[24 *
N +
n] +
a[5 *
N +
n] *
b[30 *
N +
n];
806 c[1 *
N +
n] =
a[0 *
N +
n] *
b[1 *
N +
n] +
a[1 *
N +
n] *
b[7 *
N +
n] +
a[2 *
N +
n] *
b[13 *
N +
n] +
807 a[3 *
N +
n] *
b[19 *
N +
n] +
a[4 *
N +
n] *
b[25 *
N +
n] +
a[5 *
N +
n] *
b[31 *
N +
n];
808 c[2 *
N +
n] =
a[0 *
N +
n] *
b[2 *
N +
n] +
a[1 *
N +
n] *
b[8 *
N +
n] +
a[2 *
N +
n] *
b[14 *
N +
n] +
809 a[3 *
N +
n] *
b[20 *
N +
n] +
a[4 *
N +
n] *
b[26 *
N +
n] +
a[5 *
N +
n] *
b[32 *
N +
n];
810 c[3 *
N +
n] =
a[0 *
N +
n] *
b[3 *
N +
n] +
a[1 *
N +
n] *
b[9 *
N +
n] +
a[2 *
N +
n] *
b[15 *
N +
n] +
811 a[3 *
N +
n] *
b[21 *
N +
n] +
a[4 *
N +
n] *
b[27 *
N +
n] +
a[5 *
N +
n] *
b[33 *
N +
n];
812 c[4 *
N +
n] =
a[0 *
N +
n] *
b[4 *
N +
n] +
a[1 *
N +
n] *
b[10 *
N +
n] +
a[2 *
N +
n] *
b[16 *
N +
n] +
813 a[3 *
N +
n] *
b[22 *
N +
n] +
a[4 *
N +
n] *
b[28 *
N +
n] +
a[5 *
N +
n] *
b[34 *
N +
n];
814 c[5 *
N +
n] =
a[0 *
N +
n] *
b[5 *
N +
n] +
a[1 *
N +
n] *
b[11 *
N +
n] +
a[2 *
N +
n] *
b[17 *
N +
n] +
815 a[3 *
N +
n] *
b[23 *
N +
n] +
a[4 *
N +
n] *
b[29 *
N +
n] +
a[5 *
N +
n] *
b[35 *
N +
n];
816 c[6 *
N +
n] =
a[6 *
N +
n] *
b[0 *
N +
n] +
a[7 *
N +
n] *
b[6 *
N +
n] +
a[8 *
N +
n] *
b[12 *
N +
n] +
817 a[9 *
N +
n] *
b[18 *
N +
n] +
a[10 *
N +
n] *
b[24 *
N +
n] +
a[11 *
N +
n] *
b[30 *
N +
n];
818 c[7 *
N +
n] =
a[6 *
N +
n] *
b[1 *
N +
n] +
a[7 *
N +
n] *
b[7 *
N +
n] +
a[8 *
N +
n] *
b[13 *
N +
n] +
819 a[9 *
N +
n] *
b[19 *
N +
n] +
a[10 *
N +
n] *
b[25 *
N +
n] +
a[11 *
N +
n] *
b[31 *
N +
n];
820 c[8 *
N +
n] =
a[6 *
N +
n] *
b[2 *
N +
n] +
a[7 *
N +
n] *
b[8 *
N +
n] +
a[8 *
N +
n] *
b[14 *
N +
n] +
821 a[9 *
N +
n] *
b[20 *
N +
n] +
a[10 *
N +
n] *
b[26 *
N +
n] +
a[11 *
N +
n] *
b[32 *
N +
n];
822 c[9 *
N +
n] =
a[6 *
N +
n] *
b[3 *
N +
n] +
a[7 *
N +
n] *
b[9 *
N +
n] +
a[8 *
N +
n] *
b[15 *
N +
n] +
823 a[9 *
N +
n] *
b[21 *
N +
n] +
a[10 *
N +
n] *
b[27 *
N +
n] +
a[11 *
N +
n] *
b[33 *
N +
n];
824 c[10 *
N +
n] =
a[6 *
N +
n] *
b[4 *
N +
n] +
a[7 *
N +
n] *
b[10 *
N +
n] +
a[8 *
N +
n] *
b[16 *
N +
n] +
825 a[9 *
N +
n] *
b[22 *
N +
n] +
a[10 *
N +
n] *
b[28 *
N +
n] +
a[11 *
N +
n] *
b[34 *
N +
n];
826 c[11 *
N +
n] =
a[6 *
N +
n] *
b[5 *
N +
n] +
a[7 *
N +
n] *
b[11 *
N +
n] +
a[8 *
N +
n] *
b[17 *
N +
n] +
827 a[9 *
N +
n] *
b[23 *
N +
n] +
a[10 *
N +
n] *
b[29 *
N +
n] +
a[11 *
N +
n] *
b[35 *
N +
n];
828 c[12 *
N +
n] =
a[12 *
N +
n] *
b[0 *
N +
n] +
a[13 *
N +
n] *
b[6 *
N +
n] +
a[14 *
N +
n] *
b[12 *
N +
n] +
829 a[15 *
N +
n] *
b[18 *
N +
n] +
a[16 *
N +
n] *
b[24 *
N +
n] +
a[17 *
N +
n] *
b[30 *
N +
n];
830 c[13 *
N +
n] =
a[12 *
N +
n] *
b[1 *
N +
n] +
a[13 *
N +
n] *
b[7 *
N +
n] +
a[14 *
N +
n] *
b[13 *
N +
n] +
831 a[15 *
N +
n] *
b[19 *
N +
n] +
a[16 *
N +
n] *
b[25 *
N +
n] +
a[17 *
N +
n] *
b[31 *
N +
n];
832 c[14 *
N +
n] =
a[12 *
N +
n] *
b[2 *
N +
n] +
a[13 *
N +
n] *
b[8 *
N +
n] +
a[14 *
N +
n] *
b[14 *
N +
n] +
833 a[15 *
N +
n] *
b[20 *
N +
n] +
a[16 *
N +
n] *
b[26 *
N +
n] +
a[17 *
N +
n] *
b[32 *
N +
n];
834 c[15 *
N +
n] =
a[12 *
N +
n] *
b[3 *
N +
n] +
a[13 *
N +
n] *
b[9 *
N +
n] +
a[14 *
N +
n] *
b[15 *
N +
n] +
835 a[15 *
N +
n] *
b[21 *
N +
n] +
a[16 *
N +
n] *
b[27 *
N +
n] +
a[17 *
N +
n] *
b[33 *
N +
n];
836 c[16 *
N +
n] =
a[12 *
N +
n] *
b[4 *
N +
n] +
a[13 *
N +
n] *
b[10 *
N +
n] +
a[14 *
N +
n] *
b[16 *
N +
n] +
837 a[15 *
N +
n] *
b[22 *
N +
n] +
a[16 *
N +
n] *
b[28 *
N +
n] +
a[17 *
N +
n] *
b[34 *
N +
n];
838 c[17 *
N +
n] =
a[12 *
N +
n] *
b[5 *
N +
n] +
a[13 *
N +
n] *
b[11 *
N +
n] +
a[14 *
N +
n] *
b[17 *
N +
n] +
839 a[15 *
N +
n] *
b[23 *
N +
n] +
a[16 *
N +
n] *
b[29 *
N +
n] +
a[17 *
N +
n] *
b[35 *
N +
n];
840 c[18 *
N +
n] =
a[18 *
N +
n] *
b[0 *
N +
n] +
a[19 *
N +
n] *
b[6 *
N +
n] +
a[20 *
N +
n] *
b[12 *
N +
n] +
841 a[21 *
N +
n] *
b[18 *
N +
n] +
a[22 *
N +
n] *
b[24 *
N +
n] +
a[23 *
N +
n] *
b[30 *
N +
n];
842 c[19 *
N +
n] =
a[18 *
N +
n] *
b[1 *
N +
n] +
a[19 *
N +
n] *
b[7 *
N +
n] +
a[20 *
N +
n] *
b[13 *
N +
n] +
843 a[21 *
N +
n] *
b[19 *
N +
n] +
a[22 *
N +
n] *
b[25 *
N +
n] +
a[23 *
N +
n] *
b[31 *
N +
n];
844 c[20 *
N +
n] =
a[18 *
N +
n] *
b[2 *
N +
n] +
a[19 *
N +
n] *
b[8 *
N +
n] +
a[20 *
N +
n] *
b[14 *
N +
n] +
845 a[21 *
N +
n] *
b[20 *
N +
n] +
a[22 *
N +
n] *
b[26 *
N +
n] +
a[23 *
N +
n] *
b[32 *
N +
n];
846 c[21 *
N +
n] =
a[18 *
N +
n] *
b[3 *
N +
n] +
a[19 *
N +
n] *
b[9 *
N +
n] +
a[20 *
N +
n] *
b[15 *
N +
n] +
847 a[21 *
N +
n] *
b[21 *
N +
n] +
a[22 *
N +
n] *
b[27 *
N +
n] +
a[23 *
N +
n] *
b[33 *
N +
n];
848 c[22 *
N +
n] =
a[18 *
N +
n] *
b[4 *
N +
n] +
a[19 *
N +
n] *
b[10 *
N +
n] +
a[20 *
N +
n] *
b[16 *
N +
n] +
849 a[21 *
N +
n] *
b[22 *
N +
n] +
a[22 *
N +
n] *
b[28 *
N +
n] +
a[23 *
N +
n] *
b[34 *
N +
n];
850 c[23 *
N +
n] =
a[18 *
N +
n] *
b[5 *
N +
n] +
a[19 *
N +
n] *
b[11 *
N +
n] +
a[20 *
N +
n] *
b[17 *
N +
n] +
851 a[21 *
N +
n] *
b[23 *
N +
n] +
a[22 *
N +
n] *
b[29 *
N +
n] +
a[23 *
N +
n] *
b[35 *
N +
n];
852 c[24 *
N +
n] =
a[24 *
N +
n] *
b[0 *
N +
n] +
a[25 *
N +
n] *
b[6 *
N +
n] +
a[26 *
N +
n] *
b[12 *
N +
n] +
853 a[27 *
N +
n] *
b[18 *
N +
n] +
a[28 *
N +
n] *
b[24 *
N +
n] +
a[29 *
N +
n] *
b[30 *
N +
n];
854 c[25 *
N +
n] =
a[24 *
N +
n] *
b[1 *
N +
n] +
a[25 *
N +
n] *
b[7 *
N +
n] +
a[26 *
N +
n] *
b[13 *
N +
n] +
855 a[27 *
N +
n] *
b[19 *
N +
n] +
a[28 *
N +
n] *
b[25 *
N +
n] +
a[29 *
N +
n] *
b[31 *
N +
n];
856 c[26 *
N +
n] =
a[24 *
N +
n] *
b[2 *
N +
n] +
a[25 *
N +
n] *
b[8 *
N +
n] +
a[26 *
N +
n] *
b[14 *
N +
n] +
857 a[27 *
N +
n] *
b[20 *
N +
n] +
a[28 *
N +
n] *
b[26 *
N +
n] +
a[29 *
N +
n] *
b[32 *
N +
n];
858 c[27 *
N +
n] =
a[24 *
N +
n] *
b[3 *
N +
n] +
a[25 *
N +
n] *
b[9 *
N +
n] +
a[26 *
N +
n] *
b[15 *
N +
n] +
859 a[27 *
N +
n] *
b[21 *
N +
n] +
a[28 *
N +
n] *
b[27 *
N +
n] +
a[29 *
N +
n] *
b[33 *
N +
n];
860 c[28 *
N +
n] =
a[24 *
N +
n] *
b[4 *
N +
n] +
a[25 *
N +
n] *
b[10 *
N +
n] +
a[26 *
N +
n] *
b[16 *
N +
n] +
861 a[27 *
N +
n] *
b[22 *
N +
n] +
a[28 *
N +
n] *
b[28 *
N +
n] +
a[29 *
N +
n] *
b[34 *
N +
n];
862 c[29 *
N +
n] =
a[24 *
N +
n] *
b[5 *
N +
n] +
a[25 *
N +
n] *
b[11 *
N +
n] +
a[26 *
N +
n] *
b[17 *
N +
n] +
863 a[27 *
N +
n] *
b[23 *
N +
n] +
a[28 *
N +
n] *
b[29 *
N +
n] +
a[29 *
N +
n] *
b[35 *
N +
n];
864 c[30 *
N +
n] =
a[30 *
N +
n] *
b[0 *
N +
n] +
a[31 *
N +
n] *
b[6 *
N +
n] +
a[32 *
N +
n] *
b[12 *
N +
n] +
865 a[33 *
N +
n] *
b[18 *
N +
n] +
a[34 *
N +
n] *
b[24 *
N +
n] +
a[35 *
N +
n] *
b[30 *
N +
n];
866 c[31 *
N +
n] =
a[30 *
N +
n] *
b[1 *
N +
n] +
a[31 *
N +
n] *
b[7 *
N +
n] +
a[32 *
N +
n] *
b[13 *
N +
n] +
867 a[33 *
N +
n] *
b[19 *
N +
n] +
a[34 *
N +
n] *
b[25 *
N +
n] +
a[35 *
N +
n] *
b[31 *
N +
n];
868 c[32 *
N +
n] =
a[30 *
N +
n] *
b[2 *
N +
n] +
a[31 *
N +
n] *
b[8 *
N +
n] +
a[32 *
N +
n] *
b[14 *
N +
n] +
869 a[33 *
N +
n] *
b[20 *
N +
n] +
a[34 *
N +
n] *
b[26 *
N +
n] +
a[35 *
N +
n] *
b[32 *
N +
n];
870 c[33 *
N +
n] =
a[30 *
N +
n] *
b[3 *
N +
n] +
a[31 *
N +
n] *
b[9 *
N +
n] +
a[32 *
N +
n] *
b[15 *
N +
n] +
871 a[33 *
N +
n] *
b[21 *
N +
n] +
a[34 *
N +
n] *
b[27 *
N +
n] +
a[35 *
N +
n] *
b[33 *
N +
n];
872 c[34 *
N +
n] =
a[30 *
N +
n] *
b[4 *
N +
n] +
a[31 *
N +
n] *
b[10 *
N +
n] +
a[32 *
N +
n] *
b[16 *
N +
n] +
873 a[33 *
N +
n] *
b[22 *
N +
n] +
a[34 *
N +
n] *
b[28 *
N +
n] +
a[35 *
N +
n] *
b[34 *
N +
n];
874 c[35 *
N +
n] =
a[30 *
N +
n] *
b[5 *
N +
n] +
a[31 *
N +
n] *
b[11 *
N +
n] +
a[32 *
N +
n] *
b[17 *
N +
n] +
875 a[33 *
N +
n] *
b[23 *
N +
n] +
a[34 *
N +
n] *
b[29 *
N +
n] +
a[35 *
N +
n] *
b[35 *
N +
n];
880 template <
typename T,
idx_t D,
idx_t N>
881 void multiply(
const MPlex<T, D, D, N>&
A,
const MPlex<T, D, D, N>&
B,
MPlex<T, D, D, N>&
C) {
883 printf(
"Multipl %d %d\n",
D,
N);
893 template <
typename T,
idx_t D,
idx_t N>
896 throw std::runtime_error(
"general cramer inversion not supported");
900 template <
typename T,
idx_t N>
911 const double det = (double)
a[0 *
N +
n] *
a[3 *
N +
n] - (
double)
a[2 *
N +
n] *
a[1 *
N +
n];
915 const TT
s = TT(1) / det;
916 const TT
tmp =
s *
a[3 *
N +
n];
919 a[3 *
N +
n] =
s *
a[0 *
N +
n];
925 template <
typename T,
idx_t N>
935 const TT c00 =
a[4 *
N +
n] *
a[8 *
N +
n] -
a[5 *
N +
n] *
a[7 *
N +
n];
936 const TT c01 =
a[5 *
N +
n] *
a[6 *
N +
n] -
a[3 *
N +
n] *
a[8 *
N +
n];
937 const TT c02 =
a[3 *
N +
n] *
a[7 *
N +
n] -
a[4 *
N +
n] *
a[6 *
N +
n];
938 const TT c10 =
a[7 *
N +
n] *
a[2 *
N +
n] -
a[8 *
N +
n] *
a[1 *
N +
n];
939 const TT c11 =
a[8 *
N +
n] *
a[0 *
N +
n] -
a[6 *
N +
n] *
a[2 *
N +
n];
940 const TT c12 =
a[6 *
N +
n] *
a[1 *
N +
n] -
a[7 *
N +
n] *
a[0 *
N +
n];
941 const TT c20 =
a[1 *
N +
n] *
a[5 *
N +
n] -
a[2 *
N +
n] *
a[4 *
N +
n];
942 const TT c21 =
a[2 *
N +
n] *
a[3 *
N +
n] -
a[0 *
N +
n] *
a[5 *
N +
n];
943 const TT c22 =
a[0 *
N +
n] *
a[4 *
N +
n] -
a[1 *
N +
n] *
a[3 *
N +
n];
946 const double det = (double)
a[0 *
N +
n] * c00 + (
double)
a[1 *
N +
n] * c01 + (double)
a[2 *
N +
n] * c02;
950 const TT
s = TT(1) / det;
951 a[0 *
N +
n] =
s * c00;
952 a[1 *
N +
n] =
s * c10;
953 a[2 *
N +
n] =
s * c20;
954 a[3 *
N +
n] =
s * c01;
955 a[4 *
N +
n] =
s * c11;
956 a[5 *
N +
n] =
s * c21;
957 a[6 *
N +
n] =
s * c02;
958 a[7 *
N +
n] =
s * c12;
959 a[8 *
N +
n] =
s * c22;
964 template <
typename T,
idx_t D,
idx_t N>
973 template <
typename T,
idx_t D,
idx_t N>
978 template <
typename T,
idx_t N>
991 TT l1 =
a[3 *
N +
n] * l0;
992 TT l2 =
a[4 *
N +
n] - l1 * l1;
994 TT l3 =
a[6 *
N +
n] * l0;
995 TT l4 = (
a[7 *
N +
n] - l1 * l3) * l2;
996 TT l5 =
a[8 *
N +
n] - (l3 * l3 + l4 * l4);
1001 l3 = (l1 * l4 * l2 - l3) * l0 * l5;
1005 a[0 *
N +
n] = l3 * l3 + l1 * l1 + l0 * l0;
1006 a[1 *
N +
n] =
a[3 *
N +
n] = l3 * l4 + l1 * l2;
1007 a[4 *
N +
n] = l4 * l4 + l2 * l2;
1008 a[2 *
N +
n] =
a[6 *
N +
n] = l3 * l5;
1009 a[5 *
N +
n] =
a[7 *
N +
n] = l4 * l5;
1010 a[8 *
N +
n] = l5 * l5;
1018 template <
typename T,
idx_t D,
idx_t N>
Basic3DVector & operator*=(T t)
Scaling by a scalar value (multiplication)
void sincos4(const MPlex< T, D1, D2, N > &a, MPlex< T, D1, D2, N > &s, MPlex< T, D1, D2, N > &c)
void sincos4(const T x, T &sin, T &cos)
Basic3DVector & operator=(const Basic3DVector &)=default
Assignment operator.
MPlex< T, D1, D2, N > negate(const MPlex< T, D1, D2, N > &a)
MPlex< T, D1, D2, N > hypot(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Matriplex< T, D1, D2, N > MPlex
MPlex< T, D1, D2, N > operator+(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
MPlex< T, D1, D2, N > min(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
MPlex< T, D1, D2, N > sin(const MPlex< T, D1, D2, N > &a)
static void invert(MPlex< T, D, D, N > &A, double *determ=nullptr)
MPlex< T, D1, D2, N > operator-(const MPlex< T, D1, D2, N > &a)
MPlex< T, D1, D2, N > operator/(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Basic3DVector & operator-=(const Basic3DVector< U > &p)
MPlex< T, D1, D2, N > sqr(const MPlex< T, D1, D2, N > &a)
static void multiply(const MPlex< T, 3, 3, N > &A, const MPlex< T, 3, 3, N > &B, MPlex< T, 3, 3, N > &C)
static void multiply(const MPlex< T, 6, 6, N > &A, const MPlex< T, 6, 6, N > &B, MPlex< T, 6, 6, N > &C)
Container::value_type value_type
static void invert(MPlex< T, 2, 2, N > &A, double *determ=nullptr)
void sincos(const MPlex< T, D1, D2, N > &a, MPlex< T, D1, D2, N > &s, MPlex< T, D1, D2, N > &c)
MPF fast_tan(const MPF &a)
MPlex< T, D1, D2, N > abs(const MPlex< T, D1, D2, N > &a)
static void invert(MPlex< T, 3, 3, N > &A, double *determ=nullptr)
MPlex< T, D1, D2, N > max(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
void min_max(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b, MPlex< T, D1, D2, N > &min, MPlex< T, D1, D2, N > &max)
MPlex< T, D1, D2, N > negate_if_ltz(const MPlex< T, D1, D2, N > &a, const MPlex< TT, D1, D2, N > &sign)
static void invert(MPlex< T, 3, 3, N > &A)
void multiplyGeneral(const MPlex< T, D1, D2, N > &A, const MPlex< T, D2, D3, N > &B, MPlex< T, D1, D3, N > &C)
MPlex< T, D1, D2, N > operator*(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
DecomposeProduct< arg, typename Div::arg > D
void invertCramer(MPlex< T, D, D, N > &A, double *determ=nullptr)
Basic3DVector & operator/=(T t)
Scaling by a scalar value (division)
MPlex< T, D1, D2, N > tan(const MPlex< T, D1, D2, N > &a)
class __attribute__((aligned(32))) Matriplex
void add(std::map< std::string, TH1 *> &h, TH1 *hist)
void invertCholesky(MPlex< T, D, D, N > &A)
static void invert(MPlex< T, D, D, N > &A)
MPF fast_atan2(const MPF &y, const MPF &x)
static void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
void fast_sincos(const MPF &a, MPF &s, MPF &c)
T operator[](int i) const
#define ASSUME_ALIGNED(a, b)
Basic3DVector & operator+=(const Basic3DVector< U > &p)
MPlex< T, D1, D2, N > cos(const MPlex< T, D1, D2, N > &a)
MPlex< T, D1, D2, N > sqrt(const MPlex< T, D1, D2, N > &a)
MPlex< T, D1, D2, N > atan2(const MPlex< T, D1, D2, N > &y, const MPlex< T, D1, D2, N > &x)