1 #ifndef RecoTracker_MkFitCore_src_Matriplex_Matriplex_h 2 #define RecoTracker_MkFitCore_src_Matriplex_Matriplex_h 10 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
29 idx_t plexSize()
const {
return N; }
32 for (
idx_t i = 0;
i < kTotSize; ++
i) {
38 for (
idx_t i = 0;
i < kTotSize; ++
i) {
39 fArray[
i] +=
v.fArray[
i];
44 for (
idx_t i = 0;
i < kTotSize; ++
i) {
91 fArray[
i] +=
a.fArray[
i];
97 fArray[
i] -=
a.fArray[
i];
102 for (
idx_t i = 0;
i < kTotSize; ++
i)
103 fArray[
i] *=
a.fArray[
i];
108 for (
idx_t i = 0;
i < kTotSize; ++
i)
109 fArray[
i] /=
a.fArray[
i];
115 for (
idx_t i = 0;
i < kTotSize; ++
i)
116 t.fArray[
i] = -fArray[
i];
121 for (
idx_t i = 0;
i < kTotSize; ++
i)
126 for (
idx_t i = 0;
i < kTotSize; ++
i)
132 for (
idx_t i = 0;
i < kTotSize; ++
i)
137 for (
idx_t i = 0;
i < kTotSize; ++
i)
143 for (
idx_t i = 0;
i < kTotSize; ++
i)
144 fArray[
i] =
a.fArray[
i] *
a.fArray[
i];
148 for (
idx_t i = 0;
i < kTotSize; ++
i)
149 fArray[
i] = fArray[
i] * fArray[
i];
154 for (
idx_t i = 0;
i < kTotSize; ++
i) {
155 fArray[
i] =
a.fArray[
i] *
a.fArray[
i] +
b.fArray[
i] *
b.fArray[
i];
161 for (
idx_t i = 0;
i < kTotSize; ++
i)
166 for (
idx_t i = 0;
i < kTotSize; ++
i)
172 for (
idx_t i = 0;
i < kTotSize; ++
i)
177 for (
idx_t i = 0;
i < kTotSize; ++
i)
183 for (
idx_t i = 0;
i < kTotSize; ++
i)
188 for (
idx_t i = 0;
i < kTotSize; ++
i)
197 fArray[
i] =
m.fArray[
i];
201 void copyIn(
idx_t n,
const T* arr) {
203 fArray[
i] = *(arr++);
215 fArray[
i] = fArray[
in];
219 #if defined(AVX512_INTRINSICS) 221 template <
typename U>
222 void slurpIn(
const T* arr, __m512i& vi,
const U&,
const int N_proc =
N) {
225 const __m512
src = {0};
226 const __mmask16
k = N_proc ==
N ? -1 : (1 << N_proc) - 1;
228 for (
int i = 0;
i < kSize; ++
i, ++arr) {
231 __m512 reg = _mm512_mask_i32gather_ps(
src,
k, vi, arr,
sizeof(
U));
232 _mm512_mask_store_ps(&fArray[
i *
N],
k, reg);
238 void ChewIn(
const char* arr,
int off,
int vi[
N],
const char*
tmp, __m512i&
ui) {
241 for (
int i = 0;
i <
N; ++
i) {
242 __m512 reg = _mm512_load_ps(arr + vi[
i]);
243 _mm512_store_ps((
void*)(
tmp + 64 *
i), reg);
246 for (
int i = 0;
i < kSize; ++
i) {
247 __m512 reg = _mm512_i32gather_ps(
ui,
tmp + off +
i *
sizeof(
T), 1);
248 _mm512_store_ps(&fArray[
i *
N], reg);
252 void Contaginate(
const char* arr,
int vi[
N],
const char*
tmp) {
255 for (
int i = 0;
i <
N; ++
i) {
256 __m512 reg = _mm512_load_ps(arr + vi[
i]);
257 _mm512_store_ps((
void*)(
tmp + 64 *
i), reg);
261 void Plexify(
const char*
tmp, __m512i&
ui) {
262 for (
int i = 0;
i < kSize; ++
i) {
263 __m512 reg = _mm512_i32gather_ps(
ui,
tmp +
i *
sizeof(
T), 1);
264 _mm512_store_ps(&fArray[
i *
N], reg);
268 #elif defined(AVX2_INTRINSICS) 270 template <
typename U>
271 void slurpIn(
const T* arr, __m256i& vi,
const U&,
const int N_proc =
N) {
275 const __m256
src = {0};
277 __m256i
k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
278 __m256i k_sel = _mm256_set1_epi32(N_proc);
279 __m256i k_master = _mm256_cmpgt_epi32(k_sel,
k);
282 for (
int i = 0;
i < kSize; ++
i, ++arr) {
283 __m256 reg = _mm256_mask_i32gather_ps(
src, (
float*)arr, vi, (__m256)
k,
sizeof(
U));
286 _mm256_maskstore_ps((
float*)&fArray[
i *
N],
k, reg);
292 void slurpIn(
const T* arr,
int vi[
N],
const int N_proc =
N) {
295 for (
int i = 0;
i < kSize; ++
i) {
296 for (
int j = 0;
j <
N; ++
j) {
297 fArray[
i *
N +
j] = *(arr +
i + vi[
j]);
301 for (
int i = 0;
i < kSize; ++
i) {
302 for (
int j = 0;
j < N_proc; ++
j) {
303 fArray[
i *
N +
j] = *(arr +
i + vi[
j]);
311 void copyOut(
idx_t n,
T* arr)
const {
313 *(arr++) = fArray[
i];
317 Matriplex<T, 1, 1, N> ReduceFixedIJ(
idx_t i,
idx_t j)
const {
318 Matriplex<T, 1, 1, N>
t;
320 t[
n] = constAt(
n,
i,
j);
326 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
327 using MPlex = Matriplex<T, D1, D2, N>;
333 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
334 MPlex<T, D1, D2, N> operator+(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
340 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
341 MPlex<T, D1, D2, N> operator-(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
347 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
348 MPlex<T, D1, D2, N> operator*(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
354 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
355 MPlex<T, D1, D2, N> operator/(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
361 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
368 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
375 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
382 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
389 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
396 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
403 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
410 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
417 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
423 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
429 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
435 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
436 MPlex<T, D1, D2, N> hypot(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
438 return t.hypot(
a,
b);
441 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
447 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
453 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
454 void sincos(
const MPlex<T, D1, D2, N>&
a,
MPlex<T, D1, D2, N>&
s,
MPlex<T, D1, D2, N>&
c) {
461 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
467 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
478 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
479 MPlex<T, D1, D2, N> min(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
487 template <
typename T,
idx_t D1,
idx_t D2,
idx_t N>
488 MPlex<T, D1, D2, N> max(
const MPlex<T, D1, D2, N>&
a,
const MPlex<T, D1, D2, N>&
b) {
500 template <
typename T,
idx_t D1,
idx_t D2,
idx_t D3,
idx_t N>
501 void multiplyGeneral(
const MPlex<T, D1, D2, N>&
A,
const MPlex<T, D2, D3, N>&
B,
MPlex<T, D1, D3, N>&
C) {
508 C.fArray[ijo +
n] = 0;
517 C.fArray[ijo +
n] +=
A.fArray[iko +
n] *
B.fArray[kjo +
n];
526 template <
typename T,
idx_t D,
idx_t N>
528 static void multiply(
const MPlex<T, D, D, N>&
A,
const MPlex<T, D, D, N>&
B,
MPlex<T, D, D, N>&
C) {
529 throw std::runtime_error(
"general multiplication not supported, well, call multiplyGeneral()");
533 template <
typename T,
idx_t N>
535 static void multiply(
const MPlex<T, 3, 3, N>&
A,
const MPlex<T, 3, 3, N>&
B,
MPlex<T, 3, 3, N>&
C) {
536 const T*
a =
A.fArray;
538 const T*
b =
B.fArray;
545 c[0 *
N +
n] =
a[0 *
N +
n] *
b[0 *
N +
n] +
a[1 *
N +
n] *
b[3 *
N +
n] +
a[2 *
N +
n] *
b[6 *
N +
n];
546 c[1 *
N +
n] =
a[0 *
N +
n] *
b[1 *
N +
n] +
a[1 *
N +
n] *
b[4 *
N +
n] +
a[2 *
N +
n] *
b[7 *
N +
n];
547 c[2 *
N +
n] =
a[0 *
N +
n] *
b[2 *
N +
n] +
a[1 *
N +
n] *
b[5 *
N +
n] +
a[2 *
N +
n] *
b[8 *
N +
n];
548 c[3 *
N +
n] =
a[3 *
N +
n] *
b[0 *
N +
n] +
a[4 *
N +
n] *
b[3 *
N +
n] +
a[5 *
N +
n] *
b[6 *
N +
n];
549 c[4 *
N +
n] =
a[3 *
N +
n] *
b[1 *
N +
n] +
a[4 *
N +
n] *
b[4 *
N +
n] +
a[5 *
N +
n] *
b[7 *
N +
n];
550 c[5 *
N +
n] =
a[3 *
N +
n] *
b[2 *
N +
n] +
a[4 *
N +
n] *
b[5 *
N +
n] +
a[5 *
N +
n] *
b[8 *
N +
n];
551 c[6 *
N +
n] =
a[6 *
N +
n] *
b[0 *
N +
n] +
a[7 *
N +
n] *
b[3 *
N +
n] +
a[8 *
N +
n] *
b[6 *
N +
n];
552 c[7 *
N +
n] =
a[6 *
N +
n] *
b[1 *
N +
n] +
a[7 *
N +
n] *
b[4 *
N +
n] +
a[8 *
N +
n] *
b[7 *
N +
n];
553 c[8 *
N +
n] =
a[6 *
N +
n] *
b[2 *
N +
n] +
a[7 *
N +
n] *
b[5 *
N +
n] +
a[8 *
N +
n] *
b[8 *
N +
n];
558 template <
typename T,
idx_t N>
560 static void multiply(
const MPlex<T, 6, 6, N>&
A,
const MPlex<T, 6, 6, N>&
B,
MPlex<T, 6, 6, N>&
C) {
561 const T*
a =
A.fArray;
563 const T*
b =
B.fArray;
569 c[0 *
N +
n] =
a[0 *
N +
n] *
b[0 *
N +
n] +
a[1 *
N +
n] *
b[6 *
N +
n] +
a[2 *
N +
n] *
b[12 *
N +
n] +
570 a[3 *
N +
n] *
b[18 *
N +
n] +
a[4 *
N +
n] *
b[24 *
N +
n] +
a[5 *
N +
n] *
b[30 *
N +
n];
571 c[1 *
N +
n] =
a[0 *
N +
n] *
b[1 *
N +
n] +
a[1 *
N +
n] *
b[7 *
N +
n] +
a[2 *
N +
n] *
b[13 *
N +
n] +
572 a[3 *
N +
n] *
b[19 *
N +
n] +
a[4 *
N +
n] *
b[25 *
N +
n] +
a[5 *
N +
n] *
b[31 *
N +
n];
573 c[2 *
N +
n] =
a[0 *
N +
n] *
b[2 *
N +
n] +
a[1 *
N +
n] *
b[8 *
N +
n] +
a[2 *
N +
n] *
b[14 *
N +
n] +
574 a[3 *
N +
n] *
b[20 *
N +
n] +
a[4 *
N +
n] *
b[26 *
N +
n] +
a[5 *
N +
n] *
b[32 *
N +
n];
575 c[3 *
N +
n] =
a[0 *
N +
n] *
b[3 *
N +
n] +
a[1 *
N +
n] *
b[9 *
N +
n] +
a[2 *
N +
n] *
b[15 *
N +
n] +
576 a[3 *
N +
n] *
b[21 *
N +
n] +
a[4 *
N +
n] *
b[27 *
N +
n] +
a[5 *
N +
n] *
b[33 *
N +
n];
577 c[4 *
N +
n] =
a[0 *
N +
n] *
b[4 *
N +
n] +
a[1 *
N +
n] *
b[10 *
N +
n] +
a[2 *
N +
n] *
b[16 *
N +
n] +
578 a[3 *
N +
n] *
b[22 *
N +
n] +
a[4 *
N +
n] *
b[28 *
N +
n] +
a[5 *
N +
n] *
b[34 *
N +
n];
579 c[5 *
N +
n] =
a[0 *
N +
n] *
b[5 *
N +
n] +
a[1 *
N +
n] *
b[11 *
N +
n] +
a[2 *
N +
n] *
b[17 *
N +
n] +
580 a[3 *
N +
n] *
b[23 *
N +
n] +
a[4 *
N +
n] *
b[29 *
N +
n] +
a[5 *
N +
n] *
b[35 *
N +
n];
581 c[6 *
N +
n] =
a[6 *
N +
n] *
b[0 *
N +
n] +
a[7 *
N +
n] *
b[6 *
N +
n] +
a[8 *
N +
n] *
b[12 *
N +
n] +
582 a[9 *
N +
n] *
b[18 *
N +
n] +
a[10 *
N +
n] *
b[24 *
N +
n] +
a[11 *
N +
n] *
b[30 *
N +
n];
583 c[7 *
N +
n] =
a[6 *
N +
n] *
b[1 *
N +
n] +
a[7 *
N +
n] *
b[7 *
N +
n] +
a[8 *
N +
n] *
b[13 *
N +
n] +
584 a[9 *
N +
n] *
b[19 *
N +
n] +
a[10 *
N +
n] *
b[25 *
N +
n] +
a[11 *
N +
n] *
b[31 *
N +
n];
585 c[8 *
N +
n] =
a[6 *
N +
n] *
b[2 *
N +
n] +
a[7 *
N +
n] *
b[8 *
N +
n] +
a[8 *
N +
n] *
b[14 *
N +
n] +
586 a[9 *
N +
n] *
b[20 *
N +
n] +
a[10 *
N +
n] *
b[26 *
N +
n] +
a[11 *
N +
n] *
b[32 *
N +
n];
587 c[9 *
N +
n] =
a[6 *
N +
n] *
b[3 *
N +
n] +
a[7 *
N +
n] *
b[9 *
N +
n] +
a[8 *
N +
n] *
b[15 *
N +
n] +
588 a[9 *
N +
n] *
b[21 *
N +
n] +
a[10 *
N +
n] *
b[27 *
N +
n] +
a[11 *
N +
n] *
b[33 *
N +
n];
589 c[10 *
N +
n] =
a[6 *
N +
n] *
b[4 *
N +
n] +
a[7 *
N +
n] *
b[10 *
N +
n] +
a[8 *
N +
n] *
b[16 *
N +
n] +
590 a[9 *
N +
n] *
b[22 *
N +
n] +
a[10 *
N +
n] *
b[28 *
N +
n] +
a[11 *
N +
n] *
b[34 *
N +
n];
591 c[11 *
N +
n] =
a[6 *
N +
n] *
b[5 *
N +
n] +
a[7 *
N +
n] *
b[11 *
N +
n] +
a[8 *
N +
n] *
b[17 *
N +
n] +
592 a[9 *
N +
n] *
b[23 *
N +
n] +
a[10 *
N +
n] *
b[29 *
N +
n] +
a[11 *
N +
n] *
b[35 *
N +
n];
593 c[12 *
N +
n] =
a[12 *
N +
n] *
b[0 *
N +
n] +
a[13 *
N +
n] *
b[6 *
N +
n] +
a[14 *
N +
n] *
b[12 *
N +
n] +
594 a[15 *
N +
n] *
b[18 *
N +
n] +
a[16 *
N +
n] *
b[24 *
N +
n] +
a[17 *
N +
n] *
b[30 *
N +
n];
595 c[13 *
N +
n] =
a[12 *
N +
n] *
b[1 *
N +
n] +
a[13 *
N +
n] *
b[7 *
N +
n] +
a[14 *
N +
n] *
b[13 *
N +
n] +
596 a[15 *
N +
n] *
b[19 *
N +
n] +
a[16 *
N +
n] *
b[25 *
N +
n] +
a[17 *
N +
n] *
b[31 *
N +
n];
597 c[14 *
N +
n] =
a[12 *
N +
n] *
b[2 *
N +
n] +
a[13 *
N +
n] *
b[8 *
N +
n] +
a[14 *
N +
n] *
b[14 *
N +
n] +
598 a[15 *
N +
n] *
b[20 *
N +
n] +
a[16 *
N +
n] *
b[26 *
N +
n] +
a[17 *
N +
n] *
b[32 *
N +
n];
599 c[15 *
N +
n] =
a[12 *
N +
n] *
b[3 *
N +
n] +
a[13 *
N +
n] *
b[9 *
N +
n] +
a[14 *
N +
n] *
b[15 *
N +
n] +
600 a[15 *
N +
n] *
b[21 *
N +
n] +
a[16 *
N +
n] *
b[27 *
N +
n] +
a[17 *
N +
n] *
b[33 *
N +
n];
601 c[16 *
N +
n] =
a[12 *
N +
n] *
b[4 *
N +
n] +
a[13 *
N +
n] *
b[10 *
N +
n] +
a[14 *
N +
n] *
b[16 *
N +
n] +
602 a[15 *
N +
n] *
b[22 *
N +
n] +
a[16 *
N +
n] *
b[28 *
N +
n] +
a[17 *
N +
n] *
b[34 *
N +
n];
603 c[17 *
N +
n] =
a[12 *
N +
n] *
b[5 *
N +
n] +
a[13 *
N +
n] *
b[11 *
N +
n] +
a[14 *
N +
n] *
b[17 *
N +
n] +
604 a[15 *
N +
n] *
b[23 *
N +
n] +
a[16 *
N +
n] *
b[29 *
N +
n] +
a[17 *
N +
n] *
b[35 *
N +
n];
605 c[18 *
N +
n] =
a[18 *
N +
n] *
b[0 *
N +
n] +
a[19 *
N +
n] *
b[6 *
N +
n] +
a[20 *
N +
n] *
b[12 *
N +
n] +
606 a[21 *
N +
n] *
b[18 *
N +
n] +
a[22 *
N +
n] *
b[24 *
N +
n] +
a[23 *
N +
n] *
b[30 *
N +
n];
607 c[19 *
N +
n] =
a[18 *
N +
n] *
b[1 *
N +
n] +
a[19 *
N +
n] *
b[7 *
N +
n] +
a[20 *
N +
n] *
b[13 *
N +
n] +
608 a[21 *
N +
n] *
b[19 *
N +
n] +
a[22 *
N +
n] *
b[25 *
N +
n] +
a[23 *
N +
n] *
b[31 *
N +
n];
609 c[20 *
N +
n] =
a[18 *
N +
n] *
b[2 *
N +
n] +
a[19 *
N +
n] *
b[8 *
N +
n] +
a[20 *
N +
n] *
b[14 *
N +
n] +
610 a[21 *
N +
n] *
b[20 *
N +
n] +
a[22 *
N +
n] *
b[26 *
N +
n] +
a[23 *
N +
n] *
b[32 *
N +
n];
611 c[21 *
N +
n] =
a[18 *
N +
n] *
b[3 *
N +
n] +
a[19 *
N +
n] *
b[9 *
N +
n] +
a[20 *
N +
n] *
b[15 *
N +
n] +
612 a[21 *
N +
n] *
b[21 *
N +
n] +
a[22 *
N +
n] *
b[27 *
N +
n] +
a[23 *
N +
n] *
b[33 *
N +
n];
613 c[22 *
N +
n] =
a[18 *
N +
n] *
b[4 *
N +
n] +
a[19 *
N +
n] *
b[10 *
N +
n] +
a[20 *
N +
n] *
b[16 *
N +
n] +
614 a[21 *
N +
n] *
b[22 *
N +
n] +
a[22 *
N +
n] *
b[28 *
N +
n] +
a[23 *
N +
n] *
b[34 *
N +
n];
615 c[23 *
N +
n] =
a[18 *
N +
n] *
b[5 *
N +
n] +
a[19 *
N +
n] *
b[11 *
N +
n] +
a[20 *
N +
n] *
b[17 *
N +
n] +
616 a[21 *
N +
n] *
b[23 *
N +
n] +
a[22 *
N +
n] *
b[29 *
N +
n] +
a[23 *
N +
n] *
b[35 *
N +
n];
617 c[24 *
N +
n] =
a[24 *
N +
n] *
b[0 *
N +
n] +
a[25 *
N +
n] *
b[6 *
N +
n] +
a[26 *
N +
n] *
b[12 *
N +
n] +
618 a[27 *
N +
n] *
b[18 *
N +
n] +
a[28 *
N +
n] *
b[24 *
N +
n] +
a[29 *
N +
n] *
b[30 *
N +
n];
619 c[25 *
N +
n] =
a[24 *
N +
n] *
b[1 *
N +
n] +
a[25 *
N +
n] *
b[7 *
N +
n] +
a[26 *
N +
n] *
b[13 *
N +
n] +
620 a[27 *
N +
n] *
b[19 *
N +
n] +
a[28 *
N +
n] *
b[25 *
N +
n] +
a[29 *
N +
n] *
b[31 *
N +
n];
621 c[26 *
N +
n] =
a[24 *
N +
n] *
b[2 *
N +
n] +
a[25 *
N +
n] *
b[8 *
N +
n] +
a[26 *
N +
n] *
b[14 *
N +
n] +
622 a[27 *
N +
n] *
b[20 *
N +
n] +
a[28 *
N +
n] *
b[26 *
N +
n] +
a[29 *
N +
n] *
b[32 *
N +
n];
623 c[27 *
N +
n] =
a[24 *
N +
n] *
b[3 *
N +
n] +
a[25 *
N +
n] *
b[9 *
N +
n] +
a[26 *
N +
n] *
b[15 *
N +
n] +
624 a[27 *
N +
n] *
b[21 *
N +
n] +
a[28 *
N +
n] *
b[27 *
N +
n] +
a[29 *
N +
n] *
b[33 *
N +
n];
625 c[28 *
N +
n] =
a[24 *
N +
n] *
b[4 *
N +
n] +
a[25 *
N +
n] *
b[10 *
N +
n] +
a[26 *
N +
n] *
b[16 *
N +
n] +
626 a[27 *
N +
n] *
b[22 *
N +
n] +
a[28 *
N +
n] *
b[28 *
N +
n] +
a[29 *
N +
n] *
b[34 *
N +
n];
627 c[29 *
N +
n] =
a[24 *
N +
n] *
b[5 *
N +
n] +
a[25 *
N +
n] *
b[11 *
N +
n] +
a[26 *
N +
n] *
b[17 *
N +
n] +
628 a[27 *
N +
n] *
b[23 *
N +
n] +
a[28 *
N +
n] *
b[29 *
N +
n] +
a[29 *
N +
n] *
b[35 *
N +
n];
629 c[30 *
N +
n] =
a[30 *
N +
n] *
b[0 *
N +
n] +
a[31 *
N +
n] *
b[6 *
N +
n] +
a[32 *
N +
n] *
b[12 *
N +
n] +
630 a[33 *
N +
n] *
b[18 *
N +
n] +
a[34 *
N +
n] *
b[24 *
N +
n] +
a[35 *
N +
n] *
b[30 *
N +
n];
631 c[31 *
N +
n] =
a[30 *
N +
n] *
b[1 *
N +
n] +
a[31 *
N +
n] *
b[7 *
N +
n] +
a[32 *
N +
n] *
b[13 *
N +
n] +
632 a[33 *
N +
n] *
b[19 *
N +
n] +
a[34 *
N +
n] *
b[25 *
N +
n] +
a[35 *
N +
n] *
b[31 *
N +
n];
633 c[32 *
N +
n] =
a[30 *
N +
n] *
b[2 *
N +
n] +
a[31 *
N +
n] *
b[8 *
N +
n] +
a[32 *
N +
n] *
b[14 *
N +
n] +
634 a[33 *
N +
n] *
b[20 *
N +
n] +
a[34 *
N +
n] *
b[26 *
N +
n] +
a[35 *
N +
n] *
b[32 *
N +
n];
635 c[33 *
N +
n] =
a[30 *
N +
n] *
b[3 *
N +
n] +
a[31 *
N +
n] *
b[9 *
N +
n] +
a[32 *
N +
n] *
b[15 *
N +
n] +
636 a[33 *
N +
n] *
b[21 *
N +
n] +
a[34 *
N +
n] *
b[27 *
N +
n] +
a[35 *
N +
n] *
b[33 *
N +
n];
637 c[34 *
N +
n] =
a[30 *
N +
n] *
b[4 *
N +
n] +
a[31 *
N +
n] *
b[10 *
N +
n] +
a[32 *
N +
n] *
b[16 *
N +
n] +
638 a[33 *
N +
n] *
b[22 *
N +
n] +
a[34 *
N +
n] *
b[28 *
N +
n] +
a[35 *
N +
n] *
b[34 *
N +
n];
639 c[35 *
N +
n] =
a[30 *
N +
n] *
b[5 *
N +
n] +
a[31 *
N +
n] *
b[11 *
N +
n] +
a[32 *
N +
n] *
b[17 *
N +
n] +
640 a[33 *
N +
n] *
b[23 *
N +
n] +
a[34 *
N +
n] *
b[29 *
N +
n] +
a[35 *
N +
n] *
b[35 *
N +
n];
645 template <
typename T,
idx_t D,
idx_t N>
646 void multiply(
const MPlex<T, D, D, N>&
A,
const MPlex<T, D, D, N>&
B,
MPlex<T, D, D, N>&
C) {
648 printf(
"Multipl %d %d\n",
D,
N);
658 template <
typename T,
idx_t D,
idx_t N>
661 throw std::runtime_error(
"general cramer inversion not supported");
665 template <
typename T,
idx_t N>
676 const double det = (double)
a[0 *
N +
n] *
a[3 *
N +
n] - (
double)
a[2 *
N +
n] *
a[1 *
N +
n];
680 const TT
s = TT(1) / det;
681 const TT
tmp =
s *
a[3 *
N +
n];
684 a[3 *
N +
n] =
s *
a[0 *
N +
n];
690 template <
typename T,
idx_t N>
700 const TT c00 =
a[4 *
N +
n] *
a[8 *
N +
n] -
a[5 *
N +
n] *
a[7 *
N +
n];
701 const TT c01 =
a[5 *
N +
n] *
a[6 *
N +
n] -
a[3 *
N +
n] *
a[8 *
N +
n];
702 const TT c02 =
a[3 *
N +
n] *
a[7 *
N +
n] -
a[4 *
N +
n] *
a[6 *
N +
n];
703 const TT c10 =
a[7 *
N +
n] *
a[2 *
N +
n] -
a[8 *
N +
n] *
a[1 *
N +
n];
704 const TT c11 =
a[8 *
N +
n] *
a[0 *
N +
n] -
a[6 *
N +
n] *
a[2 *
N +
n];
705 const TT c12 =
a[6 *
N +
n] *
a[1 *
N +
n] -
a[7 *
N +
n] *
a[0 *
N +
n];
706 const TT c20 =
a[1 *
N +
n] *
a[5 *
N +
n] -
a[2 *
N +
n] *
a[4 *
N +
n];
707 const TT c21 =
a[2 *
N +
n] *
a[3 *
N +
n] -
a[0 *
N +
n] *
a[5 *
N +
n];
708 const TT c22 =
a[0 *
N +
n] *
a[4 *
N +
n] -
a[1 *
N +
n] *
a[3 *
N +
n];
711 const double det = (double)
a[0 *
N +
n] * c00 + (
double)
a[1 *
N +
n] * c01 + (double)
a[2 *
N +
n] * c02;
715 const TT
s = TT(1) / det;
716 a[0 *
N +
n] =
s * c00;
717 a[1 *
N +
n] =
s * c10;
718 a[2 *
N +
n] =
s * c20;
719 a[3 *
N +
n] =
s * c01;
720 a[4 *
N +
n] =
s * c11;
721 a[5 *
N +
n] =
s * c21;
722 a[6 *
N +
n] =
s * c02;
723 a[7 *
N +
n] =
s * c12;
724 a[8 *
N +
n] =
s * c22;
729 template <
typename T,
idx_t D,
idx_t N>
738 template <
typename T,
idx_t D,
idx_t N>
743 template <
typename T,
idx_t N>
756 TT l1 =
a[3 *
N +
n] * l0;
757 TT l2 =
a[4 *
N +
n] - l1 * l1;
759 TT l3 =
a[6 *
N +
n] * l0;
760 TT l4 = (
a[7 *
N +
n] - l1 * l3) * l2;
761 TT l5 =
a[8 *
N +
n] - (l3 * l3 + l4 * l4);
766 l3 = (l1 * l4 * l2 - l3) * l0 * l5;
770 a[0 *
N +
n] = l3 * l3 + l1 * l1 + l0 * l0;
771 a[1 *
N +
n] =
a[3 *
N +
n] = l3 * l4 + l1 * l2;
772 a[4 *
N +
n] = l4 * l4 + l2 * l2;
773 a[2 *
N +
n] =
a[6 *
N +
n] = l3 * l5;
774 a[5 *
N +
n] =
a[7 *
N +
n] = l4 * l5;
775 a[8 *
N +
n] = l5 * l5;
783 template <
typename T,
idx_t D,
idx_t N>
Basic3DVector & operator*=(T t)
Scaling by a scalar value (multiplication)
Basic3DVector & operator=(const Basic3DVector &)=default
Assignment operator.
MPlex< T, D1, D2, N > hypot(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Matriplex< T, D1, D2, N > MPlex
MPlex< T, D1, D2, N > operator+(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
MPlex< T, D1, D2, N > min(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
MPlex< T, D1, D2, N > sin(const MPlex< T, D1, D2, N > &a)
static void invert(MPlex< T, D, D, N > &A, double *determ=nullptr)
MPlex< T, D1, D2, N > operator/(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
Basic3DVector & operator-=(const Basic3DVector< U > &p)
MPlex< T, D1, D2, N > sqr(const MPlex< T, D1, D2, N > &a)
static void multiply(const MPlex< T, 3, 3, N > &A, const MPlex< T, 3, 3, N > &B, MPlex< T, 3, 3, N > &C)
MPlex< T, D1, D2, N > operator-(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
static void multiply(const MPlex< T, 6, 6, N > &A, const MPlex< T, 6, 6, N > &B, MPlex< T, 6, 6, N > &C)
Container::value_type value_type
static void invert(MPlex< T, 2, 2, N > &A, double *determ=nullptr)
void sincos(const MPlex< T, D1, D2, N > &a, MPlex< T, D1, D2, N > &s, MPlex< T, D1, D2, N > &c)
MPlex< T, D1, D2, N > abs(const MPlex< T, D1, D2, N > &a)
static void invert(MPlex< T, 3, 3, N > &A, double *determ=nullptr)
MPlex< T, D1, D2, N > max(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
void min_max(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b, MPlex< T, D1, D2, N > &min, MPlex< T, D1, D2, N > &max)
static void invert(MPlex< T, 3, 3, N > &A)
void multiplyGeneral(const MPlex< T, D1, D2, N > &A, const MPlex< T, D2, D3, N > &B, MPlex< T, D1, D3, N > &C)
MPlex< T, D1, D2, N > operator*(const MPlex< T, D1, D2, N > &a, const MPlex< T, D1, D2, N > &b)
DecomposeProduct< arg, typename Div::arg > D
void invertCramer(MPlex< T, D, D, N > &A, double *determ=nullptr)
Basic3DVector & operator/=(T t)
Scaling by a scalar value (division)
MPlex< T, D1, D2, N > tan(const MPlex< T, D1, D2, N > &a)
class __attribute__((aligned(32))) Matriplex
void add(std::map< std::string, TH1 *> &h, TH1 *hist)
void invertCholesky(MPlex< T, D, D, N > &A)
static void invert(MPlex< T, D, D, N > &A)
static void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
T operator[](int i) const
#define ASSUME_ALIGNED(a, b)
Basic3DVector & operator+=(const Basic3DVector< U > &p)
MPlex< T, D1, D2, N > cos(const MPlex< T, D1, D2, N > &a)
MPlex< T, D1, D2, N > sqrt(const MPlex< T, D1, D2, N > &a)