1 #ifndef RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h 2 #define RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h 16 {0, 1, 3, 1, 2, 4, 3, 4, 5},
19 {0, 1, 3, 6, 10, 15, 1, 2, 4, 7, 11, 16, 3, 4, 5, 8, 12, 17,
20 6, 7, 8, 9, 13, 18, 10, 11, 12, 13, 14, 19, 15, 16, 17, 18, 19, 20}};
24 template <
typename T,
idx_t D,
idx_t N>
34 static constexpr
int kSize = (
D + 1) *
D / 2;
53 fArray[
i] +=
v.fArray[
i];
77 memcpy(fArray,
m.fArray,
sizeof(
T) *
kTotSize);
83 fArray[
i] =
m.fArray[
i];
101 fArray[
i] = fArray[
in];
105 #if defined(AVX512_INTRINSICS) 107 template <
typename U>
108 void slurpIn(
const T* arr, __m512i& vi,
const U&,
const int N_proc =
N) {
111 const __m512
src = {0};
112 const __mmask16
k = N_proc ==
N ? -1 : (1 << N_proc) - 1;
114 for (
int i = 0;
i <
kSize; ++
i, ++arr) {
117 __m512 reg = _mm512_mask_i32gather_ps(
src,
k, vi, arr,
sizeof(
U));
118 _mm512_mask_store_ps(&fArray[
i *
N],
k, reg);
125 void ChewIn(
const char* arr,
int off,
int vi[
N],
const char*
tmp, __m512i&
ui) {
128 for (
int i = 0;
i <
N; ++
i) {
129 __m512 reg = _mm512_load_ps(arr + vi[
i]);
130 _mm512_store_ps((
void*)(
tmp + 64 *
i), reg);
134 __m512 reg = _mm512_i32gather_ps(
ui,
tmp +
off +
i *
sizeof(
T), 1);
135 _mm512_store_ps(&fArray[
i *
N], reg);
139 void Contaginate(
const char* arr,
int vi[
N],
const char*
tmp) {
142 for (
int i = 0;
i <
N; ++
i) {
143 __m512 reg = _mm512_load_ps(arr + vi[
i]);
144 _mm512_store_ps((
void*)(
tmp + 64 *
i), reg);
148 void Plexify(
const char*
tmp, __m512i&
ui) {
150 __m512 reg = _mm512_i32gather_ps(
ui,
tmp +
i *
sizeof(
T), 1);
151 _mm512_store_ps(&fArray[
i *
N], reg);
155 #elif defined(AVX2_INTRINSICS) 157 template <
typename U>
158 void slurpIn(
const T* arr, __m256i& vi,
const U&,
const int N_proc =
N) {
159 const __m256
src = {0};
161 __m256i
k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
162 __m256i k_sel = _mm256_set1_epi32(N_proc);
163 __m256i k_master = _mm256_cmpgt_epi32(k_sel,
k);
166 for (
int i = 0;
i <
kSize; ++
i, ++arr) {
167 __m256 reg = _mm256_mask_i32gather_ps(
src, arr, vi, (__m256)
k,
sizeof(
U));
170 _mm256_maskstore_ps(&fArray[
i *
N],
k, reg);
176 void slurpIn(
const T* arr,
int vi[
N],
const int N_proc =
N) {
180 for (
int j = 0;
j <
N; ++
j) {
181 fArray[
i *
N +
j] = *(arr +
i + vi[
j]);
186 for (
int j = 0;
j < N_proc; ++
j) {
187 fArray[
i *
N +
j] = *(arr +
i + vi[
j]);
197 *(arr++) = fArray[
i];
217 fArray[
i] =
a.fArray[
i] -
b.fArray[
i];
247 const TT c00 =
a[2 *
N +
n] *
a[5 *
N +
n] -
a[4 *
N +
n] *
a[4 *
N +
n];
248 const TT c01 =
a[4 *
N +
n] *
a[3 *
N +
n] -
a[1 *
N +
n] *
a[5 *
N +
n];
249 const TT c02 =
a[1 *
N +
n] *
a[4 *
N +
n] -
a[2 *
N +
n] *
a[3 *
N +
n];
250 const TT c11 =
a[5 *
N +
n] *
a[0 *
N +
n] -
a[3 *
N +
n] *
a[3 *
N +
n];
251 const TT c12 =
a[3 *
N +
n] *
a[1 *
N +
n] -
a[4 *
N +
n] *
a[0 *
N +
n];
252 const TT c22 =
a[0 *
N +
n] *
a[2 *
N +
n] -
a[1 *
N +
n] *
a[1 *
N +
n];
255 const double det = (double)
a[0 *
N +
n] * c00 + (
double)
a[1 *
N +
n] * c01 + (double)
a[3 *
N +
n] * c02;
256 const TT
s = TT(1) / det;
258 a[0 *
N +
n] =
s * c00;
259 a[1 *
N +
n] =
s * c01;
260 a[2 *
N +
n] =
s * c11;
261 a[3 *
N +
n] =
s * c02;
262 a[4 *
N +
n] =
s * c12;
263 a[5 *
N +
n] =
s * c22;
268 template <
typename T,
idx_t D,
idx_t N>
275 template <
typename T,
idx_t D,
idx_t N>
278 throw std::runtime_error(
"general symmetric multiplication not supported");
282 template <
typename T,
idx_t N>
285 const T*
a =
A.fArray;
287 const T*
b =
B.fArray;
292 #ifdef MPLEX_INTRINSICS 294 for (
idx_t n = 0;
n <
N;
n += 64 /
sizeof(
T)) {
295 #include "intr_sym_3x3.ah" 302 #include "std_sym_3x3.ah" 309 template <
typename T,
idx_t N>
312 const T*
a =
A.fArray;
314 const T*
b =
B.fArray;
319 #ifdef MPLEX_INTRINSICS 321 for (
idx_t n = 0;
n <
N;
n += 64 /
sizeof(
T)) {
322 #include "intr_sym_6x6.ah" 329 #include "std_sym_6x6.ah" 336 template <
typename T,
idx_t D,
idx_t N>
345 template <
typename T,
idx_t D,
idx_t N>
348 throw std::runtime_error(
"general cramer inversion not supported");
352 template <
typename T,
idx_t N>
363 const double det = (double)
a[0 *
N +
n] *
a[2 *
N +
n] - (
double)
a[1 *
N +
n] *
a[1 *
N +
n];
367 const TT
s = TT(1) / det;
368 const TT
tmp =
s *
a[2 *
N +
n];
370 a[2 *
N +
n] =
s *
a[0 *
N +
n];
376 template <
typename T,
idx_t N>
386 const TT c00 =
a[2 *
N +
n] *
a[5 *
N +
n] -
a[4 *
N +
n] *
a[4 *
N +
n];
387 const TT c01 =
a[4 *
N +
n] *
a[3 *
N +
n] -
a[1 *
N +
n] *
a[5 *
N +
n];
388 const TT c02 =
a[1 *
N +
n] *
a[4 *
N +
n] -
a[2 *
N +
n] *
a[3 *
N +
n];
389 const TT c11 =
a[5 *
N +
n] *
a[0 *
N +
n] -
a[3 *
N +
n] *
a[3 *
N +
n];
390 const TT c12 =
a[3 *
N +
n] *
a[1 *
N +
n] -
a[4 *
N +
n] *
a[0 *
N +
n];
391 const TT c22 =
a[0 *
N +
n] *
a[2 *
N +
n] -
a[1 *
N +
n] *
a[1 *
N +
n];
394 const double det = (double)
a[0 *
N +
n] * c00 + (
double)
a[1 *
N +
n] * c01 + (double)
a[3 *
N +
n] * c02;
398 const TT
s = TT(1) / det;
399 a[0 *
N +
n] =
s * c00;
400 a[1 *
N +
n] =
s * c01;
401 a[2 *
N +
n] =
s * c11;
402 a[3 *
N +
n] =
s * c02;
403 a[4 *
N +
n] =
s * c12;
404 a[5 *
N +
n] =
s * c22;
409 template <
typename T,
idx_t D,
idx_t N>
418 template <
typename T,
idx_t D,
idx_t N>
423 template <
typename T,
idx_t N>
433 TT l1 =
a[1 *
N +
n] * l0;
434 TT l2 =
a[2 *
N +
n] - l1 * l1;
436 TT l3 =
a[3 *
N +
n] * l0;
437 TT l4 = (
a[4 *
N +
n] - l1 * l3) * l2;
438 TT l5 =
a[5 *
N +
n] - (l3 * l3 + l4 * l4);
443 l3 = (l1 * l4 * l2 - l3) * l0 * l5;
447 a[0 *
N +
n] = l3 * l3 + l1 * l1 + l0 * l0;
448 a[1 *
N +
n] = l3 * l4 + l1 * l2;
449 a[2 *
N +
n] = l4 * l4 + l2 * l2;
450 a[3 *
N +
n] = l3 * l5;
451 a[4 *
N +
n] = l4 * l5;
452 a[5 *
N +
n] = l5 * l5;
460 template <
typename T,
idx_t D,
idx_t N>
MatriplexSym & subtract(const MatriplexSym &a, const MatriplexSym &b)
static constexpr int kCols
no. of matrix columns
void setDiagonal3x3(idx_t n, T d)
void add(const MatriplexSym &v)
void copySlot(idx_t n, const MatriplexSym &m)
void multiply(const MPlex< T, D, D, N > &A, const MPlex< T, D, D, N > &B, MPlex< T, D, D, N > &C)
const idx_t * offsets() const
static constexpr int kTotSize
size of the whole matriplex
T & At(idx_t n, idx_t i, idx_t j)
T operator[](idx_t xx) const
const idx_t gSymOffsets[7][36]
void invertCholeskySym(MPlexSym< T, D, N > &A)
static void invert(MPlexSym< T, 2, N > &A, double *determ=nullptr)
static void multiply(const MPlexSym< T, 3, N > &A, const MPlexSym< T, 3, N > &B, MPlex< T, 3, 3, N > &C)
T & operator()(idx_t n, idx_t i, idx_t j)
void invertCramerSym(MPlexSym< T, D, N > &A, double *determ=nullptr)
static void invert(MPlexSym< T, D, N > &A, double *determ=nullptr)
const T & operator()(idx_t n, idx_t i, idx_t j) const
const T & constAt(idx_t n, idx_t i, idx_t j) const
static void invert(MPlexSym< T, 3, N > &A)
static void multiply(const MPlexSym< float, 6, N > &A, const MPlexSym< float, 6, N > &B, MPlex< float, 6, 6, N > &C)
void copyOut(idx_t n, T *arr) const
static void invert(MPlexSym< T, 3, N > &A, double *determ=nullptr)
void addNoiseIntoUpperLeft3x3(T noise)
void slurpIn(const T *arr, int vi[N], const int N_proc=N)
DecomposeProduct< arg, typename Div::arg > D
void copyIn(idx_t n, const MatriplexSym &m, idx_t in)
void copy(idx_t n, idx_t in)
T fArray [kTotSize] __attribute__((aligned(64)))
static void multiply(const MPlexSym< T, D, N > &A, const MPlexSym< T, D, N > &B, MPlex< T, D, D, N > &C)
void invertUpperLeft3x3()
static void invert(MPlexSym< T, D, N > &A)
void copyIn(idx_t n, const T *arr)
static constexpr int kSize
no of elements: lower triangle
#define ASSUME_ALIGNED(a, b)
MatriplexSym & operator=(const MatriplexSym &m)
static constexpr int kRows
no. of matrix rows