#ifndef RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h
#define RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h

#include "MatriplexCommon.h"
#include "Matriplex.h"

namespace Matriplex {
  const idx_t gSymOffsets[7][36] = {
      {}, {}, {0, 1, 1, 2},
      {0, 1, 3, 1, 2, 4, 3, 4, 5},
      {}, {},
      {0, 1, 3, 6, 10, 15, 1, 2, 4, 7, 11, 16, 3, 4, 5, 8, 12, 17,
       6, 7, 8, 9, 13, 18, 10, 11, 12, 13, 14, 19, 15, 16, 17, 18, 19, 20}};
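
  // Each row of gSymOffsets maps a full D x D (row-major) index onto the
  // packed lower-triangle storage; e.g. for D = 3, element (2, 1) maps to
  // gSymOffsets[3][2 * 3 + 1] = 4, the same packed slot as element (1, 2).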
  template <typename T, idx_t D, idx_t N>
  class MatriplexSym {
  public:
    static constexpr int kRows = D;                // no. of matrix rows
    static constexpr int kCols = D;                // no. of matrix columns
    static constexpr int kSize = (D + 1) * D / 2;  // no. of elements: lower triangle
    static constexpr int kTotSize = N * kSize;     // size of the whole matriplex

    T fArray[kTotSize] __attribute__((aligned(64)));
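
    // Illustration: MatriplexSym<float, 6, 16> packs kSize = 21 elements per
    // matrix and kTotSize = 336 floats in all, stored element-major, so
    // fArray[i * N + n] is packed element i of matrix n and the N copies of
    // each element are contiguous.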
    void add(const MatriplexSym& v) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] += v.fArray[i];
      }
    }
    MatriplexSym& operator=(const MatriplexSym& m) {
      memcpy(fArray, m.fArray, sizeof(T) * kTotSize);
      return *this;
    }
    void copySlot(idx_t n, const MatriplexSym& m) {
      for (idx_t i = n; i < kTotSize; i += N) {
        fArray[i] = m.fArray[i];
      }
    }
    void copy(idx_t n, idx_t in) {
      for (idx_t i = n; i < kTotSize; i += N, in += N) {
        fArray[i] = fArray[in];
      }
    }
#if defined(AVX512_INTRINSICS)
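
    // slurpIn gathers the kSize packed elements of up to N source matrices,
    // addressed by the per-lane byte offsets in vi, into matriplex layout
    // using one masked gather plus one masked store per element.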
    template <typename U>
    void slurpIn(const T* arr, __m512i& vi, const U&, const int N_proc = N) {
      const __m512 src = {0};
      const __mmask16 k = N_proc == N ? -1 : (1 << N_proc) - 1;

      for (int i = 0; i < kSize; ++i, ++arr) {
        __m512 reg = _mm512_mask_i32gather_ps(src, k, vi, arr, sizeof(U));
        _mm512_mask_store_ps(&fArray[i * N], k, reg);
      }
    }
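
    // With N_proc < N the tail lanes are left untouched: the mask
    // (1 << N_proc) - 1 enables only the first N_proc of the 16 lanes.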
    void ChewIn(const char* arr, int off, int vi[N], const char* tmp, __m512i& ui) {
      // This is a hack: assumes the input object size is 64 bytes,
      // i.e. one cache line and one AVX-512 vector.
      for (int i = 0; i < N; ++i) {
        __m512 reg = _mm512_load_ps(arr + vi[i]);
        _mm512_store_ps((void*)(tmp + 64 * i), reg);
      }
      for (int i = 0; i < kSize; ++i) {
        __m512 reg = _mm512_i32gather_ps(ui, tmp + off + i * sizeof(T), 1);
        _mm512_store_ps(&fArray[i * N], reg);
      }
    }
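
    // Contaginate and Plexify split ChewIn into its two phases, so the
    // staging copy and the final gather can be issued as separate steps.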
    void Contaginate(const char* arr, int vi[N], const char* tmp) {
      // Stage each 64-byte input object into the scratch buffer, one per lane.
      for (int i = 0; i < N; ++i) {
        __m512 reg = _mm512_load_ps(arr + vi[i]);
        _mm512_store_ps((void*)(tmp + 64 * i), reg);
      }
    }
    void Plexify(const char* tmp, __m512i& ui) {
      for (int i = 0; i < kSize; ++i) {
        __m512 reg = _mm512_i32gather_ps(ui, tmp + i * sizeof(T), 1);
        _mm512_store_ps(&fArray[i * N], reg);
      }
    }
#elif defined(AVX2_INTRINSICS)
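
    // AVX2 lacks AVX-512's mask registers, so the active-lane mask is built
    // as a full-width vector: lane j of k_master is all-ones iff j < N_proc.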
    template <typename U>
    void slurpIn(const T* arr, __m256i& vi, const U&, const int N_proc = N) {
      const __m256 src = {0};

      __m256i k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
      __m256i k_sel = _mm256_set1_epi32(N_proc);
      __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);

      k = k_master;
      for (int i = 0; i < kSize; ++i, ++arr) {
        __m256 reg = _mm256_mask_i32gather_ps(src, (const float*)arr, vi, (__m256)k, sizeof(U));
        // Restore the mask, which the gather may clobber.
        k = k_master;
        _mm256_maskstore_ps((float*)&fArray[i * N], k, reg);
      }
    }
#else

    void slurpIn(const T* arr, int vi[N], const int N_proc = N) {
      // Separate N_proc == N case: this gains about 7% in the fit test.
      if (N_proc == N) {
        for (int i = 0; i < kSize; ++i) {
          for (int j = 0; j < N; ++j) {
            fArray[i * N + j] = *(arr + i + vi[j]);
          }
        }
      } else {
        for (int i = 0; i < kSize; ++i) {
          for (int j = 0; j < N_proc; ++j) {
            fArray[i * N + j] = *(arr + i + vi[j]);
          }
        }
      }
    }

#endif
    void copyOut(idx_t n, T* arr) const {
      for (idx_t i = n; i < kTotSize; i += N)
        *(arr++) = fArray[i];
    }
    MatriplexSym& subtract(const MatriplexSym& a, const MatriplexSym& b) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] = a.fArray[i] - b.fArray[i];
      }
      return *this;
    }
    void invertUpperLeft3x3() {
      typedef T TT;

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        T* a = fArray;

        // cofactors of the upper-left 3x3 block
        const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
        const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
        const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
        const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
        const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
        const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];

        // force determinant calculation in double precision
        const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
        const TT s = TT(1) / det;

        a[0 * N + n] = s * c00;
        a[1 * N + n] = s * c01;
        a[2 * N + n] = s * c11;
        a[3 * N + n] = s * c02;
        a[4 * N + n] = s * c12;
        a[5 * N + n] = s * c22;
      }
    }
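
    // The inverse of a symmetric matrix is itself symmetric, so only the six
    // packed elements are recomputed: A^-1 = adj(A) / det, adj(A) symmetric.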
  };  // end class MatriplexSym

  template <typename T, idx_t D, idx_t N>
  using MPlexSym = MatriplexSym<T, D, N>;
  template <typename T, idx_t D, idx_t N>
  struct SymMultiplyCls {
    static void multiply(const MPlexSym<T, D, N>& A, const MPlexSym<T, D, N>& B, MPlex<T, D, D, N>& C) {
      throw std::runtime_error("general symmetric multiplication not supported");
    }
  };
  template <typename T, idx_t N>
  struct SymMultiplyCls<T, 3, N> {
    static void multiply(const MPlexSym<T, 3, N>& A, const MPlexSym<T, 3, N>& B, MPlex<T, 3, 3, N>& C) {
      const T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);
      const T* b = B.fArray;
      ASSUME_ALIGNED(b, 64);
      T* c = C.fArray;
      ASSUME_ALIGNED(c, 64);

#ifdef MPLEX_INTRINSICS

      for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
#include "intr_sym_3x3.ah"
      }

#else

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
#include "std_sym_3x3.ah"
      }

#endif
    }
  };
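
  // Usage sketch (hypothetical vector width): one symmetric 3x3 product
  // per lane.
  //   MPlexSym<float, 3, 16> A, B;
  //   MPlex<float, 3, 3, 16> C;
  //   SymMultiplyCls<float, 3, 16>::multiply(A, B, C);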
  template <typename T, idx_t N>
  struct SymMultiplyCls<T, 6, N> {
    static void multiply(const MPlexSym<float, 6, N>& A, const MPlexSym<float, 6, N>& B, MPlex<float, 6, 6, N>& C) {
      const T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);
      const T* b = B.fArray;
      ASSUME_ALIGNED(b, 64);
      float* c = C.fArray;
      ASSUME_ALIGNED(c, 64);

#ifdef MPLEX_INTRINSICS

      for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
#include "intr_sym_6x6.ah"
      }

#else

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
#include "std_sym_6x6.ah"
      }

#endif
    }
  };
  template <typename T, idx_t D, idx_t N>
  void multiply(const MPlexSym<T, D, N>& A, const MPlexSym<T, D, N>& B, MPlex<T, D, D, N>& C) {
    SymMultiplyCls<T, D, N>::multiply(A, B, C);
  }
  template <typename T, idx_t D, idx_t N>
  struct CramerInverterSym {
    static void invert(MPlexSym<T, D, N>& A, double* determ = nullptr) {
      throw std::runtime_error("general cramer inversion not supported");
    }
  };
  template <typename T, idx_t N>
  struct CramerInverterSym<T, 2, N> {
    static void invert(MPlexSym<T, 2, N>& A, double* determ = nullptr) {
      typedef T TT;

      T* a = A.fArray;

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        // force determinant calculation in double precision
        const double det = (double)a[0 * N + n] * a[2 * N + n] - (double)a[1 * N + n] * a[1 * N + n];
        if (determ)
          determ[n] = det;

        const TT s = TT(1) / det;
        const TT tmp = s * a[2 * N + n];
        a[1 * N + n] *= -s;
        a[2 * N + n] = s * a[0 * N + n];
        a[0 * N + n] = tmp;
      }
    }
  };
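
  // For the packed symmetric 2x2 [a0, a1; a1, a2] this is the closed form
  // [a2, -a1; -a1, a0] / det, det = a0 * a2 - a1 * a1, with the determinant
  // accumulated in double precision to limit cancellation.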
  template <typename T, idx_t N>
  struct CramerInverterSym<T, 3, N> {
    static void invert(MPlexSym<T, 3, N>& A, double* determ = nullptr) {
      typedef T TT;

      T* a = A.fArray;

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
        const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
        const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
        const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
        const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
        const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];

        // force determinant calculation in double precision
        const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
        if (determ)
          determ[n] = det;

        const TT s = TT(1) / det;
        a[0 * N + n] = s * c00;
        a[1 * N + n] = s * c01;
        a[2 * N + n] = s * c11;
        a[3 * N + n] = s * c02;
        a[4 * N + n] = s * c12;
        a[5 * N + n] = s * c22;
      }
    }
  };
  template <typename T, idx_t D, idx_t N>
  void invertCramerSym(MPlexSym<T, D, N>& A, double* determ = nullptr) {
    CramerInverterSym<T, D, N>::invert(A, determ);
  }
  template <typename T, idx_t D, idx_t N>
  struct CholeskyInverterSym {
    static void invert(MPlexSym<T, D, N>& A) { throw std::runtime_error("general cholesky inversion not supported"); }
  };
  template <typename T, idx_t N>
  struct CholeskyInverterSym<T, 3, N> {
    static void invert(MPlexSym<T, 3, N>& A) {
      typedef T TT;

      T* a = A.fArray;

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        // lower-triangular Cholesky factor, inverted on the fly
        TT l0 = std::sqrt(T(1) / a[0 * N + n]);
        TT l1 = a[1 * N + n] * l0;
        TT l2 = a[2 * N + n] - l1 * l1;
        l2 = std::sqrt(T(1) / l2);
        TT l3 = a[3 * N + n] * l0;
        TT l4 = (a[4 * N + n] - l1 * l3) * l2;
        TT l5 = a[5 * N + n] - (l3 * l3 + l4 * l4);
        l5 = std::sqrt(T(1) / l5);

        // decomposition done, invert the factor
        l3 = (l1 * l4 * l2 - l3) * l0 * l5;
        l1 = -l1 * l0 * l2;
        l4 = -l4 * l2 * l5;

        a[0 * N + n] = l3 * l3 + l1 * l1 + l0 * l0;
        a[1 * N + n] = l3 * l4 + l1 * l2;
        a[2 * N + n] = l4 * l4 + l2 * l2;
        a[3 * N + n] = l3 * l5;
        a[4 * N + n] = l4 * l5;
        a[5 * N + n] = l5 * l5;
      }
    }
  };
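
  // The factor L of A = L * L^T is computed and inverted on the fly; the
  // final writes assemble A^-1 = L^-T * L^-1, with l0, l2, l5 the inverted
  // diagonal and l1, l3, l4 the off-diagonal elements of L^-1.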
  template <typename T, idx_t D, idx_t N>
  void invertCholeskySym(MPlexSym<T, D, N>& A) {
    CholeskyInverterSym<T, D, N>::invert(A);
  }

}  // end namespace Matriplex

#endif