#ifndef RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h
#define RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h

#include "MatriplexCommon.h"
#include "Matriplex.h"

//==============================================================================
// MatriplexSym
//==============================================================================

namespace Matriplex {

  const idx_t gSymOffsets[7][36] = {{},
                                    {},
                                    {0, 1, 1, 2},
                                    {0, 1, 3, 1, 2, 4, 3, 4, 5},  // D = 3
                                    {},
                                    {},
                                    {0, 1, 3, 6, 10, 15, 1, 2, 4, 7, 11, 16, 3, 4, 5, 8, 12, 17,
                                     6, 7, 8, 9, 13, 18, 10, 11, 12, 13, 14, 19, 15, 16, 17, 18, 19, 20}};
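
  // Note (added for exposition, not in the original source): gSymOffsets[D]
  // maps a row-major index i * D + j of the full DxD matrix onto its packed
  // lower-triangle slot. E.g. for D = 3 the packed order is
  // (0,0),(1,0),(1,1),(2,0),(2,1),(2,2), so element (2,1) -> 2 * 3 + 1 = 7
  // -> gSymOffsets[3][7] = 4.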

  //------------------------------------------------------------------------------

  template <typename T, idx_t D, idx_t N>
  class __attribute__((aligned(MPLEX_ALIGN))) MatriplexSym {
  public:
    typedef T value_type;

    /// no. of matrix rows
    static constexpr int kRows = D;
    /// no. of matrix columns
    static constexpr int kCols = D;
    /// no. of elements: lower triangle
    static constexpr int kSize = (D + 1) * D / 2;
    /// size of the whole matriplex: what is stored
    static constexpr int kTotSize = N * kSize;

    T fArray[kTotSize];

    MatriplexSym() {}
    MatriplexSym(T v) { setVal(v); }

    idx_t plexSize() const { return N; }

    void setVal(T v) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] = v;
      }
    }

    void add(const MatriplexSym& v) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] += v.fArray[i];
      }
    }

    void scale(T scale) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] *= scale;
      }
    }

    T operator[](idx_t xx) const { return fArray[xx]; }
    T& operator[](idx_t xx) { return fArray[xx]; }

    const idx_t* offsets() const { return gSymOffsets[D]; }
    idx_t off(idx_t i) const { return gSymOffsets[D][i]; }

    const T& constAt(idx_t n, idx_t i, idx_t j) const { return fArray[off(i * D + j) * N + n]; }
    T& At(idx_t n, idx_t i, idx_t j) { return fArray[off(i * D + j) * N + n]; }

    T& operator()(idx_t n, idx_t i, idx_t j) { return At(n, i, j); }
    const T& operator()(idx_t n, idx_t i, idx_t j) const { return constAt(n, i, j); }

    MatriplexSym& operator=(const MatriplexSym& m) {
      memcpy(fArray, m.fArray, sizeof(T) * kTotSize);
      return *this;
    }

    MatriplexSym(const MatriplexSym& m) = default;

    void copySlot(idx_t n, const MatriplexSym& m) {
      for (idx_t i = n; i < kTotSize; i += N) {
        fArray[i] = m.fArray[i];
      }
    }

    void copyIn(idx_t n, const T* arr) {
      for (idx_t i = n; i < kTotSize; i += N) {
        fArray[i] = *(arr++);
      }
    }

    void copy(idx_t n, idx_t in) {
      // Copy the in-th matrix of the plex into the n-th one.
      for (idx_t i = n; i < kTotSize; i += N, in += N) {
        fArray[i] = fArray[in];
      }
    }
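
    // Usage sketch (illustrative only, not part of the original header):
    // storage is structure-of-arrays, kSize packed elements x N matrices, so
    // element k of matrix n lives at fArray[k * N + n]. Filling one matrix
    // of the plex from a packed lower-triangle array:
    //
    //   using MPlexSymF3 = Matriplex::MatriplexSym<float, 3, 8>;
    //   float lt[6] = {1, 0, 1, 0, 0, 1};  // 3x3 identity, lower triangle
    //   MPlexSymF3 S;
    //   for (Matriplex::idx_t n = 0; n < S.plexSize(); ++n)
    //     S.copyIn(n, lt);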

#if defined(AVX512_INTRINSICS)

    template <typename U>
    void slurpIn(const T* arr, __m512i& vi, const U&, const int N_proc = N) {
      const __m512 src = {0};
      const __mmask16 k = N_proc == N ? -1 : (1 << N_proc) - 1;

      for (int i = 0; i < kSize; ++i, ++arr) {
        // Gather one packed element for all active matrices of the plex,
        // then store it into the i-th SOA slot under the same mask.
        __m512 reg = _mm512_mask_i32gather_ps(src, k, vi, arr, sizeof(U));
        _mm512_mask_store_ps(&fArray[i * N], k, reg);
      }
    }

    // Experimental methods; slurpIn() seems to be at least as fast.

    void ChewIn(const char* arr, int off, int vi[N], const char* tmp, __m512i& ui) {
      // This is a hack: it relies on the source object size being 64 bytes,
      // i.e. one cache line and one vector width.

      for (int i = 0; i < N; ++i) {
        __m512 reg = _mm512_load_ps(arr + vi[i]);
        _mm512_store_ps((void*)(tmp + 64 * i), reg);
      }

      for (int i = 0; i < kSize; ++i) {
        __m512 reg = _mm512_i32gather_ps(ui, tmp + off + i * sizeof(T), 1);
        _mm512_store_ps(&fArray[i * N], reg);
      }
    }

    void Contaginate(const char* arr, int vi[N], const char* tmp) {
      for (int i = 0; i < N; ++i) {
        __m512 reg = _mm512_load_ps(arr + vi[i]);
        _mm512_store_ps((void*)(tmp + 64 * i), reg);
      }
    }

    void Plexify(const char* tmp, __m512i& ui) {
      for (int i = 0; i < kSize; ++i) {
        __m512 reg = _mm512_i32gather_ps(ui, tmp + i * sizeof(T), 1);
        _mm512_store_ps(&fArray[i * N], reg);
      }
    }

#elif defined(AVX2_INTRINSICS)

    template <typename U>
    void slurpIn(const T* arr, __m256i& vi, const U&, const int N_proc = N) {
      const __m256 src = {0};

      __m256i k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
      __m256i k_sel = _mm256_set1_epi32(N_proc);
      __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);

      k = k_master;
      for (int i = 0; i < kSize; ++i, ++arr) {
        __m256 reg = _mm256_mask_i32gather_ps(src, arr, vi, (__m256)k, sizeof(U));
        // Restore the mask -- the docs say the gather clears it.
        k = k_master;
        _mm256_maskstore_ps(&fArray[i * N], k, reg);
      }
    }

#else

    void slurpIn(const T* arr, int vi[N], const int N_proc = N) {
      // Separate the N_proc == N case; it measurably speeds up the fit test.
      if (N_proc == N) {
        for (int i = 0; i < kSize; ++i) {
          for (int j = 0; j < N; ++j) {
            fArray[i * N + j] = *(arr + i + vi[j]);
          }
        }
      } else {
        for (int i = 0; i < kSize; ++i) {
          for (int j = 0; j < N_proc; ++j) {
            fArray[i * N + j] = *(arr + i + vi[j]);
          }
        }
      }
    }

#endif
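
    // Note (added for exposition, not in the original header): every slurpIn
    // variant performs the same AOS-to-SOA transposition. With vi[j] the
    // element offset of the j-th source matrix, the scalar equivalent is
    //
    //   for (int i = 0; i < kSize; ++i)
    //     for (int j = 0; j < N_proc; ++j)
    //       fArray[i * N + j] = arr[i + vi[j]];
    //
    // The AVX-512/AVX2 paths replace the inner loop with one masked gather
    // plus one masked store per packed element i.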

    void copyOut(idx_t n, T* arr) const {
      for (idx_t i = n; i < kTotSize; i += N) {
        *(arr++) = fArray[i];
      }
    }

    /// Set the n-th 3x3 matrix of the plex to d times the identity.
    void setDiagonal3x3(idx_t n, T d) {
      T* p = fArray + n;

      p[0 * N] = d;
      p[1 * N] = 0;
      p[2 * N] = d;
      p[3 * N] = 0;
      p[4 * N] = 0;
      p[5 * N] = d;
    }

    MatriplexSym& subtract(const MatriplexSym& a, const MatriplexSym& b) {
      // Does *this = a - b.

#pragma omp simd
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] = a.fArray[i] - b.fArray[i];
      }

      return *this;
    }

    // ==================================================================
    // Operations specific to Kalman fit in 6 parameter space
    // ==================================================================

    void addNoiseIntoUpperLeft3x3(T noise) {
      T* p = fArray;
      ASSUME_ALIGNED(p, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        // Diagonal of the upper-left 3x3 block: packed slots 0, 2, 5.
        p[0 * N + n] += noise;
        p[2 * N + n] += noise;
        p[5 * N + n] += noise;
      }
    }

    void invertUpperLeft3x3() {
      typedef T TT;

      T* a = fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
        const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
        const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
        const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
        const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
        const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];

        // Force the determinant calculation into double precision.
        const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
        const TT s = TT(1) / det;

        a[0 * N + n] = s * c00;
        a[1 * N + n] = s * c01;
        a[2 * N + n] = s * c11;
        a[3 * N + n] = s * c02;
        a[4 * N + n] = s * c12;
        a[5 * N + n] = s * c22;
      }
    }
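
    // Worked check (added for exposition, not in the original source): with
    // packed slots 0..5 holding (0,0),(1,0),(1,1),(2,0),(2,1),(2,2), the c's
    // above are cofactors, e.g. c00 = M(1,1)M(2,2) - M(2,1)^2, and
    // det = M(0,0)c00 + M(1,0)c01 + M(2,0)c02; the update writes adj(M)/det,
    // i.e. the inverse of the upper-left 3x3 block, leaving the remaining
    // slots of the 6x6 plex untouched.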

    Matriplex<T, 1, 1, N> ReduceFixedIJ(idx_t i, idx_t j) const {
      Matriplex<T, 1, 1, N> t;
      for (idx_t n = 0; n < N; ++n) {
        t[n] = constAt(n, i, j);
      }
      return t;
    }
  };

  template <typename T, idx_t D, idx_t N>
  using MPlexSym = MatriplexSym<T, D, N>;

  //==============================================================================
  // Multiplications
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  struct SymMultiplyCls {
    static void multiply(const MPlexSym<T, D, N>& A, const MPlexSym<T, D, N>& B, MPlex<T, D, D, N>& C) {
      throw std::runtime_error("general symmetric multiplication not supported");
    }
  };

  template <typename T, idx_t N>
  struct SymMultiplyCls<T, 3, N> {
    static void multiply(const MPlexSym<T, 3, N>& A, const MPlexSym<T, 3, N>& B, MPlex<T, 3, 3, N>& C) {
      const T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);
      const T* b = B.fArray;
      ASSUME_ALIGNED(b, 64);
      T* c = C.fArray;
      ASSUME_ALIGNED(c, 64);

#ifdef MPLEX_INTRINSICS

      for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
#include "intr_sym_3x3.ah"
      }

#else

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
#include "std_sym_3x3.ah"
      }

#endif
    }
  };

  template <typename T, idx_t N>
  struct SymMultiplyCls<T, 6, N> {
    static void multiply(const MPlexSym<float, 6, N>& A, const MPlexSym<float, 6, N>& B, MPlex<float, 6, 6, N>& C) {
      const T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);
      const T* b = B.fArray;
      ASSUME_ALIGNED(b, 64);
      T* c = C.fArray;
      ASSUME_ALIGNED(c, 64);

#ifdef MPLEX_INTRINSICS

      for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
#include "intr_sym_6x6.ah"
      }

#else

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
#include "std_sym_6x6.ah"
      }

#endif
    }
  };

  template <typename T, idx_t D, idx_t N>
  void multiply(const MPlexSym<T, D, N>& A, const MPlexSym<T, D, N>& B, MPlex<T, D, D, N>& C) {
    SymMultiplyCls<T, D, N>::multiply(A, B, C);
  }
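
  // Usage sketch (illustrative, not from the original header): the product of
  // two symmetric matrices is in general not symmetric, so the result goes
  // into a full Matriplex.
  //
  //   Matriplex::MPlexSym<float, 3, 8> A(1.0f), B(2.0f);
  //   Matriplex::MPlex<float, 3, 3, 8> C;
  //   Matriplex::multiply(A, B, C);  // C = A * B for all 8 matrices at once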

  //==============================================================================
  // Cramer inversion
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  struct CramerInverterSym {
    static void invert(MPlexSym<T, D, N>& A, double* determ = nullptr) {
      throw std::runtime_error("general cramer inversion not supported");
    }
  };

  template <typename T, idx_t N>
  struct CramerInverterSym<T, 2, N> {
    static void invert(MPlexSym<T, 2, N>& A, double* determ = nullptr) {
      typedef T TT;

      T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        // Force the determinant calculation into double precision.
        const double det = (double)a[0 * N + n] * a[2 * N + n] - (double)a[1 * N + n] * a[1 * N + n];
        if (determ)
          determ[n] = det;

        const TT s = TT(1) / det;
        const TT tmp = s * a[2 * N + n];
        a[1 * N + n] *= -s;
        a[2 * N + n] = s * a[0 * N + n];
        a[0 * N + n] = tmp;
      }
    }
  };
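
  // Added note (not in the original source): this is the closed form for a
  // symmetric 2x2 matrix stored packed as {a00, a10, a11},
  //
  //   inv = 1/det * [  a11  -a10 ]
  //                 [ -a10   a00 ],   det = a00 * a11 - a10^2.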

  template <typename T, idx_t N>
  struct CramerInverterSym<T, 3, N> {
    static void invert(MPlexSym<T, 3, N>& A, double* determ = nullptr) {
      typedef T TT;

      T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
        const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
        const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
        const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
        const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
        const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];

        // Force the determinant calculation into double precision.
        const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
        if (determ)
          determ[n] = det;

        const TT s = TT(1) / det;
        a[0 * N + n] = s * c00;
        a[1 * N + n] = s * c01;
        a[2 * N + n] = s * c11;
        a[3 * N + n] = s * c02;
        a[4 * N + n] = s * c12;
        a[5 * N + n] = s * c22;
      }
    }
  };

  template <typename T, idx_t D, idx_t N>
  void invertCramerSym(MPlexSym<T, D, N>& A, double* determ = nullptr) {
    CramerInverterSym<T, D, N>::invert(A, determ);
  }
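
  // Usage sketch (illustrative, not from the original header): in-place
  // inversion with an optional per-matrix determinant check.
  //
  //   Matriplex::MPlexSym<float, 3, 8> A;  // ... filled elsewhere
  //   double det[8];
  //   Matriplex::invertCramerSym(A, det);
  //   for (int n = 0; n < 8; ++n)
  //     if (det[n] == 0) { /* singular input, result invalid */ }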

  //==============================================================================
  // Cholesky inversion
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  struct CholeskyInverterSym {
    static void invert(MPlexSym<T, D, N>& A) { throw std::runtime_error("general cholesky inversion not supported"); }
  };

  template <typename T, idx_t N>
  struct CholeskyInverterSym<T, 3, N> {
    // Note: this only works on symmetric, positive definite matrices;
    // no checks are performed.
    static void invert(MPlexSym<T, 3, N>& A) {
      typedef T TT;

      T* a = A.fArray;

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        TT l0 = std::sqrt(T(1) / a[0 * N + n]);
        TT l1 = a[1 * N + n] * l0;
        TT l2 = a[2 * N + n] - l1 * l1;
        l2 = std::sqrt(T(1) / l2);
        TT l3 = a[3 * N + n] * l0;
        TT l4 = (a[4 * N + n] - l1 * l3) * l2;
        TT l5 = a[5 * N + n] - (l3 * l3 + l4 * l4);
        l5 = std::sqrt(T(1) / l5);

        // Decomposition done, now invert the triangular factor.

        l3 = (l1 * l4 * l2 - l3) * l0 * l5;
        l1 = -l1 * l0 * l2;
        l4 = -l4 * l2 * l5;

        a[0 * N + n] = l3 * l3 + l1 * l1 + l0 * l0;
        a[1 * N + n] = l3 * l4 + l1 * l2;
        a[2 * N + n] = l4 * l4 + l2 * l2;
        a[3 * N + n] = l3 * l5;
        a[4 * N + n] = l4 * l5;
        a[5 * N + n] = l5 * l5;
      }
    }
  };
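
  // Added note (not in the original source): the specialization computes the
  // lower-triangular Cholesky factor L with A = L L^T (the l's hold the
  // reciprocal diagonal and the off-diagonal entries), inverts L in place,
  // and forms A^{-1} = L^{-T} L^{-1}. One sqrt-reciprocal per diagonal
  // element and no pivoting, hence the positive-definite requirement.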

  template <typename T, idx_t D, idx_t N>
  void invertCholeskySym(MPlexSym<T, D, N>& A) {
    CholeskyInverterSym<T, D, N>::invert(A);
  }

}  // end namespace Matriplex

#endif