#ifndef RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h
#define RecoTracker_MkFitCore_src_Matriplex_MatriplexSym_h

#include "MatriplexCommon.h"
#include "Matriplex.h"

namespace Matriplex {

  // Offsets into the packed lower-triangle storage, indexed by flat (i * D + j);
  // one row per matrix dimension D (unsupported dimensions are left empty).
  const idx_t gSymOffsets[7][36] = {{},
                                    {},
                                    {0, 1, 1, 2},
                                    {0, 1, 3, 1, 2, 4, 3, 4, 5},
                                    {},
                                    {0, 1, 3, 6, 10, 1, 2, 4, 7, 11, 3, 4, 5, 8, 12, 6, 7, 8, 9, 13, 10, 11, 12, 13, 14},
                                    {0, 1, 3, 6, 10, 15, 1, 2, 4, 7, 11, 16, 3, 4, 5, 8, 12, 17,
                                     6, 7, 8, 9, 13, 18, 10, 11, 12, 13, 14, 19, 15, 16, 17, 18, 19, 20}};
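  // Example of the packing, using the D = 3 table above: the lower triangle is
  // stored as {a00, a10, a11, a20, a21, a22}, so a request for element (2,1) has
  // flat index 2 * 3 + 1 = 7 and gSymOffsets[3][7] == 4, the slot holding a21;
  // its mirror (1,2) maps through flat index 5 to the same slot.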
  template <typename T, idx_t D, idx_t N>
  class __attribute__((aligned(MPLEX_ALIGN))) MatriplexSym {
  public:
    typedef T value_type;

    static constexpr int kRows = D;
    static constexpr int kCols = D;
    static constexpr int kSize = (D + 1) * D / 2;  // packed elements: lower triangle only
    static constexpr int kTotSize = N * kSize;     // total elements in the plex

    T fArray[kTotSize] __attribute__((aligned(64)));

    MatriplexSym() {}
    MatriplexSym(T v) { setVal(v); }

    idx_t plexSize() const { return N; }

    idx_t off(idx_t i) const { return gSymOffsets[D][i]; }
    const T& constAt(idx_t n, idx_t i, idx_t j) const { return fArray[off(i * D + j) * N + n]; }
    T& At(idx_t n, idx_t i, idx_t j) { return fArray[off(i * D + j) * N + n]; }
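    // Storage is struct-of-arrays: packed element i of matrix n lives at
    // fArray[i * N + n], so one element gathered across all N matrices is a
    // contiguous, SIMD-friendly run. Illustrative read across a plex m:
    //
    //   for (idx_t n = 0; n < m.plexSize(); ++n)
    //     sum += m.constAt(n, i, j);   // unit stride in memory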
    void setVal(T v) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] = v;
      }
    }
    void add(const MatriplexSym& v) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] += v.fArray[i];
      }
    }
    void scale(T scale) {
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] *= scale;
      }
    }
    MatriplexSym& operator=(const MatriplexSym& m) {
      memcpy(fArray, m.fArray, sizeof(T) * kTotSize);
      return *this;
    }

    MatriplexSym(const MatriplexSym& m) = default;
    void copySlot(idx_t n, const MatriplexSym& m) {
      for (idx_t i = n; i < kTotSize; i += N) {
        fArray[i] = m.fArray[i];
      }
    }
    void copyIn(idx_t n, const T* arr) {
      for (idx_t i = n; i < kTotSize; i += N) {
        fArray[i] = *(arr++);
      }
    }

    void copyIn(idx_t n, const MatriplexSym& m, idx_t in) {
      for (idx_t i = n; i < kTotSize; i += N, in += N) {
        fArray[i] = m.fArray[in];
      }
    }
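    // Usage sketch (illustrative values): round-trip one matrix through slot
    // n = 2 of a hypothetical MatriplexSym<float, 3, 8> S.
    //
    //   float buf[6] = {1, 0, 1, 0, 0, 1};   // packed 3x3 identity
    //   S.copyIn(2, buf);
    //   float out[6];
    //   S.copyOut(2, out);                   // out now equals buf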
#if defined(AVX512_INTRINSICS)

    template <typename U>
    void slurpIn(const T* arr, __m512i& vi, const U&, const int N_proc = N) {
      const __m512 src = {0};
      const __mmask16 k = N_proc == N ? -1 : (1 << N_proc) - 1;

      for (int i = 0; i < kSize; ++i, ++arr) {
        __m512 reg = _mm512_mask_i32gather_ps(src, k, vi, arr, sizeof(U));
        _mm512_mask_store_ps(&fArray[i * N], k, reg);
      }
    }
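    // Mask arithmetic above: with N_proc < N valid lanes, (1 << N_proc) - 1 sets
    // exactly the low N_proc bits of the __mmask16 (e.g. N_proc = 5 gives
    // 0b0000000000011111), so the gather and store touch lanes 0..4 only and the
    // remaining lanes of fArray stay untouched.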
    // Experimental alternatives; slurpIn() appears to be at least as fast.

    void ChewIn(const char* arr, int off, int vi[N], const char* tmp, __m512i& ui) {
      // Stage the N source objects into a contiguous scratch buffer
      // (assumes a 64-byte source object: one cache line / vector width) ...
      for (int i = 0; i < N; ++i) {
        __m512 reg = _mm512_load_ps(arr + vi[i]);
        _mm512_store_ps((void*)(tmp + 64 * i), reg);
      }

      // ... then gather from the scratch buffer into matriplex layout.
      for (int i = 0; i < kSize; ++i) {
        __m512 reg = _mm512_i32gather_ps(ui, tmp + off + i * sizeof(T), 1);
        _mm512_store_ps(&fArray[i * N], reg);
      }
    }
    void Contaginate(const char* arr, int vi[N], const char* tmp) {
      // Stage the N source objects into the scratch buffer tmp
      // (same 64-byte-object assumption as ChewIn).
      for (int i = 0; i < N; ++i) {
        __m512 reg = _mm512_load_ps(arr + vi[i]);
        _mm512_store_ps((void*)(tmp + 64 * i), reg);
      }
    }
    void Plexify(const char* tmp, __m512i& ui) {
      for (int i = 0; i < kSize; ++i) {
        __m512 reg = _mm512_i32gather_ps(ui, tmp + i * sizeof(T), 1);
        _mm512_store_ps(&fArray[i * N], reg);
      }
    }
#elif defined(AVX2_INTRINSICS)

    template <typename U>
    void slurpIn(const T* arr, __m256i& vi, const U&, const int N_proc = N) {
      const __m256 src = {0};

      __m256i k = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
      __m256i k_sel = _mm256_set1_epi32(N_proc);
      __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);

      k = k_master;
      for (int i = 0; i < kSize; ++i, ++arr) {
        __m256 reg = _mm256_mask_i32gather_ps(src, arr, vi, (__m256)k, sizeof(U));
        // Restore the mask: the gather instruction clears it.
        k = k_master;
        _mm256_maskstore_ps(&fArray[i * N], k, reg);
      }
    }
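    // AVX2 has no dedicated mask registers, so the lane mask is built with an
    // integer compare: k_master holds all-ones in lane j iff j < N_proc, e.g.
    // N_proc = 3 yields {-1, -1, -1, 0, 0, 0, 0, 0}; maskstore then keys off
    // each lane's sign bit.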
#else

    void slurpIn(const T* arr, int vi[N], const int N_proc = N) {
      // Separate N_proc == N case so the compiler can fully unroll the inner loop.
      if (N_proc == N) {
        for (int i = 0; i < kSize; ++i) {
          for (int j = 0; j < N; ++j) {
            fArray[i * N + j] = *(arr + i + vi[j]);
          }
        }
      } else {
        for (int i = 0; i < kSize; ++i) {
          for (int j = 0; j < N_proc; ++j) {
            fArray[i * N + j] = *(arr + i + vi[j]);
          }
        }
      }
    }

#endif
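    // The scalar fallback reads packed element i of matrix j from arr + vi[j] + i:
    // vi holds per-matrix offsets (in units of T) from a common base pointer.
    // Illustrative call, for a MatriplexSym<float, 3, 8> S and two 3x3 sources
    // packed back to back:
    //
    //   float pool[12] = {1, 0, 1, 0, 0, 1,    // matrix A, packed
    //                     2, 0, 2, 0, 0, 2};   // matrix B, packed
    //   int vi[8] = {0, 6, 0, 6, 0, 6, 0, 6};  // alternate A and B across lanes
    //   S.slurpIn(pool, vi);                   // transposes into SoA layout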
    void copyOut(idx_t n, T* arr) const {
      for (idx_t i = n; i < kTotSize; i += N) {
        *(arr++) = fArray[i];
      }
    }
    MatriplexSym& subtract(const MatriplexSym& a, const MatriplexSym& b) {
      // Computes *this = a - b.
      for (idx_t i = 0; i < kTotSize; ++i) {
        fArray[i] = a.fArray[i] - b.fArray[i];
      }
      return *this;
    }
    void addNoiseIntoUpperLeft3x3(T noise) {
      // Packed offsets 0, 2 and 5 are the (0,0), (1,1) and (2,2) diagonal slots.
      T* p = fArray;
      ASSUME_ALIGNED(p, 64);

      for (idx_t n = 0; n < N; ++n) {
        p[0 * N + n] += noise;
        p[2 * N + n] += noise;
        p[5 * N + n] += noise;
      }
    }
    void invertUpperLeft3x3() {
      typedef T TT;

      T* a = fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
        const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
        const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
        const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
        const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
        const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];

        // Force the determinant accumulation into double precision.
        const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;

        const TT s = TT(1) / det;

        a[0 * N + n] = s * c00;
        a[1 * N + n] = s * c01;
        a[2 * N + n] = s * c11;
        a[3 * N + n] = s * c02;
        a[4 * N + n] = s * c12;
        a[5 * N + n] = s * c22;
      }
    }
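    // The routine above is Cramer's rule specialized to a symmetric 3x3: the six
    // cXY are the independent cofactors, det expands the determinant along the
    // first row (packed slots a[0], a[1], a[3]), and A^{-1} = adj(A) / det needs
    // only six entries because the adjugate of a symmetric matrix is symmetric.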
    Matriplex<T, 1, 1, N> ReduceFixedIJ(idx_t i, idx_t j) const {
      Matriplex<T, 1, 1, N> t;
      for (idx_t n = 0; n < N; ++n) {
        t[n] = constAt(n, i, j);
      }
      return t;
    }
  };

  template <typename T, idx_t D, idx_t N>
  using MPlexSym = MatriplexSym<T, D, N>;
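  // Putting it together (illustrative sizes; N would normally match the SIMD
  // vector width):
  //
  //   Matriplex::MPlexSym<float, 3, 8> A(0.0f);
  //   for (Matriplex::idx_t n = 0; n < A.plexSize(); ++n)
  //     for (Matriplex::idx_t i = 0; i < 3; ++i)
  //       A.At(n, i, i) = 1.0f;              // 8 identity matrices
  //   Matriplex::invertCramerSym(A);         // in-place inverse, defined below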
  //==============================================================================
  // Multiplications
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  struct SymMultiplyCls {
    static void multiply(const MPlexSym<T, D, N>& A, const MPlexSym<T, D, N>& B, MPlex<T, D, D, N>& C) {
      throw std::runtime_error("general symmetric multiplication not supported");
    }
  };
  template <typename T, idx_t N>
  struct SymMultiplyCls<T, 3, N> {
    static void multiply(const MPlexSym<T, 3, N>& A, const MPlexSym<T, 3, N>& B, MPlex<T, 3, 3, N>& C) {
      const T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);
      const T* b = B.fArray;
      ASSUME_ALIGNED(b, 64);
      T* c = C.fArray;
      ASSUME_ALIGNED(c, 64);

#ifdef MPLEX_INTRINSICS

      for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
#include "intr_sym_3x3.ah"
      }

#else

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
#include "std_sym_3x3.ah"
      }

#endif
    }
  };

  template <typename T, idx_t N>
  struct SymMultiplyCls<T, 6, N> {
    static void multiply(const MPlexSym<float, 6, N>& A, const MPlexSym<float, 6, N>& B, MPlex<float, 6, 6, N>& C) {
      const T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);
      const T* b = B.fArray;
      ASSUME_ALIGNED(b, 64);
      T* c = C.fArray;
      ASSUME_ALIGNED(c, 64);

#ifdef MPLEX_INTRINSICS

      for (idx_t n = 0; n < N; n += 64 / sizeof(T)) {
#include "intr_sym_6x6.ah"
      }

#else

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
#include "std_sym_6x6.ah"
      }

#endif
    }
  };

  template <typename T, idx_t D, idx_t N>
  void multiply(const MPlexSym<T, D, N>& A, const MPlexSym<T, D, N>& B, MPlex<T, D, D, N>& C) {
    SymMultiplyCls<T, D, N>::multiply(A, B, C);
  }
  //==============================================================================
  // Cramer inversion
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  struct CramerInverterSym {
    static void invert(MPlexSym<T, D, N>& A, double* determ = nullptr) {
      throw std::runtime_error("general cramer inversion not supported");
    }
  };
  template <typename T, idx_t N>
  struct CramerInverterSym<T, 2, N> {
    static void invert(MPlexSym<T, 2, N>& A, double* determ = nullptr) {
      typedef T TT;

      T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        // Force the determinant calculation into double precision.
        const double det = (double)a[0 * N + n] * a[2 * N + n] - (double)a[1 * N + n] * a[1 * N + n];
        if (determ)
          determ[n] = det;

        const TT s = TT(1) / det;
        const TT tmp = s * a[2 * N + n];  // save before a[2] is overwritten
        a[1 * N + n] *= -s;
        a[2 * N + n] = s * a[0 * N + n];
        a[0 * N + n] = tmp;
      }
    }
  };
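  // Worked 2x2 case: a symmetric A = [[p, q], [q, r]] is packed as {p, q, r},
  // det = p * r - q * q, and A^{-1} = (1/det) * [[r, -q], [-q, p]]. The tmp
  // variable is needed because the old p slot must survive until the r slot
  // has been rewritten.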
  template <typename T, idx_t N>
  struct CramerInverterSym<T, 3, N> {
    static void invert(MPlexSym<T, 3, N>& A, double* determ = nullptr) {
      typedef T TT;

      T* a = A.fArray;
      ASSUME_ALIGNED(a, 64);

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        const TT c00 = a[2 * N + n] * a[5 * N + n] - a[4 * N + n] * a[4 * N + n];
        const TT c01 = a[4 * N + n] * a[3 * N + n] - a[1 * N + n] * a[5 * N + n];
        const TT c02 = a[1 * N + n] * a[4 * N + n] - a[2 * N + n] * a[3 * N + n];
        const TT c11 = a[5 * N + n] * a[0 * N + n] - a[3 * N + n] * a[3 * N + n];
        const TT c12 = a[3 * N + n] * a[1 * N + n] - a[4 * N + n] * a[0 * N + n];
        const TT c22 = a[0 * N + n] * a[2 * N + n] - a[1 * N + n] * a[1 * N + n];

        // Force the determinant calculation into double precision.
        const double det = (double)a[0 * N + n] * c00 + (double)a[1 * N + n] * c01 + (double)a[3 * N + n] * c02;
        if (determ)
          determ[n] = det;

        const TT s = TT(1) / det;

        a[0 * N + n] = s * c00;
        a[1 * N + n] = s * c01;
        a[2 * N + n] = s * c11;
        a[3 * N + n] = s * c02;
        a[4 * N + n] = s * c12;
        a[5 * N + n] = s * c22;
      }
    }
  };
  template <typename T, idx_t D, idx_t N>
  void invertCramerSym(MPlexSym<T, D, N>& A, double* determ = nullptr) {
    CramerInverterSym<T, D, N>::invert(A, determ);
  }
  //==============================================================================
  // Cholesky inversion
  //==============================================================================

  template <typename T, idx_t D, idx_t N>
  struct CholeskyInverterSym {
    static void invert(MPlexSym<T, D, N>& A) { throw std::runtime_error("general cholesky inversion not supported"); }
  };
  template <typename T, idx_t N>
  struct CholeskyInverterSym<T, 3, N> {
    // Note: this only works on positive-definite matrices.
    static void invert(MPlexSym<T, 3, N>& A) {
      typedef T TT;

      T* a = A.fArray;

#pragma omp simd
      for (idx_t n = 0; n < N; ++n) {
        TT l0 = std::sqrt(T(1) / a[0 * N + n]);
        TT l1 = a[1 * N + n] * l0;
        TT l2 = a[2 * N + n] - l1 * l1;
        l2 = std::sqrt(T(1) / l2);
        TT l3 = a[3 * N + n] * l0;
        TT l4 = (a[4 * N + n] - l1 * l3) * l2;
        TT l5 = a[5 * N + n] - (l3 * l3 + l4 * l4);
        l5 = std::sqrt(T(1) / l5);

        // Decomposition done; now form the inverse from the factor.

        l3 = (l1 * l4 * l2 - l3) * l0 * l5;
        l1 = -l1 * l0 * l2;
        l4 = -l4 * l2 * l5;

        a[0 * N + n] = l3 * l3 + l1 * l1 + l0 * l0;
        a[1 * N + n] = l3 * l4 + l1 * l2;
        a[2 * N + n] = l4 * l4 + l2 * l2;
        a[3 * N + n] = l3 * l5;
        a[4 * N + n] = l4 * l5;
        a[5 * N + n] = l5 * l5;
      }
    }
  };
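  // Reading of the algebra above: the first block computes the Cholesky factor
  // L of A = L L^T, holding the diagonal as reciprocals (l0 = 1/sqrt(a00), and
  // similarly l2, l5); the second block turns l1, l3, l4 into the off-diagonal
  // entries of L^{-1}, and the final assignments accumulate L^{-T} L^{-1},
  // which equals A^{-1}. A non-positive pivot produces NaN/Inf in that lane
  // rather than raising an error.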
  template <typename T, idx_t D, idx_t N>
  void invertCholeskySym(MPlexSym<T, D, N>& A) {
    CholeskyInverterSym<T, D, N>::invert(A);
  }

}  // namespace Matriplex

#endif