CMS 3D CMS Logo

MatriplexCommon.h
Go to the documentation of this file.
1 #ifndef RecoTracker_MkFitCore_src_Matriplex_MatriplexCommon_h
2 #define RecoTracker_MkFitCore_src_Matriplex_MatriplexCommon_h
3 
4 #include <cstring>
5 
6 // Use intrinsics version of code when available, done via CPP flags.
7 // #define MPLEX_USE_INTRINSICS
8 
9 //==============================================================================
10 // Intrinsics -- preamble
11 //==============================================================================
12 
13 #if defined(__x86_64__)
14 #include "immintrin.h"
15 #else
16 #include <cstdlib>
17 #endif
18 
// MPLEX_ALIGN: alignment in bytes, selected from the SIMD feature macros the
// compiler predefines. A definition supplied before this point (e.g. on the
// command line) takes precedence.
#ifndef MPLEX_ALIGN
#if defined(__AVX512F__)
#define MPLEX_ALIGN 64  // matches the 64-byte (512-bit) vector width
#elif defined(__AVX__) || defined(__AVX2__)
#define MPLEX_ALIGN 32  // matches the 32-byte (256-bit) vector width
#elif defined(__SSE3__)
#define MPLEX_ALIGN 16  // matches the 16-byte (128-bit) vector width
#else
#define MPLEX_ALIGN 32  // none of the macros above is set: default to 32 bytes
#endif
#endif
30 
// Everything in this section is active only when MPLEX_USE_INTRINSICS is
// defined. Depending on the instruction set the compiler targets it provides:
//   IntrVec_t                      - the vector register type
//   MPLEX_INTRINSICS_WIDTH_BYTES   - vector width in bytes
//   MPLEX_INTRINSICS_WIDTH_BITS    - vector width in bits
//   LD(a, i) / ST(a, i, r)         - aligned load / store at &a[i * N + n]
//   ADD / MUL / FMA                - element-wise a + b, a * b, a * b + v
// LD and ST expand to expressions that use `N` and `n`; both names must be
// visible at the point of expansion.
#if defined(MPLEX_USE_INTRINSICS)
// This seems unnecessary: __AVX__ is usually defined for all higher ISA extensions
#if defined(__AVX__) || defined(__AVX512F__)

#define MPLEX_INTRINSICS

#endif

#if defined(__AVX512F__)

// ---- AVX-512F: 512-bit vectors, gather support, fused multiply-add ----------
typedef __m512 IntrVec_t;
#define MPLEX_INTRINSICS_WIDTH_BYTES 64
#define MPLEX_INTRINSICS_WIDTH_BITS 512
#define AVX512_INTRINSICS
#define GATHER_INTRINSICS
// Declares `name` and loads the gather indices from `arr` into it.
// Note: the expansion already ends with a semicolon.
#define GATHER_IDX_LOAD(name, arr) __m512i name = _mm512_load_epi32(arr);

#define LD(a, i) _mm512_load_ps(&a[i * N + n])
#define ST(a, i, r) _mm512_store_ps(&a[i * N + n], r)
#define ADD(a, b) _mm512_add_ps(a, b)
#define MUL(a, b) _mm512_mul_ps(a, b)
#define FMA(a, b, v) _mm512_fmadd_ps(a, b, v)

#elif defined(__AVX2__) && defined(__FMA__)

// ---- AVX2 with FMA: 256-bit vectors, gather support, fused multiply-add -----
typedef __m256 IntrVec_t;
#define MPLEX_INTRINSICS_WIDTH_BYTES 32
#define MPLEX_INTRINSICS_WIDTH_BITS 256
#define AVX2_INTRINSICS
#define GATHER_INTRINSICS
// Previously used _mm256_load_epi32(arr) here, but that's part of AVX-512F, not AVX2
// Declares `name` and loads the gather indices from `arr` into it.
// Note: the expansion already ends with a semicolon.
#define GATHER_IDX_LOAD(name, arr) __m256i name = _mm256_load_si256(reinterpret_cast<const __m256i *>(arr));

#define LD(a, i) _mm256_load_ps(&a[i * N + n])
#define ST(a, i, r) _mm256_store_ps(&a[i * N + n], r)
#define ADD(a, b) _mm256_add_ps(a, b)
#define MUL(a, b) _mm256_mul_ps(a, b)
#define FMA(a, b, v) _mm256_fmadd_ps(a, b, v)

#elif defined(__AVX__)

// ---- Plain AVX: 256-bit vectors; no GATHER_INTRINSICS, no fused multiply-add -
typedef __m256 IntrVec_t;
#define MPLEX_INTRINSICS_WIDTH_BYTES 32
#define MPLEX_INTRINSICS_WIDTH_BITS 256
#define AVX_INTRINSICS

#define LD(a, i) _mm256_load_ps(&a[i * N + n])
#define ST(a, i, r) _mm256_store_ps(&a[i * N + n], r)
#define ADD(a, b) _mm256_add_ps(a, b)
#define MUL(a, b) _mm256_mul_ps(a, b)
// #define FMA(a, b, v) { __m256 temp = _mm256_mul_ps(a, b); v = _mm256_add_ps(temp, v); }
// Stand-in for the fused multiply-add used by the other branches: returns
// a * b + v, computed as a separate multiply followed by an add.
inline __m256 FMA(const __m256 &a, const __m256 &b, const __m256 &v) {
  return _mm256_add_ps(_mm256_mul_ps(a, b), v);
}

#endif

#endif
90 
// ASSUME_ALIGNED(a, b): tells the compiler that pointer `a` is aligned to `b`
// bytes. Use it as a statement.
//  - Intel compiler: forwards to __assume_aligned.
//  - Otherwise: reassigns `a` with the result of __builtin_assume_aligned, so
//    `a` must be an assignable pointer variable, not an arbitrary expression.
#ifdef __INTEL_COMPILER
#define ASSUME_ALIGNED(a, b) __assume_aligned(a, b)
#else
#define ASSUME_ALIGNED(a, b) a = static_cast<decltype(a)>(__builtin_assume_aligned(a, b))
#endif
96 
namespace Matriplex {
  // Integer type used for indices.
  typedef int idx_t;

  // Declaration only; the definition lives elsewhere.
  // NOTE(review): presumably checks or reports the alignment of `adr`, using
  // `pref` as a message prefix -- confirm against the definition.
  void align_check(const char *pref, void *adr);

  namespace internal {
    // Polynomial approximation of sine and cosine, keeping the Taylor-series
    // terms up to x^4:
    //   sin(x) ~= x - x^3 / 6
    //   cos(x) ~= 1 - x^2 / 2 + x^4 / 24
    // The truncation error grows with |x|, so this is only accurate for
    // small arguments.
    //
    //   x    input angle
    //   sin  receives the sine approximation
    //   cos  receives the cosine approximation
    template <typename T>
    void sincos4(const T x, T &sin, T &cos) {
      // Had this written with explicit division by factorial.
      // The *whole* fitting test ran like 2.5% slower on MIC, sigh.
      // Hence the precomputed reciprocals 1/24 and 1/6 below.

      const T x2 = x * x;
      cos = T(1.0) - T(0.5) * x2 + T(0.0416666666666666667) * x2 * x2;
      sin = x - T(0.166666666666666667) * x * x2;
    }
  }  // namespace internal
}  // namespace Matriplex
114 
115 #endif
void sincos4(const T x, T &sin, T &cos)
MPlex< T, D1, D2, N > sin(const MPlex< T, D1, D2, N > &a)
Definition: Matriplex.h:622
void align_check(const char *pref, void *adr)
double b
Definition: hdecay.h:120
double a
Definition: hdecay.h:121
float x
long double T
MPlex< T, D1, D2, N > cos(const MPlex< T, D1, D2, N > &a)
Definition: Matriplex.h:628