#ifndef RecoTracker_MkFitCore_src_Matriplex_MatriplexCommon_h
#define RecoTracker_MkFitCore_src_Matriplex_MatriplexCommon_h

// Common preamble for the Matriplex code: selects the memory alignment and,
// when requested through MPLEX_USE_INTRINSICS, a set of thin macros over the
// x86 vector intrinsics for the widest instruction set enabled at compile time.

//==============================================================================
// Intrinsics -- preamble
//==============================================================================

#if defined(__x86_64__)
#include "immintrin.h"
// NOTE(review): the listing this header was recovered from skips several lines
// after this include; a non-x86 fallback (#else branch) may belong here --
// confirm against the upstream file.
#endif

// Alignment, in bytes, matching the widest vector register that is enabled.
#if defined(__AVX512F__)
#define MPLEX_ALIGN 64
#elif defined(__AVX__) || defined(__AVX2__)
#define MPLEX_ALIGN 32
#elif defined(__SSE3__)
#define MPLEX_ALIGN 16
#else
#define MPLEX_ALIGN 32
#endif

#if defined(MPLEX_USE_INTRINSICS)

#if defined(__AVX__) || defined(__AVX512F__)
#define MPLEX_INTRINSICS
#endif

// The LD / ST macros below deliberately pick up `N` and `n` from the scope in
// which they are expanded. Their own arguments are parenthesized so that
// expression arguments (e.g. LD(a, i + 1)) index the intended element.
//
// GATHER_IDX_LOAD expands to a complete declaration statement, including the
// trailing semicolon.

#if defined(__AVX512F__)

typedef __m512 IntrVec_t;
#define MPLEX_INTRINSICS_WIDTH_BYTES 64
#define MPLEX_INTRINSICS_WIDTH_BITS 512
#define AVX512_INTRINSICS
#define GATHER_INTRINSICS
#define GATHER_IDX_LOAD(name, arr) __m512i name = _mm512_load_epi32(arr);

#define LD(a, i) _mm512_load_ps(&(a)[(i) * N + n])
#define ST(a, i, r) _mm512_store_ps(&(a)[(i) * N + n], (r))
#define ADD(a, b) _mm512_add_ps(a, b)
#define MUL(a, b) _mm512_mul_ps(a, b)
#define FMA(a, b, v) _mm512_fmadd_ps(a, b, v)

#elif defined(__AVX2__) && defined(__FMA__)

typedef __m256 IntrVec_t;
#define MPLEX_INTRINSICS_WIDTH_BYTES 32
#define MPLEX_INTRINSICS_WIDTH_BITS 256
#define AVX2_INTRINSICS
#define GATHER_INTRINSICS
#define GATHER_IDX_LOAD(name, arr) __m256i name = _mm256_load_si256(reinterpret_cast<const __m256i *>(arr));

#define LD(a, i) _mm256_load_ps(&(a)[(i) * N + n])
#define ST(a, i, r) _mm256_store_ps(&(a)[(i) * N + n], (r))
#define ADD(a, b) _mm256_add_ps(a, b)
#define MUL(a, b) _mm256_mul_ps(a, b)
#define FMA(a, b, v) _mm256_fmadd_ps(a, b, v)

#elif defined(__AVX__)

typedef __m256 IntrVec_t;
#define MPLEX_INTRINSICS_WIDTH_BYTES 32
#define MPLEX_INTRINSICS_WIDTH_BITS 256
#define AVX_INTRINSICS

#define LD(a, i) _mm256_load_ps(&(a)[(i) * N + n])
#define ST(a, i, r) _mm256_store_ps(&(a)[(i) * N + n], (r))
#define ADD(a, b) _mm256_add_ps(a, b)
#define MUL(a, b) _mm256_mul_ps(a, b)

// This branch does not use a fused instruction: a * b + v is computed as a
// separate multiply followed by an add.
inline __m256 FMA(const __m256 &a, const __m256 &b, const __m256 &v) {
  __m256 temp = _mm256_mul_ps(a, b);
  return _mm256_add_ps(temp, v);
}

#endif  // ISA selection

#endif  // MPLEX_USE_INTRINSICS

// ASSUME_ALIGNED(a, b): tell the compiler that pointer `a` is aligned to `b`
// bytes. The non-Intel form reassigns `a`, so `a` must be a modifiable lvalue.
#ifdef __INTEL_COMPILER
#define ASSUME_ALIGNED(a, b) __assume_aligned(a, b)
#else
#define ASSUME_ALIGNED(a, b) a = static_cast<decltype(a)>(__builtin_assume_aligned(a, b))
#endif

// NOTE(review): no enclosing namespace is visible in the recovered listing, so
// the declarations below are left at the scope they appeared in -- confirm
// against the upstream file.

// Declaration only; the definition is not part of this header.
void align_check(const char *pref, void *adr);

// Low-order Taylor approximation of sine and cosine around zero:
//   cos(x) ~ 1 - x^2/2 + x^4/24
//   sin(x) ~ x - x^3/6
// The reciprocal factorials are spelled out as literals (0.5, 1/24, 1/6).
//
//   x   - input angle
//   sin - receives the sine approximation
//   cos - receives the cosine approximation
template <typename T>
void sincos4(const T x, T &sin, T &cos) {
  // NOTE(review): the definition of x2 was missing from the recovered listing;
  // x * x is the value implied by the Taylor coefficients used below.
  const T x2 = x * x;

  cos = T(1.0) - T(0.5) * x2 + T(0.0416666666666666667) * x2 * x2;
  sin = x - T(0.166666666666666667) * x * x2;
}

// Cross-reference entries that trailed the recovered listing. They depend on
// MPlex, D1, D2 and N, none of which are declared in this header; presumably
// they are declared in a sibling header -- TODO confirm.
//   MPlex< T, D1, D2, N > sin(const MPlex< T, D1, D2, N > &a)
//   MPlex< T, D1, D2, N > cos(const MPlex< T, D1, D2, N > &a)

#endif