#ifndef DataFormat_Math_SSEVec_H
#define DataFormat_Math_SSEVec_H

#if !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__) && !defined(__powerpc64__) && \
    !defined(__PPC64__) && !defined(__powerpc__) && !defined(__NVCC__) && !defined(__riscv)
#if defined(__GNUC__)
#include <x86intrin.h>
#define CMS_USE_SSE
#ifdef __AVX2__
#define CMS_USE_AVX2
#endif
#endif
#endif

#include <cmath>

namespace mathSSE {

#ifdef CMS_USE_SSE
  // Dot product of two 4-float vectors; the result is broadcast to all four lanes.
  inline __m128 _mm_dot_ps(__m128 v1, __m128 v2) {
#ifdef __SSE4_1__
    return _mm_dp_ps(v1, v2, 0xff);
#else
    __m128 mul = _mm_mul_ps(v1, v2);
#ifdef __SSE3__
    // two horizontal adds reduce the four products to one sum in every lane
    mul = _mm_hadd_ps(mul, mul);
    return _mm_hadd_ps(mul, mul);
#else
    // plain SSE2: swap the halves and add, then swap within pairs and add again
    __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
    mul = _mm_add_ps(mul, swp);
    swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_add_ps(mul, swp);
#endif
#endif
  }
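  // Usage sketch (illustrative, not part of the original header): with
  // a = (1,2,3,4) and b = (4,3,2,1), every lane holds 1*4 + 2*3 + 3*2 + 4*1 = 20.
  //
  //   __m128 a = _mm_set_ps(4.f, 3.f, 2.f, 1.f);    // lanes (x,y,z,w) = (1,2,3,4)
  //   __m128 b = _mm_set_ps(1.f, 2.f, 3.f, 4.f);    // lanes (x,y,z,w) = (4,3,2,1)
  //   float s;
  //   _mm_store_ss(&s, mathSSE::_mm_dot_ps(a, b));  // s == 20.f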
  // Cross product of the x,y,z lanes of two 4-float vectors (w is ignored).
  inline __m128 _mm_cross_ps(__m128 v1, __m128 v2) {
    // v3 = (z2, z2, x1, w1), v4 = (y1, x1, y2, w2)
    __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
    __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));

    __m128 v5 = _mm_mul_ps(v3, v4);

    // v3 = (z1, z1, x2, w2), v4 = (y2, x2, y1, w1)
    v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
    v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));

    v3 = _mm_mul_ps(v3, v4);
    // v5 - v3 = (y1*z2 - y2*z1, x1*z2 - x2*z1, x1*y2 - x2*y1, ...);
    // XORing the sign bit of the y lane turns the middle term into z1*x2 - z2*x1
    const __m128i neg = _mm_set_epi32(0, 0, 0x80000000, 0);
    __m128i ret = __m128i(_mm_sub_ps(v5, v3));
    return __m128(_mm_xor_si128(ret, neg));
  }
#endif  // CMS_USE_SSE
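  // Sanity check (illustrative, not part of the original header): x cross y == z.
  //
  //   __m128 x = _mm_set_ps(0.f, 0.f, 0.f, 1.f);  // (1,0,0)
  //   __m128 y = _mm_set_ps(0.f, 0.f, 1.f, 0.f);  // (0,1,0)
  //   __m128 z = mathSSE::_mm_cross_ps(x, y);     // lanes (0,0,1,0)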
#ifdef CMS_USE_AVX2
  // Dot product of two 4-double vectors; the result is broadcast to all four lanes.
  inline __m256d _mm256_dot_pd(__m256d v1, __m256d v2) {
    __m256d mul = _mm256_mul_pd(v1, v2);
    mul = _mm256_hadd_pd(mul, mul);                     // (x+y, x+y, z+w, z+w)
    __m256d tmp = _mm256_permute2f128_pd(mul, mul, 1);  // swap the 128-bit halves
    return _mm256_add_pd(mul, tmp);
  }
  // Cross product of the x,y,z lanes of two 4-double vectors (w is ignored).
  inline __m256d _mm256_cross_pd(__m256d v1, __m256d v2) {
    __m256d v3 = _mm256_permute2f128_pd(v2, v1, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    __m256d v4 = _mm256_permute2f128_pd(v1, v2, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    __m256d v5 = _mm256_mul_pd(v3, v4);

    v3 = _mm256_permute2f128_pd(v1, v2, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    v4 = _mm256_permute2f128_pd(v2, v1, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    v3 = _mm256_mul_pd(v3, v4);
    __m256d ret = _mm256_sub_pd(v5, v3);
    // flip the sign of the y lane, as in the float version above
    const __m256i neg = _mm256_set_epi64x(0, 0, 0x8000000000000000, 0);
    return __m256d(_mm256_xor_si256(__m256i(ret), neg));
  }
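  // Note on the AVX shuffles above (illustrative summary, not from the original
  // source): _mm256_permute2f128_pd(p, q, 0x21) yields (high half of p, low half
  // of q), and (2 << 4) == 0x20 yields (low half of p, low half of q);
  // _mm256_permute_pd(v, 0) duplicates the even lane within each 128-bit half,
  // while _mm256_permute_pd(v, 5) swaps the two lanes within each half. Together
  // they build the same shuffled operands as the float cross product.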
#endif  // CMS_USE_AVX2

  // Plain-array fallback storage; specialized with SSE/AVX registers below.
  template <typename T>
  struct OldVec {
    T theX;
    T theY;
    T theZ;
    T theW;
  } __attribute__((aligned(16)));

  template <typename T>
  union Vec4;

  template <typename T>
  union Vec2 {
    Vec2() {
      arr[0] = 0;
      arr[1] = 0;
    }
    Vec2(T f1, T f2) {
      arr[0] = f1;
      arr[1] = f2;
    }

    // converting copy from a Vec2 of another element type
    template <typename U>
    Vec2(Vec2<U> v) {
      arr[0] = v[0];
      arr[1] = v[1];
    }

    T &operator[](unsigned int n) { return arr[n]; }
    T operator[](unsigned int n) const { return arr[n]; }

    T arr[2];
  };

  // Generic (non-SSE) Vec4: four elements in a 16-byte-aligned array.
  template <typename T>
  union Vec4 {
    Vec4() {
      arr[0] = 0;
      arr[1] = 0;
      arr[2] = 0;
      arr[3] = 0;
    }
    Vec4(float f1, float f2, float f3, float f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }
    void set(float f1, float f2, float f3, float f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }

    // broadcast element N to all four lanes
    template <int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }

    Vec2<T> xy() const { return Vec2<T>(arr[0], arr[1]); }
    Vec2<T> zw() const { return Vec2<T>(arr[2], arr[3]); }

    T __attribute__((aligned(16))) arr[4];
    OldVec<T> o;
  };
#ifdef CMS_USE_SSE

  template <typename T>
  union Mask2 {};
  template <typename T>
  union Mask4 {};

  template <>
  union Mask4<float> {
    __m128 vec;
    unsigned int mask[4];
    Mask4() { vec = _mm_setzero_ps(); }
    Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
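  // Usage sketch (illustrative, not part of the original header): a mask holding
  // only the sign bit of lane 0 negates the x component via XOR, the same trick
  // the cross products above use; v is an assumed __m128 value.
  //
  //   mathSSE::Mask4<float> signX(0x80000000, 0, 0, 0);
  //   __m128 flipped = _mm_xor_ps(v, signX.vec);  // v with the sign of lane 0 flipped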
#ifdef CMS_USE_AVX2
  template <>
  union Mask4<double> {
    __m256d vec;
    unsigned long long mask[4];
    Mask4() { vec = _mm256_setzero_pd(); }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#else
  template <>
  union Mask4<double> {
    __m128d vec[2];
    unsigned long long mask[4];
    Mask4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#endif
  template <>
  union Mask2<double> {
    __m128d vec;
    unsigned long long mask[2];
    Mask2() { vec = _mm_setzero_pd(); }
    Mask2(unsigned long long m1, unsigned long long m2) {
      mask[0] = m1;
      mask[1] = m2;
    }
  };
  template <>
  union Vec4<float> {
    typedef __m128 nativeType;
    __m128 vec;
    float __attribute__((aligned(16))) arr[4];
    OldVec<float> o;

    Vec4(__m128 ivec) : vec(ivec) {}

    Vec4(OldVec<float> const &ivec) : o(ivec) {}

    Vec4() { vec = _mm_setzero_ps(); }

    inline Vec4(Vec4<double> ivec);  // defined after Vec4<double>

    Vec4(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }
    Vec4(Vec2<float> ivec0, Vec2<float> ivec1) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = ivec1.arr[0];
      arr[3] = ivec1.arr[1];
    }

    Vec4(Vec2<float> ivec0, float f3, float f4 = 0) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<float> ivec0) {
      vec = _mm_setzero_ps();
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
    }
    // fill the register from four bit masks (for sign-flip and select tricks)
    void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      Mask4<float> mask(m1, m2, m3, m4);
      vec = mask.vec;
    }

    void set(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }

    void set1(float f1) { vec = _mm_set1_ps(f1); }

    // broadcast element N to all four lanes
    template <int N>
    Vec4 get1() const {
      return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(N, N, N, N));
    }

    float &operator[](unsigned int n) { return arr[n]; }
    float operator[](unsigned int n) const { return arr[n]; }

    Vec2<float> xy() const { return Vec2<float>(arr[0], arr[1]); }
    Vec2<float> zw() const { return Vec2<float>(arr[2], arr[3]); }
  };
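  // Usage sketch (illustrative, not part of the original header):
  //
  //   mathSSE::Vec4<float> v(1.f, 2.f, 3.f);  // w defaults to 0
  //   mathSSE::Vec4<float> y = v.get1<1>();   // (2,2,2,2) in all lanes
  //   float z = v[2];                         // 3.f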
  template <>
  union Vec2<double> {
    typedef __m128d nativeType;
    __m128d vec;
    double __attribute__((aligned(16))) arr[2];

    Vec2(__m128d ivec) : vec(ivec) {}

    Vec2() { vec = _mm_setzero_pd(); }

    Vec2(double f1, double f2) { vec = _mm_set_pd(f2, f1); }

    inline Vec2(Vec4<double> v4);  // defined below

    // fill the register from two bit masks
    void setMask(unsigned long long m1, unsigned long long m2) {
      Mask2<double> mask(m1, m2);
      vec = mask.vec;
    }

    void set(double f1, double f2) { vec = _mm_set_pd(f2, f1); }

    void set1(double f1) { vec = _mm_set1_pd(f1); }

    // broadcast element N to both lanes
    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N], arr[N]);
    }

    double &operator[](unsigned int n) { return arr[n]; }
    double operator[](unsigned int n) const { return arr[n]; }
  };
#ifdef CMS_USE_AVX2
}  // namespace mathSSE
#include "private/AVXVec.h"

namespace mathSSE {
#else
  template <>
  union Vec4<double> {
    __m128d vec[2];
    double __attribute__((aligned(16))) arr[4];
    OldVec<double> o;

    Vec4(__m128d ivec[]) {
      vec[0] = ivec[0];
      vec[1] = ivec[1];
    }

    Vec4(__m128d ivec0, __m128d ivec1) {
      vec[0] = ivec0;
      vec[1] = ivec1;
    }

    Vec4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }

    // widen a float vector: convert the low pair, then the swapped high pair
    Vec4(Vec4<float> ivec) {
      vec[0] = _mm_cvtps_pd(ivec.vec);
      vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
    }
    Vec4(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<double> ivec0) {
      vec[0] = ivec0.vec;
      vec[1] = _mm_setzero_pd();
    }

    Vec4(OldVec<double> const &ivec) : o(ivec) {}
    // fill both halves from four bit masks
    void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      Mask4<double> mask(m1, m2, m3, m4);
      vec[0] = mask.vec[0];
      vec[1] = mask.vec[1];
    }

    void set(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }

    void set1(double f1) { vec[0] = vec[1] = _mm_set1_pd(f1); }
    // broadcast element N to all four lanes
    template <int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }

    double &operator[](unsigned int n) { return arr[n]; }
    double operator[](unsigned int n) const { return arr[n]; }

    Vec2<double> xy() const { return vec[0]; }
    Vec2<double> zw() const { return vec[1]; }
  };

  // narrow a double vector back to float: convert each half and merge the results
  inline Vec4<float>::Vec4(Vec4<double> ivec) {
    vec = _mm_cvtpd_ps(ivec.vec[0]);
    __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
    vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
  }
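  // Conversion sketch (illustrative, not part of the original header):
  // widening and narrowing round-trip.
  //
  //   mathSSE::Vec4<float> f(1.f, 2.f, 3.f, 4.f);
  //   mathSSE::Vec4<double> d(f);  // two _mm_cvtps_pd conversions
  //   mathSSE::Vec4<float> g(d);   // back via _mm_cvtpd_ps + _mm_shuffle_ps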
#endif  // CMS_USE_AVX2
#endif  // CMS_USE_SSE

  typedef Vec2<float> Vec2F;
  typedef Vec4<float> Vec4F;
  typedef Vec4<float> Vec3F;
  typedef Vec2<double> Vec2D;
  typedef Vec4<double> Vec3D;
  typedef Vec4<double> Vec4D;

  // view a Vec4 as a 3-vector, e.g. for printing
  template <typename T>
  struct As3D {
    Vec4<T> const &v;
    As3D(Vec4<T> const &iv) : v(iv) {}
  };
  template <typename V>
  inline As3D<V> as3D(V const &v) {
    return v;
  }

}  // namespace mathSSE

#ifdef CMS_USE_SSE

// float operations
inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_movemask_ps(_mm_cmpeq_ps(a.vec, b.vec)) == 0xf;
}
inline mathSSE::Vec4F operator-(mathSSE::Vec4F a) {
  // XOR with -0.0 flips the sign bit of each lane
  const __m128 neg = _mm_set_ps(-0.0, -0.0, -0.0, -0.0);
  return _mm_xor_ps(a.vec, neg);
}
inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_dot_ps;
  float s;
  _mm_store_ss(&s, _mm_dot_ps(a.vec, b.vec));
  return s;
}
inline mathSSE::Vec4F cross(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_cross_ps;
  return _mm_cross_ps(a.vec, b.vec);
}
// dot product restricted to the x and y lanes
inline float dotxy(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  mathSSE::Vec4F mul = a * b;
#ifdef __SSE3__
  mul = hadd(mul, mul);
#else
  __m128 swp = _mm_shuffle_ps(mul.vec, mul.vec, _MM_SHUFFLE(2, 3, 0, 1));
  mul.vec = _mm_add_ps(mul.vec, swp);
#endif
  float s;
  _mm_store_ss(&s, mul.vec);
  return s;
}
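// Worked example (illustrative, not part of the original header): with
// a = (1,2,3) and b = (4,5,6), w lanes zero,
//   dot(a, b)   == 1*4 + 2*5 + 3*6 == 32
//   dotxy(a, b) == 1*4 + 2*5       == 14
//   cross(a, b) == (2*6 - 3*5, 3*4 - 1*6, 1*5 - 2*4) == (-3, 6, -3)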
// double operations, 2-vectors

#ifdef CMS_USE_AVX2
// AVX2: the low 128 bits of the 256-bit register hold the x,y pair
inline mathSSE::Vec2D::Vec2(Vec4D v4) { vec = _mm256_castpd256_pd128(v4.vec); }
#else
inline mathSSE::Vec2D::Vec2(Vec4D ivec) {
  arr[0] = ivec.arr[0];
  arr[1] = ivec.arr[1];
}
#endif
inline mathSSE::Vec2D operator-(mathSSE::Vec2D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return _mm_xor_pd(a.vec, neg);
}
inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_mul_pd(a.vec, b.vec);
#ifdef __SSE3__
  res = _mm_hadd_pd(res, res);
#else
  res = _mm_add_sd(_mm_shuffle_pd(res, res, 1), res);
#endif
  double s;
  _mm_store_sd(&s, res);
  return s;
}
// 2D cross product: the scalar a.x*b.y - a.y*b.x
inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_shuffle_pd(b.vec, b.vec, 1);  // (b.y, b.x)
  res = _mm_mul_pd(a.vec, res);                   // (a.x*b.y, a.y*b.x)
  res = _mm_sub_sd(res, _mm_shuffle_pd(res, res, 1));
  double s;
  _mm_store_sd(&s, res);
  return s;
}
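// Worked example (illustrative): cross(Vec2D(1., 2.), Vec2D(3., 4.)) == 1*4 - 2*3 == -2,
// i.e. the determinant of the 2x2 matrix with rows a and b.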
// double operations, 4-vectors (SSE2 pair representation)

#ifndef CMS_USE_AVX2

inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0], b.vec[0])) == 0x3 &&
         _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1], b.vec[1])) == 0x3;
}
inline mathSSE::Vec4D operator-(mathSSE::Vec4D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return mathSSE::Vec4D(_mm_xor_pd(a.vec[0], neg), _mm_xor_pd(a.vec[1], neg));
}
inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_add_pd(a.vec[0], b.vec[0]), _mm_add_pd(a.vec[1], b.vec[1]));
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_sub_pd(a.vec[0], b.vec[0]), _mm_sub_pd(a.vec[1], b.vec[1]));
}

// element-wise product
inline mathSSE::Vec4D operator*(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_mul_pd(a.vec[0], b.vec[0]), _mm_mul_pd(a.vec[1], b.vec[1]));
}

// element-wise quotient
inline mathSSE::Vec4D operator/(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return mathSSE::Vec4D(_mm_div_pd(a.vec[0], b.vec[0]), _mm_div_pd(a.vec[1], b.vec[1]));
}
inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}

inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}
// 3D dot product: w never enters the sum because only the low lane of the
// second pair is multiplied and accumulated
inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  __m128d res = _mm_add_sd(_mm_mul_pd(a.vec[0], b.vec[0]), _mm_mul_sd(a.vec[1], b.vec[1]));
  res = _mm_add_sd(_mm_unpackhi_pd(res, res), res);
  double s;
  _mm_store_sd(&s, res);
  return s;
}
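// Worked example (illustrative): the w lanes are ignored, so
// dot(Vec4D(1., 2., 3., 99.), Vec4D(4., 5., 6., 99.)) == 1*4 + 2*5 + 3*6 == 32.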
inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  const __m128i neg = _mm_set_epi64x(0, 0x8000000000000000);
  // a.z * (b.x, b.y)
  __m128d l1 = _mm_mul_pd(_mm_unpacklo_pd(a.vec[1], a.vec[1]), b.vec[0]);
  // b.z * (a.x, a.y)
  __m128d l2 = _mm_mul_pd(_mm_unpacklo_pd(b.vec[1], b.vec[1]), a.vec[0]);
  __m128d m1 = _mm_sub_pd(l1, l2);                // (a.z*b.x - b.z*a.x, a.z*b.y - b.z*a.y)
  m1 = _mm_shuffle_pd(m1, m1, 1);                 // swap the two lanes
  m1 = __m128d(_mm_xor_si128(__m128i(m1), neg));  // change the sign of the first lane
  // (a.x*b.y, a.y*b.x)
  l1 = _mm_mul_pd(a.vec[0], _mm_shuffle_pd(b.vec[0], b.vec[0], 1));
  __m128d m2 = _mm_sub_sd(l1, _mm_unpackhi_pd(l1, l1));  // a.x*b.y - a.y*b.x
  return mathSSE::Vec4D(m1, m2);
}
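// Sanity check (illustrative): cross(Vec4D(1., 0., 0.), Vec4D(0., 1., 0.)) has
// x,y,z lanes (0, 0, 1); the w lane is not meaningful.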
inline double dotxy(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return dot(a.xy(), b.xy());
}
#endif  // CMS_USE_AVX2

// element-wise square root
namespace mathSSE {
  // generic scalar fallback, specialized below for the vector types
  template <typename T>
  inline T sqrt(T t) {
    return std::sqrt(t);
  }
  template <>
  inline Vec4F sqrt(Vec4F v) {
    return _mm_sqrt_ps(v.vec);
  }
  template <>
  inline Vec2D sqrt(Vec2D v) {
    return _mm_sqrt_pd(v.vec);
  }
#ifdef CMS_USE_AVX2
  template <>
  inline Vec4D sqrt(Vec4D v) {
    return _mm256_sqrt_pd(v.vec);
  }
#else
  template <>
  inline Vec4D sqrt(Vec4D v) {
    return Vec4D(_mm_sqrt_pd(v.vec[0]), _mm_sqrt_pd(v.vec[1]));
  }
#endif
}  // namespace mathSSE
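// Usage sketch (illustrative, not part of the original header):
//
//   mathSSE::Vec4F v(1.f, 4.f, 9.f, 16.f);
//   mathSSE::Vec4F r = mathSSE::sqrt(v);  // (1, 2, 3, 4)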
#endif  // CMS_USE_SSE

#include <iosfwd>

std::ostream &operator<<(std::ostream &out, mathSSE::As3D<float> const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::As3D<double> const &v);

#endif  // DataFormat_Math_SSEVec_H