#ifndef DataFormat_Math_SSEVec_H
#define DataFormat_Math_SSEVec_H
#if !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__) && !defined(__powerpc64__) && \
    !defined(__PPC64__) && !defined(__powerpc__) && !defined(__NVCC__)
#if defined(__GNUC__)
#include <x86intrin.h>
#define CMS_USE_SSE
#ifdef __AVX2__
#define CMS_USE_AVX2
#endif /* __AVX2__ */
#endif /* defined(__GNUC__) */
#endif /* !defined(__arm__) && ... */

#include <cmath>

namespace mathSSE {
  // generic fallback, specialized for the SSE vector types further below
  template <typename T>
  inline T sqrt(T t) {
    return std::sqrt(t);
  }

#ifdef CMS_USE_SSE
  // dot product: lane-wise multiply, then horizontal reduction;
  // the scalar result ends up broadcast to all four lanes
  inline __m128 _mm_dot_ps(__m128 v1, __m128 v2) {
#ifdef __SSE4_1__
    return _mm_dp_ps(v1, v2, 0xff);
#else
    __m128 mul = _mm_mul_ps(v1, v2);
#ifdef __SSE3__
    mul = _mm_hadd_ps(mul, mul);
    return _mm_hadd_ps(mul, mul);
#else
    __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
    mul = _mm_add_ps(mul, swp);
    swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_add_ps(mul, swp);
#endif
#endif
  }
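  // Scalar equivalent (illustration only): every path above computes
  //   s = v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2] + v1[3]*v2[3]
  // and leaves s replicated in all four lanes; the SSE3 path reduces with two
  // horizontal adds, the plain-SSE path with two shuffle+add steps.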
  // cross product (3x3 only; the w lane of the result is not meaningful)
  inline __m128 _mm_cross_ps(__m128 v1, __m128 v2) {
    // lane order below is _MM_SHUFFLE(3, 2, 1, 0)
    __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
    __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));

    __m128 v5 = _mm_mul_ps(v3, v4);

    v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
    v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));

    v3 = _mm_mul_ps(v3, v4);
    // flip the sign of lane 1 (the y component) only
    const __m128i neg = _mm_set_epi32(0, 0, 0x80000000, 0);
    __m128i ret = __m128i(_mm_sub_ps(v5, v3));
    return __m128(_mm_xor_si128(ret, neg));
  }
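  // Sketch of the shuffle algebra: with v1 = (x1,y1,z1,_) and v2 = (x2,y2,z2,_),
  // the two shuffle/multiply rounds produce
  //   v5 - v3 = (y1*z2 - y2*z1,  x1*z2 - x2*z1,  x1*y2 - x2*y1,  0)
  // and the XOR negates lane 1 only, yielding the standard cross product
  //   (y1*z2 - z1*y2,  z1*x2 - x1*z2,  x1*y2 - y1*x2).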
#endif  // CMS_USE_SSE

#ifdef CMS_USE_AVX2
  // dot product for four doubles; result broadcast to all lanes
  inline __m256d _mm256_dot_pd(__m256d v1, __m256d v2) {
    __m256d mul = _mm256_mul_pd(v1, v2);
    mul = _mm256_hadd_pd(mul, mul);
    __m256d tmp = _mm256_permute2f128_pd(mul, mul, 1);
    return _mm256_add_pd(mul, tmp);
  }
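  // (_mm256_hadd_pd adds within each 128-bit half, giving (s01, s01, s23, s23);
  // the permute2f128 swaps the two halves, so the final add broadcasts
  // s01 + s23, the full dot product, to all four lanes)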
  // cross product (3x3 only)
  inline __m256d _mm256_cross_pd(__m256d v1, __m256d v2) {
    __m256d v3 = _mm256_permute2f128_pd(v2, v1, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    __m256d v4 = _mm256_permute2f128_pd(v1, v2, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    __m256d v5 = _mm256_mul_pd(v3, v4);

    v3 = _mm256_permute2f128_pd(v1, v2, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    v4 = _mm256_permute2f128_pd(v2, v1, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    v3 = _mm256_mul_pd(v3, v4);
    __m256d ret = _mm256_sub_pd(v5, v3);
    // flip the sign of lane 1 (the y component) only
    const __m256i neg = _mm256_set_epi64x(0, 0, 0x8000000000000000, 0);
    return __m256d(_mm256_xor_si256(__m256i(ret), neg));
  }
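  // Same scheme as _mm_cross_ps above, with the 128-bit shuffles replaced by
  // permute2f128/permute pairs to move lanes across the two 128-bit halves;
  // the XOR again flips the sign of the y lane only.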
#endif  // CMS_USE_AVX2
  template <typename T>
  union Vec4;

  template <typename T>
  union Vec2 {
    template <typename U>
    Vec2(Vec2<U> v) {
      arr[0] = v[0];
      arr[1] = v[1];
    }
  template <typename T>
  union Vec4 {
    void set(float f1, float f2, float f3, float f4 = 0) {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
  template <typename T>
  union Mask2 {};
  template <typename T>
  union Mask4 {};
  template <>
  union Mask4<float> {
    __m128 vec;
    unsigned int mask[4];
    Mask4() { vec = _mm_setzero_ps(); }
    Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
#ifdef CMS_USE_AVX2
  template <>
  union Mask4<double> {
    __m256d vec;
    unsigned long long mask[4];
    Mask4() { vec = _mm256_setzero_pd(); }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
#else
  template <>
  union Mask4<double> {
    __m128d vec[2];
    unsigned long long mask[4];
    Mask4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
#endif  // CMS_USE_AVX2

  template <>
  union Mask2<double> {
    __m128d vec;
    unsigned long long mask[2];
    Mask2() { vec = _mm_setzero_pd(); }
    Mask2(unsigned long long m1, unsigned long long m2) {
  template <>
  union Vec4<float> {
    typedef __m128 nativeType;
    __m128 vec;
    float __attribute__((aligned(16))) arr[4];
    OldVec<float> o;

    Vec4(__m128 ivec) : vec(ivec) {}

    Vec4(OldVec<float> const &ivec) : o(ivec) {}

    Vec4() { vec = _mm_setzero_ps(); }

    Vec4(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }
    Vec4(Vec2<float> ivec0, Vec2<float> ivec1) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = ivec1.arr[0];
      arr[3] = ivec1.arr[1];
    }
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      vec = _mm_setzero_ps();
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
    void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      Mask4<float> mask(m1, m2, m3, m4);
      vec = mask.vec;
    }
    void set(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }
    void set1(float f1) { vec = _mm_set1_ps(f1); }
    template <int N>
    Vec4 get1() const {
      return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(N, N, N, N));
    }
    float operator[](unsigned int n) const { return arr[n]; }
  template <>
  union Vec2<double> {
    typedef __m128d nativeType;
    __m128d vec;
    double __attribute__((aligned(16))) arr[2];

    Vec2(__m128d ivec) : vec(ivec) {}

    Vec2() { vec = _mm_setzero_pd(); }

    Vec2(double f1, double f2) { vec = _mm_set_pd(f2, f1); }
    void setMask(unsigned long long m1, unsigned long long m2) {
      Mask2<double> mask(m1, m2);
      vec = mask.vec;
    }
    void set(double f1, double f2) { vec = _mm_set_pd(f2, f1); }
    void set1(double f1) { vec = _mm_set1_pd(f1); }
    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N], arr[N]);
    }
    double operator[](unsigned int n) const { return arr[n]; }
  template <>
  union Vec4<double> {
    __m128d vec[2];
    double __attribute__((aligned(16))) arr[4];
    OldVec<double> o;

    Vec4(__m128d ivec[]) {
      vec[0] = ivec[0];
      vec[1] = ivec[1];
    }

    Vec4(__m128d ivec0, __m128d ivec1) {
      vec[0] = ivec0;
      vec[1] = ivec1;
    }

    Vec4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }

    Vec4(Vec4<float> ivec) {
      vec[0] = _mm_cvtps_pd(ivec.vec);
      vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
    }
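    // (_mm_cvtps_pd widens the two low float lanes; the shuffle above moves
    // lanes 2,3 down so the second conversion fills vec[1])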
    explicit Vec4(double f1) { set1(f1); }
    Vec4(double f1, double f2, double f3, double f4 = 0) {
      vec[1] = _mm_setzero_pd();
    Vec4(OldVec<double> const &ivec) : o(ivec) {}
    void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      Mask4<double> mask(m1, m2, m3, m4);
      vec[0] = mask.vec[0];
      vec[1] = mask.vec[1];
    }
    void set(double f1, double f2, double f3, double f4 = 0) {
    void set1(double f1) { vec[0] = vec[1] = _mm_set1_pd(f1); }
    template <int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }
    double operator[](unsigned int n) const { return arr[n]; }
  inline Vec4<float>::Vec4(Vec4<double> ivec) {
    vec = _mm_cvtpd_ps(ivec.vec[0]);
    __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
    vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
  }
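  // (the reverse, narrowing conversion: each _mm_cvtpd_ps packs a pair of
  // doubles into the low two float lanes, and the shuffle merges the two pairs)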
#endif  // CMS_USE_AVX2
#endif  // CMS_USE_SSE
  typedef Vec2<float> Vec2F;
  typedef Vec4<float> Vec4F;
  typedef Vec4<float> Vec3F;
  typedef Vec2<double> Vec2D;
  typedef Vec4<double> Vec3D;
  typedef Vec4<double> Vec4D;

  template <typename T>
  struct As3D {
    Vec4<T> const &v;
    As3D(Vec4<T> const &iv) : v(iv) {}
  };

  template <typename T>
  inline As3D<T> as3D(Vec4<T> const &v) {
    return v;
  }
}  // namespace mathSSE
inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_movemask_ps(_mm_cmpeq_ps(a.vec, b.vec)) == 0xf;
}
inline mathSSE::Vec4F operator-(mathSSE::Vec4F a) {
  const __m128 neg = _mm_set_ps(-0.0, -0.0, -0.0, -0.0);
  return _mm_xor_ps(a.vec, neg);
}
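// (XOR with -0.0 flips only the sign bit of each lane, leaving exponent and
// mantissa untouched; the double-precision negations below use the same trick)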
inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_dot_ps;
  float s;
  _mm_store_ss(&s, _mm_dot_ps(a.vec, b.vec));
  return s;
}
inline mathSSE::Vec4F cross(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_cross_ps;
  return _mm_cross_ps(a.vec, b.vec);
}
  mul = hadd(mul, mul);
  __m128 swp = _mm_shuffle_ps(mul.vec, mul.vec, _MM_SHUFFLE(2, 3, 0, 1));
  mul.vec = _mm_add_ps(mul.vec, swp);
  _mm_store_ss(&s, mul.vec);
inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b) { return a.arr[0] * b.arr[0] + a.arr[1] * b.arr[1]; }
inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) { return a.arr[0] * b.arr[1] - a.arr[1] * b.arr[0]; }
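// (the 2D "cross" is the scalar determinant a.x*b.y - a.y*b.x, i.e. the
// z component of the corresponding 3D cross product)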
    vec = _mm256_castpd256_pd128(v4.vec);
    arr[0] = ivec.arr[0];
    arr[1] = ivec.arr[1];
inline mathSSE::Vec2D operator-(mathSSE::Vec2D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return _mm_xor_pd(a.vec, neg);
}
inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_mul_pd(a.vec, b.vec);
#ifdef __SSE3__
  res = _mm_hadd_pd(res, res);
#else
  res = _mm_add_sd(_mm_shuffle_pd(res, res, 1), res);
#endif
  double s;
  _mm_store_sd(&s, res);
  return s;
}
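// (res starts as (a0*b0, a1*b1); both reduction paths leave the lane sum in
// lane 0, which _mm_store_sd extracts)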
inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_shuffle_pd(b.vec, b.vec, 1);
  res = _mm_mul_pd(a.vec, res);
  res = _mm_sub_sd(res, _mm_shuffle_pd(res, res, 1));
  double s;
  _mm_store_sd(&s, res);
  return s;
}
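// (the first shuffle swaps b to (b1, b0), the multiply gives (a0*b1, a1*b0),
// and sub_sd leaves a0*b1 - a1*b0 in lane 0)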
inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0], b.vec[0])) == 0x3 &&
         _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1], b.vec[1])) == 0x3;
}
inline mathSSE::Vec4D operator-(mathSSE::Vec4D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return mathSSE::Vec4D(_mm_xor_pd(a.vec[0], neg), _mm_xor_pd(a.vec[1], neg));
}
  using namespace mathSSE;

  using namespace mathSSE;

  __m128d res = _mm_set1_pd(a);

  __m128d res = _mm_set1_pd(a);
inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  __m128d res = _mm_add_sd(_mm_mul_pd(a.vec[0], b.vec[0]), _mm_mul_sd(a.vec[1], b.vec[1]));
  res = _mm_add_sd(_mm_unpackhi_pd(res, res), res);
  double s;
  _mm_store_sd(&s, res);
  return s;
}
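// (layout: vec[0] = (x, y), vec[1] = (z, w); _mm_mul_sd multiplies only the
// z lane of the upper pair, so w never enters the sum and this is a
// 3-component dot product)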
inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  const __m128i neg = _mm_set_epi64x(0, 0x8000000000000000);
  // a.z * b.xy and b.z * a.xy
  __m128d l1 = _mm_mul_pd(_mm_unpacklo_pd(a.vec[1], a.vec[1]), b.vec[0]);
  __m128d l2 = _mm_mul_pd(_mm_unpacklo_pd(b.vec[1], b.vec[1]), a.vec[0]);
  __m128d m1 = _mm_sub_pd(l1, l2);
  m1 = _mm_shuffle_pd(m1, m1, 1);                 // swap the two lanes
  m1 = __m128d(_mm_xor_si128(__m128i(m1), neg));  // flip the sign of lane 0
  // a.x * b.y - a.y * b.x
  l1 = _mm_mul_pd(a.vec[0], _mm_shuffle_pd(b.vec[0], b.vec[0], 1));
  __m128d m2 = _mm_sub_sd(l1, _mm_unpackhi_pd(l1, l1));
  return mathSSE::Vec4D(m1, m2);
}
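// (after the swap and sign flip, m1 holds the x and y components
// (a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z) and m2 holds the z component
// a.x*b.y - a.y*b.x in its low lane)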
inline double dotxy(mathSSE::Vec4D a, mathSSE::Vec4D b) { return dot(a.xy(), b.xy()); }
#endif  // CMS_USE_AVX2
namespace mathSSE {
  template <>
  inline Vec4F sqrt(Vec4F v) {
    return _mm_sqrt_ps(v.vec);
  }
  template <>
  inline Vec2D sqrt(Vec2D v) {
    return _mm_sqrt_pd(v.vec);
  }
#ifdef CMS_USE_AVX2
  template <>
  inline Vec4D sqrt(Vec4D v) {
    return _mm256_sqrt_pd(v.vec);
  }
#else
  template <>
  inline Vec4D sqrt(Vec4D v) {
    return Vec4D(_mm_sqrt_pd(v.vec[0]), _mm_sqrt_pd(v.vec[1]));
  }
#endif
}  // namespace mathSSE
#endif  // CMS_USE_SSE
std::ostream &operator<<(std::ostream &out, mathSSE::As3D<float> const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::As3D<double> const &v);
#endif  // DataFormat_Math_SSEVec_H