#ifndef DataFormat_Math_SSEVec_H
#define DataFormat_Math_SSEVec_H

#if !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__)
#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ > 4)
#include <x86intrin.h>
#define CMS_USE_SSE
#ifdef __AVX__
#define CMS_USE_AVX
#endif
#else
#ifdef __SSE2__
#define CMS_USE_SSE
#include <mmintrin.h>
#include <emmintrin.h>
#endif
#ifdef __SSE3__
#include <pmmintrin.h>
#endif
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif
#endif
#endif
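// SIMD small-vector utilities: Vec2<T>/Vec4<T> unions, their float and
// double specializations, and dot/cross/elementwise operations built on
// SSE (and, where available, AVX) intrinsics.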
#ifdef CMS_USE_SSE
namespace mathSSE {

  // dot product: returns the 4-component dot of v1 and v2 in all lanes
  inline __m128 _mm_dot_ps(__m128 v1, __m128 v2) {
#ifdef __SSE4_1__
    return _mm_dp_ps(v1, v2, 0xff);
#else
    __m128 mul = _mm_mul_ps(v1, v2);
#ifdef __SSE3__
    mul = _mm_hadd_ps(mul, mul);
    return _mm_hadd_ps(mul, mul);
#else
    __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
    mul = _mm_add_ps(mul, swp);
    swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_add_ps(mul, swp);
#endif
#endif
  }
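  // Illustrative use (values are an example only): every lane of the
  // result carries the same 4-component dot product, so one scalar store
  // suffices.
  //   __m128 a = _mm_set_ps(4.f, 3.f, 2.f, 1.f);  // (1, 2, 3, 4)
  //   __m128 b = _mm_set_ps(8.f, 7.f, 6.f, 5.f);  // (5, 6, 7, 8)
  //   float d;
  //   _mm_store_ss(&d, _mm_dot_ps(a, b));         // d == 70.f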
  // cross product (3D only; the w lane comes out zero)
  inline __m128 _mm_cross_ps(__m128 v1, __m128 v2) {
    // lane order in _MM_SHUFFLE is (3, 2, 1, 0)
    __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
    __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));

    __m128 v5 = _mm_mul_ps(v3, v4);

    v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
    v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));

    v3 = _mm_mul_ps(v3, v4);
    const __m128 neg = _mm_set_ps(0.0f, 0.0f, -0.0f, 0.0f);
    return _mm_xor_ps(_mm_sub_ps(v5, v3), neg);
  }
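  // Why the final xor: the two shuffle/multiply pairs leave
  // (y1*z2 - y2*z1, x1*z2 - x2*z1, x1*y2 - x2*y1, 0), whose middle lane
  // is the negative of the usual z1*x2 - x1*z2. Flipping just that
  // lane's sign bit with -0.0f is cheaper than another multiply.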
#ifdef CMS_USE_AVX
  // dot product: returns the 4-component dot of v1 and v2 in all lanes
  inline __m256d _mm256_dot_pd(__m256d v1, __m256d v2) {
    __m256d mul = _mm256_mul_pd(v1, v2);
    mul = _mm256_hadd_pd(mul, mul);
    __m256d tmp = _mm256_permute2f128_pd(mul, mul, 1);
    return _mm256_add_pd(mul, tmp);
  }
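  // _mm256_hadd_pd adds pairs only within each 128-bit half, producing
  // (m0+m1, m0+m1, m2+m3, m2+m3); _mm256_permute2f128_pd swaps the two
  // halves, so the final add broadcasts the full sum into every lane.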
  // cross product (3D only; the w lane comes out zero)
  inline __m256d _mm256_cross_pd(__m256d v1, __m256d v2) {
    __m256d v3 = _mm256_permute2f128_pd(v2, v1, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    __m256d v4 = _mm256_permute2f128_pd(v1, v2, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    __m256d v5 = _mm256_mul_pd(v3, v4);

    v3 = _mm256_permute2f128_pd(v1, v2, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    v4 = _mm256_permute2f128_pd(v2, v1, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    v3 = _mm256_mul_pd(v3, v4);
    const __m256d neg = _mm256_set_pd(0.0, 0.0, -0.0, 0.0);
    return _mm256_xor_pd(_mm256_sub_pd(v5, v3), neg);
  }
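  // Same scheme as _mm_cross_ps above, but AVX in-lane shuffles cannot
  // move a double across the 128-bit boundary, so each operand reordering
  // needs a cross-lane _mm256_permute2f128_pd followed by an in-lane
  // _mm256_permute_pd.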
#endif // CMS_USE_AVX
} // namespace mathSSE
#endif // CMS_USE_SSE

namespace mathSSE {
  template<typename T> union Vec4;

  template<typename T>
  union Vec2 {
    Vec2() { arr[0] = 0; arr[1] = 0; }
    Vec2(T f1, T f2) { arr[0] = f1; arr[1] = f2; }

    template<typename V>
    Vec2(V v) {
      arr[0] = v[0]; arr[1] = v[1];
    }

    inline Vec2(Vec4<T> v4);

    T & operator[](unsigned int n) { return arr[n]; }
    T operator[](unsigned int n) const { return arr[n]; }

    T arr[2];
  };
  template<typename T>
  union Vec4 {
    Vec4() {
      arr[0] = 0; arr[1] = 0; arr[2] = 0; arr[3] = 0;
    }
    Vec4(float f1, float f2, float f3, float f4 = 0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3] = f4;
    }
    void set(float f1, float f2, float f3, float f4 = 0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3] = f4;
    }
    void set1(float f1) {
      arr[0] = f1; arr[1] = f1; arr[2] = f1; arr[3] = f1;
    }
    template<int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }

    T & operator[](unsigned int n) { return arr[n]; }
    T operator[](unsigned int n) const { return arr[n]; }

    T __attribute__ ((aligned(16))) arr[4];
  };
  template<typename T>
  inline Vec2<T>::Vec2(Vec4<T> v4) {
    arr[0] = v4[0]; arr[1] = v4[1];
  }
  template<typename T>
  union Mask2 {};
  template<typename T>
  union Mask4 {};
#ifdef CMS_USE_SSE

  template<>
  union Mask4<float> {
    __m128 vec;
    unsigned int mask[4];
    Mask4() { vec = _mm_setzero_ps(); }
    Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      mask[0] = m1; mask[1] = m2; mask[2] = m3; mask[3] = m4;
    }
  };
#ifdef CMS_USE_AVX
  template<>
  union Mask4<double> {
    __m256d vec;
    unsigned long long mask[4];
    Mask4() {
      vec = _mm256_setzero_pd();
    }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1; mask[1] = m2; mask[2] = m3; mask[3] = m4;
    }
  };
#else
  template<>
  union Mask4<double> {
    __m128d vec[2];
    unsigned long long mask[4];
    Mask4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1; mask[1] = m2; mask[2] = m3; mask[3] = m4;
    }
  };
#endif // CMS_USE_AVX
  template<>
  union Mask2<double> {
    __m128d vec;
    unsigned long long mask[2];
    Mask2() {
      vec = _mm_setzero_pd();
    }
    Mask2(unsigned long long m1, unsigned long long m2) {
      mask[0] = m1; mask[1] = m2;
    }
  };
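  // The Mask unions overlay an integer bit pattern on a SIMD register.
  // Illustrative sketch (hypothetical values, not from this header):
  // building a sign-bit mask for the x lane with Vec4<float>::setMask
  // defined below,
  //   Vec4<float> m;
  //   m.setMask(0x80000000u, 0u, 0u, 0u);
  //   // (v ^ m) then flips the sign of v's x component only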
  template<>
  union Vec4<float> {
    typedef __m128 nativeType;
    __m128 vec;
    float __attribute__ ((aligned(16))) arr[4];
    OldVec<float> o;

    Vec4(__m128 ivec) : vec(ivec) {}

    Vec4(OldVec<float> const & ivec) : o(ivec) {}

    Vec4() {
      vec = _mm_setzero_ps();
    }

    inline Vec4(Vec4<double> ivec);

    explicit Vec4(float f1) {
      set1(f1);
    }

    Vec4(float f1, float f2, float f3, float f4 = 0) {
      vec = _mm_set_ps(f4, f3, f2, f1);
    }

    Vec4(Vec2<float> ivec0, Vec2<float> ivec1) {
      arr[0] = ivec0.arr[0]; arr[1] = ivec0.arr[1];
      arr[2] = ivec1.arr[0]; arr[3] = ivec1.arr[1];
    }

    Vec4(Vec2<float> ivec0, float f3, float f4 = 0) {
      arr[0] = ivec0.arr[0]; arr[1] = ivec0.arr[1];
      arr[2] = f3; arr[3] = f4;
    }

    Vec4(Vec2<float> ivec0) {
      vec = _mm_setzero_ps();
      arr[0] = ivec0.arr[0]; arr[1] = ivec0.arr[1];
    }

    // for masking
    void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      Mask4<float> mask(m1, m2, m3, m4); vec = mask.vec;
    }

    void set(float f1, float f2, float f3, float f4 = 0) {
      vec = _mm_set_ps(f4, f3, f2, f1);
    }

    void set1(float f1) {
      vec = _mm_set1_ps(f1);
    }

    template<int N>
    Vec4 get1() const {
      return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(N, N, N, N));
    }
  };
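  // get1<N>() broadcasts component N to all four lanes, e.g. (values
  // illustrative only):
  //   Vec4<float> v(1.f, 2.f, 3.f, 4.f);
  //   Vec4<float> z = v.get1<2>();  // (3, 3, 3, 3)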
  template<>
  union Vec2<double> {
    typedef __m128d nativeType;
    __m128d vec;
    double __attribute__ ((aligned(16))) arr[2];

    Vec2(__m128d ivec) : vec(ivec) {}
    Vec2() { vec = _mm_setzero_pd(); }
    inline Vec2(Vec2<float> ivec);
    Vec2(double f1, double f2) { vec = _mm_set_pd(f2, f1); }
    explicit Vec2(double f1) { set1(f1); }
    inline Vec2(Vec4<double> v4);

    // for masking
    void setMask(unsigned long long m1, unsigned long long m2) {
      Mask2<double> mask(m1, m2); vec = mask.vec;
    }

    void set(double f1, double f2) { vec = _mm_set_pd(f2, f1); }
    void set1(double f1) { vec = _mm_set1_pd(f1); }

    template<int N>
    Vec2 get1() const { return Vec2(arr[N], arr[N]); }
  };
#ifndef CMS_USE_AVX
  template<>
  union Vec4<double> {
    __m128d vec[2];
    double __attribute__ ((aligned(16))) arr[4];
    OldVec<double> o;

    Vec4(__m128d ivec[]) {
      vec[0] = ivec[0];
      vec[1] = ivec[1];
    }

    Vec4(__m128d ivec0, __m128d ivec1) {
      vec[0] = ivec0;
      vec[1] = ivec1;
    }

    Vec4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }

    Vec4(Vec4<float> ivec) {
      vec[0] = _mm_cvtps_pd(ivec.vec);
      vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
    }

    explicit Vec4(double f1) { set1(f1); }

    Vec4(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3] = f4;
    }

    Vec4(Vec2<double> ivec0, Vec2<double> ivec1) {
      vec[0] = ivec0.vec;
      vec[1] = ivec1.vec;
    }

    Vec4(Vec2<double> ivec0, double f3, double f4 = 0) {
      vec[0] = ivec0.vec;
      arr[2] = f3; arr[3] = f4;
    }

    Vec4(Vec2<double> ivec0) {
      vec[0] = ivec0.vec;
      vec[1] = _mm_setzero_pd();
    }

    Vec4(OldVec<double> const & ivec) : o(ivec) {}

    // for masking
    void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      Mask4<double> mask(m1, m2, m3, m4); vec[0] = mask.vec[0]; vec[1] = mask.vec[1];
    }

    void set(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3] = f4;
    }

    void set1(double f1) { vec[0] = vec[1] = _mm_set1_pd(f1); }

    template<int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }

    Vec2<double> xy() const { return vec[0]; }
    Vec2<double> zw() const { return vec[1]; }
  };
  inline Vec4<float>::Vec4(Vec4<double> ivec) {
    vec = _mm_cvtpd_ps(ivec.vec[0]);
    __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
    vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
  }
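  // Each _mm_cvtpd_ps narrows one double pair into the low two float
  // lanes; the final shuffle packs the two half-results into one __m128
  // as (x, y, z, w), mirroring the float-to-double constructor above.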
#endif // CMS_USE_AVX
#endif // CMS_USE_SSE

  typedef Vec2<float> Vec2F;
  typedef Vec4<float> Vec4F;
  typedef Vec2<double> Vec2D;
  typedef Vec4<double> Vec4D;
} // namespace mathSSE
#ifdef CMS_USE_SSE

// float operations

inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_movemask_ps(_mm_cmpeq_ps(a.vec, b.vec)) == 0xf; }
inline mathSSE::Vec4F cmpeq(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_cmpeq_ps(a.vec, b.vec); }
inline mathSSE::Vec4F cmpgt(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_cmpgt_ps(a.vec, b.vec); }
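// _mm_cmpeq_ps sets a lane to all-ones where equal and _mm_movemask_ps
// gathers the four lane sign bits, so 0xf means all components equal.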
#ifdef __SSE3__
inline mathSSE::Vec4F hadd(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_hadd_ps(a.vec, b.vec); }
#endif
inline mathSSE::Vec4F operator-(mathSSE::Vec4F a) {
  const __m128 neg = _mm_set_ps(-0.0, -0.0, -0.0, -0.0);
  return _mm_xor_ps(a.vec, neg);
}
inline mathSSE::Vec4F operator&(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_and_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator|(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_or_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator^(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_xor_ps(a.vec, b.vec); }
inline mathSSE::Vec4F andnot(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_andnot_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator+(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_add_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator-(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_sub_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator*(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_mul_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator/(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_div_ps(a.vec, b.vec); }
inline mathSSE::Vec4F min(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_min_ps(a.vec, b.vec); }
inline mathSSE::Vec4F max(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_max_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) { return _mm_mul_ps(_mm_set1_ps(a), b.vec); }
inline mathSSE::Vec4F operator*(mathSSE::Vec4F b, float a) { return _mm_mul_ps(_mm_set1_ps(a), b.vec); }
inline mathSSE::Vec4F operator/(mathSSE::Vec4F b, float a) { return _mm_div_ps(b.vec, _mm_set1_ps(a)); }
inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_dot_ps;
  float s;
  _mm_store_ss(&s, _mm_dot_ps(a.vec, b.vec));
  return s;
}

inline mathSSE::Vec4F cross(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_cross_ps;
  return _mm_cross_ps(a.vec, b.vec);
}

inline float dotxy(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  mathSSE::Vec4F mul = a * b;
#ifdef __SSE3__
  mul = hadd(mul, mul);
#else
  __m128 swp = _mm_shuffle_ps(mul.vec, mul.vec, _MM_SHUFFLE(2, 3, 0, 1));
  mul.vec = _mm_add_ps(mul.vec, swp);
#endif
  float s;
  _mm_store_ss(&s, mul.vec);
  return s;
}
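// dotxy keeps only lanes 0 and 1: after the (2, 3, 0, 1) lane swap (or
// the SSE3 hadd), lane 0 holds x*x' + y*y', so one scalar store yields
// the 2D dot product of the xy parts.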
// Vec2 (scalar) implementations
template<typename T>
inline T dot(mathSSE::Vec2<T> a, mathSSE::Vec2<T> b) {
  return a.arr[0]*b.arr[0] + a.arr[1]*b.arr[1];
}

template<typename T>
inline T cross(mathSSE::Vec2<T> a, mathSSE::Vec2<T> b) {
  return a.arr[0]*b.arr[1] - a.arr[1]*b.arr[0];
}
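// In 2D the cross product degenerates to the scalar determinant
// a.x*b.y - a.y*b.x, i.e. the z component of the 3D cross product of the
// two vectors embedded in the plane.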
#ifdef CMS_USE_AVX
inline mathSSE::Vec2<double>::Vec2(Vec4<double> v4) { vec = _mm256_castpd256_pd128(v4.vec); }
#else
inline mathSSE::Vec2<double>::Vec2(Vec4<double> v4) { vec = v4.vec[0]; }
#endif

inline mathSSE::Vec2<double>::Vec2(Vec2<float> ivec) {
  arr[0] = ivec.arr[0]; arr[1] = ivec.arr[1];
}
inline mathSSE::Vec2D operator-(mathSSE::Vec2D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return _mm_xor_pd(a.vec, neg);
}

inline mathSSE::Vec2D operator&(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_and_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator|(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_or_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator^(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_xor_pd(a.vec, b.vec); }
inline mathSSE::Vec2D andnot(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_andnot_pd(a.vec, b.vec); }
#ifdef __SSE3__
inline mathSSE::Vec2D hadd(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_hadd_pd(a.vec, b.vec); }
#endif

inline mathSSE::Vec2D operator+(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_add_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator-(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_sub_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator*(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_mul_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator/(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_div_pd(a.vec, b.vec); }
inline mathSSE::Vec2D min(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_min_pd(a.vec, b.vec); }
inline mathSSE::Vec2D max(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_max_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) { return _mm_mul_pd(_mm_set1_pd(a), b.vec); }
inline mathSSE::Vec2D operator*(mathSSE::Vec2D b, double a) { return _mm_mul_pd(_mm_set1_pd(a), b.vec); }
inline mathSSE::Vec2D operator/(mathSSE::Vec2D b, double a) { return _mm_div_pd(b.vec, _mm_set1_pd(a)); }
inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_mul_pd(a.vec, b.vec);
#ifdef __SSE3__
  res = _mm_hadd_pd(res, res);
#else
  res = _mm_add_sd(_mm_shuffle_pd(res, res, 1), res);
#endif
  double s;
  _mm_store_sd(&s, res);
  return s;
}
inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_shuffle_pd(b.vec, b.vec, 1);
  res = _mm_mul_pd(a.vec, res);
  res = _mm_sub_sd(res, _mm_shuffle_pd(res, res, 1));
  double s;
  _mm_store_sd(&s, res);
  return s;
}
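// The shuffle swaps b into (b.y, b.x); the multiply gives
// (a.x*b.y, a.y*b.x); subtracting the swapped copy leaves
// a.x*b.y - a.y*b.x in lane 0, matching the scalar Vec2 cross above.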
// double operations, two-__m128d layout
#ifndef CMS_USE_AVX
inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return
    _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0], b.vec[0])) == 0x3 &&
    _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1], b.vec[1])) == 0x3;
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return mathSSE::Vec4D(_mm_xor_pd(a.vec[0], neg), _mm_xor_pd(a.vec[1], neg));
}
inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  using namespace mathSSE;
  return Vec4D(_mm_add_pd(a.vec[0], b.vec[0]), _mm_add_pd(a.vec[1], b.vec[1]));
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  using namespace mathSSE;
  return Vec4D(_mm_sub_pd(a.vec[0], b.vec[0]), _mm_sub_pd(a.vec[1], b.vec[1]));
}

inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}

inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}
inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  __m128d res = _mm_add_sd(_mm_mul_pd(a.vec[0], b.vec[0]),
                           _mm_mul_sd(a.vec[1], b.vec[1]));
  res = _mm_add_sd(_mm_unpackhi_pd(res, res), res);
  double s;
  _mm_store_sd(&s, res);
  return s;
}
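// Note the 3D convention: _mm_mul_sd multiplies only the z lane of the
// upper pair, and the add_sd chain folds x*x' + z*z' together with y*y',
// so the w components never enter the sum.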
inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  const __m128d neg = _mm_set_pd(0.0, -0.0);
  // a.z*b.x, a.z*b.y
  __m128d l1 = _mm_mul_pd(_mm_unpacklo_pd(a.vec[1], a.vec[1]), b.vec[0]);
  // b.z*a.x, b.z*a.y
  __m128d l2 = _mm_mul_pd(_mm_unpacklo_pd(b.vec[1], b.vec[1]), a.vec[0]);
  __m128d m1 = _mm_sub_pd(l1, l2);
  m1 = _mm_shuffle_pd(m1, m1, 1);  // swap the two elements
  m1 = _mm_xor_pd(m1, neg);        // flip the sign of the first element
  // a.x*b.y, a.y*b.x
  l1 = _mm_mul_pd(a.vec[0], _mm_shuffle_pd(b.vec[0], b.vec[0], 1));
  __m128d m2 = _mm_sub_sd(l1, _mm_unpackhi_pd(l1, l1));
  return mathSSE::Vec4D(m1, m2);
}
inline double dotxy(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return dot(a.xy(), b.xy());
}

#endif // CMS_USE_AVX
namespace mathSSE {
  template<> inline Vec2D sqrt(Vec2D v) { return _mm_sqrt_pd(v.vec); }
#ifndef CMS_USE_AVX
  template<> inline Vec4D sqrt(Vec4D v) {
    return Vec4D(_mm_sqrt_pd(v.vec[0]), _mm_sqrt_pd(v.vec[1]));
  }
#endif
}
namespace mathSSE {
  inline void sincos(Vec4F v, Vec4F & s, Vec4F & c) { sincos_ps(v.vec, &s.vec, &c.vec); }

  inline float log(float f) { float s; _mm_store_ss(&s, log_ps(_mm_load_ss(&f))); return s; }
  inline float exp(float f) { float s; _mm_store_ss(&s, exp_ps(_mm_load_ss(&f))); return s; }
  inline float sin(float f) { float s; _mm_store_ss(&s, sin_ps(_mm_load_ss(&f))); return s; }
  inline float cos(float f) { float s; _mm_store_ss(&s, cos_ps(_mm_load_ss(&f))); return s; }
  inline void sincos(float f, float & s, float & c) {
    __m128 vs, vc;
    sincos_ps(_mm_load_ss(&f), &vs, &vc);
    _mm_store_ss(&s, vs); _mm_store_ss(&c, vc);
  }
}
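// These wrap packed transcendental kernels in the sse_mathfun style
// (log_ps, exp_ps, sin_ps, cos_ps, sincos_ps): the scalar overloads load
// the argument into lane 0, run the 4-wide kernel, and store lane 0 back,
// so a scalar call costs about the same as a full 4-wide evaluation.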
#endif // CMS_USE_SSE
#include <iosfwd>
std::ostream & operator<<(std::ostream & out, mathSSE::As3D<float> const & v);
std::ostream & operator<<(std::ostream & out, mathSSE::As3D<double> const & v);
#endif // DataFormat_Math_SSEVec_H