#ifndef DataFormat_Math_SSEVec_H
#define DataFormat_Math_SSEVec_H

#if !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__) && !defined(__powerpc64__) && !defined(__PPC64__) && !defined(__powerpc__) && !defined(__NVCC__)
#include <x86intrin.h>
#define CMS_USE_SSE
#ifdef __AVX__
#define CMS_USE_AVX
#endif
#endif

#include <iosfwd>

  // dot product of the four packed floats: the sum of all four lane products
  inline __m128 _mm_dot_ps(__m128 v1, __m128 v2) {
#ifdef __SSE4_1__
    return _mm_dp_ps(v1, v2, 0xff);
#else
    __m128 mul = _mm_mul_ps(v1, v2);
#ifdef __SSE3__
    mul = _mm_hadd_ps(mul, mul);
    return _mm_hadd_ps(mul, mul);
#else
    __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
    mul = _mm_add_ps(mul, swp);
    swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_add_ps(mul, swp);
#endif
#endif
  }
  // cross product of the x,y,z components (the w lane of the result is 0)
  inline __m128 _mm_cross_ps(__m128 v1, __m128 v2) {
    // v3 = (z2, z2, x1, -), v4 = (y1, x1, y2, -)
    __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
    __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));

    __m128 v5 = _mm_mul_ps(v3, v4);

    // v3 = (z1, z1, x2, -), v4 = (y2, x2, y1, -)
    v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
    v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));

    v3 = _mm_mul_ps(v3, v4);
    // flip the sign of the y lane of (v5 - v3)
    const __m128 neg = _mm_set_ps(0.0f, 0.0f, -0.0f, 0.0f);
    return _mm_xor_ps(_mm_sub_ps(v5, v3), neg);
  }
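  // Usage sketch (illustrative only): _mm_dot_ps sums all four lane products,
  // _mm_cross_ps works on the x,y,z lanes, e.g.
  //   __m128 ux = _mm_set_ps(0.f, 0.f, 0.f, 1.f);   // (x=1, y=0, z=0, w=0)
  //   __m128 uy = _mm_set_ps(0.f, 0.f, 1.f, 0.f);   // (x=0, y=1, z=0, w=0)
  //   float d;
  //   _mm_store_ss(&d, _mm_dot_ps(ux, uy));         // d == 0.f
  //   __m128 uz = _mm_cross_ps(ux, uy);             // (0, 0, 1, 0)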
#ifdef CMS_USE_AVX
  // dot product of the four packed doubles: every lane of the result holds the sum
  inline __m256d _mm256_dot_pd(__m256d v1, __m256d v2) {
    __m256d mul = _mm256_mul_pd(v1, v2);
    mul = _mm256_hadd_pd(mul, mul);
    __m256d tmp = _mm256_permute2f128_pd(mul, mul, 1);
    return _mm256_add_pd(mul, tmp);
  }
  // cross product of the x,y,z components
  inline __m256d _mm256_cross_pd(__m256d v1, __m256d v2) {
    __m256d v3 = _mm256_permute2f128_pd(v2, v1, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    __m256d v4 = _mm256_permute2f128_pd(v1, v2, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    __m256d v5 = _mm256_mul_pd(v3, v4);

    v3 = _mm256_permute2f128_pd(v1, v2, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);

    v4 = _mm256_permute2f128_pd(v2, v1, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    v3 = _mm256_mul_pd(v3, v4);
    const __m256d neg = _mm256_set_pd(0.0, 0.0, -0.0, 0.0);
    return _mm256_xor_pd(_mm256_sub_pd(v5, v3), neg);
  }
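  // Usage sketch (illustrative only, requires AVX): same conventions as the
  // SSE float helpers, but on four packed doubles (x, y, z, w) in one __m256d.
  //   __m256d a = _mm256_set_pd(0., 3., 2., 1.);   // (x=1, y=2, z=3, w=0)
  //   __m256d d = _mm256_dot_pd(a, a);             // every lane holds 1 + 4 + 9 = 14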
#endif  // CMS_USE_AVX

  template <typename T>
  union Vec4;

  template <typename T>
  union Vec2 {
    Vec2() { arr[0] = 0; arr[1] = 0; }
    Vec2(T f1, T f2) { arr[0] = f1; arr[1] = f2; }
    explicit Vec2(T f1) { arr[0] = f1; arr[1] = f1; }

    void set(T f1, T f2) { arr[0] = f1; arr[1] = f2; }

    template <int N>
    Vec2 get1() const { return Vec2(arr[N], arr[N]); }

    // element-wise copy from a Vec2 of another element type
    template <typename U>
    Vec2(Vec2<U> v) { arr[0] = v[0]; arr[1] = v[1]; }

    // defined after Vec4: keeps only the x,y components of a Vec4
    inline Vec2(Vec4<T> v4);

    T &operator[](unsigned int n) { return arr[n]; }
    T operator[](unsigned int n) const { return arr[n]; }

    T arr[2];
  };
  template <typename T>
  union Vec4 {
    Vec4() { arr[0] = 0; arr[1] = 0; arr[2] = 0; arr[3] = 0; }
    Vec4(float f1, float f2, float f3, float f4 = 0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3] = f4;
    }

    void set(float f1, float f2, float f3, float f4 = 0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3] = f4;
    }

    void set1(float f1) { arr[0] = f1; arr[1] = f1; arr[2] = f1; arr[3] = f1; }

    template <int N>
    Vec4 get1() const { return Vec4(arr[N], arr[N], arr[N], arr[N]); }

    T &operator[](unsigned int n) { return arr[n]; }
    T operator[](unsigned int n) const { return arr[n]; }

    T arr[4];
  };

  // out of line: a Vec2 built from a Vec4 keeps only the x,y components
  template <typename T>
  inline Vec2<T>::Vec2(Vec4<T> v4) {
    arr[0] = v4[0];
    arr[1] = v4[1];
  }
  template <typename T>
  union Mask2 {};
  template <typename T>
  union Mask4 {};

  template <>
  union Mask4<float> {
    __m128 vec;
    unsigned int mask[4];
    Mask4() { vec = _mm_setzero_ps(); }
    Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#ifdef CMS_USE_AVX
  template <>
  union Mask4<double> {
    __m256d vec;
    unsigned long long mask[4];
    Mask4() { vec = _mm256_setzero_pd(); }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#else
  template <>
  union Mask4<double> {
    __m128d vec[2];
    unsigned long long mask[4];
    Mask4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#endif  // CMS_USE_AVX
  template <>
  union Mask2<double> {
    __m128d vec;
    unsigned long long mask[2];
    Mask2() { vec = _mm_setzero_pd(); }
    Mask2(unsigned long long m1, unsigned long long m2) {
      mask[0] = m1;
      mask[1] = m2;
    }
  };
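  // Usage sketch (illustrative only): the Mask unions build raw bit patterns
  // that can be applied to a vector with the bitwise operators, e.g. a mask
  // that keeps x,y,z and clears the w lane:
  //   Mask4<float> keepXYZ(0xffffffff, 0xffffffff, 0xffffffff, 0);
  //   __m128 xyz = _mm_and_ps(v, keepXYZ.vec);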
  template <>
  union Vec4<float> {
    typedef __m128 nativeType;
    __m128 vec;
    float arr[4];

    Vec4(__m128 ivec) : vec(ivec) {}

    Vec4() { vec = _mm_setzero_ps(); }

    // defined after Vec4<double>
    inline Vec4(Vec4<double> ivec);

    Vec4(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }

    Vec4(Vec2<float> ivec0, Vec2<float> ivec1) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = ivec1.arr[0];
      arr[3] = ivec1.arr[1];
    }

    Vec4(Vec2<float> ivec0, float f3, float f4 = 0) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<float> ivec0) {
      vec = _mm_setzero_ps();
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
    }

    // build a bit mask directly in the register
    void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      Mask4<float> mask(m1, m2, m3, m4);
      vec = mask.vec;
    }

    void set(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }

    void set1(float f1) { vec = _mm_set1_ps(f1); }

    // broadcast lane N to all four lanes
    template <int N>
    Vec4 get1() const {
      return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(N, N, N, N));
    }

    float &operator[](unsigned int n) { return arr[n]; }
    float operator[](unsigned int n) const { return arr[n]; }
  };

  typedef Vec4<float> Vec4F;
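  // Usage sketch (illustrative only): Vec4F gives array-style access to the
  // packed register, and get1<N>() broadcasts one lane to all four.
  //   Vec4F v(1.f, 2.f, 3.f);        // w defaults to 0
  //   float y = v[1];                // 2.f
  //   Vec4F all2 = v.get1<1>();      // (2, 2, 2, 2)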
  template <>
  union Vec2<double> {
    typedef __m128d nativeType;
    __m128d vec;
    double arr[2];

    Vec2(__m128d ivec) : vec(ivec) {}

    Vec2() { vec = _mm_setzero_pd(); }

    // defined out of line, after the corresponding Vec types
    inline Vec2(Vec2<float> ivec);
    inline Vec2(Vec4<double> v4);

    Vec2(double f1, double f2) { vec = _mm_set_pd(f2, f1); }

    explicit Vec2(double f1) { set1(f1); }

    // build a bit mask directly in the register
    void setMask(unsigned long long m1, unsigned long long m2) {
      Mask2<double> mask(m1, m2);
      vec = mask.vec;
    }

    void set(double f1, double f2) { vec = _mm_set_pd(f2, f1); }

    void set1(double f1) { vec = _mm_set1_pd(f1); }

    template <int N>
    Vec2 get1() const { return Vec2(arr[N], arr[N]); }

    double &operator[](unsigned int n) { return arr[n]; }
    double operator[](unsigned int n) const { return arr[n]; }
  };

  typedef Vec2<double> Vec2D;
#ifndef CMS_USE_AVX
  // without AVX a Vec4<double> is stored as two __m128d halves: (x,y) and (z,w)
  template <>
  union Vec4<double> {
    __m128d vec[2];
    double arr[4];

    Vec4(__m128d ivec[]) {
      vec[0] = ivec[0];
      vec[1] = ivec[1];
    }

    Vec4(__m128d ivec0, __m128d ivec1) {
      vec[0] = ivec0;
      vec[1] = ivec1;
    }

    Vec4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }

    // widening conversion from four packed floats
    Vec4(Vec4<float> ivec) {
      vec[0] = _mm_cvtps_pd(ivec.vec);
      vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
    }

    explicit Vec4(double f1) { set1(f1); }

    Vec4(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3] = f4;
    }

    Vec4(Vec2<double> ivec0, double f3, double f4 = 0) {
      vec[0] = ivec0.vec;
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<double> ivec0) {
      vec[0] = ivec0.vec;
      vec[1] = _mm_setzero_pd();
    }

    // build a bit mask directly in the two registers
    void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      Mask4<double> mask(m1, m2, m3, m4);
      vec[0] = mask.vec[0];
      vec[1] = mask.vec[1];
    }

    void set(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3] = f4;
    }

    void set1(double f1) { vec[0] = vec[1] = _mm_set1_pd(f1); }

    template <int N>
    Vec4 get1() const { return Vec4(arr[N], arr[N], arr[N], arr[N]); }

    double &operator[](unsigned int n) { return arr[n]; }
    double operator[](unsigned int n) const { return arr[n]; }
  };

  typedef Vec4<double> Vec4D;

  // narrowing conversion: pack the two double halves into one __m128 of floats
  inline Vec4<float>::Vec4(Vec4<double> ivec) {
    vec = _mm_cvtpd_ps(ivec.vec[0]);
    __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
    vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
  }
#endif  // CMS_USE_AVX
#endif  // CMS_USE_SSE

#ifdef CMS_USE_SSE
inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_movemask_ps(_mm_cmpeq_ps(a.vec, b.vec)) == 0xf;
}
inline mathSSE::Vec4F cmpeq(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_cmpeq_ps(a.vec, b.vec); }
inline mathSSE::Vec4F cmpgt(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_cmpgt_ps(a.vec, b.vec); }
inline mathSSE::Vec4F hadd(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_hadd_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator-(mathSSE::Vec4F a) {
  const __m128 neg = _mm_set_ps(-0.0, -0.0, -0.0, -0.0);
  return _mm_xor_ps(a.vec, neg);
}

inline mathSSE::Vec4F operator&(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_and_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator|(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_or_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator^(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_xor_ps(a.vec, b.vec); }
inline mathSSE::Vec4F andnot(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_andnot_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator+(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_add_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator-(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_sub_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator*(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_mul_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator/(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_div_ps(a.vec, b.vec); }
inline mathSSE::Vec4F min(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_min_ps(a.vec, b.vec); }
inline mathSSE::Vec4F max(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_max_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) { return _mm_mul_ps(_mm_set1_ps(a), b.vec); }
inline mathSSE::Vec4F operator*(mathSSE::Vec4F b, float a) { return _mm_mul_ps(_mm_set1_ps(a), b.vec); }
inline mathSSE::Vec4F operator/(mathSSE::Vec4F b, float a) { return _mm_div_ps(b.vec, _mm_set1_ps(a)); }

inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_dot_ps;
  float s;
  _mm_store_ss(&s, _mm_dot_ps(a.vec, b.vec));
  return s;
}

inline mathSSE::Vec4F cross(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_cross_ps;
  return _mm_cross_ps(a.vec, b.vec);
}
// dot product of the x,y components only
inline float dotxy(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  mathSSE::Vec4F mul = a * b;
  __m128 swp = _mm_shuffle_ps(mul.vec, mul.vec, _MM_SHUFFLE(2, 3, 0, 1));
  mul.vec = _mm_add_ps(mul.vec, swp);
  float s;
  _mm_store_ss(&s, mul.vec);
  return s;
}
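// Usage sketch (illustrative only): the overloaded operators keep everything
// in SSE registers, so a 3D dot/cross needs no scalar loop.
//   mathSSE::Vec4F a(1.f, 2.f, 3.f);
//   mathSSE::Vec4F b(4.f, 5.f, 6.f);
//   float d = dot(a, b);              // 1*4 + 2*5 + 3*6 = 32
//   mathSSE::Vec4F c = cross(a, b);   // (-3, 6, -3, 0)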
inline Vec2<double>::Vec2(Vec4<double> v4) {
#ifdef CMS_USE_AVX
  vec = _mm256_castpd256_pd128(v4.vec);
#else
  vec = v4.vec[0];
#endif
}
inline Vec2<double>::Vec2(Vec2<float> ivec) {
  arr[0] = ivec.arr[0];
  arr[1] = ivec.arr[1];
}
inline mathSSE::Vec2D operator-(mathSSE::Vec2D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return _mm_xor_pd(a.vec, neg);
}

inline mathSSE::Vec2D operator&(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_and_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator|(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_or_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator^(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_xor_pd(a.vec, b.vec); }
inline mathSSE::Vec2D andnot(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_andnot_pd(a.vec, b.vec); }

inline mathSSE::Vec2D hadd(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_hadd_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator+(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_add_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator-(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_sub_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator*(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_mul_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator/(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_div_pd(a.vec, b.vec); }
inline mathSSE::Vec2D min(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_min_pd(a.vec, b.vec); }
inline mathSSE::Vec2D max(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_max_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) { return _mm_mul_pd(_mm_set1_pd(a), b.vec); }
inline mathSSE::Vec2D operator*(mathSSE::Vec2D b, double a) { return _mm_mul_pd(_mm_set1_pd(a), b.vec); }
inline mathSSE::Vec2D operator/(mathSSE::Vec2D b, double a) { return _mm_div_pd(b.vec, _mm_set1_pd(a)); }
inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_mul_pd(a.vec, b.vec);
#ifdef __SSE3__
  res = _mm_hadd_pd(res, res);
#else
  res = _mm_add_sd(_mm_shuffle_pd(res, res, 1), res);
#endif
  double s;
  _mm_store_sd(&s, res);
  return s;
}

// 2D cross product: returns the scalar z-component a.x*b.y - a.y*b.x
inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_shuffle_pd(b.vec, b.vec, 1);
  res = _mm_mul_pd(a.vec, res);
  res = _mm_sub_sd(res, _mm_shuffle_pd(res, res, 1));
  double s;
  _mm_store_sd(&s, res);
  return s;
}
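// Usage sketch (illustrative only): for Vec2D the cross product collapses to a scalar.
//   mathSSE::Vec2D u(1., 0.);
//   mathSSE::Vec2D w(0., 1.);
//   double z = cross(u, w);   // 1*1 - 0*0 = 1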
#ifndef CMS_USE_AVX
inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0], b.vec[0])) == 0x3 &&
         _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1], b.vec[1])) == 0x3;
}
inline mathSSE::Vec4D operator-(mathSSE::Vec4D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return mathSSE::Vec4D(_mm_xor_pd(a.vec[0], neg), _mm_xor_pd(a.vec[1], neg));
}
inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}
inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}
// dot product of the x,y,z components (the w lane is ignored)
inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  __m128d res = _mm_add_sd(_mm_mul_pd(a.vec[0], b.vec[0]),
                           _mm_mul_sd(a.vec[1], b.vec[1]));
  res = _mm_add_sd(_mm_unpackhi_pd(res, res), res);
  double s;
  _mm_store_sd(&s, res);
  return s;
}
// cross product of the x,y,z components
inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  const __m128d neg = _mm_set_pd(0.0, -0.0);
  // a.z * (b.x, b.y)  and  b.z * (a.x, a.y)
  __m128d l1 = _mm_mul_pd(_mm_unpacklo_pd(a.vec[1], a.vec[1]), b.vec[0]);
  __m128d l2 = _mm_mul_pd(_mm_unpacklo_pd(b.vec[1], b.vec[1]), a.vec[0]);
  __m128d m1 = _mm_sub_pd(l1, l2);
  m1 = _mm_shuffle_pd(m1, m1, 1);
  m1 = _mm_xor_pd(m1, neg);  // (a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z)
  // a.x*b.y - a.y*b.x in the low lane
  l1 = _mm_mul_pd(a.vec[0], _mm_shuffle_pd(b.vec[0], b.vec[0], 1));
  __m128d m2 = _mm_sub_sd(l1, _mm_unpackhi_pd(l1, l1));
  return mathSSE::Vec4D(m1, m2);
}
#endif  // CMS_USE_AVX

// sqrt specializations
template <>
inline Vec2D sqrt(Vec2D v) {
  return _mm_sqrt_pd(v.vec);
}
template <>
inline Vec4D sqrt(Vec4D v) {
  return Vec4D(_mm_sqrt_pd(v.vec[0]), _mm_sqrt_pd(v.vec[1]));
}
#endif  // CMS_USE_SSE

std::ostream &operator<<(std::ostream &out, mathSSE::As3D<float> const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::As3D<double> const &v);

#endif  // DataFormat_Math_SSEVec_H