#ifndef DataFormat_Math_SSEVec_H
#define DataFormat_Math_SSEVec_H

#if !defined(__arm__) && !defined(__aarch64__) && !defined(__MIC__) && !defined(__powerpc64__) && !defined(__PPC64__) && !defined(__powerpc__) && !defined(__NVCC__)
#include <x86intrin.h>
#define CMS_USE_SSE
#ifdef __AVX__
#define CMS_USE_AVX
#endif
#endif

#include <cmath>

namespace mathSSE {

  // scalar fallback; specialized below for the vector types
  template <typename T>
  inline T sqrt(T t) {
    return std::sqrt(t);
  }

#ifdef CMS_USE_SSE

  // dot product of two 4-float vectors: the horizontal sum of the lane-wise
  // products, replicated in every lane of the result
  inline __m128 _mm_dot_ps(__m128 v1, __m128 v2) {
#ifdef __SSE4_1__
    return _mm_dp_ps(v1, v2, 0xff);
#else
    __m128 mul = _mm_mul_ps(v1, v2);
#ifdef __SSE3__
    mul = _mm_hadd_ps(mul, mul);
    return _mm_hadd_ps(mul, mul);
#else
    // SSE2 fallback: two shuffle+add rounds implement the horizontal sum
    __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
    mul = _mm_add_ps(mul, swp);
    swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_add_ps(mul, swp);
#endif
#endif
  }
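  // Illustrative sketch (not part of the original header): with the w lanes
  // zeroed, _mm_dot_ps yields the 3D dot product in every lane.
  //
  //   __m128 a = _mm_set_ps(0.f, 3.f, 2.f, 1.f);  // lanes (x,y,z,w) = (1,2,3,0)
  //   __m128 b = _mm_set_ps(0.f, 6.f, 5.f, 4.f);  // lanes (x,y,z,w) = (4,5,6,0)
  //   __m128 d = _mm_dot_ps(a, b);                // every lane holds 1*4 + 2*5 + 3*6 = 32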
  // cross product of two 3-float vectors (the w component comes out as zero)
  inline __m128 _mm_cross_ps(__m128 v1, __m128 v2) {
    // v3 = (z2, z2, x1, w1)
    __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
    // v4 = (y1, x1, y2, w2)
    __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));

    __m128 v5 = _mm_mul_ps(v3, v4);

    // v3 = (z1, z1, x2, w2)
    v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
    // v4 = (y2, x2, y1, w1)
    v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));

    v3 = _mm_mul_ps(v3, v4);
    // flip the sign of the y component to finish the cross product
    const __m128i neg = _mm_set_epi32(0, 0, 0x80000000, 0);
    __m128i ret = __m128i(_mm_sub_ps(v5, v3));
    return __m128(_mm_xor_si128(ret, neg));
  }
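  // Illustrative sketch (not part of the original header):
  //
  //   __m128 x = _mm_set_ps(0.f, 0.f, 0.f, 1.f);  // unit x: (1,0,0,0)
  //   __m128 y = _mm_set_ps(0.f, 0.f, 1.f, 0.f);  // unit y: (0,1,0,0)
  //   __m128 z = _mm_cross_ps(x, y);              // unit z: (0,0,1,0)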
#ifdef CMS_USE_AVX
  // dot product of two 4-double vectors: the result in all four lanes
  inline __m256d _mm256_dot_pd(__m256d v1, __m256d v2) {
    __m256d mul = _mm256_mul_pd(v1, v2);
    mul = _mm256_hadd_pd(mul, mul);
    __m256d tmp = _mm256_permute2f128_pd(mul, mul, 1);
    return _mm256_add_pd(mul, tmp);
  }
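  // Note on the reduction above (an added sketch, not original text):
  // _mm256_hadd_pd adds only within each 128-bit half, leaving d0+d1 in the low
  // half and d2+d3 in the high half; the permute2f128 with control 1 swaps the
  // two halves, so the final add replicates d0+d1+d2+d3 in all four lanes.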
  // cross product of two 3-double vectors (the w component comes out as zero)
  inline __m256d _mm256_cross_pd(__m256d v1, __m256d v2) {
    // v3 = (z2, z2, x1, x1)
    __m256d v3 = _mm256_permute2f128_pd(v2, v1, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);
    // v4 = (y1, x1, y2, x2)
    __m256d v4 = _mm256_permute2f128_pd(v1, v2, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    __m256d v5 = _mm256_mul_pd(v3, v4);

    // v3 = (z1, z1, x2, x2)
    v3 = _mm256_permute2f128_pd(v1, v2, (2 << 4) + 1);
    v3 = _mm256_permute_pd(v3, 0);
    // v4 = (y2, x2, y1, x1)
    v4 = _mm256_permute2f128_pd(v2, v1, (2 << 4));
    v4 = _mm256_permute_pd(v4, 5);

    v3 = _mm256_mul_pd(v3, v4);
    __m256d ret = _mm256_sub_pd(v5, v3);
    // flip the sign of the y component to finish the cross product
    const __m256i neg = _mm256_set_epi64x(0, 0, 0x8000000000000000, 0);
    return __m256d(_mm256_xor_si256(__m256i(ret), neg));
  }
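  // Illustrative sketch (not part of the original header):
  //
  //   __m256d x = _mm256_set_pd(0., 0., 0., 1.);  // lanes (x,y,z,w) = (1,0,0,0)
  //   __m256d y = _mm256_set_pd(0., 0., 1., 0.);  // lanes (x,y,z,w) = (0,1,0,0)
  //   __m256d z = _mm256_cross_pd(x, y);          // lanes (x,y,z,w) = (0,0,1,0)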
#endif  // CMS_USE_AVX

#endif  // CMS_USE_SSE

  template <typename T>
  union Vec4;
  template <typename T>
  union Vec2 {
    Vec2() {
      arr[0] = 0;
      arr[1] = 0;
    }
    Vec2(T f1, T f2) {
      arr[0] = f1;
      arr[1] = f2;
    }
    explicit Vec2(T f1) {
      arr[0] = f1;
      arr[1] = f1;
    }

    void set(T f1, T f2) {
      arr[0] = f1;
      arr[1] = f2;
    }

    // broadcast element N into both slots
    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N], arr[N]);
    }

    // copy the first two elements of any indexable vector
    template <typename V>
    Vec2(V v) {
      arr[0] = v[0];
      arr[1] = v[1];
    }

    inline Vec2(Vec4<T> v4);

    T &operator[](unsigned int n) { return arr[n]; }
    T operator[](unsigned int n) const { return arr[n]; }

    T arr[2];
  };
  template <typename T>
  union Vec4 {
    Vec4() {
      arr[0] = 0;
      arr[1] = 0;
      arr[2] = 0;
      arr[3] = 0;
    }
    Vec4(float f1, float f2, float f3, float f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }
    explicit Vec4(float f1) { set1(f1); }

    void set(float f1, float f2, float f3, float f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }
    void set1(float f1) {
      arr[0] = f1;
      arr[1] = f1;
      arr[2] = f1;
      arr[3] = f1;
    }

    // broadcast element N into all four slots
    template <int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }

    Vec2<T> xy() const { return Vec2<T>(arr[0], arr[1]); }
    Vec2<T> zw() const { return Vec2<T>(arr[2], arr[3]); }

    T &operator[](unsigned int n) { return arr[n]; }
    T operator[](unsigned int n) const { return arr[n]; }

    T arr[4];
  };
  template <typename T>
  inline Vec2<T>::Vec2(Vec4<T> v4) {
    arr[0] = v4[0];
    arr[1] = v4[1];
  }
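  // Illustrative sketch of the generic API (not part of the original header):
  //
  //   mathSSE::Vec4<float> v(1.f, 2.f, 3.f, 4.f);
  //   mathSSE::Vec4<float> b = v.get1<2>();  // (3,3,3,3): lane 2 broadcast
  //   mathSSE::Vec2<float> p = v.xy();       // (1,2)
  //   mathSSE::Vec2<float> q(v);             // also (1,2), via the Vec4 constructor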
  template <typename T>
  union Mask2 {};
  template <typename T>
  union Mask4 {};

#ifdef CMS_USE_SSE
  template <>
  union Mask4<float> {
    __m128 vec;
    unsigned int mask[4];
    Mask4() { vec = _mm_setzero_ps(); }
    Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#ifdef CMS_USE_AVX
  template <>
  union Mask4<double> {
    __m256d vec;
    unsigned long long mask[4];
    Mask4() { vec = _mm256_setzero_pd(); }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#else
  template <>
  union Mask4<double> {
    __m128d vec[2];
    unsigned long long mask[4];
    Mask4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }
    Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      mask[0] = m1;
      mask[1] = m2;
      mask[2] = m3;
      mask[3] = m4;
    }
  };
#endif  // CMS_USE_AVX
  template <>
  union Mask2<double> {
    __m128d vec;
    unsigned long long mask[2];
    Mask2() { vec = _mm_setzero_pd(); }
    Mask2(unsigned long long m1, unsigned long long m2) {
      mask[0] = m1;
      mask[1] = m2;
    }
  };
  template <>
  union Vec4<float> {
    typedef __m128 nativeType;
    __m128 vec;
    float __attribute__((aligned(16))) arr[4];

    Vec4(__m128 ivec) : vec(ivec) {}

    Vec4() { vec = _mm_setzero_ps(); }

    inline Vec4(Vec4<double> ivec);

    explicit Vec4(float f1) { set1(f1); }

    Vec4(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }

    Vec4(Vec2<float> ivec0, Vec2<float> ivec1) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = ivec1.arr[0];
      arr[3] = ivec1.arr[1];
    }

    Vec4(Vec2<float> ivec0, float f3, float f4 = 0) {
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<float> ivec0) {
      vec = _mm_setzero_ps();
      arr[0] = ivec0.arr[0];
      arr[1] = ivec0.arr[1];
    }

    // for masking
    void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
      Mask4<float> mask(m1, m2, m3, m4);
      vec = mask.vec;
    }

    void set(float f1, float f2, float f3, float f4 = 0) { vec = _mm_set_ps(f4, f3, f2, f1); }

    void set1(float f1) { vec = _mm_set1_ps(f1); }

    // broadcast element N into all four slots
    template <int N>
    Vec4 get1() const {
      return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(N, N, N, N));
    }

    float &operator[](unsigned int n) { return arr[n]; }
    float operator[](unsigned int n) const { return arr[n]; }
  };
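  // Illustrative sketch (not part of the original header): setMask installs a raw
  // bit pattern, meant for the bitwise operators defined further below, e.g. to
  // select lanes or isolate sign bits.
  //
  //   mathSSE::Vec4<float> m;
  //   m.setMask(0xffffffff, 0, 0, 0);  // all bits set in lane 0 only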
  template <>
  union Vec2<double> {
    typedef __m128d nativeType;
    __m128d vec;
    double __attribute__((aligned(16))) arr[2];

    Vec2(__m128d ivec) : vec(ivec) {}

    Vec2() { vec = _mm_setzero_pd(); }

    inline Vec2(Vec2<float> ivec);

    Vec2(double f1, double f2) { vec = _mm_set_pd(f2, f1); }

    explicit Vec2(double f1) { set1(f1); }

    inline Vec2(Vec4<double> v4);

    // for masking
    void setMask(unsigned long long m1, unsigned long long m2) {
      Mask2<double> mask(m1, m2);
      vec = mask.vec;
    }

    void set(double f1, double f2) { vec = _mm_set_pd(f2, f1); }

    void set1(double f1) { vec = _mm_set1_pd(f1); }

    // broadcast element N into both slots
    template <int N>
    Vec2 get1() const {
      return Vec2(arr[N], arr[N]);
    }

    double &operator[](unsigned int n) { return arr[n]; }
    double operator[](unsigned int n) const { return arr[n]; }
  };
#ifdef CMS_USE_AVX
  // AVX specialization of Vec4<double> (a single __m256d) omitted here
#else
  // SSE-only Vec4<double>: a pair of __m128d holding (x, y) and (z, w)
  template <>
  union Vec4<double> {
    __m128d vec[2];
    double __attribute__((aligned(16))) arr[4];

    Vec4(__m128d ivec[]) {
      vec[0] = ivec[0];
      vec[1] = ivec[1];
    }

    Vec4(__m128d ivec0, __m128d ivec1) {
      vec[0] = ivec0;
      vec[1] = ivec1;
    }

    Vec4() {
      vec[0] = _mm_setzero_pd();
      vec[1] = _mm_setzero_pd();
    }

    // widen a float vector: convert the low pair, then the shuffled high pair
    Vec4(Vec4<float> ivec) {
      vec[0] = _mm_cvtps_pd(ivec.vec);
      vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
    }
    explicit Vec4(double f1) { set1(f1); }

    Vec4(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<double> ivec0, double f3, double f4 = 0) {
      vec[0] = ivec0.vec;
      arr[2] = f3;
      arr[3] = f4;
    }

    Vec4(Vec2<double> ivec0) {
      vec[0] = ivec0.vec;
      vec[1] = _mm_setzero_pd();
    }
    // for masking
    void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
      Mask4<double> mask(m1, m2, m3, m4);
      vec[0] = mask.vec[0];
      vec[1] = mask.vec[1];
    }

    void set(double f1, double f2, double f3, double f4 = 0) {
      arr[0] = f1;
      arr[1] = f2;
      arr[2] = f3;
      arr[3] = f4;
    }

    void set1(double f1) { vec[0] = vec[1] = _mm_set1_pd(f1); }

    // broadcast element N into all four slots
    template <int N>
    Vec4 get1() const {
      return Vec4(arr[N], arr[N], arr[N], arr[N]);
    }

    double &operator[](unsigned int n) { return arr[n]; }
    double operator[](unsigned int n) const { return arr[n]; }
  };
  // narrow a double vector back to float
  inline Vec4<float>::Vec4(Vec4<double> ivec) {
    vec = _mm_cvtpd_ps(ivec.vec[0]);
    __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
    vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
  }
#endif  // CMS_USE_AVX

#endif  // CMS_USE_SSE

  typedef Vec2<float> Vec2F;
  typedef Vec4<float> Vec4F;
  typedef Vec2<double> Vec2D;
  typedef Vec4<double> Vec4D;

  // view a Vec4 as a 3-vector, e.g. for printing
  template <typename T>
  struct As3D {
    Vec4<T> const &v;
    As3D(Vec4<T> const &iv) : v(iv) {}
  };
  template <typename T>
  inline As3D<T> as3D(Vec4<T> const &v) {
    return v;
  }

}  // namespace mathSSE

#ifdef CMS_USE_SSE

// float operators

inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  return _mm_movemask_ps(_mm_cmpeq_ps(a.vec, b.vec)) == 0xf;
}
inline mathSSE::Vec4F cmpeq(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_cmpeq_ps(a.vec, b.vec); }

inline mathSSE::Vec4F cmpgt(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_cmpgt_ps(a.vec, b.vec); }

#ifdef __SSE3__
inline mathSSE::Vec4F hadd(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_hadd_ps(a.vec, b.vec); }
#endif
inline mathSSE::Vec4F operator-(mathSSE::Vec4F a) {
  // flip the sign bit of every lane
  const __m128 neg = _mm_set_ps(-0.0, -0.0, -0.0, -0.0);
  return _mm_xor_ps(a.vec, neg);
}

inline mathSSE::Vec4F operator&(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_and_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator|(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_or_ps(a.vec, b.vec); }
inline mathSSE::Vec4F operator^(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_xor_ps(a.vec, b.vec); }
inline mathSSE::Vec4F andnot(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_andnot_ps(a.vec, b.vec); }
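// Note (added): the unary minus above relies on IEEE-754 sign-bit semantics;
// xor with -0.0 flips the sign of every lane without touching the other bits.
// andnot follows the intrinsic's convention: andnot(a, b) == (~a) & b.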
inline mathSSE::Vec4F operator+(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_add_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator-(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_sub_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator*(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_mul_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator/(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_div_ps(a.vec, b.vec); }

inline mathSSE::Vec4F min(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_min_ps(a.vec, b.vec); }

inline mathSSE::Vec4F max(mathSSE::Vec4F a, mathSSE::Vec4F b) { return _mm_max_ps(a.vec, b.vec); }

inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) { return _mm_mul_ps(_mm_set1_ps(a), b.vec); }

inline mathSSE::Vec4F operator*(mathSSE::Vec4F b, float a) { return _mm_mul_ps(_mm_set1_ps(a), b.vec); }

inline mathSSE::Vec4F operator/(mathSSE::Vec4F b, float a) { return _mm_div_ps(b.vec, _mm_set1_ps(a)); }
inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_dot_ps;
  float s;
  _mm_store_ss(&s, _mm_dot_ps(a.vec, b.vec));
  return s;
}

inline mathSSE::Vec4F cross(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  using mathSSE::_mm_cross_ps;
  return _mm_cross_ps(a.vec, b.vec);
}

// dot product restricted to the x and y components
inline float dotxy(mathSSE::Vec4F a, mathSSE::Vec4F b) {
  mathSSE::Vec4F mul = a * b;
#ifdef __SSE3__
  mul = hadd(mul, mul);
#else
  __m128 swp = _mm_shuffle_ps(mul.vec, mul.vec, _MM_SHUFFLE(2, 3, 0, 1));
  mul.vec = _mm_add_ps(mul.vec, swp);
#endif
  float s;
  _mm_store_ss(&s, mul.vec);
  return s;
}
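// Illustrative sketch (not part of the original header):
//
//   mathSSE::Vec4F a(1.f, 2.f, 9.f, 9.f);
//   mathSSE::Vec4F b(3.f, 4.f, 9.f, 9.f);
//   float s = dotxy(a, b);  // 1*3 + 2*4 = 11; the z and w lanes are ignored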
// double operators, 128-bit wide

inline mathSSE::Vec2<double>::Vec2(mathSSE::Vec4<double> v4) {
#ifdef CMS_USE_AVX
  vec = _mm256_castpd256_pd128(v4.vec);
#else
  vec = v4.vec[0];
#endif
}

inline mathSSE::Vec2<double>::Vec2(mathSSE::Vec2<float> ivec) {
  arr[0] = ivec.arr[0];
  arr[1] = ivec.arr[1];
}
inline mathSSE::Vec2D operator-(mathSSE::Vec2D a) {
  // flip the sign bit of both lanes
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return _mm_xor_pd(a.vec, neg);
}

inline mathSSE::Vec2D operator&(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_and_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator|(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_or_pd(a.vec, b.vec); }
inline mathSSE::Vec2D operator^(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_xor_pd(a.vec, b.vec); }
inline mathSSE::Vec2D andnot(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_andnot_pd(a.vec, b.vec); }

#ifdef __SSE3__
inline mathSSE::Vec2D hadd(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_hadd_pd(a.vec, b.vec); }
#endif

inline mathSSE::Vec2D operator+(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_add_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator-(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_sub_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator*(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_mul_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator/(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_div_pd(a.vec, b.vec); }

inline mathSSE::Vec2D min(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_min_pd(a.vec, b.vec); }

inline mathSSE::Vec2D max(mathSSE::Vec2D a, mathSSE::Vec2D b) { return _mm_max_pd(a.vec, b.vec); }

inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) { return _mm_mul_pd(_mm_set1_pd(a), b.vec); }

inline mathSSE::Vec2D operator*(mathSSE::Vec2D b, double a) { return _mm_mul_pd(_mm_set1_pd(a), b.vec); }

inline mathSSE::Vec2D operator/(mathSSE::Vec2D b, double a) { return _mm_div_pd(b.vec, _mm_set1_pd(a)); }
inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_mul_pd(a.vec, b.vec);
#ifdef __SSE3__
  res = _mm_hadd_pd(res, res);
#else
  res = _mm_add_sd(_mm_shuffle_pd(res, res, 1), res);
#endif
  double s;
  _mm_store_sd(&s, res);
  return s;
}

// 2D "cross" product: the scalar a.x*b.y - a.y*b.x
inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
  __m128d res = _mm_shuffle_pd(b.vec, b.vec, 1);  // (b.y, b.x)
  res = _mm_mul_pd(a.vec, res);                   // (a.x*b.y, a.y*b.x)
  res = _mm_sub_sd(res, _mm_shuffle_pd(res, res, 1));
  double s;
  _mm_store_sd(&s, res);
  return s;
}
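// Illustrative sketch (not part of the original header): the 2D cross is the
// determinant | a.x a.y ; b.x b.y |.
//
//   mathSSE::Vec2D a(1., 2.);
//   mathSSE::Vec2D b(3., 4.);
//   double c = cross(a, b);  // 1*4 - 2*3 = -2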
#ifndef CMS_USE_AVX

// double operators, Vec4D as a pair of __m128d

inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  return _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0], b.vec[0])) == 0x3 &&
         _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1], b.vec[1])) == 0x3;
}

inline mathSSE::Vec4D operator-(mathSSE::Vec4D a) {
  const __m128d neg = _mm_set_pd(-0.0, -0.0);
  return mathSSE::Vec4D(_mm_xor_pd(a.vec[0], neg), _mm_xor_pd(a.vec[1], neg));
}
inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}

inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
  __m128d res = _mm_set1_pd(a);
  return mathSSE::Vec4D(_mm_mul_pd(res, b.vec[0]), _mm_mul_pd(res, b.vec[1]));
}
// dot product of the first three components
inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  __m128d res = _mm_add_sd(_mm_mul_pd(a.vec[0], b.vec[0]),   // (ax*bx, ay*by)
                           _mm_mul_sd(a.vec[1], b.vec[1]));  // az*bz in the low lane
  res = _mm_add_sd(_mm_unpackhi_pd(res, res), res);          // fold in ay*by
  double s;
  _mm_store_sd(&s, res);
  return s;
}
inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
  const __m128i neg = _mm_set_epi64x(0, 0x8000000000000000);
  // a.z * (b.x, b.y)
  __m128d l1 = _mm_mul_pd(_mm_unpacklo_pd(a.vec[1], a.vec[1]), b.vec[0]);
  // b.z * (a.x, a.y)
  __m128d l2 = _mm_mul_pd(_mm_unpacklo_pd(b.vec[1], b.vec[1]), a.vec[0]);
  __m128d m1 = _mm_sub_pd(l1, l2);
  m1 = _mm_shuffle_pd(m1, m1, 1);                 // swap the two elements
  m1 = __m128d(_mm_xor_si128(__m128i(m1), neg));  // flip the sign of the first element
  // (a.x * b.y, a.y * b.x)
  l1 = _mm_mul_pd(a.vec[0], _mm_shuffle_pd(b.vec[0], b.vec[0], 1));
  // a.x * b.y - a.y * b.x in the low lane
  __m128d m2 = _mm_sub_sd(l1, _mm_unpackhi_pd(l1, l1));
  return mathSSE::Vec4D(m1, m2);
}
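// Lane bookkeeping for the cross product above (added note): with vec[0] = (x, y)
// and vec[1] = (z, w), m1 ends up holding (a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z)
// after the swap and the sign flip of its first element, and m2 holds
// a.x*b.y - a.y*b.x in its low lane, i.e. the (x, y) pair and the z component
// of a x b; the w lane of m2 is not used for 3-vectors.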
#endif  // CMS_USE_AVX

// elementwise square roots
namespace mathSSE {
  template <>
  inline Vec2D sqrt(Vec2D v) {
    return _mm_sqrt_pd(v.vec);
  }
#ifndef CMS_USE_AVX
  template <>
  inline Vec4D sqrt(Vec4D v) {
    return Vec4D(_mm_sqrt_pd(v.vec[0]), _mm_sqrt_pd(v.vec[1]));
  }
#endif
}  // namespace mathSSE
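// Illustrative sketch (not part of the original header):
//
//   mathSSE::Vec2D v(4., 9.);
//   mathSSE::Vec2D r = mathSSE::sqrt(v);  // (2, 3)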
#endif  // CMS_USE_SSE

#include <iosfwd>
std::ostream &operator<<(std::ostream &out, mathSSE::Vec2D const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::As3D<float> const &v);
std::ostream &operator<<(std::ostream &out, mathSSE::As3D<double> const &v);

#endif  // DataFormat_Math_SSEVec_H