1 #ifndef DataFormat_Math_SSEVec_H
2 #define DataFormat_Math_SSEVec_H
4 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ > 4)
14 #include <emmintrin.h>
17 #include <pmmintrin.h>
20 #include <smmintrin.h>
38 int const mask= 0x80000000;
39 return ((rh^lh)&mask) == 0;
45 long long const mask= 0x8000000000000000LL;
46 return ((rh^lh)&mask) == 0;
52 union {
int i;
float f; }
a,
b;
60 union {
long long i;
double f; }
a,
b;
// Four-lane dot product: returns a vector with (v1·v2) broadcast into every
// lane.  Dispatches at compile time on the best available instruction set:
// SSE4.1 has a single dpps instruction, SSE3 can use horizontal adds, and
// plain SSE2 falls back to shuffle+add reduction.
// NOTE(review): the preprocessor guards below were missing from the garbled
// text (unreachable statements after `return`); restored to the canonical
// three-way dispatch structure.
inline __m128 _mm_dot_ps(__m128 v1, __m128 v2) {
#ifdef __SSE4_1__
  // dpps with mask 0xff: multiply all four lanes, broadcast sum to all lanes.
  return _mm_dp_ps(v1, v2, 0xff);
#else
  __m128 mul = _mm_mul_ps(v1, v2);
#ifdef __SSE3__
  // Two horizontal adds reduce the four products to a broadcast sum.
  mul = _mm_hadd_ps(mul, mul);
  return _mm_hadd_ps(mul, mul);
#else
  // SSE2 reduction: add high pair onto low pair, then swap adjacent lanes.
  __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
  mul = _mm_add_ps(mul, swp);
  swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
  return _mm_add_ps(mul, swp);
#endif
#endif
}
// 3D cross product of the xyz lanes of v1 and v2 (lane 3 is ignored and comes
// out zero).  Computed with two shuffle/multiply passes and one sign flip:
// the shuffles pair up the (y,z), (z,x), (x,y) components so that the final
// subtraction yields (v1 x v2) in lanes 0..2.
// NOTE(review): the float literals of the sign mask were split across lines
// in the garbled text and the closing brace was missing; reassembled here.
inline __m128 _mm_cross_ps(__m128 v1, __m128 v2) {
  __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
  __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));

  __m128 v5 = _mm_mul_ps(v3, v4);

  v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
  v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));

  v3 = _mm_mul_ps(v3, v4);
  // XOR with -0.0f in lane 1 flips the sign of the y component, completing
  // the anti-symmetric combination.
  const __m128 neg = _mm_set_ps(0.0f, 0.0f, -0.0f, 0.0f);
  return _mm_xor_ps(_mm_sub_ps(v5, v3), neg);
}
104 #endif // CMS_USE_SSE
// Forward declaration of the generic 4-element vector union; the primary
// template and its SSE specializations are defined further down in this file.
110 template<
typename T>
union Vec4;
112 template<
typename T>
union Vec2{
132 arr[0] = v[0];
arr[1] = v[1];
151 template<
typename T>
union Vec4{
153 arr[0] = 0; arr[1] = 0; arr[2] = 0; arr[3]=0;
156 arr[0] =
f1; arr[1] =
f2; arr[2] =
f3; arr[3]=
f4;
162 arr[0] =
f1; arr[1] =
f2; arr[2] =
f3; arr[3]=
f4;
165 arr[0] =
f1; arr[1] =
f1; arr[2] =
f1; arr[3]=
f1;
168 return Vec4(arr[n],arr[n],arr[n],arr[n]);
182 arr[0]=v4[0];arr[1]=v4[1];
// Primary template: empty bit-mask type for 2-element vectors.  Only the
// SSE specializations (e.g. Mask2<double> below) carry actual storage.
193 template<
typename T>
union Mask2{};
// Primary template: empty bit-mask type for 4-element vectors.  Only the
// SSE specializations (Mask4<float>, Mask4<double> below) carry storage.
194 template<
typename T>
union Mask4{};
200 Mask4() {vec = _mm_setzero_ps();}
201 Mask4(
unsigned int m1,
unsigned int m2,
unsigned int m3,
unsigned int m4) {
202 mask[0]=m1; mask[1]=m2; mask[2]=m3; mask[3]=m4;
207 union Mask4<double> {
211 vec[0] = _mm_setzero_pd();
212 vec[1] = _mm_setzero_pd();
214 Mask4(
unsigned long long m1,
unsigned long long m2,
unsigned long long m3,
unsigned long long m4) {
215 mask[0]=m1; mask[1]=m2; mask[2]=m3; mask[3]=m4;
221 union Mask2<double> {
225 vec = _mm_setzero_pd();
227 Mask2(
unsigned long long m1,
unsigned long long m2) {
228 mask[0]=m1; mask[1]=m2;
234 typedef __m128 nativeType;
239 Vec4(__m128 ivec) : vec(ivec) {}
241 Vec4(OldVec<float>
const & ivec) :
o(ivec) {}
244 vec = _mm_setzero_ps();
248 inline Vec4(Vec4<double> ivec);
255 arr[0] =
f1; arr[1] =
f2; arr[2] =
f3; arr[3]=
f4;
259 Vec4( Vec2<float> ivec0, Vec2<float> ivec1) {
260 arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
261 arr[2] = ivec1.arr[0]; arr[3]=ivec1.arr[1];
264 Vec4( Vec2<float> ivec0,
float f3,
float f4=0) {
265 arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
266 arr[2] =
f3; arr[3] =
f4;
269 Vec4( Vec2<float> ivec0) {
270 vec = _mm_setzero_ps();
271 arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
276 void setMask(
unsigned int m1,
unsigned int m2,
unsigned int m3,
unsigned int m4) {
277 Mask4<float> mask(m1,m2,m3,m4); vec=mask.vec;
280 void set(
float f1,
float f2,
float f3,
float f4=0) {
281 vec = _mm_set_ps(
f4, f3, f2, f1);
285 vec = _mm_set1_ps(f1);
289 return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(n, n, n, n));
292 float & operator[](
unsigned int n) {
296 float operator[](
unsigned int n)
const {
300 Vec2<float>
xy()
const {
return Vec2<float>(arr[0],arr[1]);}
301 Vec2<float>
zw()
const {
return Vec2<float>(arr[2],arr[3]);}
308 typedef __m128d nativeType;
312 Vec2(__m128d ivec) : vec(ivec) {}
315 vec = _mm_setzero_pd();
319 inline Vec2(Vec2<float> ivec);
322 Vec2(
double f1,
double f2) {
323 arr[0] =
f1; arr[1] =
f2;
326 explicit Vec2(
double f1) {
330 inline Vec2(Vec4<double> v4);
333 void setMask(
unsigned long long m1,
unsigned long long m2) {
334 Mask2<double> mask(m1,m2); vec=mask.vec;
338 void set(
double f1,
double f2) {
339 arr[0] =
f1; arr[1] =
f2;
343 vec = _mm_set1_pd(f1);
346 Vec2
get1(
unsigned int n)
const {
347 return Vec2(arr[n],arr[n]);
350 double operator[](
unsigned int n)
const {
362 Vec4(__m128d ivec[]) {
367 Vec4(__m128d ivec0, __m128d ivec1) {
373 vec[0] = _mm_setzero_pd();
374 vec[1] = _mm_setzero_pd();
377 Vec4( Vec4<float> ivec) {
378 vec[0] = _mm_cvtps_pd(ivec.vec);
379 vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
382 explicit Vec4(
double f1) {
387 arr[0] =
f1; arr[1] =
f2; arr[2] =
f3; arr[3]=
f4;
390 Vec4( Vec2<double> ivec0, Vec2<double> ivec1) {
395 Vec4( Vec2<double> ivec0,
double f3,
double f4=0) {
397 arr[2] =
f3; arr[3] =
f4;
400 Vec4( Vec2<double> ivec0) {
402 vec[1] = _mm_setzero_pd();
406 Vec4(OldVec<double>
const & ivec) :
o(ivec) {}
409 void setMask(
unsigned long long m1,
unsigned long long m2,
unsigned long long m3,
unsigned long long m4) {
410 Mask4<double> mask(m1,m2,m3,m4); vec[0]=mask.vec[0]; vec[1]=mask.vec[1];
414 void set(
double f1,
double f2,
double f3,
double f4=0) {
415 arr[0] =
f1; arr[1] =
f2; arr[2] =
f3; arr[3]=
f4;
419 vec[0] = vec[1]= _mm_set1_pd(f1);
424 return Vec4(arr[n],arr[n],arr[n],arr[n]);
427 double & operator[](
unsigned int n) {
431 double operator[](
unsigned int n)
const {
435 Vec2<double>
xy()
const {
return vec[0];}
436 Vec2<double>
zw()
const {
return vec[1];}
442 vec = _mm_cvtpd_ps(ivec.vec[0]);
443 __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
444 vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
448 #endif // CMS_USE_SSE
474 using mathSSE::_mm_dot_ps;
476 _mm_store_ss(&s,_mm_dot_ps(a.vec,b.vec));
481 using mathSSE::_mm_cross_ps;
482 return _mm_cross_ps(a.vec,b.vec);
487 return _mm_movemask_ps(_mm_cmpeq_ps(a.vec,b.vec))==0xf;
491 return _mm_cmpeq_ps(a.vec,b.vec);
495 return _mm_cmpgt_ps(a.vec,b.vec);
500 return _mm_hadd_ps(a.vec,b.vec);
506 const __m128 neg = _mm_set_ps ( -0.0 , -0.0 , -0.0, -0.0);
507 return _mm_xor_ps(a.vec,neg);
511 return _mm_and_ps(a.vec,b.vec);
514 return _mm_or_ps(a.vec,b.vec);
517 return _mm_xor_ps(a.vec,b.vec);
520 return _mm_andnot_ps(a.vec,b.vec);
525 return _mm_add_ps(a.vec,b.vec);
529 return _mm_sub_ps(a.vec,b.vec);
533 return _mm_mul_ps(a.vec,b.vec);
537 return _mm_div_ps(a.vec,b.vec);
541 return _mm_mul_ps(_mm_set1_ps(a),b.vec);
545 return _mm_mul_ps(_mm_set1_ps(a),b.vec);
585 return a.arr[0]*b.arr[0] + a.arr[1]*b.arr[1];
591 return a.arr[0]*b.arr[1] - a.arr[1]*b.arr[0];
603 arr[0] = ivec.arr[0]; arr[1] = ivec.arr[1];
607 const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
608 return _mm_xor_pd(a.vec,neg);
613 return _mm_and_pd(a.vec,b.vec);
616 return _mm_or_pd(a.vec,b.vec);
619 return _mm_xor_pd(a.vec,b.vec);
622 return _mm_andnot_pd(a.vec,b.vec);
627 return _mm_add_pd(a.vec,b.vec);
631 return _mm_sub_pd(a.vec,b.vec);
635 return _mm_mul_pd(a.vec,b.vec);
639 return _mm_div_pd(a.vec,b.vec);
643 return _mm_mul_pd(_mm_set1_pd(a),b.vec);
647 return _mm_mul_pd(_mm_set1_pd(a),b.vec);
653 __m128d res = _mm_mul_pd ( a.vec, b.vec);
654 res = _mm_add_sd ( _mm_shuffle_pd ( res , res, 1 ), res );
656 _mm_store_sd(&s,res);
663 __m128d res = _mm_shuffle_pd ( b.vec, b.vec, 1);
664 res = _mm_mul_pd ( a.vec , res );
665 res = _mm_sub_sd (res, _mm_shuffle_pd ( res , res, 1 ));
667 _mm_store_sd(&s,res);
676 _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0],b.vec[0]))==0x3 &&
677 _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1],b.vec[1]))==0x3 ;
681 const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
682 return mathSSE::Vec4D(_mm_xor_pd(a.vec[0],neg),_mm_xor_pd(a.vec[1],neg));
687 return mathSSE::Vec4D(_mm_add_pd(a.vec[0],b.vec[0]),_mm_add_pd(a.vec[1],b.vec[1]));
692 return mathSSE::Vec4D(_mm_sub_pd(a.vec[0],b.vec[0]),_mm_sub_pd(a.vec[1],b.vec[1]));
695 return mathSSE::Vec4D(_mm_mul_pd(a.vec[0],b.vec[0]),_mm_mul_pd(a.vec[1],b.vec[1]));
698 return mathSSE::Vec4D(_mm_div_pd(a.vec[0],b.vec[0]),_mm_div_pd(a.vec[1],b.vec[1]));
702 __m128d res = _mm_set1_pd(a);
703 return mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
707 __m128d res = _mm_set1_pd(a);
708 return mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
730 __m128d res = _mm_add_sd ( _mm_mul_pd ( a.vec[0], b.vec[0]),
731 _mm_mul_sd ( a.vec[1], b.vec[1])
733 res = _mm_add_sd ( _mm_unpackhi_pd ( res , res ), res );
735 _mm_store_sd(&s,res);
742 const __m128d neg = _mm_set_pd ( 0.0 , -0.0 );
744 __m128d l1 = _mm_mul_pd ( _mm_unpacklo_pd ( a.vec[1] , a.vec[1] ), b.vec[0] );
746 __m128d l2 = _mm_mul_pd ( _mm_unpacklo_pd ( b.vec[1], b.vec[1] ), a.vec[0] );
747 __m128d m1 = _mm_sub_pd ( l1 , l2 );
748 m1 = _mm_shuffle_pd ( m1 , m1 , 1 );
749 m1 = _mm_xor_pd ( m1 , neg );
751 l1 = _mm_mul_pd ( a.vec[0] , _mm_shuffle_pd ( b.vec[0] , b.vec[0] , 1 ) );
753 __m128d m2 = _mm_sub_sd ( l1 , _mm_unpackhi_pd ( l1 , l1 ) );
764 template<>
inline Vec2D sqrt(
Vec2D v) {
return _mm_sqrt_pd(v.vec);}
766 return Vec4D(_mm_sqrt_pd(v.vec[0]),_mm_sqrt_pd(v.vec[1]));
777 inline void sincos(
Vec4F v,
Vec4F & s,
Vec4F &
c) { sincos_ps(v.vec,&s.vec, &c.vec);}
779 inline float log(
float f) {
float s; _mm_store_ss(&s,log_ps(_mm_load_ss(&f)));
return s;}
780 inline float exp(
float f) {
float s; _mm_store_ss(&s,exp_ps(_mm_load_ss(&f)));
return s;}
781 inline float sin(
float f) {
float s; _mm_store_ss(&s,sin_ps(_mm_load_ss(&f)));
return s;}
782 inline float cos(
float f) {
float s; _mm_store_ss(&s,log_ps(_mm_load_ss(&f)));
return s;}
783 inline void sincos(
float f,
float & s,
float &
c) {
785 sincos_ps(_mm_load_ss(&f),&vs, &vc);
786 _mm_store_ss(&s,vs);_mm_store_ss(&c,vc);
789 #endif // CMS_USE_SSE
// Stream-insertion declarations for the As3D view wrappers (print only the
// xyz components of a Vec4).  Definitions live in a separate translation
// unit — presumably the matching .cc file; verify against the build.
798 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<float>
const &
v);
799 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<double>
const &
v);
802 #endif // DataFormat_Math_SSEVec_H
T & operator[](unsigned int n)
Vec2 get1(unsigned int n) const
MatrixMeschach operator+(const MatrixMeschach &mat1, const MatrixMeschach &mat2)
Sin< T >::type sin(const T &t)
MatrixMeschach operator-(const MatrixMeschach &mat1, const MatrixMeschach &mat2)
strbitset operator|(const strbitset &l, const strbitset &r)
bool operator==(const CaloTower &t1, const CaloTower &t2)
Exp< T >::type exp(const T &t)
std::ostream & operator<<(std::ostream &out, const ALILine &li)
Basic3DVector< long double > operator/(const Basic3DVector< long double > &v, S s)
Vec4 get1(unsigned int n) const
T operator[](unsigned int n) const
Vec4(float f1, float f2, float f3, float f4=0)
As3D< T > as3D(Vec4< T > const &v)
Cos< T >::type cos(const T &t)
struct mathSSE::Rot3 __attribute__
return samesign< long long >(a.i, b.i)
return samesign< int >(a.i, b.i)
Log< T >::type log(const T &t)
As3D(Vec4< T > const &iv)
bool samesign(T rh, T lh)
T dot(const Basic3DVector &v) const
Scalar product, or "dot" product, with a vector of same type.
MatrixMeschach operator*(const MatrixMeschach &mat1, const MatrixMeschach &mat2)
TwoHolder< T, U > operator&(const T &iT, const U &iU)
std::auto_ptr< ParameterDescriptionNode > operator^(ParameterDescriptionNode const &node_left, ParameterDescriptionNode const &node_right)
T __attribute__((aligned(16))) arr[4]
void set(float f1, float f2, float f3, float f4=0)
Basic3DVector cross(const Basic3DVector &v) const
Vector product, or "cross" product, with a vector of same type.