1 #ifndef DataFormat_Math_SSEVec_H
2 #define DataFormat_Math_SSEVec_H
4 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ > 4)
14 #include <emmintrin.h>
17 #include <pmmintrin.h>
20 #include <smmintrin.h>
/// Sign comparison of two values of the same type.
/// Only the primary template is declared here; the integer specialisations
/// that follow test the top (sign) bit of rh XOR lh.
/// @param rh first value
/// @param lh second value
/// @return true when rh and lh carry the same sign bit
template <typename T>
inline bool samesign(T rh, T lh);
38 int const mask= 0x80000000;
39 return ((rh^lh)&mask) == 0;
45 long long const mask= 0x8000000000000000LL;
46 return ((rh^lh)&mask) == 0;
52 union {
int i;
float f; }
a,
b;
60 union {
long long i;
double f; }
a,
b;
70 inline __m128 _mm_dot_ps(__m128 v1, __m128 v2) {
72 return _mm_dp_ps(v1, v2, 0xff);
74 __m128 mul = _mm_mul_ps(v1, v2);
76 mul = _mm_hadd_ps(mul,mul);
77 return _mm_hadd_ps(mul,mul);
79 __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
80 mul = _mm_add_ps(mul, swp);
81 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
82 return _mm_add_ps(mul, swp);
89 inline __m128 _mm_cross_ps(__m128 v1, __m128 v2) {
90 __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
91 __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));
93 __m128 v5 = _mm_mul_ps(v3, v4);
95 v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
96 v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));
98 v3 = _mm_mul_ps(v3, v4);
99 const __m128
neg = _mm_set_ps(0.0
f,0.0
f,-0.0
f,0.0
f);
100 return _mm_xor_ps(_mm_sub_ps(v5, v3), neg);
104 #endif // CMS_USE_SSE
111 template<
typename T>
union Vec2{
113 arr[0] = 0; arr[1] = 0;
116 arr[0] =
f1; arr[1] =
f2;
119 arr[0] =
f1; arr[1] =
f1;
122 arr[0] =
f1; arr[1] =
f2;
125 return Vec2(arr[n],arr[n]);
143 arr[0] = 0; arr[1] = 0; arr[2] = 0; arr[3]=0;
146 arr[0] =
f1; arr[1] =
f2; arr[2] =
f3; arr[3]=
f4;
152 arr[0] =
f1; arr[1] =
f2; arr[2] =
f3; arr[3]=
f4;
155 arr[0] =
f1; arr[1] =
f1; arr[2] =
f1; arr[3]=
f1;
158 return Vec4(arr[n],arr[n],arr[n],arr[n]);
175 typedef __m128 nativeType;
180 Vec4(__m128 ivec) : vec(ivec) {}
185 vec = _mm_setzero_ps();
188 explicit Vec4(
float f1) {
192 Vec4(
float f1,
float f2,
float f3,
float f4=0) {
193 arr[0] =
f1; arr[1] =
f2; arr[2] =
f3; arr[3]=
f4;
196 void set(
float f1,
float f2,
float f3,
float f4=0) {
197 vec = _mm_set_ps(
f4, f3, f2, f1);
199 void set1(
float f1) {
200 vec = _mm_set1_ps(f1);
203 Vec4
get1(
unsigned int n)
const {
204 return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(n, n, n, n));
222 typedef __m128d nativeType;
226 Vec2(__m128d ivec) : vec(ivec) {}
229 vec = _mm_setzero_pd();
233 arr[0] =
f1; arr[1] =
f2;
236 explicit Vec2(
double f1) {
240 void set(
double f1,
double f2) {
241 arr[0] =
f1; arr[1] =
f2;
244 void set1(
double f1) {
245 vec = _mm_set1_pd(f1);
249 return Vec2(arr[n],arr[n]);
264 Vec4(__m128d ivec[]) {
269 Vec4(__m128d ivec0, __m128d ivec1) {
275 vec[0] = _mm_setzero_pd();
276 vec[1] = _mm_setzero_pd();
279 explicit Vec4(
double f1) {
283 Vec4(
double f1,
double f2,
double f3,
double f4=0) {
284 arr[0] =
f1; arr[1] =
f2; arr[2] =
f3; arr[3]=
f4;
287 Vec4( Vec2<double> ivec0, Vec2<double> ivec1) {
292 Vec4( Vec2<double> ivec0,
double f3,
double f4=0) {
294 arr[2] =
f3; arr[3] =
f4;
297 Vec4( Vec2<double> ivec0) {
299 vec[1] = _mm_setzero_pd();
303 Vec4(OldVec<double>
const & ivec) : o(ivec) {}
305 void set(
double f1,
double f2,
double f3,
double f4=0) {
306 arr[0] =
f1; arr[1] =
f2; arr[2] =
f3; arr[3]=
f4;
309 void set1(
double f1) {
310 vec[0] = vec[1]= _mm_set1_pd(f1);
314 Vec4
get1(
unsigned int n)
const {
315 return Vec4(arr[n],arr[n],arr[n],arr[n]);
326 Vec2<double>
xy()
const {
return vec[0];}
327 Vec2<double> zw()
const {
return vec[1];}
331 #endif // CMS_USE_SSE
356 using mathSSE::_mm_dot_ps;
358 _mm_store_ss(&s,_mm_dot_ps(a.vec,b.vec));
363 using mathSSE::_mm_cross_ps;
364 return _mm_cross_ps(a.vec,b.vec);
369 return _mm_movemask_ps(_mm_cmpeq_ps(a.vec,b.vec))==0xf;
373 return _mm_cmpeq_ps(a.vec,b.vec);
377 return _mm_cmpgt_ps(a.vec,b.vec);
382 return _mm_hadd_ps(a.vec,b.vec);
388 const __m128 neg = _mm_set_ps ( -0.0 , -0.0 , -0.0, -0.0);
389 return _mm_xor_ps(a.vec,neg);
393 return _mm_and_ps(a.vec,b.vec);
396 return _mm_or_ps(a.vec,b.vec);
399 return _mm_xor_ps(a.vec,b.vec);
402 return _mm_andnot_ps(a.vec,b.vec);
407 return _mm_add_ps(a.vec,b.vec);
411 return _mm_sub_ps(a.vec,b.vec);
415 return _mm_mul_ps(a.vec,b.vec);
419 return _mm_div_ps(a.vec,b.vec);
423 return _mm_mul_ps(_mm_set1_ps(a),b.vec);
427 return _mm_mul_ps(_mm_set1_ps(a),b.vec);
433 const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
434 return _mm_xor_pd(a.vec,neg);
439 return _mm_and_pd(a.vec,b.vec);
442 return _mm_or_pd(a.vec,b.vec);
445 return _mm_xor_pd(a.vec,b.vec);
448 return _mm_andnot_pd(a.vec,b.vec);
453 return _mm_add_pd(a.vec,b.vec);
457 return _mm_sub_pd(a.vec,b.vec);
461 return _mm_mul_pd(a.vec,b.vec);
465 return _mm_div_pd(a.vec,b.vec);
469 return _mm_mul_pd(_mm_set1_pd(a),b.vec);
473 return _mm_mul_pd(_mm_set1_pd(a),b.vec);
479 __m128d res = _mm_mul_pd ( a.vec, b.vec);
480 res = _mm_add_sd ( _mm_shuffle_pd ( res , res, 1 ), res );
482 _mm_store_sd(&s,res);
489 __m128d res = _mm_shuffle_pd ( b.vec, b.vec, 1);
490 res = _mm_mul_pd ( a.vec , res );
491 res = _mm_sub_sd (res, _mm_shuffle_pd ( res , res, 1 ));
493 _mm_store_sd(&s,res);
502 _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0],b.vec[0]))==0x3 &&
503 _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1],b.vec[1]))==0x3 ;
507 const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
508 return mathSSE::Vec4D(_mm_xor_pd(a.vec[0],neg),_mm_xor_pd(a.vec[1],neg));
513 return mathSSE::Vec4D(_mm_add_pd(a.vec[0],b.vec[0]),_mm_add_pd(a.vec[1],b.vec[1]));
516 return mathSSE::Vec4D(_mm_sub_pd(a.vec[0],b.vec[0]),_mm_sub_pd(a.vec[1],b.vec[1]));
519 return mathSSE::Vec4D(_mm_mul_pd(a.vec[0],b.vec[0]),_mm_mul_pd(a.vec[1],b.vec[1]));
522 return mathSSE::Vec4D(_mm_div_pd(a.vec[0],b.vec[0]),_mm_div_pd(a.vec[1],b.vec[1]));
526 __m128d res = _mm_set1_pd(a);
527 return mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
531 __m128d res = _mm_set1_pd(a);
532 return mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
540 __m128d res = _mm_add_sd ( _mm_mul_pd ( a.vec[0], b.vec[0]),
541 _mm_mul_sd ( a.vec[1], b.vec[1])
543 res = _mm_add_sd ( _mm_unpackhi_pd ( res , res ), res );
545 _mm_store_sd(&s,res);
552 const __m128d neg = _mm_set_pd ( 0.0 , -0.0 );
554 __m128d l1 = _mm_mul_pd ( _mm_unpacklo_pd ( a.vec[1] , a.vec[1] ), b.vec[0] );
556 __m128d l2 = _mm_mul_pd ( _mm_unpacklo_pd ( b.vec[1], b.vec[1] ), a.vec[0] );
557 __m128d m1 = _mm_sub_pd ( l1 , l2 );
558 m1 = _mm_shuffle_pd ( m1 , m1 , 1 );
559 m1 = _mm_xor_pd ( m1 , neg );
561 l1 = _mm_mul_pd ( a.vec[0] , _mm_shuffle_pd ( b.vec[0] , b.vec[0] , 1 ) );
563 __m128d m2 = _mm_sub_sd ( l1 , _mm_unpackhi_pd ( l1 , l1 ) );
575 return Vec4D(_mm_sqrt_pd(v.vec[0]),_mm_sqrt_pd(v.vec[1]));
586 inline void sincos(
Vec4F v,
Vec4F & s,
Vec4F &
c) { sincos_ps(v.vec,&s.vec, &c.vec);}
588 inline float log(
float f) {
float s; _mm_store_ss(&s,log_ps(_mm_load_ss(&f)));
return s;}
589 inline float exp(
float f) {
float s; _mm_store_ss(&s,exp_ps(_mm_load_ss(&f)));
return s;}
590 inline float sin(
float f) {
float s; _mm_store_ss(&s,sin_ps(_mm_load_ss(&f)));
return s;}
591 inline float cos(
float f) {
float s; _mm_store_ss(&s,log_ps(_mm_load_ss(&f)));
return s;}
592 inline void sincos(
float f,
float & s,
float &
c) {
594 sincos_ps(_mm_load_ss(&f),&vs, &vc);
595 _mm_store_ss(&s,vs);_mm_store_ss(&c,vc);
598 #endif // CMS_USE_SSE
606 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<float>
const &
v);
607 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<double>
const &
v);
610 #endif // DataFormat_Math_SSEVec_H
T & operator[](unsigned int n)
Vec2 get1(unsigned int n) const
MatrixMeschach operator+(const MatrixMeschach &mat1, const MatrixMeschach &mat2)
Sin< T >::type sin(const T &t)
MatrixMeschach operator-(const MatrixMeschach &mat1, const MatrixMeschach &mat2)
Basic3DVector cross(const Basic3DVector &v) const
Vector product, or "cross" product, with a vector of same type.
strbitset operator|(const strbitset &l, const strbitset &r)
T dot(const Basic3DVector &v) const
Scalar product, or "dot" product, with a vector of same type.
bool operator==(const CaloTower &t1, const CaloTower &t2)
Exp< T >::type exp(const T &t)
std::ostream & operator<<(std::ostream &out, const ALILine &li)
T __attribute__((aligned(16))) arr[2]
Vec4 get1(unsigned int n) const
T operator[](unsigned int n) const
Vec4(float f1, float f2, float f3, float f4=0)
As3D< T > as3D(Vec4< T > const &v)
Cos< T >::type cos(const T &t)
Basic2DVector< T > operator/(const Basic2DVector< T > &v, const Scalar &s)
struct mathSSE::Rot3 __attribute__
return samesign< long long >(a.i, b.i)
return samesign< int >(a.i, b.i)
Log< T >::type log(const T &t)
As3D(Vec4< T > const &iv)
bool samesign(T rh, T lh)
MatrixMeschach operator*(const MatrixMeschach &mat1, const MatrixMeschach &mat2)
TwoHolder< T, U > operator&(const T &iT, const U &iU)
std::auto_ptr< ParameterDescriptionNode > operator^(ParameterDescriptionNode const &node_left, ParameterDescriptionNode const &node_right)
void set(float f1, float f2, float f3, float f4=0)
Basic2DVector< T > xy() const