CMS 3D CMS Logo

/data/doxygen/doxygen-1.7.3/gen/CMSSW_4_2_8/src/DataFormats/Math/interface/SSEVec.h

Go to the documentation of this file.
00001 #ifndef DataFormat_Math_SSEVec_H
00002 #define DataFormat_Math_SSEVec_H
00003 
00004 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ > 4)
00005 #include <x86intrin.h>
00006 #define CMS_USE_SSE
00007 
00008 #else
00009 
00010 #ifdef __SSE2__
00011 #define CMS_USE_SSE
00012 
00013 #include <mmintrin.h>
00014 #include <emmintrin.h>
00015 #endif
00016 #ifdef __SSE3__
00017 #include <pmmintrin.h>
00018 #endif
00019 #ifdef __SSE4_1__
00020 #include <smmintrin.h>
00021 #endif
00022 
00023 #endif
00024 
00025 #include<cmath>
00026 
00027 namespace mathSSE {
00028   template<typename T> inline T sqrt(T t) { return std::sqrt(t);}
00029 }
00030 
00031 namespace mathSSE {
00032   //
00033   template<typename T> inline bool samesign(T rh, T lh);
00034 
00035   template<>
00036   inline bool
00037   __attribute__((always_inline)) __attribute__ ((pure)) samesign<int>(int rh, int lh) {
00038     int const mask= 0x80000000;
00039     return ((rh^lh)&mask) == 0;
00040   }
00041 
00042   template<>
00043   inline bool
00044   __attribute__((always_inline)) __attribute__ ((pure)) samesign<long long>(long long rh, long long lh) {
00045     long long const mask= 0x8000000000000000LL;
00046     return ((rh^lh)&mask) == 0;
00047   }
00048 
00049   template<>
00050   inline bool
00051   __attribute__((always_inline)) __attribute__ ((pure)) samesign<float>(float rh, float lh) {
00052     union { int i; float f; } a, b;
00053     a.f=rh; b.f=lh;
00054     return samesign<int>(a.i,b.i);
00055   }
00056 
00057   template<>
00058   inline bool
00059   __attribute__((always_inline)) __attribute__ ((pure)) samesign<double>(double rh, double lh) {
00060     union { long long i; double f; } a, b;
00061     a.f=rh; b.f=lh;
00062     return samesign<long long>(a.i,b.i);
00063   }
00064 }
00065 
00066 
00067 namespace mathSSE {
00068 #ifdef  CMS_USE_SSE
00069   //dot
00070   inline __m128 _mm_dot_ps(__m128 v1, __m128 v2) {
00071 #ifdef __SSE4_1__
00072     return _mm_dp_ps(v1, v2, 0xff);
00073 #else
00074     __m128 mul = _mm_mul_ps(v1, v2);
00075 #ifdef __SSE3__
00076     mul = _mm_hadd_ps(mul,mul);
00077     return _mm_hadd_ps(mul,mul);
00078 #else
00079     __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
00080     mul = _mm_add_ps(mul, swp);
00081     swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
00082     return _mm_add_ps(mul, swp);
00083 #endif
00084 #endif
00085   }
00086   
00087 
00088   // cross (just 3x3) 
00089   inline __m128 _mm_cross_ps(__m128 v1, __m128 v2) {
00090     __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
00091     __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));
00092     
00093     __m128 v5 = _mm_mul_ps(v3, v4);
00094     
00095     v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
00096     v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));
00097     
00098     v3 = _mm_mul_ps(v3, v4);
00099     const  __m128 neg = _mm_set_ps(0.0f,0.0f,-0.0f,0.0f);
00100     return _mm_xor_ps(_mm_sub_ps(v5, v3), neg);
00101   }
00102 
00103 
00104 #endif // CMS_USE_SSE
00105 
00106 
00107   template<typename T>
00108   struct OldVec { T  theX; T  theY; T  theZ; T  theW;}  __attribute__ ((aligned (16)));
00109   
00110   template<typename T> union Vec4;
00111 
00112   template<typename T> union Vec2{
00113     Vec2() {
00114       arr[0] = 0; arr[1] = 0;
00115     }
00116     Vec2(T f1, T f2) {
00117       arr[0] = f1; arr[1] = f2;
00118     }
00119     explicit Vec2(T f1) {
00120       arr[0] = f1; arr[1] = f1;
00121     }
00122 
00123     void set(T f1, T f2) {
00124       arr[0] = f1; arr[1] = f2;
00125     }
00126     Vec2 get1(unsigned int n) const {
00127       return Vec2(arr[n],arr[n]);
00128     }
00129 
00130     template<typename U> 
00131     Vec2(Vec2<U> v) {
00132       arr[0] = v[0]; arr[1] = v[1];
00133 
00134     }
00135 
00136     inline Vec2(Vec4<T> v4);
00137 
00138     T & operator[](unsigned int n) {
00139       return arr[n];
00140     }
00141 
00142     T operator[](unsigned int n) const {
00143       return arr[n];
00144     }
00145 
00146 
00147     T arr[2];
00148   };
00149 
00150 
00151   template<typename T> union Vec4{
00152     Vec4() {
00153       arr[0] = 0; arr[1] = 0; arr[2] = 0; arr[3]=0;
00154     }
00155     Vec4(float f1, float f2, float f3, float f4=0) {
00156       arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
00157     }
00158     explicit Vec4(float f1) {
00159       set1(f1);
00160     }
00161     void set(float f1, float f2, float f3, float f4=0) {
00162       arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
00163     }
00164     void set1(float f1) {
00165       arr[0] = f1; arr[1] = f1; arr[2] = f1; arr[3]=f1;
00166     }
00167     Vec4 get1(unsigned int n) const {
00168       return Vec4(arr[n],arr[n],arr[n],arr[n]);
00169     }
00170 
00171     Vec2<T> xy() const { return  Vec2<T>(arr[0],arr[1]);}
00172     Vec2<T> zw() const { return  Vec2<T>(arr[2],arr[3]);}
00173 
00174 
00175 
00176     T __attribute__ ((aligned(16))) arr[4];
00177     OldVec<T> o;
00178   };
00179 
00180   template<typename T>
00181   inline Vec2<T>::Vec2(Vec4<T> v4) {
00182     arr[0]=v4[0];arr[1]=v4[1];
00183   }
00184 
00185 
00186 #ifdef CMS_USE_SSE
00187 
00188   template<>
00189   union Vec2<double>;
00190   template<>
00191   union Vec4<double>;
00192 
00193   template<typename T> union Mask2{};
00194   template<typename T> union Mask4{};
00195 
00196   template<>
00197   union Mask4<float> {
00198     __m128 vec;
00199     unsigned int __attribute__ ((aligned(16))) mask[4];
00200     Mask4() {vec = _mm_setzero_ps();}
00201     Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
00202       mask[0]=m1;  mask[1]=m2;  mask[2]=m3;  mask[3]=m4; 
00203     }
00204   };
00205   
00206   template<>
00207   union Mask4<double> {
00208     __m128d vec[2];
00209     unsigned long long __attribute__ ((aligned(16))) mask[4];
00210     Mask4() {
00211       vec[0] = _mm_setzero_pd();
00212       vec[1] = _mm_setzero_pd();
00213     }
00214     Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
00215       mask[0]=m1;  mask[1]=m2;  mask[2]=m3;  mask[3]=m4; 
00216     }
00217   };
00218 
00219 
00220   template<>
00221   union Mask2<double> {
00222     __m128d vec;
00223     unsigned long long __attribute__ ((aligned(16))) mask[2];
00224     Mask2() {
00225       vec = _mm_setzero_pd();
00226     }
00227     Mask2(unsigned long long m1, unsigned long long m2) {
00228       mask[0]=m1;  mask[1]=m2; 
00229     }
00230   };
00231 
00232   template<>
00233   union Vec4<float> {
00234     typedef  __m128 nativeType;
00235     __m128 vec;
00236     float __attribute__ ((aligned(16))) arr[4];
00237     OldVec<float> o;
00238     
00239     Vec4(__m128 ivec) : vec(ivec) {}
00240 
00241     Vec4(OldVec<float> const & ivec) : o(ivec) {}
00242     
00243     Vec4() {
00244       vec = _mm_setzero_ps();
00245     }
00246 
00247 
00248     inline Vec4(Vec4<double> ivec);
00249 
00250     explicit Vec4(float f1) {
00251       set1(f1);
00252     }
00253 
00254     Vec4(float f1, float f2, float f3, float f4=0) {
00255       arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
00256     }
00257 
00258 
00259     Vec4( Vec2<float> ivec0,   Vec2<float> ivec1) {
00260       arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
00261       arr[2] = ivec1.arr[0]; arr[3]=ivec1.arr[1];
00262     }
00263     
00264     Vec4( Vec2<float> ivec0,  float f3, float f4=0) {
00265       arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
00266       arr[2] = f3; arr[3] = f4;
00267     }
00268 
00269    Vec4( Vec2<float> ivec0) {
00270      vec = _mm_setzero_ps();
00271      arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
00272     }
00273 
00274 
00275     // for masking
00276     void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
00277       Mask4<float> mask(m1,m2,m3,m4); vec=mask.vec; 
00278     }
00279 
00280     void set(float f1, float f2, float f3, float f4=0) {
00281       vec = _mm_set_ps(f4, f3, f2, f1);
00282     }
00283 
00284     void set1(float f1) {
00285      vec =  _mm_set1_ps(f1);
00286     }
00287 
00288     Vec4 get1(unsigned int n) const { 
00289       return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(n, n, n, n)); 
00290     }
00291 
00292     float & operator[](unsigned int n) {
00293       return arr[n];
00294     }
00295 
00296     float operator[](unsigned int n) const {
00297       return arr[n];
00298     }
00299     
00300     Vec2<float> xy() const { return  Vec2<float>(arr[0],arr[1]);}
00301     Vec2<float> zw() const { return  Vec2<float>(arr[2],arr[3]);}
00302 
00303   };
00304   
00305 
00306   template<>
00307   union Vec2<double> {
00308     typedef  __m128d nativeType;
00309     __m128d vec;
00310     double __attribute__ ((aligned(16))) arr[2];
00311         
00312     Vec2(__m128d ivec) : vec(ivec) {}
00313     
00314     Vec2() {
00315       vec = _mm_setzero_pd();
00316     }
00317 
00318 
00319     inline Vec2(Vec2<float> ivec);
00320 
00321 
00322     Vec2(double f1, double f2) {
00323       arr[0] = f1; arr[1] = f2;
00324     }
00325 
00326     explicit Vec2(double f1) {
00327       set1(f1);
00328     }
00329     
00330     inline Vec2(Vec4<double> v4); 
00331 
00332    // for masking
00333    void setMask(unsigned long long m1, unsigned long long m2) {
00334      Mask2<double> mask(m1,m2); vec=mask.vec; 
00335    }
00336 
00337 
00338     void set(double f1, double f2) {
00339       arr[0] = f1; arr[1] = f2;
00340     }
00341 
00342     void set1(double f1) {
00343       vec = _mm_set1_pd(f1);
00344     }
00345 
00346     Vec2 get1(unsigned int n) const {
00347       return Vec2(arr[n],arr[n]);
00348     }
00349    
00350     double operator[](unsigned int n) const {
00351       return arr[n];
00352     }
00353   };
00354  
00355 
00356   template<>
00357   union Vec4<double> {
00358     __m128d vec[2];
00359     double __attribute__ ((aligned(16))) arr[4];
00360     OldVec<double> o;
00361     
00362     Vec4(__m128d ivec[]) {
00363       vec[0] = ivec[0];
00364       vec[1] = ivec[1];
00365     }
00366     
00367     Vec4(__m128d ivec0, __m128d ivec1) {
00368       vec[0] = ivec0;
00369       vec[1] = ivec1;
00370     }
00371     
00372     Vec4() {
00373       vec[0] = _mm_setzero_pd();
00374       vec[1] = _mm_setzero_pd();
00375     }
00376 
00377     Vec4( Vec4<float> ivec) {
00378       vec[0] = _mm_cvtps_pd(ivec.vec);
00379       vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
00380     }
00381 
00382     explicit Vec4(double f1) {
00383       set1(f1);
00384     }
00385 
00386     Vec4(double f1, double f2, double f3, double f4=0) {
00387       arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
00388     }
00389     
00390    Vec4( Vec2<double> ivec0,   Vec2<double> ivec1) {
00391       vec[0] = ivec0.vec;
00392       vec[1] = ivec1.vec;
00393     }
00394     
00395     Vec4( Vec2<double> ivec0,  double f3, double f4=0) {
00396       vec[0] = ivec0.vec;
00397       arr[2] = f3; arr[3] = f4;
00398     }
00399 
00400    Vec4( Vec2<double> ivec0) {
00401       vec[0] = ivec0.vec;
00402       vec[1] =  _mm_setzero_pd();
00403     }
00404 
00405 
00406     Vec4(OldVec<double> const & ivec) : o(ivec) {}
00407 
00408     // for masking
00409     void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
00410       Mask4<double> mask(m1,m2,m3,m4); vec[0]=mask.vec[0]; vec[1]=mask.vec[1]; 
00411     }
00412 
00413 
00414     void set(double f1, double f2, double f3, double f4=0) {
00415       arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
00416     }
00417 
00418     void set1(double f1) {
00419       vec[0] = vec[1]= _mm_set1_pd(f1);
00420     }
00421 
00422 
00423     Vec4 get1(unsigned int n) const {
00424       return Vec4(arr[n],arr[n],arr[n],arr[n]);
00425     }
00426 
00427     double & operator[](unsigned int n) {
00428       return arr[n];
00429     }
00430 
00431     double operator[](unsigned int n) const {
00432       return arr[n];
00433     }
00434   
00435     Vec2<double> xy() const { return vec[0];}
00436     Vec2<double> zw() const { return vec[1];}
00437 
00438   };
00439 
00440   
00441   inline Vec4<float>::Vec4(Vec4<double> ivec) {
00442     vec = _mm_cvtpd_ps(ivec.vec[0]);
00443     __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
00444     vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
00445   }
00446 
00447 
00448 #endif // CMS_USE_SSE
00449   
00450   typedef Vec2<float> Vec2F;
00451   typedef Vec4<float> Vec4F;
00452   typedef Vec4<float> Vec3F;
00453   typedef Vec2<double> Vec2D;
00454   typedef Vec4<double> Vec3D;
00455   typedef Vec4<double> Vec4D;
00456 
00457   template<typename T>
00458   struct As3D {
00459     Vec4<T> const & v;
00460     As3D(Vec4<T> const &iv ) : v(iv){}
00461   };
00462 
00463   template<typename T>
00464   inline As3D<T> as3D(Vec4<T> const &v ) { return v;}
00465 
00466 }
00467 
00468 #ifdef CMS_USE_SSE
00469 
00470 
00471 //float op
00472 
00473 inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00474   using  mathSSE::_mm_dot_ps;
00475   float s;
00476   _mm_store_ss(&s,_mm_dot_ps(a.vec,b.vec));
00477   return s;
00478 }
00479 
00480 inline mathSSE::Vec4F cross(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00481   using  mathSSE::_mm_cross_ps;
00482   return _mm_cross_ps(a.vec,b.vec);
00483 }
00484 
00485 
00486 inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00487   return _mm_movemask_ps(_mm_cmpeq_ps(a.vec,b.vec))==0xf;
00488 }
00489 
00490 inline mathSSE::Vec4F cmpeq(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00491   return _mm_cmpeq_ps(a.vec,b.vec);
00492 }
00493 
00494 inline mathSSE::Vec4F cmpgt(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00495   return _mm_cmpgt_ps(a.vec,b.vec);
00496 }
00497 
00498 #ifdef __SSE3__
00499 inline mathSSE::Vec4F hadd(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00500   return _mm_hadd_ps(a.vec,b.vec);
00501 }
00502 #endif
00503 
00504 
00505 inline mathSSE::Vec4F operator-(mathSSE::Vec4F a) {
00506   const __m128 neg = _mm_set_ps ( -0.0 , -0.0 , -0.0, -0.0);
00507   return _mm_xor_ps(a.vec,neg);
00508 }
00509 
00510 inline mathSSE::Vec4F operator&(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00511   return  _mm_and_ps(a.vec,b.vec);
00512 }
00513 inline mathSSE::Vec4F operator|(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00514   return  _mm_or_ps(a.vec,b.vec);
00515 }
00516 inline mathSSE::Vec4F operator^(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00517   return  _mm_xor_ps(a.vec,b.vec);
00518 }
00519 inline mathSSE::Vec4F andnot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00520   return  _mm_andnot_ps(a.vec,b.vec);
00521 }
00522 
00523 
00524 inline mathSSE::Vec4F operator+(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00525   return  _mm_add_ps(a.vec,b.vec);
00526 }
00527 
00528 inline mathSSE::Vec4F operator-(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00529   return  _mm_sub_ps(a.vec,b.vec);
00530 }
00531 
00532 inline mathSSE::Vec4F operator*(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00533   return  _mm_mul_ps(a.vec,b.vec);
00534 }
00535 
00536 inline mathSSE::Vec4F operator/(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00537   return  _mm_div_ps(a.vec,b.vec);
00538 }
00539 
00540 inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) {
00541   return  _mm_mul_ps(_mm_set1_ps(a),b.vec);
00542 }
00543 
00544 inline mathSSE::Vec4F operator*(mathSSE::Vec4F b,float a) {
00545   return  _mm_mul_ps(_mm_set1_ps(a),b.vec);
00546 }
00547 
00548 //
00549 // float op 2d (use the 4d one...)
00550 //
00551 inline mathSSE::Vec4F operator-(mathSSE::Vec2F a) {
00552   return -mathSSE::Vec4F(a);
00553 }
00554 
00555 
00556 inline mathSSE::Vec4F operator+(mathSSE::Vec2F a, mathSSE::Vec2F b) {
00557   return  mathSSE::Vec4F(a)+mathSSE::Vec4F(b);
00558 }
00559 
00560 inline mathSSE::Vec4F operator-(mathSSE::Vec2F a, mathSSE::Vec2F b) {
00561   return  mathSSE::Vec4F(a)-mathSSE::Vec4F(b);
00562 }
00563 
00564 inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, mathSSE::Vec2F b) {
00565   return  mathSSE::Vec4F(a)*mathSSE::Vec4F(b);
00566 }
00567 
00568 inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, mathSSE::Vec2F b) {
00569   return  mathSSE::Vec4F(a)/mathSSE::Vec4F(b);
00570 }
00571 
00572 
00573 inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, float s) {
00574   return  s*mathSSE::Vec4F(a);
00575 }
00576 
00577 inline mathSSE::Vec4F operator*(float s,mathSSE::Vec2F a) {
00578   return  s*mathSSE::Vec4F(a);
00579 }
00580 
00581 
00582 inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b)  __attribute__((always_inline)) __attribute__ ((pure));
00583 
00584 inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b){
00585   return a.arr[0]*b.arr[0] + a.arr[1]*b.arr[1];
00586 }
00587 
00588 inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b)  __attribute__((always_inline)) __attribute__ ((pure));
00589 
00590 inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) {
00591   return a.arr[0]*b.arr[1] - a.arr[1]*b.arr[0];
00592 }
00593 
00594 
00596 // double op 2d
00597 //
00598 inline  mathSSE::Vec2D::Vec2(Vec4D v4) {
00599   vec = v4.vec[0];
00600 }
00601 
00602 inline  mathSSE::Vec2D::Vec2(Vec2F ivec) {
00603   arr[0] = ivec.arr[0]; arr[1] = ivec.arr[1];
00604 }
00605 
00606 inline mathSSE::Vec2D operator-(mathSSE::Vec2D a) {
00607   const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
00608   return _mm_xor_pd(a.vec,neg);
00609 }
00610 
00611 
00612 inline mathSSE::Vec2D operator&(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00613   return  _mm_and_pd(a.vec,b.vec);
00614 }
00615 inline mathSSE::Vec2D operator|(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00616   return  _mm_or_pd(a.vec,b.vec);
00617 }
00618 inline mathSSE::Vec2D operator^(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00619   return  _mm_xor_pd(a.vec,b.vec);
00620 }
00621 inline mathSSE::Vec2D andnot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00622   return  _mm_andnot_pd(a.vec,b.vec);
00623 }
00624 
00625 
00626 inline mathSSE::Vec2D operator+(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00627   return  _mm_add_pd(a.vec,b.vec);
00628 }
00629 
00630 inline mathSSE::Vec2D operator-(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00631   return  _mm_sub_pd(a.vec,b.vec);
00632 }
00633 
00634 inline mathSSE::Vec2D operator*(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00635   return  _mm_mul_pd(a.vec,b.vec);
00636 }
00637 
00638 inline mathSSE::Vec2D operator/(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00639   return  _mm_div_pd(a.vec,b.vec);
00640 }
00641 
00642 inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) {
00643   return  _mm_mul_pd(_mm_set1_pd(a),b.vec);
00644 }
00645 
00646 inline mathSSE::Vec2D operator*(mathSSE::Vec2D b,double a) {
00647   return  _mm_mul_pd(_mm_set1_pd(a),b.vec);
00648 }
00649 
00650 inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b)  __attribute__((always_inline)) __attribute__ ((pure));
00651 
00652 inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b){
00653   __m128d res = _mm_mul_pd ( a.vec, b.vec);
00654   res = _mm_add_sd (  _mm_shuffle_pd ( res , res, 1 ), res );
00655   double s;
00656   _mm_store_sd(&s,res);
00657   return s;
00658 }
00659 
00660 inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b)  __attribute__((always_inline)) __attribute__ ((pure));
00661 
00662 inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00663   __m128d res =  _mm_shuffle_pd ( b.vec, b.vec, 1);
00664   res = _mm_mul_pd (  a.vec , res );
00665   res = _mm_sub_sd (res, _mm_shuffle_pd ( res , res, 1 ));
00666   double s;
00667   _mm_store_sd(&s,res);
00668   return s;
00669 }
00670 
00671 
00672 // double op 3d
00673 
00674 inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00675   return 
00676     _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0],b.vec[0]))==0x3 && 
00677     _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1],b.vec[1]))==0x3 ;
00678 }
00679 
00680 inline mathSSE::Vec4D operator-(mathSSE::Vec4D a) {
00681   const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
00682   return mathSSE::Vec4D(_mm_xor_pd(a.vec[0],neg),_mm_xor_pd(a.vec[1],neg));
00683 }
00684 
00685 
00686 inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00687   return  mathSSE::Vec4D(_mm_add_pd(a.vec[0],b.vec[0]),_mm_add_pd(a.vec[1],b.vec[1]));
00688 }
00689 
00690 
00691 inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00692   return  mathSSE::Vec4D(_mm_sub_pd(a.vec[0],b.vec[0]),_mm_sub_pd(a.vec[1],b.vec[1]));
00693 }
00694 inline mathSSE::Vec4D operator*(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00695   return  mathSSE::Vec4D(_mm_mul_pd(a.vec[0],b.vec[0]),_mm_mul_pd(a.vec[1],b.vec[1]));
00696 }
00697 inline mathSSE::Vec4D operator/(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00698   return  mathSSE::Vec4D(_mm_div_pd(a.vec[0],b.vec[0]),_mm_div_pd(a.vec[1],b.vec[1]));
00699 }
00700 
00701 inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
00702   __m128d res = _mm_set1_pd(a);
00703   return  mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
00704 }
00705 
00706 inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
00707   __m128d res = _mm_set1_pd(a);
00708   return  mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
00709 }
00710 
00711 // mix algebra (creates ambiguities...)
00712 /*
00713 inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4F b) {
00714   return a + mathSSE::Vec4D(b);
00715 }
00716 inline mathSSE::Vec4D operator+(mathSSE::Vec4D b, mathSSE::Vec4F a) {
00717   return a + mathSSE::Vec4D(b);
00718 }
00719 inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4F b) {
00720   return a - mathSSE::Vec4D(b);
00721 }
00722 inline mathSSE::Vec4D operator-(mathSSE::Vec4D b, mathSSE::Vec4F a) {
00723   return a - mathSSE::Vec4D(b);
00724 }
00725 */
00726 
00727 inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));
00728 
00729 inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00730   __m128d res = _mm_add_sd ( _mm_mul_pd ( a.vec[0], b.vec[0]),
00731                              _mm_mul_sd ( a.vec[1], b.vec[1]) 
00732                              );
00733   res = _mm_add_sd ( _mm_unpackhi_pd ( res , res ), res );
00734   double s;
00735   _mm_store_sd(&s,res);
00736   return s;
00737 }
00738 
00739 inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));
00740  
00741 inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00742   const __m128d neg = _mm_set_pd ( 0.0 , -0.0 );
00743   // lh .z * rh .x , lh .z * rh .y
00744   __m128d l1 = _mm_mul_pd ( _mm_unpacklo_pd ( a.vec[1] , a.vec[1] ), b.vec[0] );
00745   // rh .z * lh .x , rh .z * lh .y
00746   __m128d l2 = _mm_mul_pd ( _mm_unpacklo_pd (  b.vec[1],  b.vec[1] ),  a.vec[0] );
00747   __m128d m1 = _mm_sub_pd ( l1 , l2 ); // l1 - l2
00748   m1 = _mm_shuffle_pd ( m1 , m1 , 1 ); // switch the elements
00749   m1 = _mm_xor_pd ( m1 , neg ); // change the sign of the first element
00750   // lh .x * rh .y , lh .y * rh .x
00751   l1 = _mm_mul_pd (  a.vec[0] , _mm_shuffle_pd (  b.vec[0] ,  b.vec[0] , 1 ) );
00752   // lh .x * rh .y - lh .y * rh .x
00753   __m128d m2 = _mm_sub_sd ( l1 , _mm_unpackhi_pd ( l1 , l1 ) );
00754 
00755   return  mathSSE::Vec4D( m1 , m2 );
00756 }
00757 
00758 
00759 
00760 // sqrt
00761 namespace mathSSE {
00762   template<> inline Vec4F sqrt(Vec4F v) { return _mm_sqrt_ps(v.vec);}
00763   template<> inline Vec2F sqrt(Vec2F v) { return sqrt(Vec4F(v));}
00764   template<> inline Vec2D sqrt(Vec2D v) { return _mm_sqrt_pd(v.vec);}
00765   template<> inline Vec4D sqrt(Vec4D v) { 
00766     return Vec4D(_mm_sqrt_pd(v.vec[0]),_mm_sqrt_pd(v.vec[1]));
00767   }
00768 }
00769 
00770 // chephes func
00771 #include "DataFormats/Math/interface/sse_mathfun.h"
00772 namespace mathSSE {
00773   inline Vec4F log(Vec4F v) { return log_ps(v.vec);}
00774   inline Vec4F exp(Vec4F v) { return exp_ps(v.vec);}
00775   inline Vec4F sin(Vec4F v) { return sin_ps(v.vec);}
00776   inline Vec4F cos(Vec4F v) { return cos_ps(v.vec);}
00777   inline void sincos(Vec4F v, Vec4F & s, Vec4F & c) { sincos_ps(v.vec,&s.vec, &c.vec);}
00778 
00779   inline float log(float f) { float s; _mm_store_ss(&s,log_ps(_mm_load_ss(&f))); return s;}
00780   inline float exp(float f) { float s; _mm_store_ss(&s,exp_ps(_mm_load_ss(&f))); return s;}
00781   inline float sin(float f) { float s; _mm_store_ss(&s,sin_ps(_mm_load_ss(&f))); return s;}
00782   inline float cos(float f) { float s; _mm_store_ss(&s,log_ps(_mm_load_ss(&f))); return s;}
00783   inline void sincos(float f, float & s, float & c) { 
00784     __m128 vs, vc; 
00785     sincos_ps(_mm_load_ss(&f),&vs, &vc);   
00786     _mm_store_ss(&s,vs);_mm_store_ss(&c,vc);   
00787   }
00788 }
00789 #endif // CMS_USE_SSE
00790 
00791 
00792 #include <iosfwd>
00793 std::ostream & operator<<(std::ostream & out, mathSSE::Vec2D const & v);
00794 std::ostream & operator<<(std::ostream & out, mathSSE::Vec2F const & v);
00795 std::ostream & operator<<(std::ostream & out, mathSSE::Vec4F const & v);
00796 std::ostream & operator<<(std::ostream & out, mathSSE::Vec4D const & v);
00797 
00798 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<float> const & v);
00799 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<double> const & v);
00800 
00801 
00802 #endif // DataFormat_Math_SSEVec_H