CMS 3D CMS Logo

/data/refman/pasoursint/CMSSW_5_3_10/src/DataFormats/Math/interface/SSEVec.h

Go to the documentation of this file.
00001 #ifndef DataFormat_Math_SSEVec_H
00002 #define DataFormat_Math_SSEVec_H
00003 
00004 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ > 4)
00005 #include <x86intrin.h>
00006 #define CMS_USE_SSE
00007 #ifdef __AVX__
00008 #define CMS_USE_AVX
00009 #endif
00010 #else
00011 
00012 #ifdef __SSE2__
00013 #define CMS_USE_SSE
00014 
00015 #include <mmintrin.h>
00016 #include <emmintrin.h>
00017 #endif
00018 #ifdef __SSE3__
00019 #include <pmmintrin.h>
00020 #endif
00021 #ifdef __SSE4_1__
00022 #include <smmintrin.h>
00023 #endif
00024 
00025 #endif
00026 
00027 #include<cmath>
00028 
00029 // needed fro gcc < 4.6
00030 namespace mathSSE {
00031   struct ZeroUpper {
00032     ZeroUpper() {
00033 #ifdef __AVX__
00034     _mm256_zeroupper();
00035 #endif
00036     }
00037    ~ZeroUpper() {
00038 #ifdef __AVX__
00039     _mm256_zeroupper();
00040 #endif
00041     }
00042   };
00043 }
00044 
00045 namespace mathSSE {
00046   template<typename T> inline T sqrt(T t) { return std::sqrt(t);}
00047 }
00048 
00049 namespace mathSSE {
00050   //
00051   template<typename T> inline bool samesign(T rh, T lh);
00052 
00053   template<>
00054   inline bool
00055   __attribute__((always_inline)) __attribute__ ((pure)) samesign<int>(int rh, int lh) {
00056     int const mask= 0x80000000;
00057     return ((rh^lh)&mask) == 0;
00058   }
00059 
00060   template<>
00061   inline bool
00062   __attribute__((always_inline)) __attribute__ ((pure)) samesign<long long>(long long rh, long long lh) {
00063     long long const mask= 0x8000000000000000LL;
00064     return ((rh^lh)&mask) == 0;
00065   }
00066 
00067   template<>
00068   inline bool
00069   __attribute__((always_inline)) __attribute__ ((pure)) samesign<float>(float rh, float lh) {
00070     union { int i; float f; } a, b;
00071     a.f=rh; b.f=lh;
00072     return samesign<int>(a.i,b.i);
00073   }
00074 
00075   template<>
00076   inline bool
00077   __attribute__((always_inline)) __attribute__ ((pure)) samesign<double>(double rh, double lh) {
00078     union { long long i; double f; } a, b;
00079     a.f=rh; b.f=lh;
00080     return samesign<long long>(a.i,b.i);
00081   }
00082 }
00083 
00084 
00085 namespace mathSSE {
00086 #ifdef  CMS_USE_SSE
00087   //dot
00088   inline __m128 
00089   __attribute__((always_inline)) __attribute__ ((pure))
00090   _mm_dot_ps(__m128 v1, __m128 v2) {
00091 #ifdef __SSE4_1__
00092     return _mm_dp_ps(v1, v2, 0xff);
00093 #else
00094     __m128 mul = _mm_mul_ps(v1, v2);
00095 #ifdef __SSE3__
00096     mul = _mm_hadd_ps(mul,mul);
00097     return _mm_hadd_ps(mul,mul);
00098 #else
00099     __m128 swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 0, 3, 2));
00100     mul = _mm_add_ps(mul, swp);
00101     swp = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
00102     return _mm_add_ps(mul, swp);
00103 #endif
00104 #endif
00105   }
00106   
00107 
00108   // cross (just 3x3) 
00109   inline __m128 
00110   __attribute__((always_inline)) __attribute__ ((pure))
00111   _mm_cross_ps(__m128 v1, __m128 v2) {
00112     // same order is  _MM_SHUFFLE(3,2,1,0)
00113     //                                               x2, z1,z1
00114     __m128 v3 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 0, 2, 2));
00115     //                                               y1, x2,y2
00116     __m128 v4 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 1, 0, 1));
00117     
00118     __m128 v5 = _mm_mul_ps(v3, v4);
00119     
00120     //                                         x1, z2,z2
00121     v3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3, 0, 2, 2));
00122     //                                        y2, x1,y1
00123     v4 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3, 1, 0, 1));
00124     
00125     v3 = _mm_mul_ps(v3, v4);
00126     const  __m128 neg = _mm_set_ps(0.0f,0.0f,-0.0f,0.0f);
00127     return _mm_xor_ps(_mm_sub_ps(v5, v3), neg);
00128   }
00129 #endif // CMS_USE_SSE
00130 
00131 #ifdef  CMS_USE_AVX
00132   inline __m256d  
00133   __attribute__((always_inline)) __attribute__ ((pure))
00134   _mm256_dot_pd(__m256d v1, __m256d v2) {
00135     __m256d mul = _mm256_mul_pd(v1, v2);
00136     mul = _mm256_hadd_pd(mul,mul);
00137     __m256d tmp = _mm256_permute2f128_pd(mul,mul,1);
00138     return _mm256_add_pd(mul,tmp);
00139   }
00140 
00141   inline __m256d  
00142   __attribute__((always_inline)) __attribute__ ((pure))
00143   _mm256_cross_pd(__m256d v1, __m256d v2) {
00144     
00145     __m256d v3 = _mm256_permute2f128_pd(v2, v1, (2<<4)+1);
00146     v3 = _mm256_permute_pd(v3,0);
00147     
00148     __m256d v4 = _mm256_permute2f128_pd(v1, v2, (2<<4));
00149     v4 = _mm256_permute_pd(v4,5);
00150     
00151     __m256d v5 = _mm256_mul_pd(v3, v4);
00152     
00153     v3 = _mm256_permute2f128_pd(v1, v2, (2<<4)+1);
00154     v3 = _mm256_permute_pd(v3,0);
00155     
00156     v4 = _mm256_permute2f128_pd(v2, v1, (2<<4));
00157     v4 = _mm256_permute_pd(v4,5);
00158     
00159     v3 = _mm256_mul_pd(v3, v4);
00160     const  __m256d neg = _mm256_set_pd(0.0,0.0,-0.0,0.0);
00161     return _mm256_xor_pd(_mm256_sub_pd(v5, v3), neg);
00162  
00163   }
00164 
00165 #endif //  CMS_USE_AVX
00166 
00167 
00168 
00169   template<typename T>
00170   struct OldVec { T  theX; T  theY; T  theZ; T  theW;}  __attribute__ ((aligned (16)));
00171 #ifdef  CMS_USE_AVX
00172   template<>
00173   struct OldVec<double> { double  theX; double  theY; double  theZ; double  theW;}  __attribute__ ((aligned (32)));
00174 #endif
00175   
00176   template<typename T> union Vec4;
00177 
00178   template<typename T> union Vec2{
00179     Vec2() {
00180       arr[0] = 0; arr[1] = 0;
00181     }
00182     Vec2(T f1, T f2) {
00183       arr[0] = f1; arr[1] = f2;
00184     }
00185     explicit Vec2(T f1) {
00186       arr[0] = f1; arr[1] = f1;
00187     }
00188 
00189     void set(T f1, T f2) {
00190       arr[0] = f1; arr[1] = f2;
00191     }
00192 
00193     template <int N>
00194     Vec2 get1() const { 
00195       return Vec2(arr[N],arr[N]);
00196     }
00197 
00198     /*
00199     Vec2 get1(unsigned int n) const {
00200       return Vec2(arr[n],arr[n]);
00201     }
00202     */
00203 
00204     template<typename U> 
00205     Vec2(Vec2<U> v) {
00206       arr[0] = v[0]; arr[1] = v[1];
00207 
00208     }
00209 
00210     inline Vec2(Vec4<T> v4);
00211 
00212     T & operator[](unsigned int n) {
00213       return arr[n];
00214     }
00215 
00216     T operator[](unsigned int n) const {
00217       return arr[n];
00218     }
00219 
00220 
00221     T arr[2];
00222   };
00223 
00224 
00225   template<typename T> union Vec4{
00226     Vec4() {
00227       arr[0] = 0; arr[1] = 0; arr[2] = 0; arr[3]=0;
00228     }
00229     Vec4(float f1, float f2, float f3, float f4=0) {
00230       arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
00231     }
00232     explicit Vec4(float f1) {
00233       set1(f1);
00234     }
00235     void set(float f1, float f2, float f3, float f4=0) {
00236       arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
00237     }
00238     void set1(float f1) {
00239       arr[0] = f1; arr[1] = f1; arr[2] = f1; arr[3]=f1;
00240     }
00241    template <int N>
00242     Vec4 get1() const { 
00243      return Vec4(arr[N],arr[N],arr[N],arr[N]);
00244    }
00245     /*
00246     Vec4 get1(unsigned int n) const {
00247       return Vec4(arr[n],arr[n],arr[n],arr[n]);
00248     }
00249     */
00250 
00251     Vec2<T> xy() const { return  Vec2<T>(arr[0],arr[1]);}
00252     Vec2<T> zw() const { return  Vec2<T>(arr[2],arr[3]);}
00253 
00254 
00255 
00256     T __attribute__ ((aligned(16))) arr[4];
00257     OldVec<T> o;
00258   };
00259 
00260   template<typename T>
00261   inline Vec2<T>::Vec2(Vec4<T> v4) {
00262     arr[0]=v4[0];arr[1]=v4[1];
00263   }
00264 
00265 
00266 #ifdef CMS_USE_SSE
00267 
00268   template<>
00269   union Vec2<double>;
00270   template<>
00271   union Vec4<double>;
00272 
00273   template<typename T> union Mask2{};
00274   template<typename T> union Mask4{};
00275 
00276   template<>
00277   union Mask4<float> {
00278     __m128 vec;
00279     unsigned int __attribute__ ((aligned(16))) mask[4];
00280     Mask4() {vec = _mm_setzero_ps();}
00281     Mask4(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
00282       mask[0]=m1;  mask[1]=m2;  mask[2]=m3;  mask[3]=m4; 
00283     }
00284   };
00285   
00286 #ifdef  CMS_USE_AVX
00287   template<>
00288   union Mask4<double> {
00289     __m256d vec;
00290     unsigned long long __attribute__ ((aligned(32))) mask[4];
00291     Mask4() {
00292       vec = _mm256_setzero_pd();
00293      }
00294     Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
00295       mask[0]=m1;  mask[1]=m2;  mask[2]=m3;  mask[3]=m4; 
00296     }
00297   };
00298 #else
00299   template<>
00300   union Mask4<double> {
00301     __m128d vec[2];
00302     unsigned long long __attribute__ ((aligned(16))) mask[4];
00303     Mask4() {
00304       vec[0] = _mm_setzero_pd();
00305       vec[1] = _mm_setzero_pd();
00306     }
00307     Mask4(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
00308       mask[0]=m1;  mask[1]=m2;  mask[2]=m3;  mask[3]=m4; 
00309     }
00310   };
00311 #endif
00312 
00313   template<>
00314   union Mask2<double> {
00315     __m128d vec;
00316     unsigned long long __attribute__ ((aligned(16))) mask[2];
00317     Mask2() {
00318       vec = _mm_setzero_pd();
00319     }
00320     Mask2(unsigned long long m1, unsigned long long m2) {
00321       mask[0]=m1;  mask[1]=m2; 
00322     }
00323   };
00324 
00325   template<>
00326   union Vec4<float> {
00327     typedef  __m128 nativeType;
00328     __m128 vec;
00329     float __attribute__ ((aligned(16))) arr[4];
00330     OldVec<float> o;
00331     
00332     Vec4(__m128 ivec) : vec(ivec) {}
00333 
00334     Vec4(OldVec<float> const & ivec) : o(ivec) {}
00335     
00336     Vec4() {
00337       vec = _mm_setzero_ps();
00338     }
00339 
00340 
00341     inline Vec4(Vec4<double> ivec);
00342 
00343     explicit Vec4(float f1) {
00344       set1(f1);
00345     }
00346 
00347     Vec4(float f1, float f2, float f3, float f4=0) {
00348       vec = _mm_set_ps(f4, f3, f2, f1);
00349     }
00350 
00351 
00352     Vec4( Vec2<float> ivec0,   Vec2<float> ivec1) {
00353       arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
00354       arr[2] = ivec1.arr[0]; arr[3]=ivec1.arr[1];
00355     }
00356     
00357     Vec4( Vec2<float> ivec0,  float f3, float f4=0) {
00358       arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
00359       arr[2] = f3; arr[3] = f4;
00360     }
00361 
00362    Vec4( Vec2<float> ivec0) {
00363      vec = _mm_setzero_ps();
00364      arr[0] = ivec0.arr[0]; arr[1]=ivec0.arr[1];
00365     }
00366 
00367 
00368     // for masking
00369     void setMask(unsigned int m1, unsigned int m2, unsigned int m3, unsigned int m4) {
00370       Mask4<float> mask(m1,m2,m3,m4); vec=mask.vec; 
00371     }
00372 
00373     void set(float f1, float f2, float f3, float f4=0) {
00374       vec = _mm_set_ps(f4, f3, f2, f1);
00375     }
00376 
00377     void set1(float f1) {
00378      vec =  _mm_set1_ps(f1);
00379     }
00380 
00381     template <int N>
00382     Vec4 get1() const { 
00383       return _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(N, N, N, N)); 
00384     }
00385 
00386     float & operator[](unsigned int n) {
00387       return arr[n];
00388     }
00389 
00390     float operator[](unsigned int n) const {
00391       return arr[n];
00392     }
00393     
00394     Vec2<float> xy() const { return  Vec2<float>(arr[0],arr[1]);}
00395     Vec2<float> zw() const { return  Vec2<float>(arr[2],arr[3]);}
00396 
00397   };
00398   
00399 
00400   template<>
00401   union Vec2<double> {
00402     typedef  __m128d nativeType;
00403     __m128d vec;
00404     double __attribute__ ((aligned(16))) arr[2];
00405         
00406     Vec2(__m128d ivec) : vec(ivec) {}
00407     
00408     Vec2() {
00409       vec = _mm_setzero_pd();
00410     }
00411 
00412 
00413     inline Vec2(Vec2<float> ivec);
00414 
00415 
00416     Vec2(double f1, double f2) {
00417       vec = _mm_set_pd(f2,f1);
00418     }
00419 
00420     explicit Vec2(double f1) {
00421       set1(f1);
00422     }
00423     
00424     inline Vec2(Vec4<double> v4); 
00425 
00426    // for masking
00427    void setMask(unsigned long long m1, unsigned long long m2) {
00428      Mask2<double> mask(m1,m2); vec=mask.vec; 
00429    }
00430 
00431 
00432     void set(double f1, double f2) {
00433       vec = _mm_set_pd(f2,f1);
00434     }
00435 
00436     void set1(double f1) {
00437       vec = _mm_set1_pd(f1);
00438     }
00439 
00440     template <int N>
00441     Vec2 get1() const { 
00442       return Vec2(arr[N],arr[N]);
00443     }
00444     /*
00445     Vec2 get1(unsigned int n) const {
00446       return Vec2(arr[n],arr[n]);
00447     }
00448     */
00449     double & operator[](unsigned int n) {
00450       return arr[n];
00451     }
00452     double operator[](unsigned int n) const {
00453       return arr[n];
00454     }
00455   };
00456  
00457 
00458 
00459 
00460 #ifdef  CMS_USE_AVX
00461 }// namespace mathSSE
00462 #include "AVXVec.h"
00463 
00464 namespace mathSSE {
00465 #else
00466   template<>
00467   union Vec4<double> {
00468     __m128d vec[2];
00469     double __attribute__ ((aligned(16))) arr[4];
00470     OldVec<double> o;
00471     
00472     Vec4(__m128d ivec[]) {
00473       vec[0] = ivec[0];
00474       vec[1] = ivec[1];
00475     }
00476     
00477     Vec4(__m128d ivec0, __m128d ivec1) {
00478       vec[0] = ivec0;
00479       vec[1] = ivec1;
00480     }
00481     
00482     Vec4() {
00483       vec[0] = _mm_setzero_pd();
00484       vec[1] = _mm_setzero_pd();
00485     }
00486 
00487     Vec4( Vec4<float> ivec) {
00488       vec[0] = _mm_cvtps_pd(ivec.vec);
00489       vec[1] = _mm_cvtps_pd(_mm_shuffle_ps(ivec.vec, ivec.vec, _MM_SHUFFLE(1, 0, 3, 2)));
00490     }
00491 
00492     explicit Vec4(double f1) {
00493       set1(f1);
00494     }
00495 
00496     Vec4(double f1, double f2, double f3, double f4=0) {
00497       arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
00498     }
00499     
00500    Vec4( Vec2<double> ivec0,   Vec2<double> ivec1) {
00501       vec[0] = ivec0.vec;
00502       vec[1] = ivec1.vec;
00503     }
00504     
00505     Vec4( Vec2<double> ivec0,  double f3, double f4=0) {
00506       vec[0] = ivec0.vec;
00507       arr[2] = f3; arr[3] = f4;
00508     }
00509 
00510    Vec4( Vec2<double> ivec0) {
00511       vec[0] = ivec0.vec;
00512       vec[1] =  _mm_setzero_pd();
00513     }
00514 
00515 
00516     Vec4(OldVec<double> const & ivec) : o(ivec) {}
00517 
00518     // for masking
00519     void setMask(unsigned long long m1, unsigned long long m2, unsigned long long m3, unsigned long long m4) {
00520       Mask4<double> mask(m1,m2,m3,m4); vec[0]=mask.vec[0]; vec[1]=mask.vec[1]; 
00521     }
00522 
00523 
00524     void set(double f1, double f2, double f3, double f4=0) {
00525       arr[0] = f1; arr[1] = f2; arr[2] = f3; arr[3]=f4;
00526     }
00527 
00528     void set1(double f1) {
00529       vec[0] = vec[1]= _mm_set1_pd(f1);
00530     }
00531 
00532 
00533     template<int N>
00534     Vec4 get1() const {
00535       return Vec4(arr[N],arr[N],arr[N],arr[N]);
00536     }
00537 
00538     double & operator[](unsigned int n) {
00539       return arr[n];
00540     }
00541 
00542     double operator[](unsigned int n) const {
00543       return arr[n];
00544     }
00545   
00546     Vec2<double> xy() const { return vec[0];}
00547     Vec2<double> zw() const { return vec[1];}
00548 
00549   };
00550 
00551   
00552   inline Vec4<float>::Vec4(Vec4<double> ivec) {
00553     vec = _mm_cvtpd_ps(ivec.vec[0]);
00554     __m128 v2 = _mm_cvtpd_ps(ivec.vec[1]);
00555     vec = _mm_shuffle_ps(vec, v2, _MM_SHUFFLE(1, 0, 1, 0));
00556   }
00557 #endif  // CMS_USE_AVX
00558 
00559 
00560 #endif // CMS_USE_SSE
00561   
00562   typedef Vec2<float> Vec2F;
00563   typedef Vec4<float> Vec4F;
00564   typedef Vec4<float> Vec3F;
00565   typedef Vec2<double> Vec2D;
00566   typedef Vec4<double> Vec3D;
00567   typedef Vec4<double> Vec4D;
00568 
00569   template<typename T>
00570   struct As3D {
00571     Vec4<T> const & v;
00572     As3D(Vec4<T> const &iv ) : v(iv){}
00573   };
00574 
00575   template<typename T>
00576   inline As3D<T> as3D(Vec4<T> const &v ) { return v;}
00577 
00578 } // namespace mathSSE
00579 
00580 #ifdef CMS_USE_SSE
00581 
00582 
00583 //float op
00584 
00585 
00586 inline bool operator==(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00587   return _mm_movemask_ps(_mm_cmpeq_ps(a.vec,b.vec))==0xf;
00588 }
00589 
00590 inline mathSSE::Vec4F cmpeq(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00591   return _mm_cmpeq_ps(a.vec,b.vec);
00592 }
00593 
00594 inline mathSSE::Vec4F cmpgt(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00595   return _mm_cmpgt_ps(a.vec,b.vec);
00596 }
00597 
00598 #ifdef __SSE3__
00599 inline mathSSE::Vec4F hadd(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00600   return _mm_hadd_ps(a.vec,b.vec);
00601 }
00602 #endif
00603 
00604 
00605 inline mathSSE::Vec4F operator-(mathSSE::Vec4F a) {
00606   const __m128 neg = _mm_set_ps ( -0.0 , -0.0 , -0.0, -0.0);
00607   return _mm_xor_ps(a.vec,neg);
00608 }
00609 
00610 inline mathSSE::Vec4F operator&(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00611   return  _mm_and_ps(a.vec,b.vec);
00612 }
00613 inline mathSSE::Vec4F operator|(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00614   return  _mm_or_ps(a.vec,b.vec);
00615 }
00616 inline mathSSE::Vec4F operator^(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00617   return  _mm_xor_ps(a.vec,b.vec);
00618 }
00619 inline mathSSE::Vec4F andnot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00620   return  _mm_andnot_ps(a.vec,b.vec);
00621 }
00622 
00623 
00624 inline mathSSE::Vec4F operator+(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00625   return  _mm_add_ps(a.vec,b.vec);
00626 }
00627 
00628 inline mathSSE::Vec4F operator-(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00629   return  _mm_sub_ps(a.vec,b.vec);
00630 }
00631 
00632 inline mathSSE::Vec4F operator*(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00633   return  _mm_mul_ps(a.vec,b.vec);
00634 }
00635 
00636 inline mathSSE::Vec4F operator/(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00637   return  _mm_div_ps(a.vec,b.vec);
00638 }
00639 
00640 inline mathSSE::Vec4F min(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00641   return  _mm_min_ps(a.vec,b.vec);
00642 }
00643 
00644 inline mathSSE::Vec4F max(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00645   return  _mm_max_ps(a.vec,b.vec);
00646 }
00647 
00648 
00649 inline mathSSE::Vec4F operator*(float a, mathSSE::Vec4F b) {
00650   return  _mm_mul_ps(_mm_set1_ps(a),b.vec);
00651 }
00652 
00653 inline mathSSE::Vec4F operator*(mathSSE::Vec4F b,float a) {
00654   return  _mm_mul_ps(_mm_set1_ps(a),b.vec);
00655 }
00656 
00657 inline mathSSE::Vec4F operator/(mathSSE::Vec4F b,float a) {
00658   return  _mm_div_ps(b.vec,_mm_set1_ps(a));
00659 }
00660 
00661 
00662 inline float dot(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00663   using  mathSSE::_mm_dot_ps;
00664   float s;
00665   _mm_store_ss(&s,_mm_dot_ps(a.vec,b.vec));
00666   return s;
00667 }
00668 
00669 inline mathSSE::Vec4F cross(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00670   using  mathSSE::_mm_cross_ps;
00671   return _mm_cross_ps(a.vec,b.vec);
00672 }
00673 
00674 
00675 inline float dotxy(mathSSE::Vec4F a, mathSSE::Vec4F b) {
00676   mathSSE::Vec4F mul = a*b;
00677 #ifdef __SSE3__
00678    mul = hadd(mul,mul);
00679 #else
00680    __m128 swp = _mm_shuffle_ps(mul.vec, mul.vec, _MM_SHUFFLE(2, 3, 0, 1));
00681    mul.vec = _mm_add_ps(mul.vec, swp);
00682 #endif
00683   float s;
00684   _mm_store_ss(&s,mul.vec);
00685   return s;
00686 }
00687 
00688 
00689 //
00690 // float op 2d (use the 4d one...)
00691 //
00692 inline mathSSE::Vec4F operator-(mathSSE::Vec2F a) {
00693   return -mathSSE::Vec4F(a);
00694 }
00695 
00696 
00697 inline mathSSE::Vec4F operator+(mathSSE::Vec2F a, mathSSE::Vec2F b) {
00698   return  mathSSE::Vec4F(a)+mathSSE::Vec4F(b);
00699 }
00700 
00701 inline mathSSE::Vec4F operator-(mathSSE::Vec2F a, mathSSE::Vec2F b) {
00702   return  mathSSE::Vec4F(a)-mathSSE::Vec4F(b);
00703 }
00704 
00705 inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, mathSSE::Vec2F b) {
00706   return  mathSSE::Vec4F(a)*mathSSE::Vec4F(b);
00707 }
00708 
00709 inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, mathSSE::Vec2F b) {
00710   return  mathSSE::Vec4F(a)/mathSSE::Vec4F(b);
00711 }
00712 
00713 inline mathSSE::Vec4F min(mathSSE::Vec2F a, mathSSE::Vec2F b) {
00714   return  min(mathSSE::Vec4F(a),mathSSE::Vec4F(b));
00715 }
00716 
00717 inline mathSSE::Vec4F max(mathSSE::Vec2F a, mathSSE::Vec2F b) {
00718   return  ::max(mathSSE::Vec4F(a),mathSSE::Vec4F(b));
00719 }
00720 
00721 
00722 inline mathSSE::Vec4F operator*(mathSSE::Vec2F a, float s) {
00723   return  s*mathSSE::Vec4F(a);
00724 }
00725 
00726 inline mathSSE::Vec4F operator*(float s, mathSSE::Vec2F a) {
00727   return  s*mathSSE::Vec4F(a);
00728 }
00729 
00730 inline mathSSE::Vec4F operator/(mathSSE::Vec2F a, float s) {
00731   return  mathSSE::Vec4F(a)/s;
00732 }
00733 
00734 
00735 inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b)  __attribute__((always_inline)) __attribute__ ((pure));
00736 
00737 inline float dot(mathSSE::Vec2F a, mathSSE::Vec2F b){
00738   return a.arr[0]*b.arr[0] + a.arr[1]*b.arr[1];
00739 }
00740 
00741 inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b)  __attribute__((always_inline)) __attribute__ ((pure));
00742 
00743 inline float cross(mathSSE::Vec2F a, mathSSE::Vec2F b) {
00744   return a.arr[0]*b.arr[1] - a.arr[1]*b.arr[0];
00745 }
00746 
00747 
00749 // double op 2d
00750 //
00751 
00752 inline  mathSSE::Vec2D::Vec2(Vec4D v4) {
00753 #ifdef  CMS_USE_AVX
00754   vec = _mm256_castpd256_pd128(v4.vec);
00755 #else
00756   vec = v4.vec[0];
00757 #endif
00758 }
00759 
00760 inline  mathSSE::Vec2D::Vec2(Vec2F ivec) {
00761   arr[0] = ivec.arr[0]; arr[1] = ivec.arr[1];
00762 }
00763 
00764 inline mathSSE::Vec2D operator-(mathSSE::Vec2D a) {
00765   const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
00766   return _mm_xor_pd(a.vec,neg);
00767 }
00768 
00769 
00770 inline mathSSE::Vec2D operator&(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00771   return  _mm_and_pd(a.vec,b.vec);
00772 }
00773 inline mathSSE::Vec2D operator|(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00774   return  _mm_or_pd(a.vec,b.vec);
00775 }
00776 inline mathSSE::Vec2D operator^(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00777   return  _mm_xor_pd(a.vec,b.vec);
00778 }
00779 inline mathSSE::Vec2D andnot(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00780   return  _mm_andnot_pd(a.vec,b.vec);
00781 }
00782 
00783 #ifdef __SSE3__
00784 inline mathSSE::Vec2D hadd(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00785   return _mm_hadd_pd(a.vec,b.vec);
00786 }
00787 #endif
00788 
00789 inline mathSSE::Vec2D operator+(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00790   return  _mm_add_pd(a.vec,b.vec);
00791 }
00792 
00793 inline mathSSE::Vec2D operator-(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00794   return  _mm_sub_pd(a.vec,b.vec);
00795 }
00796 
00797 inline mathSSE::Vec2D operator*(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00798   return  _mm_mul_pd(a.vec,b.vec);
00799 }
00800 
00801 inline mathSSE::Vec2D operator/(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00802   return  _mm_div_pd(a.vec,b.vec);
00803 }
00804 
00805 
00806 inline mathSSE::Vec2D min(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00807   return _mm_min_pd(a.vec,b.vec);
00808 }
00809 
00810 inline mathSSE::Vec2D max(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00811   return _mm_max_pd(a.vec,b.vec);
00812 }
00813 
00814 
00815 inline mathSSE::Vec2D operator*(double a, mathSSE::Vec2D b) {
00816   return  _mm_mul_pd(_mm_set1_pd(a),b.vec);
00817 }
00818 
00819 inline mathSSE::Vec2D operator*(mathSSE::Vec2D b,double a) {
00820   return  _mm_mul_pd(_mm_set1_pd(a),b.vec);
00821 }
00822 
00823 inline mathSSE::Vec2D operator/(mathSSE::Vec2D b,double a) {
00824   return  _mm_div_pd(b.vec,_mm_set1_pd(a));
00825 }
00826 
00827 
00828 inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b)  __attribute__((always_inline)) __attribute__ ((pure));
00829 
00830 inline double dot(mathSSE::Vec2D a, mathSSE::Vec2D b){
00831   __m128d res = _mm_mul_pd ( a.vec, b.vec);
00832 #ifdef __SSE3__
00833     res = _mm_hadd_pd(res,res);
00834 #else
00835   res = _mm_add_sd (  _mm_shuffle_pd ( res , res, 1 ), res );
00836 #endif
00837   double s;
00838   _mm_store_sd(&s,res);
00839   return s;
00840 }
00841 
00842 inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b)  __attribute__((always_inline)) __attribute__ ((pure));
00843 
00844 inline double cross(mathSSE::Vec2D a, mathSSE::Vec2D b) {
00845   __m128d res =  _mm_shuffle_pd ( b.vec, b.vec, 1);
00846   res = _mm_mul_pd (  a.vec , res );
00847   res = _mm_sub_sd (res, _mm_shuffle_pd ( res , res, 1 ));
00848   double s;
00849   _mm_store_sd(&s,res);
00850   return s;
00851 }
00852 
00853 
00854 #ifndef  CMS_USE_AVX
00855 // double op 3d
00856 
00857 
00858 #ifdef __SSE3__
00859 // consistent with AVX...
00860 inline mathSSE::Vec4D hadd(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00861   return  mathSSE::Vec4D(hadd(a.vec[0],b.vec[0]),hadd(a.vec[1],b.vec[1]) );
00862 }
00863 #endif
00864 
00865 
00866 inline bool operator==(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00867   return 
00868     _mm_movemask_pd(_mm_cmpeq_pd(a.vec[0],b.vec[0]))==0x3 && 
00869     _mm_movemask_pd(_mm_cmpeq_pd(a.vec[1],b.vec[1]))==0x3 ;
00870 }
00871 
00872 inline mathSSE::Vec4D operator-(mathSSE::Vec4D a) {
00873   const __m128d neg = _mm_set_pd ( -0.0 , -0.0);
00874   return mathSSE::Vec4D(_mm_xor_pd(a.vec[0],neg),_mm_xor_pd(a.vec[1],neg));
00875 }
00876 
00877 
00878 inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00879   return  mathSSE::Vec4D(_mm_add_pd(a.vec[0],b.vec[0]),_mm_add_pd(a.vec[1],b.vec[1]));
00880 }
00881 
00882 
00883 inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00884   return  mathSSE::Vec4D(_mm_sub_pd(a.vec[0],b.vec[0]),_mm_sub_pd(a.vec[1],b.vec[1]));
00885 }
00886 inline mathSSE::Vec4D operator*(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00887   return  mathSSE::Vec4D(_mm_mul_pd(a.vec[0],b.vec[0]),_mm_mul_pd(a.vec[1],b.vec[1]));
00888 }
00889 inline mathSSE::Vec4D operator/(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00890   return  mathSSE::Vec4D(_mm_div_pd(a.vec[0],b.vec[0]),_mm_div_pd(a.vec[1],b.vec[1]));
00891 }
00892 
00893 
00894 inline mathSSE::Vec4D min(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00895   using namespace mathSSE;
00896   return  mathSSE::Vec4D(::min(Vec2D(a.vec[0]),Vec2D(b.vec[0])),::min(Vec2D(a.vec[1]),Vec2D(b.vec[1])) );
00897 }
00898 
00899 inline mathSSE::Vec4D max(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00900   using namespace mathSSE;
00901   return  mathSSE::Vec4D(::max(Vec2D(a.vec[0]),Vec2D(b.vec[0])),::max(Vec2D(a.vec[1]),Vec2D(b.vec[1])) );
00902 }
00903 
00904 inline mathSSE::Vec4D operator*(double a, mathSSE::Vec4D b) {
00905   __m128d res = _mm_set1_pd(a);
00906   return  mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
00907 }
00908 
00909 inline mathSSE::Vec4D operator*(mathSSE::Vec4D b, double a) {
00910   __m128d res = _mm_set1_pd(a);
00911   return  mathSSE::Vec4D(_mm_mul_pd(res,b.vec[0]),_mm_mul_pd(res,b.vec[1]));
00912 }
00913 
00914 inline mathSSE::Vec4D operator/(mathSSE::Vec4D b, double a) {
00915   a = 1./a;
00916   return a*b;
00917 }
00918 
00919 // mix algebra (creates ambiguities...)
00920 /*
00921 inline mathSSE::Vec4D operator+(mathSSE::Vec4D a, mathSSE::Vec4F b) {
00922   return a + mathSSE::Vec4D(b);
00923 }
00924 inline mathSSE::Vec4D operator+(mathSSE::Vec4D b, mathSSE::Vec4F a) {
00925   return a + mathSSE::Vec4D(b);
00926 }
00927 inline mathSSE::Vec4D operator-(mathSSE::Vec4D a, mathSSE::Vec4F b) {
00928   return a - mathSSE::Vec4D(b);
00929 }
00930 inline mathSSE::Vec4D operator-(mathSSE::Vec4D b, mathSSE::Vec4F a) {
00931   return a - mathSSE::Vec4D(b);
00932 }
00933 */
00934 
00935 inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));
00936 
00937 inline double dot(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00938   __m128d res = _mm_add_sd ( _mm_mul_pd ( a.vec[0], b.vec[0]),
00939                              _mm_mul_sd ( a.vec[1], b.vec[1]) 
00940                              );
00941   res = _mm_add_sd ( _mm_unpackhi_pd ( res , res ), res );
00942   double s;
00943   _mm_store_sd(&s,res);
00944   return s;
00945 }
00946 
00947 inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) __attribute__((always_inline)) __attribute__ ((pure));
00948 
00949 inline mathSSE::Vec4D cross(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00950   const __m128d neg = _mm_set_pd ( 0.0 , -0.0 );
00951   // lh .z * rh .x , lh .z * rh .y
00952   __m128d l1 = _mm_mul_pd ( _mm_unpacklo_pd ( a.vec[1] , a.vec[1] ), b.vec[0] );
00953   // rh .z * lh .x , rh .z * lh .y
00954   __m128d l2 = _mm_mul_pd ( _mm_unpacklo_pd (  b.vec[1],  b.vec[1] ),  a.vec[0] );
00955   __m128d m1 = _mm_sub_pd ( l1 , l2 ); // l1 - l2
00956   m1 = _mm_shuffle_pd ( m1 , m1 , 1 ); // switch the elements
00957   m1 = _mm_xor_pd ( m1 , neg ); // change the sign of the first element
00958   // lh .x * rh .y , lh .y * rh .x
00959   l1 = _mm_mul_pd (  a.vec[0] , _mm_shuffle_pd (  b.vec[0] ,  b.vec[0] , 1 ) );
00960   // lh .x * rh .y - lh .y * rh .x
00961   __m128d m2 = _mm_sub_sd ( l1 , _mm_unpackhi_pd ( l1 , l1 ) );
00962   return mathSSE::Vec4D(m1, m2);
00963 }
00964 
00965 
00966 inline double  
00967 __attribute__((always_inline)) __attribute__ ((pure)) 
00968 dotxy(mathSSE::Vec4D a, mathSSE::Vec4D b) {
00969   return dot(a.xy(),b.xy());
00970 }
00971 
00972 #endif   // CMS_USE_AVX
00973 
00974 
00975 // sqrt
00976 namespace mathSSE {
00977   template<> inline Vec4F sqrt(Vec4F v) { return _mm_sqrt_ps(v.vec);}
00978   template<> inline Vec2F sqrt(Vec2F v) { return sqrt(Vec4F(v));}
00979   template<> inline Vec2D sqrt(Vec2D v) { return _mm_sqrt_pd(v.vec);}
00980 #ifdef  CMS_USE_AVX
00981   template<> inline Vec4D sqrt(Vec4D v) { return _mm256_sqrt_pd(v.vec);}
00982 #else
00983   template<> inline Vec4D sqrt(Vec4D v) { 
00984     return Vec4D(_mm_sqrt_pd(v.vec[0]),_mm_sqrt_pd(v.vec[1]));
00985   }
00986 #endif
00987 }
00988 
00989 // chephes func
00990 #include "DataFormats/Math/interface/sse_mathfun.h"
00991 namespace mathSSE {
00992   inline Vec4F log(Vec4F v) { return log_ps(v.vec);}
00993   inline Vec4F exp(Vec4F v) { return exp_ps(v.vec);}
00994   inline Vec4F sin(Vec4F v) { return sin_ps(v.vec);}
00995   inline Vec4F cos(Vec4F v) { return cos_ps(v.vec);}
00996   inline void sincos(Vec4F v, Vec4F & s, Vec4F & c) { sincos_ps(v.vec,&s.vec, &c.vec);}
00997 
00998   inline float log(float f) { float s; _mm_store_ss(&s,log_ps(_mm_load_ss(&f))); return s;}
00999   inline float exp(float f) { float s; _mm_store_ss(&s,exp_ps(_mm_load_ss(&f))); return s;}
01000   inline float sin(float f) { float s; _mm_store_ss(&s,sin_ps(_mm_load_ss(&f))); return s;}
01001   inline float cos(float f) { float s; _mm_store_ss(&s,log_ps(_mm_load_ss(&f))); return s;}
01002   inline void sincos(float f, float & s, float & c) { 
01003     __m128 vs, vc; 
01004     sincos_ps(_mm_load_ss(&f),&vs, &vc);   
01005     _mm_store_ss(&s,vs);_mm_store_ss(&c,vc);   
01006   }
01007 }
01008 #endif // CMS_USE_SSE
01009 
01010 
01011 #include <iosfwd>
01012 std::ostream & operator<<(std::ostream & out, mathSSE::Vec2D const & v);
01013 std::ostream & operator<<(std::ostream & out, mathSSE::Vec2F const & v);
01014 std::ostream & operator<<(std::ostream & out, mathSSE::Vec4F const & v);
01015 std::ostream & operator<<(std::ostream & out, mathSSE::Vec4D const & v);
01016 
01017 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<float> const & v);
01018 std::ostream & operator<<(std::ostream & out, mathSSE::As3D<double> const & v);
01019 
01020 
01021 #endif // DataFormat_Math_SSEVec_H